linux/arch/x86/crypto/serpent-sse2-i586-asm_32.S
<<
>>
Prefs
   1/*
   2 * Serpent Cipher 4-way parallel algorithm (i586/SSE2)
   3 *
   4 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
   5 *
   6 * Based on crypto/serpent.c by
   7 *  Copyright (C) 2002 Dag Arne Osvik <osvik@ii.uib.no>
   8 *                2003 Herbert Valerio Riedel <hvr@gnu.org>
   9 *
  10 * This program is free software; you can redistribute it and/or modify
  11 * it under the terms of the GNU General Public License as published by
  12 * the Free Software Foundation; either version 2 of the License, or
  13 * (at your option) any later version.
  14 *
  15 * This program is distributed in the hope that it will be useful,
  16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18 * GNU General Public License for more details.
  19 *
  20 * You should have received a copy of the GNU General Public License
  21 * along with this program; if not, write to the Free Software
  22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
  23 * USA
  24 *
  25 */
  26
  27#include <linux/linkage.h>
  28
  29.file "serpent-sse2-i586-asm_32.S"
  30.text
  31
  /*
   * Byte offsets from %esp at which the two entry points below read their
   * arguments (before anything is pushed).  arg_xor is only read by
   * __serpent_enc_blk_4way.
   * NOTE(review): offset 0 is presumably the return address, i.e. the
   * arguments are passed on the stack - confirm against the C prototypes.
   */
  32#define arg_ctx 4
  33#define arg_dst 8
  34#define arg_src 12
  35#define arg_xor 16
  36
  37/**********************************************************************
  38  4-way SSE2 serpent
  39 **********************************************************************/
  /*
   * Register roles used throughout this file:
   *   CTX      - loaded from arg_ctx(%esp); base address for get_key.
   *   RA..RE   - the five working registers handed to the K/LK/KL and S-box
   *              macros.  Four carry state and one is spare; which physical
   *              register plays which role changes from step to step and is
   *              encoded in the argument order at each call site.
   *   RT0, RT1 - temporaries: key words in K/LK, scratch in the block
   *              transpose/XOR helpers.
   *   RNOT     - set to all ones (pcmpeqd RNOT, RNOT) at function entry, so
   *              "pxor RNOT, x" is a bitwise NOT of x.
   */
  40#define CTX %edx
  41
  42#define RA %xmm0
  43#define RB %xmm1
  44#define RC %xmm2
  45#define RD %xmm3
  46#define RE %xmm4
  47
  48#define RT0 %xmm5
  49#define RT1 %xmm6
  50
  51#define RNOT %xmm7
  52
  /*
   * get_key(i, j, t): load the 32-bit word at byte offset (4*i + j)*4 from
   * CTX into t and replicate it into all four dword lanes (pshufd $0).
   * In other words: word j of the i-th group of four words at CTX, broadcast
   * so the same key word is applied to every lane.
   */
  53#define get_key(i, j, t) \
  54        movd (4*(i)+(j))*4(CTX), t; \
  55        pshufd $0, t, t;
  56
  /*
   * K(x0, x1, x2, x3, x4, i): key mixing only.
   * XORs words 0..3 of key group i (see get_key) into x0..x3 respectively.
   * Clobbers x4, RT0 and RT1, which are used to hold the broadcast key words.
   */
  57#define K(x0, x1, x2, x3, x4, i) \
  58        get_key(i, 0, x4); \
  59        get_key(i, 1, RT0); \
  60        get_key(i, 2, RT1); \
  61        pxor x4,                x0; \
  62        pxor RT0,               x1; \
  63        pxor RT1,               x2; \
  64        get_key(i, 3, x4); \
  65        pxor x4,                x3;
  66
  /*
   * LK(x0, x1, x2, x3, x4, i): linear mixing of x0..x3 followed by XOR with
   * key group i.  Used after every S-box in the encryption routine.
   *
   * Each movdqa/pslld/psrld/por quartet is a 32-bit left rotate of every
   * dword lane (written rol below).  In order, the macro computes:
   *      x0 = rol(x0, 13)
   *      x2 = rol(x2, 3)
   *      x1 = rol(x1 ^ x0 ^ x2, 1)
   *      x3 = rol(x3 ^ x2 ^ (x0 << 3), 7)
   *      x0 = rol(x0 ^ x1 ^ x3, 5)
   *      x2 = rol(x2 ^ x3 ^ (x1 << 7), 22)
   * and then XORs key words 0..3 of group i into x0..x3.  The key XORs for
   * x1/x3 are interleaved with the last two rotates; the result is the same
   * because those registers are already final at that point.
   *
   * Clobbers x4 (rotate scratch) and RT0 (key word).
   */
  67#define LK(x0, x1, x2, x3, x4, i) \
  68        movdqa x0,              x4; \
  69        pslld $13,              x0; \
  70        psrld $(32 - 13),       x4; \
  71        por x4,                 x0; \
  72        pxor x0,                x1; \
  73        movdqa x2,              x4; \
  74        pslld $3,               x2; \
  75        psrld $(32 - 3),        x4; \
  76        por x4,                 x2; \
  77        pxor x2,                x1; \
  78        movdqa x1,              x4; \
  79        pslld $1,               x1; \
  80        psrld $(32 - 1),        x4; \
  81        por x4,                 x1; \
  82        movdqa x0,              x4; \
  83        pslld $3,               x4; \
  84        pxor x2,                x3; \
  85        pxor x4,                x3; \
  86        movdqa x3,              x4; \
  87        pslld $7,               x3; \
  88        psrld $(32 - 7),        x4; \
  89        por x4,                 x3; \
  90        movdqa x1,              x4; \
  91        pslld $7,               x4; \
  92        pxor x1,                x0; \
  93        pxor x3,                x0; \
  94        pxor x3,                x2; \
  95        pxor x4,                x2; \
  96        movdqa x0,              x4; \
  97        get_key(i, 1, RT0); \
  98        pxor RT0,               x1; \
  99        get_key(i, 3, RT0); \
 100        pxor RT0,               x3; \
 101        pslld $5,               x0; \
 102        psrld $(32 - 5),        x4; \
 103        por x4,                 x0; \
 104        movdqa x2,              x4; \
 105        pslld $22,              x2; \
 106        psrld $(32 - 22),       x4; \
 107        por x4,                 x2; \
 108        get_key(i, 0, RT0); \
 109        pxor RT0,               x0; \
 110        get_key(i, 2, RT0); \
 111        pxor RT0,               x2;
 112
 /*
  * KL(x0, x1, x2, x3, x4, i): XOR with key group i (via K), then the linear
  * mixing of LK undone step by step.  Used after every inverse S-box in the
  * decryption routine.
  *
  * Each movdqa/psrld/pslld/por quartet is a 32-bit right rotate of every
  * dword lane (written ror below).  After the key XOR the macro computes:
  *      x0 = ror(x0, 5)
  *      x2 = ror(x2, 22)
  *      x2 ^= x3 ^ (x1 << 7)
  *      x0 ^= x3 ^ x1
  *      x1 = ror(x1, 1)
  *      x3 = ror(x3, 7)
  *      x1 ^= x0
  *      x3 ^= (x0 << 3)
  *      x0 = ror(x0, 13)
  *      x1 ^= x2
  *      x3 ^= x2
  *      x2 = ror(x2, 3)
  *
  * Clobbers x4 (scratch) plus RT0 and RT1 (through K).
  */
 113#define KL(x0, x1, x2, x3, x4, i) \
 114        K(x0, x1, x2, x3, x4, i); \
 115        movdqa x0,              x4; \
 116        psrld $5,               x0; \
 117        pslld $(32 - 5),        x4; \
 118        por x4,                 x0; \
 119        movdqa x2,              x4; \
 120        psrld $22,              x2; \
 121        pslld $(32 - 22),       x4; \
 122        por x4,                 x2; \
 123        pxor x3,                x2; \
 124        pxor x3,                x0; \
 125        movdqa x1,              x4; \
 126        pslld $7,               x4; \
 127        pxor x1,                x0; \
 128        pxor x4,                x2; \
 129        movdqa x1,              x4; \
 130        psrld $1,               x1; \
 131        pslld $(32 - 1),        x4; \
 132        por x4,                 x1; \
 133        movdqa x3,              x4; \
 134        psrld $7,               x3; \
 135        pslld $(32 - 7),        x4; \
 136        por x4,                 x3; \
 137        pxor x0,                x1; \
 138        movdqa x0,              x4; \
 139        pslld $3,               x4; \
 140        pxor x4,                x3; \
 141        movdqa x0,              x4; \
 142        psrld $13,              x0; \
 143        pslld $(32 - 13),       x4; \
 144        por x4,                 x0; \
 145        pxor x2,                x1; \
 146        pxor x2,                x3; \
 147        movdqa x2,              x4; \
 148        psrld $3,               x2; \
 149        pslld $(32 - 3),        x4; \
 150        por x4,                 x2;
 151
 /*
  * S0(x0, x1, x2, x3, x4): S-box 0 stage of the encryption rounds, written
  * as a fixed sequence of pand/por/pxor on whole registers ("pxor RNOT" is a
  * bitwise NOT).  The instruction order is significant - do not reorder.
  * In:  x0..x3.  x4's incoming value is ignored (overwritten by the first
  *      movdqa before it is read).
  * Out: every caller in this file passes the result on as (x2, x1, x3, x0)
  *      and uses x4 as the spare register for the following LK.
  */
 152#define S0(x0, x1, x2, x3, x4) \
 153        movdqa x3,              x4; \
 154        por x0,                 x3; \
 155        pxor x4,                x0; \
 156        pxor x2,                x4; \
 157        pxor RNOT,              x4; \
 158        pxor x1,                x3; \
 159        pand x0,                x1; \
 160        pxor x4,                x1; \
 161        pxor x0,                x2; \
 162        pxor x3,                x0; \
 163        por x0,                 x4; \
 164        pxor x2,                x0; \
 165        pand x1,                x2; \
 166        pxor x2,                x3; \
 167        pxor RNOT,              x1; \
 168        pxor x4,                x2; \
 169        pxor x2,                x1;
 170
 /*
  * S1(x0, x1, x2, x3, x4): S-box 1 stage of the encryption rounds, written
  * as a fixed sequence of pand/por/pxor on whole registers ("pxor RNOT" is a
  * bitwise NOT).  The instruction order is significant - do not reorder.
  * In:  x0..x3.  x4's incoming value is ignored (overwritten by the first
  *      movdqa before it is read).
  * Out: every caller in this file passes the result on as (x4, x2, x3, x0)
  *      and uses x1 as the spare register for the following LK.
  */
 171#define S1(x0, x1, x2, x3, x4) \
 172        movdqa x1,              x4; \
 173        pxor x0,                x1; \
 174        pxor x3,                x0; \
 175        pxor RNOT,              x3; \
 176        pand x1,                x4; \
 177        por x1,                 x0; \
 178        pxor x2,                x3; \
 179        pxor x3,                x0; \
 180        pxor x3,                x1; \
 181        pxor x4,                x3; \
 182        por x4,                 x1; \
 183        pxor x2,                x4; \
 184        pand x0,                x2; \
 185        pxor x1,                x2; \
 186        por x0,                 x1; \
 187        pxor RNOT,              x0; \
 188        pxor x2,                x0; \
 189        pxor x1,                x4;
 190
 /*
  * S2(x0, x1, x2, x3, x4): S-box 2 stage of the encryption rounds, written
  * as a fixed sequence of pand/por/pxor on whole registers ("pxor RNOT" is a
  * bitwise NOT).  The instruction order is significant - do not reorder.
  * In:  x0..x3.  x4's incoming value is ignored (overwritten by the movdqa
  *      before it is read).
  * Out: every caller in this file passes the result on as (x4, x1, x0, x3)
  *      and uses x2 as the spare register for the following LK.
  */
 191#define S2(x0, x1, x2, x3, x4) \
 192        pxor RNOT,              x3; \
 193        pxor x0,                x1; \
 194        movdqa x0,              x4; \
 195        pand x2,                x0; \
 196        pxor x3,                x0; \
 197        por x4,                 x3; \
 198        pxor x1,                x2; \
 199        pxor x1,                x3; \
 200        pand x0,                x1; \
 201        pxor x2,                x0; \
 202        pand x3,                x2; \
 203        por x1,                 x3; \
 204        pxor RNOT,              x0; \
 205        pxor x0,                x3; \
 206        pxor x0,                x4; \
 207        pxor x2,                x0; \
 208        por x2,                 x1;
 209
 /*
  * S3(x0, x1, x2, x3, x4): S-box 3 stage of the encryption rounds, written
  * as a fixed sequence of pand/por/pxor on whole registers (this one does
  * not use RNOT).  The instruction order is significant - do not reorder.
  * In:  x0..x3.  x4's incoming value is ignored (overwritten by the first
  *      movdqa before it is read).
  * Out: every caller in this file passes the result on as (x3, x4, x1, x0)
  *      and uses x2 as the spare register for the following LK.
  */
 210#define S3(x0, x1, x2, x3, x4) \
 211        movdqa x1,              x4; \
 212        pxor x3,                x1; \
 213        por x0,                 x3; \
 214        pand x0,                x4; \
 215        pxor x2,                x0; \
 216        pxor x1,                x2; \
 217        pand x3,                x1; \
 218        pxor x3,                x2; \
 219        por x4,                 x0; \
 220        pxor x3,                x4; \
 221        pxor x0,                x1; \
 222        pand x3,                x0; \
 223        pand x4,                x3; \
 224        pxor x2,                x3; \
 225        por x1,                 x4; \
 226        pand x1,                x2; \
 227        pxor x3,                x4; \
 228        pxor x3,                x0; \
 229        pxor x2,                x3;
 230
 /*
  * S4(x0, x1, x2, x3, x4): S-box 4 stage of the encryption rounds, written
  * as a fixed sequence of pand/por/pxor on whole registers ("pxor RNOT" is a
  * bitwise NOT).  The instruction order is significant - do not reorder.
  * In:  x0..x3.  x4's incoming value is ignored (overwritten by the first
  *      movdqa before it is read).
  * Out: every caller in this file passes the result on as (x1, x2, x3, x4)
  *      and uses x0 as the spare register for the following LK.
  */
 231#define S4(x0, x1, x2, x3, x4) \
 232        movdqa x3,              x4; \
 233        pand x0,                x3; \
 234        pxor x4,                x0; \
 235        pxor x2,                x3; \
 236        por x4,                 x2; \
 237        pxor x1,                x0; \
 238        pxor x3,                x4; \
 239        por x0,                 x2; \
 240        pxor x1,                x2; \
 241        pand x0,                x1; \
 242        pxor x4,                x1; \
 243        pand x2,                x4; \
 244        pxor x3,                x2; \
 245        pxor x0,                x4; \
 246        por x1,                 x3; \
 247        pxor RNOT,              x1; \
 248        pxor x0,                x3;
 249
 /*
  * S5(x0, x1, x2, x3, x4): S-box 5 stage of the encryption rounds, written
  * as a fixed sequence of pand/por/pxor on whole registers ("pxor RNOT" is a
  * bitwise NOT).  The instruction order is significant - do not reorder.
  * In:  x0..x3.  x4's incoming value is ignored (overwritten by the first
  *      movdqa before it is read).
  * Out: every caller in this file passes the result on as (x4, x0, x1, x3)
  *      and uses x2 as the spare register for the following LK.
  */
 250#define S5(x0, x1, x2, x3, x4) \
 251        movdqa x1,              x4; \
 252        por x0,                 x1; \
 253        pxor x1,                x2; \
 254        pxor RNOT,              x3; \
 255        pxor x0,                x4; \
 256        pxor x2,                x0; \
 257        pand x4,                x1; \
 258        por x3,                 x4; \
 259        pxor x0,                x4; \
 260        pand x3,                x0; \
 261        pxor x3,                x1; \
 262        pxor x2,                x3; \
 263        pxor x1,                x0; \
 264        pand x4,                x2; \
 265        pxor x2,                x1; \
 266        pand x0,                x2; \
 267        pxor x2,                x3;
 268
 /*
  * S6(x0, x1, x2, x3, x4): S-box 6 stage of the encryption rounds, written
  * as a fixed sequence of pand/por/pxor on whole registers ("pxor RNOT" is a
  * bitwise NOT).  The instruction order is significant - do not reorder.
  * In:  x0..x3.  x4's incoming value is ignored (overwritten by the first
  *      movdqa before it is read).
  * Out: every caller in this file passes the result on as (x2, x4, x1, x3)
  *      and uses x0 as the spare register for the following LK.
  */
 269#define S6(x0, x1, x2, x3, x4) \
 270        movdqa x1,              x4; \
 271        pxor x0,                x3; \
 272        pxor x2,                x1; \
 273        pxor x0,                x2; \
 274        pand x3,                x0; \
 275        por x3,                 x1; \
 276        pxor RNOT,              x4; \
 277        pxor x1,                x0; \
 278        pxor x2,                x1; \
 279        pxor x4,                x3; \
 280        pxor x0,                x4; \
 281        pand x0,                x2; \
 282        pxor x1,                x4; \
 283        pxor x3,                x2; \
 284        pand x1,                x3; \
 285        pxor x0,                x3; \
 286        pxor x2,                x1;
 287
 /*
  * S7(x0, x1, x2, x3, x4): S-box 7 stage of the encryption rounds, written
  * as a fixed sequence of pand/por/pxor on whole registers ("pxor RNOT" is a
  * bitwise NOT).  The instruction order is significant - do not reorder.
  * In:  x0..x3.  x4's incoming value is ignored (overwritten by the movdqa
  *      before it is read).
  * Out: every caller in this file passes the result on as (x4, x2, x3, x0)
  *      and uses x1 as the spare register for the following LK / final K.
  */
 288#define S7(x0, x1, x2, x3, x4) \
 289        pxor RNOT,              x1; \
 290        movdqa x1,              x4; \
 291        pxor RNOT,              x0; \
 292        pand x2,                x1; \
 293        pxor x3,                x1; \
 294        por x4,                 x3; \
 295        pxor x2,                x4; \
 296        pxor x3,                x2; \
 297        pxor x0,                x3; \
 298        por x1,                 x0; \
 299        pand x0,                x2; \
 300        pxor x4,                x0; \
 301        pxor x3,                x4; \
 302        pand x0,                x3; \
 303        pxor x1,                x4; \
 304        pxor x4,                x2; \
 305        pxor x1,                x3; \
 306        por x0,                 x4; \
 307        pxor x1,                x4;
 308
 /*
  * SI0(x0, x1, x2, x3, x4): inverse S-box 0 stage of the decryption rounds,
  * written as a fixed sequence of pand/por/pxor on whole registers
  * ("pxor RNOT" is a bitwise NOT).  The instruction order is significant.
  * In:  x0..x3.  x4's incoming value is ignored (overwritten by the first
  *      movdqa before it is read).
  * Out: every caller in this file passes the result on as (x2, x4, x1, x0)
  *      and uses x3 as the spare register for the following KL / final K.
  */
 309#define SI0(x0, x1, x2, x3, x4) \
 310        movdqa x3,              x4; \
 311        pxor x0,                x1; \
 312        por x1,                 x3; \
 313        pxor x1,                x4; \
 314        pxor RNOT,              x0; \
 315        pxor x3,                x2; \
 316        pxor x0,                x3; \
 317        pand x1,                x0; \
 318        pxor x2,                x0; \
 319        pand x3,                x2; \
 320        pxor x4,                x3; \
 321        pxor x3,                x2; \
 322        pxor x3,                x1; \
 323        pand x0,                x3; \
 324        pxor x0,                x1; \
 325        pxor x2,                x0; \
 326        pxor x3,                x4;
 327
 /*
  * SI1(x0, x1, x2, x3, x4): inverse S-box 1 stage of the decryption rounds,
  * written as a fixed sequence of pand/por/pxor on whole registers
  * ("pxor RNOT" is a bitwise NOT).  The instruction order is significant.
  * In:  x0..x3.  x4's incoming value is ignored (overwritten by the movdqa
  *      before it is read).
  * Out: every caller in this file passes the result on as (x4, x1, x2, x3)
  *      and uses x0 as the spare register for the following KL.
  */
 328#define SI1(x0, x1, x2, x3, x4) \
 329        pxor x3,                x1; \
 330        movdqa x0,              x4; \
 331        pxor x2,                x0; \
 332        pxor RNOT,              x2; \
 333        por x1,                 x4; \
 334        pxor x3,                x4; \
 335        pand x1,                x3; \
 336        pxor x2,                x1; \
 337        pand x4,                x2; \
 338        pxor x1,                x4; \
 339        por x3,                 x1; \
 340        pxor x0,                x3; \
 341        pxor x0,                x2; \
 342        por x4,                 x0; \
 343        pxor x4,                x2; \
 344        pxor x0,                x1; \
 345        pxor x1,                x4;
 346
 /*
  * SI2(x0, x1, x2, x3, x4): inverse S-box 2 stage of the decryption rounds,
  * written as a fixed sequence of pand/por/pxor on whole registers
  * ("pxor RNOT" is a bitwise NOT).  The instruction order is significant.
  * In:  x0..x3.  x4's incoming value is ignored (overwritten by the movdqa
  *      before it is read).
  * Out: every caller in this file passes the result on as (x1, x4, x3, x2)
  *      and uses x0 as the spare register for the following KL.
  */
 347#define SI2(x0, x1, x2, x3, x4) \
 348        pxor x1,                x2; \
 349        movdqa x3,              x4; \
 350        pxor RNOT,              x3; \
 351        por x2,                 x3; \
 352        pxor x4,                x2; \
 353        pxor x0,                x4; \
 354        pxor x1,                x3; \
 355        por x2,                 x1; \
 356        pxor x0,                x2; \
 357        pxor x4,                x1; \
 358        por x3,                 x4; \
 359        pxor x3,                x2; \
 360        pxor x2,                x4; \
 361        pand x1,                x2; \
 362        pxor x3,                x2; \
 363        pxor x4,                x3; \
 364        pxor x0,                x4;
 365
 /*
  * SI3(x0, x1, x2, x3, x4): inverse S-box 3 stage of the decryption rounds,
  * written as a fixed sequence of pand/por/pxor on whole registers (this
  * one does not use RNOT).  The instruction order is significant.
  * In:  x0..x3.  x4's incoming value is ignored (overwritten by the movdqa
  *      before it is read).
  * Out: every caller in this file passes the result on as (x2, x0, x4, x3)
  *      and uses x1 as the spare register for the following KL.
  */
 366#define SI3(x0, x1, x2, x3, x4) \
 367        pxor x1,                x2; \
 368        movdqa x1,              x4; \
 369        pand x2,                x1; \
 370        pxor x0,                x1; \
 371        por x4,                 x0; \
 372        pxor x3,                x4; \
 373        pxor x3,                x0; \
 374        por x1,                 x3; \
 375        pxor x2,                x1; \
 376        pxor x3,                x1; \
 377        pxor x2,                x0; \
 378        pxor x3,                x2; \
 379        pand x1,                x3; \
 380        pxor x0,                x1; \
 381        pand x2,                x0; \
 382        pxor x3,                x4; \
 383        pxor x0,                x3; \
 384        pxor x1,                x0;
 385
 /*
  * SI4(x0, x1, x2, x3, x4): inverse S-box 4 stage of the decryption rounds,
  * written as a fixed sequence of pand/por/pxor on whole registers
  * ("pxor RNOT" is a bitwise NOT).  The instruction order is significant.
  * In:  x0..x3.  x4's incoming value is ignored (overwritten by the movdqa
  *      before it is read).
  * Out: every caller in this file passes the result on as (x0, x2, x4, x3)
  *      and uses x1 as the spare register for the following KL.
  */
 386#define SI4(x0, x1, x2, x3, x4) \
 387        pxor x3,                x2; \
 388        movdqa x0,              x4; \
 389        pand x1,                x0; \
 390        pxor x2,                x0; \
 391        por x3,                 x2; \
 392        pxor RNOT,              x4; \
 393        pxor x0,                x1; \
 394        pxor x2,                x0; \
 395        pand x4,                x2; \
 396        pxor x0,                x2; \
 397        por x4,                 x0; \
 398        pxor x3,                x0; \
 399        pand x2,                x3; \
 400        pxor x3,                x4; \
 401        pxor x1,                x3; \
 402        pand x0,                x1; \
 403        pxor x1,                x4; \
 404        pxor x3,                x0;
 405
 /*
  * SI5(x0, x1, x2, x3, x4): inverse S-box 5 stage of the decryption rounds,
  * written as a fixed sequence of pand/por/pxor on whole registers
  * ("pxor RNOT" is a bitwise NOT).  The instruction order is significant.
  * In:  x0..x3.  x4's incoming value is ignored (overwritten by the first
  *      movdqa before it is read).
  * Out: every caller in this file passes the result on as (x1, x4, x0, x2)
  *      and uses x3 as the spare register for the following KL.
  */
 406#define SI5(x0, x1, x2, x3, x4) \
 407        movdqa x1,              x4; \
 408        por x2,                 x1; \
 409        pxor x4,                x2; \
 410        pxor x3,                x1; \
 411        pand x4,                x3; \
 412        pxor x3,                x2; \
 413        por x0,                 x3; \
 414        pxor RNOT,              x0; \
 415        pxor x2,                x3; \
 416        por x0,                 x2; \
 417        pxor x1,                x4; \
 418        pxor x4,                x2; \
 419        pand x0,                x4; \
 420        pxor x1,                x0; \
 421        pxor x3,                x1; \
 422        pand x2,                x0; \
 423        pxor x3,                x2; \
 424        pxor x2,                x0; \
 425        pxor x4,                x2; \
 426        pxor x3,                x4;
 427
 /*
  * SI6(x0, x1, x2, x3, x4): inverse S-box 6 stage of the decryption rounds,
  * written as a fixed sequence of pand/por/pxor on whole registers
  * ("pxor RNOT" is a bitwise NOT).  The instruction order is significant.
  * In:  x0..x3.  x4's incoming value is ignored (overwritten by the movdqa
  *      before it is read).
  * Out: every caller in this file passes the result on as (x2, x4, x3, x0)
  *      and uses x1 as the spare register for the following KL.
  */
 428#define SI6(x0, x1, x2, x3, x4) \
 429        pxor x2,                x0; \
 430        movdqa x0,              x4; \
 431        pand x3,                x0; \
 432        pxor x3,                x2; \
 433        pxor x2,                x0; \
 434        pxor x1,                x3; \
 435        por x4,                 x2; \
 436        pxor x3,                x2; \
 437        pand x0,                x3; \
 438        pxor RNOT,              x0; \
 439        pxor x1,                x3; \
 440        pand x2,                x1; \
 441        pxor x0,                x4; \
 442        pxor x4,                x3; \
 443        pxor x2,                x4; \
 444        pxor x1,                x0; \
 445        pxor x0,                x2;
 446
 /*
  * SI7(x0, x1, x2, x3, x4): inverse S-box 7 stage of the decryption rounds,
  * written as a fixed sequence of pand/por/pxor on whole registers
  * ("pxor RNOT" is a bitwise NOT).  The instruction order is significant.
  * In:  x0..x3.  x4's incoming value is ignored (overwritten by the first
  *      movdqa before it is read).
  * Out: every caller in this file passes the result on as (x1, x3, x0, x4)
  *      and uses x2 as the spare register for the following KL.
  */
 447#define SI7(x0, x1, x2, x3, x4) \
 448        movdqa x3,              x4; \
 449        pand x0,                x3; \
 450        pxor x2,                x0; \
 451        por x4,                 x2; \
 452        pxor x1,                x4; \
 453        pxor RNOT,              x0; \
 454        por x3,                 x1; \
 455        pxor x0,                x4; \
 456        pand x2,                x0; \
 457        pxor x1,                x0; \
 458        pand x2,                x1; \
 459        pxor x2,                x3; \
 460        pxor x3,                x4; \
 461        pand x3,                x2; \
 462        por x0,                 x3; \
 463        pxor x4,                x1; \
 464        pxor x4,                x3; \
 465        pand x0,                x4; \
 466        pxor x2,                x4;
 467
 /*
  * transpose_4x4(x0, x1, x2, x3, t0, t1, t2): treat x0..x3 as the rows of a
  * 4x4 matrix of 32-bit words and transpose it in place, so that afterwards
  * xK holds dword K of each of the four original registers.
  * First the dwords are interleaved pairwise (punpck{l,h}dq), then the
  * resulting 64-bit halves are recombined (punpck{l,h}qdq).
  * Clobbers t1 and t2.  t0 is accepted for a uniform argument list but is
  * not used by this macro.
  */
 468#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
 469        movdqa x0,              t2; \
 470        punpckldq x1,           x0; \
 471        punpckhdq x1,           t2; \
 472        movdqa x2,              t1; \
 473        punpckhdq x3,           x2; \
 474        punpckldq x3,           t1; \
 475        movdqa x0,              x1; \
 476        punpcklqdq t1,          x0; \
 477        punpckhqdq t1,          x1; \
 478        movdqa t2,              x3; \
 479        punpcklqdq x2,          t2; \
 480        punpckhqdq x2,          x3; \
 481        movdqa t2,              x2;
 482
 /*
  * read_blocks(in, x0, x1, x2, x3, t0, t1, t2): load four consecutive
  * 16-byte blocks from the address in register `in` (unaligned loads, so no
  * alignment is required) and transpose them, leaving dword K of every block
  * in xK - one block per dword lane.
  * Clobbers t1 and t2 (through transpose_4x4).
  */
 483#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
 484        movdqu (0*4*4)(in),     x0; \
 485        movdqu (1*4*4)(in),     x1; \
 486        movdqu (2*4*4)(in),     x2; \
 487        movdqu (3*4*4)(in),     x3; \
 488        \
 489        transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
 490
 /*
  * write_blocks(out, x0, x1, x2, x3, t0, t1, t2): the reverse of
  * read_blocks.  Transposes x0..x3 back to one block per register and
  * stores the four 16-byte blocks consecutively at the address in register
  * `out` (unaligned stores).  Existing contents at `out` are overwritten.
  * Clobbers t1 and t2 (through transpose_4x4).
  */
 491#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
 492        transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
 493        \
 494        movdqu x0, (0*4*4)(out); \
 495        movdqu x1, (1*4*4)(out); \
 496        movdqu x2, (2*4*4)(out); \
 497        movdqu x3, (3*4*4)(out);
 498
 /*
  * xor_blocks(out, x0, x1, x2, x3, t0, t1, t2): like write_blocks, but each
  * 16-byte block is XORed with what is already stored at `out` before being
  * written back (out[k] ^= block k, for k = 0..3).
  * Clobbers t0 (holds the loaded destination block) plus t1 and t2
  * (through transpose_4x4).
  */
 499#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
 500        transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
 501        \
 502        movdqu (0*4*4)(out),    t0; \
 503        pxor t0,                x0; \
 504        movdqu x0,              (0*4*4)(out); \
 505        movdqu (1*4*4)(out),    t0; \
 506        pxor t0,                x1; \
 507        movdqu x1,              (1*4*4)(out); \
 508        movdqu (2*4*4)(out),    t0; \
 509        pxor t0,                x2; \
 510        movdqu x2,              (2*4*4)(out); \
 511        movdqu (3*4*4)(out),    t0; \
 512        pxor t0,                x3; \
 513        movdqu x3,              (3*4*4)(out);
 514
 /*
  * __serpent_enc_blk_4way: encrypt four 16-byte blocks in parallel.
  *
  * Reads 64 bytes from src, runs them through the key mixing / S-box /
  * linear-mixing sequence below using the key words at ctx, and then either
  * stores the 64-byte result at dst (arg_xor byte == 0) or XORs it into the
  * 64 bytes already at dst (arg_xor byte != 0).
  *
  * Structure: K with key group 0, then S0..S7 repeated four times, each
  * followed by LK with key groups 1..31; the last S7 is followed by a plain
  * K with key group 32 (no linear mixing).  The argument order at every
  * call encodes where the previous macro left its outputs, so the register
  * lists must not be edited independently of the S-box macros.
  *
  * Clobbers: %eax, %edx (CTX), %xmm0-%xmm7, flags.  Nothing is pushed, so
  * the arg_* offsets stay valid for the whole function.
  */
 515ENTRY(__serpent_enc_blk_4way)
 516        /* input:
 517         *      arg_ctx(%esp): ctx, CTX
 518         *      arg_dst(%esp): dst
 519         *      arg_src(%esp): src
 520         *      arg_xor(%esp): bool, if true: xor output
 521         */
 522
            /* RNOT = all ones; the S-box macros use it to compute NOT. */
 523        pcmpeqd RNOT, RNOT;
 524
 525        movl arg_ctx(%esp), CTX;
 526
            /* Load the four input blocks, one block per dword lane. */
 527        movl arg_src(%esp), %eax;
 528        read_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
 529
            /* Initial key mixing, then 31 x (S-box, linear mixing + key). */
 530                                         K(RA, RB, RC, RD, RE, 0);
 531        S0(RA, RB, RC, RD, RE);         LK(RC, RB, RD, RA, RE, 1);
 532        S1(RC, RB, RD, RA, RE);         LK(RE, RD, RA, RC, RB, 2);
 533        S2(RE, RD, RA, RC, RB);         LK(RB, RD, RE, RC, RA, 3);
 534        S3(RB, RD, RE, RC, RA);         LK(RC, RA, RD, RB, RE, 4);
 535        S4(RC, RA, RD, RB, RE);         LK(RA, RD, RB, RE, RC, 5);
 536        S5(RA, RD, RB, RE, RC);         LK(RC, RA, RD, RE, RB, 6);
 537        S6(RC, RA, RD, RE, RB);         LK(RD, RB, RA, RE, RC, 7);
 538        S7(RD, RB, RA, RE, RC);         LK(RC, RA, RE, RD, RB, 8);
 539        S0(RC, RA, RE, RD, RB);         LK(RE, RA, RD, RC, RB, 9);
 540        S1(RE, RA, RD, RC, RB);         LK(RB, RD, RC, RE, RA, 10);
 541        S2(RB, RD, RC, RE, RA);         LK(RA, RD, RB, RE, RC, 11);
 542        S3(RA, RD, RB, RE, RC);         LK(RE, RC, RD, RA, RB, 12);
 543        S4(RE, RC, RD, RA, RB);         LK(RC, RD, RA, RB, RE, 13);
 544        S5(RC, RD, RA, RB, RE);         LK(RE, RC, RD, RB, RA, 14);
 545        S6(RE, RC, RD, RB, RA);         LK(RD, RA, RC, RB, RE, 15);
 546        S7(RD, RA, RC, RB, RE);         LK(RE, RC, RB, RD, RA, 16);
 547        S0(RE, RC, RB, RD, RA);         LK(RB, RC, RD, RE, RA, 17);
 548        S1(RB, RC, RD, RE, RA);         LK(RA, RD, RE, RB, RC, 18);
 549        S2(RA, RD, RE, RB, RC);         LK(RC, RD, RA, RB, RE, 19);
 550        S3(RC, RD, RA, RB, RE);         LK(RB, RE, RD, RC, RA, 20);
 551        S4(RB, RE, RD, RC, RA);         LK(RE, RD, RC, RA, RB, 21);
 552        S5(RE, RD, RC, RA, RB);         LK(RB, RE, RD, RA, RC, 22);
 553        S6(RB, RE, RD, RA, RC);         LK(RD, RC, RE, RA, RB, 23);
 554        S7(RD, RC, RE, RA, RB);         LK(RB, RE, RA, RD, RC, 24);
 555        S0(RB, RE, RA, RD, RC);         LK(RA, RE, RD, RB, RC, 25);
 556        S1(RA, RE, RD, RB, RC);         LK(RC, RD, RB, RA, RE, 26);
 557        S2(RC, RD, RB, RA, RE);         LK(RE, RD, RC, RA, RB, 27);
 558        S3(RE, RD, RC, RA, RB);         LK(RA, RB, RD, RE, RC, 28);
 559        S4(RA, RB, RD, RE, RC);         LK(RB, RD, RE, RC, RA, 29);
 560        S5(RB, RD, RE, RC, RA);         LK(RA, RB, RD, RC, RE, 30);
 561        S6(RA, RB, RD, RC, RE);         LK(RD, RE, RB, RC, RA, 31);
            /* Last step: S-box followed by key mixing only. */
 562        S7(RD, RE, RB, RC, RA);          K(RA, RB, RC, RD, RE, 32);
 563
 564        movl arg_dst(%esp), %eax;
 565
            /* Only the low byte of the xor argument is tested. */
 566        cmpb $0, arg_xor(%esp);
 567        jnz .L__enc_xor4;
 568
            /* xor == 0: overwrite dst with the result. */
 569        write_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
 570
 571        ret;
 572
 573.L__enc_xor4:
            /* xor != 0: dst ^= result. */
 574        xor_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
 575
 576        ret;
 577ENDPROC(__serpent_enc_blk_4way)
 578
 /*
  * serpent_dec_blk_4way: decrypt four 16-byte blocks in parallel.
  *
  * Reads 64 bytes from src, undoes the encryption sequence using the key
  * words at ctx, and stores the 64-byte result at dst (always overwriting;
  * there is no xor variant here).
  *
  * Structure: K with key group 32, then SI7..SI0 repeated four times, each
  * followed by KL with key groups 31..1; the last SI0 is followed by a plain
  * K with key group 0.  The argument order at every call encodes where the
  * previous macro left its outputs, so the register lists must not be edited
  * independently of the inverse S-box macros.  The final state ends up in
  * (RC, RD, RB, RE), which is why write_blocks is called with that order.
  *
  * Clobbers: %eax, %edx (CTX), %xmm0-%xmm7.  Nothing is pushed, so the
  * arg_* offsets stay valid for the whole function.
  */
 579ENTRY(serpent_dec_blk_4way)
 580        /* input:
 581         *      arg_ctx(%esp): ctx, CTX
 582         *      arg_dst(%esp): dst
 583         *      arg_src(%esp): src
 584         */
 585
            /* RNOT = all ones; the inverse S-box macros use it for NOT. */
 586        pcmpeqd RNOT, RNOT;
 587
 588        movl arg_ctx(%esp), CTX;
 589
            /* Load the four input blocks, one block per dword lane. */
 590        movl arg_src(%esp), %eax;
 591        read_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
 592
            /* Key groups are consumed in descending order, 32 down to 0. */
 593                                         K(RA, RB, RC, RD, RE, 32);
 594        SI7(RA, RB, RC, RD, RE);        KL(RB, RD, RA, RE, RC, 31);
 595        SI6(RB, RD, RA, RE, RC);        KL(RA, RC, RE, RB, RD, 30);
 596        SI5(RA, RC, RE, RB, RD);        KL(RC, RD, RA, RE, RB, 29);
 597        SI4(RC, RD, RA, RE, RB);        KL(RC, RA, RB, RE, RD, 28);
 598        SI3(RC, RA, RB, RE, RD);        KL(RB, RC, RD, RE, RA, 27);
 599        SI2(RB, RC, RD, RE, RA);        KL(RC, RA, RE, RD, RB, 26);
 600        SI1(RC, RA, RE, RD, RB);        KL(RB, RA, RE, RD, RC, 25);
 601        SI0(RB, RA, RE, RD, RC);        KL(RE, RC, RA, RB, RD, 24);
 602        SI7(RE, RC, RA, RB, RD);        KL(RC, RB, RE, RD, RA, 23);
 603        SI6(RC, RB, RE, RD, RA);        KL(RE, RA, RD, RC, RB, 22);
 604        SI5(RE, RA, RD, RC, RB);        KL(RA, RB, RE, RD, RC, 21);
 605        SI4(RA, RB, RE, RD, RC);        KL(RA, RE, RC, RD, RB, 20);
 606        SI3(RA, RE, RC, RD, RB);        KL(RC, RA, RB, RD, RE, 19);
 607        SI2(RC, RA, RB, RD, RE);        KL(RA, RE, RD, RB, RC, 18);
 608        SI1(RA, RE, RD, RB, RC);        KL(RC, RE, RD, RB, RA, 17);
 609        SI0(RC, RE, RD, RB, RA);        KL(RD, RA, RE, RC, RB, 16);
 610        SI7(RD, RA, RE, RC, RB);        KL(RA, RC, RD, RB, RE, 15);
 611        SI6(RA, RC, RD, RB, RE);        KL(RD, RE, RB, RA, RC, 14);
 612        SI5(RD, RE, RB, RA, RC);        KL(RE, RC, RD, RB, RA, 13);
 613        SI4(RE, RC, RD, RB, RA);        KL(RE, RD, RA, RB, RC, 12);
 614        SI3(RE, RD, RA, RB, RC);        KL(RA, RE, RC, RB, RD, 11);
 615        SI2(RA, RE, RC, RB, RD);        KL(RE, RD, RB, RC, RA, 10);
 616        SI1(RE, RD, RB, RC, RA);        KL(RA, RD, RB, RC, RE, 9);
 617        SI0(RA, RD, RB, RC, RE);        KL(RB, RE, RD, RA, RC, 8);
 618        SI7(RB, RE, RD, RA, RC);        KL(RE, RA, RB, RC, RD, 7);
 619        SI6(RE, RA, RB, RC, RD);        KL(RB, RD, RC, RE, RA, 6);
 620        SI5(RB, RD, RC, RE, RA);        KL(RD, RA, RB, RC, RE, 5);
 621        SI4(RD, RA, RB, RC, RE);        KL(RD, RB, RE, RC, RA, 4);
 622        SI3(RD, RB, RE, RC, RA);        KL(RE, RD, RA, RC, RB, 3);
 623        SI2(RE, RD, RA, RC, RB);        KL(RD, RB, RC, RA, RE, 2);
 624        SI1(RD, RB, RC, RA, RE);        KL(RE, RB, RC, RA, RD, 1);
            /* Last step: inverse S-box followed by key mixing only. */
 625        SI0(RE, RB, RC, RA, RD);         K(RC, RD, RB, RE, RA, 0);
 626
            /* Result words live in RC, RD, RB, RE; RA is free as scratch. */
 627        movl arg_dst(%esp), %eax;
 628        write_blocks(%eax, RC, RD, RB, RE, RT0, RT1, RA);
 629
 630        ret;
 631ENDPROC(serpent_dec_blk_4way)
 632