linux/arch/x86/crypto/serpent-sse2-i586-asm_32.S
<<
>>
Prefs
   1/* SPDX-License-Identifier: GPL-2.0-or-later */
   2/*
   3 * Serpent Cipher 4-way parallel algorithm (i586/SSE2)
   4 *
   5 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
   6 *
   7 * Based on crypto/serpent.c by
   8 *  Copyright (C) 2002 Dag Arne Osvik <osvik@ii.uib.no>
   9 *                2003 Herbert Valerio Riedel <hvr@gnu.org>
  10 */
  11
  12#include <linux/linkage.h>
  13
  14.file "serpent-sse2-i586-asm_32.S"
  15.text
  16
  /*
   * Byte offsets, relative to %esp, at which the two entry points below read
   * their arguments (e.g. "movl arg_ctx(%esp), CTX").  Neither function
   * changes %esp before reading them, so these are the offsets in effect at
   * function entry, where 0(%esp) is the return address pushed by the call.
   * NOTE(review): this implies the C prototypes pass every argument on the
   * stack -- confirm against the C glue code that declares these functions.
   */
  17#define arg_ctx 4
  18#define arg_dst 8
  19#define arg_src 12
  20#define arg_xor 16
  21
  22/**********************************************************************
  23  4-way SSE2 serpent
  24 **********************************************************************/
  /*
   * Register roles shared by every macro and by both entry points:
   *
   *   CTX       base pointer from which get_key() fetches 32-bit key words.
   *   RA..RE    five working registers.  At any point four of them carry the
   *             cipher state and the fifth is scratch; which is which changes
   *             from step to step and is spelled out by the argument order of
   *             each macro invocation in the round sequences.
   *   RT0, RT1  temporaries used for key loading (K, LK) and passed as the
   *             t0/t1 scratch operands of the block load/store macros.
   *   RNOT      set to all-ones with "pcmpeqd RNOT, RNOT" at function entry
   *             and never written afterwards; "pxor RNOT, x" is therefore a
   *             bitwise NOT of x.
   */
  25#define CTX %edx
  26
  27#define RA %xmm0
  28#define RB %xmm1
  29#define RC %xmm2
  30#define RD %xmm3
  31#define RE %xmm4
  32
  33#define RT0 %xmm5
  34#define RT1 %xmm6
  35
  36#define RNOT %xmm7
  37
  /*
   * get_key(i, j, t): load the 32-bit word at index 4*i + j of the array that
   * CTX points to (byte offset (4*i + j) * 4) and replicate it into all four
   * 32-bit lanes of t ("pshufd $0" selects lane 0 for every output lane).
   * Clobbers only t.
   */
  38#define get_key(i, j, t) \
  39        movd (4*(i)+(j))*4(CTX), t; \
  40        pshufd $0, t, t;
  41
  /*
   * K(x0, x1, x2, x3, x4, i): key mixing.  XORs key words 0..3 of key number
   * i (see get_key) into x0..x3 respectively.
   *
   * x4 is used as a temporary and is left holding the broadcast key word 3;
   * RT0 and RT1 are clobbered as well (key words 1 and 2).
   */
  42#define K(x0, x1, x2, x3, x4, i) \
  43        get_key(i, 0, x4); \
  44        get_key(i, 1, RT0); \
  45        get_key(i, 2, RT1); \
  46        pxor x4,                x0; \
  47        pxor RT0,               x1; \
  48        pxor RT1,               x2; \
  49        get_key(i, 3, x4); \
  50        pxor x4,                x3;
  51
  /*
   * LK(x0, x1, x2, x3, x4, i): linear mixing of the four state words followed
   * by XOR with key number i.  SSE2 has no packed rotate, so every rotate is
   * built as copy / shift-left / shift-right / or, using x4 as the copy.
   *
   * Steps, in the order the code performs them (rol = rotate left on each
   * 32-bit lane):
   *   x0  = rol(x0, 13)
   *   x1 ^= x0
   *   x2  = rol(x2, 3)
   *   x1 ^= x2
   *   x1  = rol(x1, 1)
   *   x3 ^= x2 ^ (x0 << 3)
   *   x3  = rol(x3, 7)
   *   x0 ^= x1 ^ x3
   *   x2 ^= x3 ^ (x1 << 7)
   *   x1 ^= key[i].1,  x3 ^= key[i].3     (interleaved with the next rotate)
   *   x0  = rol(x0, 5)
   *   x2  = rol(x2, 22)
   *   x0 ^= key[i].0,  x2 ^= key[i].2
   *
   * Clobbers x4 and RT0.  The statement order is significant: several lines
   * reuse x4 immediately after the previous use has been consumed.
   */
  52#define LK(x0, x1, x2, x3, x4, i) \
  53        movdqa x0,              x4; \
  54        pslld $13,              x0; \
  55        psrld $(32 - 13),       x4; \
  56        por x4,                 x0; \
  57        pxor x0,                x1; \
  58        movdqa x2,              x4; \
  59        pslld $3,               x2; \
  60        psrld $(32 - 3),        x4; \
  61        por x4,                 x2; \
  62        pxor x2,                x1; \
  63        movdqa x1,              x4; \
  64        pslld $1,               x1; \
  65        psrld $(32 - 1),        x4; \
  66        por x4,                 x1; \
  67        movdqa x0,              x4; \
  68        pslld $3,               x4; \
  69        pxor x2,                x3; \
  70        pxor x4,                x3; \
  71        movdqa x3,              x4; \
  72        pslld $7,               x3; \
  73        psrld $(32 - 7),        x4; \
  74        por x4,                 x3; \
  75        movdqa x1,              x4; \
  76        pslld $7,               x4; \
  77        pxor x1,                x0; \
  78        pxor x3,                x0; \
  79        pxor x3,                x2; \
  80        pxor x4,                x2; \
  81        movdqa x0,              x4; \
  82        get_key(i, 1, RT0); \
  83        pxor RT0,               x1; \
  84        get_key(i, 3, RT0); \
  85        pxor RT0,               x3; \
  86        pslld $5,               x0; \
  87        psrld $(32 - 5),        x4; \
  88        por x4,                 x0; \
  89        movdqa x2,              x4; \
  90        pslld $22,              x2; \
  91        psrld $(32 - 22),       x4; \
  92        por x4,                 x2; \
  93        get_key(i, 0, RT0); \
  94        pxor RT0,               x0; \
  95        get_key(i, 2, RT0); \
  96        pxor RT0,               x2;
  97
  /*
   * KL(x0, x1, x2, x3, x4, i): XOR with key number i (via K), then the linear
   * mixing of LK undone step by step in reverse order.  Rotates are again
   * built from copy / shift / shift / or with x4 as the copy.
   *
   * Steps after K, in code order (ror = rotate right on each 32-bit lane):
   *   x0  = ror(x0, 5)
   *   x2  = ror(x2, 22)
   *   x2 ^= x3 ^ (x1 << 7)
   *   x0 ^= x3 ^ x1
   *   x1  = ror(x1, 1)
   *   x3  = ror(x3, 7)
   *   x1 ^= x0
   *   x3 ^= (x0 << 3)
   *   x0  = ror(x0, 13)
   *   x1 ^= x2
   *   x3 ^= x2
   *   x2  = ror(x2, 3)
   *
   * Clobbers x4, and RT0/RT1 through K.  The statement order is significant.
   */
  98#define KL(x0, x1, x2, x3, x4, i) \
  99        K(x0, x1, x2, x3, x4, i); \
 100        movdqa x0,              x4; \
 101        psrld $5,               x0; \
 102        pslld $(32 - 5),        x4; \
 103        por x4,                 x0; \
 104        movdqa x2,              x4; \
 105        psrld $22,              x2; \
 106        pslld $(32 - 22),       x4; \
 107        por x4,                 x2; \
 108        pxor x3,                x2; \
 109        pxor x3,                x0; \
 110        movdqa x1,              x4; \
 111        pslld $7,               x4; \
 112        pxor x1,                x0; \
 113        pxor x4,                x2; \
 114        movdqa x1,              x4; \
 115        psrld $1,               x1; \
 116        pslld $(32 - 1),        x4; \
 117        por x4,                 x1; \
 118        movdqa x3,              x4; \
 119        psrld $7,               x3; \
 120        pslld $(32 - 7),        x4; \
 121        por x4,                 x3; \
 122        pxor x0,                x1; \
 123        movdqa x0,              x4; \
 124        pslld $3,               x4; \
 125        pxor x4,                x3; \
 126        movdqa x0,              x4; \
 127        psrld $13,              x0; \
 128        pslld $(32 - 13),       x4; \
 129        por x4,                 x0; \
 130        pxor x2,                x1; \
 131        pxor x2,                x3; \
 132        movdqa x2,              x4; \
 133        psrld $3,               x2; \
 134        pslld $(32 - 3),        x4; \
 135        por x4,                 x2;
 136
 /*
  * S0(x0, x1, x2, x3, x4): substitution step of encryption rounds 0, 8, 16
  * and 24.  Uses only a register copy and pand/por/pxor, so every bit
  * position of the 128-bit registers is processed independently.
  *   in:   x0..x3 (x4 is overwritten before it is read)
  *   out:  (x2, x1, x3, x0), the order in which the following LK() takes
  *         them; x4 is left holding a dead intermediate
  *   uses: RNOT as the all-ones constant for bitwise NOT
  * The instruction order is significant; do not reorder.
  * NOTE(review): presumably Serpent S-box 0 in bitsliced form -- the boolean
  * network was not re-derived against the specification here.
  */
 137#define S0(x0, x1, x2, x3, x4) \
 138        movdqa x3,              x4; \
 139        por x0,                 x3; \
 140        pxor x4,                x0; \
 141        pxor x2,                x4; \
 142        pxor RNOT,              x4; \
 143        pxor x1,                x3; \
 144        pand x0,                x1; \
 145        pxor x4,                x1; \
 146        pxor x0,                x2; \
 147        pxor x3,                x0; \
 148        por x0,                 x4; \
 149        pxor x2,                x0; \
 150        pand x1,                x2; \
 151        pxor x2,                x3; \
 152        pxor RNOT,              x1; \
 153        pxor x4,                x2; \
 154        pxor x2,                x1;
 155
 /*
  * S1(x0, x1, x2, x3, x4): substitution step of encryption rounds 1, 9, 17
  * and 25, computed bitwise with a register copy and pand/por/pxor.
  *   in:   x0..x3 (x4 is overwritten before it is read)
  *   out:  (x4, x2, x3, x0), the order in which the following LK() takes
  *         them; x1 is left holding a dead intermediate
  *   uses: RNOT as the all-ones constant for bitwise NOT
  * The instruction order is significant; do not reorder.
  * NOTE(review): presumably Serpent S-box 1 in bitsliced form -- not
  * re-derived against the specification here.
  */
 156#define S1(x0, x1, x2, x3, x4) \
 157        movdqa x1,              x4; \
 158        pxor x0,                x1; \
 159        pxor x3,                x0; \
 160        pxor RNOT,              x3; \
 161        pand x1,                x4; \
 162        por x1,                 x0; \
 163        pxor x2,                x3; \
 164        pxor x3,                x0; \
 165        pxor x3,                x1; \
 166        pxor x4,                x3; \
 167        por x4,                 x1; \
 168        pxor x2,                x4; \
 169        pand x0,                x2; \
 170        pxor x1,                x2; \
 171        por x0,                 x1; \
 172        pxor RNOT,              x0; \
 173        pxor x2,                x0; \
 174        pxor x1,                x4;
 175
 /*
  * S2(x0, x1, x2, x3, x4): substitution step of encryption rounds 2, 10, 18
  * and 26, computed bitwise with a register copy and pand/por/pxor.
  *   in:   x0..x3 (x4 is overwritten before it is read)
  *   out:  (x4, x1, x0, x3), the order in which the following LK() takes
  *         them; x2 is left holding a dead intermediate
  *   uses: RNOT as the all-ones constant for bitwise NOT
  * The instruction order is significant; do not reorder.
  * NOTE(review): presumably Serpent S-box 2 in bitsliced form -- not
  * re-derived against the specification here.
  */
 176#define S2(x0, x1, x2, x3, x4) \
 177        pxor RNOT,              x3; \
 178        pxor x0,                x1; \
 179        movdqa x0,              x4; \
 180        pand x2,                x0; \
 181        pxor x3,                x0; \
 182        por x4,                 x3; \
 183        pxor x1,                x2; \
 184        pxor x1,                x3; \
 185        pand x0,                x1; \
 186        pxor x2,                x0; \
 187        pand x3,                x2; \
 188        por x1,                 x3; \
 189        pxor RNOT,              x0; \
 190        pxor x0,                x3; \
 191        pxor x0,                x4; \
 192        pxor x2,                x0; \
 193        por x2,                 x1;
 194
 /*
  * S3(x0, x1, x2, x3, x4): substitution step of encryption rounds 3, 11, 19
  * and 27, computed bitwise with a register copy and pand/por/pxor.  Unlike
  * most of the other substitution macros it does not reference RNOT.
  *   in:   x0..x3 (x4 is overwritten before it is read)
  *   out:  (x3, x4, x1, x0), the order in which the following LK() takes
  *         them; x2 is left holding a dead intermediate
  * The instruction order is significant; do not reorder.
  * NOTE(review): presumably Serpent S-box 3 in bitsliced form -- not
  * re-derived against the specification here.
  */
 195#define S3(x0, x1, x2, x3, x4) \
 196        movdqa x1,              x4; \
 197        pxor x3,                x1; \
 198        por x0,                 x3; \
 199        pand x0,                x4; \
 200        pxor x2,                x0; \
 201        pxor x1,                x2; \
 202        pand x3,                x1; \
 203        pxor x3,                x2; \
 204        por x4,                 x0; \
 205        pxor x3,                x4; \
 206        pxor x0,                x1; \
 207        pand x3,                x0; \
 208        pand x4,                x3; \
 209        pxor x2,                x3; \
 210        por x1,                 x4; \
 211        pand x1,                x2; \
 212        pxor x3,                x4; \
 213        pxor x3,                x0; \
 214        pxor x2,                x3;
 215
 /*
  * S4(x0, x1, x2, x3, x4): substitution step of encryption rounds 4, 12, 20
  * and 28, computed bitwise with a register copy and pand/por/pxor.
  *   in:   x0..x3 (x4 is overwritten before it is read)
  *   out:  (x1, x2, x3, x4), the order in which the following LK() takes
  *         them; x0 is left holding a dead intermediate
  *   uses: RNOT as the all-ones constant for bitwise NOT
  * The instruction order is significant; do not reorder.
  * NOTE(review): presumably Serpent S-box 4 in bitsliced form -- not
  * re-derived against the specification here.
  */
 216#define S4(x0, x1, x2, x3, x4) \
 217        movdqa x3,              x4; \
 218        pand x0,                x3; \
 219        pxor x4,                x0; \
 220        pxor x2,                x3; \
 221        por x4,                 x2; \
 222        pxor x1,                x0; \
 223        pxor x3,                x4; \
 224        por x0,                 x2; \
 225        pxor x1,                x2; \
 226        pand x0,                x1; \
 227        pxor x4,                x1; \
 228        pand x2,                x4; \
 229        pxor x3,                x2; \
 230        pxor x0,                x4; \
 231        por x1,                 x3; \
 232        pxor RNOT,              x1; \
 233        pxor x0,                x3;
 234
 /*
  * S5(x0, x1, x2, x3, x4): substitution step of encryption rounds 5, 13, 21
  * and 29, computed bitwise with a register copy and pand/por/pxor.
  *   in:   x0..x3 (x4 is overwritten before it is read)
  *   out:  (x4, x0, x1, x3), the order in which the following LK() takes
  *         them; x2 is left holding a dead intermediate
  *   uses: RNOT as the all-ones constant for bitwise NOT
  * The instruction order is significant; do not reorder.
  * NOTE(review): presumably Serpent S-box 5 in bitsliced form -- not
  * re-derived against the specification here.
  */
 235#define S5(x0, x1, x2, x3, x4) \
 236        movdqa x1,              x4; \
 237        por x0,                 x1; \
 238        pxor x1,                x2; \
 239        pxor RNOT,              x3; \
 240        pxor x0,                x4; \
 241        pxor x2,                x0; \
 242        pand x4,                x1; \
 243        por x3,                 x4; \
 244        pxor x0,                x4; \
 245        pand x3,                x0; \
 246        pxor x3,                x1; \
 247        pxor x2,                x3; \
 248        pxor x1,                x0; \
 249        pand x4,                x2; \
 250        pxor x2,                x1; \
 251        pand x0,                x2; \
 252        pxor x2,                x3;
 253
 /*
  * S6(x0, x1, x2, x3, x4): substitution step of encryption rounds 6, 14, 22
  * and 30, computed bitwise with a register copy and pand/por/pxor.
  *   in:   x0..x3 (x4 is overwritten before it is read)
  *   out:  (x2, x4, x1, x3), the order in which the following LK() takes
  *         them; x0 is left holding a dead intermediate
  *   uses: RNOT as the all-ones constant for bitwise NOT
  * The instruction order is significant; do not reorder.
  * NOTE(review): presumably Serpent S-box 6 in bitsliced form -- not
  * re-derived against the specification here.
  */
 254#define S6(x0, x1, x2, x3, x4) \
 255        movdqa x1,              x4; \
 256        pxor x0,                x3; \
 257        pxor x2,                x1; \
 258        pxor x0,                x2; \
 259        pand x3,                x0; \
 260        por x3,                 x1; \
 261        pxor RNOT,              x4; \
 262        pxor x1,                x0; \
 263        pxor x2,                x1; \
 264        pxor x4,                x3; \
 265        pxor x0,                x4; \
 266        pand x0,                x2; \
 267        pxor x1,                x4; \
 268        pxor x3,                x2; \
 269        pand x1,                x3; \
 270        pxor x0,                x3; \
 271        pxor x2,                x1;
 272
 /*
  * S7(x0, x1, x2, x3, x4): substitution step of encryption rounds 7, 15, 23
  * and 31, computed bitwise with a register copy and pand/por/pxor.
  *   in:   x0..x3 (x4 is overwritten before it is read)
  *   out:  (x4, x2, x3, x0), the order in which the following LK() -- or the
  *         final K() after round 31 -- takes them; x1 is left holding a dead
  *         intermediate
  *   uses: RNOT as the all-ones constant for bitwise NOT
  * The instruction order is significant; do not reorder.
  * NOTE(review): presumably Serpent S-box 7 in bitsliced form -- not
  * re-derived against the specification here.
  */
 273#define S7(x0, x1, x2, x3, x4) \
 274        pxor RNOT,              x1; \
 275        movdqa x1,              x4; \
 276        pxor RNOT,              x0; \
 277        pand x2,                x1; \
 278        pxor x3,                x1; \
 279        por x4,                 x3; \
 280        pxor x2,                x4; \
 281        pxor x3,                x2; \
 282        pxor x0,                x3; \
 283        por x1,                 x0; \
 284        pand x0,                x2; \
 285        pxor x4,                x0; \
 286        pxor x3,                x4; \
 287        pand x0,                x3; \
 288        pxor x1,                x4; \
 289        pxor x4,                x2; \
 290        pxor x1,                x3; \
 291        por x0,                 x4; \
 292        pxor x1,                x4;
 293
 /*
  * SI0(x0, x1, x2, x3, x4): substitution step used four times in decryption,
  * each time just before the key with index 24, 16, 8 or 0 is applied.
  * Computed bitwise with a register copy and pand/por/pxor.
  *   in:   x0..x3 (x4 is overwritten before it is read)
  *   out:  (x2, x4, x1, x0), the order in which the following KL() -- or the
  *         final K() -- takes them; x3 is left holding a dead intermediate
  *   uses: RNOT as the all-ones constant for bitwise NOT
  * The instruction order is significant; do not reorder.
  * NOTE(review): presumably the inverse of Serpent S-box 0 in bitsliced form
  * -- not re-derived against the specification here.
  */
 294#define SI0(x0, x1, x2, x3, x4) \
 295        movdqa x3,              x4; \
 296        pxor x0,                x1; \
 297        por x1,                 x3; \
 298        pxor x1,                x4; \
 299        pxor RNOT,              x0; \
 300        pxor x3,                x2; \
 301        pxor x0,                x3; \
 302        pand x1,                x0; \
 303        pxor x2,                x0; \
 304        pand x3,                x2; \
 305        pxor x4,                x3; \
 306        pxor x3,                x2; \
 307        pxor x3,                x1; \
 308        pand x0,                x3; \
 309        pxor x0,                x1; \
 310        pxor x2,                x0; \
 311        pxor x3,                x4;
 312
 /*
  * SI1(x0, x1, x2, x3, x4): substitution step used four times in decryption,
  * each time just before KL() with key index 25, 17, 9 or 1.  Computed
  * bitwise with a register copy and pand/por/pxor.
  *   in:   x0..x3 (x4 is overwritten before it is read)
  *   out:  (x4, x1, x2, x3), the order in which the following KL() takes
  *         them; x0 is left holding a dead intermediate
  *   uses: RNOT as the all-ones constant for bitwise NOT
  * The instruction order is significant; do not reorder.
  * NOTE(review): presumably the inverse of Serpent S-box 1 in bitsliced form
  * -- not re-derived against the specification here.
  */
 313#define SI1(x0, x1, x2, x3, x4) \
 314        pxor x3,                x1; \
 315        movdqa x0,              x4; \
 316        pxor x2,                x0; \
 317        pxor RNOT,              x2; \
 318        por x1,                 x4; \
 319        pxor x3,                x4; \
 320        pand x1,                x3; \
 321        pxor x2,                x1; \
 322        pand x4,                x2; \
 323        pxor x1,                x4; \
 324        por x3,                 x1; \
 325        pxor x0,                x3; \
 326        pxor x0,                x2; \
 327        por x4,                 x0; \
 328        pxor x4,                x2; \
 329        pxor x0,                x1; \
 330        pxor x1,                x4;
 331
 /*
  * SI2(x0, x1, x2, x3, x4): substitution step used four times in decryption,
  * each time just before KL() with key index 26, 18, 10 or 2.  Computed
  * bitwise with a register copy and pand/por/pxor.
  *   in:   x0..x3 (x4 is overwritten before it is read)
  *   out:  (x1, x4, x3, x2), the order in which the following KL() takes
  *         them; x0 is left holding a dead intermediate
  *   uses: RNOT as the all-ones constant for bitwise NOT
  * The instruction order is significant; do not reorder.
  * NOTE(review): presumably the inverse of Serpent S-box 2 in bitsliced form
  * -- not re-derived against the specification here.
  */
 332#define SI2(x0, x1, x2, x3, x4) \
 333        pxor x1,                x2; \
 334        movdqa x3,              x4; \
 335        pxor RNOT,              x3; \
 336        por x2,                 x3; \
 337        pxor x4,                x2; \
 338        pxor x0,                x4; \
 339        pxor x1,                x3; \
 340        por x2,                 x1; \
 341        pxor x0,                x2; \
 342        pxor x4,                x1; \
 343        por x3,                 x4; \
 344        pxor x3,                x2; \
 345        pxor x2,                x4; \
 346        pand x1,                x2; \
 347        pxor x3,                x2; \
 348        pxor x4,                x3; \
 349        pxor x0,                x4;
 350
 /*
  * SI3(x0, x1, x2, x3, x4): substitution step used four times in decryption,
  * each time just before KL() with key index 27, 19, 11 or 3.  Computed
  * bitwise with a register copy and pand/por/pxor; it does not reference
  * RNOT.
  *   in:   x0..x3 (x4 is overwritten before it is read)
  *   out:  (x2, x0, x4, x3), the order in which the following KL() takes
  *         them; x1 is left holding a dead intermediate
  * The instruction order is significant; do not reorder.
  * NOTE(review): presumably the inverse of Serpent S-box 3 in bitsliced form
  * -- not re-derived against the specification here.
  */
 351#define SI3(x0, x1, x2, x3, x4) \
 352        pxor x1,                x2; \
 353        movdqa x1,              x4; \
 354        pand x2,                x1; \
 355        pxor x0,                x1; \
 356        por x4,                 x0; \
 357        pxor x3,                x4; \
 358        pxor x3,                x0; \
 359        por x1,                 x3; \
 360        pxor x2,                x1; \
 361        pxor x3,                x1; \
 362        pxor x2,                x0; \
 363        pxor x3,                x2; \
 364        pand x1,                x3; \
 365        pxor x0,                x1; \
 366        pand x2,                x0; \
 367        pxor x3,                x4; \
 368        pxor x0,                x3; \
 369        pxor x1,                x0;
 370
 /*
  * SI4(x0, x1, x2, x3, x4): substitution step used four times in decryption,
  * each time just before KL() with key index 28, 20, 12 or 4.  Computed
  * bitwise with a register copy and pand/por/pxor.
  *   in:   x0..x3 (x4 is overwritten before it is read)
  *   out:  (x0, x2, x4, x3), the order in which the following KL() takes
  *         them; x1 is left holding a dead intermediate
  *   uses: RNOT as the all-ones constant for bitwise NOT
  * The instruction order is significant; do not reorder.
  * NOTE(review): presumably the inverse of Serpent S-box 4 in bitsliced form
  * -- not re-derived against the specification here.
  */
 371#define SI4(x0, x1, x2, x3, x4) \
 372        pxor x3,                x2; \
 373        movdqa x0,              x4; \
 374        pand x1,                x0; \
 375        pxor x2,                x0; \
 376        por x3,                 x2; \
 377        pxor RNOT,              x4; \
 378        pxor x0,                x1; \
 379        pxor x2,                x0; \
 380        pand x4,                x2; \
 381        pxor x0,                x2; \
 382        por x4,                 x0; \
 383        pxor x3,                x0; \
 384        pand x2,                x3; \
 385        pxor x3,                x4; \
 386        pxor x1,                x3; \
 387        pand x0,                x1; \
 388        pxor x1,                x4; \
 389        pxor x3,                x0;
 390
 /*
  * SI5(x0, x1, x2, x3, x4): substitution step used four times in decryption,
  * each time just before KL() with key index 29, 21, 13 or 5.  Computed
  * bitwise with a register copy and pand/por/pxor.
  *   in:   x0..x3 (x4 is overwritten before it is read)
  *   out:  (x1, x4, x0, x2), the order in which the following KL() takes
  *         them; x3 is left holding a dead intermediate
  *   uses: RNOT as the all-ones constant for bitwise NOT
  * The instruction order is significant; do not reorder.
  * NOTE(review): presumably the inverse of Serpent S-box 5 in bitsliced form
  * -- not re-derived against the specification here.
  */
 391#define SI5(x0, x1, x2, x3, x4) \
 392        movdqa x1,              x4; \
 393        por x2,                 x1; \
 394        pxor x4,                x2; \
 395        pxor x3,                x1; \
 396        pand x4,                x3; \
 397        pxor x3,                x2; \
 398        por x0,                 x3; \
 399        pxor RNOT,              x0; \
 400        pxor x2,                x3; \
 401        por x0,                 x2; \
 402        pxor x1,                x4; \
 403        pxor x4,                x2; \
 404        pand x0,                x4; \
 405        pxor x1,                x0; \
 406        pxor x3,                x1; \
 407        pand x2,                x0; \
 408        pxor x3,                x2; \
 409        pxor x2,                x0; \
 410        pxor x4,                x2; \
 411        pxor x3,                x4;
 412
 /*
  * SI6(x0, x1, x2, x3, x4): substitution step used four times in decryption,
  * each time just before KL() with key index 30, 22, 14 or 6.  Computed
  * bitwise with a register copy and pand/por/pxor.
  *   in:   x0..x3 (x4 is overwritten before it is read)
  *   out:  (x2, x4, x3, x0), the order in which the following KL() takes
  *         them; x1 is left holding a dead intermediate
  *   uses: RNOT as the all-ones constant for bitwise NOT
  * The instruction order is significant; do not reorder.
  * NOTE(review): presumably the inverse of Serpent S-box 6 in bitsliced form
  * -- not re-derived against the specification here.
  */
 413#define SI6(x0, x1, x2, x3, x4) \
 414        pxor x2,                x0; \
 415        movdqa x0,              x4; \
 416        pand x3,                x0; \
 417        pxor x3,                x2; \
 418        pxor x2,                x0; \
 419        pxor x1,                x3; \
 420        por x4,                 x2; \
 421        pxor x3,                x2; \
 422        pand x0,                x3; \
 423        pxor RNOT,              x0; \
 424        pxor x1,                x3; \
 425        pand x2,                x1; \
 426        pxor x0,                x4; \
 427        pxor x4,                x3; \
 428        pxor x2,                x4; \
 429        pxor x1,                x0; \
 430        pxor x0,                x2;
 431
 /*
  * SI7(x0, x1, x2, x3, x4): substitution step used four times in decryption,
  * each time just before KL() with key index 31, 23, 15 or 7.  Computed
  * bitwise with a register copy and pand/por/pxor.
  *   in:   x0..x3 (x4 is overwritten before it is read)
  *   out:  (x1, x3, x0, x4), the order in which the following KL() takes
  *         them; x2 is left holding a dead intermediate
  *   uses: RNOT as the all-ones constant for bitwise NOT
  * The instruction order is significant; do not reorder.
  * NOTE(review): presumably the inverse of Serpent S-box 7 in bitsliced form
  * -- not re-derived against the specification here.
  */
 432#define SI7(x0, x1, x2, x3, x4) \
 433        movdqa x3,              x4; \
 434        pand x0,                x3; \
 435        pxor x2,                x0; \
 436        por x4,                 x2; \
 437        pxor x1,                x4; \
 438        pxor RNOT,              x0; \
 439        por x3,                 x1; \
 440        pxor x0,                x4; \
 441        pand x2,                x0; \
 442        pxor x1,                x0; \
 443        pand x2,                x1; \
 444        pxor x2,                x3; \
 445        pxor x3,                x4; \
 446        pand x3,                x2; \
 447        por x0,                 x3; \
 448        pxor x4,                x1; \
 449        pxor x4,                x3; \
 450        pand x0,                x4; \
 451        pxor x2,                x4;
 452
 /*
  * transpose_4x4(x0, x1, x2, x3, t0, t1, t2): in-place transpose of the 4x4
  * matrix of 32-bit elements whose rows are x0..x3.  With rows
  *   x0 = [a0 a1 a2 a3], x1 = [b0 ..], x2 = [c0 ..], x3 = [d0 ..]
  * the result is
  *   x0 = [a0 b0 c0 d0], x1 = [a1 b1 c1 d1],
  *   x2 = [a2 b2 c2 d2], x3 = [a3 b3 c3 d3].
  * The first half interleaves 32-bit elements of row pairs (punpck{l,h}dq),
  * the second half interleaves the resulting 64-bit halves (punpck{l,h}qdq).
  *
  * Clobbers t1 and t2.  t0 is accepted but never referenced.
  */
 453#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
 454        movdqa x0,              t2; \
 455        punpckldq x1,           x0; \
 456        punpckhdq x1,           t2; \
 457        movdqa x2,              t1; \
 458        punpckhdq x3,           x2; \
 459        punpckldq x3,           t1; \
 460        movdqa x0,              x1; \
 461        punpcklqdq t1,          x0; \
 462        punpckhqdq t1,          x1; \
 463        movdqa t2,              x3; \
 464        punpcklqdq x2,          t2; \
 465        punpckhqdq x2,          x3; \
 466        movdqa t2,              x2;
 467
 /*
  * read_blocks(in, x0, x1, x2, x3, t0, t1, t2): load four consecutive 16-byte
  * vectors from the address in register `in` (unaligned loads, so no
  * alignment is required) and transpose them.  Afterwards x0 holds 32-bit
  * word 0 of each of the four vectors, x1 word 1, and so on, i.e. each lane
  * position corresponds to one input vector.
  *
  * Clobbers t1 and t2 (via transpose_4x4).
  */
 468#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
 469        movdqu (0*4*4)(in),     x0; \
 470        movdqu (1*4*4)(in),     x1; \
 471        movdqu (2*4*4)(in),     x2; \
 472        movdqu (3*4*4)(in),     x3; \
 473        \
 474        transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
 475
 /*
  * write_blocks(out, x0, x1, x2, x3, t0, t1, t2): counterpart of
  * read_blocks.  Transposes x0..x3 back so that each register holds one
  * complete 16-byte vector, then stores the four vectors consecutively at the
  * address in register `out` using unaligned stores.
  *
  * Clobbers t1 and t2 (via transpose_4x4); x0..x3 are left transposed.
  */
 476#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
 477        transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
 478        \
 479        movdqu x0, (0*4*4)(out); \
 480        movdqu x1, (1*4*4)(out); \
 481        movdqu x2, (2*4*4)(out); \
 482        movdqu x3, (3*4*4)(out);
 483
 /*
  * xor_blocks(out, x0, x1, x2, x3, t0, t1, t2): like write_blocks, but
  * instead of overwriting the destination it XORs the result into it.  After
  * transposing, each of the four 16-byte vectors at `out` is loaded into t0,
  * XORed into the matching register, and the register is stored back to the
  * same location.  Loads and stores are unaligned (movdqu).
  *
  * Clobbers t0, and t1/t2 via transpose_4x4.
  */
 484#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
 485        transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
 486        \
 487        movdqu (0*4*4)(out),    t0; \
 488        pxor t0,                x0; \
 489        movdqu x0,              (0*4*4)(out); \
 490        movdqu (1*4*4)(out),    t0; \
 491        pxor t0,                x1; \
 492        movdqu x1,              (1*4*4)(out); \
 493        movdqu (2*4*4)(out),    t0; \
 494        pxor t0,                x2; \
 495        movdqu x2,              (2*4*4)(out); \
 496        movdqu (3*4*4)(out),    t0; \
 497        pxor t0,                x3; \
 498        movdqu x3,              (3*4*4)(out);
 499
 /*
  * __serpent_enc_blk_4way: encrypt four 16-byte blocks in parallel.
  *
  * Reads 64 bytes from src, mixes in key 0, runs 32 rounds (the substitution
  * macros S0..S7 cycled four times; rounds 0..30 are followed by LK with keys
  * 1..31, round 31 by a plain K with key 32), and then either stores the
  * 64-byte result to dst or XORs it into dst.  The choice is made on the low
  * byte of the xor argument only (cmpb): non-zero selects the XOR path.
  *
  * The register arguments of each S/LK pair are hand-rotated: every macro
  * leaves its outputs in a permutation of RA..RE, and the next invocation
  * names the registers in that permuted order.  After the final K the state
  * is back in RA..RD with RE free as scratch.
  *
  * Clobbers: %eax, %edx, %xmm0-%xmm7, EFLAGS.  %esp is not modified and no
  * calls are made.  src and dst need not be aligned (movdqu is used).
  * NOTE(review): presumably the caller is responsible for making SSE
  * register use legal in this context -- confirm against the C glue code.
  */
 500SYM_FUNC_START(__serpent_enc_blk_4way)
 501        /* input:
 502         *      arg_ctx(%esp): ctx, CTX
 503         *      arg_dst(%esp): dst
 504         *      arg_src(%esp): src
 505         *      arg_xor(%esp): bool, if true: xor output
 506         */
 507
            /* RNOT = all-ones; used by the S macros as the NOT constant */
 508        pcmpeqd RNOT, RNOT;
 509
 510        movl arg_ctx(%esp), CTX;
 511
 512        movl arg_src(%esp), %eax;
 513        read_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
 514
            /* initial key mixing, then 32 rounds; last argument is the key index */
 515                                         K(RA, RB, RC, RD, RE, 0);
 516        S0(RA, RB, RC, RD, RE);         LK(RC, RB, RD, RA, RE, 1);
 517        S1(RC, RB, RD, RA, RE);         LK(RE, RD, RA, RC, RB, 2);
 518        S2(RE, RD, RA, RC, RB);         LK(RB, RD, RE, RC, RA, 3);
 519        S3(RB, RD, RE, RC, RA);         LK(RC, RA, RD, RB, RE, 4);
 520        S4(RC, RA, RD, RB, RE);         LK(RA, RD, RB, RE, RC, 5);
 521        S5(RA, RD, RB, RE, RC);         LK(RC, RA, RD, RE, RB, 6);
 522        S6(RC, RA, RD, RE, RB);         LK(RD, RB, RA, RE, RC, 7);
 523        S7(RD, RB, RA, RE, RC);         LK(RC, RA, RE, RD, RB, 8);
 524        S0(RC, RA, RE, RD, RB);         LK(RE, RA, RD, RC, RB, 9);
 525        S1(RE, RA, RD, RC, RB);         LK(RB, RD, RC, RE, RA, 10);
 526        S2(RB, RD, RC, RE, RA);         LK(RA, RD, RB, RE, RC, 11);
 527        S3(RA, RD, RB, RE, RC);         LK(RE, RC, RD, RA, RB, 12);
 528        S4(RE, RC, RD, RA, RB);         LK(RC, RD, RA, RB, RE, 13);
 529        S5(RC, RD, RA, RB, RE);         LK(RE, RC, RD, RB, RA, 14);
 530        S6(RE, RC, RD, RB, RA);         LK(RD, RA, RC, RB, RE, 15);
 531        S7(RD, RA, RC, RB, RE);         LK(RE, RC, RB, RD, RA, 16);
 532        S0(RE, RC, RB, RD, RA);         LK(RB, RC, RD, RE, RA, 17);
 533        S1(RB, RC, RD, RE, RA);         LK(RA, RD, RE, RB, RC, 18);
 534        S2(RA, RD, RE, RB, RC);         LK(RC, RD, RA, RB, RE, 19);
 535        S3(RC, RD, RA, RB, RE);         LK(RB, RE, RD, RC, RA, 20);
 536        S4(RB, RE, RD, RC, RA);         LK(RE, RD, RC, RA, RB, 21);
 537        S5(RE, RD, RC, RA, RB);         LK(RB, RE, RD, RA, RC, 22);
 538        S6(RB, RE, RD, RA, RC);         LK(RD, RC, RE, RA, RB, 23);
 539        S7(RD, RC, RE, RA, RB);         LK(RB, RE, RA, RD, RC, 24);
 540        S0(RB, RE, RA, RD, RC);         LK(RA, RE, RD, RB, RC, 25);
 541        S1(RA, RE, RD, RB, RC);         LK(RC, RD, RB, RA, RE, 26);
 542        S2(RC, RD, RB, RA, RE);         LK(RE, RD, RC, RA, RB, 27);
 543        S3(RE, RD, RC, RA, RB);         LK(RA, RB, RD, RE, RC, 28);
 544        S4(RA, RB, RD, RE, RC);         LK(RB, RD, RE, RC, RA, 29);
 545        S5(RB, RD, RE, RC, RA);         LK(RA, RB, RD, RC, RE, 30);
 546        S6(RA, RB, RD, RC, RE);         LK(RD, RE, RB, RC, RA, 31);
 547        S7(RD, RE, RB, RC, RA);          K(RA, RB, RC, RD, RE, 32);
 548
 549        movl arg_dst(%esp), %eax;
 550
            /* only the low byte of the xor argument is tested */
 551        cmpb $0, arg_xor(%esp);
 552        jnz .L__enc_xor4;
 553
 554        write_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
 555
 556        ret;
 557
 558.L__enc_xor4:
            /* dst ^= result instead of dst = result */
 559        xor_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
 560
 561        ret;
 562SYM_FUNC_END(__serpent_enc_blk_4way)
 563
 /*
  * serpent_dec_blk_4way: decrypt four 16-byte blocks in parallel.
  *
  * Reads 64 bytes from src, mixes in key 32, then walks the encryption
  * schedule backwards: the substitution macros SI7..SI0 cycled four times,
  * each followed by KL with keys 31 down to 1, except the last SI0, which is
  * followed by a plain K with key 0.  The 64-byte result is stored to dst;
  * there is no XOR-into-destination variant here.
  *
  * As in the encryption routine, the register arguments are hand-rotated
  * from one macro to the next.  After the final K the state is in
  * (RC, RD, RB, RE) with RA free, which is why write_blocks is invoked with
  * that register order and RA as its last scratch operand.
  *
  * Clobbers: %eax, %edx, %xmm0-%xmm7.  %esp is not modified and no calls are
  * made.  src and dst need not be aligned (movdqu is used).
  * NOTE(review): presumably the caller is responsible for making SSE
  * register use legal in this context -- confirm against the C glue code.
  */
 564SYM_FUNC_START(serpent_dec_blk_4way)
 565        /* input:
 566         *      arg_ctx(%esp): ctx, CTX
 567         *      arg_dst(%esp): dst
 568         *      arg_src(%esp): src
 569         */
 570
            /* RNOT = all-ones; used by the SI macros as the NOT constant */
 571        pcmpeqd RNOT, RNOT;
 572
 573        movl arg_ctx(%esp), CTX;
 574
 575        movl arg_src(%esp), %eax;
 576        read_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
 577
            /* key 32 first, then 32 inverse rounds; last argument is the key index */
 578                                         K(RA, RB, RC, RD, RE, 32);
 579        SI7(RA, RB, RC, RD, RE);        KL(RB, RD, RA, RE, RC, 31);
 580        SI6(RB, RD, RA, RE, RC);        KL(RA, RC, RE, RB, RD, 30);
 581        SI5(RA, RC, RE, RB, RD);        KL(RC, RD, RA, RE, RB, 29);
 582        SI4(RC, RD, RA, RE, RB);        KL(RC, RA, RB, RE, RD, 28);
 583        SI3(RC, RA, RB, RE, RD);        KL(RB, RC, RD, RE, RA, 27);
 584        SI2(RB, RC, RD, RE, RA);        KL(RC, RA, RE, RD, RB, 26);
 585        SI1(RC, RA, RE, RD, RB);        KL(RB, RA, RE, RD, RC, 25);
 586        SI0(RB, RA, RE, RD, RC);        KL(RE, RC, RA, RB, RD, 24);
 587        SI7(RE, RC, RA, RB, RD);        KL(RC, RB, RE, RD, RA, 23);
 588        SI6(RC, RB, RE, RD, RA);        KL(RE, RA, RD, RC, RB, 22);
 589        SI5(RE, RA, RD, RC, RB);        KL(RA, RB, RE, RD, RC, 21);
 590        SI4(RA, RB, RE, RD, RC);        KL(RA, RE, RC, RD, RB, 20);
 591        SI3(RA, RE, RC, RD, RB);        KL(RC, RA, RB, RD, RE, 19);
 592        SI2(RC, RA, RB, RD, RE);        KL(RA, RE, RD, RB, RC, 18);
 593        SI1(RA, RE, RD, RB, RC);        KL(RC, RE, RD, RB, RA, 17);
 594        SI0(RC, RE, RD, RB, RA);        KL(RD, RA, RE, RC, RB, 16);
 595        SI7(RD, RA, RE, RC, RB);        KL(RA, RC, RD, RB, RE, 15);
 596        SI6(RA, RC, RD, RB, RE);        KL(RD, RE, RB, RA, RC, 14);
 597        SI5(RD, RE, RB, RA, RC);        KL(RE, RC, RD, RB, RA, 13);
 598        SI4(RE, RC, RD, RB, RA);        KL(RE, RD, RA, RB, RC, 12);
 599        SI3(RE, RD, RA, RB, RC);        KL(RA, RE, RC, RB, RD, 11);
 600        SI2(RA, RE, RC, RB, RD);        KL(RE, RD, RB, RC, RA, 10);
 601        SI1(RE, RD, RB, RC, RA);        KL(RA, RD, RB, RC, RE, 9);
 602        SI0(RA, RD, RB, RC, RE);        KL(RB, RE, RD, RA, RC, 8);
 603        SI7(RB, RE, RD, RA, RC);        KL(RE, RA, RB, RC, RD, 7);
 604        SI6(RE, RA, RB, RC, RD);        KL(RB, RD, RC, RE, RA, 6);
 605        SI5(RB, RD, RC, RE, RA);        KL(RD, RA, RB, RC, RE, 5);
 606        SI4(RD, RA, RB, RC, RE);        KL(RD, RB, RE, RC, RA, 4);
 607        SI3(RD, RB, RE, RC, RA);        KL(RE, RD, RA, RC, RB, 3);
 608        SI2(RE, RD, RA, RC, RB);        KL(RD, RB, RC, RA, RE, 2);
 609        SI1(RD, RB, RC, RA, RE);        KL(RE, RB, RC, RA, RD, 1);
 610        SI0(RE, RB, RC, RA, RD);         K(RC, RD, RB, RE, RA, 0);
 611
 612        movl arg_dst(%esp), %eax;
            /* state ended up in (RC, RD, RB, RE); RA is the free scratch register */
 613        write_blocks(%eax, RC, RD, RB, RE, RT0, RT1, RA);
 614
 615        ret;
 616SYM_FUNC_END(serpent_dec_blk_4way)
 617