linux/arch/x86/crypto/serpent-sse2-i586-asm_32.S
<<
>>
Prefs
   1/*
   2 * Serpent Cipher 4-way parallel algorithm (i586/SSE2)
   3 *
   4 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
   5 *
   6 * Based on crypto/serpent.c by
   7 *  Copyright (C) 2002 Dag Arne Osvik <osvik@ii.uib.no>
   8 *                2003 Herbert Valerio Riedel <hvr@gnu.org>
   9 *
  10 * This program is free software; you can redistribute it and/or modify
  11 * it under the terms of the GNU General Public License as published by
  12 * the Free Software Foundation; either version 2 of the License, or
  13 * (at your option) any later version.
  14 *
  15 * This program is distributed in the hope that it will be useful,
  16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18 * GNU General Public License for more details.
  19 *
  20 * You should have received a copy of the GNU General Public License
  21 * along with this program; if not, write to the Free Software
  22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
  23 * USA
  24 *
  25 */
  26
  27.file "serpent-sse2-i586-asm_32.S"
  28.text
  29
  /*
   * Byte offsets, relative to %esp, at which the two entry points in this
   * file read their stack-passed arguments (always used as arg_xxx(%esp),
   * and always before anything is pushed).  arg_xor is only read by
   * __serpent_enc_blk_4way.
   */
  30#define arg_ctx 4
  31#define arg_dst 8
  32#define arg_src 12
  33#define arg_xor 16
  34
  35/**********************************************************************
  36  4-way SSE2 serpent
  37 **********************************************************************/
  /*
   * Register roles used throughout this file.
   *
   *  CTX      - pointer loaded from arg_ctx(%esp); get_key() reads 32-bit
   *             round-key words relative to it.
   *  RA..RE   - five working vector registers.  Four of them hold the cipher
   *             state (one 32-bit word position of four blocks per register,
   *             see read_blocks/transpose_4x4); the fifth is the spare/scratch
   *             register.  Which name plays which role rotates from round to
   *             round, as spelled out in the macro argument lists at the
   *             call sites.
   *  RT0, RT1 - temporaries for K()/LK() key loading and for the transpose.
   *  RNOT     - set to all ones (pcmpeqd RNOT, RNOT) at function entry, so
   *             "pxor RNOT, x" is a bitwise NOT of x.
   */
  38#define CTX %edx
  39
  40#define RA %xmm0
  41#define RB %xmm1
  42#define RC %xmm2
  43#define RD %xmm3
  44#define RE %xmm4
  45
  46#define RT0 %xmm5
  47#define RT1 %xmm6
  48
  49#define RNOT %xmm7
  50
  /*
   * get_key(i, j, t): load the 32-bit word at byte offset (4*i + j)*4 from
   * CTX into the low lane of t (movd), then replicate it into all four
   * 32-bit lanes (pshufd $0).  i selects a group of four words, j the word
   * within that group.  Clobbers only t.
   */
  51#define get_key(i, j, t) \
  52        movd (4*(i)+(j))*4(CTX), t; \
  53        pshufd $0, t, t;
  54
  /*
   * K(x0, x1, x2, x3, x4, i): key mixing only.
   *   x0 ^= key[i][0]; x1 ^= key[i][1]; x2 ^= key[i][2]; x3 ^= key[i][3]
   * where key[i][j] is the word fetched by get_key(i, j, ...), broadcast to
   * all four lanes.  x4, RT0 and RT1 are overwritten as temporaries.
   */
  55#define K(x0, x1, x2, x3, x4, i) \
  56        get_key(i, 0, x4); \
  57        get_key(i, 1, RT0); \
  58        get_key(i, 2, RT1); \
  59        pxor x4,                x0; \
  60        pxor RT0,               x1; \
  61        pxor RT1,               x2; \
  62        get_key(i, 3, x4); \
  63        pxor x4,                x3;
  64
  /*
   * LK(x0, x1, x2, x3, x4, i): linear mixing of the four state words followed
   * by XOR with key group i (encryption direction).  Per 32-bit lane, in
   * order:
   *   x0 = rol(x0, 13);  x2 = rol(x2, 3);
   *   x1 ^= x0 ^ x2;     x1 = rol(x1, 1);
   *   x3 ^= x2 ^ (x0 << 3);  x3 = rol(x3, 7);
   *   x0 ^= x1 ^ x3;     x2 ^= x3 ^ (x1 << 7);
   *   x0 = rol(x0, 5);   x2 = rol(x2, 22);
   *   x0..x3 ^= key[i][0..3]
   * SSE2 has no vector rotate, so each rol(x, n) is built from
   * pslld $n / psrld $(32 - n) / por with x4 holding the shifted copy.
   * The key XORs for x1/x3 are interleaved before the last two rotations of
   * x0/x2; this is only instruction scheduling, since x1 and x3 are already
   * final at that point.
   * Clobbers x4 and RT0.
   */
  65#define LK(x0, x1, x2, x3, x4, i) \
  66        movdqa x0,              x4; \
  67        pslld $13,              x0; \
  68        psrld $(32 - 13),       x4; \
  69        por x4,                 x0; \
  70        pxor x0,                x1; \
  71        movdqa x2,              x4; \
  72        pslld $3,               x2; \
  73        psrld $(32 - 3),        x4; \
  74        por x4,                 x2; \
  75        pxor x2,                x1; \
  76        movdqa x1,              x4; \
  77        pslld $1,               x1; \
  78        psrld $(32 - 1),        x4; \
  79        por x4,                 x1; \
  80        movdqa x0,              x4; \
  81        pslld $3,               x4; \
  82        pxor x2,                x3; \
  83        pxor x4,                x3; \
  84        movdqa x3,              x4; \
  85        pslld $7,               x3; \
  86        psrld $(32 - 7),        x4; \
  87        por x4,                 x3; \
  88        movdqa x1,              x4; \
  89        pslld $7,               x4; \
  90        pxor x1,                x0; \
  91        pxor x3,                x0; \
  92        pxor x3,                x2; \
  93        pxor x4,                x2; \
  94        movdqa x0,              x4; \
  95        get_key(i, 1, RT0); \
  96        pxor RT0,               x1; \
  97        get_key(i, 3, RT0); \
  98        pxor RT0,               x3; \
  99        pslld $5,               x0; \
 100        psrld $(32 - 5),        x4; \
 101        por x4,                 x0; \
 102        movdqa x2,              x4; \
 103        pslld $22,              x2; \
 104        psrld $(32 - 22),       x4; \
 105        por x4,                 x2; \
 106        get_key(i, 0, RT0); \
 107        pxor RT0,               x0; \
 108        get_key(i, 2, RT0); \
 109        pxor RT0,               x2;
 110
 /*
  * KL(x0, x1, x2, x3, x4, i): XOR with key group i (via K), then the inverse
  * of the linear mixing done by LK (decryption direction).  Per 32-bit lane,
  * in order:
  *   x0..x3 ^= key[i][0..3]
  *   x0 = ror(x0, 5);   x2 = ror(x2, 22);
  *   x2 ^= x3 ^ (x1 << 7);  x0 ^= x1 ^ x3;
  *   x1 = ror(x1, 1);   x3 = ror(x3, 7);
  *   x1 ^= x0 ^ x2;     x3 ^= x2 ^ (x0 << 3);
  *   x0 = ror(x0, 13);  x2 = ror(x2, 3);
  * Each ror(x, n) is psrld $n / pslld $(32 - n) / por with x4 as the copy.
  * Note that x1 and x3 are combined with x0 and x2 *before* those two get
  * their final rotations (13 and 3).
  * Clobbers x4, RT0 and RT1 (the latter two through K).
  */
 111#define KL(x0, x1, x2, x3, x4, i) \
 112        K(x0, x1, x2, x3, x4, i); \
 113        movdqa x0,              x4; \
 114        psrld $5,               x0; \
 115        pslld $(32 - 5),        x4; \
 116        por x4,                 x0; \
 117        movdqa x2,              x4; \
 118        psrld $22,              x2; \
 119        pslld $(32 - 22),       x4; \
 120        por x4,                 x2; \
 121        pxor x3,                x2; \
 122        pxor x3,                x0; \
 123        movdqa x1,              x4; \
 124        pslld $7,               x4; \
 125        pxor x1,                x0; \
 126        pxor x4,                x2; \
 127        movdqa x1,              x4; \
 128        psrld $1,               x1; \
 129        pslld $(32 - 1),        x4; \
 130        por x4,                 x1; \
 131        movdqa x3,              x4; \
 132        psrld $7,               x3; \
 133        pslld $(32 - 7),        x4; \
 134        por x4,                 x3; \
 135        pxor x0,                x1; \
 136        movdqa x0,              x4; \
 137        pslld $3,               x4; \
 138        pxor x4,                x3; \
 139        movdqa x0,              x4; \
 140        psrld $13,              x0; \
 141        pslld $(32 - 13),       x4; \
 142        por x4,                 x0; \
 143        pxor x2,                x1; \
 144        pxor x2,                x3; \
 145        movdqa x2,              x4; \
 146        psrld $3,               x2; \
 147        pslld $(32 - 3),        x4; \
 148        por x4,                 x2;
 149
 /*
  * S0(x0, x1, x2, x3, x4): encryption S-box 0, evaluated on whole registers
  * with pand/por/pxor only ("pxor RNOT" is a bitwise NOT).
  * In:  state words in x0..x3; x4 is overwritten (its input value is unused).
  * Out: as consumed by the following LK/K at every call site in this file,
  *      the new state words are, in order, (x2, x1, x3, x0); x4 is the spare.
  */
 150#define S0(x0, x1, x2, x3, x4) \
 151        movdqa x3,              x4; \
 152        por x0,                 x3; \
 153        pxor x4,                x0; \
 154        pxor x2,                x4; \
 155        pxor RNOT,              x4; \
 156        pxor x1,                x3; \
 157        pand x0,                x1; \
 158        pxor x4,                x1; \
 159        pxor x0,                x2; \
 160        pxor x3,                x0; \
 161        por x0,                 x4; \
 162        pxor x2,                x0; \
 163        pand x1,                x2; \
 164        pxor x2,                x3; \
 165        pxor RNOT,              x1; \
 166        pxor x4,                x2; \
 167        pxor x2,                x1;
 168
 /*
  * S1(x0, x1, x2, x3, x4): encryption S-box 1, evaluated on whole registers
  * with pand/por/pxor only ("pxor RNOT" is a bitwise NOT).
  * In:  state words in x0..x3; x4 is overwritten (its input value is unused).
  * Out: as consumed at every call site in this file, the new state words
  *      are, in order, (x4, x2, x3, x0); x1 is the spare.
  */
 169#define S1(x0, x1, x2, x3, x4) \
 170        movdqa x1,              x4; \
 171        pxor x0,                x1; \
 172        pxor x3,                x0; \
 173        pxor RNOT,              x3; \
 174        pand x1,                x4; \
 175        por x1,                 x0; \
 176        pxor x2,                x3; \
 177        pxor x3,                x0; \
 178        pxor x3,                x1; \
 179        pxor x4,                x3; \
 180        por x4,                 x1; \
 181        pxor x2,                x4; \
 182        pand x0,                x2; \
 183        pxor x1,                x2; \
 184        por x0,                 x1; \
 185        pxor RNOT,              x0; \
 186        pxor x2,                x0; \
 187        pxor x1,                x4;
 188
 /*
  * S2(x0, x1, x2, x3, x4): encryption S-box 2, evaluated on whole registers
  * with pand/por/pxor only ("pxor RNOT" is a bitwise NOT).
  * In:  state words in x0..x3; x4 is overwritten (its input value is unused).
  * Out: as consumed at every call site in this file, the new state words
  *      are, in order, (x4, x1, x0, x3); x2 is the spare.
  */
 189#define S2(x0, x1, x2, x3, x4) \
 190        pxor RNOT,              x3; \
 191        pxor x0,                x1; \
 192        movdqa x0,              x4; \
 193        pand x2,                x0; \
 194        pxor x3,                x0; \
 195        por x4,                 x3; \
 196        pxor x1,                x2; \
 197        pxor x1,                x3; \
 198        pand x0,                x1; \
 199        pxor x2,                x0; \
 200        pand x3,                x2; \
 201        por x1,                 x3; \
 202        pxor RNOT,              x0; \
 203        pxor x0,                x3; \
 204        pxor x0,                x4; \
 205        pxor x2,                x0; \
 206        por x2,                 x1;
 207
 /*
  * S3(x0, x1, x2, x3, x4): encryption S-box 3, evaluated on whole registers
  * with pand/por/pxor only (no NOT is needed here, so RNOT is not used).
  * In:  state words in x0..x3; x4 is overwritten (its input value is unused).
  * Out: as consumed at every call site in this file, the new state words
  *      are, in order, (x3, x4, x1, x0); x2 is the spare.
  */
 208#define S3(x0, x1, x2, x3, x4) \
 209        movdqa x1,              x4; \
 210        pxor x3,                x1; \
 211        por x0,                 x3; \
 212        pand x0,                x4; \
 213        pxor x2,                x0; \
 214        pxor x1,                x2; \
 215        pand x3,                x1; \
 216        pxor x3,                x2; \
 217        por x4,                 x0; \
 218        pxor x3,                x4; \
 219        pxor x0,                x1; \
 220        pand x3,                x0; \
 221        pand x4,                x3; \
 222        pxor x2,                x3; \
 223        por x1,                 x4; \
 224        pand x1,                x2; \
 225        pxor x3,                x4; \
 226        pxor x3,                x0; \
 227        pxor x2,                x3;
 228
 /*
  * S4(x0, x1, x2, x3, x4): encryption S-box 4, evaluated on whole registers
  * with pand/por/pxor only ("pxor RNOT" is a bitwise NOT).
  * In:  state words in x0..x3; x4 is overwritten (its input value is unused).
  * Out: as consumed at every call site in this file, the new state words
  *      are, in order, (x1, x2, x3, x4); x0 is the spare.
  */
 229#define S4(x0, x1, x2, x3, x4) \
 230        movdqa x3,              x4; \
 231        pand x0,                x3; \
 232        pxor x4,                x0; \
 233        pxor x2,                x3; \
 234        por x4,                 x2; \
 235        pxor x1,                x0; \
 236        pxor x3,                x4; \
 237        por x0,                 x2; \
 238        pxor x1,                x2; \
 239        pand x0,                x1; \
 240        pxor x4,                x1; \
 241        pand x2,                x4; \
 242        pxor x3,                x2; \
 243        pxor x0,                x4; \
 244        por x1,                 x3; \
 245        pxor RNOT,              x1; \
 246        pxor x0,                x3;
 247
 /*
  * S5(x0, x1, x2, x3, x4): encryption S-box 5, evaluated on whole registers
  * with pand/por/pxor only ("pxor RNOT" is a bitwise NOT).
  * In:  state words in x0..x3; x4 is overwritten (its input value is unused).
  * Out: as consumed at every call site in this file, the new state words
  *      are, in order, (x4, x0, x1, x3); x2 is the spare.
  */
 248#define S5(x0, x1, x2, x3, x4) \
 249        movdqa x1,              x4; \
 250        por x0,                 x1; \
 251        pxor x1,                x2; \
 252        pxor RNOT,              x3; \
 253        pxor x0,                x4; \
 254        pxor x2,                x0; \
 255        pand x4,                x1; \
 256        por x3,                 x4; \
 257        pxor x0,                x4; \
 258        pand x3,                x0; \
 259        pxor x3,                x1; \
 260        pxor x2,                x3; \
 261        pxor x1,                x0; \
 262        pand x4,                x2; \
 263        pxor x2,                x1; \
 264        pand x0,                x2; \
 265        pxor x2,                x3;
 266
 /*
  * S6(x0, x1, x2, x3, x4): encryption S-box 6, evaluated on whole registers
  * with pand/por/pxor only ("pxor RNOT" is a bitwise NOT).
  * In:  state words in x0..x3; x4 is overwritten (its input value is unused).
  * Out: as consumed at every call site in this file, the new state words
  *      are, in order, (x2, x4, x1, x3); x0 is the spare.
  */
 267#define S6(x0, x1, x2, x3, x4) \
 268        movdqa x1,              x4; \
 269        pxor x0,                x3; \
 270        pxor x2,                x1; \
 271        pxor x0,                x2; \
 272        pand x3,                x0; \
 273        por x3,                 x1; \
 274        pxor RNOT,              x4; \
 275        pxor x1,                x0; \
 276        pxor x2,                x1; \
 277        pxor x4,                x3; \
 278        pxor x0,                x4; \
 279        pand x0,                x2; \
 280        pxor x1,                x4; \
 281        pxor x3,                x2; \
 282        pand x1,                x3; \
 283        pxor x0,                x3; \
 284        pxor x2,                x1;
 285
 /*
  * S7(x0, x1, x2, x3, x4): encryption S-box 7, evaluated on whole registers
  * with pand/por/pxor only ("pxor RNOT" is a bitwise NOT).
  * In:  state words in x0..x3; x4 is overwritten (its input value is unused).
  * Out: as consumed at every call site in this file (including the final
  *      K(..., 32)), the new state words are, in order, (x4, x2, x3, x0);
  *      x1 is the spare.
  */
 286#define S7(x0, x1, x2, x3, x4) \
 287        pxor RNOT,              x1; \
 288        movdqa x1,              x4; \
 289        pxor RNOT,              x0; \
 290        pand x2,                x1; \
 291        pxor x3,                x1; \
 292        por x4,                 x3; \
 293        pxor x2,                x4; \
 294        pxor x3,                x2; \
 295        pxor x0,                x3; \
 296        por x1,                 x0; \
 297        pand x0,                x2; \
 298        pxor x4,                x0; \
 299        pxor x3,                x4; \
 300        pand x0,                x3; \
 301        pxor x1,                x4; \
 302        pxor x4,                x2; \
 303        pxor x1,                x3; \
 304        por x0,                 x4; \
 305        pxor x1,                x4;
 306
 /*
  * SI0(x0, x1, x2, x3, x4): decryption (inverse) S-box 0, evaluated on whole
  * registers with pand/por/pxor only ("pxor RNOT" is a bitwise NOT).
  * In:  state words in x0..x3; x4 is overwritten (its input value is unused).
  * Out: as consumed at every call site in this file (including the final
  *      K(..., 0)), the new state words are, in order, (x2, x4, x0, x1);
  *      x3 is the spare.
  */
 307#define SI0(x0, x1, x2, x3, x4) \
 308        movdqa x3,              x4; \
 309        pxor x0,                x1; \
 310        por x1,                 x3; \
 311        pxor x1,                x4; \
 312        pxor RNOT,              x0; \
 313        pxor x3,                x2; \
 314        pxor x0,                x3; \
 315        pand x1,                x0; \
 316        pxor x2,                x0; \
 317        pand x3,                x2; \
 318        pxor x4,                x3; \
 319        pxor x3,                x2; \
 320        pxor x3,                x1; \
 321        pand x0,                x3; \
 322        pxor x0,                x1; \
 323        pxor x2,                x0; \
 324        pxor x3,                x4;
 325
 /*
  * SI1(x0, x1, x2, x3, x4): decryption (inverse) S-box 1, evaluated on whole
  * registers with pand/por/pxor only ("pxor RNOT" is a bitwise NOT).
  * In:  state words in x0..x3; x4 is overwritten (its input value is unused).
  * Out: as consumed at every call site in this file, the new state words
  *      are, in order, (x4, x1, x2, x3); x0 is the spare.
  */
 326#define SI1(x0, x1, x2, x3, x4) \
 327        pxor x3,                x1; \
 328        movdqa x0,              x4; \
 329        pxor x2,                x0; \
 330        pxor RNOT,              x2; \
 331        por x1,                 x4; \
 332        pxor x3,                x4; \
 333        pand x1,                x3; \
 334        pxor x2,                x1; \
 335        pand x4,                x2; \
 336        pxor x1,                x4; \
 337        por x3,                 x1; \
 338        pxor x0,                x3; \
 339        pxor x0,                x2; \
 340        por x4,                 x0; \
 341        pxor x4,                x2; \
 342        pxor x0,                x1; \
 343        pxor x1,                x4;
 344
 /*
  * SI2(x0, x1, x2, x3, x4): decryption (inverse) S-box 2, evaluated on whole
  * registers with pand/por/pxor only ("pxor RNOT" is a bitwise NOT).
  * In:  state words in x0..x3; x4 is overwritten (its input value is unused).
  * Out: as consumed at every call site in this file, the new state words
  *      are, in order, (x1, x4, x3, x2); x0 is the spare.
  */
 345#define SI2(x0, x1, x2, x3, x4) \
 346        pxor x1,                x2; \
 347        movdqa x3,              x4; \
 348        pxor RNOT,              x3; \
 349        por x2,                 x3; \
 350        pxor x4,                x2; \
 351        pxor x0,                x4; \
 352        pxor x1,                x3; \
 353        por x2,                 x1; \
 354        pxor x0,                x2; \
 355        pxor x4,                x1; \
 356        por x3,                 x4; \
 357        pxor x3,                x2; \
 358        pxor x2,                x4; \
 359        pand x1,                x2; \
 360        pxor x3,                x2; \
 361        pxor x4,                x3; \
 362        pxor x0,                x4;
 363
 /*
  * SI3(x0, x1, x2, x3, x4): decryption (inverse) S-box 3, evaluated on whole
  * registers with pand/por/pxor only (RNOT is not used here).
  * In:  state words in x0..x3; x4 is overwritten (its input value is unused).
  * Out: as consumed at every call site in this file, the new state words
  *      are, in order, (x2, x0, x4, x3); x1 is the spare.
  */
 364#define SI3(x0, x1, x2, x3, x4) \
 365        pxor x1,                x2; \
 366        movdqa x1,              x4; \
 367        pand x2,                x1; \
 368        pxor x0,                x1; \
 369        por x4,                 x0; \
 370        pxor x3,                x4; \
 371        pxor x3,                x0; \
 372        por x1,                 x3; \
 373        pxor x2,                x1; \
 374        pxor x3,                x1; \
 375        pxor x2,                x0; \
 376        pxor x3,                x2; \
 377        pand x1,                x3; \
 378        pxor x0,                x1; \
 379        pand x2,                x0; \
 380        pxor x3,                x4; \
 381        pxor x0,                x3; \
 382        pxor x1,                x0;
 383
 /*
  * SI4(x0, x1, x2, x3, x4): decryption (inverse) S-box 4, evaluated on whole
  * registers with pand/por/pxor only ("pxor RNOT" is a bitwise NOT).
  * In:  state words in x0..x3; x4 is overwritten (its input value is unused).
  * Out: as consumed at every call site in this file, the new state words
  *      are, in order, (x0, x2, x4, x3); x1 is the spare.
  */
 384#define SI4(x0, x1, x2, x3, x4) \
 385        pxor x3,                x2; \
 386        movdqa x0,              x4; \
 387        pand x1,                x0; \
 388        pxor x2,                x0; \
 389        por x3,                 x2; \
 390        pxor RNOT,              x4; \
 391        pxor x0,                x1; \
 392        pxor x2,                x0; \
 393        pand x4,                x2; \
 394        pxor x0,                x2; \
 395        por x4,                 x0; \
 396        pxor x3,                x0; \
 397        pand x2,                x3; \
 398        pxor x3,                x4; \
 399        pxor x1,                x3; \
 400        pand x0,                x1; \
 401        pxor x1,                x4; \
 402        pxor x3,                x0;
 403
 /*
  * SI5(x0, x1, x2, x3, x4): decryption (inverse) S-box 5, evaluated on whole
  * registers with pand/por/pxor only ("pxor RNOT" is a bitwise NOT).
  * In:  state words in x0..x3; x4 is overwritten (its input value is unused).
  * Out: as consumed at every call site in this file, the new state words
  *      are, in order, (x1, x4, x0, x2); x3 is the spare.
  */
 404#define SI5(x0, x1, x2, x3, x4) \
 405        movdqa x1,              x4; \
 406        por x2,                 x1; \
 407        pxor x4,                x2; \
 408        pxor x3,                x1; \
 409        pand x4,                x3; \
 410        pxor x3,                x2; \
 411        por x0,                 x3; \
 412        pxor RNOT,              x0; \
 413        pxor x2,                x3; \
 414        por x0,                 x2; \
 415        pxor x1,                x4; \
 416        pxor x4,                x2; \
 417        pand x0,                x4; \
 418        pxor x1,                x0; \
 419        pxor x3,                x1; \
 420        pand x2,                x0; \
 421        pxor x3,                x2; \
 422        pxor x2,                x0; \
 423        pxor x4,                x2; \
 424        pxor x3,                x4;
 425
 /*
  * SI6(x0, x1, x2, x3, x4): decryption (inverse) S-box 6, evaluated on whole
  * registers with pand/por/pxor only ("pxor RNOT" is a bitwise NOT).
  * In:  state words in x0..x3; x4 is overwritten (its input value is unused).
  * Out: as consumed at every call site in this file, the new state words
  *      are, in order, (x2, x4, x3, x0); x1 is the spare.
  */
 426#define SI6(x0, x1, x2, x3, x4) \
 427        pxor x2,                x0; \
 428        movdqa x0,              x4; \
 429        pand x3,                x0; \
 430        pxor x3,                x2; \
 431        pxor x2,                x0; \
 432        pxor x1,                x3; \
 433        por x4,                 x2; \
 434        pxor x3,                x2; \
 435        pand x0,                x3; \
 436        pxor RNOT,              x0; \
 437        pxor x1,                x3; \
 438        pand x2,                x1; \
 439        pxor x0,                x4; \
 440        pxor x4,                x3; \
 441        pxor x2,                x4; \
 442        pxor x1,                x0; \
 443        pxor x0,                x2;
 444
 /*
  * SI7(x0, x1, x2, x3, x4): decryption (inverse) S-box 7, evaluated on whole
  * registers with pand/por/pxor only ("pxor RNOT" is a bitwise NOT).
  * In:  state words in x0..x3; x4 is overwritten (its input value is unused).
  * Out: as consumed at every call site in this file, the new state words
  *      are, in order, (x1, x3, x0, x4); x2 is the spare.
  */
 445#define SI7(x0, x1, x2, x3, x4) \
 446        movdqa x3,              x4; \
 447        pand x0,                x3; \
 448        pxor x2,                x0; \
 449        por x4,                 x2; \
 450        pxor x1,                x4; \
 451        pxor RNOT,              x0; \
 452        por x3,                 x1; \
 453        pxor x0,                x4; \
 454        pand x2,                x0; \
 455        pxor x1,                x0; \
 456        pand x2,                x1; \
 457        pxor x2,                x3; \
 458        pxor x3,                x4; \
 459        pand x3,                x2; \
 460        por x0,                 x3; \
 461        pxor x4,                x1; \
 462        pxor x4,                x3; \
 463        pand x0,                x4; \
 464        pxor x2,                x4;
 465
 /*
  * transpose_4x4(x0, x1, x2, x3, t0, t1, t2): in-place transpose of the 4x4
  * matrix of 32-bit words held in x0..x3 (one row per register).
  *
  * With x0=[a0 a1 a2 a3], x1=[b0..b3], x2=[c0..c3], x3=[d0..d3] on entry
  * (lowest lane first), the result is
  *   x0=[a0 b0 c0 d0], x1=[a1 b1 c1 d1], x2=[a2 b2 c2 d2], x3=[a3 b3 c3 d3].
  * Step 1 interleaves dwords pairwise (punpckl/hdq), step 2 combines the
  * 64-bit halves (punpckl/hqdq).
  *
  * Clobbers t1 and t2.  t0 is accepted for call-site symmetry but is not
  * referenced by this macro.
  */
 466#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
 467        movdqa x0,              t2; \
 468        punpckldq x1,           x0; \
 469        punpckhdq x1,           t2; \
 470        movdqa x2,              t1; \
 471        punpckhdq x3,           x2; \
 472        punpckldq x3,           t1; \
 473        movdqa x0,              x1; \
 474        punpcklqdq t1,          x0; \
 475        punpckhqdq t1,          x1; \
 476        movdqa t2,              x3; \
 477        punpcklqdq x2,          t2; \
 478        punpckhqdq x2,          x3; \
 479        movdqa t2,              x2;
 480
 /*
  * read_blocks(in, x0, x1, x2, x3, t0, t1, t2): load four consecutive 16-byte
  * blocks from (in) with unaligned loads (movdqu), one block per register,
  * then transpose so that x_n holds 32-bit word n of all four blocks.
  * Clobbers t1 and t2 (through transpose_4x4).
  */
 481#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
 482        movdqu (0*4*4)(in),     x0; \
 483        movdqu (1*4*4)(in),     x1; \
 484        movdqu (2*4*4)(in),     x2; \
 485        movdqu (3*4*4)(in),     x3; \
 486        \
 487        transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
 488
 /*
  * write_blocks(out, x0, x1, x2, x3, t0, t1, t2): inverse of read_blocks.
  * Transposes x0..x3 back to one-block-per-register layout and stores the
  * four 16-byte blocks to (out) with unaligned stores (movdqu).
  * Clobbers t1 and t2 (through transpose_4x4); x0..x3 are left transposed.
  */
 489#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
 490        transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
 491        \
 492        movdqu x0, (0*4*4)(out); \
 493        movdqu x1, (1*4*4)(out); \
 494        movdqu x2, (2*4*4)(out); \
 495        movdqu x3, (3*4*4)(out);
 496
 /*
  * xor_blocks(out, x0, x1, x2, x3, t0, t1, t2): like write_blocks, but
  * read-modify-write.  After transposing back to block layout, each 16-byte
  * block already stored at (out) is loaded into t0, XORed into the matching
  * result register, and the combined value is stored back to the same place.
  * Clobbers t0, t1 and t2.
  */
 497#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
 498        transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
 499        \
 500        movdqu (0*4*4)(out),    t0; \
 501        pxor t0,                x0; \
 502        movdqu x0,              (0*4*4)(out); \
 503        movdqu (1*4*4)(out),    t0; \
 504        pxor t0,                x1; \
 505        movdqu x1,              (1*4*4)(out); \
 506        movdqu (2*4*4)(out),    t0; \
 507        pxor t0,                x2; \
 508        movdqu x2,              (2*4*4)(out); \
 509        movdqu (3*4*4)(out),    t0; \
 510        pxor t0,                x3; \
 511        movdqu x3,              (3*4*4)(out);
 512
 /*
  * __serpent_enc_blk_4way: encrypt four 16-byte blocks in parallel.
  *
  * Arguments are read from the stack (see the arg_* offsets):
  *   ctx - base of the round-key words; key groups 0..32 are used, i.e.
  *         33 * 4 consecutive 32-bit words starting at offset 0
  *   dst - 64-byte output buffer (unaligned access is used)
  *   src - 64-byte input buffer  (unaligned access is used)
  *   xor - only the low byte is tested (cmpb): zero    -> dst = result,
  *                                             nonzero -> dst ^= result
  *
  * Flow: load + transpose the four blocks, XOR key group 0, then 32 rounds
  * of S-box (S0..S7, repeated four times) each followed by LK with key
  * groups 1..31; the last round uses plain K with key group 32 instead of
  * LK.  The register arguments on each line follow the output order of the
  * preceding S-box macro, so no register-to-register moves are needed
  * between rounds.
  *
  * Clobbers: %eax, %edx (CTX), %xmm0-%xmm7, EFLAGS.  Nothing is pushed and
  * no stack memory is written.
  * NOTE(review): callers presumably must own the FPU/SSE state while this
  * runs; that is not visible in this file - confirm at the call sites.
  */
 513.align 8
 514.global __serpent_enc_blk_4way
 515.type   __serpent_enc_blk_4way,@function;
 516
 517__serpent_enc_blk_4way:
 518        /* input:
 519         *      arg_ctx(%esp): ctx, CTX
 520         *      arg_dst(%esp): dst
 521         *      arg_src(%esp): src
 522         *      arg_xor(%esp): bool, if true: xor output
 523         */
 524
 525        pcmpeqd RNOT, RNOT;
 526
 527        movl arg_ctx(%esp), CTX;
 528
 529        movl arg_src(%esp), %eax;
 530        read_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
 531
 532                                         K(RA, RB, RC, RD, RE, 0);
 533        S0(RA, RB, RC, RD, RE);         LK(RC, RB, RD, RA, RE, 1);
 534        S1(RC, RB, RD, RA, RE);         LK(RE, RD, RA, RC, RB, 2);
 535        S2(RE, RD, RA, RC, RB);         LK(RB, RD, RE, RC, RA, 3);
 536        S3(RB, RD, RE, RC, RA);         LK(RC, RA, RD, RB, RE, 4);
 537        S4(RC, RA, RD, RB, RE);         LK(RA, RD, RB, RE, RC, 5);
 538        S5(RA, RD, RB, RE, RC);         LK(RC, RA, RD, RE, RB, 6);
 539        S6(RC, RA, RD, RE, RB);         LK(RD, RB, RA, RE, RC, 7);
 540        S7(RD, RB, RA, RE, RC);         LK(RC, RA, RE, RD, RB, 8);
 541        S0(RC, RA, RE, RD, RB);         LK(RE, RA, RD, RC, RB, 9);
 542        S1(RE, RA, RD, RC, RB);         LK(RB, RD, RC, RE, RA, 10);
 543        S2(RB, RD, RC, RE, RA);         LK(RA, RD, RB, RE, RC, 11);
 544        S3(RA, RD, RB, RE, RC);         LK(RE, RC, RD, RA, RB, 12);
 545        S4(RE, RC, RD, RA, RB);         LK(RC, RD, RA, RB, RE, 13);
 546        S5(RC, RD, RA, RB, RE);         LK(RE, RC, RD, RB, RA, 14);
 547        S6(RE, RC, RD, RB, RA);         LK(RD, RA, RC, RB, RE, 15);
 548        S7(RD, RA, RC, RB, RE);         LK(RE, RC, RB, RD, RA, 16);
 549        S0(RE, RC, RB, RD, RA);         LK(RB, RC, RD, RE, RA, 17);
 550        S1(RB, RC, RD, RE, RA);         LK(RA, RD, RE, RB, RC, 18);
 551        S2(RA, RD, RE, RB, RC);         LK(RC, RD, RA, RB, RE, 19);
 552        S3(RC, RD, RA, RB, RE);         LK(RB, RE, RD, RC, RA, 20);
 553        S4(RB, RE, RD, RC, RA);         LK(RE, RD, RC, RA, RB, 21);
 554        S5(RE, RD, RC, RA, RB);         LK(RB, RE, RD, RA, RC, 22);
 555        S6(RB, RE, RD, RA, RC);         LK(RD, RC, RE, RA, RB, 23);
 556        S7(RD, RC, RE, RA, RB);         LK(RB, RE, RA, RD, RC, 24);
 557        S0(RB, RE, RA, RD, RC);         LK(RA, RE, RD, RB, RC, 25);
 558        S1(RA, RE, RD, RB, RC);         LK(RC, RD, RB, RA, RE, 26);
 559        S2(RC, RD, RB, RA, RE);         LK(RE, RD, RC, RA, RB, 27);
 560        S3(RE, RD, RC, RA, RB);         LK(RA, RB, RD, RE, RC, 28);
 561        S4(RA, RB, RD, RE, RC);         LK(RB, RD, RE, RC, RA, 29);
 562        S5(RB, RD, RE, RC, RA);         LK(RA, RB, RD, RC, RE, 30);
 563        S6(RA, RB, RD, RC, RE);         LK(RD, RE, RB, RC, RA, 31);
 564        S7(RD, RE, RB, RC, RA);          K(RA, RB, RC, RD, RE, 32);
 565
 566        movl arg_dst(%esp), %eax;
 567
 568        cmpb $0, arg_xor(%esp);
 569        jnz __enc_xor4;
 570
 571        write_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
 572
 573        ret;
 574
 575__enc_xor4:
 576        xor_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
 577
 578        ret;
 579
 /*
  * serpent_dec_blk_4way: decrypt four 16-byte blocks in parallel.
  *
  * Arguments are read from the stack (see the arg_* offsets):
  *   ctx - base of the round-key words; key groups 32..0 are used, i.e.
  *         33 * 4 consecutive 32-bit words starting at offset 0
  *   dst - 64-byte output buffer (unaligned access is used)
  *   src - 64-byte input buffer  (unaligned access is used)
  * There is no xor argument here; the result always overwrites dst.
  *
  * Flow: load + transpose the four blocks, XOR key group 32, then 32 rounds
  * of inverse S-box (SI7..SI0, repeated four times) each followed by KL with
  * key groups 31..1; the last round uses plain K with key group 0 instead
  * of KL.  After that final K the state words live in RC, RD, RB, RE (in
  * that order), which is why write_blocks is called with that permutation
  * and RA as the scratch register.
  *
  * Clobbers: %eax, %edx (CTX), %xmm0-%xmm7.  Nothing is pushed and no stack
  * memory is written.
  * NOTE(review): callers presumably must own the FPU/SSE state while this
  * runs; that is not visible in this file - confirm at the call sites.
  */
 580.align 8
 581.global serpent_dec_blk_4way
 582.type   serpent_dec_blk_4way,@function;
 583
 584serpent_dec_blk_4way:
 585        /* input:
 586         *      arg_ctx(%esp): ctx, CTX
 587         *      arg_dst(%esp): dst
 588         *      arg_src(%esp): src
 589         */
 590
 591        pcmpeqd RNOT, RNOT;
 592
 593        movl arg_ctx(%esp), CTX;
 594
 595        movl arg_src(%esp), %eax;
 596        read_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
 597
 598                                         K(RA, RB, RC, RD, RE, 32);
 599        SI7(RA, RB, RC, RD, RE);        KL(RB, RD, RA, RE, RC, 31);
 600        SI6(RB, RD, RA, RE, RC);        KL(RA, RC, RE, RB, RD, 30);
 601        SI5(RA, RC, RE, RB, RD);        KL(RC, RD, RA, RE, RB, 29);
 602        SI4(RC, RD, RA, RE, RB);        KL(RC, RA, RB, RE, RD, 28);
 603        SI3(RC, RA, RB, RE, RD);        KL(RB, RC, RD, RE, RA, 27);
 604        SI2(RB, RC, RD, RE, RA);        KL(RC, RA, RE, RD, RB, 26);
 605        SI1(RC, RA, RE, RD, RB);        KL(RB, RA, RE, RD, RC, 25);
 606        SI0(RB, RA, RE, RD, RC);        KL(RE, RC, RA, RB, RD, 24);
 607        SI7(RE, RC, RA, RB, RD);        KL(RC, RB, RE, RD, RA, 23);
 608        SI6(RC, RB, RE, RD, RA);        KL(RE, RA, RD, RC, RB, 22);
 609        SI5(RE, RA, RD, RC, RB);        KL(RA, RB, RE, RD, RC, 21);
 610        SI4(RA, RB, RE, RD, RC);        KL(RA, RE, RC, RD, RB, 20);
 611        SI3(RA, RE, RC, RD, RB);        KL(RC, RA, RB, RD, RE, 19);
 612        SI2(RC, RA, RB, RD, RE);        KL(RA, RE, RD, RB, RC, 18);
 613        SI1(RA, RE, RD, RB, RC);        KL(RC, RE, RD, RB, RA, 17);
 614        SI0(RC, RE, RD, RB, RA);        KL(RD, RA, RE, RC, RB, 16);
 615        SI7(RD, RA, RE, RC, RB);        KL(RA, RC, RD, RB, RE, 15);
 616        SI6(RA, RC, RD, RB, RE);        KL(RD, RE, RB, RA, RC, 14);
 617        SI5(RD, RE, RB, RA, RC);        KL(RE, RC, RD, RB, RA, 13);
 618        SI4(RE, RC, RD, RB, RA);        KL(RE, RD, RA, RB, RC, 12);
 619        SI3(RE, RD, RA, RB, RC);        KL(RA, RE, RC, RB, RD, 11);
 620        SI2(RA, RE, RC, RB, RD);        KL(RE, RD, RB, RC, RA, 10);
 621        SI1(RE, RD, RB, RC, RA);        KL(RA, RD, RB, RC, RE, 9);
 622        SI0(RA, RD, RB, RC, RE);        KL(RB, RE, RD, RA, RC, 8);
 623        SI7(RB, RE, RD, RA, RC);        KL(RE, RA, RB, RC, RD, 7);
 624        SI6(RE, RA, RB, RC, RD);        KL(RB, RD, RC, RE, RA, 6);
 625        SI5(RB, RD, RC, RE, RA);        KL(RD, RA, RB, RC, RE, 5);
 626        SI4(RD, RA, RB, RC, RE);        KL(RD, RB, RE, RC, RA, 4);
 627        SI3(RD, RB, RE, RC, RA);        KL(RE, RD, RA, RC, RB, 3);
 628        SI2(RE, RD, RA, RC, RB);        KL(RD, RB, RC, RA, RE, 2);
 629        SI1(RD, RB, RC, RA, RE);        KL(RE, RB, RC, RA, RD, 1);
 630        SI0(RE, RB, RC, RA, RD);         K(RC, RD, RB, RE, RA, 0);
 631
 632        movl arg_dst(%esp), %eax;
 633        write_blocks(%eax, RC, RD, RB, RE, RT0, RT1, RA);
 634
 635        ret;
 636