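/*
 * Serpent cipher, 8-way parallel block routines for x86_64 using SSE2
 * (XMM) registers.
 *
 * NOTE(review): the original file header (authorship, copyright and licence
 * notice) appears to have been lost in extraction, leaving only bare line
 * numbers in its place.  Restore it from the upstream source before this
 * file is redistributed.
 */
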
/* NOTE(review): presumably supplies the ENTRY()/ENDPROC() macros used below. */
#include <linux/linkage.h>

.file "serpent-sse2-x86_64-asm_64.S"
.text

/* Base pointer that get_key() indexes to fetch 32-bit round-key words. */
#define CTX %rdi

/*
 * XMM register allocation.
 *
 * The routines below work on two independent groups of five registers.
 * Register names are passed to the round macros without the trailing digit
 * (RA, RB, ...) and the macros append 1 or 2 by token pasting, so each
 * operation is emitted once per group.
 */

/* Group 1: four state words plus one scratch/rotating register. */
#define RA1 %xmm0
#define RB1 %xmm1
#define RC1 %xmm2
#define RD1 %xmm3
#define RE1 %xmm4

/* Group 2: same roles as group 1. */
#define RA2 %xmm5
#define RB2 %xmm6
#define RC2 %xmm7
#define RD2 %xmm8
#define RE2 %xmm9

/* All-ones constant (set with pcmpeqd); "pxor RNOT, r" is a bitwise NOT. */
#define RNOT %xmm10

/* Round-key words, each broadcast to all four 32-bit lanes by get_key(). */
#define RK0 %xmm11
#define RK1 %xmm12
#define RK2 %xmm13
#define RK3 %xmm14

/*
 * S0 substitution step used by the encryption routine, written as
 * register-wide boolean operations (AT&T order: source, destination) and
 * split into two halves that are always expanded in the order _1, _2.
 * x4 carries no input: S0_1 overwrites it with movdqa before reading it.
 * "pxor RNOT, r" is a bitwise NOT; RNOT must hold all-ones.
 * The call sites in this file take (x2, x1, x3, x0) as the result words.
 */
#define S0_1(x0, x1, x2, x3, x4) \
	movdqa x3, x4; \
	por x0, x3; \
	pxor x4, x0; \
	pxor x2, x4; \
	pxor RNOT, x4; \
	pxor x1, x3; \
	pand x0, x1; \
	pxor x4, x1; \
	pxor x0, x2;
#define S0_2(x0, x1, x2, x3, x4) \
	pxor x3, x0; \
	por x0, x4; \
	pxor x2, x0; \
	pand x1, x2; \
	pxor x2, x3; \
	pxor RNOT, x1; \
	pxor x4, x2; \
	pxor x2, x1;

/*
 * S1 substitution step used by the encryption routine, written as
 * register-wide boolean operations (AT&T order: source, destination) and
 * split into two halves that are always expanded in the order _1, _2.
 * x4 carries no input: S1_1 overwrites it with movdqa before reading it.
 * "pxor RNOT, r" is a bitwise NOT; RNOT must hold all-ones.
 * The call sites in this file take (x4, x2, x3, x0) as the result words.
 */
#define S1_1(x0, x1, x2, x3, x4) \
	movdqa x1, x4; \
	pxor x0, x1; \
	pxor x3, x0; \
	pxor RNOT, x3; \
	pand x1, x4; \
	por x1, x0; \
	pxor x2, x3; \
	pxor x3, x0; \
	pxor x3, x1;
#define S1_2(x0, x1, x2, x3, x4) \
	pxor x4, x3; \
	por x4, x1; \
	pxor x2, x4; \
	pand x0, x2; \
	pxor x1, x2; \
	por x0, x1; \
	pxor RNOT, x0; \
	pxor x2, x0; \
	pxor x1, x4;

/*
 * S2 substitution step used by the encryption routine, written as
 * register-wide boolean operations (AT&T order: source, destination) and
 * split into two halves that are always expanded in the order _1, _2.
 * x4 carries no input: S2_1 overwrites it with movdqa before reading it.
 * "pxor RNOT, r" is a bitwise NOT; RNOT must hold all-ones.
 * The call sites in this file take (x4, x1, x0, x3) as the result words.
 */
#define S2_1(x0, x1, x2, x3, x4) \
	pxor RNOT, x3; \
	pxor x0, x1; \
	movdqa x0, x4; \
	pand x2, x0; \
	pxor x3, x0; \
	por x4, x3; \
	pxor x1, x2; \
	pxor x1, x3; \
	pand x0, x1;
#define S2_2(x0, x1, x2, x3, x4) \
	pxor x2, x0; \
	pand x3, x2; \
	por x1, x3; \
	pxor RNOT, x0; \
	pxor x0, x3; \
	pxor x0, x4; \
	pxor x2, x0; \
	por x2, x1;

/*
 * S3 substitution step used by the encryption routine, written as
 * register-wide boolean operations (AT&T order: source, destination) and
 * split into two halves that are always expanded in the order _1, _2.
 * x4 carries no input: S3_1 overwrites it with movdqa before reading it.
 * The call sites in this file take (x3, x4, x1, x0) as the result words.
 */
#define S3_1(x0, x1, x2, x3, x4) \
	movdqa x1, x4; \
	pxor x3, x1; \
	por x0, x3; \
	pand x0, x4; \
	pxor x2, x0; \
	pxor x1, x2; \
	pand x3, x1; \
	pxor x3, x2; \
	por x4, x0; \
	pxor x3, x4;
#define S3_2(x0, x1, x2, x3, x4) \
	pxor x0, x1; \
	pand x3, x0; \
	pand x4, x3; \
	pxor x2, x3; \
	por x1, x4; \
	pand x1, x2; \
	pxor x3, x4; \
	pxor x3, x0; \
	pxor x2, x3;

/*
 * S4 substitution step used by the encryption routine, written as
 * register-wide boolean operations (AT&T order: source, destination) and
 * split into two halves that are always expanded in the order _1, _2.
 * x4 carries no input: S4_1 overwrites it with movdqa before reading it.
 * "pxor RNOT, r" is a bitwise NOT; RNOT must hold all-ones.
 * The call sites in this file take (x1, x2, x3, x4) as the result words.
 */
#define S4_1(x0, x1, x2, x3, x4) \
	movdqa x3, x4; \
	pand x0, x3; \
	pxor x4, x0; \
	pxor x2, x3; \
	por x4, x2; \
	pxor x1, x0; \
	pxor x3, x4; \
	por x0, x2; \
	pxor x1, x2;
#define S4_2(x0, x1, x2, x3, x4) \
	pand x0, x1; \
	pxor x4, x1; \
	pand x2, x4; \
	pxor x3, x2; \
	pxor x0, x4; \
	por x1, x3; \
	pxor RNOT, x1; \
	pxor x0, x3;

/*
 * S5 substitution step used by the encryption routine, written as
 * register-wide boolean operations (AT&T order: source, destination) and
 * split into two halves that are always expanded in the order _1, _2.
 * x4 carries no input: S5_1 overwrites it with movdqa before reading it.
 * "pxor RNOT, r" is a bitwise NOT; RNOT must hold all-ones.
 * The call sites in this file take (x4, x0, x1, x3) as the result words.
 */
#define S5_1(x0, x1, x2, x3, x4) \
	movdqa x1, x4; \
	por x0, x1; \
	pxor x1, x2; \
	pxor RNOT, x3; \
	pxor x0, x4; \
	pxor x2, x0; \
	pand x4, x1; \
	por x3, x4; \
	pxor x0, x4;
#define S5_2(x0, x1, x2, x3, x4) \
	pand x3, x0; \
	pxor x3, x1; \
	pxor x2, x3; \
	pxor x1, x0; \
	pand x4, x2; \
	pxor x2, x1; \
	pand x0, x2; \
	pxor x2, x3;

/*
 * S6 substitution step used by the encryption routine, written as
 * register-wide boolean operations (AT&T order: source, destination) and
 * split into two halves that are always expanded in the order _1, _2.
 * x4 carries no input: S6_1 overwrites it with movdqa before reading it.
 * "pxor RNOT, r" is a bitwise NOT; RNOT must hold all-ones.
 * The call sites in this file take (x2, x4, x1, x3) as the result words.
 */
#define S6_1(x0, x1, x2, x3, x4) \
	movdqa x1, x4; \
	pxor x0, x3; \
	pxor x2, x1; \
	pxor x0, x2; \
	pand x3, x0; \
	por x3, x1; \
	pxor RNOT, x4; \
	pxor x1, x0; \
	pxor x2, x1;
#define S6_2(x0, x1, x2, x3, x4) \
	pxor x4, x3; \
	pxor x0, x4; \
	pand x0, x2; \
	pxor x1, x4; \
	pxor x3, x2; \
	pand x1, x3; \
	pxor x0, x3; \
	pxor x2, x1;

/*
 * S7 substitution step used by the encryption routine, written as
 * register-wide boolean operations (AT&T order: source, destination) and
 * split into two halves that are always expanded in the order _1, _2.
 * x4 carries no input: S7_1 overwrites it with movdqa before reading it.
 * "pxor RNOT, r" is a bitwise NOT; RNOT must hold all-ones.
 * The call sites in this file take (x4, x2, x3, x0) as the result words.
 */
#define S7_1(x0, x1, x2, x3, x4) \
	pxor RNOT, x1; \
	movdqa x1, x4; \
	pxor RNOT, x0; \
	pand x2, x1; \
	pxor x3, x1; \
	por x4, x3; \
	pxor x2, x4; \
	pxor x3, x2; \
	pxor x0, x3; \
	por x1, x0;
#define S7_2(x0, x1, x2, x3, x4) \
	pand x0, x2; \
	pxor x4, x0; \
	pxor x3, x4; \
	pand x0, x3; \
	pxor x1, x4; \
	pxor x4, x2; \
	pxor x1, x3; \
	por x0, x4; \
	pxor x1, x4;

/*
 * SI0 substitution step used by the decryption routine, written as
 * register-wide boolean operations (AT&T order: source, destination) and
 * split into two halves; _1 must be expanded before _2 for a given group.
 * x4 carries no input: SI0_1 overwrites it with movdqa before reading it.
 * "pxor RNOT, r" is a bitwise NOT; RNOT must hold all-ones.
 * The call sites in this file take (x2, x4, x1, x0) as the result words.
 */
#define SI0_1(x0, x1, x2, x3, x4) \
	movdqa x3, x4; \
	pxor x0, x1; \
	por x1, x3; \
	pxor x1, x4; \
	pxor RNOT, x0; \
	pxor x3, x2; \
	pxor x0, x3; \
	pand x1, x0; \
	pxor x2, x0;
#define SI0_2(x0, x1, x2, x3, x4) \
	pand x3, x2; \
	pxor x4, x3; \
	pxor x3, x2; \
	pxor x3, x1; \
	pand x0, x3; \
	pxor x0, x1; \
	pxor x2, x0; \
	pxor x3, x4;

/*
 * SI1 substitution step used by the decryption routine, written as
 * register-wide boolean operations (AT&T order: source, destination) and
 * split into two halves; _1 must be expanded before _2 for a given group.
 * x4 carries no input: SI1_1 overwrites it with movdqa before reading it.
 * "pxor RNOT, r" is a bitwise NOT; RNOT must hold all-ones.
 * The call sites in this file take (x4, x1, x2, x3) as the result words.
 */
#define SI1_1(x0, x1, x2, x3, x4) \
	pxor x3, x1; \
	movdqa x0, x4; \
	pxor x2, x0; \
	pxor RNOT, x2; \
	por x1, x4; \
	pxor x3, x4; \
	pand x1, x3; \
	pxor x2, x1; \
	pand x4, x2;
#define SI1_2(x0, x1, x2, x3, x4) \
	pxor x1, x4; \
	por x3, x1; \
	pxor x0, x3; \
	pxor x0, x2; \
	por x4, x0; \
	pxor x4, x2; \
	pxor x0, x1; \
	pxor x1, x4;

/*
 * SI2 substitution step used by the decryption routine, written as
 * register-wide boolean operations (AT&T order: source, destination) and
 * split into two halves; _1 must be expanded before _2 for a given group.
 * x4 carries no input: SI2_1 overwrites it with movdqa before reading it.
 * "pxor RNOT, r" is a bitwise NOT; RNOT must hold all-ones.
 * The call sites in this file take (x1, x4, x3, x2) as the result words.
 */
#define SI2_1(x0, x1, x2, x3, x4) \
	pxor x1, x2; \
	movdqa x3, x4; \
	pxor RNOT, x3; \
	por x2, x3; \
	pxor x4, x2; \
	pxor x0, x4; \
	pxor x1, x3; \
	por x2, x1; \
	pxor x0, x2;
#define SI2_2(x0, x1, x2, x3, x4) \
	pxor x4, x1; \
	por x3, x4; \
	pxor x3, x2; \
	pxor x2, x4; \
	pand x1, x2; \
	pxor x3, x2; \
	pxor x4, x3; \
	pxor x0, x4;

/*
 * SI3 substitution step used by the decryption routine, written as
 * register-wide boolean operations (AT&T order: source, destination) and
 * split into two halves; _1 must be expanded before _2 for a given group.
 * x4 carries no input: SI3_1 overwrites it with movdqa before reading it.
 * The call sites in this file take (x2, x0, x4, x3) as the result words.
 */
#define SI3_1(x0, x1, x2, x3, x4) \
	pxor x1, x2; \
	movdqa x1, x4; \
	pand x2, x1; \
	pxor x0, x1; \
	por x4, x0; \
	pxor x3, x4; \
	pxor x3, x0; \
	por x1, x3; \
	pxor x2, x1;
#define SI3_2(x0, x1, x2, x3, x4) \
	pxor x3, x1; \
	pxor x2, x0; \
	pxor x3, x2; \
	pand x1, x3; \
	pxor x0, x1; \
	pand x2, x0; \
	pxor x3, x4; \
	pxor x0, x3; \
	pxor x1, x0;

/*
 * SI4 substitution step used by the decryption routine, written as
 * register-wide boolean operations (AT&T order: source, destination) and
 * split into two halves; _1 must be expanded before _2 for a given group.
 * x4 carries no input: SI4_1 overwrites it with movdqa before reading it.
 * "pxor RNOT, r" is a bitwise NOT; RNOT must hold all-ones.
 * The call sites in this file take (x0, x2, x4, x3) as the result words.
 */
#define SI4_1(x0, x1, x2, x3, x4) \
	pxor x3, x2; \
	movdqa x0, x4; \
	pand x1, x0; \
	pxor x2, x0; \
	por x3, x2; \
	pxor RNOT, x4; \
	pxor x0, x1; \
	pxor x2, x0; \
	pand x4, x2;
#define SI4_2(x0, x1, x2, x3, x4) \
	pxor x0, x2; \
	por x4, x0; \
	pxor x3, x0; \
	pand x2, x3; \
	pxor x3, x4; \
	pxor x1, x3; \
	pand x0, x1; \
	pxor x1, x4; \
	pxor x3, x0;

/*
 * SI5 substitution step used by the decryption routine, written as
 * register-wide boolean operations (AT&T order: source, destination) and
 * split into two halves; _1 must be expanded before _2 for a given group.
 * x4 carries no input: SI5_1 overwrites it with movdqa before reading it.
 * "pxor RNOT, r" is a bitwise NOT; RNOT must hold all-ones.
 * The call sites in this file take (x1, x4, x0, x2) as the result words.
 */
#define SI5_1(x0, x1, x2, x3, x4) \
	movdqa x1, x4; \
	por x2, x1; \
	pxor x4, x2; \
	pxor x3, x1; \
	pand x4, x3; \
	pxor x3, x2; \
	por x0, x3; \
	pxor RNOT, x0; \
	pxor x2, x3; \
	por x0, x2;
#define SI5_2(x0, x1, x2, x3, x4) \
	pxor x1, x4; \
	pxor x4, x2; \
	pand x0, x4; \
	pxor x1, x0; \
	pxor x3, x1; \
	pand x2, x0; \
	pxor x3, x2; \
	pxor x2, x0; \
	pxor x4, x2; \
	pxor x3, x4;

/*
 * SI6 substitution step used by the decryption routine, written as
 * register-wide boolean operations (AT&T order: source, destination) and
 * split into two halves; _1 must be expanded before _2 for a given group.
 * x4 carries no input: SI6_1 overwrites it with movdqa before reading it.
 * "pxor RNOT, r" is a bitwise NOT; RNOT must hold all-ones.
 * The call sites in this file take (x2, x4, x3, x0) as the result words.
 */
#define SI6_1(x0, x1, x2, x3, x4) \
	pxor x2, x0; \
	movdqa x0, x4; \
	pand x3, x0; \
	pxor x3, x2; \
	pxor x2, x0; \
	pxor x1, x3; \
	por x4, x2; \
	pxor x3, x2; \
	pand x0, x3;
#define SI6_2(x0, x1, x2, x3, x4) \
	pxor RNOT, x0; \
	pxor x1, x3; \
	pand x2, x1; \
	pxor x0, x4; \
	pxor x4, x3; \
	pxor x2, x4; \
	pxor x1, x0; \
	pxor x0, x2;

/*
 * SI7 substitution step used by the decryption routine, written as
 * register-wide boolean operations (AT&T order: source, destination) and
 * split into two halves; _1 must be expanded before _2 for a given group.
 * x4 carries no input: SI7_1 overwrites it with movdqa before reading it.
 * "pxor RNOT, r" is a bitwise NOT; RNOT must hold all-ones.
 * The call sites in this file take (x1, x3, x0, x4) as the result words.
 */
#define SI7_1(x0, x1, x2, x3, x4) \
	movdqa x3, x4; \
	pand x0, x3; \
	pxor x2, x0; \
	por x4, x2; \
	pxor x1, x4; \
	pxor RNOT, x0; \
	por x3, x1; \
	pxor x0, x4; \
	pand x2, x0; \
	pxor x1, x0;
#define SI7_2(x0, x1, x2, x3, x4) \
	pand x2, x1; \
	pxor x2, x3; \
	pxor x3, x4; \
	pand x3, x2; \
	por x0, x3; \
	pxor x4, x1; \
	pxor x4, x3; \
	pand x0, x4; \
	pxor x2, x4;

/*
 * get_key(i, j, t): load the 32-bit word at index 4*i + j (byte offset
 * (4*i + j) * 4) from the memory CTX points to and replicate it into all four
 * 32-bit lanes of XMM register t.
 *   i - round-key number, j - word within that round key (0..3 at the call
 *   sites in this file), t - destination XMM register.
 */
#define get_key(i, j, t) \
	movd (4*(i)+(j))*4(CTX), t; \
	pshufd $0, t, t;

/*
 * K2(x0, x1, x2, x3, x4, i): XOR round key i into both register groups.
 *
 * x0..x3 are register base names (e.g. RA); "## 1" / "## 2" paste the group
 * digit on so the same key words are applied to group 1 and then to group 2
 * (the doubly indented lines).  x4 is accepted for call-site symmetry and is
 * not used.  Clobbers RK0..RK3 with the four broadcast key words.
 *
 * The token pastes and line continuations are restored here: the damaged
 * text ended the macro after the first pxor and referenced bare "x0"..."x3".
 */
#define K2(x0, x1, x2, x3, x4, i) \
	get_key(i, 0, RK0); \
	get_key(i, 1, RK1); \
	get_key(i, 2, RK2); \
	get_key(i, 3, RK3); \
	pxor RK0, x0 ## 1; \
	pxor RK1, x1 ## 1; \
	pxor RK2, x2 ## 1; \
	pxor RK3, x3 ## 1; \
		pxor RK0, x0 ## 2; \
		pxor RK1, x1 ## 2; \
		pxor RK2, x2 ## 2; \
		pxor RK3, x3 ## 2;

/*
 * LK2(x0, x1, x2, x3, x4, i): linear mixing step followed by the XOR of
 * round key i, applied to both register groups (encryption direction).
 *
 * Per group, with rol() a 32-bit left rotate built from pslld/psrld/por and
 * x4 used as scratch:
 *     x0 = rol(x0, 13);  x1 ^= x0;
 *     x2 = rol(x2, 3);   x1 ^= x2;
 *     x1 = rol(x1, 1);   x3 ^= x2 ^ (x0 << 3);
 *     x3 = rol(x3, 7);   x0 ^= x1 ^ x3;   x2 ^= x3 ^ (x1 << 7);
 *     x1 ^= RK1;  x3 ^= RK3;
 *     x0 = rol(x0, 5);   x2 = rol(x2, 22);
 *     x0 ^= RK0;  x2 ^= RK2;
 *
 * Work for group 1 and group 2 (doubly indented) is interleaved in chunks and
 * the four get_key() loads are spread between the chunks; every RKn is loaded
 * before its first use.  Clobbers x4 of both groups and RK0..RK3.
 *
 * The second operands, "## 1" / "## 2" group suffixes and line continuations
 * are restored here; the damaged text had one-operand instructions such as
 * "movdqa x0" and ended the macro after its first line.
 */
#define LK2(x0, x1, x2, x3, x4, i) \
	movdqa x0 ## 1, x4 ## 1; \
	pslld $13, x0 ## 1; \
	psrld $(32 - 13), x4 ## 1; \
	por x4 ## 1, x0 ## 1; \
	pxor x0 ## 1, x1 ## 1; \
	movdqa x2 ## 1, x4 ## 1; \
	pslld $3, x2 ## 1; \
	psrld $(32 - 3), x4 ## 1; \
	por x4 ## 1, x2 ## 1; \
	pxor x2 ## 1, x1 ## 1; \
		movdqa x0 ## 2, x4 ## 2; \
		pslld $13, x0 ## 2; \
		psrld $(32 - 13), x4 ## 2; \
		por x4 ## 2, x0 ## 2; \
		pxor x0 ## 2, x1 ## 2; \
		movdqa x2 ## 2, x4 ## 2; \
		pslld $3, x2 ## 2; \
		psrld $(32 - 3), x4 ## 2; \
		por x4 ## 2, x2 ## 2; \
		pxor x2 ## 2, x1 ## 2; \
	movdqa x1 ## 1, x4 ## 1; \
	pslld $1, x1 ## 1; \
	psrld $(32 - 1), x4 ## 1; \
	por x4 ## 1, x1 ## 1; \
	movdqa x0 ## 1, x4 ## 1; \
	pslld $3, x4 ## 1; \
	pxor x2 ## 1, x3 ## 1; \
	pxor x4 ## 1, x3 ## 1; \
	movdqa x3 ## 1, x4 ## 1; \
	get_key(i, 1, RK1); \
		movdqa x1 ## 2, x4 ## 2; \
		pslld $1, x1 ## 2; \
		psrld $(32 - 1), x4 ## 2; \
		por x4 ## 2, x1 ## 2; \
		movdqa x0 ## 2, x4 ## 2; \
		pslld $3, x4 ## 2; \
		pxor x2 ## 2, x3 ## 2; \
		pxor x4 ## 2, x3 ## 2; \
		movdqa x3 ## 2, x4 ## 2; \
		get_key(i, 3, RK3); \
	pslld $7, x3 ## 1; \
	psrld $(32 - 7), x4 ## 1; \
	por x4 ## 1, x3 ## 1; \
	movdqa x1 ## 1, x4 ## 1; \
	pslld $7, x4 ## 1; \
	pxor x1 ## 1, x0 ## 1; \
	pxor x3 ## 1, x0 ## 1; \
	pxor x3 ## 1, x2 ## 1; \
	pxor x4 ## 1, x2 ## 1; \
	get_key(i, 0, RK0); \
		pslld $7, x3 ## 2; \
		psrld $(32 - 7), x4 ## 2; \
		por x4 ## 2, x3 ## 2; \
		movdqa x1 ## 2, x4 ## 2; \
		pslld $7, x4 ## 2; \
		pxor x1 ## 2, x0 ## 2; \
		pxor x3 ## 2, x0 ## 2; \
		pxor x3 ## 2, x2 ## 2; \
		pxor x4 ## 2, x2 ## 2; \
		get_key(i, 2, RK2); \
	pxor RK1, x1 ## 1; \
	pxor RK3, x3 ## 1; \
	movdqa x0 ## 1, x4 ## 1; \
	pslld $5, x0 ## 1; \
	psrld $(32 - 5), x4 ## 1; \
	por x4 ## 1, x0 ## 1; \
	movdqa x2 ## 1, x4 ## 1; \
	pslld $22, x2 ## 1; \
	psrld $(32 - 22), x4 ## 1; \
	por x4 ## 1, x2 ## 1; \
	pxor RK0, x0 ## 1; \
	pxor RK2, x2 ## 1; \
		pxor RK1, x1 ## 2; \
		pxor RK3, x3 ## 2; \
		movdqa x0 ## 2, x4 ## 2; \
		pslld $5, x0 ## 2; \
		psrld $(32 - 5), x4 ## 2; \
		por x4 ## 2, x0 ## 2; \
		movdqa x2 ## 2, x4 ## 2; \
		pslld $22, x2 ## 2; \
		psrld $(32 - 22), x4 ## 2; \
		por x4 ## 2, x2 ## 2; \
		pxor RK0, x0 ## 2; \
		pxor RK2, x2 ## 2;

/*
 * KL2(x0, x1, x2, x3, x4, i): XOR of the round key held in RK0..RK3 followed
 * by the inverse linear mixing step, applied to both register groups
 * (decryption direction).
 *
 * This macro does not load the key itself: RK0..RK3 must already hold the
 * broadcast words of the wanted round key (the call sites load them via SP()
 * just before).  The i argument is therefore unused in the body.
 *
 * Per group, with ror() a 32-bit right rotate built from psrld/pslld/por and
 * x4 used as scratch:
 *     x0 ^= RK0;  x2 ^= RK2;  x0 = ror(x0, 5);
 *     x3 ^= RK3;  x1 ^= RK1;  x2 = ror(x2, 22);
 *     x2 ^= x3 ^ (x1 << 7);   x0 ^= x1 ^ x3;
 *     x1 = ror(x1, 1);        x3 = ror(x3, 7);
 *     x1 ^= x0;               x3 ^= (x0 << 3);
 *     x0 = ror(x0, 13);       x1 ^= x2;   x3 ^= x2;
 *     x2 = ror(x2, 3);
 *
 * Group 2 work is doubly indented and interleaved with group 1.
 *
 * The second operands, "## 1" / "## 2" group suffixes and line continuations
 * are restored here; the damaged text had one-operand instructions such as
 * "movdqa x0" and ended the macro after its first line.
 */
#define KL2(x0, x1, x2, x3, x4, i) \
	pxor RK0, x0 ## 1; \
	pxor RK2, x2 ## 1; \
	movdqa x0 ## 1, x4 ## 1; \
	psrld $5, x0 ## 1; \
	pslld $(32 - 5), x4 ## 1; \
	por x4 ## 1, x0 ## 1; \
	pxor RK3, x3 ## 1; \
	pxor RK1, x1 ## 1; \
	movdqa x2 ## 1, x4 ## 1; \
	psrld $22, x2 ## 1; \
	pslld $(32 - 22), x4 ## 1; \
	por x4 ## 1, x2 ## 1; \
	pxor x3 ## 1, x2 ## 1; \
		pxor RK0, x0 ## 2; \
		pxor RK2, x2 ## 2; \
		movdqa x0 ## 2, x4 ## 2; \
		psrld $5, x0 ## 2; \
		pslld $(32 - 5), x4 ## 2; \
		por x4 ## 2, x0 ## 2; \
		pxor RK3, x3 ## 2; \
		pxor RK1, x1 ## 2; \
		movdqa x2 ## 2, x4 ## 2; \
		psrld $22, x2 ## 2; \
		pslld $(32 - 22), x4 ## 2; \
		por x4 ## 2, x2 ## 2; \
		pxor x3 ## 2, x2 ## 2; \
	pxor x3 ## 1, x0 ## 1; \
	movdqa x1 ## 1, x4 ## 1; \
	pslld $7, x4 ## 1; \
	pxor x1 ## 1, x0 ## 1; \
	pxor x4 ## 1, x2 ## 1; \
	movdqa x1 ## 1, x4 ## 1; \
	psrld $1, x1 ## 1; \
	pslld $(32 - 1), x4 ## 1; \
	por x4 ## 1, x1 ## 1; \
		pxor x3 ## 2, x0 ## 2; \
		movdqa x1 ## 2, x4 ## 2; \
		pslld $7, x4 ## 2; \
		pxor x1 ## 2, x0 ## 2; \
		pxor x4 ## 2, x2 ## 2; \
		movdqa x1 ## 2, x4 ## 2; \
		psrld $1, x1 ## 2; \
		pslld $(32 - 1), x4 ## 2; \
		por x4 ## 2, x1 ## 2; \
	movdqa x3 ## 1, x4 ## 1; \
	psrld $7, x3 ## 1; \
	pslld $(32 - 7), x4 ## 1; \
	por x4 ## 1, x3 ## 1; \
	pxor x0 ## 1, x1 ## 1; \
	movdqa x0 ## 1, x4 ## 1; \
	pslld $3, x4 ## 1; \
	pxor x4 ## 1, x3 ## 1; \
	movdqa x0 ## 1, x4 ## 1; \
		movdqa x3 ## 2, x4 ## 2; \
		psrld $7, x3 ## 2; \
		pslld $(32 - 7), x4 ## 2; \
		por x4 ## 2, x3 ## 2; \
		pxor x0 ## 2, x1 ## 2; \
		movdqa x0 ## 2, x4 ## 2; \
		pslld $3, x4 ## 2; \
		pxor x4 ## 2, x3 ## 2; \
		movdqa x0 ## 2, x4 ## 2; \
	psrld $13, x0 ## 1; \
	pslld $(32 - 13), x4 ## 1; \
	por x4 ## 1, x0 ## 1; \
	pxor x2 ## 1, x1 ## 1; \
	pxor x2 ## 1, x3 ## 1; \
	movdqa x2 ## 1, x4 ## 1; \
	psrld $3, x2 ## 1; \
	pslld $(32 - 3), x4 ## 1; \
	por x4 ## 1, x2 ## 1; \
		psrld $13, x0 ## 2; \
		pslld $(32 - 13), x4 ## 2; \
		por x4 ## 2, x0 ## 2; \
		pxor x2 ## 2, x1 ## 2; \
		pxor x2 ## 2, x3 ## 2; \
		movdqa x2 ## 2, x4 ## 2; \
		psrld $3, x2 ## 2; \
		pslld $(32 - 3), x4 ## 2; \
		por x4 ## 2, x2 ## 2;

/*
 * S(SBOX, x0, x1, x2, x3, x4): apply one substitution step to both register
 * groups.
 *
 * SBOX is a macro-name stem (S0..S7, SI0..SI7); "## _1" / "## _2" select its
 * two halves.  x0..x4 are register base names; "## 1" / "## 2" select the
 * group.  Group 1 is processed completely, then group 2 (doubly indented).
 *
 * The token pastes, argument lists and line continuations are restored here;
 * the damaged text had reduced each line to the bare word "SBOX".
 */
#define S(SBOX, x0, x1, x2, x3, x4) \
	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
		SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
		SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);

/*
 * SP(SBOX, x0, x1, x2, x3, x4, i): apply one substitution step to both
 * register groups while loading round key i into RK0..RK3.
 *
 * Same naming scheme as S(): "## _1" / "## _2" pick the half of SBOX and
 * "## 1" / "## 2" pick the register group.  The four get_key() loads are
 * interleaved with the substitution halves; they only write RK0..RK3, which
 * the substitution macros never touch.  For each group, half _1 is expanded
 * before half _2.  On exit RK0..RK3 hold round key i for the KL2() that
 * follows at the call sites.
 *
 * The token pastes, argument lists and line continuations are restored here;
 * the damaged text had reduced each substitution line to the bare word
 * "SBOX".
 */
#define SP(SBOX, x0, x1, x2, x3, x4, i) \
	get_key(i, 0, RK0); \
	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	get_key(i, 2, RK2); \
		SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
	get_key(i, 3, RK3); \
	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	get_key(i, 1, RK1); \
		SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);

/*
 * transpose_4x4(x0, x1, x2, x3, t0, t1, t2): transpose, in place, the 4x4
 * matrix of 32-bit words whose rows are x0..x3.  Afterwards register xN holds
 * word N of each of the four original registers.
 *
 * t1 and t2 are clobbered.  t0 is not referenced by the body; it is kept in
 * the parameter list so all block helpers share one signature.
 */
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	movdqa x0, t2; \
	punpckldq x1, x0; \
	punpckhdq x1, t2; \
	movdqa x2, t1; \
	punpckhdq x3, x2; \
	punpckldq x3, t1; \
	movdqa x0, x1; \
	punpcklqdq t1, x0; \
	punpckhqdq t1, x1; \
	movdqa t2, x3; \
	punpcklqdq x2, t2; \
	punpckhqdq x2, x3; \
	movdqa t2, x2;

/*
 * read_blocks(in, x0, x1, x2, x3, t0, t1, t2): load four consecutive 16-byte
 * values from the address in register "in" (unaligned loads) into x0..x3 and
 * transpose them, so that xN ends up holding 32-bit word N of all four
 * values.  t1 and t2 are clobbered by the transpose.
 */
#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
	movdqu (0*4*4)(in), x0; \
	movdqu (1*4*4)(in), x1; \
	movdqu (2*4*4)(in), x2; \
	movdqu (3*4*4)(in), x3; \
	\
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

/*
 * write_blocks(out, x0, x1, x2, x3, t0, t1, t2): transpose x0..x3 back to one
 * 16-byte value per register and store them to four consecutive 16-byte
 * slots at the address in register "out" (unaligned stores).
 * t1 and t2 are clobbered by the transpose.
 */
#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	movdqu x0, (0*4*4)(out); \
	movdqu x1, (1*4*4)(out); \
	movdqu x2, (2*4*4)(out); \
	movdqu x3, (3*4*4)(out);

/*
 * xor_blocks(out, x0, x1, x2, x3, t0, t1, t2): like write_blocks(), but each
 * 16-byte result is XORed with the data already present at its destination
 * slot before being stored back.  t0 is used to hold the existing data;
 * t1 and t2 are clobbered by the transpose.
 */
#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	movdqu (0*4*4)(out), t0; \
	pxor t0, x0; \
	movdqu x0, (0*4*4)(out); \
	movdqu (1*4*4)(out), t0; \
	pxor t0, x1; \
	movdqu x1, (1*4*4)(out); \
	movdqu (2*4*4)(out), t0; \
	pxor t0, x2; \
	movdqu x2, (2*4*4)(out); \
	movdqu (3*4*4)(out), t0; \
	pxor t0, x3; \
	movdqu x3, (3*4*4)(out);

ENTRY(__serpent_enc_blk_8way)
	/* Encrypt eight 16-byte blocks (128 bytes) in parallel.
	 *
	 * input:
	 *	%rdi: round-key words, read through CTX by get_key()
	 *	%rsi: destination buffer (128 bytes)
	 *	%rdx: source buffer (128 bytes)
	 *	%cl:  if non-zero, XOR the result into the destination
	 *	      instead of overwriting it
	 *
	 * clobbers: %rax, %xmm0-%xmm14, flags
	 */

	pcmpeqd RNOT, RNOT;			/* RNOT = all-ones */

	leaq (4*4*4)(%rdx), %rax;		/* %rax = second group of 4 blocks */
	read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	/*
	 * Each line is one round: substitution step, then linear mixing plus
	 * the next round key.  The register roles rotate from line to line,
	 * so the argument order of each macro follows the output order of
	 * the one before it.
	 */
						 K2(RA, RB, RC, RD, RE, 0);
	S(S0, RA, RB, RC, RD, RE);		LK2(RC, RB, RD, RA, RE, 1);
	S(S1, RC, RB, RD, RA, RE);		LK2(RE, RD, RA, RC, RB, 2);
	S(S2, RE, RD, RA, RC, RB);		LK2(RB, RD, RE, RC, RA, 3);
	S(S3, RB, RD, RE, RC, RA);		LK2(RC, RA, RD, RB, RE, 4);
	S(S4, RC, RA, RD, RB, RE);		LK2(RA, RD, RB, RE, RC, 5);
	S(S5, RA, RD, RB, RE, RC);		LK2(RC, RA, RD, RE, RB, 6);
	S(S6, RC, RA, RD, RE, RB);		LK2(RD, RB, RA, RE, RC, 7);
	S(S7, RD, RB, RA, RE, RC);		LK2(RC, RA, RE, RD, RB, 8);
	S(S0, RC, RA, RE, RD, RB);		LK2(RE, RA, RD, RC, RB, 9);
	S(S1, RE, RA, RD, RC, RB);		LK2(RB, RD, RC, RE, RA, 10);
	S(S2, RB, RD, RC, RE, RA);		LK2(RA, RD, RB, RE, RC, 11);
	S(S3, RA, RD, RB, RE, RC);		LK2(RE, RC, RD, RA, RB, 12);
	S(S4, RE, RC, RD, RA, RB);		LK2(RC, RD, RA, RB, RE, 13);
	S(S5, RC, RD, RA, RB, RE);		LK2(RE, RC, RD, RB, RA, 14);
	S(S6, RE, RC, RD, RB, RA);		LK2(RD, RA, RC, RB, RE, 15);
	S(S7, RD, RA, RC, RB, RE);		LK2(RE, RC, RB, RD, RA, 16);
	S(S0, RE, RC, RB, RD, RA);		LK2(RB, RC, RD, RE, RA, 17);
	S(S1, RB, RC, RD, RE, RA);		LK2(RA, RD, RE, RB, RC, 18);
	S(S2, RA, RD, RE, RB, RC);		LK2(RC, RD, RA, RB, RE, 19);
	S(S3, RC, RD, RA, RB, RE);		LK2(RB, RE, RD, RC, RA, 20);
	S(S4, RB, RE, RD, RC, RA);		LK2(RE, RD, RC, RA, RB, 21);
	S(S5, RE, RD, RC, RA, RB);		LK2(RB, RE, RD, RA, RC, 22);
	S(S6, RB, RE, RD, RA, RC);		LK2(RD, RC, RE, RA, RB, 23);
	S(S7, RD, RC, RE, RA, RB);		LK2(RB, RE, RA, RD, RC, 24);
	S(S0, RB, RE, RA, RD, RC);		LK2(RA, RE, RD, RB, RC, 25);
	S(S1, RA, RE, RD, RB, RC);		LK2(RC, RD, RB, RA, RE, 26);
	S(S2, RC, RD, RB, RA, RE);		LK2(RE, RD, RC, RA, RB, 27);
	S(S3, RE, RD, RC, RA, RB);		LK2(RA, RB, RD, RE, RC, 28);
	S(S4, RA, RB, RD, RE, RC);		LK2(RB, RD, RE, RC, RA, 29);
	S(S5, RB, RD, RE, RC, RA);		LK2(RA, RB, RD, RC, RE, 30);
	S(S6, RA, RB, RD, RC, RE);		LK2(RD, RE, RB, RC, RA, 31);
	S(S7, RD, RE, RB, RC, RA);		 K2(RA, RB, RC, RD, RE, 32);

	leaq (4*4*4)(%rsi), %rax;		/* %rax = second half of dst */

	testb %cl, %cl;
	jnz .L__enc_xor8;

	write_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	write_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	ret;

.L__enc_xor8:
	xor_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	xor_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	ret;
ENDPROC(__serpent_enc_blk_8way)

ENTRY(serpent_dec_blk_8way)
	/* Decrypt eight 16-byte blocks (128 bytes) in parallel.
	 *
	 * input:
	 *	%rdi: round-key words, read through CTX by get_key()
	 *	%rsi: destination buffer (128 bytes)
	 *	%rdx: source buffer (128 bytes)
	 *
	 * clobbers: %rax, %xmm0-%xmm14
	 */

	pcmpeqd RNOT, RNOT;			/* RNOT = all-ones */

	leaq (4*4*4)(%rdx), %rax;		/* %rax = second group of 4 blocks */
	read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	/*
	 * Each line is one round run backwards: SP() applies the inverse
	 * substitution while loading round key i, then KL2() XORs that key
	 * and undoes the linear mixing.  The register roles rotate from line
	 * to line.
	 */
						 K2(RA, RB, RC, RD, RE, 32);
	SP(SI7, RA, RB, RC, RD, RE, 31);	KL2(RB, RD, RA, RE, RC, 31);
	SP(SI6, RB, RD, RA, RE, RC, 30);	KL2(RA, RC, RE, RB, RD, 30);
	SP(SI5, RA, RC, RE, RB, RD, 29);	KL2(RC, RD, RA, RE, RB, 29);
	SP(SI4, RC, RD, RA, RE, RB, 28);	KL2(RC, RA, RB, RE, RD, 28);
	SP(SI3, RC, RA, RB, RE, RD, 27);	KL2(RB, RC, RD, RE, RA, 27);
	SP(SI2, RB, RC, RD, RE, RA, 26);	KL2(RC, RA, RE, RD, RB, 26);
	SP(SI1, RC, RA, RE, RD, RB, 25);	KL2(RB, RA, RE, RD, RC, 25);
	SP(SI0, RB, RA, RE, RD, RC, 24);	KL2(RE, RC, RA, RB, RD, 24);
	SP(SI7, RE, RC, RA, RB, RD, 23);	KL2(RC, RB, RE, RD, RA, 23);
	SP(SI6, RC, RB, RE, RD, RA, 22);	KL2(RE, RA, RD, RC, RB, 22);
	SP(SI5, RE, RA, RD, RC, RB, 21);	KL2(RA, RB, RE, RD, RC, 21);
	SP(SI4, RA, RB, RE, RD, RC, 20);	KL2(RA, RE, RC, RD, RB, 20);
	SP(SI3, RA, RE, RC, RD, RB, 19);	KL2(RC, RA, RB, RD, RE, 19);
	SP(SI2, RC, RA, RB, RD, RE, 18);	KL2(RA, RE, RD, RB, RC, 18);
	SP(SI1, RA, RE, RD, RB, RC, 17);	KL2(RC, RE, RD, RB, RA, 17);
	SP(SI0, RC, RE, RD, RB, RA, 16);	KL2(RD, RA, RE, RC, RB, 16);
	SP(SI7, RD, RA, RE, RC, RB, 15);	KL2(RA, RC, RD, RB, RE, 15);
	SP(SI6, RA, RC, RD, RB, RE, 14);	KL2(RD, RE, RB, RA, RC, 14);
	SP(SI5, RD, RE, RB, RA, RC, 13);	KL2(RE, RC, RD, RB, RA, 13);
	SP(SI4, RE, RC, RD, RB, RA, 12);	KL2(RE, RD, RA, RB, RC, 12);
	SP(SI3, RE, RD, RA, RB, RC, 11);	KL2(RA, RE, RC, RB, RD, 11);
	SP(SI2, RA, RE, RC, RB, RD, 10);	KL2(RE, RD, RB, RC, RA, 10);
	SP(SI1, RE, RD, RB, RC, RA, 9);		KL2(RA, RD, RB, RC, RE, 9);
	SP(SI0, RA, RD, RB, RC, RE, 8);		KL2(RB, RE, RD, RA, RC, 8);
	SP(SI7, RB, RE, RD, RA, RC, 7);		KL2(RE, RA, RB, RC, RD, 7);
	SP(SI6, RE, RA, RB, RC, RD, 6);		KL2(RB, RD, RC, RE, RA, 6);
	SP(SI5, RB, RD, RC, RE, RA, 5);		KL2(RD, RA, RB, RC, RE, 5);
	SP(SI4, RD, RA, RB, RC, RE, 4);		KL2(RD, RB, RE, RC, RA, 4);
	SP(SI3, RD, RB, RE, RC, RA, 3);		KL2(RE, RD, RA, RC, RB, 3);
	SP(SI2, RE, RD, RA, RC, RB, 2);		KL2(RD, RB, RC, RA, RE, 2);
	SP(SI1, RD, RB, RC, RA, RE, 1);		KL2(RE, RB, RC, RA, RD, 1);
	S(SI0, RE, RB, RC, RA, RD);		 K2(RC, RD, RB, RE, RA, 0);

	leaq (4*4*4)(%rsi), %rax;		/* %rax = second half of dst */
	write_blocks(%rsi, RC1, RD1, RB1, RE1, RK0, RK1, RK2);
	write_blocks(%rax, RC2, RD2, RB2, RE2, RK0, RK1, RK2);

	ret;
ENDPROC(serpent_dec_blk_8way)
