1
2
3
4
5
6
7
8
9
10
11
12#include <linux/linkage.h>
13
.file "serpent-sse2-x86_64-asm_64.S"
.text

/* Base register of every round-key load issued by get_key(). */
#define CTX	%rdi

/*
 * XMM register allocation.
 *
 * The cipher state lives in two independent register groups, suffixed
 * "1" and "2".  Each group is filled by one read_blocks() call, i.e. it
 * carries four 16-byte blocks, giving eight blocks per function call.
 * RA..RD hold the state words, RE is the spare register the S-box and
 * linear-transform macros rotate through as scratch.
 */

/* group 1 */
#define RA1	%xmm0
#define RB1	%xmm1
#define RC1	%xmm2
#define RD1	%xmm3
#define RE1	%xmm4

/* group 2 */
#define RA2	%xmm5
#define RB2	%xmm6
#define RC2	%xmm7
#define RD2	%xmm8
#define RE2	%xmm9

/* Set to all ones on function entry (pcmpeqd RNOT, RNOT); used for NOT. */
#define RNOT	%xmm10

/* Broadcast round-key words, loaded by get_key(). */
#define RK0	%xmm11
#define RK1	%xmm12
#define RK2	%xmm13
#define RK3	%xmm14
40
/*
 * S0 (encryption direction; presumably Serpent S-box 0 in bitsliced form),
 * split into two halves.  S0_2 consumes the register state left behind by
 * S0_1, so both must be expanded on the same five registers, _1 first.
 *
 *   x0..x3  input words
 *   x4      scratch: overwritten by movdqa before it is read
 *   RNOT    all ones, so "pxor RNOT, x" is a bitwise NOT of x
 *
 * Only lane-wise logic instructions are used, so every bit position is
 * processed independently.  The callers in this file take the result
 * from (x2, x1, x3, x0) and reuse x4 as the next scratch register.
 */
#define S0_1(x0, x1, x2, x3, x4) \
	movdqa	x3, x4; \
	por	x0, x3; \
	pxor	x4, x0; \
	pxor	x2, x4; \
	pxor	RNOT, x4; \
	pxor	x1, x3; \
	pand	x0, x1; \
	pxor	x4, x1; \
	pxor	x0, x2;
#define S0_2(x0, x1, x2, x3, x4) \
	pxor	x3, x0; \
	por	x0, x4; \
	pxor	x2, x0; \
	pand	x1, x2; \
	pxor	x2, x3; \
	pxor	RNOT, x1; \
	pxor	x4, x2; \
	pxor	x2, x1;
60
/*
 * S1 (encryption direction; presumably Serpent S-box 1 in bitsliced form),
 * split into two halves.  S1_2 continues from the register state left by
 * S1_1, so both must be expanded on the same five registers, _1 first.
 *
 *   x0..x3  input words
 *   x4      scratch: overwritten by movdqa before it is read
 *   RNOT    all ones, so "pxor RNOT, x" is a bitwise NOT of x
 *
 * The callers in this file take the result from (x4, x2, x3, x0) and
 * reuse x1 as the next scratch register.
 */
#define S1_1(x0, x1, x2, x3, x4) \
	movdqa	x1, x4; \
	pxor	x0, x1; \
	pxor	x3, x0; \
	pxor	RNOT, x3; \
	pand	x1, x4; \
	por	x1, x0; \
	pxor	x2, x3; \
	pxor	x3, x0; \
	pxor	x3, x1;
#define S1_2(x0, x1, x2, x3, x4) \
	pxor	x4, x3; \
	por	x4, x1; \
	pxor	x2, x4; \
	pand	x0, x2; \
	pxor	x1, x2; \
	por	x0, x1; \
	pxor	RNOT, x0; \
	pxor	x2, x0; \
	pxor	x1, x4;
81
/*
 * S2 (encryption direction; presumably Serpent S-box 2 in bitsliced form),
 * split into two halves.  S2_2 continues from the register state left by
 * S2_1, so both must be expanded on the same five registers, _1 first.
 *
 *   x0..x3  input words
 *   x4      scratch: overwritten by movdqa before it is read
 *   RNOT    all ones, so "pxor RNOT, x" is a bitwise NOT of x
 *
 * The callers in this file take the result from (x4, x1, x0, x3) and
 * reuse x2 as the next scratch register.
 */
#define S2_1(x0, x1, x2, x3, x4) \
	pxor	RNOT, x3; \
	pxor	x0, x1; \
	movdqa	x0, x4; \
	pand	x2, x0; \
	pxor	x3, x0; \
	por	x4, x3; \
	pxor	x1, x2; \
	pxor	x1, x3; \
	pand	x0, x1;
#define S2_2(x0, x1, x2, x3, x4) \
	pxor	x2, x0; \
	pand	x3, x2; \
	por	x1, x3; \
	pxor	RNOT, x0; \
	pxor	x0, x3; \
	pxor	x0, x4; \
	pxor	x2, x0; \
	por	x2, x1;
101
/*
 * S3 (encryption direction; presumably Serpent S-box 3 in bitsliced form),
 * split into two halves.  S3_2 continues from the register state left by
 * S3_1, so both must be expanded on the same five registers, _1 first.
 *
 *   x0..x3  input words
 *   x4      scratch: overwritten by movdqa before it is read
 *
 * This S-box needs no complement, so RNOT is not referenced.  The callers
 * in this file take the result from (x3, x4, x1, x0) and reuse x2 as the
 * next scratch register.
 */
#define S3_1(x0, x1, x2, x3, x4) \
	movdqa	x1, x4; \
	pxor	x3, x1; \
	por	x0, x3; \
	pand	x0, x4; \
	pxor	x2, x0; \
	pxor	x1, x2; \
	pand	x3, x1; \
	pxor	x3, x2; \
	por	x4, x0; \
	pxor	x3, x4;
#define S3_2(x0, x1, x2, x3, x4) \
	pxor	x0, x1; \
	pand	x3, x0; \
	pand	x4, x3; \
	pxor	x2, x3; \
	por	x1, x4; \
	pand	x1, x2; \
	pxor	x3, x4; \
	pxor	x3, x0; \
	pxor	x2, x3;
123
/*
 * S4 (encryption direction; presumably Serpent S-box 4 in bitsliced form),
 * split into two halves.  S4_2 continues from the register state left by
 * S4_1, so both must be expanded on the same five registers, _1 first.
 *
 *   x0..x3  input words
 *   x4      scratch: overwritten by movdqa before it is read
 *   RNOT    all ones, so "pxor RNOT, x" is a bitwise NOT of x
 *
 * The callers in this file take the result from (x1, x2, x3, x4) and
 * reuse x0 as the next scratch register.
 */
#define S4_1(x0, x1, x2, x3, x4) \
	movdqa	x3, x4; \
	pand	x0, x3; \
	pxor	x4, x0; \
	pxor	x2, x3; \
	por	x4, x2; \
	pxor	x1, x0; \
	pxor	x3, x4; \
	por	x0, x2; \
	pxor	x1, x2;
#define S4_2(x0, x1, x2, x3, x4) \
	pand	x0, x1; \
	pxor	x4, x1; \
	pand	x2, x4; \
	pxor	x3, x2; \
	pxor	x0, x4; \
	por	x1, x3; \
	pxor	RNOT, x1; \
	pxor	x0, x3;
143
/*
 * S5 (encryption direction; presumably Serpent S-box 5 in bitsliced form),
 * split into two halves.  S5_2 continues from the register state left by
 * S5_1, so both must be expanded on the same five registers, _1 first.
 *
 *   x0..x3  input words
 *   x4      scratch: overwritten by movdqa before it is read
 *   RNOT    all ones, so "pxor RNOT, x" is a bitwise NOT of x
 *
 * The callers in this file take the result from (x4, x0, x1, x3) and
 * reuse x2 as the next scratch register.
 */
#define S5_1(x0, x1, x2, x3, x4) \
	movdqa	x1, x4; \
	por	x0, x1; \
	pxor	x1, x2; \
	pxor	RNOT, x3; \
	pxor	x0, x4; \
	pxor	x2, x0; \
	pand	x4, x1; \
	por	x3, x4; \
	pxor	x0, x4;
#define S5_2(x0, x1, x2, x3, x4) \
	pand	x3, x0; \
	pxor	x3, x1; \
	pxor	x2, x3; \
	pxor	x1, x0; \
	pand	x4, x2; \
	pxor	x2, x1; \
	pand	x0, x2; \
	pxor	x2, x3;
163
/*
 * S6 (encryption direction; presumably Serpent S-box 6 in bitsliced form),
 * split into two halves.  S6_2 continues from the register state left by
 * S6_1, so both must be expanded on the same five registers, _1 first.
 *
 *   x0..x3  input words
 *   x4      scratch: overwritten by movdqa before it is read
 *   RNOT    all ones, so "pxor RNOT, x" is a bitwise NOT of x
 *
 * The callers in this file take the result from (x2, x4, x1, x3) and
 * reuse x0 as the next scratch register.
 */
#define S6_1(x0, x1, x2, x3, x4) \
	movdqa	x1, x4; \
	pxor	x0, x3; \
	pxor	x2, x1; \
	pxor	x0, x2; \
	pand	x3, x0; \
	por	x3, x1; \
	pxor	RNOT, x4; \
	pxor	x1, x0; \
	pxor	x2, x1;
#define S6_2(x0, x1, x2, x3, x4) \
	pxor	x4, x3; \
	pxor	x0, x4; \
	pand	x0, x2; \
	pxor	x1, x4; \
	pxor	x3, x2; \
	pand	x1, x3; \
	pxor	x0, x3; \
	pxor	x2, x1;
183
/*
 * S7 (encryption direction; presumably Serpent S-box 7 in bitsliced form),
 * split into two halves.  S7_2 continues from the register state left by
 * S7_1, so both must be expanded on the same five registers, _1 first.
 *
 *   x0..x3  input words
 *   x4      scratch: overwritten by movdqa before it is read
 *   RNOT    all ones, so "pxor RNOT, x" is a bitwise NOT of x
 *
 * The callers in this file take the result from (x4, x2, x3, x0) and
 * reuse x1 as the next scratch register.
 */
#define S7_1(x0, x1, x2, x3, x4) \
	pxor	RNOT, x1; \
	movdqa	x1, x4; \
	pxor	RNOT, x0; \
	pand	x2, x1; \
	pxor	x3, x1; \
	por	x4, x3; \
	pxor	x2, x4; \
	pxor	x3, x2; \
	pxor	x0, x3; \
	por	x1, x0;
#define S7_2(x0, x1, x2, x3, x4) \
	pand	x0, x2; \
	pxor	x4, x0; \
	pxor	x3, x4; \
	pand	x0, x3; \
	pxor	x1, x4; \
	pxor	x4, x2; \
	pxor	x1, x3; \
	por	x0, x4; \
	pxor	x1, x4;
205
/*
 * SI0 (decryption direction; presumably the inverse of Serpent S-box 0 in
 * bitsliced form), split into two halves.  SI0_2 continues from the
 * register state left by SI0_1, so both must be expanded on the same five
 * registers, _1 first.
 *
 *   x0..x3  input words
 *   x4      scratch: overwritten by movdqa before it is read
 *   RNOT    all ones, so "pxor RNOT, x" is a bitwise NOT of x
 *
 * The callers in this file take the result from (x2, x4, x1, x0) and
 * reuse x3 as the next scratch register.
 */
#define SI0_1(x0, x1, x2, x3, x4) \
	movdqa	x3, x4; \
	pxor	x0, x1; \
	por	x1, x3; \
	pxor	x1, x4; \
	pxor	RNOT, x0; \
	pxor	x3, x2; \
	pxor	x0, x3; \
	pand	x1, x0; \
	pxor	x2, x0;
#define SI0_2(x0, x1, x2, x3, x4) \
	pand	x3, x2; \
	pxor	x4, x3; \
	pxor	x3, x2; \
	pxor	x3, x1; \
	pand	x0, x3; \
	pxor	x0, x1; \
	pxor	x2, x0; \
	pxor	x3, x4;
225
/*
 * SI1 (decryption direction; presumably the inverse of Serpent S-box 1 in
 * bitsliced form), split into two halves.  SI1_2 continues from the
 * register state left by SI1_1, so both must be expanded on the same five
 * registers, _1 first.
 *
 *   x0..x3  input words
 *   x4      scratch: overwritten by movdqa before it is read
 *   RNOT    all ones, so "pxor RNOT, x" is a bitwise NOT of x
 *
 * The callers in this file take the result from (x4, x1, x2, x3) and
 * reuse x0 as the next scratch register.
 */
#define SI1_1(x0, x1, x2, x3, x4) \
	pxor	x3, x1; \
	movdqa	x0, x4; \
	pxor	x2, x0; \
	pxor	RNOT, x2; \
	por	x1, x4; \
	pxor	x3, x4; \
	pand	x1, x3; \
	pxor	x2, x1; \
	pand	x4, x2;
#define SI1_2(x0, x1, x2, x3, x4) \
	pxor	x1, x4; \
	por	x3, x1; \
	pxor	x0, x3; \
	pxor	x0, x2; \
	por	x4, x0; \
	pxor	x4, x2; \
	pxor	x0, x1; \
	pxor	x1, x4;
245
/*
 * SI2 (decryption direction; presumably the inverse of Serpent S-box 2 in
 * bitsliced form), split into two halves.  SI2_2 continues from the
 * register state left by SI2_1, so both must be expanded on the same five
 * registers, _1 first.
 *
 *   x0..x3  input words
 *   x4      scratch: overwritten by movdqa before it is read
 *   RNOT    all ones, so "pxor RNOT, x" is a bitwise NOT of x
 *
 * The callers in this file take the result from (x1, x4, x3, x2) and
 * reuse x0 as the next scratch register.
 */
#define SI2_1(x0, x1, x2, x3, x4) \
	pxor	x1, x2; \
	movdqa	x3, x4; \
	pxor	RNOT, x3; \
	por	x2, x3; \
	pxor	x4, x2; \
	pxor	x0, x4; \
	pxor	x1, x3; \
	por	x2, x1; \
	pxor	x0, x2;
#define SI2_2(x0, x1, x2, x3, x4) \
	pxor	x4, x1; \
	por	x3, x4; \
	pxor	x3, x2; \
	pxor	x2, x4; \
	pand	x1, x2; \
	pxor	x3, x2; \
	pxor	x4, x3; \
	pxor	x0, x4;
265
/*
 * SI3 (decryption direction; presumably the inverse of Serpent S-box 3 in
 * bitsliced form), split into two halves.  SI3_2 continues from the
 * register state left by SI3_1, so both must be expanded on the same five
 * registers, _1 first.
 *
 *   x0..x3  input words
 *   x4      scratch: overwritten by movdqa before it is read
 *
 * This S-box needs no complement, so RNOT is not referenced.  The callers
 * in this file take the result from (x2, x0, x4, x3) and reuse x1 as the
 * next scratch register.
 */
#define SI3_1(x0, x1, x2, x3, x4) \
	pxor	x1, x2; \
	movdqa	x1, x4; \
	pand	x2, x1; \
	pxor	x0, x1; \
	por	x4, x0; \
	pxor	x3, x4; \
	pxor	x3, x0; \
	por	x1, x3; \
	pxor	x2, x1;
#define SI3_2(x0, x1, x2, x3, x4) \
	pxor	x3, x1; \
	pxor	x2, x0; \
	pxor	x3, x2; \
	pand	x1, x3; \
	pxor	x0, x1; \
	pand	x2, x0; \
	pxor	x3, x4; \
	pxor	x0, x3; \
	pxor	x1, x0;
286
/*
 * SI4 (decryption direction; presumably the inverse of Serpent S-box 4 in
 * bitsliced form), split into two halves.  SI4_2 continues from the
 * register state left by SI4_1, so both must be expanded on the same five
 * registers, _1 first.
 *
 *   x0..x3  input words
 *   x4      scratch: overwritten by movdqa before it is read
 *   RNOT    all ones, so "pxor RNOT, x" is a bitwise NOT of x
 *
 * The callers in this file take the result from (x0, x2, x4, x3) and
 * reuse x1 as the next scratch register.
 */
#define SI4_1(x0, x1, x2, x3, x4) \
	pxor	x3, x2; \
	movdqa	x0, x4; \
	pand	x1, x0; \
	pxor	x2, x0; \
	por	x3, x2; \
	pxor	RNOT, x4; \
	pxor	x0, x1; \
	pxor	x2, x0; \
	pand	x4, x2;
#define SI4_2(x0, x1, x2, x3, x4) \
	pxor	x0, x2; \
	por	x4, x0; \
	pxor	x3, x0; \
	pand	x2, x3; \
	pxor	x3, x4; \
	pxor	x1, x3; \
	pand	x0, x1; \
	pxor	x1, x4; \
	pxor	x3, x0;
307
/*
 * SI5 (decryption direction; presumably the inverse of Serpent S-box 5 in
 * bitsliced form), split into two halves.  SI5_2 continues from the
 * register state left by SI5_1, so both must be expanded on the same five
 * registers, _1 first.
 *
 *   x0..x3  input words
 *   x4      scratch: overwritten by movdqa before it is read
 *   RNOT    all ones, so "pxor RNOT, x" is a bitwise NOT of x
 *
 * The callers in this file take the result from (x1, x4, x0, x2) and
 * reuse x3 as the next scratch register.
 */
#define SI5_1(x0, x1, x2, x3, x4) \
	movdqa	x1, x4; \
	por	x2, x1; \
	pxor	x4, x2; \
	pxor	x3, x1; \
	pand	x4, x3; \
	pxor	x3, x2; \
	por	x0, x3; \
	pxor	RNOT, x0; \
	pxor	x2, x3; \
	por	x0, x2;
#define SI5_2(x0, x1, x2, x3, x4) \
	pxor	x1, x4; \
	pxor	x4, x2; \
	pand	x0, x4; \
	pxor	x1, x0; \
	pxor	x3, x1; \
	pand	x2, x0; \
	pxor	x3, x2; \
	pxor	x2, x0; \
	pxor	x4, x2; \
	pxor	x3, x4;
330
/*
 * SI6 (decryption direction; presumably the inverse of Serpent S-box 6 in
 * bitsliced form), split into two halves.  SI6_2 continues from the
 * register state left by SI6_1, so both must be expanded on the same five
 * registers, _1 first.
 *
 *   x0..x3  input words
 *   x4      scratch: overwritten by movdqa before it is read
 *   RNOT    all ones, so "pxor RNOT, x" is a bitwise NOT of x
 *
 * The callers in this file take the result from (x2, x4, x3, x0) and
 * reuse x1 as the next scratch register.
 */
#define SI6_1(x0, x1, x2, x3, x4) \
	pxor	x2, x0; \
	movdqa	x0, x4; \
	pand	x3, x0; \
	pxor	x3, x2; \
	pxor	x2, x0; \
	pxor	x1, x3; \
	por	x4, x2; \
	pxor	x3, x2; \
	pand	x0, x3;
#define SI6_2(x0, x1, x2, x3, x4) \
	pxor	RNOT, x0; \
	pxor	x1, x3; \
	pand	x2, x1; \
	pxor	x0, x4; \
	pxor	x4, x3; \
	pxor	x2, x4; \
	pxor	x1, x0; \
	pxor	x0, x2;
350
/*
 * SI7 (decryption direction; presumably the inverse of Serpent S-box 7 in
 * bitsliced form), split into two halves.  SI7_2 continues from the
 * register state left by SI7_1, so both must be expanded on the same five
 * registers, _1 first.
 *
 *   x0..x3  input words
 *   x4      scratch: overwritten by movdqa before it is read
 *   RNOT    all ones, so "pxor RNOT, x" is a bitwise NOT of x
 *
 * The callers in this file take the result from (x1, x3, x0, x4) and
 * reuse x2 as the next scratch register.
 */
#define SI7_1(x0, x1, x2, x3, x4) \
	movdqa	x3, x4; \
	pand	x0, x3; \
	pxor	x2, x0; \
	por	x4, x2; \
	pxor	x1, x4; \
	pxor	RNOT, x0; \
	por	x3, x1; \
	pxor	x0, x4; \
	pand	x2, x0; \
	pxor	x1, x0;
#define SI7_2(x0, x1, x2, x3, x4) \
	pand	x2, x1; \
	pxor	x2, x3; \
	pxor	x3, x4; \
	pand	x3, x2; \
	por	x0, x3; \
	pxor	x4, x1; \
	pxor	x4, x3; \
	pand	x0, x4; \
	pxor	x2, x4;
372
/*
 * get_key(i, j, t): load one 32-bit round-key word and replicate it.
 *
 *   i, j  word index is 4*i + j, i.e. byte offset (4*i + j) * 4 from CTX
 *         (callers pass the round number as i and 0..3 as j)
 *   t     destination XMM register
 *
 * movd places the word in the low dword of t; pshufd $0 then copies that
 * dword into all four dword lanes.
 */
#define get_key(i, j, t) \
	movd	(4*(i)+(j))*4(CTX), t; \
	pshufd	$0, t, t;
376
/*
 * K2(x0, x1, x2, x3, x4, i): key mixing for both register groups.
 *
 *   x0..x3  register-name stems (RA..RE); "1" and "2" are pasted on to
 *           select the group-1 and group-2 registers
 *   x4      unused; kept so all round macros share one argument layout
 *   i       round-key index handed to get_key()
 *
 * Loads the four key words of round i into RK0..RK3 (each broadcast to all
 * lanes) and XORs them into the state words of both groups.
 * Clobbers RK0..RK3.
 */
#define K2(x0, x1, x2, x3, x4, i) \
	get_key(i, 0, RK0); \
	get_key(i, 1, RK1); \
	get_key(i, 2, RK2); \
	get_key(i, 3, RK3); \
	pxor RK0,		x0 ## 1; \
	pxor RK1,		x1 ## 1; \
	pxor RK2,		x2 ## 1; \
	pxor RK3,		x3 ## 1; \
		pxor RK0,		x0 ## 2; \
		pxor RK1,		x1 ## 2; \
		pxor RK2,		x2 ## 2; \
		pxor RK3,		x3 ## 2;
390
/*
 * LK2(x0, x1, x2, x3, x4, i): linear transformation followed by key mixing
 * with round key i, applied to both register groups.
 *
 *   x0..x3  register-name stems of the state words; "1"/"2" is pasted on
 *           to address group 1 / group 2
 *   x4      stem of the scratch register pair
 *   i       round-key index handed to get_key()
 *
 * Per 32-bit lane the macro computes
 *
 *	x0 = rol32(x0, 13);	x2 = rol32(x2, 3);
 *	x1 ^= x0 ^ x2;		x3 ^= x2 ^ (x0 << 3);
 *	x1 = rol32(x1, 1);	x3 = rol32(x3, 7);
 *	x0 ^= x1 ^ x3;		x2 ^= x3 ^ (x1 << 7);
 *	x0 = rol32(x0, 5);	x2 = rol32(x2, 22);
 *	x0 ^= k[0]; x1 ^= k[1]; x2 ^= k[2]; x3 ^= k[3];
 *
 * SSE2 has no rotate instruction, so each rol32(x, n) is built from a copy
 * in x4, pslld n, psrld (32 - n) and por.  The group-2 instructions (extra
 * indent) and the get_key() loads are interleaved with the group-1
 * instructions; the two groups never touch each other's registers.
 * Clobbers x4 ## 1, x4 ## 2 and RK0..RK3.
 */
#define LK2(x0, x1, x2, x3, x4, i) \
	movdqa x0 ## 1,		x4 ## 1; \
	pslld $13,		x0 ## 1; \
	psrld $(32 - 13),	x4 ## 1; \
	por x4 ## 1,		x0 ## 1; \
	pxor x0 ## 1,		x1 ## 1; \
	movdqa x2 ## 1,		x4 ## 1; \
	pslld $3,		x2 ## 1; \
	psrld $(32 - 3),	x4 ## 1; \
	por x4 ## 1,		x2 ## 1; \
	pxor x2 ## 1,		x1 ## 1; \
		movdqa x0 ## 2,		x4 ## 2; \
		pslld $13,		x0 ## 2; \
		psrld $(32 - 13),	x4 ## 2; \
		por x4 ## 2,		x0 ## 2; \
		pxor x0 ## 2,		x1 ## 2; \
		movdqa x2 ## 2,		x4 ## 2; \
		pslld $3,		x2 ## 2; \
		psrld $(32 - 3),	x4 ## 2; \
		por x4 ## 2,		x2 ## 2; \
		pxor x2 ## 2,		x1 ## 2; \
	movdqa x1 ## 1,		x4 ## 1; \
	pslld $1,		x1 ## 1; \
	psrld $(32 - 1),	x4 ## 1; \
	por x4 ## 1,		x1 ## 1; \
	movdqa x0 ## 1,		x4 ## 1; \
	pslld $3,		x4 ## 1; \
	pxor x2 ## 1,		x3 ## 1; \
	pxor x4 ## 1,		x3 ## 1; \
	movdqa x3 ## 1,		x4 ## 1; \
	get_key(i, 1, RK1); \
		movdqa x1 ## 2,		x4 ## 2; \
		pslld $1,		x1 ## 2; \
		psrld $(32 - 1),	x4 ## 2; \
		por x4 ## 2,		x1 ## 2; \
		movdqa x0 ## 2,		x4 ## 2; \
		pslld $3,		x4 ## 2; \
		pxor x2 ## 2,		x3 ## 2; \
		pxor x4 ## 2,		x3 ## 2; \
		movdqa x3 ## 2,		x4 ## 2; \
		get_key(i, 3, RK3); \
	pslld $7,		x3 ## 1; \
	psrld $(32 - 7),	x4 ## 1; \
	por x4 ## 1,		x3 ## 1; \
	movdqa x1 ## 1,		x4 ## 1; \
	pslld $7,		x4 ## 1; \
	pxor x1 ## 1,		x0 ## 1; \
	pxor x3 ## 1,		x0 ## 1; \
	pxor x3 ## 1,		x2 ## 1; \
	pxor x4 ## 1,		x2 ## 1; \
	get_key(i, 0, RK0); \
		pslld $7,		x3 ## 2; \
		psrld $(32 - 7),	x4 ## 2; \
		por x4 ## 2,		x3 ## 2; \
		movdqa x1 ## 2,		x4 ## 2; \
		pslld $7,		x4 ## 2; \
		pxor x1 ## 2,		x0 ## 2; \
		pxor x3 ## 2,		x0 ## 2; \
		pxor x3 ## 2,		x2 ## 2; \
		pxor x4 ## 2,		x2 ## 2; \
		get_key(i, 2, RK2); \
	pxor RK1,		x1 ## 1; \
	pxor RK3,		x3 ## 1; \
	movdqa x0 ## 1,		x4 ## 1; \
	pslld $5,		x0 ## 1; \
	psrld $(32 - 5),	x4 ## 1; \
	por x4 ## 1,		x0 ## 1; \
	movdqa x2 ## 1,		x4 ## 1; \
	pslld $22,		x2 ## 1; \
	psrld $(32 - 22),	x4 ## 1; \
	por x4 ## 1,		x2 ## 1; \
	pxor RK0,		x0 ## 1; \
	pxor RK2,		x2 ## 1; \
		pxor RK1,		x1 ## 2; \
		pxor RK3,		x3 ## 2; \
		movdqa x0 ## 2,		x4 ## 2; \
		pslld $5,		x0 ## 2; \
		psrld $(32 - 5),	x4 ## 2; \
		por x4 ## 2,		x0 ## 2; \
		movdqa x2 ## 2,		x4 ## 2; \
		pslld $22,		x2 ## 2; \
		psrld $(32 - 22),	x4 ## 2; \
		por x4 ## 2,		x2 ## 2; \
		pxor RK0,		x0 ## 2; \
		pxor RK2,		x2 ## 2;
476
/*
 * KL2(x0, x1, x2, x3, x4, i): key mixing followed by the inverse linear
 * transformation, applied to both register groups.
 *
 *   x0..x3  register-name stems of the state words; "1"/"2" is pasted on
 *           to address group 1 / group 2
 *   x4      stem of the scratch register pair
 *   i       round number; not referenced in the body, because RK0..RK3
 *           must already hold the broadcast key words (the preceding SP()
 *           loads them)
 *
 * Per 32-bit lane the macro computes
 *
 *	x0 ^= k[0]; x1 ^= k[1]; x2 ^= k[2]; x3 ^= k[3];
 *	x0 = ror32(x0, 5);	x2 = ror32(x2, 22);
 *	x2 ^= x3 ^ (x1 << 7);	x0 ^= x1 ^ x3;
 *	x1 = ror32(x1, 1);	x3 = ror32(x3, 7);
 *	x1 ^= x0 ^ x2;		x3 ^= x2 ^ (x0 << 3);
 *	x0 = ror32(x0, 13);	x2 = ror32(x2, 3);
 *
 * Each ror32(x, n) is built from a copy in x4, psrld n, pslld (32 - n) and
 * por.  Group-2 instructions (extra indent) are interleaved with group-1
 * instructions; the groups never touch each other's registers.
 * Clobbers x4 ## 1 and x4 ## 2.
 */
#define KL2(x0, x1, x2, x3, x4, i) \
	pxor RK0,		x0 ## 1; \
	pxor RK2,		x2 ## 1; \
	movdqa x0 ## 1,		x4 ## 1; \
	psrld $5,		x0 ## 1; \
	pslld $(32 - 5),	x4 ## 1; \
	por x4 ## 1,		x0 ## 1; \
	pxor RK3,		x3 ## 1; \
	pxor RK1,		x1 ## 1; \
	movdqa x2 ## 1,		x4 ## 1; \
	psrld $22,		x2 ## 1; \
	pslld $(32 - 22),	x4 ## 1; \
	por x4 ## 1,		x2 ## 1; \
	pxor x3 ## 1,		x2 ## 1; \
		pxor RK0,		x0 ## 2; \
		pxor RK2,		x2 ## 2; \
		movdqa x0 ## 2,		x4 ## 2; \
		psrld $5,		x0 ## 2; \
		pslld $(32 - 5),	x4 ## 2; \
		por x4 ## 2,		x0 ## 2; \
		pxor RK3,		x3 ## 2; \
		pxor RK1,		x1 ## 2; \
		movdqa x2 ## 2,		x4 ## 2; \
		psrld $22,		x2 ## 2; \
		pslld $(32 - 22),	x4 ## 2; \
		por x4 ## 2,		x2 ## 2; \
		pxor x3 ## 2,		x2 ## 2; \
	pxor x3 ## 1,		x0 ## 1; \
	movdqa x1 ## 1,		x4 ## 1; \
	pslld $7,		x4 ## 1; \
	pxor x1 ## 1,		x0 ## 1; \
	pxor x4 ## 1,		x2 ## 1; \
	movdqa x1 ## 1,		x4 ## 1; \
	psrld $1,		x1 ## 1; \
	pslld $(32 - 1),	x4 ## 1; \
	por x4 ## 1,		x1 ## 1; \
		pxor x3 ## 2,		x0 ## 2; \
		movdqa x1 ## 2,		x4 ## 2; \
		pslld $7,		x4 ## 2; \
		pxor x1 ## 2,		x0 ## 2; \
		pxor x4 ## 2,		x2 ## 2; \
		movdqa x1 ## 2,		x4 ## 2; \
		psrld $1,		x1 ## 2; \
		pslld $(32 - 1),	x4 ## 2; \
		por x4 ## 2,		x1 ## 2; \
	movdqa x3 ## 1,		x4 ## 1; \
	psrld $7,		x3 ## 1; \
	pslld $(32 - 7),	x4 ## 1; \
	por x4 ## 1,		x3 ## 1; \
	pxor x0 ## 1,		x1 ## 1; \
	movdqa x0 ## 1,		x4 ## 1; \
	pslld $3,		x4 ## 1; \
	pxor x4 ## 1,		x3 ## 1; \
	movdqa x0 ## 1,		x4 ## 1; \
		movdqa x3 ## 2,		x4 ## 2; \
		psrld $7,		x3 ## 2; \
		pslld $(32 - 7),	x4 ## 2; \
		por x4 ## 2,		x3 ## 2; \
		pxor x0 ## 2,		x1 ## 2; \
		movdqa x0 ## 2,		x4 ## 2; \
		pslld $3,		x4 ## 2; \
		pxor x4 ## 2,		x3 ## 2; \
		movdqa x0 ## 2,		x4 ## 2; \
	psrld $13,		x0 ## 1; \
	pslld $(32 - 13),	x4 ## 1; \
	por x4 ## 1,		x0 ## 1; \
	pxor x2 ## 1,		x1 ## 1; \
	pxor x2 ## 1,		x3 ## 1; \
	movdqa x2 ## 1,		x4 ## 1; \
	psrld $3,		x2 ## 1; \
	pslld $(32 - 3),	x4 ## 1; \
	por x4 ## 1,		x2 ## 1; \
		psrld $13,		x0 ## 2; \
		pslld $(32 - 13),	x4 ## 2; \
		por x4 ## 2,		x0 ## 2; \
		pxor x2 ## 2,		x1 ## 2; \
		pxor x2 ## 2,		x3 ## 2; \
		movdqa x2 ## 2,		x4 ## 2; \
		psrld $3,		x2 ## 2; \
		pslld $(32 - 3),	x4 ## 2; \
		por x4 ## 2,		x2 ## 2;
558
/*
 * S(SBOX, x0, x1, x2, x3, x4): apply one S-box to both register groups.
 *
 *   SBOX    S-box name stem (S0..S7 or SI0..SI7); "_1" and "_2" are pasted
 *           on to select the two halves of that S-box
 *   x0..x4  register-name stems (RA..RE); "1"/"2" is pasted on to address
 *           group 1 / group 2
 *
 * For each group the _1 half is expanded first and the _2 half right after
 * it on the same registers, because _2 continues from the state _1 leaves
 * behind.
 */
#define S(SBOX, x0, x1, x2, x3, x4) \
	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);
564
/*
 * SP(SBOX, x0, x1, x2, x3, x4, i): apply one S-box to both register groups
 * while preloading the round-key words of round i.
 *
 *   SBOX    S-box name stem; "_1" and "_2" select its two halves
 *   x0..x4  register-name stems (RA..RE); "1"/"2" is pasted on to address
 *           group 1 / group 2
 *   i       round-key index handed to get_key()
 *
 * The four get_key() loads fill RK0..RK3 for the KL2() that follows, which
 * does not load keys itself.  The S-box halves only touch the state
 * registers and RNOT, so the loads can be slotted between them freely; for
 * each group the _1 half still precedes the _2 half.
 * Clobbers RK0..RK3.
 */
#define SP(SBOX, x0, x1, x2, x3, x4, i) \
	get_key(i, 0, RK0); \
	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	get_key(i, 2, RK2); \
	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
	get_key(i, 3, RK3); \
	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	get_key(i, 1, RK1); \
	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);
574
/*
 * transpose_4x4(x0, x1, x2, x3, t0, t1, t2): transpose the 4x4 matrix of
 * 32-bit words held in x0..x3 (one row per register), in place.
 *
 *   x0..x3  rows on entry, columns on exit
 *   t1, t2  temporaries, clobbered
 *   t0      not used here; part of the argument list shared with the
 *           block load/store macros
 *
 * Step 1 interleaves dwords of row pairs (punpck{l,h}dq), step 2 combines
 * the resulting qword halves (punpck{l,h}qdq).
 */
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	movdqa		x0, t2; \
	punpckldq	x1, x0; \
	punpckhdq	x1, t2; \
	movdqa		x2, t1; \
	punpckhdq	x3, x2; \
	punpckldq	x3, t1; \
	movdqa		x0, x1; \
	punpcklqdq	t1, x0; \
	punpckhqdq	t1, x1; \
	movdqa		t2, x3; \
	punpcklqdq	x2, t2; \
	punpckhqdq	x2, x3; \
	movdqa		t2, x2;
589
/*
 * read_blocks(in, x0, x1, x2, x3, t0, t1, t2): load four consecutive
 * 16-byte blocks from (in) with unaligned loads and transpose them, so
 * that x0 ends up with dword 0 of every block, x1 with dword 1, and so on.
 * Clobbers t1 and t2 (through transpose_4x4).
 */
#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
	movdqu	(0*4*4)(in), x0; \
	movdqu	(1*4*4)(in), x1; \
	movdqu	(2*4*4)(in), x2; \
	movdqu	(3*4*4)(in), x3; \
	\
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
597
/*
 * write_blocks(out, x0, x1, x2, x3, t0, t1, t2): transpose x0..x3 back to
 * one block per register and store the four 16-byte blocks to (out) with
 * unaligned stores.  Clobbers t1 and t2 (through transpose_4x4).
 */
#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	movdqu	x0, (0*4*4)(out); \
	movdqu	x1, (1*4*4)(out); \
	movdqu	x2, (2*4*4)(out); \
	movdqu	x3, (3*4*4)(out);
605
/*
 * xor_blocks(out, x0, x1, x2, x3, t0, t1, t2): like write_blocks, but each
 * 16-byte block is XORed with the data already stored at (out) before it
 * is written back.  t0 carries the loaded destination data; t1 and t2 are
 * clobbered by transpose_4x4.
 */
#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	movdqu	(0*4*4)(out), t0; \
	pxor	t0, x0; \
	movdqu	x0, (0*4*4)(out); \
	\
	movdqu	(1*4*4)(out), t0; \
	pxor	t0, x1; \
	movdqu	x1, (1*4*4)(out); \
	\
	movdqu	(2*4*4)(out), t0; \
	pxor	t0, x2; \
	movdqu	x2, (2*4*4)(out); \
	\
	movdqu	(3*4*4)(out), t0; \
	pxor	t0, x3; \
	movdqu	x3, (3*4*4)(out);
621
SYM_FUNC_START(__serpent_enc_blk_8way)
	/*
	 * Encrypt eight 16-byte blocks (2 x 4, one read_blocks per group).
	 *
	 * input:
	 *	%rdi: CTX, base address of the round-key words read by get_key
	 *	      (presumably the expanded key as an array of 32-bit words)
	 *	%rsi: destination, 128 bytes
	 *	%rdx: source, 128 bytes
	 *	%cl:  zero     -> store the result to the destination
	 *	      non-zero -> XOR the result into the destination
	 *
	 * clobbers: %rax, %xmm0-%xmm14, flags
	 */

	/* RNOT = all ones; the S-box macros use it for bitwise NOT */
	pcmpeqd RNOT, RNOT;

	/* group 1 <- blocks 0..3, group 2 <- blocks 4..7 */
	leaq (4*4*4)(%rdx), %rax;
	read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	/*
	 * Initial key mixing, then 32 S-box applications.  Each S-box except
	 * the last is followed by LK2 (linear transformation plus the next
	 * round key); the last one is followed by a plain key mixing.  The
	 * register arguments rotate because every S-box leaves its result in
	 * a permuted set of registers.
	 */
						 K2(RA, RB, RC, RD, RE, 0);
	S(S0, RA, RB, RC, RD, RE);		LK2(RC, RB, RD, RA, RE, 1);
	S(S1, RC, RB, RD, RA, RE);		LK2(RE, RD, RA, RC, RB, 2);
	S(S2, RE, RD, RA, RC, RB);		LK2(RB, RD, RE, RC, RA, 3);
	S(S3, RB, RD, RE, RC, RA);		LK2(RC, RA, RD, RB, RE, 4);
	S(S4, RC, RA, RD, RB, RE);		LK2(RA, RD, RB, RE, RC, 5);
	S(S5, RA, RD, RB, RE, RC);		LK2(RC, RA, RD, RE, RB, 6);
	S(S6, RC, RA, RD, RE, RB);		LK2(RD, RB, RA, RE, RC, 7);
	S(S7, RD, RB, RA, RE, RC);		LK2(RC, RA, RE, RD, RB, 8);
	S(S0, RC, RA, RE, RD, RB);		LK2(RE, RA, RD, RC, RB, 9);
	S(S1, RE, RA, RD, RC, RB);		LK2(RB, RD, RC, RE, RA, 10);
	S(S2, RB, RD, RC, RE, RA);		LK2(RA, RD, RB, RE, RC, 11);
	S(S3, RA, RD, RB, RE, RC);		LK2(RE, RC, RD, RA, RB, 12);
	S(S4, RE, RC, RD, RA, RB);		LK2(RC, RD, RA, RB, RE, 13);
	S(S5, RC, RD, RA, RB, RE);		LK2(RE, RC, RD, RB, RA, 14);
	S(S6, RE, RC, RD, RB, RA);		LK2(RD, RA, RC, RB, RE, 15);
	S(S7, RD, RA, RC, RB, RE);		LK2(RE, RC, RB, RD, RA, 16);
	S(S0, RE, RC, RB, RD, RA);		LK2(RB, RC, RD, RE, RA, 17);
	S(S1, RB, RC, RD, RE, RA);		LK2(RA, RD, RE, RB, RC, 18);
	S(S2, RA, RD, RE, RB, RC);		LK2(RC, RD, RA, RB, RE, 19);
	S(S3, RC, RD, RA, RB, RE);		LK2(RB, RE, RD, RC, RA, 20);
	S(S4, RB, RE, RD, RC, RA);		LK2(RE, RD, RC, RA, RB, 21);
	S(S5, RE, RD, RC, RA, RB);		LK2(RB, RE, RD, RA, RC, 22);
	S(S6, RB, RE, RD, RA, RC);		LK2(RD, RC, RE, RA, RB, 23);
	S(S7, RD, RC, RE, RA, RB);		LK2(RB, RE, RA, RD, RC, 24);
	S(S0, RB, RE, RA, RD, RC);		LK2(RA, RE, RD, RB, RC, 25);
	S(S1, RA, RE, RD, RB, RC);		LK2(RC, RD, RB, RA, RE, 26);
	S(S2, RC, RD, RB, RA, RE);		LK2(RE, RD, RC, RA, RB, 27);
	S(S3, RE, RD, RC, RA, RB);		LK2(RA, RB, RD, RE, RC, 28);
	S(S4, RA, RB, RD, RE, RC);		LK2(RB, RD, RE, RC, RA, 29);
	S(S5, RB, RD, RE, RC, RA);		LK2(RA, RB, RD, RC, RE, 30);
	S(S6, RA, RB, RD, RC, RE);		LK2(RD, RE, RB, RC, RA, 31);
	S(S7, RD, RE, RB, RC, RA);		 K2(RA, RB, RC, RD, RE, 32);

	/* %rax = destination of blocks 4..7 */
	leaq (4*4*4)(%rsi), %rax;

	testb %cl, %cl;
	jnz .L__enc_xor8;

	/* plain store */
	write_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	write_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	ret;

.L__enc_xor8:
	/* XOR the result into what the destination already holds */
	xor_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	xor_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	ret;
SYM_FUNC_END(__serpent_enc_blk_8way)
686
SYM_FUNC_START(serpent_dec_blk_8way)
	/*
	 * Decrypt eight 16-byte blocks (2 x 4, one read_blocks per group).
	 *
	 * input:
	 *	%rdi: CTX, base address of the round-key words read by get_key
	 *	      (presumably the expanded key as an array of 32-bit words)
	 *	%rsi: destination, 128 bytes
	 *	%rdx: source, 128 bytes
	 *
	 * clobbers: %rax, %xmm0-%xmm14
	 */

	/* RNOT = all ones; the S-box macros use it for bitwise NOT */
	pcmpeqd RNOT, RNOT;

	/* group 1 <- blocks 0..3, group 2 <- blocks 4..7 */
	leaq (4*4*4)(%rdx), %rax;
	read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	/*
	 * Key mixing with round key 32, then the inverse S-boxes from SI7
	 * down to SI0, four passes.  SP() preloads the round key that the
	 * KL2() next to it mixes in before undoing the linear transformation;
	 * the final SI0 uses plain S() and is followed by the round key 0
	 * mixing.
	 */
						 K2(RA, RB, RC, RD, RE, 32);
	SP(SI7, RA, RB, RC, RD, RE, 31);	KL2(RB, RD, RA, RE, RC, 31);
	SP(SI6, RB, RD, RA, RE, RC, 30);	KL2(RA, RC, RE, RB, RD, 30);
	SP(SI5, RA, RC, RE, RB, RD, 29);	KL2(RC, RD, RA, RE, RB, 29);
	SP(SI4, RC, RD, RA, RE, RB, 28);	KL2(RC, RA, RB, RE, RD, 28);
	SP(SI3, RC, RA, RB, RE, RD, 27);	KL2(RB, RC, RD, RE, RA, 27);
	SP(SI2, RB, RC, RD, RE, RA, 26);	KL2(RC, RA, RE, RD, RB, 26);
	SP(SI1, RC, RA, RE, RD, RB, 25);	KL2(RB, RA, RE, RD, RC, 25);
	SP(SI0, RB, RA, RE, RD, RC, 24);	KL2(RE, RC, RA, RB, RD, 24);
	SP(SI7, RE, RC, RA, RB, RD, 23);	KL2(RC, RB, RE, RD, RA, 23);
	SP(SI6, RC, RB, RE, RD, RA, 22);	KL2(RE, RA, RD, RC, RB, 22);
	SP(SI5, RE, RA, RD, RC, RB, 21);	KL2(RA, RB, RE, RD, RC, 21);
	SP(SI4, RA, RB, RE, RD, RC, 20);	KL2(RA, RE, RC, RD, RB, 20);
	SP(SI3, RA, RE, RC, RD, RB, 19);	KL2(RC, RA, RB, RD, RE, 19);
	SP(SI2, RC, RA, RB, RD, RE, 18);	KL2(RA, RE, RD, RB, RC, 18);
	SP(SI1, RA, RE, RD, RB, RC, 17);	KL2(RC, RE, RD, RB, RA, 17);
	SP(SI0, RC, RE, RD, RB, RA, 16);	KL2(RD, RA, RE, RC, RB, 16);
	SP(SI7, RD, RA, RE, RC, RB, 15);	KL2(RA, RC, RD, RB, RE, 15);
	SP(SI6, RA, RC, RD, RB, RE, 14);	KL2(RD, RE, RB, RA, RC, 14);
	SP(SI5, RD, RE, RB, RA, RC, 13);	KL2(RE, RC, RD, RB, RA, 13);
	SP(SI4, RE, RC, RD, RB, RA, 12);	KL2(RE, RD, RA, RB, RC, 12);
	SP(SI3, RE, RD, RA, RB, RC, 11);	KL2(RA, RE, RC, RB, RD, 11);
	SP(SI2, RA, RE, RC, RB, RD, 10);	KL2(RE, RD, RB, RC, RA, 10);
	SP(SI1, RE, RD, RB, RC, RA, 9);		KL2(RA, RD, RB, RC, RE, 9);
	SP(SI0, RA, RD, RB, RC, RE, 8);		KL2(RB, RE, RD, RA, RC, 8);
	SP(SI7, RB, RE, RD, RA, RC, 7);		KL2(RE, RA, RB, RC, RD, 7);
	SP(SI6, RE, RA, RB, RC, RD, 6);		KL2(RB, RD, RC, RE, RA, 6);
	SP(SI5, RB, RD, RC, RE, RA, 5);		KL2(RD, RA, RB, RC, RE, 5);
	SP(SI4, RD, RA, RB, RC, RE, 4);		KL2(RD, RB, RE, RC, RA, 4);
	SP(SI3, RD, RB, RE, RC, RA, 3);		KL2(RE, RD, RA, RC, RB, 3);
	SP(SI2, RE, RD, RA, RC, RB, 2);		KL2(RD, RB, RC, RA, RE, 2);
	SP(SI1, RD, RB, RC, RA, RE, 1);		KL2(RE, RB, RC, RA, RD, 1);
	S(SI0, RE, RB, RC, RA, RD);		 K2(RC, RD, RB, RE, RA, 0);

	/* the result ends up in RC, RD, RB, RE (in that word order) */
	leaq (4*4*4)(%rsi), %rax;
	write_blocks(%rsi, RC1, RD1, RB1, RE1, RK0, RK1, RK2);
	write_blocks(%rax, RC2, RD2, RB2, RE2, RK0, RK1, RK2);

	ret;
SYM_FUNC_END(serpent_dec_blk_8way)
740