/*
 * Serpent Cipher 8-way parallel algorithm (x86_64/AVX)
 */
#include <linux/linkage.h>
#include <asm/frame.h>
#include "glue_helper-asm-avx.S"

.file "serpent-avx-x86_64-asm_64.S"
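
/*
 * Byte-order reversal mask for a 128-bit lane.  load_ctr_8way below uses
 * it to byte-swap the big-endian CTR counter block to little endian for
 * the counter arithmetic and back again.
 */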
.section	.rodata.cst16.bswap128_mask, "aM", @progbits, 16
.align 16
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
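
/*
 * Constants for deriving successive XTS tweaks: multiplication by x in
 * GF(2^128) is a one-bit left shift, folding a carry out of the top bit
 * back in with the reduction constant 0x87 (used by load_xts_8way from
 * glue_helper-asm-avx.S).
 */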
.section	.rodata.cst16.xts_gf128mul_and_shl1_mask, "aM", @progbits, 16
.align 16
.Lxts_gf128mul_and_shl1_mask:
	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0

.text

#define CTX %rdi

/**********************************************************************
  8-way AVX serpent
 **********************************************************************/
#define RA1 %xmm0
#define RB1 %xmm1
#define RC1 %xmm2
#define RD1 %xmm3
#define RE1 %xmm4

#define tp  %xmm5

#define RA2 %xmm6
#define RB2 %xmm7
#define RC2 %xmm8
#define RD2 %xmm9
#define RE2 %xmm10

#define RNOT %xmm11

#define RK0 %xmm12
#define RK1 %xmm13
#define RK2 %xmm14
#define RK3 %xmm15
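
/*
 * Bitsliced Serpent S-boxes S0..S7 (inverses SI0..SI7 follow further
 * below), implemented as branchless sequences of AVX logic instructions
 * over four 32-bit slices per register.  Each S-box is split into an _1
 * and an _2 half so that the S/SP macros below can interleave the two
 * block sets and the round-key loads with the S-box computation.
 */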
#define S0_1(x0, x1, x2, x3, x4) \
	vpor x0, x3, tp; \
	vpxor x3, x0, x0; \
	vpxor x2, x3, x4; \
	vpxor RNOT, x4, x4; \
	vpxor x1, tp, x3; \
	vpand x0, x1, x1; \
	vpxor x4, x1, x1; \
	vpxor x0, x2, x2;
#define S0_2(x0, x1, x2, x3, x4) \
	vpxor x3, x0, x0; \
	vpor x0, x4, x4; \
	vpxor x2, x0, x0; \
	vpand x1, x2, x2; \
	vpxor x2, x3, x3; \
	vpxor RNOT, x1, x1; \
	vpxor x4, x2, x2; \
	vpxor x2, x1, x1;

#define S1_1(x0, x1, x2, x3, x4) \
	vpxor x0, x1, tp; \
	vpxor x3, x0, x0; \
	vpxor RNOT, x3, x3; \
	vpand tp, x1, x4; \
	vpor tp, x0, x0; \
	vpxor x2, x3, x3; \
	vpxor x3, x0, x0; \
	vpxor x3, tp, x1;
#define S1_2(x0, x1, x2, x3, x4) \
	vpxor x4, x3, x3; \
	vpor x4, x1, x1; \
	vpxor x2, x4, x4; \
	vpand x0, x2, x2; \
	vpxor x1, x2, x2; \
	vpor x0, x1, x1; \
	vpxor RNOT, x0, x0; \
	vpxor x2, x0, x0; \
	vpxor x1, x4, x4;

#define S2_1(x0, x1, x2, x3, x4) \
	vpxor RNOT, x3, x3; \
	vpxor x0, x1, x1; \
	vpand x2, x0, tp; \
	vpxor x3, tp, tp; \
	vpor x0, x3, x3; \
	vpxor x1, x2, x2; \
	vpxor x1, x3, x3; \
	vpand tp, x1, x1;
#define S2_2(x0, x1, x2, x3, x4) \
	vpxor x2, tp, tp; \
	vpand x3, x2, x2; \
	vpor x1, x3, x3; \
	vpxor RNOT, tp, tp; \
	vpxor tp, x3, x3; \
	vpxor tp, x0, x4; \
	vpxor x2, tp, x0; \
	vpor x2, x1, x1;

#define S3_1(x0, x1, x2, x3, x4) \
	vpxor x3, x1, tp; \
	vpor x0, x3, x3; \
	vpand x0, x1, x4; \
	vpxor x2, x0, x0; \
	vpxor tp, x2, x2; \
	vpand x3, tp, x1; \
	vpxor x3, x2, x2; \
	vpor x4, x0, x0; \
	vpxor x3, x4, x4;
#define S3_2(x0, x1, x2, x3, x4) \
	vpxor x0, x1, x1; \
	vpand x3, x0, x0; \
	vpand x4, x3, x3; \
	vpxor x2, x3, x3; \
	vpor x1, x4, x4; \
	vpand x1, x2, x2; \
	vpxor x3, x4, x4; \
	vpxor x3, x0, x0; \
	vpxor x2, x3, x3;

#define S4_1(x0, x1, x2, x3, x4) \
	vpand x0, x3, tp; \
	vpxor x3, x0, x0; \
	vpxor x2, tp, tp; \
	vpor x3, x2, x2; \
	vpxor x1, x0, x0; \
	vpxor tp, x3, x4; \
	vpor x0, x2, x2; \
	vpxor x1, x2, x2;
#define S4_2(x0, x1, x2, x3, x4) \
	vpand x0, x1, x1; \
	vpxor x4, x1, x1; \
	vpand x2, x4, x4; \
	vpxor tp, x2, x2; \
	vpxor x0, x4, x4; \
	vpor x1, tp, x3; \
	vpxor RNOT, x1, x1; \
	vpxor x0, x3, x3;

#define S5_1(x0, x1, x2, x3, x4) \
	vpor x0, x1, tp; \
	vpxor tp, x2, x2; \
	vpxor RNOT, x3, x3; \
	vpxor x0, x1, x4; \
	vpxor x2, x0, x0; \
	vpand x4, tp, x1; \
	vpor x3, x4, x4; \
	vpxor x0, x4, x4;
#define S5_2(x0, x1, x2, x3, x4) \
	vpand x3, x0, x0; \
	vpxor x3, x1, x1; \
	vpxor x2, x3, x3; \
	vpxor x1, x0, x0; \
	vpand x4, x2, x2; \
	vpxor x2, x1, x1; \
	vpand x0, x2, x2; \
	vpxor x2, x3, x3;

#define S6_1(x0, x1, x2, x3, x4) \
	vpxor x0, x3, x3; \
	vpxor x2, x1, tp; \
	vpxor x0, x2, x2; \
	vpand x3, x0, x0; \
	vpor x3, tp, tp; \
	vpxor RNOT, x1, x4; \
	vpxor tp, x0, x0; \
	vpxor x2, tp, x1;
#define S6_2(x0, x1, x2, x3, x4) \
	vpxor x4, x3, x3; \
	vpxor x0, x4, x4; \
	vpand x0, x2, x2; \
	vpxor x1, x4, x4; \
	vpxor x3, x2, x2; \
	vpand x1, x3, x3; \
	vpxor x0, x3, x3; \
	vpxor x2, x1, x1;

#define S7_1(x0, x1, x2, x3, x4) \
	vpxor RNOT, x1, tp; \
	vpxor RNOT, x0, x0; \
	vpand x2, tp, x1; \
	vpxor x3, x1, x1; \
	vpor tp, x3, x3; \
	vpxor x2, tp, x4; \
	vpxor x3, x2, x2; \
	vpxor x0, x3, x3; \
	vpor x1, x0, x0;
#define S7_2(x0, x1, x2, x3, x4) \
	vpand x0, x2, x2; \
	vpxor x4, x0, x0; \
	vpxor x3, x4, x4; \
	vpand x0, x3, x3; \
	vpxor x1, x4, x4; \
	vpxor x4, x2, x2; \
	vpxor x1, x3, x3; \
	vpor x0, x4, x4; \
	vpxor x1, x4, x4;
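
/* Inverse S-boxes SI0..SI7, used by the decryption path below. */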
#define SI0_1(x0, x1, x2, x3, x4) \
	vpxor x0, x1, x1; \
	vpor x1, x3, tp; \
	vpxor x1, x3, x4; \
	vpxor RNOT, x0, x0; \
	vpxor tp, x2, x2; \
	vpxor x0, tp, x3; \
	vpand x1, x0, x0; \
	vpxor x2, x0, x0;
#define SI0_2(x0, x1, x2, x3, x4) \
	vpand x3, x2, x2; \
	vpxor x4, x3, x3; \
	vpxor x3, x2, x2; \
	vpxor x3, x1, x1; \
	vpand x0, x3, x3; \
	vpxor x0, x1, x1; \
	vpxor x2, x0, x0; \
	vpxor x3, x4, x4;

#define SI1_1(x0, x1, x2, x3, x4) \
	vpxor x3, x1, x1; \
	vpxor x2, x0, tp; \
	vpxor RNOT, x2, x2; \
	vpor x1, x0, x4; \
	vpxor x3, x4, x4; \
	vpand x1, x3, x3; \
	vpxor x2, x1, x1; \
	vpand x4, x2, x2;
#define SI1_2(x0, x1, x2, x3, x4) \
	vpxor x1, x4, x4; \
	vpor x3, x1, x1; \
	vpxor tp, x3, x3; \
	vpxor tp, x2, x2; \
	vpor x4, tp, x0; \
	vpxor x4, x2, x2; \
	vpxor x0, x1, x1; \
	vpxor x1, x4, x4;

#define SI2_1(x0, x1, x2, x3, x4) \
	vpxor x1, x2, x2; \
	vpxor RNOT, x3, tp; \
	vpor x2, tp, tp; \
	vpxor x3, x2, x2; \
	vpxor x0, x3, x4; \
	vpxor x1, tp, x3; \
	vpor x2, x1, x1; \
	vpxor x0, x2, x2;
#define SI2_2(x0, x1, x2, x3, x4) \
	vpxor x4, x1, x1; \
	vpor x3, x4, x4; \
	vpxor x3, x2, x2; \
	vpxor x2, x4, x4; \
	vpand x1, x2, x2; \
	vpxor x3, x2, x2; \
	vpxor x4, x3, x3; \
	vpxor x0, x4, x4;

#define SI3_1(x0, x1, x2, x3, x4) \
	vpxor x1, x2, x2; \
	vpand x2, x1, tp; \
	vpxor x0, tp, tp; \
	vpor x1, x0, x0; \
	vpxor x3, x1, x4; \
	vpxor x3, x0, x0; \
	vpor tp, x3, x3; \
	vpxor x2, tp, x1;
#define SI3_2(x0, x1, x2, x3, x4) \
	vpxor x3, x1, x1; \
	vpxor x2, x0, x0; \
	vpxor x3, x2, x2; \
	vpand x1, x3, x3; \
	vpxor x0, x1, x1; \
	vpand x2, x0, x0; \
	vpxor x3, x4, x4; \
	vpxor x0, x3, x3; \
	vpxor x1, x0, x0;

#define SI4_1(x0, x1, x2, x3, x4) \
	vpxor x3, x2, x2; \
	vpand x1, x0, tp; \
	vpxor x2, tp, tp; \
	vpor x3, x2, x2; \
	vpxor RNOT, x0, x4; \
	vpxor tp, x1, x1; \
	vpxor x2, tp, x0; \
	vpand x4, x2, x2;
#define SI4_2(x0, x1, x2, x3, x4) \
	vpxor x0, x2, x2; \
	vpor x4, x0, x0; \
	vpxor x3, x0, x0; \
	vpand x2, x3, x3; \
	vpxor x3, x4, x4; \
	vpxor x1, x3, x3; \
	vpand x0, x1, x1; \
	vpxor x1, x4, x4; \
	vpxor x3, x0, x0;

#define SI5_1(x0, x1, x2, x3, x4) \
	vpor x2, x1, tp; \
	vpxor x1, x2, x2; \
	vpxor x3, tp, tp; \
	vpand x1, x3, x3; \
	vpxor x3, x2, x2; \
	vpor x0, x3, x3; \
	vpxor RNOT, x0, x0; \
	vpxor x2, x3, x3; \
	vpor x0, x2, x2;
#define SI5_2(x0, x1, x2, x3, x4) \
	vpxor tp, x1, x4; \
	vpxor x4, x2, x2; \
	vpand x0, x4, x4; \
	vpxor tp, x0, x0; \
	vpxor x3, tp, x1; \
	vpand x2, x0, x0; \
	vpxor x3, x2, x2; \
	vpxor x2, x0, x0; \
	vpxor x4, x2, x2; \
	vpxor x3, x4, x4;

#define SI6_1(x0, x1, x2, x3, x4) \
	vpxor x2, x0, x0; \
	vpand x3, x0, tp; \
	vpxor x3, x2, x2; \
	vpxor x2, tp, tp; \
	vpxor x1, x3, x3; \
	vpor x0, x2, x2; \
	vpxor x3, x2, x2; \
	vpand tp, x3, x3;
#define SI6_2(x0, x1, x2, x3, x4) \
	vpxor RNOT, tp, tp; \
	vpxor x1, x3, x3; \
	vpand x2, x1, x1; \
	vpxor tp, x0, x4; \
	vpxor x4, x3, x3; \
	vpxor x2, x4, x4; \
	vpxor x1, tp, x0; \
	vpxor x0, x2, x2;

#define SI7_1(x0, x1, x2, x3, x4) \
	vpand x0, x3, tp; \
	vpxor x2, x0, x0; \
	vpor x3, x2, x2; \
	vpxor x1, x3, x4; \
	vpxor RNOT, x0, x0; \
	vpor tp, x1, x1; \
	vpxor x0, x4, x4; \
	vpand x2, x0, x0; \
	vpxor x1, x0, x0;
#define SI7_2(x0, x1, x2, x3, x4) \
	vpand x2, x1, x1; \
	vpxor x2, tp, x3; \
	vpxor x3, x4, x4; \
	vpand x3, x2, x2; \
	vpor x0, x3, x3; \
	vpxor x4, x1, x1; \
	vpxor x4, x3, x3; \
	vpand x0, x4, x4; \
	vpxor x2, x4, x4;
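
/*
 * get_key broadcasts 32-bit subkey word j of round i, stored at byte
 * offset (4*i + j)*4 in the expanded key that CTX (%rdi) points to, into
 * all four dword lanes of t.  K2 XORs round key i into both
 * four-register block sets (the ## 1 / ## 2 register suffixes).
 */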
#define get_key(i, j, t) \
	vbroadcastss (4*(i)+(j))*4(CTX), t;

#define K2(x0, x1, x2, x3, x4, i) \
	get_key(i, 0, RK0); \
	get_key(i, 1, RK1); \
	get_key(i, 2, RK2); \
	get_key(i, 3, RK3); \
	vpxor RK0,	x0 ## 1, x0 ## 1; \
	vpxor RK1,	x1 ## 1, x1 ## 1; \
	vpxor RK2,	x2 ## 1, x2 ## 1; \
	vpxor RK3,	x3 ## 1, x3 ## 1; \
	vpxor RK0,	x0 ## 2, x0 ## 2; \
	vpxor RK1,	x1 ## 2, x1 ## 2; \
	vpxor RK2,	x2 ## 2, x2 ## 2; \
	vpxor RK3,	x3 ## 2, x3 ## 2;
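
/*
 * LK2: apply Serpent's linear transformation, then XOR in round key i,
 * for both block sets; the key loads are interleaved with the rotates to
 * hide their latency.
 */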

#define LK2(x0, x1, x2, x3, x4, i) \
	vpslld $13,		x0 ## 1, x4 ## 1;          \
	vpsrld $(32 - 13),	x0 ## 1, x0 ## 1;          \
	vpor			x4 ## 1, x0 ## 1, x0 ## 1; \
	vpxor			x0 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $3,		x2 ## 1, x4 ## 1;          \
	vpsrld $(32 - 3),	x2 ## 1, x2 ## 1;          \
	vpor			x4 ## 1, x2 ## 1, x2 ## 1; \
	vpxor			x2 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $13,		x0 ## 2, x4 ## 2;          \
	vpsrld $(32 - 13),	x0 ## 2, x0 ## 2;          \
	vpor			x4 ## 2, x0 ## 2, x0 ## 2; \
	vpxor			x0 ## 2, x1 ## 2, x1 ## 2; \
	vpslld $3,		x2 ## 2, x4 ## 2;          \
	vpsrld $(32 - 3),	x2 ## 2, x2 ## 2;          \
	vpor			x4 ## 2, x2 ## 2, x2 ## 2; \
	vpxor			x2 ## 2, x1 ## 2, x1 ## 2; \
	vpslld $1,		x1 ## 1, x4 ## 1;          \
	vpsrld $(32 - 1),	x1 ## 1, x1 ## 1;          \
	vpor			x4 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $3,		x0 ## 1, x4 ## 1;          \
	vpxor			x2 ## 1, x3 ## 1, x3 ## 1; \
	vpxor			x4 ## 1, x3 ## 1, x3 ## 1; \
	get_key(i, 1, RK1); \
	vpslld $1,		x1 ## 2, x4 ## 2;          \
	vpsrld $(32 - 1),	x1 ## 2, x1 ## 2;          \
	vpor			x4 ## 2, x1 ## 2, x1 ## 2; \
	vpslld $3,		x0 ## 2, x4 ## 2;          \
	vpxor			x2 ## 2, x3 ## 2, x3 ## 2; \
	vpxor			x4 ## 2, x3 ## 2, x3 ## 2; \
	get_key(i, 3, RK3); \
	vpslld $7,		x3 ## 1, x4 ## 1;          \
	vpsrld $(32 - 7),	x3 ## 1, x3 ## 1;          \
	vpor			x4 ## 1, x3 ## 1, x3 ## 1; \
	vpslld $7,		x1 ## 1, x4 ## 1;          \
	vpxor			x1 ## 1, x0 ## 1, x0 ## 1; \
	vpxor			x3 ## 1, x0 ## 1, x0 ## 1; \
	vpxor			x3 ## 1, x2 ## 1, x2 ## 1; \
	vpxor			x4 ## 1, x2 ## 1, x2 ## 1; \
	get_key(i, 0, RK0); \
	vpslld $7,		x3 ## 2, x4 ## 2;          \
	vpsrld $(32 - 7),	x3 ## 2, x3 ## 2;          \
	vpor			x4 ## 2, x3 ## 2, x3 ## 2; \
	vpslld $7,		x1 ## 2, x4 ## 2;          \
	vpxor			x1 ## 2, x0 ## 2, x0 ## 2; \
	vpxor			x3 ## 2, x0 ## 2, x0 ## 2; \
	vpxor			x3 ## 2, x2 ## 2, x2 ## 2; \
	vpxor			x4 ## 2, x2 ## 2, x2 ## 2; \
	get_key(i, 2, RK2); \
	vpxor			RK1, x1 ## 1, x1 ## 1;     \
	vpxor			RK3, x3 ## 1, x3 ## 1;     \
	vpslld $5,		x0 ## 1, x4 ## 1;          \
	vpsrld $(32 - 5),	x0 ## 1, x0 ## 1;          \
	vpor			x4 ## 1, x0 ## 1, x0 ## 1; \
	vpslld $22,		x2 ## 1, x4 ## 1;          \
	vpsrld $(32 - 22),	x2 ## 1, x2 ## 1;          \
	vpor			x4 ## 1, x2 ## 1, x2 ## 1; \
	vpxor			RK0, x0 ## 1, x0 ## 1;     \
	vpxor			RK2, x2 ## 1, x2 ## 1;     \
	vpxor			RK1, x1 ## 2, x1 ## 2;     \
	vpxor			RK3, x3 ## 2, x3 ## 2;     \
	vpslld $5,		x0 ## 2, x4 ## 2;          \
	vpsrld $(32 - 5),	x0 ## 2, x0 ## 2;          \
	vpor			x4 ## 2, x0 ## 2, x0 ## 2; \
	vpslld $22,		x2 ## 2, x4 ## 2;          \
	vpsrld $(32 - 22),	x2 ## 2, x2 ## 2;          \
	vpor			x4 ## 2, x2 ## 2, x2 ## 2; \
	vpxor			RK0, x0 ## 2, x0 ## 2;     \
	vpxor			RK2, x2 ## 2, x2 ## 2;
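
/*
 * KL2: XOR out round key i, then apply the inverse linear
 * transformation; the decryption-side counterpart of LK2.
 */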

#define KL2(x0, x1, x2, x3, x4, i) \
	vpxor			RK0, x0 ## 1, x0 ## 1;     \
	vpxor			RK2, x2 ## 1, x2 ## 1;     \
	vpsrld $5,		x0 ## 1, x4 ## 1;          \
	vpslld $(32 - 5),	x0 ## 1, x0 ## 1;          \
	vpor			x4 ## 1, x0 ## 1, x0 ## 1; \
	vpxor			RK3, x3 ## 1, x3 ## 1;     \
	vpxor			RK1, x1 ## 1, x1 ## 1;     \
	vpsrld $22,		x2 ## 1, x4 ## 1;          \
	vpslld $(32 - 22),	x2 ## 1, x2 ## 1;          \
	vpor			x4 ## 1, x2 ## 1, x2 ## 1; \
	vpxor			x3 ## 1, x2 ## 1, x2 ## 1; \
	vpxor			RK0, x0 ## 2, x0 ## 2;     \
	vpxor			RK2, x2 ## 2, x2 ## 2;     \
	vpsrld $5,		x0 ## 2, x4 ## 2;          \
	vpslld $(32 - 5),	x0 ## 2, x0 ## 2;          \
	vpor			x4 ## 2, x0 ## 2, x0 ## 2; \
	vpxor			RK3, x3 ## 2, x3 ## 2;     \
	vpxor			RK1, x1 ## 2, x1 ## 2;     \
	vpsrld $22,		x2 ## 2, x4 ## 2;          \
	vpslld $(32 - 22),	x2 ## 2, x2 ## 2;          \
	vpor			x4 ## 2, x2 ## 2, x2 ## 2; \
	vpxor			x3 ## 2, x2 ## 2, x2 ## 2; \
	vpxor			x3 ## 1, x0 ## 1, x0 ## 1; \
	vpslld $7,		x1 ## 1, x4 ## 1;          \
	vpxor			x1 ## 1, x0 ## 1, x0 ## 1; \
	vpxor			x4 ## 1, x2 ## 1, x2 ## 1; \
	vpsrld $1,		x1 ## 1, x4 ## 1;          \
	vpslld $(32 - 1),	x1 ## 1, x1 ## 1;          \
	vpor			x4 ## 1, x1 ## 1, x1 ## 1; \
	vpxor			x3 ## 2, x0 ## 2, x0 ## 2; \
	vpslld $7,		x1 ## 2, x4 ## 2;          \
	vpxor			x1 ## 2, x0 ## 2, x0 ## 2; \
	vpxor			x4 ## 2, x2 ## 2, x2 ## 2; \
	vpsrld $1,		x1 ## 2, x4 ## 2;          \
	vpslld $(32 - 1),	x1 ## 2, x1 ## 2;          \
	vpor			x4 ## 2, x1 ## 2, x1 ## 2; \
	vpsrld $7,		x3 ## 1, x4 ## 1;          \
	vpslld $(32 - 7),	x3 ## 1, x3 ## 1;          \
	vpor			x4 ## 1, x3 ## 1, x3 ## 1; \
	vpxor			x0 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $3,		x0 ## 1, x4 ## 1;          \
	vpxor			x4 ## 1, x3 ## 1, x3 ## 1; \
	vpsrld $7,		x3 ## 2, x4 ## 2;          \
	vpslld $(32 - 7),	x3 ## 2, x3 ## 2;          \
	vpor			x4 ## 2, x3 ## 2, x3 ## 2; \
	vpxor			x0 ## 2, x1 ## 2, x1 ## 2; \
	vpslld $3,		x0 ## 2, x4 ## 2;          \
	vpxor			x4 ## 2, x3 ## 2, x3 ## 2; \
	vpsrld $13,		x0 ## 1, x4 ## 1;          \
	vpslld $(32 - 13),	x0 ## 1, x0 ## 1;          \
	vpor			x4 ## 1, x0 ## 1, x0 ## 1; \
	vpxor			x2 ## 1, x1 ## 1, x1 ## 1; \
	vpxor			x2 ## 1, x3 ## 1, x3 ## 1; \
	vpsrld $3,		x2 ## 1, x4 ## 1;          \
	vpslld $(32 - 3),	x2 ## 1, x2 ## 1;          \
	vpor			x4 ## 1, x2 ## 1, x2 ## 1; \
	vpsrld $13,		x0 ## 2, x4 ## 2;          \
	vpslld $(32 - 13),	x0 ## 2, x0 ## 2;          \
	vpor			x4 ## 2, x0 ## 2, x0 ## 2; \
	vpxor			x2 ## 2, x1 ## 2, x1 ## 2; \
	vpxor			x2 ## 2, x3 ## 2, x3 ## 2; \
	vpsrld $3,		x2 ## 2, x4 ## 2;          \
	vpslld $(32 - 3),	x2 ## 2, x2 ## 2;          \
	vpor			x4 ## 2, x2 ## 2, x2 ## 2;
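
/*
 * S applies one S-box to both block sets; SP does the same while loading
 * round i's subkey words into RK0..RK3 between the S-box halves.
 */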

#define S(SBOX, x0, x1, x2, x3, x4) \
	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);

#define SP(SBOX, x0, x1, x2, x3, x4, i) \
	get_key(i, 0, RK0); \
	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	get_key(i, 2, RK2); \
	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	get_key(i, 3, RK3); \
	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
	get_key(i, 1, RK1); \
	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);
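
/*
 * transpose_4x4 converts four 128-bit blocks between the natural
 * one-block-per-register layout and the bitsliced layout (one 32-bit
 * word of each block per register) that the S-box macros operate on;
 * read_blocks/write_blocks name the two directions of the same
 * transpose.
 */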

#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	vpunpckldq		x1, x0, t0; \
	vpunpckhdq		x1, x0, t2; \
	vpunpckldq		x3, x2, t1; \
	vpunpckhdq		x3, x2, x3; \
	\
	vpunpcklqdq		t1, t0, x0; \
	vpunpckhqdq		t1, t0, x1; \
	vpunpcklqdq		x3, t2, x2; \
	vpunpckhqdq		x3, t2, x3;

#define read_blocks(x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

#define write_blocks(x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

.align 8
__serpent_enc_blk8_avx:
	/* input:
	 *	%rdi: ctx, CTX
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 */
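	/* RNOT = all ones; the S-box macros implement NOT as vpxor with RNOT */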
	vpcmpeqd RNOT, RNOT, RNOT;

	read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	K2(RA, RB, RC, RD, RE, 0);
	S(S0, RA, RB, RC, RD, RE);	LK2(RC, RB, RD, RA, RE, 1);
	S(S1, RC, RB, RD, RA, RE);	LK2(RE, RD, RA, RC, RB, 2);
	S(S2, RE, RD, RA, RC, RB);	LK2(RB, RD, RE, RC, RA, 3);
	S(S3, RB, RD, RE, RC, RA);	LK2(RC, RA, RD, RB, RE, 4);
	S(S4, RC, RA, RD, RB, RE);	LK2(RA, RD, RB, RE, RC, 5);
	S(S5, RA, RD, RB, RE, RC);	LK2(RC, RA, RD, RE, RB, 6);
	S(S6, RC, RA, RD, RE, RB);	LK2(RD, RB, RA, RE, RC, 7);
	S(S7, RD, RB, RA, RE, RC);	LK2(RC, RA, RE, RD, RB, 8);
	S(S0, RC, RA, RE, RD, RB);	LK2(RE, RA, RD, RC, RB, 9);
	S(S1, RE, RA, RD, RC, RB);	LK2(RB, RD, RC, RE, RA, 10);
	S(S2, RB, RD, RC, RE, RA);	LK2(RA, RD, RB, RE, RC, 11);
	S(S3, RA, RD, RB, RE, RC);	LK2(RE, RC, RD, RA, RB, 12);
	S(S4, RE, RC, RD, RA, RB);	LK2(RC, RD, RA, RB, RE, 13);
	S(S5, RC, RD, RA, RB, RE);	LK2(RE, RC, RD, RB, RA, 14);
	S(S6, RE, RC, RD, RB, RA);	LK2(RD, RA, RC, RB, RE, 15);
	S(S7, RD, RA, RC, RB, RE);	LK2(RE, RC, RB, RD, RA, 16);
	S(S0, RE, RC, RB, RD, RA);	LK2(RB, RC, RD, RE, RA, 17);
	S(S1, RB, RC, RD, RE, RA);	LK2(RA, RD, RE, RB, RC, 18);
	S(S2, RA, RD, RE, RB, RC);	LK2(RC, RD, RA, RB, RE, 19);
	S(S3, RC, RD, RA, RB, RE);	LK2(RB, RE, RD, RC, RA, 20);
	S(S4, RB, RE, RD, RC, RA);	LK2(RE, RD, RC, RA, RB, 21);
	S(S5, RE, RD, RC, RA, RB);	LK2(RB, RE, RD, RA, RC, 22);
	S(S6, RB, RE, RD, RA, RC);	LK2(RD, RC, RE, RA, RB, 23);
	S(S7, RD, RC, RE, RA, RB);	LK2(RB, RE, RA, RD, RC, 24);
	S(S0, RB, RE, RA, RD, RC);	LK2(RA, RE, RD, RB, RC, 25);
	S(S1, RA, RE, RD, RB, RC);	LK2(RC, RD, RB, RA, RE, 26);
	S(S2, RC, RD, RB, RA, RE);	LK2(RE, RD, RC, RA, RB, 27);
	S(S3, RE, RD, RC, RA, RB);	LK2(RA, RB, RD, RE, RC, 28);
	S(S4, RA, RB, RD, RE, RC);	LK2(RB, RD, RE, RC, RA, 29);
	S(S5, RB, RD, RE, RC, RA);	LK2(RA, RB, RD, RC, RE, 30);
	S(S6, RA, RB, RD, RC, RE);	LK2(RD, RE, RB, RC, RA, 31);
	S(S7, RD, RE, RB, RC, RA);	K2(RA, RB, RC, RD, RE, 32);

	write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	ret;
ENDPROC(__serpent_enc_blk8_avx)

.align 8
__serpent_dec_blk8_avx:
	/* input:
	 *	%rdi: ctx, CTX
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 * output:
	 *	RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: decrypted blocks
	 */

	vpcmpeqd RNOT, RNOT, RNOT;

	read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	K2(RA, RB, RC, RD, RE, 32);
	SP(SI7, RA, RB, RC, RD, RE, 31);	KL2(RB, RD, RA, RE, RC, 31);
	SP(SI6, RB, RD, RA, RE, RC, 30);	KL2(RA, RC, RE, RB, RD, 30);
	SP(SI5, RA, RC, RE, RB, RD, 29);	KL2(RC, RD, RA, RE, RB, 29);
	SP(SI4, RC, RD, RA, RE, RB, 28);	KL2(RC, RA, RB, RE, RD, 28);
	SP(SI3, RC, RA, RB, RE, RD, 27);	KL2(RB, RC, RD, RE, RA, 27);
	SP(SI2, RB, RC, RD, RE, RA, 26);	KL2(RC, RA, RE, RD, RB, 26);
	SP(SI1, RC, RA, RE, RD, RB, 25);	KL2(RB, RA, RE, RD, RC, 25);
	SP(SI0, RB, RA, RE, RD, RC, 24);	KL2(RE, RC, RA, RB, RD, 24);
	SP(SI7, RE, RC, RA, RB, RD, 23);	KL2(RC, RB, RE, RD, RA, 23);
	SP(SI6, RC, RB, RE, RD, RA, 22);	KL2(RE, RA, RD, RC, RB, 22);
	SP(SI5, RE, RA, RD, RC, RB, 21);	KL2(RA, RB, RE, RD, RC, 21);
	SP(SI4, RA, RB, RE, RD, RC, 20);	KL2(RA, RE, RC, RD, RB, 20);
	SP(SI3, RA, RE, RC, RD, RB, 19);	KL2(RC, RA, RB, RD, RE, 19);
	SP(SI2, RC, RA, RB, RD, RE, 18);	KL2(RA, RE, RD, RB, RC, 18);
	SP(SI1, RA, RE, RD, RB, RC, 17);	KL2(RC, RE, RD, RB, RA, 17);
	SP(SI0, RC, RE, RD, RB, RA, 16);	KL2(RD, RA, RE, RC, RB, 16);
	SP(SI7, RD, RA, RE, RC, RB, 15);	KL2(RA, RC, RD, RB, RE, 15);
	SP(SI6, RA, RC, RD, RB, RE, 14);	KL2(RD, RE, RB, RA, RC, 14);
	SP(SI5, RD, RE, RB, RA, RC, 13);	KL2(RE, RC, RD, RB, RA, 13);
	SP(SI4, RE, RC, RD, RB, RA, 12);	KL2(RE, RD, RA, RB, RC, 12);
	SP(SI3, RE, RD, RA, RB, RC, 11);	KL2(RA, RE, RC, RB, RD, 11);
	SP(SI2, RA, RE, RC, RB, RD, 10);	KL2(RE, RD, RB, RC, RA, 10);
	SP(SI1, RE, RD, RB, RC, RA, 9);		KL2(RA, RD, RB, RC, RE, 9);
	SP(SI0, RA, RD, RB, RC, RE, 8);		KL2(RB, RE, RD, RA, RC, 8);
	SP(SI7, RB, RE, RD, RA, RC, 7);		KL2(RE, RA, RB, RC, RD, 7);
	SP(SI6, RE, RA, RB, RC, RD, 6);		KL2(RB, RD, RC, RE, RA, 6);
	SP(SI5, RB, RD, RC, RE, RA, 5);		KL2(RD, RA, RB, RC, RE, 5);
	SP(SI4, RD, RA, RB, RC, RE, 4);		KL2(RD, RB, RE, RC, RA, 4);
	SP(SI3, RD, RB, RE, RC, RA, 3);		KL2(RE, RD, RA, RC, RB, 3);
	SP(SI2, RE, RD, RA, RC, RB, 2);		KL2(RD, RB, RC, RA, RE, 2);
	SP(SI1, RD, RB, RC, RA, RE, 1);		KL2(RE, RB, RC, RA, RD, 1);
	S(SI0, RE, RB, RC, RA, RD);		K2(RC, RD, RB, RE, RA, 0);

	write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2);
	write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2);

	ret;
ENDPROC(__serpent_dec_blk8_avx)
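
/*
 * C-callable entry points.  Arguments follow the SysV x86_64 calling
 * convention: %rdi = ctx, %rsi = dst, %rdx = src, plus %rcx = iv for the
 * CTR and XTS variants.
 */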

ENTRY(serpent_ecb_enc_8way_avx)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __serpent_enc_blk8_avx;

	store_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	FRAME_END
	ret;
ENDPROC(serpent_ecb_enc_8way_avx)

ENTRY(serpent_ecb_dec_8way_avx)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __serpent_dec_blk8_avx;

	store_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);

	FRAME_END
	ret;
ENDPROC(serpent_ecb_dec_8way_avx)

ENTRY(serpent_cbc_dec_8way_avx)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __serpent_dec_blk8_avx;

	store_cbc_8way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);

	FRAME_END
	ret;
ENDPROC(serpent_cbc_dec_8way_avx)

ENTRY(serpent_ctr_8way_avx)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (little endian, 128bit)
	 */
	FRAME_BEGIN

	load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
		      RD2, RK0, RK1, RK2);

	call __serpent_enc_blk8_avx;

	store_ctr_8way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	FRAME_END
	ret;
ENDPROC(serpent_ctr_8way_avx)

ENTRY(serpent_xts_enc_8way_avx)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */
	FRAME_BEGIN

	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
	load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
		      RK0, RK1, RK2, .Lxts_gf128mul_and_shl1_mask);

	call __serpent_enc_blk8_avx;

	/* dst <= regs xor IVs(in dst) */
	store_xts_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	FRAME_END
	ret;
ENDPROC(serpent_xts_enc_8way_avx)

ENTRY(serpent_xts_dec_8way_avx)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */
	FRAME_BEGIN

	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
	load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
		      RK0, RK1, RK2, .Lxts_gf128mul_and_shl1_mask);

	call __serpent_dec_blk8_avx;

	/* dst <= regs xor IVs(in dst) */
	store_xts_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);

	FRAME_END
	ret;
ENDPROC(serpent_xts_dec_8way_avx)