/*
 * Serpent Cipher 8-way parallel algorithm (x86_64/AVX)
 *
 * Copyright (C) 2012 Johannes Goetzfried
 *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * Copyright (C) 2011-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 * NOTE(review): original header was destroyed by extraction; reconstructed
 * from the upstream kernel file of the same name — confirm against tree.
 */
#include <linux/linkage.h>
#include "glue_helper-asm-avx.S"

.file "serpent-avx-x86_64-asm_64.S"

.data
.align 16

/* Shuffle mask: reverses the byte order of a 128-bit lane (used to
 * convert the little-endian CTR counter to/from big-endian form). */
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
/* Constants for multiplying the XTS tweak by alpha in GF(2^128):
 * 0x87 is the reduction polynomial byte, the '1' selects the shift. */
.Lxts_gf128mul_and_shl1_mask:
	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0

.text

/**********************************************************************
  8-way AVX serpent
 **********************************************************************/
/* Pointer to the expanded Serpent key schedule (first integer arg). */
#define CTX %rdi
/* First 4-block set (bitsliced state words a,b,c,d + temp e). */
#define RA1 %xmm0
#define RB1 %xmm1
#define RC1 %xmm2
#define RD1 %xmm3
#define RE1 %xmm4

/* Shared scratch register used inside the S-box macros. */
#define tp  %xmm5

/* Second 4-block set. */
#define RA2 %xmm6
#define RB2 %xmm7
#define RC2 %xmm8
#define RD2 %xmm9
#define RE2 %xmm10

/* All-ones register (set with vpcmpeqd); xor with it implements NOT. */
#define RNOT %xmm11

/* Broadcast round-key words k[0..3] of the current round. */
#define RK0 %xmm12
#define RK1 %xmm13
#define RK2 %xmm14
#define RK3 %xmm15
/*
 * Bitsliced forward S-boxes S0..S7 of Serpent.  Each S-box is split into
 * two halves (_1/_2) so that SP() can interleave round-key loads between
 * them.  Arguments x0..x4 are xmm registers; tp is shared scratch and
 * RNOT (all ones) turns vpxor into bitwise NOT.
 */
#define S0_1(x0, x1, x2, x3, x4) \
	vpor	x0, x3, tp; \
	vpxor	x3, x0, x0; \
	vpxor	x2, x3, x4; \
	vpxor	RNOT, x4, x4; \
	vpxor	x1, tp, x3; \
	vpand	x0, x1, x1; \
	vpxor	x4, x1, x1; \
	vpxor	x0, x2, x2;
#define S0_2(x0, x1, x2, x3, x4) \
	vpxor	x3, x0, x0; \
	vpor	x0, x4, x4; \
	vpxor	x2, x0, x0; \
	vpand	x1, x2, x2; \
	vpxor	x2, x3, x3; \
	vpxor	RNOT, x1, x1; \
	vpxor	x4, x2, x2; \
	vpxor	x2, x1, x1;

#define S1_1(x0, x1, x2, x3, x4) \
	vpxor	x0, x1, tp; \
	vpxor	x3, x0, x0; \
	vpxor	RNOT, x3, x3; \
	vpand	tp, x1, x4; \
	vpor	tp, x0, x0; \
	vpxor	x2, x3, x3; \
	vpxor	x3, x0, x0; \
	vpxor	x3, tp, x1;
#define S1_2(x0, x1, x2, x3, x4) \
	vpxor	x4, x3, x3; \
	vpor	x4, x1, x1; \
	vpxor	x2, x4, x4; \
	vpand	x0, x2, x2; \
	vpxor	x1, x2, x2; \
	vpor	x0, x1, x1; \
	vpxor	RNOT, x0, x0; \
	vpxor	x2, x0, x0; \
	vpxor	x1, x4, x4;

#define S2_1(x0, x1, x2, x3, x4) \
	vpxor	RNOT, x3, x3; \
	vpxor	x0, x1, x1; \
	vpand	x2, x0, tp; \
	vpxor	x3, tp, tp; \
	vpor	x0, x3, x3; \
	vpxor	x1, x2, x2; \
	vpxor	x1, x3, x3; \
	vpand	tp, x1, x1;
#define S2_2(x0, x1, x2, x3, x4) \
	vpxor	x2, tp, tp; \
	vpand	x3, x2, x2; \
	vpor	x1, x3, x3; \
	vpxor	RNOT, tp, tp; \
	vpxor	tp, x3, x3; \
	vpxor	tp, x0, x4; \
	vpxor	x2, tp, x0; \
	vpor	x2, x1, x1;

#define S3_1(x0, x1, x2, x3, x4) \
	vpxor	x3, x1, tp; \
	vpor	x0, x3, x3; \
	vpand	x0, x1, x4; \
	vpxor	x2, x0, x0; \
	vpxor	tp, x2, x2; \
	vpand	x3, tp, x1; \
	vpxor	x3, x2, x2; \
	vpor	x4, x0, x0; \
	vpxor	x3, x4, x4;
#define S3_2(x0, x1, x2, x3, x4) \
	vpxor	x0, x1, x1; \
	vpand	x3, x0, x0; \
	vpand	x4, x3, x3; \
	vpxor	x2, x3, x3; \
	vpor	x1, x4, x4; \
	vpand	x1, x2, x2; \
	vpxor	x3, x4, x4; \
	vpxor	x3, x0, x0; \
	vpxor	x2, x3, x3;

#define S4_1(x0, x1, x2, x3, x4) \
	vpand	x0, x3, tp; \
	vpxor	x3, x0, x0; \
	vpxor	x2, tp, tp; \
	vpor	x3, x2, x2; \
	vpxor	x1, x0, x0; \
	vpxor	tp, x3, x4; \
	vpor	x0, x2, x2; \
	vpxor	x1, x2, x2;
#define S4_2(x0, x1, x2, x3, x4) \
	vpand	x0, x1, x1; \
	vpxor	x4, x1, x1; \
	vpand	x2, x4, x4; \
	vpxor	tp, x2, x2; \
	vpxor	x0, x4, x4; \
	vpor	x1, tp, x3; \
	vpxor	RNOT, x1, x1; \
	vpxor	x0, x3, x3;

#define S5_1(x0, x1, x2, x3, x4) \
	vpor	x0, x1, tp; \
	vpxor	tp, x2, x2; \
	vpxor	RNOT, x3, x3; \
	vpxor	x0, x1, x4; \
	vpxor	x2, x0, x0; \
	vpand	x4, tp, x1; \
	vpor	x3, x4, x4; \
	vpxor	x0, x4, x4;
#define S5_2(x0, x1, x2, x3, x4) \
	vpand	x3, x0, x0; \
	vpxor	x3, x1, x1; \
	vpxor	x2, x3, x3; \
	vpxor	x1, x0, x0; \
	vpand	x4, x2, x2; \
	vpxor	x2, x1, x1; \
	vpand	x0, x2, x2; \
	vpxor	x2, x3, x3;

#define S6_1(x0, x1, x2, x3, x4) \
	vpxor	x0, x3, x3; \
	vpxor	x2, x1, tp; \
	vpxor	x0, x2, x2; \
	vpand	x3, x0, x0; \
	vpor	x3, tp, tp; \
	vpxor	RNOT, x1, x4; \
	vpxor	tp, x0, x0; \
	vpxor	x2, tp, x1;
#define S6_2(x0, x1, x2, x3, x4) \
	vpxor	x4, x3, x3; \
	vpxor	x0, x4, x4; \
	vpand	x0, x2, x2; \
	vpxor	x1, x4, x4; \
	vpxor	x3, x2, x2; \
	vpand	x1, x3, x3; \
	vpxor	x0, x3, x3; \
	vpxor	x2, x1, x1;

#define S7_1(x0, x1, x2, x3, x4) \
	vpxor	RNOT, x1, tp; \
	vpxor	RNOT, x0, x0; \
	vpand	x2, tp, x1; \
	vpxor	x3, x1, x1; \
	vpor	tp, x3, x3; \
	vpxor	x2, tp, x4; \
	vpxor	x3, x2, x2; \
	vpxor	x0, x3, x3; \
	vpor	x1, x0, x0;
#define S7_2(x0, x1, x2, x3, x4) \
	vpand	x0, x2, x2; \
	vpxor	x4, x0, x0; \
	vpxor	x3, x4, x4; \
	vpand	x0, x3, x3; \
	vpxor	x1, x4, x4; \
	vpxor	x4, x2, x2; \
	vpxor	x1, x3, x3; \
	vpor	x0, x4, x4; \
	vpxor	x1, x4, x4;
/*
 * Bitsliced inverse S-boxes SI0..SI7 (decryption direction), split into
 * _1/_2 halves exactly like the forward S-boxes so SP() can interleave
 * round-key loads between them.
 */
#define SI0_1(x0, x1, x2, x3, x4) \
	vpxor	x0, x1, x1; \
	vpor	x1, x3, tp; \
	vpxor	x1, x3, x4; \
	vpxor	RNOT, x0, x0; \
	vpxor	tp, x2, x2; \
	vpxor	x0, tp, x3; \
	vpand	x1, x0, x0; \
	vpxor	x2, x0, x0;
#define SI0_2(x0, x1, x2, x3, x4) \
	vpand	x3, x2, x2; \
	vpxor	x4, x3, x3; \
	vpxor	x3, x2, x2; \
	vpxor	x3, x1, x1; \
	vpand	x0, x3, x3; \
	vpxor	x0, x1, x1; \
	vpxor	x2, x0, x0; \
	vpxor	x3, x4, x4;

#define SI1_1(x0, x1, x2, x3, x4) \
	vpxor	x3, x1, x1; \
	vpxor	x2, x0, tp; \
	vpxor	RNOT, x2, x2; \
	vpor	x1, x0, x4; \
	vpxor	x3, x4, x4; \
	vpand	x1, x3, x3; \
	vpxor	x2, x1, x1; \
	vpand	x4, x2, x2;
#define SI1_2(x0, x1, x2, x3, x4) \
	vpxor	x1, x4, x4; \
	vpor	x3, x1, x1; \
	vpxor	tp, x3, x3; \
	vpxor	tp, x2, x2; \
	vpor	x4, tp, x0; \
	vpxor	x4, x2, x2; \
	vpxor	x0, x1, x1; \
	vpxor	x1, x4, x4;

#define SI2_1(x0, x1, x2, x3, x4) \
	vpxor	x1, x2, x2; \
	vpxor	RNOT, x3, tp; \
	vpor	x2, tp, tp; \
	vpxor	x3, x2, x2; \
	vpxor	x0, x3, x4; \
	vpxor	x1, tp, x3; \
	vpor	x2, x1, x1; \
	vpxor	x0, x2, x2;
#define SI2_2(x0, x1, x2, x3, x4) \
	vpxor	x4, x1, x1; \
	vpor	x3, x4, x4; \
	vpxor	x3, x2, x2; \
	vpxor	x2, x4, x4; \
	vpand	x1, x2, x2; \
	vpxor	x3, x2, x2; \
	vpxor	x4, x3, x3; \
	vpxor	x0, x4, x4;

#define SI3_1(x0, x1, x2, x3, x4) \
	vpxor	x1, x2, x2; \
	vpand	x2, x1, tp; \
	vpxor	x0, tp, tp; \
	vpor	x1, x0, x0; \
	vpxor	x3, x1, x4; \
	vpxor	x3, x0, x0; \
	vpor	tp, x3, x3; \
	vpxor	x2, tp, x1;
#define SI3_2(x0, x1, x2, x3, x4) \
	vpxor	x3, x1, x1; \
	vpxor	x2, x0, x0; \
	vpxor	x3, x2, x2; \
	vpand	x1, x3, x3; \
	vpxor	x0, x1, x1; \
	vpand	x2, x0, x0; \
	vpxor	x3, x4, x4; \
	vpxor	x0, x3, x3; \
	vpxor	x1, x0, x0;

#define SI4_1(x0, x1, x2, x3, x4) \
	vpxor	x3, x2, x2; \
	vpand	x1, x0, tp; \
	vpxor	x2, tp, tp; \
	vpor	x3, x2, x2; \
	vpxor	RNOT, x0, x4; \
	vpxor	tp, x1, x1; \
	vpxor	x2, tp, x0; \
	vpand	x4, x2, x2;
#define SI4_2(x0, x1, x2, x3, x4) \
	vpxor	x0, x2, x2; \
	vpor	x4, x0, x0; \
	vpxor	x3, x0, x0; \
	vpand	x2, x3, x3; \
	vpxor	x3, x4, x4; \
	vpxor	x1, x3, x3; \
	vpand	x0, x1, x1; \
	vpxor	x1, x4, x4; \
	vpxor	x3, x0, x0;

#define SI5_1(x0, x1, x2, x3, x4) \
	vpor	x2, x1, tp; \
	vpxor	x1, x2, x2; \
	vpxor	x3, tp, tp; \
	vpand	x1, x3, x3; \
	vpxor	x3, x2, x2; \
	vpor	x0, x3, x3; \
	vpxor	RNOT, x0, x0; \
	vpxor	x2, x3, x3; \
	vpor	x0, x2, x2;
#define SI5_2(x0, x1, x2, x3, x4) \
	vpxor	tp, x1, x4; \
	vpxor	x4, x2, x2; \
	vpand	x0, x4, x4; \
	vpxor	tp, x0, x0; \
	vpxor	x3, tp, x1; \
	vpand	x2, x0, x0; \
	vpxor	x3, x2, x2; \
	vpxor	x2, x0, x0; \
	vpxor	x4, x2, x2; \
	vpxor	x3, x4, x4;

#define SI6_1(x0, x1, x2, x3, x4) \
	vpxor	x2, x0, x0; \
	vpand	x3, x0, tp; \
	vpxor	x3, x2, x2; \
	vpxor	x2, tp, tp; \
	vpxor	x1, x3, x3; \
	vpor	x0, x2, x2; \
	vpxor	x3, x2, x2; \
	vpand	tp, x3, x3;
#define SI6_2(x0, x1, x2, x3, x4) \
	vpxor	RNOT, tp, tp; \
	vpxor	x1, x3, x3; \
	vpand	x2, x1, x1; \
	vpxor	tp, x0, x4; \
	vpxor	x4, x3, x3; \
	vpxor	x2, x4, x4; \
	vpxor	x1, tp, x0; \
	vpxor	x0, x2, x2;

#define SI7_1(x0, x1, x2, x3, x4) \
	vpand	x0, x3, tp; \
	vpxor	x2, x0, x0; \
	vpor	x3, x2, x2; \
	vpxor	x1, x3, x4; \
	vpxor	RNOT, x0, x0; \
	vpor	tp, x1, x1; \
	vpxor	x0, x4, x4; \
	vpand	x2, x0, x0; \
	vpxor	x1, x0, x0;
#define SI7_2(x0, x1, x2, x3, x4) \
	vpand	x2, x1, x1; \
	vpxor	x2, tp, x3; \
	vpxor	x3, x4, x4; \
	vpand	x3, x2, x2; \
	vpor	x0, x3, x3; \
	vpxor	x4, x1, x1; \
	vpxor	x4, x3, x3; \
	vpand	x0, x4, x4; \
	vpxor	x2, x4, x4;
/* Broadcast round-key word j of round i from the key schedule into t. */
#define get_key(i, j, t) \
	vbroadcastss (4*(i)+(j))*4(CTX), t;

/*
 * K2: xor round key i into both 4-block register sets.  The x0..x3
 * arguments are register-name prefixes (RA, RB, ...); "## 1"/"## 2"
 * token-pastes select the first/second set (RA1/RA2 etc.).
 */
#define K2(x0, x1, x2, x3, x4, i) \
	get_key(i, 0, RK0); \
	get_key(i, 1, RK1); \
	get_key(i, 2, RK2); \
	get_key(i, 3, RK3); \
	vpxor RK0,	x0 ## 1, x0 ## 1; \
	vpxor RK1,	x1 ## 1, x1 ## 1; \
	vpxor RK2,	x2 ## 1, x2 ## 1; \
	vpxor RK3,	x3 ## 1, x3 ## 1; \
	vpxor RK0,	x0 ## 2, x0 ## 2; \
	vpxor RK1,	x1 ## 2, x1 ## 2; \
	vpxor RK2,	x2 ## 2, x2 ## 2; \
	vpxor RK3,	x3 ## 2, x3 ## 2;
/*
 * LK2: Serpent linear transformation followed by xor with round key i,
 * applied to both 4-block sets ("## 1"/"## 2").  Rotates are built from
 * shift-left/shift-right/or triples; x4 is the rotate scratch register.
 * get_key() loads are interleaved with the arithmetic to hide latency.
 */
#define LK2(x0, x1, x2, x3, x4, i) \
	vpslld $13,		x0 ## 1, x4 ## 1; \
	vpsrld $(32 - 13),	x0 ## 1, x0 ## 1; \
	vpor			x4 ## 1, x0 ## 1, x0 ## 1; \
	vpxor			x0 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $3,		x2 ## 1, x4 ## 1; \
	vpsrld $(32 - 3),	x2 ## 1, x2 ## 1; \
	vpor			x4 ## 1, x2 ## 1, x2 ## 1; \
	vpxor			x2 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $13,		x0 ## 2, x4 ## 2; \
	vpsrld $(32 - 13),	x0 ## 2, x0 ## 2; \
	vpor			x4 ## 2, x0 ## 2, x0 ## 2; \
	vpxor			x0 ## 2, x1 ## 2, x1 ## 2; \
	vpslld $3,		x2 ## 2, x4 ## 2; \
	vpsrld $(32 - 3),	x2 ## 2, x2 ## 2; \
	vpor			x4 ## 2, x2 ## 2, x2 ## 2; \
	vpxor			x2 ## 2, x1 ## 2, x1 ## 2; \
	vpslld $1,		x1 ## 1, x4 ## 1; \
	vpsrld $(32 - 1),	x1 ## 1, x1 ## 1; \
	vpor			x4 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $3,		x0 ## 1, x4 ## 1; \
	vpxor			x2 ## 1, x3 ## 1, x3 ## 1; \
	vpxor			x4 ## 1, x3 ## 1, x3 ## 1; \
	get_key(i, 1, RK1); \
	vpslld $1,		x1 ## 2, x4 ## 2; \
	vpsrld $(32 - 1),	x1 ## 2, x1 ## 2; \
	vpor			x4 ## 2, x1 ## 2, x1 ## 2; \
	vpslld $3,		x0 ## 2, x4 ## 2; \
	vpxor			x2 ## 2, x3 ## 2, x3 ## 2; \
	vpxor			x4 ## 2, x3 ## 2, x3 ## 2; \
	get_key(i, 3, RK3); \
	vpslld $7,		x3 ## 1, x4 ## 1; \
	vpsrld $(32 - 7),	x3 ## 1, x3 ## 1; \
	vpor			x4 ## 1, x3 ## 1, x3 ## 1; \
	vpslld $7,		x1 ## 1, x4 ## 1; \
	vpxor			x1 ## 1, x0 ## 1, x0 ## 1; \
	vpxor			x3 ## 1, x0 ## 1, x0 ## 1; \
	vpxor			x3 ## 1, x2 ## 1, x2 ## 1; \
	vpxor			x4 ## 1, x2 ## 1, x2 ## 1; \
	get_key(i, 0, RK0); \
	vpslld $7,		x3 ## 2, x4 ## 2; \
	vpsrld $(32 - 7),	x3 ## 2, x3 ## 2; \
	vpor			x4 ## 2, x3 ## 2, x3 ## 2; \
	vpslld $7,		x1 ## 2, x4 ## 2; \
	vpxor			x1 ## 2, x0 ## 2, x0 ## 2; \
	vpxor			x3 ## 2, x0 ## 2, x0 ## 2; \
	vpxor			x3 ## 2, x2 ## 2, x2 ## 2; \
	vpxor			x4 ## 2, x2 ## 2, x2 ## 2; \
	get_key(i, 2, RK2); \
	vpxor			RK1, x1 ## 1, x1 ## 1; \
	vpxor			RK3, x3 ## 1, x3 ## 1; \
	vpslld $5,		x0 ## 1, x4 ## 1; \
	vpsrld $(32 - 5),	x0 ## 1, x0 ## 1; \
	vpor			x4 ## 1, x0 ## 1, x0 ## 1; \
	vpslld $22,		x2 ## 1, x4 ## 1; \
	vpsrld $(32 - 22),	x2 ## 1, x2 ## 1; \
	vpor			x4 ## 1, x2 ## 1, x2 ## 1; \
	vpxor			RK0, x0 ## 1, x0 ## 1; \
	vpxor			RK2, x2 ## 1, x2 ## 1; \
	vpxor			RK1, x1 ## 2, x1 ## 2; \
	vpxor			RK3, x3 ## 2, x3 ## 2; \
	vpslld $5,		x0 ## 2, x4 ## 2; \
	vpsrld $(32 - 5),	x0 ## 2, x0 ## 2; \
	vpor			x4 ## 2, x0 ## 2, x0 ## 2; \
	vpslld $22,		x2 ## 2, x4 ## 2; \
	vpsrld $(32 - 22),	x2 ## 2, x2 ## 2; \
	vpor			x4 ## 2, x2 ## 2, x2 ## 2; \
	vpxor			RK0, x0 ## 2, x0 ## 2; \
	vpxor			RK2, x2 ## 2, x2 ## 2;
/*
 * KL2: inverse of LK2 for decryption — xor with round key i, then the
 * inverse linear transformation, on both 4-block sets.  Rotations run
 * in the opposite direction (vpsrld/vpslld swapped relative to LK2).
 */
#define KL2(x0, x1, x2, x3, x4, i) \
	vpxor			RK0, x0 ## 1, x0 ## 1; \
	vpxor			RK2, x2 ## 1, x2 ## 1; \
	vpsrld $5,		x0 ## 1, x4 ## 1; \
	vpslld $(32 - 5),	x0 ## 1, x0 ## 1; \
	vpor			x4 ## 1, x0 ## 1, x0 ## 1; \
	vpxor			RK3, x3 ## 1, x3 ## 1; \
	vpxor			RK1, x1 ## 1, x1 ## 1; \
	vpsrld $22,		x2 ## 1, x4 ## 1; \
	vpslld $(32 - 22),	x2 ## 1, x2 ## 1; \
	vpor			x4 ## 1, x2 ## 1, x2 ## 1; \
	vpxor			x3 ## 1, x2 ## 1, x2 ## 1; \
	vpxor			RK0, x0 ## 2, x0 ## 2; \
	vpxor			RK2, x2 ## 2, x2 ## 2; \
	vpsrld $5,		x0 ## 2, x4 ## 2; \
	vpslld $(32 - 5),	x0 ## 2, x0 ## 2; \
	vpor			x4 ## 2, x0 ## 2, x0 ## 2; \
	vpxor			RK3, x3 ## 2, x3 ## 2; \
	vpxor			RK1, x1 ## 2, x1 ## 2; \
	vpsrld $22,		x2 ## 2, x4 ## 2; \
	vpslld $(32 - 22),	x2 ## 2, x2 ## 2; \
	vpor			x4 ## 2, x2 ## 2, x2 ## 2; \
	vpxor			x3 ## 2, x2 ## 2, x2 ## 2; \
	vpxor			x3 ## 1, x0 ## 1, x0 ## 1; \
	vpslld $7,		x1 ## 1, x4 ## 1; \
	vpxor			x1 ## 1, x0 ## 1, x0 ## 1; \
	vpxor			x4 ## 1, x2 ## 1, x2 ## 1; \
	vpsrld $1,		x1 ## 1, x4 ## 1; \
	vpslld $(32 - 1),	x1 ## 1, x1 ## 1; \
	vpor			x4 ## 1, x1 ## 1, x1 ## 1; \
	vpxor			x3 ## 2, x0 ## 2, x0 ## 2; \
	vpslld $7,		x1 ## 2, x4 ## 2; \
	vpxor			x1 ## 2, x0 ## 2, x0 ## 2; \
	vpxor			x4 ## 2, x2 ## 2, x2 ## 2; \
	vpsrld $1,		x1 ## 2, x4 ## 2; \
	vpslld $(32 - 1),	x1 ## 2, x1 ## 2; \
	vpor			x4 ## 2, x1 ## 2, x1 ## 2; \
	vpsrld $7,		x3 ## 1, x4 ## 1; \
	vpslld $(32 - 7),	x3 ## 1, x3 ## 1; \
	vpor			x4 ## 1, x3 ## 1, x3 ## 1; \
	vpxor			x0 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $3,		x0 ## 1, x4 ## 1; \
	vpxor			x4 ## 1, x3 ## 1, x3 ## 1; \
	vpsrld $7,		x3 ## 2, x4 ## 2; \
	vpslld $(32 - 7),	x3 ## 2, x3 ## 2; \
	vpor			x4 ## 2, x3 ## 2, x3 ## 2; \
	vpxor			x0 ## 2, x1 ## 2, x1 ## 2; \
	vpslld $3,		x0 ## 2, x4 ## 2; \
	vpxor			x4 ## 2, x3 ## 2, x3 ## 2; \
	vpsrld $13,		x0 ## 1, x4 ## 1; \
	vpslld $(32 - 13),	x0 ## 1, x0 ## 1; \
	vpor			x4 ## 1, x0 ## 1, x0 ## 1; \
	vpxor			x2 ## 1, x1 ## 1, x1 ## 1; \
	vpxor			x2 ## 1, x3 ## 1, x3 ## 1; \
	vpsrld $3,		x2 ## 1, x4 ## 1; \
	vpslld $(32 - 3),	x2 ## 1, x2 ## 1; \
	vpor			x4 ## 1, x2 ## 1, x2 ## 1; \
	vpsrld $13,		x0 ## 2, x4 ## 2; \
	vpslld $(32 - 13),	x0 ## 2, x0 ## 2; \
	vpor			x4 ## 2, x0 ## 2, x0 ## 2; \
	vpxor			x2 ## 2, x1 ## 2, x1 ## 2; \
	vpxor			x2 ## 2, x3 ## 2, x3 ## 2; \
	vpsrld $3,		x2 ## 2, x4 ## 2; \
	vpslld $(32 - 3),	x2 ## 2, x2 ## 2; \
	vpor			x4 ## 2, x2 ## 2, x2 ## 2;
/* Apply S-box SBOX (both halves) to both 4-block register sets. */
#define S(SBOX, x0, x1, x2, x3, x4) \
	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);

/* Like S(), but prefetches round key i between the S-box halves so the
 * following KL2() finds RK0..RK3 already loaded. */
#define SP(SBOX, x0, x1, x2, x3, x4, i) \
	get_key(i, 0, RK0); \
	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	get_key(i, 2, RK2); \
	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	get_key(i, 3, RK3); \
	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
	get_key(i, 1, RK1); \
	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);
/* 4x4 transpose of 32-bit words across four xmm registers: converts
 * four loaded blocks between block order and bitsliced word order. */
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	vpunpckldq		x1, x0, t0; \
	vpunpckhdq		x1, x0, t2; \
	vpunpckldq		x3, x2, t1; \
	vpunpckhdq		x3, x2, x3; \
	\
	vpunpcklqdq		t1, t0, x0; \
	vpunpckhqdq		t1, t0, x1; \
	vpunpcklqdq		x3, t2, x2; \
	vpunpckhqdq		x3, t2, x3;

/* Bitslice four blocks for processing ... */
#define read_blocks(x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

/* ... and un-bitslice them afterwards (transpose is self-inverse). */
#define write_blocks(x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
.align 8
__serpent_enc_blk8_avx:
	/* input:
	 *	%rdi: ctx, CTX
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: eight plaintext blocks
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: eight encrypted blocks
	 */

	vpcmpeqd RNOT, RNOT, RNOT;	/* RNOT = all ones (for NOT via xor) */

	read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	/* 32 rounds: key mix, S-box, linear transform.  The register-name
	 * permutations between rounds track where each state word lives. */
	K2(RA, RB, RC, RD, RE, 0);
	S(S0, RA, RB, RC, RD, RE); LK2(RC, RB, RD, RA, RE, 1);
	S(S1, RC, RB, RD, RA, RE); LK2(RE, RD, RA, RC, RB, 2);
	S(S2, RE, RD, RA, RC, RB); LK2(RB, RD, RE, RC, RA, 3);
	S(S3, RB, RD, RE, RC, RA); LK2(RC, RA, RD, RB, RE, 4);
	S(S4, RC, RA, RD, RB, RE); LK2(RA, RD, RB, RE, RC, 5);
	S(S5, RA, RD, RB, RE, RC); LK2(RC, RA, RD, RE, RB, 6);
	S(S6, RC, RA, RD, RE, RB); LK2(RD, RB, RA, RE, RC, 7);
	S(S7, RD, RB, RA, RE, RC); LK2(RC, RA, RE, RD, RB, 8);
	S(S0, RC, RA, RE, RD, RB); LK2(RE, RA, RD, RC, RB, 9);
	S(S1, RE, RA, RD, RC, RB); LK2(RB, RD, RC, RE, RA, 10);
	S(S2, RB, RD, RC, RE, RA); LK2(RA, RD, RB, RE, RC, 11);
	S(S3, RA, RD, RB, RE, RC); LK2(RE, RC, RD, RA, RB, 12);
	S(S4, RE, RC, RD, RA, RB); LK2(RC, RD, RA, RB, RE, 13);
	S(S5, RC, RD, RA, RB, RE); LK2(RE, RC, RD, RB, RA, 14);
	S(S6, RE, RC, RD, RB, RA); LK2(RD, RA, RC, RB, RE, 15);
	S(S7, RD, RA, RC, RB, RE); LK2(RE, RC, RB, RD, RA, 16);
	S(S0, RE, RC, RB, RD, RA); LK2(RB, RC, RD, RE, RA, 17);
	S(S1, RB, RC, RD, RE, RA); LK2(RA, RD, RE, RB, RC, 18);
	S(S2, RA, RD, RE, RB, RC); LK2(RC, RD, RA, RB, RE, 19);
	S(S3, RC, RD, RA, RB, RE); LK2(RB, RE, RD, RC, RA, 20);
	S(S4, RB, RE, RD, RC, RA); LK2(RE, RD, RC, RA, RB, 21);
	S(S5, RE, RD, RC, RA, RB); LK2(RB, RE, RD, RA, RC, 22);
	S(S6, RB, RE, RD, RA, RC); LK2(RD, RC, RE, RA, RB, 23);
	S(S7, RD, RC, RE, RA, RB); LK2(RB, RE, RA, RD, RC, 24);
	S(S0, RB, RE, RA, RD, RC); LK2(RA, RE, RD, RB, RC, 25);
	S(S1, RA, RE, RD, RB, RC); LK2(RC, RD, RB, RA, RE, 26);
	S(S2, RC, RD, RB, RA, RE); LK2(RE, RD, RC, RA, RB, 27);
	S(S3, RE, RD, RC, RA, RB); LK2(RA, RB, RD, RE, RC, 28);
	S(S4, RA, RB, RD, RE, RC); LK2(RB, RD, RE, RC, RA, 29);
	S(S5, RB, RD, RE, RC, RA); LK2(RA, RB, RD, RC, RE, 30);
	S(S6, RA, RB, RD, RC, RE); LK2(RD, RE, RB, RC, RA, 31);
	S(S7, RD, RE, RB, RC, RA); K2(RA, RB, RC, RD, RE, 32);

	write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	ret;
ENDPROC(__serpent_enc_blk8_avx)
.align 8
__serpent_dec_blk8_avx:
	/* input:
	 *	%rdi: ctx, CTX
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: eight encrypted blocks
	 * output:
	 *	RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: eight decrypted blocks
	 */

	vpcmpeqd RNOT, RNOT, RNOT;	/* RNOT = all ones (for NOT via xor) */

	read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	/* 32 rounds in reverse: inverse S-boxes with interleaved key loads
	 * (SP), then inverse linear transform + key mix (KL2). */
	K2(RA, RB, RC, RD, RE, 32);
	SP(SI7, RA, RB, RC, RD, RE, 31); KL2(RB, RD, RA, RE, RC, 31);
	SP(SI6, RB, RD, RA, RE, RC, 30); KL2(RA, RC, RE, RB, RD, 30);
	SP(SI5, RA, RC, RE, RB, RD, 29); KL2(RC, RD, RA, RE, RB, 29);
	SP(SI4, RC, RD, RA, RE, RB, 28); KL2(RC, RA, RB, RE, RD, 28);
	SP(SI3, RC, RA, RB, RE, RD, 27); KL2(RB, RC, RD, RE, RA, 27);
	SP(SI2, RB, RC, RD, RE, RA, 26); KL2(RC, RA, RE, RD, RB, 26);
	SP(SI1, RC, RA, RE, RD, RB, 25); KL2(RB, RA, RE, RD, RC, 25);
	SP(SI0, RB, RA, RE, RD, RC, 24); KL2(RE, RC, RA, RB, RD, 24);
	SP(SI7, RE, RC, RA, RB, RD, 23); KL2(RC, RB, RE, RD, RA, 23);
	SP(SI6, RC, RB, RE, RD, RA, 22); KL2(RE, RA, RD, RC, RB, 22);
	SP(SI5, RE, RA, RD, RC, RB, 21); KL2(RA, RB, RE, RD, RC, 21);
	SP(SI4, RA, RB, RE, RD, RC, 20); KL2(RA, RE, RC, RD, RB, 20);
	SP(SI3, RA, RE, RC, RD, RB, 19); KL2(RC, RA, RB, RD, RE, 19);
	SP(SI2, RC, RA, RB, RD, RE, 18); KL2(RA, RE, RD, RB, RC, 18);
	SP(SI1, RA, RE, RD, RB, RC, 17); KL2(RC, RE, RD, RB, RA, 17);
	SP(SI0, RC, RE, RD, RB, RA, 16); KL2(RD, RA, RE, RC, RB, 16);
	SP(SI7, RD, RA, RE, RC, RB, 15); KL2(RA, RC, RD, RB, RE, 15);
	SP(SI6, RA, RC, RD, RB, RE, 14); KL2(RD, RE, RB, RA, RC, 14);
	SP(SI5, RD, RE, RB, RA, RC, 13); KL2(RE, RC, RD, RB, RA, 13);
	SP(SI4, RE, RC, RD, RB, RA, 12); KL2(RE, RD, RA, RB, RC, 12);
	SP(SI3, RE, RD, RA, RB, RC, 11); KL2(RA, RE, RC, RB, RD, 11);
	SP(SI2, RA, RE, RC, RB, RD, 10); KL2(RE, RD, RB, RC, RA, 10);
	SP(SI1, RE, RD, RB, RC, RA, 9); KL2(RA, RD, RB, RC, RE, 9);
	SP(SI0, RA, RD, RB, RC, RE, 8); KL2(RB, RE, RD, RA, RC, 8);
	SP(SI7, RB, RE, RD, RA, RC, 7); KL2(RE, RA, RB, RC, RD, 7);
	SP(SI6, RE, RA, RB, RC, RD, 6); KL2(RB, RD, RC, RE, RA, 6);
	SP(SI5, RB, RD, RC, RE, RA, 5); KL2(RD, RA, RB, RC, RE, 5);
	SP(SI4, RD, RA, RB, RC, RE, 4); KL2(RD, RB, RE, RC, RA, 4);
	SP(SI3, RD, RB, RE, RC, RA, 3); KL2(RE, RD, RA, RC, RB, 3);
	SP(SI2, RE, RD, RA, RC, RB, 2); KL2(RD, RB, RC, RA, RE, 2);
	SP(SI1, RD, RB, RC, RA, RE, 1); KL2(RE, RB, RC, RA, RD, 1);
	S(SI0, RE, RB, RC, RA, RD); K2(RC, RD, RB, RE, RA, 0);

	write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2);
	write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2);

	ret;
ENDPROC(__serpent_dec_blk8_avx)
ENTRY(serpent_ecb_enc_8way_avx)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __serpent_enc_blk8_avx;

	store_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	ret;
ENDPROC(serpent_ecb_enc_8way_avx)
ENTRY(serpent_ecb_dec_8way_avx)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __serpent_dec_blk8_avx;

	/* decryption core returns its result in the RC/RD/RB/RE registers */
	store_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);

	ret;
ENDPROC(serpent_ecb_dec_8way_avx)
ENTRY(serpent_cbc_dec_8way_avx)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __serpent_dec_blk8_avx;

	/* CBC chaining: xor each decrypted block with the previous
	 * ciphertext block still available at %rdx. */
	store_cbc_8way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);

	ret;
ENDPROC(serpent_cbc_dec_8way_avx)
ENTRY(serpent_ctr_8way_avx)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (little endian, 128bit counter)
	 */

	load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
		      RD2, RK0, RK1, RK2);

	call __serpent_enc_blk8_avx;

	/* dst = src xor encrypted counter blocks */
	store_ctr_8way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	ret;
ENDPROC(serpent_ctr_8way_avx)
ENTRY(serpent_xts_enc_8way_avx)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (initial XTS tweak)
	 */

	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
	load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
		      RK0, RK1, RK2, .Lxts_gf128mul_and_shl1_mask);

	call __serpent_enc_blk8_avx;

	/* dst <= regs xor IVs (stored in dst by load_xts_8way) */
	store_xts_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	ret;
ENDPROC(serpent_xts_enc_8way_avx)
ENTRY(serpent_xts_dec_8way_avx)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (initial XTS tweak)
	 */

	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
	load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
		      RK0, RK1, RK2, .Lxts_gf128mul_and_shl1_mask);

	call __serpent_dec_blk8_avx;

	/* dst <= regs xor IVs; decryption core outputs in RC/RD/RB/RE */
	store_xts_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);

	ret;
ENDPROC(serpent_xts_dec_8way_avx)