/*
 * Serpent Cipher 8-way parallel algorithm (x86_64/AVX)
 *
 * Copyright (C) 2012 Johannes Goetzfried
 *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * Copyright (c) 2011-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA.
 *
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include "glue_helper-asm-avx.S"

.file "serpent-avx-x86_64-asm_64.S"

.data
.align 16

/* Shuffle mask: byte-reverse a 128-bit lane (big- <-> little-endian CTR). */
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
/* XTS: multiply tweak by alpha in GF(2^128), reduction poly 0x87. */
.Lxts_gf128mul_and_shl1_mask:
	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0

.text

/* First argument (struct serpent_ctx *) per SysV AMD64 ABI. */
#define CTX %rdi
46
/* First 4-block group: state words a, b, c, d plus scratch e. */
#define RA1 %xmm0
#define RB1 %xmm1
#define RC1 %xmm2
#define RD1 %xmm3
#define RE1 %xmm4

/* Shared temporary used inside the S-box macros. */
#define tp  %xmm5

/* Second 4-block group. */
#define RA2 %xmm6
#define RB2 %xmm7
#define RC2 %xmm8
#define RD2 %xmm9
#define RE2 %xmm10

/* All-ones constant (set once with vpcmpeqd) used to emulate NOT via XOR. */
#define RNOT %xmm11

/* Broadcast round-key words k0..k3. */
#define RK0 %xmm12
#define RK1 %xmm13
#define RK2 %xmm14
#define RK3 %xmm15
67
68
/*
 * Serpent forward S-boxes S0..S7 as straight-line boolean circuits over
 * 4x32-bit words (x0..x3), scratch x4 and tp.  Each S-box is split into
 * two halves (_1/_2) so key loads can be interleaved by the SP() macro.
 * Instruction order is part of the algorithm — do not reorder.
 */
#define S0_1(x0, x1, x2, x3, x4)      \
	vpor x0,   x3, tp; \
	vpxor x3,   x0, x0; \
	vpxor x2,   x3, x4; \
	vpxor RNOT, x4, x4; \
	vpxor x1,   tp, x3; \
	vpand x0,   x1, x1; \
	vpxor x4,   x1, x1; \
	vpxor x0,   x2, x2;
#define S0_2(x0, x1, x2, x3, x4)      \
	vpxor x3,   x0, x0; \
	vpor x0,   x4, x4; \
	vpxor x2,   x0, x0; \
	vpand x1,   x2, x2; \
	vpxor x2,   x3, x3; \
	vpxor RNOT, x1, x1; \
	vpxor x4,   x2, x2; \
	vpxor x2,   x1, x1;

#define S1_1(x0, x1, x2, x3, x4)      \
	vpxor x0,   x1, tp; \
	vpxor x3,   x0, x0; \
	vpxor RNOT, x3, x3; \
	vpand tp,   x1, x4; \
	vpor tp,   x0, x0; \
	vpxor x2,   x3, x3; \
	vpxor x3,   x0, x0; \
	vpxor x3,   tp, x1;
#define S1_2(x0, x1, x2, x3, x4)      \
	vpxor x4,   x3, x3; \
	vpor x4,   x1, x1; \
	vpxor x2,   x4, x4; \
	vpand x0,   x2, x2; \
	vpxor x1,   x2, x2; \
	vpor x0,   x1, x1; \
	vpxor RNOT, x0, x0; \
	vpxor x2,   x0, x0; \
	vpxor x1,   x4, x4;

#define S2_1(x0, x1, x2, x3, x4)      \
	vpxor RNOT, x3, x3; \
	vpxor x0,   x1, x1; \
	vpand x2,   x0, tp; \
	vpxor x3,   tp, tp; \
	vpor x0,   x3, x3; \
	vpxor x1,   x2, x2; \
	vpxor x1,   x3, x3; \
	vpand tp,   x1, x1;
#define S2_2(x0, x1, x2, x3, x4)      \
	vpxor x2,   tp, tp; \
	vpand x3,   x2, x2; \
	vpor x1,   x3, x3; \
	vpxor RNOT, tp, tp; \
	vpxor tp,   x3, x3; \
	vpxor tp,   x0, x4; \
	vpxor x2,   tp, x0; \
	vpor x2,   x1, x1;

#define S3_1(x0, x1, x2, x3, x4)      \
	vpxor x3,   x1, tp; \
	vpor x0,   x3, x3; \
	vpand x0,   x1, x4; \
	vpxor x2,   x0, x0; \
	vpxor tp,   x2, x2; \
	vpand x3,   tp, x1; \
	vpxor x3,   x2, x2; \
	vpor x4,   x0, x0; \
	vpxor x3,   x4, x4;
#define S3_2(x0, x1, x2, x3, x4)      \
	vpxor x0,   x1, x1; \
	vpand x3,   x0, x0; \
	vpand x4,   x3, x3; \
	vpxor x2,   x3, x3; \
	vpor x1,   x4, x4; \
	vpand x1,   x2, x2; \
	vpxor x3,   x4, x4; \
	vpxor x3,   x0, x0; \
	vpxor x2,   x3, x3;

#define S4_1(x0, x1, x2, x3, x4)      \
	vpand x0,   x3, tp; \
	vpxor x3,   x0, x0; \
	vpxor x2,   tp, tp; \
	vpor x3,   x2, x2; \
	vpxor x1,   x0, x0; \
	vpxor tp,   x3, x4; \
	vpor x0,   x2, x2; \
	vpxor x1,   x2, x2;
#define S4_2(x0, x1, x2, x3, x4)      \
	vpand x0,   x1, x1; \
	vpxor x4,   x1, x1; \
	vpand x2,   x4, x4; \
	vpxor tp,   x2, x2; \
	vpxor x0,   x4, x4; \
	vpor x1,   tp, x3; \
	vpxor RNOT, x1, x1; \
	vpxor x0,   x3, x3;

#define S5_1(x0, x1, x2, x3, x4)      \
	vpor x0,   x1, tp; \
	vpxor tp,   x2, x2; \
	vpxor RNOT, x3, x3; \
	vpxor x0,   x1, x4; \
	vpxor x2,   x0, x0; \
	vpand x4,   tp, x1; \
	vpor x3,   x4, x4; \
	vpxor x0,   x4, x4;
#define S5_2(x0, x1, x2, x3, x4)      \
	vpand x3,   x0, x0; \
	vpxor x3,   x1, x1; \
	vpxor x2,   x3, x3; \
	vpxor x1,   x0, x0; \
	vpand x4,   x2, x2; \
	vpxor x2,   x1, x1; \
	vpand x0,   x2, x2; \
	vpxor x2,   x3, x3;

#define S6_1(x0, x1, x2, x3, x4)      \
	vpxor x0,   x3, x3; \
	vpxor x2,   x1, tp; \
	vpxor x0,   x2, x2; \
	vpand x3,   x0, x0; \
	vpor x3,   tp, tp; \
	vpxor RNOT, x1, x4; \
	vpxor tp,   x0, x0; \
	vpxor x2,   tp, x1;
#define S6_2(x0, x1, x2, x3, x4)      \
	vpxor x4,   x3, x3; \
	vpxor x0,   x4, x4; \
	vpand x0,   x2, x2; \
	vpxor x1,   x4, x4; \
	vpxor x3,   x2, x2; \
	vpand x1,   x3, x3; \
	vpxor x0,   x3, x3; \
	vpxor x2,   x1, x1;

#define S7_1(x0, x1, x2, x3, x4)      \
	vpxor RNOT, x1, tp; \
	vpxor RNOT, x0, x0; \
	vpand x2,   tp, x1; \
	vpxor x3,   x1, x1; \
	vpor tp,   x3, x3; \
	vpxor x2,   tp, x4; \
	vpxor x3,   x2, x2; \
	vpxor x0,   x3, x3; \
	vpor x1,   x0, x0;
#define S7_2(x0, x1, x2, x3, x4)      \
	vpand x0,   x2, x2; \
	vpxor x4,   x0, x0; \
	vpxor x3,   x4, x4; \
	vpand x0,   x3, x3; \
	vpxor x1,   x4, x4; \
	vpxor x4,   x2, x2; \
	vpxor x1,   x3, x3; \
	vpor x0,   x4, x4; \
	vpxor x1,   x4, x4;
225
/*
 * Serpent inverse S-boxes SI0..SI7 (decryption direction), same two-half
 * structure as the forward boxes.  Instruction order is order-critical.
 */
#define SI0_1(x0, x1, x2, x3, x4)     \
	vpxor x0,   x1, x1; \
	vpor x1,   x3, tp; \
	vpxor x1,   x3, x4; \
	vpxor RNOT, x0, x0; \
	vpxor tp,   x2, x2; \
	vpxor x0,   tp, x3; \
	vpand x1,   x0, x0; \
	vpxor x2,   x0, x0;
#define SI0_2(x0, x1, x2, x3, x4)     \
	vpand x3,   x2, x2; \
	vpxor x4,   x3, x3; \
	vpxor x3,   x2, x2; \
	vpxor x3,   x1, x1; \
	vpand x0,   x3, x3; \
	vpxor x0,   x1, x1; \
	vpxor x2,   x0, x0; \
	vpxor x3,   x4, x4;

#define SI1_1(x0, x1, x2, x3, x4)     \
	vpxor x3,   x1, x1; \
	vpxor x2,   x0, tp; \
	vpxor RNOT, x2, x2; \
	vpor x1,   x0, x4; \
	vpxor x3,   x4, x4; \
	vpand x1,   x3, x3; \
	vpxor x2,   x1, x1; \
	vpand x4,   x2, x2;
#define SI1_2(x0, x1, x2, x3, x4)     \
	vpxor x1,   x4, x4; \
	vpor x3,   x1, x1; \
	vpxor tp,   x3, x3; \
	vpxor tp,   x2, x2; \
	vpor x4,   tp, x0; \
	vpxor x4,   x2, x2; \
	vpxor x0,   x1, x1; \
	vpxor x1,   x4, x4;

#define SI2_1(x0, x1, x2, x3, x4)     \
	vpxor x1,   x2, x2; \
	vpxor RNOT, x3, tp; \
	vpor x2,   tp, tp; \
	vpxor x3,   x2, x2; \
	vpxor x0,   x3, x4; \
	vpxor x1,   tp, x3; \
	vpor x2,   x1, x1; \
	vpxor x0,   x2, x2;
#define SI2_2(x0, x1, x2, x3, x4)     \
	vpxor x4,   x1, x1; \
	vpor x3,   x4, x4; \
	vpxor x3,   x2, x2; \
	vpxor x2,   x4, x4; \
	vpand x1,   x2, x2; \
	vpxor x3,   x2, x2; \
	vpxor x4,   x3, x3; \
	vpxor x0,   x4, x4;

#define SI3_1(x0, x1, x2, x3, x4)     \
	vpxor x1,   x2, x2; \
	vpand x2,   x1, tp; \
	vpxor x0,   tp, tp; \
	vpor x1,   x0, x0; \
	vpxor x3,   x1, x4; \
	vpxor x3,   x0, x0; \
	vpor tp,   x3, x3; \
	vpxor x2,   tp, x1;
#define SI3_2(x0, x1, x2, x3, x4)     \
	vpxor x3,   x1, x1; \
	vpxor x2,   x0, x0; \
	vpxor x3,   x2, x2; \
	vpand x1,   x3, x3; \
	vpxor x0,   x1, x1; \
	vpand x2,   x0, x0; \
	vpxor x3,   x4, x4; \
	vpxor x0,   x3, x3; \
	vpxor x1,   x0, x0;

#define SI4_1(x0, x1, x2, x3, x4)     \
	vpxor x3,   x2, x2; \
	vpand x1,   x0, tp; \
	vpxor x2,   tp, tp; \
	vpor x3,   x2, x2; \
	vpxor RNOT, x0, x4; \
	vpxor tp,   x1, x1; \
	vpxor x2,   tp, x0; \
	vpand x4,   x2, x2;
#define SI4_2(x0, x1, x2, x3, x4)     \
	vpxor x0,   x2, x2; \
	vpor x4,   x0, x0; \
	vpxor x3,   x0, x0; \
	vpand x2,   x3, x3; \
	vpxor x3,   x4, x4; \
	vpxor x1,   x3, x3; \
	vpand x0,   x1, x1; \
	vpxor x1,   x4, x4; \
	vpxor x3,   x0, x0;

#define SI5_1(x0, x1, x2, x3, x4)     \
	vpor x2,   x1, tp; \
	vpxor x1,   x2, x2; \
	vpxor x3,   tp, tp; \
	vpand x1,   x3, x3; \
	vpxor x3,   x2, x2; \
	vpor x0,   x3, x3; \
	vpxor RNOT, x0, x0; \
	vpxor x2,   x3, x3; \
	vpor x0,   x2, x2;
#define SI5_2(x0, x1, x2, x3, x4)     \
	vpxor tp,   x1, x4; \
	vpxor x4,   x2, x2; \
	vpand x0,   x4, x4; \
	vpxor tp,   x0, x0; \
	vpxor x3,   tp, x1; \
	vpand x2,   x0, x0; \
	vpxor x3,   x2, x2; \
	vpxor x2,   x0, x0; \
	vpxor x4,   x2, x2; \
	vpxor x3,   x4, x4;

#define SI6_1(x0, x1, x2, x3, x4)     \
	vpxor x2,   x0, x0; \
	vpand x3,   x0, tp; \
	vpxor x3,   x2, x2; \
	vpxor x2,   tp, tp; \
	vpxor x1,   x3, x3; \
	vpor x0,   x2, x2; \
	vpxor x3,   x2, x2; \
	vpand tp,   x3, x3;
#define SI6_2(x0, x1, x2, x3, x4)     \
	vpxor RNOT, tp, tp; \
	vpxor x1,   x3, x3; \
	vpand x2,   x1, x1; \
	vpxor tp,   x0, x4; \
	vpxor x4,   x3, x3; \
	vpxor x2,   x4, x4; \
	vpxor x1,   tp, x0; \
	vpxor x0,   x2, x2;

#define SI7_1(x0, x1, x2, x3, x4)     \
	vpand x0,   x3, tp; \
	vpxor x2,   x0, x0; \
	vpor x3,   x2, x2; \
	vpxor x1,   x3, x4; \
	vpxor RNOT, x0, x0; \
	vpor tp,   x1, x1; \
	vpxor x0,   x4, x4; \
	vpand x2,   x0, x0; \
	vpxor x1,   x0, x0;
#define SI7_2(x0, x1, x2, x3, x4)     \
	vpand x2,   x1, x1; \
	vpxor x2,   tp, x3; \
	vpxor x3,   x4, x4; \
	vpand x3,   x2, x2; \
	vpor x0,   x3, x3; \
	vpxor x4,   x1, x1; \
	vpxor x4,   x3, x3; \
	vpand x0,   x4, x4; \
	vpxor x2,   x4, x4;
384
/* Broadcast 32-bit round-key word j of round i from the expanded key. */
#define get_key(i, j, t) \
	vbroadcastss (4*(i)+(j))*4(CTX), t;

/*
 * K2: XOR round key i into both 4-block groups (x0..x3 paste with
 * suffix 1 and 2 to form e.g. RA1/RA2).  Used for the first and the
 * final (33rd) key mixing.
 */
#define K2(x0, x1, x2, x3, x4, i) \
	get_key(i, 0, RK0); \
	get_key(i, 1, RK1); \
	get_key(i, 2, RK2); \
	get_key(i, 3, RK3); \
	vpxor RK0, x0 ## 1, x0 ## 1; \
	vpxor RK1, x1 ## 1, x1 ## 1; \
	vpxor RK2, x2 ## 1, x2 ## 1; \
	vpxor RK3, x3 ## 1, x3 ## 1; \
	vpxor RK0, x0 ## 2, x0 ## 2; \
	vpxor RK1, x1 ## 2, x1 ## 2; \
	vpxor RK2, x2 ## 2, x2 ## 2; \
	vpxor RK3, x3 ## 2, x3 ## 2;
401
/*
 * LK2: Serpent linear transformation (rotates/shifts/XOR mixing) followed
 * by the key mixing for round i, applied to both 4-block groups
 * (register suffixes 1 and 2 via token pasting).  Rotate-left by r is
 * implemented as (x << r) | (x >> (32 - r)) using x4 as scratch.
 * get_key() loads are interleaved to hide their latency.
 */
#define LK2(x0, x1, x2, x3, x4, i) \
	vpslld $13, x0 ## 1, x4 ## 1;          \
	vpsrld $(32 - 13), x0 ## 1, x0 ## 1;   \
	vpor x4 ## 1, x0 ## 1, x0 ## 1;        \
	vpxor x0 ## 1, x1 ## 1, x1 ## 1;       \
	vpslld $3, x2 ## 1, x4 ## 1;           \
	vpsrld $(32 - 3), x2 ## 1, x2 ## 1;    \
	vpor x4 ## 1, x2 ## 1, x2 ## 1;        \
	vpxor x2 ## 1, x1 ## 1, x1 ## 1;       \
	vpslld $13, x0 ## 2, x4 ## 2;          \
	vpsrld $(32 - 13), x0 ## 2, x0 ## 2;   \
	vpor x4 ## 2, x0 ## 2, x0 ## 2;        \
	vpxor x0 ## 2, x1 ## 2, x1 ## 2;       \
	vpslld $3, x2 ## 2, x4 ## 2;           \
	vpsrld $(32 - 3), x2 ## 2, x2 ## 2;    \
	vpor x4 ## 2, x2 ## 2, x2 ## 2;        \
	vpxor x2 ## 2, x1 ## 2, x1 ## 2;       \
	vpslld $1, x1 ## 1, x4 ## 1;           \
	vpsrld $(32 - 1), x1 ## 1, x1 ## 1;    \
	vpor x4 ## 1, x1 ## 1, x1 ## 1;        \
	vpslld $3, x0 ## 1, x4 ## 1;           \
	vpxor x2 ## 1, x3 ## 1, x3 ## 1;       \
	vpxor x4 ## 1, x3 ## 1, x3 ## 1;       \
	get_key(i, 1, RK1);                    \
	vpslld $1, x1 ## 2, x4 ## 2;           \
	vpsrld $(32 - 1), x1 ## 2, x1 ## 2;    \
	vpor x4 ## 2, x1 ## 2, x1 ## 2;        \
	vpslld $3, x0 ## 2, x4 ## 2;           \
	vpxor x2 ## 2, x3 ## 2, x3 ## 2;       \
	vpxor x4 ## 2, x3 ## 2, x3 ## 2;       \
	get_key(i, 3, RK3);                    \
	vpslld $7, x3 ## 1, x4 ## 1;           \
	vpsrld $(32 - 7), x3 ## 1, x3 ## 1;    \
	vpor x4 ## 1, x3 ## 1, x3 ## 1;        \
	vpslld $7, x1 ## 1, x4 ## 1;           \
	vpxor x1 ## 1, x0 ## 1, x0 ## 1;       \
	vpxor x3 ## 1, x0 ## 1, x0 ## 1;       \
	vpxor x3 ## 1, x2 ## 1, x2 ## 1;       \
	vpxor x4 ## 1, x2 ## 1, x2 ## 1;       \
	get_key(i, 0, RK0);                    \
	vpslld $7, x3 ## 2, x4 ## 2;           \
	vpsrld $(32 - 7), x3 ## 2, x3 ## 2;    \
	vpor x4 ## 2, x3 ## 2, x3 ## 2;        \
	vpslld $7, x1 ## 2, x4 ## 2;           \
	vpxor x1 ## 2, x0 ## 2, x0 ## 2;       \
	vpxor x3 ## 2, x0 ## 2, x0 ## 2;       \
	vpxor x3 ## 2, x2 ## 2, x2 ## 2;       \
	vpxor x4 ## 2, x2 ## 2, x2 ## 2;       \
	get_key(i, 2, RK2);                    \
	vpxor RK1, x1 ## 1, x1 ## 1;           \
	vpxor RK3, x3 ## 1, x3 ## 1;           \
	vpslld $5, x0 ## 1, x4 ## 1;           \
	vpsrld $(32 - 5), x0 ## 1, x0 ## 1;    \
	vpor x4 ## 1, x0 ## 1, x0 ## 1;        \
	vpslld $22, x2 ## 1, x4 ## 1;          \
	vpsrld $(32 - 22), x2 ## 1, x2 ## 1;   \
	vpor x4 ## 1, x2 ## 1, x2 ## 1;        \
	vpxor RK0, x0 ## 1, x0 ## 1;           \
	vpxor RK2, x2 ## 1, x2 ## 1;           \
	vpxor RK1, x1 ## 2, x1 ## 2;           \
	vpxor RK3, x3 ## 2, x3 ## 2;           \
	vpslld $5, x0 ## 2, x4 ## 2;           \
	vpsrld $(32 - 5), x0 ## 2, x0 ## 2;    \
	vpor x4 ## 2, x0 ## 2, x0 ## 2;        \
	vpslld $22, x2 ## 2, x4 ## 2;          \
	vpsrld $(32 - 22), x2 ## 2, x2 ## 2;   \
	vpor x4 ## 2, x2 ## 2, x2 ## 2;        \
	vpxor RK0, x0 ## 2, x0 ## 2;           \
	vpxor RK2, x2 ## 2, x2 ## 2;
471
/*
 * KL2: inverse of LK2 for decryption — undo the key mixing of round i,
 * then apply the inverse linear transformation to both 4-block groups.
 * Rotate-right by r is implemented as (x >> r) | (x << (32 - r)).
 */
#define KL2(x0, x1, x2, x3, x4, i) \
	vpxor RK0, x0 ## 1, x0 ## 1;           \
	vpxor RK2, x2 ## 1, x2 ## 1;           \
	vpsrld $5, x0 ## 1, x4 ## 1;           \
	vpslld $(32 - 5), x0 ## 1, x0 ## 1;    \
	vpor x4 ## 1, x0 ## 1, x0 ## 1;        \
	vpxor RK3, x3 ## 1, x3 ## 1;           \
	vpxor RK1, x1 ## 1, x1 ## 1;           \
	vpsrld $22, x2 ## 1, x4 ## 1;          \
	vpslld $(32 - 22), x2 ## 1, x2 ## 1;   \
	vpor x4 ## 1, x2 ## 1, x2 ## 1;        \
	vpxor x3 ## 1, x2 ## 1, x2 ## 1;       \
	vpxor RK0, x0 ## 2, x0 ## 2;           \
	vpxor RK2, x2 ## 2, x2 ## 2;           \
	vpsrld $5, x0 ## 2, x4 ## 2;           \
	vpslld $(32 - 5), x0 ## 2, x0 ## 2;    \
	vpor x4 ## 2, x0 ## 2, x0 ## 2;        \
	vpxor RK3, x3 ## 2, x3 ## 2;           \
	vpxor RK1, x1 ## 2, x1 ## 2;           \
	vpsrld $22, x2 ## 2, x4 ## 2;          \
	vpslld $(32 - 22), x2 ## 2, x2 ## 2;   \
	vpor x4 ## 2, x2 ## 2, x2 ## 2;        \
	vpxor x3 ## 2, x2 ## 2, x2 ## 2;       \
	vpxor x3 ## 1, x0 ## 1, x0 ## 1;       \
	vpslld $7, x1 ## 1, x4 ## 1;           \
	vpxor x1 ## 1, x0 ## 1, x0 ## 1;       \
	vpxor x4 ## 1, x2 ## 1, x2 ## 1;       \
	vpsrld $1, x1 ## 1, x4 ## 1;           \
	vpslld $(32 - 1), x1 ## 1, x1 ## 1;    \
	vpor x4 ## 1, x1 ## 1, x1 ## 1;        \
	vpxor x3 ## 2, x0 ## 2, x0 ## 2;       \
	vpslld $7, x1 ## 2, x4 ## 2;           \
	vpxor x1 ## 2, x0 ## 2, x0 ## 2;       \
	vpxor x4 ## 2, x2 ## 2, x2 ## 2;       \
	vpsrld $1, x1 ## 2, x4 ## 2;           \
	vpslld $(32 - 1), x1 ## 2, x1 ## 2;    \
	vpor x4 ## 2, x1 ## 2, x1 ## 2;        \
	vpsrld $7, x3 ## 1, x4 ## 1;           \
	vpslld $(32 - 7), x3 ## 1, x3 ## 1;    \
	vpor x4 ## 1, x3 ## 1, x3 ## 1;        \
	vpxor x0 ## 1, x1 ## 1, x1 ## 1;       \
	vpslld $3, x0 ## 1, x4 ## 1;           \
	vpxor x4 ## 1, x3 ## 1, x3 ## 1;       \
	vpsrld $7, x3 ## 2, x4 ## 2;           \
	vpslld $(32 - 7), x3 ## 2, x3 ## 2;    \
	vpor x4 ## 2, x3 ## 2, x3 ## 2;        \
	vpxor x0 ## 2, x1 ## 2, x1 ## 2;       \
	vpslld $3, x0 ## 2, x4 ## 2;           \
	vpxor x4 ## 2, x3 ## 2, x3 ## 2;       \
	vpsrld $13, x0 ## 1, x4 ## 1;          \
	vpslld $(32 - 13), x0 ## 1, x0 ## 1;   \
	vpor x4 ## 1, x0 ## 1, x0 ## 1;        \
	vpxor x2 ## 1, x1 ## 1, x1 ## 1;       \
	vpxor x2 ## 1, x3 ## 1, x3 ## 1;       \
	vpsrld $3, x2 ## 1, x4 ## 1;           \
	vpslld $(32 - 3), x2 ## 1, x2 ## 1;    \
	vpor x4 ## 1, x2 ## 1, x2 ## 1;        \
	vpsrld $13, x0 ## 2, x4 ## 2;          \
	vpslld $(32 - 13), x0 ## 2, x0 ## 2;   \
	vpor x4 ## 2, x0 ## 2, x0 ## 2;        \
	vpxor x2 ## 2, x1 ## 2, x1 ## 2;       \
	vpxor x2 ## 2, x3 ## 2, x3 ## 2;       \
	vpsrld $3, x2 ## 2, x4 ## 2;           \
	vpslld $(32 - 3), x2 ## 2, x2 ## 2;    \
	vpor x4 ## 2, x2 ## 2, x2 ## 2;
537
/* Apply S-box SBOX (both halves) to both 4-block groups. */
#define S(SBOX, x0, x1, x2, x3, x4) \
	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);

/* Same as S(), but prefetch round-key words for round i between halves. */
#define SP(SBOX, x0, x1, x2, x3, x4, i) \
	get_key(i, 0, RK0); \
	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	get_key(i, 2, RK2); \
	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	get_key(i, 3, RK3); \
	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
	get_key(i, 1, RK1); \
	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);
553
/* 4x4 32-bit matrix transpose across four xmm registers (t0-t2 scratch). */
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	vpunpckldq		x1, x0, t0; \
	vpunpckhdq		x1, x0, t2; \
	vpunpckldq		x3, x2, t1; \
	vpunpckhdq		x3, x2, x3; \
	\
	vpunpcklqdq		t1, t0, x0; \
	vpunpckhqdq		t1, t0, x1; \
	vpunpcklqdq		x3, t2, x2; \
	vpunpckhqdq		x3, t2, x3;

/* Convert 4 loaded blocks to column-major word order for the round code. */
#define read_blocks(x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

/* Convert back to block order before storing (transpose is self-inverse). */
#define write_blocks(x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
570
.align 8
__serpent_enc_blk8_avx:
	/* input:
	 *	%rdi: ctx, CTX
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 */

	vpcmpeqd RNOT, RNOT, RNOT;	/* RNOT = all-ones, for NOT via XOR */

	read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	/* 32 rounds: key mix, S-box, linear transform (K2 for the last). */
	K2(RA, RB, RC, RD, RE, 0);
	S(S0, RA, RB, RC, RD, RE);	LK2(RC, RB, RD, RA, RE, 1);
	S(S1, RC, RB, RD, RA, RE);	LK2(RE, RD, RA, RC, RB, 2);
	S(S2, RE, RD, RA, RC, RB);	LK2(RB, RD, RE, RC, RA, 3);
	S(S3, RB, RD, RE, RC, RA);	LK2(RC, RA, RD, RB, RE, 4);
	S(S4, RC, RA, RD, RB, RE);	LK2(RA, RD, RB, RE, RC, 5);
	S(S5, RA, RD, RB, RE, RC);	LK2(RC, RA, RD, RE, RB, 6);
	S(S6, RC, RA, RD, RE, RB);	LK2(RD, RB, RA, RE, RC, 7);
	S(S7, RD, RB, RA, RE, RC);	LK2(RC, RA, RE, RD, RB, 8);
	S(S0, RC, RA, RE, RD, RB);	LK2(RE, RA, RD, RC, RB, 9);
	S(S1, RE, RA, RD, RC, RB);	LK2(RB, RD, RC, RE, RA, 10);
	S(S2, RB, RD, RC, RE, RA);	LK2(RA, RD, RB, RE, RC, 11);
	S(S3, RA, RD, RB, RE, RC);	LK2(RE, RC, RD, RA, RB, 12);
	S(S4, RE, RC, RD, RA, RB);	LK2(RC, RD, RA, RB, RE, 13);
	S(S5, RC, RD, RA, RB, RE);	LK2(RE, RC, RD, RB, RA, 14);
	S(S6, RE, RC, RD, RB, RA);	LK2(RD, RA, RC, RB, RE, 15);
	S(S7, RD, RA, RC, RB, RE);	LK2(RE, RC, RB, RD, RA, 16);
	S(S0, RE, RC, RB, RD, RA);	LK2(RB, RC, RD, RE, RA, 17);
	S(S1, RB, RC, RD, RE, RA);	LK2(RA, RD, RE, RB, RC, 18);
	S(S2, RA, RD, RE, RB, RC);	LK2(RC, RD, RA, RB, RE, 19);
	S(S3, RC, RD, RA, RB, RE);	LK2(RB, RE, RD, RC, RA, 20);
	S(S4, RB, RE, RD, RC, RA);	LK2(RE, RD, RC, RA, RB, 21);
	S(S5, RE, RD, RC, RA, RB);	LK2(RB, RE, RD, RA, RC, 22);
	S(S6, RB, RE, RD, RA, RC);	LK2(RD, RC, RE, RA, RB, 23);
	S(S7, RD, RC, RE, RA, RB);	LK2(RB, RE, RA, RD, RC, 24);
	S(S0, RB, RE, RA, RD, RC);	LK2(RA, RE, RD, RB, RC, 25);
	S(S1, RA, RE, RD, RB, RC);	LK2(RC, RD, RB, RA, RE, 26);
	S(S2, RC, RD, RB, RA, RE);	LK2(RE, RD, RC, RA, RB, 27);
	S(S3, RE, RD, RC, RA, RB);	LK2(RA, RB, RD, RE, RC, 28);
	S(S4, RA, RB, RD, RE, RC);	LK2(RB, RD, RE, RC, RA, 29);
	S(S5, RB, RD, RE, RC, RA);	LK2(RA, RB, RD, RC, RE, 30);
	S(S6, RA, RB, RD, RC, RE);	LK2(RD, RE, RB, RC, RA, 31);
	S(S7, RD, RE, RB, RC, RA);	K2(RA, RB, RC, RD, RE, 32);

	write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	ret;
ENDPROC(__serpent_enc_blk8_avx)
624
.align 8
__serpent_dec_blk8_avx:
	/* input:
	 *	%rdi: ctx, CTX
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 * output:
	 *	RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: decrypted blocks
	 */

	vpcmpeqd RNOT, RNOT, RNOT;	/* RNOT = all-ones, for NOT via XOR */

	read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	/* 32 rounds in reverse: inverse S-box, inverse LT + key unmix. */
	K2(RA, RB, RC, RD, RE, 32);
	SP(SI7, RA, RB, RC, RD, RE, 31);	KL2(RB, RD, RA, RE, RC, 31);
	SP(SI6, RB, RD, RA, RE, RC, 30);	KL2(RA, RC, RE, RB, RD, 30);
	SP(SI5, RA, RC, RE, RB, RD, 29);	KL2(RC, RD, RA, RE, RB, 29);
	SP(SI4, RC, RD, RA, RE, RB, 28);	KL2(RC, RA, RB, RE, RD, 28);
	SP(SI3, RC, RA, RB, RE, RD, 27);	KL2(RB, RC, RD, RE, RA, 27);
	SP(SI2, RB, RC, RD, RE, RA, 26);	KL2(RC, RA, RE, RD, RB, 26);
	SP(SI1, RC, RA, RE, RD, RB, 25);	KL2(RB, RA, RE, RD, RC, 25);
	SP(SI0, RB, RA, RE, RD, RC, 24);	KL2(RE, RC, RA, RB, RD, 24);
	SP(SI7, RE, RC, RA, RB, RD, 23);	KL2(RC, RB, RE, RD, RA, 23);
	SP(SI6, RC, RB, RE, RD, RA, 22);	KL2(RE, RA, RD, RC, RB, 22);
	SP(SI5, RE, RA, RD, RC, RB, 21);	KL2(RA, RB, RE, RD, RC, 21);
	SP(SI4, RA, RB, RE, RD, RC, 20);	KL2(RA, RE, RC, RD, RB, 20);
	SP(SI3, RA, RE, RC, RD, RB, 19);	KL2(RC, RA, RB, RD, RE, 19);
	SP(SI2, RC, RA, RB, RD, RE, 18);	KL2(RA, RE, RD, RB, RC, 18);
	SP(SI1, RA, RE, RD, RB, RC, 17);	KL2(RC, RE, RD, RB, RA, 17);
	SP(SI0, RC, RE, RD, RB, RA, 16);	KL2(RD, RA, RE, RC, RB, 16);
	SP(SI7, RD, RA, RE, RC, RB, 15);	KL2(RA, RC, RD, RB, RE, 15);
	SP(SI6, RA, RC, RD, RB, RE, 14);	KL2(RD, RE, RB, RA, RC, 14);
	SP(SI5, RD, RE, RB, RA, RC, 13);	KL2(RE, RC, RD, RB, RA, 13);
	SP(SI4, RE, RC, RD, RB, RA, 12);	KL2(RE, RD, RA, RB, RC, 12);
	SP(SI3, RE, RD, RA, RB, RC, 11);	KL2(RA, RE, RC, RB, RD, 11);
	SP(SI2, RA, RE, RC, RB, RD, 10);	KL2(RE, RD, RB, RC, RA, 10);
	SP(SI1, RE, RD, RB, RC, RA, 9);		KL2(RA, RD, RB, RC, RE, 9);
	SP(SI0, RA, RD, RB, RC, RE, 8);		KL2(RB, RE, RD, RA, RC, 8);
	SP(SI7, RB, RE, RD, RA, RC, 7);		KL2(RE, RA, RB, RC, RD, 7);
	SP(SI6, RE, RA, RB, RC, RD, 6);		KL2(RB, RD, RC, RE, RA, 6);
	SP(SI5, RB, RD, RC, RE, RA, 5);		KL2(RD, RA, RB, RC, RE, 5);
	SP(SI4, RD, RA, RB, RC, RE, 4);		KL2(RD, RB, RE, RC, RA, 4);
	SP(SI3, RD, RB, RE, RC, RA, 3);		KL2(RE, RD, RA, RC, RB, 3);
	SP(SI2, RE, RD, RA, RC, RB, 2);		KL2(RD, RB, RC, RA, RE, 2);
	SP(SI1, RD, RB, RC, RA, RE, 1);		KL2(RE, RB, RC, RA, RD, 1);
	S(SI0, RE, RB, RC, RA, RD);		K2(RC, RD, RB, RE, RA, 0);

	write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2);
	write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2);

	ret;
ENDPROC(__serpent_dec_blk8_avx)
678
ENTRY(serpent_ecb_enc_8way_avx)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __serpent_enc_blk8_avx;

	store_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	FRAME_END
	ret;
ENDPROC(serpent_ecb_enc_8way_avx)
696
ENTRY(serpent_ecb_dec_8way_avx)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __serpent_dec_blk8_avx;

	/* note: dec_blk8 leaves plaintext in the permuted register set */
	store_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);

	FRAME_END
	ret;
ENDPROC(serpent_ecb_dec_8way_avx)
714
ENTRY(serpent_cbc_dec_8way_avx)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __serpent_dec_blk8_avx;

	/* CBC chaining: XOR each decrypted block with previous ciphertext */
	store_cbc_8way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);

	FRAME_END
	ret;
ENDPROC(serpent_cbc_dec_8way_avx)
732
ENTRY(serpent_ctr_8way_avx)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (8 blocks)
	 *	%rdx: src (8 blocks)
	 *	%rcx: iv (little endian, 128bit)
	 */
	FRAME_BEGIN

	/* build 8 consecutive counter blocks (byte-swapped via mask) */
	load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
		      RD2, RK0, RK1, RK2);

	call __serpent_enc_blk8_avx;

	/* keystream XOR src -> dst */
	store_ctr_8way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	FRAME_END
	ret;
ENDPROC(serpent_ctr_8way_avx)
752
ENTRY(serpent_xts_enc_8way_avx)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (8 blocks)
	 *	%rdx: src (8 blocks)
	 *	%rcx: iv (t ^ a^n, GF(2^128) tweak)
	 */
	FRAME_BEGIN

	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
	load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
		      RK0, RK1, RK2, .Lxts_gf128mul_and_shl1_mask);

	call __serpent_enc_blk8_avx;

	/* dst <= regs xor IVs(in dst) */
	store_xts_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	FRAME_END
	ret;
ENDPROC(serpent_xts_enc_8way_avx)
774
ENTRY(serpent_xts_dec_8way_avx)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (8 blocks)
	 *	%rdx: src (8 blocks)
	 *	%rcx: iv (t ^ a^n, GF(2^128) tweak)
	 */
	FRAME_BEGIN

	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
	load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
		      RK0, RK1, RK2, .Lxts_gf128mul_and_shl1_mask);

	call __serpent_dec_blk8_avx;

	/* dst <= regs xor IVs(in dst); dec_blk8 output is permuted */
	store_xts_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);

	FRAME_END
	ret;
ENDPROC(serpent_xts_dec_8way_avx)
796