/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Serpent Cipher 8-way parallel algorithm (x86_64/AVX)
 *
 * Copyright (C) 2012 Johannes Goetzfried
 *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * Copyright (C) 2011-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include "glue_helper-asm-avx.S"

.file "serpent-avx-x86_64-asm_64.S"

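/* vpshufb mask that byte-swaps a 128-bit lane; load_ctr_8way (used by
 * serpent_ctr_8way_avx below) needs it to convert the big endian CTR
 * counter to little endian for the increment arithmetic and back. */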
.section	.rodata.cst16.bswap128_mask, "aM", @progbits, 16
.align 16
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
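/* Constant for load_xts_8way's GF(2^128) multiplication by x: shifting the
 * tweak left by one bit and folding the carry back in with 0x87, the low
 * byte of the XTS reduction polynomial x^128 + x^7 + x^2 + x + 1. */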
.section	.rodata.cst16.xts_gf128mul_and_shl1_mask, "aM", @progbits, 16
.align 16
.Lxts_gf128mul_and_shl1_mask:
	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0

.text

#define CTX %rdi

/**********************************************************************
  8-way AVX serpent
 **********************************************************************/
#define RA1 %xmm0
#define RB1 %xmm1
#define RC1 %xmm2
#define RD1 %xmm3
#define RE1 %xmm4

#define tp  %xmm5

#define RA2 %xmm6
#define RB2 %xmm7
#define RC2 %xmm8
#define RD2 %xmm9
#define RE2 %xmm10

#define RNOT %xmm11

#define RK0 %xmm12
#define RK1 %xmm13
#define RK2 %xmm14
#define RK3 %xmm15

#define S0_1(x0, x1, x2, x3, x4) \
	vpor x0, x3, tp; \
	vpxor x3, x0, x0; \
	vpxor x2, x3, x4; \
	vpxor RNOT, x4, x4; \
	vpxor x1, tp, x3; \
	vpand x0, x1, x1; \
	vpxor x4, x1, x1; \
	vpxor x0, x2, x2;
#define S0_2(x0, x1, x2, x3, x4) \
	vpxor x3, x0, x0; \
	vpor x0, x4, x4; \
	vpxor x2, x0, x0; \
	vpand x1, x2, x2; \
	vpxor x2, x3, x3; \
	vpxor RNOT, x1, x1; \
	vpxor x4, x2, x2; \
	vpxor x2, x1, x1;

#define S1_1(x0, x1, x2, x3, x4) \
	vpxor x0, x1, tp; \
	vpxor x3, x0, x0; \
	vpxor RNOT, x3, x3; \
	vpand tp, x1, x4; \
	vpor tp, x0, x0; \
	vpxor x2, x3, x3; \
	vpxor x3, x0, x0; \
	vpxor x3, tp, x1;
#define S1_2(x0, x1, x2, x3, x4) \
	vpxor x4, x3, x3; \
	vpor x4, x1, x1; \
	vpxor x2, x4, x4; \
	vpand x0, x2, x2; \
	vpxor x1, x2, x2; \
	vpor x0, x1, x1; \
	vpxor RNOT, x0, x0; \
	vpxor x2, x0, x0; \
	vpxor x1, x4, x4;

#define S2_1(x0, x1, x2, x3, x4) \
	vpxor RNOT, x3, x3; \
	vpxor x0, x1, x1; \
	vpand x2, x0, tp; \
	vpxor x3, tp, tp; \
	vpor x0, x3, x3; \
	vpxor x1, x2, x2; \
	vpxor x1, x3, x3; \
	vpand tp, x1, x1;
#define S2_2(x0, x1, x2, x3, x4) \
	vpxor x2, tp, tp; \
	vpand x3, x2, x2; \
	vpor x1, x3, x3; \
	vpxor RNOT, tp, tp; \
	vpxor tp, x3, x3; \
	vpxor tp, x0, x4; \
	vpxor x2, tp, x0; \
	vpor x2, x1, x1;

#define S3_1(x0, x1, x2, x3, x4) \
	vpxor x3, x1, tp; \
	vpor x0, x3, x3; \
	vpand x0, x1, x4; \
	vpxor x2, x0, x0; \
	vpxor tp, x2, x2; \
	vpand x3, tp, x1; \
	vpxor x3, x2, x2; \
	vpor x4, x0, x0; \
	vpxor x3, x4, x4;
#define S3_2(x0, x1, x2, x3, x4) \
	vpxor x0, x1, x1; \
	vpand x3, x0, x0; \
	vpand x4, x3, x3; \
	vpxor x2, x3, x3; \
	vpor x1, x4, x4; \
	vpand x1, x2, x2; \
	vpxor x3, x4, x4; \
	vpxor x3, x0, x0; \
	vpxor x2, x3, x3;

#define S4_1(x0, x1, x2, x3, x4) \
	vpand x0, x3, tp; \
	vpxor x3, x0, x0; \
	vpxor x2, tp, tp; \
	vpor x3, x2, x2; \
	vpxor x1, x0, x0; \
	vpxor tp, x3, x4; \
	vpor x0, x2, x2; \
	vpxor x1, x2, x2;
#define S4_2(x0, x1, x2, x3, x4) \
	vpand x0, x1, x1; \
	vpxor x4, x1, x1; \
	vpand x2, x4, x4; \
	vpxor tp, x2, x2; \
	vpxor x0, x4, x4; \
	vpor x1, tp, x3; \
	vpxor RNOT, x1, x1; \
	vpxor x0, x3, x3;

#define S5_1(x0, x1, x2, x3, x4) \
	vpor x0, x1, tp; \
	vpxor tp, x2, x2; \
	vpxor RNOT, x3, x3; \
	vpxor x0, x1, x4; \
	vpxor x2, x0, x0; \
	vpand x4, tp, x1; \
	vpor x3, x4, x4; \
	vpxor x0, x4, x4;
#define S5_2(x0, x1, x2, x3, x4) \
	vpand x3, x0, x0; \
	vpxor x3, x1, x1; \
	vpxor x2, x3, x3; \
	vpxor x1, x0, x0; \
	vpand x4, x2, x2; \
	vpxor x2, x1, x1; \
	vpand x0, x2, x2; \
	vpxor x2, x3, x3;

#define S6_1(x0, x1, x2, x3, x4) \
	vpxor x0, x3, x3; \
	vpxor x2, x1, tp; \
	vpxor x0, x2, x2; \
	vpand x3, x0, x0; \
	vpor x3, tp, tp; \
	vpxor RNOT, x1, x4; \
	vpxor tp, x0, x0; \
	vpxor x2, tp, x1;
#define S6_2(x0, x1, x2, x3, x4) \
	vpxor x4, x3, x3; \
	vpxor x0, x4, x4; \
	vpand x0, x2, x2; \
	vpxor x1, x4, x4; \
	vpxor x3, x2, x2; \
	vpand x1, x3, x3; \
	vpxor x0, x3, x3; \
	vpxor x2, x1, x1;

#define S7_1(x0, x1, x2, x3, x4) \
	vpxor RNOT, x1, tp; \
	vpxor RNOT, x0, x0; \
	vpand x2, tp, x1; \
	vpxor x3, x1, x1; \
	vpor tp, x3, x3; \
	vpxor x2, tp, x4; \
	vpxor x3, x2, x2; \
	vpxor x0, x3, x3; \
	vpor x1, x0, x0;
#define S7_2(x0, x1, x2, x3, x4) \
	vpand x0, x2, x2; \
	vpxor x4, x0, x0; \
	vpxor x3, x4, x4; \
	vpand x0, x3, x3; \
	vpxor x1, x4, x4; \
	vpxor x4, x2, x2; \
	vpxor x1, x3, x3; \
	vpor x0, x4, x4; \
	vpxor x1, x4, x4;

#define SI0_1(x0, x1, x2, x3, x4) \
	vpxor x0, x1, x1; \
	vpor x1, x3, tp; \
	vpxor x1, x3, x4; \
	vpxor RNOT, x0, x0; \
	vpxor tp, x2, x2; \
	vpxor x0, tp, x3; \
	vpand x1, x0, x0; \
	vpxor x2, x0, x0;
#define SI0_2(x0, x1, x2, x3, x4) \
	vpand x3, x2, x2; \
	vpxor x4, x3, x3; \
	vpxor x3, x2, x2; \
	vpxor x3, x1, x1; \
	vpand x0, x3, x3; \
	vpxor x0, x1, x1; \
	vpxor x2, x0, x0; \
	vpxor x3, x4, x4;

#define SI1_1(x0, x1, x2, x3, x4) \
	vpxor x3, x1, x1; \
	vpxor x2, x0, tp; \
	vpxor RNOT, x2, x2; \
	vpor x1, x0, x4; \
	vpxor x3, x4, x4; \
	vpand x1, x3, x3; \
	vpxor x2, x1, x1; \
	vpand x4, x2, x2;
#define SI1_2(x0, x1, x2, x3, x4) \
	vpxor x1, x4, x4; \
	vpor x3, x1, x1; \
	vpxor tp, x3, x3; \
	vpxor tp, x2, x2; \
	vpor x4, tp, x0; \
	vpxor x4, x2, x2; \
	vpxor x0, x1, x1; \
	vpxor x1, x4, x4;

#define SI2_1(x0, x1, x2, x3, x4) \
	vpxor x1, x2, x2; \
	vpxor RNOT, x3, tp; \
	vpor x2, tp, tp; \
	vpxor x3, x2, x2; \
	vpxor x0, x3, x4; \
	vpxor x1, tp, x3; \
	vpor x2, x1, x1; \
	vpxor x0, x2, x2;
#define SI2_2(x0, x1, x2, x3, x4) \
	vpxor x4, x1, x1; \
	vpor x3, x4, x4; \
	vpxor x3, x2, x2; \
	vpxor x2, x4, x4; \
	vpand x1, x2, x2; \
	vpxor x3, x2, x2; \
	vpxor x4, x3, x3; \
	vpxor x0, x4, x4;

#define SI3_1(x0, x1, x2, x3, x4) \
	vpxor x1, x2, x2; \
	vpand x2, x1, tp; \
	vpxor x0, tp, tp; \
	vpor x1, x0, x0; \
	vpxor x3, x1, x4; \
	vpxor x3, x0, x0; \
	vpor tp, x3, x3; \
	vpxor x2, tp, x1;
#define SI3_2(x0, x1, x2, x3, x4) \
	vpxor x3, x1, x1; \
	vpxor x2, x0, x0; \
	vpxor x3, x2, x2; \
	vpand x1, x3, x3; \
	vpxor x0, x1, x1; \
	vpand x2, x0, x0; \
	vpxor x3, x4, x4; \
	vpxor x0, x3, x3; \
	vpxor x1, x0, x0;

#define SI4_1(x0, x1, x2, x3, x4) \
	vpxor x3, x2, x2; \
	vpand x1, x0, tp; \
	vpxor x2, tp, tp; \
	vpor x3, x2, x2; \
	vpxor RNOT, x0, x4; \
	vpxor tp, x1, x1; \
	vpxor x2, tp, x0; \
	vpand x4, x2, x2;
#define SI4_2(x0, x1, x2, x3, x4) \
	vpxor x0, x2, x2; \
	vpor x4, x0, x0; \
	vpxor x3, x0, x0; \
	vpand x2, x3, x3; \
	vpxor x3, x4, x4; \
	vpxor x1, x3, x3; \
	vpand x0, x1, x1; \
	vpxor x1, x4, x4; \
	vpxor x3, x0, x0;

#define SI5_1(x0, x1, x2, x3, x4) \
	vpor x2, x1, tp; \
	vpxor x1, x2, x2; \
	vpxor x3, tp, tp; \
	vpand x1, x3, x3; \
	vpxor x3, x2, x2; \
	vpor x0, x3, x3; \
	vpxor RNOT, x0, x0; \
	vpxor x2, x3, x3; \
	vpor x0, x2, x2;
#define SI5_2(x0, x1, x2, x3, x4) \
	vpxor tp, x1, x4; \
	vpxor x4, x2, x2; \
	vpand x0, x4, x4; \
	vpxor tp, x0, x0; \
	vpxor x3, tp, x1; \
	vpand x2, x0, x0; \
	vpxor x3, x2, x2; \
	vpxor x2, x0, x0; \
	vpxor x4, x2, x2; \
	vpxor x3, x4, x4;

#define SI6_1(x0, x1, x2, x3, x4) \
	vpxor x2, x0, x0; \
	vpand x3, x0, tp; \
	vpxor x3, x2, x2; \
	vpxor x2, tp, tp; \
	vpxor x1, x3, x3; \
	vpor x0, x2, x2; \
	vpxor x3, x2, x2; \
	vpand tp, x3, x3;
#define SI6_2(x0, x1, x2, x3, x4) \
	vpxor RNOT, tp, tp; \
	vpxor x1, x3, x3; \
	vpand x2, x1, x1; \
	vpxor tp, x0, x4; \
	vpxor x4, x3, x3; \
	vpxor x2, x4, x4; \
	vpxor x1, tp, x0; \
	vpxor x0, x2, x2;

#define SI7_1(x0, x1, x2, x3, x4) \
	vpand x0, x3, tp; \
	vpxor x2, x0, x0; \
	vpor x3, x2, x2; \
	vpxor x1, x3, x4; \
	vpxor RNOT, x0, x0; \
	vpor tp, x1, x1; \
	vpxor x0, x4, x4; \
	vpand x2, x0, x0; \
	vpxor x1, x0, x0;
#define SI7_2(x0, x1, x2, x3, x4) \
	vpand x2, x1, x1; \
	vpxor x2, tp, x3; \
	vpxor x3, x4, x4; \
	vpand x3, x2, x2; \
	vpor x0, x3, x3; \
	vpxor x4, x1, x1; \
	vpxor x4, x3, x3; \
	vpand x0, x4, x4; \
	vpxor x2, x4, x4;

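/* Load 32-bit word j of round subkey i from the context and broadcast it to
 * all four lanes; the expanded key is an array of 4-word subkeys indexed by
 * i (0 through 32 in the round sequences below). */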
#define get_key(i, j, t) \
	vbroadcastss (4*(i)+(j))*4(CTX), t;

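/* XOR round subkey i into both four-block groups (the ## 1 and ## 2
 * register sets). */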
#define K2(x0, x1, x2, x3, x4, i) \
	get_key(i, 0, RK0); \
	get_key(i, 1, RK1); \
	get_key(i, 2, RK2); \
	get_key(i, 3, RK3); \
	vpxor RK0, x0 ## 1, x0 ## 1; \
	vpxor RK1, x1 ## 1, x1 ## 1; \
	vpxor RK2, x2 ## 1, x2 ## 1; \
	vpxor RK3, x3 ## 1, x3 ## 1; \
	vpxor RK0, x0 ## 2, x0 ## 2; \
	vpxor RK1, x1 ## 2, x1 ## 2; \
	vpxor RK2, x2 ## 2, x2 ## 2; \
	vpxor RK3, x3 ## 2, x3 ## 2;

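/* Serpent linear transformation followed by the XOR of round subkey i,
 * interleaved over both block groups:
 *	x0 <<<= 13; x2 <<<= 3;
 *	x1 ^= x0 ^ x2; x3 ^= x2 ^ (x0 << 3);
 *	x1 <<<= 1; x3 <<<= 7;
 *	x0 ^= x1 ^ x3; x2 ^= x3 ^ (x1 << 7);
 *	x0 <<<= 5; x2 <<<= 22;
 * Rotates are built from vpslld/vpsrld/vpor since AVX has no vector rotate.
 */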
#define LK2(x0, x1, x2, x3, x4, i) \
	vpslld $13, x0 ## 1, x4 ## 1; \
	vpsrld $(32 - 13), x0 ## 1, x0 ## 1; \
	vpor x4 ## 1, x0 ## 1, x0 ## 1; \
	vpxor x0 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $3, x2 ## 1, x4 ## 1; \
	vpsrld $(32 - 3), x2 ## 1, x2 ## 1; \
	vpor x4 ## 1, x2 ## 1, x2 ## 1; \
	vpxor x2 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $13, x0 ## 2, x4 ## 2; \
	vpsrld $(32 - 13), x0 ## 2, x0 ## 2; \
	vpor x4 ## 2, x0 ## 2, x0 ## 2; \
	vpxor x0 ## 2, x1 ## 2, x1 ## 2; \
	vpslld $3, x2 ## 2, x4 ## 2; \
	vpsrld $(32 - 3), x2 ## 2, x2 ## 2; \
	vpor x4 ## 2, x2 ## 2, x2 ## 2; \
	vpxor x2 ## 2, x1 ## 2, x1 ## 2; \
	vpslld $1, x1 ## 1, x4 ## 1; \
	vpsrld $(32 - 1), x1 ## 1, x1 ## 1; \
	vpor x4 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $3, x0 ## 1, x4 ## 1; \
	vpxor x2 ## 1, x3 ## 1, x3 ## 1; \
	vpxor x4 ## 1, x3 ## 1, x3 ## 1; \
	get_key(i, 1, RK1); \
	vpslld $1, x1 ## 2, x4 ## 2; \
	vpsrld $(32 - 1), x1 ## 2, x1 ## 2; \
	vpor x4 ## 2, x1 ## 2, x1 ## 2; \
	vpslld $3, x0 ## 2, x4 ## 2; \
	vpxor x2 ## 2, x3 ## 2, x3 ## 2; \
	vpxor x4 ## 2, x3 ## 2, x3 ## 2; \
	get_key(i, 3, RK3); \
	vpslld $7, x3 ## 1, x4 ## 1; \
	vpsrld $(32 - 7), x3 ## 1, x3 ## 1; \
	vpor x4 ## 1, x3 ## 1, x3 ## 1; \
	vpslld $7, x1 ## 1, x4 ## 1; \
	vpxor x1 ## 1, x0 ## 1, x0 ## 1; \
	vpxor x3 ## 1, x0 ## 1, x0 ## 1; \
	vpxor x3 ## 1, x2 ## 1, x2 ## 1; \
	vpxor x4 ## 1, x2 ## 1, x2 ## 1; \
	get_key(i, 0, RK0); \
	vpslld $7, x3 ## 2, x4 ## 2; \
	vpsrld $(32 - 7), x3 ## 2, x3 ## 2; \
	vpor x4 ## 2, x3 ## 2, x3 ## 2; \
	vpslld $7, x1 ## 2, x4 ## 2; \
	vpxor x1 ## 2, x0 ## 2, x0 ## 2; \
	vpxor x3 ## 2, x0 ## 2, x0 ## 2; \
	vpxor x3 ## 2, x2 ## 2, x2 ## 2; \
	vpxor x4 ## 2, x2 ## 2, x2 ## 2; \
	get_key(i, 2, RK2); \
	vpxor RK1, x1 ## 1, x1 ## 1; \
	vpxor RK3, x3 ## 1, x3 ## 1; \
	vpslld $5, x0 ## 1, x4 ## 1; \
	vpsrld $(32 - 5), x0 ## 1, x0 ## 1; \
	vpor x4 ## 1, x0 ## 1, x0 ## 1; \
	vpslld $22, x2 ## 1, x4 ## 1; \
	vpsrld $(32 - 22), x2 ## 1, x2 ## 1; \
	vpor x4 ## 1, x2 ## 1, x2 ## 1; \
	vpxor RK0, x0 ## 1, x0 ## 1; \
	vpxor RK2, x2 ## 1, x2 ## 1; \
	vpxor RK1, x1 ## 2, x1 ## 2; \
	vpxor RK3, x3 ## 2, x3 ## 2; \
	vpslld $5, x0 ## 2, x4 ## 2; \
	vpsrld $(32 - 5), x0 ## 2, x0 ## 2; \
	vpor x4 ## 2, x0 ## 2, x0 ## 2; \
	vpslld $22, x2 ## 2, x4 ## 2; \
	vpsrld $(32 - 22), x2 ## 2, x2 ## 2; \
	vpor x4 ## 2, x2 ## 2, x2 ## 2; \
	vpxor RK0, x0 ## 2, x0 ## 2; \
	vpxor RK2, x2 ## 2, x2 ## 2;

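/* Inverse of LK2: XOR out round subkey i, then run the linear transformation
 * backwards (right rotates, steps in reverse order). */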
#define KL2(x0, x1, x2, x3, x4, i) \
	vpxor RK0, x0 ## 1, x0 ## 1; \
	vpxor RK2, x2 ## 1, x2 ## 1; \
	vpsrld $5, x0 ## 1, x4 ## 1; \
	vpslld $(32 - 5), x0 ## 1, x0 ## 1; \
	vpor x4 ## 1, x0 ## 1, x0 ## 1; \
	vpxor RK3, x3 ## 1, x3 ## 1; \
	vpxor RK1, x1 ## 1, x1 ## 1; \
	vpsrld $22, x2 ## 1, x4 ## 1; \
	vpslld $(32 - 22), x2 ## 1, x2 ## 1; \
	vpor x4 ## 1, x2 ## 1, x2 ## 1; \
	vpxor x3 ## 1, x2 ## 1, x2 ## 1; \
	vpxor RK0, x0 ## 2, x0 ## 2; \
	vpxor RK2, x2 ## 2, x2 ## 2; \
	vpsrld $5, x0 ## 2, x4 ## 2; \
	vpslld $(32 - 5), x0 ## 2, x0 ## 2; \
	vpor x4 ## 2, x0 ## 2, x0 ## 2; \
	vpxor RK3, x3 ## 2, x3 ## 2; \
	vpxor RK1, x1 ## 2, x1 ## 2; \
	vpsrld $22, x2 ## 2, x4 ## 2; \
	vpslld $(32 - 22), x2 ## 2, x2 ## 2; \
	vpor x4 ## 2, x2 ## 2, x2 ## 2; \
	vpxor x3 ## 2, x2 ## 2, x2 ## 2; \
	vpxor x3 ## 1, x0 ## 1, x0 ## 1; \
	vpslld $7, x1 ## 1, x4 ## 1; \
	vpxor x1 ## 1, x0 ## 1, x0 ## 1; \
	vpxor x4 ## 1, x2 ## 1, x2 ## 1; \
	vpsrld $1, x1 ## 1, x4 ## 1; \
	vpslld $(32 - 1), x1 ## 1, x1 ## 1; \
	vpor x4 ## 1, x1 ## 1, x1 ## 1; \
	vpxor x3 ## 2, x0 ## 2, x0 ## 2; \
	vpslld $7, x1 ## 2, x4 ## 2; \
	vpxor x1 ## 2, x0 ## 2, x0 ## 2; \
	vpxor x4 ## 2, x2 ## 2, x2 ## 2; \
	vpsrld $1, x1 ## 2, x4 ## 2; \
	vpslld $(32 - 1), x1 ## 2, x1 ## 2; \
	vpor x4 ## 2, x1 ## 2, x1 ## 2; \
	vpsrld $7, x3 ## 1, x4 ## 1; \
	vpslld $(32 - 7), x3 ## 1, x3 ## 1; \
	vpor x4 ## 1, x3 ## 1, x3 ## 1; \
	vpxor x0 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $3, x0 ## 1, x4 ## 1; \
	vpxor x4 ## 1, x3 ## 1, x3 ## 1; \
	vpsrld $7, x3 ## 2, x4 ## 2; \
	vpslld $(32 - 7), x3 ## 2, x3 ## 2; \
	vpor x4 ## 2, x3 ## 2, x3 ## 2; \
	vpxor x0 ## 2, x1 ## 2, x1 ## 2; \
	vpslld $3, x0 ## 2, x4 ## 2; \
	vpxor x4 ## 2, x3 ## 2, x3 ## 2; \
	vpsrld $13, x0 ## 1, x4 ## 1; \
	vpslld $(32 - 13), x0 ## 1, x0 ## 1; \
	vpor x4 ## 1, x0 ## 1, x0 ## 1; \
	vpxor x2 ## 1, x1 ## 1, x1 ## 1; \
	vpxor x2 ## 1, x3 ## 1, x3 ## 1; \
	vpsrld $3, x2 ## 1, x4 ## 1; \
	vpslld $(32 - 3), x2 ## 1, x2 ## 1; \
	vpor x4 ## 1, x2 ## 1, x2 ## 1; \
	vpsrld $13, x0 ## 2, x4 ## 2; \
	vpslld $(32 - 13), x0 ## 2, x0 ## 2; \
	vpor x4 ## 2, x0 ## 2, x0 ## 2; \
	vpxor x2 ## 2, x1 ## 2, x1 ## 2; \
	vpxor x2 ## 2, x3 ## 2, x3 ## 2; \
	vpsrld $3, x2 ## 2, x4 ## 2; \
	vpslld $(32 - 3), x2 ## 2, x2 ## 2; \
	vpor x4 ## 2, x2 ## 2, x2 ## 2;

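/* S applies one S-box to both block groups; SP does the same while
 * interleaving the four loads of round subkey i (into RK0-RK3) with the
 * S-box computation to hide the load latency. */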
#define S(SBOX, x0, x1, x2, x3, x4) \
	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);

#define SP(SBOX, x0, x1, x2, x3, x4, i) \
	get_key(i, 0, RK0); \
	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	get_key(i, 2, RK2); \
	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	get_key(i, 3, RK3); \
	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
	get_key(i, 1, RK1); \
	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);

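/* 4x4 32-bit matrix transpose: after read_blocks each register holds the
 * same word of four different blocks, which is the layout the S-box and
 * linear-transformation macros operate on; write_blocks undoes it. */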
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	vpunpckldq x1, x0, t0; \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x3; \
	\
	vpunpcklqdq t1, t0, x0; \
	vpunpckhqdq t1, t0, x1; \
	vpunpcklqdq x3, t2, x2; \
	vpunpckhqdq x3, t2, x3;

#define read_blocks(x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

#define write_blocks(x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

.align 8
SYM_FUNC_START_LOCAL(__serpent_enc_blk8_avx)
	/* input:
	 *	%rdi: ctx, CTX
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 */

	vpcmpeqd RNOT, RNOT, RNOT;

	read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

					 K2(RA, RB, RC, RD, RE, 0);
	S(S0, RA, RB, RC, RD, RE);	LK2(RC, RB, RD, RA, RE, 1);
	S(S1, RC, RB, RD, RA, RE);	LK2(RE, RD, RA, RC, RB, 2);
	S(S2, RE, RD, RA, RC, RB);	LK2(RB, RD, RE, RC, RA, 3);
	S(S3, RB, RD, RE, RC, RA);	LK2(RC, RA, RD, RB, RE, 4);
	S(S4, RC, RA, RD, RB, RE);	LK2(RA, RD, RB, RE, RC, 5);
	S(S5, RA, RD, RB, RE, RC);	LK2(RC, RA, RD, RE, RB, 6);
	S(S6, RC, RA, RD, RE, RB);	LK2(RD, RB, RA, RE, RC, 7);
	S(S7, RD, RB, RA, RE, RC);	LK2(RC, RA, RE, RD, RB, 8);
	S(S0, RC, RA, RE, RD, RB);	LK2(RE, RA, RD, RC, RB, 9);
	S(S1, RE, RA, RD, RC, RB);	LK2(RB, RD, RC, RE, RA, 10);
	S(S2, RB, RD, RC, RE, RA);	LK2(RA, RD, RB, RE, RC, 11);
	S(S3, RA, RD, RB, RE, RC);	LK2(RE, RC, RD, RA, RB, 12);
	S(S4, RE, RC, RD, RA, RB);	LK2(RC, RD, RA, RB, RE, 13);
	S(S5, RC, RD, RA, RB, RE);	LK2(RE, RC, RD, RB, RA, 14);
	S(S6, RE, RC, RD, RB, RA);	LK2(RD, RA, RC, RB, RE, 15);
	S(S7, RD, RA, RC, RB, RE);	LK2(RE, RC, RB, RD, RA, 16);
	S(S0, RE, RC, RB, RD, RA);	LK2(RB, RC, RD, RE, RA, 17);
	S(S1, RB, RC, RD, RE, RA);	LK2(RA, RD, RE, RB, RC, 18);
	S(S2, RA, RD, RE, RB, RC);	LK2(RC, RD, RA, RB, RE, 19);
	S(S3, RC, RD, RA, RB, RE);	LK2(RB, RE, RD, RC, RA, 20);
	S(S4, RB, RE, RD, RC, RA);	LK2(RE, RD, RC, RA, RB, 21);
	S(S5, RE, RD, RC, RA, RB);	LK2(RB, RE, RD, RA, RC, 22);
	S(S6, RB, RE, RD, RA, RC);	LK2(RD, RC, RE, RA, RB, 23);
	S(S7, RD, RC, RE, RA, RB);	LK2(RB, RE, RA, RD, RC, 24);
	S(S0, RB, RE, RA, RD, RC);	LK2(RA, RE, RD, RB, RC, 25);
	S(S1, RA, RE, RD, RB, RC);	LK2(RC, RD, RB, RA, RE, 26);
	S(S2, RC, RD, RB, RA, RE);	LK2(RE, RD, RC, RA, RB, 27);
	S(S3, RE, RD, RC, RA, RB);	LK2(RA, RB, RD, RE, RC, 28);
	S(S4, RA, RB, RD, RE, RC);	LK2(RB, RD, RE, RC, RA, 29);
	S(S5, RB, RD, RE, RC, RA);	LK2(RA, RB, RD, RC, RE, 30);
	S(S6, RA, RB, RD, RC, RE);	LK2(RD, RE, RB, RC, RA, 31);
	S(S7, RD, RE, RB, RC, RA);	 K2(RA, RB, RC, RD, RE, 32);

	write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	ret;
SYM_FUNC_END(__serpent_enc_blk8_avx)

.align 8
SYM_FUNC_START_LOCAL(__serpent_dec_blk8_avx)
	/* input:
	 *	%rdi: ctx, CTX
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 * output:
	 *	RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: decrypted blocks
	 */

	vpcmpeqd RNOT, RNOT, RNOT;

	read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

					 K2(RA, RB, RC, RD, RE, 32);
	SP(SI7, RA, RB, RC, RD, RE, 31);	KL2(RB, RD, RA, RE, RC, 31);
	SP(SI6, RB, RD, RA, RE, RC, 30);	KL2(RA, RC, RE, RB, RD, 30);
	SP(SI5, RA, RC, RE, RB, RD, 29);	KL2(RC, RD, RA, RE, RB, 29);
	SP(SI4, RC, RD, RA, RE, RB, 28);	KL2(RC, RA, RB, RE, RD, 28);
	SP(SI3, RC, RA, RB, RE, RD, 27);	KL2(RB, RC, RD, RE, RA, 27);
	SP(SI2, RB, RC, RD, RE, RA, 26);	KL2(RC, RA, RE, RD, RB, 26);
	SP(SI1, RC, RA, RE, RD, RB, 25);	KL2(RB, RA, RE, RD, RC, 25);
	SP(SI0, RB, RA, RE, RD, RC, 24);	KL2(RE, RC, RA, RB, RD, 24);
	SP(SI7, RE, RC, RA, RB, RD, 23);	KL2(RC, RB, RE, RD, RA, 23);
	SP(SI6, RC, RB, RE, RD, RA, 22);	KL2(RE, RA, RD, RC, RB, 22);
	SP(SI5, RE, RA, RD, RC, RB, 21);	KL2(RA, RB, RE, RD, RC, 21);
	SP(SI4, RA, RB, RE, RD, RC, 20);	KL2(RA, RE, RC, RD, RB, 20);
	SP(SI3, RA, RE, RC, RD, RB, 19);	KL2(RC, RA, RB, RD, RE, 19);
	SP(SI2, RC, RA, RB, RD, RE, 18);	KL2(RA, RE, RD, RB, RC, 18);
	SP(SI1, RA, RE, RD, RB, RC, 17);	KL2(RC, RE, RD, RB, RA, 17);
	SP(SI0, RC, RE, RD, RB, RA, 16);	KL2(RD, RA, RE, RC, RB, 16);
	SP(SI7, RD, RA, RE, RC, RB, 15);	KL2(RA, RC, RD, RB, RE, 15);
	SP(SI6, RA, RC, RD, RB, RE, 14);	KL2(RD, RE, RB, RA, RC, 14);
	SP(SI5, RD, RE, RB, RA, RC, 13);	KL2(RE, RC, RD, RB, RA, 13);
	SP(SI4, RE, RC, RD, RB, RA, 12);	KL2(RE, RD, RA, RB, RC, 12);
	SP(SI3, RE, RD, RA, RB, RC, 11);	KL2(RA, RE, RC, RB, RD, 11);
	SP(SI2, RA, RE, RC, RB, RD, 10);	KL2(RE, RD, RB, RC, RA, 10);
	SP(SI1, RE, RD, RB, RC, RA, 9);		KL2(RA, RD, RB, RC, RE, 9);
	SP(SI0, RA, RD, RB, RC, RE, 8);		KL2(RB, RE, RD, RA, RC, 8);
	SP(SI7, RB, RE, RD, RA, RC, 7);		KL2(RE, RA, RB, RC, RD, 7);
	SP(SI6, RE, RA, RB, RC, RD, 6);		KL2(RB, RD, RC, RE, RA, 6);
	SP(SI5, RB, RD, RC, RE, RA, 5);		KL2(RD, RA, RB, RC, RE, 5);
	SP(SI4, RD, RA, RB, RC, RE, 4);		KL2(RD, RB, RE, RC, RA, 4);
	SP(SI3, RD, RB, RE, RC, RA, 3);		KL2(RE, RD, RA, RC, RB, 3);
	SP(SI2, RE, RD, RA, RC, RB, 2);		KL2(RD, RB, RC, RA, RE, 2);
	SP(SI1, RD, RB, RC, RA, RE, 1);		KL2(RE, RB, RC, RA, RD, 1);
	S(SI0, RE, RB, RC, RA, RD);		 K2(RC, RD, RB, RE, RA, 0);

	write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2);
	write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2);

	ret;
SYM_FUNC_END(__serpent_dec_blk8_avx)

SYM_FUNC_START(serpent_ecb_enc_8way_avx)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __serpent_enc_blk8_avx;

	store_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	FRAME_END
	ret;
SYM_FUNC_END(serpent_ecb_enc_8way_avx)

SYM_FUNC_START(serpent_ecb_dec_8way_avx)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __serpent_dec_blk8_avx;

	store_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);

	FRAME_END
	ret;
SYM_FUNC_END(serpent_ecb_dec_8way_avx)

SYM_FUNC_START(serpent_cbc_dec_8way_avx)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __serpent_dec_blk8_avx;

	store_cbc_8way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);

	FRAME_END
	ret;
SYM_FUNC_END(serpent_cbc_dec_8way_avx)

SYM_FUNC_START(serpent_ctr_8way_avx)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (8 blocks)
	 *	%rdx: src (8 blocks)
	 *	%rcx: iv (little endian, 128bit)
	 */
	FRAME_BEGIN

	load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
		      RD2, RK0, RK1, RK2);

	call __serpent_enc_blk8_avx;

	store_ctr_8way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	FRAME_END
	ret;
SYM_FUNC_END(serpent_ctr_8way_avx)

SYM_FUNC_START(serpent_xts_enc_8way_avx)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (8 blocks)
	 *	%rdx: src (8 blocks)
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */
	FRAME_BEGIN

	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
	load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
		      RK0, RK1, RK2, .Lxts_gf128mul_and_shl1_mask);

	call __serpent_enc_blk8_avx;

	/* dst <= regs xor IVs(in dst) */
	store_xts_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	FRAME_END
	ret;
SYM_FUNC_END(serpent_xts_enc_8way_avx)

SYM_FUNC_START(serpent_xts_dec_8way_avx)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (8 blocks)
	 *	%rdx: src (8 blocks)
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */
	FRAME_BEGIN

	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
	load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
		      RK0, RK1, RK2, .Lxts_gf128mul_and_shl1_mask);

	call __serpent_dec_blk8_avx;

	/* dst <= regs xor IVs(in dst) */
	store_xts_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);

	FRAME_END
	ret;
SYM_FUNC_END(serpent_xts_dec_8way_avx)