/*
 * x86_64/AVX2 assembler optimized version of Serpent
 *
 * Processes 16 blocks in parallel, as two banks of eight 32-bit words
 * spread across the %ymm registers.
 */

#include <linux/linkage.h>
#include "glue_helper-asm-avx2.S"

.file "serpent-avx2-asm_64.S"

.data
.align 16

.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.Lxts_gf128mul_and_shl1_mask_0:
	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
.Lxts_gf128mul_and_shl1_mask_1:
	.byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0

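/*
 * The two masks above parameterize the XTS tweak update in
 * glue_helper-asm-avx2.S: mask_0 encodes multiplication by x in
 * GF(2^128) (shift left by 1, reduce by 0x87), and mask_1
 * multiplication by x^2 (shift left by 2, reduce by 0x87 << 1 = 0x10e,
 * hence the 0x0e low byte), so the two 128-bit lanes of a ymm register
 * can carry consecutive tweaks.  A rough scalar C model of the mask_0
 * step (an illustrative sketch, not the vectorized code):
 *
 *	#include <stdint.h>
 *
 *	struct xts_tweak { uint64_t lo, hi; };
 *
 *	static struct xts_tweak gf128mul_x(struct xts_tweak t)
 *	{
 *		uint64_t carry = t.hi >> 63;	// bit shifted out at the top
 *
 *		t.hi = (t.hi << 1) | (t.lo >> 63);
 *		t.lo = (t.lo << 1) ^ (carry ? 0x87 : 0); // fold in reduction poly
 *		return t;
 *	}
 */
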
.text

#define CTX %rdi

#define RNOT %ymm0
#define tp %ymm1

#define RA1 %ymm2
#define RA2 %ymm3
#define RB1 %ymm4
#define RB2 %ymm5
#define RC1 %ymm6
#define RC2 %ymm7
#define RD1 %ymm8
#define RD2 %ymm9
#define RE1 %ymm10
#define RE2 %ymm11

#define RK0 %ymm12
#define RK1 %ymm13
#define RK2 %ymm14
#define RK3 %ymm15

#define RK0x %xmm12
#define RK1x %xmm13
#define RK2x %xmm14
#define RK3x %xmm15

#define S0_1(x0, x1, x2, x3, x4) \
	vpor x0, x3, tp; \
	vpxor x3, x0, x0; \
	vpxor x2, x3, x4; \
	vpxor RNOT, x4, x4; \
	vpxor x1, tp, x3; \
	vpand x0, x1, x1; \
	vpxor x4, x1, x1; \
	vpxor x0, x2, x2;
#define S0_2(x0, x1, x2, x3, x4) \
	vpxor x3, x0, x0; \
	vpor x0, x4, x4; \
	vpxor x2, x0, x0; \
	vpand x1, x2, x2; \
	vpxor x2, x3, x3; \
	vpxor RNOT, x1, x1; \
	vpxor x4, x2, x2; \
	vpxor x2, x1, x1;

#define S1_1(x0, x1, x2, x3, x4) \
	vpxor x0, x1, tp; \
	vpxor x3, x0, x0; \
	vpxor RNOT, x3, x3; \
	vpand tp, x1, x4; \
	vpor tp, x0, x0; \
	vpxor x2, x3, x3; \
	vpxor x3, x0, x0; \
	vpxor x3, tp, x1;
#define S1_2(x0, x1, x2, x3, x4) \
	vpxor x4, x3, x3; \
	vpor x4, x1, x1; \
	vpxor x2, x4, x4; \
	vpand x0, x2, x2; \
	vpxor x1, x2, x2; \
	vpor x0, x1, x1; \
	vpxor RNOT, x0, x0; \
	vpxor x2, x0, x0; \
	vpxor x1, x4, x4;

#define S2_1(x0, x1, x2, x3, x4) \
	vpxor RNOT, x3, x3; \
	vpxor x0, x1, x1; \
	vpand x2, x0, tp; \
	vpxor x3, tp, tp; \
	vpor x0, x3, x3; \
	vpxor x1, x2, x2; \
	vpxor x1, x3, x3; \
	vpand tp, x1, x1;
#define S2_2(x0, x1, x2, x3, x4) \
	vpxor x2, tp, tp; \
	vpand x3, x2, x2; \
	vpor x1, x3, x3; \
	vpxor RNOT, tp, tp; \
	vpxor tp, x3, x3; \
	vpxor tp, x0, x4; \
	vpxor x2, tp, x0; \
	vpor x2, x1, x1;

#define S3_1(x0, x1, x2, x3, x4) \
	vpxor x3, x1, tp; \
	vpor x0, x3, x3; \
	vpand x0, x1, x4; \
	vpxor x2, x0, x0; \
	vpxor tp, x2, x2; \
	vpand x3, tp, x1; \
	vpxor x3, x2, x2; \
	vpor x4, x0, x0; \
	vpxor x3, x4, x4;
#define S3_2(x0, x1, x2, x3, x4) \
	vpxor x0, x1, x1; \
	vpand x3, x0, x0; \
	vpand x4, x3, x3; \
	vpxor x2, x3, x3; \
	vpor x1, x4, x4; \
	vpand x1, x2, x2; \
	vpxor x3, x4, x4; \
	vpxor x3, x0, x0; \
	vpxor x2, x3, x3;

#define S4_1(x0, x1, x2, x3, x4) \
	vpand x0, x3, tp; \
	vpxor x3, x0, x0; \
	vpxor x2, tp, tp; \
	vpor x3, x2, x2; \
	vpxor x1, x0, x0; \
	vpxor tp, x3, x4; \
	vpor x0, x2, x2; \
	vpxor x1, x2, x2;
#define S4_2(x0, x1, x2, x3, x4) \
	vpand x0, x1, x1; \
	vpxor x4, x1, x1; \
	vpand x2, x4, x4; \
	vpxor tp, x2, x2; \
	vpxor x0, x4, x4; \
	vpor x1, tp, x3; \
	vpxor RNOT, x1, x1; \
	vpxor x0, x3, x3;

#define S5_1(x0, x1, x2, x3, x4) \
	vpor x0, x1, tp; \
	vpxor tp, x2, x2; \
	vpxor RNOT, x3, x3; \
	vpxor x0, x1, x4; \
	vpxor x2, x0, x0; \
	vpand x4, tp, x1; \
	vpor x3, x4, x4; \
	vpxor x0, x4, x4;
#define S5_2(x0, x1, x2, x3, x4) \
	vpand x3, x0, x0; \
	vpxor x3, x1, x1; \
	vpxor x2, x3, x3; \
	vpxor x1, x0, x0; \
	vpand x4, x2, x2; \
	vpxor x2, x1, x1; \
	vpand x0, x2, x2; \
	vpxor x2, x3, x3;

#define S6_1(x0, x1, x2, x3, x4) \
	vpxor x0, x3, x3; \
	vpxor x2, x1, tp; \
	vpxor x0, x2, x2; \
	vpand x3, x0, x0; \
	vpor x3, tp, tp; \
	vpxor RNOT, x1, x4; \
	vpxor tp, x0, x0; \
	vpxor x2, tp, x1;
#define S6_2(x0, x1, x2, x3, x4) \
	vpxor x4, x3, x3; \
	vpxor x0, x4, x4; \
	vpand x0, x2, x2; \
	vpxor x1, x4, x4; \
	vpxor x3, x2, x2; \
	vpand x1, x3, x3; \
	vpxor x0, x3, x3; \
	vpxor x2, x1, x1;

#define S7_1(x0, x1, x2, x3, x4) \
	vpxor RNOT, x1, tp; \
	vpxor RNOT, x0, x0; \
	vpand x2, tp, x1; \
	vpxor x3, x1, x1; \
	vpor tp, x3, x3; \
	vpxor x2, tp, x4; \
	vpxor x3, x2, x2; \
	vpxor x0, x3, x3; \
	vpor x1, x0, x0;
#define S7_2(x0, x1, x2, x3, x4) \
	vpand x0, x2, x2; \
	vpxor x4, x0, x0; \
	vpxor x3, x4, x4; \
	vpand x0, x3, x3; \
	vpxor x1, x4, x4; \
	vpxor x4, x2, x2; \
	vpxor x1, x3, x3; \
	vpor x0, x4, x4; \
	vpxor x1, x4, x4;

#define SI0_1(x0, x1, x2, x3, x4) \
	vpxor x0, x1, x1; \
	vpor x1, x3, tp; \
	vpxor x1, x3, x4; \
	vpxor RNOT, x0, x0; \
	vpxor tp, x2, x2; \
	vpxor x0, tp, x3; \
	vpand x1, x0, x0; \
	vpxor x2, x0, x0;
#define SI0_2(x0, x1, x2, x3, x4) \
	vpand x3, x2, x2; \
	vpxor x4, x3, x3; \
	vpxor x3, x2, x2; \
	vpxor x3, x1, x1; \
	vpand x0, x3, x3; \
	vpxor x0, x1, x1; \
	vpxor x2, x0, x0; \
	vpxor x3, x4, x4;

#define SI1_1(x0, x1, x2, x3, x4) \
	vpxor x3, x1, x1; \
	vpxor x2, x0, tp; \
	vpxor RNOT, x2, x2; \
	vpor x1, x0, x4; \
	vpxor x3, x4, x4; \
	vpand x1, x3, x3; \
	vpxor x2, x1, x1; \
	vpand x4, x2, x2;
#define SI1_2(x0, x1, x2, x3, x4) \
	vpxor x1, x4, x4; \
	vpor x3, x1, x1; \
	vpxor tp, x3, x3; \
	vpxor tp, x2, x2; \
	vpor x4, tp, x0; \
	vpxor x4, x2, x2; \
	vpxor x0, x1, x1; \
	vpxor x1, x4, x4;

#define SI2_1(x0, x1, x2, x3, x4) \
	vpxor x1, x2, x2; \
	vpxor RNOT, x3, tp; \
	vpor x2, tp, tp; \
	vpxor x3, x2, x2; \
	vpxor x0, x3, x4; \
	vpxor x1, tp, x3; \
	vpor x2, x1, x1; \
	vpxor x0, x2, x2;
#define SI2_2(x0, x1, x2, x3, x4) \
	vpxor x4, x1, x1; \
	vpor x3, x4, x4; \
	vpxor x3, x2, x2; \
	vpxor x2, x4, x4; \
	vpand x1, x2, x2; \
	vpxor x3, x2, x2; \
	vpxor x4, x3, x3; \
	vpxor x0, x4, x4;

#define SI3_1(x0, x1, x2, x3, x4) \
	vpxor x1, x2, x2; \
	vpand x2, x1, tp; \
	vpxor x0, tp, tp; \
	vpor x1, x0, x0; \
	vpxor x3, x1, x4; \
	vpxor x3, x0, x0; \
	vpor tp, x3, x3; \
	vpxor x2, tp, x1;
#define SI3_2(x0, x1, x2, x3, x4) \
	vpxor x3, x1, x1; \
	vpxor x2, x0, x0; \
	vpxor x3, x2, x2; \
	vpand x1, x3, x3; \
	vpxor x0, x1, x1; \
	vpand x2, x0, x0; \
	vpxor x3, x4, x4; \
	vpxor x0, x3, x3; \
	vpxor x1, x0, x0;

#define SI4_1(x0, x1, x2, x3, x4) \
	vpxor x3, x2, x2; \
	vpand x1, x0, tp; \
	vpxor x2, tp, tp; \
	vpor x3, x2, x2; \
	vpxor RNOT, x0, x4; \
	vpxor tp, x1, x1; \
	vpxor x2, tp, x0; \
	vpand x4, x2, x2;
#define SI4_2(x0, x1, x2, x3, x4) \
	vpxor x0, x2, x2; \
	vpor x4, x0, x0; \
	vpxor x3, x0, x0; \
	vpand x2, x3, x3; \
	vpxor x3, x4, x4; \
	vpxor x1, x3, x3; \
	vpand x0, x1, x1; \
	vpxor x1, x4, x4; \
	vpxor x3, x0, x0;

#define SI5_1(x0, x1, x2, x3, x4) \
	vpor x2, x1, tp; \
	vpxor x1, x2, x2; \
	vpxor x3, tp, tp; \
	vpand x1, x3, x3; \
	vpxor x3, x2, x2; \
	vpor x0, x3, x3; \
	vpxor RNOT, x0, x0; \
	vpxor x2, x3, x3; \
	vpor x0, x2, x2;
#define SI5_2(x0, x1, x2, x3, x4) \
	vpxor tp, x1, x4; \
	vpxor x4, x2, x2; \
	vpand x0, x4, x4; \
	vpxor tp, x0, x0; \
	vpxor x3, tp, x1; \
	vpand x2, x0, x0; \
	vpxor x3, x2, x2; \
	vpxor x2, x0, x0; \
	vpxor x4, x2, x2; \
	vpxor x3, x4, x4;

#define SI6_1(x0, x1, x2, x3, x4) \
	vpxor x2, x0, x0; \
	vpand x3, x0, tp; \
	vpxor x3, x2, x2; \
	vpxor x2, tp, tp; \
	vpxor x1, x3, x3; \
	vpor x0, x2, x2; \
	vpxor x3, x2, x2; \
	vpand tp, x3, x3;
#define SI6_2(x0, x1, x2, x3, x4) \
	vpxor RNOT, tp, tp; \
	vpxor x1, x3, x3; \
	vpand x2, x1, x1; \
	vpxor tp, x0, x4; \
	vpxor x4, x3, x3; \
	vpxor x2, x4, x4; \
	vpxor x1, tp, x0; \
	vpxor x0, x2, x2;

#define SI7_1(x0, x1, x2, x3, x4) \
	vpand x0, x3, tp; \
	vpxor x2, x0, x0; \
	vpor x3, x2, x2; \
	vpxor x1, x3, x4; \
	vpxor RNOT, x0, x0; \
	vpor tp, x1, x1; \
	vpxor x0, x4, x4; \
	vpand x2, x0, x0; \
	vpxor x1, x0, x0;
#define SI7_2(x0, x1, x2, x3, x4) \
	vpand x2, x1, x1; \
	vpxor x2, tp, x3; \
	vpxor x3, x4, x4; \
	vpand x3, x2, x2; \
	vpor x0, x3, x3; \
	vpxor x4, x1, x1; \
	vpxor x4, x3, x3; \
	vpand x0, x4, x4; \
	vpxor x2, x4, x4;

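/*
 * The S?_1/S?_2 and SI?_1/SI?_2 macros above are bitsliced Serpent
 * S-boxes: each 4x4 S-box is evaluated as a branch-free AND/OR/XOR/NOT
 * circuit on whole registers, so one pass computes the S-box at every
 * bit position of eight blocks per bank (NOT is done by XORing with
 * the all-ones RNOT).  A scalar C model of S0, transcribed from the
 * macros (illustrative sketch only; note "vpop src1, src2, dst" means
 * dst = src2 op src1):
 *
 *	#include <stdint.h>
 *
 *	static void serpent_s0(uint32_t *x0, uint32_t *x1, uint32_t *x2,
 *			       uint32_t *x3, uint32_t *x4)
 *	{
 *		uint32_t tp;
 *
 *		tp = *x3 | *x0;  *x0 ^= *x3;		// S0_1
 *		*x4 = ~(*x2 ^ *x3);
 *		*x3 = tp ^ *x1;
 *		*x1 = (*x1 & *x0) ^ *x4;
 *		*x2 ^= *x0;
 *		*x0 ^= *x3;  *x4 |= *x0;		// S0_2
 *		*x0 ^= *x2;  *x2 &= *x1;
 *		*x3 ^= *x2;  *x1 = ~*x1;
 *		*x2 ^= *x4;  *x1 ^= *x2;
 *	}
 */
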
#define get_key(i, j, t) \
	vpbroadcastd (4*(i)+(j))*4(CTX), t;

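/*
 * The expanded key is laid out as 33 rounds of 4 x 32-bit words, so
 * word j of round i lives at byte offset (4*i + j)*4 from CTX; e.g.
 * get_key(1, 2, RK2) broadcasts the dword at 24(CTX) ((4*1 + 2)*4 = 24)
 * into all eight lanes of RK2.  In C terms this is roughly
 * ctx->expkey[4*i + j] (assuming the struct serpent_ctx layout used by
 * the generic serpent code).
 */
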
#define K2(x0, x1, x2, x3, x4, i) \
	get_key(i, 0, RK0); \
	get_key(i, 1, RK1); \
	get_key(i, 2, RK2); \
	get_key(i, 3, RK3); \
	vpxor RK0, x0 ## 1, x0 ## 1; \
	vpxor RK1, x1 ## 1, x1 ## 1; \
	vpxor RK2, x2 ## 1, x2 ## 1; \
	vpxor RK3, x3 ## 1, x3 ## 1; \
	vpxor RK0, x0 ## 2, x0 ## 2; \
	vpxor RK1, x1 ## 2, x1 ## 2; \
	vpxor RK2, x2 ## 2, x2 ## 2; \
	vpxor RK3, x3 ## 2, x3 ## 2;

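/*
 * The "## 1"/"## 2" token pasting makes each *2 macro operate on both
 * register banks: K2(RA, RB, RC, RD, RE, 0) expands to XORs on
 * RA1/RB1/RC1/RD1 and RA2/RB2/RC2/RD2, i.e. round key 0 is absorbed by
 * all 16 blocks with only four broadcast loads.
 */
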
#define LK2(x0, x1, x2, x3, x4, i) \
	vpslld $13, x0 ## 1, x4 ## 1; \
	vpsrld $(32 - 13), x0 ## 1, x0 ## 1; \
	vpor x4 ## 1, x0 ## 1, x0 ## 1; \
	vpxor x0 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $3, x2 ## 1, x4 ## 1; \
	vpsrld $(32 - 3), x2 ## 1, x2 ## 1; \
	vpor x4 ## 1, x2 ## 1, x2 ## 1; \
	vpxor x2 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $13, x0 ## 2, x4 ## 2; \
	vpsrld $(32 - 13), x0 ## 2, x0 ## 2; \
	vpor x4 ## 2, x0 ## 2, x0 ## 2; \
	vpxor x0 ## 2, x1 ## 2, x1 ## 2; \
	vpslld $3, x2 ## 2, x4 ## 2; \
	vpsrld $(32 - 3), x2 ## 2, x2 ## 2; \
	vpor x4 ## 2, x2 ## 2, x2 ## 2; \
	vpxor x2 ## 2, x1 ## 2, x1 ## 2; \
	vpslld $1, x1 ## 1, x4 ## 1; \
	vpsrld $(32 - 1), x1 ## 1, x1 ## 1; \
	vpor x4 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $3, x0 ## 1, x4 ## 1; \
	vpxor x2 ## 1, x3 ## 1, x3 ## 1; \
	vpxor x4 ## 1, x3 ## 1, x3 ## 1; \
	get_key(i, 1, RK1); \
	vpslld $1, x1 ## 2, x4 ## 2; \
	vpsrld $(32 - 1), x1 ## 2, x1 ## 2; \
	vpor x4 ## 2, x1 ## 2, x1 ## 2; \
	vpslld $3, x0 ## 2, x4 ## 2; \
	vpxor x2 ## 2, x3 ## 2, x3 ## 2; \
	vpxor x4 ## 2, x3 ## 2, x3 ## 2; \
	get_key(i, 3, RK3); \
	vpslld $7, x3 ## 1, x4 ## 1; \
	vpsrld $(32 - 7), x3 ## 1, x3 ## 1; \
	vpor x4 ## 1, x3 ## 1, x3 ## 1; \
	vpslld $7, x1 ## 1, x4 ## 1; \
	vpxor x1 ## 1, x0 ## 1, x0 ## 1; \
	vpxor x3 ## 1, x0 ## 1, x0 ## 1; \
	vpxor x3 ## 1, x2 ## 1, x2 ## 1; \
	vpxor x4 ## 1, x2 ## 1, x2 ## 1; \
	get_key(i, 0, RK0); \
	vpslld $7, x3 ## 2, x4 ## 2; \
	vpsrld $(32 - 7), x3 ## 2, x3 ## 2; \
	vpor x4 ## 2, x3 ## 2, x3 ## 2; \
	vpslld $7, x1 ## 2, x4 ## 2; \
	vpxor x1 ## 2, x0 ## 2, x0 ## 2; \
	vpxor x3 ## 2, x0 ## 2, x0 ## 2; \
	vpxor x3 ## 2, x2 ## 2, x2 ## 2; \
	vpxor x4 ## 2, x2 ## 2, x2 ## 2; \
	get_key(i, 2, RK2); \
	vpxor RK1, x1 ## 1, x1 ## 1; \
	vpxor RK3, x3 ## 1, x3 ## 1; \
	vpslld $5, x0 ## 1, x4 ## 1; \
	vpsrld $(32 - 5), x0 ## 1, x0 ## 1; \
	vpor x4 ## 1, x0 ## 1, x0 ## 1; \
	vpslld $22, x2 ## 1, x4 ## 1; \
	vpsrld $(32 - 22), x2 ## 1, x2 ## 1; \
	vpor x4 ## 1, x2 ## 1, x2 ## 1; \
	vpxor RK0, x0 ## 1, x0 ## 1; \
	vpxor RK2, x2 ## 1, x2 ## 1; \
	vpxor RK1, x1 ## 2, x1 ## 2; \
	vpxor RK3, x3 ## 2, x3 ## 2; \
	vpslld $5, x0 ## 2, x4 ## 2; \
	vpsrld $(32 - 5), x0 ## 2, x0 ## 2; \
	vpor x4 ## 2, x0 ## 2, x0 ## 2; \
	vpslld $22, x2 ## 2, x4 ## 2; \
	vpsrld $(32 - 22), x2 ## 2, x2 ## 2; \
	vpor x4 ## 2, x2 ## 2, x2 ## 2; \
	vpxor RK0, x0 ## 2, x0 ## 2; \
	vpxor RK2, x2 ## 2, x2 ## 2;

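/*
 * LK2 is Serpent's linear transformation followed by the round-key XOR,
 * with the two banks and the next round's get_key loads interleaved to
 * hide latency; AVX2 has no vector rotate, so each rotate is a
 * vpslld/vpsrld pair combined with vpor.  A scalar model of the linear
 * transformation (sketch, with rol32(v, n) = (v << n) | (v >> (32 - n))):
 *
 *	x0 = rol32(x0, 13);
 *	x2 = rol32(x2, 3);
 *	x1 = rol32(x1 ^ x0 ^ x2, 1);
 *	x3 = rol32(x3 ^ x2 ^ (x0 << 3), 7);
 *	x0 = rol32(x0 ^ x1 ^ x3, 5);
 *	x2 = rol32(x2 ^ x3 ^ (x1 << 7), 22);
 */
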
#define KL2(x0, x1, x2, x3, x4, i) \
	vpxor RK0, x0 ## 1, x0 ## 1; \
	vpxor RK2, x2 ## 1, x2 ## 1; \
	vpsrld $5, x0 ## 1, x4 ## 1; \
	vpslld $(32 - 5), x0 ## 1, x0 ## 1; \
	vpor x4 ## 1, x0 ## 1, x0 ## 1; \
	vpxor RK3, x3 ## 1, x3 ## 1; \
	vpxor RK1, x1 ## 1, x1 ## 1; \
	vpsrld $22, x2 ## 1, x4 ## 1; \
	vpslld $(32 - 22), x2 ## 1, x2 ## 1; \
	vpor x4 ## 1, x2 ## 1, x2 ## 1; \
	vpxor x3 ## 1, x2 ## 1, x2 ## 1; \
	vpxor RK0, x0 ## 2, x0 ## 2; \
	vpxor RK2, x2 ## 2, x2 ## 2; \
	vpsrld $5, x0 ## 2, x4 ## 2; \
	vpslld $(32 - 5), x0 ## 2, x0 ## 2; \
	vpor x4 ## 2, x0 ## 2, x0 ## 2; \
	vpxor RK3, x3 ## 2, x3 ## 2; \
	vpxor RK1, x1 ## 2, x1 ## 2; \
	vpsrld $22, x2 ## 2, x4 ## 2; \
	vpslld $(32 - 22), x2 ## 2, x2 ## 2; \
	vpor x4 ## 2, x2 ## 2, x2 ## 2; \
	vpxor x3 ## 2, x2 ## 2, x2 ## 2; \
	vpxor x3 ## 1, x0 ## 1, x0 ## 1; \
	vpslld $7, x1 ## 1, x4 ## 1; \
	vpxor x1 ## 1, x0 ## 1, x0 ## 1; \
	vpxor x4 ## 1, x2 ## 1, x2 ## 1; \
	vpsrld $1, x1 ## 1, x4 ## 1; \
	vpslld $(32 - 1), x1 ## 1, x1 ## 1; \
	vpor x4 ## 1, x1 ## 1, x1 ## 1; \
	vpxor x3 ## 2, x0 ## 2, x0 ## 2; \
	vpslld $7, x1 ## 2, x4 ## 2; \
	vpxor x1 ## 2, x0 ## 2, x0 ## 2; \
	vpxor x4 ## 2, x2 ## 2, x2 ## 2; \
	vpsrld $1, x1 ## 2, x4 ## 2; \
	vpslld $(32 - 1), x1 ## 2, x1 ## 2; \
	vpor x4 ## 2, x1 ## 2, x1 ## 2; \
	vpsrld $7, x3 ## 1, x4 ## 1; \
	vpslld $(32 - 7), x3 ## 1, x3 ## 1; \
	vpor x4 ## 1, x3 ## 1, x3 ## 1; \
	vpxor x0 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $3, x0 ## 1, x4 ## 1; \
	vpxor x4 ## 1, x3 ## 1, x3 ## 1; \
	vpsrld $7, x3 ## 2, x4 ## 2; \
	vpslld $(32 - 7), x3 ## 2, x3 ## 2; \
	vpor x4 ## 2, x3 ## 2, x3 ## 2; \
	vpxor x0 ## 2, x1 ## 2, x1 ## 2; \
	vpslld $3, x0 ## 2, x4 ## 2; \
	vpxor x4 ## 2, x3 ## 2, x3 ## 2; \
	vpsrld $13, x0 ## 1, x4 ## 1; \
	vpslld $(32 - 13), x0 ## 1, x0 ## 1; \
	vpor x4 ## 1, x0 ## 1, x0 ## 1; \
	vpxor x2 ## 1, x1 ## 1, x1 ## 1; \
	vpxor x2 ## 1, x3 ## 1, x3 ## 1; \
	vpsrld $3, x2 ## 1, x4 ## 1; \
	vpslld $(32 - 3), x2 ## 1, x2 ## 1; \
	vpor x4 ## 1, x2 ## 1, x2 ## 1; \
	vpsrld $13, x0 ## 2, x4 ## 2; \
	vpslld $(32 - 13), x0 ## 2, x0 ## 2; \
	vpor x4 ## 2, x0 ## 2, x0 ## 2; \
	vpxor x2 ## 2, x1 ## 2, x1 ## 2; \
	vpxor x2 ## 2, x3 ## 2, x3 ## 2; \
	vpsrld $3, x2 ## 2, x4 ## 2; \
	vpslld $(32 - 3), x2 ## 2, x2 ## 2; \
	vpor x4 ## 2, x2 ## 2, x2 ## 2;

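/*
 * KL2 is the decryption-side counterpart: the round key is XORed in
 * first, then the inverse linear transformation is applied (note the
 * vpsrld/vpslld roles swap, giving right rotates).  Scalar model
 * (sketch, with ror32(v, n) = (v >> n) | (v << (32 - n))):
 *
 *	x2 = ror32(x2, 22) ^ x3 ^ (x1 << 7);
 *	x0 = ror32(x0, 5) ^ x1 ^ x3;
 *	x3 = ror32(x3, 7) ^ x2 ^ (x0 << 3);
 *	x1 = ror32(x1, 1) ^ x0 ^ x2;
 *	x2 = ror32(x2, 3);
 *	x0 = ror32(x0, 13);
 */
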
#define S(SBOX, x0, x1, x2, x3, x4) \
	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);

#define SP(SBOX, x0, x1, x2, x3, x4, i) \
	get_key(i, 0, RK0); \
	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	get_key(i, 2, RK2); \
	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	get_key(i, 3, RK3); \
	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
	get_key(i, 1, RK1); \
	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);

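/*
 * S() runs one bitsliced S-box over both register banks; SP() does the
 * same but interleaves the four get_key broadcasts for round i between
 * the S-box halves, so on the decryption path the key loads for the
 * following KL2 overlap the S-box arithmetic.
 */
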
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	vpunpckldq x1, x0, t0; \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x3; \
	\
	vpunpcklqdq t1, t0, x0; \
	vpunpckhqdq t1, t0, x1; \
	vpunpcklqdq x3, t2, x2; \
	vpunpckhqdq x3, t2, x3;

#define read_blocks(x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

#define write_blocks(x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

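/*
 * transpose_4x4 converts between "one block per register" and "one
 * 32-bit word position per register" layouts: after read_blocks, x0
 * holds word 0 of four blocks, x1 word 1, and so on, which is the
 * layout the bitsliced S-boxes expect; write_blocks undoes it.  Since
 * the vpunpck* instructions operate per 128-bit lane, each ymm
 * transpose handles two independent groups of four blocks at once.
 */
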
.align 8
__serpent_enc_blk16:
	/* input:
	 *	%rdi: ctx, CTX
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: plaintext
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: ciphertext
	 */

	vpcmpeqd RNOT, RNOT, RNOT;	/* RNOT = all ones, for vector NOT */

	read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	K2(RA, RB, RC, RD, RE, 0);
	S(S0, RA, RB, RC, RD, RE); LK2(RC, RB, RD, RA, RE, 1);
	S(S1, RC, RB, RD, RA, RE); LK2(RE, RD, RA, RC, RB, 2);
	S(S2, RE, RD, RA, RC, RB); LK2(RB, RD, RE, RC, RA, 3);
	S(S3, RB, RD, RE, RC, RA); LK2(RC, RA, RD, RB, RE, 4);
	S(S4, RC, RA, RD, RB, RE); LK2(RA, RD, RB, RE, RC, 5);
	S(S5, RA, RD, RB, RE, RC); LK2(RC, RA, RD, RE, RB, 6);
	S(S6, RC, RA, RD, RE, RB); LK2(RD, RB, RA, RE, RC, 7);
	S(S7, RD, RB, RA, RE, RC); LK2(RC, RA, RE, RD, RB, 8);
	S(S0, RC, RA, RE, RD, RB); LK2(RE, RA, RD, RC, RB, 9);
	S(S1, RE, RA, RD, RC, RB); LK2(RB, RD, RC, RE, RA, 10);
	S(S2, RB, RD, RC, RE, RA); LK2(RA, RD, RB, RE, RC, 11);
	S(S3, RA, RD, RB, RE, RC); LK2(RE, RC, RD, RA, RB, 12);
	S(S4, RE, RC, RD, RA, RB); LK2(RC, RD, RA, RB, RE, 13);
	S(S5, RC, RD, RA, RB, RE); LK2(RE, RC, RD, RB, RA, 14);
	S(S6, RE, RC, RD, RB, RA); LK2(RD, RA, RC, RB, RE, 15);
	S(S7, RD, RA, RC, RB, RE); LK2(RE, RC, RB, RD, RA, 16);
	S(S0, RE, RC, RB, RD, RA); LK2(RB, RC, RD, RE, RA, 17);
	S(S1, RB, RC, RD, RE, RA); LK2(RA, RD, RE, RB, RC, 18);
	S(S2, RA, RD, RE, RB, RC); LK2(RC, RD, RA, RB, RE, 19);
	S(S3, RC, RD, RA, RB, RE); LK2(RB, RE, RD, RC, RA, 20);
	S(S4, RB, RE, RD, RC, RA); LK2(RE, RD, RC, RA, RB, 21);
	S(S5, RE, RD, RC, RA, RB); LK2(RB, RE, RD, RA, RC, 22);
	S(S6, RB, RE, RD, RA, RC); LK2(RD, RC, RE, RA, RB, 23);
	S(S7, RD, RC, RE, RA, RB); LK2(RB, RE, RA, RD, RC, 24);
	S(S0, RB, RE, RA, RD, RC); LK2(RA, RE, RD, RB, RC, 25);
	S(S1, RA, RE, RD, RB, RC); LK2(RC, RD, RB, RA, RE, 26);
	S(S2, RC, RD, RB, RA, RE); LK2(RE, RD, RC, RA, RB, 27);
	S(S3, RE, RD, RC, RA, RB); LK2(RA, RB, RD, RE, RC, 28);
	S(S4, RA, RB, RD, RE, RC); LK2(RB, RD, RE, RC, RA, 29);
	S(S5, RB, RD, RE, RC, RA); LK2(RA, RB, RD, RC, RE, 30);
	S(S6, RA, RB, RD, RC, RE); LK2(RD, RE, RB, RC, RA, 31);
	S(S7, RD, RE, RB, RC, RA); K2(RA, RB, RC, RD, RE, 32);

	write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	ret;
ENDPROC(__serpent_enc_blk16)

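/*
 * Encryption is 32 rounds: the initial K2 absorbs round key 0, each
 * S() applies S-box (round mod 8), and each LK2 applies the linear
 * transform merged with the XOR of the next round key; the final round
 * replaces LK2 with a plain key XOR against round key 32.  The rotated
 * register names between rounds avoid explicit moves.
 */
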
.align 8
__serpent_dec_blk16:
	/* input:
	 *	%rdi: ctx, CTX
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: ciphertext
	 * output:
	 *	RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: plaintext
	 */

	vpcmpeqd RNOT, RNOT, RNOT;	/* RNOT = all ones, for vector NOT */

	read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	K2(RA, RB, RC, RD, RE, 32);
	SP(SI7, RA, RB, RC, RD, RE, 31); KL2(RB, RD, RA, RE, RC, 31);
	SP(SI6, RB, RD, RA, RE, RC, 30); KL2(RA, RC, RE, RB, RD, 30);
	SP(SI5, RA, RC, RE, RB, RD, 29); KL2(RC, RD, RA, RE, RB, 29);
	SP(SI4, RC, RD, RA, RE, RB, 28); KL2(RC, RA, RB, RE, RD, 28);
	SP(SI3, RC, RA, RB, RE, RD, 27); KL2(RB, RC, RD, RE, RA, 27);
	SP(SI2, RB, RC, RD, RE, RA, 26); KL2(RC, RA, RE, RD, RB, 26);
	SP(SI1, RC, RA, RE, RD, RB, 25); KL2(RB, RA, RE, RD, RC, 25);
	SP(SI0, RB, RA, RE, RD, RC, 24); KL2(RE, RC, RA, RB, RD, 24);
	SP(SI7, RE, RC, RA, RB, RD, 23); KL2(RC, RB, RE, RD, RA, 23);
	SP(SI6, RC, RB, RE, RD, RA, 22); KL2(RE, RA, RD, RC, RB, 22);
	SP(SI5, RE, RA, RD, RC, RB, 21); KL2(RA, RB, RE, RD, RC, 21);
	SP(SI4, RA, RB, RE, RD, RC, 20); KL2(RA, RE, RC, RD, RB, 20);
	SP(SI3, RA, RE, RC, RD, RB, 19); KL2(RC, RA, RB, RD, RE, 19);
	SP(SI2, RC, RA, RB, RD, RE, 18); KL2(RA, RE, RD, RB, RC, 18);
	SP(SI1, RA, RE, RD, RB, RC, 17); KL2(RC, RE, RD, RB, RA, 17);
	SP(SI0, RC, RE, RD, RB, RA, 16); KL2(RD, RA, RE, RC, RB, 16);
	SP(SI7, RD, RA, RE, RC, RB, 15); KL2(RA, RC, RD, RB, RE, 15);
	SP(SI6, RA, RC, RD, RB, RE, 14); KL2(RD, RE, RB, RA, RC, 14);
	SP(SI5, RD, RE, RB, RA, RC, 13); KL2(RE, RC, RD, RB, RA, 13);
	SP(SI4, RE, RC, RD, RB, RA, 12); KL2(RE, RD, RA, RB, RC, 12);
	SP(SI3, RE, RD, RA, RB, RC, 11); KL2(RA, RE, RC, RB, RD, 11);
	SP(SI2, RA, RE, RC, RB, RD, 10); KL2(RE, RD, RB, RC, RA, 10);
	SP(SI1, RE, RD, RB, RC, RA, 9); KL2(RA, RD, RB, RC, RE, 9);
	SP(SI0, RA, RD, RB, RC, RE, 8); KL2(RB, RE, RD, RA, RC, 8);
	SP(SI7, RB, RE, RD, RA, RC, 7); KL2(RE, RA, RB, RC, RD, 7);
	SP(SI6, RE, RA, RB, RC, RD, 6); KL2(RB, RD, RC, RE, RA, 6);
	SP(SI5, RB, RD, RC, RE, RA, 5); KL2(RD, RA, RB, RC, RE, 5);
	SP(SI4, RD, RA, RB, RC, RE, 4); KL2(RD, RB, RE, RC, RA, 4);
	SP(SI3, RD, RB, RE, RC, RA, 3); KL2(RE, RD, RA, RC, RB, 3);
	SP(SI2, RE, RD, RA, RC, RB, 2); KL2(RD, RB, RC, RA, RE, 2);
	SP(SI1, RD, RB, RC, RA, RE, 1); KL2(RE, RB, RC, RA, RD, 1);
	S(SI0, RE, RB, RC, RA, RD); K2(RC, RD, RB, RE, RA, 0);

	write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2);
	write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2);

	ret;
ENDPROC(__serpent_dec_blk16)

ENTRY(serpent_ecb_enc_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	vzeroupper;

	load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __serpent_enc_blk16;

	store_16way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	vzeroupper;

	ret;
ENDPROC(serpent_ecb_enc_16way)

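/*
 * Seen from C, these entry points follow the usual glue-code shape (a
 * sketch of the expected prototypes, assuming the serpent-avx2 glue):
 *
 *	asmlinkage void serpent_ecb_enc_16way(struct serpent_ctx *ctx,
 *					      u8 *dst, const u8 *src);
 *	asmlinkage void serpent_ecb_dec_16way(struct serpent_ctx *ctx,
 *					      u8 *dst, const u8 *src);
 *
 * %rdi/%rsi/%rdx carry ctx/dst/src per the System V x86_64 ABI, and
 * vzeroupper avoids AVX-to-SSE transition penalties around the call.
 */
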
ENTRY(serpent_ecb_dec_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	vzeroupper;

	load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __serpent_dec_blk16;

	store_16way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);

	vzeroupper;

	ret;
ENDPROC(serpent_ecb_dec_16way)

ENTRY(serpent_cbc_dec_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	vzeroupper;

	load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __serpent_dec_blk16;

	store_cbc_16way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2,
			RK0);

	vzeroupper;

	ret;
ENDPROC(serpent_cbc_dec_16way)

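/*
 * CBC decryption decrypts all 16 blocks first and only then XORs each
 * result with the preceding ciphertext block, which is why
 * store_cbc_16way receives both src (%rdx) and dst (%rsi).  This is
 * what lets CBC decryption, unlike CBC encryption, run 16-way in
 * parallel.
 */
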
ENTRY(serpent_ctr_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 *	%rcx: iv (little endian, 128bit)
	 */

	vzeroupper;

	load_ctr_16way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
		       RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT,
		       tp);

	call __serpent_enc_blk16;

	store_ctr_16way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	vzeroupper;

	ret;
ENDPROC(serpent_ctr_16way)

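/*
 * CTR mode turns the block cipher into a stream cipher: roughly,
 * load_ctr_16way (defined in glue_helper-asm-avx2.S) expands the IV at
 * %rcx into 16 consecutive counter blocks, using .Lbswap128_mask so
 * the increments happen on a big-endian counter, the counters are
 * encrypted, and store_ctr_16way XORs the keystream with src into dst.
 */
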
ENTRY(serpent_xts_enc_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 *	%rcx: iv (initial XTS tweak)
	 */

	vzeroupper;

	load_xts_16way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
		       RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT,
		       .Lxts_gf128mul_and_shl1_mask_0,
		       .Lxts_gf128mul_and_shl1_mask_1);

	call __serpent_enc_blk16;

	store_xts_16way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	vzeroupper;

	ret;
ENDPROC(serpent_xts_enc_16way)

ENTRY(serpent_xts_dec_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 *	%rcx: iv (initial XTS tweak)
	 */

	vzeroupper;

	load_xts_16way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
		       RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT,
		       .Lxts_gf128mul_and_shl1_mask_0,
		       .Lxts_gf128mul_and_shl1_mask_1);

	call __serpent_dec_blk16;

	store_xts_16way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);

	vzeroupper;

	ret;
ENDPROC(serpent_xts_dec_16way)