1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17#include <linux/linkage.h>
18#include <asm/frame.h>
19#include "glue_helper-asm-avx2.S"
20
21.file "serpent-avx2-asm_64.S"
22
/*
 * Constant tables.  They are not referenced directly by the code in this
 * file; their labels are handed to the load_ctr_16way / load_xts_16way
 * helper macros in the entry points further down.
 */
.data
.align 16

/* Byte indices 15..0; passed to load_ctr_16way as its byte-swap mask. */
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

/* Mask pair passed to load_xts_16way (presumably for the tweak update). */
.Lxts_gf128mul_and_shl1_mask_0:
	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
.Lxts_gf128mul_and_shl1_mask_1:
	.byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0
32
33.text
34
/* Key context pointer; get_key() reads the round-key words through it. */
#define CTX	%rdi

/*
 * RNOT is filled with all-ones by the block functions (xor with it is a
 * bitwise NOT); tp is the temporary used inside the S-box macros.
 */
#define RNOT	%ymm0
#define tp	%ymm1

/*
 * Two parallel sets of five work registers.  The round macros take the
 * stem (RA..RE) and paste the set number (1 or 2) onto it.
 */
#define RA1	%ymm2
#define RA2	%ymm3
#define RB1	%ymm4
#define RB2	%ymm5
#define RC1	%ymm6
#define RC2	%ymm7
#define RD1	%ymm8
#define RD2	%ymm9
#define RE1	%ymm10
#define RE2	%ymm11

/* Round-key words 0..3 of the current round, broadcast by get_key(). */
#define RK0	%ymm12
#define RK1	%ymm13
#define RK2	%ymm14
#define RK3	%ymm15

/* 128-bit views of the RK registers, needed by the CTR/XTS load helpers. */
#define RK0x	%xmm12
#define RK1x	%xmm13
#define RK2x	%xmm14
#define RK3x	%xmm15
60
/*
 * S-box 0 as a straight-line sequence of boolean operations, split into
 * two halves.  S0_1 must be followed by S0_2 on the same registers.
 * x0..x3 carry the state, x4 is a fifth work register, tp is scratch
 * (written in S0_1 only).  The result words end up in a permutation of
 * x0..x4, which the callers account for by reordering the arguments of
 * the next round macro.
 */
#define S0_1(x0, x1, x2, x3, x4) \
	vpor	x0, x3, tp; \
	vpxor	x3, x0, x0; \
	vpxor	x2, x3, x4; \
	vpxor	RNOT, x4, x4; \
	vpxor	x1, tp, x3; \
	vpand	x0, x1, x1; \
	vpxor	x4, x1, x1; \
	vpxor	x0, x2, x2;
#define S0_2(x0, x1, x2, x3, x4) \
	vpxor	x3, x0, x0; \
	vpor	x0, x4, x4; \
	vpxor	x2, x0, x0; \
	vpand	x1, x2, x2; \
	vpxor	x2, x3, x3; \
	vpxor	RNOT, x1, x1; \
	vpxor	x4, x2, x2; \
	vpxor	x2, x1, x1;
79
/*
 * S-box 1 in two halves; S1_1 must be followed by S1_2 on the same
 * registers.  x4 is written in S1_1 and consumed in S1_2; tp is scratch
 * within S1_1.  Results are left in a permutation of x0..x4.
 */
#define S1_1(x0, x1, x2, x3, x4) \
	vpxor	x0, x1, tp; \
	vpxor	x3, x0, x0; \
	vpxor	RNOT, x3, x3; \
	vpand	tp, x1, x4; \
	vpor	tp, x0, x0; \
	vpxor	x2, x3, x3; \
	vpxor	x3, x0, x0; \
	vpxor	x3, tp, x1;
#define S1_2(x0, x1, x2, x3, x4) \
	vpxor	x4, x3, x3; \
	vpor	x4, x1, x1; \
	vpxor	x2, x4, x4; \
	vpand	x0, x2, x2; \
	vpxor	x1, x2, x2; \
	vpor	x0, x1, x1; \
	vpxor	RNOT, x0, x0; \
	vpxor	x2, x0, x0; \
	vpxor	x1, x4, x4;
99
/*
 * S-box 2 in two halves; S2_1 must be followed by S2_2 on the same
 * registers.  tp is live across the two halves (S2_2 reads the value
 * S2_1 left in it), so nothing that clobbers tp may be placed between
 * them.  x4 is first written in S2_2.
 */
#define S2_1(x0, x1, x2, x3, x4) \
	vpxor	RNOT, x3, x3; \
	vpxor	x0, x1, x1; \
	vpand	x2, x0, tp; \
	vpxor	x3, tp, tp; \
	vpor	x0, x3, x3; \
	vpxor	x1, x2, x2; \
	vpxor	x1, x3, x3; \
	vpand	tp, x1, x1;
#define S2_2(x0, x1, x2, x3, x4) \
	vpxor	x2, tp, tp; \
	vpand	x3, x2, x2; \
	vpor	x1, x3, x3; \
	vpxor	RNOT, tp, tp; \
	vpxor	tp, x3, x3; \
	vpxor	tp, x0, x4; \
	vpxor	x2, tp, x0; \
	vpor	x2, x1, x1;
118
/*
 * S-box 3 in two halves; S3_1 must be followed by S3_2 on the same
 * registers.  x4 carries state from the first half into the second;
 * tp is scratch within S3_1.  This S-box does not use RNOT.
 */
#define S3_1(x0, x1, x2, x3, x4) \
	vpxor	x3, x1, tp; \
	vpor	x0, x3, x3; \
	vpand	x0, x1, x4; \
	vpxor	x2, x0, x0; \
	vpxor	tp, x2, x2; \
	vpand	x3, tp, x1; \
	vpxor	x3, x2, x2; \
	vpor	x4, x0, x0; \
	vpxor	x3, x4, x4;
#define S3_2(x0, x1, x2, x3, x4) \
	vpxor	x0, x1, x1; \
	vpand	x3, x0, x0; \
	vpand	x4, x3, x3; \
	vpxor	x2, x3, x3; \
	vpor	x1, x4, x4; \
	vpand	x1, x2, x2; \
	vpxor	x3, x4, x4; \
	vpxor	x3, x0, x0; \
	vpxor	x2, x3, x3;
139
/*
 * S-box 4 in two halves; S4_1 must be followed by S4_2 on the same
 * registers.  Both x4 and tp are live across the two halves.
 */
#define S4_1(x0, x1, x2, x3, x4) \
	vpand	x0, x3, tp; \
	vpxor	x3, x0, x0; \
	vpxor	x2, tp, tp; \
	vpor	x3, x2, x2; \
	vpxor	x1, x0, x0; \
	vpxor	tp, x3, x4; \
	vpor	x0, x2, x2; \
	vpxor	x1, x2, x2;
#define S4_2(x0, x1, x2, x3, x4) \
	vpand	x0, x1, x1; \
	vpxor	x4, x1, x1; \
	vpand	x2, x4, x4; \
	vpxor	tp, x2, x2; \
	vpxor	x0, x4, x4; \
	vpor	x1, tp, x3; \
	vpxor	RNOT, x1, x1; \
	vpxor	x0, x3, x3;
158
/*
 * S-box 5 in two halves; S5_1 must be followed by S5_2 on the same
 * registers.  x4 carries state into the second half; tp is scratch
 * within S5_1.
 */
#define S5_1(x0, x1, x2, x3, x4) \
	vpor	x0, x1, tp; \
	vpxor	tp, x2, x2; \
	vpxor	RNOT, x3, x3; \
	vpxor	x0, x1, x4; \
	vpxor	x2, x0, x0; \
	vpand	x4, tp, x1; \
	vpor	x3, x4, x4; \
	vpxor	x0, x4, x4;
#define S5_2(x0, x1, x2, x3, x4) \
	vpand	x3, x0, x0; \
	vpxor	x3, x1, x1; \
	vpxor	x2, x3, x3; \
	vpxor	x1, x0, x0; \
	vpand	x4, x2, x2; \
	vpxor	x2, x1, x1; \
	vpand	x0, x2, x2; \
	vpxor	x2, x3, x3;
177
/*
 * S-box 6 in two halves; S6_1 must be followed by S6_2 on the same
 * registers.  x4 carries state into the second half; tp is scratch
 * within S6_1.
 */
#define S6_1(x0, x1, x2, x3, x4) \
	vpxor	x0, x3, x3; \
	vpxor	x2, x1, tp; \
	vpxor	x0, x2, x2; \
	vpand	x3, x0, x0; \
	vpor	x3, tp, tp; \
	vpxor	RNOT, x1, x4; \
	vpxor	tp, x0, x0; \
	vpxor	x2, tp, x1;
#define S6_2(x0, x1, x2, x3, x4) \
	vpxor	x4, x3, x3; \
	vpxor	x0, x4, x4; \
	vpand	x0, x2, x2; \
	vpxor	x1, x4, x4; \
	vpxor	x3, x2, x2; \
	vpand	x1, x3, x3; \
	vpxor	x0, x3, x3; \
	vpxor	x2, x1, x1;
196
/*
 * S-box 7 in two halves; S7_1 must be followed by S7_2 on the same
 * registers.  x4 carries state into the second half; tp is scratch
 * within S7_1.
 */
#define S7_1(x0, x1, x2, x3, x4) \
	vpxor	RNOT, x1, tp; \
	vpxor	RNOT, x0, x0; \
	vpand	x2, tp, x1; \
	vpxor	x3, x1, x1; \
	vpor	tp, x3, x3; \
	vpxor	x2, tp, x4; \
	vpxor	x3, x2, x2; \
	vpxor	x0, x3, x3; \
	vpor	x1, x0, x0;
#define S7_2(x0, x1, x2, x3, x4) \
	vpand	x0, x2, x2; \
	vpxor	x4, x0, x0; \
	vpxor	x3, x4, x4; \
	vpand	x0, x3, x3; \
	vpxor	x1, x4, x4; \
	vpxor	x4, x2, x2; \
	vpxor	x1, x3, x3; \
	vpor	x0, x4, x4; \
	vpxor	x1, x4, x4;
217
/*
 * Inverse S-box 0 (used on the decryption path) in two halves; SI0_1
 * must be followed by SI0_2 on the same registers.  x4 carries state
 * into the second half; tp is scratch within SI0_1.  Results are left
 * in a permutation of x0..x4 that the callers compensate for.
 */
#define SI0_1(x0, x1, x2, x3, x4) \
	vpxor	x0, x1, x1; \
	vpor	x1, x3, tp; \
	vpxor	x1, x3, x4; \
	vpxor	RNOT, x0, x0; \
	vpxor	tp, x2, x2; \
	vpxor	x0, tp, x3; \
	vpand	x1, x0, x0; \
	vpxor	x2, x0, x0;
#define SI0_2(x0, x1, x2, x3, x4) \
	vpand	x3, x2, x2; \
	vpxor	x4, x3, x3; \
	vpxor	x3, x2, x2; \
	vpxor	x3, x1, x1; \
	vpand	x0, x3, x3; \
	vpxor	x0, x1, x1; \
	vpxor	x2, x0, x0; \
	vpxor	x3, x4, x4;
236
/*
 * Inverse S-box 1 in two halves; SI1_1 must be followed by SI1_2 on the
 * same registers.  Both x4 and tp are live across the two halves.
 */
#define SI1_1(x0, x1, x2, x3, x4) \
	vpxor	x3, x1, x1; \
	vpxor	x2, x0, tp; \
	vpxor	RNOT, x2, x2; \
	vpor	x1, x0, x4; \
	vpxor	x3, x4, x4; \
	vpand	x1, x3, x3; \
	vpxor	x2, x1, x1; \
	vpand	x4, x2, x2;
#define SI1_2(x0, x1, x2, x3, x4) \
	vpxor	x1, x4, x4; \
	vpor	x3, x1, x1; \
	vpxor	tp, x3, x3; \
	vpxor	tp, x2, x2; \
	vpor	x4, tp, x0; \
	vpxor	x4, x2, x2; \
	vpxor	x0, x1, x1; \
	vpxor	x1, x4, x4;
255
/*
 * Inverse S-box 2 in two halves; SI2_1 must be followed by SI2_2 on the
 * same registers.  x4 carries state into the second half; tp is scratch
 * within SI2_1.
 */
#define SI2_1(x0, x1, x2, x3, x4) \
	vpxor	x1, x2, x2; \
	vpxor	RNOT, x3, tp; \
	vpor	x2, tp, tp; \
	vpxor	x3, x2, x2; \
	vpxor	x0, x3, x4; \
	vpxor	x1, tp, x3; \
	vpor	x2, x1, x1; \
	vpxor	x0, x2, x2;
#define SI2_2(x0, x1, x2, x3, x4) \
	vpxor	x4, x1, x1; \
	vpor	x3, x4, x4; \
	vpxor	x3, x2, x2; \
	vpxor	x2, x4, x4; \
	vpand	x1, x2, x2; \
	vpxor	x3, x2, x2; \
	vpxor	x4, x3, x3; \
	vpxor	x0, x4, x4;
274
/*
 * Inverse S-box 3 in two halves; SI3_1 must be followed by SI3_2 on the
 * same registers.  x4 carries state into the second half; tp is scratch
 * within SI3_1.  This S-box does not use RNOT.
 */
#define SI3_1(x0, x1, x2, x3, x4) \
	vpxor	x1, x2, x2; \
	vpand	x2, x1, tp; \
	vpxor	x0, tp, tp; \
	vpor	x1, x0, x0; \
	vpxor	x3, x1, x4; \
	vpxor	x3, x0, x0; \
	vpor	tp, x3, x3; \
	vpxor	x2, tp, x1;
#define SI3_2(x0, x1, x2, x3, x4) \
	vpxor	x3, x1, x1; \
	vpxor	x2, x0, x0; \
	vpxor	x3, x2, x2; \
	vpand	x1, x3, x3; \
	vpxor	x0, x1, x1; \
	vpand	x2, x0, x0; \
	vpxor	x3, x4, x4; \
	vpxor	x0, x3, x3; \
	vpxor	x1, x0, x0;
294
/*
 * Inverse S-box 4 in two halves; SI4_1 must be followed by SI4_2 on the
 * same registers.  x4 carries state into the second half; tp is scratch
 * within SI4_1.
 */
#define SI4_1(x0, x1, x2, x3, x4) \
	vpxor	x3, x2, x2; \
	vpand	x1, x0, tp; \
	vpxor	x2, tp, tp; \
	vpor	x3, x2, x2; \
	vpxor	RNOT, x0, x4; \
	vpxor	tp, x1, x1; \
	vpxor	x2, tp, x0; \
	vpand	x4, x2, x2;
#define SI4_2(x0, x1, x2, x3, x4) \
	vpxor	x0, x2, x2; \
	vpor	x4, x0, x0; \
	vpxor	x3, x0, x0; \
	vpand	x2, x3, x3; \
	vpxor	x3, x4, x4; \
	vpxor	x1, x3, x3; \
	vpand	x0, x1, x1; \
	vpxor	x1, x4, x4; \
	vpxor	x3, x0, x0;
314
/*
 * Inverse S-box 5 in two halves; SI5_1 must be followed by SI5_2 on the
 * same registers.  tp is live across the two halves; x4 is first
 * written in SI5_2.
 */
#define SI5_1(x0, x1, x2, x3, x4) \
	vpor	x2, x1, tp; \
	vpxor	x1, x2, x2; \
	vpxor	x3, tp, tp; \
	vpand	x1, x3, x3; \
	vpxor	x3, x2, x2; \
	vpor	x0, x3, x3; \
	vpxor	RNOT, x0, x0; \
	vpxor	x2, x3, x3; \
	vpor	x0, x2, x2;
#define SI5_2(x0, x1, x2, x3, x4) \
	vpxor	tp, x1, x4; \
	vpxor	x4, x2, x2; \
	vpand	x0, x4, x4; \
	vpxor	tp, x0, x0; \
	vpxor	x3, tp, x1; \
	vpand	x2, x0, x0; \
	vpxor	x3, x2, x2; \
	vpxor	x2, x0, x0; \
	vpxor	x4, x2, x2; \
	vpxor	x3, x4, x4;
336
/*
 * Inverse S-box 6 in two halves; SI6_1 must be followed by SI6_2 on the
 * same registers.  tp is live across the two halves; x4 is first
 * written in SI6_2.
 */
#define SI6_1(x0, x1, x2, x3, x4) \
	vpxor	x2, x0, x0; \
	vpand	x3, x0, tp; \
	vpxor	x3, x2, x2; \
	vpxor	x2, tp, tp; \
	vpxor	x1, x3, x3; \
	vpor	x0, x2, x2; \
	vpxor	x3, x2, x2; \
	vpand	tp, x3, x3;
#define SI6_2(x0, x1, x2, x3, x4) \
	vpxor	RNOT, tp, tp; \
	vpxor	x1, x3, x3; \
	vpand	x2, x1, x1; \
	vpxor	tp, x0, x4; \
	vpxor	x4, x3, x3; \
	vpxor	x2, x4, x4; \
	vpxor	x1, tp, x0; \
	vpxor	x0, x2, x2;
355
/*
 * Inverse S-box 7 in two halves; SI7_1 must be followed by SI7_2 on the
 * same registers.  Both x4 and tp are live across the two halves.
 */
#define SI7_1(x0, x1, x2, x3, x4) \
	vpand	x0, x3, tp; \
	vpxor	x2, x0, x0; \
	vpor	x3, x2, x2; \
	vpxor	x1, x3, x4; \
	vpxor	RNOT, x0, x0; \
	vpor	tp, x1, x1; \
	vpxor	x0, x4, x4; \
	vpand	x2, x0, x0; \
	vpxor	x1, x0, x0;
#define SI7_2(x0, x1, x2, x3, x4) \
	vpand	x2, x1, x1; \
	vpxor	x2, tp, x3; \
	vpxor	x3, x4, x4; \
	vpand	x3, x2, x2; \
	vpor	x0, x3, x3; \
	vpxor	x4, x1, x1; \
	vpxor	x4, x3, x3; \
	vpand	x0, x4, x4; \
	vpxor	x2, x4, x4;
376
/*
 * get_key(round, word, dst): broadcast one 32-bit round-key word to every
 * dword lane of dst.  The word is read from CTX at byte offset
 * (4 * round + word) * 4, i.e. four 32-bit words per round.
 */
#define get_key(round, word, dst) \
	vpbroadcastd (4*(round)+(word))*4(CTX), dst;
379
/*
 * K2(x0, x1, x2, x3, x4, i): xor round key i into both register sets.
 *
 * x0..x3 are register-name stems (RA..RE); the set number 1 or 2 is
 * pasted onto each stem, so every step is issued once per set.
 * x4 is accepted only so that all round macros share one signature.
 * i selects the round key; RK0..RK3 are overwritten with its four words.
 *
 * Restored: the token-pasted operands and the line continuations had
 * been cut off, which ended the macro after its first vpxor.
 */
#define K2(x0, x1, x2, x3, x4, i) \
	get_key(i, 0, RK0); \
	get_key(i, 1, RK1); \
	get_key(i, 2, RK2); \
	get_key(i, 3, RK3); \
	vpxor RK0,	x0 ## 1, x0 ## 1; \
	vpxor RK1,	x1 ## 1, x1 ## 1; \
	vpxor RK2,	x2 ## 1, x2 ## 1; \
	vpxor RK3,	x3 ## 1, x3 ## 1; \
		vpxor RK0,	x0 ## 2, x0 ## 2; \
		vpxor RK1,	x1 ## 2, x1 ## 2; \
		vpxor RK2,	x2 ## 2, x2 ## 2; \
		vpxor RK3,	x3 ## 2, x3 ## 2;
393
/*
 * LK2(x0, x1, x2, x3, x4, i): Serpent linear transformation followed by
 * xor with round key i, applied to both register sets.
 *
 * Per 32-bit lane (rol = rotate left, built from shift-left, shift-right
 * and or, with x4 as the shift temporary):
 *
 *	x0 = rol(x0, 13);	x2 = rol(x2, 3);
 *	x1 ^= x0 ^ x2;		x3 ^= x2 ^ (x0 << 3);
 *	x1 = rol(x1, 1);	x3 = rol(x3, 7);
 *	x0 ^= x1 ^ x3;		x2 ^= x3 ^ (x1 << 7);
 *	x1 ^= RK1;		x3 ^= RK3;
 *	x0 = rol(x0, 5);	x2 = rol(x2, 22);
 *	x0 ^= RK0;		x2 ^= RK2;
 *
 * x0..x4 are register-name stems; set 1 lines are indented one tab,
 * set 2 lines two tabs.  x4 is clobbered.  The four get_key() loads for
 * round key i are interleaved with the arithmetic and overwrite RK0..RK3.
 *
 * Restored: source/destination operands and line continuations had been
 * cut off after the first operand of every instruction.
 */
#define LK2(x0, x1, x2, x3, x4, i) \
	vpslld $13,		x0 ## 1, x4 ## 1;          \
	vpsrld $(32 - 13),	x0 ## 1, x0 ## 1;          \
	vpor			x4 ## 1, x0 ## 1, x0 ## 1; \
	vpxor			x0 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $3,		x2 ## 1, x4 ## 1;          \
	vpsrld $(32 - 3),	x2 ## 1, x2 ## 1;          \
	vpor			x4 ## 1, x2 ## 1, x2 ## 1; \
	vpxor			x2 ## 1, x1 ## 1, x1 ## 1; \
		vpslld $13,		x0 ## 2, x4 ## 2;          \
		vpsrld $(32 - 13),	x0 ## 2, x0 ## 2;          \
		vpor			x4 ## 2, x0 ## 2, x0 ## 2; \
		vpxor			x0 ## 2, x1 ## 2, x1 ## 2; \
		vpslld $3,		x2 ## 2, x4 ## 2;          \
		vpsrld $(32 - 3),	x2 ## 2, x2 ## 2;          \
		vpor			x4 ## 2, x2 ## 2, x2 ## 2; \
		vpxor			x2 ## 2, x1 ## 2, x1 ## 2; \
	vpslld $1,		x1 ## 1, x4 ## 1;          \
	vpsrld $(32 - 1),	x1 ## 1, x1 ## 1;          \
	vpor			x4 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $3,		x0 ## 1, x4 ## 1;          \
	vpxor			x2 ## 1, x3 ## 1, x3 ## 1; \
	vpxor			x4 ## 1, x3 ## 1, x3 ## 1; \
	get_key(i, 1, RK1); \
		vpslld $1,		x1 ## 2, x4 ## 2;          \
		vpsrld $(32 - 1),	x1 ## 2, x1 ## 2;          \
		vpor			x4 ## 2, x1 ## 2, x1 ## 2; \
		vpslld $3,		x0 ## 2, x4 ## 2;          \
		vpxor			x2 ## 2, x3 ## 2, x3 ## 2; \
		vpxor			x4 ## 2, x3 ## 2, x3 ## 2; \
	get_key(i, 3, RK3); \
	vpslld $7,		x3 ## 1, x4 ## 1;          \
	vpsrld $(32 - 7),	x3 ## 1, x3 ## 1;          \
	vpor			x4 ## 1, x3 ## 1, x3 ## 1; \
	vpslld $7,		x1 ## 1, x4 ## 1;          \
	vpxor			x1 ## 1, x0 ## 1, x0 ## 1; \
	vpxor			x3 ## 1, x0 ## 1, x0 ## 1; \
	vpxor			x3 ## 1, x2 ## 1, x2 ## 1; \
	vpxor			x4 ## 1, x2 ## 1, x2 ## 1; \
	get_key(i, 0, RK0); \
		vpslld $7,		x3 ## 2, x4 ## 2;          \
		vpsrld $(32 - 7),	x3 ## 2, x3 ## 2;          \
		vpor			x4 ## 2, x3 ## 2, x3 ## 2; \
		vpslld $7,		x1 ## 2, x4 ## 2;          \
		vpxor			x1 ## 2, x0 ## 2, x0 ## 2; \
		vpxor			x3 ## 2, x0 ## 2, x0 ## 2; \
		vpxor			x3 ## 2, x2 ## 2, x2 ## 2; \
		vpxor			x4 ## 2, x2 ## 2, x2 ## 2; \
	get_key(i, 2, RK2); \
	vpxor			RK1, x1 ## 1, x1 ## 1;     \
	vpxor			RK3, x3 ## 1, x3 ## 1;     \
	vpslld $5,		x0 ## 1, x4 ## 1;          \
	vpsrld $(32 - 5),	x0 ## 1, x0 ## 1;          \
	vpor			x4 ## 1, x0 ## 1, x0 ## 1; \
	vpslld $22,		x2 ## 1, x4 ## 1;          \
	vpsrld $(32 - 22),	x2 ## 1, x2 ## 1;          \
	vpor			x4 ## 1, x2 ## 1, x2 ## 1; \
	vpxor			RK0, x0 ## 1, x0 ## 1;     \
	vpxor			RK2, x2 ## 1, x2 ## 1;     \
		vpxor			RK1, x1 ## 2, x1 ## 2;     \
		vpxor			RK3, x3 ## 2, x3 ## 2;     \
		vpslld $5,		x0 ## 2, x4 ## 2;          \
		vpsrld $(32 - 5),	x0 ## 2, x0 ## 2;          \
		vpor			x4 ## 2, x0 ## 2, x0 ## 2; \
		vpslld $22,		x2 ## 2, x4 ## 2;          \
		vpsrld $(32 - 22),	x2 ## 2, x2 ## 2;          \
		vpor			x4 ## 2, x2 ## 2, x2 ## 2; \
		vpxor			RK0, x0 ## 2, x0 ## 2;     \
		vpxor			RK2, x2 ## 2, x2 ## 2;
463
/*
 * KL2(x0, x1, x2, x3, x4, i): xor with the round key, then the inverse
 * Serpent linear transformation, applied to both register sets.
 *
 * Per 32-bit lane (ror = rotate right, built from shift-right, shift-left
 * and or, with x4 as the shift temporary):
 *
 *	x0 ^= RK0;		x2 ^= RK2;
 *	x0 = ror(x0, 5);	x2 = ror(x2, 22);
 *	x3 ^= RK3;		x1 ^= RK1;
 *	x2 ^= x3 ^ (x1 << 7);	x0 ^= x1 ^ x3;
 *	x1 = ror(x1, 1);	x3 = ror(x3, 7);
 *	x1 ^= x0 ^ x2;		x3 ^= x2 ^ (x0 << 3);
 *	x0 = ror(x0, 13);	x2 = ror(x2, 3);
 *
 * x0..x4 are register-name stems; x4 is clobbered.  This macro issues no
 * get_key(): RK0..RK3 must already hold the round key, which the SP()
 * invocation preceding each KL2() loads.  i is therefore not referenced
 * in the body and is kept for a uniform signature.
 *
 * Restored: source/destination operands and line continuations had been
 * cut off after the first operand of every instruction.
 */
#define KL2(x0, x1, x2, x3, x4, i) \
	vpxor			RK0, x0 ## 1, x0 ## 1;     \
	vpxor			RK2, x2 ## 1, x2 ## 1;     \
	vpsrld $5,		x0 ## 1, x4 ## 1;          \
	vpslld $(32 - 5),	x0 ## 1, x0 ## 1;          \
	vpor			x4 ## 1, x0 ## 1, x0 ## 1; \
	vpxor			RK3, x3 ## 1, x3 ## 1;     \
	vpxor			RK1, x1 ## 1, x1 ## 1;     \
	vpsrld $22,		x2 ## 1, x4 ## 1;          \
	vpslld $(32 - 22),	x2 ## 1, x2 ## 1;          \
	vpor			x4 ## 1, x2 ## 1, x2 ## 1; \
	vpxor			x3 ## 1, x2 ## 1, x2 ## 1; \
		vpxor			RK0, x0 ## 2, x0 ## 2;     \
		vpxor			RK2, x2 ## 2, x2 ## 2;     \
		vpsrld $5,		x0 ## 2, x4 ## 2;          \
		vpslld $(32 - 5),	x0 ## 2, x0 ## 2;          \
		vpor			x4 ## 2, x0 ## 2, x0 ## 2; \
		vpxor			RK3, x3 ## 2, x3 ## 2;     \
		vpxor			RK1, x1 ## 2, x1 ## 2;     \
		vpsrld $22,		x2 ## 2, x4 ## 2;          \
		vpslld $(32 - 22),	x2 ## 2, x2 ## 2;          \
		vpor			x4 ## 2, x2 ## 2, x2 ## 2; \
		vpxor			x3 ## 2, x2 ## 2, x2 ## 2; \
	vpxor			x3 ## 1, x0 ## 1, x0 ## 1; \
	vpslld $7,		x1 ## 1, x4 ## 1;          \
	vpxor			x1 ## 1, x0 ## 1, x0 ## 1; \
	vpxor			x4 ## 1, x2 ## 1, x2 ## 1; \
	vpsrld $1,		x1 ## 1, x4 ## 1;          \
	vpslld $(32 - 1),	x1 ## 1, x1 ## 1;          \
	vpor			x4 ## 1, x1 ## 1, x1 ## 1; \
		vpxor			x3 ## 2, x0 ## 2, x0 ## 2; \
		vpslld $7,		x1 ## 2, x4 ## 2;          \
		vpxor			x1 ## 2, x0 ## 2, x0 ## 2; \
		vpxor			x4 ## 2, x2 ## 2, x2 ## 2; \
		vpsrld $1,		x1 ## 2, x4 ## 2;          \
		vpslld $(32 - 1),	x1 ## 2, x1 ## 2;          \
		vpor			x4 ## 2, x1 ## 2, x1 ## 2; \
	vpsrld $7,		x3 ## 1, x4 ## 1;          \
	vpslld $(32 - 7),	x3 ## 1, x3 ## 1;          \
	vpor			x4 ## 1, x3 ## 1, x3 ## 1; \
	vpxor			x0 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $3,		x0 ## 1, x4 ## 1;          \
	vpxor			x4 ## 1, x3 ## 1, x3 ## 1; \
		vpsrld $7,		x3 ## 2, x4 ## 2;          \
		vpslld $(32 - 7),	x3 ## 2, x3 ## 2;          \
		vpor			x4 ## 2, x3 ## 2, x3 ## 2; \
		vpxor			x0 ## 2, x1 ## 2, x1 ## 2; \
		vpslld $3,		x0 ## 2, x4 ## 2;          \
		vpxor			x4 ## 2, x3 ## 2, x3 ## 2; \
	vpsrld $13,		x0 ## 1, x4 ## 1;          \
	vpslld $(32 - 13),	x0 ## 1, x0 ## 1;          \
	vpor			x4 ## 1, x0 ## 1, x0 ## 1; \
	vpxor			x2 ## 1, x1 ## 1, x1 ## 1; \
	vpxor			x2 ## 1, x3 ## 1, x3 ## 1; \
	vpsrld $3,		x2 ## 1, x4 ## 1;          \
	vpslld $(32 - 3),	x2 ## 1, x2 ## 1;          \
	vpor			x4 ## 1, x2 ## 1, x2 ## 1; \
		vpsrld $13,		x0 ## 2, x4 ## 2;          \
		vpslld $(32 - 13),	x0 ## 2, x0 ## 2;          \
		vpor			x4 ## 2, x0 ## 2, x0 ## 2; \
		vpxor			x2 ## 2, x1 ## 2, x1 ## 2; \
		vpxor			x2 ## 2, x3 ## 2, x3 ## 2; \
		vpsrld $3,		x2 ## 2, x4 ## 2;          \
		vpslld $(32 - 3),	x2 ## 2, x2 ## 2;          \
		vpor			x4 ## 2, x2 ## 2, x2 ## 2;
529
/*
 * S(SBOX, x0, x1, x2, x3, x4): apply one S-box to both register sets.
 *
 * SBOX is the S-box name (S0..S7, SI0..SI7); "_1" and "_2" are pasted on
 * to select its two halves.  x0..x4 are register-name stems to which the
 * set number is pasted.  Both halves are run on set 1 before set 2 is
 * touched, because the halves share the single temporary tp.
 *
 * Restored: the "## _1(...)" / "## _2(...)" invocations and the line
 * continuations had been cut off, leaving a bare SBOX token.
 */
#define S(SBOX, x0, x1, x2, x3, x4) \
	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);
535
/*
 * SP(SBOX, x0, x1, x2, x3, x4, i): same as S(), but additionally loads
 * the four words of round key i into RK0..RK3, one get_key() in front of
 * each S-box half.  The KL2() that follows each SP() on the decryption
 * path consumes those registers without loading them itself.
 *
 * Restored: the "## _1(...)" / "## _2(...)" invocations and the line
 * continuations had been cut off, leaving a bare SBOX token.
 */
#define SP(SBOX, x0, x1, x2, x3, x4, i) \
	get_key(i, 0, RK0); \
	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	get_key(i, 2, RK2); \
	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	get_key(i, 3, RK3); \
	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
	get_key(i, 1, RK1); \
	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);
545
/*
 * transpose_4x4(r0, r1, r2, r3, lo01, lo23, hi01): transpose a 4x4 matrix
 * of 32-bit words held in r0..r3 (row i in ri), in place.  The unpack
 * instructions work within each 128-bit lane, so each lane of the ymm
 * registers is transposed on its own.
 *
 * lo01, lo23 and hi01 are clobbered as temporaries; r3 doubles as the
 * fourth temporary before it receives its final value.
 */
#define transpose_4x4(r0, r1, r2, r3, lo01, lo23, hi01) \
	vpunpckldq	r1, r0, lo01; \
	vpunpckhdq	r1, r0, hi01; \
	vpunpckldq	r3, r2, lo23; \
	vpunpckhdq	r3, r2, r3; \
	vpunpcklqdq	lo23, lo01, r0; \
	vpunpckhqdq	lo23, lo01, r1; \
	vpunpcklqdq	r3, hi01, r2; \
	vpunpckhqdq	r3, hi01, r3;
556
/*
 * read_blocks: run at the start of the block functions on freshly
 * loaded data.  It is a plain 4x4 word transpose; t0..t2 are clobbered.
 */
#define read_blocks(x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
559
/*
 * write_blocks: run at the end of the block functions before the data is
 * stored.  It is the same 4x4 word transpose as read_blocks; t0..t2 are
 * clobbered.
 */
#define write_blocks(x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
562
/*
 * __serpent_enc_blk16 - run the encryption rounds on 16 blocks that are
 * already in registers (two sets of four ymm registers).
 *
 * In:       CTX (%rdi): key context read by get_key()
 *           RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: input blocks
 * Out:      RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: output blocks
 * Clobbers: RE1, RE2, RK0-RK3, RNOT, tp
 *
 * The register stems are passed in a different order on every line
 * because each S-box leaves its result in a permutation of its five
 * arguments; do not reorder them.
 */
.align 8
__serpent_enc_blk16:
	vpcmpeqd RNOT, RNOT, RNOT;	/* all-ones: xor with RNOT is NOT */

	read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	/* initial key mixing with round key 0 */
						 K2(RA, RB, RC, RD, RE, 0);
	/* S-boxes S0..S7 cycle four times; LK2 mixes in round keys 1..31 */
	S(S0, RA, RB, RC, RD, RE);		LK2(RC, RB, RD, RA, RE, 1);
	S(S1, RC, RB, RD, RA, RE);		LK2(RE, RD, RA, RC, RB, 2);
	S(S2, RE, RD, RA, RC, RB);		LK2(RB, RD, RE, RC, RA, 3);
	S(S3, RB, RD, RE, RC, RA);		LK2(RC, RA, RD, RB, RE, 4);
	S(S4, RC, RA, RD, RB, RE);		LK2(RA, RD, RB, RE, RC, 5);
	S(S5, RA, RD, RB, RE, RC);		LK2(RC, RA, RD, RE, RB, 6);
	S(S6, RC, RA, RD, RE, RB);		LK2(RD, RB, RA, RE, RC, 7);
	S(S7, RD, RB, RA, RE, RC);		LK2(RC, RA, RE, RD, RB, 8);
	S(S0, RC, RA, RE, RD, RB);		LK2(RE, RA, RD, RC, RB, 9);
	S(S1, RE, RA, RD, RC, RB);		LK2(RB, RD, RC, RE, RA, 10);
	S(S2, RB, RD, RC, RE, RA);		LK2(RA, RD, RB, RE, RC, 11);
	S(S3, RA, RD, RB, RE, RC);		LK2(RE, RC, RD, RA, RB, 12);
	S(S4, RE, RC, RD, RA, RB);		LK2(RC, RD, RA, RB, RE, 13);
	S(S5, RC, RD, RA, RB, RE);		LK2(RE, RC, RD, RB, RA, 14);
	S(S6, RE, RC, RD, RB, RA);		LK2(RD, RA, RC, RB, RE, 15);
	S(S7, RD, RA, RC, RB, RE);		LK2(RE, RC, RB, RD, RA, 16);
	S(S0, RE, RC, RB, RD, RA);		LK2(RB, RC, RD, RE, RA, 17);
	S(S1, RB, RC, RD, RE, RA);		LK2(RA, RD, RE, RB, RC, 18);
	S(S2, RA, RD, RE, RB, RC);		LK2(RC, RD, RA, RB, RE, 19);
	S(S3, RC, RD, RA, RB, RE);		LK2(RB, RE, RD, RC, RA, 20);
	S(S4, RB, RE, RD, RC, RA);		LK2(RE, RD, RC, RA, RB, 21);
	S(S5, RE, RD, RC, RA, RB);		LK2(RB, RE, RD, RA, RC, 22);
	S(S6, RB, RE, RD, RA, RC);		LK2(RD, RC, RE, RA, RB, 23);
	S(S7, RD, RC, RE, RA, RB);		LK2(RB, RE, RA, RD, RC, 24);
	S(S0, RB, RE, RA, RD, RC);		LK2(RA, RE, RD, RB, RC, 25);
	S(S1, RA, RE, RD, RB, RC);		LK2(RC, RD, RB, RA, RE, 26);
	S(S2, RC, RD, RB, RA, RE);		LK2(RE, RD, RC, RA, RB, 27);
	S(S3, RE, RD, RC, RA, RB);		LK2(RA, RB, RD, RE, RC, 28);
	S(S4, RA, RB, RD, RE, RC);		LK2(RB, RD, RE, RC, RA, 29);
	S(S5, RB, RD, RE, RC, RA);		LK2(RA, RB, RD, RC, RE, 30);
	S(S6, RA, RB, RD, RC, RE);		LK2(RD, RE, RB, RC, RA, 31);
	/* last round: plain key mixing with round key 32, no linear step */
	S(S7, RD, RE, RB, RC, RA);		 K2(RA, RB, RC, RD, RE, 32);

	write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	ret;
ENDPROC(__serpent_enc_blk16)
616
/*
 * __serpent_dec_blk16 - run the decryption rounds on 16 blocks that are
 * already in registers (two sets of four ymm registers).
 *
 * In:       CTX (%rdi): key context read by get_key()
 *           RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: input blocks
 * Out:      RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: output blocks
 *           (note the different registers from the input)
 * Clobbers: RA1, RA2, RK0-RK3, RNOT, tp
 *
 * Each SP() loads the round key that the KL2() on the same line then
 * consumes.  The register stems change order on every line because each
 * S-box leaves its result in a permutation of its five arguments; do not
 * reorder them.
 */
.align 8
__serpent_dec_blk16:
	vpcmpeqd RNOT, RNOT, RNOT;	/* all-ones: xor with RNOT is NOT */

	read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	/* undo the final key mixing (round key 32) */
						 K2(RA, RB, RC, RD, RE, 32);
	/* inverse S-boxes SI7..SI0 cycle four times, round keys 31..1 */
	SP(SI7, RA, RB, RC, RD, RE, 31);	KL2(RB, RD, RA, RE, RC, 31);
	SP(SI6, RB, RD, RA, RE, RC, 30);	KL2(RA, RC, RE, RB, RD, 30);
	SP(SI5, RA, RC, RE, RB, RD, 29);	KL2(RC, RD, RA, RE, RB, 29);
	SP(SI4, RC, RD, RA, RE, RB, 28);	KL2(RC, RA, RB, RE, RD, 28);
	SP(SI3, RC, RA, RB, RE, RD, 27);	KL2(RB, RC, RD, RE, RA, 27);
	SP(SI2, RB, RC, RD, RE, RA, 26);	KL2(RC, RA, RE, RD, RB, 26);
	SP(SI1, RC, RA, RE, RD, RB, 25);	KL2(RB, RA, RE, RD, RC, 25);
	SP(SI0, RB, RA, RE, RD, RC, 24);	KL2(RE, RC, RA, RB, RD, 24);
	SP(SI7, RE, RC, RA, RB, RD, 23);	KL2(RC, RB, RE, RD, RA, 23);
	SP(SI6, RC, RB, RE, RD, RA, 22);	KL2(RE, RA, RD, RC, RB, 22);
	SP(SI5, RE, RA, RD, RC, RB, 21);	KL2(RA, RB, RE, RD, RC, 21);
	SP(SI4, RA, RB, RE, RD, RC, 20);	KL2(RA, RE, RC, RD, RB, 20);
	SP(SI3, RA, RE, RC, RD, RB, 19);	KL2(RC, RA, RB, RD, RE, 19);
	SP(SI2, RC, RA, RB, RD, RE, 18);	KL2(RA, RE, RD, RB, RC, 18);
	SP(SI1, RA, RE, RD, RB, RC, 17);	KL2(RC, RE, RD, RB, RA, 17);
	SP(SI0, RC, RE, RD, RB, RA, 16);	KL2(RD, RA, RE, RC, RB, 16);
	SP(SI7, RD, RA, RE, RC, RB, 15);	KL2(RA, RC, RD, RB, RE, 15);
	SP(SI6, RA, RC, RD, RB, RE, 14);	KL2(RD, RE, RB, RA, RC, 14);
	SP(SI5, RD, RE, RB, RA, RC, 13);	KL2(RE, RC, RD, RB, RA, 13);
	SP(SI4, RE, RC, RD, RB, RA, 12);	KL2(RE, RD, RA, RB, RC, 12);
	SP(SI3, RE, RD, RA, RB, RC, 11);	KL2(RA, RE, RC, RB, RD, 11);
	SP(SI2, RA, RE, RC, RB, RD, 10);	KL2(RE, RD, RB, RC, RA, 10);
	SP(SI1, RE, RD, RB, RC, RA, 9);		KL2(RA, RD, RB, RC, RE, 9);
	SP(SI0, RA, RD, RB, RC, RE, 8);		KL2(RB, RE, RD, RA, RC, 8);
	SP(SI7, RB, RE, RD, RA, RC, 7);		KL2(RE, RA, RB, RC, RD, 7);
	SP(SI6, RE, RA, RB, RC, RD, 6);		KL2(RB, RD, RC, RE, RA, 6);
	SP(SI5, RB, RD, RC, RE, RA, 5);		KL2(RD, RA, RB, RC, RE, 5);
	SP(SI4, RD, RA, RB, RC, RE, 4);		KL2(RD, RB, RE, RC, RA, 4);
	SP(SI3, RD, RB, RE, RC, RA, 3);		KL2(RE, RD, RA, RC, RB, 3);
	SP(SI2, RE, RD, RA, RC, RB, 2);		KL2(RD, RB, RC, RA, RE, 2);
	SP(SI1, RD, RB, RC, RA, RE, 1);		KL2(RE, RB, RC, RA, RD, 1);
	/* last step: SI0 without key preload, then round key 0 via K2 */
	S(SI0, RE, RB, RC, RA, RD);		 K2(RC, RD, RB, RE, RA, 0);

	write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2);
	write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2);

	ret;
ENDPROC(__serpent_dec_blk16)
670
/*
 * serpent_ecb_enc_16way - ECB-encrypt 16 blocks.
 *
 *	%rdi: key context (CTX)
 *	%rsi: pointer handed to store_16way (destination)
 *	%rdx: pointer handed to load_16way (source)
 *
 * vzeroupper is issued on entry and again before returning.
 */
ENTRY(serpent_ecb_enc_16way)
	FRAME_BEGIN

	vzeroupper;

	load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __serpent_enc_blk16;

	store_16way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	vzeroupper;

	FRAME_END
	ret;
ENDPROC(serpent_ecb_enc_16way)
692
/*
 * serpent_ecb_dec_16way - ECB-decrypt 16 blocks.
 *
 *	%rdi: key context (CTX)
 *	%rsi: pointer handed to store_16way (destination)
 *	%rdx: pointer handed to load_16way (source)
 *
 * The store uses RC/RD/RB/RE because that is where
 * __serpent_dec_blk16 leaves its output.
 */
ENTRY(serpent_ecb_dec_16way)
	FRAME_BEGIN

	vzeroupper;

	load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __serpent_dec_blk16;

	store_16way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);

	vzeroupper;

	FRAME_END
	ret;
ENDPROC(serpent_ecb_dec_16way)
714
/*
 * serpent_cbc_dec_16way - CBC-decrypt 16 blocks.
 *
 *	%rdi: key context (CTX)
 *	%rsi: second pointer handed to store_cbc_16way (destination)
 *	%rdx: pointer handed to load_16way and, again, to store_cbc_16way
 *	      (source; presumably re-read there for the CBC chaining xor)
 *
 * RK0 is passed to store_cbc_16way as a scratch register.
 */
ENTRY(serpent_cbc_dec_16way)
	FRAME_BEGIN

	vzeroupper;

	load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __serpent_dec_blk16;

	store_cbc_16way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2,
			RK0);

	vzeroupper;

	FRAME_END
	ret;
ENDPROC(serpent_cbc_dec_16way)
737
/*
 * serpent_ctr_16way - CTR mode over 16 blocks.
 *
 *	%rdi: key context (CTX)
 *	%rsi: second pointer handed to store_ctr_16way (destination)
 *	%rdx: first pointer handed to store_ctr_16way (source)
 *	%rcx: pointer handed to load_ctr_16way (counter block / IV)
 *
 * load_ctr_16way fills the eight block registers and is given
 * .Lbswap128_mask plus RK0-RK3 (ymm and xmm views), RNOT and tp as
 * working registers.  The encryption routine is used for both
 * directions.
 */
ENTRY(serpent_ctr_16way)
	FRAME_BEGIN

	vzeroupper;

	load_ctr_16way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
		       RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT,
		       tp);

	call __serpent_enc_blk16;

	store_ctr_16way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	vzeroupper;

	FRAME_END
	ret;
ENDPROC(serpent_ctr_16way)
762
/*
 * serpent_xts_enc_16way - XTS-encrypt 16 blocks.
 *
 *	%rdi: key context (CTX)
 *	%rsi: destination; handed to both load_xts_16way and
 *	      store_xts_16way
 *	%rdx: source; handed to load_xts_16way
 *	%rcx: pointer handed to load_xts_16way (tweak / IV)
 *
 * load_xts_16way is given the two .Lxts_gf128mul_and_shl1 masks and
 * RK0-RK3 (ymm and xmm views) plus RNOT as working registers.
 */
ENTRY(serpent_xts_enc_16way)
	FRAME_BEGIN

	vzeroupper;

	load_xts_16way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
		       RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT,
		       .Lxts_gf128mul_and_shl1_mask_0,
		       .Lxts_gf128mul_and_shl1_mask_1);

	call __serpent_enc_blk16;

	store_xts_16way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	vzeroupper;

	FRAME_END
	ret;
ENDPROC(serpent_xts_enc_16way)
788
/*
 * serpent_xts_dec_16way - XTS-decrypt 16 blocks.
 *
 *	%rdi: key context (CTX)
 *	%rsi: destination; handed to both load_xts_16way and
 *	      store_xts_16way
 *	%rdx: source; handed to load_xts_16way
 *	%rcx: pointer handed to load_xts_16way (tweak / IV)
 *
 * Same flow as the XTS encryption entry point, except that the
 * decryption routine is called and the store therefore reads
 * RC/RD/RB/RE.
 */
ENTRY(serpent_xts_dec_16way)
	FRAME_BEGIN

	vzeroupper;

	load_xts_16way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
		       RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT,
		       .Lxts_gf128mul_and_shl1_mask_0,
		       .Lxts_gf128mul_and_shl1_mask_1);

	call __serpent_dec_blk16;

	store_xts_16way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);

	vzeroupper;

	FRAME_END
	ret;
ENDPROC(serpent_xts_dec_16way)
814