/*
 * x86_64/AVX2 assembler optimized version of Serpent
 *
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * Based on AVX assembler implementation of Serpent by:
 *  Copyright © 2012 Johannes Goetzfried
 *      <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include "glue_helper-asm-avx2.S"

.file "serpent-avx2-asm_64.S"

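/*
 * vpshufb mask that reverses the byte order of a 128-bit lane; load_ctr_16way
 * (from glue_helper-asm-avx2.S) uses it to convert the CTR counter between
 * its big-endian on-the-wire format and the little-endian form that integer
 * additions expect.
 */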
.section .rodata.cst16.bswap128_mask, "aM", @progbits, 16
.align 16
.Lbswap128_mask:
        .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

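/*
 * Masks for load_xts_16way's tweak chain: multiplying the 128-bit tweak by x
 * in GF(2^128) is a left shift with a conditional XOR of 0x87 into the low
 * byte.  The second mask (0x10e = 0x87 << 1, carry multiplier 2) serves the
 * same purpose for a shift by two, i.e. multiplication by x^2, so two tweaks
 * can be advanced per ymm register.
 */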
.section .rodata.cst16.xts_gf128mul_and_shl1_mask_0, "aM", @progbits, 16
.align 16
.Lxts_gf128mul_and_shl1_mask_0:
        .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0

.section .rodata.cst16.xts_gf128mul_and_shl1_mask_1, "aM", @progbits, 16
.align 16
.Lxts_gf128mul_and_shl1_mask_1:
        .byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0

.text

#define CTX %rdi

#define RNOT %ymm0
#define tp %ymm1

#define RA1 %ymm2
#define RA2 %ymm3
#define RB1 %ymm4
#define RB2 %ymm5
#define RC1 %ymm6
#define RC2 %ymm7
#define RD1 %ymm8
#define RD2 %ymm9
#define RE1 %ymm10
#define RE2 %ymm11

#define RK0 %ymm12
#define RK1 %ymm13
#define RK2 %ymm14
#define RK3 %ymm15

#define RK0x %xmm12
#define RK1x %xmm13
#define RK2x %xmm14
#define RK3x %xmm15
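/*
 * The 16 blocks are handled as two groups of eight: R?1 registers hold one
 * state word for the first eight blocks, R?2 the same word for the second
 * eight.  The round macros take the bare names (RA, RB, ...) and paste a "1"
 * or "2" suffix onto them.  RK0-RK3 hold broadcast round-key words; the RK?x
 * names alias their low 128-bit halves for the xmm-based glue macros.
 */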
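/*
 * Bitsliced Serpent S-boxes S0-S7 and their inverses SI0-SI7, expressed as
 * sequences of boolean operations on whole registers (each bit position
 * carries one bit of every block).  Each S-box is split into a _1 and a _2
 * half so that the SP() macro below can interleave round-key broadcasts with
 * the S-box computation.
 */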
#define S0_1(x0, x1, x2, x3, x4) \
        vpor x0, x3, tp; \
        vpxor x3, x0, x0; \
        vpxor x2, x3, x4; \
        vpxor RNOT, x4, x4; \
        vpxor x1, tp, x3; \
        vpand x0, x1, x1; \
        vpxor x4, x1, x1; \
        vpxor x0, x2, x2;
#define S0_2(x0, x1, x2, x3, x4) \
        vpxor x3, x0, x0; \
        vpor x0, x4, x4; \
        vpxor x2, x0, x0; \
        vpand x1, x2, x2; \
        vpxor x2, x3, x3; \
        vpxor RNOT, x1, x1; \
        vpxor x4, x2, x2; \
        vpxor x2, x1, x1;

#define S1_1(x0, x1, x2, x3, x4) \
        vpxor x0, x1, tp; \
        vpxor x3, x0, x0; \
        vpxor RNOT, x3, x3; \
        vpand tp, x1, x4; \
        vpor tp, x0, x0; \
        vpxor x2, x3, x3; \
        vpxor x3, x0, x0; \
        vpxor x3, tp, x1;
#define S1_2(x0, x1, x2, x3, x4) \
        vpxor x4, x3, x3; \
        vpor x4, x1, x1; \
        vpxor x2, x4, x4; \
        vpand x0, x2, x2; \
        vpxor x1, x2, x2; \
        vpor x0, x1, x1; \
        vpxor RNOT, x0, x0; \
        vpxor x2, x0, x0; \
        vpxor x1, x4, x4;

#define S2_1(x0, x1, x2, x3, x4) \
        vpxor RNOT, x3, x3; \
        vpxor x0, x1, x1; \
        vpand x2, x0, tp; \
        vpxor x3, tp, tp; \
        vpor x0, x3, x3; \
        vpxor x1, x2, x2; \
        vpxor x1, x3, x3; \
        vpand tp, x1, x1;
#define S2_2(x0, x1, x2, x3, x4) \
        vpxor x2, tp, tp; \
        vpand x3, x2, x2; \
        vpor x1, x3, x3; \
        vpxor RNOT, tp, tp; \
        vpxor tp, x3, x3; \
        vpxor tp, x0, x4; \
        vpxor x2, tp, x0; \
        vpor x2, x1, x1;

#define S3_1(x0, x1, x2, x3, x4) \
        vpxor x3, x1, tp; \
        vpor x0, x3, x3; \
        vpand x0, x1, x4; \
        vpxor x2, x0, x0; \
        vpxor tp, x2, x2; \
        vpand x3, tp, x1; \
        vpxor x3, x2, x2; \
        vpor x4, x0, x0; \
        vpxor x3, x4, x4;
#define S3_2(x0, x1, x2, x3, x4) \
        vpxor x0, x1, x1; \
        vpand x3, x0, x0; \
        vpand x4, x3, x3; \
        vpxor x2, x3, x3; \
        vpor x1, x4, x4; \
        vpand x1, x2, x2; \
        vpxor x3, x4, x4; \
        vpxor x3, x0, x0; \
        vpxor x2, x3, x3;

#define S4_1(x0, x1, x2, x3, x4) \
        vpand x0, x3, tp; \
        vpxor x3, x0, x0; \
        vpxor x2, tp, tp; \
        vpor x3, x2, x2; \
        vpxor x1, x0, x0; \
        vpxor tp, x3, x4; \
        vpor x0, x2, x2; \
        vpxor x1, x2, x2;
#define S4_2(x0, x1, x2, x3, x4) \
        vpand x0, x1, x1; \
        vpxor x4, x1, x1; \
        vpand x2, x4, x4; \
        vpxor tp, x2, x2; \
        vpxor x0, x4, x4; \
        vpor x1, tp, x3; \
        vpxor RNOT, x1, x1; \
        vpxor x0, x3, x3;

#define S5_1(x0, x1, x2, x3, x4) \
        vpor x0, x1, tp; \
        vpxor tp, x2, x2; \
        vpxor RNOT, x3, x3; \
        vpxor x0, x1, x4; \
        vpxor x2, x0, x0; \
        vpand x4, tp, x1; \
        vpor x3, x4, x4; \
        vpxor x0, x4, x4;
#define S5_2(x0, x1, x2, x3, x4) \
        vpand x3, x0, x0; \
        vpxor x3, x1, x1; \
        vpxor x2, x3, x3; \
        vpxor x1, x0, x0; \
        vpand x4, x2, x2; \
        vpxor x2, x1, x1; \
        vpand x0, x2, x2; \
        vpxor x2, x3, x3;

#define S6_1(x0, x1, x2, x3, x4) \
        vpxor x0, x3, x3; \
        vpxor x2, x1, tp; \
        vpxor x0, x2, x2; \
        vpand x3, x0, x0; \
        vpor x3, tp, tp; \
        vpxor RNOT, x1, x4; \
        vpxor tp, x0, x0; \
        vpxor x2, tp, x1;
#define S6_2(x0, x1, x2, x3, x4) \
        vpxor x4, x3, x3; \
        vpxor x0, x4, x4; \
        vpand x0, x2, x2; \
        vpxor x1, x4, x4; \
        vpxor x3, x2, x2; \
        vpand x1, x3, x3; \
        vpxor x0, x3, x3; \
        vpxor x2, x1, x1;

#define S7_1(x0, x1, x2, x3, x4) \
        vpxor RNOT, x1, tp; \
        vpxor RNOT, x0, x0; \
        vpand x2, tp, x1; \
        vpxor x3, x1, x1; \
        vpor tp, x3, x3; \
        vpxor x2, tp, x4; \
        vpxor x3, x2, x2; \
        vpxor x0, x3, x3; \
        vpor x1, x0, x0;
#define S7_2(x0, x1, x2, x3, x4) \
        vpand x0, x2, x2; \
        vpxor x4, x0, x0; \
        vpxor x3, x4, x4; \
        vpand x0, x3, x3; \
        vpxor x1, x4, x4; \
        vpxor x4, x2, x2; \
        vpxor x1, x3, x3; \
        vpor x0, x4, x4; \
        vpxor x1, x4, x4;

#define SI0_1(x0, x1, x2, x3, x4) \
        vpxor x0, x1, x1; \
        vpor x1, x3, tp; \
        vpxor x1, x3, x4; \
        vpxor RNOT, x0, x0; \
        vpxor tp, x2, x2; \
        vpxor x0, tp, x3; \
        vpand x1, x0, x0; \
        vpxor x2, x0, x0;
#define SI0_2(x0, x1, x2, x3, x4) \
        vpand x3, x2, x2; \
        vpxor x4, x3, x3; \
        vpxor x3, x2, x2; \
        vpxor x3, x1, x1; \
        vpand x0, x3, x3; \
        vpxor x0, x1, x1; \
        vpxor x2, x0, x0; \
        vpxor x3, x4, x4;

#define SI1_1(x0, x1, x2, x3, x4) \
        vpxor x3, x1, x1; \
        vpxor x2, x0, tp; \
        vpxor RNOT, x2, x2; \
        vpor x1, x0, x4; \
        vpxor x3, x4, x4; \
        vpand x1, x3, x3; \
        vpxor x2, x1, x1; \
        vpand x4, x2, x2;
#define SI1_2(x0, x1, x2, x3, x4) \
        vpxor x1, x4, x4; \
        vpor x3, x1, x1; \
        vpxor tp, x3, x3; \
        vpxor tp, x2, x2; \
        vpor x4, tp, x0; \
        vpxor x4, x2, x2; \
        vpxor x0, x1, x1; \
        vpxor x1, x4, x4;

#define SI2_1(x0, x1, x2, x3, x4) \
        vpxor x1, x2, x2; \
        vpxor RNOT, x3, tp; \
        vpor x2, tp, tp; \
        vpxor x3, x2, x2; \
        vpxor x0, x3, x4; \
        vpxor x1, tp, x3; \
        vpor x2, x1, x1; \
        vpxor x0, x2, x2;
#define SI2_2(x0, x1, x2, x3, x4) \
        vpxor x4, x1, x1; \
        vpor x3, x4, x4; \
        vpxor x3, x2, x2; \
        vpxor x2, x4, x4; \
        vpand x1, x2, x2; \
        vpxor x3, x2, x2; \
        vpxor x4, x3, x3; \
        vpxor x0, x4, x4;

#define SI3_1(x0, x1, x2, x3, x4) \
        vpxor x1, x2, x2; \
        vpand x2, x1, tp; \
        vpxor x0, tp, tp; \
        vpor x1, x0, x0; \
        vpxor x3, x1, x4; \
        vpxor x3, x0, x0; \
        vpor tp, x3, x3; \
        vpxor x2, tp, x1;
#define SI3_2(x0, x1, x2, x3, x4) \
        vpxor x3, x1, x1; \
        vpxor x2, x0, x0; \
        vpxor x3, x2, x2; \
        vpand x1, x3, x3; \
        vpxor x0, x1, x1; \
        vpand x2, x0, x0; \
        vpxor x3, x4, x4; \
        vpxor x0, x3, x3; \
        vpxor x1, x0, x0;

#define SI4_1(x0, x1, x2, x3, x4) \
        vpxor x3, x2, x2; \
        vpand x1, x0, tp; \
        vpxor x2, tp, tp; \
        vpor x3, x2, x2; \
        vpxor RNOT, x0, x4; \
        vpxor tp, x1, x1; \
        vpxor x2, tp, x0; \
        vpand x4, x2, x2;
#define SI4_2(x0, x1, x2, x3, x4) \
        vpxor x0, x2, x2; \
        vpor x4, x0, x0; \
        vpxor x3, x0, x0; \
        vpand x2, x3, x3; \
        vpxor x3, x4, x4; \
        vpxor x1, x3, x3; \
        vpand x0, x1, x1; \
        vpxor x1, x4, x4; \
        vpxor x3, x0, x0;

#define SI5_1(x0, x1, x2, x3, x4) \
        vpor x2, x1, tp; \
        vpxor x1, x2, x2; \
        vpxor x3, tp, tp; \
        vpand x1, x3, x3; \
        vpxor x3, x2, x2; \
        vpor x0, x3, x3; \
        vpxor RNOT, x0, x0; \
        vpxor x2, x3, x3; \
        vpor x0, x2, x2;
#define SI5_2(x0, x1, x2, x3, x4) \
        vpxor tp, x1, x4; \
        vpxor x4, x2, x2; \
        vpand x0, x4, x4; \
        vpxor tp, x0, x0; \
        vpxor x3, tp, x1; \
        vpand x2, x0, x0; \
        vpxor x3, x2, x2; \
        vpxor x2, x0, x0; \
        vpxor x4, x2, x2; \
        vpxor x3, x4, x4;

#define SI6_1(x0, x1, x2, x3, x4) \
        vpxor x2, x0, x0; \
        vpand x3, x0, tp; \
        vpxor x3, x2, x2; \
        vpxor x2, tp, tp; \
        vpxor x1, x3, x3; \
        vpor x0, x2, x2; \
        vpxor x3, x2, x2; \
        vpand tp, x3, x3;
#define SI6_2(x0, x1, x2, x3, x4) \
        vpxor RNOT, tp, tp; \
        vpxor x1, x3, x3; \
        vpand x2, x1, x1; \
        vpxor tp, x0, x4; \
        vpxor x4, x3, x3; \
        vpxor x2, x4, x4; \
        vpxor x1, tp, x0; \
        vpxor x0, x2, x2;

#define SI7_1(x0, x1, x2, x3, x4) \
        vpand x0, x3, tp; \
        vpxor x2, x0, x0; \
        vpor x3, x2, x2; \
        vpxor x1, x3, x4; \
        vpxor RNOT, x0, x0; \
        vpor tp, x1, x1; \
        vpxor x0, x4, x4; \
        vpand x2, x0, x0; \
        vpxor x1, x0, x0;
#define SI7_2(x0, x1, x2, x3, x4) \
        vpand x2, x1, x1; \
        vpxor x2, tp, x3; \
        vpxor x3, x4, x4; \
        vpand x3, x2, x2; \
        vpor x0, x3, x3; \
        vpxor x4, x1, x1; \
        vpxor x4, x3, x3; \
        vpand x0, x4, x4; \
        vpxor x2, x4, x4;

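/*
 * The expanded key is an array of 32-bit words: word j of round i sits at
 * byte offset (4*(i) + (j))*4 from CTX.  vpbroadcastd replicates it across
 * all eight dword lanes, so a single register XORs the key word into eight
 * blocks at once.
 */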
#define get_key(i,j,t) \
        vpbroadcastd (4*(i)+(j))*4(CTX), t;

#define K2(x0, x1, x2, x3, x4, i) \
        get_key(i, 0, RK0); \
        get_key(i, 1, RK1); \
        get_key(i, 2, RK2); \
        get_key(i, 3, RK3); \
        vpxor RK0, x0 ## 1, x0 ## 1; \
        vpxor RK1, x1 ## 1, x1 ## 1; \
        vpxor RK2, x2 ## 1, x2 ## 1; \
        vpxor RK3, x3 ## 1, x3 ## 1; \
                vpxor RK0, x0 ## 2, x0 ## 2; \
                vpxor RK1, x1 ## 2, x1 ## 2; \
                vpxor RK2, x2 ## 2, x2 ## 2; \
                vpxor RK3, x3 ## 2, x3 ## 2;
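/*
 * LK2 applies Serpent's linear transformation (rotates by 13, 3, 1, 7, 5 and
 * 22 plus two shift-and-XOR steps) and then mixes in round key i; KL2 is its
 * inverse, removing round key i before undoing the linear transformation.
 * Both process the "1" and "2" register groups, with get_key broadcasts
 * interleaved into the instruction stream to hide load latency.
 */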
#define LK2(x0, x1, x2, x3, x4, i) \
        vpslld $13, x0 ## 1, x4 ## 1; \
        vpsrld $(32 - 13), x0 ## 1, x0 ## 1; \
        vpor x4 ## 1, x0 ## 1, x0 ## 1; \
        vpxor x0 ## 1, x1 ## 1, x1 ## 1; \
        vpslld $3, x2 ## 1, x4 ## 1; \
        vpsrld $(32 - 3), x2 ## 1, x2 ## 1; \
        vpor x4 ## 1, x2 ## 1, x2 ## 1; \
        vpxor x2 ## 1, x1 ## 1, x1 ## 1; \
                vpslld $13, x0 ## 2, x4 ## 2; \
                vpsrld $(32 - 13), x0 ## 2, x0 ## 2; \
                vpor x4 ## 2, x0 ## 2, x0 ## 2; \
                vpxor x0 ## 2, x1 ## 2, x1 ## 2; \
                vpslld $3, x2 ## 2, x4 ## 2; \
                vpsrld $(32 - 3), x2 ## 2, x2 ## 2; \
                vpor x4 ## 2, x2 ## 2, x2 ## 2; \
                vpxor x2 ## 2, x1 ## 2, x1 ## 2; \
        vpslld $1, x1 ## 1, x4 ## 1; \
        vpsrld $(32 - 1), x1 ## 1, x1 ## 1; \
        vpor x4 ## 1, x1 ## 1, x1 ## 1; \
        vpslld $3, x0 ## 1, x4 ## 1; \
        vpxor x2 ## 1, x3 ## 1, x3 ## 1; \
        vpxor x4 ## 1, x3 ## 1, x3 ## 1; \
        get_key(i, 1, RK1); \
                vpslld $1, x1 ## 2, x4 ## 2; \
                vpsrld $(32 - 1), x1 ## 2, x1 ## 2; \
                vpor x4 ## 2, x1 ## 2, x1 ## 2; \
                vpslld $3, x0 ## 2, x4 ## 2; \
                vpxor x2 ## 2, x3 ## 2, x3 ## 2; \
                vpxor x4 ## 2, x3 ## 2, x3 ## 2; \
                get_key(i, 3, RK3); \
        vpslld $7, x3 ## 1, x4 ## 1; \
        vpsrld $(32 - 7), x3 ## 1, x3 ## 1; \
        vpor x4 ## 1, x3 ## 1, x3 ## 1; \
        vpslld $7, x1 ## 1, x4 ## 1; \
        vpxor x1 ## 1, x0 ## 1, x0 ## 1; \
        vpxor x3 ## 1, x0 ## 1, x0 ## 1; \
        vpxor x3 ## 1, x2 ## 1, x2 ## 1; \
        vpxor x4 ## 1, x2 ## 1, x2 ## 1; \
        get_key(i, 0, RK0); \
                vpslld $7, x3 ## 2, x4 ## 2; \
                vpsrld $(32 - 7), x3 ## 2, x3 ## 2; \
                vpor x4 ## 2, x3 ## 2, x3 ## 2; \
                vpslld $7, x1 ## 2, x4 ## 2; \
                vpxor x1 ## 2, x0 ## 2, x0 ## 2; \
                vpxor x3 ## 2, x0 ## 2, x0 ## 2; \
                vpxor x3 ## 2, x2 ## 2, x2 ## 2; \
                vpxor x4 ## 2, x2 ## 2, x2 ## 2; \
                get_key(i, 2, RK2); \
        vpxor RK1, x1 ## 1, x1 ## 1; \
        vpxor RK3, x3 ## 1, x3 ## 1; \
        vpslld $5, x0 ## 1, x4 ## 1; \
        vpsrld $(32 - 5), x0 ## 1, x0 ## 1; \
        vpor x4 ## 1, x0 ## 1, x0 ## 1; \
        vpslld $22, x2 ## 1, x4 ## 1; \
        vpsrld $(32 - 22), x2 ## 1, x2 ## 1; \
        vpor x4 ## 1, x2 ## 1, x2 ## 1; \
        vpxor RK0, x0 ## 1, x0 ## 1; \
        vpxor RK2, x2 ## 1, x2 ## 1; \
                vpxor RK1, x1 ## 2, x1 ## 2; \
                vpxor RK3, x3 ## 2, x3 ## 2; \
                vpslld $5, x0 ## 2, x4 ## 2; \
                vpsrld $(32 - 5), x0 ## 2, x0 ## 2; \
                vpor x4 ## 2, x0 ## 2, x0 ## 2; \
                vpslld $22, x2 ## 2, x4 ## 2; \
                vpsrld $(32 - 22), x2 ## 2, x2 ## 2; \
                vpor x4 ## 2, x2 ## 2, x2 ## 2; \
                vpxor RK0, x0 ## 2, x0 ## 2; \
                vpxor RK2, x2 ## 2, x2 ## 2;
#define KL2(x0, x1, x2, x3, x4, i) \
        vpxor RK0, x0 ## 1, x0 ## 1; \
        vpxor RK2, x2 ## 1, x2 ## 1; \
        vpsrld $5, x0 ## 1, x4 ## 1; \
        vpslld $(32 - 5), x0 ## 1, x0 ## 1; \
        vpor x4 ## 1, x0 ## 1, x0 ## 1; \
        vpxor RK3, x3 ## 1, x3 ## 1; \
        vpxor RK1, x1 ## 1, x1 ## 1; \
        vpsrld $22, x2 ## 1, x4 ## 1; \
        vpslld $(32 - 22), x2 ## 1, x2 ## 1; \
        vpor x4 ## 1, x2 ## 1, x2 ## 1; \
        vpxor x3 ## 1, x2 ## 1, x2 ## 1; \
                vpxor RK0, x0 ## 2, x0 ## 2; \
                vpxor RK2, x2 ## 2, x2 ## 2; \
                vpsrld $5, x0 ## 2, x4 ## 2; \
                vpslld $(32 - 5), x0 ## 2, x0 ## 2; \
                vpor x4 ## 2, x0 ## 2, x0 ## 2; \
                vpxor RK3, x3 ## 2, x3 ## 2; \
                vpxor RK1, x1 ## 2, x1 ## 2; \
                vpsrld $22, x2 ## 2, x4 ## 2; \
                vpslld $(32 - 22), x2 ## 2, x2 ## 2; \
                vpor x4 ## 2, x2 ## 2, x2 ## 2; \
                vpxor x3 ## 2, x2 ## 2, x2 ## 2; \
        vpxor x3 ## 1, x0 ## 1, x0 ## 1; \
        vpslld $7, x1 ## 1, x4 ## 1; \
        vpxor x1 ## 1, x0 ## 1, x0 ## 1; \
        vpxor x4 ## 1, x2 ## 1, x2 ## 1; \
        vpsrld $1, x1 ## 1, x4 ## 1; \
        vpslld $(32 - 1), x1 ## 1, x1 ## 1; \
        vpor x4 ## 1, x1 ## 1, x1 ## 1; \
                vpxor x3 ## 2, x0 ## 2, x0 ## 2; \
                vpslld $7, x1 ## 2, x4 ## 2; \
                vpxor x1 ## 2, x0 ## 2, x0 ## 2; \
                vpxor x4 ## 2, x2 ## 2, x2 ## 2; \
                vpsrld $1, x1 ## 2, x4 ## 2; \
                vpslld $(32 - 1), x1 ## 2, x1 ## 2; \
                vpor x4 ## 2, x1 ## 2, x1 ## 2; \
        vpsrld $7, x3 ## 1, x4 ## 1; \
        vpslld $(32 - 7), x3 ## 1, x3 ## 1; \
        vpor x4 ## 1, x3 ## 1, x3 ## 1; \
        vpxor x0 ## 1, x1 ## 1, x1 ## 1; \
        vpslld $3, x0 ## 1, x4 ## 1; \
        vpxor x4 ## 1, x3 ## 1, x3 ## 1; \
                vpsrld $7, x3 ## 2, x4 ## 2; \
                vpslld $(32 - 7), x3 ## 2, x3 ## 2; \
                vpor x4 ## 2, x3 ## 2, x3 ## 2; \
                vpxor x0 ## 2, x1 ## 2, x1 ## 2; \
                vpslld $3, x0 ## 2, x4 ## 2; \
                vpxor x4 ## 2, x3 ## 2, x3 ## 2; \
        vpsrld $13, x0 ## 1, x4 ## 1; \
        vpslld $(32 - 13), x0 ## 1, x0 ## 1; \
        vpor x4 ## 1, x0 ## 1, x0 ## 1; \
        vpxor x2 ## 1, x1 ## 1, x1 ## 1; \
        vpxor x2 ## 1, x3 ## 1, x3 ## 1; \
        vpsrld $3, x2 ## 1, x4 ## 1; \
        vpslld $(32 - 3), x2 ## 1, x2 ## 1; \
        vpor x4 ## 1, x2 ## 1, x2 ## 1; \
                vpsrld $13, x0 ## 2, x4 ## 2; \
                vpslld $(32 - 13), x0 ## 2, x0 ## 2; \
                vpor x4 ## 2, x0 ## 2, x0 ## 2; \
                vpxor x2 ## 2, x1 ## 2, x1 ## 2; \
                vpxor x2 ## 2, x3 ## 2, x3 ## 2; \
                vpsrld $3, x2 ## 2, x4 ## 2; \
                vpslld $(32 - 3), x2 ## 2, x2 ## 2; \
                vpor x4 ## 2, x2 ## 2, x2 ## 2;
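/*
 * S() applies one S-box to both eight-block groups.  SP() does the same but
 * issues the get_key broadcasts for round i between the S-box halves, so the
 * memory loads for the next key mix overlap with the boolean computation.
 */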
#define S(SBOX, x0, x1, x2, x3, x4) \
        SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
        SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
        SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
        SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);

#define SP(SBOX, x0, x1, x2, x3, x4, i) \
        get_key(i, 0, RK0); \
        SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
        get_key(i, 2, RK2); \
        SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
        get_key(i, 3, RK3); \
        SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
        get_key(i, 1, RK1); \
        SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);
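/*
 * In-lane 4x4 dword transpose (performed independently in both 128-bit lanes
 * of a ymm register).  Four registers holding whole blocks come out holding
 * one state word from each of the blocks, which is the layout the bitsliced
 * code above expects; applying the transpose again converts back.
 */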
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
        vpunpckldq x1, x0, t0; \
        vpunpckhdq x1, x0, t2; \
        vpunpckldq x3, x2, t1; \
        vpunpckhdq x3, x2, x3; \
        \
        vpunpcklqdq t1, t0, x0; \
        vpunpckhqdq t1, t0, x1; \
        vpunpcklqdq x3, t2, x2; \
        vpunpckhqdq x3, t2, x3;

#define read_blocks(x0, x1, x2, x3, t0, t1, t2) \
        transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

#define write_blocks(x0, x1, x2, x3, t0, t1, t2) \
        transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

.align 8
__serpent_enc_blk16:
        /* input:
         *      %rdi: ctx, CTX
         *      RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: plaintext
         * output:
         *      RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: ciphertext
         */
        vpcmpeqd RNOT, RNOT, RNOT;

        read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
        read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

        K2(RA, RB, RC, RD, RE, 0);
        S(S0, RA, RB, RC, RD, RE); LK2(RC, RB, RD, RA, RE, 1);
        S(S1, RC, RB, RD, RA, RE); LK2(RE, RD, RA, RC, RB, 2);
        S(S2, RE, RD, RA, RC, RB); LK2(RB, RD, RE, RC, RA, 3);
        S(S3, RB, RD, RE, RC, RA); LK2(RC, RA, RD, RB, RE, 4);
        S(S4, RC, RA, RD, RB, RE); LK2(RA, RD, RB, RE, RC, 5);
        S(S5, RA, RD, RB, RE, RC); LK2(RC, RA, RD, RE, RB, 6);
        S(S6, RC, RA, RD, RE, RB); LK2(RD, RB, RA, RE, RC, 7);
        S(S7, RD, RB, RA, RE, RC); LK2(RC, RA, RE, RD, RB, 8);
        S(S0, RC, RA, RE, RD, RB); LK2(RE, RA, RD, RC, RB, 9);
        S(S1, RE, RA, RD, RC, RB); LK2(RB, RD, RC, RE, RA, 10);
        S(S2, RB, RD, RC, RE, RA); LK2(RA, RD, RB, RE, RC, 11);
        S(S3, RA, RD, RB, RE, RC); LK2(RE, RC, RD, RA, RB, 12);
        S(S4, RE, RC, RD, RA, RB); LK2(RC, RD, RA, RB, RE, 13);
        S(S5, RC, RD, RA, RB, RE); LK2(RE, RC, RD, RB, RA, 14);
        S(S6, RE, RC, RD, RB, RA); LK2(RD, RA, RC, RB, RE, 15);
        S(S7, RD, RA, RC, RB, RE); LK2(RE, RC, RB, RD, RA, 16);
        S(S0, RE, RC, RB, RD, RA); LK2(RB, RC, RD, RE, RA, 17);
        S(S1, RB, RC, RD, RE, RA); LK2(RA, RD, RE, RB, RC, 18);
        S(S2, RA, RD, RE, RB, RC); LK2(RC, RD, RA, RB, RE, 19);
        S(S3, RC, RD, RA, RB, RE); LK2(RB, RE, RD, RC, RA, 20);
        S(S4, RB, RE, RD, RC, RA); LK2(RE, RD, RC, RA, RB, 21);
        S(S5, RE, RD, RC, RA, RB); LK2(RB, RE, RD, RA, RC, 22);
        S(S6, RB, RE, RD, RA, RC); LK2(RD, RC, RE, RA, RB, 23);
        S(S7, RD, RC, RE, RA, RB); LK2(RB, RE, RA, RD, RC, 24);
        S(S0, RB, RE, RA, RD, RC); LK2(RA, RE, RD, RB, RC, 25);
        S(S1, RA, RE, RD, RB, RC); LK2(RC, RD, RB, RA, RE, 26);
        S(S2, RC, RD, RB, RA, RE); LK2(RE, RD, RC, RA, RB, 27);
        S(S3, RE, RD, RC, RA, RB); LK2(RA, RB, RD, RE, RC, 28);
        S(S4, RA, RB, RD, RE, RC); LK2(RB, RD, RE, RC, RA, 29);
        S(S5, RB, RD, RE, RC, RA); LK2(RA, RB, RD, RC, RE, 30);
        S(S6, RA, RB, RD, RC, RE); LK2(RD, RE, RB, RC, RA, 31);
        S(S7, RD, RE, RB, RC, RA); K2(RA, RB, RC, RD, RE, 32);

        write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
        write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

        ret;
ENDPROC(__serpent_enc_blk16)

.align 8
__serpent_dec_blk16:
        /* input:
         *      %rdi: ctx, CTX
         *      RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: ciphertext
         * output:
         *      RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: plaintext
         */
        vpcmpeqd RNOT, RNOT, RNOT;

        read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
        read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

        K2(RA, RB, RC, RD, RE, 32);
        SP(SI7, RA, RB, RC, RD, RE, 31); KL2(RB, RD, RA, RE, RC, 31);
        SP(SI6, RB, RD, RA, RE, RC, 30); KL2(RA, RC, RE, RB, RD, 30);
        SP(SI5, RA, RC, RE, RB, RD, 29); KL2(RC, RD, RA, RE, RB, 29);
        SP(SI4, RC, RD, RA, RE, RB, 28); KL2(RC, RA, RB, RE, RD, 28);
        SP(SI3, RC, RA, RB, RE, RD, 27); KL2(RB, RC, RD, RE, RA, 27);
        SP(SI2, RB, RC, RD, RE, RA, 26); KL2(RC, RA, RE, RD, RB, 26);
        SP(SI1, RC, RA, RE, RD, RB, 25); KL2(RB, RA, RE, RD, RC, 25);
        SP(SI0, RB, RA, RE, RD, RC, 24); KL2(RE, RC, RA, RB, RD, 24);
        SP(SI7, RE, RC, RA, RB, RD, 23); KL2(RC, RB, RE, RD, RA, 23);
        SP(SI6, RC, RB, RE, RD, RA, 22); KL2(RE, RA, RD, RC, RB, 22);
        SP(SI5, RE, RA, RD, RC, RB, 21); KL2(RA, RB, RE, RD, RC, 21);
        SP(SI4, RA, RB, RE, RD, RC, 20); KL2(RA, RE, RC, RD, RB, 20);
        SP(SI3, RA, RE, RC, RD, RB, 19); KL2(RC, RA, RB, RD, RE, 19);
        SP(SI2, RC, RA, RB, RD, RE, 18); KL2(RA, RE, RD, RB, RC, 18);
        SP(SI1, RA, RE, RD, RB, RC, 17); KL2(RC, RE, RD, RB, RA, 17);
        SP(SI0, RC, RE, RD, RB, RA, 16); KL2(RD, RA, RE, RC, RB, 16);
        SP(SI7, RD, RA, RE, RC, RB, 15); KL2(RA, RC, RD, RB, RE, 15);
        SP(SI6, RA, RC, RD, RB, RE, 14); KL2(RD, RE, RB, RA, RC, 14);
        SP(SI5, RD, RE, RB, RA, RC, 13); KL2(RE, RC, RD, RB, RA, 13);
        SP(SI4, RE, RC, RD, RB, RA, 12); KL2(RE, RD, RA, RB, RC, 12);
        SP(SI3, RE, RD, RA, RB, RC, 11); KL2(RA, RE, RC, RB, RD, 11);
        SP(SI2, RA, RE, RC, RB, RD, 10); KL2(RE, RD, RB, RC, RA, 10);
        SP(SI1, RE, RD, RB, RC, RA, 9); KL2(RA, RD, RB, RC, RE, 9);
        SP(SI0, RA, RD, RB, RC, RE, 8); KL2(RB, RE, RD, RA, RC, 8);
        SP(SI7, RB, RE, RD, RA, RC, 7); KL2(RE, RA, RB, RC, RD, 7);
        SP(SI6, RE, RA, RB, RC, RD, 6); KL2(RB, RD, RC, RE, RA, 6);
        SP(SI5, RB, RD, RC, RE, RA, 5); KL2(RD, RA, RB, RC, RE, 5);
        SP(SI4, RD, RA, RB, RC, RE, 4); KL2(RD, RB, RE, RC, RA, 4);
        SP(SI3, RD, RB, RE, RC, RA, 3); KL2(RE, RD, RA, RC, RB, 3);
        SP(SI2, RE, RD, RA, RC, RB, 2); KL2(RD, RB, RC, RA, RE, 2);
        SP(SI1, RD, RB, RC, RA, RE, 1); KL2(RE, RB, RC, RA, RD, 1);
        S(SI0, RE, RB, RC, RA, RD); K2(RC, RD, RB, RE, RA, 0);

        write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2);
        write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2);

        ret;
ENDPROC(__serpent_dec_blk16)

ENTRY(serpent_ecb_enc_16way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst (16 blocks)
         *      %rdx: src (16 blocks)
         */
        FRAME_BEGIN

        vzeroupper;

        load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

        call __serpent_enc_blk16;

        store_16way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

        vzeroupper;

        FRAME_END
        ret;
ENDPROC(serpent_ecb_enc_16way)

ENTRY(serpent_ecb_dec_16way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst (16 blocks)
         *      %rdx: src (16 blocks)
         */
        FRAME_BEGIN

        vzeroupper;

        load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

        call __serpent_dec_blk16;

        store_16way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);

        vzeroupper;

        FRAME_END
        ret;
ENDPROC(serpent_ecb_dec_16way)

ENTRY(serpent_cbc_dec_16way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst (16 blocks)
         *      %rdx: src (16 blocks)
         */
        FRAME_BEGIN

        vzeroupper;

        load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

        call __serpent_dec_blk16;

        store_cbc_16way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2,
                        RK0);

        vzeroupper;

        FRAME_END
        ret;
ENDPROC(serpent_cbc_dec_16way)

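/*
 * CTR mode: load_ctr_16way expands the IV at %rcx into 16 consecutive
 * counter blocks (byte-swapping via .Lbswap128_mask so the big-endian
 * counter can be incremented with integer adds), the counters are encrypted,
 * and store_ctr_16way XORs the resulting keystream with src to produce dst.
 */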
ENTRY(serpent_ctr_16way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst (16 blocks)
         *      %rdx: src (16 blocks)
         *      %rcx: iv (little endian, 128bit)
         */
        FRAME_BEGIN

        vzeroupper;

        load_ctr_16way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2,
                       RC2, RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x,
                       RNOT, tp);

        call __serpent_enc_blk16;

        store_ctr_16way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

        vzeroupper;

        FRAME_END
        ret;
ENDPROC(serpent_ctr_16way)

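/*
 * XTS mode: load_xts_16way derives the 16 per-block tweaks from the input
 * tweak at %rcx using the gf128mul masks above and XORs them into the source
 * blocks; store_xts_16way applies the tweaks again after the block cipher
 * pass, completing the usual xor-encrypt-xor construction.
 */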
ENTRY(serpent_xts_enc_16way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst (16 blocks)
         *      %rdx: src (16 blocks)
         *      %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
         */
        FRAME_BEGIN

        vzeroupper;

        load_xts_16way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
                       RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT,
                       .Lxts_gf128mul_and_shl1_mask_0,
                       .Lxts_gf128mul_and_shl1_mask_1);

        call __serpent_enc_blk16;

        store_xts_16way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

        vzeroupper;

        FRAME_END
        ret;
ENDPROC(serpent_xts_enc_16way)

ENTRY(serpent_xts_dec_16way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst (16 blocks)
         *      %rdx: src (16 blocks)
         *      %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
         */
        FRAME_BEGIN

        vzeroupper;

        load_xts_16way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
                       RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT,
                       .Lxts_gf128mul_and_shl1_mask_0,
                       .Lxts_gf128mul_and_shl1_mask_1);

        call __serpent_dec_blk16;

        store_xts_16way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);

        vzeroupper;

        FRAME_END
        ret;
ENDPROC(serpent_xts_dec_16way)