1
2
3
4
5
6
7
8
9
10
11
12#include <linux/linkage.h>
13#include <asm/frame.h>
14#include "glue_helper-asm-avx2.S"
15
/* Source file name recorded by the assembler for this object. */
16.file "serpent-avx2-asm_64.S"
17
/*
 * 16-byte constant placed in its own mergeable ("aM", entity size 16)
 * read-only section and aligned to 16 bytes. The bytes run 15 down to 0,
 * i.e. the index pattern that reverses the byte order of a 128-bit value.
 *
 * NOTE(review): the name suggests a byte-shuffle (bswap) mask; nothing in
 * the visible part of this file references .Lbswap128_mask - confirm that it
 * is still needed.
 */
18.section .rodata.cst16.bswap128_mask, "aM", @progbits, 16
19.align 16
20.Lbswap128_mask:
21 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
22
/* Everything that follows is emitted into the code section. */
23.text
24
/*
 * Register allocation used by every macro and function in this file.
 *
 * CTX        %rdi; base address for the round-key loads done by get_key().
 * RNOT       set to all ones at the top of the block functions and XORed in
 *            wherever the S-box macros need a bitwise NOT.
 * tp         scratch register used inside the S-box macros.
 * R{A..E}1,
 * R{A..E}2   two sets of five working registers. The block functions pass
 *            the bare prefixes (RA, RB, ...) to the round macros, which are
 *            expected to act on both the ...1 and the ...2 register.
 * RK0..RK3   hold the broadcast round-key words; also handed to
 *            read_blocks()/write_blocks() as temporaries.
 * RK0x..RK3x the xmm (low 128-bit) names of the same four registers.
 */
25#define CTX %rdi
26
27#define RNOT %ymm0
28#define tp %ymm1
29
30#define RA1 %ymm2
31#define RA2 %ymm3
32#define RB1 %ymm4
33#define RB2 %ymm5
34#define RC1 %ymm6
35#define RC2 %ymm7
36#define RD1 %ymm8
37#define RD2 %ymm9
38#define RE1 %ymm10
39#define RE2 %ymm11
40
41#define RK0 %ymm12
42#define RK1 %ymm13
43#define RK2 %ymm14
44#define RK3 %ymm15
45
46#define RK0x %xmm12
47#define RK1x %xmm13
48#define RK2x %xmm14
49#define RK3x %xmm15
50
/*
 * S0_1, S0_2 - first and second half of the S0 substitution, selected by
 * passing S0 as the SBOX argument of S() in __serpent_enc_blk16.
 *
 * x0..x3 are the input registers and are updated in place; x4 is written
 * (in S0_1) before it is ever read, so it carries no input. The caller's
 * next macro invocation names the five registers in their new order.
 * RNOT must be all ones: "vpxor RNOT, a, a" is the bitwise NOT of a.
 * tp is scratch inside S0_1 only.
 * Instruction order is significant (later operations read registers that
 * earlier ones overwrote); S0_2 must follow S0_1 on the same registers.
 */
51#define S0_1(x0, x1, x2, x3, x4) \
52 vpor x0, x3, tp; \
53 vpxor x3, x0, x0; \
54 vpxor x2, x3, x4; \
55 vpxor RNOT, x4, x4; \
56 vpxor x1, tp, x3; \
57 vpand x0, x1, x1; \
58 vpxor x4, x1, x1; \
59 vpxor x0, x2, x2;
60#define S0_2(x0, x1, x2, x3, x4) \
61 vpxor x3, x0, x0; \
62 vpor x0, x4, x4; \
63 vpxor x2, x0, x0; \
64 vpand x1, x2, x2; \
65 vpxor x2, x3, x3; \
66 vpxor RNOT, x1, x1; \
67 vpxor x4, x2, x2; \
68 vpxor x2, x1, x1;
69
/*
 * S1_1, S1_2 - first and second half of the S1 substitution (SBOX argument
 * S1 of S() in __serpent_enc_blk16).
 *
 * x0..x3 are inputs, updated in place; x4 is first written in S1_1 and
 * carries no input. RNOT must be all ones (XOR with it is a bitwise NOT).
 * tp is scratch inside S1_1 only.
 * Instruction order is significant; S1_2 must follow S1_1 on the same
 * registers.
 */
70#define S1_1(x0, x1, x2, x3, x4) \
71 vpxor x0, x1, tp; \
72 vpxor x3, x0, x0; \
73 vpxor RNOT, x3, x3; \
74 vpand tp, x1, x4; \
75 vpor tp, x0, x0; \
76 vpxor x2, x3, x3; \
77 vpxor x3, x0, x0; \
78 vpxor x3, tp, x1;
79#define S1_2(x0, x1, x2, x3, x4) \
80 vpxor x4, x3, x3; \
81 vpor x4, x1, x1; \
82 vpxor x2, x4, x4; \
83 vpand x0, x2, x2; \
84 vpxor x1, x2, x2; \
85 vpor x0, x1, x1; \
86 vpxor RNOT, x0, x0; \
87 vpxor x2, x0, x0; \
88 vpxor x1, x4, x4;
89
/*
 * S2_1, S2_2 - first and second half of the S2 substitution (SBOX argument
 * S2 of S() in __serpent_enc_blk16).
 *
 * x0..x3 are inputs, updated in place; x4 is only written (in S2_2) and
 * carries no input. RNOT must be all ones (XOR with it is a bitwise NOT).
 * tp is live across the two halves: S2_2 starts from the value S2_1 left
 * in it, so nothing that clobbers tp may be placed between them.
 * Instruction order is significant.
 */
90#define S2_1(x0, x1, x2, x3, x4) \
91 vpxor RNOT, x3, x3; \
92 vpxor x0, x1, x1; \
93 vpand x2, x0, tp; \
94 vpxor x3, tp, tp; \
95 vpor x0, x3, x3; \
96 vpxor x1, x2, x2; \
97 vpxor x1, x3, x3; \
98 vpand tp, x1, x1;
99#define S2_2(x0, x1, x2, x3, x4) \
100 vpxor x2, tp, tp; \
101 vpand x3, x2, x2; \
102 vpor x1, x3, x3; \
103 vpxor RNOT, tp, tp; \
104 vpxor tp, x3, x3; \
105 vpxor tp, x0, x4; \
106 vpxor x2, tp, x0; \
107 vpor x2, x1, x1;
108
/*
 * S3_1, S3_2 - first and second half of the S3 substitution (SBOX argument
 * S3 of S() in __serpent_enc_blk16).
 *
 * x0..x3 are inputs, updated in place; x4 is first written in S3_1 and
 * carries no input. This box does not use RNOT.
 * tp is scratch inside S3_1 only.
 * Instruction order is significant; S3_2 must follow S3_1 on the same
 * registers.
 */
109#define S3_1(x0, x1, x2, x3, x4) \
110 vpxor x3, x1, tp; \
111 vpor x0, x3, x3; \
112 vpand x0, x1, x4; \
113 vpxor x2, x0, x0; \
114 vpxor tp, x2, x2; \
115 vpand x3, tp, x1; \
116 vpxor x3, x2, x2; \
117 vpor x4, x0, x0; \
118 vpxor x3, x4, x4;
119#define S3_2(x0, x1, x2, x3, x4) \
120 vpxor x0, x1, x1; \
121 vpand x3, x0, x0; \
122 vpand x4, x3, x3; \
123 vpxor x2, x3, x3; \
124 vpor x1, x4, x4; \
125 vpand x1, x2, x2; \
126 vpxor x3, x4, x4; \
127 vpxor x3, x0, x0; \
128 vpxor x2, x3, x3;
129
/*
 * S4_1, S4_2 - first and second half of the S4 substitution (SBOX argument
 * S4 of S() in __serpent_enc_blk16).
 *
 * x0..x3 are inputs, updated in place; x4 is first written in S4_1 and
 * carries no input. RNOT must be all ones (XOR with it is a bitwise NOT).
 * tp is live across the two halves: S4_2 reads the value S4_1 left in it,
 * so nothing that clobbers tp may be placed between them.
 * Instruction order is significant.
 */
130#define S4_1(x0, x1, x2, x3, x4) \
131 vpand x0, x3, tp; \
132 vpxor x3, x0, x0; \
133 vpxor x2, tp, tp; \
134 vpor x3, x2, x2; \
135 vpxor x1, x0, x0; \
136 vpxor tp, x3, x4; \
137 vpor x0, x2, x2; \
138 vpxor x1, x2, x2;
139#define S4_2(x0, x1, x2, x3, x4) \
140 vpand x0, x1, x1; \
141 vpxor x4, x1, x1; \
142 vpand x2, x4, x4; \
143 vpxor tp, x2, x2; \
144 vpxor x0, x4, x4; \
145 vpor x1, tp, x3; \
146 vpxor RNOT, x1, x1; \
147 vpxor x0, x3, x3;
148
/*
 * S5_1, S5_2 - first and second half of the S5 substitution (SBOX argument
 * S5 of S() in __serpent_enc_blk16).
 *
 * x0..x3 are inputs, updated in place; x4 is first written in S5_1 and
 * carries no input. RNOT must be all ones (XOR with it is a bitwise NOT).
 * tp is scratch inside S5_1 only.
 * Instruction order is significant; S5_2 must follow S5_1 on the same
 * registers.
 */
149#define S5_1(x0, x1, x2, x3, x4) \
150 vpor x0, x1, tp; \
151 vpxor tp, x2, x2; \
152 vpxor RNOT, x3, x3; \
153 vpxor x0, x1, x4; \
154 vpxor x2, x0, x0; \
155 vpand x4, tp, x1; \
156 vpor x3, x4, x4; \
157 vpxor x0, x4, x4;
158#define S5_2(x0, x1, x2, x3, x4) \
159 vpand x3, x0, x0; \
160 vpxor x3, x1, x1; \
161 vpxor x2, x3, x3; \
162 vpxor x1, x0, x0; \
163 vpand x4, x2, x2; \
164 vpxor x2, x1, x1; \
165 vpand x0, x2, x2; \
166 vpxor x2, x3, x3;
167
/*
 * S6_1, S6_2 - first and second half of the S6 substitution (SBOX argument
 * S6 of S() in __serpent_enc_blk16).
 *
 * x0..x3 are inputs, updated in place; x4 is first written in S6_1 and
 * carries no input. RNOT must be all ones (XOR with it is a bitwise NOT).
 * tp is scratch inside S6_1 only.
 * Instruction order is significant; S6_2 must follow S6_1 on the same
 * registers.
 */
168#define S6_1(x0, x1, x2, x3, x4) \
169 vpxor x0, x3, x3; \
170 vpxor x2, x1, tp; \
171 vpxor x0, x2, x2; \
172 vpand x3, x0, x0; \
173 vpor x3, tp, tp; \
174 vpxor RNOT, x1, x4; \
175 vpxor tp, x0, x0; \
176 vpxor x2, tp, x1;
177#define S6_2(x0, x1, x2, x3, x4) \
178 vpxor x4, x3, x3; \
179 vpxor x0, x4, x4; \
180 vpand x0, x2, x2; \
181 vpxor x1, x4, x4; \
182 vpxor x3, x2, x2; \
183 vpand x1, x3, x3; \
184 vpxor x0, x3, x3; \
185 vpxor x2, x1, x1;
186
/*
 * S7_1, S7_2 - first and second half of the S7 substitution (SBOX argument
 * S7 of S() in __serpent_enc_blk16).
 *
 * x0..x3 are inputs, updated in place; x4 is first written in S7_1 and
 * carries no input. RNOT must be all ones (XOR with it is a bitwise NOT).
 * tp is scratch inside S7_1 only.
 * Instruction order is significant; S7_2 must follow S7_1 on the same
 * registers.
 */
187#define S7_1(x0, x1, x2, x3, x4) \
188 vpxor RNOT, x1, tp; \
189 vpxor RNOT, x0, x0; \
190 vpand x2, tp, x1; \
191 vpxor x3, x1, x1; \
192 vpor tp, x3, x3; \
193 vpxor x2, tp, x4; \
194 vpxor x3, x2, x2; \
195 vpxor x0, x3, x3; \
196 vpor x1, x0, x0;
197#define S7_2(x0, x1, x2, x3, x4) \
198 vpand x0, x2, x2; \
199 vpxor x4, x0, x0; \
200 vpxor x3, x4, x4; \
201 vpand x0, x3, x3; \
202 vpxor x1, x4, x4; \
203 vpxor x4, x2, x2; \
204 vpxor x1, x3, x3; \
205 vpor x0, x4, x4; \
206 vpxor x1, x4, x4;
207
/*
 * SI0_1, SI0_2 - first and second half of the SI0 substitution, used only by
 * the decryption path (SBOX argument SI0 of SP()/S() in
 * __serpent_dec_blk16).
 *
 * x0..x3 are inputs, updated in place; x4 is first written in SI0_1 and
 * carries no input. RNOT must be all ones (XOR with it is a bitwise NOT).
 * tp is scratch inside SI0_1 only.
 * Instruction order is significant; SI0_2 must follow SI0_1 on the same
 * registers.
 */
208#define SI0_1(x0, x1, x2, x3, x4) \
209 vpxor x0, x1, x1; \
210 vpor x1, x3, tp; \
211 vpxor x1, x3, x4; \
212 vpxor RNOT, x0, x0; \
213 vpxor tp, x2, x2; \
214 vpxor x0, tp, x3; \
215 vpand x1, x0, x0; \
216 vpxor x2, x0, x0;
217#define SI0_2(x0, x1, x2, x3, x4) \
218 vpand x3, x2, x2; \
219 vpxor x4, x3, x3; \
220 vpxor x3, x2, x2; \
221 vpxor x3, x1, x1; \
222 vpand x0, x3, x3; \
223 vpxor x0, x1, x1; \
224 vpxor x2, x0, x0; \
225 vpxor x3, x4, x4;
226
/*
 * SI1_1, SI1_2 - first and second half of the SI1 substitution, used only by
 * the decryption path (SBOX argument SI1 of SP() in __serpent_dec_blk16).
 *
 * x0..x3 are inputs, updated in place; x4 is first written in SI1_1 and
 * carries no input. RNOT must be all ones (XOR with it is a bitwise NOT).
 * tp is live across the two halves: SI1_2 reads the value SI1_1 left in it,
 * so nothing that clobbers tp may be placed between them.
 * Instruction order is significant.
 */
227#define SI1_1(x0, x1, x2, x3, x4) \
228 vpxor x3, x1, x1; \
229 vpxor x2, x0, tp; \
230 vpxor RNOT, x2, x2; \
231 vpor x1, x0, x4; \
232 vpxor x3, x4, x4; \
233 vpand x1, x3, x3; \
234 vpxor x2, x1, x1; \
235 vpand x4, x2, x2;
236#define SI1_2(x0, x1, x2, x3, x4) \
237 vpxor x1, x4, x4; \
238 vpor x3, x1, x1; \
239 vpxor tp, x3, x3; \
240 vpxor tp, x2, x2; \
241 vpor x4, tp, x0; \
242 vpxor x4, x2, x2; \
243 vpxor x0, x1, x1; \
244 vpxor x1, x4, x4;
245
/*
 * SI2_1, SI2_2 - first and second half of the SI2 substitution, used only by
 * the decryption path (SBOX argument SI2 of SP() in __serpent_dec_blk16).
 *
 * x0..x3 are inputs, updated in place; x4 is first written in SI2_1 and
 * carries no input. RNOT must be all ones (XOR with it is a bitwise NOT).
 * tp is scratch inside SI2_1 only.
 * Instruction order is significant; SI2_2 must follow SI2_1 on the same
 * registers.
 */
246#define SI2_1(x0, x1, x2, x3, x4) \
247 vpxor x1, x2, x2; \
248 vpxor RNOT, x3, tp; \
249 vpor x2, tp, tp; \
250 vpxor x3, x2, x2; \
251 vpxor x0, x3, x4; \
252 vpxor x1, tp, x3; \
253 vpor x2, x1, x1; \
254 vpxor x0, x2, x2;
255#define SI2_2(x0, x1, x2, x3, x4) \
256 vpxor x4, x1, x1; \
257 vpor x3, x4, x4; \
258 vpxor x3, x2, x2; \
259 vpxor x2, x4, x4; \
260 vpand x1, x2, x2; \
261 vpxor x3, x2, x2; \
262 vpxor x4, x3, x3; \
263 vpxor x0, x4, x4;
264
/*
 * SI3_1, SI3_2 - first and second half of the SI3 substitution, used only by
 * the decryption path (SBOX argument SI3 of SP() in __serpent_dec_blk16).
 *
 * x0..x3 are inputs, updated in place; x4 is first written in SI3_1 and
 * carries no input. This box does not use RNOT.
 * tp is scratch inside SI3_1 only.
 * Instruction order is significant; SI3_2 must follow SI3_1 on the same
 * registers.
 */
265#define SI3_1(x0, x1, x2, x3, x4) \
266 vpxor x1, x2, x2; \
267 vpand x2, x1, tp; \
268 vpxor x0, tp, tp; \
269 vpor x1, x0, x0; \
270 vpxor x3, x1, x4; \
271 vpxor x3, x0, x0; \
272 vpor tp, x3, x3; \
273 vpxor x2, tp, x1;
274#define SI3_2(x0, x1, x2, x3, x4) \
275 vpxor x3, x1, x1; \
276 vpxor x2, x0, x0; \
277 vpxor x3, x2, x2; \
278 vpand x1, x3, x3; \
279 vpxor x0, x1, x1; \
280 vpand x2, x0, x0; \
281 vpxor x3, x4, x4; \
282 vpxor x0, x3, x3; \
283 vpxor x1, x0, x0;
284
/*
 * SI4_1, SI4_2 - first and second half of the SI4 substitution, used only by
 * the decryption path (SBOX argument SI4 of SP() in __serpent_dec_blk16).
 *
 * x0..x3 are inputs, updated in place; x4 is first written in SI4_1 and
 * carries no input. RNOT must be all ones (XOR with it is a bitwise NOT).
 * tp is scratch inside SI4_1 only.
 * Instruction order is significant; SI4_2 must follow SI4_1 on the same
 * registers.
 */
285#define SI4_1(x0, x1, x2, x3, x4) \
286 vpxor x3, x2, x2; \
287 vpand x1, x0, tp; \
288 vpxor x2, tp, tp; \
289 vpor x3, x2, x2; \
290 vpxor RNOT, x0, x4; \
291 vpxor tp, x1, x1; \
292 vpxor x2, tp, x0; \
293 vpand x4, x2, x2;
294#define SI4_2(x0, x1, x2, x3, x4) \
295 vpxor x0, x2, x2; \
296 vpor x4, x0, x0; \
297 vpxor x3, x0, x0; \
298 vpand x2, x3, x3; \
299 vpxor x3, x4, x4; \
300 vpxor x1, x3, x3; \
301 vpand x0, x1, x1; \
302 vpxor x1, x4, x4; \
303 vpxor x3, x0, x0;
304
/*
 * SI5_1, SI5_2 - first and second half of the SI5 substitution, used only by
 * the decryption path (SBOX argument SI5 of SP() in __serpent_dec_blk16).
 *
 * x0..x3 are inputs, updated in place; x4 is only written (first in SI5_2)
 * and carries no input. RNOT must be all ones (XOR with it is a bitwise
 * NOT).
 * tp is live across the two halves: SI5_2 reads the value SI5_1 left in it,
 * so nothing that clobbers tp may be placed between them.
 * Instruction order is significant.
 */
305#define SI5_1(x0, x1, x2, x3, x4) \
306 vpor x2, x1, tp; \
307 vpxor x1, x2, x2; \
308 vpxor x3, tp, tp; \
309 vpand x1, x3, x3; \
310 vpxor x3, x2, x2; \
311 vpor x0, x3, x3; \
312 vpxor RNOT, x0, x0; \
313 vpxor x2, x3, x3; \
314 vpor x0, x2, x2;
315#define SI5_2(x0, x1, x2, x3, x4) \
316 vpxor tp, x1, x4; \
317 vpxor x4, x2, x2; \
318 vpand x0, x4, x4; \
319 vpxor tp, x0, x0; \
320 vpxor x3, tp, x1; \
321 vpand x2, x0, x0; \
322 vpxor x3, x2, x2; \
323 vpxor x2, x0, x0; \
324 vpxor x4, x2, x2; \
325 vpxor x3, x4, x4;
326
/*
 * SI6_1, SI6_2 - first and second half of the SI6 substitution, used only by
 * the decryption path (SBOX argument SI6 of SP() in __serpent_dec_blk16).
 *
 * x0..x3 are inputs, updated in place; x4 is only written (in SI6_2) and
 * carries no input. RNOT must be all ones (XOR with it is a bitwise NOT).
 * tp is live across the two halves: SI6_2 starts from the value SI6_1 left
 * in it, so nothing that clobbers tp may be placed between them.
 * Instruction order is significant.
 */
327#define SI6_1(x0, x1, x2, x3, x4) \
328 vpxor x2, x0, x0; \
329 vpand x3, x0, tp; \
330 vpxor x3, x2, x2; \
331 vpxor x2, tp, tp; \
332 vpxor x1, x3, x3; \
333 vpor x0, x2, x2; \
334 vpxor x3, x2, x2; \
335 vpand tp, x3, x3;
336#define SI6_2(x0, x1, x2, x3, x4) \
337 vpxor RNOT, tp, tp; \
338 vpxor x1, x3, x3; \
339 vpand x2, x1, x1; \
340 vpxor tp, x0, x4; \
341 vpxor x4, x3, x3; \
342 vpxor x2, x4, x4; \
343 vpxor x1, tp, x0; \
344 vpxor x0, x2, x2;
345
/*
 * SI7_1, SI7_2 - first and second half of the SI7 substitution, used only by
 * the decryption path (SBOX argument SI7 of SP() in __serpent_dec_blk16).
 *
 * x0..x3 are inputs, updated in place; x4 is first written in SI7_1 and
 * carries no input. RNOT must be all ones (XOR with it is a bitwise NOT).
 * tp is live across the two halves: SI7_2 reads the value SI7_1 left in it,
 * so nothing that clobbers tp may be placed between them.
 * Instruction order is significant.
 */
346#define SI7_1(x0, x1, x2, x3, x4) \
347 vpand x0, x3, tp; \
348 vpxor x2, x0, x0; \
349 vpor x3, x2, x2; \
350 vpxor x1, x3, x4; \
351 vpxor RNOT, x0, x0; \
352 vpor tp, x1, x1; \
353 vpxor x0, x4, x4; \
354 vpand x2, x0, x0; \
355 vpxor x1, x0, x0;
356#define SI7_2(x0, x1, x2, x3, x4) \
357 vpand x2, x1, x1; \
358 vpxor x2, tp, x3; \
359 vpxor x3, x4, x4; \
360 vpand x3, x2, x2; \
361 vpor x0, x3, x3; \
362 vpxor x4, x1, x1; \
363 vpxor x4, x3, x3; \
364 vpand x0, x4, x4; \
365 vpxor x2, x4, x4;
366
/*
 * get_key - broadcast one 32-bit round-key word into every dword lane of t.
 *
 * The word is read from byte offset (4 * i + j) * 4 relative to CTX, i.e.
 * word j (0..3) of the i-th group of four consecutive 32-bit words.
 * NOTE(review): this assumes the expanded key array sits at offset 0 of the
 * structure CTX points to - confirm against the C-side context layout.
 */
367#define get_key(i,j,t) \
368 vpbroadcastd (4*(i)+(j))*4(CTX), t;
369
/*
 * K2 - round-key mixing for both register sets.
 *
 * Broadcasts the four 32-bit words of round key i into RK0..RK3 and XORs
 * them into x0..x3, first for register set 1 and then for register set 2.
 *
 * The register arguments are bare prefixes (RA, RB, ...); "## 1" and "## 2"
 * paste the set number onto them to form the real register names (RA1,
 * RA2, ...), which the preprocessor then expands to %ymm registers.
 *
 * x4 is accepted so that all round macros share one argument shape; it is
 * not used here. Clobbers RK0..RK3.
 */
#define K2(x0, x1, x2, x3, x4, i) \
	get_key(i, 0, RK0); \
	get_key(i, 1, RK1); \
	get_key(i, 2, RK2); \
	get_key(i, 3, RK3); \
	vpxor RK0, x0 ## 1, x0 ## 1; \
	vpxor RK1, x1 ## 1, x1 ## 1; \
	vpxor RK2, x2 ## 1, x2 ## 1; \
	vpxor RK3, x3 ## 1, x3 ## 1; \
		vpxor RK0, x0 ## 2, x0 ## 2; \
		vpxor RK1, x1 ## 2, x1 ## 2; \
		vpxor RK2, x2 ## 2, x2 ## 2; \
		vpxor RK3, x3 ## 2, x3 ## 2;
383
/*
 * LK2 - forward linear transformation followed by mixing in round key i,
 * applied to both register sets (encryption direction).
 *
 * Per 32-bit lane, with x4 as scratch:
 *	x0 = rol(x0, 13);  x2 = rol(x2, 3);
 *	x1 ^= x0 ^ x2;     x3 ^= x2 ^ (x0 << 3);
 *	x1 = rol(x1, 1);   x3 = rol(x3, 7);
 *	x0 ^= x1 ^ x3;     x2 ^= x3 ^ (x1 << 7);
 *	x0 = rol(x0, 5);   x2 = rol(x2, 22);
 *	x0..x3 ^= the four words of round key i
 *
 * AVX2 has no packed rotate, so each rol(v, n) is built from
 * vpslld $n / vpsrld $(32 - n) / vpor, with the left-shifted half parked
 * in x4.
 *
 * The register arguments are bare prefixes (RA, RB, ...); "## 1" / "## 2"
 * select register set 1 or 2. The work for the two sets is interleaved in
 * chunks (double-indented lines belong to set 2), and the get_key() loads
 * are placed between chunks, ahead of the final XORs that consume them.
 *
 * Clobbers x4 of both sets and RK0..RK3.
 */
#define LK2(x0, x1, x2, x3, x4, i) \
	vpslld $13, x0 ## 1, x4 ## 1; \
	vpsrld $(32 - 13), x0 ## 1, x0 ## 1; \
	vpor x4 ## 1, x0 ## 1, x0 ## 1; \
	vpxor x0 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $3, x2 ## 1, x4 ## 1; \
	vpsrld $(32 - 3), x2 ## 1, x2 ## 1; \
	vpor x4 ## 1, x2 ## 1, x2 ## 1; \
	vpxor x2 ## 1, x1 ## 1, x1 ## 1; \
		vpslld $13, x0 ## 2, x4 ## 2; \
		vpsrld $(32 - 13), x0 ## 2, x0 ## 2; \
		vpor x4 ## 2, x0 ## 2, x0 ## 2; \
		vpxor x0 ## 2, x1 ## 2, x1 ## 2; \
		vpslld $3, x2 ## 2, x4 ## 2; \
		vpsrld $(32 - 3), x2 ## 2, x2 ## 2; \
		vpor x4 ## 2, x2 ## 2, x2 ## 2; \
		vpxor x2 ## 2, x1 ## 2, x1 ## 2; \
	vpslld $1, x1 ## 1, x4 ## 1; \
	vpsrld $(32 - 1), x1 ## 1, x1 ## 1; \
	vpor x4 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $3, x0 ## 1, x4 ## 1; \
	vpxor x2 ## 1, x3 ## 1, x3 ## 1; \
	vpxor x4 ## 1, x3 ## 1, x3 ## 1; \
	get_key(i, 1, RK1); \
		vpslld $1, x1 ## 2, x4 ## 2; \
		vpsrld $(32 - 1), x1 ## 2, x1 ## 2; \
		vpor x4 ## 2, x1 ## 2, x1 ## 2; \
		vpslld $3, x0 ## 2, x4 ## 2; \
		vpxor x2 ## 2, x3 ## 2, x3 ## 2; \
		vpxor x4 ## 2, x3 ## 2, x3 ## 2; \
		get_key(i, 3, RK3); \
	vpslld $7, x3 ## 1, x4 ## 1; \
	vpsrld $(32 - 7), x3 ## 1, x3 ## 1; \
	vpor x4 ## 1, x3 ## 1, x3 ## 1; \
	vpslld $7, x1 ## 1, x4 ## 1; \
	vpxor x1 ## 1, x0 ## 1, x0 ## 1; \
	vpxor x3 ## 1, x0 ## 1, x0 ## 1; \
	vpxor x3 ## 1, x2 ## 1, x2 ## 1; \
	vpxor x4 ## 1, x2 ## 1, x2 ## 1; \
	get_key(i, 0, RK0); \
		vpslld $7, x3 ## 2, x4 ## 2; \
		vpsrld $(32 - 7), x3 ## 2, x3 ## 2; \
		vpor x4 ## 2, x3 ## 2, x3 ## 2; \
		vpslld $7, x1 ## 2, x4 ## 2; \
		vpxor x1 ## 2, x0 ## 2, x0 ## 2; \
		vpxor x3 ## 2, x0 ## 2, x0 ## 2; \
		vpxor x3 ## 2, x2 ## 2, x2 ## 2; \
		vpxor x4 ## 2, x2 ## 2, x2 ## 2; \
		get_key(i, 2, RK2); \
	vpxor RK1, x1 ## 1, x1 ## 1; \
	vpxor RK3, x3 ## 1, x3 ## 1; \
	vpslld $5, x0 ## 1, x4 ## 1; \
	vpsrld $(32 - 5), x0 ## 1, x0 ## 1; \
	vpor x4 ## 1, x0 ## 1, x0 ## 1; \
	vpslld $22, x2 ## 1, x4 ## 1; \
	vpsrld $(32 - 22), x2 ## 1, x2 ## 1; \
	vpor x4 ## 1, x2 ## 1, x2 ## 1; \
	vpxor RK0, x0 ## 1, x0 ## 1; \
	vpxor RK2, x2 ## 1, x2 ## 1; \
		vpxor RK1, x1 ## 2, x1 ## 2; \
		vpxor RK3, x3 ## 2, x3 ## 2; \
		vpslld $5, x0 ## 2, x4 ## 2; \
		vpsrld $(32 - 5), x0 ## 2, x0 ## 2; \
		vpor x4 ## 2, x0 ## 2, x0 ## 2; \
		vpslld $22, x2 ## 2, x4 ## 2; \
		vpsrld $(32 - 22), x2 ## 2, x2 ## 2; \
		vpor x4 ## 2, x2 ## 2, x2 ## 2; \
		vpxor RK0, x0 ## 2, x0 ## 2; \
		vpxor RK2, x2 ## 2, x2 ## 2;
453
/*
 * KL2 - round-key mixing followed by the inverse linear transformation,
 * applied to both register sets (decryption direction).
 *
 * Per 32-bit lane, with x4 as scratch:
 *	x0..x3 ^= RK0..RK3
 *	x0 = ror(x0, 5);   x2 = ror(x2, 22);
 *	x2 ^= x3 ^ (x1 << 7);   x0 ^= x1 ^ x3;
 *	x1 = ror(x1, 1);   x3 = ror(x3, 7);
 *	x1 ^= x0 ^ x2;     x3 ^= x2 ^ (x0 << 3);
 *	x0 = ror(x0, 13);  x2 = ror(x2, 3);
 *
 * AVX2 has no packed rotate, so each ror(v, n) is built from
 * vpsrld $n / vpslld $(32 - n) / vpor, with the right-shifted half parked
 * in x4.
 *
 * This macro performs no key loads: RK0..RK3 must already hold the four
 * words of the round key to mix in. In __serpent_dec_blk16 the SP()
 * invocation placed directly before each KL2() loads them. The i argument
 * is therefore unused and only documents which round key is expected.
 *
 * The register arguments are bare prefixes (RA, RB, ...); "## 1" / "## 2"
 * select register set 1 or 2. Double-indented lines belong to set 2.
 * Clobbers x4 of both sets.
 */
#define KL2(x0, x1, x2, x3, x4, i) \
	vpxor RK0, x0 ## 1, x0 ## 1; \
	vpxor RK2, x2 ## 1, x2 ## 1; \
	vpsrld $5, x0 ## 1, x4 ## 1; \
	vpslld $(32 - 5), x0 ## 1, x0 ## 1; \
	vpor x4 ## 1, x0 ## 1, x0 ## 1; \
	vpxor RK3, x3 ## 1, x3 ## 1; \
	vpxor RK1, x1 ## 1, x1 ## 1; \
	vpsrld $22, x2 ## 1, x4 ## 1; \
	vpslld $(32 - 22), x2 ## 1, x2 ## 1; \
	vpor x4 ## 1, x2 ## 1, x2 ## 1; \
	vpxor x3 ## 1, x2 ## 1, x2 ## 1; \
		vpxor RK0, x0 ## 2, x0 ## 2; \
		vpxor RK2, x2 ## 2, x2 ## 2; \
		vpsrld $5, x0 ## 2, x4 ## 2; \
		vpslld $(32 - 5), x0 ## 2, x0 ## 2; \
		vpor x4 ## 2, x0 ## 2, x0 ## 2; \
		vpxor RK3, x3 ## 2, x3 ## 2; \
		vpxor RK1, x1 ## 2, x1 ## 2; \
		vpsrld $22, x2 ## 2, x4 ## 2; \
		vpslld $(32 - 22), x2 ## 2, x2 ## 2; \
		vpor x4 ## 2, x2 ## 2, x2 ## 2; \
		vpxor x3 ## 2, x2 ## 2, x2 ## 2; \
	vpxor x3 ## 1, x0 ## 1, x0 ## 1; \
	vpslld $7, x1 ## 1, x4 ## 1; \
	vpxor x1 ## 1, x0 ## 1, x0 ## 1; \
	vpxor x4 ## 1, x2 ## 1, x2 ## 1; \
	vpsrld $1, x1 ## 1, x4 ## 1; \
	vpslld $(32 - 1), x1 ## 1, x1 ## 1; \
	vpor x4 ## 1, x1 ## 1, x1 ## 1; \
		vpxor x3 ## 2, x0 ## 2, x0 ## 2; \
		vpslld $7, x1 ## 2, x4 ## 2; \
		vpxor x1 ## 2, x0 ## 2, x0 ## 2; \
		vpxor x4 ## 2, x2 ## 2, x2 ## 2; \
		vpsrld $1, x1 ## 2, x4 ## 2; \
		vpslld $(32 - 1), x1 ## 2, x1 ## 2; \
		vpor x4 ## 2, x1 ## 2, x1 ## 2; \
	vpsrld $7, x3 ## 1, x4 ## 1; \
	vpslld $(32 - 7), x3 ## 1, x3 ## 1; \
	vpor x4 ## 1, x3 ## 1, x3 ## 1; \
	vpxor x0 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $3, x0 ## 1, x4 ## 1; \
	vpxor x4 ## 1, x3 ## 1, x3 ## 1; \
		vpsrld $7, x3 ## 2, x4 ## 2; \
		vpslld $(32 - 7), x3 ## 2, x3 ## 2; \
		vpor x4 ## 2, x3 ## 2, x3 ## 2; \
		vpxor x0 ## 2, x1 ## 2, x1 ## 2; \
		vpslld $3, x0 ## 2, x4 ## 2; \
		vpxor x4 ## 2, x3 ## 2, x3 ## 2; \
	vpsrld $13, x0 ## 1, x4 ## 1; \
	vpslld $(32 - 13), x0 ## 1, x0 ## 1; \
	vpor x4 ## 1, x0 ## 1, x0 ## 1; \
	vpxor x2 ## 1, x1 ## 1, x1 ## 1; \
	vpxor x2 ## 1, x3 ## 1, x3 ## 1; \
	vpsrld $3, x2 ## 1, x4 ## 1; \
	vpslld $(32 - 3), x2 ## 1, x2 ## 1; \
	vpor x4 ## 1, x2 ## 1, x2 ## 1; \
		vpsrld $13, x0 ## 2, x4 ## 2; \
		vpslld $(32 - 13), x0 ## 2, x0 ## 2; \
		vpor x4 ## 2, x0 ## 2, x0 ## 2; \
		vpxor x2 ## 2, x1 ## 2, x1 ## 2; \
		vpxor x2 ## 2, x3 ## 2, x3 ## 2; \
		vpsrld $3, x2 ## 2, x4 ## 2; \
		vpslld $(32 - 3), x2 ## 2, x2 ## 2; \
		vpor x4 ## 2, x2 ## 2, x2 ## 2;
519
/*
 * S - apply one S-box to both register sets.
 *
 * SBOX is the S-box name without suffix (S0..S7, SI0..SI7); "## _1" and
 * "## _2" paste on the half suffix to select the two macros that implement
 * it. x0..x4 are bare register prefixes (RA, RB, ...); "## 1" / "## 2"
 * select register set 1 or 2.
 *
 * Both halves run on set 1 before set 2 is started: the halves share the
 * single scratch register tp, and several second halves read the value the
 * first half left there, so the two sets must not be interleaved.
 */
#define S(SBOX, x0, x1, x2, x3, x4) \
	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);
525
/*
 * SP - apply one S-box to both register sets while preloading round key i.
 *
 * Identical to S() as far as the state registers are concerned: SBOX ## _1
 * then SBOX ## _2 on register set 1, then the same pair on set 2. The four
 * get_key() loads for round key i are spread between the S-box halves so
 * that RK0..RK3 are ready for the KL2() that follows in
 * __serpent_dec_blk16, which does no key loads of its own.
 *
 * SBOX is the S-box name without suffix; x0..x4 are bare register prefixes
 * (RA, RB, ...) completed by "## 1" / "## 2". The S-box halves do not touch
 * RK0..RK3, so the loads cannot disturb them. Clobbers RK0..RK3.
 */
#define SP(SBOX, x0, x1, x2, x3, x4, i) \
	get_key(i, 0, RK0); \
	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	get_key(i, 2, RK2); \
	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	get_key(i, 3, RK3); \
	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
	get_key(i, 1, RK1); \
	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);
535
/*
 * transpose_4x4 - transpose a 4x4 matrix of 32-bit words held in x0..x3,
 * independently within each 128-bit lane of the ymm registers.
 *
 * First the dwords of (x0, x1) and (x2, x3) are interleaved, then the
 * resulting qwords are interleaved, leaving the transposed rows in x0..x3.
 * t0, t1 and t2 are clobbered. x3 is reused as a fourth temporary: it is
 * overwritten by the last dword unpack and then consumed by the final two
 * qword unpacks, so the statement order must not be changed.
 */
536#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
537 vpunpckldq x1, x0, t0; \
538 vpunpckhdq x1, x0, t2; \
539 vpunpckldq x3, x2, t1; \
540 vpunpckhdq x3, x2, x3; \
541 \
542 vpunpcklqdq t1, t0, x0; \
543 vpunpckhqdq t1, t0, x1; \
544 vpunpcklqdq x3, t2, x2; \
545 vpunpckhqdq x3, t2, x3;
546
/*
 * read_blocks - convert freshly loaded data into the layout the round
 * macros work on. It is exactly a transpose_4x4 of x0..x3, so afterwards
 * each register holds the same word position of every block in the group.
 * t0..t2 are clobbered.
 */
547#define read_blocks(x0, x1, x2, x3, t0, t1, t2) \
548 transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
549
/*
 * write_blocks - convert the working layout back before the data is stored.
 * It is the same transpose_4x4 as read_blocks (a transpose undoes itself).
 * t0..t2 are clobbered.
 */
550#define write_blocks(x0, x1, x2, x3, t0, t1, t2) \
551 transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
552
/*
 * __serpent_enc_blk16 - file-local encryption core.
 *
 * input:
 *	%rdi (CTX): base address used by get_key() for the round-key loads
 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: data to encrypt (eight ymm
 *	registers, i.e. sixteen 128-bit blocks' worth)
 * output:
 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: result
 * clobbers:
 *	RNOT, tp, RE1, RE2, RK0..RK3
 *
 * Structure: K2 with round key 0, then 31 rounds of S-box (S0..S7 cycling)
 * followed by LK2 with round keys 1..31, then a final S7 followed by K2
 * with round key 32. Each macro call lists the five working registers in
 * the order left behind by the previous one; the permutation has to be
 * followed exactly.
 */
553.align 8
554SYM_FUNC_START_LOCAL(__serpent_enc_blk16)
555
556
557
558
559
560
561
/* RNOT = all ones: every lane compares equal to itself. */
562 vpcmpeqd RNOT, RNOT, RNOT;
563
/* RK0..RK2 are free to serve as temporaries: no key has been loaded yet. */
564 read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
565 read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
566
567 K2(RA, RB, RC, RD, RE, 0);
568 S(S0, RA, RB, RC, RD, RE); LK2(RC, RB, RD, RA, RE, 1);
569 S(S1, RC, RB, RD, RA, RE); LK2(RE, RD, RA, RC, RB, 2);
570 S(S2, RE, RD, RA, RC, RB); LK2(RB, RD, RE, RC, RA, 3);
571 S(S3, RB, RD, RE, RC, RA); LK2(RC, RA, RD, RB, RE, 4);
572 S(S4, RC, RA, RD, RB, RE); LK2(RA, RD, RB, RE, RC, 5);
573 S(S5, RA, RD, RB, RE, RC); LK2(RC, RA, RD, RE, RB, 6);
574 S(S6, RC, RA, RD, RE, RB); LK2(RD, RB, RA, RE, RC, 7);
575 S(S7, RD, RB, RA, RE, RC); LK2(RC, RA, RE, RD, RB, 8);
576 S(S0, RC, RA, RE, RD, RB); LK2(RE, RA, RD, RC, RB, 9);
577 S(S1, RE, RA, RD, RC, RB); LK2(RB, RD, RC, RE, RA, 10);
578 S(S2, RB, RD, RC, RE, RA); LK2(RA, RD, RB, RE, RC, 11);
579 S(S3, RA, RD, RB, RE, RC); LK2(RE, RC, RD, RA, RB, 12);
580 S(S4, RE, RC, RD, RA, RB); LK2(RC, RD, RA, RB, RE, 13);
581 S(S5, RC, RD, RA, RB, RE); LK2(RE, RC, RD, RB, RA, 14);
582 S(S6, RE, RC, RD, RB, RA); LK2(RD, RA, RC, RB, RE, 15);
583 S(S7, RD, RA, RC, RB, RE); LK2(RE, RC, RB, RD, RA, 16);
584 S(S0, RE, RC, RB, RD, RA); LK2(RB, RC, RD, RE, RA, 17);
585 S(S1, RB, RC, RD, RE, RA); LK2(RA, RD, RE, RB, RC, 18);
586 S(S2, RA, RD, RE, RB, RC); LK2(RC, RD, RA, RB, RE, 19);
587 S(S3, RC, RD, RA, RB, RE); LK2(RB, RE, RD, RC, RA, 20);
588 S(S4, RB, RE, RD, RC, RA); LK2(RE, RD, RC, RA, RB, 21);
589 S(S5, RE, RD, RC, RA, RB); LK2(RB, RE, RD, RA, RC, 22);
590 S(S6, RB, RE, RD, RA, RC); LK2(RD, RC, RE, RA, RB, 23);
591 S(S7, RD, RC, RE, RA, RB); LK2(RB, RE, RA, RD, RC, 24);
592 S(S0, RB, RE, RA, RD, RC); LK2(RA, RE, RD, RB, RC, 25);
593 S(S1, RA, RE, RD, RB, RC); LK2(RC, RD, RB, RA, RE, 26);
594 S(S2, RC, RD, RB, RA, RE); LK2(RE, RD, RC, RA, RB, 27);
595 S(S3, RE, RD, RC, RA, RB); LK2(RA, RB, RD, RE, RC, 28);
596 S(S4, RA, RB, RD, RE, RC); LK2(RB, RD, RE, RC, RA, 29);
597 S(S5, RB, RD, RE, RC, RA); LK2(RA, RB, RD, RC, RE, 30);
598 S(S6, RA, RB, RD, RC, RE); LK2(RD, RE, RB, RC, RA, 31);
599 S(S7, RD, RE, RB, RC, RA); K2(RA, RB, RC, RD, RE, 32);
600
/* The key registers are dead again and can be reused as temporaries. */
601 write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
602 write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
603
604 RET;
605SYM_FUNC_END(__serpent_enc_blk16)
606
/*
 * __serpent_dec_blk16 - file-local decryption core.
 *
 * input:
 *	%rdi (CTX): base address used by get_key() for the round-key loads
 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: data to decrypt (eight ymm
 *	registers, i.e. sixteen 128-bit blocks' worth)
 * output:
 *	RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: result - note that this is
 *	NOT the input register set; callers must store from these registers
 * clobbers:
 *	RNOT, tp, RA1, RA2, RK0..RK3
 *
 * Structure: K2 with round key 32, then 31 rounds of SP (S-boxes SI7..SI0
 * cycling downwards, each also preloading the round key named by its last
 * argument) followed by KL2 for round keys 31..1, then a final plain
 * S(SI0) followed by K2 with round key 0. Each macro call lists the five
 * working registers in the order left behind by the previous one; the
 * permutation has to be followed exactly.
 */
607.align 8
608SYM_FUNC_START_LOCAL(__serpent_dec_blk16)
609
610
611
612
613
614
615
/* RNOT = all ones: every lane compares equal to itself. */
616 vpcmpeqd RNOT, RNOT, RNOT;
617
/* RK0..RK2 are free to serve as temporaries: no key has been loaded yet. */
618 read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
619 read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
620
621 K2(RA, RB, RC, RD, RE, 32);
622 SP(SI7, RA, RB, RC, RD, RE, 31); KL2(RB, RD, RA, RE, RC, 31);
623 SP(SI6, RB, RD, RA, RE, RC, 30); KL2(RA, RC, RE, RB, RD, 30);
624 SP(SI5, RA, RC, RE, RB, RD, 29); KL2(RC, RD, RA, RE, RB, 29);
625 SP(SI4, RC, RD, RA, RE, RB, 28); KL2(RC, RA, RB, RE, RD, 28);
626 SP(SI3, RC, RA, RB, RE, RD, 27); KL2(RB, RC, RD, RE, RA, 27);
627 SP(SI2, RB, RC, RD, RE, RA, 26); KL2(RC, RA, RE, RD, RB, 26);
628 SP(SI1, RC, RA, RE, RD, RB, 25); KL2(RB, RA, RE, RD, RC, 25);
629 SP(SI0, RB, RA, RE, RD, RC, 24); KL2(RE, RC, RA, RB, RD, 24);
630 SP(SI7, RE, RC, RA, RB, RD, 23); KL2(RC, RB, RE, RD, RA, 23);
631 SP(SI6, RC, RB, RE, RD, RA, 22); KL2(RE, RA, RD, RC, RB, 22);
632 SP(SI5, RE, RA, RD, RC, RB, 21); KL2(RA, RB, RE, RD, RC, 21);
633 SP(SI4, RA, RB, RE, RD, RC, 20); KL2(RA, RE, RC, RD, RB, 20);
634 SP(SI3, RA, RE, RC, RD, RB, 19); KL2(RC, RA, RB, RD, RE, 19);
635 SP(SI2, RC, RA, RB, RD, RE, 18); KL2(RA, RE, RD, RB, RC, 18);
636 SP(SI1, RA, RE, RD, RB, RC, 17); KL2(RC, RE, RD, RB, RA, 17);
637 SP(SI0, RC, RE, RD, RB, RA, 16); KL2(RD, RA, RE, RC, RB, 16);
638 SP(SI7, RD, RA, RE, RC, RB, 15); KL2(RA, RC, RD, RB, RE, 15);
639 SP(SI6, RA, RC, RD, RB, RE, 14); KL2(RD, RE, RB, RA, RC, 14);
640 SP(SI5, RD, RE, RB, RA, RC, 13); KL2(RE, RC, RD, RB, RA, 13);
641 SP(SI4, RE, RC, RD, RB, RA, 12); KL2(RE, RD, RA, RB, RC, 12);
642 SP(SI3, RE, RD, RA, RB, RC, 11); KL2(RA, RE, RC, RB, RD, 11);
643 SP(SI2, RA, RE, RC, RB, RD, 10); KL2(RE, RD, RB, RC, RA, 10);
644 SP(SI1, RE, RD, RB, RC, RA, 9); KL2(RA, RD, RB, RC, RE, 9);
645 SP(SI0, RA, RD, RB, RC, RE, 8); KL2(RB, RE, RD, RA, RC, 8);
646 SP(SI7, RB, RE, RD, RA, RC, 7); KL2(RE, RA, RB, RC, RD, 7);
647 SP(SI6, RE, RA, RB, RC, RD, 6); KL2(RB, RD, RC, RE, RA, 6);
648 SP(SI5, RB, RD, RC, RE, RA, 5); KL2(RD, RA, RB, RC, RE, 5);
649 SP(SI4, RD, RA, RB, RC, RE, 4); KL2(RD, RB, RE, RC, RA, 4);
650 SP(SI3, RD, RB, RE, RC, RA, 3); KL2(RE, RD, RA, RC, RB, 3);
651 SP(SI2, RE, RD, RA, RC, RB, 2); KL2(RD, RB, RC, RA, RE, 2);
652 SP(SI1, RD, RB, RC, RA, RE, 1); KL2(RE, RB, RC, RA, RD, 1);
/* Last round: plain S() because the K2 that follows loads round key 0 itself. */
653 S(SI0, RE, RB, RC, RA, RD); K2(RC, RD, RB, RE, RA, 0);
654
/* The result lives in RC, RD, RB, RE (in that order) of both sets. */
655 write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2);
656 write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2);
657
658 RET;
659SYM_FUNC_END(__serpent_dec_blk16)
660
/*
 * serpent_ecb_enc_16way - exported entry point: load, encrypt, store.
 *
 * input:
 *	%rdi: passed through untouched to __serpent_enc_blk16, which uses it
 *	      as CTX for the round-key loads
 *	%rsi: handed to store_16way() (destination)
 *	%rdx: handed to load_16way() (source)
 *
 * load_16way()/store_16way() come from glue_helper-asm-avx2.S and
 * FRAME_BEGIN/FRAME_END from <asm/frame.h>; none of them is visible here.
 * NOTE(review): presumably load/store move sixteen consecutive 16-byte
 * blocks between memory and the eight listed ymm registers - confirm
 * against the helper's definition.
 */
661SYM_FUNC_START(serpent_ecb_enc_16way)
662
663
664
665
666
667 FRAME_BEGIN
668
/* Zero the upper ymm halves before starting the AVX2 work. */
669 vzeroupper;
670
671 load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
672
673 call __serpent_enc_blk16;
674
/* The encryption core returns its result in the registers it was given. */
675 store_16way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
676
/* Zero the upper ymm halves again before returning to the caller. */
677 vzeroupper;
678
679 FRAME_END
680 RET;
681SYM_FUNC_END(serpent_ecb_enc_16way)
682
/*
 * serpent_ecb_dec_16way - exported entry point: load, decrypt, store.
 *
 * input:
 *	%rdi: passed through untouched to __serpent_dec_blk16, which uses it
 *	      as CTX for the round-key loads
 *	%rsi: handed to store_16way() (destination)
 *	%rdx: handed to load_16way() (source)
 *
 * load_16way()/store_16way() come from glue_helper-asm-avx2.S and
 * FRAME_BEGIN/FRAME_END from <asm/frame.h>; none of them is visible here.
 * NOTE(review): presumably load/store move sixteen consecutive 16-byte
 * blocks between memory and the eight listed ymm registers - confirm
 * against the helper's definition.
 */
683SYM_FUNC_START(serpent_ecb_dec_16way)
684
685
686
687
688
689 FRAME_BEGIN
690
/* Zero the upper ymm halves before starting the AVX2 work. */
691 vzeroupper;
692
693 load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
694
695 call __serpent_dec_blk16;
696
/* The decryption core returns its result in RC, RD, RB, RE - not RA..RD. */
697 store_16way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
698
/* Zero the upper ymm halves again before returning to the caller. */
699 vzeroupper;
700
701 FRAME_END
702 RET;
703SYM_FUNC_END(serpent_ecb_dec_16way)
704
/*
 * serpent_cbc_dec_16way - exported entry point: load, decrypt, then store
 * through the CBC helper.
 *
 * input:
 *	%rdi: passed through untouched to __serpent_dec_blk16, which uses it
 *	      as CTX for the round-key loads
 *	%rsi: handed to store_cbc_16way() as its second argument (destination)
 *	%rdx: handed to load_16way() and again to store_cbc_16way() (source)
 *
 * load_16way()/store_cbc_16way() come from glue_helper-asm-avx2.S and
 * FRAME_BEGIN/FRAME_END from <asm/frame.h>; none of them is visible here.
 * NOTE(review): store_cbc_16way() receives the source pointer a second
 * time plus RK0 as a scratch register; presumably it XORs each decrypted
 * block with the preceding ciphertext block re-read from the source before
 * storing, and leaves the chaining for the first block to the caller -
 * confirm against the helper's definition.
 */
705SYM_FUNC_START(serpent_cbc_dec_16way)
706
707
708
709
710
711 FRAME_BEGIN
712
/* Zero the upper ymm halves before starting the AVX2 work. */
713 vzeroupper;
714
715 load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
716
717 call __serpent_dec_blk16;
718
/* The decryption core returns its result in RC, RD, RB, RE - not RA..RD. */
719 store_cbc_16way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2,
720 RK0);
721
/* Zero the upper ymm halves again before returning to the caller. */
722 vzeroupper;
723
724 FRAME_END
725 RET;
726SYM_FUNC_END(serpent_cbc_dec_16way)
727