1
2
3
4
5
6
7
8
9
10
11
12
13#include <linux/linkage.h>
14#include "glue_helper-asm-avx2.S"
15
16.file "twofish-avx2-asm_64.S"
17
18.data
19.align 16
20
21.Lvpshufb_mask0:
22.long 0x80808000
23.long 0x80808004
24.long 0x80808008
25.long 0x8080800c
26
27.Lbswap128_mask:
28 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
29.Lxts_gf128mul_and_shl1_mask_0:
30 .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
31.Lxts_gf128mul_and_shl1_mask_1:
32 .byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0
33
34.text
35
36
37#define s0 0
38#define s1 1024
39#define s2 2048
40#define s3 3072
41#define w 4096
42#define k 4128
43
44
45#define CTX %rdi
46
47#define RS0 CTX
48#define RS1 %r8
49#define RS2 %r9
50#define RS3 %r10
51#define RK %r11
52#define RW %rax
53#define RROUND %r12
54#define RROUNDd %r12d
55
56#define RA0 %ymm8
57#define RB0 %ymm9
58#define RC0 %ymm10
59#define RD0 %ymm11
60#define RA1 %ymm12
61#define RB1 %ymm13
62#define RC1 %ymm14
63#define RD1 %ymm15
64
65
66#define RX0 %ymm0
67#define RY0 %ymm1
68#define RX1 %ymm2
69#define RY1 %ymm3
70#define RT0 %ymm4
71#define RIDX %ymm5
72
73#define RX0x %xmm0
74#define RY0x %xmm1
75#define RX1x %xmm2
76#define RY1x %xmm3
77#define RT0x %xmm4
78
79
80#define RNOT %ymm6
81
82
83#define RBYTE %ymm7
84
85
86
87
88#define init_round_constants() \
89 vpcmpeqd RNOT, RNOT, RNOT; \
90 vpsrld $24, RNOT, RBYTE; \
91 leaq k(CTX), RK; \
92 leaq w(CTX), RW; \
93 leaq s1(CTX), RS1; \
94 leaq s2(CTX), RS2; \
95 leaq s3(CTX), RS3; \
96
97#define g16(ab, rs0, rs1, rs2, rs3, xy) \
98 vpand RBYTE, ab
99 vpgatherdd RNOT, (rs0, RIDX, 4), xy
100 vpcmpeqd RNOT, RNOT, RNOT; \
101 \
102 vpand RBYTE, ab
103 vpgatherdd RNOT, (rs0, RIDX, 4), xy
104 vpcmpeqd RNOT, RNOT, RNOT; \
105 \
106 vpsrld $8, ab
107 vpand RBYTE, RIDX, RIDX; \
108 vpgatherdd RNOT, (rs1, RIDX, 4), RT0; \
109 vpcmpeqd RNOT, RNOT, RNOT; \
110 vpxor RT0, xy
111 \
112 vpsrld $8, ab
113 vpand RBYTE, RIDX, RIDX; \
114 vpgatherdd RNOT, (rs1, RIDX, 4), RT0; \
115 vpcmpeqd RNOT, RNOT, RNOT; \
116 vpxor RT0, xy
117 \
118 vpsrld $16, ab
119 vpand RBYTE, RIDX, RIDX; \
120 vpgatherdd RNOT, (rs2, RIDX, 4), RT0; \
121 vpcmpeqd RNOT, RNOT, RNOT; \
122 vpxor RT0, xy
123 \
124 vpsrld $16, ab
125 vpand RBYTE, RIDX, RIDX; \
126 vpgatherdd RNOT, (rs2, RIDX, 4), RT0; \
127 vpcmpeqd RNOT, RNOT, RNOT; \
128 vpxor RT0, xy
129 \
130 vpsrld $24, ab
131 vpgatherdd RNOT, (rs3, RIDX, 4), RT0; \
132 vpcmpeqd RNOT, RNOT, RNOT; \
133 vpxor RT0, xy
134 \
135 vpsrld $24, ab
136 vpgatherdd RNOT, (rs3, RIDX, 4), RT0; \
137 vpcmpeqd RNOT, RNOT, RNOT; \
138 vpxor RT0, xy
139
140#define g1_16(a, x) \
141 g16(a, RS0, RS1, RS2, RS3, x);
142
143#define g2_16(b, y) \
144 g16(b, RS1, RS2, RS3, RS0, y);
145
146#define encrypt_round_end16(a, b, c, d, nk) \
147 vpaddd RY0, RX0, RX0; \
148 vpaddd RX0, RY0, RY0; \
149 vpbroadcastd nk(RK,RROUND,8), RT0; \
150 vpaddd RT0, RX0, RX0; \
151 vpbroadcastd 4+nk(RK,RROUND,8), RT0; \
152 vpaddd RT0, RY0, RY0; \
153 \
154 vpxor RY0, d
155 \
156 vpxor RX0, c
157 vpsrld $1, c
158 vpslld $31, c
159 vpor RT0, c
160 \
161 vpaddd RY1, RX1, RX1; \
162 vpaddd RX1, RY1, RY1; \
163 vpbroadcastd nk(RK,RROUND,8), RT0; \
164 vpaddd RT0, RX1, RX1; \
165 vpbroadcastd 4+nk(RK,RROUND,8), RT0; \
166 vpaddd RT0, RY1, RY1; \
167 \
168 vpxor RY1, d
169 \
170 vpxor RX1, c
171 vpsrld $1, c
172 vpslld $31, c
173 vpor RT0, c
174
175#define encrypt_round16(a, b, c, d, nk) \
176 g2_16(b, RY); \
177 \
178 vpslld $1, b
179 vpsrld $31, b
180 vpor RT0, b
181 \
182 vpslld $1, b
183 vpsrld $31, b
184 vpor RT0, b
185 \
186 g1_16(a, RX); \
187 \
188 encrypt_round_end16(a, b, c, d, nk);
189
190#define encrypt_round_first16(a, b, c, d, nk) \
191 vpslld $1, d
192 vpsrld $31, d
193 vpor RT0, d
194 \
195 vpslld $1, d
196 vpsrld $31, d
197 vpor RT0, d
198 \
199 encrypt_round16(a, b, c, d, nk);
200
201#define encrypt_round_last16(a, b, c, d, nk) \
202 g2_16(b, RY); \
203 \
204 g1_16(a, RX); \
205 \
206 encrypt_round_end16(a, b, c, d, nk);
207
208#define decrypt_round_end16(a, b, c, d, nk) \
209 vpaddd RY0, RX0, RX0; \
210 vpaddd RX0, RY0, RY0; \
211 vpbroadcastd nk(RK,RROUND,8), RT0; \
212 vpaddd RT0, RX0, RX0; \
213 vpbroadcastd 4+nk(RK,RROUND,8), RT0; \
214 vpaddd RT0, RY0, RY0; \
215 \
216 vpxor RX0, c
217 \
218 vpxor RY0, d
219 vpsrld $1, d
220 vpslld $31, d
221 vpor RT0, d
222 \
223 vpaddd RY1, RX1, RX1; \
224 vpaddd RX1, RY1, RY1; \
225 vpbroadcastd nk(RK,RROUND,8), RT0; \
226 vpaddd RT0, RX1, RX1; \
227 vpbroadcastd 4+nk(RK,RROUND,8), RT0; \
228 vpaddd RT0, RY1, RY1; \
229 \
230 vpxor RX1, c
231 \
232 vpxor RY1, d
233 vpsrld $1, d
234 vpslld $31, d
235 vpor RT0, d
236
237#define decrypt_round16(a, b, c, d, nk) \
238 g1_16(a, RX); \
239 \
240 vpslld $1, a
241 vpsrld $31, a
242 vpor RT0, a
243 \
244 vpslld $1, a
245 vpsrld $31, a
246 vpor RT0, a
247 \
248 g2_16(b, RY); \
249 \
250 decrypt_round_end16(a, b, c, d, nk);
251
252#define decrypt_round_first16(a, b, c, d, nk) \
253 vpslld $1, c
254 vpsrld $31, c
255 vpor RT0, c
256 \
257 vpslld $1, c
258 vpsrld $31, c
259 vpor RT0, c
260 \
261 decrypt_round16(a, b, c, d, nk)
262
263#define decrypt_round_last16(a, b, c, d, nk) \
264 g1_16(a, RX); \
265 \
266 g2_16(b, RY); \
267 \
268 decrypt_round_end16(a, b, c, d, nk);
269
270#define encrypt_cycle16() \
271 encrypt_round16(RA, RB, RC, RD, 0); \
272 encrypt_round16(RC, RD, RA, RB, 8);
273
274#define encrypt_cycle_first16() \
275 encrypt_round_first16(RA, RB, RC, RD, 0); \
276 encrypt_round16(RC, RD, RA, RB, 8);
277
278#define encrypt_cycle_last16() \
279 encrypt_round16(RA, RB, RC, RD, 0); \
280 encrypt_round_last16(RC, RD, RA, RB, 8);
281
282#define decrypt_cycle16(n) \
283 decrypt_round16(RC, RD, RA, RB, 8); \
284 decrypt_round16(RA, RB, RC, RD, 0);
285
286#define decrypt_cycle_first16(n) \
287 decrypt_round_first16(RC, RD, RA, RB, 8); \
288 decrypt_round16(RA, RB, RC, RD, 0);
289
290#define decrypt_cycle_last16(n) \
291 decrypt_round16(RC, RD, RA, RB, 8); \
292 decrypt_round_last16(RA, RB, RC, RD, 0);
293
294#define transpose_4x4(x0,x1,x2,x3,t1,t2) \
295 vpunpckhdq x1, x0, t2; \
296 vpunpckldq x1, x0, x0; \
297 \
298 vpunpckldq x3, x2, t1; \
299 vpunpckhdq x3, x2, x2; \
300 \
301 vpunpckhqdq t1, x0, x1; \
302 vpunpcklqdq t1, x0, x0; \
303 \
304 vpunpckhqdq x2, t2, x3; \
305 vpunpcklqdq x2, t2, x2;
306
307#define read_blocks8(offs,a,b,c,d) \
308 transpose_4x4(a, b, c, d, RX0, RY0);
309
310#define write_blocks8(offs,a,b,c,d) \
311 transpose_4x4(a, b, c, d, RX0, RY0);
312
313#define inpack_enc8(a,b,c,d) \
314 vpbroadcastd 4*0(RW), RT0; \
315 vpxor RT0, a, a; \
316 \
317 vpbroadcastd 4*1(RW), RT0; \
318 vpxor RT0, b, b; \
319 \
320 vpbroadcastd 4*2(RW), RT0; \
321 vpxor RT0, c, c; \
322 \
323 vpbroadcastd 4*3(RW), RT0; \
324 vpxor RT0, d, d;
325
326#define outunpack_enc8(a,b,c,d) \
327 vpbroadcastd 4*4(RW), RX0; \
328 vpbroadcastd 4*5(RW), RY0; \
329 vpxor RX0, c, RX0; \
330 vpxor RY0, d, RY0; \
331 \
332 vpbroadcastd 4*6(RW), RT0; \
333 vpxor RT0, a, c; \
334 vpbroadcastd 4*7(RW), RT0; \
335 vpxor RT0, b, d; \
336 \
337 vmovdqa RX0, a; \
338 vmovdqa RY0, b;
339
340#define inpack_dec8(a,b,c,d) \
341 vpbroadcastd 4*4(RW), RX0; \
342 vpbroadcastd 4*5(RW), RY0; \
343 vpxor RX0, a, RX0; \
344 vpxor RY0, b, RY0; \
345 \
346 vpbroadcastd 4*6(RW), RT0; \
347 vpxor RT0, c, a; \
348 vpbroadcastd 4*7(RW), RT0; \
349 vpxor RT0, d, b; \
350 \
351 vmovdqa RX0, c; \
352 vmovdqa RY0, d;
353
354#define outunpack_dec8(a,b,c,d) \
355 vpbroadcastd 4*0(RW), RT0; \
356 vpxor RT0, a, a; \
357 \
358 vpbroadcastd 4*1(RW), RT0; \
359 vpxor RT0, b, b; \
360 \
361 vpbroadcastd 4*2(RW), RT0; \
362 vpxor RT0, c, c; \
363 \
364 vpbroadcastd 4*3(RW), RT0; \
365 vpxor RT0, d, d;
366
367#define read_blocks16(a,b,c,d) \
368 read_blocks8(0, a
369 read_blocks8(8, a
370
371#define write_blocks16(a,b,c,d) \
372 write_blocks8(0, a
373 write_blocks8(8, a
374
375#define xor_blocks16(a,b,c,d) \
376 xor_blocks8(0, a
377 xor_blocks8(8, a
378
379#define inpack_enc16(a,b,c,d) \
380 inpack_enc8(a
381 inpack_enc8(a
382
383#define outunpack_enc16(a,b,c,d) \
384 outunpack_enc8(a
385 outunpack_enc8(a
386
387#define inpack_dec16(a,b,c,d) \
388 inpack_dec8(a
389 inpack_dec8(a
390
391#define outunpack_dec16(a,b,c,d) \
392 outunpack_dec8(a
393 outunpack_dec8(a
394
395.align 8
396__twofish_enc_blk16:
397
398
399
400
401
402
403 init_round_constants();
404
405 read_blocks16(RA, RB, RC, RD);
406 inpack_enc16(RA, RB, RC, RD);
407
408 xorl RROUNDd, RROUNDd;
409 encrypt_cycle_first16();
410 movl $2, RROUNDd;
411
412.align 4
413.L__enc_loop:
414 encrypt_cycle16();
415
416 addl $2, RROUNDd;
417 cmpl $14, RROUNDd;
418 jne .L__enc_loop;
419
420 encrypt_cycle_last16();
421
422 outunpack_enc16(RA, RB, RC, RD);
423 write_blocks16(RA, RB, RC, RD);
424
425 ret;
426ENDPROC(__twofish_enc_blk16)
427
428.align 8
429__twofish_dec_blk16:
430
431
432
433
434
435
436 init_round_constants();
437
438 read_blocks16(RA, RB, RC, RD);
439 inpack_dec16(RA, RB, RC, RD);
440
441 movl $14, RROUNDd;
442 decrypt_cycle_first16();
443 movl $12, RROUNDd;
444
445.align 4
446.L__dec_loop:
447 decrypt_cycle16();
448
449 addl $-2, RROUNDd;
450 jnz .L__dec_loop;
451
452 decrypt_cycle_last16();
453
454 outunpack_dec16(RA, RB, RC, RD);
455 write_blocks16(RA, RB, RC, RD);
456
457 ret;
458ENDPROC(__twofish_dec_blk16)
459
460ENTRY(twofish_ecb_enc_16way)
461
462
463
464
465
466
467 vzeroupper;
468 pushq %r12;
469
470 load_16way(%rdx, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
471
472 call __twofish_enc_blk16;
473
474 store_16way(%rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
475
476 popq %r12;
477 vzeroupper;
478
479 ret;
480ENDPROC(twofish_ecb_enc_16way)
481
482ENTRY(twofish_ecb_dec_16way)
483
484
485
486
487
488
489 vzeroupper;
490 pushq %r12;
491
492 load_16way(%rdx, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
493
494 call __twofish_dec_blk16;
495
496 store_16way(%rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
497
498 popq %r12;
499 vzeroupper;
500
501 ret;
502ENDPROC(twofish_ecb_dec_16way)
503
504ENTRY(twofish_cbc_dec_16way)
505
506
507
508
509
510
511 vzeroupper;
512 pushq %r12;
513
514 load_16way(%rdx, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
515
516 call __twofish_dec_blk16;
517
518 store_cbc_16way(%rdx, %rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1,
519 RX0);
520
521 popq %r12;
522 vzeroupper;
523
524 ret;
525ENDPROC(twofish_cbc_dec_16way)
526
527ENTRY(twofish_ctr_16way)
528
529
530
531
532
533
534
535 vzeroupper;
536 pushq %r12;
537
538 load_ctr_16way(%rcx, .Lbswap128_mask, RA0, RB0, RC0, RD0, RA1, RB1, RC1,
539 RD1, RX0, RX0x, RX1, RX1x, RY0, RY0x, RY1, RY1x, RNOT,
540 RBYTE);
541
542 call __twofish_enc_blk16;
543
544 store_ctr_16way(%rdx, %rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
545
546 popq %r12;
547 vzeroupper;
548
549 ret;
550ENDPROC(twofish_ctr_16way)
551
552.align 8
553twofish_xts_crypt_16way:
554
555
556
557
558
559
560
561
562 vzeroupper;
563 pushq %r12;
564
565 load_xts_16way(%rcx, %rdx, %rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1,
566 RD1, RX0, RX0x, RX1, RX1x, RY0, RY0x, RY1, RY1x, RNOT,
567 .Lxts_gf128mul_and_shl1_mask_0,
568 .Lxts_gf128mul_and_shl1_mask_1);
569
570 call *%r8;
571
572 store_xts_16way(%rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
573
574 popq %r12;
575 vzeroupper;
576
577 ret;
578ENDPROC(twofish_xts_crypt_16way)
579
580ENTRY(twofish_xts_enc_16way)
581
582
583
584
585
586
587 leaq __twofish_enc_blk16, %r8;
588 jmp twofish_xts_crypt_16way;
589ENDPROC(twofish_xts_enc_16way)
590
591ENTRY(twofish_xts_dec_16way)
592
593
594
595
596
597
598 leaq __twofish_dec_blk16, %r8;
599 jmp twofish_xts_crypt_16way;
600ENDPROC(twofish_xts_dec_16way)
601