###############################################################################
# AES-GCM (Galois/Counter Mode) for x86_64 using AES-NI and PCLMULQDQ.
# Two code paths are provided:
#   - aesni_gcm_{precomp,enc,dec}_avx_gen2: AVX encodings, Karatsuba
#     GHASH with precomputed HashKey_i_k constants
#   - aesni_gcm_{precomp,enc,dec}_avx_gen4: AVX2 build with a
#     vpclmulqdq-based (POLY2) GHASH reduction
###############################################################################
122#include <linux/linkage.h>
123#include <asm/inst.h>
124
125
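# Constants: POLY/POLY2 encode the GCM reduction polynomial (POLY2 is the
# form consumed by the vpclmulqdq-based reduction in the AVX2 path),
# TWOONE is used when deriving HashKey = H<<1 mod poly, SHUF_MASK
# byte-reflects a 16-byte block, and ONE/ONEf increment the counter when
# it is held byte-swapped or in its stored byte order, respectively.
# SHIFT_MASK/ALL_F and aad_shift_arr below provide shuffle and AND masks
# for partial blocks and partial AAD.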
126.section .rodata.cst16.POLY, "aM", @progbits, 16
127.align 16
128POLY: .octa 0xC2000000000000000000000000000001
129
130.section .rodata.cst16.POLY2, "aM", @progbits, 16
131.align 16
132POLY2: .octa 0xC20000000000000000000001C2000000
133
134.section .rodata.cst16.TWOONE, "aM", @progbits, 16
135.align 16
136TWOONE: .octa 0x00000001000000000000000000000001
137
138.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
139.align 16
140SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
141
142.section .rodata.cst16.ONE, "aM", @progbits, 16
143.align 16
144ONE: .octa 0x00000000000000000000000000000001
145
146.section .rodata.cst16.ONEf, "aM", @progbits, 16
147.align 16
148ONEf: .octa 0x01000000000000000000000000000000
149
150
151
152.section .rodata, "a", @progbits
153.align 16
154SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
155ALL_F: .octa 0xffffffffffffffffffffffffffffffff
156 .octa 0x00000000000000000000000000000000
157
158.section .rodata
159.align 16
160.type aad_shift_arr, @object
161.size aad_shift_arr, 272
162aad_shift_arr:
163 .octa 0xffffffffffffffffffffffffffffffff
164 .octa 0xffffffffffffffffffffffffffffff0C
165 .octa 0xffffffffffffffffffffffffffff0D0C
166 .octa 0xffffffffffffffffffffffffff0E0D0C
167 .octa 0xffffffffffffffffffffffff0F0E0D0C
168 .octa 0xffffffffffffffffffffff0C0B0A0908
169 .octa 0xffffffffffffffffffff0D0C0B0A0908
170 .octa 0xffffffffffffffffff0E0D0C0B0A0908
171 .octa 0xffffffffffffffff0F0E0D0C0B0A0908
172 .octa 0xffffffffffffff0C0B0A090807060504
173 .octa 0xffffffffffff0D0C0B0A090807060504
174 .octa 0xffffffffff0E0D0C0B0A090807060504
175 .octa 0xffffffff0F0E0D0C0B0A090807060504
176 .octa 0xffffff0C0B0A09080706050403020100
177 .octa 0xffff0D0C0B0A09080706050403020100
178 .octa 0xff0E0D0C0B0A09080706050403020100
179 .octa 0x0F0E0D0C0B0A09080706050403020100
180
181
182.text
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
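# Byte offsets of the precomputed GHASH keys inside the context pointed
# to by arg1.  The eleven AES-128 round keys occupy offsets 16*0..16*10;
# HashKey_i holds HashKey^i (i = 1..8) and HashKey_i_k holds the XOR of
# the high and low qwords of HashKey^i, used by the Karatsuba multiplies
# in the AVX (gen2) path.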
206HashKey = 16*11
207HashKey_2 = 16*12
208HashKey_3 = 16*13
209HashKey_4 = 16*14
210HashKey_5 = 16*15
211HashKey_6 = 16*16
212HashKey_7 = 16*17
213HashKey_8 = 16*18
214HashKey_k = 16*19
215HashKey_2_k = 16*20
216HashKey_3_k = 16*21
217HashKey_4_k = 16*22
218HashKey_5_k = 16*23
219HashKey_6_k = 16*24
220HashKey_7_k = 16*25
221HashKey_8_k = 16*26
222
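# Argument mapping: arg1..arg6 arrive in registers per the x86_64 SysV
# ABI; arg7..arg9 live on the caller's stack and are reached through
# %r14, which is loaded with the post-push %rsp (STACK_OFFSET = 8*4
# accounts for the four saved registers).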
223#define arg1 %rdi
224#define arg2 %rsi
225#define arg3 %rdx
226#define arg4 %rcx
227#define arg5 %r8
228#define arg6 %r9
229#define arg7 STACK_OFFSET+8*1(%r14)
230#define arg8 STACK_OFFSET+8*2(%r14)
231#define arg9 STACK_OFFSET+8*3(%r14)
232
233i = 0
234j = 0
235
236out_order = 0
237in_order = 1
238DEC = 0
239ENC = 1
240
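# define_reg/setreg: with .altmacro, expand the assembly-time counters
# i and j into register names, so reg_i/reg_j name %xmm<i>/%xmm<j>
# inside the .rep blocks below.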
241.macro define_reg r n
242reg_\r = %xmm\n
243.endm
244
245.macro setreg
246.altmacro
247define_reg i %i
248define_reg j %j
249.noaltmacro
250.endm
251
252
253STACK_OFFSET = 8*4
254
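# Local stack frame (64-byte aligned): eight 16-byte scratch slots
# TMP1..TMP8, used to stash blocks between loop iterations and to
# assemble a partial trailing block.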
255TMP1 = 16*0
256TMP2 = 16*1
257TMP3 = 16*2
258TMP4 = 16*3
259TMP5 = 16*4
260TMP6 = 16*5
261TMP7 = 16*6
262TMP8 = 16*7
263
264VARIABLE_OFFSET = 16*8
265
266
267
268
269
270
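# ENCRYPT_SINGLE_BLOCK: AES-128 encrypt the block in \XMM0 in place,
# using the round keys at the start of the context (arg1): whitening
# XOR, nine vaesenc rounds, one vaesenclast.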
271.macro ENCRYPT_SINGLE_BLOCK XMM0
272 vpxor (arg1), \XMM0, \XMM0
273 i = 1
274 setreg
275.rep 9
276 vaesenc 16*i(arg1), \XMM0, \XMM0
277 i = (i+1)
278 setreg
279.endr
280 vaesenclast 16*10(arg1), \XMM0, \XMM0
281.endm
282
283#ifdef CONFIG_AS_AVX
284
285
286
287
288
289
290
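# GHASH_MUL_AVX: multiply \GH by \HK in GF(2^128) and reduce modulo the
# GCM polynomial, leaving the result in \GH.  The 128x128-bit carry-less
# multiply is done Karatsuba-style (three vpclmulqdq), the reduction
# with shifts and XORs.  \T1..\T5 are clobbered.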
291.macro GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5
292
293 vpshufd $0b01001110, \GH, \T2
294 vpshufd $0b01001110, \HK, \T3
295 vpxor \GH , \T2, \T2
296 vpxor \HK , \T3, \T3
297
298 vpclmulqdq $0x11, \HK, \GH, \T1
299 vpclmulqdq $0x00, \HK, \GH, \GH
300 vpclmulqdq $0x00, \T3, \T2, \T2
301 vpxor \GH, \T2,\T2
302 vpxor \T1, \T2,\T2
303
304 vpslldq $8, \T2,\T3
305 vpsrldq $8, \T2,\T2
306 vpxor \T3, \GH, \GH
307 vpxor \T2, \T1, \T1
308
309
310 vpslld $31, \GH, \T2
311 vpslld $30, \GH, \T3
312 vpslld $25, \GH, \T4
313
314 vpxor \T3, \T2, \T2
315 vpxor \T4, \T2, \T2
316
317 vpsrldq $4, \T2, \T5
318
319 vpslldq $12, \T2, \T2
320 vpxor \T2, \GH, \GH
321
322
323
324 vpsrld $1,\GH, \T2
325 vpsrld $2,\GH, \T3
326 vpsrld $7,\GH, \T4
327 vpxor \T3, \T2, \T2
328 vpxor \T4, \T2, \T2
329
330 vpxor \T5, \T2, \T2
331 vpxor \T2, \GH, \GH
332 vpxor \T1, \GH, \GH
333
334
335.endm
336
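# PRECOMPUTE_AVX: given HashKey in \HK, compute HashKey^2..HashKey^8 and
# store them in the context, together with HashKey_k/HashKey_i_k, the
# XOR of the two qwords of each power, for the Karatsuba middle
# products.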
337.macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6
338
339
340 vmovdqa \HK, \T5
341
342 vpshufd $0b01001110, \T5, \T1
343 vpxor \T5, \T1, \T1
344 vmovdqa \T1, HashKey_k(arg1)
345
346 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2
347 vmovdqa \T5, HashKey_2(arg1)
348 vpshufd $0b01001110, \T5, \T1
349 vpxor \T5, \T1, \T1
350 vmovdqa \T1, HashKey_2_k(arg1)
351
352 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2
353 vmovdqa \T5, HashKey_3(arg1)
354 vpshufd $0b01001110, \T5, \T1
355 vpxor \T5, \T1, \T1
356 vmovdqa \T1, HashKey_3_k(arg1)
357
358 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2
359 vmovdqa \T5, HashKey_4(arg1)
360 vpshufd $0b01001110, \T5, \T1
361 vpxor \T5, \T1, \T1
362 vmovdqa \T1, HashKey_4_k(arg1)
363
364 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2
365 vmovdqa \T5, HashKey_5(arg1)
366 vpshufd $0b01001110, \T5, \T1
367 vpxor \T5, \T1, \T1
368 vmovdqa \T1, HashKey_5_k(arg1)
369
370 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2
371 vmovdqa \T5, HashKey_6(arg1)
372 vpshufd $0b01001110, \T5, \T1
373 vpxor \T5, \T1, \T1
374 vmovdqa \T1, HashKey_6_k(arg1)
375
376 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2
377 vmovdqa \T5, HashKey_7(arg1)
378 vpshufd $0b01001110, \T5, \T1
379 vpxor \T5, \T1, \T1
380 vmovdqa \T1, HashKey_7_k(arg1)
381
382 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2
383 vmovdqa \T5, HashKey_8(arg1)
384 vpshufd $0b01001110, \T5, \T1
385 vpxor \T5, \T1, \T1
386 vmovdqa \T1, HashKey_8_k(arg1)
387
388.endm
389
390
391
392
393
394
395
396
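# INITIAL_BLOCKS_AVX: hash the AAD (arg6, length arg7) into the GHASH
# accumulator, load and byte-reflect the counter block from arg5, then
# CTR-encrypt the first \num_initial_blocks (0..7) blocks so that the
# remaining length is a multiple of 128 bytes.  If at least 128 bytes
# remain, the next eight blocks are also encrypted into XMM1..XMM8;
# their GHASH is deferred to the main loop.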
397.macro INITIAL_BLOCKS_AVX num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
398 i = (8-\num_initial_blocks)
399 j = 0
400 setreg
401
402 mov arg6, %r10
403 mov arg7, %r12
404
405
406 mov %r12, %r11
407
408 vpxor reg_j, reg_j, reg_j
409 vpxor reg_i, reg_i, reg_i
410 cmp $16, %r11
411 jl _get_AAD_rest8\@
412_get_AAD_blocks\@:
413 vmovdqu (%r10), reg_i
414 vpshufb SHUF_MASK(%rip), reg_i, reg_i
415 vpxor reg_i, reg_j, reg_j
416 GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6
417 add $16, %r10
418 sub $16, %r12
419 sub $16, %r11
420 cmp $16, %r11
421 jge _get_AAD_blocks\@
422 vmovdqu reg_j, reg_i
423 cmp $0, %r11
424 je _get_AAD_done\@
425
426 vpxor reg_i, reg_i, reg_i
427
428
429
430
431_get_AAD_rest8\@:
432 cmp $4, %r11
433 jle _get_AAD_rest4\@
434 movq (%r10), \T1
435 add $8, %r10
436 sub $8, %r11
437 vpslldq $8, \T1, \T1
438 vpsrldq $8, reg_i, reg_i
439 vpxor \T1, reg_i, reg_i
440 jmp _get_AAD_rest8\@
441_get_AAD_rest4\@:
442 cmp $0, %r11
443 jle _get_AAD_rest0\@
444 mov (%r10), %eax
445 movq %rax, \T1
446 add $4, %r10
447 sub $4, %r11
448 vpslldq $12, \T1, \T1
449 vpsrldq $4, reg_i, reg_i
450 vpxor \T1, reg_i, reg_i
451_get_AAD_rest0\@:
452
453
454
455 movq %r12, %r11
456 salq $4, %r11
457 movdqu aad_shift_arr(%r11), \T1
458 vpshufb \T1, reg_i, reg_i
459_get_AAD_rest_final\@:
460 vpshufb SHUF_MASK(%rip), reg_i, reg_i
461 vpxor reg_j, reg_i, reg_i
462 GHASH_MUL_AVX reg_i, \T2, \T1, \T3, \T4, \T5, \T6
463
464_get_AAD_done\@:
465
466 xor %r11, %r11
467
468
469 mov arg5, %rax
470 vmovdqu (%rax), \CTR
471 vpshufb SHUF_MASK(%rip), \CTR, \CTR
472
473
474 i = (9-\num_initial_blocks)
475 setreg
476.rep \num_initial_blocks
477 vpaddd ONE(%rip), \CTR, \CTR
478 vmovdqa \CTR, reg_i
479 vpshufb SHUF_MASK(%rip), reg_i, reg_i
480 i = (i+1)
481 setreg
482.endr
483
484 vmovdqa (arg1), \T_key
485 i = (9-\num_initial_blocks)
486 setreg
487.rep \num_initial_blocks
488 vpxor \T_key, reg_i, reg_i
489 i = (i+1)
490 setreg
491.endr
492
493 j = 1
494 setreg
495.rep 9
496 vmovdqa 16*j(arg1), \T_key
497 i = (9-\num_initial_blocks)
498 setreg
499.rep \num_initial_blocks
500 vaesenc \T_key, reg_i, reg_i
501 i = (i+1)
502 setreg
503.endr
504
505 j = (j+1)
506 setreg
507.endr
508
509
510 vmovdqa 16*10(arg1), \T_key
511 i = (9-\num_initial_blocks)
512 setreg
513.rep \num_initial_blocks
514 vaesenclast \T_key, reg_i, reg_i
515 i = (i+1)
516 setreg
517.endr
518
519 i = (9-\num_initial_blocks)
520 setreg
521.rep \num_initial_blocks
522 vmovdqu (arg3, %r11), \T1
523 vpxor \T1, reg_i, reg_i
524 vmovdqu reg_i, (arg2 , %r11)
525 add $16, %r11
526.if \ENC_DEC == DEC
527 vmovdqa \T1, reg_i
528.endif
529 vpshufb SHUF_MASK(%rip), reg_i, reg_i
530 i = (i+1)
531 setreg
532.endr
533
534
535 i = (8-\num_initial_blocks)
536 j = (9-\num_initial_blocks)
537 setreg
538
539.rep \num_initial_blocks
540 vpxor reg_i, reg_j, reg_j
541 GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6
542 i = (i+1)
543 j = (j+1)
544 setreg
545.endr
546
547
548 vmovdqa \XMM8, TMP1(%rsp)
549 vmovdqa \XMM8, \T3
550
551 cmp $128, %r13
552 jl _initial_blocks_done\@
553
554
555
556 vpaddd ONE(%rip), \CTR, \CTR
557 vmovdqa \CTR, \XMM1
558 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1
559
560 vpaddd ONE(%rip), \CTR, \CTR
561 vmovdqa \CTR, \XMM2
562 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2
563
564 vpaddd ONE(%rip), \CTR, \CTR
565 vmovdqa \CTR, \XMM3
566 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3
567
568 vpaddd ONE(%rip), \CTR, \CTR
569 vmovdqa \CTR, \XMM4
570 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4
571
572 vpaddd ONE(%rip), \CTR, \CTR
573 vmovdqa \CTR, \XMM5
574 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5
575
576 vpaddd ONE(%rip), \CTR, \CTR
577 vmovdqa \CTR, \XMM6
578 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6
579
580 vpaddd ONE(%rip), \CTR, \CTR
581 vmovdqa \CTR, \XMM7
582 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7
583
584 vpaddd ONE(%rip), \CTR, \CTR
585 vmovdqa \CTR, \XMM8
586 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8
587
588 vmovdqa (arg1), \T_key
589 vpxor \T_key, \XMM1, \XMM1
590 vpxor \T_key, \XMM2, \XMM2
591 vpxor \T_key, \XMM3, \XMM3
592 vpxor \T_key, \XMM4, \XMM4
593 vpxor \T_key, \XMM5, \XMM5
594 vpxor \T_key, \XMM6, \XMM6
595 vpxor \T_key, \XMM7, \XMM7
596 vpxor \T_key, \XMM8, \XMM8
597
598 i = 1
599 setreg
600.rep 9
601 vmovdqa 16*i(arg1), \T_key
602 vaesenc \T_key, \XMM1, \XMM1
603 vaesenc \T_key, \XMM2, \XMM2
604 vaesenc \T_key, \XMM3, \XMM3
605 vaesenc \T_key, \XMM4, \XMM4
606 vaesenc \T_key, \XMM5, \XMM5
607 vaesenc \T_key, \XMM6, \XMM6
608 vaesenc \T_key, \XMM7, \XMM7
609 vaesenc \T_key, \XMM8, \XMM8
610 i = (i+1)
611 setreg
612.endr
613
614
615 vmovdqa 16*i(arg1), \T_key
616 vaesenclast \T_key, \XMM1, \XMM1
617 vaesenclast \T_key, \XMM2, \XMM2
618 vaesenclast \T_key, \XMM3, \XMM3
619 vaesenclast \T_key, \XMM4, \XMM4
620 vaesenclast \T_key, \XMM5, \XMM5
621 vaesenclast \T_key, \XMM6, \XMM6
622 vaesenclast \T_key, \XMM7, \XMM7
623 vaesenclast \T_key, \XMM8, \XMM8
624
625 vmovdqu (arg3, %r11), \T1
626 vpxor \T1, \XMM1, \XMM1
627 vmovdqu \XMM1, (arg2 , %r11)
628 .if \ENC_DEC == DEC
629 vmovdqa \T1, \XMM1
630 .endif
631
632 vmovdqu 16*1(arg3, %r11), \T1
633 vpxor \T1, \XMM2, \XMM2
634 vmovdqu \XMM2, 16*1(arg2 , %r11)
635 .if \ENC_DEC == DEC
636 vmovdqa \T1, \XMM2
637 .endif
638
639 vmovdqu 16*2(arg3, %r11), \T1
640 vpxor \T1, \XMM3, \XMM3
641 vmovdqu \XMM3, 16*2(arg2 , %r11)
642 .if \ENC_DEC == DEC
643 vmovdqa \T1, \XMM3
644 .endif
645
646 vmovdqu 16*3(arg3, %r11), \T1
647 vpxor \T1, \XMM4, \XMM4
648 vmovdqu \XMM4, 16*3(arg2 , %r11)
649 .if \ENC_DEC == DEC
650 vmovdqa \T1, \XMM4
651 .endif
652
653 vmovdqu 16*4(arg3, %r11), \T1
654 vpxor \T1, \XMM5, \XMM5
655 vmovdqu \XMM5, 16*4(arg2 , %r11)
656 .if \ENC_DEC == DEC
657 vmovdqa \T1, \XMM5
658 .endif
659
660 vmovdqu 16*5(arg3, %r11), \T1
661 vpxor \T1, \XMM6, \XMM6
662 vmovdqu \XMM6, 16*5(arg2 , %r11)
663 .if \ENC_DEC == DEC
664 vmovdqa \T1, \XMM6
665 .endif
666
667 vmovdqu 16*6(arg3, %r11), \T1
668 vpxor \T1, \XMM7, \XMM7
669 vmovdqu \XMM7, 16*6(arg2 , %r11)
670 .if \ENC_DEC == DEC
671 vmovdqa \T1, \XMM7
672 .endif
673
674 vmovdqu 16*7(arg3, %r11), \T1
675 vpxor \T1, \XMM8, \XMM8
676 vmovdqu \XMM8, 16*7(arg2 , %r11)
677 .if \ENC_DEC == DEC
678 vmovdqa \T1, \XMM8
679 .endif
680
681 add $128, %r11
682
683 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1
684 vpxor TMP1(%rsp), \XMM1, \XMM1
685 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2
686 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3
687 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4
688 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5
689 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6
690 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7
691 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8
692
693
694
695_initial_blocks_done\@:
696
697.endm
698
699
700
701
702
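# GHASH_8_ENCRYPT_8_PARALLEL_AVX: main loop body.  Encrypts eight fresh
# counter blocks while GHASHing the eight blocks produced by the
# previous iteration (XMM1..XMM8, saved in TMP2..TMP8), interleaving the
# vaesenc rounds with the vpclmulqdq partial products.  \loop_idx picks
# the counter update: in_order adds ONE and re-shuffles (safe across a
# low-byte wrap), out_order adds ONEf to the unshuffled counter.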
703.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
704
705 vmovdqa \XMM1, \T2
706 vmovdqa \XMM2, TMP2(%rsp)
707 vmovdqa \XMM3, TMP3(%rsp)
708 vmovdqa \XMM4, TMP4(%rsp)
709 vmovdqa \XMM5, TMP5(%rsp)
710 vmovdqa \XMM6, TMP6(%rsp)
711 vmovdqa \XMM7, TMP7(%rsp)
712 vmovdqa \XMM8, TMP8(%rsp)
713
714.if \loop_idx == in_order
715 vpaddd ONE(%rip), \CTR, \XMM1
716 vpaddd ONE(%rip), \XMM1, \XMM2
717 vpaddd ONE(%rip), \XMM2, \XMM3
718 vpaddd ONE(%rip), \XMM3, \XMM4
719 vpaddd ONE(%rip), \XMM4, \XMM5
720 vpaddd ONE(%rip), \XMM5, \XMM6
721 vpaddd ONE(%rip), \XMM6, \XMM7
722 vpaddd ONE(%rip), \XMM7, \XMM8
723 vmovdqa \XMM8, \CTR
724
725 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1
726 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2
727 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3
728 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4
729 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5
730 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6
731 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7
732 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8
733.else
734 vpaddd ONEf(%rip), \CTR, \XMM1
735 vpaddd ONEf(%rip), \XMM1, \XMM2
736 vpaddd ONEf(%rip), \XMM2, \XMM3
737 vpaddd ONEf(%rip), \XMM3, \XMM4
738 vpaddd ONEf(%rip), \XMM4, \XMM5
739 vpaddd ONEf(%rip), \XMM5, \XMM6
740 vpaddd ONEf(%rip), \XMM6, \XMM7
741 vpaddd ONEf(%rip), \XMM7, \XMM8
742 vmovdqa \XMM8, \CTR
743.endif
744
745
746
747
748 vmovdqu (arg1), \T1
749 vpxor \T1, \XMM1, \XMM1
750 vpxor \T1, \XMM2, \XMM2
751 vpxor \T1, \XMM3, \XMM3
752 vpxor \T1, \XMM4, \XMM4
753 vpxor \T1, \XMM5, \XMM5
754 vpxor \T1, \XMM6, \XMM6
755 vpxor \T1, \XMM7, \XMM7
756 vpxor \T1, \XMM8, \XMM8
757
758
759
760
761
762
763
764 vmovdqu 16*1(arg1), \T1
765 vaesenc \T1, \XMM1, \XMM1
766 vaesenc \T1, \XMM2, \XMM2
767 vaesenc \T1, \XMM3, \XMM3
768 vaesenc \T1, \XMM4, \XMM4
769 vaesenc \T1, \XMM5, \XMM5
770 vaesenc \T1, \XMM6, \XMM6
771 vaesenc \T1, \XMM7, \XMM7
772 vaesenc \T1, \XMM8, \XMM8
773
774 vmovdqu 16*2(arg1), \T1
775 vaesenc \T1, \XMM1, \XMM1
776 vaesenc \T1, \XMM2, \XMM2
777 vaesenc \T1, \XMM3, \XMM3
778 vaesenc \T1, \XMM4, \XMM4
779 vaesenc \T1, \XMM5, \XMM5
780 vaesenc \T1, \XMM6, \XMM6
781 vaesenc \T1, \XMM7, \XMM7
782 vaesenc \T1, \XMM8, \XMM8
783
784
785
786
787 vmovdqa HashKey_8(arg1), \T5
788 vpclmulqdq $0x11, \T5, \T2, \T4
789 vpclmulqdq $0x00, \T5, \T2, \T7
790
791 vpshufd $0b01001110, \T2, \T6
792 vpxor \T2, \T6, \T6
793
794 vmovdqa HashKey_8_k(arg1), \T5
795 vpclmulqdq $0x00, \T5, \T6, \T6
796
797 vmovdqu 16*3(arg1), \T1
798 vaesenc \T1, \XMM1, \XMM1
799 vaesenc \T1, \XMM2, \XMM2
800 vaesenc \T1, \XMM3, \XMM3
801 vaesenc \T1, \XMM4, \XMM4
802 vaesenc \T1, \XMM5, \XMM5
803 vaesenc \T1, \XMM6, \XMM6
804 vaesenc \T1, \XMM7, \XMM7
805 vaesenc \T1, \XMM8, \XMM8
806
807 vmovdqa TMP2(%rsp), \T1
808 vmovdqa HashKey_7(arg1), \T5
809 vpclmulqdq $0x11, \T5, \T1, \T3
810 vpxor \T3, \T4, \T4
811 vpclmulqdq $0x00, \T5, \T1, \T3
812 vpxor \T3, \T7, \T7
813
814 vpshufd $0b01001110, \T1, \T3
815 vpxor \T1, \T3, \T3
816 vmovdqa HashKey_7_k(arg1), \T5
817 vpclmulqdq $0x10, \T5, \T3, \T3
818 vpxor \T3, \T6, \T6
819
820 vmovdqu 16*4(arg1), \T1
821 vaesenc \T1, \XMM1, \XMM1
822 vaesenc \T1, \XMM2, \XMM2
823 vaesenc \T1, \XMM3, \XMM3
824 vaesenc \T1, \XMM4, \XMM4
825 vaesenc \T1, \XMM5, \XMM5
826 vaesenc \T1, \XMM6, \XMM6
827 vaesenc \T1, \XMM7, \XMM7
828 vaesenc \T1, \XMM8, \XMM8
829
830
831
832 vmovdqa TMP3(%rsp), \T1
833 vmovdqa HashKey_6(arg1), \T5
834 vpclmulqdq $0x11, \T5, \T1, \T3
835 vpxor \T3, \T4, \T4
836 vpclmulqdq $0x00, \T5, \T1, \T3
837 vpxor \T3, \T7, \T7
838
839 vpshufd $0b01001110, \T1, \T3
840 vpxor \T1, \T3, \T3
841 vmovdqa HashKey_6_k(arg1), \T5
842 vpclmulqdq $0x10, \T5, \T3, \T3
843 vpxor \T3, \T6, \T6
844
845 vmovdqu 16*5(arg1), \T1
846 vaesenc \T1, \XMM1, \XMM1
847 vaesenc \T1, \XMM2, \XMM2
848 vaesenc \T1, \XMM3, \XMM3
849 vaesenc \T1, \XMM4, \XMM4
850 vaesenc \T1, \XMM5, \XMM5
851 vaesenc \T1, \XMM6, \XMM6
852 vaesenc \T1, \XMM7, \XMM7
853 vaesenc \T1, \XMM8, \XMM8
854
855 vmovdqa TMP4(%rsp), \T1
856 vmovdqa HashKey_5(arg1), \T5
857 vpclmulqdq $0x11, \T5, \T1, \T3
858 vpxor \T3, \T4, \T4
859 vpclmulqdq $0x00, \T5, \T1, \T3
860 vpxor \T3, \T7, \T7
861
862 vpshufd $0b01001110, \T1, \T3
863 vpxor \T1, \T3, \T3
864 vmovdqa HashKey_5_k(arg1), \T5
865 vpclmulqdq $0x10, \T5, \T3, \T3
866 vpxor \T3, \T6, \T6
867
868 vmovdqu 16*6(arg1), \T1
869 vaesenc \T1, \XMM1, \XMM1
870 vaesenc \T1, \XMM2, \XMM2
871 vaesenc \T1, \XMM3, \XMM3
872 vaesenc \T1, \XMM4, \XMM4
873 vaesenc \T1, \XMM5, \XMM5
874 vaesenc \T1, \XMM6, \XMM6
875 vaesenc \T1, \XMM7, \XMM7
876 vaesenc \T1, \XMM8, \XMM8
877
878
879 vmovdqa TMP5(%rsp), \T1
880 vmovdqa HashKey_4(arg1), \T5
881 vpclmulqdq $0x11, \T5, \T1, \T3
882 vpxor \T3, \T4, \T4
883 vpclmulqdq $0x00, \T5, \T1, \T3
884 vpxor \T3, \T7, \T7
885
886 vpshufd $0b01001110, \T1, \T3
887 vpxor \T1, \T3, \T3
888 vmovdqa HashKey_4_k(arg1), \T5
889 vpclmulqdq $0x10, \T5, \T3, \T3
890 vpxor \T3, \T6, \T6
891
892 vmovdqu 16*7(arg1), \T1
893 vaesenc \T1, \XMM1, \XMM1
894 vaesenc \T1, \XMM2, \XMM2
895 vaesenc \T1, \XMM3, \XMM3
896 vaesenc \T1, \XMM4, \XMM4
897 vaesenc \T1, \XMM5, \XMM5
898 vaesenc \T1, \XMM6, \XMM6
899 vaesenc \T1, \XMM7, \XMM7
900 vaesenc \T1, \XMM8, \XMM8
901
902 vmovdqa TMP6(%rsp), \T1
903 vmovdqa HashKey_3(arg1), \T5
904 vpclmulqdq $0x11, \T5, \T1, \T3
905 vpxor \T3, \T4, \T4
906 vpclmulqdq $0x00, \T5, \T1, \T3
907 vpxor \T3, \T7, \T7
908
909 vpshufd $0b01001110, \T1, \T3
910 vpxor \T1, \T3, \T3
911 vmovdqa HashKey_3_k(arg1), \T5
912 vpclmulqdq $0x10, \T5, \T3, \T3
913 vpxor \T3, \T6, \T6
914
915
916 vmovdqu 16*8(arg1), \T1
917 vaesenc \T1, \XMM1, \XMM1
918 vaesenc \T1, \XMM2, \XMM2
919 vaesenc \T1, \XMM3, \XMM3
920 vaesenc \T1, \XMM4, \XMM4
921 vaesenc \T1, \XMM5, \XMM5
922 vaesenc \T1, \XMM6, \XMM6
923 vaesenc \T1, \XMM7, \XMM7
924 vaesenc \T1, \XMM8, \XMM8
925
926 vmovdqa TMP7(%rsp), \T1
927 vmovdqa HashKey_2(arg1), \T5
928 vpclmulqdq $0x11, \T5, \T1, \T3
929 vpxor \T3, \T4, \T4
930 vpclmulqdq $0x00, \T5, \T1, \T3
931 vpxor \T3, \T7, \T7
932
933 vpshufd $0b01001110, \T1, \T3
934 vpxor \T1, \T3, \T3
935 vmovdqa HashKey_2_k(arg1), \T5
936 vpclmulqdq $0x10, \T5, \T3, \T3
937 vpxor \T3, \T6, \T6
938
939
940
941 vmovdqu 16*9(arg1), \T5
942 vaesenc \T5, \XMM1, \XMM1
943 vaesenc \T5, \XMM2, \XMM2
944 vaesenc \T5, \XMM3, \XMM3
945 vaesenc \T5, \XMM4, \XMM4
946 vaesenc \T5, \XMM5, \XMM5
947 vaesenc \T5, \XMM6, \XMM6
948 vaesenc \T5, \XMM7, \XMM7
949 vaesenc \T5, \XMM8, \XMM8
950
951 vmovdqa TMP8(%rsp), \T1
952 vmovdqa HashKey(arg1), \T5
953 vpclmulqdq $0x11, \T5, \T1, \T3
954 vpxor \T3, \T4, \T4
955 vpclmulqdq $0x00, \T5, \T1, \T3
956 vpxor \T3, \T7, \T7
957
958 vpshufd $0b01001110, \T1, \T3
959 vpxor \T1, \T3, \T3
960 vmovdqa HashKey_k(arg1), \T5
961 vpclmulqdq $0x10, \T5, \T3, \T3
962 vpxor \T3, \T6, \T6
963
964 vpxor \T4, \T6, \T6
965 vpxor \T7, \T6, \T6
966
967 vmovdqu 16*10(arg1), \T5
968
969 i = 0
970 j = 1
971 setreg
972.rep 8
973 vpxor 16*i(arg3, %r11), \T5, \T2
974 .if \ENC_DEC == ENC
975 vaesenclast \T2, reg_j, reg_j
976 .else
977 vaesenclast \T2, reg_j, \T3
978 vmovdqu 16*i(arg3, %r11), reg_j
979 vmovdqu \T3, 16*i(arg2, %r11)
980 .endif
981 i = (i+1)
982 j = (j+1)
983 setreg
984.endr
985
986
987
988 vpslldq $8, \T6, \T3
989 vpsrldq $8, \T6, \T6
990 vpxor \T3, \T7, \T7
991 vpxor \T4, \T6, \T6
992
993
994
995
996
997
998 vpslld $31, \T7, \T2
999 vpslld $30, \T7, \T3
1000 vpslld $25, \T7, \T4
1001
1002 vpxor \T3, \T2, \T2
1003 vpxor \T4, \T2, \T2
1004
1005 vpsrldq $4, \T2, \T1
1006
1007 vpslldq $12, \T2, \T2
1008 vpxor \T2, \T7, \T7
1009
1010 .if \ENC_DEC == ENC
1011 vmovdqu \XMM1, 16*0(arg2,%r11)
1012 vmovdqu \XMM2, 16*1(arg2,%r11)
1013 vmovdqu \XMM3, 16*2(arg2,%r11)
1014 vmovdqu \XMM4, 16*3(arg2,%r11)
1015 vmovdqu \XMM5, 16*4(arg2,%r11)
1016 vmovdqu \XMM6, 16*5(arg2,%r11)
1017 vmovdqu \XMM7, 16*6(arg2,%r11)
1018 vmovdqu \XMM8, 16*7(arg2,%r11)
1019 .endif
1020
1021
1022
1023 vpsrld $1, \T7, \T2
1024 vpsrld $2, \T7, \T3
1025 vpsrld $7, \T7, \T4
1026 vpxor \T3, \T2, \T2
1027 vpxor \T4, \T2, \T2
1028
1029 vpxor \T1, \T2, \T2
1030 vpxor \T2, \T7, \T7
1031 vpxor \T7, \T6, \T6
1032
1033
1034 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1
1035 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2
1036 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3
1037 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4
1038 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5
1039 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6
1040 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7
1041 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8
1042
1043
1044 vpxor \T6, \XMM1, \XMM1
1045
1046
1047
1048.endm
1049
1050
1051
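# GHASH_LAST_8_AVX: fold the final eight ciphertext blocks (XMM1..XMM8)
# into the GHASH accumulator using HashKey^8..HashKey^1 and perform the
# final reduction.  The result is left in \T6.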
1052.macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
1053
1054
1055
1056
1057 vpshufd $0b01001110, \XMM1, \T2
1058 vpxor \XMM1, \T2, \T2
1059 vmovdqa HashKey_8(arg1), \T5
1060 vpclmulqdq $0x11, \T5, \XMM1, \T6
1061 vpclmulqdq $0x00, \T5, \XMM1, \T7
1062
1063 vmovdqa HashKey_8_k(arg1), \T3
1064 vpclmulqdq $0x00, \T3, \T2, \XMM1
1065
1066
1067
1068 vpshufd $0b01001110, \XMM2, \T2
1069 vpxor \XMM2, \T2, \T2
1070 vmovdqa HashKey_7(arg1), \T5
1071 vpclmulqdq $0x11, \T5, \XMM2, \T4
1072 vpxor \T4, \T6, \T6
1073
1074 vpclmulqdq $0x00, \T5, \XMM2, \T4
1075 vpxor \T4, \T7, \T7
1076
1077 vmovdqa HashKey_7_k(arg1), \T3
1078 vpclmulqdq $0x00, \T3, \T2, \T2
1079 vpxor \T2, \XMM1, \XMM1
1080
1081
1082
1083 vpshufd $0b01001110, \XMM3, \T2
1084 vpxor \XMM3, \T2, \T2
1085 vmovdqa HashKey_6(arg1), \T5
1086 vpclmulqdq $0x11, \T5, \XMM3, \T4
1087 vpxor \T4, \T6, \T6
1088
1089 vpclmulqdq $0x00, \T5, \XMM3, \T4
1090 vpxor \T4, \T7, \T7
1091
1092 vmovdqa HashKey_6_k(arg1), \T3
1093 vpclmulqdq $0x00, \T3, \T2, \T2
1094 vpxor \T2, \XMM1, \XMM1
1095
1096
1097
1098 vpshufd $0b01001110, \XMM4, \T2
1099 vpxor \XMM4, \T2, \T2
1100 vmovdqa HashKey_5(arg1), \T5
1101 vpclmulqdq $0x11, \T5, \XMM4, \T4
1102 vpxor \T4, \T6, \T6
1103
1104 vpclmulqdq $0x00, \T5, \XMM4, \T4
1105 vpxor \T4, \T7, \T7
1106
1107 vmovdqa HashKey_5_k(arg1), \T3
1108 vpclmulqdq $0x00, \T3, \T2, \T2
1109 vpxor \T2, \XMM1, \XMM1
1110
1111
1112
1113 vpshufd $0b01001110, \XMM5, \T2
1114 vpxor \XMM5, \T2, \T2
1115 vmovdqa HashKey_4(arg1), \T5
1116 vpclmulqdq $0x11, \T5, \XMM5, \T4
1117 vpxor \T4, \T6, \T6
1118
1119 vpclmulqdq $0x00, \T5, \XMM5, \T4
1120 vpxor \T4, \T7, \T7
1121
1122 vmovdqa HashKey_4_k(arg1), \T3
1123 vpclmulqdq $0x00, \T3, \T2, \T2
1124 vpxor \T2, \XMM1, \XMM1
1125
1126
1127
1128 vpshufd $0b01001110, \XMM6, \T2
1129 vpxor \XMM6, \T2, \T2
1130 vmovdqa HashKey_3(arg1), \T5
1131 vpclmulqdq $0x11, \T5, \XMM6, \T4
1132 vpxor \T4, \T6, \T6
1133
1134 vpclmulqdq $0x00, \T5, \XMM6, \T4
1135 vpxor \T4, \T7, \T7
1136
1137 vmovdqa HashKey_3_k(arg1), \T3
1138 vpclmulqdq $0x00, \T3, \T2, \T2
1139 vpxor \T2, \XMM1, \XMM1
1140
1141
1142
1143 vpshufd $0b01001110, \XMM7, \T2
1144 vpxor \XMM7, \T2, \T2
1145 vmovdqa HashKey_2(arg1), \T5
1146 vpclmulqdq $0x11, \T5, \XMM7, \T4
1147 vpxor \T4, \T6, \T6
1148
1149 vpclmulqdq $0x00, \T5, \XMM7, \T4
1150 vpxor \T4, \T7, \T7
1151
1152 vmovdqa HashKey_2_k(arg1), \T3
1153 vpclmulqdq $0x00, \T3, \T2, \T2
1154 vpxor \T2, \XMM1, \XMM1
1155
1156
1157
1158 vpshufd $0b01001110, \XMM8, \T2
1159 vpxor \XMM8, \T2, \T2
1160 vmovdqa HashKey(arg1), \T5
1161 vpclmulqdq $0x11, \T5, \XMM8, \T4
1162 vpxor \T4, \T6, \T6
1163
1164 vpclmulqdq $0x00, \T5, \XMM8, \T4
1165 vpxor \T4, \T7, \T7
1166
1167 vmovdqa HashKey_k(arg1), \T3
1168 vpclmulqdq $0x00, \T3, \T2, \T2
1169
1170 vpxor \T2, \XMM1, \XMM1
1171 vpxor \T6, \XMM1, \XMM1
1172 vpxor \T7, \XMM1, \T2
1173
1174
1175
1176
1177 vpslldq $8, \T2, \T4
1178 vpsrldq $8, \T2, \T2
1179
1180 vpxor \T4, \T7, \T7
1181 vpxor \T2, \T6, \T6
1182
1183
1184
1185
1186 vpslld $31, \T7, \T2
1187 vpslld $30, \T7, \T3
1188 vpslld $25, \T7, \T4
1189
1190 vpxor \T3, \T2, \T2
1191 vpxor \T4, \T2, \T2
1192
1193 vpsrldq $4, \T2, \T1
1194
1195 vpslldq $12, \T2, \T2
1196 vpxor \T2, \T7, \T7
1197
1198
1199
1200
1201 vpsrld $1, \T7, \T2
1202 vpsrld $2, \T7, \T3
1203 vpsrld $7, \T7, \T4
1204 vpxor \T3, \T2, \T2
1205 vpxor \T4, \T2, \T2
1206
1207 vpxor \T1, \T2, \T2
1208 vpxor \T2, \T7, \T7
1209 vpxor \T7, \T6, \T6
1210
1211.endm
1212
1213
1214
1215
1216
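# GCM_ENC_DEC_AVX: full GCM encrypt/decrypt for the AVX (gen2) path.
# Sets up an aligned stack frame, handles 0..7 initial blocks, runs the
# eight-block parallel loop, processes a partial trailing block, folds
# the AAD and message bit-lengths into GHASH, and finally encrypts the
# counter block from arg5 to produce the tag, writing arg9 bytes of it
# to arg8.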
1217.macro GCM_ENC_DEC_AVX ENC_DEC
1218
1219
1220 push %r12
1221 push %r13
1222 push %r14
1223 push %r15
1224
1225 mov %rsp, %r14
1226
1227
1228
1229
1230 sub $VARIABLE_OFFSET, %rsp
1231 and $~63, %rsp
1232
1233
1234 vmovdqu HashKey(arg1), %xmm13
1235
1236 mov arg4, %r13
1237 and $-16, %r13
1238
1239 mov %r13, %r12
1240 shr $4, %r12
1241 and $7, %r12
1242 jz _initial_num_blocks_is_0\@
1243
1244 cmp $7, %r12
1245 je _initial_num_blocks_is_7\@
1246 cmp $6, %r12
1247 je _initial_num_blocks_is_6\@
1248 cmp $5, %r12
1249 je _initial_num_blocks_is_5\@
1250 cmp $4, %r12
1251 je _initial_num_blocks_is_4\@
1252 cmp $3, %r12
1253 je _initial_num_blocks_is_3\@
1254 cmp $2, %r12
1255 je _initial_num_blocks_is_2\@
1256
1257 jmp _initial_num_blocks_is_1\@
1258
1259_initial_num_blocks_is_7\@:
1260 INITIAL_BLOCKS_AVX 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1261 sub $16*7, %r13
1262 jmp _initial_blocks_encrypted\@
1263
1264_initial_num_blocks_is_6\@:
1265 INITIAL_BLOCKS_AVX 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1266 sub $16*6, %r13
1267 jmp _initial_blocks_encrypted\@
1268
1269_initial_num_blocks_is_5\@:
1270 INITIAL_BLOCKS_AVX 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1271 sub $16*5, %r13
1272 jmp _initial_blocks_encrypted\@
1273
1274_initial_num_blocks_is_4\@:
1275 INITIAL_BLOCKS_AVX 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1276 sub $16*4, %r13
1277 jmp _initial_blocks_encrypted\@
1278
1279_initial_num_blocks_is_3\@:
1280 INITIAL_BLOCKS_AVX 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1281 sub $16*3, %r13
1282 jmp _initial_blocks_encrypted\@
1283
1284_initial_num_blocks_is_2\@:
1285 INITIAL_BLOCKS_AVX 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1286 sub $16*2, %r13
1287 jmp _initial_blocks_encrypted\@
1288
1289_initial_num_blocks_is_1\@:
1290 INITIAL_BLOCKS_AVX 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1291 sub $16*1, %r13
1292 jmp _initial_blocks_encrypted\@
1293
1294_initial_num_blocks_is_0\@:
1295 INITIAL_BLOCKS_AVX 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1296
1297
1298_initial_blocks_encrypted\@:
1299 cmp $0, %r13
1300 je _zero_cipher_left\@
1301
1302 sub $128, %r13
1303 je _eight_cipher_left\@
1304
1305
1306
1307
1308 vmovd %xmm9, %r15d
1309 and $255, %r15d
1310 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1311
1312
1313_encrypt_by_8_new\@:
1314 cmp $(255-8), %r15d
1315 jg _encrypt_by_8\@
1316
1317
1318
1319 add $8, %r15b
1320 GHASH_8_ENCRYPT_8_PARALLEL_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
1321 add $128, %r11
1322 sub $128, %r13
1323 jne _encrypt_by_8_new\@
1324
1325 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1326 jmp _eight_cipher_left\@
1327
1328_encrypt_by_8\@:
1329 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1330 add $8, %r15b
1331 GHASH_8_ENCRYPT_8_PARALLEL_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
1332 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1333 add $128, %r11
1334 sub $128, %r13
1335 jne _encrypt_by_8_new\@
1336
1337 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1338
1339
1340
1341
1342_eight_cipher_left\@:
1343 GHASH_LAST_8_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
1344
1345
1346_zero_cipher_left\@:
1347 cmp $16, arg4
1348 jl _only_less_than_16\@
1349
1350 mov arg4, %r13
1351 and $15, %r13
1352
1353 je _multiple_of_16_bytes\@
1354
1355
1356
1357
1358 vpaddd ONE(%rip), %xmm9, %xmm9
1359 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1360 ENCRYPT_SINGLE_BLOCK %xmm9
1361
1362 sub $16, %r11
1363 add %r13, %r11
1364 vmovdqu (arg3, %r11), %xmm1
1365
1366 lea SHIFT_MASK+16(%rip), %r12
1367 sub %r13, %r12
1368
1369
1370 vmovdqu (%r12), %xmm2
1371 vpshufb %xmm2, %xmm1, %xmm1
1372 jmp _final_ghash_mul\@
1373
1374_only_less_than_16\@:
1375
1376 mov arg4, %r13
1377 and $15, %r13
1378
1379 je _multiple_of_16_bytes\@
1380
1381
1382
1383
1384 vpaddd ONE(%rip), %xmm9, %xmm9
1385 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1386 ENCRYPT_SINGLE_BLOCK %xmm9
1387
1388
1389 lea SHIFT_MASK+16(%rip), %r12
1390 sub %r13, %r12
1391
1392
1393
1394_get_last_16_byte_loop\@:
1395 movb (arg3, %r11), %al
1396 movb %al, TMP1 (%rsp , %r11)
1397 add $1, %r11
1398 cmp %r13, %r11
1399 jne _get_last_16_byte_loop\@
1400
1401 vmovdqu TMP1(%rsp), %xmm1
1402
1403 sub $16, %r11
1404
1405_final_ghash_mul\@:
1406 .if \ENC_DEC == DEC
1407 vmovdqa %xmm1, %xmm2
1408 vpxor %xmm1, %xmm9, %xmm9
1409 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
1410
1411 vpand %xmm1, %xmm9, %xmm9
1412 vpand %xmm1, %xmm2, %xmm2
1413 vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
1414 vpxor %xmm2, %xmm14, %xmm14
1415
1416 GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
1417 sub %r13, %r11
1418 add $16, %r11
1419 .else
1420 vpxor %xmm1, %xmm9, %xmm9
1421 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
1422
1423 vpand %xmm1, %xmm9, %xmm9
1424 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1425 vpxor %xmm9, %xmm14, %xmm14
1426
1427 GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
1428 sub %r13, %r11
1429 add $16, %r11
1430 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1431 .endif
1432
1433
1434
1435
1436 vmovq %xmm9, %rax
1437 cmp $8, %r13
1438 jle _less_than_8_bytes_left\@
1439
1440 mov %rax, (arg2 , %r11)
1441 add $8, %r11
1442 vpsrldq $8, %xmm9, %xmm9
1443 vmovq %xmm9, %rax
1444 sub $8, %r13
1445
1446_less_than_8_bytes_left\@:
1447 movb %al, (arg2 , %r11)
1448 add $1, %r11
1449 shr $8, %rax
1450 sub $1, %r13
1451 jne _less_than_8_bytes_left\@
1452
1453
1454_multiple_of_16_bytes\@:
1455 mov arg7, %r12
1456 shl $3, %r12
1457 vmovd %r12d, %xmm15
1458
1459 shl $3, arg4
1460 vmovq arg4, %xmm1
1461 vpslldq $8, %xmm15, %xmm15
1462 vpxor %xmm1, %xmm15, %xmm15
1463
1464 vpxor %xmm15, %xmm14, %xmm14
1465 GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
1466 vpshufb SHUF_MASK(%rip), %xmm14, %xmm14
1467
1468 mov arg5, %rax
1469 vmovdqu (%rax), %xmm9
1470
1471 ENCRYPT_SINGLE_BLOCK %xmm9
1472
1473 vpxor %xmm14, %xmm9, %xmm9
1474
1475
1476
1477_return_T\@:
1478 mov arg8, %r10
1479 mov arg9, %r11
1480
1481 cmp $16, %r11
1482 je _T_16\@
1483
1484 cmp $8, %r11
1485 jl _T_4\@
1486
1487_T_8\@:
1488 vmovq %xmm9, %rax
1489 mov %rax, (%r10)
1490 add $8, %r10
1491 sub $8, %r11
1492 vpsrldq $8, %xmm9, %xmm9
1493 cmp $0, %r11
1494 je _return_T_done\@
1495_T_4\@:
1496 vmovd %xmm9, %eax
1497 mov %eax, (%r10)
1498 add $4, %r10
1499 sub $4, %r11
1500 vpsrldq $4, %xmm9, %xmm9
1501 cmp $0, %r11
1502 je _return_T_done\@
1503_T_123\@:
1504 vmovd %xmm9, %eax
1505 cmp $2, %r11
1506 jl _T_1\@
1507 mov %ax, (%r10)
1508 cmp $2, %r11
1509 je _return_T_done\@
1510 add $2, %r10
1511 sar $16, %eax
1512_T_1\@:
1513 mov %al, (%r10)
1514 jmp _return_T_done\@
1515
1516_T_16\@:
1517 vmovdqu %xmm9, (%r10)
1518
1519_return_T_done\@:
1520 mov %r14, %rsp
1521
1522 pop %r15
1523 pop %r14
1524 pop %r13
1525 pop %r12
1526.endm
1527
1528
1529
1530
1531
1532
1533
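# void aesni_gcm_precomp_avx_gen2(void *my_ctx_data, u8 *hash_subkey)
# (assumed prototype, inferred from how the arguments are used below)
#
# Derives HashKey = H<<1 mod poly from the raw hash subkey at arg2,
# stores it in the context at arg1 and precomputes the HashKey powers
# and Karatsuba constants via PRECOMPUTE_AVX.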
1534ENTRY(aesni_gcm_precomp_avx_gen2)
1535
1536 push %r12
1537 push %r13
1538 push %r14
1539 push %r15
1540
1541 mov %rsp, %r14
1542
1543
1544
1545 sub $VARIABLE_OFFSET, %rsp
1546 and $~63, %rsp
1547
1548 vmovdqu (arg2), %xmm6
1549
1550 vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
1551
1552 vmovdqa %xmm6, %xmm2
1553 vpsllq $1, %xmm6, %xmm6
1554 vpsrlq $63, %xmm2, %xmm2
1555 vmovdqa %xmm2, %xmm1
1556 vpslldq $8, %xmm2, %xmm2
1557 vpsrldq $8, %xmm1, %xmm1
1558 vpor %xmm2, %xmm6, %xmm6
1559
1560 vpshufd $0b00100100, %xmm1, %xmm2
1561 vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
1562 vpand POLY(%rip), %xmm2, %xmm2
1563 vpxor %xmm2, %xmm6, %xmm6
1564
1565 vmovdqa %xmm6, HashKey(arg1)
1566
1567
1568 PRECOMPUTE_AVX %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
1569
1570 mov %r14, %rsp
1571
1572 pop %r15
1573 pop %r14
1574 pop %r13
1575 pop %r12
1576 ret
1577ENDPROC(aesni_gcm_precomp_avx_gen2)
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
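# void aesni_gcm_enc_avx_gen2(void *ctx, u8 *out, const u8 *in,
#               unsigned long plaintext_len, u8 *iv, const u8 *aad,
#               unsigned long aad_len, u8 *auth_tag,
#               unsigned long auth_tag_len)
# (assumed prototype; the parameter order matches arg1..arg9 above)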
1595ENTRY(aesni_gcm_enc_avx_gen2)
1596 GCM_ENC_DEC_AVX ENC
1597 ret
1598ENDPROC(aesni_gcm_enc_avx_gen2)
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
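# Same interface as aesni_gcm_enc_avx_gen2, but the input is ciphertext:
# GHASH is taken over the ciphertext and the output is plaintext.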
1616ENTRY(aesni_gcm_dec_avx_gen2)
1617 GCM_ENC_DEC_AVX DEC
1618 ret
1619ENDPROC(aesni_gcm_dec_avx_gen2)
1620#endif
1621
1622#ifdef CONFIG_AS_AVX2
1623
1624
1625
1626
1627
1628
1629
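# GHASH_MUL_AVX2: GF(2^128) multiply of \GH by \HK, result in \GH.
# Unlike the AVX version it uses four vpclmulqdq (schoolbook) for the
# multiply and a POLY2-based carry-less reduction, so the HashKey_i_k
# constants are not needed.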
1630.macro GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5
1631
1632 vpclmulqdq $0x11,\HK,\GH,\T1
1633 vpclmulqdq $0x00,\HK,\GH,\T2
1634 vpclmulqdq $0x01,\HK,\GH,\T3
1635 vpclmulqdq $0x10,\HK,\GH,\GH
1636 vpxor \T3, \GH, \GH
1637
1638
1639 vpsrldq $8 , \GH, \T3
1640 vpslldq $8 , \GH, \GH
1641
1642 vpxor \T3, \T1, \T1
1643 vpxor \T2, \GH, \GH
1644
1645
1646
1647 vmovdqa POLY2(%rip), \T3
1648
1649 vpclmulqdq $0x01, \GH, \T3, \T2
1650 vpslldq $8, \T2, \T2
1651
1652 vpxor \T2, \GH, \GH
1653
1654
1655 vpclmulqdq $0x00, \GH, \T3, \T2
1656 vpsrldq $4, \T2, \T2
1657
1658 vpclmulqdq $0x10, \GH, \T3, \GH
1659 vpslldq $4, \GH, \GH
1660
1661 vpxor \T2, \GH, \GH
1662
1663 vpxor \T1, \GH, \GH
1664
1665
1666.endm
1667
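# PRECOMPUTE_AVX2: store HashKey^2..HashKey^8 in the context.  No _k
# constants are generated; the AVX2 path reduces via POLY2.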
1668.macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6
1669
1670
1671 vmovdqa \HK, \T5
1672 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2
1673 vmovdqa \T5, HashKey_2(arg1)
1674
1675 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2
1676 vmovdqa \T5, HashKey_3(arg1)
1677
1678 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2
1679 vmovdqa \T5, HashKey_4(arg1)
1680
1681 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2
1682 vmovdqa \T5, HashKey_5(arg1)
1683
1684 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2
1685 vmovdqa \T5, HashKey_6(arg1)
1686
1687 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2
1688 vmovdqa \T5, HashKey_7(arg1)
1689
1690 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2
1691 vmovdqa \T5, HashKey_8(arg1)
1692
1693.endm
1694
1695
1696
1697
1698
1699
1700
1701
1702
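# INITIAL_BLOCKS_AVX2: same job as INITIAL_BLOCKS_AVX (AAD, counter
# setup, first 0..7 blocks, optional next eight blocks) but using the
# AVX2 GHASH.  The trailing VER parameter is accepted but unused here.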
1703.macro INITIAL_BLOCKS_AVX2 num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
1704 i = (8-\num_initial_blocks)
1705 j = 0
1706 setreg
1707
1708 mov arg6, %r10
1709 mov arg7, %r12
1710
1711
1712 mov %r12, %r11
1713
1714 vpxor reg_j, reg_j, reg_j
1715 vpxor reg_i, reg_i, reg_i
1716
1717 cmp $16, %r11
1718 jl _get_AAD_rest8\@
1719_get_AAD_blocks\@:
1720 vmovdqu (%r10), reg_i
1721 vpshufb SHUF_MASK(%rip), reg_i, reg_i
1722 vpxor reg_i, reg_j, reg_j
1723 GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6
1724 add $16, %r10
1725 sub $16, %r12
1726 sub $16, %r11
1727 cmp $16, %r11
1728 jge _get_AAD_blocks\@
1729 vmovdqu reg_j, reg_i
1730 cmp $0, %r11
1731 je _get_AAD_done\@
1732
1733 vpxor reg_i, reg_i, reg_i
1734
1735
1736
1737
1738_get_AAD_rest8\@:
1739 cmp $4, %r11
1740 jle _get_AAD_rest4\@
1741 movq (%r10), \T1
1742 add $8, %r10
1743 sub $8, %r11
1744 vpslldq $8, \T1, \T1
1745 vpsrldq $8, reg_i, reg_i
1746 vpxor \T1, reg_i, reg_i
1747 jmp _get_AAD_rest8\@
1748_get_AAD_rest4\@:
1749 cmp $0, %r11
1750 jle _get_AAD_rest0\@
1751 mov (%r10), %eax
1752 movq %rax, \T1
1753 add $4, %r10
1754 sub $4, %r11
1755 vpslldq $12, \T1, \T1
1756 vpsrldq $4, reg_i, reg_i
1757 vpxor \T1, reg_i, reg_i
1758_get_AAD_rest0\@:
1759
1760
1761
1762 movq %r12, %r11
1763 salq $4, %r11
1764 movdqu aad_shift_arr(%r11), \T1
1765 vpshufb \T1, reg_i, reg_i
1766_get_AAD_rest_final\@:
1767 vpshufb SHUF_MASK(%rip), reg_i, reg_i
1768 vpxor reg_j, reg_i, reg_i
1769 GHASH_MUL_AVX2 reg_i, \T2, \T1, \T3, \T4, \T5, \T6
1770
1771_get_AAD_done\@:
1772
1773 xor %r11, %r11
1774
1775
1776 mov arg5, %rax
1777 vmovdqu (%rax), \CTR
1778 vpshufb SHUF_MASK(%rip), \CTR, \CTR
1779
1780
1781 i = (9-\num_initial_blocks)
1782 setreg
1783.rep \num_initial_blocks
1784 vpaddd ONE(%rip), \CTR, \CTR
1785 vmovdqa \CTR, reg_i
1786 vpshufb SHUF_MASK(%rip), reg_i, reg_i
1787 i = (i+1)
1788 setreg
1789.endr
1790
1791 vmovdqa (arg1), \T_key
1792 i = (9-\num_initial_blocks)
1793 setreg
1794.rep \num_initial_blocks
1795 vpxor \T_key, reg_i, reg_i
1796 i = (i+1)
1797 setreg
1798.endr
1799
1800 j = 1
1801 setreg
1802.rep 9
1803 vmovdqa 16*j(arg1), \T_key
1804 i = (9-\num_initial_blocks)
1805 setreg
1806.rep \num_initial_blocks
1807 vaesenc \T_key, reg_i, reg_i
1808 i = (i+1)
1809 setreg
1810.endr
1811
1812 j = (j+1)
1813 setreg
1814.endr
1815
1816
1817 vmovdqa 16*10(arg1), \T_key
1818 i = (9-\num_initial_blocks)
1819 setreg
1820.rep \num_initial_blocks
1821 vaesenclast \T_key, reg_i, reg_i
1822 i = (i+1)
1823 setreg
1824.endr
1825
1826 i = (9-\num_initial_blocks)
1827 setreg
1828.rep \num_initial_blocks
1829 vmovdqu (arg3, %r11), \T1
1830 vpxor \T1, reg_i, reg_i
1831 vmovdqu reg_i, (arg2 , %r11)
1832
1833 add $16, %r11
1834.if \ENC_DEC == DEC
1835 vmovdqa \T1, reg_i
1836.endif
1837 vpshufb SHUF_MASK(%rip), reg_i, reg_i
1838 i = (i+1)
1839 setreg
1840.endr
1841
1842
1843 i = (8-\num_initial_blocks)
1844 j = (9-\num_initial_blocks)
1845 setreg
1846
1847.rep \num_initial_blocks
1848 vpxor reg_i, reg_j, reg_j
1849 GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6
1850 i = (i+1)
1851 j = (j+1)
1852 setreg
1853.endr
1854
1855
1856 vmovdqa \XMM8, TMP1(%rsp)
1857 vmovdqa \XMM8, \T3
1858
1859 cmp $128, %r13
1860 jl _initial_blocks_done\@
1861
1862
1863
1864 vpaddd ONE(%rip), \CTR, \CTR
1865 vmovdqa \CTR, \XMM1
1866 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1
1867
1868 vpaddd ONE(%rip), \CTR, \CTR
1869 vmovdqa \CTR, \XMM2
1870 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2
1871
1872 vpaddd ONE(%rip), \CTR, \CTR
1873 vmovdqa \CTR, \XMM3
1874 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3
1875
1876 vpaddd ONE(%rip), \CTR, \CTR
1877 vmovdqa \CTR, \XMM4
1878 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4
1879
1880 vpaddd ONE(%rip), \CTR, \CTR
1881 vmovdqa \CTR, \XMM5
1882 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5
1883
1884 vpaddd ONE(%rip), \CTR, \CTR
1885 vmovdqa \CTR, \XMM6
1886 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6
1887
1888 vpaddd ONE(%rip), \CTR, \CTR
1889 vmovdqa \CTR, \XMM7
1890 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7
1891
1892 vpaddd ONE(%rip), \CTR, \CTR
1893 vmovdqa \CTR, \XMM8
1894 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8
1895
1896 vmovdqa (arg1), \T_key
1897 vpxor \T_key, \XMM1, \XMM1
1898 vpxor \T_key, \XMM2, \XMM2
1899 vpxor \T_key, \XMM3, \XMM3
1900 vpxor \T_key, \XMM4, \XMM4
1901 vpxor \T_key, \XMM5, \XMM5
1902 vpxor \T_key, \XMM6, \XMM6
1903 vpxor \T_key, \XMM7, \XMM7
1904 vpxor \T_key, \XMM8, \XMM8
1905
1906 i = 1
1907 setreg
1908.rep 9
1909 vmovdqa 16*i(arg1), \T_key
1910 vaesenc \T_key, \XMM1, \XMM1
1911 vaesenc \T_key, \XMM2, \XMM2
1912 vaesenc \T_key, \XMM3, \XMM3
1913 vaesenc \T_key, \XMM4, \XMM4
1914 vaesenc \T_key, \XMM5, \XMM5
1915 vaesenc \T_key, \XMM6, \XMM6
1916 vaesenc \T_key, \XMM7, \XMM7
1917 vaesenc \T_key, \XMM8, \XMM8
1918 i = (i+1)
1919 setreg
1920.endr
1921
1922
1923 vmovdqa 16*i(arg1), \T_key
1924 vaesenclast \T_key, \XMM1, \XMM1
1925 vaesenclast \T_key, \XMM2, \XMM2
1926 vaesenclast \T_key, \XMM3, \XMM3
1927 vaesenclast \T_key, \XMM4, \XMM4
1928 vaesenclast \T_key, \XMM5, \XMM5
1929 vaesenclast \T_key, \XMM6, \XMM6
1930 vaesenclast \T_key, \XMM7, \XMM7
1931 vaesenclast \T_key, \XMM8, \XMM8
1932
1933 vmovdqu (arg3, %r11), \T1
1934 vpxor \T1, \XMM1, \XMM1
1935 vmovdqu \XMM1, (arg2 , %r11)
1936 .if \ENC_DEC == DEC
1937 vmovdqa \T1, \XMM1
1938 .endif
1939
1940 vmovdqu 16*1(arg3, %r11), \T1
1941 vpxor \T1, \XMM2, \XMM2
1942 vmovdqu \XMM2, 16*1(arg2 , %r11)
1943 .if \ENC_DEC == DEC
1944 vmovdqa \T1, \XMM2
1945 .endif
1946
1947 vmovdqu 16*2(arg3, %r11), \T1
1948 vpxor \T1, \XMM3, \XMM3
1949 vmovdqu \XMM3, 16*2(arg2 , %r11)
1950 .if \ENC_DEC == DEC
1951 vmovdqa \T1, \XMM3
1952 .endif
1953
1954 vmovdqu 16*3(arg3, %r11), \T1
1955 vpxor \T1, \XMM4, \XMM4
1956 vmovdqu \XMM4, 16*3(arg2 , %r11)
1957 .if \ENC_DEC == DEC
1958 vmovdqa \T1, \XMM4
1959 .endif
1960
1961 vmovdqu 16*4(arg3, %r11), \T1
1962 vpxor \T1, \XMM5, \XMM5
1963 vmovdqu \XMM5, 16*4(arg2 , %r11)
1964 .if \ENC_DEC == DEC
1965 vmovdqa \T1, \XMM5
1966 .endif
1967
1968 vmovdqu 16*5(arg3, %r11), \T1
1969 vpxor \T1, \XMM6, \XMM6
1970 vmovdqu \XMM6, 16*5(arg2 , %r11)
1971 .if \ENC_DEC == DEC
1972 vmovdqa \T1, \XMM6
1973 .endif
1974
1975 vmovdqu 16*6(arg3, %r11), \T1
1976 vpxor \T1, \XMM7, \XMM7
1977 vmovdqu \XMM7, 16*6(arg2 , %r11)
1978 .if \ENC_DEC == DEC
1979 vmovdqa \T1, \XMM7
1980 .endif
1981
1982 vmovdqu 16*7(arg3, %r11), \T1
1983 vpxor \T1, \XMM8, \XMM8
1984 vmovdqu \XMM8, 16*7(arg2 , %r11)
1985 .if \ENC_DEC == DEC
1986 vmovdqa \T1, \XMM8
1987 .endif
1988
1989 add $128, %r11
1990
1991 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1
1992 vpxor TMP1(%rsp), \XMM1, \XMM1
1993
1994 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2
1995 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3
1996 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4
1997 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5
1998 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6
1999 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7
2000 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8
2001
2002
2003
2004_initial_blocks_done\@:
2005
2006
2007.endm
2008
2009
2010
2011
2012
2013
2014
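# GHASH_8_ENCRYPT_8_PARALLEL_AVX2: AVX2 main loop body; same structure
# as the AVX version, but with schoolbook vpclmulqdq products and the
# POLY2 reduction interleaved with the vaesenc rounds.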
2015.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
2016
2017 vmovdqa \XMM1, \T2
2018 vmovdqa \XMM2, TMP2(%rsp)
2019 vmovdqa \XMM3, TMP3(%rsp)
2020 vmovdqa \XMM4, TMP4(%rsp)
2021 vmovdqa \XMM5, TMP5(%rsp)
2022 vmovdqa \XMM6, TMP6(%rsp)
2023 vmovdqa \XMM7, TMP7(%rsp)
2024 vmovdqa \XMM8, TMP8(%rsp)
2025
2026.if \loop_idx == in_order
2027 vpaddd ONE(%rip), \CTR, \XMM1
2028 vpaddd ONE(%rip), \XMM1, \XMM2
2029 vpaddd ONE(%rip), \XMM2, \XMM3
2030 vpaddd ONE(%rip), \XMM3, \XMM4
2031 vpaddd ONE(%rip), \XMM4, \XMM5
2032 vpaddd ONE(%rip), \XMM5, \XMM6
2033 vpaddd ONE(%rip), \XMM6, \XMM7
2034 vpaddd ONE(%rip), \XMM7, \XMM8
2035 vmovdqa \XMM8, \CTR
2036
2037 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1
2038 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2
2039 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3
2040 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4
2041 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5
2042 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6
2043 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7
2044 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8
2045.else
2046 vpaddd ONEf(%rip), \CTR, \XMM1
2047 vpaddd ONEf(%rip), \XMM1, \XMM2
2048 vpaddd ONEf(%rip), \XMM2, \XMM3
2049 vpaddd ONEf(%rip), \XMM3, \XMM4
2050 vpaddd ONEf(%rip), \XMM4, \XMM5
2051 vpaddd ONEf(%rip), \XMM5, \XMM6
2052 vpaddd ONEf(%rip), \XMM6, \XMM7
2053 vpaddd ONEf(%rip), \XMM7, \XMM8
2054 vmovdqa \XMM8, \CTR
2055.endif
2056
2057
2058
2059
2060 vmovdqu (arg1), \T1
2061 vpxor \T1, \XMM1, \XMM1
2062 vpxor \T1, \XMM2, \XMM2
2063 vpxor \T1, \XMM3, \XMM3
2064 vpxor \T1, \XMM4, \XMM4
2065 vpxor \T1, \XMM5, \XMM5
2066 vpxor \T1, \XMM6, \XMM6
2067 vpxor \T1, \XMM7, \XMM7
2068 vpxor \T1, \XMM8, \XMM8
2069
2070
2071
2072
2073
2074
2075
2076 vmovdqu 16*1(arg1), \T1
2077 vaesenc \T1, \XMM1, \XMM1
2078 vaesenc \T1, \XMM2, \XMM2
2079 vaesenc \T1, \XMM3, \XMM3
2080 vaesenc \T1, \XMM4, \XMM4
2081 vaesenc \T1, \XMM5, \XMM5
2082 vaesenc \T1, \XMM6, \XMM6
2083 vaesenc \T1, \XMM7, \XMM7
2084 vaesenc \T1, \XMM8, \XMM8
2085
2086 vmovdqu 16*2(arg1), \T1
2087 vaesenc \T1, \XMM1, \XMM1
2088 vaesenc \T1, \XMM2, \XMM2
2089 vaesenc \T1, \XMM3, \XMM3
2090 vaesenc \T1, \XMM4, \XMM4
2091 vaesenc \T1, \XMM5, \XMM5
2092 vaesenc \T1, \XMM6, \XMM6
2093 vaesenc \T1, \XMM7, \XMM7
2094 vaesenc \T1, \XMM8, \XMM8
2095
2096
2097
2098
2099 vmovdqa HashKey_8(arg1), \T5
2100 vpclmulqdq $0x11, \T5, \T2, \T4
2101 vpclmulqdq $0x00, \T5, \T2, \T7
2102 vpclmulqdq $0x01, \T5, \T2, \T6
2103 vpclmulqdq $0x10, \T5, \T2, \T5
2104 vpxor \T5, \T6, \T6
2105
2106 vmovdqu 16*3(arg1), \T1
2107 vaesenc \T1, \XMM1, \XMM1
2108 vaesenc \T1, \XMM2, \XMM2
2109 vaesenc \T1, \XMM3, \XMM3
2110 vaesenc \T1, \XMM4, \XMM4
2111 vaesenc \T1, \XMM5, \XMM5
2112 vaesenc \T1, \XMM6, \XMM6
2113 vaesenc \T1, \XMM7, \XMM7
2114 vaesenc \T1, \XMM8, \XMM8
2115
2116 vmovdqa TMP2(%rsp), \T1
2117 vmovdqa HashKey_7(arg1), \T5
2118 vpclmulqdq $0x11, \T5, \T1, \T3
2119 vpxor \T3, \T4, \T4
2120
2121 vpclmulqdq $0x00, \T5, \T1, \T3
2122 vpxor \T3, \T7, \T7
2123
2124 vpclmulqdq $0x01, \T5, \T1, \T3
2125 vpxor \T3, \T6, \T6
2126
2127 vpclmulqdq $0x10, \T5, \T1, \T3
2128 vpxor \T3, \T6, \T6
2129
2130 vmovdqu 16*4(arg1), \T1
2131 vaesenc \T1, \XMM1, \XMM1
2132 vaesenc \T1, \XMM2, \XMM2
2133 vaesenc \T1, \XMM3, \XMM3
2134 vaesenc \T1, \XMM4, \XMM4
2135 vaesenc \T1, \XMM5, \XMM5
2136 vaesenc \T1, \XMM6, \XMM6
2137 vaesenc \T1, \XMM7, \XMM7
2138 vaesenc \T1, \XMM8, \XMM8
2139
2140
2141
2142 vmovdqa TMP3(%rsp), \T1
2143 vmovdqa HashKey_6(arg1), \T5
2144 vpclmulqdq $0x11, \T5, \T1, \T3
2145 vpxor \T3, \T4, \T4
2146
2147 vpclmulqdq $0x00, \T5, \T1, \T3
2148 vpxor \T3, \T7, \T7
2149
2150 vpclmulqdq $0x01, \T5, \T1, \T3
2151 vpxor \T3, \T6, \T6
2152
2153 vpclmulqdq $0x10, \T5, \T1, \T3
2154 vpxor \T3, \T6, \T6
2155
2156 vmovdqu 16*5(arg1), \T1
2157 vaesenc \T1, \XMM1, \XMM1
2158 vaesenc \T1, \XMM2, \XMM2
2159 vaesenc \T1, \XMM3, \XMM3
2160 vaesenc \T1, \XMM4, \XMM4
2161 vaesenc \T1, \XMM5, \XMM5
2162 vaesenc \T1, \XMM6, \XMM6
2163 vaesenc \T1, \XMM7, \XMM7
2164 vaesenc \T1, \XMM8, \XMM8
2165
2166 vmovdqa TMP4(%rsp), \T1
2167 vmovdqa HashKey_5(arg1), \T5
2168 vpclmulqdq $0x11, \T5, \T1, \T3
2169 vpxor \T3, \T4, \T4
2170
2171 vpclmulqdq $0x00, \T5, \T1, \T3
2172 vpxor \T3, \T7, \T7
2173
2174 vpclmulqdq $0x01, \T5, \T1, \T3
2175 vpxor \T3, \T6, \T6
2176
2177 vpclmulqdq $0x10, \T5, \T1, \T3
2178 vpxor \T3, \T6, \T6
2179
2180 vmovdqu 16*6(arg1), \T1
2181 vaesenc \T1, \XMM1, \XMM1
2182 vaesenc \T1, \XMM2, \XMM2
2183 vaesenc \T1, \XMM3, \XMM3
2184 vaesenc \T1, \XMM4, \XMM4
2185 vaesenc \T1, \XMM5, \XMM5
2186 vaesenc \T1, \XMM6, \XMM6
2187 vaesenc \T1, \XMM7, \XMM7
2188 vaesenc \T1, \XMM8, \XMM8
2189
2190
2191 vmovdqa TMP5(%rsp), \T1
2192 vmovdqa HashKey_4(arg1), \T5
2193 vpclmulqdq $0x11, \T5, \T1, \T3
2194 vpxor \T3, \T4, \T4
2195
2196 vpclmulqdq $0x00, \T5, \T1, \T3
2197 vpxor \T3, \T7, \T7
2198
2199 vpclmulqdq $0x01, \T5, \T1, \T3
2200 vpxor \T3, \T6, \T6
2201
2202 vpclmulqdq $0x10, \T5, \T1, \T3
2203 vpxor \T3, \T6, \T6
2204
2205 vmovdqu 16*7(arg1), \T1
2206 vaesenc \T1, \XMM1, \XMM1
2207 vaesenc \T1, \XMM2, \XMM2
2208 vaesenc \T1, \XMM3, \XMM3
2209 vaesenc \T1, \XMM4, \XMM4
2210 vaesenc \T1, \XMM5, \XMM5
2211 vaesenc \T1, \XMM6, \XMM6
2212 vaesenc \T1, \XMM7, \XMM7
2213 vaesenc \T1, \XMM8, \XMM8
2214
2215 vmovdqa TMP6(%rsp), \T1
2216 vmovdqa HashKey_3(arg1), \T5
2217 vpclmulqdq $0x11, \T5, \T1, \T3
2218 vpxor \T3, \T4, \T4
2219
2220 vpclmulqdq $0x00, \T5, \T1, \T3
2221 vpxor \T3, \T7, \T7
2222
2223 vpclmulqdq $0x01, \T5, \T1, \T3
2224 vpxor \T3, \T6, \T6
2225
2226 vpclmulqdq $0x10, \T5, \T1, \T3
2227 vpxor \T3, \T6, \T6
2228
2229 vmovdqu 16*8(arg1), \T1
2230 vaesenc \T1, \XMM1, \XMM1
2231 vaesenc \T1, \XMM2, \XMM2
2232 vaesenc \T1, \XMM3, \XMM3
2233 vaesenc \T1, \XMM4, \XMM4
2234 vaesenc \T1, \XMM5, \XMM5
2235 vaesenc \T1, \XMM6, \XMM6
2236 vaesenc \T1, \XMM7, \XMM7
2237 vaesenc \T1, \XMM8, \XMM8
2238
2239 vmovdqa TMP7(%rsp), \T1
2240 vmovdqa HashKey_2(arg1), \T5
2241 vpclmulqdq $0x11, \T5, \T1, \T3
2242 vpxor \T3, \T4, \T4
2243
2244 vpclmulqdq $0x00, \T5, \T1, \T3
2245 vpxor \T3, \T7, \T7
2246
2247 vpclmulqdq $0x01, \T5, \T1, \T3
2248 vpxor \T3, \T6, \T6
2249
2250 vpclmulqdq $0x10, \T5, \T1, \T3
2251 vpxor \T3, \T6, \T6
2252
2253
2254
2255
2256 vmovdqu 16*9(arg1), \T5
2257 vaesenc \T5, \XMM1, \XMM1
2258 vaesenc \T5, \XMM2, \XMM2
2259 vaesenc \T5, \XMM3, \XMM3
2260 vaesenc \T5, \XMM4, \XMM4
2261 vaesenc \T5, \XMM5, \XMM5
2262 vaesenc \T5, \XMM6, \XMM6
2263 vaesenc \T5, \XMM7, \XMM7
2264 vaesenc \T5, \XMM8, \XMM8
2265
2266 vmovdqa TMP8(%rsp), \T1
2267 vmovdqa HashKey(arg1), \T5
2268
2269 vpclmulqdq $0x00, \T5, \T1, \T3
2270 vpxor \T3, \T7, \T7
2271
2272 vpclmulqdq $0x01, \T5, \T1, \T3
2273 vpxor \T3, \T6, \T6
2274
2275 vpclmulqdq $0x10, \T5, \T1, \T3
2276 vpxor \T3, \T6, \T6
2277
2278 vpclmulqdq $0x11, \T5, \T1, \T3
2279 vpxor \T3, \T4, \T1
2280
2281
2282 vmovdqu 16*10(arg1), \T5
2283
2284 i = 0
2285 j = 1
2286 setreg
2287.rep 8
2288 vpxor 16*i(arg3, %r11), \T5, \T2
2289 .if \ENC_DEC == ENC
2290 vaesenclast \T2, reg_j, reg_j
2291 .else
2292 vaesenclast \T2, reg_j, \T3
2293 vmovdqu 16*i(arg3, %r11), reg_j
2294 vmovdqu \T3, 16*i(arg2, %r11)
2295 .endif
2296 i = (i+1)
2297 j = (j+1)
2298 setreg
2299.endr
2300
2301
2302
2303 vpslldq $8, \T6, \T3
2304 vpsrldq $8, \T6, \T6
2305 vpxor \T3, \T7, \T7
2306 vpxor \T6, \T1, \T1
2307
2308
2309
2310
2311
2312 vmovdqa POLY2(%rip), \T3
2313
2314 vpclmulqdq $0x01, \T7, \T3, \T2
2315 vpslldq $8, \T2, \T2
2316
2317 vpxor \T2, \T7, \T7
2318
2319 .if \ENC_DEC == ENC
2320 vmovdqu \XMM1, 16*0(arg2,%r11)
2321 vmovdqu \XMM2, 16*1(arg2,%r11)
2322 vmovdqu \XMM3, 16*2(arg2,%r11)
2323 vmovdqu \XMM4, 16*3(arg2,%r11)
2324 vmovdqu \XMM5, 16*4(arg2,%r11)
2325 vmovdqu \XMM6, 16*5(arg2,%r11)
2326 vmovdqu \XMM7, 16*6(arg2,%r11)
2327 vmovdqu \XMM8, 16*7(arg2,%r11)
2328 .endif
2329
2330
2331
2332 vpclmulqdq $0x00, \T7, \T3, \T2
2333 vpsrldq $4, \T2, \T2
2334
2335 vpclmulqdq $0x10, \T7, \T3, \T4
2336 vpslldq $4, \T4, \T4
2337
2338 vpxor \T2, \T4, \T4
2339
2340 vpxor \T4, \T1, \T1
2341
2342 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1
2343 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2
2344 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3
2345 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4
2346 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5
2347 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6
2348 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7
2349 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8
2350
2351
2352 vpxor \T1, \XMM1, \XMM1
2353
2354
2355
2356.endm
2357
2358
2359
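# GHASH_LAST_8_AVX2: fold the final eight ciphertext blocks into the
# GHASH accumulator using HashKey^8..HashKey^1, computing the Karatsuba
# key halves on the fly and reducing via POLY2.  The result is left in
# \T6.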
2360.macro GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
2361
2362
2363
2364 vmovdqa HashKey_8(arg1), \T5
2365
2366 vpshufd $0b01001110, \XMM1, \T2
2367 vpshufd $0b01001110, \T5, \T3
2368 vpxor \XMM1, \T2, \T2
2369 vpxor \T5, \T3, \T3
2370
2371 vpclmulqdq $0x11, \T5, \XMM1, \T6
2372 vpclmulqdq $0x00, \T5, \XMM1, \T7
2373
2374 vpclmulqdq $0x00, \T3, \T2, \XMM1
2375
2376
2377
2378 vmovdqa HashKey_7(arg1), \T5
2379 vpshufd $0b01001110, \XMM2, \T2
2380 vpshufd $0b01001110, \T5, \T3
2381 vpxor \XMM2, \T2, \T2
2382 vpxor \T5, \T3, \T3
2383
2384 vpclmulqdq $0x11, \T5, \XMM2, \T4
2385 vpxor \T4, \T6, \T6
2386
2387 vpclmulqdq $0x00, \T5, \XMM2, \T4
2388 vpxor \T4, \T7, \T7
2389
2390 vpclmulqdq $0x00, \T3, \T2, \T2
2391
2392 vpxor \T2, \XMM1, \XMM1
2393
2394
2395
2396 vmovdqa HashKey_6(arg1), \T5
2397 vpshufd $0b01001110, \XMM3, \T2
2398 vpshufd $0b01001110, \T5, \T3
2399 vpxor \XMM3, \T2, \T2
2400 vpxor \T5, \T3, \T3
2401
2402 vpclmulqdq $0x11, \T5, \XMM3, \T4
2403 vpxor \T4, \T6, \T6
2404
2405 vpclmulqdq $0x00, \T5, \XMM3, \T4
2406 vpxor \T4, \T7, \T7
2407
2408 vpclmulqdq $0x00, \T3, \T2, \T2
2409
2410 vpxor \T2, \XMM1, \XMM1
2411
2412
2413
2414 vmovdqa HashKey_5(arg1), \T5
2415 vpshufd $0b01001110, \XMM4, \T2
2416 vpshufd $0b01001110, \T5, \T3
2417 vpxor \XMM4, \T2, \T2
2418 vpxor \T5, \T3, \T3
2419
2420 vpclmulqdq $0x11, \T5, \XMM4, \T4
2421 vpxor \T4, \T6, \T6
2422
2423 vpclmulqdq $0x00, \T5, \XMM4, \T4
2424 vpxor \T4, \T7, \T7
2425
2426 vpclmulqdq $0x00, \T3, \T2, \T2
2427
2428 vpxor \T2, \XMM1, \XMM1
2429
2430
2431
2432 vmovdqa HashKey_4(arg1), \T5
2433 vpshufd $0b01001110, \XMM5, \T2
2434 vpshufd $0b01001110, \T5, \T3
2435 vpxor \XMM5, \T2, \T2
2436 vpxor \T5, \T3, \T3
2437
2438 vpclmulqdq $0x11, \T5, \XMM5, \T4
2439 vpxor \T4, \T6, \T6
2440
2441 vpclmulqdq $0x00, \T5, \XMM5, \T4
2442 vpxor \T4, \T7, \T7
2443
2444 vpclmulqdq $0x00, \T3, \T2, \T2
2445
2446 vpxor \T2, \XMM1, \XMM1
2447
2448
2449
2450 vmovdqa HashKey_3(arg1), \T5
2451 vpshufd $0b01001110, \XMM6, \T2
2452 vpshufd $0b01001110, \T5, \T3
2453 vpxor \XMM6, \T2, \T2
2454 vpxor \T5, \T3, \T3
2455
2456 vpclmulqdq $0x11, \T5, \XMM6, \T4
2457 vpxor \T4, \T6, \T6
2458
2459 vpclmulqdq $0x00, \T5, \XMM6, \T4
2460 vpxor \T4, \T7, \T7
2461
2462 vpclmulqdq $0x00, \T3, \T2, \T2
2463
2464 vpxor \T2, \XMM1, \XMM1
2465
2466
2467
2468 vmovdqa HashKey_2(arg1), \T5
2469 vpshufd $0b01001110, \XMM7, \T2
2470 vpshufd $0b01001110, \T5, \T3
2471 vpxor \XMM7, \T2, \T2
2472 vpxor \T5, \T3, \T3
2473
2474 vpclmulqdq $0x11, \T5, \XMM7, \T4
2475 vpxor \T4, \T6, \T6
2476
2477 vpclmulqdq $0x00, \T5, \XMM7, \T4
2478 vpxor \T4, \T7, \T7
2479
2480 vpclmulqdq $0x00, \T3, \T2, \T2
2481
2482 vpxor \T2, \XMM1, \XMM1
2483
2484
2485
2486 vmovdqa HashKey(arg1), \T5
2487 vpshufd $0b01001110, \XMM8, \T2
2488 vpshufd $0b01001110, \T5, \T3
2489 vpxor \XMM8, \T2, \T2
2490 vpxor \T5, \T3, \T3
2491
2492 vpclmulqdq $0x11, \T5, \XMM8, \T4
2493 vpxor \T4, \T6, \T6
2494
2495 vpclmulqdq $0x00, \T5, \XMM8, \T4
2496 vpxor \T4, \T7, \T7
2497
2498 vpclmulqdq $0x00, \T3, \T2, \T2
2499
2500 vpxor \T2, \XMM1, \XMM1
2501 vpxor \T6, \XMM1, \XMM1
2502 vpxor \T7, \XMM1, \T2
2503
2504
2505
2506
2507 vpslldq $8, \T2, \T4
2508 vpsrldq $8, \T2, \T2
2509
2510 vpxor \T4, \T7, \T7
2511 vpxor \T2, \T6, \T6
2512
2513
2514
2515
2516 vmovdqa POLY2(%rip), \T3
2517
2518 vpclmulqdq $0x01, \T7, \T3, \T2
2519 vpslldq $8, \T2, \T2
2520
2521 vpxor \T2, \T7, \T7
2522
2523
2524
2525
2526 vpclmulqdq $0x00, \T7, \T3, \T2
2527 vpsrldq $4, \T2, \T2
2528
2529 vpclmulqdq $0x10, \T7, \T3, \T4
2530 vpslldq $4, \T4, \T4
2531
2532 vpxor \T2, \T4, \T4
2533
2534 vpxor \T4, \T6, \T6
2535.endm
2536
2537
2538
2539
2540
2541
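# GCM_ENC_DEC_AVX2: full GCM encrypt/decrypt for the AVX2 (gen4) path;
# identical control flow to GCM_ENC_DEC_AVX but built on the _AVX2
# macros above.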
2542.macro GCM_ENC_DEC_AVX2 ENC_DEC
2543
2544
2545 push %r12
2546 push %r13
2547 push %r14
2548 push %r15
2549
2550 mov %rsp, %r14
2551
2552
2553
2554
2555 sub $VARIABLE_OFFSET, %rsp
2556 and $~63, %rsp
2557
2558
2559 vmovdqu HashKey(arg1), %xmm13
2560
2561 mov arg4, %r13
2562 and $-16, %r13
2563
2564 mov %r13, %r12
2565 shr $4, %r12
2566 and $7, %r12
2567 jz _initial_num_blocks_is_0\@
2568
2569 cmp $7, %r12
2570 je _initial_num_blocks_is_7\@
2571 cmp $6, %r12
2572 je _initial_num_blocks_is_6\@
2573 cmp $5, %r12
2574 je _initial_num_blocks_is_5\@
2575 cmp $4, %r12
2576 je _initial_num_blocks_is_4\@
2577 cmp $3, %r12
2578 je _initial_num_blocks_is_3\@
2579 cmp $2, %r12
2580 je _initial_num_blocks_is_2\@
2581
2582 jmp _initial_num_blocks_is_1\@
2583
2584_initial_num_blocks_is_7\@:
2585 INITIAL_BLOCKS_AVX2 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2586 sub $16*7, %r13
2587 jmp _initial_blocks_encrypted\@
2588
2589_initial_num_blocks_is_6\@:
2590 INITIAL_BLOCKS_AVX2 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2591 sub $16*6, %r13
2592 jmp _initial_blocks_encrypted\@
2593
2594_initial_num_blocks_is_5\@:
2595 INITIAL_BLOCKS_AVX2 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2596 sub $16*5, %r13
2597 jmp _initial_blocks_encrypted\@
2598
2599_initial_num_blocks_is_4\@:
2600 INITIAL_BLOCKS_AVX2 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2601 sub $16*4, %r13
2602 jmp _initial_blocks_encrypted\@
2603
2604_initial_num_blocks_is_3\@:
2605 INITIAL_BLOCKS_AVX2 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2606 sub $16*3, %r13
2607 jmp _initial_blocks_encrypted\@
2608
2609_initial_num_blocks_is_2\@:
2610 INITIAL_BLOCKS_AVX2 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2611 sub $16*2, %r13
2612 jmp _initial_blocks_encrypted\@
2613
2614_initial_num_blocks_is_1\@:
2615 INITIAL_BLOCKS_AVX2 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2616 sub $16*1, %r13
2617 jmp _initial_blocks_encrypted\@
2618
2619_initial_num_blocks_is_0\@:
2620 INITIAL_BLOCKS_AVX2 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2621
2622
2623_initial_blocks_encrypted\@:
2624 cmp $0, %r13
2625 je _zero_cipher_left\@
2626
2627 sub $128, %r13
2628 je _eight_cipher_left\@
2629
2630
2631
2632
2633 vmovd %xmm9, %r15d
2634 and $255, %r15d
2635 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2636
2637
2638_encrypt_by_8_new\@:
2639 cmp $(255-8), %r15d
2640 jg _encrypt_by_8\@
2641
2642
2643
2644 add $8, %r15b
2645 GHASH_8_ENCRYPT_8_PARALLEL_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
2646 add $128, %r11
2647 sub $128, %r13
2648 jne _encrypt_by_8_new\@
2649
2650 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2651 jmp _eight_cipher_left\@
2652
2653_encrypt_by_8\@:
2654 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2655 add $8, %r15b
2656 GHASH_8_ENCRYPT_8_PARALLEL_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
2657 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2658 add $128, %r11
2659 sub $128, %r13
2660 jne _encrypt_by_8_new\@
2661
2662 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2663
2664
2665
2666
2667_eight_cipher_left\@:
2668 GHASH_LAST_8_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
2669
2670
2671_zero_cipher_left\@:
2672 cmp $16, arg4
2673 jl _only_less_than_16\@
2674
2675 mov arg4, %r13
2676 and $15, %r13
2677
2678 je _multiple_of_16_bytes\@
2679
2680
2681
2682
2683 vpaddd ONE(%rip), %xmm9, %xmm9
2684 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2685 ENCRYPT_SINGLE_BLOCK %xmm9
2686
2687 sub $16, %r11
2688 add %r13, %r11
2689 vmovdqu (arg3, %r11), %xmm1
2690
2691 lea SHIFT_MASK+16(%rip), %r12
2692 sub %r13, %r12
2693
2694
2695 vmovdqu (%r12), %xmm2
2696 vpshufb %xmm2, %xmm1, %xmm1
2697 jmp _final_ghash_mul\@
2698
2699_only_less_than_16\@:
2700
2701 mov arg4, %r13
2702 and $15, %r13
2703
2704 je _multiple_of_16_bytes\@
2705
2706
2707
2708
2709 vpaddd ONE(%rip), %xmm9, %xmm9
2710 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2711 ENCRYPT_SINGLE_BLOCK %xmm9
2712
2713
2714 lea SHIFT_MASK+16(%rip), %r12
2715 sub %r13, %r12
2716
2717
2718
2719_get_last_16_byte_loop\@:
2720 movb (arg3, %r11), %al
2721 movb %al, TMP1 (%rsp , %r11)
2722 add $1, %r11
2723 cmp %r13, %r11
2724 jne _get_last_16_byte_loop\@
2725
2726 vmovdqu TMP1(%rsp), %xmm1
2727
2728 sub $16, %r11
2729
2730_final_ghash_mul\@:
2731 .if \ENC_DEC == DEC
2732 vmovdqa %xmm1, %xmm2
2733 vpxor %xmm1, %xmm9, %xmm9
2734 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
2735 vpand %xmm1, %xmm9, %xmm9
2736 vpand %xmm1, %xmm2, %xmm2
2737 vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
2738 vpxor %xmm2, %xmm14, %xmm14
2739
2740 GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
2741 sub %r13, %r11
2742 add $16, %r11
2743 .else
2744 vpxor %xmm1, %xmm9, %xmm9
2745 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
2746 vpand %xmm1, %xmm9, %xmm9
2747 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2748 vpxor %xmm9, %xmm14, %xmm14
2749
2750 GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
2751 sub %r13, %r11
2752 add $16, %r11
2753 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2754 .endif
2755
2756
2757
2758
2759 vmovq %xmm9, %rax
2760 cmp $8, %r13
2761 jle _less_than_8_bytes_left\@
2762
2763 mov %rax, (arg2 , %r11)
2764 add $8, %r11
2765 vpsrldq $8, %xmm9, %xmm9
2766 vmovq %xmm9, %rax
2767 sub $8, %r13
2768
2769_less_than_8_bytes_left\@:
2770 movb %al, (arg2 , %r11)
2771 add $1, %r11
2772 shr $8, %rax
2773 sub $1, %r13
2774 jne _less_than_8_bytes_left\@
2775
2776
2777_multiple_of_16_bytes\@:
2778 mov arg7, %r12
2779 shl $3, %r12
2780 vmovd %r12d, %xmm15
2781
2782 shl $3, arg4
2783 vmovq arg4, %xmm1
2784 vpslldq $8, %xmm15, %xmm15
2785 vpxor %xmm1, %xmm15, %xmm15
2786
2787 vpxor %xmm15, %xmm14, %xmm14
2788 GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
2789 vpshufb SHUF_MASK(%rip), %xmm14, %xmm14
2790
2791 mov arg5, %rax
2792 vmovdqu (%rax), %xmm9
2793
2794 ENCRYPT_SINGLE_BLOCK %xmm9
2795
2796 vpxor %xmm14, %xmm9, %xmm9
2797
2798
2799
2800_return_T\@:
2801 mov arg8, %r10
2802 mov arg9, %r11
2803
2804 cmp $16, %r11
2805 je _T_16\@
2806
2807 cmp $8, %r11
2808 jl _T_4\@
2809
2810_T_8\@:
2811 vmovq %xmm9, %rax
2812 mov %rax, (%r10)
2813 add $8, %r10
2814 sub $8, %r11
2815 vpsrldq $8, %xmm9, %xmm9
2816 cmp $0, %r11
2817 je _return_T_done\@
2818_T_4\@:
2819 vmovd %xmm9, %eax
2820 mov %eax, (%r10)
2821 add $4, %r10
2822 sub $4, %r11
2823 vpsrldq $4, %xmm9, %xmm9
2824 cmp $0, %r11
2825 je _return_T_done\@
2826_T_123\@:
2827 vmovd %xmm9, %eax
2828 cmp $2, %r11
2829 jl _T_1\@
2830 mov %ax, (%r10)
2831 cmp $2, %r11
2832 je _return_T_done\@
2833 add $2, %r10
2834 sar $16, %eax
2835_T_1\@:
2836 mov %al, (%r10)
2837 jmp _return_T_done\@
2838
2839_T_16\@:
2840 vmovdqu %xmm9, (%r10)
2841
2842_return_T_done\@:
2843 mov %r14, %rsp
2844
2845 pop %r15
2846 pop %r14
2847 pop %r13
2848 pop %r12
2849.endm
2850
2851
2852
2853
2854
2855
2856
2857
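# void aesni_gcm_precomp_avx_gen4(void *my_ctx_data, u8 *hash_subkey)
# (assumed prototype, inferred from how the arguments are used below)
#
# Same HashKey derivation as the gen2 variant, followed by
# PRECOMPUTE_AVX2.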
2858ENTRY(aesni_gcm_precomp_avx_gen4)
2859
2860 push %r12
2861 push %r13
2862 push %r14
2863 push %r15
2864
2865 mov %rsp, %r14
2866
2867
2868
2869 sub $VARIABLE_OFFSET, %rsp
2870 and $~63, %rsp
2871
2872 vmovdqu (arg2), %xmm6
2873
2874 vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
2875
2876 vmovdqa %xmm6, %xmm2
2877 vpsllq $1, %xmm6, %xmm6
2878 vpsrlq $63, %xmm2, %xmm2
2879 vmovdqa %xmm2, %xmm1
2880 vpslldq $8, %xmm2, %xmm2
2881 vpsrldq $8, %xmm1, %xmm1
2882 vpor %xmm2, %xmm6, %xmm6
2883
2884 vpshufd $0b00100100, %xmm1, %xmm2
2885 vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
2886 vpand POLY(%rip), %xmm2, %xmm2
2887 vpxor %xmm2, %xmm6, %xmm6
2888
2889 vmovdqa %xmm6, HashKey(arg1)
2890
2891
2892 PRECOMPUTE_AVX2 %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
2893
2894 mov %r14, %rsp
2895
2896 pop %r15
2897 pop %r14
2898 pop %r13
2899 pop %r12
2900 ret
2901ENDPROC(aesni_gcm_precomp_avx_gen4)
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
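# AVX2 (gen4) encryption entry point; assumed to take the same nine
# arguments as aesni_gcm_enc_avx_gen2.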
2920ENTRY(aesni_gcm_enc_avx_gen4)
2921 GCM_ENC_DEC_AVX2 ENC
2922 ret
2923ENDPROC(aesni_gcm_enc_avx_gen4)
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
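# AVX2 (gen4) decryption entry point; same interface as
# aesni_gcm_dec_avx_gen2.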
2941ENTRY(aesni_gcm_dec_avx_gen4)
2942 GCM_ENC_DEC_AVX2 DEC
2943 ret
2944ENDPROC(aesni_gcm_dec_avx_gen4)
2945
2946#endif
2947