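########################################################################
# AES-GCM (Galois/Counter Mode) routines built on AES-NI and PCLMULQDQ,
# in two flavours: an AVX implementation ("gen2", under CONFIG_AS_AVX)
# and an AVX2 implementation ("gen4", under CONFIG_AS_AVX2).  The code
# below hard-codes nine middle rounds plus a final round, i.e. an
# AES-128 key schedule stored at the start of the context in arg1.
#
# Exported entry points (argument roles as used by the code below):
#   aesni_gcm_precomp_avx_gen2 / aesni_gcm_precomp_avx_gen4
#       arg1 = key/context, arg2 = hash subkey E(K, 0); derives
#       HashKey<<1 mod poly and caches its powers in the context.
#   aesni_gcm_enc_avx_gen2 / aesni_gcm_enc_avx_gen4
#   aesni_gcm_dec_avx_gen2 / aesni_gcm_dec_avx_gen4
#       arg1 = key/context, arg2 = output, arg3 = input, arg4 = text
#       length, arg5 = IV/initial counter block, arg6 = AAD, arg7 = AAD
#       length, arg8 = auth tag out, arg9 = auth tag length in bytes.
########################################################################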
#include <linux/linkage.h>
#include <asm/inst.h>

.data
.align 16

# GHASH field/reduction constants
POLY:   .octa   0xC2000000000000000000000000000001
POLY2:  .octa   0xC20000000000000000000001C2000000
TWOONE: .octa   0x00000001000000000000000000000001

# Byte-swap mask, partial-block masks and counter increments.
# ALL_F must immediately follow SHIFT_MASK and ZERO must follow ALL_F:
# the partial-block code loads its mask from ALL_F-SHIFT_MASK(%r12) and
# can read past ALL_F into ZERO.
SHUF_MASK:  .octa   0x000102030405060708090A0B0C0D0E0F
SHIFT_MASK: .octa   0x0f0e0d0c0b0a09080706050403020100
ALL_F:      .octa   0xffffffffffffffffffffffffffffffff
ZERO:       .octa   0x00000000000000000000000000000000
ONE:        .octa   0x00000000000000000000000000000001
ONEf:       .octa   0x01000000000000000000000000000000

.section .rodata
.align 16
.type aad_shift_arr, @object
.size aad_shift_arr, 272
# vpshufb masks used to left-justify the last partial block of AAD,
# indexed by 16 * (remaining byte count)
aad_shift_arr:
        .octa   0xffffffffffffffffffffffffffffffff
        .octa   0xffffffffffffffffffffffffffffff0C
        .octa   0xffffffffffffffffffffffffffff0D0C
        .octa   0xffffffffffffffffffffffffff0E0D0C
        .octa   0xffffffffffffffffffffffff0F0E0D0C
        .octa   0xffffffffffffffffffffff0C0B0A0908
        .octa   0xffffffffffffffffffff0D0C0B0A0908
        .octa   0xffffffffffffffffff0E0D0C0B0A0908
        .octa   0xffffffffffffffff0F0E0D0C0B0A0908
        .octa   0xffffffffffffff0C0B0A090807060504
        .octa   0xffffffffffff0D0C0B0A090807060504
        .octa   0xffffffffff0E0D0C0B0A090807060504
        .octa   0xffffffff0F0E0D0C0B0A090807060504
        .octa   0xffffff0C0B0A09080706050403020100
        .octa   0xffff0D0C0B0A09080706050403020100
        .octa   0xff0E0D0C0B0A09080706050403020100
        .octa   0x0F0E0D0C0B0A09080706050403020100


.text


# Layout of the key/context structure passed in arg1: the expanded
# AES-128 round keys occupy offsets 16*0 .. 16*10; the GHASH key powers
# cached by the PRECOMPUTE macros follow.  HashKey_i holds HashKey^i
# (HashKey itself already shifted left by one bit mod the field
# polynomial), and HashKey_i_k holds the xor of its two 64-bit halves,
# used by the Karatsuba multiplications in the AVX (gen2) path.
HashKey        = 16*11
HashKey_2      = 16*12
HashKey_3      = 16*13
HashKey_4      = 16*14
HashKey_5      = 16*15
HashKey_6      = 16*16
HashKey_7      = 16*17
HashKey_8      = 16*18
HashKey_k      = 16*19
HashKey_2_k    = 16*20
HashKey_3_k    = 16*21
HashKey_4_k    = 16*22
HashKey_5_k    = 16*23
HashKey_6_k    = 16*24
HashKey_7_k    = 16*25
HashKey_8_k    = 16*26

# Argument mapping: the first six arguments arrive in registers, the
# remaining three are read from the caller's stack through %r14 (which
# snapshots %rsp after the four register pushes -- see STACK_OFFSET).
#define arg1 %rdi
#define arg2 %rsi
#define arg3 %rdx
#define arg4 %rcx
#define arg5 %r8
#define arg6 %r9
#define arg7 STACK_OFFSET+8*1(%r14)
#define arg8 STACK_OFFSET+8*2(%r14)
#define arg9 STACK_OFFSET+8*3(%r14)

# Assembly-time counters; setreg turns them into register aliases
# (reg_i = %xmm<i>, reg_j = %xmm<j>) so that .rep loops can step
# through the xmm register file.
i = 0
j = 0

out_order = 0
in_order = 1
DEC = 0
ENC = 1

.macro define_reg r n
reg_\r = %xmm\n
.endm

.macro setreg
.altmacro
define_reg i %i
define_reg j %j
.noaltmacro
.endm

# STACK_OFFSET skips the four GPRs pushed before %r14 captures %rsp;
# stack arguments are addressed as STACK_OFFSET+8*n(%r14), where the
# slot at STACK_OFFSET itself is the return address.
STACK_OFFSET = 8*4

# 16-byte scratch slots in the aligned local frame (%rsp), used to spill
# the previous batch of ciphertext blocks and to stage partial blocks.
TMP1 = 16*0
TMP2 = 16*1
TMP3 = 16*2
TMP4 = 16*3
TMP5 = 16*4
TMP6 = 16*5
TMP7 = 16*6
TMP8 = 16*7

VARIABLE_OFFSET = 16*8


# ENCRYPT_SINGLE_BLOCK: AES-128 encrypt \XMM0 in place with the round
# keys stored at the start of the context (arg1): whitening xor, nine
# full rounds, one final round.
.macro ENCRYPT_SINGLE_BLOCK XMM0
                vpxor    (arg1), \XMM0, \XMM0
                i = 1
                setreg
.rep 9
                vaesenc  16*i(arg1), \XMM0, \XMM0
                i = (i+1)
                setreg
.endr
                vaesenclast 16*10(arg1), \XMM0, \XMM0
.endm

#ifdef CONFIG_AS_AVX
268
269
270
271
272
273
274
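###############################################################################
# GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5
# GHASH multiplication: GH = GH * HK in GF(2^128) reduced by
# x^128 + x^7 + x^2 + x + 1 (bit-reflected convention).
# Karatsuba: three VPCLMULQDQ products (high halves, low halves, and the
# xors of the halves), then a shift-based reduction of the 256-bit result.
# HK is preserved, T1-T5 are clobbered, the product replaces GH.
###############################################################################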
275.macro GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5
276
277 vpshufd $0b01001110, \GH, \T2
278 vpshufd $0b01001110, \HK, \T3
279 vpxor \GH , \T2, \T2
280 vpxor \HK , \T3, \T3
281
282 vpclmulqdq $0x11, \HK, \GH, \T1
283 vpclmulqdq $0x00, \HK, \GH, \GH
284 vpclmulqdq $0x00, \T3, \T2, \T2
285 vpxor \GH, \T2,\T2
286 vpxor \T1, \T2,\T2
287
288 vpslldq $8, \T2,\T3
289 vpsrldq $8, \T2,\T2
290 vpxor \T3, \GH, \GH
291 vpxor \T2, \T1, \T1
292
293
294 vpslld $31, \GH, \T2
295 vpslld $30, \GH, \T3
296 vpslld $25, \GH, \T4
297
298 vpxor \T3, \T2, \T2
299 vpxor \T4, \T2, \T2
300
301 vpsrldq $4, \T2, \T5
302
303 vpslldq $12, \T2, \T2
304 vpxor \T2, \GH, \GH
305
306
307
308 vpsrld $1,\GH, \T2
309 vpsrld $2,\GH, \T3
310 vpsrld $7,\GH, \T4
311 vpxor \T3, \T2, \T2
312 vpxor \T4, \T2, \T2
313
314 vpxor \T5, \T2, \T2
315 vpxor \T2, \GH, \GH
316 vpxor \T1, \GH, \GH
317
318
319.endm
320
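###############################################################################
# PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6
# Computes HashKey^2 .. HashKey^8 and stores them at HashKey_2 .. HashKey_8
# in the context (arg1).  For HashKey and each power it also stores the xor
# of the two 64-bit halves (HashKey_k .. HashKey_8_k), which the Karatsuba
# multiplications in the AVX path read back instead of recomputing.
###############################################################################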
321.macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6
322
323
324 vmovdqa \HK, \T5
325
326 vpshufd $0b01001110, \T5, \T1
327 vpxor \T5, \T1, \T1
328 vmovdqa \T1, HashKey_k(arg1)
329
330 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2
331 vmovdqa \T5, HashKey_2(arg1)
332 vpshufd $0b01001110, \T5, \T1
333 vpxor \T5, \T1, \T1
334 vmovdqa \T1, HashKey_2_k(arg1)
335
336 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2
337 vmovdqa \T5, HashKey_3(arg1)
338 vpshufd $0b01001110, \T5, \T1
339 vpxor \T5, \T1, \T1
340 vmovdqa \T1, HashKey_3_k(arg1)
341
342 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2
343 vmovdqa \T5, HashKey_4(arg1)
344 vpshufd $0b01001110, \T5, \T1
345 vpxor \T5, \T1, \T1
346 vmovdqa \T1, HashKey_4_k(arg1)
347
348 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2
349 vmovdqa \T5, HashKey_5(arg1)
350 vpshufd $0b01001110, \T5, \T1
351 vpxor \T5, \T1, \T1
352 vmovdqa \T1, HashKey_5_k(arg1)
353
354 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2
355 vmovdqa \T5, HashKey_6(arg1)
356 vpshufd $0b01001110, \T5, \T1
357 vpxor \T5, \T1, \T1
358 vmovdqa \T1, HashKey_6_k(arg1)
359
360 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2
361 vmovdqa \T5, HashKey_7(arg1)
362 vpshufd $0b01001110, \T5, \T1
363 vpxor \T5, \T1, \T1
364 vmovdqa \T1, HashKey_7_k(arg1)
365
366 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2
367 vmovdqa \T5, HashKey_8(arg1)
368 vpshufd $0b01001110, \T5, \T1
369 vpxor \T5, \T1, \T1
370 vmovdqa \T1, HashKey_8_k(arg1)
371
372.endm
373
374
375
376
377
378
379
380
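###############################################################################
# INITIAL_BLOCKS_AVX:
#  - folds the AAD (pointer in arg6, length in arg7) into a running GHASH,
#    16 bytes at a time, left-justifying the final partial block via
#    aad_shift_arr;
#  - CTR-encrypts (or decrypts) the first \num_initial_blocks blocks of
#    arg3 into arg2 and hashes the resulting ciphertext; the accumulated
#    hash ends up in \XMM8 and is spilled to TMP1(%rsp);
#  - if at least 128 bytes of text remain (%r13), it additionally processes
#    the next eight blocks into \XMM1-\XMM8 (byte-swapped, hash folded into
#    \XMM1) so the 8-way main loop can hash them on its first iteration.
###############################################################################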
381.macro INITIAL_BLOCKS_AVX num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
382 i = (8-\num_initial_blocks)
383 j = 0
384 setreg
385
386 mov arg6, %r10
387 mov arg7, %r12
388
389
390 mov %r12, %r11
391
392 vpxor reg_j, reg_j, reg_j
393 vpxor reg_i, reg_i, reg_i
394 cmp $16, %r11
395 jl _get_AAD_rest8\@
396_get_AAD_blocks\@:
397 vmovdqu (%r10), reg_i
398 vpshufb SHUF_MASK(%rip), reg_i, reg_i
399 vpxor reg_i, reg_j, reg_j
400 GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6
401 add $16, %r10
402 sub $16, %r12
403 sub $16, %r11
404 cmp $16, %r11
405 jge _get_AAD_blocks\@
406 vmovdqu reg_j, reg_i
407 cmp $0, %r11
408 je _get_AAD_done\@
409
410 vpxor reg_i, reg_i, reg_i
411
412
413
414
415_get_AAD_rest8\@:
416 cmp $4, %r11
417 jle _get_AAD_rest4\@
418 movq (%r10), \T1
419 add $8, %r10
420 sub $8, %r11
421 vpslldq $8, \T1, \T1
422 vpsrldq $8, reg_i, reg_i
423 vpxor \T1, reg_i, reg_i
424 jmp _get_AAD_rest8\@
425_get_AAD_rest4\@:
426 cmp $0, %r11
427 jle _get_AAD_rest0\@
428 mov (%r10), %eax
429 movq %rax, \T1
430 add $4, %r10
431 sub $4, %r11
432 vpslldq $12, \T1, \T1
433 vpsrldq $4, reg_i, reg_i
434 vpxor \T1, reg_i, reg_i
435_get_AAD_rest0\@:
436
437
438
439 movq %r12, %r11
440 salq $4, %r11
441 movdqu aad_shift_arr(%r11), \T1
442 vpshufb \T1, reg_i, reg_i
443_get_AAD_rest_final\@:
444 vpshufb SHUF_MASK(%rip), reg_i, reg_i
445 vpxor reg_j, reg_i, reg_i
446 GHASH_MUL_AVX reg_i, \T2, \T1, \T3, \T4, \T5, \T6
447
448_get_AAD_done\@:
449
450 xor %r11, %r11
451
452
453 mov arg5, %rax
454 vmovdqu (%rax), \CTR
455 vpshufb SHUF_MASK(%rip), \CTR, \CTR
456
457
458 i = (9-\num_initial_blocks)
459 setreg
460.rep \num_initial_blocks
461 vpaddd ONE(%rip), \CTR, \CTR
462 vmovdqa \CTR, reg_i
463 vpshufb SHUF_MASK(%rip), reg_i, reg_i
464 i = (i+1)
465 setreg
466.endr
467
468 vmovdqa (arg1), \T_key
469 i = (9-\num_initial_blocks)
470 setreg
471.rep \num_initial_blocks
472 vpxor \T_key, reg_i, reg_i
473 i = (i+1)
474 setreg
475.endr
476
477 j = 1
478 setreg
479.rep 9
480 vmovdqa 16*j(arg1), \T_key
481 i = (9-\num_initial_blocks)
482 setreg
483.rep \num_initial_blocks
484 vaesenc \T_key, reg_i, reg_i
485 i = (i+1)
486 setreg
487.endr
488
489 j = (j+1)
490 setreg
491.endr
492
493
494 vmovdqa 16*10(arg1), \T_key
495 i = (9-\num_initial_blocks)
496 setreg
497.rep \num_initial_blocks
498 vaesenclast \T_key, reg_i, reg_i
499 i = (i+1)
500 setreg
501.endr
502
503 i = (9-\num_initial_blocks)
504 setreg
505.rep \num_initial_blocks
506 vmovdqu (arg3, %r11), \T1
507 vpxor \T1, reg_i, reg_i
508 vmovdqu reg_i, (arg2 , %r11)
509 add $16, %r11
510.if \ENC_DEC == DEC
511 vmovdqa \T1, reg_i
512.endif
513 vpshufb SHUF_MASK(%rip), reg_i, reg_i
514 i = (i+1)
515 setreg
516.endr
517
518
519 i = (8-\num_initial_blocks)
520 j = (9-\num_initial_blocks)
521 setreg
522
523.rep \num_initial_blocks
524 vpxor reg_i, reg_j, reg_j
525 GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6
526 i = (i+1)
527 j = (j+1)
528 setreg
529.endr
530
531
532 vmovdqa \XMM8, TMP1(%rsp)
533 vmovdqa \XMM8, \T3
534
535 cmp $128, %r13
536 jl _initial_blocks_done\@
537
538
539
540 vpaddd ONE(%rip), \CTR, \CTR
541 vmovdqa \CTR, \XMM1
542 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1
543
544 vpaddd ONE(%rip), \CTR, \CTR
545 vmovdqa \CTR, \XMM2
546 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2
547
548 vpaddd ONE(%rip), \CTR, \CTR
549 vmovdqa \CTR, \XMM3
550 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3
551
552 vpaddd ONE(%rip), \CTR, \CTR
553 vmovdqa \CTR, \XMM4
554 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4
555
556 vpaddd ONE(%rip), \CTR, \CTR
557 vmovdqa \CTR, \XMM5
558 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5
559
560 vpaddd ONE(%rip), \CTR, \CTR
561 vmovdqa \CTR, \XMM6
562 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6
563
564 vpaddd ONE(%rip), \CTR, \CTR
565 vmovdqa \CTR, \XMM7
566 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7
567
568 vpaddd ONE(%rip), \CTR, \CTR
569 vmovdqa \CTR, \XMM8
570 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8
571
572 vmovdqa (arg1), \T_key
573 vpxor \T_key, \XMM1, \XMM1
574 vpxor \T_key, \XMM2, \XMM2
575 vpxor \T_key, \XMM3, \XMM3
576 vpxor \T_key, \XMM4, \XMM4
577 vpxor \T_key, \XMM5, \XMM5
578 vpxor \T_key, \XMM6, \XMM6
579 vpxor \T_key, \XMM7, \XMM7
580 vpxor \T_key, \XMM8, \XMM8
581
582 i = 1
583 setreg
584.rep 9
585 vmovdqa 16*i(arg1), \T_key
586 vaesenc \T_key, \XMM1, \XMM1
587 vaesenc \T_key, \XMM2, \XMM2
588 vaesenc \T_key, \XMM3, \XMM3
589 vaesenc \T_key, \XMM4, \XMM4
590 vaesenc \T_key, \XMM5, \XMM5
591 vaesenc \T_key, \XMM6, \XMM6
592 vaesenc \T_key, \XMM7, \XMM7
593 vaesenc \T_key, \XMM8, \XMM8
594 i = (i+1)
595 setreg
596.endr
597
598
599 vmovdqa 16*i(arg1), \T_key
600 vaesenclast \T_key, \XMM1, \XMM1
601 vaesenclast \T_key, \XMM2, \XMM2
602 vaesenclast \T_key, \XMM3, \XMM3
603 vaesenclast \T_key, \XMM4, \XMM4
604 vaesenclast \T_key, \XMM5, \XMM5
605 vaesenclast \T_key, \XMM6, \XMM6
606 vaesenclast \T_key, \XMM7, \XMM7
607 vaesenclast \T_key, \XMM8, \XMM8
608
609 vmovdqu (arg3, %r11), \T1
610 vpxor \T1, \XMM1, \XMM1
611 vmovdqu \XMM1, (arg2 , %r11)
612 .if \ENC_DEC == DEC
613 vmovdqa \T1, \XMM1
614 .endif
615
616 vmovdqu 16*1(arg3, %r11), \T1
617 vpxor \T1, \XMM2, \XMM2
618 vmovdqu \XMM2, 16*1(arg2 , %r11)
619 .if \ENC_DEC == DEC
620 vmovdqa \T1, \XMM2
621 .endif
622
623 vmovdqu 16*2(arg3, %r11), \T1
624 vpxor \T1, \XMM3, \XMM3
625 vmovdqu \XMM3, 16*2(arg2 , %r11)
626 .if \ENC_DEC == DEC
627 vmovdqa \T1, \XMM3
628 .endif
629
630 vmovdqu 16*3(arg3, %r11), \T1
631 vpxor \T1, \XMM4, \XMM4
632 vmovdqu \XMM4, 16*3(arg2 , %r11)
633 .if \ENC_DEC == DEC
634 vmovdqa \T1, \XMM4
635 .endif
636
637 vmovdqu 16*4(arg3, %r11), \T1
638 vpxor \T1, \XMM5, \XMM5
639 vmovdqu \XMM5, 16*4(arg2 , %r11)
640 .if \ENC_DEC == DEC
641 vmovdqa \T1, \XMM5
642 .endif
643
644 vmovdqu 16*5(arg3, %r11), \T1
645 vpxor \T1, \XMM6, \XMM6
646 vmovdqu \XMM6, 16*5(arg2 , %r11)
647 .if \ENC_DEC == DEC
648 vmovdqa \T1, \XMM6
649 .endif
650
651 vmovdqu 16*6(arg3, %r11), \T1
652 vpxor \T1, \XMM7, \XMM7
653 vmovdqu \XMM7, 16*6(arg2 , %r11)
654 .if \ENC_DEC == DEC
655 vmovdqa \T1, \XMM7
656 .endif
657
658 vmovdqu 16*7(arg3, %r11), \T1
659 vpxor \T1, \XMM8, \XMM8
660 vmovdqu \XMM8, 16*7(arg2 , %r11)
661 .if \ENC_DEC == DEC
662 vmovdqa \T1, \XMM8
663 .endif
664
665 add $128, %r11
666
667 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1
668 vpxor TMP1(%rsp), \XMM1, \XMM1
669 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2
670 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3
671 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4
672 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5
673 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6
674 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7
675 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8
676
677
678
679_initial_blocks_done\@:
680
681.endm
682
683
684
685
686
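###############################################################################
# GHASH_8_ENCRYPT_8_PARALLEL_AVX: body of the 8-blocks-at-a-time main loop.
# Encrypts eight fresh counter blocks while GHASHing the eight ciphertext
# blocks produced by the previous iteration (XMM1-XMM8, spilled to
# TMP2-TMP8), interleaving VAESENC with VPCLMULQDQ to hide latencies.
# \loop_idx chooses the counter update: in_order increments the
# byte-swapped counter (ONE) and re-swaps it, out_order adds ONEf directly,
# which the caller only uses while the low counter byte cannot wrap.
# The updated hash is folded into \XMM1 for the next iteration.
###############################################################################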
687.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
688
689 vmovdqa \XMM1, \T2
690 vmovdqa \XMM2, TMP2(%rsp)
691 vmovdqa \XMM3, TMP3(%rsp)
692 vmovdqa \XMM4, TMP4(%rsp)
693 vmovdqa \XMM5, TMP5(%rsp)
694 vmovdqa \XMM6, TMP6(%rsp)
695 vmovdqa \XMM7, TMP7(%rsp)
696 vmovdqa \XMM8, TMP8(%rsp)
697
698.if \loop_idx == in_order
699 vpaddd ONE(%rip), \CTR, \XMM1
700 vpaddd ONE(%rip), \XMM1, \XMM2
701 vpaddd ONE(%rip), \XMM2, \XMM3
702 vpaddd ONE(%rip), \XMM3, \XMM4
703 vpaddd ONE(%rip), \XMM4, \XMM5
704 vpaddd ONE(%rip), \XMM5, \XMM6
705 vpaddd ONE(%rip), \XMM6, \XMM7
706 vpaddd ONE(%rip), \XMM7, \XMM8
707 vmovdqa \XMM8, \CTR
708
709 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1
710 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2
711 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3
712 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4
713 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5
714 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6
715 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7
716 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8
717.else
718 vpaddd ONEf(%rip), \CTR, \XMM1
719 vpaddd ONEf(%rip), \XMM1, \XMM2
720 vpaddd ONEf(%rip), \XMM2, \XMM3
721 vpaddd ONEf(%rip), \XMM3, \XMM4
722 vpaddd ONEf(%rip), \XMM4, \XMM5
723 vpaddd ONEf(%rip), \XMM5, \XMM6
724 vpaddd ONEf(%rip), \XMM6, \XMM7
725 vpaddd ONEf(%rip), \XMM7, \XMM8
726 vmovdqa \XMM8, \CTR
727.endif
728
729
730
731
732 vmovdqu (arg1), \T1
733 vpxor \T1, \XMM1, \XMM1
734 vpxor \T1, \XMM2, \XMM2
735 vpxor \T1, \XMM3, \XMM3
736 vpxor \T1, \XMM4, \XMM4
737 vpxor \T1, \XMM5, \XMM5
738 vpxor \T1, \XMM6, \XMM6
739 vpxor \T1, \XMM7, \XMM7
740 vpxor \T1, \XMM8, \XMM8
741
742
743
744
745
746
747
748 vmovdqu 16*1(arg1), \T1
749 vaesenc \T1, \XMM1, \XMM1
750 vaesenc \T1, \XMM2, \XMM2
751 vaesenc \T1, \XMM3, \XMM3
752 vaesenc \T1, \XMM4, \XMM4
753 vaesenc \T1, \XMM5, \XMM5
754 vaesenc \T1, \XMM6, \XMM6
755 vaesenc \T1, \XMM7, \XMM7
756 vaesenc \T1, \XMM8, \XMM8
757
758 vmovdqu 16*2(arg1), \T1
759 vaesenc \T1, \XMM1, \XMM1
760 vaesenc \T1, \XMM2, \XMM2
761 vaesenc \T1, \XMM3, \XMM3
762 vaesenc \T1, \XMM4, \XMM4
763 vaesenc \T1, \XMM5, \XMM5
764 vaesenc \T1, \XMM6, \XMM6
765 vaesenc \T1, \XMM7, \XMM7
766 vaesenc \T1, \XMM8, \XMM8
767
768
769
770
771 vmovdqa HashKey_8(arg1), \T5
772 vpclmulqdq $0x11, \T5, \T2, \T4
773 vpclmulqdq $0x00, \T5, \T2, \T7
774
775 vpshufd $0b01001110, \T2, \T6
776 vpxor \T2, \T6, \T6
777
778 vmovdqa HashKey_8_k(arg1), \T5
779 vpclmulqdq $0x00, \T5, \T6, \T6
780
781 vmovdqu 16*3(arg1), \T1
782 vaesenc \T1, \XMM1, \XMM1
783 vaesenc \T1, \XMM2, \XMM2
784 vaesenc \T1, \XMM3, \XMM3
785 vaesenc \T1, \XMM4, \XMM4
786 vaesenc \T1, \XMM5, \XMM5
787 vaesenc \T1, \XMM6, \XMM6
788 vaesenc \T1, \XMM7, \XMM7
789 vaesenc \T1, \XMM8, \XMM8
790
791 vmovdqa TMP2(%rsp), \T1
792 vmovdqa HashKey_7(arg1), \T5
793 vpclmulqdq $0x11, \T5, \T1, \T3
794 vpxor \T3, \T4, \T4
795 vpclmulqdq $0x00, \T5, \T1, \T3
796 vpxor \T3, \T7, \T7
797
798 vpshufd $0b01001110, \T1, \T3
799 vpxor \T1, \T3, \T3
800 vmovdqa HashKey_7_k(arg1), \T5
801 vpclmulqdq $0x10, \T5, \T3, \T3
802 vpxor \T3, \T6, \T6
803
804 vmovdqu 16*4(arg1), \T1
805 vaesenc \T1, \XMM1, \XMM1
806 vaesenc \T1, \XMM2, \XMM2
807 vaesenc \T1, \XMM3, \XMM3
808 vaesenc \T1, \XMM4, \XMM4
809 vaesenc \T1, \XMM5, \XMM5
810 vaesenc \T1, \XMM6, \XMM6
811 vaesenc \T1, \XMM7, \XMM7
812 vaesenc \T1, \XMM8, \XMM8
813
814
815
816 vmovdqa TMP3(%rsp), \T1
817 vmovdqa HashKey_6(arg1), \T5
818 vpclmulqdq $0x11, \T5, \T1, \T3
819 vpxor \T3, \T4, \T4
820 vpclmulqdq $0x00, \T5, \T1, \T3
821 vpxor \T3, \T7, \T7
822
823 vpshufd $0b01001110, \T1, \T3
824 vpxor \T1, \T3, \T3
825 vmovdqa HashKey_6_k(arg1), \T5
826 vpclmulqdq $0x10, \T5, \T3, \T3
827 vpxor \T3, \T6, \T6
828
829 vmovdqu 16*5(arg1), \T1
830 vaesenc \T1, \XMM1, \XMM1
831 vaesenc \T1, \XMM2, \XMM2
832 vaesenc \T1, \XMM3, \XMM3
833 vaesenc \T1, \XMM4, \XMM4
834 vaesenc \T1, \XMM5, \XMM5
835 vaesenc \T1, \XMM6, \XMM6
836 vaesenc \T1, \XMM7, \XMM7
837 vaesenc \T1, \XMM8, \XMM8
838
839 vmovdqa TMP4(%rsp), \T1
840 vmovdqa HashKey_5(arg1), \T5
841 vpclmulqdq $0x11, \T5, \T1, \T3
842 vpxor \T3, \T4, \T4
843 vpclmulqdq $0x00, \T5, \T1, \T3
844 vpxor \T3, \T7, \T7
845
846 vpshufd $0b01001110, \T1, \T3
847 vpxor \T1, \T3, \T3
848 vmovdqa HashKey_5_k(arg1), \T5
849 vpclmulqdq $0x10, \T5, \T3, \T3
850 vpxor \T3, \T6, \T6
851
852 vmovdqu 16*6(arg1), \T1
853 vaesenc \T1, \XMM1, \XMM1
854 vaesenc \T1, \XMM2, \XMM2
855 vaesenc \T1, \XMM3, \XMM3
856 vaesenc \T1, \XMM4, \XMM4
857 vaesenc \T1, \XMM5, \XMM5
858 vaesenc \T1, \XMM6, \XMM6
859 vaesenc \T1, \XMM7, \XMM7
860 vaesenc \T1, \XMM8, \XMM8
861
862
863 vmovdqa TMP5(%rsp), \T1
864 vmovdqa HashKey_4(arg1), \T5
865 vpclmulqdq $0x11, \T5, \T1, \T3
866 vpxor \T3, \T4, \T4
867 vpclmulqdq $0x00, \T5, \T1, \T3
868 vpxor \T3, \T7, \T7
869
870 vpshufd $0b01001110, \T1, \T3
871 vpxor \T1, \T3, \T3
872 vmovdqa HashKey_4_k(arg1), \T5
873 vpclmulqdq $0x10, \T5, \T3, \T3
874 vpxor \T3, \T6, \T6
875
876 vmovdqu 16*7(arg1), \T1
877 vaesenc \T1, \XMM1, \XMM1
878 vaesenc \T1, \XMM2, \XMM2
879 vaesenc \T1, \XMM3, \XMM3
880 vaesenc \T1, \XMM4, \XMM4
881 vaesenc \T1, \XMM5, \XMM5
882 vaesenc \T1, \XMM6, \XMM6
883 vaesenc \T1, \XMM7, \XMM7
884 vaesenc \T1, \XMM8, \XMM8
885
886 vmovdqa TMP6(%rsp), \T1
887 vmovdqa HashKey_3(arg1), \T5
888 vpclmulqdq $0x11, \T5, \T1, \T3
889 vpxor \T3, \T4, \T4
890 vpclmulqdq $0x00, \T5, \T1, \T3
891 vpxor \T3, \T7, \T7
892
893 vpshufd $0b01001110, \T1, \T3
894 vpxor \T1, \T3, \T3
895 vmovdqa HashKey_3_k(arg1), \T5
896 vpclmulqdq $0x10, \T5, \T3, \T3
897 vpxor \T3, \T6, \T6
898
899
900 vmovdqu 16*8(arg1), \T1
901 vaesenc \T1, \XMM1, \XMM1
902 vaesenc \T1, \XMM2, \XMM2
903 vaesenc \T1, \XMM3, \XMM3
904 vaesenc \T1, \XMM4, \XMM4
905 vaesenc \T1, \XMM5, \XMM5
906 vaesenc \T1, \XMM6, \XMM6
907 vaesenc \T1, \XMM7, \XMM7
908 vaesenc \T1, \XMM8, \XMM8
909
910 vmovdqa TMP7(%rsp), \T1
911 vmovdqa HashKey_2(arg1), \T5
912 vpclmulqdq $0x11, \T5, \T1, \T3
913 vpxor \T3, \T4, \T4
914 vpclmulqdq $0x00, \T5, \T1, \T3
915 vpxor \T3, \T7, \T7
916
917 vpshufd $0b01001110, \T1, \T3
918 vpxor \T1, \T3, \T3
919 vmovdqa HashKey_2_k(arg1), \T5
920 vpclmulqdq $0x10, \T5, \T3, \T3
921 vpxor \T3, \T6, \T6
922
923
924
925 vmovdqu 16*9(arg1), \T5
926 vaesenc \T5, \XMM1, \XMM1
927 vaesenc \T5, \XMM2, \XMM2
928 vaesenc \T5, \XMM3, \XMM3
929 vaesenc \T5, \XMM4, \XMM4
930 vaesenc \T5, \XMM5, \XMM5
931 vaesenc \T5, \XMM6, \XMM6
932 vaesenc \T5, \XMM7, \XMM7
933 vaesenc \T5, \XMM8, \XMM8
934
935 vmovdqa TMP8(%rsp), \T1
936 vmovdqa HashKey(arg1), \T5
937 vpclmulqdq $0x11, \T5, \T1, \T3
938 vpxor \T3, \T4, \T4
939 vpclmulqdq $0x00, \T5, \T1, \T3
940 vpxor \T3, \T7, \T7
941
942 vpshufd $0b01001110, \T1, \T3
943 vpxor \T1, \T3, \T3
944 vmovdqa HashKey_k(arg1), \T5
945 vpclmulqdq $0x10, \T5, \T3, \T3
946 vpxor \T3, \T6, \T6
947
948 vpxor \T4, \T6, \T6
949 vpxor \T7, \T6, \T6
950
951 vmovdqu 16*10(arg1), \T5
952
953 i = 0
954 j = 1
955 setreg
956.rep 8
957 vpxor 16*i(arg3, %r11), \T5, \T2
958 .if \ENC_DEC == ENC
959 vaesenclast \T2, reg_j, reg_j
960 .else
961 vaesenclast \T2, reg_j, \T3
962 vmovdqu 16*i(arg3, %r11), reg_j
963 vmovdqu \T3, 16*i(arg2, %r11)
964 .endif
965 i = (i+1)
966 j = (j+1)
967 setreg
968.endr
969
970
971
972 vpslldq $8, \T6, \T3
973 vpsrldq $8, \T6, \T6
974 vpxor \T3, \T7, \T7
975 vpxor \T4, \T6, \T6
976
977
978
979
980
981
982 vpslld $31, \T7, \T2
983 vpslld $30, \T7, \T3
984 vpslld $25, \T7, \T4
985
986 vpxor \T3, \T2, \T2
987 vpxor \T4, \T2, \T2
988
989 vpsrldq $4, \T2, \T1
990
991 vpslldq $12, \T2, \T2
992 vpxor \T2, \T7, \T7
993
994 .if \ENC_DEC == ENC
995 vmovdqu \XMM1, 16*0(arg2,%r11)
996 vmovdqu \XMM2, 16*1(arg2,%r11)
997 vmovdqu \XMM3, 16*2(arg2,%r11)
998 vmovdqu \XMM4, 16*3(arg2,%r11)
999 vmovdqu \XMM5, 16*4(arg2,%r11)
1000 vmovdqu \XMM6, 16*5(arg2,%r11)
1001 vmovdqu \XMM7, 16*6(arg2,%r11)
1002 vmovdqu \XMM8, 16*7(arg2,%r11)
1003 .endif
1004
1005
1006
1007 vpsrld $1, \T7, \T2
1008 vpsrld $2, \T7, \T3
1009 vpsrld $7, \T7, \T4
1010 vpxor \T3, \T2, \T2
1011 vpxor \T4, \T2, \T2
1012
1013 vpxor \T1, \T2, \T2
1014 vpxor \T2, \T7, \T7
1015 vpxor \T7, \T6, \T6
1016
1017
1018 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1
1019 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2
1020 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3
1021 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4
1022 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5
1023 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6
1024 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7
1025 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8
1026
1027
1028 vpxor \T6, \XMM1, \XMM1
1029
1030
1031
1032.endm
1033
1034
1035
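###############################################################################
# GHASH_LAST_8_AVX: folds the final eight ciphertext blocks (XMM1-XMM8)
# into the hash using HashKey^8 .. HashKey (Karatsuba with the precomputed
# HashKey_i_k values), then performs one shift-based reduction.
# The reduced hash is left in \T6 (%xmm14 at the call sites below).
###############################################################################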
1036.macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
1037
1038
1039
1040
1041 vpshufd $0b01001110, \XMM1, \T2
1042 vpxor \XMM1, \T2, \T2
1043 vmovdqa HashKey_8(arg1), \T5
1044 vpclmulqdq $0x11, \T5, \XMM1, \T6
1045 vpclmulqdq $0x00, \T5, \XMM1, \T7
1046
1047 vmovdqa HashKey_8_k(arg1), \T3
1048 vpclmulqdq $0x00, \T3, \T2, \XMM1
1049
1050
1051
1052 vpshufd $0b01001110, \XMM2, \T2
1053 vpxor \XMM2, \T2, \T2
1054 vmovdqa HashKey_7(arg1), \T5
1055 vpclmulqdq $0x11, \T5, \XMM2, \T4
1056 vpxor \T4, \T6, \T6
1057
1058 vpclmulqdq $0x00, \T5, \XMM2, \T4
1059 vpxor \T4, \T7, \T7
1060
1061 vmovdqa HashKey_7_k(arg1), \T3
1062 vpclmulqdq $0x00, \T3, \T2, \T2
1063 vpxor \T2, \XMM1, \XMM1
1064
1065
1066
1067 vpshufd $0b01001110, \XMM3, \T2
1068 vpxor \XMM3, \T2, \T2
1069 vmovdqa HashKey_6(arg1), \T5
1070 vpclmulqdq $0x11, \T5, \XMM3, \T4
1071 vpxor \T4, \T6, \T6
1072
1073 vpclmulqdq $0x00, \T5, \XMM3, \T4
1074 vpxor \T4, \T7, \T7
1075
1076 vmovdqa HashKey_6_k(arg1), \T3
1077 vpclmulqdq $0x00, \T3, \T2, \T2
1078 vpxor \T2, \XMM1, \XMM1
1079
1080
1081
1082 vpshufd $0b01001110, \XMM4, \T2
1083 vpxor \XMM4, \T2, \T2
1084 vmovdqa HashKey_5(arg1), \T5
1085 vpclmulqdq $0x11, \T5, \XMM4, \T4
1086 vpxor \T4, \T6, \T6
1087
1088 vpclmulqdq $0x00, \T5, \XMM4, \T4
1089 vpxor \T4, \T7, \T7
1090
1091 vmovdqa HashKey_5_k(arg1), \T3
1092 vpclmulqdq $0x00, \T3, \T2, \T2
1093 vpxor \T2, \XMM1, \XMM1
1094
1095
1096
1097 vpshufd $0b01001110, \XMM5, \T2
1098 vpxor \XMM5, \T2, \T2
1099 vmovdqa HashKey_4(arg1), \T5
1100 vpclmulqdq $0x11, \T5, \XMM5, \T4
1101 vpxor \T4, \T6, \T6
1102
1103 vpclmulqdq $0x00, \T5, \XMM5, \T4
1104 vpxor \T4, \T7, \T7
1105
1106 vmovdqa HashKey_4_k(arg1), \T3
1107 vpclmulqdq $0x00, \T3, \T2, \T2
1108 vpxor \T2, \XMM1, \XMM1
1109
1110
1111
1112 vpshufd $0b01001110, \XMM6, \T2
1113 vpxor \XMM6, \T2, \T2
1114 vmovdqa HashKey_3(arg1), \T5
1115 vpclmulqdq $0x11, \T5, \XMM6, \T4
1116 vpxor \T4, \T6, \T6
1117
1118 vpclmulqdq $0x00, \T5, \XMM6, \T4
1119 vpxor \T4, \T7, \T7
1120
1121 vmovdqa HashKey_3_k(arg1), \T3
1122 vpclmulqdq $0x00, \T3, \T2, \T2
1123 vpxor \T2, \XMM1, \XMM1
1124
1125
1126
1127 vpshufd $0b01001110, \XMM7, \T2
1128 vpxor \XMM7, \T2, \T2
1129 vmovdqa HashKey_2(arg1), \T5
1130 vpclmulqdq $0x11, \T5, \XMM7, \T4
1131 vpxor \T4, \T6, \T6
1132
1133 vpclmulqdq $0x00, \T5, \XMM7, \T4
1134 vpxor \T4, \T7, \T7
1135
1136 vmovdqa HashKey_2_k(arg1), \T3
1137 vpclmulqdq $0x00, \T3, \T2, \T2
1138 vpxor \T2, \XMM1, \XMM1
1139
1140
1141
1142 vpshufd $0b01001110, \XMM8, \T2
1143 vpxor \XMM8, \T2, \T2
1144 vmovdqa HashKey(arg1), \T5
1145 vpclmulqdq $0x11, \T5, \XMM8, \T4
1146 vpxor \T4, \T6, \T6
1147
1148 vpclmulqdq $0x00, \T5, \XMM8, \T4
1149 vpxor \T4, \T7, \T7
1150
1151 vmovdqa HashKey_k(arg1), \T3
1152 vpclmulqdq $0x00, \T3, \T2, \T2
1153
1154 vpxor \T2, \XMM1, \XMM1
1155 vpxor \T6, \XMM1, \XMM1
1156 vpxor \T7, \XMM1, \T2
1157
1158
1159
1160
1161 vpslldq $8, \T2, \T4
1162 vpsrldq $8, \T2, \T2
1163
1164 vpxor \T4, \T7, \T7
1165 vpxor \T2, \T6, \T6
1166
1167
1168
1169
1170 vpslld $31, \T7, \T2
1171 vpslld $30, \T7, \T3
1172 vpslld $25, \T7, \T4
1173
1174 vpxor \T3, \T2, \T2
1175 vpxor \T4, \T2, \T2
1176
1177 vpsrldq $4, \T2, \T1
1178
1179 vpslldq $12, \T2, \T2
1180 vpxor \T2, \T7, \T7
1181
1182
1183
1184
1185 vpsrld $1, \T7, \T2
1186 vpsrld $2, \T7, \T3
1187 vpsrld $7, \T7, \T4
1188 vpxor \T3, \T2, \T2
1189 vpxor \T4, \T2, \T2
1190
1191 vpxor \T1, \T2, \T2
1192 vpxor \T2, \T7, \T7
1193 vpxor \T7, \T6, \T6
1194
1195.endm
1196
1197
1198
1199
1200
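###############################################################################
# GCM_ENC_DEC_AVX ENC_DEC: top-level GCM flow for the gen2 entry points.
#  1. Set up the stack frame and load HashKey into %xmm13.
#  2. INITIAL_BLOCKS_AVX: hash the AAD and handle the first
#     (full blocks mod 8) blocks of text.
#  3. Main loop: GHASH_8_ENCRYPT_8_PARALLEL_AVX per 128 bytes, tracking the
#     low counter byte in %r15d to pick in_order/out_order updates, then
#     GHASH_LAST_8_AVX.
#  4. Trailing partial block: ENCRYPT_SINGLE_BLOCK plus SHIFT_MASK/ALL_F
#     masking of the keystream.
#  5. Hash the 64-bit bit-lengths of AAD and text, encrypt the counter
#     block at arg5 and write min(arg9, 16) tag bytes to arg8.
###############################################################################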
1201.macro GCM_ENC_DEC_AVX ENC_DEC
1202
1203
1204 push %r12
1205 push %r13
1206 push %r14
1207 push %r15
1208
1209 mov %rsp, %r14
1210
1211
1212
1213
1214 sub $VARIABLE_OFFSET, %rsp
1215 and $~63, %rsp
1216
1217
1218 vmovdqu HashKey(arg1), %xmm13
1219
1220 mov arg4, %r13
1221 and $-16, %r13
1222
1223 mov %r13, %r12
1224 shr $4, %r12
1225 and $7, %r12
1226 jz _initial_num_blocks_is_0\@
1227
1228 cmp $7, %r12
1229 je _initial_num_blocks_is_7\@
1230 cmp $6, %r12
1231 je _initial_num_blocks_is_6\@
1232 cmp $5, %r12
1233 je _initial_num_blocks_is_5\@
1234 cmp $4, %r12
1235 je _initial_num_blocks_is_4\@
1236 cmp $3, %r12
1237 je _initial_num_blocks_is_3\@
1238 cmp $2, %r12
1239 je _initial_num_blocks_is_2\@
1240
1241 jmp _initial_num_blocks_is_1\@
1242
1243_initial_num_blocks_is_7\@:
1244 INITIAL_BLOCKS_AVX 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1245 sub $16*7, %r13
1246 jmp _initial_blocks_encrypted\@
1247
1248_initial_num_blocks_is_6\@:
1249 INITIAL_BLOCKS_AVX 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1250 sub $16*6, %r13
1251 jmp _initial_blocks_encrypted\@
1252
1253_initial_num_blocks_is_5\@:
1254 INITIAL_BLOCKS_AVX 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1255 sub $16*5, %r13
1256 jmp _initial_blocks_encrypted\@
1257
1258_initial_num_blocks_is_4\@:
1259 INITIAL_BLOCKS_AVX 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1260 sub $16*4, %r13
1261 jmp _initial_blocks_encrypted\@
1262
1263_initial_num_blocks_is_3\@:
1264 INITIAL_BLOCKS_AVX 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1265 sub $16*3, %r13
1266 jmp _initial_blocks_encrypted\@
1267
1268_initial_num_blocks_is_2\@:
1269 INITIAL_BLOCKS_AVX 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1270 sub $16*2, %r13
1271 jmp _initial_blocks_encrypted\@
1272
1273_initial_num_blocks_is_1\@:
1274 INITIAL_BLOCKS_AVX 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1275 sub $16*1, %r13
1276 jmp _initial_blocks_encrypted\@
1277
1278_initial_num_blocks_is_0\@:
1279 INITIAL_BLOCKS_AVX 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1280
1281
1282_initial_blocks_encrypted\@:
1283 cmp $0, %r13
1284 je _zero_cipher_left\@
1285
1286 sub $128, %r13
1287 je _eight_cipher_left\@
1288
1289
1290
1291
1292 vmovd %xmm9, %r15d
1293 and $255, %r15d
1294 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1295
1296
1297_encrypt_by_8_new\@:
1298 cmp $(255-8), %r15d
1299 jg _encrypt_by_8\@
1300
1301
1302
1303 add $8, %r15b
1304 GHASH_8_ENCRYPT_8_PARALLEL_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
1305 add $128, %r11
1306 sub $128, %r13
1307 jne _encrypt_by_8_new\@
1308
1309 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1310 jmp _eight_cipher_left\@
1311
1312_encrypt_by_8\@:
1313 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1314 add $8, %r15b
1315 GHASH_8_ENCRYPT_8_PARALLEL_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
1316 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1317 add $128, %r11
1318 sub $128, %r13
1319 jne _encrypt_by_8_new\@
1320
1321 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1322
1323
1324
1325
1326_eight_cipher_left\@:
1327 GHASH_LAST_8_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
1328
1329
1330_zero_cipher_left\@:
1331 cmp $16, arg4
1332 jl _only_less_than_16\@
1333
1334 mov arg4, %r13
1335 and $15, %r13
1336
1337 je _multiple_of_16_bytes\@
1338
1339
1340
1341
1342 vpaddd ONE(%rip), %xmm9, %xmm9
1343 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1344 ENCRYPT_SINGLE_BLOCK %xmm9
1345
1346 sub $16, %r11
1347 add %r13, %r11
1348 vmovdqu (arg3, %r11), %xmm1
1349
1350 lea SHIFT_MASK+16(%rip), %r12
1351 sub %r13, %r12
1352
1353
1354 vmovdqu (%r12), %xmm2
1355 vpshufb %xmm2, %xmm1, %xmm1
1356 jmp _final_ghash_mul\@
1357
1358_only_less_than_16\@:
1359
1360 mov arg4, %r13
1361 and $15, %r13
1362
1363 je _multiple_of_16_bytes\@
1364
1365
1366
1367
1368 vpaddd ONE(%rip), %xmm9, %xmm9
1369 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1370 ENCRYPT_SINGLE_BLOCK %xmm9
1371
1372
1373 lea SHIFT_MASK+16(%rip), %r12
1374 sub %r13, %r12
1375
1376
1377
1378_get_last_16_byte_loop\@:
1379 movb (arg3, %r11), %al
1380 movb %al, TMP1 (%rsp , %r11)
1381 add $1, %r11
1382 cmp %r13, %r11
1383 jne _get_last_16_byte_loop\@
1384
1385 vmovdqu TMP1(%rsp), %xmm1
1386
1387 sub $16, %r11
1388
1389_final_ghash_mul\@:
1390 .if \ENC_DEC == DEC
1391 vmovdqa %xmm1, %xmm2
1392 vpxor %xmm1, %xmm9, %xmm9
1393 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
1394
1395 vpand %xmm1, %xmm9, %xmm9
1396 vpand %xmm1, %xmm2, %xmm2
1397 vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
1398 vpxor %xmm2, %xmm14, %xmm14
1399
1400 GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
1401 sub %r13, %r11
1402 add $16, %r11
1403 .else
1404 vpxor %xmm1, %xmm9, %xmm9
1405 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
1406
1407 vpand %xmm1, %xmm9, %xmm9
1408 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1409 vpxor %xmm9, %xmm14, %xmm14
1410
1411 GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
1412 sub %r13, %r11
1413 add $16, %r11
1414 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1415 .endif
1416
1417
1418
1419
1420 vmovq %xmm9, %rax
1421 cmp $8, %r13
1422 jle _less_than_8_bytes_left\@
1423
1424 mov %rax, (arg2 , %r11)
1425 add $8, %r11
1426 vpsrldq $8, %xmm9, %xmm9
1427 vmovq %xmm9, %rax
1428 sub $8, %r13
1429
1430_less_than_8_bytes_left\@:
1431 movb %al, (arg2 , %r11)
1432 add $1, %r11
1433 shr $8, %rax
1434 sub $1, %r13
1435 jne _less_than_8_bytes_left\@
1436
1437
1438_multiple_of_16_bytes\@:
1439 mov arg7, %r12
1440 shl $3, %r12
1441 vmovd %r12d, %xmm15
1442
1443 shl $3, arg4
1444 vmovq arg4, %xmm1
1445 vpslldq $8, %xmm15, %xmm15
1446 vpxor %xmm1, %xmm15, %xmm15
1447
1448 vpxor %xmm15, %xmm14, %xmm14
1449 GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
1450 vpshufb SHUF_MASK(%rip), %xmm14, %xmm14
1451
1452 mov arg5, %rax
1453 vmovdqu (%rax), %xmm9
1454
1455 ENCRYPT_SINGLE_BLOCK %xmm9
1456
1457 vpxor %xmm14, %xmm9, %xmm9
1458
1459
1460
1461_return_T\@:
1462 mov arg8, %r10
1463 mov arg9, %r11
1464
1465 cmp $16, %r11
1466 je _T_16\@
1467
1468 cmp $8, %r11
1469 jl _T_4\@
1470
1471_T_8\@:
1472 vmovq %xmm9, %rax
1473 mov %rax, (%r10)
1474 add $8, %r10
1475 sub $8, %r11
1476 vpsrldq $8, %xmm9, %xmm9
1477 cmp $0, %r11
1478 je _return_T_done\@
1479_T_4\@:
1480 vmovd %xmm9, %eax
1481 mov %eax, (%r10)
1482 add $4, %r10
1483 sub $4, %r11
1484 vpsrldq $4, %xmm9, %xmm9
1485 cmp $0, %r11
1486 je _return_T_done\@
1487_T_123\@:
1488 vmovd %xmm9, %eax
1489 cmp $2, %r11
1490 jl _T_1\@
1491 mov %ax, (%r10)
1492 cmp $2, %r11
1493 je _return_T_done\@
1494 add $2, %r10
1495 sar $16, %eax
1496_T_1\@:
1497 mov %al, (%r10)
1498 jmp _return_T_done\@
1499
1500_T_16\@:
1501 vmovdqu %xmm9, (%r10)
1502
1503_return_T_done\@:
1504 mov %r14, %rsp
1505
1506 pop %r15
1507 pop %r14
1508 pop %r13
1509 pop %r12
1510.endm
1511
1512
1513
1514
1515
1516
1517
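###############################################################################
# void aesni_gcm_precomp_avx_gen2(void *ctx, u8 *hash_subkey)
#   (prototype inferred from the register usage below)
# Reads the hash subkey from arg2, computes HashKey<<1 mod poly, stores it
# at HashKey(arg1) and precomputes its powers via PRECOMPUTE_AVX.
###############################################################################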
ENTRY(aesni_gcm_precomp_avx_gen2)
        # save the callee-clobbered GPRs and build an aligned scratch frame
        push    %r12
        push    %r13
        push    %r14
        push    %r15

        mov     %rsp, %r14

        sub     $VARIABLE_OFFSET, %rsp
        and     $~63, %rsp                  # align rsp to 64 bytes

        vmovdqu (arg2), %xmm6               # xmm6 = HashKey

        vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
        # compute HashKey<<1 mod poly from the HashKey (required for GHASH)
        vmovdqa %xmm6, %xmm2
        vpsllq  $1, %xmm6, %xmm6
        vpsrlq  $63, %xmm2, %xmm2
        vmovdqa %xmm2, %xmm1
        vpslldq $8, %xmm2, %xmm2
        vpsrldq $8, %xmm1, %xmm1
        vpor    %xmm2, %xmm6, %xmm6
        # reduce if the bit shifted out was set
        vpshufd $0b00100100, %xmm1, %xmm2
        vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
        vpand   POLY(%rip), %xmm2, %xmm2
        vpxor   %xmm2, %xmm6, %xmm6

        vmovdqa %xmm6, HashKey(arg1)        # store HashKey<<1 mod poly

        PRECOMPUTE_AVX %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5

        mov     %r14, %rsp

        pop     %r15
        pop     %r14
        pop     %r13
        pop     %r12
        ret
ENDPROC(aesni_gcm_precomp_avx_gen2)
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
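###############################################################################
# void aesni_gcm_enc_avx_gen2(void *ctx, u8 *out, const u8 *in,
#                             u64 plaintext_len, u8 *iv, const u8 *aad,
#                             u64 aad_len, u8 *auth_tag, u64 auth_tag_len)
#   (prototype inferred from the argument usage in GCM_ENC_DEC_AVX)
# Encrypts arg4 bytes from arg3 into arg2 and writes an arg9-byte
# authentication tag to arg8.
###############################################################################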
ENTRY(aesni_gcm_enc_avx_gen2)
        GCM_ENC_DEC_AVX ENC
        ret
ENDPROC(aesni_gcm_enc_avx_gen2)
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
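###############################################################################
# aesni_gcm_dec_avx_gen2: same calling convention as aesni_gcm_enc_avx_gen2,
# but arg3 is ciphertext and arg2 receives plaintext.  The computed tag is
# still written to arg8 for the caller to compare.
###############################################################################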
ENTRY(aesni_gcm_dec_avx_gen2)
        GCM_ENC_DEC_AVX DEC
        ret
ENDPROC(aesni_gcm_dec_avx_gen2)
#endif /* CONFIG_AS_AVX */

#ifdef CONFIG_AS_AVX2
1607
1608
1609
1610
1611
1612
1613
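###############################################################################
# GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5
# GH = GH * HK in GF(2^128), as in GHASH_MUL_AVX, but using four
# VPCLMULQDQ products (schoolbook) and a carry-less-multiply reduction
# against the POLY2 constant instead of the shift sequence.
###############################################################################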
1614.macro GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5
1615
1616 vpclmulqdq $0x11,\HK,\GH,\T1
1617 vpclmulqdq $0x00,\HK,\GH,\T2
1618 vpclmulqdq $0x01,\HK,\GH,\T3
1619 vpclmulqdq $0x10,\HK,\GH,\GH
1620 vpxor \T3, \GH, \GH
1621
1622
1623 vpsrldq $8 , \GH, \T3
1624 vpslldq $8 , \GH, \GH
1625
1626 vpxor \T3, \T1, \T1
1627 vpxor \T2, \GH, \GH
1628
1629
1630
1631 vmovdqa POLY2(%rip), \T3
1632
1633 vpclmulqdq $0x01, \GH, \T3, \T2
1634 vpslldq $8, \T2, \T2
1635
1636 vpxor \T2, \GH, \GH
1637
1638
1639 vpclmulqdq $0x00, \GH, \T3, \T2
1640 vpsrldq $4, \T2, \T2
1641
1642 vpclmulqdq $0x10, \GH, \T3, \GH
1643 vpslldq $4, \GH, \GH
1644
1645 vpxor \T2, \GH, \GH
1646
1647 vpxor \T1, \GH, \GH
1648
1649
1650.endm
1651
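###############################################################################
# PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6
# Caches HashKey^2 .. HashKey^8 at HashKey_2 .. HashKey_8(arg1).  The
# HashKey_i_k values are not stored; where the xor of the key halves is
# needed (GHASH_LAST_8_AVX2) it is recomputed on the fly.
###############################################################################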
1652.macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6
1653
1654
1655 vmovdqa \HK, \T5
1656 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2
1657 vmovdqa \T5, HashKey_2(arg1)
1658
1659 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2
1660 vmovdqa \T5, HashKey_3(arg1)
1661
1662 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2
1663 vmovdqa \T5, HashKey_4(arg1)
1664
1665 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2
1666 vmovdqa \T5, HashKey_5(arg1)
1667
1668 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2
1669 vmovdqa \T5, HashKey_6(arg1)
1670
1671 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2
1672 vmovdqa \T5, HashKey_7(arg1)
1673
1674 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2
1675 vmovdqa \T5, HashKey_8(arg1)
1676
1677.endm
1678
1679
1680
1681
1682
1683
1684
1685
1686
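###############################################################################
# INITIAL_BLOCKS_AVX2: gen4 counterpart of INITIAL_BLOCKS_AVX -- hashes the
# AAD with GHASH_MUL_AVX2, CTR-processes the first \num_initial_blocks
# blocks, and pre-processes the next eight blocks when at least 128 bytes
# remain.  The trailing VER parameter is accepted but not referenced in
# the body.
###############################################################################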
1687.macro INITIAL_BLOCKS_AVX2 num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
1688 i = (8-\num_initial_blocks)
1689 j = 0
1690 setreg
1691
1692 mov arg6, %r10
1693 mov arg7, %r12
1694
1695
1696 mov %r12, %r11
1697
1698 vpxor reg_j, reg_j, reg_j
1699 vpxor reg_i, reg_i, reg_i
1700
1701 cmp $16, %r11
1702 jl _get_AAD_rest8\@
1703_get_AAD_blocks\@:
1704 vmovdqu (%r10), reg_i
1705 vpshufb SHUF_MASK(%rip), reg_i, reg_i
1706 vpxor reg_i, reg_j, reg_j
1707 GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6
1708 add $16, %r10
1709 sub $16, %r12
1710 sub $16, %r11
1711 cmp $16, %r11
1712 jge _get_AAD_blocks\@
1713 vmovdqu reg_j, reg_i
1714 cmp $0, %r11
1715 je _get_AAD_done\@
1716
1717 vpxor reg_i, reg_i, reg_i
1718
1719
1720
1721
1722_get_AAD_rest8\@:
1723 cmp $4, %r11
1724 jle _get_AAD_rest4\@
1725 movq (%r10), \T1
1726 add $8, %r10
1727 sub $8, %r11
1728 vpslldq $8, \T1, \T1
1729 vpsrldq $8, reg_i, reg_i
1730 vpxor \T1, reg_i, reg_i
1731 jmp _get_AAD_rest8\@
1732_get_AAD_rest4\@:
1733 cmp $0, %r11
1734 jle _get_AAD_rest0\@
1735 mov (%r10), %eax
1736 movq %rax, \T1
1737 add $4, %r10
1738 sub $4, %r11
1739 vpslldq $12, \T1, \T1
1740 vpsrldq $4, reg_i, reg_i
1741 vpxor \T1, reg_i, reg_i
1742_get_AAD_rest0\@:
1743
1744
1745
1746 movq %r12, %r11
1747 salq $4, %r11
1748 movdqu aad_shift_arr(%r11), \T1
1749 vpshufb \T1, reg_i, reg_i
1750_get_AAD_rest_final\@:
1751 vpshufb SHUF_MASK(%rip), reg_i, reg_i
1752 vpxor reg_j, reg_i, reg_i
1753 GHASH_MUL_AVX2 reg_i, \T2, \T1, \T3, \T4, \T5, \T6
1754
1755_get_AAD_done\@:
1756
1757 xor %r11, %r11
1758
1759
1760 mov arg5, %rax
1761 vmovdqu (%rax), \CTR
1762 vpshufb SHUF_MASK(%rip), \CTR, \CTR
1763
1764
1765 i = (9-\num_initial_blocks)
1766 setreg
1767.rep \num_initial_blocks
1768 vpaddd ONE(%rip), \CTR, \CTR
1769 vmovdqa \CTR, reg_i
1770 vpshufb SHUF_MASK(%rip), reg_i, reg_i
1771 i = (i+1)
1772 setreg
1773.endr
1774
1775 vmovdqa (arg1), \T_key
1776 i = (9-\num_initial_blocks)
1777 setreg
1778.rep \num_initial_blocks
1779 vpxor \T_key, reg_i, reg_i
1780 i = (i+1)
1781 setreg
1782.endr
1783
1784 j = 1
1785 setreg
1786.rep 9
1787 vmovdqa 16*j(arg1), \T_key
1788 i = (9-\num_initial_blocks)
1789 setreg
1790.rep \num_initial_blocks
1791 vaesenc \T_key, reg_i, reg_i
1792 i = (i+1)
1793 setreg
1794.endr
1795
1796 j = (j+1)
1797 setreg
1798.endr
1799
1800
1801 vmovdqa 16*10(arg1), \T_key
1802 i = (9-\num_initial_blocks)
1803 setreg
1804.rep \num_initial_blocks
1805 vaesenclast \T_key, reg_i, reg_i
1806 i = (i+1)
1807 setreg
1808.endr
1809
1810 i = (9-\num_initial_blocks)
1811 setreg
1812.rep \num_initial_blocks
1813 vmovdqu (arg3, %r11), \T1
1814 vpxor \T1, reg_i, reg_i
1815 vmovdqu reg_i, (arg2 , %r11)
1816
1817 add $16, %r11
1818.if \ENC_DEC == DEC
1819 vmovdqa \T1, reg_i
1820.endif
1821 vpshufb SHUF_MASK(%rip), reg_i, reg_i
1822 i = (i+1)
1823 setreg
1824.endr
1825
1826
1827 i = (8-\num_initial_blocks)
1828 j = (9-\num_initial_blocks)
1829 setreg
1830
1831.rep \num_initial_blocks
1832 vpxor reg_i, reg_j, reg_j
1833 GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6
1834 i = (i+1)
1835 j = (j+1)
1836 setreg
1837.endr
1838
1839
1840 vmovdqa \XMM8, TMP1(%rsp)
1841 vmovdqa \XMM8, \T3
1842
1843 cmp $128, %r13
1844 jl _initial_blocks_done\@
1845
1846
1847
1848 vpaddd ONE(%rip), \CTR, \CTR
1849 vmovdqa \CTR, \XMM1
1850 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1
1851
1852 vpaddd ONE(%rip), \CTR, \CTR
1853 vmovdqa \CTR, \XMM2
1854 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2
1855
1856 vpaddd ONE(%rip), \CTR, \CTR
1857 vmovdqa \CTR, \XMM3
1858 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3
1859
1860 vpaddd ONE(%rip), \CTR, \CTR
1861 vmovdqa \CTR, \XMM4
1862 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4
1863
1864 vpaddd ONE(%rip), \CTR, \CTR
1865 vmovdqa \CTR, \XMM5
1866 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5
1867
1868 vpaddd ONE(%rip), \CTR, \CTR
1869 vmovdqa \CTR, \XMM6
1870 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6
1871
1872 vpaddd ONE(%rip), \CTR, \CTR
1873 vmovdqa \CTR, \XMM7
1874 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7
1875
1876 vpaddd ONE(%rip), \CTR, \CTR
1877 vmovdqa \CTR, \XMM8
1878 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8
1879
1880 vmovdqa (arg1), \T_key
1881 vpxor \T_key, \XMM1, \XMM1
1882 vpxor \T_key, \XMM2, \XMM2
1883 vpxor \T_key, \XMM3, \XMM3
1884 vpxor \T_key, \XMM4, \XMM4
1885 vpxor \T_key, \XMM5, \XMM5
1886 vpxor \T_key, \XMM6, \XMM6
1887 vpxor \T_key, \XMM7, \XMM7
1888 vpxor \T_key, \XMM8, \XMM8
1889
1890 i = 1
1891 setreg
1892.rep 9
1893 vmovdqa 16*i(arg1), \T_key
1894 vaesenc \T_key, \XMM1, \XMM1
1895 vaesenc \T_key, \XMM2, \XMM2
1896 vaesenc \T_key, \XMM3, \XMM3
1897 vaesenc \T_key, \XMM4, \XMM4
1898 vaesenc \T_key, \XMM5, \XMM5
1899 vaesenc \T_key, \XMM6, \XMM6
1900 vaesenc \T_key, \XMM7, \XMM7
1901 vaesenc \T_key, \XMM8, \XMM8
1902 i = (i+1)
1903 setreg
1904.endr
1905
1906
1907 vmovdqa 16*i(arg1), \T_key
1908 vaesenclast \T_key, \XMM1, \XMM1
1909 vaesenclast \T_key, \XMM2, \XMM2
1910 vaesenclast \T_key, \XMM3, \XMM3
1911 vaesenclast \T_key, \XMM4, \XMM4
1912 vaesenclast \T_key, \XMM5, \XMM5
1913 vaesenclast \T_key, \XMM6, \XMM6
1914 vaesenclast \T_key, \XMM7, \XMM7
1915 vaesenclast \T_key, \XMM8, \XMM8
1916
1917 vmovdqu (arg3, %r11), \T1
1918 vpxor \T1, \XMM1, \XMM1
1919 vmovdqu \XMM1, (arg2 , %r11)
1920 .if \ENC_DEC == DEC
1921 vmovdqa \T1, \XMM1
1922 .endif
1923
1924 vmovdqu 16*1(arg3, %r11), \T1
1925 vpxor \T1, \XMM2, \XMM2
1926 vmovdqu \XMM2, 16*1(arg2 , %r11)
1927 .if \ENC_DEC == DEC
1928 vmovdqa \T1, \XMM2
1929 .endif
1930
1931 vmovdqu 16*2(arg3, %r11), \T1
1932 vpxor \T1, \XMM3, \XMM3
1933 vmovdqu \XMM3, 16*2(arg2 , %r11)
1934 .if \ENC_DEC == DEC
1935 vmovdqa \T1, \XMM3
1936 .endif
1937
1938 vmovdqu 16*3(arg3, %r11), \T1
1939 vpxor \T1, \XMM4, \XMM4
1940 vmovdqu \XMM4, 16*3(arg2 , %r11)
1941 .if \ENC_DEC == DEC
1942 vmovdqa \T1, \XMM4
1943 .endif
1944
1945 vmovdqu 16*4(arg3, %r11), \T1
1946 vpxor \T1, \XMM5, \XMM5
1947 vmovdqu \XMM5, 16*4(arg2 , %r11)
1948 .if \ENC_DEC == DEC
1949 vmovdqa \T1, \XMM5
1950 .endif
1951
1952 vmovdqu 16*5(arg3, %r11), \T1
1953 vpxor \T1, \XMM6, \XMM6
1954 vmovdqu \XMM6, 16*5(arg2 , %r11)
1955 .if \ENC_DEC == DEC
1956 vmovdqa \T1, \XMM6
1957 .endif
1958
1959 vmovdqu 16*6(arg3, %r11), \T1
1960 vpxor \T1, \XMM7, \XMM7
1961 vmovdqu \XMM7, 16*6(arg2 , %r11)
1962 .if \ENC_DEC == DEC
1963 vmovdqa \T1, \XMM7
1964 .endif
1965
1966 vmovdqu 16*7(arg3, %r11), \T1
1967 vpxor \T1, \XMM8, \XMM8
1968 vmovdqu \XMM8, 16*7(arg2 , %r11)
1969 .if \ENC_DEC == DEC
1970 vmovdqa \T1, \XMM8
1971 .endif
1972
1973 add $128, %r11
1974
1975 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1
1976 vpxor TMP1(%rsp), \XMM1, \XMM1
1977
1978 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2
1979 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3
1980 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4
1981 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5
1982 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6
1983 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7
1984 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8
1985
1986
1987
1988_initial_blocks_done\@:
1989
1990
1991.endm
1992
1993
1994
1995
1996
1997
1998
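###############################################################################
# GHASH_8_ENCRYPT_8_PARALLEL_AVX2: 8-way loop body for the gen4 path.  Same
# structure as the AVX version, but each block is hashed with four
# VPCLMULQDQ products and the final reduction uses the POLY2 sequence.
###############################################################################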
1999.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
2000
2001 vmovdqa \XMM1, \T2
2002 vmovdqa \XMM2, TMP2(%rsp)
2003 vmovdqa \XMM3, TMP3(%rsp)
2004 vmovdqa \XMM4, TMP4(%rsp)
2005 vmovdqa \XMM5, TMP5(%rsp)
2006 vmovdqa \XMM6, TMP6(%rsp)
2007 vmovdqa \XMM7, TMP7(%rsp)
2008 vmovdqa \XMM8, TMP8(%rsp)
2009
2010.if \loop_idx == in_order
2011 vpaddd ONE(%rip), \CTR, \XMM1
2012 vpaddd ONE(%rip), \XMM1, \XMM2
2013 vpaddd ONE(%rip), \XMM2, \XMM3
2014 vpaddd ONE(%rip), \XMM3, \XMM4
2015 vpaddd ONE(%rip), \XMM4, \XMM5
2016 vpaddd ONE(%rip), \XMM5, \XMM6
2017 vpaddd ONE(%rip), \XMM6, \XMM7
2018 vpaddd ONE(%rip), \XMM7, \XMM8
2019 vmovdqa \XMM8, \CTR
2020
2021 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1
2022 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2
2023 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3
2024 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4
2025 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5
2026 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6
2027 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7
2028 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8
2029.else
2030 vpaddd ONEf(%rip), \CTR, \XMM1
2031 vpaddd ONEf(%rip), \XMM1, \XMM2
2032 vpaddd ONEf(%rip), \XMM2, \XMM3
2033 vpaddd ONEf(%rip), \XMM3, \XMM4
2034 vpaddd ONEf(%rip), \XMM4, \XMM5
2035 vpaddd ONEf(%rip), \XMM5, \XMM6
2036 vpaddd ONEf(%rip), \XMM6, \XMM7
2037 vpaddd ONEf(%rip), \XMM7, \XMM8
2038 vmovdqa \XMM8, \CTR
2039.endif
2040
2041
2042
2043
2044 vmovdqu (arg1), \T1
2045 vpxor \T1, \XMM1, \XMM1
2046 vpxor \T1, \XMM2, \XMM2
2047 vpxor \T1, \XMM3, \XMM3
2048 vpxor \T1, \XMM4, \XMM4
2049 vpxor \T1, \XMM5, \XMM5
2050 vpxor \T1, \XMM6, \XMM6
2051 vpxor \T1, \XMM7, \XMM7
2052 vpxor \T1, \XMM8, \XMM8
2053
2054
2055
2056
2057
2058
2059
2060 vmovdqu 16*1(arg1), \T1
2061 vaesenc \T1, \XMM1, \XMM1
2062 vaesenc \T1, \XMM2, \XMM2
2063 vaesenc \T1, \XMM3, \XMM3
2064 vaesenc \T1, \XMM4, \XMM4
2065 vaesenc \T1, \XMM5, \XMM5
2066 vaesenc \T1, \XMM6, \XMM6
2067 vaesenc \T1, \XMM7, \XMM7
2068 vaesenc \T1, \XMM8, \XMM8
2069
2070 vmovdqu 16*2(arg1), \T1
2071 vaesenc \T1, \XMM1, \XMM1
2072 vaesenc \T1, \XMM2, \XMM2
2073 vaesenc \T1, \XMM3, \XMM3
2074 vaesenc \T1, \XMM4, \XMM4
2075 vaesenc \T1, \XMM5, \XMM5
2076 vaesenc \T1, \XMM6, \XMM6
2077 vaesenc \T1, \XMM7, \XMM7
2078 vaesenc \T1, \XMM8, \XMM8
2079
2080
2081
2082
2083 vmovdqa HashKey_8(arg1), \T5
2084 vpclmulqdq $0x11, \T5, \T2, \T4
2085 vpclmulqdq $0x00, \T5, \T2, \T7
2086 vpclmulqdq $0x01, \T5, \T2, \T6
2087 vpclmulqdq $0x10, \T5, \T2, \T5
2088 vpxor \T5, \T6, \T6
2089
2090 vmovdqu 16*3(arg1), \T1
2091 vaesenc \T1, \XMM1, \XMM1
2092 vaesenc \T1, \XMM2, \XMM2
2093 vaesenc \T1, \XMM3, \XMM3
2094 vaesenc \T1, \XMM4, \XMM4
2095 vaesenc \T1, \XMM5, \XMM5
2096 vaesenc \T1, \XMM6, \XMM6
2097 vaesenc \T1, \XMM7, \XMM7
2098 vaesenc \T1, \XMM8, \XMM8
2099
2100 vmovdqa TMP2(%rsp), \T1
2101 vmovdqa HashKey_7(arg1), \T5
2102 vpclmulqdq $0x11, \T5, \T1, \T3
2103 vpxor \T3, \T4, \T4
2104
2105 vpclmulqdq $0x00, \T5, \T1, \T3
2106 vpxor \T3, \T7, \T7
2107
2108 vpclmulqdq $0x01, \T5, \T1, \T3
2109 vpxor \T3, \T6, \T6
2110
2111 vpclmulqdq $0x10, \T5, \T1, \T3
2112 vpxor \T3, \T6, \T6
2113
2114 vmovdqu 16*4(arg1), \T1
2115 vaesenc \T1, \XMM1, \XMM1
2116 vaesenc \T1, \XMM2, \XMM2
2117 vaesenc \T1, \XMM3, \XMM3
2118 vaesenc \T1, \XMM4, \XMM4
2119 vaesenc \T1, \XMM5, \XMM5
2120 vaesenc \T1, \XMM6, \XMM6
2121 vaesenc \T1, \XMM7, \XMM7
2122 vaesenc \T1, \XMM8, \XMM8
2123
2124
2125
2126 vmovdqa TMP3(%rsp), \T1
2127 vmovdqa HashKey_6(arg1), \T5
2128 vpclmulqdq $0x11, \T5, \T1, \T3
2129 vpxor \T3, \T4, \T4
2130
2131 vpclmulqdq $0x00, \T5, \T1, \T3
2132 vpxor \T3, \T7, \T7
2133
2134 vpclmulqdq $0x01, \T5, \T1, \T3
2135 vpxor \T3, \T6, \T6
2136
2137 vpclmulqdq $0x10, \T5, \T1, \T3
2138 vpxor \T3, \T6, \T6
2139
2140 vmovdqu 16*5(arg1), \T1
2141 vaesenc \T1, \XMM1, \XMM1
2142 vaesenc \T1, \XMM2, \XMM2
2143 vaesenc \T1, \XMM3, \XMM3
2144 vaesenc \T1, \XMM4, \XMM4
2145 vaesenc \T1, \XMM5, \XMM5
2146 vaesenc \T1, \XMM6, \XMM6
2147 vaesenc \T1, \XMM7, \XMM7
2148 vaesenc \T1, \XMM8, \XMM8
2149
2150 vmovdqa TMP4(%rsp), \T1
2151 vmovdqa HashKey_5(arg1), \T5
2152 vpclmulqdq $0x11, \T5, \T1, \T3
2153 vpxor \T3, \T4, \T4
2154
2155 vpclmulqdq $0x00, \T5, \T1, \T3
2156 vpxor \T3, \T7, \T7
2157
2158 vpclmulqdq $0x01, \T5, \T1, \T3
2159 vpxor \T3, \T6, \T6
2160
2161 vpclmulqdq $0x10, \T5, \T1, \T3
2162 vpxor \T3, \T6, \T6
2163
2164 vmovdqu 16*6(arg1), \T1
2165 vaesenc \T1, \XMM1, \XMM1
2166 vaesenc \T1, \XMM2, \XMM2
2167 vaesenc \T1, \XMM3, \XMM3
2168 vaesenc \T1, \XMM4, \XMM4
2169 vaesenc \T1, \XMM5, \XMM5
2170 vaesenc \T1, \XMM6, \XMM6
2171 vaesenc \T1, \XMM7, \XMM7
2172 vaesenc \T1, \XMM8, \XMM8
2173
2174
2175 vmovdqa TMP5(%rsp), \T1
2176 vmovdqa HashKey_4(arg1), \T5
2177 vpclmulqdq $0x11, \T5, \T1, \T3
2178 vpxor \T3, \T4, \T4
2179
2180 vpclmulqdq $0x00, \T5, \T1, \T3
2181 vpxor \T3, \T7, \T7
2182
2183 vpclmulqdq $0x01, \T5, \T1, \T3
2184 vpxor \T3, \T6, \T6
2185
2186 vpclmulqdq $0x10, \T5, \T1, \T3
2187 vpxor \T3, \T6, \T6
2188
2189 vmovdqu 16*7(arg1), \T1
2190 vaesenc \T1, \XMM1, \XMM1
2191 vaesenc \T1, \XMM2, \XMM2
2192 vaesenc \T1, \XMM3, \XMM3
2193 vaesenc \T1, \XMM4, \XMM4
2194 vaesenc \T1, \XMM5, \XMM5
2195 vaesenc \T1, \XMM6, \XMM6
2196 vaesenc \T1, \XMM7, \XMM7
2197 vaesenc \T1, \XMM8, \XMM8
2198
2199 vmovdqa TMP6(%rsp), \T1
2200 vmovdqa HashKey_3(arg1), \T5
2201 vpclmulqdq $0x11, \T5, \T1, \T3
2202 vpxor \T3, \T4, \T4
2203
2204 vpclmulqdq $0x00, \T5, \T1, \T3
2205 vpxor \T3, \T7, \T7
2206
2207 vpclmulqdq $0x01, \T5, \T1, \T3
2208 vpxor \T3, \T6, \T6
2209
2210 vpclmulqdq $0x10, \T5, \T1, \T3
2211 vpxor \T3, \T6, \T6
2212
2213 vmovdqu 16*8(arg1), \T1
2214 vaesenc \T1, \XMM1, \XMM1
2215 vaesenc \T1, \XMM2, \XMM2
2216 vaesenc \T1, \XMM3, \XMM3
2217 vaesenc \T1, \XMM4, \XMM4
2218 vaesenc \T1, \XMM5, \XMM5
2219 vaesenc \T1, \XMM6, \XMM6
2220 vaesenc \T1, \XMM7, \XMM7
2221 vaesenc \T1, \XMM8, \XMM8
2222
2223 vmovdqa TMP7(%rsp), \T1
2224 vmovdqa HashKey_2(arg1), \T5
2225 vpclmulqdq $0x11, \T5, \T1, \T3
2226 vpxor \T3, \T4, \T4
2227
2228 vpclmulqdq $0x00, \T5, \T1, \T3
2229 vpxor \T3, \T7, \T7
2230
2231 vpclmulqdq $0x01, \T5, \T1, \T3
2232 vpxor \T3, \T6, \T6
2233
2234 vpclmulqdq $0x10, \T5, \T1, \T3
2235 vpxor \T3, \T6, \T6
2236
2237
2238
2239
2240 vmovdqu 16*9(arg1), \T5
2241 vaesenc \T5, \XMM1, \XMM1
2242 vaesenc \T5, \XMM2, \XMM2
2243 vaesenc \T5, \XMM3, \XMM3
2244 vaesenc \T5, \XMM4, \XMM4
2245 vaesenc \T5, \XMM5, \XMM5
2246 vaesenc \T5, \XMM6, \XMM6
2247 vaesenc \T5, \XMM7, \XMM7
2248 vaesenc \T5, \XMM8, \XMM8
2249
2250 vmovdqa TMP8(%rsp), \T1
2251 vmovdqa HashKey(arg1), \T5
2252
2253 vpclmulqdq $0x00, \T5, \T1, \T3
2254 vpxor \T3, \T7, \T7
2255
2256 vpclmulqdq $0x01, \T5, \T1, \T3
2257 vpxor \T3, \T6, \T6
2258
2259 vpclmulqdq $0x10, \T5, \T1, \T3
2260 vpxor \T3, \T6, \T6
2261
2262 vpclmulqdq $0x11, \T5, \T1, \T3
2263 vpxor \T3, \T4, \T1
2264
2265
2266 vmovdqu 16*10(arg1), \T5
2267
2268 i = 0
2269 j = 1
2270 setreg
2271.rep 8
2272 vpxor 16*i(arg3, %r11), \T5, \T2
2273 .if \ENC_DEC == ENC
2274 vaesenclast \T2, reg_j, reg_j
2275 .else
2276 vaesenclast \T2, reg_j, \T3
2277 vmovdqu 16*i(arg3, %r11), reg_j
2278 vmovdqu \T3, 16*i(arg2, %r11)
2279 .endif
2280 i = (i+1)
2281 j = (j+1)
2282 setreg
2283.endr
2284
2285
2286
2287 vpslldq $8, \T6, \T3
2288 vpsrldq $8, \T6, \T6
2289 vpxor \T3, \T7, \T7
2290 vpxor \T6, \T1, \T1
2291
2292
2293
2294
2295
2296 vmovdqa POLY2(%rip), \T3
2297
2298 vpclmulqdq $0x01, \T7, \T3, \T2
2299 vpslldq $8, \T2, \T2
2300
2301 vpxor \T2, \T7, \T7
2302
2303 .if \ENC_DEC == ENC
2304 vmovdqu \XMM1, 16*0(arg2,%r11)
2305 vmovdqu \XMM2, 16*1(arg2,%r11)
2306 vmovdqu \XMM3, 16*2(arg2,%r11)
2307 vmovdqu \XMM4, 16*3(arg2,%r11)
2308 vmovdqu \XMM5, 16*4(arg2,%r11)
2309 vmovdqu \XMM6, 16*5(arg2,%r11)
2310 vmovdqu \XMM7, 16*6(arg2,%r11)
2311 vmovdqu \XMM8, 16*7(arg2,%r11)
2312 .endif
2313
2314
2315
2316 vpclmulqdq $0x00, \T7, \T3, \T2
2317 vpsrldq $4, \T2, \T2
2318
2319 vpclmulqdq $0x10, \T7, \T3, \T4
2320 vpslldq $4, \T4, \T4
2321
2322 vpxor \T2, \T4, \T4
2323
2324 vpxor \T4, \T1, \T1
2325
2326 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1
2327 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2
2328 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3
2329 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4
2330 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5
2331 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6
2332 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7
2333 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8
2334
2335
2336 vpxor \T1, \XMM1, \XMM1
2337
2338
2339
2340.endm
2341
2342
2343
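###############################################################################
# GHASH_LAST_8_AVX2: folds the last eight ciphertext blocks (XMM1-XMM8)
# into the hash with HashKey^8 .. HashKey, recomputing the Karatsuba
# half-xors on the fly, and reduces with POLY2.  Result in \T6.
###############################################################################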
2344.macro GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
2345
2346
2347
2348 vmovdqa HashKey_8(arg1), \T5
2349
2350 vpshufd $0b01001110, \XMM1, \T2
2351 vpshufd $0b01001110, \T5, \T3
2352 vpxor \XMM1, \T2, \T2
2353 vpxor \T5, \T3, \T3
2354
2355 vpclmulqdq $0x11, \T5, \XMM1, \T6
2356 vpclmulqdq $0x00, \T5, \XMM1, \T7
2357
2358 vpclmulqdq $0x00, \T3, \T2, \XMM1
2359
2360
2361
2362 vmovdqa HashKey_7(arg1), \T5
2363 vpshufd $0b01001110, \XMM2, \T2
2364 vpshufd $0b01001110, \T5, \T3
2365 vpxor \XMM2, \T2, \T2
2366 vpxor \T5, \T3, \T3
2367
2368 vpclmulqdq $0x11, \T5, \XMM2, \T4
2369 vpxor \T4, \T6, \T6
2370
2371 vpclmulqdq $0x00, \T5, \XMM2, \T4
2372 vpxor \T4, \T7, \T7
2373
2374 vpclmulqdq $0x00, \T3, \T2, \T2
2375
2376 vpxor \T2, \XMM1, \XMM1
2377
2378
2379
2380 vmovdqa HashKey_6(arg1), \T5
2381 vpshufd $0b01001110, \XMM3, \T2
2382 vpshufd $0b01001110, \T5, \T3
2383 vpxor \XMM3, \T2, \T2
2384 vpxor \T5, \T3, \T3
2385
2386 vpclmulqdq $0x11, \T5, \XMM3, \T4
2387 vpxor \T4, \T6, \T6
2388
2389 vpclmulqdq $0x00, \T5, \XMM3, \T4
2390 vpxor \T4, \T7, \T7
2391
2392 vpclmulqdq $0x00, \T3, \T2, \T2
2393
2394 vpxor \T2, \XMM1, \XMM1
2395
2396
2397
2398 vmovdqa HashKey_5(arg1), \T5
2399 vpshufd $0b01001110, \XMM4, \T2
2400 vpshufd $0b01001110, \T5, \T3
2401 vpxor \XMM4, \T2, \T2
2402 vpxor \T5, \T3, \T3
2403
2404 vpclmulqdq $0x11, \T5, \XMM4, \T4
2405 vpxor \T4, \T6, \T6
2406
2407 vpclmulqdq $0x00, \T5, \XMM4, \T4
2408 vpxor \T4, \T7, \T7
2409
2410 vpclmulqdq $0x00, \T3, \T2, \T2
2411
2412 vpxor \T2, \XMM1, \XMM1
2413
2414
2415
2416 vmovdqa HashKey_4(arg1), \T5
2417 vpshufd $0b01001110, \XMM5, \T2
2418 vpshufd $0b01001110, \T5, \T3
2419 vpxor \XMM5, \T2, \T2
2420 vpxor \T5, \T3, \T3
2421
2422 vpclmulqdq $0x11, \T5, \XMM5, \T4
2423 vpxor \T4, \T6, \T6
2424
2425 vpclmulqdq $0x00, \T5, \XMM5, \T4
2426 vpxor \T4, \T7, \T7
2427
2428 vpclmulqdq $0x00, \T3, \T2, \T2
2429
2430 vpxor \T2, \XMM1, \XMM1
2431
2432
2433
2434 vmovdqa HashKey_3(arg1), \T5
2435 vpshufd $0b01001110, \XMM6, \T2
2436 vpshufd $0b01001110, \T5, \T3
2437 vpxor \XMM6, \T2, \T2
2438 vpxor \T5, \T3, \T3
2439
2440 vpclmulqdq $0x11, \T5, \XMM6, \T4
2441 vpxor \T4, \T6, \T6
2442
2443 vpclmulqdq $0x00, \T5, \XMM6, \T4
2444 vpxor \T4, \T7, \T7
2445
2446 vpclmulqdq $0x00, \T3, \T2, \T2
2447
2448 vpxor \T2, \XMM1, \XMM1
2449
2450
2451
2452 vmovdqa HashKey_2(arg1), \T5
2453 vpshufd $0b01001110, \XMM7, \T2
2454 vpshufd $0b01001110, \T5, \T3
2455 vpxor \XMM7, \T2, \T2
2456 vpxor \T5, \T3, \T3
2457
2458 vpclmulqdq $0x11, \T5, \XMM7, \T4
2459 vpxor \T4, \T6, \T6
2460
2461 vpclmulqdq $0x00, \T5, \XMM7, \T4
2462 vpxor \T4, \T7, \T7
2463
2464 vpclmulqdq $0x00, \T3, \T2, \T2
2465
2466 vpxor \T2, \XMM1, \XMM1
2467
2468
2469
2470 vmovdqa HashKey(arg1), \T5
2471 vpshufd $0b01001110, \XMM8, \T2
2472 vpshufd $0b01001110, \T5, \T3
2473 vpxor \XMM8, \T2, \T2
2474 vpxor \T5, \T3, \T3
2475
2476 vpclmulqdq $0x11, \T5, \XMM8, \T4
2477 vpxor \T4, \T6, \T6
2478
2479 vpclmulqdq $0x00, \T5, \XMM8, \T4
2480 vpxor \T4, \T7, \T7
2481
2482 vpclmulqdq $0x00, \T3, \T2, \T2
2483
2484 vpxor \T2, \XMM1, \XMM1
2485 vpxor \T6, \XMM1, \XMM1
2486 vpxor \T7, \XMM1, \T2
2487
2488
2489
2490
2491 vpslldq $8, \T2, \T4
2492 vpsrldq $8, \T2, \T2
2493
2494 vpxor \T4, \T7, \T7
2495 vpxor \T2, \T6, \T6
2496
2497
2498
2499
2500 vmovdqa POLY2(%rip), \T3
2501
2502 vpclmulqdq $0x01, \T7, \T3, \T2
2503 vpslldq $8, \T2, \T2
2504
2505 vpxor \T2, \T7, \T7
2506
2507
2508
2509
2510 vpclmulqdq $0x00, \T7, \T3, \T2
2511 vpsrldq $4, \T2, \T2
2512
2513 vpclmulqdq $0x10, \T7, \T3, \T4
2514 vpslldq $4, \T4, \T4
2515
2516 vpxor \T2, \T4, \T4
2517
2518 vpxor \T4, \T6, \T6
2519.endm
2520
2521
2522
2523
2524
2525
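###############################################################################
# GCM_ENC_DEC_AVX2 ENC_DEC: top-level GCM flow for the gen4 entry points;
# mirrors GCM_ENC_DEC_AVX step for step but calls the *_AVX2 helpers.
###############################################################################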
2526.macro GCM_ENC_DEC_AVX2 ENC_DEC
2527
2528
2529 push %r12
2530 push %r13
2531 push %r14
2532 push %r15
2533
2534 mov %rsp, %r14
2535
2536
2537
2538
2539 sub $VARIABLE_OFFSET, %rsp
2540 and $~63, %rsp
2541
2542
2543 vmovdqu HashKey(arg1), %xmm13
2544
2545 mov arg4, %r13
2546 and $-16, %r13
2547
2548 mov %r13, %r12
2549 shr $4, %r12
2550 and $7, %r12
2551 jz _initial_num_blocks_is_0\@
2552
2553 cmp $7, %r12
2554 je _initial_num_blocks_is_7\@
2555 cmp $6, %r12
2556 je _initial_num_blocks_is_6\@
2557 cmp $5, %r12
2558 je _initial_num_blocks_is_5\@
2559 cmp $4, %r12
2560 je _initial_num_blocks_is_4\@
2561 cmp $3, %r12
2562 je _initial_num_blocks_is_3\@
2563 cmp $2, %r12
2564 je _initial_num_blocks_is_2\@
2565
2566 jmp _initial_num_blocks_is_1\@
2567
2568_initial_num_blocks_is_7\@:
2569 INITIAL_BLOCKS_AVX2 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2570 sub $16*7, %r13
2571 jmp _initial_blocks_encrypted\@
2572
2573_initial_num_blocks_is_6\@:
2574 INITIAL_BLOCKS_AVX2 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2575 sub $16*6, %r13
2576 jmp _initial_blocks_encrypted\@
2577
2578_initial_num_blocks_is_5\@:
2579 INITIAL_BLOCKS_AVX2 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2580 sub $16*5, %r13
2581 jmp _initial_blocks_encrypted\@
2582
2583_initial_num_blocks_is_4\@:
2584 INITIAL_BLOCKS_AVX2 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2585 sub $16*4, %r13
2586 jmp _initial_blocks_encrypted\@
2587
2588_initial_num_blocks_is_3\@:
2589 INITIAL_BLOCKS_AVX2 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2590 sub $16*3, %r13
2591 jmp _initial_blocks_encrypted\@
2592
2593_initial_num_blocks_is_2\@:
2594 INITIAL_BLOCKS_AVX2 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2595 sub $16*2, %r13
2596 jmp _initial_blocks_encrypted\@
2597
2598_initial_num_blocks_is_1\@:
2599 INITIAL_BLOCKS_AVX2 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2600 sub $16*1, %r13
2601 jmp _initial_blocks_encrypted\@
2602
2603_initial_num_blocks_is_0\@:
2604 INITIAL_BLOCKS_AVX2 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2605
2606
2607_initial_blocks_encrypted\@:
2608 cmp $0, %r13
2609 je _zero_cipher_left\@
2610
2611 sub $128, %r13
2612 je _eight_cipher_left\@
2613
2614
2615
2616
2617 vmovd %xmm9, %r15d
2618 and $255, %r15d
2619 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2620
2621
2622_encrypt_by_8_new\@:
2623 cmp $(255-8), %r15d
2624 jg _encrypt_by_8\@
2625
2626
2627
2628 add $8, %r15b
2629 GHASH_8_ENCRYPT_8_PARALLEL_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
2630 add $128, %r11
2631 sub $128, %r13
2632 jne _encrypt_by_8_new\@
2633
2634 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2635 jmp _eight_cipher_left\@
2636
2637_encrypt_by_8\@:
2638 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2639 add $8, %r15b
2640 GHASH_8_ENCRYPT_8_PARALLEL_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
2641 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2642 add $128, %r11
2643 sub $128, %r13
2644 jne _encrypt_by_8_new\@
2645
2646 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2647
2648
2649
2650
2651_eight_cipher_left\@:
2652 GHASH_LAST_8_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
2653
2654
2655_zero_cipher_left\@:
2656 cmp $16, arg4
2657 jl _only_less_than_16\@
2658
2659 mov arg4, %r13
2660 and $15, %r13
2661
2662 je _multiple_of_16_bytes\@
2663
2664
2665
2666
2667 vpaddd ONE(%rip), %xmm9, %xmm9
2668 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2669 ENCRYPT_SINGLE_BLOCK %xmm9
2670
2671 sub $16, %r11
2672 add %r13, %r11
2673 vmovdqu (arg3, %r11), %xmm1
2674
2675 lea SHIFT_MASK+16(%rip), %r12
2676 sub %r13, %r12
2677
2678
2679 vmovdqu (%r12), %xmm2
2680 vpshufb %xmm2, %xmm1, %xmm1
2681 jmp _final_ghash_mul\@
2682
2683_only_less_than_16\@:
2684
2685 mov arg4, %r13
2686 and $15, %r13
2687
2688 je _multiple_of_16_bytes\@
2689
2690
2691
2692
2693 vpaddd ONE(%rip), %xmm9, %xmm9
2694 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2695 ENCRYPT_SINGLE_BLOCK %xmm9
2696
2697
2698 lea SHIFT_MASK+16(%rip), %r12
2699 sub %r13, %r12
2700
2701
2702
2703_get_last_16_byte_loop\@:
2704 movb (arg3, %r11), %al
2705 movb %al, TMP1 (%rsp , %r11)
2706 add $1, %r11
2707 cmp %r13, %r11
2708 jne _get_last_16_byte_loop\@
2709
2710 vmovdqu TMP1(%rsp), %xmm1
2711
2712 sub $16, %r11
2713
2714_final_ghash_mul\@:
2715 .if \ENC_DEC == DEC
2716 vmovdqa %xmm1, %xmm2
2717 vpxor %xmm1, %xmm9, %xmm9
2718 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
2719 vpand %xmm1, %xmm9, %xmm9
2720 vpand %xmm1, %xmm2, %xmm2
2721 vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
2722 vpxor %xmm2, %xmm14, %xmm14
2723
2724 GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
2725 sub %r13, %r11
2726 add $16, %r11
2727 .else
2728 vpxor %xmm1, %xmm9, %xmm9
2729 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
2730 vpand %xmm1, %xmm9, %xmm9
2731 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2732 vpxor %xmm9, %xmm14, %xmm14
2733
2734 GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
2735 sub %r13, %r11
2736 add $16, %r11
2737 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2738 .endif
2739
2740
2741
2742
2743 vmovq %xmm9, %rax
2744 cmp $8, %r13
2745 jle _less_than_8_bytes_left\@
2746
2747 mov %rax, (arg2 , %r11)
2748 add $8, %r11
2749 vpsrldq $8, %xmm9, %xmm9
2750 vmovq %xmm9, %rax
2751 sub $8, %r13
2752
2753_less_than_8_bytes_left\@:
2754 movb %al, (arg2 , %r11)
2755 add $1, %r11
2756 shr $8, %rax
2757 sub $1, %r13
2758 jne _less_than_8_bytes_left\@
2759
2760
2761_multiple_of_16_bytes\@:
2762 mov arg7, %r12
2763 shl $3, %r12
2764 vmovd %r12d, %xmm15
2765
2766 shl $3, arg4
2767 vmovq arg4, %xmm1
2768 vpslldq $8, %xmm15, %xmm15
2769 vpxor %xmm1, %xmm15, %xmm15
2770
2771 vpxor %xmm15, %xmm14, %xmm14
2772 GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
2773 vpshufb SHUF_MASK(%rip), %xmm14, %xmm14
2774
2775 mov arg5, %rax
2776 vmovdqu (%rax), %xmm9
2777
2778 ENCRYPT_SINGLE_BLOCK %xmm9
2779
2780 vpxor %xmm14, %xmm9, %xmm9
2781
2782
2783
2784_return_T\@:
2785 mov arg8, %r10
2786 mov arg9, %r11
2787
2788 cmp $16, %r11
2789 je _T_16\@
2790
2791 cmp $8, %r11
2792 jl _T_4\@
2793
2794_T_8\@:
2795 vmovq %xmm9, %rax
2796 mov %rax, (%r10)
2797 add $8, %r10
2798 sub $8, %r11
2799 vpsrldq $8, %xmm9, %xmm9
2800 cmp $0, %r11
2801 je _return_T_done\@
2802_T_4\@:
2803 vmovd %xmm9, %eax
2804 mov %eax, (%r10)
2805 add $4, %r10
2806 sub $4, %r11
2807 vpsrldq $4, %xmm9, %xmm9
2808 cmp $0, %r11
2809 je _return_T_done\@
2810_T_123\@:
2811 vmovd %xmm9, %eax
2812 cmp $2, %r11
2813 jl _T_1\@
2814 mov %ax, (%r10)
2815 cmp $2, %r11
2816 je _return_T_done\@
2817 add $2, %r10
2818 sar $16, %eax
2819_T_1\@:
2820 mov %al, (%r10)
2821 jmp _return_T_done\@
2822
2823_T_16\@:
2824 vmovdqu %xmm9, (%r10)
2825
2826_return_T_done\@:
2827 mov %r14, %rsp
2828
2829 pop %r15
2830 pop %r14
2831 pop %r13
2832 pop %r12
2833.endm
2834
2835
2836
2837
2838
2839
2840
2841
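###############################################################################
# aesni_gcm_precomp_avx_gen4: same contract as aesni_gcm_precomp_avx_gen2
# (arg1 = context, arg2 = hash subkey), using PRECOMPUTE_AVX2, so only the
# key powers themselves are cached.
###############################################################################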
ENTRY(aesni_gcm_precomp_avx_gen4)
        # save the callee-clobbered GPRs and build an aligned scratch frame
        push    %r12
        push    %r13
        push    %r14
        push    %r15

        mov     %rsp, %r14

        sub     $VARIABLE_OFFSET, %rsp
        and     $~63, %rsp                  # align rsp to 64 bytes

        vmovdqu (arg2), %xmm6               # xmm6 = HashKey

        vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
        # compute HashKey<<1 mod poly from the HashKey (required for GHASH)
        vmovdqa %xmm6, %xmm2
        vpsllq  $1, %xmm6, %xmm6
        vpsrlq  $63, %xmm2, %xmm2
        vmovdqa %xmm2, %xmm1
        vpslldq $8, %xmm2, %xmm2
        vpsrldq $8, %xmm1, %xmm1
        vpor    %xmm2, %xmm6, %xmm6
        # reduce if the bit shifted out was set
        vpshufd $0b00100100, %xmm1, %xmm2
        vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
        vpand   POLY(%rip), %xmm2, %xmm2
        vpxor   %xmm2, %xmm6, %xmm6

        vmovdqa %xmm6, HashKey(arg1)        # store HashKey<<1 mod poly

        PRECOMPUTE_AVX2 %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5

        mov     %r14, %rsp

        pop     %r15
        pop     %r14
        pop     %r13
        pop     %r12
        ret
ENDPROC(aesni_gcm_precomp_avx_gen4)
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
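###############################################################################
# aesni_gcm_enc_avx_gen4: same calling convention as aesni_gcm_enc_avx_gen2,
# implemented with the AVX2 (gen4) macros.
###############################################################################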
ENTRY(aesni_gcm_enc_avx_gen4)
        GCM_ENC_DEC_AVX2 ENC
        ret
ENDPROC(aesni_gcm_enc_avx_gen4)
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
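###############################################################################
# aesni_gcm_dec_avx_gen4: same calling convention as aesni_gcm_dec_avx_gen2,
# implemented with the AVX2 (gen4) macros.
###############################################################################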
ENTRY(aesni_gcm_dec_avx_gen4)
        GCM_ENC_DEC_AVX2 DEC
        ret
ENDPROC(aesni_gcm_dec_avx_gen4)

#endif /* CONFIG_AS_AVX2 */