/*
 * AES-GCM encryption/decryption and GHASH key precomputation for x86-64,
 * written with AVX (gen2) and AVX2 instruction forms.
 *
 * NOTE(review): the original header comment was lost during extraction and
 * only bare line-number residue remained in its place. Restore the upstream
 * copyright/licence text before committing.
 */
#include <linux/linkage.h>
#include <asm/inst.h>

.data
.align 16

/*
 * POLY and TWOONE are used when deriving HashKey<<1 in
 * aesni_gcm_precomp_avx_gen2; POLY2 is the multiplier used by the
 * vpclmulqdq-based reduction in GHASH_MUL_AVX2.
 */
POLY:	.octa 0xC2000000000000000000000000000001
POLY2:	.octa 0xC20000000000000000000001C2000000
TWOONE:	.octa 0x00000001000000000000000000000001

/*
 * The order of the constants below must not change: the partial-block code
 * addresses ALL_F relative to SHIFT_MASK (ALL_F-SHIFT_MASK(%r12)) and its
 * unaligned 16-byte load runs from ALL_F into ZERO to form a byte mask.
 */

/* SHUF_MASK: vpshufb control that reverses the 16 bytes of a register. */
SHUF_MASK:	.octa 0x000102030405060708090A0B0C0D0E0F
SHIFT_MASK:	.octa 0x0f0e0d0c0b0a09080706050403020100
ALL_F:	.octa 0xffffffffffffffffffffffffffffffff
ZERO:	.octa 0x00000000000000000000000000000000
/* ONE increments a byte-reversed counter; ONEf increments it in stored byte order. */
ONE:	.octa 0x00000000000000000000000000000001
ONEf:	.octa 0x01000000000000000000000000000000

.text

/*
 * Byte offsets, relative to arg1, of the precomputed GHASH key table.
 * Offsets 16*0 .. 16*10 hold the AES round keys (see ENCRYPT_SINGLE_BLOCK),
 * so the table starts at 16*11.
 *   HashKey_<n>    n-th power of the hash key (written by PRECOMPUTE_AVX*)
 *   HashKey_<n>_k  (high qword xor low qword) of that power, replicated in
 *                  both halves, for the Karatsuba middle product
 */
HashKey = 16*11
HashKey_2 = 16*12
HashKey_3 = 16*13
HashKey_4 = 16*14
HashKey_5 = 16*15
HashKey_6 = 16*16
HashKey_7 = 16*17
HashKey_8 = 16*18
HashKey_k = 16*19
HashKey_2_k = 16*20
HashKey_3_k = 16*21
HashKey_4_k = 16*22
HashKey_5_k = 16*23
HashKey_6_k = 16*24
HashKey_7_k = 16*25
HashKey_8_k = 16*26

/*
 * Argument names. arg1-arg6 are the SysV AMD64 integer argument registers.
 * arg7-arg9 live on the caller stack and are addressed through r14, which the
 * entry code sets to rsp after pushing STACK_OFFSET bytes of saved registers
 * (the extra 8 skips the return address).
 */
#define arg1 %rdi
#define arg2 %rsi
#define arg3 %rdx
#define arg4 %rcx
#define arg5 %r8
#define arg6 %r9
#define arg7 STACK_OFFSET+8*1(%r14)
#define arg8 STACK_OFFSET+8*2(%r14)
#define arg9 STACK_OFFSET+8*3(%r14)

/* Assembly-time counters; setreg maps them to reg_i / reg_j. */
i = 0
j = 0

/* Values for the loop_idx argument of GHASH_8_ENCRYPT_8_PARALLEL_AVX. */
out_order = 0
in_order = 1
/* Values for the ENC_DEC macro arguments. */
DEC = 0
ENC = 1

/* define_reg r n: bind the assembler symbol reg_<r> to register %xmm<n>. */
.macro define_reg r n
reg_\r = %xmm\n
.endm

/*
 * setreg: re-bind reg_i and reg_j to %xmm<i> and %xmm<j> using the current
 * values of the symbols i and j (altmacro mode turns %i / %j into numbers).
 * Must be invoked after every change of i or j.
 */
.macro setreg
.altmacro
define_reg i %i
define_reg j %j
.noaltmacro
.endm


/* Bytes of callee-saved registers pushed on entry (4 pushes of 8 bytes). */
STACK_OFFSET = 8*4

/* 16-byte scratch slots, addressed from the 64-byte-aligned rsp. */
TMP1 = 16*0
TMP2 = 16*1
TMP3 = 16*2
TMP4 = 16*3
TMP5 = 16*4
TMP6 = 16*5
TMP7 = 16*6
TMP8 = 16*7

/* Total size of the scratch area (TMP1 .. TMP8). */
VARIABLE_OFFSET = 16*8

/*
 * ENCRYPT_SINGLE_BLOCK XMM0
 *
 * Encrypt the block in XMM0 in place with the 11 round keys stored at
 * 16*0 .. 16*10 from arg1 (initial xor, 9 vaesenc rounds, vaesenclast).
 * Side effect: leaves the assembler symbol i at 10 and reg_i re-bound.
 */
.macro ENCRYPT_SINGLE_BLOCK XMM0
	vpxor (arg1), \XMM0, \XMM0
	i = 1
	setreg
.rep 9
	vaesenc 16*i(arg1), \XMM0, \XMM0
	i = (i+1)
	setreg
.endr
	vaesenclast 16*10(arg1), \XMM0, \XMM0
.endm

#ifdef CONFIG_AS_AVX

/*
 * GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5
 *
 * Carry-less multiply GH by HK (Karatsuba, three vpclmulqdq) and fold the
 * 256-bit product back to 128 bits with a two-phase shift/xor reduction.
 * In:      GH, HK (HK is left unchanged)
 * Out:     GH = reduced product
 * Clobber: T1, T2, T3, T4, T5
 */
.macro GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5

	/* T2 = GH.hi ^ GH.lo, T3 = HK.hi ^ HK.lo (in both qwords) */
	vpshufd $0b01001110, \GH, \T2
	vpshufd $0b01001110, \HK, \T3
	vpxor \GH, \T2, \T2
	vpxor \HK, \T3, \T3

	vpclmulqdq $0x11, \HK, \GH, \T1		/* T1 = hi * hi */
	vpclmulqdq $0x00, \HK, \GH, \GH		/* GH = lo * lo */
	vpclmulqdq $0x00, \T3, \T2, \T2		/* T2 = (hi^lo) * (hi^lo) */
	vpxor \GH, \T2, \T2
	vpxor \T1, \T2, \T2			/* T2 = middle term */

	/* Spread the middle term over the two halves: product is T1:GH */
	vpslldq $8, \T2, \T3
	vpsrldq $8, \T2, \T2
	vpxor \T3, \GH, \GH
	vpxor \T2, \T1, \T1

	/* Reduction, first phase */
	vpslld $31, \GH, \T2
	vpslld $30, \GH, \T3
	vpslld $25, \GH, \T4

	vpxor \T3, \T2, \T2
	vpxor \T4, \T2, \T2

	vpsrldq $4, \T2, \T5			/* kept for the second phase */

	vpslldq $12, \T2, \T2
	vpxor \T2, \GH, \GH

	/* Reduction, second phase */
	vpsrld $1, \GH, \T2
	vpsrld $2, \GH, \T3
	vpsrld $7, \GH, \T4
	vpxor \T3, \T2, \T2
	vpxor \T4, \T2, \T2

	vpxor \T5, \T2, \T2
	vpxor \T2, \GH, \GH
	vpxor \T1, \GH, \GH			/* result in GH */

.endm

/*
 * PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6
 *
 * Starting from the hash key in HK, compute its powers 2..8 by repeated
 * GHASH_MUL_AVX and store them at HashKey_2 .. HashKey_8 from arg1. For
 * powers 1..8 also store the (hi ^ lo) helper value at HashKey_<n>_k.
 * The stores use vmovdqa, so arg1 must be 16-byte aligned.
 * In:      HK (unchanged)
 * Clobber: T1-T6 (T5 carries the running power)
 */
.macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6

	vmovdqa \HK, \T5

	vpshufd $0b01001110, \T5, \T1
	vpxor \T5, \T1, \T1
	vmovdqa \T1, HashKey_k(arg1)

	GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2	/* T5 = HK^2 */
	vmovdqa \T5, HashKey_2(arg1)
	vpshufd $0b01001110, \T5, \T1
	vpxor \T5, \T1, \T1
	vmovdqa \T1, HashKey_2_k(arg1)

	GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2	/* T5 = HK^3 */
	vmovdqa \T5, HashKey_3(arg1)
	vpshufd $0b01001110, \T5, \T1
	vpxor \T5, \T1, \T1
	vmovdqa \T1, HashKey_3_k(arg1)

	GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2	/* T5 = HK^4 */
	vmovdqa \T5, HashKey_4(arg1)
	vpshufd $0b01001110, \T5, \T1
	vpxor \T5, \T1, \T1
	vmovdqa \T1, HashKey_4_k(arg1)

	GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2	/* T5 = HK^5 */
	vmovdqa \T5, HashKey_5(arg1)
	vpshufd $0b01001110, \T5, \T1
	vpxor \T5, \T1, \T1
	vmovdqa \T1, HashKey_5_k(arg1)

	GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2	/* T5 = HK^6 */
	vmovdqa \T5, HashKey_6(arg1)
	vpshufd $0b01001110, \T5, \T1
	vpxor \T5, \T1, \T1
	vmovdqa \T1, HashKey_6_k(arg1)

	GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2	/* T5 = HK^7 */
	vmovdqa \T5, HashKey_7(arg1)
	vpshufd $0b01001110, \T5, \T1
	vpxor \T5, \T1, \T1
	vmovdqa \T1, HashKey_7_k(arg1)

	GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2	/* T5 = HK^8 */
	vmovdqa \T5, HashKey_8(arg1)
	vpshufd $0b01001110, \T5, \T1
	vpxor \T5, \T1, \T1
	vmovdqa \T1, HashKey_8_k(arg1)

.endm


/*
 * INITIAL_BLOCKS_AVX num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1..XMM8 T6 T_key ENC_DEC
 *
 * 1. Load the AAD (arg6, length arg7) into %xmm(8 - num_initial_blocks).
 * 2. Encrypt/decrypt num_initial_blocks (0..7) blocks from arg3 to arg2 and
 *    fold AAD plus those blocks into the hash; the hash ends up in XMM8 and
 *    is copied to TMP1(%rsp) and T3.
 * 3. If r13 >= 128, additionally process 8 blocks into XMM1..XMM8 (byte
 *    reversed, with the hash xored into XMM1) ready for the 8-block loop.
 *
 * In:      T2 = hash key (multiplier for GHASH_MUL_AVX), r13 = byte count
 *          arg1 = round keys, arg5 = pointer to initial counter block
 * Out:     CTR = byte-reversed counter, r11 = bytes consumed so far
 * Clobber: rax, r10, r11, r12, T1, T3-T6, T_key, XMM1-XMM8, flags; i, j
 *
 * The register arithmetic assumes the caller passes %xmm1..%xmm8 as
 * XMM1..XMM8 (reg_i is selected by number).
 * NOTE(review): the AAD loop reads 4 bytes per iteration and always runs at
 * least once, so it appears to expect arg7 to be a multiple of 4 in 4..16 -
 * confirm against the callers.
 */
.macro INITIAL_BLOCKS_AVX num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
	i = (8-\num_initial_blocks)
	setreg

	mov arg6, %r10				/* r10 = AAD pointer */
	mov arg7, %r12				/* r12 = AAD bytes left */

	mov %r12, %r11				/* r11 = AAD length */

	vpxor reg_i, reg_i, reg_i
_get_AAD_loop\@:
	/* shift each new dword in from the top */
	vmovd (%r10), \T1
	vpslldq $12, \T1, \T1
	vpsrldq $4, reg_i, reg_i
	vpxor \T1, reg_i, reg_i

	add $4, %r10
	sub $4, %r12
	jg _get_AAD_loop\@

	cmp $16, %r11
	je _get_AAD_loop2_done\@
	mov $16, %r12

_get_AAD_loop2\@:
	/* short AAD: move the data down to the low end of the register */
	vpsrldq $4, reg_i, reg_i
	sub $4, %r12
	cmp %r11, %r12
	jg _get_AAD_loop2\@

_get_AAD_loop2_done\@:

	vpshufb SHUF_MASK(%rip), reg_i, reg_i	/* byte-reverse the AAD */

	xor %r11, %r11				/* data offset = 0 */

	mov arg5, %rax
	vmovdqu (%rax), \CTR
	vpshufb SHUF_MASK(%rip), \CTR, \CTR

	/* counter blocks for the initial blocks */
	i = (9-\num_initial_blocks)
	setreg
.rep \num_initial_blocks
	vpaddd ONE(%rip), \CTR, \CTR
	vmovdqa \CTR, reg_i
	vpshufb SHUF_MASK(%rip), reg_i, reg_i
	i = (i+1)
	setreg
.endr

	/* AES round 0 */
	vmovdqa (arg1), \T_key
	i = (9-\num_initial_blocks)
	setreg
.rep \num_initial_blocks
	vpxor \T_key, reg_i, reg_i
	i = (i+1)
	setreg
.endr

	/* AES rounds 1..9 */
	j = 1
	setreg
.rep 9
	vmovdqa 16*j(arg1), \T_key
	i = (9-\num_initial_blocks)
	setreg
.rep \num_initial_blocks
	vaesenc \T_key, reg_i, reg_i
	i = (i+1)
	setreg
.endr

	j = (j+1)
	setreg
.endr

	/* AES last round */
	vmovdqa 16*10(arg1), \T_key
	i = (9-\num_initial_blocks)
	setreg
.rep \num_initial_blocks
	vaesenclast \T_key, reg_i, reg_i
	i = (i+1)
	setreg
.endr

	/* xor with the input, store, keep the ciphertext for hashing */
	i = (9-\num_initial_blocks)
	setreg
.rep \num_initial_blocks
	vmovdqu (arg3, %r11), \T1
	vpxor \T1, reg_i, reg_i
	vmovdqu reg_i, (arg2 , %r11)
	add $16, %r11
.if \ENC_DEC == DEC
	vmovdqa \T1, reg_i			/* when decrypting the input is the ciphertext */
.endif
	vpshufb SHUF_MASK(%rip), reg_i, reg_i
	i = (i+1)
	setreg
.endr

	/* hash the AAD, then chain each initial block into it */
	i = (8-\num_initial_blocks)
	j = (9-\num_initial_blocks)
	setreg
	GHASH_MUL_AVX reg_i, \T2, \T1, \T3, \T4, \T5, \T6

.rep \num_initial_blocks
	vpxor reg_i, reg_j, reg_j
	GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6
	i = (i+1)
	j = (j+1)
	setreg
.endr

	/* XMM8 now holds the hash */
	vmovdqa \XMM8, TMP1(%rsp)
	vmovdqa \XMM8, \T3

	cmp $128, %r13
	jb _initial_blocks_done\@		/* byte count: unsigned compare */

	/* prepare and encrypt the first full group of 8 counter blocks */
	vpaddd ONE(%rip), \CTR, \CTR
	vmovdqa \CTR, \XMM1
	vpshufb SHUF_MASK(%rip), \XMM1, \XMM1

	vpaddd ONE(%rip), \CTR, \CTR
	vmovdqa \CTR, \XMM2
	vpshufb SHUF_MASK(%rip), \XMM2, \XMM2

	vpaddd ONE(%rip), \CTR, \CTR
	vmovdqa \CTR, \XMM3
	vpshufb SHUF_MASK(%rip), \XMM3, \XMM3

	vpaddd ONE(%rip), \CTR, \CTR
	vmovdqa \CTR, \XMM4
	vpshufb SHUF_MASK(%rip), \XMM4, \XMM4

	vpaddd ONE(%rip), \CTR, \CTR
	vmovdqa \CTR, \XMM5
	vpshufb SHUF_MASK(%rip), \XMM5, \XMM5

	vpaddd ONE(%rip), \CTR, \CTR
	vmovdqa \CTR, \XMM6
	vpshufb SHUF_MASK(%rip), \XMM6, \XMM6

	vpaddd ONE(%rip), \CTR, \CTR
	vmovdqa \CTR, \XMM7
	vpshufb SHUF_MASK(%rip), \XMM7, \XMM7

	vpaddd ONE(%rip), \CTR, \CTR
	vmovdqa \CTR, \XMM8
	vpshufb SHUF_MASK(%rip), \XMM8, \XMM8

	vmovdqa (arg1), \T_key
	vpxor \T_key, \XMM1, \XMM1
	vpxor \T_key, \XMM2, \XMM2
	vpxor \T_key, \XMM3, \XMM3
	vpxor \T_key, \XMM4, \XMM4
	vpxor \T_key, \XMM5, \XMM5
	vpxor \T_key, \XMM6, \XMM6
	vpxor \T_key, \XMM7, \XMM7
	vpxor \T_key, \XMM8, \XMM8

	i = 1
	setreg
.rep 9
	vmovdqa 16*i(arg1), \T_key
	vaesenc \T_key, \XMM1, \XMM1
	vaesenc \T_key, \XMM2, \XMM2
	vaesenc \T_key, \XMM3, \XMM3
	vaesenc \T_key, \XMM4, \XMM4
	vaesenc \T_key, \XMM5, \XMM5
	vaesenc \T_key, \XMM6, \XMM6
	vaesenc \T_key, \XMM7, \XMM7
	vaesenc \T_key, \XMM8, \XMM8
	i = (i+1)
	setreg
.endr

	/* i is 10 here: last round key */
	vmovdqa 16*i(arg1), \T_key
	vaesenclast \T_key, \XMM1, \XMM1
	vaesenclast \T_key, \XMM2, \XMM2
	vaesenclast \T_key, \XMM3, \XMM3
	vaesenclast \T_key, \XMM4, \XMM4
	vaesenclast \T_key, \XMM5, \XMM5
	vaesenclast \T_key, \XMM6, \XMM6
	vaesenclast \T_key, \XMM7, \XMM7
	vaesenclast \T_key, \XMM8, \XMM8

	vmovdqu (arg3, %r11), \T1
	vpxor \T1, \XMM1, \XMM1
	vmovdqu \XMM1, (arg2 , %r11)
	.if \ENC_DEC == DEC
	vmovdqa \T1, \XMM1
	.endif

	vmovdqu 16*1(arg3, %r11), \T1
	vpxor \T1, \XMM2, \XMM2
	vmovdqu \XMM2, 16*1(arg2 , %r11)
	.if \ENC_DEC == DEC
	vmovdqa \T1, \XMM2
	.endif

	vmovdqu 16*2(arg3, %r11), \T1
	vpxor \T1, \XMM3, \XMM3
	vmovdqu \XMM3, 16*2(arg2 , %r11)
	.if \ENC_DEC == DEC
	vmovdqa \T1, \XMM3
	.endif

	vmovdqu 16*3(arg3, %r11), \T1
	vpxor \T1, \XMM4, \XMM4
	vmovdqu \XMM4, 16*3(arg2 , %r11)
	.if \ENC_DEC == DEC
	vmovdqa \T1, \XMM4
	.endif

	vmovdqu 16*4(arg3, %r11), \T1
	vpxor \T1, \XMM5, \XMM5
	vmovdqu \XMM5, 16*4(arg2 , %r11)
	.if \ENC_DEC == DEC
	vmovdqa \T1, \XMM5
	.endif

	vmovdqu 16*5(arg3, %r11), \T1
	vpxor \T1, \XMM6, \XMM6
	vmovdqu \XMM6, 16*5(arg2 , %r11)
	.if \ENC_DEC == DEC
	vmovdqa \T1, \XMM6
	.endif

	vmovdqu 16*6(arg3, %r11), \T1
	vpxor \T1, \XMM7, \XMM7
	vmovdqu \XMM7, 16*6(arg2 , %r11)
	.if \ENC_DEC == DEC
	vmovdqa \T1, \XMM7
	.endif

	vmovdqu 16*7(arg3, %r11), \T1
	vpxor \T1, \XMM8, \XMM8
	vmovdqu \XMM8, 16*7(arg2 , %r11)
	.if \ENC_DEC == DEC
	vmovdqa \T1, \XMM8
	.endif

	add $128, %r11

	/* byte-reverse for hashing; fold the saved hash into the first block */
	vpshufb SHUF_MASK(%rip), \XMM1, \XMM1
	vpxor TMP1(%rsp), \XMM1, \XMM1
	vpshufb SHUF_MASK(%rip), \XMM2, \XMM2
	vpshufb SHUF_MASK(%rip), \XMM3, \XMM3
	vpshufb SHUF_MASK(%rip), \XMM4, \XMM4
	vpshufb SHUF_MASK(%rip), \XMM5, \XMM5
	vpshufb SHUF_MASK(%rip), \XMM6, \XMM6
	vpshufb SHUF_MASK(%rip), \XMM7, \XMM7
	vpshufb SHUF_MASK(%rip), \XMM8, \XMM8

_initial_blocks_done\@:

.endm


/*
 * GHASH_8_ENCRYPT_8_PARALLEL_AVX T1 T2 T3 T4 T5 T6 CTR XMM1..XMM8 T7 loop_idx ENC_DEC
 *
 * One iteration of the bulk loop: encrypt the next 8 counter blocks while
 * hashing the 8 blocks left in XMM1..XMM8 by the previous step. The AES
 * rounds are interleaved with the vpclmulqdq work.
 *
 * In:      XMM1..XMM8 = previous 8 blocks, byte reversed, hash folded into
 *          XMM1; CTR = counter; r11 = offset of the 8 blocks to process
 *          loop_idx = in_order  : CTR is byte reversed, step with ONE
 *                     out_order : CTR is in stored order, step with ONEf
 * Out:     8 blocks written at arg2 + r11; XMM1..XMM8 = the new blocks byte
 *          reversed with the updated hash xored into XMM1; CTR advanced by 8
 * Clobber: T1-T7, TMP2..TMP8 on the stack; i, j. r11 is not advanced here.
 *
 * The final loop selects registers by number (reg_j = %xmm1..%xmm8), so the
 * caller must pass %xmm1..%xmm8 as XMM1..XMM8.
 */
.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC

	/* park the blocks to be hashed */
	vmovdqa \XMM1, \T2
	vmovdqa \XMM2, TMP2(%rsp)
	vmovdqa \XMM3, TMP3(%rsp)
	vmovdqa \XMM4, TMP4(%rsp)
	vmovdqa \XMM5, TMP5(%rsp)
	vmovdqa \XMM6, TMP6(%rsp)
	vmovdqa \XMM7, TMP7(%rsp)
	vmovdqa \XMM8, TMP8(%rsp)

.if \loop_idx == in_order
	vpaddd ONE(%rip), \CTR, \XMM1
	vpaddd ONE(%rip), \XMM1, \XMM2
	vpaddd ONE(%rip), \XMM2, \XMM3
	vpaddd ONE(%rip), \XMM3, \XMM4
	vpaddd ONE(%rip), \XMM4, \XMM5
	vpaddd ONE(%rip), \XMM5, \XMM6
	vpaddd ONE(%rip), \XMM6, \XMM7
	vpaddd ONE(%rip), \XMM7, \XMM8
	vmovdqa \XMM8, \CTR

	vpshufb SHUF_MASK(%rip), \XMM1, \XMM1
	vpshufb SHUF_MASK(%rip), \XMM2, \XMM2
	vpshufb SHUF_MASK(%rip), \XMM3, \XMM3
	vpshufb SHUF_MASK(%rip), \XMM4, \XMM4
	vpshufb SHUF_MASK(%rip), \XMM5, \XMM5
	vpshufb SHUF_MASK(%rip), \XMM6, \XMM6
	vpshufb SHUF_MASK(%rip), \XMM7, \XMM7
	vpshufb SHUF_MASK(%rip), \XMM8, \XMM8
.else
	vpaddd ONEf(%rip), \CTR, \XMM1
	vpaddd ONEf(%rip), \XMM1, \XMM2
	vpaddd ONEf(%rip), \XMM2, \XMM3
	vpaddd ONEf(%rip), \XMM3, \XMM4
	vpaddd ONEf(%rip), \XMM4, \XMM5
	vpaddd ONEf(%rip), \XMM5, \XMM6
	vpaddd ONEf(%rip), \XMM6, \XMM7
	vpaddd ONEf(%rip), \XMM7, \XMM8
	vmovdqa \XMM8, \CTR
.endif

	/* AES round 0 */
	vmovdqu (arg1), \T1
	vpxor \T1, \XMM1, \XMM1
	vpxor \T1, \XMM2, \XMM2
	vpxor \T1, \XMM3, \XMM3
	vpxor \T1, \XMM4, \XMM4
	vpxor \T1, \XMM5, \XMM5
	vpxor \T1, \XMM6, \XMM6
	vpxor \T1, \XMM7, \XMM7
	vpxor \T1, \XMM8, \XMM8

	/* AES round 1 */
	vmovdqu 16*1(arg1), \T1
	vaesenc \T1, \XMM1, \XMM1
	vaesenc \T1, \XMM2, \XMM2
	vaesenc \T1, \XMM3, \XMM3
	vaesenc \T1, \XMM4, \XMM4
	vaesenc \T1, \XMM5, \XMM5
	vaesenc \T1, \XMM6, \XMM6
	vaesenc \T1, \XMM7, \XMM7
	vaesenc \T1, \XMM8, \XMM8

	/* AES round 2 */
	vmovdqu 16*2(arg1), \T1
	vaesenc \T1, \XMM1, \XMM1
	vaesenc \T1, \XMM2, \XMM2
	vaesenc \T1, \XMM3, \XMM3
	vaesenc \T1, \XMM4, \XMM4
	vaesenc \T1, \XMM5, \XMM5
	vaesenc \T1, \XMM6, \XMM6
	vaesenc \T1, \XMM7, \XMM7
	vaesenc \T1, \XMM8, \XMM8

	/*
	 * Hash block 1 with HashKey_8. Accumulators for the whole macro:
	 * T4 = sum of hi*hi, T7 = sum of lo*lo, T6 = sum of Karatsuba terms.
	 */
	vmovdqa HashKey_8(arg1), \T5
	vpclmulqdq $0x11, \T5, \T2, \T4
	vpclmulqdq $0x00, \T5, \T2, \T7

	vpshufd $0b01001110, \T2, \T6
	vpxor \T2, \T6, \T6

	vmovdqa HashKey_8_k(arg1), \T5
	vpclmulqdq $0x00, \T5, \T6, \T6

	/* AES round 3 */
	vmovdqu 16*3(arg1), \T1
	vaesenc \T1, \XMM1, \XMM1
	vaesenc \T1, \XMM2, \XMM2
	vaesenc \T1, \XMM3, \XMM3
	vaesenc \T1, \XMM4, \XMM4
	vaesenc \T1, \XMM5, \XMM5
	vaesenc \T1, \XMM6, \XMM6
	vaesenc \T1, \XMM7, \XMM7
	vaesenc \T1, \XMM8, \XMM8

	/* hash block 2 with HashKey_7 */
	vmovdqa TMP2(%rsp), \T1
	vmovdqa HashKey_7(arg1), \T5
	vpclmulqdq $0x11, \T5, \T1, \T3
	vpxor \T3, \T4, \T4
	vpclmulqdq $0x00, \T5, \T1, \T3
	vpxor \T3, \T7, \T7

	vpshufd $0b01001110, \T1, \T3
	vpxor \T1, \T3, \T3
	vmovdqa HashKey_7_k(arg1), \T5
	vpclmulqdq $0x10, \T5, \T3, \T3
	vpxor \T3, \T6, \T6

	/* AES round 4 */
	vmovdqu 16*4(arg1), \T1
	vaesenc \T1, \XMM1, \XMM1
	vaesenc \T1, \XMM2, \XMM2
	vaesenc \T1, \XMM3, \XMM3
	vaesenc \T1, \XMM4, \XMM4
	vaesenc \T1, \XMM5, \XMM5
	vaesenc \T1, \XMM6, \XMM6
	vaesenc \T1, \XMM7, \XMM7
	vaesenc \T1, \XMM8, \XMM8

	/* hash block 3 with HashKey_6 */
	vmovdqa TMP3(%rsp), \T1
	vmovdqa HashKey_6(arg1), \T5
	vpclmulqdq $0x11, \T5, \T1, \T3
	vpxor \T3, \T4, \T4
	vpclmulqdq $0x00, \T5, \T1, \T3
	vpxor \T3, \T7, \T7

	vpshufd $0b01001110, \T1, \T3
	vpxor \T1, \T3, \T3
	vmovdqa HashKey_6_k(arg1), \T5
	vpclmulqdq $0x10, \T5, \T3, \T3
	vpxor \T3, \T6, \T6

	/* AES round 5 */
	vmovdqu 16*5(arg1), \T1
	vaesenc \T1, \XMM1, \XMM1
	vaesenc \T1, \XMM2, \XMM2
	vaesenc \T1, \XMM3, \XMM3
	vaesenc \T1, \XMM4, \XMM4
	vaesenc \T1, \XMM5, \XMM5
	vaesenc \T1, \XMM6, \XMM6
	vaesenc \T1, \XMM7, \XMM7
	vaesenc \T1, \XMM8, \XMM8

	/* hash block 4 with HashKey_5 */
	vmovdqa TMP4(%rsp), \T1
	vmovdqa HashKey_5(arg1), \T5
	vpclmulqdq $0x11, \T5, \T1, \T3
	vpxor \T3, \T4, \T4
	vpclmulqdq $0x00, \T5, \T1, \T3
	vpxor \T3, \T7, \T7

	vpshufd $0b01001110, \T1, \T3
	vpxor \T1, \T3, \T3
	vmovdqa HashKey_5_k(arg1), \T5
	vpclmulqdq $0x10, \T5, \T3, \T3
	vpxor \T3, \T6, \T6

	/* AES round 6 */
	vmovdqu 16*6(arg1), \T1
	vaesenc \T1, \XMM1, \XMM1
	vaesenc \T1, \XMM2, \XMM2
	vaesenc \T1, \XMM3, \XMM3
	vaesenc \T1, \XMM4, \XMM4
	vaesenc \T1, \XMM5, \XMM5
	vaesenc \T1, \XMM6, \XMM6
	vaesenc \T1, \XMM7, \XMM7
	vaesenc \T1, \XMM8, \XMM8

	/* hash block 5 with HashKey_4 */
	vmovdqa TMP5(%rsp), \T1
	vmovdqa HashKey_4(arg1), \T5
	vpclmulqdq $0x11, \T5, \T1, \T3
	vpxor \T3, \T4, \T4
	vpclmulqdq $0x00, \T5, \T1, \T3
	vpxor \T3, \T7, \T7

	vpshufd $0b01001110, \T1, \T3
	vpxor \T1, \T3, \T3
	vmovdqa HashKey_4_k(arg1), \T5
	vpclmulqdq $0x10, \T5, \T3, \T3
	vpxor \T3, \T6, \T6

	/* AES round 7 */
	vmovdqu 16*7(arg1), \T1
	vaesenc \T1, \XMM1, \XMM1
	vaesenc \T1, \XMM2, \XMM2
	vaesenc \T1, \XMM3, \XMM3
	vaesenc \T1, \XMM4, \XMM4
	vaesenc \T1, \XMM5, \XMM5
	vaesenc \T1, \XMM6, \XMM6
	vaesenc \T1, \XMM7, \XMM7
	vaesenc \T1, \XMM8, \XMM8

	/* hash block 6 with HashKey_3 */
	vmovdqa TMP6(%rsp), \T1
	vmovdqa HashKey_3(arg1), \T5
	vpclmulqdq $0x11, \T5, \T1, \T3
	vpxor \T3, \T4, \T4
	vpclmulqdq $0x00, \T5, \T1, \T3
	vpxor \T3, \T7, \T7

	vpshufd $0b01001110, \T1, \T3
	vpxor \T1, \T3, \T3
	vmovdqa HashKey_3_k(arg1), \T5
	vpclmulqdq $0x10, \T5, \T3, \T3
	vpxor \T3, \T6, \T6

	/* AES round 8 */
	vmovdqu 16*8(arg1), \T1
	vaesenc \T1, \XMM1, \XMM1
	vaesenc \T1, \XMM2, \XMM2
	vaesenc \T1, \XMM3, \XMM3
	vaesenc \T1, \XMM4, \XMM4
	vaesenc \T1, \XMM5, \XMM5
	vaesenc \T1, \XMM6, \XMM6
	vaesenc \T1, \XMM7, \XMM7
	vaesenc \T1, \XMM8, \XMM8

	/* hash block 7 with HashKey_2 */
	vmovdqa TMP7(%rsp), \T1
	vmovdqa HashKey_2(arg1), \T5
	vpclmulqdq $0x11, \T5, \T1, \T3
	vpxor \T3, \T4, \T4
	vpclmulqdq $0x00, \T5, \T1, \T3
	vpxor \T3, \T7, \T7

	vpshufd $0b01001110, \T1, \T3
	vpxor \T1, \T3, \T3
	vmovdqa HashKey_2_k(arg1), \T5
	vpclmulqdq $0x10, \T5, \T3, \T3
	vpxor \T3, \T6, \T6

	/* AES round 9 (key held in T5) */
	vmovdqu 16*9(arg1), \T5
	vaesenc \T5, \XMM1, \XMM1
	vaesenc \T5, \XMM2, \XMM2
	vaesenc \T5, \XMM3, \XMM3
	vaesenc \T5, \XMM4, \XMM4
	vaesenc \T5, \XMM5, \XMM5
	vaesenc \T5, \XMM6, \XMM6
	vaesenc \T5, \XMM7, \XMM7
	vaesenc \T5, \XMM8, \XMM8

	/* hash block 8 with HashKey */
	vmovdqa TMP8(%rsp), \T1
	vmovdqa HashKey(arg1), \T5
	vpclmulqdq $0x11, \T5, \T1, \T3
	vpxor \T3, \T4, \T4
	vpclmulqdq $0x00, \T5, \T1, \T3
	vpxor \T3, \T7, \T7

	vpshufd $0b01001110, \T1, \T3
	vpxor \T1, \T3, \T3
	vmovdqa HashKey_k(arg1), \T5
	vpclmulqdq $0x10, \T5, \T3, \T3
	vpxor \T3, \T6, \T6

	/* T6 = middle term */
	vpxor \T4, \T6, \T6
	vpxor \T7, \T6, \T6

	vmovdqu 16*10(arg1), \T5		/* last round key */

	/*
	 * Last AES round with the input xored into the round key, so that
	 * vaesenclast directly yields keystream ^ input. When decrypting,
	 * reload the ciphertext into reg_j for the next hash step.
	 */
	i = 0
	j = 1
	setreg
.rep 8
	vpxor 16*i(arg3, %r11), \T5, \T2
	.if \ENC_DEC == ENC
	vaesenclast \T2, reg_j, reg_j
	.else
	vaesenclast \T2, reg_j, \T3
	vmovdqu 16*i(arg3, %r11), reg_j
	vmovdqu \T3, 16*i(arg2, %r11)
	.endif
	i = (i+1)
	j = (j+1)
	setreg
.endr

	/* combine: 256-bit product is T6:T7 */
	vpslldq $8, \T6, \T3
	vpsrldq $8, \T6, \T6
	vpxor \T3, \T7, \T7
	vpxor \T4, \T6, \T6

	/* Reduction, first phase */
	vpslld $31, \T7, \T2
	vpslld $30, \T7, \T3
	vpslld $25, \T7, \T4

	vpxor \T3, \T2, \T2
	vpxor \T4, \T2, \T2

	vpsrldq $4, \T2, \T1

	vpslldq $12, \T2, \T2
	vpxor \T2, \T7, \T7

	.if \ENC_DEC == ENC
	/* write the ciphertext */
	vmovdqu \XMM1, 16*0(arg2,%r11)
	vmovdqu \XMM2, 16*1(arg2,%r11)
	vmovdqu \XMM3, 16*2(arg2,%r11)
	vmovdqu \XMM4, 16*3(arg2,%r11)
	vmovdqu \XMM5, 16*4(arg2,%r11)
	vmovdqu \XMM6, 16*5(arg2,%r11)
	vmovdqu \XMM7, 16*6(arg2,%r11)
	vmovdqu \XMM8, 16*7(arg2,%r11)
	.endif

	/* Reduction, second phase */
	vpsrld $1, \T7, \T2
	vpsrld $2, \T7, \T3
	vpsrld $7, \T7, \T4
	vpxor \T3, \T2, \T2
	vpxor \T4, \T2, \T2

	vpxor \T1, \T2, \T2
	vpxor \T2, \T7, \T7
	vpxor \T7, \T6, \T6			/* T6 = updated hash */

	/* byte-reverse the new blocks for the next hash step */
	vpshufb SHUF_MASK(%rip), \XMM1, \XMM1
	vpshufb SHUF_MASK(%rip), \XMM2, \XMM2
	vpshufb SHUF_MASK(%rip), \XMM3, \XMM3
	vpshufb SHUF_MASK(%rip), \XMM4, \XMM4
	vpshufb SHUF_MASK(%rip), \XMM5, \XMM5
	vpshufb SHUF_MASK(%rip), \XMM6, \XMM6
	vpshufb SHUF_MASK(%rip), \XMM7, \XMM7
	vpshufb SHUF_MASK(%rip), \XMM8, \XMM8

	vpxor \T6, \XMM1, \XMM1			/* chain the hash into block 1 */

.endm


/*
 * GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1..XMM8
 *
 * Hash the final 8 pending blocks: XMM1..XMM8 are multiplied by HashKey_8 ..
 * HashKey respectively (Karatsuba, using the HashKey_<n>_k helpers), the
 * products are summed and reduced once.
 * In:      XMM1..XMM8 (byte reversed, running hash already xored into XMM1)
 * Out:     T6 = resulting hash
 * Clobber: T1-T5, T7, XMM1
 */
.macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8

	/* Accumulators: T6 = hi*hi, T7 = lo*lo, XMM1 = Karatsuba terms */

	/* block 1 * HashKey_8 */
	vpshufd $0b01001110, \XMM1, \T2
	vpxor \XMM1, \T2, \T2
	vmovdqa HashKey_8(arg1), \T5
	vpclmulqdq $0x11, \T5, \XMM1, \T6
	vpclmulqdq $0x00, \T5, \XMM1, \T7

	vmovdqa HashKey_8_k(arg1), \T3
	vpclmulqdq $0x00, \T3, \T2, \XMM1

	/* block 2 * HashKey_7 */
	vpshufd $0b01001110, \XMM2, \T2
	vpxor \XMM2, \T2, \T2
	vmovdqa HashKey_7(arg1), \T5
	vpclmulqdq $0x11, \T5, \XMM2, \T4
	vpxor \T4, \T6, \T6

	vpclmulqdq $0x00, \T5, \XMM2, \T4
	vpxor \T4, \T7, \T7

	vmovdqa HashKey_7_k(arg1), \T3
	vpclmulqdq $0x00, \T3, \T2, \T2
	vpxor \T2, \XMM1, \XMM1

	/* block 3 * HashKey_6 */
	vpshufd $0b01001110, \XMM3, \T2
	vpxor \XMM3, \T2, \T2
	vmovdqa HashKey_6(arg1), \T5
	vpclmulqdq $0x11, \T5, \XMM3, \T4
	vpxor \T4, \T6, \T6

	vpclmulqdq $0x00, \T5, \XMM3, \T4
	vpxor \T4, \T7, \T7

	vmovdqa HashKey_6_k(arg1), \T3
	vpclmulqdq $0x00, \T3, \T2, \T2
	vpxor \T2, \XMM1, \XMM1

	/* block 4 * HashKey_5 */
	vpshufd $0b01001110, \XMM4, \T2
	vpxor \XMM4, \T2, \T2
	vmovdqa HashKey_5(arg1), \T5
	vpclmulqdq $0x11, \T5, \XMM4, \T4
	vpxor \T4, \T6, \T6

	vpclmulqdq $0x00, \T5, \XMM4, \T4
	vpxor \T4, \T7, \T7

	vmovdqa HashKey_5_k(arg1), \T3
	vpclmulqdq $0x00, \T3, \T2, \T2
	vpxor \T2, \XMM1, \XMM1

	/* block 5 * HashKey_4 */
	vpshufd $0b01001110, \XMM5, \T2
	vpxor \XMM5, \T2, \T2
	vmovdqa HashKey_4(arg1), \T5
	vpclmulqdq $0x11, \T5, \XMM5, \T4
	vpxor \T4, \T6, \T6

	vpclmulqdq $0x00, \T5, \XMM5, \T4
	vpxor \T4, \T7, \T7

	vmovdqa HashKey_4_k(arg1), \T3
	vpclmulqdq $0x00, \T3, \T2, \T2
	vpxor \T2, \XMM1, \XMM1

	/* block 6 * HashKey_3 */
	vpshufd $0b01001110, \XMM6, \T2
	vpxor \XMM6, \T2, \T2
	vmovdqa HashKey_3(arg1), \T5
	vpclmulqdq $0x11, \T5, \XMM6, \T4
	vpxor \T4, \T6, \T6

	vpclmulqdq $0x00, \T5, \XMM6, \T4
	vpxor \T4, \T7, \T7

	vmovdqa HashKey_3_k(arg1), \T3
	vpclmulqdq $0x00, \T3, \T2, \T2
	vpxor \T2, \XMM1, \XMM1

	/* block 7 * HashKey_2 */
	vpshufd $0b01001110, \XMM7, \T2
	vpxor \XMM7, \T2, \T2
	vmovdqa HashKey_2(arg1), \T5
	vpclmulqdq $0x11, \T5, \XMM7, \T4
	vpxor \T4, \T6, \T6

	vpclmulqdq $0x00, \T5, \XMM7, \T4
	vpxor \T4, \T7, \T7

	vmovdqa HashKey_2_k(arg1), \T3
	vpclmulqdq $0x00, \T3, \T2, \T2
	vpxor \T2, \XMM1, \XMM1

	/* block 8 * HashKey */
	vpshufd $0b01001110, \XMM8, \T2
	vpxor \XMM8, \T2, \T2
	vmovdqa HashKey(arg1), \T5
	vpclmulqdq $0x11, \T5, \XMM8, \T4
	vpxor \T4, \T6, \T6

	vpclmulqdq $0x00, \T5, \XMM8, \T4
	vpxor \T4, \T7, \T7

	vmovdqa HashKey_k(arg1), \T3
	vpclmulqdq $0x00, \T3, \T2, \T2

	/* T2 = middle term */
	vpxor \T2, \XMM1, \XMM1
	vpxor \T6, \XMM1, \XMM1
	vpxor \T7, \XMM1, \T2

	/* combine: 256-bit product is T6:T7 */
	vpslldq $8, \T2, \T4
	vpsrldq $8, \T2, \T2

	vpxor \T4, \T7, \T7
	vpxor \T2, \T6, \T6

	/* Reduction, first phase */
	vpslld $31, \T7, \T2
	vpslld $30, \T7, \T3
	vpslld $25, \T7, \T4

	vpxor \T3, \T2, \T2
	vpxor \T4, \T2, \T2

	vpsrldq $4, \T2, \T1

	vpslldq $12, \T2, \T2
	vpxor \T2, \T7, \T7

	/* Reduction, second phase */
	vpsrld $1, \T7, \T2
	vpsrld $2, \T7, \T3
	vpsrld $7, \T7, \T4
	vpxor \T3, \T2, \T2
	vpxor \T4, \T2, \T2

	vpxor \T1, \T2, \T2
	vpxor \T2, \T7, \T7
	vpxor \T7, \T6, \T6			/* result in T6 */

.endm


/*
 * GCM_ENC_DEC_AVX ENC_DEC
 *
 * Body shared by aesni_gcm_enc_avx_gen2 (ENC) and aesni_gcm_dec_avx_gen2 (DEC).
 * Arguments as used below:
 *   arg1  base of the round keys (16*0..16*10) and the hash-key table
 *   arg2  output buffer            arg3  input buffer
 *   arg4  input length in bytes    arg5  pointer to the 16-byte initial counter block
 *   arg6  AAD pointer              arg7  AAD length in bytes
 *   arg8  tag output pointer       arg9  tag length: 16 or 12; any other value writes 8 bytes
 *
 * Register roles: r11 = byte offset into input/output, r13 = bytes left,
 * r14 = rsp after the four pushes (base for arg7-arg9), xmm9 = counter block,
 * xmm13 = hash key, xmm14 = hash value outside the 8-block loop.
 * The number of pushes must match STACK_OFFSET.
 * Saves/restores r12-r15; clobbers rax, r10, r11, arg4, xmm0-xmm15, flags.
 */
.macro GCM_ENC_DEC_AVX ENC_DEC

	push %r12
	push %r13
	push %r14
	push %r15

	mov %rsp, %r14

	/* scratch area, 64-byte aligned */
	sub $VARIABLE_OFFSET, %rsp
	and $~63, %rsp

	vmovdqu HashKey(arg1), %xmm13

	mov arg4, %r13
	and $-16, %r13				/* r13 = length rounded down to whole blocks */

	/* (number of whole blocks) mod 8 are handled as initial blocks */
	mov %r13, %r12
	shr $4, %r12
	and $7, %r12
	jz _initial_num_blocks_is_0\@

	cmp $7, %r12
	je _initial_num_blocks_is_7\@
	cmp $6, %r12
	je _initial_num_blocks_is_6\@
	cmp $5, %r12
	je _initial_num_blocks_is_5\@
	cmp $4, %r12
	je _initial_num_blocks_is_4\@
	cmp $3, %r12
	je _initial_num_blocks_is_3\@
	cmp $2, %r12
	je _initial_num_blocks_is_2\@

	jmp _initial_num_blocks_is_1\@

_initial_num_blocks_is_7\@:
	INITIAL_BLOCKS_AVX 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
	sub $16*7, %r13
	jmp _initial_blocks_encrypted\@

_initial_num_blocks_is_6\@:
	INITIAL_BLOCKS_AVX 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
	sub $16*6, %r13
	jmp _initial_blocks_encrypted\@

_initial_num_blocks_is_5\@:
	INITIAL_BLOCKS_AVX 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
	sub $16*5, %r13
	jmp _initial_blocks_encrypted\@

_initial_num_blocks_is_4\@:
	INITIAL_BLOCKS_AVX 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
	sub $16*4, %r13
	jmp _initial_blocks_encrypted\@

_initial_num_blocks_is_3\@:
	INITIAL_BLOCKS_AVX 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
	sub $16*3, %r13
	jmp _initial_blocks_encrypted\@

_initial_num_blocks_is_2\@:
	INITIAL_BLOCKS_AVX 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
	sub $16*2, %r13
	jmp _initial_blocks_encrypted\@

_initial_num_blocks_is_1\@:
	INITIAL_BLOCKS_AVX 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
	sub $16*1, %r13
	jmp _initial_blocks_encrypted\@

_initial_num_blocks_is_0\@:
	INITIAL_BLOCKS_AVX 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC

_initial_blocks_encrypted\@:
	test %r13, %r13
	jz _zero_cipher_left\@

	/* INITIAL_BLOCKS already processed the first group of 8 */
	sub $128, %r13
	je _eight_cipher_left\@

	/*
	 * r15b tracks the low counter byte. While adding 8 cannot carry out
	 * of it, the counter is kept in stored byte order and stepped with
	 * ONEf (out_order); otherwise fall back to the in_order variant.
	 */
	vmovd %xmm9, %r15d
	and $255, %r15d
	vpshufb SHUF_MASK(%rip), %xmm9, %xmm9

_encrypt_by_8_new\@:
	cmp $(255-8), %r15d
	jg _encrypt_by_8\@

	add $8, %r15b
	GHASH_8_ENCRYPT_8_PARALLEL_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
	add $128, %r11
	sub $128, %r13
	jne _encrypt_by_8_new\@

	vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
	jmp _eight_cipher_left\@

_encrypt_by_8\@:
	vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
	add $8, %r15b
	GHASH_8_ENCRYPT_8_PARALLEL_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
	vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
	add $128, %r11
	sub $128, %r13
	jne _encrypt_by_8_new\@

	vpshufb SHUF_MASK(%rip), %xmm9, %xmm9

_eight_cipher_left\@:
	/* hash the last 8 blocks; result in xmm14 */
	GHASH_LAST_8_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8

_zero_cipher_left\@:
	cmp $16, arg4
	jb _only_less_than_16\@			/* length: unsigned compare */

	mov arg4, %r13
	and $15, %r13				/* r13 = length mod 16 */

	je _multiple_of_16_bytes\@

	/* trailing partial block: keystream for it goes into xmm9 */
	vpaddd ONE(%rip), %xmm9, %xmm9
	vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
	ENCRYPT_SINGLE_BLOCK %xmm9

	/* load the last 16 bytes of the input so the read stays inside it */
	sub $16, %r11
	add %r13, %r11
	vmovdqu (arg3, %r11), %xmm1

	lea SHIFT_MASK+16(%rip), %r12
	sub %r13, %r12

	/* shuffle the r13 wanted bytes down to the low end of xmm1 */
	vmovdqu (%r12), %xmm2
	vpshufb %xmm2, %xmm1, %xmm1
	jmp _final_ghash_mul\@

_only_less_than_16\@:

	mov arg4, %r13
	and $15, %r13

	je _multiple_of_16_bytes\@		/* zero-length input */

	vpaddd ONE(%rip), %xmm9, %xmm9
	vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
	ENCRYPT_SINGLE_BLOCK %xmm9

	lea SHIFT_MASK+16(%rip), %r12
	sub %r13, %r12

	/* input shorter than 16 bytes: copy it byte-wise through TMP1 */
_get_last_16_byte_loop\@:
	movb (arg3, %r11), %al
	movb %al, TMP1 (%rsp , %r11)
	add $1, %r11
	cmp %r13, %r11
	jne _get_last_16_byte_loop\@

	vmovdqu TMP1(%rsp), %xmm1

	sub $16, %r11				/* undone by the sub/add below */

_final_ghash_mul\@:
	.if \ENC_DEC == DEC
	vmovdqa %xmm1, %xmm2			/* keep the ciphertext for hashing */
	vpxor %xmm1, %xmm9, %xmm9
	/* mask with r13 bytes of 0xff followed by zeros */
	vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1

	vpand %xmm1, %xmm9, %xmm9
	vpand %xmm1, %xmm2, %xmm2
	vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
	vpxor %xmm2, %xmm14, %xmm14

	GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
	sub %r13, %r11
	add $16, %r11
	.else
	vpxor %xmm1, %xmm9, %xmm9
	/* mask with r13 bytes of 0xff followed by zeros */
	vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1

	vpand %xmm1, %xmm9, %xmm9
	vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
	vpxor %xmm9, %xmm14, %xmm14

	GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
	sub %r13, %r11
	add $16, %r11
	vpshufb SHUF_MASK(%rip), %xmm9, %xmm9	/* back to output byte order */
	.endif

	/* write the r13 output bytes: 8 at once if possible, then one by one */
	vmovq %xmm9, %rax
	cmp $8, %r13
	jle _less_than_8_bytes_left\@

	mov %rax, (arg2 , %r11)
	add $8, %r11
	vpsrldq $8, %xmm9, %xmm9
	vmovq %xmm9, %rax
	sub $8, %r13

_less_than_8_bytes_left\@:
	movb %al, (arg2 , %r11)
	add $1, %r11
	shr $8, %rax
	sub $1, %r13
	jne _less_than_8_bytes_left\@

_multiple_of_16_bytes\@:
	/* length block: (AAD bits in the high qword, data bits in the low qword) */
	mov arg7, %r12
	shl $3, %r12
	vmovd %r12d, %xmm15

	shl $3, arg4
	vmovq arg4, %xmm1
	vpslldq $8, %xmm15, %xmm15
	vpxor %xmm1, %xmm15, %xmm15

	vpxor %xmm15, %xmm14, %xmm14
	GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
	vpshufb SHUF_MASK(%rip), %xmm14, %xmm14

	/* tag = E(K, initial counter block) ^ hash */
	mov arg5, %rax
	vmovdqu (%rax), %xmm9

	ENCRYPT_SINGLE_BLOCK %xmm9

	vpxor %xmm14, %xmm9, %xmm9

_return_T\@:
	mov arg8, %r10
	mov arg9, %r11

	cmp $16, %r11
	je _T_16\@

	cmp $12, %r11
	je _T_12\@

_T_8\@:
	vmovq %xmm9, %rax
	mov %rax, (%r10)
	jmp _return_T_done\@
_T_12\@:
	vmovq %xmm9, %rax
	mov %rax, (%r10)
	vpsrldq $8, %xmm9, %xmm9
	vmovd %xmm9, %eax
	mov %eax, 8(%r10)
	jmp _return_T_done\@

_T_16\@:
	vmovdqu %xmm9, (%r10)

_return_T_done\@:
	mov %r14, %rsp

	pop %r15
	pop %r14
	pop %r13
	pop %r12
.endm


/*
 * aesni_gcm_precomp_avx_gen2
 *
 * In:  arg1 = base of the key/hash-key area (16-byte aligned: vmovdqa stores)
 *      arg2 = pointer to the 16-byte hash subkey
 * Stores HashKey<<1 (reduced) at HashKey(arg1), then fills the rest of the
 * table through PRECOMPUTE_AVX.
 * Saves/restores r12-r15; clobbers xmm0-xmm6 and flags.
 */
ENTRY(aesni_gcm_precomp_avx_gen2)

	push %r12
	push %r13
	push %r14
	push %r15

	mov %rsp, %r14

	sub $VARIABLE_OFFSET, %rsp
	and $~63, %rsp

	vmovdqu (arg2), %xmm6

	vpshufb SHUF_MASK(%rip), %xmm6, %xmm6

	/* 128-bit shift left by one: shift each qword, carry bit 63 across */
	vmovdqa %xmm6, %xmm2
	vpsllq $1, %xmm6, %xmm6
	vpsrlq $63, %xmm2, %xmm2
	vmovdqa %xmm2, %xmm1
	vpslldq $8, %xmm2, %xmm2
	vpsrldq $8, %xmm1, %xmm1		/* xmm1 = bit shifted out of the top */
	vpor %xmm2, %xmm6, %xmm6

	/* if a bit was shifted out, xor in POLY */
	vpshufd $0b00100100, %xmm1, %xmm2
	vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
	vpand POLY(%rip), %xmm2, %xmm2
	vpxor %xmm2, %xmm6, %xmm6

	vmovdqa %xmm6, HashKey(arg1)

	PRECOMPUTE_AVX %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5

	mov %r14, %rsp

	pop %r15
	pop %r14
	pop %r13
	pop %r12
	ret
ENDPROC(aesni_gcm_precomp_avx_gen2)


/*
 * aesni_gcm_enc_avx_gen2: GCM encryption entry point.
 * Expands GCM_ENC_DEC_AVX in ENC mode; see that macro for the meaning of
 * arg1-arg9 and for the registers it clobbers.
 */
ENTRY(aesni_gcm_enc_avx_gen2)
	GCM_ENC_DEC_AVX ENC
	ret
ENDPROC(aesni_gcm_enc_avx_gen2)


/*
 * aesni_gcm_dec_avx_gen2: GCM decryption entry point.
 * Expands GCM_ENC_DEC_AVX in DEC mode; see that macro for the meaning of
 * arg1-arg9 and for the registers it clobbers. The tag is only written to
 * arg8, it is not compared here.
 */
ENTRY(aesni_gcm_dec_avx_gen2)
	GCM_ENC_DEC_AVX DEC
	ret
ENDPROC(aesni_gcm_dec_avx_gen2)
#endif /* CONFIG_AS_AVX */

#ifdef CONFIG_AS_AVX2

/*
 * GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5
 *
 * Carry-less multiply GH by HK with four vpclmulqdq (schoolbook) and reduce
 * the 256-bit product to 128 bits by multiplying with POLY2.
 * In:      GH, HK (HK is left unchanged)
 * Out:     GH = reduced product
 * Clobber: T1, T2, T3 (T4 and T5 are accepted but not used)
 */
.macro GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5

	vpclmulqdq $0x11, \HK, \GH, \T1		/* T1 = hi * hi */
	vpclmulqdq $0x00, \HK, \GH, \T2		/* T2 = lo * lo */
	vpclmulqdq $0x01, \HK, \GH, \T3		/* cross product */
	vpclmulqdq $0x10, \HK, \GH, \GH		/* cross product */
	vpxor \T3, \GH, \GH			/* GH = middle term */

	/* spread the middle term: product is T1:GH */
	vpsrldq $8, \GH, \T3
	vpslldq $8, \GH, \GH

	vpxor \T3, \T1, \T1
	vpxor \T2, \GH, \GH

	/* Reduction, first phase */
	vmovdqa POLY2(%rip), \T3

	vpclmulqdq $0x01, \GH, \T3, \T2
	vpslldq $8, \T2, \T2

	vpxor \T2, \GH, \GH

	/* Reduction, second phase */
	vpclmulqdq $0x00, \GH, \T3, \T2
	vpsrldq $4, \T2, \T2

	vpclmulqdq $0x10, \GH, \T3, \GH
	vpslldq $4, \GH, \GH

	vpxor \T2, \GH, \GH

	vpxor \T1, \GH, \GH			/* result in GH */

.endm

/*
 * PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6
 *
 * Starting from the hash key in HK, compute its powers 2..8 by repeated
 * GHASH_MUL_AVX2 and store them at HashKey_2 .. HashKey_8 from arg1.
 * Unlike PRECOMPUTE_AVX, no HashKey_<n>_k helper values are written.
 * The stores use vmovdqa, so arg1 must be 16-byte aligned.
 * In:      HK (unchanged)
 * Clobber: T1, T3, T4, T5 (T5 carries the running power)
 */
.macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6

	vmovdqa \HK, \T5
	GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2	/* T5 = HK^2 */
	vmovdqa \T5, HashKey_2(arg1)

	GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2	/* T5 = HK^3 */
	vmovdqa \T5, HashKey_3(arg1)

	GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2	/* T5 = HK^4 */
	vmovdqa \T5, HashKey_4(arg1)

	GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2	/* T5 = HK^5 */
	vmovdqa \T5, HashKey_5(arg1)

	GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2	/* T5 = HK^6 */
	vmovdqa \T5, HashKey_6(arg1)

	GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2	/* T5 = HK^7 */
	vmovdqa \T5, HashKey_7(arg1)

	GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2	/* T5 = HK^8 */
	vmovdqa \T5, HashKey_8(arg1)

.endm

1608
1609
1610
1611
1612
1613
1614
1615
/*
 * INITIAL_BLOCKS_AVX2 num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1..XMM8
 *                     T6 T_key ENC_DEC VER
 *
 * First stage of a GCM pass:
 *   1. gathers the AAD (pointer arg6, byte count arg7) into one register
 *      and byte-shuffles it;
 *   2. loads the initial counter block from the address in arg5;
 *   3. encrypts/decrypts num_initial_blocks (0..7) blocks from arg3 to
 *      arg2 and hashes AAD plus those blocks with GHASH_MUL_AVX2;
 *   4. if r13 >= 128, additionally processes eight more blocks and leaves
 *      them, byte-shuffled, in XMM1..XMM8 with the running hash XORed
 *      into XMM1.
 *
 * Inputs:  arg1 = round keys (16*0 .. 16*10, read with vmovdqa, so the
 *                 address must be 16-byte aligned),
 *          T2   = hash key, r13 = byte count set up by the caller.
 * Outputs: r11  = number of bytes consumed so far,
 *          CTR  = last counter value used (kept in shuffled form),
 *          T3 and TMP1(%rsp) = running hash,
 *          XMM1..XMM8 as described in step 4.
 * Clobbers rax, r10, r12, T1, T3-T6, T_key and flags.
 *
 * The working registers are addressed as reg_i / reg_j = %xmm<i> / %xmm<j>,
 * and the hash ends up in %xmm8, so XMM8 must be passed as %xmm8.
 * VER is accepted but unused.
 *
 * The AAD is consumed four bytes at a time into a single 16-byte register,
 * so lengths that are not a multiple of 4, or that exceed 16, are not
 * handled correctly here.
 * NOTE(review): callers presumably restrict the AAD length to 8, 12 or 16
 * bytes - confirm against the glue code.
 */
.macro INITIAL_BLOCKS_AVX2 num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
	i = (8-\num_initial_blocks)
	setreg

	mov	arg6, %r10			/* r10 = AAD read pointer */
	mov	arg7, %r12			/* r12 = AAD bytes still to read */
	mov	%r12, %r11			/* r11 = AAD byte count */

	vpxor	reg_i, reg_i, reg_i

	/*
	 * With an empty AAD the accumulator is already the final (all-zero)
	 * value; skip the loops so that nothing is read through r10.
	 */
	test	%r12, %r12
	jz	_get_AAD_loop2_done\@

_get_AAD_loop\@:
	/* Shift the accumulator down one dword and insert the next one on top. */
	vmovd	(%r10), \T1
	vpslldq	$12, \T1, \T1
	vpsrldq	$4, reg_i, reg_i
	vpxor	\T1, reg_i, reg_i

	add	$4, %r10
	sub	$4, %r12
	jg	_get_AAD_loop\@

	cmp	$16, %r11
	je	_get_AAD_loop2_done\@
	mov	$16, %r12

_get_AAD_loop2\@:
	/* Fewer than 16 bytes were read: shift the data down to the bottom. */
	vpsrldq	$4, reg_i, reg_i
	sub	$4, %r12
	cmp	%r11, %r12
	jg	_get_AAD_loop2\@

_get_AAD_loop2_done\@:
	vpshufb	SHUF_MASK(%rip), reg_i, reg_i	/* byte-shuffle the AAD block */

	xor	%r11, %r11			/* r11 = data offset = 0 */

	mov	arg5, %rax
	vmovdqu	(%rax), \CTR			/* CTR = initial counter block */
	vpshufb	SHUF_MASK(%rip), \CTR, \CTR

	/* Generate one counter block per initial block. */
	i = (9-\num_initial_blocks)
	setreg
.rep \num_initial_blocks
	vpaddd	ONE(%rip), \CTR, \CTR
	vpshufb	SHUF_MASK(%rip), \CTR, reg_i	/* reg_i = shuffled copy of CTR */
	i = (i+1)
	setreg
.endr

	/* AES round 0: XOR with the first round key. */
	vmovdqa	(arg1), \T_key
	i = (9-\num_initial_blocks)
	setreg
.rep \num_initial_blocks
	vpxor	\T_key, reg_i, reg_i
	i = (i+1)
	setreg
.endr

	/* AES rounds 1..9. */
	j = 1
	setreg
.rep 9
	vmovdqa	16*j(arg1), \T_key
	i = (9-\num_initial_blocks)
	setreg
.rep \num_initial_blocks
	vaesenc	\T_key, reg_i, reg_i
	i = (i+1)
	setreg
.endr

	j = (j+1)
	setreg
.endr

	/* Final AES round. */
	vmovdqa	16*10(arg1), \T_key
	i = (9-\num_initial_blocks)
	setreg
.rep \num_initial_blocks
	vaesenclast	\T_key, reg_i, reg_i
	i = (i+1)
	setreg
.endr

	/* XOR the keystream with the input, store, keep the ciphertext side. */
	i = (9-\num_initial_blocks)
	setreg
.rep \num_initial_blocks
	vmovdqu	(arg3, %r11), \T1
	vpxor	\T1, reg_i, reg_i
	vmovdqu	reg_i, (arg2, %r11)

	add	$16, %r11
.if \ENC_DEC == DEC
	vmovdqa	\T1, reg_i			/* when decrypting, hash the input */
.endif
	vpshufb	SHUF_MASK(%rip), reg_i, reg_i
	i = (i+1)
	setreg
.endr

	/* Hash the AAD block, then chain each initial block into the hash. */
	i = (8-\num_initial_blocks)
	j = (9-\num_initial_blocks)
	setreg
	GHASH_MUL_AVX2	reg_i, \T2, \T1, \T3, \T4, \T5, \T6

.rep \num_initial_blocks
	vpxor	reg_i, reg_j, reg_j
	GHASH_MUL_AVX2	reg_j, \T2, \T1, \T3, \T4, \T5, \T6
	i = (i+1)
	j = (j+1)
	setreg
.endr

	/* %xmm8 now holds the running hash; keep two copies. */
	vmovdqa	\XMM8, TMP1(%rsp)
	vmovdqa	\XMM8, \T3

	/* r13 is a byte count, hence the unsigned branch. */
	cmp	$128, %r13
	jb	_initial_blocks_done\@

	/* Eight more counter blocks. */
	.irp xr, \XMM1, \XMM2, \XMM3, \XMM4, \XMM5, \XMM6, \XMM7, \XMM8
	vpaddd	ONE(%rip), \CTR, \CTR
	vpshufb	SHUF_MASK(%rip), \CTR, \xr
	.endr

	/* AES round 0. */
	vmovdqa	(arg1), \T_key
	.irp xr, \XMM1, \XMM2, \XMM3, \XMM4, \XMM5, \XMM6, \XMM7, \XMM8
	vpxor	\T_key, \xr, \xr
	.endr

	/* AES rounds 1..9. */
	i = 1
	setreg
.rep 9
	vmovdqa	16*i(arg1), \T_key
	.irp xr, \XMM1, \XMM2, \XMM3, \XMM4, \XMM5, \XMM6, \XMM7, \XMM8
	vaesenc	\T_key, \xr, \xr
	.endr
	i = (i+1)
	setreg
.endr

	/* Final AES round; i is 10 here. */
	vmovdqa	16*i(arg1), \T_key
	.irp xr, \XMM1, \XMM2, \XMM3, \XMM4, \XMM5, \XMM6, \XMM7, \XMM8
	vaesenclast	\T_key, \xr, \xr
	.endr

	/*
	 * XOR each keystream block with its input block and store the result.
	 * When decrypting, the register keeps the input block instead so that
	 * the hash is always computed over the ciphertext side.
	 */
	vmovdqu	(arg3, %r11), \T1
	vpxor	\T1, \XMM1, \XMM1
	vmovdqu	\XMM1, (arg2, %r11)
	.if \ENC_DEC == DEC
	vmovdqa	\T1, \XMM1
	.endif

	vmovdqu	16*1(arg3, %r11), \T1
	vpxor	\T1, \XMM2, \XMM2
	vmovdqu	\XMM2, 16*1(arg2, %r11)
	.if \ENC_DEC == DEC
	vmovdqa	\T1, \XMM2
	.endif

	vmovdqu	16*2(arg3, %r11), \T1
	vpxor	\T1, \XMM3, \XMM3
	vmovdqu	\XMM3, 16*2(arg2, %r11)
	.if \ENC_DEC == DEC
	vmovdqa	\T1, \XMM3
	.endif

	vmovdqu	16*3(arg3, %r11), \T1
	vpxor	\T1, \XMM4, \XMM4
	vmovdqu	\XMM4, 16*3(arg2, %r11)
	.if \ENC_DEC == DEC
	vmovdqa	\T1, \XMM4
	.endif

	vmovdqu	16*4(arg3, %r11), \T1
	vpxor	\T1, \XMM5, \XMM5
	vmovdqu	\XMM5, 16*4(arg2, %r11)
	.if \ENC_DEC == DEC
	vmovdqa	\T1, \XMM5
	.endif

	vmovdqu	16*5(arg3, %r11), \T1
	vpxor	\T1, \XMM6, \XMM6
	vmovdqu	\XMM6, 16*5(arg2, %r11)
	.if \ENC_DEC == DEC
	vmovdqa	\T1, \XMM6
	.endif

	vmovdqu	16*6(arg3, %r11), \T1
	vpxor	\T1, \XMM7, \XMM7
	vmovdqu	\XMM7, 16*6(arg2, %r11)
	.if \ENC_DEC == DEC
	vmovdqa	\T1, \XMM7
	.endif

	vmovdqu	16*7(arg3, %r11), \T1
	vpxor	\T1, \XMM8, \XMM8
	vmovdqu	\XMM8, 16*7(arg2, %r11)
	.if \ENC_DEC == DEC
	vmovdqa	\T1, \XMM8
	.endif

	add	$128, %r11

	/* Byte-shuffle all eight blocks, then fold the running hash into XMM1. */
	.irp xr, \XMM1, \XMM2, \XMM3, \XMM4, \XMM5, \XMM6, \XMM7, \XMM8
	vpshufb	SHUF_MASK(%rip), \xr, \xr
	.endr
	vpxor	TMP1(%rsp), \XMM1, \XMM1

_initial_blocks_done\@:

.endm
1890
1891
1892
1893
1894
1895
1896
/*
 * GHASH_8_ENCRYPT_8_PARALLEL_AVX2 T1 T2 T3 T4 T5 T6 CTR XMM1..XMM8 T7
 *                                 loop_idx ENC_DEC
 *
 * One iteration of the main loop.  Two independent jobs are interleaved
 * instruction by instruction:
 *   - AES: eight new counter blocks are run through the round keys at
 *     16*0 .. 16*10(arg1), XORed with the input at (arg3,%r11) and the
 *     result is written to (arg2,%r11);
 *   - GHASH: the eight blocks passed in via XMM1..XMM8 are multiplied by
 *     HashKey_8 .. HashKey and the sum is reduced with POLY2.
 *
 * loop_idx selects how the counter is advanced: in_order adds ONE and then
 * byte-shuffles each block, anything else adds ONEf without shuffling.
 * On exit XMM1..XMM8 hold the byte-shuffled ciphertext side of the eight
 * new blocks (the output when ENC, the input when DEC) with the reduced
 * hash XORed into XMM1, and CTR holds the last counter used.
 *
 * Clobbers T1-T7 and the stack slots TMP2..TMP8.  The last-round loop
 * addresses the blocks as reg_j = %xmm1..%xmm8, so XMM1..XMM8 have to be
 * passed as exactly those registers.  The order of the instructions below
 * is kept as in the original interleaving.
 */
.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC

	/* Park the eight blocks to be hashed: first in T2, the rest on the stack. */
	vmovdqa		\XMM1, \T2
	vmovdqa		\XMM2, TMP2(%rsp)
	vmovdqa		\XMM3, TMP3(%rsp)
	vmovdqa		\XMM4, TMP4(%rsp)
	vmovdqa		\XMM5, TMP5(%rsp)
	vmovdqa		\XMM6, TMP6(%rsp)
	vmovdqa		\XMM7, TMP7(%rsp)
	vmovdqa		\XMM8, TMP8(%rsp)

	/* Eight consecutive counter values; CTR ends up equal to the last one. */
.if \loop_idx == in_order
	vpaddd		ONE(%rip), \CTR, \XMM1
	vpaddd		ONE(%rip), \XMM1, \XMM2
	vpaddd		ONE(%rip), \XMM2, \XMM3
	vpaddd		ONE(%rip), \XMM3, \XMM4
	vpaddd		ONE(%rip), \XMM4, \XMM5
	vpaddd		ONE(%rip), \XMM5, \XMM6
	vpaddd		ONE(%rip), \XMM6, \XMM7
	vpaddd		ONE(%rip), \XMM7, \XMM8
	vmovdqa		\XMM8, \CTR

	.irp xr, \XMM1, \XMM2, \XMM3, \XMM4, \XMM5, \XMM6, \XMM7, \XMM8
	vpshufb		SHUF_MASK(%rip), \xr, \xr
	.endr
.else
	vpaddd		ONEf(%rip), \CTR, \XMM1
	vpaddd		ONEf(%rip), \XMM1, \XMM2
	vpaddd		ONEf(%rip), \XMM2, \XMM3
	vpaddd		ONEf(%rip), \XMM3, \XMM4
	vpaddd		ONEf(%rip), \XMM4, \XMM5
	vpaddd		ONEf(%rip), \XMM5, \XMM6
	vpaddd		ONEf(%rip), \XMM6, \XMM7
	vpaddd		ONEf(%rip), \XMM7, \XMM8
	vmovdqa		\XMM8, \CTR
.endif

	/* AES round 0. */
	vmovdqu		(arg1), \T1
	.irp xr, \XMM1, \XMM2, \XMM3, \XMM4, \XMM5, \XMM6, \XMM7, \XMM8
	vpxor		\T1, \xr, \xr
	.endr

	/* AES round 1. */
	vmovdqu		16*1(arg1), \T1
	.irp xr, \XMM1, \XMM2, \XMM3, \XMM4, \XMM5, \XMM6, \XMM7, \XMM8
	vaesenc		\T1, \xr, \xr
	.endr

	/* AES round 2. */
	vmovdqu		16*2(arg1), \T1
	.irp xr, \XMM1, \XMM2, \XMM3, \XMM4, \XMM5, \XMM6, \XMM7, \XMM8
	vaesenc		\T1, \xr, \xr
	.endr

	/*
	 * GHASH, block 1 x HashKey_8.  This starts the three accumulators:
	 * T4 = high products, T7 = low products, T6 = cross products.
	 */
	vmovdqa		HashKey_8(arg1), \T5
	vpclmulqdq	$0x11, \T5, \T2, \T4
	vpclmulqdq	$0x00, \T5, \T2, \T7
	vpclmulqdq	$0x01, \T5, \T2, \T6
	vpclmulqdq	$0x10, \T5, \T2, \T5
	vpxor		\T5, \T6, \T6

	/* AES round 3. */
	vmovdqu		16*3(arg1), \T1
	.irp xr, \XMM1, \XMM2, \XMM3, \XMM4, \XMM5, \XMM6, \XMM7, \XMM8
	vaesenc		\T1, \xr, \xr
	.endr

	/* GHASH, block 2 x HashKey_7. */
	vmovdqa		TMP2(%rsp), \T1
	vmovdqa		HashKey_7(arg1), \T5
	vpclmulqdq	$0x11, \T5, \T1, \T3
	vpxor		\T3, \T4, \T4

	vpclmulqdq	$0x00, \T5, \T1, \T3
	vpxor		\T3, \T7, \T7

	vpclmulqdq	$0x01, \T5, \T1, \T3
	vpxor		\T3, \T6, \T6

	vpclmulqdq	$0x10, \T5, \T1, \T3
	vpxor		\T3, \T6, \T6

	/* AES round 4. */
	vmovdqu		16*4(arg1), \T1
	.irp xr, \XMM1, \XMM2, \XMM3, \XMM4, \XMM5, \XMM6, \XMM7, \XMM8
	vaesenc		\T1, \xr, \xr
	.endr

	/* GHASH, block 3 x HashKey_6. */
	vmovdqa		TMP3(%rsp), \T1
	vmovdqa		HashKey_6(arg1), \T5
	vpclmulqdq	$0x11, \T5, \T1, \T3
	vpxor		\T3, \T4, \T4

	vpclmulqdq	$0x00, \T5, \T1, \T3
	vpxor		\T3, \T7, \T7

	vpclmulqdq	$0x01, \T5, \T1, \T3
	vpxor		\T3, \T6, \T6

	vpclmulqdq	$0x10, \T5, \T1, \T3
	vpxor		\T3, \T6, \T6

	/* AES round 5. */
	vmovdqu		16*5(arg1), \T1
	.irp xr, \XMM1, \XMM2, \XMM3, \XMM4, \XMM5, \XMM6, \XMM7, \XMM8
	vaesenc		\T1, \xr, \xr
	.endr

	/* GHASH, block 4 x HashKey_5. */
	vmovdqa		TMP4(%rsp), \T1
	vmovdqa		HashKey_5(arg1), \T5
	vpclmulqdq	$0x11, \T5, \T1, \T3
	vpxor		\T3, \T4, \T4

	vpclmulqdq	$0x00, \T5, \T1, \T3
	vpxor		\T3, \T7, \T7

	vpclmulqdq	$0x01, \T5, \T1, \T3
	vpxor		\T3, \T6, \T6

	vpclmulqdq	$0x10, \T5, \T1, \T3
	vpxor		\T3, \T6, \T6

	/* AES round 6. */
	vmovdqu		16*6(arg1), \T1
	.irp xr, \XMM1, \XMM2, \XMM3, \XMM4, \XMM5, \XMM6, \XMM7, \XMM8
	vaesenc		\T1, \xr, \xr
	.endr

	/* GHASH, block 5 x HashKey_4. */
	vmovdqa		TMP5(%rsp), \T1
	vmovdqa		HashKey_4(arg1), \T5
	vpclmulqdq	$0x11, \T5, \T1, \T3
	vpxor		\T3, \T4, \T4

	vpclmulqdq	$0x00, \T5, \T1, \T3
	vpxor		\T3, \T7, \T7

	vpclmulqdq	$0x01, \T5, \T1, \T3
	vpxor		\T3, \T6, \T6

	vpclmulqdq	$0x10, \T5, \T1, \T3
	vpxor		\T3, \T6, \T6

	/* AES round 7. */
	vmovdqu		16*7(arg1), \T1
	.irp xr, \XMM1, \XMM2, \XMM3, \XMM4, \XMM5, \XMM6, \XMM7, \XMM8
	vaesenc		\T1, \xr, \xr
	.endr

	/* GHASH, block 6 x HashKey_3. */
	vmovdqa		TMP6(%rsp), \T1
	vmovdqa		HashKey_3(arg1), \T5
	vpclmulqdq	$0x11, \T5, \T1, \T3
	vpxor		\T3, \T4, \T4

	vpclmulqdq	$0x00, \T5, \T1, \T3
	vpxor		\T3, \T7, \T7

	vpclmulqdq	$0x01, \T5, \T1, \T3
	vpxor		\T3, \T6, \T6

	vpclmulqdq	$0x10, \T5, \T1, \T3
	vpxor		\T3, \T6, \T6

	/* AES round 8. */
	vmovdqu		16*8(arg1), \T1
	.irp xr, \XMM1, \XMM2, \XMM3, \XMM4, \XMM5, \XMM6, \XMM7, \XMM8
	vaesenc		\T1, \xr, \xr
	.endr

	/* GHASH, block 7 x HashKey_2. */
	vmovdqa		TMP7(%rsp), \T1
	vmovdqa		HashKey_2(arg1), \T5
	vpclmulqdq	$0x11, \T5, \T1, \T3
	vpxor		\T3, \T4, \T4

	vpclmulqdq	$0x00, \T5, \T1, \T3
	vpxor		\T3, \T7, \T7

	vpclmulqdq	$0x01, \T5, \T1, \T3
	vpxor		\T3, \T6, \T6

	vpclmulqdq	$0x10, \T5, \T1, \T3
	vpxor		\T3, \T6, \T6

	/* AES round 9; this one stages the round key in T5 instead of T1. */
	vmovdqu		16*9(arg1), \T5
	.irp xr, \XMM1, \XMM2, \XMM3, \XMM4, \XMM5, \XMM6, \XMM7, \XMM8
	vaesenc		\T5, \xr, \xr
	.endr

	/* GHASH, block 8 x HashKey; the high accumulator moves from T4 to T1. */
	vmovdqa		TMP8(%rsp), \T1
	vmovdqa		HashKey(arg1), \T5

	vpclmulqdq	$0x00, \T5, \T1, \T3
	vpxor		\T3, \T7, \T7

	vpclmulqdq	$0x01, \T5, \T1, \T3
	vpxor		\T3, \T6, \T6

	vpclmulqdq	$0x10, \T5, \T1, \T3
	vpxor		\T3, \T6, \T6

	vpclmulqdq	$0x11, \T5, \T1, \T3
	vpxor		\T3, \T4, \T1

	/*
	 * Last AES round.  The input block is XORed into the last round key
	 * first, so vaesenclast directly yields keystream XOR input.
	 */
	vmovdqu		16*10(arg1), \T5

	i = 0
	j = 1
	setreg
.rep 8
	vpxor		16*i(arg3, %r11), \T5, \T2
	.if \ENC_DEC == ENC
	vaesenclast	\T2, reg_j, reg_j
	.else
	vaesenclast	\T2, reg_j, \T3
	vmovdqu		16*i(arg3, %r11), reg_j		/* keep the input for hashing */
	vmovdqu		\T3, 16*i(arg2, %r11)
	.endif
	i = (i+1)
	j = (j+1)
	setreg
.endr

	/* Split the cross-product sum over the 256-bit value T1:T7. */
	vpslldq		$8, \T6, \T3
	vpsrldq		$8, \T6, \T6
	vpxor		\T3, \T7, \T7
	vpxor		\T6, \T1, \T1

	/* Reduction, first phase. */
	vmovdqa		POLY2(%rip), \T3

	vpclmulqdq	$0x01, \T7, \T3, \T2
	vpslldq		$8, \T2, \T2

	vpxor		\T2, \T7, \T7

	/* When encrypting, the eight output blocks are written here. */
	.if \ENC_DEC == ENC
	vmovdqu		\XMM1, 16*0(arg2,%r11)
	vmovdqu		\XMM2, 16*1(arg2,%r11)
	vmovdqu		\XMM3, 16*2(arg2,%r11)
	vmovdqu		\XMM4, 16*3(arg2,%r11)
	vmovdqu		\XMM5, 16*4(arg2,%r11)
	vmovdqu		\XMM6, 16*5(arg2,%r11)
	vmovdqu		\XMM7, 16*6(arg2,%r11)
	vmovdqu		\XMM8, 16*7(arg2,%r11)
	.endif

	/* Reduction, second phase. */
	vpclmulqdq	$0x00, \T7, \T3, \T2
	vpsrldq		$4, \T2, \T2

	vpclmulqdq	$0x10, \T7, \T3, \T4
	vpslldq		$4, \T4, \T4

	vpxor		\T2, \T4, \T4

	vpxor		\T4, \T1, \T1			/* T1 = reduced hash */

	/* Byte-shuffle the new blocks and fold the hash into the first one. */
	.irp xr, \XMM1, \XMM2, \XMM3, \XMM4, \XMM5, \XMM6, \XMM7, \XMM8
	vpshufb		SHUF_MASK(%rip), \xr, \xr
	.endr

	vpxor		\T1, \XMM1, \XMM1

.endm
2239
2240
2241
/*
 * GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1..XMM8
 *
 * Hashes the eight blocks in XMM1..XMM8 against HashKey_8 .. HashKey and
 * reduces the sum with POLY2.  Each 128x128 multiply uses three carry-less
 * products: high halves, low halves, and (hi^lo) x (hi^lo) for the middle
 * term.
 *
 *   T6          out: reduced result
 *   T5          ends up holding the value loaded from HashKey(arg1)
 *   T2-T4, T7   scratch, clobbered
 *   XMM1        clobbered (accumulates the middle products)
 *   XMM2..XMM8  only read
 *   T1          accepted but never referenced
 *
 * The key loads use vmovdqa, so arg1 must be 16-byte aligned.
 */
.macro GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8

	/* Block 1 x HashKey_8: initialises T6 (high), T7 (low), XMM1 (middle). */
	vmovdqa		HashKey_8(arg1), \T5

	vpclmulqdq	$0x11, \T5, \XMM1, \T6
	vpclmulqdq	$0x00, \T5, \XMM1, \T7

	vpshufd		$0b01001110, \XMM1, \T2		/* swap the 64-bit halves */
	vpxor		\XMM1, \T2, \T2
	vpshufd		$0b01001110, \T5, \T3
	vpxor		\T5, \T3, \T3

	vpclmulqdq	$0x00, \T3, \T2, \XMM1

	/* Block 2 x HashKey_7. */
	vmovdqa		HashKey_7(arg1), \T5

	vpclmulqdq	$0x11, \T5, \XMM2, \T4
	vpxor		\T4, \T6, \T6
	vpclmulqdq	$0x00, \T5, \XMM2, \T4
	vpxor		\T4, \T7, \T7

	vpshufd		$0b01001110, \XMM2, \T2
	vpxor		\XMM2, \T2, \T2
	vpshufd		$0b01001110, \T5, \T3
	vpxor		\T5, \T3, \T3

	vpclmulqdq	$0x00, \T3, \T2, \T2
	vpxor		\T2, \XMM1, \XMM1

	/* Block 3 x HashKey_6. */
	vmovdqa		HashKey_6(arg1), \T5

	vpclmulqdq	$0x11, \T5, \XMM3, \T4
	vpxor		\T4, \T6, \T6
	vpclmulqdq	$0x00, \T5, \XMM3, \T4
	vpxor		\T4, \T7, \T7

	vpshufd		$0b01001110, \XMM3, \T2
	vpxor		\XMM3, \T2, \T2
	vpshufd		$0b01001110, \T5, \T3
	vpxor		\T5, \T3, \T3

	vpclmulqdq	$0x00, \T3, \T2, \T2
	vpxor		\T2, \XMM1, \XMM1

	/* Block 4 x HashKey_5. */
	vmovdqa		HashKey_5(arg1), \T5

	vpclmulqdq	$0x11, \T5, \XMM4, \T4
	vpxor		\T4, \T6, \T6
	vpclmulqdq	$0x00, \T5, \XMM4, \T4
	vpxor		\T4, \T7, \T7

	vpshufd		$0b01001110, \XMM4, \T2
	vpxor		\XMM4, \T2, \T2
	vpshufd		$0b01001110, \T5, \T3
	vpxor		\T5, \T3, \T3

	vpclmulqdq	$0x00, \T3, \T2, \T2
	vpxor		\T2, \XMM1, \XMM1

	/* Block 5 x HashKey_4. */
	vmovdqa		HashKey_4(arg1), \T5

	vpclmulqdq	$0x11, \T5, \XMM5, \T4
	vpxor		\T4, \T6, \T6
	vpclmulqdq	$0x00, \T5, \XMM5, \T4
	vpxor		\T4, \T7, \T7

	vpshufd		$0b01001110, \XMM5, \T2
	vpxor		\XMM5, \T2, \T2
	vpshufd		$0b01001110, \T5, \T3
	vpxor		\T5, \T3, \T3

	vpclmulqdq	$0x00, \T3, \T2, \T2
	vpxor		\T2, \XMM1, \XMM1

	/* Block 6 x HashKey_3. */
	vmovdqa		HashKey_3(arg1), \T5

	vpclmulqdq	$0x11, \T5, \XMM6, \T4
	vpxor		\T4, \T6, \T6
	vpclmulqdq	$0x00, \T5, \XMM6, \T4
	vpxor		\T4, \T7, \T7

	vpshufd		$0b01001110, \XMM6, \T2
	vpxor		\XMM6, \T2, \T2
	vpshufd		$0b01001110, \T5, \T3
	vpxor		\T5, \T3, \T3

	vpclmulqdq	$0x00, \T3, \T2, \T2
	vpxor		\T2, \XMM1, \XMM1

	/* Block 7 x HashKey_2. */
	vmovdqa		HashKey_2(arg1), \T5

	vpclmulqdq	$0x11, \T5, \XMM7, \T4
	vpxor		\T4, \T6, \T6
	vpclmulqdq	$0x00, \T5, \XMM7, \T4
	vpxor		\T4, \T7, \T7

	vpshufd		$0b01001110, \XMM7, \T2
	vpxor		\XMM7, \T2, \T2
	vpshufd		$0b01001110, \T5, \T3
	vpxor		\T5, \T3, \T3

	vpclmulqdq	$0x00, \T3, \T2, \T2
	vpxor		\T2, \XMM1, \XMM1

	/* Block 8 x HashKey. */
	vmovdqa		HashKey(arg1), \T5

	vpclmulqdq	$0x11, \T5, \XMM8, \T4
	vpxor		\T4, \T6, \T6
	vpclmulqdq	$0x00, \T5, \XMM8, \T4
	vpxor		\T4, \T7, \T7

	vpshufd		$0b01001110, \XMM8, \T2
	vpxor		\XMM8, \T2, \T2
	vpshufd		$0b01001110, \T5, \T3
	vpxor		\T5, \T3, \T3

	vpclmulqdq	$0x00, \T3, \T2, \T2
	vpxor		\T2, \XMM1, \XMM1

	/* Middle term = middle products ^ high sum ^ low sum. */
	vpxor		\T6, \XMM1, \XMM1
	vpxor		\T7, \XMM1, \T2

	/* Spread the middle term over the 256-bit value T6:T7. */
	vpslldq		$8, \T2, \T4
	vpsrldq		$8, \T2, \T2

	vpxor		\T4, \T7, \T7
	vpxor		\T2, \T6, \T6

	/* Reduction, first phase. */
	vmovdqa		POLY2(%rip), \T3

	vpclmulqdq	$0x01, \T7, \T3, \T2
	vpslldq		$8, \T2, \T2

	vpxor		\T2, \T7, \T7

	/* Reduction, second phase. */
	vpclmulqdq	$0x00, \T7, \T3, \T2
	vpclmulqdq	$0x10, \T7, \T3, \T4
	vpsrldq		$4, \T2, \T2
	vpslldq		$4, \T4, \T4

	vpxor		\T2, \T4, \T4

	vpxor		\T4, \T6, \T6			/* result in T6 */
.endm
2418
2419
2420
2421
2422
2423
/*
 * GCM_ENC_DEC_AVX2 ENC_DEC
 *
 * Complete body (without the final ret) of the AVX2 GCM encrypt/decrypt
 * routines; ENC_DEC is ENC or DEC.
 *
 * Arguments as used below:
 *   arg1  context: round keys at 16*0..16*10, hash-key table at HashKey...
 *   arg2  output buffer
 *   arg3  input buffer
 *   arg4  input length in bytes (shifted to a bit count near the end)
 *   arg5  address of the initial counter block
 *   arg6  AAD pointer          (read inside INITIAL_BLOCKS_AVX2)
 *   arg7  AAD length in bytes  (stack argument, addressed through r14)
 *   arg8  address the tag is written to
 *   arg9  tag length in bytes
 *
 * Register roles:
 *   r14    stack pointer right after the four pushes; arg7..arg9 are
 *          addressed relative to it, so the number of pushes has to match
 *          STACK_OFFSET (8*4)
 *   r13    whole-block bytes still to process, later the tail length
 *   r11    byte offset into the input/output buffers
 *   r15    low byte of the counter, used to pick the add-without-shuffle
 *          fast path while that byte cannot overflow
 *   xmm9   counter block, xmm13 hash key, xmm14 running hash
 *
 * The stack area of VARIABLE_OFFSET bytes (defined elsewhere in the file)
 * is aligned to 64 bytes and holds the TMPn slots.
 *
 * NOTE(review): any tag length other than 16 or 12 falls through to the
 * 8-byte store at _T_8 - callers presumably pass only 8, 12 or 16; confirm.
 */
.macro GCM_ENC_DEC_AVX2 ENC_DEC

	push	%r12
	push	%r13
	push	%r14
	push	%r15

	mov	%rsp, %r14			/* frame anchor for arg7..arg9 */

	sub	$VARIABLE_OFFSET, %rsp
	and	$~63, %rsp			/* 64-byte aligned scratch area */

	vmovdqu	HashKey(arg1), %xmm13

	mov	arg4, %r13
	and	$-16, %r13			/* r13 = length rounded down to blocks */

	/* Number of whole blocks modulo 8 selects the INITIAL_BLOCKS variant. */
	mov	%r13, %r12
	shr	$4, %r12
	and	$7, %r12
	jz	_initial_num_blocks_is_0\@

	cmp	$7, %r12
	je	_initial_num_blocks_is_7\@
	cmp	$6, %r12
	je	_initial_num_blocks_is_6\@
	cmp	$5, %r12
	je	_initial_num_blocks_is_5\@
	cmp	$4, %r12
	je	_initial_num_blocks_is_4\@
	cmp	$3, %r12
	je	_initial_num_blocks_is_3\@
	cmp	$2, %r12
	je	_initial_num_blocks_is_2\@

	jmp	_initial_num_blocks_is_1\@

_initial_num_blocks_is_7\@:
	INITIAL_BLOCKS_AVX2 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
	sub	$16*7, %r13
	jmp	_initial_blocks_encrypted\@

_initial_num_blocks_is_6\@:
	INITIAL_BLOCKS_AVX2 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
	sub	$16*6, %r13
	jmp	_initial_blocks_encrypted\@

_initial_num_blocks_is_5\@:
	INITIAL_BLOCKS_AVX2 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
	sub	$16*5, %r13
	jmp	_initial_blocks_encrypted\@

_initial_num_blocks_is_4\@:
	INITIAL_BLOCKS_AVX2 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
	sub	$16*4, %r13
	jmp	_initial_blocks_encrypted\@

_initial_num_blocks_is_3\@:
	INITIAL_BLOCKS_AVX2 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
	sub	$16*3, %r13
	jmp	_initial_blocks_encrypted\@

_initial_num_blocks_is_2\@:
	INITIAL_BLOCKS_AVX2 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
	sub	$16*2, %r13
	jmp	_initial_blocks_encrypted\@

_initial_num_blocks_is_1\@:
	INITIAL_BLOCKS_AVX2 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
	sub	$16*1, %r13
	jmp	_initial_blocks_encrypted\@

_initial_num_blocks_is_0\@:
	INITIAL_BLOCKS_AVX2 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC

_initial_blocks_encrypted\@:
	test	%r13, %r13
	jz	_zero_cipher_left\@		/* no whole blocks beyond the initial ones */

	sub	$128, %r13
	je	_eight_cipher_left\@		/* exactly the eight already in xmm1..xmm8 */

	/* r15d = low byte of the counter, taken before shuffling it back. */
	vmovd	%xmm9, %r15d
	and	$255, %r15d
	vpshufb	SHUF_MASK(%rip), %xmm9, %xmm9

_encrypt_by_8_new\@:
	/* If adding 8 could carry out of the low byte, take the slow path. */
	cmp	$(255-8), %r15d
	jg	_encrypt_by_8\@

	add	$8, %r15b
	GHASH_8_ENCRYPT_8_PARALLEL_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
	add	$128, %r11
	sub	$128, %r13
	jne	_encrypt_by_8_new\@

	vpshufb	SHUF_MASK(%rip), %xmm9, %xmm9
	jmp	_eight_cipher_left\@

_encrypt_by_8\@:
	/* Counter is shuffled around the call so the in_order variant can add ONE. */
	vpshufb	SHUF_MASK(%rip), %xmm9, %xmm9
	add	$8, %r15b
	GHASH_8_ENCRYPT_8_PARALLEL_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
	vpshufb	SHUF_MASK(%rip), %xmm9, %xmm9
	add	$128, %r11
	sub	$128, %r13
	jne	_encrypt_by_8_new\@

	vpshufb	SHUF_MASK(%rip), %xmm9, %xmm9

_eight_cipher_left\@:
	/* Hash the last eight blocks; the result lands in xmm14. */
	GHASH_LAST_8_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8

_zero_cipher_left\@:
	/* arg4 is a byte count: compare unsigned. */
	cmp	$16, arg4
	jb	_only_less_than_16\@

	mov	arg4, %r13
	and	$15, %r13			/* r13 = length mod 16 */

	je	_multiple_of_16_bytes\@

	/* Trailing partial block: one more counter block of keystream. */
	vpaddd	ONE(%rip), %xmm9, %xmm9
	vpshufb	SHUF_MASK(%rip), %xmm9, %xmm9
	ENCRYPT_SINGLE_BLOCK	%xmm9

	/* Load the 16 bytes that end exactly at the end of the input. */
	sub	$16, %r11
	add	%r13, %r11
	vmovdqu	(arg3, %r11), %xmm1

	lea	SHIFT_MASK+16(%rip), %r12
	sub	%r13, %r12			/* r12 = SHIFT_MASK + 16 - r13 */

	vmovdqu	(%r12), %xmm2
	vpshufb	%xmm2, %xmm1, %xmm1		/* move the tail bytes to the bottom */
	jmp	_final_ghash_mul\@

_only_less_than_16\@:
	mov	arg4, %r13
	and	$15, %r13

	je	_multiple_of_16_bytes\@		/* zero-length input */

	vpaddd	ONE(%rip), %xmm9, %xmm9
	vpshufb	SHUF_MASK(%rip), %xmm9, %xmm9
	ENCRYPT_SINGLE_BLOCK	%xmm9

	lea	SHIFT_MASK+16(%rip), %r12
	sub	%r13, %r12

	/* Copy byte by byte so nothing outside the short input is read. */
_get_last_16_byte_loop\@:
	movb	(arg3, %r11), %al
	movb	%al, TMP1(%rsp, %r11)
	add	$1, %r11
	cmp	%r13, %r11
	jne	_get_last_16_byte_loop\@

	vmovdqu	TMP1(%rsp), %xmm1

	sub	$16, %r11

_final_ghash_mul\@:
	/* ALL_F-SHIFT_MASK(%r12) is a mask that keeps the low r13 bytes. */
	.if \ENC_DEC == DEC
	vmovdqa	%xmm1, %xmm2			/* keep the input for hashing */
	vpxor	%xmm1, %xmm9, %xmm9		/* keystream XOR input */
	vmovdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
	vpand	%xmm1, %xmm9, %xmm9
	vpand	%xmm1, %xmm2, %xmm2
	vpshufb	SHUF_MASK(%rip), %xmm2, %xmm2
	vpxor	%xmm2, %xmm14, %xmm14

	GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
	sub	%r13, %r11
	add	$16, %r11
	.else
	vpxor	%xmm1, %xmm9, %xmm9		/* keystream XOR input */
	vmovdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
	vpand	%xmm1, %xmm9, %xmm9
	vpshufb	SHUF_MASK(%rip), %xmm9, %xmm9
	vpxor	%xmm9, %xmm14, %xmm14

	GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
	sub	%r13, %r11
	add	$16, %r11
	vpshufb	SHUF_MASK(%rip), %xmm9, %xmm9	/* undo the shuffle before storing */
	.endif

	/* Store the r13 tail bytes: one qword if more than 8, then bytes. */
	vmovq	%xmm9, %rax
	cmp	$8, %r13
	jle	_less_than_8_bytes_left\@

	mov	%rax, (arg2, %r11)
	add	$8, %r11
	vpsrldq	$8, %xmm9, %xmm9
	vmovq	%xmm9, %rax
	sub	$8, %r13

_less_than_8_bytes_left\@:
	movb	%al, (arg2, %r11)
	add	$1, %r11
	shr	$8, %rax
	sub	$1, %r13
	jne	_less_than_8_bytes_left\@

_multiple_of_16_bytes\@:
	/* Length block: AAD bit count in the high qword, data bit count low. */
	mov	arg7, %r12
	shl	$3, %r12
	vmovd	%r12d, %xmm15

	shl	$3, arg4
	vmovq	arg4, %xmm1
	vpslldq	$8, %xmm15, %xmm15
	vpxor	%xmm1, %xmm15, %xmm15

	vpxor	%xmm15, %xmm14, %xmm14
	GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
	vpshufb	SHUF_MASK(%rip), %xmm14, %xmm14

	/* Tag = encrypted initial counter block XOR final hash. */
	mov	arg5, %rax
	vmovdqu	(%rax), %xmm9

	ENCRYPT_SINGLE_BLOCK	%xmm9

	vpxor	%xmm14, %xmm9, %xmm9

_return_T\@:
	mov	arg8, %r10			/* r10 = tag destination */
	mov	arg9, %r11			/* r11 = tag length */

	cmp	$16, %r11
	je	_T_16\@

	cmp	$12, %r11
	je	_T_12\@

_T_8\@:
	vmovq	%xmm9, %rax
	mov	%rax, (%r10)
	jmp	_return_T_done\@
_T_12\@:
	vmovq	%xmm9, %rax
	mov	%rax, (%r10)
	vpsrldq	$8, %xmm9, %xmm9
	vmovd	%xmm9, %eax
	mov	%eax, 8(%r10)
	jmp	_return_T_done\@

_T_16\@:
	vmovdqu	%xmm9, (%r10)

_return_T_done\@:
	mov	%r14, %rsp			/* drop the aligned scratch area */

	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
.endm
2715
2716
2717
2718
2719
2720
2721
2722
/*
 * aesni_gcm_precomp_avx_gen4
 *
 * In:  arg1 = context; results are stored at HashKey(arg1) and, through
 *             PRECOMPUTE_AVX2, at HashKey_2(arg1) .. HashKey_8(arg1).
 *             Stored with vmovdqa, so it must be 16-byte aligned.
 *      arg2 = address of the 16-byte hash subkey (unaligned load).
 * Clobbers xmm0-xmm6 and flags.
 *
 * The subkey is byte-shuffled, shifted left by one bit as a 128-bit value
 * and, if a bit was shifted out of the top, XORed with POLY.  That value
 * is stored as HashKey and then expanded into its higher powers.
 *
 * This is a leaf routine: neither the code below nor PRECOMPUTE_AVX2 /
 * GHASH_MUL_AVX2 uses the stack or r12-r15, so no frame is set up and no
 * callee-saved register needs to be preserved.
 */
ENTRY(aesni_gcm_precomp_avx_gen4)

	vmovdqu	(arg2), %xmm6
	vpshufb	SHUF_MASK(%rip), %xmm6, %xmm6

	/* 128-bit shift left by one, built from two 64-bit shifts. */
	vpsrlq	$63, %xmm6, %xmm2		/* top bit of each qword */
	vpsllq	$1, %xmm6, %xmm6
	vpsrldq	$8, %xmm2, %xmm1		/* xmm1 = bit shifted out of the top */
	vpslldq	$8, %xmm2, %xmm2		/* carry from low into high qword */
	vpor	%xmm2, %xmm6, %xmm6

	/* Build an all-ones mask iff a bit was shifted out, then apply POLY. */
	vpshufd	$0b00100100, %xmm1, %xmm2
	vpcmpeqd	TWOONE(%rip), %xmm2, %xmm2
	vpand	POLY(%rip), %xmm2, %xmm2
	vpxor	%xmm2, %xmm6, %xmm6

	vmovdqa	%xmm6, HashKey(arg1)

	PRECOMPUTE_AVX2 %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5

	ret
ENDPROC(aesni_gcm_precomp_avx_gen4)
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
/*
 * aesni_gcm_enc_avx_gen4
 *
 * Exported routine: one expansion of GCM_ENC_DEC_AVX2 with the ENC
 * selector (ENC = 1), then a return.  The macro saves r12-r15 on entry and
 * restores them together with the stack pointer before it ends, so the ret
 * below sees the return address on top of the stack.
 *
 * Arguments as consumed by the macro: arg1 context (round keys and
 * hash-key table), arg2 output, arg3 input, arg4 length in bytes, arg5
 * address of the initial counter block, arg6 AAD pointer, arg7 AAD length,
 * arg8 tag destination, arg9 tag length (arg7-arg9 are on the stack).
 */
ENTRY(aesni_gcm_enc_avx_gen4)
	GCM_ENC_DEC_AVX2	ENC
	ret
ENDPROC(aesni_gcm_enc_avx_gen4)
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
/*
 * aesni_gcm_dec_avx_gen4
 *
 * Exported routine: one expansion of GCM_ENC_DEC_AVX2 with the DEC
 * selector (DEC = 0), then a return.  The macro saves r12-r15 on entry and
 * restores them together with the stack pointer before it ends, so the ret
 * below sees the return address on top of the stack.
 *
 * Arguments as consumed by the macro: arg1 context (round keys and
 * hash-key table), arg2 output, arg3 input, arg4 length in bytes, arg5
 * address of the initial counter block, arg6 AAD pointer, arg7 AAD length,
 * arg8 tag destination, arg9 tag length (arg7-arg9 are on the stack).
 */
ENTRY(aesni_gcm_dec_avx_gen4)
	GCM_ENC_DEC_AVX2	DEC
	ret
ENDPROC(aesni_gcm_dec_avx_gen4)
2810
2811#endif
2812