#include <linux/linkage.h>
#include <asm/inst.h>


.section .rodata.cst16.POLY, "aM", @progbits, 16
.align 16
POLY: .octa 0xC2000000000000000000000000000001

.section .rodata.cst16.POLY2, "aM", @progbits, 16
.align 16
POLY2: .octa 0xC20000000000000000000001C2000000

.section .rodata.cst16.TWOONE, "aM", @progbits, 16
.align 16
TWOONE: .octa 0x00000001000000000000000000000001

.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
.align 16
SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F

.section .rodata.cst16.ONE, "aM", @progbits, 16
.align 16
ONE: .octa 0x00000000000000000000000000000001

.section .rodata.cst16.ONEf, "aM", @progbits, 16
.align 16
ONEf: .octa 0x01000000000000000000000000000000



.section .rodata, "a", @progbits
.align 16
SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
ALL_F: .octa 0xffffffffffffffffffffffffffffffff
       .octa 0x00000000000000000000000000000000

.section .rodata
.align 16
.type aad_shift_arr, @object
.size aad_shift_arr, 272
aad_shift_arr:
        .octa 0xffffffffffffffffffffffffffffffff
        .octa 0xffffffffffffffffffffffffffffff0C
        .octa 0xffffffffffffffffffffffffffff0D0C
        .octa 0xffffffffffffffffffffffffff0E0D0C
        .octa 0xffffffffffffffffffffffff0F0E0D0C
        .octa 0xffffffffffffffffffffff0C0B0A0908
        .octa 0xffffffffffffffffffff0D0C0B0A0908
        .octa 0xffffffffffffffffff0E0D0C0B0A0908
        .octa 0xffffffffffffffff0F0E0D0C0B0A0908
        .octa 0xffffffffffffff0C0B0A090807060504
        .octa 0xffffffffffff0D0C0B0A090807060504
        .octa 0xffffffffff0E0D0C0B0A090807060504
        .octa 0xffffffff0F0E0D0C0B0A090807060504
        .octa 0xffffff0C0B0A09080706050403020100
        .octa 0xffff0D0C0B0A09080706050403020100
        .octa 0xff0E0D0C0B0A09080706050403020100
        .octa 0x0F0E0D0C0B0A09080706050403020100


.text

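# Offsets into the per-request context block passed in arg2 (assumed to
# match the struct gcm_context_data layout used by the C glue code):
# running GHASH value, AAD and text lengths, partial-block state, the
# original counter block, the current counter, and the precomputed
# hash-key powers used for 8-way aggregation.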
#define AadHash 16*0
#define AadLen 16*1
#define InLen (16*1)+8
#define PBlockEncKey 16*2
#define OrigIV 16*3
#define CurCount 16*4
#define PBlockLen 16*5

HashKey = 16*6
HashKey_2 = 16*7
HashKey_3 = 16*8
HashKey_4 = 16*9
HashKey_5 = 16*10
HashKey_6 = 16*11
HashKey_7 = 16*12
HashKey_8 = 16*13
HashKey_k = 16*14
HashKey_2_k = 16*15
HashKey_3_k = 16*16
HashKey_4_k = 16*17
HashKey_5_k = 16*18
HashKey_6_k = 16*19
HashKey_7_k = 16*20
HashKey_8_k = 16*21

#define arg1 %rdi
#define arg2 %rsi
#define arg3 %rdx
#define arg4 %rcx
#define arg5 %r8
#define arg6 %r9
#define arg7 STACK_OFFSET+8*1(%r14)
#define arg8 STACK_OFFSET+8*2(%r14)
#define arg9 STACK_OFFSET+8*3(%r14)
#define arg10 STACK_OFFSET+8*4(%r14)
#define keysize 2*15*16(arg1)

i = 0
j = 0

out_order = 0
in_order = 1
DEC = 0
ENC = 1

.macro define_reg r n
reg_\r = %xmm\n
.endm

.macro setreg
.altmacro
define_reg i %i
define_reg j %j
.noaltmacro
.endm


STACK_OFFSET = 8*4

TMP1 = 16*0
TMP2 = 16*1
TMP3 = 16*2
TMP4 = 16*3
TMP5 = 16*4
TMP6 = 16*5
TMP7 = 16*6
TMP8 = 16*7

VARIABLE_OFFSET = 16*8



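# FUNC_SAVE/FUNC_RESTORE: save the callee-saved GPRs and keep the original
# stack pointer in %r14 so stack-passed arguments (arg7..arg10) stay
# addressable, then carve out VARIABLE_OFFSET bytes of 64-byte-aligned
# scratch space for the TMP1..TMP8 stack slots.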
.macro FUNC_SAVE
        push    %r12
        push    %r13
        push    %r14
        push    %r15

        mov     %rsp, %r14

        sub     $VARIABLE_OFFSET, %rsp
        and     $~63, %rsp
.endm

.macro FUNC_RESTORE
        mov     %r14, %rsp

        pop     %r15
        pop     %r14
        pop     %r13
        pop     %r12
.endm

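# ENCRYPT_SINGLE_BLOCK: encrypt one block held in \XMM0 with the expanded
# key schedule at arg1.  \REP is the number of middle AES rounds:
# 9, 11 or 13 for AES-128, AES-192 and AES-256 respectively.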
.macro ENCRYPT_SINGLE_BLOCK REP XMM0
        vpxor   (arg1), \XMM0, \XMM0
        i = 1
        setreg
.rep \REP
        vaesenc 16*i(arg1), \XMM0, \XMM0
        i = (i+1)
        setreg
.endr
        vaesenclast 16*i(arg1), \XMM0, \XMM0
.endm



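# GCM_ENC_DEC: bulk encrypt/decrypt for one update call.  It first resumes
# any partial block left over from a previous call, then handles 0..7
# initial blocks so the remaining length is a multiple of 128 bytes, runs
# the 8-way interleaved AES-CTR + GHASH loop, and finally writes the state
# (AadHash, CurCount, PBlockLen/PBlockEncKey) back into the context so a
# later call can continue where this one stopped.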
299.macro GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP
300 vmovdqu AadHash(arg2), %xmm8
301 vmovdqu HashKey(arg2), %xmm13
302 add arg5, InLen(arg2)
303
304
305 xor %r11d, %r11d
306
307 PARTIAL_BLOCK \GHASH_MUL, arg3, arg4, arg5, %r11, %xmm8, \ENC_DEC
308 sub %r11, arg5
309
310 mov arg5, %r13
311 and $-16, %r13
312
313 mov %r13, %r12
314 shr $4, %r12
315 and $7, %r12
316 jz _initial_num_blocks_is_0\@
317
318 cmp $7, %r12
319 je _initial_num_blocks_is_7\@
320 cmp $6, %r12
321 je _initial_num_blocks_is_6\@
322 cmp $5, %r12
323 je _initial_num_blocks_is_5\@
324 cmp $4, %r12
325 je _initial_num_blocks_is_4\@
326 cmp $3, %r12
327 je _initial_num_blocks_is_3\@
328 cmp $2, %r12
329 je _initial_num_blocks_is_2\@
330
331 jmp _initial_num_blocks_is_1\@
332
333_initial_num_blocks_is_7\@:
334 \INITIAL_BLOCKS \REP, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
335 sub $16*7, %r13
336 jmp _initial_blocks_encrypted\@
337
338_initial_num_blocks_is_6\@:
339 \INITIAL_BLOCKS \REP, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
340 sub $16*6, %r13
341 jmp _initial_blocks_encrypted\@
342
343_initial_num_blocks_is_5\@:
344 \INITIAL_BLOCKS \REP, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
345 sub $16*5, %r13
346 jmp _initial_blocks_encrypted\@
347
348_initial_num_blocks_is_4\@:
349 \INITIAL_BLOCKS \REP, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
350 sub $16*4, %r13
351 jmp _initial_blocks_encrypted\@
352
353_initial_num_blocks_is_3\@:
354 \INITIAL_BLOCKS \REP, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
355 sub $16*3, %r13
356 jmp _initial_blocks_encrypted\@
357
358_initial_num_blocks_is_2\@:
359 \INITIAL_BLOCKS \REP, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
360 sub $16*2, %r13
361 jmp _initial_blocks_encrypted\@
362
363_initial_num_blocks_is_1\@:
364 \INITIAL_BLOCKS \REP, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
365 sub $16*1, %r13
366 jmp _initial_blocks_encrypted\@
367
368_initial_num_blocks_is_0\@:
369 \INITIAL_BLOCKS \REP, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
370
371
372_initial_blocks_encrypted\@:
373 cmp $0, %r13
374 je _zero_cipher_left\@
375
376 sub $128, %r13
377 je _eight_cipher_left\@
378
379
380
381
382 vmovd %xmm9, %r15d
383 and $255, %r15d
384 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
385
386
387_encrypt_by_8_new\@:
388 cmp $(255-8), %r15d
389 jg _encrypt_by_8\@
390
391
392
393 add $8, %r15b
394 \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
395 add $128, %r11
396 sub $128, %r13
397 jne _encrypt_by_8_new\@
398
399 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
400 jmp _eight_cipher_left\@
401
402_encrypt_by_8\@:
403 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
404 add $8, %r15b
405 \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
406 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
407 add $128, %r11
408 sub $128, %r13
409 jne _encrypt_by_8_new\@
410
411 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
412
413
414
415
416_eight_cipher_left\@:
417 \GHASH_LAST_8 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
418
419
420_zero_cipher_left\@:
421 vmovdqu %xmm14, AadHash(arg2)
422 vmovdqu %xmm9, CurCount(arg2)
423
424
425 mov arg5, %r13
426 and $15, %r13
427
428 je _multiple_of_16_bytes\@
429
430
431
432 mov %r13, PBlockLen(arg2)
433
434 vpaddd ONE(%rip), %xmm9, %xmm9
435 vmovdqu %xmm9, CurCount(arg2)
436 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
437
438 ENCRYPT_SINGLE_BLOCK \REP, %xmm9
439 vmovdqu %xmm9, PBlockEncKey(arg2)
440
441 cmp $16, arg5
442 jge _large_enough_update\@
443
444 lea (arg4,%r11,1), %r10
445 mov %r13, %r12
446
447 READ_PARTIAL_BLOCK %r10 %r12 %xmm1
448
449 lea SHIFT_MASK+16(%rip), %r12
450 sub %r13, %r12
451
452
453
454 jmp _final_ghash_mul\@
455
456_large_enough_update\@:
457 sub $16, %r11
458 add %r13, %r11
459
460
461 vmovdqu (arg4, %r11, 1), %xmm1
462
463 sub %r13, %r11
464 add $16, %r11
465
466 lea SHIFT_MASK+16(%rip), %r12
467
468
469 sub %r13, %r12
470
471 vmovdqu (%r12), %xmm2
472
473 vpshufb %xmm2, %xmm1, %xmm1
474
475_final_ghash_mul\@:
476 .if \ENC_DEC == DEC
477 vmovdqa %xmm1, %xmm2
478 vpxor %xmm1, %xmm9, %xmm9
479 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
480
481 vpand %xmm1, %xmm9, %xmm9
482 vpand %xmm1, %xmm2, %xmm2
483 vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
484 vpxor %xmm2, %xmm14, %xmm14
485
486 vmovdqu %xmm14, AadHash(arg2)
487 .else
488 vpxor %xmm1, %xmm9, %xmm9
489 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
490
491 vpand %xmm1, %xmm9, %xmm9
492 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
493 vpxor %xmm9, %xmm14, %xmm14
494
495 vmovdqu %xmm14, AadHash(arg2)
496 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
497 .endif
498
499
500
501
502 vmovq %xmm9, %rax
503 cmp $8, %r13
504 jle _less_than_8_bytes_left\@
505
506 mov %rax, (arg3 , %r11)
507 add $8, %r11
508 vpsrldq $8, %xmm9, %xmm9
509 vmovq %xmm9, %rax
510 sub $8, %r13
511
512_less_than_8_bytes_left\@:
513 movb %al, (arg3 , %r11)
514 add $1, %r11
515 shr $8, %rax
516 sub $1, %r13
517 jne _less_than_8_bytes_left\@
518
519
520_multiple_of_16_bytes\@:
521.endm
522
523
524
525
526
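# GCM_COMPLETE: finish the GHASH for any pending partial block, fold in the
# AAD and message bit lengths, encrypt the saved initial counter block
# (OrigIV) and XOR it with the hash to form the authentication tag, then
# copy the requested number of tag bytes (up to 16) to the caller's buffer.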
527.macro GCM_COMPLETE GHASH_MUL REP AUTH_TAG AUTH_TAG_LEN
528 vmovdqu AadHash(arg2), %xmm14
529 vmovdqu HashKey(arg2), %xmm13
530
531 mov PBlockLen(arg2), %r12
532 cmp $0, %r12
533 je _partial_done\@
534
535
536 \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
537
538_partial_done\@:
539 mov AadLen(arg2), %r12
540 shl $3, %r12
541 vmovd %r12d, %xmm15
542
543 mov InLen(arg2), %r12
544 shl $3, %r12
545 vmovq %r12, %xmm1
546 vpslldq $8, %xmm15, %xmm15
547 vpxor %xmm1, %xmm15, %xmm15
548
549 vpxor %xmm15, %xmm14, %xmm14
550 \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
551 vpshufb SHUF_MASK(%rip), %xmm14, %xmm14
552
553 vmovdqu OrigIV(arg2), %xmm9
554
555 ENCRYPT_SINGLE_BLOCK \REP, %xmm9
556
557 vpxor %xmm14, %xmm9, %xmm9
558
559
560
561_return_T\@:
562 mov \AUTH_TAG, %r10
563 mov \AUTH_TAG_LEN, %r11
564
565 cmp $16, %r11
566 je _T_16\@
567
568 cmp $8, %r11
569 jl _T_4\@
570
571_T_8\@:
572 vmovq %xmm9, %rax
573 mov %rax, (%r10)
574 add $8, %r10
575 sub $8, %r11
576 vpsrldq $8, %xmm9, %xmm9
577 cmp $0, %r11
578 je _return_T_done\@
579_T_4\@:
580 vmovd %xmm9, %eax
581 mov %eax, (%r10)
582 add $4, %r10
583 sub $4, %r11
584 vpsrldq $4, %xmm9, %xmm9
585 cmp $0, %r11
586 je _return_T_done\@
587_T_123\@:
588 vmovd %xmm9, %eax
589 cmp $2, %r11
590 jl _T_1\@
591 mov %ax, (%r10)
592 cmp $2, %r11
593 je _return_T_done\@
594 add $2, %r10
595 sar $16, %eax
596_T_1\@:
597 mov %al, (%r10)
598 jmp _return_T_done\@
599
600_T_16\@:
601 vmovdqu %xmm9, (%r10)
602
603_return_T_done\@:
604.endm
605
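# CALC_AAD_HASH: GHASH the additional authenticated data.  Whole 16-byte
# blocks are hashed directly; a trailing partial block is assembled 8 or 4
# bytes at a time and left-aligned via aad_shift_arr before the final
# multiply.  The result is stored in AadHash(arg2).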
606.macro CALC_AAD_HASH GHASH_MUL AAD AADLEN T1 T2 T3 T4 T5 T6 T7 T8
607
608 mov \AAD, %r10
609 mov \AADLEN, %r12
610
611
612 mov %r12, %r11
613
614 vpxor \T8, \T8, \T8
615 vpxor \T7, \T7, \T7
616 cmp $16, %r11
617 jl _get_AAD_rest8\@
618_get_AAD_blocks\@:
619 vmovdqu (%r10), \T7
620 vpshufb SHUF_MASK(%rip), \T7, \T7
621 vpxor \T7, \T8, \T8
622 \GHASH_MUL \T8, \T2, \T1, \T3, \T4, \T5, \T6
623 add $16, %r10
624 sub $16, %r12
625 sub $16, %r11
626 cmp $16, %r11
627 jge _get_AAD_blocks\@
628 vmovdqu \T8, \T7
629 cmp $0, %r11
630 je _get_AAD_done\@
631
632 vpxor \T7, \T7, \T7
633
634
635
636
637_get_AAD_rest8\@:
638 cmp $4, %r11
639 jle _get_AAD_rest4\@
640 movq (%r10), \T1
641 add $8, %r10
642 sub $8, %r11
643 vpslldq $8, \T1, \T1
644 vpsrldq $8, \T7, \T7
645 vpxor \T1, \T7, \T7
646 jmp _get_AAD_rest8\@
647_get_AAD_rest4\@:
648 cmp $0, %r11
649 jle _get_AAD_rest0\@
650 mov (%r10), %eax
651 movq %rax, \T1
652 add $4, %r10
653 sub $4, %r11
654 vpslldq $12, \T1, \T1
655 vpsrldq $4, \T7, \T7
656 vpxor \T1, \T7, \T7
657_get_AAD_rest0\@:
658
659
660
661 movq %r12, %r11
662 salq $4, %r11
663 vmovdqu aad_shift_arr(%r11), \T1
664 vpshufb \T1, \T7, \T7
665_get_AAD_rest_final\@:
666 vpshufb SHUF_MASK(%rip), \T7, \T7
667 vpxor \T8, \T7, \T7
668 \GHASH_MUL \T7, \T2, \T1, \T3, \T4, \T5, \T6
669
670_get_AAD_done\@:
671 vmovdqu \T7, AadHash(arg2)
672.endm
673
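# INIT: set up the per-request context.  Stores the AAD length, zeroes the
# running length and partial-block state, saves the supplied counter block
# (OrigIV) and its byte-swapped form (CurCount), converts the hash subkey
# into the bit-shifted form expected by the multiply routines below,
# hashes the AAD, and precomputes the hash-key powers.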
674.macro INIT GHASH_MUL PRECOMPUTE
675 mov arg6, %r11
676 mov %r11, AadLen(arg2)
677 xor %r11d, %r11d
678 mov %r11, InLen(arg2)
679
680 mov %r11, PBlockLen(arg2)
681 mov %r11, PBlockEncKey(arg2)
682 mov arg3, %rax
683 movdqu (%rax), %xmm0
684 movdqu %xmm0, OrigIV(arg2)
685
686 vpshufb SHUF_MASK(%rip), %xmm0, %xmm0
687 movdqu %xmm0, CurCount(arg2)
688
689 vmovdqu (arg4), %xmm6
690
691 vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
692
693 vmovdqa %xmm6, %xmm2
694 vpsllq $1, %xmm6, %xmm6
695 vpsrlq $63, %xmm2, %xmm2
696 vmovdqa %xmm2, %xmm1
697 vpslldq $8, %xmm2, %xmm2
698 vpsrldq $8, %xmm1, %xmm1
699 vpor %xmm2, %xmm6, %xmm6
700
701 vpshufd $0b00100100, %xmm1, %xmm2
702 vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
703 vpand POLY(%rip), %xmm2, %xmm2
704 vpxor %xmm2, %xmm6, %xmm6
705
706 vmovdqu %xmm6, HashKey(arg2)
707
708 CALC_AAD_HASH \GHASH_MUL, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0
709
710 \PRECOMPUTE %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
711.endm
712
713
714
715
716
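# READ_PARTIAL_BLOCK: read DLEN (< 16) bytes from DPTR into the low bytes
# of XMMDst one quadword/byte at a time, without reading past the buffer.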
717.macro READ_PARTIAL_BLOCK DPTR DLEN XMMDst
718 vpxor \XMMDst, \XMMDst, \XMMDst
719
720 cmp $8, \DLEN
721 jl _read_lt8_\@
722 mov (\DPTR), %rax
723 vpinsrq $0, %rax, \XMMDst, \XMMDst
724 sub $8, \DLEN
725 jz _done_read_partial_block_\@
726 xor %eax, %eax
727_read_next_byte_\@:
728 shl $8, %rax
729 mov 7(\DPTR, \DLEN, 1), %al
730 dec \DLEN
731 jnz _read_next_byte_\@
732 vpinsrq $1, %rax, \XMMDst, \XMMDst
733 jmp _done_read_partial_block_\@
734_read_lt8_\@:
735 xor %eax, %eax
736_read_next_byte_lt8_\@:
737 shl $8, %rax
738 mov -1(\DPTR, \DLEN, 1), %al
739 dec \DLEN
740 jnz _read_next_byte_lt8_\@
741 vpinsrq $0, %rax, \XMMDst, \XMMDst
742_done_read_partial_block_\@:
743.endm
744
745
746
747
748
749
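# PARTIAL_BLOCK: if the previous update call ended mid-block, consume new
# input against the saved keystream block (PBlockEncKey), fold completed
# data into the running GHASH, emit the corresponding output bytes, and
# update PBlockLen and the data offset for the rest of this call.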
750.macro PARTIAL_BLOCK GHASH_MUL CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
751 AAD_HASH ENC_DEC
752 mov PBlockLen(arg2), %r13
753 cmp $0, %r13
754 je _partial_block_done_\@
755
756 cmp $16, \PLAIN_CYPH_LEN
757 jl _fewer_than_16_bytes_\@
758 vmovdqu (\PLAIN_CYPH_IN), %xmm1
759 jmp _data_read_\@
760
761_fewer_than_16_bytes_\@:
762 lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
763 mov \PLAIN_CYPH_LEN, %r12
764 READ_PARTIAL_BLOCK %r10 %r12 %xmm1
765
766 mov PBlockLen(arg2), %r13
767
768_data_read_\@:
769
770 vmovdqu PBlockEncKey(arg2), %xmm9
771 vmovdqu HashKey(arg2), %xmm13
772
773 lea SHIFT_MASK(%rip), %r12
774
775
776
777 add %r13, %r12
778 vmovdqu (%r12), %xmm2
779 vpshufb %xmm2, %xmm9, %xmm9
780
781.if \ENC_DEC == DEC
782 vmovdqa %xmm1, %xmm3
783 pxor %xmm1, %xmm9
784
785 mov \PLAIN_CYPH_LEN, %r10
786 add %r13, %r10
787
788 sub $16, %r10
789
790
791 jge _no_extra_mask_1_\@
792 sub %r10, %r12
793_no_extra_mask_1_\@:
794
795 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
796
797 vpand %xmm1, %xmm9, %xmm9
798
799 vpand %xmm1, %xmm3, %xmm3
800 vmovdqa SHUF_MASK(%rip), %xmm10
801 vpshufb %xmm10, %xmm3, %xmm3
802 vpshufb %xmm2, %xmm3, %xmm3
803 vpxor %xmm3, \AAD_HASH, \AAD_HASH
804
805 cmp $0, %r10
806 jl _partial_incomplete_1_\@
807
808
809 \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
810 xor %eax,%eax
811
812 mov %rax, PBlockLen(arg2)
813 jmp _dec_done_\@
814_partial_incomplete_1_\@:
815 add \PLAIN_CYPH_LEN, PBlockLen(arg2)
816_dec_done_\@:
817 vmovdqu \AAD_HASH, AadHash(arg2)
818.else
819 vpxor %xmm1, %xmm9, %xmm9
820
821 mov \PLAIN_CYPH_LEN, %r10
822 add %r13, %r10
823
824 sub $16, %r10
825
826
827 jge _no_extra_mask_2_\@
828 sub %r10, %r12
829_no_extra_mask_2_\@:
830
831 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
832
833 vpand %xmm1, %xmm9, %xmm9
834
835 vmovdqa SHUF_MASK(%rip), %xmm1
836 vpshufb %xmm1, %xmm9, %xmm9
837 vpshufb %xmm2, %xmm9, %xmm9
838 vpxor %xmm9, \AAD_HASH, \AAD_HASH
839
840 cmp $0, %r10
841 jl _partial_incomplete_2_\@
842
843
844 \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
845 xor %eax,%eax
846
847 mov %rax, PBlockLen(arg2)
848 jmp _encode_done_\@
849_partial_incomplete_2_\@:
850 add \PLAIN_CYPH_LEN, PBlockLen(arg2)
851_encode_done_\@:
852 vmovdqu \AAD_HASH, AadHash(arg2)
853
854 vmovdqa SHUF_MASK(%rip), %xmm10
855
856 vpshufb %xmm10, %xmm9, %xmm9
857 vpshufb %xmm2, %xmm9, %xmm9
858.endif
859
860 cmp $0, %r10
861 jl _partial_fill_\@
862 mov %r13, %r12
863 mov $16, %r13
864
865 sub %r12, %r13
866 jmp _count_set_\@
867_partial_fill_\@:
868 mov \PLAIN_CYPH_LEN, %r13
869_count_set_\@:
870 vmovdqa %xmm9, %xmm0
871 vmovq %xmm0, %rax
872 cmp $8, %r13
873 jle _less_than_8_bytes_left_\@
874
875 mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
876 add $8, \DATA_OFFSET
877 psrldq $8, %xmm0
878 vmovq %xmm0, %rax
879 sub $8, %r13
880_less_than_8_bytes_left_\@:
881 movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
882 add $1, \DATA_OFFSET
883 shr $8, %rax
884 sub $1, %r13
885 jne _less_than_8_bytes_left_\@
886_partial_block_done_\@:
887.endm
888
889#ifdef CONFIG_AS_AVX
890
891
892
893
894
895
896
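# GHASH_MUL_AVX: GF(2^128) multiply \GH = \GH * \HK using three carry-less
# multiplies (Karatsuba), followed by the shift-and-xor reduction modulo
# the GCM polynomial x^128 + x^7 + x^2 + x + 1.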
897.macro GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5
898
899 vpshufd $0b01001110, \GH, \T2
900 vpshufd $0b01001110, \HK, \T3
901 vpxor \GH , \T2, \T2
902 vpxor \HK , \T3, \T3
903
904 vpclmulqdq $0x11, \HK, \GH, \T1
905 vpclmulqdq $0x00, \HK, \GH, \GH
906 vpclmulqdq $0x00, \T3, \T2, \T2
907 vpxor \GH, \T2,\T2
908 vpxor \T1, \T2,\T2
909
910 vpslldq $8, \T2,\T3
911 vpsrldq $8, \T2,\T2
912 vpxor \T3, \GH, \GH
913 vpxor \T2, \T1, \T1
914
915
916 vpslld $31, \GH, \T2
917 vpslld $30, \GH, \T3
918 vpslld $25, \GH, \T4
919
920 vpxor \T3, \T2, \T2
921 vpxor \T4, \T2, \T2
922
923 vpsrldq $4, \T2, \T5
924
925 vpslldq $12, \T2, \T2
926 vpxor \T2, \GH, \GH
927
928
929
930 vpsrld $1,\GH, \T2
931 vpsrld $2,\GH, \T3
932 vpsrld $7,\GH, \T4
933 vpxor \T3, \T2, \T2
934 vpxor \T4, \T2, \T2
935
936 vpxor \T5, \T2, \T2
937 vpxor \T2, \GH, \GH
938 vpxor \T1, \GH, \GH
939
940
941.endm
942
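# PRECOMPUTE_AVX: compute H^2..H^8 and, for each power, the XOR of its two
# 64-bit halves (HashKey_*_k) used as the Karatsuba middle operand.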
943.macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6
944
945
946 vmovdqa \HK, \T5
947
948 vpshufd $0b01001110, \T5, \T1
949 vpxor \T5, \T1, \T1
950 vmovdqu \T1, HashKey_k(arg2)
951
952 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2
953 vmovdqu \T5, HashKey_2(arg2)
954 vpshufd $0b01001110, \T5, \T1
955 vpxor \T5, \T1, \T1
956 vmovdqu \T1, HashKey_2_k(arg2)
957
958 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2
959 vmovdqu \T5, HashKey_3(arg2)
960 vpshufd $0b01001110, \T5, \T1
961 vpxor \T5, \T1, \T1
962 vmovdqu \T1, HashKey_3_k(arg2)
963
964 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2
965 vmovdqu \T5, HashKey_4(arg2)
966 vpshufd $0b01001110, \T5, \T1
967 vpxor \T5, \T1, \T1
968 vmovdqu \T1, HashKey_4_k(arg2)
969
970 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2
971 vmovdqu \T5, HashKey_5(arg2)
972 vpshufd $0b01001110, \T5, \T1
973 vpxor \T5, \T1, \T1
974 vmovdqu \T1, HashKey_5_k(arg2)
975
976 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2
977 vmovdqu \T5, HashKey_6(arg2)
978 vpshufd $0b01001110, \T5, \T1
979 vpxor \T5, \T1, \T1
980 vmovdqu \T1, HashKey_6_k(arg2)
981
982 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2
983 vmovdqu \T5, HashKey_7(arg2)
984 vpshufd $0b01001110, \T5, \T1
985 vpxor \T5, \T1, \T1
986 vmovdqu \T1, HashKey_7_k(arg2)
987
988 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2
989 vmovdqu \T5, HashKey_8(arg2)
990 vpshufd $0b01001110, \T5, \T1
991 vpxor \T5, \T1, \T1
992 vmovdqu \T1, HashKey_8_k(arg2)
993
994.endm
995
996
997
998
999
1000
1001
1002
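# INITIAL_BLOCKS_AVX: encrypt/decrypt and GHASH the first
# num_initial_blocks (0..7) blocks so the remaining length is a multiple
# of 128 bytes; if at least 128 bytes remain, it also pre-encrypts the
# next 8 counter blocks and parks the running hash in TMP1(%rsp) so it can
# be folded into the first block of that batch.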
1003.macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
1004 i = (8-\num_initial_blocks)
1005 setreg
1006 vmovdqu AadHash(arg2), reg_i
1007
1008
1009 vmovdqu CurCount(arg2), \CTR
1010
1011 i = (9-\num_initial_blocks)
1012 setreg
1013.rep \num_initial_blocks
1014 vpaddd ONE(%rip), \CTR, \CTR
1015 vmovdqa \CTR, reg_i
1016 vpshufb SHUF_MASK(%rip), reg_i, reg_i
1017 i = (i+1)
1018 setreg
1019.endr
1020
1021 vmovdqa (arg1), \T_key
1022 i = (9-\num_initial_blocks)
1023 setreg
1024.rep \num_initial_blocks
1025 vpxor \T_key, reg_i, reg_i
1026 i = (i+1)
1027 setreg
1028.endr
1029
1030 j = 1
1031 setreg
1032.rep \REP
1033 vmovdqa 16*j(arg1), \T_key
1034 i = (9-\num_initial_blocks)
1035 setreg
1036.rep \num_initial_blocks
1037 vaesenc \T_key, reg_i, reg_i
1038 i = (i+1)
1039 setreg
1040.endr
1041
1042 j = (j+1)
1043 setreg
1044.endr
1045
1046 vmovdqa 16*j(arg1), \T_key
1047 i = (9-\num_initial_blocks)
1048 setreg
1049.rep \num_initial_blocks
1050 vaesenclast \T_key, reg_i, reg_i
1051 i = (i+1)
1052 setreg
1053.endr
1054
1055 i = (9-\num_initial_blocks)
1056 setreg
1057.rep \num_initial_blocks
1058 vmovdqu (arg4, %r11), \T1
1059 vpxor \T1, reg_i, reg_i
1060 vmovdqu reg_i, (arg3 , %r11)
1061 add $16, %r11
1062.if \ENC_DEC == DEC
1063 vmovdqa \T1, reg_i
1064.endif
1065 vpshufb SHUF_MASK(%rip), reg_i, reg_i
1066 i = (i+1)
1067 setreg
1068.endr
1069
1070
1071 i = (8-\num_initial_blocks)
1072 j = (9-\num_initial_blocks)
1073 setreg
1074
1075.rep \num_initial_blocks
1076 vpxor reg_i, reg_j, reg_j
1077 GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6
1078 i = (i+1)
1079 j = (j+1)
1080 setreg
1081.endr
1082
1083
1084 vmovdqa \XMM8, TMP1(%rsp)
1085 vmovdqa \XMM8, \T3
1086
1087 cmp $128, %r13
1088 jl _initial_blocks_done\@
1089
1090
1091
1092 vpaddd ONE(%rip), \CTR, \CTR
1093 vmovdqa \CTR, \XMM1
1094 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1
1095
1096 vpaddd ONE(%rip), \CTR, \CTR
1097 vmovdqa \CTR, \XMM2
1098 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2
1099
1100 vpaddd ONE(%rip), \CTR, \CTR
1101 vmovdqa \CTR, \XMM3
1102 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3
1103
1104 vpaddd ONE(%rip), \CTR, \CTR
1105 vmovdqa \CTR, \XMM4
1106 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4
1107
1108 vpaddd ONE(%rip), \CTR, \CTR
1109 vmovdqa \CTR, \XMM5
1110 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5
1111
1112 vpaddd ONE(%rip), \CTR, \CTR
1113 vmovdqa \CTR, \XMM6
1114 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6
1115
1116 vpaddd ONE(%rip), \CTR, \CTR
1117 vmovdqa \CTR, \XMM7
1118 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7
1119
1120 vpaddd ONE(%rip), \CTR, \CTR
1121 vmovdqa \CTR, \XMM8
1122 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8
1123
1124 vmovdqa (arg1), \T_key
1125 vpxor \T_key, \XMM1, \XMM1
1126 vpxor \T_key, \XMM2, \XMM2
1127 vpxor \T_key, \XMM3, \XMM3
1128 vpxor \T_key, \XMM4, \XMM4
1129 vpxor \T_key, \XMM5, \XMM5
1130 vpxor \T_key, \XMM6, \XMM6
1131 vpxor \T_key, \XMM7, \XMM7
1132 vpxor \T_key, \XMM8, \XMM8
1133
1134 i = 1
1135 setreg
1136.rep \REP
1137 vmovdqa 16*i(arg1), \T_key
1138 vaesenc \T_key, \XMM1, \XMM1
1139 vaesenc \T_key, \XMM2, \XMM2
1140 vaesenc \T_key, \XMM3, \XMM3
1141 vaesenc \T_key, \XMM4, \XMM4
1142 vaesenc \T_key, \XMM5, \XMM5
1143 vaesenc \T_key, \XMM6, \XMM6
1144 vaesenc \T_key, \XMM7, \XMM7
1145 vaesenc \T_key, \XMM8, \XMM8
1146 i = (i+1)
1147 setreg
1148.endr
1149
1150 vmovdqa 16*i(arg1), \T_key
1151 vaesenclast \T_key, \XMM1, \XMM1
1152 vaesenclast \T_key, \XMM2, \XMM2
1153 vaesenclast \T_key, \XMM3, \XMM3
1154 vaesenclast \T_key, \XMM4, \XMM4
1155 vaesenclast \T_key, \XMM5, \XMM5
1156 vaesenclast \T_key, \XMM6, \XMM6
1157 vaesenclast \T_key, \XMM7, \XMM7
1158 vaesenclast \T_key, \XMM8, \XMM8
1159
1160 vmovdqu (arg4, %r11), \T1
1161 vpxor \T1, \XMM1, \XMM1
1162 vmovdqu \XMM1, (arg3 , %r11)
1163 .if \ENC_DEC == DEC
1164 vmovdqa \T1, \XMM1
1165 .endif
1166
1167 vmovdqu 16*1(arg4, %r11), \T1
1168 vpxor \T1, \XMM2, \XMM2
1169 vmovdqu \XMM2, 16*1(arg3 , %r11)
1170 .if \ENC_DEC == DEC
1171 vmovdqa \T1, \XMM2
1172 .endif
1173
1174 vmovdqu 16*2(arg4, %r11), \T1
1175 vpxor \T1, \XMM3, \XMM3
1176 vmovdqu \XMM3, 16*2(arg3 , %r11)
1177 .if \ENC_DEC == DEC
1178 vmovdqa \T1, \XMM3
1179 .endif
1180
1181 vmovdqu 16*3(arg4, %r11), \T1
1182 vpxor \T1, \XMM4, \XMM4
1183 vmovdqu \XMM4, 16*3(arg3 , %r11)
1184 .if \ENC_DEC == DEC
1185 vmovdqa \T1, \XMM4
1186 .endif
1187
1188 vmovdqu 16*4(arg4, %r11), \T1
1189 vpxor \T1, \XMM5, \XMM5
1190 vmovdqu \XMM5, 16*4(arg3 , %r11)
1191 .if \ENC_DEC == DEC
1192 vmovdqa \T1, \XMM5
1193 .endif
1194
1195 vmovdqu 16*5(arg4, %r11), \T1
1196 vpxor \T1, \XMM6, \XMM6
1197 vmovdqu \XMM6, 16*5(arg3 , %r11)
1198 .if \ENC_DEC == DEC
1199 vmovdqa \T1, \XMM6
1200 .endif
1201
1202 vmovdqu 16*6(arg4, %r11), \T1
1203 vpxor \T1, \XMM7, \XMM7
1204 vmovdqu \XMM7, 16*6(arg3 , %r11)
1205 .if \ENC_DEC == DEC
1206 vmovdqa \T1, \XMM7
1207 .endif
1208
1209 vmovdqu 16*7(arg4, %r11), \T1
1210 vpxor \T1, \XMM8, \XMM8
1211 vmovdqu \XMM8, 16*7(arg3 , %r11)
1212 .if \ENC_DEC == DEC
1213 vmovdqa \T1, \XMM8
1214 .endif
1215
1216 add $128, %r11
1217
1218 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1
1219 vpxor TMP1(%rsp), \XMM1, \XMM1
1220 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2
1221 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3
1222 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4
1223 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5
1224 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6
1225 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7
1226 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8
1227
1228
1229
1230_initial_blocks_done\@:
1231
1232.endm
1233
1234
1235
1236
1237
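# GHASH_8_ENCRYPT_8_PARALLEL_AVX: main loop body.  Encrypts 8 counter
# blocks while GHASHing the 8 ciphertext blocks produced by the previous
# iteration, interleaving AES rounds with the carry-less multiplies to
# hide their latencies.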
1238.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
1239
1240 vmovdqa \XMM1, \T2
1241 vmovdqa \XMM2, TMP2(%rsp)
1242 vmovdqa \XMM3, TMP3(%rsp)
1243 vmovdqa \XMM4, TMP4(%rsp)
1244 vmovdqa \XMM5, TMP5(%rsp)
1245 vmovdqa \XMM6, TMP6(%rsp)
1246 vmovdqa \XMM7, TMP7(%rsp)
1247 vmovdqa \XMM8, TMP8(%rsp)
1248
1249.if \loop_idx == in_order
1250 vpaddd ONE(%rip), \CTR, \XMM1
1251 vpaddd ONE(%rip), \XMM1, \XMM2
1252 vpaddd ONE(%rip), \XMM2, \XMM3
1253 vpaddd ONE(%rip), \XMM3, \XMM4
1254 vpaddd ONE(%rip), \XMM4, \XMM5
1255 vpaddd ONE(%rip), \XMM5, \XMM6
1256 vpaddd ONE(%rip), \XMM6, \XMM7
1257 vpaddd ONE(%rip), \XMM7, \XMM8
1258 vmovdqa \XMM8, \CTR
1259
1260 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1
1261 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2
1262 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3
1263 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4
1264 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5
1265 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6
1266 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7
1267 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8
1268.else
1269 vpaddd ONEf(%rip), \CTR, \XMM1
1270 vpaddd ONEf(%rip), \XMM1, \XMM2
1271 vpaddd ONEf(%rip), \XMM2, \XMM3
1272 vpaddd ONEf(%rip), \XMM3, \XMM4
1273 vpaddd ONEf(%rip), \XMM4, \XMM5
1274 vpaddd ONEf(%rip), \XMM5, \XMM6
1275 vpaddd ONEf(%rip), \XMM6, \XMM7
1276 vpaddd ONEf(%rip), \XMM7, \XMM8
1277 vmovdqa \XMM8, \CTR
1278.endif
1279
1280
1281
1282
1283 vmovdqu (arg1), \T1
1284 vpxor \T1, \XMM1, \XMM1
1285 vpxor \T1, \XMM2, \XMM2
1286 vpxor \T1, \XMM3, \XMM3
1287 vpxor \T1, \XMM4, \XMM4
1288 vpxor \T1, \XMM5, \XMM5
1289 vpxor \T1, \XMM6, \XMM6
1290 vpxor \T1, \XMM7, \XMM7
1291 vpxor \T1, \XMM8, \XMM8
1292
1293
1294
1295
1296
1297
1298
1299 vmovdqu 16*1(arg1), \T1
1300 vaesenc \T1, \XMM1, \XMM1
1301 vaesenc \T1, \XMM2, \XMM2
1302 vaesenc \T1, \XMM3, \XMM3
1303 vaesenc \T1, \XMM4, \XMM4
1304 vaesenc \T1, \XMM5, \XMM5
1305 vaesenc \T1, \XMM6, \XMM6
1306 vaesenc \T1, \XMM7, \XMM7
1307 vaesenc \T1, \XMM8, \XMM8
1308
1309 vmovdqu 16*2(arg1), \T1
1310 vaesenc \T1, \XMM1, \XMM1
1311 vaesenc \T1, \XMM2, \XMM2
1312 vaesenc \T1, \XMM3, \XMM3
1313 vaesenc \T1, \XMM4, \XMM4
1314 vaesenc \T1, \XMM5, \XMM5
1315 vaesenc \T1, \XMM6, \XMM6
1316 vaesenc \T1, \XMM7, \XMM7
1317 vaesenc \T1, \XMM8, \XMM8
1318
1319
1320
1321
1322 vmovdqu HashKey_8(arg2), \T5
1323 vpclmulqdq $0x11, \T5, \T2, \T4
1324 vpclmulqdq $0x00, \T5, \T2, \T7
1325
1326 vpshufd $0b01001110, \T2, \T6
1327 vpxor \T2, \T6, \T6
1328
1329 vmovdqu HashKey_8_k(arg2), \T5
1330 vpclmulqdq $0x00, \T5, \T6, \T6
1331
1332 vmovdqu 16*3(arg1), \T1
1333 vaesenc \T1, \XMM1, \XMM1
1334 vaesenc \T1, \XMM2, \XMM2
1335 vaesenc \T1, \XMM3, \XMM3
1336 vaesenc \T1, \XMM4, \XMM4
1337 vaesenc \T1, \XMM5, \XMM5
1338 vaesenc \T1, \XMM6, \XMM6
1339 vaesenc \T1, \XMM7, \XMM7
1340 vaesenc \T1, \XMM8, \XMM8
1341
1342 vmovdqa TMP2(%rsp), \T1
1343 vmovdqu HashKey_7(arg2), \T5
1344 vpclmulqdq $0x11, \T5, \T1, \T3
1345 vpxor \T3, \T4, \T4
1346 vpclmulqdq $0x00, \T5, \T1, \T3
1347 vpxor \T3, \T7, \T7
1348
1349 vpshufd $0b01001110, \T1, \T3
1350 vpxor \T1, \T3, \T3
1351 vmovdqu HashKey_7_k(arg2), \T5
1352 vpclmulqdq $0x10, \T5, \T3, \T3
1353 vpxor \T3, \T6, \T6
1354
1355 vmovdqu 16*4(arg1), \T1
1356 vaesenc \T1, \XMM1, \XMM1
1357 vaesenc \T1, \XMM2, \XMM2
1358 vaesenc \T1, \XMM3, \XMM3
1359 vaesenc \T1, \XMM4, \XMM4
1360 vaesenc \T1, \XMM5, \XMM5
1361 vaesenc \T1, \XMM6, \XMM6
1362 vaesenc \T1, \XMM7, \XMM7
1363 vaesenc \T1, \XMM8, \XMM8
1364
1365
1366
1367 vmovdqa TMP3(%rsp), \T1
1368 vmovdqu HashKey_6(arg2), \T5
1369 vpclmulqdq $0x11, \T5, \T1, \T3
1370 vpxor \T3, \T4, \T4
1371 vpclmulqdq $0x00, \T5, \T1, \T3
1372 vpxor \T3, \T7, \T7
1373
1374 vpshufd $0b01001110, \T1, \T3
1375 vpxor \T1, \T3, \T3
1376 vmovdqu HashKey_6_k(arg2), \T5
1377 vpclmulqdq $0x10, \T5, \T3, \T3
1378 vpxor \T3, \T6, \T6
1379
1380 vmovdqu 16*5(arg1), \T1
1381 vaesenc \T1, \XMM1, \XMM1
1382 vaesenc \T1, \XMM2, \XMM2
1383 vaesenc \T1, \XMM3, \XMM3
1384 vaesenc \T1, \XMM4, \XMM4
1385 vaesenc \T1, \XMM5, \XMM5
1386 vaesenc \T1, \XMM6, \XMM6
1387 vaesenc \T1, \XMM7, \XMM7
1388 vaesenc \T1, \XMM8, \XMM8
1389
1390 vmovdqa TMP4(%rsp), \T1
1391 vmovdqu HashKey_5(arg2), \T5
1392 vpclmulqdq $0x11, \T5, \T1, \T3
1393 vpxor \T3, \T4, \T4
1394 vpclmulqdq $0x00, \T5, \T1, \T3
1395 vpxor \T3, \T7, \T7
1396
1397 vpshufd $0b01001110, \T1, \T3
1398 vpxor \T1, \T3, \T3
1399 vmovdqu HashKey_5_k(arg2), \T5
1400 vpclmulqdq $0x10, \T5, \T3, \T3
1401 vpxor \T3, \T6, \T6
1402
1403 vmovdqu 16*6(arg1), \T1
1404 vaesenc \T1, \XMM1, \XMM1
1405 vaesenc \T1, \XMM2, \XMM2
1406 vaesenc \T1, \XMM3, \XMM3
1407 vaesenc \T1, \XMM4, \XMM4
1408 vaesenc \T1, \XMM5, \XMM5
1409 vaesenc \T1, \XMM6, \XMM6
1410 vaesenc \T1, \XMM7, \XMM7
1411 vaesenc \T1, \XMM8, \XMM8
1412
1413
1414 vmovdqa TMP5(%rsp), \T1
1415 vmovdqu HashKey_4(arg2), \T5
1416 vpclmulqdq $0x11, \T5, \T1, \T3
1417 vpxor \T3, \T4, \T4
1418 vpclmulqdq $0x00, \T5, \T1, \T3
1419 vpxor \T3, \T7, \T7
1420
1421 vpshufd $0b01001110, \T1, \T3
1422 vpxor \T1, \T3, \T3
1423 vmovdqu HashKey_4_k(arg2), \T5
1424 vpclmulqdq $0x10, \T5, \T3, \T3
1425 vpxor \T3, \T6, \T6
1426
1427 vmovdqu 16*7(arg1), \T1
1428 vaesenc \T1, \XMM1, \XMM1
1429 vaesenc \T1, \XMM2, \XMM2
1430 vaesenc \T1, \XMM3, \XMM3
1431 vaesenc \T1, \XMM4, \XMM4
1432 vaesenc \T1, \XMM5, \XMM5
1433 vaesenc \T1, \XMM6, \XMM6
1434 vaesenc \T1, \XMM7, \XMM7
1435 vaesenc \T1, \XMM8, \XMM8
1436
1437 vmovdqa TMP6(%rsp), \T1
1438 vmovdqu HashKey_3(arg2), \T5
1439 vpclmulqdq $0x11, \T5, \T1, \T3
1440 vpxor \T3, \T4, \T4
1441 vpclmulqdq $0x00, \T5, \T1, \T3
1442 vpxor \T3, \T7, \T7
1443
1444 vpshufd $0b01001110, \T1, \T3
1445 vpxor \T1, \T3, \T3
1446 vmovdqu HashKey_3_k(arg2), \T5
1447 vpclmulqdq $0x10, \T5, \T3, \T3
1448 vpxor \T3, \T6, \T6
1449
1450
1451 vmovdqu 16*8(arg1), \T1
1452 vaesenc \T1, \XMM1, \XMM1
1453 vaesenc \T1, \XMM2, \XMM2
1454 vaesenc \T1, \XMM3, \XMM3
1455 vaesenc \T1, \XMM4, \XMM4
1456 vaesenc \T1, \XMM5, \XMM5
1457 vaesenc \T1, \XMM6, \XMM6
1458 vaesenc \T1, \XMM7, \XMM7
1459 vaesenc \T1, \XMM8, \XMM8
1460
1461 vmovdqa TMP7(%rsp), \T1
1462 vmovdqu HashKey_2(arg2), \T5
1463 vpclmulqdq $0x11, \T5, \T1, \T3
1464 vpxor \T3, \T4, \T4
1465 vpclmulqdq $0x00, \T5, \T1, \T3
1466 vpxor \T3, \T7, \T7
1467
1468 vpshufd $0b01001110, \T1, \T3
1469 vpxor \T1, \T3, \T3
1470 vmovdqu HashKey_2_k(arg2), \T5
1471 vpclmulqdq $0x10, \T5, \T3, \T3
1472 vpxor \T3, \T6, \T6
1473
1474
1475
1476 vmovdqu 16*9(arg1), \T5
1477 vaesenc \T5, \XMM1, \XMM1
1478 vaesenc \T5, \XMM2, \XMM2
1479 vaesenc \T5, \XMM3, \XMM3
1480 vaesenc \T5, \XMM4, \XMM4
1481 vaesenc \T5, \XMM5, \XMM5
1482 vaesenc \T5, \XMM6, \XMM6
1483 vaesenc \T5, \XMM7, \XMM7
1484 vaesenc \T5, \XMM8, \XMM8
1485
1486 vmovdqa TMP8(%rsp), \T1
1487 vmovdqu HashKey(arg2), \T5
1488 vpclmulqdq $0x11, \T5, \T1, \T3
1489 vpxor \T3, \T4, \T4
1490 vpclmulqdq $0x00, \T5, \T1, \T3
1491 vpxor \T3, \T7, \T7
1492
1493 vpshufd $0b01001110, \T1, \T3
1494 vpxor \T1, \T3, \T3
1495 vmovdqu HashKey_k(arg2), \T5
1496 vpclmulqdq $0x10, \T5, \T3, \T3
1497 vpxor \T3, \T6, \T6
1498
1499 vpxor \T4, \T6, \T6
1500 vpxor \T7, \T6, \T6
1501
1502 vmovdqu 16*10(arg1), \T5
1503
1504 i = 11
1505 setreg
1506.rep (\REP-9)
1507
1508 vaesenc \T5, \XMM1, \XMM1
1509 vaesenc \T5, \XMM2, \XMM2
1510 vaesenc \T5, \XMM3, \XMM3
1511 vaesenc \T5, \XMM4, \XMM4
1512 vaesenc \T5, \XMM5, \XMM5
1513 vaesenc \T5, \XMM6, \XMM6
1514 vaesenc \T5, \XMM7, \XMM7
1515 vaesenc \T5, \XMM8, \XMM8
1516
1517 vmovdqu 16*i(arg1), \T5
1518 i = i + 1
1519 setreg
1520.endr
1521
1522 i = 0
1523 j = 1
1524 setreg
1525.rep 8
1526 vpxor 16*i(arg4, %r11), \T5, \T2
1527 .if \ENC_DEC == ENC
1528 vaesenclast \T2, reg_j, reg_j
1529 .else
1530 vaesenclast \T2, reg_j, \T3
1531 vmovdqu 16*i(arg4, %r11), reg_j
1532 vmovdqu \T3, 16*i(arg3, %r11)
1533 .endif
1534 i = (i+1)
1535 j = (j+1)
1536 setreg
1537.endr
1538
1539
1540
1541 vpslldq $8, \T6, \T3
1542 vpsrldq $8, \T6, \T6
1543 vpxor \T3, \T7, \T7
1544 vpxor \T4, \T6, \T6
1545
1546
1547
1548
1549
1550
1551 vpslld $31, \T7, \T2
1552 vpslld $30, \T7, \T3
1553 vpslld $25, \T7, \T4
1554
1555 vpxor \T3, \T2, \T2
1556 vpxor \T4, \T2, \T2
1557
1558 vpsrldq $4, \T2, \T1
1559
1560 vpslldq $12, \T2, \T2
1561 vpxor \T2, \T7, \T7
1562
1563 .if \ENC_DEC == ENC
1564 vmovdqu \XMM1, 16*0(arg3,%r11)
1565 vmovdqu \XMM2, 16*1(arg3,%r11)
1566 vmovdqu \XMM3, 16*2(arg3,%r11)
1567 vmovdqu \XMM4, 16*3(arg3,%r11)
1568 vmovdqu \XMM5, 16*4(arg3,%r11)
1569 vmovdqu \XMM6, 16*5(arg3,%r11)
1570 vmovdqu \XMM7, 16*6(arg3,%r11)
1571 vmovdqu \XMM8, 16*7(arg3,%r11)
1572 .endif
1573
1574
1575
1576 vpsrld $1, \T7, \T2
1577 vpsrld $2, \T7, \T3
1578 vpsrld $7, \T7, \T4
1579 vpxor \T3, \T2, \T2
1580 vpxor \T4, \T2, \T2
1581
1582 vpxor \T1, \T2, \T2
1583 vpxor \T2, \T7, \T7
1584 vpxor \T7, \T6, \T6
1585
1586
1587 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1
1588 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2
1589 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3
1590 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4
1591 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5
1592 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6
1593 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7
1594 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8
1595
1596
1597 vpxor \T6, \XMM1, \XMM1
1598
1599
1600
1601.endm
1602
1603
1604
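# GHASH_LAST_8_AVX: GHASH the final 8 ciphertext blocks and perform the
# reduction, leaving the accumulated hash in \T6.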
1605.macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
1606
1607
1608
1609
1610 vpshufd $0b01001110, \XMM1, \T2
1611 vpxor \XMM1, \T2, \T2
1612 vmovdqu HashKey_8(arg2), \T5
1613 vpclmulqdq $0x11, \T5, \XMM1, \T6
1614 vpclmulqdq $0x00, \T5, \XMM1, \T7
1615
1616 vmovdqu HashKey_8_k(arg2), \T3
1617 vpclmulqdq $0x00, \T3, \T2, \XMM1
1618
1619
1620
1621 vpshufd $0b01001110, \XMM2, \T2
1622 vpxor \XMM2, \T2, \T2
1623 vmovdqu HashKey_7(arg2), \T5
1624 vpclmulqdq $0x11, \T5, \XMM2, \T4
1625 vpxor \T4, \T6, \T6
1626
1627 vpclmulqdq $0x00, \T5, \XMM2, \T4
1628 vpxor \T4, \T7, \T7
1629
1630 vmovdqu HashKey_7_k(arg2), \T3
1631 vpclmulqdq $0x00, \T3, \T2, \T2
1632 vpxor \T2, \XMM1, \XMM1
1633
1634
1635
1636 vpshufd $0b01001110, \XMM3, \T2
1637 vpxor \XMM3, \T2, \T2
1638 vmovdqu HashKey_6(arg2), \T5
1639 vpclmulqdq $0x11, \T5, \XMM3, \T4
1640 vpxor \T4, \T6, \T6
1641
1642 vpclmulqdq $0x00, \T5, \XMM3, \T4
1643 vpxor \T4, \T7, \T7
1644
1645 vmovdqu HashKey_6_k(arg2), \T3
1646 vpclmulqdq $0x00, \T3, \T2, \T2
1647 vpxor \T2, \XMM1, \XMM1
1648
1649
1650
1651 vpshufd $0b01001110, \XMM4, \T2
1652 vpxor \XMM4, \T2, \T2
1653 vmovdqu HashKey_5(arg2), \T5
1654 vpclmulqdq $0x11, \T5, \XMM4, \T4
1655 vpxor \T4, \T6, \T6
1656
1657 vpclmulqdq $0x00, \T5, \XMM4, \T4
1658 vpxor \T4, \T7, \T7
1659
1660 vmovdqu HashKey_5_k(arg2), \T3
1661 vpclmulqdq $0x00, \T3, \T2, \T2
1662 vpxor \T2, \XMM1, \XMM1
1663
1664
1665
1666 vpshufd $0b01001110, \XMM5, \T2
1667 vpxor \XMM5, \T2, \T2
1668 vmovdqu HashKey_4(arg2), \T5
1669 vpclmulqdq $0x11, \T5, \XMM5, \T4
1670 vpxor \T4, \T6, \T6
1671
1672 vpclmulqdq $0x00, \T5, \XMM5, \T4
1673 vpxor \T4, \T7, \T7
1674
1675 vmovdqu HashKey_4_k(arg2), \T3
1676 vpclmulqdq $0x00, \T3, \T2, \T2
1677 vpxor \T2, \XMM1, \XMM1
1678
1679
1680
1681 vpshufd $0b01001110, \XMM6, \T2
1682 vpxor \XMM6, \T2, \T2
1683 vmovdqu HashKey_3(arg2), \T5
1684 vpclmulqdq $0x11, \T5, \XMM6, \T4
1685 vpxor \T4, \T6, \T6
1686
1687 vpclmulqdq $0x00, \T5, \XMM6, \T4
1688 vpxor \T4, \T7, \T7
1689
1690 vmovdqu HashKey_3_k(arg2), \T3
1691 vpclmulqdq $0x00, \T3, \T2, \T2
1692 vpxor \T2, \XMM1, \XMM1
1693
1694
1695
1696 vpshufd $0b01001110, \XMM7, \T2
1697 vpxor \XMM7, \T2, \T2
1698 vmovdqu HashKey_2(arg2), \T5
1699 vpclmulqdq $0x11, \T5, \XMM7, \T4
1700 vpxor \T4, \T6, \T6
1701
1702 vpclmulqdq $0x00, \T5, \XMM7, \T4
1703 vpxor \T4, \T7, \T7
1704
1705 vmovdqu HashKey_2_k(arg2), \T3
1706 vpclmulqdq $0x00, \T3, \T2, \T2
1707 vpxor \T2, \XMM1, \XMM1
1708
1709
1710
1711 vpshufd $0b01001110, \XMM8, \T2
1712 vpxor \XMM8, \T2, \T2
1713 vmovdqu HashKey(arg2), \T5
1714 vpclmulqdq $0x11, \T5, \XMM8, \T4
1715 vpxor \T4, \T6, \T6
1716
1717 vpclmulqdq $0x00, \T5, \XMM8, \T4
1718 vpxor \T4, \T7, \T7
1719
1720 vmovdqu HashKey_k(arg2), \T3
1721 vpclmulqdq $0x00, \T3, \T2, \T2
1722
1723 vpxor \T2, \XMM1, \XMM1
1724 vpxor \T6, \XMM1, \XMM1
1725 vpxor \T7, \XMM1, \T2
1726
1727
1728
1729
1730 vpslldq $8, \T2, \T4
1731 vpsrldq $8, \T2, \T2
1732
1733 vpxor \T4, \T7, \T7
1734 vpxor \T2, \T6, \T6
1735
1736
1737
1738
1739 vpslld $31, \T7, \T2
1740 vpslld $30, \T7, \T3
1741 vpslld $25, \T7, \T4
1742
1743 vpxor \T3, \T2, \T2
1744 vpxor \T4, \T2, \T2
1745
1746 vpsrldq $4, \T2, \T1
1747
1748 vpslldq $12, \T2, \T2
1749 vpxor \T2, \T7, \T7
1750
1751
1752
1753
1754 vpsrld $1, \T7, \T2
1755 vpsrld $2, \T7, \T3
1756 vpsrld $7, \T7, \T4
1757 vpxor \T3, \T2, \T2
1758 vpxor \T4, \T2, \T2
1759
1760 vpxor \T1, \T2, \T2
1761 vpxor \T2, \T7, \T7
1762 vpxor \T7, \T6, \T6
1763
1764.endm
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
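# aesni_gcm_init_avx_gen2 - argument usage (as consumed by INIT):
#   arg1 = AES key schedule, arg2 = gcm_context_data,
#   arg3 = 16-byte pre-counter block, arg4 = hash subkey,
#   arg5 = AAD pointer, arg6 = AAD length in bytes.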
1778ENTRY(aesni_gcm_init_avx_gen2)
1779 FUNC_SAVE
1780 INIT GHASH_MUL_AVX, PRECOMPUTE_AVX
1781 FUNC_RESTORE
1782 ret
1783ENDPROC(aesni_gcm_init_avx_gen2)
1784
1785
1786
1787
1788
1789
1790
1791
1792
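# aesni_gcm_enc_update_avx_gen2 - encrypt a chunk of plaintext:
#   arg1 = key schedule, arg2 = context data, arg3 = ciphertext out,
#   arg4 = plaintext in, arg5 = length in bytes (as used by GCM_ENC_DEC).
# The AES round count is selected from the key length stored in the key
# schedule: 16/24/32 bytes -> 9/11/13 middle rounds.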
1793ENTRY(aesni_gcm_enc_update_avx_gen2)
1794 FUNC_SAVE
1795 mov keysize, %eax
1796 cmp $32, %eax
1797 je key_256_enc_update
1798 cmp $16, %eax
1799 je key_128_enc_update
1800
1801 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11
1802 FUNC_RESTORE
1803 ret
1804key_128_enc_update:
1805 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9
1806 FUNC_RESTORE
1807 ret
1808key_256_enc_update:
1809 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13
1810 FUNC_RESTORE
1811 ret
1812ENDPROC(aesni_gcm_enc_update_avx_gen2)
1813
1814
1815
1816
1817
1818
1819
1820
1821
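# aesni_gcm_dec_update_avx_gen2 - same as the encrypt entry point but runs
# GCM_ENC_DEC with ENC_DEC = DEC, so GHASH is computed over the incoming
# ciphertext (arg4) while the plaintext is written to arg3.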
1822ENTRY(aesni_gcm_dec_update_avx_gen2)
1823 FUNC_SAVE
1824 mov keysize,%eax
1825 cmp $32, %eax
1826 je key_256_dec_update
1827 cmp $16, %eax
1828 je key_128_dec_update
1829
1830 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 11
1831 FUNC_RESTORE
1832 ret
1833key_128_dec_update:
1834 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9
1835 FUNC_RESTORE
1836 ret
1837key_256_dec_update:
1838 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13
1839 FUNC_RESTORE
1840 ret
1841ENDPROC(aesni_gcm_dec_update_avx_gen2)
1842
1843
1844
1845
1846
1847
1848
1849
1850
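# aesni_gcm_finalize_avx_gen2 - complete any pending partial-block GHASH
# and emit the authentication tag; arg3 = tag buffer, arg4 = tag length
# (as consumed by GCM_COMPLETE).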
1851ENTRY(aesni_gcm_finalize_avx_gen2)
1852 FUNC_SAVE
1853 mov keysize,%eax
1854 cmp $32, %eax
1855 je key_256_finalize
1856 cmp $16, %eax
1857 je key_128_finalize
1858
1859 GCM_COMPLETE GHASH_MUL_AVX, 11, arg3, arg4
1860 FUNC_RESTORE
1861 ret
1862key_128_finalize:
1863 GCM_COMPLETE GHASH_MUL_AVX, 9, arg3, arg4
1864 FUNC_RESTORE
1865 ret
1866key_256_finalize:
1867 GCM_COMPLETE GHASH_MUL_AVX, 13, arg3, arg4
1868 FUNC_RESTORE
1869 ret
1870ENDPROC(aesni_gcm_finalize_avx_gen2)
1871
1872#endif
1873
1874#ifdef CONFIG_AS_AVX2
1875
1876
1877
1878
1879
1880
1881
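# GHASH_MUL_AVX2: same GF(2^128) multiply as GHASH_MUL_AVX, but using four
# vpclmulqdq products per multiply and a reduction built from two further
# carry-less multiplies against POLY2 instead of the shift/xor sequence.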
1882.macro GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5
1883
1884 vpclmulqdq $0x11,\HK,\GH,\T1
1885 vpclmulqdq $0x00,\HK,\GH,\T2
1886 vpclmulqdq $0x01,\HK,\GH,\T3
1887 vpclmulqdq $0x10,\HK,\GH,\GH
1888 vpxor \T3, \GH, \GH
1889
1890
1891 vpsrldq $8 , \GH, \T3
1892 vpslldq $8 , \GH, \GH
1893
1894 vpxor \T3, \T1, \T1
1895 vpxor \T2, \GH, \GH
1896
1897
1898
1899 vmovdqa POLY2(%rip), \T3
1900
1901 vpclmulqdq $0x01, \GH, \T3, \T2
1902 vpslldq $8, \T2, \T2
1903
1904 vpxor \T2, \GH, \GH
1905
1906
1907 vpclmulqdq $0x00, \GH, \T3, \T2
1908 vpsrldq $4, \T2, \T2
1909
1910 vpclmulqdq $0x10, \GH, \T3, \GH
1911 vpslldq $4, \GH, \GH
1912
1913 vpxor \T2, \GH, \GH
1914
1915 vpxor \T1, \GH, \GH
1916
1917
1918.endm
1919
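# PRECOMPUTE_AVX2: compute H^2..H^8 only; the AVX2 multiply does not use
# the Karatsuba HashKey_*_k values.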
1920.macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6
1921
1922
1923 vmovdqa \HK, \T5
1924 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2
1925 vmovdqu \T5, HashKey_2(arg2)
1926
1927 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2
1928 vmovdqu \T5, HashKey_3(arg2)
1929
1930 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2
1931 vmovdqu \T5, HashKey_4(arg2)
1932
1933 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2
1934 vmovdqu \T5, HashKey_5(arg2)
1935
1936 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2
1937 vmovdqu \T5, HashKey_6(arg2)
1938
1939 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2
1940 vmovdqu \T5, HashKey_7(arg2)
1941
1942 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2
1943 vmovdqu \T5, HashKey_8(arg2)
1944
1945.endm
1946
1947
1948
1949
1950
1951
1952
1953
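# INITIAL_BLOCKS_AVX2: AVX2/gen4 counterpart of INITIAL_BLOCKS_AVX; the
# per-block hashing uses GHASH_MUL_AVX2.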
1954.macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
1955 i = (8-\num_initial_blocks)
1956 setreg
1957 vmovdqu AadHash(arg2), reg_i
1958
1959
1960 vmovdqu CurCount(arg2), \CTR
1961
1962 i = (9-\num_initial_blocks)
1963 setreg
1964.rep \num_initial_blocks
1965 vpaddd ONE(%rip), \CTR, \CTR
1966 vmovdqa \CTR, reg_i
1967 vpshufb SHUF_MASK(%rip), reg_i, reg_i
1968 i = (i+1)
1969 setreg
1970.endr
1971
1972 vmovdqa (arg1), \T_key
1973 i = (9-\num_initial_blocks)
1974 setreg
1975.rep \num_initial_blocks
1976 vpxor \T_key, reg_i, reg_i
1977 i = (i+1)
1978 setreg
1979.endr
1980
1981 j = 1
1982 setreg
1983.rep \REP
1984 vmovdqa 16*j(arg1), \T_key
1985 i = (9-\num_initial_blocks)
1986 setreg
1987.rep \num_initial_blocks
1988 vaesenc \T_key, reg_i, reg_i
1989 i = (i+1)
1990 setreg
1991.endr
1992
1993 j = (j+1)
1994 setreg
1995.endr
1996
1997
1998 vmovdqa 16*j(arg1), \T_key
1999 i = (9-\num_initial_blocks)
2000 setreg
2001.rep \num_initial_blocks
2002 vaesenclast \T_key, reg_i, reg_i
2003 i = (i+1)
2004 setreg
2005.endr
2006
2007 i = (9-\num_initial_blocks)
2008 setreg
2009.rep \num_initial_blocks
2010 vmovdqu (arg4, %r11), \T1
2011 vpxor \T1, reg_i, reg_i
2012 vmovdqu reg_i, (arg3 , %r11)
2013
2014 add $16, %r11
2015.if \ENC_DEC == DEC
2016 vmovdqa \T1, reg_i
2017.endif
2018 vpshufb SHUF_MASK(%rip), reg_i, reg_i
2019 i = (i+1)
2020 setreg
2021.endr
2022
2023
2024 i = (8-\num_initial_blocks)
2025 j = (9-\num_initial_blocks)
2026 setreg
2027
2028.rep \num_initial_blocks
2029 vpxor reg_i, reg_j, reg_j
2030 GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6
2031 i = (i+1)
2032 j = (j+1)
2033 setreg
2034.endr
2035
2036
2037 vmovdqa \XMM8, TMP1(%rsp)
2038 vmovdqa \XMM8, \T3
2039
2040 cmp $128, %r13
2041 jl _initial_blocks_done\@
2042
2043
2044
2045 vpaddd ONE(%rip), \CTR, \CTR
2046 vmovdqa \CTR, \XMM1
2047 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1
2048
2049 vpaddd ONE(%rip), \CTR, \CTR
2050 vmovdqa \CTR, \XMM2
2051 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2
2052
2053 vpaddd ONE(%rip), \CTR, \CTR
2054 vmovdqa \CTR, \XMM3
2055 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3
2056
2057 vpaddd ONE(%rip), \CTR, \CTR
2058 vmovdqa \CTR, \XMM4
2059 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4
2060
2061 vpaddd ONE(%rip), \CTR, \CTR
2062 vmovdqa \CTR, \XMM5
2063 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5
2064
2065 vpaddd ONE(%rip), \CTR, \CTR
2066 vmovdqa \CTR, \XMM6
2067 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6
2068
2069 vpaddd ONE(%rip), \CTR, \CTR
2070 vmovdqa \CTR, \XMM7
2071 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7
2072
2073 vpaddd ONE(%rip), \CTR, \CTR
2074 vmovdqa \CTR, \XMM8
2075 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8
2076
2077 vmovdqa (arg1), \T_key
2078 vpxor \T_key, \XMM1, \XMM1
2079 vpxor \T_key, \XMM2, \XMM2
2080 vpxor \T_key, \XMM3, \XMM3
2081 vpxor \T_key, \XMM4, \XMM4
2082 vpxor \T_key, \XMM5, \XMM5
2083 vpxor \T_key, \XMM6, \XMM6
2084 vpxor \T_key, \XMM7, \XMM7
2085 vpxor \T_key, \XMM8, \XMM8
2086
2087 i = 1
2088 setreg
2089.rep \REP
2090 vmovdqa 16*i(arg1), \T_key
2091 vaesenc \T_key, \XMM1, \XMM1
2092 vaesenc \T_key, \XMM2, \XMM2
2093 vaesenc \T_key, \XMM3, \XMM3
2094 vaesenc \T_key, \XMM4, \XMM4
2095 vaesenc \T_key, \XMM5, \XMM5
2096 vaesenc \T_key, \XMM6, \XMM6
2097 vaesenc \T_key, \XMM7, \XMM7
2098 vaesenc \T_key, \XMM8, \XMM8
2099 i = (i+1)
2100 setreg
2101.endr
2102
2103
2104 vmovdqa 16*i(arg1), \T_key
2105 vaesenclast \T_key, \XMM1, \XMM1
2106 vaesenclast \T_key, \XMM2, \XMM2
2107 vaesenclast \T_key, \XMM3, \XMM3
2108 vaesenclast \T_key, \XMM4, \XMM4
2109 vaesenclast \T_key, \XMM5, \XMM5
2110 vaesenclast \T_key, \XMM6, \XMM6
2111 vaesenclast \T_key, \XMM7, \XMM7
2112 vaesenclast \T_key, \XMM8, \XMM8
2113
2114 vmovdqu (arg4, %r11), \T1
2115 vpxor \T1, \XMM1, \XMM1
2116 vmovdqu \XMM1, (arg3 , %r11)
2117 .if \ENC_DEC == DEC
2118 vmovdqa \T1, \XMM1
2119 .endif
2120
2121 vmovdqu 16*1(arg4, %r11), \T1
2122 vpxor \T1, \XMM2, \XMM2
2123 vmovdqu \XMM2, 16*1(arg3 , %r11)
2124 .if \ENC_DEC == DEC
2125 vmovdqa \T1, \XMM2
2126 .endif
2127
2128 vmovdqu 16*2(arg4, %r11), \T1
2129 vpxor \T1, \XMM3, \XMM3
2130 vmovdqu \XMM3, 16*2(arg3 , %r11)
2131 .if \ENC_DEC == DEC
2132 vmovdqa \T1, \XMM3
2133 .endif
2134
2135 vmovdqu 16*3(arg4, %r11), \T1
2136 vpxor \T1, \XMM4, \XMM4
2137 vmovdqu \XMM4, 16*3(arg3 , %r11)
2138 .if \ENC_DEC == DEC
2139 vmovdqa \T1, \XMM4
2140 .endif
2141
2142 vmovdqu 16*4(arg4, %r11), \T1
2143 vpxor \T1, \XMM5, \XMM5
2144 vmovdqu \XMM5, 16*4(arg3 , %r11)
2145 .if \ENC_DEC == DEC
2146 vmovdqa \T1, \XMM5
2147 .endif
2148
2149 vmovdqu 16*5(arg4, %r11), \T1
2150 vpxor \T1, \XMM6, \XMM6
2151 vmovdqu \XMM6, 16*5(arg3 , %r11)
2152 .if \ENC_DEC == DEC
2153 vmovdqa \T1, \XMM6
2154 .endif
2155
2156 vmovdqu 16*6(arg4, %r11), \T1
2157 vpxor \T1, \XMM7, \XMM7
2158 vmovdqu \XMM7, 16*6(arg3 , %r11)
2159 .if \ENC_DEC == DEC
2160 vmovdqa \T1, \XMM7
2161 .endif
2162
2163 vmovdqu 16*7(arg4, %r11), \T1
2164 vpxor \T1, \XMM8, \XMM8
2165 vmovdqu \XMM8, 16*7(arg3 , %r11)
2166 .if \ENC_DEC == DEC
2167 vmovdqa \T1, \XMM8
2168 .endif
2169
2170 add $128, %r11
2171
2172 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1
2173 vpxor TMP1(%rsp), \XMM1, \XMM1
2174
2175 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2
2176 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3
2177 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4
2178 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5
2179 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6
2180 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7
2181 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8
2182
2183
2184
2185_initial_blocks_done\@:
2186
2187
2188.endm
2189
2190
2191
2192
2193
2194
2195
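# GHASH_8_ENCRYPT_8_PARALLEL_AVX2: 8-way interleaved AES-CTR + GHASH loop
# body using the four-product multiply and the POLY2-based reduction.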
2196.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
2197
2198 vmovdqa \XMM1, \T2
2199 vmovdqa \XMM2, TMP2(%rsp)
2200 vmovdqa \XMM3, TMP3(%rsp)
2201 vmovdqa \XMM4, TMP4(%rsp)
2202 vmovdqa \XMM5, TMP5(%rsp)
2203 vmovdqa \XMM6, TMP6(%rsp)
2204 vmovdqa \XMM7, TMP7(%rsp)
2205 vmovdqa \XMM8, TMP8(%rsp)
2206
2207.if \loop_idx == in_order
2208 vpaddd ONE(%rip), \CTR, \XMM1
2209 vpaddd ONE(%rip), \XMM1, \XMM2
2210 vpaddd ONE(%rip), \XMM2, \XMM3
2211 vpaddd ONE(%rip), \XMM3, \XMM4
2212 vpaddd ONE(%rip), \XMM4, \XMM5
2213 vpaddd ONE(%rip), \XMM5, \XMM6
2214 vpaddd ONE(%rip), \XMM6, \XMM7
2215 vpaddd ONE(%rip), \XMM7, \XMM8
2216 vmovdqa \XMM8, \CTR
2217
2218 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1
2219 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2
2220 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3
2221 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4
2222 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5
2223 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6
2224 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7
2225 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8
2226.else
2227 vpaddd ONEf(%rip), \CTR, \XMM1
2228 vpaddd ONEf(%rip), \XMM1, \XMM2
2229 vpaddd ONEf(%rip), \XMM2, \XMM3
2230 vpaddd ONEf(%rip), \XMM3, \XMM4
2231 vpaddd ONEf(%rip), \XMM4, \XMM5
2232 vpaddd ONEf(%rip), \XMM5, \XMM6
2233 vpaddd ONEf(%rip), \XMM6, \XMM7
2234 vpaddd ONEf(%rip), \XMM7, \XMM8
2235 vmovdqa \XMM8, \CTR
2236.endif
2237
2238
2239
2240
2241 vmovdqu (arg1), \T1
2242 vpxor \T1, \XMM1, \XMM1
2243 vpxor \T1, \XMM2, \XMM2
2244 vpxor \T1, \XMM3, \XMM3
2245 vpxor \T1, \XMM4, \XMM4
2246 vpxor \T1, \XMM5, \XMM5
2247 vpxor \T1, \XMM6, \XMM6
2248 vpxor \T1, \XMM7, \XMM7
2249 vpxor \T1, \XMM8, \XMM8
2250
2251
2252
2253
2254
2255
2256
2257 vmovdqu 16*1(arg1), \T1
2258 vaesenc \T1, \XMM1, \XMM1
2259 vaesenc \T1, \XMM2, \XMM2
2260 vaesenc \T1, \XMM3, \XMM3
2261 vaesenc \T1, \XMM4, \XMM4
2262 vaesenc \T1, \XMM5, \XMM5
2263 vaesenc \T1, \XMM6, \XMM6
2264 vaesenc \T1, \XMM7, \XMM7
2265 vaesenc \T1, \XMM8, \XMM8
2266
2267 vmovdqu 16*2(arg1), \T1
2268 vaesenc \T1, \XMM1, \XMM1
2269 vaesenc \T1, \XMM2, \XMM2
2270 vaesenc \T1, \XMM3, \XMM3
2271 vaesenc \T1, \XMM4, \XMM4
2272 vaesenc \T1, \XMM5, \XMM5
2273 vaesenc \T1, \XMM6, \XMM6
2274 vaesenc \T1, \XMM7, \XMM7
2275 vaesenc \T1, \XMM8, \XMM8
2276
2277
2278
2279
2280 vmovdqu HashKey_8(arg2), \T5
2281 vpclmulqdq $0x11, \T5, \T2, \T4
2282 vpclmulqdq $0x00, \T5, \T2, \T7
2283 vpclmulqdq $0x01, \T5, \T2, \T6
2284 vpclmulqdq $0x10, \T5, \T2, \T5
2285 vpxor \T5, \T6, \T6
2286
2287 vmovdqu 16*3(arg1), \T1
2288 vaesenc \T1, \XMM1, \XMM1
2289 vaesenc \T1, \XMM2, \XMM2
2290 vaesenc \T1, \XMM3, \XMM3
2291 vaesenc \T1, \XMM4, \XMM4
2292 vaesenc \T1, \XMM5, \XMM5
2293 vaesenc \T1, \XMM6, \XMM6
2294 vaesenc \T1, \XMM7, \XMM7
2295 vaesenc \T1, \XMM8, \XMM8
2296
2297 vmovdqa TMP2(%rsp), \T1
2298 vmovdqu HashKey_7(arg2), \T5
2299 vpclmulqdq $0x11, \T5, \T1, \T3
2300 vpxor \T3, \T4, \T4
2301
2302 vpclmulqdq $0x00, \T5, \T1, \T3
2303 vpxor \T3, \T7, \T7
2304
2305 vpclmulqdq $0x01, \T5, \T1, \T3
2306 vpxor \T3, \T6, \T6
2307
2308 vpclmulqdq $0x10, \T5, \T1, \T3
2309 vpxor \T3, \T6, \T6
2310
2311 vmovdqu 16*4(arg1), \T1
2312 vaesenc \T1, \XMM1, \XMM1
2313 vaesenc \T1, \XMM2, \XMM2
2314 vaesenc \T1, \XMM3, \XMM3
2315 vaesenc \T1, \XMM4, \XMM4
2316 vaesenc \T1, \XMM5, \XMM5
2317 vaesenc \T1, \XMM6, \XMM6
2318 vaesenc \T1, \XMM7, \XMM7
2319 vaesenc \T1, \XMM8, \XMM8
2320
2321
2322
2323 vmovdqa TMP3(%rsp), \T1
2324 vmovdqu HashKey_6(arg2), \T5
2325 vpclmulqdq $0x11, \T5, \T1, \T3
2326 vpxor \T3, \T4, \T4
2327
2328 vpclmulqdq $0x00, \T5, \T1, \T3
2329 vpxor \T3, \T7, \T7
2330
2331 vpclmulqdq $0x01, \T5, \T1, \T3
2332 vpxor \T3, \T6, \T6
2333
2334 vpclmulqdq $0x10, \T5, \T1, \T3
2335 vpxor \T3, \T6, \T6
2336
2337 vmovdqu 16*5(arg1), \T1
2338 vaesenc \T1, \XMM1, \XMM1
2339 vaesenc \T1, \XMM2, \XMM2
2340 vaesenc \T1, \XMM3, \XMM3
2341 vaesenc \T1, \XMM4, \XMM4
2342 vaesenc \T1, \XMM5, \XMM5
2343 vaesenc \T1, \XMM6, \XMM6
2344 vaesenc \T1, \XMM7, \XMM7
2345 vaesenc \T1, \XMM8, \XMM8
2346
2347 vmovdqa TMP4(%rsp), \T1
2348 vmovdqu HashKey_5(arg2), \T5
2349 vpclmulqdq $0x11, \T5, \T1, \T3
2350 vpxor \T3, \T4, \T4
2351
2352 vpclmulqdq $0x00, \T5, \T1, \T3
2353 vpxor \T3, \T7, \T7
2354
2355 vpclmulqdq $0x01, \T5, \T1, \T3
2356 vpxor \T3, \T6, \T6
2357
2358 vpclmulqdq $0x10, \T5, \T1, \T3
2359 vpxor \T3, \T6, \T6
2360
2361 vmovdqu 16*6(arg1), \T1
2362 vaesenc \T1, \XMM1, \XMM1
2363 vaesenc \T1, \XMM2, \XMM2
2364 vaesenc \T1, \XMM3, \XMM3
2365 vaesenc \T1, \XMM4, \XMM4
2366 vaesenc \T1, \XMM5, \XMM5
2367 vaesenc \T1, \XMM6, \XMM6
2368 vaesenc \T1, \XMM7, \XMM7
2369 vaesenc \T1, \XMM8, \XMM8
2370
2371
2372 vmovdqa TMP5(%rsp), \T1
2373 vmovdqu HashKey_4(arg2), \T5
2374 vpclmulqdq $0x11, \T5, \T1, \T3
2375 vpxor \T3, \T4, \T4
2376
2377 vpclmulqdq $0x00, \T5, \T1, \T3
2378 vpxor \T3, \T7, \T7
2379
2380 vpclmulqdq $0x01, \T5, \T1, \T3
2381 vpxor \T3, \T6, \T6
2382
2383 vpclmulqdq $0x10, \T5, \T1, \T3
2384 vpxor \T3, \T6, \T6
2385
2386 vmovdqu 16*7(arg1), \T1
2387 vaesenc \T1, \XMM1, \XMM1
2388 vaesenc \T1, \XMM2, \XMM2
2389 vaesenc \T1, \XMM3, \XMM3
2390 vaesenc \T1, \XMM4, \XMM4
2391 vaesenc \T1, \XMM5, \XMM5
2392 vaesenc \T1, \XMM6, \XMM6
2393 vaesenc \T1, \XMM7, \XMM7
2394 vaesenc \T1, \XMM8, \XMM8
2395
2396 vmovdqa TMP6(%rsp), \T1
2397 vmovdqu HashKey_3(arg2), \T5
2398 vpclmulqdq $0x11, \T5, \T1, \T3
2399 vpxor \T3, \T4, \T4
2400
2401 vpclmulqdq $0x00, \T5, \T1, \T3
2402 vpxor \T3, \T7, \T7
2403
2404 vpclmulqdq $0x01, \T5, \T1, \T3
2405 vpxor \T3, \T6, \T6
2406
2407 vpclmulqdq $0x10, \T5, \T1, \T3
2408 vpxor \T3, \T6, \T6
2409
2410 vmovdqu 16*8(arg1), \T1
2411 vaesenc \T1, \XMM1, \XMM1
2412 vaesenc \T1, \XMM2, \XMM2
2413 vaesenc \T1, \XMM3, \XMM3
2414 vaesenc \T1, \XMM4, \XMM4
2415 vaesenc \T1, \XMM5, \XMM5
2416 vaesenc \T1, \XMM6, \XMM6
2417 vaesenc \T1, \XMM7, \XMM7
2418 vaesenc \T1, \XMM8, \XMM8
2419
2420 vmovdqa TMP7(%rsp), \T1
2421 vmovdqu HashKey_2(arg2), \T5
2422 vpclmulqdq $0x11, \T5, \T1, \T3
2423 vpxor \T3, \T4, \T4
2424
2425 vpclmulqdq $0x00, \T5, \T1, \T3
2426 vpxor \T3, \T7, \T7
2427
2428 vpclmulqdq $0x01, \T5, \T1, \T3
2429 vpxor \T3, \T6, \T6
2430
2431 vpclmulqdq $0x10, \T5, \T1, \T3
2432 vpxor \T3, \T6, \T6
2433
2434
2435
2436
2437 vmovdqu 16*9(arg1), \T5
2438 vaesenc \T5, \XMM1, \XMM1
2439 vaesenc \T5, \XMM2, \XMM2
2440 vaesenc \T5, \XMM3, \XMM3
2441 vaesenc \T5, \XMM4, \XMM4
2442 vaesenc \T5, \XMM5, \XMM5
2443 vaesenc \T5, \XMM6, \XMM6
2444 vaesenc \T5, \XMM7, \XMM7
2445 vaesenc \T5, \XMM8, \XMM8
2446
2447 vmovdqa TMP8(%rsp), \T1
2448 vmovdqu HashKey(arg2), \T5
2449
2450 vpclmulqdq $0x00, \T5, \T1, \T3
2451 vpxor \T3, \T7, \T7
2452
2453 vpclmulqdq $0x01, \T5, \T1, \T3
2454 vpxor \T3, \T6, \T6
2455
2456 vpclmulqdq $0x10, \T5, \T1, \T3
2457 vpxor \T3, \T6, \T6
2458
2459 vpclmulqdq $0x11, \T5, \T1, \T3
2460 vpxor \T3, \T4, \T1
2461
2462
2463 vmovdqu 16*10(arg1), \T5
2464
2465 i = 11
2466 setreg
2467.rep (\REP-9)
2468 vaesenc \T5, \XMM1, \XMM1
2469 vaesenc \T5, \XMM2, \XMM2
2470 vaesenc \T5, \XMM3, \XMM3
2471 vaesenc \T5, \XMM4, \XMM4
2472 vaesenc \T5, \XMM5, \XMM5
2473 vaesenc \T5, \XMM6, \XMM6
2474 vaesenc \T5, \XMM7, \XMM7
2475 vaesenc \T5, \XMM8, \XMM8
2476
2477 vmovdqu 16*i(arg1), \T5
2478 i = i + 1
2479 setreg
2480.endr
2481
2482 i = 0
2483 j = 1
2484 setreg
2485.rep 8
2486 vpxor 16*i(arg4, %r11), \T5, \T2
2487 .if \ENC_DEC == ENC
2488 vaesenclast \T2, reg_j, reg_j
2489 .else
2490 vaesenclast \T2, reg_j, \T3
2491 vmovdqu 16*i(arg4, %r11), reg_j
2492 vmovdqu \T3, 16*i(arg3, %r11)
2493 .endif
2494 i = (i+1)
2495 j = (j+1)
2496 setreg
2497.endr
2498
2499
2500
2501 vpslldq $8, \T6, \T3
2502 vpsrldq $8, \T6, \T6
2503 vpxor \T3, \T7, \T7
2504 vpxor \T6, \T1, \T1
2505
2506
2507
2508
2509
2510 vmovdqa POLY2(%rip), \T3
2511
2512 vpclmulqdq $0x01, \T7, \T3, \T2
2513 vpslldq $8, \T2, \T2
2514
2515 vpxor \T2, \T7, \T7
2516
2517 .if \ENC_DEC == ENC
2518 vmovdqu \XMM1, 16*0(arg3,%r11)
2519 vmovdqu \XMM2, 16*1(arg3,%r11)
2520 vmovdqu \XMM3, 16*2(arg3,%r11)
2521 vmovdqu \XMM4, 16*3(arg3,%r11)
2522 vmovdqu \XMM5, 16*4(arg3,%r11)
2523 vmovdqu \XMM6, 16*5(arg3,%r11)
2524 vmovdqu \XMM7, 16*6(arg3,%r11)
2525 vmovdqu \XMM8, 16*7(arg3,%r11)
2526 .endif
2527
2528
2529
2530 vpclmulqdq $0x00, \T7, \T3, \T2
2531 vpsrldq $4, \T2, \T2
2532
2533 vpclmulqdq $0x10, \T7, \T3, \T4
2534 vpslldq $4, \T4, \T4
2535
2536 vpxor \T2, \T4, \T4
2537
2538 vpxor \T4, \T1, \T1
2539
2540 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1
2541 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2
2542 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3
2543 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4
2544 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5
2545 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6
2546 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7
2547 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8
2548
2549
2550 vpxor \T1, \XMM1, \XMM1
2551
2552
2553
2554.endm
2555
2556
2557
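# GHASH_LAST_8_AVX2: GHASH the final 8 ciphertext blocks with the
# POLY2-based reduction, leaving the accumulated hash in \T6.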
2558.macro GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
2559
2560
2561
2562 vmovdqu HashKey_8(arg2), \T5
2563
2564 vpshufd $0b01001110, \XMM1, \T2
2565 vpshufd $0b01001110, \T5, \T3
2566 vpxor \XMM1, \T2, \T2
2567 vpxor \T5, \T3, \T3
2568
2569 vpclmulqdq $0x11, \T5, \XMM1, \T6
2570 vpclmulqdq $0x00, \T5, \XMM1, \T7
2571
2572 vpclmulqdq $0x00, \T3, \T2, \XMM1
2573
2574
2575
2576 vmovdqu HashKey_7(arg2), \T5
2577 vpshufd $0b01001110, \XMM2, \T2
2578 vpshufd $0b01001110, \T5, \T3
2579 vpxor \XMM2, \T2, \T2
2580 vpxor \T5, \T3, \T3
2581
2582 vpclmulqdq $0x11, \T5, \XMM2, \T4
2583 vpxor \T4, \T6, \T6
2584
2585 vpclmulqdq $0x00, \T5, \XMM2, \T4
2586 vpxor \T4, \T7, \T7
2587
2588 vpclmulqdq $0x00, \T3, \T2, \T2
2589
2590 vpxor \T2, \XMM1, \XMM1
2591
2592
2593
2594 vmovdqu HashKey_6(arg2), \T5
2595 vpshufd $0b01001110, \XMM3, \T2
2596 vpshufd $0b01001110, \T5, \T3
2597 vpxor \XMM3, \T2, \T2
2598 vpxor \T5, \T3, \T3
2599
2600 vpclmulqdq $0x11, \T5, \XMM3, \T4
2601 vpxor \T4, \T6, \T6
2602
2603 vpclmulqdq $0x00, \T5, \XMM3, \T4
2604 vpxor \T4, \T7, \T7
2605
2606 vpclmulqdq $0x00, \T3, \T2, \T2
2607
2608 vpxor \T2, \XMM1, \XMM1
2609
2610
2611
2612 vmovdqu HashKey_5(arg2), \T5
2613 vpshufd $0b01001110, \XMM4, \T2
2614 vpshufd $0b01001110, \T5, \T3
2615 vpxor \XMM4, \T2, \T2
2616 vpxor \T5, \T3, \T3
2617
2618 vpclmulqdq $0x11, \T5, \XMM4, \T4
2619 vpxor \T4, \T6, \T6
2620
2621 vpclmulqdq $0x00, \T5, \XMM4, \T4
2622 vpxor \T4, \T7, \T7
2623
2624 vpclmulqdq $0x00, \T3, \T2, \T2
2625
2626 vpxor \T2, \XMM1, \XMM1
2627
2628
2629
2630 vmovdqu HashKey_4(arg2), \T5
2631 vpshufd $0b01001110, \XMM5, \T2
2632 vpshufd $0b01001110, \T5, \T3
2633 vpxor \XMM5, \T2, \T2
2634 vpxor \T5, \T3, \T3
2635
2636 vpclmulqdq $0x11, \T5, \XMM5, \T4
2637 vpxor \T4, \T6, \T6
2638
2639 vpclmulqdq $0x00, \T5, \XMM5, \T4
2640 vpxor \T4, \T7, \T7
2641
2642 vpclmulqdq $0x00, \T3, \T2, \T2
2643
2644 vpxor \T2, \XMM1, \XMM1
2645
2646
2647
2648 vmovdqu HashKey_3(arg2), \T5
2649 vpshufd $0b01001110, \XMM6, \T2
2650 vpshufd $0b01001110, \T5, \T3
2651 vpxor \XMM6, \T2, \T2
2652 vpxor \T5, \T3, \T3
2653
2654 vpclmulqdq $0x11, \T5, \XMM6, \T4
2655 vpxor \T4, \T6, \T6
2656
2657 vpclmulqdq $0x00, \T5, \XMM6, \T4
2658 vpxor \T4, \T7, \T7
2659
2660 vpclmulqdq $0x00, \T3, \T2, \T2
2661
2662 vpxor \T2, \XMM1, \XMM1
2663
2664
2665
2666 vmovdqu HashKey_2(arg2), \T5
2667 vpshufd $0b01001110, \XMM7, \T2
2668 vpshufd $0b01001110, \T5, \T3
2669 vpxor \XMM7, \T2, \T2
2670 vpxor \T5, \T3, \T3
2671
2672 vpclmulqdq $0x11, \T5, \XMM7, \T4
2673 vpxor \T4, \T6, \T6
2674
2675 vpclmulqdq $0x00, \T5, \XMM7, \T4
2676 vpxor \T4, \T7, \T7
2677
2678 vpclmulqdq $0x00, \T3, \T2, \T2
2679
2680 vpxor \T2, \XMM1, \XMM1
2681
2682
2683
2684 vmovdqu HashKey(arg2), \T5
2685 vpshufd $0b01001110, \XMM8, \T2
2686 vpshufd $0b01001110, \T5, \T3
2687 vpxor \XMM8, \T2, \T2
2688 vpxor \T5, \T3, \T3
2689
2690 vpclmulqdq $0x11, \T5, \XMM8, \T4
2691 vpxor \T4, \T6, \T6
2692
2693 vpclmulqdq $0x00, \T5, \XMM8, \T4
2694 vpxor \T4, \T7, \T7
2695
2696 vpclmulqdq $0x00, \T3, \T2, \T2
2697
2698 vpxor \T2, \XMM1, \XMM1
2699 vpxor \T6, \XMM1, \XMM1
2700 vpxor \T7, \XMM1, \T2
2701
2702
2703
2704
2705 vpslldq $8, \T2, \T4
2706 vpsrldq $8, \T2, \T2
2707
2708 vpxor \T4, \T7, \T7
2709 vpxor \T2, \T6, \T6
2710
2711
2712
2713
2714 vmovdqa POLY2(%rip), \T3
2715
2716 vpclmulqdq $0x01, \T7, \T3, \T2
2717 vpslldq $8, \T2, \T2
2718
2719 vpxor \T2, \T7, \T7
2720
2721
2722
2723
2724 vpclmulqdq $0x00, \T7, \T3, \T2
2725 vpsrldq $4, \T2, \T2
2726
2727 vpclmulqdq $0x10, \T7, \T3, \T4
2728 vpslldq $4, \T4, \T4
2729
2730 vpxor \T2, \T4, \T4
2731
2732 vpxor \T4, \T6, \T6
2733.endm
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
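# The *_avx_gen4 entry points below mirror the *_avx_gen2 ones above but
# use the AVX2 macro set (PCLMUL-based reduction, no Karatsuba key values).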
2749ENTRY(aesni_gcm_init_avx_gen4)
2750 FUNC_SAVE
2751 INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2
2752 FUNC_RESTORE
2753 ret
2754ENDPROC(aesni_gcm_init_avx_gen4)
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764ENTRY(aesni_gcm_enc_update_avx_gen4)
2765 FUNC_SAVE
2766 mov keysize,%eax
2767 cmp $32, %eax
2768 je key_256_enc_update4
2769 cmp $16, %eax
2770 je key_128_enc_update4
2771
2772 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11
2773 FUNC_RESTORE
2774 ret
2775key_128_enc_update4:
2776 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9
2777 FUNC_RESTORE
2778 ret
2779key_256_enc_update4:
2780 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13
2781 FUNC_RESTORE
2782 ret
2783ENDPROC(aesni_gcm_enc_update_avx_gen4)
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793ENTRY(aesni_gcm_dec_update_avx_gen4)
2794 FUNC_SAVE
2795 mov keysize,%eax
2796 cmp $32, %eax
2797 je key_256_dec_update4
2798 cmp $16, %eax
2799 je key_128_dec_update4
2800
2801 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11
2802 FUNC_RESTORE
2803 ret
2804key_128_dec_update4:
2805 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9
2806 FUNC_RESTORE
2807 ret
2808key_256_dec_update4:
2809 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13
2810 FUNC_RESTORE
2811 ret
2812ENDPROC(aesni_gcm_dec_update_avx_gen4)
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822ENTRY(aesni_gcm_finalize_avx_gen4)
2823 FUNC_SAVE
2824 mov keysize,%eax
2825 cmp $32, %eax
2826 je key_256_finalize4
2827 cmp $16, %eax
2828 je key_128_finalize4
2829
2830 GCM_COMPLETE GHASH_MUL_AVX2, 11, arg3, arg4
2831 FUNC_RESTORE
2832 ret
2833key_128_finalize4:
2834 GCM_COMPLETE GHASH_MUL_AVX2, 9, arg3, arg4
2835 FUNC_RESTORE
2836 ret
2837key_256_finalize4:
2838 GCM_COMPLETE GHASH_MUL_AVX2, 13, arg3, arg4
2839 FUNC_RESTORE
2840 ret
2841ENDPROC(aesni_gcm_finalize_avx_gen4)
2842
2843#endif
2844