1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122#include <linux/linkage.h>
123
124
/*
 * 16-byte vector constants used by the AES-GCM macros in this file.
 * Each .rodata.cst16.* constant is placed in its own mergeable section
 * ("aM", entity size 16) and is aligned to 16 bytes.
 */
/* POLY: ANDed with a compare mask and XORed into the shifted hash key in INIT. */
125.section .rodata.cst16.POLY, "aM", @progbits, 16
126.align 16
127POLY: .octa 0xC2000000000000000000000000000001
128
/* NOTE(review): POLY2 is not referenced in the part of the file reviewed here. */
129.section .rodata.cst16.POLY2, "aM", @progbits, 16
130.align 16
131POLY2: .octa 0xC20000000000000000000001C2000000
132
/* TWOONE: vpcmpeqd operand used in INIT to turn the shifted-out bit into a mask. */
133.section .rodata.cst16.TWOONE, "aM", @progbits, 16
134.align 16
135TWOONE: .octa 0x00000001000000000000000000000001
136
/* SHUF_MASK: vpshufb control that reverses the order of the 16 bytes. */
137.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
138.align 16
139SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
140
/* ONE: vpaddd operand adding 1 to the lowest 32-bit lane. */
141.section .rodata.cst16.ONE, "aM", @progbits, 16
142.align 16
143ONE: .octa 0x00000000000000000000000000000001
144
/*
 * ONEf: vpaddd operand adding 0x01000000 to the highest 32-bit lane, i.e. it
 * bumps byte 15 of the vector.  Used on the out_order path of
 * GHASH_8_ENCRYPT_8_PARALLEL_AVX.
 */
145.section .rodata.cst16.ONEf, "aM", @progbits, 16
146.align 16
147ONEf: .octa 0x01000000000000000000000000000000
148
149
150
/*
 * SHIFT_MASK, ALL_F and the trailing zero vector must stay contiguous and in
 * this order: the code loads 16 bytes at byte offsets into SHIFT_MASK and at
 * ALL_F-SHIFT_MASK(%r12), so the loaded windows straddle adjacent entries.
 */
151.section .rodata, "a", @progbits
152.align 16
153SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
154ALL_F: .octa 0xffffffffffffffffffffffffffffffff
155 .octa 0x00000000000000000000000000000000
156
/*
 * aad_shift_arr - table of 17 vpshufb control vectors (17 * 16 = 272 bytes).
 * CALC_AAD_HASH indexes it with (AAD length left after full blocks) * 16.
 * A control byte of 0xff produces a zero output byte; the other bytes select
 * source bytes, moving the bytes gathered at the top of the register down to
 * byte 0.
 */
157.section .rodata
158.align 16
159.type aad_shift_arr, @object
160.size aad_shift_arr, 272
161aad_shift_arr:
162 .octa 0xffffffffffffffffffffffffffffffff # 0 bytes: result is all zero
163 .octa 0xffffffffffffffffffffffffffffff0C # 1 byte
164 .octa 0xffffffffffffffffffffffffffff0D0C # 2 bytes
165 .octa 0xffffffffffffffffffffffffff0E0D0C # 3 bytes
166 .octa 0xffffffffffffffffffffffff0F0E0D0C # 4 bytes
167 .octa 0xffffffffffffffffffffff0C0B0A0908 # 5 bytes
168 .octa 0xffffffffffffffffffff0D0C0B0A0908 # 6 bytes
169 .octa 0xffffffffffffffffff0E0D0C0B0A0908 # 7 bytes
170 .octa 0xffffffffffffffff0F0E0D0C0B0A0908 # 8 bytes
171 .octa 0xffffffffffffff0C0B0A090807060504 # 9 bytes
172 .octa 0xffffffffffff0D0C0B0A090807060504 # 10 bytes
173 .octa 0xffffffffff0E0D0C0B0A090807060504 # 11 bytes
174 .octa 0xffffffff0F0E0D0C0B0A090807060504 # 12 bytes
175 .octa 0xffffff0C0B0A09080706050403020100 # 13 bytes
176 .octa 0xffff0D0C0B0A09080706050403020100 # 14 bytes
177 .octa 0xff0E0D0C0B0A09080706050403020100 # 15 bytes
178 .octa 0x0F0E0D0C0B0A09080706050403020100 # 16 bytes: identity shuffle
179
180
181.text
182
183
/*
 * Byte offsets into the per-request context addressed through arg2.
 * AadHash      running GHASH value
 * AadLen       AAD length in bytes (written by INIT)
 * InLen        total bytes processed so far (accumulated by GCM_ENC_DEC)
 * PBlockEncKey encrypted counter block kept for an unfinished 16-byte block
 * OrigIV       copy of the 16 bytes read from the IV pointer in INIT
 * CurCount     current counter block
 * PBlockLen    number of bytes already consumed of the unfinished block
 */
184#define AadHash 16*0
185#define AadLen 16*1
186#define InLen (16*1)+8
187#define PBlockEncKey 16*2
188#define OrigIV 16*3
189#define CurCount 16*4
190#define PBlockLen 16*5
191
/*
 * HashKey_n holds the n-th value produced by PRECOMPUTE_AVX (repeated
 * GHASH multiplication by HashKey); HashKey_n_k holds the XOR of the two
 * 64-bit halves of HashKey_n.  All are offsets from arg2.
 */
192HashKey = 16*6
193HashKey_2 = 16*7
194HashKey_3 = 16*8
195HashKey_4 = 16*9
196HashKey_5 = 16*10
197HashKey_6 = 16*11
198HashKey_7 = 16*12
199HashKey_8 = 16*13
200HashKey_k = 16*14
201HashKey_2_k = 16*15
202HashKey_3_k = 16*16
203HashKey_4_k = 16*17
204HashKey_5_k = 16*18
205HashKey_6_k = 16*19
206HashKey_7_k = 16*20
207HashKey_8_k = 16*21
208
/* The six integer argument registers, in argument order. */
209#define arg1 %rdi
210#define arg2 %rsi
211#define arg3 %rdx
212#define arg4 %rcx
213#define arg5 %r8
214#define arg6 %r9
/* NOTE(review): keysize is not used in the part of the file reviewed here. */
215#define keysize 2*15*16(arg1)
216
217i = 0 # assemble-time index consumed by setreg (selects reg_i)
218j = 0 # assemble-time index consumed by setreg (selects reg_j)
219
220out_order = 0 # loop_idx value: counter is kept byte-reversed, incremented with ONEf
221in_order = 1 # loop_idx value: counter is incremented with ONE, then byte-reversed
222DEC = 0 # ENC_DEC selector: decrypt
223ENC = 1 # ENC_DEC selector: encrypt
224
/*
 * define_reg r n - bind the assembler symbol reg_<r> to register %xmm<n>.
 */
225.macro define_reg r n
226reg_\r = %xmm\n
227.endm
228
/*
 * setreg - refresh reg_i and reg_j from the current values of the
 * assemble-time symbols i and j.  .altmacro is enabled only for the two
 * define_reg calls so that %i and %j are passed as their numeric values.
 * Must be invoked after every change of i or j before reg_i/reg_j is used.
 */
229.macro setreg
230.altmacro
231define_reg i %i
232define_reg j %j
233.noaltmacro
234.endm
235
/*
 * Offsets of the eight 16-byte scratch slots in the stack area reserved by
 * FUNC_SAVE; they are addressed as TMPn(%rsp).
 */
236TMP1 = 16*0
237TMP2 = 16*1
238TMP3 = 16*2
239TMP4 = 16*3
240TMP5 = 16*4
241TMP6 = 16*5
242TMP7 = 16*6
243TMP8 = 16*7
244
245VARIABLE_OFFSET = 16*8 # total size of the scratch area, in bytes
246
247
248
249
250
/*
 * FUNC_SAVE - function prologue.
 * Saves r12, r13, r15 and rbp, records the incoming stack pointer in rbp,
 * then carves out VARIABLE_OFFSET bytes of scratch space and rounds rsp
 * down to a 64-byte boundary.  FUNC_RESTORE undoes all of this.
 */
.macro FUNC_SAVE
	push	%r12
	push	%r13
	push	%r15
	push	%rbp

	mov	%rsp, %rbp			# rbp remembers the unaligned stack top
	lea	-VARIABLE_OFFSET(%rsp), %rsp	# reserve the TMP1..TMP8 slots
	and	$~63, %rsp			# 64-byte align the scratch area
.endm
262
/*
 * FUNC_RESTORE - function epilogue matching FUNC_SAVE.
 * Drops the aligned scratch area by reloading rsp from rbp, then restores
 * rbp, r15, r13 and r12 in reverse push order.
 */
.macro FUNC_RESTORE
	leave				# rsp = rbp, then pop rbp
	pop	%r15
	pop	%r13
	pop	%r12
.endm
271
272
/*
 * ENCRYPT_SINGLE_BLOCK REP XMM0
 * AES-encrypts the block in \XMM0 in place using the round keys stored at
 * arg1: key 0 is XORed in, keys 1..\REP are applied with vaesenc and key
 * \REP+1 with vaesenclast.
 * Leaves the assemble-time symbol i at \REP+1 and refreshes reg_i/reg_j.
 */
.macro ENCRYPT_SINGLE_BLOCK REP XMM0
	vpxor	(arg1), \XMM0, \XMM0		# whitening with round key 0
	i = 0
.rep \REP
	i = (i+1)
	setreg
	vaesenc	16*i(arg1), \XMM0, \XMM0	# middle round i
.endr
	i = (i+1)
	setreg
	vaesenclast 16*i(arg1), \XMM0, \XMM0	# final round
.endm
284
285
286
287
/*
 * GCM_ENC_DEC - process arg5 bytes of input at arg4 into arg3, updating the
 * context at arg2 (AadHash, InLen, CurCount, PBlockLen, PBlockEncKey).
 *
 * Macro parameters:
 *   INITIAL_BLOCKS, GHASH_8_ENCRYPT_8_PARALLEL, GHASH_LAST_8, GHASH_MUL
 *            names of the implementation macros to expand
 *   ENC_DEC  ENC or DEC
 *   REP      number of vaesenc rounds between the first key XOR and
 *            vaesenclast
 *
 * Register roles inside the macro:
 *   r11  byte offset into the input/output buffers
 *   r13  bytes still to be processed in the current phase
 *   r12  scratch (block count, mask pointer)
 *   r15  low byte of the counter, tracked for the out_order fast path
 *   xmm9 counter block, xmm13 hash key, xmm14 running hash after the
 *        initial blocks
 * Also clobbers rax, r10 and most xmm registers.
 */
288.macro GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP
289 vmovdqu AadHash(arg2), %xmm8 # xmm8 = running hash
290 vmovdqu HashKey(arg2), %xmm13 # xmm13 = hash key
291 add arg5, InLen(arg2) # account for this call in the total length
292
293 # start with a data offset of zero
294 xor %r11d, %r11d
295
296 PARTIAL_BLOCK \GHASH_MUL, arg3, arg4, arg5, %r11, %xmm8, \ENC_DEC
297 sub %r11, arg5 # drop the bytes PARTIAL_BLOCK consumed
298
299 mov arg5, %r13
300 and $-16, %r13 # r13 = length rounded down to whole blocks
301
302 mov %r13, %r12
303 shr $4, %r12
304 and $7, %r12 # r12 = (number of whole blocks) mod 8
305 jz _initial_num_blocks_is_0\@
306
307 cmp $7, %r12
308 je _initial_num_blocks_is_7\@
309 cmp $6, %r12
310 je _initial_num_blocks_is_6\@
311 cmp $5, %r12
312 je _initial_num_blocks_is_5\@
313 cmp $4, %r12
314 je _initial_num_blocks_is_4\@
315 cmp $3, %r12
316 je _initial_num_blocks_is_3\@
317 cmp $2, %r12
318 je _initial_num_blocks_is_2\@
319
320 jmp _initial_num_blocks_is_1\@
321
322_initial_num_blocks_is_7\@:
323 \INITIAL_BLOCKS \REP, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
324 sub $16*7, %r13
325 jmp _initial_blocks_encrypted\@
326
327_initial_num_blocks_is_6\@:
328 \INITIAL_BLOCKS \REP, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
329 sub $16*6, %r13
330 jmp _initial_blocks_encrypted\@
331
332_initial_num_blocks_is_5\@:
333 \INITIAL_BLOCKS \REP, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
334 sub $16*5, %r13
335 jmp _initial_blocks_encrypted\@
336
337_initial_num_blocks_is_4\@:
338 \INITIAL_BLOCKS \REP, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
339 sub $16*4, %r13
340 jmp _initial_blocks_encrypted\@
341
342_initial_num_blocks_is_3\@:
343 \INITIAL_BLOCKS \REP, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
344 sub $16*3, %r13
345 jmp _initial_blocks_encrypted\@
346
347_initial_num_blocks_is_2\@:
348 \INITIAL_BLOCKS \REP, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
349 sub $16*2, %r13
350 jmp _initial_blocks_encrypted\@
351
352_initial_num_blocks_is_1\@:
353 \INITIAL_BLOCKS \REP, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
354 sub $16*1, %r13
355 jmp _initial_blocks_encrypted\@
356
357_initial_num_blocks_is_0\@:
358 \INITIAL_BLOCKS \REP, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
359
360
361_initial_blocks_encrypted\@:
362 test %r13, %r13
363 je _zero_cipher_left\@ # no group of 8 blocks was started
364
365 sub $128, %r13
366 je _eight_cipher_left\@ # only the 8 blocks already encrypted remain to be hashed
367
368
369
370 # r15d = lowest byte of the counter block; xmm9 is then kept byte-reversed
371 vmovd %xmm9, %r15d
372 and $255, %r15d
373 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
374
375
376_encrypt_by_8_new\@:
377 cmp $(255-8), %r15d
378 jg _encrypt_by_8\@ # adding 8 would wrap the low byte: take the in_order path
379
380
381
382 add $8, %r15b
383 \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
384 add $128, %r11
385 sub $128, %r13
386 jne _encrypt_by_8_new\@
387
388 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # undo the byte reversal of the counter
389 jmp _eight_cipher_left\@
390
391_encrypt_by_8\@:
392 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # undo the byte reversal for the in_order increment
393 add $8, %r15b
394 \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
395 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # back to byte-reversed form for the loop
396 add $128, %r11
397 sub $128, %r13
398 jne _encrypt_by_8_new\@
399
400 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
401
402
403
404
405_eight_cipher_left\@:
406 \GHASH_LAST_8 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
407
408
409_zero_cipher_left\@:
410 vmovdqu %xmm14, AadHash(arg2) # save the running hash
411 vmovdqu %xmm9, CurCount(arg2) # save the counter
412
413 # r13 = length mod 16 = size of the trailing partial block
414 mov arg5, %r13
415 and $15, %r13
416
417 je _multiple_of_16_bytes\@
418
419
420 # remember the partial length for the next call and for GCM_COMPLETE
421 mov %r13, PBlockLen(arg2)
422
423 vpaddd ONE(%rip), %xmm9, %xmm9 # counter += 1
424 vmovdqu %xmm9, CurCount(arg2)
425 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
426
427 ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # xmm9 = encrypted counter block
428 vmovdqu %xmm9, PBlockEncKey(arg2)
429
430 cmp $16, arg5
431 jge _large_enough_update\@
432
433 lea (arg4,%r11,1), %r10
434 mov %r13, %r12
435
436 READ_PARTIAL_BLOCK %r10 %r12 %xmm1 # xmm1 = the r13 trailing bytes, zero padded
437
438 lea SHIFT_MASK+16(%rip), %r12
439 sub %r13, %r12 # r12 = SHIFT_MASK + 16 - r13, used below for the byte mask
440
441
442
443 jmp _final_ghash_mul\@
444
445_large_enough_update\@:
446 sub $16, %r11
447 add %r13, %r11
448
449 # load the 16 bytes that end at the end of the input
450 vmovdqu (arg4, %r11, 1), %xmm1
451
452 sub %r13, %r11
453 add $16, %r11 # restore r11
454
455 lea SHIFT_MASK+16(%rip), %r12
456
457
458 sub %r13, %r12 # r12 = SHIFT_MASK + 16 - r13
459
460 vmovdqu (%r12), %xmm2
461
462 vpshufb %xmm2, %xmm1, %xmm1 # move the r13 trailing bytes down to byte 0
463
464_final_ghash_mul\@:
465 .if \ENC_DEC == DEC
466 vmovdqa %xmm1, %xmm2 # keep the input block for hashing
467 vpxor %xmm1, %xmm9, %xmm9 # xmm9 = input XOR encrypted counter
468 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # byte mask: 0xff in the low r13 bytes
469
470 vpand %xmm1, %xmm9, %xmm9 # clear the bytes beyond r13 in the output
471 vpand %xmm1, %xmm2, %xmm2 # and in the block to be hashed
472 vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
473 vpxor %xmm2, %xmm14, %xmm14 # fold the input block into the hash
474
475 vmovdqu %xmm14, AadHash(arg2)
476 .else
477 vpxor %xmm1, %xmm9, %xmm9 # xmm9 = input XOR encrypted counter
478 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # byte mask: 0xff in the low r13 bytes
479
480 vpand %xmm1, %xmm9, %xmm9 # clear the bytes beyond r13
481 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
482 vpxor %xmm9, %xmm14, %xmm14 # fold the output block into the hash
483
484 vmovdqu %xmm14, AadHash(arg2)
485 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # back to memory byte order for the store
486 .endif
487
488
489
490 # write the r13 output bytes: 8 at once if more than 8 remain, then byte by byte
491 vmovq %xmm9, %rax
492 cmp $8, %r13
493 jle _less_than_8_bytes_left\@
494
495 mov %rax, (arg3 , %r11)
496 add $8, %r11
497 vpsrldq $8, %xmm9, %xmm9
498 vmovq %xmm9, %rax
499 sub $8, %r13
500
501_less_than_8_bytes_left\@:
502 movb %al, (arg3 , %r11)
503 add $1, %r11
504 shr $8, %rax
505 sub $1, %r13
506 jne _less_than_8_bytes_left\@
507
508
509_multiple_of_16_bytes\@:
510.endm
511
512
513
514
515
/*
 * GCM_COMPLETE GHASH_MUL REP AUTH_TAG AUTH_TAG_LEN
 *
 * Finish the hash and write the authentication tag.
 *   - if a partial block is pending (PBlockLen != 0) the hash is multiplied
 *     by the hash key once more,
 *   - the length block (AadLen * 8 in the high qword, InLen * 8 in the low
 *     qword) is folded in and multiplied,
 *   - the result is byte-reversed and XORed with the encryption of OrigIV,
 *   - the first \AUTH_TAG_LEN bytes of that value are stored at \AUTH_TAG.
 *
 * Context is read through arg2, round keys through arg1.
 * Clobbers rax, r10, r11, r12, xmm0, xmm1, xmm5, xmm6, xmm9, xmm10, xmm11,
 * xmm13, xmm14, xmm15, flags and the assemble-time symbol i.
 *
 * Changes against the previous version:
 *   - the AAD bit length is moved with vmovq instead of vmovd, so it is no
 *     longer truncated to 32 bits,
 *   - a 4-byte store is only done when at least 4 tag bytes remain, so tag
 *     lengths of 1-3 and 9-11 bytes no longer write past the tag buffer.
 */
.macro GCM_COMPLETE GHASH_MUL REP AUTH_TAG AUTH_TAG_LEN
	vmovdqu	AadHash(arg2), %xmm14
	vmovdqu	HashKey(arg2), %xmm13

	mov	PBlockLen(arg2), %r12
	test	%r12, %r12
	je	_partial_done\@

	# a partial block was XORed into the hash but not multiplied yet
	\GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6

_partial_done\@:
	mov	AadLen(arg2), %r12
	shl	$3, %r12			# AAD length in bits
	vmovq	%r12, %xmm15			# full 64-bit value

	mov	InLen(arg2), %r12
	shl	$3, %r12			# data length in bits
	vmovq	%r12, %xmm1
	vpslldq	$8, %xmm15, %xmm15		# AAD bits -> high qword
	vpxor	%xmm1, %xmm15, %xmm15		# xmm15 = AAD bits : data bits

	vpxor	%xmm15, %xmm14, %xmm14
	\GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
	vpshufb	SHUF_MASK(%rip), %xmm14, %xmm14	# byte-reverse the final hash

	vmovdqu	OrigIV(arg2), %xmm9

	ENCRYPT_SINGLE_BLOCK \REP, %xmm9	# xmm9 = E(K, OrigIV)

	vpxor	%xmm14, %xmm9, %xmm9		# xmm9 = full 16-byte tag

_return_T\@:
	mov	\AUTH_TAG, %r10			# r10 = write cursor
	mov	\AUTH_TAG_LEN, %r11		# r11 = tag bytes still to write

	cmp	$16, %r11
	je	_T_16\@

	cmp	$8, %r11
	jl	_T_4\@

_T_8\@:
	vmovq	%xmm9, %rax
	mov	%rax, (%r10)
	add	$8, %r10
	sub	$8, %r11
	vpsrldq	$8, %xmm9, %xmm9
	test	%r11, %r11
	je	_return_T_done\@
_T_4\@:
	cmp	$4, %r11
	jl	_T_123\@			# fewer than 4 bytes left: no dword store
	vmovd	%xmm9, %eax
	mov	%eax, (%r10)
	add	$4, %r10
	sub	$4, %r11
	vpsrldq	$4, %xmm9, %xmm9
_T_123\@:
	test	%r11, %r11
	je	_return_T_done\@
	vmovd	%xmm9, %eax
	cmp	$2, %r11
	jl	_T_1\@
	mov	%ax, (%r10)
	cmp	$2, %r11
	je	_return_T_done\@
	add	$2, %r10
	sar	$16, %eax			# only al is used below
_T_1\@:
	mov	%al, (%r10)
	jmp	_return_T_done\@

_T_16\@:
	vmovdqu	%xmm9, (%r10)

_return_T_done\@:
.endm
594
/*
 * CALC_AAD_HASH GHASH_MUL AAD AADLEN T1 T2 T3 T4 T5 T6 T7 T8
 *
 * Hash \AADLEN bytes of additional authenticated data at \AAD and store the
 * result in AadHash(arg2).
 *   \T2      hash key (input, not modified here)
 *   \T7      holds the result on exit
 *   \T8      running hash accumulator
 *   \T1, \T3-\T6  scratch passed to \GHASH_MUL
 *
 * Whole 16-byte blocks are byte-reversed, XORed into the accumulator and
 * multiplied by the hash key.  A trailing block shorter than 16 bytes is
 * fetched with READ_PARTIAL_BLOCK, which never reads beyond the last AAD
 * byte, zero padded, and hashed the same way.
 *
 * Clobbers r10, r11, rax (only when a short trailing block exists) and
 * flags.
 *
 * Changes against the previous version: the tail used 8- and 4-byte loads
 * that could read up to 3 bytes past the end of the AAD buffer and then
 * fixed the data up through an absolutely addressed shuffle table; it also
 * used a legacy-SSE movq.  The value stored in AadHash is unchanged.
 */
.macro CALC_AAD_HASH GHASH_MUL AAD AADLEN T1 T2 T3 T4 T5 T6 T7 T8
	mov	\AAD, %r10			# r10 = read cursor
	mov	\AADLEN, %r11			# r11 = bytes left

	vpxor	\T8, \T8, \T8			# accumulator = 0
	vpxor	\T7, \T7, \T7			# result for the empty-AAD case
	cmp	$16, %r11
	jl	_get_AAD_rest\@

_get_AAD_blocks\@:
	vmovdqu	(%r10), \T7
	vpshufb	SHUF_MASK(%rip), \T7, \T7	# byte-reverse the block
	vpxor	\T7, \T8, \T8
	\GHASH_MUL \T8, \T2, \T1, \T3, \T4, \T5, \T6
	add	$16, %r10
	sub	$16, %r11
	cmp	$16, %r11
	jge	_get_AAD_blocks\@
	vmovdqu	\T8, \T7			# result so far

_get_AAD_rest\@:
	test	%r11, %r11
	je	_get_AAD_done\@			# nothing left: T7 already holds the result

	# 1..15 bytes left: read exactly r11 bytes, zero padded, into T7
	READ_PARTIAL_BLOCK %r10, %r11, \T7
	vpshufb	SHUF_MASK(%rip), \T7, \T7
	vpxor	\T8, \T7, \T7
	\GHASH_MUL \T7, \T2, \T1, \T3, \T4, \T5, \T6

_get_AAD_done\@:
	vmovdqu	\T7, AadHash(arg2)
.endm
662
/*
 * INIT GHASH_MUL PRECOMPUTE
 *
 * Initialise the context at arg2 for a new message.
 *   arg3  pointer to the 16 bytes copied to OrigIV
 *   arg4  pointer to the 16-byte raw hash subkey
 *   arg5  pointer to the AAD
 *   arg6  AAD length in bytes
 *
 * Steps:
 *   - AadLen = arg6; InLen, PBlockLen and the low 8 bytes of PBlockEncKey
 *     are cleared,
 *   - OrigIV = 16 bytes at arg3, CurCount = the same bytes byte-reversed,
 *   - HashKey = byte-reversed subkey shifted left by one bit, with POLY
 *     XORed in when the shifted-out bit was set,
 *   - the AAD is hashed into AadHash (CALC_AAD_HASH),
 *   - \PRECOMPUTE is expanded with the hash key in xmm6.
 *
 * Clobbers r11, xmm0-xmm7, flags and whatever CALC_AAD_HASH and
 * \PRECOMPUTE clobber.
 *
 * Change against the previous version: the three legacy-SSE movdqu
 * instructions are now VEX-encoded vmovdqu, like the rest of the macro;
 * the values stored are the same.
 */
.macro INIT GHASH_MUL PRECOMPUTE
	mov	arg6, %r11
	mov	%r11, AadLen(arg2)		# ctx.AadLen = aad length
	xor	%r11d, %r11d
	mov	%r11, InLen(arg2)		# ctx.InLen = 0

	mov	%r11, PBlockLen(arg2)		# ctx.PBlockLen = 0
	mov	%r11, PBlockEncKey(arg2)	# clears the low 8 bytes only
	vmovdqu	(arg3), %xmm0
	vmovdqu	%xmm0, OrigIV(arg2)

	vpshufb	SHUF_MASK(%rip), %xmm0, %xmm0
	vmovdqu	%xmm0, CurCount(arg2)		# counter kept byte-reversed

	vmovdqu	(arg4), %xmm6			# raw hash subkey

	vpshufb	SHUF_MASK(%rip), %xmm6, %xmm6

	# xmm6 = xmm6 << 1 across the full 128 bits
	vmovdqa	%xmm6, %xmm2
	vpsllq	$1, %xmm6, %xmm6
	vpsrlq	$63, %xmm2, %xmm2		# bit shifted out of each qword
	vmovdqa	%xmm2, %xmm1
	vpslldq	$8, %xmm2, %xmm2		# carry of the low qword -> bit 64
	vpsrldq	$8, %xmm1, %xmm1		# carry out of bit 127
	vpor	%xmm2, %xmm6, %xmm6

	# XOR in POLY when a bit was shifted out of bit 127
	vpshufd	$0b00100100, %xmm1, %xmm2
	vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
	vpand	POLY(%rip), %xmm2, %xmm2
	vpxor	%xmm2, %xmm6, %xmm6

	vmovdqu	%xmm6, HashKey(arg2)

	CALC_AAD_HASH \GHASH_MUL, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0

	\PRECOMPUTE %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
.endm
701
702
703
704
705
/*
 * READ_PARTIAL_BLOCK DPTR DLEN XMMDst
 *
 * Load \DLEN bytes (0..15) starting at \DPTR into the low bytes of
 * \XMMDst, in memory order, and zero the remaining bytes.  No byte at or
 * beyond \DPTR + \DLEN is read.
 *
 *   \DPTR    register holding the source address (not modified)
 *   \DLEN    register holding the byte count; clobbered
 *   \XMMDst  destination vector register
 * Clobbers rax and flags.
 *
 * Change against the previous version: a length of zero now returns an
 * all-zero vector.  Previously it entered the byte loop, whose counter was
 * decremented through zero, so the loop did not terminate in any
 * reasonable time and read memory below \DPTR.
 */
.macro READ_PARTIAL_BLOCK DPTR DLEN XMMDst
	vpxor	\XMMDst, \XMMDst, \XMMDst

	cmp	$8, \DLEN
	jl	_read_lt8_\@
	mov	(\DPTR), %rax			# first 8 bytes in one load
	vpinsrq	$0, %rax, \XMMDst, \XMMDst
	sub	$8, \DLEN
	jz	_done_read_partial_block_\@
	xor	%eax, %eax
_read_next_byte_\@:
	# gather bytes 8..8+DLEN-1, last byte first, into rax
	shl	$8, %rax
	mov	7(\DPTR, \DLEN, 1), %al
	dec	\DLEN
	jnz	_read_next_byte_\@
	vpinsrq	$1, %rax, \XMMDst, \XMMDst
	jmp	_done_read_partial_block_\@
_read_lt8_\@:
	xor	%eax, %eax
	test	\DLEN, \DLEN
	jz	_done_read_partial_block_\@	# nothing to read, result stays zero
_read_next_byte_lt8_\@:
	# gather bytes 0..DLEN-1, last byte first, into rax
	shl	$8, %rax
	mov	-1(\DPTR, \DLEN, 1), %al
	dec	\DLEN
	jnz	_read_next_byte_lt8_\@
	vpinsrq	$0, %rax, \XMMDst, \XMMDst
_done_read_partial_block_\@:
.endm
733
734
735
736
737
738
/*
 * PARTIAL_BLOCK - continue a 16-byte block left unfinished by an earlier
 * call.
 *
 *   GHASH_MUL       name of the GHASH multiply macro
 *   CYPH_PLAIN_OUT  register: output pointer
 *   PLAIN_CYPH_IN   register: input pointer
 *   PLAIN_CYPH_LEN  register: input length in bytes
 *   DATA_OFFSET     register: offset into input/output; advanced by the
 *                   number of bytes written
 *   AAD_HASH        vector register holding the running hash; updated and
 *                   written back to AadHash(arg2)
 *   ENC_DEC         ENC or DEC
 *
 * Does nothing when PBlockLen(arg2) is zero.  Otherwise the saved encrypted
 * counter block (PBlockEncKey) is shifted by PBlockLen bytes and XORed with
 * the new input; the bytes are folded into the hash, which is multiplied by
 * the hash key only once the block reaches 16 bytes.  PBlockLen is reset to
 * zero when the block completes, else increased by the input length.
 *
 * Clobbers rax, r10, r12, r13, xmm0-xmm3, xmm5, xmm6, xmm9-xmm11, xmm13
 * and flags.
 *
 * Changes against the previous version:
 *   - a zero-length input is a no-op; it used to reach READ_PARTIAL_BLOCK
 *     and the byte store loop with a count of zero, and both loops then
 *     decremented through zero,
 *   - the two legacy-SSE instructions (pxor, psrldq) are VEX encoded like
 *     the rest of the macro.
 */
.macro PARTIAL_BLOCK GHASH_MUL CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
		AAD_HASH ENC_DEC
	mov	PBlockLen(arg2), %r13
	test	%r13, %r13
	je	_partial_block_done_\@		# no block pending
	test	\PLAIN_CYPH_LEN, \PLAIN_CYPH_LEN
	je	_partial_block_done_\@		# no input: leave the state untouched

	cmp	$16, \PLAIN_CYPH_LEN
	jl	_fewer_than_16_bytes_\@
	vmovdqu	(\PLAIN_CYPH_IN), %xmm1		# at least 16 bytes are available
	jmp	_data_read_\@

_fewer_than_16_bytes_\@:
	lea	(\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
	mov	\PLAIN_CYPH_LEN, %r12
	READ_PARTIAL_BLOCK %r10 %r12 %xmm1

	mov	PBlockLen(arg2), %r13

_data_read_\@:
	vmovdqu	PBlockEncKey(arg2), %xmm9	# encrypted counter of the pending block
	vmovdqu	HashKey(arg2), %xmm13

	lea	SHIFT_MASK(%rip), %r12

	# shift the key stream right by r13 bytes so that its unused part
	# lines up with the new input
	add	%r13, %r12
	vmovdqu	(%r12), %xmm2
	vpshufb	%xmm2, %xmm9, %xmm9

.if  \ENC_DEC ==  DEC
	vmovdqa	%xmm1, %xmm3			# keep the ciphertext for hashing
	vpxor	%xmm1, %xmm9, %xmm9		# xmm9 = plaintext bytes

	mov	\PLAIN_CYPH_LEN, %r10
	add	%r13, %r10
	# r10 = (bytes now in the block) - 16; negative while still incomplete
	sub	$16, %r10
	jge	_no_extra_mask_1_\@
	sub	%r10, %r12			# also mask off the bytes still missing
_no_extra_mask_1_\@:

	vmovdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
	# keep only the bytes that belong to this call
	vpand	%xmm1, %xmm9, %xmm9

	vpand	%xmm1, %xmm3, %xmm3
	vmovdqa	SHUF_MASK(%rip), %xmm10
	vpshufb	%xmm10, %xmm3, %xmm3
	vpshufb	%xmm2, %xmm3, %xmm3
	vpxor	%xmm3, \AAD_HASH, \AAD_HASH

	test	%r10, %r10
	jl	_partial_incomplete_1_\@

	# the block is complete: multiply and clear the pending length
	\GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
	xor	%eax,%eax

	mov	%rax, PBlockLen(arg2)
	jmp	_dec_done_\@
_partial_incomplete_1_\@:
	add	\PLAIN_CYPH_LEN, PBlockLen(arg2)
_dec_done_\@:
	vmovdqu	\AAD_HASH, AadHash(arg2)
.else
	vpxor	%xmm1, %xmm9, %xmm9		# xmm9 = ciphertext bytes

	mov	\PLAIN_CYPH_LEN, %r10
	add	%r13, %r10
	# r10 = (bytes now in the block) - 16; negative while still incomplete
	sub	$16, %r10
	jge	_no_extra_mask_2_\@
	sub	%r10, %r12			# also mask off the bytes still missing
_no_extra_mask_2_\@:

	vmovdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
	# keep only the bytes that belong to this call
	vpand	%xmm1, %xmm9, %xmm9

	vmovdqa	SHUF_MASK(%rip), %xmm1
	vpshufb	%xmm1, %xmm9, %xmm9
	vpshufb	%xmm2, %xmm9, %xmm9
	vpxor	%xmm9, \AAD_HASH, \AAD_HASH

	test	%r10, %r10
	jl	_partial_incomplete_2_\@

	# the block is complete: multiply and clear the pending length
	\GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
	xor	%eax,%eax

	mov	%rax, PBlockLen(arg2)
	jmp	_encode_done_\@
_partial_incomplete_2_\@:
	add	\PLAIN_CYPH_LEN, PBlockLen(arg2)
_encode_done_\@:
	vmovdqu	\AAD_HASH, AadHash(arg2)

	vmovdqa	SHUF_MASK(%rip), %xmm10
	# undo the two shuffles applied for hashing before storing the output
	vpshufb	%xmm10, %xmm9, %xmm9
	vpshufb	%xmm2, %xmm9, %xmm9
.endif

	# r13 = number of output bytes to write
	test	%r10, %r10
	jl	_partial_fill_\@
	mov	%r13, %r12
	mov	$16, %r13
	# block completed: 16 - (previous partial length) bytes
	sub	%r12, %r13
	jmp	_count_set_\@
_partial_fill_\@:
	mov	\PLAIN_CYPH_LEN, %r13		# still incomplete: all input bytes
_count_set_\@:
	vmovdqa	%xmm9, %xmm0
	vmovq	%xmm0, %rax
	cmp	$8, %r13
	jle	_less_than_8_bytes_left_\@

	mov	%rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
	add	$8, \DATA_OFFSET
	vpsrldq	$8, %xmm0, %xmm0
	vmovq	%xmm0, %rax
	sub	$8, %r13
_less_than_8_bytes_left_\@:
	movb	%al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
	add	$1, \DATA_OFFSET
	shr	$8, %rax
	sub	$1, %r13
	jne	_less_than_8_bytes_left_\@
_partial_block_done_\@:
.endm
877
878
879
880
881
882
883
884
/*
 * GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5
 *
 * Carry-less multiply of \GH by \HK followed by a reduction of the 256-bit
 * product back to 128 bits; the result replaces \GH.
 *   \GH      in/out operand
 *   \HK      second operand (not modified)
 *   \T1-\T5  scratch registers, clobbered
 * The 128x128 product is built from three vpclmulqdq (high*high, low*low
 * and (high^low)*(high^low)), then reduced in two shift-and-XOR phases.
 * No general purpose register and no flag is touched.
 * NOTE(review): all seven register arguments are distinct at the call sites
 * visible in this file; the macro presumably relies on that - confirm.
 */
885.macro GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5
886
887 vpshufd $0b01001110, \GH, \T2 # T2 = GH with its 64-bit halves swapped
888 vpshufd $0b01001110, \HK, \T3 # T3 = HK with its 64-bit halves swapped
889 vpxor \GH , \T2, \T2 # T2 = GH.high ^ GH.low (in both halves)
890 vpxor \HK , \T3, \T3 # T3 = HK.high ^ HK.low (in both halves)
891
892 vpclmulqdq $0x11, \HK, \GH, \T1 # T1 = GH.high * HK.high
893 vpclmulqdq $0x00, \HK, \GH, \GH # GH = GH.low * HK.low
894 vpclmulqdq $0x00, \T3, \T2, \T2 # T2 = (GH.high^GH.low) * (HK.high^HK.low)
895 vpxor \GH, \T2,\T2
896 vpxor \T1, \T2,\T2 # T2 = middle term of the product
897
898 vpslldq $8, \T2,\T3 # low half of the middle term, moved up
899 vpsrldq $8, \T2,\T2 # high half of the middle term, moved down
900 vpxor \T3, \GH, \GH
901 vpxor \T2, \T1, \T1 # T1:GH = 256-bit product
902
903 # first reduction phase
904 vpslld $31, \GH, \T2
905 vpslld $30, \GH, \T3
906 vpslld $25, \GH, \T4
907
908 vpxor \T3, \T2, \T2
909 vpxor \T4, \T2, \T2
910
911 vpsrldq $4, \T2, \T5 # part carried over into the second phase
912
913 vpslldq $12, \T2, \T2
914 vpxor \T2, \GH, \GH
915
916
917 # second reduction phase
918 vpsrld $1,\GH, \T2
919 vpsrld $2,\GH, \T3
920 vpsrld $7,\GH, \T4
921 vpxor \T3, \T2, \T2
922 vpxor \T4, \T2, \T2
923
924 vpxor \T5, \T2, \T2
925 vpxor \T2, \GH, \GH
926 vpxor \T1, \GH, \GH # fold in the high 128 bits: GH = reduced product
927
928
929.endm
930
/*
 * PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6
 *
 * Fill the hash key table in the context at arg2.
 *   \HK      hash key (input, not modified)
 *   \T1-\T6  scratch registers, clobbered
 * \T5 starts as a copy of \HK and is multiplied by \HK seven times with
 * GHASH_MUL_AVX; after the n-th multiplication it is stored as
 * HashKey_(n+1).  For every entry, including HashKey itself, the XOR of its
 * two 64-bit halves is stored in the matching HashKey_*_k slot.
 * HashKey(arg2) itself is not written here.
 */
931.macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6
932
933
934 vmovdqa \HK, \T5
935
936 vpshufd $0b01001110, \T5, \T1 # swap the 64-bit halves
937 vpxor \T5, \T1, \T1 # T1 = high half ^ low half
938 vmovdqu \T1, HashKey_k(arg2)
939
940 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = T5 * HK
941 vmovdqu \T5, HashKey_2(arg2)
942 vpshufd $0b01001110, \T5, \T1
943 vpxor \T5, \T1, \T1
944 vmovdqu \T1, HashKey_2_k(arg2)
945
946 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = T5 * HK
947 vmovdqu \T5, HashKey_3(arg2)
948 vpshufd $0b01001110, \T5, \T1
949 vpxor \T5, \T1, \T1
950 vmovdqu \T1, HashKey_3_k(arg2)
951
952 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = T5 * HK
953 vmovdqu \T5, HashKey_4(arg2)
954 vpshufd $0b01001110, \T5, \T1
955 vpxor \T5, \T1, \T1
956 vmovdqu \T1, HashKey_4_k(arg2)
957
958 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = T5 * HK
959 vmovdqu \T5, HashKey_5(arg2)
960 vpshufd $0b01001110, \T5, \T1
961 vpxor \T5, \T1, \T1
962 vmovdqu \T1, HashKey_5_k(arg2)
963
964 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = T5 * HK
965 vmovdqu \T5, HashKey_6(arg2)
966 vpshufd $0b01001110, \T5, \T1
967 vpxor \T5, \T1, \T1
968 vmovdqu \T1, HashKey_6_k(arg2)
969
970 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = T5 * HK
971 vmovdqu \T5, HashKey_7(arg2)
972 vpshufd $0b01001110, \T5, \T1
973 vpxor \T5, \T1, \T1
974 vmovdqu \T1, HashKey_7_k(arg2)
975
976 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = T5 * HK
977 vmovdqu \T5, HashKey_8(arg2)
978 vpshufd $0b01001110, \T5, \T1
979 vpxor \T5, \T1, \T1
980 vmovdqu \T1, HashKey_8_k(arg2)
981
982.endm
983
984
985
986
987
988
989
990
/*
 * INITIAL_BLOCKS_AVX - process the first \num_initial_blocks (0..7) blocks
 * and, when at least 128 bytes remain (r13 >= 128), encrypt the next eight
 * blocks without hashing them yet.
 *
 *   REP                 number of vaesenc rounds per block
 *   num_initial_blocks  assemble-time constant, 0..7
 *   T1-T6, T_key        scratch vector registers (T2 is used as the hash
 *                       key operand of GHASH_MUL_AVX, T3 receives a copy of
 *                       the running hash)
 *   CTR                 counter block register, loaded from CurCount(arg2)
 *   XMM1-XMM8           block registers
 *   ENC_DEC             ENC or DEC
 *
 * The code addresses blocks through reg_i / reg_j, i.e. %xmm<i> and
 * %xmm<j>, and therefore assumes that XMMn is %xmmn: the initial blocks
 * live in %xmm(9-n)..%xmm8 and the hash loaded from AadHash in %xmm(8-n).
 * After the hashing chain the running hash is in \XMM8; it is saved in
 * TMP1(%rsp) and \T3 and, on the 8-block path, XORed into \XMM1.
 *
 * Uses arg1 (round keys, read with aligned loads), arg2 (context), arg3
 * (output), arg4 (input), r11 (data offset, advanced) and r13 (remaining
 * length, read only).  Changes the assemble-time symbols i and j.
 */
991.macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
992 i = (8-\num_initial_blocks)
993 setreg
994 vmovdqu AadHash(arg2), reg_i # running hash goes just below the first block register
995
996 # load the current counter
997 vmovdqu CurCount(arg2), \CTR
998
999 i = (9-\num_initial_blocks)
1000 setreg
1001.rep \num_initial_blocks
1002 vpaddd ONE(%rip), \CTR, \CTR # counter += 1
1003 vmovdqa \CTR, reg_i
1004 vpshufb SHUF_MASK(%rip), reg_i, reg_i # byte-reverse the counter block before encrypting
1005 i = (i+1)
1006 setreg
1007.endr
1008
1009 vmovdqa (arg1), \T_key # round key 0
1010 i = (9-\num_initial_blocks)
1011 setreg
1012.rep \num_initial_blocks
1013 vpxor \T_key, reg_i, reg_i
1014 i = (i+1)
1015 setreg
1016.endr
1017
1018 j = 1
1019 setreg
1020.rep \REP
1021 vmovdqa 16*j(arg1), \T_key # round key j
1022 i = (9-\num_initial_blocks)
1023 setreg
1024.rep \num_initial_blocks
1025 vaesenc \T_key, reg_i, reg_i
1026 i = (i+1)
1027 setreg
1028.endr
1029
1030 j = (j+1)
1031 setreg
1032.endr
1033
1034 vmovdqa 16*j(arg1), \T_key # last round key
1035 i = (9-\num_initial_blocks)
1036 setreg
1037.rep \num_initial_blocks
1038 vaesenclast \T_key, reg_i, reg_i
1039 i = (i+1)
1040 setreg
1041.endr
1042
1043 i = (9-\num_initial_blocks)
1044 setreg
1045.rep \num_initial_blocks
1046 vmovdqu (arg4, %r11), \T1 # next 16 input bytes
1047 vpxor \T1, reg_i, reg_i
1048 vmovdqu reg_i, (arg3 , %r11) # store the output block
1049 add $16, %r11
1050.if \ENC_DEC == DEC
1051 vmovdqa \T1, reg_i # when decrypting, the input block is what gets hashed
1052.endif
1053 vpshufb SHUF_MASK(%rip), reg_i, reg_i # byte-reverse for hashing
1054 i = (i+1)
1055 setreg
1056.endr
1057
1058 # hash the blocks in order; the running hash moves up one register per step
1059 i = (8-\num_initial_blocks)
1060 j = (9-\num_initial_blocks)
1061 setreg
1062
1063.rep \num_initial_blocks
1064 vpxor reg_i, reg_j, reg_j
1065 GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6
1066 i = (i+1)
1067 j = (j+1)
1068 setreg
1069.endr
1070
1071 # the running hash is now in XMM8
1072 vmovdqa \XMM8, TMP1(%rsp)
1073 vmovdqa \XMM8, \T3
1074
1075 cmp $128, %r13
1076 jl _initial_blocks_done\@ # fewer than 128 bytes remain: no 8-block group
1077
1078
1079 # prepare eight counter blocks
1080 vpaddd ONE(%rip), \CTR, \CTR
1081 vmovdqa \CTR, \XMM1
1082 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1
1083
1084 vpaddd ONE(%rip), \CTR, \CTR
1085 vmovdqa \CTR, \XMM2
1086 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2
1087
1088 vpaddd ONE(%rip), \CTR, \CTR
1089 vmovdqa \CTR, \XMM3
1090 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3
1091
1092 vpaddd ONE(%rip), \CTR, \CTR
1093 vmovdqa \CTR, \XMM4
1094 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4
1095
1096 vpaddd ONE(%rip), \CTR, \CTR
1097 vmovdqa \CTR, \XMM5
1098 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5
1099
1100 vpaddd ONE(%rip), \CTR, \CTR
1101 vmovdqa \CTR, \XMM6
1102 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6
1103
1104 vpaddd ONE(%rip), \CTR, \CTR
1105 vmovdqa \CTR, \XMM7
1106 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7
1107
1108 vpaddd ONE(%rip), \CTR, \CTR
1109 vmovdqa \CTR, \XMM8
1110 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8
1111
1112 vmovdqa (arg1), \T_key # round key 0
1113 vpxor \T_key, \XMM1, \XMM1
1114 vpxor \T_key, \XMM2, \XMM2
1115 vpxor \T_key, \XMM3, \XMM3
1116 vpxor \T_key, \XMM4, \XMM4
1117 vpxor \T_key, \XMM5, \XMM5
1118 vpxor \T_key, \XMM6, \XMM6
1119 vpxor \T_key, \XMM7, \XMM7
1120 vpxor \T_key, \XMM8, \XMM8
1121
1122 i = 1
1123 setreg
1124.rep \REP
1125 vmovdqa 16*i(arg1), \T_key # round key i
1126 vaesenc \T_key, \XMM1, \XMM1
1127 vaesenc \T_key, \XMM2, \XMM2
1128 vaesenc \T_key, \XMM3, \XMM3
1129 vaesenc \T_key, \XMM4, \XMM4
1130 vaesenc \T_key, \XMM5, \XMM5
1131 vaesenc \T_key, \XMM6, \XMM6
1132 vaesenc \T_key, \XMM7, \XMM7
1133 vaesenc \T_key, \XMM8, \XMM8
1134 i = (i+1)
1135 setreg
1136.endr
1137
1138 vmovdqa 16*i(arg1), \T_key # last round key
1139 vaesenclast \T_key, \XMM1, \XMM1
1140 vaesenclast \T_key, \XMM2, \XMM2
1141 vaesenclast \T_key, \XMM3, \XMM3
1142 vaesenclast \T_key, \XMM4, \XMM4
1143 vaesenclast \T_key, \XMM5, \XMM5
1144 vaesenclast \T_key, \XMM6, \XMM6
1145 vaesenclast \T_key, \XMM7, \XMM7
1146 vaesenclast \T_key, \XMM8, \XMM8
1147
1148 vmovdqu (arg4, %r11), \T1 # block 1: XOR with input, store, keep input when decrypting
1149 vpxor \T1, \XMM1, \XMM1
1150 vmovdqu \XMM1, (arg3 , %r11)
1151 .if \ENC_DEC == DEC
1152 vmovdqa \T1, \XMM1
1153 .endif
1154
1155 vmovdqu 16*1(arg4, %r11), \T1 # block 2
1156 vpxor \T1, \XMM2, \XMM2
1157 vmovdqu \XMM2, 16*1(arg3 , %r11)
1158 .if \ENC_DEC == DEC
1159 vmovdqa \T1, \XMM2
1160 .endif
1161
1162 vmovdqu 16*2(arg4, %r11), \T1 # block 3
1163 vpxor \T1, \XMM3, \XMM3
1164 vmovdqu \XMM3, 16*2(arg3 , %r11)
1165 .if \ENC_DEC == DEC
1166 vmovdqa \T1, \XMM3
1167 .endif
1168
1169 vmovdqu 16*3(arg4, %r11), \T1 # block 4
1170 vpxor \T1, \XMM4, \XMM4
1171 vmovdqu \XMM4, 16*3(arg3 , %r11)
1172 .if \ENC_DEC == DEC
1173 vmovdqa \T1, \XMM4
1174 .endif
1175
1176 vmovdqu 16*4(arg4, %r11), \T1 # block 5
1177 vpxor \T1, \XMM5, \XMM5
1178 vmovdqu \XMM5, 16*4(arg3 , %r11)
1179 .if \ENC_DEC == DEC
1180 vmovdqa \T1, \XMM5
1181 .endif
1182
1183 vmovdqu 16*5(arg4, %r11), \T1 # block 6
1184 vpxor \T1, \XMM6, \XMM6
1185 vmovdqu \XMM6, 16*5(arg3 , %r11)
1186 .if \ENC_DEC == DEC
1187 vmovdqa \T1, \XMM6
1188 .endif
1189
1190 vmovdqu 16*6(arg4, %r11), \T1 # block 7
1191 vpxor \T1, \XMM7, \XMM7
1192 vmovdqu \XMM7, 16*6(arg3 , %r11)
1193 .if \ENC_DEC == DEC
1194 vmovdqa \T1, \XMM7
1195 .endif
1196
1197 vmovdqu 16*7(arg4, %r11), \T1 # block 8
1198 vpxor \T1, \XMM8, \XMM8
1199 vmovdqu \XMM8, 16*7(arg3 , %r11)
1200 .if \ENC_DEC == DEC
1201 vmovdqa \T1, \XMM8
1202 .endif
1203
1204 add $128, %r11 # eight blocks consumed
1205
1206 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # byte-reverse all eight blocks for hashing
1207 vpxor TMP1(%rsp), \XMM1, \XMM1 # fold the running hash into the first block
1208 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2
1209 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3
1210 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4
1211 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5
1212 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6
1213 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7
1214 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8
1215
1216
1217
1218_initial_blocks_done\@:
1219
1220.endm
1221
1222
1223
1224
1225
1226.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
1227
1228 vmovdqa \XMM1, \T2
1229 vmovdqa \XMM2, TMP2(%rsp)
1230 vmovdqa \XMM3, TMP3(%rsp)
1231 vmovdqa \XMM4, TMP4(%rsp)
1232 vmovdqa \XMM5, TMP5(%rsp)
1233 vmovdqa \XMM6, TMP6(%rsp)
1234 vmovdqa \XMM7, TMP7(%rsp)
1235 vmovdqa \XMM8, TMP8(%rsp)
1236
1237.if \loop_idx == in_order
1238 vpaddd ONE(%rip), \CTR, \XMM1
1239 vpaddd ONE(%rip), \XMM1, \XMM2
1240 vpaddd ONE(%rip), \XMM2, \XMM3
1241 vpaddd ONE(%rip), \XMM3, \XMM4
1242 vpaddd ONE(%rip), \XMM4, \XMM5
1243 vpaddd ONE(%rip), \XMM5, \XMM6
1244 vpaddd ONE(%rip), \XMM6, \XMM7
1245 vpaddd ONE(%rip), \XMM7, \XMM8
1246 vmovdqa \XMM8, \CTR
1247
1248 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1
1249 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2
1250 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3
1251 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4
1252 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5
1253 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6
1254 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7
1255 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8
1256.else
1257 vpaddd ONEf(%rip), \CTR, \XMM1
1258 vpaddd ONEf(%rip), \XMM1, \XMM2
1259 vpaddd ONEf(%rip), \XMM2, \XMM3
1260 vpaddd ONEf(%rip), \XMM3, \XMM4
1261 vpaddd ONEf(%rip), \XMM4, \XMM5
1262 vpaddd ONEf(%rip), \XMM5, \XMM6
1263 vpaddd ONEf(%rip), \XMM6, \XMM7
1264 vpaddd ONEf(%rip), \XMM7, \XMM8
1265 vmovdqa \XMM8, \CTR
1266.endif
1267
1268
1269
1270
1271 vmovdqu (arg1), \T1
1272 vpxor \T1, \XMM1, \XMM1
1273 vpxor \T1, \XMM2, \XMM2
1274 vpxor \T1, \XMM3, \XMM3
1275 vpxor \T1, \XMM4, \XMM4
1276 vpxor \T1, \XMM5, \XMM5
1277 vpxor \T1, \XMM6, \XMM6
1278 vpxor \T1, \XMM7, \XMM7
1279 vpxor \T1, \XMM8, \XMM8
1280
1281
1282
1283
1284
1285
1286
1287 vmovdqu 16*1(arg1), \T1
1288 vaesenc \T1, \XMM1, \XMM1
1289 vaesenc \T1, \XMM2, \XMM2
1290 vaesenc \T1, \XMM3, \XMM3
1291 vaesenc \T1, \XMM4, \XMM4
1292 vaesenc \T1, \XMM5, \XMM5
1293 vaesenc \T1, \XMM6, \XMM6
1294 vaesenc \T1, \XMM7, \XMM7
1295 vaesenc \T1, \XMM8, \XMM8
1296
1297 vmovdqu 16*2(arg1), \T1
1298 vaesenc \T1, \XMM1, \XMM1
1299 vaesenc \T1, \XMM2, \XMM2
1300 vaesenc \T1, \XMM3, \XMM3
1301 vaesenc \T1, \XMM4, \XMM4
1302 vaesenc \T1, \XMM5, \XMM5
1303 vaesenc \T1, \XMM6, \XMM6
1304 vaesenc \T1, \XMM7, \XMM7
1305 vaesenc \T1, \XMM8, \XMM8
1306
1307
1308
1309
1310 vmovdqu HashKey_8(arg2), \T5
1311 vpclmulqdq $0x11, \T5, \T2, \T4
1312 vpclmulqdq $0x00, \T5, \T2, \T7
1313
1314 vpshufd $0b01001110, \T2, \T6
1315 vpxor \T2, \T6, \T6
1316
1317 vmovdqu HashKey_8_k(arg2), \T5
1318 vpclmulqdq $0x00, \T5, \T6, \T6
1319
1320 vmovdqu 16*3(arg1), \T1
1321 vaesenc \T1, \XMM1, \XMM1
1322 vaesenc \T1, \XMM2, \XMM2
1323 vaesenc \T1, \XMM3, \XMM3
1324 vaesenc \T1, \XMM4, \XMM4
1325 vaesenc \T1, \XMM5, \XMM5
1326 vaesenc \T1, \XMM6, \XMM6
1327 vaesenc \T1, \XMM7, \XMM7
1328 vaesenc \T1, \XMM8, \XMM8
1329
1330 vmovdqa TMP2(%rsp), \T1
1331 vmovdqu HashKey_7(arg2), \T5
1332 vpclmulqdq $0x11, \T5, \T1, \T3
1333 vpxor \T3, \T4, \T4
1334 vpclmulqdq $0x00, \T5, \T1, \T3
1335 vpxor \T3, \T7, \T7
1336
1337 vpshufd $0b01001110, \T1, \T3
1338 vpxor \T1, \T3, \T3
1339 vmovdqu HashKey_7_k(arg2), \T5
1340 vpclmulqdq $0x10, \T5, \T3, \T3
1341 vpxor \T3, \T6, \T6
1342
1343 vmovdqu 16*4(arg1), \T1
1344 vaesenc \T1, \XMM1, \XMM1
1345 vaesenc \T1, \XMM2, \XMM2
1346 vaesenc \T1, \XMM3, \XMM3
1347 vaesenc \T1, \XMM4, \XMM4
1348 vaesenc \T1, \XMM5, \XMM5
1349 vaesenc \T1, \XMM6, \XMM6
1350 vaesenc \T1, \XMM7, \XMM7
1351 vaesenc \T1, \XMM8, \XMM8
1352
1353
1354
1355 vmovdqa TMP3(%rsp), \T1
1356 vmovdqu HashKey_6(arg2), \T5
1357 vpclmulqdq $0x11, \T5, \T1, \T3
1358 vpxor \T3, \T4, \T4
1359 vpclmulqdq $0x00, \T5, \T1, \T3
1360 vpxor \T3, \T7, \T7
1361
1362 vpshufd $0b01001110, \T1, \T3
1363 vpxor \T1, \T3, \T3
1364 vmovdqu HashKey_6_k(arg2), \T5
1365 vpclmulqdq $0x10, \T5, \T3, \T3
1366 vpxor \T3, \T6, \T6
1367
1368 vmovdqu 16*5(arg1), \T1
1369 vaesenc \T1, \XMM1, \XMM1
1370 vaesenc \T1, \XMM2, \XMM2
1371 vaesenc \T1, \XMM3, \XMM3
1372 vaesenc \T1, \XMM4, \XMM4
1373 vaesenc \T1, \XMM5, \XMM5
1374 vaesenc \T1, \XMM6, \XMM6
1375 vaesenc \T1, \XMM7, \XMM7
1376 vaesenc \T1, \XMM8, \XMM8
1377
1378 vmovdqa TMP4(%rsp), \T1
1379 vmovdqu HashKey_5(arg2), \T5
1380 vpclmulqdq $0x11, \T5, \T1, \T3
1381 vpxor \T3, \T4, \T4
1382 vpclmulqdq $0x00, \T5, \T1, \T3
1383 vpxor \T3, \T7, \T7
1384
1385 vpshufd $0b01001110, \T1, \T3
1386 vpxor \T1, \T3, \T3
1387 vmovdqu HashKey_5_k(arg2), \T5
1388 vpclmulqdq $0x10, \T5, \T3, \T3
1389 vpxor \T3, \T6, \T6
1390
1391 vmovdqu 16*6(arg1), \T1
1392 vaesenc \T1, \XMM1, \XMM1
1393 vaesenc \T1, \XMM2, \XMM2
1394 vaesenc \T1, \XMM3, \XMM3
1395 vaesenc \T1, \XMM4, \XMM4
1396 vaesenc \T1, \XMM5, \XMM5
1397 vaesenc \T1, \XMM6, \XMM6
1398 vaesenc \T1, \XMM7, \XMM7
1399 vaesenc \T1, \XMM8, \XMM8
1400
1401
1402 vmovdqa TMP5(%rsp), \T1
1403 vmovdqu HashKey_4(arg2), \T5
1404 vpclmulqdq $0x11, \T5, \T1, \T3
1405 vpxor \T3, \T4, \T4
1406 vpclmulqdq $0x00, \T5, \T1, \T3
1407 vpxor \T3, \T7, \T7
1408
1409 vpshufd $0b01001110, \T1, \T3
1410 vpxor \T1, \T3, \T3
1411 vmovdqu HashKey_4_k(arg2), \T5
1412 vpclmulqdq $0x10, \T5, \T3, \T3
1413 vpxor \T3, \T6, \T6
1414
1415 vmovdqu 16*7(arg1), \T1
1416 vaesenc \T1, \XMM1, \XMM1
1417 vaesenc \T1, \XMM2, \XMM2
1418 vaesenc \T1, \XMM3, \XMM3
1419 vaesenc \T1, \XMM4, \XMM4
1420 vaesenc \T1, \XMM5, \XMM5
1421 vaesenc \T1, \XMM6, \XMM6
1422 vaesenc \T1, \XMM7, \XMM7
1423 vaesenc \T1, \XMM8, \XMM8
1424
1425 vmovdqa TMP6(%rsp), \T1
1426 vmovdqu HashKey_3(arg2), \T5
1427 vpclmulqdq $0x11, \T5, \T1, \T3
1428 vpxor \T3, \T4, \T4
1429 vpclmulqdq $0x00, \T5, \T1, \T3
1430 vpxor \T3, \T7, \T7
1431
1432 vpshufd $0b01001110, \T1, \T3
1433 vpxor \T1, \T3, \T3
1434 vmovdqu HashKey_3_k(arg2), \T5
1435 vpclmulqdq $0x10, \T5, \T3, \T3
1436 vpxor \T3, \T6, \T6
1437
1438
1439 vmovdqu 16*8(arg1), \T1
1440 vaesenc \T1, \XMM1, \XMM1
1441 vaesenc \T1, \XMM2, \XMM2
1442 vaesenc \T1, \XMM3, \XMM3
1443 vaesenc \T1, \XMM4, \XMM4
1444 vaesenc \T1, \XMM5, \XMM5
1445 vaesenc \T1, \XMM6, \XMM6
1446 vaesenc \T1, \XMM7, \XMM7
1447 vaesenc \T1, \XMM8, \XMM8
1448
1449 vmovdqa TMP7(%rsp), \T1
1450 vmovdqu HashKey_2(arg2), \T5
1451 vpclmulqdq $0x11, \T5, \T1, \T3
1452 vpxor \T3, \T4, \T4
1453 vpclmulqdq $0x00, \T5, \T1, \T3
1454 vpxor \T3, \T7, \T7
1455
1456 vpshufd $0b01001110, \T1, \T3
1457 vpxor \T1, \T3, \T3
1458 vmovdqu HashKey_2_k(arg2), \T5
1459 vpclmulqdq $0x10, \T5, \T3, \T3
1460 vpxor \T3, \T6, \T6
1461
1462
1463
1464 vmovdqu 16*9(arg1), \T5
1465 vaesenc \T5, \XMM1, \XMM1
1466 vaesenc \T5, \XMM2, \XMM2
1467 vaesenc \T5, \XMM3, \XMM3
1468 vaesenc \T5, \XMM4, \XMM4
1469 vaesenc \T5, \XMM5, \XMM5
1470 vaesenc \T5, \XMM6, \XMM6
1471 vaesenc \T5, \XMM7, \XMM7
1472 vaesenc \T5, \XMM8, \XMM8
1473
1474 vmovdqa TMP8(%rsp), \T1
1475 vmovdqu HashKey(arg2), \T5
1476 vpclmulqdq $0x11, \T5, \T1, \T3
1477 vpxor \T3, \T4, \T4
1478 vpclmulqdq $0x00, \T5, \T1, \T3
1479 vpxor \T3, \T7, \T7
1480
1481 vpshufd $0b01001110, \T1, \T3
1482 vpxor \T1, \T3, \T3
1483 vmovdqu HashKey_k(arg2), \T5
1484 vpclmulqdq $0x10, \T5, \T3, \T3
1485 vpxor \T3, \T6, \T6
1486
1487 vpxor \T4, \T6, \T6
1488 vpxor \T7, \T6, \T6
1489
1490 vmovdqu 16*10(arg1), \T5
1491
1492 i = 11
1493 setreg
1494.rep (\REP-9)
1495
1496 vaesenc \T5, \XMM1, \XMM1
1497 vaesenc \T5, \XMM2, \XMM2
1498 vaesenc \T5, \XMM3, \XMM3
1499 vaesenc \T5, \XMM4, \XMM4
1500 vaesenc \T5, \XMM5, \XMM5
1501 vaesenc \T5, \XMM6, \XMM6
1502 vaesenc \T5, \XMM7, \XMM7
1503 vaesenc \T5, \XMM8, \XMM8
1504
1505 vmovdqu 16*i(arg1), \T5
1506 i = i + 1
1507 setreg
1508.endr
1509
1510 i = 0
1511 j = 1
1512 setreg
1513.rep 8
1514 vpxor 16*i(arg4, %r11), \T5, \T2
1515 .if \ENC_DEC == ENC
1516 vaesenclast \T2, reg_j, reg_j
1517 .else
1518 vaesenclast \T2, reg_j, \T3
1519 vmovdqu 16*i(arg4, %r11), reg_j
1520 vmovdqu \T3, 16*i(arg3, %r11)
1521 .endif
1522 i = (i+1)
1523 j = (j+1)
1524 setreg
1525.endr
1526
1527
1528
1529 vpslldq $8, \T6, \T3
1530 vpsrldq $8, \T6, \T6
1531 vpxor \T3, \T7, \T7
1532 vpxor \T4, \T6, \T6
1533
1534
1535
1536
1537
1538
1539 vpslld $31, \T7, \T2
1540 vpslld $30, \T7, \T3
1541 vpslld $25, \T7, \T4
1542
1543 vpxor \T3, \T2, \T2
1544 vpxor \T4, \T2, \T2
1545
1546 vpsrldq $4, \T2, \T1
1547
1548 vpslldq $12, \T2, \T2
1549 vpxor \T2, \T7, \T7
1550
1551 .if \ENC_DEC == ENC
1552 vmovdqu \XMM1, 16*0(arg3,%r11)
1553 vmovdqu \XMM2, 16*1(arg3,%r11)
1554 vmovdqu \XMM3, 16*2(arg3,%r11)
1555 vmovdqu \XMM4, 16*3(arg3,%r11)
1556 vmovdqu \XMM5, 16*4(arg3,%r11)
1557 vmovdqu \XMM6, 16*5(arg3,%r11)
1558 vmovdqu \XMM7, 16*6(arg3,%r11)
1559 vmovdqu \XMM8, 16*7(arg3,%r11)
1560 .endif
1561
1562
1563
1564 vpsrld $1, \T7, \T2
1565 vpsrld $2, \T7, \T3
1566 vpsrld $7, \T7, \T4
1567 vpxor \T3, \T2, \T2
1568 vpxor \T4, \T2, \T2
1569
1570 vpxor \T1, \T2, \T2
1571 vpxor \T2, \T7, \T7
1572 vpxor \T7, \T6, \T6
1573
1574
1575 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1
1576 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2
1577 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3
1578 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4
1579 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5
1580 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6
1581 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7
1582 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8
1583
1584
1585 vpxor \T6, \XMM1, \XMM1
1586
1587
1588
1589.endm
1590
1591
1592
/*
 * GHASH_LAST_8_AVX_STEP - private helper of GHASH_LAST_8_AVX.
 *
 * Multiplies one block by one hash-key entry using three carry-less
 * multiplies and folds the partial products into running accumulators:
 *
 *   BLK      block register (read only)
 *   KEY_OFF  offset (relative to arg2) of the hash-key entry
 *   MID_OFF  offset (relative to arg2) of the matching "_k" entry that is
 *            multiplied with (high half ^ low half) of BLK
 *   ACC      accumulator for the middle product
 *   T6 / T7  accumulators for the high*high / low*low products
 *   T2..T5   scratch, clobbered
 */
.macro GHASH_LAST_8_AVX_STEP BLK KEY_OFF MID_OFF ACC T2 T3 T4 T5 T6 T7
        vpshufd         $0b01001110, \BLK, \T2          /* swap 64-bit halves */
        vpxor           \BLK, \T2, \T2                  /* hi ^ lo in both halves */
        vmovdqu         \KEY_OFF(arg2), \T5
        vpclmulqdq      $0x11, \T5, \BLK, \T4           /* high * high */
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \BLK, \T4           /* low * low */
        vpxor           \T4, \T7, \T7

        vmovdqu         \MID_OFF(arg2), \T3
        vpclmulqdq      $0x00, \T3, \T2, \T2            /* middle product */
        vpxor           \T2, \ACC, \ACC
.endm

/*
 * GHASH_LAST_8_AVX - hash the eight blocks XMM1..XMM8 without encrypting.
 *
 * Block n is multiplied by the table entry HashKey_(9-n) (XMM1 by HashKey_8,
 * ..., XMM8 by HashKey), all read relative to arg2.  High, low and middle
 * partial products are accumulated in T6, T7 and XMM1, then merged into the
 * 256-bit value <T6:T7> and folded back to 128 bits with two shift/xor
 * phases (left shifts by 31/30/25, right shifts by 1/2/7).
 *
 * Result: T6.  Clobbers T1-T5, T7 and XMM1; XMM2..XMM8 are only read.
 * The emitted instruction sequence is the fully unrolled one; the helper
 * macro above only removes the textual repetition.
 */
.macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8

        /* First block initialises the three accumulators directly. */
        vpshufd         $0b01001110, \XMM1, \T2
        vpxor           \XMM1, \T2, \T2
        vmovdqu         HashKey_8(arg2), \T5
        vpclmulqdq      $0x11, \T5, \XMM1, \T6
        vpclmulqdq      $0x00, \T5, \XMM1, \T7

        vmovdqu         HashKey_8_k(arg2), \T3
        vpclmulqdq      $0x00, \T3, \T2, \XMM1

        /* Remaining seven blocks accumulate into T6 / T7 / XMM1. */
        GHASH_LAST_8_AVX_STEP \XMM2, HashKey_7, HashKey_7_k, \XMM1, \T2, \T3, \T4, \T5, \T6, \T7
        GHASH_LAST_8_AVX_STEP \XMM3, HashKey_6, HashKey_6_k, \XMM1, \T2, \T3, \T4, \T5, \T6, \T7
        GHASH_LAST_8_AVX_STEP \XMM4, HashKey_5, HashKey_5_k, \XMM1, \T2, \T3, \T4, \T5, \T6, \T7
        GHASH_LAST_8_AVX_STEP \XMM5, HashKey_4, HashKey_4_k, \XMM1, \T2, \T3, \T4, \T5, \T6, \T7
        GHASH_LAST_8_AVX_STEP \XMM6, HashKey_3, HashKey_3_k, \XMM1, \T2, \T3, \T4, \T5, \T6, \T7
        GHASH_LAST_8_AVX_STEP \XMM7, HashKey_2, HashKey_2_k, \XMM1, \T2, \T3, \T4, \T5, \T6, \T7
        GHASH_LAST_8_AVX_STEP \XMM8, HashKey, HashKey_k, \XMM1, \T2, \T3, \T4, \T5, \T6, \T7

        /* Turn the accumulated middle term into the true cross term. */
        vpxor           \T6, \XMM1, \XMM1
        vpxor           \T7, \XMM1, \T2

        /* Split the cross term over the high and low halves: <T6:T7>. */
        vpslldq         $8, \T2, \T4
        vpsrldq         $8, \T2, \T2

        vpxor           \T4, \T7, \T7
        vpxor           \T2, \T6, \T6

        /* First folding phase: per-dword left shifts by 31, 30 and 25. */
        vpslld          $31, \T7, \T2
        vpslld          $30, \T7, \T3
        vpslld          $25, \T7, \T4

        vpxor           \T3, \T2, \T2
        vpxor           \T4, \T2, \T2

        vpsrldq         $4, \T2, \T1                    /* kept for phase two */

        vpslldq         $12, \T2, \T2
        vpxor           \T2, \T7, \T7

        /* Second folding phase: per-dword right shifts by 1, 2 and 7. */
        vpsrld          $1, \T7, \T2
        vpsrld          $2, \T7, \T3
        vpsrld          $7, \T7, \T4
        vpxor           \T3, \T2, \T2
        vpxor           \T4, \T2, \T2

        vpxor           \T1, \T2, \T2
        vpxor           \T2, \T7, \T7
        vpxor           \T7, \T6, \T6                   /* result in T6 */

.endm
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
/*
 * aesni_gcm_init_avx_gen2 - GCM initialisation entry point, "gen2" (AVX)
 * flavour.
 *
 * Expands the shared INIT macro (defined elsewhere in this file) with the
 * AVX building blocks GHASH_MUL_AVX and PRECOMPUTE_AVX, bracketed by the
 * file's FUNC_SAVE / FUNC_RESTORE prologue and epilogue macros.
 * Arguments are whatever INIT consumes; they are not visible in this block.
 */
SYM_FUNC_START(aesni_gcm_init_avx_gen2)
 FUNC_SAVE
 INIT GHASH_MUL_AVX, PRECOMPUTE_AVX
 FUNC_RESTORE
 RET
SYM_FUNC_END(aesni_gcm_init_avx_gen2)
1772
1773
1774
1775
1776
1777
1778
1779
1780
/*
 * aesni_gcm_enc_update_avx_gen2 - GCM encrypt-update entry point, "gen2"
 * (AVX) flavour.
 *
 * Reads `keysize` (an operand defined elsewhere in this file; presumably the
 * AES key length in bytes) and selects one of three expansions of
 * GCM_ENC_DEC that differ only in the trailing round-count argument:
 *
 *   keysize == 16  ->  9
 *   keysize == 32  -> 13
 *   anything else  -> 11   (fall-through path)
 *
 * Every path runs FUNC_SAVE once on entry and FUNC_RESTORE before RET.
 * Clobbers %eax and flags in addition to whatever GCM_ENC_DEC uses.
 *
 * The branch targets are assembler-local (.L prefix) so that they do not
 * show up as separate symbols in the object file's symbol table.
 */
SYM_FUNC_START(aesni_gcm_enc_update_avx_gen2)
        FUNC_SAVE
        mov     keysize, %eax
        cmp     $32, %eax
        je      .Lkey_256_enc_update
        cmp     $16, %eax
        je      .Lkey_128_enc_update

        /* neither 16 nor 32 */
        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11
        FUNC_RESTORE
        RET
.Lkey_128_enc_update:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9
        FUNC_RESTORE
        RET
.Lkey_256_enc_update:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13
        FUNC_RESTORE
        RET
SYM_FUNC_END(aesni_gcm_enc_update_avx_gen2)
1801
1802
1803
1804
1805
1806
1807
1808
1809
/*
 * aesni_gcm_dec_update_avx_gen2 - GCM decrypt-update entry point, "gen2"
 * (AVX) flavour.
 *
 * Reads `keysize` (an operand defined elsewhere in this file; presumably the
 * AES key length in bytes) and selects one of three expansions of
 * GCM_ENC_DEC, all with the DEC direction, that differ only in the trailing
 * round-count argument:
 *
 *   keysize == 16  ->  9
 *   keysize == 32  -> 13
 *   anything else  -> 11   (fall-through path)
 *
 * Every path runs FUNC_SAVE once on entry and FUNC_RESTORE before RET.
 * Clobbers %eax and flags in addition to whatever GCM_ENC_DEC uses.
 *
 * The branch targets are assembler-local (.L prefix) so that they do not
 * show up as separate symbols in the object file's symbol table.
 */
SYM_FUNC_START(aesni_gcm_dec_update_avx_gen2)
        FUNC_SAVE
        mov     keysize, %eax
        cmp     $32, %eax
        je      .Lkey_256_dec_update
        cmp     $16, %eax
        je      .Lkey_128_dec_update

        /* neither 16 nor 32 */
        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 11
        FUNC_RESTORE
        RET
.Lkey_128_dec_update:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9
        FUNC_RESTORE
        RET
.Lkey_256_dec_update:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13
        FUNC_RESTORE
        RET
SYM_FUNC_END(aesni_gcm_dec_update_avx_gen2)
1830
1831
1832
1833
1834
1835
1836
1837
1838
/*
 * aesni_gcm_finalize_avx_gen2 - GCM finalisation entry point, "gen2" (AVX)
 * flavour.
 *
 * Reads `keysize` (an operand defined elsewhere in this file; presumably the
 * AES key length in bytes) and selects one of three expansions of
 * GCM_COMPLETE, each given GHASH_MUL_AVX, a round-count argument, and
 * arg3 / arg4:
 *
 *   keysize == 16  ->  9
 *   keysize == 32  -> 13
 *   anything else  -> 11   (fall-through path)
 *
 * Every path runs FUNC_SAVE once on entry and FUNC_RESTORE before RET.
 * Clobbers %eax and flags in addition to whatever GCM_COMPLETE uses.
 *
 * The branch targets are assembler-local (.L prefix) so that they do not
 * show up as separate symbols in the object file's symbol table.
 */
SYM_FUNC_START(aesni_gcm_finalize_avx_gen2)
        FUNC_SAVE
        mov     keysize, %eax
        cmp     $32, %eax
        je      .Lkey_256_finalize
        cmp     $16, %eax
        je      .Lkey_128_finalize

        /* neither 16 nor 32 */
        GCM_COMPLETE GHASH_MUL_AVX, 11, arg3, arg4
        FUNC_RESTORE
        RET
.Lkey_128_finalize:
        GCM_COMPLETE GHASH_MUL_AVX, 9, arg3, arg4
        FUNC_RESTORE
        RET
.Lkey_256_finalize:
        GCM_COMPLETE GHASH_MUL_AVX, 13, arg3, arg4
        FUNC_RESTORE
        RET
SYM_FUNC_END(aesni_gcm_finalize_avx_gen2)
1859
1860
1861
1862
1863
1864
1865
1866
/*
 * GHASH_MUL_AVX2 - carry-less multiply GH by HK and fold the 256-bit product
 * back to 128 bits using the constant POLY2.
 *
 *   GH      in: first multiplicand; out: the folded product
 *   HK      second multiplicand (read only)
 *   T1-T3   scratch, clobbered
 *   T4, T5  accepted for call compatibility; not referenced in the body
 *
 * All four 64x64 partial products are computed directly (schoolbook form,
 * four vpclmulqdq), so no pre-combined "_k" key is needed here.
 */
.macro GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5

 /* T1 = GH.hi*HK.hi, T2 = GH.lo*HK.lo, T3 and GH = the two cross products */
 vpclmulqdq $0x11,\HK,\GH,\T1
 vpclmulqdq $0x00,\HK,\GH,\T2
 vpclmulqdq $0x01,\HK,\GH,\T3
 vpclmulqdq $0x10,\HK,\GH,\GH
 /* GH = sum of the cross products (the middle 128 bits) */
 vpxor \T3, \GH, \GH


 /* split the middle term across the halves; <T1:GH> is the 256-bit product */
 vpsrldq $8 , \GH, \T3
 vpslldq $8 , \GH, \GH

 vpxor \T3, \T1, \T1
 vpxor \T2, \GH, \GH

 /* first folding phase */

 vmovdqa POLY2(%rip), \T3

 vpclmulqdq $0x01, \GH, \T3, \T2
 /* keep only the low 64 bits of that product, moved to the upper half */
 vpslldq $8, \T2, \T2

 vpxor \T2, \GH, \GH

 /* second folding phase */
 vpclmulqdq $0x00, \GH, \T3, \T2
 /* drop the low 32 bits */
 vpsrldq $4, \T2, \T2

 vpclmulqdq $0x10, \GH, \T3, \GH
 /* shift up by 32 bits */
 vpslldq $4, \GH, \GH

 /* combine the two second-phase terms */
 vpxor \T2, \GH, \GH

 /* add the high half of the product: GH now holds the result */
 vpxor \T1, \GH, \GH


.endm
1904
/*
 * PRECOMPUTE_AVX2 - fill the hash-key table slots HashKey_2 .. HashKey_8.
 *
 * Starting from T5 = HK, each iteration replaces T5 with
 * GHASH_MUL_AVX2(T5, HK) and stores it to the next slot (addressed relative
 * to arg2), so slot HashKey_n receives the n-fold product of HK as computed
 * by GHASH_MUL_AVX2.
 *
 *   HK            hash key (read only)
 *   T5            running product; holds the HashKey_8 value on exit
 *   T1, T3, T4    scratch handed to GHASH_MUL_AVX2 (clobbered)
 *   T6, T2        forwarded as GHASH_MUL_AVX2's last two parameters
 *
 * The .irp loop expands to the same seven multiply/store pairs, in the same
 * order, as writing them out by hand.
 */
.macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6

        vmovdqa         \HK, \T5

.irp hk_slot, HashKey_2, HashKey_3, HashKey_4, HashKey_5, HashKey_6, HashKey_7, HashKey_8
        GHASH_MUL_AVX2  \T5, \HK, \T1, \T3, \T4, \T6, \T2
        vmovdqu         \T5, \hk_slot(arg2)
.endr

.endm
1931
1932
1933
1934
1935
1936
1937
1938
/*
 * INITIAL_BLOCKS_AVX2 - process the first \num_initial_blocks blocks one
 * lane each and, if enough data remains, prime the 8-block pipeline.
 *
 *   REP                 number of vaesenc rounds applied between the initial
 *                       round-key xor and the final vaesenclast
 *   num_initial_blocks  number of single blocks handled in phase 1
 *   T1, T3-T6           scratch (T1, T3, T4 are clobbered by GHASH_MUL_AVX2)
 *   T2                  hash key operand passed to GHASH_MUL_AVX2
 *   CTR                 counter block, loaded from CurCount(arg2)
 *   XMM1..XMM8          block lanes
 *   T_key               round-key scratch
 *   ENC_DEC             ENC or DEC; for DEC the loaded input block (rather
 *                       than the xor result) is what gets hashed
 *   VER                 not referenced in the body
 *
 * Register aliases reg_i / reg_j are rebound by `setreg` (defined elsewhere
 * in this file) from the assembler symbols i and j; presumably they name
 * %xmm<i> / %xmm<j>.
 *
 * Data is read from (arg4,%r11) and written to (arg3,%r11); %r11 advances by
 * 16 per initial block and by 128 after the 8-block batch.  Round keys are
 * read from arg1 in 16-byte steps.
 *
 * The exit label is assembler-local (.L prefix) so each expansion of this
 * macro no longer adds a symbol to the object file's symbol table.
 */
.macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
        /* running hash goes into the lane just below the first data lane */
        i = (8-\num_initial_blocks)
        setreg
        vmovdqu         AadHash(arg2), reg_i

        vmovdqu         CurCount(arg2), \CTR

        /* ---- phase 1: \num_initial_blocks single blocks ---- */

        /* counter blocks: increment, copy, byte-shuffle */
        i = (9-\num_initial_blocks)
        setreg
.rep \num_initial_blocks
        vpaddd          ONE(%rip), \CTR, \CTR
        vmovdqa         \CTR, reg_i
        vpshufb         SHUF_MASK(%rip), reg_i, reg_i
        i = (i+1)
        setreg
.endr

        /* round 0: xor with the first round key */
        vmovdqa         (arg1), \T_key
        i = (9-\num_initial_blocks)
        setreg
.rep \num_initial_blocks
        vpxor           \T_key, reg_i, reg_i
        i = (i+1)
        setreg
.endr

        /* rounds 1..REP */
        j = 1
        setreg
.rep \REP
        vmovdqa         16*j(arg1), \T_key
        i = (9-\num_initial_blocks)
        setreg
.rep \num_initial_blocks
        vaesenc         \T_key, reg_i, reg_i
        i = (i+1)
        setreg
.endr

        j = (j+1)
        setreg
.endr

        /* final round; j now indexes the last round key */
        vmovdqa         16*j(arg1), \T_key
        i = (9-\num_initial_blocks)
        setreg
.rep \num_initial_blocks
        vaesenclast     \T_key, reg_i, reg_i
        i = (i+1)
        setreg
.endr

        /* xor keystream with input, store output, keep the block to hash */
        i = (9-\num_initial_blocks)
        setreg
.rep \num_initial_blocks
        vmovdqu         (arg4, %r11), \T1
        vpxor           \T1, reg_i, reg_i
        vmovdqu         reg_i, (arg3 , %r11)

        add             $16, %r11
.if \ENC_DEC == DEC
        vmovdqa         \T1, reg_i                      /* hash the input block */
.endif
        vpshufb         SHUF_MASK(%rip), reg_i, reg_i
        i = (i+1)
        setreg
.endr

        /* chain the hash through the initial blocks: reg_j = (reg_i ^ reg_j) * T2 */
        i = (8-\num_initial_blocks)
        j = (9-\num_initial_blocks)
        setreg

.rep \num_initial_blocks
        vpxor           reg_i, reg_j, reg_j
        GHASH_MUL_AVX2  reg_j, \T2, \T1, \T3, \T4, \T5, \T6
        i = (i+1)
        j = (j+1)
        setreg
.endr

        /* XMM8 holds the running hash at this point; park it in TMP1 and T3 */
        vmovdqa         \XMM8, TMP1(%rsp)
        vmovdqa         \XMM8, \T3

        /*
         * NOTE(review): signed compare.  If %r13 is an unsigned byte count
         * (it is set up outside this macro), `jb` would be the matching
         * condition - confirm against the caller before changing.
         */
        cmp             $128, %r13
        jl              .L_initial_blocks_done\@

        /* ---- phase 2: eight blocks in parallel ---- */

        /* eight consecutive counter blocks */
.irp blk, \XMM1, \XMM2, \XMM3, \XMM4, \XMM5, \XMM6, \XMM7, \XMM8
        vpaddd          ONE(%rip), \CTR, \CTR
        vmovdqa         \CTR, \blk
        vpshufb         SHUF_MASK(%rip), \blk, \blk
.endr

        /* round 0 */
        vmovdqa         (arg1), \T_key
.irp blk, \XMM1, \XMM2, \XMM3, \XMM4, \XMM5, \XMM6, \XMM7, \XMM8
        vpxor           \T_key, \blk, \blk
.endr

        /* rounds 1..REP */
        i = 1
        setreg
.rep \REP
        vmovdqa         16*i(arg1), \T_key
.irp blk, \XMM1, \XMM2, \XMM3, \XMM4, \XMM5, \XMM6, \XMM7, \XMM8
        vaesenc         \T_key, \blk, \blk
.endr
        i = (i+1)
        setreg
.endr

        /* final round */
        vmovdqa         16*i(arg1), \T_key
.irp blk, \XMM1, \XMM2, \XMM3, \XMM4, \XMM5, \XMM6, \XMM7, \XMM8
        vaesenclast     \T_key, \blk, \blk
.endr

        /* per lane: load input, xor, store output; DEC keeps the input block */
        vmovdqu         (arg4, %r11), \T1
        vpxor           \T1, \XMM1, \XMM1
        vmovdqu         \XMM1, (arg3 , %r11)
        .if \ENC_DEC == DEC
        vmovdqa         \T1, \XMM1
        .endif

        vmovdqu         16*1(arg4, %r11), \T1
        vpxor           \T1, \XMM2, \XMM2
        vmovdqu         \XMM2, 16*1(arg3 , %r11)
        .if \ENC_DEC == DEC
        vmovdqa         \T1, \XMM2
        .endif

        vmovdqu         16*2(arg4, %r11), \T1
        vpxor           \T1, \XMM3, \XMM3
        vmovdqu         \XMM3, 16*2(arg3 , %r11)
        .if \ENC_DEC == DEC
        vmovdqa         \T1, \XMM3
        .endif

        vmovdqu         16*3(arg4, %r11), \T1
        vpxor           \T1, \XMM4, \XMM4
        vmovdqu         \XMM4, 16*3(arg3 , %r11)
        .if \ENC_DEC == DEC
        vmovdqa         \T1, \XMM4
        .endif

        vmovdqu         16*4(arg4, %r11), \T1
        vpxor           \T1, \XMM5, \XMM5
        vmovdqu         \XMM5, 16*4(arg3 , %r11)
        .if \ENC_DEC == DEC
        vmovdqa         \T1, \XMM5
        .endif

        vmovdqu         16*5(arg4, %r11), \T1
        vpxor           \T1, \XMM6, \XMM6
        vmovdqu         \XMM6, 16*5(arg3 , %r11)
        .if \ENC_DEC == DEC
        vmovdqa         \T1, \XMM6
        .endif

        vmovdqu         16*6(arg4, %r11), \T1
        vpxor           \T1, \XMM7, \XMM7
        vmovdqu         \XMM7, 16*6(arg3 , %r11)
        .if \ENC_DEC == DEC
        vmovdqa         \T1, \XMM7
        .endif

        vmovdqu         16*7(arg4, %r11), \T1
        vpxor           \T1, \XMM8, \XMM8
        vmovdqu         \XMM8, 16*7(arg3 , %r11)
        .if \ENC_DEC == DEC
        vmovdqa         \T1, \XMM8
        .endif

        add             $128, %r11

        /* byte-shuffle the blocks for hashing; fold the saved hash into lane 1 */
        vpshufb         SHUF_MASK(%rip), \XMM1, \XMM1
        vpxor           TMP1(%rsp), \XMM1, \XMM1

.irp blk, \XMM2, \XMM3, \XMM4, \XMM5, \XMM6, \XMM7, \XMM8
        vpshufb         SHUF_MASK(%rip), \blk, \blk
.endr

.L_initial_blocks_done\@:

.endm
2174
2175
2176
2177
2178
2179
2180
/*
 * GHASH_8_AVX2_MULACC - private helper of GHASH_8_ENCRYPT_8_PARALLEL_AVX2.
 *
 * Loads one saved block from BLK_OFF(%rsp) and one hash-key entry from
 * KEY_OFF(arg2), forms all four 64x64 carry-less partial products and xors
 * them into the accumulators: T4 (high*high), T7 (low*low), T6 (both cross
 * products).  Clobbers T1, T3 and T5.
 */
.macro GHASH_8_AVX2_MULACC BLK_OFF KEY_OFF T1 T3 T4 T5 T6 T7
        vmovdqa         \BLK_OFF(%rsp), \T1
        vmovdqu         \KEY_OFF(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor           \T3, \T4, \T4

        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor           \T3, \T7, \T7

        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6

        vpclmulqdq      $0x10, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6
.endm

/*
 * GHASH_8_ENCRYPT_8_PARALLEL_AVX2 - encrypt/decrypt eight counter blocks
 * while hashing the eight blocks left in XMM1..XMM8 by the previous step.
 *
 *   REP        number of vaesenc rounds before the final vaesenclast; rounds
 *              1..9 are written out, rounds 10..REP come from a .rep loop
 *   T1-T7      scratch
 *   CTR        counter; advanced by eight
 *   XMM1..8    in: blocks to hash; out: byte-shuffled blocks just processed,
 *              with the new hash value already xored into XMM1
 *   loop_idx   in_order: add ONE, then byte-shuffle each counter block;
 *              otherwise: add ONEf with no shuffle
 *   ENC_DEC    ENC: output is the xor result, stored from the lanes and also
 *              what ends up in the lanes; DEC: output is stored immediately
 *              and the lanes are reloaded with the input blocks
 *
 * Input is read from (arg4,%r11), output written to (arg3,%r11), round keys
 * are read from arg1 and hash-key entries from arg2.  AES rounds and GHASH
 * multiplies are interleaved; the instruction order is the same as in the
 * fully unrolled form, .irp and the helper macro only remove repetition.
 *
 * In the final .rep 8 loop reg_j is an alias maintained by `setreg` (defined
 * elsewhere in this file); presumably it walks the eight block lanes.
 */
.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC

        /* park the blocks to be hashed: lane 1 in T2, lanes 2..8 on the stack */
        vmovdqa         \XMM1, \T2
        vmovdqa         \XMM2, TMP2(%rsp)
        vmovdqa         \XMM3, TMP3(%rsp)
        vmovdqa         \XMM4, TMP4(%rsp)
        vmovdqa         \XMM5, TMP5(%rsp)
        vmovdqa         \XMM6, TMP6(%rsp)
        vmovdqa         \XMM7, TMP7(%rsp)
        vmovdqa         \XMM8, TMP8(%rsp)

        /* next eight counter values */
.if \loop_idx == in_order
        vpaddd          ONE(%rip), \CTR, \XMM1
        vpaddd          ONE(%rip), \XMM1, \XMM2
        vpaddd          ONE(%rip), \XMM2, \XMM3
        vpaddd          ONE(%rip), \XMM3, \XMM4
        vpaddd          ONE(%rip), \XMM4, \XMM5
        vpaddd          ONE(%rip), \XMM5, \XMM6
        vpaddd          ONE(%rip), \XMM6, \XMM7
        vpaddd          ONE(%rip), \XMM7, \XMM8
        vmovdqa         \XMM8, \CTR

.irp blk, \XMM1, \XMM2, \XMM3, \XMM4, \XMM5, \XMM6, \XMM7, \XMM8
        vpshufb         SHUF_MASK(%rip), \blk, \blk
.endr
.else
        vpaddd          ONEf(%rip), \CTR, \XMM1
        vpaddd          ONEf(%rip), \XMM1, \XMM2
        vpaddd          ONEf(%rip), \XMM2, \XMM3
        vpaddd          ONEf(%rip), \XMM3, \XMM4
        vpaddd          ONEf(%rip), \XMM4, \XMM5
        vpaddd          ONEf(%rip), \XMM5, \XMM6
        vpaddd          ONEf(%rip), \XMM6, \XMM7
        vpaddd          ONEf(%rip), \XMM7, \XMM8
        vmovdqa         \XMM8, \CTR
.endif

        /* AES round 0: xor with the first round key */
        vmovdqu         (arg1), \T1
.irp blk, \XMM1, \XMM2, \XMM3, \XMM4, \XMM5, \XMM6, \XMM7, \XMM8
        vpxor           \T1, \blk, \blk
.endr

        /* AES round 1 */
        vmovdqu         16*1(arg1), \T1
.irp blk, \XMM1, \XMM2, \XMM3, \XMM4, \XMM5, \XMM6, \XMM7, \XMM8
        vaesenc         \T1, \blk, \blk
.endr

        /* AES round 2 */
        vmovdqu         16*2(arg1), \T1
.irp blk, \XMM1, \XMM2, \XMM3, \XMM4, \XMM5, \XMM6, \XMM7, \XMM8
        vaesenc         \T1, \blk, \blk
.endr

        /* GHASH block 1 (in T2) * HashKey_8: initialises T4 / T7 / T6 */
        vmovdqu         HashKey_8(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T2, \T4
        vpclmulqdq      $0x00, \T5, \T2, \T7
        vpclmulqdq      $0x01, \T5, \T2, \T6
        vpclmulqdq      $0x10, \T5, \T2, \T5
        vpxor           \T5, \T6, \T6

        /* AES round 3 */
        vmovdqu         16*3(arg1), \T1
.irp blk, \XMM1, \XMM2, \XMM3, \XMM4, \XMM5, \XMM6, \XMM7, \XMM8
        vaesenc         \T1, \blk, \blk
.endr

        GHASH_8_AVX2_MULACC TMP2, HashKey_7, \T1, \T3, \T4, \T5, \T6, \T7

        /* AES round 4 */
        vmovdqu         16*4(arg1), \T1
.irp blk, \XMM1, \XMM2, \XMM3, \XMM4, \XMM5, \XMM6, \XMM7, \XMM8
        vaesenc         \T1, \blk, \blk
.endr

        GHASH_8_AVX2_MULACC TMP3, HashKey_6, \T1, \T3, \T4, \T5, \T6, \T7

        /* AES round 5 */
        vmovdqu         16*5(arg1), \T1
.irp blk, \XMM1, \XMM2, \XMM3, \XMM4, \XMM5, \XMM6, \XMM7, \XMM8
        vaesenc         \T1, \blk, \blk
.endr

        GHASH_8_AVX2_MULACC TMP4, HashKey_5, \T1, \T3, \T4, \T5, \T6, \T7

        /* AES round 6 */
        vmovdqu         16*6(arg1), \T1
.irp blk, \XMM1, \XMM2, \XMM3, \XMM4, \XMM5, \XMM6, \XMM7, \XMM8
        vaesenc         \T1, \blk, \blk
.endr

        GHASH_8_AVX2_MULACC TMP5, HashKey_4, \T1, \T3, \T4, \T5, \T6, \T7

        /* AES round 7 */
        vmovdqu         16*7(arg1), \T1
.irp blk, \XMM1, \XMM2, \XMM3, \XMM4, \XMM5, \XMM6, \XMM7, \XMM8
        vaesenc         \T1, \blk, \blk
.endr

        GHASH_8_AVX2_MULACC TMP6, HashKey_3, \T1, \T3, \T4, \T5, \T6, \T7

        /* AES round 8 */
        vmovdqu         16*8(arg1), \T1
.irp blk, \XMM1, \XMM2, \XMM3, \XMM4, \XMM5, \XMM6, \XMM7, \XMM8
        vaesenc         \T1, \blk, \blk
.endr

        GHASH_8_AVX2_MULACC TMP7, HashKey_2, \T1, \T3, \T4, \T5, \T6, \T7

        /* AES round 9 (round key goes through T5 here) */
        vmovdqu         16*9(arg1), \T5
.irp blk, \XMM1, \XMM2, \XMM3, \XMM4, \XMM5, \XMM6, \XMM7, \XMM8
        vaesenc         \T5, \blk, \blk
.endr

        /* GHASH block 8 * HashKey; the high accumulator moves to T1 */
        vmovdqa         TMP8(%rsp), \T1
        vmovdqu         HashKey(arg2), \T5

        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor           \T3, \T7, \T7

        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6

        vpclmulqdq      $0x10, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6

        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor           \T3, \T4, \T1

        /* remaining middle rounds; T5 ends up holding the last round key */
        vmovdqu         16*10(arg1), \T5

        i = 11
        setreg
.rep (\REP-9)
.irp blk, \XMM1, \XMM2, \XMM3, \XMM4, \XMM5, \XMM6, \XMM7, \XMM8
        vaesenc         \T5, \blk, \blk
.endr

        vmovdqu         16*i(arg1), \T5
        i = i + 1
        setreg
.endr

        /*
         * Final round: xoring the input block into the last round key first
         * makes vaesenclast produce the output block directly.
         */
        i = 0
        j = 1
        setreg
.rep 8
        vpxor           16*i(arg4, %r11), \T5, \T2
        .if \ENC_DEC == ENC
        vaesenclast     \T2, reg_j, reg_j
        .else
        vaesenclast     \T2, reg_j, \T3
        vmovdqu         16*i(arg4, %r11), reg_j
        vmovdqu         \T3, 16*i(arg3, %r11)
        .endif
        i = (i+1)
        j = (j+1)
        setreg
.endr

        /* split the cross term over the halves: product is <T1:T7> */
        vpslldq         $8, \T6, \T3
        vpsrldq         $8, \T6, \T6
        vpxor           \T3, \T7, \T7
        vpxor           \T6, \T1, \T1

        /* first folding phase with POLY2 */
        vmovdqa         POLY2(%rip), \T3

        vpclmulqdq      $0x01, \T7, \T3, \T2
        vpslldq         $8, \T2, \T2

        vpxor           \T2, \T7, \T7

        /* ENC: write the eight output blocks between the folding phases */
        .if \ENC_DEC == ENC
        vmovdqu         \XMM1, 16*0(arg3,%r11)
        vmovdqu         \XMM2, 16*1(arg3,%r11)
        vmovdqu         \XMM3, 16*2(arg3,%r11)
        vmovdqu         \XMM4, 16*3(arg3,%r11)
        vmovdqu         \XMM5, 16*4(arg3,%r11)
        vmovdqu         \XMM6, 16*5(arg3,%r11)
        vmovdqu         \XMM7, 16*6(arg3,%r11)
        vmovdqu         \XMM8, 16*7(arg3,%r11)
        .endif

        /* second folding phase */
        vpclmulqdq      $0x00, \T7, \T3, \T2
        vpsrldq         $4, \T2, \T2

        vpclmulqdq      $0x10, \T7, \T3, \T4
        vpslldq         $4, \T4, \T4

        vpxor           \T2, \T4, \T4

        vpxor           \T4, \T1, \T1                   /* T1 = new hash value */

        /* byte-shuffle the fresh blocks so they can be hashed next time */
.irp blk, \XMM1, \XMM2, \XMM3, \XMM4, \XMM5, \XMM6, \XMM7, \XMM8
        vpshufb         SHUF_MASK(%rip), \blk, \blk
.endr

        /* carry the hash forward inside lane 1 */
        vpxor           \T1, \XMM1, \XMM1

.endm
2540
2541
2542
/*
 * GHASH_LAST_8_AVX2_STEP - private helper of GHASH_LAST_8_AVX2.
 *
 * Multiplies block BLK by the hash-key entry at KEY_OFF(arg2) using three
 * carry-less multiplies; the (high ^ low) operand of the key is derived on
 * the fly with vpshufd/vpxor instead of being loaded from a table.
 * Partial products are xored into T6 (high*high), T7 (low*low) and ACC
 * (middle).  Clobbers T2-T5; BLK is only read.
 */
.macro GHASH_LAST_8_AVX2_STEP BLK KEY_OFF ACC T2 T3 T4 T5 T6 T7
        vmovdqu         \KEY_OFF(arg2), \T5
        vpshufd         $0b01001110, \BLK, \T2          /* swap 64-bit halves */
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \BLK, \T2, \T2                  /* block: hi ^ lo */
        vpxor           \T5, \T3, \T3                   /* key:   hi ^ lo */

        vpclmulqdq      $0x11, \T5, \BLK, \T4
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \BLK, \T4
        vpxor           \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \ACC, \ACC
.endm

/*
 * GHASH_LAST_8_AVX2 - hash the eight blocks XMM1..XMM8 without encrypting.
 *
 * Block n is multiplied by the table entry HashKey_(9-n) (XMM1 by HashKey_8,
 * ..., XMM8 by HashKey), all read relative to arg2.  High, low and middle
 * partial products are accumulated in T6, T7 and XMM1, merged into the
 * 256-bit value <T6:T7>, and folded back to 128 bits in two phases using
 * carry-less multiplies by POLY2.
 *
 * Result: T6.  Clobbers T2-T5, T7 and XMM1; XMM2..XMM8 are only read.
 * T1 is accepted but not referenced in the body.
 * The emitted instruction sequence is the fully unrolled one; the helper
 * macro above only removes the textual repetition.
 */
.macro GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8

        /* First block initialises the three accumulators directly. */
        vmovdqu         HashKey_8(arg2), \T5

        vpshufd         $0b01001110, \XMM1, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM1, \T2, \T2
        vpxor           \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM1, \T6
        vpclmulqdq      $0x00, \T5, \XMM1, \T7

        vpclmulqdq      $0x00, \T3, \T2, \XMM1

        /* Remaining seven blocks accumulate into T6 / T7 / XMM1. */
        GHASH_LAST_8_AVX2_STEP \XMM2, HashKey_7, \XMM1, \T2, \T3, \T4, \T5, \T6, \T7
        GHASH_LAST_8_AVX2_STEP \XMM3, HashKey_6, \XMM1, \T2, \T3, \T4, \T5, \T6, \T7
        GHASH_LAST_8_AVX2_STEP \XMM4, HashKey_5, \XMM1, \T2, \T3, \T4, \T5, \T6, \T7
        GHASH_LAST_8_AVX2_STEP \XMM5, HashKey_4, \XMM1, \T2, \T3, \T4, \T5, \T6, \T7
        GHASH_LAST_8_AVX2_STEP \XMM6, HashKey_3, \XMM1, \T2, \T3, \T4, \T5, \T6, \T7
        GHASH_LAST_8_AVX2_STEP \XMM7, HashKey_2, \XMM1, \T2, \T3, \T4, \T5, \T6, \T7
        GHASH_LAST_8_AVX2_STEP \XMM8, HashKey, \XMM1, \T2, \T3, \T4, \T5, \T6, \T7

        /* Turn the accumulated middle term into the true cross term. */
        vpxor           \T6, \XMM1, \XMM1
        vpxor           \T7, \XMM1, \T2

        /* Split the cross term over the high and low halves: <T6:T7>. */
        vpslldq         $8, \T2, \T4
        vpsrldq         $8, \T2, \T2

        vpxor           \T4, \T7, \T7
        vpxor           \T2, \T6, \T6

        /* First folding phase. */
        vmovdqa         POLY2(%rip), \T3

        vpclmulqdq      $0x01, \T7, \T3, \T2
        vpslldq         $8, \T2, \T2

        vpxor           \T2, \T7, \T7

        /* Second folding phase. */
        vpclmulqdq      $0x00, \T7, \T3, \T2
        vpsrldq         $4, \T2, \T2

        vpclmulqdq      $0x10, \T7, \T3, \T4
        vpslldq         $4, \T4, \T4

        vpxor           \T2, \T4, \T4

        vpxor           \T4, \T6, \T6                   /* result in T6 */
.endm
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
/*
 * aesni_gcm_init_avx_gen4 - GCM initialisation entry point, "gen4" (AVX2)
 * flavour.
 *
 * Expands the shared INIT macro (defined elsewhere in this file) with the
 * AVX2 building blocks GHASH_MUL_AVX2 and PRECOMPUTE_AVX2, bracketed by the
 * file's FUNC_SAVE / FUNC_RESTORE prologue and epilogue macros.
 * Arguments are whatever INIT consumes; they are not visible in this block.
 */
SYM_FUNC_START(aesni_gcm_init_avx_gen4)
 FUNC_SAVE
 INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2
 FUNC_RESTORE
 RET
SYM_FUNC_END(aesni_gcm_init_avx_gen4)
2740
2741
2742
2743
2744
2745
2746
2747
2748
/*
 * aesni_gcm_enc_update_avx_gen4 - GCM encrypt-update entry point, "gen4"
 * (AVX2) flavour.
 *
 * Reads `keysize` (an operand defined elsewhere in this file; presumably the
 * AES key length in bytes) and selects one of three expansions of
 * GCM_ENC_DEC that differ only in the trailing round-count argument, which
 * the AVX2 block macros receive as REP (the number of vaesenc rounds before
 * the final vaesenclast):
 *
 *   keysize == 16  ->  9
 *   keysize == 32  -> 13
 *   anything else  -> 11   (fall-through path)
 *
 * Every path runs FUNC_SAVE once on entry and FUNC_RESTORE before RET.
 * Clobbers %eax and flags in addition to whatever GCM_ENC_DEC uses.
 *
 * The branch targets are assembler-local (.L prefix) so that they do not
 * show up as separate symbols in the object file's symbol table.
 */
SYM_FUNC_START(aesni_gcm_enc_update_avx_gen4)
        FUNC_SAVE
        mov     keysize, %eax
        cmp     $32, %eax
        je      .Lkey_256_enc_update4
        cmp     $16, %eax
        je      .Lkey_128_enc_update4

        /* neither 16 nor 32 */
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11
        FUNC_RESTORE
        RET
.Lkey_128_enc_update4:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9
        FUNC_RESTORE
        RET
.Lkey_256_enc_update4:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13
        FUNC_RESTORE
        RET
SYM_FUNC_END(aesni_gcm_enc_update_avx_gen4)
2769
2770
2771
2772
2773
2774
2775
2776
2777
/*
 * aesni_gcm_dec_update_avx_gen4 - GCM decrypt-update entry point, "gen4"
 * (AVX2) flavour.
 *
 * Reads `keysize` (an operand defined elsewhere in this file; presumably the
 * AES key length in bytes) and selects one of three expansions of
 * GCM_ENC_DEC, all with the DEC direction, that differ only in the trailing
 * round-count argument, which the AVX2 block macros receive as REP (the
 * number of vaesenc rounds before the final vaesenclast):
 *
 *   keysize == 16  ->  9
 *   keysize == 32  -> 13
 *   anything else  -> 11   (fall-through path)
 *
 * Every path runs FUNC_SAVE once on entry and FUNC_RESTORE before RET.
 * Clobbers %eax and flags in addition to whatever GCM_ENC_DEC uses.
 *
 * The branch targets are assembler-local (.L prefix) so that they do not
 * show up as separate symbols in the object file's symbol table.
 */
SYM_FUNC_START(aesni_gcm_dec_update_avx_gen4)
        FUNC_SAVE
        mov     keysize, %eax
        cmp     $32, %eax
        je      .Lkey_256_dec_update4
        cmp     $16, %eax
        je      .Lkey_128_dec_update4

        /* neither 16 nor 32 */
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11
        FUNC_RESTORE
        RET
.Lkey_128_dec_update4:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9
        FUNC_RESTORE
        RET
.Lkey_256_dec_update4:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13
        FUNC_RESTORE
        RET
SYM_FUNC_END(aesni_gcm_dec_update_avx_gen4)
2798
2799
2800
2801
2802
2803
2804
2805
2806
/*
 * aesni_gcm_finalize_avx_gen4 - GCM finalisation entry point, "gen4" (AVX2)
 * flavour.
 *
 * Reads `keysize` (an operand defined elsewhere in this file; presumably the
 * AES key length in bytes) and selects one of three expansions of
 * GCM_COMPLETE, each given GHASH_MUL_AVX2, a round-count argument, and
 * arg3 / arg4:
 *
 *   keysize == 16  ->  9
 *   keysize == 32  -> 13
 *   anything else  -> 11   (fall-through path)
 *
 * Every path runs FUNC_SAVE once on entry and FUNC_RESTORE before RET.
 * Clobbers %eax and flags in addition to whatever GCM_COMPLETE uses.
 *
 * The branch targets are assembler-local (.L prefix) so that they do not
 * show up as separate symbols in the object file's symbol table.
 */
SYM_FUNC_START(aesni_gcm_finalize_avx_gen4)
        FUNC_SAVE
        mov     keysize, %eax
        cmp     $32, %eax
        je      .Lkey_256_finalize4
        cmp     $16, %eax
        je      .Lkey_128_finalize4

        /* neither 16 nor 32 */
        GCM_COMPLETE GHASH_MUL_AVX2, 11, arg3, arg4
        FUNC_RESTORE
        RET
.Lkey_128_finalize4:
        GCM_COMPLETE GHASH_MUL_AVX2, 9, arg3, arg4
        FUNC_RESTORE
        RET
.Lkey_256_finalize4:
        GCM_COMPLETE GHASH_MUL_AVX2, 13, arg3, arg4
        FUNC_RESTORE
        RET
SYM_FUNC_END(aesni_gcm_finalize_avx_gen4)
2827