1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32#include <linux/linkage.h>
33#include <asm/inst.h>
34#include <asm/frame.h>
35#include <asm/nospec-branch.h>
36
37
38
39
40
41
42
43
44
45#define MOVADQ movaps
46#define MOVUDQ movups
47
48#ifdef __x86_64__
49
50
/* 128-bit constant used by gf128mul_x_ble (XTS tweak multiplication). */
.section .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
.align 16
.Lgf128mul_x_ble_mask:
	.octa 0x00000000000000010000000000000087
/* GHASH reduction polynomial constant used by PRECOMPUTE. */
.section .rodata.cst16.POLY, "aM", @progbits, 16
.align 16
POLY: .octa 0xC2000000000000000000000000000001
/* Constant compared against in PRECOMPUTE when deriving HashKey<<1. */
.section .rodata.cst16.TWOONE, "aM", @progbits, 16
.align 16
TWOONE: .octa 0x00000001000000000000000000000001

/* Byte-reversal shuffle mask used with PSHUFB_XMM throughout. */
.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
.align 16
SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
/* Low/high 64-bit lane masks. */
.section .rodata.cst16.MASK1, "aM", @progbits, 16
.align 16
MASK1: .octa 0x0000000000000000ffffffffffffffff
.section .rodata.cst16.MASK2, "aM", @progbits, 16
.align 16
MASK2: .octa 0xffffffffffffffff0000000000000000
/* Counter increment added with paddd to the big-endian-shuffled CTR block. */
.section .rodata.cst16.ONE, "aM", @progbits, 16
.align 16
ONE: .octa 0x00000000000000000000000000000001
.section .rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
.align 16
F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
/* Operation tags (values only; code selects enc/dec via macro suffixes). */
.section .rodata.cst16.dec, "aM", @progbits, 16
.align 16
dec: .octa 0x1
.section .rodata.cst16.enc, "aM", @progbits, 16
.align 16
enc: .octa 0x2

/*
 * SHIFT_MASK and ALL_F must stay adjacent and in this order: the code
 * addresses ALL_F relative to SHIFT_MASK (see ALL_F-SHIFT_MASK(%r12) in
 * PARTIAL_BLOCK) and also reads ALL_F+16 to obtain the following block
 * of zeroes, so they cannot live in mergeable per-constant sections.
 */
.section .rodata, "a", @progbits
.align 16
SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
ALL_F: .octa 0xffffffffffffffffffffffffffffffff
 .octa 0x00000000000000000000000000000000
92
93.text
94
95
/* Three registers are pushed by FUNC_SAVE, so stack args sit 8*3 higher. */
#define STACK_OFFSET 8*3

/*
 * Offsets into the per-request context structure passed in %arg2.
 * Each 16-byte slot is read/written with movdqu/movdqa below; the scalar
 * fields (AadLen, InLen, PBlockLen, PBlockEncKey low use) are accessed
 * with 64-bit mov.
 */
#define AadHash 16*0
#define AadLen 16*1
#define InLen (16*1)+8
#define PBlockEncKey 16*2
#define OrigIV 16*3
#define CurCount 16*4
#define PBlockLen 16*5
/* Precomputed GHASH subkey powers H, H^2, H^3, H^4 ... */
#define HashKey 16*6
#define HashKey_2 16*7
#define HashKey_3 16*8
#define HashKey_4 16*9
/* ... and their Karatsuba halves (hi64 xor lo64), one per power. */
#define HashKey_k 16*10


#define HashKey_2_k 16*11


#define HashKey_3_k 16*12


#define HashKey_4_k 16*13


/* SysV AMD64 argument registers; args 7-11 arrive on the caller's stack. */
#define arg1 rdi
#define arg2 rsi
#define arg3 rdx
#define arg4 rcx
#define arg5 r8
#define arg6 r9
#define arg7 STACK_OFFSET+8(%rsp)
#define arg8 STACK_OFFSET+16(%rsp)
#define arg9 STACK_OFFSET+24(%rsp)
#define arg10 STACK_OFFSET+32(%rsp)
#define arg11 STACK_OFFSET+40(%rsp)
/* Key length field stored after the round keys in the AES context (%arg1). */
#define keysize 2*15*16(%arg1)
#endif
134#endif
135
136
/* Symbolic register names for the non-GCM AES helpers in this file. */
#define STATE1 %xmm0
#define STATE2 %xmm4
#define STATE3 %xmm5
#define STATE4 %xmm6
#define STATE STATE1
#define IN1 %xmm1
#define IN2 %xmm7
#define IN3 %xmm8
#define IN4 %xmm9
#define IN IN1
#define KEY %xmm2
#define IV %xmm3

#define BSWAP_MASK %xmm10
#define CTR %xmm11
#define INC %xmm12

/* Shares %xmm10 with BSWAP_MASK; the two are never live simultaneously. */
#define GF128MUL_MASK %xmm10

#ifdef __x86_64__
/* 64-bit register assignments. */
#define AREG %rax
#define KEYP %rdi
#define OUTP %rsi
#define UKEYP OUTP
#define INP %rdx
#define LEN %rcx
#define IVP %r8
#define KLEN %r9d
#define T1 %r10
#define TKEYP T1
#define T2 %r11
#define TCTR_LOW T2
#else
/* 32-bit register assignments (no T2/TCTR_LOW available). */
#define AREG %eax
#define KEYP %edi
#define OUTP AREG
#define UKEYP OUTP
#define INP %edx
#define LEN %esi
#define IVP %ebp
#define KLEN %ebx
#define T1 %ecx
#define TKEYP T1
#endif
181
# Save the callee-saved GPRs the GCM code clobbers (%r12-%r14).
# Pushing three registers is what STACK_OFFSET (8*3) accounts for when
# the arg7..arg11 stack-argument macros are used afterwards.
# Note: xmm registers are clobbered and not saved here.
.macro FUNC_SAVE
	push %r12
	push %r13
	push %r14




.endm
191
192
# Restore the registers saved by FUNC_SAVE, in reverse push order.
.macro FUNC_RESTORE
	pop %r14
	pop %r13
	pop %r12
.endm
198
199
200
201
202
203
# PRECOMPUTE: derive the GHASH hash subkey and its powers.
#
# Loads the raw subkey from \SUBKEY (a pointer value; staged through %r12),
# byte-swaps it, multiplies it by x in GF(2^128) with reduction by POLY,
# and stores the result as HashKey in the context (%arg2).  It then uses
# GHASH_MUL to derive HashKey_2..HashKey_4 and stores, for each power, the
# Karatsuba helper value (hi64 xor lo64, built via pshufd $78 + pxor) in
# the corresponding HashKey_*_k slot.
# Clobbers: %r12, the seven TMP xmm registers.
.macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
	mov \SUBKEY, %r12
	movdqu (%r12), \TMP3
	movdqa SHUF_MASK(%rip), \TMP2
	PSHUFB_XMM \TMP2, \TMP3

	# Multiply the subkey by x: 128-bit left shift by 1 built from two
	# 64-bit shifts plus the cross-lane carry moved with pslldq/psrldq.
	movdqa \TMP3, \TMP2
	psllq $1, \TMP3
	psrlq $63, \TMP2
	movdqa \TMP2, \TMP1
	pslldq $8, \TMP2
	psrldq $8, \TMP1
	por \TMP2, \TMP3

	# If the top bit was set, reduce: broadcast the carry and
	# conditionally xor in the POLY constant.
	pshufd $0x24, \TMP1, \TMP2
	pcmpeqd TWOONE(%rip), \TMP2
	pand POLY(%rip), \TMP2
	pxor \TMP2, \TMP3
	movdqa \TMP3, HashKey(%arg2)

	# Karatsuba helper for H: high qword xor low qword.
	movdqa \TMP3, \TMP5
	pshufd $78, \TMP3, \TMP1
	pxor \TMP3, \TMP1
	movdqa \TMP1, HashKey_k(%arg2)

	GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
	# TMP5 now holds H^2.
	movdqa \TMP5, HashKey_2(%arg2)

	pshufd $78, \TMP5, \TMP1
	pxor \TMP5, \TMP1
	movdqa \TMP1, HashKey_2_k(%arg2)

	GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
	# TMP5 now holds H^3.
	movdqa \TMP5, HashKey_3(%arg2)
	pshufd $78, \TMP5, \TMP1
	pxor \TMP5, \TMP1
	movdqa \TMP1, HashKey_3_k(%arg2)

	GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
	# TMP5 now holds H^4.
	movdqa \TMP5, HashKey_4(%arg2)
	pshufd $78, \TMP5, \TMP1
	pxor \TMP5, \TMP1
	movdqa \TMP1, HashKey_4_k(%arg2)
.endm
255
256
257
# GCM_INIT: initialize the per-request GCM context (%arg2).
#
# \Iv      - pointer to the 16-byte initial counter block
# \SUBKEY  - pointer to the raw hash subkey (passed on to PRECOMPUTE)
# \AAD     - pointer to the additional authenticated data
# \AADLEN  - AAD length in bytes
#
# Records AADLEN, zeroes InLen/PBlockLen/PBlockEncKey, saves the original
# IV, stores the byte-swapped IV as the running counter (CurCount),
# precomputes the hash-key powers, and hashes the AAD into AadHash.
# Clobbers: %rax, %r10-%r13, %xmm0-%xmm7, %xmm13, %xmm14.
.macro GCM_INIT Iv SUBKEY AAD AADLEN
	mov \AADLEN, %r11
	mov %r11, AadLen(%arg2)
	xor %r11, %r11
	mov %r11, InLen(%arg2)
	mov %r11, PBlockLen(%arg2)
	mov %r11, PBlockEncKey(%arg2)
	mov \Iv, %rax
	movdqu (%rax), %xmm0
	movdqu %xmm0, OrigIV(%arg2)

	# Keep the counter in byte-swapped (big-endian-arith) form.
	movdqa SHUF_MASK(%rip), %xmm2
	PSHUFB_XMM %xmm2, %xmm0
	movdqu %xmm0, CurCount(%arg2)

	# BUGFIX: the invocation previously ended with a trailing comma,
	# handing PRECOMPUTE a spurious empty ninth argument (it declares
	# exactly eight parameters).
	PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7
	movdqa HashKey(%arg2), %xmm13

	CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
	%xmm4, %xmm5, %xmm6
.endm
279
280
281
282
283
# GCM_ENC_DEC: encrypt/decrypt the payload and fold it into the GHASH.
#
# \operation is the literal 'enc' or 'dec'; it selects the enc/dec variants
# of PARTIAL_BLOCK, INITIAL_BLOCKS_ENC_DEC and GHASH_4_ENCRYPT_4_PARALLEL.
# Data flows %arg4 (in) -> %arg3 (out), %arg5 bytes; context in %arg2;
# expanded AES key in %arg1.  %r11 tracks the data offset, %r13 the
# remaining whole-block byte count.
.macro GCM_ENC_DEC operation
	movdqu AadHash(%arg2), %xmm8
	movdqu HashKey(%arg2), %xmm13
	add %arg5, InLen(%arg2)
	# First consume any partial block left over from a previous call.
	xor %r11, %r11
	PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation
	# r13 = number of bytes that can be processed in full 16-byte blocks.
	sub %r11, %arg5
	mov %arg5, %r13

	and $-16, %r13
	mov %r13, %r12

	# r12 = (block count mod 4) << 4; dispatch 0-3 initial blocks so the
	# remaining count is a multiple of 4 for the parallel loop.
	and $(3<<4), %r12
	jz _initial_num_blocks_is_0_\@
	cmp $(2<<4), %r12
	jb _initial_num_blocks_is_1_\@
	je _initial_num_blocks_is_2_\@
_initial_num_blocks_is_3_\@:
	INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
	sub $48, %r13
	jmp _initial_blocks_\@
_initial_num_blocks_is_2_\@:
	INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
	sub $32, %r13
	jmp _initial_blocks_\@
_initial_num_blocks_is_1_\@:
	INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
	sub $16, %r13
	jmp _initial_blocks_\@
_initial_num_blocks_is_0_\@:
	INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
_initial_blocks_\@:

	# Main loop: 4 blocks per iteration, AES interleaved with GHASH.
	# INITIAL_BLOCKS also primed the first 4 counter blocks, so the last
	# group is finished by GHASH_LAST_4 after the loop.
	cmp $0, %r13
	je _zero_cipher_left_\@
	sub $64, %r13
	je _four_cipher_left_\@
_crypt_by_4_\@:
	GHASH_4_ENCRYPT_4_PARALLEL_\operation %xmm9, %xmm10, %xmm11, %xmm12, \
	%xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
	%xmm7, %xmm8, enc
	add $64, %r11
	sub $64, %r13
	jne _crypt_by_4_\@
_four_cipher_left_\@:
	GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_\@:
	movdqu %xmm8, AadHash(%arg2)
	movdqu %xmm0, CurCount(%arg2)

	# Handle the 1..15 trailing bytes, if any.
	mov %arg5, %r13
	and $15, %r13
	je _multiple_of_16_bytes_\@

	mov %r13, PBlockLen(%arg2)

	# Encrypt one more counter block and save it so a later call can
	# finish this partial block (PBlockEncKey).
	paddd ONE(%rip), %xmm0
	movdqu %xmm0, CurCount(%arg2)
	movdqa SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm0

	ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1
	movdqu %xmm0, PBlockEncKey(%arg2)

	cmp $16, %arg5
	jge _large_enough_update_\@

	# Fewer than 16 bytes total: gather them byte-wise.
	lea (%arg4,%r11,1), %r10
	mov %r13, %r12
	READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
	jmp _data_read_\@

_large_enough_update_\@:
	# At least 16 bytes were processed: it is safe to read the final 16
	# bytes of the input (possibly overlapping already-processed data)
	# and rotate the wanted tail bytes into place.
	sub $16, %r11
	add %r13, %r11

	movdqu (%arg4, %r11, 1), %xmm1

	sub %r13, %r11
	add $16, %r11

	lea SHIFT_MASK+16(%rip), %r12

	# Adjust the shuffle-mask pointer so PSHUFB rotates xmm1 right by
	# (16 - r13) bytes, leaving the tail in the low lanes.
	sub %r13, %r12

	movdqu (%r12), %xmm2

	PSHUFB_XMM %xmm2, %xmm1

_data_read_\@:
	# Build a mask of r13 low 0xff bytes (ALL_F+16 points at zeroes).
	lea ALL_F+16(%rip), %r12
	sub %r13, %r12

.ifc \operation, dec
	movdqa %xmm1, %xmm2
.endif
	# xmm0 = E(K, counter) xor data = output bytes.
	pxor %xmm1, %xmm0
	movdqu (%r12), %xmm1
	# Mask off the bytes beyond the message length.
	pand %xmm1, %xmm0
.ifc \operation, dec
	# Decrypt hashes the (masked) ciphertext input.
	pand %xmm1, %xmm2
	movdqa SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10 ,%xmm2

	pxor %xmm2, %xmm8
.else
	# Encrypt hashes the freshly produced ciphertext.
	movdqa SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10,%xmm0

	pxor %xmm0, %xmm8
.endif

	movdqu %xmm8, AadHash(%arg2)
.ifc \operation, enc
	# Undo the byte swap before writing the ciphertext out.
	movdqa SHUF_MASK(%rip), %xmm10

	PSHUFB_XMM %xmm10, %xmm0
.endif

	# Store the r13 output bytes: 8 at a time, then byte-wise.
	MOVQ_R64_XMM %xmm0, %rax
	cmp $8, %r13
	jle _less_than_8_bytes_left_\@
	mov %rax, (%arg3 , %r11, 1)
	add $8, %r11
	psrldq $8, %xmm0
	MOVQ_R64_XMM %xmm0, %rax
	sub $8, %r13
_less_than_8_bytes_left_\@:
	mov %al, (%arg3, %r11, 1)
	add $1, %r11
	shr $8, %rax
	sub $1, %r13
	jne _less_than_8_bytes_left_\@
_multiple_of_16_bytes_\@:
.endm
435
436
437
438
# GCM_COMPLETE: finish the GHASH, compute the tag and write it out.
#
# \AUTHTAG    - pointer to the output tag buffer
# \AUTHTAGLEN - requested tag length in bytes (1..16 supported by the
#               _T_* store paths below)
# Uses AadHash/HashKey/PBlockLen/AadLen/InLen/OrigIV from the context.
.macro GCM_COMPLETE AUTHTAG AUTHTAGLEN
	movdqu AadHash(%arg2), %xmm8
	movdqu HashKey(%arg2), %xmm13

	mov PBlockLen(%arg2), %r12
	# If a partial block is pending, its data was xored into xmm8 but
	# not yet multiplied by H - do that now.
	cmp $0, %r12
	je _partial_done\@

	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6

_partial_done\@:
	# Final GHASH block: len(AAD)||len(C), both in bits.
	mov AadLen(%arg2), %r12
	shl $3, %r12
	movd %r12d, %xmm15
	mov InLen(%arg2), %r12
	shl $3, %r12
	MOVQ_R64_XMM %r12, %xmm1

	pslldq $8, %xmm15
	pxor %xmm1, %xmm15
	pxor %xmm15, %xmm8
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# Back to byte order, then T = E(K, Y0) xor GHASH.
	movdqa SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm8

	movdqu OrigIV(%arg2), %xmm0
	ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1
	pxor %xmm8, %xmm0
_return_T_\@:
	mov \AUTHTAG, %r10
	mov \AUTHTAGLEN, %r11
	cmp $16, %r11
	je _T_16_\@
	cmp $8, %r11
	jl _T_4_\@
_T_8_\@:
	# Emit the tag in 8/4/2/1-byte chunks until r11 bytes are written.
	MOVQ_R64_XMM %xmm0, %rax
	mov %rax, (%r10)
	add $8, %r10
	sub $8, %r11
	psrldq $8, %xmm0
	cmp $0, %r11
	je _return_T_done_\@
_T_4_\@:
	movd %xmm0, %eax
	mov %eax, (%r10)
	add $4, %r10
	sub $4, %r11
	psrldq $4, %xmm0
	cmp $0, %r11
	je _return_T_done_\@
_T_123_\@:
	movd %xmm0, %eax
	cmp $2, %r11
	jl _T_1_\@
	mov %ax, (%r10)
	cmp $2, %r11
	je _return_T_done_\@
	add $2, %r10
	sar $16, %eax
_T_1_\@:
	mov %al, (%r10)
	jmp _return_T_done_\@
_T_16_\@:
	movdqu %xmm0, (%r10)
_return_T_done_\@:
.endm
508
509#ifdef __x86_64__
510
511
512
513
514
515
516
517
518
# GHASH_MUL: GF(2^128) multiply, \GH = \GH * \HK, result reduced.
#
# Karatsuba 128x128 carry-less multiply with PCLMULQDQ followed by the
# standard shift-based reduction (constants 31/30/25 and 1/2/7 implement
# reduction modulo the GHASH polynomial).
# Clobbers: TMP1-TMP5.
.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
	# Three PCLMULs: hi*hi, lo*lo, and (hi^lo)*(hi^lo).
	movdqa \GH, \TMP1
	pshufd $78, \GH, \TMP2
	pshufd $78, \HK, \TMP3
	pxor \GH, \TMP2
	pxor \HK, \TMP3
	PCLMULQDQ 0x11, \HK, \TMP1
	PCLMULQDQ 0x00, \HK, \GH
	PCLMULQDQ 0x00, \TMP3, \TMP2
	pxor \GH, \TMP2
	pxor \TMP1, \TMP2
	# Fold the middle 128 bits into the low (GH) and high (TMP1) halves.
	movdqa \TMP2, \TMP3
	pslldq $8, \TMP3
	psrldq $8, \TMP2
	pxor \TMP3, \GH
	pxor \TMP2, \TMP1

	# First reduction phase: shift the low half left by 57/62/63 via
	# 31/30/25-bit dword shifts plus a 12-byte lane shift.
	movdqa \GH, \TMP2
	movdqa \GH, \TMP3
	movdqa \GH, \TMP4


	pslld $31, \TMP2
	pslld $30, \TMP3
	pslld $25, \TMP4
	pxor \TMP3, \TMP2
	pxor \TMP4, \TMP2
	movdqa \TMP2, \TMP5
	psrldq $4, \TMP5
	pslldq $12, \TMP2
	pxor \TMP2, \GH

	# Second reduction phase: right shifts by 1/2/7, fold in TMP5 and
	# the high product half.
	movdqa \GH,\TMP2


	movdqa \GH,\TMP3
	movdqa \GH,\TMP4
	psrld $1,\TMP2
	psrld $2,\TMP3
	psrld $7,\TMP4
	pxor \TMP3,\TMP2
	pxor \TMP4,\TMP2
	pxor \TMP5, \TMP2
	pxor \TMP2, \GH
	pxor \TMP1, \GH
.endm
569
570
571
572
# READ_PARTIAL_BLOCK: load \DLEN (1..15) bytes from \DPTR into the low
# bytes of \XMMDst without reading past the end of the buffer.
# An aligned 8-byte qword is used when possible; remaining bytes are
# gathered one at a time.  \XMM1 is scratch; %rax and \DLEN are clobbered.
.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
	cmp $8, \DLEN
	jl _read_lt8_\@
	# >= 8 bytes: one qword load, then byte-gather the remainder into
	# the upper half.
	mov (\DPTR), %rax
	MOVQ_R64_XMM %rax, \XMMDst
	sub $8, \DLEN
	jz _done_read_partial_block_\@
	xor %eax, %eax
_read_next_byte_\@:
	shl $8, %rax
	mov 7(\DPTR, \DLEN, 1), %al
	dec \DLEN
	jnz _read_next_byte_\@
	MOVQ_R64_XMM %rax, \XMM1
	pslldq $8, \XMM1
	por \XMM1, \XMMDst
	jmp _done_read_partial_block_\@
_read_lt8_\@:
	# < 8 bytes: gather them all into %rax, highest address first.
	xor %eax, %eax
_read_next_byte_lt8_\@:
	shl $8, %rax
	mov -1(\DPTR, \DLEN, 1), %al
	dec \DLEN
	jnz _read_next_byte_lt8_\@
	MOVQ_R64_XMM %rax, \XMMDst
_done_read_partial_block_\@:
.endm
600
601
602
# CALC_AAD_HASH: GHASH the additional authenticated data.
#
# Hashes \AADLEN bytes at \AAD into \TMP6 using \HASHKEY (= H), one
# 16-byte block per GHASH_MUL, with any trailing partial block
# zero-padded via READ_PARTIAL_BLOCK.  The result is stored to
# AadHash(%arg2).  Clobbers %r10, %r11, %rax, %xmm14 and TMP1-TMP7.
.macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \
	TMP6 TMP7
	MOVADQ SHUF_MASK(%rip), %xmm14
	mov \AAD, %r10
	mov \AADLEN, %r11
	pxor \TMP7, \TMP7
	pxor \TMP6, \TMP6

	cmp $16, %r11
	jl _get_AAD_rest\@
_get_AAD_blocks\@:
	# Full 16-byte AAD blocks.
	movdqu (%r10), \TMP7
	PSHUFB_XMM %xmm14, \TMP7
	pxor \TMP7, \TMP6
	GHASH_MUL \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
	add $16, %r10
	sub $16, %r11
	cmp $16, %r11
	jge _get_AAD_blocks\@

	movdqu \TMP6, \TMP7

	# Trailing partial block (1..15 bytes), zero-padded.
_get_AAD_rest\@:
	cmp $0, %r11
	je _get_AAD_done\@

	READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
	PSHUFB_XMM %xmm14, \TMP7
	pxor \TMP6, \TMP7
	GHASH_MUL \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
	movdqu \TMP7, \TMP6

_get_AAD_done\@:
	movdqu \TMP6, AadHash(%arg2)
.endm
639
640
641
642
643
644
# PARTIAL_BLOCK: finish a partial block left over from a previous call.
#
# If PBlockLen(%arg2) is non-zero, combine up to (16 - PBlockLen) new
# input bytes with the saved encrypted counter block (PBlockEncKey),
# update the GHASH state in \AAD_HASH, write the produced bytes to
# \CYPH_PLAIN_OUT and advance \DATA_OFFSET accordingly.
# Clobbers %r10, %r12, %r13, %rax and several xmm registers.
.macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
	AAD_HASH operation
	mov PBlockLen(%arg2), %r13
	cmp $0, %r13
	je _partial_block_done_\@

	# Load (up to) one block of input into xmm1.
	cmp $16, \PLAIN_CYPH_LEN
	jl _fewer_than_16_bytes_\@
	movups (\PLAIN_CYPH_IN), %xmm1
	jmp _data_read_\@

_fewer_than_16_bytes_\@:
	lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
	mov \PLAIN_CYPH_LEN, %r12
	READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1

	mov PBlockLen(%arg2), %r13

_data_read_\@:
	# xmm9 = saved E(K, counter); rotate it so the still-unused
	# keystream bytes line up with the new data.
	movdqu PBlockEncKey(%arg2), %xmm9
	movdqu HashKey(%arg2), %xmm13

	lea SHIFT_MASK(%rip), %r12

	# r12 points r13 bytes into SHIFT_MASK, so the PSHUFB below shifts
	# xmm9 by the number of keystream bytes already consumed.
	add %r13, %r12
	movdqu (%r12), %xmm2
	PSHUFB_XMM %xmm2, %xmm9

.ifc \operation, dec
	movdqa %xmm1, %xmm3
	pxor %xmm1, %xmm9
	# r10 = (new bytes + pending bytes) - 16; negative means the block
	# is still not complete.
	mov \PLAIN_CYPH_LEN, %r10
	add %r13, %r10

	sub $16, %r10


	jge _no_extra_mask_1_\@
	sub %r10, %r12
_no_extra_mask_1_\@:
	# Mask out output and (for GHASH) input bytes beyond the data.
	movdqu ALL_F-SHIFT_MASK(%r12), %xmm1

	pand %xmm1, %xmm9
	# Decrypt hashes the ciphertext (xmm3).
	pand %xmm1, %xmm3
	movdqa SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm3
	PSHUFB_XMM %xmm2, %xmm3
	pxor %xmm3, \AAD_HASH

	cmp $0, %r10
	jl _partial_incomplete_1_\@

	# Block completed: multiply the accumulated hash by H and clear
	# the pending-length counter.
	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
	xor %rax,%rax

	mov %rax, PBlockLen(%arg2)
	jmp _dec_done_\@
_partial_incomplete_1_\@:
	add \PLAIN_CYPH_LEN, PBlockLen(%arg2)
_dec_done_\@:
	movdqu \AAD_HASH, AadHash(%arg2)
.else
	pxor %xmm1, %xmm9
	# Same block-completion arithmetic as the dec path.
	mov \PLAIN_CYPH_LEN, %r10
	add %r13, %r10

	sub $16, %r10


	jge _no_extra_mask_2_\@
	sub %r10, %r12
_no_extra_mask_2_\@:

	movdqu ALL_F-SHIFT_MASK(%r12), %xmm1

	pand %xmm1, %xmm9
	# Encrypt hashes the ciphertext just produced (xmm9).
	movdqa SHUF_MASK(%rip), %xmm1
	PSHUFB_XMM %xmm1, %xmm9
	PSHUFB_XMM %xmm2, %xmm9
	pxor %xmm9, \AAD_HASH

	cmp $0, %r10
	jl _partial_incomplete_2_\@

	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
	xor %rax,%rax

	mov %rax, PBlockLen(%arg2)
	jmp _encode_done_\@
_partial_incomplete_2_\@:
	add \PLAIN_CYPH_LEN, PBlockLen(%arg2)
_encode_done_\@:
	movdqu \AAD_HASH, AadHash(%arg2)

	# Undo the byte swaps so the output bytes land in memory order.
	movdqa SHUF_MASK(%rip), %xmm10

	PSHUFB_XMM %xmm10, %xmm9
	PSHUFB_XMM %xmm2, %xmm9
.endif
	# r13 = number of output bytes to store this call.
	cmp $0, %r10
	jl _partial_fill_\@
	mov %r13, %r12
	mov $16, %r13

	sub %r12, %r13
	jmp _count_set_\@
_partial_fill_\@:
	mov \PLAIN_CYPH_LEN, %r13
_count_set_\@:
	movdqa %xmm9, %xmm0
	MOVQ_R64_XMM %xmm0, %rax
	cmp $8, %r13
	jle _less_than_8_bytes_left_\@

	mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
	add $8, \DATA_OFFSET
	psrldq $8, %xmm0
	MOVQ_R64_XMM %xmm0, %rax
	sub $8, %r13
_less_than_8_bytes_left_\@:
	movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
	add $1, \DATA_OFFSET
	shr $8, %rax
	sub $1, %r13
	jne _less_than_8_bytes_left_\@
_partial_block_done_\@:
.endm
783
784
785
786
787
788
789
790
791
792
793
794
795
# INITIAL_BLOCKS_ENC_DEC: process the first (num_blocks mod 4) blocks and
# prime the pipeline for the 4-wide main loop.
#
# \i selects how many initial blocks to handle: 5 -> 3 blocks, 6 -> 2,
# 7 -> 1, 8 -> 0; \i_seq lists the xmm register numbers used for them
# (678, 78, 8, or 0).  The initial blocks are AES-CTR encrypted, written
# out, and serially folded into the GHASH accumulator (%xmm\i holds
# AadHash on entry).  If at least 64 bytes remain (r13 >= 64), four more
# counter blocks XMM1..XMM4 are encrypted ahead of the main loop.
# %r10, %r11, %rax and %xmm14 are clobbered.
.macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
	XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
	MOVADQ SHUF_MASK(%rip), %xmm14
	# %xmm\i = current GHASH accumulator (AadHash).
	movdqu AadHash(%arg2), %xmm\i

	# XMM0 = current counter (already byte-swapped).
	movdqu CurCount(%arg2), \XMM0

.if (\i == 5) || (\i == 6) || (\i == 7)
	# 1-3 initial blocks: build the counter blocks ...
	MOVADQ ONE(%RIP),\TMP1
	MOVADQ 0(%arg1),\TMP2
.irpc index, \i_seq
	paddd \TMP1, \XMM0
.ifc \operation, dec
	movdqa \XMM0, %xmm\index
.else
	MOVADQ \XMM0, %xmm\index
.endif
	PSHUFB_XMM %xmm14, %xmm\index
	pxor \TMP2, %xmm\index
.endr
	# ... then run all AES rounds on them together.  eax = number of
	# middle rounds, derived from the key size stored after the round
	# keys (keysize/4 + 5).
	lea 0x10(%arg1),%r10
	mov keysize,%eax
	shr $2,%eax
	add $5,%eax

aes_loop_initial_\@:
	MOVADQ (%r10),\TMP1
.irpc index, \i_seq
	AESENC \TMP1, %xmm\index
.endr
	add $16,%r10
	sub $1,%eax
	jnz aes_loop_initial_\@

	MOVADQ (%r10), \TMP1
.irpc index, \i_seq
	AESENCLAST \TMP1, %xmm\index
.endr
	# XOR with the input and write the output; for dec keep the
	# ciphertext (TMP1) for hashing instead of the plaintext.
.irpc index, \i_seq
	movdqu (%arg4 , %r11, 1), \TMP1
	pxor \TMP1, %xmm\index
	movdqu %xmm\index, (%arg3 , %r11, 1)

	add $16, %r11

.ifc \operation, dec
	movdqa \TMP1, %xmm\index
.endif
	PSHUFB_XMM %xmm14, %xmm\index


.endr
.endif

	# Serially fold the initial blocks into the GHASH accumulator;
	# the chain length depends on how many blocks were processed.
.if \i == 5
	pxor %xmm5, %xmm6
	GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor %xmm6, %xmm7
	GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor %xmm7, %xmm8
	GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 6
	pxor %xmm6, %xmm7
	GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor %xmm7, %xmm8
	GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 7
	pxor %xmm7, %xmm8
	GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.endif
	cmp $64, %r13
	jl _initial_blocks_done\@

	# At least 64 more bytes: pre-encrypt the next four counter blocks
	# (XMM1..XMM4) so the main loop starts with AES already in flight.
	MOVADQ ONE(%RIP),\TMP1
	paddd \TMP1, \XMM0
	MOVADQ \XMM0, \XMM1
	PSHUFB_XMM %xmm14, \XMM1

	paddd \TMP1, \XMM0
	MOVADQ \XMM0, \XMM2
	PSHUFB_XMM %xmm14, \XMM2

	paddd \TMP1, \XMM0
	MOVADQ \XMM0, \XMM3
	PSHUFB_XMM %xmm14, \XMM3

	paddd \TMP1, \XMM0
	MOVADQ \XMM0, \XMM4
	PSHUFB_XMM %xmm14, \XMM4

	MOVADQ 0(%arg1),\TMP1
	pxor \TMP1, \XMM1
	pxor \TMP1, \XMM2
	pxor \TMP1, \XMM3
	pxor \TMP1, \XMM4
.irpc index, 1234
	movaps 0x10*\index(%arg1), \TMP1
	AESENC \TMP1, \XMM1
	AESENC \TMP1, \XMM2
	AESENC \TMP1, \XMM3
	AESENC \TMP1, \XMM4
.endr
.irpc index, 56789
	movaps 0x10*\index(%arg1), \TMP1
	AESENC \TMP1, \XMM1
	AESENC \TMP1, \XMM2
	AESENC \TMP1, \XMM3
	AESENC \TMP1, \XMM4
.endr
	# Extra rounds for AES-192/256 (eax = keysize/4 - 4 = 0, 2 or 4).
	lea 0xa0(%arg1),%r10
	mov keysize,%eax
	shr $2,%eax
	sub $4,%eax
	jz aes_loop_pre_done\@

aes_loop_pre_\@:
	MOVADQ (%r10),\TMP2
.irpc index, 1234
	AESENC \TMP2, %xmm\index
.endr
	add $16,%r10
	sub $1,%eax
	jnz aes_loop_pre_\@

aes_loop_pre_done\@:
	MOVADQ (%r10), \TMP2
	AESENCLAST \TMP2, \XMM1
	AESENCLAST \TMP2, \XMM2
	AESENCLAST \TMP2, \XMM3
	AESENCLAST \TMP2, \XMM4
	# Produce the four output blocks; for dec keep the ciphertext in
	# XMM1..XMM4 (that is what gets hashed next round).
	movdqu 16*0(%arg4 , %r11 , 1), \TMP1
	pxor \TMP1, \XMM1
.ifc \operation, dec
	movdqu \XMM1, 16*0(%arg3 , %r11 , 1)
	movdqa \TMP1, \XMM1
.endif
	movdqu 16*1(%arg4 , %r11 , 1), \TMP1
	pxor \TMP1, \XMM2
.ifc \operation, dec
	movdqu \XMM2, 16*1(%arg3 , %r11 , 1)
	movdqa \TMP1, \XMM2
.endif
	movdqu 16*2(%arg4 , %r11 , 1), \TMP1
	pxor \TMP1, \XMM3
.ifc \operation, dec
	movdqu \XMM3, 16*2(%arg3 , %r11 , 1)
	movdqa \TMP1, \XMM3
.endif
	movdqu 16*3(%arg4 , %r11 , 1), \TMP1
	pxor \TMP1, \XMM4
.ifc \operation, dec
	movdqu \XMM4, 16*3(%arg3 , %r11 , 1)
	movdqa \TMP1, \XMM4
.else
	movdqu \XMM1, 16*0(%arg3 , %r11 , 1)
	movdqu \XMM2, 16*1(%arg3 , %r11 , 1)
	movdqu \XMM3, 16*2(%arg3 , %r11 , 1)
	movdqu \XMM4, 16*3(%arg3 , %r11 , 1)
.endif

	add $64, %r11
	PSHUFB_XMM %xmm14, \XMM1
	# Combine the accumulated GHASH with the first block of the group.
	pxor \XMMDst, \XMM1

	PSHUFB_XMM %xmm14, \XMM2
	PSHUFB_XMM %xmm14, \XMM3
	PSHUFB_XMM %xmm14, \XMM4

_initial_blocks_done\@:

.endm
978
979
980
981
982
983
984
# GHASH_4_ENCRYPT_4_PARALLEL_ENC: encrypt 4 blocks while ghashing the
# previous 4 blocks, interleaving PCLMULQDQ with AESENC to hide latency.
#
# On entry XMM1..XMM4 hold the previous group's (byte-swapped) ciphertext
# with the GHASH accumulator folded into XMM1; XMM0 is the counter.
# On exit XMM1..XMM4 hold the new ciphertext blocks (byte-swapped, with
# the reduced hash folded into XMM1) and the output has been written.
# The trailing \operation parameter is unused (enc/dec are split macros).
# Clobbers %r10, %rax, %xmm15 and TMP1-TMP6.
.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
	# Move the previous ciphertext into XMM5..XMM8 for hashing.
	movdqa \XMM1, \XMM5
	movdqa \XMM2, \XMM6
	movdqa \XMM3, \XMM7
	movdqa \XMM4, \XMM8

	movdqa SHUF_MASK(%rip), %xmm15

	# Karatsuba multiply XMM5 by H^4, interleaved with generating the
	# four new counter blocks.
	movdqa \XMM5, \TMP4
	pshufd $78, \XMM5, \TMP6
	pxor \XMM5, \TMP6
	paddd ONE(%rip), \XMM0
	movdqa HashKey_4(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP4
	movdqa \XMM0, \XMM1
	paddd ONE(%rip), \XMM0
	movdqa \XMM0, \XMM2
	paddd ONE(%rip), \XMM0
	movdqa \XMM0, \XMM3
	paddd ONE(%rip), \XMM0
	movdqa \XMM0, \XMM4
	PSHUFB_XMM %xmm15, \XMM1
	PCLMULQDQ 0x00, \TMP5, \XMM5
	PSHUFB_XMM %xmm15, \XMM2
	PSHUFB_XMM %xmm15, \XMM3
	PSHUFB_XMM %xmm15, \XMM4

	# AES round 0 (whitening) and rounds 1-2, interleaved with the
	# XMM5 * H^4 Karatsuba middle term.
	pxor (%arg1), \XMM1
	pxor (%arg1), \XMM2
	pxor (%arg1), \XMM3
	pxor (%arg1), \XMM4
	movdqa HashKey_4_k(%arg2), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP6
	movaps 0x10(%arg1), \TMP1
	AESENC \TMP1, \XMM1
	AESENC \TMP1, \XMM2
	AESENC \TMP1, \XMM3
	AESENC \TMP1, \XMM4
	movaps 0x20(%arg1), \TMP1
	AESENC \TMP1, \XMM1
	AESENC \TMP1, \XMM2
	AESENC \TMP1, \XMM3
	AESENC \TMP1, \XMM4
	# XMM6 * H^3, interleaved with AES rounds 3-5.
	movdqa \XMM6, \TMP1
	pshufd $78, \XMM6, \TMP2
	pxor \XMM6, \TMP2
	movdqa HashKey_3(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1
	movaps 0x30(%arg1), \TMP3
	AESENC \TMP3, \XMM1
	AESENC \TMP3, \XMM2
	AESENC \TMP3, \XMM3
	AESENC \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM6
	movaps 0x40(%arg1), \TMP3
	AESENC \TMP3, \XMM1
	AESENC \TMP3, \XMM2
	AESENC \TMP3, \XMM3
	AESENC \TMP3, \XMM4
	movdqa HashKey_3_k(%arg2), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2
	movaps 0x50(%arg1), \TMP3
	AESENC \TMP3, \XMM1
	AESENC \TMP3, \XMM2
	AESENC \TMP3, \XMM3
	AESENC \TMP3, \XMM4
	# Accumulate into TMP4 (high), XMM5 (low), TMP6 (middle).
	pxor \TMP1, \TMP4

	pxor \XMM6, \XMM5
	pxor \TMP2, \TMP6
	# XMM7 * H^2, interleaved with AES rounds 6-8.
	movdqa \XMM7, \TMP1
	pshufd $78, \XMM7, \TMP2
	pxor \XMM7, \TMP2
	movdqa HashKey_2(%arg2), \TMP5



	PCLMULQDQ 0x11, \TMP5, \TMP1
	movaps 0x60(%arg1), \TMP3
	AESENC \TMP3, \XMM1
	AESENC \TMP3, \XMM2
	AESENC \TMP3, \XMM3
	AESENC \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM7
	movaps 0x70(%arg1), \TMP3
	AESENC \TMP3, \XMM1
	AESENC \TMP3, \XMM2
	AESENC \TMP3, \XMM3
	AESENC \TMP3, \XMM4
	movdqa HashKey_2_k(%arg2), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2
	movaps 0x80(%arg1), \TMP3
	AESENC \TMP3, \XMM1
	AESENC \TMP3, \XMM2
	AESENC \TMP3, \XMM3
	AESENC \TMP3, \XMM4
	pxor \TMP1, \TMP4

	pxor \XMM7, \XMM5
	pxor \TMP2, \TMP6

	# XMM8 * H^1, interleaved with AES round 9 and the variable-length
	# key schedule tail (AES-192/256 extra rounds).
	movdqa \XMM8, \TMP1
	pshufd $78, \XMM8, \TMP2
	pxor \XMM8, \TMP2
	movdqa HashKey(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1
	movaps 0x90(%arg1), \TMP3
	AESENC \TMP3, \XMM1
	AESENC \TMP3, \XMM2
	AESENC \TMP3, \XMM3
	AESENC \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM8
	lea 0xa0(%arg1),%r10
	mov keysize,%eax
	shr $2,%eax
	sub $4,%eax
	jz aes_loop_par_enc_done\@

aes_loop_par_enc\@:
	MOVADQ (%r10),\TMP3
.irpc index, 1234
	AESENC \TMP3, %xmm\index
.endr
	add $16,%r10
	sub $1,%eax
	jnz aes_loop_par_enc\@

aes_loop_par_enc_done\@:
	MOVADQ (%r10), \TMP3
	AESENCLAST \TMP3, \XMM1
	AESENCLAST \TMP3, \XMM2
	AESENCLAST \TMP3, \XMM3
	AESENCLAST \TMP3, \XMM4
	movdqa HashKey_k(%arg2), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2
	# XOR with the plaintext and write the ciphertext out.
	movdqu (%arg4,%r11,1), \TMP3
	pxor \TMP3, \XMM1
	movdqu 16(%arg4,%r11,1), \TMP3
	pxor \TMP3, \XMM2
	movdqu 32(%arg4,%r11,1), \TMP3
	pxor \TMP3, \XMM3
	movdqu 48(%arg4,%r11,1), \TMP3
	pxor \TMP3, \XMM4
	movdqu \XMM1, (%arg3,%r11,1)
	movdqu \XMM2, 16(%arg3,%r11,1)
	movdqu \XMM3, 32(%arg3,%r11,1)
	movdqu \XMM4, 48(%arg3,%r11,1)
	PSHUFB_XMM %xmm15, \XMM1
	PSHUFB_XMM %xmm15, \XMM2
	PSHUFB_XMM %xmm15, \XMM3
	PSHUFB_XMM %xmm15, \XMM4

	# Combine the four partial products and fold the 128-bit middle
	# term into the high (TMP1) and low (XMM5) halves.
	pxor \TMP4, \TMP1
	pxor \XMM8, \XMM5
	pxor \TMP6, \TMP2
	pxor \TMP1, \TMP2
	pxor \XMM5, \TMP2
	movdqa \TMP2, \TMP3
	pslldq $8, \TMP3
	psrldq $8, \TMP2
	pxor \TMP3, \XMM5
	pxor \TMP2, \TMP1

	# First reduction phase (left shifts 31/30/25).
	movdqa \XMM5, \TMP2
	movdqa \XMM5, \TMP3
	movdqa \XMM5, \TMP4

	pslld $31, \TMP2
	pslld $30, \TMP3
	pslld $25, \TMP4
	pxor \TMP3, \TMP2
	pxor \TMP4, \TMP2
	movdqa \TMP2, \TMP5
	psrldq $4, \TMP5
	pslldq $12, \TMP2
	pxor \TMP2, \XMM5

	# Second reduction phase (right shifts 1/2/7) and final fold.
	movdqa \XMM5,\TMP2
	movdqa \XMM5,\TMP3
	movdqa \XMM5,\TMP4
	psrld $1, \TMP2
	psrld $2, \TMP3
	psrld $7, \TMP4
	pxor \TMP3,\TMP2
	pxor \TMP4,\TMP2
	pxor \TMP5, \TMP2
	pxor \TMP2, \XMM5
	pxor \TMP1, \XMM5
	# Fold the reduced hash into the first new ciphertext block.
	pxor \XMM5, \XMM1
.endm
1186
1187
1188
1189
1190
1191
1192
# GHASH_4_ENCRYPT_4_PARALLEL_DEC: decrypt 4 blocks while ghashing the
# previous 4 ciphertext blocks; same structure as the _ENC variant.
#
# The only difference from _ENC is at the data stage: after writing the
# plaintext, the INPUT ciphertext (TMP3) is kept in XMM1..XMM4, because
# GHASH authenticates ciphertext in both directions.
# The trailing \operation parameter is unused (enc/dec are split macros).
# Clobbers %r10, %rax, %xmm15 and TMP1-TMP6.
.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
	# Move the previous ciphertext into XMM5..XMM8 for hashing.
	movdqa \XMM1, \XMM5
	movdqa \XMM2, \XMM6
	movdqa \XMM3, \XMM7
	movdqa \XMM4, \XMM8

	movdqa SHUF_MASK(%rip), %xmm15

	# Karatsuba multiply XMM5 by H^4, interleaved with generating the
	# four new counter blocks.
	movdqa \XMM5, \TMP4
	pshufd $78, \XMM5, \TMP6
	pxor \XMM5, \TMP6
	paddd ONE(%rip), \XMM0
	movdqa HashKey_4(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP4
	movdqa \XMM0, \XMM1
	paddd ONE(%rip), \XMM0
	movdqa \XMM0, \XMM2
	paddd ONE(%rip), \XMM0
	movdqa \XMM0, \XMM3
	paddd ONE(%rip), \XMM0
	movdqa \XMM0, \XMM4
	PSHUFB_XMM %xmm15, \XMM1
	PCLMULQDQ 0x00, \TMP5, \XMM5
	PSHUFB_XMM %xmm15, \XMM2
	PSHUFB_XMM %xmm15, \XMM3
	PSHUFB_XMM %xmm15, \XMM4

	# AES round 0 (whitening) and rounds 1-2.
	pxor (%arg1), \XMM1
	pxor (%arg1), \XMM2
	pxor (%arg1), \XMM3
	pxor (%arg1), \XMM4
	movdqa HashKey_4_k(%arg2), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP6
	movaps 0x10(%arg1), \TMP1
	AESENC \TMP1, \XMM1
	AESENC \TMP1, \XMM2
	AESENC \TMP1, \XMM3
	AESENC \TMP1, \XMM4
	movaps 0x20(%arg1), \TMP1
	AESENC \TMP1, \XMM1
	AESENC \TMP1, \XMM2
	AESENC \TMP1, \XMM3
	AESENC \TMP1, \XMM4
	# XMM6 * H^3, interleaved with AES rounds 3-5.
	movdqa \XMM6, \TMP1
	pshufd $78, \XMM6, \TMP2
	pxor \XMM6, \TMP2
	movdqa HashKey_3(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1
	movaps 0x30(%arg1), \TMP3
	AESENC \TMP3, \XMM1
	AESENC \TMP3, \XMM2
	AESENC \TMP3, \XMM3
	AESENC \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM6
	movaps 0x40(%arg1), \TMP3
	AESENC \TMP3, \XMM1
	AESENC \TMP3, \XMM2
	AESENC \TMP3, \XMM3
	AESENC \TMP3, \XMM4
	movdqa HashKey_3_k(%arg2), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2
	movaps 0x50(%arg1), \TMP3
	AESENC \TMP3, \XMM1
	AESENC \TMP3, \XMM2
	AESENC \TMP3, \XMM3
	AESENC \TMP3, \XMM4
	# Accumulate into TMP4 (high), XMM5 (low), TMP6 (middle).
	pxor \TMP1, \TMP4

	pxor \XMM6, \XMM5
	pxor \TMP2, \TMP6
	# XMM7 * H^2, interleaved with AES rounds 6-8.
	movdqa \XMM7, \TMP1
	pshufd $78, \XMM7, \TMP2
	pxor \XMM7, \TMP2
	movdqa HashKey_2(%arg2), \TMP5



	PCLMULQDQ 0x11, \TMP5, \TMP1
	movaps 0x60(%arg1), \TMP3
	AESENC \TMP3, \XMM1
	AESENC \TMP3, \XMM2
	AESENC \TMP3, \XMM3
	AESENC \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM7
	movaps 0x70(%arg1), \TMP3
	AESENC \TMP3, \XMM1
	AESENC \TMP3, \XMM2
	AESENC \TMP3, \XMM3
	AESENC \TMP3, \XMM4
	movdqa HashKey_2_k(%arg2), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2
	movaps 0x80(%arg1), \TMP3
	AESENC \TMP3, \XMM1
	AESENC \TMP3, \XMM2
	AESENC \TMP3, \XMM3
	AESENC \TMP3, \XMM4
	pxor \TMP1, \TMP4

	pxor \XMM7, \XMM5
	pxor \TMP2, \TMP6

	# XMM8 * H^1, interleaved with AES round 9 and the variable-length
	# key schedule tail (AES-192/256 extra rounds).
	movdqa \XMM8, \TMP1
	pshufd $78, \XMM8, \TMP2
	pxor \XMM8, \TMP2
	movdqa HashKey(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1
	movaps 0x90(%arg1), \TMP3
	AESENC \TMP3, \XMM1
	AESENC \TMP3, \XMM2
	AESENC \TMP3, \XMM3
	AESENC \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM8
	lea 0xa0(%arg1),%r10
	mov keysize,%eax
	shr $2,%eax
	sub $4,%eax
	jz aes_loop_par_dec_done\@

aes_loop_par_dec\@:
	MOVADQ (%r10),\TMP3
.irpc index, 1234
	AESENC \TMP3, %xmm\index
.endr
	add $16,%r10
	sub $1,%eax
	jnz aes_loop_par_dec\@

aes_loop_par_dec_done\@:
	MOVADQ (%r10), \TMP3
	AESENCLAST \TMP3, \XMM1
	AESENCLAST \TMP3, \XMM2
	AESENCLAST \TMP3, \XMM3
	AESENCLAST \TMP3, \XMM4
	movdqa HashKey_k(%arg2), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2
	# XOR with the ciphertext, write the plaintext, and keep the
	# ciphertext in XMM1..XMM4 for the next GHASH round.
	movdqu (%arg4,%r11,1), \TMP3
	pxor \TMP3, \XMM1
	movdqu \XMM1, (%arg3,%r11,1)
	movdqa \TMP3, \XMM1
	movdqu 16(%arg4,%r11,1), \TMP3
	pxor \TMP3, \XMM2
	movdqu \XMM2, 16(%arg3,%r11,1)
	movdqa \TMP3, \XMM2
	movdqu 32(%arg4,%r11,1), \TMP3
	pxor \TMP3, \XMM3
	movdqu \XMM3, 32(%arg3,%r11,1)
	movdqa \TMP3, \XMM3
	movdqu 48(%arg4,%r11,1), \TMP3
	pxor \TMP3, \XMM4
	movdqu \XMM4, 48(%arg3,%r11,1)
	movdqa \TMP3, \XMM4
	PSHUFB_XMM %xmm15, \XMM1
	PSHUFB_XMM %xmm15, \XMM2
	PSHUFB_XMM %xmm15, \XMM3
	PSHUFB_XMM %xmm15, \XMM4

	# Combine the four partial products and fold the 128-bit middle
	# term into the high (TMP1) and low (XMM5) halves.
	pxor \TMP4, \TMP1
	pxor \XMM8, \XMM5
	pxor \TMP6, \TMP2
	pxor \TMP1, \TMP2
	pxor \XMM5, \TMP2
	movdqa \TMP2, \TMP3
	pslldq $8, \TMP3
	psrldq $8, \TMP2
	pxor \TMP3, \XMM5
	pxor \TMP2, \TMP1

	# First reduction phase (left shifts 31/30/25).
	movdqa \XMM5, \TMP2
	movdqa \XMM5, \TMP3
	movdqa \XMM5, \TMP4

	pslld $31, \TMP2
	pslld $30, \TMP3
	pslld $25, \TMP4
	pxor \TMP3, \TMP2
	pxor \TMP4, \TMP2
	movdqa \TMP2, \TMP5
	psrldq $4, \TMP5
	pslldq $12, \TMP2
	pxor \TMP2, \XMM5

	# Second reduction phase (right shifts 1/2/7) and final fold.
	movdqa \XMM5,\TMP2
	movdqa \XMM5,\TMP3
	movdqa \XMM5,\TMP4
	psrld $1, \TMP2
	psrld $2, \TMP3
	psrld $7, \TMP4
	pxor \TMP3,\TMP2
	pxor \TMP4,\TMP2
	pxor \TMP5, \TMP2
	pxor \TMP2, \XMM5
	pxor \TMP1, \XMM5
	# Fold the reduced hash into the first kept ciphertext block.
	pxor \XMM5, \XMM1
.endm
1398
1399
# GHASH_LAST_4: fold the final four ciphertext blocks XMM1..XMM4 into the
# hash using H^4, H^3, H^2, H^1 (so the whole message is multiplied by
# consistent powers), then reduce.  Result in \XMMDst.
# Clobbers TMP1-TMP7 and XMM1..XMM4.
.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst

	# XMM1 * H^4: accumulators are TMP6 (high), XMMDst (low),
	# XMM1 (Karatsuba middle term).
	movdqa \XMM1, \TMP6
	pshufd $78, \XMM1, \TMP2
	pxor \XMM1, \TMP2
	movdqa HashKey_4(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP6
	PCLMULQDQ 0x00, \TMP5, \XMM1
	movdqa HashKey_4_k(%arg2), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2
	movdqa \XMM1, \XMMDst
	movdqa \TMP2, \XMM1

	# XMM2 * H^3, accumulated.
	movdqa \XMM2, \TMP1
	pshufd $78, \XMM2, \TMP2
	pxor \XMM2, \TMP2
	movdqa HashKey_3(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1
	PCLMULQDQ 0x00, \TMP5, \XMM2
	movdqa HashKey_3_k(%arg2), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2
	pxor \TMP1, \TMP6
	pxor \XMM2, \XMMDst
	pxor \TMP2, \XMM1

	# XMM3 * H^2, accumulated.
	movdqa \XMM3, \TMP1
	pshufd $78, \XMM3, \TMP2
	pxor \XMM3, \TMP2
	movdqa HashKey_2(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1
	PCLMULQDQ 0x00, \TMP5, \XMM3
	movdqa HashKey_2_k(%arg2), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2
	pxor \TMP1, \TMP6
	pxor \XMM3, \XMMDst
	pxor \TMP2, \XMM1

	# XMM4 * H, accumulated; then fold the middle term into the
	# high (TMP6) and low (XMMDst) halves.
	movdqa \XMM4, \TMP1
	pshufd $78, \XMM4, \TMP2
	pxor \XMM4, \TMP2
	movdqa HashKey(%arg2), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1
	PCLMULQDQ 0x00, \TMP5, \XMM4
	movdqa HashKey_k(%arg2), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2
	pxor \TMP1, \TMP6
	pxor \XMM4, \XMMDst
	pxor \XMM1, \TMP2
	pxor \TMP6, \TMP2
	pxor \XMMDst, \TMP2

	movdqa \TMP2, \TMP4
	pslldq $8, \TMP4
	psrldq $8, \TMP2
	pxor \TMP4, \XMMDst
	pxor \TMP2, \TMP6

	# First reduction phase (left shifts 31/30/25).
	movdqa \XMMDst, \TMP2
	movdqa \XMMDst, \TMP3
	movdqa \XMMDst, \TMP4

	pslld $31, \TMP2
	pslld $30, \TMP3
	pslld $25, \TMP4
	pxor \TMP3, \TMP2
	pxor \TMP4, \TMP2
	movdqa \TMP2, \TMP7
	psrldq $4, \TMP7
	pslldq $12, \TMP2
	pxor \TMP2, \XMMDst

	# Second reduction phase (right shifts 1/2/7) and final fold.
	movdqa \XMMDst, \TMP2

	movdqa \XMMDst, \TMP3
	movdqa \XMMDst, \TMP4
	psrld $1, \TMP2
	psrld $2, \TMP3
	psrld $7, \TMP4
	pxor \TMP3, \TMP2
	pxor \TMP4, \TMP2
	pxor \TMP7, \TMP2
	pxor \TMP2, \XMMDst
	pxor \TMP6, \XMMDst
.endm
1495
1496
1497
1498
1499
1500
/*
 * ENCRYPT_SINGLE_BLOCK: AES-encrypt the block in \XMM0 in place using
 * the expanded key schedule at (%arg1).
 * Clobbers \TMP1, %eax (round counter) and %r10 (round-key pointer).
 */
.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1

	pxor (%arg1), \XMM0		# whitening: xor in round key 0
	mov keysize,%eax		# assumes key size in bytes (16/24/32)
	shr $2,%eax			# 4/6/8
	add $5,%eax			# eax = 9/11/13 = nrounds - 1
	lea 16(%arg1), %r10		# first middle round key

_esb_loop_\@:
	MOVADQ (%r10),\TMP1
	AESENC \TMP1,\XMM0
	add $16,%r10
	sub $1,%eax
	jnz _esb_loop_\@

	MOVADQ (%r10),\TMP1		# last round key
	AESENCLAST \TMP1,\XMM0
.endm
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
/*
 * aesni_gcm_dec: one-shot AES-GCM decryption.
 * Initializes the GCM state from the IV/hash subkey/AAD arguments,
 * decrypts (and GHASHes) the whole input, then computes the
 * authentication tag.  Argument layout is defined by the GCM_INIT /
 * GCM_ENC_DEC / GCM_COMPLETE macros (defined earlier in this file).
 */
ENTRY(aesni_gcm_dec)
	FUNC_SAVE

	GCM_INIT %arg6, arg7, arg8, arg9
	GCM_ENC_DEC dec
	GCM_COMPLETE arg10, arg11	# emit the authentication tag
	FUNC_RESTORE
	ret
ENDPROC(aesni_gcm_dec)
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
/*
 * aesni_gcm_enc: one-shot AES-GCM encryption.
 * Mirror image of aesni_gcm_dec: init, encrypt+GHASH all data, then
 * produce the authentication tag.
 */
ENTRY(aesni_gcm_enc)
	FUNC_SAVE

	GCM_INIT %arg6, arg7, arg8, arg9
	GCM_ENC_DEC enc

	GCM_COMPLETE arg10, arg11	# emit the authentication tag
	FUNC_RESTORE
	ret
ENDPROC(aesni_gcm_enc)
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
/*
 * aesni_gcm_init: scatter-gather interface, step 1 of 3.
 * Sets up the per-request GCM context (IV, hash subkeys, AAD hash)
 * so that data can be fed incrementally via aesni_gcm_enc_update /
 * aesni_gcm_dec_update and finished with aesni_gcm_finalize.
 */
ENTRY(aesni_gcm_init)
	FUNC_SAVE
	GCM_INIT %arg3, %arg4,%arg5, %arg6
	FUNC_RESTORE
	ret
ENDPROC(aesni_gcm_init)
1715
1716
1717
1718
1719
1720
1721
1722
1723
/*
 * aesni_gcm_enc_update: scatter-gather interface, step 2 of 3 (encrypt).
 * Encrypts and GHASHes one chunk of data using the context prepared by
 * aesni_gcm_init.  May be called repeatedly.
 */
ENTRY(aesni_gcm_enc_update)
	FUNC_SAVE
	GCM_ENC_DEC enc
	FUNC_RESTORE
	ret
ENDPROC(aesni_gcm_enc_update)
1730
1731
1732
1733
1734
1735
1736
1737
1738
/*
 * aesni_gcm_dec_update: scatter-gather interface, step 2 of 3 (decrypt).
 * Decrypts and GHASHes one chunk of data using the context prepared by
 * aesni_gcm_init.  May be called repeatedly.
 */
ENTRY(aesni_gcm_dec_update)
	FUNC_SAVE
	GCM_ENC_DEC dec
	FUNC_RESTORE
	ret
ENDPROC(aesni_gcm_dec_update)
1745
1746
1747
1748
1749
1750
1751
1752
1753
/*
 * aesni_gcm_finalize: scatter-gather interface, step 3 of 3.
 * Finishes the GHASH over the accumulated lengths and writes out the
 * authentication tag.
 */
ENTRY(aesni_gcm_finalize)
	FUNC_SAVE
	GCM_COMPLETE %arg3 %arg4
	FUNC_RESTORE
	ret
ENDPROC(aesni_gcm_finalize)
1760
1761#endif
1762
1763
.align 4
/*
 * Shared key-expansion step for AES-128 and the even ("a") steps of
 * AES-256.  In: %xmm0 = previous round key, %xmm1 = AESKEYGENASSIST
 * output, %xmm4 = zero (caller pre-clears it), TKEYP = store pointer.
 * Out: new round key in %xmm0, stored at (TKEYP); TKEYP += 16.
 */
_key_expansion_128:
_key_expansion_256a:
	pshufd $0b11111111, %xmm1, %xmm1	# broadcast rotword/subword
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0		# fold in the shifted prev words
	pxor %xmm1, %xmm0		# xmm0 = next round key
	movaps %xmm0, (TKEYP)
	add $0x10, TKEYP
	ret
ENDPROC(_key_expansion_128)
ENDPROC(_key_expansion_256a)
1778
.align 4
/*
 * AES-192 key-expansion step producing 32 bytes (two round keys).
 * In: %xmm0 = previous 128-bit key part, %xmm2 = previous 64-bit key
 * part, %xmm1 = AESKEYGENASSIST output, %xmm4 = zero, TKEYP = store
 * pointer.  Out: two round keys stored at (TKEYP); TKEYP += 32;
 * %xmm0/%xmm2 updated for the next step.  Clobbers %xmm3, %xmm5, %xmm6.
 */
_key_expansion_192a:
	pshufd $0b01010101, %xmm1, %xmm1	# broadcast rotword/subword
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0		# xmm0 = next 128-bit key part

	movaps %xmm2, %xmm5
	movaps %xmm2, %xmm6		# xmm6 keeps the old 64-bit part
	pslldq $4, %xmm5
	pshufd $0b11111111, %xmm0, %xmm3
	pxor %xmm3, %xmm2
	pxor %xmm5, %xmm2		# xmm2 = next 64-bit key part

	/* repack {old64, new128, new64} into two 128-bit round keys */
	movaps %xmm0, %xmm1
	shufps $0b01000100, %xmm0, %xmm6
	movaps %xmm6, (TKEYP)
	shufps $0b01001110, %xmm2, %xmm1
	movaps %xmm1, 0x10(TKEYP)
	add $0x20, TKEYP
	ret
ENDPROC(_key_expansion_192a)
1803
.align 4
/*
 * AES-192 key-expansion step producing 16 bytes (one round key).
 * Same word expansion as _key_expansion_192a, but only the 128-bit
 * part is stored (the 64-bit part stays live in %xmm2 for the next
 * "a" step).  TKEYP += 16.  Clobbers %xmm3, %xmm5.
 */
_key_expansion_192b:
	pshufd $0b01010101, %xmm1, %xmm1	# broadcast rotword/subword
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0		# xmm0 = next 128-bit key part

	movaps %xmm2, %xmm5
	pslldq $4, %xmm5
	pshufd $0b11111111, %xmm0, %xmm3
	pxor %xmm3, %xmm2
	pxor %xmm5, %xmm2		# xmm2 = next 64-bit key part

	movaps %xmm0, (TKEYP)
	add $0x10, TKEYP
	ret
ENDPROC(_key_expansion_192b)
1823
.align 4
/*
 * Odd ("b") key-expansion step for AES-256: derives the next round key
 * into %xmm2 (the second half of the 256-bit key state) from the
 * AESKEYGENASSIST output in %xmm1.  %xmm4 must be zero on entry.
 * Stores the key at (TKEYP); TKEYP += 16.
 */
_key_expansion_256b:
	pshufd $0b10101010, %xmm1, %xmm1	# broadcast subword (no rot)
	shufps $0b00010000, %xmm2, %xmm4
	pxor %xmm4, %xmm2
	shufps $0b10001100, %xmm2, %xmm4
	pxor %xmm4, %xmm2		# fold in the shifted prev words
	pxor %xmm1, %xmm2		# xmm2 = next round key
	movaps %xmm2, (TKEYP)
	add $0x10, TKEYP
	ret
ENDPROC(_key_expansion_256b)
1836
1837
1838
1839
1840
/*
 * aesni_set_key: expand the user key into the AES context.
 * In: KEYP = crypto context, UKEYP = raw user key, %edx = key length
 * in bytes (16/24/32; stored at 480(KEYP)).
 * Builds the encryption round keys at (KEYP) and then derives the
 * decryption round keys (AESIMC of the middle encryption keys, in
 * reverse order) into the schedule starting at 240(KEYP).
 * Returns 0 in AREG.
 */
ENTRY(aesni_set_key)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	movl (FRAME_OFFSET+8)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+12)(%esp), UKEYP	# in_key
	movl (FRAME_OFFSET+16)(%esp), %edx	# key_len
#endif
	movups (UKEYP), %xmm0		# user key (first 16 bytes)
	movaps %xmm0, (KEYP)		# round key 0 = raw key
	lea 0x10(KEYP), TKEYP		# key addr
	movl %edx, 480(KEYP)		# remember key length for enc/dec
	pxor %xmm4, %xmm4		# _key_expansion_* need xmm4 == 0
	cmp $24, %dl
	jb .Lenc_key128
	je .Lenc_key192
	/* AES-256: second 16 key bytes, then 13 expansion steps */
	movups 0x10(UKEYP), %xmm2	# other user key
	movaps %xmm2, (TKEYP)
	add $0x10, TKEYP
	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
	call _key_expansion_256a
	AESKEYGENASSIST 0x1 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
	call _key_expansion_256a
	AESKEYGENASSIST 0x2 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
	call _key_expansion_256a
	AESKEYGENASSIST 0x4 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
	call _key_expansion_256a
	AESKEYGENASSIST 0x8 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
	call _key_expansion_256a
	AESKEYGENASSIST 0x10 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
	call _key_expansion_256a
	AESKEYGENASSIST 0x20 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
	call _key_expansion_256a
	jmp .Ldec_key
.Lenc_key192:
	/* AES-192: 8 bytes more of user key, then 8 expansion steps */
	movq 0x10(UKEYP), %xmm2		# other user key
	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
	call _key_expansion_192a
	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
	call _key_expansion_192b
	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
	call _key_expansion_192a
	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
	call _key_expansion_192b
	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
	call _key_expansion_192a
	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
	call _key_expansion_192b
	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
	call _key_expansion_192a
	AESKEYGENASSIST 0x80 %xmm2 %xmm1	# round 8
	call _key_expansion_192b
	jmp .Ldec_key
.Lenc_key128:
	/* AES-128: ten expansion steps with rcon 0x01..0x36 */
	AESKEYGENASSIST 0x1 %xmm0 %xmm1		# round 1
	call _key_expansion_128
	AESKEYGENASSIST 0x2 %xmm0 %xmm1		# round 2
	call _key_expansion_128
	AESKEYGENASSIST 0x4 %xmm0 %xmm1		# round 3
	call _key_expansion_128
	AESKEYGENASSIST 0x8 %xmm0 %xmm1		# round 4
	call _key_expansion_128
	AESKEYGENASSIST 0x10 %xmm0 %xmm1	# round 5
	call _key_expansion_128
	AESKEYGENASSIST 0x20 %xmm0 %xmm1	# round 6
	call _key_expansion_128
	AESKEYGENASSIST 0x40 %xmm0 %xmm1	# round 7
	call _key_expansion_128
	AESKEYGENASSIST 0x80 %xmm0 %xmm1	# round 8
	call _key_expansion_128
	AESKEYGENASSIST 0x1b %xmm0 %xmm1	# round 9
	call _key_expansion_128
	AESKEYGENASSIST 0x36 %xmm0 %xmm1	# round 10
	call _key_expansion_128
.Ldec_key:
	/* Build the decryption schedule at KEYP+240: its first key is
	 * the last encryption key, its last key is encryption key 0, and
	 * the middle keys are InvMixColumns of the middle enc keys in
	 * reverse order. */
	sub $0x10, TKEYP		# TKEYP -> last enc round key
	movaps (KEYP), %xmm0
	movaps (TKEYP), %xmm1
	movaps %xmm0, 240(TKEYP)	# last dec key = enc key 0 (as-is)
	movaps %xmm1, 240(KEYP)		# first dec key = last enc key
	add $0x10, KEYP
	lea 240-16(TKEYP), UKEYP	# UKEYP walks backwards over dec keys
.align 4
.Ldec_key_loop:
	movaps (KEYP), %xmm0
	AESIMC %xmm0 %xmm1		# InvMixColumns for equivalent-inverse
	movaps %xmm1, (UKEYP)
	add $0x10, KEYP
	sub $0x10, UKEYP
	cmp TKEYP, KEYP
	jb .Ldec_key_loop
	xor AREG, AREG			# return 0
#ifndef __x86_64__
	popl KEYP
#endif
	FRAME_END
	ret
ENDPROC(aesni_set_key)
1951
1952
1953
1954
/*
 * aesni_enc: encrypt one 16-byte block.
 * In: KEYP = AES context (key schedule; key length at offset 480),
 * OUTP = dst, INP = src.  Input/output may be unaligned (movups).
 */
ENTRY(aesni_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+20)(%esp), INP	# src
#endif
	movl 480(KEYP), KLEN		# key length (16/24/32)
	movups (INP), STATE		# load plaintext block
	call _aesni_enc1
	movups STATE, (OUTP)		# store ciphertext block
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
	FRAME_END
	ret
ENDPROC(aesni_enc)
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
.align 4
/*
 * _aesni_enc1: encrypt the single block in STATE.
 * In: KEYP = key schedule, KLEN = key length in bytes, STATE = block.
 * Out: STATE encrypted.  Clobbers KEY, TKEYP.
 * TKEYP is biased per key length so the last round key is always at
 * 0x70(TKEYP); the longer keys execute the extra leading rounds.
 */
_aesni_enc1:
	movaps (KEYP), KEY		# round key 0
	mov KEYP, TKEYP
	pxor KEY, STATE			# whitening
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .Lenc128
	lea 0x20(TKEYP), TKEYP
	je .Lenc192
	/* AES-256: two extra rounds */
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	AESENC KEY STATE
	movaps -0x50(TKEYP), KEY
	AESENC KEY STATE
.align 4
.Lenc192:
	/* AES-192: two extra rounds */
	movaps -0x40(TKEYP), KEY
	AESENC KEY STATE
	movaps -0x30(TKEYP), KEY
	AESENC KEY STATE
.align 4
.Lenc128:
	/* common final ten rounds */
	movaps -0x20(TKEYP), KEY
	AESENC KEY STATE
	movaps -0x10(TKEYP), KEY
	AESENC KEY STATE
	movaps (TKEYP), KEY
	AESENC KEY STATE
	movaps 0x10(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x20(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x30(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x40(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x50(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x60(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x70(TKEYP), KEY
	AESENCLAST KEY STATE		# final round
	ret
ENDPROC(_aesni_enc1)
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
.align 4
/*
 * _aesni_enc4: encrypt four independent blocks in parallel (fills the
 * AESENC pipeline).  In: KEYP = key schedule, KLEN = key length in
 * bytes, STATE1..STATE4 = blocks.  Out: STATE1..STATE4 encrypted.
 * Clobbers KEY, TKEYP.  Same TKEYP biasing scheme as _aesni_enc1.
 * The .align 4 before the fall-through labels matches _aesni_dec4.
 */
_aesni_enc4:
	movaps (KEYP), KEY		# round key 0
	mov KEYP, TKEYP
	pxor KEY, STATE1		# whitening
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .L4enc128
	lea 0x20(TKEYP), TKEYP
	je .L4enc192
	/* AES-256: two extra rounds */
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps -0x50(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
.align 4
.L4enc192:
	/* AES-192: two extra rounds */
	movaps -0x40(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps -0x30(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
.align 4
.L4enc128:
	/* common final ten rounds */
	movaps -0x20(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps -0x10(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps (TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x10(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x20(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x30(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x40(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x50(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x60(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x70(TKEYP), KEY
	AESENCLAST KEY STATE1		# final round
	AESENCLAST KEY STATE2
	AESENCLAST KEY STATE3
	AESENCLAST KEY STATE4
	ret
ENDPROC(_aesni_enc4)
2142
2143
2144
2145
/*
 * aesni_dec: decrypt one 16-byte block.
 * In: KEYP = AES context, OUTP = dst, INP = src.
 * The decryption key schedule lives at offset 240 in the context.
 */
ENTRY(aesni_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+20)(%esp), INP	# src
#endif
	mov 480(KEYP), KLEN		# key length (16/24/32)
	add $240, KEYP			# switch to the decryption schedule
	movups (INP), STATE		# load ciphertext block
	call _aesni_dec1
	movups STATE, (OUTP)		# store plaintext block
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
	FRAME_END
	ret
ENDPROC(aesni_dec)
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
.align 4
/*
 * _aesni_dec1: decrypt the single block in STATE.
 * In: KEYP = decryption key schedule, KLEN = key length in bytes,
 * STATE = block.  Out: STATE decrypted.  Clobbers KEY, TKEYP.
 * Same TKEYP biasing as _aesni_enc1: last round key at 0x70(TKEYP).
 */
_aesni_dec1:
	movaps (KEYP), KEY		# round key 0
	mov KEYP, TKEYP
	pxor KEY, STATE			# whitening
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .Ldec128
	lea 0x20(TKEYP), TKEYP
	je .Ldec192
	/* AES-256: two extra rounds */
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	AESDEC KEY STATE
	movaps -0x50(TKEYP), KEY
	AESDEC KEY STATE
.align 4
.Ldec192:
	/* AES-192: two extra rounds */
	movaps -0x40(TKEYP), KEY
	AESDEC KEY STATE
	movaps -0x30(TKEYP), KEY
	AESDEC KEY STATE
.align 4
.Ldec128:
	/* common final ten rounds */
	movaps -0x20(TKEYP), KEY
	AESDEC KEY STATE
	movaps -0x10(TKEYP), KEY
	AESDEC KEY STATE
	movaps (TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x10(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x20(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x30(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x40(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x50(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x60(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x70(TKEYP), KEY
	AESDECLAST KEY STATE		# final round
	ret
ENDPROC(_aesni_dec1)
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
.align 4
/*
 * _aesni_dec4: decrypt four independent blocks in parallel (fills the
 * AESDEC pipeline).  In: KEYP = decryption key schedule, KLEN = key
 * length in bytes, STATE1..STATE4 = blocks.  Out: STATE1..STATE4
 * decrypted.  Clobbers KEY, TKEYP.  Same layout as _aesni_enc4.
 */
_aesni_dec4:
	movaps (KEYP), KEY		# round key 0
	mov KEYP, TKEYP
	pxor KEY, STATE1		# whitening
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .L4dec128
	lea 0x20(TKEYP), TKEYP
	je .L4dec192
	/* AES-256: two extra rounds */
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps -0x50(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
.align 4
.L4dec192:
	/* AES-192: two extra rounds */
	movaps -0x40(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps -0x30(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
.align 4
.L4dec128:
	/* common final ten rounds */
	movaps -0x20(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps -0x10(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps (TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x10(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x20(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x30(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x40(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x50(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x60(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x70(TKEYP), KEY
	AESDECLAST KEY STATE1		# final round
	AESDECLAST KEY STATE2
	AESDECLAST KEY STATE3
	AESDECLAST KEY STATE4
	ret
ENDPROC(_aesni_dec4)
2334
2335
2336
2337
2338
/*
 * aesni_ecb_enc: ECB-encrypt LEN bytes (LEN is truncated to a multiple
 * of 16; a trailing partial block is ignored).
 * In: KEYP = ctx, OUTP = dst, INP = src, LEN = length in bytes.
 * Processes 4 blocks at a time via _aesni_enc4, then singles.
 */
ENTRY(aesni_ecb_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
#endif
	test LEN, LEN			# nothing to do?
	jz .Lecb_enc_ret
	mov 480(KEYP), KLEN
	cmp $16, LEN
	jb .Lecb_enc_ret
	cmp $64, LEN
	jb .Lecb_enc_loop1		# fewer than 4 blocks left
.align 4
.Lecb_enc_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_enc4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lecb_enc_loop4
	cmp $16, LEN
	jb .Lecb_enc_ret
.align 4
.Lecb_enc_loop1:
	movups (INP), STATE1
	call _aesni_enc1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lecb_enc_loop1
.Lecb_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	FRAME_END
	ret
ENDPROC(aesni_ecb_enc)
2394
2395
2396
2397
2398
/*
 * aesni_ecb_dec: ECB-decrypt LEN bytes (multiple of 16; trailing
 * partial block ignored).
 * In: KEYP = ctx, OUTP = dst, INP = src, LEN = length in bytes.
 * Uses the decryption key schedule at KEYP+240.
 */
ENTRY(aesni_ecb_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
#endif
	test LEN, LEN			# nothing to do?
	jz .Lecb_dec_ret
	mov 480(KEYP), KLEN
	add $240, KEYP			# switch to the decryption schedule
	cmp $16, LEN
	jb .Lecb_dec_ret
	cmp $64, LEN
	jb .Lecb_dec_loop1		# fewer than 4 blocks left
.align 4
.Lecb_dec_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_dec4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lecb_dec_loop4
	cmp $16, LEN
	jb .Lecb_dec_ret
.align 4
.Lecb_dec_loop1:
	movups (INP), STATE1
	call _aesni_dec1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lecb_dec_loop1
.Lecb_dec_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	FRAME_END
	ret
ENDPROC(aesni_ecb_dec)
2455
2456
2457
2458
2459
/*
 * aesni_cbc_enc: CBC-encrypt LEN bytes (multiple of 16).
 * In: KEYP = ctx, OUTP = dst, INP = src, LEN = length, IVP = IV buffer.
 * CBC encryption is inherently serial (each block chains off the
 * previous ciphertext), so only the 1-block path exists.
 * The final ciphertext block is written back to (IVP) for chaining.
 */
ENTRY(aesni_cbc_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
#endif
	cmp $16, LEN
	jb .Lcbc_enc_ret
	mov 480(KEYP), KLEN
	movups (IVP), STATE		# chaining value starts as the IV
.align 4
.Lcbc_enc_loop:
	movups (INP), IN		# load plaintext
	pxor IN, STATE			# xor with previous ciphertext/IV
	call _aesni_enc1
	movups STATE, (OUTP)		# STATE doubles as next chain value
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lcbc_enc_loop
	movups STATE, (IVP)		# save final IV for the caller
.Lcbc_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	ret
ENDPROC(aesni_cbc_enc)
2499
2500
2501
2502
2503
/*
 * aesni_cbc_dec: CBC-decrypt LEN bytes (multiple of 16).
 * In: KEYP = ctx, OUTP = dst, INP = src, LEN = length, IVP = IV buffer.
 * Unlike encryption, CBC decryption parallelizes: 4 blocks go through
 * _aesni_dec4 together, then each is xored with the preceding
 * ciphertext block.  The final input ciphertext block is written back
 * to (IVP) for chaining.
 */
ENTRY(aesni_cbc_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
#endif
	cmp $16, LEN
	jb .Lcbc_dec_just_ret
	mov 480(KEYP), KLEN
	add $240, KEYP			# decryption key schedule
	movups (IVP), IV
	cmp $64, LEN
	jb .Lcbc_dec_loop1
.align 4
.Lcbc_dec_loop4:
	movups (INP), IN1
	movaps IN1, STATE1
	movups 0x10(INP), IN2
	movaps IN2, STATE2
#ifdef __x86_64__
	movups 0x20(INP), IN3
	movaps IN3, STATE3
	movups 0x30(INP), IN4
	movaps IN4, STATE4
#else
	/* 32-bit: not enough xmm regs to keep all four inputs; blocks
	 * 0/1 are reloaded from (INP) after the decrypt below */
	movups 0x20(INP), IN1
	movaps IN1, STATE3
	movups 0x30(INP), IN2
	movaps IN2, STATE4
#endif
	call _aesni_dec4
	pxor IV, STATE1			# block 0 chains off the IV
#ifdef __x86_64__
	pxor IN1, STATE2		# blocks 1-3 chain off prev ctext
	pxor IN2, STATE3
	pxor IN3, STATE4
	movaps IN4, IV			# last ciphertext becomes next IV
#else
	pxor IN1, STATE4
	movaps IN2, IV			# last ciphertext becomes next IV
	movups (INP), IN1		# reload ciphertext blocks 0/1
	pxor IN1, STATE2
	movups 0x10(INP), IN2
	pxor IN2, STATE3
#endif
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lcbc_dec_loop4
	cmp $16, LEN
	jb .Lcbc_dec_ret
.align 4
.Lcbc_dec_loop1:
	movups (INP), IN
	movaps IN, STATE
	call _aesni_dec1
	pxor IV, STATE			# xor with previous ciphertext/IV
	movups STATE, (OUTP)
	movaps IN, IV			# this ciphertext becomes next IV
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lcbc_dec_loop1
.Lcbc_dec_ret:
	movups IV, (IVP)		# save final IV for the caller
.Lcbc_dec_just_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	ret
ENDPROC(aesni_cbc_dec)
2592
2593#ifdef __x86_64__
2594.pushsection .rodata
2595.align 16
2596.Lbswap_mask:
2597 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
2598.popsection
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
.align 4
/*
 * _aesni_inc_init: set up the CTR-mode counter state.
 * In: IV = big-endian counter block.  Out: CTR = byte-swapped
 * (little-endian) counter, INC = 1 in the low qword, TCTR_LOW = low
 * 64 bits of the counter mirrored in a GPR so _aesni_inc can detect
 * carry cheaply.  Clobbers BSWAP_MASK.
 */
_aesni_inc_init:
	movaps .Lbswap_mask, BSWAP_MASK
	movaps IV, CTR
	PSHUFB_XMM BSWAP_MASK CTR	# big-endian -> little-endian
	mov $1, TCTR_LOW
	MOVQ_R64_XMM TCTR_LOW INC	# INC = 1
	MOVQ_R64_XMM CTR TCTR_LOW	# mirror low 64 counter bits in GPR
	ret
ENDPROC(_aesni_inc_init)
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
.align 4
/*
 * _aesni_inc: advance the 128-bit counter by one and emit it as IV.
 * In/out: CTR = little-endian counter, TCTR_LOW = GPR mirror of the
 * low 64 bits, INC = 1, BSWAP_MASK set up by _aesni_inc_init.
 * Out: IV = byte-swapped (big-endian) incremented counter.
 * The GPR add detects low-qword wraparound; on carry, INC is shifted
 * into the high qword and added there too.
 */
_aesni_inc:
	paddq INC, CTR			# low qword += 1
	add $1, TCTR_LOW
	jnc .Linc_low			# no wraparound -> done
	pslldq $8, INC			# propagate carry to high qword
	paddq INC, CTR
	psrldq $8, INC			# restore INC = 1
.Linc_low:
	movaps CTR, IV
	PSHUFB_XMM BSWAP_MASK IV	# little-endian -> big-endian
	ret
ENDPROC(_aesni_inc)
2650
2651
2652
2653
2654
/*
 * aesni_ctr_enc: CTR-mode encrypt/decrypt LEN bytes (same operation
 * both directions; trailing partial block ignored here).
 * In: KEYP = ctx, OUTP = dst, INP = src, LEN = length, IVP = counter
 * block.  Generates keystream blocks from successive counter values
 * (via _aesni_inc) and xors them with the input; 4 blocks at a time
 * through _aesni_enc4.  The updated counter is stored back to (IVP).
 * x86_64 only (no 32-bit argument reload block).
 */
ENTRY(aesni_ctr_enc)
	FRAME_BEGIN
	cmp $16, LEN
	jb .Lctr_enc_just_ret
	mov 480(KEYP), KLEN
	movups (IVP), IV
	call _aesni_inc_init		# set up CTR/INC/TCTR_LOW
	cmp $64, LEN
	jb .Lctr_enc_loop1
.align 4
.Lctr_enc_loop4:
	movaps IV, STATE1		# 4 successive counter blocks
	call _aesni_inc
	movups (INP), IN1
	movaps IV, STATE2
	call _aesni_inc
	movups 0x10(INP), IN2
	movaps IV, STATE3
	call _aesni_inc
	movups 0x20(INP), IN3
	movaps IV, STATE4
	call _aesni_inc
	movups 0x30(INP), IN4
	call _aesni_enc4		# keystream = E(counter blocks)
	pxor IN1, STATE1		# xor keystream with input
	movups STATE1, (OUTP)
	pxor IN2, STATE2
	movups STATE2, 0x10(OUTP)
	pxor IN3, STATE3
	movups STATE3, 0x20(OUTP)
	pxor IN4, STATE4
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lctr_enc_loop4
	cmp $16, LEN
	jb .Lctr_enc_ret
.align 4
.Lctr_enc_loop1:
	movaps IV, STATE
	call _aesni_inc
	movups (INP), IN
	call _aesni_enc1
	pxor IN, STATE
	movups STATE, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lctr_enc_loop1
.Lctr_enc_ret:
	movups IV, (IVP)		# save updated counter
.Lctr_enc_just_ret:
	FRAME_END
	ret
ENDPROC(aesni_ctr_enc)
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
/*
 * _aesni_gf128mul_x_ble(): multiply the XTS tweak in IV by x in
 * GF(2^128) using the ble (bit-reversed little-endian) convention:
 * double both qwords, then conditionally fold the reduction constant
 * (in GF128MUL_MASK, loaded from .Lgf128mul_x_ble_mask) back in based
 * on the sign bits replicated by pshufd/psrad.  Clobbers CTR.
 */
#define _aesni_gf128mul_x_ble() \
	pshufd $0x13, IV, CTR; \
	paddq IV, IV; \
	psrad $31, CTR; \
	pand GF128MUL_MASK, CTR; \
	pxor CTR, IV;
2731
2732
2733
2734
2735
/*
 * aesni_xts_crypt8: encrypt or decrypt eight XTS blocks.
 * In: KEYP = ctx, OUTP = dst, INP = src, IVP = current tweak,
 * %cl = direction flag: zero selects decryption (key schedule at
 * KEYP+240, _aesni_dec4), non-zero selects encryption.
 * Each block is xored with its tweak before and after the block
 * cipher; the tweak is multiplied by x between blocks.  The tweaks
 * for the second _aesni_*4 call are stashed in the output buffer
 * while the first four blocks are in flight, then xored back in.
 * The ninth tweak is written to (IVP) for the next call.
 */
ENTRY(aesni_xts_crypt8)
	FRAME_BEGIN
	cmpb $0, %cl
	movl $0, %ecx
	movl $240, %r10d
	leaq _aesni_enc4, %r11
	leaq _aesni_dec4, %rax
	cmovel %r10d, %ecx		# %cl == 0: key offset 240 (decrypt)
	cmoveq %rax, %r11		# %cl == 0: use _aesni_dec4

	movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
	movups (IVP), IV		# initial tweak

	mov 480(KEYP), KLEN
	addq %rcx, KEYP			# enc or dec key schedule

	/* blocks 0-3: xor in tweaks, stash tweaks at (OUTP) */
	movdqa IV, STATE1
	movdqu 0x00(INP), INC
	pxor INC, STATE1
	movdqu IV, 0x00(OUTP)		# park tweak 0 in the dst buffer

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE2
	movdqu 0x10(INP), INC
	pxor INC, STATE2
	movdqu IV, 0x10(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE3
	movdqu 0x20(INP), INC
	pxor INC, STATE3
	movdqu IV, 0x20(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE4
	movdqu 0x30(INP), INC
	pxor INC, STATE4
	movdqu IV, 0x30(OUTP)

	CALL_NOSPEC %r11		# retpoline-safe enc4/dec4 call

	/* blocks 0-3: xor the parked tweaks back in, store results;
	 * interleaved with tweak generation and loads for blocks 4-7 */
	movdqu 0x00(OUTP), INC
	pxor INC, STATE1
	movdqu STATE1, 0x00(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE1
	movdqu 0x40(INP), INC
	pxor INC, STATE1
	movdqu IV, 0x40(OUTP)

	movdqu 0x10(OUTP), INC
	pxor INC, STATE2
	movdqu STATE2, 0x10(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE2
	movdqu 0x50(INP), INC
	pxor INC, STATE2
	movdqu IV, 0x50(OUTP)

	movdqu 0x20(OUTP), INC
	pxor INC, STATE3
	movdqu STATE3, 0x20(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE3
	movdqu 0x60(INP), INC
	pxor INC, STATE3
	movdqu IV, 0x60(OUTP)

	movdqu 0x30(OUTP), INC
	pxor INC, STATE4
	movdqu STATE4, 0x30(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE4
	movdqu 0x70(INP), INC
	pxor INC, STATE4
	movdqu IV, 0x70(OUTP)

	_aesni_gf128mul_x_ble()
	movups IV, (IVP)		# ninth tweak out for the next call

	CALL_NOSPEC %r11		# retpoline-safe enc4/dec4 call

	/* blocks 4-7: xor the parked tweaks back in, store results */
	movdqu 0x40(OUTP), INC
	pxor INC, STATE1
	movdqu STATE1, 0x40(OUTP)

	movdqu 0x50(OUTP), INC
	pxor INC, STATE2
	movdqu STATE2, 0x50(OUTP)

	movdqu 0x60(OUTP), INC
	pxor INC, STATE3
	movdqu STATE3, 0x60(OUTP)

	movdqu 0x70(OUTP), INC
	pxor INC, STATE4
	movdqu STATE4, 0x70(OUTP)

	FRAME_END
	ret
ENDPROC(aesni_xts_crypt8)
2841
2842#endif
2843