1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32#include <linux/linkage.h>
33#include <asm/inst.h>
34#include <asm/frame.h>
35
36
37
38
39
40
41
42
43
/*
 * MOVADQ/MOVUDQ: single switch point for the aligned/unaligned 128-bit
 * SSE move forms used throughout this file.
 */
#define MOVADQ movaps
#define MOVUDQ movups

#ifdef __x86_64__

/*
 * 16-byte constants used by the AES-NI/PCLMUL GCM code below.
 * NOTE(review): these are only ever read, so they look like candidates
 * for .section .rodata rather than .data -- confirm before changing.
 */
.data
.align 16
.Lgf128mul_x_ble_mask:
 .octa 0x00000000000000010000000000000087
/* POLY/TWOONE are used when deriving HashKey<<1 (mod poly) in
 * aesni_gcm_enc/dec below. */
POLY: .octa 0xC2000000000000000000000000000001
TWOONE: .octa 0x00000001000000000000000000000001

/* SHUF_MASK byte-reverses an XMM register via PSHUFB (LE <-> BE),
 * SHIFT_MASK/ALL_F are indexed relative to each other by the partial
 * block handling in aesni_gcm_dec (ALL_F-SHIFT_MASK(%r12)), so keep
 * their relative placement. */
SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
MASK1: .octa 0x0000000000000000ffffffffffffffff
MASK2: .octa 0xffffffffffffffff0000000000000000
SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
ALL_F: .octa 0xffffffffffffffffffffffffffffffff
ZERO: .octa 0x00000000000000000000000000000000
ONE: .octa 0x00000000000000000000000000000001
F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
dec: .octa 0x1
enc: .octa 0x2


.text

/*
 * Layout of the scratch area carved out of %rsp by aesni_gcm_enc/dec.
 * The three pushed callee-saved registers live above STACK_OFFSET
 * (addressed through %r14, the saved %rsp); HashKey..HashKey_4_k are
 * the precomputed hash-subkey powers and their folded (high^low)
 * halves used by the Karatsuba multiplies.
 */
#define STACK_OFFSET 8*3
#define HashKey 16*0      /* HashKey <<1 (mod poly) */
#define HashKey_2 16*1    /* HashKey^2 <<1 (mod poly) */
#define HashKey_3 16*2    /* HashKey^3 <<1 (mod poly) */
#define HashKey_4 16*3    /* HashKey^4 <<1 (mod poly) */
#define HashKey_k 16*4    /* high64 ^ low64 of HashKey, for Karatsuba */


#define HashKey_2_k 16*5  /* likewise for HashKey^2 */


#define HashKey_3_k 16*6  /* likewise for HashKey^3 */


#define HashKey_4_k 16*7  /* likewise for HashKey^4 */


#define VARIABLE_OFFSET 16*8  /* total scratch size */

/* SysV argument homes for aesni_gcm_enc/dec: args 1-6 in registers,
 * args 7-10 on the caller's stack, reached via %r14 (saved %rsp). */
#define arg1 rdi
#define arg2 rsi
#define arg3 rdx
#define arg4 rcx
#define arg5 r8
#define arg6 r9
#define arg7 STACK_OFFSET+8(%r14)
#define arg8 STACK_OFFSET+16(%r14)
#define arg9 STACK_OFFSET+24(%r14)
#define arg10 STACK_OFFSET+32(%r14)
/* key-length field of the AES context at %arg1 (past the round keys) */
#define keysize 2*15*16(%arg1)
#endif
106
107
/*
 * Register aliases for the block-cipher helper routines elsewhere in
 * this file.  STATE1-4 / IN1-4 allow four blocks in flight at once.
 */
#define STATE1 %xmm0
#define STATE2 %xmm4
#define STATE3 %xmm5
#define STATE4 %xmm6
#define STATE STATE1
#define IN1 %xmm1
#define IN2 %xmm7
#define IN3 %xmm8
#define IN4 %xmm9
#define IN IN1
#define KEY %xmm2
#define IV %xmm3

#define BSWAP_MASK %xmm10
#define CTR %xmm11
#define INC %xmm12

/* shares %xmm10 with BSWAP_MASK -- the CTR and GF128 paths are disjoint */
#define GF128MUL_MASK %xmm10

#ifdef __x86_64__
/* 64-bit: pointer/length aliases follow the SysV argument registers */
#define AREG %rax
#define KEYP %rdi
#define OUTP %rsi
#define UKEYP OUTP
#define INP %rdx
#define LEN %rcx
#define IVP %r8
#define KLEN %r9d
#define T1 %r10
#define TKEYP T1
#define T2 %r11
#define TCTR_LOW T2
#else
/* 32-bit: fewer registers, so several aliases share one register */
#define AREG %eax
#define KEYP %edi
#define OUTP AREG
#define UKEYP OUTP
#define INP %edx
#define LEN %esi
#define IVP %ebp
#define KLEN %ebx
#define T1 %ecx
#define TKEYP T1
#endif
152
153
154#ifdef __x86_64__
155
156
157
158
159
160
161
162
163
/*
 * GHASH_MUL: GF(2^128) multiply, GH = GH * HK reduced modulo the GCM
 * polynomial (bit-reflected representation).
 * One level of Karatsuba: three PCLMULQDQ products instead of four,
 * followed by the standard two-phase shift/xor reduction.
 * Input:  GH (accumulated hash), HK (hash subkey power).
 * Output: GH.  Clobbers: TMP1-TMP5.
 */
.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
 movdqa \GH, \TMP1
 pshufd $78, \GH, \TMP2
 pshufd $78, \HK, \TMP3
 pxor \GH, \TMP2 # TMP2 = a1+a0
 pxor \HK, \TMP3 # TMP3 = b1+b0
 PCLMULQDQ 0x11, \HK, \TMP1 # TMP1 = a1*b1
 PCLMULQDQ 0x00, \HK, \GH # GH = a0*b0
 PCLMULQDQ 0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b0+b1)
 pxor \GH, \TMP2
 pxor \TMP1, \TMP2 # TMP2 = a1*b0 + a0*b1 (middle term)
 movdqa \TMP2, \TMP3
 pslldq $8, \TMP3 # shift TMP3 left 2 DWs
 psrldq $8, \TMP2 # shift TMP2 right 2 DWs
 pxor \TMP3, \GH
 pxor \TMP2, \TMP1 # TMP1:GH = 256-bit product of GH*HK

 # first phase of the reduction:
 # fold the three required left-shifted copies into the low half
 movdqa \GH, \TMP2
 movdqa \GH, \TMP3
 movdqa \GH, \TMP4
 # three independent copies so the shifts can issue in parallel
 pslld $31, \TMP2 # packed shift << 31
 pslld $30, \TMP3 # packed shift << 30
 pslld $25, \TMP4 # packed shift << 25
 pxor \TMP3, \TMP2 # xor the shifted versions
 pxor \TMP4, \TMP2
 movdqa \TMP2, \TMP5
 psrldq $4, \TMP5 # shift TMP5 right 1 DW (carried into phase 2)
 pslldq $12, \TMP2 # shift TMP2 left 3 DWs
 pxor \TMP2, \GH

 # second phase of the reduction
 movdqa \GH,\TMP2
 # three independent copies again for the parallel shifts
 movdqa \GH,\TMP3
 movdqa \GH,\TMP4
 psrld $1,\TMP2 # packed shift >> 1
 psrld $2,\TMP3 # packed shift >> 2
 psrld $7,\TMP4 # packed shift >> 7
 pxor \TMP3,\TMP2 # xor the shifted versions
 pxor \TMP4,\TMP2
 pxor \TMP5, \TMP2
 pxor \TMP2, \GH
 pxor \TMP1, \GH # fold high half: result is in GH
.endm
214
215
216
217
218
219
220
221
222
223
224
225
226
/*
 * INITIAL_BLOCKS_DEC: GHASH the AAD, decrypt the first
 * num_initial_blocks (= total blocks mod 4) ciphertext blocks, GHASH
 * them, and -- if at least 4 more blocks remain (%r13 >= 64) --
 * precompute HashKey^2..^4 (and their Karatsuba-folded halves) on the
 * stack while encrypting the next 4 counter blocks.
 *
 * \i = 8 - num_initial_blocks; the initial blocks live in
 * %xmm\i_seq (e.g. num=3 -> i=5, i_seq=678) and %xmm\i accumulates the
 * AAD hash.  \operation ("dec") uniquifies the generated labels.
 * arg1 = key schedule, arg3/arg2 = in/out, arg5 = pre-counter block Y0,
 * arg7/arg8 = AAD pointer/length (assumed a multiple of 4 bytes, <= 16
 * -- TODO confirm against the callers).
 * Clobbers: %r10-%r12, %rax, %xmm14, the TMP/XMM macro registers.
 * %r11 is left as the byte offset consumed, %r13 is only read.
 */
.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
 MOVADQ SHUF_MASK(%rip), %xmm14
 mov arg7, %r10 # %r10 = AAD
 mov arg8, %r12 # %r12 = aadLen
 mov %r12, %r11
 pxor %xmm\i, %xmm\i

# read the AAD 4 bytes at a time into the top of %xmm\i
_get_AAD_loop\num_initial_blocks\operation:
 movd (%r10), \TMP1
 pslldq $12, \TMP1
 psrldq $4, %xmm\i
 pxor \TMP1, %xmm\i
 add $4, %r10
 sub $4, %r12
 jne _get_AAD_loop\num_initial_blocks\operation

 cmp $16, %r11
 je _get_AAD_loop2_done\num_initial_blocks\operation

# AAD shorter than 16 bytes: shift it down into position
 mov $16, %r12
_get_AAD_loop2\num_initial_blocks\operation:
 psrldq $4, %xmm\i
 sub $4, %r12
 cmp %r11, %r12
 jne _get_AAD_loop2\num_initial_blocks\operation

_get_AAD_loop2_done\num_initial_blocks\operation:
 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD

 xor %r11, %r11 # initialise the data pointer offset as zero

 # start AES for num_initial_blocks blocks

 mov %arg5, %rax # %rax = *Y0
 movdqu (%rax), \XMM0 # XMM0 = Y0
 PSHUFB_XMM %xmm14, \XMM0

.if (\i == 5) || (\i == 6) || (\i == 7)
 MOVADQ ONE(%RIP),\TMP1
 MOVADQ (%arg1),\TMP2
.irpc index, \i_seq
 paddd \TMP1, \XMM0 # INCR Y0
 movdqa \XMM0, %xmm\index
 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
 pxor \TMP2, %xmm\index # round-0 whitening
.endr
 lea 0x10(%arg1),%r10
 mov keysize,%eax
 shr $2,%eax # 128->4, 192->6, 256->8
 add $5,%eax # 128->9, 192->11, 256->13 full rounds

aes_loop_initial_dec\num_initial_blocks:
 MOVADQ (%r10),\TMP1
.irpc index, \i_seq
 AESENC \TMP1, %xmm\index
.endr
 add $16,%r10
 sub $1,%eax
 jnz aes_loop_initial_dec\num_initial_blocks

 MOVADQ (%r10), \TMP1
.irpc index, \i_seq
 AESENCLAST \TMP1, %xmm\index # last round
.endr
.irpc index, \i_seq
 movdqu (%arg3 , %r11, 1), \TMP1
 pxor \TMP1, %xmm\index
 movdqu %xmm\index, (%arg2 , %r11, 1)
 # write back plaintext for num_initial_blocks blocks
 add $16, %r11
 # decrypt: keep the CIPHERTEXT (TMP1) for the GHASH below
 movdqa \TMP1, %xmm\index
 PSHUFB_XMM %xmm14, %xmm\index
 # prepare ciphertext for GHASH computation
.endr
.endif
 GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 # apply GHASH on num_initial_blocks blocks

# fold each initial block into the running hash in %xmm8
.if \i == 5
 pxor %xmm5, %xmm6
 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 pxor %xmm6, %xmm7
 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 pxor %xmm7, %xmm8
 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 6
 pxor %xmm6, %xmm7
 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 pxor %xmm7, %xmm8
 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 7
 pxor %xmm7, %xmm8
 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.endif
 cmp $64, %r13
 jl _initial_blocks_done\num_initial_blocks\operation
 # fewer than 4 blocks left: no need for the precomputed values

# prepare the next 4 counter blocks (big-endian) and start their
# AES rounds, interleaving the HashKey^N precomputation
 MOVADQ ONE(%rip), \TMP1
 paddd \TMP1, \XMM0 # INCR Y0
 MOVADQ \XMM0, \XMM1
 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap

 paddd \TMP1, \XMM0 # INCR Y0
 MOVADQ \XMM0, \XMM2
 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap

 paddd \TMP1, \XMM0 # INCR Y0
 MOVADQ \XMM0, \XMM3
 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap

 paddd \TMP1, \XMM0 # INCR Y0
 MOVADQ \XMM0, \XMM4
 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap

 MOVADQ 0(%arg1),\TMP1 # round-0 whitening for all 4 blocks
 pxor \TMP1, \XMM1
 pxor \TMP1, \XMM2
 pxor \TMP1, \XMM3
 pxor \TMP1, \XMM4
 movdqa \TMP3, \TMP5
 pshufd $78, \TMP3, \TMP1
 pxor \TMP3, \TMP1
 movdqa \TMP1, HashKey_k(%rsp)
 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^2<<1 (mod poly)
 movdqa \TMP5, HashKey_2(%rsp)
# HashKey_2 = HashKey^2<<1 (mod poly)
 pshufd $78, \TMP5, \TMP1
 pxor \TMP5, \TMP1
 movdqa \TMP1, HashKey_2_k(%rsp)
.irpc index, 1234
 movaps 0x10*\index(%arg1), \TMP1
 AESENC \TMP1, \XMM1
 AESENC \TMP1, \XMM2
 AESENC \TMP1, \XMM3
 AESENC \TMP1, \XMM4
.endr
 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
 movdqa \TMP5, HashKey_3(%rsp)
 pshufd $78, \TMP5, \TMP1
 pxor \TMP5, \TMP1
 movdqa \TMP1, HashKey_3_k(%rsp)
.irpc index, 56789
 movaps 0x10*\index(%arg1), \TMP1
 AESENC \TMP1, \XMM1
 AESENC \TMP1, \XMM2
 AESENC \TMP1, \XMM3
 AESENC \TMP1, \XMM4
.endr
 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
 movdqa \TMP5, HashKey_4(%rsp)
 pshufd $78, \TMP5, \TMP1
 pxor \TMP5, \TMP1
 movdqa \TMP1, HashKey_4_k(%rsp)
 lea 0xa0(%arg1),%r10
 mov keysize,%eax
 shr $2,%eax # 128->4, 192->6, 256->8
 sub $4,%eax # 128->0, 192->2, 256->4 extra rounds
 jz aes_loop_pre_dec_done\num_initial_blocks

aes_loop_pre_dec\num_initial_blocks:
 MOVADQ (%r10),\TMP2
.irpc index, 1234
 AESENC \TMP2, %xmm\index
.endr
 add $16,%r10
 sub $1,%eax
 jnz aes_loop_pre_dec\num_initial_blocks

aes_loop_pre_dec_done\num_initial_blocks:
 MOVADQ (%r10), \TMP2
 AESENCLAST \TMP2, \XMM1 # last round
 AESENCLAST \TMP2, \XMM2
 AESENCLAST \TMP2, \XMM3
 AESENCLAST \TMP2, \XMM4
# decrypt 4 blocks; keep each ciphertext block (TMP1) in XMMn for GHASH
 movdqu 16*0(%arg3 , %r11 , 1), \TMP1
 pxor \TMP1, \XMM1
 movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
 movdqa \TMP1, \XMM1
 movdqu 16*1(%arg3 , %r11 , 1), \TMP1
 pxor \TMP1, \XMM2
 movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
 movdqa \TMP1, \XMM2
 movdqu 16*2(%arg3 , %r11 , 1), \TMP1
 pxor \TMP1, \XMM3
 movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
 movdqa \TMP1, \XMM3
 movdqu 16*3(%arg3 , %r11 , 1), \TMP1
 pxor \TMP1, \XMM4
 movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
 movdqa \TMP1, \XMM4
 add $64, %r11
 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
 pxor \XMMDst, \XMM1
 # combine GHASHed value with the corresponding ciphertext
 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap

_initial_blocks_done\num_initial_blocks\operation:

.endm
438
439
440
441
442
443
444
445
446
447
448
449
450
451
/*
 * INITIAL_BLOCKS_ENC: encryption twin of INITIAL_BLOCKS_DEC.
 * GHASH the AAD, encrypt the first num_initial_blocks plaintext
 * blocks, GHASH the resulting ciphertext, and -- if at least 4 more
 * blocks remain (%r13 >= 64) -- precompute HashKey^2..^4 on the stack
 * while encrypting the next 4 counter blocks.  Differs from the DEC
 * variant only in keeping the freshly produced ciphertext (rather than
 * the input) for GHASH.
 * Same parameters, label scheme, and clobbers as INITIAL_BLOCKS_DEC;
 * \operation is "enc".
 */
.macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
 MOVADQ SHUF_MASK(%rip), %xmm14
 mov arg7, %r10 # %r10 = AAD
 mov arg8, %r12 # %r12 = aadLen
 mov %r12, %r11
 pxor %xmm\i, %xmm\i
_get_AAD_loop\num_initial_blocks\operation:
 movd (%r10), \TMP1
 pslldq $12, \TMP1
 psrldq $4, %xmm\i
 pxor \TMP1, %xmm\i
 add $4, %r10
 sub $4, %r12
 jne _get_AAD_loop\num_initial_blocks\operation
 cmp $16, %r11
 je _get_AAD_loop2_done\num_initial_blocks\operation
 mov $16, %r12
_get_AAD_loop2\num_initial_blocks\operation:
 psrldq $4, %xmm\i
 sub $4, %r12
 cmp %r11, %r12
 jne _get_AAD_loop2\num_initial_blocks\operation
_get_AAD_loop2_done\num_initial_blocks\operation:
 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD

 xor %r11, %r11 # initialise the data pointer offset as zero

 # start AES for num_initial_blocks blocks

 mov %arg5, %rax # %rax = *Y0
 movdqu (%rax), \XMM0 # XMM0 = Y0
 PSHUFB_XMM %xmm14, \XMM0

.if (\i == 5) || (\i == 6) || (\i == 7)

 MOVADQ ONE(%RIP),\TMP1
 MOVADQ 0(%arg1),\TMP2
.irpc index, \i_seq
 paddd \TMP1, \XMM0 # INCR Y0
 MOVADQ \XMM0, %xmm\index
 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
 pxor \TMP2, %xmm\index # round-0 whitening
.endr
 lea 0x10(%arg1),%r10
 mov keysize,%eax
 shr $2,%eax # 128->4, 192->6, 256->8
 add $5,%eax # 128->9, 192->11, 256->13 full rounds

aes_loop_initial_enc\num_initial_blocks:
 MOVADQ (%r10),\TMP1
.irpc index, \i_seq
 AESENC \TMP1, %xmm\index
.endr
 add $16,%r10
 sub $1,%eax
 jnz aes_loop_initial_enc\num_initial_blocks

 MOVADQ (%r10), \TMP1
.irpc index, \i_seq
 AESENCLAST \TMP1, %xmm\index # last round
.endr
.irpc index, \i_seq
 movdqu (%arg3 , %r11, 1), \TMP1
 pxor \TMP1, %xmm\index
 movdqu %xmm\index, (%arg2 , %r11, 1)
 # write back ciphertext for num_initial_blocks blocks
 add $16, %r11
 PSHUFB_XMM %xmm14, %xmm\index
 # encrypt: the CIPHERTEXT just produced (still in %xmm\index)
 # feeds the GHASH below
.endr
.endif
 GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 # apply GHASH on num_initial_blocks blocks

# fold each initial block into the running hash in %xmm8
.if \i == 5
 pxor %xmm5, %xmm6
 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 pxor %xmm6, %xmm7
 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 pxor %xmm7, %xmm8
 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 6
 pxor %xmm6, %xmm7
 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 pxor %xmm7, %xmm8
 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 7
 pxor %xmm7, %xmm8
 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.endif
 cmp $64, %r13
 jl _initial_blocks_done\num_initial_blocks\operation
 # fewer than 4 blocks left: no need for the precomputed values

# prepare the next 4 counter blocks (big-endian) and start their
# AES rounds, interleaving the HashKey^N precomputation
 MOVADQ ONE(%RIP),\TMP1
 paddd \TMP1, \XMM0 # INCR Y0
 MOVADQ \XMM0, \XMM1
 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap

 paddd \TMP1, \XMM0 # INCR Y0
 MOVADQ \XMM0, \XMM2
 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap

 paddd \TMP1, \XMM0 # INCR Y0
 MOVADQ \XMM0, \XMM3
 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap

 paddd \TMP1, \XMM0 # INCR Y0
 MOVADQ \XMM0, \XMM4
 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap

 MOVADQ 0(%arg1),\TMP1 # round-0 whitening for all 4 blocks
 pxor \TMP1, \XMM1
 pxor \TMP1, \XMM2
 pxor \TMP1, \XMM3
 pxor \TMP1, \XMM4
 movdqa \TMP3, \TMP5
 pshufd $78, \TMP3, \TMP1
 pxor \TMP3, \TMP1
 movdqa \TMP1, HashKey_k(%rsp)
 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^2<<1 (mod poly)
 movdqa \TMP5, HashKey_2(%rsp)
# HashKey_2 = HashKey^2<<1 (mod poly)
 pshufd $78, \TMP5, \TMP1
 pxor \TMP5, \TMP1
 movdqa \TMP1, HashKey_2_k(%rsp)
.irpc index, 1234
 movaps 0x10*\index(%arg1), \TMP1
 AESENC \TMP1, \XMM1
 AESENC \TMP1, \XMM2
 AESENC \TMP1, \XMM3
 AESENC \TMP1, \XMM4
.endr
 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
 movdqa \TMP5, HashKey_3(%rsp)
 pshufd $78, \TMP5, \TMP1
 pxor \TMP5, \TMP1
 movdqa \TMP1, HashKey_3_k(%rsp)
.irpc index, 56789
 movaps 0x10*\index(%arg1), \TMP1
 AESENC \TMP1, \XMM1
 AESENC \TMP1, \XMM2
 AESENC \TMP1, \XMM3
 AESENC \TMP1, \XMM4
.endr
 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
 movdqa \TMP5, HashKey_4(%rsp)
 pshufd $78, \TMP5, \TMP1
 pxor \TMP5, \TMP1
 movdqa \TMP1, HashKey_4_k(%rsp)
 lea 0xa0(%arg1),%r10
 mov keysize,%eax
 shr $2,%eax # 128->4, 192->6, 256->8
 sub $4,%eax # 128->0, 192->2, 256->4 extra rounds
 jz aes_loop_pre_enc_done\num_initial_blocks

aes_loop_pre_enc\num_initial_blocks:
 MOVADQ (%r10),\TMP2
.irpc index, 1234
 AESENC \TMP2, %xmm\index
.endr
 add $16,%r10
 sub $1,%eax
 jnz aes_loop_pre_enc\num_initial_blocks

aes_loop_pre_enc_done\num_initial_blocks:
 MOVADQ (%r10), \TMP2
 AESENCLAST \TMP2, \XMM1 # last round
 AESENCLAST \TMP2, \XMM2
 AESENCLAST \TMP2, \XMM3
 AESENCLAST \TMP2, \XMM4
# encrypt 4 blocks: xor the keystream with the plaintext input
 movdqu 16*0(%arg3 , %r11 , 1), \TMP1
 pxor \TMP1, \XMM1
 movdqu 16*1(%arg3 , %r11 , 1), \TMP1
 pxor \TMP1, \XMM2
 movdqu 16*2(%arg3 , %r11 , 1), \TMP1
 pxor \TMP1, \XMM3
 movdqu 16*3(%arg3 , %r11 , 1), \TMP1
 pxor \TMP1, \XMM4
 movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
 movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
 movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
 movdqu \XMM4, 16*3(%arg2 , %r11 , 1)

 add $64, %r11
 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
 pxor \XMMDst, \XMM1
 # combine GHASHed value with the corresponding ciphertext
 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap

_initial_blocks_done\num_initial_blocks\operation:

.endm
656
657
658
659
660
661
662
/*
 * GHASH_4_ENCRYPT_4_PARALLEL_ENC: encrypt 4 counter blocks while
 * GHASHing the previous 4 ciphertext blocks (passed in XMM1-XMM4,
 * copied to XMM5-XMM8), interleaving PCLMULQDQ work with the AES
 * rounds to hide latency.  The per-block Karatsuba products against
 * HashKey^4..HashKey are accumulated in TMP4 (high), XMM5 (low) and
 * TMP6 (middle), then reduced in two phases as in GHASH_MUL.
 * On exit XMM1-XMM4 hold the byte-swapped new ciphertext, with the
 * reduced hash folded into XMM1 for the next iteration.
 * arg1 = key schedule; arg3/arg2 = in/out at offset %r11.
 * Clobbers: TMP1-TMP6, %xmm15, %r10, %eax, flags.
 * NOTE(review): the aes_loop_par_enc* labels are not \@-unique -- this
 * macro must only be expanded once per final image, as it is here.
 */
.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
# keep the previous ciphertext for GHASH while XMM1-4 are reused
 movdqa \XMM1, \XMM5
 movdqa \XMM2, \XMM6
 movdqa \XMM3, \XMM7
 movdqa \XMM4, \XMM8

 movdqa SHUF_MASK(%rip), %xmm15
# multiply TMP5 * HashKey using Karatsuba
 movdqa \XMM5, \TMP4
 pshufd $78, \XMM5, \TMP6
 pxor \XMM5, \TMP6
 paddd ONE(%rip), \XMM0 # INCR CNT
 movdqa HashKey_4(%rsp), \TMP5
 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
 movdqa \XMM0, \XMM1
 paddd ONE(%rip), \XMM0 # INCR CNT
 movdqa \XMM0, \XMM2
 paddd ONE(%rip), \XMM0 # INCR CNT
 movdqa \XMM0, \XMM3
 paddd ONE(%rip), \XMM0 # INCR CNT
 movdqa \XMM0, \XMM4
 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap

 pxor (%arg1), \XMM1 # round-0 whitening
 pxor (%arg1), \XMM2
 pxor (%arg1), \XMM3
 pxor (%arg1), \XMM4
 movdqa HashKey_4_k(%rsp), \TMP5
 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
 movaps 0x10(%arg1), \TMP1
 AESENC \TMP1, \XMM1 # round 1
 AESENC \TMP1, \XMM2
 AESENC \TMP1, \XMM3
 AESENC \TMP1, \XMM4
 movaps 0x20(%arg1), \TMP1
 AESENC \TMP1, \XMM1 # round 2
 AESENC \TMP1, \XMM2
 AESENC \TMP1, \XMM3
 AESENC \TMP1, \XMM4
 movdqa \XMM6, \TMP1
 pshufd $78, \XMM6, \TMP2
 pxor \XMM6, \TMP2
 movdqa HashKey_3(%rsp), \TMP5
 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
 movaps 0x30(%arg1), \TMP3
 AESENC \TMP3, \XMM1 # round 3
 AESENC \TMP3, \XMM2
 AESENC \TMP3, \XMM3
 AESENC \TMP3, \XMM4
 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
 movaps 0x40(%arg1), \TMP3
 AESENC \TMP3, \XMM1 # round 4
 AESENC \TMP3, \XMM2
 AESENC \TMP3, \XMM3
 AESENC \TMP3, \XMM4
 movdqa HashKey_3_k(%rsp), \TMP5
 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
 movaps 0x50(%arg1), \TMP3
 AESENC \TMP3, \XMM1 # round 5
 AESENC \TMP3, \XMM2
 AESENC \TMP3, \XMM3
 AESENC \TMP3, \XMM4
 pxor \TMP1, \TMP4
# accumulate the partial products
 pxor \XMM6, \XMM5
 pxor \TMP2, \TMP6
 movdqa \XMM7, \TMP1
 pshufd $78, \XMM7, \TMP2
 pxor \XMM7, \TMP2
 movdqa HashKey_2(%rsp ), \TMP5

 # Multiply TMP5 * HashKey using Karatsuba

 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
 movaps 0x60(%arg1), \TMP3
 AESENC \TMP3, \XMM1 # round 6
 AESENC \TMP3, \XMM2
 AESENC \TMP3, \XMM3
 AESENC \TMP3, \XMM4
 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
 movaps 0x70(%arg1), \TMP3
 AESENC \TMP3, \XMM1 # round 7
 AESENC \TMP3, \XMM2
 AESENC \TMP3, \XMM3
 AESENC \TMP3, \XMM4
 movdqa HashKey_2_k(%rsp), \TMP5
 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
 movaps 0x80(%arg1), \TMP3
 AESENC \TMP3, \XMM1 # round 8
 AESENC \TMP3, \XMM2
 AESENC \TMP3, \XMM3
 AESENC \TMP3, \XMM4
 pxor \TMP1, \TMP4
# accumulate the partial products
 pxor \XMM7, \XMM5
 pxor \TMP2, \TMP6

 # Multiply XMM8 * HashKey
 # XMM8 and TMP5 hold the values for the two operands

 movdqa \XMM8, \TMP1
 pshufd $78, \XMM8, \TMP2
 pxor \XMM8, \TMP2
 movdqa HashKey(%rsp), \TMP5
 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
 movaps 0x90(%arg1), \TMP3
 AESENC \TMP3, \XMM1 # round 9
 AESENC \TMP3, \XMM2
 AESENC \TMP3, \XMM3
 AESENC \TMP3, \XMM4
 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
 lea 0xa0(%arg1),%r10
 mov keysize,%eax
 shr $2,%eax # 128->4, 192->6, 256->8
 sub $4,%eax # 128->0, 192->2, 256->4 extra rounds
 jz aes_loop_par_enc_done

aes_loop_par_enc:
 MOVADQ (%r10),\TMP3
.irpc index, 1234
 AESENC \TMP3, %xmm\index
.endr
 add $16,%r10
 sub $1,%eax
 jnz aes_loop_par_enc

aes_loop_par_enc_done:
 MOVADQ (%r10), \TMP3
 AESENCLAST \TMP3, \XMM1 # last round
 AESENCLAST \TMP3, \XMM2
 AESENCLAST \TMP3, \XMM3
 AESENCLAST \TMP3, \XMM4
 movdqa HashKey_k(%rsp), \TMP5
 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
 movdqu (%arg3,%r11,1), \TMP3
 pxor \TMP3, \XMM1 # ciphertext = keystream ^ plaintext
 movdqu 16(%arg3,%r11,1), \TMP3
 pxor \TMP3, \XMM2
 movdqu 32(%arg3,%r11,1), \TMP3
 pxor \TMP3, \XMM3
 movdqu 48(%arg3,%r11,1), \TMP3
 pxor \TMP3, \XMM4
 movdqu \XMM1, (%arg2,%r11,1) # write back the ciphertext
 movdqu \XMM2, 16(%arg2,%r11,1)
 movdqu \XMM3, 32(%arg2,%r11,1)
 movdqu \XMM4, 48(%arg2,%r11,1)
 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap

# combine the three Karatsuba partial products (as in GHASH_MUL)
 pxor \TMP4, \TMP1
 pxor \XMM8, \XMM5
 pxor \TMP6, \TMP2
 pxor \TMP1, \TMP2
 pxor \XMM5, \TMP2
 movdqa \TMP2, \TMP3
 pslldq $8, \TMP3 # shift TMP3 left 2 DWs
 psrldq $8, \TMP2 # shift TMP2 right 2 DWs
 pxor \TMP3, \XMM5
 pxor \TMP2, \TMP1 # TMP1:XMM5 = 256-bit product

 # first phase of reduction

 movdqa \XMM5, \TMP2
 movdqa \XMM5, \TMP3
 movdqa \XMM5, \TMP4
# three copies so the shifts can issue independently
 pslld $31, \TMP2 # packed shift << 31
 pslld $30, \TMP3 # packed shift << 30
 pslld $25, \TMP4 # packed shift << 25
 pxor \TMP3, \TMP2 # xor the shifted versions
 pxor \TMP4, \TMP2
 movdqa \TMP2, \TMP5
 psrldq $4, \TMP5 # shift right 1 DW (carried into phase 2)
 pslldq $12, \TMP2 # shift left 3 DWs
 pxor \TMP2, \XMM5

 # second phase of reduction

 movdqa \XMM5,\TMP2 # three copies for the parallel shifts
 movdqa \XMM5,\TMP3
 movdqa \XMM5,\TMP4
 psrld $1, \TMP2 # packed shift >> 1
 psrld $2, \TMP3 # packed shift >> 2
 psrld $7, \TMP4 # packed shift >> 7
 pxor \TMP3,\TMP2 # xor the shifted versions
 pxor \TMP4,\TMP2
 pxor \TMP5, \TMP2
 pxor \TMP2, \XMM5
 pxor \TMP1, \XMM5 # result is in XMM5
# fold the reduced hash into the first new ciphertext block
 pxor \XMM5, \XMM1
.endm
864
865
866
867
868
869
870
/*
 * GHASH_4_ENCRYPT_4_PARALLEL_DEC: decryption twin of the _ENC variant.
 * Encrypts 4 counter blocks while GHASHing the previous 4 ciphertext
 * blocks (XMM1-XMM4 on entry, via XMM5-XMM8 copies), interleaving
 * PCLMULQDQ with the AES rounds.  Differs from _ENC only in the
 * writeback: the INPUT ciphertext (TMP3) is copied back into XMM1-XMM4
 * so the next iteration hashes ciphertext, not plaintext.
 * Same accumulator scheme (TMP4/XMM5/TMP6), reduction, clobbers and
 * label-uniqueness caveat as GHASH_4_ENCRYPT_4_PARALLEL_ENC.
 */
.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
# keep the previous ciphertext for GHASH while XMM1-4 are reused
 movdqa \XMM1, \XMM5
 movdqa \XMM2, \XMM6
 movdqa \XMM3, \XMM7
 movdqa \XMM4, \XMM8

 movdqa SHUF_MASK(%rip), %xmm15
# multiply TMP5 * HashKey using Karatsuba
 movdqa \XMM5, \TMP4
 pshufd $78, \XMM5, \TMP6
 pxor \XMM5, \TMP6
 paddd ONE(%rip), \XMM0 # INCR CNT
 movdqa HashKey_4(%rsp), \TMP5
 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
 movdqa \XMM0, \XMM1
 paddd ONE(%rip), \XMM0 # INCR CNT
 movdqa \XMM0, \XMM2
 paddd ONE(%rip), \XMM0 # INCR CNT
 movdqa \XMM0, \XMM3
 paddd ONE(%rip), \XMM0 # INCR CNT
 movdqa \XMM0, \XMM4
 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap

 pxor (%arg1), \XMM1 # round-0 whitening
 pxor (%arg1), \XMM2
 pxor (%arg1), \XMM3
 pxor (%arg1), \XMM4
 movdqa HashKey_4_k(%rsp), \TMP5
 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
 movaps 0x10(%arg1), \TMP1
 AESENC \TMP1, \XMM1 # round 1
 AESENC \TMP1, \XMM2
 AESENC \TMP1, \XMM3
 AESENC \TMP1, \XMM4
 movaps 0x20(%arg1), \TMP1
 AESENC \TMP1, \XMM1 # round 2
 AESENC \TMP1, \XMM2
 AESENC \TMP1, \XMM3
 AESENC \TMP1, \XMM4
 movdqa \XMM6, \TMP1
 pshufd $78, \XMM6, \TMP2
 pxor \XMM6, \TMP2
 movdqa HashKey_3(%rsp), \TMP5
 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
 movaps 0x30(%arg1), \TMP3
 AESENC \TMP3, \XMM1 # round 3
 AESENC \TMP3, \XMM2
 AESENC \TMP3, \XMM3
 AESENC \TMP3, \XMM4
 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
 movaps 0x40(%arg1), \TMP3
 AESENC \TMP3, \XMM1 # round 4
 AESENC \TMP3, \XMM2
 AESENC \TMP3, \XMM3
 AESENC \TMP3, \XMM4
 movdqa HashKey_3_k(%rsp), \TMP5
 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
 movaps 0x50(%arg1), \TMP3
 AESENC \TMP3, \XMM1 # round 5
 AESENC \TMP3, \XMM2
 AESENC \TMP3, \XMM3
 AESENC \TMP3, \XMM4
 pxor \TMP1, \TMP4
# accumulate the partial products
 pxor \XMM6, \XMM5
 pxor \TMP2, \TMP6
 movdqa \XMM7, \TMP1
 pshufd $78, \XMM7, \TMP2
 pxor \XMM7, \TMP2
 movdqa HashKey_2(%rsp ), \TMP5

 # Multiply TMP5 * HashKey using Karatsuba

 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
 movaps 0x60(%arg1), \TMP3
 AESENC \TMP3, \XMM1 # round 6
 AESENC \TMP3, \XMM2
 AESENC \TMP3, \XMM3
 AESENC \TMP3, \XMM4
 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
 movaps 0x70(%arg1), \TMP3
 AESENC \TMP3, \XMM1 # round 7
 AESENC \TMP3, \XMM2
 AESENC \TMP3, \XMM3
 AESENC \TMP3, \XMM4
 movdqa HashKey_2_k(%rsp), \TMP5
 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
 movaps 0x80(%arg1), \TMP3
 AESENC \TMP3, \XMM1 # round 8
 AESENC \TMP3, \XMM2
 AESENC \TMP3, \XMM3
 AESENC \TMP3, \XMM4
 pxor \TMP1, \TMP4
# accumulate the partial products
 pxor \XMM7, \XMM5
 pxor \TMP2, \TMP6

 # Multiply XMM8 * HashKey
 # XMM8 and TMP5 hold the values for the two operands

 movdqa \XMM8, \TMP1
 pshufd $78, \XMM8, \TMP2
 pxor \XMM8, \TMP2
 movdqa HashKey(%rsp), \TMP5
 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
 movaps 0x90(%arg1), \TMP3
 AESENC \TMP3, \XMM1 # round 9
 AESENC \TMP3, \XMM2
 AESENC \TMP3, \XMM3
 AESENC \TMP3, \XMM4
 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
 lea 0xa0(%arg1),%r10
 mov keysize,%eax
 shr $2,%eax # 128->4, 192->6, 256->8
 sub $4,%eax # 128->0, 192->2, 256->4 extra rounds
 jz aes_loop_par_dec_done

aes_loop_par_dec:
 MOVADQ (%r10),\TMP3
.irpc index, 1234
 AESENC \TMP3, %xmm\index
.endr
 add $16,%r10
 sub $1,%eax
 jnz aes_loop_par_dec

aes_loop_par_dec_done:
 MOVADQ (%r10), \TMP3
 AESENCLAST \TMP3, \XMM1 # last round
 AESENCLAST \TMP3, \XMM2
 AESENCLAST \TMP3, \XMM3
 AESENCLAST \TMP3, \XMM4
 movdqa HashKey_k(%rsp), \TMP5
 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
 movdqu (%arg3,%r11,1), \TMP3
 pxor \TMP3, \XMM1 # plaintext = keystream ^ ciphertext
 movdqu \XMM1, (%arg2,%r11,1) # write back plaintext
 movdqa \TMP3, \XMM1 # keep ciphertext for next GHASH round
 movdqu 16(%arg3,%r11,1), \TMP3
 pxor \TMP3, \XMM2
 movdqu \XMM2, 16(%arg2,%r11,1)
 movdqa \TMP3, \XMM2
 movdqu 32(%arg3,%r11,1), \TMP3
 pxor \TMP3, \XMM3
 movdqu \XMM3, 32(%arg2,%r11,1)
 movdqa \TMP3, \XMM3
 movdqu 48(%arg3,%r11,1), \TMP3
 pxor \TMP3, \XMM4
 movdqu \XMM4, 48(%arg2,%r11,1)
 movdqa \TMP3, \XMM4
 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap

# combine the three Karatsuba partial products (as in GHASH_MUL)
 pxor \TMP4, \TMP1
 pxor \XMM8, \XMM5
 pxor \TMP6, \TMP2
 pxor \TMP1, \TMP2
 pxor \XMM5, \TMP2
 movdqa \TMP2, \TMP3
 pslldq $8, \TMP3 # shift TMP3 left 2 DWs
 psrldq $8, \TMP2 # shift TMP2 right 2 DWs
 pxor \TMP3, \XMM5
 pxor \TMP2, \TMP1 # TMP1:XMM5 = 256-bit product

 # first phase of reduction

 movdqa \XMM5, \TMP2
 movdqa \XMM5, \TMP3
 movdqa \XMM5, \TMP4
# three copies so the shifts can issue independently
 pslld $31, \TMP2 # packed shift << 31
 pslld $30, \TMP3 # packed shift << 30
 pslld $25, \TMP4 # packed shift << 25
 pxor \TMP3, \TMP2 # xor the shifted versions
 pxor \TMP4, \TMP2
 movdqa \TMP2, \TMP5
 psrldq $4, \TMP5 # shift right 1 DW (carried into phase 2)
 pslldq $12, \TMP2 # shift left 3 DWs
 pxor \TMP2, \XMM5

 # second phase of reduction

 movdqa \XMM5,\TMP2 # three copies for the parallel shifts
 movdqa \XMM5,\TMP3
 movdqa \XMM5,\TMP4
 psrld $1, \TMP2 # packed shift >> 1
 psrld $2, \TMP3 # packed shift >> 2
 psrld $7, \TMP4 # packed shift >> 7
 pxor \TMP3,\TMP2 # xor the shifted versions
 pxor \TMP4,\TMP2
 pxor \TMP5, \TMP2
 pxor \TMP2, \XMM5
 pxor \TMP1, \XMM5 # result is in XMM5
# fold the reduced hash into the first kept ciphertext block
 pxor \XMM5, \XMM1
.endm
1076
1077
/*
 * GHASH_LAST_4: GHASH the last 4 ciphertext blocks (XMM1-XMM4, already
 * byte-swapped) against HashKey^4..HashKey from the stack, using
 * Karatsuba per block, then combine and reduce.
 * Output: XMMDst = accumulated digest.  Clobbers: TMP1-TMP7, XMM1-XMM4.
 */
.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst

 # Multiply XMM1 * HashKey^4

 movdqa \XMM1, \TMP6
 pshufd $78, \XMM1, \TMP2
 pxor \XMM1, \TMP2
 movdqa HashKey_4(%rsp), \TMP5
 PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1
 PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0
 movdqa HashKey_4_k(%rsp), \TMP4
 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
 movdqa \XMM1, \XMMDst
 movdqa \TMP2, \XMM1 # XMM1 holds the middle-term accumulator

 # Multiply XMM2 * HashKey^3

 movdqa \XMM2, \TMP1
 pshufd $78, \XMM2, \TMP2
 pxor \XMM2, \TMP2
 movdqa HashKey_3(%rsp), \TMP5
 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
 PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0
 movdqa HashKey_3_k(%rsp), \TMP4
 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
 pxor \TMP1, \TMP6
 pxor \XMM2, \XMMDst
 pxor \TMP2, \XMM1
# accumulate: TMP6 = high halves, XMMDst = low halves, XMM1 = middles

 # Multiply XMM3 * HashKey^2

 movdqa \XMM3, \TMP1
 pshufd $78, \XMM3, \TMP2
 pxor \XMM3, \TMP2
 movdqa HashKey_2(%rsp), \TMP5
 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
 PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0
 movdqa HashKey_2_k(%rsp), \TMP4
 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
 pxor \TMP1, \TMP6
 pxor \XMM3, \XMMDst
 pxor \TMP2, \XMM1 # accumulate the results in TMP6:XMMDst:XMM1

 # Multiply XMM4 * HashKey

 movdqa \XMM4, \TMP1
 pshufd $78, \XMM4, \TMP2
 pxor \XMM4, \TMP2
 movdqa HashKey(%rsp), \TMP5
 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
 PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0
 movdqa HashKey_k(%rsp), \TMP4
 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
 pxor \TMP1, \TMP6
 pxor \XMM4, \XMMDst
 pxor \XMM1, \TMP2
 pxor \TMP6, \TMP2
 pxor \XMMDst, \TMP2 # TMP2 = middle term of the combined product
# split the middle term across the high/low halves
 movdqa \TMP2, \TMP4
 pslldq $8, \TMP4 # shift TMP4 left 2 DWs
 psrldq $8, \TMP2 # shift TMP2 right 2 DWs
 pxor \TMP4, \XMMDst
 pxor \TMP2, \TMP6 # TMP6:XMMDst = 256-bit product

 # first phase of the reduction
 movdqa \XMMDst, \TMP2
 movdqa \XMMDst, \TMP3
 movdqa \XMMDst, \TMP4
# three copies so the shifts can issue independently
 pslld $31, \TMP2 # packed shift << 31
 pslld $30, \TMP3 # packed shift << 30
 pslld $25, \TMP4 # packed shift << 25
 pxor \TMP3, \TMP2 # xor the shifted versions
 pxor \TMP4, \TMP2
 movdqa \TMP2, \TMP7
 psrldq $4, \TMP7 # shift right 1 DW (carried into phase 2)
 pslldq $12, \TMP2 # shift left 3 DWs
 pxor \TMP2, \XMMDst

 # second phase of the reduction
 movdqa \XMMDst, \TMP2
 # three copies again for the parallel shifts
 movdqa \XMMDst, \TMP3
 movdqa \XMMDst, \TMP4
 psrld $1, \TMP2 # packed shift >> 1
 psrld $2, \TMP3 # packed shift >> 2
 psrld $7, \TMP4 # packed shift >> 7
 pxor \TMP3, \TMP2 # xor the shifted versions
 pxor \TMP4, \TMP2
 pxor \TMP7, \TMP2
 pxor \TMP2, \XMMDst
 pxor \TMP6, \XMMDst # result is in XMMDst
.endm
1173
1174
1175
1176
1177
1178
/*
 * ENCRYPT_SINGLE_BLOCK: AES-encrypt the single block in \XMM0 with the
 * key schedule at %arg1.  The number of full rounds is derived from the
 * context's key-length field: keysize/4 + 5 (16->9, 24->11, 32->13),
 * followed by one AESENCLAST.
 * Clobbers: \TMP1, %eax, %r10, flags.
 */
.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1

 pxor (%arg1), \XMM0 # round-0 whitening
 mov keysize,%eax
 shr $2,%eax # 128->4, 192->6, 256->8
 add $5,%eax # 128->9, 192->11, 256->13
 lea 16(%arg1), %r10 # %r10 -> first round key

_esb_loop_\@:
 MOVADQ (%r10),\TMP1
 AESENC \TMP1,\XMM0
 add $16,%r10
 sub $1,%eax
 jnz _esb_loop_\@

 MOVADQ (%r10),\TMP1
 AESENCLAST \TMP1,\XMM0 # last round
.endm
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
/*****************************************************************************
* void aesni_gcm_dec(void *aes_ctx,    // arg1: AES key schedule (+ keysize
*                                      //       field at 2*15*16)
*        u8 *out,            // arg2: plaintext output
*        const u8 *in,       // arg3: ciphertext input
*        u64 ciphertext_len, // arg4: length of data in bytes
*        u8 *iv,             // arg5: pre-counter block Y0
*        u8 *hash_subkey,    // arg6: H, the GHASH subkey
*        const u8 *aad,      // arg7 (stack): additional auth data
*        u64 aad_len,        // arg8 (stack): AAD length in bytes
*        u8 *auth_tag,       // arg9 (stack): computed tag output
*        u64 auth_tag_len)   // arg10 (stack): tag length; 16, 12, else 8
*
* Clobbers all xmm registers and %r10-%r13; saves/restores %r12-%r14.
* %r14 keeps the original %rsp so the stack args stay reachable after
* the frame is carved out and 64-byte aligned.
*****************************************************************************/
ENTRY(aesni_gcm_dec)
 push %r12
 push %r13
 push %r14
 mov %rsp, %r14

# carve out VARIABLE_OFFSET bytes of scratch (HashKey powers) and align
# %rsp to 64 so the movdqa stores to HashKey_*(%rsp) are aligned
 sub $VARIABLE_OFFSET, %rsp
 and $~63, %rsp # align rsp to 64 bytes
 mov %arg6, %r12
 movdqu (%r12), %xmm13 # %xmm13 = HashKey
 movdqa SHUF_MASK(%rip), %xmm2
 PSHUFB_XMM %xmm2, %xmm13

 # precompute HashKey<<1 mod poly from the HashKey (required for GHASH)

 movdqa %xmm13, %xmm2
 psllq $1, %xmm13
 psrlq $63, %xmm2
 movdqa %xmm2, %xmm1
 pslldq $8, %xmm2
 psrldq $8, %xmm1
 por %xmm2, %xmm13 # carry the shifted-out bit across qwords

 # Reduction: conditionally xor in POLY if the top bit was set

 pshufd $0x24, %xmm1, %xmm2
 pcmpeqd TWOONE(%rip), %xmm2
 pand POLY(%rip), %xmm2
 pxor %xmm2, %xmm13 # %xmm13 = HashKey<<1 (mod poly)

 # Decrypt first few blocks

 movdqa %xmm13, HashKey(%rsp)
 mov %arg4, %r13 # save the number of bytes of ciphertext
 and $-16, %r13 # %r13 = %r13 - (%r13 mod 16)
 mov %r13, %r12
 and $(3<<4), %r12 # (full blocks mod 4) * 16
 jz _initial_num_blocks_is_0_decrypt
 cmp $(2<<4), %r12
 jb _initial_num_blocks_is_1_decrypt
 je _initial_num_blocks_is_2_decrypt
_initial_num_blocks_is_3_decrypt:
 INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
 sub $48, %r13
 jmp _initial_blocks_decrypted
_initial_num_blocks_is_2_decrypt:
 INITIAL_BLOCKS_DEC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
 sub $32, %r13
 jmp _initial_blocks_decrypted
_initial_num_blocks_is_1_decrypt:
 INITIAL_BLOCKS_DEC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
 sub $16, %r13
 jmp _initial_blocks_decrypted
_initial_num_blocks_is_0_decrypt:
 INITIAL_BLOCKS_DEC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
_initial_blocks_decrypted:
 cmp $0, %r13
 je _zero_cipher_left_decrypt
 sub $64, %r13
 je _four_cipher_left_decrypt
# main loop: 4 blocks at a time, GHASH interleaved with AES
_decrypt_by_4:
 GHASH_4_ENCRYPT_4_PARALLEL_DEC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
 add $64, %r11
 sub $64, %r13
 jne _decrypt_by_4
_four_cipher_left_decrypt:
 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_decrypt:
 mov %arg4, %r13
 and $15, %r13 # %r13 = arg4 (mod 16)
 je _multiple_of_16_bytes_decrypt

 # Handle the last <16 byte block separately

 paddd ONE(%rip), %xmm0 # increment CNT to get Yn
 movdqa SHUF_MASK(%rip), %xmm10
 PSHUFB_XMM %xmm10, %xmm0

 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn)
 sub $16, %r11
 add %r13, %r11
 movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte block
 lea SHIFT_MASK+16(%rip), %r12
 sub %r13, %r12
# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
# (%r13 is the number of bytes in ciphertext mod 16)
 movdqu (%r12), %xmm2 # get the appropriate shuffle mask
 PSHUFB_XMM %xmm2, %xmm1 # right shift 16-%r13 bytes

 movdqa %xmm1, %xmm2
 pxor %xmm1, %xmm0 # Ciphertext XOR E(K, Yn)
 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
 # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
 pand %xmm1, %xmm0 # mask out top 16-%r13 bytes of %xmm0
 pand %xmm1, %xmm2
 movdqa SHUF_MASK(%rip), %xmm10
 PSHUFB_XMM %xmm10 ,%xmm2

 pxor %xmm2, %xmm8 # fold masked ciphertext into the hash
 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6

 sub %r13, %r11
 add $16, %r11

 # output %r13 bytes
 MOVQ_R64_XMM %xmm0, %rax
 cmp $8, %r13
 jle _less_than_8_bytes_left_decrypt
 mov %rax, (%arg2 , %r11, 1)
 add $8, %r11
 psrldq $8, %xmm0
 MOVQ_R64_XMM %xmm0, %rax
 sub $8, %r13
_less_than_8_bytes_left_decrypt:
 mov %al, (%arg2, %r11, 1) # write out one byte at a time
 add $1, %r11
 shr $8, %rax
 sub $1, %r13
 jne _less_than_8_bytes_left_decrypt
_multiple_of_16_bytes_decrypt:
 mov arg8, %r12 # %r13 = aadLen (number of bytes)
 shl $3, %r12 # convert into number of bits
 movd %r12d, %xmm15 # len(A) in %xmm15
 shl $3, %arg4 # len(C) in bits (*128)
 MOVQ_R64_XMM %arg4, %xmm1
 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
 pxor %xmm15, %xmm8
 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
 # final GHASH computation
 movdqa SHUF_MASK(%rip), %xmm10
 PSHUFB_XMM %xmm10, %xmm8

 mov %arg5, %rax # %rax = *Y0
 movdqu (%rax), %xmm0 # %xmm0 = Y0
 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
 pxor %xmm8, %xmm0
# output the tag: full 16, first 12, or first 8 bytes per auth_tag_len
_return_T_decrypt:
 mov arg9, %r10 # %r10 = authTag
 mov arg10, %r11 # %r11 = auth_tag_len
 cmp $16, %r11
 je _T_16_decrypt
 cmp $12, %r11
 je _T_12_decrypt
_T_8_decrypt:
 MOVQ_R64_XMM %xmm0, %rax
 mov %rax, (%r10)
 jmp _return_T_done_decrypt
_T_12_decrypt:
 MOVQ_R64_XMM %xmm0, %rax
 mov %rax, (%r10)
 psrldq $8, %xmm0
 movd %xmm0, %eax
 mov %eax, 8(%r10)
 jmp _return_T_done_decrypt
_T_16_decrypt:
 movdqu %xmm0, (%r10)
_return_T_done_decrypt:
 mov %r14, %rsp # restore the original stack pointer
 pop %r14
 pop %r13
 pop %r12
 ret
ENDPROC(aesni_gcm_dec)
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
/*
 * void aesni_gcm_enc(void *aes_ctx,      // arg1 (%rdi): expanded AES key schedule
 *                    u8 *out,            // arg2 (%rsi): ciphertext output
 *                    const u8 *in,       // arg3 (%rdx): plaintext input
 *                    u64 plaintext_len,  // arg4 (%rcx): length in bytes
 *                    u8 *iv,             // arg5 (%r8):  pre-counter block (J0)
 *                    u8 *hash_subkey,    // arg6 (%r9):  H = E(K, 0^128)
 *                    const u8 *aad,      // arg7 (stack, via %r14)
 *                    u64 aad_len,        // arg8 (stack)
 *                    u8 *auth_tag,       // arg9 (stack): tag output
 *                    u64 auth_tag_len)   // arg10 (stack): 16, 12 or 8
 *
 * AES-GCM encryption.  All %xmm registers are clobbered; %r12-%r14 are
 * saved/restored.  %r14 keeps the original %rsp so arg7..arg10 stay
 * reachable after the stack is realigned below.
 */
ENTRY(aesni_gcm_enc)
 push %r12
 push %r13
 push %r14
 mov %rsp, %r14
#
# states of %xmm6..%xmm15 are not saved; caller (kernel_fpu_begin section)
# must tolerate all %xmm registers being clobbered
#
 sub $VARIABLE_OFFSET, %rsp
 and $~63, %rsp                     # align local HashKey area to 64 bytes
 mov %arg6, %r12
 movdqu (%r12), %xmm13              # %xmm13 = HashKey H
 movdqa SHUF_MASK(%rip), %xmm2
 PSHUFB_XMM %xmm2, %xmm13           # byte-reflect H for GHASH

# precompute HashKey<<1 (mod poly) from the HashKey (required for GHASH):
# shift whole 128-bit value left by one bit across the two qwords
 movdqa %xmm13, %xmm2
 psllq $1, %xmm13
 psrlq $63, %xmm2
 movdqa %xmm2, %xmm1
 pslldq $8, %xmm2
 psrldq $8, %xmm1
 por %xmm2, %xmm13                  # carry low qword's top bit into high qword

# reduce: if the bit shifted out was set, xor in the GCM polynomial
 pshufd $0x24, %xmm1, %xmm2
 pcmpeqd TWOONE(%rip), %xmm2
 pand POLY(%rip), %xmm2
 pxor %xmm2, %xmm13                 # %xmm13 = HashKey<<1 (mod poly)
 movdqa %xmm13, HashKey(%rsp)
 mov %arg4, %r13                    # %r13 = plaintext_len
 and $-16, %r13                     # %r13 = byte count of whole 16B blocks

# peel off 0..3 initial blocks so the main loop can go 4 at a time
 mov %r13, %r12
 and $(3<<4), %r12                  # %r12 = (num_blocks mod 4) * 16
 jz _initial_num_blocks_is_0_encrypt
 cmp $(2<<4), %r12
 jb _initial_num_blocks_is_1_encrypt
 je _initial_num_blocks_is_2_encrypt
_initial_num_blocks_is_3_encrypt:
 INITIAL_BLOCKS_ENC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
 sub $48, %r13
 jmp _initial_blocks_encrypted
_initial_num_blocks_is_2_encrypt:
 INITIAL_BLOCKS_ENC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
 sub $32, %r13
 jmp _initial_blocks_encrypted
_initial_num_blocks_is_1_encrypt:
 INITIAL_BLOCKS_ENC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
 sub $16, %r13
 jmp _initial_blocks_encrypted
_initial_num_blocks_is_0_encrypt:
 INITIAL_BLOCKS_ENC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
_initial_blocks_encrypted:

# main loop: encrypt 4 blocks and GHASH 4 blocks in parallel per iteration
 cmp $0, %r13
 je _zero_cipher_left_encrypt       # no full blocks remain
 sub $64, %r13
 je _four_cipher_left_encrypt       # exactly 4 blocks: finish GHASH only
_encrypt_by_4_encrypt:
 GHASH_4_ENCRYPT_4_PARALLEL_ENC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
 add $64, %r11                      # %r11 = data offset processed so far
 sub $64, %r13
 jne _encrypt_by_4_encrypt
_four_cipher_left_encrypt:
 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_encrypt:
 mov %arg4, %r13
 and $15, %r13                      # %r13 = remaining bytes (< 16)
 je _multiple_of_16_bytes_encrypt

# handle the final partial block: encrypt one more counter block Yn
 paddd ONE(%rip), %xmm0             # increment counter (CNT)
 movdqa SHUF_MASK(%rip), %xmm10
 PSHUFB_XMM %xmm10, %xmm0           # back to big-endian wire order

 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1  # %xmm0 = E(K, Yn)
 sub $16, %r11
 add %r13, %r11
 movdqu (%arg3,%r11,1), %xmm1       # (re)read last <16 byte block, shifted
 lea SHIFT_MASK+16(%rip), %r12
 sub %r13, %r12                     # (16-r13) bytes into SHIFT_MASK: rotate
                                    # the plaintext bytes into position
 movdqu (%r12), %xmm2
 PSHUFB_XMM %xmm2, %xmm1
 pxor %xmm1, %xmm0                  # ciphertext bytes (plus garbage tail)
 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
# mask out the top 16-r13 garbage bytes of the ciphertext
 pand %xmm1, %xmm0
 movdqa SHUF_MASK(%rip), %xmm10
 PSHUFB_XMM %xmm10,%xmm0

# fold the partial ciphertext block into the GHASH state
 pxor %xmm0, %xmm8
 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6

 sub %r13, %r11
 add $16, %r11

 movdqa SHUF_MASK(%rip), %xmm10
 PSHUFB_XMM %xmm10, %xmm0           # shuffle xmm0 back for output

# write out the final r13 ciphertext bytes, 8 then 1 at a time
 MOVQ_R64_XMM %xmm0, %rax
 cmp $8, %r13
 jle _less_than_8_bytes_left_encrypt
 mov %rax, (%arg2 , %r11, 1)
 add $8, %r11
 psrldq $8, %xmm0
 MOVQ_R64_XMM %xmm0, %rax
 sub $8, %r13
_less_than_8_bytes_left_encrypt:
 mov %al, (%arg2, %r11, 1)
 add $1, %r11
 shr $8, %rax
 sub $1, %r13
 jne _less_than_8_bytes_left_encrypt
_multiple_of_16_bytes_encrypt:
# final GHASH: fold len(A)||len(C) (both in bits) into the hash state
 mov arg8, %r12                     # %r12 = aad_len
 shl $3, %r12                       # bytes -> bits
 movd %r12d, %xmm15
 shl $3, %arg4                      # plaintext_len in bits
 MOVQ_R64_XMM %arg4, %xmm1
 pslldq $8, %xmm15                  # %xmm15 = len(A) || 0
 pxor %xmm1, %xmm15                 # %xmm15 = len(A) || len(C)
 pxor %xmm15, %xmm8
 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6

 movdqa SHUF_MASK(%rip), %xmm10
 PSHUFB_XMM %xmm10, %xmm8           # final GHASH back to wire order

# T = E(K, Y0) xor GHASH
 mov %arg5, %rax                    # %rax = iv (J0)
 movdqu (%rax), %xmm0
 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15 # %xmm0 = E(K, Y0)
 pxor %xmm8, %xmm0
_return_T_encrypt:
 mov arg9, %r10                     # %r10 = auth_tag
 mov arg10, %r11                    # %r11 = auth_tag_len
 cmp $16, %r11
 je _T_16_encrypt
 cmp $12, %r11
 je _T_12_encrypt
_T_8_encrypt:
 MOVQ_R64_XMM %xmm0, %rax
 mov %rax, (%r10)
 jmp _return_T_done_encrypt
_T_12_encrypt:
 MOVQ_R64_XMM %xmm0, %rax
 mov %rax, (%r10)
 psrldq $8, %xmm0
 movd %xmm0, %eax
 mov %eax, 8(%r10)
 jmp _return_T_done_encrypt
_T_16_encrypt:
 movdqu %xmm0, (%r10)
_return_T_done_encrypt:
 mov %r14, %rsp                     # undo the alignment; restore saved regs
 pop %r14
 pop %r13
 pop %r12
 ret
ENDPROC(aesni_gcm_enc)
1722
1723#endif
1724
1725
/*
 * Key-expansion step shared by AES-128 and the "a" (even) half of AES-256.
 * internal ABI:
 *  input:  %xmm0 = previous round key
 *          %xmm1 = AESKEYGENASSIST output (rcon word in dword 3)
 *          %xmm4 = scratch (zeroed once in aesni_set_key)
 *          TKEYP -> slot for the new round key
 *  output: new round key in %xmm0, stored at (TKEYP); TKEYP += 16
 */
.align 4
_key_expansion_128:
_key_expansion_256a:
 pshufd $0b11111111, %xmm1, %xmm1   # broadcast SubWord/Rcon result
 shufps $0b00010000, %xmm0, %xmm4   # build running prefix-xor of the
 pxor %xmm4, %xmm0                  # previous key's dwords in %xmm0
 shufps $0b10001100, %xmm0, %xmm4
 pxor %xmm4, %xmm0
 pxor %xmm1, %xmm0                  # xor in the key-gen-assist word
 movaps %xmm0, (TKEYP)
 add $0x10, TKEYP
 ret
ENDPROC(_key_expansion_128)
ENDPROC(_key_expansion_256a)
1740
/*
 * AES-192 key-expansion step "a": produces 1.5 round keys, so it writes
 * 32 bytes (two 16-byte slots) and carries half a key in %xmm2.
 * internal ABI:
 *  input:  %xmm0 = low 16 bytes of previous key material
 *          %xmm1 = AESKEYGENASSIST output
 *          %xmm2 = high 8 bytes of previous key material (in low qword)
 *          %xmm4 = scratch (zeroed once in aesni_set_key)
 *  output: (TKEYP) and 0x10(TKEYP) written; TKEYP += 32;
 *          %xmm0/%xmm2 updated for the next step
 */
.align 4
_key_expansion_192a:
 pshufd $0b01010101, %xmm1, %xmm1   # broadcast the assist word for 192-bit
 shufps $0b00010000, %xmm0, %xmm4   # prefix-xor of %xmm0's dwords
 pxor %xmm4, %xmm0
 shufps $0b10001100, %xmm0, %xmm4
 pxor %xmm4, %xmm0
 pxor %xmm1, %xmm0

# extend into the extra 64 bits kept in %xmm2
 movaps %xmm2, %xmm5
 movaps %xmm2, %xmm6
 pslldq $4, %xmm5
 pshufd $0b11111111, %xmm0, %xmm3
 pxor %xmm3, %xmm2
 pxor %xmm5, %xmm2

# repack %xmm6(old tail)+%xmm0 / %xmm0+%xmm2 into two 16-byte slots
 movaps %xmm0, %xmm1
 shufps $0b01000100, %xmm0, %xmm6
 movaps %xmm6, (TKEYP)
 shufps $0b01001110, %xmm2, %xmm1
 movaps %xmm1, 0x10(TKEYP)
 add $0x20, TKEYP
 ret
ENDPROC(_key_expansion_192a)
1765
/*
 * AES-192 key-expansion step "b": like step "a" but the schedule is
 * already slot-aligned, so only one 16-byte slot is written.
 * internal ABI: same registers as _key_expansion_192a; TKEYP += 16.
 */
.align 4
_key_expansion_192b:
 pshufd $0b01010101, %xmm1, %xmm1   # broadcast the assist word for 192-bit
 shufps $0b00010000, %xmm0, %xmm4   # prefix-xor of %xmm0's dwords
 pxor %xmm4, %xmm0
 shufps $0b10001100, %xmm0, %xmm4
 pxor %xmm4, %xmm0
 pxor %xmm1, %xmm0

# extend into the extra 64 bits kept in %xmm2
 movaps %xmm2, %xmm5
 pslldq $4, %xmm5
 pshufd $0b11111111, %xmm0, %xmm3
 pxor %xmm3, %xmm2
 pxor %xmm5, %xmm2

 movaps %xmm0, (TKEYP)
 add $0x10, TKEYP
 ret
ENDPROC(_key_expansion_192b)
1785
/*
 * AES-256 key-expansion step "b" (odd rounds): expands the second 128-bit
 * half of the key, kept in %xmm2.
 * internal ABI:
 *  input:  %xmm1 = AESKEYGENASSIST output (SubWord in dword 2)
 *          %xmm2 = previous odd-half round key
 *          %xmm4 = scratch (zeroed once in aesni_set_key)
 *  output: new round key in %xmm2, stored at (TKEYP); TKEYP += 16
 */
.align 4
_key_expansion_256b:
 pshufd $0b10101010, %xmm1, %xmm1   # broadcast SubWord (no Rcon for odd half)
 shufps $0b00010000, %xmm2, %xmm4   # prefix-xor of %xmm2's dwords
 pxor %xmm4, %xmm2
 shufps $0b10001100, %xmm2, %xmm4
 pxor %xmm4, %xmm2
 pxor %xmm1, %xmm2
 movaps %xmm2, (TKEYP)
 add $0x10, TKEYP
 ret
ENDPROC(_key_expansion_256b)
1798
1799
1800
1801
1802
/*
 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
 *                   unsigned int key_len)
 *
 * Builds both the encryption key schedule (forward, from ctx start) and
 * the decryption schedule (reversed, AESIMC-transformed, at ctx+240).
 * key_len is stored at 480(ctx).  Always returns 0 (in AREG).
 */
ENTRY(aesni_set_key)
 FRAME_BEGIN
#ifndef __x86_64__
 pushl KEYP
 movl (FRAME_OFFSET+8)(%esp), KEYP      # ctx
 movl (FRAME_OFFSET+12)(%esp), UKEYP    # in_key
 movl (FRAME_OFFSET+16)(%esp), %edx     # key_len
#endif
 movups (UKEYP), %xmm0                  # user key (first 16 bytes)
 movaps %xmm0, (KEYP)                   # round key 0 = raw key
 lea 0x10(KEYP), TKEYP                  # TKEYP -> next schedule slot
 movl %edx, 480(KEYP)                   # remember key length for enc/dec
 pxor %xmm4, %xmm4                      # scratch used by expansion helpers
 cmp $24, %dl
 jb .Lenc_key128
 je .Lenc_key192
# AES-256: second 16 key bytes, then alternate 256a/256b steps
 movups 0x10(UKEYP), %xmm2
 movaps %xmm2, (TKEYP)
 add $0x10, TKEYP
 AESKEYGENASSIST 0x1 %xmm2 %xmm1        # round 1
 call _key_expansion_256a
 AESKEYGENASSIST 0x1 %xmm0 %xmm1
 call _key_expansion_256b
 AESKEYGENASSIST 0x2 %xmm2 %xmm1        # round 2
 call _key_expansion_256a
 AESKEYGENASSIST 0x2 %xmm0 %xmm1
 call _key_expansion_256b
 AESKEYGENASSIST 0x4 %xmm2 %xmm1        # round 3
 call _key_expansion_256a
 AESKEYGENASSIST 0x4 %xmm0 %xmm1
 call _key_expansion_256b
 AESKEYGENASSIST 0x8 %xmm2 %xmm1        # round 4
 call _key_expansion_256a
 AESKEYGENASSIST 0x8 %xmm0 %xmm1
 call _key_expansion_256b
 AESKEYGENASSIST 0x10 %xmm2 %xmm1       # round 5
 call _key_expansion_256a
 AESKEYGENASSIST 0x10 %xmm0 %xmm1
 call _key_expansion_256b
 AESKEYGENASSIST 0x20 %xmm2 %xmm1       # round 6
 call _key_expansion_256a
 AESKEYGENASSIST 0x20 %xmm0 %xmm1
 call _key_expansion_256b
 AESKEYGENASSIST 0x40 %xmm2 %xmm1       # round 7
 call _key_expansion_256a
 jmp .Ldec_key
.Lenc_key192:
 movq 0x10(UKEYP), %xmm2                # extra 8 key bytes for AES-192
 AESKEYGENASSIST 0x1 %xmm2 %xmm1        # round 1
 call _key_expansion_192a
 AESKEYGENASSIST 0x2 %xmm2 %xmm1        # round 2
 call _key_expansion_192b
 AESKEYGENASSIST 0x4 %xmm2 %xmm1        # round 3
 call _key_expansion_192a
 AESKEYGENASSIST 0x8 %xmm2 %xmm1        # round 4
 call _key_expansion_192b
 AESKEYGENASSIST 0x10 %xmm2 %xmm1       # round 5
 call _key_expansion_192a
 AESKEYGENASSIST 0x20 %xmm2 %xmm1       # round 6
 call _key_expansion_192b
 AESKEYGENASSIST 0x40 %xmm2 %xmm1       # round 7
 call _key_expansion_192a
 AESKEYGENASSIST 0x80 %xmm2 %xmm1       # round 8
 call _key_expansion_192b
 jmp .Ldec_key
.Lenc_key128:
 AESKEYGENASSIST 0x1 %xmm0 %xmm1        # round 1
 call _key_expansion_128
 AESKEYGENASSIST 0x2 %xmm0 %xmm1        # round 2
 call _key_expansion_128
 AESKEYGENASSIST 0x4 %xmm0 %xmm1        # round 3
 call _key_expansion_128
 AESKEYGENASSIST 0x8 %xmm0 %xmm1        # round 4
 call _key_expansion_128
 AESKEYGENASSIST 0x10 %xmm0 %xmm1       # round 5
 call _key_expansion_128
 AESKEYGENASSIST 0x20 %xmm0 %xmm1       # round 6
 call _key_expansion_128
 AESKEYGENASSIST 0x40 %xmm0 %xmm1       # round 7
 call _key_expansion_128
 AESKEYGENASSIST 0x80 %xmm0 %xmm1       # round 8
 call _key_expansion_128
 AESKEYGENASSIST 0x1b %xmm0 %xmm1       # round 9
 call _key_expansion_128
 AESKEYGENASSIST 0x36 %xmm0 %xmm1       # round 10
 call _key_expansion_128
.Ldec_key:
# build the decryption schedule at KEYP+240: reverse the round-key order
# and apply AESIMC (InvMixColumns) to all but the first and last keys
 sub $0x10, TKEYP
 movaps (KEYP), %xmm0
 movaps (TKEYP), %xmm1
 movaps %xmm0, 240(TKEYP)               # first enc key = last dec key (no IMC)
 movaps %xmm1, 240(KEYP)                # last enc key = first dec key (no IMC)
 add $0x10, KEYP
 lea 240-16(TKEYP), UKEYP
.align 4
.Ldec_key_loop:
 movaps (KEYP), %xmm0
 AESIMC %xmm0 %xmm1
 movaps %xmm1, (UKEYP)
 add $0x10, KEYP
 sub $0x10, UKEYP
 cmp TKEYP, KEYP
 jb .Ldec_key_loop
 xor AREG, AREG                         # return 0
#ifndef __x86_64__
 popl KEYP
#endif
 FRAME_END
 ret
ENDPROC(aesni_set_key)
1913
1914
1915
1916
/*
 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 *
 * Encrypt a single 16-byte block.  Reads the key length from 480(ctx)
 * and delegates to _aesni_enc1.
 */
ENTRY(aesni_enc)
 FRAME_BEGIN
#ifndef __x86_64__
 pushl KEYP
 pushl KLEN
 movl (FRAME_OFFSET+12)(%esp), KEYP     # ctx
 movl (FRAME_OFFSET+16)(%esp), OUTP     # dst
 movl (FRAME_OFFSET+20)(%esp), INP      # src
#endif
 movl 480(KEYP), KLEN                   # key length selects round count
 movups (INP), STATE                    # unaligned load of plaintext
 call _aesni_enc1
 movups STATE, (OUTP)                   # unaligned store of ciphertext
#ifndef __x86_64__
 popl KLEN
 popl KEYP
#endif
 FRAME_END
 ret
ENDPROC(aesni_enc)
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
/*
 * _aesni_enc1:	internal ABI
 * Encrypt one block with 10/12/14 rounds depending on KLEN.
 *  input:  KEYP = key schedule, KLEN = key length, STATE = plaintext block
 *  output: STATE = ciphertext block
 *  changed: KEY, TKEYP (T1)
 * TKEYP is biased so the same -0x20..0x70 offsets serve all key sizes;
 * larger keys enter earlier to run the extra rounds first.
 */
.align 4
_aesni_enc1:
 movaps (KEYP), KEY                     # round key 0
 mov KEYP, TKEYP
 pxor KEY, STATE                        # whitening (AddRoundKey 0)
 add $0x30, TKEYP
 cmp $24, KLEN
 jb .Lenc128                            # 128-bit key: 10 rounds
 lea 0x20(TKEYP), TKEYP
 je .Lenc192                            # 192-bit key: 12 rounds
 add $0x20, TKEYP                       # 256-bit key: 14 rounds
 movaps -0x60(TKEYP), KEY
 AESENC KEY STATE
 movaps -0x50(TKEYP), KEY
 AESENC KEY STATE
.align 4
.Lenc192:
 movaps -0x40(TKEYP), KEY
 AESENC KEY STATE
 movaps -0x30(TKEYP), KEY
 AESENC KEY STATE
.align 4
.Lenc128:
 movaps -0x20(TKEYP), KEY
 AESENC KEY STATE
 movaps -0x10(TKEYP), KEY
 AESENC KEY STATE
 movaps (TKEYP), KEY
 AESENC KEY STATE
 movaps 0x10(TKEYP), KEY
 AESENC KEY STATE
 movaps 0x20(TKEYP), KEY
 AESENC KEY STATE
 movaps 0x30(TKEYP), KEY
 AESENC KEY STATE
 movaps 0x40(TKEYP), KEY
 AESENC KEY STATE
 movaps 0x50(TKEYP), KEY
 AESENC KEY STATE
 movaps 0x60(TKEYP), KEY
 AESENC KEY STATE
 movaps 0x70(TKEYP), KEY
 AESENCLAST KEY STATE                   # final round (no MixColumns)
 ret
ENDPROC(_aesni_enc1)
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
/*
 * _aesni_enc4:	internal ABI
 * Encrypt four independent blocks in parallel (interleaved to hide
 * AESENC latency).  Same key-offset scheme as _aesni_enc1.
 *  input:  KEYP = key schedule, KLEN = key length, STATE1..STATE4 = blocks
 *  output: STATE1..STATE4 encrypted in place
 *  changed: KEY, TKEYP (T1)
 */
.align 4
_aesni_enc4:
 movaps (KEYP), KEY                     # round key 0
 mov KEYP, TKEYP
 pxor KEY, STATE1                       # whitening for all four lanes
 pxor KEY, STATE2
 pxor KEY, STATE3
 pxor KEY, STATE4
 add $0x30, TKEYP
 cmp $24, KLEN
 jb .L4enc128
 lea 0x20(TKEYP), TKEYP
 je .L4enc192
# extra two rounds for 256-bit keys
 add $0x20, TKEYP
 movaps -0x60(TKEYP), KEY
 AESENC KEY STATE1
 AESENC KEY STATE2
 AESENC KEY STATE3
 AESENC KEY STATE4
 movaps -0x50(TKEYP), KEY
 AESENC KEY STATE1
 AESENC KEY STATE2
 AESENC KEY STATE3
 AESENC KEY STATE4
# extra two rounds for 192-bit keys
.L4enc192:
 movaps -0x40(TKEYP), KEY
 AESENC KEY STATE1
 AESENC KEY STATE2
 AESENC KEY STATE3
 AESENC KEY STATE4
 movaps -0x30(TKEYP), KEY
 AESENC KEY STATE1
 AESENC KEY STATE2
 AESENC KEY STATE3
 AESENC KEY STATE4
# common 10 rounds
.L4enc128:
 movaps -0x20(TKEYP), KEY
 AESENC KEY STATE1
 AESENC KEY STATE2
 AESENC KEY STATE3
 AESENC KEY STATE4
 movaps -0x10(TKEYP), KEY
 AESENC KEY STATE1
 AESENC KEY STATE2
 AESENC KEY STATE3
 AESENC KEY STATE4
 movaps (TKEYP), KEY
 AESENC KEY STATE1
 AESENC KEY STATE2
 AESENC KEY STATE3
 AESENC KEY STATE4
 movaps 0x10(TKEYP), KEY
 AESENC KEY STATE1
 AESENC KEY STATE2
 AESENC KEY STATE3
 AESENC KEY STATE4
 movaps 0x20(TKEYP), KEY
 AESENC KEY STATE1
 AESENC KEY STATE2
 AESENC KEY STATE3
 AESENC KEY STATE4
 movaps 0x30(TKEYP), KEY
 AESENC KEY STATE1
 AESENC KEY STATE2
 AESENC KEY STATE3
 AESENC KEY STATE4
 movaps 0x40(TKEYP), KEY
 AESENC KEY STATE1
 AESENC KEY STATE2
 AESENC KEY STATE3
 AESENC KEY STATE4
 movaps 0x50(TKEYP), KEY
 AESENC KEY STATE1
 AESENC KEY STATE2
 AESENC KEY STATE3
 AESENC KEY STATE4
 movaps 0x60(TKEYP), KEY
 AESENC KEY STATE1
 AESENC KEY STATE2
 AESENC KEY STATE3
 AESENC KEY STATE4
 movaps 0x70(TKEYP), KEY
 AESENCLAST KEY STATE1                  # final round, all four lanes
 AESENCLAST KEY STATE2
 AESENCLAST KEY STATE3
 AESENCLAST KEY STATE4
 ret
ENDPROC(_aesni_enc4)
2104
2105
2106
2107
/*
 * void aesni_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 *
 * Decrypt a single 16-byte block.  The decryption key schedule lives at
 * ctx+240 (built by aesni_set_key), hence the `add $240, KEYP`.
 */
ENTRY(aesni_dec)
 FRAME_BEGIN
#ifndef __x86_64__
 pushl KEYP
 pushl KLEN
 movl (FRAME_OFFSET+12)(%esp), KEYP     # ctx
 movl (FRAME_OFFSET+16)(%esp), OUTP     # dst
 movl (FRAME_OFFSET+20)(%esp), INP      # src
#endif
 mov 480(KEYP), KLEN                    # key length selects round count
 add $240, KEYP                         # switch to the decryption schedule
 movups (INP), STATE
 call _aesni_dec1
 movups STATE, (OUTP)
#ifndef __x86_64__
 popl KLEN
 popl KEYP
#endif
 FRAME_END
 ret
ENDPROC(aesni_dec)
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
/*
 * _aesni_dec1:	internal ABI
 * Decrypt one block; mirror of _aesni_enc1 using AESDEC/AESDECLAST.
 *  input:  KEYP = decryption key schedule (ctx+240), KLEN = key length,
 *          STATE = ciphertext block
 *  output: STATE = plaintext block
 *  changed: KEY, TKEYP (T1)
 */
.align 4
_aesni_dec1:
 movaps (KEYP), KEY                     # round key 0 of dec schedule
 mov KEYP, TKEYP
 pxor KEY, STATE                        # whitening
 add $0x30, TKEYP
 cmp $24, KLEN
 jb .Ldec128                            # 128-bit: 10 rounds
 lea 0x20(TKEYP), TKEYP
 je .Ldec192                            # 192-bit: 12 rounds
 add $0x20, TKEYP                       # 256-bit: 14 rounds
 movaps -0x60(TKEYP), KEY
 AESDEC KEY STATE
 movaps -0x50(TKEYP), KEY
 AESDEC KEY STATE
.align 4
.Ldec192:
 movaps -0x40(TKEYP), KEY
 AESDEC KEY STATE
 movaps -0x30(TKEYP), KEY
 AESDEC KEY STATE
.align 4
.Ldec128:
 movaps -0x20(TKEYP), KEY
 AESDEC KEY STATE
 movaps -0x10(TKEYP), KEY
 AESDEC KEY STATE
 movaps (TKEYP), KEY
 AESDEC KEY STATE
 movaps 0x10(TKEYP), KEY
 AESDEC KEY STATE
 movaps 0x20(TKEYP), KEY
 AESDEC KEY STATE
 movaps 0x30(TKEYP), KEY
 AESDEC KEY STATE
 movaps 0x40(TKEYP), KEY
 AESDEC KEY STATE
 movaps 0x50(TKEYP), KEY
 AESDEC KEY STATE
 movaps 0x60(TKEYP), KEY
 AESDEC KEY STATE
 movaps 0x70(TKEYP), KEY
 AESDECLAST KEY STATE                   # final round (no InvMixColumns)
 ret
ENDPROC(_aesni_dec1)
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
/*
 * _aesni_dec4:	internal ABI
 * Decrypt four independent blocks in parallel; mirror of _aesni_enc4.
 *  input:  KEYP = decryption key schedule (ctx+240), KLEN = key length,
 *          STATE1..STATE4 = ciphertext blocks
 *  output: STATE1..STATE4 decrypted in place
 *  changed: KEY, TKEYP (T1)
 */
.align 4
_aesni_dec4:
 movaps (KEYP), KEY                     # round key 0
 mov KEYP, TKEYP
 pxor KEY, STATE1                       # whitening for all four lanes
 pxor KEY, STATE2
 pxor KEY, STATE3
 pxor KEY, STATE4
 add $0x30, TKEYP
 cmp $24, KLEN
 jb .L4dec128
 lea 0x20(TKEYP), TKEYP
 je .L4dec192
# extra two rounds for 256-bit keys
 add $0x20, TKEYP
 movaps -0x60(TKEYP), KEY
 AESDEC KEY STATE1
 AESDEC KEY STATE2
 AESDEC KEY STATE3
 AESDEC KEY STATE4
 movaps -0x50(TKEYP), KEY
 AESDEC KEY STATE1
 AESDEC KEY STATE2
 AESDEC KEY STATE3
 AESDEC KEY STATE4
# extra two rounds for 192-bit keys
.align 4
.L4dec192:
 movaps -0x40(TKEYP), KEY
 AESDEC KEY STATE1
 AESDEC KEY STATE2
 AESDEC KEY STATE3
 AESDEC KEY STATE4
 movaps -0x30(TKEYP), KEY
 AESDEC KEY STATE1
 AESDEC KEY STATE2
 AESDEC KEY STATE3
 AESDEC KEY STATE4
# common 10 rounds
.align 4
.L4dec128:
 movaps -0x20(TKEYP), KEY
 AESDEC KEY STATE1
 AESDEC KEY STATE2
 AESDEC KEY STATE3
 AESDEC KEY STATE4
 movaps -0x10(TKEYP), KEY
 AESDEC KEY STATE1
 AESDEC KEY STATE2
 AESDEC KEY STATE3
 AESDEC KEY STATE4
 movaps (TKEYP), KEY
 AESDEC KEY STATE1
 AESDEC KEY STATE2
 AESDEC KEY STATE3
 AESDEC KEY STATE4
 movaps 0x10(TKEYP), KEY
 AESDEC KEY STATE1
 AESDEC KEY STATE2
 AESDEC KEY STATE3
 AESDEC KEY STATE4
 movaps 0x20(TKEYP), KEY
 AESDEC KEY STATE1
 AESDEC KEY STATE2
 AESDEC KEY STATE3
 AESDEC KEY STATE4
 movaps 0x30(TKEYP), KEY
 AESDEC KEY STATE1
 AESDEC KEY STATE2
 AESDEC KEY STATE3
 AESDEC KEY STATE4
 movaps 0x40(TKEYP), KEY
 AESDEC KEY STATE1
 AESDEC KEY STATE2
 AESDEC KEY STATE3
 AESDEC KEY STATE4
 movaps 0x50(TKEYP), KEY
 AESDEC KEY STATE1
 AESDEC KEY STATE2
 AESDEC KEY STATE3
 AESDEC KEY STATE4
 movaps 0x60(TKEYP), KEY
 AESDEC KEY STATE1
 AESDEC KEY STATE2
 AESDEC KEY STATE3
 AESDEC KEY STATE4
 movaps 0x70(TKEYP), KEY
 AESDECLAST KEY STATE1                  # final round, all four lanes
 AESDECLAST KEY STATE2
 AESDECLAST KEY STATE3
 AESDECLAST KEY STATE4
 ret
ENDPROC(_aesni_dec4)
2296
2297
2298
2299
2300
/*
 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
 *                    size_t len)
 *
 * ECB encryption: 4-blocks-at-a-time main loop, 1-block tail loop.
 * Partial trailing bytes (< 16) are ignored.
 */
ENTRY(aesni_ecb_enc)
 FRAME_BEGIN
#ifndef __x86_64__
 pushl LEN
 pushl KEYP
 pushl KLEN
 movl (FRAME_OFFSET+16)(%esp), KEYP     # ctx
 movl (FRAME_OFFSET+20)(%esp), OUTP     # dst
 movl (FRAME_OFFSET+24)(%esp), INP      # src
 movl (FRAME_OFFSET+28)(%esp), LEN      # len
#endif
 test LEN, LEN                          # nothing to do for len == 0
 jz .Lecb_enc_ret
 mov 480(KEYP), KLEN
 cmp $16, LEN
 jb .Lecb_enc_ret                       # less than one block: done
 cmp $64, LEN
 jb .Lecb_enc_loop1                     # fewer than 4 blocks: scalar loop
.align 4
.Lecb_enc_loop4:
 movups (INP), STATE1
 movups 0x10(INP), STATE2
 movups 0x20(INP), STATE3
 movups 0x30(INP), STATE4
 call _aesni_enc4
 movups STATE1, (OUTP)
 movups STATE2, 0x10(OUTP)
 movups STATE3, 0x20(OUTP)
 movups STATE4, 0x30(OUTP)
 sub $64, LEN
 add $64, INP
 add $64, OUTP
 cmp $64, LEN
 jge .Lecb_enc_loop4
 cmp $16, LEN
 jb .Lecb_enc_ret
.align 4
.Lecb_enc_loop1:
 movups (INP), STATE1
 call _aesni_enc1
 movups STATE1, (OUTP)
 sub $16, LEN
 add $16, INP
 add $16, OUTP
 cmp $16, LEN
 jge .Lecb_enc_loop1
.Lecb_enc_ret:
#ifndef __x86_64__
 popl KLEN
 popl KEYP
 popl LEN
#endif
 FRAME_END
 ret
ENDPROC(aesni_ecb_enc)
2356
2357
2358
2359
2360
/*
 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
 *                    size_t len)
 *
 * ECB decryption; same structure as aesni_ecb_enc but uses the
 * decryption key schedule at ctx+240.
 */
ENTRY(aesni_ecb_dec)
 FRAME_BEGIN
#ifndef __x86_64__
 pushl LEN
 pushl KEYP
 pushl KLEN
 movl (FRAME_OFFSET+16)(%esp), KEYP     # ctx
 movl (FRAME_OFFSET+20)(%esp), OUTP     # dst
 movl (FRAME_OFFSET+24)(%esp), INP      # src
 movl (FRAME_OFFSET+28)(%esp), LEN      # len
#endif
 test LEN, LEN
 jz .Lecb_dec_ret
 mov 480(KEYP), KLEN
 add $240, KEYP                         # decryption schedule
 cmp $16, LEN
 jb .Lecb_dec_ret
 cmp $64, LEN
 jb .Lecb_dec_loop1
.align 4
.Lecb_dec_loop4:
 movups (INP), STATE1
 movups 0x10(INP), STATE2
 movups 0x20(INP), STATE3
 movups 0x30(INP), STATE4
 call _aesni_dec4
 movups STATE1, (OUTP)
 movups STATE2, 0x10(OUTP)
 movups STATE3, 0x20(OUTP)
 movups STATE4, 0x30(OUTP)
 sub $64, LEN
 add $64, INP
 add $64, OUTP
 cmp $64, LEN
 jge .Lecb_dec_loop4
 cmp $16, LEN
 jb .Lecb_dec_ret
.align 4
.Lecb_dec_loop1:
 movups (INP), STATE1
 call _aesni_dec1
 movups STATE1, (OUTP)
 sub $16, LEN
 add $16, INP
 add $16, OUTP
 cmp $16, LEN
 jge .Lecb_dec_loop1
.Lecb_dec_ret:
#ifndef __x86_64__
 popl KLEN
 popl KEYP
 popl LEN
#endif
 FRAME_END
 ret
ENDPROC(aesni_ecb_dec)
2417
2418
2419
2420
2421
/*
 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
 *                    size_t len, u8 *iv)
 *
 * CBC encryption.  Inherently serial (each block chains off the previous
 * ciphertext), so only a one-block loop.  Final ciphertext block is
 * written back to *iv for chaining across calls.
 */
ENTRY(aesni_cbc_enc)
 FRAME_BEGIN
#ifndef __x86_64__
 pushl IVP
 pushl LEN
 pushl KEYP
 pushl KLEN
 movl (FRAME_OFFSET+20)(%esp), KEYP     # ctx
 movl (FRAME_OFFSET+24)(%esp), OUTP     # dst
 movl (FRAME_OFFSET+28)(%esp), INP      # src
 movl (FRAME_OFFSET+32)(%esp), LEN      # len
 movl (FRAME_OFFSET+36)(%esp), IVP      # iv
#endif
 cmp $16, LEN
 jb .Lcbc_enc_ret                       # less than one block: nothing to do
 mov 480(KEYP), KLEN
 movups (IVP), STATE                    # STATE = IV
.align 4
.Lcbc_enc_loop:
 movups (INP), IN
 pxor IN, STATE                         # chain: P xor prev ciphertext/IV
 call _aesni_enc1
 movups STATE, (OUTP)                   # STATE now holds this ciphertext
 sub $16, LEN
 add $16, INP
 add $16, OUTP
 cmp $16, LEN
 jge .Lcbc_enc_loop
 movups STATE, (IVP)                    # save last ciphertext as next IV
.Lcbc_enc_ret:
#ifndef __x86_64__
 popl KLEN
 popl KEYP
 popl LEN
 popl IVP
#endif
 FRAME_END
 ret
ENDPROC(aesni_cbc_enc)
2461
2462
2463
2464
2465
/*
 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
 *                    size_t len, u8 *iv)
 *
 * CBC decryption.  Decryption parallelizes (each plaintext only needs
 * the previous *ciphertext*), so a 4-block _aesni_dec4 loop is used.
 * On 32-bit there are only 8 xmm registers, so IN3/IN4 cannot be kept
 * live across the call and the needed ciphertext blocks are re-loaded
 * from INP afterwards.
 */
ENTRY(aesni_cbc_dec)
 FRAME_BEGIN
#ifndef __x86_64__
 pushl IVP
 pushl LEN
 pushl KEYP
 pushl KLEN
 movl (FRAME_OFFSET+20)(%esp), KEYP     # ctx
 movl (FRAME_OFFSET+24)(%esp), OUTP     # dst
 movl (FRAME_OFFSET+28)(%esp), INP      # src
 movl (FRAME_OFFSET+32)(%esp), LEN      # len
 movl (FRAME_OFFSET+36)(%esp), IVP      # iv
#endif
 cmp $16, LEN
 jb .Lcbc_dec_just_ret
 mov 480(KEYP), KLEN
 add $240, KEYP                         # decryption schedule
 movups (IVP), IV
 cmp $64, LEN
 jb .Lcbc_dec_loop1
.align 4
.Lcbc_dec_loop4:
# keep copies of the ciphertext blocks: needed for un-chaining after decrypt
 movups (INP), IN1
 movaps IN1, STATE1
 movups 0x10(INP), IN2
 movaps IN2, STATE2
#ifdef __x86_64__
 movups 0x20(INP), IN3
 movaps IN3, STATE3
 movups 0x30(INP), IN4
 movaps IN4, STATE4
#else
 movups 0x20(INP), IN1                  # 32-bit: reuse IN1/IN2 (reg pressure)
 movaps IN1, STATE3
 movups 0x30(INP), IN2
 movaps IN2, STATE4
#endif
 call _aesni_dec4
 pxor IV, STATE1                        # P1 = D(C1) xor IV
#ifdef __x86_64__
 pxor IN1, STATE2                       # Pn = D(Cn) xor C(n-1)
 pxor IN2, STATE3
 pxor IN3, STATE4
 movaps IN4, IV                         # last ciphertext becomes next IV
#else
 pxor IN1, STATE4
 movaps IN2, IV
 movups (INP), IN1                      # re-load C1/C2 clobbered above
 pxor IN1, STATE2
 movups 0x10(INP), IN2
 pxor IN2, STATE3
#endif
 movups STATE1, (OUTP)
 movups STATE2, 0x10(OUTP)
 movups STATE3, 0x20(OUTP)
 movups STATE4, 0x30(OUTP)
 sub $64, LEN
 add $64, INP
 add $64, OUTP
 cmp $64, LEN
 jge .Lcbc_dec_loop4
 cmp $16, LEN
 jb .Lcbc_dec_ret
.align 4
.Lcbc_dec_loop1:
 movups (INP), IN
 movaps IN, STATE
 call _aesni_dec1
 pxor IV, STATE                         # un-chain
 movups STATE, (OUTP)
 movaps IN, IV                          # ciphertext becomes next IV
 sub $16, LEN
 add $16, INP
 add $16, OUTP
 cmp $16, LEN
 jge .Lcbc_dec_loop1
.Lcbc_dec_ret:
 movups IV, (IVP)                       # write back chaining value
.Lcbc_dec_just_ret:
#ifndef __x86_64__
 popl KLEN
 popl KEYP
 popl LEN
 popl IVP
#endif
 FRAME_END
 ret
ENDPROC(aesni_cbc_dec)
2554
2555#ifdef __x86_64__
/* PSHUFB mask that reverses all 16 bytes: converts the CTR counter block
 * between big-endian wire order and the little-endian order used for
 * 64-bit arithmetic in _aesni_inc. */
.pushsection .rodata
.align 16
.Lbswap_mask:
 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.popsection
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
/*
 * _aesni_inc_init:	internal ABI
 * Set up the registers used by _aesni_inc.
 *  input:   IV = big-endian counter block
 *  output:  CTR = IV byte-reversed (little-endian for 64-bit adds)
 *           TCTR_LOW = low 64 bits of CTR (for carry detection)
 *           INC = 1 (little-endian)
 *           BSWAP_MASK = byte-reversal shuffle mask
 * Fix: load .Lbswap_mask RIP-relative, matching every other constant
 * access in this file (SHUF_MASK(%rip), POLY(%rip), ...) and keeping the
 * code position-independent / relocatable.
 */
.align 4
_aesni_inc_init:
 movaps .Lbswap_mask(%rip), BSWAP_MASK
 movaps IV, CTR
 PSHUFB_XMM BSWAP_MASK CTR
 mov $1, TCTR_LOW
 MOVQ_R64_XMM TCTR_LOW INC
 MOVQ_R64_XMM CTR TCTR_LOW
 ret
ENDPROC(_aesni_inc_init)
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
/*
 * _aesni_inc:	internal ABI
 * Increment the 128-bit counter and emit the next big-endian counter
 * block in IV.  paddq only adds within 64-bit lanes, so TCTR_LOW shadows
 * the low qword: when its increment carries, INC is shifted into the
 * high qword and added again to propagate the carry, then restored.
 *  input:   CTR (little-endian counter), TCTR_LOW, INC = 1, BSWAP_MASK
 *  output:  IV = next counter block (big-endian); CTR/TCTR_LOW advanced
 */
.align 4
_aesni_inc:
 paddq INC, CTR                         # low qword += 1 (lane-local)
 add $1, TCTR_LOW
 jnc .Linc_low                          # no carry out of low 64 bits
 pslldq $8, INC                         # move the 1 into the high qword
 paddq INC, CTR                         # propagate carry
 psrldq $8, INC                         # restore INC = 1
.Linc_low:
 movaps CTR, IV
 PSHUFB_XMM BSWAP_MASK IV               # back to big-endian wire order
 ret
ENDPROC(_aesni_inc)
2612
2613
2614
2615
2616
/*
 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
 *                    size_t len, u8 *iv)
 *
 * CTR mode (x86_64 only): generate counter blocks with _aesni_inc,
 * encrypt 4 at a time with _aesni_enc4, XOR with the input.  The
 * advanced counter is written back to *iv.
 */
ENTRY(aesni_ctr_enc)
 FRAME_BEGIN
 cmp $16, LEN
 jb .Lctr_enc_just_ret
 mov 480(KEYP), KLEN
 movups (IVP), IV
 call _aesni_inc_init                   # set up CTR/INC/BSWAP_MASK
 cmp $64, LEN
 jb .Lctr_enc_loop1
.align 4
.Lctr_enc_loop4:
# produce 4 consecutive counter blocks, loading inputs in between
 movaps IV, STATE1
 call _aesni_inc
 movups (INP), IN1
 movaps IV, STATE2
 call _aesni_inc
 movups 0x10(INP), IN2
 movaps IV, STATE3
 call _aesni_inc
 movups 0x20(INP), IN3
 movaps IV, STATE4
 call _aesni_inc
 movups 0x30(INP), IN4
 call _aesni_enc4
# keystream xor plaintext -> ciphertext
 pxor IN1, STATE1
 movups STATE1, (OUTP)
 pxor IN2, STATE2
 movups STATE2, 0x10(OUTP)
 pxor IN3, STATE3
 movups STATE3, 0x20(OUTP)
 pxor IN4, STATE4
 movups STATE4, 0x30(OUTP)
 sub $64, LEN
 add $64, INP
 add $64, OUTP
 cmp $64, LEN
 jge .Lctr_enc_loop4
 cmp $16, LEN
 jb .Lctr_enc_ret
.align 4
.Lctr_enc_loop1:
 movaps IV, STATE
 call _aesni_inc
 movups (INP), IN
 call _aesni_enc1
 pxor IN, STATE
 movups STATE, (OUTP)
 sub $16, LEN
 add $16, INP
 add $16, OUTP
 cmp $16, LEN
 jge .Lctr_enc_loop1
.Lctr_enc_ret:
 movups IV, (IVP)                       # save advanced counter
.Lctr_enc_just_ret:
 FRAME_END
 ret
ENDPROC(aesni_ctr_enc)
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
/*
 * _aesni_gf128mul_x_ble(): multiply IV by x in GF(2^128), block-level
 * little-endian representation (the XTS tweak update).  paddq doubles
 * both qwords; CTR is used as scratch to compute the cross-qword carry
 * and conditional polynomial (0x87) reduction selected through
 * GF128MUL_MASK, which must hold .Lgf128mul_x_ble_mask.
 */
#define _aesni_gf128mul_x_ble() \
 pshufd $0x13, IV, CTR; \
 paddq IV, IV; \
 psrad $31, CTR; \
 pand GF128MUL_MASK, CTR; \
 pxor CTR, IV;
2693
2694
2695
2696
2697
/*
 * void aesni_xts_crypt8(const struct crypto_aes_ctx *ctx, u8 *dst,
 *                       const u8 *src, bool enc, le128 *iv)
 *
 * Process 8 XTS blocks: tweak-XOR on input, AES (two calls of the
 * 4-block helper selected through %r11), tweak-XOR on output, and write
 * the next tweak back to *iv.  %cl == 0 selects decryption (key
 * schedule at ctx+240, _aesni_dec4); nonzero selects encryption.
 * The output buffer is used as temporary storage for the 8 tweaks
 * between the input-XOR and output-XOR passes.
 *
 * Fix: load .Lgf128mul_x_ble_mask RIP-relative, matching every other
 * constant access in this file and keeping the code relocatable.
 */
ENTRY(aesni_xts_crypt8)
 FRAME_BEGIN
 cmpb $0, %cl
 movl $0, %ecx
 movl $240, %r10d
 leaq _aesni_enc4, %r11
 leaq _aesni_dec4, %rax
 cmovel %r10d, %ecx                     # decrypt: key offset 240
 cmoveq %rax, %r11                      # decrypt: use _aesni_dec4

 movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
 movups (IVP), IV                       # initial tweak

 mov 480(KEYP), KLEN
 addq %rcx, KEYP

# first 4 blocks: XOR in tweaks, stash each tweak at the output slot
 movdqa IV, STATE1
 movdqu 0x00(INP), INC
 pxor INC, STATE1
 movdqu IV, 0x00(OUTP)

 _aesni_gf128mul_x_ble()
 movdqa IV, STATE2
 movdqu 0x10(INP), INC
 pxor INC, STATE2
 movdqu IV, 0x10(OUTP)

 _aesni_gf128mul_x_ble()
 movdqa IV, STATE3
 movdqu 0x20(INP), INC
 pxor INC, STATE3
 movdqu IV, 0x20(OUTP)

 _aesni_gf128mul_x_ble()
 movdqa IV, STATE4
 movdqu 0x30(INP), INC
 pxor INC, STATE4
 movdqu IV, 0x30(OUTP)

 call *%r11                             # AES on blocks 0-3

# XOR the stashed tweaks back in; meanwhile set up blocks 4-7
 movdqu 0x00(OUTP), INC
 pxor INC, STATE1
 movdqu STATE1, 0x00(OUTP)

 _aesni_gf128mul_x_ble()
 movdqa IV, STATE1
 movdqu 0x40(INP), INC
 pxor INC, STATE1
 movdqu IV, 0x40(OUTP)

 movdqu 0x10(OUTP), INC
 pxor INC, STATE2
 movdqu STATE2, 0x10(OUTP)

 _aesni_gf128mul_x_ble()
 movdqa IV, STATE2
 movdqu 0x50(INP), INC
 pxor INC, STATE2
 movdqu IV, 0x50(OUTP)

 movdqu 0x20(OUTP), INC
 pxor INC, STATE3
 movdqu STATE3, 0x20(OUTP)

 _aesni_gf128mul_x_ble()
 movdqa IV, STATE3
 movdqu 0x60(INP), INC
 pxor INC, STATE3
 movdqu IV, 0x60(OUTP)

 movdqu 0x30(OUTP), INC
 pxor INC, STATE4
 movdqu STATE4, 0x30(OUTP)

 _aesni_gf128mul_x_ble()
 movdqa IV, STATE4
 movdqu 0x70(INP), INC
 pxor INC, STATE4
 movdqu IV, 0x70(OUTP)

 _aesni_gf128mul_x_ble()
 movups IV, (IVP)                       # save tweak for the next 8 blocks

 call *%r11                             # AES on blocks 4-7

 movdqu 0x40(OUTP), INC
 pxor INC, STATE1
 movdqu STATE1, 0x40(OUTP)

 movdqu 0x50(OUTP), INC
 pxor INC, STATE2
 movdqu STATE2, 0x50(OUTP)

 movdqu 0x60(OUTP), INC
 pxor INC, STATE3
 movdqu STATE3, 0x60(OUTP)

 movdqu 0x70(OUTP), INC
 pxor INC, STATE4
 movdqu STATE4, 0x70(OUTP)

 FRAME_END
 ret
ENDPROC(aesni_xts_crypt8)
2803
2804#endif
2805