1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65#include <linux/linkage.h>
66
/*
 * Move instruction used for the caller's input and output buffers.
 * It is the unaligned form, so those buffers need no particular alignment.
 */
#define VMOVDQ		vmovdqu

/* Per-block working registers: xdataN holds counter block N, then its keystream. */
#define xdata0		%xmm0
#define xdata1		%xmm1
#define xdata2		%xmm2
#define xdata3		%xmm3
#define xdata4		%xmm4
#define xdata5		%xmm5
#define xdata6		%xmm6
#define xdata7		%xmm7
/* Running counter, kept byte-swapped relative to the IV stored in memory. */
#define xcounter	%xmm8
/* vpshufb control mask, loaded from byteswap_const. */
#define xbyteswap	%xmm9
/*
 * Round keys kept resident across the 8-block main loop.
 * For KEY_128 they hold round keys 0, 3, 6 and 9; for the longer key sizes
 * they hold round keys 0, 4, 8 and 12.
 */
#define xkey0		%xmm10
#define xkey4		%xmm11
#define xkey8		%xmm12
#define xkey12		%xmm13
/* Scratch: the remaining round keys, and later the input blocks to XOR. */
#define xkeyA		%xmm14
#define xkeyB		%xmm15

/* Arguments, in the first five System V integer argument registers. */
#define p_in		%rdi
#define p_iv		%rsi
#define p_keys		%rdx
#define p_out		%rcx
#define num_bytes	%r8

/* Scratch general-purpose register. */
#define tmp		%r10
/* Selector values for the club macro; DDQ_DATA is not referenced in this part of the file. */
#define DDQ_DATA	0
#define XDATA		1
/* Key-size selectors passed to do_aes and do_aes_ctrmain. */
#define KEY_128		1
#define KEY_192		2
#define KEY_256		3
98
.section .rodata
.align 16

/* vpshufb mask that reverses the 16 bytes of a register. */
byteswap_const:
	.octa 0x000102030405060708090A0B0C0D0E0F
/* Selects the low quadword; used with vptest to detect a low half of zero. */
ddq_low_msk:
	.octa 0x0000000000000000FFFFFFFFFFFFFFFF
/* Adds one to the high quadword, i.e. the carry out of the low quadword. */
ddq_high_add_1:
	.octa 0x00000000000000010000000000000000
/*
 * Counter increments 1..8.
 * do_aes addresses these as ddq_add_1 + 16 * (n - 1), so the eight entries
 * must stay contiguous and in ascending order.
 */
ddq_add_1:
	.octa 0x00000000000000000000000000000001
ddq_add_2:
	.octa 0x00000000000000000000000000000002
ddq_add_3:
	.octa 0x00000000000000000000000000000003
ddq_add_4:
	.octa 0x00000000000000000000000000000004
ddq_add_5:
	.octa 0x00000000000000000000000000000005
ddq_add_6:
	.octa 0x00000000000000000000000000000006
ddq_add_7:
	.octa 0x00000000000000000000000000000007
ddq_add_8:
	.octa 0x00000000000000000000000000000008

.text
126
127
128
129
/*
 * setxdata n
 * Bind the assembler symbol var_xdata to register %xmm<n>, so that code
 * generated inside a .rept loop can name "the n-th data register".
 */
.macro setxdata n
	var_xdata = %xmm\n
.endm
133
134
135
/*
 * club name, id
 * Combine a register-class selector with a numeric id.
 * When name is XDATA, var_xdata is bound to %xmm<id>; any other selector
 * expands to nothing.
 * .altmacro is enabled only around the expansion so that the % operator
 * turns the current value of the id symbol into a literal number before it
 * is passed to setxdata.
 */
.macro club name, id
.altmacro
	.if \name == XDATA
		setxdata %\id
	.endif
.noaltmacro
.endm
143
144
145
146
147
/*
 * do_aes b, k, key_len
 *
 * Emit straight-line code that produces b blocks of AES-CTR output.
 *
 *   b        number of 16-byte blocks to process; uses xdata0..xdata(b-1)
 *   k        non-zero: also (re)load the resident round keys
 *            xkey0/xkey4/xkey8/xkey12 from p_keys; zero: assume they are
 *            already loaded
 *   key_len  KEY_128, KEY_192 or KEY_256; selects 10, 12 or 14 rounds
 *
 * Round key r is read from offset 16 * r of p_keys with vmovdqa, so p_keys
 * must be 16-byte aligned.
 *
 * Effects:
 *   - xcounter is advanced by b, with carry from the low into the high
 *     quadword
 *   - p_in is advanced by 16 * b
 *   - b blocks are stored at p_out; p_out itself is NOT advanced
 *   - clobbers xkeyA, xkeyB, the xdata registers used, and the flags
 */
.macro do_aes b, k, key_len
	.set by, \b
	.set load_keys, \k
	.set klen, \key_len

	.if (load_keys)
		vmovdqa	0*16(p_keys), xkey0
	.endif

	/* Block 0 uses the current counter, swapped back to memory byte order. */
	vpshufb	xbyteswap, xcounter, xdata0

	/*
	 * Blocks 1..by-1 use counter + i. vpaddq adds per quadword, so a wrap
	 * of the low quadword is detected by its becoming zero and the carry is
	 * added by hand. The carry is also applied to xcounter so that later
	 * blocks of this batch, and the final counter update, already see it.
	 */
	.set i, 1
	.rept (by - 1)
		club XDATA, i
		vpaddq	(ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata
		vptest	ddq_low_msk(%rip), var_xdata
		jnz 1f
		vpaddq	ddq_high_add_1(%rip), var_xdata, var_xdata
		vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
		1:
		vpshufb	xbyteswap, var_xdata, var_xdata
		.set i, (i +1)
	.endr

	vmovdqa	1*16(p_keys), xkeyA

	/* Round 0 (key whitening) for block 0. */
	vpxor	xkey0, xdata0, xdata0
	/* Advance the running counter past this batch, again with manual carry. */
	vpaddq	(ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter
	vptest	ddq_low_msk(%rip), xcounter
	jnz	1f
	vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
	1:

	/* Round 0 for the remaining blocks. */
	.set i, 1
	.rept (by - 1)
		club XDATA, i
		vpxor	xkey0, var_xdata, var_xdata
		.set i, (i +1)
	.endr

	vmovdqa	2*16(p_keys), xkeyB

	/* Round 1 */
	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata
		.set i, (i +1)
	.endr

	/* Round key 3: resident (xkey4) for KEY_128, scratch otherwise. */
	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	3*16(p_keys), xkey4
		.endif
	.else
		vmovdqa	3*16(p_keys), xkeyA
	.endif

	/* Round 2 */
	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyB, var_xdata, var_xdata
		.set i, (i +1)
	.endr

	/* From here on the input is addressed with negative offsets from p_in. */
	add	$(16*by), p_in

	/* Round key 4: scratch for KEY_128, resident (xkey4) otherwise. */
	.if (klen == KEY_128)
		vmovdqa	4*16(p_keys), xkeyB
	.else
		.if (load_keys)
			vmovdqa	4*16(p_keys), xkey4
		.endif
	.endif

	/* Round 3 */
	.set i, 0
	.rept by
		club XDATA, i

		.if (klen == KEY_128)
			vaesenc	xkey4, var_xdata, var_xdata
		.else
			vaesenc	xkeyA, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	vmovdqa	5*16(p_keys), xkeyA

	/* Round 4 */
	.set i, 0
	.rept by
		club XDATA, i

		.if (klen == KEY_128)
			vaesenc	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkey4, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	/* Round key 6: resident (xkey8) for KEY_128, scratch otherwise. */
	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	6*16(p_keys), xkey8
		.endif
	.else
		vmovdqa	6*16(p_keys), xkeyB
	.endif

	/* Round 5 */
	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata
		.set i, (i +1)
	.endr

	vmovdqa	7*16(p_keys), xkeyA

	/* Round 6 */
	.set i, 0
	.rept by
		club XDATA, i

		.if (klen == KEY_128)
			vaesenc	xkey8, var_xdata, var_xdata
		.else
			vaesenc	xkeyB, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	/* Round key 8: scratch for KEY_128, resident (xkey8) otherwise. */
	.if (klen == KEY_128)
		vmovdqa	8*16(p_keys), xkeyB
	.else
		.if (load_keys)
			vmovdqa	8*16(p_keys), xkey8
		.endif
	.endif

	/* Round 7 */
	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata
		.set i, (i +1)
	.endr

	/* Round key 9: resident (xkey12) for KEY_128, scratch otherwise. */
	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	9*16(p_keys), xkey12
		.endif
	.else
		vmovdqa	9*16(p_keys), xkeyA
	.endif

	/* Round 8 */
	.set i, 0
	.rept by
		club XDATA, i

		.if (klen == KEY_128)
			vaesenc	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkey8, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	vmovdqa	10*16(p_keys), xkeyB

	/* Round 9 */
	.set i, 0
	.rept by
		club XDATA, i

		.if (klen == KEY_128)
			vaesenc	xkey12, var_xdata, var_xdata
		.else
			vaesenc	xkeyA, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen != KEY_128)
		vmovdqa	11*16(p_keys), xkeyA
	.endif

	/* Round 10: the final round for KEY_128, a regular round otherwise. */
	.set i, 0
	.rept by
		club XDATA, i

		.if (klen == KEY_128)
			vaesenclast	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkeyB, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen != KEY_128)
		.if (load_keys)
			vmovdqa	12*16(p_keys), xkey12
		.endif

		/* Round 11 */
		.set i, 0
		.rept by
			club XDATA, i
			vaesenc	xkeyA, var_xdata, var_xdata
			.set i, (i +1)
		.endr

		.if (klen == KEY_256)
			vmovdqa	13*16(p_keys), xkeyA
		.endif

		/* Round 12: the final round for KEY_192, a regular round for KEY_256. */
		.set i, 0
		.rept by
			club XDATA, i
			.if (klen == KEY_256)

				vaesenc	xkey12, var_xdata, var_xdata
			.else
				vaesenclast	xkey12, var_xdata, var_xdata
			.endif
			.set i, (i +1)
		.endr

		.if (klen == KEY_256)
			vmovdqa	14*16(p_keys), xkeyB

			/* Round 13 */
			.set i, 0
			.rept by
				club XDATA, i

				vaesenc	xkeyA, var_xdata, var_xdata
				.set i, (i +1)
			.endr

			/* Round 14: the final round for KEY_256. */
			.set i, 0
			.rept by
				club XDATA, i

				vaesenclast	xkeyB, var_xdata, var_xdata
				.set i, (i +1)
			.endr
		.endif
	.endif

	/*
	 * XOR the keystream with the input, two blocks per step. xkeyA and
	 * xkeyB are free again and serve as load targets. p_in was already
	 * advanced, hence the (index - by) offsets.
	 */
	.set i, 0
	.rept (by / 2)
		.set j, (i+1)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		VMOVDQ	(j*16 - 16*by)(p_in), xkeyB
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
		club XDATA, j
		vpxor	xkeyB, var_xdata, var_xdata
		.set i, (i+2)
	.endr

	/* Odd block count: handle the last block on its own. */
	.if (i < by)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
	.endif

	/* Store the result; the caller of this macro advances p_out. */
	.set i, 0
	.rept by
		club XDATA, i
		VMOVDQ	var_xdata, i*16(p_out)
		.set i, (i+1)
	.endr
.endm
416
/*
 * do_aes_load val, key_len
 * Process val blocks and load the resident round keys along the way
 * (do_aes with its load-keys flag set to 1).
 */
.macro do_aes_load val, key_len
	do_aes \val, 1, \key_len
.endm
420
/*
 * do_aes_noload val, key_len
 * Process val blocks assuming xkey0, xkey4, xkey8 and xkey12 already hold
 * the resident round keys (do_aes with its load-keys flag set to 0).
 */
.macro do_aes_noload val, key_len
	do_aes \val, 0, \key_len
.endm
424
425
426
/*
 * do_aes_ctrmain key_len
 *
 * Complete function body for one key size (KEY_128, KEY_192 or KEY_256).
 *
 * In:   p_in      input buffer
 *       p_iv      16-byte counter block; read, and updated on return
 *       p_keys    expanded key, 16-byte aligned (read with vmovdqa)
 *       p_out     output buffer
 *       num_bytes byte count; only whole 16-byte blocks are processed
 * Clobbers: tmp, xmm0-xmm15, flags; p_in, p_out and num_bytes are modified.
 *
 * Flow: the (block count mod 8) leading blocks are handled first by a
 * dedicated 1..7 block expansion of do_aes, which also loads the resident
 * round keys. The remainder is processed eight blocks per iteration.
 *
 * If num_bytes is below one block the macro returns without touching the IV:
 * at that point neither xbyteswap nor xcounter has been loaded, so writing
 * xcounter back would store stale register contents over the IV.
 */
.macro do_aes_ctrmain key_len
	cmp	$16, num_bytes
	jb	.Ldo_exit\key_len

	vmovdqa	byteswap_const(%rip), xbyteswap
	vmovdqu	(p_iv), xcounter
	vpshufb	xbyteswap, xcounter, xcounter

	/* tmp = 16 * (number of whole blocks mod 8) */
	mov	num_bytes, tmp
	and	$(7*16), tmp
	jz	.Lmult_of_8_blks\key_len

	/* tmp is 1*16 .. 7*16 here: binary dispatch, unsigned compares. */
	cmp	$(4*16), tmp
	ja	.Lgt4\key_len
	je	.Leq4\key_len

.Llt4\key_len:
	cmp	$(2*16), tmp
	ja	.Leq3\key_len
	je	.Leq2\key_len

	/*
	 * Each tail case below processes its blocks, advances p_out and then
	 * rounds num_bytes down to a multiple of 8 blocks: (~7*16) is
	 * (~7) * 16 = -128, which clears the low seven bits.
	 */
.Leq1\key_len:
	do_aes_load	1, \key_len
	add	$(1*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq2\key_len:
	do_aes_load	2, \key_len
	add	$(2*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len


.Leq3\key_len:
	do_aes_load	3, \key_len
	add	$(3*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq4\key_len:
	do_aes_load	4, \key_len
	add	$(4*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Lgt4\key_len:
	cmp	$(6*16), tmp
	ja	.Leq7\key_len
	je	.Leq6\key_len

.Leq5\key_len:
	do_aes_load	5, \key_len
	add	$(5*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq6\key_len:
	do_aes_load	6, \key_len
	add	$(6*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq7\key_len:
	do_aes_load	7, \key_len
	add	$(7*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Lmult_of_8_blks\key_len:
	/*
	 * Drop any trailing partial block, exactly as the tail cases do.
	 * Without this a length such as 8*16 + 2 would make the loop counter
	 * step past zero and the loop would run off the end of the buffers.
	 * num_bytes is at least 8*16 here, so the result is never zero.
	 */
	and	$(~7*16), num_bytes
	/* No tail case ran, so the resident round keys are loaded here. */
	.if (\key_len != KEY_128)
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	4*16(p_keys), xkey4
		vmovdqa	8*16(p_keys), xkey8
		vmovdqa	12*16(p_keys), xkey12
	.else
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	3*16(p_keys), xkey4
		vmovdqa	6*16(p_keys), xkey8
		vmovdqa	9*16(p_keys), xkey12
	.endif
.align 16
.Lmain_loop2\key_len:
	/* num_bytes is a non-zero multiple of 8*16 on every iteration. */
	do_aes_noload	8, \key_len
	add	$(8*16), p_out
	sub	$(8*16), num_bytes
	jne	.Lmain_loop2\key_len

.Ldo_return2\key_len:
	/* Hand the updated counter back in the caller's byte order. */
	vpshufb	xbyteswap, xcounter, xcounter
	vmovdqu	xcounter, (p_iv)
.Ldo_exit\key_len:
	/* NOTE(review): some kernel trees require the RET macro here - confirm. */
	ret
.endm
530
531
532
533
534
535
536
537
/*
 * aes_ctr_enc_128_avx_by8 - AES-CTR with the 10-round (KEY_128) schedule.
 *
 * Register arguments, as named by the defines in this file:
 *   %rdi = p_in, %rsi = p_iv, %rdx = p_keys, %rcx = p_out, %r8 = num_bytes
 * Presumably called from C as (in, iv, keys, out, num_bytes) - confirm
 * against the prototype in the glue code.
 *
 * p_keys is read with aligned loads and must be 16-byte aligned. The
 * counter block at p_iv is updated on return.
 */
SYM_FUNC_START(aes_ctr_enc_128_avx_by8)
	/* the whole function body comes from the shared macro */
	do_aes_ctrmain KEY_128

SYM_FUNC_END(aes_ctr_enc_128_avx_by8)
543
544
545
546
547
548
549
550
/*
 * aes_ctr_enc_192_avx_by8 - AES-CTR with the 12-round (KEY_192) schedule.
 *
 * Register arguments, as named by the defines in this file:
 *   %rdi = p_in, %rsi = p_iv, %rdx = p_keys, %rcx = p_out, %r8 = num_bytes
 * Presumably called from C as (in, iv, keys, out, num_bytes) - confirm
 * against the prototype in the glue code.
 *
 * p_keys is read with aligned loads and must be 16-byte aligned. The
 * counter block at p_iv is updated on return.
 */
SYM_FUNC_START(aes_ctr_enc_192_avx_by8)
	/* the whole function body comes from the shared macro */
	do_aes_ctrmain KEY_192

SYM_FUNC_END(aes_ctr_enc_192_avx_by8)
556
557
558
559
560
561
562
563
/*
 * aes_ctr_enc_256_avx_by8 - AES-CTR with the 14-round (KEY_256) schedule.
 *
 * Register arguments, as named by the defines in this file:
 *   %rdi = p_in, %rsi = p_iv, %rdx = p_keys, %rcx = p_out, %r8 = num_bytes
 * Presumably called from C as (in, iv, keys, out, num_bytes) - confirm
 * against the prototype in the glue code.
 *
 * p_keys is read with aligned loads and must be 16-byte aligned. The
 * counter block at p_iv is updated on return.
 */
SYM_FUNC_START(aes_ctr_enc_256_avx_by8)
	/* the whole function body comes from the shared macro */
	do_aes_ctrmain KEY_256

SYM_FUNC_END(aes_ctr_enc_256_avx_by8)
569