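/*
 * SHA-256 block transform for x86-64 using 128-bit AVX instructions.
 *
 * GNU assembler, AT&T syntax (source operand first, destination last).  The
 * file is passed through the C preprocessor before it is assembled.
 *
 * The single exported entry point is sha256_transform_avx.
 */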
#ifdef CONFIG_AS_AVX
/* Presumably the source of SYM_FUNC_START/SYM_FUNC_END used further down. */
#include <linux/linkage.h>

/*
 * Load instruction used for message blocks.  The unaligned form is chosen,
 * so the input pointer needs no particular alignment.
 */
#define VMOVDQ vmovdqu

/*
 * addm mem, reg
 *
 * reg += mem, then the sum is written back to mem, so both operands end up
 * holding the same value.  Clobbers flags.
 */
.macro addm p1 p2
	add	\p1, \p2
	mov	\p2, \p1
.endm

/*
 * MY_ROR count, reg
 *
 * Rotate reg right by count bits.  Every use in this file passes a 32-bit
 * register.  The rotation is spelled as a double shift left of reg into
 * itself by (32 - count), which yields the same bit pattern.  Clobbers flags.
 */
.macro MY_ROR p1 p2
	shld	$(32-(\p1)), \p2, \p2
.endm

/*
 * COPY_XMM_AND_BSWAP xmm, mem, mask
 *
 * Load 16 bytes from mem into xmm with the VMOVDQ load, then permute the
 * bytes of xmm in place through vpshufb with mask as the shuffle control.
 * When mask is BYTE_FLIP_MASK the result is each 32-bit lane byte-reversed.
 */
.macro COPY_XMM_AND_BSWAP p1 p2 p3
	VMOVDQ	\p2, \p1
	vpshufb	\p3, \p1, \p1
.endm

/*
 * Register aliases.  These are assembler symbols, not fixed names: rotate_Xs
 * and ROTATE_ARGS re-bind X0..X3 and a..h at assembly time.
 */

/* Message schedule window: sixteen 32-bit words, four per register. */
X0 = %xmm4
X1 = %xmm5
X2 = %xmm6
X3 = %xmm7

/* Vector scratch registers. */
XTMP0 = %xmm0
XTMP1 = %xmm1
XTMP2 = %xmm2
XTMP3 = %xmm3
XTMP4 = %xmm8
XFER = %xmm9		# schedule words plus round constants, on the way to the stack
XTMP5 = %xmm11

/* Shuffle controls and byte-swap mask, loaded from .rodata once per call. */
SHUF_00BA = %xmm10
SHUF_DC00 = %xmm12
BYTE_FLIP_MASK = %xmm13

/* Arguments, in the System V AMD64 integer argument registers. */
NUM_BLKS = %rdx		# third argument
INP = %rsi		# second argument
CTX = %rdi		# first argument

/*
 * Working registers.  SRND shares %rsi with INP, and e shares %rdx with
 * NUM_BLKS: sha256_transform_avx parks the end pointer and the input pointer
 * on the stack before either alias is written.
 */
SRND = %rsi
c = %ecx
d = %r8d
e = %edx
TBL = %r12		# cursor into the K256 constant table
a = %eax
b = %ebx

f = %r9d
g = %r10d
h = %r11d

/* Scalar temporaries for one round. */
y0 = %r13d
y1 = %r14d
y2 = %r15d

/*
 * Stack frame layout: byte offsets from the 16-byte-aligned %rsp that
 * sha256_transform_avx establishes.
 *
 *   _INP_END   8 bytes   pointer one past the last input byte
 *   _INP       8 bytes   saved input pointer for the current block
 *   _XFER     16 bytes   four round inputs (schedule word + constant)
 *   _XMM_SAVE  0 bytes   empty slot, kept only so the layout stays explicit
 */
_INP_END_SIZE = 8
_INP_SIZE = 8
_XFER_SIZE = 16
_XMM_SAVE_SIZE = 0

_INP_END = 0
_INP = _INP_END + _INP_END_SIZE
_XFER = _INP + _INP_SIZE
_XMM_SAVE = _XFER + _XFER_SIZE
STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE

/* _XFER is written with vmovdqa, which faults on a misaligned address. */
.if (_XFER & 15)
.error "_XFER must be a multiple of 16 bytes from the aligned stack pointer"
.endif

/*
 * rotate_Xs
 *
 * Re-bind the symbols X0..X3 one step to the left: X0 takes the old X1, X1
 * the old X2, X2 the old X3 and X3 the old X0.  This is pure assembly-time
 * renaming; no instruction is emitted and no register contents move.
 * X_ is used as the temporary symbol.
 */
.macro rotate_Xs
X_ = X0
X0 = X1
X1 = X2
X2 = X3
X3 = X_
.endm

/*
 * ROTATE_ARGS
 *
 * Re-bind the symbols a..h for the next round: each of b..h takes the symbol
 * that preceded it (b <- a, c <- b, ... h <- g) and a takes the old h.  This
 * is pure assembly-time renaming; no instruction is emitted.  Eight
 * applications restore the original binding.  TMP_ is the temporary symbol.
 */
.macro ROTATE_ARGS
TMP_ = h
h = g
g = f
f = e
e = d
d = c
c = b
b = a
a = TMP_
.endm

/*
 * FOUR_ROUNDS_AND_SCHED
 *
 * Run four SHA-256 rounds on a..h while computing the next four schedule
 * words in the vector unit.  Scalar and vector instructions are interleaved
 * on purpose; the order of the statements below must not be changed.
 *
 * On entry X0..X3 hold the sixteen most recent schedule words, oldest in X0,
 * and the four dwords at _XFER(%rsp) hold the round inputs (the caller in
 * this file stores X0 plus four K256 constants there).
 *
 * On exit the four new words are in the register that was X0, rotate_Xs has
 * made it the new X3, and a..h have been re-bound four times.
 *
 * Notation: ror(x, n) is a 32-bit rotate right.  W[-k] is the schedule word
 * k positions before the first word being produced.  A vector comment names
 * the lowest lane; the other three lanes hold the following words.
 *
 * Clobbers y0, y1, y2, XTMP0..XTMP5 and flags.
 */
.macro FOUR_ROUNDS_AND_SCHED

	/* Round 0; vector side forms W[-16] + W[-7] and starts sigma0(W[-15]). */
	mov	e, y0				# y0 = e
	MY_ROR	(25-11), y0			# y0 = ror(e, 14)
	mov	a, y1				# y1 = a
	vpalignr $4, X2, X3, XTMP0		# XTMP0 = W[-7]
	MY_ROR	(22-13), y1			# y1 = ror(a, 9)
	xor	e, y0				# y0 = e ^ ror(e, 14)
	mov	f, y2				# y2 = f
	MY_ROR	(11-6), y0			# y0 = ror(e, 5) ^ ror(e, 19)
	xor	a, y1				# y1 = a ^ ror(a, 9)
	xor	g, y2				# y2 = f ^ g
	vpaddd	X0, XTMP0, XTMP0		# XTMP0 = W[-7] + W[-16]
	xor	e, y0				# y0 = e ^ ror(e, 5) ^ ror(e, 19)
	and	e, y2				# y2 = (f ^ g) & e
	MY_ROR	(13-2), y1			# y1 = ror(a, 11) ^ ror(a, 20)

	vpalignr $4, X0, X1, XTMP1		# XTMP1 = W[-15]
	xor	a, y1				# y1 = a ^ ror(a, 11) ^ ror(a, 20)
	MY_ROR	6, y0				# y0 = S1 = ror(e,6) ^ ror(e,11) ^ ror(e,25)
	xor	g, y2				# y2 = CH = ((f ^ g) & e) ^ g
	MY_ROR	2, y1				# y1 = S0 = ror(a,2) ^ ror(a,13) ^ ror(a,22)
	add	y0, y2				# y2 = S1 + CH
	add	_XFER(%rsp), y2			# y2 += round input 0
	mov	a, y0				# y0 = a
	add	y2, h				# h += S1 + CH + round input
	mov	a, y2				# y2 = a
	vpsrld	$7, XTMP1, XTMP2		# XTMP2 = W[-15] >> 7
	or	c, y0				# y0 = a | c
	add	h, d				# d += h
	and	c, y2				# y2 = a & c
	vpslld	$(32-7), XTMP1, XTMP3		# XTMP3 = W[-15] << 25
	and	b, y0				# y0 = (a | c) & b
	add	y1, h				# h += S0
	vpor	XTMP2, XTMP3, XTMP3		# XTMP3 = ror(W[-15], 7)
	or	y2, y0				# y0 = MAJ = ((a | c) & b) | (a & c)
	add	y0, h				# h += MAJ
	ROTATE_ARGS

	/* Round 1; vector side finishes sigma0 and starts sigma1 on W[-2], W[-1]. */
	mov	e, y0				# y0 = e
	mov	a, y1				# y1 = a
	MY_ROR	(25-11), y0
	xor	e, y0
	mov	f, y2				# y2 = f
	MY_ROR	(22-13), y1
	vpsrld	$18, XTMP1, XTMP2		# XTMP2 = W[-15] >> 18
	xor	a, y1
	MY_ROR	(11-6), y0
	xor	g, y2				# y2 = f ^ g
	vpsrld	$3, XTMP1, XTMP4		# XTMP4 = W[-15] >> 3
	MY_ROR	(13-2), y1
	xor	e, y0
	and	e, y2				# y2 = (f ^ g) & e
	MY_ROR	6, y0				# y0 = S1
	vpslld	$(32-18), XTMP1, XTMP1		# XTMP1 = W[-15] << 14
	xor	a, y1
	xor	g, y2				# y2 = CH
	vpxor	XTMP1, XTMP3, XTMP3		# XTMP3 = ror(W[-15], 7) ^ (W[-15] << 14)
	add	y0, y2				# y2 = S1 + CH
	add	(1*4 + _XFER)(%rsp), y2		# y2 += round input 1
	MY_ROR	2, y1				# y1 = S0
	vpxor	XTMP2, XTMP3, XTMP3		# XTMP3 = ror(W[-15], 7) ^ ror(W[-15], 18)
	mov	a, y0
	add	y2, h				# h += S1 + CH + round input
	mov	a, y2
	vpxor	XTMP4, XTMP3, XTMP1		# XTMP1 = sigma0(W[-15])
	or	c, y0
	add	h, d				# d += h
	and	c, y2

	vpshufd	$0b11111010, X3, XTMP2		# XTMP2 = {W[-1], W[-1], W[-2], W[-2]}
	and	b, y0
	add	y1, h				# h += S0
	vpaddd	XTMP1, XTMP0, XTMP0		# XTMP0 = W[-16] + W[-7] + sigma0
	or	y2, y0				# y0 = MAJ
	add	y0, h				# h += MAJ
	ROTATE_ARGS

	/* Round 2; vector side completes the two low new words. */
	mov	e, y0				# y0 = e
	mov	a, y1				# y1 = a
	MY_ROR	(25-11), y0
	xor	e, y0
	MY_ROR	(22-13), y1
	mov	f, y2				# y2 = f
	xor	a, y1
	MY_ROR	(11-6), y0
	vpsrld	$10, XTMP2, XTMP4		# XTMP4 = W[-2] >> 10, per dword
	xor	g, y2				# y2 = f ^ g
	vpsrlq	$19, XTMP2, XTMP3		# low dword of each qword = ror(W[-2], 19)
	xor	e, y0
	and	e, y2				# y2 = (f ^ g) & e
	vpsrlq	$17, XTMP2, XTMP2		# low dword of each qword = ror(W[-2], 17)
	MY_ROR	(13-2), y1
	xor	a, y1
	xor	g, y2				# y2 = CH
	MY_ROR	6, y0				# y0 = S1
	vpxor	XTMP3, XTMP2, XTMP2
	add	y0, y2				# y2 = S1 + CH
	MY_ROR	2, y1				# y1 = S0
	add	(2*4 + _XFER)(%rsp), y2		# y2 += round input 2
	vpxor	XTMP2, XTMP4, XTMP4		# sigma1 in dwords 0 and 2
	mov	a, y0
	add	y2, h				# h += S1 + CH + round input
	mov	a, y2
	vpshufb	SHUF_00BA, XTMP4, XTMP4		# pack to dwords 0 and 1, zero the rest
	or	c, y0
	add	h, d				# d += h
	and	c, y2
	vpaddd	XTMP4, XTMP0, XTMP0		# dwords 0 and 1 of XTMP0 are now final
	and	b, y0
	add	y1, h				# h += S0

	vpshufd	$0b01010000, XTMP0, XTMP2	# XTMP2 = {new1, new1, new0, new0}
	or	y2, y0				# y0 = MAJ
	add	y0, h				# h += MAJ
	ROTATE_ARGS

	/* Round 3; vector side completes the two high new words. */
	mov	e, y0				# y0 = e
	MY_ROR	(25-11), y0
	mov	a, y1				# y1 = a
	MY_ROR	(22-13), y1
	xor	e, y0
	mov	f, y2				# y2 = f
	MY_ROR	(11-6), y0
	vpsrld	$10, XTMP2, XTMP5		# XTMP5 = new word >> 10, per dword
	xor	a, y1
	xor	g, y2				# y2 = f ^ g
	vpsrlq	$19, XTMP2, XTMP3		# low dword of each qword = ror(new, 19)
	xor	e, y0
	and	e, y2				# y2 = (f ^ g) & e
	MY_ROR	(13-2), y1
	vpsrlq	$17, XTMP2, XTMP2		# low dword of each qword = ror(new, 17)
	xor	a, y1
	MY_ROR	6, y0				# y0 = S1
	xor	g, y2				# y2 = CH
	vpxor	XTMP3, XTMP2, XTMP2
	MY_ROR	2, y1				# y1 = S0
	add	y0, y2				# y2 = S1 + CH
	add	(3*4 + _XFER)(%rsp), y2		# y2 += round input 3
	vpxor	XTMP2, XTMP5, XTMP5		# sigma1 in dwords 0 and 2
	mov	a, y0
	add	y2, h				# h += S1 + CH + round input
	mov	a, y2
	vpshufb	SHUF_DC00, XTMP5, XTMP5		# move to dwords 2 and 3, zero the rest
	or	c, y0
	add	h, d				# d += h
	and	c, y2
	vpaddd	XTMP0, XTMP5, X0		# X0 = the four new schedule words
	and	b, y0
	add	y1, h				# h += S0
	or	y2, y0				# y0 = MAJ
	add	y0, h				# h += MAJ
	ROTATE_ARGS
	rotate_Xs
.endm

/*
 * DO_ROUND round
 *
 * One SHA-256 round without any schedule work.  The round input is the dword
 * at _XFER + 4 * round from %rsp; round is 0..3 at every use in this file.
 * Ends with ROTATE_ARGS, so a..h are re-bound for the next round.
 *
 * ror(x, n) below is a 32-bit rotate right.
 * Clobbers y0, y1, y2 and flags; redefines the assembler symbol "offset".
 */
.macro DO_ROUND round
	mov	e, y0			# y0 = e
	MY_ROR	(25-11), y0		# y0 = ror(e, 14)
	mov	a, y1			# y1 = a
	xor	e, y0			# y0 = e ^ ror(e, 14)
	MY_ROR	(22-13), y1		# y1 = ror(a, 9)
	mov	f, y2			# y2 = f
	xor	a, y1			# y1 = a ^ ror(a, 9)
	MY_ROR	(11-6), y0		# y0 = ror(e, 5) ^ ror(e, 19)
	xor	g, y2			# y2 = f ^ g
	xor	e, y0			# y0 = e ^ ror(e, 5) ^ ror(e, 19)
	MY_ROR	(13-2), y1		# y1 = ror(a, 11) ^ ror(a, 20)
	and	e, y2			# y2 = (f ^ g) & e
	xor	a, y1			# y1 = a ^ ror(a, 11) ^ ror(a, 20)
	MY_ROR	6, y0			# y0 = S1 = ror(e,6) ^ ror(e,11) ^ ror(e,25)
	xor	g, y2			# y2 = CH = ((f ^ g) & e) ^ g
	add	y0, y2			# y2 = S1 + CH
	MY_ROR	2, y1			# y1 = S0 = ror(a,2) ^ ror(a,13) ^ ror(a,22)
	offset = \round * 4 + _XFER
	add	offset(%rsp), y2	# y2 += round input
	mov	a, y0			# y0 = a
	add	y2, h			# h += S1 + CH + round input
	mov	a, y2			# y2 = a
	or	c, y0			# y0 = a | c
	add	h, d			# d += h
	and	c, y2			# y2 = a & c
	and	b, y0			# y0 = (a | c) & b
	add	y1, h			# h += S0
	or	y2, y0			# y0 = MAJ = ((a | c) & b) | (a & c)
	add	y0, h			# h += MAJ
	ROTATE_ARGS
.endm

/*
 * sha256_transform_avx - fold whole 64-byte blocks into a SHA-256 state
 *
 * ABI: System V AMD64.  The C prototype is not visible in this file.
 *
 *   CTX      (%rdi)  eight consecutive 32-bit state words, read at byte
 *                    offsets 0..28 and updated in place after every block
 *   INP      (%rsi)  input data, consumed 64 bytes per block through
 *                    unaligned loads
 *   NUM_BLKS (%rdx)  number of 64-byte blocks.  Zero returns at once and
 *                    leaves the state untouched.  The full 64-bit register
 *                    is shifted -- NOTE(review): confirm every caller passes
 *                    a count that is valid in all 64 bits.
 *
 * No return value.  %rbx, %rbp and %r12-%r15 are saved and restored.
 * %rax, %rcx, %rdx, %rsi, %r8-%r11, %xmm0-%xmm13 and flags are clobbered.
 *
 * Stack: STACK_SIZE bytes of locals below a 16-byte-aligned %rsp; %rbp keeps
 * the incoming %rsp so the epilogue can undo the alignment.
 *
 * Each block takes 64 rounds: three passes of sixteen rounds that also
 * extend the message schedule, then two passes of eight rounds that only
 * consume the remaining schedule words.
 *
 * All labels carry the .L prefix, so they are assembler-local and do not
 * appear in the symbol table.
 */
.text
SYM_FUNC_START(sha256_transform_avx)
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	pushq	%rbp
	movq	%rsp, %rbp			# keep the unaligned %rsp for the epilogue

	subq	$STACK_SIZE, %rsp		# room for the locals
	and	$~15, %rsp			# vmovdqa to _XFER needs 16-byte alignment

	shl	$6, NUM_BLKS			# block count -> byte count
	jz	.Ldone_hash			# nothing to hash
	add	INP, NUM_BLKS			# one past the last input byte
	mov	NUM_BLKS, _INP_END(%rsp)	# %rdx is reused as e from here on

	/* Load the eight state words into a..h. */
	mov	4*0(CTX), a
	mov	4*1(CTX), b
	mov	4*2(CTX), c
	mov	4*3(CTX), d
	mov	4*4(CTX), e
	mov	4*5(CTX), f
	mov	4*6(CTX), g
	mov	4*7(CTX), h

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00

.Lloop0:					# once per 64-byte block
	lea	K256(%rip), TBL

	/* Load the block and byte-reverse each of its sixteen dwords. */
	COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP X1, 1*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP X2, 2*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP X3, 3*16(INP), BYTE_FLIP_MASK

	mov	INP, _INP(%rsp)			# SRND is about to overwrite %rsi

	/* Rounds 0..47: sixteen rounds plus schedule extension per pass. */
	mov	$3, SRND
.align 16
.Lloop1:
	vpaddd	(TBL), X0, XFER			# round inputs = schedule words + constants
	vmovdqa	XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddd	1*16(TBL), X0, XFER		# X0 was re-bound by the macro above
	vmovdqa	XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddd	2*16(TBL), X0, XFER
	vmovdqa	XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddd	3*16(TBL), X0, XFER
	vmovdqa	XFER, _XFER(%rsp)
	add	$4*16, TBL			# next sixteen constants
	FOUR_ROUNDS_AND_SCHED

	sub	$1, SRND
	jne	.Lloop1

	/* Rounds 48..63: eight rounds per pass, no schedule extension. */
	mov	$2, SRND
.Lloop2:
	vpaddd	(TBL), X0, XFER
	vmovdqa	XFER, _XFER(%rsp)
	DO_ROUND 0
	DO_ROUND 1
	DO_ROUND 2
	DO_ROUND 3

	vpaddd	1*16(TBL), X1, XFER
	vmovdqa	XFER, _XFER(%rsp)
	add	$2*16, TBL			# next eight constants
	DO_ROUND 0
	DO_ROUND 1
	DO_ROUND 2
	DO_ROUND 3

	vmovdqa	X2, X0				# second pass consumes the words in X2, X3
	vmovdqa	X3, X1

	sub	$1, SRND
	jne	.Lloop2

	/* Add the working variables into the state and keep the sums in a..h. */
	addm	(4*0)(CTX),a
	addm	(4*1)(CTX),b
	addm	(4*2)(CTX),c
	addm	(4*3)(CTX),d
	addm	(4*4)(CTX),e
	addm	(4*5)(CTX),f
	addm	(4*6)(CTX),g
	addm	(4*7)(CTX),h

	mov	_INP(%rsp), INP
	add	$64, INP			# advance to the next block
	cmp	_INP_END(%rsp), INP
	jne	.Lloop0

.Ldone_hash:

	mov	%rbp, %rsp			# drop locals and undo the alignment
	popq	%rbp
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	ret
SYM_FUNC_END(sha256_transform_avx)

/*
 * K256: the 64 SHA-256 round constants, one dword per round, in round order.
 * Placed in a mergeable read-only section whose entry size equals the table
 * size (256 bytes) and aligned to 64 bytes.
 */
.section .rodata.cst256.K256, "aM", @progbits, 256
.align 64
K256:
	.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

/*
 * vpshufb control that reverses the four bytes inside each 32-bit lane
 * (result byte 0 <- source byte 3, byte 1 <- byte 2, and so on per lane).
 */
.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
.align 16
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x0c0d0e0f08090a0b0405060700010203

/*
 * vpshufb control: source dwords 0 and 2 are packed into result dwords 0 and
 * 1; result dwords 2 and 3 are zeroed (control bytes of 0xFF clear the byte).
 */
.section .rodata.cst16._SHUF_00BA, "aM", @progbits, 16
.align 16

_SHUF_00BA:
	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100

/*
 * vpshufb control: source dwords 0 and 2 are moved to result dwords 2 and 3;
 * result dwords 0 and 1 are zeroed (control bytes of 0xFF clear the byte).
 */
.section .rodata.cst16._SHUF_DC00, "aM", @progbits, 16
.align 16

_SHUF_DC00:
	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF

#endif /* CONFIG_AS_AVX */
503