1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51#ifdef CONFIG_AS_AVX2
52#include <linux/linkage.h>
53
54
55#define VMOVDQ vmovdqu
56
57
58
59
60
/*
 * addm: add the register \p2 into the memory word \p1, keeping the sum
 * in both places (used to fold the working variables back into the
 * digest state in memory).
 *   \p1 = memory operand (32-bit digest word)
 *   \p2 = 32-bit register
 */
.macro addm p1 p2
	add \p1, \p2
	mov \p2, \p1
.endm
65
66
67
/* Message schedule: four 256-bit lanes, each holding 4 words of W[] for
 * two interleaved input blocks (low/high 128-bit lanes). */
X0 = %ymm4
X1 = %ymm5
X2 = %ymm6
X3 = %ymm7

/* 128-bit views of the schedule registers, used when only a single
 * (final) block is processed. */
XWORD0 = %xmm4
XWORD1 = %xmm5
XWORD2 = %xmm6
XWORD3 = %xmm7

XTMP0 = %ymm0	# scratch
XTMP1 = %ymm1	# scratch
XTMP2 = %ymm2	# scratch
XTMP3 = %ymm3	# scratch
XTMP4 = %ymm8	# scratch
XFER = %ymm9	# W[t] + K[t], spilled to the stack for the scalar rounds
XTMP5 = %ymm11	# scratch

SHUF_00BA = %ymm10	# shuffle mask: xBxA -> 00BA (see _SHUF_00BA data)
SHUF_DC00 = %ymm12	# shuffle mask: xDxC -> DC00 (see _SHUF_DC00 data)
BYTE_FLIP_MASK = %ymm13	# byte-swap mask for big-endian word loads

X_BYTE_FLIP_MASK = %xmm13	# 128-bit view of BYTE_FLIP_MASK

/* Function arguments (SysV AMD64). */
NUM_BLKS = %rdx	# arg3: number of 64-byte input blocks
INP = %rsi	# arg2: input data pointer
CTX = %rdi	# arg1: pointer to eight 32-bit digest words

/* Round variables.  NOTE the deliberate aliasing:
 *   e  = %edx aliases NUM_BLKS — NUM_BLKS is spilled to _INP_END before
 *        e is loaded;
 *   y3 = %esi aliases INP — INP is spilled to _INP before the rounds run. */
c = %ecx
d = %r8d
e = %edx
y3 = %esi

/* SRND aliases CTX (%rdi); CTX is saved to _CTX(%rsp) before the round
 * loops, and SRND then serves as the byte offset into K256/the XFER area. */
SRND = CTX

a = %eax
b = %ebx
f = %r9d
g = %r10d
h = %r11d
old_h = %r11d	# initial name of the register holding h (rotated by ROTATE_ARGS)

T1 = %r12d
y0 = %r13d
y1 = %r14d
y2 = %r15d

/* Stack frame layout (relative to the aligned %rsp). */
_XFER_SIZE = 2*64*4	# 64 rounds x 4 bytes x 2 interleaved blocks of W+K
_XMM_SAVE_SIZE = 0
_INP_END_SIZE = 8	# address of the last input block
_INP_SIZE = 8	# current input pointer
_CTX_SIZE = 8	# saved CTX (since SRND reuses %rdi)
_RSP_SIZE = 8	# caller's %rsp (frame is realigned with and $-32)

_XFER = 0
_XMM_SAVE = _XFER + _XFER_SIZE
_INP_END = _XMM_SAVE + _XMM_SAVE_SIZE
_INP = _INP_END + _INP_END_SIZE
_CTX = _INP + _INP_SIZE
_RSP = _CTX + _CTX_SIZE
STACK_SIZE = _RSP + _RSP_SIZE
130
131
132
/*
 * rotate_Xs: rotate the schedule register names
 * (X0,X1,X2,X3) <- (X1,X2,X3,X0).  Assembler-time renaming only;
 * emits no instructions.
 */
.macro rotate_Xs
	X_ = X0
	X0 = X1
	X1 = X2
	X2 = X3
	X3 = X_
.endm
140
141
142
/*
 * ROTATE_ARGS: rotate the round-variable register names after one round:
 * (a,b,c,d,e,f,g,h) <- (h,a,b,c,d,e,f,g).  Assembler-time renaming only;
 * emits no instructions.  old_h keeps the name of the register that held
 * h before the rotation, so DO_4ROUNDS can finish the previous round's
 * h update one round late.
 */
.macro ROTATE_ARGS
	old_h = h
	TMP_ = h
	h = g
	g = f
	f = e
	e = d
	d = c
	c = b
	b = a
	a = TMP_
.endm
155
/*
 * FOUR_ROUNDS_AND_SCHED: perform 4 SHA-256 rounds while computing the
 * message schedule for 4 future rounds (two blocks interleaved per
 * 256-bit lane).  Scalar round instructions and AVX2 schedule
 * instructions are hand-interleaved to hide latency — do not reorder.
 *   \disp = displacement of the pre-added W+K values in the stack XFER
 *           area (indexed by SRND).
 * Rotates the round-variable names (ROTATE_ARGS) after every round and
 * the schedule registers (rotate_Xs) at the end.
 */
.macro FOUR_ROUNDS_AND_SCHED disp
################################ RND N + 0 ################################

	mov a, y3		# y3 = a
	rorx $25, e, y0		# y0 = e >> 25
	rorx $11, e, y1		# y1 = e >> 11

	addl \disp(%rsp, SRND), h	# h += W[t] + K[t]
	or c, y3		# y3 = a | c
	vpalignr $4, X2, X3, XTMP0	# XTMP0 = W[-7]
	mov f, y2		# y2 = f
	rorx $13, a, T1		# T1 = a >> 13

	xor y1, y0		# y0 = (e>>25) ^ (e>>11)
	xor g, y2		# y2 = f ^ g
	vpaddd X0, XTMP0, XTMP0	# XTMP0 = W[-7] + W[-16]
	rorx $6, e, y1		# y1 = e >> 6

	and e, y2		# y2 = CH = (f^g) & e ... (^g below)
	xor y1, y0		# y0 = S1 = (e>>25)^(e>>11)^(e>>6)
	rorx $22, a, y1		# y1 = a >> 22
	add h, d		# d += h (= d + W + K + h)

	and b, y3		# y3 = (a|c) & b
	vpalignr $4, X0, X1, XTMP1	# XTMP1 = W[-15]
	xor T1, y1		# y1 = (a>>22) ^ (a>>13)
	rorx $2, a, T1		# T1 = a >> 2

	xor g, y2		# y2 = CH = ((f^g)&e) ^ g
	vpsrld $7, XTMP1, XTMP2
	xor T1, y1		# y1 = S0 = (a>>22)^(a>>13)^(a>>2)
	mov a, T1		# T1 = a
	and c, T1		# T1 = a & c

	add y0, y2		# y2 = S1 + CH
	vpslld $(32-7), XTMP1, XTMP3
	or T1, y3		# y3 = MAJ = ((a|c)&b) | (a&c)
	add y1, h		# h += S0

	add y2, d		# d += S1 + CH  (d = d + t1)
	vpor XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7

	vpsrld $18, XTMP1, XTMP2
	add y2, h		# h += S1 + CH  (h = t1 + S0)
	add y3, h		# h += MAJ      (h = t1 + S0 + MAJ)


	ROTATE_ARGS

################################ RND N + 1 ################################

	mov a, y3		# y3 = a
	rorx $25, e, y0		# y0 = e >> 25
	rorx $11, e, y1		# y1 = e >> 11
	offset = \disp + 1*4
	addl offset(%rsp, SRND), h	# h += W[t] + K[t]
	or c, y3		# y3 = a | c


	vpsrld $3, XTMP1, XTMP4	# XTMP4 = W[-15] >> 3
	mov f, y2		# y2 = f
	rorx $13, a, T1		# T1 = a >> 13
	xor y1, y0		# y0 = (e>>25) ^ (e>>11)
	xor g, y2		# y2 = f ^ g


	rorx $6, e, y1		# y1 = e >> 6
	xor y1, y0		# y0 = S1
	rorx $22, a, y1		# y1 = a >> 22
	and e, y2		# y2 = (f^g) & e
	add h, d		# d += h

	vpslld $(32-18), XTMP1, XTMP1
	and b, y3		# y3 = (a|c) & b
	xor T1, y1		# y1 = (a>>22) ^ (a>>13)

	vpxor XTMP1, XTMP3, XTMP3
	rorx $2, a, T1		# T1 = a >> 2
	xor g, y2		# y2 = CH

	vpxor XTMP2, XTMP3, XTMP3	# XTMP3 = (W[-15] ror 7) ^ (W[-15] ror 18)
	xor T1, y1		# y1 = S0
	mov a, T1		# T1 = a
	and c, T1		# T1 = a & c
	add y0, y2		# y2 = S1 + CH

	vpxor XTMP4, XTMP3, XTMP1	# XTMP1 = s0(W[-15])
	vpshufd $0b11111010, X3, XTMP2	# XTMP2 = W[-2] {BBAA}
	or T1, y3		# y3 = MAJ
	add y1, h		# h += S0

	vpaddd XTMP1, XTMP0, XTMP0	# XTMP0 = W[-16] + W[-7] + s0
	add y2, d		# d += S1 + CH
	add y2, h		# h += S1 + CH
	add y3, h		# h += MAJ

	vpsrld $10, XTMP2, XTMP4	# XTMP4 = W[-2] >> 10 {BBAA}


	ROTATE_ARGS

################################ RND N + 2 ################################

	mov a, y3		# y3 = a
	rorx $25, e, y0		# y0 = e >> 25
	offset = \disp + 2*4
	addl offset(%rsp, SRND), h	# h += W[t] + K[t]

	vpsrlq $19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xBxA}
	rorx $11, e, y1		# y1 = e >> 11
	or c, y3		# y3 = a | c
	mov f, y2		# y2 = f
	xor g, y2		# y2 = f ^ g

	rorx $13, a, T1		# T1 = a >> 13
	xor y1, y0		# y0 = (e>>25) ^ (e>>11)
	vpsrlq $17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xBxA}
	and e, y2		# y2 = (f^g) & e

	rorx $6, e, y1		# y1 = e >> 6
	vpxor XTMP3, XTMP2, XTMP2
	add h, d		# d += h
	and b, y3		# y3 = (a|c) & b

	xor y1, y0		# y0 = S1
	rorx $22, a, y1		# y1 = a >> 22
	vpxor XTMP2, XTMP4, XTMP4	# XTMP4 = s1(W[-2]) {xBxA}
	xor g, y2		# y2 = CH

	vpshufb SHUF_00BA, XTMP4, XTMP4	# XTMP4 = s1 {00BA}
	xor T1, y1		# y1 = (a>>22) ^ (a>>13)
	rorx $2, a ,T1		# T1 = a >> 2
	vpaddd XTMP4, XTMP0, XTMP0	# XTMP0 = {..., ..., W[1], W[0]}

	xor T1, y1		# y1 = S0
	mov a, T1		# T1 = a
	and c, T1		# T1 = a & c
	add y0, y2		# y2 = S1 + CH
	vpshufd $0b01010000, XTMP0, XTMP2	# XTMP2 = W[-2] {DDCC}

	or T1, y3		# y3 = MAJ
	add y1,h		# h += S0
	add y2,d		# d += S1 + CH
	add y2,h		# h += S1 + CH

	add y3,h		# h += MAJ


	ROTATE_ARGS

################################ RND N + 3 ################################

	mov a, y3		# y3 = a
	rorx $25, e, y0		# y0 = e >> 25
	rorx $11, e, y1		# y1 = e >> 11
	offset = \disp + 3*4
	addl offset(%rsp, SRND), h	# h += W[t] + K[t]
	or c, y3		# y3 = a | c


	vpsrld $10, XTMP2, XTMP5	# XTMP5 = W[-2] >> 10 {DDCC}
	mov f, y2		# y2 = f
	rorx $13, a, T1		# T1 = a >> 13
	xor y1, y0		# y0 = (e>>25) ^ (e>>11)
	xor g, y2		# y2 = f ^ g


	vpsrlq $19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xDxC}
	rorx $6, e, y1		# y1 = e >> 6
	and e, y2		# y2 = (f^g) & e
	add h, d		# d += h
	and b, y3		# y3 = (a|c) & b

	vpsrlq $17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xDxC}
	xor y1, y0		# y0 = S1
	xor g, y2		# y2 = CH

	vpxor XTMP3, XTMP2, XTMP2
	rorx $22, a, y1		# y1 = a >> 22
	add y0, y2		# y2 = S1 + CH

	vpxor XTMP2, XTMP5, XTMP5	# XTMP5 = s1(W[-2]) {xDxC}
	xor T1, y1		# y1 = (a>>22) ^ (a>>13)
	add y2, d		# d += S1 + CH

	rorx $2, a, T1		# T1 = a >> 2
	vpshufb SHUF_DC00, XTMP5, XTMP5	# XTMP5 = s1 {DC00}

	vpaddd XTMP0, XTMP5, X0	# X0 = {W[3], W[2], W[1], W[0]}
	xor T1, y1		# y1 = S0
	mov a, T1		# T1 = a
	and c, T1		# T1 = a & c
	or T1, y3		# y3 = MAJ

	add y1, h		# h += S0
	add y2, h		# h += S1 + CH
	add y3, h		# h += MAJ

	ROTATE_ARGS
	rotate_Xs
.endm
357
/*
 * DO_4ROUNDS: perform 4 SHA-256 rounds with no message-schedule work
 * (used for the last 16 rounds, and for the second interleaved block's
 * rounds in loop3).
 *   \disp = displacement of the pre-added W+K values in the stack XFER
 *           area (indexed by SRND).
 * Each round's final h additions (S1+CH and MAJ) are deferred to the
 * start of the NEXT round via old_h, to shorten the critical path; the
 * fourth round completes its own h at the end of the macro.
 */
.macro DO_4ROUNDS disp
################################ RND N + 0 ################################

	mov f, y2		# y2 = f
	rorx $25, e, y0		# y0 = e >> 25
	rorx $11, e, y1		# y1 = e >> 11
	xor g, y2		# y2 = f ^ g

	xor y1, y0		# y0 = (e>>25) ^ (e>>11)
	rorx $6, e, y1		# y1 = e >> 6
	and e, y2		# y2 = (f^g) & e

	xor y1, y0		# y0 = S1
	rorx $13, a, T1		# T1 = a >> 13
	xor g, y2		# y2 = CH
	rorx $22, a, y1		# y1 = a >> 22
	mov a, y3		# y3 = a

	xor T1, y1		# y1 = (a>>22) ^ (a>>13)
	rorx $2, a, T1		# T1 = a >> 2
	addl \disp(%rsp, SRND), h	# h += W[t] + K[t]
	or c, y3		# y3 = a | c

	xor T1, y1		# y1 = S0
	mov a, T1		# T1 = a
	and b, y3		# y3 = (a|c) & b
	and c, T1		# T1 = a & c
	add y0, y2		# y2 = S1 + CH


	add h, d		# d += h
	or T1, y3		# y3 = MAJ
	add y1, h		# h += S0
	add y2, d		# d += S1 + CH

	ROTATE_ARGS

################################ RND N + 1 ################################

	add y2, old_h		# finish previous round: h += S1 + CH
	mov f, y2		# y2 = f
	rorx $25, e, y0		# y0 = e >> 25
	rorx $11, e, y1		# y1 = e >> 11
	xor g, y2		# y2 = f ^ g

	xor y1, y0		# y0 = (e>>25) ^ (e>>11)
	rorx $6, e, y1		# y1 = e >> 6
	and e, y2		# y2 = (f^g) & e
	add y3, old_h		# finish previous round: h += MAJ

	xor y1, y0		# y0 = S1
	rorx $13, a, T1		# T1 = a >> 13
	xor g, y2		# y2 = CH
	rorx $22, a, y1		# y1 = a >> 22
	mov a, y3		# y3 = a

	xor T1, y1		# y1 = (a>>22) ^ (a>>13)
	rorx $2, a, T1		# T1 = a >> 2
	offset = 4*1 + \disp
	addl offset(%rsp, SRND), h	# h += W[t] + K[t]
	or c, y3		# y3 = a | c

	xor T1, y1		# y1 = S0
	mov a, T1		# T1 = a
	and b, y3		# y3 = (a|c) & b
	and c, T1		# T1 = a & c
	add y0, y2		# y2 = S1 + CH


	add h, d		# d += h
	or T1, y3		# y3 = MAJ
	add y1, h		# h += S0

	add y2, d		# d += S1 + CH

	ROTATE_ARGS

################################ RND N + 2 ################################

	add y2, old_h		# finish previous round: h += S1 + CH
	mov f, y2		# y2 = f
	rorx $25, e, y0		# y0 = e >> 25
	rorx $11, e, y1		# y1 = e >> 11
	xor g, y2		# y2 = f ^ g

	xor y1, y0		# y0 = (e>>25) ^ (e>>11)
	rorx $6, e, y1		# y1 = e >> 6
	and e, y2		# y2 = (f^g) & e
	add y3, old_h		# finish previous round: h += MAJ

	xor y1, y0		# y0 = S1
	rorx $13, a, T1		# T1 = a >> 13
	xor g, y2		# y2 = CH
	rorx $22, a, y1		# y1 = a >> 22
	mov a, y3		# y3 = a

	xor T1, y1		# y1 = (a>>22) ^ (a>>13)
	rorx $2, a, T1		# T1 = a >> 2
	offset = 4*2 + \disp
	addl offset(%rsp, SRND), h	# h += W[t] + K[t]
	or c, y3		# y3 = a | c

	xor T1, y1		# y1 = S0
	mov a, T1		# T1 = a
	and b, y3		# y3 = (a|c) & b
	and c, T1		# T1 = a & c
	add y0, y2		# y2 = S1 + CH


	add h, d		# d += h
	or T1, y3		# y3 = MAJ
	add y1, h		# h += S0

	add y2, d		# d += S1 + CH

	ROTATE_ARGS

################################ RND N + 3 ################################

	add y2, old_h		# finish previous round: h += S1 + CH
	mov f, y2		# y2 = f
	rorx $25, e, y0		# y0 = e >> 25
	rorx $11, e, y1		# y1 = e >> 11
	xor g, y2		# y2 = f ^ g

	xor y1, y0		# y0 = (e>>25) ^ (e>>11)
	rorx $6, e, y1		# y1 = e >> 6
	and e, y2		# y2 = (f^g) & e
	add y3, old_h		# finish previous round: h += MAJ

	xor y1, y0		# y0 = S1
	rorx $13, a, T1		# T1 = a >> 13
	xor g, y2		# y2 = CH
	rorx $22, a, y1		# y1 = a >> 22
	mov a, y3		# y3 = a

	xor T1, y1		# y1 = (a>>22) ^ (a>>13)
	rorx $2, a, T1		# T1 = a >> 2
	offset = 4*3 + \disp
	addl offset(%rsp, SRND), h	# h += W[t] + K[t]
	or c, y3		# y3 = a | c

	xor T1, y1		# y1 = S0
	mov a, T1		# T1 = a
	and b, y3		# y3 = (a|c) & b
	and c, T1		# T1 = a & c
	add y0, y2		# y2 = S1 + CH


	add h, d		# d += h
	or T1, y3		# y3 = MAJ
	add y1, h		# h += S0

	add y2, d		# d += S1 + CH

	/* Last round of the group: complete h here instead of deferring. */
	add y2, h		# h += S1 + CH

	add y3, h		# h += MAJ

	ROTATE_ARGS

.endm
521
522
523
524
525
526
527
528.text
########################################################################
## void sha256_transform_rorx(u32 *digest, const void *data, u64 num_blks)
## ABI:  SysV AMD64
## In:   CTX      (%rdi) = pointer to eight 32-bit digest words
##       INP      (%rsi) = input data, num_blks * 64 bytes
##       NUM_BLKS (%rdx) = number of 64-byte blocks to process
## Out:  digest updated in place
##
## Two blocks are processed at a time by interleaving their message
## schedules in the low/high 128-bit lanes of the ymm registers; an odd
## trailing block goes through the xmm-only path at do_last_block.
##
## Fix vs. previous revision: issue VZEROUPPER before returning.  The
## function dirties ymm0-ymm13; returning with dirty upper YMM state
## penalizes subsequent legacy-SSE code in the caller (AVX/SSE
## transition stalls) and violates the expectation that AVX upper state
## is clean on return.
########################################################################
ENTRY(sha256_transform_rorx)
.align 32
	/* Save callee-saved GPRs used as round variables (b, T1, y0-y2). */
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	/* Build a 32-byte-aligned frame; stash the caller's rsp in it. */
	mov	%rsp, %rax
	subq	$STACK_SIZE, %rsp
	and	$-32, %rsp
	mov	%rax, _RSP(%rsp)

	shl	$6, NUM_BLKS		# NUM_BLKS *= 64 (bytes)
	jz	done_hash		# zero blocks: nothing to do
	lea	-64(INP, NUM_BLKS), NUM_BLKS	# -> last block
	mov	NUM_BLKS, _INP_END(%rsp)

	cmp	NUM_BLKS, INP
	je	only_one_block

	## load initial digest
	mov	(CTX), a
	mov	4*1(CTX), b
	mov	4*2(CTX), c
	mov	4*3(CTX), d
	mov	4*4(CTX), e
	mov	4*5(CTX), f
	mov	4*6(CTX), g
	mov	4*7(CTX), h

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00

	mov	CTX, _CTX(%rsp)		# CTX's register becomes SRND below

loop0:
	## Load two 64-byte blocks and byte-swap to big-endian words.
	VMOVDQ	0*32(INP),XTMP0
	VMOVDQ	1*32(INP),XTMP1
	VMOVDQ	2*32(INP),XTMP2
	VMOVDQ	3*32(INP),XTMP3

	vpshufb	BYTE_FLIP_MASK, XTMP0, XTMP0
	vpshufb	BYTE_FLIP_MASK, XTMP1, XTMP1
	vpshufb	BYTE_FLIP_MASK, XTMP2, XTMP2
	vpshufb	BYTE_FLIP_MASK, XTMP3, XTMP3

	## Interleave: low 128-bit lane = block 0, high lane = block 1.
	vperm2i128	$0x20, XTMP2, XTMP0, X0
	vperm2i128	$0x31, XTMP2, XTMP0, X1
	vperm2i128	$0x20, XTMP3, XTMP1, X2
	vperm2i128	$0x31, XTMP3, XTMP1, X3

last_block_enter:
	add	$64, INP
	mov	INP, _INP(%rsp)

	## SRND is the byte offset into both K256 and the stack XFER area.
	xor	SRND, SRND

.align 16
loop1:
	## Rounds 0..47 of block 0 (+ schedule), with W+K for both lanes
	## spilled to the XFER area for loop3's use.
	vpaddd	K256+0*32(SRND), X0, XFER
	vmovdqa	XFER, 0*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 0*32

	vpaddd	K256+1*32(SRND), X0, XFER
	vmovdqa	XFER, 1*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 1*32

	vpaddd	K256+2*32(SRND), X0, XFER
	vmovdqa	XFER, 2*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 2*32

	vpaddd	K256+3*32(SRND), X0, XFER
	vmovdqa	XFER, 3*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 3*32

	add	$4*32, SRND
	cmp	$3*4*32, SRND
	jb	loop1

loop2:
	## Rounds 48..63 of block 0: no more schedule computation.
	vpaddd	K256+0*32(SRND), X0, XFER
	vmovdqa	XFER, 0*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	_XFER + 0*32

	vpaddd	K256+1*32(SRND), X1, XFER
	vmovdqa	XFER, 1*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	_XFER + 1*32
	add	$2*32, SRND

	vmovdqa	X2, X0
	vmovdqa	X3, X1

	cmp	$4*4*32, SRND
	jb	loop2

	mov	_CTX(%rsp), CTX
	mov	_INP(%rsp), INP

	addm	(4*0)(CTX),a
	addm	(4*1)(CTX),b
	addm	(4*2)(CTX),c
	addm	(4*3)(CTX),d
	addm	(4*4)(CTX),e
	addm	(4*5)(CTX),f
	addm	(4*6)(CTX),g
	addm	(4*7)(CTX),h

	cmp	_INP_END(%rsp), INP
	ja	done_hash

	## Second interleaved block: its W+K values sit in the high lanes
	## of the XFER area (offset +16).
	xor	SRND, SRND
.align 16
loop3:
	DO_4ROUNDS	_XFER + 0*32 + 16
	DO_4ROUNDS	_XFER + 1*32 + 16
	add	$2*32, SRND
	cmp	$4*4*32, SRND
	jb	loop3

	mov	_CTX(%rsp), CTX
	mov	_INP(%rsp), INP
	add	$64, INP

	addm	(4*0)(CTX),a
	addm	(4*1)(CTX),b
	addm	(4*2)(CTX),c
	addm	(4*3)(CTX),d
	addm	(4*4)(CTX),e
	addm	(4*5)(CTX),f
	addm	(4*6)(CTX),g
	addm	(4*7)(CTX),h

	cmp	_INP_END(%rsp), INP
	jb	loop0			# >= 2 blocks remain
	ja	done_hash		# none remain
	## exactly one block remains: fall through

do_last_block:
	## Single final block: xmm loads, high lanes unused.
	VMOVDQ	0*16(INP),XWORD0
	VMOVDQ	1*16(INP),XWORD1
	VMOVDQ	2*16(INP),XWORD2
	VMOVDQ	3*16(INP),XWORD3

	vpshufb	X_BYTE_FLIP_MASK, XWORD0, XWORD0
	vpshufb	X_BYTE_FLIP_MASK, XWORD1, XWORD1
	vpshufb	X_BYTE_FLIP_MASK, XWORD2, XWORD2
	vpshufb	X_BYTE_FLIP_MASK, XWORD3, XWORD3

	jmp	last_block_enter

only_one_block:

	## load initial digest
	mov	(4*0)(CTX),a
	mov	(4*1)(CTX),b
	mov	(4*2)(CTX),c
	mov	(4*3)(CTX),d
	mov	(4*4)(CTX),e
	mov	(4*5)(CTX),f
	mov	(4*6)(CTX),g
	mov	(4*7)(CTX),h

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00

	mov	CTX, _CTX(%rsp)
	jmp	do_last_block

done_hash:

	mov	_RSP(%rsp), %rsp

	## Clear dirty upper YMM state before returning to SSE/integer
	## code (avoids AVX->SSE transition penalties in the caller).
	vzeroupper

	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	ret
ENDPROC(sha256_transform_rorx)
717
/*
 * SHA-256 round constants K[0..63].  Each group of four constants is
 * duplicated so that a single 256-bit load supplies the same K values
 * to both 128-bit lanes (the two interleaved blocks).
 */
.section .rodata.cst512.K256, "aM", @progbits, 512
.align 64
K256:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

/* Byte-swap mask for vpshufb: converts little-endian loads to the
 * big-endian word order SHA-256 requires (same pattern in both lanes). */
.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
.align 32
PSHUFFLE_BYTE_FLIP_MASK:
	.octa	0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203

/* vpshufb mask: keep the low two dwords (xBxA -> 00BA), zero the rest. */
.section .rodata.cst32._SHUF_00BA, "aM", @progbits, 32
.align 32
_SHUF_00BA:
	.octa	0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100

/* vpshufb mask: move the low two dwords up (xDxC -> DC00), zero the rest. */
.section .rodata.cst32._SHUF_DC00, "aM", @progbits, 32
.align 32
_SHUF_DC00:
	.octa	0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF
770
771#endif
772