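########################################################################
# SHA-256 block transform for x86_64 using AVX2 and the BMI2 rorx
# instruction (sha256_transform_rorx).
#
# Two 64-byte blocks are processed per pass: the message schedules of
# both blocks are interleaved in the ymm registers (low 128-bit lane =
# first block, high lane = second block), and the pre-computed K+W
# values for the second block are replayed later from the high half of
# each 32-byte _XFER slot.
########################################################################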
#ifdef CONFIG_AS_AVX2
#include <linux/linkage.h>


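# Input buffers are not assumed to be 32-byte aligned, so use the
# unaligned vmovdqu for data loads and stores.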
#define VMOVDQ vmovdqu




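# addm mem, reg: reg += mem, then store the sum back to mem.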
.macro addm p1 p2
 add \p1, \p2
 mov \p2, \p1
.endm


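# Message schedule: X0..X3 hold the sixteen most recent message words
# for each of the two interleaved blocks, four words per 128-bit lane
# (low lane = first block, high lane = second block).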
X0 = %ymm4
X1 = %ymm5
X2 = %ymm6
X3 = %ymm7


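# XMM views of X0..X3, used when loading a single (final) block.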
XWORD0 = %xmm4
XWORD1 = %xmm5
XWORD2 = %xmm6
XWORD3 = %xmm7

XTMP0 = %ymm0
XTMP1 = %ymm1
XTMP2 = %ymm2
XTMP3 = %ymm3
XTMP4 = %ymm8
XFER = %ymm9
XTMP5 = %ymm11

SHUF_00BA = %ymm10
SHUF_DC00 = %ymm12
BYTE_FLIP_MASK = %ymm13

X_BYTE_FLIP_MASK = %xmm13

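# Function arguments (SysV ABI) and scratch registers. Note that e
# shares %rdx with NUM_BLKS and SRND shares %rdi with CTX, so both
# argument values are stashed on the stack before the rounds begin.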
NUM_BLKS = %rdx
INP = %rsi
CTX = %rdi
c = %ecx
d = %r8d
e = %edx
y3 = %esi


TBL = %rbp
SRND = CTX

a = %eax
b = %ebx
f = %r9d
g = %r10d
h = %r11d
old_h = %r11d

T1 = %r12d
y0 = %r13d
y1 = %r14d
y2 = %r15d


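# Stack frame layout (sizes and offsets in bytes). _XFER holds the
# pre-added K[t]+W[t] values: 2 blocks x 64 rounds x 4 bytes.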
_XFER_SIZE = 2*64*4
_XMM_SAVE_SIZE = 0
_INP_END_SIZE = 8
_INP_SIZE = 8
_CTX_SIZE = 8
_RSP_SIZE = 8

_XFER = 0
_XMM_SAVE = _XFER + _XFER_SIZE
_INP_END = _XMM_SAVE + _XMM_SAVE_SIZE
_INP = _INP_END + _INP_END_SIZE
_CTX = _INP + _INP_SIZE
_RSP = _CTX + _CTX_SIZE
STACK_SIZE = _RSP + _RSP_SIZE


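# rotate_Xs: rename the schedule registers after each scheduled
# 4-round group so the freshly computed words (written into X0) become
# X3 and X0..X3 again run oldest to newest.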
.macro rotate_Xs
 X_ = X0
 X0 = X1
 X1 = X2
 X2 = X3
 X3 = X_
.endm


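# ROTATE_ARGS: rename the working variables a..h after each round
# (h takes the old g, g the old f, ..., a the old h). old_h keeps the
# previous h so DO_4ROUNDS can finish its h update one round later.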
.macro ROTATE_ARGS
 old_h = h
 TMP_ = h
 h = g
 g = f
 f = e
 e = d
 d = c
 c = b
 b = a
 a = TMP_
.endm

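# FOUR_ROUNDS_AND_SCHED: perform four SHA-256 rounds while computing
# the next four message-schedule words for both interleaved blocks.
# \disp is the byte offset of this group's pre-added K[t]+W[t] values
# in the _XFER area, indexed by SRND.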
.macro FOUR_ROUNDS_AND_SCHED disp


 mov a, y3
 rorx $25, e, y0
 rorx $11, e, y1

 addl \disp(%rsp, SRND), h
 or c, y3
 vpalignr $4, X2, X3, XTMP0
 mov f, y2
 rorx $13, a, T1

 xor y1, y0
 xor g, y2
 vpaddd X0, XTMP0, XTMP0
 rorx $6, e, y1

 and e, y2
 xor y1, y0
 rorx $22, a, y1
 add h, d

 and b, y3
 vpalignr $4, X0, X1, XTMP1
 xor T1, y1
 rorx $2, a, T1

 xor g, y2
 vpsrld $7, XTMP1, XTMP2
 xor T1, y1
 mov a, T1
 and c, T1

 add y0, y2
 vpslld $(32-7), XTMP1, XTMP3
 or T1, y3
 add y1, h

 add y2, d
 vpor XTMP2, XTMP3, XTMP3

 vpsrld $18, XTMP1, XTMP2
 add y2, h
 add y3, h


 ROTATE_ARGS



 mov a, y3
 rorx $25, e, y0
 rorx $11, e, y1
 offset = \disp + 1*4
 addl offset(%rsp, SRND), h
 or c, y3


 vpsrld $3, XTMP1, XTMP4
 mov f, y2
 rorx $13, a, T1
 xor y1, y0
 xor g, y2


 rorx $6, e, y1
 xor y1, y0
 rorx $22, a, y1
 and e, y2
 add h, d

 vpslld $(32-18), XTMP1, XTMP1
 and b, y3
 xor T1, y1

 vpxor XTMP1, XTMP3, XTMP3
 rorx $2, a, T1
 xor g, y2

 vpxor XTMP2, XTMP3, XTMP3
 xor T1, y1
 mov a, T1
 and c, T1
 add y0, y2

 vpxor XTMP4, XTMP3, XTMP1
 vpshufd $0b11111010, X3, XTMP2
 or T1, y3
 add y1, h

 vpaddd XTMP1, XTMP0, XTMP0
 add y2, d
 add y2, h
 add y3, h

 vpsrld $10, XTMP2, XTMP4


 ROTATE_ARGS



 mov a, y3
 rorx $25, e, y0
 offset = \disp + 2*4
 addl offset(%rsp, SRND), h

 vpsrlq $19, XTMP2, XTMP3
 rorx $11, e, y1
 or c, y3
 mov f, y2
 xor g, y2

 rorx $13, a, T1
 xor y1, y0
 vpsrlq $17, XTMP2, XTMP2
 and e, y2

 rorx $6, e, y1
 vpxor XTMP3, XTMP2, XTMP2
 add h, d
 and b, y3

 xor y1, y0
 rorx $22, a, y1
 vpxor XTMP2, XTMP4, XTMP4
 xor g, y2

 vpshufb SHUF_00BA, XTMP4, XTMP4
 xor T1, y1
 rorx $2, a, T1
 vpaddd XTMP4, XTMP0, XTMP0

 xor T1, y1
 mov a, T1
 and c, T1
 add y0, y2
 vpshufd $0b01010000, XTMP0, XTMP2

 or T1, y3
 add y1, h
 add y2, d
 add y2, h

 add y3, h


 ROTATE_ARGS



 mov a, y3
 rorx $25, e, y0
 rorx $11, e, y1
 offset = \disp + 3*4
 addl offset(%rsp, SRND), h
 or c, y3


 vpsrld $10, XTMP2, XTMP5
 mov f, y2
 rorx $13, a, T1
 xor y1, y0
 xor g, y2


 vpsrlq $19, XTMP2, XTMP3
 rorx $6, e, y1
 and e, y2
 add h, d
 and b, y3

 vpsrlq $17, XTMP2, XTMP2
 xor y1, y0
 xor g, y2

 vpxor XTMP3, XTMP2, XTMP2
 rorx $22, a, y1
 add y0, y2

 vpxor XTMP2, XTMP5, XTMP5
 xor T1, y1
 add y2, d

 rorx $2, a, T1
 vpshufb SHUF_DC00, XTMP5, XTMP5

 vpaddd XTMP0, XTMP5, X0
 xor T1, y1
 mov a, T1
 and c, T1
 or T1, y3

 add y1, h
 add y2, h
 add y3, h

 ROTATE_ARGS
 rotate_Xs
.endm

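# DO_4ROUNDS: perform four SHA-256 rounds with no message scheduling,
# consuming the pre-added K[t]+W[t] values at \disp(%rsp, SRND).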
.macro DO_4ROUNDS disp


 mov f, y2
 rorx $25, e, y0
 rorx $11, e, y1
 xor g, y2

 xor y1, y0
 rorx $6, e, y1
 and e, y2

 xor y1, y0
 rorx $13, a, T1
 xor g, y2
 rorx $22, a, y1
 mov a, y3

 xor T1, y1
 rorx $2, a, T1
 addl \disp(%rsp, SRND), h
 or c, y3

 xor T1, y1
 mov a, T1
 and b, y3
 and c, T1
 add y0, y2


 add h, d
 or T1, y3
 add y1, h
 add y2, d

 ROTATE_ARGS



 add y2, old_h
 mov f, y2
 rorx $25, e, y0
 rorx $11, e, y1
 xor g, y2

 xor y1, y0
 rorx $6, e, y1
 and e, y2
 add y3, old_h

 xor y1, y0
 rorx $13, a, T1
 xor g, y2
 rorx $22, a, y1
 mov a, y3

 xor T1, y1
 rorx $2, a, T1
 offset = 4*1 + \disp
 addl offset(%rsp, SRND), h
 or c, y3

 xor T1, y1
 mov a, T1
 and b, y3
 and c, T1
 add y0, y2


 add h, d
 or T1, y3
 add y1, h

 add y2, d

 ROTATE_ARGS



 add y2, old_h
 mov f, y2
 rorx $25, e, y0
 rorx $11, e, y1
 xor g, y2

 xor y1, y0
 rorx $6, e, y1
 and e, y2
 add y3, old_h

 xor y1, y0
 rorx $13, a, T1
 xor g, y2
 rorx $22, a, y1
 mov a, y3

 xor T1, y1
 rorx $2, a, T1
 offset = 4*2 + \disp
 addl offset(%rsp, SRND), h
 or c, y3

 xor T1, y1
 mov a, T1
 and b, y3
 and c, T1
 add y0, y2


 add h, d
 or T1, y3
 add y1, h

 add y2, d

 ROTATE_ARGS



 add y2, old_h
 mov f, y2
 rorx $25, e, y0
 rorx $11, e, y1
 xor g, y2

 xor y1, y0
 rorx $6, e, y1
 and e, y2
 add y3, old_h

 xor y1, y0
 rorx $13, a, T1
 xor g, y2
 rorx $22, a, y1
 mov a, y3

 xor T1, y1
 rorx $2, a, T1
 offset = 4*3 + \disp
 addl offset(%rsp, SRND), h
 or c, y3

 xor T1, y1
 mov a, T1
 and b, y3
 and c, T1
 add y0, y2


 add h, d
 or T1, y3
 add y1, h

 add y2, d


 add y2, h

 add y3, h

 ROTATE_ARGS

.endm






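########################################################################
## sha256_transform_rorx
## arg 1 (CTX, %rdi)      : pointer to the eight-word digest state
## arg 2 (INP, %rsi)      : pointer to the input data
## arg 3 (NUM_BLKS, %rdx) : number of 64-byte blocks to process
########################################################################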
.text
ENTRY(sha256_transform_rorx)
.align 32
 pushq %rbx
 pushq %rbp
 pushq %r12
 pushq %r13
 pushq %r14
 pushq %r15

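 # Align the stack to 32 bytes for the vmovdqa spills and keep the
 # caller's %rsp so it can be restored at done_hash.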
 mov %rsp, %rax
 subq $STACK_SIZE, %rsp
 and $-32, %rsp
 mov %rax, _RSP(%rsp)


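 # Convert the block count to a byte count; bail out if it is zero.
 # NUM_BLKS then becomes a pointer to the last input block.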
 shl $6, NUM_BLKS
 jz done_hash
 lea -64(INP, NUM_BLKS), NUM_BLKS
 mov NUM_BLKS, _INP_END(%rsp)

 cmp NUM_BLKS, INP
 je only_one_block


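 # Load the initial digest state a..h from CTX.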
 mov (CTX), a
 mov 4*1(CTX), b
 mov 4*2(CTX), c
 mov 4*3(CTX), d
 mov 4*4(CTX), e
 mov 4*5(CTX), f
 mov 4*6(CTX), g
 mov 4*7(CTX), h

 vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
 vmovdqa _SHUF_00BA(%rip), SHUF_00BA
 vmovdqa _SHUF_DC00(%rip), SHUF_DC00

 mov CTX, _CTX(%rsp)

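# loop0: load the next two 64-byte blocks, byte-swap the words to
# big-endian, and interleave them so each ymm register holds four
# words of block n in its low lane and four words of block n+1 in its
# high lane.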
loop0:
 lea K256(%rip), TBL


 VMOVDQ 0*32(INP),XTMP0
 VMOVDQ 1*32(INP),XTMP1
 VMOVDQ 2*32(INP),XTMP2
 VMOVDQ 3*32(INP),XTMP3


 vpshufb BYTE_FLIP_MASK, XTMP0, XTMP0
 vpshufb BYTE_FLIP_MASK, XTMP1, XTMP1
 vpshufb BYTE_FLIP_MASK, XTMP2, XTMP2
 vpshufb BYTE_FLIP_MASK, XTMP3, XTMP3


 vperm2i128 $0x20, XTMP2, XTMP0, X0
 vperm2i128 $0x31, XTMP2, XTMP0, X1
 vperm2i128 $0x20, XTMP3, XTMP1, X2
 vperm2i128 $0x31, XTMP3, XTMP1, X3

last_block_enter:
 add $64, INP
 mov INP, _INP(%rsp)


 xor SRND, SRND

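# loop1: 48 rounds with message scheduling. Each iteration stores four
# 32-byte K+W transfer slots and runs 16 rounds; SRND advances by 4*32
# per iteration until 3*4*32 bytes of _XFER have been consumed.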
.align 16
loop1:
 vpaddd 0*32(TBL, SRND), X0, XFER
 vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
 FOUR_ROUNDS_AND_SCHED _XFER + 0*32

 vpaddd 1*32(TBL, SRND), X0, XFER
 vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
 FOUR_ROUNDS_AND_SCHED _XFER + 1*32

 vpaddd 2*32(TBL, SRND), X0, XFER
 vmovdqa XFER, 2*32+_XFER(%rsp, SRND)
 FOUR_ROUNDS_AND_SCHED _XFER + 2*32

 vpaddd 3*32(TBL, SRND), X0, XFER
 vmovdqa XFER, 3*32+_XFER(%rsp, SRND)
 FOUR_ROUNDS_AND_SCHED _XFER + 3*32

 add $4*32, SRND
 cmp $3*4*32, SRND
 jb loop1

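# loop2: the last 16 rounds need no further scheduling. The remaining
# words already sit in X0..X3, so X2/X3 are shifted down after each
# 8-round iteration.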
loop2:

 vpaddd 0*32(TBL, SRND), X0, XFER
 vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
 DO_4ROUNDS _XFER + 0*32
 vpaddd 1*32(TBL, SRND), X1, XFER
 vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
 DO_4ROUNDS _XFER + 1*32
 add $2*32, SRND

 vmovdqa X2, X0
 vmovdqa X3, X1

 cmp $4*4*32, SRND
 jb loop2

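 # The first block of the pair is done: fold the working variables
 # back into the digest in CTX.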
 mov _CTX(%rsp), CTX
 mov _INP(%rsp), INP

 addm (4*0)(CTX),a
 addm (4*1)(CTX),b
 addm (4*2)(CTX),c
 addm (4*3)(CTX),d
 addm (4*4)(CTX),e
 addm (4*5)(CTX),f
 addm (4*6)(CTX),g
 addm (4*7)(CTX),h

 cmp _INP_END(%rsp), INP
 ja done_hash


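# loop3: second block of the pair. All 64 rounds are replayed with no
# scheduling, reading the K+W values already stored for the high lane
# (hence the extra +16 byte offset into each _XFER slot).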
 xor SRND, SRND
.align 16
loop3:
 DO_4ROUNDS _XFER + 0*32 + 16
 DO_4ROUNDS _XFER + 1*32 + 16
 add $2*32, SRND
 cmp $4*4*32, SRND
 jb loop3

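 # Fold the second block into the digest and advance INP past it:
 # jump back to loop0 while at least two blocks remain, finish when
 # none remain, and fall through when exactly one block is left.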
 mov _CTX(%rsp), CTX
 mov _INP(%rsp), INP
 add $64, INP

 addm (4*0)(CTX),a
 addm (4*1)(CTX),b
 addm (4*2)(CTX),c
 addm (4*3)(CTX),d
 addm (4*4)(CTX),e
 addm (4*5)(CTX),f
 addm (4*6)(CTX),g
 addm (4*7)(CTX),h

 cmp _INP_END(%rsp), INP
 jb loop0
 ja done_hash

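# do_last_block: exactly one 64-byte block remains. Load it into the
# xmm halves of X0..X3 (the high lanes are unused) and re-enter the
# round loops.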
do_last_block:

 lea K256(%rip), TBL

 VMOVDQ 0*16(INP),XWORD0
 VMOVDQ 1*16(INP),XWORD1
 VMOVDQ 2*16(INP),XWORD2
 VMOVDQ 3*16(INP),XWORD3

 vpshufb X_BYTE_FLIP_MASK, XWORD0, XWORD0
 vpshufb X_BYTE_FLIP_MASK, XWORD1, XWORD1
 vpshufb X_BYTE_FLIP_MASK, XWORD2, XWORD2
 vpshufb X_BYTE_FLIP_MASK, XWORD3, XWORD3

 jmp last_block_enter

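# only_one_block: the whole input is a single block, so the digest and
# shuffle masks have not been loaded yet; set them up and process the
# block as the last one.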
only_one_block:


 mov (4*0)(CTX),a
 mov (4*1)(CTX),b
 mov (4*2)(CTX),c
 mov (4*3)(CTX),d
 mov (4*4)(CTX),e
 mov (4*5)(CTX),f
 mov (4*6)(CTX),g
 mov (4*7)(CTX),h

 vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
 vmovdqa _SHUF_00BA(%rip), SHUF_00BA
 vmovdqa _SHUF_DC00(%rip), SHUF_DC00

 mov CTX, _CTX(%rsp)
 jmp do_last_block

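# done_hash: restore the caller's stack pointer and the callee-saved
# registers, then return.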
done_hash:

 mov _RSP(%rsp), %rsp

 popq %r15
 popq %r14
 popq %r13
 popq %r12
 popq %rbp
 popq %rbx
 ret
ENDPROC(sha256_transform_rorx)

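# K256: the 64 SHA-256 round constants. Each row of four constants is
# stored twice so a single 32-byte vpaddd feeds the same constants to
# both 128-bit lanes (one lane per interleaved block).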
.data
.align 64
K256:
 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

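# Byte-swap mask: converts the little-endian input bytes of each 32-bit
# word to the big-endian order SHA-256 expects.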
PSHUFFLE_BYTE_FLIP_MASK:
 .octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203


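# Shuffle xBxA -> 00BA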
_SHUF_00BA:
 .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100


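# Shuffle xDxC -> DC00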
_SHUF_DC00:
 .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF
#endif