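########################################################################
# SHA-256 block transform for x86_64 using AVX2 and the BMI2 rorx
# instruction.  The message schedule for two 64-byte blocks is computed
# together, one block per 128-bit lane of the ymm registers; the
# pre-added "w + K" values are spilled to the stack so the second
# block's rounds can be replayed from there (see loop3 below).
########################################################################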
#include <linux/linkage.h>

# The input buffer may be unaligned, so use unaligned vector loads.
#define VMOVDQ vmovdqu

# addm mem, reg
# reg = reg + mem ; mem = reg
.macro addm p1 p2
	add	\p1, \p2
	mov	\p2, \p1
.endm

X0 = %ymm4
X1 = %ymm5
X2 = %ymm6
X3 = %ymm7

# XMM versions of X0..X3, used when loading a single final block
XWORD0 = %xmm4
XWORD1 = %xmm5
XWORD2 = %xmm6
XWORD3 = %xmm7

XTMP0 = %ymm0
XTMP1 = %ymm1
XTMP2 = %ymm2
XTMP3 = %ymm3
XTMP4 = %ymm8
XFER  = %ymm9
XTMP5 = %ymm11

SHUF_00BA = %ymm10		# shuffle xBxA -> 00BA
SHUF_DC00 = %ymm12		# shuffle xDxC -> DC00
BYTE_FLIP_MASK = %ymm13		# converts input words to big-endian

X_BYTE_FLIP_MASK = %xmm13	# XMM version of BYTE_FLIP_MASK

NUM_BLKS = %rdx	# 3rd arg
INP	 = %rsi	# 2nd arg
CTX	 = %rdi	# 1st arg
c	 = %ecx
d	 = %r8d
e	 = %edx	# clobbers NUM_BLKS
y3	 = %esi	# clobbers INP

SRND	 = CTX	# SRND is the same register as CTX

a = %eax
b = %ebx
f = %r9d
g = %r10d
h = %r11d
old_h = %r11d

T1 = %r12d
y0 = %r13d
y1 = %r14d
y2 = %r15d


# Stack frame layout
_XFER_SIZE	= 2*64*4	# 2 blocks, 64 rounds, 4 bytes each
_XMM_SAVE_SIZE	= 0
_INP_END_SIZE	= 8
_INP_SIZE	= 8
_CTX_SIZE	= 8

_XFER		= 0
_XMM_SAVE	= _XFER     + _XFER_SIZE
_INP_END	= _XMM_SAVE + _XMM_SAVE_SIZE
_INP		= _INP_END  + _INP_END_SIZE
_CTX		= _INP      + _INP_SIZE
STACK_SIZE	= _CTX      + _CTX_SIZE

# rotate_Xs
# Rotate the values of the symbols X0...X3
.macro rotate_Xs
	X_ = X0
	X0 = X1
	X1 = X2
	X2 = X3
	X3 = X_
.endm

# ROTATE_ARGS
# Rotate the values of the symbols a...h
.macro ROTATE_ARGS
	old_h = h
	TMP_ = h
	h = g
	g = f
	f = e
	e = d
	d = c
	c = b
	b = a
	a = TMP_
.endm

.macro FOUR_ROUNDS_AND_SCHED disp
################################ RND N + 0 ################################

	mov	a, y3			# y3 = a
	rorx	$25, e, y0		# y0 = e ror 25
	rorx	$11, e, y1		# y1 = e ror 11

	addl	\disp(%rsp, SRND), h	# h += k[t] + w[t]
	or	c, y3			# y3 = a | c
	vpalignr $4, X2, X3, XTMP0	# XTMP0 = W[-7]
	mov	f, y2			# y2 = f
	rorx	$13, a, T1		# T1 = a ror 13

	xor	y1, y0			# y0 = (e ror 25) ^ (e ror 11)
	xor	g, y2			# y2 = f ^ g
	vpaddd	X0, XTMP0, XTMP0	# XTMP0 = W[-7] + W[-16]
	rorx	$6, e, y1		# y1 = e ror 6

	and	e, y2			# y2 = (f ^ g) & e
	xor	y1, y0			# y0 = S1(e)
	rorx	$22, a, y1		# y1 = a ror 22
	add	h, d			# d += h

	and	b, y3			# y3 = (a | c) & b
	vpalignr $4, X0, X1, XTMP1	# XTMP1 = W[-15]
	xor	T1, y1			# y1 = (a ror 22) ^ (a ror 13)
	rorx	$2, a, T1		# T1 = a ror 2

	xor	g, y2			# y2 = CH(e,f,g)
	vpsrld	$7, XTMP1, XTMP2
	xor	T1, y1			# y1 = S0(a)
	mov	a, T1			# T1 = a
	and	c, T1			# T1 = a & c

	add	y0, y2			# y2 = S1 + CH
	vpslld	$(32-7), XTMP1, XTMP3
	or	T1, y3			# y3 = MAJ(a,b,c)
	add	y1, h			# h += S0

	add	y2, d			# d += S1 + CH
	vpor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7

	vpsrld	$18, XTMP1, XTMP2
	add	y2, h			# h += S1 + CH
	add	y3, h			# h += MAJ

	ROTATE_ARGS

################################ RND N + 1 ################################

	mov	a, y3			# y3 = a
	rorx	$25, e, y0		# y0 = e ror 25
	rorx	$11, e, y1		# y1 = e ror 11
	offset = \disp + 1*4
	addl	offset(%rsp, SRND), h	# h += k[t] + w[t]
	or	c, y3			# y3 = a | c

	vpsrld	$3, XTMP1, XTMP4	# XTMP4 = W[-15] >> 3
	mov	f, y2			# y2 = f
	rorx	$13, a, T1		# T1 = a ror 13
	xor	y1, y0			# y0 = (e ror 25) ^ (e ror 11)
	xor	g, y2			# y2 = f ^ g

	rorx	$6, e, y1		# y1 = e ror 6
	xor	y1, y0			# y0 = S1(e)
	rorx	$22, a, y1		# y1 = a ror 22
	and	e, y2			# y2 = (f ^ g) & e
	add	h, d			# d += h

	vpslld	$(32-18), XTMP1, XTMP1
	and	b, y3			# y3 = (a | c) & b
	xor	T1, y1			# y1 = (a ror 22) ^ (a ror 13)

	vpxor	XTMP1, XTMP3, XTMP3
	rorx	$2, a, T1		# T1 = a ror 2
	xor	g, y2			# y2 = CH(e,f,g)

	vpxor	XTMP2, XTMP3, XTMP3	# XTMP3 = (W[-15] ror 7) ^ (W[-15] ror 18)
	xor	T1, y1			# y1 = S0(a)
	mov	a, T1			# T1 = a
	and	c, T1			# T1 = a & c
	add	y0, y2			# y2 = S1 + CH

	vpxor	XTMP4, XTMP3, XTMP1	# XTMP1 = s0(W[-15])
	vpshufd	$0b11111010, X3, XTMP2	# XTMP2 = W[-2] {BBAA}
	or	T1, y3			# y3 = MAJ(a,b,c)
	add	y1, h			# h += S0

	vpaddd	XTMP1, XTMP0, XTMP0	# XTMP0 = W[-16] + W[-7] + s0
	add	y2, d			# d += S1 + CH
	add	y2, h			# h += S1 + CH
	add	y3, h			# h += MAJ

	vpsrld	$10, XTMP2, XTMP4	# XTMP4 = W[-2] >> 10 {BBAA}


	ROTATE_ARGS

################################ RND N + 2 ################################

	mov	a, y3			# y3 = a
	rorx	$25, e, y0		# y0 = e ror 25
	offset = \disp + 2*4
	addl	offset(%rsp, SRND), h	# h += k[t] + w[t]

	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xBxA}
	rorx	$11, e, y1		# y1 = e ror 11
	or	c, y3			# y3 = a | c
	mov	f, y2			# y2 = f
	xor	g, y2			# y2 = f ^ g

	rorx	$13, a, T1		# T1 = a ror 13
	xor	y1, y0			# y0 = (e ror 25) ^ (e ror 11)
	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xBxA}
	and	e, y2			# y2 = (f ^ g) & e

	rorx	$6, e, y1		# y1 = e ror 6
	vpxor	XTMP3, XTMP2, XTMP2
	add	h, d			# d += h
	and	b, y3			# y3 = (a | c) & b

	xor	y1, y0			# y0 = S1(e)
	rorx	$22, a, y1		# y1 = a ror 22
	vpxor	XTMP2, XTMP4, XTMP4	# XTMP4 = s1(W[-2]) {xBxA}
	xor	g, y2			# y2 = CH(e,f,g)

	vpshufb	SHUF_00BA, XTMP4, XTMP4	# XTMP4 = s1(W[-2]) {00BA}
	xor	T1, y1			# y1 = (a ror 22) ^ (a ror 13)
	rorx	$2, a, T1		# T1 = a ror 2
	vpaddd	XTMP4, XTMP0, XTMP0	# XTMP0 = {..., ..., W[1], W[0]}

	xor	T1, y1			# y1 = S0(a)
	mov	a, T1			# T1 = a
	and	c, T1			# T1 = a & c
	add	y0, y2			# y2 = S1 + CH
	vpshufd	$0b01010000, XTMP0, XTMP2	# XTMP2 = W[-2] {DDCC}

	or	T1, y3			# y3 = MAJ(a,b,c)
	add	y1, h			# h += S0
	add	y2, d			# d += S1 + CH
	add	y2, h			# h += S1 + CH

	add	y3, h			# h += MAJ


	ROTATE_ARGS

################################ RND N + 3 ################################

	mov	a, y3			# y3 = a
	rorx	$25, e, y0		# y0 = e ror 25
	rorx	$11, e, y1		# y1 = e ror 11
	offset = \disp + 3*4
	addl	offset(%rsp, SRND), h	# h += k[t] + w[t]
	or	c, y3			# y3 = a | c

	vpsrld	$10, XTMP2, XTMP5	# XTMP5 = W[-2] >> 10 {DDCC}
	mov	f, y2			# y2 = f
	rorx	$13, a, T1		# T1 = a ror 13
	xor	y1, y0			# y0 = (e ror 25) ^ (e ror 11)
	xor	g, y2			# y2 = f ^ g

	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xDxC}
	rorx	$6, e, y1		# y1 = e ror 6
	and	e, y2			# y2 = (f ^ g) & e
	add	h, d			# d += h
	and	b, y3			# y3 = (a | c) & b

	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xDxC}
	xor	y1, y0			# y0 = S1(e)
	xor	g, y2			# y2 = CH(e,f,g)

	vpxor	XTMP3, XTMP2, XTMP2
	rorx	$22, a, y1		# y1 = a ror 22
	add	y0, y2			# y2 = S1 + CH

	vpxor	XTMP2, XTMP5, XTMP5	# XTMP5 = s1(W[-2]) {xDxC}
	xor	T1, y1			# y1 = (a ror 22) ^ (a ror 13)
	add	y2, d			# d += S1 + CH

	rorx	$2, a, T1		# T1 = a ror 2
	vpshufb	SHUF_DC00, XTMP5, XTMP5	# XTMP5 = s1(W[-2]) {DC00}

	vpaddd	XTMP0, XTMP5, X0	# X0 = {W[3], W[2], W[1], W[0]}
	xor	T1, y1			# y1 = S0(a)
	mov	a, T1			# T1 = a
	and	c, T1			# T1 = a & c
	or	T1, y3			# y3 = MAJ(a,b,c)

	add	y1, h			# h += S0
	add	y2, h			# h += S1 + CH
	add	y3, h			# h += MAJ

	ROTATE_ARGS
	rotate_Xs
.endm

# Four SHA-256 rounds using w+K values already stored in the stack XFER area.
# The final h updates of each round are deferred into the next round (old_h).
.macro DO_4ROUNDS disp
################################ RND N + 0 ################################

	mov	f, y2			# y2 = f
	rorx	$25, e, y0		# y0 = e ror 25
	rorx	$11, e, y1		# y1 = e ror 11
	xor	g, y2			# y2 = f ^ g

	xor	y1, y0			# y0 = (e ror 25) ^ (e ror 11)
	rorx	$6, e, y1		# y1 = e ror 6
	and	e, y2			# y2 = (f ^ g) & e

	xor	y1, y0			# y0 = S1(e)
	rorx	$13, a, T1		# T1 = a ror 13
	xor	g, y2			# y2 = CH(e,f,g)
	rorx	$22, a, y1		# y1 = a ror 22
	mov	a, y3			# y3 = a

	xor	T1, y1			# y1 = (a ror 22) ^ (a ror 13)
	rorx	$2, a, T1		# T1 = a ror 2
	addl	\disp(%rsp, SRND), h	# h += k[t] + w[t]
	or	c, y3			# y3 = a | c

	xor	T1, y1			# y1 = S0(a)
	mov	a, T1			# T1 = a
	and	b, y3			# y3 = (a | c) & b
	and	c, T1			# T1 = a & c
	add	y0, y2			# y2 = S1 + CH

	add	h, d			# d += h
	or	T1, y3			# y3 = MAJ(a,b,c)
	add	y1, h			# h += S0
	add	y2, d			# d += S1 + CH

	ROTATE_ARGS

################################ RND N + 1 ################################

	add	y2, old_h		# finish previous round: h += S1 + CH
	mov	f, y2			# y2 = f
	rorx	$25, e, y0		# y0 = e ror 25
	rorx	$11, e, y1		# y1 = e ror 11
	xor	g, y2			# y2 = f ^ g

	xor	y1, y0			# y0 = (e ror 25) ^ (e ror 11)
	rorx	$6, e, y1		# y1 = e ror 6
	and	e, y2			# y2 = (f ^ g) & e
	add	y3, old_h		# finish previous round: h += MAJ

	xor	y1, y0			# y0 = S1(e)
	rorx	$13, a, T1		# T1 = a ror 13
	xor	g, y2			# y2 = CH(e,f,g)
	rorx	$22, a, y1		# y1 = a ror 22
	mov	a, y3			# y3 = a

	xor	T1, y1			# y1 = (a ror 22) ^ (a ror 13)
	rorx	$2, a, T1		# T1 = a ror 2
	offset = 4*1 + \disp
	addl	offset(%rsp, SRND), h	# h += k[t] + w[t]
	or	c, y3			# y3 = a | c

	xor	T1, y1			# y1 = S0(a)
	mov	a, T1			# T1 = a
	and	b, y3			# y3 = (a | c) & b
	and	c, T1			# T1 = a & c
	add	y0, y2			# y2 = S1 + CH

	add	h, d			# d += h
	or	T1, y3			# y3 = MAJ(a,b,c)
	add	y1, h			# h += S0

	add	y2, d			# d += S1 + CH

	ROTATE_ARGS

################################ RND N + 2 ################################

	add	y2, old_h		# finish previous round: h += S1 + CH
	mov	f, y2			# y2 = f
	rorx	$25, e, y0		# y0 = e ror 25
	rorx	$11, e, y1		# y1 = e ror 11
	xor	g, y2			# y2 = f ^ g

	xor	y1, y0			# y0 = (e ror 25) ^ (e ror 11)
	rorx	$6, e, y1		# y1 = e ror 6
	and	e, y2			# y2 = (f ^ g) & e
	add	y3, old_h		# finish previous round: h += MAJ

	xor	y1, y0			# y0 = S1(e)
	rorx	$13, a, T1		# T1 = a ror 13
	xor	g, y2			# y2 = CH(e,f,g)
	rorx	$22, a, y1		# y1 = a ror 22
	mov	a, y3			# y3 = a

	xor	T1, y1			# y1 = (a ror 22) ^ (a ror 13)
	rorx	$2, a, T1		# T1 = a ror 2
	offset = 4*2 + \disp
	addl	offset(%rsp, SRND), h	# h += k[t] + w[t]
	or	c, y3			# y3 = a | c

	xor	T1, y1			# y1 = S0(a)
	mov	a, T1			# T1 = a
	and	b, y3			# y3 = (a | c) & b
	and	c, T1			# T1 = a & c
	add	y0, y2			# y2 = S1 + CH

	add	h, d			# d += h
	or	T1, y3			# y3 = MAJ(a,b,c)
	add	y1, h			# h += S0

	add	y2, d			# d += S1 + CH

	ROTATE_ARGS

################################ RND N + 3 ################################

	add	y2, old_h		# finish previous round: h += S1 + CH
	mov	f, y2			# y2 = f
	rorx	$25, e, y0		# y0 = e ror 25
	rorx	$11, e, y1		# y1 = e ror 11
	xor	g, y2			# y2 = f ^ g

	xor	y1, y0			# y0 = (e ror 25) ^ (e ror 11)
	rorx	$6, e, y1		# y1 = e ror 6
	and	e, y2			# y2 = (f ^ g) & e
	add	y3, old_h		# finish previous round: h += MAJ

	xor	y1, y0			# y0 = S1(e)
	rorx	$13, a, T1		# T1 = a ror 13
	xor	g, y2			# y2 = CH(e,f,g)
	rorx	$22, a, y1		# y1 = a ror 22
	mov	a, y3			# y3 = a

	xor	T1, y1			# y1 = (a ror 22) ^ (a ror 13)
	rorx	$2, a, T1		# T1 = a ror 2
	offset = 4*3 + \disp
	addl	offset(%rsp, SRND), h	# h += k[t] + w[t]
	or	c, y3			# y3 = a | c

	xor	T1, y1			# y1 = S0(a)
	mov	a, T1			# T1 = a
	and	b, y3			# y3 = (a | c) & b
	and	c, T1			# T1 = a & c
	add	y0, y2			# y2 = S1 + CH

	add	h, d			# d += h
	or	T1, y3			# y3 = MAJ(a,b,c)
	add	y1, h			# h += S0

	add	y2, d			# d += S1 + CH

	add	y2, h			# h += S1 + CH

	add	y3, h			# h += MAJ

	ROTATE_ARGS

.endm


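########################################################################
## sha256_transform_rorx
## arg 1 (CTX, %rdi)      : pointer to the eight 32-bit SHA-256 state words
## arg 2 (INP, %rsi)      : pointer to the input data
## arg 3 (NUM_BLKS, %rdx) : number of 64-byte blocks to process
########################################################################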
.text
SYM_FUNC_START(sha256_transform_rorx)
.align 32
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	push	%rbp
	mov	%rsp, %rbp

	subq	$STACK_SIZE, %rsp
	and	$-32, %rsp	# align stack to a 32-byte boundary

	shl	$6, NUM_BLKS		# convert number of blocks to bytes
	jz	done_hash
	lea	-64(INP, NUM_BLKS), NUM_BLKS	# pointer to the last block
	mov	NUM_BLKS, _INP_END(%rsp)

	cmp	NUM_BLKS, INP
	je	only_one_block

	## load the initial digest
	mov	(CTX), a
	mov	4*1(CTX), b
	mov	4*2(CTX), c
	mov	4*3(CTX), d
	mov	4*4(CTX), e
	mov	4*5(CTX), f
	mov	4*6(CTX), g
	mov	4*7(CTX), h

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00

	mov	CTX, _CTX(%rsp)

loop0:
	## Load the first 16 dwords from two consecutive blocks
	VMOVDQ	0*32(INP), XTMP0
	VMOVDQ	1*32(INP), XTMP1
	VMOVDQ	2*32(INP), XTMP2
	VMOVDQ	3*32(INP), XTMP3

	## byte-swap the input data
	vpshufb	BYTE_FLIP_MASK, XTMP0, XTMP0
	vpshufb	BYTE_FLIP_MASK, XTMP1, XTMP1
	vpshufb	BYTE_FLIP_MASK, XTMP2, XTMP2
	vpshufb	BYTE_FLIP_MASK, XTMP3, XTMP3

	## interleave the two blocks: block 1 in the low lanes, block 2 in the high lanes
	vperm2i128	$0x20, XTMP2, XTMP0, X0
	vperm2i128	$0x31, XTMP2, XTMP0, X1
	vperm2i128	$0x20, XTMP3, XTMP1, X2
	vperm2i128	$0x31, XTMP3, XTMP1, X3

last_block_enter:
	add	$64, INP
	mov	INP, _INP(%rsp)

	## schedule the first 48 rounds, 16 rounds per iteration
	xor	SRND, SRND

.align 16
loop1:
	vpaddd	K256+0*32(SRND), X0, XFER
	vmovdqa	XFER, 0*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 0*32

	vpaddd	K256+1*32(SRND), X0, XFER
	vmovdqa	XFER, 1*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 1*32

	vpaddd	K256+2*32(SRND), X0, XFER
	vmovdqa	XFER, 2*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 2*32

	vpaddd	K256+3*32(SRND), X0, XFER
	vmovdqa	XFER, 3*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 3*32

	add	$4*32, SRND
	cmp	$3*4*32, SRND
	jb	loop1

loop2:
	## Do the last 16 rounds with no message scheduling
	vpaddd	K256+0*32(SRND), X0, XFER
	vmovdqa	XFER, 0*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	_XFER + 0*32

	vpaddd	K256+1*32(SRND), X1, XFER
	vmovdqa	XFER, 1*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	_XFER + 1*32
	add	$2*32, SRND

	vmovdqa	X2, X0
	vmovdqa	X3, X1

	cmp	$4*4*32, SRND
	jb	loop2

	## first block is done: add it into the digest
	mov	_CTX(%rsp), CTX
	mov	_INP(%rsp), INP

	addm	(4*0)(CTX), a
	addm	(4*1)(CTX), b
	addm	(4*2)(CTX), c
	addm	(4*3)(CTX), d
	addm	(4*4)(CTX), e
	addm	(4*5)(CTX), f
	addm	(4*6)(CTX), g
	addm	(4*7)(CTX), h

	cmp	_INP_END(%rsp), INP
	ja	done_hash

	## second block: replay the rounds from the saved w+K values
	xor	SRND, SRND
.align 16
loop3:
	DO_4ROUNDS	_XFER + 0*32 + 16	# +16: upper-lane (second block) w+K values
	DO_4ROUNDS	_XFER + 1*32 + 16
	add	$2*32, SRND
	cmp	$4*4*32, SRND
	jb	loop3

	mov	_CTX(%rsp), CTX
	mov	_INP(%rsp), INP
	add	$64, INP

	addm	(4*0)(CTX), a
	addm	(4*1)(CTX), b
	addm	(4*2)(CTX), c
	addm	(4*3)(CTX), d
	addm	(4*4)(CTX), e
	addm	(4*5)(CTX), f
	addm	(4*6)(CTX), g
	addm	(4*7)(CTX), h

	cmp	_INP_END(%rsp), INP
	jb	loop0
	ja	done_hash

do_last_block:
	## Load and byte-swap the final 64-byte block (low lanes only)
	VMOVDQ	0*16(INP), XWORD0
	VMOVDQ	1*16(INP), XWORD1
	VMOVDQ	2*16(INP), XWORD2
	VMOVDQ	3*16(INP), XWORD3

	vpshufb	X_BYTE_FLIP_MASK, XWORD0, XWORD0
	vpshufb	X_BYTE_FLIP_MASK, XWORD1, XWORD1
	vpshufb	X_BYTE_FLIP_MASK, XWORD2, XWORD2
	vpshufb	X_BYTE_FLIP_MASK, XWORD3, XWORD3

	jmp	last_block_enter

only_one_block:
	## load the initial digest
	mov	(4*0)(CTX), a
	mov	(4*1)(CTX), b
	mov	(4*2)(CTX), c
	mov	(4*3)(CTX), d
	mov	(4*4)(CTX), e
	mov	(4*5)(CTX), f
	mov	(4*6)(CTX), g
	mov	(4*7)(CTX), h

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00

	mov	CTX, _CTX(%rsp)
	jmp	do_last_block

done_hash:

	mov	%rbp, %rsp
	pop	%rbp

	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	ret
SYM_FUNC_END(sha256_transform_rorx)

# SHA-256 round constants, each row duplicated so both 128-bit lanes see the same values
.section .rodata.cst512.K256, "aM", @progbits, 512
.align 64
K256:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
.align 32
PSHUFFLE_BYTE_FLIP_MASK:
	.octa	0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203

# shuffle xBxA -> 00BA
.section .rodata.cst32._SHUF_00BA, "aM", @progbits, 32
.align 32
_SHUF_00BA:
	.octa	0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100

# shuffle xDxC -> DC00
.section .rodata.cst32._SHUF_DC00, "aM", @progbits, 32
.align 32
_SHUF_DC00:
	.octa	0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF