1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72#include <linux/linkage.h>
73
/*
 * Incoming arguments of the generated function (first three integer
 * argument registers). SHA1_VECTOR_ASM copies them to other registers
 * before the hash state is loaded, because %rdi, %rsi and %rdx are reused
 * below for the working variables C, B and E.
 */
#define	CTX	%rdi
#define	BUF	%rsi
#define	CNT	%rdx

/*
 * Initial physical registers of the working state. REG_x is the 32-bit
 * view, REG_Rx the 64-bit view of the same register; the 64-bit names are
 * needed where the round macros accumulate with lea (base,index).
 * TA and TB are per-round temporaries, T1 is a scratch register.
 * %rbx, %rbp and %r12 are callee-saved and are pushed/popped by
 * SHA1_VECTOR_ASM.
 */
#define	REG_A	%ecx
#define	REG_B	%esi
#define	REG_C	%edi
#define	REG_D	%eax
#define	REG_E	%edx
#define	REG_TB	%ebx
#define	REG_TA	%r12d
#define	REG_RA	%rcx
#define	REG_RB	%rsi
#define	REG_RC	%rdi
#define	REG_RD	%rax
#define	REG_RE	%rdx
#define	REG_RTA	%r12
#define	REG_RTB	%rbx
#define	REG_T1	%ebp

/* Instruction aliases used by SHA1_VECTOR_ASM. */
#define	xmm_mov	vmovups
#define	avx2_zeroupper	vzeroupper

/* Round-function selectors understood by RND_FUN. */
#define	RND_F1	1
#define	RND_F2	2
#define	RND_F3	3
98
/*
 * REGALLOC - bind the assembler symbols of the working state to their
 * initial registers.
 *
 * A..E, TA, TB and T1 are the 32-bit names, RA..RE, RTA and RTB the 64-bit
 * names of the same registers. ROTATE_STATE permutes these bindings after
 * every round, so REGALLOC is invoked again whenever the main loop needs
 * the symbols back in their initial arrangement.
 * Emits no instructions; it only changes assembler symbols.
 */
.macro REGALLOC
	/* 32-bit views */
	.set A, REG_A
	.set B, REG_B
	.set C, REG_C
	.set D, REG_D
	.set E, REG_E
	.set TB, REG_TB
	.set TA, REG_TA

	/* 64-bit views of A..E */
	.set RA, REG_RA
	.set RB, REG_RB
	.set RC, REG_RC
	.set RD, REG_RD
	.set RE, REG_RE

	/* 64-bit views of the temporaries */
	.set RTA, REG_RTA
	.set RTB, REG_RTB

	/* scratch register, never rotated */
	.set T1, REG_T1
.endm
119
/*
 * General-purpose registers that stay fixed for the whole function.
 *   K_BASE       address of the K_XMM_AR constant table
 *   HASH_PTR     copy of CTX; five 32-bit state words are read/updated there
 *   BUFFER_PTR   input pointer feeding the low 128-bit lane of the schedule
 *   BUFFER_PTR2  input pointer feeding the high 128-bit lane
 *   BUFFER_END   bound set up by SHA1_VECTOR_ASM to detect the last block
 */
#define	K_BASE		%r8
#define	HASH_PTR	%r9
#define	BUFFER_PTR	%r10
#define	BUFFER_PTR2	%r13
#define	BUFFER_END	%r11

/*
 * Two W+K buffers live on the stack: PRECALC_BUF is the one being filled
 * by the PRECALC_* macros, WK_BUF the one being consumed by the rounds.
 * The main loop swaps the two registers with xchg.
 */
#define	PRECALC_BUF	%r14
#define	WK_BUF		%r15

/* Vector temporaries. W_TMP is the low 128 bits of WY_TMP. */
#define	W_TMP		%xmm0
#define	WY_TMP		%ymm0
#define	WY_TMP2		%ymm9

/*
 * The eight ymm registers that hold the message-schedule window. They are
 * bound to WY_00..WY_28 by PRECALC_RESET_WY and rotated by
 * PRECALC_ROTATE_WY.
 */
#define	WY0		%ymm3
#define	WY4		%ymm5
#define	WY08		%ymm7
#define	WY12		%ymm8
#define	WY16		%ymm12
#define	WY20		%ymm13
#define	WY24		%ymm14
#define	WY28		%ymm15

/* Holds the BSWAP_SHUFB_CTL byte-swap mask for vpshufb. */
#define	YMM_SHUFB_BSWAP	%ymm10

/*
 * Stack scratch size in 32-bit words (multiplied by 4 in RESERVE_STACK):
 * 80 W+K words per block * 2 blocks * 2 buffers, plus 16 words of slack.
 */
#define	W_SIZE		(80*2*2 +16)

/*
 * WK(t) - address of the precomputed W+K word for round t (0..159) in the
 * buffer being consumed. Four consecutive rounds share one 32-byte slot;
 * rounds 0..79 (first block) use the low 16 bytes of the slot and rounds
 * 80..159 (second block) the high 16 bytes.
 *
 * PRECALC_WK(t) - store address in the buffer being filled; it is used with
 * t = i & ~7, i.e. one 32-byte slot per eight precalc steps.
 */
#define	WK(t)	((((t) % 80) / 4)*32 + ( (t) % 4)*4 + ((t)/80)*16 )(WK_BUF)
#define	PRECALC_WK(t)	((t)*2*2)(PRECALC_BUF)
153
154
/*
 * UPDATE_HASH hash, val - fold one working variable into the stored state.
 *
 *   hash  memory operand holding a 32-bit state word
 *   val   32-bit register holding the working variable
 *
 * After the macro both \hash and \val contain (old hash + val).
 * Modifies the flags.
 */
.macro UPDATE_HASH  hash, val
	add	\hash, \val		/* val += hash */
	mov	\val, \hash		/* hash = val  */
.endm
159
/*
 * PRECALC_RESET_WY - bind the schedule-window symbols WY_00..WY_28 to the
 * eight physical ymm registers in their initial order. WY_32 starts out as
 * an alias of WY_00. Emits no instructions.
 */
.macro PRECALC_RESET_WY
	.set WY_00, WY0
	.set WY_04, WY4
	.set WY_08, WY08
	.set WY_12, WY12
	.set WY_16, WY16
	.set WY_20, WY20
	.set WY_24, WY24
	.set WY_28, WY28
	.set WY_32, WY_00
.endm
171
/*
 * PRECALC_ROTATE_WY - advance the schedule window by one vector.
 *
 * The eight WY_xx symbols are rotated by one position: every symbol takes
 * the register of its younger neighbour and WY_00 receives the register
 * that WY_28 held, i.e. the oldest vector becomes the destination of the
 * next step. The WY / WY_minus_NN aliases used by the PRECALC_* macros are
 * then re-derived from the rotated symbols. Emits no instructions.
 */
.macro PRECALC_ROTATE_WY
	/* Rotate the symbol bindings (WY_32 carries the wrapped register) */
	.set WY_32, WY_28
	.set WY_28, WY_24
	.set WY_24, WY_20
	.set WY_20, WY_16
	.set WY_16, WY_12
	.set WY_12, WY_08
	.set WY_08, WY_04
	.set WY_04, WY_00
	.set WY_00, WY_32

	/*
	 * Aliases relative to the vector being produced. WY is both the
	 * destination and, before it is overwritten, the vector from 32
	 * words ago.
	 */
	.set WY, WY_00
	.set WY_minus_04, WY_04
	.set WY_minus_08, WY_08
	.set WY_minus_12, WY_12
	.set WY_minus_16, WY_16
	.set WY_minus_20, WY_20
	.set WY_minus_24, WY_24
	.set WY_minus_28, WY_28
	.set WY_minus_32, WY
.endm
195
/*
 * PRECALC_00_15 - one step of the schedule precompute for w[0..15].
 *
 * Reads the assembler symbol i (set by PRECALC, 0..31 here). Eight
 * consecutive values of i together produce one ymm vector: four message
 * words of the block at BUFFER_PTR in the low lane and the same four words
 * of the block at BUFFER_PTR2 in the high lane. The work is split over the
 * steps so it can be interleaved with the scalar round code; steps 3, 5
 * and 6 emit nothing.
 *
 * PRECALC_OFFSET is an assembler symbol assigned by the main body; it is
 * added to both load displacements.
 */
.macro PRECALC_00_15
	.if (i == 0)
		/* very first step: set up and rotate the window symbols */
		PRECALC_RESET_WY
		PRECALC_ROTATE_WY
	.endif

	.if   ((i & 7) == 0)
		/* 16 bytes of the first block into the low lane */
		vmovdqu ((i * 2) + PRECALC_OFFSET)(BUFFER_PTR), W_TMP
	.elseif ((i & 7) == 1)
		/* matching 16 bytes of the second block into the high lane */
		vinsertf128 $1, (((i-1) * 2)+PRECALC_OFFSET)(BUFFER_PTR2),\
			 WY_TMP, WY_TMP
	.elseif ((i & 7) == 2)
		/* byte-swap every 32-bit word */
		vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY
	.elseif ((i & 7) == 4)
		/* add the round constant selected by PRECALC */
		vpaddd  K_XMM(K_BASE), WY, WY_TMP
	.elseif ((i & 7) == 7)
		/* publish W+K for the rounds and move the window on */
		vmovdqu  WY_TMP, PRECALC_WK(i&~7)

		PRECALC_ROTATE_WY
	.endif
.endm
222
/*
 * PRECALC_16_31 - one step of the schedule precompute for w[16..31].
 *
 * Reads the assembler symbol i (32..63 here). Eight steps produce one
 * vector of four new words per lane using
 *	w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
 * The highest of the four words depends on the lowest word produced in the
 * same vector, so that word is first computed without its w[i-3] term and
 * patched afterwards: the missing term is the lowest pre-rotation word,
 * and because the rotation distributes over xor it enters as (word rol 2).
 * Step 6 emits nothing.
 */
.macro PRECALC_16_31
	.if   ((i & 7) == 0)
		/* WY = w[i-14..i-11] */
		vpalignr	$8, WY_minus_16, WY_minus_12, WY
		/* WY_TMP = w[i-3], w[i-2], w[i-1], 0 */
		vpsrldq	$4, WY_minus_04, WY_TMP
	.elseif ((i & 7) == 1)
		vpxor	WY_minus_08, WY, WY		/* ^ w[i-8]  */
		vpxor	WY_minus_16, WY_TMP, WY_TMP	/* ^ w[i-16] */
	.elseif ((i & 7) == 2)
		vpxor	WY_TMP, WY, WY			/* full xor, pre-rotation */
		/* lowest word moved to the top position for the fix-up */
		vpslldq	$12, WY, WY_TMP2
	.elseif ((i & 7) == 3)
		/* rol 1, first half */
		vpslld	$1, WY, WY_TMP
		vpsrld	$31, WY, WY
	.elseif ((i & 7) == 4)
		vpor	WY, WY_TMP, WY_TMP		/* rol 1 complete */
		vpslld	$2, WY_TMP2, WY			/* fix-up rol 2, low part */
	.elseif ((i & 7) == 5)
		vpsrld	$30, WY_TMP2, WY_TMP2		/* fix-up rol 2, high part */
		vpxor	WY, WY_TMP, WY_TMP
	.elseif ((i & 7) == 7)
		vpxor	WY_TMP2, WY_TMP, WY		/* WY = new w vector */
		vpaddd  K_XMM(K_BASE), WY, WY_TMP
		vmovdqu	WY_TMP, PRECALC_WK(i&~7)

		PRECALC_ROTATE_WY
	.endif
.endm
264
/*
 * PRECALC_32_79 - one step of the schedule precompute for w[32..79].
 *
 * Reads the assembler symbol i (64..159 here). Eight steps produce one
 * vector of four new words per lane using the equivalent recurrence
 *	w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
 * which has no dependency inside a vector of four words.
 * Step 6 emits nothing.
 */
.macro PRECALC_32_79
	.if   ((i & 7) == 0)
		/* WY_TMP = w[i-6..i-3] */
		vpalignr	$8, WY_minus_08, WY_minus_04, WY_TMP
	.elseif ((i & 7) == 1)
		/* WY still holds w[i-32..] at this point */
		vpxor	WY_minus_28, WY, WY
	.elseif ((i & 7) == 2)
		vpxor	WY_minus_16, WY_TMP, WY_TMP
	.elseif ((i & 7) == 3)
		vpxor	WY_TMP, WY, WY
	.elseif ((i & 7) == 4)
		/* rol 2, first half */
		vpslld	$2, WY, WY_TMP
	.elseif ((i & 7) == 5)
		vpsrld	$30, WY, WY
		vpor	WY, WY_TMP, WY			/* WY = new w vector */
	.elseif ((i & 7) == 7)
		vpaddd  K_XMM(K_BASE), WY, WY_TMP
		vmovdqu	WY_TMP, PRECALC_WK(i&~7)

		PRECALC_ROTATE_WY
	.endif
.endm
300
/*
 * PRECALC r, s - emit precompute step number r (0..159) of the message
 * schedule for the next pair of blocks.
 *
 *   r  step index; stored in the assembler symbol i for the sub-macros
 *   s  optional, forwarded unchanged to the sub-macro (the sub-macros in
 *      this file declare no parameters, so callers leave it empty)
 *
 * Also sets K_XMM, the byte offset into the K_XMM_AR table of the round
 * constant to add: one 32-byte entry per 40 steps. Steps >= 160 emit
 * nothing.
 */
.macro PRECALC r, s
	.set i, \r

	/* select the round constant */
	.if (i < 40)
		.set K_XMM, 32*0
	.elseif (i < 80)
		.set K_XMM, 32*1
	.elseif (i < 120)
		.set K_XMM, 32*2
	.else
		.set K_XMM, 32*3
	.endif

	/* select the recurrence */
	.if (i<32)
		PRECALC_00_15	\s
	.elseif (i<64)
		PRECALC_16_31	\s
	.elseif (i < 160)
		PRECALC_32_79	\s
	.endif
.endm
322
/*
 * ROTATE_STATE - rename the working registers after a round instead of
 * moving data.
 *
 * The bindings are rotated as  E<-D, D<-C, C<-B, B<-TB, TB<-A, A<-E(old),
 * first for the 32-bit names and then identically for the 64-bit names.
 * TA/RTA and T1 are not part of the rotation. Emits no instructions.
 */
.macro ROTATE_STATE
	/* 32-bit names */
	.set T_REG, E
	.set E, D
	.set D, C
	.set C, B
	.set B, TB
	.set TB, A
	.set A, T_REG

	/* 64-bit names, same permutation */
	.set T_REG, RE
	.set RE, RD
	.set RD, RC
	.set RC, RB
	.set RB, RTB
	.set RTB, RA
	.set RA, T_REG
.endm
340
341
342
/*
 * RND_FUN f, r - emit round r with the round macro selected by f.
 *
 *   f  RND_F1, RND_F2 or RND_F3 (RR passes its ROUND_FUNC symbol)
 *   r  round number, forwarded to ROUND_F1/2/3
 *
 * Any other selector value is a build error; without this check the macro
 * would silently emit no code for the round.
 */
.macro RND_FUN f, r
	.if (\f == RND_F1)
		ROUND_F1	\r
	.elseif (\f == RND_F2)
		ROUND_F2	\r
	.elseif (\f == RND_F3)
		ROUND_F3	\r
	.else
		.error "RND_FUN: unknown round function selector"
	.endif
.endm
352
/*
 * RR r - emit two consecutive rounds, r and r+1.
 *
 * r is the round number 0..159 (80..159 belong to the second block);
 * round_id = r % 80 is the position inside the block and is also read by
 * ROUND_F2.
 *
 * Every ROUND_Fx macro adds the F value computed by the previous round and
 * computes F for the following one. Therefore:
 *  - at round_id 0 the F value for the first round is computed here, into
 *    TB, and B is pre-rotated by 30 bits;
 *  - ROUND_FUNC is switched after the first round of the pairs starting at
 *    18, 38 and 58, so the new function is in effect from rounds 19, 39
 *    and 59 on, which compute F for rounds 20, 40 and 60.
 * ROUND_FUNC persists between invocations of RR.
 */
.macro RR r
	.set round_id, (\r % 80)

	.if (round_id == 0)
		/* Precalculate F for the first round */
		.set ROUND_FUNC, RND_F1
		mov	B, TB

		rorx	$(32-30), B, B		/* B = B rol 30 */
		andn	D, TB, T1		/* T1 = ~B & D */
		and	C, TB			/* TB = B & C */
		xor	T1, TB			/* TB = (B & C) ^ (~B & D) */
	.endif

	RND_FUN ROUND_FUNC, \r
	ROTATE_STATE

	.if   (round_id == 18)
		.set ROUND_FUNC, RND_F2
	.elseif (round_id == 38)
		.set ROUND_FUNC, RND_F3
	.elseif (round_id == 58)
		.set ROUND_FUNC, RND_F2
	.endif

	.set round_id, ( (\r+1) % 80)

	RND_FUN ROUND_FUNC, (\r+1)
	ROTATE_STATE
.endm
382
/*
 * ROUND_F1 r - one round whose next-round function is
 *	F1(b, c, d) = (b & c) ^ (~b & d)
 *
 * On entry TB holds F for this round (left there by the previous round or
 * by RR). The round adds W+K, F and (A rol 5) into E, leaves (A rol 30) in
 * TB and overwrites A with F for the next round; ROTATE_STATE then renames
 * the registers so that these values appear under the expected names.
 * The symbols A, B, C below therefore play the roles of b, c, d of the
 * next round. Clobbers TA, T1 and the flags. One PRECALC step is
 * interleaved.
 */
.macro ROUND_F1 r
	add	WK(\r), E		/* E += W[r] + K */

	andn	C, A, T1		/* T1 = ~b & d (next round) */
	lea	(RE,RTB), E		/* E += F from the previous round */

	rorx	$(32-5), A, TA		/* TA = A rol 5 */
	rorx	$(32-30),A, TB		/* TB = A rol 30, b of the next round */

	PRECALC	(\r)			/* schedule step for the next blocks */

	/* F for the next round: (b & c) ^ (~b & d) */
	and	B, A			/* b & c */
	xor	T1, A			/* (b & c) ^ (~b & d) */

	lea	(RE,RTA), E		/* E += A rol 5 */
.endm
403
/*
 * ROUND_F2 r - one round whose next-round function is
 *	F2(b, c, d) = b ^ c ^ d
 *
 * Same register contract as ROUND_F1: TB holds F for this round on entry;
 * on exit TB = A rol 30 and A = F for the next round. In the last round of
 * a block (round_id == 79, set by RR) there is no next round, so the
 * rotation into TB and the F computation are omitted.
 * Clobbers TA and the flags. One PRECALC step is interleaved.
 */
.macro ROUND_F2 r
	add	WK(\r), E		/* E += W[r] + K */
	lea	(RE,RTB), E		/* E += F from the previous round */

	/* Calculate F for the next round */
	rorx	$(32-5), A, TA		/* TA = A rol 5 */
	.if ((round_id) < 79)
		rorx	$(32-30), A, TB	/* TB = A rol 30, b of the next round */
	.endif
	PRECALC	(\r)			/* schedule step for the next blocks */

	.if ((round_id) < 79)
		xor	B, A		/* b ^ c */
	.endif

	add	TA, E			/* E += A rol 5 */

	.if ((round_id) < 79)
		xor	C, A		/* b ^ c ^ d */
	.endif
.endm
425
/*
 * ROUND_F3 r - one round whose next-round function is
 *	F3(b, c, d) = (b & c) | (d & (b | c))
 *
 * Same register contract as ROUND_F1: TB holds F for this round on entry;
 * on exit TB = A rol 30 and A = F for the next round.
 * Clobbers TA, T1 and the flags. One PRECALC step is interleaved.
 */
.macro ROUND_F3 r
	add	WK(\r), E		/* E += W[r] + K */
	PRECALC	(\r)			/* schedule step for the next blocks */

	lea	(RE,RTB), E		/* E += F from the previous round */

	mov	B, T1
	or	A, T1			/* T1 = b | c (next round) */

	rorx	$(32-5), A, TA		/* TA = A rol 5 */
	rorx	$(32-30), A, TB		/* TB = A rol 30, b of the next round */

	/* F for the next round: (b & c) | (d & (b | c)) */
	and	C, T1			/* d & (b | c) */
	and	B, A			/* b & c */
	or	T1, A

	add	TA, E			/* E += A rol 5 */

.endm
448
449
450
451
/*
 * BLOCKS_CTR - number of 64-byte input blocks that still have to be hashed.
 *
 * It occupies the register this file names BUFFER_END. The end-pointer
 * scheme that register used to serve let the schedule precompute load up to
 * three blocks past the end of the input; with a block counter the input
 * pointers are only ever advanced onto blocks that exist.
 */
#define BLOCKS_CTR	BUFFER_END

/*
 * ADD_IF_GE a, b, c, d - conditional pointer advance:
 *	if (b >= c) a += d;		(signed compare)
 *
 *   a  64-bit register to advance
 *   b  64-bit register compared against the immediate c
 *   c  immediate threshold
 *   d  immediate increment
 *
 * Clobbers RTA and the flags, so it may only be placed between rounds,
 * where the per-round temporary TA is dead. Must be used after REGALLOC
 * has defined RTA.
 */
.macro ADD_IF_GE a, b, c, d
	mov	\a, RTA
	add	$\d, RTA
	cmp	$\c, \b
	cmovge	RTA, \a
.endm

/*
 * SHA1_PIPELINED_MAIN_BODY - hash BLOCKS_CTR blocks, two per loop iteration,
 * with the message schedule of the next pair computed while the rounds of
 * the current pair execute.
 *
 * Expects, set up by SHA1_VECTOR_ASM:
 *   HASH_PTR      five 32-bit state words
 *   BUFFER_PTR    first input block
 *   BUFFER_PTR2   first input block (advanced here if a second one exists)
 *   BLOCKS_CTR    number of blocks
 *   K_BASE, YMM_SHUFB_BSWAP, and the stack scratch area at %rsp
 *
 * Pointer discipline: BUFFER_PTR feeds the low lane (1st, 3rd, ... block),
 * BUFFER_PTR2 the high lane (2nd, 4th, ...). A pointer is advanced only
 * when the block it would then address lies inside the input; otherwise it
 * stays on an earlier, valid block. The W+K values computed from such a
 * stale block are never consumed, because the loop leaves as soon as
 * BLOCKS_CTR reaches zero.
 *
 * The macro defines local labels and can therefore be expanded only once
 * per file.
 */
.macro SHA1_PIPELINED_MAIN_BODY

	REGALLOC

	mov	(HASH_PTR), A
	mov	4(HASH_PTR), B
	mov	8(HASH_PTR), C
	mov	12(HASH_PTR), D
	mov	16(HASH_PTR), E

	mov	%rsp, PRECALC_BUF
	lea	(2*4*80+32)(%rsp), WK_BUF

	/*
	 * PRECALC_00_15 adds PRECALC_OFFSET to its load displacements. The
	 * pointers themselves are advanced below, so the offset stays zero.
	 */
	PRECALC_OFFSET = 0

	/* Precalc W+K for the first two blocks (second one only if present) */
	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64
	.set i, 0
	.rept    160
		PRECALC i
		.set i, i + 1
	.endr

	/* Point at the 3rd and 4th block, each only if it exists */
	ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128
	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
	xchg	WK_BUF, PRECALC_BUF

	.align 32
.L_loop:
	/* One iteration handles up to two blocks; leave when none is left */
	test	BLOCKS_CTR, BLOCKS_CTR
	jnz	.L_begin
	.align 32
	jmp	.L_end
	.align 32
.L_begin:

	/*
	 * First block of the pair
	 * rounds: 0,2,4,6,8
	 */
	.set j, 0
	.rept 5
		RR	j
		.set j, j+2
	.endr

	/* jump to the next instruction, kept from the original code layout */
	jmp .L_loop0
.L_loop0:

	/*
	 * rounds:
	 * 10,12,14,16,18
	 * 20,22,24,26,28
	 * 30,32,34,36,38
	 * 40,42,44,46,48
	 * 50,52,54,56,58
	 */
	.rept 25
		RR	j
		.set j, j+2
	.endr

	/* This block is accounted for */
	sub	$1, BLOCKS_CTR
	/*
	 * All loads through BUFFER_PTR for this iteration are done (they
	 * happen in precalc steps 0..31). Advance by two blocks only if the
	 * target is inside the input, i.e. at least four blocks remain
	 * after the current one.
	 */
	ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128

	/*
	 * rounds
	 * 60,62,64,66,68
	 * 70,72,74,76,78
	 */
	.rept 10
		RR	j
		.set j, j+2
	.endr

	/* After 80 renamings the value of B lives in the register named TB */
	UPDATE_HASH	(HASH_PTR), A
	UPDATE_HASH	4(HASH_PTR), TB
	UPDATE_HASH	8(HASH_PTR), C
	UPDATE_HASH	12(HASH_PTR), D
	UPDATE_HASH	16(HASH_PTR), E

	/* Was that the last block? */
	test	BLOCKS_CTR, BLOCKS_CTR
	jz	.L_loop

	/* RR reads B when it precalculates F for round 0 */
	mov	TB, B

	/*
	 * Second block of the pair
	 * rounds
	 *  0+80, 2+80, 4+80, 6+80, 8+80
	 * 10+80,12+80,14+80,16+80,18+80
	 */
	.set j, 0
	.rept 10
		RR	j+80
		.set j, j+2
	.endr

	jmp	.L_loop1
.L_loop1:

	/*
	 * rounds
	 * 20+80,22+80,24+80,26+80,28+80
	 * 30+80,32+80,34+80,36+80,38+80
	 */
	.rept 10
		RR	j+80
		.set j, j+2
	.endr

	jmp	.L_loop2
.L_loop2:

	/*
	 * rounds
	 * 40+80,42+80,44+80,46+80,48+80
	 * 50+80,52+80,54+80,56+80,58+80
	 */
	.rept 10
		RR	j+80
		.set j, j+2
	.endr

	/* This block is accounted for */
	sub	$1, BLOCKS_CTR
	/* Same rule as for BUFFER_PTR above, for the high-lane pointer */
	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128

	jmp	.L_loop3
.L_loop3:

	/*
	 * rounds
	 * 60+80,62+80,64+80,66+80,68+80
	 * 70+80,72+80,74+80,76+80,78+80
	 */
	.rept 10
		RR	j+80
		.set j, j+2
	.endr

	UPDATE_HASH	(HASH_PTR), A
	UPDATE_HASH	4(HASH_PTR), TB
	UPDATE_HASH	8(HASH_PTR), C
	UPDATE_HASH	12(HASH_PTR), D
	UPDATE_HASH	16(HASH_PTR), E

	/*
	 * Move the state back into the registers REGALLOC assigns, so the
	 * next iteration starts from the initial symbol bindings.
	 */
	mov	A, TA
	mov	TB, A
	mov	C, TB
	mov	E, C
	mov	D, B
	mov	TA, D

	REGALLOC

	xchg	WK_BUF, PRECALC_BUF

	jmp	.L_loop

	.align 32
.L_end:

.endm




/*
 * SHA1_VECTOR_ASM name - emit the function \name.
 *
 * Register arguments of the emitted function:
 *   CTX (%rdi)  address of five 32-bit state words, updated in place
 *   BUF (%rsi)  input data, CNT consecutive 64-byte blocks
 *   CNT (%rdx)  number of 64-byte blocks, read as a 64-bit value
 *               NOTE(review): confirm that callers pass a value whose
 *               upper 32 bits are clean
 *
 * No bytes outside BUF[0 .. 64*CNT) are read, except that for CNT == 0 the
 * initial precompute still loads the 64 bytes at BUF; the state is left
 * unchanged in that case.
 *
 * Saves and restores %rbx, %rbp and %r12-%r15; uses RESERVE_STACK bytes of
 * stack below a 32-byte aligned address. The W+K stores use vmovdqu, so
 * the scratch area itself has no alignment requirement.
 */
.macro SHA1_VECTOR_ASM  name
	ENTRY(\name)

	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	RESERVE_STACK  = (W_SIZE*4 + 8+24)

	/* Align the stack, keeping the original %rsp on top of the new area */
	mov	%rsp, %rbx
	and	$~(0x20-1), %rsp
	push	%rbx
	sub	$RESERVE_STACK, %rsp

	avx2_zeroupper

	lea	K_XMM_AR(%rip), K_BASE

	/*
	 * Copy the arguments away: their registers are reused for the
	 * working state. Both pointers start at the first block; the main
	 * body advances them only onto blocks that exist.
	 */
	mov	CTX, HASH_PTR
	mov	BUF, BUFFER_PTR
	mov	BUF, BUFFER_PTR2
	mov	CNT, BLOCKS_CTR

	xmm_mov	BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP

	SHA1_PIPELINED_MAIN_BODY

	avx2_zeroupper

	add	$RESERVE_STACK, %rsp
	pop	%rsp			/* original, unaligned %rsp */

	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx

	ret

	ENDPROC(\name)
.endm
678
.section .rodata

/* The four SHA-1 round constants */
#define K1 0x5a827999
#define K2 0x6ed9eba1
#define K3 0x8f1bbcdc
#define K4 0xca62c1d6

/*
 * Each constant is replicated eight times (32 bytes) so that one vpaddd
 * covers both 128-bit lanes. PRECALC selects an entry with the byte offset
 * K_XMM = 32 * n.
 */
.align 128
K_XMM_AR:
	.long K1, K1, K1, K1
	.long K1, K1, K1, K1
	.long K2, K2, K2, K2
	.long K2, K2, K2, K2
	.long K3, K3, K3, K3
	.long K3, K3, K3, K3
	.long K4, K4, K4, K4
	.long K4, K4, K4, K4

/*
 * vpshufb control that reverses the byte order inside every 32-bit word,
 * given once per 128-bit lane. It directly follows the 256-byte K_XMM_AR
 * table.
 */
BSWAP_SHUFB_CTL:
	.long 0x00010203
	.long 0x04050607
	.long 0x08090a0b
	.long 0x0c0d0e0f
	.long 0x00010203
	.long 0x04050607
	.long 0x08090a0b
	.long 0x0c0d0e0f
.text

/* Instantiate the transform under its exported name */
SHA1_VECTOR_ASM     sha1_transform_avx2