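/*
 * Multi-buffer SHA-1 block routine, eight lanes, AVX2.
 *
 * Runs the SHA-1 block transform over eight independent data streams in
 * parallel, one stream per 32-bit lane of each YMM register. The lane
 * digests and data pointers are taken from the argument structure laid
 * out in sha1_mb_mgr_datastruct.S.
 */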
#include <linux/linkage.h>
#include "sha1_mb_mgr_datastruct.S"
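
# TRANSPOSE8 r0 r1 r2 r3 r4 r5 r6 r7 t0 t1
# 8x8 32-bit transpose of the data held in r0..r7, using t0/t1 as scratch.
# On input, register j holds the eight dwords loaded from lane j; on output,
# register i holds dword i of each of the eight inputs, which is exactly the
# row layout the W[] buffer needs.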
.macro TRANSPOSE8 r0 r1 r2 r3 r4 r5 r6 r7 t0 t1
 # process the top half (r0..r3) using t0/t1 as scratch
 vshufps $0x44, \r1, \r0, \t0
 vshufps $0xEE, \r1, \r0, \r0
 vshufps $0x44, \r3, \r2, \t1
 vshufps $0xEE, \r3, \r2, \r2
 vshufps $0xDD, \t1, \t0, \r3
 vshufps $0x88, \r2, \r0, \r1
 vshufps $0xDD, \r2, \r0, \r0
 vshufps $0x88, \t1, \t0, \t0

 # process the bottom half (r4..r7), reusing \r2 in place of \t0
 vshufps $0x44, \r5, \r4, \r2
 vshufps $0xEE, \r5, \r4, \r4
 vshufps $0x44, \r7, \r6, \t1
 vshufps $0xEE, \r7, \r6, \r6
 vshufps $0xDD, \t1, \r2, \r7
 vshufps $0x88, \r6, \r4, \r5
 vshufps $0xDD, \r6, \r4, \r4
 vshufps $0x88, \t1, \r2, \t1

 # merge the 128-bit halves of the two intermediate sets
 vperm2f128 $0x13, \r1, \r5, \r6
 vperm2f128 $0x02, \r1, \r5, \r2
 vperm2f128 $0x13, \r3, \r7, \r5
 vperm2f128 $0x02, \r3, \r7, \r1
 vperm2f128 $0x13, \r0, \r4, \r7
 vperm2f128 $0x02, \r0, \r4, \r3
 vperm2f128 $0x13, \t0, \t1, \r4
 vperm2f128 $0x02, \t0, \t1, \r0

.endm
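
# SHA-1 round functions, applied to all eight lanes at once. \regF receives
# the result; \regT is scratch.
#   MAGIC_F0 (rounds  0-19): Ch(b,c,d)  = (b & c) | (~b & d) = ((c ^ d) & b) ^ d
#   MAGIC_F1 (rounds 20-39): Parity     = b ^ c ^ d
#   MAGIC_F2 (rounds 40-59): Maj(b,c,d) = (b & c) | ((b | c) & d)
#   MAGIC_F3 (rounds 60-79): same as MAGIC_F1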
.macro MAGIC_F0 regF regB regC regD regT
 vpxor \regD, \regC, \regF
 vpand \regB, \regF, \regF
 vpxor \regD, \regF, \regF
.endm

.macro MAGIC_F1 regF regB regC regD regT
 vpxor \regC, \regD, \regF
 vpxor \regB, \regF, \regF
.endm

.macro MAGIC_F2 regF regB regC regD regT
 vpor \regC, \regB, \regF
 vpand \regC, \regB, \regT
 vpand \regD, \regF, \regF
 vpor \regT, \regF, \regF
.endm

.macro MAGIC_F3 regF regB regC regD regT
 MAGIC_F1 \regF,\regB,\regC,\regD,\regT
.endm
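
# PROLD    reg, imm, tmp      : rotate reg left by imm bits, in place
# PROLD_nd reg, imm, tmp, src : reg = src rotated left by imm bits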
.macro PROLD reg imm tmp
 vpsrld $(32-\imm), \reg, \tmp
 vpslld $\imm, \reg, \reg
 vpor \tmp, \reg, \reg
.endm

.macro PROLD_nd reg imm tmp src
 vpsrld $(32-\imm), \src, \tmp
 vpslld $\imm, \src, \reg
 vpor \tmp, \reg, \reg
.endm
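
# One SHA-1 round for t = 0..15. W[t] is read straight from the transposed
# message block on the stack:
#   e += K + W[t] + rol(a,5) + F(b,c,d);  b = rol(b,30)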
.macro SHA1_STEP_00_15 regA regB regC regD regE regT regF memW immCNT MAGIC
 vpaddd \immCNT, \regE, \regE
 vpaddd \memW*32(%rsp), \regE, \regE
 PROLD_nd \regT, 5, \regF, \regA
 vpaddd \regT, \regE, \regE
 \MAGIC \regF, \regB, \regC, \regD, \regT
 PROLD \regB, 30, \regT
 vpaddd \regF, \regE, \regE
.endm
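
# One SHA-1 round for t = 16..79. The message schedule is generated on the
# fly in a 16-entry circular buffer on the stack (32 bytes per entry):
#   W[t] = rol(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1)
# followed by the same round update as SHA1_STEP_00_15. The W16/W15/W14
# register names are rotated by ROTATE_W after every round.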
.macro SHA1_STEP_16_79 regA regB regC regD regE regT regF memW immCNT MAGIC
 vpaddd \immCNT, \regE, \regE
 offset = ((\memW - 14) & 15) * 32
 vmovdqu offset(%rsp), W14
 vpxor W14, W16, W16
 offset = ((\memW - 8) & 15) * 32
 vpxor offset(%rsp), W16, W16
 offset = ((\memW - 3) & 15) * 32
 vpxor offset(%rsp), W16, W16
 vpsrld $(32-1), W16, \regF
 vpslld $1, W16, W16
 vpor W16, \regF, \regF

 ROTATE_W

 offset = ((\memW - 0) & 15) * 32
 vmovdqu \regF, offset(%rsp)
 vpaddd \regF, \regE, \regE
 PROLD_nd \regT, 5, \regF, \regA
 vpaddd \regT, \regE, \regE
 \MAGIC \regF,\regB,\regC,\regD,\regT
 PROLD \regB,30, \regT
 vpaddd \regF, \regE, \regE
.endm
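
# Stack frame: 16 W[] entries of 32 bytes each (one dword per lane). No
# separate YMM save area is reserved here, so YMM_SAVE is zero.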
YMM_SAVE = (15-15)*32
FRAMESZ = 32*16 + YMM_SAVE
_YMM = FRAMESZ - YMM_SAVE

#define VMOVPS vmovups

# IDX is the running byte offset into each lane's data; inp0-inp7 hold the
# eight per-lane input pointers (inp3-inp6 use callee-saved registers, which
# the prologue pushes). arg1/arg2 are the C arguments per the SysV x86-64
# ABI; RSP_SAVE preserves the original stack pointer across the alignment.
IDX = %rax
inp0 = %r9
inp1 = %r10
inp2 = %r11
inp3 = %r12
inp4 = %r13
inp5 = %r14
inp6 = %r15
inp7 = %rcx
arg1 = %rdi
arg2 = %rsi
RSP_SAVE = %rdx
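
# Round-state registers A..E, the byte-swap mask register F, and the scratch
# registers T0..T9 used while loading and transposing the input block. The
# names AA..EE (saved digest), TMP, FUN, K and W14..W16 alias F and T0..T9;
# the two groups are never live at the same time.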
A = %ymm0
B = %ymm1
C = %ymm2
D = %ymm3
E = %ymm4
F = %ymm5
T0 = %ymm6
T1 = %ymm7
T2 = %ymm8
T3 = %ymm9
T4 = %ymm10
T5 = %ymm11
T6 = %ymm12
T7 = %ymm13
T8 = %ymm14
T9 = %ymm15

AA = %ymm5
BB = %ymm6
CC = %ymm7
DD = %ymm8
EE = %ymm9
TMP = %ymm10
FUN = %ymm11
K = %ymm12
W14 = %ymm13
W15 = %ymm14
W16 = %ymm15

# Rotate the names of the round variables so the same step macro can be
# reused: the register that just received the new value (old E) becomes A.
.macro ROTATE_ARGS
 TMP_ = E
 E = D
 D = C
 C = B
 B = A
 A = TMP_
.endm

# Rotate the names of the message-schedule registers W16/W15/W14.
.macro ROTATE_W
TMP_ = W16
W16 = W15
W15 = W14
W14 = TMP_
.endm

# 8 lanes x 5 words x 4 bytes per digest word
#define DIGEST_SIZE (8*5*4)

.align 32
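
# sha1_x8_avx2(state, num_blocks)
# arg1 : pointer to the eight-lane state: the transposed digests at offset 0,
#        followed by the eight input data pointers at offset _data_ptr
# arg2 : number of 64-byte blocks to process from every lane (must be >= 1,
#        since the block count is only tested at the bottom of the loop)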
ENTRY(sha1_x8_avx2)

 # Save the callee-saved registers that will hold lane data pointers.
 push %r12
 push %r13
 push %r14
 push %r15

 # Keep the original stack pointer, then carve out a 32-byte aligned
 # frame for the 16-entry W[] buffer.
 mov %rsp, RSP_SAVE
 sub $FRAMESZ, %rsp
 and $~0x1F, %rsp

 # Load the transposed digest: row i holds word i (a..e) of all 8 lanes.
 vmovdqu 0*32(arg1), A
 vmovdqu 1*32(arg1), B
 vmovdqu 2*32(arg1), C
 vmovdqu 3*32(arg1), D
 vmovdqu 4*32(arg1), E

 # Load the eight per-lane input data pointers.
 mov _data_ptr+0*8(arg1),inp0
 mov _data_ptr+1*8(arg1),inp1
 mov _data_ptr+2*8(arg1),inp2
 mov _data_ptr+3*8(arg1),inp3
 mov _data_ptr+4*8(arg1),inp4
 mov _data_ptr+5*8(arg1),inp5
 mov _data_ptr+6*8(arg1),inp6
 mov _data_ptr+7*8(arg1),inp7

 xor IDX, IDX
lloop:
 vmovdqu PSHUFFLE_BYTE_FLIP_MASK(%rip), F

 # Fetch 32 bytes from each lane, transpose so each row holds the same
 # message word of all eight lanes, byte-swap to big-endian and store the
 # result as W[0..15] on the stack (two passes of eight rows each).
 I=0
.rep 2
 VMOVPS (inp0, IDX), T0
 VMOVPS (inp1, IDX), T1
 VMOVPS (inp2, IDX), T2
 VMOVPS (inp3, IDX), T3
 VMOVPS (inp4, IDX), T4
 VMOVPS (inp5, IDX), T5
 VMOVPS (inp6, IDX), T6
 VMOVPS (inp7, IDX), T7

 TRANSPOSE8 T0, T1, T2, T3, T4, T5, T6, T7, T8, T9
 vpshufb F, T0, T0
 vmovdqu T0, (I*8)*32(%rsp)
 vpshufb F, T1, T1
 vmovdqu T1, (I*8+1)*32(%rsp)
 vpshufb F, T2, T2
 vmovdqu T2, (I*8+2)*32(%rsp)
 vpshufb F, T3, T3
 vmovdqu T3, (I*8+3)*32(%rsp)
 vpshufb F, T4, T4
 vmovdqu T4, (I*8+4)*32(%rsp)
 vpshufb F, T5, T5
 vmovdqu T5, (I*8+5)*32(%rsp)
 vpshufb F, T6, T6
 vmovdqu T6, (I*8+6)*32(%rsp)
 vpshufb F, T7, T7
 vmovdqu T7, (I*8+7)*32(%rsp)
 add $32, IDX
 I = (I+1)
.endr

 # Save the incoming digest for the feed-forward addition after round 79.
 vmovdqu A,AA
 vmovdqu B,BB
 vmovdqu C,CC
 vmovdqu D,DD
 vmovdqu E,EE

 # Rounds 0-19: K00_19 with F0 (Ch). The first 16 rounds take W directly
 # from the block; rounds 16-19 already use the W recurrence.
 vmovdqu K00_19(%rip), K

 I = 0
.rep 16
 SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
 ROTATE_ARGS
 I = (I+1)
.endr

 # Prime W16/W15 with W[0] and W[1] for the rolling message schedule.
 vmovdqu ((16 - 16) & 15) * 32 (%rsp), W16
 vmovdqu ((16 - 15) & 15) * 32 (%rsp), W15
.rep 4
 SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
 ROTATE_ARGS
 I = (I+1)
.endr

 # Rounds 20-39: K20_39 with F1 (parity).
 vmovdqu K20_39(%rip), K
.rep 20
 SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1
 ROTATE_ARGS
 I = (I+1)
.endr

 # Rounds 40-59: K40_59 with F2 (Maj).
 vmovdqu K40_59(%rip), K
.rep 20
 SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2
 ROTATE_ARGS
 I = (I+1)
.endr

 # Rounds 60-79: K60_79 with F3 (parity again).
 vmovdqu K60_79(%rip), K
.rep 20
 SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3
 ROTATE_ARGS
 I = (I+1)
.endr

 # Feed-forward: add the saved input digest to the round output.
 vpaddd AA,A,A
 vpaddd BB,B,B
 vpaddd CC,C,C
 vpaddd DD,D,D
 vpaddd EE,E,E

 # Loop until the requested number of blocks has been processed.
 sub $1, arg2
 jne lloop

 # Write the updated transposed digests back to the state.
 vmovdqu A, 0*32(arg1)
 vmovdqu B, 1*32(arg1)
 vmovdqu C, 2*32(arg1)
 vmovdqu D, 3*32(arg1)
 vmovdqu E, 4*32(arg1)

 # Advance each lane's data pointer past the blocks just consumed and
 # store the updated pointers back into the state.
 add IDX, inp0
 add IDX, inp1
 add IDX, inp2
 add IDX, inp3
 add IDX, inp4
 add IDX, inp5
 add IDX, inp6
 add IDX, inp7
 mov inp0, _data_ptr (arg1)
 mov inp1, _data_ptr + 1*8(arg1)
 mov inp2, _data_ptr + 2*8(arg1)
 mov inp3, _data_ptr + 3*8(arg1)
 mov inp4, _data_ptr + 4*8(arg1)
 mov inp5, _data_ptr + 5*8(arg1)
 mov inp6, _data_ptr + 6*8(arg1)
 mov inp7, _data_ptr + 7*8(arg1)

 # Restore the original stack pointer and the callee-saved registers.
 mov RSP_SAVE, %rsp

 pop %r15
 pop %r14
 pop %r13
 pop %r12

 ret
ENDPROC(sha1_x8_avx2)

.data

# SHA-1 round constants, replicated into all eight 32-bit lanes, and the
# byte-shuffle mask used to convert each input word to big-endian.
.align 32
K00_19:
.octa 0x5A8279995A8279995A8279995A827999
.octa 0x5A8279995A8279995A8279995A827999
K20_39:
.octa 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
.octa 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
K40_59:
.octa 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
.octa 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
K60_79:
.octa 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
.octa 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
PSHUFFLE_BYTE_FLIP_MASK:
.octa 0x0c0d0e0f08090a0b0405060700010203
.octa 0x0c0d0e0f08090a0b0405060700010203