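/*
 * SHA-1 multi-buffer manager, AVX2 variant: flush and get-completed-job
 * routines (sha1_mb_mgr_flush_avx2, sha1_mb_mgr_get_comp_job_avx2).
 */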
#include <linux/linkage.h>
#include <asm/frame.h>
#include "sha1_mb_mgr_datastruct.S"


.extern sha1_x8_avx2


# Linux x86_64 calling convention: arg1/arg2 arrive in %rdi/%rsi
#define arg1    %rdi
#define arg2    %rsi


# Common register definitions
#define state   arg1
#define job     arg2
#define len2    arg2


# idx must be a register not clobbered by sha1_x8_avx2
#define idx             %r8
#define DWORD_idx       %r8d

#define unused_lanes    %rbx
#define lane_data       %rbx
#define tmp2            %rbx
#define tmp2_w          %ebx

#define job_rax         %rax
#define tmp1            %rax
#define size_offset     %rax
#define tmp             %rax
#define start_offset    %rax

#define tmp3            arg1

#define extra_blocks    arg2
#define p               arg2

# Emit a label of the form <prefix><n>:
.macro LABEL prefix n
\prefix\n\():
.endm

.macro JNE_SKIP i
        jne     skip_\i
.endm

.altmacro
.macro SET_OFFSET _offset
        offset = \_offset
.endm
.noaltmacro

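# sha1_mb_mgr_flush_avx2(state)
# Returns the next completed job in %rax, or NULL if nothing can be flushed.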
ENTRY(sha1_mb_mgr_flush_avx2)
        FRAME_BEGIN
        push    %rbx

        # if bit 32+3 of unused_lanes is set, all lanes are empty
        mov     _unused_lanes(state), unused_lanes
        bt      $32+3, unused_lanes
        jc      return_null

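        # find a lane with a non-null job (default to lane 0)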
        xor     idx, idx
        offset = (_ldata + 1 * _LANE_DATA_size + _job_in_lane)
        cmpq    $0, offset(state)
        cmovne  one(%rip), idx
        offset = (_ldata + 2 * _LANE_DATA_size + _job_in_lane)
        cmpq    $0, offset(state)
        cmovne  two(%rip), idx
        offset = (_ldata + 3 * _LANE_DATA_size + _job_in_lane)
        cmpq    $0, offset(state)
        cmovne  three(%rip), idx
        offset = (_ldata + 4 * _LANE_DATA_size + _job_in_lane)
        cmpq    $0, offset(state)
        cmovne  four(%rip), idx
        offset = (_ldata + 5 * _LANE_DATA_size + _job_in_lane)
        cmpq    $0, offset(state)
        cmovne  five(%rip), idx
        offset = (_ldata + 6 * _LANE_DATA_size + _job_in_lane)
        cmpq    $0, offset(state)
        cmovne  six(%rip), idx
        offset = (_ldata + 7 * _LANE_DATA_size + _job_in_lane)
        cmpq    $0, offset(state)
        cmovne  seven(%rip), idx

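        # copy the chosen lane's data pointer into every empty lane and
        # mark the empty lanes' lengths as invalid (0xFFFFFFFF)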
copy_lane_data:
        offset = (_args + _data_ptr)
        mov     offset(state, idx, 8), tmp

        I = 0
.rep 8
        offset = (_ldata + I * _LANE_DATA_size + _job_in_lane)
        cmpq    $0, offset(state)
.altmacro
        JNE_SKIP %I
        offset = (_args + _data_ptr + 8*I)
        mov     tmp, offset(state)
        offset = (_lens + 4*I)
        movl    $0xFFFFFFFF, offset(state)
LABEL skip_ %I
        I = (I+1)
.noaltmacro
.endr

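        # find the minimum length; each len is (blocks << 4) | lane_index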
        vmovdqu _lens+0*16(state), %xmm0
        vmovdqu _lens+1*16(state), %xmm1

        vpminud %xmm1, %xmm0, %xmm2        # pairwise min of lanes 0-3 and 4-7
        vpalignr $8, %xmm2, %xmm3, %xmm3   # move high qword of xmm2 down
        vpminud %xmm3, %xmm2, %xmm2        # min of 4 candidates in low 2 dwords
        vpalignr $4, %xmm2, %xmm3, %xmm3   # move dword 1 down
        vpminud %xmm3, %xmm2, %xmm2        # overall min in the low dword

        vmovd   %xmm2, DWORD_idx
        mov     idx, len2
        and     $0xF, idx               # idx = lane with the minimum length
        shr     $4, len2                # len2 = minimum length in blocks
        jz      len_is_0                # min length 0: job is already complete

        vpand   clear_low_nibble(%rip), %xmm2, %xmm2  # clear the lane-index nibble
        vpshufd $0, %xmm2, %xmm2        # broadcast the min length to all dwords

        vpsubd  %xmm2, %xmm0, %xmm0     # subtract the common length from every lane
        vpsubd  %xmm2, %xmm1, %xmm1

        vmovdqu %xmm0, _lens+0*16(state)
        vmovdqu %xmm1, _lens+1*16(state)

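        # "state" and "args" share the same address (arg1); len2 is arg2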
        call    sha1_x8_avx2

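        # process the completed job in lane "idx"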
len_is_0:
        imul    $_LANE_DATA_size, idx, lane_data
        lea     _ldata(state, lane_data), lane_data

        mov     _job_in_lane(lane_data), job_rax
        movq    $0, _job_in_lane(lane_data)
        movl    $STS_COMPLETED, _status(job_rax)
        # return the lane to the unused_lanes pool
        mov     _unused_lanes(state), unused_lanes
        shl     $4, unused_lanes
        or      idx, unused_lanes
        mov     unused_lanes, _unused_lanes(state)

        movl    $0xFFFFFFFF, _lens(state, idx, 4)

        # gather the five transposed digest words for this lane
        vmovd   _args_digest(state, idx, 4), %xmm0
        vpinsrd $1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0
        vpinsrd $2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0
        vpinsrd $3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0
        movl    _args_digest+4*32(state, idx, 4), tmp2_w

        vmovdqu %xmm0, _result_digest(job_rax)
        offset = (_result_digest + 1*16)
        mov     tmp2_w, offset(job_rax)

return:
        pop     %rbx
        FRAME_END
        ret

return_null:
        xor     job_rax, job_rax
        jmp     return
ENDPROC(sha1_mb_mgr_flush_avx2)


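# sha1_mb_mgr_get_comp_job_avx2(state)
# Returns an already-completed job (length 0) in %rax, or NULL if none is ready.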
.align 16
ENTRY(sha1_mb_mgr_get_comp_job_avx2)
        push    %rbx

        # if bit 32+3 of unused_lanes is set, all lanes are empty
        mov     _unused_lanes(state), unused_lanes
        bt      $(32+3), unused_lanes
        jc      .return_null

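        # find the minimum length; only a length of zero counts as completed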
        vmovdqu _lens(state), %xmm0
        vmovdqu _lens+1*16(state), %xmm1

        vpminud %xmm1, %xmm0, %xmm2        # pairwise min of lanes 0-3 and 4-7
        vpalignr $8, %xmm2, %xmm3, %xmm3   # move high qword of xmm2 down
        vpminud %xmm3, %xmm2, %xmm2        # min of 4 candidates in low 2 dwords
        vpalignr $4, %xmm2, %xmm3, %xmm3   # move dword 1 down
        vpminud %xmm3, %xmm2, %xmm2        # overall min in the low dword

        vmovd   %xmm2, DWORD_idx
        test    $~0xF, idx              # a non-zero length means no job is complete
        jnz     .return_null

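        # process the completed job in lane "idx"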
        imul    $_LANE_DATA_size, idx, lane_data
        lea     _ldata(state, lane_data), lane_data

        mov     _job_in_lane(lane_data), job_rax
        movq    $0, _job_in_lane(lane_data)
        movl    $STS_COMPLETED, _status(job_rax)
        # return the lane to the unused_lanes pool
        mov     _unused_lanes(state), unused_lanes
        shl     $4, unused_lanes
        or      idx, unused_lanes
        mov     unused_lanes, _unused_lanes(state)

        movl    $0xFFFFFFFF, _lens(state, idx, 4)

        # gather the five transposed digest words for this lane
        vmovd   _args_digest(state, idx, 4), %xmm0
        vpinsrd $1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0
        vpinsrd $2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0
        vpinsrd $3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0
        movl    _args_digest+4*32(state, idx, 4), tmp2_w

        vmovdqu %xmm0, _result_digest(job_rax)
        movl    tmp2_w, _result_digest+1*16(job_rax)

        pop     %rbx

        ret

.return_null:
        xor     job_rax, job_rax
        pop     %rbx
        ret
ENDPROC(sha1_mb_mgr_get_comp_job_avx2)

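# clear_low_nibble masks off the lane-index nibble of a packed length value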
.section        .rodata.cst16.clear_low_nibble, "aM", @progbits, 16
.align 16
clear_low_nibble:
.octa   0x000000000000000000000000FFFFFFF0

.section        .rodata.cst8, "aM", @progbits, 8
.align 8
one:
.quad   1
two:
.quad   2
three:
.quad   3
four:
.quad   4
five:
.quad   5
six:
.quad   6
seven:
.quad   7