1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54#include <linux/linkage.h>
55#include <asm/frame.h>
56#include "sha512_mb_mgr_datastruct.S"
57
58.extern sha512_x4_avx2
59
60
/*
 * Register aliases, grouped by the physical register they expand to.
 * Aliases that share a register cannot be live at the same time.
 */

/* %rdi - first argument */
#define arg1		%rdi
#define state		arg1
#define tmp3		arg1

/* %rsi - second argument */
#define arg2		%rsi
#define job		arg2
#define len2		arg2
#define extra_blocks	arg2
#define p		arg2

/* %rdx - lane index / smallest lens word */
#define idx		%rdx

/* %rbx - pushed and popped by every function in this file that uses it */
#define unused_lanes	%rbx
#define lane_data	%rbx
#define tmp2		%rbx

/* %rax - return value and scratch */
#define job_rax		%rax
#define tmp1		%rax
#define size_offset	%rax
#define tmp		%rax
#define start_offset	%rax

/* %r8 - %r11 - one lens word per lane */
#define tmp4		%r8
#define lens0		%r8
#define lens1		%r9
#define lens2		%r10
#define lens3		%r11
93
/* LABEL prefix n: define the label <prefix><n> at the current location. */
.macro LABEL prefix n
\prefix\n\():
.endm

/* JNE_SKIP i: jump to the label skip_<i> when the zero flag is clear. */
.macro JNE_SKIP i
	jne	skip_\i
.endm

/*
 * SET_OFFSET _offset: assign the given expression to the assembler symbol
 * "offset".  The definition is bracketed by .altmacro / .noaltmacro.
 */
.altmacro
.macro SET_OFFSET _offset
	offset = \_offset
.endm
.noaltmacro
107
108
109
/*
 * sha512_mb_mgr_flush_avx2
 *
 * In:	state (%rdi) - base address for the _unused_lanes, _ldata, _args,
 *	_lens and _args_digest offsets.
 * Out:	job_rax (%rax) - the job_in_lane pointer taken from the retired
 *	lane, or 0 when bit 32+7 of the unused_lanes word is set on entry.
 *
 * %rbx is saved and restored here.  %rax, %rdx, %rsi, %r8-%r11 and
 * %xmm0-%xmm3 are overwritten.  state (%rdi) and idx (%rdx) are used again
 * after the call to sha512_x4_avx2, so that routine has to leave both
 * untouched.
 */
ENTRY(sha512_mb_mgr_flush_avx2)
	FRAME_BEGIN
	push	%rbx

	/* Bit 32+7 of unused_lanes set: return 0 without changing state. */
	mov	_unused_lanes(state), unused_lanes
	bt	$32+7, unused_lanes
	jc	.Lflush_return_null

	/*
	 * Pick a lane whose job_in_lane is non-zero.  Start with lane 0 and
	 * let each occupied lane 1..3 replace the choice, so the highest
	 * occupied lane wins.  cmov takes no immediate operand, hence the
	 * in-memory constants one/two/three.
	 */
	xor	idx, idx
	cmpq	$0, _ldata + 1*_LANE_DATA_size + _job_in_lane(state)
	cmovne	one(%rip), idx
	cmpq	$0, _ldata + 2*_LANE_DATA_size + _job_in_lane(state)
	cmovne	two(%rip), idx
	cmpq	$0, _ldata + 3*_LANE_DATA_size + _job_in_lane(state)
	cmovne	three(%rip), idx

	/*
	 * For every lane with no job: copy the data pointer of the chosen
	 * lane into it and set the upper dword of its lens word to
	 * 0xFFFFFFFF, so it compares high in the unsigned minimum search
	 * below.
	 */
	mov	_args + _data_ptr(state, idx, 8), tmp

	I = 0
.rep 4
	cmpq	$0, _ldata + I * _LANE_DATA_size + _job_in_lane(state)
	jne	1f
	mov	tmp, _args + _data_ptr + 8*I(state)
	movl	$0xFFFFFFFF, _lens + 8*I + 4(state)
1:
	I = (I+1)
.endr

	/* Load the four lens words, then take their unsigned minimum in idx. */
	mov	_lens + 0*8(state), lens0
	mov	_lens + 1*8(state), lens1
	mov	_lens + 2*8(state), lens2
	mov	_lens + 3*8(state), lens3

	mov	lens0, idx
	cmp	idx, lens1
	cmovb	lens1, idx
	cmp	idx, lens2
	cmovb	lens2, idx
	cmp	idx, lens3
	cmovb	lens3, idx

	/*
	 * Split the minimum: the low 4 bits stay in idx and are used as the
	 * lane index from here on; len2 keeps everything above the low
	 * 8 bits.  A zero len2 means there is nothing to hash.
	 */
	mov	idx, len2
	and	$0xF, idx
	and	$~0xFF, len2
	jz	.Lflush_len_is_0

	/* Take len2 off every lane and write the lens words back. */
	sub	len2, lens0
	mov	lens0, _lens + 0*8(state)
	sub	len2, lens1
	mov	lens1, _lens + 1*8(state)
	sub	len2, lens2
	mov	lens2, _lens + 2*8(state)
	sub	len2, lens3
	mov	lens3, _lens + 3*8(state)

	/*
	 * arg1 is still state (%rdi); arg2 (%rsi) becomes the upper 32 bits
	 * of the minimum.
	 */
	shr	$32, len2
	call	sha512_x4_avx2

.Lflush_len_is_0:
	/* lane_data = state + _ldata + idx * _LANE_DATA_size */
	imul	$_LANE_DATA_size, idx, lane_data
	lea	_ldata(state, lane_data), lane_data

	/* Detach the job from its lane and mark it completed. */
	mov	_job_in_lane(lane_data), job_rax
	movq	$0, _job_in_lane(lane_data)
	movl	$STS_COMPLETED, _status(job_rax)

	/* Shift the lane index into the low byte of unused_lanes (%rbx reused). */
	mov	_unused_lanes(state), unused_lanes
	shl	$8, unused_lanes
	or	idx, unused_lanes
	mov	unused_lanes, _unused_lanes(state)

	/* Upper dword of the retired lane's lens word = 0xFFFFFFFF. */
	movl	$0xFFFFFFFF, _lens+4(state, idx, 8)

	/*
	 * Gather the eight 64-bit digest words of lane idx.  Consecutive
	 * words are read 32 bytes apart and packed two per xmm register.
	 */
	vmovq	_args_digest+0*32(state, idx, 8), %xmm0
	vpinsrq	$1, _args_digest+1*32(state, idx, 8), %xmm0, %xmm0
	vmovq	_args_digest+2*32(state, idx, 8), %xmm1
	vpinsrq	$1, _args_digest+3*32(state, idx, 8), %xmm1, %xmm1
	vmovq	_args_digest+4*32(state, idx, 8), %xmm2
	vpinsrq	$1, _args_digest+5*32(state, idx, 8), %xmm2, %xmm2
	vmovq	_args_digest+6*32(state, idx, 8), %xmm3
	vpinsrq	$1, _args_digest+7*32(state, idx, 8), %xmm3, %xmm3

	/* Store them contiguously (4 x 16 bytes) into the job result_digest. */
	vmovdqu	%xmm0, _result_digest+0*16(job_rax)
	vmovdqu	%xmm1, _result_digest+1*16(job_rax)
	vmovdqu	%xmm2, _result_digest+2*16(job_rax)
	vmovdqu	%xmm3, _result_digest+3*16(job_rax)

.Lflush_return:
	pop	%rbx
	FRAME_END
	ret

.Lflush_return_null:
	xor	job_rax, job_rax
	jmp	.Lflush_return
ENDPROC(sha512_mb_mgr_flush_avx2)
.align 16

/*
 * sha512_mb_mgr_get_comp_job_avx2
 *
 * In:	state (%rdi) - base address for the _unused_lanes, _ldata, _lens
 *	and _args_digest offsets.
 * Out:	job_rax (%rax) - the job_in_lane pointer of the lane whose lens
 *	word is the unsigned minimum, provided that word has no bit set
 *	above bit 3.  Otherwise 0.  Also 0 when bit 32+7 of the
 *	unused_lanes word is set.
 *
 * This routine makes no calls.  %rbx is saved and restored; %rax, %rdx,
 * %r8-%r11 and %xmm0-%xmm3 are overwritten.
 */
ENTRY(sha512_mb_mgr_get_comp_job_avx2)
	push	%rbx

	/* Bit 32+7 of unused_lanes set: return 0. */
	mov	_unused_lanes(state), unused_lanes
	bt	$(32+7), unused_lanes
	jc	.Lget_comp_return_null

	/* Load the four lens words, then take their unsigned minimum in idx. */
	mov	_lens + 0*8(state), lens0
	mov	_lens + 1*8(state), lens1
	mov	_lens + 2*8(state), lens2
	mov	_lens + 3*8(state), lens3

	mov	lens0, idx
	cmp	idx, lens1
	cmovb	lens1, idx
	cmp	idx, lens2
	cmovb	lens2, idx
	cmp	idx, lens3
	cmovb	lens3, idx

	/*
	 * Any bit above the low 4 set in the minimum: return 0.  Otherwise
	 * the low 4 bits are used as the lane index.
	 */
	test	$~0xF, idx
	jnz	.Lget_comp_return_null
	and	$0xF, idx

	/* lane_data = state + _ldata + idx * _LANE_DATA_size */
	imul	$_LANE_DATA_size, idx, lane_data
	lea	_ldata(state, lane_data), lane_data

	/* Detach the job from its lane and mark it completed. */
	mov	_job_in_lane(lane_data), job_rax
	movq	$0, _job_in_lane(lane_data)
	movl	$STS_COMPLETED, _status(job_rax)

	/* Shift the lane index into the low byte of unused_lanes (%rbx reused). */
	mov	_unused_lanes(state), unused_lanes
	shl	$8, unused_lanes
	or	idx, unused_lanes
	mov	unused_lanes, _unused_lanes(state)

	/* Upper dword of the retired lane's lens word = 0xFFFFFFFF. */
	movl	$0xFFFFFFFF, _lens+4(state, idx, 8)

	/*
	 * Gather the eight 64-bit digest words of lane idx.  Consecutive
	 * words are read 32 bytes apart and packed two per xmm register.
	 */
	vmovq	_args_digest+0*32(state, idx, 8), %xmm0
	vpinsrq	$1, _args_digest+1*32(state, idx, 8), %xmm0, %xmm0
	vmovq	_args_digest+2*32(state, idx, 8), %xmm1
	vpinsrq	$1, _args_digest+3*32(state, idx, 8), %xmm1, %xmm1
	vmovq	_args_digest+4*32(state, idx, 8), %xmm2
	vpinsrq	$1, _args_digest+5*32(state, idx, 8), %xmm2, %xmm2
	vmovq	_args_digest+6*32(state, idx, 8), %xmm3
	vpinsrq	$1, _args_digest+7*32(state, idx, 8), %xmm3, %xmm3

	/* Store them contiguously (4 x 16 bytes) into the job result_digest. */
	vmovdqu	%xmm0, _result_digest+0*16(job_rax)
	vmovdqu	%xmm1, _result_digest+1*16(job_rax)
	vmovdqu	%xmm2, _result_digest+2*16(job_rax)
	vmovdqu	%xmm3, _result_digest+3*16(job_rax)

	pop	%rbx
	ret

.Lget_comp_return_null:
	xor	job_rax, job_rax
	pop	%rbx
	ret
ENDPROC(sha512_mb_mgr_get_comp_job_avx2)
/*
 * Lane-index constants 1, 2 and 3.  The code in this file reads them
 * RIP-relative as cmovne memory operands (cmov has no immediate form) and
 * never writes them, so they belong in read-only data rather than in the
 * writable .data section.
 */
.section	.rodata, "a", @progbits

.align 16
one:
.quad	1
two:
.quad	2
three:
.quad	3
292