1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54#include <linux/linkage.h>
55#include <asm/frame.h>
56#include "sha512_mb_mgr_datastruct.S"
57
/* External symbol; the submit routine in this file reaches it with a call instruction. */
58.extern sha512_x4_avx2
59
/* Incoming argument registers: the first two integer arguments under the System V AMD64 convention. */
60#define arg1 %rdi
61#define arg2 %rsi
62
/*
 * Register aliases. Several names below map onto the same physical
 * register, so at most one meaning of each register is live at a time.
 * Not referenced by the routine visible in this file section: last_len,
 * size_offset, tmp2, p2, start_offset, len, tmp3, extra_blocks, tmp.
 */
/* %rdx: idx is the lane index selected by the minimum search. */
63#define idx %rdx
64#define last_len %rdx
65
/* %rcx */
66#define size_offset %rcx
67#define tmp2 %rcx
68
69
/* Names layered on the argument registers: state is arg1 (%rdi); job, len2 and p2 all share arg2 (%rsi). */
70#define state arg1
71#define job arg2
72#define len2 arg2
73#define p2 arg2
74
/* %r11: p receives the buffer pointer read from the job. */
75#define p %r11
76#define start_offset %r11
77
/* %rbx (callee-saved): working copy of the unused-lanes word; its low byte is addressed as %bl. */
78#define unused_lanes %rbx
79
/* %rax: job_rax carries the return value. */
80#define job_rax %rax
81#define len %rax
82
/* %r12 (callee-saved): lane index while submitting, fourth lens entry during the minimum search. */
83#define lane %r12
84#define tmp3 %r12
85#define lens3 %r12
86
/* %r8: first lens entry during the minimum search. */
87#define extra_blocks %r8
88#define lens0 %r8
89
/* %r9: second lens entry during the minimum search. */
90#define tmp %r9
91#define lens1 %r9
92
/* %r10: per-lane data pointer, reused as the third lens entry during the minimum search. */
93#define lane_data %r10
94#define lens2 %r10
95
/* 32-bit view of %rax, used to move the 32-bit length field. */
96#define DWORD_len %eax
97
98
99
100
/*
 * sha512_mb_mgr_submit_avx2(state, job)
 *
 * In:   state = arg1 (%rdi), job = arg2 (%rsi).
 * Out:  job_rax (%rax) = 0 when the unused-lanes word, after one lane has
 *       been popped from it, is anything other than 0xFF; otherwise the
 *       job pointer found in the lane picked by the minimum search below,
 *       after sha512_x4_avx2 has been called (the call is skipped when the
 *       masked minimum is zero).
 * Regs: %rbx and %r12 are pushed on entry and popped on exit. The routine
 *       itself writes %rax, %rdx, %rsi, %r8-%r11 and %xmm0-%xmm3; anything
 *       the callee clobbers comes on top of that.
 *
 * Steps: pop a free lane, copy the job length, digest and buffer pointer
 * into the per-lane arrays inside state, then either return 0 or run the
 * multi-lane routine and hand back the job of the selected lane.
 */
101ENTRY(sha512_mb_mgr_submit_avx2)
/* FRAME_BEGIN / FRAME_END are macros taken from <asm/frame.h>; their expansion is not visible here. */
102 FRAME_BEGIN
103 push %rbx
104 push %r12
105
/*
 * Pop a free lane: the low byte of the unused-lanes word (%bl) becomes the
 * lane index and the word, shifted right by one byte, is stored back.
 * Interleaved with that: lane_data = state + _ldata + lane * _LANE_DATA_size,
 * the job status is set to STS_BEING_PROCESSED, and the 32-bit job length is
 * loaded into DWORD_len.
 */
106 mov _unused_lanes(state), unused_lanes
107 movzb %bl,lane
108 shr $8, unused_lanes
109 imul $_LANE_DATA_size, lane,lane_data
110 movl $STS_BEING_PROCESSED, _status(job)
111 lea _ldata(state, lane_data), lane_data
112 mov unused_lanes, _unused_lanes(state)
113 movl _len(job), DWORD_len
114
/* Bind the job to the lane, and store its length in the upper dword (+4) of the 8-byte lens entry of that lane. */
115 mov job, _job_in_lane(lane_data)
116 movl DWORD_len,_lens+4(state , lane, 8)
117
118
/* Load 64 bytes (eight 64-bit words) from the result digest of the job. */
119 vmovdqu _result_digest+0*16(job), %xmm0
120 vmovdqu _result_digest+1*16(job), %xmm1
121 vmovdqu _result_digest+2*16(job), %xmm2
122 vmovdqu _result_digest+3*16(job), %xmm3
123
/* Scatter them into the args digest: word k of lane L is written at _args_digest + 32 * k + 8 * L. */
124 vmovq %xmm0, _args_digest(state, lane, 8)
125 vpextrq $1, %xmm0, _args_digest+1*32(state , lane, 8)
126 vmovq %xmm1, _args_digest+2*32(state , lane, 8)
127 vpextrq $1, %xmm1, _args_digest+3*32(state , lane, 8)
128 vmovq %xmm2, _args_digest+4*32(state , lane, 8)
129 vpextrq $1, %xmm2, _args_digest+5*32(state , lane, 8)
130 vmovq %xmm3, _args_digest+6*32(state , lane, 8)
131 vpextrq $1, %xmm3, _args_digest+7*32(state , lane, 8)
132
/* Record the buffer pointer of the job as the data pointer of the lane. */
133 mov _buffer(job), p
134 mov p, _args_data_ptr(state, lane, 8)
135
/*
 * unused_lanes still holds the shifted word. Any value other than 0xFF
 * returns 0 immediately. NOTE(review): presumably 0xFF is the sentinel that
 * remains once every lane is occupied - confirm against the code that
 * initialises the word.
 */
136 cmp $0xFF, unused_lanes
137 jne return_null
138
/* This label is not referenced anywhere in the visible code. */
139start_loop:
140
141
/* Unsigned minimum (cmovb) of the four 64-bit lens entries, accumulated in idx. */
142 mov _lens+0*8(state),lens0
143 mov lens0,idx
144 mov _lens+1*8(state),lens1
145 cmp idx,lens1
146 cmovb lens1, idx
147 mov _lens+2*8(state),lens2
148 cmp idx,lens2
149 cmovb lens2,idx
150 mov _lens+3*8(state),lens3
151 cmp idx,lens3
152 cmovb lens3,idx
/*
 * Split the minimum: idx keeps its low 4 bits (used below as a lane index),
 * len2 keeps every bit above the low byte. The zero flag produced by the
 * second and feeds the jz, which skips the call when nothing is left.
 * Note that len2 shares %rsi with job, so the job argument is gone from here on.
 */
153 mov idx,len2
154 and $0xF,idx
155 and $~0xFF,len2
156 jz len_is_0
157
/*
 * Subtract the masked minimum from all four entries and write them back.
 * The shift in between moves the upper dword of len2 (the position where
 * the length was stored) down into the low bits.
 */
158 sub len2,lens0
159 sub len2,lens1
160 sub len2,lens2
161 sub len2,lens3
162 shr $32,len2
163 mov lens0, _lens + 0*8(state)
164 mov lens1, _lens + 1*8(state)
165 mov lens2, _lens + 2*8(state)
166 mov lens3, _lens + 3*8(state)
167
168
169
/*
 * At the call, state is still in %rdi and the shifted len2 is in %rsi.
 * The code after the call reads state (%rdi) and idx (%rdx) without
 * reloading them, so it relies on the callee leaving both intact; the
 * callee is not visible here - TODO confirm.
 */
170 call sha512_x4_avx2
171
172
173len_is_0:
174
175
/* Retire lane idx: locate its lane data (the old lane_data register was reused as lens2). */
176 imul $_LANE_DATA_size, idx, lane_data
177 lea _ldata(state, lane_data), lane_data
178
/*
 * Take the job pointer out of the lane as the return value, clear the
 * slot, mark the job STS_COMPLETED, and push idx back into the low byte
 * of the unused-lanes word.
 */
179 mov _job_in_lane(lane_data), job_rax
180 mov _unused_lanes(state), unused_lanes
181 movq $0, _job_in_lane(lane_data)
182 movl $STS_COMPLETED, _status(job_rax)
183 shl $8, unused_lanes
184 or idx, unused_lanes
185 mov unused_lanes, _unused_lanes(state)
186
/*
 * Set the upper dword of the lens entry of the retired lane to all ones.
 * NOTE(review): presumably so that an idle lane never wins the minimum
 * search above - confirm.
 */
187 movl $0xFFFFFFFF,_lens+4(state,idx,8)
/* Gather the eight digest words of lane idx from the args digest (same layout as the scatter above). */
188 vmovq _args_digest+0*32(state , idx, 8), %xmm0
189 vpinsrq $1, _args_digest+1*32(state , idx, 8), %xmm0, %xmm0
190 vmovq _args_digest+2*32(state , idx, 8), %xmm1
191 vpinsrq $1, _args_digest+3*32(state , idx, 8), %xmm1, %xmm1
192 vmovq _args_digest+4*32(state , idx, 8), %xmm2
193 vpinsrq $1, _args_digest+5*32(state , idx, 8), %xmm2, %xmm2
194 vmovq _args_digest+6*32(state , idx, 8), %xmm3
195 vpinsrq $1, _args_digest+7*32(state , idx, 8), %xmm3, %xmm3
196
/* Write the 64 digest bytes into the result digest of the returned job. */
197 vmovdqu %xmm0, _result_digest + 0*16(job_rax)
198 vmovdqu %xmm1, _result_digest + 1*16(job_rax)
199 vmovdqu %xmm2, _result_digest + 2*16(job_rax)
200 vmovdqu %xmm3, _result_digest + 3*16(job_rax)
201
/* Common exit: pop the saved registers in reverse push order. */
202return:
203 pop %r12
204 pop %rbx
205 FRAME_END
206 ret
207
/* Early exit taken from the 0xFF check: return 0 in job_rax through the common exit. */
208return_null:
209 xor job_rax, job_rax
210 jmp return
211ENDPROC(sha512_mb_mgr_submit_avx2)
212
213
214
215
216
217
218
219
220
221
222
223
224
225