1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54#include <linux/linkage.h>
55#include <asm/frame.h>
56#include "sha256_mb_mgr_datastruct.S"
57
58.extern sha256_x8_avx2
59
60
/*
 * Register aliases used by the submit routine in this file.
 *
 * Several names deliberately map to the same physical register; only one
 * name per register can be live at a time.  Names ending up unused in this
 * file are kept so the table stays in one place.
 */

/* Incoming arguments */
arg1			= %rdi
arg2			= %rsi

#define state	arg1		/* first argument */
#define job	%rsi		/* same register as arg2 */
#define len2	arg2		/* reuses arg2 (%rsi) once job is no longer read */
#define p2	arg2

/* Scratch in %rcx / %rdx */
size_offset		= %rcx
tmp2			= %rcx
extra_blocks		= %rdx

/* %r8: lane index taken from the minimum-length search */
idx			= %r8
DWORD_idx		= %r8d
last_len		= %r8

/* %r9: general temporary */
tmp			= %r9
DWORD_tmp		= %r9d

/* %r10: address of the per-lane data record */
lane_data		= %r10

/* %r11: data pointer */
p			= %r11
start_offset		= %r11

/* %rax: job length on the way in, job pointer on the way out */
job_rax			= %rax
len			= %rax
DWORD_len		= %eax

/* %rbx and %r12 are pushed/popped by the submit routine around their use */
unused_lanes		= %rbx
BYTE_unused_lanes	= %bl

lane			= %r12
tmp3			= %r12
95
96
97
98
/*
 * sha256_mb_mgr_submit_avx2 - place one job on a free lane of the manager
 *
 * In:    state (arg1, %rdi) - manager state
 *        job   (%rsi)       - job being submitted
 * Out:   %rax = 0 when the remaining unused_lanes value is not exactly 0xF
 *               (presumably "lanes are still free" - confirm against the
 *               code that initialises unused_lanes);
 *        otherwise %rax = the job stored on the lane that held the smallest
 *               length, with its status set to STS_COMPLETED and its
 *               result_digest refreshed from the lane digest.
 * Saves: %rbx and %r12 are pushed on entry and popped before returning.
 * Uses:  %rax, %rsi, %r8, %r10, %r11, %xmm0-%xmm3, flags.
 *
 * NOTE(review): after the call to sha256_x8_avx2 the code keeps using
 * state (%rdi) and idx (%r8) without reloading them, so it relies on that
 * routine leaving both registers untouched - verify against its source.
 */
ENTRY(sha256_mb_mgr_submit_avx2)
	FRAME_BEGIN
	push	%rbx
	push	%r12

	/* Pop a lane number off the low nibble of unused_lanes. */
	mov	_unused_lanes(state), unused_lanes
	mov	unused_lanes, lane
	and	$0xF, lane
	shr	$4, unused_lanes

	/* lane_data = state + _ldata + lane * _LANE_DATA_size */
	imul	$_LANE_DATA_size, lane, lane_data
	lea	_ldata(state, lane_data), lane_data

	movl	$STS_BEING_PROCESSED, _status(job)
	mov	unused_lanes, _unused_lanes(state)
	movl	_len(job), DWORD_len

	mov	job, _job_in_lane(lane_data)

	/* lens[lane] = (len << 4) | lane, so the lane rides in the low nibble */
	shl	$4, len
	or	lane, len
	movl	DWORD_len, _lens(state, lane, 4)

	/*
	 * Scatter the eight digest dwords of the job into the lane column of
	 * args_digest: word i goes to _args_digest + i*32 + lane*4.
	 */
	vmovdqu	_result_digest(job), %xmm0
	vmovdqu	_result_digest+1*16(job), %xmm1
	vmovd	%xmm0, _args_digest(state, lane, 4)
	vpextrd	$1, %xmm0, _args_digest+1*32(state, lane, 4)
	vpextrd	$2, %xmm0, _args_digest+2*32(state, lane, 4)
	vpextrd	$3, %xmm0, _args_digest+3*32(state, lane, 4)
	vmovd	%xmm1, _args_digest+4*32(state, lane, 4)
	vpextrd	$1, %xmm1, _args_digest+5*32(state, lane, 4)
	vpextrd	$2, %xmm1, _args_digest+6*32(state, lane, 4)
	vpextrd	$3, %xmm1, _args_digest+7*32(state, lane, 4)

	/* Record the input buffer pointer of the job for this lane. */
	mov	_buffer(job), p
	mov	p, _args_data_ptr(state, lane, 8)

	/* Anything other than 0xF left in unused_lanes: return NULL. */
	cmp	$0xF, unused_lanes
	jne	.Lreturn_null

.Lstart_loop:
	/* Unsigned minimum over the eight dwords of lens[]. */
	vmovdqa	_lens(state), %xmm0
	vmovdqa	_lens+1*16(state), %xmm1

	vpminud	%xmm1, %xmm0, %xmm2		/* four pairwise minima */
	vpalignr $8, %xmm2, %xmm3, %xmm3	/* low qword = high qword of xmm2 */
	vpminud	%xmm3, %xmm2, %xmm2		/* two candidates in dwords 0-1 */
	vpalignr $4, %xmm2, %xmm3, %xmm3	/* low dword = dword 1 of xmm2 */
	vpminud	%xmm3, %xmm2, %xmm2		/* minimum now in the low dword */

	/* Split the minimum: idx = low nibble, len2 = value >> 4. */
	vmovd	%xmm2, DWORD_idx
	mov	idx, len2
	and	$0xF, idx
	shr	$4, len2			/* ZF set when the length is 0 */
	jz	.Llen_is_0

	/*
	 * Strip the lane nibble from the minimum, broadcast it, and subtract
	 * it from every lens[] entry.
	 */
	vpand	clear_low_nibble(%rip), %xmm2, %xmm2
	vpshufd	$0, %xmm2, %xmm2

	vpsubd	%xmm2, %xmm0, %xmm0
	vpsubd	%xmm2, %xmm1, %xmm1

	vmovdqa	%xmm0, _lens(state)
	vmovdqa	%xmm1, _lens+1*16(state)

	/* %rdi = state, %rsi = len2 (the minimum length) at this point. */
	call	sha256_x8_avx2

.Llen_is_0:
	/* Retire the job that sits on lane idx. */
	imul	$_LANE_DATA_size, idx, lane_data
	lea	_ldata(state, lane_data), lane_data

	mov	_job_in_lane(lane_data), job_rax
	mov	_unused_lanes(state), unused_lanes
	movq	$0, _job_in_lane(lane_data)
	movl	$STS_COMPLETED, _status(job_rax)

	/* Push idx back onto the unused_lanes nibble stack. */
	shl	$4, unused_lanes
	or	idx, unused_lanes
	mov	unused_lanes, _unused_lanes(state)

	/* Park the lane length at the maximum dword value. */
	movl	$0xFFFFFFFF, _lens(state, idx, 4)

	/* Gather the eight digest dwords of lane idx back into the job. */
	vmovd	_args_digest(state, idx, 4), %xmm0
	vpinsrd	$1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0
	vpinsrd	$2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0
	vpinsrd	$3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0
	vmovd	_args_digest+4*32(state, idx, 4), %xmm1
	vpinsrd	$1, _args_digest+5*32(state, idx, 4), %xmm1, %xmm1
	vpinsrd	$2, _args_digest+6*32(state, idx, 4), %xmm1, %xmm1
	vpinsrd	$3, _args_digest+7*32(state, idx, 4), %xmm1, %xmm1

	vmovdqu	%xmm0, _result_digest(job_rax)
	vmovdqu	%xmm1, _result_digest+1*16(job_rax)

.Lreturn:
	pop	%r12
	pop	%rbx
	FRAME_END
	ret

.Lreturn_null:
	/* No job to hand back: return 0 through the shared epilogue. */
	xor	job_rax, job_rax
	jmp	.Lreturn

ENDPROC(sha256_mb_mgr_submit_avx2)
210
/*
 * 128-bit mask whose low dword is 0xFFFFFFF0 and whose upper 96 bits are
 * zero.  The submit routine ANDs it with the minimum lens[] entry to clear
 * the lane nibble (bits 3:0) before broadcasting the length.
 *
 * The value is only ever read, so it lives in a read-only, mergeable
 * 16-byte-constant section rather than in writable .data.
 */
.section	.rodata.cst16.clear_low_nibble, "aM", @progbits, 16
.align 16
clear_low_nibble:
	.octa	0x000000000000000000000000FFFFFFF0
216