/*
 * Poly1305 authenticator (RFC 7539) — x86_64 AVX2 four-block accumulation.
 *
 * NOTE(review): the original file header / license banner was lost in
 * extraction (only stray line numbers remained) — restore it from the
 * upstream source before committing.
 */
12#include <linux/linkage.h>
13
14.section .rodata.cst32.ANMASK, "aM", @progbits, 32
15.align 32
16ANMASK: .octa 0x0000000003ffffff0000000003ffffff
17 .octa 0x0000000003ffffff0000000003ffffff
18
19.section .rodata.cst32.ORMASK, "aM", @progbits, 32
20.align 32
21ORMASK: .octa 0x00000000010000000000000001000000
22 .octa 0x00000000010000000000000001000000
23
24.text
25
26#define h0 0x00(%rdi)
27#define h1 0x04(%rdi)
28#define h2 0x08(%rdi)
29#define h3 0x0c(%rdi)
30#define h4 0x10(%rdi)
31#define r0 0x00(%rdx)
32#define r1 0x04(%rdx)
33#define r2 0x08(%rdx)
34#define r3 0x0c(%rdx)
35#define r4 0x10(%rdx)
36#define u0 0x00(%r8)
37#define u1 0x04(%r8)
38#define u2 0x08(%r8)
39#define u3 0x0c(%r8)
40#define u4 0x10(%r8)
41#define w0 0x14(%r8)
42#define w1 0x18(%r8)
43#define w2 0x1c(%r8)
44#define w3 0x20(%r8)
45#define w4 0x24(%r8)
46#define y0 0x28(%r8)
47#define y1 0x2c(%r8)
48#define y2 0x30(%r8)
49#define y3 0x34(%r8)
50#define y4 0x38(%r8)
51#define m %rsi
52#define hc0 %ymm0
53#define hc1 %ymm1
54#define hc2 %ymm2
55#define hc3 %ymm3
56#define hc4 %ymm4
57#define hc0x %xmm0
58#define hc1x %xmm1
59#define hc2x %xmm2
60#define hc3x %xmm3
61#define hc4x %xmm4
62#define t1 %ymm5
63#define t2 %ymm6
64#define t1x %xmm5
65#define t2x %xmm6
66#define ruwy0 %ymm7
67#define ruwy1 %ymm8
68#define ruwy2 %ymm9
69#define ruwy3 %ymm10
70#define ruwy4 %ymm11
71#define ruwy0x %xmm7
72#define ruwy1x %xmm8
73#define ruwy2x %xmm9
74#define ruwy3x %xmm10
75#define ruwy4x %xmm11
76#define svxz1 %ymm12
77#define svxz2 %ymm13
78#define svxz3 %ymm14
79#define svxz4 %ymm15
80#define d0 %r9
81#define d1 %r10
82#define d2 %r11
83#define d3 %r12
84#define d4 %r13
85
86ENTRY(poly1305_4block_avx2)
87
88
89
90
91
92
93
94
95
96
97 vzeroupper
98 push %rbx
99 push %r12
100 push %r13
101
102
103 vmovd y0,ruwy0x
104 vmovd w0,t1x
105 vpunpcklqdq t1,ruwy0,ruwy0
106 vmovd u0,t1x
107 vmovd r0,t2x
108 vpunpcklqdq t2,t1,t1
109 vperm2i128 $0x20,t1,ruwy0,ruwy0
110
111
112 vmovd y1,ruwy1x
113 vmovd w1,t1x
114 vpunpcklqdq t1,ruwy1,ruwy1
115 vmovd u1,t1x
116 vmovd r1,t2x
117 vpunpcklqdq t2,t1,t1
118 vperm2i128 $0x20,t1,ruwy1,ruwy1
119 vpslld $2,ruwy1,svxz1
120 vpaddd ruwy1,svxz1,svxz1
121
122
123 vmovd y2,ruwy2x
124 vmovd w2,t1x
125 vpunpcklqdq t1,ruwy2,ruwy2
126 vmovd u2,t1x
127 vmovd r2,t2x
128 vpunpcklqdq t2,t1,t1
129 vperm2i128 $0x20,t1,ruwy2,ruwy2
130 vpslld $2,ruwy2,svxz2
131 vpaddd ruwy2,svxz2,svxz2
132
133
134 vmovd y3,ruwy3x
135 vmovd w3,t1x
136 vpunpcklqdq t1,ruwy3,ruwy3
137 vmovd u3,t1x
138 vmovd r3,t2x
139 vpunpcklqdq t2,t1,t1
140 vperm2i128 $0x20,t1,ruwy3,ruwy3
141 vpslld $2,ruwy3,svxz3
142 vpaddd ruwy3,svxz3,svxz3
143
144
145 vmovd y4,ruwy4x
146 vmovd w4,t1x
147 vpunpcklqdq t1,ruwy4,ruwy4
148 vmovd u4,t1x
149 vmovd r4,t2x
150 vpunpcklqdq t2,t1,t1
151 vperm2i128 $0x20,t1,ruwy4,ruwy4
152 vpslld $2,ruwy4,svxz4
153 vpaddd ruwy4,svxz4,svxz4
154
155.Ldoblock4:
156
157
158 vmovd 0x00(m),hc0x
159 vmovd 0x10(m),t1x
160 vpunpcklqdq t1,hc0,hc0
161 vmovd 0x20(m),t1x
162 vmovd 0x30(m),t2x
163 vpunpcklqdq t2,t1,t1
164 vperm2i128 $0x20,t1,hc0,hc0
165 vpand ANMASK(%rip),hc0,hc0
166 vmovd h0,t1x
167 vpaddd t1,hc0,hc0
168
169
170 vmovd 0x03(m),hc1x
171 vmovd 0x13(m),t1x
172 vpunpcklqdq t1,hc1,hc1
173 vmovd 0x23(m),t1x
174 vmovd 0x33(m),t2x
175 vpunpcklqdq t2,t1,t1
176 vperm2i128 $0x20,t1,hc1,hc1
177 vpsrld $2,hc1,hc1
178 vpand ANMASK(%rip),hc1,hc1
179 vmovd h1,t1x
180 vpaddd t1,hc1,hc1
181
182
183 vmovd 0x06(m),hc2x
184 vmovd 0x16(m),t1x
185 vpunpcklqdq t1,hc2,hc2
186 vmovd 0x26(m),t1x
187 vmovd 0x36(m),t2x
188 vpunpcklqdq t2,t1,t1
189 vperm2i128 $0x20,t1,hc2,hc2
190 vpsrld $4,hc2,hc2
191 vpand ANMASK(%rip),hc2,hc2
192 vmovd h2,t1x
193 vpaddd t1,hc2,hc2
194
195
196 vmovd 0x09(m),hc3x
197 vmovd 0x19(m),t1x
198 vpunpcklqdq t1,hc3,hc3
199 vmovd 0x29(m),t1x
200 vmovd 0x39(m),t2x
201 vpunpcklqdq t2,t1,t1
202 vperm2i128 $0x20,t1,hc3,hc3
203 vpsrld $6,hc3,hc3
204 vpand ANMASK(%rip),hc3,hc3
205 vmovd h3,t1x
206 vpaddd t1,hc3,hc3
207
208
209 vmovd 0x0c(m),hc4x
210 vmovd 0x1c(m),t1x
211 vpunpcklqdq t1,hc4,hc4
212 vmovd 0x2c(m),t1x
213 vmovd 0x3c(m),t2x
214 vpunpcklqdq t2,t1,t1
215 vperm2i128 $0x20,t1,hc4,hc4
216 vpsrld $8,hc4,hc4
217 vpor ORMASK(%rip),hc4,hc4
218 vmovd h4,t1x
219 vpaddd t1,hc4,hc4
220
221
222 vpmuludq hc0,ruwy0,t1
223
224 vpmuludq hc1,svxz4,t2
225 vpaddq t2,t1,t1
226
227 vpmuludq hc2,svxz3,t2
228 vpaddq t2,t1,t1
229
230 vpmuludq hc3,svxz2,t2
231 vpaddq t2,t1,t1
232
233 vpmuludq hc4,svxz1,t2
234 vpaddq t2,t1,t1
235
236 vpermq $0xee,t1,t2
237 vpaddq t2,t1,t1
238 vpsrldq $8,t1,t2
239 vpaddq t2,t1,t1
240 vmovq t1x,d0
241
242
243 vpmuludq hc0,ruwy1,t1
244
245 vpmuludq hc1,ruwy0,t2
246 vpaddq t2,t1,t1
247
248 vpmuludq hc2,svxz4,t2
249 vpaddq t2,t1,t1
250
251 vpmuludq hc3,svxz3,t2
252 vpaddq t2,t1,t1
253
254 vpmuludq hc4,svxz2,t2
255 vpaddq t2,t1,t1
256
257 vpermq $0xee,t1,t2
258 vpaddq t2,t1,t1
259 vpsrldq $8,t1,t2
260 vpaddq t2,t1,t1
261 vmovq t1x,d1
262
263
264 vpmuludq hc0,ruwy2,t1
265
266 vpmuludq hc1,ruwy1,t2
267 vpaddq t2,t1,t1
268
269 vpmuludq hc2,ruwy0,t2
270 vpaddq t2,t1,t1
271
272 vpmuludq hc3,svxz4,t2
273 vpaddq t2,t1,t1
274
275 vpmuludq hc4,svxz3,t2
276 vpaddq t2,t1,t1
277
278 vpermq $0xee,t1,t2
279 vpaddq t2,t1,t1
280 vpsrldq $8,t1,t2
281 vpaddq t2,t1,t1
282 vmovq t1x,d2
283
284
285 vpmuludq hc0,ruwy3,t1
286
287 vpmuludq hc1,ruwy2,t2
288 vpaddq t2,t1,t1
289
290 vpmuludq hc2,ruwy1,t2
291 vpaddq t2,t1,t1
292
293 vpmuludq hc3,ruwy0,t2
294 vpaddq t2,t1,t1
295
296 vpmuludq hc4,svxz4,t2
297 vpaddq t2,t1,t1
298
299 vpermq $0xee,t1,t2
300 vpaddq t2,t1,t1
301 vpsrldq $8,t1,t2
302 vpaddq t2,t1,t1
303 vmovq t1x,d3
304
305
306 vpmuludq hc0,ruwy4,t1
307
308 vpmuludq hc1,ruwy3,t2
309 vpaddq t2,t1,t1
310
311 vpmuludq hc2,ruwy2,t2
312 vpaddq t2,t1,t1
313
314 vpmuludq hc3,ruwy1,t2
315 vpaddq t2,t1,t1
316
317 vpmuludq hc4,ruwy0,t2
318 vpaddq t2,t1,t1
319
320 vpermq $0xee,t1,t2
321 vpaddq t2,t1,t1
322 vpsrldq $8,t1,t2
323 vpaddq t2,t1,t1
324 vmovq t1x,d4
325
326
327 mov d0,%rax
328 shr $26,%rax
329 add %rax,d1
330
331 mov d0,%rbx
332 and $0x3ffffff,%ebx
333
334
335 mov d1,%rax
336 shr $26,%rax
337 add %rax,d2
338
339 mov d1,%rax
340 and $0x3ffffff,%eax
341 mov %eax,h1
342
343
344 mov d2,%rax
345 shr $26,%rax
346 add %rax,d3
347
348 mov d2,%rax
349 and $0x3ffffff,%eax
350 mov %eax,h2
351
352
353 mov d3,%rax
354 shr $26,%rax
355 add %rax,d4
356
357 mov d3,%rax
358 and $0x3ffffff,%eax
359 mov %eax,h3
360
361
362 mov d4,%rax
363 shr $26,%rax
364 lea (%eax,%eax,4),%eax
365 add %eax,%ebx
366
367 mov d4,%rax
368 and $0x3ffffff,%eax
369 mov %eax,h4
370
371
372 mov %ebx,%eax
373 shr $26,%eax
374 add %eax,h1
375
376 andl $0x3ffffff,%ebx
377 mov %ebx,h0
378
379 add $0x40,m
380 dec %rcx
381 jnz .Ldoblock4
382
383 vzeroupper
384 pop %r13
385 pop %r12
386 pop %rbx
387 ret
388ENDPROC(poly1305_4block_avx2)