1
2
3
4
5
6
7
8
9
10
11
12#include <linux/linkage.h>
13
/*
 * ANMASK: 0x03ffffff in the low dword of each 64-bit lane, i.e. a mask
 * that keeps the low 26 bits of a limb held in either lane.
 */
.section	.rodata.cst16.ANMASK, "aM", @progbits, 16
.p2align	4
ANMASK:
	.octa	0x0000000003ffffff0000000003ffffff

/*
 * ORMASK: bit 24 set in the low dword of each 64-bit lane; OR-ed into the
 * top message limb of both lanes by the two-block routine.
 */
.section	.rodata.cst16.ORMASK, "aM", @progbits, 16
.p2align	4
ORMASK:
	.octa	0x00000000010000000000000001000000

.text
23
/* Accumulator h: five consecutive 32-bit words addressed through %rdi. */
#define h0	0x00(%rdi)
#define h1	0x04(%rdi)
#define h2	0x08(%rdi)
#define h3	0x0c(%rdi)
#define h4	0x10(%rdi)

/* Multiplier r: five consecutive 32-bit words addressed through %rdx. */
#define r0	0x00(%rdx)
#define r1	0x04(%rdx)
#define r2	0x08(%rdx)
#define r3	0x0c(%rdx)
#define r4	0x10(%rdx)

/* Four 32-bit scratch slots at the top of the stack (s1..s4). */
#define s1	0x00(%rsp)
#define s2	0x04(%rsp)
#define s3	0x08(%rsp)
#define s4	0x0c(%rsp)

/* Input pointer. */
#define m	%rsi

/* SSE registers: packed accumulator limbs, temporaries and the limb mask. */
#define h01	%xmm0
#define h23	%xmm1
#define h44	%xmm2
#define t1	%xmm3
#define t2	%xmm4
#define t3	%xmm5
#define t4	%xmm6
#define mask	%xmm7

/* General purpose registers receiving the 64-bit limb sums d0..d4. */
#define d0	%r8
#define d1	%r9
#define d2	%r10
#define d3	%r11
#define d4	%r12
52
ENTRY(poly1305_block_sse2)
	/*
	 * Absorb %rcx consecutive 16-byte blocks into the accumulator. For
	 * each block: h = (h + m + 2^128) * r, followed by one carry pass
	 * that leaves h partially reduced modulo 2^130 - 5 (26-bit limbs).
	 *
	 * In:
	 *   %rdi: accumulator h, five 32-bit words (h0..h4), updated in place
	 *   %rsi: input m, 16 bytes per block; advanced as blocks are consumed
	 *   %rdx: multiplier r, five 32-bit words (r0..r4), read only
	 *   %rcx: block count; must be nonzero, because the loop body runs
	 *         before the count is tested
	 *
	 * Clobbers: %rax, %rcx, %rsi, %r8-%r11, %xmm0-%xmm7, flags.
	 * %rbx and %r12 are saved and restored; 16 bytes of stack hold
	 * s1..s4.
	 *
	 * Two of the limb products are evaluated per pmuludq by packing one
	 * operand pair into each 64-bit lane. Lane contents are written
	 * below as [dword3, dword2, dword1, dword0] or, for 64-bit results,
	 * as x[1] (high lane) and x[0] (low lane).
	 */

	push		%rbx
	push		%r12
	sub		$0x10,%rsp

	# s1..s4 = r1..r4 * 5
	# The 32-bit loads zero-extend into %rax, so the 64-bit address form
	# yields the same low 32 bits without an address-size prefix.
	mov		r1,%eax
	lea		(%rax,%rax,4),%eax
	mov		%eax,s1
	mov		r2,%eax
	lea		(%rax,%rax,4),%eax
	mov		%eax,s2
	mov		r3,%eax
	lea		(%rax,%rax,4),%eax
	mov		%eax,s3
	mov		r4,%eax
	lea		(%rax,%rax,4),%eax
	mov		%eax,s4

	# mask = 26-bit limb mask in both lanes, loaded once for all blocks
	movdqa		ANMASK(%rip),mask

.Ldoblock:
	# h01 = [0, h1, 0, h0]
	# h23 = [0, h3, 0, h2]
	# h44 = [0, h4, 0, h4]
	movd		h0,h01
	movd		h1,t1
	movd		h2,h23
	movd		h3,t2
	movd		h4,h44
	punpcklqdq	t1,h01
	punpcklqdq	t2,h23
	punpcklqdq	h44,h44

	# h01 += [ (m[3-6] >> 2) & 0x3ffffff, m[0-3] & 0x3ffffff ]
	movd		0x00(m),t1
	movd		0x03(m),t2
	psrld		$2,t2
	punpcklqdq	t2,t1
	pand		mask,t1
	paddd		t1,h01

	# h23 += [ (m[9-12] >> 6) & 0x3ffffff, (m[6-9] >> 4) & 0x3ffffff ]
	movd		0x06(m),t1
	movd		0x09(m),t2
	psrld		$4,t1
	psrld		$6,t2
	punpcklqdq	t2,t1
	pand		mask,t1
	paddd		t1,h23

	# h44 += [ (m[12-15] >> 8) | (1 << 24), (m[12-15] >> 8) | (1 << 24) ]
	# pshufd 0xc4 copies dword0 into dword2; dword1 and dword3 stay zero.
	mov		0x0c(m),%eax
	shr		$8,%eax
	or		$0x01000000,%eax
	movd		%eax,t1
	pshufd		$0xc4,t1,t1
	paddd		t1,h44

	# t1[0] = h0 * r0 + h2 * s3
	# t1[1] = h1 * s4 + h3 * s2
	movd		r0,t1
	movd		s4,t2
	punpcklqdq	t2,t1
	pmuludq		h01,t1
	movd		s3,t2
	movd		s2,t3
	punpcklqdq	t3,t2
	pmuludq		h23,t2
	paddq		t2,t1

	# t2[0] = h0 * r1 + h2 * s4
	# t2[1] = h1 * r0 + h3 * s3
	movd		r1,t2
	movd		r0,t3
	punpcklqdq	t3,t2
	pmuludq		h01,t2
	movd		s4,t3
	movd		s3,t4
	punpcklqdq	t4,t3
	pmuludq		h23,t3
	paddq		t3,t2

	# t3[0] = h4 * s1
	# t3[1] = h4 * s2
	movd		s1,t3
	movd		s2,t4
	punpcklqdq	t4,t3
	pmuludq		h44,t3

	# d0 = t1[0] + t1[1] + t3[0]
	# d1 = t2[0] + t2[1] + t3[1]
	movdqa		t1,t4
	punpcklqdq	t2,t4
	punpckhqdq	t2,t1
	paddq		t4,t1
	paddq		t3,t1
	movq		t1,d0
	psrldq		$8,t1
	movq		t1,d1

	# t1[0] = h0 * r2 + h2 * r0
	# t1[1] = h1 * r1 + h3 * s4
	movd		r2,t1
	movd		r1,t2
	punpcklqdq	t2,t1
	pmuludq		h01,t1
	movd		r0,t2
	movd		s4,t3
	punpcklqdq	t3,t2
	pmuludq		h23,t2
	paddq		t2,t1

	# t2[0] = h0 * r3 + h2 * r1
	# t2[1] = h1 * r2 + h3 * r0
	movd		r3,t2
	movd		r2,t3
	punpcklqdq	t3,t2
	pmuludq		h01,t2
	movd		r1,t3
	movd		r0,t4
	punpcklqdq	t4,t3
	pmuludq		h23,t3
	paddq		t3,t2

	# t3[0] = h4 * s3
	# t3[1] = h4 * s4
	movd		s3,t3
	movd		s4,t4
	punpcklqdq	t4,t3
	pmuludq		h44,t3

	# d2 = t1[0] + t1[1] + t3[0]
	# d3 = t2[0] + t2[1] + t3[1]
	movdqa		t1,t4
	punpcklqdq	t2,t4
	punpckhqdq	t2,t1
	paddq		t4,t1
	paddq		t3,t1
	movq		t1,d2
	psrldq		$8,t1
	movq		t1,d3

	# t1[0] = h0 * r4 + h2 * r2
	# t1[1] = h1 * r3 + h3 * r1
	movd		r4,t1
	movd		r3,t2
	punpcklqdq	t2,t1
	pmuludq		h01,t1
	movd		r2,t2
	movd		r1,t3
	punpcklqdq	t3,t2
	pmuludq		h23,t2
	paddq		t2,t1

	# t3[0] = h4 * r0
	movd		r0,t3
	pmuludq		h44,t3

	# d4 = t1[0] + t1[1] + t3[0]
	movdqa		t1,t4
	psrldq		$8,t4
	paddq		t4,t1
	paddq		t3,t1
	movq		t1,d4

	# d1 += d0 >> 26
	mov		d0,%rax
	shr		$26,%rax
	add		%rax,d1
	# h0 = d0 & 0x3ffffff (kept in %rbx until the final wrap-around)
	mov		d0,%rbx
	and		$0x3ffffff,%ebx

	# d2 += d1 >> 26
	mov		d1,%rax
	shr		$26,%rax
	add		%rax,d2
	# h1 = d1 & 0x3ffffff
	mov		d1,%rax
	and		$0x3ffffff,%eax
	mov		%eax,h1

	# d3 += d2 >> 26
	mov		d2,%rax
	shr		$26,%rax
	add		%rax,d3
	# h2 = d2 & 0x3ffffff
	mov		d2,%rax
	and		$0x3ffffff,%eax
	mov		%eax,h2

	# d4 += d3 >> 26
	mov		d3,%rax
	shr		$26,%rax
	add		%rax,d4
	# h3 = d3 & 0x3ffffff
	mov		d3,%rax
	and		$0x3ffffff,%eax
	mov		%eax,h3

	# h0 += (d4 >> 26) * 5
	# Done in 64 bits like the other carries, so that a large d4 carry
	# is never truncated before it is folded back into h0.
	mov		d4,%rax
	shr		$26,%rax
	lea		(%rax,%rax,4),%rax
	add		%rax,%rbx
	# h4 = d4 & 0x3ffffff
	mov		d4,%rax
	and		$0x3ffffff,%eax
	mov		%eax,h4

	# h1 += h0 >> 26
	mov		%rbx,%rax
	shr		$26,%rax
	add		%eax,h1
	# h0 = h0 & 0x3ffffff
	andl		$0x3ffffff,%ebx
	mov		%ebx,h0

	# next 16-byte block
	add		$0x10,m
	dec		%rcx
	jnz		.Ldoblock

	add		$0x10,%rsp
	pop		%r12
	pop		%rbx
	ret
ENDPROC(poly1305_block_sse2)
280
281
/* Second multiplier u: five consecutive 32-bit words addressed through %r8. */
#define u0	0x00(%r8)
#define u1	0x04(%r8)
#define u2	0x08(%r8)
#define u3	0x0c(%r8)
#define u4	0x10(%r8)

/*
 * SSE registers of the two-block routine. hc3/hc4 occupy the registers
 * named t3/t4 (%xmm5/%xmm6) and ru0 the one named mask (%xmm7) by the
 * single-block routine.
 */
#define hc0	%xmm0
#define hc1	%xmm1
#define hc2	%xmm2
#define hc3	%xmm5
#define hc4	%xmm6
#define ru0	%xmm7
#define ru1	%xmm8
#define ru2	%xmm9
#define ru3	%xmm10
#define ru4	%xmm11
#define sv1	%xmm12
#define sv2	%xmm13
#define sv3	%xmm14
#define sv4	%xmm15

/* %r8 now carries the u pointer, so d0 moves to %r13. */
#undef d0
#define d0	%r13
303
ENTRY(poly1305_2block_sse2)
	/*
	 * Absorb %rcx consecutive 32-byte double blocks into the
	 * accumulator. For each pair of 16-byte blocks m1, m2:
	 *
	 *   h = (h + m1 + 2^128) * u + (m2 + 2^128) * r
	 *
	 * followed by one carry pass that leaves h partially reduced modulo
	 * 2^130 - 5 (26-bit limbs). The low 64-bit lane handles
	 * (h + m1) * u, the high lane m2 * r, and the two lanes are summed.
	 * NOTE(review): this equals two sequential single-block steps only
	 * when u = r^2; presumably the caller guarantees that - confirm.
	 *
	 * In:
	 *   %rdi: accumulator h, five 32-bit words (h0..h4), updated in place
	 *   %rsi: input m, 32 bytes per iteration; advanced as data is used
	 *   %rdx: multiplier r, five 32-bit words (r0..r4), read only
	 *   %rcx: double block count; must be nonzero, because the loop body
	 *         runs before the count is tested
	 *   %r8:  multiplier u, five 32-bit words (u0..u4), read only
	 *
	 * Clobbers: %rax, %rcx, %rsi, %r9-%r11, %xmm0-%xmm15, flags.
	 * %rbx, %r12 and %r13 are saved and restored.
	 *
	 * Lane contents are written as [dword3, dword2, dword1, dword0] or,
	 * for 64-bit results, as x[1] (high lane) and x[0] (low lane).
	 */

	push		%rbx
	push		%r12
	push		%r13

	# ru0 = [0, r0, 0, u0]
	movd		u0,ru0
	movd		r0,t1
	punpcklqdq	t1,ru0

	# ru1 = [0, r1, 0, u1], sv1 = ru1 * 5 (shift left by 2, then add)
	movd		u1,ru1
	movd		r1,t1
	punpcklqdq	t1,ru1
	movdqa		ru1,sv1
	pslld		$2,sv1
	paddd		ru1,sv1

	# ru2 = [0, r2, 0, u2], sv2 = ru2 * 5
	movd		u2,ru2
	movd		r2,t1
	punpcklqdq	t1,ru2
	movdqa		ru2,sv2
	pslld		$2,sv2
	paddd		ru2,sv2

	# ru3 = [0, r3, 0, u3], sv3 = ru3 * 5
	movd		u3,ru3
	movd		r3,t1
	punpcklqdq	t1,ru3
	movdqa		ru3,sv3
	pslld		$2,sv3
	paddd		ru3,sv3

	# ru4 = [0, r4, 0, u4], sv4 = ru4 * 5
	movd		u4,ru4
	movd		r4,t1
	punpcklqdq	t1,ru4
	movdqa		ru4,sv4
	pslld		$2,sv4
	paddd		ru4,sv4

.Ldoblock2:
	# hc0 = [ m[16-19] & 0x3ffffff, h0 + (m[0-3] & 0x3ffffff) ]
	movd		0x00(m),hc0
	movd		0x10(m),t1
	punpcklqdq	t1,hc0
	pand		ANMASK(%rip),hc0
	movd		h0,t1
	paddd		t1,hc0

	# hc1 = [ (m[19-22] >> 2) & 0x3ffffff,
	#         h1 + ((m[3-6] >> 2) & 0x3ffffff) ]
	movd		0x03(m),hc1
	movd		0x13(m),t1
	punpcklqdq	t1,hc1
	psrld		$2,hc1
	pand		ANMASK(%rip),hc1
	movd		h1,t1
	paddd		t1,hc1

	# hc2 = [ (m[22-25] >> 4) & 0x3ffffff,
	#         h2 + ((m[6-9] >> 4) & 0x3ffffff) ]
	movd		0x06(m),hc2
	movd		0x16(m),t1
	punpcklqdq	t1,hc2
	psrld		$4,hc2
	pand		ANMASK(%rip),hc2
	movd		h2,t1
	paddd		t1,hc2

	# hc3 = [ (m[25-28] >> 6) & 0x3ffffff,
	#         h3 + ((m[9-12] >> 6) & 0x3ffffff) ]
	movd		0x09(m),hc3
	movd		0x19(m),t1
	punpcklqdq	t1,hc3
	psrld		$6,hc3
	pand		ANMASK(%rip),hc3
	movd		h3,t1
	paddd		t1,hc3

	# hc4 = [ (m[28-31] >> 8) | (1 << 24),
	#         h4 + ((m[12-15] >> 8) | (1 << 24)) ]
	movd		0x0c(m),hc4
	movd		0x1c(m),t1
	punpcklqdq	t1,hc4
	psrld		$8,hc4
	por		ORMASK(%rip),hc4
	movd		h4,t1
	paddd		t1,hc4

	# t1 = [ hc0[1] * r0, hc0[0] * u0 ]
	movdqa		ru0,t1
	pmuludq		hc0,t1
	# t1 += [ hc1[1] * s4, hc1[0] * v4 ]
	movdqa		sv4,t2
	pmuludq		hc1,t2
	paddq		t2,t1
	# t1 += [ hc2[1] * s3, hc2[0] * v3 ]
	movdqa		sv3,t2
	pmuludq		hc2,t2
	paddq		t2,t1
	# t1 += [ hc3[1] * s2, hc3[0] * v2 ]
	movdqa		sv2,t2
	pmuludq		hc3,t2
	paddq		t2,t1
	# t1 += [ hc4[1] * s1, hc4[0] * v1 ]
	movdqa		sv1,t2
	pmuludq		hc4,t2
	paddq		t2,t1
	# d0 = t1[0] + t1[1]
	movdqa		t1,t2
	psrldq		$8,t2
	paddq		t2,t1
	movq		t1,d0

	# t1 = [ hc0[1] * r1, hc0[0] * u1 ]
	movdqa		ru1,t1
	pmuludq		hc0,t1
	# t1 += [ hc1[1] * r0, hc1[0] * u0 ]
	movdqa		ru0,t2
	pmuludq		hc1,t2
	paddq		t2,t1
	# t1 += [ hc2[1] * s4, hc2[0] * v4 ]
	movdqa		sv4,t2
	pmuludq		hc2,t2
	paddq		t2,t1
	# t1 += [ hc3[1] * s3, hc3[0] * v3 ]
	movdqa		sv3,t2
	pmuludq		hc3,t2
	paddq		t2,t1
	# t1 += [ hc4[1] * s2, hc4[0] * v2 ]
	movdqa		sv2,t2
	pmuludq		hc4,t2
	paddq		t2,t1
	# d1 = t1[0] + t1[1]
	movdqa		t1,t2
	psrldq		$8,t2
	paddq		t2,t1
	movq		t1,d1

	# t1 = [ hc0[1] * r2, hc0[0] * u2 ]
	movdqa		ru2,t1
	pmuludq		hc0,t1
	# t1 += [ hc1[1] * r1, hc1[0] * u1 ]
	movdqa		ru1,t2
	pmuludq		hc1,t2
	paddq		t2,t1
	# t1 += [ hc2[1] * r0, hc2[0] * u0 ]
	movdqa		ru0,t2
	pmuludq		hc2,t2
	paddq		t2,t1
	# t1 += [ hc3[1] * s4, hc3[0] * v4 ]
	movdqa		sv4,t2
	pmuludq		hc3,t2
	paddq		t2,t1
	# t1 += [ hc4[1] * s3, hc4[0] * v3 ]
	movdqa		sv3,t2
	pmuludq		hc4,t2
	paddq		t2,t1
	# d2 = t1[0] + t1[1]
	movdqa		t1,t2
	psrldq		$8,t2
	paddq		t2,t1
	movq		t1,d2

	# t1 = [ hc0[1] * r3, hc0[0] * u3 ]
	movdqa		ru3,t1
	pmuludq		hc0,t1
	# t1 += [ hc1[1] * r2, hc1[0] * u2 ]
	movdqa		ru2,t2
	pmuludq		hc1,t2
	paddq		t2,t1
	# t1 += [ hc2[1] * r1, hc2[0] * u1 ]
	movdqa		ru1,t2
	pmuludq		hc2,t2
	paddq		t2,t1
	# t1 += [ hc3[1] * r0, hc3[0] * u0 ]
	movdqa		ru0,t2
	pmuludq		hc3,t2
	paddq		t2,t1
	# t1 += [ hc4[1] * s4, hc4[0] * v4 ]
	movdqa		sv4,t2
	pmuludq		hc4,t2
	paddq		t2,t1
	# d3 = t1[0] + t1[1]
	movdqa		t1,t2
	psrldq		$8,t2
	paddq		t2,t1
	movq		t1,d3

	# t1 = [ hc0[1] * r4, hc0[0] * u4 ]
	movdqa		ru4,t1
	pmuludq		hc0,t1
	# t1 += [ hc1[1] * r3, hc1[0] * u3 ]
	movdqa		ru3,t2
	pmuludq		hc1,t2
	paddq		t2,t1
	# t1 += [ hc2[1] * r2, hc2[0] * u2 ]
	movdqa		ru2,t2
	pmuludq		hc2,t2
	paddq		t2,t1
	# t1 += [ hc3[1] * r1, hc3[0] * u1 ]
	movdqa		ru1,t2
	pmuludq		hc3,t2
	paddq		t2,t1
	# t1 += [ hc4[1] * r0, hc4[0] * u0 ]
	movdqa		ru0,t2
	pmuludq		hc4,t2
	paddq		t2,t1
	# d4 = t1[0] + t1[1]
	movdqa		t1,t2
	psrldq		$8,t2
	paddq		t2,t1
	movq		t1,d4

	# d1 += d0 >> 26
	mov		d0,%rax
	shr		$26,%rax
	add		%rax,d1
	# h0 = d0 & 0x3ffffff (kept in %rbx until the final wrap-around)
	mov		d0,%rbx
	and		$0x3ffffff,%ebx

	# d2 += d1 >> 26
	mov		d1,%rax
	shr		$26,%rax
	add		%rax,d2
	# h1 = d1 & 0x3ffffff
	mov		d1,%rax
	and		$0x3ffffff,%eax
	mov		%eax,h1

	# d3 += d2 >> 26
	mov		d2,%rax
	shr		$26,%rax
	add		%rax,d3
	# h2 = d2 & 0x3ffffff
	mov		d2,%rax
	and		$0x3ffffff,%eax
	mov		%eax,h2

	# d4 += d3 >> 26
	mov		d3,%rax
	shr		$26,%rax
	add		%rax,d4
	# h3 = d3 & 0x3ffffff
	mov		d3,%rax
	and		$0x3ffffff,%eax
	mov		%eax,h3

	# h0 += (d4 >> 26) * 5
	# Each d limb sums the products of two blocks, so this carry times 5
	# may not fit in 32 bits; compute it in 64 bits to avoid truncation.
	mov		d4,%rax
	shr		$26,%rax
	lea		(%rax,%rax,4),%rax
	add		%rax,%rbx
	# h4 = d4 & 0x3ffffff
	mov		d4,%rax
	and		$0x3ffffff,%eax
	mov		%eax,h4

	# h1 += h0 >> 26
	mov		%rbx,%rax
	shr		$26,%rax
	add		%eax,h1
	# h0 = h0 & 0x3ffffff
	andl		$0x3ffffff,%ebx
	mov		%ebx,h0

	# next 32-byte double block
	add		$0x20,m
	dec		%rcx
	jnz		.Ldoblock2

	pop		%r13
	pop		%r12
	pop		%rbx
	ret
ENDPROC(poly1305_2block_sse2)
585