1
2
3
4
5
6
7
8
9
10
11
12#include <linux/linkage.h>
13
/*
 * 16-byte SIMD constants. They are only ever read, so they live in
 * mergeable read-only sections (entity size 16) instead of writable .data.
 * The 16-byte alignment is required because they are used as memory
 * operands of movdqa/pand/por.
 */
.section	.rodata.cst16.ANMASK, "aM", @progbits, 16
.align 16
/* 0x3ffffff in the low dword of each 64-bit lane: keeps the low 26 bits. */
ANMASK:	.octa 0x0000000003ffffff0000000003ffffff

.section	.rodata.cst16.ORMASK, "aM", @progbits, 16
.align 16
/* 1 << 24 in the low dword of each 64-bit lane. */
ORMASK:	.octa 0x00000000010000000000000001000000
19
20.text
21
/* Accumulator limbs: five consecutive dwords addressed through %rdi. */
#define h0	0x00(%rdi)
#define h1	0x04(%rdi)
#define h2	0x08(%rdi)
#define h3	0x0c(%rdi)
#define h4	0x10(%rdi)

/* Key limbs: five consecutive dwords addressed through %rdx. */
#define r0	0x00(%rdx)
#define r1	0x04(%rdx)
#define r2	0x08(%rdx)
#define r3	0x0c(%rdx)
#define r4	0x10(%rdx)

/* Four dword scratch slots on the stack (filled with r1..r4 times 5). */
#define s1	0x00(%rsp)
#define s2	0x04(%rsp)
#define s3	0x08(%rsp)
#define s4	0x0c(%rsp)

/* Input pointer. */
#define m	%rsi

/* Vector registers: packed accumulator limbs, temporaries, limb mask. */
#define h01	%xmm0
#define h23	%xmm1
#define h44	%xmm2
#define t1	%xmm3
#define t2	%xmm4
#define t3	%xmm5
#define t4	%xmm6
#define mask	%xmm7

/* 64-bit general purpose registers holding the unreduced product limbs. */
#define d0	%r8
#define d1	%r9
#define d2	%r10
#define d3	%r11
#define d4	%r12
50
ENTRY(poly1305_block_sse2)
	/*
	 * Absorb %rcx consecutive 16-byte blocks into a Poly1305 accumulator
	 * kept as five 26-bit limbs:  h = (h + block + 2^128) * r, followed
	 * by a partial carry propagation.
	 *
	 * In:
	 *   %rdi  accumulator h, five dwords, updated in place
	 *   %rsi  input, 16 bytes consumed per iteration
	 *   %rdx  key r, five dwords (read only)
	 *   %rcx  block count; the loop body runs before the count is
	 *         tested, so it must be at least 1
	 *
	 * Saves and restores %rbx and %r12 and uses 16 bytes of stack.
	 * Clobbers %rax, %r8-%r11, %xmm0-%xmm7 and flags; %rsi is advanced
	 * past the consumed input and %rcx is 0 on return.
	 */
	push		%rbx
	push		%r12
	sub		$0x10,%rsp

	/*
	 * s1..s4 = r1..r4 * 5.  Only the low 32 bits of the product are
	 * kept; the 64-bit address form avoids an address-size prefix.
	 */
	mov		r1,%eax
	lea		(%rax,%rax,4),%eax
	mov		%eax,s1
	mov		r2,%eax
	lea		(%rax,%rax,4),%eax
	mov		%eax,s2
	mov		r3,%eax
	lea		(%rax,%rax,4),%eax
	mov		%eax,s3
	mov		r4,%eax
	lea		(%rax,%rax,4),%eax
	mov		%eax,s4

	movdqa		ANMASK(%rip),mask

.Ldoblock:
	/*
	 * Load the accumulator, one limb per 64-bit lane (low lane first):
	 *   h01 = [ h0, h1 ]   h23 = [ h2, h3 ]   h44 = [ h4, h4 ]
	 */
	movd		h0,h01
	movd		h1,t1
	movd		h2,h23
	movd		h3,t2
	movd		h4,h44
	punpcklqdq	t1,h01
	punpcklqdq	t2,h23
	punpcklqdq	h44,h44

	/* h01 += [ le32(in+0) & 0x3ffffff, (le32(in+3) >> 2) & 0x3ffffff ] */
	movd		0x00(m),t1
	movd		0x03(m),t2
	psrld		$2,t2
	punpcklqdq	t2,t1
	pand		mask,t1
	paddd		t1,h01

	/* h23 += [ (le32(in+6) >> 4) & 0x3ffffff, (le32(in+9) >> 6) & 0x3ffffff ] */
	movd		0x06(m),t1
	movd		0x09(m),t2
	psrld		$4,t1
	psrld		$6,t2
	punpcklqdq	t2,t1
	pand		mask,t1
	paddd		t1,h23

	/* h44 += (le32(in+12) >> 8) | (1 << 24), in both lanes */
	mov		0x0c(m),%eax
	shr		$8,%eax
	or		$0x01000000,%eax
	movd		%eax,t1
	pshufd		$0xc4,t1,t1
	paddd		t1,h44

	/*
	 * t1[0] = h0 * r0 + h2 * s3
	 * t1[1] = h1 * s4 + h3 * s2
	 */
	movd		r0,t1
	movd		s4,t2
	punpcklqdq	t2,t1
	pmuludq		h01,t1
	movd		s3,t2
	movd		s2,t3
	punpcklqdq	t3,t2
	pmuludq		h23,t2
	paddq		t2,t1

	/*
	 * t2[0] = h0 * r1 + h2 * s4
	 * t2[1] = h1 * r0 + h3 * s3
	 */
	movd		r1,t2
	movd		r0,t3
	punpcklqdq	t3,t2
	pmuludq		h01,t2
	movd		s4,t3
	movd		s3,t4
	punpcklqdq	t4,t3
	pmuludq		h23,t3
	paddq		t3,t2

	/*
	 * t3[0] = h4 * s1
	 * t3[1] = h4 * s2
	 */
	movd		s1,t3
	movd		s2,t4
	punpcklqdq	t4,t3
	pmuludq		h44,t3

	/*
	 * Transpose-and-add so that one register holds both sums:
	 *   d0 = t1[0] + t1[1] + t3[0]
	 *   d1 = t2[0] + t2[1] + t3[1]
	 */
	movdqa		t1,t4
	punpcklqdq	t2,t4
	punpckhqdq	t2,t1
	paddq		t4,t1
	paddq		t3,t1
	movq		t1,d0
	psrldq		$8,t1
	movq		t1,d1

	/*
	 * t1[0] = h0 * r2 + h2 * r0
	 * t1[1] = h1 * r1 + h3 * s4
	 */
	movd		r2,t1
	movd		r1,t2
	punpcklqdq	t2,t1
	pmuludq		h01,t1
	movd		r0,t2
	movd		s4,t3
	punpcklqdq	t3,t2
	pmuludq		h23,t2
	paddq		t2,t1

	/*
	 * t2[0] = h0 * r3 + h2 * r1
	 * t2[1] = h1 * r2 + h3 * r0
	 */
	movd		r3,t2
	movd		r2,t3
	punpcklqdq	t3,t2
	pmuludq		h01,t2
	movd		r1,t3
	movd		r0,t4
	punpcklqdq	t4,t3
	pmuludq		h23,t3
	paddq		t3,t2

	/*
	 * t3[0] = h4 * s3
	 * t3[1] = h4 * s4
	 */
	movd		s3,t3
	movd		s4,t4
	punpcklqdq	t4,t3
	pmuludq		h44,t3

	/*
	 * d2 = t1[0] + t1[1] + t3[0]
	 * d3 = t2[0] + t2[1] + t3[1]
	 */
	movdqa		t1,t4
	punpcklqdq	t2,t4
	punpckhqdq	t2,t1
	paddq		t4,t1
	paddq		t3,t1
	movq		t1,d2
	psrldq		$8,t1
	movq		t1,d3

	/*
	 * t1[0] = h0 * r4 + h2 * r2
	 * t1[1] = h1 * r3 + h3 * r1
	 */
	movd		r4,t1
	movd		r3,t2
	punpcklqdq	t2,t1
	pmuludq		h01,t1
	movd		r2,t2
	movd		r1,t3
	punpcklqdq	t3,t2
	pmuludq		h23,t2
	paddq		t2,t1

	/* t3[0] = h4 * r0 (only the low lane is consumed below) */
	movd		r0,t3
	pmuludq		h44,t3

	/* d4 = t1[0] + t1[1] + t3[0] */
	movdqa		t1,t4
	psrldq		$8,t4
	paddq		t4,t1
	paddq		t3,t1
	movq		t1,d4

	/* d1 += d0 >> 26 */
	mov		d0,%rax
	shr		$26,%rax
	add		%rax,d1
	/* h0 = d0 & 0x3ffffff, parked in %rbx until the wrap-around carry */
	mov		d0,%rbx
	and		$0x3ffffff,%ebx

	/* d2 += d1 >> 26 */
	mov		d1,%rax
	shr		$26,%rax
	add		%rax,d2
	/* h1 = d1 & 0x3ffffff */
	mov		d1,%rax
	and		$0x3ffffff,%eax
	mov		%eax,h1

	/* d3 += d2 >> 26 */
	mov		d2,%rax
	shr		$26,%rax
	add		%rax,d3
	/* h2 = d2 & 0x3ffffff */
	mov		d2,%rax
	and		$0x3ffffff,%eax
	mov		%eax,h2

	/* d4 += d3 >> 26 */
	mov		d3,%rax
	shr		$26,%rax
	add		%rax,d4
	/* h3 = d3 & 0x3ffffff */
	mov		d3,%rax
	and		$0x3ffffff,%eax
	mov		%eax,h3

	/*
	 * h0 += (d4 >> 26) * 5
	 * Done in 64-bit registers like every other carry in this chain,
	 * so the step does not rely on the carry fitting in 32 bits.
	 */
	mov		d4,%rax
	shr		$26,%rax
	lea		(%rax,%rax,4),%rax
	add		%rax,%rbx
	/* h4 = d4 & 0x3ffffff */
	mov		d4,%rax
	and		$0x3ffffff,%eax
	mov		%eax,h4

	/* h1 += h0 >> 26 */
	mov		%rbx,%rax
	shr		$26,%rax
	add		%eax,h1
	/* h0 = h0 & 0x3ffffff */
	andl		$0x3ffffff,%ebx
	mov		%ebx,h0

	add		$0x10,m
	dec		%rcx
	jnz		.Ldoblock

	add		$0x10,%rsp
	pop		%r12
	pop		%rbx
	ret
ENDPROC(poly1305_block_sse2)
278
279
/* Second multiplier limbs: five consecutive dwords addressed through %r8. */
#define u0	0x00(%r8)
#define u1	0x04(%r8)
#define u2	0x08(%r8)
#define u3	0x0c(%r8)
#define u4	0x10(%r8)

/* Per-limb accumulator-plus-input vectors. */
#define hc0	%xmm0
#define hc1	%xmm1
#define hc2	%xmm2
#define hc3	%xmm5
#define hc4	%xmm6

/* Paired multiplier limbs, one u limb and one r limb per register. */
#define ru0	%xmm7
#define ru1	%xmm8
#define ru2	%xmm9
#define ru3	%xmm10
#define ru4	%xmm11

/* The paired multiplier limbs 1..4 times 5. */
#define sv1	%xmm12
#define sv2	%xmm13
#define sv3	%xmm14
#define sv4	%xmm15

/* %r8 is the base register of u0..u4 here, so d0 is remapped to %r13. */
#undef d0
#define d0	%r13
301
ENTRY(poly1305_2block_sse2)
	/*
	 * Absorb %rcx pairs of 16-byte blocks into a Poly1305 accumulator
	 * kept as five 26-bit limbs.  Per iteration, with m1/m2 the two
	 * blocks (each with bit 128 set):
	 *
	 *   h = (h + m1) * u + m2 * r
	 *
	 * followed by a partial carry propagation.  This equals two
	 * sequential single-block steps when u = r^2 (presumably what the
	 * caller passes; confirm against the glue code).
	 *
	 * In:
	 *   %rdi  accumulator h, five dwords, updated in place
	 *   %rsi  input, 32 bytes consumed per iteration
	 *   %rdx  key r, five dwords (read only)
	 *   %rcx  double-block count; the loop body runs before the count
	 *         is tested, so it must be at least 1
	 *   %r8   second multiplier u, five dwords (read only)
	 *
	 * Saves and restores %rbx, %r12 and %r13.  Clobbers %rax, %r9-%r11,
	 * %xmm0-%xmm15 and flags; %rsi is advanced past the consumed input
	 * and %rcx is 0 on return.
	 */
	push		%rbx
	push		%r12
	push		%r13

	/* Pair the multipliers per limb: ruN = [ uN, rN ] (low lane first) */
	movd		u0,ru0
	movd		r0,t1
	punpcklqdq	t1,ru0

	/* ru1 = [ u1, r1 ], sv1 = ru1 * 5 (computed as (x << 2) + x) */
	movd		u1,ru1
	movd		r1,t1
	punpcklqdq	t1,ru1
	movdqa		ru1,sv1
	pslld		$2,sv1
	paddd		ru1,sv1

	/* ru2 = [ u2, r2 ], sv2 = ru2 * 5 */
	movd		u2,ru2
	movd		r2,t1
	punpcklqdq	t1,ru2
	movdqa		ru2,sv2
	pslld		$2,sv2
	paddd		ru2,sv2

	/* ru3 = [ u3, r3 ], sv3 = ru3 * 5 */
	movd		u3,ru3
	movd		r3,t1
	punpcklqdq	t1,ru3
	movdqa		ru3,sv3
	pslld		$2,sv3
	paddd		ru3,sv3

	/* ru4 = [ u4, r4 ], sv4 = ru4 * 5 */
	movd		u4,ru4
	movd		r4,t1
	punpcklqdq	t1,ru4
	movdqa		ru4,sv4
	pslld		$2,sv4
	paddd		ru4,sv4

.Ldoblock2:
	/*
	 * Split both blocks into limbs, first block in the low lane and
	 * second block in the high lane, then add the accumulator limb to
	 * the low lane only (movd zeroes the rest of the register).
	 *
	 * hc0 = [ le32(in+0) & 0x3ffffff, le32(in+16) & 0x3ffffff ] + [ h0, 0 ]
	 */
	movd		0x00(m),hc0
	movd		0x10(m),t1
	punpcklqdq	t1,hc0
	pand		ANMASK(%rip),hc0
	movd		h0,t1
	paddd		t1,hc0

	/* hc1 = [ (le32(in+3) >> 2) & 0x3ffffff, (le32(in+19) >> 2) & 0x3ffffff ] + [ h1, 0 ] */
	movd		0x03(m),hc1
	movd		0x13(m),t1
	punpcklqdq	t1,hc1
	psrld		$2,hc1
	pand		ANMASK(%rip),hc1
	movd		h1,t1
	paddd		t1,hc1

	/* hc2 = [ (le32(in+6) >> 4) & 0x3ffffff, (le32(in+22) >> 4) & 0x3ffffff ] + [ h2, 0 ] */
	movd		0x06(m),hc2
	movd		0x16(m),t1
	punpcklqdq	t1,hc2
	psrld		$4,hc2
	pand		ANMASK(%rip),hc2
	movd		h2,t1
	paddd		t1,hc2

	/* hc3 = [ (le32(in+9) >> 6) & 0x3ffffff, (le32(in+25) >> 6) & 0x3ffffff ] + [ h3, 0 ] */
	movd		0x09(m),hc3
	movd		0x19(m),t1
	punpcklqdq	t1,hc3
	psrld		$6,hc3
	pand		ANMASK(%rip),hc3
	movd		h3,t1
	paddd		t1,hc3

	/* hc4 = [ (le32(in+12) >> 8) | (1 << 24), (le32(in+28) >> 8) | (1 << 24) ] + [ h4, 0 ] */
	movd		0x0c(m),hc4
	movd		0x1c(m),t1
	punpcklqdq	t1,hc4
	psrld		$8,hc4
	por		ORMASK(%rip),hc4
	movd		h4,t1
	paddd		t1,hc4

	/*
	 * Each dN below is the sum over both lanes of the listed products,
	 * i.e. the contribution of block one (times u) plus block two
	 * (times r).
	 *
	 * d0 = hc0*ru0 + hc1*sv4 + hc2*sv3 + hc3*sv2 + hc4*sv1
	 */
	movdqa		ru0,t1
	pmuludq		hc0,t1

	movdqa		sv4,t2
	pmuludq		hc1,t2
	paddq		t2,t1

	movdqa		sv3,t2
	pmuludq		hc2,t2
	paddq		t2,t1

	movdqa		sv2,t2
	pmuludq		hc3,t2
	paddq		t2,t1

	movdqa		sv1,t2
	pmuludq		hc4,t2
	paddq		t2,t1

	/* fold the high lane into the low lane */
	movdqa		t1,t2
	psrldq		$8,t2
	paddq		t2,t1
	movq		t1,d0

	/* d1 = hc0*ru1 + hc1*ru0 + hc2*sv4 + hc3*sv3 + hc4*sv2 */
	movdqa		ru1,t1
	pmuludq		hc0,t1

	movdqa		ru0,t2
	pmuludq		hc1,t2
	paddq		t2,t1

	movdqa		sv4,t2
	pmuludq		hc2,t2
	paddq		t2,t1

	movdqa		sv3,t2
	pmuludq		hc3,t2
	paddq		t2,t1

	movdqa		sv2,t2
	pmuludq		hc4,t2
	paddq		t2,t1

	movdqa		t1,t2
	psrldq		$8,t2
	paddq		t2,t1
	movq		t1,d1

	/* d2 = hc0*ru2 + hc1*ru1 + hc2*ru0 + hc3*sv4 + hc4*sv3 */
	movdqa		ru2,t1
	pmuludq		hc0,t1

	movdqa		ru1,t2
	pmuludq		hc1,t2
	paddq		t2,t1

	movdqa		ru0,t2
	pmuludq		hc2,t2
	paddq		t2,t1

	movdqa		sv4,t2
	pmuludq		hc3,t2
	paddq		t2,t1

	movdqa		sv3,t2
	pmuludq		hc4,t2
	paddq		t2,t1

	movdqa		t1,t2
	psrldq		$8,t2
	paddq		t2,t1
	movq		t1,d2

	/* d3 = hc0*ru3 + hc1*ru2 + hc2*ru1 + hc3*ru0 + hc4*sv4 */
	movdqa		ru3,t1
	pmuludq		hc0,t1

	movdqa		ru2,t2
	pmuludq		hc1,t2
	paddq		t2,t1

	movdqa		ru1,t2
	pmuludq		hc2,t2
	paddq		t2,t1

	movdqa		ru0,t2
	pmuludq		hc3,t2
	paddq		t2,t1

	movdqa		sv4,t2
	pmuludq		hc4,t2
	paddq		t2,t1

	movdqa		t1,t2
	psrldq		$8,t2
	paddq		t2,t1
	movq		t1,d3

	/* d4 = hc0*ru4 + hc1*ru3 + hc2*ru2 + hc3*ru1 + hc4*ru0 */
	movdqa		ru4,t1
	pmuludq		hc0,t1

	movdqa		ru3,t2
	pmuludq		hc1,t2
	paddq		t2,t1

	movdqa		ru2,t2
	pmuludq		hc2,t2
	paddq		t2,t1

	movdqa		ru1,t2
	pmuludq		hc3,t2
	paddq		t2,t1

	movdqa		ru0,t2
	pmuludq		hc4,t2
	paddq		t2,t1

	movdqa		t1,t2
	psrldq		$8,t2
	paddq		t2,t1
	movq		t1,d4

	/* d1 += d0 >> 26 */
	mov		d0,%rax
	shr		$26,%rax
	add		%rax,d1
	/* h0 = d0 & 0x3ffffff, parked in %rbx until the wrap-around carry */
	mov		d0,%rbx
	and		$0x3ffffff,%ebx

	/* d2 += d1 >> 26 */
	mov		d1,%rax
	shr		$26,%rax
	add		%rax,d2
	/* h1 = d1 & 0x3ffffff */
	mov		d1,%rax
	and		$0x3ffffff,%eax
	mov		%eax,h1

	/* d3 += d2 >> 26 */
	mov		d2,%rax
	shr		$26,%rax
	add		%rax,d3
	/* h2 = d2 & 0x3ffffff */
	mov		d2,%rax
	and		$0x3ffffff,%eax
	mov		%eax,h2

	/* d4 += d3 >> 26 */
	mov		d3,%rax
	shr		$26,%rax
	add		%rax,d4
	/* h3 = d3 & 0x3ffffff */
	mov		d3,%rax
	and		$0x3ffffff,%eax
	mov		%eax,h3

	/*
	 * h0 += (d4 >> 26) * 5
	 * The sums here cover two blocks, so this carry is kept in 64-bit
	 * registers; 32-bit arithmetic could truncate it and corrupt the
	 * accumulator.
	 */
	mov		d4,%rax
	shr		$26,%rax
	lea		(%rax,%rax,4),%rax
	add		%rax,%rbx
	/* h4 = d4 & 0x3ffffff */
	mov		d4,%rax
	and		$0x3ffffff,%eax
	mov		%eax,h4

	/* h1 += h0 >> 26 */
	mov		%rbx,%rax
	shr		$26,%rax
	add		%eax,h1
	/* h0 = h0 & 0x3ffffff */
	andl		$0x3ffffff,%ebx
	mov		%ebx,h0

	add		$0x20,m
	dec		%rcx
	jnz		.Ldoblock2

	pop		%r13
	pop		%r12
	pop		%rbx
	ret
ENDPROC(poly1305_2block_sse2)
583