########################################################################
# SHA-256 block transform for x86_64, optimized with SSSE3 instructions.
########################################################################

#include <linux/linkage.h>

## assume input buffers are not 16-byte aligned, so use unaligned loads
#define MOVDQ movdqu
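# addm [mem], reg
# Add reg to mem using a reg-mem add and store the result back to mem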
.macro addm p1 p2
	add	\p1, \p2
	mov	\p2, \p1
.endm
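# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
# Load xmm with mem and byte swap each dword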
.macro COPY_XMM_AND_BSWAP p1 p2 p3
	MOVDQ \p2, \p1
	pshufb \p3, \p1
.endm
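# X0..X3 hold the 16 message schedule dwords for the current block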
X0 = %xmm4
X1 = %xmm5
X2 = %xmm6
X3 = %xmm7

XTMP0 = %xmm0
XTMP1 = %xmm1
XTMP2 = %xmm2
XTMP3 = %xmm3
XTMP4 = %xmm8
XFER = %xmm9

SHUF_00BA = %xmm10		# shuffle xBxA -> 00BA
SHUF_DC00 = %xmm11		# shuffle xDxC -> DC00
BYTE_FLIP_MASK = %xmm12

NUM_BLKS = %rdx			# 3rd arg
INP = %rsi			# 2nd arg
CTX = %rdi			# 1st arg

SRND = %rsi			# clobbers INP
c = %ecx
d = %r8d
e = %edx
TBL = %r12
a = %eax
b = %ebx

f = %r9d
g = %r10d
h = %r11d

y0 = %r13d
y1 = %r14d
y2 = %r15d

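# Stack frame layout: sizes and offsets, in bytes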
_INP_END_SIZE = 8
_INP_SIZE = 8
_XFER_SIZE = 16
_XMM_SAVE_SIZE = 0

_INP_END = 0
_INP = _INP_END + _INP_END_SIZE
_XFER = _INP + _INP_SIZE
_XMM_SAVE = _XFER + _XFER_SIZE
STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE

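# rotate_Xs
# Rotate values of symbols X0...X3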
.macro rotate_Xs
X_ = X0
X0 = X1
X1 = X2
X2 = X3
X3 = X_
.endm

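# ROTATE_ARGS
# Rotate values of symbols a...h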
.macro ROTATE_ARGS
TMP_ = h
h = g
g = f
f = e
e = d
d = c
c = b
b = a
a = TMP_
.endm

.macro FOUR_ROUNDS_AND_SCHED
	## compute s0 four at a time and s1 two at a time
	## compute W[-16] + W[-7] 4 at a time
	movdqa	X3, XTMP0
	mov	e, y0			# y0 = e
	ror	$(25-11), y0
	mov	a, y1			# y1 = a
	palignr	$4, X2, XTMP0		# XTMP0 = W[-7]
	ror	$(22-13), y1
	xor	e, y0
	mov	f, y2			# y2 = f
	ror	$(11-6), y0
	movdqa	X1, XTMP1
	xor	a, y1
	xor	g, y2			# y2 = f^g
	paddd	X0, XTMP0		# XTMP0 = W[-7] + W[-16]
	xor	e, y0
	and	e, y2			# y2 = (f^g)&e
	ror	$(13-2), y1
	## compute s0
	palignr	$4, X0, XTMP1		# XTMP1 = W[-15]
	xor	a, y1
	ror	$6, y0			# y0 = S1 = (e ror 6) ^ (e ror 11) ^ (e ror 25)
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	movdqa	XTMP1, XTMP2		# XTMP2 = W[-15]
	ror	$2, y1			# y1 = S0 = (a ror 2) ^ (a ror 13) ^ (a ror 22)
	add	y0, y2			# y2 = S1 + CH
	add	_XFER(%rsp), y2		# y2 = k + w + S1 + CH
	movdqa	XTMP1, XTMP3		# XTMP3 = W[-15]
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	pslld	$(32-7), XTMP1
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	psrld	$7, XTMP2
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	por	XTMP2, XTMP1		# XTMP1 = W[-15] ror 7
	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ

	ROTATE_ARGS
	movdqa	XTMP3, XTMP2		# XTMP2 = W[-15]
	mov	e, y0			# y0 = e
	mov	a, y1			# y1 = a
	movdqa	XTMP3, XTMP4		# XTMP4 = W[-15]
	ror	$(25-11), y0
	xor	e, y0
	mov	f, y2			# y2 = f
	ror	$(22-13), y1
	pslld	$(32-18), XTMP3
	xor	a, y1
	ror	$(11-6), y0
	xor	g, y2
	psrld	$18, XTMP2
	ror	$(13-2), y1
	xor	e, y0
	and	e, y2
	ror	$6, y0			# y0 = S1
	pxor	XTMP3, XTMP1
	xor	a, y1
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	psrld	$3, XTMP4		# XTMP4 = W[-15] >> 3
	add	y0, y2			# y2 = S1 + CH
	add	(1*4 + _XFER)(%rsp), y2	# y2 = k + w + S1 + CH
	ror	$2, y1			# y1 = S0
	pxor	XTMP2, XTMP1		# XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	pxor	XTMP4, XTMP1		# XTMP1 = s0
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c

	## compute low s1
	pshufd	$0b11111010, X3, XTMP2	# XTMP2 = W[-2] {BBAA}
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	paddd	XTMP1, XTMP0		# XTMP0 = W[-16] + W[-7] + s0
	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ

	ROTATE_ARGS
	movdqa	XTMP2, XTMP3		# XTMP3 = W[-2] {BBAA}
	mov	e, y0			# y0 = e
	mov	a, y1			# y1 = a
	ror	$(25-11), y0
	movdqa	XTMP2, XTMP4		# XTMP4 = W[-2] {BBAA}
	xor	e, y0
	ror	$(22-13), y1
	mov	f, y2			# y2 = f
	xor	a, y1
	ror	$(11-6), y0
	psrlq	$17, XTMP2		# XTMP2 = W[-2] ror 17 {xBxA}
	xor	g, y2
	psrlq	$19, XTMP3		# XTMP3 = W[-2] ror 19 {xBxA}
	xor	e, y0
	and	e, y2
	psrld	$10, XTMP4		# XTMP4 = W[-2] >> 10 {BBAA}
	ror	$(13-2), y1
	xor	a, y1
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	ror	$6, y0			# y0 = S1
	pxor	XTMP3, XTMP2
	add	y0, y2			# y2 = S1 + CH
	ror	$2, y1			# y1 = S0
	add	(2*4 + _XFER)(%rsp), y2	# y2 = k + w + S1 + CH
	pxor	XTMP2, XTMP4		# XTMP4 = s1 {xBxA}
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	pshufb	SHUF_00BA, XTMP4	# XTMP4 = s1 {00BA}
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	paddd	XTMP4, XTMP0		# XTMP0 = {..., ..., W[1], W[0]}
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0

	## compute high s1
	pshufd	$0b01010000, XTMP0, XTMP2	# XTMP2 = W[-2] {DDCC}
	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ

	ROTATE_ARGS
	movdqa	XTMP2, XTMP3		# XTMP3 = W[-2] {DDCC}
	mov	e, y0			# y0 = e
	ror	$(25-11), y0
	mov	a, y1			# y1 = a
	movdqa	XTMP2, X0		# X0 = W[-2] {DDCC}
	ror	$(22-13), y1
	xor	e, y0
	mov	f, y2			# y2 = f
	ror	$(11-6), y0
	psrlq	$17, XTMP2		# XTMP2 = W[-2] ror 17 {xDxC}
	xor	a, y1
	xor	g, y2
	psrlq	$19, XTMP3		# XTMP3 = W[-2] ror 19 {xDxC}
	xor	e, y0
	and	e, y2
	ror	$(13-2), y1
	psrld	$10, X0			# X0 = W[-2] >> 10 {DDCC}
	xor	a, y1
	ror	$6, y0			# y0 = S1
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	pxor	XTMP3, XTMP2
	ror	$2, y1			# y1 = S0
	add	y0, y2			# y2 = S1 + CH
	add	(3*4 + _XFER)(%rsp), y2	# y2 = k + w + S1 + CH
	pxor	XTMP2, X0		# X0 = s1 {xDxC}
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	pshufb	SHUF_DC00, X0		# X0 = s1 {DC00}
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	paddd	XTMP0, X0		# X0 = {W[3], W[2], W[1], W[0]}
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ

	ROTATE_ARGS
	rotate_Xs
.endm
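## DO_ROUND round
## Do one SHA-256 round; reads the pre-computed k + w value for this
## round from offset \round*4 in the _XFER area on the stack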
.macro DO_ROUND round
	mov	e, y0			# y0 = e
	ror	$(25-11), y0
	mov	a, y1			# y1 = a
	xor	e, y0
	ror	$(22-13), y1
	mov	f, y2			# y2 = f
	xor	a, y1
	ror	$(11-6), y0
	xor	g, y2
	xor	e, y0
	ror	$(13-2), y1
	and	e, y2
	xor	a, y1
	ror	$6, y0			# y0 = S1 = (e ror 6) ^ (e ror 11) ^ (e ror 25)
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	add	y0, y2			# y2 = S1 + CH
	ror	$2, y1			# y1 = S0 = (a ror 2) ^ (a ror 13) ^ (a ror 22)
	offset = \round * 4 + _XFER
	add	offset(%rsp), y2	# y2 = k + w + S1 + CH
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
.endm
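########################################################################
## sha256_transform_ssse3(CTX, INP, NUM_BLKS)
##   arg 1 (CTX, %rdi)      : pointer to the eight 32-bit digest words
##   arg 2 (INP, %rsi)      : pointer to the input data
##   arg 3 (NUM_BLKS, %rdx) : number of 64-byte blocks to process
##
## A matching C-side declaration would look roughly like the following;
## the exact parameter types are an assumption, not taken from this file:
##   asmlinkage void sha256_transform_ssse3(u32 *digest, const u8 *data,
##                                          u64 num_blks);
########################################################################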
.text
ENTRY(sha256_transform_ssse3)
.align 32
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	pushq	%rbp
	mov	%rsp, %rbp

	subq	$STACK_SIZE, %rsp
	and	$~15, %rsp		# align stack pointer to 16 bytes

	shl	$6, NUM_BLKS		# convert number of blocks to bytes
	jz	done_hash
	add	INP, NUM_BLKS
	mov	NUM_BLKS, _INP_END(%rsp)	# pointer to end of data

	## load initial digest
	mov	4*0(CTX), a
	mov	4*1(CTX), b
	mov	4*2(CTX), c
	mov	4*3(CTX), d
	mov	4*4(CTX), e
	mov	4*5(CTX), f
	mov	4*6(CTX), g
	mov	4*7(CTX), h

	movdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	movdqa	_SHUF_00BA(%rip), SHUF_00BA
	movdqa	_SHUF_DC00(%rip), SHUF_DC00

loop0:
	lea	K256(%rip), TBL

	## byte swap first 16 dwords of the message block
	COPY_XMM_AND_BSWAP	X0, 0*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP	X1, 1*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP	X2, 2*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP	X3, 3*16(INP), BYTE_FLIP_MASK

	mov	INP, _INP(%rsp)

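	## schedule 48 input dwords, by doing 3 rounds of 16 each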
	mov	$3, SRND
.align 16
loop1:
	movdqa	(TBL), XFER
	paddd	X0, XFER
	movdqa	XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	movdqa	1*16(TBL), XFER
	paddd	X0, XFER
	movdqa	XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	movdqa	2*16(TBL), XFER
	paddd	X0, XFER
	movdqa	XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	movdqa	3*16(TBL), XFER
	paddd	X0, XFER
	movdqa	XFER, _XFER(%rsp)
	add	$4*16, TBL
	FOUR_ROUNDS_AND_SCHED

	sub	$1, SRND
	jne	loop1

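	## Do the last 16 rounds with no message scheduling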
	mov	$2, SRND
loop2:
	paddd	(TBL), X0
	movdqa	X0, _XFER(%rsp)
	DO_ROUND	0
	DO_ROUND	1
	DO_ROUND	2
	DO_ROUND	3
	paddd	1*16(TBL), X1
	movdqa	X1, _XFER(%rsp)
	add	$2*16, TBL
	DO_ROUND	0
	DO_ROUND	1
	DO_ROUND	2
	DO_ROUND	3

	movdqa	X2, X0
	movdqa	X3, X1

	sub	$1, SRND
	jne	loop2

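	## add the eight working variables back into the digest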
	addm	(4*0)(CTX), a
	addm	(4*1)(CTX), b
	addm	(4*2)(CTX), c
	addm	(4*3)(CTX), d
	addm	(4*4)(CTX), e
	addm	(4*5)(CTX), f
	addm	(4*6)(CTX), g
	addm	(4*7)(CTX), h

	mov	_INP(%rsp), INP
	add	$64, INP		# advance to the next 64-byte block
	cmp	_INP_END(%rsp), INP
	jne	loop0

done_hash:

	mov	%rbp, %rsp
	popq	%rbp
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx

	ret
ENDPROC(sha256_transform_ssse3)

# SHA-256 round constants K[0..63]
.section .rodata.cst256.K256, "aM", @progbits, 256
.align 64
K256:
	.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

# byte-swap mask: reverses the byte order within each dword
.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
.align 16
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x0c0d0e0f08090a0b0405060700010203

# shuffle xBxA -> 00BA
.section .rodata.cst16._SHUF_00BA, "aM", @progbits, 16
.align 16
_SHUF_00BA:
	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100

# shuffle xDxC -> DC00
.section .rodata.cst16._SHUF_DC00, "aM", @progbits, 16
.align 16
_SHUF_DC00:
	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF