1
2
3
4
5
6
7
8
9
10
11
12#include <linux/linkage.h>
13#include <asm/frame.h>
14
/*
 * SHUFFLE_MASK(i0, i1, i2, i3) - build a pshufd immediate.
 *
 * Each argument is a 2-bit source-lane index; destination dword k of the
 * shuffle is taken from source dword i<k>.  Every parameter is wrapped in
 * parentheses so that a compound expression passed as an argument is
 * evaluated before the shift/or is applied.
 */
#define SHUFFLE_MASK(i0, i1, i2, i3) \
	((i0) | ((i1) << 2) | ((i2) << 4) | ((i3) << 6))

/*
 * MASK1, MASK2 and MASK3 rotate the four 32-bit lanes of a register
 * upward by one, two and three positions respectively.
 */
#define MASK1 SHUFFLE_MASK(3, 0, 1, 2)
#define MASK2 SHUFFLE_MASK(2, 3, 0, 1)
#define MASK3 SHUFFLE_MASK(1, 2, 3, 0)
21
/* The five 128-bit state words are kept in xmm0-xmm4. */
#define STATE0	%xmm0
#define STATE1	%xmm1
#define STATE2	%xmm2
#define STATE3	%xmm3
#define STATE4	%xmm4

/*
 * KEY and MSG are two names for the same register (xmm5).  KEY is only
 * referenced by crypto_morus640_sse2_init, which never touches MSG.
 */
#define KEY	%xmm5
#define MSG	%xmm5

/* Scratch registers; T0 is overwritten by every morus640_round. */
#define T0	%xmm6
#define T1	%xmm7
31
/*
 * Two 16-byte constants loaded into STATE3 and STATE4 by
 * crypto_morus640_sse2_init.  Read as one 32-byte string, every byte is
 * the sum (modulo 256) of the two bytes before it, starting from 0, 1.
 */
.section .rodata.cst16.morus640_const, "aM", @progbits, 32
.balign 16
.Lmorus640_const_0:
	.byte 0x00, 0x01, 0x01, 0x02
	.byte 0x03, 0x05, 0x08, 0x0d
	.byte 0x15, 0x22, 0x37, 0x59
	.byte 0x90, 0xe9, 0x79, 0x62
.Lmorus640_const_1:
	.byte 0xdb, 0x3d, 0x18, 0x55
	.byte 0x6d, 0xc2, 0x2f, 0xf1
	.byte 0x20, 0x11, 0x31, 0x42
	.byte 0x73, 0xb5, 0x28, 0xdd

/*
 * Byte positions 0..15.  crypto_morus640_sse2_dec_tail compares the tail
 * length against these to build a per-byte "inside the message" mask.
 */
.section .rodata.cst16.morus640_counter, "aM", @progbits, 16
.balign 16
.Lmorus640_counter:
	.byte 0x00, 0x01, 0x02, 0x03
	.byte 0x04, 0x05, 0x06, 0x07
	.byte 0x08, 0x09, 0x0a, 0x0b
	.byte 0x0c, 0x0d, 0x0e, 0x0f
46
.text

/*
 * morus640_round s0, s1, s2, s3, s4, b, w
 *
 *   s0 ^= (s1 & s2) ^ s3
 *   s0  = every 32-bit lane of s0 rotated left by b bits
 *   s3  = pshufd(s3, w)
 *
 * s4 is accepted for symmetry of the call sites but is not referenced.
 * s0..s3 are expected to be distinct registers (true for every use in
 * this file).  Clobbers T0, which is left holding the pre-rotate s0
 * shifted left by b.
 */
.macro morus640_round s0, s1, s2, s3, s4, b, w
	/* T0 = (s1 & s2) ^ s3, then fold it into s0 */
	movdqa	\s1, T0
	pand	\s2, T0
	pxor	\s3, T0
	pxor	T0, \s0

	/* rotate left: the two shifted halves never overlap, so OR merges them */
	movdqa	\s0, T0
	psrld	$(32 - \b), \s0
	pslld	$\b, T0
	por	T0, \s0

	/* permute the dwords of s3 in place */
	pshufd	$\w, \s3, \s3
.endm
60
61
62
63
64
65
66
67
68
69
70
/*
 * __morus640_update - one state update that absorbs a 16-byte block.
 *
 * Register-based internal convention (not the C ABI):
 *   in:       STATE0-STATE4 = current state, MSG = block to absorb
 *   out:      STATE0-STATE4 = new state
 *   clobbers: T0 (scratch of morus640_round)
 *
 * Five rounds are run, each targeting the next state word; MSG is XORed
 * into STATE1..STATE4 just before the round that updates that word.
 */
__morus640_update:
	morus640_round	STATE0, STATE1, STATE2, STATE3, STATE4,  5, MASK1

	pxor		MSG, STATE1
	morus640_round	STATE1, STATE2, STATE3, STATE4, STATE0, 31, MASK2

	pxor		MSG, STATE2
	morus640_round	STATE2, STATE3, STATE4, STATE0, STATE1,  7, MASK3

	pxor		MSG, STATE3
	morus640_round	STATE3, STATE4, STATE0, STATE1, STATE2, 22, MASK2

	pxor		MSG, STATE4
	morus640_round	STATE4, STATE0, STATE1, STATE2, STATE3, 13, MASK1

	ret
ENDPROC(__morus640_update)
83
84
85
86
87
88
89
90
91
92
93
/*
 * __morus640_update_zero - state update with no message block.
 *
 * Runs the same five rounds, rotation counts and shuffle masks as
 * __morus640_update, but skips the MSG XORs, so MSG/KEY (xmm5) is
 * neither read nor written.
 *
 *   in/out:   STATE0-STATE4
 *   clobbers: T0
 */
__morus640_update_zero:
	morus640_round	STATE0, STATE1, STATE2, STATE3, STATE4,  5, MASK1
	morus640_round	STATE1, STATE2, STATE3, STATE4, STATE0, 31, MASK2
	morus640_round	STATE2, STATE3, STATE4, STATE0, STATE1,  7, MASK3
	morus640_round	STATE3, STATE4, STATE0, STATE1, STATE2, 22, MASK2
	morus640_round	STATE4, STATE0, STATE1, STATE2, STATE3, 13, MASK1

	ret
ENDPROC(__morus640_update_zero)
102
103
104
105
106
107
108
109
110
111
112
113
114
/*
 * __load_partial - gather a short tail into MSG, zero-padded.
 *
 * Register-based internal convention:
 *   in:       %rsi = source pointer, %rcx = byte count
 *   out:      MSG  = the bytes at %rsi in order, remaining bytes zero
 *   clobbers: %r8, %r9, flags; T0 only when bit 3 of the count is set
 *
 * The count is decomposed by its bits: an 8-, 4-, 2- and 1-byte chunk is
 * read for each set bit, from the highest-addressed chunk (the single
 * byte) down to the lowest.  Each chunk's offset is the count with the
 * bits of that chunk and all smaller chunks cleared.  No byte outside
 * [%rsi, %rsi + count) is read for counts below 16.
 */
__load_partial:
	xor	%r9d, %r9d		/* r9 collects up to 7 bytes */
	pxor	MSG, MSG

	/* bit 0: single byte */
	test	$0x1, %cl
	jz	.Lld_partial_1

	mov	%rcx, %r8
	and	$0x1E, %r8
	movzbl	(%rsi, %r8), %r9d	/* r9 was zero, so this equals a byte insert */

.Lld_partial_1:
	/* bit 1: 16-bit word goes below whatever r9 already holds */
	test	$0x2, %cl
	jz	.Lld_partial_2

	mov	%rcx, %r8
	and	$0x1C, %r8
	movzwl	(%rsi, %r8), %r8d
	shl	$16, %r9
	or	%r8, %r9

.Lld_partial_2:
	/* bit 2: 32-bit dword goes below whatever r9 already holds */
	test	$0x4, %cl
	jz	.Lld_partial_4

	mov	%rcx, %r8
	and	$0x18, %r8
	mov	(%rsi, %r8), %r8d	/* 32-bit load zero-extends into r8 */
	shl	$32, %r9
	xor	%r8, %r9

.Lld_partial_4:
	movq	%r9, MSG		/* low qword = gathered bytes, high qword = 0 */

	/* bit 3: push the gathered bytes to the high qword, load 8 bytes below */
	test	$0x8, %cl
	jz	.Lld_partial_8

	mov	%rcx, %r8
	and	$0x10, %r8
	pslldq	$8, MSG
	movq	(%rsi, %r8), T0
	pxor	T0, MSG

.Lld_partial_8:
	ret
ENDPROC(__load_partial)
168
169
170
171
172
173
174
175
176
177
178
179
180
/*
 * __store_partial - write the first %rcx bytes of T0 to memory.
 *
 * Register-based internal convention:
 *   in:       %rdx = destination pointer, %rcx = byte count, T0 = data
 *   out:      bytes [0, count) of T0 stored at %rdx
 *   clobbers: %r8, %r9, %r10, flags; T0 is shifted right by 8 bytes when
 *             count >= 8.  %rcx and %rdx are left untouched.
 *
 * The count is a size, so every comparison uses the unsigned condition
 * (jb).  Chunks of 8, 4, 2 and 1 bytes are emitted in that order, so at
 * most 15 bytes are ever written; counts of 16 or more are not supported.
 */
__store_partial:
	mov	%rcx, %r8		/* r8 = bytes still to write */
	mov	%rdx, %r9		/* r9 = write cursor */

	movq	T0, %r10		/* r10 = next (up to) 8 bytes to emit */

	cmp	$8, %r8
	jb	.Lst_partial_8

	mov	%r10, (%r9)
	psrldq	$8, T0			/* bring the high qword down */
	movq	T0, %r10

	sub	$8, %r8
	add	$8, %r9

.Lst_partial_8:
	cmp	$4, %r8
	jb	.Lst_partial_4

	mov	%r10d, (%r9)
	shr	$32, %r10

	sub	$4, %r8
	add	$4, %r9

.Lst_partial_4:
	cmp	$2, %r8
	jb	.Lst_partial_2

	mov	%r10w, (%r9)
	shr	$16, %r10

	sub	$2, %r8
	add	$2, %r9

.Lst_partial_2:
	cmp	$1, %r8
	jb	.Lst_partial_1

	mov	%r10b, (%r9)

.Lst_partial_1:
	ret
ENDPROC(__store_partial)
226
227
228
229
/*
 * crypto_morus640_sse2_init - build the initial state.
 *
 *   %rdi = state buffer; 5 x 16 bytes are written (no alignment required)
 *   %rsi = 16-byte key (no alignment required)
 *   %rdx = 16-byte value that becomes STATE0
 *          (presumably the IV/nonce - confirm against the C prototype)
 *
 * The read-only constants are addressed RIP-relative so the code does not
 * depend on an absolute 32-bit address for .rodata.
 */
ENTRY(crypto_morus640_sse2_init)
	FRAME_BEGIN

	/* STATE0 = value at %rdx, STATE1 = key (a copy stays in KEY) */
	movdqu	(%rdx), STATE0
	movdqu	(%rsi), KEY
	movdqa	KEY, STATE1

	/* STATE2 = all ones */
	pcmpeqd	STATE2, STATE2

	/* STATE3 / STATE4 = the two fixed constants */
	movdqa	.Lmorus640_const_0(%rip), STATE3
	movdqa	.Lmorus640_const_1(%rip), STATE4

	/*
	 * 16 message-less updates.  __morus640_update_zero only touches
	 * STATE0-STATE4 and T0, so KEY survives the calls.
	 */
	.rept 16
	call	__morus640_update_zero
	.endr

	/* mix the key in once more after the updates */
	pxor	KEY, STATE1

	/* write the state back */
	movdqu	STATE0, (0 * 16)(%rdi)
	movdqu	STATE1, (1 * 16)(%rdi)
	movdqu	STATE2, (2 * 16)(%rdi)
	movdqu	STATE3, (3 * 16)(%rdi)
	movdqu	STATE4, (4 * 16)(%rdi)

	FRAME_END
	ret
ENDPROC(crypto_morus640_sse2_init)
275
276
277
278
279
/*
 * crypto_morus640_sse2_ad - absorb whole 16-byte blocks into the state
 * without producing output (judging by the name, associated data).
 *
 *   %rdi = state buffer (5 x 16 bytes, read and written back)
 *   %rsi = input pointer
 *   %rdx = input length in bytes
 *
 * Only complete blocks are consumed; a remainder of fewer than 16 bytes
 * is ignored, and nothing at all happens when the length is below 16.
 * The length is compared as an unsigned quantity both at entry and in
 * the loops.  Modifies %rsi, %rdx and flags.
 */
ENTRY(crypto_morus640_sse2_ad)
	FRAME_BEGIN

	cmp	$16, %rdx
	jb	.Lad_out

	/* load the state */
	movdqu	(0 * 16)(%rdi), STATE0
	movdqu	(1 * 16)(%rdi), STATE1
	movdqu	(2 * 16)(%rdi), STATE2
	movdqu	(3 * 16)(%rdi), STATE3
	movdqu	(4 * 16)(%rdi), STATE4

	/* a 16-byte aligned source may use movdqa, anything else movdqu */
	test	$0xF, %sil
	jnz	.Lad_u_loop

.align 4
.Lad_a_loop:
	movdqa	(%rsi), MSG
	call	__morus640_update
	sub	$16, %rdx
	add	$16, %rsi
	cmp	$16, %rdx
	jae	.Lad_a_loop		/* unsigned: another full block left */

	jmp	.Lad_cont
.align 4
.Lad_u_loop:
	movdqu	(%rsi), MSG
	call	__morus640_update
	sub	$16, %rdx
	add	$16, %rsi
	cmp	$16, %rdx
	jae	.Lad_u_loop		/* unsigned: another full block left */

.Lad_cont:
	/* write the state back */
	movdqu	STATE0, (0 * 16)(%rdi)
	movdqu	STATE1, (1 * 16)(%rdi)
	movdqu	STATE2, (2 * 16)(%rdi)
	movdqu	STATE3, (3 * 16)(%rdi)
	movdqu	STATE4, (4 * 16)(%rdi)

.Lad_out:
	FRAME_END
	ret
ENDPROC(crypto_morus640_sse2_ad)
328
329
330
331
332
/*
 * crypto_morus640_sse2_enc - encrypt whole 16-byte blocks.
 *
 *   %rdi = state buffer (5 x 16 bytes, read and written back)
 *   %rsi = source pointer
 *   %rdx = destination pointer
 *   %rcx = length in bytes
 *
 * For each block:
 *   out = in ^ STATE0 ^ pshufd(STATE1, MASK3) ^ (STATE2 & STATE3)
 * and the input block is then absorbed with __morus640_update.
 *
 * Only complete blocks are processed; nothing happens when the length is
 * below 16.  The length is compared as an unsigned quantity both at entry
 * and in the loops.  Modifies %rsi, %rdx, %rcx, %r8 and flags.
 */
ENTRY(crypto_morus640_sse2_enc)
	FRAME_BEGIN

	cmp	$16, %rcx
	jb	.Lenc_out

	/* load the state */
	movdqu	(0 * 16)(%rdi), STATE0
	movdqu	(1 * 16)(%rdi), STATE1
	movdqu	(2 * 16)(%rdi), STATE2
	movdqu	(3 * 16)(%rdi), STATE3
	movdqu	(4 * 16)(%rdi), STATE4

	/* use the aligned loop only if BOTH pointers are 16-byte aligned */
	mov	%rsi, %r8
	or	%rdx, %r8
	and	$0xF, %r8
	jnz	.Lenc_u_loop

.align 4
.Lenc_a_loop:
	movdqa	(%rsi), MSG
	movdqa	MSG, T0			/* MSG must stay intact for the update */
	pxor	STATE0, T0
	pshufd	$MASK3, STATE1, T1
	pxor	T1, T0
	movdqa	STATE2, T1
	pand	STATE3, T1
	pxor	T1, T0
	movdqa	T0, (%rdx)

	call	__morus640_update
	sub	$16, %rcx
	add	$16, %rsi
	add	$16, %rdx
	cmp	$16, %rcx
	jae	.Lenc_a_loop		/* unsigned: another full block left */

	jmp	.Lenc_cont
.align 4
.Lenc_u_loop:
	movdqu	(%rsi), MSG
	movdqa	MSG, T0			/* MSG must stay intact for the update */
	pxor	STATE0, T0
	pshufd	$MASK3, STATE1, T1
	pxor	T1, T0
	movdqa	STATE2, T1
	pand	STATE3, T1
	pxor	T1, T0
	movdqu	T0, (%rdx)

	call	__morus640_update
	sub	$16, %rcx
	add	$16, %rsi
	add	$16, %rdx
	cmp	$16, %rcx
	jae	.Lenc_u_loop		/* unsigned: another full block left */

.Lenc_cont:
	/* write the state back */
	movdqu	STATE0, (0 * 16)(%rdi)
	movdqu	STATE1, (1 * 16)(%rdi)
	movdqu	STATE2, (2 * 16)(%rdi)
	movdqu	STATE3, (3 * 16)(%rdi)
	movdqu	STATE4, (4 * 16)(%rdi)

.Lenc_out:
	FRAME_END
	ret
ENDPROC(crypto_morus640_sse2_enc)
402
403
404
405
406
/*
 * crypto_morus640_sse2_enc_tail - encrypt a final block shorter than 16
 * bytes.
 *
 *   %rdi = state buffer (5 x 16 bytes, read and written back)
 *   %rsi = source pointer       (consumed by __load_partial)
 *   %rdx = destination pointer  (consumed by __store_partial)
 *   %rcx = byte count           (used by both helpers)
 *
 * The input is zero-padded to 16 bytes, XORed with
 *   STATE0 ^ pshufd(STATE1, MASK3) ^ (STATE2 & STATE3),
 * the first %rcx bytes of the result are stored, and the zero-padded
 * input is absorbed into the state.
 */
ENTRY(crypto_morus640_sse2_enc_tail)
	FRAME_BEGIN

	/* bring the state into registers */
	movdqu	(0 * 16)(%rdi), STATE0
	movdqu	(1 * 16)(%rdi), STATE1
	movdqu	(2 * 16)(%rdi), STATE2
	movdqu	(3 * 16)(%rdi), STATE3
	movdqu	(4 * 16)(%rdi), STATE4

	/* MSG = zero-padded input */
	call	__load_partial

	/* T1 = STATE2 & STATE3 */
	movdqa	STATE2, T1
	pand	STATE3, T1

	/* T0 = pshufd(STATE1) ^ STATE0 ^ MSG ^ T1 */
	pshufd	$MASK3, STATE1, T0
	pxor	STATE0, T0
	pxor	MSG, T0
	pxor	T1, T0

	/* emit only the requested number of bytes */
	call	__store_partial

	/* MSG still holds the padded input */
	call	__morus640_update

	/* write the state back */
	movdqu	STATE0, (0 * 16)(%rdi)
	movdqu	STATE1, (1 * 16)(%rdi)
	movdqu	STATE2, (2 * 16)(%rdi)
	movdqu	STATE3, (3 * 16)(%rdi)
	movdqu	STATE4, (4 * 16)(%rdi)

	FRAME_END
	ret
ENDPROC(crypto_morus640_sse2_enc_tail)
442
443
444
445
446
/*
 * crypto_morus640_sse2_dec - decrypt whole 16-byte blocks.
 *
 *   %rdi = state buffer (5 x 16 bytes, read and written back)
 *   %rsi = source pointer
 *   %rdx = destination pointer
 *   %rcx = length in bytes
 *
 * For each block:
 *   out = in ^ STATE0 ^ pshufd(STATE1, MASK3) ^ (STATE2 & STATE3)
 * and the OUTPUT block is then absorbed with __morus640_update.
 *
 * Only complete blocks are processed; nothing happens when the length is
 * below 16.  The length is compared as an unsigned quantity both at entry
 * and in the loops.  Modifies %rsi, %rdx, %rcx, %r8 and flags.
 */
ENTRY(crypto_morus640_sse2_dec)
	FRAME_BEGIN

	cmp	$16, %rcx
	jb	.Ldec_out

	/* load the state */
	movdqu	(0 * 16)(%rdi), STATE0
	movdqu	(1 * 16)(%rdi), STATE1
	movdqu	(2 * 16)(%rdi), STATE2
	movdqu	(3 * 16)(%rdi), STATE3
	movdqu	(4 * 16)(%rdi), STATE4

	/* use the aligned loop only if BOTH pointers are 16-byte aligned */
	mov	%rsi, %r8
	or	%rdx, %r8
	and	$0xF, %r8
	jnz	.Ldec_u_loop

.align 4
.Ldec_a_loop:
	movdqa	(%rsi), MSG
	pxor	STATE0, MSG
	pshufd	$MASK3, STATE1, T0
	pxor	T0, MSG
	movdqa	STATE2, T0
	pand	STATE3, T0
	pxor	T0, MSG
	movdqa	MSG, (%rdx)

	call	__morus640_update	/* absorbs the block just written */
	sub	$16, %rcx
	add	$16, %rsi
	add	$16, %rdx
	cmp	$16, %rcx
	jae	.Ldec_a_loop		/* unsigned: another full block left */

	jmp	.Ldec_cont
.align 4
.Ldec_u_loop:
	movdqu	(%rsi), MSG
	pxor	STATE0, MSG
	pshufd	$MASK3, STATE1, T0
	pxor	T0, MSG
	movdqa	STATE2, T0
	pand	STATE3, T0
	pxor	T0, MSG
	movdqu	MSG, (%rdx)

	call	__morus640_update	/* absorbs the block just written */
	sub	$16, %rcx
	add	$16, %rsi
	add	$16, %rdx
	cmp	$16, %rcx
	jae	.Ldec_u_loop		/* unsigned: another full block left */

.Ldec_cont:
	/* write the state back */
	movdqu	STATE0, (0 * 16)(%rdi)
	movdqu	STATE1, (1 * 16)(%rdi)
	movdqu	STATE2, (2 * 16)(%rdi)
	movdqu	STATE3, (3 * 16)(%rdi)
	movdqu	STATE4, (4 * 16)(%rdi)

.Ldec_out:
	FRAME_END
	ret
ENDPROC(crypto_morus640_sse2_dec)
514
515
516
517
518
/*
 * crypto_morus640_sse2_dec_tail - decrypt a final block shorter than 16
 * bytes.
 *
 *   %rdi = state buffer (5 x 16 bytes, read and written back)
 *   %rsi = source pointer       (consumed by __load_partial)
 *   %rdx = destination pointer  (consumed by __store_partial)
 *   %rcx = byte count           (used by both helpers and for the mask)
 *
 * The zero-padded input is XORed with
 *   STATE0 ^ pshufd(STATE1, MASK3) ^ (STATE2 & STATE3)
 * and the first %rcx bytes are stored.  Bytes at positions >= %rcx of
 * that XOR are keystream rather than message, so they are masked to zero
 * before the block is absorbed into the state.
 *
 * The byte-index table is addressed RIP-relative so the code does not
 * depend on an absolute 32-bit address for .rodata.
 */
ENTRY(crypto_morus640_sse2_dec_tail)
	FRAME_BEGIN

	/* load the state */
	movdqu	(0 * 16)(%rdi), STATE0
	movdqu	(1 * 16)(%rdi), STATE1
	movdqu	(2 * 16)(%rdi), STATE2
	movdqu	(3 * 16)(%rdi), STATE3
	movdqu	(4 * 16)(%rdi), STATE4

	/* MSG = zero-padded input */
	call	__load_partial

	pxor	STATE0, MSG
	pshufd	$MASK3, STATE1, T0
	pxor	T0, MSG
	movdqa	STATE2, T0
	pand	STATE3, T0
	pxor	T0, MSG
	movdqa	MSG, T0			/* __store_partial takes its data in T0 */

	call	__store_partial

	/*
	 * Replicate the low byte of the count into all 16 bytes of T0:
	 * each punpcklbw doubles the run length (1 -> 2 -> 4 -> 8 -> 16).
	 */
	movq	%rcx, T0
	punpcklbw T0, T0
	punpcklbw T0, T0
	punpcklbw T0, T0
	punpcklbw T0, T0

	/* byte i of T0 becomes 0xFF where count > i, else 0x00 */
	movdqa	.Lmorus640_counter(%rip), T1
	pcmpgtb	T1, T0
	pand	T0, MSG

	call	__morus640_update

	/* write the state back */
	movdqu	STATE0, (0 * 16)(%rdi)
	movdqu	STATE1, (1 * 16)(%rdi)
	movdqu	STATE2, (2 * 16)(%rdi)
	movdqu	STATE3, (3 * 16)(%rdi)
	movdqu	STATE4, (4 * 16)(%rdi)

	FRAME_END
	ret
ENDPROC(crypto_morus640_sse2_dec_tail)
564
565
566
567
568
/*
 * crypto_morus640_sse2_final - finalisation; XORs the result into a
 * 16-byte buffer.
 *
 *   %rdi = state buffer (5 x 16 bytes, read only - it is NOT written back)
 *   %rsi = 16-byte buffer, updated in place (no alignment required)
 *   %rdx = 64-bit value placed in the low  qword of the final block
 *   %rcx = 64-bit value placed in the high qword of the final block
 *          (both are multiplied by 8 - presumably byte lengths turned
 *          into bit counts; confirm against the C caller)
 */
ENTRY(crypto_morus640_sse2_final)
	FRAME_BEGIN

	/* bring the state into registers */
	movdqu	(0 * 16)(%rdi), STATE0
	movdqu	(1 * 16)(%rdi), STATE1
	movdqu	(2 * 16)(%rdi), STATE2
	movdqu	(3 * 16)(%rdi), STATE3
	movdqu	(4 * 16)(%rdi), STATE4

	/* fold STATE0 into STATE4 */
	pxor	STATE0, STATE4

	/* MSG = { %rdx, %rcx }, each qword shifted left by 3 (times 8) */
	movq	%rdx, MSG
	movq	%rcx, T0
	punpcklqdq T0, MSG
	psllq	$3, MSG

	/* ten updates with that block; __morus640_update leaves MSG intact */
	.rept 10
	call	__morus640_update
	.endr

	/* buffer ^= STATE0 ^ pshufd(STATE1, MASK3) ^ (STATE2 & STATE3) */
	pshufd	$MASK3, STATE1, T0
	pxor	STATE0, T0
	movdqu	(%rsi), MSG
	pxor	T0, MSG
	movdqa	STATE2, T0
	pand	STATE3, T0
	pxor	T0, MSG

	movdqu	MSG, (%rsi)

	FRAME_END
	ret
ENDPROC(crypto_morus640_sse2_final)
616