/*
 * x86_64/AVX2/AES-NI assembler implementation of Camellia (32-way parallel)
 */

#include <linux/linkage.h>

#define CAMELLIA_TABLE_BYTE_LEN 272

/* struct camellia_ctx offsets: */
#define key_table 0
#define key_length CAMELLIA_TABLE_BYTE_LEN

/* register macros */
#define CTX %rdi
#define RIO %r8

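/**********************************************************************
  helper macros
 **********************************************************************/

/*
 * filter_8bit: byte-wise table lookup through two 16-entry nibble tables.
 * The low nibble of each byte indexes lo_t, the high nibble indexes hi_t,
 * and the two lookups are XORed together (mask4bit must hold 0x0f bytes).
 */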
#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
        vpand x, mask4bit, tmp0; \
        vpandn x, mask4bit, x; \
        vpsrld $4, x, x; \
        \
        vpshufb tmp0, lo_t, tmp0; \
        vpshufb x, hi_t, x; \
        vpxor tmp0, x, x;

#define ymm0_x xmm0
#define ymm1_x xmm1
#define ymm2_x xmm2
#define ymm3_x xmm3
#define ymm4_x xmm4
#define ymm5_x xmm5
#define ymm6_x xmm6
#define ymm7_x xmm7
#define ymm8_x xmm8
#define ymm9_x xmm9
#define ymm10_x xmm10
#define ymm11_x xmm11
#define ymm12_x xmm12
#define ymm13_x xmm13
#define ymm14_x xmm14
#define ymm15_x xmm15

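/**********************************************************************
  32-way camellia
 **********************************************************************/

/*
 * roundsm32: one Camellia round on 32 blocks (byte-sliced).
 * IN:
 *   x0..x7: byte-sliced AB state
 *   mem_cd: register pointer to CD area
 *   key: index for key material
 * OUT:
 *   x0..x7: new byte-sliced CD state
 */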
#define roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
                  t7, mem_cd, key) \
        /* S-function with AES subbytes */ \
        vbroadcasti128 .Linv_shift_row, t4; \
        vpbroadcastd .L0f0f0f0f, t7; \
        vbroadcasti128 .Lpre_tf_lo_s1, t5; \
        vbroadcasti128 .Lpre_tf_hi_s1, t6; \
        vbroadcasti128 .Lpre_tf_lo_s4, t2; \
        vbroadcasti128 .Lpre_tf_hi_s4, t3; \
        \
        /* AES inverse shift rows */ \
        vpshufb t4, x0, x0; \
        vpshufb t4, x7, x7; \
        vpshufb t4, x3, x3; \
        vpshufb t4, x6, x6; \
        vpshufb t4, x2, x2; \
        vpshufb t4, x5, x5; \
        vpshufb t4, x1, x1; \
        vpshufb t4, x4, x4; \
        \
        /* prefilter sboxes 1, 2 and 3 */ \
        /* prefilter sbox 4 */ \
        filter_8bit(x0, t5, t6, t7, t4); \
        filter_8bit(x7, t5, t6, t7, t4); \
        vextracti128 $1, x0, t0##_x; \
        vextracti128 $1, x7, t1##_x; \
        filter_8bit(x3, t2, t3, t7, t4); \
        filter_8bit(x6, t2, t3, t7, t4); \
        vextracti128 $1, x3, t3##_x; \
        vextracti128 $1, x6, t2##_x; \
        filter_8bit(x2, t5, t6, t7, t4); \
        filter_8bit(x5, t5, t6, t7, t4); \
        filter_8bit(x1, t5, t6, t7, t4); \
        filter_8bit(x4, t5, t6, t7, t4); \
        \
        vpxor t4##_x, t4##_x, t4##_x; \
        \
        /* AES subbytes + AES shift rows */ \
        vextracti128 $1, x2, t6##_x; \
        vextracti128 $1, x5, t5##_x; \
        vaesenclast t4##_x, x0##_x, x0##_x; \
        vaesenclast t4##_x, t0##_x, t0##_x; \
        vinserti128 $1, t0##_x, x0, x0; \
        vaesenclast t4##_x, x7##_x, x7##_x; \
        vaesenclast t4##_x, t1##_x, t1##_x; \
        vinserti128 $1, t1##_x, x7, x7; \
        vaesenclast t4##_x, x3##_x, x3##_x; \
        vaesenclast t4##_x, t3##_x, t3##_x; \
        vinserti128 $1, t3##_x, x3, x3; \
        vaesenclast t4##_x, x6##_x, x6##_x; \
        vaesenclast t4##_x, t2##_x, t2##_x; \
        vinserti128 $1, t2##_x, x6, x6; \
        vextracti128 $1, x1, t3##_x; \
        vextracti128 $1, x4, t2##_x; \
        vbroadcasti128 .Lpost_tf_lo_s1, t0; \
        vbroadcasti128 .Lpost_tf_hi_s1, t1; \
        vaesenclast t4##_x, x2##_x, x2##_x; \
        vaesenclast t4##_x, t6##_x, t6##_x; \
        vinserti128 $1, t6##_x, x2, x2; \
        vaesenclast t4##_x, x5##_x, x5##_x; \
        vaesenclast t4##_x, t5##_x, t5##_x; \
        vinserti128 $1, t5##_x, x5, x5; \
        vaesenclast t4##_x, x1##_x, x1##_x; \
        vaesenclast t4##_x, t3##_x, t3##_x; \
        vinserti128 $1, t3##_x, x1, x1; \
        vaesenclast t4##_x, x4##_x, x4##_x; \
        vaesenclast t4##_x, t2##_x, t2##_x; \
        vinserti128 $1, t2##_x, x4, x4; \
        \
        /* postfilter sboxes 1 and 4 */ \
        vbroadcasti128 .Lpost_tf_lo_s3, t2; \
        vbroadcasti128 .Lpost_tf_hi_s3, t3; \
        filter_8bit(x0, t0, t1, t7, t6); \
        filter_8bit(x7, t0, t1, t7, t6); \
        filter_8bit(x3, t0, t1, t7, t6); \
        filter_8bit(x6, t0, t1, t7, t6); \
        \
        /* postfilter sbox 3 */ \
        vbroadcasti128 .Lpost_tf_lo_s2, t4; \
        vbroadcasti128 .Lpost_tf_hi_s2, t5; \
        filter_8bit(x2, t2, t3, t7, t6); \
        filter_8bit(x5, t2, t3, t7, t6); \
        \
        vpbroadcastq key, t0; /* higher 64-bit duplicate ignored */ \
        \
        /* postfilter sbox 2 */ \
        filter_8bit(x1, t4, t5, t7, t2); \
        filter_8bit(x4, t4, t5, t7, t2); \
        vpxor t7, t7, t7; \
        \
        vpsrldq $1, t0, t1; \
        vpsrldq $2, t0, t2; \
        vpshufb t7, t1, t1; \
        vpsrldq $3, t0, t3; \
        \
        /* P-function */ \
        vpxor x5, x0, x0; \
        vpxor x6, x1, x1; \
        vpxor x7, x2, x2; \
        vpxor x4, x3, x3; \
        \
        vpshufb t7, t2, t2; \
        vpsrldq $4, t0, t4; \
        vpshufb t7, t3, t3; \
        vpsrldq $5, t0, t5; \
        vpshufb t7, t4, t4; \
        \
        vpxor x2, x4, x4; \
        vpxor x3, x5, x5; \
        vpxor x0, x6, x6; \
        vpxor x1, x7, x7; \
        \
        vpsrldq $6, t0, t6; \
        vpshufb t7, t5, t5; \
        vpshufb t7, t6, t6; \
        \
        vpxor x7, x0, x0; \
        vpxor x4, x1, x1; \
        vpxor x5, x2, x2; \
        vpxor x6, x3, x3; \
        \
        vpxor x3, x4, x4; \
        vpxor x0, x5, x5; \
        vpxor x1, x6, x6; \
        vpxor x2, x7, x7; \
        \
        /* Add key material and result to CD (x becomes new CD) */ \
        \
        vpxor t6, x1, x1; \
        vpxor 5 * 32(mem_cd), x1, x1; \
        \
        vpsrldq $7, t0, t6; \
        vpshufb t7, t0, t0; \
        vpshufb t7, t6, t7; \
        \
        vpxor t7, x0, x0; \
        vpxor 4 * 32(mem_cd), x0, x0; \
        \
        vpxor t5, x2, x2; \
        vpxor 6 * 32(mem_cd), x2, x2; \
        \
        vpxor t4, x3, x3; \
        vpxor 7 * 32(mem_cd), x3, x3; \
        \
        vpxor t3, x4, x4; \
        vpxor 0 * 32(mem_cd), x4, x4; \
        \
        vpxor t2, x5, x5; \
        vpxor 1 * 32(mem_cd), x5, x5; \
        \
        vpxor t1, x6, x6; \
        vpxor 2 * 32(mem_cd), x6, x6; \
        \
        vpxor t0, x7, x7; \
        vpxor 3 * 32(mem_cd), x7, x7;

.align 8
roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd:
        roundsm32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                  %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15,
                  %rcx, (%r9));
        ret;
ENDPROC(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)

.align 8
roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab:
        roundsm32(%ymm4, %ymm5, %ymm6, %ymm7, %ymm0, %ymm1, %ymm2, %ymm3,
                  %ymm12, %ymm13, %ymm14, %ymm15, %ymm8, %ymm9, %ymm10, %ymm11,
                  %rax, (%r9));
        ret;
ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)

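/*
 * two_roundsm32: two Camellia rounds (AB side, then CD side) on 32 blocks.
 * IN/OUT:
 *  x0..x7: byte-sliced AB state preloaded in registers
 *  mem_ab, mem_cd: byte-sliced AB and CD state in memory
 *  i, dir: key index and direction (+1 enc / -1 dec); store_ab selects
 *  whether the new AB state is written back to memory.
 */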
#define two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
                      y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
        leaq (key_table + (i) * 8)(CTX), %r9; \
        call roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \
        \
        vmovdqu x0, 4 * 32(mem_cd); \
        vmovdqu x1, 5 * 32(mem_cd); \
        vmovdqu x2, 6 * 32(mem_cd); \
        vmovdqu x3, 7 * 32(mem_cd); \
        vmovdqu x4, 0 * 32(mem_cd); \
        vmovdqu x5, 1 * 32(mem_cd); \
        vmovdqu x6, 2 * 32(mem_cd); \
        vmovdqu x7, 3 * 32(mem_cd); \
        \
        leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \
        call roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \
        \
        store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);

#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */

#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
        /* Store new AB state */ \
        vmovdqu x4, 4 * 32(mem_ab); \
        vmovdqu x5, 5 * 32(mem_ab); \
        vmovdqu x6, 6 * 32(mem_ab); \
        vmovdqu x7, 7 * 32(mem_ab); \
        vmovdqu x0, 0 * 32(mem_ab); \
        vmovdqu x1, 1 * 32(mem_ab); \
        vmovdqu x2, 2 * 32(mem_ab); \
        vmovdqu x3, 3 * 32(mem_ab);

#define enc_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
                     y6, y7, mem_ab, mem_cd, i) \
        two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
                      y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
        two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
                      y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
        two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
                      y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);

#define dec_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
                     y6, y7, mem_ab, mem_cd, i) \
        two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
                      y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
        two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
                      y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
        two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
                      y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);

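/*
 * rol32_1_32: rotate each byte-sliced 32-bit word left by one bit.
 * IN:
 *  v0..v3: byte-sliced 32-bit integers
 * OUT:
 *  v0..v3: (IN <<< 1)
 * The vpcmpgtb/vpabsb pair extracts each byte's MSB as a 0/1 carry that is
 * ORed into the next more significant byte slice.
 */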
#define rol32_1_32(v0, v1, v2, v3, t0, t1, t2, zero) \
        vpcmpgtb v0, zero, t0; \
        vpaddb v0, v0, v0; \
        vpabsb t0, t0; \
        \
        vpcmpgtb v1, zero, t1; \
        vpaddb v1, v1, v1; \
        vpabsb t1, t1; \
        \
        vpcmpgtb v2, zero, t2; \
        vpaddb v2, v2, v2; \
        vpabsb t2, t2; \
        \
        vpor t0, v1, v1; \
        \
        vpcmpgtb v3, zero, t0; \
        vpaddb v3, v3, v3; \
        vpabsb t0, t0; \
        \
        vpor t1, v2, v2; \
        vpor t2, v3, v3; \
        vpor t0, v0, v0;

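/*
 * fls32: Camellia FL/FL^-1 layer for 32 blocks.
 * IN:
 *  l, r: register pointers to the byte-sliced left/right halves in memory
 *  kll, klr, krl, krr: 32-bit subkey words (memory operands)
 * OUT:
 *  l, r: new byte-sliced state
 */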
#define fls32(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
              tt1, tt2, tt3, kll, klr, krl, krr) \
        /* t0 = kll; t0 &= ll; lr ^= rol32(t0, 1); */ \
        vpbroadcastd kll, t0; /* only lowest 32-bit used */ \
        vpxor tt0, tt0, tt0; \
        vpshufb tt0, t0, t3; \
        vpsrldq $1, t0, t0; \
        vpshufb tt0, t0, t2; \
        vpsrldq $1, t0, t0; \
        vpshufb tt0, t0, t1; \
        vpsrldq $1, t0, t0; \
        vpshufb tt0, t0, t0; \
        \
        vpand l0, t0, t0; \
        vpand l1, t1, t1; \
        vpand l2, t2, t2; \
        vpand l3, t3, t3; \
        \
        rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
        \
        vpxor l4, t0, l4; \
        vpbroadcastd krr, t0; /* only lowest 32-bit used */ \
        vmovdqu l4, 4 * 32(l); \
        vpxor l5, t1, l5; \
        vmovdqu l5, 5 * 32(l); \
        vpxor l6, t2, l6; \
        vmovdqu l6, 6 * 32(l); \
        vpxor l7, t3, l7; \
        vmovdqu l7, 7 * 32(l); \
        \
        /* t2 = krr; t2 |= rr; rl ^= t2; */ \
        \
        vpshufb tt0, t0, t3; \
        vpsrldq $1, t0, t0; \
        vpshufb tt0, t0, t2; \
        vpsrldq $1, t0, t0; \
        vpshufb tt0, t0, t1; \
        vpsrldq $1, t0, t0; \
        vpshufb tt0, t0, t0; \
        \
        vpor 4 * 32(r), t0, t0; \
        vpor 5 * 32(r), t1, t1; \
        vpor 6 * 32(r), t2, t2; \
        vpor 7 * 32(r), t3, t3; \
        \
        vpxor 0 * 32(r), t0, t0; \
        vpxor 1 * 32(r), t1, t1; \
        vpxor 2 * 32(r), t2, t2; \
        vpxor 3 * 32(r), t3, t3; \
        vmovdqu t0, 0 * 32(r); \
        vpbroadcastd krl, t0; /* only lowest 32-bit used */ \
        vmovdqu t1, 1 * 32(r); \
        vmovdqu t2, 2 * 32(r); \
        vmovdqu t3, 3 * 32(r); \
        \
        /* t2 = krl; t2 &= rl; rr ^= rol32(t2, 1); */ \
        vpshufb tt0, t0, t3; \
        vpsrldq $1, t0, t0; \
        vpshufb tt0, t0, t2; \
        vpsrldq $1, t0, t0; \
        vpshufb tt0, t0, t1; \
        vpsrldq $1, t0, t0; \
        vpshufb tt0, t0, t0; \
        \
        vpand 0 * 32(r), t0, t0; \
        vpand 1 * 32(r), t1, t1; \
        vpand 2 * 32(r), t2, t2; \
        vpand 3 * 32(r), t3, t3; \
        \
        rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
        \
        vpxor 4 * 32(r), t0, t0; \
        vpxor 5 * 32(r), t1, t1; \
        vpxor 6 * 32(r), t2, t2; \
        vpxor 7 * 32(r), t3, t3; \
        vmovdqu t0, 4 * 32(r); \
        vpbroadcastd klr, t0; /* only lowest 32-bit used */ \
        vmovdqu t1, 5 * 32(r); \
        vmovdqu t2, 6 * 32(r); \
        vmovdqu t3, 7 * 32(r); \
        \
        /* t0 = klr; t0 |= lr; ll ^= t0; */ \
        \
        vpshufb tt0, t0, t3; \
        vpsrldq $1, t0, t0; \
        vpshufb tt0, t0, t2; \
        vpsrldq $1, t0, t0; \
        vpshufb tt0, t0, t1; \
        vpsrldq $1, t0, t0; \
        vpshufb tt0, t0, t0; \
        \
        vpor l4, t0, t0; \
        vpor l5, t1, t1; \
        vpor l6, t2, t2; \
        vpor l7, t3, t3; \
        \
        vpxor l0, t0, l0; \
        vmovdqu l0, 0 * 32(l); \
        vpxor l1, t1, l1; \
        vmovdqu l1, 1 * 32(l); \
        vpxor l2, t2, l2; \
        vmovdqu l2, 2 * 32(l); \
        vpxor l3, t3, l3; \
        vmovdqu l3, 3 * 32(l);

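/*
 * transpose_4x4 / byteslice_16x16b_fast: transpose the 16 vector registers
 * so that each register gathers one byte position from every block
 * (byte-slicing); st0/st1 are two 32-byte memory scratch slots.
 */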
#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
        vpunpckhdq x1, x0, t2; \
        vpunpckldq x1, x0, x0; \
        \
        vpunpckldq x3, x2, t1; \
        vpunpckhdq x3, x2, x2; \
        \
        vpunpckhqdq t1, x0, x1; \
        vpunpcklqdq t1, x0, x0; \
        \
        vpunpckhqdq x2, t2, x3; \
        vpunpcklqdq x2, t2, x2;

#define byteslice_16x16b_fast(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, \
                              a3, b3, c3, d3, st0, st1) \
        vmovdqu d2, st0; \
        vmovdqu d3, st1; \
        transpose_4x4(a0, a1, a2, a3, d2, d3); \
        transpose_4x4(b0, b1, b2, b3, d2, d3); \
        vmovdqu st0, d2; \
        vmovdqu st1, d3; \
        \
        vmovdqu a0, st0; \
        vmovdqu a1, st1; \
        transpose_4x4(c0, c1, c2, c3, a0, a1); \
        transpose_4x4(d0, d1, d2, d3, a0, a1); \
        \
        vbroadcasti128 .Lshufb_16x16b, a0; \
        vmovdqu st1, a1; \
        vpshufb a0, a2, a2; \
        vpshufb a0, a3, a3; \
        vpshufb a0, b0, b0; \
        vpshufb a0, b1, b1; \
        vpshufb a0, b2, b2; \
        vpshufb a0, b3, b3; \
        vpshufb a0, a1, a1; \
        vpshufb a0, c0, c0; \
        vpshufb a0, c1, c1; \
        vpshufb a0, c2, c2; \
        vpshufb a0, c3, c3; \
        vpshufb a0, d0, d0; \
        vpshufb a0, d1, d1; \
        vpshufb a0, d2, d2; \
        vpshufb a0, d3, d3; \
        vmovdqu d3, st1; \
        vmovdqu st0, d3; \
        vpshufb a0, d3, a0; \
        vmovdqu d2, st0; \
        \
        transpose_4x4(a0, b0, c0, d0, d2, d3); \
        transpose_4x4(a1, b1, c1, d1, d2, d3); \
        vmovdqu st0, d2; \
        vmovdqu st1, d3; \
        \
        vmovdqu b0, st0; \
        vmovdqu b1, st1; \
        transpose_4x4(a2, b2, c2, d2, b0, b1); \
        transpose_4x4(a3, b3, c3, d3, b0, b1); \
        vmovdqu st0, b0; \
        vmovdqu st1, b1; \
        /* does not adjust output bytes inside vectors */

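/*
 * inpack32_pre: load 32 blocks from rio and XOR in the pre-whitening key.
 * inpack32_post: byte-slice the pre-whitened state and spill the AB and CD
 * halves to mem_ab/mem_cd.
 */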
#define inpack32_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
                     y6, y7, rio, key) \
        vpbroadcastq key, x0; \
        vpshufb .Lpack_bswap, x0, x0; \
        \
        vpxor 0 * 32(rio), x0, y7; \
        vpxor 1 * 32(rio), x0, y6; \
        vpxor 2 * 32(rio), x0, y5; \
        vpxor 3 * 32(rio), x0, y4; \
        vpxor 4 * 32(rio), x0, y3; \
        vpxor 5 * 32(rio), x0, y2; \
        vpxor 6 * 32(rio), x0, y1; \
        vpxor 7 * 32(rio), x0, y0; \
        vpxor 8 * 32(rio), x0, x7; \
        vpxor 9 * 32(rio), x0, x6; \
        vpxor 10 * 32(rio), x0, x5; \
        vpxor 11 * 32(rio), x0, x4; \
        vpxor 12 * 32(rio), x0, x3; \
        vpxor 13 * 32(rio), x0, x2; \
        vpxor 14 * 32(rio), x0, x1; \
        vpxor 15 * 32(rio), x0, x0;

#define inpack32_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
                      y6, y7, mem_ab, mem_cd) \
        byteslice_16x16b_fast(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, \
                              y4, y5, y6, y7, (mem_ab), (mem_cd)); \
        \
        vmovdqu x0, 0 * 32(mem_ab); \
        vmovdqu x1, 1 * 32(mem_ab); \
        vmovdqu x2, 2 * 32(mem_ab); \
        vmovdqu x3, 3 * 32(mem_ab); \
        vmovdqu x4, 4 * 32(mem_ab); \
        vmovdqu x5, 5 * 32(mem_ab); \
        vmovdqu x6, 6 * 32(mem_ab); \
        vmovdqu x7, 7 * 32(mem_ab); \
        vmovdqu y0, 0 * 32(mem_cd); \
        vmovdqu y1, 1 * 32(mem_cd); \
        vmovdqu y2, 2 * 32(mem_cd); \
        vmovdqu y3, 3 * 32(mem_cd); \
        vmovdqu y4, 4 * 32(mem_cd); \
        vmovdqu y5, 5 * 32(mem_cd); \
        vmovdqu y6, 6 * 32(mem_cd); \
        vmovdqu y7, 7 * 32(mem_cd);

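/*
 * outunpack32: transpose back from byte-sliced form and XOR in the
 * post-whitening key. write_output: store the 32 result blocks to rio.
 */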
#define outunpack32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
                    y5, y6, y7, key, stack_tmp0, stack_tmp1) \
        byteslice_16x16b_fast(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, \
                              y3, y7, x3, x7, stack_tmp0, stack_tmp1); \
        \
        vmovdqu x0, stack_tmp0; \
        \
        vpbroadcastq key, x0; \
        vpshufb .Lpack_bswap, x0, x0; \
        \
        vpxor x0, y7, y7; \
        vpxor x0, y6, y6; \
        vpxor x0, y5, y5; \
        vpxor x0, y4, y4; \
        vpxor x0, y3, y3; \
        vpxor x0, y2, y2; \
        vpxor x0, y1, y1; \
        vpxor x0, y0, y0; \
        vpxor x0, x7, x7; \
        vpxor x0, x6, x6; \
        vpxor x0, x5, x5; \
        vpxor x0, x4, x4; \
        vpxor x0, x3, x3; \
        vpxor x0, x2, x2; \
        vpxor x0, x1, x1; \
        vpxor stack_tmp0, x0, x0;

#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
                     y6, y7, rio) \
        vmovdqu x0, 0 * 32(rio); \
        vmovdqu x1, 1 * 32(rio); \
        vmovdqu x2, 2 * 32(rio); \
        vmovdqu x3, 3 * 32(rio); \
        vmovdqu x4, 4 * 32(rio); \
        vmovdqu x5, 5 * 32(rio); \
        vmovdqu x6, 6 * 32(rio); \
        vmovdqu x7, 7 * 32(rio); \
        vmovdqu y0, 8 * 32(rio); \
        vmovdqu y1, 9 * 32(rio); \
        vmovdqu y2, 10 * 32(rio); \
        vmovdqu y3, 11 * 32(rio); \
        vmovdqu y4, 12 * 32(rio); \
        vmovdqu y5, 13 * 32(rio); \
        vmovdqu y6, 14 * 32(rio); \
        vmovdqu y7, 15 * 32(rio);

.data
.align 32

#define SHUFB_BYTES(idx) \
        0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)

.Lshufb_16x16b:
        .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
        .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)

.Lpack_bswap:
        .long 0x00010203, 0x04050607, 0x80808080, 0x80808080
        .long 0x00010203, 0x04050607, 0x80808080, 0x80808080

/* For CTR-mode IV byteswap */
.Lbswap128_mask:
        .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

/* For XTS mode */
.Lxts_gf128mul_and_shl1_mask_0:
        .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
.Lxts_gf128mul_and_shl1_mask_1:
        .byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0

/* pre-SubByte transform: prefilter table for sboxes 1, 2 and 3 */
.Lpre_tf_lo_s1:
        .byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
        .byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
.Lpre_tf_hi_s1:
        .byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
        .byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23

/* pre-SubByte transform: prefilter table for sbox 4 */
.Lpre_tf_lo_s4:
        .byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
        .byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
.Lpre_tf_hi_s4:
        .byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
        .byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf

/* post-SubByte transform: postfilter table for sboxes 1 and 4 */
.Lpost_tf_lo_s1:
        .byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
        .byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
.Lpost_tf_hi_s1:
        .byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
        .byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c

/* post-SubByte transform: postfilter table for sbox 2 */
.Lpost_tf_lo_s2:
        .byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
        .byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
.Lpost_tf_hi_s2:
        .byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
        .byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18

/* post-SubByte transform: postfilter table for sbox 3 */
.Lpost_tf_lo_s3:
        .byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
        .byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
.Lpost_tf_hi_s3:
        .byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
        .byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06

/* For isolating SubBytes from AESENCLAST, inverse shift row */
.Linv_shift_row:
        .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
        .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03

.align 4
/* 4-bit mask */
.L0f0f0f0f:
        .long 0x0f0f0f0f

.text

.align 8
__camellia_enc_blk32:
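        /*
         * input:
         *      %rdi: ctx, CTX
         *      %rax: temporary storage, 512 bytes
         *      %ymm0..%ymm15: 32 plaintext blocks
         * output:
         *      %ymm0..%ymm15: 32 encrypted blocks; register order is swapped,
         *      callers reorder the registers in write_output
         */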
        leaq 8 * 32(%rax), %rcx;

        inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                      %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                      %ymm15, %rax, %rcx);

        enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                     %ymm15, %rax, %rcx, 0);

        fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
              %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
              %ymm15,
              ((key_table + (8) * 8) + 0)(CTX),
              ((key_table + (8) * 8) + 4)(CTX),
              ((key_table + (8) * 8) + 8)(CTX),
              ((key_table + (8) * 8) + 12)(CTX));

        enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                     %ymm15, %rax, %rcx, 8);

        fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
              %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
              %ymm15,
              ((key_table + (16) * 8) + 0)(CTX),
              ((key_table + (16) * 8) + 4)(CTX),
              ((key_table + (16) * 8) + 8)(CTX),
              ((key_table + (16) * 8) + 12)(CTX));

        enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                     %ymm15, %rax, %rcx, 16);

        movl $24, %r8d;
        cmpl $16, key_length(CTX);
        jne .Lenc_max32;

.Lenc_done:
        /* load CD for output */
        vmovdqu 0 * 32(%rcx), %ymm8;
        vmovdqu 1 * 32(%rcx), %ymm9;
        vmovdqu 2 * 32(%rcx), %ymm10;
        vmovdqu 3 * 32(%rcx), %ymm11;
        vmovdqu 4 * 32(%rcx), %ymm12;
        vmovdqu 5 * 32(%rcx), %ymm13;
        vmovdqu 6 * 32(%rcx), %ymm14;
        vmovdqu 7 * 32(%rcx), %ymm15;

        outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                    %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                    %ymm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 32(%rax));

        ret;

.align 8
.Lenc_max32:
        movl $32, %r8d;

        fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
              %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
              %ymm15,
              ((key_table + (24) * 8) + 0)(CTX),
              ((key_table + (24) * 8) + 4)(CTX),
              ((key_table + (24) * 8) + 8)(CTX),
              ((key_table + (24) * 8) + 12)(CTX));

        enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                     %ymm15, %rax, %rcx, 24);

        jmp .Lenc_done;
ENDPROC(__camellia_enc_blk32)

.align 8
__camellia_dec_blk32:
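        /*
         * input:
         *      %rdi: ctx, CTX
         *      %rax: temporary storage, 512 bytes
         *      %r8d: 24 for 16 byte key, 32 for larger
         *      %ymm0..%ymm15: 32 encrypted blocks
         * output:
         *      %ymm0..%ymm15: 32 plaintext blocks; register order is swapped,
         *      callers reorder the registers in write_output
         */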
        leaq 8 * 32(%rax), %rcx;

        inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                      %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                      %ymm15, %rax, %rcx);

        cmpl $32, %r8d;
        je .Ldec_max32;

.Ldec_max24:
        dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                     %ymm15, %rax, %rcx, 16);

        fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
              %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
              %ymm15,
              ((key_table + (16) * 8) + 8)(CTX),
              ((key_table + (16) * 8) + 12)(CTX),
              ((key_table + (16) * 8) + 0)(CTX),
              ((key_table + (16) * 8) + 4)(CTX));

        dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                     %ymm15, %rax, %rcx, 8);

        fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
              %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
              %ymm15,
              ((key_table + (8) * 8) + 8)(CTX),
              ((key_table + (8) * 8) + 12)(CTX),
              ((key_table + (8) * 8) + 0)(CTX),
              ((key_table + (8) * 8) + 4)(CTX));

        dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                     %ymm15, %rax, %rcx, 0);

        /* load CD for output */
        vmovdqu 0 * 32(%rcx), %ymm8;
        vmovdqu 1 * 32(%rcx), %ymm9;
        vmovdqu 2 * 32(%rcx), %ymm10;
        vmovdqu 3 * 32(%rcx), %ymm11;
        vmovdqu 4 * 32(%rcx), %ymm12;
        vmovdqu 5 * 32(%rcx), %ymm13;
        vmovdqu 6 * 32(%rcx), %ymm14;
        vmovdqu 7 * 32(%rcx), %ymm15;

        outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                    %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                    %ymm15, (key_table)(CTX), (%rax), 1 * 32(%rax));

        ret;

.align 8
.Ldec_max32:
        dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                     %ymm15, %rax, %rcx, 24);

        fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
              %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
              %ymm15,
              ((key_table + (24) * 8) + 8)(CTX),
              ((key_table + (24) * 8) + 12)(CTX),
              ((key_table + (24) * 8) + 0)(CTX),
              ((key_table + (24) * 8) + 4)(CTX));

        jmp .Ldec_max24;
ENDPROC(__camellia_dec_blk32)

ENTRY(camellia_ecb_enc_32way)
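        /*
         * input:
         *      %rdi: ctx, CTX
         *      %rsi: dst (32 blocks)
         *      %rdx: src (32 blocks)
         */
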
        vzeroupper;

        inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                     %ymm15, %rdx, (key_table)(CTX));

        /* now dst can be used as temporary buffer (even in src == dst case) */
        movq %rsi, %rax;

        call __camellia_enc_blk32;

        write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
                     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
                     %ymm8, %rsi);

        vzeroupper;

        ret;
ENDPROC(camellia_ecb_enc_32way)

ENTRY(camellia_ecb_dec_32way)
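        /*
         * input:
         *      %rdi: ctx, CTX
         *      %rsi: dst (32 blocks)
         *      %rdx: src (32 blocks)
         */
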
        vzeroupper;

        cmpl $16, key_length(CTX);
        movl $32, %r8d;
        movl $24, %eax;
        cmovel %eax, %r8d; /* max */

        inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                     %ymm15, %rdx, (key_table)(CTX, %r8, 8));

        /* now dst can be used as temporary buffer (even in src == dst case) */
        movq %rsi, %rax;

        call __camellia_dec_blk32;

        write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
                     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
                     %ymm8, %rsi);

        vzeroupper;

        ret;
ENDPROC(camellia_ecb_dec_32way)

ENTRY(camellia_cbc_dec_32way)
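        /*
         * input:
         *      %rdi: ctx, CTX
         *      %rsi: dst (32 blocks)
         *      %rdx: src (32 blocks)
         */
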
        vzeroupper;

        cmpl $16, key_length(CTX);
        movl $32, %r8d;
        movl $24, %eax;
        cmovel %eax, %r8d; /* max */

        inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                     %ymm15, %rdx, (key_table)(CTX, %r8, 8));

        movq %rsp, %r10;
        cmpq %rsi, %rdx;
        je .Lcbc_dec_use_stack;

        /* dst can be used as temporary storage, src is not overwritten. */
        movq %rsi, %rax;
        jmp .Lcbc_dec_continue;

.Lcbc_dec_use_stack:
        /*
         * dst still in-use (because dst == src), so use stack for temporary
         * storage.
         */
        subq $(16 * 32), %rsp;
        movq %rsp, %rax;

.Lcbc_dec_continue:
        call __camellia_dec_blk32;

        vmovdqu %ymm7, (%rax);
        vpxor %ymm7, %ymm7, %ymm7;
        vinserti128 $1, (%rdx), %ymm7, %ymm7;
        vpxor (%rax), %ymm7, %ymm7;
        movq %r10, %rsp;
        vpxor (0 * 32 + 16)(%rdx), %ymm6, %ymm6;
        vpxor (1 * 32 + 16)(%rdx), %ymm5, %ymm5;
        vpxor (2 * 32 + 16)(%rdx), %ymm4, %ymm4;
        vpxor (3 * 32 + 16)(%rdx), %ymm3, %ymm3;
        vpxor (4 * 32 + 16)(%rdx), %ymm2, %ymm2;
        vpxor (5 * 32 + 16)(%rdx), %ymm1, %ymm1;
        vpxor (6 * 32 + 16)(%rdx), %ymm0, %ymm0;
        vpxor (7 * 32 + 16)(%rdx), %ymm15, %ymm15;
        vpxor (8 * 32 + 16)(%rdx), %ymm14, %ymm14;
        vpxor (9 * 32 + 16)(%rdx), %ymm13, %ymm13;
        vpxor (10 * 32 + 16)(%rdx), %ymm12, %ymm12;
        vpxor (11 * 32 + 16)(%rdx), %ymm11, %ymm11;
        vpxor (12 * 32 + 16)(%rdx), %ymm10, %ymm10;
        vpxor (13 * 32 + 16)(%rdx), %ymm9, %ymm9;
        vpxor (14 * 32 + 16)(%rdx), %ymm8, %ymm8;
        write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
                     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
                     %ymm8, %rsi);

        vzeroupper;

        ret;
ENDPROC(camellia_cbc_dec_32way)

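/*
 * inc_le128 / add2_le128: add 1 (respectively 2) to a little-endian 128-bit
 * counter held in a vector register, propagating the carry from the low
 * 64-bit half with a compare/subtract trick; minus_one/minus_two hold
 * {-1, 0} and {-2, 0} per 128-bit lane.
 */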
#define inc_le128(x, minus_one, tmp) \
        vpcmpeqq minus_one, x, tmp; \
        vpsubq minus_one, x, x; \
        vpslldq $8, tmp, tmp; \
        vpsubq tmp, x, x;

#define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \
        vpcmpeqq minus_one, x, tmp1; \
        vpcmpeqq minus_two, x, tmp2; \
        vpsubq minus_two, x, x; \
        vpor tmp2, tmp1, tmp1; \
        vpslldq $8, tmp1, tmp1; \
        vpsubq tmp1, x, x;

ENTRY(camellia_ctr_32way)
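        /*
         * input:
         *      %rdi: ctx, CTX
         *      %rsi: dst (32 blocks)
         *      %rdx: src (32 blocks)
         *      %rcx: iv (little endian, 128bit)
         */
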
        vzeroupper;

        movq %rsp, %r10;
        cmpq %rsi, %rdx;
        je .Lctr_use_stack;

        /* dst can be used as temporary storage, src is not overwritten. */
        movq %rsi, %rax;
        jmp .Lctr_continue;

.Lctr_use_stack:
        subq $(16 * 32), %rsp;
        movq %rsp, %rax;

.Lctr_continue:
        vpcmpeqd %ymm15, %ymm15, %ymm15;
        vpsrldq $8, %ymm15, %ymm15; /* low qword: -1, high qword: 0 */
        vpaddq %ymm15, %ymm15, %ymm12; /* low qword: -2, high qword: 0 */

        /* load IV and byteswap */
        vmovdqu (%rcx), %xmm0;
        vmovdqa %xmm0, %xmm1;
        inc_le128(%xmm0, %xmm15, %xmm14);
        vbroadcasti128 .Lbswap128_mask, %ymm14;
        vinserti128 $1, %xmm0, %ymm1, %ymm0;
        vpshufb %ymm14, %ymm0, %ymm13;
        vmovdqu %ymm13, 15 * 32(%rax);

        /* construct IVs */
        add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
        vpshufb %ymm14, %ymm0, %ymm13;
        vmovdqu %ymm13, 14 * 32(%rax);
        add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
        vpshufb %ymm14, %ymm0, %ymm13;
        vmovdqu %ymm13, 13 * 32(%rax);
        add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
        vpshufb %ymm14, %ymm0, %ymm13;
        vmovdqu %ymm13, 12 * 32(%rax);
        add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
        vpshufb %ymm14, %ymm0, %ymm13;
        vmovdqu %ymm13, 11 * 32(%rax);
        add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
        vpshufb %ymm14, %ymm0, %ymm10;
        add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
        vpshufb %ymm14, %ymm0, %ymm9;
        add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
        vpshufb %ymm14, %ymm0, %ymm8;
        add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
        vpshufb %ymm14, %ymm0, %ymm7;
        add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
        vpshufb %ymm14, %ymm0, %ymm6;
        add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
        vpshufb %ymm14, %ymm0, %ymm5;
        add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
        vpshufb %ymm14, %ymm0, %ymm4;
        add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
        vpshufb %ymm14, %ymm0, %ymm3;
        add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
        vpshufb %ymm14, %ymm0, %ymm2;
        add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
        vpshufb %ymm14, %ymm0, %ymm1;
        add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
        vextracti128 $1, %ymm0, %xmm13;
        vpshufb %ymm14, %ymm0, %ymm0;
        inc_le128(%xmm13, %xmm15, %xmm14);
        vmovdqu %xmm13, (%rcx);

        /* inpack32_pre: */
        vpbroadcastq (key_table)(CTX), %ymm15;
        vpshufb .Lpack_bswap, %ymm15, %ymm15;
        vpxor %ymm0, %ymm15, %ymm0;
        vpxor %ymm1, %ymm15, %ymm1;
        vpxor %ymm2, %ymm15, %ymm2;
        vpxor %ymm3, %ymm15, %ymm3;
        vpxor %ymm4, %ymm15, %ymm4;
        vpxor %ymm5, %ymm15, %ymm5;
        vpxor %ymm6, %ymm15, %ymm6;
        vpxor %ymm7, %ymm15, %ymm7;
        vpxor %ymm8, %ymm15, %ymm8;
        vpxor %ymm9, %ymm15, %ymm9;
        vpxor %ymm10, %ymm15, %ymm10;
        vpxor 11 * 32(%rax), %ymm15, %ymm11;
        vpxor 12 * 32(%rax), %ymm15, %ymm12;
        vpxor 13 * 32(%rax), %ymm15, %ymm13;
        vpxor 14 * 32(%rax), %ymm15, %ymm14;
        vpxor 15 * 32(%rax), %ymm15, %ymm15;

        call __camellia_enc_blk32;

        movq %r10, %rsp;

        vpxor 0 * 32(%rdx), %ymm7, %ymm7;
        vpxor 1 * 32(%rdx), %ymm6, %ymm6;
        vpxor 2 * 32(%rdx), %ymm5, %ymm5;
        vpxor 3 * 32(%rdx), %ymm4, %ymm4;
        vpxor 4 * 32(%rdx), %ymm3, %ymm3;
        vpxor 5 * 32(%rdx), %ymm2, %ymm2;
        vpxor 6 * 32(%rdx), %ymm1, %ymm1;
        vpxor 7 * 32(%rdx), %ymm0, %ymm0;
        vpxor 8 * 32(%rdx), %ymm15, %ymm15;
        vpxor 9 * 32(%rdx), %ymm14, %ymm14;
        vpxor 10 * 32(%rdx), %ymm13, %ymm13;
        vpxor 11 * 32(%rdx), %ymm12, %ymm12;
        vpxor 12 * 32(%rdx), %ymm11, %ymm11;
        vpxor 13 * 32(%rdx), %ymm10, %ymm10;
        vpxor 14 * 32(%rdx), %ymm9, %ymm9;
        vpxor 15 * 32(%rdx), %ymm8, %ymm8;
        write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
                     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
                     %ymm8, %rsi);

        vzeroupper;

        ret;
ENDPROC(camellia_ctr_32way)

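/*
 * gf128mul_x_ble: multiply a 128-bit XTS tweak by x in GF(2^128) (the "ble"
 * low-bit-first convention); gf128mul_x2_ble advances the tweak by x^2 in
 * one step. Both use the shl1 mask constants defined in the data section.
 */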
#define gf128mul_x_ble(iv, mask, tmp) \
        vpsrad $31, iv, tmp; \
        vpaddq iv, iv, iv; \
        vpshufd $0x13, tmp, tmp; \
        vpand mask, tmp, tmp; \
        vpxor tmp, iv, iv;

#define gf128mul_x2_ble(iv, mask1, mask2, tmp0, tmp1) \
        vpsrad $31, iv, tmp0; \
        vpaddq iv, iv, tmp1; \
        vpsllq $2, iv, iv; \
        vpshufd $0x13, tmp0, tmp0; \
        vpsrad $31, tmp1, tmp1; \
        vpand mask2, tmp0, tmp0; \
        vpshufd $0x13, tmp1, tmp1; \
        vpxor tmp0, iv, iv; \
        vpand mask1, tmp1, tmp1; \
        vpxor tmp1, iv, iv;

.align 8
camellia_xts_crypt_32way:
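        /*
         * input:
         *      %rdi: ctx, CTX
         *      %rsi: dst (32 blocks)
         *      %rdx: src (32 blocks)
         *      %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
         *      %r8: index for input whitening key
         *      %r9: pointer to __camellia_enc_blk32 or __camellia_dec_blk32
         */
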
        vzeroupper;

        subq $(16 * 32), %rsp;
        movq %rsp, %rax;

        vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_0, %ymm12;

        /* load IV and construct second IV */
        vmovdqu (%rcx), %xmm0;
        vmovdqa %xmm0, %xmm15;
        gf128mul_x_ble(%xmm0, %xmm12, %xmm13);
        vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_1, %ymm13;
        vinserti128 $1, %xmm0, %ymm15, %ymm0;
        vpxor 0 * 32(%rdx), %ymm0, %ymm15;
        vmovdqu %ymm15, 15 * 32(%rax);
        vmovdqu %ymm0, 0 * 32(%rsi);

        /* construct IVs */
        gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
        vpxor 1 * 32(%rdx), %ymm0, %ymm15;
        vmovdqu %ymm15, 14 * 32(%rax);
        vmovdqu %ymm0, 1 * 32(%rsi);

        gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
        vpxor 2 * 32(%rdx), %ymm0, %ymm15;
        vmovdqu %ymm15, 13 * 32(%rax);
        vmovdqu %ymm0, 2 * 32(%rsi);

        gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
        vpxor 3 * 32(%rdx), %ymm0, %ymm15;
        vmovdqu %ymm15, 12 * 32(%rax);
        vmovdqu %ymm0, 3 * 32(%rsi);

        gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
        vpxor 4 * 32(%rdx), %ymm0, %ymm11;
        vmovdqu %ymm0, 4 * 32(%rsi);

        gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
        vpxor 5 * 32(%rdx), %ymm0, %ymm10;
        vmovdqu %ymm0, 5 * 32(%rsi);

        gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
        vpxor 6 * 32(%rdx), %ymm0, %ymm9;
        vmovdqu %ymm0, 6 * 32(%rsi);

        gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
        vpxor 7 * 32(%rdx), %ymm0, %ymm8;
        vmovdqu %ymm0, 7 * 32(%rsi);

        gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
        vpxor 8 * 32(%rdx), %ymm0, %ymm7;
        vmovdqu %ymm0, 8 * 32(%rsi);

        gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
        vpxor 9 * 32(%rdx), %ymm0, %ymm6;
        vmovdqu %ymm0, 9 * 32(%rsi);

        gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
        vpxor 10 * 32(%rdx), %ymm0, %ymm5;
        vmovdqu %ymm0, 10 * 32(%rsi);

        gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
        vpxor 11 * 32(%rdx), %ymm0, %ymm4;
        vmovdqu %ymm0, 11 * 32(%rsi);

        gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
        vpxor 12 * 32(%rdx), %ymm0, %ymm3;
        vmovdqu %ymm0, 12 * 32(%rsi);

        gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
        vpxor 13 * 32(%rdx), %ymm0, %ymm2;
        vmovdqu %ymm0, 13 * 32(%rsi);

        gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
        vpxor 14 * 32(%rdx), %ymm0, %ymm1;
        vmovdqu %ymm0, 14 * 32(%rsi);

        gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
        vpxor 15 * 32(%rdx), %ymm0, %ymm15;
        vmovdqu %ymm15, 0 * 32(%rax);
        vmovdqu %ymm0, 15 * 32(%rsi);

        vextracti128 $1, %ymm0, %xmm0;
        gf128mul_x_ble(%xmm0, %xmm12, %xmm15);
        vmovdqu %xmm0, (%rcx);

        /* inpack32_pre: */
        vpbroadcastq (key_table)(CTX, %r8, 8), %ymm15;
        vpshufb .Lpack_bswap, %ymm15, %ymm15;
        vpxor 0 * 32(%rax), %ymm15, %ymm0;
        vpxor %ymm1, %ymm15, %ymm1;
        vpxor %ymm2, %ymm15, %ymm2;
        vpxor %ymm3, %ymm15, %ymm3;
        vpxor %ymm4, %ymm15, %ymm4;
        vpxor %ymm5, %ymm15, %ymm5;
        vpxor %ymm6, %ymm15, %ymm6;
        vpxor %ymm7, %ymm15, %ymm7;
        vpxor %ymm8, %ymm15, %ymm8;
        vpxor %ymm9, %ymm15, %ymm9;
        vpxor %ymm10, %ymm15, %ymm10;
        vpxor %ymm11, %ymm15, %ymm11;
        vpxor 12 * 32(%rax), %ymm15, %ymm12;
        vpxor 13 * 32(%rax), %ymm15, %ymm13;
        vpxor 14 * 32(%rax), %ymm15, %ymm14;
        vpxor 15 * 32(%rax), %ymm15, %ymm15;

        call *%r9;

        addq $(16 * 32), %rsp;

        vpxor 0 * 32(%rsi), %ymm7, %ymm7;
        vpxor 1 * 32(%rsi), %ymm6, %ymm6;
        vpxor 2 * 32(%rsi), %ymm5, %ymm5;
        vpxor 3 * 32(%rsi), %ymm4, %ymm4;
        vpxor 4 * 32(%rsi), %ymm3, %ymm3;
        vpxor 5 * 32(%rsi), %ymm2, %ymm2;
        vpxor 6 * 32(%rsi), %ymm1, %ymm1;
        vpxor 7 * 32(%rsi), %ymm0, %ymm0;
        vpxor 8 * 32(%rsi), %ymm15, %ymm15;
        vpxor 9 * 32(%rsi), %ymm14, %ymm14;
        vpxor 10 * 32(%rsi), %ymm13, %ymm13;
        vpxor 11 * 32(%rsi), %ymm12, %ymm12;
        vpxor 12 * 32(%rsi), %ymm11, %ymm11;
        vpxor 13 * 32(%rsi), %ymm10, %ymm10;
        vpxor 14 * 32(%rsi), %ymm9, %ymm9;
        vpxor 15 * 32(%rsi), %ymm8, %ymm8;
        write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
                     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
                     %ymm8, %rsi);

        vzeroupper;

        ret;
ENDPROC(camellia_xts_crypt_32way)

ENTRY(camellia_xts_enc_32way)
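        /*
         * input:
         *      %rdi: ctx, CTX
         *      %rsi: dst (32 blocks)
         *      %rdx: src (32 blocks)
         *      %rcx: iv (XTS tweak)
         */
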
        xorl %r8d, %r8d; /* input whitening key, 0 for enc */

        leaq __camellia_enc_blk32, %r9;

        jmp camellia_xts_crypt_32way;
ENDPROC(camellia_xts_enc_32way)

ENTRY(camellia_xts_dec_32way)
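        /*
         * input:
         *      %rdi: ctx, CTX
         *      %rsi: dst (32 blocks)
         *      %rdx: src (32 blocks)
         *      %rcx: iv (XTS tweak)
         */
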
        cmpl $16, key_length(CTX);
        movl $32, %r8d;
        movl $24, %eax;
        cmovel %eax, %r8d; /* input whitening key, last for dec */

        leaq __camellia_dec_blk32, %r9;

        jmp camellia_xts_crypt_32way;
ENDPROC(camellia_xts_dec_32way)