/*
 * x86_64/AVX2/AES-NI assembler implementation of Camellia (32-way parallel)
 */

#include <linux/linkage.h>
#include <asm/frame.h>

#define CAMELLIA_TABLE_BYTE_LEN 272

/* struct camellia_ctx: */
#define key_table 0
#define key_length CAMELLIA_TABLE_BYTE_LEN

/* register macros */
#define CTX %rdi
#define RIO %r8

/**********************************************************************
  helper macros
 **********************************************************************/
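/*
 * filter_8bit: nibble-sliced table lookup. Each byte of x is split into its
 * low and high 4-bit halves, each half indexes a 16-entry vpshufb table
 * (lo_t/hi_t), and the two lookups are XORed, evaluating an 8-bit transform
 * with two shuffles.
 */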
#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
	vpand x, mask4bit, tmp0; \
	vpandn x, mask4bit, x; \
	vpsrld $4, x, x; \
	\
	vpshufb tmp0, lo_t, tmp0; \
	vpshufb x, hi_t, x; \
	vpxor tmp0, x, x;

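/*
 * ymmN_x aliases: in the macros below, "reg##_x" pastes a ymm register name
 * with "_x"; these defines then map the pasted token to the corresponding
 * xmm register, i.e. the low 128-bit lane of that ymm register.
 */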
#define ymm0_x xmm0
#define ymm1_x xmm1
#define ymm2_x xmm2
#define ymm3_x xmm3
#define ymm4_x xmm4
#define ymm5_x xmm5
#define ymm6_x xmm6
#define ymm7_x xmm7
#define ymm8_x xmm8
#define ymm9_x xmm9
#define ymm10_x xmm10
#define ymm11_x xmm11
#define ymm12_x xmm12
#define ymm13_x xmm13
#define ymm14_x xmm14
#define ymm15_x xmm15

/**********************************************************************
  32-way camellia
 **********************************************************************/

/*
 * IN:
 *   x0..x7: byte-sliced AB state
 *   mem_cd: register pointer storing CD state
 *   key: index for key material
 * OUT:
 *   x0..x7: new byte-sliced CD state
 */
#define roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
		  t7, mem_cd, key) \
	/* \
	 * S-function with AES subbytes \
	 */ \
	vbroadcasti128 .Linv_shift_row, t4; \
	vpbroadcastd .L0f0f0f0f, t7; \
	vbroadcasti128 .Lpre_tf_lo_s1, t5; \
	vbroadcasti128 .Lpre_tf_hi_s1, t6; \
	vbroadcasti128 .Lpre_tf_lo_s4, t2; \
	vbroadcasti128 .Lpre_tf_hi_s4, t3; \
	\
	/* AES inverse shift rows */ \
	vpshufb t4, x0, x0; \
	vpshufb t4, x7, x7; \
	vpshufb t4, x3, x3; \
	vpshufb t4, x6, x6; \
	vpshufb t4, x2, x2; \
	vpshufb t4, x5, x5; \
	vpshufb t4, x1, x1; \
	vpshufb t4, x4, x4; \
	\
	/* prefilter sboxes 1, 2 and 3 */ \
	/* prefilter sbox 4 */ \
	filter_8bit(x0, t5, t6, t7, t4); \
	filter_8bit(x7, t5, t6, t7, t4); \
	vextracti128 $1, x0, t0##_x; \
	vextracti128 $1, x7, t1##_x; \
	filter_8bit(x3, t2, t3, t7, t4); \
	filter_8bit(x6, t2, t3, t7, t4); \
	vextracti128 $1, x3, t3##_x; \
	vextracti128 $1, x6, t2##_x; \
	filter_8bit(x2, t5, t6, t7, t4); \
	filter_8bit(x5, t5, t6, t7, t4); \
	filter_8bit(x1, t5, t6, t7, t4); \
	filter_8bit(x4, t5, t6, t7, t4); \
	\
	vpxor t4##_x, t4##_x, t4##_x; \
	\
	/* AES subbytes + AES shift rows */ \
	vextracti128 $1, x2, t6##_x; \
	vextracti128 $1, x5, t5##_x; \
	vaesenclast t4##_x, x0##_x, x0##_x; \
	vaesenclast t4##_x, t0##_x, t0##_x; \
	vinserti128 $1, t0##_x, x0, x0; \
	vaesenclast t4##_x, x7##_x, x7##_x; \
	vaesenclast t4##_x, t1##_x, t1##_x; \
	vinserti128 $1, t1##_x, x7, x7; \
	vaesenclast t4##_x, x3##_x, x3##_x; \
	vaesenclast t4##_x, t3##_x, t3##_x; \
	vinserti128 $1, t3##_x, x3, x3; \
	vaesenclast t4##_x, x6##_x, x6##_x; \
	vaesenclast t4##_x, t2##_x, t2##_x; \
	vinserti128 $1, t2##_x, x6, x6; \
	vextracti128 $1, x1, t3##_x; \
	vextracti128 $1, x4, t2##_x; \
	vbroadcasti128 .Lpost_tf_lo_s1, t0; \
	vbroadcasti128 .Lpost_tf_hi_s1, t1; \
	vaesenclast t4##_x, x2##_x, x2##_x; \
	vaesenclast t4##_x, t6##_x, t6##_x; \
	vinserti128 $1, t6##_x, x2, x2; \
	vaesenclast t4##_x, x5##_x, x5##_x; \
	vaesenclast t4##_x, t5##_x, t5##_x; \
	vinserti128 $1, t5##_x, x5, x5; \
	vaesenclast t4##_x, x1##_x, x1##_x; \
	vaesenclast t4##_x, t3##_x, t3##_x; \
	vinserti128 $1, t3##_x, x1, x1; \
	vaesenclast t4##_x, x4##_x, x4##_x; \
	vaesenclast t4##_x, t2##_x, t2##_x; \
	vinserti128 $1, t2##_x, x4, x4; \
	\
	/* postfilter sboxes 1 and 4 */ \
	vbroadcasti128 .Lpost_tf_lo_s3, t2; \
	vbroadcasti128 .Lpost_tf_hi_s3, t3; \
	filter_8bit(x0, t0, t1, t7, t6); \
	filter_8bit(x7, t0, t1, t7, t6); \
	filter_8bit(x3, t0, t1, t7, t6); \
	filter_8bit(x6, t0, t1, t7, t6); \
	\
	/* postfilter sbox 3 */ \
	vbroadcasti128 .Lpost_tf_lo_s2, t4; \
	vbroadcasti128 .Lpost_tf_hi_s2, t5; \
	filter_8bit(x2, t2, t3, t7, t6); \
	filter_8bit(x5, t2, t3, t7, t6); \
	\
	vpbroadcastq key, t0; /* key qword broadcast to all lanes */ \
	\
	/* postfilter sbox 2 */ \
	filter_8bit(x1, t4, t5, t7, t2); \
	filter_8bit(x4, t4, t5, t7, t2); \
	vpxor t7, t7, t7; \
	\
	vpsrldq $1, t0, t1; \
	vpsrldq $2, t0, t2; \
	vpshufb t7, t1, t1; \
	vpsrldq $3, t0, t3; \
	\
	/* P-function */ \
	vpxor x5, x0, x0; \
	vpxor x6, x1, x1; \
	vpxor x7, x2, x2; \
	vpxor x4, x3, x3; \
	\
	vpshufb t7, t2, t2; \
	vpsrldq $4, t0, t4; \
	vpshufb t7, t3, t3; \
	vpsrldq $5, t0, t5; \
	vpshufb t7, t4, t4; \
	\
	vpxor x2, x4, x4; \
	vpxor x3, x5, x5; \
	vpxor x0, x6, x6; \
	vpxor x1, x7, x7; \
	\
	vpsrldq $6, t0, t6; \
	vpshufb t7, t5, t5; \
	vpshufb t7, t6, t6; \
	\
	vpxor x7, x0, x0; \
	vpxor x4, x1, x1; \
	vpxor x5, x2, x2; \
	vpxor x6, x3, x3; \
	\
	vpxor x3, x4, x4; \
	vpxor x0, x5, x5; \
	vpxor x1, x6, x6; \
	vpxor x2, x7, x7; \
	\
	/* Add key material and result to CD (x becomes new CD) */ \
	\
	vpxor t6, x1, x1; \
	vpxor 5 * 32(mem_cd), x1, x1; \
	\
	vpsrldq $7, t0, t6; \
	vpshufb t7, t0, t0; \
	vpshufb t7, t6, t7; \
	\
	vpxor t7, x0, x0; \
	vpxor 4 * 32(mem_cd), x0, x0; \
	\
	vpxor t5, x2, x2; \
	vpxor 6 * 32(mem_cd), x2, x2; \
	\
	vpxor t4, x3, x3; \
	vpxor 7 * 32(mem_cd), x3, x3; \
	\
	vpxor t3, x4, x4; \
	vpxor 0 * 32(mem_cd), x4, x4; \
	\
	vpxor t2, x5, x5; \
	vpxor 1 * 32(mem_cd), x5, x5; \
	\
	vpxor t1, x6, x6; \
	vpxor 2 * 32(mem_cd), x6, x6; \
	\
	vpxor t0, x7, x7; \
	vpxor 3 * 32(mem_cd), x7, x7;

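/*
 * The round body above is large, so it is instantiated only twice (once per
 * register layout) as local functions and reached via call from
 * two_roundsm32, trading call overhead for smaller code size.
 */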
.align 8
SYM_FUNC_START_LOCAL(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)
	roundsm32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		  %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15,
		  %rcx, (%r9));
	ret;
SYM_FUNC_END(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)

.align 8
SYM_FUNC_START_LOCAL(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
	roundsm32(%ymm4, %ymm5, %ymm6, %ymm7, %ymm0, %ymm1, %ymm2, %ymm3,
		  %ymm12, %ymm13, %ymm14, %ymm15, %ymm8, %ymm9, %ymm10, %ymm11,
		  %rax, (%r9));
	ret;
SYM_FUNC_END(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)

/*
 * IN/OUT:
 *  x0..x7: byte-sliced AB state preloaded
 *  mem_ab: byte-sliced AB state in memory
 *  mem_cd: byte-sliced CD state in memory
 */
#define two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
	leaq (key_table + (i) * 8)(CTX), %r9; \
	call roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \
	\
	vmovdqu x0, 4 * 32(mem_cd); \
	vmovdqu x1, 5 * 32(mem_cd); \
	vmovdqu x2, 6 * 32(mem_cd); \
	vmovdqu x3, 7 * 32(mem_cd); \
	vmovdqu x4, 0 * 32(mem_cd); \
	vmovdqu x5, 1 * 32(mem_cd); \
	vmovdqu x6, 2 * 32(mem_cd); \
	vmovdqu x7, 3 * 32(mem_cd); \
	\
	leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \
	call roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \
	\
	store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);

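/*
 * dummy_store: no-op store_ab hook, used for the final round pair of each
 * group where the AB state does not need to be written back to memory.
 */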
#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */

#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
	/* Store new AB state */ \
	vmovdqu x4, 4 * 32(mem_ab); \
	vmovdqu x5, 5 * 32(mem_ab); \
	vmovdqu x6, 6 * 32(mem_ab); \
	vmovdqu x7, 7 * 32(mem_ab); \
	vmovdqu x0, 0 * 32(mem_ab); \
	vmovdqu x1, 1 * 32(mem_ab); \
	vmovdqu x2, 2 * 32(mem_ab); \
	vmovdqu x3, 3 * 32(mem_ab);
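
/*
 * enc_rounds32/dec_rounds32: three round pairs per invocation. dir=+1 walks
 * the subkey table forward (encryption), dir=-1 walks it backward
 * (decryption).
 */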

#define enc_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, mem_ab, mem_cd, i) \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);

#define dec_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, mem_ab, mem_cd, i) \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);

/*
 * IN:
 *  v0..3: byte-sliced 32-bit integers
 * OUT:
 *  v0..3: (IN <<< 1)
 */
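/*
 * AVX2 has no byte-granularity shift, so the rotate-left-by-1 is built from
 * vpcmpgtb against zero (signed compare extracts each byte's MSB as 0xff),
 * vpaddb v, v (shift each byte left by one), vpabsb (0xff -> 0x01), and
 * vpor to carry each extracted MSB into the neighbouring byte slice.
 */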
#define rol32_1_32(v0, v1, v2, v3, t0, t1, t2, zero) \
	vpcmpgtb v0, zero, t0; \
	vpaddb v0, v0, v0; \
	vpabsb t0, t0; \
	\
	vpcmpgtb v1, zero, t1; \
	vpaddb v1, v1, v1; \
	vpabsb t1, t1; \
	\
	vpcmpgtb v2, zero, t2; \
	vpaddb v2, v2, v2; \
	vpabsb t2, t2; \
	\
	vpor t0, v1, v1; \
	\
	vpcmpgtb v3, zero, t0; \
	vpaddb v3, v3, v3; \
	vpabsb t0, t0; \
	\
	vpor t1, v2, v2; \
	vpor t2, v3, v3; \
	vpor t0, v0, v0;

/*
 * FL and FL^-1 layer, applied to the byte-sliced state halves in memory:
 *  l: byte-sliced AB state in memory
 *  r: byte-sliced CD state in memory
 *  kll, klr, krl, krr: 32-bit subkey words
 */
#define fls32(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
	      tt1, tt2, tt3, kll, klr, krl, krr) \
	/* \
	 * t0 = kll; \
	 * t0 &= ll; \
	 * lr ^= rol32(t0, 1); \
	 */ \
	vpbroadcastd kll, t0; /* only lowest 32-bit used */ \
	vpxor tt0, tt0, tt0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpand l0, t0, t0; \
	vpand l1, t1, t1; \
	vpand l2, t2, t2; \
	vpand l3, t3, t3; \
	\
	rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
	\
	vpxor l4, t0, l4; \
	vpbroadcastd krr, t0; /* only lowest 32-bit used */ \
	vmovdqu l4, 4 * 32(l); \
	vpxor l5, t1, l5; \
	vmovdqu l5, 5 * 32(l); \
	vpxor l6, t2, l6; \
	vmovdqu l6, 6 * 32(l); \
	vpxor l7, t3, l7; \
	vmovdqu l7, 7 * 32(l); \
	\
	/* \
	 * t2 = krr; \
	 * t2 |= rr; \
	 * rl ^= t2; \
	 */ \
	\
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpor 4 * 32(r), t0, t0; \
	vpor 5 * 32(r), t1, t1; \
	vpor 6 * 32(r), t2, t2; \
	vpor 7 * 32(r), t3, t3; \
	\
	vpxor 0 * 32(r), t0, t0; \
	vpxor 1 * 32(r), t1, t1; \
	vpxor 2 * 32(r), t2, t2; \
	vpxor 3 * 32(r), t3, t3; \
	vmovdqu t0, 0 * 32(r); \
	vpbroadcastd krl, t0; /* only lowest 32-bit used */ \
	vmovdqu t1, 1 * 32(r); \
	vmovdqu t2, 2 * 32(r); \
	vmovdqu t3, 3 * 32(r); \
	\
	/* \
	 * t2 = krl; \
	 * t2 &= rl; \
	 * rr ^= rol32(t2, 1); \
	 */ \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpand 0 * 32(r), t0, t0; \
	vpand 1 * 32(r), t1, t1; \
	vpand 2 * 32(r), t2, t2; \
	vpand 3 * 32(r), t3, t3; \
	\
	rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
	\
	vpxor 4 * 32(r), t0, t0; \
	vpxor 5 * 32(r), t1, t1; \
	vpxor 6 * 32(r), t2, t2; \
	vpxor 7 * 32(r), t3, t3; \
	vmovdqu t0, 4 * 32(r); \
	vpbroadcastd klr, t0; /* only lowest 32-bit used */ \
	vmovdqu t1, 5 * 32(r); \
	vmovdqu t2, 6 * 32(r); \
	vmovdqu t3, 7 * 32(r); \
	\
	/* \
	 * t0 = klr; \
	 * t0 |= lr; \
	 * ll ^= t0; \
	 */ \
	\
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpor l4, t0, t0; \
	vpor l5, t1, t1; \
	vpor l6, t2, t2; \
	vpor l7, t3, t3; \
	\
	vpxor l0, t0, l0; \
	vmovdqu l0, 0 * 32(l); \
	vpxor l1, t1, l1; \
	vmovdqu l1, 1 * 32(l); \
	vpxor l2, t2, l2; \
	vmovdqu l2, 2 * 32(l); \
	vpxor l3, t3, l3; \
	vmovdqu l3, 3 * 32(l);

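/* transpose a 4x4 matrix of 32-bit words within each 128-bit lane */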
#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x1, x0, x0; \
	\
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x2; \
	\
	vpunpckhqdq t1, x0, x1; \
	vpunpcklqdq t1, x0, x0; \
	\
	vpunpckhqdq x2, t2, x3; \
	vpunpcklqdq x2, t2, x2;

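/*
 * byteslice_16x16b_fast: transpose a 16x16 byte matrix spread across the 16
 * registers, so that each register ends up holding one byte position of
 * every block and the AES-based S-layer can process all blocks in parallel.
 * Uses st0/st1 as spill slots.
 */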
#define byteslice_16x16b_fast(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, \
			      a3, b3, c3, d3, st0, st1) \
	vmovdqu d2, st0; \
	vmovdqu d3, st1; \
	transpose_4x4(a0, a1, a2, a3, d2, d3); \
	transpose_4x4(b0, b1, b2, b3, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu a0, st0; \
	vmovdqu a1, st1; \
	transpose_4x4(c0, c1, c2, c3, a0, a1); \
	transpose_4x4(d0, d1, d2, d3, a0, a1); \
	\
	vbroadcasti128 .Lshufb_16x16b, a0; \
	vmovdqu st1, a1; \
	vpshufb a0, a2, a2; \
	vpshufb a0, a3, a3; \
	vpshufb a0, b0, b0; \
	vpshufb a0, b1, b1; \
	vpshufb a0, b2, b2; \
	vpshufb a0, b3, b3; \
	vpshufb a0, a1, a1; \
	vpshufb a0, c0, c0; \
	vpshufb a0, c1, c1; \
	vpshufb a0, c2, c2; \
	vpshufb a0, c3, c3; \
	vpshufb a0, d0, d0; \
	vpshufb a0, d1, d1; \
	vpshufb a0, d2, d2; \
	vpshufb a0, d3, d3; \
	vmovdqu d3, st1; \
	vmovdqu st0, d3; \
	vpshufb a0, d3, a0; \
	vmovdqu d2, st0; \
	\
	transpose_4x4(a0, b0, c0, d0, d2, d3); \
	transpose_4x4(a1, b1, c1, d1, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu b0, st0; \
	vmovdqu b1, st1; \
	transpose_4x4(a2, b2, c2, d2, b0, b1); \
	transpose_4x4(a3, b3, c3, d3, b0, b1); \
	vmovdqu st0, b0; \
	vmovdqu st1, b1; \
	/* does not adjust output bytes inside vectors */

/* load blocks to registers and apply pre-whitening */
#define inpack32_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, rio, key) \
	vpbroadcastq key, x0; \
	vpshufb .Lpack_bswap, x0, x0; \
	\
	vpxor 0 * 32(rio), x0, y7; \
	vpxor 1 * 32(rio), x0, y6; \
	vpxor 2 * 32(rio), x0, y5; \
	vpxor 3 * 32(rio), x0, y4; \
	vpxor 4 * 32(rio), x0, y3; \
	vpxor 5 * 32(rio), x0, y2; \
	vpxor 6 * 32(rio), x0, y1; \
	vpxor 7 * 32(rio), x0, y0; \
	vpxor 8 * 32(rio), x0, x7; \
	vpxor 9 * 32(rio), x0, x6; \
	vpxor 10 * 32(rio), x0, x5; \
	vpxor 11 * 32(rio), x0, x4; \
	vpxor 12 * 32(rio), x0, x3; \
	vpxor 13 * 32(rio), x0, x2; \
	vpxor 14 * 32(rio), x0, x1; \
	vpxor 15 * 32(rio), x0, x0;

/* byteslice pre-whitened blocks and store to temporary memory */
#define inpack32_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd) \
	byteslice_16x16b_fast(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, \
			      y4, y5, y6, y7, (mem_ab), (mem_cd)); \
	\
	vmovdqu x0, 0 * 32(mem_ab); \
	vmovdqu x1, 1 * 32(mem_ab); \
	vmovdqu x2, 2 * 32(mem_ab); \
	vmovdqu x3, 3 * 32(mem_ab); \
	vmovdqu x4, 4 * 32(mem_ab); \
	vmovdqu x5, 5 * 32(mem_ab); \
	vmovdqu x6, 6 * 32(mem_ab); \
	vmovdqu x7, 7 * 32(mem_ab); \
	vmovdqu y0, 0 * 32(mem_cd); \
	vmovdqu y1, 1 * 32(mem_cd); \
	vmovdqu y2, 2 * 32(mem_cd); \
	vmovdqu y3, 3 * 32(mem_cd); \
	vmovdqu y4, 4 * 32(mem_cd); \
	vmovdqu y5, 5 * 32(mem_cd); \
	vmovdqu y6, 6 * 32(mem_cd); \
	vmovdqu y7, 7 * 32(mem_cd);

/* de-byteslice, apply post-whitening and store blocks */
#define outunpack32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
		    y5, y6, y7, key, stack_tmp0, stack_tmp1) \
	byteslice_16x16b_fast(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, \
			      y3, y7, x3, x7, stack_tmp0, stack_tmp1); \
	\
	vmovdqu x0, stack_tmp0; \
	\
	vpbroadcastq key, x0; \
	vpshufb .Lpack_bswap, x0, x0; \
	\
	vpxor x0, y7, y7; \
	vpxor x0, y6, y6; \
	vpxor x0, y5, y5; \
	vpxor x0, y4, y4; \
	vpxor x0, y3, y3; \
	vpxor x0, y2, y2; \
	vpxor x0, y1, y1; \
	vpxor x0, y0, y0; \
	vpxor x0, x7, x7; \
	vpxor x0, x6, x6; \
	vpxor x0, x5, x5; \
	vpxor x0, x4, x4; \
	vpxor x0, x3, x3; \
	vpxor x0, x2, x2; \
	vpxor x0, x1, x1; \
	vpxor stack_tmp0, x0, x0;

#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, rio) \
	vmovdqu x0, 0 * 32(rio); \
	vmovdqu x1, 1 * 32(rio); \
	vmovdqu x2, 2 * 32(rio); \
	vmovdqu x3, 3 * 32(rio); \
	vmovdqu x4, 4 * 32(rio); \
	vmovdqu x5, 5 * 32(rio); \
	vmovdqu x6, 6 * 32(rio); \
	vmovdqu x7, 7 * 32(rio); \
	vmovdqu y0, 8 * 32(rio); \
	vmovdqu y1, 9 * 32(rio); \
	vmovdqu y2, 10 * 32(rio); \
	vmovdqu y3, 11 * 32(rio); \
	vmovdqu y4, 12 * 32(rio); \
	vmovdqu y5, 13 * 32(rio); \
	vmovdqu y6, 14 * 32(rio); \
	vmovdqu y7, 15 * 32(rio);

.section	.rodata.cst32.shufb_16x16b, "aM", @progbits, 32
.align 32
#define SHUFB_BYTES(idx) \
	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
.Lshufb_16x16b:
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)

.section	.rodata.cst32.pack_bswap, "aM", @progbits, 32
.align 32
/*
 * Shuffle mask for inpack32_pre/outunpack32: byte-reverse each 32-bit half
 * of the broadcast 64-bit whitening key and zero the high 8 bytes of each
 * 128-bit lane (0x80 control bytes select zero).
 */
.Lpack_bswap:
	.long 0x00010203, 0x04050607, 0x80808080, 0x80808080
	.long 0x00010203, 0x04050607, 0x80808080, 0x80808080


.section	.rodata.cst16, "aM", @progbits, 16
.align 16

/*
 * pre-SubByte transform: pre-filter lookup (lo/hi nibble tables) for sbox1,
 * sbox2 and sbox3, mapping Camellia s-box input into the AES S-box domain.
 */
.Lpre_tf_lo_s1:
	.byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
	.byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
.Lpre_tf_hi_s1:
	.byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
	.byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23

/*
 * pre-SubByte transform: pre-filter lookup for sbox4, whose input is
 * rotated left by one bit relative to sbox1.
 */
.Lpre_tf_lo_s4:
	.byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
	.byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
.Lpre_tf_hi_s4:
	.byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
	.byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf

/*
 * post-SubByte transform: post-filter lookup for sbox1 and sbox4, mapping
 * the AES S-box output back into the Camellia s-box domain.
 */
.Lpost_tf_lo_s1:
	.byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
	.byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
.Lpost_tf_hi_s1:
	.byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
	.byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c

/*
 * post-SubByte transform: post-filter lookup for sbox2, whose output is
 * rotated left by one bit relative to sbox1.
 */
.Lpost_tf_lo_s2:
	.byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
	.byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
.Lpost_tf_hi_s2:
	.byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
	.byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18

/*
 * post-SubByte transform: post-filter lookup for sbox3, whose output is
 * rotated right by one bit relative to sbox1.
 */
.Lpost_tf_lo_s3:
	.byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
	.byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
.Lpost_tf_hi_s3:
	.byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
	.byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06

/* For isolating SubBytes from AESENCLAST, inverse shift row */
.Linv_shift_row:
	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03

.section	.rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
.align 4
/* 4-bit mask */
.L0f0f0f0f:
	.long 0x0f0f0f0f

.text

.align 8
SYM_FUNC_START_LOCAL(__camellia_enc_blk32)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rax: temporary storage, 512 bytes
	 *	%ymm0..%ymm15: 32 plaintext blocks
	 * output:
	 *	%ymm0..%ymm15: 32 encrypted blocks, order swapped:
	 *	 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
	 */
	FRAME_BEGIN

	leaq 8 * 32(%rax), %rcx;

	inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		      %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		      %ymm15, %rax, %rcx);

	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 0);

	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      %ymm15,
	      ((key_table + (8) * 8) + 0)(CTX),
	      ((key_table + (8) * 8) + 4)(CTX),
	      ((key_table + (8) * 8) + 8)(CTX),
	      ((key_table + (8) * 8) + 12)(CTX));

	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 8);

	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      %ymm15,
	      ((key_table + (16) * 8) + 0)(CTX),
	      ((key_table + (16) * 8) + 4)(CTX),
	      ((key_table + (16) * 8) + 8)(CTX),
	      ((key_table + (16) * 8) + 12)(CTX));

	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 16);

	movl $24, %r8d;
	cmpl $16, key_length(CTX);
	jne .Lenc_max32;

.Lenc_done:
	/* load CD for output */
	vmovdqu 0 * 32(%rcx), %ymm8;
	vmovdqu 1 * 32(%rcx), %ymm9;
	vmovdqu 2 * 32(%rcx), %ymm10;
	vmovdqu 3 * 32(%rcx), %ymm11;
	vmovdqu 4 * 32(%rcx), %ymm12;
	vmovdqu 5 * 32(%rcx), %ymm13;
	vmovdqu 6 * 32(%rcx), %ymm14;
	vmovdqu 7 * 32(%rcx), %ymm15;

	outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		    %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		    %ymm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 32(%rax));

	FRAME_END
	ret;

.align 8
.Lenc_max32:
	movl $32, %r8d;

	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      %ymm15,
	      ((key_table + (24) * 8) + 0)(CTX),
	      ((key_table + (24) * 8) + 4)(CTX),
	      ((key_table + (24) * 8) + 8)(CTX),
	      ((key_table + (24) * 8) + 12)(CTX));

	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 24);

	jmp .Lenc_done;
SYM_FUNC_END(__camellia_enc_blk32)

.align 8
SYM_FUNC_START_LOCAL(__camellia_dec_blk32)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rax: temporary storage, 512 bytes
	 *	%r8d: 24 for 16 byte key, 32 for larger
	 *	%ymm0..%ymm15: 32 encrypted blocks
	 * output:
	 *	%ymm0..%ymm15: 32 plaintext blocks, order swapped:
	 *	 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
	 */
	FRAME_BEGIN

	leaq 8 * 32(%rax), %rcx;

	inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		      %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		      %ymm15, %rax, %rcx);

	cmpl $32, %r8d;
	je .Ldec_max32;

.Ldec_max24:
	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 16);

	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      %ymm15,
	      ((key_table + (16) * 8) + 8)(CTX),
	      ((key_table + (16) * 8) + 12)(CTX),
	      ((key_table + (16) * 8) + 0)(CTX),
	      ((key_table + (16) * 8) + 4)(CTX));

	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 8);

	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      %ymm15,
	      ((key_table + (8) * 8) + 8)(CTX),
	      ((key_table + (8) * 8) + 12)(CTX),
	      ((key_table + (8) * 8) + 0)(CTX),
	      ((key_table + (8) * 8) + 4)(CTX));

	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 0);

	/* load CD for output */
	vmovdqu 0 * 32(%rcx), %ymm8;
	vmovdqu 1 * 32(%rcx), %ymm9;
	vmovdqu 2 * 32(%rcx), %ymm10;
	vmovdqu 3 * 32(%rcx), %ymm11;
	vmovdqu 4 * 32(%rcx), %ymm12;
	vmovdqu 5 * 32(%rcx), %ymm13;
	vmovdqu 6 * 32(%rcx), %ymm14;
	vmovdqu 7 * 32(%rcx), %ymm15;

	outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		    %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		    %ymm15, (key_table)(CTX), (%rax), 1 * 32(%rax));

	FRAME_END
	ret;

.align 8
.Ldec_max32:
	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 24);

	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      %ymm15,
	      ((key_table + (24) * 8) + 8)(CTX),
	      ((key_table + (24) * 8) + 12)(CTX),
	      ((key_table + (24) * 8) + 0)(CTX),
	      ((key_table + (24) * 8) + 4)(CTX));

	jmp .Ldec_max24;
SYM_FUNC_END(__camellia_dec_blk32)

SYM_FUNC_START(camellia_ecb_enc_32way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (32 blocks)
	 *	%rdx: src (32 blocks)
	 */
	FRAME_BEGIN

	vzeroupper;

	inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx, (key_table)(CTX));

	/* now dst can be used as temporary buffer (even in src == dst case) */
	movq %rsi, %rax;

	call __camellia_enc_blk32;

	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
		     %ymm8, %rsi);

	vzeroupper;

	FRAME_END
	ret;
SYM_FUNC_END(camellia_ecb_enc_32way)

SYM_FUNC_START(camellia_ecb_dec_32way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (32 blocks)
	 *	%rdx: src (32 blocks)
	 */
	FRAME_BEGIN

	vzeroupper;

	cmpl $16, key_length(CTX);
	movl $32, %r8d;
	movl $24, %eax;
	cmovel %eax, %r8d; /* max */

	inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx, (key_table)(CTX, %r8, 8));

	/* now dst can be used as temporary buffer (even in src == dst case) */
	movq %rsi, %rax;

	call __camellia_dec_blk32;

	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
		     %ymm8, %rsi);

	vzeroupper;

	FRAME_END
	ret;
SYM_FUNC_END(camellia_ecb_dec_32way)

SYM_FUNC_START(camellia_cbc_dec_32way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (32 blocks)
	 *	%rdx: src (32 blocks)
	 */
	FRAME_BEGIN
	subq $(16 * 32), %rsp;

	vzeroupper;

	cmpl $16, key_length(CTX);
	movl $32, %r8d;
	movl $24, %eax;
	cmovel %eax, %r8d; /* max */

	inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx, (key_table)(CTX, %r8, 8));

	cmpq %rsi, %rdx;
	je .Lcbc_dec_use_stack;

	/* dst can be used as temporary storage, src is not overwritten. */
	movq %rsi, %rax;
	jmp .Lcbc_dec_continue;

.Lcbc_dec_use_stack:
	/*
	 * dst still in-use (because dst == src), so use stack for temporary
	 * storage.
	 */
	movq %rsp, %rax;

.Lcbc_dec_continue:
	call __camellia_dec_blk32;

	vmovdqu %ymm7, (%rax);
	vpxor %ymm7, %ymm7, %ymm7;
	vinserti128 $1, (%rdx), %ymm7, %ymm7;
	vpxor (%rax), %ymm7, %ymm7;
	vpxor (0 * 32 + 16)(%rdx), %ymm6, %ymm6;
	vpxor (1 * 32 + 16)(%rdx), %ymm5, %ymm5;
	vpxor (2 * 32 + 16)(%rdx), %ymm4, %ymm4;
	vpxor (3 * 32 + 16)(%rdx), %ymm3, %ymm3;
	vpxor (4 * 32 + 16)(%rdx), %ymm2, %ymm2;
	vpxor (5 * 32 + 16)(%rdx), %ymm1, %ymm1;
	vpxor (6 * 32 + 16)(%rdx), %ymm0, %ymm0;
	vpxor (7 * 32 + 16)(%rdx), %ymm15, %ymm15;
	vpxor (8 * 32 + 16)(%rdx), %ymm14, %ymm14;
	vpxor (9 * 32 + 16)(%rdx), %ymm13, %ymm13;
	vpxor (10 * 32 + 16)(%rdx), %ymm12, %ymm12;
	vpxor (11 * 32 + 16)(%rdx), %ymm11, %ymm11;
	vpxor (12 * 32 + 16)(%rdx), %ymm10, %ymm10;
	vpxor (13 * 32 + 16)(%rdx), %ymm9, %ymm9;
	vpxor (14 * 32 + 16)(%rdx), %ymm8, %ymm8;
	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
		     %ymm8, %rsi);

	vzeroupper;

	addq $(16 * 32), %rsp;
	FRAME_END
	ret;
SYM_FUNC_END(camellia_cbc_dec_32way)