/*
 * x86_64/AVX/AES-NI assembler implementation of Camellia
 *
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

/*
 * Version licensed under 2-clause BSD License is available at:
 *	http://koti.mbnet.fi/axh/crypto/camellia-BSD-1.2.0-aesni1.tar.xz
 */

#include <linux/linkage.h>
#include <asm/frame.h>

#define CAMELLIA_TABLE_BYTE_LEN 272

/* struct camellia_ctx: */
#define key_table 0
#define key_length CAMELLIA_TABLE_BYTE_LEN

/* register macros */
#define CTX %rdi

/**********************************************************************
  16-way camellia
 **********************************************************************/
#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
	vpand x, mask4bit, tmp0; \
	vpandn x, mask4bit, x; \
	vpsrld $4, x, x; \
	\
	vpshufb tmp0, lo_t, tmp0; \
	vpshufb x, hi_t, x; \
	vpxor tmp0, x, x;
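
/*
 * filter_8bit() performs an 8-bit table lookup as two 4-bit vpshufb
 * lookups: mask4bit (0x0f in every byte) splits each byte of x into its
 * low and high nibble, each nibble indexes a 16-entry table (lo_t resp.
 * hi_t), and the two results are XOR-combined.  The tables below are
 * chosen so that this composes the affine pre/post transforms mapping
 * Camellia's s-boxes onto the AES SubBytes core.
 */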

/*
 * IN:
 *   x0..x7: byte-sliced AB state
 *   mem_cd: register pointer to CD in-state
 *   key: address of the 64-bit round subkey
 * OUT:
 *   x0..x7: new byte-sliced CD state
 */
#define roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
		  t7, mem_cd, key) \
	/* \
	 * S-function with AES subbytes \
	 */ \
	vmovdqa .Linv_shift_row, t4; \
	vbroadcastss .L0f0f0f0f, t7; \
	vmovdqa .Lpre_tf_lo_s1, t0; \
	vmovdqa .Lpre_tf_hi_s1, t1; \
	\
	/* AES inverse shift rows */ \
	vpshufb t4, x0, x0; \
	vpshufb t4, x7, x7; \
	vpshufb t4, x1, x1; \
	vpshufb t4, x4, x4; \
	vpshufb t4, x2, x2; \
	vpshufb t4, x5, x5; \
	vpshufb t4, x3, x3; \
	vpshufb t4, x6, x6; \
	\
	/* prefilter sboxes 1, 2 and 3 */ \
	vmovdqa .Lpre_tf_lo_s4, t2; \
	vmovdqa .Lpre_tf_hi_s4, t3; \
	filter_8bit(x0, t0, t1, t7, t6); \
	filter_8bit(x7, t0, t1, t7, t6); \
	filter_8bit(x1, t0, t1, t7, t6); \
	filter_8bit(x4, t0, t1, t7, t6); \
	filter_8bit(x2, t0, t1, t7, t6); \
	filter_8bit(x5, t0, t1, t7, t6); \
	\
	/* prefilter sbox 4 */ \
	vpxor t4, t4, t4; \
	filter_8bit(x3, t2, t3, t7, t6); \
	filter_8bit(x6, t2, t3, t7, t6); \
	\
	/* AES subbytes + AES shift rows */ \
	vmovdqa .Lpost_tf_lo_s1, t0; \
	vmovdqa .Lpost_tf_hi_s1, t1; \
	vaesenclast t4, x0, x0; \
	vaesenclast t4, x7, x7; \
	vaesenclast t4, x1, x1; \
	vaesenclast t4, x4, x4; \
	vaesenclast t4, x2, x2; \
	vaesenclast t4, x5, x5; \
	vaesenclast t4, x3, x3; \
	vaesenclast t4, x6, x6; \
	\
	/* postfilter sboxes 1 and 4 */ \
	vmovdqa .Lpost_tf_lo_s3, t2; \
	vmovdqa .Lpost_tf_hi_s3, t3; \
	filter_8bit(x0, t0, t1, t7, t6); \
	filter_8bit(x7, t0, t1, t7, t6); \
	filter_8bit(x3, t0, t1, t7, t6); \
	filter_8bit(x6, t0, t1, t7, t6); \
	\
	/* postfilter sbox 3 */ \
	vmovdqa .Lpost_tf_lo_s2, t4; \
	vmovdqa .Lpost_tf_hi_s2, t5; \
	filter_8bit(x2, t2, t3, t7, t6); \
	filter_8bit(x5, t2, t3, t7, t6); \
	\
	vpxor t6, t6, t6; \
	vmovq key, t0; \
	\
	/* postfilter sbox 2 */ \
	filter_8bit(x1, t4, t5, t7, t2); \
	filter_8bit(x4, t4, t5, t7, t2); \
	\
	vpsrldq $5, t0, t5; \
	vpsrldq $1, t0, t1; \
	vpsrldq $2, t0, t2; \
	vpsrldq $3, t0, t3; \
	vpsrldq $4, t0, t4; \
	vpshufb t6, t0, t0; \
	vpshufb t6, t1, t1; \
	vpshufb t6, t2, t2; \
	vpshufb t6, t3, t3; \
	vpshufb t6, t4, t4; \
	vpsrldq $2, t5, t7; \
	vpshufb t6, t7, t7; \
	\
	/* \
	 * P-function \
	 */ \
	vpxor x5, x0, x0; \
	vpxor x6, x1, x1; \
	vpxor x7, x2, x2; \
	vpxor x4, x3, x3; \
	\
	vpxor x2, x4, x4; \
	vpxor x3, x5, x5; \
	vpxor x0, x6, x6; \
	vpxor x1, x7, x7; \
	\
	vpxor x7, x0, x0; \
	vpxor x4, x1, x1; \
	vpxor x5, x2, x2; \
	vpxor x6, x3, x3; \
	\
	vpxor x3, x4, x4; \
	vpxor x0, x5, x5; \
	vpxor x1, x6, x6; \
	vpxor x2, x7, x7; \
	\
	/* \
	 * Add key material and result to CD (x becomes new CD) \
	 */ \
	\
	vpxor t3, x4, x4; \
	vpxor 0 * 16(mem_cd), x4, x4; \
	\
	vpxor t2, x5, x5; \
	vpxor 1 * 16(mem_cd), x5, x5; \
	\
	vpsrldq $1, t5, t3; \
	vpshufb t6, t5, t5; \
	vpshufb t6, t3, t6; \
	\
	vpxor t1, x6, x6; \
	vpxor 2 * 16(mem_cd), x6, x6; \
	\
	vpxor t0, x7, x7; \
	vpxor 3 * 16(mem_cd), x7, x7; \
	\
	vpxor t7, x0, x0; \
	vpxor 4 * 16(mem_cd), x0, x0; \
	\
	vpxor t6, x1, x1; \
	vpxor 5 * 16(mem_cd), x1, x1; \
	\
	vpxor t5, x2, x2; \
	vpxor 6 * 16(mem_cd), x2, x2; \
	\
	vpxor t4, x3, x3; \
	vpxor 7 * 16(mem_cd), x3, x3;
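
/*
 * The round above rides on AES-NI for the s-box layer: vaesenclast with
 * an all-zero round key reduces to ShiftRows + SubBytes, and the
 * .Linv_shift_row shuffle applied beforehand cancels the ShiftRows,
 * leaving a bare AES SubBytes.  The filter_8bit() pre/post transforms
 * then convert between Camellia's four s-boxes and that one AES s-box.
 */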

/*
 * roundsm16 is kept out of line: with it inlined the binary would be over
 * five times larger and only marginally faster.
 */
.align 8
SYM_FUNC_START_LOCAL(roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)
	roundsm16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		  %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
		  %rcx, (%r9));
	ret;
SYM_FUNC_END(roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)

.align 8
SYM_FUNC_START_LOCAL(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
	roundsm16(%xmm4, %xmm5, %xmm6, %xmm7, %xmm0, %xmm1, %xmm2, %xmm3,
		  %xmm12, %xmm13, %xmm14, %xmm15, %xmm8, %xmm9, %xmm10, %xmm11,
		  %rax, (%r9));
	ret;
SYM_FUNC_END(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)

/*
 * IN/OUT:
 *  x0..x7: byte-sliced AB state preloaded
 *  mem_ab: byte-sliced AB state in memory
 *  mem_cd: byte-sliced CD state in memory
 */
#define two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
	leaq (key_table + (i) * 8)(CTX), %r9; \
	call roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \
	\
	vmovdqu x4, 0 * 16(mem_cd); \
	vmovdqu x5, 1 * 16(mem_cd); \
	vmovdqu x6, 2 * 16(mem_cd); \
	vmovdqu x7, 3 * 16(mem_cd); \
	vmovdqu x0, 4 * 16(mem_cd); \
	vmovdqu x1, 5 * 16(mem_cd); \
	vmovdqu x2, 6 * 16(mem_cd); \
	vmovdqu x3, 7 * 16(mem_cd); \
	\
	leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \
	call roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \
	\
	store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);
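
/*
 * dir is +1 when encrypting and -1 when decrypting: the first half-round
 * uses subkey slot i and the second uses i + dir, so the same macro walks
 * the key table forwards for encryption and backwards for decryption.
 */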

#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab)

#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
	/* Store new AB state */ \
	vmovdqu x0, 0 * 16(mem_ab); \
	vmovdqu x1, 1 * 16(mem_ab); \
	vmovdqu x2, 2 * 16(mem_ab); \
	vmovdqu x3, 3 * 16(mem_ab); \
	vmovdqu x4, 4 * 16(mem_ab); \
	vmovdqu x5, 5 * 16(mem_ab); \
	vmovdqu x6, 6 * 16(mem_ab); \
	vmovdqu x7, 7 * 16(mem_ab);

#define enc_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, mem_ab, mem_cd, i) \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);

#define dec_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, mem_ab, mem_cd, i) \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);

/*
 * IN:
 *  v0..3: byte-sliced 32-bit integers
 * OUT:
 *  v0..3: (IN <<< 1)
 */
#define rol32_1_16(v0, v1, v2, v3, t0, t1, t2, zero) \
	vpcmpgtb v0, zero, t0; \
	vpaddb v0, v0, v0; \
	vpabsb t0, t0; \
	\
	vpcmpgtb v1, zero, t1; \
	vpaddb v1, v1, v1; \
	vpabsb t1, t1; \
	\
	vpcmpgtb v2, zero, t2; \
	vpaddb v2, v2, v2; \
	vpabsb t2, t2; \
	\
	vpor t0, v1, v1; \
	\
	vpcmpgtb v3, zero, t0; \
	vpaddb v3, v3, v3; \
	vpabsb t0, t0; \
	\
	vpor t1, v2, v2; \
	vpor t2, v3, v3; \
	vpor t0, v0, v0;
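
/*
 * With the 32-bit words byte-sliced across four registers, the 1-bit
 * rotate is done per plane: vpcmpgtb against zero yields 0xff wherever a
 * byte's MSB is set (signed compare), vpaddb doubles every byte (logical
 * shift left by one), and vpabsb turns the 0xff carry mask into 0x01.
 * The final vpor steps feed each plane's saved MSBs into the neighbouring
 * plane's LSBs, cyclically, completing the 32-bit rotate.
 */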

/*
 * IN:
 *  l, r: pointers to the byte-sliced left/right halves in memory
 *  kll, klr, krl, krr: 32-bit halves of the FL/FL^-1 subkeys
 * OUT:
 *  l, r: both halves updated in memory
 */
#define fls16(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
	      tt1, tt2, tt3, kll, klr, krl, krr) \
	/* \
	 * t0 = kll; \
	 * t0 &= ll; \
	 * lr ^= rol32(t0, 1); \
	 */ \
	vpxor tt0, tt0, tt0; \
	vmovd kll, t0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpand l0, t0, t0; \
	vpand l1, t1, t1; \
	vpand l2, t2, t2; \
	vpand l3, t3, t3; \
	\
	rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
	\
	vpxor l4, t0, l4; \
	vmovdqu l4, 4 * 16(l); \
	vpxor l5, t1, l5; \
	vmovdqu l5, 5 * 16(l); \
	vpxor l6, t2, l6; \
	vmovdqu l6, 6 * 16(l); \
	vpxor l7, t3, l7; \
	vmovdqu l7, 7 * 16(l); \
	\
	/* \
	 * t2 = krr; \
	 * t2 |= rr; \
	 * rl ^= t2; \
	 */ \
	\
	vmovd krr, t0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpor 4 * 16(r), t0, t0; \
	vpor 5 * 16(r), t1, t1; \
	vpor 6 * 16(r), t2, t2; \
	vpor 7 * 16(r), t3, t3; \
	\
	vpxor 0 * 16(r), t0, t0; \
	vpxor 1 * 16(r), t1, t1; \
	vpxor 2 * 16(r), t2, t2; \
	vpxor 3 * 16(r), t3, t3; \
	vmovdqu t0, 0 * 16(r); \
	vmovdqu t1, 1 * 16(r); \
	vmovdqu t2, 2 * 16(r); \
	vmovdqu t3, 3 * 16(r); \
	\
	/* \
	 * t2 = krl; \
	 * t2 &= rl; \
	 * rr ^= rol32(t2, 1); \
	 */ \
	vmovd krl, t0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpand 0 * 16(r), t0, t0; \
	vpand 1 * 16(r), t1, t1; \
	vpand 2 * 16(r), t2, t2; \
	vpand 3 * 16(r), t3, t3; \
	\
	rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
	\
	vpxor 4 * 16(r), t0, t0; \
	vpxor 5 * 16(r), t1, t1; \
	vpxor 6 * 16(r), t2, t2; \
	vpxor 7 * 16(r), t3, t3; \
	vmovdqu t0, 4 * 16(r); \
	vmovdqu t1, 5 * 16(r); \
	vmovdqu t2, 6 * 16(r); \
	vmovdqu t3, 7 * 16(r); \
	\
	/* \
	 * t0 = klr; \
	 * t0 |= lr; \
	 * ll ^= t0; \
	 */ \
	\
	vmovd klr, t0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpor l4, t0, t0; \
	vpor l5, t1, t1; \
	vpor l6, t2, t2; \
	vpor l7, t3, t3; \
	\
	vpxor l0, t0, l0; \
	vmovdqu l0, 0 * 16(l); \
	vpxor l1, t1, l1; \
	vmovdqu l1, 1 * 16(l); \
	vpxor l2, t2, l2; \
	vmovdqu l2, 2 * 16(l); \
	vpxor l3, t3, l3; \
	vmovdqu l3, 3 * 16(l);
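
/*
 * fls16 is Camellia's FL/FL^-1 layer on byte-sliced state.  Each of the
 * four blocks above is one FL half-step, spelled out in the C-style
 * pseudocode comments: the 32-bit subkey half is loaded with vmovd,
 * split into four single-byte broadcasts (vpsrldq plus vpshufb with an
 * all-zero index), then combined with the state planes using AND/OR,
 * rol32_1_16() and XOR.
 */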

#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x1, x0, x0; \
	\
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x2; \
	\
	vpunpckhqdq t1, x0, x1; \
	vpunpcklqdq t1, x0, x0; \
	\
	vpunpckhqdq x2, t2, x3; \
	vpunpcklqdq x2, t2, x2;
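
/*
 * transpose_4x4 is the standard 4x4 transpose of 32-bit elements: two
 * levels of interleave (dword unpack, then qword unpack) swap rows and
 * columns across the four registers, with t1/t2 as scratch.
 */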

#define byteslice_16x16b(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, \
			 b3, c3, d3, st0, st1) \
	vmovdqu d2, st0; \
	vmovdqu d3, st1; \
	transpose_4x4(a0, a1, a2, a3, d2, d3); \
	transpose_4x4(b0, b1, b2, b3, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu a0, st0; \
	vmovdqu a1, st1; \
	transpose_4x4(c0, c1, c2, c3, a0, a1); \
	transpose_4x4(d0, d1, d2, d3, a0, a1); \
	\
	vmovdqu .Lshufb_16x16b, a0; \
	vmovdqu st1, a1; \
	vpshufb a0, a2, a2; \
	vpshufb a0, a3, a3; \
	vpshufb a0, b0, b0; \
	vpshufb a0, b1, b1; \
	vpshufb a0, b2, b2; \
	vpshufb a0, b3, b3; \
	vpshufb a0, a1, a1; \
	vpshufb a0, c0, c0; \
	vpshufb a0, c1, c1; \
	vpshufb a0, c2, c2; \
	vpshufb a0, c3, c3; \
	vpshufb a0, d0, d0; \
	vpshufb a0, d1, d1; \
	vpshufb a0, d2, d2; \
	vpshufb a0, d3, d3; \
	vmovdqu d3, st1; \
	vmovdqu st0, d3; \
	vpshufb a0, d3, a0; \
	vmovdqu d2, st0; \
	\
	transpose_4x4(a0, b0, c0, d0, d2, d3); \
	transpose_4x4(a1, b1, c1, d1, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu b0, st0; \
	vmovdqu b1, st1; \
	transpose_4x4(a2, b2, c2, d2, b0, b1); \
	transpose_4x4(a3, b3, c3, d3, b0, b1); \
	vmovdqu st0, b0; \
	vmovdqu st1, b1; \
	/* does not adjust output bytes inside vectors */
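
/*
 * byteslice_16x16b transposes a 16x16 byte matrix: on input each of the
 * sixteen registers holds one 16-byte block; on output register n holds
 * byte n of all sixteen blocks.  This layout lets every instruction in
 * the round function operate on the same byte position of 16 blocks at
 * once.
 */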

/* load blocks to registers and apply pre-whitening */
#define inpack16_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, rio, key) \
	vmovq key, x0; \
	vpshufb .Lpack_bswap, x0, x0; \
	\
	vpxor 0 * 16(rio), x0, y7; \
	vpxor 1 * 16(rio), x0, y6; \
	vpxor 2 * 16(rio), x0, y5; \
	vpxor 3 * 16(rio), x0, y4; \
	vpxor 4 * 16(rio), x0, y3; \
	vpxor 5 * 16(rio), x0, y2; \
	vpxor 6 * 16(rio), x0, y1; \
	vpxor 7 * 16(rio), x0, y0; \
	vpxor 8 * 16(rio), x0, x7; \
	vpxor 9 * 16(rio), x0, x6; \
	vpxor 10 * 16(rio), x0, x5; \
	vpxor 11 * 16(rio), x0, x4; \
	vpxor 12 * 16(rio), x0, x3; \
	vpxor 13 * 16(rio), x0, x2; \
	vpxor 14 * 16(rio), x0, x1; \
	vpxor 15 * 16(rio), x0, x0;
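
/*
 * The whitening key is a 64-bit value: vmovq leaves the upper half of x0
 * zero, and .Lpack_bswap rearranges the loaded bytes (selector bytes with
 * the top bit set produce zero in vpshufb), so one 16-byte pattern can be
 * XORed into all sixteen blocks before they are byte-sliced.
 */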

/* byteslice pre-whitened blocks and store to temporary memory */
#define inpack16_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd) \
	byteslice_16x16b(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
			 y5, y6, y7, (mem_ab), (mem_cd)); \
	\
	vmovdqu x0, 0 * 16(mem_ab); \
	vmovdqu x1, 1 * 16(mem_ab); \
	vmovdqu x2, 2 * 16(mem_ab); \
	vmovdqu x3, 3 * 16(mem_ab); \
	vmovdqu x4, 4 * 16(mem_ab); \
	vmovdqu x5, 5 * 16(mem_ab); \
	vmovdqu x6, 6 * 16(mem_ab); \
	vmovdqu x7, 7 * 16(mem_ab); \
	vmovdqu y0, 0 * 16(mem_cd); \
	vmovdqu y1, 1 * 16(mem_cd); \
	vmovdqu y2, 2 * 16(mem_cd); \
	vmovdqu y3, 3 * 16(mem_cd); \
	vmovdqu y4, 4 * 16(mem_cd); \
	vmovdqu y5, 5 * 16(mem_cd); \
	vmovdqu y6, 6 * 16(mem_cd); \
	vmovdqu y7, 7 * 16(mem_cd);

/* de-byteslice, apply post-whitening and store blocks */
#define outunpack16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
		    y5, y6, y7, key, stack_tmp0, stack_tmp1) \
	byteslice_16x16b(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, y3, \
			 y7, x3, x7, stack_tmp0, stack_tmp1); \
	\
	vmovdqu x0, stack_tmp0; \
	\
	vmovq key, x0; \
	vpshufb .Lpack_bswap, x0, x0; \
	\
	vpxor x0, y7, y7; \
	vpxor x0, y6, y6; \
	vpxor x0, y5, y5; \
	vpxor x0, y4, y4; \
	vpxor x0, y3, y3; \
	vpxor x0, y2, y2; \
	vpxor x0, y1, y1; \
	vpxor x0, y0, y0; \
	vpxor x0, x7, x7; \
	vpxor x0, x6, x6; \
	vpxor x0, x5, x5; \
	vpxor x0, x4, x4; \
	vpxor x0, x3, x3; \
	vpxor x0, x2, x2; \
	vpxor x0, x1, x1; \
	vpxor stack_tmp0, x0, x0;
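
/*
 * x0 is spilled to stack_tmp0 so the register can hold the post-whitening
 * key while the other fifteen registers are XORed; the final vpxor folds
 * the key into the spilled block as it is reloaded.
 */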

#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, rio) \
	vmovdqu x0, 0 * 16(rio); \
	vmovdqu x1, 1 * 16(rio); \
	vmovdqu x2, 2 * 16(rio); \
	vmovdqu x3, 3 * 16(rio); \
	vmovdqu x4, 4 * 16(rio); \
	vmovdqu x5, 5 * 16(rio); \
	vmovdqu x6, 6 * 16(rio); \
	vmovdqu x7, 7 * 16(rio); \
	vmovdqu y0, 8 * 16(rio); \
	vmovdqu y1, 9 * 16(rio); \
	vmovdqu y2, 10 * 16(rio); \
	vmovdqu y3, 11 * 16(rio); \
	vmovdqu y4, 12 * 16(rio); \
	vmovdqu y5, 13 * 16(rio); \
	vmovdqu y6, 14 * 16(rio); \
	vmovdqu y7, 15 * 16(rio);

.section	.rodata.cst16, "aM", @progbits, 16
.align 16

#define SHUFB_BYTES(idx) \
	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)

.Lshufb_16x16b:
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);

.Lpack_bswap:
	.long 0x00010203
	.long 0x04050607
	.long 0x80808080
	.long 0x80808080
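
/*
 * In vpshufb an index byte with its top bit set selects zero instead of a
 * source byte, so the two 0x80808080 words zero the upper half of the
 * whitening pattern, while 0x00010203/0x04050607 byte-swap each 32-bit
 * half of the key loaded by vmovq.
 */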

/*
 * pre-SubByte transform
 *
 * pre-lookup for sbox1, sbox2, sbox3:
 *   swap_bitendianness(
 *       isom_map_camellia_to_aes(
 *           camellia_f(
 *               swap_bitendianness(in)
 *           )
 *       )
 *   )
 *
 * (note: '⊕ 0xc5' inside camellia_f())
 */
.Lpre_tf_lo_s1:
	.byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
	.byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
.Lpre_tf_hi_s1:
	.byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
	.byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23

/*
 * pre-SubByte transform
 *
 * pre-lookup for sbox4:
 *   swap_bitendianness(
 *       isom_map_camellia_to_aes(
 *           camellia_f(
 *               swap_bitendianness(in <<< 1)
 *           )
 *       )
 *   )
 *
 * (note: '⊕ 0xc5' inside camellia_f())
 */
.Lpre_tf_lo_s4:
	.byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
	.byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
.Lpre_tf_hi_s4:
	.byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
	.byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf

/*
 * post-SubByte transform
 *
 * post-lookup for sbox1, sbox4:
 *  swap_bitendianness(
 *      camellia_h(
 *          isom_map_aes_to_camellia(
 *              swap_bitendianness(
 *                  aes_inverse_affine_transform(in)
 *              )
 *          )
 *      )
 *  )
 *
 * (note: '⊕ 0x6e' inside camellia_h())
 */
.Lpost_tf_lo_s1:
	.byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
	.byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
.Lpost_tf_hi_s1:
	.byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
	.byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c

/*
 * post-SubByte transform
 *
 * post-lookup for sbox2:
 *  swap_bitendianness(
 *      camellia_h(
 *          isom_map_aes_to_camellia(
 *              swap_bitendianness(
 *                  aes_inverse_affine_transform(in)
 *              )
 *          )
 *      )
 *  ) <<< 1
 *
 * (note: '⊕ 0x6e' inside camellia_h())
 */
.Lpost_tf_lo_s2:
	.byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
	.byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
.Lpost_tf_hi_s2:
	.byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
	.byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18

/*
 * post-SubByte transform
 *
 * post-lookup for sbox3:
 *  swap_bitendianness(
 *      camellia_h(
 *          isom_map_aes_to_camellia(
 *              swap_bitendianness(
 *                  aes_inverse_affine_transform(in)
 *              )
 *          )
 *      )
 *  ) >>> 1
 *
 * (note: '⊕ 0x6e' inside camellia_h())
 */
.Lpost_tf_lo_s3:
	.byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
	.byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
.Lpost_tf_hi_s3:
	.byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
	.byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06

/* For isolating SubBytes from AESENCLAST, inverse shift row */
.Linv_shift_row:
	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
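
/*
 * AESENCLAST computes ShiftRows, SubBytes and the round-key XOR.  With a
 * zero round key, shuffling the input through this inverse-ShiftRows
 * permutation first leaves exactly one AES SubBytes as the net effect.
 */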

/* 4-bit mask */
.section	.rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
.align 4
.L0f0f0f0f:
	.long 0x0f0f0f0f

.text

.align 8
SYM_FUNC_START_LOCAL(__camellia_enc_blk16)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rax: temporary storage, 256 bytes
	 *	%xmm0..%xmm15: 16 plaintext blocks
	 * output:
	 *	%xmm0..%xmm15: 16 encrypted blocks, order swapped:
	 *	 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
	 */
	FRAME_BEGIN

	leaq 8 * 16(%rax), %rcx;

	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		      %xmm15, %rax, %rcx);

	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 0);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (8) * 8) + 0)(CTX),
	      ((key_table + (8) * 8) + 4)(CTX),
	      ((key_table + (8) * 8) + 8)(CTX),
	      ((key_table + (8) * 8) + 12)(CTX));

	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 8);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (16) * 8) + 0)(CTX),
	      ((key_table + (16) * 8) + 4)(CTX),
	      ((key_table + (16) * 8) + 8)(CTX),
	      ((key_table + (16) * 8) + 12)(CTX));

	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 16);

	movl $24, %r8d;
	cmpl $16, key_length(CTX);
	jne .Lenc_max32;
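
	/*
	 * %r8d selects the final whitening subkey for outunpack16 below:
	 * index 24 after 18 rounds for 128-bit keys, index 32 after 24
	 * rounds for 192/256-bit keys (.Lenc_max32 path).
	 */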

.Lenc_done:
	/* load CD for output */
	vmovdqu 0 * 16(%rcx), %xmm8;
	vmovdqu 1 * 16(%rcx), %xmm9;
	vmovdqu 2 * 16(%rcx), %xmm10;
	vmovdqu 3 * 16(%rcx), %xmm11;
	vmovdqu 4 * 16(%rcx), %xmm12;
	vmovdqu 5 * 16(%rcx), %xmm13;
	vmovdqu 6 * 16(%rcx), %xmm14;
	vmovdqu 7 * 16(%rcx), %xmm15;

	outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		    %xmm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 16(%rax));

	FRAME_END
	ret;

.align 8
.Lenc_max32:
	movl $32, %r8d;

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (24) * 8) + 0)(CTX),
	      ((key_table + (24) * 8) + 4)(CTX),
	      ((key_table + (24) * 8) + 8)(CTX),
	      ((key_table + (24) * 8) + 12)(CTX));

	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 24);

	jmp .Lenc_done;
SYM_FUNC_END(__camellia_enc_blk16)

.align 8
SYM_FUNC_START_LOCAL(__camellia_dec_blk16)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rax: temporary storage, 256 bytes
	 *	%r8d: 24 for 16 byte key, 32 for larger
	 *	%xmm0..%xmm15: 16 encrypted blocks
	 * output:
	 *	%xmm0..%xmm15: 16 plaintext blocks, order swapped:
	 *	 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
	 */
	FRAME_BEGIN

	leaq 8 * 16(%rax), %rcx;

	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		      %xmm15, %rax, %rcx);

	cmpl $32, %r8d;
	je .Ldec_max32;

.Ldec_max24:
	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 16);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (16) * 8) + 8)(CTX),
	      ((key_table + (16) * 8) + 12)(CTX),
	      ((key_table + (16) * 8) + 0)(CTX),
	      ((key_table + (16) * 8) + 4)(CTX));
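
	/*
	 * Relative to the encryption path the kl and kr subkey halves are
	 * passed to fls16 swapped; running the key schedule backwards
	 * exchanges the roles of FL and FL^-1 between the two halves.
	 */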

	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 8);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (8) * 8) + 8)(CTX),
	      ((key_table + (8) * 8) + 12)(CTX),
	      ((key_table + (8) * 8) + 0)(CTX),
	      ((key_table + (8) * 8) + 4)(CTX));

	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 0);

	/* load CD for output */
	vmovdqu 0 * 16(%rcx), %xmm8;
	vmovdqu 1 * 16(%rcx), %xmm9;
	vmovdqu 2 * 16(%rcx), %xmm10;
	vmovdqu 3 * 16(%rcx), %xmm11;
	vmovdqu 4 * 16(%rcx), %xmm12;
	vmovdqu 5 * 16(%rcx), %xmm13;
	vmovdqu 6 * 16(%rcx), %xmm14;
	vmovdqu 7 * 16(%rcx), %xmm15;

	outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		    %xmm15, (key_table)(CTX), (%rax), 1 * 16(%rax));

	FRAME_END
	ret;

.align 8
.Ldec_max32:
	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 24);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (24) * 8) + 8)(CTX),
	      ((key_table + (24) * 8) + 12)(CTX),
	      ((key_table + (24) * 8) + 0)(CTX),
	      ((key_table + (24) * 8) + 4)(CTX));

	jmp .Ldec_max24;
SYM_FUNC_END(__camellia_dec_blk16)

SYM_FUNC_START(camellia_ecb_enc_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 */
	FRAME_BEGIN

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx, (key_table)(CTX));

	/* now dst can be used as temporary buffer (even in src == dst case) */
	movq	%rsi, %rax;

	call __camellia_enc_blk16;

	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
		     %xmm8, %rsi);

	FRAME_END
	ret;
SYM_FUNC_END(camellia_ecb_enc_16way)

SYM_FUNC_START(camellia_ecb_dec_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 */
	FRAME_BEGIN

	cmpl $16, key_length(CTX);
	movl $32, %r8d;
	movl $24, %eax;
	cmovel %eax, %r8d; /* max */

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx, (key_table)(CTX, %r8, 8));

	/* now dst can be used as temporary buffer (even in src == dst case) */
	movq	%rsi, %rax;

	call __camellia_dec_blk16;

	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
		     %xmm8, %rsi);

	FRAME_END
	ret;
SYM_FUNC_END(camellia_ecb_dec_16way)

SYM_FUNC_START(camellia_cbc_dec_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 */
	FRAME_BEGIN

	cmpl $16, key_length(CTX);
	movl $32, %r8d;
	movl $24, %eax;
	cmovel %eax, %r8d; /* max */

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx, (key_table)(CTX, %r8, 8));

	/*
	 * dst might still be in-use (in case dst == src), so use stack for
	 * temporary storage.
	 */
	subq $(16 * 16), %rsp;
	movq %rsp, %rax;

	call __camellia_dec_blk16;

	addq $(16 * 16), %rsp;
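
	/*
	 * CBC chaining: each decrypted block i is XORed with ciphertext
	 * block i - 1, still available at %rdx.  The first block (%xmm7) is
	 * written unchanged; its IV XOR is left to the caller, which still
	 * holds the IV.
	 */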
	vpxor (0 * 16)(%rdx), %xmm6, %xmm6;
	vpxor (1 * 16)(%rdx), %xmm5, %xmm5;
	vpxor (2 * 16)(%rdx), %xmm4, %xmm4;
	vpxor (3 * 16)(%rdx), %xmm3, %xmm3;
	vpxor (4 * 16)(%rdx), %xmm2, %xmm2;
	vpxor (5 * 16)(%rdx), %xmm1, %xmm1;
	vpxor (6 * 16)(%rdx), %xmm0, %xmm0;
	vpxor (7 * 16)(%rdx), %xmm15, %xmm15;
	vpxor (8 * 16)(%rdx), %xmm14, %xmm14;
	vpxor (9 * 16)(%rdx), %xmm13, %xmm13;
	vpxor (10 * 16)(%rdx), %xmm12, %xmm12;
	vpxor (11 * 16)(%rdx), %xmm11, %xmm11;
	vpxor (12 * 16)(%rdx), %xmm10, %xmm10;
	vpxor (13 * 16)(%rdx), %xmm9, %xmm9;
	vpxor (14 * 16)(%rdx), %xmm8, %xmm8;
	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
		     %xmm8, %rsi);

	FRAME_END
	ret;
SYM_FUNC_END(camellia_cbc_dec_16way)