/*
 * x86_64/AVX/AES-NI assembler implementation of Camellia
 */
#include <linux/linkage.h>

#define CAMELLIA_TABLE_BYTE_LEN 272

/* struct camellia_ctx field offsets */
#define key_table 0
#define key_length CAMELLIA_TABLE_BYTE_LEN

/* register macros */
#define CTX %rdi

/**********************************************************************
  16-way camellia
 **********************************************************************/
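
/*
 * filter_8bit() applies an 8-bit affine transform to every byte of x by
 * splitting each byte into nibbles and combining two vpshufb lookups:
 * x = lo_t[x & 0x0f] ^ hi_t[x >> 4].  mask4bit must hold 0x0f in every byte;
 * tmp0 is clobbered.  The round function below uses such filters to map the
 * Camellia s-boxes onto the AES S-box core provided by vaesenclast.
 */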
#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
	vpand x, mask4bit, tmp0; \
	vpandn x, mask4bit, x; \
	vpsrld $4, x, x; \
	\
	vpshufb tmp0, lo_t, tmp0; \
	vpshufb x, hi_t, x; \
	vpxor tmp0, x, x;

/*
 * roundsm16: one Camellia round on 16 byte-sliced blocks.
 * IN:
 *   x0..x7: byte-sliced AB state
 *   mem_cd: register pointing to the byte-sliced CD state in memory
 *   key:    address of the 64-bit round subkey
 * OUT:
 *   x0..x7: new byte-sliced CD state
 */
#define roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
		  t7, mem_cd, key) \
	/* \
	 * S-function with AES subbytes \
	 */ \
	vmovdqa .Linv_shift_row, t4; \
	vbroadcastss .L0f0f0f0f, t7; \
	vmovdqa .Lpre_tf_lo_s1, t0; \
	vmovdqa .Lpre_tf_hi_s1, t1; \
	\
	/* AES inverse shift rows */ \
	vpshufb t4, x0, x0; \
	vpshufb t4, x7, x7; \
	vpshufb t4, x1, x1; \
	vpshufb t4, x4, x4; \
	vpshufb t4, x2, x2; \
	vpshufb t4, x5, x5; \
	vpshufb t4, x3, x3; \
	vpshufb t4, x6, x6; \
	\
	/* prefilter sboxes 1, 2 and 3 */ \
	vmovdqa .Lpre_tf_lo_s4, t2; \
	vmovdqa .Lpre_tf_hi_s4, t3; \
	filter_8bit(x0, t0, t1, t7, t6); \
	filter_8bit(x7, t0, t1, t7, t6); \
	filter_8bit(x1, t0, t1, t7, t6); \
	filter_8bit(x4, t0, t1, t7, t6); \
	filter_8bit(x2, t0, t1, t7, t6); \
	filter_8bit(x5, t0, t1, t7, t6); \
	\
	/* prefilter sbox 4 */ \
	vpxor t4, t4, t4; \
	filter_8bit(x3, t2, t3, t7, t6); \
	filter_8bit(x6, t2, t3, t7, t6); \
	\
	/* AES subbytes + AES shift rows */ \
	vmovdqa .Lpost_tf_lo_s1, t0; \
	vmovdqa .Lpost_tf_hi_s1, t1; \
	vaesenclast t4, x0, x0; \
	vaesenclast t4, x7, x7; \
	vaesenclast t4, x1, x1; \
	vaesenclast t4, x4, x4; \
	vaesenclast t4, x2, x2; \
	vaesenclast t4, x5, x5; \
	vaesenclast t4, x3, x3; \
	vaesenclast t4, x6, x6; \
	\
	/* postfilter sboxes 1 and 4 */ \
	vmovdqa .Lpost_tf_lo_s3, t2; \
	vmovdqa .Lpost_tf_hi_s3, t3; \
	filter_8bit(x0, t0, t1, t7, t6); \
	filter_8bit(x7, t0, t1, t7, t6); \
	filter_8bit(x3, t0, t1, t7, t6); \
	filter_8bit(x6, t0, t1, t7, t6); \
	\
	/* postfilter sbox 3 */ \
	vmovdqa .Lpost_tf_lo_s2, t4; \
	vmovdqa .Lpost_tf_hi_s2, t5; \
	filter_8bit(x2, t2, t3, t7, t6); \
	filter_8bit(x5, t2, t3, t7, t6); \
	\
	vpxor t6, t6, t6; \
	vmovq key, t0; \
	\
	/* postfilter sbox 2 */ \
	filter_8bit(x1, t4, t5, t7, t2); \
	filter_8bit(x4, t4, t5, t7, t2); \
	\
	/* broadcast each byte of the 64-bit subkey into its own register */ \
	vpsrldq $5, t0, t5; \
	vpsrldq $1, t0, t1; \
	vpsrldq $2, t0, t2; \
	vpsrldq $3, t0, t3; \
	vpsrldq $4, t0, t4; \
	vpshufb t6, t0, t0; \
	vpshufb t6, t1, t1; \
	vpshufb t6, t2, t2; \
	vpshufb t6, t3, t3; \
	vpshufb t6, t4, t4; \
	vpsrldq $2, t5, t7; \
	vpshufb t6, t7, t7; \
	\
	/* \
	 * P-function \
	 */ \
	vpxor x5, x0, x0; \
	vpxor x6, x1, x1; \
	vpxor x7, x2, x2; \
	vpxor x4, x3, x3; \
	\
	vpxor x2, x4, x4; \
	vpxor x3, x5, x5; \
	vpxor x0, x6, x6; \
	vpxor x1, x7, x7; \
	\
	vpxor x7, x0, x0; \
	vpxor x4, x1, x1; \
	vpxor x5, x2, x2; \
	vpxor x6, x3, x3; \
	\
	vpxor x3, x4, x4; \
	vpxor x0, x5, x5; \
	vpxor x1, x6, x6; \
	vpxor x2, x7, x7; \
	\
	/* \
	 * Add key material and result to CD (x becomes new CD) \
	 */ \
	\
	vpxor t3, x4, x4; \
	vpxor 0 * 16(mem_cd), x4, x4; \
	\
	vpxor t2, x5, x5; \
	vpxor 1 * 16(mem_cd), x5, x5; \
	\
	vpsrldq $1, t5, t3; \
	vpshufb t6, t5, t5; \
	vpshufb t6, t3, t6; \
	\
	vpxor t1, x6, x6; \
	vpxor 2 * 16(mem_cd), x6, x6; \
	\
	vpxor t0, x7, x7; \
	vpxor 3 * 16(mem_cd), x7, x7; \
	\
	vpxor t7, x0, x0; \
	vpxor 4 * 16(mem_cd), x0, x0; \
	\
	vpxor t6, x1, x1; \
	vpxor 5 * 16(mem_cd), x1, x1; \
	\
	vpxor t5, x2, x2; \
	vpxor 6 * 16(mem_cd), x2, x2; \
	\
	vpxor t4, x3, x3; \
	vpxor 7 * 16(mem_cd), x3, x3;

/*
 * Out-of-line round helpers: the round-key address is passed in %r9.  The
 * second variant runs the round with the register halves swapped.
 */
.align 8
roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd:
	roundsm16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		  %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
		  %rcx, (%r9));
	ret;
ENDPROC(roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)

.align 8
roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab:
	roundsm16(%xmm4, %xmm5, %xmm6, %xmm7, %xmm0, %xmm1, %xmm2, %xmm3,
		  %xmm12, %xmm13, %xmm14, %xmm15, %xmm8, %xmm9, %xmm10, %xmm11,
		  %rax, (%r9));
	ret;
ENDPROC(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)

/*
 * IN/OUT:
 *  x0..x7: byte-sliced AB state preloaded
 *  mem_ab: byte-sliced AB state in memory
 *  mem_cd: byte-sliced CD state in memory
 */
#define two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
	leaq (key_table + (i) * 8)(CTX), %r9; \
	call roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \
	\
	vmovdqu x4, 0 * 16(mem_cd); \
	vmovdqu x5, 1 * 16(mem_cd); \
	vmovdqu x6, 2 * 16(mem_cd); \
	vmovdqu x7, 3 * 16(mem_cd); \
	vmovdqu x0, 4 * 16(mem_cd); \
	vmovdqu x1, 5 * 16(mem_cd); \
	vmovdqu x2, 6 * 16(mem_cd); \
	vmovdqu x3, 7 * 16(mem_cd); \
	\
	leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \
	call roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \
	\
	store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);

#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab)

#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
	/* Store new AB state */ \
	vmovdqu x0, 0 * 16(mem_ab); \
	vmovdqu x1, 1 * 16(mem_ab); \
	vmovdqu x2, 2 * 16(mem_ab); \
	vmovdqu x3, 3 * 16(mem_ab); \
	vmovdqu x4, 4 * 16(mem_ab); \
	vmovdqu x5, 5 * 16(mem_ab); \
	vmovdqu x6, 6 * 16(mem_ab); \
	vmovdqu x7, 7 * 16(mem_ab);

#define enc_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, mem_ab, mem_cd, i) \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);

#define dec_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, mem_ab, mem_cd, i) \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);
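
/*
 * Note: each two_roundsm16() call performs two Camellia rounds (AB -> CD,
 * then CD -> AB), so enc_rounds16(i)/dec_rounds16(i) run six rounds using
 * subkeys i+2 .. i+7, in forward or reverse order respectively.
 */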

/*
 * rol32_1_16: rotate four byte-sliced 32-bit values left by one bit.
 * IN:
 *  v0..3: byte-sliced 32-bit integers
 * OUT:
 *  v0..3: (IN <<< 1)
 */
#define rol32_1_16(v0, v1, v2, v3, t0, t1, t2, zero) \
	vpcmpgtb v0, zero, t0; \
	vpaddb v0, v0, v0; \
	vpabsb t0, t0; \
	\
	vpcmpgtb v1, zero, t1; \
	vpaddb v1, v1, v1; \
	vpabsb t1, t1; \
	\
	vpcmpgtb v2, zero, t2; \
	vpaddb v2, v2, v2; \
	vpabsb t2, t2; \
	\
	vpor t0, v1, v1; \
	\
	vpcmpgtb v3, zero, t0; \
	vpaddb v3, v3, v3; \
	vpabsb t0, t0; \
	\
	vpor t1, v2, v2; \
	vpor t2, v3, v3; \
	vpor t0, v0, v0;

/*
 * fls16: apply the Camellia FL layer to the byte-sliced left half (memory at
 * l, registers l0..l7) and FL^-1 to the byte-sliced right half (memory at r),
 * using the 32-bit subkey halves kll, klr, krl, krr.
 */
#define fls16(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
	      tt1, tt2, tt3, kll, klr, krl, krr) \
	/* \
	 * t0 = kll; \
	 * t0 &= ll; \
	 * lr ^= rol32(t0, 1); \
	 */ \
	vpxor tt0, tt0, tt0; \
	vmovd kll, t0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpand l0, t0, t0; \
	vpand l1, t1, t1; \
	vpand l2, t2, t2; \
	vpand l3, t3, t3; \
	\
	rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
	\
	vpxor l4, t0, l4; \
	vmovdqu l4, 4 * 16(l); \
	vpxor l5, t1, l5; \
	vmovdqu l5, 5 * 16(l); \
	vpxor l6, t2, l6; \
	vmovdqu l6, 6 * 16(l); \
	vpxor l7, t3, l7; \
	vmovdqu l7, 7 * 16(l); \
	\
	/* \
	 * t2 = krr; \
	 * t2 |= rr; \
	 * rl ^= t2; \
	 */ \
	\
	vmovd krr, t0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpor 4 * 16(r), t0, t0; \
	vpor 5 * 16(r), t1, t1; \
	vpor 6 * 16(r), t2, t2; \
	vpor 7 * 16(r), t3, t3; \
	\
	vpxor 0 * 16(r), t0, t0; \
	vpxor 1 * 16(r), t1, t1; \
	vpxor 2 * 16(r), t2, t2; \
	vpxor 3 * 16(r), t3, t3; \
	vmovdqu t0, 0 * 16(r); \
	vmovdqu t1, 1 * 16(r); \
	vmovdqu t2, 2 * 16(r); \
	vmovdqu t3, 3 * 16(r); \
	\
	/* \
	 * t2 = krl; \
	 * t2 &= rl; \
	 * rr ^= rol32(t2, 1); \
	 */ \
	vmovd krl, t0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpand 0 * 16(r), t0, t0; \
	vpand 1 * 16(r), t1, t1; \
	vpand 2 * 16(r), t2, t2; \
	vpand 3 * 16(r), t3, t3; \
	\
	rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
	\
	vpxor 4 * 16(r), t0, t0; \
	vpxor 5 * 16(r), t1, t1; \
	vpxor 6 * 16(r), t2, t2; \
	vpxor 7 * 16(r), t3, t3; \
	vmovdqu t0, 4 * 16(r); \
	vmovdqu t1, 5 * 16(r); \
	vmovdqu t2, 6 * 16(r); \
	vmovdqu t3, 7 * 16(r); \
	\
	/* \
	 * t0 = klr; \
	 * t0 |= lr; \
	 * ll ^= t0; \
	 */ \
	\
	vmovd klr, t0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpor l4, t0, t0; \
	vpor l5, t1, t1; \
	vpor l6, t2, t2; \
	vpor l7, t3, t3; \
	\
	vpxor l0, t0, l0; \
	vmovdqu l0, 0 * 16(l); \
	vpxor l1, t1, l1; \
	vmovdqu l1, 1 * 16(l); \
	vpxor l2, t2, l2; \
	vmovdqu l2, 2 * 16(l); \
	vpxor l3, t3, l3; \
	vmovdqu l3, 3 * 16(l);
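
/*
 * transpose_4x4() transposes a 4x4 matrix of 32-bit words held one row per
 * register in x0..x3; t1 and t2 are clobbered.  It is the building block of
 * the 16x16-byte slicing below.
 */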
#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x1, x0, x0; \
	\
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x2; \
	\
	vpunpckhqdq t1, x0, x1; \
	vpunpcklqdq t1, x0, x0; \
	\
	vpunpckhqdq x2, t2, x3; \
	vpunpcklqdq x2, t2, x2;
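
/*
 * byteslice_16x16b() byte-slices 16 loaded blocks: afterwards register i of
 * the a/b/c/d groups holds byte i of every block.  .Lshufb_16x16b gathers the
 * bytes within each register and the transpose_4x4() calls move them across
 * registers; st0/st1 are two 16-byte spill slots.
 */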
#define byteslice_16x16b(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, \
			 b3, c3, d3, st0, st1) \
	vmovdqu d2, st0; \
	vmovdqu d3, st1; \
	transpose_4x4(a0, a1, a2, a3, d2, d3); \
	transpose_4x4(b0, b1, b2, b3, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu a0, st0; \
	vmovdqu a1, st1; \
	transpose_4x4(c0, c1, c2, c3, a0, a1); \
	transpose_4x4(d0, d1, d2, d3, a0, a1); \
	\
	vmovdqu .Lshufb_16x16b, a0; \
	vmovdqu st1, a1; \
	vpshufb a0, a2, a2; \
	vpshufb a0, a3, a3; \
	vpshufb a0, b0, b0; \
	vpshufb a0, b1, b1; \
	vpshufb a0, b2, b2; \
	vpshufb a0, b3, b3; \
	vpshufb a0, a1, a1; \
	vpshufb a0, c0, c0; \
	vpshufb a0, c1, c1; \
	vpshufb a0, c2, c2; \
	vpshufb a0, c3, c3; \
	vpshufb a0, d0, d0; \
	vpshufb a0, d1, d1; \
	vpshufb a0, d2, d2; \
	vpshufb a0, d3, d3; \
	vmovdqu d3, st1; \
	vmovdqu st0, d3; \
	vpshufb a0, d3, a0; \
	vmovdqu d2, st0; \
	\
	transpose_4x4(a0, b0, c0, d0, d2, d3); \
	transpose_4x4(a1, b1, c1, d1, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu b0, st0; \
	vmovdqu b1, st1; \
	transpose_4x4(a2, b2, c2, d2, b0, b1); \
	transpose_4x4(a3, b3, c3, d3, b0, b1); \
	vmovdqu st0, b0; \
	vmovdqu st1, b1;

/* load blocks to registers and apply pre-whitening */
#define inpack16_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, rio, key) \
	vmovq key, x0; \
	vpshufb .Lpack_bswap, x0, x0; \
	\
	vpxor 0 * 16(rio), x0, y7; \
	vpxor 1 * 16(rio), x0, y6; \
	vpxor 2 * 16(rio), x0, y5; \
	vpxor 3 * 16(rio), x0, y4; \
	vpxor 4 * 16(rio), x0, y3; \
	vpxor 5 * 16(rio), x0, y2; \
	vpxor 6 * 16(rio), x0, y1; \
	vpxor 7 * 16(rio), x0, y0; \
	vpxor 8 * 16(rio), x0, x7; \
	vpxor 9 * 16(rio), x0, x6; \
	vpxor 10 * 16(rio), x0, x5; \
	vpxor 11 * 16(rio), x0, x4; \
	vpxor 12 * 16(rio), x0, x3; \
	vpxor 13 * 16(rio), x0, x2; \
	vpxor 14 * 16(rio), x0, x1; \
	vpxor 15 * 16(rio), x0, x0;

/* byteslice the pre-whitened blocks and store them to temporary memory */
#define inpack16_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd) \
	byteslice_16x16b(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
			 y5, y6, y7, (mem_ab), (mem_cd)); \
	\
	vmovdqu x0, 0 * 16(mem_ab); \
	vmovdqu x1, 1 * 16(mem_ab); \
	vmovdqu x2, 2 * 16(mem_ab); \
	vmovdqu x3, 3 * 16(mem_ab); \
	vmovdqu x4, 4 * 16(mem_ab); \
	vmovdqu x5, 5 * 16(mem_ab); \
	vmovdqu x6, 6 * 16(mem_ab); \
	vmovdqu x7, 7 * 16(mem_ab); \
	vmovdqu y0, 0 * 16(mem_cd); \
	vmovdqu y1, 1 * 16(mem_cd); \
	vmovdqu y2, 2 * 16(mem_cd); \
	vmovdqu y3, 3 * 16(mem_cd); \
	vmovdqu y4, 4 * 16(mem_cd); \
	vmovdqu y5, 5 * 16(mem_cd); \
	vmovdqu y6, 6 * 16(mem_cd); \
	vmovdqu y7, 7 * 16(mem_cd);

/* de-byteslice and apply post-whitening */
#define outunpack16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
		    y5, y6, y7, key, stack_tmp0, stack_tmp1) \
	byteslice_16x16b(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, y3, \
			 y7, x3, x7, stack_tmp0, stack_tmp1); \
	\
	vmovdqu x0, stack_tmp0; \
	\
	vmovq key, x0; \
	vpshufb .Lpack_bswap, x0, x0; \
	\
	vpxor x0, y7, y7; \
	vpxor x0, y6, y6; \
	vpxor x0, y5, y5; \
	vpxor x0, y4, y4; \
	vpxor x0, y3, y3; \
	vpxor x0, y2, y2; \
	vpxor x0, y1, y1; \
	vpxor x0, y0, y0; \
	vpxor x0, x7, x7; \
	vpxor x0, x6, x6; \
	vpxor x0, x5, x5; \
	vpxor x0, x4, x4; \
	vpxor x0, x3, x3; \
	vpxor x0, x2, x2; \
	vpxor x0, x1, x1; \
	vpxor stack_tmp0, x0, x0;

#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, rio) \
	vmovdqu x0, 0 * 16(rio); \
	vmovdqu x1, 1 * 16(rio); \
	vmovdqu x2, 2 * 16(rio); \
	vmovdqu x3, 3 * 16(rio); \
	vmovdqu x4, 4 * 16(rio); \
	vmovdqu x5, 5 * 16(rio); \
	vmovdqu x6, 6 * 16(rio); \
	vmovdqu x7, 7 * 16(rio); \
	vmovdqu y0, 8 * 16(rio); \
	vmovdqu y1, 9 * 16(rio); \
	vmovdqu y2, 10 * 16(rio); \
	vmovdqu y3, 11 * 16(rio); \
	vmovdqu y4, 12 * 16(rio); \
	vmovdqu y5, 13 * 16(rio); \
	vmovdqu y6, 14 * 16(rio); \
	vmovdqu y7, 15 * 16(rio);
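
/*
 * Data flow of a 16-way call: inpack16_pre() loads 16 blocks and XORs in the
 * whitening key, inpack16_post() byte-slices them into the mem_ab/mem_cd
 * scratch areas (2 x 128 bytes at %rax), the round macros operate on that
 * byte-sliced state, and outunpack16() + write_output() reverse the slicing,
 * apply the output whitening and store the result.
 */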

.data
.align 16

#define SHUFB_BYTES(idx) \
	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)

.Lshufb_16x16b:
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);

.Lpack_bswap:
	.long 0x00010203
	.long 0x04050607
	.long 0x80808080
	.long 0x80808080

/* For CTR-mode IV byteswap */
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

/* For XTS mode IV generation */
.Lxts_gf128mul_and_shl1_mask:
	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0

/*
 * Pre-SubBytes transform for Camellia sboxes 1, 2 and 3: an affine mapping
 * from the Camellia s-box input basis to the AES S-box input basis, split
 * into low/high nibble lookup tables for filter_8bit().
 */
.Lpre_tf_lo_s1:
	.byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
	.byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
.Lpre_tf_hi_s1:
	.byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
	.byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23

/*
 * Pre-SubBytes transform for Camellia sbox 4 (sbox 1 applied to the input
 * rotated left by one bit), again as nibble tables for filter_8bit().
 */
.Lpre_tf_lo_s4:
	.byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
	.byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
.Lpre_tf_hi_s4:
	.byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
	.byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf

/*
 * Post-SubBytes transform for Camellia sboxes 1 and 4: maps the AES S-box
 * output basis back to the Camellia s-box output basis.
 */
.Lpost_tf_lo_s1:
	.byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
	.byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
.Lpost_tf_hi_s1:
	.byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
	.byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c

/*
 * Post-SubBytes transform for Camellia sbox 2 (sbox 1 with the output
 * rotated left by one bit).
 */
.Lpost_tf_lo_s2:
	.byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
	.byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
.Lpost_tf_hi_s2:
	.byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
	.byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18

/*
 * Post-SubBytes transform for Camellia sbox 3 (sbox 1 with the output
 * rotated right by one bit).
 */
.Lpost_tf_lo_s3:
	.byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
	.byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
.Lpost_tf_hi_s3:
	.byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
	.byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06

/* For isolating SubBytes from AESENCLAST, inverse shift row */
.Linv_shift_row:
	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03

/* 4-bit mask */
.align 4
.L0f0f0f0f:
	.long 0x0f0f0f0f

.text

.align 8
__camellia_enc_blk16:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rax: temporary storage, 256 bytes
	 *	%xmm0..%xmm15: 16 plaintext blocks
	 * output:
	 *	%xmm0..%xmm15: 16 encrypted blocks, block order swapped
	 */

	leaq 8 * 16(%rax), %rcx;

	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		      %xmm15, %rax, %rcx);

	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 0);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (8) * 8) + 0)(CTX),
	      ((key_table + (8) * 8) + 4)(CTX),
	      ((key_table + (8) * 8) + 8)(CTX),
	      ((key_table + (8) * 8) + 12)(CTX));

	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 8);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (16) * 8) + 0)(CTX),
	      ((key_table + (16) * 8) + 4)(CTX),
	      ((key_table + (16) * 8) + 8)(CTX),
	      ((key_table + (16) * 8) + 12)(CTX));

	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 16);

	movl $24, %r8d;
	cmpl $16, key_length(CTX);
	jne .Lenc_max32;

.Lenc_done:
	/* load CD for output */
	vmovdqu 0 * 16(%rcx), %xmm8;
	vmovdqu 1 * 16(%rcx), %xmm9;
	vmovdqu 2 * 16(%rcx), %xmm10;
	vmovdqu 3 * 16(%rcx), %xmm11;
	vmovdqu 4 * 16(%rcx), %xmm12;
	vmovdqu 5 * 16(%rcx), %xmm13;
	vmovdqu 6 * 16(%rcx), %xmm14;
	vmovdqu 7 * 16(%rcx), %xmm15;

	outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		    %xmm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 16(%rax));

	ret;

.align 8
.Lenc_max32:
	movl $32, %r8d;

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (24) * 8) + 0)(CTX),
	      ((key_table + (24) * 8) + 4)(CTX),
	      ((key_table + (24) * 8) + 8)(CTX),
	      ((key_table + (24) * 8) + 12)(CTX));

	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 24);

	jmp .Lenc_done;
ENDPROC(__camellia_enc_blk16)

.align 8
__camellia_dec_blk16:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rax: temporary storage, 256 bytes
	 *	%r8d: 24 for a 16-byte key, 32 for a 24/32-byte key
	 *	%xmm0..%xmm15: 16 encrypted blocks
	 * output:
	 *	%xmm0..%xmm15: 16 plaintext blocks, block order swapped
	 */

	leaq 8 * 16(%rax), %rcx;

	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		      %xmm15, %rax, %rcx);

	cmpl $32, %r8d;
	je .Ldec_max32;

.Ldec_max24:
	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 16);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (16) * 8) + 8)(CTX),
	      ((key_table + (16) * 8) + 12)(CTX),
	      ((key_table + (16) * 8) + 0)(CTX),
	      ((key_table + (16) * 8) + 4)(CTX));

	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 8);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (8) * 8) + 8)(CTX),
	      ((key_table + (8) * 8) + 12)(CTX),
	      ((key_table + (8) * 8) + 0)(CTX),
	      ((key_table + (8) * 8) + 4)(CTX));

	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 0);

	/* load CD for output */
	vmovdqu 0 * 16(%rcx), %xmm8;
	vmovdqu 1 * 16(%rcx), %xmm9;
	vmovdqu 2 * 16(%rcx), %xmm10;
	vmovdqu 3 * 16(%rcx), %xmm11;
	vmovdqu 4 * 16(%rcx), %xmm12;
	vmovdqu 5 * 16(%rcx), %xmm13;
	vmovdqu 6 * 16(%rcx), %xmm14;
	vmovdqu 7 * 16(%rcx), %xmm15;

	outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		    %xmm15, (key_table)(CTX), (%rax), 1 * 16(%rax));

	ret;

.align 8
.Ldec_max32:
	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 24);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (24) * 8) + 8)(CTX),
	      ((key_table + (24) * 8) + 12)(CTX),
	      ((key_table + (24) * 8) + 0)(CTX),
	      ((key_table + (24) * 8) + 4)(CTX));

	jmp .Ldec_max24;
ENDPROC(__camellia_dec_blk16)

ENTRY(camellia_ecb_enc_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 */

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx, (key_table)(CTX));

	/* now dst can be used as temporary buffer (even in src == dst case) */
	movq %rsi, %rax;

	call __camellia_enc_blk16;

	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
		     %xmm8, %rsi);

	ret;
ENDPROC(camellia_ecb_enc_16way)

ENTRY(camellia_ecb_dec_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 */

	/* %r8d: index of the last subkey (24 for 128-bit keys, 32 otherwise) */
	cmpl $16, key_length(CTX);
	movl $32, %r8d;
	movl $24, %eax;
	cmovel %eax, %r8d;

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx, (key_table)(CTX, %r8, 8));

	/* now dst can be used as temporary buffer (even in src == dst case) */
	movq %rsi, %rax;

	call __camellia_dec_blk16;

	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
		     %xmm8, %rsi);

	ret;
ENDPROC(camellia_ecb_dec_16way)

ENTRY(camellia_cbc_dec_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 */

	/* %r8d: index of the last subkey (24 for 128-bit keys, 32 otherwise) */
	cmpl $16, key_length(CTX);
	movl $32, %r8d;
	movl $24, %eax;
	cmovel %eax, %r8d;

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx, (key_table)(CTX, %r8, 8));

	/*
	 * src (%rdx) is still needed for the chaining XOR below and may alias
	 * dst, so use the stack as the temporary buffer.
	 */
	subq $(16 * 16), %rsp;
	movq %rsp, %rax;

	call __camellia_dec_blk16;

	addq $(16 * 16), %rsp;

	/* XOR each output block except the first with the preceding ciphertext block */
	vpxor (0 * 16)(%rdx), %xmm6, %xmm6;
	vpxor (1 * 16)(%rdx), %xmm5, %xmm5;
	vpxor (2 * 16)(%rdx), %xmm4, %xmm4;
	vpxor (3 * 16)(%rdx), %xmm3, %xmm3;
	vpxor (4 * 16)(%rdx), %xmm2, %xmm2;
	vpxor (5 * 16)(%rdx), %xmm1, %xmm1;
	vpxor (6 * 16)(%rdx), %xmm0, %xmm0;
	vpxor (7 * 16)(%rdx), %xmm15, %xmm15;
	vpxor (8 * 16)(%rdx), %xmm14, %xmm14;
	vpxor (9 * 16)(%rdx), %xmm13, %xmm13;
	vpxor (10 * 16)(%rdx), %xmm12, %xmm12;
	vpxor (11 * 16)(%rdx), %xmm11, %xmm11;
	vpxor (12 * 16)(%rdx), %xmm10, %xmm10;
	vpxor (13 * 16)(%rdx), %xmm9, %xmm9;
	vpxor (14 * 16)(%rdx), %xmm8, %xmm8;
	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
		     %xmm8, %rsi);

	ret;
ENDPROC(camellia_cbc_dec_16way)

/*
 * Increment a 128-bit little-endian counter held in an xmm register;
 * minus_one must be all-ones in the low 64-bit lane and zero in the high lane.
 */
#define inc_le128(x, minus_one, tmp) \
	vpcmpeqq minus_one, x, tmp; \
	vpsubq minus_one, x, x; \
	vpslldq $8, tmp, tmp; \
	vpsubq tmp, x, x;

ENTRY(camellia_ctr_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 *	%rcx: iv (128-bit counter, incremented here as a little-endian value)
	 */

	subq $(16 * 16), %rsp;
	movq %rsp, %rax;

	vmovdqa .Lbswap128_mask, %xmm14;

	/* load IV and byteswap */
	vmovdqu (%rcx), %xmm0;
	vpshufb %xmm14, %xmm0, %xmm15;
	vmovdqu %xmm15, 15 * 16(%rax);

	/* %xmm15 = -1 in the low qword, 0 in the high qword (for inc_le128) */
	vpcmpeqd %xmm15, %xmm15, %xmm15;
	vpsrldq $8, %xmm15, %xmm15;

	/* construct IVs */
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm13;
	vmovdqu %xmm13, 14 * 16(%rax);
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm13;
	vmovdqu %xmm13, 13 * 16(%rax);
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm12;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm11;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm10;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm9;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm8;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm7;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm6;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm5;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm4;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm3;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm2;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm1;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vmovdqa %xmm0, %xmm13;
	vpshufb %xmm14, %xmm0, %xmm0;
	inc_le128(%xmm13, %xmm15, %xmm14);
	vmovdqu %xmm13, (%rcx);

	/* inpack16_pre: XOR the input whitening key into the counter blocks */
	vmovq (key_table)(CTX), %xmm15;
	vpshufb .Lpack_bswap, %xmm15, %xmm15;
	vpxor %xmm0, %xmm15, %xmm0;
	vpxor %xmm1, %xmm15, %xmm1;
	vpxor %xmm2, %xmm15, %xmm2;
	vpxor %xmm3, %xmm15, %xmm3;
	vpxor %xmm4, %xmm15, %xmm4;
	vpxor %xmm5, %xmm15, %xmm5;
	vpxor %xmm6, %xmm15, %xmm6;
	vpxor %xmm7, %xmm15, %xmm7;
	vpxor %xmm8, %xmm15, %xmm8;
	vpxor %xmm9, %xmm15, %xmm9;
	vpxor %xmm10, %xmm15, %xmm10;
	vpxor %xmm11, %xmm15, %xmm11;
	vpxor %xmm12, %xmm15, %xmm12;
	vpxor 13 * 16(%rax), %xmm15, %xmm13;
	vpxor 14 * 16(%rax), %xmm15, %xmm14;
	vpxor 15 * 16(%rax), %xmm15, %xmm15;

	call __camellia_enc_blk16;

	addq $(16 * 16), %rsp;

	/* XOR the encrypted counter blocks with src to produce dst */
	vpxor 0 * 16(%rdx), %xmm7, %xmm7;
	vpxor 1 * 16(%rdx), %xmm6, %xmm6;
	vpxor 2 * 16(%rdx), %xmm5, %xmm5;
	vpxor 3 * 16(%rdx), %xmm4, %xmm4;
	vpxor 4 * 16(%rdx), %xmm3, %xmm3;
	vpxor 5 * 16(%rdx), %xmm2, %xmm2;
	vpxor 6 * 16(%rdx), %xmm1, %xmm1;
	vpxor 7 * 16(%rdx), %xmm0, %xmm0;
	vpxor 8 * 16(%rdx), %xmm15, %xmm15;
	vpxor 9 * 16(%rdx), %xmm14, %xmm14;
	vpxor 10 * 16(%rdx), %xmm13, %xmm13;
	vpxor 11 * 16(%rdx), %xmm12, %xmm12;
	vpxor 12 * 16(%rdx), %xmm11, %xmm11;
	vpxor 13 * 16(%rdx), %xmm10, %xmm10;
	vpxor 14 * 16(%rdx), %xmm9, %xmm9;
	vpxor 15 * 16(%rdx), %xmm8, %xmm8;
	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
		     %xmm8, %rsi);

	ret;
ENDPROC(camellia_ctr_16way)

/* compute the next XTS tweak: multiply by x in GF(2^128) */
#define gf128mul_x_ble(iv, mask, tmp) \
	vpsrad $31, iv, tmp; \
	vpaddq iv, iv, iv; \
	vpshufd $0x13, tmp, tmp; \
	vpand mask, tmp, tmp; \
	vpxor tmp, iv, iv;
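
/*
 * In gf128mul_x_ble() the tweak is doubled as a 128-bit little-endian value:
 * vpaddq shifts both 64-bit halves left by one, vpsrad/vpshufd rebuild the
 * two lost carry bits (bit 63 into bit 64, bit 127 into the reduction term),
 * and the masked vpxor folds in the XTS reduction polynomial 0x87.
 */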

.align 8
camellia_xts_crypt_16way:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 *	%rcx: iv (t xor alpha^n in GF(2^128))
	 *	%r8: index of the input whitening key
	 *	%r9: pointer to __camellia_enc_blk16 or __camellia_dec_blk16
	 */

	subq $(16 * 16), %rsp;
	movq %rsp, %rax;

	vmovdqa .Lxts_gf128mul_and_shl1_mask, %xmm14;

	/* load the initial tweak, mask the first block and stash the tweak in dst */
	vmovdqu (%rcx), %xmm0;
	vpxor 0 * 16(%rdx), %xmm0, %xmm15;
	vmovdqu %xmm15, 15 * 16(%rax);
	vmovdqu %xmm0, 0 * 16(%rsi);

	/* construct the remaining tweaks, masking each source block */
	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 1 * 16(%rdx), %xmm0, %xmm15;
	vmovdqu %xmm15, 14 * 16(%rax);
	vmovdqu %xmm0, 1 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 2 * 16(%rdx), %xmm0, %xmm13;
	vmovdqu %xmm0, 2 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 3 * 16(%rdx), %xmm0, %xmm12;
	vmovdqu %xmm0, 3 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 4 * 16(%rdx), %xmm0, %xmm11;
	vmovdqu %xmm0, 4 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 5 * 16(%rdx), %xmm0, %xmm10;
	vmovdqu %xmm0, 5 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 6 * 16(%rdx), %xmm0, %xmm9;
	vmovdqu %xmm0, 6 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 7 * 16(%rdx), %xmm0, %xmm8;
	vmovdqu %xmm0, 7 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 8 * 16(%rdx), %xmm0, %xmm7;
	vmovdqu %xmm0, 8 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 9 * 16(%rdx), %xmm0, %xmm6;
	vmovdqu %xmm0, 9 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 10 * 16(%rdx), %xmm0, %xmm5;
	vmovdqu %xmm0, 10 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 11 * 16(%rdx), %xmm0, %xmm4;
	vmovdqu %xmm0, 11 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 12 * 16(%rdx), %xmm0, %xmm3;
	vmovdqu %xmm0, 12 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 13 * 16(%rdx), %xmm0, %xmm2;
	vmovdqu %xmm0, 13 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 14 * 16(%rdx), %xmm0, %xmm1;
	vmovdqu %xmm0, 14 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 15 * 16(%rdx), %xmm0, %xmm15;
	vmovdqu %xmm15, 0 * 16(%rax);
	vmovdqu %xmm0, 15 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vmovdqu %xmm0, (%rcx);

	/* inpack16_pre: XOR the whitening key selected by %r8 into the masked blocks */
	vmovq (key_table)(CTX, %r8, 8), %xmm15;
	vpshufb .Lpack_bswap, %xmm15, %xmm15;
	vpxor 0 * 16(%rax), %xmm15, %xmm0;
	vpxor %xmm1, %xmm15, %xmm1;
	vpxor %xmm2, %xmm15, %xmm2;
	vpxor %xmm3, %xmm15, %xmm3;
	vpxor %xmm4, %xmm15, %xmm4;
	vpxor %xmm5, %xmm15, %xmm5;
	vpxor %xmm6, %xmm15, %xmm6;
	vpxor %xmm7, %xmm15, %xmm7;
	vpxor %xmm8, %xmm15, %xmm8;
	vpxor %xmm9, %xmm15, %xmm9;
	vpxor %xmm10, %xmm15, %xmm10;
	vpxor %xmm11, %xmm15, %xmm11;
	vpxor %xmm12, %xmm15, %xmm12;
	vpxor %xmm13, %xmm15, %xmm13;
	vpxor 14 * 16(%rax), %xmm15, %xmm14;
	vpxor 15 * 16(%rax), %xmm15, %xmm15;

	call *%r9;

	addq $(16 * 16), %rsp;

	/* XOR the output blocks with the tweaks that were stashed in dst */
	vpxor 0 * 16(%rsi), %xmm7, %xmm7;
	vpxor 1 * 16(%rsi), %xmm6, %xmm6;
	vpxor 2 * 16(%rsi), %xmm5, %xmm5;
	vpxor 3 * 16(%rsi), %xmm4, %xmm4;
	vpxor 4 * 16(%rsi), %xmm3, %xmm3;
	vpxor 5 * 16(%rsi), %xmm2, %xmm2;
	vpxor 6 * 16(%rsi), %xmm1, %xmm1;
	vpxor 7 * 16(%rsi), %xmm0, %xmm0;
	vpxor 8 * 16(%rsi), %xmm15, %xmm15;
	vpxor 9 * 16(%rsi), %xmm14, %xmm14;
	vpxor 10 * 16(%rsi), %xmm13, %xmm13;
	vpxor 11 * 16(%rsi), %xmm12, %xmm12;
	vpxor 12 * 16(%rsi), %xmm11, %xmm11;
	vpxor 13 * 16(%rsi), %xmm10, %xmm10;
	vpxor 14 * 16(%rsi), %xmm9, %xmm9;
	vpxor 15 * 16(%rsi), %xmm8, %xmm8;
	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
		     %xmm8, %rsi);

	ret;
ENDPROC(camellia_xts_crypt_16way)

ENTRY(camellia_xts_enc_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 *	%rcx: iv (t xor alpha^n in GF(2^128))
	 */

	/* %r8 = 0: key_table[0] is the input whitening key for encryption */
	xorl %r8d, %r8d;

	leaq __camellia_enc_blk16, %r9;

	jmp camellia_xts_crypt_16way;
ENDPROC(camellia_xts_enc_16way)

ENTRY(camellia_xts_dec_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 *	%rcx: iv (t xor alpha^n in GF(2^128))
	 */

	/* %r8 = index of the last whitening key, used as input whitening for decryption */
	cmpl $16, key_length(CTX);
	movl $32, %r8d;
	movl $24, %eax;
	cmovel %eax, %r8d;

	leaq __camellia_dec_blk16, %r9;

	jmp camellia_xts_crypt_16way;
ENDPROC(camellia_xts_dec_16way)