1
2
3
4
5
6
7
8
9
10
11
12
13#include <linux/linkage.h>
14
15.file "blowfish-avx2-asm_64.S"
16
/*
 * Constant tables. Nothing in this file stores to them, so they live in
 * read-only data rather than in the writable .data section.
 */
.section .rodata
.align 32

/*
 * Eight 32-bit byte offsets 0, 64, ..., 448 (multiples of 64).
 * Not referenced by the code visible in this file.
 */
.Lprefetch_mask:
	.long 0*64
	.long 1*64
	.long 2*64
	.long 3*64
	.long 4*64
	.long 5*64
	.long 6*64
	.long 7*64

/* vpshufb control: reverse the byte order inside every 32-bit word. */
.Lbswap32_mask:
	.long 0x00010203
	.long 0x04050607
	.long 0x08090a0b
	.long 0x0c0d0e0f

/* vpshufb control: reverse all 16 bytes of a 128-bit lane. */
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

/*
 * vpshufb control: write the byte-reversed low 64 bits of a lane into both
 * 64-bit halves of that lane.
 */
.Lbswap_iv_mask:
	.byte 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0
40
41.text
42
/*
 * Byte offsets relative to CTX.
 *
 *   p       base of the 32-bit round-key words read by add_roundkey()
 *   s0..s3  bases of the four lookup tables gathered from by F(); the first
 *           starts after 16 + 2 32-bit words, and consecutive tables are
 *           256 32-bit entries apart.
 *
 * NOTE(review): presumably mirrors the C-side Blowfish context layout -
 * confirm against the structure definition used by the callers.
 */
#define p	0
#define s0	((16 + 2) * 4)
#define s1	(s0 + (1 * 256) * 4)
#define s2	(s0 + (2 * 256) * 4)
#define s3	(s0 + (3 * 256) * 4)
48
49
/*
 * Register roles.
 */

/* Integer registers */
#define CTX	%rdi		/* base for the p and s0..s3 offsets */
#define RIO	%rdx		/* alias only; the visible code names %rdx directly */

#define RS0	%rax		/* CTX + s0, set by init_round_constants() */
#define RS1	%r8		/* CTX + s1 */
#define RS2	%r9		/* CTX + s2 */
#define RS3	%r10		/* CTX + s3 */

#define RLOOP	%r11		/* round-key index used by the round loops */
#define RLOOPd	%r11d

/* Block state: four "left" and four "right" 256-bit registers */
#define RXl0	%ymm12
#define RXl1	%ymm13
#define RXl2	%ymm14
#define RXl3	%ymm15
#define RXr0	%ymm8
#define RXr1	%ymm9
#define RXr2	%ymm10
#define RXr3	%ymm11

/* Scratch: temporaries and gather index/mask registers */
#define RT0	%ymm0
#define RT0x	%xmm0
#define RT1	%ymm1
#define RT1x	%xmm1
#define RIDX0	%ymm2
#define RIDX1	%ymm3
#define RIDX1x	%xmm3
#define RIDX2	%ymm4
#define RIDX3	%ymm5

/* All bits set; F() expects this on entry and restores it before leaving */
#define RNOT	%ymm6

/* 0x000000ff in every 32-bit element (see init_round_constants()) */
#define RBYTE	%ymm7
86
87
88
89
/*
 * F(xin, xout): for each of the eight 32-bit elements v of xin, compute
 *
 *	((T0[v >> 24] + T1[(v >> 16) & 0xff]) ^ T2[(v >> 8) & 0xff])
 *		+ T3[v & 0xff]
 *
 * where T0..T3 are the 32-bit tables at RS0..RS3, and XOR the result into
 * the matching element of xout. xin is left unchanged.
 *
 * Requires RNOT = all ones and RBYTE = 0xff per element on entry; RNOT is
 * all ones again on exit. Clobbers RT0, RT1 and RIDX0..RIDX3.
 *
 * vpgatherdd zeroes its mask operand, so every gather needs a freshly set
 * all-ones register: RNOT is re-armed after use, and RIDX0/RIDX1 are
 * recycled as masks once their index values have been consumed.
 */
#define F(xin, xout) \
	vpand RBYTE, xin, RIDX3; \
	vpsrld $8, xin, RIDX2; \
	vpsrld $16, xin, RIDX1; \
	vpsrld $24, xin, RIDX0; \
	vpand RBYTE, RIDX2, RIDX2; \
	vpand RBYTE, RIDX1, RIDX1; \
	\
	vpgatherdd RNOT, (RS0, RIDX0, 4), RT0; \
	vpcmpeqd RNOT, RNOT, RNOT; \
	vpcmpeqd RIDX0, RIDX0, RIDX0; \
	\
	vpgatherdd RNOT, (RS1, RIDX1, 4), RT1; \
	vpcmpeqd RIDX1, RIDX1, RIDX1; \
	vpaddd RT1, RT0, RT0; \
	\
	vpgatherdd RIDX0, (RS2, RIDX2, 4), RT1; \
	vpxor RT1, RT0, RT0; \
	\
	vpgatherdd RIDX1, (RS3, RIDX3, 4), RT1; \
	vpcmpeqd RNOT, RNOT, RNOT; \
	vpaddd RT1, RT0, RT0; \
	\
	vpxor xout, RT0, xout;
114
/*
 * add_roundkey(xl, nmem): broadcast the 32-bit word at memory operand nmem
 * to every element of RT0 and XOR it into the four registers xl0..xl3.
 *
 * xl is a register-group prefix (RXl or RXr); the suffixes 0..3 are pasted
 * on with ##. Clobbers RT0.
 */
#define add_roundkey(xl, nmem) \
	vpbroadcastd nmem, RT0; \
	vpxor RT0, xl ## 0, xl ## 0; \
	vpxor RT0, xl ## 1, xl ## 1; \
	vpxor RT0, xl ## 2, xl ## 2; \
	vpxor RT0, xl ## 3, xl ## 3;
121
/*
 * Half of round_enc(): XOR the round-key word at nmem into xout0..xout3,
 * then apply F(xin<n>, xout<n>) for n = 0..3.
 */
#define __round_enc_half(xin, xout, nmem) \
	add_roundkey(xout, nmem); \
	F(xin ## 0, xout ## 0); \
	F(xin ## 1, xout ## 1); \
	F(xin ## 2, xout ## 2); \
	F(xin ## 3, xout ## 3);

/*
 * round_enc(): two consecutive rounds on all four register pairs, using the
 * round-key words at index RLOOP and RLOOP + 1. The first half updates the
 * RXr group from RXl, the second half updates RXl from RXr.
 */
#define round_enc() \
	__round_enc_half(RXl, RXr, p(CTX,RLOOP,4)) \
	\
	__round_enc_half(RXr, RXl, p+4(CTX,RLOOP,4))
134
/*
 * Half of round_dec(): XOR the round-key word at nmem into xout0..xout3,
 * then apply F(xin<n>, xout<n>) for n = 0..3.
 */
#define __round_dec_half(xin, xout, nmem) \
	add_roundkey(xout, nmem); \
	F(xin ## 0, xout ## 0); \
	F(xin ## 1, xout ## 1); \
	F(xin ## 2, xout ## 2); \
	F(xin ## 3, xout ## 3);

/*
 * round_dec(): two consecutive rounds on all four register pairs, using the
 * round-key words at index RLOOP + 2 and then RLOOP + 1 (descending order).
 * The first half updates the RXr group from RXl, the second half updates
 * RXl from RXr.
 */
#define round_dec() \
	__round_dec_half(RXl, RXr, p+4*2(CTX,RLOOP,4)) \
	\
	__round_dec_half(RXr, RXl, p+4(CTX,RLOOP,4))
147
/*
 * init_round_constants(): set up the invariants F() relies on.
 *
 *   RS0..RS3  = CTX + s0 .. CTX + s3 (lookup table bases)
 *   RNOT      = all ones
 *   RBYTE     = 0x000000ff in every 32-bit element (all ones >> 24)
 */
#define init_round_constants() \
	leaq s0(CTX), RS0; \
	leaq s1(CTX), RS1; \
	leaq s2(CTX), RS2; \
	leaq s3(CTX), RS3; \
	vpcmpeqd RNOT, RNOT, RNOT; \
	vpsrld $24, RNOT, RBYTE;
155
/*
 * transpose_2x2(va, vb, vtmp): regroup the 32-bit elements of va and vb
 * within each 128-bit lane. With va = [a0 a1 a2 a3] and vb = [b0 b1 b2 b3]
 * (lowest element first) the result is
 *
 *	va = [b2 a2 b0 a0]	(the even-numbered elements of both inputs)
 *	vb = [b3 a3 b1 a1]	(the odd-numbered elements of both inputs)
 *
 * vtmp is clobbered. The instruction order matters: each step consumes a
 * value that the following step overwrites.
 */
#define transpose_2x2(va, vb, vtmp) \
	vpunpckldq va, vb, vtmp; \
	vpunpckhdq va, vb, vb; \
	\
	vpunpcklqdq vtmp, vb, va; \
	vpunpckhqdq vtmp, vb, vb;
162
/*
 * read_block(xl, xr): convert freshly loaded data in xl0..xl3 / xr0..xr3
 * into the working layout used by the round macros.
 *
 *   1. Reverse the bytes of every 32-bit word (vpshufb with .Lbswap32_mask).
 *   2. transpose_2x2 each (xl<n>, xr<n>) pair so that, per 128-bit lane,
 *      xl<n> receives the even-numbered words of the pair and xr<n> the
 *      odd-numbered ones.
 *
 * xl / xr are register-group prefixes (RXl / RXr); the suffixes 0..3 are
 * pasted on with ##. The mask is loaded RIP-relative so the reference does
 * not depend on an absolute address. Clobbers RT0 and RT1.
 */
#define read_block(xl, xr) \
	vbroadcasti128 .Lbswap32_mask(%rip), RT1; \
	\
	vpshufb RT1, xl ## 0, xl ## 0; \
	vpshufb RT1, xr ## 0, xr ## 0; \
	vpshufb RT1, xl ## 1, xl ## 1; \
	vpshufb RT1, xr ## 1, xr ## 1; \
	vpshufb RT1, xl ## 2, xl ## 2; \
	vpshufb RT1, xr ## 2, xr ## 2; \
	vpshufb RT1, xl ## 3, xl ## 3; \
	vpshufb RT1, xr ## 3, xr ## 3; \
	\
	transpose_2x2(xl ## 0, xr ## 0, RT0); \
	transpose_2x2(xl ## 1, xr ## 1, RT0); \
	transpose_2x2(xl ## 2, xr ## 2, RT0); \
	transpose_2x2(xl ## 3, xr ## 3, RT0);
179
/*
 * write_block(xl, xr): convert xl0..xl3 / xr0..xr3 from the working layout
 * back to the in-memory byte order before the caller stores them.
 *
 *   1. transpose_2x2 each (xl<n>, xr<n>) pair.
 *   2. Reverse the bytes of every 32-bit word (vpshufb with .Lbswap32_mask).
 *
 * xl / xr are register-group prefixes (RXl / RXr); the suffixes 0..3 are
 * pasted on with ##. RT1 holds the shuffle mask across the transposes, so
 * RT0 is the transpose temporary. The mask is loaded RIP-relative so the
 * reference does not depend on an absolute address. Clobbers RT0 and RT1.
 */
#define write_block(xl, xr) \
	vbroadcasti128 .Lbswap32_mask(%rip), RT1; \
	\
	transpose_2x2(xl ## 0, xr ## 0, RT0); \
	transpose_2x2(xl ## 1, xr ## 1, RT0); \
	transpose_2x2(xl ## 2, xr ## 2, RT0); \
	transpose_2x2(xl ## 3, xr ## 3, RT0); \
	\
	vpshufb RT1, xl ## 0, xl ## 0; \
	vpshufb RT1, xr ## 0, xr ## 0; \
	vpshufb RT1, xl ## 1, xl ## 1; \
	vpshufb RT1, xr ## 1, xr ## 1; \
	vpshufb RT1, xl ## 2, xl ## 2; \
	vpshufb RT1, xr ## 2, xr ## 2; \
	vpshufb RT1, xl ## 3, xl ## 3; \
	vpshufb RT1, xr ## 3, xr ## 3;
196
.p2align 3
__blowfish_enc_blk32:
	/*
	 * Internal helper: forward pass over the data held in the RXl/RXr
	 * register groups.
	 *
	 * input:
	 *	CTX (%rdi): context (round-key words at p, tables at s0..s3)
	 *	RXl0..3, RXr0..3: data exactly as loaded from memory
	 * output:
	 *	RXl0..3, RXr0..3: result in memory byte order; the callers in
	 *	this file store each RXr register ahead of its RXl partner
	 * clobbers:
	 *	RS0..RS3, RLOOP, RT0, RT1, RIDX0..3, RNOT, RBYTE, flags
	 */

	init_round_constants();

	read_block(RXl, RXr);

	/* round-key word 0 goes into the left group before the loop */
	add_roundkey(RXl, p+4*0(CTX));
	movl $1, RLOOPd;

.p2align 2
.L__enc_loop:
	/* consumes round-key words RLOOP and RLOOP + 1 */
	round_enc();

	addl $2, RLOOPd;
	cmpl $17, RLOOPd;
	jne .L__enc_loop;

	/* words 1..16 are used up; word 17 finishes the right group */
	add_roundkey(RXr, p+4*17(CTX));

	write_block(RXl, RXr);

	ret;
ENDPROC(__blowfish_enc_blk32)
226
.p2align 3
__blowfish_dec_blk32:
	/*
	 * Internal helper: reverse pass over the data held in the RXl/RXr
	 * register groups (round-key words are consumed in descending order).
	 *
	 * input:
	 *	CTX (%rdi): context (round-key words at p, tables at s0..s3)
	 *	RXl0..3, RXr0..3: data exactly as loaded from memory
	 * output:
	 *	RXl0..3, RXr0..3: result in memory byte order; the callers in
	 *	this file store each RXr register ahead of its RXl partner
	 * clobbers:
	 *	RS0..RS3, RLOOP, RT0, RT1, RIDX0..3, RNOT, RBYTE, flags
	 */

	init_round_constants();

	read_block(RXl, RXr);

	/* round-key word 17 goes into the left group before the loop */
	add_roundkey(RXl, p+4*17(CTX));
	movl $14, RLOOPd;

.p2align 2
.L__dec_loop:
	/* consumes round-key words RLOOP + 2 and RLOOP + 1 */
	round_dec();

	/* step 14, 12, ..., 0; leave once the index turns negative */
	subl $2, RLOOPd;
	jns .L__dec_loop;

	/* words 16..1 are used up; word 0 finishes the right group */
	add_roundkey(RXr, p+4*0(CTX));

	write_block(RXl, RXr);

	ret;
ENDPROC(__blowfish_dec_blk32)
255
ENTRY(blowfish_ecb_enc_32way)
	/*
	 * Run __blowfish_enc_blk32 over 8 * 32 = 256 bytes.
	 *
	 * input:
	 *	%rdi: ctx, CTX (passed through to the helper)
	 *	%rsi: dst, 256 bytes written
	 *	%rdx: src, 256 bytes read (RIO)
	 *
	 * All of src is loaded before anything is stored to dst.
	 */

	vzeroupper;

	/* alternate left/right registers over consecutive 32-byte chunks */
	vmovdqu (0*32)(RIO), RXl0;
	vmovdqu (1*32)(RIO), RXr0;
	vmovdqu (2*32)(RIO), RXl1;
	vmovdqu (3*32)(RIO), RXr1;
	vmovdqu (4*32)(RIO), RXl2;
	vmovdqu (5*32)(RIO), RXr2;
	vmovdqu (6*32)(RIO), RXl3;
	vmovdqu (7*32)(RIO), RXr3;

	call __blowfish_enc_blk32;

	/* stored with the left/right roles swapped relative to the loads */
	vmovdqu RXr0, (0*32)(%rsi);
	vmovdqu RXl0, (1*32)(%rsi);
	vmovdqu RXr1, (2*32)(%rsi);
	vmovdqu RXl1, (3*32)(%rsi);
	vmovdqu RXr2, (4*32)(%rsi);
	vmovdqu RXl2, (5*32)(%rsi);
	vmovdqu RXr3, (6*32)(%rsi);
	vmovdqu RXl3, (7*32)(%rsi);

	vzeroupper;

	ret;
ENDPROC(blowfish_ecb_enc_32way)
289
ENTRY(blowfish_ecb_dec_32way)
	/*
	 * Run __blowfish_dec_blk32 over 8 * 32 = 256 bytes.
	 *
	 * input:
	 *	%rdi: ctx, CTX (passed through to the helper)
	 *	%rsi: dst, 256 bytes written
	 *	%rdx: src, 256 bytes read (RIO)
	 *
	 * All of src is loaded before anything is stored to dst.
	 */

	vzeroupper;

	/* alternate left/right registers over consecutive 32-byte chunks */
	vmovdqu (0*32)(RIO), RXl0;
	vmovdqu (1*32)(RIO), RXr0;
	vmovdqu (2*32)(RIO), RXl1;
	vmovdqu (3*32)(RIO), RXr1;
	vmovdqu (4*32)(RIO), RXl2;
	vmovdqu (5*32)(RIO), RXr2;
	vmovdqu (6*32)(RIO), RXl3;
	vmovdqu (7*32)(RIO), RXr3;

	call __blowfish_dec_blk32;

	/* stored with the left/right roles swapped relative to the loads */
	vmovdqu RXr0, (0*32)(%rsi);
	vmovdqu RXl0, (1*32)(%rsi);
	vmovdqu RXr1, (2*32)(%rsi);
	vmovdqu RXl1, (3*32)(%rsi);
	vmovdqu RXr2, (4*32)(%rsi);
	vmovdqu RXl2, (5*32)(%rsi);
	vmovdqu RXr3, (6*32)(%rsi);
	vmovdqu RXl3, (7*32)(%rsi);

	vzeroupper;

	ret;
ENDPROC(blowfish_ecb_dec_32way)
323
ENTRY(blowfish_cbc_dec_32way)
	/*
	 * Run __blowfish_dec_blk32 over 256 bytes of src, then XOR every
	 * 8-byte unit of the result except the first with the src unit that
	 * precedes it.
	 *
	 * input:
	 *	%rdi: ctx, CTX (passed through to the helper)
	 *	%rsi: dst, 256 bytes written
	 *	%rdx: src, 256 bytes read (RIO)
	 *
	 * The first 8-byte unit of the output gets no XOR here.
	 * NOTE(review): presumably the caller applies the IV to it - confirm.
	 * src is read again after the helper returns and before any store.
	 */

	vzeroupper;

	vmovdqu (0*32)(RIO), RXl0;
	vmovdqu (1*32)(RIO), RXr0;
	vmovdqu (2*32)(RIO), RXl1;
	vmovdqu (3*32)(RIO), RXr1;
	vmovdqu (4*32)(RIO), RXl2;
	vmovdqu (5*32)(RIO), RXr2;
	vmovdqu (6*32)(RIO), RXl3;
	vmovdqu (7*32)(RIO), RXr3;

	call __blowfish_dec_blk32;

	/*
	 * Build RT0 = [ 0, src[0..7], src[8..15], src[16..23] ] as four
	 * 64-bit units: vmovq zero-extends, vpshufd $0x4f moves the loaded
	 * qword to the upper half of the low lane and leaves zero below it.
	 */
	vmovq (RIO), RT0x;
	vpshufd $0x4f, RT0x, RT0x;
	vinserti128 $1, 8(RIO), RT0, RT0;
	vpxor RT0, RXr0, RXr0;

	/* every later chunk XORs with src shifted back by one 8-byte unit */
	vpxor (0*32+24)(RIO), RXl0, RXl0;
	vpxor (1*32+24)(RIO), RXr1, RXr1;
	vpxor (2*32+24)(RIO), RXl1, RXl1;
	vpxor (3*32+24)(RIO), RXr2, RXr2;
	vpxor (4*32+24)(RIO), RXl2, RXl2;
	vpxor (5*32+24)(RIO), RXr3, RXr3;
	vpxor (6*32+24)(RIO), RXl3, RXl3;

	vmovdqu RXr0, (0*32)(%rsi);
	vmovdqu RXl0, (1*32)(%rsi);
	vmovdqu RXr1, (2*32)(%rsi);
	vmovdqu RXl1, (3*32)(%rsi);
	vmovdqu RXr2, (4*32)(%rsi);
	vmovdqu RXl2, (5*32)(%rsi);
	vmovdqu RXr3, (6*32)(%rsi);
	vmovdqu RXl3, (7*32)(%rsi);

	vzeroupper;

	ret;
ENDPROC(blowfish_cbc_dec_32way)
370
ENTRY(blowfish_ctr_32way)
	/*
	 * Generate 32 consecutive 64-bit counter values starting at the
	 * value stored at (%rcx), run them through __blowfish_enc_blk32 and
	 * XOR the result with 256 bytes of src.
	 *
	 * input:
	 *	%rdi: ctx, CTX (passed through to the helper)
	 *	%rsi: dst, 256 bytes written
	 *	%rdx: src, 256 bytes read
	 *	%rcx: 64-bit counter, stored byte-reversed relative to the
	 *	      little-endian arithmetic done here; advanced by 32 and
	 *	      written back
	 *
	 * In the comments below a, b, c, d are the four 64-bit elements of a
	 * ymm register, lowest first.
	 *
	 * The shuffle masks are addressed RIP-relative so the code does not
	 * depend on the absolute address of the constant tables.
	 */

	vzeroupper;

	vpcmpeqd RT0, RT0, RT0;
	vpsrldq $8, RT0, RT0;		/* a: -1, b: 0, c: -1, d: 0 */

	vpcmpeqd RT1x, RT1x, RT1x;
	vpaddq RT1x, RT1x, RT1x;	/* a: -2, b: -2 */
	vpxor RIDX0, RIDX0, RIDX0;
	vinserti128 $1, RT1x, RIDX0, RIDX0; /* a: 0, b: 0, c: -2, d: -2 */

	vpaddq RIDX0, RT0, RT0;		/* a: -1, b: 0, c: -3, d: -2 */

	vpcmpeqd RT1, RT1, RT1;
	vpaddq RT1, RT1, RT1;		/* a..d: -2 */
	vpaddq RT1, RT1, RIDX2;		/* a..d: -4 */

	vbroadcasti128 .Lbswap_iv_mask(%rip), RIDX0;
	vbroadcasti128 .Lbswap128_mask(%rip), RIDX1;

	/* load the counter (vmovq zero-extends) and byte-swap it */
	vmovq (%rcx), RT1x;
	vinserti128 $1, RT1x, RT1, RT1;	/* a: ctr, b: 0, c: ctr, d: 0 */
	vpshufb RIDX0, RT1, RT1;	/* a..d: byte-swapped ctr (n) */

	/*
	 * Subtracting the negative offsets adds them: a: n+1, b: n+0,
	 * c: n+3, d: n+2. The 128-bit byte reversal that follows swaps the
	 * two qwords of each lane as well, giving n+0, n+1, n+2, n+3 in
	 * memory order and byte order.
	 */
	vpsubq RT0, RT1, RT1;
	vpshufb RIDX1, RT1, RXl0;
	/* each further step adds 4 to all four counters */
	vpsubq RIDX2, RT1, RT1;
	vpshufb RIDX1, RT1, RXr0;
	vpsubq RIDX2, RT1, RT1;
	vpshufb RIDX1, RT1, RXl1;
	vpsubq RIDX2, RT1, RT1;
	vpshufb RIDX1, RT1, RXr1;
	vpsubq RIDX2, RT1, RT1;
	vpshufb RIDX1, RT1, RXl2;
	vpsubq RIDX2, RT1, RT1;
	vpshufb RIDX1, RT1, RXr2;
	vpsubq RIDX2, RT1, RT1;
	vpshufb RIDX1, RT1, RXl3;
	vpsubq RIDX2, RT1, RT1;
	vpshufb RIDX1, RT1, RXr3;

	/*
	 * One more step leaves b = n + 32; the byte reversal moves it into
	 * the low qword, which is stored back as the next counter value.
	 */
	vpsubq RIDX2, RT1, RT1;
	vpshufb RIDX1x, RT1x, RT1x;
	vmovq RT1x, (%rcx);

	call __blowfish_enc_blk32;

	/* XOR with src; left/right roles are swapped relative to the setup */
	vpxor 0*32(%rdx), RXr0, RXr0;
	vpxor 1*32(%rdx), RXl0, RXl0;
	vpxor 2*32(%rdx), RXr1, RXr1;
	vpxor 3*32(%rdx), RXl1, RXl1;
	vpxor 4*32(%rdx), RXr2, RXr2;
	vpxor 5*32(%rdx), RXl2, RXl2;
	vpxor 6*32(%rdx), RXr3, RXr3;
	vpxor 7*32(%rdx), RXl3, RXl3;
	vmovdqu RXr0, (0*32)(%rsi);
	vmovdqu RXl0, (1*32)(%rsi);
	vmovdqu RXr1, (2*32)(%rsi);
	vmovdqu RXl1, (3*32)(%rsi);
	vmovdqu RXr2, (4*32)(%rsi);
	vmovdqu RXl2, (5*32)(%rsi);
	vmovdqu RXr3, (6*32)(%rsi);
	vmovdqu RXl3, (7*32)(%rsi);

	vzeroupper;

	ret;
ENDPROC(blowfish_ctr_32way)
450