1#ifndef _ASM_X86_XOR_32_H
2#define _ASM_X86_XOR_32_H
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22#define LD(x, y) " movq 8*("#x")(%1), %%mm"#y" ;\n"
23#define ST(x, y) " movq %%mm"#y", 8*("#x")(%1) ;\n"
24#define XO1(x, y) " pxor 8*("#x")(%2), %%mm"#y" ;\n"
25#define XO2(x, y) " pxor 8*("#x")(%3), %%mm"#y" ;\n"
26#define XO3(x, y) " pxor 8*("#x")(%4), %%mm"#y" ;\n"
27#define XO4(x, y) " pxor 8*("#x")(%5), %%mm"#y" ;\n"
28
29#include <asm/i387.h>
30
31static void
32xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
33{
34 unsigned long lines = bytes >> 7;
35
36 kernel_fpu_begin();
37
38 asm volatile(
39#undef BLOCK
40#define BLOCK(i) \
41 LD(i, 0) \
42 LD(i + 1, 1) \
43 LD(i + 2, 2) \
44 LD(i + 3, 3) \
45 XO1(i, 0) \
46 ST(i, 0) \
47 XO1(i+1, 1) \
48 ST(i+1, 1) \
49 XO1(i + 2, 2) \
50 ST(i + 2, 2) \
51 XO1(i + 3, 3) \
52 ST(i + 3, 3)
53
54 " .align 32 ;\n"
55 " 1: ;\n"
56
57 BLOCK(0)
58 BLOCK(4)
59 BLOCK(8)
60 BLOCK(12)
61
62 " addl $128, %1 ;\n"
63 " addl $128, %2 ;\n"
64 " decl %0 ;\n"
65 " jnz 1b ;\n"
66 : "+r" (lines),
67 "+r" (p1), "+r" (p2)
68 :
69 : "memory");
70
71 kernel_fpu_end();
72}
73
74static void
75xor_pII_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
76 unsigned long *p3)
77{
78 unsigned long lines = bytes >> 7;
79
80 kernel_fpu_begin();
81
82 asm volatile(
83#undef BLOCK
84#define BLOCK(i) \
85 LD(i, 0) \
86 LD(i + 1, 1) \
87 LD(i + 2, 2) \
88 LD(i + 3, 3) \
89 XO1(i, 0) \
90 XO1(i + 1, 1) \
91 XO1(i + 2, 2) \
92 XO1(i + 3, 3) \
93 XO2(i, 0) \
94 ST(i, 0) \
95 XO2(i + 1, 1) \
96 ST(i + 1, 1) \
97 XO2(i + 2, 2) \
98 ST(i + 2, 2) \
99 XO2(i + 3, 3) \
100 ST(i + 3, 3)
101
102 " .align 32 ;\n"
103 " 1: ;\n"
104
105 BLOCK(0)
106 BLOCK(4)
107 BLOCK(8)
108 BLOCK(12)
109
110 " addl $128, %1 ;\n"
111 " addl $128, %2 ;\n"
112 " addl $128, %3 ;\n"
113 " decl %0 ;\n"
114 " jnz 1b ;\n"
115 : "+r" (lines),
116 "+r" (p1), "+r" (p2), "+r" (p3)
117 :
118 : "memory");
119
120 kernel_fpu_end();
121}
122
123static void
124xor_pII_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
125 unsigned long *p3, unsigned long *p4)
126{
127 unsigned long lines = bytes >> 7;
128
129 kernel_fpu_begin();
130
131 asm volatile(
132#undef BLOCK
133#define BLOCK(i) \
134 LD(i, 0) \
135 LD(i + 1, 1) \
136 LD(i + 2, 2) \
137 LD(i + 3, 3) \
138 XO1(i, 0) \
139 XO1(i + 1, 1) \
140 XO1(i + 2, 2) \
141 XO1(i + 3, 3) \
142 XO2(i, 0) \
143 XO2(i + 1, 1) \
144 XO2(i + 2, 2) \
145 XO2(i + 3, 3) \
146 XO3(i, 0) \
147 ST(i, 0) \
148 XO3(i + 1, 1) \
149 ST(i + 1, 1) \
150 XO3(i + 2, 2) \
151 ST(i + 2, 2) \
152 XO3(i + 3, 3) \
153 ST(i + 3, 3)
154
155 " .align 32 ;\n"
156 " 1: ;\n"
157
158 BLOCK(0)
159 BLOCK(4)
160 BLOCK(8)
161 BLOCK(12)
162
163 " addl $128, %1 ;\n"
164 " addl $128, %2 ;\n"
165 " addl $128, %3 ;\n"
166 " addl $128, %4 ;\n"
167 " decl %0 ;\n"
168 " jnz 1b ;\n"
169 : "+r" (lines),
170 "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
171 :
172 : "memory");
173
174 kernel_fpu_end();
175}
176
177
178static void
179xor_pII_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
180 unsigned long *p3, unsigned long *p4, unsigned long *p5)
181{
182 unsigned long lines = bytes >> 7;
183
184 kernel_fpu_begin();
185
186
187
188
189
190
191
192 asm("" : "+r" (p4), "+r" (p5));
193
194 asm volatile(
195#undef BLOCK
196#define BLOCK(i) \
197 LD(i, 0) \
198 LD(i + 1, 1) \
199 LD(i + 2, 2) \
200 LD(i + 3, 3) \
201 XO1(i, 0) \
202 XO1(i + 1, 1) \
203 XO1(i + 2, 2) \
204 XO1(i + 3, 3) \
205 XO2(i, 0) \
206 XO2(i + 1, 1) \
207 XO2(i + 2, 2) \
208 XO2(i + 3, 3) \
209 XO3(i, 0) \
210 XO3(i + 1, 1) \
211 XO3(i + 2, 2) \
212 XO3(i + 3, 3) \
213 XO4(i, 0) \
214 ST(i, 0) \
215 XO4(i + 1, 1) \
216 ST(i + 1, 1) \
217 XO4(i + 2, 2) \
218 ST(i + 2, 2) \
219 XO4(i + 3, 3) \
220 ST(i + 3, 3)
221
222 " .align 32 ;\n"
223 " 1: ;\n"
224
225 BLOCK(0)
226 BLOCK(4)
227 BLOCK(8)
228 BLOCK(12)
229
230 " addl $128, %1 ;\n"
231 " addl $128, %2 ;\n"
232 " addl $128, %3 ;\n"
233 " addl $128, %4 ;\n"
234 " addl $128, %5 ;\n"
235 " decl %0 ;\n"
236 " jnz 1b ;\n"
237 : "+r" (lines),
238 "+r" (p1), "+r" (p2), "+r" (p3)
239 : "r" (p4), "r" (p5)
240 : "memory");
241
242
243
244
245 asm("" : "=r" (p4), "=r" (p5));
246
247 kernel_fpu_end();
248}
249
250#undef LD
251#undef XO1
252#undef XO2
253#undef XO3
254#undef XO4
255#undef ST
256#undef BLOCK
257
258static void
259xor_p5_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
260{
261 unsigned long lines = bytes >> 6;
262
263 kernel_fpu_begin();
264
265 asm volatile(
266 " .align 32 ;\n"
267 " 1: ;\n"
268 " movq (%1), %%mm0 ;\n"
269 " movq 8(%1), %%mm1 ;\n"
270 " pxor (%2), %%mm0 ;\n"
271 " movq 16(%1), %%mm2 ;\n"
272 " movq %%mm0, (%1) ;\n"
273 " pxor 8(%2), %%mm1 ;\n"
274 " movq 24(%1), %%mm3 ;\n"
275 " movq %%mm1, 8(%1) ;\n"
276 " pxor 16(%2), %%mm2 ;\n"
277 " movq 32(%1), %%mm4 ;\n"
278 " movq %%mm2, 16(%1) ;\n"
279 " pxor 24(%2), %%mm3 ;\n"
280 " movq 40(%1), %%mm5 ;\n"
281 " movq %%mm3, 24(%1) ;\n"
282 " pxor 32(%2), %%mm4 ;\n"
283 " movq 48(%1), %%mm6 ;\n"
284 " movq %%mm4, 32(%1) ;\n"
285 " pxor 40(%2), %%mm5 ;\n"
286 " movq 56(%1), %%mm7 ;\n"
287 " movq %%mm5, 40(%1) ;\n"
288 " pxor 48(%2), %%mm6 ;\n"
289 " pxor 56(%2), %%mm7 ;\n"
290 " movq %%mm6, 48(%1) ;\n"
291 " movq %%mm7, 56(%1) ;\n"
292
293 " addl $64, %1 ;\n"
294 " addl $64, %2 ;\n"
295 " decl %0 ;\n"
296 " jnz 1b ;\n"
297 : "+r" (lines),
298 "+r" (p1), "+r" (p2)
299 :
300 : "memory");
301
302 kernel_fpu_end();
303}
304
305static void
306xor_p5_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
307 unsigned long *p3)
308{
309 unsigned long lines = bytes >> 6;
310
311 kernel_fpu_begin();
312
313 asm volatile(
314 " .align 32,0x90 ;\n"
315 " 1: ;\n"
316 " movq (%1), %%mm0 ;\n"
317 " movq 8(%1), %%mm1 ;\n"
318 " pxor (%2), %%mm0 ;\n"
319 " movq 16(%1), %%mm2 ;\n"
320 " pxor 8(%2), %%mm1 ;\n"
321 " pxor (%3), %%mm0 ;\n"
322 " pxor 16(%2), %%mm2 ;\n"
323 " movq %%mm0, (%1) ;\n"
324 " pxor 8(%3), %%mm1 ;\n"
325 " pxor 16(%3), %%mm2 ;\n"
326 " movq 24(%1), %%mm3 ;\n"
327 " movq %%mm1, 8(%1) ;\n"
328 " movq 32(%1), %%mm4 ;\n"
329 " movq 40(%1), %%mm5 ;\n"
330 " pxor 24(%2), %%mm3 ;\n"
331 " movq %%mm2, 16(%1) ;\n"
332 " pxor 32(%2), %%mm4 ;\n"
333 " pxor 24(%3), %%mm3 ;\n"
334 " pxor 40(%2), %%mm5 ;\n"
335 " movq %%mm3, 24(%1) ;\n"
336 " pxor 32(%3), %%mm4 ;\n"
337 " pxor 40(%3), %%mm5 ;\n"
338 " movq 48(%1), %%mm6 ;\n"
339 " movq %%mm4, 32(%1) ;\n"
340 " movq 56(%1), %%mm7 ;\n"
341 " pxor 48(%2), %%mm6 ;\n"
342 " movq %%mm5, 40(%1) ;\n"
343 " pxor 56(%2), %%mm7 ;\n"
344 " pxor 48(%3), %%mm6 ;\n"
345 " pxor 56(%3), %%mm7 ;\n"
346 " movq %%mm6, 48(%1) ;\n"
347 " movq %%mm7, 56(%1) ;\n"
348
349 " addl $64, %1 ;\n"
350 " addl $64, %2 ;\n"
351 " addl $64, %3 ;\n"
352 " decl %0 ;\n"
353 " jnz 1b ;\n"
354 : "+r" (lines),
355 "+r" (p1), "+r" (p2), "+r" (p3)
356 :
357 : "memory" );
358
359 kernel_fpu_end();
360}
361
362static void
363xor_p5_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
364 unsigned long *p3, unsigned long *p4)
365{
366 unsigned long lines = bytes >> 6;
367
368 kernel_fpu_begin();
369
370 asm volatile(
371 " .align 32,0x90 ;\n"
372 " 1: ;\n"
373 " movq (%1), %%mm0 ;\n"
374 " movq 8(%1), %%mm1 ;\n"
375 " pxor (%2), %%mm0 ;\n"
376 " movq 16(%1), %%mm2 ;\n"
377 " pxor 8(%2), %%mm1 ;\n"
378 " pxor (%3), %%mm0 ;\n"
379 " pxor 16(%2), %%mm2 ;\n"
380 " pxor 8(%3), %%mm1 ;\n"
381 " pxor (%4), %%mm0 ;\n"
382 " movq 24(%1), %%mm3 ;\n"
383 " pxor 16(%3), %%mm2 ;\n"
384 " pxor 8(%4), %%mm1 ;\n"
385 " movq %%mm0, (%1) ;\n"
386 " movq 32(%1), %%mm4 ;\n"
387 " pxor 24(%2), %%mm3 ;\n"
388 " pxor 16(%4), %%mm2 ;\n"
389 " movq %%mm1, 8(%1) ;\n"
390 " movq 40(%1), %%mm5 ;\n"
391 " pxor 32(%2), %%mm4 ;\n"
392 " pxor 24(%3), %%mm3 ;\n"
393 " movq %%mm2, 16(%1) ;\n"
394 " pxor 40(%2), %%mm5 ;\n"
395 " pxor 32(%3), %%mm4 ;\n"
396 " pxor 24(%4), %%mm3 ;\n"
397 " movq %%mm3, 24(%1) ;\n"
398 " movq 56(%1), %%mm7 ;\n"
399 " movq 48(%1), %%mm6 ;\n"
400 " pxor 40(%3), %%mm5 ;\n"
401 " pxor 32(%4), %%mm4 ;\n"
402 " pxor 48(%2), %%mm6 ;\n"
403 " movq %%mm4, 32(%1) ;\n"
404 " pxor 56(%2), %%mm7 ;\n"
405 " pxor 40(%4), %%mm5 ;\n"
406 " pxor 48(%3), %%mm6 ;\n"
407 " pxor 56(%3), %%mm7 ;\n"
408 " movq %%mm5, 40(%1) ;\n"
409 " pxor 48(%4), %%mm6 ;\n"
410 " pxor 56(%4), %%mm7 ;\n"
411 " movq %%mm6, 48(%1) ;\n"
412 " movq %%mm7, 56(%1) ;\n"
413
414 " addl $64, %1 ;\n"
415 " addl $64, %2 ;\n"
416 " addl $64, %3 ;\n"
417 " addl $64, %4 ;\n"
418 " decl %0 ;\n"
419 " jnz 1b ;\n"
420 : "+r" (lines),
421 "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
422 :
423 : "memory");
424
425 kernel_fpu_end();
426}
427
428static void
429xor_p5_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
430 unsigned long *p3, unsigned long *p4, unsigned long *p5)
431{
432 unsigned long lines = bytes >> 6;
433
434 kernel_fpu_begin();
435
436
437
438
439
440
441
442 asm("" : "+r" (p4), "+r" (p5));
443
444 asm volatile(
445 " .align 32,0x90 ;\n"
446 " 1: ;\n"
447 " movq (%1), %%mm0 ;\n"
448 " movq 8(%1), %%mm1 ;\n"
449 " pxor (%2), %%mm0 ;\n"
450 " pxor 8(%2), %%mm1 ;\n"
451 " movq 16(%1), %%mm2 ;\n"
452 " pxor (%3), %%mm0 ;\n"
453 " pxor 8(%3), %%mm1 ;\n"
454 " pxor 16(%2), %%mm2 ;\n"
455 " pxor (%4), %%mm0 ;\n"
456 " pxor 8(%4), %%mm1 ;\n"
457 " pxor 16(%3), %%mm2 ;\n"
458 " movq 24(%1), %%mm3 ;\n"
459 " pxor (%5), %%mm0 ;\n"
460 " pxor 8(%5), %%mm1 ;\n"
461 " movq %%mm0, (%1) ;\n"
462 " pxor 16(%4), %%mm2 ;\n"
463 " pxor 24(%2), %%mm3 ;\n"
464 " movq %%mm1, 8(%1) ;\n"
465 " pxor 16(%5), %%mm2 ;\n"
466 " pxor 24(%3), %%mm3 ;\n"
467 " movq 32(%1), %%mm4 ;\n"
468 " movq %%mm2, 16(%1) ;\n"
469 " pxor 24(%4), %%mm3 ;\n"
470 " pxor 32(%2), %%mm4 ;\n"
471 " movq 40(%1), %%mm5 ;\n"
472 " pxor 24(%5), %%mm3 ;\n"
473 " pxor 32(%3), %%mm4 ;\n"
474 " pxor 40(%2), %%mm5 ;\n"
475 " movq %%mm3, 24(%1) ;\n"
476 " pxor 32(%4), %%mm4 ;\n"
477 " pxor 40(%3), %%mm5 ;\n"
478 " movq 48(%1), %%mm6 ;\n"
479 " movq 56(%1), %%mm7 ;\n"
480 " pxor 32(%5), %%mm4 ;\n"
481 " pxor 40(%4), %%mm5 ;\n"
482 " pxor 48(%2), %%mm6 ;\n"
483 " pxor 56(%2), %%mm7 ;\n"
484 " movq %%mm4, 32(%1) ;\n"
485 " pxor 48(%3), %%mm6 ;\n"
486 " pxor 56(%3), %%mm7 ;\n"
487 " pxor 40(%5), %%mm5 ;\n"
488 " pxor 48(%4), %%mm6 ;\n"
489 " pxor 56(%4), %%mm7 ;\n"
490 " movq %%mm5, 40(%1) ;\n"
491 " pxor 48(%5), %%mm6 ;\n"
492 " pxor 56(%5), %%mm7 ;\n"
493 " movq %%mm6, 48(%1) ;\n"
494 " movq %%mm7, 56(%1) ;\n"
495
496 " addl $64, %1 ;\n"
497 " addl $64, %2 ;\n"
498 " addl $64, %3 ;\n"
499 " addl $64, %4 ;\n"
500 " addl $64, %5 ;\n"
501 " decl %0 ;\n"
502 " jnz 1b ;\n"
503 : "+r" (lines),
504 "+r" (p1), "+r" (p2), "+r" (p3)
505 : "r" (p4), "r" (p5)
506 : "memory");
507
508
509
510
511 asm("" : "=r" (p4), "=r" (p5));
512
513 kernel_fpu_end();
514}
515
516static struct xor_block_template xor_block_pII_mmx = {
517 .name = "pII_mmx",
518 .do_2 = xor_pII_mmx_2,
519 .do_3 = xor_pII_mmx_3,
520 .do_4 = xor_pII_mmx_4,
521 .do_5 = xor_pII_mmx_5,
522};
523
524static struct xor_block_template xor_block_p5_mmx = {
525 .name = "p5_mmx",
526 .do_2 = xor_p5_mmx_2,
527 .do_3 = xor_p5_mmx_3,
528 .do_4 = xor_p5_mmx_4,
529 .do_5 = xor_p5_mmx_5,
530};
531
532
533
534
535
536
537#define OFFS(x) "16*("#x")"
538#define PF_OFFS(x) "256+16*("#x")"
539#define PF0(x) " prefetchnta "PF_OFFS(x)"(%1) ;\n"
540#define LD(x, y) " movaps "OFFS(x)"(%1), %%xmm"#y" ;\n"
541#define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%1) ;\n"
542#define PF1(x) " prefetchnta "PF_OFFS(x)"(%2) ;\n"
543#define PF2(x) " prefetchnta "PF_OFFS(x)"(%3) ;\n"
544#define PF3(x) " prefetchnta "PF_OFFS(x)"(%4) ;\n"
545#define PF4(x) " prefetchnta "PF_OFFS(x)"(%5) ;\n"
546#define PF5(x) " prefetchnta "PF_OFFS(x)"(%6) ;\n"
547#define XO1(x, y) " xorps "OFFS(x)"(%2), %%xmm"#y" ;\n"
548#define XO2(x, y) " xorps "OFFS(x)"(%3), %%xmm"#y" ;\n"
549#define XO3(x, y) " xorps "OFFS(x)"(%4), %%xmm"#y" ;\n"
550#define XO4(x, y) " xorps "OFFS(x)"(%5), %%xmm"#y" ;\n"
551#define XO5(x, y) " xorps "OFFS(x)"(%6), %%xmm"#y" ;\n"
552
553
554static void
555xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
556{
557 unsigned long lines = bytes >> 8;
558
559 kernel_fpu_begin();
560
561 asm volatile(
562#undef BLOCK
563#define BLOCK(i) \
564 LD(i, 0) \
565 LD(i + 1, 1) \
566 PF1(i) \
567 PF1(i + 2) \
568 LD(i + 2, 2) \
569 LD(i + 3, 3) \
570 PF0(i + 4) \
571 PF0(i + 6) \
572 XO1(i, 0) \
573 XO1(i + 1, 1) \
574 XO1(i + 2, 2) \
575 XO1(i + 3, 3) \
576 ST(i, 0) \
577 ST(i + 1, 1) \
578 ST(i + 2, 2) \
579 ST(i + 3, 3) \
580
581
582 PF0(0)
583 PF0(2)
584
585 " .align 32 ;\n"
586 " 1: ;\n"
587
588 BLOCK(0)
589 BLOCK(4)
590 BLOCK(8)
591 BLOCK(12)
592
593 " addl $256, %1 ;\n"
594 " addl $256, %2 ;\n"
595 " decl %0 ;\n"
596 " jnz 1b ;\n"
597 : "+r" (lines),
598 "+r" (p1), "+r" (p2)
599 :
600 : "memory");
601
602 kernel_fpu_end();
603}
604
605static void
606xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
607 unsigned long *p3)
608{
609 unsigned long lines = bytes >> 8;
610
611 kernel_fpu_begin();
612
613 asm volatile(
614#undef BLOCK
615#define BLOCK(i) \
616 PF1(i) \
617 PF1(i + 2) \
618 LD(i,0) \
619 LD(i + 1, 1) \
620 LD(i + 2, 2) \
621 LD(i + 3, 3) \
622 PF2(i) \
623 PF2(i + 2) \
624 PF0(i + 4) \
625 PF0(i + 6) \
626 XO1(i,0) \
627 XO1(i + 1, 1) \
628 XO1(i + 2, 2) \
629 XO1(i + 3, 3) \
630 XO2(i,0) \
631 XO2(i + 1, 1) \
632 XO2(i + 2, 2) \
633 XO2(i + 3, 3) \
634 ST(i,0) \
635 ST(i + 1, 1) \
636 ST(i + 2, 2) \
637 ST(i + 3, 3) \
638
639
640 PF0(0)
641 PF0(2)
642
643 " .align 32 ;\n"
644 " 1: ;\n"
645
646 BLOCK(0)
647 BLOCK(4)
648 BLOCK(8)
649 BLOCK(12)
650
651 " addl $256, %1 ;\n"
652 " addl $256, %2 ;\n"
653 " addl $256, %3 ;\n"
654 " decl %0 ;\n"
655 " jnz 1b ;\n"
656 : "+r" (lines),
657 "+r" (p1), "+r"(p2), "+r"(p3)
658 :
659 : "memory" );
660
661 kernel_fpu_end();
662}
663
664static void
665xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
666 unsigned long *p3, unsigned long *p4)
667{
668 unsigned long lines = bytes >> 8;
669
670 kernel_fpu_begin();
671
672 asm volatile(
673#undef BLOCK
674#define BLOCK(i) \
675 PF1(i) \
676 PF1(i + 2) \
677 LD(i,0) \
678 LD(i + 1, 1) \
679 LD(i + 2, 2) \
680 LD(i + 3, 3) \
681 PF2(i) \
682 PF2(i + 2) \
683 XO1(i,0) \
684 XO1(i + 1, 1) \
685 XO1(i + 2, 2) \
686 XO1(i + 3, 3) \
687 PF3(i) \
688 PF3(i + 2) \
689 PF0(i + 4) \
690 PF0(i + 6) \
691 XO2(i,0) \
692 XO2(i + 1, 1) \
693 XO2(i + 2, 2) \
694 XO2(i + 3, 3) \
695 XO3(i,0) \
696 XO3(i + 1, 1) \
697 XO3(i + 2, 2) \
698 XO3(i + 3, 3) \
699 ST(i,0) \
700 ST(i + 1, 1) \
701 ST(i + 2, 2) \
702 ST(i + 3, 3) \
703
704
705 PF0(0)
706 PF0(2)
707
708 " .align 32 ;\n"
709 " 1: ;\n"
710
711 BLOCK(0)
712 BLOCK(4)
713 BLOCK(8)
714 BLOCK(12)
715
716 " addl $256, %1 ;\n"
717 " addl $256, %2 ;\n"
718 " addl $256, %3 ;\n"
719 " addl $256, %4 ;\n"
720 " decl %0 ;\n"
721 " jnz 1b ;\n"
722 : "+r" (lines),
723 "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
724 :
725 : "memory" );
726
727 kernel_fpu_end();
728}
729
730static void
731xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
732 unsigned long *p3, unsigned long *p4, unsigned long *p5)
733{
734 unsigned long lines = bytes >> 8;
735
736 kernel_fpu_begin();
737
738
739
740
741
742
743
744 asm("" : "+r" (p4), "+r" (p5));
745
746 asm volatile(
747#undef BLOCK
748#define BLOCK(i) \
749 PF1(i) \
750 PF1(i + 2) \
751 LD(i,0) \
752 LD(i + 1, 1) \
753 LD(i + 2, 2) \
754 LD(i + 3, 3) \
755 PF2(i) \
756 PF2(i + 2) \
757 XO1(i,0) \
758 XO1(i + 1, 1) \
759 XO1(i + 2, 2) \
760 XO1(i + 3, 3) \
761 PF3(i) \
762 PF3(i + 2) \
763 XO2(i,0) \
764 XO2(i + 1, 1) \
765 XO2(i + 2, 2) \
766 XO2(i + 3, 3) \
767 PF4(i) \
768 PF4(i + 2) \
769 PF0(i + 4) \
770 PF0(i + 6) \
771 XO3(i,0) \
772 XO3(i + 1, 1) \
773 XO3(i + 2, 2) \
774 XO3(i + 3, 3) \
775 XO4(i,0) \
776 XO4(i + 1, 1) \
777 XO4(i + 2, 2) \
778 XO4(i + 3, 3) \
779 ST(i,0) \
780 ST(i + 1, 1) \
781 ST(i + 2, 2) \
782 ST(i + 3, 3) \
783
784
785 PF0(0)
786 PF0(2)
787
788 " .align 32 ;\n"
789 " 1: ;\n"
790
791 BLOCK(0)
792 BLOCK(4)
793 BLOCK(8)
794 BLOCK(12)
795
796 " addl $256, %1 ;\n"
797 " addl $256, %2 ;\n"
798 " addl $256, %3 ;\n"
799 " addl $256, %4 ;\n"
800 " addl $256, %5 ;\n"
801 " decl %0 ;\n"
802 " jnz 1b ;\n"
803 : "+r" (lines),
804 "+r" (p1), "+r" (p2), "+r" (p3)
805 : "r" (p4), "r" (p5)
806 : "memory");
807
808
809
810
811 asm("" : "=r" (p4), "=r" (p5));
812
813 kernel_fpu_end();
814}
815
816static struct xor_block_template xor_block_pIII_sse = {
817 .name = "pIII_sse",
818 .do_2 = xor_sse_2,
819 .do_3 = xor_sse_3,
820 .do_4 = xor_sse_4,
821 .do_5 = xor_sse_5,
822};
823
824
825#include <asm/xor_avx.h>
826
827
828#include <asm-generic/xor.h>
829
830#undef XOR_TRY_TEMPLATES
831#define XOR_TRY_TEMPLATES \
832do { \
833 xor_speed(&xor_block_8regs); \
834 xor_speed(&xor_block_8regs_p); \
835 xor_speed(&xor_block_32regs); \
836 xor_speed(&xor_block_32regs_p); \
837 AVX_XOR_SPEED; \
838 if (cpu_has_xmm) \
839 xor_speed(&xor_block_pIII_sse); \
840 if (cpu_has_mmx) { \
841 xor_speed(&xor_block_pII_mmx); \
842 xor_speed(&xor_block_p5_mmx); \
843 } \
844} while (0)
845
846
847
848
849#define XOR_SELECT_TEMPLATE(FASTEST) \
850 AVX_SELECT(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)
851
852#endif
853