/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_XOR_AVX_H
#define _ASM_X86_XOR_AVX_H

/*
 * Optimized RAID-5 checksumming functions for AVX
 *
 * Copyright (C) 2012 Intel Corporation
 * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
 *
 * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines
 */

#include <linux/compiler.h>
#include <asm/fpu/api.h>

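/*
 * BLOCK(i, reg) is (re)defined inside each function below to XOR one
 * 32-byte ymm-sized word at byte offset i using register ymm<reg>.
 * BLOCK16() expands to 16 such blocks, so each loop iteration handles
 * one 512-byte line while cycling through ymm0-ymm3.
 */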
#define BLOCK4(i) \
	BLOCK(32 * i, 0) \
	BLOCK(32 * (i + 1), 1) \
	BLOCK(32 * (i + 2), 2) \
	BLOCK(32 * (i + 3), 3)

#define BLOCK16() \
	BLOCK4(0) \
	BLOCK4(4) \
	BLOCK4(8) \
	BLOCK4(12)

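/*
 * xor_avx_2 - XOR the source buffer p1 into the destination p0.
 * 'bytes' is expected to be a multiple of 512; bytes >> 9 silently
 * drops any smaller tail. The whole loop runs inside a
 * kernel_fpu_begin()/kernel_fpu_end() section so the ymm registers
 * can safely be clobbered in kernel context.
 */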
static void xor_avx_2(unsigned long bytes, unsigned long *p0, unsigned long *p1)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
	}

	kernel_fpu_end();
}

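/* As above, but XORs two source buffers (p1, p2) into p0. */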
static void xor_avx_3(unsigned long bytes, unsigned long *p0, unsigned long *p1,
	unsigned long *p2)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
	}

	kernel_fpu_end();
}

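/* XOR three source buffers (p1, p2, p3) into p0. */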
static void xor_avx_4(unsigned long bytes, unsigned long *p0, unsigned long *p1,
	unsigned long *p2, unsigned long *p3)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
		p3 = (unsigned long *)((uintptr_t)p3 + 512);
	}

	kernel_fpu_end();
}

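/* XOR four source buffers (p1..p4) into p0. */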
static void xor_avx_5(unsigned long bytes, unsigned long *p0, unsigned long *p1,
	unsigned long *p2, unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p3[i / sizeof(*p3)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
		p3 = (unsigned long *)((uintptr_t)p3 + 512);
		p4 = (unsigned long *)((uintptr_t)p4 + 512);
	}

	kernel_fpu_end();
}

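/*
 * Block template handed to the generic xor framework; xor_speed()
 * benchmarks it against the other candidate implementations so the
 * fastest one can be selected at boot.
 */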
static struct xor_block_template xor_block_avx = {
	.name = "avx",
	.do_2 = xor_avx_2,
	.do_3 = xor_avx_3,
	.do_4 = xor_avx_4,
	.do_5 = xor_avx_5,
};

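/*
 * Use the AVX routines only when the CPU advertises AVX and the OS has
 * enabled extended state management (OSXSAVE), i.e. the ymm register
 * state will actually be saved and restored across context switches.
 */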
#define AVX_XOR_SPEED \
do { \
	if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE)) \
		xor_speed(&xor_block_avx); \
} while (0)

#define AVX_SELECT(FASTEST) \
	(boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE) ? &xor_block_avx : FASTEST)

#endif