1#ifndef _ASM_X86_XOR_64_H
2#define _ASM_X86_XOR_64_H
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37typedef struct {
38 unsigned long a, b;
39} __attribute__((aligned(16))) xmm_store_t;
40
41
42
43#define XMMS_SAVE \
44do { \
45 preempt_disable(); \
46 asm volatile( \
47 "movq %%cr0,%0 ;\n\t" \
48 "clts ;\n\t" \
49 "movups %%xmm0,(%1) ;\n\t" \
50 "movups %%xmm1,0x10(%1) ;\n\t" \
51 "movups %%xmm2,0x20(%1) ;\n\t" \
52 "movups %%xmm3,0x30(%1) ;\n\t" \
53 : "=&r" (cr0) \
54 : "r" (xmm_save) \
55 : "memory"); \
56} while (0)
57
58#define XMMS_RESTORE \
59do { \
60 asm volatile( \
61 "sfence ;\n\t" \
62 "movups (%1),%%xmm0 ;\n\t" \
63 "movups 0x10(%1),%%xmm1 ;\n\t" \
64 "movups 0x20(%1),%%xmm2 ;\n\t" \
65 "movups 0x30(%1),%%xmm3 ;\n\t" \
66 "movq %0,%%cr0 ;\n\t" \
67 : \
68 : "r" (cr0), "r" (xmm_save) \
69 : "memory"); \
70 preempt_enable(); \
71} while (0)
72
73#define OFFS(x) "16*("#x")"
74#define PF_OFFS(x) "256+16*("#x")"
75#define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n"
76#define LD(x, y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n"
77#define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n"
78#define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n"
79#define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n"
80#define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n"
81#define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n"
82#define PF5(x) " prefetchnta "PF_OFFS(x)"(%[p6]) ;\n"
83#define XO1(x, y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n"
84#define XO2(x, y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
85#define XO3(x, y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
86#define XO4(x, y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
87#define XO5(x, y) " xorps "OFFS(x)"(%[p6]), %%xmm"#y" ;\n"
88
89
90static void
91xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
92{
93 unsigned int lines = bytes >> 8;
94 unsigned long cr0;
95 xmm_store_t xmm_save[4];
96
97 XMMS_SAVE;
98
99 asm volatile(
100#undef BLOCK
101#define BLOCK(i) \
102 LD(i, 0) \
103 LD(i + 1, 1) \
104 PF1(i) \
105 PF1(i + 2) \
106 LD(i + 2, 2) \
107 LD(i + 3, 3) \
108 PF0(i + 4) \
109 PF0(i + 6) \
110 XO1(i, 0) \
111 XO1(i + 1, 1) \
112 XO1(i + 2, 2) \
113 XO1(i + 3, 3) \
114 ST(i, 0) \
115 ST(i + 1, 1) \
116 ST(i + 2, 2) \
117 ST(i + 3, 3) \
118
119
120 PF0(0)
121 PF0(2)
122
123 " .align 32 ;\n"
124 " 1: ;\n"
125
126 BLOCK(0)
127 BLOCK(4)
128 BLOCK(8)
129 BLOCK(12)
130
131 " addq %[inc], %[p1] ;\n"
132 " addq %[inc], %[p2] ;\n"
133 " decl %[cnt] ; jnz 1b"
134 : [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines)
135 : [inc] "r" (256UL)
136 : "memory");
137
138 XMMS_RESTORE;
139}
140
141static void
142xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
143 unsigned long *p3)
144{
145 unsigned int lines = bytes >> 8;
146 xmm_store_t xmm_save[4];
147 unsigned long cr0;
148
149 XMMS_SAVE;
150
151 asm volatile(
152#undef BLOCK
153#define BLOCK(i) \
154 PF1(i) \
155 PF1(i + 2) \
156 LD(i, 0) \
157 LD(i + 1, 1) \
158 LD(i + 2, 2) \
159 LD(i + 3, 3) \
160 PF2(i) \
161 PF2(i + 2) \
162 PF0(i + 4) \
163 PF0(i + 6) \
164 XO1(i, 0) \
165 XO1(i + 1, 1) \
166 XO1(i + 2, 2) \
167 XO1(i + 3, 3) \
168 XO2(i, 0) \
169 XO2(i + 1, 1) \
170 XO2(i + 2, 2) \
171 XO2(i + 3, 3) \
172 ST(i, 0) \
173 ST(i + 1, 1) \
174 ST(i + 2, 2) \
175 ST(i + 3, 3) \
176
177
178 PF0(0)
179 PF0(2)
180
181 " .align 32 ;\n"
182 " 1: ;\n"
183
184 BLOCK(0)
185 BLOCK(4)
186 BLOCK(8)
187 BLOCK(12)
188
189 " addq %[inc], %[p1] ;\n"
190 " addq %[inc], %[p2] ;\n"
191 " addq %[inc], %[p3] ;\n"
192 " decl %[cnt] ; jnz 1b"
193 : [cnt] "+r" (lines),
194 [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
195 : [inc] "r" (256UL)
196 : "memory");
197 XMMS_RESTORE;
198}
199
200static void
201xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
202 unsigned long *p3, unsigned long *p4)
203{
204 unsigned int lines = bytes >> 8;
205 xmm_store_t xmm_save[4];
206 unsigned long cr0;
207
208 XMMS_SAVE;
209
210 asm volatile(
211#undef BLOCK
212#define BLOCK(i) \
213 PF1(i) \
214 PF1(i + 2) \
215 LD(i, 0) \
216 LD(i + 1, 1) \
217 LD(i + 2, 2) \
218 LD(i + 3, 3) \
219 PF2(i) \
220 PF2(i + 2) \
221 XO1(i, 0) \
222 XO1(i + 1, 1) \
223 XO1(i + 2, 2) \
224 XO1(i + 3, 3) \
225 PF3(i) \
226 PF3(i + 2) \
227 PF0(i + 4) \
228 PF0(i + 6) \
229 XO2(i, 0) \
230 XO2(i + 1, 1) \
231 XO2(i + 2, 2) \
232 XO2(i + 3, 3) \
233 XO3(i, 0) \
234 XO3(i + 1, 1) \
235 XO3(i + 2, 2) \
236 XO3(i + 3, 3) \
237 ST(i, 0) \
238 ST(i + 1, 1) \
239 ST(i + 2, 2) \
240 ST(i + 3, 3) \
241
242
243 PF0(0)
244 PF0(2)
245
246 " .align 32 ;\n"
247 " 1: ;\n"
248
249 BLOCK(0)
250 BLOCK(4)
251 BLOCK(8)
252 BLOCK(12)
253
254 " addq %[inc], %[p1] ;\n"
255 " addq %[inc], %[p2] ;\n"
256 " addq %[inc], %[p3] ;\n"
257 " addq %[inc], %[p4] ;\n"
258 " decl %[cnt] ; jnz 1b"
259 : [cnt] "+c" (lines),
260 [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
261 : [inc] "r" (256UL)
262 : "memory" );
263
264 XMMS_RESTORE;
265}
266
267static void
268xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
269 unsigned long *p3, unsigned long *p4, unsigned long *p5)
270{
271 unsigned int lines = bytes >> 8;
272 xmm_store_t xmm_save[4];
273 unsigned long cr0;
274
275 XMMS_SAVE;
276
277 asm volatile(
278#undef BLOCK
279#define BLOCK(i) \
280 PF1(i) \
281 PF1(i + 2) \
282 LD(i, 0) \
283 LD(i + 1, 1) \
284 LD(i + 2, 2) \
285 LD(i + 3, 3) \
286 PF2(i) \
287 PF2(i + 2) \
288 XO1(i, 0) \
289 XO1(i + 1, 1) \
290 XO1(i + 2, 2) \
291 XO1(i + 3, 3) \
292 PF3(i) \
293 PF3(i + 2) \
294 XO2(i, 0) \
295 XO2(i + 1, 1) \
296 XO2(i + 2, 2) \
297 XO2(i + 3, 3) \
298 PF4(i) \
299 PF4(i + 2) \
300 PF0(i + 4) \
301 PF0(i + 6) \
302 XO3(i, 0) \
303 XO3(i + 1, 1) \
304 XO3(i + 2, 2) \
305 XO3(i + 3, 3) \
306 XO4(i, 0) \
307 XO4(i + 1, 1) \
308 XO4(i + 2, 2) \
309 XO4(i + 3, 3) \
310 ST(i, 0) \
311 ST(i + 1, 1) \
312 ST(i + 2, 2) \
313 ST(i + 3, 3) \
314
315
316 PF0(0)
317 PF0(2)
318
319 " .align 32 ;\n"
320 " 1: ;\n"
321
322 BLOCK(0)
323 BLOCK(4)
324 BLOCK(8)
325 BLOCK(12)
326
327 " addq %[inc], %[p1] ;\n"
328 " addq %[inc], %[p2] ;\n"
329 " addq %[inc], %[p3] ;\n"
330 " addq %[inc], %[p4] ;\n"
331 " addq %[inc], %[p5] ;\n"
332 " decl %[cnt] ; jnz 1b"
333 : [cnt] "+c" (lines),
334 [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4),
335 [p5] "+r" (p5)
336 : [inc] "r" (256UL)
337 : "memory");
338
339 XMMS_RESTORE;
340}
341
342static struct xor_block_template xor_block_sse = {
343 .name = "generic_sse",
344 .do_2 = xor_sse_2,
345 .do_3 = xor_sse_3,
346 .do_4 = xor_sse_4,
347 .do_5 = xor_sse_5,
348};
349
350#undef XOR_TRY_TEMPLATES
351#define XOR_TRY_TEMPLATES \
352do { \
353 xor_speed(&xor_block_sse); \
354} while (0)
355
356
357
358
359#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse)
360
361#endif
362