1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18#include <stdio.h>
19#include <stdint.h>
20#include <stdbool.h>
21#include <string.h>
22
/* Running count of detected mismatches; nonzero means the test FAILs */
int err;

/*
 * Report a mismatch between a computed result and the expected value.
 * line is the source line of the caller, i/j identify the vector and
 * element being compared.  Increments the global err on mismatch.
 */
static void __check(int line, int i, int j, uint64_t result, uint64_t expect)
{
    if (result != expect) {
        /* Cast to unsigned long long so "%llx" matches on LP64 hosts */
        printf("ERROR at line %d: [%d][%d] 0x%016llx != 0x%016llx\n",
               line, i, j, (unsigned long long)result,
               (unsigned long long)expect);
        err++;
    }
}

/*
 * Scalar convenience wrapper: no vector/element indices, so pass 0/0.
 * (The previous definition forwarded only three arguments to the
 * five-parameter __check and could never have compiled when used.)
 */
#define check(RES, EXP) __check(__LINE__, 0, 0, (RES), (EXP))
35
/* Size of one HVX vector register, in bytes (128-byte vectors) */
#define MAX_VEC_SIZE_BYTES 128

/*
 * One vector register's worth of data, viewable as signed or unsigned
 * elements of every supported width (byte through doubleword).
 */
typedef union {
    uint64_t ud[MAX_VEC_SIZE_BYTES / 8];  /* unsigned doublewords */
    int64_t d[MAX_VEC_SIZE_BYTES / 8];    /* signed doublewords */
    uint32_t uw[MAX_VEC_SIZE_BYTES / 4];  /* unsigned words */
    int32_t w[MAX_VEC_SIZE_BYTES / 4];    /* signed words */
    uint16_t uh[MAX_VEC_SIZE_BYTES / 2];  /* unsigned halfwords */
    int16_t h[MAX_VEC_SIZE_BYTES / 2];    /* signed halfwords */
    uint8_t ub[MAX_VEC_SIZE_BYTES / 1];   /* unsigned bytes */
    int8_t b[MAX_VEC_SIZE_BYTES / 1];     /* signed bytes */
} MMVector;
48
#define BUFSIZE 16   /* number of input vectors per buffer */
#define OUTSIZE 16   /* number of vectors in output[]/expect[] */
#define MASKMOD 3    /* period of the zero lanes in mask[] */

/*
 * Shared test data, all aligned to the vector size so plain vmem
 * accesses are legal: two input buffers, a 0/1 word mask for the
 * masked-store test, the observed output, and the expected output.
 */
MMVector buffer0[BUFSIZE] __attribute__((aligned(MAX_VEC_SIZE_BYTES)));
MMVector buffer1[BUFSIZE] __attribute__((aligned(MAX_VEC_SIZE_BYTES)));
MMVector mask[BUFSIZE] __attribute__((aligned(MAX_VEC_SIZE_BYTES)));
MMVector output[OUTSIZE] __attribute__((aligned(MAX_VEC_SIZE_BYTES)));
MMVector expect[OUTSIZE] __attribute__((aligned(MAX_VEC_SIZE_BYTES)));
58
59#define CHECK_OUTPUT_FUNC(FIELD, FIELDSZ) \
60static void check_output_##FIELD(int line, size_t num_vectors) \
61{ \
62 for (int i = 0; i < num_vectors; i++) { \
63 for (int j = 0; j < MAX_VEC_SIZE_BYTES / FIELDSZ; j++) { \
64 __check(line, i, j, output[i].FIELD[j], expect[i].FIELD[j]); \
65 } \
66 } \
67}
68
69CHECK_OUTPUT_FUNC(d, 8)
70CHECK_OUTPUT_FUNC(w, 4)
71CHECK_OUTPUT_FUNC(h, 2)
72CHECK_OUTPUT_FUNC(b, 1)
73
74static void init_buffers(void)
75{
76 int counter0 = 0;
77 int counter1 = 17;
78 for (int i = 0; i < BUFSIZE; i++) {
79 for (int j = 0; j < MAX_VEC_SIZE_BYTES; j++) {
80 buffer0[i].b[j] = counter0++;
81 buffer1[i].b[j] = counter1++;
82 }
83 for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
84 mask[i].w[j] = (i + j % MASKMOD == 0) ? 0 : 1;
85 }
86 }
87}
88
/*
 * Exercise the ".tmp" vector load form: the loaded value is visible
 * only to instructions in the same packet; afterwards the destination
 * register keeps its pre-packet contents.
 */
static void test_load_tmp(void)
{
    void *p0 = buffer0;
    void *p1 = buffer1;
    void *pout = output;

    for (int i = 0; i < BUFSIZE; i++) {
        /*
         * v12 is splatted with 1, then loaded as .tmp inside the
         * packet: the in-packet vadd sees the buffer1 data, while the
         * vadd after the packet sees the original splat of 1.
         * NOTE(review): "v6" appears in the clobber list but is not
         * used by the asm -- harmless, but possibly a leftover.
         */
        asm("v3 = vmem(%0 + #0)\n\t"
            "r1 = #1\n\t"
            "v12 = vsplat(r1)\n\t"
            "{\n\t"
            "    v12.tmp = vmem(%1 + #0)\n\t"
            "    v4.w = vadd(v12.w, v3.w)\n\t"
            "}\n\t"
            "v4.w = vadd(v4.w, v12.w)\n\t"
            "vmem(%2 + #0) = v4\n\t"
            : : "r"(p0), "r"(p1), "r"(pout)
            : "r1", "v12", "v3", "v4", "v6", "memory");
        p0 += sizeof(MMVector);
        p1 += sizeof(MMVector);
        pout += sizeof(MMVector);

        /* buffer0 + buffer1 (from the .tmp load) + 1 (from the splat) */
        for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
            expect[i].w[j] = buffer0[i].w[j] + buffer1[i].w[j] + 1;
        }
    }

    check_output_w(__LINE__, BUFSIZE);
}
123
/*
 * Exercise the ".cur" vector load form: the loaded value is available
 * to the store in the same packet, so each iteration copies one vector
 * of buffer0 straight to output.
 */
static void test_load_cur(void)
{
    void *p0 = buffer0;
    void *pout = output;

    for (int i = 0; i < BUFSIZE; i++) {
        asm("{\n\t"
            "    v2.cur = vmem(%0 + #0)\n\t"
            "    vmem(%1 + #0) = v2\n\t"
            "}\n\t"
            : : "r"(p0), "r"(pout) : "v2", "memory");
        p0 += sizeof(MMVector);
        pout += sizeof(MMVector);

        /* Expect an unmodified copy of the input words */
        for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
            expect[i].uw[j] = buffer0[i].uw[j];
        }
    }

    check_output_w(__LINE__, BUFSIZE);
}
145
/*
 * Aligned vector loads ignore the low bits of the address: loading via
 * vmem from buffer0 + 13 still fetches the full, aligned buffer0[0].
 */
static void test_load_aligned(void)
{
    void *p0 = buffer0;
    void *pout = output;
    const size_t offset = 13;   /* deliberately not vector-aligned */

    p0 += offset;
    asm("v2 = vmem(%0 + #0)\n\t"
        "vmem(%1 + #0) = v2\n\t"
        : : "r"(p0), "r"(pout) : "v2", "memory");

    /* The offset is dropped, so the whole first vector comes back */
    expect[0] = buffer0[0];

    check_output_w(__LINE__, 1);
}
162
/*
 * Unaligned vector load (vmemu): reads MAX_VEC_SIZE_BYTES starting at
 * an arbitrary byte address, here buffer0 + 12, spanning the boundary
 * between buffer0[0] and buffer0[1].
 */
static void test_load_unaligned(void)
{
    void *p0 = buffer0;
    void *pout = output;
    const size_t offset = 12;

    p0 += offset;
    asm("v2 = vmemu(%0 + #0)\n\t"
        "vmem(%1 + #0) = v2\n\t"
        : : "r"(p0), "r"(pout) : "v2", "memory");

    /* Model the unaligned load as a byte-wise copy from the offset */
    memcpy(expect, &buffer0[0].ub[offset], sizeof(MMVector));

    check_output_w(__LINE__, 1);
}
178
/*
 * Aligned vector stores ignore the low bits of the address: storing
 * via vmem to output + 13 still writes the full, aligned output[0].
 */
static void test_store_aligned(void)
{
    void *p0 = buffer0;
    void *pout = output;
    const size_t offset = 13;   /* deliberately not vector-aligned */

    pout += offset;
    asm("v2 = vmem(%0 + #0)\n\t"
        "vmem(%1 + #0) = v2\n\t"
        : : "r"(p0), "r"(pout) : "v2", "memory");

    /* The offset is dropped, so output[0] gets all of buffer0[0] */
    expect[0] = buffer0[0];

    check_output_w(__LINE__, 1);
}
195
/*
 * Unaligned vector store (vmemu): writes MAX_VEC_SIZE_BYTES starting
 * at output + 12, spanning output[0] and output[1].
 */
static void test_store_unaligned(void)
{
    void *p0 = buffer0;
    void *pout = output;
    const size_t offset = 12;

    pout += offset;
    asm("v2 = vmem(%0 + #0)\n\t"
        "vmemu(%1 + #0) = v2\n\t"
        : : "r"(p0), "r"(pout) : "v2", "memory");

    /*
     * NOTE(review): the bytes outside the store window are expected to
     * equal buffer0[0..1] because the tests that ran before this one
     * (see main) left copies of those vectors in output[0..1] -- this
     * test is order-dependent.
     */
    memcpy(expect, buffer0, 2 * sizeof(MMVector));
    /* Overlay the vector actually written at the unaligned offset */
    memcpy(&expect[0].ub[offset], buffer0, sizeof(MMVector));

    check_output_w(__LINE__, 2);
}
212
/*
 * Conditional (masked) vector store: a predicate register selects
 * which word lanes are written.  q0 is true where mask[] is zero; the
 * store is guarded by q0 (invert == false) or !q0 (invert == true).
 * Lanes that are not written keep their 0xff pre-fill.
 */
static void test_masked_store(bool invert)
{
    void *p0 = buffer0;
    void *pmask = mask;
    void *pout = output;

    /* Pre-fill both buffers so untouched lanes compare equal */
    memset(expect, 0xff, sizeof(expect));
    memset(output, 0xff, sizeof(expect));

    for (int i = 0; i < BUFSIZE; i++) {
        if (invert) {
            /* q0 = (mask word == 0); store buffer0 where !q0 */
            asm("r4 = #0\n\t"
                "v4 = vsplat(r4)\n\t"
                "v5 = vmem(%0 + #0)\n\t"
                "q0 = vcmp.eq(v4.w, v5.w)\n\t"
                "v5 = vmem(%1)\n\t"
                "if (!q0) vmem(%2) = v5\n\t"
                : : "r"(pmask), "r"(p0), "r"(pout)
                : "r4", "v4", "v5", "q0", "memory");
        } else {
            /* q0 = (mask word == 0); store buffer0 where q0 */
            asm("r4 = #0\n\t"
                "v4 = vsplat(r4)\n\t"
                "v5 = vmem(%0 + #0)\n\t"
                "q0 = vcmp.eq(v4.w, v5.w)\n\t"
                "v5 = vmem(%1)\n\t"
                "if (q0) vmem(%2) = v5\n\t"
                : : "r"(pmask), "r"(p0), "r"(pout)
                : "r4", "v4", "v5", "q0", "memory");
        }
        p0 += sizeof(MMVector);
        pmask += sizeof(MMVector);
        pout += sizeof(MMVector);

        /*
         * Mirror the mask pattern from init_buffers; precedence makes
         * this i + (j % MASKMOD), matching the mask initialization.
         */
        for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
            if (invert) {
                if (i + j % MASKMOD != 0) {
                    expect[i].w[j] = buffer0[i].w[j];
                }
            } else {
                if (i + j % MASKMOD == 0) {
                    expect[i].w[j] = buffer0[i].w[j];
                }
            }
        }
    }

    check_output_w(__LINE__, BUFSIZE);
}
261
/*
 * New-value store: the ".new" store consumes the value produced by the
 * load earlier in the same packet, copying buffer0[0] to output[0].
 */
static void test_new_value_store(void)
{
    void *p0 = buffer0;
    void *pout = output;

    asm("{\n\t"
        "    v2 = vmem(%0 + #0)\n\t"
        "    vmem(%1 + #0) = v2.new\n\t"
        "}\n\t"
        : : "r"(p0), "r"(pout) : "v2", "memory");

    expect[0] = buffer0[0];

    check_output_w(__LINE__, 1);
}
277
278static void test_max_temps()
279{
280 void *p0 = buffer0;
281 void *pout = output;
282
283 asm("v0 = vmem(%0 + #0)\n\t"
284 "v1 = vmem(%0 + #1)\n\t"
285 "v2 = vmem(%0 + #2)\n\t"
286 "v3 = vmem(%0 + #3)\n\t"
287 "v4 = vmem(%0 + #4)\n\t"
288 "{\n\t"
289 " v1:0.w = vadd(v3:2.w, v1:0.w)\n\t"
290 " v2.b = vshuffe(v3.b, v2.b)\n\t"
291 " v3.w = vadd(v1.w, v4.w)\n\t"
292 " v4.tmp = vmem(%0 + #5)\n\t"
293 "}\n\t"
294 "vmem(%1 + #0) = v0\n\t"
295 "vmem(%1 + #1) = v1\n\t"
296 "vmem(%1 + #2) = v2\n\t"
297 "vmem(%1 + #3) = v3\n\t"
298 "vmem(%1 + #4) = v4\n\t"
299 : : "r"(p0), "r"(pout) : "memory");
300
301
302 for (int i = 0; i < MAX_VEC_SIZE_BYTES / 4; i++) {
303 expect[0].w[i] = buffer0[0].w[i] + buffer0[2].w[i];
304 expect[1].w[i] = buffer0[1].w[i] + buffer0[3].w[i];
305 }
306
307 for (int i = 0; i < MAX_VEC_SIZE_BYTES / 2; i++) {
308 expect[2].uh[i] = (buffer0[2].uh[i] & 0xff) |
309 (buffer0[3].uh[i] & 0xff) << 8;
310 }
311
312 for (int i = 0; i < MAX_VEC_SIZE_BYTES / 4; i++) {
313 expect[3].w[i] = buffer0[1].w[i] + buffer0[5].w[i];
314 }
315
316
317
318
319 expect[4] = buffer0[4];
320
321 check_output_b(__LINE__, 5);
322}
323
/*
 * Apply a one-operand vector instruction ASM (with optional element
 * suffix EL, e.g. ".w") to the vector at IN, storing the result at OUT.
 */
#define VEC_OP1(ASM, EL, IN, OUT) \
    asm("v2 = vmem(%0 + #0)\n\t" \
        "v2" #EL " = " #ASM "(v2" #EL ")\n\t" \
        "vmem(%1 + #0) = v2\n\t" \
        : : "r"(IN), "r"(OUT) : "v2", "memory")

/*
 * Apply a two-operand vector instruction ASM to the vectors at
 * IN0/IN1, storing the result at OUT.
 */
#define VEC_OP2(ASM, EL, IN0, IN1, OUT) \
    asm("v2 = vmem(%0 + #0)\n\t" \
        "v3 = vmem(%1 + #0)\n\t" \
        "v2" #EL " = " #ASM "(v2" #EL ", v3" #EL ")\n\t" \
        "vmem(%2 + #0) = v2\n\t" \
        : : "r"(IN0), "r"(IN1), "r"(OUT) : "v2", "v3", "memory")
336
/*
 * Define test_<NAME>(): run the one-operand instruction ASM over every
 * vector of buffer0, then model it in C with unary operator OP.
 * FIELD/FIELDSZ select the MMVector member matching element suffix EL.
 */
#define TEST_VEC_OP1(NAME, ASM, EL, FIELD, FIELDSZ, OP) \
static void test_##NAME(void) \
{ \
    void *pin = buffer0; \
    void *pout = output; \
    for (int i = 0; i < BUFSIZE; i++) { \
        VEC_OP1(ASM, EL, pin, pout); \
        pin += sizeof(MMVector); \
        pout += sizeof(MMVector); \
    } \
    for (int i = 0; i < BUFSIZE; i++) { \
        for (int j = 0; j < MAX_VEC_SIZE_BYTES / FIELDSZ; j++) { \
            expect[i].FIELD[j] = OP buffer0[i].FIELD[j]; \
        } \
    } \
    check_output_##FIELD(__LINE__, BUFSIZE); \
}

/*
 * Define test_<NAME>(): run the two-operand instruction ASM over
 * buffer0/buffer1, then model it in C with binary operator OP.
 */
#define TEST_VEC_OP2(NAME, ASM, EL, FIELD, FIELDSZ, OP) \
static void test_##NAME(void) \
{ \
    void *p0 = buffer0; \
    void *p1 = buffer1; \
    void *pout = output; \
    for (int i = 0; i < BUFSIZE; i++) { \
        VEC_OP2(ASM, EL, p0, p1, pout); \
        p0 += sizeof(MMVector); \
        p1 += sizeof(MMVector); \
        pout += sizeof(MMVector); \
    } \
    for (int i = 0; i < BUFSIZE; i++) { \
        for (int j = 0; j < MAX_VEC_SIZE_BYTES / FIELDSZ; j++) { \
            expect[i].FIELD[j] = buffer0[i].FIELD[j] OP buffer1[i].FIELD[j]; \
        } \
    } \
    check_output_##FIELD(__LINE__, BUFSIZE); \
}
374
/* Byte values greater than THRESHOLD make a predicate lane true */
#define THRESHOLD 31

/*
 * Build predicates q0/q1 by comparing the bytes of IN0/IN1 against
 * THRESHOLD, combine them with predicate instruction ASM (INV is the
 * string "" or "!", pasted in front of q1 to optionally negate it),
 * then store 0xff to OUT in the lanes where the result q2 is true.
 * Lanes where q2 is false are left untouched.
 */
#define PRED_OP2(ASM, IN0, IN1, OUT, INV) \
    asm("r4 = #%3\n\t" \
        "v1.b = vsplat(r4)\n\t" \
        "v2 = vmem(%0 + #0)\n\t" \
        "q0 = vcmp.gt(v2.b, v1.b)\n\t" \
        "v3 = vmem(%1 + #0)\n\t" \
        "q1 = vcmp.gt(v3.b, v1.b)\n\t" \
        "q2 = " #ASM "(q0, " INV "q1)\n\t" \
        "r4 = #0xff\n\t" \
        "v1.b = vsplat(r4)\n\t" \
        "if (q2) vmem(%2 + #0) = v1\n\t" \
        : : "r"(IN0), "r"(IN1), "r"(OUT), "i"(THRESHOLD) \
        : "r4", "v1", "v2", "v3", "q0", "q1", "q2", "memory")
390
/*
 * Define test_<NAME>(bool invert): combine the per-byte predicates of
 * buffer0 and buffer1 with predicate instruction ASM, then model the
 * result in C with operator OP.  INV must match the invert argument
 * the caller passes ("" with false, "!" with true) so the asm and the
 * C model negate the second predicate consistently.  Output lanes
 * start at 0 and are set to 0xff where the combined predicate holds.
 */
#define TEST_PRED_OP2(NAME, ASM, OP, INV) \
static void test_##NAME(bool invert) \
{ \
    void *p0 = buffer0; \
    void *p1 = buffer1; \
    void *pout = output; \
    memset(output, 0, sizeof(expect)); \
    for (int i = 0; i < BUFSIZE; i++) { \
        PRED_OP2(ASM, p0, p1, pout, INV); \
        p0 += sizeof(MMVector); \
        p1 += sizeof(MMVector); \
        pout += sizeof(MMVector); \
    } \
    for (int i = 0; i < BUFSIZE; i++) { \
        for (int j = 0; j < MAX_VEC_SIZE_BYTES; j++) { \
            /* NOTE: these bools shadow the p0/p1 pointers above */ \
            bool p0 = (buffer0[i].b[j] > THRESHOLD); \
            bool p1 = (buffer1[i].b[j] > THRESHOLD); \
            if (invert) { \
                expect[i].b[j] = (p0 OP !p1) ? 0xff : 0x00; \
            } else { \
                expect[i].b[j] = (p0 OP p1) ? 0xff : 0x00; \
            } \
        } \
    } \
    check_output_b(__LINE__, BUFSIZE); \
}
417
/* Element-wise add/sub at each width, plus whole-vector logical ops */
TEST_VEC_OP2(vadd_w, vadd, .w, w, 4, +)
TEST_VEC_OP2(vadd_h, vadd, .h, h, 2, +)
TEST_VEC_OP2(vadd_b, vadd, .b, b, 1, +)
TEST_VEC_OP2(vsub_w, vsub, .w, w, 4, -)
TEST_VEC_OP2(vsub_h, vsub, .h, h, 2, -)
TEST_VEC_OP2(vsub_b, vsub, .b, b, 1, -)
TEST_VEC_OP2(vxor, vxor, , d, 8, ^)
TEST_VEC_OP2(vand, vand, , d, 8, &)
TEST_VEC_OP2(vor, vor, , d, 8, |)
TEST_VEC_OP1(vnot, vnot, , d, 8, ~)

/* Predicate-combining tests; the _n variants negate the second input */
TEST_PRED_OP2(pred_or, or, |, "")
TEST_PRED_OP2(pred_or_n, or, |, "!")
TEST_PRED_OP2(pred_and, and, &, "")
TEST_PRED_OP2(pred_and_n, and, &, "!")
TEST_PRED_OP2(pred_xor, xor, ^, "")
434
435int main()
436{
437 init_buffers();
438
439 test_load_tmp();
440 test_load_cur();
441 test_load_aligned();
442 test_load_unaligned();
443 test_store_aligned();
444 test_store_unaligned();
445 test_masked_store(false);
446 test_masked_store(true);
447 test_new_value_store();
448 test_max_temps();
449
450 test_vadd_w();
451 test_vadd_h();
452 test_vadd_b();
453 test_vsub_w();
454 test_vsub_h();
455 test_vsub_b();
456 test_vxor();
457 test_vand();
458 test_vor();
459 test_vnot();
460
461 test_pred_or(false);
462 test_pred_or_n(true);
463 test_pred_and(false);
464 test_pred_and_n(true);
465 test_pred_xor(false);
466
467 puts(err ? "FAIL" : "PASS");
468 return err ? 1 : 0;
469}
470