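/*
 * Hexagon HVX scatter/gather test.
 *
 * Runs the vscatter/vgather instructions through inline assembly on a
 * dedicated buffer and compares every result, byte by byte, against a scalar
 * C model of the same operation.  Prints PASS or FAIL.
 */
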
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <inttypes.h>
35
/*
 * HVX vector types, declared with the compiler's vector extension.
 * Every type is aligned to 128 bytes.
 */

/* One 128-byte vector. */
typedef long HVX_Vector __attribute__((__vector_size__(128)))
    __attribute__((aligned(128)));

/* 256 bytes, i.e. the size of two HVX_Vector values. */
typedef long HVX_VectorPair __attribute__((__vector_size__(256)))
    __attribute__((aligned(128)));

/* Declared with the same size as HVX_Vector. */
typedef long HVX_VectorPred __attribute__((__vector_size__(128)))
    __attribute__((aligned(128)));
42
/* Number of mismatching bytes found so far by check_buffer(). */
int err;

/* Number of elements processed by each scatter or gather. */
#define MATRIX_SIZE 64

/* Number of elements in each scatter destination buffer. */
#define SCATTER_BUFFER_SIZE (MATRIX_SIZE * MATRIX_SIZE)

/*
 * Memory operated on by the vector instructions: for each of the three
 * test flavours a scatter destination and a gather result buffer.
 * The whole structure is aligned on a 64KB (0x10000) boundary.
 */
static struct {
    unsigned short vscatter16[SCATTER_BUFFER_SIZE];
    unsigned short vgather16[MATRIX_SIZE];
    unsigned int   vscatter32[SCATTER_BUFFER_SIZE];
    unsigned int   vgather32[MATRIX_SIZE];
    unsigned short vscatter16_32[SCATTER_BUFFER_SIZE];
    unsigned short vgather16_32[MATRIX_SIZE];
} vtcm __attribute__((aligned(0x10000)));
60
61
62unsigned short vscatter16_ref[SCATTER_BUFFER_SIZE];
63unsigned short vgather16_ref[MATRIX_SIZE];
64unsigned int vscatter32_ref[SCATTER_BUFFER_SIZE];
65unsigned int vgather32_ref[MATRIX_SIZE];
66unsigned short vscatter16_32_ref[SCATTER_BUFFER_SIZE];
67unsigned short vgather16_32_ref[MATRIX_SIZE];
68
69
70unsigned short half_offsets[MATRIX_SIZE] __attribute__((aligned(128)));
71unsigned int word_offsets[MATRIX_SIZE] __attribute__((aligned(128)));
72
73
74unsigned short half_values[MATRIX_SIZE] __attribute__((aligned(128)));
75unsigned short half_values_acc[MATRIX_SIZE] __attribute__((aligned(128)));
76unsigned short half_values_masked[MATRIX_SIZE] __attribute__((aligned(128)));
77unsigned int word_values[MATRIX_SIZE] __attribute__((aligned(128)));
78unsigned int word_values_acc[MATRIX_SIZE] __attribute__((aligned(128)));
79unsigned int word_values_masked[MATRIX_SIZE] __attribute__((aligned(128)));
80
81
82unsigned short half_predicates[MATRIX_SIZE] __attribute__((aligned(128)));
83unsigned int word_predicates[MATRIX_SIZE] __attribute__((aligned(128)));
84
85
86const size_t region_len = sizeof(vtcm);
87
88
89#define SYNC_VECTOR 1
90
91static void sync_scatter(void *addr)
92{
93#if SYNC_VECTOR
94
95
96
97
98
99 asm volatile("vmem(%0 + #0):scatter_release\n" : : "r"(addr));
100
101 volatile HVX_Vector vDummy = *(HVX_Vector *)addr; vDummy = vDummy;
102#endif
103}
104
/*
 * Synchronize after a gather by performing a dummy vector load from addr.
 *
 * addr: start of the buffer the gather result was stored to; it is
 *       dereferenced as an HVX_Vector, so it must be suitably aligned.
 */
static void sync_gather(void *addr)
{
#if SYNC_VECTOR
    /*
     * volatile forces the load to be emitted; the self-assignment only
     * keeps the variable from being reported as unused.
     */
    volatile HVX_Vector vDummy = *(HVX_Vector *)addr;
    vDummy = vDummy;
#else
    (void)addr;
#endif
}
112
113
114#define PRINT_DATA 0
115
116#define FILL_CHAR '.'
117
118
119void prefill_vtcm_scratch(void)
120{
121 memset(&vtcm, FILL_CHAR, sizeof(vtcm));
122}
123
124
125void create_offsets_values_preds_16(void)
126{
127 unsigned short half_element = 0;
128 unsigned short half_element_masked = 0;
129 char letter = 'A';
130 char letter_masked = '@';
131
132 for (int i = 0; i < MATRIX_SIZE; i++) {
133 half_offsets[i] = i * (2 * MATRIX_SIZE + 2);
134
135 half_element = 0;
136 half_element_masked = 0;
137 for (int j = 0; j < 2; j++) {
138 half_element |= letter << j * 8;
139 half_element_masked |= letter_masked << j * 8;
140 }
141
142 half_values[i] = half_element;
143 half_values_acc[i] = ((i % 10) << 8) + (i % 10);
144 half_values_masked[i] = half_element_masked;
145
146 letter++;
147
148 if (letter == 'M') {
149 letter = 'A';
150 }
151
152 half_predicates[i] = (i % 3 == 0 || i % 5 == 0) ? ~0 : 0;
153 }
154}
155
156
157void create_offsets_values_preds_32(void)
158{
159 unsigned int word_element = 0;
160 unsigned int word_element_masked = 0;
161 char letter = 'A';
162 char letter_masked = '&';
163
164 for (int i = 0; i < MATRIX_SIZE; i++) {
165 word_offsets[i] = i * (4 * MATRIX_SIZE + 4);
166
167 word_element = 0;
168 word_element_masked = 0;
169 for (int j = 0; j < 4; j++) {
170 word_element |= letter << j * 8;
171 word_element_masked |= letter_masked << j * 8;
172 }
173
174 word_values[i] = word_element;
175 word_values_acc[i] = ((i % 10) << 8) + (i % 10);
176 word_values_masked[i] = word_element_masked;
177
178 letter++;
179
180 if (letter == 'M') {
181 letter = 'A';
182 }
183
184 word_predicates[i] = (i % 4 == 0 || i % 7 == 0) ? ~0 : 0;
185 }
186}
187
188
189
190
191
192void create_offsets_values_preds_16_32(void)
193{
194 unsigned short half_element = 0;
195 unsigned short half_element_masked = 0;
196 char letter = 'D';
197 char letter_masked = '$';
198
199 for (int i = 0; i < MATRIX_SIZE; i++) {
200 word_offsets[i] = i * (2 * MATRIX_SIZE + 2);
201
202 half_element = 0;
203 half_element_masked = 0;
204 for (int j = 0; j < 2; j++) {
205 half_element |= letter << j * 8;
206 half_element_masked |= letter_masked << j * 8;
207 }
208
209 half_values[i] = half_element;
210 half_values_acc[i] = ((i % 10) << 8) + (i % 10);
211 half_values_masked[i] = half_element_masked;
212
213 letter++;
214
215 if (letter == 'P') {
216 letter = 'D';
217 }
218
219 half_predicates[i] = (i % 2 == 0 || i % 13 == 0) ? ~0 : 0;
220 }
221}
222
223
224void vector_scatter_16(void)
225{
226 asm ("m0 = %1\n\t"
227 "v0 = vmem(%2 + #0)\n\t"
228 "v1 = vmem(%3 + #0)\n\t"
229 "vscatter(%0, m0, v0.h).h = v1\n\t"
230 : : "r"(vtcm.vscatter16), "r"(region_len),
231 "r"(half_offsets), "r"(half_values)
232 : "m0", "v0", "v1", "memory");
233
234 sync_scatter(vtcm.vscatter16);
235}
236
237
238void vector_scatter_16_acc(void)
239{
240 asm ("m0 = %1\n\t"
241 "v0 = vmem(%2 + #0)\n\t"
242 "v1 = vmem(%3 + #0)\n\t"
243 "vscatter(%0, m0, v0.h).h += v1\n\t"
244 : : "r"(vtcm.vscatter16), "r"(region_len),
245 "r"(half_offsets), "r"(half_values_acc)
246 : "m0", "v0", "v1", "memory");
247
248 sync_scatter(vtcm.vscatter16);
249}
250
251
252void vector_scatter_16_masked(void)
253{
254 asm ("r1 = #-1\n\t"
255 "v0 = vmem(%0 + #0)\n\t"
256 "q0 = vand(v0, r1)\n\t"
257 "m0 = %2\n\t"
258 "v0 = vmem(%3 + #0)\n\t"
259 "v1 = vmem(%4 + #0)\n\t"
260 "if (q0) vscatter(%1, m0, v0.h).h = v1\n\t"
261 : : "r"(half_predicates), "r"(vtcm.vscatter16), "r"(region_len),
262 "r"(half_offsets), "r"(half_values_masked)
263 : "r1", "q0", "m0", "q0", "v0", "v1", "memory");
264
265 sync_scatter(vtcm.vscatter16);
266}
267
268
269void vector_scatter_32(void)
270{
271 HVX_Vector *offsetslo = (HVX_Vector *)word_offsets;
272 HVX_Vector *offsetshi = (HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
273 HVX_Vector *valueslo = (HVX_Vector *)word_values;
274 HVX_Vector *valueshi = (HVX_Vector *)&word_values[MATRIX_SIZE / 2];
275
276 asm ("m0 = %1\n\t"
277 "v0 = vmem(%2 + #0)\n\t"
278 "v1 = vmem(%3 + #0)\n\t"
279 "vscatter(%0, m0, v0.w).w = v1\n\t"
280 : : "r"(vtcm.vscatter32), "r"(region_len),
281 "r"(offsetslo), "r"(valueslo)
282 : "m0", "v0", "v1", "memory");
283 asm ("m0 = %1\n\t"
284 "v0 = vmem(%2 + #0)\n\t"
285 "v1 = vmem(%3 + #0)\n\t"
286 "vscatter(%0, m0, v0.w).w = v1\n\t"
287 : : "r"(vtcm.vscatter32), "r"(region_len),
288 "r"(offsetshi), "r"(valueshi)
289 : "m0", "v0", "v1", "memory");
290
291 sync_scatter(vtcm.vscatter32);
292}
293
294
295void vector_scatter_32_acc(void)
296{
297 HVX_Vector *offsetslo = (HVX_Vector *)word_offsets;
298 HVX_Vector *offsetshi = (HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
299 HVX_Vector *valueslo = (HVX_Vector *)word_values_acc;
300 HVX_Vector *valueshi = (HVX_Vector *)&word_values_acc[MATRIX_SIZE / 2];
301
302 asm ("m0 = %1\n\t"
303 "v0 = vmem(%2 + #0)\n\t"
304 "v1 = vmem(%3 + #0)\n\t"
305 "vscatter(%0, m0, v0.w).w += v1\n\t"
306 : : "r"(vtcm.vscatter32), "r"(region_len),
307 "r"(offsetslo), "r"(valueslo)
308 : "m0", "v0", "v1", "memory");
309 asm ("m0 = %1\n\t"
310 "v0 = vmem(%2 + #0)\n\t"
311 "v1 = vmem(%3 + #0)\n\t"
312 "vscatter(%0, m0, v0.w).w += v1\n\t"
313 : : "r"(vtcm.vscatter32), "r"(region_len),
314 "r"(offsetshi), "r"(valueshi)
315 : "m0", "v0", "v1", "memory");
316
317 sync_scatter(vtcm.vscatter32);
318}
319
320
321void vector_scatter_32_masked(void)
322{
323 HVX_Vector *offsetslo = (HVX_Vector *)word_offsets;
324 HVX_Vector *offsetshi = (HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
325 HVX_Vector *valueslo = (HVX_Vector *)word_values_masked;
326 HVX_Vector *valueshi = (HVX_Vector *)&word_values_masked[MATRIX_SIZE / 2];
327 HVX_Vector *predslo = (HVX_Vector *)word_predicates;
328 HVX_Vector *predshi = (HVX_Vector *)&word_predicates[MATRIX_SIZE / 2];
329
330 asm ("r1 = #-1\n\t"
331 "v0 = vmem(%0 + #0)\n\t"
332 "q0 = vand(v0, r1)\n\t"
333 "m0 = %2\n\t"
334 "v0 = vmem(%3 + #0)\n\t"
335 "v1 = vmem(%4 + #0)\n\t"
336 "if (q0) vscatter(%1, m0, v0.w).w = v1\n\t"
337 : : "r"(predslo), "r"(vtcm.vscatter32), "r"(region_len),
338 "r"(offsetslo), "r"(valueslo)
339 : "r1", "q0", "m0", "q0", "v0", "v1", "memory");
340 asm ("r1 = #-1\n\t"
341 "v0 = vmem(%0 + #0)\n\t"
342 "q0 = vand(v0, r1)\n\t"
343 "m0 = %2\n\t"
344 "v0 = vmem(%3 + #0)\n\t"
345 "v1 = vmem(%4 + #0)\n\t"
346 "if (q0) vscatter(%1, m0, v0.w).w = v1\n\t"
347 : : "r"(predshi), "r"(vtcm.vscatter32), "r"(region_len),
348 "r"(offsetshi), "r"(valueshi)
349 : "r1", "q0", "m0", "q0", "v0", "v1", "memory");
350
351 sync_scatter(vtcm.vscatter32);
352}
353
354
355void vector_scatter_16_32(void)
356{
357 asm ("m0 = %1\n\t"
358 "v0 = vmem(%2 + #0)\n\t"
359 "v1 = vmem(%2 + #1)\n\t"
360 "v2 = vmem(%3 + #0)\n\t"
361 "v2.h = vshuff(v2.h)\n\t"
362 "vscatter(%0, m0, v1:0.w).h = v2\n\t"
363 : : "r"(vtcm.vscatter16_32), "r"(region_len),
364 "r"(word_offsets), "r"(half_values)
365 : "m0", "v0", "v1", "v2", "memory");
366
367 sync_scatter(vtcm.vscatter16_32);
368}
369
370
371void vector_scatter_16_32_acc(void)
372{
373 asm ("m0 = %1\n\t"
374 "v0 = vmem(%2 + #0)\n\t"
375 "v1 = vmem(%2 + #1)\n\t"
376 "v2 = vmem(%3 + #0)\n\t" \
377 "v2.h = vshuff(v2.h)\n\t"
378 "vscatter(%0, m0, v1:0.w).h += v2\n\t"
379 : : "r"(vtcm.vscatter16_32), "r"(region_len),
380 "r"(word_offsets), "r"(half_values_acc)
381 : "m0", "v0", "v1", "v2", "memory");
382
383 sync_scatter(vtcm.vscatter16_32);
384}
385
386
387void vector_scatter_16_32_masked(void)
388{
389 asm ("r1 = #-1\n\t"
390 "v0 = vmem(%0 + #0)\n\t"
391 "v0.h = vshuff(v0.h)\n\t"
392 "q0 = vand(v0, r1)\n\t"
393 "m0 = %2\n\t"
394 "v0 = vmem(%3 + #0)\n\t"
395 "v1 = vmem(%3 + #1)\n\t"
396 "v2 = vmem(%4 + #0)\n\t" \
397 "v2.h = vshuff(v2.h)\n\t"
398 "if (q0) vscatter(%1, m0, v1:0.w).h = v2\n\t"
399 : : "r"(half_predicates), "r"(vtcm.vscatter16_32), "r"(region_len),
400 "r"(word_offsets), "r"(half_values_masked)
401 : "r1", "q0", "m0", "v0", "v1", "v2", "memory");
402
403 sync_scatter(vtcm.vscatter16_32);
404}
405
406
407void vector_gather_16(void)
408{
409 asm ("m0 = %1\n\t"
410 "v0 = vmem(%2 + #0)\n\t"
411 "{ vtmp.h = vgather(%0, m0, v0.h).h\n\t"
412 " vmem(%3 + #0) = vtmp.new }\n\t"
413 : : "r"(vtcm.vscatter16), "r"(region_len),
414 "r"(half_offsets), "r"(vtcm.vgather16)
415 : "m0", "v0", "memory");
416
417 sync_gather(vtcm.vgather16);
418}
419
/*
 * Return the halfword a masked 16-bit gather destination is pre-filled
 * with: the character '?' in both bytes.
 */
static unsigned short gather_16_masked_init(void)
{
    const unsigned int fill = '?';

    return (unsigned short)(fill | (fill << 8));
}
425
426
427void vector_gather_16_masked(void)
428{
429 unsigned short init = gather_16_masked_init();
430
431 asm ("v0.h = vsplat(%5)\n\t"
432 "vmem(%4 + #0) = v0\n\t"
433 "r1 = #-1\n\t"
434 "v0 = vmem(%0 + #0)\n\t"
435 "q0 = vand(v0, r1)\n\t"
436 "m0 = %2\n\t"
437 "v0 = vmem(%3 + #0)\n\t"
438 "{ if (q0) vtmp.h = vgather(%1, m0, v0.h).h\n\t"
439 " vmem(%4 + #0) = vtmp.new }\n\t"
440 : : "r"(half_predicates), "r"(vtcm.vscatter16), "r"(region_len),
441 "r"(half_offsets), "r"(vtcm.vgather16), "r"(init)
442 : "r1", "q0", "m0", "v0", "memory");
443
444 sync_gather(vtcm.vgather16);
445}
446
447
448void vector_gather_32(void)
449{
450 HVX_Vector *vgatherlo = (HVX_Vector *)vtcm.vgather32;
451 HVX_Vector *vgatherhi = (HVX_Vector *)&vtcm.vgather32[MATRIX_SIZE / 2];
452 HVX_Vector *offsetslo = (HVX_Vector *)word_offsets;
453 HVX_Vector *offsetshi = (HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
454
455 asm ("m0 = %1\n\t"
456 "v0 = vmem(%2 + #0)\n\t"
457 "{ vtmp.w = vgather(%0, m0, v0.w).w\n\t"
458 " vmem(%3 + #0) = vtmp.new }\n\t"
459 : : "r"(vtcm.vscatter32), "r"(region_len),
460 "r"(offsetslo), "r"(vgatherlo)
461 : "m0", "v0", "memory");
462 asm ("m0 = %1\n\t"
463 "v0 = vmem(%2 + #0)\n\t"
464 "{ vtmp.w = vgather(%0, m0, v0.w).w\n\t"
465 " vmem(%3 + #0) = vtmp.new }\n\t"
466 : : "r"(vtcm.vscatter32), "r"(region_len),
467 "r"(offsetshi), "r"(vgatherhi)
468 : "m0", "v0", "memory");
469
470 sync_gather(vgatherlo);
471 sync_gather(vgatherhi);
472}
473
/*
 * Return the word a masked 32-bit gather destination is pre-filled with:
 * the character '?' in all four bytes.
 */
static unsigned int gather_32_masked_init(void)
{
    /* Unsigned, so the shift by 24 can never reach a sign bit. */
    const unsigned int fill = '?';

    return fill | (fill << 8) | (fill << 16) | (fill << 24);
}
479
480
481void vector_gather_32_masked(void)
482{
483 unsigned int init = gather_32_masked_init();
484 HVX_Vector *vgatherlo = (HVX_Vector *)vtcm.vgather32;
485 HVX_Vector *vgatherhi = (HVX_Vector *)&vtcm.vgather32[MATRIX_SIZE / 2];
486 HVX_Vector *offsetslo = (HVX_Vector *)word_offsets;
487 HVX_Vector *offsetshi = (HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
488 HVX_Vector *predslo = (HVX_Vector *)word_predicates;
489 HVX_Vector *predshi = (HVX_Vector *)&word_predicates[MATRIX_SIZE / 2];
490
491 asm ("v0.h = vsplat(%5)\n\t"
492 "vmem(%4 + #0) = v0\n\t"
493 "r1 = #-1\n\t"
494 "v0 = vmem(%0 + #0)\n\t"
495 "q0 = vand(v0, r1)\n\t"
496 "m0 = %2\n\t"
497 "v0 = vmem(%3 + #0)\n\t"
498 "{ if (q0) vtmp.w = vgather(%1, m0, v0.w).w\n\t"
499 " vmem(%4 + #0) = vtmp.new }\n\t"
500 : : "r"(predslo), "r"(vtcm.vscatter32), "r"(region_len),
501 "r"(offsetslo), "r"(vgatherlo), "r"(init)
502 : "r1", "q0", "m0", "v0", "memory");
503 asm ("v0.h = vsplat(%5)\n\t"
504 "vmem(%4 + #0) = v0\n\t"
505 "r1 = #-1\n\t"
506 "v0 = vmem(%0 + #0)\n\t"
507 "q0 = vand(v0, r1)\n\t"
508 "m0 = %2\n\t"
509 "v0 = vmem(%3 + #0)\n\t"
510 "{ if (q0) vtmp.w = vgather(%1, m0, v0.w).w\n\t"
511 " vmem(%4 + #0) = vtmp.new }\n\t"
512 : : "r"(predshi), "r"(vtcm.vscatter32), "r"(region_len),
513 "r"(offsetshi), "r"(vgatherhi), "r"(init)
514 : "r1", "q0", "m0", "v0", "memory");
515
516 sync_gather(vgatherlo);
517 sync_gather(vgatherhi);
518}
519
520
521void vector_gather_16_32(void)
522{
523 asm ("m0 = %1\n\t"
524 "v0 = vmem(%2 + #0)\n\t"
525 "v1 = vmem(%2 + #1)\n\t"
526 "{ vtmp.h = vgather(%0, m0, v1:0.w).h\n\t"
527 " vmem(%3 + #0) = vtmp.new }\n\t"
528 "v0 = vmem(%3 + #0)\n\t"
529 "v0.h = vdeal(v0.h)\n\t"
530 "vmem(%3 + #0) = v0\n\t"
531 : : "r"(vtcm.vscatter16_32), "r"(region_len),
532 "r"(word_offsets), "r"(vtcm.vgather16_32)
533 : "m0", "v0", "v1", "memory");
534
535 sync_gather(vtcm.vgather16_32);
536}
537
538
539void vector_gather_16_32_masked(void)
540{
541 unsigned short init = gather_16_masked_init();
542
543 asm ("v0.h = vsplat(%5)\n\t"
544 "vmem(%4 + #0) = v0\n\t"
545 "r1 = #-1\n\t"
546 "v0 = vmem(%0 + #0)\n\t"
547 "v0.h = vshuff(v0.h)\n\t"
548 "q0 = vand(v0, r1)\n\t"
549 "m0 = %2\n\t"
550 "v0 = vmem(%3 + #0)\n\t"
551 "v1 = vmem(%3 + #1)\n\t"
552 "{ if (q0) vtmp.h = vgather(%1, m0, v1:0.w).h\n\t"
553 " vmem(%4 + #0) = vtmp.new }\n\t"
554 "v0 = vmem(%4 + #0)\n\t"
555 "v0.h = vdeal(v0.h)\n\t"
556 "vmem(%4 + #0) = v0\n\t"
557 : : "r"(half_predicates), "r"(vtcm.vscatter16_32), "r"(region_len),
558 "r"(word_offsets), "r"(vtcm.vgather16_32), "r"(init)
559 : "r1", "q0", "m0", "v0", "v1", "memory");
560
561 sync_gather(vtcm.vgather16_32);
562}
563
564static void check_buffer(const char *name, void *c, void *r, size_t size)
565{
566 char *check = (char *)c;
567 char *ref = (char *)r;
568 for (int i = 0; i < size; i++) {
569 if (check[i] != ref[i]) {
570 printf("ERROR %s [%d]: 0x%x (%c) != 0x%x (%c)\n", name, i,
571 check[i], check[i], ref[i], ref[i]);
572 err++;
573 }
574 }
575}
576
577
578
579
580
581
582
583void scalar_scatter_16(unsigned short *vscatter16)
584{
585 for (int i = 0; i < MATRIX_SIZE; ++i) {
586 vscatter16[half_offsets[i] / 2] = half_values[i];
587 }
588}
589
590void check_scatter_16()
591{
592 memset(vscatter16_ref, FILL_CHAR,
593 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
594 scalar_scatter_16(vscatter16_ref);
595 check_buffer(__func__, vtcm.vscatter16, vscatter16_ref,
596 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
597}
598
599
600void scalar_scatter_16_acc(unsigned short *vscatter16)
601{
602 for (int i = 0; i < MATRIX_SIZE; ++i) {
603 vscatter16[half_offsets[i] / 2] += half_values_acc[i];
604 }
605}
606
607
608void check_scatter_16_acc()
609{
610 memset(vscatter16_ref, FILL_CHAR,
611 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
612 scalar_scatter_16(vscatter16_ref);
613 scalar_scatter_16_acc(vscatter16_ref);
614 check_buffer(__func__, vtcm.vscatter16, vscatter16_ref,
615 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
616}
617
618
619void scalar_scatter_16_masked(unsigned short *vscatter16)
620{
621 for (int i = 0; i < MATRIX_SIZE; i++) {
622 if (half_predicates[i]) {
623 vscatter16[half_offsets[i] / 2] = half_values_masked[i];
624 }
625 }
626
627}
628
629void check_scatter_16_masked()
630{
631 memset(vscatter16_ref, FILL_CHAR,
632 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
633 scalar_scatter_16(vscatter16_ref);
634 scalar_scatter_16_acc(vscatter16_ref);
635 scalar_scatter_16_masked(vscatter16_ref);
636 check_buffer(__func__, vtcm.vscatter16, vscatter16_ref,
637 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
638}
639
640
641void scalar_scatter_32(unsigned int *vscatter32)
642{
643 for (int i = 0; i < MATRIX_SIZE; ++i) {
644 vscatter32[word_offsets[i] / 4] = word_values[i];
645 }
646}
647
648void check_scatter_32()
649{
650 memset(vscatter32_ref, FILL_CHAR,
651 SCATTER_BUFFER_SIZE * sizeof(unsigned int));
652 scalar_scatter_32(vscatter32_ref);
653 check_buffer(__func__, vtcm.vscatter32, vscatter32_ref,
654 SCATTER_BUFFER_SIZE * sizeof(unsigned int));
655}
656
657
658void scalar_scatter_32_acc(unsigned int *vscatter32)
659{
660 for (int i = 0; i < MATRIX_SIZE; ++i) {
661 vscatter32[word_offsets[i] / 4] += word_values_acc[i];
662 }
663}
664
665void check_scatter_32_acc()
666{
667 memset(vscatter32_ref, FILL_CHAR,
668 SCATTER_BUFFER_SIZE * sizeof(unsigned int));
669 scalar_scatter_32(vscatter32_ref);
670 scalar_scatter_32_acc(vscatter32_ref);
671 check_buffer(__func__, vtcm.vscatter32, vscatter32_ref,
672 SCATTER_BUFFER_SIZE * sizeof(unsigned int));
673}
674
675
676void scalar_scatter_32_masked(unsigned int *vscatter32)
677{
678 for (int i = 0; i < MATRIX_SIZE; i++) {
679 if (word_predicates[i]) {
680 vscatter32[word_offsets[i] / 4] = word_values_masked[i];
681 }
682 }
683}
684
685void check_scatter_32_masked()
686{
687 memset(vscatter32_ref, FILL_CHAR,
688 SCATTER_BUFFER_SIZE * sizeof(unsigned int));
689 scalar_scatter_32(vscatter32_ref);
690 scalar_scatter_32_acc(vscatter32_ref);
691 scalar_scatter_32_masked(vscatter32_ref);
692 check_buffer(__func__, vtcm.vscatter32, vscatter32_ref,
693 SCATTER_BUFFER_SIZE * sizeof(unsigned int));
694}
695
696
697void scalar_scatter_16_32(unsigned short *vscatter16_32)
698{
699 for (int i = 0; i < MATRIX_SIZE; ++i) {
700 vscatter16_32[word_offsets[i] / 2] = half_values[i];
701 }
702}
703
704void check_scatter_16_32()
705{
706 memset(vscatter16_32_ref, FILL_CHAR,
707 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
708 scalar_scatter_16_32(vscatter16_32_ref);
709 check_buffer(__func__, vtcm.vscatter16_32, vscatter16_32_ref,
710 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
711}
712
713
714void scalar_scatter_16_32_acc(unsigned short *vscatter16_32)
715{
716 for (int i = 0; i < MATRIX_SIZE; ++i) {
717 vscatter16_32[word_offsets[i] / 2] += half_values_acc[i];
718 }
719}
720
721void check_scatter_16_32_acc()
722{
723 memset(vscatter16_32_ref, FILL_CHAR,
724 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
725 scalar_scatter_16_32(vscatter16_32_ref);
726 scalar_scatter_16_32_acc(vscatter16_32_ref);
727 check_buffer(__func__, vtcm.vscatter16_32, vscatter16_32_ref,
728 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
729}
730
731
732void scalar_scatter_16_32_masked(unsigned short *vscatter16_32)
733{
734 for (int i = 0; i < MATRIX_SIZE; i++) {
735 if (half_predicates[i]) {
736 vscatter16_32[word_offsets[i] / 2] = half_values_masked[i];
737 }
738 }
739}
740
741void check_scatter_16_32_masked()
742{
743 memset(vscatter16_32_ref, FILL_CHAR,
744 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
745 scalar_scatter_16_32(vscatter16_32_ref);
746 scalar_scatter_16_32_acc(vscatter16_32_ref);
747 scalar_scatter_16_32_masked(vscatter16_32_ref);
748 check_buffer(__func__, vtcm.vscatter16_32, vscatter16_32_ref,
749 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
750}
751
752
753void scalar_gather_16(unsigned short *vgather16)
754{
755 for (int i = 0; i < MATRIX_SIZE; ++i) {
756 vgather16[i] = vtcm.vscatter16[half_offsets[i] / 2];
757 }
758}
759
760void check_gather_16()
761{
762 memset(vgather16_ref, 0, MATRIX_SIZE * sizeof(unsigned short));
763 scalar_gather_16(vgather16_ref);
764 check_buffer(__func__, vtcm.vgather16, vgather16_ref,
765 MATRIX_SIZE * sizeof(unsigned short));
766}
767
768
769void scalar_gather_16_masked(unsigned short *vgather16)
770{
771 for (int i = 0; i < MATRIX_SIZE; ++i) {
772 if (half_predicates[i]) {
773 vgather16[i] = vtcm.vscatter16[half_offsets[i] / 2];
774 }
775 }
776}
777
778void check_gather_16_masked()
779{
780 memset(vgather16_ref, gather_16_masked_init(),
781 MATRIX_SIZE * sizeof(unsigned short));
782 scalar_gather_16_masked(vgather16_ref);
783 check_buffer(__func__, vtcm.vgather16, vgather16_ref,
784 MATRIX_SIZE * sizeof(unsigned short));
785}
786
787
788void scalar_gather_32(unsigned int *vgather32)
789{
790 for (int i = 0; i < MATRIX_SIZE; ++i) {
791 vgather32[i] = vtcm.vscatter32[word_offsets[i] / 4];
792 }
793}
794
795void check_gather_32(void)
796{
797 memset(vgather32_ref, 0, MATRIX_SIZE * sizeof(unsigned int));
798 scalar_gather_32(vgather32_ref);
799 check_buffer(__func__, vtcm.vgather32, vgather32_ref,
800 MATRIX_SIZE * sizeof(unsigned int));
801}
802
803
804void scalar_gather_32_masked(unsigned int *vgather32)
805{
806 for (int i = 0; i < MATRIX_SIZE; ++i) {
807 if (word_predicates[i]) {
808 vgather32[i] = vtcm.vscatter32[word_offsets[i] / 4];
809 }
810 }
811}
812
813void check_gather_32_masked(void)
814{
815 memset(vgather32_ref, gather_32_masked_init(),
816 MATRIX_SIZE * sizeof(unsigned int));
817 scalar_gather_32_masked(vgather32_ref);
818 check_buffer(__func__, vtcm.vgather32,
819 vgather32_ref, MATRIX_SIZE * sizeof(unsigned int));
820}
821
822
823void scalar_gather_16_32(unsigned short *vgather16_32)
824{
825 for (int i = 0; i < MATRIX_SIZE; ++i) {
826 vgather16_32[i] = vtcm.vscatter16_32[word_offsets[i] / 2];
827 }
828}
829
830void check_gather_16_32(void)
831{
832 memset(vgather16_32_ref, 0, MATRIX_SIZE * sizeof(unsigned short));
833 scalar_gather_16_32(vgather16_32_ref);
834 check_buffer(__func__, vtcm.vgather16_32, vgather16_32_ref,
835 MATRIX_SIZE * sizeof(unsigned short));
836}
837
838
839void scalar_gather_16_32_masked(unsigned short *vgather16_32)
840{
841 for (int i = 0; i < MATRIX_SIZE; ++i) {
842 if (half_predicates[i]) {
843 vgather16_32[i] = vtcm.vscatter16_32[word_offsets[i] / 2];
844 }
845 }
846
847}
848
849void check_gather_16_32_masked(void)
850{
851 memset(vgather16_32_ref, gather_16_masked_init(),
852 MATRIX_SIZE * sizeof(unsigned short));
853 scalar_gather_16_32_masked(vgather16_32_ref);
854 check_buffer(__func__, vtcm.vgather16_32, vgather16_32_ref,
855 MATRIX_SIZE * sizeof(unsigned short));
856}
857
858
859void print_scatter16_buffer(void)
860{
861 if (PRINT_DATA) {
862 printf("\n\nPrinting the 16 bit scatter buffer");
863
864 for (int i = 0; i < SCATTER_BUFFER_SIZE; i++) {
865 if ((i % MATRIX_SIZE) == 0) {
866 printf("\n");
867 }
868 for (int j = 0; j < 2; j++) {
869 printf("%c", (char)((vtcm.vscatter16[i] >> j * 8) & 0xff));
870 }
871 printf(" ");
872 }
873 printf("\n");
874 }
875}
876
877
878void print_gather_result_16(void)
879{
880 if (PRINT_DATA) {
881 printf("\n\nPrinting the 16 bit gather result\n");
882
883 for (int i = 0; i < MATRIX_SIZE; i++) {
884 for (int j = 0; j < 2; j++) {
885 printf("%c", (char)((vtcm.vgather16[i] >> j * 8) & 0xff));
886 }
887 printf(" ");
888 }
889 printf("\n");
890 }
891}
892
893
894void print_scatter32_buffer(void)
895{
896 if (PRINT_DATA) {
897 printf("\n\nPrinting the 32 bit scatter buffer");
898
899 for (int i = 0; i < SCATTER_BUFFER_SIZE; i++) {
900 if ((i % MATRIX_SIZE) == 0) {
901 printf("\n");
902 }
903 for (int j = 0; j < 4; j++) {
904 printf("%c", (char)((vtcm.vscatter32[i] >> j * 8) & 0xff));
905 }
906 printf(" ");
907 }
908 printf("\n");
909 }
910}
911
912
913void print_gather_result_32(void)
914{
915 if (PRINT_DATA) {
916 printf("\n\nPrinting the 32 bit gather result\n");
917
918 for (int i = 0; i < MATRIX_SIZE; i++) {
919 for (int j = 0; j < 4; j++) {
920 printf("%c", (char)((vtcm.vgather32[i] >> j * 8) & 0xff));
921 }
922 printf(" ");
923 }
924 printf("\n");
925 }
926}
927
928
929void print_scatter16_32_buffer(void)
930{
931 if (PRINT_DATA) {
932 printf("\n\nPrinting the 16_32 bit scatter buffer");
933
934 for (int i = 0; i < SCATTER_BUFFER_SIZE; i++) {
935 if ((i % MATRIX_SIZE) == 0) {
936 printf("\n");
937 }
938 for (int j = 0; j < 2; j++) {
939 printf("%c",
940 (unsigned char)((vtcm.vscatter16_32[i] >> j * 8) & 0xff));
941 }
942 printf(" ");
943 }
944 printf("\n");
945 }
946}
947
948
949void print_gather_result_16_32(void)
950{
951 if (PRINT_DATA) {
952 printf("\n\nPrinting the 16_32 bit gather result\n");
953
954 for (int i = 0; i < MATRIX_SIZE; i++) {
955 for (int j = 0; j < 2; j++) {
956 printf("%c",
957 (unsigned char)((vtcm.vgather16_32[i] >> j * 8) & 0xff));
958 }
959 printf(" ");
960 }
961 printf("\n");
962 }
963}
964
965int main()
966{
967 prefill_vtcm_scratch();
968
969
970 create_offsets_values_preds_16();
971
972 vector_scatter_16();
973 print_scatter16_buffer();
974 check_scatter_16();
975
976 vector_gather_16();
977 print_gather_result_16();
978 check_gather_16();
979
980 vector_gather_16_masked();
981 print_gather_result_16();
982 check_gather_16_masked();
983
984 vector_scatter_16_acc();
985 print_scatter16_buffer();
986 check_scatter_16_acc();
987
988 vector_scatter_16_masked();
989 print_scatter16_buffer();
990 check_scatter_16_masked();
991
992
993 create_offsets_values_preds_32();
994
995 vector_scatter_32();
996 print_scatter32_buffer();
997 check_scatter_32();
998
999 vector_gather_32();
1000 print_gather_result_32();
1001 check_gather_32();
1002
1003 vector_gather_32_masked();
1004 print_gather_result_32();
1005 check_gather_32_masked();
1006
1007 vector_scatter_32_acc();
1008 print_scatter32_buffer();
1009 check_scatter_32_acc();
1010
1011 vector_scatter_32_masked();
1012 print_scatter32_buffer();
1013 check_scatter_32_masked();
1014
1015
1016 create_offsets_values_preds_16_32();
1017
1018 vector_scatter_16_32();
1019 print_scatter16_32_buffer();
1020 check_scatter_16_32();
1021
1022 vector_gather_16_32();
1023 print_gather_result_16_32();
1024 check_gather_16_32();
1025
1026 vector_gather_16_32_masked();
1027 print_gather_result_16_32();
1028 check_gather_16_32_masked();
1029
1030 vector_scatter_16_32_acc();
1031 print_scatter16_32_buffer();
1032 check_scatter_16_32_acc();
1033
1034 vector_scatter_16_32_masked();
1035 print_scatter16_32_buffer();
1036 check_scatter_16_32_masked();
1037
1038 puts(err ? "FAIL" : "PASS");
1039 return err;
1040}
1041