qemu/tests/tcg/hexagon/scatter_gather.c
/*
 *  Copyright(c) 2019-2023 Qualcomm Innovation Center, Inc. All Rights Reserved.
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

/*
 * This example tests the HVX scatter/gather instructions
 *
 * See section 5.13 of the V68 HVX Programmer's Reference
 *
 * There are 3 main classes of operations
 *     _16                 16-bit elements and 16-bit offsets
 *     _32                 32-bit elements and 32-bit offsets
 *     _16_32              16-bit elements and 32-bit offsets
 *
 * There are also masked and accumulate versions
 */

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <inttypes.h>

typedef long HVX_Vector       __attribute__((__vector_size__(128)))
                              __attribute__((aligned(128)));
typedef long HVX_VectorPair   __attribute__((__vector_size__(256)))
                              __attribute__((aligned(128)));
typedef long HVX_VectorPred   __attribute__((__vector_size__(128)))
                              __attribute__((aligned(128)));
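
/*
 * These typedefs assume the 128-byte HVX vector length: a vector pair is
 * two vectors, and the predicate type is carried as a full vector here
 * (the asm below builds Q registers from in-memory masks with vand).
 */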

int err;

/* define the number of rows/cols in a square matrix */
#define MATRIX_SIZE 64

/* define the size of the scatter buffer */
#define SCATTER_BUFFER_SIZE (MATRIX_SIZE * MATRIX_SIZE)

/* fake vtcm - put buffers together and force alignment */
static struct {
    unsigned short vscatter16[SCATTER_BUFFER_SIZE];
    unsigned short vgather16[MATRIX_SIZE];
    unsigned int   vscatter32[SCATTER_BUFFER_SIZE];
    unsigned int   vgather32[MATRIX_SIZE];
    unsigned short vscatter16_32[SCATTER_BUFFER_SIZE];
    unsigned short vgather16_32[MATRIX_SIZE];
} vtcm __attribute__((aligned(0x10000)));
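
/*
 * On Hexagon hardware the HVX scatter/gather instructions target VTCM;
 * this "fake vtcm" is simply a suitably aligned block of ordinary memory,
 * which is enough for running the test under QEMU.
 */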

/* declare the arrays of reference values */
unsigned short vscatter16_ref[SCATTER_BUFFER_SIZE];
unsigned short vgather16_ref[MATRIX_SIZE];
unsigned int   vscatter32_ref[SCATTER_BUFFER_SIZE];
unsigned int   vgather32_ref[MATRIX_SIZE];
unsigned short vscatter16_32_ref[SCATTER_BUFFER_SIZE];
unsigned short vgather16_32_ref[MATRIX_SIZE];

/* declare the arrays of offsets */
unsigned short half_offsets[MATRIX_SIZE] __attribute__((aligned(128)));
unsigned int   word_offsets[MATRIX_SIZE] __attribute__((aligned(128)));

/* declare the arrays of values */
unsigned short half_values[MATRIX_SIZE] __attribute__((aligned(128)));
unsigned short half_values_acc[MATRIX_SIZE] __attribute__((aligned(128)));
unsigned short half_values_masked[MATRIX_SIZE] __attribute__((aligned(128)));
unsigned int   word_values[MATRIX_SIZE] __attribute__((aligned(128)));
unsigned int   word_values_acc[MATRIX_SIZE] __attribute__((aligned(128)));
unsigned int   word_values_masked[MATRIX_SIZE] __attribute__((aligned(128)));

/* declare the arrays of predicates */
unsigned short half_predicates[MATRIX_SIZE] __attribute__((aligned(128)));
unsigned int   word_predicates[MATRIX_SIZE] __attribute__((aligned(128)));

/* make this big enough for all the operations */
const size_t region_len = sizeof(vtcm);

/* optionally add sync instructions */
#define SYNC_VECTOR 1

static void sync_scatter(void *addr)
{
#if SYNC_VECTOR
    /*
     * Do the scatter release followed by a dummy load to complete the
     * synchronization.  Normally the dummy load would be deferred as
     * long as possible to minimize stalls.
     */
    asm volatile("vmem(%0 + #0):scatter_release\n" : : "r"(addr));
    /* use volatile to force the load */
    volatile HVX_Vector vDummy = *(HVX_Vector *)addr; vDummy = vDummy;
#endif
}

static void sync_gather(void *addr)
{
#if SYNC_VECTOR
    /* use volatile to force the load */
    volatile HVX_Vector vDummy = *(HVX_Vector *)addr; vDummy = vDummy;
#endif
}

/* optionally print the results */
#define PRINT_DATA 0

#define FILL_CHAR       '.'

/* fill the vtcm scratch area with the FILL_CHAR pattern */
void prefill_vtcm_scratch(void)
{
    memset(&vtcm, FILL_CHAR, sizeof(vtcm));
}

/* create byte offsets to be a diagonal of the matrix with 16 bit elements */
void create_offsets_values_preds_16(void)
{
    unsigned short half_element = 0;
    unsigned short half_element_masked = 0;
    char letter = 'A';
    char letter_masked = '@';

    for (int i = 0; i < MATRIX_SIZE; i++) {
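        /*
         * Byte offset of the diagonal element (i, i): each row holds
         * MATRIX_SIZE 16-bit elements (2 * MATRIX_SIZE bytes), plus
         * 2 more bytes to step one column to the right.
         */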
        half_offsets[i] = i * (2 * MATRIX_SIZE + 2);

        half_element = 0;
        half_element_masked = 0;
        for (int j = 0; j < 2; j++) {
            half_element |= letter << j * 8;
            half_element_masked |= letter_masked << j * 8;
        }

        half_values[i] = half_element;
        half_values_acc[i] = ((i % 10) << 8) + (i % 10);
        half_values_masked[i] = half_element_masked;

        letter++;
        /* reset to 'A' */
        if (letter == 'M') {
            letter = 'A';
        }

        half_predicates[i] = (i % 3 == 0 || i % 5 == 0) ? ~0 : 0;
    }
}

/* create byte offsets to be a diagonal of the matrix with 32 bit elements */
void create_offsets_values_preds_32(void)
{
    unsigned int word_element = 0;
    unsigned int word_element_masked = 0;
    char letter = 'A';
    char letter_masked = '&';

    for (int i = 0; i < MATRIX_SIZE; i++) {
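        /* byte offset of diagonal element (i, i) of a matrix of 32-bit elements */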
        word_offsets[i] = i * (4 * MATRIX_SIZE + 4);

        word_element = 0;
        word_element_masked = 0;
        for (int j = 0; j < 4; j++) {
            word_element |= letter << j * 8;
            word_element_masked |= letter_masked << j * 8;
        }

        word_values[i] = word_element;
        word_values_acc[i] = ((i % 10) << 8) + (i % 10);
        word_values_masked[i] = word_element_masked;

        letter++;
        /* reset to 'A' */
        if (letter == 'M') {
            letter = 'A';
        }

        word_predicates[i] = (i % 4 == 0 || i % 7 == 0) ? ~0 : 0;
    }
}

/*
 * create byte offsets to be a diagonal of the matrix with 16 bit elements
 * and 32 bit offsets
 */
void create_offsets_values_preds_16_32(void)
{
    unsigned short half_element = 0;
    unsigned short half_element_masked = 0;
    char letter = 'D';
    char letter_masked = '$';

    for (int i = 0; i < MATRIX_SIZE; i++) {
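        /*
         * Same 16-bit element diagonal as the _16 case, but the byte
         * offsets are stored as 32-bit values for the 16_32 forms.
         */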
        word_offsets[i] = i * (2 * MATRIX_SIZE + 2);

        half_element = 0;
        half_element_masked = 0;
        for (int j = 0; j < 2; j++) {
            half_element |= letter << j * 8;
            half_element_masked |= letter_masked << j * 8;
        }

        half_values[i] = half_element;
        half_values_acc[i] = ((i % 10) << 8) + (i % 10);
        half_values_masked[i] = half_element_masked;

        letter++;
        /* reset to 'D' */
        if (letter == 'P') {
            letter = 'D';
        }

        half_predicates[i] = (i % 2 == 0 || i % 13 == 0) ? ~0 : 0;
    }
}

/* scatter the 16 bit elements using HVX */
void vector_scatter_16(void)
{
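    /*
     * Operands: %0 is the base of the scatter region, %1 the region length
     * (loaded into m0), %2 the vector of byte offsets and %3 the vector of
     * 16-bit values to store.
     */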
    asm ("m0 = %1\n\t"
         "v0 = vmem(%2 + #0)\n\t"
         "v1 = vmem(%3 + #0)\n\t"
         "vscatter(%0, m0, v0.h).h = v1\n\t"
         : : "r"(vtcm.vscatter16), "r"(region_len),
             "r"(half_offsets), "r"(half_values)
         : "m0", "v0", "v1", "memory");

    sync_scatter(vtcm.vscatter16);
}

/* scatter-accumulate the 16 bit elements using HVX */
void vector_scatter_16_acc(void)
{
    asm ("m0 = %1\n\t"
         "v0 = vmem(%2 + #0)\n\t"
         "v1 = vmem(%3 + #0)\n\t"
         "vscatter(%0, m0, v0.h).h += v1\n\t"
         : : "r"(vtcm.vscatter16), "r"(region_len),
             "r"(half_offsets), "r"(half_values_acc)
         : "m0", "v0", "v1", "memory");

    sync_scatter(vtcm.vscatter16);
}

/* masked scatter the 16 bit elements using HVX */
void vector_scatter_16_masked(void)
{
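    /*
     * Build the Q predicate from the in-memory mask: with r1 = -1,
     * vand sets a lane's predicate bits wherever the corresponding
     * mask element is non-zero.
     */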
    asm ("r1 = #-1\n\t"
         "v0 = vmem(%0 + #0)\n\t"
         "q0 = vand(v0, r1)\n\t"
         "m0 = %2\n\t"
         "v0 = vmem(%3 + #0)\n\t"
         "v1 = vmem(%4 + #0)\n\t"
         "if (q0) vscatter(%1, m0, v0.h).h = v1\n\t"
         : : "r"(half_predicates), "r"(vtcm.vscatter16), "r"(region_len),
             "r"(half_offsets), "r"(half_values_masked)
         : "r1", "q0", "m0", "v0", "v1", "memory");

    sync_scatter(vtcm.vscatter16);
}

/* scatter the 32 bit elements using HVX */
void vector_scatter_32(void)
{
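    /*
     * MATRIX_SIZE (64) 32-bit offsets and values occupy two 128-byte
     * vectors each, so the scatter is issued once for the low half and
     * once for the high half.
     */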
    HVX_Vector *offsetslo = (HVX_Vector *)word_offsets;
    HVX_Vector *offsetshi = (HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
    HVX_Vector *valueslo = (HVX_Vector *)word_values;
    HVX_Vector *valueshi = (HVX_Vector *)&word_values[MATRIX_SIZE / 2];

    asm ("m0 = %1\n\t"
         "v0 = vmem(%2 + #0)\n\t"
         "v1 = vmem(%3 + #0)\n\t"
         "vscatter(%0, m0, v0.w).w = v1\n\t"
         : : "r"(vtcm.vscatter32), "r"(region_len),
             "r"(offsetslo), "r"(valueslo)
         : "m0", "v0", "v1", "memory");
    asm ("m0 = %1\n\t"
         "v0 = vmem(%2 + #0)\n\t"
         "v1 = vmem(%3 + #0)\n\t"
         "vscatter(%0, m0, v0.w).w = v1\n\t"
         : : "r"(vtcm.vscatter32), "r"(region_len),
             "r"(offsetshi), "r"(valueshi)
         : "m0", "v0", "v1", "memory");

    sync_scatter(vtcm.vscatter32);
}

/* scatter-accumulate the 32 bit elements using HVX */
void vector_scatter_32_acc(void)
{
    HVX_Vector *offsetslo = (HVX_Vector *)word_offsets;
    HVX_Vector *offsetshi = (HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
    HVX_Vector *valueslo = (HVX_Vector *)word_values_acc;
    HVX_Vector *valueshi = (HVX_Vector *)&word_values_acc[MATRIX_SIZE / 2];

    asm ("m0 = %1\n\t"
         "v0 = vmem(%2 + #0)\n\t"
         "v1 = vmem(%3 + #0)\n\t"
         "vscatter(%0, m0, v0.w).w += v1\n\t"
         : : "r"(vtcm.vscatter32), "r"(region_len),
             "r"(offsetslo), "r"(valueslo)
         : "m0", "v0", "v1", "memory");
    asm ("m0 = %1\n\t"
         "v0 = vmem(%2 + #0)\n\t"
         "v1 = vmem(%3 + #0)\n\t"
         "vscatter(%0, m0, v0.w).w += v1\n\t"
         : : "r"(vtcm.vscatter32), "r"(region_len),
             "r"(offsetshi), "r"(valueshi)
         : "m0", "v0", "v1", "memory");

    sync_scatter(vtcm.vscatter32);
}

/* masked scatter the 32 bit elements using HVX */
void vector_scatter_32_masked(void)
{
    HVX_Vector *offsetslo = (HVX_Vector *)word_offsets;
    HVX_Vector *offsetshi = (HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
    HVX_Vector *valueslo = (HVX_Vector *)word_values_masked;
    HVX_Vector *valueshi = (HVX_Vector *)&word_values_masked[MATRIX_SIZE / 2];
    HVX_Vector *predslo = (HVX_Vector *)word_predicates;
    HVX_Vector *predshi = (HVX_Vector *)&word_predicates[MATRIX_SIZE / 2];

    asm ("r1 = #-1\n\t"
         "v0 = vmem(%0 + #0)\n\t"
         "q0 = vand(v0, r1)\n\t"
         "m0 = %2\n\t"
         "v0 = vmem(%3 + #0)\n\t"
         "v1 = vmem(%4 + #0)\n\t"
         "if (q0) vscatter(%1, m0, v0.w).w = v1\n\t"
         : : "r"(predslo), "r"(vtcm.vscatter32), "r"(region_len),
             "r"(offsetslo), "r"(valueslo)
         : "r1", "q0", "m0", "v0", "v1", "memory");
    asm ("r1 = #-1\n\t"
         "v0 = vmem(%0 + #0)\n\t"
         "q0 = vand(v0, r1)\n\t"
         "m0 = %2\n\t"
         "v0 = vmem(%3 + #0)\n\t"
         "v1 = vmem(%4 + #0)\n\t"
         "if (q0) vscatter(%1, m0, v0.w).w = v1\n\t"
         : : "r"(predshi), "r"(vtcm.vscatter32), "r"(region_len),
             "r"(offsetshi), "r"(valueshi)
         : "r1", "q0", "m0", "v0", "v1", "memory");

    sync_scatter(vtcm.vscatter32);
}

/* scatter the 16 bit elements with 32 bit offsets using HVX */
void vector_scatter_16_32(void)
{
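    /*
     * The 32-bit offsets occupy a vector pair (v1:0).  The 16-bit values
     * are pre-shuffled with vshuff because the 16_32 scatter consumes the
     * halfword lanes in shuffled order relative to the offset pair; the
     * matching gather below undoes this with vdeal.
     */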
    asm ("m0 = %1\n\t"
         "v0 = vmem(%2 + #0)\n\t"
         "v1 = vmem(%2 + #1)\n\t"
         "v2 = vmem(%3 + #0)\n\t"
         "v2.h = vshuff(v2.h)\n\t"  /* shuffle the values for the scatter */
         "vscatter(%0, m0, v1:0.w).h = v2\n\t"
         : : "r"(vtcm.vscatter16_32), "r"(region_len),
             "r"(word_offsets), "r"(half_values)
         : "m0", "v0", "v1", "v2", "memory");

    sync_scatter(vtcm.vscatter16_32);
}

/* scatter-accumulate the 16 bit elements with 32 bit offsets using HVX */
void vector_scatter_16_32_acc(void)
{
    asm ("m0 = %1\n\t"
         "v0 = vmem(%2 + #0)\n\t"
         "v1 = vmem(%2 + #1)\n\t"
         "v2 = vmem(%3 + #0)\n\t"
         "v2.h = vshuff(v2.h)\n\t"  /* shuffle the values for the scatter */
         "vscatter(%0, m0, v1:0.w).h += v2\n\t"
         : : "r"(vtcm.vscatter16_32), "r"(region_len),
             "r"(word_offsets), "r"(half_values_acc)
         : "m0", "v0", "v1", "v2", "memory");

    sync_scatter(vtcm.vscatter16_32);
}

/* masked scatter the 16 bit elements with 32 bit offsets using HVX */
void vector_scatter_16_32_masked(void)
{
    asm ("r1 = #-1\n\t"
         "v0 = vmem(%0 + #0)\n\t"
         "v0.h = vshuff(v0.h)\n\t"  /* shuffle the predicates */
         "q0 = vand(v0, r1)\n\t"
         "m0 = %2\n\t"
         "v0 = vmem(%3 + #0)\n\t"
         "v1 = vmem(%3 + #1)\n\t"
         "v2 = vmem(%4 + #0)\n\t"
         "v2.h = vshuff(v2.h)\n\t"  /* shuffle the values for the scatter */
         "if (q0) vscatter(%1, m0, v1:0.w).h = v2\n\t"
         : : "r"(half_predicates), "r"(vtcm.vscatter16_32), "r"(region_len),
             "r"(word_offsets), "r"(half_values_masked)
         : "r1", "q0", "m0", "v0", "v1", "v2", "memory");

    sync_scatter(vtcm.vscatter16_32);
}

/* gather the elements from the scatter16 buffer using HVX */
void vector_gather_16(void)
{
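    /*
     * vgather deposits the loaded elements into vtmp, and the vmem store
     * of vtmp.new in the same packet writes the gathered data out to the
     * destination buffer.
     */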
    asm ("m0 = %1\n\t"
         "v0 = vmem(%2 + #0)\n\t"
         "{ vtmp.h = vgather(%0, m0, v0.h).h\n\t"
         "  vmem(%3 + #0) = vtmp.new }\n\t"
         : : "r"(vtcm.vscatter16), "r"(region_len),
             "r"(half_offsets), "r"(vtcm.vgather16)
         : "m0", "v0", "memory");

    sync_gather(vtcm.vgather16);
}

static unsigned short gather_16_masked_init(void)
{
    char letter = '?';
    return letter | (letter << 8);
}

/* masked gather the elements from the scatter16 buffer using HVX */
void vector_gather_16_masked(void)
{
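    /*
     * Pre-fill the destination with the init pattern so that lanes whose
     * predicate is false keep a known value for check_gather_16_masked().
     */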
    unsigned short init = gather_16_masked_init();

    asm ("v0.h = vsplat(%5)\n\t"
         "vmem(%4 + #0) = v0\n\t"  /* initialize the write area */
         "r1 = #-1\n\t"
         "v0 = vmem(%0 + #0)\n\t"
         "q0 = vand(v0, r1)\n\t"
         "m0 = %2\n\t"
         "v0 = vmem(%3 + #0)\n\t"
         "{ if (q0) vtmp.h = vgather(%1, m0, v0.h).h\n\t"
         "  vmem(%4 + #0) = vtmp.new }\n\t"
         : : "r"(half_predicates), "r"(vtcm.vscatter16), "r"(region_len),
             "r"(half_offsets), "r"(vtcm.vgather16), "r"(init)
         : "r1", "q0", "m0", "v0", "memory");

    sync_gather(vtcm.vgather16);
}

/* gather the elements from the scatter32 buffer using HVX */
void vector_gather_32(void)
{
    HVX_Vector *vgatherlo = (HVX_Vector *)vtcm.vgather32;
    HVX_Vector *vgatherhi = (HVX_Vector *)&vtcm.vgather32[MATRIX_SIZE / 2];
    HVX_Vector *offsetslo = (HVX_Vector *)word_offsets;
    HVX_Vector *offsetshi = (HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];

    asm ("m0 = %1\n\t"
         "v0 = vmem(%2 + #0)\n\t"
         "{ vtmp.w = vgather(%0, m0, v0.w).w\n\t"
         "  vmem(%3 + #0) = vtmp.new }\n\t"
         : : "r"(vtcm.vscatter32), "r"(region_len),
             "r"(offsetslo), "r"(vgatherlo)
         : "m0", "v0", "memory");
    asm ("m0 = %1\n\t"
         "v0 = vmem(%2 + #0)\n\t"
         "{ vtmp.w = vgather(%0, m0, v0.w).w\n\t"
         "  vmem(%3 + #0) = vtmp.new }\n\t"
         : : "r"(vtcm.vscatter32), "r"(region_len),
             "r"(offsetshi), "r"(vgatherhi)
         : "m0", "v0", "memory");

    sync_gather(vgatherlo);
    sync_gather(vgatherhi);
}

static unsigned int gather_32_masked_init(void)
{
    char letter = '?';
    return letter | (letter << 8) | (letter << 16) | (letter << 24);
}

/* masked gather the elements from the scatter32 buffer using HVX */
void vector_gather_32_masked(void)
{
    unsigned int init = gather_32_masked_init();
    HVX_Vector *vgatherlo = (HVX_Vector *)vtcm.vgather32;
    HVX_Vector *vgatherhi = (HVX_Vector *)&vtcm.vgather32[MATRIX_SIZE / 2];
    HVX_Vector *offsetslo = (HVX_Vector *)word_offsets;
    HVX_Vector *offsetshi = (HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
    HVX_Vector *predslo = (HVX_Vector *)word_predicates;
    HVX_Vector *predshi = (HVX_Vector *)&word_predicates[MATRIX_SIZE / 2];

    asm ("v0.h = vsplat(%5)\n\t"
         "vmem(%4 + #0) = v0\n\t"  /* initialize the write area */
         "r1 = #-1\n\t"
         "v0 = vmem(%0 + #0)\n\t"
         "q0 = vand(v0, r1)\n\t"
         "m0 = %2\n\t"
         "v0 = vmem(%3 + #0)\n\t"
         "{ if (q0) vtmp.w = vgather(%1, m0, v0.w).w\n\t"
         "  vmem(%4 + #0) = vtmp.new }\n\t"
         : : "r"(predslo), "r"(vtcm.vscatter32), "r"(region_len),
             "r"(offsetslo), "r"(vgatherlo), "r"(init)
         : "r1", "q0", "m0", "v0", "memory");
    asm ("v0.h = vsplat(%5)\n\t"
         "vmem(%4 + #0) = v0\n\t"  /* initialize the write area */
         "r1 = #-1\n\t"
         "v0 = vmem(%0 + #0)\n\t"
         "q0 = vand(v0, r1)\n\t"
         "m0 = %2\n\t"
         "v0 = vmem(%3 + #0)\n\t"
         "{ if (q0) vtmp.w = vgather(%1, m0, v0.w).w\n\t"
         "  vmem(%4 + #0) = vtmp.new }\n\t"
         : : "r"(predshi), "r"(vtcm.vscatter32), "r"(region_len),
             "r"(offsetshi), "r"(vgatherhi), "r"(init)
         : "r1", "q0", "m0", "v0", "memory");

    sync_gather(vgatherlo);
    sync_gather(vgatherhi);
}

/* gather the elements from the scatter16_32 buffer using HVX */
void vector_gather_16_32(void)
{
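    /*
     * The gathered halfwords come back in shuffled order, so read the
     * result back, vdeal it into element order, and store it again.
     */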
    asm ("m0 = %1\n\t"
         "v0 = vmem(%2 + #0)\n\t"
         "v1 = vmem(%2 + #1)\n\t"
         "{ vtmp.h = vgather(%0, m0, v1:0.w).h\n\t"
         "  vmem(%3 + #0) = vtmp.new }\n\t"
         "v0 = vmem(%3 + #0)\n\t"
         "v0.h = vdeal(v0.h)\n\t"  /* deal the elements to get the order back */
         "vmem(%3 + #0) = v0\n\t"
         : : "r"(vtcm.vscatter16_32), "r"(region_len),
             "r"(word_offsets), "r"(vtcm.vgather16_32)
         : "m0", "v0", "v1", "memory");

    sync_gather(vtcm.vgather16_32);
}

/* masked gather the elements from the scatter16_32 buffer using HVX */
void vector_gather_16_32_masked(void)
{
    unsigned short init = gather_16_masked_init();

    asm ("v0.h = vsplat(%5)\n\t"
         "vmem(%4 + #0) = v0\n\t"  /* initialize the write area */
         "r1 = #-1\n\t"
         "v0 = vmem(%0 + #0)\n\t"
         "v0.h = vshuff(v0.h)\n\t"  /* shuffle the predicates */
         "q0 = vand(v0, r1)\n\t"
         "m0 = %2\n\t"
         "v0 = vmem(%3 + #0)\n\t"
         "v1 = vmem(%3 + #1)\n\t"
         "{ if (q0) vtmp.h = vgather(%1, m0, v1:0.w).h\n\t"
         "  vmem(%4 + #0) = vtmp.new }\n\t"
         "v0 = vmem(%4 + #0)\n\t"
         "v0.h = vdeal(v0.h)\n\t"  /* deal the elements to get the order back */
         "vmem(%4 + #0) = v0\n\t"
         : : "r"(half_predicates), "r"(vtcm.vscatter16_32), "r"(region_len),
             "r"(word_offsets), "r"(vtcm.vgather16_32), "r"(init)
         : "r1", "q0", "m0", "v0", "v1", "memory");

    sync_gather(vtcm.vgather16_32);
}

static void check_buffer(const char *name, void *c, void *r, size_t size)
{
    char *check = (char *)c;
    char *ref = (char *)r;
    for (int i = 0; i < size; i++) {
        if (check[i] != ref[i]) {
            printf("ERROR %s [%d]: 0x%x (%c) != 0x%x (%c)\n", name, i,
                   check[i], check[i], ref[i], ref[i]);
            err++;
        }
    }
}

/*
 * These scalar functions are the C equivalents of the vector functions that
 * use HVX
 */
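
/*
 * The offset arrays hold byte offsets, so the scalar versions divide by the
 * element size to index the reference buffers.  Each check_* helper rebuilds
 * its reference by replaying, in order, every scalar operation that main()
 * has applied to the corresponding vtcm buffer so far, then compares the
 * buffers byte by byte.
 */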

/* scatter the 16 bit elements using C */
void scalar_scatter_16(unsigned short *vscatter16)
{
    for (int i = 0; i < MATRIX_SIZE; ++i) {
        vscatter16[half_offsets[i] / 2] = half_values[i];
    }
}

void check_scatter_16(void)
{
    memset(vscatter16_ref, FILL_CHAR,
           SCATTER_BUFFER_SIZE * sizeof(unsigned short));
    scalar_scatter_16(vscatter16_ref);
    check_buffer(__func__, vtcm.vscatter16, vscatter16_ref,
                 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
}

/* scatter-accumulate the 16 bit elements using C */
void scalar_scatter_16_acc(unsigned short *vscatter16)
{
    for (int i = 0; i < MATRIX_SIZE; ++i) {
        vscatter16[half_offsets[i] / 2] += half_values_acc[i];
    }
}

void check_scatter_16_acc(void)
{
    memset(vscatter16_ref, FILL_CHAR,
           SCATTER_BUFFER_SIZE * sizeof(unsigned short));
    scalar_scatter_16(vscatter16_ref);
    scalar_scatter_16_acc(vscatter16_ref);
    check_buffer(__func__, vtcm.vscatter16, vscatter16_ref,
                 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
}

/* masked scatter the 16 bit elements using C */
void scalar_scatter_16_masked(unsigned short *vscatter16)
{
    for (int i = 0; i < MATRIX_SIZE; i++) {
        if (half_predicates[i]) {
            vscatter16[half_offsets[i] / 2] = half_values_masked[i];
        }
    }
}

void check_scatter_16_masked(void)
{
    memset(vscatter16_ref, FILL_CHAR,
           SCATTER_BUFFER_SIZE * sizeof(unsigned short));
    scalar_scatter_16(vscatter16_ref);
    scalar_scatter_16_acc(vscatter16_ref);
    scalar_scatter_16_masked(vscatter16_ref);
    check_buffer(__func__, vtcm.vscatter16, vscatter16_ref,
                 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
}

/* scatter the 32 bit elements using C */
void scalar_scatter_32(unsigned int *vscatter32)
{
    for (int i = 0; i < MATRIX_SIZE; ++i) {
        vscatter32[word_offsets[i] / 4] = word_values[i];
    }
}

void check_scatter_32(void)
{
    memset(vscatter32_ref, FILL_CHAR,
           SCATTER_BUFFER_SIZE * sizeof(unsigned int));
    scalar_scatter_32(vscatter32_ref);
    check_buffer(__func__, vtcm.vscatter32, vscatter32_ref,
                 SCATTER_BUFFER_SIZE * sizeof(unsigned int));
}

/* scatter-accumulate the 32 bit elements using C */
void scalar_scatter_32_acc(unsigned int *vscatter32)
{
    for (int i = 0; i < MATRIX_SIZE; ++i) {
        vscatter32[word_offsets[i] / 4] += word_values_acc[i];
    }
}

void check_scatter_32_acc(void)
{
    memset(vscatter32_ref, FILL_CHAR,
           SCATTER_BUFFER_SIZE * sizeof(unsigned int));
    scalar_scatter_32(vscatter32_ref);
    scalar_scatter_32_acc(vscatter32_ref);
    check_buffer(__func__, vtcm.vscatter32, vscatter32_ref,
                 SCATTER_BUFFER_SIZE * sizeof(unsigned int));
}

/* masked scatter the 32 bit elements using C */
void scalar_scatter_32_masked(unsigned int *vscatter32)
{
    for (int i = 0; i < MATRIX_SIZE; i++) {
        if (word_predicates[i]) {
            vscatter32[word_offsets[i] / 4] = word_values_masked[i];
        }
    }
}

void check_scatter_32_masked(void)
{
    memset(vscatter32_ref, FILL_CHAR,
           SCATTER_BUFFER_SIZE * sizeof(unsigned int));
    scalar_scatter_32(vscatter32_ref);
    scalar_scatter_32_acc(vscatter32_ref);
    scalar_scatter_32_masked(vscatter32_ref);
    check_buffer(__func__, vtcm.vscatter32, vscatter32_ref,
                 SCATTER_BUFFER_SIZE * sizeof(unsigned int));
}

/* scatter the 16 bit elements with 32 bit offsets using C */
void scalar_scatter_16_32(unsigned short *vscatter16_32)
{
    for (int i = 0; i < MATRIX_SIZE; ++i) {
        vscatter16_32[word_offsets[i] / 2] = half_values[i];
    }
}

void check_scatter_16_32(void)
{
    memset(vscatter16_32_ref, FILL_CHAR,
           SCATTER_BUFFER_SIZE * sizeof(unsigned short));
    scalar_scatter_16_32(vscatter16_32_ref);
    check_buffer(__func__, vtcm.vscatter16_32, vscatter16_32_ref,
                 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
}

/* scatter-accumulate the 16 bit elements with 32 bit offsets using C */
void scalar_scatter_16_32_acc(unsigned short *vscatter16_32)
{
    for (int i = 0; i < MATRIX_SIZE; ++i) {
        vscatter16_32[word_offsets[i] / 2] += half_values_acc[i];
    }
}

void check_scatter_16_32_acc(void)
{
    memset(vscatter16_32_ref, FILL_CHAR,
           SCATTER_BUFFER_SIZE * sizeof(unsigned short));
    scalar_scatter_16_32(vscatter16_32_ref);
    scalar_scatter_16_32_acc(vscatter16_32_ref);
    check_buffer(__func__, vtcm.vscatter16_32, vscatter16_32_ref,
                 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
}

/* masked scatter the 16 bit elements with 32 bit offsets using C */
void scalar_scatter_16_32_masked(unsigned short *vscatter16_32)
{
    for (int i = 0; i < MATRIX_SIZE; i++) {
        if (half_predicates[i]) {
            vscatter16_32[word_offsets[i] / 2] = half_values_masked[i];
        }
    }
}

void check_scatter_16_32_masked(void)
{
    memset(vscatter16_32_ref, FILL_CHAR,
           SCATTER_BUFFER_SIZE * sizeof(unsigned short));
    scalar_scatter_16_32(vscatter16_32_ref);
    scalar_scatter_16_32_acc(vscatter16_32_ref);
    scalar_scatter_16_32_masked(vscatter16_32_ref);
    check_buffer(__func__, vtcm.vscatter16_32, vscatter16_32_ref,
                 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
}

/* gather the elements from the scatter buffer using C */
void scalar_gather_16(unsigned short *vgather16)
{
    for (int i = 0; i < MATRIX_SIZE; ++i) {
        vgather16[i] = vtcm.vscatter16[half_offsets[i] / 2];
    }
}

void check_gather_16(void)
{
    memset(vgather16_ref, 0, MATRIX_SIZE * sizeof(unsigned short));
    scalar_gather_16(vgather16_ref);
    check_buffer(__func__, vtcm.vgather16, vgather16_ref,
                 MATRIX_SIZE * sizeof(unsigned short));
}

/* masked gather the elements from the scatter buffer using C */
void scalar_gather_16_masked(unsigned short *vgather16)
{
    for (int i = 0; i < MATRIX_SIZE; ++i) {
        if (half_predicates[i]) {
            vgather16[i] = vtcm.vscatter16[half_offsets[i] / 2];
        }
    }
}

void check_gather_16_masked(void)
{
    memset(vgather16_ref, gather_16_masked_init(),
           MATRIX_SIZE * sizeof(unsigned short));
    scalar_gather_16_masked(vgather16_ref);
    check_buffer(__func__, vtcm.vgather16, vgather16_ref,
                 MATRIX_SIZE * sizeof(unsigned short));
}

/* gather the elements from the scatter32 buffer using C */
void scalar_gather_32(unsigned int *vgather32)
{
    for (int i = 0; i < MATRIX_SIZE; ++i) {
        vgather32[i] = vtcm.vscatter32[word_offsets[i] / 4];
    }
}

void check_gather_32(void)
{
    memset(vgather32_ref, 0, MATRIX_SIZE * sizeof(unsigned int));
    scalar_gather_32(vgather32_ref);
    check_buffer(__func__, vtcm.vgather32, vgather32_ref,
                 MATRIX_SIZE * sizeof(unsigned int));
}

/* masked gather the elements from the scatter32 buffer using C */
void scalar_gather_32_masked(unsigned int *vgather32)
{
    for (int i = 0; i < MATRIX_SIZE; ++i) {
        if (word_predicates[i]) {
            vgather32[i] = vtcm.vscatter32[word_offsets[i] / 4];
        }
    }
}

void check_gather_32_masked(void)
{
    memset(vgather32_ref, gather_32_masked_init(),
           MATRIX_SIZE * sizeof(unsigned int));
    scalar_gather_32_masked(vgather32_ref);
    check_buffer(__func__, vtcm.vgather32,
                 vgather32_ref, MATRIX_SIZE * sizeof(unsigned int));
}

/* gather the elements from the scatter16_32 buffer using C */
void scalar_gather_16_32(unsigned short *vgather16_32)
{
    for (int i = 0; i < MATRIX_SIZE; ++i) {
        vgather16_32[i] = vtcm.vscatter16_32[word_offsets[i] / 2];
    }
}

void check_gather_16_32(void)
{
    memset(vgather16_32_ref, 0, MATRIX_SIZE * sizeof(unsigned short));
    scalar_gather_16_32(vgather16_32_ref);
    check_buffer(__func__, vtcm.vgather16_32, vgather16_32_ref,
                 MATRIX_SIZE * sizeof(unsigned short));
}

/* masked gather the elements from the scatter16_32 buffer using C */
void scalar_gather_16_32_masked(unsigned short *vgather16_32)
{
    for (int i = 0; i < MATRIX_SIZE; ++i) {
        if (half_predicates[i]) {
            vgather16_32[i] = vtcm.vscatter16_32[word_offsets[i] / 2];
        }
    }
}

void check_gather_16_32_masked(void)
{
    memset(vgather16_32_ref, gather_16_masked_init(),
           MATRIX_SIZE * sizeof(unsigned short));
    scalar_gather_16_32_masked(vgather16_32_ref);
    check_buffer(__func__, vtcm.vgather16_32, vgather16_32_ref,
                 MATRIX_SIZE * sizeof(unsigned short));
}

/* print scatter16 buffer */
void print_scatter16_buffer(void)
{
    if (PRINT_DATA) {
        printf("\n\nPrinting the 16 bit scatter buffer");

        for (int i = 0; i < SCATTER_BUFFER_SIZE; i++) {
            if ((i % MATRIX_SIZE) == 0) {
                printf("\n");
            }
            for (int j = 0; j < 2; j++) {
                printf("%c", (char)((vtcm.vscatter16[i] >> j * 8) & 0xff));
            }
            printf(" ");
        }
        printf("\n");
    }
}

/* print the gather 16 buffer */
void print_gather_result_16(void)
{
    if (PRINT_DATA) {
        printf("\n\nPrinting the 16 bit gather result\n");

        for (int i = 0; i < MATRIX_SIZE; i++) {
            for (int j = 0; j < 2; j++) {
                printf("%c", (char)((vtcm.vgather16[i] >> j * 8) & 0xff));
            }
            printf(" ");
        }
        printf("\n");
    }
}

/* print the scatter32 buffer */
void print_scatter32_buffer(void)
{
    if (PRINT_DATA) {
        printf("\n\nPrinting the 32 bit scatter buffer");

        for (int i = 0; i < SCATTER_BUFFER_SIZE; i++) {
            if ((i % MATRIX_SIZE) == 0) {
                printf("\n");
            }
            for (int j = 0; j < 4; j++) {
                printf("%c", (char)((vtcm.vscatter32[i] >> j * 8) & 0xff));
            }
            printf(" ");
        }
        printf("\n");
    }
}

/* print the gather 32 buffer */
void print_gather_result_32(void)
{
    if (PRINT_DATA) {
        printf("\n\nPrinting the 32 bit gather result\n");

        for (int i = 0; i < MATRIX_SIZE; i++) {
            for (int j = 0; j < 4; j++) {
                printf("%c", (char)((vtcm.vgather32[i] >> j * 8) & 0xff));
            }
            printf(" ");
        }
        printf("\n");
    }
}

/* print the scatter16_32 buffer */
void print_scatter16_32_buffer(void)
{
    if (PRINT_DATA) {
        printf("\n\nPrinting the 16_32 bit scatter buffer");

        for (int i = 0; i < SCATTER_BUFFER_SIZE; i++) {
            if ((i % MATRIX_SIZE) == 0) {
                printf("\n");
            }
            for (int j = 0; j < 2; j++) {
                printf("%c",
                       (unsigned char)((vtcm.vscatter16_32[i] >> j * 8) & 0xff));
            }
            printf(" ");
        }
        printf("\n");
    }
}

/* print the gather 16_32 buffer */
void print_gather_result_16_32(void)
{
    if (PRINT_DATA) {
        printf("\n\nPrinting the 16_32 bit gather result\n");

        for (int i = 0; i < MATRIX_SIZE; i++) {
            for (int j = 0; j < 2; j++) {
                printf("%c",
                       (unsigned char)((vtcm.vgather16_32[i] >> j * 8) & 0xff));
            }
            printf(" ");
        }
        printf("\n");
    }
}

int main()
{
    prefill_vtcm_scratch();

    /* 16 bit elements with 16 bit offsets */
    create_offsets_values_preds_16();

    vector_scatter_16();
    print_scatter16_buffer();
    check_scatter_16();

    vector_gather_16();
    print_gather_result_16();
    check_gather_16();

    vector_gather_16_masked();
    print_gather_result_16();
    check_gather_16_masked();

    vector_scatter_16_acc();
    print_scatter16_buffer();
    check_scatter_16_acc();

    vector_scatter_16_masked();
    print_scatter16_buffer();
    check_scatter_16_masked();

    /* 32 bit elements with 32 bit offsets */
    create_offsets_values_preds_32();

    vector_scatter_32();
    print_scatter32_buffer();
    check_scatter_32();

    vector_gather_32();
    print_gather_result_32();
    check_gather_32();

    vector_gather_32_masked();
    print_gather_result_32();
    check_gather_32_masked();

    vector_scatter_32_acc();
    print_scatter32_buffer();
    check_scatter_32_acc();

    vector_scatter_32_masked();
    print_scatter32_buffer();
    check_scatter_32_masked();

    /* 16 bit elements with 32 bit offsets */
    create_offsets_values_preds_16_32();

    vector_scatter_16_32();
    print_scatter16_32_buffer();
    check_scatter_16_32();

    vector_gather_16_32();
    print_gather_result_16_32();
    check_gather_16_32();

    vector_gather_16_32_masked();
    print_gather_result_16_32();
    check_gather_16_32_masked();

    vector_scatter_16_32_acc();
    print_scatter16_32_buffer();
    check_scatter_16_32_acc();

    vector_scatter_16_32_masked();
    print_scatter16_32_buffer();
    check_scatter_16_32_masked();

    puts(err ? "FAIL" : "PASS");
    return err;
}