qemu/tests/tcg/hexagon/scatter_gather.c
<<
>>
Prefs
   1/*
   2 *  Copyright(c) 2019-2021 Qualcomm Innovation Center, Inc. All Rights Reserved.
   3 *
   4 *  This program is free software; you can redistribute it and/or modify
   5 *  it under the terms of the GNU General Public License as published by
   6 *  the Free Software Foundation; either version 2 of the License, or
   7 *  (at your option) any later version.
   8 *
   9 *  This program is distributed in the hope that it will be useful,
  10 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 *  GNU General Public License for more details.
  13 *
  14 *  You should have received a copy of the GNU General Public License
  15 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
  16 */
  17
  18/*
  19 * This example tests the HVX scatter/gather instructions
  20 *
  21 * See section 5.13 of the V68 HVX Programmer's Reference
  22 *
 * There are 3 main classes of operations
  24 *     _16                 16-bit elements and 16-bit offsets
  25 *     _32                 32-bit elements and 32-bit offsets
  26 *     _16_32              16-bit elements and 32-bit offsets
  27 *
  28 * There are also masked and accumulate versions
  29 */
  30
  31#include <stdio.h>
  32#include <string.h>
  33#include <stdlib.h>
  34#include <inttypes.h>
  35
/*
 * HVX vector types used with the *_128B builtins below.  Each one is a
 * compiler vector of longs aligned to 128 bytes:
 *     HVX_Vector      128 bytes
 *     HVX_VectorPair  256 bytes
 *     HVX_VectorPred  128 bytes, used for the MASK arguments
 */
typedef long HVX_Vector       __attribute__((__vector_size__(128)))
                              __attribute__((aligned(128)));
typedef long HVX_VectorPair   __attribute__((__vector_size__(256)))
                              __attribute__((aligned(128)));
typedef long HVX_VectorPred   __attribute__((__vector_size__(128)))
                              __attribute__((aligned(128)));
  42
  43#define VSCATTER_16(BASE, RGN, OFF, VALS) \
  44    __builtin_HEXAGON_V6_vscattermh_128B((int)BASE, RGN, OFF, VALS)
  45#define VSCATTER_16_MASKED(MASK, BASE, RGN, OFF, VALS) \
  46    __builtin_HEXAGON_V6_vscattermhq_128B(MASK, (int)BASE, RGN, OFF, VALS)
  47#define VSCATTER_32(BASE, RGN, OFF, VALS) \
  48    __builtin_HEXAGON_V6_vscattermw_128B((int)BASE, RGN, OFF, VALS)
  49#define VSCATTER_32_MASKED(MASK, BASE, RGN, OFF, VALS) \
  50    __builtin_HEXAGON_V6_vscattermwq_128B(MASK, (int)BASE, RGN, OFF, VALS)
  51#define VSCATTER_16_32(BASE, RGN, OFF, VALS) \
  52    __builtin_HEXAGON_V6_vscattermhw_128B((int)BASE, RGN, OFF, VALS)
  53#define VSCATTER_16_32_MASKED(MASK, BASE, RGN, OFF, VALS) \
  54    __builtin_HEXAGON_V6_vscattermhwq_128B(MASK, (int)BASE, RGN, OFF, VALS)
  55#define VSCATTER_16_ACC(BASE, RGN, OFF, VALS) \
  56    __builtin_HEXAGON_V6_vscattermh_add_128B((int)BASE, RGN, OFF, VALS)
  57#define VSCATTER_32_ACC(BASE, RGN, OFF, VALS) \
  58    __builtin_HEXAGON_V6_vscattermw_add_128B((int)BASE, RGN, OFF, VALS)
  59#define VSCATTER_16_32_ACC(BASE, RGN, OFF, VALS) \
  60    __builtin_HEXAGON_V6_vscattermhw_add_128B((int)BASE, RGN, OFF, VALS)
  61
  62#define VGATHER_16(DSTADDR, BASE, RGN, OFF) \
  63    __builtin_HEXAGON_V6_vgathermh_128B(DSTADDR, (int)BASE, RGN, OFF)
  64#define VGATHER_16_MASKED(DSTADDR, MASK, BASE, RGN, OFF) \
  65    __builtin_HEXAGON_V6_vgathermhq_128B(DSTADDR, MASK, (int)BASE, RGN, OFF)
  66#define VGATHER_32(DSTADDR, BASE, RGN, OFF) \
  67    __builtin_HEXAGON_V6_vgathermw_128B(DSTADDR, (int)BASE, RGN, OFF)
  68#define VGATHER_32_MASKED(DSTADDR, MASK, BASE, RGN, OFF) \
  69    __builtin_HEXAGON_V6_vgathermwq_128B(DSTADDR, MASK, (int)BASE, RGN, OFF)
  70#define VGATHER_16_32(DSTADDR, BASE, RGN, OFF) \
  71    __builtin_HEXAGON_V6_vgathermhw_128B(DSTADDR, (int)BASE, RGN, OFF)
  72#define VGATHER_16_32_MASKED(DSTADDR, MASK, BASE, RGN, OFF) \
  73    __builtin_HEXAGON_V6_vgathermhwq_128B(DSTADDR, MASK, (int)BASE, RGN, OFF)
  74
/*
 * Helper wrappers used by the vector_* functions:
 *     VSHUFF_H  applied to values/predicates before a 16_32 scatter or gather
 *     VDEAL_H   applied to the result of a 16_32 gather to restore the order
 *     VSPLAT_H  fills a vector with a halfword (used to pre-fill a masked
 *               gather destination)
 *     VAND_VAL  turns a vector of predicate words into an HVX_VectorPred
 */
#define VSHUFF_H(V) \
    __builtin_HEXAGON_V6_vshuffh_128B(V)
#define VSPLAT_H(X) \
    __builtin_HEXAGON_V6_lvsplath_128B(X)
#define VAND_VAL(PRED, VAL) \
    __builtin_HEXAGON_V6_vandvrt_128B(PRED, VAL)
#define VDEAL_H(V) \
    __builtin_HEXAGON_V6_vdealh_128B(V)
  83
/* running count of mismatches found by check_buffer(); 0 means PASS */
int err;

/* define the number of rows/cols in a square matrix */
#define MATRIX_SIZE 64

/* define the size of the scatter buffer (in elements, one full matrix) */
#define SCATTER_BUFFER_SIZE (MATRIX_SIZE * MATRIX_SIZE)
  91
/*
 * fake vtcm - put buffers together and force alignment
 *
 * Each scatter buffer holds a full MATRIX_SIZE x MATRIX_SIZE matrix and each
 * gather buffer holds MATRIX_SIZE elements.  The struct is aligned to
 * 0x10000 bytes and every member size is a multiple of 128 bytes, so each
 * member starts on a 128-byte boundary.
 */
static struct {
    unsigned short vscatter16[SCATTER_BUFFER_SIZE];
    unsigned short vgather16[MATRIX_SIZE];
    unsigned int   vscatter32[SCATTER_BUFFER_SIZE];
    unsigned int   vgather32[MATRIX_SIZE];
    unsigned short vscatter16_32[SCATTER_BUFFER_SIZE];
    unsigned short vgather16_32[MATRIX_SIZE];
} vtcm __attribute__((aligned(0x10000)));
 101
/*
 * declare the arrays of reference values
 *
 * These are filled by the scalar_* functions and compared byte-for-byte
 * against the matching vtcm member by the check_* functions.
 */
unsigned short vscatter16_ref[SCATTER_BUFFER_SIZE];
unsigned short vgather16_ref[MATRIX_SIZE];
unsigned int   vscatter32_ref[SCATTER_BUFFER_SIZE];
unsigned int   vgather32_ref[MATRIX_SIZE];
unsigned short vscatter16_32_ref[SCATTER_BUFFER_SIZE];
unsigned short vgather16_32_ref[MATRIX_SIZE];
 109
 110/* declare the arrays of offsets */
 111unsigned short half_offsets[MATRIX_SIZE];
 112unsigned int   word_offsets[MATRIX_SIZE];
 113
 114/* declare the arrays of values */
 115unsigned short half_values[MATRIX_SIZE];
 116unsigned short half_values_acc[MATRIX_SIZE];
 117unsigned short half_values_masked[MATRIX_SIZE];
 118unsigned int   word_values[MATRIX_SIZE];
 119unsigned int   word_values_acc[MATRIX_SIZE];
 120unsigned int   word_values_masked[MATRIX_SIZE];
 121
 122/* declare the arrays of predicates */
 123unsigned short half_predicates[MATRIX_SIZE];
 124unsigned int   word_predicates[MATRIX_SIZE];
 125
/*
 * make this big enough for all the intrinsics
 *
 * Every scatter/gather call in this file passes this value as the RGN
 * argument; it covers the whole vtcm struct.
 */
const size_t region_len = sizeof(vtcm);

/*
 * optionally add sync instructions
 *
 * When set to 0, sync_scatter() and sync_gather() compile to empty bodies.
 */
#define SYNC_VECTOR 1
 131
/*
 * Synchronize after a scatter into the buffer at addr.
 *
 * Issues a scatter_release on addr followed by a forced vector load from the
 * same address.  Does nothing when SYNC_VECTOR is 0.
 */
static void sync_scatter(void *addr)
{
#if SYNC_VECTOR
    /*
     * Do the scatter release followed by a dummy load to complete the
     * synchronization.  Normally the dummy load would be deferred as
     * long as possible to minimize stalls.
     */
    asm volatile("vmem(%0 + #0):scatter_release\n" : : "r"(addr));
    /* use volatile to force the load */
    /* NOTE(review): the self-assignment presumably silences an unused-variable warning */
    volatile HVX_Vector vDummy = *(HVX_Vector *)addr; vDummy = vDummy;
#endif
}
 145
/*
 * Perform a forced vector load from addr; callers pass a gather destination
 * after issuing the gather.  Does nothing when SYNC_VECTOR is 0.
 *
 * NOTE(review): presumably the load is what waits for the gather to land -
 * confirm against the HVX reference.
 */
static void sync_gather(void *addr)
{
#if SYNC_VECTOR
    /* use volatile to force the load */
    volatile HVX_Vector vDummy = *(HVX_Vector *)addr; vDummy = vDummy;
#endif
}
 153
/* optionally print the results; the print_* functions are no-ops when 0 */
#define PRINT_DATA 0

/* byte used to pre-fill vtcm and the scatter reference buffers */
#define FILL_CHAR       '.'
 158
 159/* fill vtcm scratch with ee */
 160void prefill_vtcm_scratch(void)
 161{
 162    memset(&vtcm, FILL_CHAR, sizeof(vtcm));
 163}
 164
 165/* create byte offsets to be a diagonal of the matrix with 16 bit elements */
 166void create_offsets_values_preds_16(void)
 167{
 168    unsigned short half_element = 0;
 169    unsigned short half_element_masked = 0;
 170    char letter = 'A';
 171    char letter_masked = '@';
 172
 173    for (int i = 0; i < MATRIX_SIZE; i++) {
 174        half_offsets[i] = i * (2 * MATRIX_SIZE + 2);
 175
 176        half_element = 0;
 177        half_element_masked = 0;
 178        for (int j = 0; j < 2; j++) {
 179            half_element |= letter << j * 8;
 180            half_element_masked |= letter_masked << j * 8;
 181        }
 182
 183        half_values[i] = half_element;
 184        half_values_acc[i] = ((i % 10) << 8) + (i % 10);
 185        half_values_masked[i] = half_element_masked;
 186
 187        letter++;
 188        /* reset to 'A' */
 189        if (letter == 'M') {
 190            letter = 'A';
 191        }
 192
 193        half_predicates[i] = (i % 3 == 0 || i % 5 == 0) ? ~0 : 0;
 194    }
 195}
 196
 197/* create byte offsets to be a diagonal of the matrix with 32 bit elements */
 198void create_offsets_values_preds_32(void)
 199{
 200    unsigned int word_element = 0;
 201    unsigned int word_element_masked = 0;
 202    char letter = 'A';
 203    char letter_masked = '&';
 204
 205    for (int i = 0; i < MATRIX_SIZE; i++) {
 206        word_offsets[i] = i * (4 * MATRIX_SIZE + 4);
 207
 208        word_element = 0;
 209        word_element_masked = 0;
 210        for (int j = 0; j < 4; j++) {
 211            word_element |= letter << j * 8;
 212            word_element_masked |= letter_masked << j * 8;
 213        }
 214
 215        word_values[i] = word_element;
 216        word_values_acc[i] = ((i % 10) << 8) + (i % 10);
 217        word_values_masked[i] = word_element_masked;
 218
 219        letter++;
 220        /* reset to 'A' */
 221        if (letter == 'M') {
 222            letter = 'A';
 223        }
 224
 225        word_predicates[i] = (i % 4 == 0 || i % 7 == 0) ? ~0 : 0;
 226    }
 227}
 228
 229/*
 230 * create byte offsets to be a diagonal of the matrix with 16 bit elements
 231 * and 32 bit offsets
 232 */
 233void create_offsets_values_preds_16_32(void)
 234{
 235    unsigned short half_element = 0;
 236    unsigned short half_element_masked = 0;
 237    char letter = 'D';
 238    char letter_masked = '$';
 239
 240    for (int i = 0; i < MATRIX_SIZE; i++) {
 241        word_offsets[i] = i * (2 * MATRIX_SIZE + 2);
 242
 243        half_element = 0;
 244        half_element_masked = 0;
 245        for (int j = 0; j < 2; j++) {
 246            half_element |= letter << j * 8;
 247            half_element_masked |= letter_masked << j * 8;
 248        }
 249
 250        half_values[i] = half_element;
 251        half_values_acc[i] = ((i % 10) << 8) + (i % 10);
 252        half_values_masked[i] = half_element_masked;
 253
 254        letter++;
 255        /* reset to 'A' */
 256        if (letter == 'P') {
 257            letter = 'D';
 258        }
 259
 260        half_predicates[i] = (i % 2 == 0 || i % 13 == 0) ? ~0 : 0;
 261    }
 262}
 263
 264/* scatter the 16 bit elements using intrinsics */
 265void vector_scatter_16(void)
 266{
 267    /* copy the offsets and values to vectors */
 268    HVX_Vector offsets = *(HVX_Vector *)half_offsets;
 269    HVX_Vector values = *(HVX_Vector *)half_values;
 270
 271    VSCATTER_16(&vtcm.vscatter16, region_len, offsets, values);
 272
 273    sync_scatter(vtcm.vscatter16);
 274}
 275
 276/* scatter-accumulate the 16 bit elements using intrinsics */
 277void vector_scatter_16_acc(void)
 278{
 279    /* copy the offsets and values to vectors */
 280    HVX_Vector offsets = *(HVX_Vector *)half_offsets;
 281    HVX_Vector values = *(HVX_Vector *)half_values_acc;
 282
 283    VSCATTER_16_ACC(&vtcm.vscatter16, region_len, offsets, values);
 284
 285    sync_scatter(vtcm.vscatter16);
 286}
 287
 288/* scatter the 16 bit elements using intrinsics */
 289void vector_scatter_16_masked(void)
 290{
 291    /* copy the offsets and values to vectors */
 292    HVX_Vector offsets = *(HVX_Vector *)half_offsets;
 293    HVX_Vector values = *(HVX_Vector *)half_values_masked;
 294    HVX_Vector pred_reg = *(HVX_Vector *)half_predicates;
 295    HVX_VectorPred preds = VAND_VAL(pred_reg, ~0);
 296
 297    VSCATTER_16_MASKED(preds, &vtcm.vscatter16, region_len, offsets, values);
 298
 299    sync_scatter(vtcm.vscatter16);
 300}
 301
 302/* scatter the 32 bit elements using intrinsics */
 303void vector_scatter_32(void)
 304{
 305    /* copy the offsets and values to vectors */
 306    HVX_Vector offsetslo = *(HVX_Vector *)word_offsets;
 307    HVX_Vector offsetshi = *(HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
 308    HVX_Vector valueslo = *(HVX_Vector *)word_values;
 309    HVX_Vector valueshi = *(HVX_Vector *)&word_values[MATRIX_SIZE / 2];
 310
 311    VSCATTER_32(&vtcm.vscatter32, region_len, offsetslo, valueslo);
 312    VSCATTER_32(&vtcm.vscatter32, region_len, offsetshi, valueshi);
 313
 314    sync_scatter(vtcm.vscatter32);
 315}
 316
 317/* scatter-acc the 32 bit elements using intrinsics */
 318void vector_scatter_32_acc(void)
 319{
 320    /* copy the offsets and values to vectors */
 321    HVX_Vector offsetslo = *(HVX_Vector *)word_offsets;
 322    HVX_Vector offsetshi = *(HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
 323    HVX_Vector valueslo = *(HVX_Vector *)word_values_acc;
 324    HVX_Vector valueshi = *(HVX_Vector *)&word_values_acc[MATRIX_SIZE / 2];
 325
 326    VSCATTER_32_ACC(&vtcm.vscatter32, region_len, offsetslo, valueslo);
 327    VSCATTER_32_ACC(&vtcm.vscatter32, region_len, offsetshi, valueshi);
 328
 329    sync_scatter(vtcm.vscatter32);
 330}
 331
 332/* scatter the 32 bit elements using intrinsics */
 333void vector_scatter_32_masked(void)
 334{
 335    /* copy the offsets and values to vectors */
 336    HVX_Vector offsetslo = *(HVX_Vector *)word_offsets;
 337    HVX_Vector offsetshi = *(HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
 338    HVX_Vector valueslo = *(HVX_Vector *)word_values_masked;
 339    HVX_Vector valueshi = *(HVX_Vector *)&word_values_masked[MATRIX_SIZE / 2];
 340    HVX_Vector pred_reglo = *(HVX_Vector *)word_predicates;
 341    HVX_Vector pred_reghi = *(HVX_Vector *)&word_predicates[MATRIX_SIZE / 2];
 342    HVX_VectorPred predslo = VAND_VAL(pred_reglo, ~0);
 343    HVX_VectorPred predshi = VAND_VAL(pred_reghi, ~0);
 344
 345    VSCATTER_32_MASKED(predslo, &vtcm.vscatter32, region_len, offsetslo,
 346                       valueslo);
 347    VSCATTER_32_MASKED(predshi, &vtcm.vscatter32, region_len, offsetshi,
 348                       valueshi);
 349
 350    sync_scatter(vtcm.vscatter16);
 351}
 352
 353/* scatter the 16 bit elements with 32 bit offsets using intrinsics */
 354void vector_scatter_16_32(void)
 355{
 356    HVX_VectorPair offsets;
 357    HVX_Vector values;
 358
 359    /* get the word offsets in a vector pair */
 360    offsets = *(HVX_VectorPair *)word_offsets;
 361
 362    /* these values need to be shuffled for the scatter */
 363    values = *(HVX_Vector *)half_values;
 364    values = VSHUFF_H(values);
 365
 366    VSCATTER_16_32(&vtcm.vscatter16_32, region_len, offsets, values);
 367
 368    sync_scatter(vtcm.vscatter16_32);
 369}
 370
 371/* scatter-acc the 16 bit elements with 32 bit offsets using intrinsics */
 372void vector_scatter_16_32_acc(void)
 373{
 374    HVX_VectorPair offsets;
 375    HVX_Vector values;
 376
 377    /* get the word offsets in a vector pair */
 378    offsets = *(HVX_VectorPair *)word_offsets;
 379
 380    /* these values need to be shuffled for the scatter */
 381    values = *(HVX_Vector *)half_values_acc;
 382    values = VSHUFF_H(values);
 383
 384    VSCATTER_16_32_ACC(&vtcm.vscatter16_32, region_len, offsets, values);
 385
 386    sync_scatter(vtcm.vscatter16_32);
 387}
 388
 389/* masked scatter the 16 bit elements with 32 bit offsets using intrinsics */
 390void vector_scatter_16_32_masked(void)
 391{
 392    HVX_VectorPair offsets;
 393    HVX_Vector values;
 394    HVX_Vector pred_reg;
 395
 396    /* get the word offsets in a vector pair */
 397    offsets = *(HVX_VectorPair *)word_offsets;
 398
 399    /* these values need to be shuffled for the scatter */
 400    values = *(HVX_Vector *)half_values_masked;
 401    values = VSHUFF_H(values);
 402
 403    pred_reg = *(HVX_Vector *)half_predicates;
 404    pred_reg = VSHUFF_H(pred_reg);
 405    HVX_VectorPred preds = VAND_VAL(pred_reg, ~0);
 406
 407    VSCATTER_16_32_MASKED(preds, &vtcm.vscatter16_32, region_len, offsets,
 408                          values);
 409
 410    sync_scatter(vtcm.vscatter16_32);
 411}
 412
 413/* gather the elements from the scatter16 buffer */
 414void vector_gather_16(void)
 415{
 416    HVX_Vector *vgather = (HVX_Vector *)&vtcm.vgather16;
 417    HVX_Vector offsets = *(HVX_Vector *)half_offsets;
 418
 419    VGATHER_16(vgather, &vtcm.vscatter16, region_len, offsets);
 420
 421    sync_gather(vgather);
 422}
 423
 424static unsigned short gather_16_masked_init(void)
 425{
 426    char letter = '?';
 427    return letter | (letter << 8);
 428}
 429
 430void vector_gather_16_masked(void)
 431{
 432    HVX_Vector *vgather = (HVX_Vector *)&vtcm.vgather16;
 433    HVX_Vector offsets = *(HVX_Vector *)half_offsets;
 434    HVX_Vector pred_reg = *(HVX_Vector *)half_predicates;
 435    HVX_VectorPred preds = VAND_VAL(pred_reg, ~0);
 436
 437    *vgather = VSPLAT_H(gather_16_masked_init());
 438    VGATHER_16_MASKED(vgather, preds, &vtcm.vscatter16, region_len, offsets);
 439
 440    sync_gather(vgather);
 441}
 442
 443/* gather the elements from the scatter32 buffer */
 444void vector_gather_32(void)
 445{
 446    HVX_Vector *vgatherlo = (HVX_Vector *)&vtcm.vgather32;
 447    HVX_Vector *vgatherhi =
 448        (HVX_Vector *)((int)&vtcm.vgather32 + (MATRIX_SIZE * 2));
 449    HVX_Vector offsetslo = *(HVX_Vector *)word_offsets;
 450    HVX_Vector offsetshi = *(HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
 451
 452    VGATHER_32(vgatherlo, &vtcm.vscatter32, region_len, offsetslo);
 453    VGATHER_32(vgatherhi, &vtcm.vscatter32, region_len, offsetshi);
 454
 455    sync_gather(vgatherhi);
 456}
 457
 458static unsigned int gather_32_masked_init(void)
 459{
 460    char letter = '?';
 461    return letter | (letter << 8) | (letter << 16) | (letter << 24);
 462}
 463
 464void vector_gather_32_masked(void)
 465{
 466    HVX_Vector *vgatherlo = (HVX_Vector *)&vtcm.vgather32;
 467    HVX_Vector *vgatherhi =
 468        (HVX_Vector *)((int)&vtcm.vgather32 + (MATRIX_SIZE * 2));
 469    HVX_Vector offsetslo = *(HVX_Vector *)word_offsets;
 470    HVX_Vector offsetshi = *(HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
 471    HVX_Vector pred_reglo = *(HVX_Vector *)word_predicates;
 472    HVX_VectorPred predslo = VAND_VAL(pred_reglo, ~0);
 473    HVX_Vector pred_reghi = *(HVX_Vector *)&word_predicates[MATRIX_SIZE / 2];
 474    HVX_VectorPred predshi = VAND_VAL(pred_reghi, ~0);
 475
 476    *vgatherlo = VSPLAT_H(gather_32_masked_init());
 477    *vgatherhi = VSPLAT_H(gather_32_masked_init());
 478    VGATHER_32_MASKED(vgatherlo, predslo, &vtcm.vscatter32, region_len,
 479                      offsetslo);
 480    VGATHER_32_MASKED(vgatherhi, predshi, &vtcm.vscatter32, region_len,
 481                      offsetshi);
 482
 483    sync_gather(vgatherlo);
 484    sync_gather(vgatherhi);
 485}
 486
 487/* gather the elements from the scatter16_32 buffer */
 488void vector_gather_16_32(void)
 489{
 490    HVX_Vector *vgather;
 491    HVX_VectorPair offsets;
 492    HVX_Vector values;
 493
 494    /* get the vtcm address to gather from */
 495    vgather = (HVX_Vector *)&vtcm.vgather16_32;
 496
 497    /* get the word offsets in a vector pair */
 498    offsets = *(HVX_VectorPair *)word_offsets;
 499
 500    VGATHER_16_32(vgather, &vtcm.vscatter16_32, region_len, offsets);
 501
 502    /* deal the elements to get the order back */
 503    values = *(HVX_Vector *)vgather;
 504    values = VDEAL_H(values);
 505
 506    /* write it back to vtcm address */
 507    *(HVX_Vector *)vgather = values;
 508}
 509
 510void vector_gather_16_32_masked(void)
 511{
 512    HVX_Vector *vgather;
 513    HVX_VectorPair offsets;
 514    HVX_Vector pred_reg;
 515    HVX_VectorPred preds;
 516    HVX_Vector values;
 517
 518    /* get the vtcm address to gather from */
 519    vgather = (HVX_Vector *)&vtcm.vgather16_32;
 520
 521    /* get the word offsets in a vector pair */
 522    offsets = *(HVX_VectorPair *)word_offsets;
 523    pred_reg = *(HVX_Vector *)half_predicates;
 524    pred_reg = VSHUFF_H(pred_reg);
 525    preds = VAND_VAL(pred_reg, ~0);
 526
 527   *vgather = VSPLAT_H(gather_16_masked_init());
 528   VGATHER_16_32_MASKED(vgather, preds, &vtcm.vscatter16_32, region_len,
 529                        offsets);
 530
 531    /* deal the elements to get the order back */
 532    values = *(HVX_Vector *)vgather;
 533    values = VDEAL_H(values);
 534
 535    /* write it back to vtcm address */
 536    *(HVX_Vector *)vgather = values;
 537}
 538
 539static void check_buffer(const char *name, void *c, void *r, size_t size)
 540{
 541    char *check = (char *)c;
 542    char *ref = (char *)r;
 543    for (int i = 0; i < size; i++) {
 544        if (check[i] != ref[i]) {
 545            printf("ERROR %s [%d]: 0x%x (%c) != 0x%x (%c)\n", name, i,
 546                   check[i], check[i], ref[i], ref[i]);
 547            err++;
 548        }
 549    }
 550}
 551
 552/*
 553 * These scalar functions are the C equivalents of the vector functions that
 554 * use HVX
 555 */
 556
 557/* scatter the 16 bit elements using C */
 558void scalar_scatter_16(unsigned short *vscatter16)
 559{
 560    for (int i = 0; i < MATRIX_SIZE; ++i) {
 561        vscatter16[half_offsets[i] / 2] = half_values[i];
 562    }
 563}
 564
 565void check_scatter_16()
 566{
 567    memset(vscatter16_ref, FILL_CHAR,
 568           SCATTER_BUFFER_SIZE * sizeof(unsigned short));
 569    scalar_scatter_16(vscatter16_ref);
 570    check_buffer(__func__, vtcm.vscatter16, vscatter16_ref,
 571                 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
 572}
 573
 574/* scatter the 16 bit elements using C */
 575void scalar_scatter_16_acc(unsigned short *vscatter16)
 576{
 577    for (int i = 0; i < MATRIX_SIZE; ++i) {
 578        vscatter16[half_offsets[i] / 2] += half_values_acc[i];
 579    }
 580}
 581
 582void check_scatter_16_acc()
 583{
 584    memset(vscatter16_ref, FILL_CHAR,
 585           SCATTER_BUFFER_SIZE * sizeof(unsigned short));
 586    scalar_scatter_16(vscatter16_ref);
 587    scalar_scatter_16_acc(vscatter16_ref);
 588    check_buffer(__func__, vtcm.vscatter16, vscatter16_ref,
 589                 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
 590}
 591
 592/* scatter the 16 bit elements using C */
 593void scalar_scatter_16_masked(unsigned short *vscatter16)
 594{
 595    for (int i = 0; i < MATRIX_SIZE; i++) {
 596        if (half_predicates[i]) {
 597            vscatter16[half_offsets[i] / 2] = half_values_masked[i];
 598        }
 599    }
 600
 601}
 602
 603void check_scatter_16_masked()
 604{
 605    memset(vscatter16_ref, FILL_CHAR,
 606           SCATTER_BUFFER_SIZE * sizeof(unsigned short));
 607    scalar_scatter_16(vscatter16_ref);
 608    scalar_scatter_16_acc(vscatter16_ref);
 609    scalar_scatter_16_masked(vscatter16_ref);
 610    check_buffer(__func__, vtcm.vscatter16, vscatter16_ref,
 611                 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
 612}
 613
 614/* scatter the 32 bit elements using C */
 615void scalar_scatter_32(unsigned int *vscatter32)
 616{
 617    for (int i = 0; i < MATRIX_SIZE; ++i) {
 618        vscatter32[word_offsets[i] / 4] = word_values[i];
 619    }
 620}
 621
 622void check_scatter_32()
 623{
 624    memset(vscatter32_ref, FILL_CHAR,
 625           SCATTER_BUFFER_SIZE * sizeof(unsigned int));
 626    scalar_scatter_32(vscatter32_ref);
 627    check_buffer(__func__, vtcm.vscatter32, vscatter32_ref,
 628                 SCATTER_BUFFER_SIZE * sizeof(unsigned int));
 629}
 630
 631/* scatter the 32 bit elements using C */
 632void scalar_scatter_32_acc(unsigned int *vscatter32)
 633{
 634    for (int i = 0; i < MATRIX_SIZE; ++i) {
 635        vscatter32[word_offsets[i] / 4] += word_values_acc[i];
 636    }
 637}
 638
 639void check_scatter_32_acc()
 640{
 641    memset(vscatter32_ref, FILL_CHAR,
 642           SCATTER_BUFFER_SIZE * sizeof(unsigned int));
 643    scalar_scatter_32(vscatter32_ref);
 644    scalar_scatter_32_acc(vscatter32_ref);
 645    check_buffer(__func__, vtcm.vscatter32, vscatter32_ref,
 646                 SCATTER_BUFFER_SIZE * sizeof(unsigned int));
 647}
 648
 649/* scatter the 32 bit elements using C */
 650void scalar_scatter_32_masked(unsigned int *vscatter32)
 651{
 652    for (int i = 0; i < MATRIX_SIZE; i++) {
 653        if (word_predicates[i]) {
 654            vscatter32[word_offsets[i] / 4] = word_values_masked[i];
 655        }
 656    }
 657}
 658
 659void check_scatter_32_masked()
 660{
 661    memset(vscatter32_ref, FILL_CHAR,
 662           SCATTER_BUFFER_SIZE * sizeof(unsigned int));
 663    scalar_scatter_32(vscatter32_ref);
 664    scalar_scatter_32_acc(vscatter32_ref);
 665    scalar_scatter_32_masked(vscatter32_ref);
 666    check_buffer(__func__, vtcm.vscatter32, vscatter32_ref,
 667                  SCATTER_BUFFER_SIZE * sizeof(unsigned int));
 668}
 669
 670/* scatter the 32 bit elements using C */
 671void scalar_scatter_16_32(unsigned short *vscatter16_32)
 672{
 673    for (int i = 0; i < MATRIX_SIZE; ++i) {
 674        vscatter16_32[word_offsets[i] / 2] = half_values[i];
 675    }
 676}
 677
 678void check_scatter_16_32()
 679{
 680    memset(vscatter16_32_ref, FILL_CHAR,
 681           SCATTER_BUFFER_SIZE * sizeof(unsigned short));
 682    scalar_scatter_16_32(vscatter16_32_ref);
 683    check_buffer(__func__, vtcm.vscatter16_32, vscatter16_32_ref,
 684                 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
 685}
 686
 687/* scatter the 32 bit elements using C */
 688void scalar_scatter_16_32_acc(unsigned short *vscatter16_32)
 689{
 690    for (int i = 0; i < MATRIX_SIZE; ++i) {
 691        vscatter16_32[word_offsets[i] / 2] += half_values_acc[i];
 692    }
 693}
 694
 695void check_scatter_16_32_acc()
 696{
 697    memset(vscatter16_32_ref, FILL_CHAR,
 698           SCATTER_BUFFER_SIZE * sizeof(unsigned short));
 699    scalar_scatter_16_32(vscatter16_32_ref);
 700    scalar_scatter_16_32_acc(vscatter16_32_ref);
 701    check_buffer(__func__, vtcm.vscatter16_32, vscatter16_32_ref,
 702                 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
 703}
 704
 705void scalar_scatter_16_32_masked(unsigned short *vscatter16_32)
 706{
 707    for (int i = 0; i < MATRIX_SIZE; i++) {
 708        if (half_predicates[i]) {
 709            vscatter16_32[word_offsets[i] / 2] = half_values_masked[i];
 710        }
 711    }
 712}
 713
 714void check_scatter_16_32_masked()
 715{
 716    memset(vscatter16_32_ref, FILL_CHAR,
 717           SCATTER_BUFFER_SIZE * sizeof(unsigned short));
 718    scalar_scatter_16_32(vscatter16_32_ref);
 719    scalar_scatter_16_32_acc(vscatter16_32_ref);
 720    scalar_scatter_16_32_masked(vscatter16_32_ref);
 721    check_buffer(__func__, vtcm.vscatter16_32, vscatter16_32_ref,
 722                 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
 723}
 724
 725/* gather the elements from the scatter buffer using C */
 726void scalar_gather_16(unsigned short *vgather16)
 727{
 728    for (int i = 0; i < MATRIX_SIZE; ++i) {
 729        vgather16[i] = vtcm.vscatter16[half_offsets[i] / 2];
 730    }
 731}
 732
 733void check_gather_16()
 734{
 735      memset(vgather16_ref, 0, MATRIX_SIZE * sizeof(unsigned short));
 736      scalar_gather_16(vgather16_ref);
 737      check_buffer(__func__, vtcm.vgather16, vgather16_ref,
 738                   MATRIX_SIZE * sizeof(unsigned short));
 739}
 740
 741void scalar_gather_16_masked(unsigned short *vgather16)
 742{
 743    for (int i = 0; i < MATRIX_SIZE; ++i) {
 744        if (half_predicates[i]) {
 745            vgather16[i] = vtcm.vscatter16[half_offsets[i] / 2];
 746        }
 747    }
 748}
 749
 750void check_gather_16_masked()
 751{
 752    memset(vgather16_ref, gather_16_masked_init(),
 753           MATRIX_SIZE * sizeof(unsigned short));
 754    scalar_gather_16_masked(vgather16_ref);
 755    check_buffer(__func__, vtcm.vgather16, vgather16_ref,
 756                 MATRIX_SIZE * sizeof(unsigned short));
 757}
 758
 759/* gather the elements from the scatter buffer using C */
 760void scalar_gather_32(unsigned int *vgather32)
 761{
 762    for (int i = 0; i < MATRIX_SIZE; ++i) {
 763        vgather32[i] = vtcm.vscatter32[word_offsets[i] / 4];
 764    }
 765}
 766
 767void check_gather_32(void)
 768{
 769    memset(vgather32_ref, 0, MATRIX_SIZE * sizeof(unsigned int));
 770    scalar_gather_32(vgather32_ref);
 771    check_buffer(__func__, vtcm.vgather32, vgather32_ref,
 772                 MATRIX_SIZE * sizeof(unsigned int));
 773}
 774
 775void scalar_gather_32_masked(unsigned int *vgather32)
 776{
 777    for (int i = 0; i < MATRIX_SIZE; ++i) {
 778        if (word_predicates[i]) {
 779            vgather32[i] = vtcm.vscatter32[word_offsets[i] / 4];
 780        }
 781    }
 782}
 783
 784
 785void check_gather_32_masked(void)
 786{
 787    memset(vgather32_ref, gather_32_masked_init(),
 788           MATRIX_SIZE * sizeof(unsigned int));
 789    scalar_gather_32_masked(vgather32_ref);
 790    check_buffer(__func__, vtcm.vgather32,
 791                 vgather32_ref, MATRIX_SIZE * sizeof(unsigned int));
 792}
 793
 794/* gather the elements from the scatter buffer using C */
 795void scalar_gather_16_32(unsigned short *vgather16_32)
 796{
 797    for (int i = 0; i < MATRIX_SIZE; ++i) {
 798        vgather16_32[i] = vtcm.vscatter16_32[word_offsets[i] / 2];
 799    }
 800}
 801
 802void check_gather_16_32(void)
 803{
 804    memset(vgather16_32_ref, 0, MATRIX_SIZE * sizeof(unsigned short));
 805    scalar_gather_16_32(vgather16_32_ref);
 806    check_buffer(__func__, vtcm.vgather16_32, vgather16_32_ref,
 807                 MATRIX_SIZE * sizeof(unsigned short));
 808}
 809
 810void scalar_gather_16_32_masked(unsigned short *vgather16_32)
 811{
 812    for (int i = 0; i < MATRIX_SIZE; ++i) {
 813        if (half_predicates[i]) {
 814            vgather16_32[i] = vtcm.vscatter16_32[word_offsets[i] / 2];
 815        }
 816    }
 817
 818}
 819
 820void check_gather_16_32_masked(void)
 821{
 822    memset(vgather16_32_ref, gather_16_masked_init(),
 823           MATRIX_SIZE * sizeof(unsigned short));
 824    scalar_gather_16_32_masked(vgather16_32_ref);
 825    check_buffer(__func__, vtcm.vgather16_32, vgather16_32_ref,
 826                 MATRIX_SIZE * sizeof(unsigned short));
 827}
 828
 829/* print scatter16 buffer */
 830void print_scatter16_buffer(void)
 831{
 832    if (PRINT_DATA) {
 833        printf("\n\nPrinting the 16 bit scatter buffer");
 834
 835        for (int i = 0; i < SCATTER_BUFFER_SIZE; i++) {
 836            if ((i % MATRIX_SIZE) == 0) {
 837                printf("\n");
 838            }
 839            for (int j = 0; j < 2; j++) {
 840                printf("%c", (char)((vtcm.vscatter16[i] >> j * 8) & 0xff));
 841            }
 842            printf(" ");
 843        }
 844        printf("\n");
 845    }
 846}
 847
 848/* print the gather 16 buffer */
 849void print_gather_result_16(void)
 850{
 851    if (PRINT_DATA) {
 852        printf("\n\nPrinting the 16 bit gather result\n");
 853
 854        for (int i = 0; i < MATRIX_SIZE; i++) {
 855            for (int j = 0; j < 2; j++) {
 856                printf("%c", (char)((vtcm.vgather16[i] >> j * 8) & 0xff));
 857            }
 858            printf(" ");
 859        }
 860        printf("\n");
 861    }
 862}
 863
 864/* print the scatter32 buffer */
 865void print_scatter32_buffer(void)
 866{
 867    if (PRINT_DATA) {
 868        printf("\n\nPrinting the 32 bit scatter buffer");
 869
 870        for (int i = 0; i < SCATTER_BUFFER_SIZE; i++) {
 871            if ((i % MATRIX_SIZE) == 0) {
 872                printf("\n");
 873            }
 874            for (int j = 0; j < 4; j++) {
 875                printf("%c", (char)((vtcm.vscatter32[i] >> j * 8) & 0xff));
 876            }
 877            printf(" ");
 878        }
 879        printf("\n");
 880    }
 881}
 882
 883/* print the gather 32 buffer */
 884void print_gather_result_32(void)
 885{
 886    if (PRINT_DATA) {
 887        printf("\n\nPrinting the 32 bit gather result\n");
 888
 889        for (int i = 0; i < MATRIX_SIZE; i++) {
 890            for (int j = 0; j < 4; j++) {
 891                printf("%c", (char)((vtcm.vgather32[i] >> j * 8) & 0xff));
 892            }
 893            printf(" ");
 894        }
 895        printf("\n");
 896    }
 897}
 898
 899/* print the scatter16_32 buffer */
 900void print_scatter16_32_buffer(void)
 901{
 902    if (PRINT_DATA) {
 903        printf("\n\nPrinting the 16_32 bit scatter buffer");
 904
 905        for (int i = 0; i < SCATTER_BUFFER_SIZE; i++) {
 906            if ((i % MATRIX_SIZE) == 0) {
 907                printf("\n");
 908            }
 909            for (int j = 0; j < 2; j++) {
 910                printf("%c",
 911                      (unsigned char)((vtcm.vscatter16_32[i] >> j * 8) & 0xff));
 912            }
 913            printf(" ");
 914        }
 915        printf("\n");
 916    }
 917}
 918
 919/* print the gather 16_32 buffer */
 920void print_gather_result_16_32(void)
 921{
 922    if (PRINT_DATA) {
 923        printf("\n\nPrinting the 16_32 bit gather result\n");
 924
 925        for (int i = 0; i < MATRIX_SIZE; i++) {
 926            for (int j = 0; j < 2; j++) {
 927                printf("%c",
 928                       (unsigned char)((vtcm.vgather16_32[i] >> j * 8) & 0xff));
 929            }
 930            printf(" ");
 931        }
 932        printf("\n");
 933    }
 934}
 935
 936int main()
 937{
 938    prefill_vtcm_scratch();
 939
 940    /* 16 bit elements with 16 bit offsets */
 941    create_offsets_values_preds_16();
 942
 943    vector_scatter_16();
 944    print_scatter16_buffer();
 945    check_scatter_16();
 946
 947    vector_gather_16();
 948    print_gather_result_16();
 949    check_gather_16();
 950
 951    vector_gather_16_masked();
 952    print_gather_result_16();
 953    check_gather_16_masked();
 954
 955    vector_scatter_16_acc();
 956    print_scatter16_buffer();
 957    check_scatter_16_acc();
 958
 959    vector_scatter_16_masked();
 960    print_scatter16_buffer();
 961    check_scatter_16_masked();
 962
 963    /* 32 bit elements with 32 bit offsets */
 964    create_offsets_values_preds_32();
 965
 966    vector_scatter_32();
 967    print_scatter32_buffer();
 968    check_scatter_32();
 969
 970    vector_gather_32();
 971    print_gather_result_32();
 972    check_gather_32();
 973
 974    vector_gather_32_masked();
 975    print_gather_result_32();
 976    check_gather_32_masked();
 977
 978    vector_scatter_32_acc();
 979    print_scatter32_buffer();
 980    check_scatter_32_acc();
 981
 982    vector_scatter_32_masked();
 983    print_scatter32_buffer();
 984    check_scatter_32_masked();
 985
 986    /* 16 bit elements with 32 bit offsets */
 987    create_offsets_values_preds_16_32();
 988
 989    vector_scatter_16_32();
 990    print_scatter16_32_buffer();
 991    check_scatter_16_32();
 992
 993    vector_gather_16_32();
 994    print_gather_result_16_32();
 995    check_gather_16_32();
 996
 997    vector_gather_16_32_masked();
 998    print_gather_result_16_32();
 999    check_gather_16_32_masked();
1000
1001    vector_scatter_16_32_acc();
1002    print_scatter16_32_buffer();
1003    check_scatter_16_32_acc();
1004
1005    vector_scatter_16_32_masked();
1006    print_scatter16_32_buffer();
1007    check_scatter_16_32_masked();
1008
1009    puts(err ? "FAIL" : "PASS");
1010    return err;
1011}
1012