qemu/accel/tcg/tcg-runtime-gvec.c
/*
 * Generic vectorized operation runtime
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "qemu/host-utils.h"
#include "cpu.h"
#include "exec/helper-proto.h"
#include "tcg-gvec-desc.h"


/* Virtually all hosts support 16-byte vectors.  Those that don't can emulate
 * them via GCC's generic vector extension.  This turns out to be simpler and
 * more reliable than getting the compiler to autovectorize.
 *
 * In tcg-op-gvec.c, we asserted that both the size and alignment of the data
 * are multiples of 16.
 *
 * When the compiler does not support all of the operations we require, the
 * loops are written so that we can always fall back on the base types.
 */
#ifdef CONFIG_VECTOR16
typedef uint8_t vec8 __attribute__((vector_size(16)));
typedef uint16_t vec16 __attribute__((vector_size(16)));
typedef uint32_t vec32 __attribute__((vector_size(16)));
typedef uint64_t vec64 __attribute__((vector_size(16)));

typedef int8_t svec8 __attribute__((vector_size(16)));
typedef int16_t svec16 __attribute__((vector_size(16)));
typedef int32_t svec32 __attribute__((vector_size(16)));
typedef int64_t svec64 __attribute__((vector_size(16)));

#define DUP16(X)  { X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X }
#define DUP8(X)   { X, X, X, X, X, X, X, X }
#define DUP4(X)   { X, X, X, X }
#define DUP2(X)   { X, X }
#else
typedef uint8_t vec8;
typedef uint16_t vec16;
typedef uint32_t vec32;
typedef uint64_t vec64;

typedef int8_t svec8;
typedef int16_t svec16;
typedef int32_t svec32;
typedef int64_t svec64;

#define DUP16(X)  X
#define DUP8(X)   X
#define DUP4(X)   X
#define DUP2(X)   X
#endif /* CONFIG_VECTOR16 */

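/* Zero the bytes between the operation size and the maximum size.
 * When oprsz is smaller than maxsz, the untouched tail of the
 * destination must read as zero; both sizes come from DESC and are
 * multiples of 16, so the 64-bit stores stay in bounds.
 */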
static inline void clear_high(void *d, intptr_t oprsz, uint32_t desc)
{
    intptr_t maxsz = simd_maxsz(desc);
    intptr_t i;

    if (unlikely(maxsz > oprsz)) {
        for (i = oprsz; i < maxsz; i += sizeof(uint64_t)) {
            *(uint64_t *)(d + i) = 0;
        }
    }
}

void HELPER(gvec_add8)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec8)) {
        *(vec8 *)(d + i) = *(vec8 *)(a + i) + *(vec8 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_add16)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec16)) {
        *(vec16 *)(d + i) = *(vec16 *)(a + i) + *(vec16 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_add32)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec32)) {
        *(vec32 *)(d + i) = *(vec32 *)(a + i) + *(vec32 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_add64)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) + *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

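/* The *s helpers apply a scalar operand B to every element.  DUPn
 * builds the per-lane constant for vector builds, with each lane
 * taking B truncated to the element type, and is the identity in
 * the scalar fallback.
 */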
void HELPER(gvec_adds8)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    vec8 vecb = (vec8)DUP16(b);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec8)) {
        *(vec8 *)(d + i) = *(vec8 *)(a + i) + vecb;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_adds16)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    vec16 vecb = (vec16)DUP8(b);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec16)) {
        *(vec16 *)(d + i) = *(vec16 *)(a + i) + vecb;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_adds32)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    vec32 vecb = (vec32)DUP4(b);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec32)) {
        *(vec32 *)(d + i) = *(vec32 *)(a + i) + vecb;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_adds64)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    vec64 vecb = (vec64)DUP2(b);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) + vecb;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sub8)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec8)) {
        *(vec8 *)(d + i) = *(vec8 *)(a + i) - *(vec8 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sub16)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec16)) {
        *(vec16 *)(d + i) = *(vec16 *)(a + i) - *(vec16 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sub32)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec32)) {
        *(vec32 *)(d + i) = *(vec32 *)(a + i) - *(vec32 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sub64)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) - *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_subs8)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    vec8 vecb = (vec8)DUP16(b);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec8)) {
        *(vec8 *)(d + i) = *(vec8 *)(a + i) - vecb;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_subs16)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    vec16 vecb = (vec16)DUP8(b);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec16)) {
        *(vec16 *)(d + i) = *(vec16 *)(a + i) - vecb;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_subs32)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    vec32 vecb = (vec32)DUP4(b);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec32)) {
        *(vec32 *)(d + i) = *(vec32 *)(a + i) - vecb;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_subs64)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    vec64 vecb = (vec64)DUP2(b);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) - vecb;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_mul8)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec8)) {
        *(vec8 *)(d + i) = *(vec8 *)(a + i) * *(vec8 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_mul16)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec16)) {
        *(vec16 *)(d + i) = *(vec16 *)(a + i) * *(vec16 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_mul32)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec32)) {
        *(vec32 *)(d + i) = *(vec32 *)(a + i) * *(vec32 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_mul64)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) * *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_muls8)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    vec8 vecb = (vec8)DUP16(b);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec8)) {
        *(vec8 *)(d + i) = *(vec8 *)(a + i) * vecb;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_muls16)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    vec16 vecb = (vec16)DUP8(b);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec16)) {
        *(vec16 *)(d + i) = *(vec16 *)(a + i) * vecb;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_muls32)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    vec32 vecb = (vec32)DUP4(b);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec32)) {
        *(vec32 *)(d + i) = *(vec32 *)(a + i) * vecb;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_muls64)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    vec64 vecb = (vec64)DUP2(b);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) * vecb;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_neg8)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec8)) {
        *(vec8 *)(d + i) = -*(vec8 *)(a + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_neg16)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec16)) {
        *(vec16 *)(d + i) = -*(vec16 *)(a + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_neg32)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec32)) {
        *(vec32 *)(d + i) = -*(vec32 *)(a + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_neg64)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = -*(vec64 *)(a + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_mov)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);

    memcpy(d, a, oprsz);
    clear_high(d, oprsz, desc);
}

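/* Fill the destination with a replicated constant.  A zero constant
 * is special-cased: setting oprsz to 0 treats the whole vector as
 * "high", so clear_high performs the entire store.
 */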
void HELPER(gvec_dup64)(void *d, uint32_t desc, uint64_t c)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    if (c == 0) {
        oprsz = 0;
    } else {
        for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
            *(uint64_t *)(d + i) = c;
        }
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_dup32)(void *d, uint32_t desc, uint32_t c)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    if (c == 0) {
        oprsz = 0;
    } else {
        for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
            *(uint32_t *)(d + i) = c;
        }
    }
    clear_high(d, oprsz, desc);
}

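/* For 16-bit and 8-bit elements, replicate the constant across
 * 32 bits by multiplication and defer to the 32-bit helper.
 */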
void HELPER(gvec_dup16)(void *d, uint32_t desc, uint32_t c)
{
    HELPER(gvec_dup32)(d, desc, 0x00010001 * (c & 0xffff));
}

void HELPER(gvec_dup8)(void *d, uint32_t desc, uint32_t c)
{
    HELPER(gvec_dup32)(d, desc, 0x01010101 * (c & 0xff));
}

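/* Bitwise operations are element-size agnostic, so one set of
 * helpers working on 64-bit lanes serves all element sizes.
 */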
void HELPER(gvec_not)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = ~*(vec64 *)(a + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_and)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) & *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_or)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) | *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_xor)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) ^ *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_andc)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) &~ *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_orc)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) |~ *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_ands)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    vec64 vecb = (vec64)DUP2(b);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) & vecb;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_xors)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    vec64 vecb = (vec64)DUP2(b);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) ^ vecb;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_ors)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    vec64 vecb = (vec64)DUP2(b);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) | vecb;
    }
    clear_high(d, oprsz, desc);
}

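/* Shifts by immediate: the shift count is encoded in the data field
 * of DESC by the caller, and is assumed to be less than the element
 * width.
 */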
void HELPER(gvec_shl8i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec8)) {
        *(vec8 *)(d + i) = *(vec8 *)(a + i) << shift;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_shl16i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec16)) {
        *(vec16 *)(d + i) = *(vec16 *)(a + i) << shift;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_shl32i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec32)) {
        *(vec32 *)(d + i) = *(vec32 *)(a + i) << shift;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_shl64i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) << shift;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_shr8i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec8)) {
        *(vec8 *)(d + i) = *(vec8 *)(a + i) >> shift;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_shr16i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec16)) {
        *(vec16 *)(d + i) = *(vec16 *)(a + i) >> shift;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_shr32i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec32)) {
        *(vec32 *)(d + i) = *(vec32 *)(a + i) >> shift;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_shr64i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) >> shift;
    }
    clear_high(d, oprsz, desc);
}

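/* Arithmetic right shifts use the signed vector types so that the
 * compiler emits sign-extending shifts.
 */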
void HELPER(gvec_sar8i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec8)) {
        *(svec8 *)(d + i) = *(svec8 *)(a + i) >> shift;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sar16i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec16)) {
        *(svec16 *)(d + i) = *(svec16 *)(a + i) >> shift;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sar32i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec32)) {
        *(svec32 *)(d + i) = *(svec32 *)(a + i) >> shift;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sar64i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(svec64 *)(d + i) = *(svec64 *)(a + i) >> shift;
    }
    clear_high(d, oprsz, desc);
}

/* If vectors are enabled, the compiler fills in -1 for true.
   Otherwise, we must take care of this by hand.  */
#ifdef CONFIG_VECTOR16
# define DO_CMP0(X)  X
#else
# define DO_CMP0(X)  -(X)
#endif

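/* DO_CMP1 expands to one out-of-line helper per comparison; each
 * lane of the result is -1 if the comparison holds and 0 otherwise.
 * DO_CMP2 instantiates the six comparisons for one element size,
 * using the signed types for LT/LE and the unsigned types for
 * LTU/LEU.
 */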
#define DO_CMP1(NAME, TYPE, OP)                                            \
void HELPER(NAME)(void *d, void *a, void *b, uint32_t desc)                \
{                                                                          \
    intptr_t oprsz = simd_oprsz(desc);                                     \
    intptr_t i;                                                            \
    for (i = 0; i < oprsz; i += sizeof(TYPE)) {                            \
        *(TYPE *)(d + i) = DO_CMP0(*(TYPE *)(a + i) OP *(TYPE *)(b + i));  \
    }                                                                      \
    clear_high(d, oprsz, desc);                                            \
}

#define DO_CMP2(SZ) \
    DO_CMP1(gvec_eq##SZ, vec##SZ, ==)    \
    DO_CMP1(gvec_ne##SZ, vec##SZ, !=)    \
    DO_CMP1(gvec_lt##SZ, svec##SZ, <)    \
    DO_CMP1(gvec_le##SZ, svec##SZ, <=)   \
    DO_CMP1(gvec_ltu##SZ, vec##SZ, <)    \
    DO_CMP1(gvec_leu##SZ, vec##SZ, <=)

DO_CMP2(8)
DO_CMP2(16)
DO_CMP2(32)
DO_CMP2(64)

#undef DO_CMP0
#undef DO_CMP1
#undef DO_CMP2

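/* Signed saturating arithmetic.  The 8-bit and 16-bit helpers widen
 * to int and clamp against the type bounds; the 32-bit and 64-bit
 * helpers detect overflow from the sign bits of the operands and
 * the result instead.
 */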
void HELPER(gvec_ssadd8)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(int8_t)) {
        int r = *(int8_t *)(a + i) + *(int8_t *)(b + i);
        if (r > INT8_MAX) {
            r = INT8_MAX;
        } else if (r < INT8_MIN) {
            r = INT8_MIN;
        }
        *(int8_t *)(d + i) = r;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_ssadd16)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(int16_t)) {
        int r = *(int16_t *)(a + i) + *(int16_t *)(b + i);
        if (r > INT16_MAX) {
            r = INT16_MAX;
        } else if (r < INT16_MIN) {
            r = INT16_MIN;
        }
        *(int16_t *)(d + i) = r;
    }
    clear_high(d, oprsz, desc);
}

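/* Addition overflows iff the operands share a sign and the result's
 * sign differs: (di ^ ai) has the sign bit set when the result
 * disagrees with A, and ~(ai ^ bi) restricts that to the case where
 * A and B agree.
 */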
void HELPER(gvec_ssadd32)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(int32_t)) {
        int32_t ai = *(int32_t *)(a + i);
        int32_t bi = *(int32_t *)(b + i);
        int32_t di = ai + bi;
        if (((di ^ ai) &~ (ai ^ bi)) < 0) {
            /* Signed overflow.  */
            di = (di < 0 ? INT32_MAX : INT32_MIN);
        }
        *(int32_t *)(d + i) = di;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_ssadd64)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(int64_t)) {
        int64_t ai = *(int64_t *)(a + i);
        int64_t bi = *(int64_t *)(b + i);
        int64_t di = ai + bi;
        if (((di ^ ai) &~ (ai ^ bi)) < 0) {
            /* Signed overflow.  */
            di = (di < 0 ? INT64_MAX : INT64_MIN);
        }
        *(int64_t *)(d + i) = di;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sssub8)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
        int r = *(int8_t *)(a + i) - *(int8_t *)(b + i);
        if (r > INT8_MAX) {
            r = INT8_MAX;
        } else if (r < INT8_MIN) {
            r = INT8_MIN;
        }
        *(uint8_t *)(d + i) = r;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sssub16)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(int16_t)) {
        int r = *(int16_t *)(a + i) - *(int16_t *)(b + i);
        if (r > INT16_MAX) {
            r = INT16_MAX;
        } else if (r < INT16_MIN) {
            r = INT16_MIN;
        }
        *(int16_t *)(d + i) = r;
    }
    clear_high(d, oprsz, desc);
}

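/* Subtraction overflows iff the operands have opposite signs and
 * the result's sign differs from the minuend: here (ai ^ bi) keeps
 * the disagreeing-sign case instead of masking it out.
 */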
void HELPER(gvec_sssub32)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(int32_t)) {
        int32_t ai = *(int32_t *)(a + i);
        int32_t bi = *(int32_t *)(b + i);
        int32_t di = ai - bi;
        if (((di ^ ai) & (ai ^ bi)) < 0) {
            /* Signed overflow.  */
            di = (di < 0 ? INT32_MAX : INT32_MIN);
        }
        *(int32_t *)(d + i) = di;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sssub64)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(int64_t)) {
        int64_t ai = *(int64_t *)(a + i);
        int64_t bi = *(int64_t *)(b + i);
        int64_t di = ai - bi;
        if (((di ^ ai) & (ai ^ bi)) < 0) {
            /* Signed overflow.  */
            di = (di < 0 ? INT64_MAX : INT64_MIN);
        }
        *(int64_t *)(d + i) = di;
    }
    clear_high(d, oprsz, desc);
}

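/* Unsigned saturating addition.  The narrow helpers widen and clamp
 * against the type maximum; the 32-bit and 64-bit helpers detect
 * wrap-around by checking whether the sum is smaller than an
 * addend.
 */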
void HELPER(gvec_usadd8)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
        unsigned r = *(uint8_t *)(a + i) + *(uint8_t *)(b + i);
        if (r > UINT8_MAX) {
            r = UINT8_MAX;
        }
        *(uint8_t *)(d + i) = r;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_usadd16)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
        unsigned r = *(uint16_t *)(a + i) + *(uint16_t *)(b + i);
        if (r > UINT16_MAX) {
            r = UINT16_MAX;
        }
        *(uint16_t *)(d + i) = r;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_usadd32)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
        uint32_t ai = *(uint32_t *)(a + i);
        uint32_t bi = *(uint32_t *)(b + i);
        uint32_t di = ai + bi;
        if (di < ai) {
            di = UINT32_MAX;
        }
        *(uint32_t *)(d + i) = di;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_usadd64)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
        uint64_t ai = *(uint64_t *)(a + i);
        uint64_t bi = *(uint64_t *)(b + i);
        uint64_t di = ai + bi;
        if (di < ai) {
            di = UINT64_MAX;
        }
        *(uint64_t *)(d + i) = di;
    }
    clear_high(d, oprsz, desc);
}

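/* Unsigned saturating subtraction clamps at zero: the narrow
 * helpers test for a negative widened result, the 32-bit and 64-bit
 * helpers compare the operands directly.
 */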
void HELPER(gvec_ussub8)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
        int r = *(uint8_t *)(a + i) - *(uint8_t *)(b + i);
        if (r < 0) {
            r = 0;
        }
        *(uint8_t *)(d + i) = r;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_ussub16)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
        int r = *(uint16_t *)(a + i) - *(uint16_t *)(b + i);
        if (r < 0) {
            r = 0;
        }
        *(uint16_t *)(d + i) = r;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_ussub32)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
        uint32_t ai = *(uint32_t *)(a + i);
        uint32_t bi = *(uint32_t *)(b + i);
        uint32_t di = ai - bi;
        if (ai < bi) {
            di = 0;
        }
        *(uint32_t *)(d + i) = di;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_ussub64)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
        uint64_t ai = *(uint64_t *)(a + i);
        uint64_t bi = *(uint64_t *)(b + i);
        uint64_t di = ai - bi;
        if (ai < bi) {
            di = 0;
        }
        *(uint64_t *)(d + i) = di;
    }
    clear_high(d, oprsz, desc);
}