/* qemu/accel/tcg/tcg-runtime-gvec.c */
/*
 * Generic vectorized operation runtime
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */
  19
  20#include "qemu/osdep.h"
  21#include "qemu/host-utils.h"
  22#include "cpu.h"
  23#include "exec/helper-proto.h"
  24#include "tcg-gvec-desc.h"
  25
  26
/* Virtually all hosts support 16-byte vectors.  Those that don't can emulate
 * them via GCC's generic vector extension.  This turns out to be simpler and
 * more reliable than getting the compiler to autovectorize.
 *
 * In tcg-op-gvec.c, we asserted that both the size and alignment of the data
 * are multiples of 16.
 *
 * When the compiler does not support all of the operations we require, the
 * loops are written so that we can always fall back on the base types.
 */
  37#ifdef CONFIG_VECTOR16
  38typedef uint8_t vec8 __attribute__((vector_size(16)));
  39typedef uint16_t vec16 __attribute__((vector_size(16)));
  40typedef uint32_t vec32 __attribute__((vector_size(16)));
  41typedef uint64_t vec64 __attribute__((vector_size(16)));
  42
  43typedef int8_t svec8 __attribute__((vector_size(16)));
  44typedef int16_t svec16 __attribute__((vector_size(16)));
  45typedef int32_t svec32 __attribute__((vector_size(16)));
  46typedef int64_t svec64 __attribute__((vector_size(16)));
  47
  48#define DUP16(X)  { X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X }
  49#define DUP8(X)   { X, X, X, X, X, X, X, X }
  50#define DUP4(X)   { X, X, X, X }
  51#define DUP2(X)   { X, X }
  52#else
  53typedef uint8_t vec8;
  54typedef uint16_t vec16;
  55typedef uint32_t vec32;
  56typedef uint64_t vec64;
  57
  58typedef int8_t svec8;
  59typedef int16_t svec16;
  60typedef int32_t svec32;
  61typedef int64_t svec64;
  62
  63#define DUP16(X)  X
  64#define DUP8(X)   X
  65#define DUP4(X)   X
  66#define DUP2(X)   X
  67#endif /* CONFIG_VECTOR16 */
  68
  69static inline void clear_high(void *d, intptr_t oprsz, uint32_t desc)
  70{
  71    intptr_t maxsz = simd_maxsz(desc);
  72    intptr_t i;
  73
  74    if (unlikely(maxsz > oprsz)) {
  75        for (i = oprsz; i < maxsz; i += sizeof(uint64_t)) {
  76            *(uint64_t *)(d + i) = 0;
  77        }
  78    }
  79}
  80
  81void HELPER(gvec_add8)(void *d, void *a, void *b, uint32_t desc)
  82{
  83    intptr_t oprsz = simd_oprsz(desc);
  84    intptr_t i;
  85
  86    for (i = 0; i < oprsz; i += sizeof(vec8)) {
  87        *(vec8 *)(d + i) = *(vec8 *)(a + i) + *(vec8 *)(b + i);
  88    }
  89    clear_high(d, oprsz, desc);
  90}
  91
  92void HELPER(gvec_add16)(void *d, void *a, void *b, uint32_t desc)
  93{
  94    intptr_t oprsz = simd_oprsz(desc);
  95    intptr_t i;
  96
  97    for (i = 0; i < oprsz; i += sizeof(vec16)) {
  98        *(vec16 *)(d + i) = *(vec16 *)(a + i) + *(vec16 *)(b + i);
  99    }
 100    clear_high(d, oprsz, desc);
 101}
 102
 103void HELPER(gvec_add32)(void *d, void *a, void *b, uint32_t desc)
 104{
 105    intptr_t oprsz = simd_oprsz(desc);
 106    intptr_t i;
 107
 108    for (i = 0; i < oprsz; i += sizeof(vec32)) {
 109        *(vec32 *)(d + i) = *(vec32 *)(a + i) + *(vec32 *)(b + i);
 110    }
 111    clear_high(d, oprsz, desc);
 112}
 113
 114void HELPER(gvec_add64)(void *d, void *a, void *b, uint32_t desc)
 115{
 116    intptr_t oprsz = simd_oprsz(desc);
 117    intptr_t i;
 118
 119    for (i = 0; i < oprsz; i += sizeof(vec64)) {
 120        *(vec64 *)(d + i) = *(vec64 *)(a + i) + *(vec64 *)(b + i);
 121    }
 122    clear_high(d, oprsz, desc);
 123}
 124
 125void HELPER(gvec_adds8)(void *d, void *a, uint64_t b, uint32_t desc)
 126{
 127    intptr_t oprsz = simd_oprsz(desc);
 128    vec8 vecb = (vec8)DUP16(b);
 129    intptr_t i;
 130
 131    for (i = 0; i < oprsz; i += sizeof(vec8)) {
 132        *(vec8 *)(d + i) = *(vec8 *)(a + i) + vecb;
 133    }
 134    clear_high(d, oprsz, desc);
 135}
 136
 137void HELPER(gvec_adds16)(void *d, void *a, uint64_t b, uint32_t desc)
 138{
 139    intptr_t oprsz = simd_oprsz(desc);
 140    vec16 vecb = (vec16)DUP8(b);
 141    intptr_t i;
 142
 143    for (i = 0; i < oprsz; i += sizeof(vec16)) {
 144        *(vec16 *)(d + i) = *(vec16 *)(a + i) + vecb;
 145    }
 146    clear_high(d, oprsz, desc);
 147}
 148
 149void HELPER(gvec_adds32)(void *d, void *a, uint64_t b, uint32_t desc)
 150{
 151    intptr_t oprsz = simd_oprsz(desc);
 152    vec32 vecb = (vec32)DUP4(b);
 153    intptr_t i;
 154
 155    for (i = 0; i < oprsz; i += sizeof(vec32)) {
 156        *(vec32 *)(d + i) = *(vec32 *)(a + i) + vecb;
 157    }
 158    clear_high(d, oprsz, desc);
 159}
 160
 161void HELPER(gvec_adds64)(void *d, void *a, uint64_t b, uint32_t desc)
 162{
 163    intptr_t oprsz = simd_oprsz(desc);
 164    vec64 vecb = (vec64)DUP2(b);
 165    intptr_t i;
 166
 167    for (i = 0; i < oprsz; i += sizeof(vec64)) {
 168        *(vec64 *)(d + i) = *(vec64 *)(a + i) + vecb;
 169    }
 170    clear_high(d, oprsz, desc);
 171}
 172
 173void HELPER(gvec_sub8)(void *d, void *a, void *b, uint32_t desc)
 174{
 175    intptr_t oprsz = simd_oprsz(desc);
 176    intptr_t i;
 177
 178    for (i = 0; i < oprsz; i += sizeof(vec8)) {
 179        *(vec8 *)(d + i) = *(vec8 *)(a + i) - *(vec8 *)(b + i);
 180    }
 181    clear_high(d, oprsz, desc);
 182}
 183
 184void HELPER(gvec_sub16)(void *d, void *a, void *b, uint32_t desc)
 185{
 186    intptr_t oprsz = simd_oprsz(desc);
 187    intptr_t i;
 188
 189    for (i = 0; i < oprsz; i += sizeof(vec16)) {
 190        *(vec16 *)(d + i) = *(vec16 *)(a + i) - *(vec16 *)(b + i);
 191    }
 192    clear_high(d, oprsz, desc);
 193}
 194
 195void HELPER(gvec_sub32)(void *d, void *a, void *b, uint32_t desc)
 196{
 197    intptr_t oprsz = simd_oprsz(desc);
 198    intptr_t i;
 199
 200    for (i = 0; i < oprsz; i += sizeof(vec32)) {
 201        *(vec32 *)(d + i) = *(vec32 *)(a + i) - *(vec32 *)(b + i);
 202    }
 203    clear_high(d, oprsz, desc);
 204}
 205
 206void HELPER(gvec_sub64)(void *d, void *a, void *b, uint32_t desc)
 207{
 208    intptr_t oprsz = simd_oprsz(desc);
 209    intptr_t i;
 210
 211    for (i = 0; i < oprsz; i += sizeof(vec64)) {
 212        *(vec64 *)(d + i) = *(vec64 *)(a + i) - *(vec64 *)(b + i);
 213    }
 214    clear_high(d, oprsz, desc);
 215}
 216
 217void HELPER(gvec_subs8)(void *d, void *a, uint64_t b, uint32_t desc)
 218{
 219    intptr_t oprsz = simd_oprsz(desc);
 220    vec8 vecb = (vec8)DUP16(b);
 221    intptr_t i;
 222
 223    for (i = 0; i < oprsz; i += sizeof(vec8)) {
 224        *(vec8 *)(d + i) = *(vec8 *)(a + i) - vecb;
 225    }
 226    clear_high(d, oprsz, desc);
 227}
 228
 229void HELPER(gvec_subs16)(void *d, void *a, uint64_t b, uint32_t desc)
 230{
 231    intptr_t oprsz = simd_oprsz(desc);
 232    vec16 vecb = (vec16)DUP8(b);
 233    intptr_t i;
 234
 235    for (i = 0; i < oprsz; i += sizeof(vec16)) {
 236        *(vec16 *)(d + i) = *(vec16 *)(a + i) - vecb;
 237    }
 238    clear_high(d, oprsz, desc);
 239}
 240
 241void HELPER(gvec_subs32)(void *d, void *a, uint64_t b, uint32_t desc)
 242{
 243    intptr_t oprsz = simd_oprsz(desc);
 244    vec32 vecb = (vec32)DUP4(b);
 245    intptr_t i;
 246
 247    for (i = 0; i < oprsz; i += sizeof(vec32)) {
 248        *(vec32 *)(d + i) = *(vec32 *)(a + i) - vecb;
 249    }
 250    clear_high(d, oprsz, desc);
 251}
 252
 253void HELPER(gvec_subs64)(void *d, void *a, uint64_t b, uint32_t desc)
 254{
 255    intptr_t oprsz = simd_oprsz(desc);
 256    vec64 vecb = (vec64)DUP2(b);
 257    intptr_t i;
 258
 259    for (i = 0; i < oprsz; i += sizeof(vec64)) {
 260        *(vec64 *)(d + i) = *(vec64 *)(a + i) - vecb;
 261    }
 262    clear_high(d, oprsz, desc);
 263}
 264
 265void HELPER(gvec_mul8)(void *d, void *a, void *b, uint32_t desc)
 266{
 267    intptr_t oprsz = simd_oprsz(desc);
 268    intptr_t i;
 269
 270    for (i = 0; i < oprsz; i += sizeof(vec8)) {
 271        *(vec8 *)(d + i) = *(vec8 *)(a + i) * *(vec8 *)(b + i);
 272    }
 273    clear_high(d, oprsz, desc);
 274}
 275
 276void HELPER(gvec_mul16)(void *d, void *a, void *b, uint32_t desc)
 277{
 278    intptr_t oprsz = simd_oprsz(desc);
 279    intptr_t i;
 280
 281    for (i = 0; i < oprsz; i += sizeof(vec16)) {
 282        *(vec16 *)(d + i) = *(vec16 *)(a + i) * *(vec16 *)(b + i);
 283    }
 284    clear_high(d, oprsz, desc);
 285}
 286
 287void HELPER(gvec_mul32)(void *d, void *a, void *b, uint32_t desc)
 288{
 289    intptr_t oprsz = simd_oprsz(desc);
 290    intptr_t i;
 291
 292    for (i = 0; i < oprsz; i += sizeof(vec32)) {
 293        *(vec32 *)(d + i) = *(vec32 *)(a + i) * *(vec32 *)(b + i);
 294    }
 295    clear_high(d, oprsz, desc);
 296}
 297
 298void HELPER(gvec_mul64)(void *d, void *a, void *b, uint32_t desc)
 299{
 300    intptr_t oprsz = simd_oprsz(desc);
 301    intptr_t i;
 302
 303    for (i = 0; i < oprsz; i += sizeof(vec64)) {
 304        *(vec64 *)(d + i) = *(vec64 *)(a + i) * *(vec64 *)(b + i);
 305    }
 306    clear_high(d, oprsz, desc);
 307}
 308
 309void HELPER(gvec_muls8)(void *d, void *a, uint64_t b, uint32_t desc)
 310{
 311    intptr_t oprsz = simd_oprsz(desc);
 312    vec8 vecb = (vec8)DUP16(b);
 313    intptr_t i;
 314
 315    for (i = 0; i < oprsz; i += sizeof(vec8)) {
 316        *(vec8 *)(d + i) = *(vec8 *)(a + i) * vecb;
 317    }
 318    clear_high(d, oprsz, desc);
 319}
 320
 321void HELPER(gvec_muls16)(void *d, void *a, uint64_t b, uint32_t desc)
 322{
 323    intptr_t oprsz = simd_oprsz(desc);
 324    vec16 vecb = (vec16)DUP8(b);
 325    intptr_t i;
 326
 327    for (i = 0; i < oprsz; i += sizeof(vec16)) {
 328        *(vec16 *)(d + i) = *(vec16 *)(a + i) * vecb;
 329    }
 330    clear_high(d, oprsz, desc);
 331}
 332
 333void HELPER(gvec_muls32)(void *d, void *a, uint64_t b, uint32_t desc)
 334{
 335    intptr_t oprsz = simd_oprsz(desc);
 336    vec32 vecb = (vec32)DUP4(b);
 337    intptr_t i;
 338
 339    for (i = 0; i < oprsz; i += sizeof(vec32)) {
 340        *(vec32 *)(d + i) = *(vec32 *)(a + i) * vecb;
 341    }
 342    clear_high(d, oprsz, desc);
 343}
 344
 345void HELPER(gvec_muls64)(void *d, void *a, uint64_t b, uint32_t desc)
 346{
 347    intptr_t oprsz = simd_oprsz(desc);
 348    vec64 vecb = (vec64)DUP2(b);
 349    intptr_t i;
 350
 351    for (i = 0; i < oprsz; i += sizeof(vec64)) {
 352        *(vec64 *)(d + i) = *(vec64 *)(a + i) * vecb;
 353    }
 354    clear_high(d, oprsz, desc);
 355}
 356
 357void HELPER(gvec_neg8)(void *d, void *a, uint32_t desc)
 358{
 359    intptr_t oprsz = simd_oprsz(desc);
 360    intptr_t i;
 361
 362    for (i = 0; i < oprsz; i += sizeof(vec8)) {
 363        *(vec8 *)(d + i) = -*(vec8 *)(a + i);
 364    }
 365    clear_high(d, oprsz, desc);
 366}
 367
 368void HELPER(gvec_neg16)(void *d, void *a, uint32_t desc)
 369{
 370    intptr_t oprsz = simd_oprsz(desc);
 371    intptr_t i;
 372
 373    for (i = 0; i < oprsz; i += sizeof(vec16)) {
 374        *(vec16 *)(d + i) = -*(vec16 *)(a + i);
 375    }
 376    clear_high(d, oprsz, desc);
 377}
 378
 379void HELPER(gvec_neg32)(void *d, void *a, uint32_t desc)
 380{
 381    intptr_t oprsz = simd_oprsz(desc);
 382    intptr_t i;
 383
 384    for (i = 0; i < oprsz; i += sizeof(vec32)) {
 385        *(vec32 *)(d + i) = -*(vec32 *)(a + i);
 386    }
 387    clear_high(d, oprsz, desc);
 388}
 389
 390void HELPER(gvec_neg64)(void *d, void *a, uint32_t desc)
 391{
 392    intptr_t oprsz = simd_oprsz(desc);
 393    intptr_t i;
 394
 395    for (i = 0; i < oprsz; i += sizeof(vec64)) {
 396        *(vec64 *)(d + i) = -*(vec64 *)(a + i);
 397    }
 398    clear_high(d, oprsz, desc);
 399}
 400
 401void HELPER(gvec_abs8)(void *d, void *a, uint32_t desc)
 402{
 403    intptr_t oprsz = simd_oprsz(desc);
 404    intptr_t i;
 405
 406    for (i = 0; i < oprsz; i += sizeof(int8_t)) {
 407        int8_t aa = *(int8_t *)(a + i);
 408        *(int8_t *)(d + i) = aa < 0 ? -aa : aa;
 409    }
 410    clear_high(d, oprsz, desc);
 411}
 412
 413void HELPER(gvec_abs16)(void *d, void *a, uint32_t desc)
 414{
 415    intptr_t oprsz = simd_oprsz(desc);
 416    intptr_t i;
 417
 418    for (i = 0; i < oprsz; i += sizeof(int16_t)) {
 419        int16_t aa = *(int16_t *)(a + i);
 420        *(int16_t *)(d + i) = aa < 0 ? -aa : aa;
 421    }
 422    clear_high(d, oprsz, desc);
 423}
 424
 425void HELPER(gvec_abs32)(void *d, void *a, uint32_t desc)
 426{
 427    intptr_t oprsz = simd_oprsz(desc);
 428    intptr_t i;
 429
 430    for (i = 0; i < oprsz; i += sizeof(int32_t)) {
 431        int32_t aa = *(int32_t *)(a + i);
 432        *(int32_t *)(d + i) = aa < 0 ? -aa : aa;
 433    }
 434    clear_high(d, oprsz, desc);
 435}
 436
 437void HELPER(gvec_abs64)(void *d, void *a, uint32_t desc)
 438{
 439    intptr_t oprsz = simd_oprsz(desc);
 440    intptr_t i;
 441
 442    for (i = 0; i < oprsz; i += sizeof(int64_t)) {
 443        int64_t aa = *(int64_t *)(a + i);
 444        *(int64_t *)(d + i) = aa < 0 ? -aa : aa;
 445    }
 446    clear_high(d, oprsz, desc);
 447}
 448
 449void HELPER(gvec_mov)(void *d, void *a, uint32_t desc)
 450{
 451    intptr_t oprsz = simd_oprsz(desc);
 452
 453    memcpy(d, a, oprsz);
 454    clear_high(d, oprsz, desc);
 455}
 456
 457void HELPER(gvec_dup64)(void *d, uint32_t desc, uint64_t c)
 458{
 459    intptr_t oprsz = simd_oprsz(desc);
 460    intptr_t i;
 461
 462    if (c == 0) {
 463        oprsz = 0;
 464    } else {
 465        for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
 466            *(uint64_t *)(d + i) = c;
 467        }
 468    }
 469    clear_high(d, oprsz, desc);
 470}
 471
 472void HELPER(gvec_dup32)(void *d, uint32_t desc, uint32_t c)
 473{
 474    intptr_t oprsz = simd_oprsz(desc);
 475    intptr_t i;
 476
 477    if (c == 0) {
 478        oprsz = 0;
 479    } else {
 480        for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
 481            *(uint32_t *)(d + i) = c;
 482        }
 483    }
 484    clear_high(d, oprsz, desc);
 485}
 486
 487void HELPER(gvec_dup16)(void *d, uint32_t desc, uint32_t c)
 488{
 489    HELPER(gvec_dup32)(d, desc, 0x00010001 * (c & 0xffff));
 490}
 491
 492void HELPER(gvec_dup8)(void *d, uint32_t desc, uint32_t c)
 493{
 494    HELPER(gvec_dup32)(d, desc, 0x01010101 * (c & 0xff));
 495}
 496
 497void HELPER(gvec_not)(void *d, void *a, uint32_t desc)
 498{
 499    intptr_t oprsz = simd_oprsz(desc);
 500    intptr_t i;
 501
 502    for (i = 0; i < oprsz; i += sizeof(vec64)) {
 503        *(vec64 *)(d + i) = ~*(vec64 *)(a + i);
 504    }
 505    clear_high(d, oprsz, desc);
 506}
 507
 508void HELPER(gvec_and)(void *d, void *a, void *b, uint32_t desc)
 509{
 510    intptr_t oprsz = simd_oprsz(desc);
 511    intptr_t i;
 512
 513    for (i = 0; i < oprsz; i += sizeof(vec64)) {
 514        *(vec64 *)(d + i) = *(vec64 *)(a + i) & *(vec64 *)(b + i);
 515    }
 516    clear_high(d, oprsz, desc);
 517}
 518
 519void HELPER(gvec_or)(void *d, void *a, void *b, uint32_t desc)
 520{
 521    intptr_t oprsz = simd_oprsz(desc);
 522    intptr_t i;
 523
 524    for (i = 0; i < oprsz; i += sizeof(vec64)) {
 525        *(vec64 *)(d + i) = *(vec64 *)(a + i) | *(vec64 *)(b + i);
 526    }
 527    clear_high(d, oprsz, desc);
 528}
 529
 530void HELPER(gvec_xor)(void *d, void *a, void *b, uint32_t desc)
 531{
 532    intptr_t oprsz = simd_oprsz(desc);
 533    intptr_t i;
 534
 535    for (i = 0; i < oprsz; i += sizeof(vec64)) {
 536        *(vec64 *)(d + i) = *(vec64 *)(a + i) ^ *(vec64 *)(b + i);
 537    }
 538    clear_high(d, oprsz, desc);
 539}
 540
 541void HELPER(gvec_andc)(void *d, void *a, void *b, uint32_t desc)
 542{
 543    intptr_t oprsz = simd_oprsz(desc);
 544    intptr_t i;
 545
 546    for (i = 0; i < oprsz; i += sizeof(vec64)) {
 547        *(vec64 *)(d + i) = *(vec64 *)(a + i) &~ *(vec64 *)(b + i);
 548    }
 549    clear_high(d, oprsz, desc);
 550}
 551
 552void HELPER(gvec_orc)(void *d, void *a, void *b, uint32_t desc)
 553{
 554    intptr_t oprsz = simd_oprsz(desc);
 555    intptr_t i;
 556
 557    for (i = 0; i < oprsz; i += sizeof(vec64)) {
 558        *(vec64 *)(d + i) = *(vec64 *)(a + i) |~ *(vec64 *)(b + i);
 559    }
 560    clear_high(d, oprsz, desc);
 561}
 562
 563void HELPER(gvec_nand)(void *d, void *a, void *b, uint32_t desc)
 564{
 565    intptr_t oprsz = simd_oprsz(desc);
 566    intptr_t i;
 567
 568    for (i = 0; i < oprsz; i += sizeof(vec64)) {
 569        *(vec64 *)(d + i) = ~(*(vec64 *)(a + i) & *(vec64 *)(b + i));
 570    }
 571    clear_high(d, oprsz, desc);
 572}
 573
 574void HELPER(gvec_nor)(void *d, void *a, void *b, uint32_t desc)
 575{
 576    intptr_t oprsz = simd_oprsz(desc);
 577    intptr_t i;
 578
 579    for (i = 0; i < oprsz; i += sizeof(vec64)) {
 580        *(vec64 *)(d + i) = ~(*(vec64 *)(a + i) | *(vec64 *)(b + i));
 581    }
 582    clear_high(d, oprsz, desc);
 583}
 584
 585void HELPER(gvec_eqv)(void *d, void *a, void *b, uint32_t desc)
 586{
 587    intptr_t oprsz = simd_oprsz(desc);
 588    intptr_t i;
 589
 590    for (i = 0; i < oprsz; i += sizeof(vec64)) {
 591        *(vec64 *)(d + i) = ~(*(vec64 *)(a + i) ^ *(vec64 *)(b + i));
 592    }
 593    clear_high(d, oprsz, desc);
 594}
 595
 596void HELPER(gvec_ands)(void *d, void *a, uint64_t b, uint32_t desc)
 597{
 598    intptr_t oprsz = simd_oprsz(desc);
 599    vec64 vecb = (vec64)DUP2(b);
 600    intptr_t i;
 601
 602    for (i = 0; i < oprsz; i += sizeof(vec64)) {
 603        *(vec64 *)(d + i) = *(vec64 *)(a + i) & vecb;
 604    }
 605    clear_high(d, oprsz, desc);
 606}
 607
 608void HELPER(gvec_xors)(void *d, void *a, uint64_t b, uint32_t desc)
 609{
 610    intptr_t oprsz = simd_oprsz(desc);
 611    vec64 vecb = (vec64)DUP2(b);
 612    intptr_t i;
 613
 614    for (i = 0; i < oprsz; i += sizeof(vec64)) {
 615        *(vec64 *)(d + i) = *(vec64 *)(a + i) ^ vecb;
 616    }
 617    clear_high(d, oprsz, desc);
 618}
 619
 620void HELPER(gvec_ors)(void *d, void *a, uint64_t b, uint32_t desc)
 621{
 622    intptr_t oprsz = simd_oprsz(desc);
 623    vec64 vecb = (vec64)DUP2(b);
 624    intptr_t i;
 625
 626    for (i = 0; i < oprsz; i += sizeof(vec64)) {
 627        *(vec64 *)(d + i) = *(vec64 *)(a + i) | vecb;
 628    }
 629    clear_high(d, oprsz, desc);
 630}
 631
 632void HELPER(gvec_shl8i)(void *d, void *a, uint32_t desc)
 633{
 634    intptr_t oprsz = simd_oprsz(desc);
 635    int shift = simd_data(desc);
 636    intptr_t i;
 637
 638    for (i = 0; i < oprsz; i += sizeof(vec8)) {
 639        *(vec8 *)(d + i) = *(vec8 *)(a + i) << shift;
 640    }
 641    clear_high(d, oprsz, desc);
 642}
 643
 644void HELPER(gvec_shl16i)(void *d, void *a, uint32_t desc)
 645{
 646    intptr_t oprsz = simd_oprsz(desc);
 647    int shift = simd_data(desc);
 648    intptr_t i;
 649
 650    for (i = 0; i < oprsz; i += sizeof(vec16)) {
 651        *(vec16 *)(d + i) = *(vec16 *)(a + i) << shift;
 652    }
 653    clear_high(d, oprsz, desc);
 654}
 655
 656void HELPER(gvec_shl32i)(void *d, void *a, uint32_t desc)
 657{
 658    intptr_t oprsz = simd_oprsz(desc);
 659    int shift = simd_data(desc);
 660    intptr_t i;
 661
 662    for (i = 0; i < oprsz; i += sizeof(vec32)) {
 663        *(vec32 *)(d + i) = *(vec32 *)(a + i) << shift;
 664    }
 665    clear_high(d, oprsz, desc);
 666}
 667
 668void HELPER(gvec_shl64i)(void *d, void *a, uint32_t desc)
 669{
 670    intptr_t oprsz = simd_oprsz(desc);
 671    int shift = simd_data(desc);
 672    intptr_t i;
 673
 674    for (i = 0; i < oprsz; i += sizeof(vec64)) {
 675        *(vec64 *)(d + i) = *(vec64 *)(a + i) << shift;
 676    }
 677    clear_high(d, oprsz, desc);
 678}
 679
 680void HELPER(gvec_shr8i)(void *d, void *a, uint32_t desc)
 681{
 682    intptr_t oprsz = simd_oprsz(desc);
 683    int shift = simd_data(desc);
 684    intptr_t i;
 685
 686    for (i = 0; i < oprsz; i += sizeof(vec8)) {
 687        *(vec8 *)(d + i) = *(vec8 *)(a + i) >> shift;
 688    }
 689    clear_high(d, oprsz, desc);
 690}
 691
 692void HELPER(gvec_shr16i)(void *d, void *a, uint32_t desc)
 693{
 694    intptr_t oprsz = simd_oprsz(desc);
 695    int shift = simd_data(desc);
 696    intptr_t i;
 697
 698    for (i = 0; i < oprsz; i += sizeof(vec16)) {
 699        *(vec16 *)(d + i) = *(vec16 *)(a + i) >> shift;
 700    }
 701    clear_high(d, oprsz, desc);
 702}
 703
 704void HELPER(gvec_shr32i)(void *d, void *a, uint32_t desc)
 705{
 706    intptr_t oprsz = simd_oprsz(desc);
 707    int shift = simd_data(desc);
 708    intptr_t i;
 709
 710    for (i = 0; i < oprsz; i += sizeof(vec32)) {
 711        *(vec32 *)(d + i) = *(vec32 *)(a + i) >> shift;
 712    }
 713    clear_high(d, oprsz, desc);
 714}
 715
 716void HELPER(gvec_shr64i)(void *d, void *a, uint32_t desc)
 717{
 718    intptr_t oprsz = simd_oprsz(desc);
 719    int shift = simd_data(desc);
 720    intptr_t i;
 721
 722    for (i = 0; i < oprsz; i += sizeof(vec64)) {
 723        *(vec64 *)(d + i) = *(vec64 *)(a + i) >> shift;
 724    }
 725    clear_high(d, oprsz, desc);
 726}
 727
 728void HELPER(gvec_sar8i)(void *d, void *a, uint32_t desc)
 729{
 730    intptr_t oprsz = simd_oprsz(desc);
 731    int shift = simd_data(desc);
 732    intptr_t i;
 733
 734    for (i = 0; i < oprsz; i += sizeof(vec8)) {
 735        *(svec8 *)(d + i) = *(svec8 *)(a + i) >> shift;
 736    }
 737    clear_high(d, oprsz, desc);
 738}
 739
 740void HELPER(gvec_sar16i)(void *d, void *a, uint32_t desc)
 741{
 742    intptr_t oprsz = simd_oprsz(desc);
 743    int shift = simd_data(desc);
 744    intptr_t i;
 745
 746    for (i = 0; i < oprsz; i += sizeof(vec16)) {
 747        *(svec16 *)(d + i) = *(svec16 *)(a + i) >> shift;
 748    }
 749    clear_high(d, oprsz, desc);
 750}
 751
 752void HELPER(gvec_sar32i)(void *d, void *a, uint32_t desc)
 753{
 754    intptr_t oprsz = simd_oprsz(desc);
 755    int shift = simd_data(desc);
 756    intptr_t i;
 757
 758    for (i = 0; i < oprsz; i += sizeof(vec32)) {
 759        *(svec32 *)(d + i) = *(svec32 *)(a + i) >> shift;
 760    }
 761    clear_high(d, oprsz, desc);
 762}
 763
 764void HELPER(gvec_sar64i)(void *d, void *a, uint32_t desc)
 765{
 766    intptr_t oprsz = simd_oprsz(desc);
 767    int shift = simd_data(desc);
 768    intptr_t i;
 769
 770    for (i = 0; i < oprsz; i += sizeof(vec64)) {
 771        *(svec64 *)(d + i) = *(svec64 *)(a + i) >> shift;
 772    }
 773    clear_high(d, oprsz, desc);
 774}
 775
 776void HELPER(gvec_shl8v)(void *d, void *a, void *b, uint32_t desc)
 777{
 778    intptr_t oprsz = simd_oprsz(desc);
 779    intptr_t i;
 780
 781    for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
 782        uint8_t sh = *(uint8_t *)(b + i) & 7;
 783        *(uint8_t *)(d + i) = *(uint8_t *)(a + i) << sh;
 784    }
 785    clear_high(d, oprsz, desc);
 786}
 787
 788void HELPER(gvec_shl16v)(void *d, void *a, void *b, uint32_t desc)
 789{
 790    intptr_t oprsz = simd_oprsz(desc);
 791    intptr_t i;
 792
 793    for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
 794        uint8_t sh = *(uint16_t *)(b + i) & 15;
 795        *(uint16_t *)(d + i) = *(uint16_t *)(a + i) << sh;
 796    }
 797    clear_high(d, oprsz, desc);
 798}
 799
 800void HELPER(gvec_shl32v)(void *d, void *a, void *b, uint32_t desc)
 801{
 802    intptr_t oprsz = simd_oprsz(desc);
 803    intptr_t i;
 804
 805    for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
 806        uint8_t sh = *(uint32_t *)(b + i) & 31;
 807        *(uint32_t *)(d + i) = *(uint32_t *)(a + i) << sh;
 808    }
 809    clear_high(d, oprsz, desc);
 810}
 811
 812void HELPER(gvec_shl64v)(void *d, void *a, void *b, uint32_t desc)
 813{
 814    intptr_t oprsz = simd_oprsz(desc);
 815    intptr_t i;
 816
 817    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
 818        uint8_t sh = *(uint64_t *)(b + i) & 63;
 819        *(uint64_t *)(d + i) = *(uint64_t *)(a + i) << sh;
 820    }
 821    clear_high(d, oprsz, desc);
 822}
 823
 824void HELPER(gvec_shr8v)(void *d, void *a, void *b, uint32_t desc)
 825{
 826    intptr_t oprsz = simd_oprsz(desc);
 827    intptr_t i;
 828
 829    for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
 830        uint8_t sh = *(uint8_t *)(b + i) & 7;
 831        *(uint8_t *)(d + i) = *(uint8_t *)(a + i) >> sh;
 832    }
 833    clear_high(d, oprsz, desc);
 834}
 835
 836void HELPER(gvec_shr16v)(void *d, void *a, void *b, uint32_t desc)
 837{
 838    intptr_t oprsz = simd_oprsz(desc);
 839    intptr_t i;
 840
 841    for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
 842        uint8_t sh = *(uint16_t *)(b + i) & 15;
 843        *(uint16_t *)(d + i) = *(uint16_t *)(a + i) >> sh;
 844    }
 845    clear_high(d, oprsz, desc);
 846}
 847
 848void HELPER(gvec_shr32v)(void *d, void *a, void *b, uint32_t desc)
 849{
 850    intptr_t oprsz = simd_oprsz(desc);
 851    intptr_t i;
 852
 853    for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
 854        uint8_t sh = *(uint32_t *)(b + i) & 31;
 855        *(uint32_t *)(d + i) = *(uint32_t *)(a + i) >> sh;
 856    }
 857    clear_high(d, oprsz, desc);
 858}
 859
 860void HELPER(gvec_shr64v)(void *d, void *a, void *b, uint32_t desc)
 861{
 862    intptr_t oprsz = simd_oprsz(desc);
 863    intptr_t i;
 864
 865    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
 866        uint8_t sh = *(uint64_t *)(b + i) & 63;
 867        *(uint64_t *)(d + i) = *(uint64_t *)(a + i) >> sh;
 868    }
 869    clear_high(d, oprsz, desc);
 870}
 871
 872void HELPER(gvec_sar8v)(void *d, void *a, void *b, uint32_t desc)
 873{
 874    intptr_t oprsz = simd_oprsz(desc);
 875    intptr_t i;
 876
 877    for (i = 0; i < oprsz; i += sizeof(int8_t)) {
 878        uint8_t sh = *(uint8_t *)(b + i) & 7;
 879        *(int8_t *)(d + i) = *(int8_t *)(a + i) >> sh;
 880    }
 881    clear_high(d, oprsz, desc);
 882}
 883
 884void HELPER(gvec_sar16v)(void *d, void *a, void *b, uint32_t desc)
 885{
 886    intptr_t oprsz = simd_oprsz(desc);
 887    intptr_t i;
 888
 889    for (i = 0; i < oprsz; i += sizeof(int16_t)) {
 890        uint8_t sh = *(uint16_t *)(b + i) & 15;
 891        *(int16_t *)(d + i) = *(int16_t *)(a + i) >> sh;
 892    }
 893    clear_high(d, oprsz, desc);
 894}
 895
 896void HELPER(gvec_sar32v)(void *d, void *a, void *b, uint32_t desc)
 897{
 898    intptr_t oprsz = simd_oprsz(desc);
 899    intptr_t i;
 900
 901    for (i = 0; i < oprsz; i += sizeof(int32_t)) {
 902        uint8_t sh = *(uint32_t *)(b + i) & 31;
 903        *(int32_t *)(d + i) = *(int32_t *)(a + i) >> sh;
 904    }
 905    clear_high(d, oprsz, desc);
 906}
 907
 908void HELPER(gvec_sar64v)(void *d, void *a, void *b, uint32_t desc)
 909{
 910    intptr_t oprsz = simd_oprsz(desc);
 911    intptr_t i;
 912
 913    for (i = 0; i < oprsz; i += sizeof(int64_t)) {
 914        uint8_t sh = *(uint64_t *)(b + i) & 63;
 915        *(int64_t *)(d + i) = *(int64_t *)(a + i) >> sh;
 916    }
 917    clear_high(d, oprsz, desc);
 918}
 919
 920/* If vectors are enabled, the compiler fills in -1 for true.
 921   Otherwise, we must take care of this by hand.  */
 922#ifdef CONFIG_VECTOR16
 923# define DO_CMP0(X)  X
 924#else
 925# define DO_CMP0(X)  -(X)
 926#endif
 927
 928#define DO_CMP1(NAME, TYPE, OP)                                            \
 929void HELPER(NAME)(void *d, void *a, void *b, uint32_t desc)                \
 930{                                                                          \
 931    intptr_t oprsz = simd_oprsz(desc);                                     \
 932    intptr_t i;                                                            \
 933    for (i = 0; i < oprsz; i += sizeof(TYPE)) {                            \
 934        *(TYPE *)(d + i) = DO_CMP0(*(TYPE *)(a + i) OP *(TYPE *)(b + i));  \
 935    }                                                                      \
 936    clear_high(d, oprsz, desc);                                            \
 937}
 938
 939#define DO_CMP2(SZ) \
 940    DO_CMP1(gvec_eq##SZ, vec##SZ, ==)    \
 941    DO_CMP1(gvec_ne##SZ, vec##SZ, !=)    \
 942    DO_CMP1(gvec_lt##SZ, svec##SZ, <)    \
 943    DO_CMP1(gvec_le##SZ, svec##SZ, <=)   \
 944    DO_CMP1(gvec_ltu##SZ, vec##SZ, <)    \
 945    DO_CMP1(gvec_leu##SZ, vec##SZ, <=)
 946
 947DO_CMP2(8)
 948DO_CMP2(16)
 949DO_CMP2(32)
 950DO_CMP2(64)
 951
 952#undef DO_CMP0
 953#undef DO_CMP1
 954#undef DO_CMP2
 955
 956void HELPER(gvec_ssadd8)(void *d, void *a, void *b, uint32_t desc)
 957{
 958    intptr_t oprsz = simd_oprsz(desc);
 959    intptr_t i;
 960
 961    for (i = 0; i < oprsz; i += sizeof(int8_t)) {
 962        int r = *(int8_t *)(a + i) + *(int8_t *)(b + i);
 963        if (r > INT8_MAX) {
 964            r = INT8_MAX;
 965        } else if (r < INT8_MIN) {
 966            r = INT8_MIN;
 967        }
 968        *(int8_t *)(d + i) = r;
 969    }
 970    clear_high(d, oprsz, desc);
 971}
 972
 973void HELPER(gvec_ssadd16)(void *d, void *a, void *b, uint32_t desc)
 974{
 975    intptr_t oprsz = simd_oprsz(desc);
 976    intptr_t i;
 977
 978    for (i = 0; i < oprsz; i += sizeof(int16_t)) {
 979        int r = *(int16_t *)(a + i) + *(int16_t *)(b + i);
 980        if (r > INT16_MAX) {
 981            r = INT16_MAX;
 982        } else if (r < INT16_MIN) {
 983            r = INT16_MIN;
 984        }
 985        *(int16_t *)(d + i) = r;
 986    }
 987    clear_high(d, oprsz, desc);
 988}
 989
 990void HELPER(gvec_ssadd32)(void *d, void *a, void *b, uint32_t desc)
 991{
 992    intptr_t oprsz = simd_oprsz(desc);
 993    intptr_t i;
 994
 995    for (i = 0; i < oprsz; i += sizeof(int32_t)) {
 996        int32_t ai = *(int32_t *)(a + i);
 997        int32_t bi = *(int32_t *)(b + i);
 998        int32_t di = ai + bi;
 999        if (((di ^ ai) &~ (ai ^ bi)) < 0) {
1000            /* Signed overflow.  */
1001            di = (di < 0 ? INT32_MAX : INT32_MIN);
1002        }
1003        *(int32_t *)(d + i) = di;
1004    }
1005    clear_high(d, oprsz, desc);
1006}
1007
1008void HELPER(gvec_ssadd64)(void *d, void *a, void *b, uint32_t desc)
1009{
1010    intptr_t oprsz = simd_oprsz(desc);
1011    intptr_t i;
1012
1013    for (i = 0; i < oprsz; i += sizeof(int64_t)) {
1014        int64_t ai = *(int64_t *)(a + i);
1015        int64_t bi = *(int64_t *)(b + i);
1016        int64_t di = ai + bi;
1017        if (((di ^ ai) &~ (ai ^ bi)) < 0) {
1018            /* Signed overflow.  */
1019            di = (di < 0 ? INT64_MAX : INT64_MIN);
1020        }
1021        *(int64_t *)(d + i) = di;
1022    }
1023    clear_high(d, oprsz, desc);
1024}
1025
1026void HELPER(gvec_sssub8)(void *d, void *a, void *b, uint32_t desc)
1027{
1028    intptr_t oprsz = simd_oprsz(desc);
1029    intptr_t i;
1030
1031    for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
1032        int r = *(int8_t *)(a + i) - *(int8_t *)(b + i);
1033        if (r > INT8_MAX) {
1034            r = INT8_MAX;
1035        } else if (r < INT8_MIN) {
1036            r = INT8_MIN;
1037        }
1038        *(uint8_t *)(d + i) = r;
1039    }
1040    clear_high(d, oprsz, desc);
1041}
1042
1043void HELPER(gvec_sssub16)(void *d, void *a, void *b, uint32_t desc)
1044{
1045    intptr_t oprsz = simd_oprsz(desc);
1046    intptr_t i;
1047
1048    for (i = 0; i < oprsz; i += sizeof(int16_t)) {
1049        int r = *(int16_t *)(a + i) - *(int16_t *)(b + i);
1050        if (r > INT16_MAX) {
1051            r = INT16_MAX;
1052        } else if (r < INT16_MIN) {
1053            r = INT16_MIN;
1054        }
1055        *(int16_t *)(d + i) = r;
1056    }
1057    clear_high(d, oprsz, desc);
1058}
1059
1060void HELPER(gvec_sssub32)(void *d, void *a, void *b, uint32_t desc)
1061{
1062    intptr_t oprsz = simd_oprsz(desc);
1063    intptr_t i;
1064
1065    for (i = 0; i < oprsz; i += sizeof(int32_t)) {
1066        int32_t ai = *(int32_t *)(a + i);
1067        int32_t bi = *(int32_t *)(b + i);
1068        int32_t di = ai - bi;
1069        if (((di ^ ai) & (ai ^ bi)) < 0) {
1070            /* Signed overflow.  */
1071            di = (di < 0 ? INT32_MAX : INT32_MIN);
1072        }
1073        *(int32_t *)(d + i) = di;
1074    }
1075    clear_high(d, oprsz, desc);
1076}
1077
1078void HELPER(gvec_sssub64)(void *d, void *a, void *b, uint32_t desc)
1079{
1080    intptr_t oprsz = simd_oprsz(desc);
1081    intptr_t i;
1082
1083    for (i = 0; i < oprsz; i += sizeof(int64_t)) {
1084        int64_t ai = *(int64_t *)(a + i);
1085        int64_t bi = *(int64_t *)(b + i);
1086        int64_t di = ai - bi;
1087        if (((di ^ ai) & (ai ^ bi)) < 0) {
1088            /* Signed overflow.  */
1089            di = (di < 0 ? INT64_MAX : INT64_MIN);
1090        }
1091        *(int64_t *)(d + i) = di;
1092    }
1093    clear_high(d, oprsz, desc);
1094}
1095
1096void HELPER(gvec_usadd8)(void *d, void *a, void *b, uint32_t desc)
1097{
1098    intptr_t oprsz = simd_oprsz(desc);
1099    intptr_t i;
1100
1101    for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
1102        unsigned r = *(uint8_t *)(a + i) + *(uint8_t *)(b + i);
1103        if (r > UINT8_MAX) {
1104            r = UINT8_MAX;
1105        }
1106        *(uint8_t *)(d + i) = r;
1107    }
1108    clear_high(d, oprsz, desc);
1109}
1110
1111void HELPER(gvec_usadd16)(void *d, void *a, void *b, uint32_t desc)
1112{
1113    intptr_t oprsz = simd_oprsz(desc);
1114    intptr_t i;
1115
1116    for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
1117        unsigned r = *(uint16_t *)(a + i) + *(uint16_t *)(b + i);
1118        if (r > UINT16_MAX) {
1119            r = UINT16_MAX;
1120        }
1121        *(uint16_t *)(d + i) = r;
1122    }
1123    clear_high(d, oprsz, desc);
1124}
1125
1126void HELPER(gvec_usadd32)(void *d, void *a, void *b, uint32_t desc)
1127{
1128    intptr_t oprsz = simd_oprsz(desc);
1129    intptr_t i;
1130
1131    for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
1132        uint32_t ai = *(uint32_t *)(a + i);
1133        uint32_t bi = *(uint32_t *)(b + i);
1134        uint32_t di = ai + bi;
1135        if (di < ai) {
1136            di = UINT32_MAX;
1137        }
1138        *(uint32_t *)(d + i) = di;
1139    }
1140    clear_high(d, oprsz, desc);
1141}
1142
1143void HELPER(gvec_usadd64)(void *d, void *a, void *b, uint32_t desc)
1144{
1145    intptr_t oprsz = simd_oprsz(desc);
1146    intptr_t i;
1147
1148    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1149        uint64_t ai = *(uint64_t *)(a + i);
1150        uint64_t bi = *(uint64_t *)(b + i);
1151        uint64_t di = ai + bi;
1152        if (di < ai) {
1153            di = UINT64_MAX;
1154        }
1155        *(uint64_t *)(d + i) = di;
1156    }
1157    clear_high(d, oprsz, desc);
1158}
1159
1160void HELPER(gvec_ussub8)(void *d, void *a, void *b, uint32_t desc)
1161{
1162    intptr_t oprsz = simd_oprsz(desc);
1163    intptr_t i;
1164
1165    for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
1166        int r = *(uint8_t *)(a + i) - *(uint8_t *)(b + i);
1167        if (r < 0) {
1168            r = 0;
1169        }
1170        *(uint8_t *)(d + i) = r;
1171    }
1172    clear_high(d, oprsz, desc);
1173}
1174
1175void HELPER(gvec_ussub16)(void *d, void *a, void *b, uint32_t desc)
1176{
1177    intptr_t oprsz = simd_oprsz(desc);
1178    intptr_t i;
1179
1180    for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
1181        int r = *(uint16_t *)(a + i) - *(uint16_t *)(b + i);
1182        if (r < 0) {
1183            r = 0;
1184        }
1185        *(uint16_t *)(d + i) = r;
1186    }
1187    clear_high(d, oprsz, desc);
1188}
1189
1190void HELPER(gvec_ussub32)(void *d, void *a, void *b, uint32_t desc)
1191{
1192    intptr_t oprsz = simd_oprsz(desc);
1193    intptr_t i;
1194
1195    for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
1196        uint32_t ai = *(uint32_t *)(a + i);
1197        uint32_t bi = *(uint32_t *)(b + i);
1198        uint32_t di = ai - bi;
1199        if (ai < bi) {
1200            di = 0;
1201        }
1202        *(uint32_t *)(d + i) = di;
1203    }
1204    clear_high(d, oprsz, desc);
1205}
1206
1207void HELPER(gvec_ussub64)(void *d, void *a, void *b, uint32_t desc)
1208{
1209    intptr_t oprsz = simd_oprsz(desc);
1210    intptr_t i;
1211
1212    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1213        uint64_t ai = *(uint64_t *)(a + i);
1214        uint64_t bi = *(uint64_t *)(b + i);
1215        uint64_t di = ai - bi;
1216        if (ai < bi) {
1217            di = 0;
1218        }
1219        *(uint64_t *)(d + i) = di;
1220    }
1221    clear_high(d, oprsz, desc);
1222}
1223
1224void HELPER(gvec_smin8)(void *d, void *a, void *b, uint32_t desc)
1225{
1226    intptr_t oprsz = simd_oprsz(desc);
1227    intptr_t i;
1228
1229    for (i = 0; i < oprsz; i += sizeof(int8_t)) {
1230        int8_t aa = *(int8_t *)(a + i);
1231        int8_t bb = *(int8_t *)(b + i);
1232        int8_t dd = aa < bb ? aa : bb;
1233        *(int8_t *)(d + i) = dd;
1234    }
1235    clear_high(d, oprsz, desc);
1236}
1237
1238void HELPER(gvec_smin16)(void *d, void *a, void *b, uint32_t desc)
1239{
1240    intptr_t oprsz = simd_oprsz(desc);
1241    intptr_t i;
1242
1243    for (i = 0; i < oprsz; i += sizeof(int16_t)) {
1244        int16_t aa = *(int16_t *)(a + i);
1245        int16_t bb = *(int16_t *)(b + i);
1246        int16_t dd = aa < bb ? aa : bb;
1247        *(int16_t *)(d + i) = dd;
1248    }
1249    clear_high(d, oprsz, desc);
1250}
1251
1252void HELPER(gvec_smin32)(void *d, void *a, void *b, uint32_t desc)
1253{
1254    intptr_t oprsz = simd_oprsz(desc);
1255    intptr_t i;
1256
1257    for (i = 0; i < oprsz; i += sizeof(int32_t)) {
1258        int32_t aa = *(int32_t *)(a + i);
1259        int32_t bb = *(int32_t *)(b + i);
1260        int32_t dd = aa < bb ? aa : bb;
1261        *(int32_t *)(d + i) = dd;
1262    }
1263    clear_high(d, oprsz, desc);
1264}
1265
1266void HELPER(gvec_smin64)(void *d, void *a, void *b, uint32_t desc)
1267{
1268    intptr_t oprsz = simd_oprsz(desc);
1269    intptr_t i;
1270
1271    for (i = 0; i < oprsz; i += sizeof(int64_t)) {
1272        int64_t aa = *(int64_t *)(a + i);
1273        int64_t bb = *(int64_t *)(b + i);
1274        int64_t dd = aa < bb ? aa : bb;
1275        *(int64_t *)(d + i) = dd;
1276    }
1277    clear_high(d, oprsz, desc);
1278}
1279
1280void HELPER(gvec_smax8)(void *d, void *a, void *b, uint32_t desc)
1281{
1282    intptr_t oprsz = simd_oprsz(desc);
1283    intptr_t i;
1284
1285    for (i = 0; i < oprsz; i += sizeof(int8_t)) {
1286        int8_t aa = *(int8_t *)(a + i);
1287        int8_t bb = *(int8_t *)(b + i);
1288        int8_t dd = aa > bb ? aa : bb;
1289        *(int8_t *)(d + i) = dd;
1290    }
1291    clear_high(d, oprsz, desc);
1292}
1293
1294void HELPER(gvec_smax16)(void *d, void *a, void *b, uint32_t desc)
1295{
1296    intptr_t oprsz = simd_oprsz(desc);
1297    intptr_t i;
1298
1299    for (i = 0; i < oprsz; i += sizeof(int16_t)) {
1300        int16_t aa = *(int16_t *)(a + i);
1301        int16_t bb = *(int16_t *)(b + i);
1302        int16_t dd = aa > bb ? aa : bb;
1303        *(int16_t *)(d + i) = dd;
1304    }
1305    clear_high(d, oprsz, desc);
1306}
1307
1308void HELPER(gvec_smax32)(void *d, void *a, void *b, uint32_t desc)
1309{
1310    intptr_t oprsz = simd_oprsz(desc);
1311    intptr_t i;
1312
1313    for (i = 0; i < oprsz; i += sizeof(int32_t)) {
1314        int32_t aa = *(int32_t *)(a + i);
1315        int32_t bb = *(int32_t *)(b + i);
1316        int32_t dd = aa > bb ? aa : bb;
1317        *(int32_t *)(d + i) = dd;
1318    }
1319    clear_high(d, oprsz, desc);
1320}
1321
1322void HELPER(gvec_smax64)(void *d, void *a, void *b, uint32_t desc)
1323{
1324    intptr_t oprsz = simd_oprsz(desc);
1325    intptr_t i;
1326
1327    for (i = 0; i < oprsz; i += sizeof(int64_t)) {
1328        int64_t aa = *(int64_t *)(a + i);
1329        int64_t bb = *(int64_t *)(b + i);
1330        int64_t dd = aa > bb ? aa : bb;
1331        *(int64_t *)(d + i) = dd;
1332    }
1333    clear_high(d, oprsz, desc);
1334}
1335
1336void HELPER(gvec_umin8)(void *d, void *a, void *b, uint32_t desc)
1337{
1338    intptr_t oprsz = simd_oprsz(desc);
1339    intptr_t i;
1340
1341    for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
1342        uint8_t aa = *(uint8_t *)(a + i);
1343        uint8_t bb = *(uint8_t *)(b + i);
1344        uint8_t dd = aa < bb ? aa : bb;
1345        *(uint8_t *)(d + i) = dd;
1346    }
1347    clear_high(d, oprsz, desc);
1348}
1349
1350void HELPER(gvec_umin16)(void *d, void *a, void *b, uint32_t desc)
1351{
1352    intptr_t oprsz = simd_oprsz(desc);
1353    intptr_t i;
1354
1355    for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
1356        uint16_t aa = *(uint16_t *)(a + i);
1357        uint16_t bb = *(uint16_t *)(b + i);
1358        uint16_t dd = aa < bb ? aa : bb;
1359        *(uint16_t *)(d + i) = dd;
1360    }
1361    clear_high(d, oprsz, desc);
1362}
1363
1364void HELPER(gvec_umin32)(void *d, void *a, void *b, uint32_t desc)
1365{
1366    intptr_t oprsz = simd_oprsz(desc);
1367    intptr_t i;
1368
1369    for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
1370        uint32_t aa = *(uint32_t *)(a + i);
1371        uint32_t bb = *(uint32_t *)(b + i);
1372        uint32_t dd = aa < bb ? aa : bb;
1373        *(uint32_t *)(d + i) = dd;
1374    }
1375    clear_high(d, oprsz, desc);
1376}
1377
1378void HELPER(gvec_umin64)(void *d, void *a, void *b, uint32_t desc)
1379{
1380    intptr_t oprsz = simd_oprsz(desc);
1381    intptr_t i;
1382
1383    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1384        uint64_t aa = *(uint64_t *)(a + i);
1385        uint64_t bb = *(uint64_t *)(b + i);
1386        uint64_t dd = aa < bb ? aa : bb;
1387        *(uint64_t *)(d + i) = dd;
1388    }
1389    clear_high(d, oprsz, desc);
1390}
1391
1392void HELPER(gvec_umax8)(void *d, void *a, void *b, uint32_t desc)
1393{
1394    intptr_t oprsz = simd_oprsz(desc);
1395    intptr_t i;
1396
1397    for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
1398        uint8_t aa = *(uint8_t *)(a + i);
1399        uint8_t bb = *(uint8_t *)(b + i);
1400        uint8_t dd = aa > bb ? aa : bb;
1401        *(uint8_t *)(d + i) = dd;
1402    }
1403    clear_high(d, oprsz, desc);
1404}
1405
1406void HELPER(gvec_umax16)(void *d, void *a, void *b, uint32_t desc)
1407{
1408    intptr_t oprsz = simd_oprsz(desc);
1409    intptr_t i;
1410
1411    for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
1412        uint16_t aa = *(uint16_t *)(a + i);
1413        uint16_t bb = *(uint16_t *)(b + i);
1414        uint16_t dd = aa > bb ? aa : bb;
1415        *(uint16_t *)(d + i) = dd;
1416    }
1417    clear_high(d, oprsz, desc);
1418}
1419
1420void HELPER(gvec_umax32)(void *d, void *a, void *b, uint32_t desc)
1421{
1422    intptr_t oprsz = simd_oprsz(desc);
1423    intptr_t i;
1424
1425    for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
1426        uint32_t aa = *(uint32_t *)(a + i);
1427        uint32_t bb = *(uint32_t *)(b + i);
1428        uint32_t dd = aa > bb ? aa : bb;
1429        *(uint32_t *)(d + i) = dd;
1430    }
1431    clear_high(d, oprsz, desc);
1432}
1433
1434void HELPER(gvec_umax64)(void *d, void *a, void *b, uint32_t desc)
1435{
1436    intptr_t oprsz = simd_oprsz(desc);
1437    intptr_t i;
1438
1439    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1440        uint64_t aa = *(uint64_t *)(a + i);
1441        uint64_t bb = *(uint64_t *)(b + i);
1442        uint64_t dd = aa > bb ? aa : bb;
1443        *(uint64_t *)(d + i) = dd;
1444    }
1445    clear_high(d, oprsz, desc);
1446}
1447
1448void HELPER(gvec_bitsel)(void *d, void *a, void *b, void *c, uint32_t desc)
1449{
1450    intptr_t oprsz = simd_oprsz(desc);
1451    intptr_t i;
1452
1453    for (i = 0; i < oprsz; i += sizeof(vec64)) {
1454        vec64 aa = *(vec64 *)(a + i);
1455        vec64 bb = *(vec64 *)(b + i);
1456        vec64 cc = *(vec64 *)(c + i);
1457        *(vec64 *)(d + i) = (bb & aa) | (cc & ~aa);
1458    }
1459    clear_high(d, oprsz, desc);
1460}
1461