qemu/target/riscv/vector_helper.c
   1/*
   2 * RISC-V Vector Extension Helpers for QEMU.
   3 *
   4 * Copyright (c) 2020 T-Head Semiconductor Co., Ltd. All rights reserved.
   5 *
   6 * This program is free software; you can redistribute it and/or modify it
   7 * under the terms and conditions of the GNU General Public License,
   8 * version 2 or later, as published by the Free Software Foundation.
   9 *
  10 * This program is distributed in the hope it will be useful, but WITHOUT
  11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  13 * more details.
  14 *
  15 * You should have received a copy of the GNU General Public License along with
  16 * this program.  If not, see <http://www.gnu.org/licenses/>.
  17 */
  18
  19#include "qemu/osdep.h"
  20#include "qemu/host-utils.h"
  21#include "qemu/bitops.h"
  22#include "cpu.h"
  23#include "exec/memop.h"
  24#include "exec/exec-all.h"
  25#include "exec/helper-proto.h"
  26#include "fpu/softfloat.h"
  27#include "tcg/tcg-gvec-desc.h"
  28#include "internals.h"
  29#include <math.h>
  30
  31target_ulong HELPER(vsetvl)(CPURISCVState *env, target_ulong s1,
  32                            target_ulong s2)
  33{
  34    int vlmax, vl;
  35    RISCVCPU *cpu = env_archcpu(env);
  36    uint64_t lmul = FIELD_EX64(s2, VTYPE, VLMUL);
  37    uint16_t sew = 8 << FIELD_EX64(s2, VTYPE, VSEW);
  38    uint8_t ediv = FIELD_EX64(s2, VTYPE, VEDIV);
  39    int xlen = riscv_cpu_xlen(env);
  40    bool vill = (s2 >> (xlen - 1)) & 0x1;
  41    target_ulong reserved = s2 &
  42                            MAKE_64BIT_MASK(R_VTYPE_RESERVED_SHIFT,
  43                                            xlen - 1 - R_VTYPE_RESERVED_SHIFT);
  44
  45    if (lmul & 4) {
  46        /* Fractional LMUL. */
  47        if (lmul == 4 ||
  48            cpu->cfg.elen >> (8 - lmul) < sew) {
  49            vill = true;
  50        }
  51    }
  52
  53    if ((sew > cpu->cfg.elen)
  54        || vill
  55        || (ediv != 0)
  56        || (reserved != 0)) {
  57        /* only set vill bit. */
  58        env->vill = 1;
  59        env->vtype = 0;
  60        env->vl = 0;
  61        env->vstart = 0;
  62        return 0;
  63    }
  64
  65    vlmax = vext_get_vlmax(cpu, s2);
  66    if (s1 <= vlmax) {
  67        vl = s1;
  68    } else {
  69        vl = vlmax;
  70    }
  71    env->vl = vl;
  72    env->vtype = s2;
  73    env->vstart = 0;
  74    env->vill = 0;
  75    return vl;
  76}
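
/*
 * Worked example (illustrative): with VLEN = 128, and vtype requesting
 * SEW = 32 and LMUL = 2, VLMAX = LMUL * VLEN / SEW = 8.  A request of
 * s1 = 10 is then clamped to vl = 8, while s1 = 5 is kept as-is (vl = 5).
 */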
  77
  78/*
  79 * Note that vector data is stored in host-endian 64-bit chunks,
  80 * so addressing units smaller than that needs a host-endian fixup.
  81 */
  82#if HOST_BIG_ENDIAN
  83#define H1(x)   ((x) ^ 7)
  84#define H1_2(x) ((x) ^ 6)
  85#define H1_4(x) ((x) ^ 4)
  86#define H2(x)   ((x) ^ 3)
  87#define H4(x)   ((x) ^ 1)
  88#define H8(x)   ((x))
  89#else
  90#define H1(x)   (x)
  91#define H1_2(x) (x)
  92#define H1_4(x) (x)
  93#define H2(x)   (x)
  94#define H4(x)   (x)
  95#define H8(x)   (x)
  96#endif
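
/*
 * Example of the fixup above (illustrative): on a big-endian host the byte
 * for element 0 lives at offset 7 of the first 64-bit chunk, so
 * H1(0) == 7 and H1(8) == 15, while H4(0) == 1 swaps the two 32-bit halves
 * of a chunk.  On little-endian hosts every H macro is the identity.
 */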
  97
  98static inline uint32_t vext_nf(uint32_t desc)
  99{
 100    return FIELD_EX32(simd_data(desc), VDATA, NF);
 101}
 102
 103static inline uint32_t vext_vm(uint32_t desc)
 104{
 105    return FIELD_EX32(simd_data(desc), VDATA, VM);
 106}
 107
 108/*
 109 * Encode LMUL to lmul as following:
 110 *     LMUL    vlmul    lmul
 111 *      1       000       0
 112 *      2       001       1
 113 *      4       010       2
 114 *      8       011       3
 115 *      -       100       -
 116 *     1/8      101      -3
 117 *     1/4      110      -2
 118 *     1/2      111      -1
 119 */
 120static inline int32_t vext_lmul(uint32_t desc)
 121{
 122    return sextract32(FIELD_EX32(simd_data(desc), VDATA, LMUL), 0, 3);
 123}
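
/*
 * For example (illustrative): vlmul = 0b101 (LMUL = 1/8) sign-extends as
 *
 *     sextract32(0b101, 0, 3) == -3
 *
 * so callers can treat the result as log2(LMUL) and shift by it, as
 * vext_max_elems() does below.
 */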
 124
 125static inline uint32_t vext_vta(uint32_t desc)
 126{
 127    return FIELD_EX32(simd_data(desc), VDATA, VTA);
 128}
 129
 130static inline uint32_t vext_vta_all_1s(uint32_t desc)
 131{
 132    return FIELD_EX32(simd_data(desc), VDATA, VTA_ALL_1S);
 133}
 134
 135/*
  136 * Get the maximum number of elements that can be operated on.
 137 *
 138 * log2_esz: log2 of element size in bytes.
 139 */
 140static inline uint32_t vext_max_elems(uint32_t desc, uint32_t log2_esz)
 141{
 142    /*
  143     * As simd_desc supports at most 2048 bytes, the max vlen is 1024 bits,
  144     * so vlen in bytes (vlenb) is encoded as maxsz.
 145     */
 146    uint32_t vlenb = simd_maxsz(desc);
 147
 148    /* Return VLMAX */
 149    int scale = vext_lmul(desc) - log2_esz;
 150    return scale < 0 ? vlenb >> -scale : vlenb << scale;
 151}
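
/*
 * Illustrative numbers: with VLEN = 128 (vlenb = 16), LMUL = 2
 * (vext_lmul() == 1) and SEW = 32 (log2_esz == 2):
 *
 *     scale = 1 - 2 = -1;
 *     VLMAX = 16 >> 1 = 8;    (== LMUL * VLEN / SEW)
 */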
 152
 153/*
 154 * Get number of total elements, including prestart, body and tail elements.
 155 * Note that when LMUL < 1, the tail includes the elements past VLMAX that
 156 * are held in the same vector register.
 157 */
 158static inline uint32_t vext_get_total_elems(CPURISCVState *env, uint32_t desc,
 159                                            uint32_t esz)
 160{
 161    uint32_t vlenb = simd_maxsz(desc);
 162    uint32_t sew = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
 163    int8_t emul = ctzl(esz) - ctzl(sew) + vext_lmul(desc) < 0 ? 0 :
 164                  ctzl(esz) - ctzl(sew) + vext_lmul(desc);
 165    return (vlenb << emul) / esz;
 166}
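
/*
 * Rough example (illustrative): with vlenb = 16, esz == sew == 4 and
 * LMUL = 1/2 (vext_lmul() == -1), emul clamps to 0 and
 *
 *     total_elems = (16 << 0) / 4 = 4;
 *
 * i.e. the full single register backing the operand, which is larger than
 * VLMAX (2 here), matching the note about LMUL < 1 above.
 */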
 167
 168static inline target_ulong adjust_addr(CPURISCVState *env, target_ulong addr)
 169{
 170    return (addr & env->cur_pmmask) | env->cur_pmbase;
 171}
 172
 173/*
  174 * This function checks watchpoints before the real load operation.
  175 *
  176 * In softmmu mode, the TLB API probe_access is enough for the watchpoint check.
  177 * In user mode, there is no watchpoint support for now.
  178 *
  179 * It will trigger an exception if there is no mapping in the TLB
  180 * and the page table walk can't fill the TLB entry. Then the guest
  181 * software can return here after processing the exception, or never return.
 182 */
 183static void probe_pages(CPURISCVState *env, target_ulong addr,
 184                        target_ulong len, uintptr_t ra,
 185                        MMUAccessType access_type)
 186{
 187    target_ulong pagelen = -(addr | TARGET_PAGE_MASK);
 188    target_ulong curlen = MIN(pagelen, len);
 189
 190    probe_access(env, adjust_addr(env, addr), curlen, access_type,
 191                 cpu_mmu_index(env, false), ra);
 192    if (len > curlen) {
 193        addr += curlen;
 194        curlen = len - curlen;
 195        probe_access(env, adjust_addr(env, addr), curlen, access_type,
 196                     cpu_mmu_index(env, false), ra);
 197    }
 198}
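
/*
 * Example of the page split above (illustrative, assuming 4 KiB pages):
 * len = 8 at addr = 0xffc gives pagelen = 4, so the first probe_access()
 * covers [0xffc, 0x1000) and the second covers [0x1000, 0x1004), one call
 * per page touched.
 */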
 199
 200/* set agnostic elements to 1s */
 201static void vext_set_elems_1s(void *base, uint32_t is_agnostic, uint32_t cnt,
 202                              uint32_t tot)
 203{
 204    if (is_agnostic == 0) {
 205        /* policy undisturbed */
 206        return;
 207    }
 208    if (tot - cnt == 0) {
  209        return;
 210    }
 211    memset(base + cnt, -1, tot - cnt);
 212}
 213
 214static inline void vext_set_elem_mask(void *v0, int index,
 215                                      uint8_t value)
 216{
 217    int idx = index / 64;
 218    int pos = index % 64;
 219    uint64_t old = ((uint64_t *)v0)[idx];
 220    ((uint64_t *)v0)[idx] = deposit64(old, pos, 1, value);
 221}
 222
 223/*
 224 * Earlier designs (pre-0.9) had a varying number of bits
 225 * per mask value (MLEN). In the 0.9 design, MLEN=1.
 226 * (Section 4.5)
 227 */
 228static inline int vext_elem_mask(void *v0, int index)
 229{
 230    int idx = index / 64;
  231    int pos = index % 64;
 232    return (((uint64_t *)v0)[idx] >> pos) & 1;
 233}
 234
  235/* element operations for load and store */
 236typedef void vext_ldst_elem_fn(CPURISCVState *env, target_ulong addr,
 237                               uint32_t idx, void *vd, uintptr_t retaddr);
 238
 239#define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF)            \
 240static void NAME(CPURISCVState *env, abi_ptr addr,         \
 241                 uint32_t idx, void *vd, uintptr_t retaddr)\
 242{                                                          \
 243    ETYPE *cur = ((ETYPE *)vd + H(idx));                   \
 244    *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr);      \
 245}                                                          \
 246
 247GEN_VEXT_LD_ELEM(lde_b, int8_t,  H1, ldsb)
 248GEN_VEXT_LD_ELEM(lde_h, int16_t, H2, ldsw)
 249GEN_VEXT_LD_ELEM(lde_w, int32_t, H4, ldl)
 250GEN_VEXT_LD_ELEM(lde_d, int64_t, H8, ldq)
 251
 252#define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF)            \
 253static void NAME(CPURISCVState *env, abi_ptr addr,         \
 254                 uint32_t idx, void *vd, uintptr_t retaddr)\
 255{                                                          \
 256    ETYPE data = *((ETYPE *)vd + H(idx));                  \
 257    cpu_##STSUF##_data_ra(env, addr, data, retaddr);       \
 258}
 259
 260GEN_VEXT_ST_ELEM(ste_b, int8_t,  H1, stb)
 261GEN_VEXT_ST_ELEM(ste_h, int16_t, H2, stw)
 262GEN_VEXT_ST_ELEM(ste_w, int32_t, H4, stl)
 263GEN_VEXT_ST_ELEM(ste_d, int64_t, H8, stq)
 264
 265/*
 266 *** stride: access vector element from strided memory
 267 */
 268static void
 269vext_ldst_stride(void *vd, void *v0, target_ulong base,
 270                 target_ulong stride, CPURISCVState *env,
 271                 uint32_t desc, uint32_t vm,
 272                 vext_ldst_elem_fn *ldst_elem,
 273                 uint32_t log2_esz, uintptr_t ra)
 274{
 275    uint32_t i, k;
 276    uint32_t nf = vext_nf(desc);
 277    uint32_t max_elems = vext_max_elems(desc, log2_esz);
 278    uint32_t esz = 1 << log2_esz;
 279    uint32_t total_elems = vext_get_total_elems(env, desc, esz);
 280    uint32_t vta = vext_vta(desc);
 281
 282    for (i = env->vstart; i < env->vl; i++, env->vstart++) {
 283        if (!vm && !vext_elem_mask(v0, i)) {
 284            continue;
 285        }
 286
 287        k = 0;
 288        while (k < nf) {
 289            target_ulong addr = base + stride * i + (k << log2_esz);
 290            ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
 291            k++;
 292        }
 293    }
 294    env->vstart = 0;
 295    /* set tail elements to 1s */
 296    for (k = 0; k < nf; ++k) {
 297        vext_set_elems_1s(vd, vta, (k * max_elems + env->vl) * esz,
 298                          (k * max_elems + max_elems) * esz);
 299    }
 300    if (nf * max_elems % total_elems != 0) {
 301        uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
 302        uint32_t registers_used =
 303            ((nf * max_elems) * esz + (vlenb - 1)) / vlenb;
 304        vext_set_elems_1s(vd, vta, (nf * max_elems) * esz,
 305                          registers_used * vlenb);
 306    }
 307}
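
/*
 * Access pattern sketch (illustrative): a segment load with nf = 2,
 * esz = 4 and stride = 32 accesses, for element i,
 *
 *     field 0:  base + 32 * i
 *     field 1:  base + 32 * i + 4
 *
 * and places field k at element index i + k * max_elems of vd, i.e. one
 * register group per field.
 */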
 308
 309#define GEN_VEXT_LD_STRIDE(NAME, ETYPE, LOAD_FN)                        \
  310void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
 311                  target_ulong stride, CPURISCVState *env,              \
 312                  uint32_t desc)                                        \
 313{                                                                       \
 314    uint32_t vm = vext_vm(desc);                                        \
 315    vext_ldst_stride(vd, v0, base, stride, env, desc, vm, LOAD_FN,      \
 316                     ctzl(sizeof(ETYPE)), GETPC());                     \
 317}
 318
 319GEN_VEXT_LD_STRIDE(vlse8_v,  int8_t,  lde_b)
 320GEN_VEXT_LD_STRIDE(vlse16_v, int16_t, lde_h)
 321GEN_VEXT_LD_STRIDE(vlse32_v, int32_t, lde_w)
 322GEN_VEXT_LD_STRIDE(vlse64_v, int64_t, lde_d)
 323
 324#define GEN_VEXT_ST_STRIDE(NAME, ETYPE, STORE_FN)                       \
 325void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
 326                  target_ulong stride, CPURISCVState *env,              \
 327                  uint32_t desc)                                        \
 328{                                                                       \
 329    uint32_t vm = vext_vm(desc);                                        \
 330    vext_ldst_stride(vd, v0, base, stride, env, desc, vm, STORE_FN,     \
 331                     ctzl(sizeof(ETYPE)), GETPC());                     \
 332}
 333
 334GEN_VEXT_ST_STRIDE(vsse8_v,  int8_t,  ste_b)
 335GEN_VEXT_ST_STRIDE(vsse16_v, int16_t, ste_h)
 336GEN_VEXT_ST_STRIDE(vsse32_v, int32_t, ste_w)
 337GEN_VEXT_ST_STRIDE(vsse64_v, int64_t, ste_d)
 338
 339/*
 340 *** unit-stride: access elements stored contiguously in memory
 341 */
 342
  343/* unmasked unit-stride load and store operations */
 344static void
 345vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
 346             vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uint32_t evl,
 347             uintptr_t ra)
 348{
 349    uint32_t i, k;
 350    uint32_t nf = vext_nf(desc);
 351    uint32_t max_elems = vext_max_elems(desc, log2_esz);
 352    uint32_t esz = 1 << log2_esz;
 353    uint32_t total_elems = vext_get_total_elems(env, desc, esz);
 354    uint32_t vta = vext_vta(desc);
 355
 356    /* load bytes from guest memory */
 357    for (i = env->vstart; i < evl; i++, env->vstart++) {
 358        k = 0;
 359        while (k < nf) {
 360            target_ulong addr = base + ((i * nf + k) << log2_esz);
 361            ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
 362            k++;
 363        }
 364    }
 365    env->vstart = 0;
 366    /* set tail elements to 1s */
 367    for (k = 0; k < nf; ++k) {
 368        vext_set_elems_1s(vd, vta, (k * max_elems + evl) * esz,
 369                          (k * max_elems + max_elems) * esz);
 370    }
 371    if (nf * max_elems % total_elems != 0) {
 372        uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
 373        uint32_t registers_used =
 374            ((nf * max_elems) * esz + (vlenb - 1)) / vlenb;
 375        vext_set_elems_1s(vd, vta, (nf * max_elems) * esz,
 376                          registers_used * vlenb);
 377    }
 378}
 379
 380/*
  381 * Masked unit-stride load and store operations are a special case of stride
  382 * operations, with stride = NF * sizeof(ETYPE).
 383 */
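
/*
 * For example (illustrative): a masked vle32.v (nf = 1) is handled as a
 * strided access with stride = 1 * sizeof(int32_t) = 4, so active elements
 * still read consecutive addresses.
 */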
 384
 385#define GEN_VEXT_LD_US(NAME, ETYPE, LOAD_FN)                            \
 386void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,         \
 387                         CPURISCVState *env, uint32_t desc)             \
 388{                                                                       \
 389    uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));             \
 390    vext_ldst_stride(vd, v0, base, stride, env, desc, false, LOAD_FN,   \
 391                     ctzl(sizeof(ETYPE)), GETPC());                     \
 392}                                                                       \
 393                                                                        \
 394void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
 395                  CPURISCVState *env, uint32_t desc)                    \
 396{                                                                       \
 397    vext_ldst_us(vd, base, env, desc, LOAD_FN,                          \
 398                 ctzl(sizeof(ETYPE)), env->vl, GETPC());                \
 399}
 400
 401GEN_VEXT_LD_US(vle8_v,  int8_t,  lde_b)
 402GEN_VEXT_LD_US(vle16_v, int16_t, lde_h)
 403GEN_VEXT_LD_US(vle32_v, int32_t, lde_w)
 404GEN_VEXT_LD_US(vle64_v, int64_t, lde_d)
 405
 406#define GEN_VEXT_ST_US(NAME, ETYPE, STORE_FN)                            \
 407void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,          \
 408                         CPURISCVState *env, uint32_t desc)              \
 409{                                                                        \
 410    uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));              \
 411    vext_ldst_stride(vd, v0, base, stride, env, desc, false, STORE_FN,   \
 412                     ctzl(sizeof(ETYPE)), GETPC());                      \
 413}                                                                        \
 414                                                                         \
 415void HELPER(NAME)(void *vd, void *v0, target_ulong base,                 \
 416                  CPURISCVState *env, uint32_t desc)                     \
 417{                                                                        \
 418    vext_ldst_us(vd, base, env, desc, STORE_FN,                          \
 419                 ctzl(sizeof(ETYPE)), env->vl, GETPC());                 \
 420}
 421
 422GEN_VEXT_ST_US(vse8_v,  int8_t,  ste_b)
 423GEN_VEXT_ST_US(vse16_v, int16_t, ste_h)
 424GEN_VEXT_ST_US(vse32_v, int32_t, ste_w)
 425GEN_VEXT_ST_US(vse64_v, int64_t, ste_d)
 426
 427/*
 428 *** unit stride mask load and store, EEW = 1
 429 */
 430void HELPER(vlm_v)(void *vd, void *v0, target_ulong base,
 431                    CPURISCVState *env, uint32_t desc)
 432{
 433    /* evl = ceil(vl/8) */
 434    uint8_t evl = (env->vl + 7) >> 3;
 435    vext_ldst_us(vd, base, env, desc, lde_b,
 436                 0, evl, GETPC());
 437}
 438
 439void HELPER(vsm_v)(void *vd, void *v0, target_ulong base,
 440                    CPURISCVState *env, uint32_t desc)
 441{
 442    /* evl = ceil(vl/8) */
 443    uint8_t evl = (env->vl + 7) >> 3;
 444    vext_ldst_us(vd, base, env, desc, ste_b,
 445                 0, evl, GETPC());
 446}
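
/*
 * Example (illustrative): with vl = 17, evl = (17 + 7) >> 3 = 3, so the
 * mask load/store above transfers 3 bytes, enough to hold 17 mask bits.
 */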
 447
 448/*
 449 *** index: access vector element from indexed memory
 450 */
 451typedef target_ulong vext_get_index_addr(target_ulong base,
 452        uint32_t idx, void *vs2);
 453
 454#define GEN_VEXT_GET_INDEX_ADDR(NAME, ETYPE, H)        \
 455static target_ulong NAME(target_ulong base,            \
 456                         uint32_t idx, void *vs2)      \
 457{                                                      \
 458    return (base + *((ETYPE *)vs2 + H(idx)));          \
 459}
 460
 461GEN_VEXT_GET_INDEX_ADDR(idx_b, uint8_t,  H1)
 462GEN_VEXT_GET_INDEX_ADDR(idx_h, uint16_t, H2)
 463GEN_VEXT_GET_INDEX_ADDR(idx_w, uint32_t, H4)
 464GEN_VEXT_GET_INDEX_ADDR(idx_d, uint64_t, H8)
 465
 466static inline void
 467vext_ldst_index(void *vd, void *v0, target_ulong base,
 468                void *vs2, CPURISCVState *env, uint32_t desc,
 469                vext_get_index_addr get_index_addr,
 470                vext_ldst_elem_fn *ldst_elem,
 471                uint32_t log2_esz, uintptr_t ra)
 472{
 473    uint32_t i, k;
 474    uint32_t nf = vext_nf(desc);
 475    uint32_t vm = vext_vm(desc);
 476    uint32_t max_elems = vext_max_elems(desc, log2_esz);
 477    uint32_t esz = 1 << log2_esz;
 478    uint32_t total_elems = vext_get_total_elems(env, desc, esz);
 479    uint32_t vta = vext_vta(desc);
 480
 481    /* load bytes from guest memory */
 482    for (i = env->vstart; i < env->vl; i++, env->vstart++) {
 483        if (!vm && !vext_elem_mask(v0, i)) {
 484            continue;
 485        }
 486
 487        k = 0;
 488        while (k < nf) {
 489            abi_ptr addr = get_index_addr(base, i, vs2) + (k << log2_esz);
 490            ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
 491            k++;
 492        }
 493    }
 494    env->vstart = 0;
 495    /* set tail elements to 1s */
 496    for (k = 0; k < nf; ++k) {
 497        vext_set_elems_1s(vd, vta, (k * max_elems + env->vl) * esz,
 498                          (k * max_elems + max_elems) * esz);
 499    }
 500    if (nf * max_elems % total_elems != 0) {
 501        uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
 502        uint32_t registers_used =
 503            ((nf * max_elems) * esz + (vlenb - 1)) / vlenb;
 504        vext_set_elems_1s(vd, vta, (nf * max_elems) * esz,
 505                          registers_used * vlenb);
 506    }
 507}
 508
 509#define GEN_VEXT_LD_INDEX(NAME, ETYPE, INDEX_FN, LOAD_FN)                  \
 510void HELPER(NAME)(void *vd, void *v0, target_ulong base,                   \
 511                  void *vs2, CPURISCVState *env, uint32_t desc)            \
 512{                                                                          \
 513    vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,                \
 514                    LOAD_FN, ctzl(sizeof(ETYPE)), GETPC());                \
 515}
 516
 517GEN_VEXT_LD_INDEX(vlxei8_8_v,   int8_t,  idx_b, lde_b)
 518GEN_VEXT_LD_INDEX(vlxei8_16_v,  int16_t, idx_b, lde_h)
 519GEN_VEXT_LD_INDEX(vlxei8_32_v,  int32_t, idx_b, lde_w)
 520GEN_VEXT_LD_INDEX(vlxei8_64_v,  int64_t, idx_b, lde_d)
 521GEN_VEXT_LD_INDEX(vlxei16_8_v,  int8_t,  idx_h, lde_b)
 522GEN_VEXT_LD_INDEX(vlxei16_16_v, int16_t, idx_h, lde_h)
 523GEN_VEXT_LD_INDEX(vlxei16_32_v, int32_t, idx_h, lde_w)
 524GEN_VEXT_LD_INDEX(vlxei16_64_v, int64_t, idx_h, lde_d)
 525GEN_VEXT_LD_INDEX(vlxei32_8_v,  int8_t,  idx_w, lde_b)
 526GEN_VEXT_LD_INDEX(vlxei32_16_v, int16_t, idx_w, lde_h)
 527GEN_VEXT_LD_INDEX(vlxei32_32_v, int32_t, idx_w, lde_w)
 528GEN_VEXT_LD_INDEX(vlxei32_64_v, int64_t, idx_w, lde_d)
 529GEN_VEXT_LD_INDEX(vlxei64_8_v,  int8_t,  idx_d, lde_b)
 530GEN_VEXT_LD_INDEX(vlxei64_16_v, int16_t, idx_d, lde_h)
 531GEN_VEXT_LD_INDEX(vlxei64_32_v, int32_t, idx_d, lde_w)
 532GEN_VEXT_LD_INDEX(vlxei64_64_v, int64_t, idx_d, lde_d)
 533
 534#define GEN_VEXT_ST_INDEX(NAME, ETYPE, INDEX_FN, STORE_FN)       \
 535void HELPER(NAME)(void *vd, void *v0, target_ulong base,         \
 536                  void *vs2, CPURISCVState *env, uint32_t desc)  \
 537{                                                                \
 538    vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,      \
 539                    STORE_FN, ctzl(sizeof(ETYPE)),               \
 540                    GETPC());                                    \
 541}
 542
 543GEN_VEXT_ST_INDEX(vsxei8_8_v,   int8_t,  idx_b, ste_b)
 544GEN_VEXT_ST_INDEX(vsxei8_16_v,  int16_t, idx_b, ste_h)
 545GEN_VEXT_ST_INDEX(vsxei8_32_v,  int32_t, idx_b, ste_w)
 546GEN_VEXT_ST_INDEX(vsxei8_64_v,  int64_t, idx_b, ste_d)
 547GEN_VEXT_ST_INDEX(vsxei16_8_v,  int8_t,  idx_h, ste_b)
 548GEN_VEXT_ST_INDEX(vsxei16_16_v, int16_t, idx_h, ste_h)
 549GEN_VEXT_ST_INDEX(vsxei16_32_v, int32_t, idx_h, ste_w)
 550GEN_VEXT_ST_INDEX(vsxei16_64_v, int64_t, idx_h, ste_d)
 551GEN_VEXT_ST_INDEX(vsxei32_8_v,  int8_t,  idx_w, ste_b)
 552GEN_VEXT_ST_INDEX(vsxei32_16_v, int16_t, idx_w, ste_h)
 553GEN_VEXT_ST_INDEX(vsxei32_32_v, int32_t, idx_w, ste_w)
 554GEN_VEXT_ST_INDEX(vsxei32_64_v, int64_t, idx_w, ste_d)
 555GEN_VEXT_ST_INDEX(vsxei64_8_v,  int8_t,  idx_d, ste_b)
 556GEN_VEXT_ST_INDEX(vsxei64_16_v, int16_t, idx_d, ste_h)
 557GEN_VEXT_ST_INDEX(vsxei64_32_v, int32_t, idx_d, ste_w)
 558GEN_VEXT_ST_INDEX(vsxei64_64_v, int64_t, idx_d, ste_d)
 559
 560/*
  561 *** unit-stride fault-only-first load instructions
 562 */
 563static inline void
 564vext_ldff(void *vd, void *v0, target_ulong base,
 565          CPURISCVState *env, uint32_t desc,
 566          vext_ldst_elem_fn *ldst_elem,
 567          uint32_t log2_esz, uintptr_t ra)
 568{
 569    void *host;
 570    uint32_t i, k, vl = 0;
 571    uint32_t nf = vext_nf(desc);
 572    uint32_t vm = vext_vm(desc);
 573    uint32_t max_elems = vext_max_elems(desc, log2_esz);
 574    uint32_t esz = 1 << log2_esz;
 575    uint32_t total_elems = vext_get_total_elems(env, desc, esz);
 576    uint32_t vta = vext_vta(desc);
 577    target_ulong addr, offset, remain;
 578
  579    /* probe every access */
 580    for (i = env->vstart; i < env->vl; i++) {
 581        if (!vm && !vext_elem_mask(v0, i)) {
 582            continue;
 583        }
 584        addr = adjust_addr(env, base + i * (nf << log2_esz));
 585        if (i == 0) {
 586            probe_pages(env, addr, nf << log2_esz, ra, MMU_DATA_LOAD);
 587        } else {
 588            /* if it triggers an exception, no need to check watchpoint */
 589            remain = nf << log2_esz;
 590            while (remain > 0) {
 591                offset = -(addr | TARGET_PAGE_MASK);
 592                host = tlb_vaddr_to_host(env, addr, MMU_DATA_LOAD,
 593                                         cpu_mmu_index(env, false));
 594                if (host) {
 595#ifdef CONFIG_USER_ONLY
 596                    if (page_check_range(addr, offset, PAGE_READ) < 0) {
 597                        vl = i;
 598                        goto ProbeSuccess;
 599                    }
 600#else
 601                    probe_pages(env, addr, offset, ra, MMU_DATA_LOAD);
 602#endif
 603                } else {
 604                    vl = i;
 605                    goto ProbeSuccess;
 606                }
  607                if (remain <= offset) {
 608                    break;
 609                }
 610                remain -= offset;
 611                addr = adjust_addr(env, addr + offset);
 612            }
 613        }
 614    }
 615ProbeSuccess:
 616    /* load bytes from guest memory */
 617    if (vl != 0) {
 618        env->vl = vl;
 619    }
 620    for (i = env->vstart; i < env->vl; i++) {
 621        k = 0;
 622        if (!vm && !vext_elem_mask(v0, i)) {
 623            continue;
 624        }
 625        while (k < nf) {
 626            target_ulong addr = base + ((i * nf + k) << log2_esz);
 627            ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
 628            k++;
 629        }
 630    }
 631    env->vstart = 0;
 632    /* set tail elements to 1s */
 633    for (k = 0; k < nf; ++k) {
 634        vext_set_elems_1s(vd, vta, (k * max_elems + env->vl) * esz,
 635                          (k * max_elems + max_elems) * esz);
 636    }
 637    if (nf * max_elems % total_elems != 0) {
 638        uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
 639        uint32_t registers_used =
 640            ((nf * max_elems) * esz + (vlenb - 1)) / vlenb;
 641        vext_set_elems_1s(vd, vta, (nf * max_elems) * esz,
 642                          registers_used * vlenb);
 643    }
 644}
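
/*
 * Fault-only-first sketch (illustrative): if element 0 is readable but the
 * page backing element 3 is unmapped, the probe loop above sets vl = 3 and
 * jumps to ProbeSuccess, so only elements 0..2 are loaded and no exception
 * is raised; a fault on element 0 itself still traps via probe_pages().
 */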
 645
 646#define GEN_VEXT_LDFF(NAME, ETYPE, LOAD_FN)               \
 647void HELPER(NAME)(void *vd, void *v0, target_ulong base,  \
 648                  CPURISCVState *env, uint32_t desc)      \
 649{                                                         \
 650    vext_ldff(vd, v0, base, env, desc, LOAD_FN,           \
 651              ctzl(sizeof(ETYPE)), GETPC());              \
 652}
 653
 654GEN_VEXT_LDFF(vle8ff_v,  int8_t,  lde_b)
 655GEN_VEXT_LDFF(vle16ff_v, int16_t, lde_h)
 656GEN_VEXT_LDFF(vle32ff_v, int32_t, lde_w)
 657GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d)
 658
 659#define DO_SWAP(N, M) (M)
 660#define DO_AND(N, M)  (N & M)
 661#define DO_XOR(N, M)  (N ^ M)
 662#define DO_OR(N, M)   (N | M)
 663#define DO_ADD(N, M)  (N + M)
 664
 665/* Signed min/max */
 666#define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
 667#define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
 668
 669/* Unsigned min/max */
 670#define DO_MAXU(N, M) DO_MAX((UMTYPE)N, (UMTYPE)M)
 671#define DO_MINU(N, M) DO_MIN((UMTYPE)N, (UMTYPE)M)
 672
 673/*
 674 *** load and store whole register instructions
 675 */
 676static void
 677vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
 678                vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uintptr_t ra)
 679{
 680    uint32_t i, k, off, pos;
 681    uint32_t nf = vext_nf(desc);
 682    uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
 683    uint32_t max_elems = vlenb >> log2_esz;
 684
 685    k = env->vstart / max_elems;
 686    off = env->vstart % max_elems;
 687
 688    if (off) {
  689        /* load/store rest of elements of current segment pointed to by vstart */
 690        for (pos = off; pos < max_elems; pos++, env->vstart++) {
 691            target_ulong addr = base + ((pos + k * max_elems) << log2_esz);
 692            ldst_elem(env, adjust_addr(env, addr), pos + k * max_elems, vd, ra);
 693        }
 694        k++;
 695    }
 696
 697    /* load/store elements for rest of segments */
 698    for (; k < nf; k++) {
 699        for (i = 0; i < max_elems; i++, env->vstart++) {
 700            target_ulong addr = base + ((i + k * max_elems) << log2_esz);
 701            ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
 702        }
 703    }
 704
 705    env->vstart = 0;
 706}
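
/*
 * Example (illustrative): vl2re32.v with vlenb = 16 has nf = 2 and
 * max_elems = 4, so it always transfers 2 * 16 = 32 bytes regardless of vl;
 * with env->vstart = 5 it resumes at k = 1, off = 1, i.e. element 1 of the
 * second register of the group.
 */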
 707
 708#define GEN_VEXT_LD_WHOLE(NAME, ETYPE, LOAD_FN)      \
 709void HELPER(NAME)(void *vd, target_ulong base,       \
 710                  CPURISCVState *env, uint32_t desc) \
 711{                                                    \
 712    vext_ldst_whole(vd, base, env, desc, LOAD_FN,    \
 713                    ctzl(sizeof(ETYPE)), GETPC());   \
 714}
 715
 716GEN_VEXT_LD_WHOLE(vl1re8_v,  int8_t,  lde_b)
 717GEN_VEXT_LD_WHOLE(vl1re16_v, int16_t, lde_h)
 718GEN_VEXT_LD_WHOLE(vl1re32_v, int32_t, lde_w)
 719GEN_VEXT_LD_WHOLE(vl1re64_v, int64_t, lde_d)
 720GEN_VEXT_LD_WHOLE(vl2re8_v,  int8_t,  lde_b)
 721GEN_VEXT_LD_WHOLE(vl2re16_v, int16_t, lde_h)
 722GEN_VEXT_LD_WHOLE(vl2re32_v, int32_t, lde_w)
 723GEN_VEXT_LD_WHOLE(vl2re64_v, int64_t, lde_d)
 724GEN_VEXT_LD_WHOLE(vl4re8_v,  int8_t,  lde_b)
 725GEN_VEXT_LD_WHOLE(vl4re16_v, int16_t, lde_h)
 726GEN_VEXT_LD_WHOLE(vl4re32_v, int32_t, lde_w)
 727GEN_VEXT_LD_WHOLE(vl4re64_v, int64_t, lde_d)
 728GEN_VEXT_LD_WHOLE(vl8re8_v,  int8_t,  lde_b)
 729GEN_VEXT_LD_WHOLE(vl8re16_v, int16_t, lde_h)
 730GEN_VEXT_LD_WHOLE(vl8re32_v, int32_t, lde_w)
 731GEN_VEXT_LD_WHOLE(vl8re64_v, int64_t, lde_d)
 732
 733#define GEN_VEXT_ST_WHOLE(NAME, ETYPE, STORE_FN)     \
 734void HELPER(NAME)(void *vd, target_ulong base,       \
 735                  CPURISCVState *env, uint32_t desc) \
 736{                                                    \
 737    vext_ldst_whole(vd, base, env, desc, STORE_FN,   \
 738                    ctzl(sizeof(ETYPE)), GETPC());   \
 739}
 740
 741GEN_VEXT_ST_WHOLE(vs1r_v, int8_t, ste_b)
 742GEN_VEXT_ST_WHOLE(vs2r_v, int8_t, ste_b)
 743GEN_VEXT_ST_WHOLE(vs4r_v, int8_t, ste_b)
 744GEN_VEXT_ST_WHOLE(vs8r_v, int8_t, ste_b)
 745
 746/*
 747 *** Vector Integer Arithmetic Instructions
 748 */
 749
 750/* expand macro args before macro */
 751#define RVVCALL(macro, ...)  macro(__VA_ARGS__)
 752
 753/* (TD, T1, T2, TX1, TX2) */
 754#define OP_SSS_B int8_t, int8_t, int8_t, int8_t, int8_t
 755#define OP_SSS_H int16_t, int16_t, int16_t, int16_t, int16_t
 756#define OP_SSS_W int32_t, int32_t, int32_t, int32_t, int32_t
 757#define OP_SSS_D int64_t, int64_t, int64_t, int64_t, int64_t
 758#define OP_UUU_B uint8_t, uint8_t, uint8_t, uint8_t, uint8_t
 759#define OP_UUU_H uint16_t, uint16_t, uint16_t, uint16_t, uint16_t
 760#define OP_UUU_W uint32_t, uint32_t, uint32_t, uint32_t, uint32_t
 761#define OP_UUU_D uint64_t, uint64_t, uint64_t, uint64_t, uint64_t
 762#define OP_SUS_B int8_t, uint8_t, int8_t, uint8_t, int8_t
 763#define OP_SUS_H int16_t, uint16_t, int16_t, uint16_t, int16_t
 764#define OP_SUS_W int32_t, uint32_t, int32_t, uint32_t, int32_t
 765#define OP_SUS_D int64_t, uint64_t, int64_t, uint64_t, int64_t
 766#define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
 767#define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
 768#define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
 769#define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
 770#define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
 771#define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
 772#define WOP_SUS_B int16_t, uint8_t, int8_t, uint16_t, int16_t
 773#define WOP_SUS_H int32_t, uint16_t, int16_t, uint32_t, int32_t
 774#define WOP_SUS_W int64_t, uint32_t, int32_t, uint64_t, int64_t
 775#define WOP_SSU_B int16_t, int8_t, uint8_t, int16_t, uint16_t
 776#define WOP_SSU_H int32_t, int16_t, uint16_t, int32_t, uint32_t
 777#define WOP_SSU_W int64_t, int32_t, uint32_t, int64_t, uint64_t
 778#define NOP_SSS_B int8_t, int8_t, int16_t, int8_t, int16_t
 779#define NOP_SSS_H int16_t, int16_t, int32_t, int16_t, int32_t
 780#define NOP_SSS_W int32_t, int32_t, int64_t, int32_t, int64_t
 781#define NOP_UUU_B uint8_t, uint8_t, uint16_t, uint8_t, uint16_t
 782#define NOP_UUU_H uint16_t, uint16_t, uint32_t, uint16_t, uint32_t
 783#define NOP_UUU_W uint32_t, uint32_t, uint64_t, uint32_t, uint64_t
 784
 785/* operation of two vector elements */
 786typedef void opivv2_fn(void *vd, void *vs1, void *vs2, int i);
 787
 788#define OPIVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)    \
 789static void do_##NAME(void *vd, void *vs1, void *vs2, int i)    \
 790{                                                               \
 791    TX1 s1 = *((T1 *)vs1 + HS1(i));                             \
 792    TX2 s2 = *((T2 *)vs2 + HS2(i));                             \
 793    *((TD *)vd + HD(i)) = OP(s2, s1);                           \
 794}
 795#define DO_SUB(N, M) (N - M)
 796#define DO_RSUB(N, M) (M - N)
 797
 798RVVCALL(OPIVV2, vadd_vv_b, OP_SSS_B, H1, H1, H1, DO_ADD)
 799RVVCALL(OPIVV2, vadd_vv_h, OP_SSS_H, H2, H2, H2, DO_ADD)
 800RVVCALL(OPIVV2, vadd_vv_w, OP_SSS_W, H4, H4, H4, DO_ADD)
 801RVVCALL(OPIVV2, vadd_vv_d, OP_SSS_D, H8, H8, H8, DO_ADD)
 802RVVCALL(OPIVV2, vsub_vv_b, OP_SSS_B, H1, H1, H1, DO_SUB)
 803RVVCALL(OPIVV2, vsub_vv_h, OP_SSS_H, H2, H2, H2, DO_SUB)
 804RVVCALL(OPIVV2, vsub_vv_w, OP_SSS_W, H4, H4, H4, DO_SUB)
 805RVVCALL(OPIVV2, vsub_vv_d, OP_SSS_D, H8, H8, H8, DO_SUB)
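
/*
 * For reference, an illustrative expansion (not extra code): the last
 * RVVCALL above produces roughly
 *
 *     static void do_vsub_vv_d(void *vd, void *vs1, void *vs2, int i)
 *     {
 *         int64_t s1 = *((int64_t *)vs1 + H8(i));
 *         int64_t s2 = *((int64_t *)vs2 + H8(i));
 *         *((int64_t *)vd + H8(i)) = s2 - s1;
 *     }
 */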
 806
 807static void do_vext_vv(void *vd, void *v0, void *vs1, void *vs2,
 808                       CPURISCVState *env, uint32_t desc,
 809                       opivv2_fn *fn, uint32_t esz)
 810{
 811    uint32_t vm = vext_vm(desc);
 812    uint32_t vl = env->vl;
 813    uint32_t total_elems = vext_get_total_elems(env, desc, esz);
 814    uint32_t vta = vext_vta(desc);
 815    uint32_t i;
 816
 817    for (i = env->vstart; i < vl; i++) {
 818        if (!vm && !vext_elem_mask(v0, i)) {
 819            continue;
 820        }
 821        fn(vd, vs1, vs2, i);
 822    }
 823    env->vstart = 0;
 824    /* set tail elements to 1s */
 825    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
 826}
 827
 828/* generate the helpers for OPIVV */
 829#define GEN_VEXT_VV(NAME, ESZ)                            \
 830void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
 831                  void *vs2, CPURISCVState *env,          \
 832                  uint32_t desc)                          \
 833{                                                         \
 834    do_vext_vv(vd, v0, vs1, vs2, env, desc,               \
 835               do_##NAME, ESZ);                           \
 836}
 837
 838GEN_VEXT_VV(vadd_vv_b, 1)
 839GEN_VEXT_VV(vadd_vv_h, 2)
 840GEN_VEXT_VV(vadd_vv_w, 4)
 841GEN_VEXT_VV(vadd_vv_d, 8)
 842GEN_VEXT_VV(vsub_vv_b, 1)
 843GEN_VEXT_VV(vsub_vv_h, 2)
 844GEN_VEXT_VV(vsub_vv_w, 4)
 845GEN_VEXT_VV(vsub_vv_d, 8)
 846
 847typedef void opivx2_fn(void *vd, target_long s1, void *vs2, int i);
 848
 849/*
  850 * (T1)s1 gives the real operand type.
  851 * (TX1)(T1)s1 expands the operand type for widening or narrowing operations.
 852 */
 853#define OPIVX2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
 854static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
 855{                                                                   \
 856    TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
 857    *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1);                      \
 858}
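
/*
 * Illustrative example of the casts above: for a widening op such as
 * vwadd_vx_b (T1 = int8_t, TX1 = int16_t), (TX1)(T1)s1 first truncates the
 * scalar to 8 bits and then sign-extends it to the 16-bit operand width,
 * so s1 = 0x1ff is used as (int16_t)-1.
 */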
 859
 860RVVCALL(OPIVX2, vadd_vx_b, OP_SSS_B, H1, H1, DO_ADD)
 861RVVCALL(OPIVX2, vadd_vx_h, OP_SSS_H, H2, H2, DO_ADD)
 862RVVCALL(OPIVX2, vadd_vx_w, OP_SSS_W, H4, H4, DO_ADD)
 863RVVCALL(OPIVX2, vadd_vx_d, OP_SSS_D, H8, H8, DO_ADD)
 864RVVCALL(OPIVX2, vsub_vx_b, OP_SSS_B, H1, H1, DO_SUB)
 865RVVCALL(OPIVX2, vsub_vx_h, OP_SSS_H, H2, H2, DO_SUB)
 866RVVCALL(OPIVX2, vsub_vx_w, OP_SSS_W, H4, H4, DO_SUB)
 867RVVCALL(OPIVX2, vsub_vx_d, OP_SSS_D, H8, H8, DO_SUB)
 868RVVCALL(OPIVX2, vrsub_vx_b, OP_SSS_B, H1, H1, DO_RSUB)
 869RVVCALL(OPIVX2, vrsub_vx_h, OP_SSS_H, H2, H2, DO_RSUB)
 870RVVCALL(OPIVX2, vrsub_vx_w, OP_SSS_W, H4, H4, DO_RSUB)
 871RVVCALL(OPIVX2, vrsub_vx_d, OP_SSS_D, H8, H8, DO_RSUB)
 872
 873static void do_vext_vx(void *vd, void *v0, target_long s1, void *vs2,
 874                       CPURISCVState *env, uint32_t desc,
 875                       opivx2_fn fn, uint32_t esz)
 876{
 877    uint32_t vm = vext_vm(desc);
 878    uint32_t vl = env->vl;
 879    uint32_t total_elems = vext_get_total_elems(env, desc, esz);
 880    uint32_t vta = vext_vta(desc);
 881    uint32_t i;
 882
 883    for (i = env->vstart; i < vl; i++) {
 884        if (!vm && !vext_elem_mask(v0, i)) {
 885            continue;
 886        }
 887        fn(vd, s1, vs2, i);
 888    }
 889    env->vstart = 0;
 890    /* set tail elements to 1s */
 891    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
 892}
 893
 894/* generate the helpers for OPIVX */
 895#define GEN_VEXT_VX(NAME, ESZ)                            \
 896void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
 897                  void *vs2, CPURISCVState *env,          \
 898                  uint32_t desc)                          \
 899{                                                         \
 900    do_vext_vx(vd, v0, s1, vs2, env, desc,                \
 901               do_##NAME, ESZ);                           \
 902}
 903
 904GEN_VEXT_VX(vadd_vx_b, 1)
 905GEN_VEXT_VX(vadd_vx_h, 2)
 906GEN_VEXT_VX(vadd_vx_w, 4)
 907GEN_VEXT_VX(vadd_vx_d, 8)
 908GEN_VEXT_VX(vsub_vx_b, 1)
 909GEN_VEXT_VX(vsub_vx_h, 2)
 910GEN_VEXT_VX(vsub_vx_w, 4)
 911GEN_VEXT_VX(vsub_vx_d, 8)
 912GEN_VEXT_VX(vrsub_vx_b, 1)
 913GEN_VEXT_VX(vrsub_vx_h, 2)
 914GEN_VEXT_VX(vrsub_vx_w, 4)
 915GEN_VEXT_VX(vrsub_vx_d, 8)
 916
 917void HELPER(vec_rsubs8)(void *d, void *a, uint64_t b, uint32_t desc)
 918{
 919    intptr_t oprsz = simd_oprsz(desc);
 920    intptr_t i;
 921
 922    for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
 923        *(uint8_t *)(d + i) = (uint8_t)b - *(uint8_t *)(a + i);
 924    }
 925}
 926
 927void HELPER(vec_rsubs16)(void *d, void *a, uint64_t b, uint32_t desc)
 928{
 929    intptr_t oprsz = simd_oprsz(desc);
 930    intptr_t i;
 931
 932    for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
 933        *(uint16_t *)(d + i) = (uint16_t)b - *(uint16_t *)(a + i);
 934    }
 935}
 936
 937void HELPER(vec_rsubs32)(void *d, void *a, uint64_t b, uint32_t desc)
 938{
 939    intptr_t oprsz = simd_oprsz(desc);
 940    intptr_t i;
 941
 942    for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
 943        *(uint32_t *)(d + i) = (uint32_t)b - *(uint32_t *)(a + i);
 944    }
 945}
 946
 947void HELPER(vec_rsubs64)(void *d, void *a, uint64_t b, uint32_t desc)
 948{
 949    intptr_t oprsz = simd_oprsz(desc);
 950    intptr_t i;
 951
 952    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
 953        *(uint64_t *)(d + i) = b - *(uint64_t *)(a + i);
 954    }
 955}
 956
 957/* Vector Widening Integer Add/Subtract */
 958#define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
 959#define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
 960#define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
 961#define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
 962#define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
 963#define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
 964#define WOP_WUUU_B  uint16_t, uint8_t, uint16_t, uint16_t, uint16_t
 965#define WOP_WUUU_H  uint32_t, uint16_t, uint32_t, uint32_t, uint32_t
 966#define WOP_WUUU_W  uint64_t, uint32_t, uint64_t, uint64_t, uint64_t
 967#define WOP_WSSS_B  int16_t, int8_t, int16_t, int16_t, int16_t
 968#define WOP_WSSS_H  int32_t, int16_t, int32_t, int32_t, int32_t
 969#define WOP_WSSS_W  int64_t, int32_t, int64_t, int64_t, int64_t
 970RVVCALL(OPIVV2, vwaddu_vv_b, WOP_UUU_B, H2, H1, H1, DO_ADD)
 971RVVCALL(OPIVV2, vwaddu_vv_h, WOP_UUU_H, H4, H2, H2, DO_ADD)
 972RVVCALL(OPIVV2, vwaddu_vv_w, WOP_UUU_W, H8, H4, H4, DO_ADD)
 973RVVCALL(OPIVV2, vwsubu_vv_b, WOP_UUU_B, H2, H1, H1, DO_SUB)
 974RVVCALL(OPIVV2, vwsubu_vv_h, WOP_UUU_H, H4, H2, H2, DO_SUB)
 975RVVCALL(OPIVV2, vwsubu_vv_w, WOP_UUU_W, H8, H4, H4, DO_SUB)
 976RVVCALL(OPIVV2, vwadd_vv_b, WOP_SSS_B, H2, H1, H1, DO_ADD)
 977RVVCALL(OPIVV2, vwadd_vv_h, WOP_SSS_H, H4, H2, H2, DO_ADD)
 978RVVCALL(OPIVV2, vwadd_vv_w, WOP_SSS_W, H8, H4, H4, DO_ADD)
 979RVVCALL(OPIVV2, vwsub_vv_b, WOP_SSS_B, H2, H1, H1, DO_SUB)
 980RVVCALL(OPIVV2, vwsub_vv_h, WOP_SSS_H, H4, H2, H2, DO_SUB)
 981RVVCALL(OPIVV2, vwsub_vv_w, WOP_SSS_W, H8, H4, H4, DO_SUB)
 982RVVCALL(OPIVV2, vwaddu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_ADD)
 983RVVCALL(OPIVV2, vwaddu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_ADD)
 984RVVCALL(OPIVV2, vwaddu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_ADD)
 985RVVCALL(OPIVV2, vwsubu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_SUB)
 986RVVCALL(OPIVV2, vwsubu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_SUB)
 987RVVCALL(OPIVV2, vwsubu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_SUB)
 988RVVCALL(OPIVV2, vwadd_wv_b, WOP_WSSS_B, H2, H1, H1, DO_ADD)
 989RVVCALL(OPIVV2, vwadd_wv_h, WOP_WSSS_H, H4, H2, H2, DO_ADD)
 990RVVCALL(OPIVV2, vwadd_wv_w, WOP_WSSS_W, H8, H4, H4, DO_ADD)
 991RVVCALL(OPIVV2, vwsub_wv_b, WOP_WSSS_B, H2, H1, H1, DO_SUB)
 992RVVCALL(OPIVV2, vwsub_wv_h, WOP_WSSS_H, H4, H2, H2, DO_SUB)
 993RVVCALL(OPIVV2, vwsub_wv_w, WOP_WSSS_W, H8, H4, H4, DO_SUB)
 994GEN_VEXT_VV(vwaddu_vv_b, 2)
 995GEN_VEXT_VV(vwaddu_vv_h, 4)
 996GEN_VEXT_VV(vwaddu_vv_w, 8)
 997GEN_VEXT_VV(vwsubu_vv_b, 2)
 998GEN_VEXT_VV(vwsubu_vv_h, 4)
 999GEN_VEXT_VV(vwsubu_vv_w, 8)
1000GEN_VEXT_VV(vwadd_vv_b, 2)
1001GEN_VEXT_VV(vwadd_vv_h, 4)
1002GEN_VEXT_VV(vwadd_vv_w, 8)
1003GEN_VEXT_VV(vwsub_vv_b, 2)
1004GEN_VEXT_VV(vwsub_vv_h, 4)
1005GEN_VEXT_VV(vwsub_vv_w, 8)
1006GEN_VEXT_VV(vwaddu_wv_b, 2)
1007GEN_VEXT_VV(vwaddu_wv_h, 4)
1008GEN_VEXT_VV(vwaddu_wv_w, 8)
1009GEN_VEXT_VV(vwsubu_wv_b, 2)
1010GEN_VEXT_VV(vwsubu_wv_h, 4)
1011GEN_VEXT_VV(vwsubu_wv_w, 8)
1012GEN_VEXT_VV(vwadd_wv_b, 2)
1013GEN_VEXT_VV(vwadd_wv_h, 4)
1014GEN_VEXT_VV(vwadd_wv_w, 8)
1015GEN_VEXT_VV(vwsub_wv_b, 2)
1016GEN_VEXT_VV(vwsub_wv_h, 4)
1017GEN_VEXT_VV(vwsub_wv_w, 8)
1018
1019RVVCALL(OPIVX2, vwaddu_vx_b, WOP_UUU_B, H2, H1, DO_ADD)
1020RVVCALL(OPIVX2, vwaddu_vx_h, WOP_UUU_H, H4, H2, DO_ADD)
1021RVVCALL(OPIVX2, vwaddu_vx_w, WOP_UUU_W, H8, H4, DO_ADD)
1022RVVCALL(OPIVX2, vwsubu_vx_b, WOP_UUU_B, H2, H1, DO_SUB)
1023RVVCALL(OPIVX2, vwsubu_vx_h, WOP_UUU_H, H4, H2, DO_SUB)
1024RVVCALL(OPIVX2, vwsubu_vx_w, WOP_UUU_W, H8, H4, DO_SUB)
1025RVVCALL(OPIVX2, vwadd_vx_b, WOP_SSS_B, H2, H1, DO_ADD)
1026RVVCALL(OPIVX2, vwadd_vx_h, WOP_SSS_H, H4, H2, DO_ADD)
1027RVVCALL(OPIVX2, vwadd_vx_w, WOP_SSS_W, H8, H4, DO_ADD)
1028RVVCALL(OPIVX2, vwsub_vx_b, WOP_SSS_B, H2, H1, DO_SUB)
1029RVVCALL(OPIVX2, vwsub_vx_h, WOP_SSS_H, H4, H2, DO_SUB)
1030RVVCALL(OPIVX2, vwsub_vx_w, WOP_SSS_W, H8, H4, DO_SUB)
1031RVVCALL(OPIVX2, vwaddu_wx_b, WOP_WUUU_B, H2, H1, DO_ADD)
1032RVVCALL(OPIVX2, vwaddu_wx_h, WOP_WUUU_H, H4, H2, DO_ADD)
1033RVVCALL(OPIVX2, vwaddu_wx_w, WOP_WUUU_W, H8, H4, DO_ADD)
1034RVVCALL(OPIVX2, vwsubu_wx_b, WOP_WUUU_B, H2, H1, DO_SUB)
1035RVVCALL(OPIVX2, vwsubu_wx_h, WOP_WUUU_H, H4, H2, DO_SUB)
1036RVVCALL(OPIVX2, vwsubu_wx_w, WOP_WUUU_W, H8, H4, DO_SUB)
1037RVVCALL(OPIVX2, vwadd_wx_b, WOP_WSSS_B, H2, H1, DO_ADD)
1038RVVCALL(OPIVX2, vwadd_wx_h, WOP_WSSS_H, H4, H2, DO_ADD)
1039RVVCALL(OPIVX2, vwadd_wx_w, WOP_WSSS_W, H8, H4, DO_ADD)
1040RVVCALL(OPIVX2, vwsub_wx_b, WOP_WSSS_B, H2, H1, DO_SUB)
1041RVVCALL(OPIVX2, vwsub_wx_h, WOP_WSSS_H, H4, H2, DO_SUB)
1042RVVCALL(OPIVX2, vwsub_wx_w, WOP_WSSS_W, H8, H4, DO_SUB)
1043GEN_VEXT_VX(vwaddu_vx_b, 2)
1044GEN_VEXT_VX(vwaddu_vx_h, 4)
1045GEN_VEXT_VX(vwaddu_vx_w, 8)
1046GEN_VEXT_VX(vwsubu_vx_b, 2)
1047GEN_VEXT_VX(vwsubu_vx_h, 4)
1048GEN_VEXT_VX(vwsubu_vx_w, 8)
1049GEN_VEXT_VX(vwadd_vx_b, 2)
1050GEN_VEXT_VX(vwadd_vx_h, 4)
1051GEN_VEXT_VX(vwadd_vx_w, 8)
1052GEN_VEXT_VX(vwsub_vx_b, 2)
1053GEN_VEXT_VX(vwsub_vx_h, 4)
1054GEN_VEXT_VX(vwsub_vx_w, 8)
1055GEN_VEXT_VX(vwaddu_wx_b, 2)
1056GEN_VEXT_VX(vwaddu_wx_h, 4)
1057GEN_VEXT_VX(vwaddu_wx_w, 8)
1058GEN_VEXT_VX(vwsubu_wx_b, 2)
1059GEN_VEXT_VX(vwsubu_wx_h, 4)
1060GEN_VEXT_VX(vwsubu_wx_w, 8)
1061GEN_VEXT_VX(vwadd_wx_b, 2)
1062GEN_VEXT_VX(vwadd_wx_h, 4)
1063GEN_VEXT_VX(vwadd_wx_w, 8)
1064GEN_VEXT_VX(vwsub_wx_b, 2)
1065GEN_VEXT_VX(vwsub_wx_h, 4)
1066GEN_VEXT_VX(vwsub_wx_w, 8)
1067
1068/* Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions */
1069#define DO_VADC(N, M, C) (N + M + C)
1070#define DO_VSBC(N, M, C) (N - M - C)
1071
1072#define GEN_VEXT_VADC_VVM(NAME, ETYPE, H, DO_OP)              \
1073void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1074                  CPURISCVState *env, uint32_t desc)          \
1075{                                                             \
1076    uint32_t vl = env->vl;                                    \
1077    uint32_t esz = sizeof(ETYPE);                             \
1078    uint32_t total_elems =                                    \
1079        vext_get_total_elems(env, desc, esz);                 \
1080    uint32_t vta = vext_vta(desc);                            \
1081    uint32_t i;                                               \
1082                                                              \
1083    for (i = env->vstart; i < vl; i++) {                      \
1084        ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1085        ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1086        ETYPE carry = vext_elem_mask(v0, i);                  \
1087                                                              \
1088        *((ETYPE *)vd + H(i)) = DO_OP(s2, s1, carry);         \
1089    }                                                         \
1090    env->vstart = 0;                                          \
1091    /* set tail elements to 1s */                             \
1092    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
1093}
1094
1095GEN_VEXT_VADC_VVM(vadc_vvm_b, uint8_t,  H1, DO_VADC)
1096GEN_VEXT_VADC_VVM(vadc_vvm_h, uint16_t, H2, DO_VADC)
1097GEN_VEXT_VADC_VVM(vadc_vvm_w, uint32_t, H4, DO_VADC)
1098GEN_VEXT_VADC_VVM(vadc_vvm_d, uint64_t, H8, DO_VADC)
1099
1100GEN_VEXT_VADC_VVM(vsbc_vvm_b, uint8_t,  H1, DO_VSBC)
1101GEN_VEXT_VADC_VVM(vsbc_vvm_h, uint16_t, H2, DO_VSBC)
1102GEN_VEXT_VADC_VVM(vsbc_vvm_w, uint32_t, H4, DO_VSBC)
1103GEN_VEXT_VADC_VVM(vsbc_vvm_d, uint64_t, H8, DO_VSBC)
1104
1105#define GEN_VEXT_VADC_VXM(NAME, ETYPE, H, DO_OP)                         \
1106void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,        \
1107                  CPURISCVState *env, uint32_t desc)                     \
1108{                                                                        \
1109    uint32_t vl = env->vl;                                               \
1110    uint32_t esz = sizeof(ETYPE);                                        \
1111    uint32_t total_elems = vext_get_total_elems(env, desc, esz);         \
1112    uint32_t vta = vext_vta(desc);                                       \
1113    uint32_t i;                                                          \
1114                                                                         \
1115    for (i = env->vstart; i < vl; i++) {                                 \
1116        ETYPE s2 = *((ETYPE *)vs2 + H(i));                               \
1117        ETYPE carry = vext_elem_mask(v0, i);                             \
1118                                                                         \
1119        *((ETYPE *)vd + H(i)) = DO_OP(s2, (ETYPE)(target_long)s1, carry);\
1120    }                                                                    \
1121    env->vstart = 0;                                          \
1122    /* set tail elements to 1s */                                        \
1123    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);             \
1124}
1125
1126GEN_VEXT_VADC_VXM(vadc_vxm_b, uint8_t,  H1, DO_VADC)
1127GEN_VEXT_VADC_VXM(vadc_vxm_h, uint16_t, H2, DO_VADC)
1128GEN_VEXT_VADC_VXM(vadc_vxm_w, uint32_t, H4, DO_VADC)
1129GEN_VEXT_VADC_VXM(vadc_vxm_d, uint64_t, H8, DO_VADC)
1130
1131GEN_VEXT_VADC_VXM(vsbc_vxm_b, uint8_t,  H1, DO_VSBC)
1132GEN_VEXT_VADC_VXM(vsbc_vxm_h, uint16_t, H2, DO_VSBC)
1133GEN_VEXT_VADC_VXM(vsbc_vxm_w, uint32_t, H4, DO_VSBC)
1134GEN_VEXT_VADC_VXM(vsbc_vxm_d, uint64_t, H8, DO_VSBC)
1135
1136#define DO_MADC(N, M, C) (C ? (__typeof(N))(N + M + 1) <= N :           \
1137                          (__typeof(N))(N + M) < N)
1138#define DO_MSBC(N, M, C) (C ? N <= M : N < M)
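
/*
 * Carry-out example (illustrative, uint8_t): N = 0xff, M = 0x01, C = 0
 * gives (uint8_t)(N + M) == 0x00 < N, so DO_MADC reports carry 1; with
 * C = 1, N = 0xfe, M = 0x01, (uint8_t)(N + M + 1) == 0x00 <= N likewise
 * reports carry 1, i.e. the unsigned overflow of N + M + C.
 */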
1139
1140#define GEN_VEXT_VMADC_VVM(NAME, ETYPE, H, DO_OP)             \
1141void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1142                  CPURISCVState *env, uint32_t desc)          \
1143{                                                             \
1144    uint32_t vl = env->vl;                                    \
1145    uint32_t vm = vext_vm(desc);                              \
1146    uint32_t total_elems = env_archcpu(env)->cfg.vlen;        \
1147    uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1148    uint32_t i;                                               \
1149                                                              \
1150    for (i = env->vstart; i < vl; i++) {                      \
1151        ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1152        ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1153        ETYPE carry = !vm && vext_elem_mask(v0, i);           \
1154        vext_set_elem_mask(vd, i, DO_OP(s2, s1, carry));      \
1155    }                                                         \
1156    env->vstart = 0;                                          \
 1157    /* mask destination register is always tail-agnostic */   \
1158    /* set tail elements to 1s */                             \
1159    if (vta_all_1s) {                                         \
1160        for (; i < total_elems; i++) {                        \
1161            vext_set_elem_mask(vd, i, 1);                     \
1162        }                                                     \
1163    }                                                         \
1164}
1165
1166GEN_VEXT_VMADC_VVM(vmadc_vvm_b, uint8_t,  H1, DO_MADC)
1167GEN_VEXT_VMADC_VVM(vmadc_vvm_h, uint16_t, H2, DO_MADC)
1168GEN_VEXT_VMADC_VVM(vmadc_vvm_w, uint32_t, H4, DO_MADC)
1169GEN_VEXT_VMADC_VVM(vmadc_vvm_d, uint64_t, H8, DO_MADC)
1170
1171GEN_VEXT_VMADC_VVM(vmsbc_vvm_b, uint8_t,  H1, DO_MSBC)
1172GEN_VEXT_VMADC_VVM(vmsbc_vvm_h, uint16_t, H2, DO_MSBC)
1173GEN_VEXT_VMADC_VVM(vmsbc_vvm_w, uint32_t, H4, DO_MSBC)
1174GEN_VEXT_VMADC_VVM(vmsbc_vvm_d, uint64_t, H8, DO_MSBC)
1175
1176#define GEN_VEXT_VMADC_VXM(NAME, ETYPE, H, DO_OP)               \
1177void HELPER(NAME)(void *vd, void *v0, target_ulong s1,          \
1178                  void *vs2, CPURISCVState *env, uint32_t desc) \
1179{                                                               \
1180    uint32_t vl = env->vl;                                      \
1181    uint32_t vm = vext_vm(desc);                                \
1182    uint32_t total_elems = env_archcpu(env)->cfg.vlen;          \
1183    uint32_t vta_all_1s = vext_vta_all_1s(desc);                \
1184    uint32_t i;                                                 \
1185                                                                \
1186    for (i = env->vstart; i < vl; i++) {                        \
1187        ETYPE s2 = *((ETYPE *)vs2 + H(i));                      \
1188        ETYPE carry = !vm && vext_elem_mask(v0, i);             \
1189        vext_set_elem_mask(vd, i,                               \
1190                DO_OP(s2, (ETYPE)(target_long)s1, carry));      \
1191    }                                                           \
1192    env->vstart = 0;                                            \
 1193    /* mask destination register is always tail-agnostic */     \
1194    /* set tail elements to 1s */                               \
1195    if (vta_all_1s) {                                           \
1196        for (; i < total_elems; i++) {                          \
1197            vext_set_elem_mask(vd, i, 1);                       \
1198        }                                                       \
1199    }                                                           \
1200}
1201
1202GEN_VEXT_VMADC_VXM(vmadc_vxm_b, uint8_t,  H1, DO_MADC)
1203GEN_VEXT_VMADC_VXM(vmadc_vxm_h, uint16_t, H2, DO_MADC)
1204GEN_VEXT_VMADC_VXM(vmadc_vxm_w, uint32_t, H4, DO_MADC)
1205GEN_VEXT_VMADC_VXM(vmadc_vxm_d, uint64_t, H8, DO_MADC)
1206
1207GEN_VEXT_VMADC_VXM(vmsbc_vxm_b, uint8_t,  H1, DO_MSBC)
1208GEN_VEXT_VMADC_VXM(vmsbc_vxm_h, uint16_t, H2, DO_MSBC)
1209GEN_VEXT_VMADC_VXM(vmsbc_vxm_w, uint32_t, H4, DO_MSBC)
1210GEN_VEXT_VMADC_VXM(vmsbc_vxm_d, uint64_t, H8, DO_MSBC)
1211
1212/* Vector Bitwise Logical Instructions */
1213RVVCALL(OPIVV2, vand_vv_b, OP_SSS_B, H1, H1, H1, DO_AND)
1214RVVCALL(OPIVV2, vand_vv_h, OP_SSS_H, H2, H2, H2, DO_AND)
1215RVVCALL(OPIVV2, vand_vv_w, OP_SSS_W, H4, H4, H4, DO_AND)
1216RVVCALL(OPIVV2, vand_vv_d, OP_SSS_D, H8, H8, H8, DO_AND)
1217RVVCALL(OPIVV2, vor_vv_b, OP_SSS_B, H1, H1, H1, DO_OR)
1218RVVCALL(OPIVV2, vor_vv_h, OP_SSS_H, H2, H2, H2, DO_OR)
1219RVVCALL(OPIVV2, vor_vv_w, OP_SSS_W, H4, H4, H4, DO_OR)
1220RVVCALL(OPIVV2, vor_vv_d, OP_SSS_D, H8, H8, H8, DO_OR)
1221RVVCALL(OPIVV2, vxor_vv_b, OP_SSS_B, H1, H1, H1, DO_XOR)
1222RVVCALL(OPIVV2, vxor_vv_h, OP_SSS_H, H2, H2, H2, DO_XOR)
1223RVVCALL(OPIVV2, vxor_vv_w, OP_SSS_W, H4, H4, H4, DO_XOR)
1224RVVCALL(OPIVV2, vxor_vv_d, OP_SSS_D, H8, H8, H8, DO_XOR)
1225GEN_VEXT_VV(vand_vv_b, 1)
1226GEN_VEXT_VV(vand_vv_h, 2)
1227GEN_VEXT_VV(vand_vv_w, 4)
1228GEN_VEXT_VV(vand_vv_d, 8)
1229GEN_VEXT_VV(vor_vv_b, 1)
1230GEN_VEXT_VV(vor_vv_h, 2)
1231GEN_VEXT_VV(vor_vv_w, 4)
1232GEN_VEXT_VV(vor_vv_d, 8)
1233GEN_VEXT_VV(vxor_vv_b, 1)
1234GEN_VEXT_VV(vxor_vv_h, 2)
1235GEN_VEXT_VV(vxor_vv_w, 4)
1236GEN_VEXT_VV(vxor_vv_d, 8)
1237
1238RVVCALL(OPIVX2, vand_vx_b, OP_SSS_B, H1, H1, DO_AND)
1239RVVCALL(OPIVX2, vand_vx_h, OP_SSS_H, H2, H2, DO_AND)
1240RVVCALL(OPIVX2, vand_vx_w, OP_SSS_W, H4, H4, DO_AND)
1241RVVCALL(OPIVX2, vand_vx_d, OP_SSS_D, H8, H8, DO_AND)
1242RVVCALL(OPIVX2, vor_vx_b, OP_SSS_B, H1, H1, DO_OR)
1243RVVCALL(OPIVX2, vor_vx_h, OP_SSS_H, H2, H2, DO_OR)
1244RVVCALL(OPIVX2, vor_vx_w, OP_SSS_W, H4, H4, DO_OR)
1245RVVCALL(OPIVX2, vor_vx_d, OP_SSS_D, H8, H8, DO_OR)
1246RVVCALL(OPIVX2, vxor_vx_b, OP_SSS_B, H1, H1, DO_XOR)
1247RVVCALL(OPIVX2, vxor_vx_h, OP_SSS_H, H2, H2, DO_XOR)
1248RVVCALL(OPIVX2, vxor_vx_w, OP_SSS_W, H4, H4, DO_XOR)
1249RVVCALL(OPIVX2, vxor_vx_d, OP_SSS_D, H8, H8, DO_XOR)
1250GEN_VEXT_VX(vand_vx_b, 1)
1251GEN_VEXT_VX(vand_vx_h, 2)
1252GEN_VEXT_VX(vand_vx_w, 4)
1253GEN_VEXT_VX(vand_vx_d, 8)
1254GEN_VEXT_VX(vor_vx_b, 1)
1255GEN_VEXT_VX(vor_vx_h, 2)
1256GEN_VEXT_VX(vor_vx_w, 4)
1257GEN_VEXT_VX(vor_vx_d, 8)
1258GEN_VEXT_VX(vxor_vx_b, 1)
1259GEN_VEXT_VX(vxor_vx_h, 2)
1260GEN_VEXT_VX(vxor_vx_w, 4)
1261GEN_VEXT_VX(vxor_vx_d, 8)
1262
1263/* Vector Single-Width Bit Shift Instructions */
1264#define DO_SLL(N, M)  (N << (M))
1265#define DO_SRL(N, M)  (N >> (M))
1266
 1267/* generate the helpers for shift instructions with two vector operands */
1268#define GEN_VEXT_SHIFT_VV(NAME, TS1, TS2, HS1, HS2, OP, MASK)             \
1269void HELPER(NAME)(void *vd, void *v0, void *vs1,                          \
1270                  void *vs2, CPURISCVState *env, uint32_t desc)           \
1271{                                                                         \
1272    uint32_t vm = vext_vm(desc);                                          \
1273    uint32_t vl = env->vl;                                                \
1274    uint32_t esz = sizeof(TS1);                                           \
1275    uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
1276    uint32_t vta = vext_vta(desc);                                        \
1277    uint32_t i;                                                           \
1278                                                                          \
1279    for (i = env->vstart; i < vl; i++) {                                  \
1280        if (!vm && !vext_elem_mask(v0, i)) {                              \
1281            continue;                                                     \
1282        }                                                                 \
1283        TS1 s1 = *((TS1 *)vs1 + HS1(i));                                  \
1284        TS2 s2 = *((TS2 *)vs2 + HS2(i));                                  \
1285        *((TS1 *)vd + HS1(i)) = OP(s2, s1 & MASK);                        \
1286    }                                                                     \
1287    env->vstart = 0;                                                      \
1288    /* set tail elements to 1s */                                         \
1289    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
1290}
1291
1292GEN_VEXT_SHIFT_VV(vsll_vv_b, uint8_t,  uint8_t, H1, H1, DO_SLL, 0x7)
1293GEN_VEXT_SHIFT_VV(vsll_vv_h, uint16_t, uint16_t, H2, H2, DO_SLL, 0xf)
1294GEN_VEXT_SHIFT_VV(vsll_vv_w, uint32_t, uint32_t, H4, H4, DO_SLL, 0x1f)
1295GEN_VEXT_SHIFT_VV(vsll_vv_d, uint64_t, uint64_t, H8, H8, DO_SLL, 0x3f)
1296
1297GEN_VEXT_SHIFT_VV(vsrl_vv_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1298GEN_VEXT_SHIFT_VV(vsrl_vv_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1299GEN_VEXT_SHIFT_VV(vsrl_vv_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1300GEN_VEXT_SHIFT_VV(vsrl_vv_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1301
1302GEN_VEXT_SHIFT_VV(vsra_vv_b, uint8_t,  int8_t, H1, H1, DO_SRL, 0x7)
1303GEN_VEXT_SHIFT_VV(vsra_vv_h, uint16_t, int16_t, H2, H2, DO_SRL, 0xf)
1304GEN_VEXT_SHIFT_VV(vsra_vv_w, uint32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1305GEN_VEXT_SHIFT_VV(vsra_vv_d, uint64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1306
1307/* generate the helpers for shift instructions with one vector and one scalar */
1308#define GEN_VEXT_SHIFT_VX(NAME, TD, TS2, HD, HS2, OP, MASK) \
1309void HELPER(NAME)(void *vd, void *v0, target_ulong s1,      \
1310        void *vs2, CPURISCVState *env, uint32_t desc)       \
1311{                                                           \
1312    uint32_t vm = vext_vm(desc);                            \
1313    uint32_t vl = env->vl;                                  \
1314    uint32_t esz = sizeof(TD);                              \
1315    uint32_t total_elems =                                  \
1316        vext_get_total_elems(env, desc, esz);               \
1317    uint32_t vta = vext_vta(desc);                          \
1318    uint32_t i;                                             \
1319                                                            \
1320    for (i = env->vstart; i < vl; i++) {                    \
1321        if (!vm && !vext_elem_mask(v0, i)) {                \
1322            continue;                                       \
1323        }                                                   \
1324        TS2 s2 = *((TS2 *)vs2 + HS2(i));                    \
1325        *((TD *)vd + HD(i)) = OP(s2, s1 & MASK);            \
1326    }                                                       \
1327    env->vstart = 0;                                        \
1328    /* set tail elements to 1s */                           \
1329    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);\
1330}
1331
1332GEN_VEXT_SHIFT_VX(vsll_vx_b, uint8_t, int8_t, H1, H1, DO_SLL, 0x7)
1333GEN_VEXT_SHIFT_VX(vsll_vx_h, uint16_t, int16_t, H2, H2, DO_SLL, 0xf)
1334GEN_VEXT_SHIFT_VX(vsll_vx_w, uint32_t, int32_t, H4, H4, DO_SLL, 0x1f)
1335GEN_VEXT_SHIFT_VX(vsll_vx_d, uint64_t, int64_t, H8, H8, DO_SLL, 0x3f)
1336
1337GEN_VEXT_SHIFT_VX(vsrl_vx_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1338GEN_VEXT_SHIFT_VX(vsrl_vx_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1339GEN_VEXT_SHIFT_VX(vsrl_vx_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1340GEN_VEXT_SHIFT_VX(vsrl_vx_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1341
1342GEN_VEXT_SHIFT_VX(vsra_vx_b, int8_t, int8_t, H1, H1, DO_SRL, 0x7)
1343GEN_VEXT_SHIFT_VX(vsra_vx_h, int16_t, int16_t, H2, H2, DO_SRL, 0xf)
1344GEN_VEXT_SHIFT_VX(vsra_vx_w, int32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1345GEN_VEXT_SHIFT_VX(vsra_vx_d, int64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1346
1347/* Vector Narrowing Integer Right Shift Instructions */
1348GEN_VEXT_SHIFT_VV(vnsrl_wv_b, uint8_t,  uint16_t, H1, H2, DO_SRL, 0xf)
1349GEN_VEXT_SHIFT_VV(vnsrl_wv_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1350GEN_VEXT_SHIFT_VV(vnsrl_wv_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1351GEN_VEXT_SHIFT_VV(vnsra_wv_b, uint8_t,  int16_t, H1, H2, DO_SRL, 0xf)
1352GEN_VEXT_SHIFT_VV(vnsra_wv_h, uint16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1353GEN_VEXT_SHIFT_VV(vnsra_wv_w, uint32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1354GEN_VEXT_SHIFT_VX(vnsrl_wx_b, uint8_t, uint16_t, H1, H2, DO_SRL, 0xf)
1355GEN_VEXT_SHIFT_VX(vnsrl_wx_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1356GEN_VEXT_SHIFT_VX(vnsrl_wx_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1357GEN_VEXT_SHIFT_VX(vnsra_wx_b, int8_t, int16_t, H1, H2, DO_SRL, 0xf)
1358GEN_VEXT_SHIFT_VX(vnsra_wx_h, int16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1359GEN_VEXT_SHIFT_VX(vnsra_wx_w, int32_t, int64_t, H4, H8, DO_SRL, 0x3f)
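
/*
 * Sketch only (hypothetical name): a narrowing right shift reads a
 * 2*SEW-wide vs2 element, shifts it by the low log2(2*SEW) bits of the
 * shift operand, and stores the truncated SEW-wide result, which is why the
 * byte variants above mask with 0xf rather than 0x7.
 */
static inline uint8_t example_nsrl_element_b(uint16_t s2, uint8_t s1)
{
    /* 16-bit source, shift masked to 0xf, truncated to an 8-bit result. */
    return (uint8_t)(s2 >> (s1 & 0xf));
}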
1360
1361/* Vector Integer Comparison Instructions */
1362#define DO_MSEQ(N, M) (N == M)
1363#define DO_MSNE(N, M) (N != M)
1364#define DO_MSLT(N, M) (N < M)
1365#define DO_MSLE(N, M) (N <= M)
1366#define DO_MSGT(N, M) (N > M)
1367
1368#define GEN_VEXT_CMP_VV(NAME, ETYPE, H, DO_OP)                \
1369void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1370                  CPURISCVState *env, uint32_t desc)          \
1371{                                                             \
1372    uint32_t vm = vext_vm(desc);                              \
1373    uint32_t vl = env->vl;                                    \
1374    uint32_t total_elems = env_archcpu(env)->cfg.vlen;        \
1375    uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1376    uint32_t i;                                               \
1377                                                              \
1378    for (i = env->vstart; i < vl; i++) {                      \
1379        ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1380        ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1381        if (!vm && !vext_elem_mask(v0, i)) {                  \
1382            continue;                                         \
1383        }                                                     \
1384        vext_set_elem_mask(vd, i, DO_OP(s2, s1));             \
1385    }                                                         \
1386    env->vstart = 0;                                          \
1387    /* mask destination registers are always tail-agnostic */ \
1388    /* set tail elements to 1s */                             \
1389    if (vta_all_1s) {                                         \
1390        for (; i < total_elems; i++) {                        \
1391            vext_set_elem_mask(vd, i, 1);                     \
1392        }                                                     \
1393    }                                                         \
1394}
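
/*
 * Reference sketch (hypothetical, not a generated helper): the comparison
 * helpers produce one result *bit* per element.  This models vmseq.vv for
 * up to 64 byte elements with an all-active mask, packing result bits
 * LSB-first by element index, the same layout vext_set_elem_mask() uses.
 */
static inline uint64_t example_vmseq_ref(const uint8_t *a, const uint8_t *b,
                                         unsigned vl)
{
    uint64_t mask = 0;

    for (unsigned i = 0; i < vl && i < 64; i++) {
        if (a[i] == b[i]) {
            mask |= UINT64_C(1) << i;
        }
    }
    return mask;
}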
1395
1396GEN_VEXT_CMP_VV(vmseq_vv_b, uint8_t,  H1, DO_MSEQ)
1397GEN_VEXT_CMP_VV(vmseq_vv_h, uint16_t, H2, DO_MSEQ)
1398GEN_VEXT_CMP_VV(vmseq_vv_w, uint32_t, H4, DO_MSEQ)
1399GEN_VEXT_CMP_VV(vmseq_vv_d, uint64_t, H8, DO_MSEQ)
1400
1401GEN_VEXT_CMP_VV(vmsne_vv_b, uint8_t,  H1, DO_MSNE)
1402GEN_VEXT_CMP_VV(vmsne_vv_h, uint16_t, H2, DO_MSNE)
1403GEN_VEXT_CMP_VV(vmsne_vv_w, uint32_t, H4, DO_MSNE)
1404GEN_VEXT_CMP_VV(vmsne_vv_d, uint64_t, H8, DO_MSNE)
1405
1406GEN_VEXT_CMP_VV(vmsltu_vv_b, uint8_t,  H1, DO_MSLT)
1407GEN_VEXT_CMP_VV(vmsltu_vv_h, uint16_t, H2, DO_MSLT)
1408GEN_VEXT_CMP_VV(vmsltu_vv_w, uint32_t, H4, DO_MSLT)
1409GEN_VEXT_CMP_VV(vmsltu_vv_d, uint64_t, H8, DO_MSLT)
1410
1411GEN_VEXT_CMP_VV(vmslt_vv_b, int8_t,  H1, DO_MSLT)
1412GEN_VEXT_CMP_VV(vmslt_vv_h, int16_t, H2, DO_MSLT)
1413GEN_VEXT_CMP_VV(vmslt_vv_w, int32_t, H4, DO_MSLT)
1414GEN_VEXT_CMP_VV(vmslt_vv_d, int64_t, H8, DO_MSLT)
1415
1416GEN_VEXT_CMP_VV(vmsleu_vv_b, uint8_t,  H1, DO_MSLE)
1417GEN_VEXT_CMP_VV(vmsleu_vv_h, uint16_t, H2, DO_MSLE)
1418GEN_VEXT_CMP_VV(vmsleu_vv_w, uint32_t, H4, DO_MSLE)
1419GEN_VEXT_CMP_VV(vmsleu_vv_d, uint64_t, H8, DO_MSLE)
1420
1421GEN_VEXT_CMP_VV(vmsle_vv_b, int8_t,  H1, DO_MSLE)
1422GEN_VEXT_CMP_VV(vmsle_vv_h, int16_t, H2, DO_MSLE)
1423GEN_VEXT_CMP_VV(vmsle_vv_w, int32_t, H4, DO_MSLE)
1424GEN_VEXT_CMP_VV(vmsle_vv_d, int64_t, H8, DO_MSLE)
1425
1426#define GEN_VEXT_CMP_VX(NAME, ETYPE, H, DO_OP)                      \
1427void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,   \
1428                  CPURISCVState *env, uint32_t desc)                \
1429{                                                                   \
1430    uint32_t vm = vext_vm(desc);                                    \
1431    uint32_t vl = env->vl;                                          \
1432    uint32_t total_elems = env_archcpu(env)->cfg.vlen;              \
1433    uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
1434    uint32_t i;                                                     \
1435                                                                    \
1436    for (i = env->vstart; i < vl; i++) {                            \
1437        ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
1438        if (!vm && !vext_elem_mask(v0, i)) {                        \
1439            continue;                                               \
1440        }                                                           \
1441        vext_set_elem_mask(vd, i,                                   \
1442                DO_OP(s2, (ETYPE)(target_long)s1));                 \
1443    }                                                               \
1444    env->vstart = 0;                                                \
1445    /* mask destination registers are always tail-agnostic */       \
1446    /* set tail elements to 1s */                                   \
1447    if (vta_all_1s) {                                               \
1448        for (; i < total_elems; i++) {                              \
1449            vext_set_elem_mask(vd, i, 1);                           \
1450        }                                                           \
1451    }                                                               \
1452}
1453
1454GEN_VEXT_CMP_VX(vmseq_vx_b, uint8_t,  H1, DO_MSEQ)
1455GEN_VEXT_CMP_VX(vmseq_vx_h, uint16_t, H2, DO_MSEQ)
1456GEN_VEXT_CMP_VX(vmseq_vx_w, uint32_t, H4, DO_MSEQ)
1457GEN_VEXT_CMP_VX(vmseq_vx_d, uint64_t, H8, DO_MSEQ)
1458
1459GEN_VEXT_CMP_VX(vmsne_vx_b, uint8_t,  H1, DO_MSNE)
1460GEN_VEXT_CMP_VX(vmsne_vx_h, uint16_t, H2, DO_MSNE)
1461GEN_VEXT_CMP_VX(vmsne_vx_w, uint32_t, H4, DO_MSNE)
1462GEN_VEXT_CMP_VX(vmsne_vx_d, uint64_t, H8, DO_MSNE)
1463
1464GEN_VEXT_CMP_VX(vmsltu_vx_b, uint8_t,  H1, DO_MSLT)
1465GEN_VEXT_CMP_VX(vmsltu_vx_h, uint16_t, H2, DO_MSLT)
1466GEN_VEXT_CMP_VX(vmsltu_vx_w, uint32_t, H4, DO_MSLT)
1467GEN_VEXT_CMP_VX(vmsltu_vx_d, uint64_t, H8, DO_MSLT)
1468
1469GEN_VEXT_CMP_VX(vmslt_vx_b, int8_t,  H1, DO_MSLT)
1470GEN_VEXT_CMP_VX(vmslt_vx_h, int16_t, H2, DO_MSLT)
1471GEN_VEXT_CMP_VX(vmslt_vx_w, int32_t, H4, DO_MSLT)
1472GEN_VEXT_CMP_VX(vmslt_vx_d, int64_t, H8, DO_MSLT)
1473
1474GEN_VEXT_CMP_VX(vmsleu_vx_b, uint8_t,  H1, DO_MSLE)
1475GEN_VEXT_CMP_VX(vmsleu_vx_h, uint16_t, H2, DO_MSLE)
1476GEN_VEXT_CMP_VX(vmsleu_vx_w, uint32_t, H4, DO_MSLE)
1477GEN_VEXT_CMP_VX(vmsleu_vx_d, uint64_t, H8, DO_MSLE)
1478
1479GEN_VEXT_CMP_VX(vmsle_vx_b, int8_t,  H1, DO_MSLE)
1480GEN_VEXT_CMP_VX(vmsle_vx_h, int16_t, H2, DO_MSLE)
1481GEN_VEXT_CMP_VX(vmsle_vx_w, int32_t, H4, DO_MSLE)
1482GEN_VEXT_CMP_VX(vmsle_vx_d, int64_t, H8, DO_MSLE)
1483
1484GEN_VEXT_CMP_VX(vmsgtu_vx_b, uint8_t,  H1, DO_MSGT)
1485GEN_VEXT_CMP_VX(vmsgtu_vx_h, uint16_t, H2, DO_MSGT)
1486GEN_VEXT_CMP_VX(vmsgtu_vx_w, uint32_t, H4, DO_MSGT)
1487GEN_VEXT_CMP_VX(vmsgtu_vx_d, uint64_t, H8, DO_MSGT)
1488
1489GEN_VEXT_CMP_VX(vmsgt_vx_b, int8_t,  H1, DO_MSGT)
1490GEN_VEXT_CMP_VX(vmsgt_vx_h, int16_t, H2, DO_MSGT)
1491GEN_VEXT_CMP_VX(vmsgt_vx_w, int32_t, H4, DO_MSGT)
1492GEN_VEXT_CMP_VX(vmsgt_vx_d, int64_t, H8, DO_MSGT)
1493
1494/* Vector Integer Min/Max Instructions */
1495RVVCALL(OPIVV2, vminu_vv_b, OP_UUU_B, H1, H1, H1, DO_MIN)
1496RVVCALL(OPIVV2, vminu_vv_h, OP_UUU_H, H2, H2, H2, DO_MIN)
1497RVVCALL(OPIVV2, vminu_vv_w, OP_UUU_W, H4, H4, H4, DO_MIN)
1498RVVCALL(OPIVV2, vminu_vv_d, OP_UUU_D, H8, H8, H8, DO_MIN)
1499RVVCALL(OPIVV2, vmin_vv_b, OP_SSS_B, H1, H1, H1, DO_MIN)
1500RVVCALL(OPIVV2, vmin_vv_h, OP_SSS_H, H2, H2, H2, DO_MIN)
1501RVVCALL(OPIVV2, vmin_vv_w, OP_SSS_W, H4, H4, H4, DO_MIN)
1502RVVCALL(OPIVV2, vmin_vv_d, OP_SSS_D, H8, H8, H8, DO_MIN)
1503RVVCALL(OPIVV2, vmaxu_vv_b, OP_UUU_B, H1, H1, H1, DO_MAX)
1504RVVCALL(OPIVV2, vmaxu_vv_h, OP_UUU_H, H2, H2, H2, DO_MAX)
1505RVVCALL(OPIVV2, vmaxu_vv_w, OP_UUU_W, H4, H4, H4, DO_MAX)
1506RVVCALL(OPIVV2, vmaxu_vv_d, OP_UUU_D, H8, H8, H8, DO_MAX)
1507RVVCALL(OPIVV2, vmax_vv_b, OP_SSS_B, H1, H1, H1, DO_MAX)
1508RVVCALL(OPIVV2, vmax_vv_h, OP_SSS_H, H2, H2, H2, DO_MAX)
1509RVVCALL(OPIVV2, vmax_vv_w, OP_SSS_W, H4, H4, H4, DO_MAX)
1510RVVCALL(OPIVV2, vmax_vv_d, OP_SSS_D, H8, H8, H8, DO_MAX)
1511GEN_VEXT_VV(vminu_vv_b, 1)
1512GEN_VEXT_VV(vminu_vv_h, 2)
1513GEN_VEXT_VV(vminu_vv_w, 4)
1514GEN_VEXT_VV(vminu_vv_d, 8)
1515GEN_VEXT_VV(vmin_vv_b, 1)
1516GEN_VEXT_VV(vmin_vv_h, 2)
1517GEN_VEXT_VV(vmin_vv_w, 4)
1518GEN_VEXT_VV(vmin_vv_d, 8)
1519GEN_VEXT_VV(vmaxu_vv_b, 1)
1520GEN_VEXT_VV(vmaxu_vv_h, 2)
1521GEN_VEXT_VV(vmaxu_vv_w, 4)
1522GEN_VEXT_VV(vmaxu_vv_d, 8)
1523GEN_VEXT_VV(vmax_vv_b, 1)
1524GEN_VEXT_VV(vmax_vv_h, 2)
1525GEN_VEXT_VV(vmax_vv_w, 4)
1526GEN_VEXT_VV(vmax_vv_d, 8)
1527
1528RVVCALL(OPIVX2, vminu_vx_b, OP_UUU_B, H1, H1, DO_MIN)
1529RVVCALL(OPIVX2, vminu_vx_h, OP_UUU_H, H2, H2, DO_MIN)
1530RVVCALL(OPIVX2, vminu_vx_w, OP_UUU_W, H4, H4, DO_MIN)
1531RVVCALL(OPIVX2, vminu_vx_d, OP_UUU_D, H8, H8, DO_MIN)
1532RVVCALL(OPIVX2, vmin_vx_b, OP_SSS_B, H1, H1, DO_MIN)
1533RVVCALL(OPIVX2, vmin_vx_h, OP_SSS_H, H2, H2, DO_MIN)
1534RVVCALL(OPIVX2, vmin_vx_w, OP_SSS_W, H4, H4, DO_MIN)
1535RVVCALL(OPIVX2, vmin_vx_d, OP_SSS_D, H8, H8, DO_MIN)
1536RVVCALL(OPIVX2, vmaxu_vx_b, OP_UUU_B, H1, H1, DO_MAX)
1537RVVCALL(OPIVX2, vmaxu_vx_h, OP_UUU_H, H2, H2, DO_MAX)
1538RVVCALL(OPIVX2, vmaxu_vx_w, OP_UUU_W, H4, H4, DO_MAX)
1539RVVCALL(OPIVX2, vmaxu_vx_d, OP_UUU_D, H8, H8, DO_MAX)
1540RVVCALL(OPIVX2, vmax_vx_b, OP_SSS_B, H1, H1, DO_MAX)
1541RVVCALL(OPIVX2, vmax_vx_h, OP_SSS_H, H2, H2, DO_MAX)
1542RVVCALL(OPIVX2, vmax_vx_w, OP_SSS_W, H4, H4, DO_MAX)
1543RVVCALL(OPIVX2, vmax_vx_d, OP_SSS_D, H8, H8, DO_MAX)
1544GEN_VEXT_VX(vminu_vx_b, 1)
1545GEN_VEXT_VX(vminu_vx_h, 2)
1546GEN_VEXT_VX(vminu_vx_w, 4)
1547GEN_VEXT_VX(vminu_vx_d, 8)
1548GEN_VEXT_VX(vmin_vx_b, 1)
1549GEN_VEXT_VX(vmin_vx_h, 2)
1550GEN_VEXT_VX(vmin_vx_w, 4)
1551GEN_VEXT_VX(vmin_vx_d, 8)
1552GEN_VEXT_VX(vmaxu_vx_b, 1)
1553GEN_VEXT_VX(vmaxu_vx_h, 2)
1554GEN_VEXT_VX(vmaxu_vx_w, 4)
1555GEN_VEXT_VX(vmaxu_vx_d, 8)
1556GEN_VEXT_VX(vmax_vx_b, 1)
1557GEN_VEXT_VX(vmax_vx_h, 2)
1558GEN_VEXT_VX(vmax_vx_w, 4)
1559GEN_VEXT_VX(vmax_vx_d, 8)
1560
1561/* Vector Single-Width Integer Multiply Instructions */
1562#define DO_MUL(N, M) (N * M)
1563RVVCALL(OPIVV2, vmul_vv_b, OP_SSS_B, H1, H1, H1, DO_MUL)
1564RVVCALL(OPIVV2, vmul_vv_h, OP_SSS_H, H2, H2, H2, DO_MUL)
1565RVVCALL(OPIVV2, vmul_vv_w, OP_SSS_W, H4, H4, H4, DO_MUL)
1566RVVCALL(OPIVV2, vmul_vv_d, OP_SSS_D, H8, H8, H8, DO_MUL)
1567GEN_VEXT_VV(vmul_vv_b, 1)
1568GEN_VEXT_VV(vmul_vv_h, 2)
1569GEN_VEXT_VV(vmul_vv_w, 4)
1570GEN_VEXT_VV(vmul_vv_d, 8)
1571
1572static int8_t do_mulh_b(int8_t s2, int8_t s1)
1573{
1574    return (int16_t)s2 * (int16_t)s1 >> 8;
1575}
1576
1577static int16_t do_mulh_h(int16_t s2, int16_t s1)
1578{
1579    return (int32_t)s2 * (int32_t)s1 >> 16;
1580}
1581
1582static int32_t do_mulh_w(int32_t s2, int32_t s1)
1583{
1584    return (int64_t)s2 * (int64_t)s1 >> 32;
1585}
1586
1587static int64_t do_mulh_d(int64_t s2, int64_t s1)
1588{
1589    uint64_t hi_64, lo_64;
1590
1591    muls64(&lo_64, &hi_64, s1, s2);
1592    return hi_64;
1593}
1594
1595static uint8_t do_mulhu_b(uint8_t s2, uint8_t s1)
1596{
1597    return (uint16_t)s2 * (uint16_t)s1 >> 8;
1598}
1599
1600static uint16_t do_mulhu_h(uint16_t s2, uint16_t s1)
1601{
1602    return (uint32_t)s2 * (uint32_t)s1 >> 16;
1603}
1604
1605static uint32_t do_mulhu_w(uint32_t s2, uint32_t s1)
1606{
1607    return (uint64_t)s2 * (uint64_t)s1 >> 32;
1608}
1609
1610static uint64_t do_mulhu_d(uint64_t s2, uint64_t s1)
1611{
1612    uint64_t hi_64, lo_64;
1613
1614    mulu64(&lo_64, &hi_64, s2, s1);
1615    return hi_64;
1616}
1617
1618static int8_t do_mulhsu_b(int8_t s2, uint8_t s1)
1619{
1620    return (int16_t)s2 * (uint16_t)s1 >> 8;
1621}
1622
1623static int16_t do_mulhsu_h(int16_t s2, uint16_t s1)
1624{
1625    return (int32_t)s2 * (uint32_t)s1 >> 16;
1626}
1627
1628static int32_t do_mulhsu_w(int32_t s2, uint32_t s1)
1629{
1630    return (int64_t)s2 * (uint64_t)s1 >> 32;
1631}
1632
1633/*
1634 * Signed-by-unsigned high-half multiply, via the unsigned primitive:
1635 *
1636 * Let  A  = signed 64-bit operand (s2),
1637 *      B  = unsigned 64-bit operand (s1),
1638 *      Au = unsigned bit pattern of A, so Au = A + 2 ** 64 when A < 0,
1639 *      P  = mulu64(Au, B), the exact 128-bit unsigned product.
1640 *
1641 * IF A >= 0
1642 *      A * B = P, so the signed high half is just HI_P.
1643 * IF A < 0
1644 *      A * B = (Au - 2 ** 64) * B
1645 *            = P - 2 ** 64 * B,
1646 *      so the low 64 bits are unchanged and the high half is HI_P - B.
1647 *
1648 * THEREFORE
1649 *      HI_P -= (A < 0 ? B : 0)
1650 */
1651
1652static int64_t do_mulhsu_d(int64_t s2, uint64_t s1)
1653{
1654    uint64_t hi_64, lo_64;
1655
1656    mulu64(&lo_64, &hi_64, s2, s1);
1657
1658    hi_64 -= s2 < 0 ? s1 : 0;
1659    return hi_64;
1660}
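
/*
 * Worked example of the correction above, scaled down to 8 x 8 -> 16 bits
 * (illustration only, hypothetical name): A = -2 (bit pattern 0xfe), B = 3.
 * The unsigned product is 0xfe * 3 = 0x02fa, so hi(P) = 0x02.  The true
 * signed-by-unsigned product is -6 = 0xfffa, whose high byte is 0xff, and
 * indeed hi(P) - B = 0x02 - 0x03 = 0xff (mod 2 ** 8).
 */
static inline int8_t example_mulhsu_hi8(int8_t a, uint8_t b)
{
    uint16_t p = (uint16_t)((uint8_t)a * b);  /* unsigned widening multiply */
    uint8_t hi = p >> 8;

    hi -= (a < 0) ? b : 0;                    /* same fixup as do_mulhsu_d() */
    return (int8_t)hi;
}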
1661
1662RVVCALL(OPIVV2, vmulh_vv_b, OP_SSS_B, H1, H1, H1, do_mulh_b)
1663RVVCALL(OPIVV2, vmulh_vv_h, OP_SSS_H, H2, H2, H2, do_mulh_h)
1664RVVCALL(OPIVV2, vmulh_vv_w, OP_SSS_W, H4, H4, H4, do_mulh_w)
1665RVVCALL(OPIVV2, vmulh_vv_d, OP_SSS_D, H8, H8, H8, do_mulh_d)
1666RVVCALL(OPIVV2, vmulhu_vv_b, OP_UUU_B, H1, H1, H1, do_mulhu_b)
1667RVVCALL(OPIVV2, vmulhu_vv_h, OP_UUU_H, H2, H2, H2, do_mulhu_h)
1668RVVCALL(OPIVV2, vmulhu_vv_w, OP_UUU_W, H4, H4, H4, do_mulhu_w)
1669RVVCALL(OPIVV2, vmulhu_vv_d, OP_UUU_D, H8, H8, H8, do_mulhu_d)
1670RVVCALL(OPIVV2, vmulhsu_vv_b, OP_SUS_B, H1, H1, H1, do_mulhsu_b)
1671RVVCALL(OPIVV2, vmulhsu_vv_h, OP_SUS_H, H2, H2, H2, do_mulhsu_h)
1672RVVCALL(OPIVV2, vmulhsu_vv_w, OP_SUS_W, H4, H4, H4, do_mulhsu_w)
1673RVVCALL(OPIVV2, vmulhsu_vv_d, OP_SUS_D, H8, H8, H8, do_mulhsu_d)
1674GEN_VEXT_VV(vmulh_vv_b, 1)
1675GEN_VEXT_VV(vmulh_vv_h, 2)
1676GEN_VEXT_VV(vmulh_vv_w, 4)
1677GEN_VEXT_VV(vmulh_vv_d, 8)
1678GEN_VEXT_VV(vmulhu_vv_b, 1)
1679GEN_VEXT_VV(vmulhu_vv_h, 2)
1680GEN_VEXT_VV(vmulhu_vv_w, 4)
1681GEN_VEXT_VV(vmulhu_vv_d, 8)
1682GEN_VEXT_VV(vmulhsu_vv_b, 1)
1683GEN_VEXT_VV(vmulhsu_vv_h, 2)
1684GEN_VEXT_VV(vmulhsu_vv_w, 4)
1685GEN_VEXT_VV(vmulhsu_vv_d, 8)
1686
1687RVVCALL(OPIVX2, vmul_vx_b, OP_SSS_B, H1, H1, DO_MUL)
1688RVVCALL(OPIVX2, vmul_vx_h, OP_SSS_H, H2, H2, DO_MUL)
1689RVVCALL(OPIVX2, vmul_vx_w, OP_SSS_W, H4, H4, DO_MUL)
1690RVVCALL(OPIVX2, vmul_vx_d, OP_SSS_D, H8, H8, DO_MUL)
1691RVVCALL(OPIVX2, vmulh_vx_b, OP_SSS_B, H1, H1, do_mulh_b)
1692RVVCALL(OPIVX2, vmulh_vx_h, OP_SSS_H, H2, H2, do_mulh_h)
1693RVVCALL(OPIVX2, vmulh_vx_w, OP_SSS_W, H4, H4, do_mulh_w)
1694RVVCALL(OPIVX2, vmulh_vx_d, OP_SSS_D, H8, H8, do_mulh_d)
1695RVVCALL(OPIVX2, vmulhu_vx_b, OP_UUU_B, H1, H1, do_mulhu_b)
1696RVVCALL(OPIVX2, vmulhu_vx_h, OP_UUU_H, H2, H2, do_mulhu_h)
1697RVVCALL(OPIVX2, vmulhu_vx_w, OP_UUU_W, H4, H4, do_mulhu_w)
1698RVVCALL(OPIVX2, vmulhu_vx_d, OP_UUU_D, H8, H8, do_mulhu_d)
1699RVVCALL(OPIVX2, vmulhsu_vx_b, OP_SUS_B, H1, H1, do_mulhsu_b)
1700RVVCALL(OPIVX2, vmulhsu_vx_h, OP_SUS_H, H2, H2, do_mulhsu_h)
1701RVVCALL(OPIVX2, vmulhsu_vx_w, OP_SUS_W, H4, H4, do_mulhsu_w)
1702RVVCALL(OPIVX2, vmulhsu_vx_d, OP_SUS_D, H8, H8, do_mulhsu_d)
1703GEN_VEXT_VX(vmul_vx_b, 1)
1704GEN_VEXT_VX(vmul_vx_h, 2)
1705GEN_VEXT_VX(vmul_vx_w, 4)
1706GEN_VEXT_VX(vmul_vx_d, 8)
1707GEN_VEXT_VX(vmulh_vx_b, 1)
1708GEN_VEXT_VX(vmulh_vx_h, 2)
1709GEN_VEXT_VX(vmulh_vx_w, 4)
1710GEN_VEXT_VX(vmulh_vx_d, 8)
1711GEN_VEXT_VX(vmulhu_vx_b, 1)
1712GEN_VEXT_VX(vmulhu_vx_h, 2)
1713GEN_VEXT_VX(vmulhu_vx_w, 4)
1714GEN_VEXT_VX(vmulhu_vx_d, 8)
1715GEN_VEXT_VX(vmulhsu_vx_b, 1)
1716GEN_VEXT_VX(vmulhsu_vx_h, 2)
1717GEN_VEXT_VX(vmulhsu_vx_w, 4)
1718GEN_VEXT_VX(vmulhsu_vx_d, 8)
1719
1720/* Vector Integer Divide Instructions */
1721#define DO_DIVU(N, M) (unlikely(M == 0) ? (__typeof(N))(-1) : N / M)
1722#define DO_REMU(N, M) (unlikely(M == 0) ? N : N % M)
1723#define DO_DIV(N, M)  (unlikely(M == 0) ? (__typeof(N))(-1) :\
1724        unlikely((N == -N) && (M == (__typeof(N))(-1))) ? N : N / M)
1725#define DO_REM(N, M)  (unlikely(M == 0) ? N :\
1726        unlikely((N == -N) && (M == (__typeof(N))(-1))) ? 0 : N % M)
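
/*
 * Sketch of the corner cases the DO_DIV/DO_REM macros above encode
 * (RISC-V semantics, shown for int8_t; the function name is hypothetical):
 * x / 0 = -1, x % 0 = x, INT8_MIN / -1 = INT8_MIN and INT8_MIN % -1 = 0.
 * Note that (N == -N) is a width-independent test for the minimum value.
 */
static inline int8_t example_div8(int8_t n, int8_t m)
{
    if (m == 0) {
        return -1;                   /* divide by zero: all-ones quotient */
    }
    if (n == INT8_MIN && m == -1) {
        return n;                    /* signed overflow: quotient = dividend */
    }
    return n / m;
}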
1727
1728RVVCALL(OPIVV2, vdivu_vv_b, OP_UUU_B, H1, H1, H1, DO_DIVU)
1729RVVCALL(OPIVV2, vdivu_vv_h, OP_UUU_H, H2, H2, H2, DO_DIVU)
1730RVVCALL(OPIVV2, vdivu_vv_w, OP_UUU_W, H4, H4, H4, DO_DIVU)
1731RVVCALL(OPIVV2, vdivu_vv_d, OP_UUU_D, H8, H8, H8, DO_DIVU)
1732RVVCALL(OPIVV2, vdiv_vv_b, OP_SSS_B, H1, H1, H1, DO_DIV)
1733RVVCALL(OPIVV2, vdiv_vv_h, OP_SSS_H, H2, H2, H2, DO_DIV)
1734RVVCALL(OPIVV2, vdiv_vv_w, OP_SSS_W, H4, H4, H4, DO_DIV)
1735RVVCALL(OPIVV2, vdiv_vv_d, OP_SSS_D, H8, H8, H8, DO_DIV)
1736RVVCALL(OPIVV2, vremu_vv_b, OP_UUU_B, H1, H1, H1, DO_REMU)
1737RVVCALL(OPIVV2, vremu_vv_h, OP_UUU_H, H2, H2, H2, DO_REMU)
1738RVVCALL(OPIVV2, vremu_vv_w, OP_UUU_W, H4, H4, H4, DO_REMU)
1739RVVCALL(OPIVV2, vremu_vv_d, OP_UUU_D, H8, H8, H8, DO_REMU)
1740RVVCALL(OPIVV2, vrem_vv_b, OP_SSS_B, H1, H1, H1, DO_REM)
1741RVVCALL(OPIVV2, vrem_vv_h, OP_SSS_H, H2, H2, H2, DO_REM)
1742RVVCALL(OPIVV2, vrem_vv_w, OP_SSS_W, H4, H4, H4, DO_REM)
1743RVVCALL(OPIVV2, vrem_vv_d, OP_SSS_D, H8, H8, H8, DO_REM)
1744GEN_VEXT_VV(vdivu_vv_b, 1)
1745GEN_VEXT_VV(vdivu_vv_h, 2)
1746GEN_VEXT_VV(vdivu_vv_w, 4)
1747GEN_VEXT_VV(vdivu_vv_d, 8)
1748GEN_VEXT_VV(vdiv_vv_b, 1)
1749GEN_VEXT_VV(vdiv_vv_h, 2)
1750GEN_VEXT_VV(vdiv_vv_w, 4)
1751GEN_VEXT_VV(vdiv_vv_d, 8)
1752GEN_VEXT_VV(vremu_vv_b, 1)
1753GEN_VEXT_VV(vremu_vv_h, 2)
1754GEN_VEXT_VV(vremu_vv_w, 4)
1755GEN_VEXT_VV(vremu_vv_d, 8)
1756GEN_VEXT_VV(vrem_vv_b, 1)
1757GEN_VEXT_VV(vrem_vv_h, 2)
1758GEN_VEXT_VV(vrem_vv_w, 4)
1759GEN_VEXT_VV(vrem_vv_d, 8)
1760
1761RVVCALL(OPIVX2, vdivu_vx_b, OP_UUU_B, H1, H1, DO_DIVU)
1762RVVCALL(OPIVX2, vdivu_vx_h, OP_UUU_H, H2, H2, DO_DIVU)
1763RVVCALL(OPIVX2, vdivu_vx_w, OP_UUU_W, H4, H4, DO_DIVU)
1764RVVCALL(OPIVX2, vdivu_vx_d, OP_UUU_D, H8, H8, DO_DIVU)
1765RVVCALL(OPIVX2, vdiv_vx_b, OP_SSS_B, H1, H1, DO_DIV)
1766RVVCALL(OPIVX2, vdiv_vx_h, OP_SSS_H, H2, H2, DO_DIV)
1767RVVCALL(OPIVX2, vdiv_vx_w, OP_SSS_W, H4, H4, DO_DIV)
1768RVVCALL(OPIVX2, vdiv_vx_d, OP_SSS_D, H8, H8, DO_DIV)
1769RVVCALL(OPIVX2, vremu_vx_b, OP_UUU_B, H1, H1, DO_REMU)
1770RVVCALL(OPIVX2, vremu_vx_h, OP_UUU_H, H2, H2, DO_REMU)
1771RVVCALL(OPIVX2, vremu_vx_w, OP_UUU_W, H4, H4, DO_REMU)
1772RVVCALL(OPIVX2, vremu_vx_d, OP_UUU_D, H8, H8, DO_REMU)
1773RVVCALL(OPIVX2, vrem_vx_b, OP_SSS_B, H1, H1, DO_REM)
1774RVVCALL(OPIVX2, vrem_vx_h, OP_SSS_H, H2, H2, DO_REM)
1775RVVCALL(OPIVX2, vrem_vx_w, OP_SSS_W, H4, H4, DO_REM)
1776RVVCALL(OPIVX2, vrem_vx_d, OP_SSS_D, H8, H8, DO_REM)
1777GEN_VEXT_VX(vdivu_vx_b, 1)
1778GEN_VEXT_VX(vdivu_vx_h, 2)
1779GEN_VEXT_VX(vdivu_vx_w, 4)
1780GEN_VEXT_VX(vdivu_vx_d, 8)
1781GEN_VEXT_VX(vdiv_vx_b, 1)
1782GEN_VEXT_VX(vdiv_vx_h, 2)
1783GEN_VEXT_VX(vdiv_vx_w, 4)
1784GEN_VEXT_VX(vdiv_vx_d, 8)
1785GEN_VEXT_VX(vremu_vx_b, 1)
1786GEN_VEXT_VX(vremu_vx_h, 2)
1787GEN_VEXT_VX(vremu_vx_w, 4)
1788GEN_VEXT_VX(vremu_vx_d, 8)
1789GEN_VEXT_VX(vrem_vx_b, 1)
1790GEN_VEXT_VX(vrem_vx_h, 2)
1791GEN_VEXT_VX(vrem_vx_w, 4)
1792GEN_VEXT_VX(vrem_vx_d, 8)
1793
1794/* Vector Widening Integer Multiply Instructions */
1795RVVCALL(OPIVV2, vwmul_vv_b, WOP_SSS_B, H2, H1, H1, DO_MUL)
1796RVVCALL(OPIVV2, vwmul_vv_h, WOP_SSS_H, H4, H2, H2, DO_MUL)
1797RVVCALL(OPIVV2, vwmul_vv_w, WOP_SSS_W, H8, H4, H4, DO_MUL)
1798RVVCALL(OPIVV2, vwmulu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MUL)
1799RVVCALL(OPIVV2, vwmulu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MUL)
1800RVVCALL(OPIVV2, vwmulu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MUL)
1801RVVCALL(OPIVV2, vwmulsu_vv_b, WOP_SUS_B, H2, H1, H1, DO_MUL)
1802RVVCALL(OPIVV2, vwmulsu_vv_h, WOP_SUS_H, H4, H2, H2, DO_MUL)
1803RVVCALL(OPIVV2, vwmulsu_vv_w, WOP_SUS_W, H8, H4, H4, DO_MUL)
1804GEN_VEXT_VV(vwmul_vv_b, 2)
1805GEN_VEXT_VV(vwmul_vv_h, 4)
1806GEN_VEXT_VV(vwmul_vv_w, 8)
1807GEN_VEXT_VV(vwmulu_vv_b, 2)
1808GEN_VEXT_VV(vwmulu_vv_h, 4)
1809GEN_VEXT_VV(vwmulu_vv_w, 8)
1810GEN_VEXT_VV(vwmulsu_vv_b, 2)
1811GEN_VEXT_VV(vwmulsu_vv_h, 4)
1812GEN_VEXT_VV(vwmulsu_vv_w, 8)
1813
1814RVVCALL(OPIVX2, vwmul_vx_b, WOP_SSS_B, H2, H1, DO_MUL)
1815RVVCALL(OPIVX2, vwmul_vx_h, WOP_SSS_H, H4, H2, DO_MUL)
1816RVVCALL(OPIVX2, vwmul_vx_w, WOP_SSS_W, H8, H4, DO_MUL)
1817RVVCALL(OPIVX2, vwmulu_vx_b, WOP_UUU_B, H2, H1, DO_MUL)
1818RVVCALL(OPIVX2, vwmulu_vx_h, WOP_UUU_H, H4, H2, DO_MUL)
1819RVVCALL(OPIVX2, vwmulu_vx_w, WOP_UUU_W, H8, H4, DO_MUL)
1820RVVCALL(OPIVX2, vwmulsu_vx_b, WOP_SUS_B, H2, H1, DO_MUL)
1821RVVCALL(OPIVX2, vwmulsu_vx_h, WOP_SUS_H, H4, H2, DO_MUL)
1822RVVCALL(OPIVX2, vwmulsu_vx_w, WOP_SUS_W, H8, H4, DO_MUL)
1823GEN_VEXT_VX(vwmul_vx_b, 2)
1824GEN_VEXT_VX(vwmul_vx_h, 4)
1825GEN_VEXT_VX(vwmul_vx_w, 8)
1826GEN_VEXT_VX(vwmulu_vx_b, 2)
1827GEN_VEXT_VX(vwmulu_vx_h, 4)
1828GEN_VEXT_VX(vwmulu_vx_w, 8)
1829GEN_VEXT_VX(vwmulsu_vx_b, 2)
1830GEN_VEXT_VX(vwmulsu_vx_h, 4)
1831GEN_VEXT_VX(vwmulsu_vx_w, 8)
1832
1833/* Vector Single-Width Integer Multiply-Add Instructions */
1834#define OPIVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)   \
1835static void do_##NAME(void *vd, void *vs1, void *vs2, int i)       \
1836{                                                                  \
1837    TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
1838    TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
1839    TD d = *((TD *)vd + HD(i));                                    \
1840    *((TD *)vd + HD(i)) = OP(s2, s1, d);                           \
1841}
1842
1843#define DO_MACC(N, M, D) (M * N + D)
1844#define DO_NMSAC(N, M, D) (-(M * N) + D)
1845#define DO_MADD(N, M, D) (M * D + N)
1846#define DO_NMSUB(N, M, D) (-(M * D) + N)
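
/*
 * Quick reference for the operand roles above (hypothetical names, int32_t
 * chosen only for illustration): vmacc/vnmsac accumulate into the old
 * destination, while vmadd/vnmsub multiply the old destination and add the
 * other source.
 */
static inline int32_t example_macc(int32_t vs2, int32_t vs1, int32_t vd)
{
    return vs1 * vs2 + vd;           /* vd <- +(vs1 * vs2) + vd */
}

static inline int32_t example_madd(int32_t vs2, int32_t vs1, int32_t vd)
{
    return vs1 * vd + vs2;           /* vd <- +(vs1 * vd) + vs2 */
}
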
1847RVVCALL(OPIVV3, vmacc_vv_b, OP_SSS_B, H1, H1, H1, DO_MACC)
1848RVVCALL(OPIVV3, vmacc_vv_h, OP_SSS_H, H2, H2, H2, DO_MACC)
1849RVVCALL(OPIVV3, vmacc_vv_w, OP_SSS_W, H4, H4, H4, DO_MACC)
1850RVVCALL(OPIVV3, vmacc_vv_d, OP_SSS_D, H8, H8, H8, DO_MACC)
1851RVVCALL(OPIVV3, vnmsac_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSAC)
1852RVVCALL(OPIVV3, vnmsac_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSAC)
1853RVVCALL(OPIVV3, vnmsac_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSAC)
1854RVVCALL(OPIVV3, vnmsac_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSAC)
1855RVVCALL(OPIVV3, vmadd_vv_b, OP_SSS_B, H1, H1, H1, DO_MADD)
1856RVVCALL(OPIVV3, vmadd_vv_h, OP_SSS_H, H2, H2, H2, DO_MADD)
1857RVVCALL(OPIVV3, vmadd_vv_w, OP_SSS_W, H4, H4, H4, DO_MADD)
1858RVVCALL(OPIVV3, vmadd_vv_d, OP_SSS_D, H8, H8, H8, DO_MADD)
1859RVVCALL(OPIVV3, vnmsub_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSUB)
1860RVVCALL(OPIVV3, vnmsub_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSUB)
1861RVVCALL(OPIVV3, vnmsub_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSUB)
1862RVVCALL(OPIVV3, vnmsub_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSUB)
1863GEN_VEXT_VV(vmacc_vv_b, 1)
1864GEN_VEXT_VV(vmacc_vv_h, 2)
1865GEN_VEXT_VV(vmacc_vv_w, 4)
1866GEN_VEXT_VV(vmacc_vv_d, 8)
1867GEN_VEXT_VV(vnmsac_vv_b, 1)
1868GEN_VEXT_VV(vnmsac_vv_h, 2)
1869GEN_VEXT_VV(vnmsac_vv_w, 4)
1870GEN_VEXT_VV(vnmsac_vv_d, 8)
1871GEN_VEXT_VV(vmadd_vv_b, 1)
1872GEN_VEXT_VV(vmadd_vv_h, 2)
1873GEN_VEXT_VV(vmadd_vv_w, 4)
1874GEN_VEXT_VV(vmadd_vv_d, 8)
1875GEN_VEXT_VV(vnmsub_vv_b, 1)
1876GEN_VEXT_VV(vnmsub_vv_h, 2)
1877GEN_VEXT_VV(vnmsub_vv_w, 4)
1878GEN_VEXT_VV(vnmsub_vv_d, 8)
1879
1880#define OPIVX3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
1881static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
1882{                                                                   \
1883    TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
1884    TD d = *((TD *)vd + HD(i));                                     \
1885    *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d);                   \
1886}
1887
1888RVVCALL(OPIVX3, vmacc_vx_b, OP_SSS_B, H1, H1, DO_MACC)
1889RVVCALL(OPIVX3, vmacc_vx_h, OP_SSS_H, H2, H2, DO_MACC)
1890RVVCALL(OPIVX3, vmacc_vx_w, OP_SSS_W, H4, H4, DO_MACC)
1891RVVCALL(OPIVX3, vmacc_vx_d, OP_SSS_D, H8, H8, DO_MACC)
1892RVVCALL(OPIVX3, vnmsac_vx_b, OP_SSS_B, H1, H1, DO_NMSAC)
1893RVVCALL(OPIVX3, vnmsac_vx_h, OP_SSS_H, H2, H2, DO_NMSAC)
1894RVVCALL(OPIVX3, vnmsac_vx_w, OP_SSS_W, H4, H4, DO_NMSAC)
1895RVVCALL(OPIVX3, vnmsac_vx_d, OP_SSS_D, H8, H8, DO_NMSAC)
1896RVVCALL(OPIVX3, vmadd_vx_b, OP_SSS_B, H1, H1, DO_MADD)
1897RVVCALL(OPIVX3, vmadd_vx_h, OP_SSS_H, H2, H2, DO_MADD)
1898RVVCALL(OPIVX3, vmadd_vx_w, OP_SSS_W, H4, H4, DO_MADD)
1899RVVCALL(OPIVX3, vmadd_vx_d, OP_SSS_D, H8, H8, DO_MADD)
1900RVVCALL(OPIVX3, vnmsub_vx_b, OP_SSS_B, H1, H1, DO_NMSUB)
1901RVVCALL(OPIVX3, vnmsub_vx_h, OP_SSS_H, H2, H2, DO_NMSUB)
1902RVVCALL(OPIVX3, vnmsub_vx_w, OP_SSS_W, H4, H4, DO_NMSUB)
1903RVVCALL(OPIVX3, vnmsub_vx_d, OP_SSS_D, H8, H8, DO_NMSUB)
1904GEN_VEXT_VX(vmacc_vx_b, 1)
1905GEN_VEXT_VX(vmacc_vx_h, 2)
1906GEN_VEXT_VX(vmacc_vx_w, 4)
1907GEN_VEXT_VX(vmacc_vx_d, 8)
1908GEN_VEXT_VX(vnmsac_vx_b, 1)
1909GEN_VEXT_VX(vnmsac_vx_h, 2)
1910GEN_VEXT_VX(vnmsac_vx_w, 4)
1911GEN_VEXT_VX(vnmsac_vx_d, 8)
1912GEN_VEXT_VX(vmadd_vx_b, 1)
1913GEN_VEXT_VX(vmadd_vx_h, 2)
1914GEN_VEXT_VX(vmadd_vx_w, 4)
1915GEN_VEXT_VX(vmadd_vx_d, 8)
1916GEN_VEXT_VX(vnmsub_vx_b, 1)
1917GEN_VEXT_VX(vnmsub_vx_h, 2)
1918GEN_VEXT_VX(vnmsub_vx_w, 4)
1919GEN_VEXT_VX(vnmsub_vx_d, 8)
1920
1921/* Vector Widening Integer Multiply-Add Instructions */
1922RVVCALL(OPIVV3, vwmaccu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MACC)
1923RVVCALL(OPIVV3, vwmaccu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MACC)
1924RVVCALL(OPIVV3, vwmaccu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MACC)
1925RVVCALL(OPIVV3, vwmacc_vv_b, WOP_SSS_B, H2, H1, H1, DO_MACC)
1926RVVCALL(OPIVV3, vwmacc_vv_h, WOP_SSS_H, H4, H2, H2, DO_MACC)
1927RVVCALL(OPIVV3, vwmacc_vv_w, WOP_SSS_W, H8, H4, H4, DO_MACC)
1928RVVCALL(OPIVV3, vwmaccsu_vv_b, WOP_SSU_B, H2, H1, H1, DO_MACC)
1929RVVCALL(OPIVV3, vwmaccsu_vv_h, WOP_SSU_H, H4, H2, H2, DO_MACC)
1930RVVCALL(OPIVV3, vwmaccsu_vv_w, WOP_SSU_W, H8, H4, H4, DO_MACC)
1931GEN_VEXT_VV(vwmaccu_vv_b, 2)
1932GEN_VEXT_VV(vwmaccu_vv_h, 4)
1933GEN_VEXT_VV(vwmaccu_vv_w, 8)
1934GEN_VEXT_VV(vwmacc_vv_b, 2)
1935GEN_VEXT_VV(vwmacc_vv_h, 4)
1936GEN_VEXT_VV(vwmacc_vv_w, 8)
1937GEN_VEXT_VV(vwmaccsu_vv_b, 2)
1938GEN_VEXT_VV(vwmaccsu_vv_h, 4)
1939GEN_VEXT_VV(vwmaccsu_vv_w, 8)
1940
1941RVVCALL(OPIVX3, vwmaccu_vx_b, WOP_UUU_B, H2, H1, DO_MACC)
1942RVVCALL(OPIVX3, vwmaccu_vx_h, WOP_UUU_H, H4, H2, DO_MACC)
1943RVVCALL(OPIVX3, vwmaccu_vx_w, WOP_UUU_W, H8, H4, DO_MACC)
1944RVVCALL(OPIVX3, vwmacc_vx_b, WOP_SSS_B, H2, H1, DO_MACC)
1945RVVCALL(OPIVX3, vwmacc_vx_h, WOP_SSS_H, H4, H2, DO_MACC)
1946RVVCALL(OPIVX3, vwmacc_vx_w, WOP_SSS_W, H8, H4, DO_MACC)
1947RVVCALL(OPIVX3, vwmaccsu_vx_b, WOP_SSU_B, H2, H1, DO_MACC)
1948RVVCALL(OPIVX3, vwmaccsu_vx_h, WOP_SSU_H, H4, H2, DO_MACC)
1949RVVCALL(OPIVX3, vwmaccsu_vx_w, WOP_SSU_W, H8, H4, DO_MACC)
1950RVVCALL(OPIVX3, vwmaccus_vx_b, WOP_SUS_B, H2, H1, DO_MACC)
1951RVVCALL(OPIVX3, vwmaccus_vx_h, WOP_SUS_H, H4, H2, DO_MACC)
1952RVVCALL(OPIVX3, vwmaccus_vx_w, WOP_SUS_W, H8, H4, DO_MACC)
1953GEN_VEXT_VX(vwmaccu_vx_b, 2)
1954GEN_VEXT_VX(vwmaccu_vx_h, 4)
1955GEN_VEXT_VX(vwmaccu_vx_w, 8)
1956GEN_VEXT_VX(vwmacc_vx_b, 2)
1957GEN_VEXT_VX(vwmacc_vx_h, 4)
1958GEN_VEXT_VX(vwmacc_vx_w, 8)
1959GEN_VEXT_VX(vwmaccsu_vx_b, 2)
1960GEN_VEXT_VX(vwmaccsu_vx_h, 4)
1961GEN_VEXT_VX(vwmaccsu_vx_w, 8)
1962GEN_VEXT_VX(vwmaccus_vx_b, 2)
1963GEN_VEXT_VX(vwmaccus_vx_h, 4)
1964GEN_VEXT_VX(vwmaccus_vx_w, 8)
1965
1966/* Vector Integer Merge and Move Instructions */
1967#define GEN_VEXT_VMV_VV(NAME, ETYPE, H)                              \
1968void HELPER(NAME)(void *vd, void *vs1, CPURISCVState *env,           \
1969                  uint32_t desc)                                     \
1970{                                                                    \
1971    uint32_t vl = env->vl;                                           \
1972    uint32_t esz = sizeof(ETYPE);                                    \
1973    uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
1974    uint32_t vta = vext_vta(desc);                                   \
1975    uint32_t i;                                                      \
1976                                                                     \
1977    for (i = env->vstart; i < vl; i++) {                             \
1978        ETYPE s1 = *((ETYPE *)vs1 + H(i));                           \
1979        *((ETYPE *)vd + H(i)) = s1;                                  \
1980    }                                                                \
1981    env->vstart = 0;                                                 \
1982    /* set tail elements to 1s */                                    \
1983    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
1984}
1985
1986GEN_VEXT_VMV_VV(vmv_v_v_b, int8_t,  H1)
1987GEN_VEXT_VMV_VV(vmv_v_v_h, int16_t, H2)
1988GEN_VEXT_VMV_VV(vmv_v_v_w, int32_t, H4)
1989GEN_VEXT_VMV_VV(vmv_v_v_d, int64_t, H8)
1990
1991#define GEN_VEXT_VMV_VX(NAME, ETYPE, H)                              \
1992void HELPER(NAME)(void *vd, uint64_t s1, CPURISCVState *env,         \
1993                  uint32_t desc)                                     \
1994{                                                                    \
1995    uint32_t vl = env->vl;                                           \
1996    uint32_t esz = sizeof(ETYPE);                                    \
1997    uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
1998    uint32_t vta = vext_vta(desc);                                   \
1999    uint32_t i;                                                      \
2000                                                                     \
2001    for (i = env->vstart; i < vl; i++) {                             \
2002        *((ETYPE *)vd + H(i)) = (ETYPE)s1;                           \
2003    }                                                                \
2004    env->vstart = 0;                                                 \
2005    /* set tail elements to 1s */                                    \
2006    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2007}
2008
2009GEN_VEXT_VMV_VX(vmv_v_x_b, int8_t,  H1)
2010GEN_VEXT_VMV_VX(vmv_v_x_h, int16_t, H2)
2011GEN_VEXT_VMV_VX(vmv_v_x_w, int32_t, H4)
2012GEN_VEXT_VMV_VX(vmv_v_x_d, int64_t, H8)
2013
2014#define GEN_VEXT_VMERGE_VV(NAME, ETYPE, H)                           \
2015void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,          \
2016                  CPURISCVState *env, uint32_t desc)                 \
2017{                                                                    \
2018    uint32_t vl = env->vl;                                           \
2019    uint32_t esz = sizeof(ETYPE);                                    \
2020    uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2021    uint32_t vta = vext_vta(desc);                                   \
2022    uint32_t i;                                                      \
2023                                                                     \
2024    for (i = env->vstart; i < vl; i++) {                             \
2025        ETYPE *vt = (!vext_elem_mask(v0, i) ? vs2 : vs1);            \
2026        *((ETYPE *)vd + H(i)) = *(vt + H(i));                        \
2027    }                                                                \
2028    env->vstart = 0;                                                 \
2029    /* set tail elements to 1s */                                    \
2030    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2031}
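
/*
 * Element-level view of vmerge.vvm (standalone sketch, hypothetical name):
 * a set mask bit selects the vs1 element and a clear bit selects vs2, which
 * is why the helper above tests !vext_elem_mask().
 */
static inline int8_t example_merge_element(bool mask_bit, int8_t s1, int8_t s2)
{
    return mask_bit ? s1 : s2;
}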
2032
2033GEN_VEXT_VMERGE_VV(vmerge_vvm_b, int8_t,  H1)
2034GEN_VEXT_VMERGE_VV(vmerge_vvm_h, int16_t, H2)
2035GEN_VEXT_VMERGE_VV(vmerge_vvm_w, int32_t, H4)
2036GEN_VEXT_VMERGE_VV(vmerge_vvm_d, int64_t, H8)
2037
2038#define GEN_VEXT_VMERGE_VX(NAME, ETYPE, H)                           \
2039void HELPER(NAME)(void *vd, void *v0, target_ulong s1,               \
2040                  void *vs2, CPURISCVState *env, uint32_t desc)      \
2041{                                                                    \
2042    uint32_t vl = env->vl;                                           \
2043    uint32_t esz = sizeof(ETYPE);                                    \
2044    uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2045    uint32_t vta = vext_vta(desc);                                   \
2046    uint32_t i;                                                      \
2047                                                                     \
2048    for (i = env->vstart; i < vl; i++) {                             \
2049        ETYPE s2 = *((ETYPE *)vs2 + H(i));                           \
2050        ETYPE d = (!vext_elem_mask(v0, i) ? s2 :                     \
2051                   (ETYPE)(target_long)s1);                          \
2052        *((ETYPE *)vd + H(i)) = d;                                   \
2053    }                                                                \
2054    env->vstart = 0;                                                 \
2055    /* set tail elements to 1s */                                    \
2056    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2057}
2058
2059GEN_VEXT_VMERGE_VX(vmerge_vxm_b, int8_t,  H1)
2060GEN_VEXT_VMERGE_VX(vmerge_vxm_h, int16_t, H2)
2061GEN_VEXT_VMERGE_VX(vmerge_vxm_w, int32_t, H4)
2062GEN_VEXT_VMERGE_VX(vmerge_vxm_d, int64_t, H8)
2063
2064/*
2065 *** Vector Fixed-Point Arithmetic Instructions
2066 */
2067
2068/* Vector Single-Width Saturating Add and Subtract */
2069
2070/*
2071 * Fixed-point instructions need a rounding mode and may saturate, so
2072 * define the common fixed-point macros here.
2073 */
2074typedef void opivv2_rm_fn(void *vd, void *vs1, void *vs2, int i,
2075                          CPURISCVState *env, int vxrm);
2076
2077#define OPIVV2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)     \
2078static inline void                                                  \
2079do_##NAME(void *vd, void *vs1, void *vs2, int i,                    \
2080          CPURISCVState *env, int vxrm)                             \
2081{                                                                   \
2082    TX1 s1 = *((T1 *)vs1 + HS1(i));                                 \
2083    TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2084    *((TD *)vd + HD(i)) = OP(env, vxrm, s2, s1);                    \
2085}
2086
2087static inline void
2088vext_vv_rm_1(void *vd, void *v0, void *vs1, void *vs2,
2089             CPURISCVState *env,
2090             uint32_t vl, uint32_t vm, int vxrm,
2091             opivv2_rm_fn *fn)
2092{
2093    for (uint32_t i = env->vstart; i < vl; i++) {
2094        if (!vm && !vext_elem_mask(v0, i)) {
2095            continue;
2096        }
2097        fn(vd, vs1, vs2, i, env, vxrm);
2098    }
2099    env->vstart = 0;
2100}
2101
2102static inline void
2103vext_vv_rm_2(void *vd, void *v0, void *vs1, void *vs2,
2104             CPURISCVState *env,
2105             uint32_t desc,
2106             opivv2_rm_fn *fn, uint32_t esz)
2107{
2108    uint32_t vm = vext_vm(desc);
2109    uint32_t vl = env->vl;
2110    uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2111    uint32_t vta = vext_vta(desc);
2112
2113    switch (env->vxrm) {
2114    case 0: /* rnu */
2115        vext_vv_rm_1(vd, v0, vs1, vs2,
2116                     env, vl, vm, 0, fn);
2117        break;
2118    case 1: /* rne */
2119        vext_vv_rm_1(vd, v0, vs1, vs2,
2120                     env, vl, vm, 1, fn);
2121        break;
2122    case 2: /* rdn */
2123        vext_vv_rm_1(vd, v0, vs1, vs2,
2124                     env, vl, vm, 2, fn);
2125        break;
2126    default: /* rod */
2127        vext_vv_rm_1(vd, v0, vs1, vs2,
2128                     env, vl, vm, 3, fn);
2129        break;
2130    }
2131    /* set tail elements to 1s */
2132    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2133}
2134
2135/* generate helpers for fixed point instructions with OPIVV format */
2136#define GEN_VEXT_VV_RM(NAME, ESZ)                               \
2137void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,     \
2138                  CPURISCVState *env, uint32_t desc)            \
2139{                                                               \
2140    vext_vv_rm_2(vd, v0, vs1, vs2, env, desc,                   \
2141                 do_##NAME, ESZ);                               \
2142}
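
/*
 * Wiring sketch: a new fixed-point OPIVV helper would combine a per-element
 * primitive with the two macros above, roughly as follows (the vmyop names
 * are hypothetical and only illustrate the pattern used by vsaddu below):
 *
 *   static inline uint8_t myop8(CPURISCVState *env, int vxrm,
 *                               uint8_t s2, uint8_t s1);
 *   RVVCALL(OPIVV2_RM, vmyop_vv_b, OP_UUU_B, H1, H1, H1, myop8)
 *   GEN_VEXT_VV_RM(vmyop_vv_b, 1)
 *
 * vext_vv_rm_2() resolves env->vxrm once per call rather than per element,
 * then vext_vv_rm_1() applies the primitive to every active element.
 */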
2143
2144static inline uint8_t saddu8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2145{
2146    uint8_t res = a + b;
2147    if (res < a) {
2148        res = UINT8_MAX;
2149        env->vxsat = 0x1;
2150    }
2151    return res;
2152}
2153
2154static inline uint16_t saddu16(CPURISCVState *env, int vxrm, uint16_t a,
2155                               uint16_t b)
2156{
2157    uint16_t res = a + b;
2158    if (res < a) {
2159        res = UINT16_MAX;
2160        env->vxsat = 0x1;
2161    }
2162    return res;
2163}
2164
2165static inline uint32_t saddu32(CPURISCVState *env, int vxrm, uint32_t a,
2166                               uint32_t b)
2167{
2168    uint32_t res = a + b;
2169    if (res < a) {
2170        res = UINT32_MAX;
2171        env->vxsat = 0x1;
2172    }
2173    return res;
2174}
2175
2176static inline uint64_t saddu64(CPURISCVState *env, int vxrm, uint64_t a,
2177                               uint64_t b)
2178{
2179    uint64_t res = a + b;
2180    if (res < a) {
2181        res = UINT64_MAX;
2182        env->vxsat = 0x1;
2183    }
2184    return res;
2185}
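
/*
 * Example of the unsigned saturation test above (8-bit case): 0xf0 + 0x20
 * wraps to 0x10, and 0x10 < 0xf0 detects the lost carry, so the result is
 * clamped to UINT8_MAX and vxsat is set.  A standalone sketch of the same
 * check, with a flag instead of CPU state (hypothetical name):
 */
static inline uint8_t example_saddu8(uint8_t a, uint8_t b, bool *sat)
{
    uint8_t res = a + b;

    if (res < a) {                   /* sum wrapped around: saturate */
        *sat = true;
        return UINT8_MAX;
    }
    return res;
}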
2186
2187RVVCALL(OPIVV2_RM, vsaddu_vv_b, OP_UUU_B, H1, H1, H1, saddu8)
2188RVVCALL(OPIVV2_RM, vsaddu_vv_h, OP_UUU_H, H2, H2, H2, saddu16)
2189RVVCALL(OPIVV2_RM, vsaddu_vv_w, OP_UUU_W, H4, H4, H4, saddu32)
2190RVVCALL(OPIVV2_RM, vsaddu_vv_d, OP_UUU_D, H8, H8, H8, saddu64)
2191GEN_VEXT_VV_RM(vsaddu_vv_b, 1)
2192GEN_VEXT_VV_RM(vsaddu_vv_h, 2)
2193GEN_VEXT_VV_RM(vsaddu_vv_w, 4)
2194GEN_VEXT_VV_RM(vsaddu_vv_d, 8)
2195
2196typedef void opivx2_rm_fn(void *vd, target_long s1, void *vs2, int i,
2197                          CPURISCVState *env, int vxrm);
2198
2199#define OPIVX2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)          \
2200static inline void                                                  \
2201do_##NAME(void *vd, target_long s1, void *vs2, int i,               \
2202          CPURISCVState *env, int vxrm)                             \
2203{                                                                   \
2204    TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2205    *((TD *)vd + HD(i)) = OP(env, vxrm, s2, (TX1)(T1)s1);           \
2206}
2207
2208static inline void
2209vext_vx_rm_1(void *vd, void *v0, target_long s1, void *vs2,
2210             CPURISCVState *env,
2211             uint32_t vl, uint32_t vm, int vxrm,
2212             opivx2_rm_fn *fn)
2213{
2214    for (uint32_t i = env->vstart; i < vl; i++) {
2215        if (!vm && !vext_elem_mask(v0, i)) {
2216            continue;
2217        }
2218        fn(vd, s1, vs2, i, env, vxrm);
2219    }
2220    env->vstart = 0;
2221}
2222
2223static inline void
2224vext_vx_rm_2(void *vd, void *v0, target_long s1, void *vs2,
2225             CPURISCVState *env,
2226             uint32_t desc,
2227             opivx2_rm_fn *fn, uint32_t esz)
2228{
2229    uint32_t vm = vext_vm(desc);
2230    uint32_t vl = env->vl;
2231    uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2232    uint32_t vta = vext_vta(desc);
2233
2234    switch (env->vxrm) {
2235    case 0: /* rnu */
2236        vext_vx_rm_1(vd, v0, s1, vs2,
2237                     env, vl, vm, 0, fn);
2238        break;
2239    case 1: /* rne */
2240        vext_vx_rm_1(vd, v0, s1, vs2,
2241                     env, vl, vm, 1, fn);
2242        break;
2243    case 2: /* rdn */
2244        vext_vx_rm_1(vd, v0, s1, vs2,
2245                     env, vl, vm, 2, fn);
2246        break;
2247    default: /* rod */
2248        vext_vx_rm_1(vd, v0, s1, vs2,
2249                     env, vl, vm, 3, fn);
2250        break;
2251    }
2252    /* set tail elements to 1s */
2253    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2254}
2255
2256/* generate helpers for fixed point instructions with OPIVX format */
2257#define GEN_VEXT_VX_RM(NAME, ESZ)                         \
2258void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
2259        void *vs2, CPURISCVState *env, uint32_t desc)     \
2260{                                                         \
2261    vext_vx_rm_2(vd, v0, s1, vs2, env, desc,              \
2262                 do_##NAME, ESZ);                         \
2263}
2264
2265RVVCALL(OPIVX2_RM, vsaddu_vx_b, OP_UUU_B, H1, H1, saddu8)
2266RVVCALL(OPIVX2_RM, vsaddu_vx_h, OP_UUU_H, H2, H2, saddu16)
2267RVVCALL(OPIVX2_RM, vsaddu_vx_w, OP_UUU_W, H4, H4, saddu32)
2268RVVCALL(OPIVX2_RM, vsaddu_vx_d, OP_UUU_D, H8, H8, saddu64)
2269GEN_VEXT_VX_RM(vsaddu_vx_b, 1)
2270GEN_VEXT_VX_RM(vsaddu_vx_h, 2)
2271GEN_VEXT_VX_RM(vsaddu_vx_w, 4)
2272GEN_VEXT_VX_RM(vsaddu_vx_d, 8)
2273
2274static inline int8_t sadd8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2275{
2276    int8_t res = a + b;
2277    if ((res ^ a) & (res ^ b) & INT8_MIN) {
2278        res = a > 0 ? INT8_MAX : INT8_MIN;
2279        env->vxsat = 0x1;
2280    }
2281    return res;
2282}
2283
2284static inline int16_t sadd16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2285{
2286    int16_t res = a + b;
2287    if ((res ^ a) & (res ^ b) & INT16_MIN) {
2288        res = a > 0 ? INT16_MAX : INT16_MIN;
2289        env->vxsat = 0x1;
2290    }
2291    return res;
2292}
2293
2294static inline int32_t sadd32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2295{
2296    int32_t res = a + b;
2297    if ((res ^ a) & (res ^ b) & INT32_MIN) {
2298        res = a > 0 ? INT32_MAX : INT32_MIN;
2299        env->vxsat = 0x1;
2300    }
2301    return res;
2302}
2303
2304static inline int64_t sadd64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2305{
2306    int64_t res = a + b;
2307    if ((res ^ a) & (res ^ b) & INT64_MIN) {
2308        res = a > 0 ? INT64_MAX : INT64_MIN;
2309        env->vxsat = 0x1;
2310    }
2311    return res;
2312}
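
/*
 * The signed-overflow test above works because overflow happens exactly
 * when a and b share a sign and the truncated sum has the opposite sign,
 * i.e. when the sign bit of (res ^ a) & (res ^ b) is set.  Worked 8-bit
 * example: 100 + 100 truncates to -56; both XOR terms have the sign bit
 * set, so the result saturates to INT8_MAX.  Standalone predicate sketch
 * (hypothetical name):
 */
static inline bool example_sadd_overflows8(int8_t a, int8_t b)
{
    int8_t res = (int8_t)(a + b);        /* truncated sum */

    return ((res ^ a) & (res ^ b)) < 0;  /* sign bit set => overflow */
}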
2313
2314RVVCALL(OPIVV2_RM, vsadd_vv_b, OP_SSS_B, H1, H1, H1, sadd8)
2315RVVCALL(OPIVV2_RM, vsadd_vv_h, OP_SSS_H, H2, H2, H2, sadd16)
2316RVVCALL(OPIVV2_RM, vsadd_vv_w, OP_SSS_W, H4, H4, H4, sadd32)
2317RVVCALL(OPIVV2_RM, vsadd_vv_d, OP_SSS_D, H8, H8, H8, sadd64)
2318GEN_VEXT_VV_RM(vsadd_vv_b, 1)
2319GEN_VEXT_VV_RM(vsadd_vv_h, 2)
2320GEN_VEXT_VV_RM(vsadd_vv_w, 4)
2321GEN_VEXT_VV_RM(vsadd_vv_d, 8)
2322
2323RVVCALL(OPIVX2_RM, vsadd_vx_b, OP_SSS_B, H1, H1, sadd8)
2324RVVCALL(OPIVX2_RM, vsadd_vx_h, OP_SSS_H, H2, H2, sadd16)
2325RVVCALL(OPIVX2_RM, vsadd_vx_w, OP_SSS_W, H4, H4, sadd32)
2326RVVCALL(OPIVX2_RM, vsadd_vx_d, OP_SSS_D, H8, H8, sadd64)
2327GEN_VEXT_VX_RM(vsadd_vx_b, 1)
2328GEN_VEXT_VX_RM(vsadd_vx_h, 2)
2329GEN_VEXT_VX_RM(vsadd_vx_w, 4)
2330GEN_VEXT_VX_RM(vsadd_vx_d, 8)
2331
2332static inline uint8_t ssubu8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2333{
2334    uint8_t res = a - b;
2335    if (res > a) {
2336        res = 0;
2337        env->vxsat = 0x1;
2338    }
2339    return res;
2340}
2341
2342static inline uint16_t ssubu16(CPURISCVState *env, int vxrm, uint16_t a,
2343                               uint16_t b)
2344{
2345    uint16_t res = a - b;
2346    if (res > a) {
2347        res = 0;
2348        env->vxsat = 0x1;
2349    }
2350    return res;
2351}
2352
2353static inline uint32_t ssubu32(CPURISCVState *env, int vxrm, uint32_t a,
2354                               uint32_t b)
2355{
2356    uint32_t res = a - b;
2357    if (res > a) {
2358        res = 0;
2359        env->vxsat = 0x1;
2360    }
2361    return res;
2362}
2363
2364static inline uint64_t ssubu64(CPURISCVState *env, int vxrm, uint64_t a,
2365                               uint64_t b)
2366{
2367    uint64_t res = a - b;
2368    if (res > a) {
2369        res = 0;
2370        env->vxsat = 0x1;
2371    }
2372    return res;
2373}
2374
2375RVVCALL(OPIVV2_RM, vssubu_vv_b, OP_UUU_B, H1, H1, H1, ssubu8)
2376RVVCALL(OPIVV2_RM, vssubu_vv_h, OP_UUU_H, H2, H2, H2, ssubu16)
2377RVVCALL(OPIVV2_RM, vssubu_vv_w, OP_UUU_W, H4, H4, H4, ssubu32)
2378RVVCALL(OPIVV2_RM, vssubu_vv_d, OP_UUU_D, H8, H8, H8, ssubu64)
2379GEN_VEXT_VV_RM(vssubu_vv_b, 1)
2380GEN_VEXT_VV_RM(vssubu_vv_h, 2)
2381GEN_VEXT_VV_RM(vssubu_vv_w, 4)
2382GEN_VEXT_VV_RM(vssubu_vv_d, 8)
2383
2384RVVCALL(OPIVX2_RM, vssubu_vx_b, OP_UUU_B, H1, H1, ssubu8)
2385RVVCALL(OPIVX2_RM, vssubu_vx_h, OP_UUU_H, H2, H2, ssubu16)
2386RVVCALL(OPIVX2_RM, vssubu_vx_w, OP_UUU_W, H4, H4, ssubu32)
2387RVVCALL(OPIVX2_RM, vssubu_vx_d, OP_UUU_D, H8, H8, ssubu64)
2388GEN_VEXT_VX_RM(vssubu_vx_b, 1)
2389GEN_VEXT_VX_RM(vssubu_vx_h, 2)
2390GEN_VEXT_VX_RM(vssubu_vx_w, 4)
2391GEN_VEXT_VX_RM(vssubu_vx_d, 8)
2392
2393static inline int8_t ssub8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2394{
2395    int8_t res = a - b;
2396    if ((res ^ a) & (a ^ b) & INT8_MIN) {
2397        res = a >= 0 ? INT8_MAX : INT8_MIN;
2398        env->vxsat = 0x1;
2399    }
2400    return res;
2401}
2402
2403static inline int16_t ssub16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2404{
2405    int16_t res = a - b;
2406    if ((res ^ a) & (a ^ b) & INT16_MIN) {
2407        res = a >= 0 ? INT16_MAX : INT16_MIN;
2408        env->vxsat = 0x1;
2409    }
2410    return res;
2411}
2412
2413static inline int32_t ssub32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2414{
2415    int32_t res = a - b;
2416    if ((res ^ a) & (a ^ b) & INT32_MIN) {
2417        res = a >= 0 ? INT32_MAX : INT32_MIN;
2418        env->vxsat = 0x1;
2419    }
2420    return res;
2421}
2422
2423static inline int64_t ssub64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2424{
2425    int64_t res = a - b;
2426    if ((res ^ a) & (a ^ b) & INT64_MIN) {
2427        res = a >= 0 ? INT64_MAX : INT64_MIN;
2428        env->vxsat = 0x1;
2429    }
2430    return res;
2431}
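
/*
 * For subtraction the test changes shape: a - b can only overflow when a
 * and b have different signs, and it did overflow when the result's sign
 * differs from a's, hence (res ^ a) & (a ^ b).  Worked 8-bit example:
 * -100 - 100 truncates to +56; both terms have the sign bit set, so the
 * result saturates to INT8_MIN.  Standalone predicate sketch (hypothetical
 * name):
 */
static inline bool example_ssub_overflows8(int8_t a, int8_t b)
{
    int8_t res = (int8_t)(a - b);        /* truncated difference */

    return ((res ^ a) & (a ^ b)) < 0;    /* sign bit set => overflow */
}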
2432
2433RVVCALL(OPIVV2_RM, vssub_vv_b, OP_SSS_B, H1, H1, H1, ssub8)
2434RVVCALL(OPIVV2_RM, vssub_vv_h, OP_SSS_H, H2, H2, H2, ssub16)
2435RVVCALL(OPIVV2_RM, vssub_vv_w, OP_SSS_W, H4, H4, H4, ssub32)
2436RVVCALL(OPIVV2_RM, vssub_vv_d, OP_SSS_D, H8, H8, H8, ssub64)
2437GEN_VEXT_VV_RM(vssub_vv_b, 1)
2438GEN_VEXT_VV_RM(vssub_vv_h, 2)
2439GEN_VEXT_VV_RM(vssub_vv_w, 4)
2440GEN_VEXT_VV_RM(vssub_vv_d, 8)
2441
2442RVVCALL(OPIVX2_RM, vssub_vx_b, OP_SSS_B, H1, H1, ssub8)
2443RVVCALL(OPIVX2_RM, vssub_vx_h, OP_SSS_H, H2, H2, ssub16)
2444RVVCALL(OPIVX2_RM, vssub_vx_w, OP_SSS_W, H4, H4, ssub32)
2445RVVCALL(OPIVX2_RM, vssub_vx_d, OP_SSS_D, H8, H8, ssub64)
2446GEN_VEXT_VX_RM(vssub_vx_b, 1)
2447GEN_VEXT_VX_RM(vssub_vx_h, 2)
2448GEN_VEXT_VX_RM(vssub_vx_w, 4)
2449GEN_VEXT_VX_RM(vssub_vx_d, 8)
2450
2451/* Vector Single-Width Averaging Add and Subtract */
2452static inline uint8_t get_round(int vxrm, uint64_t v, uint8_t shift)
2453{
2454    uint8_t d = extract64(v, shift, 1);
2455    uint8_t d1;
2456    uint64_t D1, D2;
2457
2458    if (shift == 0 || shift > 64) {
2459        return 0;
2460    }
2461
2462    d1 = extract64(v, shift - 1, 1);
2463    D1 = extract64(v, 0, shift);
2464    if (vxrm == 0) { /* round-to-nearest-up (add +0.5 LSB) */
2465        return d1;
2466    } else if (vxrm == 1) { /* round-to-nearest-even */
2467        if (shift > 1) {
2468            D2 = extract64(v, 0, shift - 1);
2469            return d1 & ((D2 != 0) | d);
2470        } else {
2471            return d1 & d;
2472        }
2473    } else if (vxrm == 3) { /* round-to-odd (OR bits into LSB, aka "jam") */
2474        return !d & (D1 != 0);
2475    }
2476    return 0; /* round-down (truncate) */
2477}
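
/*
 * Worked example for get_round(): shifting v right by 2 discards the two
 * low bits; d1 is the most significant discarded bit, D2 the bits below it,
 * and d the bit that becomes the new LSB.  For v = 0b1011 (kept 0b10,
 * dropped 0b11): rnu and rne both return 1 (result 0b11), rdn returns 0
 * (0b10), and rod returns 1 because the new LSB is 0 and the dropped bits
 * are non-zero.  For the exact tie v = 0b1110 (kept 0b11, dropped 0b10):
 * rne rounds up to the even value 0b100, while rod keeps 0b11 since its
 * LSB is already 1.  Usage sketch (hypothetical name):
 */
static inline uint64_t example_round_shift(int vxrm, uint64_t v, uint8_t shift)
{
    /* How the callers below use it: shift, then add the rounding bit. */
    return (v >> shift) + get_round(vxrm, v, shift);
}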
2478
2479static inline int32_t aadd32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2480{
2481    int64_t res = (int64_t)a + b;
2482    uint8_t round = get_round(vxrm, res, 1);
2483
2484    return (res >> 1) + round;
2485}
2486
2487static inline int64_t aadd64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2488{
2489    int64_t res = a + b;
2490    uint8_t round = get_round(vxrm, res, 1);
2491    int64_t over = (res ^ a) & (res ^ b) & INT64_MIN;
2492
2493    /* With signed overflow, bit 64 is inverse of bit 63. */
2494    return ((res >> 1) ^ over) + round;
2495}
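
/*
 * Why the XOR in aadd64() is enough (8-bit analogue, hypothetical name
 * below): 100 + 100 truncates to 0xc8 (-56) and sets `over`.  The exact
 * 9-bit sum is 0x0c8, whose top bit is the inverse of the truncated sign
 * bit, so arithmetically shifting the truncated sum (giving 0xe4) and
 * flipping its sign bit with the XOR recovers 0x64 = 100, the correct
 * average.
 */
static inline int8_t example_aadd8(int vxrm, int8_t a, int8_t b)
{
    int8_t res = (int8_t)(a + b);
    uint8_t round = get_round(vxrm, (uint8_t)res, 1);
    int over = (res ^ a) & (res ^ b) & INT8_MIN;

    /* With signed overflow, the true bit 8 is the inverse of bit 7. */
    return (int8_t)(((res >> 1) ^ over) + round);
}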
2496
2497RVVCALL(OPIVV2_RM, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd32)
2498RVVCALL(OPIVV2_RM, vaadd_vv_h, OP_SSS_H, H2, H2, H2, aadd32)
2499RVVCALL(OPIVV2_RM, vaadd_vv_w, OP_SSS_W, H4, H4, H4, aadd32)
2500RVVCALL(OPIVV2_RM, vaadd_vv_d, OP_SSS_D, H8, H8, H8, aadd64)
2501GEN_VEXT_VV_RM(vaadd_vv_b, 1)
2502GEN_VEXT_VV_RM(vaadd_vv_h, 2)
2503GEN_VEXT_VV_RM(vaadd_vv_w, 4)
2504GEN_VEXT_VV_RM(vaadd_vv_d, 8)
2505
2506RVVCALL(OPIVX2_RM, vaadd_vx_b, OP_SSS_B, H1, H1, aadd32)
2507RVVCALL(OPIVX2_RM, vaadd_vx_h, OP_SSS_H, H2, H2, aadd32)
2508RVVCALL(OPIVX2_RM, vaadd_vx_w, OP_SSS_W, H4, H4, aadd32)
2509RVVCALL(OPIVX2_RM, vaadd_vx_d, OP_SSS_D, H8, H8, aadd64)
2510GEN_VEXT_VX_RM(vaadd_vx_b, 1)
2511GEN_VEXT_VX_RM(vaadd_vx_h, 2)
2512GEN_VEXT_VX_RM(vaadd_vx_w, 4)
2513GEN_VEXT_VX_RM(vaadd_vx_d, 8)
2514
2515static inline uint32_t aaddu32(CPURISCVState *env, int vxrm,
2516                               uint32_t a, uint32_t b)
2517{
2518    uint64_t res = (uint64_t)a + b;
2519    uint8_t round = get_round(vxrm, res, 1);
2520
2521    return (res >> 1) + round;
2522}
2523
2524static inline uint64_t aaddu64(CPURISCVState *env, int vxrm,
2525                               uint64_t a, uint64_t b)
2526{
2527    uint64_t res = a + b;
2528    uint8_t round = get_round(vxrm, res, 1);
2529    uint64_t over = (uint64_t)(res < a) << 63;
2530
2531    return ((res >> 1) | over) + round;
2532}
2533
2534RVVCALL(OPIVV2_RM, vaaddu_vv_b, OP_UUU_B, H1, H1, H1, aaddu32)
2535RVVCALL(OPIVV2_RM, vaaddu_vv_h, OP_UUU_H, H2, H2, H2, aaddu32)
2536RVVCALL(OPIVV2_RM, vaaddu_vv_w, OP_UUU_W, H4, H4, H4, aaddu32)
2537RVVCALL(OPIVV2_RM, vaaddu_vv_d, OP_UUU_D, H8, H8, H8, aaddu64)
2538GEN_VEXT_VV_RM(vaaddu_vv_b, 1)
2539GEN_VEXT_VV_RM(vaaddu_vv_h, 2)
2540GEN_VEXT_VV_RM(vaaddu_vv_w, 4)
2541GEN_VEXT_VV_RM(vaaddu_vv_d, 8)
2542
2543RVVCALL(OPIVX2_RM, vaaddu_vx_b, OP_UUU_B, H1, H1, aaddu32)
2544RVVCALL(OPIVX2_RM, vaaddu_vx_h, OP_UUU_H, H2, H2, aaddu32)
2545RVVCALL(OPIVX2_RM, vaaddu_vx_w, OP_UUU_W, H4, H4, aaddu32)
2546RVVCALL(OPIVX2_RM, vaaddu_vx_d, OP_UUU_D, H8, H8, aaddu64)
2547GEN_VEXT_VX_RM(vaaddu_vx_b, 1)
2548GEN_VEXT_VX_RM(vaaddu_vx_h, 2)
2549GEN_VEXT_VX_RM(vaaddu_vx_w, 4)
2550GEN_VEXT_VX_RM(vaaddu_vx_d, 8)
2551
2552static inline int32_t asub32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2553{
2554    int64_t res = (int64_t)a - b;
2555    uint8_t round = get_round(vxrm, res, 1);
2556
2557    return (res >> 1) + round;
2558}
2559
2560static inline int64_t asub64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2561{
2562    int64_t res = (int64_t)a - b;
2563    uint8_t round = get_round(vxrm, res, 1);
2564    int64_t over = (res ^ a) & (a ^ b) & INT64_MIN;
2565
2566    /* With signed overflow, bit 64 is inverse of bit 63. */
2567    return ((res >> 1) ^ over) + round;
2568}
2569
2570RVVCALL(OPIVV2_RM, vasub_vv_b, OP_SSS_B, H1, H1, H1, asub32)
2571RVVCALL(OPIVV2_RM, vasub_vv_h, OP_SSS_H, H2, H2, H2, asub32)
2572RVVCALL(OPIVV2_RM, vasub_vv_w, OP_SSS_W, H4, H4, H4, asub32)
2573RVVCALL(OPIVV2_RM, vasub_vv_d, OP_SSS_D, H8, H8, H8, asub64)
2574GEN_VEXT_VV_RM(vasub_vv_b, 1)
2575GEN_VEXT_VV_RM(vasub_vv_h, 2)
2576GEN_VEXT_VV_RM(vasub_vv_w, 4)
2577GEN_VEXT_VV_RM(vasub_vv_d, 8)
2578
2579RVVCALL(OPIVX2_RM, vasub_vx_b, OP_SSS_B, H1, H1, asub32)
2580RVVCALL(OPIVX2_RM, vasub_vx_h, OP_SSS_H, H2, H2, asub32)
2581RVVCALL(OPIVX2_RM, vasub_vx_w, OP_SSS_W, H4, H4, asub32)
2582RVVCALL(OPIVX2_RM, vasub_vx_d, OP_SSS_D, H8, H8, asub64)
2583GEN_VEXT_VX_RM(vasub_vx_b, 1)
2584GEN_VEXT_VX_RM(vasub_vx_h, 2)
2585GEN_VEXT_VX_RM(vasub_vx_w, 4)
2586GEN_VEXT_VX_RM(vasub_vx_d, 8)
2587
2588static inline uint32_t asubu32(CPURISCVState *env, int vxrm,
2589                               uint32_t a, uint32_t b)
2590{
2591    int64_t res = (int64_t)a - b;
2592    uint8_t round = get_round(vxrm, res, 1);
2593
2594    return (res >> 1) + round;
2595}
2596
2597static inline uint64_t asubu64(CPURISCVState *env, int vxrm,
2598                               uint64_t a, uint64_t b)
2599{
2600    uint64_t res = (uint64_t)a - b;
2601    uint8_t round = get_round(vxrm, res, 1);
2602    uint64_t over = (uint64_t)(res > a) << 63;
2603
2604    return ((res >> 1) | over) + round;
2605}
2606
2607RVVCALL(OPIVV2_RM, vasubu_vv_b, OP_UUU_B, H1, H1, H1, asubu32)
2608RVVCALL(OPIVV2_RM, vasubu_vv_h, OP_UUU_H, H2, H2, H2, asubu32)
2609RVVCALL(OPIVV2_RM, vasubu_vv_w, OP_UUU_W, H4, H4, H4, asubu32)
2610RVVCALL(OPIVV2_RM, vasubu_vv_d, OP_UUU_D, H8, H8, H8, asubu64)
2611GEN_VEXT_VV_RM(vasubu_vv_b, 1)
2612GEN_VEXT_VV_RM(vasubu_vv_h, 2)
2613GEN_VEXT_VV_RM(vasubu_vv_w, 4)
2614GEN_VEXT_VV_RM(vasubu_vv_d, 8)
2615
2616RVVCALL(OPIVX2_RM, vasubu_vx_b, OP_UUU_B, H1, H1, asubu32)
2617RVVCALL(OPIVX2_RM, vasubu_vx_h, OP_UUU_H, H2, H2, asubu32)
2618RVVCALL(OPIVX2_RM, vasubu_vx_w, OP_UUU_W, H4, H4, asubu32)
2619RVVCALL(OPIVX2_RM, vasubu_vx_d, OP_UUU_D, H8, H8, asubu64)
2620GEN_VEXT_VX_RM(vasubu_vx_b, 1)
2621GEN_VEXT_VX_RM(vasubu_vx_h, 2)
2622GEN_VEXT_VX_RM(vasubu_vx_w, 4)
2623GEN_VEXT_VX_RM(vasubu_vx_d, 8)
2624
2625/* Vector Single-Width Fractional Multiply with Rounding and Saturation */
2626static inline int8_t vsmul8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2627{
2628    uint8_t round;
2629    int16_t res;
2630
2631    res = (int16_t)a * (int16_t)b;
2632    round = get_round(vxrm, res, 7);
2633    res   = (res >> 7) + round;
2634
2635    if (res > INT8_MAX) {
2636        env->vxsat = 0x1;
2637        return INT8_MAX;
2638    } else if (res < INT8_MIN) {
2639        env->vxsat = 0x1;
2640        return INT8_MIN;
2641    } else {
2642        return res;
2643    }
2644}
2645
2646static int16_t vsmul16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2647{
2648    uint8_t round;
2649    int32_t res;
2650
2651    res = (int32_t)a * (int32_t)b;
2652    round = get_round(vxrm, res, 15);
2653    res   = (res >> 15) + round;
2654
2655    if (res > INT16_MAX) {
2656        env->vxsat = 0x1;
2657        return INT16_MAX;
2658    } else if (res < INT16_MIN) {
2659        env->vxsat = 0x1;
2660        return INT16_MIN;
2661    } else {
2662        return res;
2663    }
2664}
2665
2666static int32_t vsmul32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2667{
2668    uint8_t round;
2669    int64_t res;
2670
2671    res = (int64_t)a * (int64_t)b;
2672    round = get_round(vxrm, res, 31);
2673    res   = (res >> 31) + round;
2674
2675    if (res > INT32_MAX) {
2676        env->vxsat = 0x1;
2677        return INT32_MAX;
2678    } else if (res < INT32_MIN) {
2679        env->vxsat = 0x1;
2680        return INT32_MIN;
2681    } else {
2682        return res;
2683    }
2684}
2685
2686static int64_t vsmul64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2687{
2688    uint8_t round;
2689    uint64_t hi_64, lo_64;
2690    int64_t res;
2691
2692    if (a == INT64_MIN && b == INT64_MIN) {
2693        env->vxsat = 1;
2694        return INT64_MAX;
2695    }
2696
2697    muls64(&lo_64, &hi_64, a, b);
2698    round = get_round(vxrm, lo_64, 63);
2699    /*
2700     * Cannot overflow, as there are always
2701     * 2 sign bits after multiply.
2702     */
2703    res = (hi_64 << 1) | (lo_64 >> 63);
2704    if (round) {
2705        if (res == INT64_MAX) {
2706            env->vxsat = 1;
2707        } else {
2708            res += 1;
2709        }
2710    }
2711    return res;
2712}
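
/*
 * There is no 128-bit C type to widen into here, so muls64() produces
 * the full product and bits [126:63] of the Q63 result are reassembled
 * as (hi_64 << 1) | (lo_64 >> 63).  The only product that cannot be
 * represented this way is INT64_MIN * INT64_MIN, i.e. (-1.0) * (-1.0)
 * in fixed point, which is saturated up front; the rounding increment
 * may also saturate at INT64_MAX, which likewise sets vxsat.
 */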
2713
2714RVVCALL(OPIVV2_RM, vsmul_vv_b, OP_SSS_B, H1, H1, H1, vsmul8)
2715RVVCALL(OPIVV2_RM, vsmul_vv_h, OP_SSS_H, H2, H2, H2, vsmul16)
2716RVVCALL(OPIVV2_RM, vsmul_vv_w, OP_SSS_W, H4, H4, H4, vsmul32)
2717RVVCALL(OPIVV2_RM, vsmul_vv_d, OP_SSS_D, H8, H8, H8, vsmul64)
2718GEN_VEXT_VV_RM(vsmul_vv_b, 1)
2719GEN_VEXT_VV_RM(vsmul_vv_h, 2)
2720GEN_VEXT_VV_RM(vsmul_vv_w, 4)
2721GEN_VEXT_VV_RM(vsmul_vv_d, 8)
2722
2723RVVCALL(OPIVX2_RM, vsmul_vx_b, OP_SSS_B, H1, H1, vsmul8)
2724RVVCALL(OPIVX2_RM, vsmul_vx_h, OP_SSS_H, H2, H2, vsmul16)
2725RVVCALL(OPIVX2_RM, vsmul_vx_w, OP_SSS_W, H4, H4, vsmul32)
2726RVVCALL(OPIVX2_RM, vsmul_vx_d, OP_SSS_D, H8, H8, vsmul64)
2727GEN_VEXT_VX_RM(vsmul_vx_b, 1)
2728GEN_VEXT_VX_RM(vsmul_vx_h, 2)
2729GEN_VEXT_VX_RM(vsmul_vx_w, 4)
2730GEN_VEXT_VX_RM(vsmul_vx_d, 8)
2731
2732/* Vector Single-Width Scaling Shift Instructions */
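/*
 * Each helper shifts right by the low log2(SEW) bits of the second
 * operand and adds the increment returned by get_round(), which applies
 * the vxrm rounding mode to the bits shifted out: 0 round-to-nearest-up,
 * 1 round-to-nearest-even, 2 round-down, 3 round-to-odd.  E.g. with
 * vxrm = 0, a = 5 and shift = 1 the shifted-out bit is 1, so the result
 * is (5 >> 1) + 1 = 3.
 */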
2733static inline uint8_t
2734vssrl8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2735{
2736    uint8_t round, shift = b & 0x7;
2737    uint8_t res;
2738
2739    round = get_round(vxrm, a, shift);
2740    res   = (a >> shift)  + round;
2741    return res;
2742}
2743static inline uint16_t
2744vssrl16(CPURISCVState *env, int vxrm, uint16_t a, uint16_t b)
2745{
2746    uint8_t round, shift = b & 0xf;
2747    uint16_t res;
2748
2749    round = get_round(vxrm, a, shift);
2750    res   = (a >> shift)  + round;
2751    return res;
2752}
2753static inline uint32_t
2754vssrl32(CPURISCVState *env, int vxrm, uint32_t a, uint32_t b)
2755{
2756    uint8_t round, shift = b & 0x1f;
2757    uint32_t res;
2758
2759    round = get_round(vxrm, a, shift);
2760    res   = (a >> shift)  + round;
2761    return res;
2762}
2763static inline uint64_t
2764vssrl64(CPURISCVState *env, int vxrm, uint64_t a, uint64_t b)
2765{
2766    uint8_t round, shift = b & 0x3f;
2767    uint64_t res;
2768
2769    round = get_round(vxrm, a, shift);
2770    res   = (a >> shift)  + round;
2771    return res;
2772}
2773RVVCALL(OPIVV2_RM, vssrl_vv_b, OP_UUU_B, H1, H1, H1, vssrl8)
2774RVVCALL(OPIVV2_RM, vssrl_vv_h, OP_UUU_H, H2, H2, H2, vssrl16)
2775RVVCALL(OPIVV2_RM, vssrl_vv_w, OP_UUU_W, H4, H4, H4, vssrl32)
2776RVVCALL(OPIVV2_RM, vssrl_vv_d, OP_UUU_D, H8, H8, H8, vssrl64)
2777GEN_VEXT_VV_RM(vssrl_vv_b, 1)
2778GEN_VEXT_VV_RM(vssrl_vv_h, 2)
2779GEN_VEXT_VV_RM(vssrl_vv_w, 4)
2780GEN_VEXT_VV_RM(vssrl_vv_d, 8)
2781
2782RVVCALL(OPIVX2_RM, vssrl_vx_b, OP_UUU_B, H1, H1, vssrl8)
2783RVVCALL(OPIVX2_RM, vssrl_vx_h, OP_UUU_H, H2, H2, vssrl16)
2784RVVCALL(OPIVX2_RM, vssrl_vx_w, OP_UUU_W, H4, H4, vssrl32)
2785RVVCALL(OPIVX2_RM, vssrl_vx_d, OP_UUU_D, H8, H8, vssrl64)
2786GEN_VEXT_VX_RM(vssrl_vx_b, 1)
2787GEN_VEXT_VX_RM(vssrl_vx_h, 2)
2788GEN_VEXT_VX_RM(vssrl_vx_w, 4)
2789GEN_VEXT_VX_RM(vssrl_vx_d, 8)
2790
2791static inline int8_t
2792vssra8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2793{
2794    uint8_t round, shift = b & 0x7;
2795    int8_t res;
2796
2797    round = get_round(vxrm, a, shift);
2798    res   = (a >> shift)  + round;
2799    return res;
2800}
2801static inline int16_t
2802vssra16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2803{
2804    uint8_t round, shift = b & 0xf;
2805    int16_t res;
2806
2807    round = get_round(vxrm, a, shift);
2808    res   = (a >> shift)  + round;
2809    return res;
2810}
2811static inline int32_t
2812vssra32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2813{
2814    uint8_t round, shift = b & 0x1f;
2815    int32_t res;
2816
2817    round = get_round(vxrm, a, shift);
2818    res   = (a >> shift)  + round;
2819    return res;
2820}
2821static inline int64_t
2822vssra64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2823{
2824    uint8_t round, shift = b & 0x3f;
2825    int64_t res;
2826
2827    round = get_round(vxrm, a, shift);
2828    res   = (a >> shift)  + round;
2829    return res;
2830}
2831
2832RVVCALL(OPIVV2_RM, vssra_vv_b, OP_SSS_B, H1, H1, H1, vssra8)
2833RVVCALL(OPIVV2_RM, vssra_vv_h, OP_SSS_H, H2, H2, H2, vssra16)
2834RVVCALL(OPIVV2_RM, vssra_vv_w, OP_SSS_W, H4, H4, H4, vssra32)
2835RVVCALL(OPIVV2_RM, vssra_vv_d, OP_SSS_D, H8, H8, H8, vssra64)
2836GEN_VEXT_VV_RM(vssra_vv_b, 1)
2837GEN_VEXT_VV_RM(vssra_vv_h, 2)
2838GEN_VEXT_VV_RM(vssra_vv_w, 4)
2839GEN_VEXT_VV_RM(vssra_vv_d, 8)
2840
2841RVVCALL(OPIVX2_RM, vssra_vx_b, OP_SSS_B, H1, H1, vssra8)
2842RVVCALL(OPIVX2_RM, vssra_vx_h, OP_SSS_H, H2, H2, vssra16)
2843RVVCALL(OPIVX2_RM, vssra_vx_w, OP_SSS_W, H4, H4, vssra32)
2844RVVCALL(OPIVX2_RM, vssra_vx_d, OP_SSS_D, H8, H8, vssra64)
2845GEN_VEXT_VX_RM(vssra_vx_b, 1)
2846GEN_VEXT_VX_RM(vssra_vx_h, 2)
2847GEN_VEXT_VX_RM(vssra_vx_w, 4)
2848GEN_VEXT_VX_RM(vssra_vx_d, 8)
2849
2850/* Vector Narrowing Fixed-Point Clip Instructions */
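/*
 * The narrowing clips take a 2*SEW-wide vs2 element, shift it right by
 * the low log2(2*SEW) bits of the SEW-wide shift operand, round the
 * result according to vxrm, and saturate it to the destination's SEW
 * range, setting vxsat whenever the value had to be clipped.
 */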
2851static inline int8_t
2852vnclip8(CPURISCVState *env, int vxrm, int16_t a, int8_t b)
2853{
2854    uint8_t round, shift = b & 0xf;
2855    int16_t res;
2856
2857    round = get_round(vxrm, a, shift);
2858    res   = (a >> shift)  + round;
2859    if (res > INT8_MAX) {
2860        env->vxsat = 0x1;
2861        return INT8_MAX;
2862    } else if (res < INT8_MIN) {
2863        env->vxsat = 0x1;
2864        return INT8_MIN;
2865    } else {
2866        return res;
2867    }
2868}
2869
2870static inline int16_t
2871vnclip16(CPURISCVState *env, int vxrm, int32_t a, int16_t b)
2872{
2873    uint8_t round, shift = b & 0x1f;
2874    int32_t res;
2875
2876    round = get_round(vxrm, a, shift);
2877    res   = (a >> shift)  + round;
2878    if (res > INT16_MAX) {
2879        env->vxsat = 0x1;
2880        return INT16_MAX;
2881    } else if (res < INT16_MIN) {
2882        env->vxsat = 0x1;
2883        return INT16_MIN;
2884    } else {
2885        return res;
2886    }
2887}
2888
2889static inline int32_t
2890vnclip32(CPURISCVState *env, int vxrm, int64_t a, int32_t b)
2891{
2892    uint8_t round, shift = b & 0x3f;
2893    int64_t res;
2894
2895    round = get_round(vxrm, a, shift);
2896    res   = (a >> shift)  + round;
2897    if (res > INT32_MAX) {
2898        env->vxsat = 0x1;
2899        return INT32_MAX;
2900    } else if (res < INT32_MIN) {
2901        env->vxsat = 0x1;
2902        return INT32_MIN;
2903    } else {
2904        return res;
2905    }
2906}
2907
2908RVVCALL(OPIVV2_RM, vnclip_wv_b, NOP_SSS_B, H1, H2, H1, vnclip8)
2909RVVCALL(OPIVV2_RM, vnclip_wv_h, NOP_SSS_H, H2, H4, H2, vnclip16)
2910RVVCALL(OPIVV2_RM, vnclip_wv_w, NOP_SSS_W, H4, H8, H4, vnclip32)
2911GEN_VEXT_VV_RM(vnclip_wv_b, 1)
2912GEN_VEXT_VV_RM(vnclip_wv_h, 2)
2913GEN_VEXT_VV_RM(vnclip_wv_w, 4)
2914
2915RVVCALL(OPIVX2_RM, vnclip_wx_b, NOP_SSS_B, H1, H2, vnclip8)
2916RVVCALL(OPIVX2_RM, vnclip_wx_h, NOP_SSS_H, H2, H4, vnclip16)
2917RVVCALL(OPIVX2_RM, vnclip_wx_w, NOP_SSS_W, H4, H8, vnclip32)
2918GEN_VEXT_VX_RM(vnclip_wx_b, 1)
2919GEN_VEXT_VX_RM(vnclip_wx_h, 2)
2920GEN_VEXT_VX_RM(vnclip_wx_w, 4)
2921
2922static inline uint8_t
2923vnclipu8(CPURISCVState *env, int vxrm, uint16_t a, uint8_t b)
2924{
2925    uint8_t round, shift = b & 0xf;
2926    uint16_t res;
2927
2928    round = get_round(vxrm, a, shift);
2929    res   = (a >> shift)  + round;
2930    if (res > UINT8_MAX) {
2931        env->vxsat = 0x1;
2932        return UINT8_MAX;
2933    } else {
2934        return res;
2935    }
2936}
2937
2938static inline uint16_t
2939vnclipu16(CPURISCVState *env, int vxrm, uint32_t a, uint16_t b)
2940{
2941    uint8_t round, shift = b & 0x1f;
2942    uint32_t res;
2943
2944    round = get_round(vxrm, a, shift);
2945    res   = (a >> shift)  + round;
2946    if (res > UINT16_MAX) {
2947        env->vxsat = 0x1;
2948        return UINT16_MAX;
2949    } else {
2950        return res;
2951    }
2952}
2953
2954static inline uint32_t
2955vnclipu32(CPURISCVState *env, int vxrm, uint64_t a, uint32_t b)
2956{
2957    uint8_t round, shift = b & 0x3f;
2958    uint64_t res;
2959
2960    round = get_round(vxrm, a, shift);
2961    res   = (a >> shift)  + round;
2962    if (res > UINT32_MAX) {
2963        env->vxsat = 0x1;
2964        return UINT32_MAX;
2965    } else {
2966        return res;
2967    }
2968}
2969
2970RVVCALL(OPIVV2_RM, vnclipu_wv_b, NOP_UUU_B, H1, H2, H1, vnclipu8)
2971RVVCALL(OPIVV2_RM, vnclipu_wv_h, NOP_UUU_H, H2, H4, H2, vnclipu16)
2972RVVCALL(OPIVV2_RM, vnclipu_wv_w, NOP_UUU_W, H4, H8, H4, vnclipu32)
2973GEN_VEXT_VV_RM(vnclipu_wv_b, 1)
2974GEN_VEXT_VV_RM(vnclipu_wv_h, 2)
2975GEN_VEXT_VV_RM(vnclipu_wv_w, 4)
2976
2977RVVCALL(OPIVX2_RM, vnclipu_wx_b, NOP_UUU_B, H1, H2, vnclipu8)
2978RVVCALL(OPIVX2_RM, vnclipu_wx_h, NOP_UUU_H, H2, H4, vnclipu16)
2979RVVCALL(OPIVX2_RM, vnclipu_wx_w, NOP_UUU_W, H4, H8, vnclipu32)
2980GEN_VEXT_VX_RM(vnclipu_wx_b, 1)
2981GEN_VEXT_VX_RM(vnclipu_wx_h, 2)
2982GEN_VEXT_VX_RM(vnclipu_wx_w, 4)
2983
2984/*
2985 *** Vector Floating-Point Arithmetic Instructions
2986 */
2987/* Vector Single-Width Floating-Point Add/Subtract Instructions */
2988#define OPFVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)   \
2989static void do_##NAME(void *vd, void *vs1, void *vs2, int i,   \
2990                      CPURISCVState *env)                      \
2991{                                                              \
2992    TX1 s1 = *((T1 *)vs1 + HS1(i));                            \
2993    TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
2994    *((TD *)vd + HD(i)) = OP(s2, s1, &env->fp_status);         \
2995}
2996
2997#define GEN_VEXT_VV_ENV(NAME, ESZ)                        \
2998void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
2999                  void *vs2, CPURISCVState *env,          \
3000                  uint32_t desc)                          \
3001{                                                         \
3002    uint32_t vm = vext_vm(desc);                          \
3003    uint32_t vl = env->vl;                                \
3004    uint32_t total_elems =                                \
3005        vext_get_total_elems(env, desc, ESZ);             \
3006    uint32_t vta = vext_vta(desc);                        \
3007    uint32_t i;                                           \
3008                                                          \
3009    for (i = env->vstart; i < vl; i++) {                  \
3010        if (!vm && !vext_elem_mask(v0, i)) {              \
3011            continue;                                     \
3012        }                                                 \
3013        do_##NAME(vd, vs1, vs2, i, env);                  \
3014    }                                                     \
3015    env->vstart = 0;                                      \
3016    /* set tail elements to 1s */                         \
3017    vext_set_elems_1s(vd, vta, vl * ESZ,                  \
3018                      total_elems * ESZ);                 \
3019}
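
/*
 * The floating-point helpers follow the same masked-loop pattern as the
 * integer ones, but additionally thread env->fp_status into every
 * per-element operation so softfloat can accumulate exception flags.
 * Inactive (masked-off) elements are skipped, and the tail elements are
 * overwritten with all 1s when the tail-agnostic policy is in effect.
 */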
3020
3021RVVCALL(OPFVV2, vfadd_vv_h, OP_UUU_H, H2, H2, H2, float16_add)
3022RVVCALL(OPFVV2, vfadd_vv_w, OP_UUU_W, H4, H4, H4, float32_add)
3023RVVCALL(OPFVV2, vfadd_vv_d, OP_UUU_D, H8, H8, H8, float64_add)
3024GEN_VEXT_VV_ENV(vfadd_vv_h, 2)
3025GEN_VEXT_VV_ENV(vfadd_vv_w, 4)
3026GEN_VEXT_VV_ENV(vfadd_vv_d, 8)
3027
3028#define OPFVF2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)        \
3029static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i, \
3030                      CPURISCVState *env)                      \
3031{                                                              \
3032    TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
3033    *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, &env->fp_status);\
3034}
3035
3036#define GEN_VEXT_VF(NAME, ESZ)                            \
3037void HELPER(NAME)(void *vd, void *v0, uint64_t s1,        \
3038                  void *vs2, CPURISCVState *env,          \
3039                  uint32_t desc)                          \
3040{                                                         \
3041    uint32_t vm = vext_vm(desc);                          \
3042    uint32_t vl = env->vl;                                \
3043    uint32_t total_elems =                                \
3044        vext_get_total_elems(env, desc, ESZ);              \
3045    uint32_t vta = vext_vta(desc);                        \
3046    uint32_t i;                                           \
3047                                                          \
3048    for (i = env->vstart; i < vl; i++) {                  \
3049        if (!vm && !vext_elem_mask(v0, i)) {              \
3050            continue;                                     \
3051        }                                                 \
3052        do_##NAME(vd, s1, vs2, i, env);                   \
3053    }                                                     \
3054    env->vstart = 0;                                      \
3055    /* set tail elements to 1s */                         \
3056    vext_set_elems_1s(vd, vta, vl * ESZ,                  \
3057                      total_elems * ESZ);                 \
3058}
3059
3060RVVCALL(OPFVF2, vfadd_vf_h, OP_UUU_H, H2, H2, float16_add)
3061RVVCALL(OPFVF2, vfadd_vf_w, OP_UUU_W, H4, H4, float32_add)
3062RVVCALL(OPFVF2, vfadd_vf_d, OP_UUU_D, H8, H8, float64_add)
3063GEN_VEXT_VF(vfadd_vf_h, 2)
3064GEN_VEXT_VF(vfadd_vf_w, 4)
3065GEN_VEXT_VF(vfadd_vf_d, 8)
3066
3067RVVCALL(OPFVV2, vfsub_vv_h, OP_UUU_H, H2, H2, H2, float16_sub)
3068RVVCALL(OPFVV2, vfsub_vv_w, OP_UUU_W, H4, H4, H4, float32_sub)
3069RVVCALL(OPFVV2, vfsub_vv_d, OP_UUU_D, H8, H8, H8, float64_sub)
3070GEN_VEXT_VV_ENV(vfsub_vv_h, 2)
3071GEN_VEXT_VV_ENV(vfsub_vv_w, 4)
3072GEN_VEXT_VV_ENV(vfsub_vv_d, 8)
3073RVVCALL(OPFVF2, vfsub_vf_h, OP_UUU_H, H2, H2, float16_sub)
3074RVVCALL(OPFVF2, vfsub_vf_w, OP_UUU_W, H4, H4, float32_sub)
3075RVVCALL(OPFVF2, vfsub_vf_d, OP_UUU_D, H8, H8, float64_sub)
3076GEN_VEXT_VF(vfsub_vf_h, 2)
3077GEN_VEXT_VF(vfsub_vf_w, 4)
3078GEN_VEXT_VF(vfsub_vf_d, 8)
3079
3080static uint16_t float16_rsub(uint16_t a, uint16_t b, float_status *s)
3081{
3082    return float16_sub(b, a, s);
3083}
3084
3085static uint32_t float32_rsub(uint32_t a, uint32_t b, float_status *s)
3086{
3087    return float32_sub(b, a, s);
3088}
3089
3090static uint64_t float64_rsub(uint64_t a, uint64_t b, float_status *s)
3091{
3092    return float64_sub(b, a, s);
3093}
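
/*
 * The *_rsub wrappers only swap their operands: combined with the
 * OPFVF2 call order OP(s2, s1), vfrsub.vf computes f[rs1] - vs2[i]
 * while reusing the ordinary subtract plumbing.
 */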
3094
3095RVVCALL(OPFVF2, vfrsub_vf_h, OP_UUU_H, H2, H2, float16_rsub)
3096RVVCALL(OPFVF2, vfrsub_vf_w, OP_UUU_W, H4, H4, float32_rsub)
3097RVVCALL(OPFVF2, vfrsub_vf_d, OP_UUU_D, H8, H8, float64_rsub)
3098GEN_VEXT_VF(vfrsub_vf_h, 2)
3099GEN_VEXT_VF(vfrsub_vf_w, 4)
3100GEN_VEXT_VF(vfrsub_vf_d, 8)
3101
3102/* Vector Widening Floating-Point Add/Subtract Instructions */
3103static uint32_t vfwadd16(uint16_t a, uint16_t b, float_status *s)
3104{
3105    return float32_add(float16_to_float32(a, true, s),
3106            float16_to_float32(b, true, s), s);
3107}
3108
3109static uint64_t vfwadd32(uint32_t a, uint32_t b, float_status *s)
3110{
3111    return float64_add(float32_to_float64(a, s),
3112            float32_to_float64(b, s), s);
3113
3114}
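
/*
 * The widening helpers promote both sources to the double-width format
 * and do the arithmetic there, so only a single rounding happens, at
 * 2*SEW precision.  The 'true' passed to float16_to_float32() selects
 * IEEE half-precision semantics.
 */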
3115
3116RVVCALL(OPFVV2, vfwadd_vv_h, WOP_UUU_H, H4, H2, H2, vfwadd16)
3117RVVCALL(OPFVV2, vfwadd_vv_w, WOP_UUU_W, H8, H4, H4, vfwadd32)
3118GEN_VEXT_VV_ENV(vfwadd_vv_h, 4)
3119GEN_VEXT_VV_ENV(vfwadd_vv_w, 8)
3120RVVCALL(OPFVF2, vfwadd_vf_h, WOP_UUU_H, H4, H2, vfwadd16)
3121RVVCALL(OPFVF2, vfwadd_vf_w, WOP_UUU_W, H8, H4, vfwadd32)
3122GEN_VEXT_VF(vfwadd_vf_h, 4)
3123GEN_VEXT_VF(vfwadd_vf_w, 8)
3124
3125static uint32_t vfwsub16(uint16_t a, uint16_t b, float_status *s)
3126{
3127    return float32_sub(float16_to_float32(a, true, s),
3128            float16_to_float32(b, true, s), s);
3129}
3130
3131static uint64_t vfwsub32(uint32_t a, uint32_t b, float_status *s)
3132{
3133    return float64_sub(float32_to_float64(a, s),
3134            float32_to_float64(b, s), s);
3135
3136}
3137
3138RVVCALL(OPFVV2, vfwsub_vv_h, WOP_UUU_H, H4, H2, H2, vfwsub16)
3139RVVCALL(OPFVV2, vfwsub_vv_w, WOP_UUU_W, H8, H4, H4, vfwsub32)
3140GEN_VEXT_VV_ENV(vfwsub_vv_h, 4)
3141GEN_VEXT_VV_ENV(vfwsub_vv_w, 8)
3142RVVCALL(OPFVF2, vfwsub_vf_h, WOP_UUU_H, H4, H2, vfwsub16)
3143RVVCALL(OPFVF2, vfwsub_vf_w, WOP_UUU_W, H8, H4, vfwsub32)
3144GEN_VEXT_VF(vfwsub_vf_h, 4)
3145GEN_VEXT_VF(vfwsub_vf_w, 8)
3146
3147static uint32_t vfwaddw16(uint32_t a, uint16_t b, float_status *s)
3148{
3149    return float32_add(a, float16_to_float32(b, true, s), s);
3150}
3151
3152static uint64_t vfwaddw32(uint64_t a, uint32_t b, float_status *s)
3153{
3154    return float64_add(a, float32_to_float64(b, s), s);
3155}
3156
3157RVVCALL(OPFVV2, vfwadd_wv_h, WOP_WUUU_H, H4, H2, H2, vfwaddw16)
3158RVVCALL(OPFVV2, vfwadd_wv_w, WOP_WUUU_W, H8, H4, H4, vfwaddw32)
3159GEN_VEXT_VV_ENV(vfwadd_wv_h, 4)
3160GEN_VEXT_VV_ENV(vfwadd_wv_w, 8)
3161RVVCALL(OPFVF2, vfwadd_wf_h, WOP_WUUU_H, H4, H2, vfwaddw16)
3162RVVCALL(OPFVF2, vfwadd_wf_w, WOP_WUUU_W, H8, H4, vfwaddw32)
3163GEN_VEXT_VF(vfwadd_wf_h, 4)
3164GEN_VEXT_VF(vfwadd_wf_w, 8)
3165
3166static uint32_t vfwsubw16(uint32_t a, uint16_t b, float_status *s)
3167{
3168    return float32_sub(a, float16_to_float32(b, true, s), s);
3169}
3170
3171static uint64_t vfwsubw32(uint64_t a, uint32_t b, float_status *s)
3172{
3173    return float64_sub(a, float32_to_float64(b, s), s);
3174}
3175
3176RVVCALL(OPFVV2, vfwsub_wv_h, WOP_WUUU_H, H4, H2, H2, vfwsubw16)
3177RVVCALL(OPFVV2, vfwsub_wv_w, WOP_WUUU_W, H8, H4, H4, vfwsubw32)
3178GEN_VEXT_VV_ENV(vfwsub_wv_h, 4)
3179GEN_VEXT_VV_ENV(vfwsub_wv_w, 8)
3180RVVCALL(OPFVF2, vfwsub_wf_h, WOP_WUUU_H, H4, H2, vfwsubw16)
3181RVVCALL(OPFVF2, vfwsub_wf_w, WOP_WUUU_W, H8, H4, vfwsubw32)
3182GEN_VEXT_VF(vfwsub_wf_h, 4)
3183GEN_VEXT_VF(vfwsub_wf_w, 8)
3184
3185/* Vector Single-Width Floating-Point Multiply/Divide Instructions */
3186RVVCALL(OPFVV2, vfmul_vv_h, OP_UUU_H, H2, H2, H2, float16_mul)
3187RVVCALL(OPFVV2, vfmul_vv_w, OP_UUU_W, H4, H4, H4, float32_mul)
3188RVVCALL(OPFVV2, vfmul_vv_d, OP_UUU_D, H8, H8, H8, float64_mul)
3189GEN_VEXT_VV_ENV(vfmul_vv_h, 2)
3190GEN_VEXT_VV_ENV(vfmul_vv_w, 4)
3191GEN_VEXT_VV_ENV(vfmul_vv_d, 8)
3192RVVCALL(OPFVF2, vfmul_vf_h, OP_UUU_H, H2, H2, float16_mul)
3193RVVCALL(OPFVF2, vfmul_vf_w, OP_UUU_W, H4, H4, float32_mul)
3194RVVCALL(OPFVF2, vfmul_vf_d, OP_UUU_D, H8, H8, float64_mul)
3195GEN_VEXT_VF(vfmul_vf_h, 2)
3196GEN_VEXT_VF(vfmul_vf_w, 4)
3197GEN_VEXT_VF(vfmul_vf_d, 8)
3198
3199RVVCALL(OPFVV2, vfdiv_vv_h, OP_UUU_H, H2, H2, H2, float16_div)
3200RVVCALL(OPFVV2, vfdiv_vv_w, OP_UUU_W, H4, H4, H4, float32_div)
3201RVVCALL(OPFVV2, vfdiv_vv_d, OP_UUU_D, H8, H8, H8, float64_div)
3202GEN_VEXT_VV_ENV(vfdiv_vv_h, 2)
3203GEN_VEXT_VV_ENV(vfdiv_vv_w, 4)
3204GEN_VEXT_VV_ENV(vfdiv_vv_d, 8)
3205RVVCALL(OPFVF2, vfdiv_vf_h, OP_UUU_H, H2, H2, float16_div)
3206RVVCALL(OPFVF2, vfdiv_vf_w, OP_UUU_W, H4, H4, float32_div)
3207RVVCALL(OPFVF2, vfdiv_vf_d, OP_UUU_D, H8, H8, float64_div)
3208GEN_VEXT_VF(vfdiv_vf_h, 2)
3209GEN_VEXT_VF(vfdiv_vf_w, 4)
3210GEN_VEXT_VF(vfdiv_vf_d, 8)
3211
3212static uint16_t float16_rdiv(uint16_t a, uint16_t b, float_status *s)
3213{
3214    return float16_div(b, a, s);
3215}
3216
3217static uint32_t float32_rdiv(uint32_t a, uint32_t b, float_status *s)
3218{
3219    return float32_div(b, a, s);
3220}
3221
3222static uint64_t float64_rdiv(uint64_t a, uint64_t b, float_status *s)
3223{
3224    return float64_div(b, a, s);
3225}
3226
3227RVVCALL(OPFVF2, vfrdiv_vf_h, OP_UUU_H, H2, H2, float16_rdiv)
3228RVVCALL(OPFVF2, vfrdiv_vf_w, OP_UUU_W, H4, H4, float32_rdiv)
3229RVVCALL(OPFVF2, vfrdiv_vf_d, OP_UUU_D, H8, H8, float64_rdiv)
3230GEN_VEXT_VF(vfrdiv_vf_h, 2)
3231GEN_VEXT_VF(vfrdiv_vf_w, 4)
3232GEN_VEXT_VF(vfrdiv_vf_d, 8)
3233
3234/* Vector Widening Floating-Point Multiply */
3235static uint32_t vfwmul16(uint16_t a, uint16_t b, float_status *s)
3236{
3237    return float32_mul(float16_to_float32(a, true, s),
3238            float16_to_float32(b, true, s), s);
3239}
3240
3241static uint64_t vfwmul32(uint32_t a, uint32_t b, float_status *s)
3242{
3243    return float64_mul(float32_to_float64(a, s),
3244            float32_to_float64(b, s), s);
3245
3246}
3247RVVCALL(OPFVV2, vfwmul_vv_h, WOP_UUU_H, H4, H2, H2, vfwmul16)
3248RVVCALL(OPFVV2, vfwmul_vv_w, WOP_UUU_W, H8, H4, H4, vfwmul32)
3249GEN_VEXT_VV_ENV(vfwmul_vv_h, 4)
3250GEN_VEXT_VV_ENV(vfwmul_vv_w, 8)
3251RVVCALL(OPFVF2, vfwmul_vf_h, WOP_UUU_H, H4, H2, vfwmul16)
3252RVVCALL(OPFVF2, vfwmul_vf_w, WOP_UUU_W, H8, H4, vfwmul32)
3253GEN_VEXT_VF(vfwmul_vf_h, 4)
3254GEN_VEXT_VF(vfwmul_vf_w, 8)
3255
3256/* Vector Single-Width Floating-Point Fused Multiply-Add Instructions */
3257#define OPFVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
3258static void do_##NAME(void *vd, void *vs1, void *vs2, int i,       \
3259        CPURISCVState *env)                                        \
3260{                                                                  \
3261    TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
3262    TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
3263    TD d = *((TD *)vd + HD(i));                                    \
3264    *((TD *)vd + HD(i)) = OP(s2, s1, d, &env->fp_status);          \
3265}
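
/*
 * The call order is OP(s2, s1, d), so the *macc/*msac flavours compute
 * vs1 * vs2 accumulated into vd, while the *madd/*msub flavours pass d
 * as a multiplicand to get vd * vs1 with vs2 as the addend, matching
 * the instruction definitions.  The float_muladd_negate_* flags provide
 * the sign variants without a separate negation step.
 */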
3266
3267static uint16_t fmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3268{
3269    return float16_muladd(a, b, d, 0, s);
3270}
3271
3272static uint32_t fmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3273{
3274    return float32_muladd(a, b, d, 0, s);
3275}
3276
3277static uint64_t fmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3278{
3279    return float64_muladd(a, b, d, 0, s);
3280}
3281
3282RVVCALL(OPFVV3, vfmacc_vv_h, OP_UUU_H, H2, H2, H2, fmacc16)
3283RVVCALL(OPFVV3, vfmacc_vv_w, OP_UUU_W, H4, H4, H4, fmacc32)
3284RVVCALL(OPFVV3, vfmacc_vv_d, OP_UUU_D, H8, H8, H8, fmacc64)
3285GEN_VEXT_VV_ENV(vfmacc_vv_h, 2)
3286GEN_VEXT_VV_ENV(vfmacc_vv_w, 4)
3287GEN_VEXT_VV_ENV(vfmacc_vv_d, 8)
3288
3289#define OPFVF3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)           \
3290static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i,    \
3291        CPURISCVState *env)                                       \
3292{                                                                 \
3293    TX2 s2 = *((T2 *)vs2 + HS2(i));                               \
3294    TD d = *((TD *)vd + HD(i));                                   \
3295    *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d, &env->fp_status);\
3296}
3297
3298RVVCALL(OPFVF3, vfmacc_vf_h, OP_UUU_H, H2, H2, fmacc16)
3299RVVCALL(OPFVF3, vfmacc_vf_w, OP_UUU_W, H4, H4, fmacc32)
3300RVVCALL(OPFVF3, vfmacc_vf_d, OP_UUU_D, H8, H8, fmacc64)
3301GEN_VEXT_VF(vfmacc_vf_h, 2)
3302GEN_VEXT_VF(vfmacc_vf_w, 4)
3303GEN_VEXT_VF(vfmacc_vf_d, 8)
3304
3305static uint16_t fnmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3306{
3307    return float16_muladd(a, b, d,
3308            float_muladd_negate_c | float_muladd_negate_product, s);
3309}
3310
3311static uint32_t fnmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3312{
3313    return float32_muladd(a, b, d,
3314            float_muladd_negate_c | float_muladd_negate_product, s);
3315}
3316
3317static uint64_t fnmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3318{
3319    return float64_muladd(a, b, d,
3320            float_muladd_negate_c | float_muladd_negate_product, s);
3321}
3322
3323RVVCALL(OPFVV3, vfnmacc_vv_h, OP_UUU_H, H2, H2, H2, fnmacc16)
3324RVVCALL(OPFVV3, vfnmacc_vv_w, OP_UUU_W, H4, H4, H4, fnmacc32)
3325RVVCALL(OPFVV3, vfnmacc_vv_d, OP_UUU_D, H8, H8, H8, fnmacc64)
3326GEN_VEXT_VV_ENV(vfnmacc_vv_h, 2)
3327GEN_VEXT_VV_ENV(vfnmacc_vv_w, 4)
3328GEN_VEXT_VV_ENV(vfnmacc_vv_d, 8)
3329RVVCALL(OPFVF3, vfnmacc_vf_h, OP_UUU_H, H2, H2, fnmacc16)
3330RVVCALL(OPFVF3, vfnmacc_vf_w, OP_UUU_W, H4, H4, fnmacc32)
3331RVVCALL(OPFVF3, vfnmacc_vf_d, OP_UUU_D, H8, H8, fnmacc64)
3332GEN_VEXT_VF(vfnmacc_vf_h, 2)
3333GEN_VEXT_VF(vfnmacc_vf_w, 4)
3334GEN_VEXT_VF(vfnmacc_vf_d, 8)
3335
3336static uint16_t fmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3337{
3338    return float16_muladd(a, b, d, float_muladd_negate_c, s);
3339}
3340
3341static uint32_t fmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3342{
3343    return float32_muladd(a, b, d, float_muladd_negate_c, s);
3344}
3345
3346static uint64_t fmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3347{
3348    return float64_muladd(a, b, d, float_muladd_negate_c, s);
3349}
3350
3351RVVCALL(OPFVV3, vfmsac_vv_h, OP_UUU_H, H2, H2, H2, fmsac16)
3352RVVCALL(OPFVV3, vfmsac_vv_w, OP_UUU_W, H4, H4, H4, fmsac32)
3353RVVCALL(OPFVV3, vfmsac_vv_d, OP_UUU_D, H8, H8, H8, fmsac64)
3354GEN_VEXT_VV_ENV(vfmsac_vv_h, 2)
3355GEN_VEXT_VV_ENV(vfmsac_vv_w, 4)
3356GEN_VEXT_VV_ENV(vfmsac_vv_d, 8)
3357RVVCALL(OPFVF3, vfmsac_vf_h, OP_UUU_H, H2, H2, fmsac16)
3358RVVCALL(OPFVF3, vfmsac_vf_w, OP_UUU_W, H4, H4, fmsac32)
3359RVVCALL(OPFVF3, vfmsac_vf_d, OP_UUU_D, H8, H8, fmsac64)
3360GEN_VEXT_VF(vfmsac_vf_h, 2)
3361GEN_VEXT_VF(vfmsac_vf_w, 4)
3362GEN_VEXT_VF(vfmsac_vf_d, 8)
3363
3364static uint16_t fnmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3365{
3366    return float16_muladd(a, b, d, float_muladd_negate_product, s);
3367}
3368
3369static uint32_t fnmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3370{
3371    return float32_muladd(a, b, d, float_muladd_negate_product, s);
3372}
3373
3374static uint64_t fnmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3375{
3376    return float64_muladd(a, b, d, float_muladd_negate_product, s);
3377}
3378
3379RVVCALL(OPFVV3, vfnmsac_vv_h, OP_UUU_H, H2, H2, H2, fnmsac16)
3380RVVCALL(OPFVV3, vfnmsac_vv_w, OP_UUU_W, H4, H4, H4, fnmsac32)
3381RVVCALL(OPFVV3, vfnmsac_vv_d, OP_UUU_D, H8, H8, H8, fnmsac64)
3382GEN_VEXT_VV_ENV(vfnmsac_vv_h, 2)
3383GEN_VEXT_VV_ENV(vfnmsac_vv_w, 4)
3384GEN_VEXT_VV_ENV(vfnmsac_vv_d, 8)
3385RVVCALL(OPFVF3, vfnmsac_vf_h, OP_UUU_H, H2, H2, fnmsac16)
3386RVVCALL(OPFVF3, vfnmsac_vf_w, OP_UUU_W, H4, H4, fnmsac32)
3387RVVCALL(OPFVF3, vfnmsac_vf_d, OP_UUU_D, H8, H8, fnmsac64)
3388GEN_VEXT_VF(vfnmsac_vf_h, 2)
3389GEN_VEXT_VF(vfnmsac_vf_w, 4)
3390GEN_VEXT_VF(vfnmsac_vf_d, 8)
3391
3392static uint16_t fmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3393{
3394    return float16_muladd(d, b, a, 0, s);
3395}
3396
3397static uint32_t fmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3398{
3399    return float32_muladd(d, b, a, 0, s);
3400}
3401
3402static uint64_t fmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3403{
3404    return float64_muladd(d, b, a, 0, s);
3405}
3406
3407RVVCALL(OPFVV3, vfmadd_vv_h, OP_UUU_H, H2, H2, H2, fmadd16)
3408RVVCALL(OPFVV3, vfmadd_vv_w, OP_UUU_W, H4, H4, H4, fmadd32)
3409RVVCALL(OPFVV3, vfmadd_vv_d, OP_UUU_D, H8, H8, H8, fmadd64)
3410GEN_VEXT_VV_ENV(vfmadd_vv_h, 2)
3411GEN_VEXT_VV_ENV(vfmadd_vv_w, 4)
3412GEN_VEXT_VV_ENV(vfmadd_vv_d, 8)
3413RVVCALL(OPFVF3, vfmadd_vf_h, OP_UUU_H, H2, H2, fmadd16)
3414RVVCALL(OPFVF3, vfmadd_vf_w, OP_UUU_W, H4, H4, fmadd32)
3415RVVCALL(OPFVF3, vfmadd_vf_d, OP_UUU_D, H8, H8, fmadd64)
3416GEN_VEXT_VF(vfmadd_vf_h, 2)
3417GEN_VEXT_VF(vfmadd_vf_w, 4)
3418GEN_VEXT_VF(vfmadd_vf_d, 8)
3419
3420static uint16_t fnmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3421{
3422    return float16_muladd(d, b, a,
3423            float_muladd_negate_c | float_muladd_negate_product, s);
3424}
3425
3426static uint32_t fnmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3427{
3428    return float32_muladd(d, b, a,
3429            float_muladd_negate_c | float_muladd_negate_product, s);
3430}
3431
3432static uint64_t fnmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3433{
3434    return float64_muladd(d, b, a,
3435            float_muladd_negate_c | float_muladd_negate_product, s);
3436}
3437
3438RVVCALL(OPFVV3, vfnmadd_vv_h, OP_UUU_H, H2, H2, H2, fnmadd16)
3439RVVCALL(OPFVV3, vfnmadd_vv_w, OP_UUU_W, H4, H4, H4, fnmadd32)
3440RVVCALL(OPFVV3, vfnmadd_vv_d, OP_UUU_D, H8, H8, H8, fnmadd64)
3441GEN_VEXT_VV_ENV(vfnmadd_vv_h, 2)
3442GEN_VEXT_VV_ENV(vfnmadd_vv_w, 4)
3443GEN_VEXT_VV_ENV(vfnmadd_vv_d, 8)
3444RVVCALL(OPFVF3, vfnmadd_vf_h, OP_UUU_H, H2, H2, fnmadd16)
3445RVVCALL(OPFVF3, vfnmadd_vf_w, OP_UUU_W, H4, H4, fnmadd32)
3446RVVCALL(OPFVF3, vfnmadd_vf_d, OP_UUU_D, H8, H8, fnmadd64)
3447GEN_VEXT_VF(vfnmadd_vf_h, 2)
3448GEN_VEXT_VF(vfnmadd_vf_w, 4)
3449GEN_VEXT_VF(vfnmadd_vf_d, 8)
3450
3451static uint16_t fmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3452{
3453    return float16_muladd(d, b, a, float_muladd_negate_c, s);
3454}
3455
3456static uint32_t fmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3457{
3458    return float32_muladd(d, b, a, float_muladd_negate_c, s);
3459}
3460
3461static uint64_t fmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3462{
3463    return float64_muladd(d, b, a, float_muladd_negate_c, s);
3464}
3465
3466RVVCALL(OPFVV3, vfmsub_vv_h, OP_UUU_H, H2, H2, H2, fmsub16)
3467RVVCALL(OPFVV3, vfmsub_vv_w, OP_UUU_W, H4, H4, H4, fmsub32)
3468RVVCALL(OPFVV3, vfmsub_vv_d, OP_UUU_D, H8, H8, H8, fmsub64)
3469GEN_VEXT_VV_ENV(vfmsub_vv_h, 2)
3470GEN_VEXT_VV_ENV(vfmsub_vv_w, 4)
3471GEN_VEXT_VV_ENV(vfmsub_vv_d, 8)
3472RVVCALL(OPFVF3, vfmsub_vf_h, OP_UUU_H, H2, H2, fmsub16)
3473RVVCALL(OPFVF3, vfmsub_vf_w, OP_UUU_W, H4, H4, fmsub32)
3474RVVCALL(OPFVF3, vfmsub_vf_d, OP_UUU_D, H8, H8, fmsub64)
3475GEN_VEXT_VF(vfmsub_vf_h, 2)
3476GEN_VEXT_VF(vfmsub_vf_w, 4)
3477GEN_VEXT_VF(vfmsub_vf_d, 8)
3478
3479static uint16_t fnmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3480{
3481    return float16_muladd(d, b, a, float_muladd_negate_product, s);
3482}
3483
3484static uint32_t fnmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3485{
3486    return float32_muladd(d, b, a, float_muladd_negate_product, s);
3487}
3488
3489static uint64_t fnmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3490{
3491    return float64_muladd(d, b, a, float_muladd_negate_product, s);
3492}
3493
3494RVVCALL(OPFVV3, vfnmsub_vv_h, OP_UUU_H, H2, H2, H2, fnmsub16)
3495RVVCALL(OPFVV3, vfnmsub_vv_w, OP_UUU_W, H4, H4, H4, fnmsub32)
3496RVVCALL(OPFVV3, vfnmsub_vv_d, OP_UUU_D, H8, H8, H8, fnmsub64)
3497GEN_VEXT_VV_ENV(vfnmsub_vv_h, 2)
3498GEN_VEXT_VV_ENV(vfnmsub_vv_w, 4)
3499GEN_VEXT_VV_ENV(vfnmsub_vv_d, 8)
3500RVVCALL(OPFVF3, vfnmsub_vf_h, OP_UUU_H, H2, H2, fnmsub16)
3501RVVCALL(OPFVF3, vfnmsub_vf_w, OP_UUU_W, H4, H4, fnmsub32)
3502RVVCALL(OPFVF3, vfnmsub_vf_d, OP_UUU_D, H8, H8, fnmsub64)
3503GEN_VEXT_VF(vfnmsub_vf_h, 2)
3504GEN_VEXT_VF(vfnmsub_vf_w, 4)
3505GEN_VEXT_VF(vfnmsub_vf_d, 8)
3506
3507/* Vector Widening Floating-Point Fused Multiply-Add Instructions */
3508static uint32_t fwmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3509{
3510    return float32_muladd(float16_to_float32(a, true, s),
3511                        float16_to_float32(b, true, s), d, 0, s);
3512}
3513
3514static uint64_t fwmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3515{
3516    return float64_muladd(float32_to_float64(a, s),
3517                        float32_to_float64(b, s), d, 0, s);
3518}
3519
3520RVVCALL(OPFVV3, vfwmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwmacc16)
3521RVVCALL(OPFVV3, vfwmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwmacc32)
3522GEN_VEXT_VV_ENV(vfwmacc_vv_h, 4)
3523GEN_VEXT_VV_ENV(vfwmacc_vv_w, 8)
3524RVVCALL(OPFVF3, vfwmacc_vf_h, WOP_UUU_H, H4, H2, fwmacc16)
3525RVVCALL(OPFVF3, vfwmacc_vf_w, WOP_UUU_W, H8, H4, fwmacc32)
3526GEN_VEXT_VF(vfwmacc_vf_h, 4)
3527GEN_VEXT_VF(vfwmacc_vf_w, 8)
3528
3529static uint32_t fwnmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3530{
3531    return float32_muladd(float16_to_float32(a, true, s),
3532                        float16_to_float32(b, true, s), d,
3533                        float_muladd_negate_c | float_muladd_negate_product, s);
3534}
3535
3536static uint64_t fwnmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3537{
3538    return float64_muladd(float32_to_float64(a, s),
3539                        float32_to_float64(b, s), d,
3540                        float_muladd_negate_c | float_muladd_negate_product, s);
3541}
3542
3543RVVCALL(OPFVV3, vfwnmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwnmacc16)
3544RVVCALL(OPFVV3, vfwnmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwnmacc32)
3545GEN_VEXT_VV_ENV(vfwnmacc_vv_h, 4)
3546GEN_VEXT_VV_ENV(vfwnmacc_vv_w, 8)
3547RVVCALL(OPFVF3, vfwnmacc_vf_h, WOP_UUU_H, H4, H2, fwnmacc16)
3548RVVCALL(OPFVF3, vfwnmacc_vf_w, WOP_UUU_W, H8, H4, fwnmacc32)
3549GEN_VEXT_VF(vfwnmacc_vf_h, 4)
3550GEN_VEXT_VF(vfwnmacc_vf_w, 8)
3551
3552static uint32_t fwmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3553{
3554    return float32_muladd(float16_to_float32(a, true, s),
3555                        float16_to_float32(b, true, s), d,
3556                        float_muladd_negate_c, s);
3557}
3558
3559static uint64_t fwmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3560{
3561    return float64_muladd(float32_to_float64(a, s),
3562                        float32_to_float64(b, s), d,
3563                        float_muladd_negate_c, s);
3564}
3565
3566RVVCALL(OPFVV3, vfwmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwmsac16)
3567RVVCALL(OPFVV3, vfwmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwmsac32)
3568GEN_VEXT_VV_ENV(vfwmsac_vv_h, 4)
3569GEN_VEXT_VV_ENV(vfwmsac_vv_w, 8)
3570RVVCALL(OPFVF3, vfwmsac_vf_h, WOP_UUU_H, H4, H2, fwmsac16)
3571RVVCALL(OPFVF3, vfwmsac_vf_w, WOP_UUU_W, H8, H4, fwmsac32)
3572GEN_VEXT_VF(vfwmsac_vf_h, 4)
3573GEN_VEXT_VF(vfwmsac_vf_w, 8)
3574
3575static uint32_t fwnmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3576{
3577    return float32_muladd(float16_to_float32(a, true, s),
3578                        float16_to_float32(b, true, s), d,
3579                        float_muladd_negate_product, s);
3580}
3581
3582static uint64_t fwnmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3583{
3584    return float64_muladd(float32_to_float64(a, s),
3585                        float32_to_float64(b, s), d,
3586                        float_muladd_negate_product, s);
3587}
3588
3589RVVCALL(OPFVV3, vfwnmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwnmsac16)
3590RVVCALL(OPFVV3, vfwnmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwnmsac32)
3591GEN_VEXT_VV_ENV(vfwnmsac_vv_h, 4)
3592GEN_VEXT_VV_ENV(vfwnmsac_vv_w, 8)
3593RVVCALL(OPFVF3, vfwnmsac_vf_h, WOP_UUU_H, H4, H2, fwnmsac16)
3594RVVCALL(OPFVF3, vfwnmsac_vf_w, WOP_UUU_W, H8, H4, fwnmsac32)
3595GEN_VEXT_VF(vfwnmsac_vf_h, 4)
3596GEN_VEXT_VF(vfwnmsac_vf_w, 8)
3597
3598/* Vector Floating-Point Square-Root Instruction */
3599/* (TD, T2, TX2) */
3600#define OP_UU_H uint16_t, uint16_t, uint16_t
3601#define OP_UU_W uint32_t, uint32_t, uint32_t
3602#define OP_UU_D uint64_t, uint64_t, uint64_t
3603
3604#define OPFVV1(NAME, TD, T2, TX2, HD, HS2, OP)        \
3605static void do_##NAME(void *vd, void *vs2, int i,      \
3606        CPURISCVState *env)                            \
3607{                                                      \
3608    TX2 s2 = *((T2 *)vs2 + HS2(i));                    \
3609    *((TD *)vd + HD(i)) = OP(s2, &env->fp_status);     \
3610}
3611
3612#define GEN_VEXT_V_ENV(NAME, ESZ)                      \
3613void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
3614        CPURISCVState *env, uint32_t desc)             \
3615{                                                      \
3616    uint32_t vm = vext_vm(desc);                       \
3617    uint32_t vl = env->vl;                             \
3618    uint32_t total_elems =                             \
3619        vext_get_total_elems(env, desc, ESZ);          \
3620    uint32_t vta = vext_vta(desc);                     \
3621    uint32_t i;                                        \
3622                                                       \
3623    if (vl == 0) {                                     \
3624        return;                                        \
3625    }                                                  \
3626    for (i = env->vstart; i < vl; i++) {               \
3627        if (!vm && !vext_elem_mask(v0, i)) {           \
3628            continue;                                  \
3629        }                                              \
3630        do_##NAME(vd, vs2, i, env);                    \
3631    }                                                  \
3632    env->vstart = 0;                                   \
3633    vext_set_elems_1s(vd, vta, vl * ESZ,               \
3634                      total_elems * ESZ);              \
3635}
3636
3637RVVCALL(OPFVV1, vfsqrt_v_h, OP_UU_H, H2, H2, float16_sqrt)
3638RVVCALL(OPFVV1, vfsqrt_v_w, OP_UU_W, H4, H4, float32_sqrt)
3639RVVCALL(OPFVV1, vfsqrt_v_d, OP_UU_D, H8, H8, float64_sqrt)
3640GEN_VEXT_V_ENV(vfsqrt_v_h, 2)
3641GEN_VEXT_V_ENV(vfsqrt_v_w, 4)
3642GEN_VEXT_V_ENV(vfsqrt_v_d, 8)
3643
3644/*
3645 * Vector Floating-Point Reciprocal Square-Root Estimate Instruction
3646 *
3647 * Adapted from riscv-v-spec recip.c:
3648 * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3649 */
3650static uint64_t frsqrt7(uint64_t f, int exp_size, int frac_size)
3651{
3652    uint64_t sign = extract64(f, frac_size + exp_size, 1);
3653    uint64_t exp = extract64(f, frac_size, exp_size);
3654    uint64_t frac = extract64(f, 0, frac_size);
3655
3656    const uint8_t lookup_table[] = {
3657        52, 51, 50, 48, 47, 46, 44, 43,
3658        42, 41, 40, 39, 38, 36, 35, 34,
3659        33, 32, 31, 30, 30, 29, 28, 27,
3660        26, 25, 24, 23, 23, 22, 21, 20,
3661        19, 19, 18, 17, 16, 16, 15, 14,
3662        14, 13, 12, 12, 11, 10, 10, 9,
3663        9, 8, 7, 7, 6, 6, 5, 4,
3664        4, 3, 3, 2, 2, 1, 1, 0,
3665        127, 125, 123, 121, 119, 118, 116, 114,
3666        113, 111, 109, 108, 106, 105, 103, 102,
3667        100, 99, 97, 96, 95, 93, 92, 91,
3668        90, 88, 87, 86, 85, 84, 83, 82,
3669        80, 79, 78, 77, 76, 75, 74, 73,
3670        72, 71, 70, 70, 69, 68, 67, 66,
3671        65, 64, 63, 63, 62, 61, 60, 59,
3672        59, 58, 57, 56, 56, 55, 54, 53
3673    };
3674    const int precision = 7;
3675
3676    if (exp == 0 && frac != 0) { /* subnormal */
3677        /* Normalize the subnormal. */
3678        while (extract64(frac, frac_size - 1, 1) == 0) {
3679            exp--;
3680            frac <<= 1;
3681        }
3682
3683        frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3684    }
3685
3686    int idx = ((exp & 1) << (precision - 1)) |
3687                (frac >> (frac_size - precision + 1));
3688    uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3689                            (frac_size - precision);
3690    uint64_t out_exp = (3 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp) / 2;
3691
3692    uint64_t val = 0;
3693    val = deposit64(val, 0, frac_size, out_frac);
3694    val = deposit64(val, frac_size, exp_size, out_exp);
3695    val = deposit64(val, frac_size + exp_size, 1, sign);
3696    return val;
3697}
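
/*
 * The table index combines the low bit of the (normalized) exponent
 * with the top six fraction bits, and the estimate's exponent is
 * (3 * bias - 1 - exp) / 2.  For a float32 input of 4.0 (exp = 129,
 * frac = 0) this gives idx = 64, lookup_table[64] = 127 and
 * out_exp = 125, i.e. an estimate of 0.498046875 for 1/sqrt(4.0),
 * within the 7 bits of precision the instruction provides.
 */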
3698
3699static float16 frsqrt7_h(float16 f, float_status *s)
3700{
3701    int exp_size = 5, frac_size = 10;
3702    bool sign = float16_is_neg(f);
3703
3704    /*
3705     * frsqrt7(sNaN) = canonical NaN
3706     * frsqrt7(-inf) = canonical NaN
3707     * frsqrt7(-normal) = canonical NaN
3708     * frsqrt7(-subnormal) = canonical NaN
3709     */
3710    if (float16_is_signaling_nan(f, s) ||
3711            (float16_is_infinity(f) && sign) ||
3712            (float16_is_normal(f) && sign) ||
3713            (float16_is_zero_or_denormal(f) && !float16_is_zero(f) && sign)) {
3714        s->float_exception_flags |= float_flag_invalid;
3715        return float16_default_nan(s);
3716    }
3717
3718    /* frsqrt7(qNaN) = canonical NaN */
3719    if (float16_is_quiet_nan(f, s)) {
3720        return float16_default_nan(s);
3721    }
3722
3723    /* frsqrt7(+-0) = +-inf */
3724    if (float16_is_zero(f)) {
3725        s->float_exception_flags |= float_flag_divbyzero;
3726        return float16_set_sign(float16_infinity, sign);
3727    }
3728
3729    /* frsqrt7(+inf) = +0 */
3730    if (float16_is_infinity(f) && !sign) {
3731        return float16_set_sign(float16_zero, sign);
3732    }
3733
3734    /* +normal, +subnormal */
3735    uint64_t val = frsqrt7(f, exp_size, frac_size);
3736    return make_float16(val);
3737}
3738
3739static float32 frsqrt7_s(float32 f, float_status *s)
3740{
3741    int exp_size = 8, frac_size = 23;
3742    bool sign = float32_is_neg(f);
3743
3744    /*
3745     * frsqrt7(sNaN) = canonical NaN
3746     * frsqrt7(-inf) = canonical NaN
3747     * frsqrt7(-normal) = canonical NaN
3748     * frsqrt7(-subnormal) = canonical NaN
3749     */
3750    if (float32_is_signaling_nan(f, s) ||
3751            (float32_is_infinity(f) && sign) ||
3752            (float32_is_normal(f) && sign) ||
3753            (float32_is_zero_or_denormal(f) && !float32_is_zero(f) && sign)) {
3754        s->float_exception_flags |= float_flag_invalid;
3755        return float32_default_nan(s);
3756    }
3757
3758    /* frsqrt7(qNaN) = canonical NaN */
3759    if (float32_is_quiet_nan(f, s)) {
3760        return float32_default_nan(s);
3761    }
3762
3763    /* frsqrt7(+-0) = +-inf */
3764    if (float32_is_zero(f)) {
3765        s->float_exception_flags |= float_flag_divbyzero;
3766        return float32_set_sign(float32_infinity, sign);
3767    }
3768
3769    /* frsqrt7(+inf) = +0 */
3770    if (float32_is_infinity(f) && !sign) {
3771        return float32_set_sign(float32_zero, sign);
3772    }
3773
3774    /* +normal, +subnormal */
3775    uint64_t val = frsqrt7(f, exp_size, frac_size);
3776    return make_float32(val);
3777}
3778
3779static float64 frsqrt7_d(float64 f, float_status *s)
3780{
3781    int exp_size = 11, frac_size = 52;
3782    bool sign = float64_is_neg(f);
3783
3784    /*
3785     * frsqrt7(sNaN) = canonical NaN
3786     * frsqrt7(-inf) = canonical NaN
3787     * frsqrt7(-normal) = canonical NaN
3788     * frsqrt7(-subnormal) = canonical NaN
3789     */
3790    if (float64_is_signaling_nan(f, s) ||
3791            (float64_is_infinity(f) && sign) ||
3792            (float64_is_normal(f) && sign) ||
3793            (float64_is_zero_or_denormal(f) && !float64_is_zero(f) && sign)) {
3794        s->float_exception_flags |= float_flag_invalid;
3795        return float64_default_nan(s);
3796    }
3797
3798    /* frsqrt7(qNaN) = canonical NaN */
3799    if (float64_is_quiet_nan(f, s)) {
3800        return float64_default_nan(s);
3801    }
3802
3803    /* frsqrt7(+-0) = +-inf */
3804    if (float64_is_zero(f)) {
3805        s->float_exception_flags |= float_flag_divbyzero;
3806        return float64_set_sign(float64_infinity, sign);
3807    }
3808
3809    /* frsqrt7(+inf) = +0 */
3810    if (float64_is_infinity(f) && !sign) {
3811        return float64_set_sign(float64_zero, sign);
3812    }
3813
3814    /* +normal, +subnormal */
3815    uint64_t val = frsqrt7(f, exp_size, frac_size);
3816    return make_float64(val);
3817}
3818
3819RVVCALL(OPFVV1, vfrsqrt7_v_h, OP_UU_H, H2, H2, frsqrt7_h)
3820RVVCALL(OPFVV1, vfrsqrt7_v_w, OP_UU_W, H4, H4, frsqrt7_s)
3821RVVCALL(OPFVV1, vfrsqrt7_v_d, OP_UU_D, H8, H8, frsqrt7_d)
3822GEN_VEXT_V_ENV(vfrsqrt7_v_h, 2)
3823GEN_VEXT_V_ENV(vfrsqrt7_v_w, 4)
3824GEN_VEXT_V_ENV(vfrsqrt7_v_d, 8)
3825
3826/*
3827 * Vector Floating-Point Reciprocal Estimate Instruction
3828 *
3829 * Adapted from riscv-v-spec recip.c:
3830 * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3831 */
3832static uint64_t frec7(uint64_t f, int exp_size, int frac_size,
3833                      float_status *s)
3834{
3835    uint64_t sign = extract64(f, frac_size + exp_size, 1);
3836    uint64_t exp = extract64(f, frac_size, exp_size);
3837    uint64_t frac = extract64(f, 0, frac_size);
3838
3839    const uint8_t lookup_table[] = {
3840        127, 125, 123, 121, 119, 117, 116, 114,
3841        112, 110, 109, 107, 105, 104, 102, 100,
3842        99, 97, 96, 94, 93, 91, 90, 88,
3843        87, 85, 84, 83, 81, 80, 79, 77,
3844        76, 75, 74, 72, 71, 70, 69, 68,
3845        66, 65, 64, 63, 62, 61, 60, 59,
3846        58, 57, 56, 55, 54, 53, 52, 51,
3847        50, 49, 48, 47, 46, 45, 44, 43,
3848        42, 41, 40, 40, 39, 38, 37, 36,
3849        35, 35, 34, 33, 32, 31, 31, 30,
3850        29, 28, 28, 27, 26, 25, 25, 24,
3851        23, 23, 22, 21, 21, 20, 19, 19,
3852        18, 17, 17, 16, 15, 15, 14, 14,
3853        13, 12, 12, 11, 11, 10, 9, 9,
3854        8, 8, 7, 7, 6, 5, 5, 4,
3855        4, 3, 3, 2, 2, 1, 1, 0
3856    };
3857    const int precision = 7;
3858
3859    if (exp == 0 && frac != 0) { /* subnormal */
3860        /* Normalize the subnormal. */
3861        while (extract64(frac, frac_size - 1, 1) == 0) {
3862            exp--;
3863            frac <<= 1;
3864        }
3865
3866        frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3867
3868        if (exp != 0 && exp != UINT64_MAX) {
3869            /*
3870             * Overflow to inf or max value of same sign,
3871             * depending on sign and rounding mode.
3872             */
3873            s->float_exception_flags |= (float_flag_inexact |
3874                                         float_flag_overflow);
3875
3876            if ((s->float_rounding_mode == float_round_to_zero) ||
3877                ((s->float_rounding_mode == float_round_down) && !sign) ||
3878                ((s->float_rounding_mode == float_round_up) && sign)) {
3879                /* Return the greatest/most negative finite value. */
3880                return (sign << (exp_size + frac_size)) |
3881                    (MAKE_64BIT_MASK(frac_size, exp_size) - 1);
3882            } else {
3883                /* Return +-inf. */
3884                return (sign << (exp_size + frac_size)) |
3885                    MAKE_64BIT_MASK(frac_size, exp_size);
3886            }
3887        }
3888    }
3889
3890    int idx = frac >> (frac_size - precision);
3891    uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3892                            (frac_size - precision);
3893    uint64_t out_exp = 2 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp;
3894
3895    if (out_exp == 0 || out_exp == UINT64_MAX) {
3896        /*
3897         * The result is subnormal, but don't raise the underflow exception,
3898         * because there's no additional loss of precision.
3899         */
3900        out_frac = (out_frac >> 1) | MAKE_64BIT_MASK(frac_size - 1, 1);
3901        if (out_exp == UINT64_MAX) {
3902            out_frac >>= 1;
3903            out_exp = 0;
3904        }
3905    }
3906
3907    uint64_t val = 0;
3908    val = deposit64(val, 0, frac_size, out_frac);
3909    val = deposit64(val, frac_size, exp_size, out_exp);
3910    val = deposit64(val, frac_size + exp_size, 1, sign);
3911    return val;
3912}
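
/*
 * Here the estimate's exponent is 2 * bias - 1 - exp and the index is
 * simply the top seven fraction bits.  Subnormal inputs whose
 * reciprocal no longer fits in the format are rounded to +-inf or to
 * the largest-magnitude finite value according to the current rounding
 * mode, and a subnormal result is denormalized by shifting the fraction
 * right without raising underflow, as the comments above note.
 */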
3913
3914static float16 frec7_h(float16 f, float_status *s)
3915{
3916    int exp_size = 5, frac_size = 10;
3917    bool sign = float16_is_neg(f);
3918
3919    /* frec7(+-inf) = +-0 */
3920    if (float16_is_infinity(f)) {
3921        return float16_set_sign(float16_zero, sign);
3922    }
3923
3924    /* frec7(+-0) = +-inf */
3925    if (float16_is_zero(f)) {
3926        s->float_exception_flags |= float_flag_divbyzero;
3927        return float16_set_sign(float16_infinity, sign);
3928    }
3929
3930    /* frec7(sNaN) = canonical NaN */
3931    if (float16_is_signaling_nan(f, s)) {
3932        s->float_exception_flags |= float_flag_invalid;
3933        return float16_default_nan(s);
3934    }
3935
3936    /* frec7(qNaN) = canonical NaN */
3937    if (float16_is_quiet_nan(f, s)) {
3938        return float16_default_nan(s);
3939    }
3940
3941    /* +-normal, +-subnormal */
3942    uint64_t val = frec7(f, exp_size, frac_size, s);
3943    return make_float16(val);
3944}
3945
3946static float32 frec7_s(float32 f, float_status *s)
3947{
3948    int exp_size = 8, frac_size = 23;
3949    bool sign = float32_is_neg(f);
3950
3951    /* frec7(+-inf) = +-0 */
3952    if (float32_is_infinity(f)) {
3953        return float32_set_sign(float32_zero, sign);
3954    }
3955
3956    /* frec7(+-0) = +-inf */
3957    if (float32_is_zero(f)) {
3958        s->float_exception_flags |= float_flag_divbyzero;
3959        return float32_set_sign(float32_infinity, sign);
3960    }
3961
3962    /* frec7(sNaN) = canonical NaN */
3963    if (float32_is_signaling_nan(f, s)) {
3964        s->float_exception_flags |= float_flag_invalid;
3965        return float32_default_nan(s);
3966    }
3967
3968    /* frec7(qNaN) = canonical NaN */
3969    if (float32_is_quiet_nan(f, s)) {
3970        return float32_default_nan(s);
3971    }
3972
3973    /* +-normal, +-subnormal */
3974    uint64_t val = frec7(f, exp_size, frac_size, s);
3975    return make_float32(val);
3976}
3977
3978static float64 frec7_d(float64 f, float_status *s)
3979{
3980    int exp_size = 11, frac_size = 52;
3981    bool sign = float64_is_neg(f);
3982
3983    /* frec7(+-inf) = +-0 */
3984    if (float64_is_infinity(f)) {
3985        return float64_set_sign(float64_zero, sign);
3986    }
3987
3988    /* frec7(+-0) = +-inf */
3989    if (float64_is_zero(f)) {
3990        s->float_exception_flags |= float_flag_divbyzero;
3991        return float64_set_sign(float64_infinity, sign);
3992    }
3993
3994    /* frec7(sNaN) = canonical NaN */
3995    if (float64_is_signaling_nan(f, s)) {
3996        s->float_exception_flags |= float_flag_invalid;
3997        return float64_default_nan(s);
3998    }
3999
4000    /* frec7(qNaN) = canonical NaN */
4001    if (float64_is_quiet_nan(f, s)) {
4002        return float64_default_nan(s);
4003    }
4004
4005    /* +-normal, +-subnormal */
4006    uint64_t val = frec7(f, exp_size, frac_size, s);
4007    return make_float64(val);
4008}
4009
4010RVVCALL(OPFVV1, vfrec7_v_h, OP_UU_H, H2, H2, frec7_h)
4011RVVCALL(OPFVV1, vfrec7_v_w, OP_UU_W, H4, H4, frec7_s)
4012RVVCALL(OPFVV1, vfrec7_v_d, OP_UU_D, H8, H8, frec7_d)
4013GEN_VEXT_V_ENV(vfrec7_v_h, 2)
4014GEN_VEXT_V_ENV(vfrec7_v_w, 4)
4015GEN_VEXT_V_ENV(vfrec7_v_d, 8)
4016
4017/* Vector Floating-Point MIN/MAX Instructions */
4018RVVCALL(OPFVV2, vfmin_vv_h, OP_UUU_H, H2, H2, H2, float16_minimum_number)
4019RVVCALL(OPFVV2, vfmin_vv_w, OP_UUU_W, H4, H4, H4, float32_minimum_number)
4020RVVCALL(OPFVV2, vfmin_vv_d, OP_UUU_D, H8, H8, H8, float64_minimum_number)
4021GEN_VEXT_VV_ENV(vfmin_vv_h, 2)
4022GEN_VEXT_VV_ENV(vfmin_vv_w, 4)
4023GEN_VEXT_VV_ENV(vfmin_vv_d, 8)
4024RVVCALL(OPFVF2, vfmin_vf_h, OP_UUU_H, H2, H2, float16_minimum_number)
4025RVVCALL(OPFVF2, vfmin_vf_w, OP_UUU_W, H4, H4, float32_minimum_number)
4026RVVCALL(OPFVF2, vfmin_vf_d, OP_UUU_D, H8, H8, float64_minimum_number)
4027GEN_VEXT_VF(vfmin_vf_h, 2)
4028GEN_VEXT_VF(vfmin_vf_w, 4)
4029GEN_VEXT_VF(vfmin_vf_d, 8)
4030
4031RVVCALL(OPFVV2, vfmax_vv_h, OP_UUU_H, H2, H2, H2, float16_maximum_number)
4032RVVCALL(OPFVV2, vfmax_vv_w, OP_UUU_W, H4, H4, H4, float32_maximum_number)
4033RVVCALL(OPFVV2, vfmax_vv_d, OP_UUU_D, H8, H8, H8, float64_maximum_number)
4034GEN_VEXT_VV_ENV(vfmax_vv_h, 2)
4035GEN_VEXT_VV_ENV(vfmax_vv_w, 4)
4036GEN_VEXT_VV_ENV(vfmax_vv_d, 8)
4037RVVCALL(OPFVF2, vfmax_vf_h, OP_UUU_H, H2, H2, float16_maximum_number)
4038RVVCALL(OPFVF2, vfmax_vf_w, OP_UUU_W, H4, H4, float32_maximum_number)
4039RVVCALL(OPFVF2, vfmax_vf_d, OP_UUU_D, H8, H8, float64_maximum_number)
4040GEN_VEXT_VF(vfmax_vf_h, 2)
4041GEN_VEXT_VF(vfmax_vf_w, 4)
4042GEN_VEXT_VF(vfmax_vf_d, 8)
4043
4044/* Vector Floating-Point Sign-Injection Instructions */
4045static uint16_t fsgnj16(uint16_t a, uint16_t b, float_status *s)
4046{
4047    return deposit64(b, 0, 15, a);
4048}
4049
4050static uint32_t fsgnj32(uint32_t a, uint32_t b, float_status *s)
4051{
4052    return deposit64(b, 0, 31, a);
4053}
4054
4055static uint64_t fsgnj64(uint64_t a, uint64_t b, float_status *s)
4056{
4057    return deposit64(b, 0, 63, a);
4058}
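
/*
 * Sign injection is pure bit manipulation: deposit64() keeps the
 * exponent and fraction of the first operand (the vs2 element) and
 * takes the sign bit from the second (the vs1 element or rs1 value),
 * so no floating-point exceptions can be raised.  The sgnjn and sgnjx
 * variants below invert or XOR the injected sign bit, respectively.
 */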
4059
4060RVVCALL(OPFVV2, vfsgnj_vv_h, OP_UUU_H, H2, H2, H2, fsgnj16)
4061RVVCALL(OPFVV2, vfsgnj_vv_w, OP_UUU_W, H4, H4, H4, fsgnj32)
4062RVVCALL(OPFVV2, vfsgnj_vv_d, OP_UUU_D, H8, H8, H8, fsgnj64)
4063GEN_VEXT_VV_ENV(vfsgnj_vv_h, 2)
4064GEN_VEXT_VV_ENV(vfsgnj_vv_w, 4)
4065GEN_VEXT_VV_ENV(vfsgnj_vv_d, 8)
4066RVVCALL(OPFVF2, vfsgnj_vf_h, OP_UUU_H, H2, H2, fsgnj16)
4067RVVCALL(OPFVF2, vfsgnj_vf_w, OP_UUU_W, H4, H4, fsgnj32)
4068RVVCALL(OPFVF2, vfsgnj_vf_d, OP_UUU_D, H8, H8, fsgnj64)
4069GEN_VEXT_VF(vfsgnj_vf_h, 2)
4070GEN_VEXT_VF(vfsgnj_vf_w, 4)
4071GEN_VEXT_VF(vfsgnj_vf_d, 8)
4072
4073static uint16_t fsgnjn16(uint16_t a, uint16_t b, float_status *s)
4074{
4075    return deposit64(~b, 0, 15, a);
4076}
4077
4078static uint32_t fsgnjn32(uint32_t a, uint32_t b, float_status *s)
4079{
4080    return deposit64(~b, 0, 31, a);
4081}
4082
4083static uint64_t fsgnjn64(uint64_t a, uint64_t b, float_status *s)
4084{
4085    return deposit64(~b, 0, 63, a);
4086}
4087
4088RVVCALL(OPFVV2, vfsgnjn_vv_h, OP_UUU_H, H2, H2, H2, fsgnjn16)
4089RVVCALL(OPFVV2, vfsgnjn_vv_w, OP_UUU_W, H4, H4, H4, fsgnjn32)
4090RVVCALL(OPFVV2, vfsgnjn_vv_d, OP_UUU_D, H8, H8, H8, fsgnjn64)
4091GEN_VEXT_VV_ENV(vfsgnjn_vv_h, 2)
4092GEN_VEXT_VV_ENV(vfsgnjn_vv_w, 4)
4093GEN_VEXT_VV_ENV(vfsgnjn_vv_d, 8)
4094RVVCALL(OPFVF2, vfsgnjn_vf_h, OP_UUU_H, H2, H2, fsgnjn16)
4095RVVCALL(OPFVF2, vfsgnjn_vf_w, OP_UUU_W, H4, H4, fsgnjn32)
4096RVVCALL(OPFVF2, vfsgnjn_vf_d, OP_UUU_D, H8, H8, fsgnjn64)
4097GEN_VEXT_VF(vfsgnjn_vf_h, 2)
4098GEN_VEXT_VF(vfsgnjn_vf_w, 4)
4099GEN_VEXT_VF(vfsgnjn_vf_d, 8)
4100
4101static uint16_t fsgnjx16(uint16_t a, uint16_t b, float_status *s)
4102{
4103    return deposit64(b ^ a, 0, 15, a);
4104}
4105
4106static uint32_t fsgnjx32(uint32_t a, uint32_t b, float_status *s)
4107{
4108    return deposit64(b ^ a, 0, 31, a);
4109}
4110
4111static uint64_t fsgnjx64(uint64_t a, uint64_t b, float_status *s)
4112{
4113    return deposit64(b ^ a, 0, 63, a);
4114}
4115
4116RVVCALL(OPFVV2, vfsgnjx_vv_h, OP_UUU_H, H2, H2, H2, fsgnjx16)
4117RVVCALL(OPFVV2, vfsgnjx_vv_w, OP_UUU_W, H4, H4, H4, fsgnjx32)
4118RVVCALL(OPFVV2, vfsgnjx_vv_d, OP_UUU_D, H8, H8, H8, fsgnjx64)
4119GEN_VEXT_VV_ENV(vfsgnjx_vv_h, 2)
4120GEN_VEXT_VV_ENV(vfsgnjx_vv_w, 4)
4121GEN_VEXT_VV_ENV(vfsgnjx_vv_d, 8)
4122RVVCALL(OPFVF2, vfsgnjx_vf_h, OP_UUU_H, H2, H2, fsgnjx16)
4123RVVCALL(OPFVF2, vfsgnjx_vf_w, OP_UUU_W, H4, H4, fsgnjx32)
4124RVVCALL(OPFVF2, vfsgnjx_vf_d, OP_UUU_D, H8, H8, fsgnjx64)
4125GEN_VEXT_VF(vfsgnjx_vf_h, 2)
4126GEN_VEXT_VF(vfsgnjx_vf_w, 4)
4127GEN_VEXT_VF(vfsgnjx_vf_d, 8)
4128
4129/* Vector Floating-Point Compare Instructions */
4130#define GEN_VEXT_CMP_VV_ENV(NAME, ETYPE, H, DO_OP)            \
4131void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
4132                  CPURISCVState *env, uint32_t desc)          \
4133{                                                             \
4134    uint32_t vm = vext_vm(desc);                              \
4135    uint32_t vl = env->vl;                                    \
4136    uint32_t total_elems = env_archcpu(env)->cfg.vlen;        \
4137    uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
4138    uint32_t i;                                               \
4139                                                              \
4140    for (i = env->vstart; i < vl; i++) {                      \
4141        ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
4142        ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4143        if (!vm && !vext_elem_mask(v0, i)) {                  \
4144            continue;                                         \
4145        }                                                     \
4146        vext_set_elem_mask(vd, i,                             \
4147                           DO_OP(s2, s1, &env->fp_status));   \
4148    }                                                         \
4149    env->vstart = 0;                                          \
4150    /* mask destination register is always tail-agnostic */   \
4151    /* set tail elements to 1s */                             \
4152    if (vta_all_1s) {                                         \
4153        for (; i < total_elems; i++) {                        \
4154            vext_set_elem_mask(vd, i, 1);                     \
4155        }                                                     \
4156    }                                                         \
4157}
4158
4159GEN_VEXT_CMP_VV_ENV(vmfeq_vv_h, uint16_t, H2, float16_eq_quiet)
4160GEN_VEXT_CMP_VV_ENV(vmfeq_vv_w, uint32_t, H4, float32_eq_quiet)
4161GEN_VEXT_CMP_VV_ENV(vmfeq_vv_d, uint64_t, H8, float64_eq_quiet)
4162
4163#define GEN_VEXT_CMP_VF(NAME, ETYPE, H, DO_OP)                      \
4164void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2,       \
4165                  CPURISCVState *env, uint32_t desc)                \
4166{                                                                   \
4167    uint32_t vm = vext_vm(desc);                                    \
4168    uint32_t vl = env->vl;                                          \
4169    uint32_t total_elems = env_archcpu(env)->cfg.vlen;              \
4170    uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
4171    uint32_t i;                                                     \
4172                                                                    \
4173    for (i = env->vstart; i < vl; i++) {                            \
4174        ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
4175        if (!vm && !vext_elem_mask(v0, i)) {                        \
4176            continue;                                               \
4177        }                                                           \
4178        vext_set_elem_mask(vd, i,                                   \
4179                           DO_OP(s2, (ETYPE)s1, &env->fp_status));  \
4180    }                                                               \
4181    env->vstart = 0;                                                \
4182    /* mask destination register is always tail-agnostic */         \
4183    /* set tail elements to 1s */                                   \
4184    if (vta_all_1s) {                                               \
4185        for (; i < total_elems; i++) {                              \
4186            vext_set_elem_mask(vd, i, 1);                           \
4187        }                                                           \
4188    }                                                               \
4189}
4190
4191GEN_VEXT_CMP_VF(vmfeq_vf_h, uint16_t, H2, float16_eq_quiet)
4192GEN_VEXT_CMP_VF(vmfeq_vf_w, uint32_t, H4, float32_eq_quiet)
4193GEN_VEXT_CMP_VF(vmfeq_vf_d, uint64_t, H8, float64_eq_quiet)
4194
4195static bool vmfne16(uint16_t a, uint16_t b, float_status *s)
4196{
4197    FloatRelation compare = float16_compare_quiet(a, b, s);
4198    return compare != float_relation_equal;
4199}
4200
4201static bool vmfne32(uint32_t a, uint32_t b, float_status *s)
4202{
4203    FloatRelation compare = float32_compare_quiet(a, b, s);
4204    return compare != float_relation_equal;
4205}
4206
4207static bool vmfne64(uint64_t a, uint64_t b, float_status *s)
4208{
4209    FloatRelation compare = float64_compare_quiet(a, b, s);
4210    return compare != float_relation_equal;
4211}
4212
4213GEN_VEXT_CMP_VV_ENV(vmfne_vv_h, uint16_t, H2, vmfne16)
4214GEN_VEXT_CMP_VV_ENV(vmfne_vv_w, uint32_t, H4, vmfne32)
4215GEN_VEXT_CMP_VV_ENV(vmfne_vv_d, uint64_t, H8, vmfne64)
4216GEN_VEXT_CMP_VF(vmfne_vf_h, uint16_t, H2, vmfne16)
4217GEN_VEXT_CMP_VF(vmfne_vf_w, uint32_t, H4, vmfne32)
4218GEN_VEXT_CMP_VF(vmfne_vf_d, uint64_t, H8, vmfne64)
4219
4220GEN_VEXT_CMP_VV_ENV(vmflt_vv_h, uint16_t, H2, float16_lt)
4221GEN_VEXT_CMP_VV_ENV(vmflt_vv_w, uint32_t, H4, float32_lt)
4222GEN_VEXT_CMP_VV_ENV(vmflt_vv_d, uint64_t, H8, float64_lt)
4223GEN_VEXT_CMP_VF(vmflt_vf_h, uint16_t, H2, float16_lt)
4224GEN_VEXT_CMP_VF(vmflt_vf_w, uint32_t, H4, float32_lt)
4225GEN_VEXT_CMP_VF(vmflt_vf_d, uint64_t, H8, float64_lt)
4226
4227GEN_VEXT_CMP_VV_ENV(vmfle_vv_h, uint16_t, H2, float16_le)
4228GEN_VEXT_CMP_VV_ENV(vmfle_vv_w, uint32_t, H4, float32_le)
4229GEN_VEXT_CMP_VV_ENV(vmfle_vv_d, uint64_t, H8, float64_le)
4230GEN_VEXT_CMP_VF(vmfle_vf_h, uint16_t, H2, float16_le)
4231GEN_VEXT_CMP_VF(vmfle_vf_w, uint32_t, H4, float32_le)
4232GEN_VEXT_CMP_VF(vmfle_vf_d, uint64_t, H8, float64_le)
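
/*
 * Note on exception behaviour: vmfeq/vmfne use the quiet softfloat
 * comparisons (*_eq_quiet, *_compare_quiet), which set the invalid flag
 * only for signaling NaN inputs, while vmflt/vmfle above and vmfgt/vmfge
 * below use the signaling variants (float*_lt, float*_le, float*_compare)
 * and set invalid for any NaN input, mirroring the scalar FEQ/FLT/FLE
 * behaviour.
 */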
4233
4234static bool vmfgt16(uint16_t a, uint16_t b, float_status *s)
4235{
4236    FloatRelation compare = float16_compare(a, b, s);
4237    return compare == float_relation_greater;
4238}
4239
4240static bool vmfgt32(uint32_t a, uint32_t b, float_status *s)
4241{
4242    FloatRelation compare = float32_compare(a, b, s);
4243    return compare == float_relation_greater;
4244}
4245
4246static bool vmfgt64(uint64_t a, uint64_t b, float_status *s)
4247{
4248    FloatRelation compare = float64_compare(a, b, s);
4249    return compare == float_relation_greater;
4250}
4251
4252GEN_VEXT_CMP_VF(vmfgt_vf_h, uint16_t, H2, vmfgt16)
4253GEN_VEXT_CMP_VF(vmfgt_vf_w, uint32_t, H4, vmfgt32)
4254GEN_VEXT_CMP_VF(vmfgt_vf_d, uint64_t, H8, vmfgt64)
4255
4256static bool vmfge16(uint16_t a, uint16_t b, float_status *s)
4257{
4258    FloatRelation compare = float16_compare(a, b, s);
4259    return compare == float_relation_greater ||
4260           compare == float_relation_equal;
4261}
4262
4263static bool vmfge32(uint32_t a, uint32_t b, float_status *s)
4264{
4265    FloatRelation compare = float32_compare(a, b, s);
4266    return compare == float_relation_greater ||
4267           compare == float_relation_equal;
4268}
4269
4270static bool vmfge64(uint64_t a, uint64_t b, float_status *s)
4271{
4272    FloatRelation compare = float64_compare(a, b, s);
4273    return compare == float_relation_greater ||
4274           compare == float_relation_equal;
4275}
4276
4277GEN_VEXT_CMP_VF(vmfge_vf_h, uint16_t, H2, vmfge16)
4278GEN_VEXT_CMP_VF(vmfge_vf_w, uint32_t, H4, vmfge32)
4279GEN_VEXT_CMP_VF(vmfge_vf_d, uint64_t, H8, vmfge64)
4280
4281/* Vector Floating-Point Classify Instruction */
4282#define OPIVV1(NAME, TD, T2, TX2, HD, HS2, OP)         \
4283static void do_##NAME(void *vd, void *vs2, int i)      \
4284{                                                      \
4285    TX2 s2 = *((T2 *)vs2 + HS2(i));                    \
4286    *((TD *)vd + HD(i)) = OP(s2);                      \
4287}
4288
4289#define GEN_VEXT_V(NAME, ESZ)                          \
4290void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
4291                  CPURISCVState *env, uint32_t desc)   \
4292{                                                      \
4293    uint32_t vm = vext_vm(desc);                       \
4294    uint32_t vl = env->vl;                             \
4295    uint32_t total_elems =                             \
4296        vext_get_total_elems(env, desc, ESZ);          \
4297    uint32_t vta = vext_vta(desc);                     \
4298    uint32_t i;                                        \
4299                                                       \
4300    for (i = env->vstart; i < vl; i++) {               \
4301        if (!vm && !vext_elem_mask(v0, i)) {           \
4302            continue;                                  \
4303        }                                              \
4304        do_##NAME(vd, vs2, i);                         \
4305    }                                                  \
4306    env->vstart = 0;                                   \
4307    /* set tail elements to 1s */                      \
4308    vext_set_elems_1s(vd, vta, vl * ESZ,               \
4309                      total_elems * ESZ);              \
4310}
4311
4312target_ulong fclass_h(uint64_t frs1)
4313{
4314    float16 f = frs1;
4315    bool sign = float16_is_neg(f);
4316
4317    if (float16_is_infinity(f)) {
4318        return sign ? 1 << 0 : 1 << 7;
4319    } else if (float16_is_zero(f)) {
4320        return sign ? 1 << 3 : 1 << 4;
4321    } else if (float16_is_zero_or_denormal(f)) {
4322        return sign ? 1 << 2 : 1 << 5;
4323    } else if (float16_is_any_nan(f)) {
4324        float_status s = { }; /* for snan_bit_is_one */
4325        return float16_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4326    } else {
4327        return sign ? 1 << 1 : 1 << 6;
4328    }
4329}
4330
4331target_ulong fclass_s(uint64_t frs1)
4332{
4333    float32 f = frs1;
4334    bool sign = float32_is_neg(f);
4335
4336    if (float32_is_infinity(f)) {
4337        return sign ? 1 << 0 : 1 << 7;
4338    } else if (float32_is_zero(f)) {
4339        return sign ? 1 << 3 : 1 << 4;
4340    } else if (float32_is_zero_or_denormal(f)) {
4341        return sign ? 1 << 2 : 1 << 5;
4342    } else if (float32_is_any_nan(f)) {
4343        float_status s = { }; /* for snan_bit_is_one */
4344        return float32_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4345    } else {
4346        return sign ? 1 << 1 : 1 << 6;
4347    }
4348}
4349
4350target_ulong fclass_d(uint64_t frs1)
4351{
4352    float64 f = frs1;
4353    bool sign = float64_is_neg(f);
4354
4355    if (float64_is_infinity(f)) {
4356        return sign ? 1 << 0 : 1 << 7;
4357    } else if (float64_is_zero(f)) {
4358        return sign ? 1 << 3 : 1 << 4;
4359    } else if (float64_is_zero_or_denormal(f)) {
4360        return sign ? 1 << 2 : 1 << 5;
4361    } else if (float64_is_any_nan(f)) {
4362        float_status s = { }; /* for snan_bit_is_one */
4363        return float64_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4364    } else {
4365        return sign ? 1 << 1 : 1 << 6;
4366    }
4367}
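
/*
 * The fclass_* helpers return the standard RISC-V FCLASS encoding, with
 * exactly one bit set in the result:
 *
 *     bit 0: -infinity           bit 5: positive subnormal
 *     bit 1: negative normal     bit 6: positive normal
 *     bit 2: negative subnormal  bit 7: +infinity
 *     bit 3: -0                  bit 8: signaling NaN
 *     bit 4: +0                  bit 9: quiet NaN
 */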
4368
4369RVVCALL(OPIVV1, vfclass_v_h, OP_UU_H, H2, H2, fclass_h)
4370RVVCALL(OPIVV1, vfclass_v_w, OP_UU_W, H4, H4, fclass_s)
4371RVVCALL(OPIVV1, vfclass_v_d, OP_UU_D, H8, H8, fclass_d)
4372GEN_VEXT_V(vfclass_v_h, 2)
4373GEN_VEXT_V(vfclass_v_w, 4)
4374GEN_VEXT_V(vfclass_v_d, 8)
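
/*
 * For reference, RVVCALL(OPIVV1, vfclass_v_w, OP_UU_W, H4, H4, fclass_s)
 * expands to roughly the per-element function below (assuming OP_UU_W is
 * uint32_t, uint32_t, uint32_t as defined earlier in this file); the
 * GEN_VEXT_V(vfclass_v_w, 4) line then wraps it in the masked body loop
 * plus the tail handling:
 *
 *     static void do_vfclass_v_w(void *vd, void *vs2, int i)
 *     {
 *         uint32_t s2 = *((uint32_t *)vs2 + H4(i));
 *         *((uint32_t *)vd + H4(i)) = fclass_s(s2);
 *     }
 */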
4375
4376/* Vector Floating-Point Merge Instruction */
4377
4378#define GEN_VFMERGE_VF(NAME, ETYPE, H)                        \
4379void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
4380                  CPURISCVState *env, uint32_t desc)          \
4381{                                                             \
4382    uint32_t vm = vext_vm(desc);                              \
4383    uint32_t vl = env->vl;                                    \
4384    uint32_t esz = sizeof(ETYPE);                             \
4385    uint32_t total_elems =                                    \
4386        vext_get_total_elems(env, desc, esz);                 \
4387    uint32_t vta = vext_vta(desc);                            \
4388    uint32_t i;                                               \
4389                                                              \
4390    for (i = env->vstart; i < vl; i++) {                      \
4391        ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4392        *((ETYPE *)vd + H(i))                                 \
4393          = (!vm && !vext_elem_mask(v0, i) ? s2 : s1);        \
4394    }                                                         \
4395    env->vstart = 0;                                          \
4396    /* set tail elements to 1s */                             \
4397    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
4398}
4399
4400GEN_VFMERGE_VF(vfmerge_vfm_h, int16_t, H2)
4401GEN_VFMERGE_VF(vfmerge_vfm_w, int32_t, H4)
4402GEN_VFMERGE_VF(vfmerge_vfm_d, int64_t, H8)
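
/*
 * vfmerge.vfm vd, vs2, rs1, v0 selects per element:
 *
 *     vd[i] = v0.mask[i] ? f[rs1] : vs2[i]
 *
 * When the helper is invoked with vm == 1 the mask test is skipped and
 * every body element receives the scalar, i.e. it degenerates to a splat
 * of f[rs1].
 */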
4403
4404/* Single-Width Floating-Point/Integer Type-Convert Instructions */
4405/* vfcvt.xu.f.v vd, vs2, vm # Convert float to unsigned integer. */
4406RVVCALL(OPFVV1, vfcvt_xu_f_v_h, OP_UU_H, H2, H2, float16_to_uint16)
4407RVVCALL(OPFVV1, vfcvt_xu_f_v_w, OP_UU_W, H4, H4, float32_to_uint32)
4408RVVCALL(OPFVV1, vfcvt_xu_f_v_d, OP_UU_D, H8, H8, float64_to_uint64)
4409GEN_VEXT_V_ENV(vfcvt_xu_f_v_h, 2)
4410GEN_VEXT_V_ENV(vfcvt_xu_f_v_w, 4)
4411GEN_VEXT_V_ENV(vfcvt_xu_f_v_d, 8)
4412
4413/* vfcvt.x.f.v vd, vs2, vm # Convert float to signed integer. */
4414RVVCALL(OPFVV1, vfcvt_x_f_v_h, OP_UU_H, H2, H2, float16_to_int16)
4415RVVCALL(OPFVV1, vfcvt_x_f_v_w, OP_UU_W, H4, H4, float32_to_int32)
4416RVVCALL(OPFVV1, vfcvt_x_f_v_d, OP_UU_D, H8, H8, float64_to_int64)
4417GEN_VEXT_V_ENV(vfcvt_x_f_v_h, 2)
4418GEN_VEXT_V_ENV(vfcvt_x_f_v_w, 4)
4419GEN_VEXT_V_ENV(vfcvt_x_f_v_d, 8)
4420
4421/* vfcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to float. */
4422RVVCALL(OPFVV1, vfcvt_f_xu_v_h, OP_UU_H, H2, H2, uint16_to_float16)
4423RVVCALL(OPFVV1, vfcvt_f_xu_v_w, OP_UU_W, H4, H4, uint32_to_float32)
4424RVVCALL(OPFVV1, vfcvt_f_xu_v_d, OP_UU_D, H8, H8, uint64_to_float64)
4425GEN_VEXT_V_ENV(vfcvt_f_xu_v_h, 2)
4426GEN_VEXT_V_ENV(vfcvt_f_xu_v_w, 4)
4427GEN_VEXT_V_ENV(vfcvt_f_xu_v_d, 8)
4428
4429/* vfcvt.f.x.v vd, vs2, vm # Convert integer to float. */
4430RVVCALL(OPFVV1, vfcvt_f_x_v_h, OP_UU_H, H2, H2, int16_to_float16)
4431RVVCALL(OPFVV1, vfcvt_f_x_v_w, OP_UU_W, H4, H4, int32_to_float32)
4432RVVCALL(OPFVV1, vfcvt_f_x_v_d, OP_UU_D, H8, H8, int64_to_float64)
4433GEN_VEXT_V_ENV(vfcvt_f_x_v_h, 2)
4434GEN_VEXT_V_ENV(vfcvt_f_x_v_w, 4)
4435GEN_VEXT_V_ENV(vfcvt_f_x_v_d, 8)
4436
4437/* Widening Floating-Point/Integer Type-Convert Instructions */
4438/* (TD, T2, TX2) */
4439#define WOP_UU_B uint16_t, uint8_t,  uint8_t
4440#define WOP_UU_H uint32_t, uint16_t, uint16_t
4441#define WOP_UU_W uint64_t, uint32_t, uint32_t
4442/* vfwcvt.xu.f.v vd, vs2, vm # Convert float to double-width unsigned integer. */
4443RVVCALL(OPFVV1, vfwcvt_xu_f_v_h, WOP_UU_H, H4, H2, float16_to_uint32)
4444RVVCALL(OPFVV1, vfwcvt_xu_f_v_w, WOP_UU_W, H8, H4, float32_to_uint64)
4445GEN_VEXT_V_ENV(vfwcvt_xu_f_v_h, 4)
4446GEN_VEXT_V_ENV(vfwcvt_xu_f_v_w, 8)
4447
4448/* vfwcvt.x.f.v vd, vs2, vm # Convert float to double-width signed integer. */
4449RVVCALL(OPFVV1, vfwcvt_x_f_v_h, WOP_UU_H, H4, H2, float16_to_int32)
4450RVVCALL(OPFVV1, vfwcvt_x_f_v_w, WOP_UU_W, H8, H4, float32_to_int64)
4451GEN_VEXT_V_ENV(vfwcvt_x_f_v_h, 4)
4452GEN_VEXT_V_ENV(vfwcvt_x_f_v_w, 8)
4453
4454/* vfwcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to double-width float */
4455RVVCALL(OPFVV1, vfwcvt_f_xu_v_b, WOP_UU_B, H2, H1, uint8_to_float16)
4456RVVCALL(OPFVV1, vfwcvt_f_xu_v_h, WOP_UU_H, H4, H2, uint16_to_float32)
4457RVVCALL(OPFVV1, vfwcvt_f_xu_v_w, WOP_UU_W, H8, H4, uint32_to_float64)
4458GEN_VEXT_V_ENV(vfwcvt_f_xu_v_b, 2)
4459GEN_VEXT_V_ENV(vfwcvt_f_xu_v_h, 4)
4460GEN_VEXT_V_ENV(vfwcvt_f_xu_v_w, 8)
4461
4462/* vfwcvt.f.x.v vd, vs2, vm # Convert integer to double-width float. */
4463RVVCALL(OPFVV1, vfwcvt_f_x_v_b, WOP_UU_B, H2, H1, int8_to_float16)
4464RVVCALL(OPFVV1, vfwcvt_f_x_v_h, WOP_UU_H, H4, H2, int16_to_float32)
4465RVVCALL(OPFVV1, vfwcvt_f_x_v_w, WOP_UU_W, H8, H4, int32_to_float64)
4466GEN_VEXT_V_ENV(vfwcvt_f_x_v_b, 2)
4467GEN_VEXT_V_ENV(vfwcvt_f_x_v_h, 4)
4468GEN_VEXT_V_ENV(vfwcvt_f_x_v_w, 8)
4469
4470/*
4471 * vfwcvt.f.f.v vd, vs2, vm
4472 * Convert single-width float to double-width float.
4473 */
4474static uint32_t vfwcvtffv16(uint16_t a, float_status *s)
4475{
4476    return float16_to_float32(a, true, s);
4477}
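
/*
 * The wrapper is needed because float16_to_float32() takes an extra
 * "ieee" flag (selecting IEEE half-precision rather than the alternative
 * format); binding it to true leaves a conversion that takes just the
 * input element and a float_status pointer, which is the shape OPFVV1
 * expects.  float32_to_float64() already has that shape and is used
 * directly below.
 */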
4478
4479RVVCALL(OPFVV1, vfwcvt_f_f_v_h, WOP_UU_H, H4, H2, vfwcvtffv16)
4480RVVCALL(OPFVV1, vfwcvt_f_f_v_w, WOP_UU_W, H8, H4, float32_to_float64)
4481GEN_VEXT_V_ENV(vfwcvt_f_f_v_h, 4)
4482GEN_VEXT_V_ENV(vfwcvt_f_f_v_w, 8)
4483
4484/* Narrowing Floating-Point/Integer Type-Convert Instructions */
4485/* (TD, T2, TX2) */
4486#define NOP_UU_B uint8_t,  uint16_t, uint32_t
4487#define NOP_UU_H uint16_t, uint32_t, uint32_t
4488#define NOP_UU_W uint32_t, uint64_t, uint64_t
4489/* vfncvt.xu.f.v vd, vs2, vm # Convert double-width float to unsigned integer. */
4490RVVCALL(OPFVV1, vfncvt_xu_f_w_b, NOP_UU_B, H1, H2, float16_to_uint8)
4491RVVCALL(OPFVV1, vfncvt_xu_f_w_h, NOP_UU_H, H2, H4, float32_to_uint16)
4492RVVCALL(OPFVV1, vfncvt_xu_f_w_w, NOP_UU_W, H4, H8, float64_to_uint32)
4493GEN_VEXT_V_ENV(vfncvt_xu_f_w_b, 1)
4494GEN_VEXT_V_ENV(vfncvt_xu_f_w_h, 2)
4495GEN_VEXT_V_ENV(vfncvt_xu_f_w_w, 4)
4496
4497/* vfncvt.x.f.v vd, vs2, vm # Convert double-width float to signed integer. */
4498RVVCALL(OPFVV1, vfncvt_x_f_w_b, NOP_UU_B, H1, H2, float16_to_int8)
4499RVVCALL(OPFVV1, vfncvt_x_f_w_h, NOP_UU_H, H2, H4, float32_to_int16)
4500RVVCALL(OPFVV1, vfncvt_x_f_w_w, NOP_UU_W, H4, H8, float64_to_int32)
4501GEN_VEXT_V_ENV(vfncvt_x_f_w_b, 1)
4502GEN_VEXT_V_ENV(vfncvt_x_f_w_h, 2)
4503GEN_VEXT_V_ENV(vfncvt_x_f_w_w, 4)
4504
4505/* vfncvt.f.xu.v vd, vs2, vm # Convert double-width unsigned integer to float */
4506RVVCALL(OPFVV1, vfncvt_f_xu_w_h, NOP_UU_H, H2, H4, uint32_to_float16)
4507RVVCALL(OPFVV1, vfncvt_f_xu_w_w, NOP_UU_W, H4, H8, uint64_to_float32)
4508GEN_VEXT_V_ENV(vfncvt_f_xu_w_h, 2)
4509GEN_VEXT_V_ENV(vfncvt_f_xu_w_w, 4)
4510
4511/* vfncvt.f.x.v vd, vs2, vm # Convert double-width integer to float. */
4512RVVCALL(OPFVV1, vfncvt_f_x_w_h, NOP_UU_H, H2, H4, int32_to_float16)
4513RVVCALL(OPFVV1, vfncvt_f_x_w_w, NOP_UU_W, H4, H8, int64_to_float32)
4514GEN_VEXT_V_ENV(vfncvt_f_x_w_h, 2)
4515GEN_VEXT_V_ENV(vfncvt_f_x_w_w, 4)
4516
4517/* vfncvt.f.f.v vd, vs2, vm # Convert double float to single-width float. */
4518static uint16_t vfncvtffv16(uint32_t a, float_status *s)
4519{
4520    return float32_to_float16(a, true, s);
4521}
4522
4523RVVCALL(OPFVV1, vfncvt_f_f_w_h, NOP_UU_H, H2, H4, vfncvtffv16)
4524RVVCALL(OPFVV1, vfncvt_f_f_w_w, NOP_UU_W, H4, H8, float64_to_float32)
4525GEN_VEXT_V_ENV(vfncvt_f_f_w_h, 2)
4526GEN_VEXT_V_ENV(vfncvt_f_f_w_w, 4)
4527
4528/*
4529 *** Vector Reduction Operations
4530 */
4531/* Vector Single-Width Integer Reduction Instructions */
4532#define GEN_VEXT_RED(NAME, TD, TS2, HD, HS2, OP)          \
4533void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4534        void *vs2, CPURISCVState *env, uint32_t desc)     \
4535{                                                         \
4536    uint32_t vm = vext_vm(desc);                          \
4537    uint32_t vl = env->vl;                                \
4538    uint32_t esz = sizeof(TD);                            \
4539    uint32_t vlenb = simd_maxsz(desc);                    \
4540    uint32_t vta = vext_vta(desc);                        \
4541    uint32_t i;                                           \
4542    TD s1 =  *((TD *)vs1 + HD(0));                        \
4543                                                          \
4544    for (i = env->vstart; i < vl; i++) {                  \
4545        TS2 s2 = *((TS2 *)vs2 + HS2(i));                  \
4546        if (!vm && !vext_elem_mask(v0, i)) {              \
4547            continue;                                     \
4548        }                                                 \
4549        s1 = OP(s1, (TD)s2);                              \
4550    }                                                     \
4551    *((TD *)vd + HD(0)) = s1;                             \
4552    env->vstart = 0;                                      \
4553    /* set tail elements to 1s */                         \
4554    vext_set_elems_1s(vd, vta, esz, vlenb);               \
4555}
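
/*
 * In other words each reduction computes
 *
 *     vd[0] = OP(... OP(OP(vs1[0], vs2[0]), vs2[1]) ..., vs2[vl - 1])
 *
 * with masked-off elements skipped.  For example vredsum.vs with vl = 4,
 * vs1[0] = 10 and vs2 = {1, 2, 3, 4} (all active) yields vd[0] = 20.
 * Only element 0 of vd is written; everything past the first ESZ bytes is
 * treated as tail, hence the vext_set_elems_1s(vd, vta, esz, vlenb) call.
 */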
4556
4557/* vd[0] = sum(vs1[0], vs2[*]) */
4558GEN_VEXT_RED(vredsum_vs_b, int8_t,  int8_t,  H1, H1, DO_ADD)
4559GEN_VEXT_RED(vredsum_vs_h, int16_t, int16_t, H2, H2, DO_ADD)
4560GEN_VEXT_RED(vredsum_vs_w, int32_t, int32_t, H4, H4, DO_ADD)
4561GEN_VEXT_RED(vredsum_vs_d, int64_t, int64_t, H8, H8, DO_ADD)
4562
4563/* vd[0] = maxu(vs1[0], vs2[*]) */
4564GEN_VEXT_RED(vredmaxu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MAX)
4565GEN_VEXT_RED(vredmaxu_vs_h, uint16_t, uint16_t, H2, H2, DO_MAX)
4566GEN_VEXT_RED(vredmaxu_vs_w, uint32_t, uint32_t, H4, H4, DO_MAX)
4567GEN_VEXT_RED(vredmaxu_vs_d, uint64_t, uint64_t, H8, H8, DO_MAX)
4568
4569/* vd[0] = max(vs1[0], vs2[*]) */
4570GEN_VEXT_RED(vredmax_vs_b, int8_t,  int8_t,  H1, H1, DO_MAX)
4571GEN_VEXT_RED(vredmax_vs_h, int16_t, int16_t, H2, H2, DO_MAX)
4572GEN_VEXT_RED(vredmax_vs_w, int32_t, int32_t, H4, H4, DO_MAX)
4573GEN_VEXT_RED(vredmax_vs_d, int64_t, int64_t, H8, H8, DO_MAX)
4574
4575/* vd[0] = minu(vs1[0], vs2[*]) */
4576GEN_VEXT_RED(vredminu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MIN)
4577GEN_VEXT_RED(vredminu_vs_h, uint16_t, uint16_t, H2, H2, DO_MIN)
4578GEN_VEXT_RED(vredminu_vs_w, uint32_t, uint32_t, H4, H4, DO_MIN)
4579GEN_VEXT_RED(vredminu_vs_d, uint64_t, uint64_t, H8, H8, DO_MIN)
4580
4581/* vd[0] = min(vs1[0], vs2[*]) */
4582GEN_VEXT_RED(vredmin_vs_b, int8_t,  int8_t,  H1, H1, DO_MIN)
4583GEN_VEXT_RED(vredmin_vs_h, int16_t, int16_t, H2, H2, DO_MIN)
4584GEN_VEXT_RED(vredmin_vs_w, int32_t, int32_t, H4, H4, DO_MIN)
4585GEN_VEXT_RED(vredmin_vs_d, int64_t, int64_t, H8, H8, DO_MIN)
4586
4587/* vd[0] = and(vs1[0], vs2[*]) */
4588GEN_VEXT_RED(vredand_vs_b, int8_t,  int8_t,  H1, H1, DO_AND)
4589GEN_VEXT_RED(vredand_vs_h, int16_t, int16_t, H2, H2, DO_AND)
4590GEN_VEXT_RED(vredand_vs_w, int32_t, int32_t, H4, H4, DO_AND)
4591GEN_VEXT_RED(vredand_vs_d, int64_t, int64_t, H8, H8, DO_AND)
4592
4593/* vd[0] = or(vs1[0], vs2[*]) */
4594GEN_VEXT_RED(vredor_vs_b, int8_t,  int8_t,  H1, H1, DO_OR)
4595GEN_VEXT_RED(vredor_vs_h, int16_t, int16_t, H2, H2, DO_OR)
4596GEN_VEXT_RED(vredor_vs_w, int32_t, int32_t, H4, H4, DO_OR)
4597GEN_VEXT_RED(vredor_vs_d, int64_t, int64_t, H8, H8, DO_OR)
4598
4599/* vd[0] = xor(vs1[0], vs2[*]) */
4600GEN_VEXT_RED(vredxor_vs_b, int8_t,  int8_t,  H1, H1, DO_XOR)
4601GEN_VEXT_RED(vredxor_vs_h, int16_t, int16_t, H2, H2, DO_XOR)
4602GEN_VEXT_RED(vredxor_vs_w, int32_t, int32_t, H4, H4, DO_XOR)
4603GEN_VEXT_RED(vredxor_vs_d, int64_t, int64_t, H8, H8, DO_XOR)
4604
4605/* Vector Widening Integer Reduction Instructions */
4606/* Signed sum reduction into double-width accumulator */
4607GEN_VEXT_RED(vwredsum_vs_b, int16_t, int8_t,  H2, H1, DO_ADD)
4608GEN_VEXT_RED(vwredsum_vs_h, int32_t, int16_t, H4, H2, DO_ADD)
4609GEN_VEXT_RED(vwredsum_vs_w, int64_t, int32_t, H8, H4, DO_ADD)
4610
4611/* Unsigned sum reduction into double-width accumulator */
4612GEN_VEXT_RED(vwredsumu_vs_b, uint16_t, uint8_t,  H2, H1, DO_ADD)
4613GEN_VEXT_RED(vwredsumu_vs_h, uint32_t, uint16_t, H4, H2, DO_ADD)
4614GEN_VEXT_RED(vwredsumu_vs_w, uint64_t, uint32_t, H8, H4, DO_ADD)
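
/*
 * The widening forms reuse GEN_VEXT_RED as-is: the (TD)s2 cast in the
 * macro body widens each source element to the double-width accumulator
 * type first, a sign extension for vwredsum (signed types) and a zero
 * extension for vwredsumu (unsigned types).
 */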
4615
4616/* Vector Single-Width Floating-Point Reduction Instructions */
4617#define GEN_VEXT_FRED(NAME, TD, TS2, HD, HS2, OP)          \
4618void HELPER(NAME)(void *vd, void *v0, void *vs1,           \
4619                  void *vs2, CPURISCVState *env,           \
4620                  uint32_t desc)                           \
4621{                                                          \
4622    uint32_t vm = vext_vm(desc);                           \
4623    uint32_t vl = env->vl;                                 \
4624    uint32_t esz = sizeof(TD);                             \
4625    uint32_t vlenb = simd_maxsz(desc);                     \
4626    uint32_t vta = vext_vta(desc);                         \
4627    uint32_t i;                                            \
4628    TD s1 =  *((TD *)vs1 + HD(0));                         \
4629                                                           \
4630    for (i = env->vstart; i < vl; i++) {                   \
4631        TS2 s2 = *((TS2 *)vs2 + HS2(i));                   \
4632        if (!vm && !vext_elem_mask(v0, i)) {               \
4633            continue;                                      \
4634        }                                                  \
4635        s1 = OP(s1, (TD)s2, &env->fp_status);              \
4636    }                                                      \
4637    *((TD *)vd + HD(0)) = s1;                              \
4638    env->vstart = 0;                                       \
4639    /* set tail elements to 1s */                          \
4640    vext_set_elems_1s(vd, vta, esz, vlenb);                \
4641}
4642
4643/* Unordered sum */
4644GEN_VEXT_FRED(vfredsum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4645GEN_VEXT_FRED(vfredsum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4646GEN_VEXT_FRED(vfredsum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
4647
4648/* Maximum value */
4649GEN_VEXT_FRED(vfredmax_vs_h, uint16_t, uint16_t, H2, H2, float16_maximum_number)
4650GEN_VEXT_FRED(vfredmax_vs_w, uint32_t, uint32_t, H4, H4, float32_maximum_number)
4651GEN_VEXT_FRED(vfredmax_vs_d, uint64_t, uint64_t, H8, H8, float64_maximum_number)
4652
4653/* Minimum value */
4654GEN_VEXT_FRED(vfredmin_vs_h, uint16_t, uint16_t, H2, H2, float16_minimum_number)
4655GEN_VEXT_FRED(vfredmin_vs_w, uint32_t, uint32_t, H4, H4, float32_minimum_number)
4656GEN_VEXT_FRED(vfredmin_vs_d, uint64_t, uint64_t, H8, H8, float64_minimum_number)
4657
4658/* Vector Widening Floating-Point Reduction Instructions */
4659/* Unordered reduce 2*SEW = 2*SEW + sum(promote(SEW)) */
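/*
 * The two helpers below are open-coded rather than generated from
 * GEN_VEXT_FRED because every source element has to be promoted
 * (float16_to_float32 / float32_to_float64) before it is added to the
 * double-width accumulator, which does not fit that macro's single-width
 * OP signature.
 */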
4660void HELPER(vfwredsum_vs_h)(void *vd, void *v0, void *vs1,
4661                            void *vs2, CPURISCVState *env, uint32_t desc)
4662{
4663    uint32_t vm = vext_vm(desc);
4664    uint32_t vl = env->vl;
4665    uint32_t esz = sizeof(uint32_t);
4666    uint32_t vlenb = simd_maxsz(desc);
4667    uint32_t vta = vext_vta(desc);
4668    uint32_t i;
4669    uint32_t s1 =  *((uint32_t *)vs1 + H4(0));
4670
4671    for (i = env->vstart; i < vl; i++) {
4672        uint16_t s2 = *((uint16_t *)vs2 + H2(i));
4673        if (!vm && !vext_elem_mask(v0, i)) {
4674            continue;
4675        }
4676        s1 = float32_add(s1, float16_to_float32(s2, true, &env->fp_status),
4677                         &env->fp_status);
4678    }
4679    *((uint32_t *)vd + H4(0)) = s1;
4680    env->vstart = 0;
4681    /* set tail elements to 1s */
4682    vext_set_elems_1s(vd, vta, esz, vlenb);
4683}
4684
4685void HELPER(vfwredsum_vs_w)(void *vd, void *v0, void *vs1,
4686                            void *vs2, CPURISCVState *env, uint32_t desc)
4687{
4688    uint32_t vm = vext_vm(desc);
4689    uint32_t vl = env->vl;
4690    uint32_t esz = sizeof(uint64_t);
4691    uint32_t vlenb = simd_maxsz(desc);
4692    uint32_t vta = vext_vta(desc);
4693    uint32_t i;
4694    uint64_t s1 =  *((uint64_t *)vs1);
4695
4696    for (i = env->vstart; i < vl; i++) {
4697        uint32_t s2 = *((uint32_t *)vs2 + H4(i));
4698        if (!vm && !vext_elem_mask(v0, i)) {
4699            continue;
4700        }
4701        s1 = float64_add(s1, float32_to_float64(s2, &env->fp_status),
4702                         &env->fp_status);
4703    }
4704    *((uint64_t *)vd) = s1;
4705    env->vstart = 0;
4706    /* set tail elements to 1s */
4707    vext_set_elems_1s(vd, vta, esz, vlenb);
4708}
4709
4710/*
4711 *** Vector Mask Operations
4712 */
4713/* Vector Mask-Register Logical Instructions */
4714#define GEN_VEXT_MASK_VV(NAME, OP)                        \
4715void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4716                  void *vs2, CPURISCVState *env,          \
4717                  uint32_t desc)                          \
4718{                                                         \
4719    uint32_t vl = env->vl;                                \
4720    uint32_t total_elems = env_archcpu(env)->cfg.vlen;    \
4721    uint32_t vta_all_1s = vext_vta_all_1s(desc);          \
4722    uint32_t i;                                           \
4723    int a, b;                                             \
4724                                                          \
4725    for (i = env->vstart; i < vl; i++) {                  \
4726        a = vext_elem_mask(vs1, i);                       \
4727        b = vext_elem_mask(vs2, i);                       \
4728        vext_set_elem_mask(vd, i, OP(b, a));              \
4729    }                                                     \
4730    env->vstart = 0;                                      \
4731    /* mask destination register is always tail-          \
4732     * agnostic                                           \
4733     */                                                   \
4734    /* set tail elements to 1s */                         \
4735    if (vta_all_1s) {                                     \
4736        for (; i < total_elems; i++) {                    \
4737            vext_set_elem_mask(vd, i, 1);                 \
4738        }                                                 \
4739    }                                                     \
4740}
4741
4742#define DO_NAND(N, M)  (!(N & M))
4743#define DO_ANDNOT(N, M)  (N & !M)
4744#define DO_NOR(N, M)  (!(N | M))
4745#define DO_ORNOT(N, M)  (N | !M)
4746#define DO_XNOR(N, M)  (!(N ^ M))
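
/*
 * The operands are single mask bits (0 or 1) as returned by
 * vext_elem_mask(), so mixing bitwise and logical operators here is
 * fine, e.g. DO_ANDNOT(1, 0) = 1 & !0 = 1.  Note also the OP(b, a)
 * argument order in GEN_VEXT_MASK_VV: the complemented operand of
 * vmandn/vmorn is vs1, giving vd = vs2 & ~vs1 and vd = vs2 | ~vs1 as
 * the spec requires.
 */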
4747
4748GEN_VEXT_MASK_VV(vmand_mm, DO_AND)
4749GEN_VEXT_MASK_VV(vmnand_mm, DO_NAND)
4750GEN_VEXT_MASK_VV(vmandn_mm, DO_ANDNOT)
4751GEN_VEXT_MASK_VV(vmxor_mm, DO_XOR)
4752GEN_VEXT_MASK_VV(vmor_mm, DO_OR)
4753GEN_VEXT_MASK_VV(vmnor_mm, DO_NOR)
4754GEN_VEXT_MASK_VV(vmorn_mm, DO_ORNOT)
4755GEN_VEXT_MASK_VV(vmxnor_mm, DO_XNOR)
4756
4757/* Vector count population in mask vcpop */
4758target_ulong HELPER(vcpop_m)(void *v0, void *vs2, CPURISCVState *env,
4759                             uint32_t desc)
4760{
4761    target_ulong cnt = 0;
4762    uint32_t vm = vext_vm(desc);
4763    uint32_t vl = env->vl;
4764    int i;
4765
4766    for (i = env->vstart; i < vl; i++) {
4767        if (vm || vext_elem_mask(v0, i)) {
4768            if (vext_elem_mask(vs2, i)) {
4769                cnt++;
4770            }
4771        }
4772    }
4773    env->vstart = 0;
4774    return cnt;
4775}
4776
4777/* vfirst find-first-set mask bit */
4778target_ulong HELPER(vfirst_m)(void *v0, void *vs2, CPURISCVState *env,
4779                              uint32_t desc)
4780{
4781    uint32_t vm = vext_vm(desc);
4782    uint32_t vl = env->vl;
4783    int i;
4784
4785    for (i = env->vstart; i < vl; i++) {
4786        if (vm || vext_elem_mask(v0, i)) {
4787            if (vext_elem_mask(vs2, i)) {
4788                return i;
4789            }
4790        }
4791    }
4792    env->vstart = 0;
4793    return -1LL;
4794}
4795
4796enum set_mask_type {
4797    ONLY_FIRST = 1,
4798    INCLUDE_FIRST,
4799    BEFORE_FIRST,
4800};
4801
4802static void vmsetm(void *vd, void *v0, void *vs2, CPURISCVState *env,
4803                   uint32_t desc, enum set_mask_type type)
4804{
4805    uint32_t vm = vext_vm(desc);
4806    uint32_t vl = env->vl;
4807    uint32_t total_elems = env_archcpu(env)->cfg.vlen;
4808    uint32_t vta_all_1s = vext_vta_all_1s(desc);
4809    int i;
4810    bool first_mask_bit = false;
4811
4812    for (i = env->vstart; i < vl; i++) {
4813        if (!vm && !vext_elem_mask(v0, i)) {
4814            continue;
4815        }
4816        /* write a zero to all following active elements */
4817        if (first_mask_bit) {
4818            vext_set_elem_mask(vd, i, 0);
4819            continue;
4820        }
4821        if (vext_elem_mask(vs2, i)) {
4822            first_mask_bit = true;
4823            if (type == BEFORE_FIRST) {
4824                vext_set_elem_mask(vd, i, 0);
4825            } else {
4826                vext_set_elem_mask(vd, i, 1);
4827            }
4828        } else {
4829            if (type == ONLY_FIRST) {
4830                vext_set_elem_mask(vd, i, 0);
4831            } else {
4832                vext_set_elem_mask(vd, i, 1);
4833            }
4834        }
4835    }
4836    env->vstart = 0;
4837    /* mask destination register is always tail-agnostic */
4838    /* set tail elements to 1s */
4839    if (vta_all_1s) {
4840        for (; i < total_elems; i++) {
4841            vext_set_elem_mask(vd, i, 1);
4842        }
4843    }
4844}
4845
4846void HELPER(vmsbf_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4847                     uint32_t desc)
4848{
4849    vmsetm(vd, v0, vs2, env, desc, BEFORE_FIRST);
4850}
4851
4852void HELPER(vmsif_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4853                     uint32_t desc)
4854{
4855    vmsetm(vd, v0, vs2, env, desc, INCLUDE_FIRST);
4856}
4857
4858void HELPER(vmsof_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4859                     uint32_t desc)
4860{
4861    vmsetm(vd, v0, vs2, env, desc, ONLY_FIRST);
4862}
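
/*
 * Worked example (element 0 on the left, all elements active):
 *
 *     vs2 mask      0 0 1 0 1 0
 *     vmsbf.m  vd   1 1 0 0 0 0    set-before-first
 *     vmsif.m  vd   1 1 1 0 0 0    set-including-first
 *     vmsof.m  vd   0 0 1 0 0 0    set-only-first
 */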
4863
4864/* Vector Iota Instruction */
4865#define GEN_VEXT_VIOTA_M(NAME, ETYPE, H)                                  \
4866void HELPER(NAME)(void *vd, void *v0, void *vs2, CPURISCVState *env,      \
4867                  uint32_t desc)                                          \
4868{                                                                         \
4869    uint32_t vm = vext_vm(desc);                                          \
4870    uint32_t vl = env->vl;                                                \
4871    uint32_t esz = sizeof(ETYPE);                                         \
4872    uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4873    uint32_t vta = vext_vta(desc);                                        \
4874    uint32_t sum = 0;                                                     \
4875    int i;                                                                \
4876                                                                          \
4877    for (i = env->vstart; i < vl; i++) {                                  \
4878        if (!vm && !vext_elem_mask(v0, i)) {                              \
4879            continue;                                                     \
4880        }                                                                 \
4881        *((ETYPE *)vd + H(i)) = sum;                                      \
4882        if (vext_elem_mask(vs2, i)) {                                     \
4883            sum++;                                                        \
4884        }                                                                 \
4885    }                                                                     \
4886    env->vstart = 0;                                                      \
4887    /* set tail elements to 1s */                                         \
4888    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4889}
4890
4891GEN_VEXT_VIOTA_M(viota_m_b, uint8_t,  H1)
4892GEN_VEXT_VIOTA_M(viota_m_h, uint16_t, H2)
4893GEN_VEXT_VIOTA_M(viota_m_w, uint32_t, H4)
4894GEN_VEXT_VIOTA_M(viota_m_d, uint64_t, H8)
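
/*
 * Worked example (element 0 on the left, all elements active):
 *
 *     vs2 mask   1 0 0 1 0 0 0 1
 *     viota.m    0 1 1 1 2 2 2 2
 *
 * i.e. each destination element receives the count of mask bits set in
 * vs2 at indices strictly below its own.
 */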
4895
4896/* Vector Element Index Instruction */
4897#define GEN_VEXT_VID_V(NAME, ETYPE, H)                                    \
4898void HELPER(NAME)(void *vd, void *v0, CPURISCVState *env, uint32_t desc)  \
4899{                                                                         \
4900    uint32_t vm = vext_vm(desc);                                          \
4901    uint32_t vl = env->vl;                                                \
4902    uint32_t esz = sizeof(ETYPE);                                         \
4903    uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4904    uint32_t vta = vext_vta(desc);                                        \
4905    int i;                                                                \
4906                                                                          \
4907    for (i = env->vstart; i < vl; i++) {                                  \
4908        if (!vm && !vext_elem_mask(v0, i)) {                              \
4909            continue;                                                     \
4910        }                                                                 \
4911        *((ETYPE *)vd + H(i)) = i;                                        \
4912    }                                                                     \
4913    env->vstart = 0;                                                      \
4914    /* set tail elements to 1s */                                         \
4915    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4916}
4917
4918GEN_VEXT_VID_V(vid_v_b, uint8_t,  H1)
4919GEN_VEXT_VID_V(vid_v_h, uint16_t, H2)
4920GEN_VEXT_VID_V(vid_v_w, uint32_t, H4)
4921GEN_VEXT_VID_V(vid_v_d, uint64_t, H8)
4922
4923/*
4924 *** Vector Permutation Instructions
4925 */
4926
4927/* Vector Slide Instructions */
4928#define GEN_VEXT_VSLIDEUP_VX(NAME, ETYPE, H)                              \
4929void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
4930                  CPURISCVState *env, uint32_t desc)                      \
4931{                                                                         \
4932    uint32_t vm = vext_vm(desc);                                          \
4933    uint32_t vl = env->vl;                                                \
4934    uint32_t esz = sizeof(ETYPE);                                         \
4935    uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4936    uint32_t vta = vext_vta(desc);                                        \
4937    target_ulong offset = s1, i_min, i;                                   \
4938                                                                          \
4939    i_min = MAX(env->vstart, offset);                                     \
4940    for (i = i_min; i < vl; i++) {                                        \
4941        if (!vm && !vext_elem_mask(v0, i)) {                              \
4942            continue;                                                     \
4943        }                                                                 \
4944        *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - offset));          \
4945    }                                                                     \
    env->vstart = 0;                                                      \
4946    /* set tail elements to 1s */                                         \
4947    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4948}
4949
4950/* vslideup.vx vd, vs2, rs1, vm # vd[i+rs1] = vs2[i] */
4951GEN_VEXT_VSLIDEUP_VX(vslideup_vx_b, uint8_t,  H1)
4952GEN_VEXT_VSLIDEUP_VX(vslideup_vx_h, uint16_t, H2)
4953GEN_VEXT_VSLIDEUP_VX(vslideup_vx_w, uint32_t, H4)
4954GEN_VEXT_VSLIDEUP_VX(vslideup_vx_d, uint64_t, H8)
4955
4956#define GEN_VEXT_VSLIDEDOWN_VX(NAME, ETYPE, H)                            \
4957void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
4958                  CPURISCVState *env, uint32_t desc)                      \
4959{                                                                         \
4960    uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
4961    uint32_t vm = vext_vm(desc);                                          \
4962    uint32_t vl = env->vl;                                                \
4963    uint32_t esz = sizeof(ETYPE);                                         \
4964    uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4965    uint32_t vta = vext_vta(desc);                                        \
4966    target_ulong i_max, i;                                                \
4967                                                                          \
4968    i_max = MAX(MIN(s1 < vlmax ? vlmax - s1 : 0, vl), env->vstart);       \
4969    for (i = env->vstart; i < i_max; ++i) {                               \
4970        if (vm || vext_elem_mask(v0, i)) {                                \
4971            *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + s1));          \
4972        }                                                                 \
4973    }                                                                     \
4974                                                                          \
4975    for (i = i_max; i < vl; ++i) {                                        \
4976        if (vm || vext_elem_mask(v0, i)) {                                \
4977            *((ETYPE *)vd + H(i)) = 0;                                    \
4978        }                                                                 \
4979    }                                                                     \
4980                                                                          \
4981    env->vstart = 0;                                                      \
4982    /* set tail elements to 1s */                                         \
4983    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4984}
4985
4986/* vslidedown.vx vd, vs2, rs1, vm # vd[i] = vs2[i+rs1] */
4987GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_b, uint8_t,  H1)
4988GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_h, uint16_t, H2)
4989GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_w, uint32_t, H4)
4990GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_d, uint64_t, H8)
4991
4992#define GEN_VEXT_VSLIE1UP(BITWIDTH, H)                                      \
4993static void vslide1up_##BITWIDTH(void *vd, void *v0, target_ulong s1,       \
4994                     void *vs2, CPURISCVState *env, uint32_t desc)          \
4995{                                                                           \
4996    typedef uint##BITWIDTH##_t ETYPE;                                       \
4997    uint32_t vm = vext_vm(desc);                                            \
4998    uint32_t vl = env->vl;                                                  \
4999    uint32_t esz = sizeof(ETYPE);                                           \
5000    uint32_t total_elems = vext_get_total_elems(env, desc, esz);            \
5001    uint32_t vta = vext_vta(desc);                                          \
5002    uint32_t i;                                                             \
5003                                                                            \
5004    for (i = env->vstart; i < vl; i++) {                                    \
5005        if (!vm && !vext_elem_mask(v0, i)) {                                \
5006            continue;                                                       \
5007        }                                                                   \
5008        if (i == 0) {                                                       \
5009            *((ETYPE *)vd + H(i)) = s1;                                     \
5010        } else {                                                            \
5011            *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - 1));             \
5012        }                                                                   \
5013    }                                                                       \
5014    env->vstart = 0;                                                        \
5015    /* set tail elements to 1s */                                           \
5016    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                \
5017}
5018
5019GEN_VEXT_VSLIE1UP(8,  H1)
5020GEN_VEXT_VSLIE1UP(16, H2)
5021GEN_VEXT_VSLIE1UP(32, H4)
5022GEN_VEXT_VSLIE1UP(64, H8)
5023
5024#define GEN_VEXT_VSLIDE1UP_VX(NAME, BITWIDTH)                     \
5025void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
5026                  CPURISCVState *env, uint32_t desc)              \
5027{                                                                 \
5028    vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);             \
5029}
5030
5031/* vslide1up.vx vd, vs2, rs1, vm # vd[0]=x[rs1], vd[i+1] = vs2[i] */
5032GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_b, 8)
5033GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_h, 16)
5034GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_w, 32)
5035GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_d, 64)
5036
5037#define GEN_VEXT_VSLIDE1DOWN(BITWIDTH, H)                                     \
5038static void vslide1down_##BITWIDTH(void *vd, void *v0, target_ulong s1,       \
5039                       void *vs2, CPURISCVState *env, uint32_t desc)          \
5040{                                                                             \
5041    typedef uint##BITWIDTH##_t ETYPE;                                         \
5042    uint32_t vm = vext_vm(desc);                                              \
5043    uint32_t vl = env->vl;                                                    \
5044    uint32_t esz = sizeof(ETYPE);                                             \
5045    uint32_t total_elems = vext_get_total_elems(env, desc, esz);              \
5046    uint32_t vta = vext_vta(desc);                                            \
5047    uint32_t i;                                                               \
5048                                                                              \
5049    for (i = env->vstart; i < vl; i++) {                                      \
5050        if (!vm && !vext_elem_mask(v0, i)) {                                  \
5051            continue;                                                         \
5052        }                                                                     \
5053        if (i == vl - 1) {                                                    \
5054            *((ETYPE *)vd + H(i)) = s1;                                       \
5055        } else {                                                              \
5056            *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + 1));               \
5057        }                                                                     \
5058    }                                                                         \
5059    env->vstart = 0;                                                          \
5060    /* set tail elements to 1s */                                             \
5061    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                  \
5062}
5063
5064GEN_VEXT_VSLIDE1DOWN(8,  H1)
5065GEN_VEXT_VSLIDE1DOWN(16, H2)
5066GEN_VEXT_VSLIDE1DOWN(32, H4)
5067GEN_VEXT_VSLIDE1DOWN(64, H8)
5068
5069#define GEN_VEXT_VSLIDE1DOWN_VX(NAME, BITWIDTH)                   \
5070void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
5071                  CPURISCVState *env, uint32_t desc)              \
5072{                                                                 \
5073    vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);           \
5074}
5075
5076/* vslide1down.vx vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=x[rs1] */
5077GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_b, 8)
5078GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_h, 16)
5079GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_w, 32)
5080GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_d, 64)
5081
5082/* Vector Floating-Point Slide Instructions */
5083#define GEN_VEXT_VFSLIDE1UP_VF(NAME, BITWIDTH)                \
5084void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
5085                  CPURISCVState *env, uint32_t desc)          \
5086{                                                             \
5087    vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);         \
5088}
5089
5090/* vfslide1up.vf vd, vs2, rs1, vm # vd[0]=f[rs1], vd[i+1] = vs2[i] */
5091GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_h, 16)
5092GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_w, 32)
5093GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_d, 64)
5094
5095#define GEN_VEXT_VFSLIDE1DOWN_VF(NAME, BITWIDTH)              \
5096void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
5097                  CPURISCVState *env, uint32_t desc)          \
5098{                                                             \
5099    vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);       \
5100}
5101
5102/* vfslide1down.vf vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=f[rs1] */
5103GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_h, 16)
5104GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_w, 32)
5105GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_d, 64)
5106
5107/* Vector Register Gather Instruction */
5108#define GEN_VEXT_VRGATHER_VV(NAME, TS1, TS2, HS1, HS2)                    \
5109void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5110                  CPURISCVState *env, uint32_t desc)                      \
5111{                                                                         \
5112    uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(TS2)));             \
5113    uint32_t vm = vext_vm(desc);                                          \
5114    uint32_t vl = env->vl;                                                \
5115    uint32_t esz = sizeof(TS2);                                           \
5116    uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5117    uint32_t vta = vext_vta(desc);                                        \
5118    uint64_t index;                                                       \
5119    uint32_t i;                                                           \
5120                                                                          \
5121    for (i = env->vstart; i < vl; i++) {                                  \
5122        if (!vm && !vext_elem_mask(v0, i)) {                              \
5123            continue;                                                     \
5124        }                                                                 \
5125        index = *((TS1 *)vs1 + HS1(i));                                   \
5126        if (index >= vlmax) {                                             \
5127            *((TS2 *)vd + HS2(i)) = 0;                                    \
5128        } else {                                                          \
5129            *((TS2 *)vd + HS2(i)) = *((TS2 *)vs2 + HS2(index));           \
5130        }                                                                 \
5131    }                                                                     \
5132    env->vstart = 0;                                                      \
5133    /* set tail elements to 1s */                                         \
5134    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5135}
5136
5137/* vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]]; */
5138GEN_VEXT_VRGATHER_VV(vrgather_vv_b, uint8_t,  uint8_t,  H1, H1)
5139GEN_VEXT_VRGATHER_VV(vrgather_vv_h, uint16_t, uint16_t, H2, H2)
5140GEN_VEXT_VRGATHER_VV(vrgather_vv_w, uint32_t, uint32_t, H4, H4)
5141GEN_VEXT_VRGATHER_VV(vrgather_vv_d, uint64_t, uint64_t, H8, H8)
5142
5143GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_b, uint16_t, uint8_t,  H2, H1)
5144GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_h, uint16_t, uint16_t, H2, H2)
5145GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_w, uint16_t, uint32_t, H2, H4)
5146GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_d, uint16_t, uint64_t, H2, H8)
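
/*
 * For vrgatherei16 the index elements are always 16 bits wide (TS1 is
 * uint16_t) regardless of SEW, which is why the macro is parameterized
 * with separate index and data types: at SEW=8 this allows indices above
 * 255, and at SEW=32/64 it avoids spending a full-width element on each
 * index.
 */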
5147
5148#define GEN_VEXT_VRGATHER_VX(NAME, ETYPE, H)                              \
5149void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5150                  CPURISCVState *env, uint32_t desc)                      \
5151{                                                                         \
5152    uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
5153    uint32_t vm = vext_vm(desc);                                          \
5154    uint32_t vl = env->vl;                                                \
5155    uint32_t esz = sizeof(ETYPE);                                         \
5156    uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5157    uint32_t vta = vext_vta(desc);                                        \
5158    uint64_t index = s1;                                                  \
5159    uint32_t i;                                                           \
5160                                                                          \
5161    for (i = env->vstart; i < vl; i++) {                                  \
5162        if (!vm && !vext_elem_mask(v0, i)) {                              \
5163            continue;                                                     \
5164        }                                                                 \
5165        if (index >= vlmax) {                                             \
5166            *((ETYPE *)vd + H(i)) = 0;                                    \
5167        } else {                                                          \
5168            *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(index));           \
5169        }                                                                 \
5170    }                                                                     \
5171    env->vstart = 0;                                                      \
5172    /* set tail elements to 1s */                                         \
5173    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5174}
5175
5176/* vd[i] = (x[rs1] >= VLMAX) ? 0 : vs2[x[rs1]] */
5177GEN_VEXT_VRGATHER_VX(vrgather_vx_b, uint8_t,  H1)
5178GEN_VEXT_VRGATHER_VX(vrgather_vx_h, uint16_t, H2)
5179GEN_VEXT_VRGATHER_VX(vrgather_vx_w, uint32_t, H4)
5180GEN_VEXT_VRGATHER_VX(vrgather_vx_d, uint64_t, H8)
5181
5182/* Vector Compress Instruction */
5183#define GEN_VEXT_VCOMPRESS_VM(NAME, ETYPE, H)                             \
5184void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5185                  CPURISCVState *env, uint32_t desc)                      \
5186{                                                                         \
5187    uint32_t vl = env->vl;                                                \
5188    uint32_t esz = sizeof(ETYPE);                                         \
5189    uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5190    uint32_t vta = vext_vta(desc);                                        \
5191    uint32_t num = 0, i;                                                  \
5192                                                                          \
5193    for (i = env->vstart; i < vl; i++) {                                  \
5194        if (!vext_elem_mask(vs1, i)) {                                    \
5195            continue;                                                     \
5196        }                                                                 \
5197        *((ETYPE *)vd + H(num)) = *((ETYPE *)vs2 + H(i));                 \
5198        num++;                                                            \
5199    }                                                                     \
5200    env->vstart = 0;                                                      \
5201    /* set tail elements to 1s */                                         \
5202    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5203}
5204
5205/* Compress into vd elements of vs2 where vs1 is enabled */
5206GEN_VEXT_VCOMPRESS_VM(vcompress_vm_b, uint8_t,  H1)
5207GEN_VEXT_VCOMPRESS_VM(vcompress_vm_h, uint16_t, H2)
5208GEN_VEXT_VCOMPRESS_VM(vcompress_vm_w, uint32_t, H4)
5209GEN_VEXT_VCOMPRESS_VM(vcompress_vm_d, uint64_t, H8)
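
/*
 * Worked example with vl = 5, vs1 mask = 1 0 1 1 0 and
 * vs2 = {a, b, c, d, e}: the loop packs vd[0..2] = {a, c, d}.  Body
 * elements of vd past the packed ones are not written by the loop, and
 * elements at index vl and beyond get the usual tail treatment.
 */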
5210
5211/* Vector Whole Register Move */
5212void HELPER(vmvr_v)(void *vd, void *vs2, CPURISCVState *env, uint32_t desc)
5213{
5214    /* EEW = SEW */
5215    uint32_t maxsz = simd_maxsz(desc);
5216    uint32_t sewb = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
5217    uint32_t startb = env->vstart * sewb;
5218    uint32_t i = startb;
5219
5220    memcpy((uint8_t *)vd + H1(i),
5221           (uint8_t *)vs2 + H1(i),
5222           maxsz - startb);
5223
5224    env->vstart = 0;
5225}
5226
5227/* Vector Integer Extension */
5228#define GEN_VEXT_INT_EXT(NAME, ETYPE, DTYPE, HD, HS1)            \
5229void HELPER(NAME)(void *vd, void *v0, void *vs2,                 \
5230                  CPURISCVState *env, uint32_t desc)             \
5231{                                                                \
5232    uint32_t vl = env->vl;                                       \
5233    uint32_t vm = vext_vm(desc);                                 \
5234    uint32_t esz = sizeof(ETYPE);                                \
5235    uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
5236    uint32_t vta = vext_vta(desc);                               \
5237    uint32_t i;                                                  \
5238                                                                 \
5239    for (i = env->vstart; i < vl; i++) {                         \
5240        if (!vm && !vext_elem_mask(v0, i)) {                     \
5241            continue;                                            \
5242        }                                                        \
5243        *((ETYPE *)vd + HD(i)) = *((DTYPE *)vs2 + HS1(i));       \
5244    }                                                            \
5245    env->vstart = 0;                                             \
5246    /* set tail elements to 1s */                                \
5247    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);     \
5248}
5249
5250GEN_VEXT_INT_EXT(vzext_vf2_h, uint16_t, uint8_t,  H2, H1)
5251GEN_VEXT_INT_EXT(vzext_vf2_w, uint32_t, uint16_t, H4, H2)
5252GEN_VEXT_INT_EXT(vzext_vf2_d, uint64_t, uint32_t, H8, H4)
5253GEN_VEXT_INT_EXT(vzext_vf4_w, uint32_t, uint8_t,  H4, H1)
5254GEN_VEXT_INT_EXT(vzext_vf4_d, uint64_t, uint16_t, H8, H2)
5255GEN_VEXT_INT_EXT(vzext_vf8_d, uint64_t, uint8_t,  H8, H1)
5256
5257GEN_VEXT_INT_EXT(vsext_vf2_h, int16_t, int8_t,  H2, H1)
5258GEN_VEXT_INT_EXT(vsext_vf2_w, int32_t, int16_t, H4, H2)
5259GEN_VEXT_INT_EXT(vsext_vf2_d, int64_t, int32_t, H8, H4)
5260GEN_VEXT_INT_EXT(vsext_vf4_w, int32_t, int8_t,  H4, H1)
5261GEN_VEXT_INT_EXT(vsext_vf4_d, int64_t, int16_t, H8, H2)
5262GEN_VEXT_INT_EXT(vsext_vf8_d, int64_t, int8_t,  H8, H1)
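
/*
 * Example: with source byte elements {0x80, 0x01}, vzext.vf2 produces
 * {0x0080, 0x0001} while vsext.vf2 produces {0xff80, 0x0001}; the zero
 * versus sign extension falls out of the ETYPE/DTYPE signedness in the
 * assignment above.
 */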
5263