qemu/target/riscv/vector_helper.c
   1/*
   2 * RISC-V Vector Extension Helpers for QEMU.
   3 *
   4 * Copyright (c) 2020 T-Head Semiconductor Co., Ltd. All rights reserved.
   5 *
   6 * This program is free software; you can redistribute it and/or modify it
   7 * under the terms and conditions of the GNU General Public License,
   8 * version 2 or later, as published by the Free Software Foundation.
   9 *
  10 * This program is distributed in the hope it will be useful, but WITHOUT
  11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  13 * more details.
  14 *
  15 * You should have received a copy of the GNU General Public License along with
  16 * this program.  If not, see <http://www.gnu.org/licenses/>.
  17 */
  18
  19#include "qemu/osdep.h"
  20#include "qemu/host-utils.h"
  21#include "qemu/bitops.h"
  22#include "cpu.h"
  23#include "exec/memop.h"
  24#include "accel/tcg/cpu-ldst.h"
  25#include "accel/tcg/probe.h"
  26#include "exec/page-protection.h"
  27#include "exec/helper-proto.h"
  28#include "exec/tlb-flags.h"
  29#include "exec/target_page.h"
  30#include "fpu/softfloat.h"
  31#include "tcg/tcg-gvec-desc.h"
  32#include "internals.h"
  33#include "vector_internals.h"
  34#include <math.h>
  35
  36target_ulong HELPER(vsetvl)(CPURISCVState *env, target_ulong s1,
  37                            target_ulong s2, target_ulong x0)
  38{
  39    int vlmax, vl;
  40    RISCVCPU *cpu = env_archcpu(env);
  41    uint64_t vlmul = FIELD_EX64(s2, VTYPE, VLMUL);
  42    uint8_t vsew = FIELD_EX64(s2, VTYPE, VSEW);
  43    uint16_t sew = 8 << vsew;
  44    uint8_t ediv = FIELD_EX64(s2, VTYPE, VEDIV);
  45    int xlen = riscv_cpu_xlen(env);
  46    bool vill = (s2 >> (xlen - 1)) & 0x1;
  47    target_ulong reserved = s2 &
  48                            MAKE_64BIT_MASK(R_VTYPE_RESERVED_SHIFT,
  49                                            xlen - 1 - R_VTYPE_RESERVED_SHIFT);
  50    uint16_t vlen = cpu->cfg.vlenb << 3;
  51    int8_t lmul;
  52
  53    if (vlmul & 4) {
  54        /*
  55         * Fractional LMUL, check:
  56         *
  57         * VLEN * LMUL >= SEW
  58         * VLEN >> (8 - lmul) >= sew
  59         * (vlenb << 3) >> (8 - lmul) >= sew
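         *
         * A concrete instance of the check above, e.g. assuming VLEN = 128
         * (vlenb = 16): for vlmul = 5 (LMUL = 1/8),
         * vlen >> (8 - vlmul) = 128 >> 3 = 16, so SEW <= 16 satisfies the
         * check while SEW = 32 sets vill.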
  60         */
  61        if (vlmul == 4 || (vlen >> (8 - vlmul)) < sew) {
  62            vill = true;
  63        }
  64    }
  65
  66    if ((sew > cpu->cfg.elen) || vill || (ediv != 0) || (reserved != 0)) {
  67        /* only set vill bit. */
  68        env->vill = 1;
  69        env->vtype = 0;
  70        env->vl = 0;
  71        env->vstart = 0;
  72        return 0;
  73    }
  74
  75    /* lmul encoded as in DisasContext::lmul */
  76    lmul = sextract32(FIELD_EX64(s2, VTYPE, VLMUL), 0, 3);
  77    vlmax = vext_get_vlmax(cpu->cfg.vlenb, vsew, lmul);
  78    if (s1 <= vlmax) {
  79        vl = s1;
  80    } else if (s1 < 2 * vlmax && cpu->cfg.rvv_vl_half_avl) {
  81        vl = (s1 + 1) >> 1;
  82    } else {
  83        vl = vlmax;
  84    }
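    /*
     * e.g. with vlmax = 4: s1 = 3 gives vl = 3; s1 = 6 gives vl = 3
     * (ceil(6 / 2)) when rvv_vl_half_avl is enabled, otherwise vl = 4;
     * s1 = 10 gives vl = 4.
     */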
  85
  86    if (cpu->cfg.rvv_vsetvl_x0_vill && x0 && (env->vl != vl)) {
  87        /* only set vill bit. */
  88        env->vill = 1;
  89        env->vtype = 0;
  90        env->vl = 0;
  91        env->vstart = 0;
  92        return 0;
  93    }
  94
  95    env->vl = vl;
  96    env->vtype = s2;
  97    env->vstart = 0;
  98    env->vill = 0;
  99    return vl;
 100}
 101
 102/*
 103 * Get the maximum number of elements that can be operated on.
 104 *
 105 * log2_esz: log2 of element size in bytes.
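 *
 * For example, assuming vlenb = 16 (VLEN = 128), LMUL = 1 and SEW = 32
 * (log2_esz = 2): scale = 0 - 2 = -2, so the result is 16 >> 2 = 4,
 * which matches VLMAX = VLEN / SEW * LMUL = 128 / 32 * 1.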
 106 */
 107static inline uint32_t vext_max_elems(uint32_t desc, uint32_t log2_esz)
 108{
 109    /*
 110     * As simd_desc supports at most 2048 bytes, the max vlen is 1024 bits,
 111     * so vlen in bytes (vlenb) is encoded as maxsz.
 112     */
 113    uint32_t vlenb = simd_maxsz(desc);
 114
 115    /* Return VLMAX */
 116    int scale = vext_lmul(desc) - log2_esz;
 117    return scale < 0 ? vlenb >> -scale : vlenb << scale;
 118}
 119
 120/*
 121 * This function checks watchpoints before the real load operation.
 122 *
 123 * In system mode, the TLB API probe_access is enough for the watchpoint
 124 * check. In user mode, there is no watchpoint support now.
 125 *
 126 * It will trigger an exception if there is no mapping in the TLB
 127 * and the page table walk can't fill the TLB entry. Then the guest
 128 * software can return here after processing the exception, or never return.
 129 *
 130 * This function can also be used when direct access to probe_access_flags is
 131 * needed in order to access the flags. If a pointer to a flags operand is
 132 * provided, the function will call probe_access_flags instead, using nonfault,
 133 * and update host and flags.
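 *
 * Typical call patterns used elsewhere in this file (illustrative):
 *
 *   probe_pages(env, addr, size, ra, access_type, mmu_index,
 *               &host, &flags, true);    non-faulting, fills flags/host
 *
 *   probe_pages(env, addr, len, ra, MMU_DATA_LOAD, mmu_index,
 *               &host, NULL, false);     flags == NULL: probe_access()
 *                                        is used and faults at ra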
 134 */
 135static void probe_pages(CPURISCVState *env, target_ulong addr, target_ulong len,
 136                        uintptr_t ra, MMUAccessType access_type, int mmu_index,
 137                        void **host, int *flags, bool nonfault)
 138{
 139    target_ulong pagelen = -(addr | TARGET_PAGE_MASK);
 140    target_ulong curlen = MIN(pagelen, len);
 141
 142    if (flags != NULL) {
 143        *flags = probe_access_flags(env, adjust_addr(env, addr), curlen,
 144                                    access_type, mmu_index, nonfault, host, ra);
 145    } else {
 146        probe_access(env, adjust_addr(env, addr), curlen, access_type,
 147                     mmu_index, ra);
 148    }
 149
 150    if (len > curlen) {
 151        addr += curlen;
 152        curlen = len - curlen;
 153        if (flags != NULL) {
 154            *flags = probe_access_flags(env, adjust_addr(env, addr), curlen,
 155                                        access_type, mmu_index, nonfault,
 156                                        host, ra);
 157        } else {
 158            probe_access(env, adjust_addr(env, addr), curlen, access_type,
 159                         mmu_index, ra);
 160        }
 161    }
 162}
 163
 164
 165static inline void vext_set_elem_mask(void *v0, int index,
 166                                      uint8_t value)
 167{
 168    int idx = index / 64;
 169    int pos = index % 64;
 170    uint64_t old = ((uint64_t *)v0)[idx];
 171    ((uint64_t *)v0)[idx] = deposit64(old, pos, 1, value);
 172}
 173
 174/* element operations for load and store */
 175typedef void vext_ldst_elem_fn_tlb(CPURISCVState *env, abi_ptr addr,
 176                                   uint32_t idx, void *vd, uintptr_t retaddr);
 177typedef void vext_ldst_elem_fn_host(void *vd, uint32_t idx, void *host);
 178
 179#define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF)             \
 180static inline QEMU_ALWAYS_INLINE                            \
 181void NAME##_tlb(CPURISCVState *env, abi_ptr addr,           \
 182                uint32_t idx, void *vd, uintptr_t retaddr)  \
 183{                                                           \
 184    ETYPE *cur = ((ETYPE *)vd + H(idx));                    \
 185    *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr);       \
 186}                                                           \
 187                                                            \
 188static inline QEMU_ALWAYS_INLINE                            \
 189void NAME##_host(void *vd, uint32_t idx, void *host)        \
 190{                                                           \
 191    ETYPE *cur = ((ETYPE *)vd + H(idx));                    \
 192    *cur = (ETYPE)LDSUF##_p(host);                          \
 193}
 194
 195GEN_VEXT_LD_ELEM(lde_b, uint8_t,  H1, ldub)
 196GEN_VEXT_LD_ELEM(lde_h, uint16_t, H2, lduw)
 197GEN_VEXT_LD_ELEM(lde_w, uint32_t, H4, ldl)
 198GEN_VEXT_LD_ELEM(lde_d, uint64_t, H8, ldq)
 199
 200#define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF)             \
 201static inline QEMU_ALWAYS_INLINE                            \
 202void NAME##_tlb(CPURISCVState *env, abi_ptr addr,           \
 203                uint32_t idx, void *vd, uintptr_t retaddr)  \
 204{                                                           \
 205    ETYPE data = *((ETYPE *)vd + H(idx));                   \
 206    cpu_##STSUF##_data_ra(env, addr, data, retaddr);        \
 207}                                                           \
 208                                                            \
 209static inline QEMU_ALWAYS_INLINE                            \
 210void NAME##_host(void *vd, uint32_t idx, void *host)        \
 211{                                                           \
 212    ETYPE data = *((ETYPE *)vd + H(idx));                   \
 213    STSUF##_p(host, data);                                  \
 214}
 215
 216GEN_VEXT_ST_ELEM(ste_b, uint8_t,  H1, stb)
 217GEN_VEXT_ST_ELEM(ste_h, uint16_t, H2, stw)
 218GEN_VEXT_ST_ELEM(ste_w, uint32_t, H4, stl)
 219GEN_VEXT_ST_ELEM(ste_d, uint64_t, H8, stq)
 220
 221static inline QEMU_ALWAYS_INLINE void
 222vext_continuous_ldst_tlb(CPURISCVState *env, vext_ldst_elem_fn_tlb *ldst_tlb,
 223                       void *vd, uint32_t evl, target_ulong addr,
 224                       uint32_t reg_start, uintptr_t ra, uint32_t esz,
 225                       bool is_load)
 226{
 227    uint32_t i;
 228    for (i = env->vstart; i < evl; env->vstart = ++i, addr += esz) {
 229        ldst_tlb(env, adjust_addr(env, addr), i, vd, ra);
 230    }
 231}
 232
 233static inline QEMU_ALWAYS_INLINE void
 234vext_continuous_ldst_host(CPURISCVState *env, vext_ldst_elem_fn_host *ldst_host,
 235                        void *vd, uint32_t evl, uint32_t reg_start, void *host,
 236                        uint32_t esz, bool is_load)
 237{
 238#if HOST_BIG_ENDIAN
 239    for (; reg_start < evl; reg_start++, host += esz) {
 240        ldst_host(vd, reg_start, host);
 241    }
 242#else
 243    if (esz == 1) {
 244        uint32_t byte_offset = reg_start * esz;
 245        uint32_t size = (evl - reg_start) * esz;
 246
 247        if (is_load) {
 248            memcpy(vd + byte_offset, host, size);
 249        } else {
 250            memcpy(host, vd + byte_offset, size);
 251        }
 252    } else {
 253        for (; reg_start < evl; reg_start++, host += esz) {
 254            ldst_host(vd, reg_start, host);
 255        }
 256    }
 257#endif
 258}
 259
 260static void vext_set_tail_elems_1s(target_ulong vl, void *vd,
 261                                   uint32_t desc, uint32_t nf,
 262                                   uint32_t esz, uint32_t max_elems)
 263{
 264    uint32_t vta = vext_vta(desc);
 265    int k;
 266
 267    if (vta == 0) {
 268        return;
 269    }
 270
 271    for (k = 0; k < nf; ++k) {
 272        vext_set_elems_1s(vd, vta, (k * max_elems + vl) * esz,
 273                          (k * max_elems + max_elems) * esz);
 274    }
 275}
 276
 277/*
 278 * stride: access vector element from strided memory
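 *
 * Field k of element i is accessed at base + stride * i + k * esz, e.g.
 * a 32-bit strided load (nf = 1) with stride = 12 reads element i from
 * base + 12 * i.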
 279 */
 280static void
 281vext_ldst_stride(void *vd, void *v0, target_ulong base, target_ulong stride,
 282                 CPURISCVState *env, uint32_t desc, uint32_t vm,
 283                 vext_ldst_elem_fn_tlb *ldst_elem, uint32_t log2_esz,
 284                 uintptr_t ra)
 285{
 286    uint32_t i, k;
 287    uint32_t nf = vext_nf(desc);
 288    uint32_t max_elems = vext_max_elems(desc, log2_esz);
 289    uint32_t esz = 1 << log2_esz;
 290    uint32_t vma = vext_vma(desc);
 291
 292    VSTART_CHECK_EARLY_EXIT(env, env->vl);
 293
 294    for (i = env->vstart; i < env->vl; env->vstart = ++i) {
 295        k = 0;
 296        while (k < nf) {
 297            if (!vm && !vext_elem_mask(v0, i)) {
 298                /* set masked-off elements to 1s */
 299                vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
 300                                  (i + k * max_elems + 1) * esz);
 301                k++;
 302                continue;
 303            }
 304            target_ulong addr = base + stride * i + (k << log2_esz);
 305            ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
 306            k++;
 307        }
 308    }
 309    env->vstart = 0;
 310
 311    vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
 312}
 313
 314#define GEN_VEXT_LD_STRIDE(NAME, ETYPE, LOAD_FN)                        \
 315void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
 316                  target_ulong stride, CPURISCVState *env,              \
 317                  uint32_t desc)                                        \
 318{                                                                       \
 319    uint32_t vm = vext_vm(desc);                                        \
 320    vext_ldst_stride(vd, v0, base, stride, env, desc, vm, LOAD_FN,      \
 321                     ctzl(sizeof(ETYPE)), GETPC());                     \
 322}
 323
 324GEN_VEXT_LD_STRIDE(vlse8_v,  int8_t,  lde_b_tlb)
 325GEN_VEXT_LD_STRIDE(vlse16_v, int16_t, lde_h_tlb)
 326GEN_VEXT_LD_STRIDE(vlse32_v, int32_t, lde_w_tlb)
 327GEN_VEXT_LD_STRIDE(vlse64_v, int64_t, lde_d_tlb)
 328
 329#define GEN_VEXT_ST_STRIDE(NAME, ETYPE, STORE_FN)                       \
 330void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
 331                  target_ulong stride, CPURISCVState *env,              \
 332                  uint32_t desc)                                        \
 333{                                                                       \
 334    uint32_t vm = vext_vm(desc);                                        \
 335    vext_ldst_stride(vd, v0, base, stride, env, desc, vm, STORE_FN,     \
 336                     ctzl(sizeof(ETYPE)), GETPC());                     \
 337}
 338
 339GEN_VEXT_ST_STRIDE(vsse8_v,  int8_t,  ste_b_tlb)
 340GEN_VEXT_ST_STRIDE(vsse16_v, int16_t, ste_h_tlb)
 341GEN_VEXT_ST_STRIDE(vsse32_v, int32_t, ste_w_tlb)
 342GEN_VEXT_ST_STRIDE(vsse64_v, int64_t, ste_d_tlb)
 343
 344/*
 345 * unit-stride: access elements stored contiguously in memory
 346 */
 347
 348/* unmasked unit-stride load and store operation */
 349static inline QEMU_ALWAYS_INLINE void
 350vext_page_ldst_us(CPURISCVState *env, void *vd, target_ulong addr,
 351                  uint32_t elems, uint32_t nf, uint32_t max_elems,
 352                  uint32_t log2_esz, bool is_load, int mmu_index,
 353                  vext_ldst_elem_fn_tlb *ldst_tlb,
 354                  vext_ldst_elem_fn_host *ldst_host, uintptr_t ra)
 355{
 356    void *host;
 357    int i, k, flags;
 358    uint32_t esz = 1 << log2_esz;
 359    uint32_t size = (elems * nf) << log2_esz;
 360    uint32_t evl = env->vstart + elems;
 361    MMUAccessType access_type = is_load ? MMU_DATA_LOAD : MMU_DATA_STORE;
 362
 363    /* Check page permission/pmp/watchpoint/etc. */
 364    probe_pages(env, addr, size, ra, access_type, mmu_index, &host, &flags,
 365                true);
 366
 367    if (flags == 0) {
 368        if (nf == 1) {
 369            vext_continuous_ldst_host(env, ldst_host, vd, evl, env->vstart,
 370                                      host, esz, is_load);
 371        } else {
 372            for (i = env->vstart; i < evl; ++i) {
 373                k = 0;
 374                while (k < nf) {
 375                    ldst_host(vd, i + k * max_elems, host);
 376                    host += esz;
 377                    k++;
 378                }
 379            }
 380        }
 381        env->vstart += elems;
 382    } else {
 383        if (nf == 1) {
 384            vext_continuous_ldst_tlb(env, ldst_tlb, vd, evl, addr, env->vstart,
 385                                   ra, esz, is_load);
 386        } else {
 387            /* load bytes from guest memory */
 388            for (i = env->vstart; i < evl; env->vstart = ++i) {
 389                k = 0;
 390                while (k < nf) {
 391                    ldst_tlb(env, adjust_addr(env, addr), i + k * max_elems,
 392                             vd, ra);
 393                    addr += esz;
 394                    k++;
 395                }
 396            }
 397        }
 398    }
 399}
 400
 401static inline QEMU_ALWAYS_INLINE void
 402vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
 403             vext_ldst_elem_fn_tlb *ldst_tlb,
 404             vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz,
 405             uint32_t evl, uintptr_t ra, bool is_load)
 406{
 407    uint32_t k;
 408    target_ulong page_split, elems, addr;
 409    uint32_t nf = vext_nf(desc);
 410    uint32_t max_elems = vext_max_elems(desc, log2_esz);
 411    uint32_t esz = 1 << log2_esz;
 412    uint32_t msize = nf * esz;
 413    int mmu_index = riscv_env_mmu_index(env, false);
 414
 415    VSTART_CHECK_EARLY_EXIT(env, evl);
 416
 417#if defined(CONFIG_USER_ONLY)
 418    /*
 419     * For data sizes <= 6 bytes we get better performance by simply calling
 420     * vext_continuous_ldst_tlb
 421     */
 422    if (nf == 1 && (evl << log2_esz) <= 6) {
 423        addr = base + (env->vstart << log2_esz);
 424        vext_continuous_ldst_tlb(env, ldst_tlb, vd, evl, addr, env->vstart, ra,
 425                                 esz, is_load);
 426
 427        env->vstart = 0;
 428        vext_set_tail_elems_1s(evl, vd, desc, nf, esz, max_elems);
 429        return;
 430    }
 431#endif
 432
 433    /* Calculate the page range of first page */
 434    addr = base + ((env->vstart * nf) << log2_esz);
 435    page_split = -(addr | TARGET_PAGE_MASK);
 436    /* Get number of elements */
 437    elems = page_split / msize;
 438    if (unlikely(env->vstart + elems >= evl)) {
 439        elems = evl - env->vstart;
 440    }
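    /*
     * e.g. with nf = 1, esz = 4 and addr 6 bytes below the next page
     * boundary: page_split = 6 and elems = 1; since page_split % msize != 0,
     * the element straddling the boundary is accessed via ldst_tlb below.
     */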
 441
 442    /* Load/store elements in the first page */
 443    if (likely(elems)) {
 444        vext_page_ldst_us(env, vd, addr, elems, nf, max_elems, log2_esz,
 445                          is_load, mmu_index, ldst_tlb, ldst_host, ra);
 446    }
 447
 448    /* Load/store elements in the second page */
 449    if (unlikely(env->vstart < evl)) {
 450        /* Cross page element */
 451        if (unlikely(page_split % msize)) {
 452            for (k = 0; k < nf; k++) {
 453                addr = base + ((env->vstart * nf + k) << log2_esz);
 454                ldst_tlb(env, adjust_addr(env, addr),
 455                        env->vstart + k * max_elems, vd, ra);
 456            }
 457            env->vstart++;
 458        }
 459
 460        addr = base + ((env->vstart * nf) << log2_esz);
 461        /* Get number of elements of second page */
 462        elems = evl - env->vstart;
 463
 464        /* Load/store elements in the second page */
 465        vext_page_ldst_us(env, vd, addr, elems, nf, max_elems, log2_esz,
 466                          is_load, mmu_index, ldst_tlb, ldst_host, ra);
 467    }
 468
 469    env->vstart = 0;
 470    vext_set_tail_elems_1s(evl, vd, desc, nf, esz, max_elems);
 471}
 472
 473/*
 474 * masked unit-stride load and store operations are handled as a special
 475 * case of stride, with stride = NF * sizeof(ETYPE)
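 *
 * e.g. for 32-bit elements the stride is 4 bytes when nf = 1 (consecutive
 * elements) and 3 * 4 = 12 bytes between consecutive segments when nf = 3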
 476 */
 477
 478#define GEN_VEXT_LD_US(NAME, ETYPE, LOAD_FN_TLB, LOAD_FN_HOST)      \
 479void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,     \
 480                         CPURISCVState *env, uint32_t desc)         \
 481{                                                                   \
 482    uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));         \
 483    vext_ldst_stride(vd, v0, base, stride, env, desc, false,        \
 484                     LOAD_FN_TLB, ctzl(sizeof(ETYPE)), GETPC());    \
 485}                                                                   \
 486                                                                    \
 487void HELPER(NAME)(void *vd, void *v0, target_ulong base,            \
 488                  CPURISCVState *env, uint32_t desc)                \
 489{                                                                   \
 490    vext_ldst_us(vd, base, env, desc, LOAD_FN_TLB, LOAD_FN_HOST,    \
 491                 ctzl(sizeof(ETYPE)), env->vl, GETPC(), true);      \
 492}
 493
 494GEN_VEXT_LD_US(vle8_v,  int8_t,  lde_b_tlb, lde_b_host)
 495GEN_VEXT_LD_US(vle16_v, int16_t, lde_h_tlb, lde_h_host)
 496GEN_VEXT_LD_US(vle32_v, int32_t, lde_w_tlb, lde_w_host)
 497GEN_VEXT_LD_US(vle64_v, int64_t, lde_d_tlb, lde_d_host)
 498
 499#define GEN_VEXT_ST_US(NAME, ETYPE, STORE_FN_TLB, STORE_FN_HOST)         \
 500void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,          \
 501                         CPURISCVState *env, uint32_t desc)              \
 502{                                                                        \
 503    uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));              \
 504    vext_ldst_stride(vd, v0, base, stride, env, desc, false,             \
 505                     STORE_FN_TLB, ctzl(sizeof(ETYPE)), GETPC());        \
 506}                                                                        \
 507                                                                         \
 508void HELPER(NAME)(void *vd, void *v0, target_ulong base,                 \
 509                  CPURISCVState *env, uint32_t desc)                     \
 510{                                                                        \
 511    vext_ldst_us(vd, base, env, desc, STORE_FN_TLB, STORE_FN_HOST,       \
 512                 ctzl(sizeof(ETYPE)), env->vl, GETPC(), false);          \
 513}
 514
 515GEN_VEXT_ST_US(vse8_v,  int8_t,  ste_b_tlb, ste_b_host)
 516GEN_VEXT_ST_US(vse16_v, int16_t, ste_h_tlb, ste_h_host)
 517GEN_VEXT_ST_US(vse32_v, int32_t, ste_w_tlb, ste_w_host)
 518GEN_VEXT_ST_US(vse64_v, int64_t, ste_d_tlb, ste_d_host)
 519
 520/*
 521 * unit stride mask load and store, EEW = 1
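 *
 * The effective length is the number of mask bytes: evl = ceil(vl / 8),
 * e.g. vl = 17 gives evl = (17 + 7) >> 3 = 3 bytes transferred.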
 522 */
 523void HELPER(vlm_v)(void *vd, void *v0, target_ulong base,
 524                    CPURISCVState *env, uint32_t desc)
 525{
 526    /* evl = ceil(vl/8) */
 527    uint8_t evl = (env->vl + 7) >> 3;
 528    vext_ldst_us(vd, base, env, desc, lde_b_tlb, lde_b_host,
 529                 0, evl, GETPC(), true);
 530}
 531
 532void HELPER(vsm_v)(void *vd, void *v0, target_ulong base,
 533                    CPURISCVState *env, uint32_t desc)
 534{
 535    /* evl = ceil(vl/8) */
 536    uint8_t evl = (env->vl + 7) >> 3;
 537    vext_ldst_us(vd, base, env, desc, ste_b_tlb, ste_b_host,
 538                 0, evl, GETPC(), false);
 539}
 540
 541/*
 542 * index: access vector element from indexed memory
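 *
 * The index elements are treated as unsigned, e.g. with a 16-bit index
 * vector (idx_h), field k of element i is accessed at
 * base + (uint16_t)vs2[i] + k * esz.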
 543 */
 544typedef target_ulong vext_get_index_addr(target_ulong base,
 545        uint32_t idx, void *vs2);
 546
 547#define GEN_VEXT_GET_INDEX_ADDR(NAME, ETYPE, H)        \
 548static target_ulong NAME(target_ulong base,            \
 549                         uint32_t idx, void *vs2)      \
 550{                                                      \
 551    return (base + *((ETYPE *)vs2 + H(idx)));          \
 552}
 553
 554GEN_VEXT_GET_INDEX_ADDR(idx_b, uint8_t,  H1)
 555GEN_VEXT_GET_INDEX_ADDR(idx_h, uint16_t, H2)
 556GEN_VEXT_GET_INDEX_ADDR(idx_w, uint32_t, H4)
 557GEN_VEXT_GET_INDEX_ADDR(idx_d, uint64_t, H8)
 558
 559static inline void
 560vext_ldst_index(void *vd, void *v0, target_ulong base,
 561                void *vs2, CPURISCVState *env, uint32_t desc,
 562                vext_get_index_addr get_index_addr,
 563                vext_ldst_elem_fn_tlb *ldst_elem,
 564                uint32_t log2_esz, uintptr_t ra)
 565{
 566    uint32_t i, k;
 567    uint32_t nf = vext_nf(desc);
 568    uint32_t vm = vext_vm(desc);
 569    uint32_t max_elems = vext_max_elems(desc, log2_esz);
 570    uint32_t esz = 1 << log2_esz;
 571    uint32_t vma = vext_vma(desc);
 572
 573    VSTART_CHECK_EARLY_EXIT(env, env->vl);
 574
 575    /* load bytes from guest memory */
 576    for (i = env->vstart; i < env->vl; env->vstart = ++i) {
 577        k = 0;
 578        while (k < nf) {
 579            if (!vm && !vext_elem_mask(v0, i)) {
 580                /* set masked-off elements to 1s */
 581                vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
 582                                  (i + k * max_elems + 1) * esz);
 583                k++;
 584                continue;
 585            }
 586            abi_ptr addr = get_index_addr(base, i, vs2) + (k << log2_esz);
 587            ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
 588            k++;
 589        }
 590    }
 591    env->vstart = 0;
 592
 593    vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
 594}
 595
 596#define GEN_VEXT_LD_INDEX(NAME, ETYPE, INDEX_FN, LOAD_FN)                  \
 597void HELPER(NAME)(void *vd, void *v0, target_ulong base,                   \
 598                  void *vs2, CPURISCVState *env, uint32_t desc)            \
 599{                                                                          \
 600    vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,                \
 601                    LOAD_FN, ctzl(sizeof(ETYPE)), GETPC());                \
 602}
 603
 604GEN_VEXT_LD_INDEX(vlxei8_8_v,   int8_t,  idx_b, lde_b_tlb)
 605GEN_VEXT_LD_INDEX(vlxei8_16_v,  int16_t, idx_b, lde_h_tlb)
 606GEN_VEXT_LD_INDEX(vlxei8_32_v,  int32_t, idx_b, lde_w_tlb)
 607GEN_VEXT_LD_INDEX(vlxei8_64_v,  int64_t, idx_b, lde_d_tlb)
 608GEN_VEXT_LD_INDEX(vlxei16_8_v,  int8_t,  idx_h, lde_b_tlb)
 609GEN_VEXT_LD_INDEX(vlxei16_16_v, int16_t, idx_h, lde_h_tlb)
 610GEN_VEXT_LD_INDEX(vlxei16_32_v, int32_t, idx_h, lde_w_tlb)
 611GEN_VEXT_LD_INDEX(vlxei16_64_v, int64_t, idx_h, lde_d_tlb)
 612GEN_VEXT_LD_INDEX(vlxei32_8_v,  int8_t,  idx_w, lde_b_tlb)
 613GEN_VEXT_LD_INDEX(vlxei32_16_v, int16_t, idx_w, lde_h_tlb)
 614GEN_VEXT_LD_INDEX(vlxei32_32_v, int32_t, idx_w, lde_w_tlb)
 615GEN_VEXT_LD_INDEX(vlxei32_64_v, int64_t, idx_w, lde_d_tlb)
 616GEN_VEXT_LD_INDEX(vlxei64_8_v,  int8_t,  idx_d, lde_b_tlb)
 617GEN_VEXT_LD_INDEX(vlxei64_16_v, int16_t, idx_d, lde_h_tlb)
 618GEN_VEXT_LD_INDEX(vlxei64_32_v, int32_t, idx_d, lde_w_tlb)
 619GEN_VEXT_LD_INDEX(vlxei64_64_v, int64_t, idx_d, lde_d_tlb)
 620
 621#define GEN_VEXT_ST_INDEX(NAME, ETYPE, INDEX_FN, STORE_FN)       \
 622void HELPER(NAME)(void *vd, void *v0, target_ulong base,         \
 623                  void *vs2, CPURISCVState *env, uint32_t desc)  \
 624{                                                                \
 625    vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,      \
 626                    STORE_FN, ctzl(sizeof(ETYPE)),               \
 627                    GETPC());                                    \
 628}
 629
 630GEN_VEXT_ST_INDEX(vsxei8_8_v,   int8_t,  idx_b, ste_b_tlb)
 631GEN_VEXT_ST_INDEX(vsxei8_16_v,  int16_t, idx_b, ste_h_tlb)
 632GEN_VEXT_ST_INDEX(vsxei8_32_v,  int32_t, idx_b, ste_w_tlb)
 633GEN_VEXT_ST_INDEX(vsxei8_64_v,  int64_t, idx_b, ste_d_tlb)
 634GEN_VEXT_ST_INDEX(vsxei16_8_v,  int8_t,  idx_h, ste_b_tlb)
 635GEN_VEXT_ST_INDEX(vsxei16_16_v, int16_t, idx_h, ste_h_tlb)
 636GEN_VEXT_ST_INDEX(vsxei16_32_v, int32_t, idx_h, ste_w_tlb)
 637GEN_VEXT_ST_INDEX(vsxei16_64_v, int64_t, idx_h, ste_d_tlb)
 638GEN_VEXT_ST_INDEX(vsxei32_8_v,  int8_t,  idx_w, ste_b_tlb)
 639GEN_VEXT_ST_INDEX(vsxei32_16_v, int16_t, idx_w, ste_h_tlb)
 640GEN_VEXT_ST_INDEX(vsxei32_32_v, int32_t, idx_w, ste_w_tlb)
 641GEN_VEXT_ST_INDEX(vsxei32_64_v, int64_t, idx_w, ste_d_tlb)
 642GEN_VEXT_ST_INDEX(vsxei64_8_v,  int8_t,  idx_d, ste_b_tlb)
 643GEN_VEXT_ST_INDEX(vsxei64_16_v, int16_t, idx_d, ste_h_tlb)
 644GEN_VEXT_ST_INDEX(vsxei64_32_v, int32_t, idx_d, ste_w_tlb)
 645GEN_VEXT_ST_INDEX(vsxei64_64_v, int64_t, idx_d, ste_d_tlb)
 646
 647/*
 648 * unit-stride fault-only-first load instructions
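 *
 * Only a fault on element 0 is taken as an exception; if a later element i
 * would fault, vl is reduced to i instead and elements 0 .. i-1 are loaded,
 * e.g. vl = 8 with element 5 unmapped yields vl = 5.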
 649 */
 650static inline void
 651vext_ldff(void *vd, void *v0, target_ulong base, CPURISCVState *env,
 652          uint32_t desc, vext_ldst_elem_fn_tlb *ldst_tlb,
 653          vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz, uintptr_t ra)
 654{
 655    uint32_t i, k, vl = 0;
 656    uint32_t nf = vext_nf(desc);
 657    uint32_t vm = vext_vm(desc);
 658    uint32_t max_elems = vext_max_elems(desc, log2_esz);
 659    uint32_t esz = 1 << log2_esz;
 660    uint32_t msize = nf * esz;
 661    uint32_t vma = vext_vma(desc);
 662    target_ulong addr, addr_probe, addr_i, offset, remain, page_split, elems;
 663    int mmu_index = riscv_env_mmu_index(env, false);
 664    int flags, probe_flags;
 665    void *host;
 666
 667    VSTART_CHECK_EARLY_EXIT(env, env->vl);
 668
 669    addr = base + ((env->vstart * nf) << log2_esz);
 670    page_split = -(addr | TARGET_PAGE_MASK);
 671    /* Get number of elements */
 672    elems = page_split / msize;
 673    if (unlikely(env->vstart + elems >= env->vl)) {
 674        elems = env->vl - env->vstart;
 675    }
 676
 677    /* Check page permission/pmp/watchpoint/etc. */
 678    probe_pages(env, addr, elems * msize, ra, MMU_DATA_LOAD, mmu_index, &host,
 679                &flags, true);
 680
 681    /* If we are crossing a page check also the second page. */
 682    if (env->vl > elems) {
 683        addr_probe = addr + (elems << log2_esz);
 684        probe_pages(env, addr_probe, elems * msize, ra, MMU_DATA_LOAD,
 685                    mmu_index, &host, &probe_flags, true);
 686        flags |= probe_flags;
 687    }
 688
 689    if (flags & ~TLB_WATCHPOINT) {
 690        /* probe every access */
 691        for (i = env->vstart; i < env->vl; i++) {
 692            if (!vm && !vext_elem_mask(v0, i)) {
 693                continue;
 694            }
 695            addr_i = adjust_addr(env, base + i * (nf << log2_esz));
 696            if (i == 0) {
 697                /* Allow fault on first element. */
 698                probe_pages(env, addr_i, nf << log2_esz, ra, MMU_DATA_LOAD,
 699                            mmu_index, &host, NULL, false);
 700            } else {
 701                remain = nf << log2_esz;
 702                while (remain > 0) {
 703                    offset = -(addr_i | TARGET_PAGE_MASK);
 704
 705                    /* Probe nonfault on subsequent elements. */
 706                    probe_pages(env, addr_i, offset, 0, MMU_DATA_LOAD,
 707                                mmu_index, &host, &flags, true);
 708
 709                    /*
 710                     * Stop if invalid (unmapped) or mmio (transaction may
 711                     * fail). Do not stop if watchpoint, as the spec says that
 712                     * first-fault should continue to access the same
 713                     * elements regardless of any watchpoint.
 714                     */
 715                    if (flags & ~TLB_WATCHPOINT) {
 716                        vl = i;
 717                        goto ProbeSuccess;
 718                    }
 719                    if (remain <= offset) {
 720                        break;
 721                    }
 722                    remain -= offset;
 723                    addr_i = adjust_addr(env, addr_i + offset);
 724                }
 725            }
 726        }
 727    }
 728ProbeSuccess:
 729    /* load bytes from guest memory */
 730    if (vl != 0) {
 731        env->vl = vl;
 732    }
 733
 734    if (env->vstart < env->vl) {
 735        if (vm) {
 736            /* Load/store elements in the first page */
 737            if (likely(elems)) {
 738                vext_page_ldst_us(env, vd, addr, elems, nf, max_elems,
 739                                  log2_esz, true, mmu_index, ldst_tlb,
 740                                  ldst_host, ra);
 741            }
 742
 743            /* Load/store elements in the second page */
 744            if (unlikely(env->vstart < env->vl)) {
 745                /* Cross page element */
 746                if (unlikely(page_split % msize)) {
 747                    for (k = 0; k < nf; k++) {
 748                        addr = base + ((env->vstart * nf + k) << log2_esz);
 749                        ldst_tlb(env, adjust_addr(env, addr),
 750                                 env->vstart + k * max_elems, vd, ra);
 751                    }
 752                    env->vstart++;
 753                }
 754
 755                addr = base + ((env->vstart * nf) << log2_esz);
 756                /* Get number of elements of second page */
 757                elems = env->vl - env->vstart;
 758
 759                /* Load/store elements in the second page */
 760                vext_page_ldst_us(env, vd, addr, elems, nf, max_elems,
 761                                  log2_esz, true, mmu_index, ldst_tlb,
 762                                  ldst_host, ra);
 763            }
 764        } else {
 765            for (i = env->vstart; i < env->vl; i++) {
 766                k = 0;
 767                while (k < nf) {
 768                    if (!vext_elem_mask(v0, i)) {
 769                        /* set masked-off elements to 1s */
 770                        vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
 771                                          (i + k * max_elems + 1) * esz);
 772                        k++;
 773                        continue;
 774                    }
 775                    addr = base + ((i * nf + k) << log2_esz);
 776                    ldst_tlb(env, adjust_addr(env, addr), i + k * max_elems,
 777                             vd, ra);
 778                    k++;
 779                }
 780            }
 781        }
 782    }
 783    env->vstart = 0;
 784
 785    vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
 786}
 787
 788#define GEN_VEXT_LDFF(NAME, ETYPE, LOAD_FN_TLB, LOAD_FN_HOST)   \
 789void HELPER(NAME)(void *vd, void *v0, target_ulong base,        \
 790                  CPURISCVState *env, uint32_t desc)            \
 791{                                                               \
 792    vext_ldff(vd, v0, base, env, desc, LOAD_FN_TLB,             \
 793              LOAD_FN_HOST, ctzl(sizeof(ETYPE)), GETPC());      \
 794}
 795
 796GEN_VEXT_LDFF(vle8ff_v,  int8_t,  lde_b_tlb, lde_b_host)
 797GEN_VEXT_LDFF(vle16ff_v, int16_t, lde_h_tlb, lde_h_host)
 798GEN_VEXT_LDFF(vle32ff_v, int32_t, lde_w_tlb, lde_w_host)
 799GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d_tlb, lde_d_host)
 800
 801#define DO_SWAP(N, M) (M)
 802#define DO_AND(N, M)  (N & M)
 803#define DO_XOR(N, M)  (N ^ M)
 804#define DO_OR(N, M)   (N | M)
 805#define DO_ADD(N, M)  (N + M)
 806
 807/* Signed min/max */
 808#define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
 809#define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
 810
 811/*
 812 * load and store whole register instructions
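 *
 * These move nf whole registers regardless of vl/vtype: the element count
 * is evl = nf * (vlenb >> log2_esz), e.g. vl2re32_v with vlenb = 16 moves
 * 2 * 4 = 8 elements.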
 813 */
 814static inline QEMU_ALWAYS_INLINE void
 815vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
 816                vext_ldst_elem_fn_tlb *ldst_tlb,
 817                vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz,
 818                uintptr_t ra, bool is_load)
 819{
 820    target_ulong page_split, elems, addr;
 821    uint32_t nf = vext_nf(desc);
 822    uint32_t vlenb = riscv_cpu_cfg(env)->vlenb;
 823    uint32_t max_elems = vlenb >> log2_esz;
 824    uint32_t evl = nf * max_elems;
 825    uint32_t esz = 1 << log2_esz;
 826    int mmu_index = riscv_env_mmu_index(env, false);
 827
 828    /* Calculate the page range of first page */
 829    addr = base + (env->vstart << log2_esz);
 830    page_split = -(addr | TARGET_PAGE_MASK);
 831    /* Get number of elements */
 832    elems = page_split / esz;
 833    if (unlikely(env->vstart + elems >= evl)) {
 834        elems = evl - env->vstart;
 835    }
 836
 837    /* Load/store elements in the first page */
 838    if (likely(elems)) {
 839        vext_page_ldst_us(env, vd, addr, elems, 1, max_elems, log2_esz,
 840                          is_load, mmu_index, ldst_tlb, ldst_host, ra);
 841    }
 842
 843    /* Load/store elements in the second page */
 844    if (unlikely(env->vstart < evl)) {
 845        /* Cross page element */
 846        if (unlikely(page_split % esz)) {
 847            addr = base + (env->vstart << log2_esz);
 848            ldst_tlb(env, adjust_addr(env, addr), env->vstart, vd, ra);
 849            env->vstart++;
 850        }
 851
 852        addr = base + (env->vstart << log2_esz);
 853        /* Get number of elements of second page */
 854        elems = evl - env->vstart;
 855
 856        /* Load/store elements in the second page */
 857        vext_page_ldst_us(env, vd, addr, elems, 1, max_elems, log2_esz,
 858                          is_load, mmu_index, ldst_tlb, ldst_host, ra);
 859    }
 860
 861    env->vstart = 0;
 862}
 863
 864#define GEN_VEXT_LD_WHOLE(NAME, ETYPE, LOAD_FN_TLB, LOAD_FN_HOST)   \
 865void HELPER(NAME)(void *vd, target_ulong base, CPURISCVState *env,  \
 866                  uint32_t desc)                                    \
 867{                                                                   \
 868    vext_ldst_whole(vd, base, env, desc, LOAD_FN_TLB, LOAD_FN_HOST, \
 869                    ctzl(sizeof(ETYPE)), GETPC(), true);            \
 870}
 871
 872GEN_VEXT_LD_WHOLE(vl1re8_v,  int8_t,  lde_b_tlb, lde_b_host)
 873GEN_VEXT_LD_WHOLE(vl1re16_v, int16_t, lde_h_tlb, lde_h_host)
 874GEN_VEXT_LD_WHOLE(vl1re32_v, int32_t, lde_w_tlb, lde_w_host)
 875GEN_VEXT_LD_WHOLE(vl1re64_v, int64_t, lde_d_tlb, lde_d_host)
 876GEN_VEXT_LD_WHOLE(vl2re8_v,  int8_t,  lde_b_tlb, lde_b_host)
 877GEN_VEXT_LD_WHOLE(vl2re16_v, int16_t, lde_h_tlb, lde_h_host)
 878GEN_VEXT_LD_WHOLE(vl2re32_v, int32_t, lde_w_tlb, lde_w_host)
 879GEN_VEXT_LD_WHOLE(vl2re64_v, int64_t, lde_d_tlb, lde_d_host)
 880GEN_VEXT_LD_WHOLE(vl4re8_v,  int8_t,  lde_b_tlb, lde_b_host)
 881GEN_VEXT_LD_WHOLE(vl4re16_v, int16_t, lde_h_tlb, lde_h_host)
 882GEN_VEXT_LD_WHOLE(vl4re32_v, int32_t, lde_w_tlb, lde_w_host)
 883GEN_VEXT_LD_WHOLE(vl4re64_v, int64_t, lde_d_tlb, lde_d_host)
 884GEN_VEXT_LD_WHOLE(vl8re8_v,  int8_t,  lde_b_tlb, lde_b_host)
 885GEN_VEXT_LD_WHOLE(vl8re16_v, int16_t, lde_h_tlb, lde_h_host)
 886GEN_VEXT_LD_WHOLE(vl8re32_v, int32_t, lde_w_tlb, lde_w_host)
 887GEN_VEXT_LD_WHOLE(vl8re64_v, int64_t, lde_d_tlb, lde_d_host)
 888
 889#define GEN_VEXT_ST_WHOLE(NAME, ETYPE, STORE_FN_TLB, STORE_FN_HOST)     \
 890void HELPER(NAME)(void *vd, target_ulong base, CPURISCVState *env,      \
 891                  uint32_t desc)                                        \
 892{                                                                       \
 893    vext_ldst_whole(vd, base, env, desc, STORE_FN_TLB, STORE_FN_HOST,   \
 894                    ctzl(sizeof(ETYPE)), GETPC(), false);               \
 895}
 896
 897GEN_VEXT_ST_WHOLE(vs1r_v, int8_t, ste_b_tlb, ste_b_host)
 898GEN_VEXT_ST_WHOLE(vs2r_v, int8_t, ste_b_tlb, ste_b_host)
 899GEN_VEXT_ST_WHOLE(vs4r_v, int8_t, ste_b_tlb, ste_b_host)
 900GEN_VEXT_ST_WHOLE(vs8r_v, int8_t, ste_b_tlb, ste_b_host)
 901
 902/*
 903 * Vector Integer Arithmetic Instructions
 904 */
 905
 906/* (TD, T1, T2, TX1, TX2) */
 907#define OP_SSS_B int8_t, int8_t, int8_t, int8_t, int8_t
 908#define OP_SSS_H int16_t, int16_t, int16_t, int16_t, int16_t
 909#define OP_SSS_W int32_t, int32_t, int32_t, int32_t, int32_t
 910#define OP_SSS_D int64_t, int64_t, int64_t, int64_t, int64_t
 911#define OP_SUS_B int8_t, uint8_t, int8_t, uint8_t, int8_t
 912#define OP_SUS_H int16_t, uint16_t, int16_t, uint16_t, int16_t
 913#define OP_SUS_W int32_t, uint32_t, int32_t, uint32_t, int32_t
 914#define OP_SUS_D int64_t, uint64_t, int64_t, uint64_t, int64_t
 915#define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
 916#define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
 917#define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
 918#define WOP_SUS_B int16_t, uint8_t, int8_t, uint16_t, int16_t
 919#define WOP_SUS_H int32_t, uint16_t, int16_t, uint32_t, int32_t
 920#define WOP_SUS_W int64_t, uint32_t, int32_t, uint64_t, int64_t
 921#define WOP_SSU_B int16_t, int8_t, uint8_t, int16_t, uint16_t
 922#define WOP_SSU_H int32_t, int16_t, uint16_t, int32_t, uint32_t
 923#define WOP_SSU_W int64_t, int32_t, uint32_t, int64_t, uint64_t
 924#define NOP_SSS_B int8_t, int8_t, int16_t, int8_t, int16_t
 925#define NOP_SSS_H int16_t, int16_t, int32_t, int16_t, int32_t
 926#define NOP_SSS_W int32_t, int32_t, int64_t, int32_t, int64_t
 927#define NOP_UUU_B uint8_t, uint8_t, uint16_t, uint8_t, uint16_t
 928#define NOP_UUU_H uint16_t, uint16_t, uint32_t, uint16_t, uint32_t
 929#define NOP_UUU_W uint32_t, uint32_t, uint64_t, uint32_t, uint64_t
 930
 931#define DO_SUB(N, M) (N - M)
 932#define DO_RSUB(N, M) (M - N)
 933
 934RVVCALL(OPIVV2, vadd_vv_b, OP_SSS_B, H1, H1, H1, DO_ADD)
 935RVVCALL(OPIVV2, vadd_vv_h, OP_SSS_H, H2, H2, H2, DO_ADD)
 936RVVCALL(OPIVV2, vadd_vv_w, OP_SSS_W, H4, H4, H4, DO_ADD)
 937RVVCALL(OPIVV2, vadd_vv_d, OP_SSS_D, H8, H8, H8, DO_ADD)
 938RVVCALL(OPIVV2, vsub_vv_b, OP_SSS_B, H1, H1, H1, DO_SUB)
 939RVVCALL(OPIVV2, vsub_vv_h, OP_SSS_H, H2, H2, H2, DO_SUB)
 940RVVCALL(OPIVV2, vsub_vv_w, OP_SSS_W, H4, H4, H4, DO_SUB)
 941RVVCALL(OPIVV2, vsub_vv_d, OP_SSS_D, H8, H8, H8, DO_SUB)
 942
 943GEN_VEXT_VV(vadd_vv_b, 1)
 944GEN_VEXT_VV(vadd_vv_h, 2)
 945GEN_VEXT_VV(vadd_vv_w, 4)
 946GEN_VEXT_VV(vadd_vv_d, 8)
 947GEN_VEXT_VV(vsub_vv_b, 1)
 948GEN_VEXT_VV(vsub_vv_h, 2)
 949GEN_VEXT_VV(vsub_vv_w, 4)
 950GEN_VEXT_VV(vsub_vv_d, 8)
 951
 952
 953RVVCALL(OPIVX2, vadd_vx_b, OP_SSS_B, H1, H1, DO_ADD)
 954RVVCALL(OPIVX2, vadd_vx_h, OP_SSS_H, H2, H2, DO_ADD)
 955RVVCALL(OPIVX2, vadd_vx_w, OP_SSS_W, H4, H4, DO_ADD)
 956RVVCALL(OPIVX2, vadd_vx_d, OP_SSS_D, H8, H8, DO_ADD)
 957RVVCALL(OPIVX2, vsub_vx_b, OP_SSS_B, H1, H1, DO_SUB)
 958RVVCALL(OPIVX2, vsub_vx_h, OP_SSS_H, H2, H2, DO_SUB)
 959RVVCALL(OPIVX2, vsub_vx_w, OP_SSS_W, H4, H4, DO_SUB)
 960RVVCALL(OPIVX2, vsub_vx_d, OP_SSS_D, H8, H8, DO_SUB)
 961RVVCALL(OPIVX2, vrsub_vx_b, OP_SSS_B, H1, H1, DO_RSUB)
 962RVVCALL(OPIVX2, vrsub_vx_h, OP_SSS_H, H2, H2, DO_RSUB)
 963RVVCALL(OPIVX2, vrsub_vx_w, OP_SSS_W, H4, H4, DO_RSUB)
 964RVVCALL(OPIVX2, vrsub_vx_d, OP_SSS_D, H8, H8, DO_RSUB)
 965
 966GEN_VEXT_VX(vadd_vx_b, 1)
 967GEN_VEXT_VX(vadd_vx_h, 2)
 968GEN_VEXT_VX(vadd_vx_w, 4)
 969GEN_VEXT_VX(vadd_vx_d, 8)
 970GEN_VEXT_VX(vsub_vx_b, 1)
 971GEN_VEXT_VX(vsub_vx_h, 2)
 972GEN_VEXT_VX(vsub_vx_w, 4)
 973GEN_VEXT_VX(vsub_vx_d, 8)
 974GEN_VEXT_VX(vrsub_vx_b, 1)
 975GEN_VEXT_VX(vrsub_vx_h, 2)
 976GEN_VEXT_VX(vrsub_vx_w, 4)
 977GEN_VEXT_VX(vrsub_vx_d, 8)
 978
 979void HELPER(vec_rsubs8)(void *d, void *a, uint64_t b, uint32_t desc)
 980{
 981    intptr_t oprsz = simd_oprsz(desc);
 982    intptr_t i;
 983
 984    for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
 985        *(uint8_t *)(d + i) = (uint8_t)b - *(uint8_t *)(a + i);
 986    }
 987}
 988
 989void HELPER(vec_rsubs16)(void *d, void *a, uint64_t b, uint32_t desc)
 990{
 991    intptr_t oprsz = simd_oprsz(desc);
 992    intptr_t i;
 993
 994    for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
 995        *(uint16_t *)(d + i) = (uint16_t)b - *(uint16_t *)(a + i);
 996    }
 997}
 998
 999void HELPER(vec_rsubs32)(void *d, void *a, uint64_t b, uint32_t desc)
1000{
1001    intptr_t oprsz = simd_oprsz(desc);
1002    intptr_t i;
1003
1004    for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
1005        *(uint32_t *)(d + i) = (uint32_t)b - *(uint32_t *)(a + i);
1006    }
1007}
1008
1009void HELPER(vec_rsubs64)(void *d, void *a, uint64_t b, uint32_t desc)
1010{
1011    intptr_t oprsz = simd_oprsz(desc);
1012    intptr_t i;
1013
1014    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1015        *(uint64_t *)(d + i) = b - *(uint64_t *)(a + i);
1016    }
1017}
1018
1019/* Vector Widening Integer Add/Subtract */
1020#define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
1021#define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
1022#define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
1023#define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
1024#define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
1025#define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
1026#define WOP_WUUU_B  uint16_t, uint8_t, uint16_t, uint16_t, uint16_t
1027#define WOP_WUUU_H  uint32_t, uint16_t, uint32_t, uint32_t, uint32_t
1028#define WOP_WUUU_W  uint64_t, uint32_t, uint64_t, uint64_t, uint64_t
1029#define WOP_WSSS_B  int16_t, int8_t, int16_t, int16_t, int16_t
1030#define WOP_WSSS_H  int32_t, int16_t, int32_t, int32_t, int32_t
1031#define WOP_WSSS_W  int64_t, int32_t, int64_t, int64_t, int64_t
1032RVVCALL(OPIVV2, vwaddu_vv_b, WOP_UUU_B, H2, H1, H1, DO_ADD)
1033RVVCALL(OPIVV2, vwaddu_vv_h, WOP_UUU_H, H4, H2, H2, DO_ADD)
1034RVVCALL(OPIVV2, vwaddu_vv_w, WOP_UUU_W, H8, H4, H4, DO_ADD)
1035RVVCALL(OPIVV2, vwsubu_vv_b, WOP_UUU_B, H2, H1, H1, DO_SUB)
1036RVVCALL(OPIVV2, vwsubu_vv_h, WOP_UUU_H, H4, H2, H2, DO_SUB)
1037RVVCALL(OPIVV2, vwsubu_vv_w, WOP_UUU_W, H8, H4, H4, DO_SUB)
1038RVVCALL(OPIVV2, vwadd_vv_b, WOP_SSS_B, H2, H1, H1, DO_ADD)
1039RVVCALL(OPIVV2, vwadd_vv_h, WOP_SSS_H, H4, H2, H2, DO_ADD)
1040RVVCALL(OPIVV2, vwadd_vv_w, WOP_SSS_W, H8, H4, H4, DO_ADD)
1041RVVCALL(OPIVV2, vwsub_vv_b, WOP_SSS_B, H2, H1, H1, DO_SUB)
1042RVVCALL(OPIVV2, vwsub_vv_h, WOP_SSS_H, H4, H2, H2, DO_SUB)
1043RVVCALL(OPIVV2, vwsub_vv_w, WOP_SSS_W, H8, H4, H4, DO_SUB)
1044RVVCALL(OPIVV2, vwaddu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_ADD)
1045RVVCALL(OPIVV2, vwaddu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_ADD)
1046RVVCALL(OPIVV2, vwaddu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_ADD)
1047RVVCALL(OPIVV2, vwsubu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_SUB)
1048RVVCALL(OPIVV2, vwsubu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_SUB)
1049RVVCALL(OPIVV2, vwsubu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_SUB)
1050RVVCALL(OPIVV2, vwadd_wv_b, WOP_WSSS_B, H2, H1, H1, DO_ADD)
1051RVVCALL(OPIVV2, vwadd_wv_h, WOP_WSSS_H, H4, H2, H2, DO_ADD)
1052RVVCALL(OPIVV2, vwadd_wv_w, WOP_WSSS_W, H8, H4, H4, DO_ADD)
1053RVVCALL(OPIVV2, vwsub_wv_b, WOP_WSSS_B, H2, H1, H1, DO_SUB)
1054RVVCALL(OPIVV2, vwsub_wv_h, WOP_WSSS_H, H4, H2, H2, DO_SUB)
1055RVVCALL(OPIVV2, vwsub_wv_w, WOP_WSSS_W, H8, H4, H4, DO_SUB)
1056GEN_VEXT_VV(vwaddu_vv_b, 2)
1057GEN_VEXT_VV(vwaddu_vv_h, 4)
1058GEN_VEXT_VV(vwaddu_vv_w, 8)
1059GEN_VEXT_VV(vwsubu_vv_b, 2)
1060GEN_VEXT_VV(vwsubu_vv_h, 4)
1061GEN_VEXT_VV(vwsubu_vv_w, 8)
1062GEN_VEXT_VV(vwadd_vv_b, 2)
1063GEN_VEXT_VV(vwadd_vv_h, 4)
1064GEN_VEXT_VV(vwadd_vv_w, 8)
1065GEN_VEXT_VV(vwsub_vv_b, 2)
1066GEN_VEXT_VV(vwsub_vv_h, 4)
1067GEN_VEXT_VV(vwsub_vv_w, 8)
1068GEN_VEXT_VV(vwaddu_wv_b, 2)
1069GEN_VEXT_VV(vwaddu_wv_h, 4)
1070GEN_VEXT_VV(vwaddu_wv_w, 8)
1071GEN_VEXT_VV(vwsubu_wv_b, 2)
1072GEN_VEXT_VV(vwsubu_wv_h, 4)
1073GEN_VEXT_VV(vwsubu_wv_w, 8)
1074GEN_VEXT_VV(vwadd_wv_b, 2)
1075GEN_VEXT_VV(vwadd_wv_h, 4)
1076GEN_VEXT_VV(vwadd_wv_w, 8)
1077GEN_VEXT_VV(vwsub_wv_b, 2)
1078GEN_VEXT_VV(vwsub_wv_h, 4)
1079GEN_VEXT_VV(vwsub_wv_w, 8)
1080
1081RVVCALL(OPIVX2, vwaddu_vx_b, WOP_UUU_B, H2, H1, DO_ADD)
1082RVVCALL(OPIVX2, vwaddu_vx_h, WOP_UUU_H, H4, H2, DO_ADD)
1083RVVCALL(OPIVX2, vwaddu_vx_w, WOP_UUU_W, H8, H4, DO_ADD)
1084RVVCALL(OPIVX2, vwsubu_vx_b, WOP_UUU_B, H2, H1, DO_SUB)
1085RVVCALL(OPIVX2, vwsubu_vx_h, WOP_UUU_H, H4, H2, DO_SUB)
1086RVVCALL(OPIVX2, vwsubu_vx_w, WOP_UUU_W, H8, H4, DO_SUB)
1087RVVCALL(OPIVX2, vwadd_vx_b, WOP_SSS_B, H2, H1, DO_ADD)
1088RVVCALL(OPIVX2, vwadd_vx_h, WOP_SSS_H, H4, H2, DO_ADD)
1089RVVCALL(OPIVX2, vwadd_vx_w, WOP_SSS_W, H8, H4, DO_ADD)
1090RVVCALL(OPIVX2, vwsub_vx_b, WOP_SSS_B, H2, H1, DO_SUB)
1091RVVCALL(OPIVX2, vwsub_vx_h, WOP_SSS_H, H4, H2, DO_SUB)
1092RVVCALL(OPIVX2, vwsub_vx_w, WOP_SSS_W, H8, H4, DO_SUB)
1093RVVCALL(OPIVX2, vwaddu_wx_b, WOP_WUUU_B, H2, H1, DO_ADD)
1094RVVCALL(OPIVX2, vwaddu_wx_h, WOP_WUUU_H, H4, H2, DO_ADD)
1095RVVCALL(OPIVX2, vwaddu_wx_w, WOP_WUUU_W, H8, H4, DO_ADD)
1096RVVCALL(OPIVX2, vwsubu_wx_b, WOP_WUUU_B, H2, H1, DO_SUB)
1097RVVCALL(OPIVX2, vwsubu_wx_h, WOP_WUUU_H, H4, H2, DO_SUB)
1098RVVCALL(OPIVX2, vwsubu_wx_w, WOP_WUUU_W, H8, H4, DO_SUB)
1099RVVCALL(OPIVX2, vwadd_wx_b, WOP_WSSS_B, H2, H1, DO_ADD)
1100RVVCALL(OPIVX2, vwadd_wx_h, WOP_WSSS_H, H4, H2, DO_ADD)
1101RVVCALL(OPIVX2, vwadd_wx_w, WOP_WSSS_W, H8, H4, DO_ADD)
1102RVVCALL(OPIVX2, vwsub_wx_b, WOP_WSSS_B, H2, H1, DO_SUB)
1103RVVCALL(OPIVX2, vwsub_wx_h, WOP_WSSS_H, H4, H2, DO_SUB)
1104RVVCALL(OPIVX2, vwsub_wx_w, WOP_WSSS_W, H8, H4, DO_SUB)
1105GEN_VEXT_VX(vwaddu_vx_b, 2)
1106GEN_VEXT_VX(vwaddu_vx_h, 4)
1107GEN_VEXT_VX(vwaddu_vx_w, 8)
1108GEN_VEXT_VX(vwsubu_vx_b, 2)
1109GEN_VEXT_VX(vwsubu_vx_h, 4)
1110GEN_VEXT_VX(vwsubu_vx_w, 8)
1111GEN_VEXT_VX(vwadd_vx_b, 2)
1112GEN_VEXT_VX(vwadd_vx_h, 4)
1113GEN_VEXT_VX(vwadd_vx_w, 8)
1114GEN_VEXT_VX(vwsub_vx_b, 2)
1115GEN_VEXT_VX(vwsub_vx_h, 4)
1116GEN_VEXT_VX(vwsub_vx_w, 8)
1117GEN_VEXT_VX(vwaddu_wx_b, 2)
1118GEN_VEXT_VX(vwaddu_wx_h, 4)
1119GEN_VEXT_VX(vwaddu_wx_w, 8)
1120GEN_VEXT_VX(vwsubu_wx_b, 2)
1121GEN_VEXT_VX(vwsubu_wx_h, 4)
1122GEN_VEXT_VX(vwsubu_wx_w, 8)
1123GEN_VEXT_VX(vwadd_wx_b, 2)
1124GEN_VEXT_VX(vwadd_wx_h, 4)
1125GEN_VEXT_VX(vwadd_wx_w, 8)
1126GEN_VEXT_VX(vwsub_wx_b, 2)
1127GEN_VEXT_VX(vwsub_wx_h, 4)
1128GEN_VEXT_VX(vwsub_wx_w, 8)
1129
1130/* Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions */
1131#define DO_VADC(N, M, C) (N + M + C)
1132#define DO_VSBC(N, M, C) (N - M - C)
1133
1134#define GEN_VEXT_VADC_VVM(NAME, ETYPE, H, DO_OP)              \
1135void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1136                  CPURISCVState *env, uint32_t desc)          \
1137{                                                             \
1138    uint32_t vl = env->vl;                                    \
1139    uint32_t esz = sizeof(ETYPE);                             \
1140    uint32_t total_elems =                                    \
1141        vext_get_total_elems(env, desc, esz);                 \
1142    uint32_t vta = vext_vta(desc);                            \
1143    uint32_t i;                                               \
1144                                                              \
1145    VSTART_CHECK_EARLY_EXIT(env, vl);                         \
1146                                                              \
1147    for (i = env->vstart; i < vl; i++) {                      \
1148        ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1149        ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1150        ETYPE carry = vext_elem_mask(v0, i);                  \
1151                                                              \
1152        *((ETYPE *)vd + H(i)) = DO_OP(s2, s1, carry);         \
1153    }                                                         \
1154    env->vstart = 0;                                          \
1155    /* set tail elements to 1s */                             \
1156    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
1157}
1158
1159GEN_VEXT_VADC_VVM(vadc_vvm_b, uint8_t,  H1, DO_VADC)
1160GEN_VEXT_VADC_VVM(vadc_vvm_h, uint16_t, H2, DO_VADC)
1161GEN_VEXT_VADC_VVM(vadc_vvm_w, uint32_t, H4, DO_VADC)
1162GEN_VEXT_VADC_VVM(vadc_vvm_d, uint64_t, H8, DO_VADC)
1163
1164GEN_VEXT_VADC_VVM(vsbc_vvm_b, uint8_t,  H1, DO_VSBC)
1165GEN_VEXT_VADC_VVM(vsbc_vvm_h, uint16_t, H2, DO_VSBC)
1166GEN_VEXT_VADC_VVM(vsbc_vvm_w, uint32_t, H4, DO_VSBC)
1167GEN_VEXT_VADC_VVM(vsbc_vvm_d, uint64_t, H8, DO_VSBC)
1168
1169#define GEN_VEXT_VADC_VXM(NAME, ETYPE, H, DO_OP)                         \
1170void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,        \
1171                  CPURISCVState *env, uint32_t desc)                     \
1172{                                                                        \
1173    uint32_t vl = env->vl;                                               \
1174    uint32_t esz = sizeof(ETYPE);                                        \
1175    uint32_t total_elems = vext_get_total_elems(env, desc, esz);         \
1176    uint32_t vta = vext_vta(desc);                                       \
1177    uint32_t i;                                                          \
1178                                                                         \
1179    VSTART_CHECK_EARLY_EXIT(env, vl);                                    \
1180                                                                         \
1181    for (i = env->vstart; i < vl; i++) {                                 \
1182        ETYPE s2 = *((ETYPE *)vs2 + H(i));                               \
1183        ETYPE carry = vext_elem_mask(v0, i);                             \
1184                                                                         \
1185        *((ETYPE *)vd + H(i)) = DO_OP(s2, (ETYPE)(target_long)s1, carry);\
1186    }                                                                    \
1187    env->vstart = 0;                                                     \
1188    /* set tail elements to 1s */                                        \
1189    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);             \
1190}
1191
1192GEN_VEXT_VADC_VXM(vadc_vxm_b, uint8_t,  H1, DO_VADC)
1193GEN_VEXT_VADC_VXM(vadc_vxm_h, uint16_t, H2, DO_VADC)
1194GEN_VEXT_VADC_VXM(vadc_vxm_w, uint32_t, H4, DO_VADC)
1195GEN_VEXT_VADC_VXM(vadc_vxm_d, uint64_t, H8, DO_VADC)
1196
1197GEN_VEXT_VADC_VXM(vsbc_vxm_b, uint8_t,  H1, DO_VSBC)
1198GEN_VEXT_VADC_VXM(vsbc_vxm_h, uint16_t, H2, DO_VSBC)
1199GEN_VEXT_VADC_VXM(vsbc_vxm_w, uint32_t, H4, DO_VSBC)
1200GEN_VEXT_VADC_VXM(vsbc_vxm_d, uint64_t, H8, DO_VSBC)
1201
1202#define DO_MADC(N, M, C) (C ? (__typeof(N))(N + M + 1) <= N :           \
1203                          (__typeof(N))(N + M) < N)
1204#define DO_MSBC(N, M, C) (C ? N <= M : N < M)
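/*
 * DO_MADC/DO_MSBC compute the carry/borrow out of an unsigned add/sub,
 * relying on wrap-around of the unsigned ETYPE: e.g. for 8-bit N = 0xff,
 * M = 0x01, C = 0, N + M wraps to 0x00 < N, so the carry out is 1. The <=
 * in the carry-in case covers M = 0xff, where N + M + 1 wraps back to N.
 */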
1205
1206#define GEN_VEXT_VMADC_VVM(NAME, ETYPE, H, DO_OP)             \
1207void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1208                  CPURISCVState *env, uint32_t desc)          \
1209{                                                             \
1210    uint32_t vl = env->vl;                                    \
1211    uint32_t vm = vext_vm(desc);                              \
1212    uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;    \
1213    uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1214    uint32_t i;                                               \
1215                                                              \
1216    VSTART_CHECK_EARLY_EXIT(env, vl);                         \
1217                                                              \
1218    for (i = env->vstart; i < vl; i++) {                      \
1219        ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1220        ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1221        ETYPE carry = !vm && vext_elem_mask(v0, i);           \
1222        vext_set_elem_mask(vd, i, DO_OP(s2, s1, carry));      \
1223    }                                                         \
1224    env->vstart = 0;                                          \
1225    /*
1226     * mask destination register is always tail-agnostic
1227     * set tail elements to 1s
1228     */                                                       \
1229    if (vta_all_1s) {                                         \
1230        for (; i < total_elems; i++) {                        \
1231            vext_set_elem_mask(vd, i, 1);                     \
1232        }                                                     \
1233    }                                                         \
1234}
1235
1236GEN_VEXT_VMADC_VVM(vmadc_vvm_b, uint8_t,  H1, DO_MADC)
1237GEN_VEXT_VMADC_VVM(vmadc_vvm_h, uint16_t, H2, DO_MADC)
1238GEN_VEXT_VMADC_VVM(vmadc_vvm_w, uint32_t, H4, DO_MADC)
1239GEN_VEXT_VMADC_VVM(vmadc_vvm_d, uint64_t, H8, DO_MADC)
1240
1241GEN_VEXT_VMADC_VVM(vmsbc_vvm_b, uint8_t,  H1, DO_MSBC)
1242GEN_VEXT_VMADC_VVM(vmsbc_vvm_h, uint16_t, H2, DO_MSBC)
1243GEN_VEXT_VMADC_VVM(vmsbc_vvm_w, uint32_t, H4, DO_MSBC)
1244GEN_VEXT_VMADC_VVM(vmsbc_vvm_d, uint64_t, H8, DO_MSBC)
1245
1246#define GEN_VEXT_VMADC_VXM(NAME, ETYPE, H, DO_OP)               \
1247void HELPER(NAME)(void *vd, void *v0, target_ulong s1,          \
1248                  void *vs2, CPURISCVState *env, uint32_t desc) \
1249{                                                               \
1250    uint32_t vl = env->vl;                                      \
1251    uint32_t vm = vext_vm(desc);                                \
1252    uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;      \
1253    uint32_t vta_all_1s = vext_vta_all_1s(desc);                \
1254    uint32_t i;                                                 \
1255                                                                \
1256    VSTART_CHECK_EARLY_EXIT(env, vl);                           \
1257                                                                \
1258    for (i = env->vstart; i < vl; i++) {                        \
1259        ETYPE s2 = *((ETYPE *)vs2 + H(i));                      \
1260        ETYPE carry = !vm && vext_elem_mask(v0, i);             \
1261        vext_set_elem_mask(vd, i,                               \
1262                DO_OP(s2, (ETYPE)(target_long)s1, carry));      \
1263    }                                                           \
1264    env->vstart = 0;                                            \
1265    /*
1266     * mask destination register is always tail-agnostic
1267     * set tail elements to 1s
1268     */                                                         \
1269    if (vta_all_1s) {                                           \
1270        for (; i < total_elems; i++) {                          \
1271            vext_set_elem_mask(vd, i, 1);                       \
1272        }                                                       \
1273    }                                                           \
1274}
1275
1276GEN_VEXT_VMADC_VXM(vmadc_vxm_b, uint8_t,  H1, DO_MADC)
1277GEN_VEXT_VMADC_VXM(vmadc_vxm_h, uint16_t, H2, DO_MADC)
1278GEN_VEXT_VMADC_VXM(vmadc_vxm_w, uint32_t, H4, DO_MADC)
1279GEN_VEXT_VMADC_VXM(vmadc_vxm_d, uint64_t, H8, DO_MADC)
1280
1281GEN_VEXT_VMADC_VXM(vmsbc_vxm_b, uint8_t,  H1, DO_MSBC)
1282GEN_VEXT_VMADC_VXM(vmsbc_vxm_h, uint16_t, H2, DO_MSBC)
1283GEN_VEXT_VMADC_VXM(vmsbc_vxm_w, uint32_t, H4, DO_MSBC)
1284GEN_VEXT_VMADC_VXM(vmsbc_vxm_d, uint64_t, H8, DO_MSBC)
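
/*
 * A worked example of the carry/borrow-out semantics implemented above
 * (DO_MADC and DO_MSBC themselves are defined earlier in this file).
 * For the e8 variants, element i:
 *
 *   vmadc: s2 = 0xff, s1 = 0x01, carry-in = 0
 *          the sum wraps to 0x00, so mask bit i of vd is set to 1
 *   vmsbc: s2 = 0x00, s1 = 0x01, borrow-in = 0
 *          the difference wraps to 0xff, so mask bit i of vd is set to 1
 *
 * The carry-in/borrow-in comes from v0 only for the masked .vvm/.vxm
 * forms (vm == 0); the unmasked forms use 0, as the "carry" local in the
 * loops above shows.
 */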
1285
1286/* Vector Bitwise Logical Instructions */
1287RVVCALL(OPIVV2, vand_vv_b, OP_SSS_B, H1, H1, H1, DO_AND)
1288RVVCALL(OPIVV2, vand_vv_h, OP_SSS_H, H2, H2, H2, DO_AND)
1289RVVCALL(OPIVV2, vand_vv_w, OP_SSS_W, H4, H4, H4, DO_AND)
1290RVVCALL(OPIVV2, vand_vv_d, OP_SSS_D, H8, H8, H8, DO_AND)
1291RVVCALL(OPIVV2, vor_vv_b, OP_SSS_B, H1, H1, H1, DO_OR)
1292RVVCALL(OPIVV2, vor_vv_h, OP_SSS_H, H2, H2, H2, DO_OR)
1293RVVCALL(OPIVV2, vor_vv_w, OP_SSS_W, H4, H4, H4, DO_OR)
1294RVVCALL(OPIVV2, vor_vv_d, OP_SSS_D, H8, H8, H8, DO_OR)
1295RVVCALL(OPIVV2, vxor_vv_b, OP_SSS_B, H1, H1, H1, DO_XOR)
1296RVVCALL(OPIVV2, vxor_vv_h, OP_SSS_H, H2, H2, H2, DO_XOR)
1297RVVCALL(OPIVV2, vxor_vv_w, OP_SSS_W, H4, H4, H4, DO_XOR)
1298RVVCALL(OPIVV2, vxor_vv_d, OP_SSS_D, H8, H8, H8, DO_XOR)
1299GEN_VEXT_VV(vand_vv_b, 1)
1300GEN_VEXT_VV(vand_vv_h, 2)
1301GEN_VEXT_VV(vand_vv_w, 4)
1302GEN_VEXT_VV(vand_vv_d, 8)
1303GEN_VEXT_VV(vor_vv_b, 1)
1304GEN_VEXT_VV(vor_vv_h, 2)
1305GEN_VEXT_VV(vor_vv_w, 4)
1306GEN_VEXT_VV(vor_vv_d, 8)
1307GEN_VEXT_VV(vxor_vv_b, 1)
1308GEN_VEXT_VV(vxor_vv_h, 2)
1309GEN_VEXT_VV(vxor_vv_w, 4)
1310GEN_VEXT_VV(vxor_vv_d, 8)
1311
1312RVVCALL(OPIVX2, vand_vx_b, OP_SSS_B, H1, H1, DO_AND)
1313RVVCALL(OPIVX2, vand_vx_h, OP_SSS_H, H2, H2, DO_AND)
1314RVVCALL(OPIVX2, vand_vx_w, OP_SSS_W, H4, H4, DO_AND)
1315RVVCALL(OPIVX2, vand_vx_d, OP_SSS_D, H8, H8, DO_AND)
1316RVVCALL(OPIVX2, vor_vx_b, OP_SSS_B, H1, H1, DO_OR)
1317RVVCALL(OPIVX2, vor_vx_h, OP_SSS_H, H2, H2, DO_OR)
1318RVVCALL(OPIVX2, vor_vx_w, OP_SSS_W, H4, H4, DO_OR)
1319RVVCALL(OPIVX2, vor_vx_d, OP_SSS_D, H8, H8, DO_OR)
1320RVVCALL(OPIVX2, vxor_vx_b, OP_SSS_B, H1, H1, DO_XOR)
1321RVVCALL(OPIVX2, vxor_vx_h, OP_SSS_H, H2, H2, DO_XOR)
1322RVVCALL(OPIVX2, vxor_vx_w, OP_SSS_W, H4, H4, DO_XOR)
1323RVVCALL(OPIVX2, vxor_vx_d, OP_SSS_D, H8, H8, DO_XOR)
1324GEN_VEXT_VX(vand_vx_b, 1)
1325GEN_VEXT_VX(vand_vx_h, 2)
1326GEN_VEXT_VX(vand_vx_w, 4)
1327GEN_VEXT_VX(vand_vx_d, 8)
1328GEN_VEXT_VX(vor_vx_b, 1)
1329GEN_VEXT_VX(vor_vx_h, 2)
1330GEN_VEXT_VX(vor_vx_w, 4)
1331GEN_VEXT_VX(vor_vx_d, 8)
1332GEN_VEXT_VX(vxor_vx_b, 1)
1333GEN_VEXT_VX(vxor_vx_h, 2)
1334GEN_VEXT_VX(vxor_vx_w, 4)
1335GEN_VEXT_VX(vxor_vx_d, 8)
1336
1337/* Vector Single-Width Bit Shift Instructions */
1338#define DO_SLL(N, M)  (N << (M))
1339#define DO_SRL(N, M)  (N >> (M))
1340
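/*
 * The MASK argument below keeps only the low log2(SEW) bits of the shift
 * amount (0x7, 0xf, 0x1f, 0x3f for e8..e64), matching the spec's rule
 * that only those bits of the shift operand are used.  The vsra variants
 * reuse DO_SRL with a signed source type, relying on >> of a negative
 * value being an arithmetic shift (implementation-defined in ISO C, but
 * assumed by the compilers QEMU supports).  A small e8 illustration:
 *
 *   s2 = 0x80, s1 = 0x09        effective shift: 0x09 & 0x7 = 1
 *   vsrl.vv: 0x80 >> 1 = 0x40
 *   vsra.vv: (int8_t)0x80 >> 1 = 0xc0 (-128 >> 1 = -64)
 */
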
1341/* generate the helpers for shift instructions with two vector operands */
1342#define GEN_VEXT_SHIFT_VV(NAME, TS1, TS2, HS1, HS2, OP, MASK)             \
1343void HELPER(NAME)(void *vd, void *v0, void *vs1,                          \
1344                  void *vs2, CPURISCVState *env, uint32_t desc)           \
1345{                                                                         \
1346    uint32_t vm = vext_vm(desc);                                          \
1347    uint32_t vl = env->vl;                                                \
1348    uint32_t esz = sizeof(TS1);                                           \
1349    uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
1350    uint32_t vta = vext_vta(desc);                                        \
1351    uint32_t vma = vext_vma(desc);                                        \
1352    uint32_t i;                                                           \
1353                                                                          \
1354    VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
1355                                                                          \
1356    for (i = env->vstart; i < vl; i++) {                                  \
1357        if (!vm && !vext_elem_mask(v0, i)) {                              \
1358            /* set masked-off elements to 1s */                           \
1359            vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
1360            continue;                                                     \
1361        }                                                                 \
1362        TS1 s1 = *((TS1 *)vs1 + HS1(i));                                  \
1363        TS2 s2 = *((TS2 *)vs2 + HS2(i));                                  \
1364        *((TS1 *)vd + HS1(i)) = OP(s2, s1 & MASK);                        \
1365    }                                                                     \
1366    env->vstart = 0;                                                      \
1367    /* set tail elements to 1s */                                         \
1368    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
1369}
1370
1371GEN_VEXT_SHIFT_VV(vsll_vv_b, uint8_t,  uint8_t, H1, H1, DO_SLL, 0x7)
1372GEN_VEXT_SHIFT_VV(vsll_vv_h, uint16_t, uint16_t, H2, H2, DO_SLL, 0xf)
1373GEN_VEXT_SHIFT_VV(vsll_vv_w, uint32_t, uint32_t, H4, H4, DO_SLL, 0x1f)
1374GEN_VEXT_SHIFT_VV(vsll_vv_d, uint64_t, uint64_t, H8, H8, DO_SLL, 0x3f)
1375
1376GEN_VEXT_SHIFT_VV(vsrl_vv_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1377GEN_VEXT_SHIFT_VV(vsrl_vv_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1378GEN_VEXT_SHIFT_VV(vsrl_vv_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1379GEN_VEXT_SHIFT_VV(vsrl_vv_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1380
1381GEN_VEXT_SHIFT_VV(vsra_vv_b, uint8_t,  int8_t, H1, H1, DO_SRL, 0x7)
1382GEN_VEXT_SHIFT_VV(vsra_vv_h, uint16_t, int16_t, H2, H2, DO_SRL, 0xf)
1383GEN_VEXT_SHIFT_VV(vsra_vv_w, uint32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1384GEN_VEXT_SHIFT_VV(vsra_vv_d, uint64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1385
1386/*
1387 * generate the helpers for shift instructions with one vector and one scalar operand
1388 */
1389#define GEN_VEXT_SHIFT_VX(NAME, TD, TS2, HD, HS2, OP, MASK) \
1390void HELPER(NAME)(void *vd, void *v0, target_ulong s1,      \
1391                  void *vs2, CPURISCVState *env,            \
1392                  uint32_t desc)                            \
1393{                                                           \
1394    uint32_t vm = vext_vm(desc);                            \
1395    uint32_t vl = env->vl;                                  \
1396    uint32_t esz = sizeof(TD);                              \
1397    uint32_t total_elems =                                  \
1398        vext_get_total_elems(env, desc, esz);               \
1399    uint32_t vta = vext_vta(desc);                          \
1400    uint32_t vma = vext_vma(desc);                          \
1401    uint32_t i;                                             \
1402                                                            \
1403    VSTART_CHECK_EARLY_EXIT(env, vl);                       \
1404                                                            \
1405    for (i = env->vstart; i < vl; i++) {                    \
1406        if (!vm && !vext_elem_mask(v0, i)) {                \
1407            /* set masked-off elements to 1s */             \
1408            vext_set_elems_1s(vd, vma, i * esz,             \
1409                              (i + 1) * esz);               \
1410            continue;                                       \
1411        }                                                   \
1412        TS2 s2 = *((TS2 *)vs2 + HS2(i));                    \
1413        *((TD *)vd + HD(i)) = OP(s2, s1 & MASK);            \
1414    }                                                       \
1415    env->vstart = 0;                                        \
1416    /* set tail elements to 1s */                           \
1417    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);\
1418}
1419
1420GEN_VEXT_SHIFT_VX(vsll_vx_b, uint8_t, int8_t, H1, H1, DO_SLL, 0x7)
1421GEN_VEXT_SHIFT_VX(vsll_vx_h, uint16_t, int16_t, H2, H2, DO_SLL, 0xf)
1422GEN_VEXT_SHIFT_VX(vsll_vx_w, uint32_t, int32_t, H4, H4, DO_SLL, 0x1f)
1423GEN_VEXT_SHIFT_VX(vsll_vx_d, uint64_t, int64_t, H8, H8, DO_SLL, 0x3f)
1424
1425GEN_VEXT_SHIFT_VX(vsrl_vx_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1426GEN_VEXT_SHIFT_VX(vsrl_vx_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1427GEN_VEXT_SHIFT_VX(vsrl_vx_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1428GEN_VEXT_SHIFT_VX(vsrl_vx_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1429
1430GEN_VEXT_SHIFT_VX(vsra_vx_b, int8_t, int8_t, H1, H1, DO_SRL, 0x7)
1431GEN_VEXT_SHIFT_VX(vsra_vx_h, int16_t, int16_t, H2, H2, DO_SRL, 0xf)
1432GEN_VEXT_SHIFT_VX(vsra_vx_w, int32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1433GEN_VEXT_SHIFT_VX(vsra_vx_d, int64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1434
1435/* Vector Narrowing Integer Right Shift Instructions */
1436GEN_VEXT_SHIFT_VV(vnsrl_wv_b, uint8_t,  uint16_t, H1, H2, DO_SRL, 0xf)
1437GEN_VEXT_SHIFT_VV(vnsrl_wv_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1438GEN_VEXT_SHIFT_VV(vnsrl_wv_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1439GEN_VEXT_SHIFT_VV(vnsra_wv_b, uint8_t,  int16_t, H1, H2, DO_SRL, 0xf)
1440GEN_VEXT_SHIFT_VV(vnsra_wv_h, uint16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1441GEN_VEXT_SHIFT_VV(vnsra_wv_w, uint32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1442GEN_VEXT_SHIFT_VX(vnsrl_wx_b, uint8_t, uint16_t, H1, H2, DO_SRL, 0xf)
1443GEN_VEXT_SHIFT_VX(vnsrl_wx_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1444GEN_VEXT_SHIFT_VX(vnsrl_wx_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1445GEN_VEXT_SHIFT_VX(vnsra_wx_b, int8_t, int16_t, H1, H2, DO_SRL, 0xf)
1446GEN_VEXT_SHIFT_VX(vnsra_wx_h, int16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1447GEN_VEXT_SHIFT_VX(vnsra_wx_w, int32_t, int64_t, H4, H8, DO_SRL, 0x3f)
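
/*
 * For the narrowing shifts the source elements (vs2) are 2*SEW wide while
 * the destination and the shift-amount operand are SEW wide, and the mask
 * keeps log2(2*SEW) bits of the shift amount.  A sketch of the e8 case
 * (vnsrl.wv, element i):
 *
 *   s2 = 0x1234 (16-bit), shift amount = 4
 *   0x1234 >> 4 = 0x0123, truncated to 8 bits on store: vd[i] = 0x23
 */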
1448
1449/* Vector Integer Comparison Instructions */
1450#define DO_MSEQ(N, M) (N == M)
1451#define DO_MSNE(N, M) (N != M)
1452#define DO_MSLT(N, M) (N < M)
1453#define DO_MSLE(N, M) (N <= M)
1454#define DO_MSGT(N, M) (N > M)
1455
1456#define GEN_VEXT_CMP_VV(NAME, ETYPE, H, DO_OP)                \
1457void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1458                  CPURISCVState *env, uint32_t desc)          \
1459{                                                             \
1460    uint32_t vm = vext_vm(desc);                              \
1461    uint32_t vl = env->vl;                                    \
1462    uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;    \
1463    uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1464    uint32_t vma = vext_vma(desc);                            \
1465    uint32_t i;                                               \
1466                                                              \
1467    VSTART_CHECK_EARLY_EXIT(env, vl);                         \
1468                                                              \
1469    for (i = env->vstart; i < vl; i++) {                      \
1470        ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1471        ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1472        if (!vm && !vext_elem_mask(v0, i)) {                  \
1473            /* set masked-off elements to 1s */               \
1474            if (vma) {                                        \
1475                vext_set_elem_mask(vd, i, 1);                 \
1476            }                                                 \
1477            continue;                                         \
1478        }                                                     \
1479        vext_set_elem_mask(vd, i, DO_OP(s2, s1));             \
1480    }                                                         \
1481    env->vstart = 0;                                          \
1482    /*
1483     * the mask destination register is always tail-agnostic;
1484     * set its tail elements to 1s
1485     */                                                       \
1486    if (vta_all_1s) {                                         \
1487        for (; i < total_elems; i++) {                        \
1488            vext_set_elem_mask(vd, i, 1);                     \
1489        }                                                     \
1490    }                                                         \
1491}
1492
1493GEN_VEXT_CMP_VV(vmseq_vv_b, uint8_t,  H1, DO_MSEQ)
1494GEN_VEXT_CMP_VV(vmseq_vv_h, uint16_t, H2, DO_MSEQ)
1495GEN_VEXT_CMP_VV(vmseq_vv_w, uint32_t, H4, DO_MSEQ)
1496GEN_VEXT_CMP_VV(vmseq_vv_d, uint64_t, H8, DO_MSEQ)
1497
1498GEN_VEXT_CMP_VV(vmsne_vv_b, uint8_t,  H1, DO_MSNE)
1499GEN_VEXT_CMP_VV(vmsne_vv_h, uint16_t, H2, DO_MSNE)
1500GEN_VEXT_CMP_VV(vmsne_vv_w, uint32_t, H4, DO_MSNE)
1501GEN_VEXT_CMP_VV(vmsne_vv_d, uint64_t, H8, DO_MSNE)
1502
1503GEN_VEXT_CMP_VV(vmsltu_vv_b, uint8_t,  H1, DO_MSLT)
1504GEN_VEXT_CMP_VV(vmsltu_vv_h, uint16_t, H2, DO_MSLT)
1505GEN_VEXT_CMP_VV(vmsltu_vv_w, uint32_t, H4, DO_MSLT)
1506GEN_VEXT_CMP_VV(vmsltu_vv_d, uint64_t, H8, DO_MSLT)
1507
1508GEN_VEXT_CMP_VV(vmslt_vv_b, int8_t,  H1, DO_MSLT)
1509GEN_VEXT_CMP_VV(vmslt_vv_h, int16_t, H2, DO_MSLT)
1510GEN_VEXT_CMP_VV(vmslt_vv_w, int32_t, H4, DO_MSLT)
1511GEN_VEXT_CMP_VV(vmslt_vv_d, int64_t, H8, DO_MSLT)
1512
1513GEN_VEXT_CMP_VV(vmsleu_vv_b, uint8_t,  H1, DO_MSLE)
1514GEN_VEXT_CMP_VV(vmsleu_vv_h, uint16_t, H2, DO_MSLE)
1515GEN_VEXT_CMP_VV(vmsleu_vv_w, uint32_t, H4, DO_MSLE)
1516GEN_VEXT_CMP_VV(vmsleu_vv_d, uint64_t, H8, DO_MSLE)
1517
1518GEN_VEXT_CMP_VV(vmsle_vv_b, int8_t,  H1, DO_MSLE)
1519GEN_VEXT_CMP_VV(vmsle_vv_h, int16_t, H2, DO_MSLE)
1520GEN_VEXT_CMP_VV(vmsle_vv_w, int32_t, H4, DO_MSLE)
1521GEN_VEXT_CMP_VV(vmsle_vv_d, int64_t, H8, DO_MSLE)
1522
1523#define GEN_VEXT_CMP_VX(NAME, ETYPE, H, DO_OP)                      \
1524void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,   \
1525                  CPURISCVState *env, uint32_t desc)                \
1526{                                                                   \
1527    uint32_t vm = vext_vm(desc);                                    \
1528    uint32_t vl = env->vl;                                          \
1529    uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;          \
1530    uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
1531    uint32_t vma = vext_vma(desc);                                  \
1532    uint32_t i;                                                     \
1533                                                                    \
1534    VSTART_CHECK_EARLY_EXIT(env, vl);                               \
1535                                                                    \
1536    for (i = env->vstart; i < vl; i++) {                            \
1537        ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
1538        if (!vm && !vext_elem_mask(v0, i)) {                        \
1539            /* set masked-off elements to 1s */                     \
1540            if (vma) {                                              \
1541                vext_set_elem_mask(vd, i, 1);                       \
1542            }                                                       \
1543            continue;                                               \
1544        }                                                           \
1545        vext_set_elem_mask(vd, i,                                   \
1546                DO_OP(s2, (ETYPE)(target_long)s1));                 \
1547    }                                                               \
1548    env->vstart = 0;                                                \
1549    /*
1550     * the mask destination register is always tail-agnostic;
1551     * set its tail elements to 1s
1552     */                                                             \
1553    if (vta_all_1s) {                                               \
1554        for (; i < total_elems; i++) {                              \
1555            vext_set_elem_mask(vd, i, 1);                           \
1556        }                                                           \
1557    }                                                               \
1558}
1559
1560GEN_VEXT_CMP_VX(vmseq_vx_b, uint8_t,  H1, DO_MSEQ)
1561GEN_VEXT_CMP_VX(vmseq_vx_h, uint16_t, H2, DO_MSEQ)
1562GEN_VEXT_CMP_VX(vmseq_vx_w, uint32_t, H4, DO_MSEQ)
1563GEN_VEXT_CMP_VX(vmseq_vx_d, uint64_t, H8, DO_MSEQ)
1564
1565GEN_VEXT_CMP_VX(vmsne_vx_b, uint8_t,  H1, DO_MSNE)
1566GEN_VEXT_CMP_VX(vmsne_vx_h, uint16_t, H2, DO_MSNE)
1567GEN_VEXT_CMP_VX(vmsne_vx_w, uint32_t, H4, DO_MSNE)
1568GEN_VEXT_CMP_VX(vmsne_vx_d, uint64_t, H8, DO_MSNE)
1569
1570GEN_VEXT_CMP_VX(vmsltu_vx_b, uint8_t,  H1, DO_MSLT)
1571GEN_VEXT_CMP_VX(vmsltu_vx_h, uint16_t, H2, DO_MSLT)
1572GEN_VEXT_CMP_VX(vmsltu_vx_w, uint32_t, H4, DO_MSLT)
1573GEN_VEXT_CMP_VX(vmsltu_vx_d, uint64_t, H8, DO_MSLT)
1574
1575GEN_VEXT_CMP_VX(vmslt_vx_b, int8_t,  H1, DO_MSLT)
1576GEN_VEXT_CMP_VX(vmslt_vx_h, int16_t, H2, DO_MSLT)
1577GEN_VEXT_CMP_VX(vmslt_vx_w, int32_t, H4, DO_MSLT)
1578GEN_VEXT_CMP_VX(vmslt_vx_d, int64_t, H8, DO_MSLT)
1579
1580GEN_VEXT_CMP_VX(vmsleu_vx_b, uint8_t,  H1, DO_MSLE)
1581GEN_VEXT_CMP_VX(vmsleu_vx_h, uint16_t, H2, DO_MSLE)
1582GEN_VEXT_CMP_VX(vmsleu_vx_w, uint32_t, H4, DO_MSLE)
1583GEN_VEXT_CMP_VX(vmsleu_vx_d, uint64_t, H8, DO_MSLE)
1584
1585GEN_VEXT_CMP_VX(vmsle_vx_b, int8_t,  H1, DO_MSLE)
1586GEN_VEXT_CMP_VX(vmsle_vx_h, int16_t, H2, DO_MSLE)
1587GEN_VEXT_CMP_VX(vmsle_vx_w, int32_t, H4, DO_MSLE)
1588GEN_VEXT_CMP_VX(vmsle_vx_d, int64_t, H8, DO_MSLE)
1589
1590GEN_VEXT_CMP_VX(vmsgtu_vx_b, uint8_t,  H1, DO_MSGT)
1591GEN_VEXT_CMP_VX(vmsgtu_vx_h, uint16_t, H2, DO_MSGT)
1592GEN_VEXT_CMP_VX(vmsgtu_vx_w, uint32_t, H4, DO_MSGT)
1593GEN_VEXT_CMP_VX(vmsgtu_vx_d, uint64_t, H8, DO_MSGT)
1594
1595GEN_VEXT_CMP_VX(vmsgt_vx_b, int8_t,  H1, DO_MSGT)
1596GEN_VEXT_CMP_VX(vmsgt_vx_h, int16_t, H2, DO_MSGT)
1597GEN_VEXT_CMP_VX(vmsgt_vx_w, int32_t, H4, DO_MSGT)
1598GEN_VEXT_CMP_VX(vmsgt_vx_d, int64_t, H8, DO_MSGT)
1599
1600/* Vector Integer Min/Max Instructions */
1601RVVCALL(OPIVV2, vminu_vv_b, OP_UUU_B, H1, H1, H1, DO_MIN)
1602RVVCALL(OPIVV2, vminu_vv_h, OP_UUU_H, H2, H2, H2, DO_MIN)
1603RVVCALL(OPIVV2, vminu_vv_w, OP_UUU_W, H4, H4, H4, DO_MIN)
1604RVVCALL(OPIVV2, vminu_vv_d, OP_UUU_D, H8, H8, H8, DO_MIN)
1605RVVCALL(OPIVV2, vmin_vv_b, OP_SSS_B, H1, H1, H1, DO_MIN)
1606RVVCALL(OPIVV2, vmin_vv_h, OP_SSS_H, H2, H2, H2, DO_MIN)
1607RVVCALL(OPIVV2, vmin_vv_w, OP_SSS_W, H4, H4, H4, DO_MIN)
1608RVVCALL(OPIVV2, vmin_vv_d, OP_SSS_D, H8, H8, H8, DO_MIN)
1609RVVCALL(OPIVV2, vmaxu_vv_b, OP_UUU_B, H1, H1, H1, DO_MAX)
1610RVVCALL(OPIVV2, vmaxu_vv_h, OP_UUU_H, H2, H2, H2, DO_MAX)
1611RVVCALL(OPIVV2, vmaxu_vv_w, OP_UUU_W, H4, H4, H4, DO_MAX)
1612RVVCALL(OPIVV2, vmaxu_vv_d, OP_UUU_D, H8, H8, H8, DO_MAX)
1613RVVCALL(OPIVV2, vmax_vv_b, OP_SSS_B, H1, H1, H1, DO_MAX)
1614RVVCALL(OPIVV2, vmax_vv_h, OP_SSS_H, H2, H2, H2, DO_MAX)
1615RVVCALL(OPIVV2, vmax_vv_w, OP_SSS_W, H4, H4, H4, DO_MAX)
1616RVVCALL(OPIVV2, vmax_vv_d, OP_SSS_D, H8, H8, H8, DO_MAX)
1617GEN_VEXT_VV(vminu_vv_b, 1)
1618GEN_VEXT_VV(vminu_vv_h, 2)
1619GEN_VEXT_VV(vminu_vv_w, 4)
1620GEN_VEXT_VV(vminu_vv_d, 8)
1621GEN_VEXT_VV(vmin_vv_b, 1)
1622GEN_VEXT_VV(vmin_vv_h, 2)
1623GEN_VEXT_VV(vmin_vv_w, 4)
1624GEN_VEXT_VV(vmin_vv_d, 8)
1625GEN_VEXT_VV(vmaxu_vv_b, 1)
1626GEN_VEXT_VV(vmaxu_vv_h, 2)
1627GEN_VEXT_VV(vmaxu_vv_w, 4)
1628GEN_VEXT_VV(vmaxu_vv_d, 8)
1629GEN_VEXT_VV(vmax_vv_b, 1)
1630GEN_VEXT_VV(vmax_vv_h, 2)
1631GEN_VEXT_VV(vmax_vv_w, 4)
1632GEN_VEXT_VV(vmax_vv_d, 8)
1633
1634RVVCALL(OPIVX2, vminu_vx_b, OP_UUU_B, H1, H1, DO_MIN)
1635RVVCALL(OPIVX2, vminu_vx_h, OP_UUU_H, H2, H2, DO_MIN)
1636RVVCALL(OPIVX2, vminu_vx_w, OP_UUU_W, H4, H4, DO_MIN)
1637RVVCALL(OPIVX2, vminu_vx_d, OP_UUU_D, H8, H8, DO_MIN)
1638RVVCALL(OPIVX2, vmin_vx_b, OP_SSS_B, H1, H1, DO_MIN)
1639RVVCALL(OPIVX2, vmin_vx_h, OP_SSS_H, H2, H2, DO_MIN)
1640RVVCALL(OPIVX2, vmin_vx_w, OP_SSS_W, H4, H4, DO_MIN)
1641RVVCALL(OPIVX2, vmin_vx_d, OP_SSS_D, H8, H8, DO_MIN)
1642RVVCALL(OPIVX2, vmaxu_vx_b, OP_UUU_B, H1, H1, DO_MAX)
1643RVVCALL(OPIVX2, vmaxu_vx_h, OP_UUU_H, H2, H2, DO_MAX)
1644RVVCALL(OPIVX2, vmaxu_vx_w, OP_UUU_W, H4, H4, DO_MAX)
1645RVVCALL(OPIVX2, vmaxu_vx_d, OP_UUU_D, H8, H8, DO_MAX)
1646RVVCALL(OPIVX2, vmax_vx_b, OP_SSS_B, H1, H1, DO_MAX)
1647RVVCALL(OPIVX2, vmax_vx_h, OP_SSS_H, H2, H2, DO_MAX)
1648RVVCALL(OPIVX2, vmax_vx_w, OP_SSS_W, H4, H4, DO_MAX)
1649RVVCALL(OPIVX2, vmax_vx_d, OP_SSS_D, H8, H8, DO_MAX)
1650GEN_VEXT_VX(vminu_vx_b, 1)
1651GEN_VEXT_VX(vminu_vx_h, 2)
1652GEN_VEXT_VX(vminu_vx_w, 4)
1653GEN_VEXT_VX(vminu_vx_d, 8)
1654GEN_VEXT_VX(vmin_vx_b, 1)
1655GEN_VEXT_VX(vmin_vx_h, 2)
1656GEN_VEXT_VX(vmin_vx_w, 4)
1657GEN_VEXT_VX(vmin_vx_d, 8)
1658GEN_VEXT_VX(vmaxu_vx_b, 1)
1659GEN_VEXT_VX(vmaxu_vx_h, 2)
1660GEN_VEXT_VX(vmaxu_vx_w, 4)
1661GEN_VEXT_VX(vmaxu_vx_d, 8)
1662GEN_VEXT_VX(vmax_vx_b, 1)
1663GEN_VEXT_VX(vmax_vx_h, 2)
1664GEN_VEXT_VX(vmax_vx_w, 4)
1665GEN_VEXT_VX(vmax_vx_d, 8)
1666
1667/* Vector Single-Width Integer Multiply Instructions */
1668#define DO_MUL(N, M) (N * M)
1669RVVCALL(OPIVV2, vmul_vv_b, OP_SSS_B, H1, H1, H1, DO_MUL)
1670RVVCALL(OPIVV2, vmul_vv_h, OP_SSS_H, H2, H2, H2, DO_MUL)
1671RVVCALL(OPIVV2, vmul_vv_w, OP_SSS_W, H4, H4, H4, DO_MUL)
1672RVVCALL(OPIVV2, vmul_vv_d, OP_SSS_D, H8, H8, H8, DO_MUL)
1673GEN_VEXT_VV(vmul_vv_b, 1)
1674GEN_VEXT_VV(vmul_vv_h, 2)
1675GEN_VEXT_VV(vmul_vv_w, 4)
1676GEN_VEXT_VV(vmul_vv_d, 8)
1677
1678static int8_t do_mulh_b(int8_t s2, int8_t s1)
1679{
1680    return (int16_t)s2 * (int16_t)s1 >> 8;
1681}
1682
1683static int16_t do_mulh_h(int16_t s2, int16_t s1)
1684{
1685    return (int32_t)s2 * (int32_t)s1 >> 16;
1686}
1687
1688static int32_t do_mulh_w(int32_t s2, int32_t s1)
1689{
1690    return (int64_t)s2 * (int64_t)s1 >> 32;
1691}
1692
1693static int64_t do_mulh_d(int64_t s2, int64_t s1)
1694{
1695    uint64_t hi_64, lo_64;
1696
1697    muls64(&lo_64, &hi_64, s1, s2);
1698    return hi_64;
1699}
1700
1701static uint8_t do_mulhu_b(uint8_t s2, uint8_t s1)
1702{
1703    return (uint16_t)s2 * (uint16_t)s1 >> 8;
1704}
1705
1706static uint16_t do_mulhu_h(uint16_t s2, uint16_t s1)
1707{
1708    return (uint32_t)s2 * (uint32_t)s1 >> 16;
1709}
1710
1711static uint32_t do_mulhu_w(uint32_t s2, uint32_t s1)
1712{
1713    return (uint64_t)s2 * (uint64_t)s1 >> 32;
1714}
1715
1716static uint64_t do_mulhu_d(uint64_t s2, uint64_t s1)
1717{
1718    uint64_t hi_64, lo_64;
1719
1720    mulu64(&lo_64, &hi_64, s2, s1);
1721    return hi_64;
1722}
1723
1724static int8_t do_mulhsu_b(int8_t s2, uint8_t s1)
1725{
1726    return (int16_t)s2 * (uint16_t)s1 >> 8;
1727}
1728
1729static int16_t do_mulhsu_h(int16_t s2, uint16_t s1)
1730{
1731    return (int32_t)s2 * (uint32_t)s1 >> 16;
1732}
1733
1734static int32_t do_mulhsu_w(int32_t s2, uint32_t s1)
1735{
1736    return (int64_t)s2 * (uint64_t)s1 >> 32;
1737}
1738
1739/*
1740 * Signed-by-unsigned high multiply via the unsigned product:
1741 *
1742 * Let  A = signed operand (s2),
1743 *      B = unsigned operand (s1),
1744 *      U = unsigned value of A's bit pattern,
1745 *      P = U * B, the full 128-bit unsigned product (mulu64)
1746 *
1747 * IF A >= 0, U == A, so the signed product SP == P.
1748 * IF A < 0,  U == A + 2 ** 64, so
1749 *      SP = A * B
1750 *         = (U - 2 ** 64) * B
1751 *         = P - 2 ** 64 * B
1752 *      and the high half of SP is HI(P) - B (mod 2 ** 64).
1753 *
1754 * THEN
1755 *      HI_P -= (A < 0 ? B : 0)
1756 */
1757
1758static int64_t do_mulhsu_d(int64_t s2, uint64_t s1)
1759{
1760    uint64_t hi_64, lo_64;
1761
1762    mulu64(&lo_64, &hi_64, s2, s1);
1763
1764    hi_64 -= s2 < 0 ? s1 : 0;
1765    return hi_64;
1766}
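
/*
 * A worked example of the adjustment (values chosen for illustration):
 *
 *   s2 = -1 (0xffffffffffffffff), s1 = 2
 *   mulu64: 0xffffffffffffffff * 2 = 0x1_fffffffffffffffe
 *           hi_64 = 1, lo_64 = 0xfffffffffffffffe
 *   s2 < 0: hi_64 -= s1  ->  hi_64 = 0xffffffffffffffff
 *
 * which is the high half of the signed product -1 * 2 = -2
 * (128-bit 0xffffffffffffffff_fffffffffffffffe).
 */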
1767
1768RVVCALL(OPIVV2, vmulh_vv_b, OP_SSS_B, H1, H1, H1, do_mulh_b)
1769RVVCALL(OPIVV2, vmulh_vv_h, OP_SSS_H, H2, H2, H2, do_mulh_h)
1770RVVCALL(OPIVV2, vmulh_vv_w, OP_SSS_W, H4, H4, H4, do_mulh_w)
1771RVVCALL(OPIVV2, vmulh_vv_d, OP_SSS_D, H8, H8, H8, do_mulh_d)
1772RVVCALL(OPIVV2, vmulhu_vv_b, OP_UUU_B, H1, H1, H1, do_mulhu_b)
1773RVVCALL(OPIVV2, vmulhu_vv_h, OP_UUU_H, H2, H2, H2, do_mulhu_h)
1774RVVCALL(OPIVV2, vmulhu_vv_w, OP_UUU_W, H4, H4, H4, do_mulhu_w)
1775RVVCALL(OPIVV2, vmulhu_vv_d, OP_UUU_D, H8, H8, H8, do_mulhu_d)
1776RVVCALL(OPIVV2, vmulhsu_vv_b, OP_SUS_B, H1, H1, H1, do_mulhsu_b)
1777RVVCALL(OPIVV2, vmulhsu_vv_h, OP_SUS_H, H2, H2, H2, do_mulhsu_h)
1778RVVCALL(OPIVV2, vmulhsu_vv_w, OP_SUS_W, H4, H4, H4, do_mulhsu_w)
1779RVVCALL(OPIVV2, vmulhsu_vv_d, OP_SUS_D, H8, H8, H8, do_mulhsu_d)
1780GEN_VEXT_VV(vmulh_vv_b, 1)
1781GEN_VEXT_VV(vmulh_vv_h, 2)
1782GEN_VEXT_VV(vmulh_vv_w, 4)
1783GEN_VEXT_VV(vmulh_vv_d, 8)
1784GEN_VEXT_VV(vmulhu_vv_b, 1)
1785GEN_VEXT_VV(vmulhu_vv_h, 2)
1786GEN_VEXT_VV(vmulhu_vv_w, 4)
1787GEN_VEXT_VV(vmulhu_vv_d, 8)
1788GEN_VEXT_VV(vmulhsu_vv_b, 1)
1789GEN_VEXT_VV(vmulhsu_vv_h, 2)
1790GEN_VEXT_VV(vmulhsu_vv_w, 4)
1791GEN_VEXT_VV(vmulhsu_vv_d, 8)
1792
1793RVVCALL(OPIVX2, vmul_vx_b, OP_SSS_B, H1, H1, DO_MUL)
1794RVVCALL(OPIVX2, vmul_vx_h, OP_SSS_H, H2, H2, DO_MUL)
1795RVVCALL(OPIVX2, vmul_vx_w, OP_SSS_W, H4, H4, DO_MUL)
1796RVVCALL(OPIVX2, vmul_vx_d, OP_SSS_D, H8, H8, DO_MUL)
1797RVVCALL(OPIVX2, vmulh_vx_b, OP_SSS_B, H1, H1, do_mulh_b)
1798RVVCALL(OPIVX2, vmulh_vx_h, OP_SSS_H, H2, H2, do_mulh_h)
1799RVVCALL(OPIVX2, vmulh_vx_w, OP_SSS_W, H4, H4, do_mulh_w)
1800RVVCALL(OPIVX2, vmulh_vx_d, OP_SSS_D, H8, H8, do_mulh_d)
1801RVVCALL(OPIVX2, vmulhu_vx_b, OP_UUU_B, H1, H1, do_mulhu_b)
1802RVVCALL(OPIVX2, vmulhu_vx_h, OP_UUU_H, H2, H2, do_mulhu_h)
1803RVVCALL(OPIVX2, vmulhu_vx_w, OP_UUU_W, H4, H4, do_mulhu_w)
1804RVVCALL(OPIVX2, vmulhu_vx_d, OP_UUU_D, H8, H8, do_mulhu_d)
1805RVVCALL(OPIVX2, vmulhsu_vx_b, OP_SUS_B, H1, H1, do_mulhsu_b)
1806RVVCALL(OPIVX2, vmulhsu_vx_h, OP_SUS_H, H2, H2, do_mulhsu_h)
1807RVVCALL(OPIVX2, vmulhsu_vx_w, OP_SUS_W, H4, H4, do_mulhsu_w)
1808RVVCALL(OPIVX2, vmulhsu_vx_d, OP_SUS_D, H8, H8, do_mulhsu_d)
1809GEN_VEXT_VX(vmul_vx_b, 1)
1810GEN_VEXT_VX(vmul_vx_h, 2)
1811GEN_VEXT_VX(vmul_vx_w, 4)
1812GEN_VEXT_VX(vmul_vx_d, 8)
1813GEN_VEXT_VX(vmulh_vx_b, 1)
1814GEN_VEXT_VX(vmulh_vx_h, 2)
1815GEN_VEXT_VX(vmulh_vx_w, 4)
1816GEN_VEXT_VX(vmulh_vx_d, 8)
1817GEN_VEXT_VX(vmulhu_vx_b, 1)
1818GEN_VEXT_VX(vmulhu_vx_h, 2)
1819GEN_VEXT_VX(vmulhu_vx_w, 4)
1820GEN_VEXT_VX(vmulhu_vx_d, 8)
1821GEN_VEXT_VX(vmulhsu_vx_b, 1)
1822GEN_VEXT_VX(vmulhsu_vx_h, 2)
1823GEN_VEXT_VX(vmulhsu_vx_w, 4)
1824GEN_VEXT_VX(vmulhsu_vx_d, 8)
1825
1826/* Vector Integer Divide Instructions */
1827#define DO_DIVU(N, M) (unlikely(M == 0) ? (__typeof(N))(-1) : N / M)
1828#define DO_REMU(N, M) (unlikely(M == 0) ? N : N % M)
1829#define DO_DIV(N, M)  (unlikely(M == 0) ? (__typeof(N))(-1) : \
1830        unlikely((N == -N) && (M == (__typeof(N))(-1))) ? N : N / M)
1831#define DO_REM(N, M)  (unlikely(M == 0) ? N : \
1832        unlikely((N == -N) && (M == (__typeof(N))(-1))) ? 0 : N % M)
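
/*
 * These macros encode the RISC-V divide special cases instead of letting
 * the C operators overflow or trap:
 *
 *   - division by zero:  quotient is all ones (-1), remainder is N
 *   - MIN / -1 (signed): quotient is MIN (N), remainder is 0
 *
 * The "(N == -N) && (M == -1)" test is meant to single out the one
 * overflowing signed division, MIN / -1: with the wrap-around semantics
 * the code relies on, only zero and the minimum value equal their own
 * negation (for the narrower types the usual integer promotions make the
 * division itself safe anyway).  E.g. DO_DIV(INT64_MIN, -1) yields
 * INT64_MIN and DO_REM(INT64_MIN, -1) yields 0.
 */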
1833
1834RVVCALL(OPIVV2, vdivu_vv_b, OP_UUU_B, H1, H1, H1, DO_DIVU)
1835RVVCALL(OPIVV2, vdivu_vv_h, OP_UUU_H, H2, H2, H2, DO_DIVU)
1836RVVCALL(OPIVV2, vdivu_vv_w, OP_UUU_W, H4, H4, H4, DO_DIVU)
1837RVVCALL(OPIVV2, vdivu_vv_d, OP_UUU_D, H8, H8, H8, DO_DIVU)
1838RVVCALL(OPIVV2, vdiv_vv_b, OP_SSS_B, H1, H1, H1, DO_DIV)
1839RVVCALL(OPIVV2, vdiv_vv_h, OP_SSS_H, H2, H2, H2, DO_DIV)
1840RVVCALL(OPIVV2, vdiv_vv_w, OP_SSS_W, H4, H4, H4, DO_DIV)
1841RVVCALL(OPIVV2, vdiv_vv_d, OP_SSS_D, H8, H8, H8, DO_DIV)
1842RVVCALL(OPIVV2, vremu_vv_b, OP_UUU_B, H1, H1, H1, DO_REMU)
1843RVVCALL(OPIVV2, vremu_vv_h, OP_UUU_H, H2, H2, H2, DO_REMU)
1844RVVCALL(OPIVV2, vremu_vv_w, OP_UUU_W, H4, H4, H4, DO_REMU)
1845RVVCALL(OPIVV2, vremu_vv_d, OP_UUU_D, H8, H8, H8, DO_REMU)
1846RVVCALL(OPIVV2, vrem_vv_b, OP_SSS_B, H1, H1, H1, DO_REM)
1847RVVCALL(OPIVV2, vrem_vv_h, OP_SSS_H, H2, H2, H2, DO_REM)
1848RVVCALL(OPIVV2, vrem_vv_w, OP_SSS_W, H4, H4, H4, DO_REM)
1849RVVCALL(OPIVV2, vrem_vv_d, OP_SSS_D, H8, H8, H8, DO_REM)
1850GEN_VEXT_VV(vdivu_vv_b, 1)
1851GEN_VEXT_VV(vdivu_vv_h, 2)
1852GEN_VEXT_VV(vdivu_vv_w, 4)
1853GEN_VEXT_VV(vdivu_vv_d, 8)
1854GEN_VEXT_VV(vdiv_vv_b, 1)
1855GEN_VEXT_VV(vdiv_vv_h, 2)
1856GEN_VEXT_VV(vdiv_vv_w, 4)
1857GEN_VEXT_VV(vdiv_vv_d, 8)
1858GEN_VEXT_VV(vremu_vv_b, 1)
1859GEN_VEXT_VV(vremu_vv_h, 2)
1860GEN_VEXT_VV(vremu_vv_w, 4)
1861GEN_VEXT_VV(vremu_vv_d, 8)
1862GEN_VEXT_VV(vrem_vv_b, 1)
1863GEN_VEXT_VV(vrem_vv_h, 2)
1864GEN_VEXT_VV(vrem_vv_w, 4)
1865GEN_VEXT_VV(vrem_vv_d, 8)
1866
1867RVVCALL(OPIVX2, vdivu_vx_b, OP_UUU_B, H1, H1, DO_DIVU)
1868RVVCALL(OPIVX2, vdivu_vx_h, OP_UUU_H, H2, H2, DO_DIVU)
1869RVVCALL(OPIVX2, vdivu_vx_w, OP_UUU_W, H4, H4, DO_DIVU)
1870RVVCALL(OPIVX2, vdivu_vx_d, OP_UUU_D, H8, H8, DO_DIVU)
1871RVVCALL(OPIVX2, vdiv_vx_b, OP_SSS_B, H1, H1, DO_DIV)
1872RVVCALL(OPIVX2, vdiv_vx_h, OP_SSS_H, H2, H2, DO_DIV)
1873RVVCALL(OPIVX2, vdiv_vx_w, OP_SSS_W, H4, H4, DO_DIV)
1874RVVCALL(OPIVX2, vdiv_vx_d, OP_SSS_D, H8, H8, DO_DIV)
1875RVVCALL(OPIVX2, vremu_vx_b, OP_UUU_B, H1, H1, DO_REMU)
1876RVVCALL(OPIVX2, vremu_vx_h, OP_UUU_H, H2, H2, DO_REMU)
1877RVVCALL(OPIVX2, vremu_vx_w, OP_UUU_W, H4, H4, DO_REMU)
1878RVVCALL(OPIVX2, vremu_vx_d, OP_UUU_D, H8, H8, DO_REMU)
1879RVVCALL(OPIVX2, vrem_vx_b, OP_SSS_B, H1, H1, DO_REM)
1880RVVCALL(OPIVX2, vrem_vx_h, OP_SSS_H, H2, H2, DO_REM)
1881RVVCALL(OPIVX2, vrem_vx_w, OP_SSS_W, H4, H4, DO_REM)
1882RVVCALL(OPIVX2, vrem_vx_d, OP_SSS_D, H8, H8, DO_REM)
1883GEN_VEXT_VX(vdivu_vx_b, 1)
1884GEN_VEXT_VX(vdivu_vx_h, 2)
1885GEN_VEXT_VX(vdivu_vx_w, 4)
1886GEN_VEXT_VX(vdivu_vx_d, 8)
1887GEN_VEXT_VX(vdiv_vx_b, 1)
1888GEN_VEXT_VX(vdiv_vx_h, 2)
1889GEN_VEXT_VX(vdiv_vx_w, 4)
1890GEN_VEXT_VX(vdiv_vx_d, 8)
1891GEN_VEXT_VX(vremu_vx_b, 1)
1892GEN_VEXT_VX(vremu_vx_h, 2)
1893GEN_VEXT_VX(vremu_vx_w, 4)
1894GEN_VEXT_VX(vremu_vx_d, 8)
1895GEN_VEXT_VX(vrem_vx_b, 1)
1896GEN_VEXT_VX(vrem_vx_h, 2)
1897GEN_VEXT_VX(vrem_vx_w, 4)
1898GEN_VEXT_VX(vrem_vx_d, 8)
1899
1900/* Vector Widening Integer Multiply Instructions */
1901RVVCALL(OPIVV2, vwmul_vv_b, WOP_SSS_B, H2, H1, H1, DO_MUL)
1902RVVCALL(OPIVV2, vwmul_vv_h, WOP_SSS_H, H4, H2, H2, DO_MUL)
1903RVVCALL(OPIVV2, vwmul_vv_w, WOP_SSS_W, H8, H4, H4, DO_MUL)
1904RVVCALL(OPIVV2, vwmulu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MUL)
1905RVVCALL(OPIVV2, vwmulu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MUL)
1906RVVCALL(OPIVV2, vwmulu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MUL)
1907RVVCALL(OPIVV2, vwmulsu_vv_b, WOP_SUS_B, H2, H1, H1, DO_MUL)
1908RVVCALL(OPIVV2, vwmulsu_vv_h, WOP_SUS_H, H4, H2, H2, DO_MUL)
1909RVVCALL(OPIVV2, vwmulsu_vv_w, WOP_SUS_W, H8, H4, H4, DO_MUL)
1910GEN_VEXT_VV(vwmul_vv_b, 2)
1911GEN_VEXT_VV(vwmul_vv_h, 4)
1912GEN_VEXT_VV(vwmul_vv_w, 8)
1913GEN_VEXT_VV(vwmulu_vv_b, 2)
1914GEN_VEXT_VV(vwmulu_vv_h, 4)
1915GEN_VEXT_VV(vwmulu_vv_w, 8)
1916GEN_VEXT_VV(vwmulsu_vv_b, 2)
1917GEN_VEXT_VV(vwmulsu_vv_h, 4)
1918GEN_VEXT_VV(vwmulsu_vv_w, 8)
1919
1920RVVCALL(OPIVX2, vwmul_vx_b, WOP_SSS_B, H2, H1, DO_MUL)
1921RVVCALL(OPIVX2, vwmul_vx_h, WOP_SSS_H, H4, H2, DO_MUL)
1922RVVCALL(OPIVX2, vwmul_vx_w, WOP_SSS_W, H8, H4, DO_MUL)
1923RVVCALL(OPIVX2, vwmulu_vx_b, WOP_UUU_B, H2, H1, DO_MUL)
1924RVVCALL(OPIVX2, vwmulu_vx_h, WOP_UUU_H, H4, H2, DO_MUL)
1925RVVCALL(OPIVX2, vwmulu_vx_w, WOP_UUU_W, H8, H4, DO_MUL)
1926RVVCALL(OPIVX2, vwmulsu_vx_b, WOP_SUS_B, H2, H1, DO_MUL)
1927RVVCALL(OPIVX2, vwmulsu_vx_h, WOP_SUS_H, H4, H2, DO_MUL)
1928RVVCALL(OPIVX2, vwmulsu_vx_w, WOP_SUS_W, H8, H4, DO_MUL)
1929GEN_VEXT_VX(vwmul_vx_b, 2)
1930GEN_VEXT_VX(vwmul_vx_h, 4)
1931GEN_VEXT_VX(vwmul_vx_w, 8)
1932GEN_VEXT_VX(vwmulu_vx_b, 2)
1933GEN_VEXT_VX(vwmulu_vx_h, 4)
1934GEN_VEXT_VX(vwmulu_vx_w, 8)
1935GEN_VEXT_VX(vwmulsu_vx_b, 2)
1936GEN_VEXT_VX(vwmulsu_vx_h, 4)
1937GEN_VEXT_VX(vwmulsu_vx_w, 8)
1938
1939/* Vector Single-Width Integer Multiply-Add Instructions */
1940#define OPIVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
1941static void do_##NAME(void *vd, void *vs1, void *vs2, int i)       \
1942{                                                                  \
1943    TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
1944    TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
1945    TD d = *((TD *)vd + HD(i));                                    \
1946    *((TD *)vd + HD(i)) = OP(s2, s1, d);                           \
1947}
1948
1949#define DO_MACC(N, M, D) (M * N + D)
1950#define DO_NMSAC(N, M, D) (-(M * N) + D)
1951#define DO_MADD(N, M, D) (M * D + N)
1952#define DO_NMSUB(N, M, D) (-(M * D) + N)
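
/*
 * OPIVV3/OPIVX3 invoke OP(s2, s1, d), so in the macros above N is the
 * vs2 element, M is the vs1 element (or the scalar) and D is the old vd
 * element.  That gives the two overwrite flavours of the spec:
 *
 *   vmacc:  vd[i] = vs1[i] * vs2[i] + vd[i]     (DO_MACC)
 *   vmadd:  vd[i] = vs1[i] * vd[i] + vs2[i]     (DO_MADD)
 *
 * e.g. with vs1[i] = 3, vs2[i] = 4, vd[i] = 10: vmacc writes 22 and
 * vmadd writes 34 (vnmsac/vnmsub negate the product first).
 */
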
1953RVVCALL(OPIVV3, vmacc_vv_b, OP_SSS_B, H1, H1, H1, DO_MACC)
1954RVVCALL(OPIVV3, vmacc_vv_h, OP_SSS_H, H2, H2, H2, DO_MACC)
1955RVVCALL(OPIVV3, vmacc_vv_w, OP_SSS_W, H4, H4, H4, DO_MACC)
1956RVVCALL(OPIVV3, vmacc_vv_d, OP_SSS_D, H8, H8, H8, DO_MACC)
1957RVVCALL(OPIVV3, vnmsac_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSAC)
1958RVVCALL(OPIVV3, vnmsac_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSAC)
1959RVVCALL(OPIVV3, vnmsac_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSAC)
1960RVVCALL(OPIVV3, vnmsac_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSAC)
1961RVVCALL(OPIVV3, vmadd_vv_b, OP_SSS_B, H1, H1, H1, DO_MADD)
1962RVVCALL(OPIVV3, vmadd_vv_h, OP_SSS_H, H2, H2, H2, DO_MADD)
1963RVVCALL(OPIVV3, vmadd_vv_w, OP_SSS_W, H4, H4, H4, DO_MADD)
1964RVVCALL(OPIVV3, vmadd_vv_d, OP_SSS_D, H8, H8, H8, DO_MADD)
1965RVVCALL(OPIVV3, vnmsub_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSUB)
1966RVVCALL(OPIVV3, vnmsub_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSUB)
1967RVVCALL(OPIVV3, vnmsub_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSUB)
1968RVVCALL(OPIVV3, vnmsub_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSUB)
1969GEN_VEXT_VV(vmacc_vv_b, 1)
1970GEN_VEXT_VV(vmacc_vv_h, 2)
1971GEN_VEXT_VV(vmacc_vv_w, 4)
1972GEN_VEXT_VV(vmacc_vv_d, 8)
1973GEN_VEXT_VV(vnmsac_vv_b, 1)
1974GEN_VEXT_VV(vnmsac_vv_h, 2)
1975GEN_VEXT_VV(vnmsac_vv_w, 4)
1976GEN_VEXT_VV(vnmsac_vv_d, 8)
1977GEN_VEXT_VV(vmadd_vv_b, 1)
1978GEN_VEXT_VV(vmadd_vv_h, 2)
1979GEN_VEXT_VV(vmadd_vv_w, 4)
1980GEN_VEXT_VV(vmadd_vv_d, 8)
1981GEN_VEXT_VV(vnmsub_vv_b, 1)
1982GEN_VEXT_VV(vnmsub_vv_h, 2)
1983GEN_VEXT_VV(vnmsub_vv_w, 4)
1984GEN_VEXT_VV(vnmsub_vv_d, 8)
1985
1986#define OPIVX3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
1987static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
1988{                                                                   \
1989    TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
1990    TD d = *((TD *)vd + HD(i));                                     \
1991    *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d);                   \
1992}
1993
1994RVVCALL(OPIVX3, vmacc_vx_b, OP_SSS_B, H1, H1, DO_MACC)
1995RVVCALL(OPIVX3, vmacc_vx_h, OP_SSS_H, H2, H2, DO_MACC)
1996RVVCALL(OPIVX3, vmacc_vx_w, OP_SSS_W, H4, H4, DO_MACC)
1997RVVCALL(OPIVX3, vmacc_vx_d, OP_SSS_D, H8, H8, DO_MACC)
1998RVVCALL(OPIVX3, vnmsac_vx_b, OP_SSS_B, H1, H1, DO_NMSAC)
1999RVVCALL(OPIVX3, vnmsac_vx_h, OP_SSS_H, H2, H2, DO_NMSAC)
2000RVVCALL(OPIVX3, vnmsac_vx_w, OP_SSS_W, H4, H4, DO_NMSAC)
2001RVVCALL(OPIVX3, vnmsac_vx_d, OP_SSS_D, H8, H8, DO_NMSAC)
2002RVVCALL(OPIVX3, vmadd_vx_b, OP_SSS_B, H1, H1, DO_MADD)
2003RVVCALL(OPIVX3, vmadd_vx_h, OP_SSS_H, H2, H2, DO_MADD)
2004RVVCALL(OPIVX3, vmadd_vx_w, OP_SSS_W, H4, H4, DO_MADD)
2005RVVCALL(OPIVX3, vmadd_vx_d, OP_SSS_D, H8, H8, DO_MADD)
2006RVVCALL(OPIVX3, vnmsub_vx_b, OP_SSS_B, H1, H1, DO_NMSUB)
2007RVVCALL(OPIVX3, vnmsub_vx_h, OP_SSS_H, H2, H2, DO_NMSUB)
2008RVVCALL(OPIVX3, vnmsub_vx_w, OP_SSS_W, H4, H4, DO_NMSUB)
2009RVVCALL(OPIVX3, vnmsub_vx_d, OP_SSS_D, H8, H8, DO_NMSUB)
2010GEN_VEXT_VX(vmacc_vx_b, 1)
2011GEN_VEXT_VX(vmacc_vx_h, 2)
2012GEN_VEXT_VX(vmacc_vx_w, 4)
2013GEN_VEXT_VX(vmacc_vx_d, 8)
2014GEN_VEXT_VX(vnmsac_vx_b, 1)
2015GEN_VEXT_VX(vnmsac_vx_h, 2)
2016GEN_VEXT_VX(vnmsac_vx_w, 4)
2017GEN_VEXT_VX(vnmsac_vx_d, 8)
2018GEN_VEXT_VX(vmadd_vx_b, 1)
2019GEN_VEXT_VX(vmadd_vx_h, 2)
2020GEN_VEXT_VX(vmadd_vx_w, 4)
2021GEN_VEXT_VX(vmadd_vx_d, 8)
2022GEN_VEXT_VX(vnmsub_vx_b, 1)
2023GEN_VEXT_VX(vnmsub_vx_h, 2)
2024GEN_VEXT_VX(vnmsub_vx_w, 4)
2025GEN_VEXT_VX(vnmsub_vx_d, 8)
2026
2027/* Vector Widening Integer Multiply-Add Instructions */
2028RVVCALL(OPIVV3, vwmaccu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MACC)
2029RVVCALL(OPIVV3, vwmaccu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MACC)
2030RVVCALL(OPIVV3, vwmaccu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MACC)
2031RVVCALL(OPIVV3, vwmacc_vv_b, WOP_SSS_B, H2, H1, H1, DO_MACC)
2032RVVCALL(OPIVV3, vwmacc_vv_h, WOP_SSS_H, H4, H2, H2, DO_MACC)
2033RVVCALL(OPIVV3, vwmacc_vv_w, WOP_SSS_W, H8, H4, H4, DO_MACC)
2034RVVCALL(OPIVV3, vwmaccsu_vv_b, WOP_SSU_B, H2, H1, H1, DO_MACC)
2035RVVCALL(OPIVV3, vwmaccsu_vv_h, WOP_SSU_H, H4, H2, H2, DO_MACC)
2036RVVCALL(OPIVV3, vwmaccsu_vv_w, WOP_SSU_W, H8, H4, H4, DO_MACC)
2037GEN_VEXT_VV(vwmaccu_vv_b, 2)
2038GEN_VEXT_VV(vwmaccu_vv_h, 4)
2039GEN_VEXT_VV(vwmaccu_vv_w, 8)
2040GEN_VEXT_VV(vwmacc_vv_b, 2)
2041GEN_VEXT_VV(vwmacc_vv_h, 4)
2042GEN_VEXT_VV(vwmacc_vv_w, 8)
2043GEN_VEXT_VV(vwmaccsu_vv_b, 2)
2044GEN_VEXT_VV(vwmaccsu_vv_h, 4)
2045GEN_VEXT_VV(vwmaccsu_vv_w, 8)
2046
2047RVVCALL(OPIVX3, vwmaccu_vx_b, WOP_UUU_B, H2, H1, DO_MACC)
2048RVVCALL(OPIVX3, vwmaccu_vx_h, WOP_UUU_H, H4, H2, DO_MACC)
2049RVVCALL(OPIVX3, vwmaccu_vx_w, WOP_UUU_W, H8, H4, DO_MACC)
2050RVVCALL(OPIVX3, vwmacc_vx_b, WOP_SSS_B, H2, H1, DO_MACC)
2051RVVCALL(OPIVX3, vwmacc_vx_h, WOP_SSS_H, H4, H2, DO_MACC)
2052RVVCALL(OPIVX3, vwmacc_vx_w, WOP_SSS_W, H8, H4, DO_MACC)
2053RVVCALL(OPIVX3, vwmaccsu_vx_b, WOP_SSU_B, H2, H1, DO_MACC)
2054RVVCALL(OPIVX3, vwmaccsu_vx_h, WOP_SSU_H, H4, H2, DO_MACC)
2055RVVCALL(OPIVX3, vwmaccsu_vx_w, WOP_SSU_W, H8, H4, DO_MACC)
2056RVVCALL(OPIVX3, vwmaccus_vx_b, WOP_SUS_B, H2, H1, DO_MACC)
2057RVVCALL(OPIVX3, vwmaccus_vx_h, WOP_SUS_H, H4, H2, DO_MACC)
2058RVVCALL(OPIVX3, vwmaccus_vx_w, WOP_SUS_W, H8, H4, DO_MACC)
2059GEN_VEXT_VX(vwmaccu_vx_b, 2)
2060GEN_VEXT_VX(vwmaccu_vx_h, 4)
2061GEN_VEXT_VX(vwmaccu_vx_w, 8)
2062GEN_VEXT_VX(vwmacc_vx_b, 2)
2063GEN_VEXT_VX(vwmacc_vx_h, 4)
2064GEN_VEXT_VX(vwmacc_vx_w, 8)
2065GEN_VEXT_VX(vwmaccsu_vx_b, 2)
2066GEN_VEXT_VX(vwmaccsu_vx_h, 4)
2067GEN_VEXT_VX(vwmaccsu_vx_w, 8)
2068GEN_VEXT_VX(vwmaccus_vx_b, 2)
2069GEN_VEXT_VX(vwmaccus_vx_h, 4)
2070GEN_VEXT_VX(vwmaccus_vx_w, 8)
2071
2072/* Vector Integer Merge and Move Instructions */
2073#define GEN_VEXT_VMV_VV(NAME, ETYPE, H)                              \
2074void HELPER(NAME)(void *vd, void *vs1, CPURISCVState *env,           \
2075                  uint32_t desc)                                     \
2076{                                                                    \
2077    uint32_t vl = env->vl;                                           \
2078    uint32_t esz = sizeof(ETYPE);                                    \
2079    uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2080    uint32_t vta = vext_vta(desc);                                   \
2081    uint32_t i;                                                      \
2082                                                                     \
2083    VSTART_CHECK_EARLY_EXIT(env, vl);                                \
2084                                                                     \
2085    for (i = env->vstart; i < vl; i++) {                             \
2086        ETYPE s1 = *((ETYPE *)vs1 + H(i));                           \
2087        *((ETYPE *)vd + H(i)) = s1;                                  \
2088    }                                                                \
2089    env->vstart = 0;                                                 \
2090    /* set tail elements to 1s */                                    \
2091    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2092}
2093
2094GEN_VEXT_VMV_VV(vmv_v_v_b, int8_t,  H1)
2095GEN_VEXT_VMV_VV(vmv_v_v_h, int16_t, H2)
2096GEN_VEXT_VMV_VV(vmv_v_v_w, int32_t, H4)
2097GEN_VEXT_VMV_VV(vmv_v_v_d, int64_t, H8)
2098
2099#define GEN_VEXT_VMV_VX(NAME, ETYPE, H)                              \
2100void HELPER(NAME)(void *vd, uint64_t s1, CPURISCVState *env,         \
2101                  uint32_t desc)                                     \
2102{                                                                    \
2103    uint32_t vl = env->vl;                                           \
2104    uint32_t esz = sizeof(ETYPE);                                    \
2105    uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2106    uint32_t vta = vext_vta(desc);                                   \
2107    uint32_t i;                                                      \
2108                                                                     \
2109    VSTART_CHECK_EARLY_EXIT(env, vl);                                \
2110                                                                     \
2111    for (i = env->vstart; i < vl; i++) {                             \
2112        *((ETYPE *)vd + H(i)) = (ETYPE)s1;                           \
2113    }                                                                \
2114    env->vstart = 0;                                                 \
2115    /* set tail elements to 1s */                                    \
2116    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2117}
2118
2119GEN_VEXT_VMV_VX(vmv_v_x_b, int8_t,  H1)
2120GEN_VEXT_VMV_VX(vmv_v_x_h, int16_t, H2)
2121GEN_VEXT_VMV_VX(vmv_v_x_w, int32_t, H4)
2122GEN_VEXT_VMV_VX(vmv_v_x_d, int64_t, H8)
2123
2124#define GEN_VEXT_VMERGE_VV(NAME, ETYPE, H)                           \
2125void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,          \
2126                  CPURISCVState *env, uint32_t desc)                 \
2127{                                                                    \
2128    uint32_t vl = env->vl;                                           \
2129    uint32_t esz = sizeof(ETYPE);                                    \
2130    uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2131    uint32_t vta = vext_vta(desc);                                   \
2132    uint32_t i;                                                      \
2133                                                                     \
2134    VSTART_CHECK_EARLY_EXIT(env, vl);                                \
2135                                                                     \
2136    for (i = env->vstart; i < vl; i++) {                             \
2137        ETYPE *vt = (!vext_elem_mask(v0, i) ? vs2 : vs1);            \
2138        *((ETYPE *)vd + H(i)) = *(vt + H(i));                        \
2139    }                                                                \
2140    env->vstart = 0;                                                 \
2141    /* set tail elements to 1s */                                    \
2142    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2143}
2144
2145GEN_VEXT_VMERGE_VV(vmerge_vvm_b, int8_t,  H1)
2146GEN_VEXT_VMERGE_VV(vmerge_vvm_h, int16_t, H2)
2147GEN_VEXT_VMERGE_VV(vmerge_vvm_w, int32_t, H4)
2148GEN_VEXT_VMERGE_VV(vmerge_vvm_d, int64_t, H8)
2149
2150#define GEN_VEXT_VMERGE_VX(NAME, ETYPE, H)                           \
2151void HELPER(NAME)(void *vd, void *v0, target_ulong s1,               \
2152                  void *vs2, CPURISCVState *env, uint32_t desc)      \
2153{                                                                    \
2154    uint32_t vl = env->vl;                                           \
2155    uint32_t esz = sizeof(ETYPE);                                    \
2156    uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2157    uint32_t vta = vext_vta(desc);                                   \
2158    uint32_t i;                                                      \
2159                                                                     \
2160    VSTART_CHECK_EARLY_EXIT(env, vl);                                \
2161                                                                     \
2162    for (i = env->vstart; i < vl; i++) {                             \
2163        ETYPE s2 = *((ETYPE *)vs2 + H(i));                           \
2164        ETYPE d = (!vext_elem_mask(v0, i) ? s2 :                     \
2165                   (ETYPE)(target_long)s1);                          \
2166        *((ETYPE *)vd + H(i)) = d;                                   \
2167    }                                                                \
2168    env->vstart = 0;                                                 \
2169    /* set tail elements to 1s */                                    \
2170    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2171}
2172
2173GEN_VEXT_VMERGE_VX(vmerge_vxm_b, int8_t,  H1)
2174GEN_VEXT_VMERGE_VX(vmerge_vxm_h, int16_t, H2)
2175GEN_VEXT_VMERGE_VX(vmerge_vxm_w, int32_t, H4)
2176GEN_VEXT_VMERGE_VX(vmerge_vxm_d, int64_t, H8)
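
/*
 * A short illustration of the merge semantics implemented above: for
 * each body element the mask register v0 selects the source,
 *
 *   vd[i] = v0.mask[i] ? vs1[i] : vs2[i]      (vmerge.vvm)
 *   vd[i] = v0.mask[i] ? rs1    : vs2[i]      (vmerge.vxm)
 *
 * while the unmasked vmv.v.* helpers above simply copy vs1 or the scalar
 * into every body element.
 */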
2177
2178/*
2179 * Vector Fixed-Point Arithmetic Instructions
2180 */
2181
2182/* Vector Single-Width Saturating Add and Subtract */
2183
2184/*
2185 * Fixed-point instructions need a rounding mode and may saturate,
2186 * so define the common fixed-point macros here.
2187 */
2188typedef void opivv2_rm_fn(void *vd, void *vs1, void *vs2, int i,
2189                          CPURISCVState *env, int vxrm);
2190
2191#define OPIVV2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)     \
2192static inline void                                                  \
2193do_##NAME(void *vd, void *vs1, void *vs2, int i,                    \
2194          CPURISCVState *env, int vxrm)                             \
2195{                                                                   \
2196    TX1 s1 = *((T1 *)vs1 + HS1(i));                                 \
2197    TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2198    *((TD *)vd + HD(i)) = OP(env, vxrm, s2, s1);                    \
2199}
2200
2201static inline void
2202vext_vv_rm_1(void *vd, void *v0, void *vs1, void *vs2,
2203             CPURISCVState *env,
2204             uint32_t vl, uint32_t vm, int vxrm,
2205             opivv2_rm_fn *fn, uint32_t vma, uint32_t esz)
2206{
2207    for (uint32_t i = env->vstart; i < vl; i++) {
2208        if (!vm && !vext_elem_mask(v0, i)) {
2209            /* set masked-off elements to 1s */
2210            vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
2211            continue;
2212        }
2213        fn(vd, vs1, vs2, i, env, vxrm);
2214    }
2215    env->vstart = 0;
2216}
2217
2218static inline void
2219vext_vv_rm_2(void *vd, void *v0, void *vs1, void *vs2,
2220             CPURISCVState *env,
2221             uint32_t desc,
2222             opivv2_rm_fn *fn, uint32_t esz)
2223{
2224    uint32_t vm = vext_vm(desc);
2225    uint32_t vl = env->vl;
2226    uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2227    uint32_t vta = vext_vta(desc);
2228    uint32_t vma = vext_vma(desc);
2229
2230    VSTART_CHECK_EARLY_EXIT(env, vl);
2231
2232    switch (env->vxrm) {
2233    case 0: /* rnu */
2234        vext_vv_rm_1(vd, v0, vs1, vs2,
2235                     env, vl, vm, 0, fn, vma, esz);
2236        break;
2237    case 1: /* rne */
2238        vext_vv_rm_1(vd, v0, vs1, vs2,
2239                     env, vl, vm, 1, fn, vma, esz);
2240        break;
2241    case 2: /* rdn */
2242        vext_vv_rm_1(vd, v0, vs1, vs2,
2243                     env, vl, vm, 2, fn, vma, esz);
2244        break;
2245    default: /* rod */
2246        vext_vv_rm_1(vd, v0, vs1, vs2,
2247                     env, vl, vm, 3, fn, vma, esz);
2248        break;
2249    }
2250    /* set tail elements to 1s */
2251    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2252}
2253
2254/* generate helpers for fixed-point instructions with OPIVV format */
2255#define GEN_VEXT_VV_RM(NAME, ESZ)                               \
2256void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,     \
2257                  CPURISCVState *env, uint32_t desc)            \
2258{                                                               \
2259    vext_vv_rm_2(vd, v0, vs1, vs2, env, desc,                   \
2260                 do_##NAME, ESZ);                               \
2261}
2262
2263static inline uint8_t saddu8(CPURISCVState *env, int vxrm, uint8_t a,
2264                             uint8_t b)
2265{
2266    uint8_t res = a + b;
2267    if (res < a) {
2268        res = UINT8_MAX;
2269        env->vxsat = 0x1;
2270    }
2271    return res;
2272}
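
/*
 * Unsigned saturating add: on overflow the 8-bit sum wraps and becomes
 * smaller than either operand, which is what the "res < a" test catches.
 * E.g. a = 200, b = 100: res wraps to 44 (< 200), so the result is
 * clamped to UINT8_MAX and vxsat is set.  The 16/32/64-bit variants
 * below follow the same pattern.
 */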
2273
2274static inline uint16_t saddu16(CPURISCVState *env, int vxrm, uint16_t a,
2275                               uint16_t b)
2276{
2277    uint16_t res = a + b;
2278    if (res < a) {
2279        res = UINT16_MAX;
2280        env->vxsat = 0x1;
2281    }
2282    return res;
2283}
2284
2285static inline uint32_t saddu32(CPURISCVState *env, int vxrm, uint32_t a,
2286                               uint32_t b)
2287{
2288    uint32_t res = a + b;
2289    if (res < a) {
2290        res = UINT32_MAX;
2291        env->vxsat = 0x1;
2292    }
2293    return res;
2294}
2295
2296static inline uint64_t saddu64(CPURISCVState *env, int vxrm, uint64_t a,
2297                               uint64_t b)
2298{
2299    uint64_t res = a + b;
2300    if (res < a) {
2301        res = UINT64_MAX;
2302        env->vxsat = 0x1;
2303    }
2304    return res;
2305}
2306
2307RVVCALL(OPIVV2_RM, vsaddu_vv_b, OP_UUU_B, H1, H1, H1, saddu8)
2308RVVCALL(OPIVV2_RM, vsaddu_vv_h, OP_UUU_H, H2, H2, H2, saddu16)
2309RVVCALL(OPIVV2_RM, vsaddu_vv_w, OP_UUU_W, H4, H4, H4, saddu32)
2310RVVCALL(OPIVV2_RM, vsaddu_vv_d, OP_UUU_D, H8, H8, H8, saddu64)
2311GEN_VEXT_VV_RM(vsaddu_vv_b, 1)
2312GEN_VEXT_VV_RM(vsaddu_vv_h, 2)
2313GEN_VEXT_VV_RM(vsaddu_vv_w, 4)
2314GEN_VEXT_VV_RM(vsaddu_vv_d, 8)
2315
2316typedef void opivx2_rm_fn(void *vd, target_long s1, void *vs2, int i,
2317                          CPURISCVState *env, int vxrm);
2318
2319#define OPIVX2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)          \
2320static inline void                                                  \
2321do_##NAME(void *vd, target_long s1, void *vs2, int i,               \
2322          CPURISCVState *env, int vxrm)                             \
2323{                                                                   \
2324    TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2325    *((TD *)vd + HD(i)) = OP(env, vxrm, s2, (TX1)(T1)s1);           \
2326}
2327
2328static inline void
2329vext_vx_rm_1(void *vd, void *v0, target_long s1, void *vs2,
2330             CPURISCVState *env,
2331             uint32_t vl, uint32_t vm, int vxrm,
2332             opivx2_rm_fn *fn, uint32_t vma, uint32_t esz)
2333{
2334    for (uint32_t i = env->vstart; i < vl; i++) {
2335        if (!vm && !vext_elem_mask(v0, i)) {
2336            /* set masked-off elements to 1s */
2337            vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
2338            continue;
2339        }
2340        fn(vd, s1, vs2, i, env, vxrm);
2341    }
2342    env->vstart = 0;
2343}
2344
2345static inline void
2346vext_vx_rm_2(void *vd, void *v0, target_long s1, void *vs2,
2347             CPURISCVState *env,
2348             uint32_t desc,
2349             opivx2_rm_fn *fn, uint32_t esz)
2350{
2351    uint32_t vm = vext_vm(desc);
2352    uint32_t vl = env->vl;
2353    uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2354    uint32_t vta = vext_vta(desc);
2355    uint32_t vma = vext_vma(desc);
2356
2357    VSTART_CHECK_EARLY_EXIT(env, vl);
2358
2359    switch (env->vxrm) {
2360    case 0: /* rnu */
2361        vext_vx_rm_1(vd, v0, s1, vs2,
2362                     env, vl, vm, 0, fn, vma, esz);
2363        break;
2364    case 1: /* rne */
2365        vext_vx_rm_1(vd, v0, s1, vs2,
2366                     env, vl, vm, 1, fn, vma, esz);
2367        break;
2368    case 2: /* rdn */
2369        vext_vx_rm_1(vd, v0, s1, vs2,
2370                     env, vl, vm, 2, fn, vma, esz);
2371        break;
2372    default: /* rod */
2373        vext_vx_rm_1(vd, v0, s1, vs2,
2374                     env, vl, vm, 3, fn, vma, esz);
2375        break;
2376    }
2377    /* set tail elements to 1s */
2378    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2379}
2380
2381/* generate helpers for fixed-point instructions with OPIVX format */
2382#define GEN_VEXT_VX_RM(NAME, ESZ)                         \
2383void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
2384                  void *vs2, CPURISCVState *env,          \
2385                  uint32_t desc)                          \
2386{                                                         \
2387    vext_vx_rm_2(vd, v0, s1, vs2, env, desc,              \
2388                 do_##NAME, ESZ);                         \
2389}
2390
2391RVVCALL(OPIVX2_RM, vsaddu_vx_b, OP_UUU_B, H1, H1, saddu8)
2392RVVCALL(OPIVX2_RM, vsaddu_vx_h, OP_UUU_H, H2, H2, saddu16)
2393RVVCALL(OPIVX2_RM, vsaddu_vx_w, OP_UUU_W, H4, H4, saddu32)
2394RVVCALL(OPIVX2_RM, vsaddu_vx_d, OP_UUU_D, H8, H8, saddu64)
2395GEN_VEXT_VX_RM(vsaddu_vx_b, 1)
2396GEN_VEXT_VX_RM(vsaddu_vx_h, 2)
2397GEN_VEXT_VX_RM(vsaddu_vx_w, 4)
2398GEN_VEXT_VX_RM(vsaddu_vx_d, 8)
2399
2400static inline int8_t sadd8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2401{
2402    int8_t res = a + b;
2403    if ((res ^ a) & (res ^ b) & INT8_MIN) {
2404        res = a > 0 ? INT8_MAX : INT8_MIN;
2405        env->vxsat = 0x1;
2406    }
2407    return res;
2408}
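
/*
 * Signed saturating add: overflow is only possible when a and b have the
 * same sign and the (wrapped) result has the opposite sign, in which
 * case the sign bit of both (res ^ a) and (res ^ b) is set - that is the
 * "(res ^ a) & (res ^ b) & INT8_MIN" test.  E.g. a = 100, b = 50: the
 * true sum 150 wraps to -106, the test fires, and since a > 0 the result
 * is clamped to INT8_MAX.  The wider variants below are identical apart
 * from the type.
 */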
2409
2410static inline int16_t sadd16(CPURISCVState *env, int vxrm, int16_t a,
2411                             int16_t b)
2412{
2413    int16_t res = a + b;
2414    if ((res ^ a) & (res ^ b) & INT16_MIN) {
2415        res = a > 0 ? INT16_MAX : INT16_MIN;
2416        env->vxsat = 0x1;
2417    }
2418    return res;
2419}
2420
2421static inline int32_t sadd32(CPURISCVState *env, int vxrm, int32_t a,
2422                             int32_t b)
2423{
2424    int32_t res = a + b;
2425    if ((res ^ a) & (res ^ b) & INT32_MIN) {
2426        res = a > 0 ? INT32_MAX : INT32_MIN;
2427        env->vxsat = 0x1;
2428    }
2429    return res;
2430}
2431
2432static inline int64_t sadd64(CPURISCVState *env, int vxrm, int64_t a,
2433                             int64_t b)
2434{
2435    int64_t res = a + b;
2436    if ((res ^ a) & (res ^ b) & INT64_MIN) {
2437        res = a > 0 ? INT64_MAX : INT64_MIN;
2438        env->vxsat = 0x1;
2439    }
2440    return res;
2441}
2442
2443RVVCALL(OPIVV2_RM, vsadd_vv_b, OP_SSS_B, H1, H1, H1, sadd8)
2444RVVCALL(OPIVV2_RM, vsadd_vv_h, OP_SSS_H, H2, H2, H2, sadd16)
2445RVVCALL(OPIVV2_RM, vsadd_vv_w, OP_SSS_W, H4, H4, H4, sadd32)
2446RVVCALL(OPIVV2_RM, vsadd_vv_d, OP_SSS_D, H8, H8, H8, sadd64)
2447GEN_VEXT_VV_RM(vsadd_vv_b, 1)
2448GEN_VEXT_VV_RM(vsadd_vv_h, 2)
2449GEN_VEXT_VV_RM(vsadd_vv_w, 4)
2450GEN_VEXT_VV_RM(vsadd_vv_d, 8)
2451
2452RVVCALL(OPIVX2_RM, vsadd_vx_b, OP_SSS_B, H1, H1, sadd8)
2453RVVCALL(OPIVX2_RM, vsadd_vx_h, OP_SSS_H, H2, H2, sadd16)
2454RVVCALL(OPIVX2_RM, vsadd_vx_w, OP_SSS_W, H4, H4, sadd32)
2455RVVCALL(OPIVX2_RM, vsadd_vx_d, OP_SSS_D, H8, H8, sadd64)
2456GEN_VEXT_VX_RM(vsadd_vx_b, 1)
2457GEN_VEXT_VX_RM(vsadd_vx_h, 2)
2458GEN_VEXT_VX_RM(vsadd_vx_w, 4)
2459GEN_VEXT_VX_RM(vsadd_vx_d, 8)
2460
2461static inline uint8_t ssubu8(CPURISCVState *env, int vxrm, uint8_t a,
2462                             uint8_t b)
2463{
2464    uint8_t res = a - b;
2465    if (res > a) {
2466        res = 0;
2467        env->vxsat = 0x1;
2468    }
2469    return res;
2470}
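
/*
 * Unsigned saturating subtract: a borrow makes the wrapped difference
 * larger than the minuend, which "res > a" detects.  E.g. a = 10,
 * b = 20: res wraps to 246 (> 10), so the result is clamped to 0 and
 * vxsat is set.
 */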
2471
2472static inline uint16_t ssubu16(CPURISCVState *env, int vxrm, uint16_t a,
2473                               uint16_t b)
2474{
2475    uint16_t res = a - b;
2476    if (res > a) {
2477        res = 0;
2478        env->vxsat = 0x1;
2479    }
2480    return res;
2481}
2482
2483static inline uint32_t ssubu32(CPURISCVState *env, int vxrm, uint32_t a,
2484                               uint32_t b)
2485{
2486    uint32_t res = a - b;
2487    if (res > a) {
2488        res = 0;
2489        env->vxsat = 0x1;
2490    }
2491    return res;
2492}
2493
2494static inline uint64_t ssubu64(CPURISCVState *env, int vxrm, uint64_t a,
2495                               uint64_t b)
2496{
2497    uint64_t res = a - b;
2498    if (res > a) {
2499        res = 0;
2500        env->vxsat = 0x1;
2501    }
2502    return res;
2503}
2504
2505RVVCALL(OPIVV2_RM, vssubu_vv_b, OP_UUU_B, H1, H1, H1, ssubu8)
2506RVVCALL(OPIVV2_RM, vssubu_vv_h, OP_UUU_H, H2, H2, H2, ssubu16)
2507RVVCALL(OPIVV2_RM, vssubu_vv_w, OP_UUU_W, H4, H4, H4, ssubu32)
2508RVVCALL(OPIVV2_RM, vssubu_vv_d, OP_UUU_D, H8, H8, H8, ssubu64)
2509GEN_VEXT_VV_RM(vssubu_vv_b, 1)
2510GEN_VEXT_VV_RM(vssubu_vv_h, 2)
2511GEN_VEXT_VV_RM(vssubu_vv_w, 4)
2512GEN_VEXT_VV_RM(vssubu_vv_d, 8)
2513
2514RVVCALL(OPIVX2_RM, vssubu_vx_b, OP_UUU_B, H1, H1, ssubu8)
2515RVVCALL(OPIVX2_RM, vssubu_vx_h, OP_UUU_H, H2, H2, ssubu16)
2516RVVCALL(OPIVX2_RM, vssubu_vx_w, OP_UUU_W, H4, H4, ssubu32)
2517RVVCALL(OPIVX2_RM, vssubu_vx_d, OP_UUU_D, H8, H8, ssubu64)
2518GEN_VEXT_VX_RM(vssubu_vx_b, 1)
2519GEN_VEXT_VX_RM(vssubu_vx_h, 2)
2520GEN_VEXT_VX_RM(vssubu_vx_w, 4)
2521GEN_VEXT_VX_RM(vssubu_vx_d, 8)
2522
2523static inline int8_t ssub8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2524{
2525    int8_t res = a - b;
2526    if ((res ^ a) & (a ^ b) & INT8_MIN) {
2527        res = a >= 0 ? INT8_MAX : INT8_MIN;
2528        env->vxsat = 0x1;
2529    }
2530    return res;
2531}
2532
2533static inline int16_t ssub16(CPURISCVState *env, int vxrm, int16_t a,
2534                             int16_t b)
2535{
2536    int16_t res = a - b;
2537    if ((res ^ a) & (a ^ b) & INT16_MIN) {
2538        res = a >= 0 ? INT16_MAX : INT16_MIN;
2539        env->vxsat = 0x1;
2540    }
2541    return res;
2542}
2543
2544static inline int32_t ssub32(CPURISCVState *env, int vxrm, int32_t a,
2545                             int32_t b)
2546{
2547    int32_t res = a - b;
2548    if ((res ^ a) & (a ^ b) & INT32_MIN) {
2549        res = a >= 0 ? INT32_MAX : INT32_MIN;
2550        env->vxsat = 0x1;
2551    }
2552    return res;
2553}
2554
2555static inline int64_t ssub64(CPURISCVState *env, int vxrm, int64_t a,
2556                             int64_t b)
2557{
2558    int64_t res = a - b;
2559    if ((res ^ a) & (a ^ b) & INT64_MIN) {
2560        res = a >= 0 ? INT64_MAX : INT64_MIN;
2561        env->vxsat = 0x1;
2562    }
2563    return res;
2564}
2565
2566RVVCALL(OPIVV2_RM, vssub_vv_b, OP_SSS_B, H1, H1, H1, ssub8)
2567RVVCALL(OPIVV2_RM, vssub_vv_h, OP_SSS_H, H2, H2, H2, ssub16)
2568RVVCALL(OPIVV2_RM, vssub_vv_w, OP_SSS_W, H4, H4, H4, ssub32)
2569RVVCALL(OPIVV2_RM, vssub_vv_d, OP_SSS_D, H8, H8, H8, ssub64)
2570GEN_VEXT_VV_RM(vssub_vv_b, 1)
2571GEN_VEXT_VV_RM(vssub_vv_h, 2)
2572GEN_VEXT_VV_RM(vssub_vv_w, 4)
2573GEN_VEXT_VV_RM(vssub_vv_d, 8)
2574
2575RVVCALL(OPIVX2_RM, vssub_vx_b, OP_SSS_B, H1, H1, ssub8)
2576RVVCALL(OPIVX2_RM, vssub_vx_h, OP_SSS_H, H2, H2, ssub16)
2577RVVCALL(OPIVX2_RM, vssub_vx_w, OP_SSS_W, H4, H4, ssub32)
2578RVVCALL(OPIVX2_RM, vssub_vx_d, OP_SSS_D, H8, H8, ssub64)
2579GEN_VEXT_VX_RM(vssub_vx_b, 1)
2580GEN_VEXT_VX_RM(vssub_vx_h, 2)
2581GEN_VEXT_VX_RM(vssub_vx_w, 4)
2582GEN_VEXT_VX_RM(vssub_vx_d, 8)
2583
2584/* Vector Single-Width Averaging Add and Subtract */
2585static inline uint8_t get_round(int vxrm, uint64_t v, uint8_t shift)
2586{
2587    uint8_t d = extract64(v, shift, 1);
2588    uint8_t d1;
2589    uint64_t D1, D2;
2590
2591    if (shift == 0 || shift > 64) {
2592        return 0;
2593    }
2594
2595    d1 = extract64(v, shift - 1, 1);
2596    D1 = extract64(v, 0, shift);
2597    if (vxrm == 0) { /* round-to-nearest-up (add +0.5 LSB) */
2598        return d1;
2599    } else if (vxrm == 1) { /* round-to-nearest-even */
2600        if (shift > 1) {
2601            D2 = extract64(v, 0, shift - 1);
2602            return d1 & ((D2 != 0) | d);
2603        } else {
2604            return d1 & d;
2605        }
2606    } else if (vxrm == 3) { /* round-to-odd (OR bits into LSB, aka "jam") */
2607        return !d & (D1 != 0);
2608    }
2609    return 0; /* round-down (truncate) */
2610}
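
/*
 * Worked example (illustration only, nothing below is used by the code):
 * rounding v = 0b1011 (11) right by shift = 2, i.e. 11 / 4 = 2.75.
 * Here (v >> shift) = 2, d = 0, d1 = 1, D1 = 3 and D2 = 1, so the
 * increment returned above gives:
 *
 *   vxrm = 0 (rnu): round = d1                   = 1  ->  2 + 1 = 3
 *   vxrm = 1 (rne): round = d1 & ((D2 != 0) | d) = 1  ->  2 + 1 = 3
 *   vxrm = 2 (rdn): round = 0                         ->  2 + 0 = 2
 *   vxrm = 3 (rod): round = !d & (D1 != 0)       = 1  ->  2 + 1 = 3
 */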
2611
2612static inline int32_t aadd32(CPURISCVState *env, int vxrm, int32_t a,
2613                             int32_t b)
2614{
2615    int64_t res = (int64_t)a + b;
2616    uint8_t round = get_round(vxrm, res, 1);
2617
2618    return (res >> 1) + round;
2619}
2620
2621static inline int64_t aadd64(CPURISCVState *env, int vxrm, int64_t a,
2622                             int64_t b)
2623{
2624    int64_t res = a + b;
2625    uint8_t round = get_round(vxrm, res, 1);
2626    int64_t over = (res ^ a) & (res ^ b) & INT64_MIN;
2627
2628    /* With signed overflow, bit 64 is inverse of bit 63. */
2629    return ((res >> 1) ^ over) + round;
2630}
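
/*
 * Sketch of the trick above (illustration only): the exact sum needs 65
 * bits, and on signed overflow bit 64 of that sum is the complement of
 * bit 63 of the wrapped result, so (res >> 1) ^ over restores the lost
 * high bit.  For example a = b = INT64_MAX: res wraps to -2, over is
 * INT64_MIN, (res >> 1) ^ over = INT64_MAX and round = 0 in every
 * rounding mode, giving the exact average INT64_MAX with no saturation
 * needed.
 */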
2631
2632RVVCALL(OPIVV2_RM, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd32)
2633RVVCALL(OPIVV2_RM, vaadd_vv_h, OP_SSS_H, H2, H2, H2, aadd32)
2634RVVCALL(OPIVV2_RM, vaadd_vv_w, OP_SSS_W, H4, H4, H4, aadd32)
2635RVVCALL(OPIVV2_RM, vaadd_vv_d, OP_SSS_D, H8, H8, H8, aadd64)
2636GEN_VEXT_VV_RM(vaadd_vv_b, 1)
2637GEN_VEXT_VV_RM(vaadd_vv_h, 2)
2638GEN_VEXT_VV_RM(vaadd_vv_w, 4)
2639GEN_VEXT_VV_RM(vaadd_vv_d, 8)
2640
2641RVVCALL(OPIVX2_RM, vaadd_vx_b, OP_SSS_B, H1, H1, aadd32)
2642RVVCALL(OPIVX2_RM, vaadd_vx_h, OP_SSS_H, H2, H2, aadd32)
2643RVVCALL(OPIVX2_RM, vaadd_vx_w, OP_SSS_W, H4, H4, aadd32)
2644RVVCALL(OPIVX2_RM, vaadd_vx_d, OP_SSS_D, H8, H8, aadd64)
2645GEN_VEXT_VX_RM(vaadd_vx_b, 1)
2646GEN_VEXT_VX_RM(vaadd_vx_h, 2)
2647GEN_VEXT_VX_RM(vaadd_vx_w, 4)
2648GEN_VEXT_VX_RM(vaadd_vx_d, 8)
2649
2650static inline uint32_t aaddu32(CPURISCVState *env, int vxrm,
2651                               uint32_t a, uint32_t b)
2652{
2653    uint64_t res = (uint64_t)a + b;
2654    uint8_t round = get_round(vxrm, res, 1);
2655
2656    return (res >> 1) + round;
2657}
2658
2659static inline uint64_t aaddu64(CPURISCVState *env, int vxrm,
2660                               uint64_t a, uint64_t b)
2661{
2662    uint64_t res = a + b;
2663    uint8_t round = get_round(vxrm, res, 1);
2664    uint64_t over = (uint64_t)(res < a) << 63;
2665
2666    return ((res >> 1) | over) + round;
2667}
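
/*
 * Unsigned counterpart of the aadd64 trick (illustration only): the
 * carry out of the 64-bit addition is recovered as (res < a) and OR-ed
 * back in as bit 63 of the halved sum.  For example a = b = UINT64_MAX:
 * res wraps to UINT64_MAX - 1, over = 1ull << 63, and
 * ((res >> 1) | over) + 0 = UINT64_MAX, the exact average.
 */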
2668
2669RVVCALL(OPIVV2_RM, vaaddu_vv_b, OP_UUU_B, H1, H1, H1, aaddu32)
2670RVVCALL(OPIVV2_RM, vaaddu_vv_h, OP_UUU_H, H2, H2, H2, aaddu32)
2671RVVCALL(OPIVV2_RM, vaaddu_vv_w, OP_UUU_W, H4, H4, H4, aaddu32)
2672RVVCALL(OPIVV2_RM, vaaddu_vv_d, OP_UUU_D, H8, H8, H8, aaddu64)
2673GEN_VEXT_VV_RM(vaaddu_vv_b, 1)
2674GEN_VEXT_VV_RM(vaaddu_vv_h, 2)
2675GEN_VEXT_VV_RM(vaaddu_vv_w, 4)
2676GEN_VEXT_VV_RM(vaaddu_vv_d, 8)
2677
2678RVVCALL(OPIVX2_RM, vaaddu_vx_b, OP_UUU_B, H1, H1, aaddu32)
2679RVVCALL(OPIVX2_RM, vaaddu_vx_h, OP_UUU_H, H2, H2, aaddu32)
2680RVVCALL(OPIVX2_RM, vaaddu_vx_w, OP_UUU_W, H4, H4, aaddu32)
2681RVVCALL(OPIVX2_RM, vaaddu_vx_d, OP_UUU_D, H8, H8, aaddu64)
2682GEN_VEXT_VX_RM(vaaddu_vx_b, 1)
2683GEN_VEXT_VX_RM(vaaddu_vx_h, 2)
2684GEN_VEXT_VX_RM(vaaddu_vx_w, 4)
2685GEN_VEXT_VX_RM(vaaddu_vx_d, 8)
2686
2687static inline int32_t asub32(CPURISCVState *env, int vxrm, int32_t a,
2688                             int32_t b)
2689{
2690    int64_t res = (int64_t)a - b;
2691    uint8_t round = get_round(vxrm, res, 1);
2692
2693    return (res >> 1) + round;
2694}
2695
2696static inline int64_t asub64(CPURISCVState *env, int vxrm, int64_t a,
2697                             int64_t b)
2698{
2699    int64_t res = (int64_t)a - b;
2700    uint8_t round = get_round(vxrm, res, 1);
2701    int64_t over = (res ^ a) & (a ^ b) & INT64_MIN;
2702
2703    /* With signed overflow, bit 64 is inverse of bit 63. */
2704    return ((res >> 1) ^ over) + round;
2705}
2706
2707RVVCALL(OPIVV2_RM, vasub_vv_b, OP_SSS_B, H1, H1, H1, asub32)
2708RVVCALL(OPIVV2_RM, vasub_vv_h, OP_SSS_H, H2, H2, H2, asub32)
2709RVVCALL(OPIVV2_RM, vasub_vv_w, OP_SSS_W, H4, H4, H4, asub32)
2710RVVCALL(OPIVV2_RM, vasub_vv_d, OP_SSS_D, H8, H8, H8, asub64)
2711GEN_VEXT_VV_RM(vasub_vv_b, 1)
2712GEN_VEXT_VV_RM(vasub_vv_h, 2)
2713GEN_VEXT_VV_RM(vasub_vv_w, 4)
2714GEN_VEXT_VV_RM(vasub_vv_d, 8)
2715
2716RVVCALL(OPIVX2_RM, vasub_vx_b, OP_SSS_B, H1, H1, asub32)
2717RVVCALL(OPIVX2_RM, vasub_vx_h, OP_SSS_H, H2, H2, asub32)
2718RVVCALL(OPIVX2_RM, vasub_vx_w, OP_SSS_W, H4, H4, asub32)
2719RVVCALL(OPIVX2_RM, vasub_vx_d, OP_SSS_D, H8, H8, asub64)
2720GEN_VEXT_VX_RM(vasub_vx_b, 1)
2721GEN_VEXT_VX_RM(vasub_vx_h, 2)
2722GEN_VEXT_VX_RM(vasub_vx_w, 4)
2723GEN_VEXT_VX_RM(vasub_vx_d, 8)
2724
2725static inline uint32_t asubu32(CPURISCVState *env, int vxrm,
2726                               uint32_t a, uint32_t b)
2727{
2728    int64_t res = (int64_t)a - b;
2729    uint8_t round = get_round(vxrm, res, 1);
2730
2731    return (res >> 1) + round;
2732}
2733
2734static inline uint64_t asubu64(CPURISCVState *env, int vxrm,
2735                               uint64_t a, uint64_t b)
2736{
2737    uint64_t res = (uint64_t)a - b;
2738    uint8_t round = get_round(vxrm, res, 1);
2739    uint64_t over = (uint64_t)(res > a) << 63;
2740
2741    return ((res >> 1) | over) + round;
2742}
2743
2744RVVCALL(OPIVV2_RM, vasubu_vv_b, OP_UUU_B, H1, H1, H1, asubu32)
2745RVVCALL(OPIVV2_RM, vasubu_vv_h, OP_UUU_H, H2, H2, H2, asubu32)
2746RVVCALL(OPIVV2_RM, vasubu_vv_w, OP_UUU_W, H4, H4, H4, asubu32)
2747RVVCALL(OPIVV2_RM, vasubu_vv_d, OP_UUU_D, H8, H8, H8, asubu64)
2748GEN_VEXT_VV_RM(vasubu_vv_b, 1)
2749GEN_VEXT_VV_RM(vasubu_vv_h, 2)
2750GEN_VEXT_VV_RM(vasubu_vv_w, 4)
2751GEN_VEXT_VV_RM(vasubu_vv_d, 8)
2752
2753RVVCALL(OPIVX2_RM, vasubu_vx_b, OP_UUU_B, H1, H1, asubu32)
2754RVVCALL(OPIVX2_RM, vasubu_vx_h, OP_UUU_H, H2, H2, asubu32)
2755RVVCALL(OPIVX2_RM, vasubu_vx_w, OP_UUU_W, H4, H4, asubu32)
2756RVVCALL(OPIVX2_RM, vasubu_vx_d, OP_UUU_D, H8, H8, asubu64)
2757GEN_VEXT_VX_RM(vasubu_vx_b, 1)
2758GEN_VEXT_VX_RM(vasubu_vx_h, 2)
2759GEN_VEXT_VX_RM(vasubu_vx_w, 4)
2760GEN_VEXT_VX_RM(vasubu_vx_d, 8)
2761
2762/* Vector Single-Width Fractional Multiply with Rounding and Saturation */
2763static inline int8_t vsmul8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2764{
2765    uint8_t round;
2766    int16_t res;
2767
2768    res = (int16_t)a * (int16_t)b;
2769    round = get_round(vxrm, res, 7);
2770    res = (res >> 7) + round;
2771
2772    if (res > INT8_MAX) {
2773        env->vxsat = 0x1;
2774        return INT8_MAX;
2775    } else if (res < INT8_MIN) {
2776        env->vxsat = 0x1;
2777        return INT8_MIN;
2778    } else {
2779        return res;
2780    }
2781}
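
/*
 * vsmul treats its operands as signed fixed-point values with SEW - 1
 * fraction bits, so the double-width product is renormalised by an
 * arithmetic shift of SEW - 1.  Small example (illustration only, rnu
 * rounding): a = b = 0x40 (+0.5 in Q7) gives (4096 >> 7) + 0 = 0x20
 * (+0.25); a = b = 0x80 (-1.0) gives 16384 >> 7 = 128, which exceeds
 * INT8_MAX, so the result saturates to 0x7f and vxsat is set, since
 * +1.0 is not representable in the destination format.
 */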
2782
2783static int16_t vsmul16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2784{
2785    uint8_t round;
2786    int32_t res;
2787
2788    res = (int32_t)a * (int32_t)b;
2789    round = get_round(vxrm, res, 15);
2790    res = (res >> 15) + round;
2791
2792    if (res > INT16_MAX) {
2793        env->vxsat = 0x1;
2794        return INT16_MAX;
2795    } else if (res < INT16_MIN) {
2796        env->vxsat = 0x1;
2797        return INT16_MIN;
2798    } else {
2799        return res;
2800    }
2801}
2802
2803static int32_t vsmul32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2804{
2805    uint8_t round;
2806    int64_t res;
2807
2808    res = (int64_t)a * (int64_t)b;
2809    round = get_round(vxrm, res, 31);
2810    res = (res >> 31) + round;
2811
2812    if (res > INT32_MAX) {
2813        env->vxsat = 0x1;
2814        return INT32_MAX;
2815    } else if (res < INT32_MIN) {
2816        env->vxsat = 0x1;
2817        return INT32_MIN;
2818    } else {
2819        return res;
2820    }
2821}
2822
2823static int64_t vsmul64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2824{
2825    uint8_t round;
2826    uint64_t hi_64, lo_64;
2827    int64_t res;
2828
2829    if (a == INT64_MIN && b == INT64_MIN) {
2830        env->vxsat = 1;
2831        return INT64_MAX;
2832    }
2833
2834    muls64(&lo_64, &hi_64, a, b);
2835    round = get_round(vxrm, lo_64, 63);
2836    /*
2837     * Cannot overflow, as there are always
2838     * 2 sign bits after multiply.
2839     */
2840    res = (hi_64 << 1) | (lo_64 >> 63);
2841    if (round) {
2842        if (res == INT64_MAX) {
2843            env->vxsat = 1;
2844        } else {
2845            res += 1;
2846        }
2847    }
2848    return res;
2849}
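
/*
 * The 64-bit variant is the only one that needs the explicit
 * INT64_MIN * INT64_MIN guard: there is no wider integer type here, and
 * (hi_64 << 1) | (lo_64 >> 63) for that product would be 1ull << 63,
 * which wraps to a negative value instead of saturating.  The narrower
 * variants catch the same case with the ordinary range check on their
 * double-width intermediate.
 */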
2850
2851RVVCALL(OPIVV2_RM, vsmul_vv_b, OP_SSS_B, H1, H1, H1, vsmul8)
2852RVVCALL(OPIVV2_RM, vsmul_vv_h, OP_SSS_H, H2, H2, H2, vsmul16)
2853RVVCALL(OPIVV2_RM, vsmul_vv_w, OP_SSS_W, H4, H4, H4, vsmul32)
2854RVVCALL(OPIVV2_RM, vsmul_vv_d, OP_SSS_D, H8, H8, H8, vsmul64)
2855GEN_VEXT_VV_RM(vsmul_vv_b, 1)
2856GEN_VEXT_VV_RM(vsmul_vv_h, 2)
2857GEN_VEXT_VV_RM(vsmul_vv_w, 4)
2858GEN_VEXT_VV_RM(vsmul_vv_d, 8)
2859
2860RVVCALL(OPIVX2_RM, vsmul_vx_b, OP_SSS_B, H1, H1, vsmul8)
2861RVVCALL(OPIVX2_RM, vsmul_vx_h, OP_SSS_H, H2, H2, vsmul16)
2862RVVCALL(OPIVX2_RM, vsmul_vx_w, OP_SSS_W, H4, H4, vsmul32)
2863RVVCALL(OPIVX2_RM, vsmul_vx_d, OP_SSS_D, H8, H8, vsmul64)
2864GEN_VEXT_VX_RM(vsmul_vx_b, 1)
2865GEN_VEXT_VX_RM(vsmul_vx_h, 2)
2866GEN_VEXT_VX_RM(vsmul_vx_w, 4)
2867GEN_VEXT_VX_RM(vsmul_vx_d, 8)
2868
2869/* Vector Single-Width Scaling Shift Instructions */
2870static inline uint8_t
2871vssrl8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2872{
2873    uint8_t round, shift = b & 0x7;
2874    uint8_t res;
2875
2876    round = get_round(vxrm, a, shift);
2877    res = (a >> shift) + round;
2878    return res;
2879}
2880static inline uint16_t
2881vssrl16(CPURISCVState *env, int vxrm, uint16_t a, uint16_t b)
2882{
2883    uint8_t round, shift = b & 0xf;
2884
2885    round = get_round(vxrm, a, shift);
2886    return (a >> shift) + round;
2887}
2888static inline uint32_t
2889vssrl32(CPURISCVState *env, int vxrm, uint32_t a, uint32_t b)
2890{
2891    uint8_t round, shift = b & 0x1f;
2892
2893    round = get_round(vxrm, a, shift);
2894    return (a >> shift) + round;
2895}
2896static inline uint64_t
2897vssrl64(CPURISCVState *env, int vxrm, uint64_t a, uint64_t b)
2898{
2899    uint8_t round, shift = b & 0x3f;
2900
2901    round = get_round(vxrm, a, shift);
2902    return (a >> shift) + round;
2903}
2904RVVCALL(OPIVV2_RM, vssrl_vv_b, OP_UUU_B, H1, H1, H1, vssrl8)
2905RVVCALL(OPIVV2_RM, vssrl_vv_h, OP_UUU_H, H2, H2, H2, vssrl16)
2906RVVCALL(OPIVV2_RM, vssrl_vv_w, OP_UUU_W, H4, H4, H4, vssrl32)
2907RVVCALL(OPIVV2_RM, vssrl_vv_d, OP_UUU_D, H8, H8, H8, vssrl64)
2908GEN_VEXT_VV_RM(vssrl_vv_b, 1)
2909GEN_VEXT_VV_RM(vssrl_vv_h, 2)
2910GEN_VEXT_VV_RM(vssrl_vv_w, 4)
2911GEN_VEXT_VV_RM(vssrl_vv_d, 8)
2912
2913RVVCALL(OPIVX2_RM, vssrl_vx_b, OP_UUU_B, H1, H1, vssrl8)
2914RVVCALL(OPIVX2_RM, vssrl_vx_h, OP_UUU_H, H2, H2, vssrl16)
2915RVVCALL(OPIVX2_RM, vssrl_vx_w, OP_UUU_W, H4, H4, vssrl32)
2916RVVCALL(OPIVX2_RM, vssrl_vx_d, OP_UUU_D, H8, H8, vssrl64)
2917GEN_VEXT_VX_RM(vssrl_vx_b, 1)
2918GEN_VEXT_VX_RM(vssrl_vx_h, 2)
2919GEN_VEXT_VX_RM(vssrl_vx_w, 4)
2920GEN_VEXT_VX_RM(vssrl_vx_d, 8)
2921
2922static inline int8_t
2923vssra8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2924{
2925    uint8_t round, shift = b & 0x7;
2926
2927    round = get_round(vxrm, a, shift);
2928    return (a >> shift) + round;
2929}
2930static inline int16_t
2931vssra16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2932{
2933    uint8_t round, shift = b & 0xf;
2934
2935    round = get_round(vxrm, a, shift);
2936    return (a >> shift) + round;
2937}
2938static inline int32_t
2939vssra32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2940{
2941    uint8_t round, shift = b & 0x1f;
2942
2943    round = get_round(vxrm, a, shift);
2944    return (a >> shift) + round;
2945}
2946static inline int64_t
2947vssra64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2948{
2949    uint8_t round, shift = b & 0x3f;
2950
2951    round = get_round(vxrm, a, shift);
2952    return (a >> shift) + round;
2953}
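
/*
 * Example of the scaling shifts (illustration only): vssra8 with
 * a = -7 (0xf9) and shift = 1 computes (a >> 1) = -4 plus the rounding
 * increment, so -7 / 2 = -3.5 becomes:
 *
 *   rnu: -4 + 1 = -3        rne: -4 + 0 = -4 (ties to even)
 *   rdn: -4 + 0 = -4        rod: -4 + 1 = -3 (LSB forced odd)
 *
 * The vssrl helpers above do the same on the unsigned interpretation.
 */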
2954
2955RVVCALL(OPIVV2_RM, vssra_vv_b, OP_SSS_B, H1, H1, H1, vssra8)
2956RVVCALL(OPIVV2_RM, vssra_vv_h, OP_SSS_H, H2, H2, H2, vssra16)
2957RVVCALL(OPIVV2_RM, vssra_vv_w, OP_SSS_W, H4, H4, H4, vssra32)
2958RVVCALL(OPIVV2_RM, vssra_vv_d, OP_SSS_D, H8, H8, H8, vssra64)
2959GEN_VEXT_VV_RM(vssra_vv_b, 1)
2960GEN_VEXT_VV_RM(vssra_vv_h, 2)
2961GEN_VEXT_VV_RM(vssra_vv_w, 4)
2962GEN_VEXT_VV_RM(vssra_vv_d, 8)
2963
2964RVVCALL(OPIVX2_RM, vssra_vx_b, OP_SSS_B, H1, H1, vssra8)
2965RVVCALL(OPIVX2_RM, vssra_vx_h, OP_SSS_H, H2, H2, vssra16)
2966RVVCALL(OPIVX2_RM, vssra_vx_w, OP_SSS_W, H4, H4, vssra32)
2967RVVCALL(OPIVX2_RM, vssra_vx_d, OP_SSS_D, H8, H8, vssra64)
2968GEN_VEXT_VX_RM(vssra_vx_b, 1)
2969GEN_VEXT_VX_RM(vssra_vx_h, 2)
2970GEN_VEXT_VX_RM(vssra_vx_w, 4)
2971GEN_VEXT_VX_RM(vssra_vx_d, 8)
2972
2973/* Vector Narrowing Fixed-Point Clip Instructions */
2974static inline int8_t
2975vnclip8(CPURISCVState *env, int vxrm, int16_t a, int8_t b)
2976{
2977    uint8_t round, shift = b & 0xf;
2978    int16_t res;
2979
2980    round = get_round(vxrm, a, shift);
2981    res = (a >> shift) + round;
2982    if (res > INT8_MAX) {
2983        env->vxsat = 0x1;
2984        return INT8_MAX;
2985    } else if (res < INT8_MIN) {
2986        env->vxsat = 0x1;
2987        return INT8_MIN;
2988    } else {
2989        return res;
2990    }
2991}
2992
2993static inline int16_t
2994vnclip16(CPURISCVState *env, int vxrm, int32_t a, int16_t b)
2995{
2996    uint8_t round, shift = b & 0x1f;
2997    int32_t res;
2998
2999    round = get_round(vxrm, a, shift);
3000    res = (a >> shift) + round;
3001    if (res > INT16_MAX) {
3002        env->vxsat = 0x1;
3003        return INT16_MAX;
3004    } else if (res < INT16_MIN) {
3005        env->vxsat = 0x1;
3006        return INT16_MIN;
3007    } else {
3008        return res;
3009    }
3010}
3011
3012static inline int32_t
3013vnclip32(CPURISCVState *env, int vxrm, int64_t a, int32_t b)
3014{
3015    uint8_t round, shift = b & 0x3f;
3016    int64_t res;
3017
3018    round = get_round(vxrm, a, shift);
3019    res = (a >> shift) + round;
3020    if (res > INT32_MAX) {
3021        env->vxsat = 0x1;
3022        return INT32_MAX;
3023    } else if (res < INT32_MIN) {
3024        env->vxsat = 0x1;
3025        return INT32_MIN;
3026    } else {
3027        return res;
3028    }
3029}
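
/*
 * Example (illustration only): vnclip8 with a = 0x1234 and shift = 4
 * under rnu rounding gives (0x1234 >> 4) + 0 = 0x123 = 291, which lies
 * outside [INT8_MIN, INT8_MAX], so the result is clamped to 0x7f and
 * vxsat is set.  The unsigned vnclipu helpers below clamp to
 * [0, UINT8_MAX] (and the corresponding wider ranges) instead.
 */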
3030
3031RVVCALL(OPIVV2_RM, vnclip_wv_b, NOP_SSS_B, H1, H2, H1, vnclip8)
3032RVVCALL(OPIVV2_RM, vnclip_wv_h, NOP_SSS_H, H2, H4, H2, vnclip16)
3033RVVCALL(OPIVV2_RM, vnclip_wv_w, NOP_SSS_W, H4, H8, H4, vnclip32)
3034GEN_VEXT_VV_RM(vnclip_wv_b, 1)
3035GEN_VEXT_VV_RM(vnclip_wv_h, 2)
3036GEN_VEXT_VV_RM(vnclip_wv_w, 4)
3037
3038RVVCALL(OPIVX2_RM, vnclip_wx_b, NOP_SSS_B, H1, H2, vnclip8)
3039RVVCALL(OPIVX2_RM, vnclip_wx_h, NOP_SSS_H, H2, H4, vnclip16)
3040RVVCALL(OPIVX2_RM, vnclip_wx_w, NOP_SSS_W, H4, H8, vnclip32)
3041GEN_VEXT_VX_RM(vnclip_wx_b, 1)
3042GEN_VEXT_VX_RM(vnclip_wx_h, 2)
3043GEN_VEXT_VX_RM(vnclip_wx_w, 4)
3044
3045static inline uint8_t
3046vnclipu8(CPURISCVState *env, int vxrm, uint16_t a, uint8_t b)
3047{
3048    uint8_t round, shift = b & 0xf;
3049    uint16_t res;
3050
3051    round = get_round(vxrm, a, shift);
3052    res = (a >> shift) + round;
3053    if (res > UINT8_MAX) {
3054        env->vxsat = 0x1;
3055        return UINT8_MAX;
3056    } else {
3057        return res;
3058    }
3059}
3060
3061static inline uint16_t
3062vnclipu16(CPURISCVState *env, int vxrm, uint32_t a, uint16_t b)
3063{
3064    uint8_t round, shift = b & 0x1f;
3065    uint32_t res;
3066
3067    round = get_round(vxrm, a, shift);
3068    res = (a >> shift) + round;
3069    if (res > UINT16_MAX) {
3070        env->vxsat = 0x1;
3071        return UINT16_MAX;
3072    } else {
3073        return res;
3074    }
3075}
3076
3077static inline uint32_t
3078vnclipu32(CPURISCVState *env, int vxrm, uint64_t a, uint32_t b)
3079{
3080    uint8_t round, shift = b & 0x3f;
3081    uint64_t res;
3082
3083    round = get_round(vxrm, a, shift);
3084    res = (a >> shift) + round;
3085    if (res > UINT32_MAX) {
3086        env->vxsat = 0x1;
3087        return UINT32_MAX;
3088    } else {
3089        return res;
3090    }
3091}
3092
3093RVVCALL(OPIVV2_RM, vnclipu_wv_b, NOP_UUU_B, H1, H2, H1, vnclipu8)
3094RVVCALL(OPIVV2_RM, vnclipu_wv_h, NOP_UUU_H, H2, H4, H2, vnclipu16)
3095RVVCALL(OPIVV2_RM, vnclipu_wv_w, NOP_UUU_W, H4, H8, H4, vnclipu32)
3096GEN_VEXT_VV_RM(vnclipu_wv_b, 1)
3097GEN_VEXT_VV_RM(vnclipu_wv_h, 2)
3098GEN_VEXT_VV_RM(vnclipu_wv_w, 4)
3099
3100RVVCALL(OPIVX2_RM, vnclipu_wx_b, NOP_UUU_B, H1, H2, vnclipu8)
3101RVVCALL(OPIVX2_RM, vnclipu_wx_h, NOP_UUU_H, H2, H4, vnclipu16)
3102RVVCALL(OPIVX2_RM, vnclipu_wx_w, NOP_UUU_W, H4, H8, vnclipu32)
3103GEN_VEXT_VX_RM(vnclipu_wx_b, 1)
3104GEN_VEXT_VX_RM(vnclipu_wx_h, 2)
3105GEN_VEXT_VX_RM(vnclipu_wx_w, 4)
3106
3107/*
3108 * Vector Floating-Point Arithmetic Instructions
3109 */
3110/* Vector Single-Width Floating-Point Add/Subtract Instructions */
3111#define OPFVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)   \
3112static void do_##NAME(void *vd, void *vs1, void *vs2, int i,   \
3113                      CPURISCVState *env)                      \
3114{                                                              \
3115    TX1 s1 = *((T1 *)vs1 + HS1(i));                            \
3116    TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
3117    *((TD *)vd + HD(i)) = OP(s2, s1, &env->fp_status);         \
3118}
3119
3120#define GEN_VEXT_VV_ENV(NAME, ESZ)                        \
3121void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
3122                  void *vs2, CPURISCVState *env,          \
3123                  uint32_t desc)                          \
3124{                                                         \
3125    uint32_t vm = vext_vm(desc);                          \
3126    uint32_t vl = env->vl;                                \
3127    uint32_t total_elems =                                \
3128        vext_get_total_elems(env, desc, ESZ);             \
3129    uint32_t vta = vext_vta(desc);                        \
3130    uint32_t vma = vext_vma(desc);                        \
3131    uint32_t i;                                           \
3132                                                          \
3133    VSTART_CHECK_EARLY_EXIT(env, vl);                     \
3134                                                          \
3135    for (i = env->vstart; i < vl; i++) {                  \
3136        if (!vm && !vext_elem_mask(v0, i)) {              \
3137            /* set masked-off elements to 1s */           \
3138            vext_set_elems_1s(vd, vma, i * ESZ,           \
3139                              (i + 1) * ESZ);             \
3140            continue;                                     \
3141        }                                                 \
3142        do_##NAME(vd, vs1, vs2, i, env);                  \
3143    }                                                     \
3144    env->vstart = 0;                                      \
3145    /* set tail elements to 1s */                         \
3146    vext_set_elems_1s(vd, vta, vl * ESZ,                  \
3147                      total_elems * ESZ);                 \
3148}
3149
3150RVVCALL(OPFVV2, vfadd_vv_h, OP_UUU_H, H2, H2, H2, float16_add)
3151RVVCALL(OPFVV2, vfadd_vv_w, OP_UUU_W, H4, H4, H4, float32_add)
3152RVVCALL(OPFVV2, vfadd_vv_d, OP_UUU_D, H8, H8, H8, float64_add)
3153GEN_VEXT_VV_ENV(vfadd_vv_h, 2)
3154GEN_VEXT_VV_ENV(vfadd_vv_w, 4)
3155GEN_VEXT_VV_ENV(vfadd_vv_d, 8)
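
/*
 * For reference, the two macros above roughly expand as follows for
 * vfadd_vv_h (a sketch, assuming OP_UUU_H supplies uint16_t for every
 * element type, as defined earlier in this file):
 *
 *   static void do_vfadd_vv_h(void *vd, void *vs1, void *vs2, int i,
 *                             CPURISCVState *env)
 *   {
 *       uint16_t s1 = *((uint16_t *)vs1 + H2(i));
 *       uint16_t s2 = *((uint16_t *)vs2 + H2(i));
 *       *((uint16_t *)vd + H2(i)) = float16_add(s2, s1, &env->fp_status);
 *   }
 *
 * while GEN_VEXT_VV_ENV(vfadd_vv_h, 2) emits HELPER(vfadd_vv_h), the
 * masked/tail-handling loop that calls do_vfadd_vv_h() per element.
 */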
3156
3157#define OPFVF2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)        \
3158static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i, \
3159                      CPURISCVState *env)                      \
3160{                                                              \
3161    TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
3162    *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, &env->fp_status);\
3163}
3164
3165#define GEN_VEXT_VF(NAME, ESZ)                            \
3166void HELPER(NAME)(void *vd, void *v0, uint64_t s1,        \
3167                  void *vs2, CPURISCVState *env,          \
3168                  uint32_t desc)                          \
3169{                                                         \
3170    uint32_t vm = vext_vm(desc);                          \
3171    uint32_t vl = env->vl;                                \
3172    uint32_t total_elems =                                \
3173        vext_get_total_elems(env, desc, ESZ);             \
3174    uint32_t vta = vext_vta(desc);                        \
3175    uint32_t vma = vext_vma(desc);                        \
3176    uint32_t i;                                           \
3177                                                          \
3178    VSTART_CHECK_EARLY_EXIT(env, vl);                     \
3179                                                          \
3180    for (i = env->vstart; i < vl; i++) {                  \
3181        if (!vm && !vext_elem_mask(v0, i)) {              \
3182            /* set masked-off elements to 1s */           \
3183            vext_set_elems_1s(vd, vma, i * ESZ,           \
3184                              (i + 1) * ESZ);             \
3185            continue;                                     \
3186        }                                                 \
3187        do_##NAME(vd, s1, vs2, i, env);                   \
3188    }                                                     \
3189    env->vstart = 0;                                      \
3190    /* set tail elements to 1s */                         \
3191    vext_set_elems_1s(vd, vta, vl * ESZ,                  \
3192                      total_elems * ESZ);                 \
3193}
3194
3195RVVCALL(OPFVF2, vfadd_vf_h, OP_UUU_H, H2, H2, float16_add)
3196RVVCALL(OPFVF2, vfadd_vf_w, OP_UUU_W, H4, H4, float32_add)
3197RVVCALL(OPFVF2, vfadd_vf_d, OP_UUU_D, H8, H8, float64_add)
3198GEN_VEXT_VF(vfadd_vf_h, 2)
3199GEN_VEXT_VF(vfadd_vf_w, 4)
3200GEN_VEXT_VF(vfadd_vf_d, 8)
3201
3202RVVCALL(OPFVV2, vfsub_vv_h, OP_UUU_H, H2, H2, H2, float16_sub)
3203RVVCALL(OPFVV2, vfsub_vv_w, OP_UUU_W, H4, H4, H4, float32_sub)
3204RVVCALL(OPFVV2, vfsub_vv_d, OP_UUU_D, H8, H8, H8, float64_sub)
3205GEN_VEXT_VV_ENV(vfsub_vv_h, 2)
3206GEN_VEXT_VV_ENV(vfsub_vv_w, 4)
3207GEN_VEXT_VV_ENV(vfsub_vv_d, 8)
3208RVVCALL(OPFVF2, vfsub_vf_h, OP_UUU_H, H2, H2, float16_sub)
3209RVVCALL(OPFVF2, vfsub_vf_w, OP_UUU_W, H4, H4, float32_sub)
3210RVVCALL(OPFVF2, vfsub_vf_d, OP_UUU_D, H8, H8, float64_sub)
3211GEN_VEXT_VF(vfsub_vf_h, 2)
3212GEN_VEXT_VF(vfsub_vf_w, 4)
3213GEN_VEXT_VF(vfsub_vf_d, 8)
3214
3215static uint16_t float16_rsub(uint16_t a, uint16_t b, float_status *s)
3216{
3217    return float16_sub(b, a, s);
3218}
3219
3220static uint32_t float32_rsub(uint32_t a, uint32_t b, float_status *s)
3221{
3222    return float32_sub(b, a, s);
3223}
3224
3225static uint64_t float64_rsub(uint64_t a, uint64_t b, float_status *s)
3226{
3227    return float64_sub(b, a, s);
3228}
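
/*
 * The *_rsub wrappers exist because OPFVF2 always applies OP(s2, s1),
 * i.e. vector-element OP scalar; swapping the operands here yields the
 * scalar-minus-vector form required by vfrsub.vf.  The *_rdiv wrappers
 * further down play the same role for vfrdiv.vf.
 */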
3229
3230RVVCALL(OPFVF2, vfrsub_vf_h, OP_UUU_H, H2, H2, float16_rsub)
3231RVVCALL(OPFVF2, vfrsub_vf_w, OP_UUU_W, H4, H4, float32_rsub)
3232RVVCALL(OPFVF2, vfrsub_vf_d, OP_UUU_D, H8, H8, float64_rsub)
3233GEN_VEXT_VF(vfrsub_vf_h, 2)
3234GEN_VEXT_VF(vfrsub_vf_w, 4)
3235GEN_VEXT_VF(vfrsub_vf_d, 8)
3236
3237/* Vector Widening Floating-Point Add/Subtract Instructions */
3238static uint32_t vfwadd16(uint16_t a, uint16_t b, float_status *s)
3239{
3240    return float32_add(float16_to_float32(a, true, s),
3241                       float16_to_float32(b, true, s), s);
3242}
3243
3244static uint64_t vfwadd32(uint32_t a, uint32_t b, float_status *s)
3245{
3246    return float64_add(float32_to_float64(a, s),
3247                       float32_to_float64(b, s), s);
3249}
3250
3251RVVCALL(OPFVV2, vfwadd_vv_h, WOP_UUU_H, H4, H2, H2, vfwadd16)
3252RVVCALL(OPFVV2, vfwadd_vv_w, WOP_UUU_W, H8, H4, H4, vfwadd32)
3253GEN_VEXT_VV_ENV(vfwadd_vv_h, 4)
3254GEN_VEXT_VV_ENV(vfwadd_vv_w, 8)
3255RVVCALL(OPFVF2, vfwadd_vf_h, WOP_UUU_H, H4, H2, vfwadd16)
3256RVVCALL(OPFVF2, vfwadd_vf_w, WOP_UUU_W, H8, H4, vfwadd32)
3257GEN_VEXT_VF(vfwadd_vf_h, 4)
3258GEN_VEXT_VF(vfwadd_vf_w, 8)
3259
3260static uint32_t vfwsub16(uint16_t a, uint16_t b, float_status *s)
3261{
3262    return float32_sub(float16_to_float32(a, true, s),
3263                       float16_to_float32(b, true, s), s);
3264}
3265
3266static uint64_t vfwsub32(uint32_t a, uint32_t b, float_status *s)
3267{
3268    return float64_sub(float32_to_float64(a, s),
3269                       float32_to_float64(b, s), s);
3271}
3272
3273RVVCALL(OPFVV2, vfwsub_vv_h, WOP_UUU_H, H4, H2, H2, vfwsub16)
3274RVVCALL(OPFVV2, vfwsub_vv_w, WOP_UUU_W, H8, H4, H4, vfwsub32)
3275GEN_VEXT_VV_ENV(vfwsub_vv_h, 4)
3276GEN_VEXT_VV_ENV(vfwsub_vv_w, 8)
3277RVVCALL(OPFVF2, vfwsub_vf_h, WOP_UUU_H, H4, H2, vfwsub16)
3278RVVCALL(OPFVF2, vfwsub_vf_w, WOP_UUU_W, H8, H4, vfwsub32)
3279GEN_VEXT_VF(vfwsub_vf_h, 4)
3280GEN_VEXT_VF(vfwsub_vf_w, 8)
3281
3282static uint32_t vfwaddw16(uint32_t a, uint16_t b, float_status *s)
3283{
3284    return float32_add(a, float16_to_float32(b, true, s), s);
3285}
3286
3287static uint64_t vfwaddw32(uint64_t a, uint32_t b, float_status *s)
3288{
3289    return float64_add(a, float32_to_float64(b, s), s);
3290}
3291
3292RVVCALL(OPFVV2, vfwadd_wv_h, WOP_WUUU_H, H4, H2, H2, vfwaddw16)
3293RVVCALL(OPFVV2, vfwadd_wv_w, WOP_WUUU_W, H8, H4, H4, vfwaddw32)
3294GEN_VEXT_VV_ENV(vfwadd_wv_h, 4)
3295GEN_VEXT_VV_ENV(vfwadd_wv_w, 8)
3296RVVCALL(OPFVF2, vfwadd_wf_h, WOP_WUUU_H, H4, H2, vfwaddw16)
3297RVVCALL(OPFVF2, vfwadd_wf_w, WOP_WUUU_W, H8, H4, vfwaddw32)
3298GEN_VEXT_VF(vfwadd_wf_h, 4)
3299GEN_VEXT_VF(vfwadd_wf_w, 8)
3300
3301static uint32_t vfwsubw16(uint32_t a, uint16_t b, float_status *s)
3302{
3303    return float32_sub(a, float16_to_float32(b, true, s), s);
3304}
3305
3306static uint64_t vfwsubw32(uint64_t a, uint32_t b, float_status *s)
3307{
3308    return float64_sub(a, float32_to_float64(b, s), s);
3309}
3310
3311RVVCALL(OPFVV2, vfwsub_wv_h, WOP_WUUU_H, H4, H2, H2, vfwsubw16)
3312RVVCALL(OPFVV2, vfwsub_wv_w, WOP_WUUU_W, H8, H4, H4, vfwsubw32)
3313GEN_VEXT_VV_ENV(vfwsub_wv_h, 4)
3314GEN_VEXT_VV_ENV(vfwsub_wv_w, 8)
3315RVVCALL(OPFVF2, vfwsub_wf_h, WOP_WUUU_H, H4, H2, vfwsubw16)
3316RVVCALL(OPFVF2, vfwsub_wf_w, WOP_WUUU_W, H8, H4, vfwsubw32)
3317GEN_VEXT_VF(vfwsub_wf_h, 4)
3318GEN_VEXT_VF(vfwsub_wf_w, 8)
3319
3320/* Vector Single-Width Floating-Point Multiply/Divide Instructions */
3321RVVCALL(OPFVV2, vfmul_vv_h, OP_UUU_H, H2, H2, H2, float16_mul)
3322RVVCALL(OPFVV2, vfmul_vv_w, OP_UUU_W, H4, H4, H4, float32_mul)
3323RVVCALL(OPFVV2, vfmul_vv_d, OP_UUU_D, H8, H8, H8, float64_mul)
3324GEN_VEXT_VV_ENV(vfmul_vv_h, 2)
3325GEN_VEXT_VV_ENV(vfmul_vv_w, 4)
3326GEN_VEXT_VV_ENV(vfmul_vv_d, 8)
3327RVVCALL(OPFVF2, vfmul_vf_h, OP_UUU_H, H2, H2, float16_mul)
3328RVVCALL(OPFVF2, vfmul_vf_w, OP_UUU_W, H4, H4, float32_mul)
3329RVVCALL(OPFVF2, vfmul_vf_d, OP_UUU_D, H8, H8, float64_mul)
3330GEN_VEXT_VF(vfmul_vf_h, 2)
3331GEN_VEXT_VF(vfmul_vf_w, 4)
3332GEN_VEXT_VF(vfmul_vf_d, 8)
3333
3334RVVCALL(OPFVV2, vfdiv_vv_h, OP_UUU_H, H2, H2, H2, float16_div)
3335RVVCALL(OPFVV2, vfdiv_vv_w, OP_UUU_W, H4, H4, H4, float32_div)
3336RVVCALL(OPFVV2, vfdiv_vv_d, OP_UUU_D, H8, H8, H8, float64_div)
3337GEN_VEXT_VV_ENV(vfdiv_vv_h, 2)
3338GEN_VEXT_VV_ENV(vfdiv_vv_w, 4)
3339GEN_VEXT_VV_ENV(vfdiv_vv_d, 8)
3340RVVCALL(OPFVF2, vfdiv_vf_h, OP_UUU_H, H2, H2, float16_div)
3341RVVCALL(OPFVF2, vfdiv_vf_w, OP_UUU_W, H4, H4, float32_div)
3342RVVCALL(OPFVF2, vfdiv_vf_d, OP_UUU_D, H8, H8, float64_div)
3343GEN_VEXT_VF(vfdiv_vf_h, 2)
3344GEN_VEXT_VF(vfdiv_vf_w, 4)
3345GEN_VEXT_VF(vfdiv_vf_d, 8)
3346
3347static uint16_t float16_rdiv(uint16_t a, uint16_t b, float_status *s)
3348{
3349    return float16_div(b, a, s);
3350}
3351
3352static uint32_t float32_rdiv(uint32_t a, uint32_t b, float_status *s)
3353{
3354    return float32_div(b, a, s);
3355}
3356
3357static uint64_t float64_rdiv(uint64_t a, uint64_t b, float_status *s)
3358{
3359    return float64_div(b, a, s);
3360}
3361
3362RVVCALL(OPFVF2, vfrdiv_vf_h, OP_UUU_H, H2, H2, float16_rdiv)
3363RVVCALL(OPFVF2, vfrdiv_vf_w, OP_UUU_W, H4, H4, float32_rdiv)
3364RVVCALL(OPFVF2, vfrdiv_vf_d, OP_UUU_D, H8, H8, float64_rdiv)
3365GEN_VEXT_VF(vfrdiv_vf_h, 2)
3366GEN_VEXT_VF(vfrdiv_vf_w, 4)
3367GEN_VEXT_VF(vfrdiv_vf_d, 8)
3368
3369/* Vector Widening Floating-Point Multiply */
3370static uint32_t vfwmul16(uint16_t a, uint16_t b, float_status *s)
3371{
3372    return float32_mul(float16_to_float32(a, true, s),
3373                       float16_to_float32(b, true, s), s);
3374}
3375
3376static uint64_t vfwmul32(uint32_t a, uint32_t b, float_status *s)
3377{
3378    return float64_mul(float32_to_float64(a, s),
3379                       float32_to_float64(b, s), s);
3381}
3382RVVCALL(OPFVV2, vfwmul_vv_h, WOP_UUU_H, H4, H2, H2, vfwmul16)
3383RVVCALL(OPFVV2, vfwmul_vv_w, WOP_UUU_W, H8, H4, H4, vfwmul32)
3384GEN_VEXT_VV_ENV(vfwmul_vv_h, 4)
3385GEN_VEXT_VV_ENV(vfwmul_vv_w, 8)
3386RVVCALL(OPFVF2, vfwmul_vf_h, WOP_UUU_H, H4, H2, vfwmul16)
3387RVVCALL(OPFVF2, vfwmul_vf_w, WOP_UUU_W, H8, H4, vfwmul32)
3388GEN_VEXT_VF(vfwmul_vf_h, 4)
3389GEN_VEXT_VF(vfwmul_vf_w, 8)
3390
3391/* Vector Single-Width Floating-Point Fused Multiply-Add Instructions */
3392#define OPFVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
3393static void do_##NAME(void *vd, void *vs1, void *vs2, int i,       \
3394                      CPURISCVState *env)                          \
3395{                                                                  \
3396    TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
3397    TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
3398    TD d = *((TD *)vd + HD(i));                                    \
3399    *((TD *)vd + HD(i)) = OP(s2, s1, d, &env->fp_status);          \
3400}
3401
3402static uint16_t fmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3403{
3404    return float16_muladd(a, b, d, 0, s);
3405}
3406
3407static uint32_t fmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3408{
3409    return float32_muladd(a, b, d, 0, s);
3410}
3411
3412static uint64_t fmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3413{
3414    return float64_muladd(a, b, d, 0, s);
3415}
3416
3417RVVCALL(OPFVV3, vfmacc_vv_h, OP_UUU_H, H2, H2, H2, fmacc16)
3418RVVCALL(OPFVV3, vfmacc_vv_w, OP_UUU_W, H4, H4, H4, fmacc32)
3419RVVCALL(OPFVV3, vfmacc_vv_d, OP_UUU_D, H8, H8, H8, fmacc64)
3420GEN_VEXT_VV_ENV(vfmacc_vv_h, 2)
3421GEN_VEXT_VV_ENV(vfmacc_vv_w, 4)
3422GEN_VEXT_VV_ENV(vfmacc_vv_d, 8)
3423
3424#define OPFVF3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)           \
3425static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i,    \
3426                      CPURISCVState *env)                         \
3427{                                                                 \
3428    TX2 s2 = *((T2 *)vs2 + HS2(i));                               \
3429    TD d = *((TD *)vd + HD(i));                                   \
3430    *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d, &env->fp_status);\
3431}
3432
3433RVVCALL(OPFVF3, vfmacc_vf_h, OP_UUU_H, H2, H2, fmacc16)
3434RVVCALL(OPFVF3, vfmacc_vf_w, OP_UUU_W, H4, H4, fmacc32)
3435RVVCALL(OPFVF3, vfmacc_vf_d, OP_UUU_D, H8, H8, fmacc64)
3436GEN_VEXT_VF(vfmacc_vf_h, 2)
3437GEN_VEXT_VF(vfmacc_vf_w, 4)
3438GEN_VEXT_VF(vfmacc_vf_d, 8)
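
/*
 * Operand order note: OPFVV3/OPFVF3 invoke OP(s2, s1, d) and the fmacc*
 * helpers compute a * b + d, so vfmacc produces
 * vd[i] = vs1[i] * vs2[i] + vd[i] (f[rs1] * vs2[i] + vd[i] for the .vf
 * forms).  The fnmacc/fmsac/fnmsac variants below differ only in the
 * float_muladd negate flags, while the fmadd/fmsub family swaps addend
 * and multiplicand so that the destination register is the value being
 * multiplied.
 */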
3439
3440static uint16_t fnmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3441{
3442    return float16_muladd(a, b, d, float_muladd_negate_c |
3443                                   float_muladd_negate_product, s);
3444}
3445
3446static uint32_t fnmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3447{
3448    return float32_muladd(a, b, d, float_muladd_negate_c |
3449                                   float_muladd_negate_product, s);
3450}
3451
3452static uint64_t fnmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3453{
3454    return float64_muladd(a, b, d, float_muladd_negate_c |
3455                                   float_muladd_negate_product, s);
3456}
3457
3458RVVCALL(OPFVV3, vfnmacc_vv_h, OP_UUU_H, H2, H2, H2, fnmacc16)
3459RVVCALL(OPFVV3, vfnmacc_vv_w, OP_UUU_W, H4, H4, H4, fnmacc32)
3460RVVCALL(OPFVV3, vfnmacc_vv_d, OP_UUU_D, H8, H8, H8, fnmacc64)
3461GEN_VEXT_VV_ENV(vfnmacc_vv_h, 2)
3462GEN_VEXT_VV_ENV(vfnmacc_vv_w, 4)
3463GEN_VEXT_VV_ENV(vfnmacc_vv_d, 8)
3464RVVCALL(OPFVF3, vfnmacc_vf_h, OP_UUU_H, H2, H2, fnmacc16)
3465RVVCALL(OPFVF3, vfnmacc_vf_w, OP_UUU_W, H4, H4, fnmacc32)
3466RVVCALL(OPFVF3, vfnmacc_vf_d, OP_UUU_D, H8, H8, fnmacc64)
3467GEN_VEXT_VF(vfnmacc_vf_h, 2)
3468GEN_VEXT_VF(vfnmacc_vf_w, 4)
3469GEN_VEXT_VF(vfnmacc_vf_d, 8)
3470
3471static uint16_t fmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3472{
3473    return float16_muladd(a, b, d, float_muladd_negate_c, s);
3474}
3475
3476static uint32_t fmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3477{
3478    return float32_muladd(a, b, d, float_muladd_negate_c, s);
3479}
3480
3481static uint64_t fmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3482{
3483    return float64_muladd(a, b, d, float_muladd_negate_c, s);
3484}
3485
3486RVVCALL(OPFVV3, vfmsac_vv_h, OP_UUU_H, H2, H2, H2, fmsac16)
3487RVVCALL(OPFVV3, vfmsac_vv_w, OP_UUU_W, H4, H4, H4, fmsac32)
3488RVVCALL(OPFVV3, vfmsac_vv_d, OP_UUU_D, H8, H8, H8, fmsac64)
3489GEN_VEXT_VV_ENV(vfmsac_vv_h, 2)
3490GEN_VEXT_VV_ENV(vfmsac_vv_w, 4)
3491GEN_VEXT_VV_ENV(vfmsac_vv_d, 8)
3492RVVCALL(OPFVF3, vfmsac_vf_h, OP_UUU_H, H2, H2, fmsac16)
3493RVVCALL(OPFVF3, vfmsac_vf_w, OP_UUU_W, H4, H4, fmsac32)
3494RVVCALL(OPFVF3, vfmsac_vf_d, OP_UUU_D, H8, H8, fmsac64)
3495GEN_VEXT_VF(vfmsac_vf_h, 2)
3496GEN_VEXT_VF(vfmsac_vf_w, 4)
3497GEN_VEXT_VF(vfmsac_vf_d, 8)
3498
3499static uint16_t fnmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3500{
3501    return float16_muladd(a, b, d, float_muladd_negate_product, s);
3502}
3503
3504static uint32_t fnmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3505{
3506    return float32_muladd(a, b, d, float_muladd_negate_product, s);
3507}
3508
3509static uint64_t fnmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3510{
3511    return float64_muladd(a, b, d, float_muladd_negate_product, s);
3512}
3513
3514RVVCALL(OPFVV3, vfnmsac_vv_h, OP_UUU_H, H2, H2, H2, fnmsac16)
3515RVVCALL(OPFVV3, vfnmsac_vv_w, OP_UUU_W, H4, H4, H4, fnmsac32)
3516RVVCALL(OPFVV3, vfnmsac_vv_d, OP_UUU_D, H8, H8, H8, fnmsac64)
3517GEN_VEXT_VV_ENV(vfnmsac_vv_h, 2)
3518GEN_VEXT_VV_ENV(vfnmsac_vv_w, 4)
3519GEN_VEXT_VV_ENV(vfnmsac_vv_d, 8)
3520RVVCALL(OPFVF3, vfnmsac_vf_h, OP_UUU_H, H2, H2, fnmsac16)
3521RVVCALL(OPFVF3, vfnmsac_vf_w, OP_UUU_W, H4, H4, fnmsac32)
3522RVVCALL(OPFVF3, vfnmsac_vf_d, OP_UUU_D, H8, H8, fnmsac64)
3523GEN_VEXT_VF(vfnmsac_vf_h, 2)
3524GEN_VEXT_VF(vfnmsac_vf_w, 4)
3525GEN_VEXT_VF(vfnmsac_vf_d, 8)
3526
3527static uint16_t fmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3528{
3529    return float16_muladd(d, b, a, 0, s);
3530}
3531
3532static uint32_t fmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3533{
3534    return float32_muladd(d, b, a, 0, s);
3535}
3536
3537static uint64_t fmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3538{
3539    return float64_muladd(d, b, a, 0, s);
3540}
3541
3542RVVCALL(OPFVV3, vfmadd_vv_h, OP_UUU_H, H2, H2, H2, fmadd16)
3543RVVCALL(OPFVV3, vfmadd_vv_w, OP_UUU_W, H4, H4, H4, fmadd32)
3544RVVCALL(OPFVV3, vfmadd_vv_d, OP_UUU_D, H8, H8, H8, fmadd64)
3545GEN_VEXT_VV_ENV(vfmadd_vv_h, 2)
3546GEN_VEXT_VV_ENV(vfmadd_vv_w, 4)
3547GEN_VEXT_VV_ENV(vfmadd_vv_d, 8)
3548RVVCALL(OPFVF3, vfmadd_vf_h, OP_UUU_H, H2, H2, fmadd16)
3549RVVCALL(OPFVF3, vfmadd_vf_w, OP_UUU_W, H4, H4, fmadd32)
3550RVVCALL(OPFVF3, vfmadd_vf_d, OP_UUU_D, H8, H8, fmadd64)
3551GEN_VEXT_VF(vfmadd_vf_h, 2)
3552GEN_VEXT_VF(vfmadd_vf_w, 4)
3553GEN_VEXT_VF(vfmadd_vf_d, 8)
3554
3555static uint16_t fnmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3556{
3557    return float16_muladd(d, b, a, float_muladd_negate_c |
3558                                   float_muladd_negate_product, s);
3559}
3560
3561static uint32_t fnmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3562{
3563    return float32_muladd(d, b, a, float_muladd_negate_c |
3564                                   float_muladd_negate_product, s);
3565}
3566
3567static uint64_t fnmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3568{
3569    return float64_muladd(d, b, a, float_muladd_negate_c |
3570                                   float_muladd_negate_product, s);
3571}
3572
3573RVVCALL(OPFVV3, vfnmadd_vv_h, OP_UUU_H, H2, H2, H2, fnmadd16)
3574RVVCALL(OPFVV3, vfnmadd_vv_w, OP_UUU_W, H4, H4, H4, fnmadd32)
3575RVVCALL(OPFVV3, vfnmadd_vv_d, OP_UUU_D, H8, H8, H8, fnmadd64)
3576GEN_VEXT_VV_ENV(vfnmadd_vv_h, 2)
3577GEN_VEXT_VV_ENV(vfnmadd_vv_w, 4)
3578GEN_VEXT_VV_ENV(vfnmadd_vv_d, 8)
3579RVVCALL(OPFVF3, vfnmadd_vf_h, OP_UUU_H, H2, H2, fnmadd16)
3580RVVCALL(OPFVF3, vfnmadd_vf_w, OP_UUU_W, H4, H4, fnmadd32)
3581RVVCALL(OPFVF3, vfnmadd_vf_d, OP_UUU_D, H8, H8, fnmadd64)
3582GEN_VEXT_VF(vfnmadd_vf_h, 2)
3583GEN_VEXT_VF(vfnmadd_vf_w, 4)
3584GEN_VEXT_VF(vfnmadd_vf_d, 8)
3585
3586static uint16_t fmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3587{
3588    return float16_muladd(d, b, a, float_muladd_negate_c, s);
3589}
3590
3591static uint32_t fmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3592{
3593    return float32_muladd(d, b, a, float_muladd_negate_c, s);
3594}
3595
3596static uint64_t fmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3597{
3598    return float64_muladd(d, b, a, float_muladd_negate_c, s);
3599}
3600
3601RVVCALL(OPFVV3, vfmsub_vv_h, OP_UUU_H, H2, H2, H2, fmsub16)
3602RVVCALL(OPFVV3, vfmsub_vv_w, OP_UUU_W, H4, H4, H4, fmsub32)
3603RVVCALL(OPFVV3, vfmsub_vv_d, OP_UUU_D, H8, H8, H8, fmsub64)
3604GEN_VEXT_VV_ENV(vfmsub_vv_h, 2)
3605GEN_VEXT_VV_ENV(vfmsub_vv_w, 4)
3606GEN_VEXT_VV_ENV(vfmsub_vv_d, 8)
3607RVVCALL(OPFVF3, vfmsub_vf_h, OP_UUU_H, H2, H2, fmsub16)
3608RVVCALL(OPFVF3, vfmsub_vf_w, OP_UUU_W, H4, H4, fmsub32)
3609RVVCALL(OPFVF3, vfmsub_vf_d, OP_UUU_D, H8, H8, fmsub64)
3610GEN_VEXT_VF(vfmsub_vf_h, 2)
3611GEN_VEXT_VF(vfmsub_vf_w, 4)
3612GEN_VEXT_VF(vfmsub_vf_d, 8)
3613
3614static uint16_t fnmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3615{
3616    return float16_muladd(d, b, a, float_muladd_negate_product, s);
3617}
3618
3619static uint32_t fnmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3620{
3621    return float32_muladd(d, b, a, float_muladd_negate_product, s);
3622}
3623
3624static uint64_t fnmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3625{
3626    return float64_muladd(d, b, a, float_muladd_negate_product, s);
3627}
3628
3629RVVCALL(OPFVV3, vfnmsub_vv_h, OP_UUU_H, H2, H2, H2, fnmsub16)
3630RVVCALL(OPFVV3, vfnmsub_vv_w, OP_UUU_W, H4, H4, H4, fnmsub32)
3631RVVCALL(OPFVV3, vfnmsub_vv_d, OP_UUU_D, H8, H8, H8, fnmsub64)
3632GEN_VEXT_VV_ENV(vfnmsub_vv_h, 2)
3633GEN_VEXT_VV_ENV(vfnmsub_vv_w, 4)
3634GEN_VEXT_VV_ENV(vfnmsub_vv_d, 8)
3635RVVCALL(OPFVF3, vfnmsub_vf_h, OP_UUU_H, H2, H2, fnmsub16)
3636RVVCALL(OPFVF3, vfnmsub_vf_w, OP_UUU_W, H4, H4, fnmsub32)
3637RVVCALL(OPFVF3, vfnmsub_vf_d, OP_UUU_D, H8, H8, fnmsub64)
3638GEN_VEXT_VF(vfnmsub_vf_h, 2)
3639GEN_VEXT_VF(vfnmsub_vf_w, 4)
3640GEN_VEXT_VF(vfnmsub_vf_d, 8)
3641
3642/* Vector Widening Floating-Point Fused Multiply-Add Instructions */
3643static uint32_t fwmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3644{
3645    return float32_muladd(float16_to_float32(a, true, s),
3646                          float16_to_float32(b, true, s), d, 0, s);
3647}
3648
3649static uint64_t fwmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3650{
3651    return float64_muladd(float32_to_float64(a, s),
3652                          float32_to_float64(b, s), d, 0, s);
3653}
3654
3655RVVCALL(OPFVV3, vfwmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwmacc16)
3656RVVCALL(OPFVV3, vfwmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwmacc32)
3657GEN_VEXT_VV_ENV(vfwmacc_vv_h, 4)
3658GEN_VEXT_VV_ENV(vfwmacc_vv_w, 8)
3659RVVCALL(OPFVF3, vfwmacc_vf_h, WOP_UUU_H, H4, H2, fwmacc16)
3660RVVCALL(OPFVF3, vfwmacc_vf_w, WOP_UUU_W, H8, H4, fwmacc32)
3661GEN_VEXT_VF(vfwmacc_vf_h, 4)
3662GEN_VEXT_VF(vfwmacc_vf_w, 8)
3663
3664static uint32_t fwmaccbf16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3665{
3666    return float32_muladd(bfloat16_to_float32(a, s),
3667                          bfloat16_to_float32(b, s), d, 0, s);
3668}
3669
3670RVVCALL(OPFVV3, vfwmaccbf16_vv, WOP_UUU_H, H4, H2, H2, fwmaccbf16)
3671GEN_VEXT_VV_ENV(vfwmaccbf16_vv, 4)
3672RVVCALL(OPFVF3, vfwmaccbf16_vf, WOP_UUU_H, H4, H2, fwmaccbf16)
3673GEN_VEXT_VF(vfwmaccbf16_vf, 4)
3674
3675static uint32_t fwnmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3676{
3677    return float32_muladd(float16_to_float32(a, true, s),
3678                          float16_to_float32(b, true, s), d,
3679                          float_muladd_negate_c | float_muladd_negate_product,
3680                          s);
3681}
3682
3683static uint64_t fwnmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3684{
3685    return float64_muladd(float32_to_float64(a, s), float32_to_float64(b, s),
3686                          d, float_muladd_negate_c |
3687                             float_muladd_negate_product, s);
3688}
3689
3690RVVCALL(OPFVV3, vfwnmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwnmacc16)
3691RVVCALL(OPFVV3, vfwnmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwnmacc32)
3692GEN_VEXT_VV_ENV(vfwnmacc_vv_h, 4)
3693GEN_VEXT_VV_ENV(vfwnmacc_vv_w, 8)
3694RVVCALL(OPFVF3, vfwnmacc_vf_h, WOP_UUU_H, H4, H2, fwnmacc16)
3695RVVCALL(OPFVF3, vfwnmacc_vf_w, WOP_UUU_W, H8, H4, fwnmacc32)
3696GEN_VEXT_VF(vfwnmacc_vf_h, 4)
3697GEN_VEXT_VF(vfwnmacc_vf_w, 8)
3698
3699static uint32_t fwmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3700{
3701    return float32_muladd(float16_to_float32(a, true, s),
3702                          float16_to_float32(b, true, s), d,
3703                          float_muladd_negate_c, s);
3704}
3705
3706static uint64_t fwmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3707{
3708    return float64_muladd(float32_to_float64(a, s),
3709                          float32_to_float64(b, s), d,
3710                          float_muladd_negate_c, s);
3711}
3712
3713RVVCALL(OPFVV3, vfwmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwmsac16)
3714RVVCALL(OPFVV3, vfwmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwmsac32)
3715GEN_VEXT_VV_ENV(vfwmsac_vv_h, 4)
3716GEN_VEXT_VV_ENV(vfwmsac_vv_w, 8)
3717RVVCALL(OPFVF3, vfwmsac_vf_h, WOP_UUU_H, H4, H2, fwmsac16)
3718RVVCALL(OPFVF3, vfwmsac_vf_w, WOP_UUU_W, H8, H4, fwmsac32)
3719GEN_VEXT_VF(vfwmsac_vf_h, 4)
3720GEN_VEXT_VF(vfwmsac_vf_w, 8)
3721
3722static uint32_t fwnmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3723{
3724    return float32_muladd(float16_to_float32(a, true, s),
3725                          float16_to_float32(b, true, s), d,
3726                          float_muladd_negate_product, s);
3727}
3728
3729static uint64_t fwnmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3730{
3731    return float64_muladd(float32_to_float64(a, s),
3732                          float32_to_float64(b, s), d,
3733                          float_muladd_negate_product, s);
3734}
3735
3736RVVCALL(OPFVV3, vfwnmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwnmsac16)
3737RVVCALL(OPFVV3, vfwnmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwnmsac32)
3738GEN_VEXT_VV_ENV(vfwnmsac_vv_h, 4)
3739GEN_VEXT_VV_ENV(vfwnmsac_vv_w, 8)
3740RVVCALL(OPFVF3, vfwnmsac_vf_h, WOP_UUU_H, H4, H2, fwnmsac16)
3741RVVCALL(OPFVF3, vfwnmsac_vf_w, WOP_UUU_W, H8, H4, fwnmsac32)
3742GEN_VEXT_VF(vfwnmsac_vf_h, 4)
3743GEN_VEXT_VF(vfwnmsac_vf_w, 8)
3744
3745/* Vector Floating-Point Square-Root Instruction */
3746#define OPFVV1(NAME, TD, T2, TX2, HD, HS2, OP)         \
3747static void do_##NAME(void *vd, void *vs2, int i,      \
3748                      CPURISCVState *env)              \
3749{                                                      \
3750    TX2 s2 = *((T2 *)vs2 + HS2(i));                    \
3751    *((TD *)vd + HD(i)) = OP(s2, &env->fp_status);     \
3752}
3753
3754#define GEN_VEXT_V_ENV(NAME, ESZ)                      \
3755void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
3756                  CPURISCVState *env, uint32_t desc)   \
3757{                                                      \
3758    uint32_t vm = vext_vm(desc);                       \
3759    uint32_t vl = env->vl;                             \
3760    uint32_t total_elems =                             \
3761        vext_get_total_elems(env, desc, ESZ);          \
3762    uint32_t vta = vext_vta(desc);                     \
3763    uint32_t vma = vext_vma(desc);                     \
3764    uint32_t i;                                        \
3765                                                       \
3766    VSTART_CHECK_EARLY_EXIT(env, vl);                  \
3767                                                       \
3768    if (vl == 0) {                                     \
3769        return;                                        \
3770    }                                                  \
3771    for (i = env->vstart; i < vl; i++) {               \
3772        if (!vm && !vext_elem_mask(v0, i)) {           \
3773            /* set masked-off elements to 1s */        \
3774            vext_set_elems_1s(vd, vma, i * ESZ,        \
3775                              (i + 1) * ESZ);          \
3776            continue;                                  \
3777        }                                              \
3778        do_##NAME(vd, vs2, i, env);                    \
3779    }                                                  \
3780    env->vstart = 0;                                   \
3781    vext_set_elems_1s(vd, vta, vl * ESZ,               \
3782                      total_elems * ESZ);              \
3783}
3784
3785RVVCALL(OPFVV1, vfsqrt_v_h, OP_UU_H, H2, H2, float16_sqrt)
3786RVVCALL(OPFVV1, vfsqrt_v_w, OP_UU_W, H4, H4, float32_sqrt)
3787RVVCALL(OPFVV1, vfsqrt_v_d, OP_UU_D, H8, H8, float64_sqrt)
3788GEN_VEXT_V_ENV(vfsqrt_v_h, 2)
3789GEN_VEXT_V_ENV(vfsqrt_v_w, 4)
3790GEN_VEXT_V_ENV(vfsqrt_v_d, 8)
3791
3792/*
3793 * Vector Floating-Point Reciprocal Square-Root Estimate Instruction
3794 *
3795 * Adapted from riscv-v-spec recip.c:
3796 * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3797 */
3798static uint64_t frsqrt7(uint64_t f, int exp_size, int frac_size)
3799{
3800    uint64_t sign = extract64(f, frac_size + exp_size, 1);
3801    uint64_t exp = extract64(f, frac_size, exp_size);
3802    uint64_t frac = extract64(f, 0, frac_size);
3803
3804    const uint8_t lookup_table[] = {
3805        52, 51, 50, 48, 47, 46, 44, 43,
3806        42, 41, 40, 39, 38, 36, 35, 34,
3807        33, 32, 31, 30, 30, 29, 28, 27,
3808        26, 25, 24, 23, 23, 22, 21, 20,
3809        19, 19, 18, 17, 16, 16, 15, 14,
3810        14, 13, 12, 12, 11, 10, 10, 9,
3811        9, 8, 7, 7, 6, 6, 5, 4,
3812        4, 3, 3, 2, 2, 1, 1, 0,
3813        127, 125, 123, 121, 119, 118, 116, 114,
3814        113, 111, 109, 108, 106, 105, 103, 102,
3815        100, 99, 97, 96, 95, 93, 92, 91,
3816        90, 88, 87, 86, 85, 84, 83, 82,
3817        80, 79, 78, 77, 76, 75, 74, 73,
3818        72, 71, 70, 70, 69, 68, 67, 66,
3819        65, 64, 63, 63, 62, 61, 60, 59,
3820        59, 58, 57, 56, 56, 55, 54, 53
3821    };
3822    const int precision = 7;
3823
3824    if (exp == 0 && frac != 0) { /* subnormal */
3825        /* Normalize the subnormal. */
3826        while (extract64(frac, frac_size - 1, 1) == 0) {
3827            exp--;
3828            frac <<= 1;
3829        }
3830
3831        frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3832    }
3833
3834    int idx = ((exp & 1) << (precision - 1)) |
3835              (frac >> (frac_size - precision + 1));
3836    uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3837                        (frac_size - precision);
3838    uint64_t out_exp = (3 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp) / 2;
3839
3840    uint64_t val = 0;
3841    val = deposit64(val, 0, frac_size, out_frac);
3842    val = deposit64(val, frac_size, exp_size, out_exp);
3843    val = deposit64(val, frac_size + exp_size, 1, sign);
3844    return val;
3845}
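
/*
 * Worked example (illustration only), with the single-precision
 * parameters exp_size = 8, frac_size = 23: for f = 4.0f, exp = 129 and
 * frac = 0, so idx = (1 << 6) | 0 = 64 and lookup_table[64] = 127.
 * out_exp = (3 * 127 + ~129) / 2 = (381 - 130) / 2 = 125, so the result
 * is 1.9921875 * 2^-2 = 0.498..., i.e. 1/sqrt(4) to the 7-bit accuracy
 * of the estimate.
 */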
3846
3847static float16 frsqrt7_h(float16 f, float_status *s)
3848{
3849    int exp_size = 5, frac_size = 10;
3850    bool sign = float16_is_neg(f);
3851
3852    /*
3853     * frsqrt7(sNaN) = canonical NaN
3854     * frsqrt7(-inf) = canonical NaN
3855     * frsqrt7(-normal) = canonical NaN
3856     * frsqrt7(-subnormal) = canonical NaN
3857     */
3858    if (float16_is_signaling_nan(f, s) ||
3859        (float16_is_infinity(f) && sign) ||
3860        (float16_is_normal(f) && sign) ||
3861        (float16_is_zero_or_denormal(f) && !float16_is_zero(f) && sign)) {
3862        s->float_exception_flags |= float_flag_invalid;
3863        return float16_default_nan(s);
3864    }
3865
3866    /* frsqrt7(qNaN) = canonical NaN */
3867    if (float16_is_quiet_nan(f, s)) {
3868        return float16_default_nan(s);
3869    }
3870
3871    /* frsqrt7(+-0) = +-inf */
3872    if (float16_is_zero(f)) {
3873        s->float_exception_flags |= float_flag_divbyzero;
3874        return float16_set_sign(float16_infinity, sign);
3875    }
3876
3877    /* frsqrt7(+inf) = +0 */
3878    if (float16_is_infinity(f) && !sign) {
3879        return float16_set_sign(float16_zero, sign);
3880    }
3881
3882    /* +normal, +subnormal */
3883    uint64_t val = frsqrt7(f, exp_size, frac_size);
3884    return make_float16(val);
3885}
3886
3887static float32 frsqrt7_s(float32 f, float_status *s)
3888{
3889    int exp_size = 8, frac_size = 23;
3890    bool sign = float32_is_neg(f);
3891
3892    /*
3893     * frsqrt7(sNaN) = canonical NaN
3894     * frsqrt7(-inf) = canonical NaN
3895     * frsqrt7(-normal) = canonical NaN
3896     * frsqrt7(-subnormal) = canonical NaN
3897     */
3898    if (float32_is_signaling_nan(f, s) ||
3899        (float32_is_infinity(f) && sign) ||
3900        (float32_is_normal(f) && sign) ||
3901        (float32_is_zero_or_denormal(f) && !float32_is_zero(f) && sign)) {
3902        s->float_exception_flags |= float_flag_invalid;
3903        return float32_default_nan(s);
3904    }
3905
3906    /* frsqrt7(qNaN) = canonical NaN */
3907    if (float32_is_quiet_nan(f, s)) {
3908        return float32_default_nan(s);
3909    }
3910
3911    /* frsqrt7(+-0) = +-inf */
3912    if (float32_is_zero(f)) {
3913        s->float_exception_flags |= float_flag_divbyzero;
3914        return float32_set_sign(float32_infinity, sign);
3915    }
3916
3917    /* frsqrt7(+inf) = +0 */
3918    if (float32_is_infinity(f) && !sign) {
3919        return float32_set_sign(float32_zero, sign);
3920    }
3921
3922    /* +normal, +subnormal */
3923    uint64_t val = frsqrt7(f, exp_size, frac_size);
3924    return make_float32(val);
3925}
3926
3927static float64 frsqrt7_d(float64 f, float_status *s)
3928{
3929    int exp_size = 11, frac_size = 52;
3930    bool sign = float64_is_neg(f);
3931
3932    /*
3933     * frsqrt7(sNaN) = canonical NaN
3934     * frsqrt7(-inf) = canonical NaN
3935     * frsqrt7(-normal) = canonical NaN
3936     * frsqrt7(-subnormal) = canonical NaN
3937     */
3938    if (float64_is_signaling_nan(f, s) ||
3939        (float64_is_infinity(f) && sign) ||
3940        (float64_is_normal(f) && sign) ||
3941        (float64_is_zero_or_denormal(f) && !float64_is_zero(f) && sign)) {
3942        s->float_exception_flags |= float_flag_invalid;
3943        return float64_default_nan(s);
3944    }
3945
3946    /* frsqrt7(qNaN) = canonical NaN */
3947    if (float64_is_quiet_nan(f, s)) {
3948        return float64_default_nan(s);
3949    }
3950
3951    /* frsqrt7(+-0) = +-inf */
3952    if (float64_is_zero(f)) {
3953        s->float_exception_flags |= float_flag_divbyzero;
3954        return float64_set_sign(float64_infinity, sign);
3955    }
3956
3957    /* frsqrt7(+inf) = +0 */
3958    if (float64_is_infinity(f) && !sign) {
3959        return float64_set_sign(float64_zero, sign);
3960    }
3961
3962    /* +normal, +subnormal */
3963    uint64_t val = frsqrt7(f, exp_size, frac_size);
3964    return make_float64(val);
3965}
3966
3967RVVCALL(OPFVV1, vfrsqrt7_v_h, OP_UU_H, H2, H2, frsqrt7_h)
3968RVVCALL(OPFVV1, vfrsqrt7_v_w, OP_UU_W, H4, H4, frsqrt7_s)
3969RVVCALL(OPFVV1, vfrsqrt7_v_d, OP_UU_D, H8, H8, frsqrt7_d)
3970GEN_VEXT_V_ENV(vfrsqrt7_v_h, 2)
3971GEN_VEXT_V_ENV(vfrsqrt7_v_w, 4)
3972GEN_VEXT_V_ENV(vfrsqrt7_v_d, 8)
3973
3974/*
3975 * Vector Floating-Point Reciprocal Estimate Instruction
3976 *
3977 * Adapted from riscv-v-spec recip.c:
3978 * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3979 */
3980static uint64_t frec7(uint64_t f, int exp_size, int frac_size,
3981                      float_status *s)
3982{
3983    uint64_t sign = extract64(f, frac_size + exp_size, 1);
3984    uint64_t exp = extract64(f, frac_size, exp_size);
3985    uint64_t frac = extract64(f, 0, frac_size);
3986
3987    const uint8_t lookup_table[] = {
3988        127, 125, 123, 121, 119, 117, 116, 114,
3989        112, 110, 109, 107, 105, 104, 102, 100,
3990        99, 97, 96, 94, 93, 91, 90, 88,
3991        87, 85, 84, 83, 81, 80, 79, 77,
3992        76, 75, 74, 72, 71, 70, 69, 68,
3993        66, 65, 64, 63, 62, 61, 60, 59,
3994        58, 57, 56, 55, 54, 53, 52, 51,
3995        50, 49, 48, 47, 46, 45, 44, 43,
3996        42, 41, 40, 40, 39, 38, 37, 36,
3997        35, 35, 34, 33, 32, 31, 31, 30,
3998        29, 28, 28, 27, 26, 25, 25, 24,
3999        23, 23, 22, 21, 21, 20, 19, 19,
4000        18, 17, 17, 16, 15, 15, 14, 14,
4001        13, 12, 12, 11, 11, 10, 9, 9,
4002        8, 8, 7, 7, 6, 5, 5, 4,
4003        4, 3, 3, 2, 2, 1, 1, 0
4004    };
4005    const int precision = 7;
4006
4007    if (exp == 0 && frac != 0) { /* subnormal */
4008        /* Normalize the subnormal. */
4009        while (extract64(frac, frac_size - 1, 1) == 0) {
4010            exp--;
4011            frac <<= 1;
4012        }
4013
4014        frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
4015
4016        if (exp != 0 && exp != UINT64_MAX) {
4017            /*
4018             * Overflow to inf or max value of same sign,
4019             * depending on sign and rounding mode.
4020             */
4021            s->float_exception_flags |= (float_flag_inexact |
4022                                         float_flag_overflow);
4023
4024            if ((s->float_rounding_mode == float_round_to_zero) ||
4025                ((s->float_rounding_mode == float_round_down) && !sign) ||
4026                ((s->float_rounding_mode == float_round_up) && sign)) {
4027                /* Return greatest/negative finite value. */
4028                return (sign << (exp_size + frac_size)) |
4029                       (MAKE_64BIT_MASK(frac_size, exp_size) - 1);
4030            } else {
4031                /* Return +-inf. */
4032                return (sign << (exp_size + frac_size)) |
4033                       MAKE_64BIT_MASK(frac_size, exp_size);
4034            }
4035        }
4036    }
4037
4038    int idx = frac >> (frac_size - precision);
4039    uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
4040                        (frac_size - precision);
4041    uint64_t out_exp = 2 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp;
4042
4043    if (out_exp == 0 || out_exp == UINT64_MAX) {
4044        /*
4045         * The result is subnormal, but don't raise the underflow exception,
4046         * because there's no additional loss of precision.
4047         */
4048        out_frac = (out_frac >> 1) | MAKE_64BIT_MASK(frac_size - 1, 1);
4049        if (out_exp == UINT64_MAX) {
4050            out_frac >>= 1;
4051            out_exp = 0;
4052        }
4053    }
4054
4055    uint64_t val = 0;
4056    val = deposit64(val, 0, frac_size, out_frac);
4057    val = deposit64(val, frac_size, exp_size, out_exp);
4058    val = deposit64(val, frac_size + exp_size, 1, sign);
4059    return val;
4060}
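
/*
 * Editor's note: a minimal worked example, not part of the upstream file,
 * of the estimate arithmetic in frec7() for the binary32 input 2.0f
 * (0x40000000: sign 0, exp 128, frac 0).  The function name is hypothetical.
 */
static G_GNUC_UNUSED uint32_t example_frec7_of_two(float_status *s)
{
    /*
     * idx = 0, lookup_table[0] = 127, so out_frac = 127 << (23 - 7);
     * out_exp = 2 * 127 + ~128 = 254 - 129 = 125 (wrapping uint64_t math);
     * the result encodes 2^(125 - 127) * (1 + 127/128) = 0.498046875,
     * i.e. a reciprocal of 2.0 accurate to the expected 7 bits.
     */
    return (uint32_t)frec7(0x40000000, 8, 23, s);
}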
4061
4062static float16 frec7_h(float16 f, float_status *s)
4063{
4064    int exp_size = 5, frac_size = 10;
4065    bool sign = float16_is_neg(f);
4066
4067    /* frec7(+-inf) = +-0 */
4068    if (float16_is_infinity(f)) {
4069        return float16_set_sign(float16_zero, sign);
4070    }
4071
4072    /* frec7(+-0) = +-inf */
4073    if (float16_is_zero(f)) {
4074        s->float_exception_flags |= float_flag_divbyzero;
4075        return float16_set_sign(float16_infinity, sign);
4076    }
4077
4078    /* frec7(sNaN) = canonical NaN */
4079    if (float16_is_signaling_nan(f, s)) {
4080        s->float_exception_flags |= float_flag_invalid;
4081        return float16_default_nan(s);
4082    }
4083
4084    /* frec7(qNaN) = canonical NaN */
4085    if (float16_is_quiet_nan(f, s)) {
4086        return float16_default_nan(s);
4087    }
4088
4089    /* +-normal, +-subnormal */
4090    uint64_t val = frec7(f, exp_size, frac_size, s);
4091    return make_float16(val);
4092}
4093
4094static float32 frec7_s(float32 f, float_status *s)
4095{
4096    int exp_size = 8, frac_size = 23;
4097    bool sign = float32_is_neg(f);
4098
4099    /* frec7(+-inf) = +-0 */
4100    if (float32_is_infinity(f)) {
4101        return float32_set_sign(float32_zero, sign);
4102    }
4103
4104    /* frec7(+-0) = +-inf */
4105    if (float32_is_zero(f)) {
4106        s->float_exception_flags |= float_flag_divbyzero;
4107        return float32_set_sign(float32_infinity, sign);
4108    }
4109
4110    /* frec7(sNaN) = canonical NaN */
4111    if (float32_is_signaling_nan(f, s)) {
4112        s->float_exception_flags |= float_flag_invalid;
4113        return float32_default_nan(s);
4114    }
4115
4116    /* frec7(qNaN) = canonical NaN */
4117    if (float32_is_quiet_nan(f, s)) {
4118        return float32_default_nan(s);
4119    }
4120
4121    /* +-normal, +-subnormal */
4122    uint64_t val = frec7(f, exp_size, frac_size, s);
4123    return make_float32(val);
4124}
4125
4126static float64 frec7_d(float64 f, float_status *s)
4127{
4128    int exp_size = 11, frac_size = 52;
4129    bool sign = float64_is_neg(f);
4130
4131    /* frec7(+-inf) = +-0 */
4132    if (float64_is_infinity(f)) {
4133        return float64_set_sign(float64_zero, sign);
4134    }
4135
4136    /* frec7(+-0) = +-inf */
4137    if (float64_is_zero(f)) {
4138        s->float_exception_flags |= float_flag_divbyzero;
4139        return float64_set_sign(float64_infinity, sign);
4140    }
4141
4142    /* frec7(sNaN) = canonical NaN */
4143    if (float64_is_signaling_nan(f, s)) {
4144        s->float_exception_flags |= float_flag_invalid;
4145        return float64_default_nan(s);
4146    }
4147
4148    /* frec7(qNaN) = canonical NaN */
4149    if (float64_is_quiet_nan(f, s)) {
4150        return float64_default_nan(s);
4151    }
4152
4153    /* +-normal, +-subnormal */
4154    uint64_t val = frec7(f, exp_size, frac_size, s);
4155    return make_float64(val);
4156}
4157
4158RVVCALL(OPFVV1, vfrec7_v_h, OP_UU_H, H2, H2, frec7_h)
4159RVVCALL(OPFVV1, vfrec7_v_w, OP_UU_W, H4, H4, frec7_s)
4160RVVCALL(OPFVV1, vfrec7_v_d, OP_UU_D, H8, H8, frec7_d)
4161GEN_VEXT_V_ENV(vfrec7_v_h, 2)
4162GEN_VEXT_V_ENV(vfrec7_v_w, 4)
4163GEN_VEXT_V_ENV(vfrec7_v_d, 8)
4164
4165/* Vector Floating-Point MIN/MAX Instructions */
4166RVVCALL(OPFVV2, vfmin_vv_h, OP_UUU_H, H2, H2, H2, float16_minimum_number)
4167RVVCALL(OPFVV2, vfmin_vv_w, OP_UUU_W, H4, H4, H4, float32_minimum_number)
4168RVVCALL(OPFVV2, vfmin_vv_d, OP_UUU_D, H8, H8, H8, float64_minimum_number)
4169GEN_VEXT_VV_ENV(vfmin_vv_h, 2)
4170GEN_VEXT_VV_ENV(vfmin_vv_w, 4)
4171GEN_VEXT_VV_ENV(vfmin_vv_d, 8)
4172RVVCALL(OPFVF2, vfmin_vf_h, OP_UUU_H, H2, H2, float16_minimum_number)
4173RVVCALL(OPFVF2, vfmin_vf_w, OP_UUU_W, H4, H4, float32_minimum_number)
4174RVVCALL(OPFVF2, vfmin_vf_d, OP_UUU_D, H8, H8, float64_minimum_number)
4175GEN_VEXT_VF(vfmin_vf_h, 2)
4176GEN_VEXT_VF(vfmin_vf_w, 4)
4177GEN_VEXT_VF(vfmin_vf_d, 8)
4178
4179RVVCALL(OPFVV2, vfmax_vv_h, OP_UUU_H, H2, H2, H2, float16_maximum_number)
4180RVVCALL(OPFVV2, vfmax_vv_w, OP_UUU_W, H4, H4, H4, float32_maximum_number)
4181RVVCALL(OPFVV2, vfmax_vv_d, OP_UUU_D, H8, H8, H8, float64_maximum_number)
4182GEN_VEXT_VV_ENV(vfmax_vv_h, 2)
4183GEN_VEXT_VV_ENV(vfmax_vv_w, 4)
4184GEN_VEXT_VV_ENV(vfmax_vv_d, 8)
4185RVVCALL(OPFVF2, vfmax_vf_h, OP_UUU_H, H2, H2, float16_maximum_number)
4186RVVCALL(OPFVF2, vfmax_vf_w, OP_UUU_W, H4, H4, float32_maximum_number)
4187RVVCALL(OPFVF2, vfmax_vf_d, OP_UUU_D, H8, H8, float64_maximum_number)
4188GEN_VEXT_VF(vfmax_vf_h, 2)
4189GEN_VEXT_VF(vfmax_vf_w, 4)
4190GEN_VEXT_VF(vfmax_vf_d, 8)
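
/*
 * Editor's note: an illustrative sketch, not part of the upstream file, of
 * the IEEE 754-2019 minimumNumber semantics the vfmin/vfmax helpers above
 * rely on: a single quiet NaN operand is ignored rather than propagated.
 * The function name is hypothetical.
 */
static G_GNUC_UNUSED float32 example_min_ignores_single_nan(float_status *s)
{
    float32 qnan = float32_default_nan(s);
    float32 one = make_float32(0x3F800000);    /* 1.0f */

    /*
     * minimumNumber(qNaN, 1.0) == 1.0; a NaN result only occurs when
     * both inputs are NaN.
     */
    return float32_minimum_number(qnan, one, s);
}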
4191
4192/* Vector Floating-Point Sign-Injection Instructions */
4193static uint16_t fsgnj16(uint16_t a, uint16_t b, float_status *s)
4194{
4195    return deposit64(b, 0, 15, a);
4196}
4197
4198static uint32_t fsgnj32(uint32_t a, uint32_t b, float_status *s)
4199{
4200    return deposit64(b, 0, 31, a);
4201}
4202
4203static uint64_t fsgnj64(uint64_t a, uint64_t b, float_status *s)
4204{
4205    return deposit64(b, 0, 63, a);
4206}
4207
4208RVVCALL(OPFVV2, vfsgnj_vv_h, OP_UUU_H, H2, H2, H2, fsgnj16)
4209RVVCALL(OPFVV2, vfsgnj_vv_w, OP_UUU_W, H4, H4, H4, fsgnj32)
4210RVVCALL(OPFVV2, vfsgnj_vv_d, OP_UUU_D, H8, H8, H8, fsgnj64)
4211GEN_VEXT_VV_ENV(vfsgnj_vv_h, 2)
4212GEN_VEXT_VV_ENV(vfsgnj_vv_w, 4)
4213GEN_VEXT_VV_ENV(vfsgnj_vv_d, 8)
4214RVVCALL(OPFVF2, vfsgnj_vf_h, OP_UUU_H, H2, H2, fsgnj16)
4215RVVCALL(OPFVF2, vfsgnj_vf_w, OP_UUU_W, H4, H4, fsgnj32)
4216RVVCALL(OPFVF2, vfsgnj_vf_d, OP_UUU_D, H8, H8, fsgnj64)
4217GEN_VEXT_VF(vfsgnj_vf_h, 2)
4218GEN_VEXT_VF(vfsgnj_vf_w, 4)
4219GEN_VEXT_VF(vfsgnj_vf_d, 8)
4220
4221static uint16_t fsgnjn16(uint16_t a, uint16_t b, float_status *s)
4222{
4223    return deposit64(~b, 0, 15, a);
4224}
4225
4226static uint32_t fsgnjn32(uint32_t a, uint32_t b, float_status *s)
4227{
4228    return deposit64(~b, 0, 31, a);
4229}
4230
4231static uint64_t fsgnjn64(uint64_t a, uint64_t b, float_status *s)
4232{
4233    return deposit64(~b, 0, 63, a);
4234}
4235
4236RVVCALL(OPFVV2, vfsgnjn_vv_h, OP_UUU_H, H2, H2, H2, fsgnjn16)
4237RVVCALL(OPFVV2, vfsgnjn_vv_w, OP_UUU_W, H4, H4, H4, fsgnjn32)
4238RVVCALL(OPFVV2, vfsgnjn_vv_d, OP_UUU_D, H8, H8, H8, fsgnjn64)
4239GEN_VEXT_VV_ENV(vfsgnjn_vv_h, 2)
4240GEN_VEXT_VV_ENV(vfsgnjn_vv_w, 4)
4241GEN_VEXT_VV_ENV(vfsgnjn_vv_d, 8)
4242RVVCALL(OPFVF2, vfsgnjn_vf_h, OP_UUU_H, H2, H2, fsgnjn16)
4243RVVCALL(OPFVF2, vfsgnjn_vf_w, OP_UUU_W, H4, H4, fsgnjn32)
4244RVVCALL(OPFVF2, vfsgnjn_vf_d, OP_UUU_D, H8, H8, fsgnjn64)
4245GEN_VEXT_VF(vfsgnjn_vf_h, 2)
4246GEN_VEXT_VF(vfsgnjn_vf_w, 4)
4247GEN_VEXT_VF(vfsgnjn_vf_d, 8)
4248
4249static uint16_t fsgnjx16(uint16_t a, uint16_t b, float_status *s)
4250{
4251    return deposit64(b ^ a, 0, 15, a);
4252}
4253
4254static uint32_t fsgnjx32(uint32_t a, uint32_t b, float_status *s)
4255{
4256    return deposit64(b ^ a, 0, 31, a);
4257}
4258
4259static uint64_t fsgnjx64(uint64_t a, uint64_t b, float_status *s)
4260{
4261    return deposit64(b ^ a, 0, 63, a);
4262}
4263
4264RVVCALL(OPFVV2, vfsgnjx_vv_h, OP_UUU_H, H2, H2, H2, fsgnjx16)
4265RVVCALL(OPFVV2, vfsgnjx_vv_w, OP_UUU_W, H4, H4, H4, fsgnjx32)
4266RVVCALL(OPFVV2, vfsgnjx_vv_d, OP_UUU_D, H8, H8, H8, fsgnjx64)
4267GEN_VEXT_VV_ENV(vfsgnjx_vv_h, 2)
4268GEN_VEXT_VV_ENV(vfsgnjx_vv_w, 4)
4269GEN_VEXT_VV_ENV(vfsgnjx_vv_d, 8)
4270RVVCALL(OPFVF2, vfsgnjx_vf_h, OP_UUU_H, H2, H2, fsgnjx16)
4271RVVCALL(OPFVF2, vfsgnjx_vf_w, OP_UUU_W, H4, H4, fsgnjx32)
4272RVVCALL(OPFVF2, vfsgnjx_vf_d, OP_UUU_D, H8, H8, fsgnjx64)
4273GEN_VEXT_VF(vfsgnjx_vf_h, 2)
4274GEN_VEXT_VF(vfsgnjx_vf_w, 4)
4275GEN_VEXT_VF(vfsgnjx_vf_d, 8)
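
/*
 * Editor's note: a small illustrative helper, not part of the upstream file,
 * spelling out the deposit64() pattern used by fsgnj16/32/64 and friends
 * above: the low bits (exponent and mantissa) come from the first operand,
 * only the sign bit comes from the second.  The name is hypothetical.
 */
static G_GNUC_UNUSED uint16_t example_fsgnj16_by_hand(uint16_t mag,
                                                      uint16_t sgn)
{
    /* Same result as fsgnj16(mag, sgn, NULL) for any binary16 bit pattern */
    return (uint16_t)((sgn & 0x8000u) | (mag & 0x7fffu));
}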
4276
4277/* Vector Floating-Point Compare Instructions */
4278#define GEN_VEXT_CMP_VV_ENV(NAME, ETYPE, H, DO_OP)            \
4279void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
4280                  CPURISCVState *env, uint32_t desc)          \
4281{                                                             \
4282    uint32_t vm = vext_vm(desc);                              \
4283    uint32_t vl = env->vl;                                    \
4284    uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;    \
4285    uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
4286    uint32_t vma = vext_vma(desc);                            \
4287    uint32_t i;                                               \
4288                                                              \
4289    VSTART_CHECK_EARLY_EXIT(env, vl);                         \
4290                                                              \
4291    for (i = env->vstart; i < vl; i++) {                      \
4292        ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
4293        ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4294        if (!vm && !vext_elem_mask(v0, i)) {                  \
4295            /* set masked-off elements to 1s */               \
4296            if (vma) {                                        \
4297                vext_set_elem_mask(vd, i, 1);                 \
4298            }                                                 \
4299            continue;                                         \
4300        }                                                     \
4301        vext_set_elem_mask(vd, i,                             \
4302                           DO_OP(s2, s1, &env->fp_status));   \
4303    }                                                         \
4304    env->vstart = 0;                                          \
4305    /*
4306     * mask destination register is always tail-agnostic
4307     * set tail elements to 1s
4308     */                                                       \
4309    if (vta_all_1s) {                                         \
4310        for (; i < total_elems; i++) {                        \
4311            vext_set_elem_mask(vd, i, 1);                     \
4312        }                                                     \
4313    }                                                         \
4314}
4315
4316GEN_VEXT_CMP_VV_ENV(vmfeq_vv_h, uint16_t, H2, float16_eq_quiet)
4317GEN_VEXT_CMP_VV_ENV(vmfeq_vv_w, uint32_t, H4, float32_eq_quiet)
4318GEN_VEXT_CMP_VV_ENV(vmfeq_vv_d, uint64_t, H8, float64_eq_quiet)
4319
4320#define GEN_VEXT_CMP_VF(NAME, ETYPE, H, DO_OP)                      \
4321void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2,       \
4322                  CPURISCVState *env, uint32_t desc)                \
4323{                                                                   \
4324    uint32_t vm = vext_vm(desc);                                    \
4325    uint32_t vl = env->vl;                                          \
4326    uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;          \
4327    uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
4328    uint32_t vma = vext_vma(desc);                                  \
4329    uint32_t i;                                                     \
4330                                                                    \
4331    VSTART_CHECK_EARLY_EXIT(env, vl);                               \
4332                                                                    \
4333    for (i = env->vstart; i < vl; i++) {                            \
4334        ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
4335        if (!vm && !vext_elem_mask(v0, i)) {                        \
4336            /* set masked-off elements to 1s */                     \
4337            if (vma) {                                              \
4338                vext_set_elem_mask(vd, i, 1);                       \
4339            }                                                       \
4340            continue;                                               \
4341        }                                                           \
4342        vext_set_elem_mask(vd, i,                                   \
4343                           DO_OP(s2, (ETYPE)s1, &env->fp_status));  \
4344    }                                                               \
4345    env->vstart = 0;                                                \
4346    /*
4347     * mask destination register is always tail-agnostic
4348     * set tail elements to 1s
4349     */                                                             \
4350    if (vta_all_1s) {                                               \
4351        for (; i < total_elems; i++) {                              \
4352            vext_set_elem_mask(vd, i, 1);                           \
4353        }                                                           \
4354    }                                                               \
4355}
4356
4357GEN_VEXT_CMP_VF(vmfeq_vf_h, uint16_t, H2, float16_eq_quiet)
4358GEN_VEXT_CMP_VF(vmfeq_vf_w, uint32_t, H4, float32_eq_quiet)
4359GEN_VEXT_CMP_VF(vmfeq_vf_d, uint64_t, H8, float64_eq_quiet)
4360
4361static bool vmfne16(uint16_t a, uint16_t b, float_status *s)
4362{
4363    FloatRelation compare = float16_compare_quiet(a, b, s);
4364    return compare != float_relation_equal;
4365}
4366
4367static bool vmfne32(uint32_t a, uint32_t b, float_status *s)
4368{
4369    FloatRelation compare = float32_compare_quiet(a, b, s);
4370    return compare != float_relation_equal;
4371}
4372
4373static bool vmfne64(uint64_t a, uint64_t b, float_status *s)
4374{
4375    FloatRelation compare = float64_compare_quiet(a, b, s);
4376    return compare != float_relation_equal;
4377}
4378
4379GEN_VEXT_CMP_VV_ENV(vmfne_vv_h, uint16_t, H2, vmfne16)
4380GEN_VEXT_CMP_VV_ENV(vmfne_vv_w, uint32_t, H4, vmfne32)
4381GEN_VEXT_CMP_VV_ENV(vmfne_vv_d, uint64_t, H8, vmfne64)
4382GEN_VEXT_CMP_VF(vmfne_vf_h, uint16_t, H2, vmfne16)
4383GEN_VEXT_CMP_VF(vmfne_vf_w, uint32_t, H4, vmfne32)
4384GEN_VEXT_CMP_VF(vmfne_vf_d, uint64_t, H8, vmfne64)
4385
4386GEN_VEXT_CMP_VV_ENV(vmflt_vv_h, uint16_t, H2, float16_lt)
4387GEN_VEXT_CMP_VV_ENV(vmflt_vv_w, uint32_t, H4, float32_lt)
4388GEN_VEXT_CMP_VV_ENV(vmflt_vv_d, uint64_t, H8, float64_lt)
4389GEN_VEXT_CMP_VF(vmflt_vf_h, uint16_t, H2, float16_lt)
4390GEN_VEXT_CMP_VF(vmflt_vf_w, uint32_t, H4, float32_lt)
4391GEN_VEXT_CMP_VF(vmflt_vf_d, uint64_t, H8, float64_lt)
4392
4393GEN_VEXT_CMP_VV_ENV(vmfle_vv_h, uint16_t, H2, float16_le)
4394GEN_VEXT_CMP_VV_ENV(vmfle_vv_w, uint32_t, H4, float32_le)
4395GEN_VEXT_CMP_VV_ENV(vmfle_vv_d, uint64_t, H8, float64_le)
4396GEN_VEXT_CMP_VF(vmfle_vf_h, uint16_t, H2, float16_le)
4397GEN_VEXT_CMP_VF(vmfle_vf_w, uint32_t, H4, float32_le)
4398GEN_VEXT_CMP_VF(vmfle_vf_d, uint64_t, H8, float64_le)
4399
4400static bool vmfgt16(uint16_t a, uint16_t b, float_status *s)
4401{
4402    FloatRelation compare = float16_compare(a, b, s);
4403    return compare == float_relation_greater;
4404}
4405
4406static bool vmfgt32(uint32_t a, uint32_t b, float_status *s)
4407{
4408    FloatRelation compare = float32_compare(a, b, s);
4409    return compare == float_relation_greater;
4410}
4411
4412static bool vmfgt64(uint64_t a, uint64_t b, float_status *s)
4413{
4414    FloatRelation compare = float64_compare(a, b, s);
4415    return compare == float_relation_greater;
4416}
4417
4418GEN_VEXT_CMP_VF(vmfgt_vf_h, uint16_t, H2, vmfgt16)
4419GEN_VEXT_CMP_VF(vmfgt_vf_w, uint32_t, H4, vmfgt32)
4420GEN_VEXT_CMP_VF(vmfgt_vf_d, uint64_t, H8, vmfgt64)
4421
4422static bool vmfge16(uint16_t a, uint16_t b, float_status *s)
4423{
4424    FloatRelation compare = float16_compare(a, b, s);
4425    return compare == float_relation_greater ||
4426           compare == float_relation_equal;
4427}
4428
4429static bool vmfge32(uint32_t a, uint32_t b, float_status *s)
4430{
4431    FloatRelation compare = float32_compare(a, b, s);
4432    return compare == float_relation_greater ||
4433           compare == float_relation_equal;
4434}
4435
4436static bool vmfge64(uint64_t a, uint64_t b, float_status *s)
4437{
4438    FloatRelation compare = float64_compare(a, b, s);
4439    return compare == float_relation_greater ||
4440           compare == float_relation_equal;
4441}
4442
4443GEN_VEXT_CMP_VF(vmfge_vf_h, uint16_t, H2, vmfge16)
4444GEN_VEXT_CMP_VF(vmfge_vf_w, uint32_t, H4, vmfge32)
4445GEN_VEXT_CMP_VF(vmfge_vf_d, uint64_t, H8, vmfge64)
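
/*
 * Editor's note: an illustrative sketch, not part of the upstream file, of
 * why the equality compares above use the *_quiet predicates while the
 * ordering compares do not: with a quiet NaN operand vmfeq must not raise
 * the invalid flag, whereas vmflt must.  The function name is hypothetical.
 */
static G_GNUC_UNUSED void example_quiet_vs_signaling_compare(float_status *s)
{
    float32 qnan = float32_default_nan(s);
    float32 one = make_float32(0x3F800000);    /* 1.0f */

    float32_eq_quiet(qnan, one, s);    /* false, no flag raised */
    float32_lt(qnan, one, s);          /* false, raises float_flag_invalid */
}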
4446
4447/* Vector Floating-Point Classify Instruction */
4448target_ulong fclass_h(uint64_t frs1)
4449{
4450    float16 f = frs1;
4451    bool sign = float16_is_neg(f);
4452
4453    if (float16_is_infinity(f)) {
4454        return sign ? 1 << 0 : 1 << 7;
4455    } else if (float16_is_zero(f)) {
4456        return sign ? 1 << 3 : 1 << 4;
4457    } else if (float16_is_zero_or_denormal(f)) {
4458        return sign ? 1 << 2 : 1 << 5;
4459    } else if (float16_is_any_nan(f)) {
4460        float_status s = { }; /* for snan_bit_is_one */
4461        return float16_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4462    } else {
4463        return sign ? 1 << 1 : 1 << 6;
4464    }
4465}
4466
4467target_ulong fclass_s(uint64_t frs1)
4468{
4469    float32 f = frs1;
4470    bool sign = float32_is_neg(f);
4471
4472    if (float32_is_infinity(f)) {
4473        return sign ? 1 << 0 : 1 << 7;
4474    } else if (float32_is_zero(f)) {
4475        return sign ? 1 << 3 : 1 << 4;
4476    } else if (float32_is_zero_or_denormal(f)) {
4477        return sign ? 1 << 2 : 1 << 5;
4478    } else if (float32_is_any_nan(f)) {
4479        float_status s = { }; /* for snan_bit_is_one */
4480        return float32_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4481    } else {
4482        return sign ? 1 << 1 : 1 << 6;
4483    }
4484}
4485
4486target_ulong fclass_d(uint64_t frs1)
4487{
4488    float64 f = frs1;
4489    bool sign = float64_is_neg(f);
4490
4491    if (float64_is_infinity(f)) {
4492        return sign ? 1 << 0 : 1 << 7;
4493    } else if (float64_is_zero(f)) {
4494        return sign ? 1 << 3 : 1 << 4;
4495    } else if (float64_is_zero_or_denormal(f)) {
4496        return sign ? 1 << 2 : 1 << 5;
4497    } else if (float64_is_any_nan(f)) {
4498        float_status s = { }; /* for snan_bit_is_one */
4499        return float64_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4500    } else {
4501        return sign ? 1 << 1 : 1 << 6;
4502    }
4503}
4504
4505RVVCALL(OPIVV1, vfclass_v_h, OP_UU_H, H2, H2, fclass_h)
4506RVVCALL(OPIVV1, vfclass_v_w, OP_UU_W, H4, H4, fclass_s)
4507RVVCALL(OPIVV1, vfclass_v_d, OP_UU_D, H8, H8, fclass_d)
4508GEN_VEXT_V(vfclass_v_h, 2)
4509GEN_VEXT_V(vfclass_v_w, 4)
4510GEN_VEXT_V(vfclass_v_d, 8)
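
/*
 * Editor's note: an illustrative table, not part of the upstream file,
 * naming the one-hot result bits produced by fclass_h/s/d above (the same
 * encoding as the scalar fclass instructions).  The helper is hypothetical.
 */
static G_GNUC_UNUSED const char *example_fclass_bit_name(int bit)
{
    static const char *const names[10] = {
        "-inf",           /* bit 0 */
        "-normal",        /* bit 1 */
        "-subnormal",     /* bit 2 */
        "-0",             /* bit 3 */
        "+0",             /* bit 4 */
        "+subnormal",     /* bit 5 */
        "+normal",        /* bit 6 */
        "+inf",           /* bit 7 */
        "signaling NaN",  /* bit 8 */
        "quiet NaN",      /* bit 9 */
    };
    return (bit >= 0 && bit < 10) ? names[bit] : "reserved";
}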
4511
4512/* Vector Floating-Point Merge Instruction */
4513
4514#define GEN_VFMERGE_VF(NAME, ETYPE, H)                        \
4515void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
4516                  CPURISCVState *env, uint32_t desc)          \
4517{                                                             \
4518    uint32_t vm = vext_vm(desc);                              \
4519    uint32_t vl = env->vl;                                    \
4520    uint32_t esz = sizeof(ETYPE);                             \
4521    uint32_t total_elems =                                    \
4522        vext_get_total_elems(env, desc, esz);                 \
4523    uint32_t vta = vext_vta(desc);                            \
4524    uint32_t i;                                               \
4525                                                              \
4526    VSTART_CHECK_EARLY_EXIT(env, vl);                         \
4527                                                              \
4528    for (i = env->vstart; i < vl; i++) {                      \
4529        ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4530        *((ETYPE *)vd + H(i)) =                               \
4531            (!vm && !vext_elem_mask(v0, i) ? s2 : s1);        \
4532    }                                                         \
4533    env->vstart = 0;                                          \
4534    /* set tail elements to 1s */                             \
4535    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
4536}
4537
4538GEN_VFMERGE_VF(vfmerge_vfm_h, int16_t, H2)
4539GEN_VFMERGE_VF(vfmerge_vfm_w, int32_t, H4)
4540GEN_VFMERGE_VF(vfmerge_vfm_d, int64_t, H8)
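
/*
 * Editor's note: a scalar sketch, not part of the upstream file, of the
 * per-element behaviour GEN_VFMERGE_VF generates: active elements take the
 * scalar rs1 value, inactive elements keep vs2.  The name is hypothetical.
 */
static G_GNUC_UNUSED uint32_t example_vfmerge_element(bool active,
                                                      uint32_t rs1_bits,
                                                      uint32_t vs2_bits)
{
    return active ? rs1_bits : vs2_bits;
}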
4541
4542/* Single-Width Floating-Point/Integer Type-Convert Instructions */
4543/* vfcvt.xu.f.v vd, vs2, vm # Convert float to unsigned integer. */
4544RVVCALL(OPFVV1, vfcvt_xu_f_v_h, OP_UU_H, H2, H2, float16_to_uint16)
4545RVVCALL(OPFVV1, vfcvt_xu_f_v_w, OP_UU_W, H4, H4, float32_to_uint32)
4546RVVCALL(OPFVV1, vfcvt_xu_f_v_d, OP_UU_D, H8, H8, float64_to_uint64)
4547GEN_VEXT_V_ENV(vfcvt_xu_f_v_h, 2)
4548GEN_VEXT_V_ENV(vfcvt_xu_f_v_w, 4)
4549GEN_VEXT_V_ENV(vfcvt_xu_f_v_d, 8)
4550
4551/* vfcvt.x.f.v vd, vs2, vm # Convert float to signed integer. */
4552RVVCALL(OPFVV1, vfcvt_x_f_v_h, OP_UU_H, H2, H2, float16_to_int16)
4553RVVCALL(OPFVV1, vfcvt_x_f_v_w, OP_UU_W, H4, H4, float32_to_int32)
4554RVVCALL(OPFVV1, vfcvt_x_f_v_d, OP_UU_D, H8, H8, float64_to_int64)
4555GEN_VEXT_V_ENV(vfcvt_x_f_v_h, 2)
4556GEN_VEXT_V_ENV(vfcvt_x_f_v_w, 4)
4557GEN_VEXT_V_ENV(vfcvt_x_f_v_d, 8)
4558
4559/* vfcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to float. */
4560RVVCALL(OPFVV1, vfcvt_f_xu_v_h, OP_UU_H, H2, H2, uint16_to_float16)
4561RVVCALL(OPFVV1, vfcvt_f_xu_v_w, OP_UU_W, H4, H4, uint32_to_float32)
4562RVVCALL(OPFVV1, vfcvt_f_xu_v_d, OP_UU_D, H8, H8, uint64_to_float64)
4563GEN_VEXT_V_ENV(vfcvt_f_xu_v_h, 2)
4564GEN_VEXT_V_ENV(vfcvt_f_xu_v_w, 4)
4565GEN_VEXT_V_ENV(vfcvt_f_xu_v_d, 8)
4566
4567/* vfcvt.f.x.v vd, vs2, vm # Convert integer to float. */
4568RVVCALL(OPFVV1, vfcvt_f_x_v_h, OP_UU_H, H2, H2, int16_to_float16)
4569RVVCALL(OPFVV1, vfcvt_f_x_v_w, OP_UU_W, H4, H4, int32_to_float32)
4570RVVCALL(OPFVV1, vfcvt_f_x_v_d, OP_UU_D, H8, H8, int64_to_float64)
4571GEN_VEXT_V_ENV(vfcvt_f_x_v_h, 2)
4572GEN_VEXT_V_ENV(vfcvt_f_x_v_w, 4)
4573GEN_VEXT_V_ENV(vfcvt_f_x_v_d, 8)
4574
4575/* Widening Floating-Point/Integer Type-Convert Instructions */
4576/* (TD, T2, TX2) */
4577#define WOP_UU_B uint16_t, uint8_t,  uint8_t
4578#define WOP_UU_H uint32_t, uint16_t, uint16_t
4579#define WOP_UU_W uint64_t, uint32_t, uint32_t
4580/*
4581 * vfwcvt.xu.f.v vd, vs2, vm # Convert float to double-width unsigned integer.
4582 */
4583RVVCALL(OPFVV1, vfwcvt_xu_f_v_h, WOP_UU_H, H4, H2, float16_to_uint32)
4584RVVCALL(OPFVV1, vfwcvt_xu_f_v_w, WOP_UU_W, H8, H4, float32_to_uint64)
4585GEN_VEXT_V_ENV(vfwcvt_xu_f_v_h, 4)
4586GEN_VEXT_V_ENV(vfwcvt_xu_f_v_w, 8)
4587
4588/* vfwcvt.x.f.v vd, vs2, vm # Convert float to double-width signed integer. */
4589RVVCALL(OPFVV1, vfwcvt_x_f_v_h, WOP_UU_H, H4, H2, float16_to_int32)
4590RVVCALL(OPFVV1, vfwcvt_x_f_v_w, WOP_UU_W, H8, H4, float32_to_int64)
4591GEN_VEXT_V_ENV(vfwcvt_x_f_v_h, 4)
4592GEN_VEXT_V_ENV(vfwcvt_x_f_v_w, 8)
4593
4594/*
4595 * vfwcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to double-width float.
4596 */
4597RVVCALL(OPFVV1, vfwcvt_f_xu_v_b, WOP_UU_B, H2, H1, uint8_to_float16)
4598RVVCALL(OPFVV1, vfwcvt_f_xu_v_h, WOP_UU_H, H4, H2, uint16_to_float32)
4599RVVCALL(OPFVV1, vfwcvt_f_xu_v_w, WOP_UU_W, H8, H4, uint32_to_float64)
4600GEN_VEXT_V_ENV(vfwcvt_f_xu_v_b, 2)
4601GEN_VEXT_V_ENV(vfwcvt_f_xu_v_h, 4)
4602GEN_VEXT_V_ENV(vfwcvt_f_xu_v_w, 8)
4603
4604/* vfwcvt.f.x.v vd, vs2, vm # Convert integer to double-width float. */
4605RVVCALL(OPFVV1, vfwcvt_f_x_v_b, WOP_UU_B, H2, H1, int8_to_float16)
4606RVVCALL(OPFVV1, vfwcvt_f_x_v_h, WOP_UU_H, H4, H2, int16_to_float32)
4607RVVCALL(OPFVV1, vfwcvt_f_x_v_w, WOP_UU_W, H8, H4, int32_to_float64)
4608GEN_VEXT_V_ENV(vfwcvt_f_x_v_b, 2)
4609GEN_VEXT_V_ENV(vfwcvt_f_x_v_h, 4)
4610GEN_VEXT_V_ENV(vfwcvt_f_x_v_w, 8)
4611
4612/*
4613 * vfwcvt.f.f.v vd, vs2, vm # Convert single-width float to double-width float.
4614 */
4615static uint32_t vfwcvtffv16(uint16_t a, float_status *s)
4616{
4617    return float16_to_float32(a, true, s);
4618}
4619
4620RVVCALL(OPFVV1, vfwcvt_f_f_v_h, WOP_UU_H, H4, H2, vfwcvtffv16)
4621RVVCALL(OPFVV1, vfwcvt_f_f_v_w, WOP_UU_W, H8, H4, float32_to_float64)
4622GEN_VEXT_V_ENV(vfwcvt_f_f_v_h, 4)
4623GEN_VEXT_V_ENV(vfwcvt_f_f_v_w, 8)
4624
4625RVVCALL(OPFVV1, vfwcvtbf16_f_f_v, WOP_UU_H, H4, H2, bfloat16_to_float32)
4626GEN_VEXT_V_ENV(vfwcvtbf16_f_f_v, 4)
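
/*
 * Editor's note: an illustrative one-liner, not part of the upstream file,
 * showing why the bfloat16 -> float32 widening above is value-preserving:
 * bfloat16 is the top half of a binary32 encoding, so (NaN canonicalisation
 * aside) the conversion is a 16-bit left shift.  The name is hypothetical.
 */
static G_GNUC_UNUSED uint32_t example_bf16_widen_bits(uint16_t bf16_bits)
{
    return (uint32_t)bf16_bits << 16;
}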
4627
4628/* Narrowing Floating-Point/Integer Type-Convert Instructions */
4629/* (TD, T2, TX2) */
4630#define NOP_UU_B uint8_t,  uint16_t, uint32_t
4631#define NOP_UU_H uint16_t, uint32_t, uint32_t
4632#define NOP_UU_W uint32_t, uint64_t, uint64_t
4633/* vfncvt.xu.f.w vd, vs2, vm # Convert double-width float to unsigned integer. */
4634RVVCALL(OPFVV1, vfncvt_xu_f_w_b, NOP_UU_B, H1, H2, float16_to_uint8)
4635RVVCALL(OPFVV1, vfncvt_xu_f_w_h, NOP_UU_H, H2, H4, float32_to_uint16)
4636RVVCALL(OPFVV1, vfncvt_xu_f_w_w, NOP_UU_W, H4, H8, float64_to_uint32)
4637GEN_VEXT_V_ENV(vfncvt_xu_f_w_b, 1)
4638GEN_VEXT_V_ENV(vfncvt_xu_f_w_h, 2)
4639GEN_VEXT_V_ENV(vfncvt_xu_f_w_w, 4)
4640
4641/* vfncvt.x.f.w vd, vs2, vm # Convert double-width float to signed integer. */
4642RVVCALL(OPFVV1, vfncvt_x_f_w_b, NOP_UU_B, H1, H2, float16_to_int8)
4643RVVCALL(OPFVV1, vfncvt_x_f_w_h, NOP_UU_H, H2, H4, float32_to_int16)
4644RVVCALL(OPFVV1, vfncvt_x_f_w_w, NOP_UU_W, H4, H8, float64_to_int32)
4645GEN_VEXT_V_ENV(vfncvt_x_f_w_b, 1)
4646GEN_VEXT_V_ENV(vfncvt_x_f_w_h, 2)
4647GEN_VEXT_V_ENV(vfncvt_x_f_w_w, 4)
4648
4649/*
4650 * vfncvt.f.xu.w vd, vs2, vm # Convert double-width unsigned integer to float.
4651 */
4652RVVCALL(OPFVV1, vfncvt_f_xu_w_h, NOP_UU_H, H2, H4, uint32_to_float16)
4653RVVCALL(OPFVV1, vfncvt_f_xu_w_w, NOP_UU_W, H4, H8, uint64_to_float32)
4654GEN_VEXT_V_ENV(vfncvt_f_xu_w_h, 2)
4655GEN_VEXT_V_ENV(vfncvt_f_xu_w_w, 4)
4656
4657/* vfncvt.f.x.w vd, vs2, vm # Convert double-width integer to float. */
4658RVVCALL(OPFVV1, vfncvt_f_x_w_h, NOP_UU_H, H2, H4, int32_to_float16)
4659RVVCALL(OPFVV1, vfncvt_f_x_w_w, NOP_UU_W, H4, H8, int64_to_float32)
4660GEN_VEXT_V_ENV(vfncvt_f_x_w_h, 2)
4661GEN_VEXT_V_ENV(vfncvt_f_x_w_w, 4)
4662
4663/* vfncvt.f.f.w vd, vs2, vm # Convert double-width float to single-width float. */
4664static uint16_t vfncvtffv16(uint32_t a, float_status *s)
4665{
4666    return float32_to_float16(a, true, s);
4667}
4668
4669RVVCALL(OPFVV1, vfncvt_f_f_w_h, NOP_UU_H, H2, H4, vfncvtffv16)
4670RVVCALL(OPFVV1, vfncvt_f_f_w_w, NOP_UU_W, H4, H8, float64_to_float32)
4671GEN_VEXT_V_ENV(vfncvt_f_f_w_h, 2)
4672GEN_VEXT_V_ENV(vfncvt_f_f_w_w, 4)
4673
4674RVVCALL(OPFVV1, vfncvtbf16_f_f_w, NOP_UU_H, H2, H4, float32_to_bfloat16)
4675GEN_VEXT_V_ENV(vfncvtbf16_f_f_w, 2)
4676
4677/*
4678 * Vector Reduction Operations
4679 */
4680/* Vector Single-Width Integer Reduction Instructions */
4681#define GEN_VEXT_RED(NAME, TD, TS2, HD, HS2, OP)          \
4682void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4683                  void *vs2, CPURISCVState *env,          \
4684                  uint32_t desc)                          \
4685{                                                         \
4686    uint32_t vm = vext_vm(desc);                          \
4687    uint32_t vl = env->vl;                                \
4688    uint32_t esz = sizeof(TD);                            \
4689    uint32_t vlenb = simd_maxsz(desc);                    \
4690    uint32_t vta = vext_vta(desc);                        \
4691    uint32_t i;                                           \
4692    TD s1 =  *((TD *)vs1 + HD(0));                        \
4693                                                          \
4694    VSTART_CHECK_EARLY_EXIT(env, vl);                     \
4695                                                          \
4696    for (i = env->vstart; i < vl; i++) {                  \
4697        TS2 s2 = *((TS2 *)vs2 + HS2(i));                  \
4698        if (!vm && !vext_elem_mask(v0, i)) {              \
4699            continue;                                     \
4700        }                                                 \
4701        s1 = OP(s1, (TD)s2);                              \
4702    }                                                     \
4703    if (vl > 0) {                                         \
4704        *((TD *)vd + HD(0)) = s1;                         \
4705    }                                                     \
4706    env->vstart = 0;                                      \
4707    /* set tail elements to 1s */                         \
4708    vext_set_elems_1s(vd, vta, esz, vlenb);               \
4709}
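
/*
 * Editor's note: a self-contained sketch, not part of the upstream file, of
 * the scalar pattern GEN_VEXT_RED expands to, for a signed 32-bit sum: the
 * accumulator starts from vs1[0] and only active elements are folded in.
 * Names are hypothetical and the mask is simplified to one byte per element.
 */
static G_GNUC_UNUSED int32_t example_masked_sum_reduction(const int32_t *vs2,
                                                          const uint8_t *mask,
                                                          int32_t vs1_elem0,
                                                          uint32_t vl)
{
    int32_t acc = vs1_elem0;

    for (uint32_t i = 0; i < vl; i++) {
        if (mask[i]) {             /* masked-off elements are skipped */
            acc += vs2[i];
        }
    }
    return acc;                    /* stored to vd[0] when vl > 0 */
}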
4710
4711/* vd[0] = sum(vs1[0], vs2[*]) */
4712GEN_VEXT_RED(vredsum_vs_b, int8_t,  int8_t,  H1, H1, DO_ADD)
4713GEN_VEXT_RED(vredsum_vs_h, int16_t, int16_t, H2, H2, DO_ADD)
4714GEN_VEXT_RED(vredsum_vs_w, int32_t, int32_t, H4, H4, DO_ADD)
4715GEN_VEXT_RED(vredsum_vs_d, int64_t, int64_t, H8, H8, DO_ADD)
4716
4717/* vd[0] = maxu(vs1[0], vs2[*]) */
4718GEN_VEXT_RED(vredmaxu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MAX)
4719GEN_VEXT_RED(vredmaxu_vs_h, uint16_t, uint16_t, H2, H2, DO_MAX)
4720GEN_VEXT_RED(vredmaxu_vs_w, uint32_t, uint32_t, H4, H4, DO_MAX)
4721GEN_VEXT_RED(vredmaxu_vs_d, uint64_t, uint64_t, H8, H8, DO_MAX)
4722
4723/* vd[0] = max(vs1[0], vs2[*]) */
4724GEN_VEXT_RED(vredmax_vs_b, int8_t,  int8_t,  H1, H1, DO_MAX)
4725GEN_VEXT_RED(vredmax_vs_h, int16_t, int16_t, H2, H2, DO_MAX)
4726GEN_VEXT_RED(vredmax_vs_w, int32_t, int32_t, H4, H4, DO_MAX)
4727GEN_VEXT_RED(vredmax_vs_d, int64_t, int64_t, H8, H8, DO_MAX)
4728
4729/* vd[0] = minu(vs1[0], vs2[*]) */
4730GEN_VEXT_RED(vredminu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MIN)
4731GEN_VEXT_RED(vredminu_vs_h, uint16_t, uint16_t, H2, H2, DO_MIN)
4732GEN_VEXT_RED(vredminu_vs_w, uint32_t, uint32_t, H4, H4, DO_MIN)
4733GEN_VEXT_RED(vredminu_vs_d, uint64_t, uint64_t, H8, H8, DO_MIN)
4734
4735/* vd[0] = min(vs1[0], vs2[*]) */
4736GEN_VEXT_RED(vredmin_vs_b, int8_t,  int8_t,  H1, H1, DO_MIN)
4737GEN_VEXT_RED(vredmin_vs_h, int16_t, int16_t, H2, H2, DO_MIN)
4738GEN_VEXT_RED(vredmin_vs_w, int32_t, int32_t, H4, H4, DO_MIN)
4739GEN_VEXT_RED(vredmin_vs_d, int64_t, int64_t, H8, H8, DO_MIN)
4740
4741/* vd[0] = and(vs1[0], vs2[*]) */
4742GEN_VEXT_RED(vredand_vs_b, int8_t,  int8_t,  H1, H1, DO_AND)
4743GEN_VEXT_RED(vredand_vs_h, int16_t, int16_t, H2, H2, DO_AND)
4744GEN_VEXT_RED(vredand_vs_w, int32_t, int32_t, H4, H4, DO_AND)
4745GEN_VEXT_RED(vredand_vs_d, int64_t, int64_t, H8, H8, DO_AND)
4746
4747/* vd[0] = or(vs1[0], vs2[*]) */
4748GEN_VEXT_RED(vredor_vs_b, int8_t,  int8_t,  H1, H1, DO_OR)
4749GEN_VEXT_RED(vredor_vs_h, int16_t, int16_t, H2, H2, DO_OR)
4750GEN_VEXT_RED(vredor_vs_w, int32_t, int32_t, H4, H4, DO_OR)
4751GEN_VEXT_RED(vredor_vs_d, int64_t, int64_t, H8, H8, DO_OR)
4752
4753/* vd[0] = xor(vs1[0], vs2[*]) */
4754GEN_VEXT_RED(vredxor_vs_b, int8_t,  int8_t,  H1, H1, DO_XOR)
4755GEN_VEXT_RED(vredxor_vs_h, int16_t, int16_t, H2, H2, DO_XOR)
4756GEN_VEXT_RED(vredxor_vs_w, int32_t, int32_t, H4, H4, DO_XOR)
4757GEN_VEXT_RED(vredxor_vs_d, int64_t, int64_t, H8, H8, DO_XOR)
4758
4759/* Vector Widening Integer Reduction Instructions */
4760/* signed sum reduction into double-width accumulator */
4761GEN_VEXT_RED(vwredsum_vs_b, int16_t, int8_t,  H2, H1, DO_ADD)
4762GEN_VEXT_RED(vwredsum_vs_h, int32_t, int16_t, H4, H2, DO_ADD)
4763GEN_VEXT_RED(vwredsum_vs_w, int64_t, int32_t, H8, H4, DO_ADD)
4764
4765/* Unsigned sum reduction into double-width accumulator */
4766GEN_VEXT_RED(vwredsumu_vs_b, uint16_t, uint8_t,  H2, H1, DO_ADD)
4767GEN_VEXT_RED(vwredsumu_vs_h, uint32_t, uint16_t, H4, H2, DO_ADD)
4768GEN_VEXT_RED(vwredsumu_vs_w, uint64_t, uint32_t, H8, H4, DO_ADD)
4769
4770/* Vector Single-Width Floating-Point Reduction Instructions */
4771#define GEN_VEXT_FRED(NAME, TD, TS2, HD, HS2, OP)          \
4772void HELPER(NAME)(void *vd, void *v0, void *vs1,           \
4773                  void *vs2, CPURISCVState *env,           \
4774                  uint32_t desc)                           \
4775{                                                          \
4776    uint32_t vm = vext_vm(desc);                           \
4777    uint32_t vl = env->vl;                                 \
4778    uint32_t esz = sizeof(TD);                             \
4779    uint32_t vlenb = simd_maxsz(desc);                     \
4780    uint32_t vta = vext_vta(desc);                         \
4781    uint32_t i;                                            \
4782    TD s1 =  *((TD *)vs1 + HD(0));                         \
4783                                                           \
4784    VSTART_CHECK_EARLY_EXIT(env, vl);                      \
4785                                                           \
4786    for (i = env->vstart; i < vl; i++) {                   \
4787        TS2 s2 = *((TS2 *)vs2 + HS2(i));                   \
4788        if (!vm && !vext_elem_mask(v0, i)) {               \
4789            continue;                                      \
4790        }                                                  \
4791        s1 = OP(s1, (TD)s2, &env->fp_status);              \
4792    }                                                      \
4793    if (vl > 0) {                                          \
4794        *((TD *)vd + HD(0)) = s1;                          \
4795    }                                                      \
4796    env->vstart = 0;                                       \
4797    /* set tail elements to 1s */                          \
4798    vext_set_elems_1s(vd, vta, esz, vlenb);                \
4799}
4800
4801/* Unordered sum */
4802GEN_VEXT_FRED(vfredusum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4803GEN_VEXT_FRED(vfredusum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4804GEN_VEXT_FRED(vfredusum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
4805
4806/* Ordered sum */
4807GEN_VEXT_FRED(vfredosum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4808GEN_VEXT_FRED(vfredosum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4809GEN_VEXT_FRED(vfredosum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
4810
4811/* Maximum value */
4812GEN_VEXT_FRED(vfredmax_vs_h, uint16_t, uint16_t, H2, H2,
4813              float16_maximum_number)
4814GEN_VEXT_FRED(vfredmax_vs_w, uint32_t, uint32_t, H4, H4,
4815              float32_maximum_number)
4816GEN_VEXT_FRED(vfredmax_vs_d, uint64_t, uint64_t, H8, H8,
4817              float64_maximum_number)
4818
4819/* Minimum value */
4820GEN_VEXT_FRED(vfredmin_vs_h, uint16_t, uint16_t, H2, H2,
4821              float16_minimum_number)
4822GEN_VEXT_FRED(vfredmin_vs_w, uint32_t, uint32_t, H4, H4,
4823              float32_minimum_number)
4824GEN_VEXT_FRED(vfredmin_vs_d, uint64_t, uint64_t, H8, H8,
4825              float64_minimum_number)
4826
4827/* Vector Widening Floating-Point Add Instructions */
4828static uint32_t fwadd16(uint32_t a, uint16_t b, float_status *s)
4829{
4830    return float32_add(a, float16_to_float32(b, true, s), s);
4831}
4832
4833static uint64_t fwadd32(uint64_t a, uint32_t b, float_status *s)
4834{
4835    return float64_add(a, float32_to_float64(b, s), s);
4836}
4837
4838/* Vector Widening Floating-Point Reduction Instructions */
4839/* Ordered/unordered reduce 2*SEW = 2*SEW + sum(promote(SEW)) */
4840GEN_VEXT_FRED(vfwredusum_vs_h, uint32_t, uint16_t, H4, H2, fwadd16)
4841GEN_VEXT_FRED(vfwredusum_vs_w, uint64_t, uint32_t, H8, H4, fwadd32)
4842GEN_VEXT_FRED(vfwredosum_vs_h, uint32_t, uint16_t, H4, H2, fwadd16)
4843GEN_VEXT_FRED(vfwredosum_vs_w, uint64_t, uint32_t, H8, H4, fwadd32)
4844
4845/*
4846 * Vector Mask Operations
4847 */
4848/* Vector Mask-Register Logical Instructions */
4849#define GEN_VEXT_MASK_VV(NAME, OP)                        \
4850void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4851                  void *vs2, CPURISCVState *env,          \
4852                  uint32_t desc)                          \
4853{                                                         \
4854    uint32_t vl = env->vl;                                \
4855    uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;\
4856    uint32_t vta_all_1s = vext_vta_all_1s(desc);          \
4857    uint32_t i;                                           \
4858    int a, b;                                             \
4859                                                          \
4860    VSTART_CHECK_EARLY_EXIT(env, vl);                     \
4861                                                          \
4862    for (i = env->vstart; i < vl; i++) {                  \
4863        a = vext_elem_mask(vs1, i);                       \
4864        b = vext_elem_mask(vs2, i);                       \
4865        vext_set_elem_mask(vd, i, OP(b, a));              \
4866    }                                                     \
4867    env->vstart = 0;                                      \
4868    /*
4869     * mask destination register is always tail-agnostic
4870     * set tail elements to 1s
4871     */                                                   \
4872    if (vta_all_1s) {                                     \
4873        for (; i < total_elems; i++) {                    \
4874            vext_set_elem_mask(vd, i, 1);                 \
4875        }                                                 \
4876    }                                                     \
4877}
4878
4879#define DO_NAND(N, M)  (!(N & M))
4880#define DO_ANDNOT(N, M)  (N & !M)
4881#define DO_NOR(N, M)  (!(N | M))
4882#define DO_ORNOT(N, M)  (N | !M)
4883#define DO_XNOR(N, M)  (!(N ^ M))
4884
4885GEN_VEXT_MASK_VV(vmand_mm, DO_AND)
4886GEN_VEXT_MASK_VV(vmnand_mm, DO_NAND)
4887GEN_VEXT_MASK_VV(vmandn_mm, DO_ANDNOT)
4888GEN_VEXT_MASK_VV(vmxor_mm, DO_XOR)
4889GEN_VEXT_MASK_VV(vmor_mm, DO_OR)
4890GEN_VEXT_MASK_VV(vmnor_mm, DO_NOR)
4891GEN_VEXT_MASK_VV(vmorn_mm, DO_ORNOT)
4892GEN_VEXT_MASK_VV(vmxnor_mm, DO_XNOR)
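
/*
 * Editor's note: a single-bit sketch, not part of the upstream file, of how
 * GEN_VEXT_MASK_VV applies DO_OP(b, a) per mask bit, using vmandn.mm as the
 * example: the result is vs2 & ~vs1.  The function name is hypothetical.
 */
static G_GNUC_UNUSED int example_vmandn_bit(int vs2_bit, int vs1_bit)
{
    return DO_ANDNOT(vs2_bit, vs1_bit);    /* == vs2_bit & !vs1_bit */
}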
4893
4894/* Vector count population in mask vcpop */
4895target_ulong HELPER(vcpop_m)(void *v0, void *vs2, CPURISCVState *env,
4896                             uint32_t desc)
4897{
4898    target_ulong cnt = 0;
4899    uint32_t vm = vext_vm(desc);
4900    uint32_t vl = env->vl;
4901    int i;
4902
4903    for (i = env->vstart; i < vl; i++) {
4904        if (vm || vext_elem_mask(v0, i)) {
4905            if (vext_elem_mask(vs2, i)) {
4906                cnt++;
4907            }
4908        }
4909    }
4910    env->vstart = 0;
4911    return cnt;
4912}
4913
4914/* vfirst find-first-set mask bit */
4915target_ulong HELPER(vfirst_m)(void *v0, void *vs2, CPURISCVState *env,
4916                              uint32_t desc)
4917{
4918    uint32_t vm = vext_vm(desc);
4919    uint32_t vl = env->vl;
4920    int i;
4921
4922    for (i = env->vstart; i < vl; i++) {
4923        if (vm || vext_elem_mask(v0, i)) {
4924            if (vext_elem_mask(vs2, i)) {
4925                return i;
4926            }
4927        }
4928    }
4929    env->vstart = 0;
4930    return -1LL;
4931}
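
/*
 * Editor's note: a self-contained sketch, not part of the upstream file, of
 * the scan vfirst_m performs, with the packed mask simplified to one byte
 * per element and masking by v0 ignored.  The name is hypothetical.
 */
static G_GNUC_UNUSED long example_first_set_mask_bit(const uint8_t *mask_bits,
                                                     uint32_t vl)
{
    for (uint32_t i = 0; i < vl; i++) {
        if (mask_bits[i]) {
            return (long)i;    /* mirrors the early return above */
        }
    }
    return -1;                 /* no set bit among the first vl elements */
}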
4932
4933enum set_mask_type {
4934    ONLY_FIRST = 1,
4935    INCLUDE_FIRST,
4936    BEFORE_FIRST,
4937};
4938
4939static void vmsetm(void *vd, void *v0, void *vs2, CPURISCVState *env,
4940                   uint32_t desc, enum set_mask_type type)
4941{
4942    uint32_t vm = vext_vm(desc);
4943    uint32_t vl = env->vl;
4944    uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;
4945    uint32_t vta_all_1s = vext_vta_all_1s(desc);
4946    uint32_t vma = vext_vma(desc);
4947    int i;
4948    bool first_mask_bit = false;
4949
4950    VSTART_CHECK_EARLY_EXIT(env, vl);
4951
4952    for (i = env->vstart; i < vl; i++) {
4953        if (!vm && !vext_elem_mask(v0, i)) {
4954            /* set masked-off elements to 1s */
4955            if (vma) {
4956                vext_set_elem_mask(vd, i, 1);
4957            }
4958            continue;
4959        }
4960        /* write a zero to all following active elements */
4961        if (first_mask_bit) {
4962            vext_set_elem_mask(vd, i, 0);
4963            continue;
4964        }
4965        if (vext_elem_mask(vs2, i)) {
4966            first_mask_bit = true;
4967            if (type == BEFORE_FIRST) {
4968                vext_set_elem_mask(vd, i, 0);
4969            } else {
4970                vext_set_elem_mask(vd, i, 1);
4971            }
4972        } else {
4973            if (type == ONLY_FIRST) {
4974                vext_set_elem_mask(vd, i, 0);
4975            } else {
4976                vext_set_elem_mask(vd, i, 1);
4977            }
4978        }
4979    }
4980    env->vstart = 0;
4981    /*
4982     * mask destination register is always tail-agnostic
4983     * set tail elements to 1s
4984     */
4985    if (vta_all_1s) {
4986        for (; i < total_elems; i++) {
4987            vext_set_elem_mask(vd, i, 1);
4988        }
4989    }
4990}
4991
4992void HELPER(vmsbf_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4993                     uint32_t desc)
4994{
4995    vmsetm(vd, v0, vs2, env, desc, BEFORE_FIRST);
4996}
4997
4998void HELPER(vmsif_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4999                     uint32_t desc)
5000{
5001    vmsetm(vd, v0, vs2, env, desc, INCLUDE_FIRST);
5002}
5003
5004void HELPER(vmsof_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
5005                     uint32_t desc)
5006{
5007    vmsetm(vd, v0, vs2, env, desc, ONLY_FIRST);
5008}
5009
5010/* Vector Iota Instruction */
5011#define GEN_VEXT_VIOTA_M(NAME, ETYPE, H)                                  \
5012void HELPER(NAME)(void *vd, void *v0, void *vs2, CPURISCVState *env,      \
5013                  uint32_t desc)                                          \
5014{                                                                         \
5015    uint32_t vm = vext_vm(desc);                                          \
5016    uint32_t vl = env->vl;                                                \
5017    uint32_t esz = sizeof(ETYPE);                                         \
5018    uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5019    uint32_t vta = vext_vta(desc);                                        \
5020    uint32_t vma = vext_vma(desc);                                        \
5021    uint32_t sum = 0;                                                     \
5022    int i;                                                                \
5023                                                                          \
5024    VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5025                                                                          \
5026    for (i = env->vstart; i < vl; i++) {                                  \
5027        if (!vm && !vext_elem_mask(v0, i)) {                              \
5028            /* set masked-off elements to 1s */                           \
5029            vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5030            continue;                                                     \
5031        }                                                                 \
5032        *((ETYPE *)vd + H(i)) = sum;                                      \
5033        if (vext_elem_mask(vs2, i)) {                                     \
5034            sum++;                                                        \
5035        }                                                                 \
5036    }                                                                     \
5037    env->vstart = 0;                                                      \
5038    /* set tail elements to 1s */                                         \
5039    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5040}
5041
5042GEN_VEXT_VIOTA_M(viota_m_b, uint8_t,  H1)
5043GEN_VEXT_VIOTA_M(viota_m_h, uint16_t, H2)
5044GEN_VEXT_VIOTA_M(viota_m_w, uint32_t, H4)
5045GEN_VEXT_VIOTA_M(viota_m_d, uint64_t, H8)
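
/*
 * Editor's note: a self-contained sketch, not part of the upstream file, of
 * the viota computation above as an exclusive prefix sum over the source
 * mask, with masking by v0 and tail handling omitted.  Names are
 * hypothetical and the mask is simplified to one byte per element.
 */
static G_GNUC_UNUSED void example_viota(uint32_t *vd, const uint8_t *vs2_bits,
                                        uint32_t vl)
{
    uint32_t sum = 0;

    for (uint32_t i = 0; i < vl; i++) {
        vd[i] = sum;                   /* set bits strictly before i */
        sum += vs2_bits[i] ? 1 : 0;
    }
}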
5046
5047/* Vector Element Index Instruction */
5048#define GEN_VEXT_VID_V(NAME, ETYPE, H)                                    \
5049void HELPER(NAME)(void *vd, void *v0, CPURISCVState *env, uint32_t desc)  \
5050{                                                                         \
5051    uint32_t vm = vext_vm(desc);                                          \
5052    uint32_t vl = env->vl;                                                \
5053    uint32_t esz = sizeof(ETYPE);                                         \
5054    uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5055    uint32_t vta = vext_vta(desc);                                        \
5056    uint32_t vma = vext_vma(desc);                                        \
5057    int i;                                                                \
5058                                                                          \
5059    VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5060                                                                          \
5061    for (i = env->vstart; i < vl; i++) {                                  \
5062        if (!vm && !vext_elem_mask(v0, i)) {                              \
5063            /* set masked-off elements to 1s */                           \
5064            vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5065            continue;                                                     \
5066        }                                                                 \
5067        *((ETYPE *)vd + H(i)) = i;                                        \
5068    }                                                                     \
5069    env->vstart = 0;                                                      \
5070    /* set tail elements to 1s */                                         \
5071    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5072}
5073
5074GEN_VEXT_VID_V(vid_v_b, uint8_t,  H1)
5075GEN_VEXT_VID_V(vid_v_h, uint16_t, H2)
5076GEN_VEXT_VID_V(vid_v_w, uint32_t, H4)
5077GEN_VEXT_VID_V(vid_v_d, uint64_t, H8)
5078
5079/*
5080 * Vector Permutation Instructions
5081 */
5082
5083/* Vector Slide Instructions */
5084#define GEN_VEXT_VSLIDEUP_VX(NAME, ETYPE, H)                              \
5085void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5086                  CPURISCVState *env, uint32_t desc)                      \
5087{                                                                         \
5088    uint32_t vm = vext_vm(desc);                                          \
5089    uint32_t vl = env->vl;                                                \
5090    uint32_t esz = sizeof(ETYPE);                                         \
5091    uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5092    uint32_t vta = vext_vta(desc);                                        \
5093    uint32_t vma = vext_vma(desc);                                        \
5094    target_ulong offset = s1, i_min, i;                                   \
5095                                                                          \
5096    VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5097                                                                          \
5098    i_min = MAX(env->vstart, offset);                                     \
5099    for (i = i_min; i < vl; i++) {                                        \
5100        if (!vm && !vext_elem_mask(v0, i)) {                              \
5101            /* set masked-off elements to 1s */                           \
5102            vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5103            continue;                                                     \
5104        }                                                                 \
5105        *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - offset));          \
5106    }                                                                     \
5107    env->vstart = 0;                                                      \
5108    /* set tail elements to 1s */                                         \
5109    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5110}
5111
5112/* vslideup.vx vd, vs2, rs1, vm # vd[i+rs1] = vs2[i] */
5113GEN_VEXT_VSLIDEUP_VX(vslideup_vx_b, uint8_t,  H1)
5114GEN_VEXT_VSLIDEUP_VX(vslideup_vx_h, uint16_t, H2)
5115GEN_VEXT_VSLIDEUP_VX(vslideup_vx_w, uint32_t, H4)
5116GEN_VEXT_VSLIDEUP_VX(vslideup_vx_d, uint64_t, H8)
5117
5118#define GEN_VEXT_VSLIDEDOWN_VX(NAME, ETYPE, H)                            \
5119void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5120                  CPURISCVState *env, uint32_t desc)                      \
5121{                                                                         \
5122    uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
5123    uint32_t vm = vext_vm(desc);                                          \
5124    uint32_t vl = env->vl;                                                \
5125    uint32_t esz = sizeof(ETYPE);                                         \
5126    uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5127    uint32_t vta = vext_vta(desc);                                        \
5128    uint32_t vma = vext_vma(desc);                                        \
5129    target_ulong i_max, i_min, i;                                         \
5130                                                                          \
5131    VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5132                                                                          \
5133    i_min = MIN(s1 < vlmax ? vlmax - s1 : 0, vl);                         \
5134    i_max = MAX(i_min, env->vstart);                                      \
5135    for (i = env->vstart; i < i_max; ++i) {                               \
5136        if (!vm && !vext_elem_mask(v0, i)) {                              \
5137            /* set masked-off elements to 1s */                           \
5138            vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5139            continue;                                                     \
5140        }                                                                 \
5141        *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + s1));              \
5142    }                                                                     \
5143                                                                          \
5144    for (i = i_max; i < vl; ++i) {                                        \
5145        if (!vm && !vext_elem_mask(v0, i)) {                              \
5146            vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5147            continue;                                                     \
5148        }                                                                 \
5149        *((ETYPE *)vd + H(i)) = 0;                                        \
5150    }                                                                     \
5151                                                                          \
5152    env->vstart = 0;                                                      \
5153    /* set tail elements to 1s */                                         \
5154    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5155}
5156
5157/* vslidedown.vx vd, vs2, rs1, vm # vd[i] = vs2[i+rs1] */
5158GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_b, uint8_t,  H1)
5159GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_h, uint16_t, H2)
5160GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_w, uint32_t, H4)
5161GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_d, uint64_t, H8)
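
/*
 * Editor's note: a self-contained sketch, not part of the upstream file, of
 * the index arithmetic used by the slide helpers above, assuming vstart == 0
 * and ignoring masking and tail handling: vslideup reads vs2[i - offset] for
 * i >= offset, vslidedown reads vs2[i + offset] while that index stays below
 * vlmax and writes zero afterwards.  Names are hypothetical.
 */
static G_GNUC_UNUSED void example_slides(uint32_t *up, uint32_t *down,
                                         const uint32_t *vs2, uint32_t offset,
                                         uint32_t vl, uint32_t vlmax)
{
    for (uint32_t i = offset; i < vl; i++) {
        up[i] = vs2[i - offset];
    }
    for (uint32_t i = 0; i < vl; i++) {
        down[i] = (i + offset < vlmax) ? vs2[i + offset] : 0;
    }
}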
5162
5163#define GEN_VEXT_VSLIDE1UP(BITWIDTH, H)                                     \
5164static void vslide1up_##BITWIDTH(void *vd, void *v0, uint64_t s1,           \
5165                                 void *vs2, CPURISCVState *env,             \
5166                                 uint32_t desc)                             \
5167{                                                                           \
5168    typedef uint##BITWIDTH##_t ETYPE;                                       \
5169    uint32_t vm = vext_vm(desc);                                            \
5170    uint32_t vl = env->vl;                                                  \
5171    uint32_t esz = sizeof(ETYPE);                                           \
5172    uint32_t total_elems = vext_get_total_elems(env, desc, esz);            \
5173    uint32_t vta = vext_vta(desc);                                          \
5174    uint32_t vma = vext_vma(desc);                                          \
5175    uint32_t i;                                                             \
5176                                                                            \
5177    VSTART_CHECK_EARLY_EXIT(env, vl);                                       \
5178                                                                            \
5179    for (i = env->vstart; i < vl; i++) {                                    \
5180        if (!vm && !vext_elem_mask(v0, i)) {                                \
5181            /* set masked-off elements to 1s */                             \
5182            vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);             \
5183            continue;                                                       \
5184        }                                                                   \
5185        if (i == 0) {                                                       \
5186            *((ETYPE *)vd + H(i)) = s1;                                     \
5187        } else {                                                            \
5188            *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - 1));             \
5189        }                                                                   \
5190    }                                                                       \
5191    env->vstart = 0;                                                        \
5192    /* set tail elements to 1s */                                           \
5193    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                \
5194}
5195
5196GEN_VEXT_VSLIDE1UP(8,  H1)
5197GEN_VEXT_VSLIDE1UP(16, H2)
5198GEN_VEXT_VSLIDE1UP(32, H4)
5199GEN_VEXT_VSLIDE1UP(64, H8)
5200
5201#define GEN_VEXT_VSLIDE1UP_VX(NAME, BITWIDTH)                     \
5202void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
5203                  CPURISCVState *env, uint32_t desc)              \
5204{                                                                 \
5205    vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);             \
5206}
5207
5208/* vslide1up.vx vd, vs2, rs1, vm # vd[0]=x[rs1], vd[i+1] = vs2[i] */
5209GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_b, 8)
5210GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_h, 16)
5211GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_w, 32)
5212GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_d, 64)
5213
5214#define GEN_VEXT_VSLIDE1DOWN(BITWIDTH, H)                                     \
5215static void vslide1down_##BITWIDTH(void *vd, void *v0, uint64_t s1,           \
5216                                   void *vs2, CPURISCVState *env,             \
5217                                   uint32_t desc)                             \
5218{                                                                             \
5219    typedef uint##BITWIDTH##_t ETYPE;                                         \
5220    uint32_t vm = vext_vm(desc);                                              \
5221    uint32_t vl = env->vl;                                                    \
5222    uint32_t esz = sizeof(ETYPE);                                             \
5223    uint32_t total_elems = vext_get_total_elems(env, desc, esz);              \
5224    uint32_t vta = vext_vta(desc);                                            \
5225    uint32_t vma = vext_vma(desc);                                            \
5226    uint32_t i;                                                               \
5227                                                                              \
5228    VSTART_CHECK_EARLY_EXIT(env, vl);                                         \
5229                                                                              \
5230    for (i = env->vstart; i < vl; i++) {                                      \
5231        if (!vm && !vext_elem_mask(v0, i)) {                                  \
5232            /* set masked-off elements to 1s */                               \
5233            vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);               \
5234            continue;                                                         \
5235        }                                                                     \
5236        if (i == vl - 1) {                                                    \
5237            *((ETYPE *)vd + H(i)) = s1;                                       \
5238        } else {                                                              \
5239            *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + 1));               \
5240        }                                                                     \
5241    }                                                                         \
5242    env->vstart = 0;                                                          \
5243    /* set tail elements to 1s */                                             \
5244    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                  \
5245}
5246
5247GEN_VEXT_VSLIDE1DOWN(8,  H1)
5248GEN_VEXT_VSLIDE1DOWN(16, H2)
5249GEN_VEXT_VSLIDE1DOWN(32, H4)
5250GEN_VEXT_VSLIDE1DOWN(64, H8)
5251
5252#define GEN_VEXT_VSLIDE1DOWN_VX(NAME, BITWIDTH)                   \
5253void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
5254                  CPURISCVState *env, uint32_t desc)              \
5255{                                                                 \
5256    vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);           \
5257}
5258
5259/* vslide1down.vx vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=x[rs1] */
5260GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_b, 8)
5261GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_h, 16)
5262GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_w, 32)
5263GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_d, 64)
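
/*
 * Illustrative data movement for vslide1down.vx, assuming vl = 4 and no
 * masking:
 *     vs2 = {a, b, c, d},  x[rs1] = s
 *     vd  = {b, c, d, s}
 * The last active element (i == vl - 1) takes the scalar; every other
 * active element i takes vs2[i + 1].
 */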
5264
5265/* Vector Floating-Point Slide Instructions */
5266#define GEN_VEXT_VFSLIDE1UP_VF(NAME, BITWIDTH)                \
5267void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
5268                  CPURISCVState *env, uint32_t desc)          \
5269{                                                             \
5270    vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);         \
5271}
5272
5273/* vfslide1up.vf vd, vs2, rs1, vm # vd[0]=f[rs1], vd[i+1] = vs2[i] */
5274GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_h, 16)
5275GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_w, 32)
5276GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_d, 64)
5277
5278#define GEN_VEXT_VFSLIDE1DOWN_VF(NAME, BITWIDTH)              \
5279void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
5280                  CPURISCVState *env, uint32_t desc)          \
5281{                                                             \
5282    vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);       \
5283}
5284
5285/* vfslide1down.vf vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=f[rs1] */
5286GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_h, 16)
5287GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_w, 32)
5288GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_d, 64)
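
/*
 * The vfslide1up.vf/vfslide1down.vf helpers simply forward the scalar's
 * raw bit pattern to the integer slide1 helpers above: the value is only
 * moved, never operated on, so no rounding mode or exception flags are
 * involved here.
 */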
5289
5290/* Vector Register Gather Instructions */
5291#define GEN_VEXT_VRGATHER_VV(NAME, TS1, TS2, HS1, HS2)                    \
5292void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5293                  CPURISCVState *env, uint32_t desc)                      \
5294{                                                                         \
5295    uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(TS2)));             \
5296    uint32_t vm = vext_vm(desc);                                          \
5297    uint32_t vl = env->vl;                                                \
5298    uint32_t esz = sizeof(TS2);                                           \
5299    uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5300    uint32_t vta = vext_vta(desc);                                        \
5301    uint32_t vma = vext_vma(desc);                                        \
5302    uint64_t index;                                                       \
5303    uint32_t i;                                                           \
5304                                                                          \
5305    VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5306                                                                          \
5307    for (i = env->vstart; i < vl; i++) {                                  \
5308        if (!vm && !vext_elem_mask(v0, i)) {                              \
5309            /* set masked-off elements to 1s */                           \
5310            vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5311            continue;                                                     \
5312        }                                                                 \
5313        index = *((TS1 *)vs1 + HS1(i));                                   \
5314        if (index >= vlmax) {                                             \
5315            *((TS2 *)vd + HS2(i)) = 0;                                    \
5316        } else {                                                          \
5317            *((TS2 *)vd + HS2(i)) = *((TS2 *)vs2 + HS2(index));           \
5318        }                                                                 \
5319    }                                                                     \
5320    env->vstart = 0;                                                      \
5321    /* set tail elements to 1s */                                         \
5322    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5323}
5324
5325/* vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]]; */
5326GEN_VEXT_VRGATHER_VV(vrgather_vv_b, uint8_t,  uint8_t,  H1, H1)
5327GEN_VEXT_VRGATHER_VV(vrgather_vv_h, uint16_t, uint16_t, H2, H2)
5328GEN_VEXT_VRGATHER_VV(vrgather_vv_w, uint32_t, uint32_t, H4, H4)
5329GEN_VEXT_VRGATHER_VV(vrgather_vv_d, uint64_t, uint64_t, H8, H8)
5330
5331GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_b, uint16_t, uint8_t,  H2, H1)
5332GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_h, uint16_t, uint16_t, H2, H2)
5333GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_w, uint16_t, uint32_t, H2, H4)
5334GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_d, uint16_t, uint64_t, H2, H8)
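
/*
 * For vrgatherei16.vv the index elements (TS1/HS1) are always 16 bits
 * wide while the data elements (TS2/HS2) follow SEW, which is why the two
 * type/index-macro pairs can differ.  Illustrative example, assuming
 * vl = 4, vlmax = 8 and no masking:
 *     vs1 (indices) = {3, 0, 9, 1}
 *     vs2 (data)    = {a, b, c, d, e, f, g, h}
 *     vd            = {d, a, 0, b}    -- index 9 >= vlmax reads as 0
 */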
5335
5336#define GEN_VEXT_VRGATHER_VX(NAME, ETYPE, H)                              \
5337void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5338                  CPURISCVState *env, uint32_t desc)                      \
5339{                                                                         \
5340    uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
5341    uint32_t vm = vext_vm(desc);                                          \
5342    uint32_t vl = env->vl;                                                \
5343    uint32_t esz = sizeof(ETYPE);                                         \
5344    uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5345    uint32_t vta = vext_vta(desc);                                        \
5346    uint32_t vma = vext_vma(desc);                                        \
5347    uint64_t index = s1;                                                  \
5348    uint32_t i;                                                           \
5349                                                                          \
5350    VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5351                                                                          \
5352    for (i = env->vstart; i < vl; i++) {                                  \
5353        if (!vm && !vext_elem_mask(v0, i)) {                              \
5354            /* set masked-off elements to 1s */                           \
5355            vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5356            continue;                                                     \
5357        }                                                                 \
5358        if (index >= vlmax) {                                             \
5359            *((ETYPE *)vd + H(i)) = 0;                                    \
5360        } else {                                                          \
5361            *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(index));           \
5362        }                                                                 \
5363    }                                                                     \
5364    env->vstart = 0;                                                      \
5365    /* set tail elements to 1s */                                         \
5366    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5367}
5368
5369/* vd[i] = (x[rs1] >= VLMAX) ? 0 : vs2[x[rs1]] */
5370GEN_VEXT_VRGATHER_VX(vrgather_vx_b, uint8_t,  H1)
5371GEN_VEXT_VRGATHER_VX(vrgather_vx_h, uint16_t, H2)
5372GEN_VEXT_VRGATHER_VX(vrgather_vx_w, uint32_t, H4)
5373GEN_VEXT_VRGATHER_VX(vrgather_vx_d, uint64_t, H8)
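
/*
 * vrgather.vx is a broadcast of a single gathered element: every active
 * vd[i] receives vs2[x[rs1]], or 0 when x[rs1] >= vlmax, so the index is
 * computed once outside the loop.
 */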
5374
5375/* Vector Compress Instruction */
5376#define GEN_VEXT_VCOMPRESS_VM(NAME, ETYPE, H)                             \
5377void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5378                  CPURISCVState *env, uint32_t desc)                      \
5379{                                                                         \
5380    uint32_t vl = env->vl;                                                \
5381    uint32_t esz = sizeof(ETYPE);                                         \
5382    uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5383    uint32_t vta = vext_vta(desc);                                        \
5384    uint32_t num = 0, i;                                                  \
5385                                                                          \
5386    VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5387                                                                          \
5388    for (i = env->vstart; i < vl; i++) {                                  \
5389        if (!vext_elem_mask(vs1, i)) {                                    \
5390            continue;                                                     \
5391        }                                                                 \
5392        *((ETYPE *)vd + H(num)) = *((ETYPE *)vs2 + H(i));                 \
5393        num++;                                                            \
5394    }                                                                     \
5395    env->vstart = 0;                                                      \
5396    /* set tail elements to 1s */                                         \
5397    vext_set_elems_1s(vd, vta, num * esz, total_elems * esz);             \
5398}
5399
5400/* Compress into vd the elements of vs2 whose mask bit in vs1 is set */
5401GEN_VEXT_VCOMPRESS_VM(vcompress_vm_b, uint8_t,  H1)
5402GEN_VEXT_VCOMPRESS_VM(vcompress_vm_h, uint16_t, H2)
5403GEN_VEXT_VCOMPRESS_VM(vcompress_vm_w, uint32_t, H4)
5404GEN_VEXT_VCOMPRESS_VM(vcompress_vm_d, uint64_t, H8)
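
/*
 * Illustrative vcompress.vm example, assuming vl = 6:
 *     vs1 (mask, bit i for element i) = 0b101001
 *     vs2                             = {a, b, c, d, e, f}
 *     vd                              = {a, d, f, ...}
 * Note that v0 is unused here: the selection mask is vs1, and everything
 * from element `num` upward is treated as tail and set to 1s under vta.
 */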
5405
5406/* Vector Whole Register Move */
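/*
 * Handles vmv1r.v/vmv2r.v/vmv4r.v/vmv8r.v.  maxsz is assumed to be set up
 * by the translator as NF * VLENB, i.e. the total byte size of the
 * register group, while vstart is counted in elements of the current SEW.
 */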
5407void HELPER(vmvr_v)(void *vd, void *vs2, CPURISCVState *env, uint32_t desc)
5408{
5409    /* EEW = SEW */
5410    uint32_t maxsz = simd_maxsz(desc);
5411    uint32_t sewb = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
5412    uint32_t startb = env->vstart * sewb;
5413    uint32_t i = startb;
5414
5415    if (startb >= maxsz) {
5416        env->vstart = 0;
5417        return;
5418    }
5419
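    /*
     * Vector register bytes are stored in host-endian 64-bit chunks, so on
     * a big-endian host an unaligned start byte cannot be indexed linearly.
     * Copy the remainder of the partially covered 8-byte group first (its
     * bytes are contiguous in host memory, starting at H1(j - 1)), then
     * fall through to a plain byte copy of the remaining, 8-byte aligned
     * region.
     */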
5420    if (HOST_BIG_ENDIAN && i % 8 != 0) {
5421        uint32_t j = ROUND_UP(i, 8);
5422        memcpy((uint8_t *)vd + H1(j - 1),
5423               (uint8_t *)vs2 + H1(j - 1),
5424               j - i);
5425        i = j;
5426    }
5427
5428    memcpy((uint8_t *)vd + i,
5429           (uint8_t *)vs2 + i,
5430           maxsz - i);
5431
5432    env->vstart = 0;
5433}
5434
5435/* Vector Integer Extension */
5436#define GEN_VEXT_INT_EXT(NAME, ETYPE, DTYPE, HD, HS1)            \
5437void HELPER(NAME)(void *vd, void *v0, void *vs2,                 \
5438                  CPURISCVState *env, uint32_t desc)             \
5439{                                                                \
5440    uint32_t vl = env->vl;                                       \
5441    uint32_t vm = vext_vm(desc);                                 \
5442    uint32_t esz = sizeof(ETYPE);                                \
5443    uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
5444    uint32_t vta = vext_vta(desc);                               \
5445    uint32_t vma = vext_vma(desc);                               \
5446    uint32_t i;                                                  \
5447                                                                 \
5448    VSTART_CHECK_EARLY_EXIT(env, vl);                            \
5449                                                                 \
5450    for (i = env->vstart; i < vl; i++) {                         \
5451        if (!vm && !vext_elem_mask(v0, i)) {                     \
5452            /* set masked-off elements to 1s */                  \
5453            vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);  \
5454            continue;                                            \
5455        }                                                        \
5456        *((ETYPE *)vd + HD(i)) = *((DTYPE *)vs2 + HS1(i));       \
5457    }                                                            \
5458    env->vstart = 0;                                             \
5459    /* set tail elements to 1s */                                \
5460    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);     \
5461}
5462
5463GEN_VEXT_INT_EXT(vzext_vf2_h, uint16_t, uint8_t,  H2, H1)
5464GEN_VEXT_INT_EXT(vzext_vf2_w, uint32_t, uint16_t, H4, H2)
5465GEN_VEXT_INT_EXT(vzext_vf2_d, uint64_t, uint32_t, H8, H4)
5466GEN_VEXT_INT_EXT(vzext_vf4_w, uint32_t, uint8_t,  H4, H1)
5467GEN_VEXT_INT_EXT(vzext_vf4_d, uint64_t, uint16_t, H8, H2)
5468GEN_VEXT_INT_EXT(vzext_vf8_d, uint64_t, uint8_t,  H8, H1)
5469
5470GEN_VEXT_INT_EXT(vsext_vf2_h, int16_t, int8_t,  H2, H1)
5471GEN_VEXT_INT_EXT(vsext_vf2_w, int32_t, int16_t, H4, H2)
5472GEN_VEXT_INT_EXT(vsext_vf2_d, int64_t, int32_t, H8, H4)
5473GEN_VEXT_INT_EXT(vsext_vf4_w, int32_t, int8_t,  H4, H1)
5474GEN_VEXT_INT_EXT(vsext_vf4_d, int64_t, int16_t, H8, H2)
5475GEN_VEXT_INT_EXT(vsext_vf8_d, int64_t, int8_t,  H8, H1)
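
/*
 * Illustrative example for vsext.vf2 with SEW = 16, assuming vl = 4 and no
 * masking:
 *     vs2 (int8_t)  = {-1, 2, -128, 127}
 *     vd  (int16_t) = {-1, 2, -128, 127}
 * Each destination element is read from the source at 1/2 (vf2), 1/4 (vf4)
 * or 1/8 (vf8) of the destination EEW and sign- or zero-extended according
 * to the DTYPE/ETYPE pair above.
 */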
5476