LXR qemu/target/arm/tcg/sme

   1/*
   2 * ARM SME Operations
   3 *
   4 * Copyright (c) 2022 Linaro, Ltd.
   5 *
   6 * This library is free software; you can redistribute it and/or
   7 * modify it under the terms of the GNU Lesser General Public
   8 * License as published by the Free Software Foundation; either
   9 * version 2.1 of the License, or (at your option) any later version.
  10 *
  11 * This library is distributed in the hope that it will be useful,
  12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14 * Lesser General Public License for more details.
  15 *
  16 * You should have received a copy of the GNU Lesser General Public
  17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
  18 */
  19
  20#include "qemu/osdep.h"
  21#include "cpu.h"
  22#include "internals.h"
  23#include "tcg/tcg-gvec-desc.h"
  24#include "exec/helper-proto.h"
  25#include "exec/cpu_ldst.h"
  26#include "exec/exec-all.h"
  27#include "qemu/int128.h"
  28#include "fpu/softfloat.h"
  29#include "vec_internal.h"
  30#include "sve_ldst_internal.h"
  31
/*
 * TCG helper entry point for updating SVCR: forwards env, val and mask
 * unchanged to aarch64_set_svcr().
 */
void helper_set_svcr(CPUARMState *env, uint32_t val, uint32_t mask)
{
    aarch64_set_svcr(env, val, mask);
}
  36
  37void helper_sme_zero(CPUARMState *env, uint32_t imm, uint32_t svl)
  38{
  39    uint32_t i;
  40
  41    /*
  42     * Special case clearing the entire ZA space.
  43     * This falls into the CONSTRAINED UNPREDICTABLE zeroing of any
  44     * parts of the ZA storage outside of SVL.
  45     */
  46    if (imm == 0xff) {
  47        memset(env->zarray, 0, sizeof(env->zarray));
  48        return;
  49    }
  50
  51    /*
  52     * Recall that ZAnH.D[m] is spread across ZA[n+8*m],
  53     * so each row is discontiguous within ZA[].
  54     */
  55    for (i = 0; i < svl; i++) {
  56        if (imm & (1 << (i % 8))) {
  57            memset(&env->zarray[i], 0, svl);
  58        }
  59    }
  60}
  61
  62
/*
 * When considering the ZA storage as an array of elements of
 * type T, the index within that array of the Nth element of
 * a vertical slice of a tile can be calculated like this,
 * regardless of the size of type T. This is because the tiles
 * are interleaved, so if type T is size N bytes then row 1 of
 * the tile is N rows away from row 0. The division by N to
 * convert a byte offset into an array index and the multiplication
 * by N to convert from vslice-index-within-the-tile to
 * the index within the ZA storage cancel out.
 *
 * i: element number within the vertical slice.
 * The result is an array index (in units of T), not a byte offset.
 */
#define tile_vslice_index(i) ((i) * sizeof(ARMVectorReg))
  75
/*
 * When doing byte arithmetic on the ZA storage, the element
 * byteoff bytes away in a tile vertical slice is always this
 * many bytes away in the ZA storage, regardless of the
 * size of the tile element, assuming that byteoff is a multiple
 * of the element size. Again this is because of the interleaving
 * of the tiles. For instance if we have 1 byte per element then
 * each row of the ZA storage has one byte of the vslice data,
 * and (counting from 0) byte 8 goes in row 8 of the storage
 * at offset (8 * row-size-in-bytes).
 * If we have 8 bytes per element then each row of the ZA storage
 * has 8 bytes of the data, but there are 8 interleaved tiles and
 * so byte 8 of the data goes into row 1 of the tile,
 * which is again row 8 of the storage, so the offset is still
 * (8 * row-size-in-bytes). Similarly for other element sizes.
 *
 * byteoff: byte offset within the vertical slice.
 * The result is a byte offset into the ZA storage.
 */
#define tile_vslice_offset(byteoff) ((byteoff) * sizeof(ARMVectorReg))
  93
  94
/*
 * Move Zreg vector to ZArray column.
 *
 * DO_MOVA_C(NAME, TYPE, H) defines HELPER(NAME), which copies the
 * active elements of vector vn into the vertical tile slice at za.
 *   TYPE - element type; sizeof(TYPE) is the step through the vector.
 *   H    - index macro applied to the byte offset within vn.
 * The predicate vg holds one bit per vector byte and is read 16 bits
 * at a time, i.e. once per 16 bytes of vector.  Only the lowest bit of
 * each element's group of predicate bits is tested.  Elements whose
 * bit is clear leave the destination unmodified.
 */
#define DO_MOVA_C(NAME, TYPE, H)                                        \
void HELPER(NAME)(void *za, void *vn, void *vg, uint32_t desc)          \
{                                                                       \
    int i, oprsz = simd_oprsz(desc);                                    \
    for (i = 0; i < oprsz; ) {                                          \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
        do {                                                            \
            if (pg & 1) {                                               \
                *(TYPE *)(za + tile_vslice_offset(i)) = *(TYPE *)(vn + H(i)); \
            }                                                           \
            i += sizeof(TYPE);                                          \
            pg >>= sizeof(TYPE);                                        \
        } while (i & 15);                                               \
    }                                                                   \
}

/* 8-, 16- and 32-bit element variants. */
DO_MOVA_C(sme_mova_cz_b, uint8_t, H1)
DO_MOVA_C(sme_mova_cz_h, uint16_t, H1_2)
DO_MOVA_C(sme_mova_cz_s, uint32_t, H1_4)
 117
 118void HELPER(sme_mova_cz_d)(void *za, void *vn, void *vg, uint32_t desc)
 119{
 120    int i, oprsz = simd_oprsz(desc) / 8;
 121    uint8_t *pg = vg;
 122    uint64_t *n = vn;
 123    uint64_t *a = za;
 124
 125    for (i = 0; i < oprsz; i++) {
 126        if (pg[H1(i)] & 1) {
 127            a[tile_vslice_index(i)] = n[i];
 128        }
 129    }
 130}
 131
 132void HELPER(sme_mova_cz_q)(void *za, void *vn, void *vg, uint32_t desc)
 133{
 134    int i, oprsz = simd_oprsz(desc) / 16;
 135    uint16_t *pg = vg;
 136    Int128 *n = vn;
 137    Int128 *a = za;
 138
 139    /*
 140     * Int128 is used here simply to copy 16 bytes, and to simplify
 141     * the address arithmetic.
 142     */
 143    for (i = 0; i < oprsz; i++) {
 144        if (pg[H2(i)] & 1) {
 145            a[tile_vslice_index(i)] = n[i];
 146        }
 147    }
 148}
 149
 150#undef DO_MOVA_C
 151
/*
 * Move ZArray column to Zreg vector.
 *
 * DO_MOVA_Z(NAME, TYPE, H) defines HELPER(NAME), which copies the
 * active elements of the vertical tile slice at za into vector vd.
 *   TYPE - element type; sizeof(TYPE) is the step through the vector.
 *   H    - index macro applied to the byte offset within vd.
 * The predicate vg holds one bit per vector byte and is read 16 bits
 * at a time, i.e. once per 16 bytes of vector.  Only the lowest bit of
 * each element's group of predicate bits is tested.  Elements whose
 * bit is clear leave the destination unmodified.
 */
#define DO_MOVA_Z(NAME, TYPE, H)                                        \
void HELPER(NAME)(void *vd, void *za, void *vg, uint32_t desc)          \
{                                                                       \
    int i, oprsz = simd_oprsz(desc);                                    \
    for (i = 0; i < oprsz; ) {                                          \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
        do {                                                            \
            if (pg & 1) {                                               \
                *(TYPE *)(vd + H(i)) = *(TYPE *)(za + tile_vslice_offset(i)); \
            }                                                           \
            i += sizeof(TYPE);                                          \
            pg >>= sizeof(TYPE);                                        \
        } while (i & 15);                                               \
    }                                                                   \
}

/* 8-, 16- and 32-bit element variants. */
DO_MOVA_Z(sme_mova_zc_b, uint8_t, H1)
DO_MOVA_Z(sme_mova_zc_h, uint16_t, H1_2)
DO_MOVA_Z(sme_mova_zc_s, uint32_t, H1_4)
 174
 175void HELPER(sme_mova_zc_d)(void *vd, void *za, void *vg, uint32_t desc)
 176{
 177    int i, oprsz = simd_oprsz(desc) / 8;
 178    uint8_t *pg = vg;
 179    uint64_t *d = vd;
 180    uint64_t *a = za;
 181
 182    for (i = 0; i < oprsz; i++) {
 183        if (pg[H1(i)] & 1) {
 184            d[i] = a[tile_vslice_index(i)];
 185        }
 186    }
 187}
 188
 189void HELPER(sme_mova_zc_q)(void *vd, void *za, void *vg, uint32_t desc)
 190{
 191    int i, oprsz = simd_oprsz(desc) / 16;
 192    uint16_t *pg = vg;
 193    Int128 *d = vd;
 194    Int128 *a = za;
 195
 196    /*
 197     * Int128 is used here simply to copy 16 bytes, and to simplify
 198     * the address arithmetic.
 199     */
 200    for (i = 0; i < oprsz; i++, za += sizeof(ARMVectorReg)) {
 201        if (pg[H2(i)] & 1) {
 202            d[i] = a[tile_vslice_index(i)];
 203        }
 204    }
 205}
 206
 207#undef DO_MOVA_Z
 208
/*
 * Clear elements in a tile slice comprising len bytes.
 *
 * ptr: base of the tile slice.
 * off: byte offset within the slice at which clearing starts.
 * len: number of slice bytes to clear.
 * The horizontal variant treats the slice as contiguous memory; the
 * vertical variants map each slice byte offset through
 * tile_vslice_offset().
 */

typedef void ClearFn(void *ptr, size_t off, size_t len);
 214
 215static void clear_horizontal(void *ptr, size_t off, size_t len)
 216{
 217    memset(ptr + off, 0, len);
 218}
 219
 220static void clear_vertical_b(void *vptr, size_t off, size_t len)
 221{
 222    for (size_t i = 0; i < len; ++i) {
 223        *(uint8_t *)(vptr + tile_vslice_offset(i + off)) = 0;
 224    }
 225}
 226
 227static void clear_vertical_h(void *vptr, size_t off, size_t len)
 228{
 229    for (size_t i = 0; i < len; i += 2) {
 230        *(uint16_t *)(vptr + tile_vslice_offset(i + off)) = 0;
 231    }
 232}
 233
 234static void clear_vertical_s(void *vptr, size_t off, size_t len)
 235{
 236    for (size_t i = 0; i < len; i += 4) {
 237        *(uint32_t *)(vptr + tile_vslice_offset(i + off)) = 0;
 238    }
 239}
 240
 241static void clear_vertical_d(void *vptr, size_t off, size_t len)
 242{
 243    for (size_t i = 0; i < len; i += 8) {
 244        *(uint64_t *)(vptr + tile_vslice_offset(i + off)) = 0;
 245    }
 246}
 247
 248static void clear_vertical_q(void *vptr, size_t off, size_t len)
 249{
 250    for (size_t i = 0; i < len; i += 16) {
 251        memset(vptr + tile_vslice_offset(i + off), 0, 16);
 252    }
 253}
 254
/*
 * Copy elements from an array into a tile slice comprising len bytes.
 *
 * dst: base of the destination tile slice.
 * src: contiguous source array.
 * len: number of bytes to copy.
 * The horizontal variant is a plain contiguous copy; the vertical
 * variants scatter successive source elements through
 * tile_vslice_index() / tile_vslice_offset().
 */

typedef void CopyFn(void *dst, const void *src, size_t len);
 260
/* Copy len contiguous bytes from src into the row at dst. */
static void copy_horizontal(void *dst, const void *src, size_t len)
{
    memcpy(dst, src, len);
}
 265
 266static void copy_vertical_b(void *vdst, const void *vsrc, size_t len)
 267{
 268    const uint8_t *src = vsrc;
 269    uint8_t *dst = vdst;
 270    size_t i;
 271
 272    for (i = 0; i < len; ++i) {
 273        dst[tile_vslice_index(i)] = src[i];
 274    }
 275}
 276
 277static void copy_vertical_h(void *vdst, const void *vsrc, size_t len)
 278{
 279    const uint16_t *src = vsrc;
 280    uint16_t *dst = vdst;
 281    size_t i;
 282
 283    for (i = 0; i < len / 2; ++i) {
 284        dst[tile_vslice_index(i)] = src[i];
 285    }
 286}
 287
 288static void copy_vertical_s(void *vdst, const void *vsrc, size_t len)
 289{
 290    const uint32_t *src = vsrc;
 291    uint32_t *dst = vdst;
 292    size_t i;
 293
 294    for (i = 0; i < len / 4; ++i) {
 295        dst[tile_vslice_index(i)] = src[i];
 296    }
 297}
 298
 299static void copy_vertical_d(void *vdst, const void *vsrc, size_t len)
 300{
 301    const uint64_t *src = vsrc;
 302    uint64_t *dst = vdst;
 303    size_t i;
 304
 305    for (i = 0; i < len / 8; ++i) {
 306        dst[tile_vslice_index(i)] = src[i];
 307    }
 308}
 309
 310static void copy_vertical_q(void *vdst, const void *vsrc, size_t len)
 311{
 312    for (size_t i = 0; i < len; i += 16) {
 313        memcpy(vdst + tile_vslice_offset(i), vsrc + i, 16);
 314    }
 315}
 316
/*
 * Host and TLB primitives for vertical tile slice addressing.
 *
 * DO_LD(NAME, TYPE, HOST, TLB) defines two single-element loaders:
 *   sme_<NAME>_v_host - reads a TYPE via HOST(host)
 *   sme_<NAME>_v_tlb  - reads a TYPE via TLB(env, addr, ra)
 * Both store the value into the vertical slice at
 * za + tile_vslice_offset(off), where off is the byte offset of the
 * element within the slice.
 */

#define DO_LD(NAME, TYPE, HOST, TLB)                                        \
static inline void sme_##NAME##_v_host(void *za, intptr_t off, void *host)  \
{                                                                           \
    TYPE val = HOST(host);                                                  \
    *(TYPE *)(za + tile_vslice_offset(off)) = val;                          \
}                                                                           \
static inline void sme_##NAME##_v_tlb(CPUARMState *env, void *za,           \
                        intptr_t off, target_ulong addr, uintptr_t ra)      \
{                                                                           \
    TYPE val = TLB(env, useronly_clean_ptr(addr), ra);                      \
    *(TYPE *)(za + tile_vslice_offset(off)) = val;                          \
}
 333
/*
 * DO_ST(NAME, TYPE, HOST, TLB) defines two single-element storers:
 *   sme_<NAME>_v_host - writes a TYPE via HOST(host, val)
 *   sme_<NAME>_v_tlb  - writes a TYPE via TLB(env, addr, val, ra)
 * Both fetch the value from the vertical slice at
 * za + tile_vslice_offset(off), where off is the byte offset of the
 * element within the slice.
 */
#define DO_ST(NAME, TYPE, HOST, TLB)                                        \
static inline void sme_##NAME##_v_host(void *za, intptr_t off, void *host)  \
{                                                                           \
    TYPE val = *(TYPE *)(za + tile_vslice_offset(off));                     \
    HOST(host, val);                                                        \
}                                                                           \
static inline void sme_##NAME##_v_tlb(CPUARMState *env, void *za,           \
                        intptr_t off, target_ulong addr, uintptr_t ra)      \
{                                                                           \
    TYPE val = *(TYPE *)(za + tile_vslice_offset(off));                     \
    TLB(env, useronly_clean_ptr(addr), val, ra);                            \
}
 346
/*
 * The ARMVectorReg elements are stored in host-endian 64-bit units.
 * For 128-bit quantities, the sequence defined by the Elem[] pseudocode
 * corresponds to storing the two 64-bit pieces in little-endian order.
 *
 * DO_LDQ(HNAME, VNAME, BE, HOST, TLB) defines four 128-bit loaders:
 *   HNAME_host / HNAME_tlb     - write the element at za + off
 *   VNAME_v_host / VNAME_v_tlb - same, with off mapped through
 *                                tile_vslice_offset() for vertical slices
 * Each reads two 64-bit halves from memory, at offsets 0 and 8.  When BE
 * is nonzero the halves are swapped before being written to ptr[0] and
 * ptr[1].
 */
#define DO_LDQ(HNAME, VNAME, BE, HOST, TLB)                                 \
static inline void HNAME##_host(void *za, intptr_t off, void *host)         \
{                                                                           \
    uint64_t val0 = HOST(host), val1 = HOST(host + 8);                      \
    uint64_t *ptr = za + off;                                               \
    ptr[0] = BE ? val1 : val0, ptr[1] = BE ? val0 : val1;                   \
}                                                                           \
static inline void VNAME##_v_host(void *za, intptr_t off, void *host)       \
{                                                                           \
    HNAME##_host(za, tile_vslice_offset(off), host);                        \
}                                                                           \
static inline void HNAME##_tlb(CPUARMState *env, void *za, intptr_t off,    \
                               target_ulong addr, uintptr_t ra)             \
{                                                                           \
    uint64_t val0 = TLB(env, useronly_clean_ptr(addr), ra);                 \
    uint64_t val1 = TLB(env, useronly_clean_ptr(addr + 8), ra);             \
    uint64_t *ptr = za + off;                                               \
    ptr[0] = BE ? val1 : val0, ptr[1] = BE ? val0 : val1;                   \
}                                                                           \
static inline void VNAME##_v_tlb(CPUARMState *env, void *za, intptr_t off,  \
                               target_ulong addr, uintptr_t ra)             \
{                                                                           \
    HNAME##_tlb(env, za, tile_vslice_offset(off), addr, ra);                \
}
 376
 377#define DO_STQ(HNAME, VNAME, BE, HOST, TLB)                                 \
 378static inline void HNAME##_host(void *za, intptr_t off, void *host)         \
 379{                                                                           \
 380    uint64_t *ptr = za + off;                                               \
 381    HOST(host, ptr[BE]);                                                    \
 382    HOST(host + 1, ptr[!BE]);                                               \
 383}                                                                           \
 384static inline void VNAME##_v_host(void *za, intptr_t off, void *host)       \
 385{                                                                           \
 386    HNAME##_host(za, tile_vslice_offset(off), host);                        \
 387}                                                                           \
 388static inline void HNAME##_tlb(CPUARMState *env, void *za, intptr_t off,    \
 389                               target_ulong addr, uintptr_t ra)             \
 390{                                                                           \
 391    uint64_t *ptr = za + off;                                               \
 392    TLB(env, useronly_clean_ptr(addr), ptr[BE], ra);                        \
 393    TLB(env, useronly_clean_ptr(addr + 8), ptr[!BE], ra);                   \
 394}                                                                           \
 395static inline void VNAME##_v_tlb(CPUARMState *env, void *za, intptr_t off,  \
 396                               target_ulong addr, uintptr_t ra)             \
 397{                                                                           \
 398    HNAME##_tlb(env, za, tile_vslice_offset(off), addr, ra);                \
 399}
 400
/* Vertical-slice load primitives for 8-, 16-, 32- and 64-bit elements. */
DO_LD(ld1b, uint8_t, ldub_p, cpu_ldub_data_ra)
DO_LD(ld1h_be, uint16_t, lduw_be_p, cpu_lduw_be_data_ra)
DO_LD(ld1h_le, uint16_t, lduw_le_p, cpu_lduw_le_data_ra)
DO_LD(ld1s_be, uint32_t, ldl_be_p, cpu_ldl_be_data_ra)
DO_LD(ld1s_le, uint32_t, ldl_le_p, cpu_ldl_le_data_ra)
DO_LD(ld1d_be, uint64_t, ldq_be_p, cpu_ldq_be_data_ra)
DO_LD(ld1d_le, uint64_t, ldq_le_p, cpu_ldq_le_data_ra)

/* 128-bit load primitives: sve_ld1qq_* (horizontal) and sme_ld1q_*_v. */
DO_LDQ(sve_ld1qq_be, sme_ld1q_be, 1, ldq_be_p, cpu_ldq_be_data_ra)
DO_LDQ(sve_ld1qq_le, sme_ld1q_le, 0, ldq_le_p, cpu_ldq_le_data_ra)

/* Vertical-slice store primitives for 8-, 16-, 32- and 64-bit elements. */
DO_ST(st1b, uint8_t, stb_p, cpu_stb_data_ra)
DO_ST(st1h_be, uint16_t, stw_be_p, cpu_stw_be_data_ra)
DO_ST(st1h_le, uint16_t, stw_le_p, cpu_stw_le_data_ra)
DO_ST(st1s_be, uint32_t, stl_be_p, cpu_stl_be_data_ra)
DO_ST(st1s_le, uint32_t, stl_le_p, cpu_stl_le_data_ra)
DO_ST(st1d_be, uint64_t, stq_be_p, cpu_stq_be_data_ra)
DO_ST(st1d_le, uint64_t, stq_le_p, cpu_stq_le_data_ra)

/* 128-bit store primitives: sve_st1qq_* (horizontal) and sme_st1q_*_v. */
DO_STQ(sve_st1qq_be, sme_st1q_be, 1, stq_be_p, cpu_stq_be_data_ra)
DO_STQ(sve_st1qq_le, sme_st1q_le, 0, stq_le_p, cpu_stq_le_data_ra)

/* The generator macros are not needed past this point. */
#undef DO_LD
#undef DO_ST
#undef DO_LDQ
#undef DO_STQ
 427
/*
 * Common helper for all contiguous predicated loads.
 *
 * env:      CPU state.
 * za:       base of the destination tile slice.
 * vg:       governing predicate, one bit per byte of the slice, read in
 *           64-bit words.
 * addr:     guest base address; element at slice byte offset reg_off is
 *           loaded from addr + reg_off.
 * desc:     simd descriptor; simd_oprsz(desc) is the slice size in bytes.
 * ra:       host return address passed to the memory access functions.
 * esz:      log2 of the element size in bytes.
 * mtedesc:  MTE descriptor; zero skips the MTE checks.
 * vertical: true for a vertical slice, false for a horizontal one.
 * host_fn:  loads one element from a host pointer into the slice.
 * tlb_fn:   loads one element from a guest address into the slice.
 * clr_fn:   zeroes a byte range of the slice (inactive elements).
 * cpy_fn:   copies a contiguous buffer into the slice (MMIO path only).
 *
 * Inactive elements of the destination are zeroed.
 */

static inline QEMU_ALWAYS_INLINE
void sme_ld1(CPUARMState *env, void *za, uint64_t *vg,
             const target_ulong addr, uint32_t desc, const uintptr_t ra,
             const int esz, uint32_t mtedesc, bool vertical,
             sve_ldst1_host_fn *host_fn,
             sve_ldst1_tlb_fn *tlb_fn,
             ClearFn *clr_fn,
             CopyFn *cpy_fn)
{
    const intptr_t reg_max = simd_oprsz(desc);
    const intptr_t esize = 1 << esz;
    intptr_t reg_off, reg_last;
    SVEContLdSt info;
    void *host;
    int flags;

    /* Find the active elements.  */
    if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, esize)) {
        /* The entire predicate was false; no load occurs.  */
        clr_fn(za, 0, reg_max);
        return;
    }

    /* Probe the page(s).  Exit with exception for any invalid page. */
    sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, ra);

    /* Handle watchpoints for all active elements. */
    sve_cont_ldst_watchpoints(&info, env, vg, addr, esize, esize,
                              BP_MEM_READ, ra);

    /*
     * Handle mte checks for all active elements.
     * Since TBI must be set for MTE, !mtedesc => !mte_active.
     */
    if (mtedesc) {
        sve_cont_ldst_mte_check(&info, env, vg, addr, esize, esize,
                                mtedesc, ra);
    }

    flags = info.page[0].flags | info.page[1].flags;
    if (unlikely(flags != 0)) {
#ifdef CONFIG_USER_ONLY
        g_assert_not_reached();
#else
        /*
         * At least one page includes MMIO.
         * Any bus operation can fail with cpu_transaction_failed,
         * which for ARM will raise SyncExternal.  Perform the load
         * into scratch memory to preserve register state until the end.
         *
         * NOTE(review): scratch is a single ARMVectorReg, but the
         * vertical tlb_fn variants defined in this file store at
         * tile_vslice_offset(reg_off) from the base they are given,
         * while cpy_fn reads scratch contiguously.  Confirm that the
         * vertical case cannot write outside scratch here.
         */
        ARMVectorReg scratch = { };

        /* The last active element is on page 1, the split, or page 0. */
        reg_off = info.reg_off_first[0];
        reg_last = info.reg_off_last[1];
        if (reg_last < 0) {
            reg_last = info.reg_off_split;
            if (reg_last < 0) {
                reg_last = info.reg_off_last[0];
            }
        }

        do {
            /* One 64-bit predicate word covers 64 bytes of the slice. */
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    tlb_fn(env, &scratch, reg_off, addr + reg_off, ra);
                }
                reg_off += esize;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);

        /* Inactive elements stay zero from the scratch initializer. */
        cpy_fn(za, &scratch, reg_max);
        return;
#endif
    }

    /* The entire operation is in RAM, on valid pages. */

    reg_off = info.reg_off_first[0];
    reg_last = info.reg_off_last[0];
    host = info.page[0].host;

    /*
     * Horizontal: zero the whole row up front, then overwrite the active
     * elements.  Vertical: zero only the leading inactive part here; the
     * loops below clear inactive elements one at a time.
     */
    if (!vertical) {
        memset(za, 0, reg_max);
    } else if (reg_off) {
        clr_fn(za, 0, reg_off);
    }

    while (reg_off <= reg_last) {
        uint64_t pg = vg[reg_off >> 6];
        do {
            if ((pg >> (reg_off & 63)) & 1) {
                host_fn(za, reg_off, host + reg_off);
            } else if (vertical) {
                clr_fn(za, reg_off, esize);
            }
            reg_off += esize;
        } while (reg_off <= reg_last && (reg_off & 63));
    }

    /*
     * Use the slow path to manage the cross-page misalignment.
     * But we know this is RAM and cannot trap.
     */
    reg_off = info.reg_off_split;
    if (unlikely(reg_off >= 0)) {
        tlb_fn(env, za, reg_off, addr + reg_off, ra);
    }

    /* Elements that lie entirely on the second page, if any. */
    reg_off = info.reg_off_first[1];
    if (unlikely(reg_off >= 0)) {
        reg_last = info.reg_off_last[1];
        host = info.page[1].host;

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    host_fn(za, reg_off, host + reg_off);
                } else if (vertical) {
                    clr_fn(za, reg_off, esize);
                }
                reg_off += esize;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);
    }
}
 559
 560static inline QEMU_ALWAYS_INLINE
 561void sme_ld1_mte(CPUARMState *env, void *za, uint64_t *vg,
 562                 target_ulong addr, uint32_t desc, uintptr_t ra,
 563                 const int esz, bool vertical,
 564                 sve_ldst1_host_fn *host_fn,
 565                 sve_ldst1_tlb_fn *tlb_fn,
 566                 ClearFn *clr_fn,
 567                 CopyFn *cpy_fn)
 568{
 569    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
 570    int bit55 = extract64(addr, 55, 1);
 571
 572    /* Remove mtedesc from the normal sve descriptor. */
 573    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
 574
 575    /* Perform gross MTE suppression early. */
 576    if (!tbi_check(desc, bit55) ||
 577        tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
 578        mtedesc = 0;
 579    }
 580
 581    sme_ld1(env, za, vg, addr, desc, ra, esz, mtedesc, vertical,
 582            host_fn, tlb_fn, clr_fn, cpy_fn);
 583}
 584
/*
 * DO_LD(L, END, ESZ) defines the four TCG load helpers for one element
 * size letter L and endianness suffix END:
 *   sme_ld1<L><END>_h      horizontal slice, no MTE (mtedesc passed as 0)
 *   sme_ld1<L><END>_v      vertical slice, no MTE (mtedesc passed as 0)
 *   sme_ld1<L><END>_h_mte  horizontal slice, via sme_ld1_mte()
 *   sme_ld1<L><END>_v_mte  vertical slice, via sme_ld1_mte()
 * Horizontal helpers use the sve_ld1<L><L><END> host/tlb primitives with
 * clear_horizontal/copy_horizontal; vertical helpers use the
 * sme_ld1<L><END>_v primitives with clear_vertical_<L>/copy_vertical_<L>.
 * ESZ is the MO_* element size passed through as esz.
 */
#define DO_LD(L, END, ESZ)                                                 \
void HELPER(sme_ld1##L##END##_h)(CPUARMState *env, void *za, void *vg,     \
                                 target_ulong addr, uint32_t desc)         \
{                                                                          \
    sme_ld1(env, za, vg, addr, desc, GETPC(), ESZ, 0, false,               \
            sve_ld1##L##L##END##_host, sve_ld1##L##L##END##_tlb,           \
            clear_horizontal, copy_horizontal);                            \
}                                                                          \
void HELPER(sme_ld1##L##END##_v)(CPUARMState *env, void *za, void *vg,     \
                                 target_ulong addr, uint32_t desc)         \
{                                                                          \
    sme_ld1(env, za, vg, addr, desc, GETPC(), ESZ, 0, true,                \
            sme_ld1##L##END##_v_host, sme_ld1##L##END##_v_tlb,             \
            clear_vertical_##L, copy_vertical_##L);                        \
}                                                                          \
void HELPER(sme_ld1##L##END##_h_mte)(CPUARMState *env, void *za, void *vg, \
                                     target_ulong addr, uint32_t desc)     \
{                                                                          \
    sme_ld1_mte(env, za, vg, addr, desc, GETPC(), ESZ, false,              \
                sve_ld1##L##L##END##_host, sve_ld1##L##L##END##_tlb,       \
                clear_horizontal, copy_horizontal);                        \
}                                                                          \
void HELPER(sme_ld1##L##END##_v_mte)(CPUARMState *env, void *za, void *vg, \
                                     target_ulong addr, uint32_t desc)     \
{                                                                          \
    sme_ld1_mte(env, za, vg, addr, desc, GETPC(), ESZ, true,               \
                sme_ld1##L##END##_v_host, sme_ld1##L##END##_v_tlb,         \
                clear_vertical_##L, copy_vertical_##L);                    \
}

/* Byte loads have no endianness suffix; wider sizes get _be and _le. */
DO_LD(b, , MO_8)
DO_LD(h, _be, MO_16)
DO_LD(h, _le, MO_16)
DO_LD(s, _be, MO_32)
DO_LD(s, _le, MO_32)
DO_LD(d, _be, MO_64)
DO_LD(d, _le, MO_64)
DO_LD(q, _be, MO_128)
DO_LD(q, _le, MO_128)

#undef DO_LD
 626
/*
 * Common helper for all contiguous predicated stores.
 *
 * env:      CPU state.
 * za:       base of the source tile slice.
 * vg:       governing predicate, one bit per byte of the slice, read in
 *           64-bit words.
 * addr:     guest base address; element at slice byte offset reg_off is
 *           stored to addr + reg_off.
 * desc:     simd descriptor; simd_oprsz(desc) is the slice size in bytes.
 * ra:       host return address passed to the memory access functions.
 * esz:      log2 of the element size in bytes.
 * mtedesc:  MTE descriptor; zero skips the MTE checks.
 * vertical: not referenced in this function; slice orientation is
 *           carried entirely by host_fn and tlb_fn.
 * host_fn:  stores one slice element through a host pointer.
 * tlb_fn:   stores one slice element to a guest address.
 *
 * Inactive elements are not stored.
 */

static inline QEMU_ALWAYS_INLINE
void sme_st1(CPUARMState *env, void *za, uint64_t *vg,
             const target_ulong addr, uint32_t desc, const uintptr_t ra,
             const int esz, uint32_t mtedesc, bool vertical,
             sve_ldst1_host_fn *host_fn,
             sve_ldst1_tlb_fn *tlb_fn)
{
    const intptr_t reg_max = simd_oprsz(desc);
    const intptr_t esize = 1 << esz;
    intptr_t reg_off, reg_last;
    SVEContLdSt info;
    void *host;
    int flags;

    /* Find the active elements.  */
    if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, esize)) {
        /* The entire predicate was false; no store occurs.  */
        return;
    }

    /* Probe the page(s).  Exit with exception for any invalid page. */
    sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, ra);

    /* Handle watchpoints for all active elements. */
    sve_cont_ldst_watchpoints(&info, env, vg, addr, esize, esize,
                              BP_MEM_WRITE, ra);

    /*
     * Handle mte checks for all active elements.
     * Since TBI must be set for MTE, !mtedesc => !mte_active.
     */
    if (mtedesc) {
        sve_cont_ldst_mte_check(&info, env, vg, addr, esize, esize,
                                mtedesc, ra);
    }

    flags = info.page[0].flags | info.page[1].flags;
    if (unlikely(flags != 0)) {
#ifdef CONFIG_USER_ONLY
        g_assert_not_reached();
#else
        /*
         * At least one page includes MMIO.
         * Any bus operation can fail with cpu_transaction_failed,
         * which for ARM will raise SyncExternal.  We cannot avoid
         * this fault and will leave with the store incomplete.
         */
        /* The last active element is on page 1, the split, or page 0. */
        reg_off = info.reg_off_first[0];
        reg_last = info.reg_off_last[1];
        if (reg_last < 0) {
            reg_last = info.reg_off_split;
            if (reg_last < 0) {
                reg_last = info.reg_off_last[0];
            }
        }

        do {
            /* One 64-bit predicate word covers 64 bytes of the slice. */
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    tlb_fn(env, za, reg_off, addr + reg_off, ra);
                }
                reg_off += esize;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);
        return;
#endif
    }

    /* Fast path: store the first page's elements via host pointers. */
    reg_off = info.reg_off_first[0];
    reg_last = info.reg_off_last[0];
    host = info.page[0].host;

    while (reg_off <= reg_last) {
        uint64_t pg = vg[reg_off >> 6];
        do {
            if ((pg >> (reg_off & 63)) & 1) {
                host_fn(za, reg_off, host + reg_off);
            }
            /* 1 << esz is the same value as esize. */
            reg_off += 1 << esz;
        } while (reg_off <= reg_last && (reg_off & 63));
    }

    /*
     * Use the slow path to manage the cross-page misalignment.
     * But we know this is RAM and cannot trap.
     */
    reg_off = info.reg_off_split;
    if (unlikely(reg_off >= 0)) {
        tlb_fn(env, za, reg_off, addr + reg_off, ra);
    }

    /* Elements that lie entirely on the second page, if any. */
    reg_off = info.reg_off_first[1];
    if (unlikely(reg_off >= 0)) {
        reg_last = info.reg_off_last[1];
        host = info.page[1].host;

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    host_fn(za, reg_off, host + reg_off);
                }
                reg_off += 1 << esz;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);
    }
}
 739
 740static inline QEMU_ALWAYS_INLINE
 741void sme_st1_mte(CPUARMState *env, void *za, uint64_t *vg, target_ulong addr,
 742                 uint32_t desc, uintptr_t ra, int esz, bool vertical,
 743                 sve_ldst1_host_fn *host_fn,
 744                 sve_ldst1_tlb_fn *tlb_fn)
 745{
 746    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
 747    int bit55 = extract64(addr, 55, 1);
 748
 749    /* Remove mtedesc from the normal sve descriptor. */
 750    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
 751
 752    /* Perform gross MTE suppression early. */
 753    if (!tbi_check(desc, bit55) ||
 754        tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
 755        mtedesc = 0;
 756    }
 757
 758    sme_st1(env, za, vg, addr, desc, ra, esz, mtedesc,
 759            vertical, host_fn, tlb_fn);
 760}
 761
/*
 * DO_ST(L, END, ESZ) defines the four TCG store helpers for one element
 * size letter L and endianness suffix END:
 *   sme_st1<L><END>_h      horizontal slice, no MTE (mtedesc passed as 0)
 *   sme_st1<L><END>_v      vertical slice, no MTE (mtedesc passed as 0)
 *   sme_st1<L><END>_h_mte  horizontal slice, via sme_st1_mte()
 *   sme_st1<L><END>_v_mte  vertical slice, via sme_st1_mte()
 * Horizontal helpers use the sve_st1<L><L><END> host/tlb primitives;
 * vertical helpers use the sme_st1<L><END>_v primitives.
 * ESZ is the MO_* element size passed through as esz.
 */
#define DO_ST(L, END, ESZ)                                                 \
void HELPER(sme_st1##L##END##_h)(CPUARMState *env, void *za, void *vg,     \
                                 target_ulong addr, uint32_t desc)         \
{                                                                          \
    sme_st1(env, za, vg, addr, desc, GETPC(), ESZ, 0, false,               \
            sve_st1##L##L##END##_host, sve_st1##L##L##END##_tlb);          \
}                                                                          \
void HELPER(sme_st1##L##END##_v)(CPUARMState *env, void *za, void *vg,     \
                                 target_ulong addr, uint32_t desc)         \
{                                                                          \
    sme_st1(env, za, vg, addr, desc, GETPC(), ESZ, 0, true,                \
            sme_st1##L##END##_v_host, sme_st1##L##END##_v_tlb);            \
}                                                                          \
void HELPER(sme_st1##L##END##_h_mte)(CPUARMState *env, void *za, void *vg, \
                                     target_ulong addr, uint32_t desc)     \
{                                                                          \
    sme_st1_mte(env, za, vg, addr, desc, GETPC(), ESZ, false,              \
                sve_st1##L##L##END##_host, sve_st1##L##L##END##_tlb);      \
}                                                                          \
void HELPER(sme_st1##L##END##_v_mte)(CPUARMState *env, void *za, void *vg, \
                                     target_ulong addr, uint32_t desc)     \
{                                                                          \
    sme_st1_mte(env, za, vg, addr, desc, GETPC(), ESZ, true,               \
                sme_st1##L##END##_v_host, sme_st1##L##END##_v_tlb);        \
}

/* Byte stores have no endianness suffix; wider sizes get _be and _le. */
DO_ST(b, , MO_8)
DO_ST(h, _be, MO_16)
DO_ST(h, _le, MO_16)
DO_ST(s, _be, MO_32)
DO_ST(s, _le, MO_32)
DO_ST(d, _be, MO_64)
DO_ST(d, _le, MO_64)
DO_ST(q, _be, MO_128)
DO_ST(q, _le, MO_128)

#undef DO_ST
 799
 800void HELPER(sme_addha_s)(void *vzda, void *vzn, void *vpn,
 801                         void *vpm, uint32_t desc)
 802{
 803    intptr_t row, col, oprsz = simd_oprsz(desc) / 4;
 804    uint64_t *pn = vpn, *pm = vpm;
 805    uint32_t *zda = vzda, *zn = vzn;
 806
 807    for (row = 0; row < oprsz; ) {
 808        uint64_t pa = pn[row >> 4];
 809        do {
 810            if (pa & 1) {
 811                for (col = 0; col < oprsz; ) {
 812                    uint64_t pb = pm[col >> 4];
 813                    do {
 814                        if (pb & 1) {
 815                            zda[tile_vslice_index(row) + H4(col)] += zn[H4(col)];
 816                        }
 817                        pb >>= 4;
 818                    } while (++col & 15);
 819                }
 820            }
 821            pa >>= 4;
 822        } while (++row & 15);
 823    }
 824}
 825
 826void HELPER(sme_addha_d)(void *vzda, void *vzn, void *vpn,
 827                         void *vpm, uint32_t desc)
 828{
 829    intptr_t row, col, oprsz = simd_oprsz(desc) / 8;
 830    uint8_t *pn = vpn, *pm = vpm;
 831    uint64_t *zda = vzda, *zn = vzn;
 832
 833    for (row = 0; row < oprsz; ++row) {
 834        if (pn[H1(row)] & 1) {
 835            for (col = 0; col < oprsz; ++col) {
 836                if (pm[H1(col)] & 1) {
 837                    zda[tile_vslice_index(row) + col] += zn[col];
 838                }
 839            }
 840        }
 841    }
 842}
 843
 844void HELPER(sme_addva_s)(void *vzda, void *vzn, void *vpn,
 845                         void *vpm, uint32_t desc)
 846{
 847    intptr_t row, col, oprsz = simd_oprsz(desc) / 4;
 848    uint64_t *pn = vpn, *pm = vpm;
 849    uint32_t *zda = vzda, *zn = vzn;
 850
 851    for (row = 0; row < oprsz; ) {
 852        uint64_t pa = pn[row >> 4];
 853        do {
 854            if (pa & 1) {
 855                uint32_t zn_row = zn[H4(row)];
 856                for (col = 0; col < oprsz; ) {
 857                    uint64_t pb = pm[col >> 4];
 858                    do {
 859                        if (pb & 1) {
 860                            zda[tile_vslice_index(row) + H4(col)] += zn_row;
 861                        }
 862                        pb >>= 4;
 863                    } while (++col & 15);
 864                }
 865            }
 866            pa >>= 4;
 867        } while (++row & 15);
 868    }
 869}
 870
 871void HELPER(sme_addva_d)(void *vzda, void *vzn, void *vpn,
 872                         void *vpm, uint32_t desc)
 873{
 874    intptr_t row, col, oprsz = simd_oprsz(desc) / 8;
 875    uint8_t *pn = vpn, *pm = vpm;
 876    uint64_t *zda = vzda, *zn = vzn;
 877
 878    for (row = 0; row < oprsz; ++row) {
 879        if (pn[H1(row)] & 1) {
 880            uint64_t zn_row = zn[row];
 881            for (col = 0; col < oprsz; ++col) {
 882                if (pm[H1(col)] & 1) {
 883                    zda[tile_vslice_index(row) + col] += zn_row;
 884                }
 885            }
 886        }
 887    }
 888}
 889
 890void HELPER(sme_fmopa_s)(void *vza, void *vzn, void *vzm, void *vpn,
 891                         void *vpm, void *vst, uint32_t desc)
 892{
 893    intptr_t row, col, oprsz = simd_maxsz(desc);
 894    uint32_t neg = simd_data(desc) << 31;
 895    uint16_t *pn = vpn, *pm = vpm;
 896    float_status fpst;
 897
 898    /*
 899     * Make a copy of float_status because this operation does not
 900     * update the cumulative fp exception status.  It also produces
 901     * default nans.
 902     */
 903    fpst = *(float_status *)vst;
 904    set_default_nan_mode(true, &fpst);
 905
 906    for (row = 0; row < oprsz; ) {
 907        uint16_t pa = pn[H2(row >> 4)];
 908        do {
 909            if (pa & 1) {
 910                void *vza_row = vza + tile_vslice_offset(row);
 911                uint32_t n = *(uint32_t *)(vzn + H1_4(row)) ^ neg;
 912
 913                for (col = 0; col < oprsz; ) {
 914                    uint16_t pb = pm[H2(col >> 4)];
 915                    do {
 916                        if (pb & 1) {
 917                            uint32_t *a = vza_row + H1_4(col);
 918                            uint32_t *m = vzm + H1_4(col);
 919                            *a = float32_muladd(n, *m, *a, 0, vst);
 920                        }
 921                        col += 4;
 922                        pb >>= 4;
 923                    } while (col & 15);
 924                }
 925            }
 926            row += 4;
 927            pa >>= 4;
 928        } while (row & 15);
 929    }
 930}
 931
 932void HELPER(sme_fmopa_d)(void *vza, void *vzn, void *vzm, void *vpn,
 933                         void *vpm, void *vst, uint32_t desc)
 934{
 935    intptr_t row, col, oprsz = simd_oprsz(desc) / 8;
 936    uint64_t neg = (uint64_t)simd_data(desc) << 63;
 937    uint64_t *za = vza, *zn = vzn, *zm = vzm;
 938    uint8_t *pn = vpn, *pm = vpm;
 939    float_status fpst = *(float_status *)vst;
 940
 941    set_default_nan_mode(true, &fpst);
 942
 943    for (row = 0; row < oprsz; ++row) {
 944        if (pn[H1(row)] & 1) {
 945            uint64_t *za_row = &za[tile_vslice_index(row)];
 946            uint64_t n = zn[row] ^ neg;
 947
 948            for (col = 0; col < oprsz; ++col) {
 949                if (pm[H1(col)] & 1) {
 950                    uint64_t *a = &za_row[col];
 951                    *a = float64_muladd(n, zm[col], *a, 0, &fpst);
 952                }
 953            }
 954        }
 955    }
 956}
 957
 958/*
 959 * Alter PAIR as needed for controlling predicates being false,
 960 * and for NEG on an enabled row element.
 961 */
 962static inline uint32_t f16mop_adj_pair(uint32_t pair, uint32_t pg, uint32_t neg)
 963{
 964    /*
 965     * The pseudocode uses a conditional negate after the conditional zero.
 966     * It is simpler here to unconditionally negate before conditional zero.
 967     */
 968    pair ^= neg;
 969    if (!(pg & 1)) {
 970        pair &= 0xffff0000u;
 971    }
 972    if (!(pg & 4)) {
 973        pair &= 0x0000ffffu;
 974    }
 975    return pair;
 976}
 977
 978static float32 f16_dotadd(float32 sum, uint32_t e1, uint32_t e2,
 979                          float_status *s_std, float_status *s_odd)
 980{
 981    float64 e1r = float16_to_float64(e1 & 0xffff, true, s_std);
 982    float64 e1c = float16_to_float64(e1 >> 16, true, s_std);
 983    float64 e2r = float16_to_float64(e2 & 0xffff, true, s_std);
 984    float64 e2c = float16_to_float64(e2 >> 16, true, s_std);
 985    float64 t64;
 986    float32 t32;
 987
 988    /*
 989     * The ARM pseudocode function FPDot performs both multiplies
 990     * and the add with a single rounding operation.  Emulate this
 991     * by performing the first multiply in round-to-odd, then doing
 992     * the second multiply as fused multiply-add, and rounding to
 993     * float32 all in one step.
 994     */
 995    t64 = float64_mul(e1r, e2r, s_odd);
 996    t64 = float64r32_muladd(e1c, e2c, t64, 0, s_std);
 997
 998    /* This conversion is exact, because we've already rounded. */
 999    t32 = float64_to_float32(t64, s_std);
1000

1001    /* The final accumulation step is not fused. */
1002    return float32_add(sum, t32, s_std);
1003}
1004
1005void HELPER(sme_fmopa_h)(void *vza, void *vzn, void *vzm, void *vpn,
1006                         void *vpm, void *vst, uint32_t desc)
1007{
1008    intptr_t row, col, oprsz = simd_maxsz(desc);
1009    uint32_t neg = simd_data(desc) * 0x80008000u;
1010    uint16_t *pn = vpn, *pm = vpm;
1011    float_status fpst_odd, fpst_std;
1012
1013    /*
1014     * Make a copy of float_status because this operation does not
1015     * update the cumulative fp exception status.  It also produces
1016     * default nans.  Make a second copy with round-to-odd -- see above.
1017     */
1018    fpst_std = *(float_status *)vst;
1019    set_default_nan_mode(true, &fpst_std);
1020    fpst_odd = fpst_std;
1021    set_float_rounding_mode(float_round_to_odd, &fpst_odd);
1022
1023    for (row = 0; row < oprsz; ) {
1024        uint16_t prow = pn[H2(row >> 4)];
1025        do {
1026            void *vza_row = vza + tile_vslice_offset(row);
1027            uint32_t n = *(uint32_t *)(vzn + H1_4(row));
1028
1029            n = f16mop_adj_pair(n, prow, neg);
1030
1031            for (col = 0; col < oprsz; ) {
1032                uint16_t pcol = pm[H2(col >> 4)];
1033                do {
1034                    if (prow & pcol & 0b0101) {
1035                        uint32_t *a = vza_row + H1_4(col);
1036                        uint32_t m = *(uint32_t *)(vzm + H1_4(col));
1037
1038                        m = f16mop_adj_pair(m, pcol, 0);
1039                        *a = f16_dotadd(*a, n, m, &fpst_std, &fpst_odd);
1040
1041                        col += 4;
1042                        pcol >>= 4;
1043                    }
1044                } while (col & 15);
1045            }
1046            row += 4;
1047            prow >>= 4;
1048        } while (row & 15);
1049    }
1050}
1051
1052void HELPER(sme_bfmopa)(void *vza, void *vzn, void *vzm, void *vpn,
1053                        void *vpm, uint32_t desc)
1054{
1055    intptr_t row, col, oprsz = simd_maxsz(desc);
1056    uint32_t neg = simd_data(desc) * 0x80008000u;
1057    uint16_t *pn = vpn, *pm = vpm;
1058
1059    for (row = 0; row < oprsz; ) {
1060        uint16_t prow = pn[H2(row >> 4)];
1061        do {
1062            void *vza_row = vza + tile_vslice_offset(row);
1063            uint32_t n = *(uint32_t *)(vzn + H1_4(row));
1064
1065            n = f16mop_adj_pair(n, prow, neg);
1066
1067            for (col = 0; col < oprsz; ) {
1068                uint16_t pcol = pm[H2(col >> 4)];
1069                do {
1070                    if (prow & pcol & 0b0101) {
1071                        uint32_t *a = vza_row + H1_4(col);
1072                        uint32_t m = *(uint32_t *)(vzm + H1_4(col));
1073
1074                        m = f16mop_adj_pair(m, pcol, 0);
1075                        *a = bfdotadd(*a, n, m);
1076
1077                        col += 4;
1078                        pcol >>= 4;
1079                    }
1080                } while (col & 15);
1081            }
1082            row += 4;
1083            prow >>= 4;
1084        } while (row & 15);
1085    }
1086}
1087
/*
 * Per-element integer outer-product kernel: takes the 64-bit ZN and ZM
 * operands, the current 64-bit accumulator, the combined predicate
 * byte and the negate flag, and returns the new accumulator value.
 */
typedef uint64_t IMOPFn(uint64_t, uint64_t, uint64_t, uint8_t, bool);

/*
 * Shared driver for the integer outer-product helpers.
 *
 * Visits every (row, col) pair of 64-bit elements and replaces the
 * tile element with FN applied to ZN[row], ZM[col], the old tile
 * value, the AND of the row and column predicate bytes, and the
 * negate flag taken from simd_data(desc).  No predicate test is made
 * here; FN receives the combined predicate and applies it itself.
 */
static inline void do_imopa(uint64_t *za, uint64_t *zn, uint64_t *zm,
                            uint8_t *pn, uint8_t *pm,
                            uint32_t desc, IMOPFn *fn)
{
    const intptr_t nelem = simd_oprsz(desc) / 8;
    const bool negate = simd_data(desc);
    intptr_t r, c;

    for (r = 0; r < nelem; r++) {
        const uint8_t prow = pn[H1(r)];
        uint64_t *out = za + tile_vslice_index(r);
        const uint64_t nval = zn[r];

        for (c = 0; c < nelem; c++) {
            const uint8_t pcol = pm[H1(c)];

            out[c] = fn(nval, zm[c], out[c], prow & pcol, negate);
        }
    }
}
1110
/*
 * Define an IMOPFn kernel for 8-bit sources and 32-bit accumulators.
 *
 * The 64-bit operands hold eight bytes each.  Bytes 0-3 of N and M
 * form one four-way dot product that updates the low 32 bits of A;
 * bytes 4-7 form a second one that updates the high 32 bits.
 * NTYPE/MTYPE select how the bytes of N and M are interpreted.
 * Bytes of N not enabled by P are zeroed before multiplying, and the
 * dot products are subtracted instead of added when NEG is set.
 */
#define DEF_IMOP_32(NAME, NTYPE, MTYPE) \
static uint64_t NAME(uint64_t n, uint64_t m, uint64_t a, uint8_t p, bool neg) \
{                                                                           \
    uint32_t acc_lo = (uint32_t)a;                                          \
    uint32_t acc_hi = (uint32_t)(a >> 32);                                  \
    uint32_t dot_lo = 0, dot_hi = 0;                                        \
    int sh;                                                                 \
                                                                            \
    /* Apply P to N as a mask, making the inactive elements 0. */           \
    n &= expand_pred_b(p);                                                  \
                                                                            \
    for (sh = 0; sh < 32; sh += 8) {                                        \
        dot_lo += (NTYPE)(n >> sh) * (MTYPE)(m >> sh);                      \
        dot_hi += (NTYPE)(n >> (sh + 32)) * (MTYPE)(m >> (sh + 32));        \
    }                                                                       \
                                                                            \
    if (neg) {                                                              \
        acc_lo -= dot_lo;                                                   \
        acc_hi -= dot_hi;                                                   \
    } else {                                                                \
        acc_lo += dot_lo;                                                   \
        acc_hi += dot_hi;                                                   \
    }                                                                       \
    return ((uint64_t)acc_hi << 32) | acc_lo;                               \
}
1132
1133#define DEF_IMOP_64(NAME, NTYPE, MTYPE) \
1134static uint64_t NAME(uint64_t n, uint64_t m, uint64_t a, uint8_t p, bool neg) \
1135{                                                                           \
1136    uint64_t sum = 0;                                                       \
1137    /* Apply P to N as a mask, making the inactive elements 0. */           \
1138    n &= expand_pred_h(p);                                                  \
1139    sum += (NTYPE)(n >> 0) * (MTYPE)(m >> 0);                               \
1140    sum += (NTYPE)(n >> 16) * (MTYPE)(m >> 16);                             \
1141    sum += (NTYPE)(n >> 32) * (MTYPE)(m >> 32);                             \
1142    sum += (NTYPE)(n >> 48) * (MTYPE)(m >> 48);                             \
1143    return neg ? a - sum : a + sum;                                         \
1144}
1145
1146DEF_IMOP_32(smopa_s, int8_t, int8_t)
1147DEF_IMOP_32(umopa_s, uint8_t, uint8_t)
1148DEF_IMOP_32(sumopa_s, int8_t, uint8_t)
1149DEF_IMOP_32(usmopa_s, uint8_t, int8_t)
1150
1151DEF_IMOP_64(smopa_d, int16_t, int16_t)
1152DEF_IMOP_64(umopa_d, uint16_t, uint16_t)
1153DEF_IMOP_64(sumopa_d, int16_t, uint16_t)
1154DEF_IMOP_64(usmopa_d, uint16_t, int16_t)
1155
1156#define DEF_IMOPH(NAME) \
1157    void HELPER(sme_##NAME)(void *vza, void *vzn, void *vzm, void *vpn,      \
1158                            void *vpm, uint32_t desc)                        \
1159    { do_imopa(vza, vzn, vzm, vpn, vpm, desc, NAME); }
1160
1161DEF_IMOPH(smopa_s)
1162DEF_IMOPH(umopa_s)
1163DEF_IMOPH(sumopa_s)
1164DEF_IMOPH(usmopa_s)
1165DEF_IMOPH(smopa_d)
1166DEF_IMOPH(umopa_d)
1167DEF_IMOPH(sumopa_d)
1168DEF_IMOPH(usmopa_d)
1169