LXR qemu/target/arm/sme

   1/*
   2 * ARM SME Operations
   3 *
   4 * Copyright (c) 2022 Linaro, Ltd.
   5 *
   6 * This library is free software; you can redistribute it and/or
   7 * modify it under the terms of the GNU Lesser General Public
   8 * License as published by the Free Software Foundation; either
   9 * version 2.1 of the License, or (at your option) any later version.
  10 *
  11 * This library is distributed in the hope that it will be useful,
  12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14 * Lesser General Public License for more details.
  15 *
  16 * You should have received a copy of the GNU Lesser General Public
  17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
  18 */
  19
  20#include "qemu/osdep.h"
  21#include "cpu.h"
  22#include "internals.h"
  23#include "tcg/tcg-gvec-desc.h"
  24#include "exec/helper-proto.h"
  25#include "exec/cpu_ldst.h"
  26#include "exec/exec-all.h"
  27#include "qemu/int128.h"
  28#include "fpu/softfloat.h"
  29#include "vec_internal.h"
  30#include "sve_ldst_internal.h"
  31
  32/* ResetSVEState */
  33void arm_reset_sve_state(CPUARMState *env)
  34{
  35    memset(env->vfp.zregs, 0, sizeof(env->vfp.zregs));
  36    /* Recall that FFR is stored as pregs[16]. */
  37    memset(env->vfp.pregs, 0, sizeof(env->vfp.pregs));
  38    vfp_set_fpcr(env, 0x0800009f);
  39}
  40
  41void helper_set_pstate_sm(CPUARMState *env, uint32_t i)
  42{
  43    if (i == FIELD_EX64(env->svcr, SVCR, SM)) {
  44        return;
  45    }
  46    env->svcr ^= R_SVCR_SM_MASK;
  47    arm_reset_sve_state(env);
  48}
  49
  50void helper_set_pstate_za(CPUARMState *env, uint32_t i)
  51{
  52    if (i == FIELD_EX64(env->svcr, SVCR, ZA)) {
  53        return;
  54    }
  55    env->svcr ^= R_SVCR_ZA_MASK;
  56
  57    /*
  58     * ResetSMEState.
  59     *
  60     * SetPSTATE_ZA zeros on enable and disable.  We can zero this only
  61     * on enable: while disabled, the storage is inaccessible and the
  62     * value does not matter.  We're not saving the storage in vmstate
  63     * when disabled either.
  64     */
  65    if (i) {
  66        memset(env->zarray, 0, sizeof(env->zarray));
  67    }
  68}
  69
  70void helper_sme_zero(CPUARMState *env, uint32_t imm, uint32_t svl)
  71{
  72    uint32_t i;
  73
  74    /*
  75     * Special case clearing the entire ZA space.
  76     * This falls into the CONSTRAINED UNPREDICTABLE zeroing of any
  77     * parts of the ZA storage outside of SVL.
  78     */
  79    if (imm == 0xff) {
  80        memset(env->zarray, 0, sizeof(env->zarray));
  81        return;
  82    }
  83
  84    /*
  85     * Recall that ZAnH.D[m] is spread across ZA[n+8*m],
  86     * so each row is discontiguous within ZA[].
  87     */
  88    for (i = 0; i < svl; i++) {
  89        if (imm & (1 << (i % 8))) {
  90            memset(&env->zarray[i], 0, svl);
  91        }
  92    }
  93}
  94
  95
  96/*
  97 * When considering the ZA storage as an array of elements of
  98 * type T, the index within that array of the Nth element of
  99 * a vertical slice of a tile can be calculated like this,
 100 * regardless of the size of type T. This is because the tiles
 101 * are interleaved, so if type T is size N bytes then row 1 of
 102 * the tile is N rows away from row 0. The division by N to
 103 * convert a byte offset into an array index and the multiplication
 104 * by N to convert from vslice-index-within-the-tile to
 105 * the index within the ZA storage cancel out.
 106 */
 107#define tile_vslice_index(i) ((i) * sizeof(ARMVectorReg))
 108
 109/*
 110 * When doing byte arithmetic on the ZA storage, the element
 111 * byteoff bytes away in a tile vertical slice is always this
 112 * many bytes away in the ZA storage, regardless of the
 113 * size of the tile element, assuming that byteoff is a multiple
 114 * of the element size. Again this is because of the interleaving
 115 * of the tiles. For instance if we have 1 byte per element then
 116 * each row of the ZA storage has one byte of the vslice data,
 117 * and (counting from 0) byte 8 goes in row 8 of the storage
 118 * at offset (8 * row-size-in-bytes).
 119 * If we have 8 bytes per element then each row of the ZA storage
 120 * has 8 bytes of the data, but there are 8 interleaved tiles and
 121 * so byte 8 of the data goes into row 1 of the tile,
 122 * which is again row 8 of the storage, so the offset is still
 123 * (8 * row-size-in-bytes). Similarly for other element sizes.
 124 */
 125#define tile_vslice_offset(byteoff) ((byteoff) * sizeof(ARMVectorReg))
 126
 127
 128/*
 129 * Move Zreg vector to ZArray column.
 130 */
 131#define DO_MOVA_C(NAME, TYPE, H)                                        \
 132void HELPER(NAME)(void *za, void *vn, void *vg, uint32_t desc)          \
 133{                                                                       \
 134    int i, oprsz = simd_oprsz(desc);                                    \
 135    for (i = 0; i < oprsz; ) {                                          \
 136        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
 137        do {                                                            \
 138            if (pg & 1) {                                               \
 139                *(TYPE *)(za + tile_vslice_offset(i)) = *(TYPE *)(vn + H(i)); \
 140            }                                                           \
 141            i += sizeof(TYPE);                                          \
 142            pg >>= sizeof(TYPE);                                        \
 143        } while (i & 15);                                               \
 144    }                                                                   \
 145}
 146
 147DO_MOVA_C(sme_mova_cz_b, uint8_t, H1)
 148DO_MOVA_C(sme_mova_cz_h, uint16_t, H1_2)
 149DO_MOVA_C(sme_mova_cz_s, uint32_t, H1_4)
 150
 151void HELPER(sme_mova_cz_d)(void *za, void *vn, void *vg, uint32_t desc)
 152{
 153    int i, oprsz = simd_oprsz(desc) / 8;
 154    uint8_t *pg = vg;
 155    uint64_t *n = vn;
 156    uint64_t *a = za;
 157
 158    for (i = 0; i < oprsz; i++) {
 159        if (pg[H1(i)] & 1) {
 160            a[tile_vslice_index(i)] = n[i];
 161        }
 162    }
 163}
 164
 165void HELPER(sme_mova_cz_q)(void *za, void *vn, void *vg, uint32_t desc)
 166{
 167    int i, oprsz = simd_oprsz(desc) / 16;
 168    uint16_t *pg = vg;
 169    Int128 *n = vn;
 170    Int128 *a = za;
 171
 172    /*
 173     * Int128 is used here simply to copy 16 bytes, and to simplify
 174     * the address arithmetic.
 175     */
 176    for (i = 0; i < oprsz; i++) {
 177        if (pg[H2(i)] & 1) {
 178            a[tile_vslice_index(i)] = n[i];
 179        }
 180    }
 181}
 182
 183#undef DO_MOVA_C
 184
 185/*
 186 * Move ZArray column to Zreg vector.
 187 */
 188#define DO_MOVA_Z(NAME, TYPE, H)                                        \
 189void HELPER(NAME)(void *vd, void *za, void *vg, uint32_t desc)          \
 190{                                                                       \
 191    int i, oprsz = simd_oprsz(desc);                                    \
 192    for (i = 0; i < oprsz; ) {                                          \
 193        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
 194        do {                                                            \
 195            if (pg & 1) {                                               \
 196                *(TYPE *)(vd + H(i)) = *(TYPE *)(za + tile_vslice_offset(i)); \
 197            }                                                           \
 198            i += sizeof(TYPE);                                          \
 199            pg >>= sizeof(TYPE);                                        \
 200        } while (i & 15);                                               \
 201    }                                                                   \
 202}
 203
 204DO_MOVA_Z(sme_mova_zc_b, uint8_t, H1)
 205DO_MOVA_Z(sme_mova_zc_h, uint16_t, H1_2)
 206DO_MOVA_Z(sme_mova_zc_s, uint32_t, H1_4)
 207
 208void HELPER(sme_mova_zc_d)(void *vd, void *za, void *vg, uint32_t desc)
 209{
 210    int i, oprsz = simd_oprsz(desc) / 8;
 211    uint8_t *pg = vg;
 212    uint64_t *d = vd;
 213    uint64_t *a = za;
 214
 215    for (i = 0; i < oprsz; i++) {
 216        if (pg[H1(i)] & 1) {
 217            d[i] = a[tile_vslice_index(i)];
 218        }
 219    }
 220}
 221
 222void HELPER(sme_mova_zc_q)(void *vd, void *za, void *vg, uint32_t desc)
 223{
 224    int i, oprsz = simd_oprsz(desc) / 16;
 225    uint16_t *pg = vg;
 226    Int128 *d = vd;
 227    Int128 *a = za;
 228
 229    /*
 230     * Int128 is used here simply to copy 16 bytes, and to simplify
 231     * the address arithmetic.
 232     */
 233    for (i = 0; i < oprsz; i++, za += sizeof(ARMVectorReg)) {
 234        if (pg[H2(i)] & 1) {
 235            d[i] = a[tile_vslice_index(i)];
 236        }
 237    }
 238}
 239
 240#undef DO_MOVA_Z
 241
 242/*
 243 * Clear elements in a tile slice comprising len bytes.
 244 */
 245
 246typedef void ClearFn(void *ptr, size_t off, size_t len);
 247
 248static void clear_horizontal(void *ptr, size_t off, size_t len)
 249{
 250    memset(ptr + off, 0, len);
 251}
 252
 253static void clear_vertical_b(void *vptr, size_t off, size_t len)
 254{
 255    for (size_t i = 0; i < len; ++i) {
 256        *(uint8_t *)(vptr + tile_vslice_offset(i + off)) = 0;
 257    }
 258}
 259
 260static void clear_vertical_h(void *vptr, size_t off, size_t len)
 261{
 262    for (size_t i = 0; i < len; i += 2) {
 263        *(uint16_t *)(vptr + tile_vslice_offset(i + off)) = 0;
 264    }
 265}
 266
 267static void clear_vertical_s(void *vptr, size_t off, size_t len)
 268{
 269    for (size_t i = 0; i < len; i += 4) {
 270        *(uint32_t *)(vptr + tile_vslice_offset(i + off)) = 0;
 271    }
 272}
 273
 274static void clear_vertical_d(void *vptr, size_t off, size_t len)
 275{
 276    for (size_t i = 0; i < len; i += 8) {
 277        *(uint64_t *)(vptr + tile_vslice_offset(i + off)) = 0;
 278    }
 279}
 280
 281static void clear_vertical_q(void *vptr, size_t off, size_t len)
 282{
 283    for (size_t i = 0; i < len; i += 16) {
 284        memset(vptr + tile_vslice_offset(i + off), 0, 16);
 285    }
 286}
 287
 288/*
 289 * Copy elements from an array into a tile slice comprising len bytes.
 290 */
 291
 292typedef void CopyFn(void *dst, const void *src, size_t len);
 293
 294static void copy_horizontal(void *dst, const void *src, size_t len)
 295{
 296    memcpy(dst, src, len);
 297}
 298
 299static void copy_vertical_b(void *vdst, const void *vsrc, size_t len)
 300{
 301    const uint8_t *src = vsrc;
 302    uint8_t *dst = vdst;
 303    size_t i;
 304
 305    for (i = 0; i < len; ++i) {
 306        dst[tile_vslice_index(i)] = src[i];
 307    }
 308}
 309
 310static void copy_vertical_h(void *vdst, const void *vsrc, size_t len)
 311{
 312    const uint16_t *src = vsrc;
 313    uint16_t *dst = vdst;
 314    size_t i;
 315
 316    for (i = 0; i < len / 2; ++i) {
 317        dst[tile_vslice_index(i)] = src[i];
 318    }
 319}
 320
 321static void copy_vertical_s(void *vdst, const void *vsrc, size_t len)
 322{
 323    const uint32_t *src = vsrc;
 324    uint32_t *dst = vdst;
 325    size_t i;
 326
 327    for (i = 0; i < len / 4; ++i) {
 328        dst[tile_vslice_index(i)] = src[i];
 329    }
 330}
 331
 332static void copy_vertical_d(void *vdst, const void *vsrc, size_t len)
 333{
 334    const uint64_t *src = vsrc;
 335    uint64_t *dst = vdst;
 336    size_t i;
 337
 338    for (i = 0; i < len / 8; ++i) {
 339        dst[tile_vslice_index(i)] = src[i];
 340    }
 341}
 342
 343static void copy_vertical_q(void *vdst, const void *vsrc, size_t len)
 344{
 345    for (size_t i = 0; i < len; i += 16) {
 346        memcpy(vdst + tile_vslice_offset(i), vsrc + i, 16);
 347    }
 348}
 349
 350/*
 351 * Host and TLB primitives for vertical tile slice addressing.
 352 */
 353
 354#define DO_LD(NAME, TYPE, HOST, TLB)                                        \
 355static inline void sme_##NAME##_v_host(void *za, intptr_t off, void *host)  \
 356{                                                                           \
 357    TYPE val = HOST(host);                                                  \
 358    *(TYPE *)(za + tile_vslice_offset(off)) = val;                          \
 359}                                                                           \
 360static inline void sme_##NAME##_v_tlb(CPUARMState *env, void *za,           \
 361                        intptr_t off, target_ulong addr, uintptr_t ra)      \
 362{                                                                           \
 363    TYPE val = TLB(env, useronly_clean_ptr(addr), ra);                      \
 364    *(TYPE *)(za + tile_vslice_offset(off)) = val;                          \
 365}
 366
 367#define DO_ST(NAME, TYPE, HOST, TLB)                                        \
 368static inline void sme_##NAME##_v_host(void *za, intptr_t off, void *host)  \
 369{                                                                           \
 370    TYPE val = *(TYPE *)(za + tile_vslice_offset(off));                     \
 371    HOST(host, val);                                                        \
 372}                                                                           \
 373static inline void sme_##NAME##_v_tlb(CPUARMState *env, void *za,           \
 374                        intptr_t off, target_ulong addr, uintptr_t ra)      \
 375{                                                                           \
 376    TYPE val = *(TYPE *)(za + tile_vslice_offset(off));                     \
 377    TLB(env, useronly_clean_ptr(addr), val, ra);                            \
 378}
 379
 380/*
 381 * The ARMVectorReg elements are stored in host-endian 64-bit units.
 382 * For 128-bit quantities, the sequence defined by the Elem[] pseudocode
 383 * corresponds to storing the two 64-bit pieces in little-endian order.
 384 */
 385#define DO_LDQ(HNAME, VNAME, BE, HOST, TLB)                                 \
 386static inline void HNAME##_host(void *za, intptr_t off, void *host)         \
 387{                                                                           \
 388    uint64_t val0 = HOST(host), val1 = HOST(host + 8);                      \
 389    uint64_t *ptr = za + off;                                               \
 390    ptr[0] = BE ? val1 : val0, ptr[1] = BE ? val0 : val1;                   \
 391}                                                                           \
 392static inline void VNAME##_v_host(void *za, intptr_t off, void *host)       \
 393{                                                                           \
 394    HNAME##_host(za, tile_vslice_offset(off), host);                        \
 395}                                                                           \
 396static inline void HNAME##_tlb(CPUARMState *env, void *za, intptr_t off,    \
 397                               target_ulong addr, uintptr_t ra)             \
 398{                                                                           \
 399    uint64_t val0 = TLB(env, useronly_clean_ptr(addr), ra);                 \
 400    uint64_t val1 = TLB(env, useronly_clean_ptr(addr + 8), ra);             \
 401    uint64_t *ptr = za + off;                                               \
 402    ptr[0] = BE ? val1 : val0, ptr[1] = BE ? val0 : val1;                   \
 403}                                                                           \
 404static inline void VNAME##_v_tlb(CPUARMState *env, void *za, intptr_t off,  \
 405                               target_ulong addr, uintptr_t ra)             \
 406{                                                                           \
 407    HNAME##_tlb(env, za, tile_vslice_offset(off), addr, ra);                \
 408}
 409
 410#define DO_STQ(HNAME, VNAME, BE, HOST, TLB)                                 \
 411static inline void HNAME##_host(void *za, intptr_t off, void *host)         \
 412{                                                                           \
 413    uint64_t *ptr = za + off;                                               \
 414    HOST(host, ptr[BE]);                                                    \
 415    HOST(host + 1, ptr[!BE]);                                               \
 416}                                                                           \
 417static inline void VNAME##_v_host(void *za, intptr_t off, void *host)       \
 418{                                                                           \
 419    HNAME##_host(za, tile_vslice_offset(off), host);                        \
 420}                                                                           \
 421static inline void HNAME##_tlb(CPUARMState *env, void *za, intptr_t off,    \
 422                               target_ulong addr, uintptr_t ra)             \
 423{                                                                           \
 424    uint64_t *ptr = za + off;                                               \
 425    TLB(env, useronly_clean_ptr(addr), ptr[BE], ra);                        \
 426    TLB(env, useronly_clean_ptr(addr + 8), ptr[!BE], ra);                   \
 427}                                                                           \
 428static inline void VNAME##_v_tlb(CPUARMState *env, void *za, intptr_t off,  \
 429                               target_ulong addr, uintptr_t ra)             \
 430{                                                                           \
 431    HNAME##_tlb(env, za, tile_vslice_offset(off), addr, ra);                \
 432}
 433
 434DO_LD(ld1b, uint8_t, ldub_p, cpu_ldub_data_ra)
 435DO_LD(ld1h_be, uint16_t, lduw_be_p, cpu_lduw_be_data_ra)
 436DO_LD(ld1h_le, uint16_t, lduw_le_p, cpu_lduw_le_data_ra)
 437DO_LD(ld1s_be, uint32_t, ldl_be_p, cpu_ldl_be_data_ra)
 438DO_LD(ld1s_le, uint32_t, ldl_le_p, cpu_ldl_le_data_ra)
 439DO_LD(ld1d_be, uint64_t, ldq_be_p, cpu_ldq_be_data_ra)
 440DO_LD(ld1d_le, uint64_t, ldq_le_p, cpu_ldq_le_data_ra)
 441
 442DO_LDQ(sve_ld1qq_be, sme_ld1q_be, 1, ldq_be_p, cpu_ldq_be_data_ra)
 443DO_LDQ(sve_ld1qq_le, sme_ld1q_le, 0, ldq_le_p, cpu_ldq_le_data_ra)
 444
 445DO_ST(st1b, uint8_t, stb_p, cpu_stb_data_ra)
 446DO_ST(st1h_be, uint16_t, stw_be_p, cpu_stw_be_data_ra)
 447DO_ST(st1h_le, uint16_t, stw_le_p, cpu_stw_le_data_ra)
 448DO_ST(st1s_be, uint32_t, stl_be_p, cpu_stl_be_data_ra)
 449DO_ST(st1s_le, uint32_t, stl_le_p, cpu_stl_le_data_ra)
 450DO_ST(st1d_be, uint64_t, stq_be_p, cpu_stq_be_data_ra)
 451DO_ST(st1d_le, uint64_t, stq_le_p, cpu_stq_le_data_ra)
 452
 453DO_STQ(sve_st1qq_be, sme_st1q_be, 1, stq_be_p, cpu_stq_be_data_ra)
 454DO_STQ(sve_st1qq_le, sme_st1q_le, 0, stq_le_p, cpu_stq_le_data_ra)
 455
 456#undef DO_LD
 457#undef DO_ST
 458#undef DO_LDQ
 459#undef DO_STQ
 460
 461/*
 462 * Common helper for all contiguous predicated loads.
 463 */
 464
 465static inline QEMU_ALWAYS_INLINE
 466void sme_ld1(CPUARMState *env, void *za, uint64_t *vg,
 467             const target_ulong addr, uint32_t desc, const uintptr_t ra,
 468             const int esz, uint32_t mtedesc, bool vertical,
 469             sve_ldst1_host_fn *host_fn,
 470             sve_ldst1_tlb_fn *tlb_fn,
 471             ClearFn *clr_fn,
 472             CopyFn *cpy_fn)
 473{
 474    const intptr_t reg_max = simd_oprsz(desc);
 475    const intptr_t esize = 1 << esz;
 476    intptr_t reg_off, reg_last;
 477    SVEContLdSt info;
 478    void *host;
 479    int flags;
 480
 481    /* Find the active elements.  */
 482    if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, esize)) {
 483        /* The entire predicate was false; no load occurs.  */
 484        clr_fn(za, 0, reg_max);
 485        return;
 486    }
 487
 488    /* Probe the page(s).  Exit with exception for any invalid page. */
 489    sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, ra);
 490
 491    /* Handle watchpoints for all active elements. */
 492    sve_cont_ldst_watchpoints(&info, env, vg, addr, esize, esize,
 493                              BP_MEM_READ, ra);
 494
 495    /*
 496     * Handle mte checks for all active elements.
 497     * Since TBI must be set for MTE, !mtedesc => !mte_active.
 498     */
 499    if (mtedesc) {
 500        sve_cont_ldst_mte_check(&info, env, vg, addr, esize, esize,
 501                                mtedesc, ra);
 502    }
 503
 504    flags = info.page[0].flags | info.page[1].flags;
 505    if (unlikely(flags != 0)) {
 506#ifdef CONFIG_USER_ONLY
 507        g_assert_not_reached();
 508#else
 509        /*
 510         * At least one page includes MMIO.
 511         * Any bus operation can fail with cpu_transaction_failed,
 512         * which for ARM will raise SyncExternal.  Perform the load
 513         * into scratch memory to preserve register state until the end.
 514         */
 515        ARMVectorReg scratch = { };
 516
 517        reg_off = info.reg_off_first[0];
 518        reg_last = info.reg_off_last[1];
 519        if (reg_last < 0) {
 520            reg_last = info.reg_off_split;
 521            if (reg_last < 0) {
 522                reg_last = info.reg_off_last[0];
 523            }
 524        }
 525
 526        do {
 527            uint64_t pg = vg[reg_off >> 6];
 528            do {
 529                if ((pg >> (reg_off & 63)) & 1) {
 530                    tlb_fn(env, &scratch, reg_off, addr + reg_off, ra);
 531                }
 532                reg_off += esize;
 533            } while (reg_off & 63);
 534        } while (reg_off <= reg_last);
 535
 536        cpy_fn(za, &scratch, reg_max);
 537        return;
 538#endif
 539    }
 540
 541    /* The entire operation is in RAM, on valid pages. */
 542
 543    reg_off = info.reg_off_first[0];
 544    reg_last = info.reg_off_last[0];
 545    host = info.page[0].host;
 546
 547    if (!vertical) {
 548        memset(za, 0, reg_max);
 549    } else if (reg_off) {
 550        clr_fn(za, 0, reg_off);
 551    }
 552
 553    while (reg_off <= reg_last) {
 554        uint64_t pg = vg[reg_off >> 6];
 555        do {
 556            if ((pg >> (reg_off & 63)) & 1) {
 557                host_fn(za, reg_off, host + reg_off);
 558            } else if (vertical) {
 559                clr_fn(za, reg_off, esize);
 560            }
 561            reg_off += esize;
 562        } while (reg_off <= reg_last && (reg_off & 63));
 563    }
 564
 565    /*
 566     * Use the slow path to manage the cross-page misalignment.
 567     * But we know this is RAM and cannot trap.
 568     */
 569    reg_off = info.reg_off_split;
 570    if (unlikely(reg_off >= 0)) {
 571        tlb_fn(env, za, reg_off, addr + reg_off, ra);
 572    }
 573
 574    reg_off = info.reg_off_first[1];
 575    if (unlikely(reg_off >= 0)) {
 576        reg_last = info.reg_off_last[1];
 577        host = info.page[1].host;
 578
 579        do {
 580            uint64_t pg = vg[reg_off >> 6];
 581            do {
 582                if ((pg >> (reg_off & 63)) & 1) {
 583                    host_fn(za, reg_off, host + reg_off);
 584                } else if (vertical) {
 585                    clr_fn(za, reg_off, esize);
 586                }
 587                reg_off += esize;
 588            } while (reg_off & 63);
 589        } while (reg_off <= reg_last);
 590    }
 591}
 592
 593static inline QEMU_ALWAYS_INLINE
 594void sme_ld1_mte(CPUARMState *env, void *za, uint64_t *vg,
 595                 target_ulong addr, uint32_t desc, uintptr_t ra,
 596                 const int esz, bool vertical,
 597                 sve_ldst1_host_fn *host_fn,
 598                 sve_ldst1_tlb_fn *tlb_fn,
 599                 ClearFn *clr_fn,
 600                 CopyFn *cpy_fn)
 601{
 602    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
 603    int bit55 = extract64(addr, 55, 1);
 604
 605    /* Remove mtedesc from the normal sve descriptor. */
 606    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
 607
 608    /* Perform gross MTE suppression early. */
 609    if (!tbi_check(desc, bit55) ||
 610        tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
 611        mtedesc = 0;
 612    }
 613
 614    sme_ld1(env, za, vg, addr, desc, ra, esz, mtedesc, vertical,
 615            host_fn, tlb_fn, clr_fn, cpy_fn);
 616}
 617
 618#define DO_LD(L, END, ESZ)                                                 \
 619void HELPER(sme_ld1##L##END##_h)(CPUARMState *env, void *za, void *vg,     \
 620                                 target_ulong addr, uint32_t desc)         \
 621{                                                                          \
 622    sme_ld1(env, za, vg, addr, desc, GETPC(), ESZ, 0, false,               \
 623            sve_ld1##L##L##END##_host, sve_ld1##L##L##END##_tlb,           \
 624            clear_horizontal, copy_horizontal);                            \
 625}                                                                          \
 626void HELPER(sme_ld1##L##END##_v)(CPUARMState *env, void *za, void *vg,     \
 627                                 target_ulong addr, uint32_t desc)         \
 628{                                                                          \
 629    sme_ld1(env, za, vg, addr, desc, GETPC(), ESZ, 0, true,                \
 630            sme_ld1##L##END##_v_host, sme_ld1##L##END##_v_tlb,             \
 631            clear_vertical_##L, copy_vertical_##L);                        \
 632}                                                                          \
 633void HELPER(sme_ld1##L##END##_h_mte)(CPUARMState *env, void *za, void *vg, \
 634                                     target_ulong addr, uint32_t desc)     \
 635{                                                                          \
 636    sme_ld1_mte(env, za, vg, addr, desc, GETPC(), ESZ, false,              \
 637                sve_ld1##L##L##END##_host, sve_ld1##L##L##END##_tlb,       \
 638                clear_horizontal, copy_horizontal);                        \
 639}                                                                          \
 640void HELPER(sme_ld1##L##END##_v_mte)(CPUARMState *env, void *za, void *vg, \
 641                                     target_ulong addr, uint32_t desc)     \
 642{                                                                          \
 643    sme_ld1_mte(env, za, vg, addr, desc, GETPC(), ESZ, true,               \
 644                sme_ld1##L##END##_v_host, sme_ld1##L##END##_v_tlb,         \
 645                clear_vertical_##L, copy_vertical_##L);                    \
 646}
 647
 648DO_LD(b, , MO_8)
 649DO_LD(h, _be, MO_16)
 650DO_LD(h, _le, MO_16)
 651DO_LD(s, _be, MO_32)
 652DO_LD(s, _le, MO_32)
 653DO_LD(d, _be, MO_64)
 654DO_LD(d, _le, MO_64)
 655DO_LD(q, _be, MO_128)
 656DO_LD(q, _le, MO_128)
 657
 658#undef DO_LD
 659
 660/*
 661 * Common helper for all contiguous predicated stores.
 662 */
 663
 664static inline QEMU_ALWAYS_INLINE
 665void sme_st1(CPUARMState *env, void *za, uint64_t *vg,
 666             const target_ulong addr, uint32_t desc, const uintptr_t ra,
 667             const int esz, uint32_t mtedesc, bool vertical,
 668             sve_ldst1_host_fn *host_fn,
 669             sve_ldst1_tlb_fn *tlb_fn)
 670{
 671    const intptr_t reg_max = simd_oprsz(desc);
 672    const intptr_t esize = 1 << esz;
 673    intptr_t reg_off, reg_last;
 674    SVEContLdSt info;
 675    void *host;
 676    int flags;
 677
 678    /* Find the active elements.  */
 679    if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, esize)) {
 680        /* The entire predicate was false; no store occurs.  */
 681        return;
 682    }
 683
 684    /* Probe the page(s).  Exit with exception for any invalid page. */
 685    sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, ra);
 686
 687    /* Handle watchpoints for all active elements. */
 688    sve_cont_ldst_watchpoints(&info, env, vg, addr, esize, esize,
 689                              BP_MEM_WRITE, ra);
 690
 691    /*
 692     * Handle mte checks for all active elements.
 693     * Since TBI must be set for MTE, !mtedesc => !mte_active.
 694     */
 695    if (mtedesc) {
 696        sve_cont_ldst_mte_check(&info, env, vg, addr, esize, esize,
 697                                mtedesc, ra);
 698    }
 699
 700    flags = info.page[0].flags | info.page[1].flags;
 701    if (unlikely(flags != 0)) {
 702#ifdef CONFIG_USER_ONLY
 703        g_assert_not_reached();
 704#else
 705        /*
 706         * At least one page includes MMIO.
 707         * Any bus operation can fail with cpu_transaction_failed,
 708         * which for ARM will raise SyncExternal.  We cannot avoid
 709         * this fault and will leave with the store incomplete.
 710         */
 711        reg_off = info.reg_off_first[0];
 712        reg_last = info.reg_off_last[1];
 713        if (reg_last < 0) {
 714            reg_last = info.reg_off_split;
 715            if (reg_last < 0) {
 716                reg_last = info.reg_off_last[0];
 717            }
 718        }
 719
 720        do {
 721            uint64_t pg = vg[reg_off >> 6];
 722            do {
 723                if ((pg >> (reg_off & 63)) & 1) {
 724                    tlb_fn(env, za, reg_off, addr + reg_off, ra);
 725                }
 726                reg_off += esize;
 727            } while (reg_off & 63);
 728        } while (reg_off <= reg_last);
 729        return;
 730#endif
 731    }
 732
 733    reg_off = info.reg_off_first[0];
 734    reg_last = info.reg_off_last[0];
 735    host = info.page[0].host;
 736
 737    while (reg_off <= reg_last) {
 738        uint64_t pg = vg[reg_off >> 6];
 739        do {
 740            if ((pg >> (reg_off & 63)) & 1) {
 741                host_fn(za, reg_off, host + reg_off);
 742            }
 743            reg_off += 1 << esz;
 744        } while (reg_off <= reg_last && (reg_off & 63));
 745    }
 746
 747    /*
 748     * Use the slow path to manage the cross-page misalignment.
 749     * But we know this is RAM and cannot trap.
 750     */
 751    reg_off = info.reg_off_split;
 752    if (unlikely(reg_off >= 0)) {
 753        tlb_fn(env, za, reg_off, addr + reg_off, ra);
 754    }
 755
 756    reg_off = info.reg_off_first[1];
 757    if (unlikely(reg_off >= 0)) {
 758        reg_last = info.reg_off_last[1];
 759        host = info.page[1].host;
 760
 761        do {
 762            uint64_t pg = vg[reg_off >> 6];
 763            do {
 764                if ((pg >> (reg_off & 63)) & 1) {
 765                    host_fn(za, reg_off, host + reg_off);
 766                }
 767                reg_off += 1 << esz;
 768            } while (reg_off & 63);
 769        } while (reg_off <= reg_last);
 770    }
 771}
 772
 773static inline QEMU_ALWAYS_INLINE
 774void sme_st1_mte(CPUARMState *env, void *za, uint64_t *vg, target_ulong addr,
 775                 uint32_t desc, uintptr_t ra, int esz, bool vertical,
 776                 sve_ldst1_host_fn *host_fn,
 777                 sve_ldst1_tlb_fn *tlb_fn)
 778{
 779    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
 780    int bit55 = extract64(addr, 55, 1);
 781
 782    /* Remove mtedesc from the normal sve descriptor. */
 783    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
 784
 785    /* Perform gross MTE suppression early. */
 786    if (!tbi_check(desc, bit55) ||
 787        tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
 788        mtedesc = 0;
 789    }
 790
 791    sme_st1(env, za, vg, addr, desc, ra, esz, mtedesc,
 792            vertical, host_fn, tlb_fn);
 793}
 794
 795#define DO_ST(L, END, ESZ)                                                 \
 796void HELPER(sme_st1##L##END##_h)(CPUARMState *env, void *za, void *vg,     \
 797                                 target_ulong addr, uint32_t desc)         \
 798{                                                                          \
 799    sme_st1(env, za, vg, addr, desc, GETPC(), ESZ, 0, false,               \
 800            sve_st1##L##L##END##_host, sve_st1##L##L##END##_tlb);          \
 801}                                                                          \
 802void HELPER(sme_st1##L##END##_v)(CPUARMState *env, void *za, void *vg,     \
 803                                 target_ulong addr, uint32_t desc)         \
 804{                                                                          \
 805    sme_st1(env, za, vg, addr, desc, GETPC(), ESZ, 0, true,                \
 806            sme_st1##L##END##_v_host, sme_st1##L##END##_v_tlb);            \
 807}                                                                          \
 808void HELPER(sme_st1##L##END##_h_mte)(CPUARMState *env, void *za, void *vg, \
 809                                     target_ulong addr, uint32_t desc)     \
 810{                                                                          \
 811    sme_st1_mte(env, za, vg, addr, desc, GETPC(), ESZ, false,              \
 812                sve_st1##L##L##END##_host, sve_st1##L##L##END##_tlb);      \
 813}                                                                          \
 814void HELPER(sme_st1##L##END##_v_mte)(CPUARMState *env, void *za, void *vg, \
 815                                     target_ulong addr, uint32_t desc)     \
 816{                                                                          \
 817    sme_st1_mte(env, za, vg, addr, desc, GETPC(), ESZ, true,               \
 818                sme_st1##L##END##_v_host, sme_st1##L##END##_v_tlb);        \
 819}
 820
 821DO_ST(b, , MO_8)
 822DO_ST(h, _be, MO_16)
 823DO_ST(h, _le, MO_16)
 824DO_ST(s, _be, MO_32)
 825DO_ST(s, _le, MO_32)
 826DO_ST(d, _be, MO_64)
 827DO_ST(d, _le, MO_64)
 828DO_ST(q, _be, MO_128)
 829DO_ST(q, _le, MO_128)
 830
 831#undef DO_ST
 832
 833void HELPER(sme_addha_s)(void *vzda, void *vzn, void *vpn,
 834                         void *vpm, uint32_t desc)
 835{
 836    intptr_t row, col, oprsz = simd_oprsz(desc) / 4;
 837    uint64_t *pn = vpn, *pm = vpm;
 838    uint32_t *zda = vzda, *zn = vzn;
 839
 840    for (row = 0; row < oprsz; ) {
 841        uint64_t pa = pn[row >> 4];
 842        do {
 843            if (pa & 1) {
 844                for (col = 0; col < oprsz; ) {
 845                    uint64_t pb = pm[col >> 4];
 846                    do {
 847                        if (pb & 1) {
 848                            zda[tile_vslice_index(row) + H4(col)] += zn[H4(col)];
 849                        }
 850                        pb >>= 4;
 851                    } while (++col & 15);
 852                }
 853            }
 854            pa >>= 4;
 855        } while (++row & 15);
 856    }
 857}
 858
 859void HELPER(sme_addha_d)(void *vzda, void *vzn, void *vpn,
 860                         void *vpm, uint32_t desc)
 861{
 862    intptr_t row, col, oprsz = simd_oprsz(desc) / 8;
 863    uint8_t *pn = vpn, *pm = vpm;
 864    uint64_t *zda = vzda, *zn = vzn;
 865
 866    for (row = 0; row < oprsz; ++row) {
 867        if (pn[H1(row)] & 1) {
 868            for (col = 0; col < oprsz; ++col) {
 869                if (pm[H1(col)] & 1) {
 870                    zda[tile_vslice_index(row) + col] += zn[col];
 871                }
 872            }
 873        }
 874    }
 875}
 876
 877void HELPER(sme_addva_s)(void *vzda, void *vzn, void *vpn,
 878                         void *vpm, uint32_t desc)
 879{
 880    intptr_t row, col, oprsz = simd_oprsz(desc) / 4;
 881    uint64_t *pn = vpn, *pm = vpm;
 882    uint32_t *zda = vzda, *zn = vzn;
 883
 884    for (row = 0; row < oprsz; ) {
 885        uint64_t pa = pn[row >> 4];
 886        do {
 887            if (pa & 1) {
 888                uint32_t zn_row = zn[H4(row)];
 889                for (col = 0; col < oprsz; ) {
 890                    uint64_t pb = pm[col >> 4];
 891                    do {
 892                        if (pb & 1) {
 893                            zda[tile_vslice_index(row) + H4(col)] += zn_row;
 894                        }
 895                        pb >>= 4;
 896                    } while (++col & 15);
 897                }
 898            }
 899            pa >>= 4;
 900        } while (++row & 15);
 901    }
 902}
 903
 904void HELPER(sme_addva_d)(void *vzda, void *vzn, void *vpn,
 905                         void *vpm, uint32_t desc)
 906{
 907    intptr_t row, col, oprsz = simd_oprsz(desc) / 8;
 908    uint8_t *pn = vpn, *pm = vpm;
 909    uint64_t *zda = vzda, *zn = vzn;
 910
 911    for (row = 0; row < oprsz; ++row) {
 912        if (pn[H1(row)] & 1) {
 913            uint64_t zn_row = zn[row];
 914            for (col = 0; col < oprsz; ++col) {
 915                if (pm[H1(col)] & 1) {
 916                    zda[tile_vslice_index(row) + col] += zn_row;
 917                }
 918            }
 919        }
 920    }
 921}
 922
 923void HELPER(sme_fmopa_s)(void *vza, void *vzn, void *vzm, void *vpn,
 924                         void *vpm, void *vst, uint32_t desc)
 925{
 926    intptr_t row, col, oprsz = simd_maxsz(desc);
 927    uint32_t neg = simd_data(desc) << 31;
 928    uint16_t *pn = vpn, *pm = vpm;
 929    float_status fpst;
 930
 931    /*
 932     * Make a copy of float_status because this operation does not
 933     * update the cumulative fp exception status.  It also produces
 934     * default nans.
 935     */
 936    fpst = *(float_status *)vst;
 937    set_default_nan_mode(true, &fpst);
 938
 939    for (row = 0; row < oprsz; ) {
 940        uint16_t pa = pn[H2(row >> 4)];
 941        do {
 942            if (pa & 1) {
 943                void *vza_row = vza + tile_vslice_offset(row);
 944                uint32_t n = *(uint32_t *)(vzn + H1_4(row)) ^ neg;
 945
 946                for (col = 0; col < oprsz; ) {
 947                    uint16_t pb = pm[H2(col >> 4)];
 948                    do {
 949                        if (pb & 1) {
 950                            uint32_t *a = vza_row + H1_4(col);
 951                            uint32_t *m = vzm + H1_4(col);
 952                            *a = float32_muladd(n, *m, *a, 0, vst);
 953                        }
 954                        col += 4;
 955                        pb >>= 4;
 956                    } while (col & 15);
 957                }
 958            }
 959            row += 4;
 960            pa >>= 4;
 961        } while (row & 15);
 962    }
 963}
 964
 965void HELPER(sme_fmopa_d)(void *vza, void *vzn, void *vzm, void *vpn,
 966                         void *vpm, void *vst, uint32_t desc)
 967{
 968    intptr_t row, col, oprsz = simd_oprsz(desc) / 8;
 969    uint64_t neg = (uint64_t)simd_data(desc) << 63;
 970    uint64_t *za = vza, *zn = vzn, *zm = vzm;
 971    uint8_t *pn = vpn, *pm = vpm;
 972    float_status fpst = *(float_status *)vst;
 973
 974    set_default_nan_mode(true, &fpst);
 975
 976    for (row = 0; row < oprsz; ++row) {
 977        if (pn[H1(row)] & 1) {
 978            uint64_t *za_row = &za[tile_vslice_index(row)];
 979            uint64_t n = zn[row] ^ neg;
 980
 981            for (col = 0; col < oprsz; ++col) {
 982                if (pm[H1(col)] & 1) {
 983                    uint64_t *a = &za_row[col];
 984                    *a = float64_muladd(n, zm[col], *a, 0, &fpst);
 985                }
 986            }
 987        }
 988    }
 989}
 990
 991/*
 992 * Alter PAIR as needed for controlling predicates being false,
 993 * and for NEG on an enabled row element.
 994 */
 995static inline uint32_t f16mop_adj_pair(uint32_t pair, uint32_t pg, uint32_t neg)
 996{
 997    /*
 998     * The pseudocode uses a conditional negate after the conditional zero.
 999     * It is simpler here to unconditionally negate before conditional zero.
1000     */

1001    pair ^= neg;
1002    if (!(pg & 1)) {
1003        pair &= 0xffff0000u;
1004    }
1005    if (!(pg & 4)) {
1006        pair &= 0x0000ffffu;
1007    }
1008    return pair;
1009}
1010
1011static float32 f16_dotadd(float32 sum, uint32_t e1, uint32_t e2,
1012                          float_status *s_std, float_status *s_odd)
1013{
1014    float64 e1r = float16_to_float64(e1 & 0xffff, true, s_std);
1015    float64 e1c = float16_to_float64(e1 >> 16, true, s_std);
1016    float64 e2r = float16_to_float64(e2 & 0xffff, true, s_std);
1017    float64 e2c = float16_to_float64(e2 >> 16, true, s_std);
1018    float64 t64;
1019    float32 t32;
1020
1021    /*
1022     * The ARM pseudocode function FPDot performs both multiplies
1023     * and the add with a single rounding operation.  Emulate this
1024     * by performing the first multiply in round-to-odd, then doing
1025     * the second multiply as fused multiply-add, and rounding to
1026     * float32 all in one step.
1027     */
1028    t64 = float64_mul(e1r, e2r, s_odd);
1029    t64 = float64r32_muladd(e1c, e2c, t64, 0, s_std);
1030
1031    /* This conversion is exact, because we've already rounded. */
1032    t32 = float64_to_float32(t64, s_std);
1033
1034    /* The final accumulation step is not fused. */
1035    return float32_add(sum, t32, s_std);
1036}
1037
1038void HELPER(sme_fmopa_h)(void *vza, void *vzn, void *vzm, void *vpn,
1039                         void *vpm, void *vst, uint32_t desc)
1040{
1041    intptr_t row, col, oprsz = simd_maxsz(desc);
1042    uint32_t neg = simd_data(desc) * 0x80008000u;
1043    uint16_t *pn = vpn, *pm = vpm;
1044    float_status fpst_odd, fpst_std;
1045
1046    /*
1047     * Make a copy of float_status because this operation does not
1048     * update the cumulative fp exception status.  It also produces
1049     * default nans.  Make a second copy with round-to-odd -- see above.
1050     */
1051    fpst_std = *(float_status *)vst;
1052    set_default_nan_mode(true, &fpst_std);
1053    fpst_odd = fpst_std;
1054    set_float_rounding_mode(float_round_to_odd, &fpst_odd);
1055
1056    for (row = 0; row < oprsz; ) {
1057        uint16_t prow = pn[H2(row >> 4)];
1058        do {
1059            void *vza_row = vza + tile_vslice_offset(row);
1060            uint32_t n = *(uint32_t *)(vzn + H1_4(row));
1061
1062            n = f16mop_adj_pair(n, prow, neg);
1063
1064            for (col = 0; col < oprsz; ) {
1065                uint16_t pcol = pm[H2(col >> 4)];
1066                do {
1067                    if (prow & pcol & 0b0101) {
1068                        uint32_t *a = vza_row + H1_4(col);
1069                        uint32_t m = *(uint32_t *)(vzm + H1_4(col));
1070
1071                        m = f16mop_adj_pair(m, pcol, 0);
1072                        *a = f16_dotadd(*a, n, m, &fpst_std, &fpst_odd);
1073
1074                        col += 4;
1075                        pcol >>= 4;
1076                    }
1077                } while (col & 15);
1078            }
1079            row += 4;
1080            prow >>= 4;
1081        } while (row & 15);
1082    }
1083}
1084
1085void HELPER(sme_bfmopa)(void *vza, void *vzn, void *vzm, void *vpn,
1086                        void *vpm, uint32_t desc)
1087{
1088    intptr_t row, col, oprsz = simd_maxsz(desc);
1089    uint32_t neg = simd_data(desc) * 0x80008000u;
1090    uint16_t *pn = vpn, *pm = vpm;
1091
1092    for (row = 0; row < oprsz; ) {
1093        uint16_t prow = pn[H2(row >> 4)];
1094        do {
1095            void *vza_row = vza + tile_vslice_offset(row);
1096            uint32_t n = *(uint32_t *)(vzn + H1_4(row));
1097
1098            n = f16mop_adj_pair(n, prow, neg);
1099
1100            for (col = 0; col < oprsz; ) {
1101                uint16_t pcol = pm[H2(col >> 4)];
1102                do {
1103                    if (prow & pcol & 0b0101) {
1104                        uint32_t *a = vza_row + H1_4(col);
1105                        uint32_t m = *(uint32_t *)(vzm + H1_4(col));
1106
1107                        m = f16mop_adj_pair(m, pcol, 0);
1108                        *a = bfdotadd(*a, n, m);
1109
1110                        col += 4;
1111                        pcol >>= 4;
1112                    }
1113                } while (col & 15);
1114            }
1115            row += 4;
1116            prow >>= 4;
1117        } while (row & 15);
1118    }
1119}
1120
/*
 * Per-element integer outer-product kernel: takes one 64-bit lane of N
 * and M, the current accumulator, the combined predicate byte and the
 * negate flag, and returns the new accumulator value.
 */
typedef uint64_t IMOPFn(uint64_t, uint64_t, uint64_t, uint8_t, bool);

/*
 * Common driver for the integer outer-product helpers.
 *
 * Walks every (row, col) pair of 64-bit lanes and replaces the ZA lane
 * with fn(zn[row], zm[col], za_lane, pn[row] & pm[col], neg).  No lane
 * is skipped here; predication is left entirely to FN, which receives
 * the and-ed predicate bytes.  simd_data(desc) supplies the negate flag.
 */
static inline void do_imopa(uint64_t *za, uint64_t *zn, uint64_t *zm,
                            uint8_t *pn, uint8_t *pm,
                            uint32_t desc, IMOPFn *fn)
{
    intptr_t nlanes = simd_oprsz(desc) / 8;
    bool subtract = simd_data(desc);
    intptr_t r, c;

    for (r = 0; r < nlanes; r++) {
        uint8_t row_bits = pn[H1(r)];
        uint64_t *out = &za[tile_vslice_index(r)];
        uint64_t lhs = zn[r];

        for (c = 0; c < nlanes; c++) {
            uint8_t both = row_bits & pm[H1(c)];

            out[c] = fn(lhs, zm[c], out[c], both, subtract);
        }
    }
}
1143
/*
 * Define an IMOPFn kernel producing two 32-bit accumulators per lane.
 *
 * The 64-bit lane A holds two 32-bit sums.  The low sum receives the
 * products of bytes 0..3 of N and M, the high sum those of bytes 4..7,
 * with N read as NTYPE and M as MTYPE.  The sums are added to, or when
 * NEG is set subtracted from, the matching half of A; all accumulation
 * is modulo 2^32.
 */
#define DEF_IMOP_32(NAME, NTYPE, MTYPE) \
static uint64_t NAME(uint64_t n, uint64_t m, uint64_t a, uint8_t p, bool neg) \
{                                                                           \
    uint32_t lo = 0, hi = 0;                                                \
    uint32_t acc_lo = (uint32_t)a;                                          \
    uint32_t acc_hi = (uint32_t)(a >> 32);                                  \
    /* Apply P to N as a mask, making the inactive elements 0. */           \
    n &= expand_pred_b(p);                                                  \
    lo += (NTYPE)(n >> 0) * (MTYPE)(m >> 0);                                \
    lo += (NTYPE)(n >> 8) * (MTYPE)(m >> 8);                                \
    lo += (NTYPE)(n >> 16) * (MTYPE)(m >> 16);                              \
    lo += (NTYPE)(n >> 24) * (MTYPE)(m >> 24);                              \
    hi += (NTYPE)(n >> 32) * (MTYPE)(m >> 32);                              \
    hi += (NTYPE)(n >> 40) * (MTYPE)(m >> 40);                              \
    hi += (NTYPE)(n >> 48) * (MTYPE)(m >> 48);                              \
    hi += (NTYPE)(n >> 56) * (MTYPE)(m >> 56);                              \
    if (neg) {                                                              \
        acc_lo -= lo;                                                       \
        acc_hi -= hi;                                                       \
    } else {                                                                \
        acc_lo += lo;                                                       \
        acc_hi += hi;                                                       \
    }                                                                       \
    return ((uint64_t)acc_hi << 32) | acc_lo;                               \
}
1165
/*
 * Define an IMOPFn kernel producing one 64-bit accumulator per lane.
 *
 * The four 16-bit elements of N (read as NTYPE) are multiplied by the
 * matching elements of M (read as MTYPE); the sum of the products is
 * added to, or when NEG is set subtracted from, A modulo 2^64.
 *
 * Each product is computed at 64-bit width.  Without the widening cast
 * both operands promote to int, and uint16_t * uint16_t can exceed
 * INT_MAX: that is signed overflow, and the wrapped negative value
 * would be sign-extended into the 64-bit sum.
 */
#define DEF_IMOP_64(NAME, NTYPE, MTYPE) \
static uint64_t NAME(uint64_t n, uint64_t m, uint64_t a, uint8_t p, bool neg) \
{                                                                           \
    uint64_t sum = 0;                                                       \
    /* Apply P to N as a mask, making the inactive elements 0. */           \
    n &= expand_pred_h(p);                                                  \
    sum += (int64_t)(NTYPE)(n >> 0) * (MTYPE)(m >> 0);                      \
    sum += (int64_t)(NTYPE)(n >> 16) * (MTYPE)(m >> 16);                    \
    sum += (int64_t)(NTYPE)(n >> 32) * (MTYPE)(m >> 32);                    \
    sum += (int64_t)(NTYPE)(n >> 48) * (MTYPE)(m >> 48);                    \
    return neg ? a - sum : a + sum;                                         \
}
1178
1179DEF_IMOP_32(smopa_s, int8_t, int8_t)
1180DEF_IMOP_32(umopa_s, uint8_t, uint8_t)
1181DEF_IMOP_32(sumopa_s, int8_t, uint8_t)
1182DEF_IMOP_32(usmopa_s, uint8_t, int8_t)
1183
1184DEF_IMOP_64(smopa_d, int16_t, int16_t)
1185DEF_IMOP_64(umopa_d, uint16_t, uint16_t)
1186DEF_IMOP_64(sumopa_d, int16_t, uint16_t)
1187DEF_IMOP_64(usmopa_d, uint16_t, int16_t)
1188
1189#define DEF_IMOPH(NAME) \
1190    void HELPER(sme_##NAME)(void *vza, void *vzn, void *vzm, void *vpn,      \
1191                            void *vpm, uint32_t desc)                        \
1192    { do_imopa(vza, vzn, vzm, vpn, vpm, desc, NAME); }
1193
1194DEF_IMOPH(smopa_s)
1195DEF_IMOPH(umopa_s)
1196DEF_IMOPH(sumopa_s)
1197DEF_IMOPH(usmopa_s)
1198DEF_IMOPH(smopa_d)
1199DEF_IMOPH(umopa_d)
1200DEF_IMOPH(sumopa_d)
1201DEF_IMOPH(usmopa_d)
1202