qemu/target/arm/tcg/sve_helper.c
   1/*
   2 * ARM SVE Operations
   3 *
   4 * Copyright (c) 2018 Linaro, Ltd.
   5 *
   6 * This library is free software; you can redistribute it and/or
   7 * modify it under the terms of the GNU Lesser General Public
   8 * License as published by the Free Software Foundation; either
   9 * version 2.1 of the License, or (at your option) any later version.
  10 *
  11 * This library is distributed in the hope that it will be useful,
  12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14 * Lesser General Public License for more details.
  15 *
  16 * You should have received a copy of the GNU Lesser General Public
  17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
  18 */
  19
  20#include "qemu/osdep.h"
  21#include "cpu.h"
  22#include "internals.h"
  23#include "exec/exec-all.h"
  24#include "exec/helper-proto.h"
  25#include "tcg/tcg-gvec-desc.h"
  26#include "fpu/softfloat.h"
  27#include "tcg/tcg.h"
  28#include "vec_internal.h"
  29#include "sve_ldst_internal.h"
  30#include "hw/core/tcg-cpu-ops.h"
  31
  32
  33/* Return a value for NZCV as per the ARM PredTest pseudofunction.
  34 *
  35 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
  36 * and bit 0 set if C is set.  Compare the definitions of these variables
  37 * within CPUARMState.
  38 */
  39
  40/* For no G bits set, NZCV = C.  */
  41#define PREDTEST_INIT  1
  42
  43/* This is an iterative function, called for each Pd and Pg word
  44 * moving forward.
  45 */
  46static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
  47{
  48    if (likely(g)) {
  49        /* Compute N from first D & G.
  50           Use bit 2 to signal first G bit seen.  */
  51        if (!(flags & 4)) {
  52            flags |= ((d & (g & -g)) != 0) << 31;
  53            flags |= 4;
  54        }
  55
  56        /* Accumulate Z from each D & G.  */
  57        flags |= ((d & g) != 0) << 1;
  58
  59        /* Compute C from last !(D & G).  Replace previous.  */
  60        flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
  61    }
  62    return flags;
  63}
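/*
 * A worked example of the flag construction above (illustrative only):
 * with g = 0x0f (four active elements) and d = 0x05 (elements 0 and 2
 * true), the first active element is true so N (bit 31) is set, some
 * active element is true so the Z-clear bit (bit 1) is set, and the
 * last active element (bit 3) is false so C (bit 0) remains set,
 * giving flags = 0x80000007 (bit 2 is only the internal "first G bit
 * seen" marker).
 */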
  64
  65/* This is an iterative function, called for each Pd and Pg word
  66 * moving backward.
  67 */
  68static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
  69{
  70    if (likely(g)) {
  71        /* Compute C from first (i.e last) !(D & G).
  72           Use bit 2 to signal first G bit seen.  */
  73        if (!(flags & 4)) {
  74            flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
  75            flags |= (d & pow2floor(g)) == 0;
  76        }
  77
  78        /* Accumulate Z from each D & G.  */
  79        flags |= ((d & g) != 0) << 1;
  80
  81        /* Compute N from last (i.e first) D & G.  Replace previous.  */
  82        flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
  83    }
  84    return flags;
  85}
  86
  87/* The same for a single word predicate.  */
  88uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
  89{
  90    return iter_predtest_fwd(d, g, PREDTEST_INIT);
  91}
  92
  93/* The same for a multi-word predicate.  */
  94uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
  95{
  96    uint32_t flags = PREDTEST_INIT;
  97    uint64_t *d = vd, *g = vg;
  98    uintptr_t i = 0;
  99
 100    do {
 101        flags = iter_predtest_fwd(d[i], g[i], flags);
 102    } while (++i < words);
 103
 104    return flags;
 105}
 106
 107/* Similarly for single word elements.  */
 108static inline uint64_t expand_pred_s(uint8_t byte)
 109{
 110    static const uint64_t word[] = {
 111        [0x01] = 0x00000000ffffffffull,
 112        [0x10] = 0xffffffff00000000ull,
 113        [0x11] = 0xffffffffffffffffull,
 114    };
 115    return word[byte & 0x11];
 116}
 117
 118#define LOGICAL_PPPP(NAME, FUNC) \
 119void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
 120{                                                                         \
 121    uintptr_t opr_sz = simd_oprsz(desc);                                  \
 122    uint64_t *d = vd, *n = vn, *m = vm, *g = vg;                          \
 123    uintptr_t i;                                                          \
 124    for (i = 0; i < opr_sz / 8; ++i) {                                    \
 125        d[i] = FUNC(n[i], m[i], g[i]);                                    \
 126    }                                                                     \
 127}
 128
 129#define DO_AND(N, M, G)  (((N) & (M)) & (G))
 130#define DO_BIC(N, M, G)  (((N) & ~(M)) & (G))
 131#define DO_EOR(N, M, G)  (((N) ^ (M)) & (G))
 132#define DO_ORR(N, M, G)  (((N) | (M)) & (G))
 133#define DO_ORN(N, M, G)  (((N) | ~(M)) & (G))
 134#define DO_NOR(N, M, G)  (~((N) | (M)) & (G))
 135#define DO_NAND(N, M, G) (~((N) & (M)) & (G))
 136#define DO_SEL(N, M, G)  (((N) & (G)) | ((M) & ~(G)))
 137
 138LOGICAL_PPPP(sve_and_pppp, DO_AND)
 139LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
 140LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
 141LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
 142LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
 143LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
 144LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
 145LOGICAL_PPPP(sve_nand_pppp, DO_NAND)
 146
 147#undef DO_AND
 148#undef DO_BIC
 149#undef DO_EOR
 150#undef DO_ORR
 151#undef DO_ORN
 152#undef DO_NOR
 153#undef DO_NAND
 154#undef DO_SEL
 155#undef LOGICAL_PPPP
 156
 157/* Fully general three-operand expander, controlled by a predicate.
 158 * This is complicated by the host-endian storage of the register file.
 159 */
 160/* ??? I don't expect the compiler could ever vectorize this itself.
 161 * With some tables we can convert bit masks to byte masks, and with
 162 * extra care wrt byte/word ordering we could use gcc generic vectors
 163 * and do 16 bytes at a time.
 164 */
 165#define DO_ZPZZ(NAME, TYPE, H, OP)                                       \
 166void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
 167{                                                                       \
 168    intptr_t i, opr_sz = simd_oprsz(desc);                              \
 169    for (i = 0; i < opr_sz; ) {                                         \
 170        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
 171        do {                                                            \
 172            if (pg & 1) {                                               \
 173                TYPE nn = *(TYPE *)(vn + H(i));                         \
 174                TYPE mm = *(TYPE *)(vm + H(i));                         \
 175                *(TYPE *)(vd + H(i)) = OP(nn, mm);                      \
 176            }                                                           \
 177            i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
 178        } while (i & 15);                                               \
 179    }                                                                   \
 180}
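/*
 * Note that the 16-bit predicate chunk read above covers 16 bytes of
 * vector data, one predicate bit per vector byte.  Only the low bit of
 * each element's group of sizeof(TYPE) bits is significant, hence the
 * (pg & 1) test followed by shifting pg right by sizeof(TYPE).
 */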
 181
 182/* Similarly, specialized for 64-bit operands.  */
 183#define DO_ZPZZ_D(NAME, TYPE, OP)                                \
 184void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
 185{                                                               \
 186    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
 187    TYPE *d = vd, *n = vn, *m = vm;                             \
 188    uint8_t *pg = vg;                                           \
 189    for (i = 0; i < opr_sz; i += 1) {                           \
 190        if (pg[H1(i)] & 1) {                                    \
 191            TYPE nn = n[i], mm = m[i];                          \
 192            d[i] = OP(nn, mm);                                  \
 193        }                                                       \
 194    }                                                           \
 195}
 196
 197#define DO_AND(N, M)  (N & M)
 198#define DO_EOR(N, M)  (N ^ M)
 199#define DO_ORR(N, M)  (N | M)
 200#define DO_BIC(N, M)  (N & ~M)
 201#define DO_ADD(N, M)  (N + M)
 202#define DO_SUB(N, M)  (N - M)
 203#define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
 204#define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
 205#define DO_ABD(N, M)  ((N) >= (M) ? (N) - (M) : (M) - (N))
 206#define DO_MUL(N, M)  (N * M)
 207
 208
 209/*
 210 * We must avoid the C undefined behaviour cases: division by
 211 * zero and signed division of INT_MIN by -1. Both of these
 212 * have architecturally defined required results for Arm.
 213 * We special case all signed divisions by -1 to avoid having
 214 * to deduce the minimum integer for the type involved.
 215 */
 216#define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
 217#define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
 218
 219DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
 220DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
 221DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
 222DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)
 223
 224DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
 225DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
 226DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
 227DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)
 228
 229DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
 230DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
 231DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
 232DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)
 233
 234DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
 235DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
 236DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
 237DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)
 238
 239DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
 240DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
 241DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
 242DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)
 243
 244DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
 245DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
 246DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
 247DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)
 248
 249DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
 250DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
 251DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
 252DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)
 253
 254DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
 255DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
 256DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
 257DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)
 258
 259DO_ZPZZ(sve_smin_zpzz_b, int8_t,  H1, DO_MIN)
 260DO_ZPZZ(sve_smin_zpzz_h, int16_t,  H1_2, DO_MIN)
 261DO_ZPZZ(sve_smin_zpzz_s, int32_t,  H1_4, DO_MIN)
 262DO_ZPZZ_D(sve_smin_zpzz_d, int64_t,  DO_MIN)
 263
 264DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
 265DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
 266DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
 267DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)
 268
 269DO_ZPZZ(sve_sabd_zpzz_b, int8_t,  H1, DO_ABD)
 270DO_ZPZZ(sve_sabd_zpzz_h, int16_t,  H1_2, DO_ABD)
 271DO_ZPZZ(sve_sabd_zpzz_s, int32_t,  H1_4, DO_ABD)
 272DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t,  DO_ABD)
 273
 274DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
 275DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
 276DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
 277DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
 278
 279/* Because the computation type is at least twice as large as required,
 280   these work for both signed and unsigned source types.  */
 281static inline uint8_t do_mulh_b(int32_t n, int32_t m)
 282{
 283    return (n * m) >> 8;
 284}
 285
 286static inline uint16_t do_mulh_h(int32_t n, int32_t m)
 287{
 288    return (n * m) >> 16;
 289}
 290
 291static inline uint32_t do_mulh_s(int64_t n, int64_t m)
 292{
 293    return (n * m) >> 32;
 294}
 295
 296static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
 297{
 298    uint64_t lo, hi;
 299    muls64(&lo, &hi, n, m);
 300    return hi;
 301}
 302
 303static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
 304{
 305    uint64_t lo, hi;
 306    mulu64(&lo, &hi, n, m);
 307    return hi;
 308}
 309
 310DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
 311DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
 312DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
 313DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)
 314
 315DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
 316DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
 317DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
 318DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)
 319
 320DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
 321DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
 322DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
 323DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)
 324
 325DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
 326DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)
 327
 328DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
 329DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)
 330
 331/* Note that all bits of the shift are significant
 332   and not modulo the element size.  */
 333#define DO_ASR(N, M)  (N >> MIN(M, sizeof(N) * 8 - 1))
 334#define DO_LSR(N, M)  (M < sizeof(N) * 8 ? N >> M : 0)
 335#define DO_LSL(N, M)  (M < sizeof(N) * 8 ? N << M : 0)
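/*
 * E.g. for byte elements, DO_ASR(-4, 9) clamps the shift count to 7 and
 * yields -1 (the sign bit is replicated), while DO_LSR(0x80, 9) and
 * DO_LSL(0x01, 8) both yield 0.
 */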
 336
 337DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
 338DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1_2, DO_LSR)
 339DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1_4, DO_LSL)
 340
 341DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1, DO_ASR)
 342DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
 343DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_4, DO_LSL)
 344
 345DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1, DO_ASR)
 346DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_2, DO_LSR)
 347DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
 348
 349DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
 350DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
 351DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
 352
 353static inline uint16_t do_sadalp_h(int16_t n, int16_t m)
 354{
 355    int8_t n1 = n, n2 = n >> 8;
 356    return m + n1 + n2;
 357}
 358
 359static inline uint32_t do_sadalp_s(int32_t n, int32_t m)
 360{
 361    int16_t n1 = n, n2 = n >> 16;
 362    return m + n1 + n2;
 363}
 364
 365static inline uint64_t do_sadalp_d(int64_t n, int64_t m)
 366{
 367    int32_t n1 = n, n2 = n >> 32;
 368    return m + n1 + n2;
 369}
 370
 371DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h)
 372DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s)
 373DO_ZPZZ_D(sve2_sadalp_zpzz_d, int64_t, do_sadalp_d)
 374
 375static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m)
 376{
 377    uint8_t n1 = n, n2 = n >> 8;
 378    return m + n1 + n2;
 379}
 380
 381static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m)
 382{
 383    uint16_t n1 = n, n2 = n >> 16;
 384    return m + n1 + n2;
 385}
 386
 387static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m)
 388{
 389    uint32_t n1 = n, n2 = n >> 32;
 390    return m + n1 + n2;
 391}
 392
 393DO_ZPZZ(sve2_uadalp_zpzz_h, uint16_t, H1_2, do_uadalp_h)
 394DO_ZPZZ(sve2_uadalp_zpzz_s, uint32_t, H1_4, do_uadalp_s)
 395DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d)
 396
 397#define do_srshl_b(n, m)  do_sqrshl_bhs(n, m, 8, true, NULL)
 398#define do_srshl_h(n, m)  do_sqrshl_bhs(n, m, 16, true, NULL)
 399#define do_srshl_s(n, m)  do_sqrshl_bhs(n, m, 32, true, NULL)
 400#define do_srshl_d(n, m)  do_sqrshl_d(n, m, true, NULL)
 401
 402DO_ZPZZ(sve2_srshl_zpzz_b, int8_t, H1, do_srshl_b)
 403DO_ZPZZ(sve2_srshl_zpzz_h, int16_t, H1_2, do_srshl_h)
 404DO_ZPZZ(sve2_srshl_zpzz_s, int32_t, H1_4, do_srshl_s)
 405DO_ZPZZ_D(sve2_srshl_zpzz_d, int64_t, do_srshl_d)
 406
 407#define do_urshl_b(n, m)  do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL)
 408#define do_urshl_h(n, m)  do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL)
 409#define do_urshl_s(n, m)  do_uqrshl_bhs(n, m, 32, true, NULL)
 410#define do_urshl_d(n, m)  do_uqrshl_d(n, m, true, NULL)
 411
 412DO_ZPZZ(sve2_urshl_zpzz_b, uint8_t, H1, do_urshl_b)
 413DO_ZPZZ(sve2_urshl_zpzz_h, uint16_t, H1_2, do_urshl_h)
 414DO_ZPZZ(sve2_urshl_zpzz_s, uint32_t, H1_4, do_urshl_s)
 415DO_ZPZZ_D(sve2_urshl_zpzz_d, uint64_t, do_urshl_d)
 416
 417/*
 418 * Unlike the NEON and AdvSIMD versions, there is no QC bit to set.
 419 * We pass in a pointer to a dummy saturation field to trigger
 420 * the saturating arithmetic but discard the information about
 421 * whether it has occurred.
 422 */
 423#define do_sqshl_b(n, m) \
 424   ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); })
 425#define do_sqshl_h(n, m) \
 426   ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); })
 427#define do_sqshl_s(n, m) \
 428   ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); })
 429#define do_sqshl_d(n, m) \
 430   ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); })
 431
 432DO_ZPZZ(sve2_sqshl_zpzz_b, int8_t, H1_2, do_sqshl_b)
 433DO_ZPZZ(sve2_sqshl_zpzz_h, int16_t, H1_2, do_sqshl_h)
 434DO_ZPZZ(sve2_sqshl_zpzz_s, int32_t, H1_4, do_sqshl_s)
 435DO_ZPZZ_D(sve2_sqshl_zpzz_d, int64_t, do_sqshl_d)
 436
 437#define do_uqshl_b(n, m) \
 438   ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
 439#define do_uqshl_h(n, m) \
 440   ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
 441#define do_uqshl_s(n, m) \
 442   ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); })
 443#define do_uqshl_d(n, m) \
 444   ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); })
 445
 446DO_ZPZZ(sve2_uqshl_zpzz_b, uint8_t, H1_2, do_uqshl_b)
 447DO_ZPZZ(sve2_uqshl_zpzz_h, uint16_t, H1_2, do_uqshl_h)
 448DO_ZPZZ(sve2_uqshl_zpzz_s, uint32_t, H1_4, do_uqshl_s)
 449DO_ZPZZ_D(sve2_uqshl_zpzz_d, uint64_t, do_uqshl_d)
 450
 451#define do_sqrshl_b(n, m) \
 452   ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); })
 453#define do_sqrshl_h(n, m) \
 454   ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); })
 455#define do_sqrshl_s(n, m) \
 456   ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); })
 457#define do_sqrshl_d(n, m) \
 458   ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); })
 459
 460DO_ZPZZ(sve2_sqrshl_zpzz_b, int8_t, H1_2, do_sqrshl_b)
 461DO_ZPZZ(sve2_sqrshl_zpzz_h, int16_t, H1_2, do_sqrshl_h)
 462DO_ZPZZ(sve2_sqrshl_zpzz_s, int32_t, H1_4, do_sqrshl_s)
 463DO_ZPZZ_D(sve2_sqrshl_zpzz_d, int64_t, do_sqrshl_d)
 464
 465#undef do_sqrshl_d
 466
 467#define do_uqrshl_b(n, m) \
 468   ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); })
 469#define do_uqrshl_h(n, m) \
 470   ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); })
 471#define do_uqrshl_s(n, m) \
 472   ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); })
 473#define do_uqrshl_d(n, m) \
 474   ({ uint32_t discard; do_uqrshl_d(n, m, true, &discard); })
 475
 476DO_ZPZZ(sve2_uqrshl_zpzz_b, uint8_t, H1_2, do_uqrshl_b)
 477DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h)
 478DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s)
 479DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d)
 480
 481#undef do_uqrshl_d
 482
 483#define DO_HADD_BHS(n, m)  (((int64_t)n + m) >> 1)
 484#define DO_HADD_D(n, m)    ((n >> 1) + (m >> 1) + (n & m & 1))
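/*
 * The D form avoids a 65-bit intermediate sum: the carry out of the two
 * discarded low bits is 1 exactly when both are set.  E.g. with
 * n = m = UINT64_MAX, (n >> 1) + (m >> 1) + 1 = UINT64_MAX, matching
 * ((n + m) >> 1) computed at full precision.
 */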
 485
 486DO_ZPZZ(sve2_shadd_zpzz_b, int8_t, H1, DO_HADD_BHS)
 487DO_ZPZZ(sve2_shadd_zpzz_h, int16_t, H1_2, DO_HADD_BHS)
 488DO_ZPZZ(sve2_shadd_zpzz_s, int32_t, H1_4, DO_HADD_BHS)
 489DO_ZPZZ_D(sve2_shadd_zpzz_d, int64_t, DO_HADD_D)
 490
 491DO_ZPZZ(sve2_uhadd_zpzz_b, uint8_t, H1, DO_HADD_BHS)
 492DO_ZPZZ(sve2_uhadd_zpzz_h, uint16_t, H1_2, DO_HADD_BHS)
 493DO_ZPZZ(sve2_uhadd_zpzz_s, uint32_t, H1_4, DO_HADD_BHS)
 494DO_ZPZZ_D(sve2_uhadd_zpzz_d, uint64_t, DO_HADD_D)
 495
 496#define DO_RHADD_BHS(n, m)  (((int64_t)n + m + 1) >> 1)
 497#define DO_RHADD_D(n, m)    ((n >> 1) + (m >> 1) + ((n | m) & 1))
 498
 499DO_ZPZZ(sve2_srhadd_zpzz_b, int8_t, H1, DO_RHADD_BHS)
 500DO_ZPZZ(sve2_srhadd_zpzz_h, int16_t, H1_2, DO_RHADD_BHS)
 501DO_ZPZZ(sve2_srhadd_zpzz_s, int32_t, H1_4, DO_RHADD_BHS)
 502DO_ZPZZ_D(sve2_srhadd_zpzz_d, int64_t, DO_RHADD_D)
 503
 504DO_ZPZZ(sve2_urhadd_zpzz_b, uint8_t, H1, DO_RHADD_BHS)
 505DO_ZPZZ(sve2_urhadd_zpzz_h, uint16_t, H1_2, DO_RHADD_BHS)
 506DO_ZPZZ(sve2_urhadd_zpzz_s, uint32_t, H1_4, DO_RHADD_BHS)
 507DO_ZPZZ_D(sve2_urhadd_zpzz_d, uint64_t, DO_RHADD_D)
 508
 509#define DO_HSUB_BHS(n, m)  (((int64_t)n - m) >> 1)
 510#define DO_HSUB_D(n, m)    ((n >> 1) - (m >> 1) - (~n & m & 1))
 511
 512DO_ZPZZ(sve2_shsub_zpzz_b, int8_t, H1, DO_HSUB_BHS)
 513DO_ZPZZ(sve2_shsub_zpzz_h, int16_t, H1_2, DO_HSUB_BHS)
 514DO_ZPZZ(sve2_shsub_zpzz_s, int32_t, H1_4, DO_HSUB_BHS)
 515DO_ZPZZ_D(sve2_shsub_zpzz_d, int64_t, DO_HSUB_D)
 516
 517DO_ZPZZ(sve2_uhsub_zpzz_b, uint8_t, H1, DO_HSUB_BHS)
 518DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS)
 519DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS)
 520DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D)
 521
 522static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max)
 523{
 524    return val >= max ? max : val <= min ? min : val;
 525}
 526
 527#define DO_SQADD_B(n, m) do_sat_bhs((int64_t)n + m, INT8_MIN, INT8_MAX)
 528#define DO_SQADD_H(n, m) do_sat_bhs((int64_t)n + m, INT16_MIN, INT16_MAX)
 529#define DO_SQADD_S(n, m) do_sat_bhs((int64_t)n + m, INT32_MIN, INT32_MAX)
 530
 531static inline int64_t do_sqadd_d(int64_t n, int64_t m)
 532{
 533    int64_t r = n + m;
 534    if (((r ^ n) & ~(n ^ m)) < 0) {
 535        /* Signed overflow.  */
 536        return r < 0 ? INT64_MAX : INT64_MIN;
 537    }
 538    return r;
 539}
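/*
 * The overflow test above is the usual sign trick: signed overflow can
 * only occur when n and m have the same sign (~(n ^ m) is negative) and
 * the wrapped result r then has the opposite sign to n ((r ^ n) is
 * negative), in which case the saturated value is chosen from the sign
 * of r.
 */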
 540
 541DO_ZPZZ(sve2_sqadd_zpzz_b, int8_t, H1, DO_SQADD_B)
 542DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H)
 543DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S)
 544DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d)
 545
 546#define DO_UQADD_B(n, m) do_sat_bhs((int64_t)n + m, 0, UINT8_MAX)
 547#define DO_UQADD_H(n, m) do_sat_bhs((int64_t)n + m, 0, UINT16_MAX)
 548#define DO_UQADD_S(n, m) do_sat_bhs((int64_t)n + m, 0, UINT32_MAX)
 549
 550static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m)
 551{
 552    uint64_t r = n + m;
 553    return r < n ? UINT64_MAX : r;
 554}
 555
 556DO_ZPZZ(sve2_uqadd_zpzz_b, uint8_t, H1, DO_UQADD_B)
 557DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H)
 558DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S)
 559DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d)
 560
 561#define DO_SQSUB_B(n, m) do_sat_bhs((int64_t)n - m, INT8_MIN, INT8_MAX)
 562#define DO_SQSUB_H(n, m) do_sat_bhs((int64_t)n - m, INT16_MIN, INT16_MAX)
 563#define DO_SQSUB_S(n, m) do_sat_bhs((int64_t)n - m, INT32_MIN, INT32_MAX)
 564
 565static inline int64_t do_sqsub_d(int64_t n, int64_t m)
 566{
 567    int64_t r = n - m;
 568    if (((r ^ n) & (n ^ m)) < 0) {
 569        /* Signed overflow.  */
 570        return r < 0 ? INT64_MAX : INT64_MIN;
 571    }
 572    return r;
 573}
 574
 575DO_ZPZZ(sve2_sqsub_zpzz_b, int8_t, H1, DO_SQSUB_B)
 576DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H)
 577DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S)
 578DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d)
 579
 580#define DO_UQSUB_B(n, m) do_sat_bhs((int64_t)n - m, 0, UINT8_MAX)
 581#define DO_UQSUB_H(n, m) do_sat_bhs((int64_t)n - m, 0, UINT16_MAX)
 582#define DO_UQSUB_S(n, m) do_sat_bhs((int64_t)n - m, 0, UINT32_MAX)
 583
 584static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m)
 585{
 586    return n > m ? n - m : 0;
 587}
 588
 589DO_ZPZZ(sve2_uqsub_zpzz_b, uint8_t, H1, DO_UQSUB_B)
 590DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H)
 591DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S)
 592DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d)
 593
 594#define DO_SUQADD_B(n, m) \
 595    do_sat_bhs((int64_t)(int8_t)n + m, INT8_MIN, INT8_MAX)
 596#define DO_SUQADD_H(n, m) \
 597    do_sat_bhs((int64_t)(int16_t)n + m, INT16_MIN, INT16_MAX)
 598#define DO_SUQADD_S(n, m) \
 599    do_sat_bhs((int64_t)(int32_t)n + m, INT32_MIN, INT32_MAX)
 600
 601static inline int64_t do_suqadd_d(int64_t n, uint64_t m)
 602{
 603    uint64_t r = n + m;
 604
 605    if (n < 0) {
 606        /* Note that m - abs(n) cannot underflow. */
 607        if (r > INT64_MAX) {
 608            /* Result is either very large positive or negative. */
 609            if (m > -n) {
 610                /* m > abs(n), so r is a very large positive. */
 611                return INT64_MAX;
 612            }
 613            /* Result is negative. */
 614        }
 615    } else {
 616        /* Both inputs are positive: check for overflow.  */
 617        if (r < m || r > INT64_MAX) {
 618            return INT64_MAX;
 619        }
 620    }
 621    return r;
 622}
 623
 624DO_ZPZZ(sve2_suqadd_zpzz_b, uint8_t, H1, DO_SUQADD_B)
 625DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H)
 626DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S)
 627DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d)
 628
 629#define DO_USQADD_B(n, m) \
 630    do_sat_bhs((int64_t)n + (int8_t)m, 0, UINT8_MAX)
 631#define DO_USQADD_H(n, m) \
 632    do_sat_bhs((int64_t)n + (int16_t)m, 0, UINT16_MAX)
 633#define DO_USQADD_S(n, m) \
 634    do_sat_bhs((int64_t)n + (int32_t)m, 0, UINT32_MAX)
 635
 636static inline uint64_t do_usqadd_d(uint64_t n, int64_t m)
 637{
 638    uint64_t r = n + m;
 639
 640    if (m < 0) {
 641        return n < -m ? 0 : r;
 642    }
 643    return r < n ? UINT64_MAX : r;
 644}
 645
 646DO_ZPZZ(sve2_usqadd_zpzz_b, uint8_t, H1, DO_USQADD_B)
 647DO_ZPZZ(sve2_usqadd_zpzz_h, uint16_t, H1_2, DO_USQADD_H)
 648DO_ZPZZ(sve2_usqadd_zpzz_s, uint32_t, H1_4, DO_USQADD_S)
 649DO_ZPZZ_D(sve2_usqadd_zpzz_d, uint64_t, do_usqadd_d)
 650
 651#undef DO_ZPZZ
 652#undef DO_ZPZZ_D
 653
 654/*
 655 * Three operand expander, operating on element pairs.
  656 * If the slot I is even, the elements come from VN {I, I+1}.
  657 * If the slot I is odd, the elements come from VM {I-1, I}.
 658 * Load all of the input elements in each pair before overwriting output.
 659 */
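/*
 * For example, for byte elements with all predicate bits set:
 *   d[0] = OP(n[0], n[1]),  d[1] = OP(m[0], m[1]),
 *   d[2] = OP(n[2], n[3]),  d[3] = OP(m[2], m[3]),  and so on.
 */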
 660#define DO_ZPZZ_PAIR(NAME, TYPE, H, OP) \
 661void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
 662{                                                               \
 663    intptr_t i, opr_sz = simd_oprsz(desc);                      \
 664    for (i = 0; i < opr_sz; ) {                                 \
 665        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
 666        do {                                                    \
 667            TYPE n0 = *(TYPE *)(vn + H(i));                     \
 668            TYPE m0 = *(TYPE *)(vm + H(i));                     \
 669            TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE)));      \
 670            TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE)));      \
 671            if (pg & 1) {                                       \
 672                *(TYPE *)(vd + H(i)) = OP(n0, n1);              \
 673            }                                                   \
 674            i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
 675            if (pg & 1) {                                       \
 676                *(TYPE *)(vd + H(i)) = OP(m0, m1);              \
 677            }                                                   \
 678            i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
 679        } while (i & 15);                                       \
 680    }                                                           \
 681}
 682
 683/* Similarly, specialized for 64-bit operands.  */
 684#define DO_ZPZZ_PAIR_D(NAME, TYPE, OP) \
 685void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
 686{                                                               \
 687    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
 688    TYPE *d = vd, *n = vn, *m = vm;                             \
 689    uint8_t *pg = vg;                                           \
 690    for (i = 0; i < opr_sz; i += 2) {                           \
 691        TYPE n0 = n[i], n1 = n[i + 1];                          \
 692        TYPE m0 = m[i], m1 = m[i + 1];                          \
 693        if (pg[H1(i)] & 1) {                                    \
 694            d[i] = OP(n0, n1);                                  \
 695        }                                                       \
 696        if (pg[H1(i + 1)] & 1) {                                \
 697            d[i + 1] = OP(m0, m1);                              \
 698        }                                                       \
 699    }                                                           \
 700}
 701
 702DO_ZPZZ_PAIR(sve2_addp_zpzz_b, uint8_t, H1, DO_ADD)
 703DO_ZPZZ_PAIR(sve2_addp_zpzz_h, uint16_t, H1_2, DO_ADD)
 704DO_ZPZZ_PAIR(sve2_addp_zpzz_s, uint32_t, H1_4, DO_ADD)
 705DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d, uint64_t, DO_ADD)
 706
 707DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b, uint8_t, H1, DO_MAX)
 708DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h, uint16_t, H1_2, DO_MAX)
 709DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s, uint32_t, H1_4, DO_MAX)
 710DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d, uint64_t, DO_MAX)
 711
 712DO_ZPZZ_PAIR(sve2_uminp_zpzz_b, uint8_t, H1, DO_MIN)
 713DO_ZPZZ_PAIR(sve2_uminp_zpzz_h, uint16_t, H1_2, DO_MIN)
 714DO_ZPZZ_PAIR(sve2_uminp_zpzz_s, uint32_t, H1_4, DO_MIN)
 715DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d, uint64_t, DO_MIN)
 716
 717DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b, int8_t, H1, DO_MAX)
 718DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h, int16_t, H1_2, DO_MAX)
 719DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s, int32_t, H1_4, DO_MAX)
 720DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d, int64_t, DO_MAX)
 721
 722DO_ZPZZ_PAIR(sve2_sminp_zpzz_b, int8_t, H1, DO_MIN)
 723DO_ZPZZ_PAIR(sve2_sminp_zpzz_h, int16_t, H1_2, DO_MIN)
 724DO_ZPZZ_PAIR(sve2_sminp_zpzz_s, int32_t, H1_4, DO_MIN)
 725DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d, int64_t, DO_MIN)
 726
 727#undef DO_ZPZZ_PAIR
 728#undef DO_ZPZZ_PAIR_D
 729
 730#define DO_ZPZZ_PAIR_FP(NAME, TYPE, H, OP)                              \
 731void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,               \
 732                  void *status, uint32_t desc)                          \
 733{                                                                       \
 734    intptr_t i, opr_sz = simd_oprsz(desc);                              \
 735    for (i = 0; i < opr_sz; ) {                                         \
 736        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
 737        do {                                                            \
 738            TYPE n0 = *(TYPE *)(vn + H(i));                             \
 739            TYPE m0 = *(TYPE *)(vm + H(i));                             \
 740            TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE)));              \
 741            TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE)));              \
 742            if (pg & 1) {                                               \
 743                *(TYPE *)(vd + H(i)) = OP(n0, n1, status);              \
 744            }                                                           \
 745            i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
 746            if (pg & 1) {                                               \
 747                *(TYPE *)(vd + H(i)) = OP(m0, m1, status);              \
 748            }                                                           \
 749            i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
 750        } while (i & 15);                                               \
 751    }                                                                   \
 752}
 753
 754DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h, float16, H1_2, float16_add)
 755DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s, float32, H1_4, float32_add)
 756DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, float64, H1_8, float64_add)
 757
 758DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h, float16, H1_2, float16_maxnum)
 759DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s, float32, H1_4, float32_maxnum)
 760DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64, H1_8, float64_maxnum)
 761
 762DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h, float16, H1_2, float16_minnum)
 763DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s, float32, H1_4, float32_minnum)
 764DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64, H1_8, float64_minnum)
 765
 766DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h, float16, H1_2, float16_max)
 767DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s, float32, H1_4, float32_max)
 768DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64, H1_8, float64_max)
 769
 770DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h, float16, H1_2, float16_min)
 771DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s, float32, H1_4, float32_min)
 772DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64, H1_8, float64_min)
 773
 774#undef DO_ZPZZ_PAIR_FP
 775
 776/* Three-operand expander, controlled by a predicate, in which the
 777 * third operand is "wide".  That is, for D = N op M, the same 64-bit
 778 * value of M is used with all of the narrower values of N.
 779 */
 780#define DO_ZPZW(NAME, TYPE, TYPEW, H, OP)                               \
 781void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
 782{                                                                       \
 783    intptr_t i, opr_sz = simd_oprsz(desc);                              \
 784    for (i = 0; i < opr_sz; ) {                                         \
 785        uint8_t pg = *(uint8_t *)(vg + H1(i >> 3));                     \
 786        TYPEW mm = *(TYPEW *)(vm + i);                                  \
 787        do {                                                            \
 788            if (pg & 1) {                                               \
 789                TYPE nn = *(TYPE *)(vn + H(i));                         \
 790                *(TYPE *)(vd + H(i)) = OP(nn, mm);                      \
 791            }                                                           \
 792            i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
 793        } while (i & 7);                                                \
 794    }                                                                   \
 795}
 796
 797DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
 798DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
 799DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)
 800
 801DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
 802DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
 803DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
 804
 805DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
 806DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
 807DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
 808
 809#undef DO_ZPZW
 810
 811/* Fully general two-operand expander, controlled by a predicate.
 812 */
 813#define DO_ZPZ(NAME, TYPE, H, OP)                               \
 814void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
 815{                                                               \
 816    intptr_t i, opr_sz = simd_oprsz(desc);                      \
 817    for (i = 0; i < opr_sz; ) {                                 \
 818        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
 819        do {                                                    \
 820            if (pg & 1) {                                       \
 821                TYPE nn = *(TYPE *)(vn + H(i));                 \
 822                *(TYPE *)(vd + H(i)) = OP(nn);                  \
 823            }                                                   \
 824            i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
 825        } while (i & 15);                                       \
 826    }                                                           \
 827}
 828
 829/* Similarly, specialized for 64-bit operands.  */
 830#define DO_ZPZ_D(NAME, TYPE, OP)                                \
 831void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
 832{                                                               \
 833    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
 834    TYPE *d = vd, *n = vn;                                      \
 835    uint8_t *pg = vg;                                           \
 836    for (i = 0; i < opr_sz; i += 1) {                           \
 837        if (pg[H1(i)] & 1) {                                    \
 838            TYPE nn = n[i];                                     \
 839            d[i] = OP(nn);                                      \
 840        }                                                       \
 841    }                                                           \
 842}
 843
 844#define DO_CLS_B(N)   (clrsb32(N) - 24)
 845#define DO_CLS_H(N)   (clrsb32(N) - 16)
 846
 847DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
 848DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
 849DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
 850DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)
 851
 852#define DO_CLZ_B(N)   (clz32(N) - 24)
 853#define DO_CLZ_H(N)   (clz32(N) - 16)
 854
 855DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
 856DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
 857DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
 858DO_ZPZ_D(sve_clz_d, uint64_t, clz64)
 859
 860DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
 861DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
 862DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
 863DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)
 864
 865#define DO_CNOT(N)    (N == 0)
 866
 867DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
 868DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
 869DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
 870DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)
 871
 872#define DO_FABS(N)    (N & ((__typeof(N))-1 >> 1))
 873
 874DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
 875DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
 876DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)
 877
 878#define DO_FNEG(N)    (N ^ ~((__typeof(N))-1 >> 1))
 879
 880DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
 881DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
 882DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)
 883
 884#define DO_NOT(N)    (~N)
 885
 886DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
 887DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
 888DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
 889DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)
 890
 891#define DO_SXTB(N)    ((int8_t)N)
 892#define DO_SXTH(N)    ((int16_t)N)
 893#define DO_SXTS(N)    ((int32_t)N)
 894#define DO_UXTB(N)    ((uint8_t)N)
 895#define DO_UXTH(N)    ((uint16_t)N)
 896#define DO_UXTS(N)    ((uint32_t)N)
 897
 898DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
 899DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
 900DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
 901DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
 902DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
 903DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)
 904
 905DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
 906DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
 907DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
 908DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
 909DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
 910DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)
 911
 912#define DO_ABS(N)    (N < 0 ? -N : N)
 913
 914DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
 915DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
 916DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
 917DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)
 918
 919#define DO_NEG(N)    (-N)
 920
 921DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
 922DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
 923DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
 924DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)
 925
 926DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
 927DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
 928DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)
 929
 930DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
 931DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)
 932
 933DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)
 934
 935void HELPER(sme_revd_q)(void *vd, void *vn, void *vg, uint32_t desc)
 936{
 937    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
 938    uint64_t *d = vd, *n = vn;
 939    uint8_t *pg = vg;
 940
 941    for (i = 0; i < opr_sz; i += 2) {
 942        if (pg[H1(i)] & 1) {
 943            uint64_t n0 = n[i + 0];
 944            uint64_t n1 = n[i + 1];
 945            d[i + 0] = n1;
 946            d[i + 1] = n0;
 947        }
 948    }
 949}
 950
 951DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
 952DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
 953DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
 954DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
 955
 956#define DO_SQABS(X) \
 957    ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
 958       x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; })
 959
 960DO_ZPZ(sve2_sqabs_b, int8_t, H1, DO_SQABS)
 961DO_ZPZ(sve2_sqabs_h, int16_t, H1_2, DO_SQABS)
 962DO_ZPZ(sve2_sqabs_s, int32_t, H1_4, DO_SQABS)
 963DO_ZPZ_D(sve2_sqabs_d, int64_t, DO_SQABS)
 964
 965#define DO_SQNEG(X) \
 966    ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
 967       x_ == min_ ? -min_ - 1 : -x_; })
 968
 969DO_ZPZ(sve2_sqneg_b, uint8_t, H1, DO_SQNEG)
 970DO_ZPZ(sve2_sqneg_h, uint16_t, H1_2, DO_SQNEG)
 971DO_ZPZ(sve2_sqneg_s, uint32_t, H1_4, DO_SQNEG)
 972DO_ZPZ_D(sve2_sqneg_d, uint64_t, DO_SQNEG)
 973
 974DO_ZPZ(sve2_urecpe_s, uint32_t, H1_4, helper_recpe_u32)
 975DO_ZPZ(sve2_ursqrte_s, uint32_t, H1_4, helper_rsqrte_u32)
 976
 977/* Three-operand expander, unpredicated, in which the third operand is "wide".
 978 */
 979#define DO_ZZW(NAME, TYPE, TYPEW, H, OP)                       \
 980void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
 981{                                                              \
 982    intptr_t i, opr_sz = simd_oprsz(desc);                     \
 983    for (i = 0; i < opr_sz; ) {                                \
 984        TYPEW mm = *(TYPEW *)(vm + i);                         \
 985        do {                                                   \
 986            TYPE nn = *(TYPE *)(vn + H(i));                    \
 987            *(TYPE *)(vd + H(i)) = OP(nn, mm);                 \
 988            i += sizeof(TYPE);                                 \
 989        } while (i & 7);                                       \
 990    }                                                          \
 991}
 992
 993DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
 994DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
 995DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)
 996
 997DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
 998DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
 999DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
1000
1001DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
1002DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
1003DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
1004
1005#undef DO_ZZW
1006
1007#undef DO_CLS_B
1008#undef DO_CLS_H
1009#undef DO_CLZ_B
1010#undef DO_CLZ_H
1011#undef DO_CNOT
1012#undef DO_FABS
1013#undef DO_FNEG
1014#undef DO_ABS
1015#undef DO_NEG
1016#undef DO_ZPZ
1017#undef DO_ZPZ_D
1018
1019/*
1020 * Three-operand expander, unpredicated, in which the two inputs are
1021 * selected from the top or bottom half of the wide column.
1022 */
1023#define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP) \
1024void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)          \
1025{                                                                       \
1026    intptr_t i, opr_sz = simd_oprsz(desc);                              \
1027    int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);     \
1028    int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
1029    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                       \
1030        TYPEW nn = *(TYPEN *)(vn + HN(i + sel1));                       \
1031        TYPEW mm = *(TYPEN *)(vm + HN(i + sel2));                       \
1032        *(TYPEW *)(vd + HW(i)) = OP(nn, mm);                            \
1033    }                                                                   \
1034}
1035
1036DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD)
1037DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
1038DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)
1039
1040DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB)
1041DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
1042DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)
1043
1044DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD)
1045DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
1046DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)
1047
1048DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
1049DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
1050DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)
1051
1052DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
1053DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
1054DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)
1055
1056DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
1057DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
1058DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)
1059
1060DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL)
1061DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1062DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1063
1064DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
1065DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1066DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1067
1068/* Note that the multiply cannot overflow, but the doubling can. */
1069static inline int16_t do_sqdmull_h(int16_t n, int16_t m)
1070{
1071    int16_t val = n * m;
1072    return DO_SQADD_H(val, val);
1073}
1074
1075static inline int32_t do_sqdmull_s(int32_t n, int32_t m)
1076{
1077    int32_t val = n * m;
1078    return DO_SQADD_S(val, val);
1079}
1080
1081static inline int64_t do_sqdmull_d(int64_t n, int64_t m)
1082{
1083    int64_t val = n * m;
1084    return do_sqadd_d(val, val);
1085}
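/*
 * E.g. in do_sqdmull_h the operands are sign-extended bytes, so the
 * largest-magnitude product is (-128) * (-128) = 16384; the multiply
 * fits in int16_t, but doubling it would give 32768, which saturates
 * to INT16_MAX.
 */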
1086
1087DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h)
1088DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
1089DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)
1090
1091#undef DO_ZZZ_TB
1092
1093#define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP) \
1094void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1095{                                                              \
1096    intptr_t i, opr_sz = simd_oprsz(desc);                     \
1097    int sel2 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1098    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {              \
1099        TYPEW nn = *(TYPEW *)(vn + HW(i));                     \
1100        TYPEW mm = *(TYPEN *)(vm + HN(i + sel2));              \
1101        *(TYPEW *)(vd + HW(i)) = OP(nn, mm);                   \
1102    }                                                          \
1103}
1104
1105DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD)
1106DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
1107DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)
1108
1109DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB)
1110DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
1111DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)
1112
1113DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
1114DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
1115DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)
1116
1117DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
1118DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
1119DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)
1120
1121#undef DO_ZZZ_WTB
1122
1123#define DO_ZZZ_NTB(NAME, TYPE, H, OP)                                   \
1124void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)          \
1125{                                                                       \
1126    intptr_t i, opr_sz = simd_oprsz(desc);                              \
1127    intptr_t sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE); \
1128    intptr_t sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \
1129    for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {                    \
1130        TYPE nn = *(TYPE *)(vn + H(i + sel1));                          \
1131        TYPE mm = *(TYPE *)(vm + H(i + sel2));                          \
1132        *(TYPE *)(vd + H(i + sel1)) = OP(nn, mm);                       \
1133    }                                                                   \
1134}
1135
1136DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR)
1137DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR)
1138DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR)
1139DO_ZZZ_NTB(sve2_eoril_d, uint64_t, H1_8, DO_EOR)
1140
1141#undef DO_ZZZ_NTB
1142
1143#define DO_ZZZW_ACC(NAME, TYPEW, TYPEN, HW, HN, OP) \
1144void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1145{                                                               \
1146    intptr_t i, opr_sz = simd_oprsz(desc);                      \
1147    intptr_t sel1 = simd_data(desc) * sizeof(TYPEN);            \
1148    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {               \
1149        TYPEW nn = *(TYPEN *)(vn + HN(i + sel1));               \
1150        TYPEW mm = *(TYPEN *)(vm + HN(i + sel1));               \
1151        TYPEW aa = *(TYPEW *)(va + HW(i));                      \
1152        *(TYPEW *)(vd + HW(i)) = OP(nn, mm) + aa;               \
1153    }                                                           \
1154}
1155
1156DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD)
1157DO_ZZZW_ACC(sve2_sabal_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
1158DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)
1159
1160DO_ZZZW_ACC(sve2_uabal_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
1161DO_ZZZW_ACC(sve2_uabal_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
1162DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)
1163
1164DO_ZZZW_ACC(sve2_smlal_zzzw_h, int16_t, int8_t, H1_2, H1, DO_MUL)
1165DO_ZZZW_ACC(sve2_smlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1166DO_ZZZW_ACC(sve2_smlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1167
1168DO_ZZZW_ACC(sve2_umlal_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
1169DO_ZZZW_ACC(sve2_umlal_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1170DO_ZZZW_ACC(sve2_umlal_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1171
1172#define DO_NMUL(N, M)  -(N * M)
1173
1174DO_ZZZW_ACC(sve2_smlsl_zzzw_h, int16_t, int8_t, H1_2, H1, DO_NMUL)
1175DO_ZZZW_ACC(sve2_smlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_NMUL)
1176DO_ZZZW_ACC(sve2_smlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_NMUL)
1177
1178DO_ZZZW_ACC(sve2_umlsl_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_NMUL)
1179DO_ZZZW_ACC(sve2_umlsl_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_NMUL)
1180DO_ZZZW_ACC(sve2_umlsl_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_NMUL)
1181
1182#undef DO_ZZZW_ACC
1183
1184#define DO_XTNB(NAME, TYPE, OP) \
1185void HELPER(NAME)(void *vd, void *vn, uint32_t desc)         \
1186{                                                            \
1187    intptr_t i, opr_sz = simd_oprsz(desc);                   \
1188    for (i = 0; i < opr_sz; i += sizeof(TYPE)) {             \
1189        TYPE nn = *(TYPE *)(vn + i);                         \
1190        nn = OP(nn) & MAKE_64BIT_MASK(0, sizeof(TYPE) * 4);  \
1191        *(TYPE *)(vd + i) = nn;                              \
1192    }                                                        \
1193}
1194
1195#define DO_XTNT(NAME, TYPE, TYPEN, H, OP)                               \
1196void HELPER(NAME)(void *vd, void *vn, uint32_t desc)                    \
1197{                                                                       \
1198    intptr_t i, opr_sz = simd_oprsz(desc), odd = H(sizeof(TYPEN));      \
1199    for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                        \
1200        TYPE nn = *(TYPE *)(vn + i);                                    \
1201        *(TYPEN *)(vd + i + odd) = OP(nn);                              \
1202    }                                                                   \
1203}
1204
1205#define DO_SQXTN_H(n)  do_sat_bhs(n, INT8_MIN, INT8_MAX)
1206#define DO_SQXTN_S(n)  do_sat_bhs(n, INT16_MIN, INT16_MAX)
1207#define DO_SQXTN_D(n)  do_sat_bhs(n, INT32_MIN, INT32_MAX)
1208
1209DO_XTNB(sve2_sqxtnb_h, int16_t, DO_SQXTN_H)
1210DO_XTNB(sve2_sqxtnb_s, int32_t, DO_SQXTN_S)
1211DO_XTNB(sve2_sqxtnb_d, int64_t, DO_SQXTN_D)
1212
1213DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, DO_SQXTN_H)
1214DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, DO_SQXTN_S)
1215DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, DO_SQXTN_D)
1216
1217#define DO_UQXTN_H(n)  do_sat_bhs(n, 0, UINT8_MAX)
1218#define DO_UQXTN_S(n)  do_sat_bhs(n, 0, UINT16_MAX)
1219#define DO_UQXTN_D(n)  do_sat_bhs(n, 0, UINT32_MAX)
1220
1221DO_XTNB(sve2_uqxtnb_h, uint16_t, DO_UQXTN_H)
1222DO_XTNB(sve2_uqxtnb_s, uint32_t, DO_UQXTN_S)
1223DO_XTNB(sve2_uqxtnb_d, uint64_t, DO_UQXTN_D)
1224
1225DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, DO_UQXTN_H)
1226DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, DO_UQXTN_S)
1227DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, DO_UQXTN_D)
1228
1229DO_XTNB(sve2_sqxtunb_h, int16_t, DO_UQXTN_H)
1230DO_XTNB(sve2_sqxtunb_s, int32_t, DO_UQXTN_S)
1231DO_XTNB(sve2_sqxtunb_d, int64_t, DO_UQXTN_D)
1232
1233DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, DO_UQXTN_H)
1234DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, DO_UQXTN_S)
1235DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, DO_UQXTN_D)
1236
1237#undef DO_XTNB
1238#undef DO_XTNT
1239
1240void HELPER(sve2_adcl_s)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
1241{
1242    intptr_t i, opr_sz = simd_oprsz(desc);
1243    int sel = H4(extract32(desc, SIMD_DATA_SHIFT, 1));
1244    uint32_t inv = -extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1245    uint32_t *a = va, *n = vn;
1246    uint64_t *d = vd, *m = vm;
1247
1248    for (i = 0; i < opr_sz / 8; ++i) {
1249        uint32_t e1 = a[2 * i + H4(0)];
1250        uint32_t e2 = n[2 * i + sel] ^ inv;
1251        uint64_t c = extract64(m[i], 32, 1);
1252        /* Compute and store the entire 33-bit result at once. */
1253        d[i] = c + e1 + e2;
1254    }
1255}
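/*
 * Each 64-bit lane written above holds a 33-bit value: the 32-bit sum
 * in bits [31:0] and the carry-out in bit 32, which is exactly where
 * the carry-in is read from vm (extract64(m[i], 32, 1)), so a chain of
 * these operations propagates the carry between wide additions.
 */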
1256
1257void HELPER(sve2_adcl_d)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
1258{
1259    intptr_t i, opr_sz = simd_oprsz(desc);
1260    int sel = extract32(desc, SIMD_DATA_SHIFT, 1);
1261    uint64_t inv = -(uint64_t)extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1262    uint64_t *d = vd, *a = va, *n = vn, *m = vm;
1263
1264    for (i = 0; i < opr_sz / 8; i += 2) {
1265        Int128 e1 = int128_make64(a[i]);
1266        Int128 e2 = int128_make64(n[i + sel] ^ inv);
1267        Int128 c = int128_make64(m[i + 1] & 1);
1268        Int128 r = int128_add(int128_add(e1, e2), c);
1269        d[i + 0] = int128_getlo(r);
1270        d[i + 1] = int128_gethi(r);
1271    }
1272}
1273
1274#define DO_SQDMLAL(NAME, TYPEW, TYPEN, HW, HN, DMUL_OP, SUM_OP) \
1275void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1276{                                                                       \
1277    intptr_t i, opr_sz = simd_oprsz(desc);                              \
1278    int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);     \
1279    int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
1280    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                       \
1281        TYPEW nn = *(TYPEN *)(vn + HN(i + sel1));                       \
1282        TYPEW mm = *(TYPEN *)(vm + HN(i + sel2));                       \
1283        TYPEW aa = *(TYPEW *)(va + HW(i));                              \
1284        *(TYPEW *)(vd + HW(i)) = SUM_OP(aa, DMUL_OP(nn, mm));           \
1285    }                                                                   \
1286}
1287
1288DO_SQDMLAL(sve2_sqdmlal_zzzw_h, int16_t, int8_t, H1_2, H1,
1289           do_sqdmull_h, DO_SQADD_H)
1290DO_SQDMLAL(sve2_sqdmlal_zzzw_s, int32_t, int16_t, H1_4, H1_2,
1291           do_sqdmull_s, DO_SQADD_S)
1292DO_SQDMLAL(sve2_sqdmlal_zzzw_d, int64_t, int32_t, H1_8, H1_4,
1293           do_sqdmull_d, do_sqadd_d)
1294
1295DO_SQDMLAL(sve2_sqdmlsl_zzzw_h, int16_t, int8_t, H1_2, H1,
1296           do_sqdmull_h, DO_SQSUB_H)
1297DO_SQDMLAL(sve2_sqdmlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2,
1298           do_sqdmull_s, DO_SQSUB_S)
1299DO_SQDMLAL(sve2_sqdmlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4,
1300           do_sqdmull_d, do_sqsub_d)
1301
1302#undef DO_SQDMLAL
1303
1304#define DO_CMLA_FUNC(NAME, TYPE, H, OP) \
1305void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1306{                                                               \
1307    intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE);       \
1308    int rot = simd_data(desc);                                  \
1309    int sel_a = rot & 1, sel_b = sel_a ^ 1;                     \
1310    bool sub_r = rot == 1 || rot == 2;                          \
1311    bool sub_i = rot >= 2;                                      \
1312    TYPE *d = vd, *n = vn, *m = vm, *a = va;                    \
1313    for (i = 0; i < opr_sz; i += 2) {                           \
1314        TYPE elt1_a = n[H(i + sel_a)];                          \
1315        TYPE elt2_a = m[H(i + sel_a)];                          \
1316        TYPE elt2_b = m[H(i + sel_b)];                          \
1317        d[H(i)] = OP(elt1_a, elt2_a, a[H(i)], sub_r);           \
1318        d[H(i + 1)] = OP(elt1_a, elt2_b, a[H(i + 1)], sub_i);   \
1319    }                                                           \
1320}
1321
1322#define DO_CMLA(N, M, A, S) (A + (N * M) * (S ? -1 : 1))
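
/*
 * For reference, the rotation encoding handled by DO_CMLA_FUNC above
 * (derived from sel_a/sel_b, sub_r and sub_i):
 *   rot 0 (  0 deg): d_r += n_r * m_r;  d_i += n_r * m_i
 *   rot 1 ( 90 deg): d_r -= n_i * m_i;  d_i += n_i * m_r
 *   rot 2 (180 deg): d_r -= n_r * m_r;  d_i -= n_r * m_i
 *   rot 3 (270 deg): d_r += n_i * m_i;  d_i -= n_i * m_r
 * A rot-0 operation followed by a rot-90 operation accumulates a full
 * complex multiply-add into the destination.
 */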
1323
1324DO_CMLA_FUNC(sve2_cmla_zzzz_b, uint8_t, H1, DO_CMLA)
1325DO_CMLA_FUNC(sve2_cmla_zzzz_h, uint16_t, H2, DO_CMLA)
1326DO_CMLA_FUNC(sve2_cmla_zzzz_s, uint32_t, H4, DO_CMLA)
1327DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t, H8, DO_CMLA)
1328
1329#define DO_SQRDMLAH_B(N, M, A, S) \
1330    do_sqrdmlah_b(N, M, A, S, true)
1331#define DO_SQRDMLAH_H(N, M, A, S) \
1332    ({ uint32_t discard; do_sqrdmlah_h(N, M, A, S, true, &discard); })
1333#define DO_SQRDMLAH_S(N, M, A, S) \
1334    ({ uint32_t discard; do_sqrdmlah_s(N, M, A, S, true, &discard); })
1335#define DO_SQRDMLAH_D(N, M, A, S) \
1336    do_sqrdmlah_d(N, M, A, S, true)
1337
1338DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_b, int8_t, H1, DO_SQRDMLAH_B)
1339DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_h, int16_t, H2, DO_SQRDMLAH_H)
1340DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_s, int32_t, H4, DO_SQRDMLAH_S)
1341DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d, int64_t, H8, DO_SQRDMLAH_D)
1342
1343#define DO_CMLA_IDX_FUNC(NAME, TYPE, H, OP) \
1344void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)    \
1345{                                                                           \
1346    intptr_t i, j, oprsz = simd_oprsz(desc);                                \
1347    int rot = extract32(desc, SIMD_DATA_SHIFT, 2);                          \
1348    int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2) * 2;                  \
1349    int sel_a = rot & 1, sel_b = sel_a ^ 1;                                 \
1350    bool sub_r = rot == 1 || rot == 2;                                      \
1351    bool sub_i = rot >= 2;                                                  \
1352    TYPE *d = vd, *n = vn, *m = vm, *a = va;                                \
1353    for (i = 0; i < oprsz / sizeof(TYPE); i += 16 / sizeof(TYPE)) {         \
1354        TYPE elt2_a = m[H(i + idx + sel_a)];                                \
1355        TYPE elt2_b = m[H(i + idx + sel_b)];                                \
1356        for (j = 0; j < 16 / sizeof(TYPE); j += 2) {                        \
1357            TYPE elt1_a = n[H(i + j + sel_a)];                              \
1358            d[H2(i + j)] = OP(elt1_a, elt2_a, a[H(i + j)], sub_r);          \
1359            d[H2(i + j + 1)] = OP(elt1_a, elt2_b, a[H(i + j + 1)], sub_i);  \
1360        }                                                                   \
1361    }                                                                       \
1362}
1363
1364DO_CMLA_IDX_FUNC(sve2_cmla_idx_h, int16_t, H2, DO_CMLA)
1365DO_CMLA_IDX_FUNC(sve2_cmla_idx_s, int32_t, H4, DO_CMLA)
1366
1367DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
1368DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
1369
1370#undef DO_CMLA
1371#undef DO_CMLA_FUNC
1372#undef DO_CMLA_IDX_FUNC
1373#undef DO_SQRDMLAH_B
1374#undef DO_SQRDMLAH_H
1375#undef DO_SQRDMLAH_S
1376#undef DO_SQRDMLAH_D
1377
1378/* Note N and M are 4 elements bundled into one unit. */
1379static int32_t do_cdot_s(uint32_t n, uint32_t m, int32_t a,
1380                         int sel_a, int sel_b, int sub_i)
1381{
1382    for (int i = 0; i <= 1; i++) {
1383        int32_t elt1_r = (int8_t)(n >> (16 * i));
1384        int32_t elt1_i = (int8_t)(n >> (16 * i + 8));
1385        int32_t elt2_a = (int8_t)(m >> (16 * i + 8 * sel_a));
1386        int32_t elt2_b = (int8_t)(m >> (16 * i + 8 * sel_b));
1387
1388        a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
1389    }
1390    return a;
1391}
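
/*
 * Layout sketch for do_cdot_s: bits [7:0] of each 32-bit unit hold r0,
 * [15:8] i0, [23:16] r1 and [31:24] i1.  With rot == 0 (sel_a == 0,
 * sub_i == -1) each pair contributes r_n*r_m - i_n*i_m, i.e. the real
 * part of a complex product; the other rotations permute and negate
 * the operands accordingly.  do_cdot_d below is identical with int16
 * components packed into 64-bit units.
 */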
1392
1393static int64_t do_cdot_d(uint64_t n, uint64_t m, int64_t a,
1394                         int sel_a, int sel_b, int sub_i)
1395{
1396    for (int i = 0; i <= 1; i++) {
1397        int64_t elt1_r = (int16_t)(n >> (32 * i + 0));
1398        int64_t elt1_i = (int16_t)(n >> (32 * i + 16));
1399        int64_t elt2_a = (int16_t)(m >> (32 * i + 16 * sel_a));
1400        int64_t elt2_b = (int16_t)(m >> (32 * i + 16 * sel_b));
1401
1402        a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
1403    }
1404    return a;
1405}
1406
1407void HELPER(sve2_cdot_zzzz_s)(void *vd, void *vn, void *vm,
1408                              void *va, uint32_t desc)
1409{
1410    int opr_sz = simd_oprsz(desc);
1411    int rot = simd_data(desc);
1412    int sel_a = rot & 1;
1413    int sel_b = sel_a ^ 1;
1414    int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1415    uint32_t *d = vd, *n = vn, *m = vm, *a = va;
1416
1417    for (int e = 0; e < opr_sz / 4; e++) {
1418        d[e] = do_cdot_s(n[e], m[e], a[e], sel_a, sel_b, sub_i);
1419    }
1420}
1421
1422void HELPER(sve2_cdot_zzzz_d)(void *vd, void *vn, void *vm,
1423                              void *va, uint32_t desc)
1424{
1425    int opr_sz = simd_oprsz(desc);
1426    int rot = simd_data(desc);
1427    int sel_a = rot & 1;
1428    int sel_b = sel_a ^ 1;
1429    int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1430    uint64_t *d = vd, *n = vn, *m = vm, *a = va;
1431
1432    for (int e = 0; e < opr_sz / 8; e++) {
1433        d[e] = do_cdot_d(n[e], m[e], a[e], sel_a, sel_b, sub_i);
1434    }
1435}
1436
1437void HELPER(sve2_cdot_idx_s)(void *vd, void *vn, void *vm,
1438                             void *va, uint32_t desc)
1439{
1440    int opr_sz = simd_oprsz(desc);
1441    int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
1442    int idx = H4(extract32(desc, SIMD_DATA_SHIFT + 2, 2));
1443    int sel_a = rot & 1;
1444    int sel_b = sel_a ^ 1;
1445    int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1446    uint32_t *d = vd, *n = vn, *m = vm, *a = va;
1447
1448    for (int seg = 0; seg < opr_sz / 4; seg += 4) {
1449        uint32_t seg_m = m[seg + idx];
1450        for (int e = 0; e < 4; e++) {
1451            d[seg + e] = do_cdot_s(n[seg + e], seg_m, a[seg + e],
1452                                   sel_a, sel_b, sub_i);
1453        }
1454    }
1455}
1456
1457void HELPER(sve2_cdot_idx_d)(void *vd, void *vn, void *vm,
1458                             void *va, uint32_t desc)
1459{
1460    int seg, opr_sz = simd_oprsz(desc);
1461    int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
1462    int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
1463    int sel_a = rot & 1;
1464    int sel_b = sel_a ^ 1;
1465    int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1466    uint64_t *d = vd, *n = vn, *m = vm, *a = va;
1467
1468    for (seg = 0; seg < opr_sz / 8; seg += 2) {
1469        uint64_t seg_m = m[seg + idx];
1470        for (int e = 0; e < 2; e++) {
1471            d[seg + e] = do_cdot_d(n[seg + e], seg_m, a[seg + e],
1472                                   sel_a, sel_b, sub_i);
1473        }
1474    }
1475}
1476
1477#define DO_ZZXZ(NAME, TYPE, H, OP) \
1478void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1479{                                                                       \
1480    intptr_t oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE);     \
1481    intptr_t i, j, idx = simd_data(desc);                               \
1482    TYPE *d = vd, *a = va, *n = vn, *m = (TYPE *)vm + H(idx);           \
1483    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {               \
1484        TYPE mm = m[i];                                                 \
1485        for (j = 0; j < segment; j++) {                                 \
1486            d[i + j] = OP(n[i + j], mm, a[i + j]);                      \
1487        }                                                               \
1488    }                                                                   \
1489}
1490
1491#define DO_SQRDMLAH_H(N, M, A) \
1492    ({ uint32_t discard; do_sqrdmlah_h(N, M, A, false, true, &discard); })
1493#define DO_SQRDMLAH_S(N, M, A) \
1494    ({ uint32_t discard; do_sqrdmlah_s(N, M, A, false, true, &discard); })
1495#define DO_SQRDMLAH_D(N, M, A) do_sqrdmlah_d(N, M, A, false, true)
1496
1497DO_ZZXZ(sve2_sqrdmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
1498DO_ZZXZ(sve2_sqrdmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
1499DO_ZZXZ(sve2_sqrdmlah_idx_d, int64_t, H8, DO_SQRDMLAH_D)
1500
1501#define DO_SQRDMLSH_H(N, M, A) \
1502    ({ uint32_t discard; do_sqrdmlah_h(N, M, A, true, true, &discard); })
1503#define DO_SQRDMLSH_S(N, M, A) \
1504    ({ uint32_t discard; do_sqrdmlah_s(N, M, A, true, true, &discard); })
1505#define DO_SQRDMLSH_D(N, M, A) do_sqrdmlah_d(N, M, A, true, true)
1506
1507DO_ZZXZ(sve2_sqrdmlsh_idx_h, int16_t, H2, DO_SQRDMLSH_H)
1508DO_ZZXZ(sve2_sqrdmlsh_idx_s, int32_t, H4, DO_SQRDMLSH_S)
1509DO_ZZXZ(sve2_sqrdmlsh_idx_d, int64_t, H8, DO_SQRDMLSH_D)
1510
1511#undef DO_ZZXZ
1512
1513#define DO_ZZXW(NAME, TYPEW, TYPEN, HW, HN, OP) \
1514void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
1515{                                                                         \
1516    intptr_t i, j, oprsz = simd_oprsz(desc);                              \
1517    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);   \
1518    intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
1519    for (i = 0; i < oprsz; i += 16) {                                     \
1520        TYPEW mm = *(TYPEN *)(vm + HN(i + idx));                          \
1521        for (j = 0; j < 16; j += sizeof(TYPEW)) {                         \
1522            TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel));                  \
1523            TYPEW aa = *(TYPEW *)(va + HW(i + j));                        \
1524            *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm, aa);                  \
1525        }                                                                 \
1526    }                                                                     \
1527}
1528
1529#define DO_MLA(N, M, A)  (A + N * M)
1530
1531DO_ZZXW(sve2_smlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLA)
1532DO_ZZXW(sve2_smlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLA)
1533DO_ZZXW(sve2_umlal_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLA)
1534DO_ZZXW(sve2_umlal_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLA)
1535
1536#define DO_MLS(N, M, A)  (A - N * M)
1537
1538DO_ZZXW(sve2_smlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLS)
1539DO_ZZXW(sve2_smlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLS)
1540DO_ZZXW(sve2_umlsl_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLS)
1541DO_ZZXW(sve2_umlsl_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLS)
1542
1543#define DO_SQDMLAL_S(N, M, A)  DO_SQADD_S(A, do_sqdmull_s(N, M))
1544#define DO_SQDMLAL_D(N, M, A)  do_sqadd_d(A, do_sqdmull_d(N, M))
1545
1546DO_ZZXW(sve2_sqdmlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLAL_S)
1547DO_ZZXW(sve2_sqdmlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLAL_D)
1548
1549#define DO_SQDMLSL_S(N, M, A)  DO_SQSUB_S(A, do_sqdmull_s(N, M))
1550#define DO_SQDMLSL_D(N, M, A)  do_sqsub_d(A, do_sqdmull_d(N, M))
1551
1552DO_ZZXW(sve2_sqdmlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLSL_S)
1553DO_ZZXW(sve2_sqdmlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLSL_D)
1554
1555#undef DO_MLA
1556#undef DO_MLS
1557#undef DO_ZZXW
1558
1559#define DO_ZZX(NAME, TYPEW, TYPEN, HW, HN, OP) \
1560void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)            \
1561{                                                                         \
1562    intptr_t i, j, oprsz = simd_oprsz(desc);                              \
1563    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);   \
1564    intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
1565    for (i = 0; i < oprsz; i += 16) {                                     \
1566        TYPEW mm = *(TYPEN *)(vm + HN(i + idx));                          \
1567        for (j = 0; j < 16; j += sizeof(TYPEW)) {                         \
1568            TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel));                  \
1569            *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm);                      \
1570        }                                                                 \
1571    }                                                                     \
1572}
1573
1574DO_ZZX(sve2_sqdmull_idx_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
1575DO_ZZX(sve2_sqdmull_idx_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)
1576
1577DO_ZZX(sve2_smull_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1578DO_ZZX(sve2_smull_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1579
1580DO_ZZX(sve2_umull_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1581DO_ZZX(sve2_umull_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1582
1583#undef DO_ZZX
1584
1585#define DO_BITPERM(NAME, TYPE, OP) \
1586void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1587{                                                              \
1588    intptr_t i, opr_sz = simd_oprsz(desc);                     \
1589    for (i = 0; i < opr_sz; i += sizeof(TYPE)) {               \
1590        TYPE nn = *(TYPE *)(vn + i);                           \
1591        TYPE mm = *(TYPE *)(vm + i);                           \
1592        *(TYPE *)(vd + i) = OP(nn, mm, sizeof(TYPE) * 8);      \
1593    }                                                          \
1594}
1595
1596static uint64_t bitextract(uint64_t data, uint64_t mask, int n)
1597{
1598    uint64_t res = 0;
1599    int db, rb = 0;
1600
1601    for (db = 0; db < n; ++db) {
1602        if ((mask >> db) & 1) {
1603            res |= ((data >> db) & 1) << rb;
1604            ++rb;
1605        }
1606    }
1607    return res;
1608}
1609
1610DO_BITPERM(sve2_bext_b, uint8_t, bitextract)
1611DO_BITPERM(sve2_bext_h, uint16_t, bitextract)
1612DO_BITPERM(sve2_bext_s, uint32_t, bitextract)
1613DO_BITPERM(sve2_bext_d, uint64_t, bitextract)
1614
1615static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n)
1616{
1617    uint64_t res = 0;
1618    int rb, db = 0;
1619
1620    for (rb = 0; rb < n; ++rb) {
1621        if ((mask >> rb) & 1) {
1622            res |= ((data >> db) & 1) << rb;
1623            ++db;
1624        }
1625    }
1626    return res;
1627}
1628
1629DO_BITPERM(sve2_bdep_b, uint8_t, bitdeposit)
1630DO_BITPERM(sve2_bdep_h, uint16_t, bitdeposit)
1631DO_BITPERM(sve2_bdep_s, uint32_t, bitdeposit)
1632DO_BITPERM(sve2_bdep_d, uint64_t, bitdeposit)
1633
1634static uint64_t bitgroup(uint64_t data, uint64_t mask, int n)
1635{
1636    uint64_t resm = 0, resu = 0;
1637    int db, rbm = 0, rbu = 0;
1638
1639    for (db = 0; db < n; ++db) {
1640        uint64_t val = (data >> db) & 1;
1641        if ((mask >> db) & 1) {
1642            resm |= val << rbm++;
1643        } else {
1644            resu |= val << rbu++;
1645        }
1646    }
1647
1648    return resm | (resu << rbm);
1649}
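
/*
 * Worked example for the three permutations above, with n = 8,
 * data = 0xca (0b11001010) and mask = 0x55 (0b01010101):
 *   bitextract: the data bits at mask positions 0,2,4,6 are {0,0,0,1},
 *               packed at the bottom -> 0x08.
 *   bitdeposit: the low data bits {0,1,0,1} are scattered to positions
 *               0,2,4,6 -> 0x44.
 *   bitgroup:   the masked bits {0,0,0,1} are packed at the bottom and
 *               the remaining bits {1,1,0,1} above them -> 0xb8.
 */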
1650
1651DO_BITPERM(sve2_bgrp_b, uint8_t, bitgroup)
1652DO_BITPERM(sve2_bgrp_h, uint16_t, bitgroup)
1653DO_BITPERM(sve2_bgrp_s, uint32_t, bitgroup)
1654DO_BITPERM(sve2_bgrp_d, uint64_t, bitgroup)
1655
1656#undef DO_BITPERM
1657
1658#define DO_CADD(NAME, TYPE, H, ADD_OP, SUB_OP)                  \
1659void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
1660{                                                               \
1661    intptr_t i, opr_sz = simd_oprsz(desc);                      \
1662    int sub_r = simd_data(desc);                                \
1663    if (sub_r) {                                                \
1664        for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {        \
1665            TYPE acc_r = *(TYPE *)(vn + H(i));                  \
1666            TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE)));   \
1667            TYPE el2_r = *(TYPE *)(vm + H(i));                  \
1668            TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE)));   \
1669            acc_r = ADD_OP(acc_r, el2_i);                       \
1670            acc_i = SUB_OP(acc_i, el2_r);                       \
1671            *(TYPE *)(vd + H(i)) = acc_r;                       \
1672            *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i;        \
1673        }                                                       \
1674    } else {                                                    \
1675        for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {        \
1676            TYPE acc_r = *(TYPE *)(vn + H(i));                  \
1677            TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE)));   \
1678            TYPE el2_r = *(TYPE *)(vm + H(i));                  \
1679            TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE)));   \
1680            acc_r = SUB_OP(acc_r, el2_i);                       \
1681            acc_i = ADD_OP(acc_i, el2_r);                       \
1682            *(TYPE *)(vd + H(i)) = acc_r;                       \
1683            *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i;        \
1684        }                                                       \
1685    }                                                           \
1686}
1687
1688DO_CADD(sve2_cadd_b, int8_t, H1, DO_ADD, DO_SUB)
1689DO_CADD(sve2_cadd_h, int16_t, H1_2, DO_ADD, DO_SUB)
1690DO_CADD(sve2_cadd_s, int32_t, H1_4, DO_ADD, DO_SUB)
1691DO_CADD(sve2_cadd_d, int64_t, H1_8, DO_ADD, DO_SUB)
1692
1693DO_CADD(sve2_sqcadd_b, int8_t, H1, DO_SQADD_B, DO_SQSUB_B)
1694DO_CADD(sve2_sqcadd_h, int16_t, H1_2, DO_SQADD_H, DO_SQSUB_H)
1695DO_CADD(sve2_sqcadd_s, int32_t, H1_4, DO_SQADD_S, DO_SQSUB_S)
1696DO_CADD(sve2_sqcadd_d, int64_t, H1_8, do_sqadd_d, do_sqsub_d)
1697
1698#undef DO_CADD
1699
1700#define DO_ZZI_SHLL(NAME, TYPEW, TYPEN, HW, HN) \
1701void HELPER(NAME)(void *vd, void *vn, uint32_t desc)           \
1702{                                                              \
1703    intptr_t i, opr_sz = simd_oprsz(desc);                     \
1704    intptr_t sel = (simd_data(desc) & 1) * sizeof(TYPEN);      \
1705    int shift = simd_data(desc) >> 1;                          \
1706    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {              \
1707        TYPEW nn = *(TYPEN *)(vn + HN(i + sel));               \
1708        *(TYPEW *)(vd + HW(i)) = nn << shift;                  \
1709    }                                                          \
1710}
1711
1712DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1)
1713DO_ZZI_SHLL(sve2_sshll_s, int32_t, int16_t, H1_4, H1_2)
1714DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, H1_8, H1_4)
1715
1716DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1)
1717DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2)
1718DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, H1_8, H1_4)
1719
1720#undef DO_ZZI_SHLL
1721
1722/* Two-operand reduction expander, controlled by a predicate.
1723 * The difference between TYPERED and TYPERET has to do with
1724 * sign-extension.  E.g. for SMAX, TYPERED must be signed,
1725 * but TYPERET must be unsigned so that e.g. a 32-bit value
1726 * is not sign-extended to the ABI uint64_t return type.
1727 */
1728/* ??? If we were to vectorize this by hand the reduction ordering
1729 * would change.  For integer operands, this is perfectly fine.
1730 */
1731#define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
1732uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc)   \
1733{                                                          \
1734    intptr_t i, opr_sz = simd_oprsz(desc);                 \
1735    TYPERED ret = INIT;                                    \
1736    for (i = 0; i < opr_sz; ) {                            \
1737        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));    \
1738        do {                                               \
1739            if (pg & 1) {                                  \
1740                TYPEELT nn = *(TYPEELT *)(vn + H(i));      \
1741                ret = OP(ret, nn);                         \
1742            }                                              \
1743            i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT);  \
1744        } while (i & 15);                                  \
1745    }                                                      \
1746    return (TYPERET)ret;                                   \
1747}
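
/*
 * Example of the TYPERED/TYPERET distinction described above: for
 * sve_smaxv_s a result of -1 must be reduced as int32_t, but is
 * returned as (uint32_t)0xffffffff so that it is not sign-extended
 * into the uint64_t ABI return value.
 */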
1748
1749#define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP)             \
1750uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc)   \
1751{                                                          \
1752    intptr_t i, opr_sz = simd_oprsz(desc) / 8;             \
1753    TYPEE *n = vn;                                         \
1754    uint8_t *pg = vg;                                      \
1755    TYPER ret = INIT;                                      \
1756    for (i = 0; i < opr_sz; i += 1) {                      \
1757        if (pg[H1(i)] & 1) {                               \
1758            TYPEE nn = n[i];                               \
1759            ret = OP(ret, nn);                             \
1760        }                                                  \
1761    }                                                      \
1762    return ret;                                            \
1763}
1764
1765DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
1766DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
1767DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
1768DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)
1769
1770DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
1771DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
1772DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
1773DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)
1774
1775DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
1776DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
1777DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
1778DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)
1779
1780DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
1781DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
1782DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
1783
1784DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
1785DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
1786DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
1787DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)
1788
1789DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
1790DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
1791DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
1792DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)
1793
1794DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
1795DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
1796DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
1797DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)
1798
1799DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
1800DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
1801DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
1802DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)
1803
1804DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
1805DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
1806DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
1807DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
1808
1809#undef DO_VPZ
1810#undef DO_VPZ_D
1811
1812/* Two vector operand, one scalar operand, unpredicated.  */
1813#define DO_ZZI(NAME, TYPE, OP)                                       \
1814void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc)   \
1815{                                                                    \
1816    intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE);            \
1817    TYPE s = s64, *d = vd, *n = vn;                                  \
1818    for (i = 0; i < opr_sz; ++i) {                                   \
1819        d[i] = OP(n[i], s);                                          \
1820    }                                                                \
1821}
1822
1823#define DO_SUBR(X, Y)   (Y - X)
1824
1825DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
1826DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
1827DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
1828DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)
1829
1830DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
1831DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
1832DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
1833DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)
1834
1835DO_ZZI(sve_smini_b, int8_t, DO_MIN)
1836DO_ZZI(sve_smini_h, int16_t, DO_MIN)
1837DO_ZZI(sve_smini_s, int32_t, DO_MIN)
1838DO_ZZI(sve_smini_d, int64_t, DO_MIN)
1839
1840DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
1841DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
1842DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
1843DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)
1844
1845DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
1846DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
1847DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
1848DO_ZZI(sve_umini_d, uint64_t, DO_MIN)
1849
1850#undef DO_ZZI
1851
1852#undef DO_AND
1853#undef DO_ORR
1854#undef DO_EOR
1855#undef DO_BIC
1856#undef DO_ADD
1857#undef DO_SUB
1858#undef DO_MAX
1859#undef DO_MIN
1860#undef DO_ABD
1861#undef DO_MUL
1862#undef DO_DIV
1863#undef DO_ASR
1864#undef DO_LSR
1865#undef DO_LSL
1866#undef DO_SUBR
1867
1868/* Similar to the ARM LastActiveElement pseudocode function, except the
1869   result is multiplied by the element size.  This includes the not found
1870   indication; e.g. not found for esz=3 is -8.  */
1871static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
1872{
1873    uint64_t mask = pred_esz_masks[esz];
1874    intptr_t i = words;
1875
1876    do {
1877        uint64_t this_g = g[--i] & mask;
1878        if (this_g) {
1879            return i * 64 + (63 - clz64(this_g));
1880        }
1881    } while (i > 0);
1882    return (intptr_t)-1 << esz;
1883}
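
/*
 * Example: for esz == 2 (32-bit elements) the mask keeps one predicate
 * bit per 4 bytes, so if the last active element is element 5 its
 * guard bit is bit 20 and the result is 20 == 5 << 2; with no active
 * elements the result is -1 << 2 == -4.
 */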
1884
1885uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc)
1886{
1887    intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
1888    uint32_t flags = PREDTEST_INIT;
1889    uint64_t *d = vd, *g = vg;
1890    intptr_t i = 0;
1891
1892    do {
1893        uint64_t this_d = d[i];
1894        uint64_t this_g = g[i];
1895
1896        if (this_g) {
1897            if (!(flags & 4)) {
1898                /* Set in D the first bit of G.  */
1899                this_d |= this_g & -this_g;
1900                d[i] = this_d;
1901            }
1902            flags = iter_predtest_fwd(this_d, this_g, flags);
1903        }
1904    } while (++i < words);
1905
1906    return flags;
1907}
1908
1909uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
1910{
1911    intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
1912    intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
1913    uint32_t flags = PREDTEST_INIT;
1914    uint64_t *d = vd, *g = vg, esz_mask;
1915    intptr_t i, next;
1916
1917    next = last_active_element(vd, words, esz) + (1 << esz);
1918    esz_mask = pred_esz_masks[esz];
1919
1920    /* Similar to the pseudocode for pnext, but scaled by ESZ
1921       so that we find the correct bit.  */
1922    if (next < words * 64) {
1923        uint64_t mask = -1;
1924
1925        if (next & 63) {
1926            mask = ~((1ull << (next & 63)) - 1);
1927            next &= -64;
1928        }
1929        do {
1930            uint64_t this_g = g[next / 64] & esz_mask & mask;
1931            if (this_g != 0) {
1932                next = (next & -64) + ctz64(this_g);
1933                break;
1934            }
1935            next += 64;
1936            mask = -1;
1937        } while (next < words * 64);
1938    }
1939
1940    i = 0;
1941    do {
1942        uint64_t this_d = 0;
1943        if (i == next / 64) {
1944            this_d = 1ull << (next & 63);
1945        }
1946        d[i] = this_d;
1947        flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
1948    } while (++i < words);
1949
1950    return flags;
1951}
1952
1953/*
1954 * Copy Zn into Zd, and store zero into inactive elements.
1955 * If inv, store zeros into the active elements.
1956 */
1957void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
1958{
1959    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1960    uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1961    uint64_t *d = vd, *n = vn;
1962    uint8_t *pg = vg;
1963
1964    for (i = 0; i < opr_sz; i += 1) {
1965        d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv);
1966    }
1967}
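
/*
 * Example for the byte case above: a predicate byte of 0x05 expands to
 * the mask 0x0000000000ff00ff, so bytes 0 and 2 of n[i] survive and
 * the rest are zeroed; with the invert bit set the mask is
 * complemented, so it is the active bytes that become zero.
 */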
1968
1969void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
1970{
1971    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1972    uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1973    uint64_t *d = vd, *n = vn;
1974    uint8_t *pg = vg;
1975
1976    for (i = 0; i < opr_sz; i += 1) {
1977        d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv);
1978    }
1979}
1980
1981void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
1982{
1983    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1984    uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1985    uint64_t *d = vd, *n = vn;
1986    uint8_t *pg = vg;
1987
1988    for (i = 0; i < opr_sz; i += 1) {
1989        d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv);
1990    }
1991}
1992
1993void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
1994{
1995    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1996    uint64_t *d = vd, *n = vn;
1997    uint8_t *pg = vg;
1998    uint8_t inv = simd_data(desc);
1999
2000    for (i = 0; i < opr_sz; i += 1) {
2001        d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1);
2002    }
2003}
2004
2005/* Three-operand expander, immediate operand, controlled by a predicate.
2006 */
2007#define DO_ZPZI(NAME, TYPE, H, OP)                              \
2008void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
2009{                                                               \
2010    intptr_t i, opr_sz = simd_oprsz(desc);                      \
2011    TYPE imm = simd_data(desc);                                 \
2012    for (i = 0; i < opr_sz; ) {                                 \
2013        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
2014        do {                                                    \
2015            if (pg & 1) {                                       \
2016                TYPE nn = *(TYPE *)(vn + H(i));                 \
2017                *(TYPE *)(vd + H(i)) = OP(nn, imm);             \
2018            }                                                   \
2019            i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
2020        } while (i & 15);                                       \
2021    }                                                           \
2022}
2023
2024/* Similarly, specialized for 64-bit operands.  */
2025#define DO_ZPZI_D(NAME, TYPE, OP)                               \
2026void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
2027{                                                               \
2028    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
2029    TYPE *d = vd, *n = vn;                                      \
2030    TYPE imm = simd_data(desc);                                 \
2031    uint8_t *pg = vg;                                           \
2032    for (i = 0; i < opr_sz; i += 1) {                           \
2033        if (pg[H1(i)] & 1) {                                    \
2034            TYPE nn = n[i];                                     \
2035            d[i] = OP(nn, imm);                                 \
2036        }                                                       \
2037    }                                                           \
2038}
2039
2040#define DO_SHR(N, M)  (N >> M)
2041#define DO_SHL(N, M)  (N << M)
2042
2043/* Arithmetic shift right for division.  This rounds negative numbers
2044   toward zero as per signed division.  Therefore before shifting,
2045   when N is negative, add 2**M-1.  */
2046#define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
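
/*
 * Example: DO_ASRD(-7, 2) computes (-7 + 3) >> 2 == -1, matching the
 * truncating division -7 / 4; a plain arithmetic shift would yield -2.
 */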
2047
2048static inline uint64_t do_urshr(uint64_t x, unsigned sh)
2049{
2050    if (likely(sh < 64)) {
2051        return (x >> sh) + ((x >> (sh - 1)) & 1);
2052    } else if (sh == 64) {
2053        return x >> 63;
2054    } else {
2055        return 0;
2056    }
2057}
2058
2059static inline int64_t do_srshr(int64_t x, unsigned sh)
2060{
2061    if (likely(sh < 64)) {
2062        return (x >> sh) + ((x >> (sh - 1)) & 1);
2063    } else {
2064        /* Rounding the sign bit always produces 0. */
2065        return 0;
2066    }
2067}
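
/*
 * Example: do_urshr(23, 3) == (23 >> 3) + 1 == 3, i.e. 23/8 rounded to
 * nearest with halves rounded up; the added term is just the last bit
 * shifted out.  Likewise do_srshr(-5, 1) == -2.
 */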
2068
2069DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
2070DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
2071DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
2072DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)
2073
2074DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
2075DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
2076DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
2077DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)
2078
2079DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
2080DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
2081DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
2082DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)
2083
2084DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
2085DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
2086DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
2087DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
2088
2089/* SVE2 bitwise shift by immediate */
2090DO_ZPZI(sve2_sqshl_zpzi_b, int8_t, H1, do_sqshl_b)
2091DO_ZPZI(sve2_sqshl_zpzi_h, int16_t, H1_2, do_sqshl_h)
2092DO_ZPZI(sve2_sqshl_zpzi_s, int32_t, H1_4, do_sqshl_s)
2093DO_ZPZI_D(sve2_sqshl_zpzi_d, int64_t, do_sqshl_d)
2094
2095DO_ZPZI(sve2_uqshl_zpzi_b, uint8_t, H1, do_uqshl_b)
2096DO_ZPZI(sve2_uqshl_zpzi_h, uint16_t, H1_2, do_uqshl_h)
2097DO_ZPZI(sve2_uqshl_zpzi_s, uint32_t, H1_4, do_uqshl_s)
2098DO_ZPZI_D(sve2_uqshl_zpzi_d, uint64_t, do_uqshl_d)
2099
2100DO_ZPZI(sve2_srshr_b, int8_t, H1, do_srshr)
2101DO_ZPZI(sve2_srshr_h, int16_t, H1_2, do_srshr)
2102DO_ZPZI(sve2_srshr_s, int32_t, H1_4, do_srshr)
2103DO_ZPZI_D(sve2_srshr_d, int64_t, do_srshr)
2104
2105DO_ZPZI(sve2_urshr_b, uint8_t, H1, do_urshr)
2106DO_ZPZI(sve2_urshr_h, uint16_t, H1_2, do_urshr)
2107DO_ZPZI(sve2_urshr_s, uint32_t, H1_4, do_urshr)
2108DO_ZPZI_D(sve2_urshr_d, uint64_t, do_urshr)
2109
2110#define do_suqrshl_b(n, m) \
2111   ({ uint32_t discard; do_suqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
2112#define do_suqrshl_h(n, m) \
2113   ({ uint32_t discard; do_suqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
2114#define do_suqrshl_s(n, m) \
2115   ({ uint32_t discard; do_suqrshl_bhs(n, m, 32, false, &discard); })
2116#define do_suqrshl_d(n, m) \
2117   ({ uint32_t discard; do_suqrshl_d(n, m, false, &discard); })
2118
2119DO_ZPZI(sve2_sqshlu_b, int8_t, H1, do_suqrshl_b)
2120DO_ZPZI(sve2_sqshlu_h, int16_t, H1_2, do_suqrshl_h)
2121DO_ZPZI(sve2_sqshlu_s, int32_t, H1_4, do_suqrshl_s)
2122DO_ZPZI_D(sve2_sqshlu_d, int64_t, do_suqrshl_d)
2123
2124#undef DO_ASRD
2125#undef DO_ZPZI
2126#undef DO_ZPZI_D
2127
2128#define DO_SHRNB(NAME, TYPEW, TYPEN, OP) \
2129void HELPER(NAME)(void *vd, void *vn, uint32_t desc)         \
2130{                                                            \
2131    intptr_t i, opr_sz = simd_oprsz(desc);                   \
2132    int shift = simd_data(desc);                             \
2133    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {            \
2134        TYPEW nn = *(TYPEW *)(vn + i);                       \
2135        *(TYPEW *)(vd + i) = (TYPEN)OP(nn, shift);           \
2136    }                                                        \
2137}
2138
2139#define DO_SHRNT(NAME, TYPEW, TYPEN, HW, HN, OP)                  \
2140void HELPER(NAME)(void *vd, void *vn, uint32_t desc)              \
2141{                                                                 \
2142    intptr_t i, opr_sz = simd_oprsz(desc);                        \
2143    int shift = simd_data(desc);                                  \
2144    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                 \
2145        TYPEW nn = *(TYPEW *)(vn + HW(i));                        \
2146        *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, shift);   \
2147    }                                                             \
2148}
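
/*
 * In the _b ("bottom") expansions below, the narrowed result is stored
 * as a full zero-extended wide element, so the odd-numbered narrow
 * elements of the destination are cleared; the _t ("top") expansions
 * write only the high half of each wide element, merging with the
 * even-numbered narrow elements already present.
 */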
2149
2150DO_SHRNB(sve2_shrnb_h, uint16_t, uint8_t, DO_SHR)
2151DO_SHRNB(sve2_shrnb_s, uint32_t, uint16_t, DO_SHR)
2152DO_SHRNB(sve2_shrnb_d, uint64_t, uint32_t, DO_SHR)
2153
2154DO_SHRNT(sve2_shrnt_h, uint16_t, uint8_t, H1_2, H1, DO_SHR)
2155DO_SHRNT(sve2_shrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_SHR)
2156DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_SHR)
2157
2158DO_SHRNB(sve2_rshrnb_h, uint16_t, uint8_t, do_urshr)
2159DO_SHRNB(sve2_rshrnb_s, uint32_t, uint16_t, do_urshr)
2160DO_SHRNB(sve2_rshrnb_d, uint64_t, uint32_t, do_urshr)
2161
2162DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr)
2163DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr)
2164DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, H1_8, H1_4, do_urshr)
2165
2166#define DO_SQSHRUN_H(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT8_MAX)
2167#define DO_SQSHRUN_S(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT16_MAX)
2168#define DO_SQSHRUN_D(x, sh) \
2169    do_sat_bhs((int64_t)(x) >> (sh < 64 ? sh : 63), 0, UINT32_MAX)
2170
2171DO_SHRNB(sve2_sqshrunb_h, int16_t, uint8_t, DO_SQSHRUN_H)
2172DO_SHRNB(sve2_sqshrunb_s, int32_t, uint16_t, DO_SQSHRUN_S)
2173DO_SHRNB(sve2_sqshrunb_d, int64_t, uint32_t, DO_SQSHRUN_D)
2174
2175DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H)
2176DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S)
2177DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRUN_D)
2178
2179#define DO_SQRSHRUN_H(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT8_MAX)
2180#define DO_SQRSHRUN_S(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT16_MAX)
2181#define DO_SQRSHRUN_D(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT32_MAX)
2182
2183DO_SHRNB(sve2_sqrshrunb_h, int16_t, uint8_t, DO_SQRSHRUN_H)
2184DO_SHRNB(sve2_sqrshrunb_s, int32_t, uint16_t, DO_SQRSHRUN_S)
2185DO_SHRNB(sve2_sqrshrunb_d, int64_t, uint32_t, DO_SQRSHRUN_D)
2186
2187DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H)
2188DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S)
2189DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRUN_D)
2190
2191#define DO_SQSHRN_H(x, sh) do_sat_bhs(x >> sh, INT8_MIN, INT8_MAX)
2192#define DO_SQSHRN_S(x, sh) do_sat_bhs(x >> sh, INT16_MIN, INT16_MAX)
2193#define DO_SQSHRN_D(x, sh) do_sat_bhs(x >> sh, INT32_MIN, INT32_MAX)
2194
2195DO_SHRNB(sve2_sqshrnb_h, int16_t, uint8_t, DO_SQSHRN_H)
2196DO_SHRNB(sve2_sqshrnb_s, int32_t, uint16_t, DO_SQSHRN_S)
2197DO_SHRNB(sve2_sqshrnb_d, int64_t, uint32_t, DO_SQSHRN_D)
2198
2199DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H)
2200DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S)
2201DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRN_D)
2202
2203#define DO_SQRSHRN_H(x, sh) do_sat_bhs(do_srshr(x, sh), INT8_MIN, INT8_MAX)
2204#define DO_SQRSHRN_S(x, sh) do_sat_bhs(do_srshr(x, sh), INT16_MIN, INT16_MAX)
2205#define DO_SQRSHRN_D(x, sh) do_sat_bhs(do_srshr(x, sh), INT32_MIN, INT32_MAX)
2206
2207DO_SHRNB(sve2_sqrshrnb_h, int16_t, uint8_t, DO_SQRSHRN_H)
2208DO_SHRNB(sve2_sqrshrnb_s, int32_t, uint16_t, DO_SQRSHRN_S)
2209DO_SHRNB(sve2_sqrshrnb_d, int64_t, uint32_t, DO_SQRSHRN_D)
2210
2211DO_SHRNT(sve2_sqrshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRN_H)
2212DO_SHRNT(sve2_sqrshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRN_S)
2213DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRN_D)
2214
2215#define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX)
2216#define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX)
2217#define DO_UQSHRN_D(x, sh) MIN(x >> sh, UINT32_MAX)
2218
2219DO_SHRNB(sve2_uqshrnb_h, uint16_t, uint8_t, DO_UQSHRN_H)
2220DO_SHRNB(sve2_uqshrnb_s, uint32_t, uint16_t, DO_UQSHRN_S)
2221DO_SHRNB(sve2_uqshrnb_d, uint64_t, uint32_t, DO_UQSHRN_D)
2222
2223DO_SHRNT(sve2_uqshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQSHRN_H)
2224DO_SHRNT(sve2_uqshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQSHRN_S)
2225DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQSHRN_D)
2226
2227#define DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX)
2228#define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX)
2229#define DO_UQRSHRN_D(x, sh) MIN(do_urshr(x, sh), UINT32_MAX)
2230
2231DO_SHRNB(sve2_uqrshrnb_h, uint16_t, uint8_t, DO_UQRSHRN_H)
2232DO_SHRNB(sve2_uqrshrnb_s, uint32_t, uint16_t, DO_UQRSHRN_S)
2233DO_SHRNB(sve2_uqrshrnb_d, uint64_t, uint32_t, DO_UQRSHRN_D)
2234
2235DO_SHRNT(sve2_uqrshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQRSHRN_H)
2236DO_SHRNT(sve2_uqrshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQRSHRN_S)
2237DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQRSHRN_D)
2238
2239#undef DO_SHRNB
2240#undef DO_SHRNT
2241
2242#define DO_BINOPNB(NAME, TYPEW, TYPEN, SHIFT, OP)                           \
2243void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)              \
2244{                                                                           \
2245    intptr_t i, opr_sz = simd_oprsz(desc);                                  \
2246    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                           \
2247        TYPEW nn = *(TYPEW *)(vn + i);                                      \
2248        TYPEW mm = *(TYPEW *)(vm + i);                                      \
2249        *(TYPEW *)(vd + i) = (TYPEN)OP(nn, mm, SHIFT);                      \
2250    }                                                                       \
2251}
2252
2253#define DO_BINOPNT(NAME, TYPEW, TYPEN, SHIFT, HW, HN, OP)                   \
2254void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)              \
2255{                                                                           \
2256    intptr_t i, opr_sz = simd_oprsz(desc);                                  \
2257    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                           \
2258        TYPEW nn = *(TYPEW *)(vn + HW(i));                                  \
2259        TYPEW mm = *(TYPEW *)(vm + HW(i));                                  \
2260        *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, mm, SHIFT);         \
2261    }                                                                       \
2262}
2263
2264#define DO_ADDHN(N, M, SH)  ((N + M) >> SH)
2265#define DO_RADDHN(N, M, SH) ((N + M + ((__typeof(N))1 << (SH - 1))) >> SH)
2266#define DO_SUBHN(N, M, SH)  ((N - M) >> SH)
2267#define DO_RSUBHN(N, M, SH) ((N - M + ((__typeof(N))1 << (SH - 1))) >> SH)
2268
2269DO_BINOPNB(sve2_addhnb_h, uint16_t, uint8_t, 8, DO_ADDHN)
2270DO_BINOPNB(sve2_addhnb_s, uint32_t, uint16_t, 16, DO_ADDHN)
2271DO_BINOPNB(sve2_addhnb_d, uint64_t, uint32_t, 32, DO_ADDHN)
2272
2273DO_BINOPNT(sve2_addhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_ADDHN)
2274DO_BINOPNT(sve2_addhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_ADDHN)
2275DO_BINOPNT(sve2_addhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_ADDHN)
2276
2277DO_BINOPNB(sve2_raddhnb_h, uint16_t, uint8_t, 8, DO_RADDHN)
2278DO_BINOPNB(sve2_raddhnb_s, uint32_t, uint16_t, 16, DO_RADDHN)
2279DO_BINOPNB(sve2_raddhnb_d, uint64_t, uint32_t, 32, DO_RADDHN)
2280
2281DO_BINOPNT(sve2_raddhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RADDHN)
2282DO_BINOPNT(sve2_raddhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RADDHN)
2283DO_BINOPNT(sve2_raddhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RADDHN)
2284
2285DO_BINOPNB(sve2_subhnb_h, uint16_t, uint8_t, 8, DO_SUBHN)
2286DO_BINOPNB(sve2_subhnb_s, uint32_t, uint16_t, 16, DO_SUBHN)
2287DO_BINOPNB(sve2_subhnb_d, uint64_t, uint32_t, 32, DO_SUBHN)
2288
2289DO_BINOPNT(sve2_subhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_SUBHN)
2290DO_BINOPNT(sve2_subhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_SUBHN)
2291DO_BINOPNT(sve2_subhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_SUBHN)
2292
2293DO_BINOPNB(sve2_rsubhnb_h, uint16_t, uint8_t, 8, DO_RSUBHN)
2294DO_BINOPNB(sve2_rsubhnb_s, uint32_t, uint16_t, 16, DO_RSUBHN)
2295DO_BINOPNB(sve2_rsubhnb_d, uint64_t, uint32_t, 32, DO_RSUBHN)
2296
2297DO_BINOPNT(sve2_rsubhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RSUBHN)
2298DO_BINOPNT(sve2_rsubhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RSUBHN)
2299DO_BINOPNT(sve2_rsubhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RSUBHN)
2300
2301#undef DO_RSUBHN
2302#undef DO_SUBHN
2303#undef DO_RADDHN
2304#undef DO_ADDHN
2305
2306#undef DO_BINOPNB
2307
2308/* Fully general four-operand expander, controlled by a predicate.
2309 */
2310#define DO_ZPZZZ(NAME, TYPE, H, OP)                           \
2311void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,     \
2312                  void *vg, uint32_t desc)                    \
2313{                                                             \
2314    intptr_t i, opr_sz = simd_oprsz(desc);                    \
2315    for (i = 0; i < opr_sz; ) {                               \
2316        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));       \
2317        do {                                                  \
2318            if (pg & 1) {                                     \
2319                TYPE nn = *(TYPE *)(vn + H(i));               \
2320                TYPE mm = *(TYPE *)(vm + H(i));               \
2321                TYPE aa = *(TYPE *)(va + H(i));               \
2322                *(TYPE *)(vd + H(i)) = OP(aa, nn, mm);        \
2323            }                                                 \
2324            i += sizeof(TYPE), pg >>= sizeof(TYPE);           \
2325        } while (i & 15);                                     \
2326    }                                                         \
2327}
2328
2329/* Similarly, specialized for 64-bit operands.  */
2330#define DO_ZPZZZ_D(NAME, TYPE, OP)                            \
2331void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,     \
2332                  void *vg, uint32_t desc)                    \
2333{                                                             \
2334    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                \
2335    TYPE *d = vd, *a = va, *n = vn, *m = vm;                  \
2336    uint8_t *pg = vg;                                         \
2337    for (i = 0; i < opr_sz; i += 1) {                         \
2338        if (pg[H1(i)] & 1) {                                  \
2339            TYPE aa = a[i], nn = n[i], mm = m[i];             \
2340            d[i] = OP(aa, nn, mm);                            \
2341        }                                                     \
2342    }                                                         \
2343}
2344
2345#define DO_MLA(A, N, M)  (A + N * M)
2346#define DO_MLS(A, N, M)  (A - N * M)
2347
2348DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
2349DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)
2350
2351DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
2352DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)
2353
2354DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
2355DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)
2356
2357DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
2358DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
2359
2360#undef DO_MLA
2361#undef DO_MLS
2362#undef DO_ZPZZZ
2363#undef DO_ZPZZZ_D
2364
2365void HELPER(sve_index_b)(void *vd, uint32_t start,
2366                         uint32_t incr, uint32_t desc)
2367{
2368    intptr_t i, opr_sz = simd_oprsz(desc);
2369    uint8_t *d = vd;
2370    for (i = 0; i < opr_sz; i += 1) {
2371        d[H1(i)] = start + i * incr;
2372    }
2373}
2374
2375void HELPER(sve_index_h)(void *vd, uint32_t start,
2376                         uint32_t incr, uint32_t desc)
2377{
2378    intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2379    uint16_t *d = vd;
2380    for (i = 0; i < opr_sz; i += 1) {
2381        d[H2(i)] = start + i * incr;
2382    }
2383}
2384
2385void HELPER(sve_index_s)(void *vd, uint32_t start,
2386                         uint32_t incr, uint32_t desc)
2387{
2388    intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2389    uint32_t *d = vd;
2390    for (i = 0; i < opr_sz; i += 1) {
2391        d[H4(i)] = start + i * incr;
2392    }
2393}
2394
2395void HELPER(sve_index_d)(void *vd, uint64_t start,
2396                         uint64_t incr, uint32_t desc)
2397{
2398    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2399    uint64_t *d = vd;
2400    for (i = 0; i < opr_sz; i += 1) {
2401        d[i] = start + i * incr;
2402    }
2403}
2404
2405void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
2406{
2407    intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2408    uint32_t sh = simd_data(desc);
2409    uint32_t *d = vd, *n = vn, *m = vm;
2410    for (i = 0; i < opr_sz; i += 1) {
2411        d[i] = n[i] + (m[i] << sh);
2412    }
2413}
2414
2415void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
2416{
2417    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2418    uint64_t sh = simd_data(desc);
2419    uint64_t *d = vd, *n = vn, *m = vm;
2420    for (i = 0; i < opr_sz; i += 1) {
2421        d[i] = n[i] + (m[i] << sh);
2422    }
2423}
2424
2425void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
2426{
2427    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2428    uint64_t sh = simd_data(desc);
2429    uint64_t *d = vd, *n = vn, *m = vm;
2430    for (i = 0; i < opr_sz; i += 1) {
2431        d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
2432    }
2433}
2434
2435void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
2436{
2437    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2438    uint64_t sh = simd_data(desc);
2439    uint64_t *d = vd, *n = vn, *m = vm;
2440    for (i = 0; i < opr_sz; i += 1) {
2441        d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
2442    }
2443}
2444
2445void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
2446{
2447    /* These constants are cut-and-paste directly from the ARM pseudocode.  */
2448    static const uint16_t coeff[] = {
2449        0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
2450        0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
2451        0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
2452        0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
2453    };
2454    intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2455    uint16_t *d = vd, *n = vn;
2456
2457    for (i = 0; i < opr_sz; i++) {
2458        uint16_t nn = n[i];
2459        intptr_t idx = extract32(nn, 0, 5);
2460        uint16_t exp = extract32(nn, 5, 5);
2461        d[i] = coeff[idx] | (exp << 10);
2462    }
2463}
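
/*
 * Example: an input of 0x0023 selects idx = 3 and exp = 1, producing
 * 0x0445, i.e. a float16 with exponent field 1 and fraction coeff[3];
 * the table entries are (2^(idx/32) - 1) scaled to the 10-bit
 * fraction, as in the FEXPA pseudocode.
 */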
2464
2465void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
2466{
2467    /* These constants are cut-and-paste directly from the ARM pseudocode.  */
2468    static const uint32_t coeff[] = {
2469        0x000000, 0x0164d2, 0x02cd87, 0x043a29,
2470        0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
2471        0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
2472        0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
2473        0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
2474        0x1ef532, 0x20b051, 0x227043, 0x243516,
2475        0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
2476        0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
2477        0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
2478        0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
2479        0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
2480        0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
2481        0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
2482        0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
2483        0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
2484        0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
2485    };
2486    intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2487    uint32_t *d = vd, *n = vn;
2488
2489    for (i = 0; i < opr_sz; i++) {
2490        uint32_t nn = n[i];
2491        intptr_t idx = extract32(nn, 0, 6);
2492        uint32_t exp = extract32(nn, 6, 8);
2493        d[i] = coeff[idx] | (exp << 23);
2494    }
2495}
2496
2497void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
2498{
2499    /* These constants are cut-and-paste directly from the ARM pseudocode.  */
2500    static const uint64_t coeff[] = {
2501        0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
2502        0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
2503        0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
2504        0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
2505        0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
2506        0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
2507        0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
2508        0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
2509        0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
2510        0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
2511        0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
2512        0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
2513        0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
2514        0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
2515        0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
2516        0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
2517        0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
2518        0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
2519        0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
2520        0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
2521        0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
2522        0xFA7C1819E90D8ull,
2523    };
2524    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2525    uint64_t *d = vd, *n = vn;
2526
2527    for (i = 0; i < opr_sz; i++) {
2528        uint64_t nn = n[i];
2529        intptr_t idx = extract32(nn, 0, 6);
2530        uint64_t exp = extract32(nn, 6, 11);
2531        d[i] = coeff[idx] | (exp << 52);
2532    }
2533}
2534
2535void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
2536{
2537    intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2538    uint16_t *d = vd, *n = vn, *m = vm;
2539    for (i = 0; i < opr_sz; i += 1) {
2540        uint16_t nn = n[i];
2541        uint16_t mm = m[i];
2542        if (mm & 1) {
2543            nn = float16_one;
2544        }
2545        d[i] = nn ^ (mm & 2) << 14;
2546    }
2547}
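
/*
 * For the selection above, the low two bits of mm choose between nn,
 * 1.0, -nn and -1.0: bit 0 substitutes the constant one, bit 1 flips
 * the sign bit.  The 32-bit and 64-bit versions below do the same with
 * the sign bit at positions 31 and 63.
 */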
2548
2549void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
2550{
2551    intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2552    uint32_t *d = vd, *n = vn, *m = vm;
2553    for (i = 0; i < opr_sz; i += 1) {
2554        uint32_t nn = n[i];
2555        uint32_t mm = m[i];
2556        if (mm & 1) {
2557            nn = float32_one;
2558        }
2559        d[i] = nn ^ (mm & 2) << 30;
2560    }
2561}
2562
2563void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
2564{
2565    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2566    uint64_t *d = vd, *n = vn, *m = vm;
2567    for (i = 0; i < opr_sz; i += 1) {
2568        uint64_t nn = n[i];
2569        uint64_t mm = m[i];
2570        if (mm & 1) {
2571            nn = float64_one;
2572        }
2573        d[i] = nn ^ (mm & 2) << 62;
2574    }
2575}
2576
2577/*
2578 * Signed saturating addition with scalar operand.
2579 */
2580
2581void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2582{
2583    intptr_t i, oprsz = simd_oprsz(desc);
2584
2585    for (i = 0; i < oprsz; i += sizeof(int8_t)) {
2586        *(int8_t *)(d + i) = DO_SQADD_B(b, *(int8_t *)(a + i));
2587    }
2588}
2589
2590void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2591{
2592    intptr_t i, oprsz = simd_oprsz(desc);
2593
2594    for (i = 0; i < oprsz; i += sizeof(int16_t)) {
2595        *(int16_t *)(d + i) = DO_SQADD_H(b, *(int16_t *)(a + i));
2596    }
2597}
2598
2599void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2600{
2601    intptr_t i, oprsz = simd_oprsz(desc);
2602
2603    for (i = 0; i < oprsz; i += sizeof(int32_t)) {
2604        *(int32_t *)(d + i) = DO_SQADD_S(b, *(int32_t *)(a + i));
2605    }
2606}
2607
2608void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
2609{
2610    intptr_t i, oprsz = simd_oprsz(desc);
2611
2612    for (i = 0; i < oprsz; i += sizeof(int64_t)) {
2613        *(int64_t *)(d + i) = do_sqadd_d(b, *(int64_t *)(a + i));
2614    }
2615}
2616
2617/*
2618 * Unsigned saturating addition with scalar operand.
2619 */
2620
2621void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2622{
2623    intptr_t i, oprsz = simd_oprsz(desc);
2624
2625    for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
2626        *(uint8_t *)(d + i) = DO_UQADD_B(b, *(uint8_t *)(a + i));
2627    }
2628}
2629
2630void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2631{
2632    intptr_t i, oprsz = simd_oprsz(desc);
2633
2634    for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
2635        *(uint16_t *)(d + i) = DO_UQADD_H(b, *(uint16_t *)(a + i));
2636    }
2637}
2638
2639void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2640{
2641    intptr_t i, oprsz = simd_oprsz(desc);
2642
2643    for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
2644        *(uint32_t *)(d + i) = DO_UQADD_S(b, *(uint32_t *)(a + i));
2645    }
2646}
2647
2648void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2649{
2650    intptr_t i, oprsz = simd_oprsz(desc);
2651
2652    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
2653        *(uint64_t *)(d + i) = do_uqadd_d(b, *(uint64_t *)(a + i));
2654    }
2655}
2656
2657void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2658{
2659    intptr_t i, oprsz = simd_oprsz(desc);
2660
2661    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
2662        *(uint64_t *)(d + i) = do_uqsub_d(*(uint64_t *)(a + i), b);
2663    }
2664}
2665
2666/* Two operand predicated copy immediate with merge.  All valid immediates
2667 * can fit within 17 signed bits in the simd_data field.
2668 */
2669void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
2670                         uint64_t mm, uint32_t desc)
2671{
2672    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2673    uint64_t *d = vd, *n = vn;
2674    uint8_t *pg = vg;
2675
2676    mm = dup_const(MO_8, mm);
2677    for (i = 0; i < opr_sz; i += 1) {
2678        uint64_t nn = n[i];
2679        uint64_t pp = expand_pred_b(pg[H1(i)]);
2680        d[i] = (mm & pp) | (nn & ~pp);
2681    }
2682}
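
/*
 * Sketch of the predication scheme used above (added note; expand_pred_b
 * and friends live in vec_internal.h): each byte of VG holds up to eight
 * predicate bits, and expand_pred_b() widens them to a per-byte lane mask.
 * For instance the pg byte 0b00000101 expands to 0x0000000000ff00ff, so
 * the merge (mm & pp) | (nn & ~pp) takes MM for byte lanes 0 and 2 and
 * keeps NN everywhere else.
 */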
2683
2684void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
2685                         uint64_t mm, uint32_t desc)
2686{
2687    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2688    uint64_t *d = vd, *n = vn;
2689    uint8_t *pg = vg;
2690
2691    mm = dup_const(MO_16, mm);
2692    for (i = 0; i < opr_sz; i += 1) {
2693        uint64_t nn = n[i];
2694        uint64_t pp = expand_pred_h(pg[H1(i)]);
2695        d[i] = (mm & pp) | (nn & ~pp);
2696    }
2697}
2698
2699void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
2700                         uint64_t mm, uint32_t desc)
2701{
2702    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2703    uint64_t *d = vd, *n = vn;
2704    uint8_t *pg = vg;
2705
2706    mm = dup_const(MO_32, mm);
2707    for (i = 0; i < opr_sz; i += 1) {
2708        uint64_t nn = n[i];
2709        uint64_t pp = expand_pred_s(pg[H1(i)]);
2710        d[i] = (mm & pp) | (nn & ~pp);
2711    }
2712}
2713
2714void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
2715                         uint64_t mm, uint32_t desc)
2716{
2717    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2718    uint64_t *d = vd, *n = vn;
2719    uint8_t *pg = vg;
2720
2721    for (i = 0; i < opr_sz; i += 1) {
2722        uint64_t nn = n[i];
2723        d[i] = (pg[H1(i)] & 1 ? mm : nn);
2724    }
2725}
2726
2727void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
2728{
2729    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2730    uint64_t *d = vd;
2731    uint8_t *pg = vg;
2732
2733    val = dup_const(MO_8, val);
2734    for (i = 0; i < opr_sz; i += 1) {
2735        d[i] = val & expand_pred_b(pg[H1(i)]);
2736    }
2737}
2738
2739void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
2740{
2741    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2742    uint64_t *d = vd;
2743    uint8_t *pg = vg;
2744
2745    val = dup_const(MO_16, val);
2746    for (i = 0; i < opr_sz; i += 1) {
2747        d[i] = val & expand_pred_h(pg[H1(i)]);
2748    }
2749}
2750
2751void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
2752{
2753    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2754    uint64_t *d = vd;
2755    uint8_t *pg = vg;
2756
2757    val = dup_const(MO_32, val);
2758    for (i = 0; i < opr_sz; i += 1) {
2759        d[i] = val & expand_pred_s(pg[H1(i)]);
2760    }
2761}
2762
2763void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
2764{
2765    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2766    uint64_t *d = vd;
2767    uint8_t *pg = vg;
2768
2769    for (i = 0; i < opr_sz; i += 1) {
2770        d[i] = (pg[H1(i)] & 1 ? val : 0);
2771    }
2772}
2773
2774/* Big-endian hosts need to frob the byte indices.  If the copy
2775 * happens to be 8-byte aligned, then no frobbing is necessary.
2776 */
2777static void swap_memmove(void *vd, void *vs, size_t n)
2778{
2779    uintptr_t d = (uintptr_t)vd;
2780    uintptr_t s = (uintptr_t)vs;
2781    uintptr_t o = (d | s | n) & 7;
2782    size_t i;
2783
2784#if !HOST_BIG_ENDIAN
2785    o = 0;
2786#endif
2787    switch (o) {
2788    case 0:
2789        memmove(vd, vs, n);
2790        break;
2791
2792    case 4:
2793        if (d < s || d >= s + n) {
2794            for (i = 0; i < n; i += 4) {
2795                *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
2796            }
2797        } else {
2798            for (i = n; i > 0; ) {
2799                i -= 4;
2800                *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
2801            }
2802        }
2803        break;
2804
2805    case 2:
2806    case 6:
2807        if (d < s || d >= s + n) {
2808            for (i = 0; i < n; i += 2) {
2809                *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
2810            }
2811        } else {
2812            for (i = n; i > 0; ) {
2813                i -= 2;
2814                *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
2815            }
2816        }
2817        break;
2818
2819    default:
2820        if (d < s || d >= s + n) {
2821            for (i = 0; i < n; i++) {
2822                *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
2823            }
2824        } else {
2825            for (i = n; i > 0; ) {
2826                i -= 1;
2827                *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
2828            }
2829        }
2830        break;
2831    }
2832}
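
/*
 * Added note, assuming the usual definitions of the H macros (on
 * big-endian hosts H1(x) is x ^ 7, H1_2(x) is x ^ 6, H1_4(x) is x ^ 4;
 * all are identity on little-endian hosts): the switch above picks the
 * widest unit to which both pointers and the length are aligned, so each
 * logical element still lands in the host byte that represents it within
 * its 64-bit storage group.
 */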
2833
2834/* Similarly for memset of 0.  */
2835static void swap_memzero(void *vd, size_t n)
2836{
2837    uintptr_t d = (uintptr_t)vd;
2838    uintptr_t o = (d | n) & 7;
2839    size_t i;
2840
2841    /* Usually, the first bit of a predicate is set, so N is 0.  */
2842    if (likely(n == 0)) {
2843        return;
2844    }
2845
2846#if !HOST_BIG_ENDIAN
2847    o = 0;
2848#endif
2849    switch (o) {
2850    case 0:
2851        memset(vd, 0, n);
2852        break;
2853
2854    case 4:
2855        for (i = 0; i < n; i += 4) {
2856            *(uint32_t *)H1_4(d + i) = 0;
2857        }
2858        break;
2859
2860    case 2:
2861    case 6:
2862        for (i = 0; i < n; i += 2) {
2863            *(uint16_t *)H1_2(d + i) = 0;
2864        }
2865        break;
2866
2867    default:
2868        for (i = 0; i < n; i++) {
2869            *(uint8_t *)H1(d + i) = 0;
2870        }
2871        break;
2872    }
2873}
2874
2875void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
2876{
2877    intptr_t opr_sz = simd_oprsz(desc);
2878    size_t n_ofs = simd_data(desc);
2879    size_t n_siz = opr_sz - n_ofs;
2880
2881    if (vd != vm) {
2882        swap_memmove(vd, vn + n_ofs, n_siz);
2883        swap_memmove(vd + n_siz, vm, n_ofs);
2884    } else if (vd != vn) {
2885        swap_memmove(vd + n_siz, vd, n_ofs);
2886        swap_memmove(vd, vn + n_ofs, n_siz);
2887    } else {
2888        /* vd == vn == vm.  Need temp space.  */
2889        ARMVectorReg tmp;
2890        swap_memmove(&tmp, vm, n_ofs);
2891        swap_memmove(vd, vd + n_ofs, n_siz);
2892        memcpy(vd + n_siz, &tmp, n_ofs);
2893    }
2894}
2895
2896#define DO_INSR(NAME, TYPE, H) \
2897void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
2898{                                                                  \
2899    intptr_t opr_sz = simd_oprsz(desc);                            \
2900    swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE));    \
2901    *(TYPE *)(vd + H(0)) = val;                                    \
2902}
2903
2904DO_INSR(sve_insr_b, uint8_t, H1)
2905DO_INSR(sve_insr_h, uint16_t, H1_2)
2906DO_INSR(sve_insr_s, uint32_t, H1_4)
2907DO_INSR(sve_insr_d, uint64_t, H1_8)
2908
2909#undef DO_INSR
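
/*
 * Usage sketch (added): INSR shifts the whole vector up by one element
 * and writes the scalar into element 0.  With a 128-bit vector of
 * uint32_t holding {1, 2, 3, 4} and val = 9, sve_insr_s produces
 * {9, 1, 2, 3}; the old top element is dropped.
 */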
2910
2911void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
2912{
2913    intptr_t i, j, opr_sz = simd_oprsz(desc);
2914    for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2915        uint64_t f = *(uint64_t *)(vn + i);
2916        uint64_t b = *(uint64_t *)(vn + j);
2917        *(uint64_t *)(vd + i) = bswap64(b);
2918        *(uint64_t *)(vd + j) = bswap64(f);
2919    }
2920}
2921
2922void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
2923{
2924    intptr_t i, j, opr_sz = simd_oprsz(desc);
2925    for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2926        uint64_t f = *(uint64_t *)(vn + i);
2927        uint64_t b = *(uint64_t *)(vn + j);
2928        *(uint64_t *)(vd + i) = hswap64(b);
2929        *(uint64_t *)(vd + j) = hswap64(f);
2930    }
2931}
2932
2933void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
2934{
2935    intptr_t i, j, opr_sz = simd_oprsz(desc);
2936    for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2937        uint64_t f = *(uint64_t *)(vn + i);
2938        uint64_t b = *(uint64_t *)(vn + j);
2939        *(uint64_t *)(vd + i) = rol64(b, 32);
2940        *(uint64_t *)(vd + j) = rol64(f, 32);
2941    }
2942}
2943
2944void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
2945{
2946    intptr_t i, j, opr_sz = simd_oprsz(desc);
2947    for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2948        uint64_t f = *(uint64_t *)(vn + i);
2949        uint64_t b = *(uint64_t *)(vn + j);
2950        *(uint64_t *)(vd + i) = b;
2951        *(uint64_t *)(vd + j) = f;
2952    }
2953}
2954
2955typedef void tb_impl_fn(void *, void *, void *, void *, uintptr_t, bool);
2956
2957static inline void do_tbl1(void *vd, void *vn, void *vm, uint32_t desc,
2958                           bool is_tbx, tb_impl_fn *fn)
2959{
2960    ARMVectorReg scratch;
2961    uintptr_t oprsz = simd_oprsz(desc);
2962
2963    if (unlikely(vd == vn)) {
2964        vn = memcpy(&scratch, vn, oprsz);
2965    }
2966
2967    fn(vd, vn, NULL, vm, oprsz, is_tbx);
2968}
2969
2970static inline void do_tbl2(void *vd, void *vn0, void *vn1, void *vm,
2971                           uint32_t desc, bool is_tbx, tb_impl_fn *fn)
2972{
2973    ARMVectorReg scratch;
2974    uintptr_t oprsz = simd_oprsz(desc);
2975
2976    if (unlikely(vd == vn0)) {
2977        vn0 = memcpy(&scratch, vn0, oprsz);
2978        if (vd == vn1) {
2979            vn1 = vn0;
2980        }
2981    } else if (unlikely(vd == vn1)) {
2982        vn1 = memcpy(&scratch, vn1, oprsz);
2983    }
2984
2985    fn(vd, vn0, vn1, vm, oprsz, is_tbx);
2986}
2987
2988#define DO_TB(SUFF, TYPE, H)                                            \
2989static inline void do_tb_##SUFF(void *vd, void *vt0, void *vt1,         \
2990                                void *vm, uintptr_t oprsz, bool is_tbx) \
2991{                                                                       \
2992    TYPE *d = vd, *tbl0 = vt0, *tbl1 = vt1, *indexes = vm;              \
2993    uintptr_t i, nelem = oprsz / sizeof(TYPE);                          \
2994    for (i = 0; i < nelem; ++i) {                                       \
2995        TYPE index = indexes[H1(i)], val = 0;                           \
2996        if (index < nelem) {                                            \
2997            val = tbl0[H(index)];                                       \
2998        } else {                                                        \
2999            index -= nelem;                                             \
3000            if (tbl1 && index < nelem) {                                \
3001                val = tbl1[H(index)];                                   \
3002            } else if (is_tbx) {                                        \
3003                continue;                                               \
3004            }                                                           \
3005        }                                                               \
3006        d[H(i)] = val;                                                  \
3007    }                                                                   \
3008}                                                                       \
3009void HELPER(sve_tbl_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
3010{                                                                       \
3011    do_tbl1(vd, vn, vm, desc, false, do_tb_##SUFF);                     \
3012}                                                                       \
3013void HELPER(sve2_tbl_##SUFF)(void *vd, void *vn0, void *vn1,            \
3014                             void *vm, uint32_t desc)                   \
3015{                                                                       \
3016    do_tbl2(vd, vn0, vn1, vm, desc, false, do_tb_##SUFF);               \
3017}                                                                       \
3018void HELPER(sve2_tbx_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
3019{                                                                       \
3020    do_tbl1(vd, vn, vm, desc, true, do_tb_##SUFF);                      \
3021}
3022
3023DO_TB(b, uint8_t, H1)
3024DO_TB(h, uint16_t, H2)
3025DO_TB(s, uint32_t, H4)
3026DO_TB(d, uint64_t, H8)
3027
3028#undef DO_TB
3029
3030#define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
3031void HELPER(NAME)(void *vd, void *vn, uint32_t desc)           \
3032{                                                              \
3033    intptr_t i, opr_sz = simd_oprsz(desc);                     \
3034    TYPED *d = vd;                                             \
3035    TYPES *n = vn;                                             \
3036    ARMVectorReg tmp;                                          \
3037    if (unlikely(vn - vd < opr_sz)) {                          \
3038        n = memcpy(&tmp, n, opr_sz / 2);                       \
3039    }                                                          \
3040    for (i = 0; i < opr_sz / sizeof(TYPED); i++) {             \
3041        d[HD(i)] = n[HS(i)];                                   \
3042    }                                                          \
3043}
3044
3045DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
3046DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
3047DO_UNPK(sve_sunpk_d, int64_t, int32_t, H8, H4)
3048
3049DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
3050DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
3051DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, H8, H4)
3052
3053#undef DO_UNPK
3054
3055/* Mask of bits included in the even numbered predicates of width esz.
3056 * We also use this for expand_bits/compress_bits, and so extend the
3057 * same pattern out to 16-bit units.
3058 */
3059static const uint64_t even_bit_esz_masks[5] = {
3060    0x5555555555555555ull,
3061    0x3333333333333333ull,
3062    0x0f0f0f0f0f0f0f0full,
3063    0x00ff00ff00ff00ffull,
3064    0x0000ffff0000ffffull,
3065};
3066
3067/* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
3068 * For N==0, this corresponds to the operation that in qemu/bitops.h
3069 * we call half_shuffle64; this algorithm is from Hacker's Delight,
3070 * section 7-2 Shuffling Bits.
3071 */
3072static uint64_t expand_bits(uint64_t x, int n)
3073{
3074    int i;
3075
3076    x &= 0xffffffffu;
3077    for (i = 4; i >= n; i--) {
3078        int sh = 1 << i;
3079        x = ((x << sh) | x) & even_bit_esz_masks[i];
3080    }
3081    return x;
3082}
3083
3084/* Compress units of 2**(N+1) bits to units of 2**N bits.
3085 * For N==0, this corresponds to the operation that in qemu/bitops.h
3086 * we call half_unshuffle64; this algorithm is from Hacker's Delight,
3087 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
3088 */
3089static uint64_t compress_bits(uint64_t x, int n)
3090{
3091    int i;
3092
3093    for (i = n; i <= 4; i++) {
3094        int sh = 1 << i;
3095        x &= even_bit_esz_masks[i];
3096        x = (x >> sh) | x;
3097    }
3098    return x & 0xffffffffu;
3099}
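
/*
 * Worked example (added): with n == 0, expand_bits() spreads each input
 * bit to every other output bit, and compress_bits() undoes it:
 *
 *   expand_bits(0b1011, 0)  == 0b01000101  (0x45)
 *   compress_bits(0x45, 0)  == 0b1011
 *
 * Larger n stops the shuffle earlier, operating on 2**n-bit units instead
 * of single bits.
 */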
3100
3101void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3102{
3103    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3104    int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3105    intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
3106    int esize = 1 << esz;
3107    uint64_t *d = vd;
3108    intptr_t i;
3109
3110    if (oprsz <= 8) {
3111        uint64_t nn = *(uint64_t *)vn;
3112        uint64_t mm = *(uint64_t *)vm;
3113        int half = 4 * oprsz;
3114
3115        nn = extract64(nn, high * half, half);
3116        mm = extract64(mm, high * half, half);
3117        nn = expand_bits(nn, esz);
3118        mm = expand_bits(mm, esz);
3119        d[0] = nn | (mm << esize);
3120    } else {
3121        ARMPredicateReg tmp;
3122
3123        /* We produce output faster than we consume input.
3124           Therefore we must be mindful of possible overlap.  */
3125        if (vd == vn) {
3126            vn = memcpy(&tmp, vn, oprsz);
3127            if (vd == vm) {
3128                vm = vn;
3129            }
3130        } else if (vd == vm) {
3131            vm = memcpy(&tmp, vm, oprsz);
3132        }
3133        if (high) {
3134            high = oprsz >> 1;
3135        }
3136
3137        if ((oprsz & 7) == 0) {
3138            uint32_t *n = vn, *m = vm;
3139            high >>= 2;
3140
3141            for (i = 0; i < oprsz / 8; i++) {
3142                uint64_t nn = n[H4(high + i)];
3143                uint64_t mm = m[H4(high + i)];
3144
3145                nn = expand_bits(nn, esz);
3146                mm = expand_bits(mm, esz);
3147                d[i] = nn | (mm << esize);
3148            }
3149        } else {
3150            uint8_t *n = vn, *m = vm;
3151            uint16_t *d16 = vd;
3152
3153            for (i = 0; i < oprsz / 2; i++) {
3154                uint16_t nn = n[H1(high + i)];
3155                uint16_t mm = m[H1(high + i)];
3156
3157                nn = expand_bits(nn, esz);
3158                mm = expand_bits(mm, esz);
3159                d16[H2(i)] = nn | (mm << esize);
3160            }
3161        }
3162    }
3163}
3164
3165void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3166{
3167    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3168    int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3169    int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz;
3170    uint64_t *d = vd, *n = vn, *m = vm;
3171    uint64_t l, h;
3172    intptr_t i;
3173
3174    if (oprsz <= 8) {
3175        l = compress_bits(n[0] >> odd, esz);
3176        h = compress_bits(m[0] >> odd, esz);
3177        d[0] = l | (h << (4 * oprsz));
3178    } else {
3179        ARMPredicateReg tmp_m;
3180        intptr_t oprsz_16 = oprsz / 16;
3181
3182        if ((vm - vd) < (uintptr_t)oprsz) {
3183            m = memcpy(&tmp_m, vm, oprsz);
3184        }
3185
3186        for (i = 0; i < oprsz_16; i++) {
3187            l = n[2 * i + 0];
3188            h = n[2 * i + 1];
3189            l = compress_bits(l >> odd, esz);
3190            h = compress_bits(h >> odd, esz);
3191            d[i] = l | (h << 32);
3192        }
3193
3194        /*
3195         * For VL which is not a multiple of 512, the results from M do not
3196         * align nicely with the uint64_t for D.  Put the aligned results
3197         * from M into TMP_M and then copy it into place afterward.
3198         */
3199        if (oprsz & 15) {
3200            int final_shift = (oprsz & 15) * 2;
3201
3202            l = n[2 * i + 0];
3203            h = n[2 * i + 1];
3204            l = compress_bits(l >> odd, esz);
3205            h = compress_bits(h >> odd, esz);
3206            d[i] = l | (h << final_shift);
3207
3208            for (i = 0; i < oprsz_16; i++) {
3209                l = m[2 * i + 0];
3210                h = m[2 * i + 1];
3211                l = compress_bits(l >> odd, esz);
3212                h = compress_bits(h >> odd, esz);
3213                tmp_m.p[i] = l | (h << 32);
3214            }
3215            l = m[2 * i + 0];
3216            h = m[2 * i + 1];
3217            l = compress_bits(l >> odd, esz);
3218            h = compress_bits(h >> odd, esz);
3219            tmp_m.p[i] = l | (h << final_shift);
3220
3221            swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
3222        } else {
3223            for (i = 0; i < oprsz_16; i++) {
3224                l = m[2 * i + 0];
3225                h = m[2 * i + 1];
3226                l = compress_bits(l >> odd, esz);
3227                h = compress_bits(h >> odd, esz);
3228                d[oprsz_16 + i] = l | (h << 32);
3229            }
3230        }
3231    }
3232}
3233
3234void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3235{
3236    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3237    int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3238    int odd = FIELD_EX32(pred_desc, PREDDESC, DATA);
3239    uint64_t *d = vd, *n = vn, *m = vm;
3240    uint64_t mask;
3241    int shr, shl;
3242    intptr_t i;
3243
3244    shl = 1 << esz;
3245    shr = 0;
3246    mask = even_bit_esz_masks[esz];
3247    if (odd) {
3248        mask <<= shl;
3249        shr = shl;
3250        shl = 0;
3251    }
3252
3253    for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
3254        uint64_t nn = (n[i] & mask) >> shr;
3255        uint64_t mm = (m[i] & mask) << shl;
3256        d[i] = nn + mm;
3257    }
3258}
3259
3260/* Reverse units of 2**N bits.  */
3261static uint64_t reverse_bits_64(uint64_t x, int n)
3262{
3263    int i, sh;
3264
3265    x = bswap64(x);
3266    for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
3267        uint64_t mask = even_bit_esz_masks[i];
3268        x = ((x & mask) << sh) | ((x >> sh) & mask);
3269    }
3270    return x;
3271}
3272
3273static uint8_t reverse_bits_8(uint8_t x, int n)
3274{
3275    static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
3276    int i, sh;
3277
3278    for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
3279        x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
3280    }
3281    return x;
3282}
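
/*
 * Added example: reverse_bits_64(x, n) reverses x in units of 2**n bits,
 * so reverse_bits_64(0x0102030405060708, 3) == 0x0807060504030201 (plain
 * byte reversal), while n == 0 reverses the individual bits.
 */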
3283
3284void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
3285{
3286    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3287    int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3288    intptr_t i, oprsz_2 = oprsz / 2;
3289
3290    if (oprsz <= 8) {
3291        uint64_t l = *(uint64_t *)vn;
3292        l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
3293        *(uint64_t *)vd = l;
3294    } else if ((oprsz & 15) == 0) {
3295        for (i = 0; i < oprsz_2; i += 8) {
3296            intptr_t ih = oprsz - 8 - i;
3297            uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
3298            uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
3299            *(uint64_t *)(vd + i) = h;
3300            *(uint64_t *)(vd + ih) = l;
3301        }
3302    } else {
3303        for (i = 0; i < oprsz_2; i += 1) {
3304            intptr_t il = H1(i);
3305            intptr_t ih = H1(oprsz - 1 - i);
3306            uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
3307            uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
3308            *(uint8_t *)(vd + il) = h;
3309            *(uint8_t *)(vd + ih) = l;
3310        }
3311    }
3312}
3313
3314void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
3315{
3316    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3317    intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
3318    uint64_t *d = vd;
3319    intptr_t i;
3320
3321    if (oprsz <= 8) {
3322        uint64_t nn = *(uint64_t *)vn;
3323        int half = 4 * oprsz;
3324
3325        nn = extract64(nn, high * half, half);
3326        nn = expand_bits(nn, 0);
3327        d[0] = nn;
3328    } else {
3329        ARMPredicateReg tmp_n;
3330
3331        /* We produce output faster than we consume input.
3332           Therefore we must be mindful of possible overlap.  */
3333        if ((vn - vd) < (uintptr_t)oprsz) {
3334            vn = memcpy(&tmp_n, vn, oprsz);
3335        }
3336        if (high) {
3337            high = oprsz >> 1;
3338        }
3339
3340        if ((oprsz & 7) == 0) {
3341            uint32_t *n = vn;
3342            high >>= 2;
3343
3344            for (i = 0; i < oprsz / 8; i++) {
3345                uint64_t nn = n[H4(high + i)];
3346                d[i] = expand_bits(nn, 0);
3347            }
3348        } else {
3349            uint16_t *d16 = vd;
3350            uint8_t *n = vn;
3351
3352            for (i = 0; i < oprsz / 2; i++) {
3353                uint16_t nn = n[H1(high + i)];
3354                d16[H2(i)] = expand_bits(nn, 0);
3355            }
3356        }
3357    }
3358}
3359
3360#define DO_ZIP(NAME, TYPE, H) \
3361void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)       \
3362{                                                                    \
3363    intptr_t oprsz = simd_oprsz(desc);                               \
3364    intptr_t odd_ofs = simd_data(desc);                              \
3365    intptr_t i, oprsz_2 = oprsz / 2;                                 \
3366    ARMVectorReg tmp_n, tmp_m;                                       \
3367    /* We produce output faster than we consume input.               \
3368       Therefore we must be mindful of possible overlap.  */         \
3369    if (unlikely((vn - vd) < (uintptr_t)oprsz)) {                    \
3370        vn = memcpy(&tmp_n, vn, oprsz);                              \
3371    }                                                                \
3372    if (unlikely((vm - vd) < (uintptr_t)oprsz)) {                    \
3373        vm = memcpy(&tmp_m, vm, oprsz);                              \
3374    }                                                                \
3375    for (i = 0; i < oprsz_2; i += sizeof(TYPE)) {                    \
3376        *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + odd_ofs + H(i)); \
3377        *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) =                    \
3378            *(TYPE *)(vm + odd_ofs + H(i));                          \
3379    }                                                                \
3380    if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) {                \
3381        memset(vd + oprsz - 16, 0, 16);                              \
3382    }                                                                \
3383}
3384
3385DO_ZIP(sve_zip_b, uint8_t, H1)
3386DO_ZIP(sve_zip_h, uint16_t, H1_2)
3387DO_ZIP(sve_zip_s, uint32_t, H1_4)
3388DO_ZIP(sve_zip_d, uint64_t, H1_8)
3389DO_ZIP(sve2_zip_q, Int128, )
3390
3391#define DO_UZP(NAME, TYPE, H) \
3392void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)         \
3393{                                                                      \
3394    intptr_t oprsz = simd_oprsz(desc);                                 \
3395    intptr_t odd_ofs = simd_data(desc);                                \
3396    intptr_t i, p;                                                     \
3397    ARMVectorReg tmp_m;                                                \
3398    if (unlikely((vm - vd) < (uintptr_t)oprsz)) {                      \
3399        vm = memcpy(&tmp_m, vm, oprsz);                                \
3400    }                                                                  \
3401    i = 0, p = odd_ofs;                                                \
3402    do {                                                               \
3403        *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(p));                   \
3404        i += sizeof(TYPE), p += 2 * sizeof(TYPE);                      \
3405    } while (p < oprsz);                                               \
3406    p -= oprsz;                                                        \
3407    do {                                                               \
3408        *(TYPE *)(vd + H(i)) = *(TYPE *)(vm + H(p));                   \
3409        i += sizeof(TYPE), p += 2 * sizeof(TYPE);                      \
3410    } while (p < oprsz);                                               \
3411    tcg_debug_assert(i == oprsz);                                      \
3412}
3413
3414DO_UZP(sve_uzp_b, uint8_t, H1)
3415DO_UZP(sve_uzp_h, uint16_t, H1_2)
3416DO_UZP(sve_uzp_s, uint32_t, H1_4)
3417DO_UZP(sve_uzp_d, uint64_t, H1_8)
3418DO_UZP(sve2_uzp_q, Int128, )
3419
3420#define DO_TRN(NAME, TYPE, H) \
3421void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)         \
3422{                                                                      \
3423    intptr_t oprsz = simd_oprsz(desc);                                 \
3424    intptr_t odd_ofs = simd_data(desc);                                \
3425    intptr_t i;                                                        \
3426    for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) {                    \
3427        TYPE ae = *(TYPE *)(vn + H(i + odd_ofs));                      \
3428        TYPE be = *(TYPE *)(vm + H(i + odd_ofs));                      \
3429        *(TYPE *)(vd + H(i + 0)) = ae;                                 \
3430        *(TYPE *)(vd + H(i + sizeof(TYPE))) = be;                      \
3431    }                                                                  \
3432    if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) {                  \
3433        memset(vd + oprsz - 16, 0, 16);                                \
3434    }                                                                  \
3435}
3436
3437DO_TRN(sve_trn_b, uint8_t, H1)
3438DO_TRN(sve_trn_h, uint16_t, H1_2)
3439DO_TRN(sve_trn_s, uint32_t, H1_4)
3440DO_TRN(sve_trn_d, uint64_t, H1_8)
3441DO_TRN(sve2_trn_q, Int128, )
3442
3443#undef DO_ZIP
3444#undef DO_UZP
3445#undef DO_TRN
3446
3447void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
3448{
3449    intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
3450    uint32_t *d = vd, *n = vn;
3451    uint8_t *pg = vg;
3452
3453    for (i = j = 0; i < opr_sz; i++) {
3454        if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
3455            d[H4(j)] = n[H4(i)];
3456            j++;
3457        }
3458    }
3459    for (; j < opr_sz; j++) {
3460        d[H4(j)] = 0;
3461    }
3462}
3463
3464void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
3465{
3466    intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
3467    uint64_t *d = vd, *n = vn;
3468    uint8_t *pg = vg;
3469
3470    for (i = j = 0; i < opr_sz; i++) {
3471        if (pg[H1(i)] & 1) {
3472            d[j] = n[i];
3473            j++;
3474        }
3475    }
3476    for (; j < opr_sz; j++) {
3477        d[j] = 0;
3478    }
3479}
3480
3481/* Similar to the ARM LastActiveElement pseudocode function, except the
3482 * result is multiplied by the element size.  This includes the not found
3483 * indication; e.g. not found for esz=3 is -8.
3484 */
3485int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
3486{
3487    intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
3488    intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3489
3490    return last_active_element(vg, words, esz);
3491}
3492
3493void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
3494{
3495    intptr_t opr_sz = simd_oprsz(desc) / 8;
3496    int esz = simd_data(desc);
3497    uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
3498    intptr_t i, first_i, last_i;
3499    ARMVectorReg tmp;
3500
3501    first_i = last_i = 0;
3502    first_g = last_g = 0;
3503
3504    /* Find the extent of the active elements within VG.  */
3505    for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
3506        pg = *(uint64_t *)(vg + i) & mask;
3507        if (pg) {
3508            if (last_g == 0) {
3509                last_g = pg;
3510                last_i = i;
3511            }
3512            first_g = pg;
3513            first_i = i;
3514        }
3515    }
3516
3517    len = 0;
3518    if (first_g != 0) {
3519        first_i = first_i * 8 + ctz64(first_g);
3520        last_i = last_i * 8 + 63 - clz64(last_g);
3521        len = last_i - first_i + (1 << esz);
3522        if (vd == vm) {
3523            vm = memcpy(&tmp, vm, opr_sz * 8);
3524        }
3525        swap_memmove(vd, vn + first_i, len);
3526    }
3527    swap_memmove(vd + len, vm, opr_sz * 8 - len);
3528}
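
/*
 * Added example: SPLICE copies the active extent of ZN (from the first
 * through the last guard-active element) to the low part of the result
 * and fills the remainder from ZM.  With 32-bit elements, ZN = {1,2,3,4},
 * ZM = {5,6,7,8} and only elements 1 and 2 active in the governing
 * predicate, the result is {2, 3, 5, 6}.
 */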
3529
3530void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
3531                            void *vg, uint32_t desc)
3532{
3533    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3534    uint64_t *d = vd, *n = vn, *m = vm;
3535    uint8_t *pg = vg;
3536
3537    for (i = 0; i < opr_sz; i += 1) {
3538        uint64_t nn = n[i], mm = m[i];
3539        uint64_t pp = expand_pred_b(pg[H1(i)]);
3540        d[i] = (nn & pp) | (mm & ~pp);
3541    }
3542}
3543
3544void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
3545                            void *vg, uint32_t desc)
3546{
3547    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3548    uint64_t *d = vd, *n = vn, *m = vm;
3549    uint8_t *pg = vg;
3550
3551    for (i = 0; i < opr_sz; i += 1) {
3552        uint64_t nn = n[i], mm = m[i];
3553        uint64_t pp = expand_pred_h(pg[H1(i)]);
3554        d[i] = (nn & pp) | (mm & ~pp);
3555    }
3556}
3557
3558void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
3559                            void *vg, uint32_t desc)
3560{
3561    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3562    uint64_t *d = vd, *n = vn, *m = vm;
3563    uint8_t *pg = vg;
3564
3565    for (i = 0; i < opr_sz; i += 1) {
3566        uint64_t nn = n[i], mm = m[i];
3567        uint64_t pp = expand_pred_s(pg[H1(i)]);
3568        d[i] = (nn & pp) | (mm & ~pp);
3569    }
3570}
3571
3572void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
3573                            void *vg, uint32_t desc)
3574{
3575    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3576    uint64_t *d = vd, *n = vn, *m = vm;
3577    uint8_t *pg = vg;
3578
3579    for (i = 0; i < opr_sz; i += 1) {
3580        uint64_t nn = n[i], mm = m[i];
3581        d[i] = (pg[H1(i)] & 1 ? nn : mm);
3582    }
3583}
3584
3585void HELPER(sve_sel_zpzz_q)(void *vd, void *vn, void *vm,
3586                            void *vg, uint32_t desc)
3587{
3588    intptr_t i, opr_sz = simd_oprsz(desc) / 16;
3589    Int128 *d = vd, *n = vn, *m = vm;
3590    uint16_t *pg = vg;
3591
3592    for (i = 0; i < opr_sz; i += 1) {
3593        d[i] = (pg[H2(i)] & 1 ? n : m)[i];
3594    }
3595}
3596
3597/* Two operand comparison controlled by a predicate.
3598 * ??? It is very tempting to expand this inline
3599 * with x86 instructions, e.g.
3600 *
3601 *    vcmpeqw    zm, zn, %ymm0
3602 *    vpmovmskb  %ymm0, %eax
3603 *    and        $0x5555, %eax
3604 *    and        pg, %eax
3605 *
3606 * or even aarch64, e.g.
3607 *
3608 *    // mask = 4000 1000 0400 0100 0040 0010 0004 0001
3609 *    cmeq       v0.8h, zn, zm
3610 *    and        v0.8h, v0.8h, mask
3611 *    addv       h0, v0.8h
3612 *    and        v0.8b, pg
3613 *
3614 * However, coming up with an abstraction that allows vector inputs and
3615 * a scalar output, and also handles the byte-ordering of sub-uint64_t
3616 * scalar outputs, is tricky.
3617 */
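/*
 * Added note on the packing in DO_CMP_PPZZ below: the inner loop shifts
 * OUT left by sizeof(TYPE) per element and ORs in the 0/1 comparison
 * result, so the result for the element at byte offset i ends up at bit i
 * of OUT, which is the canonical predicate bit for that element.  The
 * per-size MASK (e.g. 0x1111...1ull for 32-bit elements) combined with
 * the guard then keeps only those canonical bits.
 */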
3618#define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK)                                 \
3619uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3620{                                                                            \
3621    intptr_t opr_sz = simd_oprsz(desc);                                      \
3622    uint32_t flags = PREDTEST_INIT;                                          \
3623    intptr_t i = opr_sz;                                                     \
3624    do {                                                                     \
3625        uint64_t out = 0, pg;                                                \
3626        do {                                                                 \
3627            i -= sizeof(TYPE), out <<= sizeof(TYPE);                         \
3628            TYPE nn = *(TYPE *)(vn + H(i));                                  \
3629            TYPE mm = *(TYPE *)(vm + H(i));                                  \
3630            out |= nn OP mm;                                                 \
3631        } while (i & 63);                                                    \
3632        pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                            \
3633        out &= pg;                                                           \
3634        *(uint64_t *)(vd + (i >> 3)) = out;                                  \
3635        flags = iter_predtest_bwd(out, pg, flags);                           \
3636    } while (i > 0);                                                         \
3637    return flags;                                                            \
3638}
3639
3640#define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
3641    DO_CMP_PPZZ(NAME, TYPE, OP, H1,   0xffffffffffffffffull)
3642#define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
3643    DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3644#define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
3645    DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3646#define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
3647    DO_CMP_PPZZ(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)
3648
3649DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t,  ==)
3650DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
3651DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
3652DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)
3653
3654DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t,  !=)
3655DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
3656DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
3657DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)
3658
3659DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t,  >)
3660DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
3661DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
3662DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)
3663
3664DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t,  >=)
3665DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
3666DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
3667DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)
3668
3669DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t,  >)
3670DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
3671DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
3672DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)
3673
3674DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t,  >=)
3675DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
3676DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
3677DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)
3678
3679#undef DO_CMP_PPZZ_B
3680#undef DO_CMP_PPZZ_H
3681#undef DO_CMP_PPZZ_S
3682#undef DO_CMP_PPZZ_D
3683#undef DO_CMP_PPZZ
3684
3685/* Similar, but the second source is "wide".  */
3686#define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK)                     \
3687uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3688{                                                                            \
3689    intptr_t opr_sz = simd_oprsz(desc);                                      \
3690    uint32_t flags = PREDTEST_INIT;                                          \
3691    intptr_t i = opr_sz;                                                     \
3692    do {                                                                     \
3693        uint64_t out = 0, pg;                                                \
3694        do {                                                                 \
3695            TYPEW mm = *(TYPEW *)(vm + i - 8);                               \
3696            do {                                                             \
3697                i -= sizeof(TYPE), out <<= sizeof(TYPE);                     \
3698                TYPE nn = *(TYPE *)(vn + H(i));                              \
3699                out |= nn OP mm;                                             \
3700            } while (i & 7);                                                 \
3701        } while (i & 63);                                                    \
3702        pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                            \
3703        out &= pg;                                                           \
3704        *(uint64_t *)(vd + (i >> 3)) = out;                                  \
3705        flags = iter_predtest_bwd(out, pg, flags);                           \
3706    } while (i > 0);                                                         \
3707    return flags;                                                            \
3708}
3709
3710#define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
3711    DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1,   0xffffffffffffffffull)
3712#define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
3713    DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
3714#define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
3715    DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)
3716
3717DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t,  uint64_t, ==)
3718DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==)
3719DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==)
3720
3721DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t,  uint64_t, !=)
3722DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=)
3723DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=)
3724
3725DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t,   int64_t, >)
3726DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t,  int64_t, >)
3727DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t,  int64_t, >)
3728
3729DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t,   int64_t, >=)
3730DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t,  int64_t, >=)
3731DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t,  int64_t, >=)
3732
3733DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t,  uint64_t, >)
3734DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
3735DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)
3736
3737DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t,  uint64_t, >=)
3738DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
3739DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)
3740
3741DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t,   int64_t, <)
3742DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t,  int64_t, <)
3743DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t,  int64_t, <)
3744
3745DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t,   int64_t, <=)
3746DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t,  int64_t, <=)
3747DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t,  int64_t, <=)
3748
3749DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t,  uint64_t, <)
3750DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
3751DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)
3752
3753DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t,  uint64_t, <=)
3754DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
3755DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)
3756
3757#undef DO_CMP_PPZW_B
3758#undef DO_CMP_PPZW_H
3759#undef DO_CMP_PPZW_S
3760#undef DO_CMP_PPZW
3761
3762/* Similar, but the second source is immediate.  */
3763#define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK)                         \
3764uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)   \
3765{                                                                    \
3766    intptr_t opr_sz = simd_oprsz(desc);                              \
3767    uint32_t flags = PREDTEST_INIT;                                  \
3768    TYPE mm = simd_data(desc);                                       \
3769    intptr_t i = opr_sz;                                             \
3770    do {                                                             \
3771        uint64_t out = 0, pg;                                        \
3772        do {                                                         \
3773            i -= sizeof(TYPE), out <<= sizeof(TYPE);                 \
3774            TYPE nn = *(TYPE *)(vn + H(i));                          \
3775            out |= nn OP mm;                                         \
3776        } while (i & 63);                                            \
3777        pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                    \
3778        out &= pg;                                                   \
3779        *(uint64_t *)(vd + (i >> 3)) = out;                          \
3780        flags = iter_predtest_bwd(out, pg, flags);                   \
3781    } while (i > 0);                                                 \
3782    return flags;                                                    \
3783}
3784
3785#define DO_CMP_PPZI_B(NAME, TYPE, OP) \
3786    DO_CMP_PPZI(NAME, TYPE, OP, H1,   0xffffffffffffffffull)
3787#define DO_CMP_PPZI_H(NAME, TYPE, OP) \
3788    DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3789#define DO_CMP_PPZI_S(NAME, TYPE, OP) \
3790    DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3791#define DO_CMP_PPZI_D(NAME, TYPE, OP) \
3792    DO_CMP_PPZI(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)
3793
3794DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t,  ==)
3795DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
3796DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
3797DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)
3798
3799DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t,  !=)
3800DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
3801DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
3802DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)
3803
3804DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t,  >)
3805DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
3806DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
3807DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)
3808
3809DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t,  >=)
3810DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
3811DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
3812DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)
3813
3814DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t,  >)
3815DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
3816DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
3817DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)
3818
3819DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t,  >=)
3820DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
3821DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
3822DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)
3823
3824DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t,  <)
3825DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
3826DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
3827DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)
3828
3829DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t,  <=)
3830DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
3831DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
3832DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)
3833
3834DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t,  <)
3835DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
3836DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
3837DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)
3838
3839DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t,  <=)
3840DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
3841DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
3842DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)
3843
3844#undef DO_CMP_PPZI_B
3845#undef DO_CMP_PPZI_H
3846#undef DO_CMP_PPZI_S
3847#undef DO_CMP_PPZI_D
3848#undef DO_CMP_PPZI
3849
3850/* Similar to the ARM LastActive pseudocode function.  */
3851static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
3852{
3853    intptr_t i;
3854
3855    for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) {
3856        uint64_t pg = *(uint64_t *)(vg + i);
3857        if (pg) {
3858            return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0;
3859        }
3860    }
3861    return false;
3862}
3863
3864/* Compute a mask into RETB that is true for all G, up to and including
3865 * (if after) or excluding (if !after) the first G & N.
3866 * Return true if BRK found.
3867 */
3868static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
3869                        bool brk, bool after)
3870{
3871    uint64_t b;
3872
3873    if (brk) {
3874        b = 0;
3875    } else if ((g & n) == 0) {
3876        /* For all G, no N are set; break not found.  */
3877        b = g;
3878    } else {
3879        /* Break somewhere in N.  Locate it.  */
3880        b = g & n;            /* guard true, pred true */
3881        b = b & -b;           /* first such */
3882        if (after) {
3883            b = b | (b - 1);  /* break after same */
3884        } else {
3885            b = b - 1;        /* break before same */
3886        }
3887        brk = true;
3888    }
3889
3890    *retb = b;
3891    return brk;
3892}
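
/*
 * Worked example (added): with g = 0b1111, n = 0b0100 and !brk on entry,
 * b & -b isolates the first guarded N bit (0b0100); "after" widens it to
 * 0b0111 (break after that element), while "!after" gives 0b0011 (break
 * before it).  Once brk is set, every later word contributes b = 0.
 */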
3893
3894/* Compute a zeroing BRK.  */
3895static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
3896                          intptr_t oprsz, bool after)
3897{
3898    bool brk = false;
3899    intptr_t i;
3900
3901    for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3902        uint64_t this_b, this_g = g[i];
3903
3904        brk = compute_brk(&this_b, n[i], this_g, brk, after);
3905        d[i] = this_b & this_g;
3906    }
3907}
3908
3909/* Likewise, but also compute flags.  */
3910static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
3911                               intptr_t oprsz, bool after)
3912{
3913    uint32_t flags = PREDTEST_INIT;
3914    bool brk = false;
3915    intptr_t i;
3916
3917    for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3918        uint64_t this_b, this_d, this_g = g[i];
3919
3920        brk = compute_brk(&this_b, n[i], this_g, brk, after);
3921        d[i] = this_d = this_b & this_g;
3922        flags = iter_predtest_fwd(this_d, this_g, flags);
3923    }
3924    return flags;
3925}
3926
3927/* Compute a merging BRK.  */
3928static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
3929                          intptr_t oprsz, bool after)
3930{
3931    bool brk = false;
3932    intptr_t i;
3933
3934    for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3935        uint64_t this_b, this_g = g[i];
3936
3937        brk = compute_brk(&this_b, n[i], this_g, brk, after);
3938        d[i] = (this_b & this_g) | (d[i] & ~this_g);
3939    }
3940}
3941
3942/* Likewise, but also compute flags.  */
3943static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
3944                               intptr_t oprsz, bool after)
3945{
3946    uint32_t flags = PREDTEST_INIT;
3947    bool brk = false;
3948    intptr_t i;
3949
3950    for (i = 0; i < oprsz / 8; ++i) {
3951        uint64_t this_b, this_d = d[i], this_g = g[i];
3952
3953        brk = compute_brk(&this_b, n[i], this_g, brk, after);
3954        d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
3955        flags = iter_predtest_fwd(this_d, this_g, flags);
3956    }
3957    return flags;
3958}
3959
3960static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
3961{
3962    /* It is quicker to zero the whole predicate than loop on OPRSZ.
3963     * The compiler should turn this into 4 64-bit integer stores.
3964     */
3965    memset(d, 0, sizeof(ARMPredicateReg));
3966    return PREDTEST_INIT;
3967}
3968
3969void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
3970                       uint32_t pred_desc)
3971{
3972    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3973    if (last_active_pred(vn, vg, oprsz)) {
3974        compute_brk_z(vd, vm, vg, oprsz, true);
3975    } else {
3976        do_zero(vd, oprsz);
3977    }
3978}
3979
3980uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
3981                            uint32_t pred_desc)
3982{
3983    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3984    if (last_active_pred(vn, vg, oprsz)) {
3985        return compute_brks_z(vd, vm, vg, oprsz, true);
3986    } else {
3987        return do_zero(vd, oprsz);
3988    }
3989}
3990
3991void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
3992                       uint32_t pred_desc)
3993{
3994    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3995    if (last_active_pred(vn, vg, oprsz)) {
3996        compute_brk_z(vd, vm, vg, oprsz, false);
3997    } else {
3998        do_zero(vd, oprsz);
3999    }
4000}
4001
4002uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
4003                            uint32_t pred_desc)
4004{
4005    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4006    if (last_active_pred(vn, vg, oprsz)) {
4007        return compute_brks_z(vd, vm, vg, oprsz, false);
4008    } else {
4009        return do_zero(vd, oprsz);
4010    }
4011}
4012
4013void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4014{
4015    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4016    compute_brk_z(vd, vn, vg, oprsz, true);
4017}
4018
4019uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4020{
4021    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4022    return compute_brks_z(vd, vn, vg, oprsz, true);
4023}
4024
4025void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4026{
4027    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4028    compute_brk_z(vd, vn, vg, oprsz, false);
4029}
4030
4031uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4032{
4033    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4034    return compute_brks_z(vd, vn, vg, oprsz, false);
4035}
4036
4037void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4038{
4039    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4040    compute_brk_m(vd, vn, vg, oprsz, true);
4041}
4042
4043uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4044{
4045    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4046    return compute_brks_m(vd, vn, vg, oprsz, true);
4047}
4048
4049void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4050{
4051    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4052    compute_brk_m(vd, vn, vg, oprsz, false);
4053}
4054
4055uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4056{
4057    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4058    return compute_brks_m(vd, vn, vg, oprsz, false);
4059}
4060
4061void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4062{
4063    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4064    if (!last_active_pred(vn, vg, oprsz)) {
4065        do_zero(vd, oprsz);
4066    }
4067}
4068
4069/* As if PredTest(Ones(PL), D, esz).  */
4070static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
4071                              uint64_t esz_mask)
4072{
4073    uint32_t flags = PREDTEST_INIT;
4074    intptr_t i;
4075
4076    for (i = 0; i < oprsz / 8; i++) {
4077        flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
4078    }
4079    if (oprsz & 7) {
4080        uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
4081        flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
4082    }
4083    return flags;
4084}
4085
4086uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4087{
4088    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4089    if (last_active_pred(vn, vg, oprsz)) {
4090        return predtest_ones(vd, oprsz, -1);
4091    } else {
4092        return do_zero(vd, oprsz);
4093    }
4094}
4095
4096uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
4097{
4098    intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
4099    intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4100    uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
4101    intptr_t i;
4102
4103    for (i = 0; i < words; ++i) {
4104        uint64_t t = n[i] & g[i] & mask;
4105        sum += ctpop64(t);
4106    }
4107    return sum;
4108}
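
/*
 * Added example: pred_esz_masks[esz] keeps one canonical bit per element,
 * so for 32-bit elements (esz == 2, mask 0x1111111111111111ull) each set
 * bit of n & g & mask is exactly one active element, and ctpop64() sums
 * them.
 */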
4109
4110uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc)
4111{
4112    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4113    intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4114    uint64_t esz_mask = pred_esz_masks[esz];
4115    ARMPredicateReg *d = vd;
4116    uint32_t flags;
4117    intptr_t i;
4118
4119    /* Begin with a zero predicate register.  */
4120    flags = do_zero(d, oprsz);
4121    if (count == 0) {
4122        return flags;
4123    }
4124
4125    /* Set all of the requested bits.  */
4126    for (i = 0; i < count / 64; ++i) {
4127        d->p[i] = esz_mask;
4128    }
4129    if (count & 63) {
4130        d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
4131    }
4132
4133    return predtest_ones(d, oprsz, esz_mask);
4134}
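
/*
 * Illustration: count is in predicate bits (one per vector byte), not
 * elements.  E.g. with esz == 2 (.S) and count == 16, only d->p[0] is
 * written, as 0x1111, making the first four .S elements active.
 */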
4135
4136uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc)
4137{
4138    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4139    intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4140    uint64_t esz_mask = pred_esz_masks[esz];
4141    ARMPredicateReg *d = vd;
4142    intptr_t i, invcount, oprbits;
4143    uint64_t bits;
4144
4145    if (count == 0) {
4146        return do_zero(d, oprsz);
4147    }
4148
4149    oprbits = oprsz * 8;
4150    tcg_debug_assert(count <= oprbits);
4151
4152    bits = esz_mask;
4153    if (oprbits & 63) {
4154        bits &= MAKE_64BIT_MASK(0, oprbits & 63);
4155    }
4156
4157    invcount = oprbits - count;
4158    for (i = (oprsz - 1) / 8; i > invcount / 64; --i) {
4159        d->p[i] = bits;
4160        bits = esz_mask;
4161    }
4162
4163    d->p[i] = bits & MAKE_64BIT_MASK(invcount & 63, 64);
4164
4165    while (--i >= 0) {
4166        d->p[i] = 0;
4167    }
4168
4169    return predtest_ones(d, oprsz, esz_mask);
4170}
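
/*
 * Illustration: the decrementing form fills from the top of the
 * predicate.  E.g. with oprsz == 8 (a 512-bit vector, 64 predicate
 * bits), esz == 2 (.S) and count == 16, d->p[0] becomes
 * 0x1111000000000000, making the last four .S elements active.
 */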
4171
4172/* Recursive reduction over a binary function;
4173 * cf. the ARM ARM function ReducePredicated.
4174 *
4175 * While it would be possible to write this without the DATA temporary,
4176 * it is much simpler to process the predicate register this way.
4177 * The recursion is bounded to depth 7 (128 fp16 elements), so there's
4178 * little to gain with a more complex non-recursive form.
4179 */
4180#define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT)                         \
4181static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
4182{                                                                     \
4183    if (n == 1) {                                                     \
4184        return *data;                                                 \
4185    } else {                                                          \
4186        uintptr_t half = n / 2;                                       \
4187        TYPE lo = NAME##_reduce(data, status, half);                  \
4188        TYPE hi = NAME##_reduce(data + half, status, half);           \
4189        return TYPE##_##FUNC(lo, hi, status);                         \
4190    }                                                                 \
4191}                                                                     \
4192uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc)    \
4193{                                                                     \
4194    uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc);   \
4195    TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)];                   \
4196    for (i = 0; i < oprsz; ) {                                        \
4197        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));               \
4198        do {                                                          \
4199            TYPE nn = *(TYPE *)(vn + H(i));                           \
4200            *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT);      \
4201            i += sizeof(TYPE), pg >>= sizeof(TYPE);                   \
4202        } while (i & 15);                                             \
4203    }                                                                 \
4204    for (; i < maxsz; i += sizeof(TYPE)) {                            \
4205        *(TYPE *)((void *)data + i) = IDENT;                          \
4206    }                                                                 \
4207    return NAME##_reduce(data, vs, maxsz / sizeof(TYPE));             \
4208}
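
/*
 * Illustration: inactive lanes, and the padding from oprsz up to maxsz,
 * are filled with IDENT, so for eight float32 lanes the recursion
 * evaluates the balanced tree
 *
 *   ((d0 OP d1) OP (d2 OP d3)) OP ((d4 OP d5) OP (d6 OP d7))
 *
 * Note that maxsz / sizeof(TYPE) must be a power of two for the halving
 * recursion to visit every lane; the caller is expected to round maxsz
 * up accordingly.
 */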
4209
4210DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero)
4211DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero)
4212DO_REDUCE(sve_faddv_d, float64, H1_8, add, float64_zero)
4213
4214/* Identity is floatN_default_nan, without the function call.  */
4215DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00)
4216DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000)
4217DO_REDUCE(sve_fminnmv_d, float64, H1_8, minnum, 0x7FF8000000000000ULL)
4218
4219DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00)
4220DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000)
4221DO_REDUCE(sve_fmaxnmv_d, float64, H1_8, maxnum, 0x7FF8000000000000ULL)
4222
4223DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity)
4224DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity)
4225DO_REDUCE(sve_fminv_d, float64, H1_8, min, float64_infinity)
4226
4227DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity))
4228DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity))
4229DO_REDUCE(sve_fmaxv_d, float64, H1_8, max, float64_chs(float64_infinity))
4230
4231#undef DO_REDUCE
4232
4233uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
4234                             void *status, uint32_t desc)
4235{
4236    intptr_t i = 0, opr_sz = simd_oprsz(desc);
4237    float16 result = nn;
4238
4239    do {
4240        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4241        do {
4242            if (pg & 1) {
4243                float16 mm = *(float16 *)(vm + H1_2(i));
4244                result = float16_add(result, mm, status);
4245            }
4246            i += sizeof(float16), pg >>= sizeof(float16);
4247        } while (i & 15);
4248    } while (i < opr_sz);
4249
4250    return result;
4251}
4252
4253uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
4254                             void *status, uint32_t desc)
4255{
4256    intptr_t i = 0, opr_sz = simd_oprsz(desc);
4257    float32 result = nn;
4258
4259    do {
4260        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4261        do {
4262            if (pg & 1) {
4263                float32 mm = *(float32 *)(vm + H1_2(i));
4264                result = float32_add(result, mm, status);
4265            }
4266            i += sizeof(float32), pg >>= sizeof(float32);
4267        } while (i & 15);
4268    } while (i < opr_sz);
4269
4270    return result;
4271}
4272
4273uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
4274                             void *status, uint32_t desc)
4275{
4276    intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
4277    uint64_t *m = vm;
4278    uint8_t *pg = vg;
4279
4280    for (i = 0; i < opr_sz; i++) {
4281        if (pg[H1(i)] & 1) {
4282            nn = float64_add(nn, m[i], status);
4283        }
4284    }
4285
4286    return nn;
4287}
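
/*
 * Note: unlike the tree-shaped reductions above, FADDA is a strictly
 * ordered accumulation, result = (...((nn + m[0]) + m[1]) + ...) over
 * the active elements in increasing element order, so it cannot reuse
 * the identity-padded DO_REDUCE scheme.
 */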
4288
4289/* Fully general three-operand expander, controlled by a predicate,
4290 * with the extra float_status parameter.
4291 */
4292#define DO_ZPZZ_FP(NAME, TYPE, H, OP)                           \
4293void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,       \
4294                  void *status, uint32_t desc)                  \
4295{                                                               \
4296    intptr_t i = simd_oprsz(desc);                              \
4297    uint64_t *g = vg;                                           \
4298    do {                                                        \
4299        uint64_t pg = g[(i - 1) >> 6];                          \
4300        do {                                                    \
4301            i -= sizeof(TYPE);                                  \
4302            if (likely((pg >> (i & 63)) & 1)) {                 \
4303                TYPE nn = *(TYPE *)(vn + H(i));                 \
4304                TYPE mm = *(TYPE *)(vm + H(i));                 \
4305                *(TYPE *)(vd + H(i)) = OP(nn, mm, status);      \
4306            }                                                   \
4307        } while (i & 63);                                       \
4308    } while (i != 0);                                           \
4309}
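
/*
 * Note on the loop structure used by this and the following expanders:
 * the governing predicate has one bit per vector byte, so the element
 * at byte offset i is controlled by bit (i & 63) of predicate word
 * g[i >> 6].  Each outer iteration consumes one 64-bit predicate word,
 * i.e. 64 bytes of vector, walking from the top of the vector down.
 */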
4310
4311DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
4312DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
4313DO_ZPZZ_FP(sve_fadd_d, uint64_t, H1_8, float64_add)
4314
4315DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
4316DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
4317DO_ZPZZ_FP(sve_fsub_d, uint64_t, H1_8, float64_sub)
4318
4319DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
4320DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
4321DO_ZPZZ_FP(sve_fmul_d, uint64_t, H1_8, float64_mul)
4322
4323DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
4324DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
4325DO_ZPZZ_FP(sve_fdiv_d, uint64_t, H1_8, float64_div)
4326
4327DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
4328DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
4329DO_ZPZZ_FP(sve_fmin_d, uint64_t, H1_8, float64_min)
4330
4331DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
4332DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
4333DO_ZPZZ_FP(sve_fmax_d, uint64_t, H1_8, float64_max)
4334
4335DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
4336DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
4337DO_ZPZZ_FP(sve_fminnum_d, uint64_t, H1_8, float64_minnum)
4338
4339DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
4340DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
4341DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, H1_8, float64_maxnum)
4342
4343static inline float16 abd_h(float16 a, float16 b, float_status *s)
4344{
4345    return float16_abs(float16_sub(a, b, s));
4346}
4347
4348static inline float32 abd_s(float32 a, float32 b, float_status *s)
4349{
4350    return float32_abs(float32_sub(a, b, s));
4351}
4352
4353static inline float64 abd_d(float64 a, float64 b, float_status *s)
4354{
4355    return float64_abs(float64_sub(a, b, s));
4356}
4357
4358DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
4359DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
4360DO_ZPZZ_FP(sve_fabd_d, uint64_t, H1_8, abd_d)
4361
4362static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
4363{
4364    int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
4365    return float64_scalbn(a, b_int, s);
4366}
4367
4368DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
4369DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
4370DO_ZPZZ_FP(sve_fscalbn_d, int64_t, H1_8, scalbn_d)
4371
4372DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
4373DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
4374DO_ZPZZ_FP(sve_fmulx_d, uint64_t, H1_8, helper_vfp_mulxd)
4375
4376#undef DO_ZPZZ_FP
4377
4378/* Three-operand expander, with one scalar operand, controlled by
4379 * a predicate, with the extra float_status parameter.
4380 */
4381#define DO_ZPZS_FP(NAME, TYPE, H, OP) \
4382void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar,  \
4383                  void *status, uint32_t desc)                    \
4384{                                                                 \
4385    intptr_t i = simd_oprsz(desc);                                \
4386    uint64_t *g = vg;                                             \
4387    TYPE mm = scalar;                                             \
4388    do {                                                          \
4389        uint64_t pg = g[(i - 1) >> 6];                            \
4390        do {                                                      \
4391            i -= sizeof(TYPE);                                    \
4392            if (likely((pg >> (i & 63)) & 1)) {                   \
4393                TYPE nn = *(TYPE *)(vn + H(i));                   \
4394                *(TYPE *)(vd + H(i)) = OP(nn, mm, status);        \
4395            }                                                     \
4396        } while (i & 63);                                         \
4397    } while (i != 0);                                             \
4398}
4399
4400DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
4401DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
4402DO_ZPZS_FP(sve_fadds_d, float64, H1_8, float64_add)
4403
4404DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
4405DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
4406DO_ZPZS_FP(sve_fsubs_d, float64, H1_8, float64_sub)
4407
4408DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
4409DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
4410DO_ZPZS_FP(sve_fmuls_d, float64, H1_8, float64_mul)
4411
4412static inline float16 subr_h(float16 a, float16 b, float_status *s)
4413{
4414    return float16_sub(b, a, s);
4415}
4416
4417static inline float32 subr_s(float32 a, float32 b, float_status *s)
4418{
4419    return float32_sub(b, a, s);
4420}
4421
4422static inline float64 subr_d(float64 a, float64 b, float_status *s)
4423{
4424    return float64_sub(b, a, s);
4425}
4426
4427DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
4428DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
4429DO_ZPZS_FP(sve_fsubrs_d, float64, H1_8, subr_d)
4430
4431DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
4432DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
4433DO_ZPZS_FP(sve_fmaxnms_d, float64, H1_8, float64_maxnum)
4434
4435DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
4436DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
4437DO_ZPZS_FP(sve_fminnms_d, float64, H1_8, float64_minnum)
4438
4439DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
4440DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
4441DO_ZPZS_FP(sve_fmaxs_d, float64, H1_8, float64_max)
4442
4443DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
4444DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
4445DO_ZPZS_FP(sve_fmins_d, float64, H1_8, float64_min)
4446
4447/* Fully general two-operand expander, controlled by a predicate,
4448 * with the extra float_status parameter.
4449 */
4450#define DO_ZPZ_FP(NAME, TYPE, H, OP)                                  \
4451void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
4452{                                                                     \
4453    intptr_t i = simd_oprsz(desc);                                    \
4454    uint64_t *g = vg;                                                 \
4455    do {                                                              \
4456        uint64_t pg = g[(i - 1) >> 6];                                \
4457        do {                                                          \
4458            i -= sizeof(TYPE);                                        \
4459            if (likely((pg >> (i & 63)) & 1)) {                       \
4460                TYPE nn = *(TYPE *)(vn + H(i));                       \
4461                *(TYPE *)(vd + H(i)) = OP(nn, status);                \
4462            }                                                         \
4463        } while (i & 63);                                             \
4464    } while (i != 0);                                                 \
4465}
4466
4467/* SVE fp16 conversions always use IEEE mode.  Like AdvSIMD, they ignore
4468 * FZ16.  When converting from fp16, this affects flushing input denormals;
4469 * when converting to fp16, this affects flushing output denormals.
4470 */
4471static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
4472{
4473    bool save = get_flush_inputs_to_zero(fpst);
4474    float32 ret;
4475
4476    set_flush_inputs_to_zero(false, fpst);
4477    ret = float16_to_float32(f, true, fpst);
4478    set_flush_inputs_to_zero(save, fpst);
4479    return ret;
4480}
4481
4482static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
4483{
4484    bool save = get_flush_inputs_to_zero(fpst);
4485    float64 ret;
4486
4487    set_flush_inputs_to_zero(false, fpst);
4488    ret = float16_to_float64(f, true, fpst);
4489    set_flush_inputs_to_zero(save, fpst);
4490    return ret;
4491}
4492
4493static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
4494{
4495    bool save = get_flush_to_zero(fpst);
4496    float16 ret;
4497
4498    set_flush_to_zero(false, fpst);
4499    ret = float32_to_float16(f, true, fpst);
4500    set_flush_to_zero(save, fpst);
4501    return ret;
4502}
4503
4504static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
4505{
4506    bool save = get_flush_to_zero(fpst);
4507    float16 ret;
4508
4509    set_flush_to_zero(false, fpst);
4510    ret = float64_to_float16(f, true, fpst);
4511    set_flush_to_zero(save, fpst);
4512    return ret;
4513}
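
/*
 * Illustration of the FZ16 independence described above: the smallest
 * fp16 subnormal (0x0001, i.e. 2**-24) converts to float32 0x33800000
 * even when FZ16 would otherwise flush fp16 denormal inputs, and a
 * float32 2**-24 likewise converts down to fp16 0x0001 rather than
 * being flushed to zero on output.
 */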
4514
4515static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
4516{
4517    if (float16_is_any_nan(f)) {
4518        float_raise(float_flag_invalid, s);
4519        return 0;
4520    }
4521    return float16_to_int16_round_to_zero(f, s);
4522}
4523
4524static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
4525{
4526    if (float16_is_any_nan(f)) {
4527        float_raise(float_flag_invalid, s);
4528        return 0;
4529    }
4530    return float16_to_int64_round_to_zero(f, s);
4531}
4532
4533static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
4534{
4535    if (float32_is_any_nan(f)) {
4536        float_raise(float_flag_invalid, s);
4537        return 0;
4538    }
4539    return float32_to_int64_round_to_zero(f, s);
4540}
4541
4542static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
4543{
4544    if (float64_is_any_nan(f)) {
4545        float_raise(float_flag_invalid, s);
4546        return 0;
4547    }
4548    return float64_to_int64_round_to_zero(f, s);
4549}
4550
4551static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
4552{
4553    if (float16_is_any_nan(f)) {
4554        float_raise(float_flag_invalid, s);
4555        return 0;
4556    }
4557    return float16_to_uint16_round_to_zero(f, s);
4558}
4559
4560static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
4561{
4562    if (float16_is_any_nan(f)) {
4563        float_raise(float_flag_invalid, s);
4564        return 0;
4565    }
4566    return float16_to_uint64_round_to_zero(f, s);
4567}
4568
4569static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
4570{
4571    if (float32_is_any_nan(f)) {
4572        float_raise(float_flag_invalid, s);
4573        return 0;
4574    }
4575    return float32_to_uint64_round_to_zero(f, s);
4576}
4577
4578static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
4579{
4580    if (float64_is_any_nan(f)) {
4581        float_raise(float_flag_invalid, s);
4582        return 0;
4583    }
4584    return float64_to_uint64_round_to_zero(f, s);
4585}
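
/*
 * The wrappers above exist because the Arm FCVTZS/FCVTZU instructions
 * return 0 for a NaN input (raising Invalid Operation), so NaNs are
 * filtered out before calling the generic round-to-zero conversions.
 */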
4586
4587DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
4588DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
4589DO_ZPZ_FP(sve_bfcvt,   uint32_t, H1_4, float32_to_bfloat16)
4590DO_ZPZ_FP(sve_fcvt_dh, uint64_t, H1_8, sve_f64_to_f16)
4591DO_ZPZ_FP(sve_fcvt_hd, uint64_t, H1_8, sve_f16_to_f64)
4592DO_ZPZ_FP(sve_fcvt_ds, uint64_t, H1_8, float64_to_float32)
4593DO_ZPZ_FP(sve_fcvt_sd, uint64_t, H1_8, float32_to_float64)
4594
4595DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
4596DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
4597DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
4598DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, H1_8, vfp_float16_to_int64_rtz)
4599DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, H1_8, vfp_float32_to_int64_rtz)
4600DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, H1_8, helper_vfp_tosizd)
4601DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, H1_8, vfp_float64_to_int64_rtz)
4602
4603DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
4604DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
4605DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
4606DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, H1_8, vfp_float16_to_uint64_rtz)
4607DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, H1_8, vfp_float32_to_uint64_rtz)
4608DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, H1_8, helper_vfp_touizd)
4609DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, H1_8, vfp_float64_to_uint64_rtz)
4610
4611DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
4612DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
4613DO_ZPZ_FP(sve_frint_d, uint64_t, H1_8, helper_rintd)
4614
4615DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
4616DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
4617DO_ZPZ_FP(sve_frintx_d, uint64_t, H1_8, float64_round_to_int)
4618
4619DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
4620DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
4621DO_ZPZ_FP(sve_frecpx_d, uint64_t, H1_8, helper_frecpx_f64)
4622
4623DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
4624DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
4625DO_ZPZ_FP(sve_fsqrt_d, uint64_t, H1_8, float64_sqrt)
4626
4627DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
4628DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
4629DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
4630DO_ZPZ_FP(sve_scvt_sd, uint64_t, H1_8, int32_to_float64)
4631DO_ZPZ_FP(sve_scvt_dh, uint64_t, H1_8, int64_to_float16)
4632DO_ZPZ_FP(sve_scvt_ds, uint64_t, H1_8, int64_to_float32)
4633DO_ZPZ_FP(sve_scvt_dd, uint64_t, H1_8, int64_to_float64)
4634
4635DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
4636DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
4637DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
4638DO_ZPZ_FP(sve_ucvt_sd, uint64_t, H1_8, uint32_to_float64)
4639DO_ZPZ_FP(sve_ucvt_dh, uint64_t, H1_8, uint64_to_float16)
4640DO_ZPZ_FP(sve_ucvt_ds, uint64_t, H1_8, uint64_to_float32)
4641DO_ZPZ_FP(sve_ucvt_dd, uint64_t, H1_8, uint64_to_float64)
4642
4643static int16_t do_float16_logb_as_int(float16 a, float_status *s)
4644{
4645    /* Extract frac to the top of the uint32_t. */
4646    uint32_t frac = (uint32_t)a << (16 + 6);
4647    int16_t exp = extract32(a, 10, 5);
4648
4649    if (unlikely(exp == 0)) {
4650        if (frac != 0) {
4651            if (!get_flush_inputs_to_zero(s)) {
4652                /* denormal: bias - fractional_zeros */
4653                return -15 - clz32(frac);
4654            }
4655            /* flush to zero */
4656            float_raise(float_flag_input_denormal, s);
4657        }
4658    } else if (unlikely(exp == 0x1f)) {
4659        if (frac == 0) {
4660            return INT16_MAX; /* infinity */
4661        }
4662    } else {
4663        /* normal: exp - bias */
4664        return exp - 15;
4665    }
4666    /* nan or zero */
4667    float_raise(float_flag_invalid, s);
4668    return INT16_MIN;
4669}
4670
4671static int32_t do_float32_logb_as_int(float32 a, float_status *s)
4672{
4673    /* Extract frac to the top of the uint32_t. */
4674    uint32_t frac = a << 9;
4675    int32_t exp = extract32(a, 23, 8);
4676
4677    if (unlikely(exp == 0)) {
4678        if (frac != 0) {
4679            if (!get_flush_inputs_to_zero(s)) {
4680                /* denormal: bias - fractional_zeros */
4681                return -127 - clz32(frac);
4682            }
4683            /* flush to zero */
4684            float_raise(float_flag_input_denormal, s);
4685        }
4686    } else if (unlikely(exp == 0xff)) {
4687        if (frac == 0) {
4688            return INT32_MAX; /* infinity */
4689        }
4690    } else {
4691        /* normal: exp - bias */
4692        return exp - 127;
4693    }
4694    /* nan or zero */
4695    float_raise(float_flag_invalid, s);
4696    return INT32_MIN;
4697}
4698
4699static int64_t do_float64_logb_as_int(float64 a, float_status *s)
4700{
4701    /* Extract frac to the top of the uint64_t. */
4702    uint64_t frac = a << 12;
4703    int64_t exp = extract64(a, 52, 11);
4704
4705    if (unlikely(exp == 0)) {
4706        if (frac != 0) {
4707            if (!get_flush_inputs_to_zero(s)) {
4708                /* denormal: bias - fractional_zeros */
4709                return -1023 - clz64(frac);
4710            }
4711            /* flush to zero */
4712            float_raise(float_flag_input_denormal, s);
4713        }
4714    } else if (unlikely(exp == 0x7ff)) {
4715        if (frac == 0) {
4716            return INT64_MAX; /* infinity */
4717        }
4718    } else {
4719        /* normal: exp - bias */
4720        return exp - 1023;
4721    }
4722    /* nan or zero */
4723    float_raise(float_flag_invalid, s);
4724    return INT64_MIN;
4725}
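
/*
 * Worked values for the logb helpers above, shown for fp16: 8.0 has a
 * biased exponent of 18, so FLOGB returns 3; 0.25 returns -2; +/-Inf
 * returns INT16_MAX; zero and NaN raise Invalid and return INT16_MIN;
 * the subnormal 2**-15 returns -15 via the clz path.
 */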
4726
4727DO_ZPZ_FP(flogb_h, float16, H1_2, do_float16_logb_as_int)
4728DO_ZPZ_FP(flogb_s, float32, H1_4, do_float32_logb_as_int)
4729DO_ZPZ_FP(flogb_d, float64, H1_8, do_float64_logb_as_int)
4730
4731#undef DO_ZPZ_FP
4732
4733static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg,
4734                            float_status *status, uint32_t desc,
4735                            uint16_t neg1, uint16_t neg3)
4736{
4737    intptr_t i = simd_oprsz(desc);
4738    uint64_t *g = vg;
4739
4740    do {
4741        uint64_t pg = g[(i - 1) >> 6];
4742        do {
4743            i -= 2;
4744            if (likely((pg >> (i & 63)) & 1)) {
4745                float16 e1, e2, e3, r;
4746
4747                e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
4748                e2 = *(uint16_t *)(vm + H1_2(i));
4749                e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
4750                r = float16_muladd(e1, e2, e3, 0, status);
4751                *(uint16_t *)(vd + H1_2(i)) = r;
4752            }
4753        } while (i & 63);
4754    } while (i != 0);
4755}
4756
4757void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4758                              void *vg, void *status, uint32_t desc)
4759{
4760    do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0);
4761}
4762
4763void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4764                              void *vg, void *status, uint32_t desc)
4765{
4766    do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0);
4767}
4768
4769void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4770                               void *vg, void *status, uint32_t desc)
4771{
4772    do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000);
4773}
4774
4775void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4776                               void *vg, void *status, uint32_t desc)
4777{
4778    do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000);
4779}
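
/*
 * The neg1/neg3 arguments simply flip the sign bit (FPNeg) of the
 * multiplicand and/or the addend, giving the four variants:
 *   fmla:  d = a + n * m
 *   fmls:  d = a - n * m        (neg1)
 *   fnmla: d = -a - n * m       (neg1 and neg3)
 *   fnmls: d = -a + n * m       (neg3)
 */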
4780
4781static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg,
4782                            float_status *status, uint32_t desc,
4783                            uint32_t neg1, uint32_t neg3)
4784{
4785    intptr_t i = simd_oprsz(desc);
4786    uint64_t *g = vg;
4787
4788    do {
4789        uint64_t pg = g[(i - 1) >> 6];
4790        do {
4791            i -= 4;
4792            if (likely((pg >> (i & 63)) & 1)) {
4793                float32 e1, e2, e3, r;
4794
4795                e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
4796                e2 = *(uint32_t *)(vm + H1_4(i));
4797                e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
4798                r = float32_muladd(e1, e2, e3, 0, status);
4799                *(uint32_t *)(vd + H1_4(i)) = r;
4800            }
4801        } while (i & 63);
4802    } while (i != 0);
4803}
4804
4805void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4806                              void *vg, void *status, uint32_t desc)
4807{
4808    do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0);
4809}
4810
4811void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4812                              void *vg, void *status, uint32_t desc)
4813{
4814    do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0);
4815}
4816
4817void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4818                               void *vg, void *status, uint32_t desc)
4819{
4820    do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000);
4821}
4822
4823void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4824                               void *vg, void *status, uint32_t desc)
4825{
4826    do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000);
4827}
4828
4829static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg,
4830                            float_status *status, uint32_t desc,
4831                            uint64_t neg1, uint64_t neg3)
4832{
4833    intptr_t i = simd_oprsz(desc);
4834    uint64_t *g = vg;
4835
4836    do {
4837        uint64_t pg = g[(i - 1) >> 6];
4838        do {
4839            i -= 8;
4840            if (likely((pg >> (i & 63)) & 1)) {
4841                float64 e1, e2, e3, r;
4842
4843                e1 = *(uint64_t *)(vn + i) ^ neg1;
4844                e2 = *(uint64_t *)(vm + i);
4845                e3 = *(uint64_t *)(va + i) ^ neg3;
4846                r = float64_muladd(e1, e2, e3, 0, status);
4847                *(uint64_t *)(vd + i) = r;
4848            }
4849        } while (i & 63);
4850    } while (i != 0);
4851}
4852
4853void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4854                              void *vg, void *status, uint32_t desc)
4855{
4856    do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0);
4857}
4858
4859void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4860                              void *vg, void *status, uint32_t desc)
4861{
4862    do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0);
4863}
4864
4865void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4866                               void *vg, void *status, uint32_t desc)
4867{
4868    do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN);
4869}
4870
4871void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4872                               void *vg, void *status, uint32_t desc)
4873{
4874    do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN);
4875}
4876
4877/* Two operand floating-point comparison controlled by a predicate.
4878 * Unlike the integer version, we are not allowed to optimistically
4879 * compare operands, since the comparison may have side effects wrt
4880 * the FPSR.
4881 */
4882#define DO_FPCMP_PPZZ(NAME, TYPE, H, OP)                                \
4883void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,               \
4884                  void *status, uint32_t desc)                          \
4885{                                                                       \
4886    intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6;                    \
4887    uint64_t *d = vd, *g = vg;                                          \
4888    do {                                                                \
4889        uint64_t out = 0, pg = g[j];                                    \
4890        do {                                                            \
4891            i -= sizeof(TYPE), out <<= sizeof(TYPE);                    \
4892            if (likely((pg >> (i & 63)) & 1)) {                         \
4893                TYPE nn = *(TYPE *)(vn + H(i));                         \
4894                TYPE mm = *(TYPE *)(vm + H(i));                         \
4895                out |= OP(TYPE, nn, mm, status);                        \
4896            }                                                           \
4897        } while (i & 63);                                               \
4898        d[j--] = out;                                                   \
4899    } while (i > 0);                                                    \
4900}
4901
4902#define DO_FPCMP_PPZZ_H(NAME, OP) \
4903    DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
4904#define DO_FPCMP_PPZZ_S(NAME, OP) \
4905    DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
4906#define DO_FPCMP_PPZZ_D(NAME, OP) \
4907    DO_FPCMP_PPZZ(NAME##_d, float64, H1_8, OP)
4908
4909#define DO_FPCMP_PPZZ_ALL(NAME, OP) \
4910    DO_FPCMP_PPZZ_H(NAME, OP)   \
4911    DO_FPCMP_PPZZ_S(NAME, OP)   \
4912    DO_FPCMP_PPZZ_D(NAME, OP)
4913
4914#define DO_FCMGE(TYPE, X, Y, ST)  TYPE##_compare(Y, X, ST) <= 0
4915#define DO_FCMGT(TYPE, X, Y, ST)  TYPE##_compare(Y, X, ST) < 0
4916#define DO_FCMLE(TYPE, X, Y, ST)  TYPE##_compare(X, Y, ST) <= 0
4917#define DO_FCMLT(TYPE, X, Y, ST)  TYPE##_compare(X, Y, ST) < 0
4918#define DO_FCMEQ(TYPE, X, Y, ST)  TYPE##_compare_quiet(X, Y, ST) == 0
4919#define DO_FCMNE(TYPE, X, Y, ST)  TYPE##_compare_quiet(X, Y, ST) != 0
4920#define DO_FCMUO(TYPE, X, Y, ST)  \
4921    TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
4922#define DO_FACGE(TYPE, X, Y, ST)  \
4923    TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
4924#define DO_FACGT(TYPE, X, Y, ST)  \
4925    TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0
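
/*
 * Note: GE/GT (and the absolute-value forms ACGE/ACGT) are computed
 * with the operands swapped, e.g. X >= Y as compare(Y, X) <= 0, using
 * the signalling compare so that any NaN operand raises Invalid and
 * produces a false result; EQ/NE/UO use the quiet compare, which
 * raises Invalid only for signalling NaNs.
 */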
4926
4927DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
4928DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
4929DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
4930DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
4931DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
4932DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
4933DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)
4934
4935#undef DO_FPCMP_PPZZ_ALL
4936#undef DO_FPCMP_PPZZ_D
4937#undef DO_FPCMP_PPZZ_S
4938#undef DO_FPCMP_PPZZ_H
4939#undef DO_FPCMP_PPZZ
4940
4941/* One operand floating-point comparison against zero, controlled
4942 * by a predicate.
4943 */
4944#define DO_FPCMP_PPZ0(NAME, TYPE, H, OP)                   \
4945void HELPER(NAME)(void *vd, void *vn, void *vg,            \
4946                  void *status, uint32_t desc)             \
4947{                                                          \
4948    intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6;       \
4949    uint64_t *d = vd, *g = vg;                             \
4950    do {                                                   \
4951        uint64_t out = 0, pg = g[j];                       \
4952        do {                                               \
4953            i -= sizeof(TYPE), out <<= sizeof(TYPE);       \
4954            if ((pg >> (i & 63)) & 1) {                    \
4955                TYPE nn = *(TYPE *)(vn + H(i));            \
4956                out |= OP(TYPE, nn, 0, status);            \
4957            }                                              \
4958        } while (i & 63);                                  \
4959        d[j--] = out;                                      \
4960    } while (i > 0);                                       \
4961}
4962
4963#define DO_FPCMP_PPZ0_H(NAME, OP) \
4964    DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
4965#define DO_FPCMP_PPZ0_S(NAME, OP) \
4966    DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
4967#define DO_FPCMP_PPZ0_D(NAME, OP) \
4968    DO_FPCMP_PPZ0(NAME##_d, float64, H1_8, OP)
4969
4970#define DO_FPCMP_PPZ0_ALL(NAME, OP) \
4971    DO_FPCMP_PPZ0_H(NAME, OP)   \
4972    DO_FPCMP_PPZ0_S(NAME, OP)   \
4973    DO_FPCMP_PPZ0_D(NAME, OP)
4974
4975DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
4976DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
4977DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
4978DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
4979DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
4980DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
4981
4982/* FP Trig Multiply-Add. */
4983
4984void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
4985{
4986    static const float16 coeff[16] = {
4987        0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
4988        0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
4989    };
4990    intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
4991    intptr_t x = simd_data(desc);
4992    float16 *d = vd, *n = vn, *m = vm;
4993    for (i = 0; i < opr_sz; i++) {
4994        float16 mm = m[i];
4995        intptr_t xx = x;
4996        if (float16_is_neg(mm)) {
4997            mm = float16_abs(mm);
4998            xx += 8;
4999        }
5000        d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs);
5001    }
5002}
5003
5004void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
5005{
5006    static const float32 coeff[16] = {
5007        0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
5008        0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
5009        0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
5010        0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
5011    };
5012    intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
5013    intptr_t x = simd_data(desc);
5014    float32 *d = vd, *n = vn, *m = vm;
5015    for (i = 0; i < opr_sz; i++) {
5016        float32 mm = m[i];
5017        intptr_t xx = x;
5018        if (float32_is_neg(mm)) {
5019            mm = float32_abs(mm);
5020            xx += 8;
5021        }
5022        d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs);
5023    }
5024}
5025
5026void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
5027{
5028    static const float64 coeff[16] = {
5029        0x3ff0000000000000ull, 0xbfc5555555555543ull,
5030        0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
5031        0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
5032        0x3de5d8408868552full, 0x0000000000000000ull,
5033        0x3ff0000000000000ull, 0xbfe0000000000000ull,
5034        0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
5035        0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
5036        0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
5037    };
5038    intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
5039    intptr_t x = simd_data(desc);
5040    float64 *d = vd, *n = vn, *m = vm;
5041    for (i = 0; i < opr_sz; i++) {
5042        float64 mm = m[i];
5043        intptr_t xx = x;
5044        if (float64_is_neg(mm)) {
5045            mm = float64_abs(mm);
5046            xx += 8;
5047        }
5048        d[i] = float64_muladd(n[i], mm, coeff[xx], 0, vs);
5049    }
5050}
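
/*
 * The coefficient tables above hold the architected FTMAD constants:
 * entries 0-7 are (approximately) the Taylor series for sin(x) and
 * entries 8-15 the series for cos(x); a negative Zm element selects
 * the second half via xx += 8, with its magnitude as the multiplicand.
 */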
5051
5052/*
5053 * FP Complex Add
5054 */
5055
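/*
 * For FCADD, simd_data(desc) selects the rotation: 0 is the #90 form
 * (d_real = n_real - m_imag, d_imag = n_imag + m_real) and 1 is the
 * #270 form (d_real = n_real + m_imag, d_imag = n_imag - m_real),
 * implemented below by flipping the sign bit of one of the two Zm
 * elements before the additions.
 */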
5056void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
5057                         void *vs, uint32_t desc)
5058{
5059    intptr_t j, i = simd_oprsz(desc);
5060    uint64_t *g = vg;
5061    float16 neg_imag = float16_set_sign(0, simd_data(desc));
5062    float16 neg_real = float16_chs(neg_imag);
5063
5064    do {
5065        uint64_t pg = g[(i - 1) >> 6];
5066        do {
5067            float16 e0, e1, e2, e3;
5068
5069            /* I holds the real index; J holds the imag index.  */
5070            j = i - sizeof(float16);
5071            i -= 2 * sizeof(float16);
5072
5073            e0 = *(float16 *)(vn + H1_2(i));
5074            e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real;
5075            e2 = *(float16 *)(vn + H1_2(j));
5076            e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag;
5077
5078            if (likely((pg >> (i & 63)) & 1)) {
5079                *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs);
5080            }
5081            if (likely((pg >> (j & 63)) & 1)) {
5082                *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs);
5083            }
5084        } while (i & 63);
5085    } while (i != 0);
5086}
5087
5088void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
5089                         void *vs, uint32_t desc)
5090{
5091    intptr_t j, i = simd_oprsz(desc);
5092    uint64_t *g = vg;
5093    float32 neg_imag = float32_set_sign(0, simd_data(desc));
5094    float32 neg_real = float32_chs(neg_imag);
5095
5096    do {
5097        uint64_t pg = g[(i - 1) >> 6];
5098        do {
5099            float32 e0, e1, e2, e3;
5100
5101            /* I holds the real index; J holds the imag index.  */
5102            j = i - sizeof(float32);
5103            i -= 2 * sizeof(float32);
5104
5105            e0 = *(float32 *)(vn + H1_2(i));
5106            e1 = *(float32 *)(vm + H1_2(j)) ^ neg_real;
5107            e2 = *(float32 *)(vn + H1_2(j));
5108            e3 = *(float32 *)(vm + H1_2(i)) ^ neg_imag;
5109
5110            if (likely((pg >> (i & 63)) & 1)) {
5111                *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, vs);
5112            }
5113            if (likely((pg >> (j & 63)) & 1)) {
5114                *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, vs);
5115            }
5116        } while (i & 63);
5117    } while (i != 0);
5118}
5119
5120void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
5121                         void *vs, uint32_t desc)
5122{
5123    intptr_t j, i = simd_oprsz(desc);
5124    uint64_t *g = vg;
5125    float64 neg_imag = float64_set_sign(0, simd_data(desc));
5126    float64 neg_real = float64_chs(neg_imag);
5127
5128    do {
5129        uint64_t pg = g[(i - 1) >> 6];
5130        do {
5131            float64 e0, e1, e2, e3;
5132
5133            /* I holds the real index; J holds the imag index.  */
5134            j = i - sizeof(float64);
5135            i -= 2 * sizeof(float64);
5136
5137            e0 = *(float64 *)(vn + H1_2(i));
5138            e1 = *(float64 *)(vm + H1_2(j)) ^ neg_real;
5139            e2 = *(float64 *)(vn + H1_2(j));
5140            e3 = *(float64 *)(vm + H1_2(i)) ^ neg_imag;
5141
5142            if (likely((pg >> (i & 63)) & 1)) {
5143                *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, vs);
5144            }
5145            if (likely((pg >> (j & 63)) & 1)) {
5146                *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, vs);
5147            }
5148        } while (i & 63);
5149    } while (i != 0);
5150}
5151
5152/*
5153 * FP Complex Multiply
5154 */
5155
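/*
 * For FCMLA, simd_data(desc) holds the rotation in multiples of 90
 * degrees (0..3).  With (nr, ni) and (mr, mi) the real and imaginary
 * parts of Zn and Zm, the accumulations performed below are:
 *   rot 0:   d_r += nr * mr;   d_i += nr * mi
 *   rot 90:  d_r += -ni * mi;  d_i += ni * mr
 *   rot 180: d_r += -nr * mr;  d_i += -nr * mi
 *   rot 270: d_r += ni * mi;   d_i += -ni * mr
 * realised via the flip/neg_real/neg_imag selections in the loops.
 */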
5156void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
5157                               void *vg, void *status, uint32_t desc)
5158{
5159    intptr_t j, i = simd_oprsz(desc);
5160    unsigned rot = simd_data(desc);
5161    bool flip = rot & 1;
5162    float16 neg_imag, neg_real;
5163    uint64_t *g = vg;
5164
5165    neg_imag = float16_set_sign(0, (rot & 2) != 0);
5166    neg_real = float16_set_sign(0, rot == 1 || rot == 2);
5167
5168    do {
5169        uint64_t pg = g[(i - 1) >> 6];
5170        do {
5171            float16 e1, e2, e3, e4, nr, ni, mr, mi, d;
5172
5173            /* I holds the real index; J holds the imag index.  */
5174            j = i - sizeof(float16);
5175            i -= 2 * sizeof(float16);
5176
5177            nr = *(float16 *)(vn + H1_2(i));
5178            ni = *(float16 *)(vn + H1_2(j));
5179            mr = *(float16 *)(vm + H1_2(i));
5180            mi = *(float16 *)(vm + H1_2(j));
5181
5182            e2 = (flip ? ni : nr);
5183            e1 = (flip ? mi : mr) ^ neg_real;
5184            e4 = e2;
5185            e3 = (flip ? mr : mi) ^ neg_imag;
5186
5187            if (likely((pg >> (i & 63)) & 1)) {
5188                d = *(float16 *)(va + H1_2(i));
5189                d = float16_muladd(e2, e1, d, 0, status);
5190                *(float16 *)(vd + H1_2(i)) = d;
5191            }
5192            if (likely((pg >> (j & 63)) & 1)) {
5193                d = *(float16 *)(va + H1_2(j));
5194                d = float16_muladd(e4, e3, d, 0, status);
5195                *(float16 *)(vd + H1_2(j)) = d;
5196            }
5197        } while (i & 63);
5198    } while (i != 0);
5199}
5200
5201void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
5202                               void *vg, void *status, uint32_t desc)
5203{
5204    intptr_t j, i = simd_oprsz(desc);
5205    unsigned rot = simd_data(desc);
5206    bool flip = rot & 1;
5207    float32 neg_imag, neg_real;
5208    uint64_t *g = vg;
5209
5210    neg_imag = float32_set_sign(0, (rot & 2) != 0);
5211    neg_real = float32_set_sign(0, rot == 1 || rot == 2);
5212
5213    do {
5214        uint64_t pg = g[(i - 1) >> 6];
5215        do {
5216            float32 e1, e2, e3, e4, nr, ni, mr, mi, d;
5217
5218            /* I holds the real index; J holds the imag index.  */
5219            j = i - sizeof(float32);
5220            i -= 2 * sizeof(float32);
5221
5222            nr = *(float32 *)(vn + H1_2(i));
5223            ni = *(float32 *)(vn + H1_2(j));
5224            mr = *(float32 *)(vm + H1_2(i));
5225            mi = *(float32 *)(vm + H1_2(j));
5226
5227            e2 = (flip ? ni : nr);
5228            e1 = (flip ? mi : mr) ^ neg_real;
5229            e4 = e2;
5230            e3 = (flip ? mr : mi) ^ neg_imag;
5231
5232            if (likely((pg >> (i & 63)) & 1)) {
5233                d = *(float32 *)(va + H1_2(i));
5234                d = float32_muladd(e2, e1, d, 0, status);
5235                *(float32 *)(vd + H1_2(i)) = d;
5236            }
5237            if (likely((pg >> (j & 63)) & 1)) {
5238                d = *(float32 *)(va + H1_2(j));
5239                d = float32_muladd(e4, e3, d, 0, status);
5240                *(float32 *)(vd + H1_2(j)) = d;
5241            }
5242        } while (i & 63);
5243    } while (i != 0);
5244}
5245
5246void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5247                               void *vg, void *status, uint32_t desc)
5248{
5249    intptr_t j, i = simd_oprsz(desc);
5250    unsigned rot = simd_data(desc);
5251    bool flip = rot & 1;
5252    float64 neg_imag, neg_real;
5253    uint64_t *g = vg;
5254
5255    neg_imag = float64_set_sign(0, (rot & 2) != 0);
5256    neg_real = float64_set_sign(0, rot == 1 || rot == 2);
5257
5258    do {
5259        uint64_t pg = g[(i - 1) >> 6];
5260        do {
5261            float64 e1, e2, e3, e4, nr, ni, mr, mi, d;
5262
5263            /* I holds the real index; J holds the imag index.  */
5264            j = i - sizeof(float64);
5265            i -= 2 * sizeof(float64);
5266
5267            nr = *(float64 *)(vn + H1_2(i));
5268            ni = *(float64 *)(vn + H1_2(j));
5269            mr = *(float64 *)(vm + H1_2(i));
5270            mi = *(float64 *)(vm + H1_2(j));
5271
5272            e2 = (flip ? ni : nr);
5273            e1 = (flip ? mi : mr) ^ neg_real;
5274            e4 = e2;
5275            e3 = (flip ? mr : mi) ^ neg_imag;
5276
5277            if (likely((pg >> (i & 63)) & 1)) {
5278                d = *(float64 *)(va + H1_2(i));
5279                d = float64_muladd(e2, e1, d, 0, status);
5280                *(float64 *)(vd + H1_2(i)) = d;
5281            }
5282            if (likely((pg >> (j & 63)) & 1)) {
5283                d = *(float64 *)(va + H1_2(j));
5284                d = float64_muladd(e4, e3, d, 0, status);
5285                *(float64 *)(vd + H1_2(j)) = d;
5286            }
5287        } while (i & 63);
5288    } while (i != 0);
5289}
5290
5291/*
5292 * Load contiguous data, protected by a governing predicate.
5293 */
5294
5295/*
5296 * Skip through a sequence of inactive elements in the guarding predicate @vg,
5297 * beginning at @reg_off, bounded by @reg_max.  Return the offset of the first
5298 * active element >= @reg_off, or @reg_max if there were none at all.
5299 */
5300static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off,
5301                                 intptr_t reg_max, int esz)
5302{
5303    uint64_t pg_mask = pred_esz_masks[esz];
5304    uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63);
5305
5306    /* In normal usage, the first element is active.  */
5307    if (likely(pg & 1)) {
5308        return reg_off;
5309    }
5310
5311    if (pg == 0) {
5312        reg_off &= -64;
5313        do {
5314            reg_off += 64;
5315            if (unlikely(reg_off >= reg_max)) {
5316                /* The entire predicate was false.  */
5317                return reg_max;
5318            }
5319            pg = vg[reg_off >> 6] & pg_mask;
5320        } while (pg == 0);
5321    }
5322    reg_off += ctz64(pg);
5323
5324    /* We should never see an out of range predicate bit set.  */
5325    tcg_debug_assert(reg_off < reg_max);
5326    return reg_off;
5327}
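
/*
 * Example: reg_off is measured in predicate bits, i.e. vector bytes.
 * With esz == 2 (.S) and vg[0] == 0x10000, find_next_active(vg, 0, 32, 2)
 * returns 16: the first active element is at byte offset 16, i.e. the
 * fifth .S element.
 */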
5328
5329/*
5330 * Resolve the guest virtual address to info->host and info->flags.
5331 * If @nofault, return false if the page is invalid, otherwise
5332 * exit via page fault exception.
5333 */
5334
5335bool sve_probe_page(SVEHostPage *info, bool nofault, CPUARMState *env,
5336                    target_ulong addr, int mem_off, MMUAccessType access_type,
5337                    int mmu_idx, uintptr_t retaddr)
5338{
5339    int flags;
5340
5341    addr += mem_off;
5342
5343    /*
5344     * User-only currently always runs with TBI enabled.  See the comment
5345     * above useronly_clean_ptr.  Usually we clean this top byte away
5346     * during translation, but we can't do that for e.g. vector + imm
5347     * addressing modes.
5348     *
5349     * We currently always enable TBI for user-only, and do not provide
5350     * a way to turn it off.  So clean the pointer unconditionally here,
5351     * rather than look it up here, or pass it down from above.
5352     */
5353    addr = useronly_clean_ptr(addr);
5354
5355#ifdef CONFIG_USER_ONLY
5356    flags = probe_access_flags(env, addr, 0, access_type, mmu_idx, nofault,
5357                               &info->host, retaddr);
5358#else
5359    CPUTLBEntryFull *full;
5360    flags = probe_access_full(env, addr, 0, access_type, mmu_idx, nofault,
5361                              &info->host, &full, retaddr);
5362#endif
5363    info->flags = flags;
5364
5365    if (flags & TLB_INVALID_MASK) {
5366        g_assert(nofault);
5367        return false;
5368    }
5369
5370#ifdef CONFIG_USER_ONLY
5371    memset(&info->attrs, 0, sizeof(info->attrs));
5372    /* Require both ANON and MTE; see allocation_tag_mem(). */
5373    info->tagged = (flags & PAGE_ANON) && (flags & PAGE_MTE);
5374#else
5375    info->attrs = full->attrs;
5376    info->tagged = full->pte_attrs == 0xf0;
5377#endif
5378
5379    /* Ensure that info->host is relative to addr, not addr + mem_off. */
5380    info->host -= mem_off;
5381    return true;
5382}
5383
5384/*
5385 * Find first active element on each page, and a loose bound for the
5386 * final element on each page.  Identify any single element that spans
5387 * the page boundary.  Return true if there are any active elements.
5388 */
5389bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr, uint64_t *vg,
5390                            intptr_t reg_max, int esz, int msize)
5391{
5392    const int esize = 1 << esz;
5393    const uint64_t pg_mask = pred_esz_masks[esz];
5394    intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split;
5395    intptr_t mem_off_last, mem_off_split;
5396    intptr_t page_split, elt_split;
5397    intptr_t i;
5398
5399    /* Set all of the element indices to -1, and the TLB data to 0. */
5400    memset(info, -1, offsetof(SVEContLdSt, page));
5401    memset(info->page, 0, sizeof(info->page));
5402
5403    /* Gross scan over the entire predicate to find bounds. */
5404    i = 0;
5405    do {
5406        uint64_t pg = vg[i] & pg_mask;
5407        if (pg) {
5408            reg_off_last = i * 64 + 63 - clz64(pg);
5409            if (reg_off_first < 0) {
5410                reg_off_first = i * 64 + ctz64(pg);
5411            }
5412        }
5413    } while (++i * 64 < reg_max);
5414
5415    if (unlikely(reg_off_first < 0)) {
5416        /* No active elements, no pages touched. */
5417        return false;
5418    }
5419    tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max);
5420
5421    info->reg_off_first[0] = reg_off_first;
5422    info->mem_off_first[0] = (reg_off_first >> esz) * msize;
5423    mem_off_last = (reg_off_last >> esz) * msize;
5424
5425    page_split = -(addr | TARGET_PAGE_MASK);
5426    if (likely(mem_off_last + msize <= page_split)) {
5427        /* The entire operation fits within a single page. */
5428        info->reg_off_last[0] = reg_off_last;
5429        return true;
5430    }
5431
5432    info->page_split = page_split;
5433    elt_split = page_split / msize;
5434    reg_off_split = elt_split << esz;
5435    mem_off_split = elt_split * msize;
5436
5437    /*
5438     * This is the last full element on the first page, but it is not
5439     * necessarily active.  If there is no full element, i.e. the first
5440     * active element is the one that's split, this value remains -1.
5441     * It is useful as iteration bounds.
5442     * It is useful as an iteration bound.
5443    if (elt_split != 0) {
5444        info->reg_off_last[0] = reg_off_split - esize;
5445    }
5446
5447    /* Determine if an unaligned element spans the pages.  */
5448    if (page_split % msize != 0) {
5449        /* It is helpful to know if the split element is active. */
5450        if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) {
5451            info->reg_off_split = reg_off_split;
5452            info->mem_off_split = mem_off_split;
5453
5454            if (reg_off_split == reg_off_last) {
5455                /* The page crossing element is last. */
5456                return true;
5457            }
5458        }
5459        reg_off_split += esize;
5460        mem_off_split += msize;
5461    }
5462
5463    /*
5464     * We do want the first active element on the second page, because
5465     * this may affect the address reported in an exception.
5466     */
5467    reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz);
5468    tcg_debug_assert(reg_off_split <= reg_off_last);
5469    info->reg_off_first[1] = reg_off_split;
5470    info->mem_off_first[1] = (reg_off_split >> esz) * msize;
5471    info->reg_off_last[1] = reg_off_last;
5472    return true;
5473}
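
/*
 * Example: a fully active 32-byte .S access (esz == 2, msize == 4)
 * whose base address lies 12 bytes below a page boundary gives
 * page_split == 12, reg_off_last[0] == 8 (element 2 is the last wholly
 * on the first page), reg_off_first[1] == 12 and reg_off_last[1] == 28;
 * reg_off_split stays -1 because 12 is a multiple of msize, so no
 * element straddles the boundary.
 */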
5474
5475/*
5476 * Resolve the guest virtual addresses to info->page[].
5477 * Control the generation of page faults with @fault.  Return false if
5478 * there is no work to do, which can only happen with @fault == FAULT_NO.
5479 */
5480bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault,
5481                         CPUARMState *env, target_ulong addr,
5482                         MMUAccessType access_type, uintptr_t retaddr)
5483{
5484    int mmu_idx = cpu_mmu_index(env, false);
5485    int mem_off = info->mem_off_first[0];
5486    bool nofault = fault == FAULT_NO;
5487    bool have_work = true;
5488
5489    if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off,
5490                        access_type, mmu_idx, retaddr)) {
5491        /* No work to be done. */
5492        return false;
5493    }
5494
5495    if (likely(info->page_split < 0)) {
5496        /* The entire operation was on the one page. */
5497        return true;
5498    }
5499
5500    /*
5501     * If the second page is invalid, then we want the fault address to be
5502     * the first byte on that page which is accessed.
5503     */
5504    if (info->mem_off_split >= 0) {
5505        /*
5506         * There is an element split across the pages.  The fault address
5507         * should be the first byte of the second page.
5508         */
5509        mem_off = info->page_split;
5510        /*
5511         * If the split element is also the first active element
5512         * of the vector, then:  For first-fault we should continue
5513         * to generate faults for the second page.  For no-fault,
5514         * we have work only if the second page is valid.
5515         */
5516        if (info->mem_off_first[0] < info->mem_off_split) {
5517            nofault = FAULT_FIRST;
5518            have_work = false;
5519        }
5520    } else {
5521        /*
5522         * There is no element split across the pages.  The fault address
5523         * should be the first active element on the second page.
5524         */
5525        mem_off = info->mem_off_first[1];
5526        /*
5527         * There must have been one active element on the first page,
5528         * so we're out of first-fault territory.
5529         */
5530        nofault = fault != FAULT_ALL;
5531    }
5532
5533    have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off,
5534                                access_type, mmu_idx, retaddr);
5535    return have_work;
5536}
5537
5538#ifndef CONFIG_USER_ONLY
5539void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env,
5540                               uint64_t *vg, target_ulong addr,
5541                               int esize, int msize, int wp_access,
5542                               uintptr_t retaddr)
5543{
5544    intptr_t mem_off, reg_off, reg_last;
5545    int flags0 = info->page[0].flags;
5546    int flags1 = info->page[1].flags;
5547
5548    if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) {
5549        return;
5550    }
5551
5552    /* Indicate that watchpoints are handled. */
5553    info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
5554    info->page[1].flags = flags1 & ~TLB_WATCHPOINT;
5555
5556    if (flags0 & TLB_WATCHPOINT) {
5557        mem_off = info->mem_off_first[0];
5558        reg_off = info->reg_off_first[0];
5559        reg_last = info->reg_off_last[0];
5560
5561        while (reg_off <= reg_last) {
5562            uint64_t pg = vg[reg_off >> 6];
5563            do {
5564                if ((pg >> (reg_off & 63)) & 1) {
5565                    cpu_check_watchpoint(env_cpu(env), addr + mem_off,
5566                                         msize, info->page[0].attrs,
5567                                         wp_access, retaddr);
5568                }
5569                reg_off += esize;
5570                mem_off += msize;
5571            } while (reg_off <= reg_last && (reg_off & 63));
5572        }
5573    }
5574
5575    mem_off = info->mem_off_split;
5576    if (mem_off >= 0) {
5577        cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize,
5578                             info->page[0].attrs, wp_access, retaddr);
5579    }
5580
5581    mem_off = info->mem_off_first[1];
5582    if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) {
5583        reg_off = info->reg_off_first[1];
5584        reg_last = info->reg_off_last[1];
5585
5586        do {
5587            uint64_t pg = vg[reg_off >> 6];
5588            do {
5589                if ((pg >> (reg_off & 63)) & 1) {
5590                    cpu_check_watchpoint(env_cpu(env), addr + mem_off,
5591                                         msize, info->page[1].attrs,
5592                                         wp_access, retaddr);
5593                }
5594                reg_off += esize;
5595                mem_off += msize;
5596            } while (reg_off & 63);
5597        } while (reg_off <= reg_last);
5598    }
5599}
5600#endif
5601
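/*
 * Apply the MTE tag checks for all active elements of a contiguous access.
 * Each of the (up to two) pages is processed only if its memory is Tagged;
 * any tag-check fault is raised via mte_check() against RA.
 */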
5602void sve_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env,
5603                             uint64_t *vg, target_ulong addr, int esize,
5604                             int msize, uint32_t mtedesc, uintptr_t ra)
5605{
5606    intptr_t mem_off, reg_off, reg_last;
5607
5608    /* Process the page only if MemAttr == Tagged. */
5609    if (info->page[0].tagged) {
5610        mem_off = info->mem_off_first[0];
5611        reg_off = info->reg_off_first[0];
5612        reg_last = info->reg_off_split;
5613        if (reg_last < 0) {
5614            reg_last = info->reg_off_last[0];
5615        }
5616
5617        do {
5618            uint64_t pg = vg[reg_off >> 6];
5619            do {
5620                if ((pg >> (reg_off & 63)) & 1) {
5621                    mte_check(env, mtedesc, addr, ra);
5622                }
5623                reg_off += esize;
5624                mem_off += msize;
5625            } while (reg_off <= reg_last && (reg_off & 63));
5626        } while (reg_off <= reg_last);
5627    }
5628
5629    mem_off = info->mem_off_first[1];
5630    if (mem_off >= 0 && info->page[1].tagged) {
5631        reg_off = info->reg_off_first[1];
5632        reg_last = info->reg_off_last[1];
5633
5634        do {
5635            uint64_t pg = vg[reg_off >> 6];
5636            do {
5637                if ((pg >> (reg_off & 63)) & 1) {
5638                    mte_check(env, mtedesc, addr, ra);
5639                }
5640                reg_off += esize;
5641                mem_off += msize;
5642            } while (reg_off & 63);
5643        } while (reg_off <= reg_last);
5644    }
5645}
5646
5647/*
5648 * Common helper for all contiguous 1,2,3,4-register predicated loads.
5649 */
5650static inline QEMU_ALWAYS_INLINE
5651void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr,
5652               uint32_t desc, const uintptr_t retaddr,
5653               const int esz, const int msz, const int N, uint32_t mtedesc,
5654               sve_ldst1_host_fn *host_fn,
5655               sve_ldst1_tlb_fn *tlb_fn)
5656{
5657    const unsigned rd = simd_data(desc);
5658    const intptr_t reg_max = simd_oprsz(desc);
5659    intptr_t reg_off, reg_last, mem_off;
5660    SVEContLdSt info;
5661    void *host;
5662    int flags, i;
5663
5664    /* Find the active elements.  */
5665    if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
5666        /* The entire predicate was false; no load occurs.  */
5667        for (i = 0; i < N; ++i) {
5668            memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
5669        }
5670        return;
5671    }
5672
5673    /* Probe the page(s).  Exit with exception for any invalid page. */
5674    sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr);
5675
5676    /* Handle watchpoints for all active elements. */
5677    sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
5678                              BP_MEM_READ, retaddr);
5679
5680    /*
5681     * Handle mte checks for all active elements.
5682     * Since TBI must be set for MTE, !mtedesc => !mte_active.
5683     */
5684    if (mtedesc) {
5685        sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
5686                                mtedesc, retaddr);
5687    }
5688
5689    flags = info.page[0].flags | info.page[1].flags;
5690    if (unlikely(flags != 0)) {
5691        /*
5692         * At least one page includes MMIO.
5693         * Any bus operation can fail with cpu_transaction_failed,
5694         * which for ARM will raise SyncExternal.  Perform the load
5695         * into scratch memory to preserve register state until the end.
5696         */
5697        ARMVectorReg scratch[4] = { };
5698
5699        mem_off = info.mem_off_first[0];
5700        reg_off = info.reg_off_first[0];
5701        reg_last = info.reg_off_last[1];
5702        if (reg_last < 0) {
5703            reg_last = info.reg_off_split;
5704            if (reg_last < 0) {
5705                reg_last = info.reg_off_last[0];
5706            }
5707        }
5708
5709        do {
5710            uint64_t pg = vg[reg_off >> 6];
5711            do {
5712                if ((pg >> (reg_off & 63)) & 1) {
5713                    for (i = 0; i < N; ++i) {
5714                        tlb_fn(env, &scratch[i], reg_off,
5715                               addr + mem_off + (i << msz), retaddr);
5716                    }
5717                }
5718                reg_off += 1 << esz;
5719                mem_off += N << msz;
5720            } while (reg_off & 63);
5721        } while (reg_off <= reg_last);
5722
5723        for (i = 0; i < N; ++i) {
5724            memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max);
5725        }
5726        return;
5727    }
5728
5729    /* The entire operation is in RAM, on valid pages. */
5730
5731    for (i = 0; i < N; ++i) {
5732        memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
5733    }
5734
5735    mem_off = info.mem_off_first[0];
5736    reg_off = info.reg_off_first[0];
5737    reg_last = info.reg_off_last[0];
5738    host = info.page[0].host;
5739
5740    while (reg_off <= reg_last) {
5741        uint64_t pg = vg[reg_off >> 6];
5742        do {
5743            if ((pg >> (reg_off & 63)) & 1) {
5744                for (i = 0; i < N; ++i) {
5745                    host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5746                            host + mem_off + (i << msz));
5747                }
5748            }
5749            reg_off += 1 << esz;
5750            mem_off += N << msz;
5751        } while (reg_off <= reg_last && (reg_off & 63));
5752    }
5753
5754    /*
5755     * Use the slow path to manage the cross-page misalignment.
5756     * But we know this is RAM and cannot trap.
5757     */
5758    mem_off = info.mem_off_split;
5759    if (unlikely(mem_off >= 0)) {
5760        reg_off = info.reg_off_split;
5761        for (i = 0; i < N; ++i) {
5762            tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
5763                   addr + mem_off + (i << msz), retaddr);
5764        }
5765    }
5766
5767    mem_off = info.mem_off_first[1];
5768    if (unlikely(mem_off >= 0)) {
5769        reg_off = info.reg_off_first[1];
5770        reg_last = info.reg_off_last[1];
5771        host = info.page[1].host;
5772
5773        do {
5774            uint64_t pg = vg[reg_off >> 6];
5775            do {
5776                if ((pg >> (reg_off & 63)) & 1) {
5777                    for (i = 0; i < N; ++i) {
5778                        host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5779                                host + mem_off + (i << msz));
5780                    }
5781                }
5782                reg_off += 1 << esz;
5783                mem_off += N << msz;
5784            } while (reg_off & 63);
5785        } while (reg_off <= reg_last);
5786    }
5787}
5788
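/*
 * MTE wrapper for sve_ldN_r: split the MTE descriptor out of the upper
 * bits of DESC, and suppress tag checking entirely when the TBI/TCMA
 * checks show that no tag check can apply to this address.
 */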
5789static inline QEMU_ALWAYS_INLINE
5790void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
5791                   uint32_t desc, const uintptr_t ra,
5792                   const int esz, const int msz, const int N,
5793                   sve_ldst1_host_fn *host_fn,
5794                   sve_ldst1_tlb_fn *tlb_fn)
5795{
5796    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5797    int bit55 = extract64(addr, 55, 1);
5798
5799    /* Remove mtedesc from the normal sve descriptor. */
5800    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5801
5802    /* Perform gross MTE suppression early. */
5803    if (!tbi_check(desc, bit55) ||
5804        tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
5805        mtedesc = 0;
5806    }
5807
5808    sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
5809}
5810
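/*
 * Instantiate the single-register LD1 helpers.  DO_LD1_1 covers the forms
 * with one byte in memory (no endian variants needed); DO_LD1_2 covers the
 * wider memory sizes with separate little- and big-endian helpers.  Both
 * also emit the corresponding MTE variants.
 */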
5811#define DO_LD1_1(NAME, ESZ)                                             \
5812void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg,                 \
5813                            target_ulong addr, uint32_t desc)           \
5814{                                                                       \
5815    sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0,            \
5816              sve_##NAME##_host, sve_##NAME##_tlb);                     \
5817}                                                                       \
5818void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg,             \
5819                                target_ulong addr, uint32_t desc)       \
5820{                                                                       \
5821    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1,           \
5822                  sve_##NAME##_host, sve_##NAME##_tlb);                 \
5823}
5824
5825#define DO_LD1_2(NAME, ESZ, MSZ)                                        \
5826void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg,              \
5827                               target_ulong addr, uint32_t desc)        \
5828{                                                                       \
5829    sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0,             \
5830              sve_##NAME##_le_host, sve_##NAME##_le_tlb);               \
5831}                                                                       \
5832void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg,              \
5833                               target_ulong addr, uint32_t desc)        \
5834{                                                                       \
5835    sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0,             \
5836              sve_##NAME##_be_host, sve_##NAME##_be_tlb);               \
5837}                                                                       \
5838void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg,          \
5839                                   target_ulong addr, uint32_t desc)    \
5840{                                                                       \
5841    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1,            \
5842                  sve_##NAME##_le_host, sve_##NAME##_le_tlb);           \
5843}                                                                       \
5844void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg,          \
5845                                   target_ulong addr, uint32_t desc)    \
5846{                                                                       \
5847    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1,            \
5848                  sve_##NAME##_be_host, sve_##NAME##_be_tlb);           \
5849}
5850
5851DO_LD1_1(ld1bb,  MO_8)
5852DO_LD1_1(ld1bhu, MO_16)
5853DO_LD1_1(ld1bhs, MO_16)
5854DO_LD1_1(ld1bsu, MO_32)
5855DO_LD1_1(ld1bss, MO_32)
5856DO_LD1_1(ld1bdu, MO_64)
5857DO_LD1_1(ld1bds, MO_64)
5858
5859DO_LD1_2(ld1hh,  MO_16, MO_16)
5860DO_LD1_2(ld1hsu, MO_32, MO_16)
5861DO_LD1_2(ld1hss, MO_32, MO_16)
5862DO_LD1_2(ld1hdu, MO_64, MO_16)
5863DO_LD1_2(ld1hds, MO_64, MO_16)
5864
5865DO_LD1_2(ld1ss,  MO_32, MO_32)
5866DO_LD1_2(ld1sdu, MO_64, MO_32)
5867DO_LD1_2(ld1sds, MO_64, MO_32)
5868
5869DO_LD1_2(ld1dd,  MO_64, MO_64)
5870
5871#undef DO_LD1_1
5872#undef DO_LD1_2
5873
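/*
 * Instantiate the multi-register LD2/LD3/LD4 helpers.  These reuse the
 * single-register host/tlb primitives and differ only in the interleave
 * count N; element size and memory size are equal for these forms.
 */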
5874#define DO_LDN_1(N)                                                     \
5875void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg,                \
5876                             target_ulong addr, uint32_t desc)          \
5877{                                                                       \
5878    sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0,           \
5879              sve_ld1bb_host, sve_ld1bb_tlb);                           \
5880}                                                                       \
5881void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg,            \
5882                                 target_ulong addr, uint32_t desc)      \
5883{                                                                       \
5884    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N,          \
5885                  sve_ld1bb_host, sve_ld1bb_tlb);                       \
5886}
5887
5888#define DO_LDN_2(N, SUFF, ESZ)                                          \
5889void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg,         \
5890                                    target_ulong addr, uint32_t desc)   \
5891{                                                                       \
5892    sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0,             \
5893              sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb);         \
5894}                                                                       \
5895void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg,         \
5896                                    target_ulong addr, uint32_t desc)   \
5897{                                                                       \
5898    sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0,             \
5899              sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb);         \
5900}                                                                       \
5901void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg,     \
5902                                        target_ulong addr, uint32_t desc) \
5903{                                                                       \
5904    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N,            \
5905                  sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb);     \
5906}                                                                       \
5907void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg,     \
5908                                        target_ulong addr, uint32_t desc) \
5909{                                                                       \
5910    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N,            \
5911                  sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb);     \
5912}
5913
5914DO_LDN_1(2)
5915DO_LDN_1(3)
5916DO_LDN_1(4)
5917
5918DO_LDN_2(2, hh, MO_16)
5919DO_LDN_2(3, hh, MO_16)
5920DO_LDN_2(4, hh, MO_16)
5921
5922DO_LDN_2(2, ss, MO_32)
5923DO_LDN_2(3, ss, MO_32)
5924DO_LDN_2(4, ss, MO_32)
5925
5926DO_LDN_2(2, dd, MO_64)
5927DO_LDN_2(3, dd, MO_64)
5928DO_LDN_2(4, dd, MO_64)
5929
5930#undef DO_LDN_1
5931#undef DO_LDN_2
5932
5933/*
5934 * Load contiguous data, first-fault and no-fault.
5935 *
5936 * For user-only, one could argue that we should hold the mmap_lock during
5937 * the operation so that there is no race between page_check_range and the
5938 * load operation.  However, unmapping pages out from under a running thread
5939 * is extraordinarily unlikely.  This theoretical race condition also affects
5940 * linux-user/ in its get_user/put_user macros.
5941 *
5942 * TODO: Construct some helpers, written in assembly, that interact with
5943 * host_signal_handler to produce memory ops which can properly report errors
5944 * without racing.
5945 */
5946
5947/* Fault on byte I.  All bits in FFR from I are cleared.  The vector
5948 * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
5949 * option, which leaves subsequent data unchanged.
5950 */
5951static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
5952{
5953    uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
5954
5955    if (i & 63) {
5956        ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
5957        i = ROUND_UP(i, 64);
5958    }
5959    for (; i < oprsz; i += 64) {
5960        ffr[i / 64] = 0;
5961    }
5962}
5963
5964/*
5965 * Common helper for all contiguous no-fault and first-fault loads.
5966 */
5967static inline QEMU_ALWAYS_INLINE
5968void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr,
5969                   uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc,
5970                   const int esz, const int msz, const SVEContFault fault,
5971                   sve_ldst1_host_fn *host_fn,
5972                   sve_ldst1_tlb_fn *tlb_fn)
5973{
5974    const unsigned rd = simd_data(desc);
5975    void *vd = &env->vfp.zregs[rd];
5976    const intptr_t reg_max = simd_oprsz(desc);
5977    intptr_t reg_off, mem_off, reg_last;
5978    SVEContLdSt info;
5979    int flags;
5980    void *host;
5981
5982    /* Find the active elements.  */
5983    if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) {
5984        /* The entire predicate was false; no load occurs.  */
5985        memset(vd, 0, reg_max);
5986        return;
5987    }
5988    reg_off = info.reg_off_first[0];
5989
5990    /* Probe the page(s). */
5991    if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) {
5992        /* Fault on first element. */
5993        tcg_debug_assert(fault == FAULT_NO);
5994        memset(vd, 0, reg_max);
5995        goto do_fault;
5996    }
5997
5998    mem_off = info.mem_off_first[0];
5999    flags = info.page[0].flags;
6000
6001    /*
6002     * Disable MTE checking if the Tagged bit is not set.  Since TBI must
6003     * be set within MTEDESC for MTE, !mtedesc => !mte_active.
6004     */
6005    if (!info.page[0].tagged) {
6006        mtedesc = 0;
6007    }
6008
6009    if (fault == FAULT_FIRST) {
6010        /* Trapping mte check for the first-fault element.  */
6011        if (mtedesc) {
6012            mte_check(env, mtedesc, addr + mem_off, retaddr);
6013        }
6014
6015        /*
6016         * Special handling of the first active element,
6017         * if it crosses a page boundary or is MMIO.
6018         */
6019        bool is_split = mem_off == info.mem_off_split;
6020        if (unlikely(flags != 0) || unlikely(is_split)) {
6021            /*
6022             * Use the slow path for cross-page handling.
6023             * Might trap for MMIO or watchpoints.
6024             */
6025            tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
6026
6027            /* After any fault, zero the other elements. */
6028            swap_memzero(vd, reg_off);
6029            reg_off += 1 << esz;
6030            mem_off += 1 << msz;
6031            swap_memzero(vd + reg_off, reg_max - reg_off);
6032
6033            if (is_split) {
6034                goto second_page;
6035            }
6036        } else {
6037            memset(vd, 0, reg_max);
6038        }
6039    } else {
6040        memset(vd, 0, reg_max);
6041        if (unlikely(mem_off == info.mem_off_split)) {
6042            /* The first active element crosses a page boundary. */
6043            flags |= info.page[1].flags;
6044            if (unlikely(flags & TLB_MMIO)) {
6045                /* Some page is MMIO, see below. */
6046                goto do_fault;
6047            }
6048            if (unlikely(flags & TLB_WATCHPOINT) &&
6049                (cpu_watchpoint_address_matches
6050                 (env_cpu(env), addr + mem_off, 1 << msz)
6051                 & BP_MEM_READ)) {
6052                /* Watchpoint hit, see below. */
6053                goto do_fault;
6054            }
6055            if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
6056                goto do_fault;
6057            }
6058            /*
6059             * Use the slow path for cross-page handling.
6060             * This is RAM, without a watchpoint, and will not trap.
6061             */
6062            tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
6063            goto second_page;
6064        }
6065    }
6066
6067    /*
6068     * From this point on, all memory operations are MemSingleNF.
6069     *
6070     * Per the MemSingleNF pseudocode, a no-fault load from Device memory
6071     * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead.
6072     *
6073     * Unfortunately we do not have access to the memory attributes from the
6074     * PTE to tell Device memory from Normal memory.  So we make a mostly
6075     * correct check, and indicate (UNKNOWN, FAULT) for any MMIO.
6076     * This gives the right answer for the common cases of "Normal memory,
6077     * backed by host RAM" and "Device memory, backed by MMIO".
6078     * The architecture allows us to suppress an NF load and return
6079     * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner
6080     * case of "Normal memory, backed by MMIO" is permitted.  The case we
6081     * get wrong is "Device memory, backed by host RAM", for which we
6082     * should return (UNKNOWN, FAULT) but do not.
6083     *
6084     * Similarly, CPU_BP breakpoints would raise exceptions, and so
6085     * return (UNKNOWN, FAULT).  For simplicity, we consider gdb and
6086     * architectural breakpoints the same.
6087     */
6088    if (unlikely(flags & TLB_MMIO)) {
6089        goto do_fault;
6090    }
6091
6092    reg_last = info.reg_off_last[0];
6093    host = info.page[0].host;
6094
6095    do {
6096        uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3));
6097        do {
6098            if ((pg >> (reg_off & 63)) & 1) {
6099                if (unlikely(flags & TLB_WATCHPOINT) &&
6100                    (cpu_watchpoint_address_matches
6101                     (env_cpu(env), addr + mem_off, 1 << msz)
6102                     & BP_MEM_READ)) {
6103                    goto do_fault;
6104                }
6105                if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
6106                    goto do_fault;
6107                }
6108                host_fn(vd, reg_off, host + mem_off);
6109            }
6110            reg_off += 1 << esz;
6111            mem_off += 1 << msz;
6112        } while (reg_off <= reg_last && (reg_off & 63));
6113    } while (reg_off <= reg_last);
6114
6115    /*
6116     * MemSingleNF is allowed to fail for any reason.  We have special
6117     * code above to handle the first element crossing a page boundary.
6118     * As an implementation choice, decline to handle a cross-page element
6119     * in any other position.
6120     */
6121    reg_off = info.reg_off_split;
6122    if (reg_off >= 0) {
6123        goto do_fault;
6124    }
6125
6126 second_page:
6127    reg_off = info.reg_off_first[1];
6128    if (likely(reg_off < 0)) {
6129        /* No active elements on the second page.  All done. */
6130        return;
6131    }
6132
6133    /*
6134     * MemSingleNF is allowed to fail for any reason.  As an implementation
6135     * choice, decline to handle elements on the second page.  This should
6136     * be low frequency as the guest walks through memory -- the next
6137     * iteration of the guest's loop should be aligned on the page boundary,
6138     * and then all following iterations will stay aligned.
6139     */
6140
6141 do_fault:
6142    record_fault(env, reg_off, reg_max);
6143}
6144
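/* As sve_ldN_r_mte, but wrapping the first-fault/no-fault loader above. */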
6145static inline QEMU_ALWAYS_INLINE
6146void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr,
6147                       uint32_t desc, const uintptr_t retaddr,
6148                       const int esz, const int msz, const SVEContFault fault,
6149                       sve_ldst1_host_fn *host_fn,
6150                       sve_ldst1_tlb_fn *tlb_fn)
6151{
6152    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6153    int bit55 = extract64(addr, 55, 1);
6154
6155    /* Remove mtedesc from the normal sve descriptor. */
6156    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6157
6158    /* Perform gross MTE suppression early. */
6159    if (!tbi_check(desc, bit55) ||
6160        tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
6161        mtedesc = 0;
6162    }
6163
6164    sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc,
6165                  esz, msz, fault, host_fn, tlb_fn);
6166}
6167
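/*
 * Instantiate the LDFF1 (first-fault) and LDNF1 (no-fault) helpers,
 * following the same pattern as DO_LD1_1/DO_LD1_2 above.
 */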
6168#define DO_LDFF1_LDNF1_1(PART, ESZ)                                     \
6169void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg,            \
6170                                 target_ulong addr, uint32_t desc)      \
6171{                                                                       \
6172    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \
6173                  sve_ld1##PART##_host, sve_ld1##PART##_tlb);           \
6174}                                                                       \
6175void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg,            \
6176                                 target_ulong addr, uint32_t desc)      \
6177{                                                                       \
6178    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \
6179                  sve_ld1##PART##_host, sve_ld1##PART##_tlb);           \
6180}                                                                       \
6181void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg,        \
6182                                     target_ulong addr, uint32_t desc)  \
6183{                                                                       \
6184    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \
6185                      sve_ld1##PART##_host, sve_ld1##PART##_tlb);       \
6186}                                                                       \
6187void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg,        \
6188                                     target_ulong addr, uint32_t desc)  \
6189{                                                                       \
6190    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \
6191                      sve_ld1##PART##_host, sve_ld1##PART##_tlb);       \
6192}
6193
6194#define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ)                                \
6195void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg,         \
6196                                    target_ulong addr, uint32_t desc)   \
6197{                                                                       \
6198    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
6199                  sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb);     \
6200}                                                                       \
6201void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg,         \
6202                                    target_ulong addr, uint32_t desc)   \
6203{                                                                       \
6204    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO,  \
6205                  sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb);     \
6206}                                                                       \
6207void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg,         \
6208                                    target_ulong addr, uint32_t desc)   \
6209{                                                                       \
6210    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
6211                  sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb);     \
6212}                                                                       \
6213void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg,         \
6214                                    target_ulong addr, uint32_t desc)   \
6215{                                                                       \
6216    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO,  \
6217                  sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb);     \
6218}                                                                       \
6219void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg,     \
6220                                        target_ulong addr, uint32_t desc) \
6221{                                                                       \
6222    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
6223                      sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6224}                                                                       \
6225void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg,     \
6226                                        target_ulong addr, uint32_t desc) \
6227{                                                                       \
6228    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
6229                      sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6230}                                                                       \
6231void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg,     \
6232                                        target_ulong addr, uint32_t desc) \
6233{                                                                       \
6234    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
6235                      sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6236}                                                                       \
6237void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg,     \
6238                                        target_ulong addr, uint32_t desc) \
6239{                                                                       \
6240    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
6241                      sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6242}
6243
6244DO_LDFF1_LDNF1_1(bb,  MO_8)
6245DO_LDFF1_LDNF1_1(bhu, MO_16)
6246DO_LDFF1_LDNF1_1(bhs, MO_16)
6247DO_LDFF1_LDNF1_1(bsu, MO_32)
6248DO_LDFF1_LDNF1_1(bss, MO_32)
6249DO_LDFF1_LDNF1_1(bdu, MO_64)
6250DO_LDFF1_LDNF1_1(bds, MO_64)
6251
6252DO_LDFF1_LDNF1_2(hh,  MO_16, MO_16)
6253DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16)
6254DO_LDFF1_LDNF1_2(hss, MO_32, MO_16)
6255DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16)
6256DO_LDFF1_LDNF1_2(hds, MO_64, MO_16)
6257
6258DO_LDFF1_LDNF1_2(ss,  MO_32, MO_32)
6259DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32)
6260DO_LDFF1_LDNF1_2(sds, MO_64, MO_32)
6261
6262DO_LDFF1_LDNF1_2(dd,  MO_64, MO_64)
6263
6264#undef DO_LDFF1_LDNF1_1
6265#undef DO_LDFF1_LDNF1_2
6266
6267/*
6268 * Common helper for all contiguous 1,2,3,4-register predicated stores.
6269 */
6270
6271static inline QEMU_ALWAYS_INLINE
6272void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr,
6273               uint32_t desc, const uintptr_t retaddr,
6274               const int esz, const int msz, const int N, uint32_t mtedesc,
6275               sve_ldst1_host_fn *host_fn,
6276               sve_ldst1_tlb_fn *tlb_fn)
6277{
6278    const unsigned rd = simd_data(desc);
6279    const intptr_t reg_max = simd_oprsz(desc);
6280    intptr_t reg_off, reg_last, mem_off;
6281    SVEContLdSt info;
6282    void *host;
6283    int i, flags;
6284
6285    /* Find the active elements.  */
6286    if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
6287        /* The entire predicate was false; no store occurs.  */
6288        return;
6289    }
6290
6291    /* Probe the page(s).  Exit with exception for any invalid page. */
6292    sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr);
6293
6294    /* Handle watchpoints for all active elements. */
6295    sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
6296                              BP_MEM_WRITE, retaddr);
6297
6298    /*
6299     * Handle mte checks for all active elements.
6300     * Since TBI must be set for MTE, !mtedesc => !mte_active.
6301     */
6302    if (mtedesc) {
6303        sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
6304                                mtedesc, retaddr);
6305    }
6306
6307    flags = info.page[0].flags | info.page[1].flags;
6308    if (unlikely(flags != 0)) {
6309#ifdef CONFIG_USER_ONLY
6310        g_assert_not_reached();
6311#else
6312        /*
6313         * At least one page includes MMIO.
6314         * Any bus operation can fail with cpu_transaction_failed,
6315         * which for ARM will raise SyncExternal.  We cannot avoid
6316         * this fault and will leave with the store incomplete.
6317         */
6318        mem_off = info.mem_off_first[0];
6319        reg_off = info.reg_off_first[0];
6320        reg_last = info.reg_off_last[1];
6321        if (reg_last < 0) {
6322            reg_last = info.reg_off_split;
6323            if (reg_last < 0) {
6324                reg_last = info.reg_off_last[0];
6325            }
6326        }
6327
6328        do {
6329            uint64_t pg = vg[reg_off >> 6];
6330            do {
6331                if ((pg >> (reg_off & 63)) & 1) {
6332                    for (i = 0; i < N; ++i) {
6333                        tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6334                               addr + mem_off + (i << msz), retaddr);
6335                    }
6336                }
6337                reg_off += 1 << esz;
6338                mem_off += N << msz;
6339            } while (reg_off & 63);
6340        } while (reg_off <= reg_last);
6341        return;
6342#endif
6343    }
6344
6345    mem_off = info.mem_off_first[0];
6346    reg_off = info.reg_off_first[0];
6347    reg_last = info.reg_off_last[0];
6348    host = info.page[0].host;
6349
6350    while (reg_off <= reg_last) {
6351        uint64_t pg = vg[reg_off >> 6];
6352        do {
6353            if ((pg >> (reg_off & 63)) & 1) {
6354                for (i = 0; i < N; ++i) {
6355                    host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6356                            host + mem_off + (i << msz));
6357                }
6358            }
6359            reg_off += 1 << esz;
6360            mem_off += N << msz;
6361        } while (reg_off <= reg_last && (reg_off & 63));
6362    }
6363
6364    /*
6365     * Use the slow path to manage the cross-page misalignment.
6366     * But we know this is RAM and cannot trap.
6367     */
6368    mem_off = info.mem_off_split;
6369    if (unlikely(mem_off >= 0)) {
6370        reg_off = info.reg_off_split;
6371        for (i = 0; i < N; ++i) {
6372            tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6373                   addr + mem_off + (i << msz), retaddr);
6374        }
6375    }
6376
6377    mem_off = info.mem_off_first[1];
6378    if (unlikely(mem_off >= 0)) {
6379        reg_off = info.reg_off_first[1];
6380        reg_last = info.reg_off_last[1];
6381        host = info.page[1].host;
6382
6383        do {
6384            uint64_t pg = vg[reg_off >> 6];
6385            do {
6386                if ((pg >> (reg_off & 63)) & 1) {
6387                    for (i = 0; i < N; ++i) {
6388                        host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6389                                host + mem_off + (i << msz));
6390                    }
6391                }
6392                reg_off += 1 << esz;
6393                mem_off += N << msz;
6394            } while (reg_off & 63);
6395        } while (reg_off <= reg_last);
6396    }
6397}
6398
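/* As sve_ldN_r_mte, but wrapping the contiguous store helper above. */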
6399static inline QEMU_ALWAYS_INLINE
6400void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
6401                   uint32_t desc, const uintptr_t ra,
6402                   const int esz, const int msz, const int N,
6403                   sve_ldst1_host_fn *host_fn,
6404                   sve_ldst1_tlb_fn *tlb_fn)
6405{
6406    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6407    int bit55 = extract64(addr, 55, 1);
6408
6409    /* Remove mtedesc from the normal sve descriptor. */
6410    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6411
6412    /* Perform gross MTE suppression early. */
6413    if (!tbi_check(desc, bit55) ||
6414        tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
6415        mtedesc = 0;
6416    }
6417
6418    sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
6419}
6420
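/*
 * Instantiate the ST1..ST4 helpers.  As for the loads, DO_STN_1 covers the
 * byte-in-memory forms and DO_STN_2 the endian-specific wider forms, each
 * with MTE variants.
 */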
6421#define DO_STN_1(N, NAME, ESZ)                                          \
6422void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg,            \
6423                                 target_ulong addr, uint32_t desc)      \
6424{                                                                       \
6425    sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0,            \
6426              sve_st1##NAME##_host, sve_st1##NAME##_tlb);               \
6427}                                                                       \
6428void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg,        \
6429                                     target_ulong addr, uint32_t desc)  \
6430{                                                                       \
6431    sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N,           \
6432                  sve_st1##NAME##_host, sve_st1##NAME##_tlb);           \
6433}
6434
6435#define DO_STN_2(N, NAME, ESZ, MSZ)                                     \
6436void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg,         \
6437                                    target_ulong addr, uint32_t desc)   \
6438{                                                                       \
6439    sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0,             \
6440              sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb);         \
6441}                                                                       \
6442void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg,         \
6443                                    target_ulong addr, uint32_t desc)   \
6444{                                                                       \
6445    sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0,             \
6446              sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb);         \
6447}                                                                       \
6448void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg,     \
6449                                        target_ulong addr, uint32_t desc) \
6450{                                                                       \
6451    sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N,            \
6452                  sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb);     \
6453}                                                                       \
6454void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg,     \
6455                                        target_ulong addr, uint32_t desc) \
6456{                                                                       \
6457    sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N,            \
6458                  sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb);     \
6459}
6460
6461DO_STN_1(1, bb, MO_8)
6462DO_STN_1(1, bh, MO_16)
6463DO_STN_1(1, bs, MO_32)
6464DO_STN_1(1, bd, MO_64)
6465DO_STN_1(2, bb, MO_8)
6466DO_STN_1(3, bb, MO_8)
6467DO_STN_1(4, bb, MO_8)
6468
6469DO_STN_2(1, hh, MO_16, MO_16)
6470DO_STN_2(1, hs, MO_32, MO_16)
6471DO_STN_2(1, hd, MO_64, MO_16)
6472DO_STN_2(2, hh, MO_16, MO_16)
6473DO_STN_2(3, hh, MO_16, MO_16)
6474DO_STN_2(4, hh, MO_16, MO_16)
6475
6476DO_STN_2(1, ss, MO_32, MO_32)
6477DO_STN_2(1, sd, MO_64, MO_32)
6478DO_STN_2(2, ss, MO_32, MO_32)
6479DO_STN_2(3, ss, MO_32, MO_32)
6480DO_STN_2(4, ss, MO_32, MO_32)
6481
6482DO_STN_2(1, dd, MO_64, MO_64)
6483DO_STN_2(2, dd, MO_64, MO_64)
6484DO_STN_2(3, dd, MO_64, MO_64)
6485DO_STN_2(4, dd, MO_64, MO_64)
6486
6487#undef DO_STN_1
6488#undef DO_STN_2
6489
6490/*
6491 * Loads with a vector index.
6492 */
6493
6494/*
6495 * Load the element at @reg + @reg_ofs, sign or zero-extend as needed.
6496 */
6497typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs);
6498
6499static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs)
6500{
6501    return *(uint32_t *)(reg + H1_4(reg_ofs));
6502}
6503
6504static target_ulong off_zss_s(void *reg, intptr_t reg_ofs)
6505{
6506    return *(int32_t *)(reg + H1_4(reg_ofs));
6507}
6508
6509static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs)
6510{
6511    return (uint32_t)*(uint64_t *)(reg + reg_ofs);
6512}
6513
6514static target_ulong off_zss_d(void *reg, intptr_t reg_ofs)
6515{
6516    return (int32_t)*(uint64_t *)(reg + reg_ofs);
6517}
6518
6519static target_ulong off_zd_d(void *reg, intptr_t reg_ofs)
6520{
6521    return *(uint64_t *)(reg + reg_ofs);
6522}
6523
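/*
 * Common helper for all gather loads.  Each active element forms its own
 * address as base + (offset << scale).  Faults are allowed, so all loads
 * go into a scratch register that is copied to VD only at the end; a fault
 * therefore leaves the destination unmodified.
 */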
6524static inline QEMU_ALWAYS_INLINE
6525void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6526               target_ulong base, uint32_t desc, uintptr_t retaddr,
6527               uint32_t mtedesc, int esize, int msize,
6528               zreg_off_fn *off_fn,
6529               sve_ldst1_host_fn *host_fn,
6530               sve_ldst1_tlb_fn *tlb_fn)
6531{
6532    const int mmu_idx = cpu_mmu_index(env, false);
6533    const intptr_t reg_max = simd_oprsz(desc);
6534    const int scale = simd_data(desc);
6535    ARMVectorReg scratch;
6536    intptr_t reg_off;
6537    SVEHostPage info, info2;
6538
6539    memset(&scratch, 0, reg_max);
6540    reg_off = 0;
6541    do {
6542        uint64_t pg = vg[reg_off >> 6];
6543        do {
6544            if (likely(pg & 1)) {
6545                target_ulong addr = base + (off_fn(vm, reg_off) << scale);
6546                target_ulong in_page = -(addr | TARGET_PAGE_MASK);
6547
6548                sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD,
6549                               mmu_idx, retaddr);
6550
6551                if (likely(in_page >= msize)) {
6552                    if (unlikely(info.flags & TLB_WATCHPOINT)) {
6553                        cpu_check_watchpoint(env_cpu(env), addr, msize,
6554                                             info.attrs, BP_MEM_READ, retaddr);
6555                    }
6556                    if (mtedesc && info.tagged) {
6557                        mte_check(env, mtedesc, addr, retaddr);
6558                    }
6559                    if (unlikely(info.flags & TLB_MMIO)) {
6560                        tlb_fn(env, &scratch, reg_off, addr, retaddr);
6561                    } else {
6562                        host_fn(&scratch, reg_off, info.host);
6563                    }
6564                } else {
6565                    /* Element crosses the page boundary. */
6566                    sve_probe_page(&info2, false, env, addr + in_page, 0,
6567                                   MMU_DATA_LOAD, mmu_idx, retaddr);
6568                    if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) {
6569                        cpu_check_watchpoint(env_cpu(env), addr,
6570                                             msize, info.attrs,
6571                                             BP_MEM_READ, retaddr);
6572                    }
6573                    if (mtedesc && info.tagged) {
6574                        mte_check(env, mtedesc, addr, retaddr);
6575                    }
6576                    tlb_fn(env, &scratch, reg_off, addr, retaddr);
6577                }
6578            }
6579            reg_off += esize;
6580            pg >>= esize;
6581        } while (reg_off & 63);
6582    } while (reg_off < reg_max);
6583
6584    /* Wait until all exceptions have been raised to write back.  */
6585    memcpy(vd, &scratch, reg_max);
6586}
6587
6588static inline QEMU_ALWAYS_INLINE
6589void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6590                   target_ulong base, uint32_t desc, uintptr_t retaddr,
6591                   int esize, int msize, zreg_off_fn *off_fn,
6592                   sve_ldst1_host_fn *host_fn,
6593                   sve_ldst1_tlb_fn *tlb_fn)
6594{
6595    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6596    /* Remove mtedesc from the normal sve descriptor. */
6597    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6598
6599    /*
6600     * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
6601     * offset base entirely over the address space hole to change the
6602     * pointer tag, or change the bit55 selector.  So we could examine
6603     * TBI + TCMA here, like we do for sve_ldN_r_mte().
6604     */
6605    sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
6606              esize, msize, off_fn, host_fn, tlb_fn);
6607}
6608
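/*
 * Instantiate the gather-load helpers.  The _S forms use 32-bit vector
 * elements with 32-bit offsets; the _D forms use 64-bit elements with
 * offsets taken from 64-bit lanes (zero-extended, sign-extended, or the
 * full 64 bits).
 */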
6609#define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \
6610void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,       \
6611                                 void *vm, target_ulong base, uint32_t desc) \
6612{                                                                            \
6613    sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ,          \
6614              off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);       \
6615}                                                                            \
6616void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
6617     void *vm, target_ulong base, uint32_t desc)                             \
6618{                                                                            \
6619    sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ,         \
6620                  off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);   \
6621}
6622
6623#define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \
6624void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,       \
6625                                 void *vm, target_ulong base, uint32_t desc) \
6626{                                                                            \
6627    sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ,          \
6628              off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);       \
6629}                                                                            \
6630void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
6631    void *vm, target_ulong base, uint32_t desc)                              \
6632{                                                                            \
6633    sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ,         \
6634                  off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);   \
6635}
6636
6637DO_LD1_ZPZ_S(bsu, zsu, MO_8)
6638DO_LD1_ZPZ_S(bsu, zss, MO_8)
6639DO_LD1_ZPZ_D(bdu, zsu, MO_8)
6640DO_LD1_ZPZ_D(bdu, zss, MO_8)
6641DO_LD1_ZPZ_D(bdu, zd, MO_8)
6642
6643DO_LD1_ZPZ_S(bss, zsu, MO_8)
6644DO_LD1_ZPZ_S(bss, zss, MO_8)
6645DO_LD1_ZPZ_D(bds, zsu, MO_8)
6646DO_LD1_ZPZ_D(bds, zss, MO_8)
6647DO_LD1_ZPZ_D(bds, zd, MO_8)
6648
6649DO_LD1_ZPZ_S(hsu_le, zsu, MO_16)
6650DO_LD1_ZPZ_S(hsu_le, zss, MO_16)
6651DO_LD1_ZPZ_D(hdu_le, zsu, MO_16)
6652DO_LD1_ZPZ_D(hdu_le, zss, MO_16)
6653DO_LD1_ZPZ_D(hdu_le, zd, MO_16)
6654
6655DO_LD1_ZPZ_S(hsu_be, zsu, MO_16)
6656DO_LD1_ZPZ_S(hsu_be, zss, MO_16)
6657DO_LD1_ZPZ_D(hdu_be, zsu, MO_16)
6658DO_LD1_ZPZ_D(hdu_be, zss, MO_16)
6659DO_LD1_ZPZ_D(hdu_be, zd, MO_16)
6660
6661DO_LD1_ZPZ_S(hss_le, zsu, MO_16)
6662DO_LD1_ZPZ_S(hss_le, zss, MO_16)
6663DO_LD1_ZPZ_D(hds_le, zsu, MO_16)
6664DO_LD1_ZPZ_D(hds_le, zss, MO_16)
6665DO_LD1_ZPZ_D(hds_le, zd, MO_16)
6666
6667DO_LD1_ZPZ_S(hss_be, zsu, MO_16)
6668DO_LD1_ZPZ_S(hss_be, zss, MO_16)
6669DO_LD1_ZPZ_D(hds_be, zsu, MO_16)
6670DO_LD1_ZPZ_D(hds_be, zss, MO_16)
6671DO_LD1_ZPZ_D(hds_be, zd, MO_16)
6672
6673DO_LD1_ZPZ_S(ss_le, zsu, MO_32)
6674DO_LD1_ZPZ_S(ss_le, zss, MO_32)
6675DO_LD1_ZPZ_D(sdu_le, zsu, MO_32)
6676DO_LD1_ZPZ_D(sdu_le, zss, MO_32)
6677DO_LD1_ZPZ_D(sdu_le, zd, MO_32)
6678
6679DO_LD1_ZPZ_S(ss_be, zsu, MO_32)
6680DO_LD1_ZPZ_S(ss_be, zss, MO_32)
6681DO_LD1_ZPZ_D(sdu_be, zsu, MO_32)
6682DO_LD1_ZPZ_D(sdu_be, zss, MO_32)
6683DO_LD1_ZPZ_D(sdu_be, zd, MO_32)
6684
6685DO_LD1_ZPZ_D(sds_le, zsu, MO_32)
6686DO_LD1_ZPZ_D(sds_le, zss, MO_32)
6687DO_LD1_ZPZ_D(sds_le, zd, MO_32)
6688
6689DO_LD1_ZPZ_D(sds_be, zsu, MO_32)
6690DO_LD1_ZPZ_D(sds_be, zss, MO_32)
6691DO_LD1_ZPZ_D(sds_be, zd, MO_32)
6692
6693DO_LD1_ZPZ_D(dd_le, zsu, MO_64)
6694DO_LD1_ZPZ_D(dd_le, zss, MO_64)
6695DO_LD1_ZPZ_D(dd_le, zd, MO_64)
6696
6697DO_LD1_ZPZ_D(dd_be, zsu, MO_64)
6698DO_LD1_ZPZ_D(dd_be, zss, MO_64)
6699DO_LD1_ZPZ_D(dd_be, zd, MO_64)
6700
6701#undef DO_LD1_ZPZ_S
6702#undef DO_LD1_ZPZ_D
6703
6704/* First fault loads with a vector index.  */
6705
6706/*
6707 * Common helpers for all gather first-faulting loads.
6708 */
6709
6710static inline QEMU_ALWAYS_INLINE
6711void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6712                 target_ulong base, uint32_t desc, uintptr_t retaddr,
6713                 uint32_t mtedesc, const int esz, const int msz,
6714                 zreg_off_fn *off_fn,
6715                 sve_ldst1_host_fn *host_fn,
6716                 sve_ldst1_tlb_fn *tlb_fn)
6717{
6718    const int mmu_idx = cpu_mmu_index(env, false);
6719    const intptr_t reg_max = simd_oprsz(desc);
6720    const int scale = simd_data(desc);
6721    const int esize = 1 << esz;
6722    const int msize = 1 << msz;
6723    intptr_t reg_off;
6724    SVEHostPage info;
6725    target_ulong addr, in_page;
6726    ARMVectorReg scratch;
6727
6728    /* Skip to the first true predicate.  */
6729    reg_off = find_next_active(vg, 0, reg_max, esz);
6730    if (unlikely(reg_off >= reg_max)) {
6731        /* The entire predicate was false; no load occurs.  */
6732        memset(vd, 0, reg_max);
6733        return;
6734    }
6735
6736    /* Protect against overlap between vd and vm. */
6737    if (unlikely(vd == vm)) {
6738        vm = memcpy(&scratch, vm, reg_max);
6739    }
6740
6741    /*
6742     * Probe the first element, allowing faults.
6743     */
6744    addr = base + (off_fn(vm, reg_off) << scale);
6745    if (mtedesc) {
6746        mte_check(env, mtedesc, addr, retaddr);
6747    }
6748    tlb_fn(env, vd, reg_off, addr, retaddr);
6749
6750    /* After any fault, zero the other elements. */
6751    swap_memzero(vd, reg_off);
6752    reg_off += esize;
6753    swap_memzero(vd + reg_off, reg_max - reg_off);
6754
6755    /*
6756     * Probe the remaining elements, not allowing faults.
6757     */
6758    while (reg_off < reg_max) {
6759        uint64_t pg = vg[reg_off >> 6];
6760        do {
6761            if (likely((pg >> (reg_off & 63)) & 1)) {
6762                addr = base + (off_fn(vm, reg_off) << scale);
6763                in_page = -(addr | TARGET_PAGE_MASK);
6764
6765                if (unlikely(in_page < msize)) {
6766                    /* Stop if the element crosses a page boundary. */
6767                    goto fault;
6768                }
6769
6770                sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD,
6771                               mmu_idx, retaddr);
6772                if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) {
6773                    goto fault;
6774                }
6775                if (unlikely(info.flags & TLB_WATCHPOINT) &&
6776                    (cpu_watchpoint_address_matches
6777                     (env_cpu(env), addr, msize) & BP_MEM_READ)) {
6778                    goto fault;
6779                }
6780                if (mtedesc && info.tagged && !mte_probe(env, mtedesc, addr)) {
6781                    goto fault;
6782                }
6783
6784                host_fn(vd, reg_off, info.host);
6785            }
6786            reg_off += esize;
6787        } while (reg_off & 63);
6788    }
6789    return;
6790
6791 fault:
6792    record_fault(env, reg_off, reg_max);
6793}
6794
6795static inline QEMU_ALWAYS_INLINE
6796void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6797                     target_ulong base, uint32_t desc, uintptr_t retaddr,
6798                     const int esz, const int msz,
6799                     zreg_off_fn *off_fn,
6800                     sve_ldst1_host_fn *host_fn,
6801                     sve_ldst1_tlb_fn *tlb_fn)
6802{
6803    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6804    /* Remove mtedesc from the normal sve descriptor. */
6805    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6806
6807    /*
6808     * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
6809     * offset base entirely over the address space hole to change the
6810     * pointer tag, or change the bit55 selector.  So we could examine
6811     * TBI + TCMA here, like we do for sve_ldN_r_mte().
6812     */
6813    sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
6814                esz, msz, off_fn, host_fn, tlb_fn);
6815}
6816
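/*
 * Instantiate the first-fault gather-load helpers, mirroring the
 * DO_LD1_ZPZ_S/D definitions above.
 */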
6817#define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ)                                   \
6818void HELPER(sve_ldff##MEM##_##OFS)                                      \
6819    (CPUARMState *env, void *vd, void *vg,                              \
6820     void *vm, target_ulong base, uint32_t desc)                        \
6821{                                                                       \
6822    sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ,    \
6823                off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6824}                                                                       \
6825void HELPER(sve_ldff##MEM##_##OFS##_mte)                                \
6826    (CPUARMState *env, void *vd, void *vg,                              \
6827     void *vm, target_ulong base, uint32_t desc)                        \
6828{                                                                       \
6829    sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ,   \
6830                    off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6831}
6832
6833#define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ)                                   \
6834void HELPER(sve_ldff##MEM##_##OFS)                                      \
6835    (CPUARMState *env, void *vd, void *vg,                              \
6836     void *vm, target_ulong base, uint32_t desc)                        \
6837{                                                                       \
6838    sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ,    \
6839                off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6840}                                                                       \
6841void HELPER(sve_ldff##MEM##_##OFS##_mte)                                \
6842    (CPUARMState *env, void *vd, void *vg,                              \
6843     void *vm, target_ulong base, uint32_t desc)                        \
6844{                                                                       \
6845    sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ,   \
6846                    off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6847}
6848
6849DO_LDFF1_ZPZ_S(bsu, zsu, MO_8)
6850DO_LDFF1_ZPZ_S(bsu, zss, MO_8)
6851DO_LDFF1_ZPZ_D(bdu, zsu, MO_8)
6852DO_LDFF1_ZPZ_D(bdu, zss, MO_8)
6853DO_LDFF1_ZPZ_D(bdu, zd, MO_8)
6854
6855DO_LDFF1_ZPZ_S(bss, zsu, MO_8)
6856DO_LDFF1_ZPZ_S(bss, zss, MO_8)
6857DO_LDFF1_ZPZ_D(bds, zsu, MO_8)
6858DO_LDFF1_ZPZ_D(bds, zss, MO_8)
6859DO_LDFF1_ZPZ_D(bds, zd, MO_8)
6860
6861DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16)
6862DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16)
6863DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16)
6864DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16)
6865DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16)
6866
6867DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16)
6868DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16)
6869DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16)
6870DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16)
6871DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16)
6872
6873DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16)
6874DO_LDFF1_ZPZ_S(hss_le, zss, MO_16)
6875DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16)
6876DO_LDFF1_ZPZ_D(hds_le, zss, MO_16)
6877DO_LDFF1_ZPZ_D(hds_le, zd, MO_16)
6878
6879DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16)
6880DO_LDFF1_ZPZ_S(hss_be, zss, MO_16)
6881DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16)
6882DO_LDFF1_ZPZ_D(hds_be, zss, MO_16)
6883DO_LDFF1_ZPZ_D(hds_be, zd, MO_16)
6884
6885DO_LDFF1_ZPZ_S(ss_le,  zsu, MO_32)
6886DO_LDFF1_ZPZ_S(ss_le,  zss, MO_32)
6887DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32)
6888DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32)
6889DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32)
6890
6891DO_LDFF1_ZPZ_S(ss_be,  zsu, MO_32)
6892DO_LDFF1_ZPZ_S(ss_be,  zss, MO_32)
6893DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32)
6894DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32)
6895DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32)
6896
6897DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32)
6898DO_LDFF1_ZPZ_D(sds_le, zss, MO_32)
6899DO_LDFF1_ZPZ_D(sds_le, zd, MO_32)
6900
6901DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32)
6902DO_LDFF1_ZPZ_D(sds_be, zss, MO_32)
6903DO_LDFF1_ZPZ_D(sds_be, zd, MO_32)
6904
6905DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64)
6906DO_LDFF1_ZPZ_D(dd_le, zss, MO_64)
6907DO_LDFF1_ZPZ_D(dd_le, zd, MO_64)
6908
6909DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64)
6910DO_LDFF1_ZPZ_D(dd_be, zss, MO_64)
6911DO_LDFF1_ZPZ_D(dd_be, zd, MO_64)
6912
6913/* Stores with a vector index.  */
6914
6915static inline QEMU_ALWAYS_INLINE
6916void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6917               target_ulong base, uint32_t desc, uintptr_t retaddr,
6918               uint32_t mtedesc, int esize, int msize,
6919               zreg_off_fn *off_fn,
6920               sve_ldst1_host_fn *host_fn,
6921               sve_ldst1_tlb_fn *tlb_fn)
6922{
6923    const int mmu_idx = cpu_mmu_index(env, false);
6924    const intptr_t reg_max = simd_oprsz(desc);
6925    const int scale = simd_data(desc);
6926    void *host[ARM_MAX_VQ * 4];
6927    intptr_t reg_off, i;
6928    SVEHostPage info, info2;
6929
6930    /*
6931     * Probe all of the elements for host addresses and flags.
6932     */
6933    i = reg_off = 0;
6934    do {
6935        uint64_t pg = vg[reg_off >> 6];
6936        do {
6937            target_ulong addr = base + (off_fn(vm, reg_off) << scale);
6938            target_ulong in_page = -(addr | TARGET_PAGE_MASK);
6939
6940            host[i] = NULL;
6941            if (likely((pg >> (reg_off & 63)) & 1)) {
6942                if (likely(in_page >= msize)) {
6943                    sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE,
6944                                   mmu_idx, retaddr);
6945                    if (!(info.flags & TLB_MMIO)) {
6946                        host[i] = info.host;
6947                    }
6948                } else {
6949                    /*
6950                     * Element crosses the page boundary.
6951                     * Probe both pages, but do not record the host address,
6952                     * so that we use the slow path.
6953                     */
6954                    sve_probe_page(&info, false, env, addr, 0,
6955                                   MMU_DATA_STORE, mmu_idx, retaddr);
6956                    sve_probe_page(&info2, false, env, addr + in_page, 0,
6957                                   MMU_DATA_STORE, mmu_idx, retaddr);
6958                    info.flags |= info2.flags;
6959                }
6960
6961                if (unlikely(info.flags & TLB_WATCHPOINT)) {
6962                    cpu_check_watchpoint(env_cpu(env), addr, msize,
6963                                         info.attrs, BP_MEM_WRITE, retaddr);
6964                }
6965
6966                if (mtedesc && info.tagged) {
6967                    mte_check(env, mtedesc, addr, retaddr);
6968                }
6969            }
6970            i += 1;
6971            reg_off += esize;
6972        } while (reg_off & 63);
6973    } while (reg_off < reg_max);
6974
6975    /*
6976     * Now that we have recognized all exceptions except SyncExternal
6977     * (from TLB_MMIO), which we cannot avoid, perform all of the stores.
6978     *
6979     * Note for the common case of an element in RAM, not crossing a page
6980     * boundary, we have stored the host address in host[].  This doubles
6981     * as a first-level check against the predicate, since only enabled
6982     * elements have non-null host addresses.
6983     */
6984    i = reg_off = 0;
6985    do {
6986        void *h = host[i];
6987        if (likely(h != NULL)) {
6988            host_fn(vd, reg_off, h);
6989        } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) {
6990            target_ulong addr = base + (off_fn(vm, reg_off) << scale);
6991            tlb_fn(env, vd, reg_off, addr, retaddr);
6992        }
6993        i += 1;
6994        reg_off += esize;
6995    } while (reg_off < reg_max);
6996}
6997
6998static inline QEMU_ALWAYS_INLINE
6999void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
7000                   target_ulong base, uint32_t desc, uintptr_t retaddr,
7001                   int esize, int msize, zreg_off_fn *off_fn,
7002                   sve_ldst1_host_fn *host_fn,
7003                   sve_ldst1_tlb_fn *tlb_fn)
7004{
7005    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7006    /* Remove mtedesc from the normal sve descriptor. */
7007    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7008
7009    /*
7010     * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
7011     * move the address entirely across the address-space hole, so it
7012     * cannot change the pointer tag or the bit-55 select.  In those
7013     * cases we could examine TBI + TCMA here, as we do for sve_ldN_r_mte().
7014     */
7015    sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
7016              esize, msize, off_fn, host_fn, tlb_fn);
7017}
7018
7019#define DO_ST1_ZPZ_S(MEM, OFS, MSZ)                                     \
7020void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,  \
7021                                 void *vm, target_ulong base, uint32_t desc) \
7022{                                                                       \
7023    sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ,     \
7024              off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb);  \
7025}                                                                       \
7026void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
7027    void *vm, target_ulong base, uint32_t desc)                         \
7028{                                                                       \
7029    sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ,    \
7030                  off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7031}
7032
7033#define DO_ST1_ZPZ_D(MEM, OFS, MSZ)                                     \
7034void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,  \
7035                                 void *vm, target_ulong base, uint32_t desc) \
7036{                                                                       \
7037    sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ,     \
7038              off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb);  \
7039}                                                                       \
7040void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
7041    void *vm, target_ulong base, uint32_t desc)                         \
7042{                                                                       \
7043    sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ,    \
7044                  off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7045}
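
/*
 * Unlike the first-fault gather macros above, which pass MemOp values,
 * these store macros pass the element and memory sizes as byte counts.
 * For illustration only, DO_ST1_ZPZ_S(bs, zsu, MO_8) expands to roughly:
 *
 *   void HELPER(sve_stbs_zsu)(CPUARMState *env, void *vd, void *vg,
 *                             void *vm, target_ulong base, uint32_t desc)
 *   {
 *       sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1,
 *                 off_zsu_s, sve_st1bs_host, sve_st1bs_tlb);
 *   }
 */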
7046
7047DO_ST1_ZPZ_S(bs, zsu, MO_8)
7048DO_ST1_ZPZ_S(hs_le, zsu, MO_16)
7049DO_ST1_ZPZ_S(hs_be, zsu, MO_16)
7050DO_ST1_ZPZ_S(ss_le, zsu, MO_32)
7051DO_ST1_ZPZ_S(ss_be, zsu, MO_32)
7052
7053DO_ST1_ZPZ_S(bs, zss, MO_8)
7054DO_ST1_ZPZ_S(hs_le, zss, MO_16)
7055DO_ST1_ZPZ_S(hs_be, zss, MO_16)
7056DO_ST1_ZPZ_S(ss_le, zss, MO_32)
7057DO_ST1_ZPZ_S(ss_be, zss, MO_32)
7058
7059DO_ST1_ZPZ_D(bd, zsu, MO_8)
7060DO_ST1_ZPZ_D(hd_le, zsu, MO_16)
7061DO_ST1_ZPZ_D(hd_be, zsu, MO_16)
7062DO_ST1_ZPZ_D(sd_le, zsu, MO_32)
7063DO_ST1_ZPZ_D(sd_be, zsu, MO_32)
7064DO_ST1_ZPZ_D(dd_le, zsu, MO_64)
7065DO_ST1_ZPZ_D(dd_be, zsu, MO_64)
7066
7067DO_ST1_ZPZ_D(bd, zss, MO_8)
7068DO_ST1_ZPZ_D(hd_le, zss, MO_16)
7069DO_ST1_ZPZ_D(hd_be, zss, MO_16)
7070DO_ST1_ZPZ_D(sd_le, zss, MO_32)
7071DO_ST1_ZPZ_D(sd_be, zss, MO_32)
7072DO_ST1_ZPZ_D(dd_le, zss, MO_64)
7073DO_ST1_ZPZ_D(dd_be, zss, MO_64)
7074
7075DO_ST1_ZPZ_D(bd, zd, MO_8)
7076DO_ST1_ZPZ_D(hd_le, zd, MO_16)
7077DO_ST1_ZPZ_D(hd_be, zd, MO_16)
7078DO_ST1_ZPZ_D(sd_le, zd, MO_32)
7079DO_ST1_ZPZ_D(sd_be, zd, MO_32)
7080DO_ST1_ZPZ_D(dd_le, zd, MO_64)
7081DO_ST1_ZPZ_D(dd_be, zd, MO_64)
7082
7083#undef DO_ST1_ZPZ_S
7084#undef DO_ST1_ZPZ_D
7085
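/*
 * SVE2 unpredicated bitwise ternary operations, one 64-bit word at a time:
 * EOR3 (three-way XOR), BCAX (XOR with bit-clear), BSL1N/BSL2N (bitwise
 * select with the first/second operand inverted) and NBSL (inverted
 * bitwise select), as implemented by the loops below.
 */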
7086void HELPER(sve2_eor3)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7087{
7088    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7089    uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7090
7091    for (i = 0; i < opr_sz; ++i) {
7092        d[i] = n[i] ^ m[i] ^ k[i];
7093    }
7094}
7095
7096void HELPER(sve2_bcax)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7097{
7098    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7099    uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7100
7101    for (i = 0; i < opr_sz; ++i) {
7102        d[i] = n[i] ^ (m[i] & ~k[i]);
7103    }
7104}
7105
7106void HELPER(sve2_bsl1n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7107{
7108    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7109    uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7110
7111    for (i = 0; i < opr_sz; ++i) {
7112        d[i] = (~n[i] & k[i]) | (m[i] & ~k[i]);
7113    }
7114}
7115
7116void HELPER(sve2_bsl2n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7117{
7118    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7119    uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7120
7121    for (i = 0; i < opr_sz; ++i) {
7122        d[i] = (n[i] & k[i]) | (~m[i] & ~k[i]);
7123    }
7124}
7125
7126void HELPER(sve2_nbsl)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7127{
7128    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7129    uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7130
7131    for (i = 0; i < opr_sz; ++i) {
7132        d[i] = ~((n[i] & k[i]) | (m[i] & ~k[i]));
7133    }
7134}
7135
7136/*
7137 * Returns true if m0 or m1 contains the low uint8_t/uint16_t of n, as selected by esz.
7138 * See hasless(v,1) from
7139 *   https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
7140 */
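/*
 * A rough sketch of why this works, taking esz == MO_8 as an example:
 * ones is 0x0101..01 and signs is 0x8080..80.  A byte of cmp0 is zero
 * exactly when the corresponding byte of m0 equals the low byte of n;
 * for such a byte, (cmp0 - ones) wraps it to 0xff (sign bit set) while
 * ~cmp0 also has that sign bit set, so the final AND with signs is
 * nonzero.
 */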
7141static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz)
7142{
7143    int bits = 8 << esz;
7144    uint64_t ones = dup_const(esz, 1);
7145    uint64_t signs = ones << (bits - 1);
7146    uint64_t cmp0, cmp1;
7147
7148    cmp1 = dup_const(esz, n);
7149    cmp0 = cmp1 ^ m0;
7150    cmp1 = cmp1 ^ m1;
7151    cmp0 = (cmp0 - ones) & ~cmp0;
7152    cmp1 = (cmp1 - ones) & ~cmp1;
7153    return (cmp0 | cmp1) & signs;
7154}
7155
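/*
 * Common helper for MATCH and NMATCH.  For each 16-byte segment, every
 * active element of Zn is checked for occurrence anywhere within the
 * corresponding 16-byte segment of Zm (via do_match2); NMATCH inverts
 * the per-element result.  Each segment's results are written to the
 * destination predicate and folded into NZCV with iter_predtest_fwd.
 */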
7156static inline uint32_t do_match(void *vd, void *vn, void *vm, void *vg,
7157                                uint32_t desc, int esz, bool nmatch)
7158{
7159    uint16_t esz_mask = pred_esz_masks[esz];
7160    intptr_t opr_sz = simd_oprsz(desc);
7161    uint32_t flags = PREDTEST_INIT;
7162    intptr_t i, j, k;
7163
7164    for (i = 0; i < opr_sz; i += 16) {
7165        uint64_t m0 = *(uint64_t *)(vm + i);
7166        uint64_t m1 = *(uint64_t *)(vm + i + 8);
7167        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)) & esz_mask;
7168        uint16_t out = 0;
7169
7170        for (j = 0; j < 16; j += 8) {
7171            uint64_t n = *(uint64_t *)(vn + i + j);
7172
7173            for (k = 0; k < 8; k += 1 << esz) {
7174                if (pg & (1 << (j + k))) {
7175                    bool o = do_match2(n >> (k * 8), m0, m1, esz);
7176                    out |= (o ^ nmatch) << (j + k);
7177                }
7178            }
7179        }
7180        *(uint16_t *)(vd + H1_2(i >> 3)) = out;
7181        flags = iter_predtest_fwd(out, pg, flags);
7182    }
7183    return flags;
7184}
7185
7186#define DO_PPZZ_MATCH(NAME, ESZ, INV)                                         \
7187uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
7188{                                                                             \
7189    return do_match(vd, vn, vm, vg, desc, ESZ, INV);                          \
7190}
7191
7192DO_PPZZ_MATCH(sve2_match_ppzz_b, MO_8, false)
7193DO_PPZZ_MATCH(sve2_match_ppzz_h, MO_16, false)
7194
7195DO_PPZZ_MATCH(sve2_nmatch_ppzz_b, MO_8, true)
7196DO_PPZZ_MATCH(sve2_nmatch_ppzz_h, MO_16, true)
7197
7198#undef DO_PPZZ_MATCH
7199
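/*
 * HISTCNT: for each active element i, count the active elements j <= i
 * for which Zm[j] == Zn[i]; inactive destination elements are zeroed.
 * A scratch copy is used when the destination overlaps a source, since
 * earlier source elements are still needed while later ones are written.
 */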
7200void HELPER(sve2_histcnt_s)(void *vd, void *vn, void *vm, void *vg,
7201                            uint32_t desc)
7202{
7203    ARMVectorReg scratch;
7204    intptr_t i, j;
7205    intptr_t opr_sz = simd_oprsz(desc);
7206    uint32_t *d = vd, *n = vn, *m = vm;
7207    uint8_t *pg = vg;
7208
7209    if (d == n) {
7210        n = memcpy(&scratch, n, opr_sz);
7211        if (d == m) {
7212            m = n;
7213        }
7214    } else if (d == m) {
7215        m = memcpy(&scratch, m, opr_sz);
7216    }
7217
7218    for (i = 0; i < opr_sz; i += 4) {
7219        uint64_t count = 0;
7220        uint8_t pred;
7221
7222        pred = pg[H1(i >> 3)] >> (i & 7);
7223        if (pred & 1) {
7224            uint32_t nn = n[H4(i >> 2)];
7225
7226            for (j = 0; j <= i; j += 4) {
7227                pred = pg[H1(j >> 3)] >> (j & 7);
7228                if ((pred & 1) && nn == m[H4(j >> 2)]) {
7229                    ++count;
7230                }
7231            }
7232        }
7233        d[H4(i >> 2)] = count;
7234    }
7235}
7236
7237void HELPER(sve2_histcnt_d)(void *vd, void *vn, void *vm, void *vg,
7238                            uint32_t desc)
7239{
7240    ARMVectorReg scratch;
7241    intptr_t i, j;
7242    intptr_t opr_sz = simd_oprsz(desc);
7243    uint64_t *d = vd, *n = vn, *m = vm;
7244    uint8_t *pg = vg;
7245
7246    if (d == n) {
7247        n = memcpy(&scratch, n, opr_sz);
7248        if (d == m) {
7249            m = n;
7250        }
7251    } else if (d == m) {
7252        m = memcpy(&scratch, m, opr_sz);
7253    }
7254
7255    for (i = 0; i < opr_sz / 8; ++i) {
7256        uint64_t count = 0;
7257        if (pg[H1(i)] & 1) {
7258            uint64_t nn = n[i];
7259            for (j = 0; j <= i; ++j) {
7260                if ((pg[H1(j)] & 1) && nn == m[j]) {
7261                    ++count;
7262                }
7263            }
7264        }
7265        d[i] = count;
7266    }
7267}
7268
7269/*
7270 * Returns the number of bytes in m0 and m1 that match n.
7271 * Unlike do_match2 we don't just need true/false, we need an exact count.
7272 * This requires two extra logical operations.
7273 */
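/*
 * A rough worked example: if n matches, say, three byte positions across
 * m0 and m1 combined, then after the steps below exactly those three
 * bytes of cmp0/cmp1 hold 0x80 and every other byte holds 0x00, so
 * ctpop64(cmp0 | (cmp1 >> 1)) returns 3.
 */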
7274static inline uint64_t do_histseg_cnt(uint8_t n, uint64_t m0, uint64_t m1)
7275{
7276    const uint64_t mask = dup_const(MO_8, 0x7f);
7277    uint64_t cmp0, cmp1;
7278
7279    cmp1 = dup_const(MO_8, n);
7280    cmp0 = cmp1 ^ m0;
7281    cmp1 = cmp1 ^ m1;
7282
7283    /*
7284     * 1: clear msb of each byte to avoid carry to next byte (& mask)
7285     * 2: carry in to msb if byte != 0 (+ mask)
7286     * 3: set msb if cmp has msb set (| cmp)
7287     * 4: set ~msb to ignore them (| mask)
7288     * We now have 0xff for byte != 0 or 0x7f for byte == 0.
7289     * 5: invert, resulting in 0x80 if and only if byte == 0.
7290     */
7291    cmp0 = ~(((cmp0 & mask) + mask) | cmp0 | mask);
7292    cmp1 = ~(((cmp1 & mask) + mask) | cmp1 | mask);
7293
7294    /*
7295     * Combine the two compares in a way that the bits do
7296     * not overlap, and so preserves the count of set bits.
7297     * If the host has an efficient instruction for ctpop,
7298     * then ctpop(x) + ctpop(y) has the same number of
7299     * operations as ctpop(x | (y >> 1)).  If the host does
7300     * not have an efficient ctpop, then we only want to
7301     * use it once.
7302     */
7303    return ctpop64(cmp0 | (cmp1 >> 1));
7304}
7305
7306void HELPER(sve2_histseg)(void *vd, void *vn, void *vm, uint32_t desc)
7307{
7308    intptr_t i, j;
7309    intptr_t opr_sz = simd_oprsz(desc);
7310
7311    for (i = 0; i < opr_sz; i += 16) {
7312        uint64_t n0 = *(uint64_t *)(vn + i);
7313        uint64_t m0 = *(uint64_t *)(vm + i);
7314        uint64_t n1 = *(uint64_t *)(vn + i + 8);
7315        uint64_t m1 = *(uint64_t *)(vm + i + 8);
7316        uint64_t out0 = 0;
7317        uint64_t out1 = 0;
7318
7319        for (j = 0; j < 64; j += 8) {
7320            uint64_t cnt0 = do_histseg_cnt(n0 >> j, m0, m1);
7321            uint64_t cnt1 = do_histseg_cnt(n1 >> j, m0, m1);
7322            out0 |= cnt0 << j;
7323            out1 |= cnt1 << j;
7324        }
7325
7326        *(uint64_t *)(vd + i) = out0;
7327        *(uint64_t *)(vd + i + 8) = out1;
7328    }
7329}
7330
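/*
 * XAR: exclusive OR followed by a rotate right within each element.
 * There is no per-byte or per-halfword rotate across a 64-bit lane, so
 * the _b and _h variants build the rotation from a masked right shift
 * plus a complementary left shift; the _s variant below simply uses ror32.
 */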
7331void HELPER(sve2_xar_b)(void *vd, void *vn, void *vm, uint32_t desc)
7332{
7333    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7334    int shr = simd_data(desc);
7335    int shl = 8 - shr;
7336    uint64_t mask = dup_const(MO_8, 0xff >> shr);
7337    uint64_t *d = vd, *n = vn, *m = vm;
7338
7339    for (i = 0; i < opr_sz; ++i) {
7340        uint64_t t = n[i] ^ m[i];
7341        d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
7342    }
7343}
7344
7345void HELPER(sve2_xar_h)(void *vd, void *vn, void *vm, uint32_t desc)
7346{
7347    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7348    int shr = simd_data(desc);
7349    int shl = 16 - shr;
7350    uint64_t mask = dup_const(MO_16, 0xffff >> shr);
7351    uint64_t *d = vd, *n = vn, *m = vm;
7352
7353    for (i = 0; i < opr_sz; ++i) {
7354        uint64_t t = n[i] ^ m[i];
7355        d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
7356    }
7357}
7358
7359void HELPER(sve2_xar_s)(void *vd, void *vn, void *vm, uint32_t desc)
7360{
7361    intptr_t i, opr_sz = simd_oprsz(desc) / 4;
7362    int shr = simd_data(desc);
7363    uint32_t *d = vd, *n = vn, *m = vm;
7364
7365    for (i = 0; i < opr_sz; ++i) {
7366        d[i] = ror32(n[i] ^ m[i], shr);
7367    }
7368}
7369
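/*
 * FMMLA treats each group of four elements as a 2x2 row-major matrix and
 * accumulates the product N * M^T into A; that is, for i,j in {0,1}:
 *
 *   d[2*i + j] = a[2*i + j] + n[2*i] * m[2*j] + n[2*i + 1] * m[2*j + 1]
 *
 * which is what the unrolled i/j cases below compute.
 */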
7370void HELPER(fmmla_s)(void *vd, void *vn, void *vm, void *va,
7371                     void *status, uint32_t desc)
7372{
7373    intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float32) * 4);
7374
7375    for (s = 0; s < opr_sz; ++s) {
7376        float32 *n = vn + s * sizeof(float32) * 4;
7377        float32 *m = vm + s * sizeof(float32) * 4;
7378        float32 *a = va + s * sizeof(float32) * 4;
7379        float32 *d = vd + s * sizeof(float32) * 4;
7380        float32 n00 = n[H4(0)], n01 = n[H4(1)];
7381        float32 n10 = n[H4(2)], n11 = n[H4(3)];
7382        float32 m00 = m[H4(0)], m01 = m[H4(1)];
7383        float32 m10 = m[H4(2)], m11 = m[H4(3)];
7384        float32 p0, p1;
7385
7386        /* i = 0, j = 0 */
7387        p0 = float32_mul(n00, m00, status);
7388        p1 = float32_mul(n01, m01, status);
7389        d[H4(0)] = float32_add(a[H4(0)], float32_add(p0, p1, status), status);
7390
7391        /* i = 0, j = 1 */
7392        p0 = float32_mul(n00, m10, status);
7393        p1 = float32_mul(n01, m11, status);
7394        d[H4(1)] = float32_add(a[H4(1)], float32_add(p0, p1, status), status);
7395
7396        /* i = 1, j = 0 */
7397        p0 = float32_mul(n10, m00, status);
7398        p1 = float32_mul(n11, m01, status);
7399        d[H4(2)] = float32_add(a[H4(2)], float32_add(p0, p1, status), status);
7400
7401        /* i = 1, j = 1 */
7402        p0 = float32_mul(n10, m10, status);
7403        p1 = float32_mul(n11, m11, status);
7404        d[H4(3)] = float32_add(a[H4(3)], float32_add(p0, p1, status), status);
7405    }
7406}
7407
7408void HELPER(fmmla_d)(void *vd, void *vn, void *vm, void *va,
7409                     void *status, uint32_t desc)
7410{
7411    intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float64) * 4);
7412
7413    for (s = 0; s < opr_sz; ++s) {
7414        float64 *n = vn + s * sizeof(float64) * 4;
7415        float64 *m = vm + s * sizeof(float64) * 4;
7416        float64 *a = va + s * sizeof(float64) * 4;
7417        float64 *d = vd + s * sizeof(float64) * 4;
7418        float64 n00 = n[0], n01 = n[1], n10 = n[2], n11 = n[3];
7419        float64 m00 = m[0], m01 = m[1], m10 = m[2], m11 = m[3];
7420        float64 p0, p1;
7421
7422        /* i = 0, j = 0 */
7423        p0 = float64_mul(n00, m00, status);
7424        p1 = float64_mul(n01, m01, status);
7425        d[0] = float64_add(a[0], float64_add(p0, p1, status), status);
7426
7427        /* i = 0, j = 1 */
7428        p0 = float64_mul(n00, m10, status);
7429        p1 = float64_mul(n01, m11, status);
7430        d[1] = float64_add(a[1], float64_add(p0, p1, status), status);
7431
7432        /* i = 1, j = 0 */
7433        p0 = float64_mul(n10, m00, status);
7434        p1 = float64_mul(n11, m01, status);
7435        d[2] = float64_add(a[2], float64_add(p0, p1, status), status);
7436
7437        /* i = 1, j = 1 */
7438        p0 = float64_mul(n10, m10, status);
7439        p1 = float64_mul(n11, m11, status);
7440        d[3] = float64_add(a[3], float64_add(p0, p1, status), status);
7441    }
7442}
7443
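/*
 * Narrowing converts (FCVTNT and BFCVTNT) write their results into the
 * odd-numbered (top) narrow elements of the destination, leaving the even
 * elements untouched; the widening converts (FCVTLT) read the odd-numbered
 * narrow source elements and widen them into the full-width destination
 * elements.  Both macros walk the vector backwards, 64 predicate bits at
 * a time.
 */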
7444#define DO_FCVTNT(NAME, TYPEW, TYPEN, HW, HN, OP)                             \
7445void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc)  \
7446{                                                                             \
7447    intptr_t i = simd_oprsz(desc);                                            \
7448    uint64_t *g = vg;                                                         \
7449    do {                                                                      \
7450        uint64_t pg = g[(i - 1) >> 6];                                        \
7451        do {                                                                  \
7452            i -= sizeof(TYPEW);                                               \
7453            if (likely((pg >> (i & 63)) & 1)) {                               \
7454                TYPEW nn = *(TYPEW *)(vn + HW(i));                            \
7455                *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, status);      \
7456            }                                                                 \
7457        } while (i & 63);                                                     \
7458    } while (i != 0);                                                         \
7459}
7460
7461DO_FCVTNT(sve_bfcvtnt,    uint32_t, uint16_t, H1_4, H1_2, float32_to_bfloat16)
7462DO_FCVTNT(sve2_fcvtnt_sh, uint32_t, uint16_t, H1_4, H1_2, sve_f32_to_f16)
7463DO_FCVTNT(sve2_fcvtnt_ds, uint64_t, uint32_t, H1_8, H1_4, float64_to_float32)
7464
7465#define DO_FCVTLT(NAME, TYPEW, TYPEN, HW, HN, OP)                             \
7466void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc)  \
7467{                                                                             \
7468    intptr_t i = simd_oprsz(desc);                                            \
7469    uint64_t *g = vg;                                                         \
7470    do {                                                                      \
7471        uint64_t pg = g[(i - 1) >> 6];                                        \
7472        do {                                                                  \
7473            i -= sizeof(TYPEW);                                               \
7474            if (likely((pg >> (i & 63)) & 1)) {                               \
7475                TYPEN nn = *(TYPEN *)(vn + HN(i + sizeof(TYPEN)));            \
7476                *(TYPEW *)(vd + HW(i)) = OP(nn, status);                      \
7477            }                                                                 \
7478        } while (i & 63);                                                     \
7479    } while (i != 0);                                                         \
7480}
7481
7482DO_FCVTLT(sve2_fcvtlt_hs, uint32_t, uint16_t, H1_4, H1_2, sve_f16_to_f32)
7483DO_FCVTLT(sve2_fcvtlt_sd, uint64_t, uint32_t, H1_8, H1_4, float32_to_float64)
7484
7485#undef DO_FCVTLT
7486#undef DO_FCVTNT
7487