qemu/target/arm/sve_helper.c
   1/*
   2 * ARM SVE Operations
   3 *
   4 * Copyright (c) 2018 Linaro, Ltd.
   5 *
   6 * This library is free software; you can redistribute it and/or
   7 * modify it under the terms of the GNU Lesser General Public
   8 * License as published by the Free Software Foundation; either
   9 * version 2.1 of the License, or (at your option) any later version.
  10 *
  11 * This library is distributed in the hope that it will be useful,
  12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14 * Lesser General Public License for more details.
  15 *
  16 * You should have received a copy of the GNU Lesser General Public
  17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
  18 */
  19
  20#include "qemu/osdep.h"
  21#include "cpu.h"
  22#include "internals.h"
  23#include "exec/exec-all.h"
  24#include "exec/cpu_ldst.h"
  25#include "exec/helper-proto.h"
  26#include "tcg/tcg-gvec-desc.h"
  27#include "fpu/softfloat.h"
  28#include "tcg/tcg.h"
  29
  30
  31/* Note that vector data is stored in host-endian 64-bit chunks,
   32   so addressing units smaller than that need a host-endian fixup.  */
  33#ifdef HOST_WORDS_BIGENDIAN
  34#define H1(x)   ((x) ^ 7)
  35#define H1_2(x) ((x) ^ 6)
  36#define H1_4(x) ((x) ^ 4)
  37#define H2(x)   ((x) ^ 3)
  38#define H4(x)   ((x) ^ 1)
  39#else
  40#define H1(x)   (x)
  41#define H1_2(x) (x)
  42#define H1_4(x) (x)
  43#define H2(x)   (x)
  44#define H4(x)   (x)
  45#endif
  46
  47/* Return a value for NZCV as per the ARM PredTest pseudofunction.
  48 *
  49 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
  50 * and bit 0 set if C is set.  Compare the definitions of these variables
  51 * within CPUARMState.
  52 */
  53
  54/* For no G bits set, NZCV = C.  */
  55#define PREDTEST_INIT  1
  56
  57/* This is an iterative function, called for each Pd and Pg word
  58 * moving forward.
  59 */
  60static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
  61{
  62    if (likely(g)) {
  63        /* Compute N from first D & G.
  64           Use bit 2 to signal first G bit seen.  */
  65        if (!(flags & 4)) {
  66            flags |= ((d & (g & -g)) != 0) << 31;
  67            flags |= 4;
  68        }
  69
  70        /* Accumulate Z from each D & G.  */
  71        flags |= ((d & g) != 0) << 1;
  72
  73        /* Compute C from last !(D & G).  Replace previous.  */
  74        flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
  75    }
  76    return flags;
  77}
  78
  79/* This is an iterative function, called for each Pd and Pg word
  80 * moving backward.
  81 */
  82static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
  83{
  84    if (likely(g)) {
  85        /* Compute C from first (i.e last) !(D & G).
  86           Use bit 2 to signal first G bit seen.  */
  87        if (!(flags & 4)) {
  88            flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
  89            flags |= (d & pow2floor(g)) == 0;
  90        }
  91
  92        /* Accumulate Z from each D & G.  */
  93        flags |= ((d & g) != 0) << 1;
  94
  95        /* Compute N from last (i.e first) D & G.  Replace previous.  */
  96        flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
  97    }
  98    return flags;
  99}
 100
 101/* The same for a single word predicate.  */
 102uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
 103{
 104    return iter_predtest_fwd(d, g, PREDTEST_INIT);
 105}
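/*
 * Illustrative examples of the encoding above.  Bit 2 of the returned
 * value is only the internal "first G bit seen" marker and is ignored
 * by callers.  For a single predicate word:
 *     d = 0x1, g = 0x1  ->  0x80000006  (N set, Z clear, C clear)
 *     d = 0x0, g = 0x1  ->  0x00000005  (N clear, Z set, C set)
 */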
 106
 107/* The same for a multi-word predicate.  */
 108uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
 109{
 110    uint32_t flags = PREDTEST_INIT;
 111    uint64_t *d = vd, *g = vg;
 112    uintptr_t i = 0;
 113
 114    do {
 115        flags = iter_predtest_fwd(d[i], g[i], flags);
 116    } while (++i < words);
 117
 118    return flags;
 119}
 120
 121/* Expand active predicate bits to bytes, for byte elements.
 122 *  for (i = 0; i < 256; ++i) {
 123 *      unsigned long m = 0;
 124 *      for (j = 0; j < 8; j++) {
 125 *          if ((i >> j) & 1) {
 126 *              m |= 0xfful << (j << 3);
 127 *          }
 128 *      }
 129 *      printf("0x%016lx,\n", m);
 130 *  }
 131 */
 132static inline uint64_t expand_pred_b(uint8_t byte)
 133{
 134    static const uint64_t word[256] = {
 135        0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
 136        0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
 137        0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
 138        0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
 139        0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
 140        0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
 141        0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
 142        0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
 143        0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
 144        0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
 145        0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
 146        0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
 147        0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
 148        0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
 149        0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
 150        0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
 151        0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
 152        0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
 153        0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
 154        0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
 155        0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
 156        0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
 157        0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
 158        0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
 159        0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
 160        0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
 161        0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
 162        0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
 163        0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
 164        0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
 165        0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
 166        0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
 167        0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
 168        0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
 169        0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
 170        0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
 171        0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
 172        0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
 173        0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
 174        0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
 175        0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
 176        0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
 177        0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
 178        0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
 179        0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
 180        0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
 181        0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
 182        0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
 183        0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
 184        0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
 185        0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
 186        0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
 187        0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
 188        0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
 189        0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
 190        0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
 191        0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
 192        0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
 193        0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
 194        0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
 195        0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
 196        0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
 197        0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
 198        0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
 199        0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
 200        0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
 201        0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
 202        0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
 203        0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
 204        0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
 205        0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
 206        0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
 207        0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
 208        0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
 209        0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
 210        0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
 211        0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
 212        0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
 213        0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
 214        0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
 215        0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
 216        0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
 217        0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
 218        0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
 219        0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
 220        0xffffffffffffffff,
 221    };
 222    return word[byte];
 223}
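/* For example (values taken from the table above):
 *     expand_pred_b(0x05) == 0x0000000000ff00ff   -- byte lanes 0 and 2
 *     expand_pred_b(0x31) == 0x0000ffff000000ff   -- byte lanes 0, 4 and 5
 */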
 224
 225/* Similarly for half-word elements.
 226 *  for (i = 0; i < 256; ++i) {
 227 *      unsigned long m = 0;
 228 *      if (i & 0xaa) {
 229 *          continue;
 230 *      }
 231 *      for (j = 0; j < 8; j += 2) {
 232 *          if ((i >> j) & 1) {
 233 *              m |= 0xfffful << (j << 3);
 234 *          }
 235 *      }
 236 *      printf("[0x%x] = 0x%016lx,\n", i, m);
 237 *  }
 238 */
 239static inline uint64_t expand_pred_h(uint8_t byte)
 240{
 241    static const uint64_t word[] = {
 242        [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
 243        [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
 244        [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
 245        [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
 246        [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
 247        [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
 248        [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
 249        [0x55] = 0xffffffffffffffff,
 250    };
 251    return word[byte & 0x55];
 252}
 253
 254/* Similarly for single word elements.  */
 255static inline uint64_t expand_pred_s(uint8_t byte)
 256{
 257    static const uint64_t word[] = {
 258        [0x01] = 0x00000000ffffffffull,
 259        [0x10] = 0xffffffff00000000ull,
 260        [0x11] = 0xffffffffffffffffull,
 261    };
 262    return word[byte & 0x11];
 263}
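/* For example, only every second (resp. fourth) predicate bit is
 * significant for half-word (resp. word) elements, so:
 *     expand_pred_h(0x17) == 0x0000ffffffffffff   -- 0x17 & 0x55 == 0x15
 *     expand_pred_s(0x1f) == 0xffffffffffffffff   -- 0x1f & 0x11 == 0x11
 */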
 264
 265/* Swap 16-bit words within a 32-bit word.  */
 266static inline uint32_t hswap32(uint32_t h)
 267{
 268    return rol32(h, 16);
 269}
 270
 271/* Swap 16-bit words within a 64-bit word.  */
 272static inline uint64_t hswap64(uint64_t h)
 273{
 274    uint64_t m = 0x0000ffff0000ffffull;
 275    h = rol64(h, 32);
 276    return ((h & m) << 16) | ((h >> 16) & m);
 277}
 278
 279/* Swap 32-bit words within a 64-bit word.  */
 280static inline uint64_t wswap64(uint64_t h)
 281{
 282    return rol64(h, 32);
 283}
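/* For example:
 *     hswap32(0x01234567)         == 0x45670123
 *     hswap64(0x0123456789abcdef) == 0xcdef89ab45670123
 *     wswap64(0x0123456789abcdef) == 0x89abcdef01234567
 */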
 284
 285#define LOGICAL_PPPP(NAME, FUNC) \
 286void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
 287{                                                                         \
 288    uintptr_t opr_sz = simd_oprsz(desc);                                  \
 289    uint64_t *d = vd, *n = vn, *m = vm, *g = vg;                          \
 290    uintptr_t i;                                                          \
 291    for (i = 0; i < opr_sz / 8; ++i) {                                    \
 292        d[i] = FUNC(n[i], m[i], g[i]);                                    \
 293    }                                                                     \
 294}
 295
 296#define DO_AND(N, M, G)  (((N) & (M)) & (G))
 297#define DO_BIC(N, M, G)  (((N) & ~(M)) & (G))
 298#define DO_EOR(N, M, G)  (((N) ^ (M)) & (G))
 299#define DO_ORR(N, M, G)  (((N) | (M)) & (G))
 300#define DO_ORN(N, M, G)  (((N) | ~(M)) & (G))
 301#define DO_NOR(N, M, G)  (~((N) | (M)) & (G))
 302#define DO_NAND(N, M, G) (~((N) & (M)) & (G))
 303#define DO_SEL(N, M, G)  (((N) & (G)) | ((M) & ~(G)))
 304
 305LOGICAL_PPPP(sve_and_pppp, DO_AND)
 306LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
 307LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
 308LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
 309LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
 310LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
 311LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
 312LOGICAL_PPPP(sve_nand_pppp, DO_NAND)
 313
 314#undef DO_AND
 315#undef DO_BIC
 316#undef DO_EOR
 317#undef DO_ORR
 318#undef DO_ORN
 319#undef DO_NOR
 320#undef DO_NAND
 321#undef DO_SEL
 322#undef LOGICAL_PPPP
 323
 324/* Fully general three-operand expander, controlled by a predicate.
 325 * This is complicated by the host-endian storage of the register file.
 326 */
 327/* ??? I don't expect the compiler could ever vectorize this itself.
 328 * With some tables we can convert bit masks to byte masks, and with
 329 * extra care wrt byte/word ordering we could use gcc generic vectors
 330 * and do 16 bytes at a time.
 331 */
 332#define DO_ZPZZ(NAME, TYPE, H, OP)                                       \
 333void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
 334{                                                                       \
 335    intptr_t i, opr_sz = simd_oprsz(desc);                              \
 336    for (i = 0; i < opr_sz; ) {                                         \
 337        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
 338        do {                                                            \
 339            if (pg & 1) {                                               \
 340                TYPE nn = *(TYPE *)(vn + H(i));                         \
 341                TYPE mm = *(TYPE *)(vm + H(i));                         \
 342                *(TYPE *)(vd + H(i)) = OP(nn, mm);                      \
 343            }                                                           \
 344            i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
 345        } while (i & 15);                                               \
 346    }                                                                   \
 347}
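/* The predicate is consumed in 16-bit chunks to match each 16-byte data
 * chunk: an element is governed by the least significant of its
 * sizeof(TYPE) predicate bits, hence the pg >>= sizeof(TYPE) step in
 * DO_ZPZZ above.  E.g. for 4-byte elements, bits 0, 4, 8 and 12 of each
 * predicate chunk control the chunk's four elements.
 */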
 348
 349/* Similarly, specialized for 64-bit operands.  */
 350#define DO_ZPZZ_D(NAME, TYPE, OP)                                \
 351void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
 352{                                                               \
 353    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
 354    TYPE *d = vd, *n = vn, *m = vm;                             \
 355    uint8_t *pg = vg;                                           \
 356    for (i = 0; i < opr_sz; i += 1) {                           \
 357        if (pg[H1(i)] & 1) {                                    \
 358            TYPE nn = n[i], mm = m[i];                          \
 359            d[i] = OP(nn, mm);                                  \
 360        }                                                       \
 361    }                                                           \
 362}
 363
 364#define DO_AND(N, M)  (N & M)
 365#define DO_EOR(N, M)  (N ^ M)
 366#define DO_ORR(N, M)  (N | M)
 367#define DO_BIC(N, M)  (N & ~M)
 368#define DO_ADD(N, M)  (N + M)
 369#define DO_SUB(N, M)  (N - M)
 370#define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
 371#define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
 372#define DO_ABD(N, M)  ((N) >= (M) ? (N) - (M) : (M) - (N))
 373#define DO_MUL(N, M)  (N * M)
 374
 375
 376/*
 377 * We must avoid the C undefined behaviour cases: division by
 378 * zero and signed division of INT_MIN by -1. Both of these
 379 * have architecturally defined required results for Arm.
 380 * We special case all signed divisions by -1 to avoid having
 381 * to deduce the minimum integer for the type involved.
 382 */
 383#define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
 384#define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
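/* For example:
 *     DO_SDIV(n, 0) == 0 and DO_UDIV(n, 0) == 0, as the architecture requires.
 *     DO_SDIV(INT32_MIN, -1) == INT32_MIN: the negation overflows, and
 *     because QEMU is built with -fwrapv it wraps to the architecturally
 *     required result.
 */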
 385
 386DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
 387DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
 388DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
 389DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)
 390
 391DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
 392DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
 393DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
 394DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)
 395
 396DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
 397DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
 398DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
 399DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)
 400
 401DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
 402DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
 403DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
 404DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)
 405
 406DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
 407DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
 408DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
 409DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)
 410
 411DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
 412DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
 413DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
 414DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)
 415
 416DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
 417DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
 418DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
 419DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)
 420
 421DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
 422DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
 423DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
 424DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)
 425
 426DO_ZPZZ(sve_smin_zpzz_b, int8_t,  H1, DO_MIN)
 427DO_ZPZZ(sve_smin_zpzz_h, int16_t,  H1_2, DO_MIN)
 428DO_ZPZZ(sve_smin_zpzz_s, int32_t,  H1_4, DO_MIN)
 429DO_ZPZZ_D(sve_smin_zpzz_d, int64_t,  DO_MIN)
 430
 431DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
 432DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
 433DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
 434DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)
 435
 436DO_ZPZZ(sve_sabd_zpzz_b, int8_t,  H1, DO_ABD)
 437DO_ZPZZ(sve_sabd_zpzz_h, int16_t,  H1_2, DO_ABD)
 438DO_ZPZZ(sve_sabd_zpzz_s, int32_t,  H1_4, DO_ABD)
 439DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t,  DO_ABD)
 440
 441DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
 442DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
 443DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
 444DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
 445
 446/* Because the computation type is at least twice as large as required,
 447   these work for both signed and unsigned source types.  */
 448static inline uint8_t do_mulh_b(int32_t n, int32_t m)
 449{
 450    return (n * m) >> 8;
 451}
 452
 453static inline uint16_t do_mulh_h(int32_t n, int32_t m)
 454{
 455    return (n * m) >> 16;
 456}
 457
 458static inline uint32_t do_mulh_s(int64_t n, int64_t m)
 459{
 460    return (n * m) >> 32;
 461}
 462
 463static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
 464{
 465    uint64_t lo, hi;
 466    muls64(&lo, &hi, n, m);
 467    return hi;
 468}
 469
 470static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
 471{
 472    uint64_t lo, hi;
 473    mulu64(&lo, &hi, n, m);
 474    return hi;
 475}
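/* For example, do_mulh_b() serves both signednesses because the caller's
 * TYPE performs the extension: for SMULH, (int8_t)-2 * (int8_t)3 = -6,
 * whose high byte is 0xff; for UMULH, 0xff * 0xff = 0xfe01, whose high
 * byte is 0xfe.
 */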
 476
 477DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
 478DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
 479DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
 480DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)
 481
 482DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
 483DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
 484DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
 485DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)
 486
 487DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
 488DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
 489DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
 490DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)
 491
 492DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
 493DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)
 494
 495DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
 496DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)
 497
 498/* Note that all bits of the shift are significant
 499   and not modulo the element size.  */
 500#define DO_ASR(N, M)  (N >> MIN(M, sizeof(N) * 8 - 1))
 501#define DO_LSR(N, M)  (M < sizeof(N) * 8 ? N >> M : 0)
 502#define DO_LSL(N, M)  (M < sizeof(N) * 8 ? N << M : 0)
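/* For example, with an 8-bit N and an out-of-range shift count of 8:
 *     DO_LSR(n, 8) == 0 and DO_LSL(n, 8) == 0, while
 *     DO_ASR(n, 8) clamps the count to 7 and yields 0 or -1 by sign.
 */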
 503
  504DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
  505DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1, DO_LSR)
  506DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1, DO_LSL)
  507
  508DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1_2, DO_ASR)
  509DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
  510DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_2, DO_LSL)
  511
  512DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1_4, DO_ASR)
  513DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_4, DO_LSR)
  514DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
 515
 516DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
 517DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
 518DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
 519
 520#undef DO_ZPZZ
 521#undef DO_ZPZZ_D
 522
 523/* Three-operand expander, controlled by a predicate, in which the
 524 * third operand is "wide".  That is, for D = N op M, the same 64-bit
 525 * value of M is used with all of the narrower values of N.
 526 */
 527#define DO_ZPZW(NAME, TYPE, TYPEW, H, OP)                               \
 528void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
 529{                                                                       \
 530    intptr_t i, opr_sz = simd_oprsz(desc);                              \
 531    for (i = 0; i < opr_sz; ) {                                         \
 532        uint8_t pg = *(uint8_t *)(vg + H1(i >> 3));                     \
 533        TYPEW mm = *(TYPEW *)(vm + i);                                  \
 534        do {                                                            \
 535            if (pg & 1) {                                               \
 536                TYPE nn = *(TYPE *)(vn + H(i));                         \
 537                *(TYPE *)(vd + H(i)) = OP(nn, mm);                      \
 538            }                                                           \
 539            i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
 540        } while (i & 7);                                                \
 541    }                                                                   \
 542}
 543
 544DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
 545DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
 546DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)
 547
 548DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
 549DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
 550DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
 551
 552DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
 553DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
 554DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
 555
 556#undef DO_ZPZW
 557
 558/* Fully general two-operand expander, controlled by a predicate.
 559 */
 560#define DO_ZPZ(NAME, TYPE, H, OP)                               \
 561void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
 562{                                                               \
 563    intptr_t i, opr_sz = simd_oprsz(desc);                      \
 564    for (i = 0; i < opr_sz; ) {                                 \
 565        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
 566        do {                                                    \
 567            if (pg & 1) {                                       \
 568                TYPE nn = *(TYPE *)(vn + H(i));                 \
 569                *(TYPE *)(vd + H(i)) = OP(nn);                  \
 570            }                                                   \
 571            i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
 572        } while (i & 15);                                       \
 573    }                                                           \
 574}
 575
 576/* Similarly, specialized for 64-bit operands.  */
 577#define DO_ZPZ_D(NAME, TYPE, OP)                                \
 578void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
 579{                                                               \
 580    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
 581    TYPE *d = vd, *n = vn;                                      \
 582    uint8_t *pg = vg;                                           \
 583    for (i = 0; i < opr_sz; i += 1) {                           \
 584        if (pg[H1(i)] & 1) {                                    \
 585            TYPE nn = n[i];                                     \
 586            d[i] = OP(nn);                                      \
 587        }                                                       \
 588    }                                                           \
 589}
 590
 591#define DO_CLS_B(N)   (clrsb32(N) - 24)
 592#define DO_CLS_H(N)   (clrsb32(N) - 16)
 593
 594DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
 595DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
 596DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
 597DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)
 598
 599#define DO_CLZ_B(N)   (clz32(N) - 24)
 600#define DO_CLZ_H(N)   (clz32(N) - 16)
 601
 602DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
 603DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
 604DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
 605DO_ZPZ_D(sve_clz_d, uint64_t, clz64)
 606
 607DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
 608DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
 609DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
 610DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)
 611
 612#define DO_CNOT(N)    (N == 0)
 613
 614DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
 615DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
 616DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
 617DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)
 618
 619#define DO_FABS(N)    (N & ((__typeof(N))-1 >> 1))
 620
 621DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
 622DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
 623DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)
 624
 625#define DO_FNEG(N)    (N ^ ~((__typeof(N))-1 >> 1))
 626
 627DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
 628DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
 629DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)
 630
 631#define DO_NOT(N)    (~N)
 632
 633DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
 634DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
 635DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
 636DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)
 637
 638#define DO_SXTB(N)    ((int8_t)N)
 639#define DO_SXTH(N)    ((int16_t)N)
 640#define DO_SXTS(N)    ((int32_t)N)
 641#define DO_UXTB(N)    ((uint8_t)N)
 642#define DO_UXTH(N)    ((uint16_t)N)
 643#define DO_UXTS(N)    ((uint32_t)N)
 644
 645DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
 646DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
 647DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
 648DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
 649DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
 650DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)
 651
 652DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
 653DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
 654DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
 655DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
 656DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
 657DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)
 658
 659#define DO_ABS(N)    (N < 0 ? -N : N)
 660
 661DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
 662DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
 663DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
 664DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)
 665
 666#define DO_NEG(N)    (-N)
 667
 668DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
 669DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
 670DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
 671DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)
 672
 673DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
 674DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
 675DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)
 676
 677DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
 678DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)
 679
 680DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)
 681
 682DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
 683DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
 684DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
 685DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
 686
 687/* Three-operand expander, unpredicated, in which the third operand is "wide".
 688 */
 689#define DO_ZZW(NAME, TYPE, TYPEW, H, OP)                       \
 690void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
 691{                                                              \
 692    intptr_t i, opr_sz = simd_oprsz(desc);                     \
 693    for (i = 0; i < opr_sz; ) {                                \
 694        TYPEW mm = *(TYPEW *)(vm + i);                         \
 695        do {                                                   \
 696            TYPE nn = *(TYPE *)(vn + H(i));                    \
 697            *(TYPE *)(vd + H(i)) = OP(nn, mm);                 \
 698            i += sizeof(TYPE);                                 \
 699        } while (i & 7);                                       \
 700    }                                                          \
 701}
 702
 703DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
 704DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
 705DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)
 706
 707DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
 708DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
 709DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
 710
 711DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
 712DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
 713DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
 714
 715#undef DO_ZZW
 716
 717#undef DO_CLS_B
 718#undef DO_CLS_H
 719#undef DO_CLZ_B
 720#undef DO_CLZ_H
 721#undef DO_CNOT
 722#undef DO_FABS
 723#undef DO_FNEG
 724#undef DO_ABS
 725#undef DO_NEG
 726#undef DO_ZPZ
 727#undef DO_ZPZ_D
 728
 729/* Two-operand reduction expander, controlled by a predicate.
 730 * The difference between TYPERED and TYPERET has to do with
 731 * sign-extension.  E.g. for SMAX, TYPERED must be signed,
 732 * but TYPERET must be unsigned so that e.g. a 32-bit value
 733 * is not sign-extended to the ABI uint64_t return type.
 734 */
 735/* ??? If we were to vectorize this by hand the reduction ordering
 736 * would change.  For integer operands, this is perfectly fine.
 737 */
 738#define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
 739uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc)   \
 740{                                                          \
 741    intptr_t i, opr_sz = simd_oprsz(desc);                 \
 742    TYPERED ret = INIT;                                    \
 743    for (i = 0; i < opr_sz; ) {                            \
 744        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));    \
 745        do {                                               \
 746            if (pg & 1) {                                  \
 747                TYPEELT nn = *(TYPEELT *)(vn + H(i));      \
 748                ret = OP(ret, nn);                         \
 749            }                                              \
 750            i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT);  \
 751        } while (i & 15);                                  \
 752    }                                                      \
 753    return (TYPERET)ret;                                   \
 754}
 755
 756#define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP)             \
 757uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc)   \
 758{                                                          \
 759    intptr_t i, opr_sz = simd_oprsz(desc) / 8;             \
 760    TYPEE *n = vn;                                         \
 761    uint8_t *pg = vg;                                      \
 762    TYPER ret = INIT;                                      \
 763    for (i = 0; i < opr_sz; i += 1) {                      \
 764        if (pg[H1(i)] & 1) {                               \
 765            TYPEE nn = n[i];                               \
 766            ret = OP(ret, nn);                             \
 767        }                                                  \
 768    }                                                      \
 769    return ret;                                            \
 770}
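/* For example, with no active elements sve_smaxv_b returns its INIT
 * value INT8_MIN; casting through TYPERET = uint8_t yields 0x80 rather
 * than a sign-extended 0xffffffffffffff80 in the uint64_t ABI return.
 */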
 771
 772DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
 773DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
 774DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
 775DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)
 776
 777DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
 778DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
 779DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
 780DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)
 781
 782DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
 783DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
 784DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
 785DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)
 786
 787DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
 788DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
 789DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
 790
 791DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
 792DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
 793DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
 794DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)
 795
 796DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
 797DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
 798DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
 799DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)
 800
 801DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
 802DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
 803DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
 804DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)
 805
 806DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
 807DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
 808DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
 809DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)
 810
 811DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
 812DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
 813DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
 814DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
 815
 816#undef DO_VPZ
 817#undef DO_VPZ_D
 818
 819/* Two vector operand, one scalar operand, unpredicated.  */
 820#define DO_ZZI(NAME, TYPE, OP)                                       \
 821void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc)   \
 822{                                                                    \
 823    intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE);            \
 824    TYPE s = s64, *d = vd, *n = vn;                                  \
 825    for (i = 0; i < opr_sz; ++i) {                                   \
 826        d[i] = OP(n[i], s);                                          \
 827    }                                                                \
 828}
 829
 830#define DO_SUBR(X, Y)   (Y - X)
 831
 832DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
 833DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
 834DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
 835DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)
 836
 837DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
 838DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
 839DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
 840DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)
 841
 842DO_ZZI(sve_smini_b, int8_t, DO_MIN)
 843DO_ZZI(sve_smini_h, int16_t, DO_MIN)
 844DO_ZZI(sve_smini_s, int32_t, DO_MIN)
 845DO_ZZI(sve_smini_d, int64_t, DO_MIN)
 846
 847DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
 848DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
 849DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
 850DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)
 851
 852DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
 853DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
 854DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
 855DO_ZZI(sve_umini_d, uint64_t, DO_MIN)
 856
 857#undef DO_ZZI
 858
 859#undef DO_AND
 860#undef DO_ORR
 861#undef DO_EOR
 862#undef DO_BIC
 863#undef DO_ADD
 864#undef DO_SUB
 865#undef DO_MAX
 866#undef DO_MIN
 867#undef DO_ABD
 868#undef DO_MUL
 869#undef DO_DIV
 870#undef DO_ASR
 871#undef DO_LSR
 872#undef DO_LSL
 873#undef DO_SUBR
 874
 875/* Similar to the ARM LastActiveElement pseudocode function, except the
 876   result is multiplied by the element size.  This includes the not found
 877   indication; e.g. not found for esz=3 is -8.  */
 878static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
 879{
 880    uint64_t mask = pred_esz_masks[esz];
 881    intptr_t i = words;
 882
 883    do {
 884        uint64_t this_g = g[--i] & mask;
 885        if (this_g) {
 886            return i * 64 + (63 - clz64(this_g));
 887        }
 888    } while (i > 0);
 889    return (intptr_t)-1 << esz;
 890}
 891
 892uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t words)
 893{
 894    uint32_t flags = PREDTEST_INIT;
 895    uint64_t *d = vd, *g = vg;
 896    intptr_t i = 0;
 897
 898    do {
 899        uint64_t this_d = d[i];
 900        uint64_t this_g = g[i];
 901
 902        if (this_g) {
 903            if (!(flags & 4)) {
 904                /* Set in D the first bit of G.  */
 905                this_d |= this_g & -this_g;
 906                d[i] = this_d;
 907            }
 908            flags = iter_predtest_fwd(this_d, this_g, flags);
 909        }
 910    } while (++i < words);
 911
 912    return flags;
 913}
 914
 915uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
 916{
 917    intptr_t words = extract32(pred_desc, 0, SIMD_OPRSZ_BITS);
 918    intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
 919    uint32_t flags = PREDTEST_INIT;
 920    uint64_t *d = vd, *g = vg, esz_mask;
 921    intptr_t i, next;
 922
 923    next = last_active_element(vd, words, esz) + (1 << esz);
 924    esz_mask = pred_esz_masks[esz];
 925
 926    /* Similar to the pseudocode for pnext, but scaled by ESZ
 927       so that we find the correct bit.  */
 928    if (next < words * 64) {
 929        uint64_t mask = -1;
 930
 931        if (next & 63) {
 932            mask = ~((1ull << (next & 63)) - 1);
 933            next &= -64;
 934        }
 935        do {
 936            uint64_t this_g = g[next / 64] & esz_mask & mask;
 937            if (this_g != 0) {
 938                next = (next & -64) + ctz64(this_g);
 939                break;
 940            }
 941            next += 64;
 942            mask = -1;
 943        } while (next < words * 64);
 944    }
 945
 946    i = 0;
 947    do {
 948        uint64_t this_d = 0;
 949        if (i == next / 64) {
 950            this_d = 1ull << (next & 63);
 951        }
 952        d[i] = this_d;
 953        flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
 954    } while (++i < words);
 955
 956    return flags;
 957}
 958
 959/*
 960 * Copy Zn into Zd, and store zero into inactive elements.
 961 * If inv, store zeros into the active elements.
 962 */
 963void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
 964{
 965    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
 966    uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
 967    uint64_t *d = vd, *n = vn;
 968    uint8_t *pg = vg;
 969
 970    for (i = 0; i < opr_sz; i += 1) {
 971        d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv);
 972    }
 973}
 974
 975void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
 976{
 977    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
 978    uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
 979    uint64_t *d = vd, *n = vn;
 980    uint8_t *pg = vg;
 981
 982    for (i = 0; i < opr_sz; i += 1) {
 983        d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv);
 984    }
 985}
 986
 987void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
 988{
 989    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
 990    uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
 991    uint64_t *d = vd, *n = vn;
 992    uint8_t *pg = vg;
 993
 994    for (i = 0; i < opr_sz; i += 1) {
 995        d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv);
 996    }
 997}
 998
 999void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
1000{
1001    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1002    uint64_t *d = vd, *n = vn;
1003    uint8_t *pg = vg;
1004    uint8_t inv = simd_data(desc);
1005
1006    for (i = 0; i < opr_sz; i += 1) {
1007        d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1);
1008    }
1009}
1010
1011/* Three-operand expander, immediate operand, controlled by a predicate.
1012 */
1013#define DO_ZPZI(NAME, TYPE, H, OP)                              \
1014void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
1015{                                                               \
1016    intptr_t i, opr_sz = simd_oprsz(desc);                      \
1017    TYPE imm = simd_data(desc);                                 \
1018    for (i = 0; i < opr_sz; ) {                                 \
1019        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
1020        do {                                                    \
1021            if (pg & 1) {                                       \
1022                TYPE nn = *(TYPE *)(vn + H(i));                 \
1023                *(TYPE *)(vd + H(i)) = OP(nn, imm);             \
1024            }                                                   \
1025            i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
1026        } while (i & 15);                                       \
1027    }                                                           \
1028}
1029
1030/* Similarly, specialized for 64-bit operands.  */
1031#define DO_ZPZI_D(NAME, TYPE, OP)                               \
1032void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
1033{                                                               \
1034    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
1035    TYPE *d = vd, *n = vn;                                      \
1036    TYPE imm = simd_data(desc);                                 \
1037    uint8_t *pg = vg;                                           \
1038    for (i = 0; i < opr_sz; i += 1) {                           \
1039        if (pg[H1(i)] & 1) {                                    \
1040            TYPE nn = n[i];                                     \
1041            d[i] = OP(nn, imm);                                 \
1042        }                                                       \
1043    }                                                           \
1044}
1045
1046#define DO_SHR(N, M)  (N >> M)
1047#define DO_SHL(N, M)  (N << M)
1048
1049/* Arithmetic shift right for division.  This rounds negative numbers
1050   toward zero as per signed division.  Therefore before shifting,
1051   when N is negative, add 2**M-1.  */
1052#define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
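/* For example, DO_ASRD(-7, 2) == (-7 + 3) >> 2 == -1, matching -7 / 4
 * rounded toward zero, where a plain arithmetic shift would give -2.
 */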
1053
1054DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
1055DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
1056DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
1057DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)
1058
1059DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
1060DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
1061DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
1062DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)
1063
1064DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
1065DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
1066DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
1067DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)
1068
1069DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
1070DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
1071DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
1072DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
1073
1074#undef DO_SHR
1075#undef DO_SHL
1076#undef DO_ASRD
1077#undef DO_ZPZI
1078#undef DO_ZPZI_D
1079
1080/* Fully general four-operand expander, controlled by a predicate.
1081 */
1082#define DO_ZPZZZ(NAME, TYPE, H, OP)                           \
1083void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,     \
1084                  void *vg, uint32_t desc)                    \
1085{                                                             \
1086    intptr_t i, opr_sz = simd_oprsz(desc);                    \
1087    for (i = 0; i < opr_sz; ) {                               \
1088        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));       \
1089        do {                                                  \
1090            if (pg & 1) {                                     \
1091                TYPE nn = *(TYPE *)(vn + H(i));               \
1092                TYPE mm = *(TYPE *)(vm + H(i));               \
1093                TYPE aa = *(TYPE *)(va + H(i));               \
1094                *(TYPE *)(vd + H(i)) = OP(aa, nn, mm);        \
1095            }                                                 \
1096            i += sizeof(TYPE), pg >>= sizeof(TYPE);           \
1097        } while (i & 15);                                     \
1098    }                                                         \
1099}
1100
1101/* Similarly, specialized for 64-bit operands.  */
1102#define DO_ZPZZZ_D(NAME, TYPE, OP)                            \
1103void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,     \
1104                  void *vg, uint32_t desc)                    \
1105{                                                             \
1106    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                \
1107    TYPE *d = vd, *a = va, *n = vn, *m = vm;                  \
1108    uint8_t *pg = vg;                                         \
1109    for (i = 0; i < opr_sz; i += 1) {                         \
1110        if (pg[H1(i)] & 1) {                                  \
1111            TYPE aa = a[i], nn = n[i], mm = m[i];             \
1112            d[i] = OP(aa, nn, mm);                            \
1113        }                                                     \
1114    }                                                         \
1115}
1116
1117#define DO_MLA(A, N, M)  (A + N * M)
1118#define DO_MLS(A, N, M)  (A - N * M)
1119
1120DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
1121DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)
1122
1123DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
1124DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)
1125
1126DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
1127DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)
1128
1129DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
1130DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
1131
1132#undef DO_MLA
1133#undef DO_MLS
1134#undef DO_ZPZZZ
1135#undef DO_ZPZZZ_D
1136
1137void HELPER(sve_index_b)(void *vd, uint32_t start,
1138                         uint32_t incr, uint32_t desc)
1139{
1140    intptr_t i, opr_sz = simd_oprsz(desc);
1141    uint8_t *d = vd;
1142    for (i = 0; i < opr_sz; i += 1) {
1143        d[H1(i)] = start + i * incr;
1144    }
1145}
1146
1147void HELPER(sve_index_h)(void *vd, uint32_t start,
1148                         uint32_t incr, uint32_t desc)
1149{
1150    intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1151    uint16_t *d = vd;
1152    for (i = 0; i < opr_sz; i += 1) {
1153        d[H2(i)] = start + i * incr;
1154    }
1155}
1156
1157void HELPER(sve_index_s)(void *vd, uint32_t start,
1158                         uint32_t incr, uint32_t desc)
1159{
1160    intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1161    uint32_t *d = vd;
1162    for (i = 0; i < opr_sz; i += 1) {
1163        d[H4(i)] = start + i * incr;
1164    }
1165}
1166
1167void HELPER(sve_index_d)(void *vd, uint64_t start,
1168                         uint64_t incr, uint32_t desc)
1169{
1170    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1171    uint64_t *d = vd;
1172    for (i = 0; i < opr_sz; i += 1) {
1173        d[i] = start + i * incr;
1174    }
1175}
1176
1177void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
1178{
1179    intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1180    uint32_t sh = simd_data(desc);
1181    uint32_t *d = vd, *n = vn, *m = vm;
1182    for (i = 0; i < opr_sz; i += 1) {
1183        d[i] = n[i] + (m[i] << sh);
1184    }
1185}
1186
1187void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
1188{
1189    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1190    uint64_t sh = simd_data(desc);
1191    uint64_t *d = vd, *n = vn, *m = vm;
1192    for (i = 0; i < opr_sz; i += 1) {
1193        d[i] = n[i] + (m[i] << sh);
1194    }
1195}
1196
1197void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
1198{
1199    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1200    uint64_t sh = simd_data(desc);
1201    uint64_t *d = vd, *n = vn, *m = vm;
1202    for (i = 0; i < opr_sz; i += 1) {
1203        d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
1204    }
1205}
1206
1207void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
1208{
1209    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1210    uint64_t sh = simd_data(desc);
1211    uint64_t *d = vd, *n = vn, *m = vm;
1212    for (i = 0; i < opr_sz; i += 1) {
1213        d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
1214    }
1215}
1216
1217void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
1218{
1219    /* These constants are cut-and-paste directly from the ARM pseudocode.  */
1220    static const uint16_t coeff[] = {
1221        0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
1222        0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
1223        0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
1224        0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
1225    };
1226    intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1227    uint16_t *d = vd, *n = vn;
1228
1229    for (i = 0; i < opr_sz; i++) {
1230        uint16_t nn = n[i];
1231        intptr_t idx = extract32(nn, 0, 5);
1232        uint16_t exp = extract32(nn, 5, 5);
1233        d[i] = coeff[idx] | (exp << 10);
1234    }
1235}
1236
1237void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
1238{
1239    /* These constants are cut-and-paste directly from the ARM pseudocode.  */
1240    static const uint32_t coeff[] = {
1241        0x000000, 0x0164d2, 0x02cd87, 0x043a29,
1242        0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
1243        0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
1244        0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
1245        0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
1246        0x1ef532, 0x20b051, 0x227043, 0x243516,
1247        0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
1248        0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
1249        0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
1250        0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
1251        0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
1252        0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
1253        0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
1254        0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
1255        0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
1256        0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
1257    };
1258    intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1259    uint32_t *d = vd, *n = vn;
1260
1261    for (i = 0; i < opr_sz; i++) {
1262        uint32_t nn = n[i];
1263        intptr_t idx = extract32(nn, 0, 6);
1264        uint32_t exp = extract32(nn, 6, 8);
1265        d[i] = coeff[idx] | (exp << 23);
1266    }
1267}
1268
1269void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
1270{
1271    /* These constants are cut-and-paste directly from the ARM pseudocode.  */
1272    static const uint64_t coeff[] = {
1273        0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
1274        0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
1275        0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
1276        0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
1277        0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
1278        0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
1279        0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
1280        0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
1281        0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
1282        0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
1283        0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
1284        0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
1285        0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
1286        0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
1287        0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
1288        0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
1289        0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
1290        0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
1291        0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
1292        0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
1293        0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
1294        0xFA7C1819E90D8ull,
1295    };
1296    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1297    uint64_t *d = vd, *n = vn;
1298
1299    for (i = 0; i < opr_sz; i++) {
1300        uint64_t nn = n[i];
1301        intptr_t idx = extract32(nn, 0, 6);
1302        uint64_t exp = extract32(nn, 6, 11);
1303        d[i] = coeff[idx] | (exp << 52);
1304    }
1305}
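/* For example, in the single-precision case an input with idx = 0 and
 * exp = 127 (nn = 0x1fc0) produces 127 << 23 = 0x3f800000 = 1.0f, and
 * idx = 32 (nn = 0x1fe0) produces 0x3fb504f3, approximately sqrt(2),
 * i.e. 2^(32/64).
 */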
1306
1307void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
1308{
1309    intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1310    uint16_t *d = vd, *n = vn, *m = vm;
1311    for (i = 0; i < opr_sz; i += 1) {
1312        uint16_t nn = n[i];
1313        uint16_t mm = m[i];
1314        if (mm & 1) {
1315            nn = float16_one;
1316        }
1317        d[i] = nn ^ (mm & 2) << 14;
1318    }
1319}
1320
1321void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
1322{
1323    intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1324    uint32_t *d = vd, *n = vn, *m = vm;
1325    for (i = 0; i < opr_sz; i += 1) {
1326        uint32_t nn = n[i];
1327        uint32_t mm = m[i];
1328        if (mm & 1) {
1329            nn = float32_one;
1330        }
1331        d[i] = nn ^ (mm & 2) << 30;
1332    }
1333}
1334
1335void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
1336{
1337    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1338    uint64_t *d = vd, *n = vn, *m = vm;
1339    for (i = 0; i < opr_sz; i += 1) {
1340        uint64_t nn = n[i];
1341        uint64_t mm = m[i];
1342        if (mm & 1) {
1343            nn = float64_one;
1344        }
1345        d[i] = nn ^ (mm & 2) << 62;
1346    }
1347}
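/* For example, in the single-precision case, bit 0 of M selects 1.0
 * in place of N and bit 1 negates the result:
 *     nn = 0x40000000 (2.0f), mm = 2  ->  0xc0000000 (-2.0f)
 *     nn = 0x40000000 (2.0f), mm = 3  ->  0xbf800000 (-1.0f)
 */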
1348
1349/*
1350 * Signed saturating addition with scalar operand.
1351 */
1352
1353void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
1354{
1355    intptr_t i, oprsz = simd_oprsz(desc);
1356
1357    for (i = 0; i < oprsz; i += sizeof(int8_t)) {
1358        int r = *(int8_t *)(a + i) + b;
1359        if (r > INT8_MAX) {
1360            r = INT8_MAX;
1361        } else if (r < INT8_MIN) {
1362            r = INT8_MIN;
1363        }
1364        *(int8_t *)(d + i) = r;
1365    }
1366}
1367
1368void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
1369{
1370    intptr_t i, oprsz = simd_oprsz(desc);
1371
1372    for (i = 0; i < oprsz; i += sizeof(int16_t)) {
1373        int r = *(int16_t *)(a + i) + b;
1374        if (r > INT16_MAX) {
1375            r = INT16_MAX;
1376        } else if (r < INT16_MIN) {
1377            r = INT16_MIN;
1378        }
1379        *(int16_t *)(d + i) = r;
1380    }
1381}
1382
1383void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
1384{
1385    intptr_t i, oprsz = simd_oprsz(desc);
1386
1387    for (i = 0; i < oprsz; i += sizeof(int32_t)) {
1388        int64_t r = *(int32_t *)(a + i) + b;
1389        if (r > INT32_MAX) {
1390            r = INT32_MAX;
1391        } else if (r < INT32_MIN) {
1392            r = INT32_MIN;
1393        }
1394        *(int32_t *)(d + i) = r;
1395    }
1396}
1397
1398void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
1399{
1400    intptr_t i, oprsz = simd_oprsz(desc);
1401
1402    for (i = 0; i < oprsz; i += sizeof(int64_t)) {
1403        int64_t ai = *(int64_t *)(a + i);
1404        int64_t r = ai + b;
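        /* The test below detects signed overflow: it is true iff AI and B
           have the same sign while R has the opposite sign.  */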
1405        if (((r ^ ai) & ~(ai ^ b)) < 0) {
1406            /* Signed overflow.  */
1407            r = (r < 0 ? INT64_MAX : INT64_MIN);
1408        }
1409        *(int64_t *)(d + i) = r;
1410    }
1411}
1412
1413/*
1414 * Unsigned saturating addition with scalar operand.
1415 */
1416
1417void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
1418{
1419    intptr_t i, oprsz = simd_oprsz(desc);
1420
1421    for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
1422        int r = *(uint8_t *)(a + i) + b;
1423        if (r > UINT8_MAX) {
1424            r = UINT8_MAX;
1425        } else if (r < 0) {
1426            r = 0;
1427        }
1428        *(uint8_t *)(d + i) = r;
1429    }
1430}
1431
1432void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
1433{
1434    intptr_t i, oprsz = simd_oprsz(desc);
1435
1436    for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
1437        int r = *(uint16_t *)(a + i) + b;
1438        if (r > UINT16_MAX) {
1439            r = UINT16_MAX;
1440        } else if (r < 0) {
1441            r = 0;
1442        }
1443        *(uint16_t *)(d + i) = r;
1444    }
1445}
1446
1447void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
1448{
1449    intptr_t i, oprsz = simd_oprsz(desc);
1450
1451    for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
1452        int64_t r = *(uint32_t *)(a + i) + b;
1453        if (r > UINT32_MAX) {
1454            r = UINT32_MAX;
1455        } else if (r < 0) {
1456            r = 0;
1457        }
1458        *(uint32_t *)(d + i) = r;
1459    }
1460}
1461
1462void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
1463{
1464    intptr_t i, oprsz = simd_oprsz(desc);
1465
1466    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1467        uint64_t r = *(uint64_t *)(a + i) + b;
1468        if (r < b) {
1469            r = UINT64_MAX;
1470        }
1471        *(uint64_t *)(d + i) = r;
1472    }
1473}
1474
1475void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
1476{
1477    intptr_t i, oprsz = simd_oprsz(desc);
1478
1479    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1480        uint64_t ai = *(uint64_t *)(a + i);
1481        *(uint64_t *)(d + i) = (ai < b ? 0 : ai - b);
1482    }
1483}
1484
1485/* Two operand predicated copy immediate with merge.  All valid immediates
1486 * can fit within 17 signed bits in the simd_data field.
1487 */
1488void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
1489                         uint64_t mm, uint32_t desc)
1490{
1491    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1492    uint64_t *d = vd, *n = vn;
1493    uint8_t *pg = vg;
1494
1495    mm = dup_const(MO_8, mm);
1496    for (i = 0; i < opr_sz; i += 1) {
1497        uint64_t nn = n[i];
1498        uint64_t pp = expand_pred_b(pg[H1(i)]);
1499        d[i] = (mm & pp) | (nn & ~pp);
1500    }
1501}
1502
1503void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
1504                         uint64_t mm, uint32_t desc)
1505{
1506    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1507    uint64_t *d = vd, *n = vn;
1508    uint8_t *pg = vg;
1509
1510    mm = dup_const(MO_16, mm);
1511    for (i = 0; i < opr_sz; i += 1) {
1512        uint64_t nn = n[i];
1513        uint64_t pp = expand_pred_h(pg[H1(i)]);
1514        d[i] = (mm & pp) | (nn & ~pp);
1515    }
1516}
1517
1518void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
1519                         uint64_t mm, uint32_t desc)
1520{
1521    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1522    uint64_t *d = vd, *n = vn;
1523    uint8_t *pg = vg;
1524
1525    mm = dup_const(MO_32, mm);
1526    for (i = 0; i < opr_sz; i += 1) {
1527        uint64_t nn = n[i];
1528        uint64_t pp = expand_pred_s(pg[H1(i)]);
1529        d[i] = (mm & pp) | (nn & ~pp);
1530    }
1531}
1532
1533void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
1534                         uint64_t mm, uint32_t desc)
1535{
1536    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1537    uint64_t *d = vd, *n = vn;
1538    uint8_t *pg = vg;
1539
1540    for (i = 0; i < opr_sz; i += 1) {
1541        uint64_t nn = n[i];
1542        d[i] = (pg[H1(i)] & 1 ? mm : nn);
1543    }
1544}
1545
1546void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
1547{
1548    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1549    uint64_t *d = vd;
1550    uint8_t *pg = vg;
1551
1552    val = dup_const(MO_8, val);
1553    for (i = 0; i < opr_sz; i += 1) {
1554        d[i] = val & expand_pred_b(pg[H1(i)]);
1555    }
1556}
1557
1558void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
1559{
1560    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1561    uint64_t *d = vd;
1562    uint8_t *pg = vg;
1563
1564    val = dup_const(MO_16, val);
1565    for (i = 0; i < opr_sz; i += 1) {
1566        d[i] = val & expand_pred_h(pg[H1(i)]);
1567    }
1568}
1569
1570void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
1571{
1572    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1573    uint64_t *d = vd;
1574    uint8_t *pg = vg;
1575
1576    val = dup_const(MO_32, val);
1577    for (i = 0; i < opr_sz; i += 1) {
1578        d[i] = val & expand_pred_s(pg[H1(i)]);
1579    }
1580}
1581
1582void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
1583{
1584    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1585    uint64_t *d = vd;
1586    uint8_t *pg = vg;
1587
1588    for (i = 0; i < opr_sz; i += 1) {
1589        d[i] = (pg[H1(i)] & 1 ? val : 0);
1590    }
1591}
1592
1593/* Big-endian hosts need to frob the byte indices.  If the copy
1594 * happens to be 8-byte aligned, then no frobbing necessary.
1595 */
1596static void swap_memmove(void *vd, void *vs, size_t n)
1597{
1598    uintptr_t d = (uintptr_t)vd;
1599    uintptr_t s = (uintptr_t)vs;
1600    uintptr_t o = (d | s | n) & 7;
1601    size_t i;
1602
1603#ifndef HOST_WORDS_BIGENDIAN
1604    o = 0;
1605#endif
1606    switch (o) {
1607    case 0:
1608        memmove(vd, vs, n);
1609        break;
1610
1611    case 4:
1612        if (d < s || d >= s + n) {
1613            for (i = 0; i < n; i += 4) {
1614                *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
1615            }
1616        } else {
1617            for (i = n; i > 0; ) {
1618                i -= 4;
1619                *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
1620            }
1621        }
1622        break;
1623
1624    case 2:
1625    case 6:
1626        if (d < s || d >= s + n) {
1627            for (i = 0; i < n; i += 2) {
1628                *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
1629            }
1630        } else {
1631            for (i = n; i > 0; ) {
1632                i -= 2;
1633                *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
1634            }
1635        }
1636        break;
1637
1638    default:
1639        if (d < s || d >= s + n) {
1640            for (i = 0; i < n; i++) {
1641                *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
1642            }
1643        } else {
1644            for (i = n; i > 0; ) {
1645                i -= 1;
1646                *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
1647            }
1648        }
1649        break;
1650    }
1651}
1652
1653/* Similarly for memset of 0.  */
1654static void swap_memzero(void *vd, size_t n)
1655{
1656    uintptr_t d = (uintptr_t)vd;
1657    uintptr_t o = (d | n) & 7;
1658    size_t i;
1659
1660    /* Usually, the first bit of a predicate is set, so N is 0.  */
1661    if (likely(n == 0)) {
1662        return;
1663    }
1664
1665#ifndef HOST_WORDS_BIGENDIAN
1666    o = 0;
1667#endif
1668    switch (o) {
1669    case 0:
1670        memset(vd, 0, n);
1671        break;
1672
1673    case 4:
1674        for (i = 0; i < n; i += 4) {
1675            *(uint32_t *)H1_4(d + i) = 0;
1676        }
1677        break;
1678
1679    case 2:
1680    case 6:
1681        for (i = 0; i < n; i += 2) {
1682            *(uint16_t *)H1_2(d + i) = 0;
1683        }
1684        break;
1685
1686    default:
1687        for (i = 0; i < n; i++) {
1688            *(uint8_t *)H1(d + i) = 0;
1689        }
1690        break;
1691    }
1692}
1693
1694void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
1695{
1696    intptr_t opr_sz = simd_oprsz(desc);
1697    size_t n_ofs = simd_data(desc);
1698    size_t n_siz = opr_sz - n_ofs;
1699
1700    if (vd != vm) {
1701        swap_memmove(vd, vn + n_ofs, n_siz);
1702        swap_memmove(vd + n_siz, vm, n_ofs);
1703    } else if (vd != vn) {
1704        swap_memmove(vd + n_siz, vd, n_ofs);
1705        swap_memmove(vd, vn + n_ofs, n_siz);
1706    } else {
1707        /* vd == vn == vm.  Need temp space.  */
1708        ARMVectorReg tmp;
1709        swap_memmove(&tmp, vm, n_ofs);
1710        swap_memmove(vd, vd + n_ofs, n_siz);
1711        memcpy(vd + n_siz, &tmp, n_ofs);
1712    }
1713}
1714
1715#define DO_INSR(NAME, TYPE, H) \
1716void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
1717{                                                                  \
1718    intptr_t opr_sz = simd_oprsz(desc);                            \
1719    swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE));    \
1720    *(TYPE *)(vd + H(0)) = val;                                    \
1721}
1722
1723DO_INSR(sve_insr_b, uint8_t, H1)
1724DO_INSR(sve_insr_h, uint16_t, H1_2)
1725DO_INSR(sve_insr_s, uint32_t, H1_4)
1726DO_INSR(sve_insr_d, uint64_t, )
1727
1728#undef DO_INSR
1729
1730void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
1731{
1732    intptr_t i, j, opr_sz = simd_oprsz(desc);
1733    for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1734        uint64_t f = *(uint64_t *)(vn + i);
1735        uint64_t b = *(uint64_t *)(vn + j);
1736        *(uint64_t *)(vd + i) = bswap64(b);
1737        *(uint64_t *)(vd + j) = bswap64(f);
1738    }
1739}
1740
1741void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
1742{
1743    intptr_t i, j, opr_sz = simd_oprsz(desc);
1744    for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1745        uint64_t f = *(uint64_t *)(vn + i);
1746        uint64_t b = *(uint64_t *)(vn + j);
1747        *(uint64_t *)(vd + i) = hswap64(b);
1748        *(uint64_t *)(vd + j) = hswap64(f);
1749    }
1750}
1751
1752void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
1753{
1754    intptr_t i, j, opr_sz = simd_oprsz(desc);
1755    for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1756        uint64_t f = *(uint64_t *)(vn + i);
1757        uint64_t b = *(uint64_t *)(vn + j);
1758        *(uint64_t *)(vd + i) = rol64(b, 32);
1759        *(uint64_t *)(vd + j) = rol64(f, 32);
1760    }
1761}
1762
1763void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
1764{
1765    intptr_t i, j, opr_sz = simd_oprsz(desc);
1766    for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1767        uint64_t f = *(uint64_t *)(vn + i);
1768        uint64_t b = *(uint64_t *)(vn + j);
1769        *(uint64_t *)(vd + i) = b;
1770        *(uint64_t *)(vd + j) = f;
1771    }
1772}
1773
1774#define DO_TBL(NAME, TYPE, H) \
1775void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1776{                                                              \
1777    intptr_t i, opr_sz = simd_oprsz(desc);                     \
1778    uintptr_t elem = opr_sz / sizeof(TYPE);                    \
1779    TYPE *d = vd, *n = vn, *m = vm;                            \
1780    ARMVectorReg tmp;                                          \
1781    if (unlikely(vd == vn)) {                                  \
1782        n = memcpy(&tmp, vn, opr_sz);                          \
1783    }                                                          \
1784    for (i = 0; i < elem; i++) {                               \
1785        TYPE j = m[H(i)];                                      \
1786        d[H(i)] = j < elem ? n[H(j)] : 0;                      \
1787    }                                                          \
1788}
1789
1790DO_TBL(sve_tbl_b, uint8_t, H1)
1791DO_TBL(sve_tbl_h, uint16_t, H2)
1792DO_TBL(sve_tbl_s, uint32_t, H4)
1793DO_TBL(sve_tbl_d, uint64_t, )
1794
1795#undef DO_TBL
1796
1797#define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
1798void HELPER(NAME)(void *vd, void *vn, uint32_t desc)           \
1799{                                                              \
1800    intptr_t i, opr_sz = simd_oprsz(desc);                     \
1801    TYPED *d = vd;                                             \
1802    TYPES *n = vn;                                             \
1803    ARMVectorReg tmp;                                          \
1804    if (unlikely(vn - vd < opr_sz)) {                          \
1805        n = memcpy(&tmp, n, opr_sz / 2);                       \
1806    }                                                          \
1807    for (i = 0; i < opr_sz / sizeof(TYPED); i++) {             \
1808        d[HD(i)] = n[HS(i)];                                   \
1809    }                                                          \
1810}
1811
1812DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
1813DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
1814DO_UNPK(sve_sunpk_d, int64_t, int32_t, , H4)
1815
1816DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
1817DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
1818DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, , H4)
1819
1820#undef DO_UNPK
1821
1822/* Mask of bits included in the even numbered predicates of width esz.
1823 * We also use this for expand_bits/compress_bits, and so extend the
1824 * same pattern out to 16-bit units.
1825 */
1826static const uint64_t even_bit_esz_masks[5] = {
1827    0x5555555555555555ull,
1828    0x3333333333333333ull,
1829    0x0f0f0f0f0f0f0f0full,
1830    0x00ff00ff00ff00ffull,
1831    0x0000ffff0000ffffull,
1832};
1833
1834/* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
1835 * For N==0, this corresponds to the operation that in qemu/bitops.h
1836 * we call half_shuffle64; this algorithm is from Hacker's Delight,
1837 * section 7-2 Shuffling Bits.
1838 */
1839static uint64_t expand_bits(uint64_t x, int n)
1840{
1841    int i;
1842
1843    x &= 0xffffffffu;
1844    for (i = 4; i >= n; i--) {
1845        int sh = 1 << i;
1846        x = ((x << sh) | x) & even_bit_esz_masks[i];
1847    }
1848    return x;
1849}
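
/* For example, with N==0 each input bit moves to an even output position:
 * expand_bits(0b1011, 0) == 0b01000101.
 */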
1850
1851/* Compress units of 2**(N+1) bits to units of 2**N bits.
1852 * For N==0, this corresponds to the operation that in qemu/bitops.h
1853 * we call half_unshuffle64; this algorithm is from Hacker's Delight,
1854 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
1855 */
1856static uint64_t compress_bits(uint64_t x, int n)
1857{
1858    int i;
1859
1860    for (i = n; i <= 4; i++) {
1861        int sh = 1 << i;
1862        x &= even_bit_esz_masks[i];
1863        x = (x >> sh) | x;
1864    }
1865    return x & 0xffffffffu;
1866}
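
/* This is the inverse of expand_bits for the same N, e.g.
 * compress_bits(0b01000101, 0) == 0b1011.
 */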
1867
1868void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1869{
1870    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1871    int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1872    intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
1873    uint64_t *d = vd;
1874    intptr_t i;
1875
1876    if (oprsz <= 8) {
1877        uint64_t nn = *(uint64_t *)vn;
1878        uint64_t mm = *(uint64_t *)vm;
1879        int half = 4 * oprsz;
1880
1881        nn = extract64(nn, high * half, half);
1882        mm = extract64(mm, high * half, half);
1883        nn = expand_bits(nn, esz);
1884        mm = expand_bits(mm, esz);
1885        d[0] = nn + (mm << (1 << esz));
1886    } else {
1887        ARMPredicateReg tmp_n, tmp_m;
1888
1889        /* We produce output faster than we consume input.
1890           Therefore we must be mindful of possible overlap.  */
1891        if ((vn - vd) < (uintptr_t)oprsz) {
1892            vn = memcpy(&tmp_n, vn, oprsz);
1893        }
1894        if ((vm - vd) < (uintptr_t)oprsz) {
1895            vm = memcpy(&tmp_m, vm, oprsz);
1896        }
1897        if (high) {
1898            high = oprsz >> 1;
1899        }
1900
1901        if ((high & 3) == 0) {
1902            uint32_t *n = vn, *m = vm;
1903            high >>= 2;
1904
1905            for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
1906                uint64_t nn = n[H4(high + i)];
1907                uint64_t mm = m[H4(high + i)];
1908
1909                nn = expand_bits(nn, esz);
1910                mm = expand_bits(mm, esz);
1911                d[i] = nn + (mm << (1 << esz));
1912            }
1913        } else {
1914            uint8_t *n = vn, *m = vm;
1915            uint16_t *d16 = vd;
1916
1917            for (i = 0; i < oprsz / 2; i++) {
1918                uint16_t nn = n[H1(high + i)];
1919                uint16_t mm = m[H1(high + i)];
1920
1921                nn = expand_bits(nn, esz);
1922                mm = expand_bits(mm, esz);
1923                d16[H2(i)] = nn + (mm << (1 << esz));
1924            }
1925        }
1926    }
1927}
1928
1929void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1930{
1931    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1932    int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1933    int odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1) << esz;
1934    uint64_t *d = vd, *n = vn, *m = vm;
1935    uint64_t l, h;
1936    intptr_t i;
1937
1938    if (oprsz <= 8) {
1939        l = compress_bits(n[0] >> odd, esz);
1940        h = compress_bits(m[0] >> odd, esz);
1941        d[0] = extract64(l + (h << (4 * oprsz)), 0, 8 * oprsz);
1942    } else {
1943        ARMPredicateReg tmp_m;
1944        intptr_t oprsz_16 = oprsz / 16;
1945
1946        if ((vm - vd) < (uintptr_t)oprsz) {
1947            m = memcpy(&tmp_m, vm, oprsz);
1948        }
1949
1950        for (i = 0; i < oprsz_16; i++) {
1951            l = n[2 * i + 0];
1952            h = n[2 * i + 1];
1953            l = compress_bits(l >> odd, esz);
1954            h = compress_bits(h >> odd, esz);
1955            d[i] = l + (h << 32);
1956        }
1957
1958        /* For VL which is not a power of 2, the results from M do not
1959           align nicely with the uint64_t for D.  Put the aligned results
1960           from M into TMP_M and then copy it into place afterward.  */
1961        if (oprsz & 15) {
1962            d[i] = compress_bits(n[2 * i] >> odd, esz);
1963
1964            for (i = 0; i < oprsz_16; i++) {
1965                l = m[2 * i + 0];
1966                h = m[2 * i + 1];
1967                l = compress_bits(l >> odd, esz);
1968                h = compress_bits(h >> odd, esz);
1969                tmp_m.p[i] = l + (h << 32);
1970            }
1971            tmp_m.p[i] = compress_bits(m[2 * i] >> odd, esz);
1972
1973            swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
1974        } else {
1975            for (i = 0; i < oprsz_16; i++) {
1976                l = m[2 * i + 0];
1977                h = m[2 * i + 1];
1978                l = compress_bits(l >> odd, esz);
1979                h = compress_bits(h >> odd, esz);
1980                d[oprsz_16 + i] = l + (h << 32);
1981            }
1982        }
1983    }
1984}
1985
1986void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1987{
1988    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1989    uintptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1990    bool odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
1991    uint64_t *d = vd, *n = vn, *m = vm;
1992    uint64_t mask;
1993    int shr, shl;
1994    intptr_t i;
1995
1996    shl = 1 << esz;
1997    shr = 0;
1998    mask = even_bit_esz_masks[esz];
1999    if (odd) {
2000        mask <<= shl;
2001        shr = shl;
2002        shl = 0;
2003    }
2004
2005    for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
2006        uint64_t nn = (n[i] & mask) >> shr;
2007        uint64_t mm = (m[i] & mask) << shl;
2008        d[i] = nn + mm;
2009    }
2010}
2011
2012/* Reverse units of 2**N bits.  */
2013static uint64_t reverse_bits_64(uint64_t x, int n)
2014{
2015    int i, sh;
2016
2017    x = bswap64(x);
2018    for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
2019        uint64_t mask = even_bit_esz_masks[i];
2020        x = ((x & mask) << sh) | ((x >> sh) & mask);
2021    }
2022    return x;
2023}
2024
2025static uint8_t reverse_bits_8(uint8_t x, int n)
2026{
2027    static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
2028    int i, sh;
2029
2030    for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
2031        x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
2032    }
2033    return x;
2034}
2035
2036void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
2037{
2038    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2039    int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2040    intptr_t i, oprsz_2 = oprsz / 2;
2041
2042    if (oprsz <= 8) {
2043        uint64_t l = *(uint64_t *)vn;
2044        l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
2045        *(uint64_t *)vd = l;
2046    } else if ((oprsz & 15) == 0) {
2047        for (i = 0; i < oprsz_2; i += 8) {
2048            intptr_t ih = oprsz - 8 - i;
2049            uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
2050            uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
2051            *(uint64_t *)(vd + i) = h;
2052            *(uint64_t *)(vd + ih) = l;
2053        }
2054    } else {
2055        for (i = 0; i < oprsz_2; i += 1) {
2056            intptr_t il = H1(i);
2057            intptr_t ih = H1(oprsz - 1 - i);
2058            uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
2059            uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
2060            *(uint8_t *)(vd + il) = h;
2061            *(uint8_t *)(vd + ih) = l;
2062        }
2063    }
2064}
2065
2066void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
2067{
2068    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2069    intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
2070    uint64_t *d = vd;
2071    intptr_t i;
2072
2073    if (oprsz <= 8) {
2074        uint64_t nn = *(uint64_t *)vn;
2075        int half = 4 * oprsz;
2076
2077        nn = extract64(nn, high * half, half);
2078        nn = expand_bits(nn, 0);
2079        d[0] = nn;
2080    } else {
2081        ARMPredicateReg tmp_n;
2082
2083        /* We produce output faster than we consume input.
2084           Therefore we must be mindful of possible overlap.  */
2085        if ((vn - vd) < (uintptr_t)oprsz) {
2086            vn = memcpy(&tmp_n, vn, oprsz);
2087        }
2088        if (high) {
2089            high = oprsz >> 1;
2090        }
2091
2092        if ((high & 3) == 0) {
2093            uint32_t *n = vn;
2094            high >>= 2;
2095
2096            for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
2097                uint64_t nn = n[H4(high + i)];
2098                d[i] = expand_bits(nn, 0);
2099            }
2100        } else {
2101            uint16_t *d16 = vd;
2102            uint8_t *n = vn;
2103
2104            for (i = 0; i < oprsz / 2; i++) {
2105                uint16_t nn = n[H1(high + i)];
2106                d16[H2(i)] = expand_bits(nn, 0);
2107            }
2108        }
2109    }
2110}
2111
2112#define DO_ZIP(NAME, TYPE, H) \
2113void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)       \
2114{                                                                    \
2115    intptr_t oprsz = simd_oprsz(desc);                               \
2116    intptr_t i, oprsz_2 = oprsz / 2;                                 \
2117    ARMVectorReg tmp_n, tmp_m;                                       \
2118    /* We produce output faster than we consume input.               \
2119       Therefore we must be mindful of possible overlap.  */         \
2120    if (unlikely((vn - vd) < (uintptr_t)oprsz)) {                    \
2121        vn = memcpy(&tmp_n, vn, oprsz_2);                            \
2122    }                                                                \
2123    if (unlikely((vm - vd) < (uintptr_t)oprsz)) {                    \
2124        vm = memcpy(&tmp_m, vm, oprsz_2);                            \
2125    }                                                                \
2126    for (i = 0; i < oprsz_2; i += sizeof(TYPE)) {                    \
2127        *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + H(i));         \
2128        *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = *(TYPE *)(vm + H(i)); \
2129    }                                                                \
2130}
2131
2132DO_ZIP(sve_zip_b, uint8_t, H1)
2133DO_ZIP(sve_zip_h, uint16_t, H1_2)
2134DO_ZIP(sve_zip_s, uint32_t, H1_4)
2135DO_ZIP(sve_zip_d, uint64_t, )
2136
2137#define DO_UZP(NAME, TYPE, H) \
2138void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)         \
2139{                                                                      \
2140    intptr_t oprsz = simd_oprsz(desc);                                 \
2141    intptr_t oprsz_2 = oprsz / 2;                                      \
2142    intptr_t odd_ofs = simd_data(desc);                                \
2143    intptr_t i;                                                        \
2144    ARMVectorReg tmp_m;                                                \
2145    if (unlikely((vm - vd) < (uintptr_t)oprsz)) {                      \
2146        vm = memcpy(&tmp_m, vm, oprsz);                                \
2147    }                                                                  \
2148    for (i = 0; i < oprsz_2; i += sizeof(TYPE)) {                      \
2149        *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(2 * i + odd_ofs));     \
2150    }                                                                  \
2151    for (i = 0; i < oprsz_2; i += sizeof(TYPE)) {                      \
2152        *(TYPE *)(vd + H(oprsz_2 + i)) = *(TYPE *)(vm + H(2 * i + odd_ofs)); \
2153    }                                                                  \
2154}
2155
2156DO_UZP(sve_uzp_b, uint8_t, H1)
2157DO_UZP(sve_uzp_h, uint16_t, H1_2)
2158DO_UZP(sve_uzp_s, uint32_t, H1_4)
2159DO_UZP(sve_uzp_d, uint64_t, )
2160
2161#define DO_TRN(NAME, TYPE, H) \
2162void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)         \
2163{                                                                      \
2164    intptr_t oprsz = simd_oprsz(desc);                                 \
2165    intptr_t odd_ofs = simd_data(desc);                                \
2166    intptr_t i;                                                        \
2167    for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) {                    \
2168        TYPE ae = *(TYPE *)(vn + H(i + odd_ofs));                      \
2169        TYPE be = *(TYPE *)(vm + H(i + odd_ofs));                      \
2170        *(TYPE *)(vd + H(i + 0)) = ae;                                 \
2171        *(TYPE *)(vd + H(i + sizeof(TYPE))) = be;                      \
2172    }                                                                  \
2173}
2174
2175DO_TRN(sve_trn_b, uint8_t, H1)
2176DO_TRN(sve_trn_h, uint16_t, H1_2)
2177DO_TRN(sve_trn_s, uint32_t, H1_4)
2178DO_TRN(sve_trn_d, uint64_t, )
2179
2180#undef DO_ZIP
2181#undef DO_UZP
2182#undef DO_TRN
2183
2184void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
2185{
2186    intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
2187    uint32_t *d = vd, *n = vn;
2188    uint8_t *pg = vg;
2189
2190    for (i = j = 0; i < opr_sz; i++) {
2191        if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
2192            d[H4(j)] = n[H4(i)];
2193            j++;
2194        }
2195    }
2196    for (; j < opr_sz; j++) {
2197        d[H4(j)] = 0;
2198    }
2199}
2200
2201void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
2202{
2203    intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
2204    uint64_t *d = vd, *n = vn;
2205    uint8_t *pg = vg;
2206
2207    for (i = j = 0; i < opr_sz; i++) {
2208        if (pg[H1(i)] & 1) {
2209            d[j] = n[i];
2210            j++;
2211        }
2212    }
2213    for (; j < opr_sz; j++) {
2214        d[j] = 0;
2215    }
2216}
2217
2218/* Similar to the ARM LastActiveElement pseudocode function, except the
2219 * result is multiplied by the element size.  This includes the not found
2220 * indication; e.g. not found for esz=3 is -8.
2221 */
2222int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
2223{
2224    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2225    intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2226
2227    return last_active_element(vg, DIV_ROUND_UP(oprsz, 8), esz);
2228}
2229
2230void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
2231{
2232    intptr_t opr_sz = simd_oprsz(desc) / 8;
2233    int esz = simd_data(desc);
2234    uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
2235    intptr_t i, first_i, last_i;
2236    ARMVectorReg tmp;
2237
2238    first_i = last_i = 0;
2239    first_g = last_g = 0;
2240
2241    /* Find the extent of the active elements within VG.  */
2242    for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
2243        pg = *(uint64_t *)(vg + i) & mask;
2244        if (pg) {
2245            if (last_g == 0) {
2246                last_g = pg;
2247                last_i = i;
2248            }
2249            first_g = pg;
2250            first_i = i;
2251        }
2252    }
2253
2254    len = 0;
2255    if (first_g != 0) {
2256        first_i = first_i * 8 + ctz64(first_g);
2257        last_i = last_i * 8 + 63 - clz64(last_g);
2258        len = last_i - first_i + (1 << esz);
2259        if (vd == vm) {
2260            vm = memcpy(&tmp, vm, opr_sz * 8);
2261        }
2262        swap_memmove(vd, vn + first_i, len);
2263    }
2264    swap_memmove(vd + len, vm, opr_sz * 8 - len);
2265}
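
/* In other words, SPLICE copies the bytes spanning the first through last
 * active elements of N to the low end of D, then fills the remaining bytes
 * of D from the start of M.
 */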
2266
2267void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
2268                            void *vg, uint32_t desc)
2269{
2270    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2271    uint64_t *d = vd, *n = vn, *m = vm;
2272    uint8_t *pg = vg;
2273
2274    for (i = 0; i < opr_sz; i += 1) {
2275        uint64_t nn = n[i], mm = m[i];
2276        uint64_t pp = expand_pred_b(pg[H1(i)]);
2277        d[i] = (nn & pp) | (mm & ~pp);
2278    }
2279}
2280
2281void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
2282                            void *vg, uint32_t desc)
2283{
2284    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2285    uint64_t *d = vd, *n = vn, *m = vm;
2286    uint8_t *pg = vg;
2287
2288    for (i = 0; i < opr_sz; i += 1) {
2289        uint64_t nn = n[i], mm = m[i];
2290        uint64_t pp = expand_pred_h(pg[H1(i)]);
2291        d[i] = (nn & pp) | (mm & ~pp);
2292    }
2293}
2294
2295void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
2296                            void *vg, uint32_t desc)
2297{
2298    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2299    uint64_t *d = vd, *n = vn, *m = vm;
2300    uint8_t *pg = vg;
2301
2302    for (i = 0; i < opr_sz; i += 1) {
2303        uint64_t nn = n[i], mm = m[i];
2304        uint64_t pp = expand_pred_s(pg[H1(i)]);
2305        d[i] = (nn & pp) | (mm & ~pp);
2306    }
2307}
2308
2309void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
2310                            void *vg, uint32_t desc)
2311{
2312    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2313    uint64_t *d = vd, *n = vn, *m = vm;
2314    uint8_t *pg = vg;
2315
2316    for (i = 0; i < opr_sz; i += 1) {
2317        uint64_t nn = n[i], mm = m[i];
2318        d[i] = (pg[H1(i)] & 1 ? nn : mm);
2319    }
2320}
2321
2322/* Two operand comparison controlled by a predicate.
2323 * ??? It is very tempting to want to be able to expand this inline
2324 * with x86 instructions, e.g.
2325 *
2326 *    vcmpeqw    zm, zn, %ymm0
2327 *    vpmovmskb  %ymm0, %eax
2328 *    and        $0x5555, %eax
2329 *    and        pg, %eax
2330 *
2331 * or even aarch64, e.g.
2332 *
2333 *    // mask = 4000 1000 0400 0100 0040 0010 0004 0001
2334 *    cmeq       v0.8h, zn, zm
2335 *    and        v0.8h, v0.8h, mask
2336 *    addv       h0, v0.8h
2337 *    and        v0.8b, pg
2338 *
2339 * However, coming up with an abstraction that allows vector inputs and
2340 * a scalar output, and also handles the byte-ordering of sub-uint64_t
2341 * scalar outputs, is tricky.
2342 */
2343#define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK)                                 \
2344uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
2345{                                                                            \
2346    intptr_t opr_sz = simd_oprsz(desc);                                      \
2347    uint32_t flags = PREDTEST_INIT;                                          \
2348    intptr_t i = opr_sz;                                                     \
2349    do {                                                                     \
2350        uint64_t out = 0, pg;                                                \
2351        do {                                                                 \
2352            i -= sizeof(TYPE), out <<= sizeof(TYPE);                         \
2353            TYPE nn = *(TYPE *)(vn + H(i));                                  \
2354            TYPE mm = *(TYPE *)(vm + H(i));                                  \
2355            out |= nn OP mm;                                                 \
2356        } while (i & 63);                                                    \
2357        pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                            \
2358        out &= pg;                                                           \
2359        *(uint64_t *)(vd + (i >> 3)) = out;                                  \
2360        flags = iter_predtest_bwd(out, pg, flags);                           \
2361    } while (i > 0);                                                         \
2362    return flags;                                                            \
2363}
2364
2365#define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
2366    DO_CMP_PPZZ(NAME, TYPE, OP, H1,   0xffffffffffffffffull)
2367#define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
2368    DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
2369#define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
2370    DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
2371#define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
2372    DO_CMP_PPZZ(NAME, TYPE, OP,     , 0x0101010101010101ull)
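
/* The MASK values keep only the least significant predicate bit of each
 * element: every bit for bytes, every 2nd bit for halfwords, every 4th bit
 * for words, and every 8th bit for doublewords, matching the SVE predicate
 * layout of one bit per byte of the vector.
 */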
2373
2374DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t,  ==)
2375DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
2376DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
2377DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)
2378
2379DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t,  !=)
2380DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
2381DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
2382DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)
2383
2384DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t,  >)
2385DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
2386DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
2387DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)
2388
2389DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t,  >=)
2390DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
2391DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
2392DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)
2393
2394DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t,  >)
2395DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
2396DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
2397DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)
2398
2399DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t,  >=)
2400DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
2401DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
2402DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)
2403
2404#undef DO_CMP_PPZZ_B
2405#undef DO_CMP_PPZZ_H
2406#undef DO_CMP_PPZZ_S
2407#undef DO_CMP_PPZZ_D
2408#undef DO_CMP_PPZZ
2409
2410/* Similar, but the second source is "wide".  */
2411#define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK)                     \
2412uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
2413{                                                                            \
2414    intptr_t opr_sz = simd_oprsz(desc);                                      \
2415    uint32_t flags = PREDTEST_INIT;                                          \
2416    intptr_t i = opr_sz;                                                     \
2417    do {                                                                     \
2418        uint64_t out = 0, pg;                                                \
2419        do {                                                                 \
2420            TYPEW mm = *(TYPEW *)(vm + i - 8);                               \
2421            do {                                                             \
2422                i -= sizeof(TYPE), out <<= sizeof(TYPE);                     \
2423                TYPE nn = *(TYPE *)(vn + H(i));                              \
2424                out |= nn OP mm;                                             \
2425            } while (i & 7);                                                 \
2426        } while (i & 63);                                                    \
2427        pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                            \
2428        out &= pg;                                                           \
2429        *(uint64_t *)(vd + (i >> 3)) = out;                                  \
2430        flags = iter_predtest_bwd(out, pg, flags);                           \
2431    } while (i > 0);                                                         \
2432    return flags;                                                            \
2433}
2434
2435#define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
2436    DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1,   0xffffffffffffffffull)
2437#define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
2438    DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
2439#define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
2440    DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)
2441
2442DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t,  uint64_t, ==)
2443DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==)
2444DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==)
2445
2446DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t,  uint64_t, !=)
2447DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=)
2448DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=)
2449
2450DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t,   int64_t, >)
2451DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t,  int64_t, >)
2452DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t,  int64_t, >)
2453
2454DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t,   int64_t, >=)
2455DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t,  int64_t, >=)
2456DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t,  int64_t, >=)
2457
2458DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t,  uint64_t, >)
2459DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
2460DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)
2461
2462DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t,  uint64_t, >=)
2463DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
2464DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)
2465
2466DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t,   int64_t, <)
2467DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t,  int64_t, <)
2468DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t,  int64_t, <)
2469
2470DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t,   int64_t, <=)
2471DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t,  int64_t, <=)
2472DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t,  int64_t, <=)
2473
2474DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t,  uint64_t, <)
2475DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
2476DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)
2477
2478DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t,  uint64_t, <=)
2479DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
2480DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)
2481
2482#undef DO_CMP_PPZW_B
2483#undef DO_CMP_PPZW_H
2484#undef DO_CMP_PPZW_S
2485#undef DO_CMP_PPZW
2486
2487/* Similar, but the second source is immediate.  */
2488#define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK)                         \
2489uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)   \
2490{                                                                    \
2491    intptr_t opr_sz = simd_oprsz(desc);                              \
2492    uint32_t flags = PREDTEST_INIT;                                  \
2493    TYPE mm = simd_data(desc);                                       \
2494    intptr_t i = opr_sz;                                             \
2495    do {                                                             \
2496        uint64_t out = 0, pg;                                        \
2497        do {                                                         \
2498            i -= sizeof(TYPE), out <<= sizeof(TYPE);                 \
2499            TYPE nn = *(TYPE *)(vn + H(i));                          \
2500            out |= nn OP mm;                                         \
2501        } while (i & 63);                                            \
2502        pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                    \
2503        out &= pg;                                                   \
2504        *(uint64_t *)(vd + (i >> 3)) = out;                          \
2505        flags = iter_predtest_bwd(out, pg, flags);                   \
2506    } while (i > 0);                                                 \
2507    return flags;                                                    \
2508}
2509
2510#define DO_CMP_PPZI_B(NAME, TYPE, OP) \
2511    DO_CMP_PPZI(NAME, TYPE, OP, H1,   0xffffffffffffffffull)
2512#define DO_CMP_PPZI_H(NAME, TYPE, OP) \
2513    DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
2514#define DO_CMP_PPZI_S(NAME, TYPE, OP) \
2515    DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
2516#define DO_CMP_PPZI_D(NAME, TYPE, OP) \
2517    DO_CMP_PPZI(NAME, TYPE, OP,     , 0x0101010101010101ull)
2518
2519DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t,  ==)
2520DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
2521DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
2522DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)
2523
2524DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t,  !=)
2525DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
2526DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
2527DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)
2528
2529DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t,  >)
2530DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
2531DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
2532DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)
2533
2534DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t,  >=)
2535DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
2536DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
2537DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)
2538
2539DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t,  >)
2540DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
2541DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
2542DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)
2543
2544DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t,  >=)
2545DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
2546DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
2547DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)
2548
2549DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t,  <)
2550DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
2551DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
2552DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)
2553
2554DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t,  <=)
2555DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
2556DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
2557DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)
2558
2559DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t,  <)
2560DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
2561DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
2562DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)
2563
2564DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t,  <=)
2565DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
2566DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
2567DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)
2568
2569#undef DO_CMP_PPZI_B
2570#undef DO_CMP_PPZI_H
2571#undef DO_CMP_PPZI_S
2572#undef DO_CMP_PPZI_D
2573#undef DO_CMP_PPZI
2574
2575/* Similar to the ARM LastActive pseudocode function.  */
2576static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
2577{
2578    intptr_t i;
2579
2580    for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) {
2581        uint64_t pg = *(uint64_t *)(vg + i);
2582        if (pg) {
2583            return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0;
2584        }
2585    }
2586    return 0;
2587}
2588
2589/* Compute a mask into RETB that is true for all G, up to and including
2590 * (if after) or excluding (if !after) the first G & N.
2591 * Return true if BRK found.
2592 */
2593static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
2594                        bool brk, bool after)
2595{
2596    uint64_t b;
2597
2598    if (brk) {
2599        b = 0;
2600    } else if ((g & n) == 0) {
2601        /* For all G, no N are set; break not found.  */
2602        b = g;
2603    } else {
2604        /* Break somewhere in N.  Locate it.  */
2605        b = g & n;            /* guard true, pred true */
2606        b = b & -b;           /* first such */
2607        if (after) {
2608            b = b | (b - 1);  /* break after same */
2609        } else {
2610            b = b - 1;        /* break before same */
2611        }
2612        brk = true;
2613    }
2614
2615    *retb = b;
2616    return brk;
2617}
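
/* For example, with g == 0xff and n == 0x10 (first break at bit 4),
 * after==true yields b == 0x1f and after==false yields b == 0x0f.
 */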
2618
2619/* Compute a zeroing BRK.  */
2620static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
2621                          intptr_t oprsz, bool after)
2622{
2623    bool brk = false;
2624    intptr_t i;
2625
2626    for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2627        uint64_t this_b, this_g = g[i];
2628
2629        brk = compute_brk(&this_b, n[i], this_g, brk, after);
2630        d[i] = this_b & this_g;
2631    }
2632}
2633
2634/* Likewise, but also compute flags.  */
2635static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
2636                               intptr_t oprsz, bool after)
2637{
2638    uint32_t flags = PREDTEST_INIT;
2639    bool brk = false;
2640    intptr_t i;
2641
2642    for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2643        uint64_t this_b, this_d, this_g = g[i];
2644
2645        brk = compute_brk(&this_b, n[i], this_g, brk, after);
2646        d[i] = this_d = this_b & this_g;
2647        flags = iter_predtest_fwd(this_d, this_g, flags);
2648    }
2649    return flags;
2650}
2651
2652/* Compute a merging BRK.  */
2653static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
2654                          intptr_t oprsz, bool after)
2655{
2656    bool brk = false;
2657    intptr_t i;
2658
2659    for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2660        uint64_t this_b, this_g = g[i];
2661
2662        brk = compute_brk(&this_b, n[i], this_g, brk, after);
2663        d[i] = (this_b & this_g) | (d[i] & ~this_g);
2664    }
2665}
2666
2667/* Likewise, but also compute flags.  */
2668static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
2669                               intptr_t oprsz, bool after)
2670{
2671    uint32_t flags = PREDTEST_INIT;
2672    bool brk = false;
2673    intptr_t i;
2674
2675    for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2676        uint64_t this_b, this_d = d[i], this_g = g[i];
2677
2678        brk = compute_brk(&this_b, n[i], this_g, brk, after);
2679        d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
2680        flags = iter_predtest_fwd(this_d, this_g, flags);
2681    }
2682    return flags;
2683}
2684
2685static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
2686{
2687    /* It is quicker to zero the whole predicate than loop on OPRSZ.
2688     * The compiler should turn this into 4 64-bit integer stores.
2689     */
2690    memset(d, 0, sizeof(ARMPredicateReg));
2691    return PREDTEST_INIT;
2692}
2693
2694void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
2695                       uint32_t pred_desc)
2696{
2697    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2698    if (last_active_pred(vn, vg, oprsz)) {
2699        compute_brk_z(vd, vm, vg, oprsz, true);
2700    } else {
2701        do_zero(vd, oprsz);
2702    }
2703}
2704
2705uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
2706                            uint32_t pred_desc)
2707{
2708    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2709    if (last_active_pred(vn, vg, oprsz)) {
2710        return compute_brks_z(vd, vm, vg, oprsz, true);
2711    } else {
2712        return do_zero(vd, oprsz);
2713    }
2714}
2715
2716void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
2717                       uint32_t pred_desc)
2718{
2719    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2720    if (last_active_pred(vn, vg, oprsz)) {
2721        compute_brk_z(vd, vm, vg, oprsz, false);
2722    } else {
2723        do_zero(vd, oprsz);
2724    }
2725}
2726
2727uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
2728                            uint32_t pred_desc)
2729{
2730    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2731    if (last_active_pred(vn, vg, oprsz)) {
2732        return compute_brks_z(vd, vm, vg, oprsz, false);
2733    } else {
2734        return do_zero(vd, oprsz);
2735    }
2736}
2737
2738void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2739{
2740    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2741    compute_brk_z(vd, vn, vg, oprsz, true);
2742}
2743
2744uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2745{
2746    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2747    return compute_brks_z(vd, vn, vg, oprsz, true);
2748}
2749
2750void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2751{
2752    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2753    compute_brk_z(vd, vn, vg, oprsz, false);
2754}
2755
2756uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2757{
2758    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2759    return compute_brks_z(vd, vn, vg, oprsz, false);
2760}
2761
2762void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2763{
2764    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2765    compute_brk_m(vd, vn, vg, oprsz, true);
2766}
2767
2768uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2769{
2770    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2771    return compute_brks_m(vd, vn, vg, oprsz, true);
2772}
2773
2774void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2775{
2776    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2777    compute_brk_m(vd, vn, vg, oprsz, false);
2778}
2779
2780uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2781{
2782    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2783    return compute_brks_m(vd, vn, vg, oprsz, false);
2784}
2785
2786void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2787{
2788    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2789
2790    if (!last_active_pred(vn, vg, oprsz)) {
2791        do_zero(vd, oprsz);
2792    }
2793}
2794
2795/* As if PredTest(Ones(PL), D, esz).  */
2796static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
2797                              uint64_t esz_mask)
2798{
2799    uint32_t flags = PREDTEST_INIT;
2800    intptr_t i;
2801
2802    for (i = 0; i < oprsz / 8; i++) {
2803        flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
2804    }
2805    if (oprsz & 7) {
2806        uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
2807        flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
2808    }
2809    return flags;
2810}
2811
2812uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2813{
2814    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2815
2816    if (last_active_pred(vn, vg, oprsz)) {
2817        return predtest_ones(vd, oprsz, -1);
2818    } else {
2819        return do_zero(vd, oprsz);
2820    }
2821}
2822
2823uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
2824{
2825    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2826    intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2827    uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
2828    intptr_t i;
2829
2830    for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2831        uint64_t t = n[i] & g[i] & mask;
2832        sum += ctpop64(t);
2833    }
2834    return sum;
2835}
2836
2837uint32_t HELPER(sve_while)(void *vd, uint32_t count, uint32_t pred_desc)
2838{
2839    uintptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2840    intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2841    uint64_t esz_mask = pred_esz_masks[esz];
2842    ARMPredicateReg *d = vd;
2843    uint32_t flags;
2844    intptr_t i;
2845
2846    /* Begin with a zero predicate register.  */
2847    flags = do_zero(d, oprsz);
2848    if (count == 0) {
2849        return flags;
2850    }
2851
2852    /* Set all of the requested bits.  */
2853    for (i = 0; i < count / 64; ++i) {
2854        d->p[i] = esz_mask;
2855    }
2856    if (count & 63) {
2857        d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
2858    }
2859
2860    return predtest_ones(d, oprsz, esz_mask);
2861}
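
/* For the result to cover whole elements, COUNT must be given in predicate
 * bits, i.e. the element count already scaled by the element size; e.g.
 * three active halfword elements (esz == 1) give count == 6, hence
 * d->p[0] == 0x15 (predicate bits 0, 2 and 4 set).
 */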
2862
2863/* Recursive reduction on a function;
2864 * Cf. the ARM ARM function ReducePredicated.
2865 *
2866 * While it would be possible to write this without the DATA temporary,
2867 * it is much simpler to process the predicate register this way.
2868 * The recursion is bounded to depth 7 (128 fp16 elements), so there's
2869 * little to gain with a more complex non-recursive form.
2870 */
2871#define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT)                         \
2872static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
2873{                                                                     \
2874    if (n == 1) {                                                     \
2875        return *data;                                                 \
2876    } else {                                                          \
2877        uintptr_t half = n / 2;                                       \
2878        TYPE lo = NAME##_reduce(data, status, half);                  \
2879        TYPE hi = NAME##_reduce(data + half, status, half);           \
2880        return TYPE##_##FUNC(lo, hi, status);                         \
2881    }                                                                 \
2882}                                                                     \
2883uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc)    \
2884{                                                                     \
2885    uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_maxsz(desc);  \
2886    TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)];                   \
2887    for (i = 0; i < oprsz; ) {                                        \
2888        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));               \
2889        do {                                                          \
2890            TYPE nn = *(TYPE *)(vn + H(i));                           \
2891            *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT);      \
2892            i += sizeof(TYPE), pg >>= sizeof(TYPE);                   \
2893        } while (i & 15);                                             \
2894    }                                                                 \
2895    for (; i < maxsz; i += sizeof(TYPE)) {                            \
2896        *(TYPE *)((void *)data + i) = IDENT;                          \
2897    }                                                                 \
2898    return NAME##_reduce(data, vs, maxsz / sizeof(TYPE));             \
2899}
2900
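/*
 * For illustration (not part of the original source): with a 256-bit
 * vector of float32 elements, maxsz / sizeof(TYPE) == 8 and the
 * reduction above evaluates
 *     ((d0 + d1) + (d2 + d3)) + ((d4 + d5) + (d6 + d7))
 * for faddv, with inactive lanes already replaced by IDENT in the
 * DATA temporary so that they cannot affect the result.
 */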
2901DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero)
2902DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero)
2903DO_REDUCE(sve_faddv_d, float64,     , add, float64_zero)
2904
2905/* Identity is floatN_default_nan, without the function call.  */
2906DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00)
2907DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000)
2908DO_REDUCE(sve_fminnmv_d, float64,     , minnum, 0x7FF8000000000000ULL)
2909
2910DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00)
2911DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000)
2912DO_REDUCE(sve_fmaxnmv_d, float64,     , maxnum, 0x7FF8000000000000ULL)
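/*
 * The 0x7E00 / 0x7FC00000 / 0x7FF8000000000000 identities above are the
 * fp16/fp32/fp64 default NaN patterns; minnum and maxnum return the
 * numeric operand when the other input is a quiet NaN, so inactive
 * lanes filled with this identity never affect the result.
 */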
2913
2914DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity)
2915DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity)
2916DO_REDUCE(sve_fminv_d, float64,     , min, float64_infinity)
2917
2918DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity))
2919DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity))
2920DO_REDUCE(sve_fmaxv_d, float64,     , max, float64_chs(float64_infinity))
2921
2922#undef DO_REDUCE
2923
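/*
 * FADDA is a strictly-ordered accumulation: the scalar start value is
 * added to each active element in increasing element order, so unlike
 * the tree reductions above these helpers simply walk the vector from
 * element 0 upwards.
 */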
2924uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
2925                             void *status, uint32_t desc)
2926{
2927    intptr_t i = 0, opr_sz = simd_oprsz(desc);
2928    float16 result = nn;
2929
2930    do {
2931        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
2932        do {
2933            if (pg & 1) {
2934                float16 mm = *(float16 *)(vm + H1_2(i));
2935                result = float16_add(result, mm, status);
2936            }
2937            i += sizeof(float16), pg >>= sizeof(float16);
2938        } while (i & 15);
2939    } while (i < opr_sz);
2940
2941    return result;
2942}
2943
2944uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
2945                             void *status, uint32_t desc)
2946{
2947    intptr_t i = 0, opr_sz = simd_oprsz(desc);
2948    float32 result = nn;
2949
2950    do {
2951        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
2952        do {
2953            if (pg & 1) {
2954                float32 mm = *(float32 *)(vm + H1_4(i));
2955                result = float32_add(result, mm, status);
2956            }
2957            i += sizeof(float32), pg >>= sizeof(float32);
2958        } while (i & 15);
2959    } while (i < opr_sz);
2960
2961    return result;
2962}
2963
2964uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
2965                             void *status, uint32_t desc)
2966{
2967    intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
2968    uint64_t *m = vm;
2969    uint8_t *pg = vg;
2970
2971    for (i = 0; i < opr_sz; i++) {
2972        if (pg[H1(i)] & 1) {
2973            nn = float64_add(nn, m[i], status);
2974        }
2975    }
2976
2977    return nn;
2978}
2979
2980/* Fully general three-operand expander, controlled by a predicate,
2981 * with the extra float_status parameter.
2982 */
2983#define DO_ZPZZ_FP(NAME, TYPE, H, OP)                           \
2984void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,       \
2985                  void *status, uint32_t desc)                  \
2986{                                                               \
2987    intptr_t i = simd_oprsz(desc);                              \
2988    uint64_t *g = vg;                                           \
2989    do {                                                        \
2990        uint64_t pg = g[(i - 1) >> 6];                          \
2991        do {                                                    \
2992            i -= sizeof(TYPE);                                  \
2993            if (likely((pg >> (i & 63)) & 1)) {                 \
2994                TYPE nn = *(TYPE *)(vn + H(i));                 \
2995                TYPE mm = *(TYPE *)(vm + H(i));                 \
2996                *(TYPE *)(vd + H(i)) = OP(nn, mm, status);      \
2997            }                                                   \
2998        } while (i & 63);                                       \
2999    } while (i != 0);                                           \
3000}
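
/*
 * Note on the predicate test above: an SVE governing predicate holds
 * one bit per byte of the vector, and only the bit for the least
 * significant byte of each element is significant.  I is a byte offset
 * stepping by sizeof(TYPE), so (pg >> (i & 63)) & 1 selects exactly
 * that bit from the current 64-bit predicate word.
 */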
3001
3002DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
3003DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
3004DO_ZPZZ_FP(sve_fadd_d, uint64_t,     , float64_add)
3005
3006DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
3007DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
3008DO_ZPZZ_FP(sve_fsub_d, uint64_t,     , float64_sub)
3009
3010DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
3011DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
3012DO_ZPZZ_FP(sve_fmul_d, uint64_t,     , float64_mul)
3013
3014DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
3015DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
3016DO_ZPZZ_FP(sve_fdiv_d, uint64_t,     , float64_div)
3017
3018DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
3019DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
3020DO_ZPZZ_FP(sve_fmin_d, uint64_t,     , float64_min)
3021
3022DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
3023DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
3024DO_ZPZZ_FP(sve_fmax_d, uint64_t,     , float64_max)
3025
3026DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
3027DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
3028DO_ZPZZ_FP(sve_fminnum_d, uint64_t,     , float64_minnum)
3029
3030DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
3031DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
3032DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t,     , float64_maxnum)
3033
3034static inline float16 abd_h(float16 a, float16 b, float_status *s)
3035{
3036    return float16_abs(float16_sub(a, b, s));
3037}
3038
3039static inline float32 abd_s(float32 a, float32 b, float_status *s)
3040{
3041    return float32_abs(float32_sub(a, b, s));
3042}
3043
3044static inline float64 abd_d(float64 a, float64 b, float_status *s)
3045{
3046    return float64_abs(float64_sub(a, b, s));
3047}
3048
3049DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
3050DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
3051DO_ZPZZ_FP(sve_fabd_d, uint64_t,     , abd_d)
3052
3053static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
3054{
3055    int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
3056    return float64_scalbn(a, b_int, s);
3057}
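
/*
 * The exponent operand is a 64-bit integer element but float64_scalbn
 * takes an int; clamping to [INT_MIN, INT_MAX] is harmless because any
 * shift of that magnitude already saturates to an overflow or
 * underflow result.
 */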
3058
3059DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
3060DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
3061DO_ZPZZ_FP(sve_fscalbn_d, int64_t,     , scalbn_d)
3062
3063DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
3064DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
3065DO_ZPZZ_FP(sve_fmulx_d, uint64_t,     , helper_vfp_mulxd)
3066
3067#undef DO_ZPZZ_FP
3068
3069/* Three-operand expander, with one scalar operand, controlled by
3070 * a predicate, with the extra float_status parameter.
3071 */
3072#define DO_ZPZS_FP(NAME, TYPE, H, OP) \
3073void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar,  \
3074                  void *status, uint32_t desc)                    \
3075{                                                                 \
3076    intptr_t i = simd_oprsz(desc);                                \
3077    uint64_t *g = vg;                                             \
3078    TYPE mm = scalar;                                             \
3079    do {                                                          \
3080        uint64_t pg = g[(i - 1) >> 6];                            \
3081        do {                                                      \
3082            i -= sizeof(TYPE);                                    \
3083            if (likely((pg >> (i & 63)) & 1)) {                   \
3084                TYPE nn = *(TYPE *)(vn + H(i));                   \
3085                *(TYPE *)(vd + H(i)) = OP(nn, mm, status);        \
3086            }                                                     \
3087        } while (i & 63);                                         \
3088    } while (i != 0);                                             \
3089}
3090
3091DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
3092DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
3093DO_ZPZS_FP(sve_fadds_d, float64,     , float64_add)
3094
3095DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
3096DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
3097DO_ZPZS_FP(sve_fsubs_d, float64,     , float64_sub)
3098
3099DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
3100DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
3101DO_ZPZS_FP(sve_fmuls_d, float64,     , float64_mul)
3102
3103static inline float16 subr_h(float16 a, float16 b, float_status *s)
3104{
3105    return float16_sub(b, a, s);
3106}
3107
3108static inline float32 subr_s(float32 a, float32 b, float_status *s)
3109{
3110    return float32_sub(b, a, s);
3111}
3112
3113static inline float64 subr_d(float64 a, float64 b, float_status *s)
3114{
3115    return float64_sub(b, a, s);
3116}
3117
3118DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
3119DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
3120DO_ZPZS_FP(sve_fsubrs_d, float64,     , subr_d)
3121
3122DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
3123DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
3124DO_ZPZS_FP(sve_fmaxnms_d, float64,     , float64_maxnum)
3125
3126DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
3127DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
3128DO_ZPZS_FP(sve_fminnms_d, float64,     , float64_minnum)
3129
3130DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
3131DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
3132DO_ZPZS_FP(sve_fmaxs_d, float64,     , float64_max)
3133
3134DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
3135DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
3136DO_ZPZS_FP(sve_fmins_d, float64,     , float64_min)
3137
3138/* Fully general two-operand expander, controlled by a predicate,
3139 * with the extra float_status parameter.
3140 */
3141#define DO_ZPZ_FP(NAME, TYPE, H, OP)                                  \
3142void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
3143{                                                                     \
3144    intptr_t i = simd_oprsz(desc);                                    \
3145    uint64_t *g = vg;                                                 \
3146    do {                                                              \
3147        uint64_t pg = g[(i - 1) >> 6];                                \
3148        do {                                                          \
3149            i -= sizeof(TYPE);                                        \
3150            if (likely((pg >> (i & 63)) & 1)) {                       \
3151                TYPE nn = *(TYPE *)(vn + H(i));                       \
3152                *(TYPE *)(vd + H(i)) = OP(nn, status);                \
3153            }                                                         \
3154        } while (i & 63);                                             \
3155    } while (i != 0);                                                 \
3156}
3157
3158/* SVE fp16 conversions always use IEEE mode.  Like AdvSIMD, they ignore
3159 * FZ16.  When converting from fp16, this affects flushing input denormals;
3160 * when converting to fp16, this affects flushing output denormals.
3161 */
3162static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
3163{
3164    bool save = get_flush_inputs_to_zero(fpst);
3165    float32 ret;
3166
3167    set_flush_inputs_to_zero(false, fpst);
3168    ret = float16_to_float32(f, true, fpst);
3169    set_flush_inputs_to_zero(save, fpst);
3170    return ret;
3171}
3172
3173static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
3174{
3175    bool save = get_flush_inputs_to_zero(fpst);
3176    float64 ret;
3177
3178    set_flush_inputs_to_zero(false, fpst);
3179    ret = float16_to_float64(f, true, fpst);
3180    set_flush_inputs_to_zero(save, fpst);
3181    return ret;
3182}
3183
3184static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
3185{
3186    bool save = get_flush_to_zero(fpst);
3187    float16 ret;
3188
3189    set_flush_to_zero(false, fpst);
3190    ret = float32_to_float16(f, true, fpst);
3191    set_flush_to_zero(save, fpst);
3192    return ret;
3193}
3194
3195static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
3196{
3197    bool save = get_flush_to_zero(fpst);
3198    float16 ret;
3199
3200    set_flush_to_zero(false, fpst);
3201    ret = float64_to_float16(f, true, fpst);
3202    set_flush_to_zero(save, fpst);
3203    return ret;
3204}
3205
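/*
 * The Arm FPToFixed pseudocode converts a NaN input to zero (after
 * raising Invalid), whereas the generic softfloat conversions would
 * return a saturated result; hence the explicit NaN checks in the
 * wrappers below.
 */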
3206static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
3207{
3208    if (float16_is_any_nan(f)) {
3209        float_raise(float_flag_invalid, s);
3210        return 0;
3211    }
3212    return float16_to_int16_round_to_zero(f, s);
3213}
3214
3215static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
3216{
3217    if (float16_is_any_nan(f)) {
3218        float_raise(float_flag_invalid, s);
3219        return 0;
3220    }
3221    return float16_to_int64_round_to_zero(f, s);
3222}
3223
3224static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
3225{
3226    if (float32_is_any_nan(f)) {
3227        float_raise(float_flag_invalid, s);
3228        return 0;
3229    }
3230    return float32_to_int64_round_to_zero(f, s);
3231}
3232
3233static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
3234{
3235    if (float64_is_any_nan(f)) {
3236        float_raise(float_flag_invalid, s);
3237        return 0;
3238    }
3239    return float64_to_int64_round_to_zero(f, s);
3240}
3241
3242static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
3243{
3244    if (float16_is_any_nan(f)) {
3245        float_raise(float_flag_invalid, s);
3246        return 0;
3247    }
3248    return float16_to_uint16_round_to_zero(f, s);
3249}
3250
3251static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
3252{
3253    if (float16_is_any_nan(f)) {
3254        float_raise(float_flag_invalid, s);
3255        return 0;
3256    }
3257    return float16_to_uint64_round_to_zero(f, s);
3258}
3259
3260static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
3261{
3262    if (float32_is_any_nan(f)) {
3263        float_raise(float_flag_invalid, s);
3264        return 0;
3265    }
3266    return float32_to_uint64_round_to_zero(f, s);
3267}
3268
3269static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
3270{
3271    if (float64_is_any_nan(f)) {
3272        float_raise(float_flag_invalid, s);
3273        return 0;
3274    }
3275    return float64_to_uint64_round_to_zero(f, s);
3276}
3277
3278DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
3279DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
3280DO_ZPZ_FP(sve_fcvt_dh, uint64_t,     , sve_f64_to_f16)
3281DO_ZPZ_FP(sve_fcvt_hd, uint64_t,     , sve_f16_to_f64)
3282DO_ZPZ_FP(sve_fcvt_ds, uint64_t,     , float64_to_float32)
3283DO_ZPZ_FP(sve_fcvt_sd, uint64_t,     , float32_to_float64)
3284
3285DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
3286DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
3287DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
3288DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t,     , vfp_float16_to_int64_rtz)
3289DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t,     , vfp_float32_to_int64_rtz)
3290DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t,     , helper_vfp_tosizd)
3291DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t,     , vfp_float64_to_int64_rtz)
3292
3293DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
3294DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
3295DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
3296DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t,     , vfp_float16_to_uint64_rtz)
3297DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t,     , vfp_float32_to_uint64_rtz)
3298DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t,     , helper_vfp_touizd)
3299DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t,     , vfp_float64_to_uint64_rtz)
3300
3301DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
3302DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
3303DO_ZPZ_FP(sve_frint_d, uint64_t,     , helper_rintd)
3304
3305DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
3306DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
3307DO_ZPZ_FP(sve_frintx_d, uint64_t,     , float64_round_to_int)
3308
3309DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
3310DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
3311DO_ZPZ_FP(sve_frecpx_d, uint64_t,     , helper_frecpx_f64)
3312
3313DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
3314DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
3315DO_ZPZ_FP(sve_fsqrt_d, uint64_t,     , float64_sqrt)
3316
3317DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
3318DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
3319DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
3320DO_ZPZ_FP(sve_scvt_sd, uint64_t,     , int32_to_float64)
3321DO_ZPZ_FP(sve_scvt_dh, uint64_t,     , int64_to_float16)
3322DO_ZPZ_FP(sve_scvt_ds, uint64_t,     , int64_to_float32)
3323DO_ZPZ_FP(sve_scvt_dd, uint64_t,     , int64_to_float64)
3324
3325DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
3326DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
3327DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
3328DO_ZPZ_FP(sve_ucvt_sd, uint64_t,     , uint32_to_float64)
3329DO_ZPZ_FP(sve_ucvt_dh, uint64_t,     , uint64_to_float16)
3330DO_ZPZ_FP(sve_ucvt_ds, uint64_t,     , uint64_to_float32)
3331DO_ZPZ_FP(sve_ucvt_dd, uint64_t,     , uint64_to_float64)
3332
3333#undef DO_ZPZ_FP
3334
3335static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg,
3336                            float_status *status, uint32_t desc,
3337                            uint16_t neg1, uint16_t neg3)
3338{
3339    intptr_t i = simd_oprsz(desc);
3340    uint64_t *g = vg;
3341
3342    do {
3343        uint64_t pg = g[(i - 1) >> 6];
3344        do {
3345            i -= 2;
3346            if (likely((pg >> (i & 63)) & 1)) {
3347                float16 e1, e2, e3, r;
3348
3349                e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
3350                e2 = *(uint16_t *)(vm + H1_2(i));
3351                e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
3352                r = float16_muladd(e1, e2, e3, 0, status);
3353                *(uint16_t *)(vd + H1_2(i)) = r;
3354            }
3355        } while (i & 63);
3356    } while (i != 0);
3357}
3358
3359void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
3360                              void *vg, void *status, uint32_t desc)
3361{
3362    do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0);
3363}
3364
3365void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
3366                              void *vg, void *status, uint32_t desc)
3367{
3368    do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0);
3369}
3370
3371void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
3372                               void *vg, void *status, uint32_t desc)
3373{
3374    do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000);
3375}
3376
3377void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
3378                               void *vg, void *status, uint32_t desc)
3379{
3380    do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000);
3381}
3382
3383static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg,
3384                            float_status *status, uint32_t desc,
3385                            uint32_t neg1, uint32_t neg3)
3386{
3387    intptr_t i = simd_oprsz(desc);
3388    uint64_t *g = vg;
3389
3390    do {
3391        uint64_t pg = g[(i - 1) >> 6];
3392        do {
3393            i -= 4;
3394            if (likely((pg >> (i & 63)) & 1)) {
3395                float32 e1, e2, e3, r;
3396
3397                e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
3398                e2 = *(uint32_t *)(vm + H1_4(i));
3399                e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
3400                r = float32_muladd(e1, e2, e3, 0, status);
3401                *(uint32_t *)(vd + H1_4(i)) = r;
3402            }
3403        } while (i & 63);
3404    } while (i != 0);
3405}
3406
3407void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
3408                              void *vg, void *status, uint32_t desc)
3409{
3410    do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0);
3411}
3412
3413void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
3414                              void *vg, void *status, uint32_t desc)
3415{
3416    do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0);
3417}
3418
3419void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
3420                               void *vg, void *status, uint32_t desc)
3421{
3422    do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000);
3423}
3424
3425void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
3426                               void *vg, void *status, uint32_t desc)
3427{
3428    do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000);
3429}
3430
3431static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg,
3432                            float_status *status, uint32_t desc,
3433                            uint64_t neg1, uint64_t neg3)
3434{
3435    intptr_t i = simd_oprsz(desc);
3436    uint64_t *g = vg;
3437
3438    do {
3439        uint64_t pg = g[(i - 1) >> 6];
3440        do {
3441            i -= 8;
3442            if (likely((pg >> (i & 63)) & 1)) {
3443                float64 e1, e2, e3, r;
3444
3445                e1 = *(uint64_t *)(vn + i) ^ neg1;
3446                e2 = *(uint64_t *)(vm + i);
3447                e3 = *(uint64_t *)(va + i) ^ neg3;
3448                r = float64_muladd(e1, e2, e3, 0, status);
3449                *(uint64_t *)(vd + i) = r;
3450            }
3451        } while (i & 63);
3452    } while (i != 0);
3453}
3454
3455void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
3456                              void *vg, void *status, uint32_t desc)
3457{
3458    do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0);
3459}
3460
3461void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
3462                              void *vg, void *status, uint32_t desc)
3463{
3464    do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0);
3465}
3466
3467void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
3468                               void *vg, void *status, uint32_t desc)
3469{
3470    do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN);
3471}
3472
3473void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
3474                               void *vg, void *status, uint32_t desc)
3475{
3476    do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN);
3477}
3478
3479/* Two operand floating-point comparison controlled by a predicate.
3480 * Unlike the integer version, we are not allowed to optimistically
3481 * compare operands, since the comparison may have side effects
3482 * with respect to the FPSR.
3483 */
3484#define DO_FPCMP_PPZZ(NAME, TYPE, H, OP)                                \
3485void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,               \
3486                  void *status, uint32_t desc)                          \
3487{                                                                       \
3488    intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6;                    \
3489    uint64_t *d = vd, *g = vg;                                          \
3490    do {                                                                \
3491        uint64_t out = 0, pg = g[j];                                    \
3492        do {                                                            \
3493            i -= sizeof(TYPE), out <<= sizeof(TYPE);                    \
3494            if (likely((pg >> (i & 63)) & 1)) {                         \
3495                TYPE nn = *(TYPE *)(vn + H(i));                         \
3496                TYPE mm = *(TYPE *)(vm + H(i));                         \
3497                out |= OP(TYPE, nn, mm, status);                        \
3498            }                                                           \
3499        } while (i & 63);                                               \
3500        d[j--] = out;                                                   \
3501    } while (i > 0);                                                    \
3502}
3503
3504#define DO_FPCMP_PPZZ_H(NAME, OP) \
3505    DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
3506#define DO_FPCMP_PPZZ_S(NAME, OP) \
3507    DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
3508#define DO_FPCMP_PPZZ_D(NAME, OP) \
3509    DO_FPCMP_PPZZ(NAME##_d, float64,     , OP)
3510
3511#define DO_FPCMP_PPZZ_ALL(NAME, OP) \
3512    DO_FPCMP_PPZZ_H(NAME, OP)   \
3513    DO_FPCMP_PPZZ_S(NAME, OP)   \
3514    DO_FPCMP_PPZZ_D(NAME, OP)
3515
3516#define DO_FCMGE(TYPE, X, Y, ST)  TYPE##_compare(Y, X, ST) <= 0
3517#define DO_FCMGT(TYPE, X, Y, ST)  TYPE##_compare(Y, X, ST) < 0
3518#define DO_FCMLE(TYPE, X, Y, ST)  TYPE##_compare(X, Y, ST) <= 0
3519#define DO_FCMLT(TYPE, X, Y, ST)  TYPE##_compare(X, Y, ST) < 0
3520#define DO_FCMEQ(TYPE, X, Y, ST)  TYPE##_compare_quiet(X, Y, ST) == 0
3521#define DO_FCMNE(TYPE, X, Y, ST)  TYPE##_compare_quiet(X, Y, ST) != 0
3522#define DO_FCMUO(TYPE, X, Y, ST)  \
3523    TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
3524#define DO_FACGE(TYPE, X, Y, ST)  \
3525    TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
3526#define DO_FACGT(TYPE, X, Y, ST)  \
3527    TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0
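
/*
 * The ordered comparisons (GE, GT and the absolute-value forms) use the
 * signaling compare, so any NaN input raises Invalid; EQ, NE and UO use
 * the quiet compare, where only a signaling NaN raises Invalid.  GE and
 * GT are written with swapped operands so that "X >= Y" becomes
 * "compare(Y, X) <= 0".  In the expander above, OUT is shifted left by
 * sizeof(TYPE) before each element is folded in, so the result bit for
 * the element at byte offset I ends up at predicate bit I.
 */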
3528
3529DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
3530DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
3531DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
3532DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
3533DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
3534DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
3535DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)
3536
3537#undef DO_FPCMP_PPZZ_ALL
3538#undef DO_FPCMP_PPZZ_D
3539#undef DO_FPCMP_PPZZ_S
3540#undef DO_FPCMP_PPZZ_H
3541#undef DO_FPCMP_PPZZ
3542
3543/* One operand floating-point comparison against zero, controlled
3544 * by a predicate.
3545 */
3546#define DO_FPCMP_PPZ0(NAME, TYPE, H, OP)                   \
3547void HELPER(NAME)(void *vd, void *vn, void *vg,            \
3548                  void *status, uint32_t desc)             \
3549{                                                          \
3550    intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6;       \
3551    uint64_t *d = vd, *g = vg;                             \
3552    do {                                                   \
3553        uint64_t out = 0, pg = g[j];                       \
3554        do {                                               \
3555            i -= sizeof(TYPE), out <<= sizeof(TYPE);       \
3556            if ((pg >> (i & 63)) & 1) {                    \
3557                TYPE nn = *(TYPE *)(vn + H(i));            \
3558                out |= OP(TYPE, nn, 0, status);            \
3559            }                                              \
3560        } while (i & 63);                                  \
3561        d[j--] = out;                                      \
3562    } while (i > 0);                                       \
3563}
3564
3565#define DO_FPCMP_PPZ0_H(NAME, OP) \
3566    DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
3567#define DO_FPCMP_PPZ0_S(NAME, OP) \
3568    DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
3569#define DO_FPCMP_PPZ0_D(NAME, OP) \
3570    DO_FPCMP_PPZ0(NAME##_d, float64,     , OP)
3571
3572#define DO_FPCMP_PPZ0_ALL(NAME, OP) \
3573    DO_FPCMP_PPZ0_H(NAME, OP)   \
3574    DO_FPCMP_PPZ0_S(NAME, OP)   \
3575    DO_FPCMP_PPZ0_D(NAME, OP)
3576
3577DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
3578DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
3579DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
3580DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
3581DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
3582DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
3583
3584/* FP Trig Multiply-Add. */
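/*
 * The coefficient tables below follow the FTMAD pseudocode: the first
 * eight entries are (approximately) the Taylor series coefficients for
 * sin(x), the second eight those for cos(x), with the sign of the Zm
 * element selecting the second half via the "xx += 8" below.
 */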
3585
3586void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3587{
3588    static const float16 coeff[16] = {
3589        0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
3590        0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
3591    };
3592    intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
3593    intptr_t x = simd_data(desc);
3594    float16 *d = vd, *n = vn, *m = vm;
3595    for (i = 0; i < opr_sz; i++) {
3596        float16 mm = m[i];
3597        intptr_t xx = x;
3598        if (float16_is_neg(mm)) {
3599            mm = float16_abs(mm);
3600            xx += 8;
3601        }
3602        d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs);
3603    }
3604}
3605
3606void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3607{
3608    static const float32 coeff[16] = {
3609        0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
3610        0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
3611        0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
3612        0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
3613    };
3614    intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
3615    intptr_t x = simd_data(desc);
3616    float32 *d = vd, *n = vn, *m = vm;
3617    for (i = 0; i < opr_sz; i++) {
3618        float32 mm = m[i];
3619        intptr_t xx = x;
3620        if (float32_is_neg(mm)) {
3621            mm = float32_abs(mm);
3622            xx += 8;
3623        }
3624        d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs);
3625    }
3626}
3627
3628void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3629{
3630    static const float64 coeff[16] = {
3631        0x3ff0000000000000ull, 0xbfc5555555555543ull,
3632        0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
3633        0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
3634        0x3de5d8408868552full, 0x0000000000000000ull,
3635        0x3ff0000000000000ull, 0xbfe0000000000000ull,
3636        0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
3637        0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
3638        0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
3639    };
3640    intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
3641    intptr_t x = simd_data(desc);
3642    float64 *d = vd, *n = vn, *m = vm;
3643    for (i = 0; i < opr_sz; i++) {
3644        float64 mm = m[i];
3645        intptr_t xx = x;
3646        if (float64_is_neg(mm)) {
3647            mm = float64_abs(mm);
3648            xx += 8;
3649        }
3650        d[i] = float64_muladd(n[i], mm, coeff[xx], 0, vs);
3651    }
3652}
3653
3654/*
3655 * FP Complex Add
3656 */
3657
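/*
 * The rot bit from the descriptor selects the rotation: rot == 0 is the
 * #90 form, d = (n_re - m_im, n_im + m_re), and rot == 1 is the #270
 * form, d = (n_re + m_im, n_im - m_re).  The subtractions are performed
 * by XORing a sign bit into the relevant m element before the add.
 */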
3658void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
3659                         void *vs, uint32_t desc)
3660{
3661    intptr_t j, i = simd_oprsz(desc);
3662    uint64_t *g = vg;
3663    float16 neg_imag = float16_set_sign(0, simd_data(desc));
3664    float16 neg_real = float16_chs(neg_imag);
3665
3666    do {
3667        uint64_t pg = g[(i - 1) >> 6];
3668        do {
3669            float16 e0, e1, e2, e3;
3670
3671            /* I holds the real index; J holds the imag index.  */
3672            j = i - sizeof(float16);
3673            i -= 2 * sizeof(float16);
3674
3675            e0 = *(float16 *)(vn + H1_2(i));
3676            e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real;
3677            e2 = *(float16 *)(vn + H1_2(j));
3678            e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag;
3679
3680            if (likely((pg >> (i & 63)) & 1)) {
3681                *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs);
3682            }
3683            if (likely((pg >> (j & 63)) & 1)) {
3684                *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs);
3685            }
3686        } while (i & 63);
3687    } while (i != 0);
3688}
3689
3690void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
3691                         void *vs, uint32_t desc)
3692{
3693    intptr_t j, i = simd_oprsz(desc);
3694    uint64_t *g = vg;
3695    float32 neg_imag = float32_set_sign(0, simd_data(desc));
3696    float32 neg_real = float32_chs(neg_imag);
3697
3698    do {
3699        uint64_t pg = g[(i - 1) >> 6];
3700        do {
3701            float32 e0, e1, e2, e3;
3702
3703            /* I holds the real index; J holds the imag index.  */
3704            j = i - sizeof(float32);
3705            i -= 2 * sizeof(float32);
3706
3707            e0 = *(float32 *)(vn + H1_4(i));
3708            e1 = *(float32 *)(vm + H1_4(j)) ^ neg_real;
3709            e2 = *(float32 *)(vn + H1_4(j));
3710            e3 = *(float32 *)(vm + H1_4(i)) ^ neg_imag;
3711
3712            if (likely((pg >> (i & 63)) & 1)) {
3713                *(float32 *)(vd + H1_4(i)) = float32_add(e0, e1, vs);
3714            }
3715            if (likely((pg >> (j & 63)) & 1)) {
3716                *(float32 *)(vd + H1_4(j)) = float32_add(e2, e3, vs);
3717            }
3718        } while (i & 63);
3719    } while (i != 0);
3720}
3721
3722void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
3723                         void *vs, uint32_t desc)
3724{
3725    intptr_t j, i = simd_oprsz(desc);
3726    uint64_t *g = vg;
3727    float64 neg_imag = float64_set_sign(0, simd_data(desc));
3728    float64 neg_real = float64_chs(neg_imag);
3729
3730    do {
3731        uint64_t pg = g[(i - 1) >> 6];
3732        do {
3733            float64 e0, e1, e2, e3;
3734
3735            /* I holds the real index; J holds the imag index.  */
3736            j = i - sizeof(float64);
3737            i -= 2 * sizeof(float64);
3738
3739            e0 = *(float64 *)(vn + i);
3740            e1 = *(float64 *)(vm + j) ^ neg_real;
3741            e2 = *(float64 *)(vn + j);
3742            e3 = *(float64 *)(vm + i) ^ neg_imag;
3743
3744            if (likely((pg >> (i & 63)) & 1)) {
3745                *(float64 *)(vd + i) = float64_add(e0, e1, vs);
3746            }
3747            if (likely((pg >> (j & 63)) & 1)) {
3748                *(float64 *)(vd + j) = float64_add(e2, e3, vs);
3749            }
3750        } while (i & 63);
3751    } while (i != 0);
3752}
3753
3754/*
3755 * FP Complex Multiply
3756 */
3757
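/*
 * The two-bit rotation from the descriptor maps onto the FCMLA forms as
 * follows (per element pair, accumulating into d):
 *   rot 0 (  #0): d_re += n_re * m_re;  d_im += n_re * m_im
 *   rot 1 ( #90): d_re -= n_im * m_im;  d_im += n_im * m_re
 *   rot 2 (#180): d_re -= n_re * m_re;  d_im -= n_re * m_im
 *   rot 3 (#270): d_re += n_im * m_im;  d_im -= n_im * m_re
 * FLIP selects which half of n is used, and the neg_* constants apply
 * the sign changes by toggling the sign bit of the m operand.
 */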
3758void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
3759                               void *vg, void *status, uint32_t desc)
3760{
3761    intptr_t j, i = simd_oprsz(desc);
3762    unsigned rot = simd_data(desc);
3763    bool flip = rot & 1;
3764    float16 neg_imag, neg_real;
3765    uint64_t *g = vg;
3766
3767    neg_imag = float16_set_sign(0, (rot & 2) != 0);
3768    neg_real = float16_set_sign(0, rot == 1 || rot == 2);
3769
3770    do {
3771        uint64_t pg = g[(i - 1) >> 6];
3772        do {
3773            float16 e1, e2, e3, e4, nr, ni, mr, mi, d;
3774
3775            /* I holds the real index; J holds the imag index.  */
3776            j = i - sizeof(float16);
3777            i -= 2 * sizeof(float16);
3778
3779            nr = *(float16 *)(vn + H1_2(i));
3780            ni = *(float16 *)(vn + H1_2(j));
3781            mr = *(float16 *)(vm + H1_2(i));
3782            mi = *(float16 *)(vm + H1_2(j));
3783
3784            e2 = (flip ? ni : nr);
3785            e1 = (flip ? mi : mr) ^ neg_real;
3786            e4 = e2;
3787            e3 = (flip ? mr : mi) ^ neg_imag;
3788
3789            if (likely((pg >> (i & 63)) & 1)) {
3790                d = *(float16 *)(va + H1_2(i));
3791                d = float16_muladd(e2, e1, d, 0, status);
3792                *(float16 *)(vd + H1_2(i)) = d;
3793            }
3794            if (likely((pg >> (j & 63)) & 1)) {
3795                d = *(float16 *)(va + H1_2(j));
3796                d = float16_muladd(e4, e3, d, 0, status);
3797                *(float16 *)(vd + H1_2(j)) = d;
3798            }
3799        } while (i & 63);
3800    } while (i != 0);
3801}
3802
3803void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
3804                               void *vg, void *status, uint32_t desc)
3805{
3806    intptr_t j, i = simd_oprsz(desc);
3807    unsigned rot = simd_data(desc);
3808    bool flip = rot & 1;
3809    float32 neg_imag, neg_real;
3810    uint64_t *g = vg;
3811
3812    neg_imag = float32_set_sign(0, (rot & 2) != 0);
3813    neg_real = float32_set_sign(0, rot == 1 || rot == 2);
3814
3815    do {
3816        uint64_t pg = g[(i - 1) >> 6];
3817        do {
3818            float32 e1, e2, e3, e4, nr, ni, mr, mi, d;
3819
3820            /* I holds the real index; J holds the imag index.  */
3821            j = i - sizeof(float32);
3822            i -= 2 * sizeof(float32);
3823
3824            nr = *(float32 *)(vn + H1_4(i));
3825            ni = *(float32 *)(vn + H1_4(j));
3826            mr = *(float32 *)(vm + H1_4(i));
3827            mi = *(float32 *)(vm + H1_4(j));
3828
3829            e2 = (flip ? ni : nr);
3830            e1 = (flip ? mi : mr) ^ neg_real;
3831            e4 = e2;
3832            e3 = (flip ? mr : mi) ^ neg_imag;
3833
3834            if (likely((pg >> (i & 63)) & 1)) {
3835                d = *(float32 *)(va + H1_4(i));
3836                d = float32_muladd(e2, e1, d, 0, status);
3837                *(float32 *)(vd + H1_4(i)) = d;
3838            }
3839            if (likely((pg >> (j & 63)) & 1)) {
3840                d = *(float32 *)(va + H1_4(j));
3841                d = float32_muladd(e4, e3, d, 0, status);
3842                *(float32 *)(vd + H1_4(j)) = d;
3843            }
3844        } while (i & 63);
3845    } while (i != 0);
3846}
3847
3848void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
3849                               void *vg, void *status, uint32_t desc)
3850{
3851    intptr_t j, i = simd_oprsz(desc);
3852    unsigned rot = simd_data(desc);
3853    bool flip = rot & 1;
3854    float64 neg_imag, neg_real;
3855    uint64_t *g = vg;
3856
3857    neg_imag = float64_set_sign(0, (rot & 2) != 0);
3858    neg_real = float64_set_sign(0, rot == 1 || rot == 2);
3859
3860    do {
3861        uint64_t pg = g[(i - 1) >> 6];
3862        do {
3863            float64 e1, e2, e3, e4, nr, ni, mr, mi, d;
3864
3865            /* I holds the real index; J holds the imag index.  */
3866            j = i - sizeof(float64);
3867            i -= 2 * sizeof(float64);
3868
3869            nr = *(float64 *)(vn + i);
3870            ni = *(float64 *)(vn + j);
3871            mr = *(float64 *)(vm + i);
3872            mi = *(float64 *)(vm + j);
3873
3874            e2 = (flip ? ni : nr);
3875            e1 = (flip ? mi : mr) ^ neg_real;
3876            e4 = e2;
3877            e3 = (flip ? mr : mi) ^ neg_imag;
3878
3879            if (likely((pg >> (i & 63)) & 1)) {
3880                d = *(float64 *)(va + i);
3881                d = float64_muladd(e2, e1, d, 0, status);
3882                *(float64 *)(vd + i) = d;
3883            }
3884            if (likely((pg >> (j & 63)) & 1)) {
3885                d = *(float64 *)(va + j);
3886                d = float64_muladd(e4, e3, d, 0, status);
3887                *(float64 *)(vd + j) = d;
3888            }
3889        } while (i & 63);
3890    } while (i != 0);
3891}
3892
3893/*
3894 * Load contiguous data, protected by a governing predicate.
3895 */
3896
3897/*
3898 * Load one element into @vd + @reg_off from @host.
3899 * The controlling predicate is known to be true.
3900 */
3901typedef void sve_ldst1_host_fn(void *vd, intptr_t reg_off, void *host);
3902
3903/*
3904 * Load one element into @vd + @reg_off from (@env, @vaddr, @ra).
3905 * The controlling predicate is known to be true.
3906 */
3907typedef void sve_ldst1_tlb_fn(CPUARMState *env, void *vd, intptr_t reg_off,
3908                              target_ulong vaddr, uintptr_t retaddr);
3909
3910/*
3911 * Generate the above primitives.
3912 */
3913
3914#define DO_LD_HOST(NAME, H, TYPEE, TYPEM, HOST) \
3915static void sve_##NAME##_host(void *vd, intptr_t reg_off, void *host)  \
3916{                                                                      \
3917    TYPEM val = HOST(host);                                            \
3918    *(TYPEE *)(vd + H(reg_off)) = val;                                 \
3919}
3920
3921#define DO_ST_HOST(NAME, H, TYPEE, TYPEM, HOST) \
3922static void sve_##NAME##_host(void *vd, intptr_t reg_off, void *host)  \
3923{ HOST(host, (TYPEM)*(TYPEE *)(vd + H(reg_off))); }
3924
3925#define DO_LD_TLB(NAME, H, TYPEE, TYPEM, TLB) \
3926static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off,  \
3927                             target_ulong addr, uintptr_t ra)               \
3928{                                                                           \
3929    *(TYPEE *)(vd + H(reg_off)) =                                           \
3930        (TYPEM)TLB(env, useronly_clean_ptr(addr), ra);                      \
3931}
3932
3933#define DO_ST_TLB(NAME, H, TYPEE, TYPEM, TLB) \
3934static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off,  \
3935                             target_ulong addr, uintptr_t ra)               \
3936{                                                                           \
3937    TLB(env, useronly_clean_ptr(addr),                                      \
3938        (TYPEM)*(TYPEE *)(vd + H(reg_off)), ra);                            \
3939}
3940
3941#define DO_LD_PRIM_1(NAME, H, TE, TM)                   \
3942    DO_LD_HOST(NAME, H, TE, TM, ldub_p)                 \
3943    DO_LD_TLB(NAME, H, TE, TM, cpu_ldub_data_ra)
3944
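/*
 * Naming: e.g. ld1bhu loads one byte per element and zero-extends it
 * into a halfword-sized vector element (LD1B into a .H vector), while
 * ld1bss sign-extends a byte into a word element (LD1SB into .S).
 * TYPEM is the type read from memory, TYPEE the element type in the
 * register.
 */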
3945DO_LD_PRIM_1(ld1bb,  H1,   uint8_t,  uint8_t)
3946DO_LD_PRIM_1(ld1bhu, H1_2, uint16_t, uint8_t)
3947DO_LD_PRIM_1(ld1bhs, H1_2, uint16_t,  int8_t)
3948DO_LD_PRIM_1(ld1bsu, H1_4, uint32_t, uint8_t)
3949DO_LD_PRIM_1(ld1bss, H1_4, uint32_t,  int8_t)
3950DO_LD_PRIM_1(ld1bdu,     , uint64_t, uint8_t)
3951DO_LD_PRIM_1(ld1bds,     , uint64_t,  int8_t)
3952
3953#define DO_ST_PRIM_1(NAME, H, TE, TM)                   \
3954    DO_ST_HOST(st1##NAME, H, TE, TM, stb_p)             \
3955    DO_ST_TLB(st1##NAME, H, TE, TM, cpu_stb_data_ra)
3956
3957DO_ST_PRIM_1(bb,   H1,  uint8_t, uint8_t)
3958DO_ST_PRIM_1(bh, H1_2, uint16_t, uint8_t)
3959DO_ST_PRIM_1(bs, H1_4, uint32_t, uint8_t)
3960DO_ST_PRIM_1(bd,     , uint64_t, uint8_t)
3961
3962#define DO_LD_PRIM_2(NAME, H, TE, TM, LD) \
3963    DO_LD_HOST(ld1##NAME##_be, H, TE, TM, LD##_be_p)    \
3964    DO_LD_HOST(ld1##NAME##_le, H, TE, TM, LD##_le_p)    \
3965    DO_LD_TLB(ld1##NAME##_be, H, TE, TM, cpu_##LD##_be_data_ra) \
3966    DO_LD_TLB(ld1##NAME##_le, H, TE, TM, cpu_##LD##_le_data_ra)
3967
3968#define DO_ST_PRIM_2(NAME, H, TE, TM, ST) \
3969    DO_ST_HOST(st1##NAME##_be, H, TE, TM, ST##_be_p)    \
3970    DO_ST_HOST(st1##NAME##_le, H, TE, TM, ST##_le_p)    \
3971    DO_ST_TLB(st1##NAME##_be, H, TE, TM, cpu_##ST##_be_data_ra) \
3972    DO_ST_TLB(st1##NAME##_le, H, TE, TM, cpu_##ST##_le_data_ra)
3973
3974DO_LD_PRIM_2(hh,  H1_2, uint16_t, uint16_t, lduw)
3975DO_LD_PRIM_2(hsu, H1_4, uint32_t, uint16_t, lduw)
3976DO_LD_PRIM_2(hss, H1_4, uint32_t,  int16_t, lduw)
3977DO_LD_PRIM_2(hdu,     , uint64_t, uint16_t, lduw)
3978DO_LD_PRIM_2(hds,     , uint64_t,  int16_t, lduw)
3979
3980DO_ST_PRIM_2(hh, H1_2, uint16_t, uint16_t, stw)
3981DO_ST_PRIM_2(hs, H1_4, uint32_t, uint16_t, stw)
3982DO_ST_PRIM_2(hd,     , uint64_t, uint16_t, stw)
3983
3984DO_LD_PRIM_2(ss,  H1_4, uint32_t, uint32_t, ldl)
3985DO_LD_PRIM_2(sdu,     , uint64_t, uint32_t, ldl)
3986DO_LD_PRIM_2(sds,     , uint64_t,  int32_t, ldl)
3987
3988DO_ST_PRIM_2(ss, H1_4, uint32_t, uint32_t, stl)
3989DO_ST_PRIM_2(sd,     , uint64_t, uint32_t, stl)
3990
3991DO_LD_PRIM_2(dd,     , uint64_t, uint64_t, ldq)
3992DO_ST_PRIM_2(dd,     , uint64_t, uint64_t, stq)
3993
3994#undef DO_LD_TLB
3995#undef DO_ST_TLB
3996#undef DO_LD_HOST
3997#undef DO_LD_PRIM_1
3998#undef DO_ST_PRIM_1
3999#undef DO_LD_PRIM_2
4000#undef DO_ST_PRIM_2
4001
4002/*
4003 * Skip through a sequence of inactive elements in the guarding predicate @vg,
4004 * beginning at @reg_off bounded by @reg_max.  Return the offset of the first
4005 * active element >= @reg_off, or @reg_max if there were no active elements at all.
4006 */
4007static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off,
4008                                 intptr_t reg_max, int esz)
4009{
4010    uint64_t pg_mask = pred_esz_masks[esz];
4011    uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63);
4012
4013    /* In normal usage, the first element is active.  */
4014    if (likely(pg & 1)) {
4015        return reg_off;
4016    }
4017
4018    if (pg == 0) {
4019        reg_off &= -64;
4020        do {
4021            reg_off += 64;
4022            if (unlikely(reg_off >= reg_max)) {
4023                /* The entire predicate was false.  */
4024                return reg_max;
4025            }
4026            pg = vg[reg_off >> 6] & pg_mask;
4027        } while (pg == 0);
4028    }
4029    reg_off += ctz64(pg);
4030
4031    /* We should never see an out of range predicate bit set.  */
4032    tcg_debug_assert(reg_off < reg_max);
4033    return reg_off;
4034}
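
/*
 * For example, with esz == 2 (word elements) only predicate bits
 * 0, 4, 8, ... are significant; pred_esz_masks[2] keeps just those
 * bits, so the ctz64 above always lands on an element boundary.
 */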
4035
4036/*
4037 * Resolve the guest virtual address to info->host and info->flags.
4038 * If @nofault, return false if the page is invalid, otherwise
4039 * exit via page fault exception.
4040 */
4041
4042typedef struct {
4043    void *host;
4044    int flags;
4045    MemTxAttrs attrs;
4046} SVEHostPage;
4047
4048static bool sve_probe_page(SVEHostPage *info, bool nofault,
4049                           CPUARMState *env, target_ulong addr,
4050                           int mem_off, MMUAccessType access_type,
4051                           int mmu_idx, uintptr_t retaddr)
4052{
4053    int flags;
4054
4055    addr += mem_off;
4056
4057    /*
4058     * User-only emulation currently always issues addresses with TBI.  See the comment
4059     * above useronly_clean_ptr.  Usually we clean this top byte away
4060     * during translation, but we can't do that for e.g. vector + imm
4061     * addressing modes.
4062     *
4063     * We currently always enable TBI for user-only, and do not provide
4064     * a way to turn it off.  So clean the pointer unconditionally here,
4065     * rather than look it up here, or pass it down from above.
4066     */
4067    addr = useronly_clean_ptr(addr);
4068
4069    flags = probe_access_flags(env, addr, access_type, mmu_idx, nofault,
4070                               &info->host, retaddr);
4071    info->flags = flags;
4072
4073    if (flags & TLB_INVALID_MASK) {
4074        g_assert(nofault);
4075        return false;
4076    }
4077
4078    /* Ensure that info->host[] is relative to addr, not addr + mem_off. */
4079    info->host -= mem_off;
4080
4081#ifdef CONFIG_USER_ONLY
4082    memset(&info->attrs, 0, sizeof(info->attrs));
4083#else
4084    /*
4085     * Find the iotlbentry for addr and return the transaction attributes.
4086     * This *must* be present in the TLB because we just found the mapping.
4087     */
4088    {
4089        uintptr_t index = tlb_index(env, mmu_idx, addr);
4090
4091# ifdef CONFIG_DEBUG_TCG
4092        CPUTLBEntry *entry = tlb_entry(env, mmu_idx, addr);
4093        target_ulong comparator = (access_type == MMU_DATA_LOAD
4094                                   ? entry->addr_read
4095                                   : tlb_addr_write(entry));
4096        g_assert(tlb_hit(comparator, addr));
4097# endif
4098
4099        CPUIOTLBEntry *iotlbentry = &env_tlb(env)->d[mmu_idx].iotlb[index];
4100        info->attrs = iotlbentry->attrs;
4101    }
4102#endif
4103
4104    return true;
4105}
4106
4107
4108/*
4109 * Analyse contiguous data, protected by a governing predicate.
4110 */
4111
4112typedef enum {
4113    FAULT_NO,
4114    FAULT_FIRST,
4115    FAULT_ALL,
4116} SVEContFault;
4117
4118typedef struct {
4119    /*
4120     * First and last element wholly contained within the two pages.
4121     * mem_off_first[0] and reg_off_first[0] are always set >= 0.
4122     * reg_off_last[0] may be < 0 if the first element crosses pages.
4123     * All of mem_off_first[1], reg_off_first[1] and reg_off_last[1]
4124     * are set >= 0 only if there are complete elements on a second page.
4125     *
4126     * The reg_off_* offsets are relative to the internal vector register.
4127     * The mem_off_first offset is relative to the memory address; the
4128     * two offsets are different when a load operation extends, a store
4129     * operation truncates, or for multi-register operations.
4130     */
4131    int16_t mem_off_first[2];
4132    int16_t reg_off_first[2];
4133    int16_t reg_off_last[2];
4134
4135    /*
4136     * One element that is misaligned and spans both pages,
4137     * or -1 if there is no such active element.
4138     */
4139    int16_t mem_off_split;
4140    int16_t reg_off_split;
4141
4142    /*
4143     * The byte offset at which the entire operation crosses a page boundary.
4144     * Set >= 0 if and only if the entire operation spans two pages.
4145     */
4146    int16_t page_split;
4147
4148    /* TLB data for the two pages. */
4149    SVEHostPage page[2];
4150} SVEContLdSt;
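
/*
 * Worked example with illustrative values: a 64-byte vector of 8-byte
 * elements, all predicate bits set, starting 20 bytes before a page
 * boundary.  Elements 0 and 1 lie entirely on the first page, element 2
 * (reg_off 16) straddles the boundary, and elements 3..7 lie on the
 * second page.  sve_cont_ldst_elements() below would then record
 * reg_off_first[0] = 0, reg_off_last[0] = 8, reg_off_split = 16,
 * page_split = 20, reg_off_first[1] = 24 and reg_off_last[1] = 56.
 */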
4151
4152/*
4153 * Find first active element on each page, and a loose bound for the
4154 * final element on each page.  Identify any single element that spans
4155 * the page boundary.  Return true if there are any active elements.
4156 */
4157static bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr,
4158                                   uint64_t *vg, intptr_t reg_max,
4159                                   int esz, int msize)
4160{
4161    const int esize = 1 << esz;
4162    const uint64_t pg_mask = pred_esz_masks[esz];
4163    intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split;
4164    intptr_t mem_off_last, mem_off_split;
4165    intptr_t page_split, elt_split;
4166    intptr_t i;
4167
4168    /* Set all of the element indices to -1, and the TLB data to 0. */
4169    memset(info, -1, offsetof(SVEContLdSt, page));
4170    memset(info->page, 0, sizeof(info->page));
4171
4172    /* Gross scan over the entire predicate to find bounds. */
4173    i = 0;
4174    do {
4175        uint64_t pg = vg[i] & pg_mask;
4176        if (pg) {
4177            reg_off_last = i * 64 + 63 - clz64(pg);
4178            if (reg_off_first < 0) {
4179                reg_off_first = i * 64 + ctz64(pg);
4180            }
4181        }
4182    } while (++i * 64 < reg_max);
4183
4184    if (unlikely(reg_off_first < 0)) {
4185        /* No active elements, no pages touched. */
4186        return false;
4187    }
4188    tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max);
4189
4190    info->reg_off_first[0] = reg_off_first;
4191    info->mem_off_first[0] = (reg_off_first >> esz) * msize;
4192    mem_off_last = (reg_off_last >> esz) * msize;
4193
4194    page_split = -(addr | TARGET_PAGE_MASK);
4195    if (likely(mem_off_last + msize <= page_split)) {
4196        /* The entire operation fits within a single page. */
4197        info->reg_off_last[0] = reg_off_last;
4198        return true;
4199    }
4200
4201    info->page_split = page_split;
4202    elt_split = page_split / msize;
4203    reg_off_split = elt_split << esz;
4204    mem_off_split = elt_split * msize;
4205
4206    /*
4207     * This is the last full element on the first page, but it is not
4208     * necessarily active.  If there is no full element, i.e. the first
4209     * active element is the one that's split, this value remains -1.
4210     * It is useful as iteration bounds.
4211     */
4212    if (elt_split != 0) {
4213        info->reg_off_last[0] = reg_off_split - esize;
4214    }
4215
4216    /* Determine if an unaligned element spans the pages.  */
4217    if (page_split % msize != 0) {
4218        /* It is helpful to know if the split element is active. */
4219        if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) {
4220            info->reg_off_split = reg_off_split;
4221            info->mem_off_split = mem_off_split;
4222
4223            if (reg_off_split == reg_off_last) {
4224                /* The page crossing element is last. */
4225                return true;
4226            }
4227        }
4228        reg_off_split += esize;
4229        mem_off_split += msize;
4230    }
4231
4232    /*
4233     * We do want the first active element on the second page, because
4234     * this may affect the address reported in an exception.
4235     */
4236    reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz);
4237    tcg_debug_assert(reg_off_split <= reg_off_last);
4238    info->reg_off_first[1] = reg_off_split;
4239    info->mem_off_first[1] = (reg_off_split >> esz) * msize;
4240    info->reg_off_last[1] = reg_off_last;
4241    return true;
4242}
4243
4244/*
4245 * Resolve the guest virtual addresses to info->page[].
4246 * Control the generation of page faults with @fault.  Return false if
4247 * there is no work to do, which can only happen with @fault == FAULT_NO.
4248 */
4249static bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault,
4250                                CPUARMState *env, target_ulong addr,
4251                                MMUAccessType access_type, uintptr_t retaddr)
4252{
4253    int mmu_idx = cpu_mmu_index(env, false);
4254    int mem_off = info->mem_off_first[0];
4255    bool nofault = fault == FAULT_NO;
4256    bool have_work = true;
4257
4258    if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off,
4259                        access_type, mmu_idx, retaddr)) {
4260        /* No work to be done. */
4261        return false;
4262    }
4263
4264    if (likely(info->page_split < 0)) {
4265        /* The entire operation was on the one page. */
4266        return true;
4267    }
4268
4269    /*
4270     * If the second page is invalid, then we want the fault address to be
4271     * the first byte on that page which is accessed.
4272     */
4273    if (info->mem_off_split >= 0) {
4274        /*
4275         * There is an element split across the pages.  The fault address
4276         * should be the first byte of the second page.
4277         */
4278        mem_off = info->page_split;
4279        /*
4280         * If the split element is also the first active element
4281         * of the vector, then:  For first-fault we should continue
4282         * to generate faults for the second page.  For no-fault,
4283         * we have work only if the second page is valid.
4284         */
4285        if (info->mem_off_first[0] < info->mem_off_split) {
4286            nofault = FAULT_FIRST;
4287            have_work = false;
4288        }
4289    } else {
4290        /*
4291         * There is no element split across the pages.  The fault address
4292         * should be the first active element on the second page.
4293         */
4294        mem_off = info->mem_off_first[1];
4295        /*
4296         * There must have been one active element on the first page,
4297         * so we're out of first-fault territory.
4298         */
4299        nofault = fault != FAULT_ALL;
4300    }
4301
4302    have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off,
4303                                access_type, mmu_idx, retaddr);
4304    return have_work;
4305}
4306
4307static void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env,
4308                                      uint64_t *vg, target_ulong addr,
4309                                      int esize, int msize, int wp_access,
4310                                      uintptr_t retaddr)
4311{
4312#ifndef CONFIG_USER_ONLY
4313    intptr_t mem_off, reg_off, reg_last;
4314    int flags0 = info->page[0].flags;
4315    int flags1 = info->page[1].flags;
4316
4317    if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) {
4318        return;
4319    }
4320
4321    /* Indicate that watchpoints are handled. */
4322    info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
4323    info->page[1].flags = flags1 & ~TLB_WATCHPOINT;
4324
4325    if (flags0 & TLB_WATCHPOINT) {
4326        mem_off = info->mem_off_first[0];
4327        reg_off = info->reg_off_first[0];
4328        reg_last = info->reg_off_last[0];
4329
4330        while (reg_off <= reg_last) {
4331            uint64_t pg = vg[reg_off >> 6];
4332            do {
4333                if ((pg >> (reg_off & 63)) & 1) {
4334                    cpu_check_watchpoint(env_cpu(env), addr + mem_off,
4335                                         msize, info->page[0].attrs,
4336                                         wp_access, retaddr);
4337                }
4338                reg_off += esize;
4339                mem_off += msize;
4340            } while (reg_off <= reg_last && (reg_off & 63));
4341        }
4342    }
4343
4344    mem_off = info->mem_off_split;
4345    if (mem_off >= 0) {
4346        cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize,
4347                             info->page[0].attrs, wp_access, retaddr);
4348    }
4349
4350    mem_off = info->mem_off_first[1];
4351    if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) {
4352        reg_off = info->reg_off_first[1];
4353        reg_last = info->reg_off_last[1];
4354
4355        do {
4356            uint64_t pg = vg[reg_off >> 6];
4357            do {
4358                if ((pg >> (reg_off & 63)) & 1) {
4359                    cpu_check_watchpoint(env_cpu(env), addr + mem_off,
4360                                         msize, info->page[1].attrs,
4361                                         wp_access, retaddr);
4362                }
4363                reg_off += esize;
4364                mem_off += msize;
4365            } while (reg_off & 63);
4366        } while (reg_off <= reg_last);
4367    }
4368#endif
4369}
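
/*
 * A note on the predicate-walking idiom used above and in the helpers
 * below: @vg holds one predicate bit per vector byte, packed into
 * host-endian 64-bit words.  The element at byte offset reg_off is
 * active when bit (reg_off & 63) of vg[reg_off >> 6] is set, so the
 * loops fetch one predicate word per 64 bytes of the vector and step
 * reg_off by the element size until the word is exhausted.  Roughly:
 *
 *     uint64_t pg = vg[reg_off >> 6];
 *     do {
 *         if ((pg >> (reg_off & 63)) & 1) {
 *             // operate on the element at reg_off
 *         }
 *         reg_off += esize;
 *         mem_off += msize;
 *     } while (reg_off <= reg_last && (reg_off & 63));
 */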
4370
4371typedef uint64_t mte_check_fn(CPUARMState *, uint32_t, uint64_t, uintptr_t);
4372
4373static inline QEMU_ALWAYS_INLINE
4374void sve_cont_ldst_mte_check_int(SVEContLdSt *info, CPUARMState *env,
4375                                 uint64_t *vg, target_ulong addr, int esize,
4376                                 int msize, uint32_t mtedesc, uintptr_t ra,
4377                                 mte_check_fn *check)
4378{
4379    intptr_t mem_off, reg_off, reg_last;
4380
4381    /* Process the page only if MemAttr == Tagged. */
4382    if (arm_tlb_mte_tagged(&info->page[0].attrs)) {
4383        mem_off = info->mem_off_first[0];
4384        reg_off = info->reg_off_first[0];
4385        reg_last = info->reg_off_split;
4386        if (reg_last < 0) {
4387            reg_last = info->reg_off_last[0];
4388        }
4389
4390        do {
4391            uint64_t pg = vg[reg_off >> 6];
4392            do {
4393                if ((pg >> (reg_off & 63)) & 1) {
4394                    check(env, mtedesc, addr + mem_off, ra);
4395                }
4396                reg_off += esize;
4397                mem_off += msize;
4398            } while (reg_off <= reg_last && (reg_off & 63));
4399        } while (reg_off <= reg_last);
4400    }
4401
4402    mem_off = info->mem_off_first[1];
4403    if (mem_off >= 0 && arm_tlb_mte_tagged(&info->page[1].attrs)) {
4404        reg_off = info->reg_off_first[1];
4405        reg_last = info->reg_off_last[1];
4406
4407        do {
4408            uint64_t pg = vg[reg_off >> 6];
4409            do {
4410                if ((pg >> (reg_off & 63)) & 1) {
4411                    check(env, mtedesc, addr + mem_off, ra);
4412                }
4413                reg_off += esize;
4414                mem_off += msize;
4415            } while (reg_off & 63);
4416        } while (reg_off <= reg_last);
4417    }
4418}
4419
4420typedef void sve_cont_ldst_mte_check_fn(SVEContLdSt *info, CPUARMState *env,
4421                                        uint64_t *vg, target_ulong addr,
4422                                        int esize, int msize, uint32_t mtedesc,
4423                                        uintptr_t ra);
4424
4425static void sve_cont_ldst_mte_check1(SVEContLdSt *info, CPUARMState *env,
4426                                     uint64_t *vg, target_ulong addr,
4427                                     int esize, int msize, uint32_t mtedesc,
4428                                     uintptr_t ra)
4429{
4430    sve_cont_ldst_mte_check_int(info, env, vg, addr, esize, msize,
4431                                mtedesc, ra, mte_check1);
4432}
4433
4434static void sve_cont_ldst_mte_checkN(SVEContLdSt *info, CPUARMState *env,
4435                                     uint64_t *vg, target_ulong addr,
4436                                     int esize, int msize, uint32_t mtedesc,
4437                                     uintptr_t ra)
4438{
4439    sve_cont_ldst_mte_check_int(info, env, vg, addr, esize, msize,
4440                                mtedesc, ra, mte_checkN);
4441}
4442
4443
4444/*
4445 * Common helper for all contiguous 1,2,3,4-register predicated loads.
4446 */
4447static inline QEMU_ALWAYS_INLINE
4448void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr,
4449               uint32_t desc, const uintptr_t retaddr,
4450               const int esz, const int msz, const int N, uint32_t mtedesc,
4451               sve_ldst1_host_fn *host_fn,
4452               sve_ldst1_tlb_fn *tlb_fn,
4453               sve_cont_ldst_mte_check_fn *mte_check_fn)
4454{
4455    const unsigned rd = simd_data(desc);
4456    const intptr_t reg_max = simd_oprsz(desc);
4457    intptr_t reg_off, reg_last, mem_off;
4458    SVEContLdSt info;
4459    void *host;
4460    int flags, i;
4461
4462    /* Find the active elements.  */
4463    if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
4464        /* The entire predicate was false; no load occurs.  */
4465        for (i = 0; i < N; ++i) {
4466            memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
4467        }
4468        return;
4469    }
4470
4471    /* Probe the page(s).  Exit with exception for any invalid page. */
4472    sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr);
4473
4474    /* Handle watchpoints for all active elements. */
4475    sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
4476                              BP_MEM_READ, retaddr);
4477
4478    /*
4479     * Handle mte checks for all active elements.
4480     * Since TBI must be set for MTE, !mtedesc => !mte_active.
4481     */
4482    if (mte_check_fn && mtedesc) {
4483        mte_check_fn(&info, env, vg, addr, 1 << esz, N << msz,
4484                     mtedesc, retaddr);
4485    }
4486
4487    flags = info.page[0].flags | info.page[1].flags;
4488    if (unlikely(flags != 0)) {
4489#ifdef CONFIG_USER_ONLY
4490        g_assert_not_reached();
4491#else
4492        /*
4493         * At least one page includes MMIO.
4494         * Any bus operation can fail with cpu_transaction_failed,
4495         * which for ARM will raise SyncExternal.  Perform the load
4496         * into scratch memory to preserve register state until the end.
4497         */
4498        ARMVectorReg scratch[4] = { };
4499
4500        mem_off = info.mem_off_first[0];
4501        reg_off = info.reg_off_first[0];
4502        reg_last = info.reg_off_last[1];
4503        if (reg_last < 0) {
4504            reg_last = info.reg_off_split;
4505            if (reg_last < 0) {
4506                reg_last = info.reg_off_last[0];
4507            }
4508        }
4509
4510        do {
4511            uint64_t pg = vg[reg_off >> 6];
4512            do {
4513                if ((pg >> (reg_off & 63)) & 1) {
4514                    for (i = 0; i < N; ++i) {
4515                        tlb_fn(env, &scratch[i], reg_off,
4516                               addr + mem_off + (i << msz), retaddr);
4517                    }
4518                }
4519                reg_off += 1 << esz;
4520                mem_off += N << msz;
4521            } while (reg_off & 63);
4522        } while (reg_off <= reg_last);
4523
4524        for (i = 0; i < N; ++i) {
4525            memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max);
4526        }
4527        return;
4528#endif
4529    }
4530
4531    /* The entire operation is in RAM, on valid pages. */
4532
4533    for (i = 0; i < N; ++i) {
4534        memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
4535    }
4536
4537    mem_off = info.mem_off_first[0];
4538    reg_off = info.reg_off_first[0];
4539    reg_last = info.reg_off_last[0];
4540    host = info.page[0].host;
4541
4542    while (reg_off <= reg_last) {
4543        uint64_t pg = vg[reg_off >> 6];
4544        do {
4545            if ((pg >> (reg_off & 63)) & 1) {
4546                for (i = 0; i < N; ++i) {
4547                    host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
4548                            host + mem_off + (i << msz));
4549                }
4550            }
4551            reg_off += 1 << esz;
4552            mem_off += N << msz;
4553        } while (reg_off <= reg_last && (reg_off & 63));
4554    }
4555
4556    /*
4557     * Use the slow path to manage the cross-page misalignment.
4558     * But we know this is RAM and cannot trap.
4559     */
4560    mem_off = info.mem_off_split;
4561    if (unlikely(mem_off >= 0)) {
4562        reg_off = info.reg_off_split;
4563        for (i = 0; i < N; ++i) {
4564            tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
4565                   addr + mem_off + (i << msz), retaddr);
4566        }
4567    }
4568
4569    mem_off = info.mem_off_first[1];
4570    if (unlikely(mem_off >= 0)) {
4571        reg_off = info.reg_off_first[1];
4572        reg_last = info.reg_off_last[1];
4573        host = info.page[1].host;
4574
4575        do {
4576            uint64_t pg = vg[reg_off >> 6];
4577            do {
4578                if ((pg >> (reg_off & 63)) & 1) {
4579                    for (i = 0; i < N; ++i) {
4580                        host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
4581                                host + mem_off + (i << msz));
4582                    }
4583                }
4584                reg_off += 1 << esz;
4585                mem_off += N << msz;
4586            } while (reg_off & 63);
4587        } while (reg_off <= reg_last);
4588    }
4589}
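
/*
 * For orientation, an illustrative expansion: an LD2H helper generated
 * below calls
 *
 *     sve_ldN_r(env, vg, addr, desc, ra, MO_16, MO_16, 2, 0,
 *               sve_ld1hh_le_host, sve_ld1hh_le_tlb, NULL);
 *
 * With a 256-bit vector, simd_oprsz(desc) == 32 and simd_data(desc) is
 * the first destination register; active element e of register rd + k
 * is loaded from addr + (2 * e + k) * 2, i.e. the two registers are
 * de-interleaved from consecutive halfwords.
 */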
4590
4591static inline QEMU_ALWAYS_INLINE
4592void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
4593                   uint32_t desc, const uintptr_t ra,
4594                   const int esz, const int msz, const int N,
4595                   sve_ldst1_host_fn *host_fn,
4596                   sve_ldst1_tlb_fn *tlb_fn)
4597{
4598    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
4599    int bit55 = extract64(addr, 55, 1);
4600
4601    /* Remove mtedesc from the normal sve descriptor. */
4602    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
4603
4604    /* Perform gross MTE suppression early. */
4605    if (!tbi_check(desc, bit55) ||
4606        tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
4607        mtedesc = 0;
4608    }
4609
4610    sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn,
4611              N == 1 ? sve_cont_ldst_mte_check1 : sve_cont_ldst_mte_checkN);
4612}
4613
4614#define DO_LD1_1(NAME, ESZ)                                             \
4615void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg,                 \
4616                            target_ulong addr, uint32_t desc)           \
4617{                                                                       \
4618    sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0,            \
4619              sve_##NAME##_host, sve_##NAME##_tlb, NULL);               \
4620}                                                                       \
4621void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg,             \
4622                                target_ulong addr, uint32_t desc)       \
4623{                                                                       \
4624    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1,           \
4625                  sve_##NAME##_host, sve_##NAME##_tlb);                 \
4626}
4627
4628#define DO_LD1_2(NAME, ESZ, MSZ)                                        \
4629void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg,              \
4630                               target_ulong addr, uint32_t desc)        \
4631{                                                                       \
4632    sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0,             \
4633              sve_##NAME##_le_host, sve_##NAME##_le_tlb, NULL);         \
4634}                                                                       \
4635void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg,              \
4636                               target_ulong addr, uint32_t desc)        \
4637{                                                                       \
4638    sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0,             \
4639              sve_##NAME##_be_host, sve_##NAME##_be_tlb, NULL);         \
4640}                                                                       \
4641void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg,          \
4642                                 target_ulong addr, uint32_t desc)      \
4643{                                                                       \
4644    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1,            \
4645                  sve_##NAME##_le_host, sve_##NAME##_le_tlb);           \
4646}                                                                       \
4647void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg,          \
4648                                 target_ulong addr, uint32_t desc)      \
4649{                                                                       \
4650    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1,            \
4651                  sve_##NAME##_be_host, sve_##NAME##_be_tlb);           \
4652}
4653
4654DO_LD1_1(ld1bb,  MO_8)
4655DO_LD1_1(ld1bhu, MO_16)
4656DO_LD1_1(ld1bhs, MO_16)
4657DO_LD1_1(ld1bsu, MO_32)
4658DO_LD1_1(ld1bss, MO_32)
4659DO_LD1_1(ld1bdu, MO_64)
4660DO_LD1_1(ld1bds, MO_64)
4661
4662DO_LD1_2(ld1hh,  MO_16, MO_16)
4663DO_LD1_2(ld1hsu, MO_32, MO_16)
4664DO_LD1_2(ld1hss, MO_32, MO_16)
4665DO_LD1_2(ld1hdu, MO_64, MO_16)
4666DO_LD1_2(ld1hds, MO_64, MO_16)
4667
4668DO_LD1_2(ld1ss,  MO_32, MO_32)
4669DO_LD1_2(ld1sdu, MO_64, MO_32)
4670DO_LD1_2(ld1sds, MO_64, MO_32)
4671
4672DO_LD1_2(ld1dd,  MO_64, MO_64)
4673
4674#undef DO_LD1_1
4675#undef DO_LD1_2
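
/*
 * For reference, one expansion of the macros above: DO_LD1_1(ld1bhu, MO_16)
 * emits (roughly)
 *
 *     void HELPER(sve_ld1bhu_r)(CPUARMState *env, void *vg,
 *                               target_ulong addr, uint32_t desc)
 *     {
 *         sve_ldN_r(env, vg, addr, desc, GETPC(), MO_16, MO_8, 1, 0,
 *                   sve_ld1bhu_host, sve_ld1bhu_tlb, NULL);
 *     }
 *
 * plus the matching _r_mte variant: a one-register load of bytes (MO_8 in
 * memory) zero-extended into halfword (MO_16) elements.
 */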
4676
4677#define DO_LDN_1(N)                                                     \
4678void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg,                \
4679                             target_ulong addr, uint32_t desc)          \
4680{                                                                       \
4681    sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0,           \
4682              sve_ld1bb_host, sve_ld1bb_tlb, NULL);                     \
4683}                                                                       \
4684void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg,            \
4685                                 target_ulong addr, uint32_t desc)      \
4686{                                                                       \
4687    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N,          \
4688                  sve_ld1bb_host, sve_ld1bb_tlb);                       \
4689}
4690
4691#define DO_LDN_2(N, SUFF, ESZ)                                          \
4692void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg,         \
4693                                    target_ulong addr, uint32_t desc)   \
4694{                                                                       \
4695    sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0,             \
4696              sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb, NULL);   \
4697}                                                                       \
4698void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg,         \
4699                                    target_ulong addr, uint32_t desc)   \
4700{                                                                       \
4701    sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0,             \
4702              sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb, NULL);   \
4703}                                                                       \
4704void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg,     \
4705                                        target_ulong addr, uint32_t desc) \
4706{                                                                       \
4707    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N,            \
4708                  sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb);     \
4709}                                                                       \
4710void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg,     \
4711                                        target_ulong addr, uint32_t desc) \
4712{                                                                       \
4713    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N,            \
4714                  sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb);     \
4715}
4716
4717DO_LDN_1(2)
4718DO_LDN_1(3)
4719DO_LDN_1(4)
4720
4721DO_LDN_2(2, hh, MO_16)
4722DO_LDN_2(3, hh, MO_16)
4723DO_LDN_2(4, hh, MO_16)
4724
4725DO_LDN_2(2, ss, MO_32)
4726DO_LDN_2(3, ss, MO_32)
4727DO_LDN_2(4, ss, MO_32)
4728
4729DO_LDN_2(2, dd, MO_64)
4730DO_LDN_2(3, dd, MO_64)
4731DO_LDN_2(4, dd, MO_64)
4732
4733#undef DO_LDN_1
4734#undef DO_LDN_2
4735
4736/*
4737 * Load contiguous data, first-fault and no-fault.
4738 *
4739 * For user-only, one could argue that we should hold the mmap_lock during
4740 * the operation so that there is no race between page_check_range and the
4741 * load operation.  However, unmapping pages out from under a running thread
4742 * is extraordinarily unlikely.  This theoretical race condition also affects
4743 * linux-user/ in its get_user/put_user macros.
4744 *
4745 * TODO: Construct some helpers, written in assembly, that interact with
4746 * handle_cpu_signal to produce memory ops which can properly report errors
4747 * without racing.
4748 */
4749
4750/* Fault on byte I.  All bits in FFR from I are cleared.  The vector
4751 * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
4752 * option, which leaves subsequent data unchanged.
4753 */
4754static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
4755{
4756    uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
4757
4758    if (i & 63) {
4759        ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
4760        i = ROUND_UP(i, 64);
4761    }
4762    for (; i < oprsz; i += 64) {
4763        ffr[i / 64] = 0;
4764    }
4765}
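
/*
 * Worked example (assuming a 256-bit vector, so oprsz == 32): a fault
 * recorded at byte offset i == 5 takes the "i & 63" path, leaving only
 * FFR bits 0..4 set in the first 64-bit word via MAKE_64BIT_MASK(0, 5),
 * then rounds i up to 64, which is already >= oprsz, so no further
 * words need to be zeroed.
 */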
4766
4767/*
4768 * Common helper for all contiguous no-fault and first-fault loads.
4769 */
4770static inline QEMU_ALWAYS_INLINE
4771void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr,
4772                   uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc,
4773                   const int esz, const int msz, const SVEContFault fault,
4774                   sve_ldst1_host_fn *host_fn,
4775                   sve_ldst1_tlb_fn *tlb_fn)
4776{
4777    const unsigned rd = simd_data(desc);
4778    void *vd = &env->vfp.zregs[rd];
4779    const intptr_t reg_max = simd_oprsz(desc);
4780    intptr_t reg_off, mem_off, reg_last;
4781    SVEContLdSt info;
4782    int flags;
4783    void *host;
4784
4785    /* Find the active elements.  */
4786    if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) {
4787        /* The entire predicate was false; no load occurs.  */
4788        memset(vd, 0, reg_max);
4789        return;
4790    }
4791    reg_off = info.reg_off_first[0];
4792
4793    /* Probe the page(s). */
4794    if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) {
4795        /* Fault on first element. */
4796        tcg_debug_assert(fault == FAULT_NO);
4797        memset(vd, 0, reg_max);
4798        goto do_fault;
4799    }
4800
4801    mem_off = info.mem_off_first[0];
4802    flags = info.page[0].flags;
4803
4804    /*
4805     * Disable MTE checking if the Tagged bit is not set.  Since TBI must
4806     * be set within MTEDESC for MTE, !mtedesc => !mte_active.
4807     */
4808    if (!arm_tlb_mte_tagged(&info.page[0].attrs)) {
4809        mtedesc = 0;
4810    }
4811
4812    if (fault == FAULT_FIRST) {
4813        /* Trapping mte check for the first-fault element.  */
4814        if (mtedesc) {
4815            mte_check1(env, mtedesc, addr + mem_off, retaddr);
4816        }
4817
4818        /*
4819         * Special handling of the first active element,
4820         * if it crosses a page boundary or is MMIO.
4821         */
4822        bool is_split = mem_off == info.mem_off_split;
4823        if (unlikely(flags != 0) || unlikely(is_split)) {
4824            /*
4825             * Use the slow path for cross-page handling.
4826             * Might trap for MMIO or watchpoints.
4827             */
4828            tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
4829
4830            /* After any fault, zero the other elements. */
4831            swap_memzero(vd, reg_off);
4832            reg_off += 1 << esz;
4833            mem_off += 1 << msz;
4834            swap_memzero(vd + reg_off, reg_max - reg_off);
4835
4836            if (is_split) {
4837                goto second_page;
4838            }
4839        } else {
4840            memset(vd, 0, reg_max);
4841        }
4842    } else {
4843        memset(vd, 0, reg_max);
4844        if (unlikely(mem_off == info.mem_off_split)) {
4845            /* The first active element crosses a page boundary. */
4846            flags |= info.page[1].flags;
4847            if (unlikely(flags & TLB_MMIO)) {
4848                /* Some page is MMIO, see below. */
4849                goto do_fault;
4850            }
4851            if (unlikely(flags & TLB_WATCHPOINT) &&
4852                (cpu_watchpoint_address_matches
4853                 (env_cpu(env), addr + mem_off, 1 << msz)
4854                 & BP_MEM_READ)) {
4855                /* Watchpoint hit, see below. */
4856                goto do_fault;
4857            }
4858            if (mtedesc && !mte_probe1(env, mtedesc, addr + mem_off)) {
4859                goto do_fault;
4860            }
4861            /*
4862             * Use the slow path for cross-page handling.
4863             * This is RAM, without a watchpoint, and will not trap.
4864             */
4865            tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
4866            goto second_page;
4867        }
4868    }
4869
4870    /*
4871     * From this point on, all memory operations are MemSingleNF.
4872     *
4873     * Per the MemSingleNF pseudocode, a no-fault load from Device memory
4874     * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead.
4875     *
4876     * Unfortunately we do not have access to the memory attributes from the
4877     * PTE to tell Device memory from Normal memory.  So we make a mostly
4878     * correct check, and indicate (UNKNOWN, FAULT) for any MMIO.
4879     * This gives the right answer for the common cases of "Normal memory,
4880     * backed by host RAM" and "Device memory, backed by MMIO".
4881     * The architecture allows us to suppress an NF load and return
4882     * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner
4883     * case of "Normal memory, backed by MMIO" is permitted.  The case we
4884     * get wrong is "Device memory, backed by host RAM", for which we
4885     * should return (UNKNOWN, FAULT) for but do not.
4886     *
4887     * Similarly, CPU_BP breakpoints would raise exceptions, and so
4888     * return (UNKNOWN, FAULT).  For simplicity, we consider gdb and
4889     * architectural breakpoints the same.
4890     */
4891    if (unlikely(flags & TLB_MMIO)) {
4892        goto do_fault;
4893    }
4894
4895    reg_last = info.reg_off_last[0];
4896    host = info.page[0].host;
4897
4898    do {
4899        uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3));
4900        do {
4901            if ((pg >> (reg_off & 63)) & 1) {
4902                if (unlikely(flags & TLB_WATCHPOINT) &&
4903                    (cpu_watchpoint_address_matches
4904                     (env_cpu(env), addr + mem_off, 1 << msz)
4905                     & BP_MEM_READ)) {
4906                    goto do_fault;
4907                }
4908                if (mtedesc && !mte_probe1(env, mtedesc, addr + mem_off)) {
4909                    goto do_fault;
4910                }
4911                host_fn(vd, reg_off, host + mem_off);
4912            }
4913            reg_off += 1 << esz;
4914            mem_off += 1 << msz;
4915        } while (reg_off <= reg_last && (reg_off & 63));
4916    } while (reg_off <= reg_last);
4917
4918    /*
4919     * MemSingleNF is allowed to fail for any reason.  We have special
4920     * code above to handle the first element crossing a page boundary.
4921     * As an implementation choice, decline to handle a cross-page element
4922     * in any other position.
4923     */
4924    reg_off = info.reg_off_split;
4925    if (reg_off >= 0) {
4926        goto do_fault;
4927    }
4928
4929 second_page:
4930    reg_off = info.reg_off_first[1];
4931    if (likely(reg_off < 0)) {
4932        /* No active elements on the second page.  All done. */
4933        return;
4934    }
4935
4936    /*
4937     * MemSingleNF is allowed to fail for any reason.  As an implementation
4938     * choice, decline to handle elements on the second page.  This should
4939     * be low frequency as the guest walks through memory -- the next
4940     * iteration of the guest's loop should be aligned on the page boundary,
4941     * and then all following iterations will stay aligned.
4942     */
4943
4944 do_fault:
4945    record_fault(env, reg_off, reg_max);
4946}
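
/*
 * Behavioural sketch (illustrative): for an LDNF1B with eight active
 * elements where element 3 lands on an unmapped page, elements 0..2 are
 * loaded normally, record_fault() clears FFR from bit 3 upward, and no
 * exception is raised; the guest is expected to inspect FFR and retry
 * from the first cleared element.  LDFF1B differs only in that a fault
 * on the very first active element is delivered as a real exception.
 */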
4947
4948static inline QEMU_ALWAYS_INLINE
4949void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr,
4950                       uint32_t desc, const uintptr_t retaddr,
4951                       const int esz, const int msz, const SVEContFault fault,
4952                       sve_ldst1_host_fn *host_fn,
4953                       sve_ldst1_tlb_fn *tlb_fn)
4954{
4955    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
4956    int bit55 = extract64(addr, 55, 1);
4957
4958    /* Remove mtedesc from the normal sve descriptor. */
4959    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
4960
4961    /* Perform gross MTE suppression early. */
4962    if (!tbi_check(desc, bit55) ||
4963        tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
4964        mtedesc = 0;
4965    }
4966
4967    sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc,
4968                  esz, msz, fault, host_fn, tlb_fn);
4969}
4970
4971#define DO_LDFF1_LDNF1_1(PART, ESZ)                                     \
4972void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg,            \
4973                                 target_ulong addr, uint32_t desc)      \
4974{                                                                       \
4975    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \
4976                  sve_ld1##PART##_host, sve_ld1##PART##_tlb);           \
4977}                                                                       \
4978void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg,            \
4979                                 target_ulong addr, uint32_t desc)      \
4980{                                                                       \
4981    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \
4982                  sve_ld1##PART##_host, sve_ld1##PART##_tlb);           \
4983}                                                                       \
4984void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg,        \
4985                                     target_ulong addr, uint32_t desc)  \
4986{                                                                       \
4987    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \
4988                      sve_ld1##PART##_host, sve_ld1##PART##_tlb);       \
4989}                                                                       \
4990void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg,        \
4991                                     target_ulong addr, uint32_t desc)  \
4992{                                                                       \
4993    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \
4994                      sve_ld1##PART##_host, sve_ld1##PART##_tlb);       \
4995}
4996
4997#define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ)                                \
4998void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg,         \
4999                                    target_ulong addr, uint32_t desc)   \
5000{                                                                       \
5001    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
5002                  sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb);     \
5003}                                                                       \
5004void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg,         \
5005                                    target_ulong addr, uint32_t desc)   \
5006{                                                                       \
5007    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO,  \
5008                  sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb);     \
5009}                                                                       \
5010void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg,         \
5011                                    target_ulong addr, uint32_t desc)   \
5012{                                                                       \
5013    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
5014                  sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb);     \
5015}                                                                       \
5016void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg,         \
5017                                    target_ulong addr, uint32_t desc)   \
5018{                                                                       \
5019    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO,  \
5020                  sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb);     \
5021}                                                                       \
5022void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg,     \
5023                                        target_ulong addr, uint32_t desc) \
5024{                                                                       \
5025    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
5026                      sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
5027}                                                                       \
5028void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg,     \
5029                                        target_ulong addr, uint32_t desc) \
5030{                                                                       \
5031    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
5032                      sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
5033}                                                                       \
5034void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg,     \
5035                                        target_ulong addr, uint32_t desc) \
5036{                                                                       \
5037    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
5038                      sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
5039}                                                                       \
5040void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg,     \
5041                                        target_ulong addr, uint32_t desc) \
5042{                                                                       \
5043    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
5044                      sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
5045}
5046
5047DO_LDFF1_LDNF1_1(bb,  MO_8)
5048DO_LDFF1_LDNF1_1(bhu, MO_16)
5049DO_LDFF1_LDNF1_1(bhs, MO_16)
5050DO_LDFF1_LDNF1_1(bsu, MO_32)
5051DO_LDFF1_LDNF1_1(bss, MO_32)
5052DO_LDFF1_LDNF1_1(bdu, MO_64)
5053DO_LDFF1_LDNF1_1(bds, MO_64)
5054
5055DO_LDFF1_LDNF1_2(hh,  MO_16, MO_16)
5056DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16)
5057DO_LDFF1_LDNF1_2(hss, MO_32, MO_16)
5058DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16)
5059DO_LDFF1_LDNF1_2(hds, MO_64, MO_16)
5060
5061DO_LDFF1_LDNF1_2(ss,  MO_32, MO_32)
5062DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32)
5063DO_LDFF1_LDNF1_2(sds, MO_64, MO_32)
5064
5065DO_LDFF1_LDNF1_2(dd,  MO_64, MO_64)
5066
5067#undef DO_LDFF1_LDNF1_1
5068#undef DO_LDFF1_LDNF1_2
5069
5070/*
5071 * Common helper for all contiguous 1,2,3,4-register predicated stores.
5072 */
5073
5074static inline QEMU_ALWAYS_INLINE
5075void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr,
5076               uint32_t desc, const uintptr_t retaddr,
5077               const int esz, const int msz, const int N, uint32_t mtedesc,
5078               sve_ldst1_host_fn *host_fn,
5079               sve_ldst1_tlb_fn *tlb_fn,
5080               sve_cont_ldst_mte_check_fn *mte_check_fn)
5081{
5082    const unsigned rd = simd_data(desc);
5083    const intptr_t reg_max = simd_oprsz(desc);
5084    intptr_t reg_off, reg_last, mem_off;
5085    SVEContLdSt info;
5086    void *host;
5087    int i, flags;
5088
5089    /* Find the active elements.  */
5090    if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
5091        /* The entire predicate was false; no store occurs.  */
5092        return;
5093    }
5094
5095    /* Probe the page(s).  Exit with exception for any invalid page. */
5096    sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr);
5097
5098    /* Handle watchpoints for all active elements. */
5099    sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
5100                              BP_MEM_WRITE, retaddr);
5101
5102    /*
5103     * Handle mte checks for all active elements.
5104     * Since TBI must be set for MTE, !mtedesc => !mte_active.
5105     */
5106    if (mte_check_fn && mtedesc) {
5107        mte_check_fn(&info, env, vg, addr, 1 << esz, N << msz,
5108                     mtedesc, retaddr);
5109    }
5110
5111    flags = info.page[0].flags | info.page[1].flags;
5112    if (unlikely(flags != 0)) {
5113#ifdef CONFIG_USER_ONLY
5114        g_assert_not_reached();
5115#else
5116        /*
5117         * At least one page includes MMIO.
5118         * Any bus operation can fail with cpu_transaction_failed,
5119         * which for ARM will raise SyncExternal.  We cannot avoid
5120         * this fault and will leave with the store incomplete.
5121         */
5122        mem_off = info.mem_off_first[0];
5123        reg_off = info.reg_off_first[0];
5124        reg_last = info.reg_off_last[1];
5125        if (reg_last < 0) {
5126            reg_last = info.reg_off_split;
5127            if (reg_last < 0) {
5128                reg_last = info.reg_off_last[0];
5129            }
5130        }
5131
5132        do {
5133            uint64_t pg = vg[reg_off >> 6];
5134            do {
5135                if ((pg >> (reg_off & 63)) & 1) {
5136                    for (i = 0; i < N; ++i) {
5137                        tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
5138                               addr + mem_off + (i << msz), retaddr);
5139                    }
5140                }
5141                reg_off += 1 << esz;
5142                mem_off += N << msz;
5143            } while (reg_off & 63);
5144        } while (reg_off <= reg_last);
5145        return;
5146#endif
5147    }
5148
5149    mem_off = info.mem_off_first[0];
5150    reg_off = info.reg_off_first[0];
5151    reg_last = info.reg_off_last[0];
5152    host = info.page[0].host;
5153
5154    while (reg_off <= reg_last) {
5155        uint64_t pg = vg[reg_off >> 6];
5156        do {
5157            if ((pg >> (reg_off & 63)) & 1) {
5158                for (i = 0; i < N; ++i) {
5159                    host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5160                            host + mem_off + (i << msz));
5161                }
5162            }
5163            reg_off += 1 << esz;
5164            mem_off += N << msz;
5165        } while (reg_off <= reg_last && (reg_off & 63));
5166    }
5167
5168    /*
5169     * Use the slow path to manage the cross-page misalignment.
5170     * But we know this is RAM and cannot trap.
5171     */
5172    mem_off = info.mem_off_split;
5173    if (unlikely(mem_off >= 0)) {
5174        reg_off = info.reg_off_split;
5175        for (i = 0; i < N; ++i) {
5176            tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
5177                   addr + mem_off + (i << msz), retaddr);
5178        }
5179    }
5180
5181    mem_off = info.mem_off_first[1];
5182    if (unlikely(mem_off >= 0)) {
5183        reg_off = info.reg_off_first[1];
5184        reg_last = info.reg_off_last[1];
5185        host = info.page[1].host;
5186
5187        do {
5188            uint64_t pg = vg[reg_off >> 6];
5189            do {
5190                if ((pg >> (reg_off & 63)) & 1) {
5191                    for (i = 0; i < N; ++i) {
5192                        host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5193                                host + mem_off + (i << msz));
5194                    }
5195                }
5196                reg_off += 1 << esz;
5197                mem_off += N << msz;
5198            } while (reg_off & 63);
5199        } while (reg_off <= reg_last);
5200    }
5201}
5202
5203static inline QEMU_ALWAYS_INLINE
5204void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
5205                   uint32_t desc, const uintptr_t ra,
5206                   const int esz, const int msz, const int N,
5207                   sve_ldst1_host_fn *host_fn,
5208                   sve_ldst1_tlb_fn *tlb_fn)
5209{
5210    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5211    int bit55 = extract64(addr, 55, 1);
5212
5213    /* Remove mtedesc from the normal sve descriptor. */
5214    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5215
5216    /* Perform gross MTE suppression early. */
5217    if (!tbi_check(desc, bit55) ||
5218        tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
5219        mtedesc = 0;
5220    }
5221
5222    sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn,
5223              N == 1 ? sve_cont_ldst_mte_check1 : sve_cont_ldst_mte_checkN);
5224}
5225
5226#define DO_STN_1(N, NAME, ESZ)                                          \
5227void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg,            \
5228                                 target_ulong addr, uint32_t desc)      \
5229{                                                                       \
5230    sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0,            \
5231              sve_st1##NAME##_host, sve_st1##NAME##_tlb, NULL);         \
5232}                                                                       \
5233void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg,        \
5234                                     target_ulong addr, uint32_t desc)  \
5235{                                                                       \
5236    sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N,           \
5237                  sve_st1##NAME##_host, sve_st1##NAME##_tlb);           \
5238}
5239
5240#define DO_STN_2(N, NAME, ESZ, MSZ)                                     \
5241void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg,         \
5242                                    target_ulong addr, uint32_t desc)   \
5243{                                                                       \
5244    sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0,             \
5245              sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb, NULL);   \
5246}                                                                       \
5247void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg,         \
5248                                    target_ulong addr, uint32_t desc)   \
5249{                                                                       \
5250    sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0,             \
5251              sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb, NULL);   \
5252}                                                                       \
5253void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg,     \
5254                                        target_ulong addr, uint32_t desc) \
5255{                                                                       \
5256    sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N,            \
5257                  sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb);     \
5258}                                                                       \
5259void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg,     \
5260                                        target_ulong addr, uint32_t desc) \
5261{                                                                       \
5262    sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N,            \
5263                  sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb);     \
5264}
5265
5266DO_STN_1(1, bb, MO_8)
5267DO_STN_1(1, bh, MO_16)
5268DO_STN_1(1, bs, MO_32)
5269DO_STN_1(1, bd, MO_64)
5270DO_STN_1(2, bb, MO_8)
5271DO_STN_1(3, bb, MO_8)
5272DO_STN_1(4, bb, MO_8)
5273
5274DO_STN_2(1, hh, MO_16, MO_16)
5275DO_STN_2(1, hs, MO_32, MO_16)
5276DO_STN_2(1, hd, MO_64, MO_16)
5277DO_STN_2(2, hh, MO_16, MO_16)
5278DO_STN_2(3, hh, MO_16, MO_16)
5279DO_STN_2(4, hh, MO_16, MO_16)
5280
5281DO_STN_2(1, ss, MO_32, MO_32)
5282DO_STN_2(1, sd, MO_64, MO_32)
5283DO_STN_2(2, ss, MO_32, MO_32)
5284DO_STN_2(3, ss, MO_32, MO_32)
5285DO_STN_2(4, ss, MO_32, MO_32)
5286
5287DO_STN_2(1, dd, MO_64, MO_64)
5288DO_STN_2(2, dd, MO_64, MO_64)
5289DO_STN_2(3, dd, MO_64, MO_64)
5290DO_STN_2(4, dd, MO_64, MO_64)
5291
5292#undef DO_STN_1
5293#undef DO_STN_2
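
/*
 * For reference (illustrative): DO_STN_1(1, bh, MO_16) above provides
 * HELPER(sve_st1bh_r), a single-register store that writes the low byte
 * of each 16-bit element to memory, while DO_STN_2(4, dd, MO_64, MO_64)
 * provides the 4-register interleaved doubleword stores in both
 * endiannesses, each with a matching _mte variant.
 */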
5294
5295/*
5296 * Loads with a vector index.
5297 */
5298
5299/*
5300 * Load the element at @reg + @reg_ofs, sign or zero-extend as needed.
5301 */
5302typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs);
5303
5304static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs)
5305{
5306    return *(uint32_t *)(reg + H1_4(reg_ofs));
5307}
5308
5309static target_ulong off_zss_s(void *reg, intptr_t reg_ofs)
5310{
5311    return *(int32_t *)(reg + H1_4(reg_ofs));
5312}
5313
5314static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs)
5315{
5316    return (uint32_t)*(uint64_t *)(reg + reg_ofs);
5317}
5318
5319static target_ulong off_zss_d(void *reg, intptr_t reg_ofs)
5320{
5321    return (int32_t)*(uint64_t *)(reg + reg_ofs);
5322}
5323
5324static target_ulong off_zd_d(void *reg, intptr_t reg_ofs)
5325{
5326    return *(uint64_t *)(reg + reg_ofs);
5327}
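
/*
 * Illustrative example: if a 32-bit offset element holds 0xfffffff0,
 * the off_zss_* (signed) extractors return (target_ulong)-16 while the
 * off_zsu_* (unsigned) extractors return 0xfffffff0; the callers below
 * then form each address as base + (offset << scale).
 */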
5328
5329static inline QEMU_ALWAYS_INLINE
5330void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
5331               target_ulong base, uint32_t desc, uintptr_t retaddr,
5332               uint32_t mtedesc, int esize, int msize,
5333               zreg_off_fn *off_fn,
5334               sve_ldst1_host_fn *host_fn,
5335               sve_ldst1_tlb_fn *tlb_fn)
5336{
5337    const int mmu_idx = cpu_mmu_index(env, false);
5338    const intptr_t reg_max = simd_oprsz(desc);
5339    const int scale = simd_data(desc);
5340    ARMVectorReg scratch;
5341    intptr_t reg_off;
5342    SVEHostPage info, info2;
5343
5344    memset(&scratch, 0, reg_max);
5345    reg_off = 0;
5346    do {
5347        uint64_t pg = vg[reg_off >> 6];
5348        do {
5349            if (likely(pg & 1)) {
5350                target_ulong addr = base + (off_fn(vm, reg_off) << scale);
5351                target_ulong in_page = -(addr | TARGET_PAGE_MASK);
5352
5353                sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD,
5354                               mmu_idx, retaddr);
5355
5356                if (likely(in_page >= msize)) {
5357                    if (unlikely(info.flags & TLB_WATCHPOINT)) {
5358                        cpu_check_watchpoint(env_cpu(env), addr, msize,
5359                                             info.attrs, BP_MEM_READ, retaddr);
5360                    }
5361                    if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
5362                        mte_check1(env, mtedesc, addr, retaddr);
5363                    }
5364                    host_fn(&scratch, reg_off, info.host);
5365                } else {
5366                    /* Element crosses the page boundary. */
5367                    sve_probe_page(&info2, false, env, addr + in_page, 0,
5368                                   MMU_DATA_LOAD, mmu_idx, retaddr);
5369                    if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) {
5370                        cpu_check_watchpoint(env_cpu(env), addr,
5371                                             msize, info.attrs,
5372                                             BP_MEM_READ, retaddr);
5373                    }
5374                    if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
5375                        mte_check1(env, mtedesc, addr, retaddr);
5376                    }
5377                    tlb_fn(env, &scratch, reg_off, addr, retaddr);
5378                }
5379            }
5380            reg_off += esize;
5381            pg >>= esize;
5382        } while (reg_off & 63);
5383    } while (reg_off < reg_max);
5384
5385    /* Wait until all exceptions have been raised to write back.  */
5386    memcpy(vd, &scratch, reg_max);
5387}
5388
5389static inline QEMU_ALWAYS_INLINE
5390void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
5391                   target_ulong base, uint32_t desc, uintptr_t retaddr,
5392                   int esize, int msize, zreg_off_fn *off_fn,
5393                   sve_ldst1_host_fn *host_fn,
5394                   sve_ldst1_tlb_fn *tlb_fn)
5395{
5396    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5397    /* Remove mtedesc from the normal sve descriptor. */
5398    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5399
5400    /*
5401     * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
5402     * offset base entirely over the address space hole to change the
5403     * pointer tag, or change the bit55 selector.  So we could examine
5404     * TBI + TCMA here, as we do for sve_ldN_r_mte().
5405     */
5406    sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
5407              esize, msize, off_fn, host_fn, tlb_fn);
5408}
5409
5410#define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \
5411void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,       \
5412                                 void *vm, target_ulong base, uint32_t desc) \
5413{                                                                            \
5414    sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ,          \
5415              off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);       \
5416}                                                                            \
5417void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
5418     void *vm, target_ulong base, uint32_t desc)                             \
5419{                                                                            \
5420    sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ,         \
5421                  off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);   \
5422}
5423
5424#define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \
5425void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,       \
5426                                 void *vm, target_ulong base, uint32_t desc) \
5427{                                                                            \
5428    sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ,          \
5429              off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);       \
5430}                                                                            \
5431void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
5432    void *vm, target_ulong base, uint32_t desc)                              \
5433{                                                                            \
5434    sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ,         \
5435                  off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);   \
5436}
5437
5438DO_LD1_ZPZ_S(bsu, zsu, MO_8)
5439DO_LD1_ZPZ_S(bsu, zss, MO_8)
5440DO_LD1_ZPZ_D(bdu, zsu, MO_8)
5441DO_LD1_ZPZ_D(bdu, zss, MO_8)
5442DO_LD1_ZPZ_D(bdu, zd, MO_8)
5443
5444DO_LD1_ZPZ_S(bss, zsu, MO_8)
5445DO_LD1_ZPZ_S(bss, zss, MO_8)
5446DO_LD1_ZPZ_D(bds, zsu, MO_8)
5447DO_LD1_ZPZ_D(bds, zss, MO_8)
5448DO_LD1_ZPZ_D(bds, zd, MO_8)
5449
5450DO_LD1_ZPZ_S(hsu_le, zsu, MO_16)
5451DO_LD1_ZPZ_S(hsu_le, zss, MO_16)
5452DO_LD1_ZPZ_D(hdu_le, zsu, MO_16)
5453DO_LD1_ZPZ_D(hdu_le, zss, MO_16)
5454DO_LD1_ZPZ_D(hdu_le, zd, MO_16)
5455
5456DO_LD1_ZPZ_S(hsu_be, zsu, MO_16)
5457DO_LD1_ZPZ_S(hsu_be, zss, MO_16)
5458DO_LD1_ZPZ_D(hdu_be, zsu, MO_16)
5459DO_LD1_ZPZ_D(hdu_be, zss, MO_16)
5460DO_LD1_ZPZ_D(hdu_be, zd, MO_16)
5461
5462DO_LD1_ZPZ_S(hss_le, zsu, MO_16)
5463DO_LD1_ZPZ_S(hss_le, zss, MO_16)
5464DO_LD1_ZPZ_D(hds_le, zsu, MO_16)
5465DO_LD1_ZPZ_D(hds_le, zss, MO_16)
5466DO_LD1_ZPZ_D(hds_le, zd, MO_16)
5467
5468DO_LD1_ZPZ_S(hss_be, zsu, MO_16)
5469DO_LD1_ZPZ_S(hss_be, zss, MO_16)
5470DO_LD1_ZPZ_D(hds_be, zsu, MO_16)
5471DO_LD1_ZPZ_D(hds_be, zss, MO_16)
5472DO_LD1_ZPZ_D(hds_be, zd, MO_16)
5473
5474DO_LD1_ZPZ_S(ss_le, zsu, MO_32)
5475DO_LD1_ZPZ_S(ss_le, zss, MO_32)
5476DO_LD1_ZPZ_D(sdu_le, zsu, MO_32)
5477DO_LD1_ZPZ_D(sdu_le, zss, MO_32)
5478DO_LD1_ZPZ_D(sdu_le, zd, MO_32)
5479
5480DO_LD1_ZPZ_S(ss_be, zsu, MO_32)
5481DO_LD1_ZPZ_S(ss_be, zss, MO_32)
5482DO_LD1_ZPZ_D(sdu_be, zsu, MO_32)
5483DO_LD1_ZPZ_D(sdu_be, zss, MO_32)
5484DO_LD1_ZPZ_D(sdu_be, zd, MO_32)
5485
5486DO_LD1_ZPZ_D(sds_le, zsu, MO_32)
5487DO_LD1_ZPZ_D(sds_le, zss, MO_32)
5488DO_LD1_ZPZ_D(sds_le, zd, MO_32)
5489
5490DO_LD1_ZPZ_D(sds_be, zsu, MO_32)
5491DO_LD1_ZPZ_D(sds_be, zss, MO_32)
5492DO_LD1_ZPZ_D(sds_be, zd, MO_32)
5493
5494DO_LD1_ZPZ_D(dd_le, zsu, MO_64)
5495DO_LD1_ZPZ_D(dd_le, zss, MO_64)
5496DO_LD1_ZPZ_D(dd_le, zd, MO_64)
5497
5498DO_LD1_ZPZ_D(dd_be, zsu, MO_64)
5499DO_LD1_ZPZ_D(dd_be, zss, MO_64)
5500DO_LD1_ZPZ_D(dd_be, zd, MO_64)
5501
5502#undef DO_LD1_ZPZ_S
5503#undef DO_LD1_ZPZ_D
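
/*
 * Naming note for the gather helpers above (informal): the MEM suffix
 * names the memory/element combination, e.g. "hdu_le" is a halfword
 * loaded little-endian and zero-extended into a doubleword element,
 * while the OFS suffix names the index form: "zsu"/"zss" are 32-bit
 * unsigned/signed vector offsets and "zd" is 64-bit offsets.  Thus
 * sve_ldhdu_le_zss gathers little-endian halfwords into 64-bit lanes
 * using sign-extended 32-bit offsets.
 */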
5504
5505/* First fault loads with a vector index.  */
5506
5507/*
5508 * Common helpers for all gather first-faulting loads.
5509 */
5510
5511static inline QEMU_ALWAYS_INLINE
5512void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
5513                 target_ulong base, uint32_t desc, uintptr_t retaddr,
5514                 uint32_t mtedesc, const int esz, const int msz,
5515                 zreg_off_fn *off_fn,
5516                 sve_ldst1_host_fn *host_fn,
5517                 sve_ldst1_tlb_fn *tlb_fn)
5518{
5519    const int mmu_idx = cpu_mmu_index(env, false);
5520    const intptr_t reg_max = simd_oprsz(desc);
5521    const int scale = simd_data(desc);
5522    const int esize = 1 << esz;
5523    const int msize = 1 << msz;
5524    intptr_t reg_off;
5525    SVEHostPage info;
5526    target_ulong addr, in_page;
5527
5528    /* Skip to the first true predicate.  */
5529    reg_off = find_next_active(vg, 0, reg_max, esz);
5530    if (unlikely(reg_off >= reg_max)) {
5531        /* The entire predicate was false; no load occurs.  */
5532        memset(vd, 0, reg_max);
5533        return;
5534    }
5535
5536    /*
5537     * Probe the first element, allowing faults.
5538     */
5539    addr = base + (off_fn(vm, reg_off) << scale);
5540    if (mtedesc) {
5541        mte_check1(env, mtedesc, addr, retaddr);
5542    }
5543    tlb_fn(env, vd, reg_off, addr, retaddr);
5544
5545    /* After any fault, zero the other elements. */
5546    swap_memzero(vd, reg_off);
5547    reg_off += esize;
5548    swap_memzero(vd + reg_off, reg_max - reg_off);
5549
5550    /*
5551     * Probe the remaining elements, not allowing faults.
5552     */
5553    while (reg_off < reg_max) {
5554        uint64_t pg = vg[reg_off >> 6];
5555        do {
5556            if (likely((pg >> (reg_off & 63)) & 1)) {
5557                addr = base + (off_fn(vm, reg_off) << scale);
5558                in_page = -(addr | TARGET_PAGE_MASK);
5559
5560                if (unlikely(in_page < msize)) {
5561                    /* Stop if the element crosses a page boundary. */
5562                    goto fault;
5563                }
5564
5565                sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD,
5566                               mmu_idx, retaddr);
5567                if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) {
5568                    goto fault;
5569                }
5570                if (unlikely(info.flags & TLB_WATCHPOINT) &&
5571                    (cpu_watchpoint_address_matches
5572                     (env_cpu(env), addr, msize) & BP_MEM_READ)) {
5573                    goto fault;
5574                }
5575                if (mtedesc &&
5576                    arm_tlb_mte_tagged(&info.attrs) &&
5577                    !mte_probe1(env, mtedesc, addr)) {
5578                    goto fault;
5579                }
5580
5581                host_fn(vd, reg_off, info.host);
5582            }
5583            reg_off += esize;
5584        } while (reg_off & 63);
5585    }
5586    return;
5587
5588 fault:
5589    record_fault(env, reg_off, reg_max);
5590}
5591
5592static inline QEMU_ALWAYS_INLINE
5593void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
5594                     target_ulong base, uint32_t desc, uintptr_t retaddr,
5595                     const int esz, const int msz,
5596                     zreg_off_fn *off_fn,
5597                     sve_ldst1_host_fn *host_fn,
5598                     sve_ldst1_tlb_fn *tlb_fn)
5599{
5600    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5601    /* Remove mtedesc from the normal sve descriptor. */
5602    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5603
5604    /*
5605     * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
5606     * offset base entirely over the address space hole to change the
5607     * pointer tag, or change the bit55 selector.  So we could examine
5608     * TBI + TCMA here, as we do for sve_ldN_r_mte().
5609     */
5610    sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
5611                esz, msz, off_fn, host_fn, tlb_fn);
5612}

#define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ)                                   \
void HELPER(sve_ldff##MEM##_##OFS)                                      \
    (CPUARMState *env, void *vd, void *vg,                              \
     void *vm, target_ulong base, uint32_t desc)                        \
{                                                                       \
    sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ,    \
                off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}                                                                       \
void HELPER(sve_ldff##MEM##_##OFS##_mte)                                \
    (CPUARMState *env, void *vd, void *vg,                              \
     void *vm, target_ulong base, uint32_t desc)                        \
{                                                                       \
    sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ,   \
                    off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}

#define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ)                                   \
void HELPER(sve_ldff##MEM##_##OFS)                                      \
    (CPUARMState *env, void *vd, void *vg,                              \
     void *vm, target_ulong base, uint32_t desc)                        \
{                                                                       \
    sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ,    \
                off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}                                                                       \
void HELPER(sve_ldff##MEM##_##OFS##_mte)                                \
    (CPUARMState *env, void *vd, void *vg,                              \
     void *vm, target_ulong base, uint32_t desc)                        \
{                                                                       \
    sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ,   \
                    off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}
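
/*
 * As a concrete expansion of the macros above, DO_LDFF1_ZPZ_S(bsu, zsu,
 * MO_8) emits the helpers sve_ldffbsu_zsu and sve_ldffbsu_zsu_mte, both
 * using off_zsu_s for the offsets and sve_ld1bsu_host/sve_ld1bsu_tlb for
 * the actual load: a one-byte memory element widened into a 32-bit vector
 * element.  Roughly, the first name component encodes the memory size and
 * extension ("bsu" = byte zero-extended to 32 bits, "hds_le" =
 * little-endian halfword sign-extended to 64 bits) and the suffix encodes
 * the offset form ("zsu"/"zss" = unsigned/signed 32-bit offsets, "zd" =
 * 64-bit offsets).
 */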

DO_LDFF1_ZPZ_S(bsu, zsu, MO_8)
DO_LDFF1_ZPZ_S(bsu, zss, MO_8)
DO_LDFF1_ZPZ_D(bdu, zsu, MO_8)
DO_LDFF1_ZPZ_D(bdu, zss, MO_8)
DO_LDFF1_ZPZ_D(bdu, zd, MO_8)

DO_LDFF1_ZPZ_S(bss, zsu, MO_8)
DO_LDFF1_ZPZ_S(bss, zss, MO_8)
DO_LDFF1_ZPZ_D(bds, zsu, MO_8)
DO_LDFF1_ZPZ_D(bds, zss, MO_8)
DO_LDFF1_ZPZ_D(bds, zd, MO_8)

DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16)
DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16)
DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16)
DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16)
DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16)

DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16)
DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16)
DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16)
DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16)
DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16)

DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16)
DO_LDFF1_ZPZ_S(hss_le, zss, MO_16)
DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16)
DO_LDFF1_ZPZ_D(hds_le, zss, MO_16)
DO_LDFF1_ZPZ_D(hds_le, zd, MO_16)

DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16)
DO_LDFF1_ZPZ_S(hss_be, zss, MO_16)
DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16)
DO_LDFF1_ZPZ_D(hds_be, zss, MO_16)
DO_LDFF1_ZPZ_D(hds_be, zd, MO_16)

DO_LDFF1_ZPZ_S(ss_le,  zsu, MO_32)
DO_LDFF1_ZPZ_S(ss_le,  zss, MO_32)
DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32)
DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32)
DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32)

DO_LDFF1_ZPZ_S(ss_be,  zsu, MO_32)
DO_LDFF1_ZPZ_S(ss_be,  zss, MO_32)
DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32)
DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32)
DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32)

DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32)
DO_LDFF1_ZPZ_D(sds_le, zss, MO_32)
DO_LDFF1_ZPZ_D(sds_le, zd, MO_32)

DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32)
DO_LDFF1_ZPZ_D(sds_be, zss, MO_32)
DO_LDFF1_ZPZ_D(sds_be, zd, MO_32)

DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64)
DO_LDFF1_ZPZ_D(dd_le, zss, MO_64)
DO_LDFF1_ZPZ_D(dd_le, zd, MO_64)

DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64)
DO_LDFF1_ZPZ_D(dd_be, zss, MO_64)
DO_LDFF1_ZPZ_D(dd_be, zd, MO_64)
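
/*
 * Note the shape of the list above: the 32-bit destination (_S) forms are
 * only instantiated for the 32-bit offset encodings zsu and zss, while the
 * 64-bit destination (_D) forms also accept zd 64-bit offsets.  Signed
 * 32-bit memory elements (sds) and 64-bit memory elements (dd) appear only
 * as _D variants, since a 32-bit destination element could not hold the
 * extended (or full 64-bit) value.
 */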

/* Stores with a vector index.  */

static inline QEMU_ALWAYS_INLINE
void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
               target_ulong base, uint32_t desc, uintptr_t retaddr,
               uint32_t mtedesc, int esize, int msize,
               zreg_off_fn *off_fn,
               sve_ldst1_host_fn *host_fn,
               sve_ldst1_tlb_fn *tlb_fn)
{
    const int mmu_idx = cpu_mmu_index(env, false);
    const intptr_t reg_max = simd_oprsz(desc);
    const int scale = simd_data(desc);
    void *host[ARM_MAX_VQ * 4];
    intptr_t reg_off, i;
    SVEHostPage info, info2;

    /*
     * Probe all of the elements for host addresses and flags.
     */
    i = reg_off = 0;
    do {
        uint64_t pg = vg[reg_off >> 6];
        do {
            target_ulong addr = base + (off_fn(vm, reg_off) << scale);
            target_ulong in_page = -(addr | TARGET_PAGE_MASK);

            host[i] = NULL;
            if (likely((pg >> (reg_off & 63)) & 1)) {
                if (likely(in_page >= msize)) {
                    sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE,
                                   mmu_idx, retaddr);
                    host[i] = info.host;
                } else {
                    /*
                     * Element crosses the page boundary.
                     * Probe both pages, but do not record the host address,
                     * so that we use the slow path.
                     */
                    sve_probe_page(&info, false, env, addr, 0,
                                   MMU_DATA_STORE, mmu_idx, retaddr);
                    sve_probe_page(&info2, false, env, addr + in_page, 0,
                                   MMU_DATA_STORE, mmu_idx, retaddr);
                    info.flags |= info2.flags;
                }

                if (unlikely(info.flags & TLB_WATCHPOINT)) {
                    cpu_check_watchpoint(env_cpu(env), addr, msize,
                                         info.attrs, BP_MEM_WRITE, retaddr);
                }

                if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
                    mte_check1(env, mtedesc, addr, retaddr);
                }
            }
            i += 1;
            reg_off += esize;
        } while (reg_off & 63);
    } while (reg_off < reg_max);

    /*
     * Now that we have recognized all exceptions except SyncExternal
     * (from TLB_MMIO), which we cannot avoid, perform all of the stores.
     *
     * Note for the common case of an element in RAM, not crossing a page
     * boundary, we have stored the host address in host[].  This doubles
     * as a first-level check against the predicate, since only enabled
     * elements have non-null host addresses.
     */
    i = reg_off = 0;
    do {
        void *h = host[i];
        if (likely(h != NULL)) {
            host_fn(vd, reg_off, h);
        } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) {
            target_ulong addr = base + (off_fn(vm, reg_off) << scale);
            tlb_fn(env, vd, reg_off, addr, retaddr);
        }
        i += 1;
        reg_off += esize;
    } while (reg_off < reg_max);
}
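
/*
 * The two passes above give the gather store an all-or-nothing flavour:
 * every enabled element is probed, watchpoint-checked and (when enabled)
 * MTE-checked before a single byte is written, so apart from the
 * SyncExternal case already noted, any fault is raised with memory still
 * unmodified.  The host[] array needs one slot per element: with at most
 * ARM_MAX_VQ quadwords (ARM_MAX_VQ * 16 bytes) of vector and a minimum
 * element size of 4 bytes for these helpers, ARM_MAX_VQ * 4 entries are
 * enough.
 */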

static inline QEMU_ALWAYS_INLINE
void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
                   target_ulong base, uint32_t desc, uintptr_t retaddr,
                   int esize, int msize, zreg_off_fn *off_fn,
                   sve_ldst1_host_fn *host_fn,
                   sve_ldst1_tlb_fn *tlb_fn)
{
    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
    /* Remove mtedesc from the normal sve descriptor. */
    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);

    /*
     * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
     * offset base entirely over the address space hole to change the
     * pointer tag, or change the bit55 selector.  So we could here
     * examine TBI + TCMA like we do for sve_ldN_r_mte().
     */
    sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
              esize, msize, off_fn, host_fn, tlb_fn);
}

#define DO_ST1_ZPZ_S(MEM, OFS, MSZ)                                     \
void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,  \
                                 void *vm, target_ulong base, uint32_t desc) \
{                                                                       \
    sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ,     \
              off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb);  \
}                                                                       \
void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
    void *vm, target_ulong base, uint32_t desc)                         \
{                                                                       \
    sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ,    \
                  off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
}

#define DO_ST1_ZPZ_D(MEM, OFS, MSZ)                                     \
void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,  \
                                 void *vm, target_ulong base, uint32_t desc) \
{                                                                       \
    sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ,     \
              off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb);  \
}                                                                       \
void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
    void *vm, target_ulong base, uint32_t desc)                         \
{                                                                       \
    sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ,    \
                  off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
}
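
/*
 * Unlike the gather-load macros earlier, which forward log2 sizes (MO_32
 * or MO_64 for the element, MSZ for memory) to sve_ldff1_z(), these store
 * macros pass byte counts, matching sve_st1_z()'s esize/msize parameters:
 * 4 or 8 for the vector element and 1 << MSZ for the memory access.  For
 * example, DO_ST1_ZPZ_S(bs, zsu, MO_8) emits sve_stbs_zsu, which stores
 * the low byte of each 32-bit element at unsigned 32-bit offsets.
 */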

DO_ST1_ZPZ_S(bs, zsu, MO_8)
DO_ST1_ZPZ_S(hs_le, zsu, MO_16)
DO_ST1_ZPZ_S(hs_be, zsu, MO_16)
DO_ST1_ZPZ_S(ss_le, zsu, MO_32)
DO_ST1_ZPZ_S(ss_be, zsu, MO_32)

DO_ST1_ZPZ_S(bs, zss, MO_8)
DO_ST1_ZPZ_S(hs_le, zss, MO_16)
DO_ST1_ZPZ_S(hs_be, zss, MO_16)
DO_ST1_ZPZ_S(ss_le, zss, MO_32)
DO_ST1_ZPZ_S(ss_be, zss, MO_32)

DO_ST1_ZPZ_D(bd, zsu, MO_8)
DO_ST1_ZPZ_D(hd_le, zsu, MO_16)
DO_ST1_ZPZ_D(hd_be, zsu, MO_16)
DO_ST1_ZPZ_D(sd_le, zsu, MO_32)
DO_ST1_ZPZ_D(sd_be, zsu, MO_32)
DO_ST1_ZPZ_D(dd_le, zsu, MO_64)
DO_ST1_ZPZ_D(dd_be, zsu, MO_64)

DO_ST1_ZPZ_D(bd, zss, MO_8)
DO_ST1_ZPZ_D(hd_le, zss, MO_16)
DO_ST1_ZPZ_D(hd_be, zss, MO_16)
DO_ST1_ZPZ_D(sd_le, zss, MO_32)
DO_ST1_ZPZ_D(sd_be, zss, MO_32)
DO_ST1_ZPZ_D(dd_le, zss, MO_64)
DO_ST1_ZPZ_D(dd_be, zss, MO_64)

DO_ST1_ZPZ_D(bd, zd, MO_8)
DO_ST1_ZPZ_D(hd_le, zd, MO_16)
DO_ST1_ZPZ_D(hd_be, zd, MO_16)
DO_ST1_ZPZ_D(sd_le, zd, MO_32)
DO_ST1_ZPZ_D(sd_be, zd, MO_32)
DO_ST1_ZPZ_D(dd_le, zd, MO_64)
DO_ST1_ZPZ_D(dd_be, zd, MO_64)

#undef DO_ST1_ZPZ_S
#undef DO_ST1_ZPZ_D
