qemu/target/arm/sve_helper.c
/*
 * ARM SVE Operations
 *
 * Copyright (c) 2018 Linaro, Ltd.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/exec-all.h"
#include "exec/cpu_ldst.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"


/* Note that vector data is stored in host-endian 64-bit chunks,
   so addressing units smaller than that need a host-endian fixup.  */
#ifdef HOST_WORDS_BIGENDIAN
#define H1(x)   ((x) ^ 7)
#define H1_2(x) ((x) ^ 6)
#define H1_4(x) ((x) ^ 4)
#define H2(x)   ((x) ^ 3)
#define H4(x)   ((x) ^ 1)
#else
#define H1(x)   (x)
#define H1_2(x) (x)
#define H1_4(x) (x)
#define H2(x)   (x)
#define H4(x)   (x)
#endif
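
/* For example (editor's illustration): on a big-endian host, byte element 0
 * lives in the most significant byte of the first 64-bit chunk, so H1(0) == 7
 * selects the right host address; on a little-endian host H1 is the identity.
 */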

/* Return a value for NZCV as per the ARM PredTest pseudofunction.
 *
 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
 * and bit 0 set if C is set.  Compare the definitions of these variables
 * within CPUARMState.
 */

/* For no G bits set, NZCV = C.  */
#define PREDTEST_INIT  1
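
/* Worked example (editor's illustration): for a single predicate word with
 * D = G = 1, iter_predtest_fwd(1, 1, PREDTEST_INIT) sets N (the first active
 * element of D is true), sets bit 1 (some active element is true, so Z is
 * clear) and leaves C clear (the last active element is true), returning
 * 0x80000006; bit 2 is the internal "first G bit seen" marker.
 */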

/* This is an iterative function, called for each Pd and Pg word
 * moving forward.
 */
static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
{
    if (likely(g)) {
        /* Compute N from first D & G.
           Use bit 2 to signal first G bit seen.  */
        if (!(flags & 4)) {
            flags |= ((d & (g & -g)) != 0) << 31;
            flags |= 4;
        }

        /* Accumulate Z from each D & G.  */
        flags |= ((d & g) != 0) << 1;

        /* Compute C from last !(D & G).  Replace previous.  */
        flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
    }
    return flags;
}

/* This is an iterative function, called for each Pd and Pg word
 * moving backward.
 */
static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
{
    if (likely(g)) {
        /* Compute C from first (i.e last) !(D & G).
           Use bit 2 to signal first G bit seen.  */
        if (!(flags & 4)) {
            flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
            flags |= (d & pow2floor(g)) == 0;
        }

        /* Accumulate Z from each D & G.  */
        flags |= ((d & g) != 0) << 1;

        /* Compute N from last (i.e first) D & G.  Replace previous.  */
        flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
    }
    return flags;
}

/* The same for a single word predicate.  */
uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
{
    return iter_predtest_fwd(d, g, PREDTEST_INIT);
}

/* The same for a multi-word predicate.  */
uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
{
    uint32_t flags = PREDTEST_INIT;
    uint64_t *d = vd, *g = vg;
    uintptr_t i = 0;

    do {
        flags = iter_predtest_fwd(d[i], g[i], flags);
    } while (++i < words);

    return flags;
}

/* Expand active predicate bits to bytes, for byte elements.
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      for (j = 0; j < 8; j++) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfful << (j << 3);
 *          }
 *      }
 *      printf("0x%016lx,\n", m);
 *  }
 */
static inline uint64_t expand_pred_b(uint8_t byte)
{
    static const uint64_t word[256] = {
        0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
        0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
        0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
        0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
        0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
        0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
        0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
        0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
        0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
        0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
        0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
        0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
        0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
        0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
        0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
        0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
        0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
        0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
        0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
        0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
        0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
        0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
        0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
        0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
        0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
        0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
        0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
        0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
        0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
        0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
        0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
        0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
        0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
        0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
        0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
        0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
        0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
        0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
        0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
        0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
        0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
        0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
        0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
        0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
        0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
        0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
        0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
        0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
        0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
        0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
        0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
        0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
        0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
        0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
        0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
        0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
        0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
        0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
        0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
        0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
        0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
        0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
        0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
        0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
        0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
        0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
        0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
        0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
        0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
        0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
        0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
        0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
        0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
        0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
        0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
        0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
        0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
        0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
        0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
        0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
        0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
        0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
        0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
        0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
        0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
        0xffffffffffffffff,
    };
    return word[byte];
}
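
/* E.g. (editor's illustration) expand_pred_b(0xa5) == 0xff00ff0000ff00ff:
 * predicate bits 0, 2, 5 and 7 are active, so bytes 0, 2, 5 and 7 of the
 * resulting mask are set.
 */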

/* Similarly for half-word elements.
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      if (i & 0xaa) {
 *          continue;
 *      }
 *      for (j = 0; j < 8; j += 2) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfffful << (j << 3);
 *          }
 *      }
 *      printf("[0x%x] = 0x%016lx,\n", i, m);
 *  }
 */
static inline uint64_t expand_pred_h(uint8_t byte)
{
    static const uint64_t word[] = {
        [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
        [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
        [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
        [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
        [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
        [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
        [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
        [0x55] = 0xffffffffffffffff,
    };
    return word[byte & 0x55];
}
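
/* Only bits 0, 2, 4 and 6 are significant here; e.g. (editor's illustration)
 * expand_pred_h(0x33) == expand_pred_h(0x11) == 0x0000ffff0000ffff, the odd
 * predicate bits being discarded by the 0x55 mask.
 */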

/* Similarly for single word elements.  */
static inline uint64_t expand_pred_s(uint8_t byte)
{
    static const uint64_t word[] = {
        [0x01] = 0x00000000ffffffffull,
        [0x10] = 0xffffffff00000000ull,
        [0x11] = 0xffffffffffffffffull,
    };
    return word[byte & 0x11];
}

/* Swap 16-bit words within a 32-bit word.  */
static inline uint32_t hswap32(uint32_t h)
{
    return rol32(h, 16);
}

/* Swap 16-bit words within a 64-bit word.  */
static inline uint64_t hswap64(uint64_t h)
{
    uint64_t m = 0x0000ffff0000ffffull;
    h = rol64(h, 32);
    return ((h & m) << 16) | ((h >> 16) & m);
}

/* Swap 32-bit words within a 64-bit word.  */
static inline uint64_t wswap64(uint64_t h)
{
    return rol64(h, 32);
}

#define LOGICAL_PPPP(NAME, FUNC) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
{                                                                         \
    uintptr_t opr_sz = simd_oprsz(desc);                                  \
    uint64_t *d = vd, *n = vn, *m = vm, *g = vg;                          \
    uintptr_t i;                                                          \
    for (i = 0; i < opr_sz / 8; ++i) {                                    \
        d[i] = FUNC(n[i], m[i], g[i]);                                    \
    }                                                                     \
}

#define DO_AND(N, M, G)  (((N) & (M)) & (G))
#define DO_BIC(N, M, G)  (((N) & ~(M)) & (G))
#define DO_EOR(N, M, G)  (((N) ^ (M)) & (G))
#define DO_ORR(N, M, G)  (((N) | (M)) & (G))
#define DO_ORN(N, M, G)  (((N) | ~(M)) & (G))
#define DO_NOR(N, M, G)  (~((N) | (M)) & (G))
#define DO_NAND(N, M, G) (~((N) & (M)) & (G))
#define DO_SEL(N, M, G)  (((N) & (G)) | ((M) & ~(G)))

LOGICAL_PPPP(sve_and_pppp, DO_AND)
LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
LOGICAL_PPPP(sve_nand_pppp, DO_NAND)

#undef DO_AND
#undef DO_BIC
#undef DO_EOR
#undef DO_ORR
#undef DO_ORN
#undef DO_NOR
#undef DO_NAND
#undef DO_SEL
#undef LOGICAL_PPPP

/* Fully general three-operand expander, controlled by a predicate.
 * This is complicated by the host-endian storage of the register file.
 */
/* ??? I don't expect the compiler could ever vectorize this itself.
 * With some tables we can convert bit masks to byte masks, and with
 * extra care wrt byte/word ordering we could use gcc generic vectors
 * and do 16 bytes at a time.
 */
#define DO_ZPZZ(NAME, TYPE, H, OP)                                       \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);                              \
    for (i = 0; i < opr_sz; ) {                                         \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
        do {                                                            \
            if (pg & 1) {                                               \
                TYPE nn = *(TYPE *)(vn + H(i));                         \
                TYPE mm = *(TYPE *)(vm + H(i));                         \
                *(TYPE *)(vd + H(i)) = OP(nn, mm);                      \
            }                                                           \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
        } while (i & 15);                                               \
    }                                                                   \
}
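
/* Editor's note on the predicate walk: a 16-bit chunk of the predicate covers
 * 16 bytes of vector data, one bit per byte; advancing pg by sizeof(TYPE) per
 * element means only the low bit of each element's first byte is tested,
 * which matches the architectural predicate layout.
 */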

/* Similarly, specialized for 64-bit operands.  */
#define DO_ZPZZ_D(NAME, TYPE, OP)                                \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
    TYPE *d = vd, *n = vn, *m = vm;                             \
    uint8_t *pg = vg;                                           \
    for (i = 0; i < opr_sz; i += 1) {                           \
        if (pg[H1(i)] & 1) {                                    \
            TYPE nn = n[i], mm = m[i];                          \
            d[i] = OP(nn, mm);                                  \
        }                                                       \
    }                                                           \
}

#define DO_AND(N, M)  (N & M)
#define DO_EOR(N, M)  (N ^ M)
#define DO_ORR(N, M)  (N | M)
#define DO_BIC(N, M)  (N & ~M)
#define DO_ADD(N, M)  (N + M)
#define DO_SUB(N, M)  (N - M)
#define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
#define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
#define DO_ABD(N, M)  ((N) >= (M) ? (N) - (M) : (M) - (N))
#define DO_MUL(N, M)  (N * M)


/*
 * We must avoid the C undefined behaviour cases: division by
 * zero and signed division of INT_MIN by -1. Both of these
 * have architecturally defined required results for Arm.
 * We special case all signed divisions by -1 to avoid having
 * to deduce the minimum integer for the type involved.
 */
#define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
#define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
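
/* E.g. (editor's illustration) DO_SDIV(INT32_MIN, -1) evaluates -N, which
 * wraps to INT32_MIN, the architecturally required result (QEMU builds with
 * -fwrapv, so signed wrap-around is well defined), and both DO_SDIV(n, 0)
 * and DO_UDIV(n, 0) yield 0 without ever evaluating N / M.
 */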

DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)

DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)

DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)

DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)

DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)

DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)

DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)

DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)

DO_ZPZZ(sve_smin_zpzz_b, int8_t,  H1, DO_MIN)
DO_ZPZZ(sve_smin_zpzz_h, int16_t,  H1_2, DO_MIN)
DO_ZPZZ(sve_smin_zpzz_s, int32_t,  H1_4, DO_MIN)
DO_ZPZZ_D(sve_smin_zpzz_d, int64_t,  DO_MIN)

DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)

DO_ZPZZ(sve_sabd_zpzz_b, int8_t,  H1, DO_ABD)
DO_ZPZZ(sve_sabd_zpzz_h, int16_t,  H1_2, DO_ABD)
DO_ZPZZ(sve_sabd_zpzz_s, int32_t,  H1_4, DO_ABD)
DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t,  DO_ABD)

DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)

/* Because the computation type is at least twice as large as required,
   these work for both signed and unsigned source types.  */
static inline uint8_t do_mulh_b(int32_t n, int32_t m)
{
    return (n * m) >> 8;
}

static inline uint16_t do_mulh_h(int32_t n, int32_t m)
{
    return (n * m) >> 16;
}

static inline uint32_t do_mulh_s(int64_t n, int64_t m)
{
    return (n * m) >> 32;
}

static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
{
    uint64_t lo, hi;
    muls64(&lo, &hi, n, m);
    return hi;
}

static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
{
    uint64_t lo, hi;
    mulu64(&lo, &hi, n, m);
    return hi;
}

DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)

DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)

DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)

DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)

DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)

/* Note that all bits of the shift are significant
   and not modulo the element size.  */
#define DO_ASR(N, M)  (N >> MIN(M, sizeof(N) * 8 - 1))
#define DO_LSR(N, M)  (M < sizeof(N) * 8 ? N >> M : 0)
#define DO_LSL(N, M)  (M < sizeof(N) * 8 ? N << M : 0)
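
/* E.g. (editor's illustration) for a uint64_t element DO_LSR(x, 64) is 0 and
 * DO_ASR(x, 64) clamps the shift count to 63, reproducing the sign fill,
 * whereas the raw C expression x >> 64 would be undefined behaviour.
 */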

DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1, DO_LSR)
DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1, DO_LSL)

DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1_2, DO_ASR)
DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_2, DO_LSL)

DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1_4, DO_ASR)
DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_4, DO_LSR)
DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)

DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)

#undef DO_ZPZZ
#undef DO_ZPZZ_D

/* Three-operand expander, controlled by a predicate, in which the
 * third operand is "wide".  That is, for D = N op M, the same 64-bit
 * value of M is used with all of the narrower values of N.
 */
#define DO_ZPZW(NAME, TYPE, TYPEW, H, OP)                               \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);                              \
    for (i = 0; i < opr_sz; ) {                                         \
        uint8_t pg = *(uint8_t *)(vg + H1(i >> 3));                     \
        TYPEW mm = *(TYPEW *)(vm + i);                                  \
        do {                                                            \
            if (pg & 1) {                                               \
                TYPE nn = *(TYPE *)(vn + H(i));                         \
                *(TYPE *)(vd + H(i)) = OP(nn, mm);                      \
            }                                                           \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
        } while (i & 7);                                                \
    }                                                                   \
}

DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)

DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)

DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)

#undef DO_ZPZW

/* Fully general two-operand expander, controlled by a predicate.
 */
#define DO_ZPZ(NAME, TYPE, H, OP)                               \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    for (i = 0; i < opr_sz; ) {                                 \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
        do {                                                    \
            if (pg & 1) {                                       \
                TYPE nn = *(TYPE *)(vn + H(i));                 \
                *(TYPE *)(vd + H(i)) = OP(nn);                  \
            }                                                   \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
        } while (i & 15);                                       \
    }                                                           \
}

/* Similarly, specialized for 64-bit operands.  */
#define DO_ZPZ_D(NAME, TYPE, OP)                                \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
    TYPE *d = vd, *n = vn;                                      \
    uint8_t *pg = vg;                                           \
    for (i = 0; i < opr_sz; i += 1) {                           \
        if (pg[H1(i)] & 1) {                                    \
            TYPE nn = n[i];                                     \
            d[i] = OP(nn);                                      \
        }                                                       \
    }                                                           \
}

#define DO_CLS_B(N)   (clrsb32(N) - 24)
#define DO_CLS_H(N)   (clrsb32(N) - 16)

DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)

#define DO_CLZ_B(N)   (clz32(N) - 24)
#define DO_CLZ_H(N)   (clz32(N) - 16)

DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
DO_ZPZ_D(sve_clz_d, uint64_t, clz64)

DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)

#define DO_CNOT(N)    (N == 0)

DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)

#define DO_FABS(N)    (N & ((__typeof(N))-1 >> 1))

DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)

#define DO_FNEG(N)    (N ^ ~((__typeof(N))-1 >> 1))

DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)

#define DO_NOT(N)    (~N)

DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)

#define DO_SXTB(N)    ((int8_t)N)
#define DO_SXTH(N)    ((int16_t)N)
#define DO_SXTS(N)    ((int32_t)N)
#define DO_UXTB(N)    ((uint8_t)N)
#define DO_UXTH(N)    ((uint16_t)N)
#define DO_UXTS(N)    ((uint32_t)N)

DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)

DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)

#define DO_ABS(N)    (N < 0 ? -N : N)

DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)

#define DO_NEG(N)    (-N)

DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)

DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)

DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)

DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)

DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)

/* Three-operand expander, unpredicated, in which the third operand is "wide".
 */
#define DO_ZZW(NAME, TYPE, TYPEW, H, OP)                       \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{                                                              \
    intptr_t i, opr_sz = simd_oprsz(desc);                     \
    for (i = 0; i < opr_sz; ) {                                \
        TYPEW mm = *(TYPEW *)(vm + i);                         \
        do {                                                   \
            TYPE nn = *(TYPE *)(vn + H(i));                    \
            *(TYPE *)(vd + H(i)) = OP(nn, mm);                 \
            i += sizeof(TYPE);                                 \
        } while (i & 7);                                       \
    }                                                          \
}

DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)

DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)

DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)

#undef DO_ZZW

#undef DO_CLS_B
#undef DO_CLS_H
#undef DO_CLZ_B
#undef DO_CLZ_H
#undef DO_CNOT
#undef DO_FABS
#undef DO_FNEG
#undef DO_ABS
#undef DO_NEG
#undef DO_ZPZ
#undef DO_ZPZ_D

/* Two-operand reduction expander, controlled by a predicate.
 * The difference between TYPERED and TYPERET has to do with
 * sign-extension.  E.g. for SMAX, TYPERED must be signed,
 * but TYPERET must be unsigned so that e.g. a 32-bit value
 * is not sign-extended to the ABI uint64_t return type.
 */
/* ??? If we were to vectorize this by hand the reduction ordering
 * would change.  For integer operands, this is perfectly fine.
 */
#define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc)   \
{                                                          \
    intptr_t i, opr_sz = simd_oprsz(desc);                 \
    TYPERED ret = INIT;                                    \
    for (i = 0; i < opr_sz; ) {                            \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));    \
        do {                                               \
            if (pg & 1) {                                  \
                TYPEELT nn = *(TYPEELT *)(vn + H(i));      \
                ret = OP(ret, nn);                         \
            }                                              \
            i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT);  \
        } while (i & 15);                                  \
    }                                                      \
    return (TYPERET)ret;                                   \
}
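
/* E.g. (editor's illustration) sve_smaxv_s reduces as int32_t but returns
 * uint32_t, so a result of -1 reaches the caller as 0x00000000ffffffff
 * rather than being sign-extended into the uint64_t ABI return type.
 */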

#define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP)             \
uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc)   \
{                                                          \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;             \
    TYPEE *n = vn;                                         \
    uint8_t *pg = vg;                                      \
    TYPER ret = INIT;                                      \
    for (i = 0; i < opr_sz; i += 1) {                      \
        if (pg[H1(i)] & 1) {                               \
            TYPEE nn = n[i];                               \
            ret = OP(ret, nn);                             \
        }                                                  \
    }                                                      \
    return ret;                                            \
}

DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)

DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)

DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)

DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)

DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)

DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)

DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)

DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)

DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)

#undef DO_VPZ
#undef DO_VPZ_D

/* Two vector operand, one scalar operand, unpredicated.  */
#define DO_ZZI(NAME, TYPE, OP)                                       \
void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc)   \
{                                                                    \
    intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE);            \
    TYPE s = s64, *d = vd, *n = vn;                                  \
    for (i = 0; i < opr_sz; ++i) {                                   \
        d[i] = OP(n[i], s);                                          \
    }                                                                \
}

#define DO_SUBR(X, Y)   (Y - X)

DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)

DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)

DO_ZZI(sve_smini_b, int8_t, DO_MIN)
DO_ZZI(sve_smini_h, int16_t, DO_MIN)
DO_ZZI(sve_smini_s, int32_t, DO_MIN)
DO_ZZI(sve_smini_d, int64_t, DO_MIN)

DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)

DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
DO_ZZI(sve_umini_d, uint64_t, DO_MIN)

#undef DO_ZZI

#undef DO_AND
#undef DO_ORR
#undef DO_EOR
#undef DO_BIC
#undef DO_ADD
#undef DO_SUB
#undef DO_MAX
#undef DO_MIN
#undef DO_ABD
#undef DO_MUL
#undef DO_DIV
#undef DO_ASR
#undef DO_LSR
#undef DO_LSL
#undef DO_SUBR

/* Similar to the ARM LastActiveElement pseudocode function, except the
   result is multiplied by the element size.  This includes the not found
   indication; e.g. not found for esz=3 is -8.  */
static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
{
    uint64_t mask = pred_esz_masks[esz];
    intptr_t i = words;

    do {
        uint64_t this_g = g[--i] & mask;
        if (this_g) {
            return i * 64 + (63 - clz64(this_g));
        }
    } while (i > 0);
    return (intptr_t)-1 << esz;
}
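
/* Worked example (editor's illustration): with words = 1, esz = 0 and a
 * guard word of 0x20, the result is 5 (element 5 times 1 byte); with no
 * bits set the loop falls through and returns -1 << 0 == -1, the
 * "not found" indication for byte elements.
 */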

uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t words)
{
    uint32_t flags = PREDTEST_INIT;
    uint64_t *d = vd, *g = vg;
    intptr_t i = 0;

    do {
        uint64_t this_d = d[i];
        uint64_t this_g = g[i];

        if (this_g) {
            if (!(flags & 4)) {
                /* Set in D the first bit of G.  */
                this_d |= this_g & -this_g;
                d[i] = this_d;
            }
            flags = iter_predtest_fwd(this_d, this_g, flags);
        }
    } while (++i < words);

    return flags;
}

uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
{
    intptr_t words = extract32(pred_desc, 0, SIMD_OPRSZ_BITS);
    intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
    uint32_t flags = PREDTEST_INIT;
    uint64_t *d = vd, *g = vg, esz_mask;
    intptr_t i, next;

    next = last_active_element(vd, words, esz) + (1 << esz);
    esz_mask = pred_esz_masks[esz];

    /* Similar to the pseudocode for pnext, but scaled by ESZ
       so that we find the correct bit.  */
    if (next < words * 64) {
        uint64_t mask = -1;

        if (next & 63) {
            mask = ~((1ull << (next & 63)) - 1);
            next &= -64;
        }
        do {
            uint64_t this_g = g[next / 64] & esz_mask & mask;
            if (this_g != 0) {
                next = (next & -64) + ctz64(this_g);
                break;
            }
            next += 64;
            mask = -1;
        } while (next < words * 64);
    }

    i = 0;
    do {
        uint64_t this_d = 0;
        if (i == next / 64) {
            this_d = 1ull << (next & 63);
        }
        d[i] = this_d;
        flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
    } while (++i < words);

    return flags;
}

/* Store zero into every active element of Zd.  We will use this for two
 * and three-operand predicated instructions for which logic dictates a
 * zero result.  In particular, logical shift by element size, which is
 * otherwise undefined on the host.
 *
 * For element sizes smaller than uint64_t, we use tables to expand
 * the N bits of the controlling predicate to a byte mask, and clear
 * those bytes.
 */
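/* E.g. (editor's illustration) in sve_clr_b below, a predicate byte of 0x05
 * clears bytes 0 and 2 of the corresponding chunk:
 * d[i] &= ~0x0000000000ff00ff.
 */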
void HELPER(sve_clr_b)(void *vd, void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd;
    uint8_t *pg = vg;
    for (i = 0; i < opr_sz; i += 1) {
        d[i] &= ~expand_pred_b(pg[H1(i)]);
    }
}

void HELPER(sve_clr_h)(void *vd, void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd;
    uint8_t *pg = vg;
    for (i = 0; i < opr_sz; i += 1) {
        d[i] &= ~expand_pred_h(pg[H1(i)]);
    }
}

void HELPER(sve_clr_s)(void *vd, void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd;
    uint8_t *pg = vg;
    for (i = 0; i < opr_sz; i += 1) {
        d[i] &= ~expand_pred_s(pg[H1(i)]);
    }
}

void HELPER(sve_clr_d)(void *vd, void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd;
    uint8_t *pg = vg;
    for (i = 0; i < opr_sz; i += 1) {
        if (pg[H1(i)] & 1) {
            d[i] = 0;
        }
    }
}

/* Copy Zn into Zd, and store zero into inactive elements.  */
void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = n[i] & expand_pred_b(pg[H1(i)]);
    }
}

void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = n[i] & expand_pred_h(pg[H1(i)]);
    }
}

void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = n[i] & expand_pred_s(pg[H1(i)]);
    }
}

void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = n[i] & -(uint64_t)(pg[H1(i)] & 1);
    }
}

/* Three-operand expander, immediate operand, controlled by a predicate.
 */
#define DO_ZPZI(NAME, TYPE, H, OP)                              \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    TYPE imm = simd_data(desc);                                 \
    for (i = 0; i < opr_sz; ) {                                 \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
        do {                                                    \
            if (pg & 1) {                                       \
                TYPE nn = *(TYPE *)(vn + H(i));                 \
                *(TYPE *)(vd + H(i)) = OP(nn, imm);             \
            }                                                   \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
        } while (i & 15);                                       \
    }                                                           \
}

/* Similarly, specialized for 64-bit operands.  */
#define DO_ZPZI_D(NAME, TYPE, OP)                               \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
    TYPE *d = vd, *n = vn;                                      \
    TYPE imm = simd_data(desc);                                 \
    uint8_t *pg = vg;                                           \
    for (i = 0; i < opr_sz; i += 1) {                           \
        if (pg[H1(i)] & 1) {                                    \
            TYPE nn = n[i];                                     \
            d[i] = OP(nn, imm);                                 \
        }                                                       \
    }                                                           \
}

#define DO_SHR(N, M)  (N >> M)
#define DO_SHL(N, M)  (N << M)

/* Arithmetic shift right for division.  This rounds negative numbers
   toward zero as per signed division.  Therefore before shifting,
   when N is negative, add 2**M-1.  */
#define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
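
/* Worked example (editor's illustration): DO_ASRD(-5, 1) computes
 * (-5 + 1) >> 1 == -2, rounding toward zero as signed division requires,
 * where a plain arithmetic shift would give -5 >> 1 == -3.
 */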
1091
1092DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
1093DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
1094DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
1095DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)
1096
1097DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
1098DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
1099DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
1100DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)
1101
1102DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
1103DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
1104DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
1105DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)
1106
1107DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
1108DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
1109DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
1110DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
1111
1112#undef DO_SHR
1113#undef DO_SHL
1114#undef DO_ASRD
1115#undef DO_ZPZI
1116#undef DO_ZPZI_D
1117
1118/* Fully general four-operand expander, controlled by a predicate.
1119 */
1120#define DO_ZPZZZ(NAME, TYPE, H, OP)                           \
1121void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,     \
1122                  void *vg, uint32_t desc)                    \
1123{                                                             \
1124    intptr_t i, opr_sz = simd_oprsz(desc);                    \
1125    for (i = 0; i < opr_sz; ) {                               \
1126        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));       \
1127        do {                                                  \
1128            if (pg & 1) {                                     \
1129                TYPE nn = *(TYPE *)(vn + H(i));               \
1130                TYPE mm = *(TYPE *)(vm + H(i));               \
1131                TYPE aa = *(TYPE *)(va + H(i));               \
1132                *(TYPE *)(vd + H(i)) = OP(aa, nn, mm);        \
1133            }                                                 \
1134            i += sizeof(TYPE), pg >>= sizeof(TYPE);           \
1135        } while (i & 15);                                     \
1136    }                                                         \
1137}
1138
1139/* Similarly, specialized for 64-bit operands.  */
1140#define DO_ZPZZZ_D(NAME, TYPE, OP)                            \
1141void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,     \
1142                  void *vg, uint32_t desc)                    \
1143{                                                             \
1144    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                \
1145    TYPE *d = vd, *a = va, *n = vn, *m = vm;                  \
1146    uint8_t *pg = vg;                                         \
1147    for (i = 0; i < opr_sz; i += 1) {                         \
1148        if (pg[H1(i)] & 1) {                                  \
1149            TYPE aa = a[i], nn = n[i], mm = m[i];             \
1150            d[i] = OP(aa, nn, mm);                            \
1151        }                                                     \
1152    }                                                         \
1153}
1154
1155#define DO_MLA(A, N, M)  (A + N * M)
1156#define DO_MLS(A, N, M)  (A - N * M)
1157
1158DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
1159DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)
1160
1161DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
1162DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)
1163
1164DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
1165DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)
1166
1167DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
1168DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
1169
1170#undef DO_MLA
1171#undef DO_MLS
1172#undef DO_ZPZZZ
1173#undef DO_ZPZZZ_D
1174
1175void HELPER(sve_index_b)(void *vd, uint32_t start,
1176                         uint32_t incr, uint32_t desc)
1177{
1178    intptr_t i, opr_sz = simd_oprsz(desc);
1179    uint8_t *d = vd;
1180    for (i = 0; i < opr_sz; i += 1) {
1181        d[H1(i)] = start + i * incr;
1182    }
1183}
1184
1185void HELPER(sve_index_h)(void *vd, uint32_t start,
1186                         uint32_t incr, uint32_t desc)
1187{
1188    intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1189    uint16_t *d = vd;
1190    for (i = 0; i < opr_sz; i += 1) {
1191        d[H2(i)] = start + i * incr;
1192    }
1193}
1194
1195void HELPER(sve_index_s)(void *vd, uint32_t start,
1196                         uint32_t incr, uint32_t desc)
1197{
1198    intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1199    uint32_t *d = vd;
1200    for (i = 0; i < opr_sz; i += 1) {
1201        d[H4(i)] = start + i * incr;
1202    }
1203}
1204
1205void HELPER(sve_index_d)(void *vd, uint64_t start,
1206                         uint64_t incr, uint32_t desc)
1207{
1208    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1209    uint64_t *d = vd;
1210    for (i = 0; i < opr_sz; i += 1) {
1211        d[i] = start + i * incr;
1212    }
1213}
1214
1215void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
1216{
1217    intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1218    uint32_t sh = simd_data(desc);
1219    uint32_t *d = vd, *n = vn, *m = vm;
1220    for (i = 0; i < opr_sz; i += 1) {
1221        d[i] = n[i] + (m[i] << sh);
1222    }
1223}
1224
1225void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
1226{
1227    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1228    uint64_t sh = simd_data(desc);
1229    uint64_t *d = vd, *n = vn, *m = vm;
1230    for (i = 0; i < opr_sz; i += 1) {
1231        d[i] = n[i] + (m[i] << sh);
1232    }
1233}
1234
1235void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
1236{
1237    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1238    uint64_t sh = simd_data(desc);
1239    uint64_t *d = vd, *n = vn, *m = vm;
1240    for (i = 0; i < opr_sz; i += 1) {
1241        d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
1242    }
1243}
1244
1245void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
1246{
1247    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1248    uint64_t sh = simd_data(desc);
1249    uint64_t *d = vd, *n = vn, *m = vm;
1250    for (i = 0; i < opr_sz; i += 1) {
1251        d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
1252    }
1253}
1254
1255void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
1256{
1257    /* These constants are cut-and-paste directly from the ARM pseudocode.  */
1258    static const uint16_t coeff[] = {
1259        0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
1260        0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
1261        0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
1262        0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
1263    };
1264    intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1265    uint16_t *d = vd, *n = vn;
1266
1267    for (i = 0; i < opr_sz; i++) {
1268        uint16_t nn = n[i];
1269        intptr_t idx = extract32(nn, 0, 5);
1270        uint16_t exp = extract32(nn, 5, 5);
1271        d[i] = coeff[idx] | (exp << 10);
1272    }
1273}
1274
1275void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
1276{
1277    /* These constants are cut-and-paste directly from the ARM pseudocode.  */
1278    static const uint32_t coeff[] = {
1279        0x000000, 0x0164d2, 0x02cd87, 0x043a29,
1280        0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
1281        0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
1282        0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
1283        0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
1284        0x1ef532, 0x20b051, 0x227043, 0x243516,
1285        0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
1286        0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
1287        0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
1288        0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
1289        0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
1290        0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
1291        0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
1292        0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
1293        0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
1294        0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
1295    };
1296    intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1297    uint32_t *d = vd, *n = vn;
1298
1299    for (i = 0; i < opr_sz; i++) {
1300        uint32_t nn = n[i];
1301        intptr_t idx = extract32(nn, 0, 6);
1302        uint32_t exp = extract32(nn, 6, 8);
1303        d[i] = coeff[idx] | (exp << 23);
1304    }
1305}
1306
1307void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
1308{
1309    /* These constants are cut-and-paste directly from the ARM pseudocode.  */
1310    static const uint64_t coeff[] = {
1311        0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
1312        0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
1313        0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
1314        0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
1315        0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
1316        0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
1317        0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
1318        0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
1319        0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
1320        0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
1321        0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
1322        0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
1323        0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
1324        0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
1325        0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
1326        0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
1327        0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
1328        0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
1329        0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
1330        0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
1331        0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
1332        0xFA7C1819E90D8ull,
1333    };
1334    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1335    uint64_t *d = vd, *n = vn;
1336
1337    for (i = 0; i < opr_sz; i++) {
1338        uint64_t nn = n[i];
1339        intptr_t idx = extract32(nn, 0, 6);
1340        uint64_t exp = extract32(nn, 6, 11);
1341        d[i] = coeff[idx] | (exp << 52);
1342    }
1343}
1344
1345void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
1346{
1347    intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1348    uint16_t *d = vd, *n = vn, *m = vm;
1349    for (i = 0; i < opr_sz; i += 1) {
1350        uint16_t nn = n[i];
1351        uint16_t mm = m[i];
1352        if (mm & 1) {
1353            nn = float16_one;
1354        }
1355        d[i] = nn ^ (mm & 2) << 14;
1356    }
1357}
1358
1359void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
1360{
1361    intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1362    uint32_t *d = vd, *n = vn, *m = vm;
1363    for (i = 0; i < opr_sz; i += 1) {
1364        uint32_t nn = n[i];
1365        uint32_t mm = m[i];
1366        if (mm & 1) {
1367            nn = float32_one;
1368        }
1369        d[i] = nn ^ (mm & 2) << 30;
1370    }
1371}
1372
1373void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
1374{
1375    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1376    uint64_t *d = vd, *n = vn, *m = vm;
1377    for (i = 0; i < opr_sz; i += 1) {
1378        uint64_t nn = n[i];
1379        uint64_t mm = m[i];
1380        if (mm & 1) {
1381            nn = float64_one;
1382        }
1383        d[i] = nn ^ (mm & 2) << 62;
1384    }
1385}
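
/* Illustrative note: FTSSEL treats the low two bits of the second
 * operand as control.  Bit 0 selects 1.0 in place of the first operand,
 * and bit 1 is shifted up to the sign position and XORed in, negating
 * the selected value.  E.g. for float64, mm = 3 yields
 * float64_one ^ (1ull << 63), i.e. -1.0.
 */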
1386
1387/*
1388 * Signed saturating addition with scalar operand.
1389 */
1390
1391void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
1392{
1393    intptr_t i, oprsz = simd_oprsz(desc);
1394
1395    for (i = 0; i < oprsz; i += sizeof(int8_t)) {
1396        int r = *(int8_t *)(a + i) + b;
1397        if (r > INT8_MAX) {
1398            r = INT8_MAX;
1399        } else if (r < INT8_MIN) {
1400            r = INT8_MIN;
1401        }
1402        *(int8_t *)(d + i) = r;
1403    }
1404}
1405
1406void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
1407{
1408    intptr_t i, oprsz = simd_oprsz(desc);
1409
1410    for (i = 0; i < oprsz; i += sizeof(int16_t)) {
1411        int r = *(int16_t *)(a + i) + b;
1412        if (r > INT16_MAX) {
1413            r = INT16_MAX;
1414        } else if (r < INT16_MIN) {
1415            r = INT16_MIN;
1416        }
1417        *(int16_t *)(d + i) = r;
1418    }
1419}
1420
1421void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
1422{
1423    intptr_t i, oprsz = simd_oprsz(desc);
1424
1425    for (i = 0; i < oprsz; i += sizeof(int32_t)) {
1426        int64_t r = *(int32_t *)(a + i) + b;
1427        if (r > INT32_MAX) {
1428            r = INT32_MAX;
1429        } else if (r < INT32_MIN) {
1430            r = INT32_MIN;
1431        }
1432        *(int32_t *)(d + i) = r;
1433    }
1434}
1435
1436void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
1437{
1438    intptr_t i, oprsz = simd_oprsz(desc);
1439
1440    for (i = 0; i < oprsz; i += sizeof(int64_t)) {
1441        int64_t ai = *(int64_t *)(a + i);
1442        int64_t r = ai + b;
1443        if (((r ^ ai) & ~(ai ^ b)) < 0) {
1444            /* Signed overflow.  */
1445            r = (r < 0 ? INT64_MAX : INT64_MIN);
1446        }
1447        *(int64_t *)(d + i) = r;
1448    }
1449}
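
/* A minimal sketch of the signed-overflow test used above: a sum can
 * only overflow when both addends have the same sign, and it did
 * overflow exactly when the result's sign differs from theirs, so the
 * top bit of (r ^ a) & ~(a ^ b) flags the wrap.
 */
static inline bool sadd64_overflowed_sketch(int64_t a, int64_t b, int64_t r)
{
    return ((r ^ a) & ~(a ^ b)) < 0;
}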
1450
1451/*
1452 * Unsigned saturating addition with scalar operand.
1453 */
1454
1455void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
1456{
1457    intptr_t i, oprsz = simd_oprsz(desc);
1458
1459    for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
1460        int r = *(uint8_t *)(a + i) + b;
1461        if (r > UINT8_MAX) {
1462            r = UINT8_MAX;
1463        } else if (r < 0) {
1464            r = 0;
1465        }
1466        *(uint8_t *)(d + i) = r;
1467    }
1468}
1469
1470void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
1471{
1472    intptr_t i, oprsz = simd_oprsz(desc);
1473
1474    for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
1475        int r = *(uint16_t *)(a + i) + b;
1476        if (r > UINT16_MAX) {
1477            r = UINT16_MAX;
1478        } else if (r < 0) {
1479            r = 0;
1480        }
1481        *(uint16_t *)(d + i) = r;
1482    }
1483}
1484
1485void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
1486{
1487    intptr_t i, oprsz = simd_oprsz(desc);
1488
1489    for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
1490        int64_t r = *(uint32_t *)(a + i) + b;
1491        if (r > UINT32_MAX) {
1492            r = UINT32_MAX;
1493        } else if (r < 0) {
1494            r = 0;
1495        }
1496        *(uint32_t *)(d + i) = r;
1497    }
1498}
1499
1500void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
1501{
1502    intptr_t i, oprsz = simd_oprsz(desc);
1503
1504    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1505        uint64_t r = *(uint64_t *)(a + i) + b;
1506        if (r < b) {
1507            r = UINT64_MAX;
1508        }
1509        *(uint64_t *)(d + i) = r;
1510    }
1511}
1512
1513void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
1514{
1515    intptr_t i, oprsz = simd_oprsz(desc);
1516
1517    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1518        uint64_t ai = *(uint64_t *)(a + i);
1519        *(uint64_t *)(d + i) = (ai < b ? 0 : ai - b);
1520    }
1521}
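
/* Sketch of the unsigned saturation idioms above: an addition wrapped
 * iff the result is smaller than either addend, so clamp to UINT64_MAX;
 * a subtraction would wrap iff the subtrahend is larger, so clamp to 0.
 */
static inline uint64_t uadd64_sat_sketch(uint64_t a, uint64_t b)
{
    uint64_t r = a + b;
    return r < b ? UINT64_MAX : r;  /* carry out -> saturate */
}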
1522
1523/* Two operand predicated copy immediate with merge.  All valid immediates
1524 * can fit within 17 signed bits in the simd_data field.
1525 */
1526void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
1527                         uint64_t mm, uint32_t desc)
1528{
1529    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1530    uint64_t *d = vd, *n = vn;
1531    uint8_t *pg = vg;
1532
1533    mm = dup_const(MO_8, mm);
1534    for (i = 0; i < opr_sz; i += 1) {
1535        uint64_t nn = n[i];
1536        uint64_t pp = expand_pred_b(pg[H1(i)]);
1537        d[i] = (mm & pp) | (nn & ~pp);
1538    }
1539}
1540
1541void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
1542                         uint64_t mm, uint32_t desc)
1543{
1544    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1545    uint64_t *d = vd, *n = vn;
1546    uint8_t *pg = vg;
1547
1548    mm = dup_const(MO_16, mm);
1549    for (i = 0; i < opr_sz; i += 1) {
1550        uint64_t nn = n[i];
1551        uint64_t pp = expand_pred_h(pg[H1(i)]);
1552        d[i] = (mm & pp) | (nn & ~pp);
1553    }
1554}
1555
1556void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
1557                         uint64_t mm, uint32_t desc)
1558{
1559    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1560    uint64_t *d = vd, *n = vn;
1561    uint8_t *pg = vg;
1562
1563    mm = dup_const(MO_32, mm);
1564    for (i = 0; i < opr_sz; i += 1) {
1565        uint64_t nn = n[i];
1566        uint64_t pp = expand_pred_s(pg[H1(i)]);
1567        d[i] = (mm & pp) | (nn & ~pp);
1568    }
1569}
1570
1571void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
1572                         uint64_t mm, uint32_t desc)
1573{
1574    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1575    uint64_t *d = vd, *n = vn;
1576    uint8_t *pg = vg;
1577
1578    for (i = 0; i < opr_sz; i += 1) {
1579        uint64_t nn = n[i];
1580        d[i] = (pg[H1(i)] & 1 ? mm : nn);
1581    }
1582}
1583
1584void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
1585{
1586    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1587    uint64_t *d = vd;
1588    uint8_t *pg = vg;
1589
1590    val = dup_const(MO_8, val);
1591    for (i = 0; i < opr_sz; i += 1) {
1592        d[i] = val & expand_pred_b(pg[H1(i)]);
1593    }
1594}
1595
1596void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
1597{
1598    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1599    uint64_t *d = vd;
1600    uint8_t *pg = vg;
1601
1602    val = dup_const(MO_16, val);
1603    for (i = 0; i < opr_sz; i += 1) {
1604        d[i] = val & expand_pred_h(pg[H1(i)]);
1605    }
1606}
1607
1608void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
1609{
1610    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1611    uint64_t *d = vd;
1612    uint8_t *pg = vg;
1613
1614    val = dup_const(MO_32, val);
1615    for (i = 0; i < opr_sz; i += 1) {
1616        d[i] = val & expand_pred_s(pg[H1(i)]);
1617    }
1618}
1619
1620void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
1621{
1622    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1623    uint64_t *d = vd;
1624    uint8_t *pg = vg;
1625
1626    for (i = 0; i < opr_sz; i += 1) {
1627        d[i] = (pg[H1(i)] & 1 ? val : 0);
1628    }
1629}
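
/* Hedged sketch of what expand_pred_b computes (the actual helper,
 * defined earlier in this file, may use a precomputed table): each of
 * the 8 predicate bits is widened to a full byte of a 64-bit mask, so
 * that (mm & pp) | (nn & ~pp) performs a branch-free per-byte select.
 */
static inline uint64_t expand_pred_b_sketch(uint8_t byte)
{
    uint64_t r = 0;
    int i;

    for (i = 0; i < 8; i++) {
        if (byte & (1 << i)) {
            r |= 0xffull << (i * 8);
        }
    }
    return r;
}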
1630
1631/* Big-endian hosts need to frob the byte indices.  If the copy
1632 * happens to be 8-byte aligned, then no frobbing is necessary.
1633 */
1634static void swap_memmove(void *vd, void *vs, size_t n)
1635{
1636    uintptr_t d = (uintptr_t)vd;
1637    uintptr_t s = (uintptr_t)vs;
1638    uintptr_t o = (d | s | n) & 7;
1639    size_t i;
1640
1641#ifndef HOST_WORDS_BIGENDIAN
1642    o = 0;
1643#endif
1644    switch (o) {
1645    case 0:
1646        memmove(vd, vs, n);
1647        break;
1648
1649    case 4:
1650        if (d < s || d >= s + n) {
1651            for (i = 0; i < n; i += 4) {
1652                *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
1653            }
1654        } else {
1655            for (i = n; i > 0; ) {
1656                i -= 4;
1657                *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
1658            }
1659        }
1660        break;
1661
1662    case 2:
1663    case 6:
1664        if (d < s || d >= s + n) {
1665            for (i = 0; i < n; i += 2) {
1666                *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
1667            }
1668        } else {
1669            for (i = n; i > 0; ) {
1670                i -= 2;
1671                *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
1672            }
1673        }
1674        break;
1675
1676    default:
1677        if (d < s || d >= s + n) {
1678            for (i = 0; i < n; i++) {
1679                *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
1680            }
1681        } else {
1682            for (i = n; i > 0; ) {
1683                i -= 1;
1684                *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
1685            }
1686        }
1687        break;
1688    }
1689}
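
/* Worked example of the fixup above: on a big-endian host, vector byte
 * 3 of a 64-bit chunk lives at host offset H1(3) = 3 ^ 7 = 4, and the
 * 4-byte unit at vector offset 4 lives at H1_4(4) = 4 ^ 4 = 0.  A copy
 * whose addresses and length are all multiples of 8 moves whole chunks
 * and therefore needs no adjustment (case 0 above).
 */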
1690
1691void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
1692{
1693    intptr_t opr_sz = simd_oprsz(desc);
1694    size_t n_ofs = simd_data(desc);
1695    size_t n_siz = opr_sz - n_ofs;
1696
1697    if (vd != vm) {
1698        swap_memmove(vd, vn + n_ofs, n_siz);
1699        swap_memmove(vd + n_siz, vm, n_ofs);
1700    } else if (vd != vn) {
1701        swap_memmove(vd + n_siz, vd, n_ofs);
1702        swap_memmove(vd, vn + n_ofs, n_siz);
1703    } else {
1704        /* vd == vn == vm.  Need temp space.  */
1705        ARMVectorReg tmp;
1706        swap_memmove(&tmp, vm, n_ofs);
1707        swap_memmove(vd, vd + n_ofs, n_siz);
1708        memcpy(vd + n_siz, &tmp, n_ofs);
1709    }
1710}
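
/* Usage note: EXT extracts oprsz bytes from the concatenation Zm:Zn,
 * starting n_ofs bytes into Zn.  E.g. with a 16-byte vector and
 * n_ofs = 3, bytes 3..15 of Zn land in d[0..12] and bytes 0..2 of Zm
 * fill d[13..15]; the three cases above differ only in how they avoid
 * clobbering an input that aliases the output.
 */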
1711
1712#define DO_INSR(NAME, TYPE, H) \
1713void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
1714{                                                                  \
1715    intptr_t opr_sz = simd_oprsz(desc);                            \
1716    swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE));    \
1717    *(TYPE *)(vd + H(0)) = val;                                    \
1718}
1719
1720DO_INSR(sve_insr_b, uint8_t, H1)
1721DO_INSR(sve_insr_h, uint16_t, H1_2)
1722DO_INSR(sve_insr_s, uint32_t, H1_4)
1723DO_INSR(sve_insr_d, uint64_t, )
1724
1725#undef DO_INSR
1726
1727void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
1728{
1729    intptr_t i, j, opr_sz = simd_oprsz(desc);
1730    for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1731        uint64_t f = *(uint64_t *)(vn + i);
1732        uint64_t b = *(uint64_t *)(vn + j);
1733        *(uint64_t *)(vd + i) = bswap64(b);
1734        *(uint64_t *)(vd + j) = bswap64(f);
1735    }
1736}
1737
1738void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
1739{
1740    intptr_t i, j, opr_sz = simd_oprsz(desc);
1741    for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1742        uint64_t f = *(uint64_t *)(vn + i);
1743        uint64_t b = *(uint64_t *)(vn + j);
1744        *(uint64_t *)(vd + i) = hswap64(b);
1745        *(uint64_t *)(vd + j) = hswap64(f);
1746    }
1747}
1748
1749void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
1750{
1751    intptr_t i, j, opr_sz = simd_oprsz(desc);
1752    for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1753        uint64_t f = *(uint64_t *)(vn + i);
1754        uint64_t b = *(uint64_t *)(vn + j);
1755        *(uint64_t *)(vd + i) = rol64(b, 32);
1756        *(uint64_t *)(vd + j) = rol64(f, 32);
1757    }
1758}
1759
1760void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
1761{
1762    intptr_t i, j, opr_sz = simd_oprsz(desc);
1763    for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1764        uint64_t f = *(uint64_t *)(vn + i);
1765        uint64_t b = *(uint64_t *)(vn + j);
1766        *(uint64_t *)(vd + i) = b;
1767        *(uint64_t *)(vd + j) = f;
1768    }
1769}
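
/* The reversals above compose a chunk swap with an in-chunk swap:
 * bswap64 for bytes, hswap64 for halfwords, rol64(x, 32) for words,
 * and nothing extra for doublewords.  A minimal sketch of what the
 * hswap64 helper is assumed to compute:
 */
static inline uint64_t hswap64_sketch(uint64_t h)
{
    uint64_t m = 0x0000ffff0000ffffull;

    h = rol64(h, 32);                          /* swap the two words */
    return ((h & m) << 16) | ((h >> 16) & m);  /* swap within each word */
}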
1770
1771#define DO_TBL(NAME, TYPE, H) \
1772void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1773{                                                              \
1774    intptr_t i, opr_sz = simd_oprsz(desc);                     \
1775    uintptr_t elem = opr_sz / sizeof(TYPE);                    \
1776    TYPE *d = vd, *n = vn, *m = vm;                            \
1777    ARMVectorReg tmp;                                          \
1778    if (unlikely(vd == vn)) {                                  \
1779        n = memcpy(&tmp, vn, opr_sz);                          \
1780    }                                                          \
1781    for (i = 0; i < elem; i++) {                               \
1782        TYPE j = m[H(i)];                                      \
1783        d[H(i)] = j < elem ? n[H(j)] : 0;                      \
1784    }                                                          \
1785}
1786
1787DO_TBL(sve_tbl_b, uint8_t, H1)
1788DO_TBL(sve_tbl_h, uint16_t, H2)
1789DO_TBL(sve_tbl_s, uint32_t, H4)
1790DO_TBL(sve_tbl_d, uint64_t, )
1791
1792#undef DO_TBL
1793
1794#define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
1795void HELPER(NAME)(void *vd, void *vn, uint32_t desc)           \
1796{                                                              \
1797    intptr_t i, opr_sz = simd_oprsz(desc);                     \
1798    TYPED *d = vd;                                             \
1799    TYPES *n = vn;                                             \
1800    ARMVectorReg tmp;                                          \
1801    if (unlikely(vn - vd < opr_sz)) {                          \
1802        n = memcpy(&tmp, n, opr_sz / 2);                       \
1803    }                                                          \
1804    for (i = 0; i < opr_sz / sizeof(TYPED); i++) {             \
1805        d[HD(i)] = n[HS(i)];                                   \
1806    }                                                          \
1807}
1808
1809DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
1810DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
1811DO_UNPK(sve_sunpk_d, int64_t, int32_t, , H4)
1812
1813DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
1814DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
1815DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, , H4)
1816
1817#undef DO_UNPK
1818
1819/* Mask of bits included in the even numbered predicates of width esz.
1820 * We also use this for expand_bits/compress_bits, and so extend the
1821 * same pattern out to 16-bit units.
1822 */
1823static const uint64_t even_bit_esz_masks[5] = {
1824    0x5555555555555555ull,
1825    0x3333333333333333ull,
1826    0x0f0f0f0f0f0f0f0full,
1827    0x00ff00ff00ff00ffull,
1828    0x0000ffff0000ffffull,
1829};
1830
1831/* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
1832 * For N==0, this corresponds to the operation that in qemu/bitops.h
1833 * we call half_shuffle64; this algorithm is from Hacker's Delight,
1834 * section 7-2 Shuffling Bits.
1835 */
1836static uint64_t expand_bits(uint64_t x, int n)
1837{
1838    int i;
1839
1840    x &= 0xffffffffu;
1841    for (i = 4; i >= n; i--) {
1842        int sh = 1 << i;
1843        x = ((x << sh) | x) & even_bit_esz_masks[i];
1844    }
1845    return x;
1846}
1847
1848/* Compress units of 2**(N+1) bits to units of 2**N bits.
1849 * For N==0, this corresponds to the operation that in qemu/bitops.h
1850 * we call half_unshuffle64; this algorithm is from Hacker's Delight,
1851 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
1852 */
1853static uint64_t compress_bits(uint64_t x, int n)
1854{
1855    int i;
1856
1857    for (i = n; i <= 4; i++) {
1858        int sh = 1 << i;
1859        x &= even_bit_esz_masks[i];
1860        x = (x >> sh) | x;
1861    }
1862    return x & 0xffffffffu;
1863}
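
/* Worked example: expand_bits(0b1011, 0) interleaves zeros between the
 * source bits, giving 0b1000101 (bit i moves to bit 2*i), and
 * compress_bits(0b1000101, 0) recovers 0b1011.  For n > 0 the same
 * shuffle moves units of 2**n bits rather than single bits.
 */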
1864
1865void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1866{
1867    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1868    int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1869    intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
1870    uint64_t *d = vd;
1871    intptr_t i;
1872
1873    if (oprsz <= 8) {
1874        uint64_t nn = *(uint64_t *)vn;
1875        uint64_t mm = *(uint64_t *)vm;
1876        int half = 4 * oprsz;
1877
1878        nn = extract64(nn, high * half, half);
1879        mm = extract64(mm, high * half, half);
1880        nn = expand_bits(nn, esz);
1881        mm = expand_bits(mm, esz);
1882        d[0] = nn + (mm << (1 << esz));
1883    } else {
1884        ARMPredicateReg tmp_n, tmp_m;
1885
1886        /* We produce output faster than we consume input.
1887           Therefore we must be mindful of possible overlap.  */
1888        if ((vn - vd) < (uintptr_t)oprsz) {
1889            vn = memcpy(&tmp_n, vn, oprsz);
1890        }
1891        if ((vm - vd) < (uintptr_t)oprsz) {
1892            vm = memcpy(&tmp_m, vm, oprsz);
1893        }
1894        if (high) {
1895            high = oprsz >> 1;
1896        }
1897
1898        if ((high & 3) == 0) {
1899            uint32_t *n = vn, *m = vm;
1900            high >>= 2;
1901
1902            for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
1903                uint64_t nn = n[H4(high + i)];
1904                uint64_t mm = m[H4(high + i)];
1905
1906                nn = expand_bits(nn, esz);
1907                mm = expand_bits(mm, esz);
1908                d[i] = nn + (mm << (1 << esz));
1909            }
1910        } else {
1911            uint8_t *n = vn, *m = vm;
1912            uint16_t *d16 = vd;
1913
1914            for (i = 0; i < oprsz / 2; i++) {
1915                uint16_t nn = n[H1(high + i)];
1916                uint16_t mm = m[H1(high + i)];
1917
1918                nn = expand_bits(nn, esz);
1919                mm = expand_bits(mm, esz);
1920                d16[H2(i)] = nn + (mm << (1 << esz));
1921            }
1922        }
1923    }
1924}
1925
1926void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1927{
1928    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1929    int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1930    int odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1) << esz;
1931    uint64_t *d = vd, *n = vn, *m = vm;
1932    uint64_t l, h;
1933    intptr_t i;
1934
1935    if (oprsz <= 8) {
1936        l = compress_bits(n[0] >> odd, esz);
1937        h = compress_bits(m[0] >> odd, esz);
1938        d[0] = extract64(l + (h << (4 * oprsz)), 0, 8 * oprsz);
1939    } else {
1940        ARMPredicateReg tmp_m;
1941        intptr_t oprsz_16 = oprsz / 16;
1942
1943        if ((vm - vd) < (uintptr_t)oprsz) {
1944            m = memcpy(&tmp_m, vm, oprsz);
1945        }
1946
1947        for (i = 0; i < oprsz_16; i++) {
1948            l = n[2 * i + 0];
1949            h = n[2 * i + 1];
1950            l = compress_bits(l >> odd, esz);
1951            h = compress_bits(h >> odd, esz);
1952            d[i] = l + (h << 32);
1953        }
1954
1955        /* For VL which is not a power of 2, the results from M do not
1956           align nicely with the uint64_t for D.  Put the aligned results
1957           from M into TMP_M and then copy it into place afterward.  */
1958        if (oprsz & 15) {
1959            d[i] = compress_bits(n[2 * i] >> odd, esz);
1960
1961            for (i = 0; i < oprsz_16; i++) {
1962                l = m[2 * i + 0];
1963                h = m[2 * i + 1];
1964                l = compress_bits(l >> odd, esz);
1965                h = compress_bits(h >> odd, esz);
1966                tmp_m.p[i] = l + (h << 32);
1967            }
1968            tmp_m.p[i] = compress_bits(m[2 * i] >> odd, esz);
1969
1970            swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
1971        } else {
1972            for (i = 0; i < oprsz_16; i++) {
1973                l = m[2 * i + 0];
1974                h = m[2 * i + 1];
1975                l = compress_bits(l >> odd, esz);
1976                h = compress_bits(h >> odd, esz);
1977                d[oprsz_16 + i] = l + (h << 32);
1978            }
1979        }
1980    }
1981}
1982
1983void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1984{
1985    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1986    uintptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1987    bool odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
1988    uint64_t *d = vd, *n = vn, *m = vm;
1989    uint64_t mask;
1990    int shr, shl;
1991    intptr_t i;
1992
1993    shl = 1 << esz;
1994    shr = 0;
1995    mask = even_bit_esz_masks[esz];
1996    if (odd) {
1997        mask <<= shl;
1998        shr = shl;
1999        shl = 0;
2000    }
2001
2002    for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
2003        uint64_t nn = (n[i] & mask) >> shr;
2004        uint64_t mm = (m[i] & mask) << shl;
2005        d[i] = nn + mm;
2006    }
2007}
2008
2009/* Reverse units of 2**N bits.  */
2010static uint64_t reverse_bits_64(uint64_t x, int n)
2011{
2012    int i, sh;
2013
2014    x = bswap64(x);
2015    for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
2016        uint64_t mask = even_bit_esz_masks[i];
2017        x = ((x & mask) << sh) | ((x >> sh) & mask);
2018    }
2019    return x;
2020}
2021
2022static uint8_t reverse_bits_8(uint8_t x, int n)
2023{
2024    static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
2025    int i, sh;
2026
2027    for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
2028        x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
2029    }
2030    return x;
2031}
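
/* Worked example: reverse_bits_8(0x06, 0) = 0x60, i.e. 0b00000110
 * fully bit-reversed.  For larger n the loop stops early, reversing
 * the order of the 2**n-bit groups while leaving each group's internal
 * bits intact.
 */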
2032
2033void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
2034{
2035    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2036    int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2037    intptr_t i, oprsz_2 = oprsz / 2;
2038
2039    if (oprsz <= 8) {
2040        uint64_t l = *(uint64_t *)vn;
2041        l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
2042        *(uint64_t *)vd = l;
2043    } else if ((oprsz & 15) == 0) {
2044        for (i = 0; i < oprsz_2; i += 8) {
2045            intptr_t ih = oprsz - 8 - i;
2046            uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
2047            uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
2048            *(uint64_t *)(vd + i) = h;
2049            *(uint64_t *)(vd + ih) = l;
2050        }
2051    } else {
2052        for (i = 0; i < oprsz_2; i += 1) {
2053            intptr_t il = H1(i);
2054            intptr_t ih = H1(oprsz - 1 - i);
2055            uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
2056            uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
2057            *(uint8_t *)(vd + il) = h;
2058            *(uint8_t *)(vd + ih) = l;
2059        }
2060    }
2061}
2062
2063void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
2064{
2065    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2066    intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
2067    uint64_t *d = vd;
2068    intptr_t i;
2069
2070    if (oprsz <= 8) {
2071        uint64_t nn = *(uint64_t *)vn;
2072        int half = 4 * oprsz;
2073
2074        nn = extract64(nn, high * half, half);
2075        nn = expand_bits(nn, 0);
2076        d[0] = nn;
2077    } else {
2078        ARMPredicateReg tmp_n;
2079
2080        /* We produce output faster than we consume input.
2081           Therefore we must be mindful of possible overlap.  */
2082        if ((vn - vd) < (uintptr_t)oprsz) {
2083            vn = memcpy(&tmp_n, vn, oprsz);
2084        }
2085        if (high) {
2086            high = oprsz >> 1;
2087        }
2088
2089        if ((high & 3) == 0) {
2090            uint32_t *n = vn;
2091            high >>= 2;
2092
2093            for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
2094                uint64_t nn = n[H4(high + i)];
2095                d[i] = expand_bits(nn, 0);
2096            }
2097        } else {
2098            uint16_t *d16 = vd;
2099            uint8_t *n = vn;
2100
2101            for (i = 0; i < oprsz / 2; i++) {
2102                uint16_t nn = n[H1(high + i)];
2103                d16[H2(i)] = expand_bits(nn, 0);
2104            }
2105        }
2106    }
2107}
2108
2109#define DO_ZIP(NAME, TYPE, H) \
2110void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)       \
2111{                                                                    \
2112    intptr_t oprsz = simd_oprsz(desc);                               \
2113    intptr_t i, oprsz_2 = oprsz / 2;                                 \
2114    ARMVectorReg tmp_n, tmp_m;                                       \
2115    /* We produce output faster than we consume input.               \
2116       Therefore we must be mindful of possible overlap.  */         \
2117    if (unlikely((vn - vd) < (uintptr_t)oprsz)) {                    \
2118        vn = memcpy(&tmp_n, vn, oprsz_2);                            \
2119    }                                                                \
2120    if (unlikely((vm - vd) < (uintptr_t)oprsz)) {                    \
2121        vm = memcpy(&tmp_m, vm, oprsz_2);                            \
2122    }                                                                \
2123    for (i = 0; i < oprsz_2; i += sizeof(TYPE)) {                    \
2124        *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + H(i));         \
2125        *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = *(TYPE *)(vm + H(i)); \
2126    }                                                                \
2127}
2128
2129DO_ZIP(sve_zip_b, uint8_t, H1)
2130DO_ZIP(sve_zip_h, uint16_t, H1_2)
2131DO_ZIP(sve_zip_s, uint32_t, H1_4)
2132DO_ZIP(sve_zip_d, uint64_t, )
2133
2134#define DO_UZP(NAME, TYPE, H) \
2135void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)         \
2136{                                                                      \
2137    intptr_t oprsz = simd_oprsz(desc);                                 \
2138    intptr_t oprsz_2 = oprsz / 2;                                      \
2139    intptr_t odd_ofs = simd_data(desc);                                \
2140    intptr_t i;                                                        \
2141    ARMVectorReg tmp_m;                                                \
2142    if (unlikely((vm - vd) < (uintptr_t)oprsz)) {                      \
2143        vm = memcpy(&tmp_m, vm, oprsz);                                \
2144    }                                                                  \
2145    for (i = 0; i < oprsz_2; i += sizeof(TYPE)) {                      \
2146        *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(2 * i + odd_ofs));     \
2147    }                                                                  \
2148    for (i = 0; i < oprsz_2; i += sizeof(TYPE)) {                      \
2149        *(TYPE *)(vd + H(oprsz_2 + i)) = *(TYPE *)(vm + H(2 * i + odd_ofs)); \
2150    }                                                                  \
2151}
2152
2153DO_UZP(sve_uzp_b, uint8_t, H1)
2154DO_UZP(sve_uzp_h, uint16_t, H1_2)
2155DO_UZP(sve_uzp_s, uint32_t, H1_4)
2156DO_UZP(sve_uzp_d, uint64_t, )
2157
2158#define DO_TRN(NAME, TYPE, H) \
2159void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)         \
2160{                                                                      \
2161    intptr_t oprsz = simd_oprsz(desc);                                 \
2162    intptr_t odd_ofs = simd_data(desc);                                \
2163    intptr_t i;                                                        \
2164    for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) {                    \
2165        TYPE ae = *(TYPE *)(vn + H(i + odd_ofs));                      \
2166        TYPE be = *(TYPE *)(vm + H(i + odd_ofs));                      \
2167        *(TYPE *)(vd + H(i + 0)) = ae;                                 \
2168        *(TYPE *)(vd + H(i + sizeof(TYPE))) = be;                      \
2169    }                                                                  \
2170}
2171
2172DO_TRN(sve_trn_b, uint8_t, H1)
2173DO_TRN(sve_trn_h, uint16_t, H1_2)
2174DO_TRN(sve_trn_s, uint32_t, H1_4)
2175DO_TRN(sve_trn_d, uint64_t, )
2176
2177#undef DO_ZIP
2178#undef DO_UZP
2179#undef DO_TRN
2180
2181void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
2182{
2183    intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
2184    uint32_t *d = vd, *n = vn;
2185    uint8_t *pg = vg;
2186
2187    for (i = j = 0; i < opr_sz; i++) {
2188        if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
2189            d[H4(j)] = n[H4(i)];
2190            j++;
2191        }
2192    }
2193    for (; j < opr_sz; j++) {
2194        d[H4(j)] = 0;
2195    }
2196}
2197
2198void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
2199{
2200    intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
2201    uint64_t *d = vd, *n = vn;
2202    uint8_t *pg = vg;
2203
2204    for (i = j = 0; i < opr_sz; i++) {
2205        if (pg[H1(i)] & 1) {
2206            d[j] = n[i];
2207            j++;
2208        }
2209    }
2210    for (; j < opr_sz; j++) {
2211        d[j] = 0;
2212    }
2213}
2214
2215/* Similar to the ARM LastActiveElement pseudocode function, except the
2216 * result is multiplied by the element size.  This includes the not-found
2217 * indication; e.g. not found for esz=3 is -8.
2218 */
2219int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
2220{
2221    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2222    intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2223
2224    return last_active_element(vg, DIV_ROUND_UP(oprsz, 8), esz);
2225}
2226
2227void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
2228{
2229    intptr_t opr_sz = simd_oprsz(desc) / 8;
2230    int esz = simd_data(desc);
2231    uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
2232    intptr_t i, first_i, last_i;
2233    ARMVectorReg tmp;
2234
2235    first_i = last_i = 0;
2236    first_g = last_g = 0;
2237
2238    /* Find the extent of the active elements within VG.  */
2239    for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
2240        pg = *(uint64_t *)(vg + i) & mask;
2241        if (pg) {
2242            if (last_g == 0) {
2243                last_g = pg;
2244                last_i = i;
2245            }
2246            first_g = pg;
2247            first_i = i;
2248        }
2249    }
2250
2251    len = 0;
2252    if (first_g != 0) {
2253        first_i = first_i * 8 + ctz64(first_g);
2254        last_i = last_i * 8 + 63 - clz64(last_g);
2255        len = last_i - first_i + (1 << esz);
2256        if (vd == vm) {
2257            vm = memcpy(&tmp, vm, opr_sz * 8);
2258        }
2259        swap_memmove(vd, vn + first_i, len);
2260    }
2261    swap_memmove(vd + len, vm, opr_sz * 8 - len);
2262}
2263
2264void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
2265                            void *vg, uint32_t desc)
2266{
2267    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2268    uint64_t *d = vd, *n = vn, *m = vm;
2269    uint8_t *pg = vg;
2270
2271    for (i = 0; i < opr_sz; i += 1) {
2272        uint64_t nn = n[i], mm = m[i];
2273        uint64_t pp = expand_pred_b(pg[H1(i)]);
2274        d[i] = (nn & pp) | (mm & ~pp);
2275    }
2276}
2277
2278void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
2279                            void *vg, uint32_t desc)
2280{
2281    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2282    uint64_t *d = vd, *n = vn, *m = vm;
2283    uint8_t *pg = vg;
2284
2285    for (i = 0; i < opr_sz; i += 1) {
2286        uint64_t nn = n[i], mm = m[i];
2287        uint64_t pp = expand_pred_h(pg[H1(i)]);
2288        d[i] = (nn & pp) | (mm & ~pp);
2289    }
2290}
2291
2292void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
2293                            void *vg, uint32_t desc)
2294{
2295    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2296    uint64_t *d = vd, *n = vn, *m = vm;
2297    uint8_t *pg = vg;
2298
2299    for (i = 0; i < opr_sz; i += 1) {
2300        uint64_t nn = n[i], mm = m[i];
2301        uint64_t pp = expand_pred_s(pg[H1(i)]);
2302        d[i] = (nn & pp) | (mm & ~pp);
2303    }
2304}
2305
2306void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
2307                            void *vg, uint32_t desc)
2308{
2309    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2310    uint64_t *d = vd, *n = vn, *m = vm;
2311    uint8_t *pg = vg;
2312
2313    for (i = 0; i < opr_sz; i += 1) {
2314        uint64_t nn = n[i], mm = m[i];
2315        d[i] = (pg[H1(i)] & 1 ? nn : mm);
2316    }
2317}
2318
2319/* Two operand comparison controlled by a predicate.
2320 * ??? It is very tempting to expand this inline
2321 * with x86 instructions, e.g.
2322 *
2323 *    vcmpeqw    zm, zn, %ymm0
2324 *    vpmovmskb  %ymm0, %eax
2325 *    and        $0x5555, %eax
2326 *    and        pg, %eax
2327 *
2328 * or even aarch64, e.g.
2329 *
2330 *    // mask = 4000 1000 0400 0100 0040 0010 0004 0001
2331 *    cmeq       v0.8h, zn, zm
2332 *    and        v0.8h, v0.8h, mask
2333 *    addv       h0, v0.8h
2334 *    and        v0.8b, pg
2335 *
2336 * However, coming up with an abstraction that allows vector inputs and
2337 * a scalar output, and also handles the byte-ordering of sub-uint64_t
2338 * scalar outputs, is tricky.
2339 */
2340#define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK)                                 \
2341uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
2342{                                                                            \
2343    intptr_t opr_sz = simd_oprsz(desc);                                      \
2344    uint32_t flags = PREDTEST_INIT;                                          \
2345    intptr_t i = opr_sz;                                                     \
2346    do {                                                                     \
2347        uint64_t out = 0, pg;                                                \
2348        do {                                                                 \
2349            i -= sizeof(TYPE), out <<= sizeof(TYPE);                         \
2350            TYPE nn = *(TYPE *)(vn + H(i));                                  \
2351            TYPE mm = *(TYPE *)(vm + H(i));                                  \
2352            out |= nn OP mm;                                                 \
2353        } while (i & 63);                                                    \
2354        pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                            \
2355        out &= pg;                                                           \
2356        *(uint64_t *)(vd + (i >> 3)) = out;                                  \
2357        flags = iter_predtest_bwd(out, pg, flags);                           \
2358    } while (i > 0);                                                         \
2359    return flags;                                                            \
2360}
2361
2362#define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
2363    DO_CMP_PPZZ(NAME, TYPE, OP, H1,   0xffffffffffffffffull)
2364#define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
2365    DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
2366#define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
2367    DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
2368#define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
2369    DO_CMP_PPZZ(NAME, TYPE, OP,     , 0x0101010101010101ull)
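
/* Note on the masks: predicates carry one bit per vector *byte*, so an
 * element of N bytes owns N predicate bits of which only the lowest is
 * significant.  Hence the inner loop shifts OUT by sizeof(TYPE) per
 * element, and the masks above keep every bit, every 2nd, every 4th or
 * every 8th bit respectively.
 */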
2370
2371DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t,  ==)
2372DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
2373DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
2374DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)
2375
2376DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t,  !=)
2377DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
2378DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
2379DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)
2380
2381DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t,  >)
2382DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
2383DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
2384DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)
2385
2386DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t,  >=)
2387DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
2388DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
2389DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)
2390
2391DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t,  >)
2392DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
2393DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
2394DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)
2395
2396DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t,  >=)
2397DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
2398DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
2399DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)
2400
2401#undef DO_CMP_PPZZ_B
2402#undef DO_CMP_PPZZ_H
2403#undef DO_CMP_PPZZ_S
2404#undef DO_CMP_PPZZ_D
2405#undef DO_CMP_PPZZ
2406
2407/* Similar, but the second source is "wide".  */
2408#define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK)                     \
2409uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
2410{                                                                            \
2411    intptr_t opr_sz = simd_oprsz(desc);                                      \
2412    uint32_t flags = PREDTEST_INIT;                                          \
2413    intptr_t i = opr_sz;                                                     \
2414    do {                                                                     \
2415        uint64_t out = 0, pg;                                                \
2416        do {                                                                 \
2417            TYPEW mm = *(TYPEW *)(vm + i - 8);                               \
2418            do {                                                             \
2419                i -= sizeof(TYPE), out <<= sizeof(TYPE);                     \
2420                TYPE nn = *(TYPE *)(vn + H(i));                              \
2421                out |= nn OP mm;                                             \
2422            } while (i & 7);                                                 \
2423        } while (i & 63);                                                    \
2424        pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                            \
2425        out &= pg;                                                           \
2426        *(uint64_t *)(vd + (i >> 3)) = out;                                  \
2427        flags = iter_predtest_bwd(out, pg, flags);                           \
2428    } while (i > 0);                                                         \
2429    return flags;                                                            \
2430}
2431
2432#define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
2433    DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1,   0xffffffffffffffffull)
2434#define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
2435    DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
2436#define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
2437    DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)
2438
2439DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, uint8_t,  uint64_t, ==)
2440DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, uint16_t, uint64_t, ==)
2441DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, uint32_t, uint64_t, ==)
2442
2443DO_CMP_PPZW_B(sve_cmpne_ppzw_b, uint8_t,  uint64_t, !=)
2444DO_CMP_PPZW_H(sve_cmpne_ppzw_h, uint16_t, uint64_t, !=)
2445DO_CMP_PPZW_S(sve_cmpne_ppzw_s, uint32_t, uint64_t, !=)
2446
2447DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t,   int64_t, >)
2448DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t,  int64_t, >)
2449DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t,  int64_t, >)
2450
2451DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t,   int64_t, >=)
2452DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t,  int64_t, >=)
2453DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t,  int64_t, >=)
2454
2455DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t,  uint64_t, >)
2456DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
2457DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)
2458
2459DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t,  uint64_t, >=)
2460DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
2461DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)
2462
2463DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t,   int64_t, <)
2464DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t,  int64_t, <)
2465DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t,  int64_t, <)
2466
2467DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t,   int64_t, <=)
2468DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t,  int64_t, <=)
2469DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t,  int64_t, <=)
2470
2471DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t,  uint64_t, <)
2472DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
2473DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)
2474
2475DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t,  uint64_t, <=)
2476DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
2477DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)
2478
2479#undef DO_CMP_PPZW_B
2480#undef DO_CMP_PPZW_H
2481#undef DO_CMP_PPZW_S
2482#undef DO_CMP_PPZW
2483
2484/* Similar, but the second source is immediate.  */
2485#define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK)                         \
2486uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)   \
2487{                                                                    \
2488    intptr_t opr_sz = simd_oprsz(desc);                              \
2489    uint32_t flags = PREDTEST_INIT;                                  \
2490    TYPE mm = simd_data(desc);                                       \
2491    intptr_t i = opr_sz;                                             \
2492    do {                                                             \
2493        uint64_t out = 0, pg;                                        \
2494        do {                                                         \
2495            i -= sizeof(TYPE), out <<= sizeof(TYPE);                 \
2496            TYPE nn = *(TYPE *)(vn + H(i));                          \
2497            out |= nn OP mm;                                         \
2498        } while (i & 63);                                            \
2499        pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                    \
2500        out &= pg;                                                   \
2501        *(uint64_t *)(vd + (i >> 3)) = out;                          \
2502        flags = iter_predtest_bwd(out, pg, flags);                   \
2503    } while (i > 0);                                                 \
2504    return flags;                                                    \
2505}
2506
2507#define DO_CMP_PPZI_B(NAME, TYPE, OP) \
2508    DO_CMP_PPZI(NAME, TYPE, OP, H1,   0xffffffffffffffffull)
2509#define DO_CMP_PPZI_H(NAME, TYPE, OP) \
2510    DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
2511#define DO_CMP_PPZI_S(NAME, TYPE, OP) \
2512    DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
2513#define DO_CMP_PPZI_D(NAME, TYPE, OP) \
2514    DO_CMP_PPZI(NAME, TYPE, OP,     , 0x0101010101010101ull)
2515
2516DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t,  ==)
2517DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
2518DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
2519DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)
2520
2521DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t,  !=)
2522DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
2523DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
2524DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)
2525
2526DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t,  >)
2527DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
2528DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
2529DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)
2530
2531DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t,  >=)
2532DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
2533DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
2534DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)
2535
2536DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t,  >)
2537DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
2538DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
2539DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)
2540
2541DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t,  >=)
2542DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
2543DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
2544DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)
2545
2546DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t,  <)
2547DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
2548DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
2549DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)
2550
2551DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t,  <=)
2552DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
2553DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
2554DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)
2555
2556DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t,  <)
2557DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
2558DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
2559DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)
2560
2561DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t,  <=)
2562DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
2563DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
2564DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)
2565
2566#undef DO_CMP_PPZI_B
2567#undef DO_CMP_PPZI_H
2568#undef DO_CMP_PPZI_S
2569#undef DO_CMP_PPZI_D
2570#undef DO_CMP_PPZI
2571
2572/* Similar to the ARM LastActive pseudocode function.  */
2573static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
2574{
2575    intptr_t i;
2576
2577    for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) {
2578        uint64_t pg = *(uint64_t *)(vg + i);
2579        if (pg) {
2580            return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0;
2581        }
2582    }
2583    return 0;
2584}
2585
2586/* Compute a mask into RETB that is true for all G, up to and including
2587 * (if after) or excluding (if !after) the first G & N.
2588 * Return true if BRK found.
2589 */
2590static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
2591                        bool brk, bool after)
2592{
2593    uint64_t b;
2594
2595    if (brk) {
2596        b = 0;
2597    } else if ((g & n) == 0) {
2598        /* For all G, no N are set; break not found.  */
2599        b = g;
2600    } else {
2601        /* Break somewhere in N.  Locate it.  */
2602        b = g & n;            /* guard true, pred true */
2603        b = b & -b;           /* first such */
2604        if (after) {
2605            b = b | (b - 1);  /* break after same */
2606        } else {
2607            b = b - 1;        /* break before same */
2608        }
2609        brk = true;
2610    }
2611
2612    *retb = b;
2613    return brk;
2614}
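
/* Worked example: with g = 0xff and n = 0b00100100, the first active
 * true element is bit 2 (isolated by b & -b).  "Break after" returns
 * b = 0b00000111, keeping elements up to and including the match,
 * while "break before" returns b = 0b00000011, stopping just short
 * of it.
 */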
2615
2616/* Compute a zeroing BRK.  */
2617static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
2618                          intptr_t oprsz, bool after)
2619{
2620    bool brk = false;
2621    intptr_t i;
2622
2623    for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2624        uint64_t this_b, this_g = g[i];
2625
2626        brk = compute_brk(&this_b, n[i], this_g, brk, after);
2627        d[i] = this_b & this_g;
2628    }
2629}
2630
2631/* Likewise, but also compute flags.  */
2632static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
2633                               intptr_t oprsz, bool after)
2634{
2635    uint32_t flags = PREDTEST_INIT;
2636    bool brk = false;
2637    intptr_t i;
2638
2639    for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2640        uint64_t this_b, this_d, this_g = g[i];
2641
2642        brk = compute_brk(&this_b, n[i], this_g, brk, after);
2643        d[i] = this_d = this_b & this_g;
2644        flags = iter_predtest_fwd(this_d, this_g, flags);
2645    }
2646    return flags;
2647}
2648
2649/* Compute a merging BRK.  */
2650static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
2651                          intptr_t oprsz, bool after)
2652{
2653    bool brk = false;
2654    intptr_t i;
2655
2656    for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2657        uint64_t this_b, this_g = g[i];
2658
2659        brk = compute_brk(&this_b, n[i], this_g, brk, after);
2660        d[i] = (this_b & this_g) | (d[i] & ~this_g);
2661    }
2662}
2663
2664/* Likewise, but also compute flags.  */
2665static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
2666                               intptr_t oprsz, bool after)
2667{
2668    uint32_t flags = PREDTEST_INIT;
2669    bool brk = false;
2670    intptr_t i;
2671
2672    for (i = 0; i < oprsz / 8; ++i) {
2673        uint64_t this_b, this_d = d[i], this_g = g[i];
2674
2675        brk = compute_brk(&this_b, n[i], this_g, brk, after);
2676        d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
2677        flags = iter_predtest_fwd(this_d, this_g, flags);
2678    }
2679    return flags;
2680}
2681
2682static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
2683{
2684    /* It is quicker to zero the whole predicate than loop on OPRSZ.
2685     * The compiler should turn this into 4 64-bit integer stores.
2686     */
2687    memset(d, 0, sizeof(ARMPredicateReg));
2688    return PREDTEST_INIT;
2689}
2690
2691void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
2692                       uint32_t pred_desc)
2693{
2694    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2695    if (last_active_pred(vn, vg, oprsz)) {
2696        compute_brk_z(vd, vm, vg, oprsz, true);
2697    } else {
2698        do_zero(vd, oprsz);
2699    }
2700}
2701
2702uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
2703                            uint32_t pred_desc)
2704{
2705    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2706    if (last_active_pred(vn, vg, oprsz)) {
2707        return compute_brks_z(vd, vm, vg, oprsz, true);
2708    } else {
2709        return do_zero(vd, oprsz);
2710    }
2711}
2712
2713void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
2714                       uint32_t pred_desc)
2715{
2716    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2717    if (last_active_pred(vn, vg, oprsz)) {
2718        compute_brk_z(vd, vm, vg, oprsz, false);
2719    } else {
2720        do_zero(vd, oprsz);
2721    }
2722}
2723
2724uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
2725                            uint32_t pred_desc)
2726{
2727    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2728    if (last_active_pred(vn, vg, oprsz)) {
2729        return compute_brks_z(vd, vm, vg, oprsz, false);
2730    } else {
2731        return do_zero(vd, oprsz);
2732    }
2733}
2734
2735void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2736{
2737    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2738    compute_brk_z(vd, vn, vg, oprsz, true);
2739}
2740
2741uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2742{
2743    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2744    return compute_brks_z(vd, vn, vg, oprsz, true);
2745}
2746
2747void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2748{
2749    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2750    compute_brk_z(vd, vn, vg, oprsz, false);
2751}
2752
2753uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2754{
2755    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2756    return compute_brks_z(vd, vn, vg, oprsz, false);
2757}
2758
2759void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2760{
2761    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2762    compute_brk_m(vd, vn, vg, oprsz, true);
2763}
2764
2765uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2766{
2767    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2768    return compute_brks_m(vd, vn, vg, oprsz, true);
2769}
2770
2771void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2772{
2773    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2774    compute_brk_m(vd, vn, vg, oprsz, false);
2775}
2776
2777uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2778{
2779    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2780    return compute_brks_m(vd, vn, vg, oprsz, false);
2781}
2782
2783void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2784{
2785    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2786
2787    if (!last_active_pred(vn, vg, oprsz)) {
2788        do_zero(vd, oprsz);
2789    }
2790}
2791
2792/* As if PredTest(Ones(PL), D, esz).  */
2793static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
2794                              uint64_t esz_mask)
2795{
2796    uint32_t flags = PREDTEST_INIT;
2797    intptr_t i;
2798
2799    for (i = 0; i < oprsz / 8; i++) {
2800        flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
2801    }
2802    if (oprsz & 7) {
2803        uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
2804        flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
2805    }
2806    return flags;
2807}
2808
2809uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2810{
2811    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2812
2813    if (last_active_pred(vn, vg, oprsz)) {
2814        return predtest_ones(vd, oprsz, -1);
2815    } else {
2816        return do_zero(vd, oprsz);
2817    }
2818}
2819
2820uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
2821{
2822    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2823    intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2824    uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
2825    intptr_t i;
2826
2827    for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2828        uint64_t t = n[i] & g[i] & mask;
2829        sum += ctpop64(t);
2830    }
2831    return sum;
2832}
2833
2834uint32_t HELPER(sve_while)(void *vd, uint32_t count, uint32_t pred_desc)
2835{
2836    uintptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2837    intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2838    uint64_t esz_mask = pred_esz_masks[esz];
2839    ARMPredicateReg *d = vd;
2840    uint32_t flags;
2841    intptr_t i;
2842
2843    /* Begin with a zero predicate register.  */
2844    flags = do_zero(d, oprsz);
2845    if (count == 0) {
2846        return flags;
2847    }
2848
2849    /* Scale from predicate element count to bits.  */
2850    count <<= esz;
2851    /* Bound to the bits in the predicate.  */
2852    count = MIN(count, oprsz * 8);
2853
2854    /* Set all of the requested bits.  */
2855    for (i = 0; i < count / 64; ++i) {
2856        d->p[i] = esz_mask;
2857    }
2858    if (count & 63) {
2859        d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
2860    }
2861
2862    return predtest_ones(d, oprsz, esz_mask);
2863}
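
/* Worked example: for esz = 1 (halfwords) and count = 5, COUNT scales
 * to 10 predicate bits, so d->p[0] becomes
 * MAKE_64BIT_MASK(0, 10) & 0x5555... = 0b0101010101: five active
 * elements, each flagged in the low bit of its 2-bit predicate slot.
 */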
2864
2865/* Recursive reduction of a vector by a binary function;
2866 * cf. the ARM ARM function ReducePredicated.
2867 *
2868 * While it would be possible to write this without the DATA temporary,
2869 * it is much simpler to process the predicate register this way.
2870 * The recursion is bounded to depth 7 (128 fp16 elements), so there's
2871 * little to gain with a more complex non-recursive form.
2872 */
2873#define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT)                         \
2874static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
2875{                                                                     \
2876    if (n == 1) {                                                     \
2877        return *data;                                                 \
2878    } else {                                                          \
2879        uintptr_t half = n / 2;                                       \
2880        TYPE lo = NAME##_reduce(data, status, half);                  \
2881        TYPE hi = NAME##_reduce(data + half, status, half);           \
2882        return TYPE##_##FUNC(lo, hi, status);                         \
2883    }                                                                 \
2884}                                                                     \
2885uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc)    \
2886{                                                                     \
2887    uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_maxsz(desc);  \
2888    TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)];                   \
2889    for (i = 0; i < oprsz; ) {                                        \
2890        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));               \
2891        do {                                                          \
2892            TYPE nn = *(TYPE *)(vn + H(i));                           \
2893            *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT);      \
2894            i += sizeof(TYPE), pg >>= sizeof(TYPE);                   \
2895        } while (i & 15);                                             \
2896    }                                                                 \
2897    for (; i < maxsz; i += sizeof(TYPE)) {                            \
2898        *(TYPE *)((void *)data + i) = IDENT;                          \
2899    }                                                                 \
2900    return NAME##_reduce(data, vs, maxsz / sizeof(TYPE));             \
2901}
2902
2903DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero)
2904DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero)
2905DO_REDUCE(sve_faddv_d, float64,     , add, float64_zero)
2906
2907/* Identity is floatN_default_nan, without the function call.  */
2908DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00)
2909DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000)
2910DO_REDUCE(sve_fminnmv_d, float64,     , minnum, 0x7FF8000000000000ULL)
2911
2912DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00)
2913DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000)
2914DO_REDUCE(sve_fmaxnmv_d, float64,     , maxnum, 0x7FF8000000000000ULL)
2915
2916DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity)
2917DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity)
2918DO_REDUCE(sve_fminv_d, float64,     , min, float64_infinity)
2919
2920DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity))
2921DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity))
2922DO_REDUCE(sve_fmaxv_d, float64,     , max, float64_chs(float64_infinity))
2923
2924#undef DO_REDUCE
2925
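/* FADDA: strictly ordered floating-point accumulation.  The active
 * elements must be added to the scalar NN from lowest index upward,
 * so no reduction tree is possible here.
 */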
2926uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
2927                             void *status, uint32_t desc)
2928{
2929    intptr_t i = 0, opr_sz = simd_oprsz(desc);
2930    float16 result = nn;
2931
2932    do {
2933        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
2934        do {
2935            if (pg & 1) {
2936                float16 mm = *(float16 *)(vm + H1_2(i));
2937                result = float16_add(result, mm, status);
2938            }
2939            i += sizeof(float16), pg >>= sizeof(float16);
2940        } while (i & 15);
2941    } while (i < opr_sz);
2942
2943    return result;
2944}
2945
2946uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
2947                             void *status, uint32_t desc)
2948{
2949    intptr_t i = 0, opr_sz = simd_oprsz(desc);
2950    float32 result = nn;
2951
2952    do {
2953        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
2954        do {
2955            if (pg & 1) {
2956                float32 mm = *(float32 *)(vm + H1_4(i));
2957                result = float32_add(result, mm, status);
2958            }
2959            i += sizeof(float32), pg >>= sizeof(float32);
2960        } while (i & 15);
2961    } while (i < opr_sz);
2962
2963    return result;
2964}
2965
2966uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
2967                             void *status, uint32_t desc)
2968{
2969    intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
2970    uint64_t *m = vm;
2971    uint8_t *pg = vg;
2972
2973    for (i = 0; i < opr_sz; i++) {
2974        if (pg[H1(i)] & 1) {
2975            nn = float64_add(nn, m[i], status);
2976        }
2977    }
2978
2979    return nn;
2980}
2981
2982/* Fully general three-operand expander, controlled by a predicate,
2983 * with the extra float_status parameter.
2984 */
2985#define DO_ZPZZ_FP(NAME, TYPE, H, OP)                           \
2986void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,       \
2987                  void *status, uint32_t desc)                  \
2988{                                                               \
2989    intptr_t i = simd_oprsz(desc);                              \
2990    uint64_t *g = vg;                                           \
2991    do {                                                        \
2992        uint64_t pg = g[(i - 1) >> 6];                          \
2993        do {                                                    \
2994            i -= sizeof(TYPE);                                  \
2995            if (likely((pg >> (i & 63)) & 1)) {                 \
2996                TYPE nn = *(TYPE *)(vn + H(i));                 \
2997                TYPE mm = *(TYPE *)(vm + H(i));                 \
2998                *(TYPE *)(vd + H(i)) = OP(nn, mm, status);      \
2999            }                                                   \
3000        } while (i & 63);                                       \
3001    } while (i != 0);                                           \
3002}
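/* Note the expander walks backward from oprsz, fetching each 64-bit
 * predicate word once per 64 bytes of vector data and terminating
 * exactly at i == 0.
 */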
3003
3004DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
3005DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
3006DO_ZPZZ_FP(sve_fadd_d, uint64_t,     , float64_add)
3007
3008DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
3009DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
3010DO_ZPZZ_FP(sve_fsub_d, uint64_t,     , float64_sub)
3011
3012DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
3013DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
3014DO_ZPZZ_FP(sve_fmul_d, uint64_t,     , float64_mul)
3015
3016DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
3017DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
3018DO_ZPZZ_FP(sve_fdiv_d, uint64_t,     , float64_div)
3019
3020DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
3021DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
3022DO_ZPZZ_FP(sve_fmin_d, uint64_t,     , float64_min)
3023
3024DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
3025DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
3026DO_ZPZZ_FP(sve_fmax_d, uint64_t,     , float64_max)
3027
3028DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
3029DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
3030DO_ZPZZ_FP(sve_fminnum_d, uint64_t,     , float64_minnum)
3031
3032DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
3033DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
3034DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t,     , float64_maxnum)
3035
3036static inline float16 abd_h(float16 a, float16 b, float_status *s)
3037{
3038    return float16_abs(float16_sub(a, b, s));
3039}
3040
3041static inline float32 abd_s(float32 a, float32 b, float_status *s)
3042{
3043    return float32_abs(float32_sub(a, b, s));
3044}
3045
3046static inline float64 abd_d(float64 a, float64 b, float_status *s)
3047{
3048    return float64_abs(float64_sub(a, b, s));
3049}
3050
3051DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
3052DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
3053DO_ZPZZ_FP(sve_fabd_d, uint64_t,     , abd_d)
3054
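/* float64_scalbn takes an int exponent argument, so the int64_t vector
 * element must be clamped first; shifts beyond INT_MAX/INT_MIN already
 * saturate the result anyway.
 */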
3055static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
3056{
3057    int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
3058    return float64_scalbn(a, b_int, s);
3059}
3060
3061DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
3062DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
3063DO_ZPZZ_FP(sve_fscalbn_d, int64_t,     , scalbn_d)
3064
3065DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
3066DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
3067DO_ZPZZ_FP(sve_fmulx_d, uint64_t,     , helper_vfp_mulxd)
3068
3069#undef DO_ZPZZ_FP
3070
3071/* Three-operand expander, with one scalar operand, controlled by
3072 * a predicate, with the extra float_status parameter.
3073 */
3074#define DO_ZPZS_FP(NAME, TYPE, H, OP) \
3075void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar,  \
3076                  void *status, uint32_t desc)                    \
3077{                                                                 \
3078    intptr_t i = simd_oprsz(desc);                                \
3079    uint64_t *g = vg;                                             \
3080    TYPE mm = scalar;                                             \
3081    do {                                                          \
3082        uint64_t pg = g[(i - 1) >> 6];                            \
3083        do {                                                      \
3084            i -= sizeof(TYPE);                                    \
3085            if (likely((pg >> (i & 63)) & 1)) {                   \
3086                TYPE nn = *(TYPE *)(vn + H(i));                   \
3087                *(TYPE *)(vd + H(i)) = OP(nn, mm, status);        \
3088            }                                                     \
3089        } while (i & 63);                                         \
3090    } while (i != 0);                                             \
3091}
3092
3093DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
3094DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
3095DO_ZPZS_FP(sve_fadds_d, float64,     , float64_add)
3096
3097DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
3098DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
3099DO_ZPZS_FP(sve_fsubs_d, float64,     , float64_sub)
3100
3101DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
3102DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
3103DO_ZPZS_FP(sve_fmuls_d, float64,     , float64_mul)
3104
3105static inline float16 subr_h(float16 a, float16 b, float_status *s)
3106{
3107    return float16_sub(b, a, s);
3108}
3109
3110static inline float32 subr_s(float32 a, float32 b, float_status *s)
3111{
3112    return float32_sub(b, a, s);
3113}
3114
3115static inline float64 subr_d(float64 a, float64 b, float_status *s)
3116{
3117    return float64_sub(b, a, s);
3118}
3119
3120DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
3121DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
3122DO_ZPZS_FP(sve_fsubrs_d, float64,     , subr_d)
3123
3124DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
3125DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
3126DO_ZPZS_FP(sve_fmaxnms_d, float64,     , float64_maxnum)
3127
3128DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
3129DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
3130DO_ZPZS_FP(sve_fminnms_d, float64,     , float64_minnum)
3131
3132DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
3133DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
3134DO_ZPZS_FP(sve_fmaxs_d, float64,     , float64_max)
3135
3136DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
3137DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
3138DO_ZPZS_FP(sve_fmins_d, float64,     , float64_min)
3139
3140/* Fully general two-operand expander, controlled by a predicate,
3141 * with the extra float_status parameter.
3142 */
3143#define DO_ZPZ_FP(NAME, TYPE, H, OP)                                  \
3144void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
3145{                                                                     \
3146    intptr_t i = simd_oprsz(desc);                                    \
3147    uint64_t *g = vg;                                                 \
3148    do {                                                              \
3149        uint64_t pg = g[(i - 1) >> 6];                                \
3150        do {                                                          \
3151            i -= sizeof(TYPE);                                        \
3152            if (likely((pg >> (i & 63)) & 1)) {                       \
3153                TYPE nn = *(TYPE *)(vn + H(i));                       \
3154                *(TYPE *)(vd + H(i)) = OP(nn, status);                \
3155            }                                                         \
3156        } while (i & 63);                                             \
3157    } while (i != 0);                                                 \
3158}
3159
3160/* SVE fp16 conversions always use IEEE mode.  Like AdvSIMD, they ignore
3161 * FZ16.  When converting from fp16, this affects flushing input denormals;
3162 * when converting to fp16, this affects flushing output denormals.
3163 */
3164static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
3165{
3166    flag save = get_flush_inputs_to_zero(fpst);
3167    float32 ret;
3168
3169    set_flush_inputs_to_zero(false, fpst);
3170    ret = float16_to_float32(f, true, fpst);
3171    set_flush_inputs_to_zero(save, fpst);
3172    return ret;
3173}
3174
3175static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
3176{
3177    flag save = get_flush_inputs_to_zero(fpst);
3178    float64 ret;
3179
3180    set_flush_inputs_to_zero(false, fpst);
3181    ret = float16_to_float64(f, true, fpst);
3182    set_flush_inputs_to_zero(save, fpst);
3183    return ret;
3184}
3185
3186static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
3187{
3188    flag save = get_flush_to_zero(fpst);
3189    float16 ret;
3190
3191    set_flush_to_zero(false, fpst);
3192    ret = float32_to_float16(f, true, fpst);
3193    set_flush_to_zero(save, fpst);
3194    return ret;
3195}
3196
3197static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
3198{
3199    flag save = get_flush_to_zero(fpst);
3200    float16 ret;
3201
3202    set_flush_to_zero(false, fpst);
3203    ret = float64_to_float16(f, true, fpst);
3204    set_flush_to_zero(save, fpst);
3205    return ret;
3206}
3207
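/* Round-to-zero conversions for FCVTZS/FCVTZU.  A NaN input must
 * produce 0 rather than the saturated value softfloat would choose,
 * while still raising the Invalid Operation exception.
 */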
3208static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
3209{
3210    if (float16_is_any_nan(f)) {
3211        float_raise(float_flag_invalid, s);
3212        return 0;
3213    }
3214    return float16_to_int16_round_to_zero(f, s);
3215}
3216
3217static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
3218{
3219    if (float16_is_any_nan(f)) {
3220        float_raise(float_flag_invalid, s);
3221        return 0;
3222    }
3223    return float16_to_int64_round_to_zero(f, s);
3224}
3225
3226static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
3227{
3228    if (float32_is_any_nan(f)) {
3229        float_raise(float_flag_invalid, s);
3230        return 0;
3231    }
3232    return float32_to_int64_round_to_zero(f, s);
3233}
3234
3235static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
3236{
3237    if (float64_is_any_nan(f)) {
3238        float_raise(float_flag_invalid, s);
3239        return 0;
3240    }
3241    return float64_to_int64_round_to_zero(f, s);
3242}
3243
3244static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
3245{
3246    if (float16_is_any_nan(f)) {
3247        float_raise(float_flag_invalid, s);
3248        return 0;
3249    }
3250    return float16_to_uint16_round_to_zero(f, s);
3251}
3252
3253static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
3254{
3255    if (float16_is_any_nan(f)) {
3256        float_raise(float_flag_invalid, s);
3257        return 0;
3258    }
3259    return float16_to_uint64_round_to_zero(f, s);
3260}
3261
3262static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
3263{
3264    if (float32_is_any_nan(f)) {
3265        float_raise(float_flag_invalid, s);
3266        return 0;
3267    }
3268    return float32_to_uint64_round_to_zero(f, s);
3269}
3270
3271static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
3272{
3273    if (float64_is_any_nan(f)) {
3274        float_raise(float_flag_invalid, s);
3275        return 0;
3276    }
3277    return float64_to_uint64_round_to_zero(f, s);
3278}
3279
3280DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
3281DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
3282DO_ZPZ_FP(sve_fcvt_dh, uint64_t,     , sve_f64_to_f16)
3283DO_ZPZ_FP(sve_fcvt_hd, uint64_t,     , sve_f16_to_f64)
3284DO_ZPZ_FP(sve_fcvt_ds, uint64_t,     , float64_to_float32)
3285DO_ZPZ_FP(sve_fcvt_sd, uint64_t,     , float32_to_float64)
3286
3287DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
3288DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
3289DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
3290DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t,     , vfp_float16_to_int64_rtz)
3291DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t,     , vfp_float32_to_int64_rtz)
3292DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t,     , helper_vfp_tosizd)
3293DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t,     , vfp_float64_to_int64_rtz)
3294
3295DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
3296DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
3297DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
3298DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t,     , vfp_float16_to_uint64_rtz)
3299DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t,     , vfp_float32_to_uint64_rtz)
3300DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t,     , helper_vfp_touizd)
3301DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t,     , vfp_float64_to_uint64_rtz)
3302
3303DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
3304DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
3305DO_ZPZ_FP(sve_frint_d, uint64_t,     , helper_rintd)
3306
3307DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
3308DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
3309DO_ZPZ_FP(sve_frintx_d, uint64_t,     , float64_round_to_int)
3310
3311DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
3312DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
3313DO_ZPZ_FP(sve_frecpx_d, uint64_t,     , helper_frecpx_f64)
3314
3315DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
3316DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
3317DO_ZPZ_FP(sve_fsqrt_d, uint64_t,     , float64_sqrt)
3318
3319DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
3320DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
3321DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
3322DO_ZPZ_FP(sve_scvt_sd, uint64_t,     , int32_to_float64)
3323DO_ZPZ_FP(sve_scvt_dh, uint64_t,     , int64_to_float16)
3324DO_ZPZ_FP(sve_scvt_ds, uint64_t,     , int64_to_float32)
3325DO_ZPZ_FP(sve_scvt_dd, uint64_t,     , int64_to_float64)
3326
3327DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
3328DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
3329DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
3330DO_ZPZ_FP(sve_ucvt_sd, uint64_t,     , uint32_to_float64)
3331DO_ZPZ_FP(sve_ucvt_dh, uint64_t,     , uint64_to_float16)
3332DO_ZPZ_FP(sve_ucvt_ds, uint64_t,     , uint64_to_float32)
3333DO_ZPZ_FP(sve_ucvt_dd, uint64_t,     , uint64_to_float64)
3334
3335#undef DO_ZPZ_FP
3336
3337/* 4-operand predicated multiply-add.  This requires 7 operands to pass
3338 * "properly", so we need to encode some of the registers into DESC.
3339 */
3340QEMU_BUILD_BUG_ON(SIMD_DATA_SHIFT + 20 > 32);
3341
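/* Layout of the extra DESC bits used by the helpers below: four 5-bit
 * register numbers (rd, rn, rm, ra) packed at SIMD_DATA_SHIFT.
 */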
3342static void do_fmla_zpzzz_h(CPUARMState *env, void *vg, uint32_t desc,
3343                            uint16_t neg1, uint16_t neg3)
3344{
3345    intptr_t i = simd_oprsz(desc);
3346    unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3347    unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3348    unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3349    unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3350    void *vd = &env->vfp.zregs[rd];
3351    void *vn = &env->vfp.zregs[rn];
3352    void *vm = &env->vfp.zregs[rm];
3353    void *va = &env->vfp.zregs[ra];
3354    uint64_t *g = vg;
3355
3356    do {
3357        uint64_t pg = g[(i - 1) >> 6];
3358        do {
3359            i -= 2;
3360            if (likely((pg >> (i & 63)) & 1)) {
3361                float16 e1, e2, e3, r;
3362
3363                e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
3364                e2 = *(uint16_t *)(vm + H1_2(i));
3365                e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
3366                r = float16_muladd(e1, e2, e3, 0, &env->vfp.fp_status_f16);
3367                *(uint16_t *)(vd + H1_2(i)) = r;
3368            }
3369        } while (i & 63);
3370    } while (i != 0);
3371}
3372
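/* The four flavours below differ only in which inputs have their sign
 * bit flipped: FMLS and FNMLA negate N (neg1), while FNMLA and FNMLS
 * negate A (neg3).
 */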
3373void HELPER(sve_fmla_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
3374{
3375    do_fmla_zpzzz_h(env, vg, desc, 0, 0);
3376}
3377
3378void HELPER(sve_fmls_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
3379{
3380    do_fmla_zpzzz_h(env, vg, desc, 0x8000, 0);
3381}
3382
3383void HELPER(sve_fnmla_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
3384{
3385    do_fmla_zpzzz_h(env, vg, desc, 0x8000, 0x8000);
3386}
3387
3388void HELPER(sve_fnmls_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
3389{
3390    do_fmla_zpzzz_h(env, vg, desc, 0, 0x8000);
3391}
3392
3393static void do_fmla_zpzzz_s(CPUARMState *env, void *vg, uint32_t desc,
3394                            uint32_t neg1, uint32_t neg3)
3395{
3396    intptr_t i = simd_oprsz(desc);
3397    unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3398    unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3399    unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3400    unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3401    void *vd = &env->vfp.zregs[rd];
3402    void *vn = &env->vfp.zregs[rn];
3403    void *vm = &env->vfp.zregs[rm];
3404    void *va = &env->vfp.zregs[ra];
3405    uint64_t *g = vg;
3406
3407    do {
3408        uint64_t pg = g[(i - 1) >> 6];
3409        do {
3410            i -= 4;
3411            if (likely((pg >> (i & 63)) & 1)) {
3412                float32 e1, e2, e3, r;
3413
3414                e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
3415                e2 = *(uint32_t *)(vm + H1_4(i));
3416                e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
3417                r = float32_muladd(e1, e2, e3, 0, &env->vfp.fp_status);
3418                *(uint32_t *)(vd + H1_4(i)) = r;
3419            }
3420        } while (i & 63);
3421    } while (i != 0);
3422}
3423
3424void HELPER(sve_fmla_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3425{
3426    do_fmla_zpzzz_s(env, vg, desc, 0, 0);
3427}
3428
3429void HELPER(sve_fmls_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3430{
3431    do_fmla_zpzzz_s(env, vg, desc, 0x80000000, 0);
3432}
3433
3434void HELPER(sve_fnmla_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3435{
3436    do_fmla_zpzzz_s(env, vg, desc, 0x80000000, 0x80000000);
3437}
3438
3439void HELPER(sve_fnmls_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3440{
3441    do_fmla_zpzzz_s(env, vg, desc, 0, 0x80000000);
3442}
3443
3444static void do_fmla_zpzzz_d(CPUARMState *env, void *vg, uint32_t desc,
3445                            uint64_t neg1, uint64_t neg3)
3446{
3447    intptr_t i = simd_oprsz(desc);
3448    unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3449    unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3450    unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3451    unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3452    void *vd = &env->vfp.zregs[rd];
3453    void *vn = &env->vfp.zregs[rn];
3454    void *vm = &env->vfp.zregs[rm];
3455    void *va = &env->vfp.zregs[ra];
3456    uint64_t *g = vg;
3457
3458    do {
3459        uint64_t pg = g[(i - 1) >> 6];
3460        do {
3461            i -= 8;
3462            if (likely((pg >> (i & 63)) & 1)) {
3463                float64 e1, e2, e3, r;
3464
3465                e1 = *(uint64_t *)(vn + i) ^ neg1;
3466                e2 = *(uint64_t *)(vm + i);
3467                e3 = *(uint64_t *)(va + i) ^ neg3;
3468                r = float64_muladd(e1, e2, e3, 0, &env->vfp.fp_status);
3469                *(uint64_t *)(vd + i) = r;
3470            }
3471        } while (i & 63);
3472    } while (i != 0);
3473}
3474
3475void HELPER(sve_fmla_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3476{
3477    do_fmla_zpzzz_d(env, vg, desc, 0, 0);
3478}
3479
3480void HELPER(sve_fmls_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3481{
3482    do_fmla_zpzzz_d(env, vg, desc, INT64_MIN, 0);
3483}
3484
3485void HELPER(sve_fnmla_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3486{
3487    do_fmla_zpzzz_d(env, vg, desc, INT64_MIN, INT64_MIN);
3488}
3489
3490void HELPER(sve_fnmls_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3491{
3492    do_fmla_zpzzz_d(env, vg, desc, 0, INT64_MIN);
3493}
3494
3495/* Two-operand floating-point comparison controlled by a predicate.
3496 * Unlike the integer version, we are not allowed to optimistically
3497 * compare operands, since the comparison may have side effects on
3498 * the FPSR.
3499 */
3500#define DO_FPCMP_PPZZ(NAME, TYPE, H, OP)                                \
3501void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,               \
3502                  void *status, uint32_t desc)                          \
3503{                                                                       \
3504    intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6;                    \
3505    uint64_t *d = vd, *g = vg;                                          \
3506    do {                                                                \
3507        uint64_t out = 0, pg = g[j];                                    \
3508        do {                                                            \
3509            i -= sizeof(TYPE), out <<= sizeof(TYPE);                    \
3510            if (likely((pg >> (i & 63)) & 1)) {                         \
3511                TYPE nn = *(TYPE *)(vn + H(i));                         \
3512                TYPE mm = *(TYPE *)(vm + H(i));                         \
3513                out |= OP(TYPE, nn, mm, status);                        \
3514            }                                                           \
3515        } while (i & 63);                                               \
3516        d[j--] = out;                                                   \
3517    } while (i > 0);                                                    \
3518}
3519
3520#define DO_FPCMP_PPZZ_H(NAME, OP) \
3521    DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
3522#define DO_FPCMP_PPZZ_S(NAME, OP) \
3523    DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
3524#define DO_FPCMP_PPZZ_D(NAME, OP) \
3525    DO_FPCMP_PPZZ(NAME##_d, float64,     , OP)
3526
3527#define DO_FPCMP_PPZZ_ALL(NAME, OP) \
3528    DO_FPCMP_PPZZ_H(NAME, OP)   \
3529    DO_FPCMP_PPZZ_S(NAME, OP)   \
3530    DO_FPCMP_PPZZ_D(NAME, OP)
3531
3532#define DO_FCMGE(TYPE, X, Y, ST)  TYPE##_compare(Y, X, ST) <= 0
3533#define DO_FCMGT(TYPE, X, Y, ST)  TYPE##_compare(Y, X, ST) < 0
3534#define DO_FCMLE(TYPE, X, Y, ST)  TYPE##_compare(X, Y, ST) <= 0
3535#define DO_FCMLT(TYPE, X, Y, ST)  TYPE##_compare(X, Y, ST) < 0
3536#define DO_FCMEQ(TYPE, X, Y, ST)  TYPE##_compare_quiet(X, Y, ST) == 0
3537#define DO_FCMNE(TYPE, X, Y, ST)  TYPE##_compare_quiet(X, Y, ST) != 0
3538#define DO_FCMUO(TYPE, X, Y, ST)  \
3539    TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
3540#define DO_FACGE(TYPE, X, Y, ST)  \
3541    TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
3542#define DO_FACGT(TYPE, X, Y, ST)  \
3543    TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0
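/* FACGE/FACGT compare absolute values; as with DO_FCMGE above, the
 * operands are swapped so that "|X| >= |Y|" is evaluated as
 * "|Y| <= |X|" using the signalling compare.
 */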
3544
3545DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
3546DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
3547DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
3548DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
3549DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
3550DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
3551DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)
3552
3553#undef DO_FPCMP_PPZZ_ALL
3554#undef DO_FPCMP_PPZZ_D
3555#undef DO_FPCMP_PPZZ_S
3556#undef DO_FPCMP_PPZZ_H
3557#undef DO_FPCMP_PPZZ
3558
3559/* One operand floating-point comparison against zero, controlled
3560 * by a predicate.
3561 */
3562#define DO_FPCMP_PPZ0(NAME, TYPE, H, OP)                   \
3563void HELPER(NAME)(void *vd, void *vn, void *vg,            \
3564                  void *status, uint32_t desc)             \
3565{                                                          \
3566    intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6;       \
3567    uint64_t *d = vd, *g = vg;                             \
3568    do {                                                   \
3569        uint64_t out = 0, pg = g[j];                       \
3570        do {                                               \
3571            i -= sizeof(TYPE), out <<= sizeof(TYPE);       \
3572            if ((pg >> (i & 63)) & 1) {                    \
3573                TYPE nn = *(TYPE *)(vn + H(i));            \
3574                out |= OP(TYPE, nn, 0, status);            \
3575            }                                              \
3576        } while (i & 63);                                  \
3577        d[j--] = out;                                      \
3578    } while (i > 0);                                       \
3579}
3580
3581#define DO_FPCMP_PPZ0_H(NAME, OP) \
3582    DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
3583#define DO_FPCMP_PPZ0_S(NAME, OP) \
3584    DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
3585#define DO_FPCMP_PPZ0_D(NAME, OP) \
3586    DO_FPCMP_PPZ0(NAME##_d, float64,     , OP)
3587
3588#define DO_FPCMP_PPZ0_ALL(NAME, OP) \
3589    DO_FPCMP_PPZ0_H(NAME, OP)   \
3590    DO_FPCMP_PPZ0_S(NAME, OP)   \
3591    DO_FPCMP_PPZ0_D(NAME, OP)
3592
3593DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
3594DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
3595DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
3596DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
3597DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
3598DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
3599
3600/* FP Trig Multiply-Add. */
3601
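/* The coefficient tables hold the Taylor series terms used by FTMAD:
 * entries 0-7 are the sine series (1, -1/6, 1/120, ...) and entries
 * 8-15 the cosine series (1, -1/2, 1/24, ...); a negative M operand
 * selects the second half via xx += 8.
 */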
3602void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3603{
3604    static const float16 coeff[16] = {
3605        0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
3606        0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
3607    };
3608    intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
3609    intptr_t x = simd_data(desc);
3610    float16 *d = vd, *n = vn, *m = vm;
3611    for (i = 0; i < opr_sz; i++) {
3612        float16 mm = m[i];
3613        intptr_t xx = x;
3614        if (float16_is_neg(mm)) {
3615            mm = float16_abs(mm);
3616            xx += 8;
3617        }
3618        d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs);
3619    }
3620}
3621
3622void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3623{
3624    static const float32 coeff[16] = {
3625        0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
3626        0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
3627        0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
3628        0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
3629    };
3630    intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
3631    intptr_t x = simd_data(desc);
3632    float32 *d = vd, *n = vn, *m = vm;
3633    for (i = 0; i < opr_sz; i++) {
3634        float32 mm = m[i];
3635        intptr_t xx = x;
3636        if (float32_is_neg(mm)) {
3637            mm = float32_abs(mm);
3638            xx += 8;
3639        }
3640        d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs);
3641    }
3642}
3643
3644void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3645{
3646    static const float64 coeff[16] = {
3647        0x3ff0000000000000ull, 0xbfc5555555555543ull,
3648        0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
3649        0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
3650        0x3de5d8408868552full, 0x0000000000000000ull,
3651        0x3ff0000000000000ull, 0xbfe0000000000000ull,
3652        0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
3653        0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
3654        0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
3655    };
3656    intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
3657    intptr_t x = simd_data(desc);
3658    float64 *d = vd, *n = vn, *m = vm;
3659    for (i = 0; i < opr_sz; i++) {
3660        float64 mm = m[i];
3661        intptr_t xx = x;
3662        if (float64_is_neg(mm)) {
3663            mm = float64_abs(mm);
3664            xx += 8;
3665        }
3666        d[i] = float64_muladd(n[i], mm, coeff[xx], 0, vs);
3667    }
3668}
3669
3670/*
3671 * FP Complex Add
3672 */
3673
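/* simd_data(desc) holds the rotation selector.  neg_real is always the
 * complement sign of neg_imag, so exactly one of the two cross terms
 * is negated, giving the 90- and 270-degree rotations of FCADD.
 */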
3674void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
3675                         void *vs, uint32_t desc)
3676{
3677    intptr_t j, i = simd_oprsz(desc);
3678    uint64_t *g = vg;
3679    float16 neg_imag = float16_set_sign(0, simd_data(desc));
3680    float16 neg_real = float16_chs(neg_imag);
3681
3682    do {
3683        uint64_t pg = g[(i - 1) >> 6];
3684        do {
3685            float16 e0, e1, e2, e3;
3686
3687            /* I holds the real index; J holds the imag index.  */
3688            j = i - sizeof(float16);
3689            i -= 2 * sizeof(float16);
3690
3691            e0 = *(float16 *)(vn + H1_2(i));
3692            e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real;
3693            e2 = *(float16 *)(vn + H1_2(j));
3694            e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag;
3695
3696            if (likely((pg >> (i & 63)) & 1)) {
3697                *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs);
3698            }
3699            if (likely((pg >> (j & 63)) & 1)) {
3700                *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs);
3701            }
3702        } while (i & 63);
3703    } while (i != 0);
3704}
3705
3706void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
3707                         void *vs, uint32_t desc)
3708{
3709    intptr_t j, i = simd_oprsz(desc);
3710    uint64_t *g = vg;
3711    float32 neg_imag = float32_set_sign(0, simd_data(desc));
3712    float32 neg_real = float32_chs(neg_imag);
3713
3714    do {
3715        uint64_t pg = g[(i - 1) >> 6];
3716        do {
3717            float32 e0, e1, e2, e3;
3718
3719            /* I holds the real index; J holds the imag index.  */
3720            j = i - sizeof(float32);
3721            i -= 2 * sizeof(float32);
3722
3723            e0 = *(float32 *)(vn + H1_4(i));
3724            e1 = *(float32 *)(vm + H1_4(j)) ^ neg_real;
3725            e2 = *(float32 *)(vn + H1_4(j));
3726            e3 = *(float32 *)(vm + H1_4(i)) ^ neg_imag;
3727
3728            if (likely((pg >> (i & 63)) & 1)) {
3729                *(float32 *)(vd + H1_4(i)) = float32_add(e0, e1, vs);
3730            }
3731            if (likely((pg >> (j & 63)) & 1)) {
3732                *(float32 *)(vd + H1_4(j)) = float32_add(e2, e3, vs);
3733            }
3734        } while (i & 63);
3735    } while (i != 0);
3736}
3737
3738void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
3739                         void *vs, uint32_t desc)
3740{
3741    intptr_t j, i = simd_oprsz(desc);
3742    uint64_t *g = vg;
3743    float64 neg_imag = float64_set_sign(0, simd_data(desc));
3744    float64 neg_real = float64_chs(neg_imag);
3745
3746    do {
3747        uint64_t pg = g[(i - 1) >> 6];
3748        do {
3749            float64 e0, e1, e2, e3;
3750
3751            /* I holds the real index; J holds the imag index.  */
3752            j = i - sizeof(float64);
3753            i -= 2 * sizeof(float64);
3754
3755            e0 = *(float64 *)(vn + i);
3756            e1 = *(float64 *)(vm + j) ^ neg_real;
3757            e2 = *(float64 *)(vn + j);
3758            e3 = *(float64 *)(vm + i) ^ neg_imag;
3759
3760            if (likely((pg >> (i & 63)) & 1)) {
3761                *(float64 *)(vd + i) = float64_add(e0, e1, vs);
3762            }
3763            if (likely((pg >> (j & 63)) & 1)) {
3764                *(float64 *)(vd + j) = float64_add(e2, e3, vs);
3765            }
3766        } while (i & 63);
3767    } while (i != 0);
3768}
3769
3770/*
3771 * FP Complex Multiply
3772 */
3773
3774QEMU_BUILD_BUG_ON(SIMD_DATA_SHIFT + 22 > 32);
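/* In addition to the four 5-bit register numbers, DESC carries the
 * 2-bit rotation, hence 22 bits of data above SIMD_DATA_SHIFT.
 */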
3775
3776void HELPER(sve_fcmla_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
3777{
3778    intptr_t j, i = simd_oprsz(desc);
3779    unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3780    unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3781    unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3782    unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3783    unsigned rot = extract32(desc, SIMD_DATA_SHIFT + 20, 2);
3784    bool flip = rot & 1;
3785    float16 neg_imag, neg_real;
3786    void *vd = &env->vfp.zregs[rd];
3787    void *vn = &env->vfp.zregs[rn];
3788    void *vm = &env->vfp.zregs[rm];
3789    void *va = &env->vfp.zregs[ra];
3790    uint64_t *g = vg;
3791
3792    neg_imag = float16_set_sign(0, (rot & 2) != 0);
3793    neg_real = float16_set_sign(0, rot == 1 || rot == 2);
3794
3795    do {
3796        uint64_t pg = g[(i - 1) >> 6];
3797        do {
3798            float16 e1, e2, e3, e4, nr, ni, mr, mi, d;
3799
3800            /* I holds the real index; J holds the imag index.  */
3801            j = i - sizeof(float16);
3802            i -= 2 * sizeof(float16);
3803
3804            nr = *(float16 *)(vn + H1_2(i));
3805            ni = *(float16 *)(vn + H1_2(j));
3806            mr = *(float16 *)(vm + H1_2(i));
3807            mi = *(float16 *)(vm + H1_2(j));
3808
3809            e2 = (flip ? ni : nr);
3810            e1 = (flip ? mi : mr) ^ neg_real;
3811            e4 = e2;
3812            e3 = (flip ? mr : mi) ^ neg_imag;
3813
3814            if (likely((pg >> (i & 63)) & 1)) {
3815                d = *(float16 *)(va + H1_2(i));
3816                d = float16_muladd(e2, e1, d, 0, &env->vfp.fp_status_f16);
3817                *(float16 *)(vd + H1_2(i)) = d;
3818            }
3819            if (likely((pg >> (j & 63)) & 1)) {
3820                d = *(float16 *)(va + H1_2(j));
3821                d = float16_muladd(e4, e3, d, 0, &env->vfp.fp_status_f16);
3822                *(float16 *)(vd + H1_2(j)) = d;
3823            }
3824        } while (i & 63);
3825    } while (i != 0);
3826}
3827
3828void HELPER(sve_fcmla_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3829{
3830    intptr_t j, i = simd_oprsz(desc);
3831    unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3832    unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3833    unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3834    unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3835    unsigned rot = extract32(desc, SIMD_DATA_SHIFT + 20, 2);
3836    bool flip = rot & 1;
3837    float32 neg_imag, neg_real;
3838    void *vd = &env->vfp.zregs[rd];
3839    void *vn = &env->vfp.zregs[rn];
3840    void *vm = &env->vfp.zregs[rm];
3841    void *va = &env->vfp.zregs[ra];
3842    uint64_t *g = vg;
3843
3844    neg_imag = float32_set_sign(0, (rot & 2) != 0);
3845    neg_real = float32_set_sign(0, rot == 1 || rot == 2);
3846
3847    do {
3848        uint64_t pg = g[(i - 1) >> 6];
3849        do {
3850            float32 e1, e2, e3, e4, nr, ni, mr, mi, d;
3851
3852            /* I holds the real index; J holds the imag index.  */
3853            j = i - sizeof(float32);
3854            i -= 2 * sizeof(float32);
3855
3856            nr = *(float32 *)(vn + H1_4(i));
3857            ni = *(float32 *)(vn + H1_4(j));
3858            mr = *(float32 *)(vm + H1_4(i));
3859            mi = *(float32 *)(vm + H1_4(j));
3860
3861            e2 = (flip ? ni : nr);
3862            e1 = (flip ? mi : mr) ^ neg_real;
3863            e4 = e2;
3864            e3 = (flip ? mr : mi) ^ neg_imag;
3865
3866            if (likely((pg >> (i & 63)) & 1)) {
3867                d = *(float32 *)(va + H1_4(i));
3868                d = float32_muladd(e2, e1, d, 0, &env->vfp.fp_status);
3869                *(float32 *)(vd + H1_4(i)) = d;
3870            }
3871            if (likely((pg >> (j & 63)) & 1)) {
3872                d = *(float32 *)(va + H1_4(j));
3873                d = float32_muladd(e4, e3, d, 0, &env->vfp.fp_status);
3874                *(float32 *)(vd + H1_4(j)) = d;
3875            }
3876        } while (i & 63);
3877    } while (i != 0);
3878}
3879
3880void HELPER(sve_fcmla_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3881{
3882    intptr_t j, i = simd_oprsz(desc);
3883    unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3884    unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3885    unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3886    unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3887    unsigned rot = extract32(desc, SIMD_DATA_SHIFT + 20, 2);
3888    bool flip = rot & 1;
3889    float64 neg_imag, neg_real;
3890    void *vd = &env->vfp.zregs[rd];
3891    void *vn = &env->vfp.zregs[rn];
3892    void *vm = &env->vfp.zregs[rm];
3893    void *va = &env->vfp.zregs[ra];
3894    uint64_t *g = vg;
3895
3896    neg_imag = float64_set_sign(0, (rot & 2) != 0);
3897    neg_real = float64_set_sign(0, rot == 1 || rot == 2);
3898
3899    do {
3900        uint64_t pg = g[(i - 1) >> 6];
3901        do {
3902            float64 e1, e2, e3, e4, nr, ni, mr, mi, d;
3903
3904            /* I holds the real index; J holds the imag index.  */
3905            j = i - sizeof(float64);
3906            i -= 2 * sizeof(float64);
3907
3908            nr = *(float64 *)(vn + i);
3909            ni = *(float64 *)(vn + j);
3910            mr = *(float64 *)(vm + i);
3911            mi = *(float64 *)(vm + j);
3912
3913            e2 = (flip ? ni : nr);
3914            e1 = (flip ? mi : mr) ^ neg_real;
3915            e4 = e2;
3916            e3 = (flip ? mr : mi) ^ neg_imag;
3917
3918            if (likely((pg >> (i & 63)) & 1)) {
3919                d = *(float64 *)(va + i);
3920                d = float64_muladd(e2, e1, d, 0, &env->vfp.fp_status);
3921                *(float64 *)(vd + i) = d;
3922            }
3923            if (likely((pg >> (j & 63)) & 1)) {
3924                d = *(float64 *)(va + j);
3925                d = float64_muladd(e4, e3, d, 0, &env->vfp.fp_status);
3926                *(float64 *)(vd + j) = d;
3927            }
3928        } while (i & 63);
3929    } while (i != 0);
3930}
3931
3932/*
3933 * Load contiguous data, protected by a governing predicate.
3934 */
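/* In DO_LD1 and the multi-register variants below, 16 predicate bits
 * govern 16 bytes of vector data, so the inner loop consumes one
 * uint16_t of predicate at a time; inactive elements are written as
 * zero.
 */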
3935#define DO_LD1(NAME, FN, TYPEE, TYPEM, H)                  \
3936static void do_##NAME(CPUARMState *env, void *vd, void *vg, \
3937                      target_ulong addr, intptr_t oprsz,   \
3938                      uintptr_t ra)                        \
3939{                                                          \
3940    intptr_t i = 0;                                        \
3941    do {                                                   \
3942        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));    \
3943        do {                                               \
3944            TYPEM m = 0;                                   \
3945            if (pg & 1) {                                  \
3946                m = FN(env, addr, ra);                     \
3947            }                                              \
3948            *(TYPEE *)(vd + H(i)) = m;                     \
3949            i += sizeof(TYPEE), pg >>= sizeof(TYPEE);      \
3950            addr += sizeof(TYPEM);                         \
3951        } while (i & 15);                                  \
3952    } while (i < oprsz);                                   \
3953}                                                          \
3954void HELPER(NAME)(CPUARMState *env, void *vg,              \
3955                  target_ulong addr, uint32_t desc)        \
3956{                                                          \
3957    do_##NAME(env, &env->vfp.zregs[simd_data(desc)], vg,   \
3958              addr, simd_oprsz(desc), GETPC());            \
3959}
3960
3961#define DO_LD2(NAME, FN, TYPEE, TYPEM, H)                  \
3962void HELPER(NAME)(CPUARMState *env, void *vg,              \
3963                  target_ulong addr, uint32_t desc)        \
3964{                                                          \
3965    intptr_t i, oprsz = simd_oprsz(desc);                  \
3966    intptr_t ra = GETPC();                                 \
3967    unsigned rd = simd_data(desc);                         \
3968    void *d1 = &env->vfp.zregs[rd];                        \
3969    void *d2 = &env->vfp.zregs[(rd + 1) & 31];             \
3970    for (i = 0; i < oprsz; ) {                             \
3971        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));    \
3972        do {                                               \
3973            TYPEM m1 = 0, m2 = 0;                          \
3974            if (pg & 1) {                                  \
3975                m1 = FN(env, addr, ra);                    \
3976                m2 = FN(env, addr + sizeof(TYPEM), ra);    \
3977            }                                              \
3978            *(TYPEE *)(d1 + H(i)) = m1;                    \
3979            *(TYPEE *)(d2 + H(i)) = m2;                    \
3980            i += sizeof(TYPEE), pg >>= sizeof(TYPEE);      \
3981            addr += 2 * sizeof(TYPEM);                     \
3982        } while (i & 15);                                  \
3983    }                                                      \
3984}
3985
3986#define DO_LD3(NAME, FN, TYPEE, TYPEM, H)                  \
3987void HELPER(NAME)(CPUARMState *env, void *vg,              \
3988                  target_ulong addr, uint32_t desc)        \
3989{                                                          \
3990    intptr_t i, oprsz = simd_oprsz(desc);                  \
3991    intptr_t ra = GETPC();                                 \
3992    unsigned rd = simd_data(desc);                         \
3993    void *d1 = &env->vfp.zregs[rd];                        \
3994    void *d2 = &env->vfp.zregs[(rd + 1) & 31];             \
3995    void *d3 = &env->vfp.zregs[(rd + 2) & 31];             \
3996    for (i = 0; i < oprsz; ) {                             \
3997        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));    \
3998        do {                                               \
3999            TYPEM m1 = 0, m2 = 0, m3 = 0;                  \
4000            if (pg & 1) {                                  \
4001                m1 = FN(env, addr, ra);                    \
4002                m2 = FN(env, addr + sizeof(TYPEM), ra);    \
4003                m3 = FN(env, addr + 2 * sizeof(TYPEM), ra); \
4004            }                                              \
4005            *(TYPEE *)(d1 + H(i)) = m1;                    \
4006            *(TYPEE *)(d2 + H(i)) = m2;                    \
4007            *(TYPEE *)(d3 + H(i)) = m3;                    \
4008            i += sizeof(TYPEE), pg >>= sizeof(TYPEE);      \
4009            addr += 3 * sizeof(TYPEM);                     \
4010        } while (i & 15);                                  \
4011    }                                                      \
4012}
4013
4014#define DO_LD4(NAME, FN, TYPEE, TYPEM, H)                  \
4015void HELPER(NAME)(CPUARMState *env, void *vg,              \
4016                  target_ulong addr, uint32_t desc)        \
4017{                                                          \
4018    intptr_t i, oprsz = simd_oprsz(desc);                  \
4019    intptr_t ra = GETPC();                                 \
4020    unsigned rd = simd_data(desc);                         \
4021    void *d1 = &env->vfp.zregs[rd];                        \
4022    void *d2 = &env->vfp.zregs[(rd + 1) & 31];             \
4023    void *d3 = &env->vfp.zregs[(rd + 2) & 31];             \
4024    void *d4 = &env->vfp.zregs[(rd + 3) & 31];             \
4025    for (i = 0; i < oprsz; ) {                             \
4026        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));    \
4027        do {                                               \
4028            TYPEM m1 = 0, m2 = 0, m3 = 0, m4 = 0;          \
4029            if (pg & 1) {                                  \
4030                m1 = FN(env, addr, ra);                    \
4031                m2 = FN(env, addr + sizeof(TYPEM), ra);    \
4032                m3 = FN(env, addr + 2 * sizeof(TYPEM), ra); \
4033                m4 = FN(env, addr + 3 * sizeof(TYPEM), ra); \
4034            }                                              \
4035            *(TYPEE *)(d1 + H(i)) = m1;                    \
4036            *(TYPEE *)(d2 + H(i)) = m2;                    \
4037            *(TYPEE *)(d3 + H(i)) = m3;                    \
4038            *(TYPEE *)(d4 + H(i)) = m4;                    \
4039            i += sizeof(TYPEE), pg >>= sizeof(TYPEE);      \
4040            addr += 4 * sizeof(TYPEM);                     \
4041        } while (i & 15);                                  \
4042    }                                                      \
4043}
4044
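/* Helpers are named sve_ld<N><memsz><elemsz>_r, e.g. sve_ld1bsu_r
 * loads bytes and zero-extends each into a 32-bit ("s") element, and
 * sve_ld4hh_r loads four registers of halfword elements.
 */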
4045DO_LD1(sve_ld1bhu_r, cpu_ldub_data_ra, uint16_t, uint8_t, H1_2)
4046DO_LD1(sve_ld1bhs_r, cpu_ldsb_data_ra, uint16_t, int8_t, H1_2)
4047DO_LD1(sve_ld1bsu_r, cpu_ldub_data_ra, uint32_t, uint8_t, H1_4)
4048DO_LD1(sve_ld1bss_r, cpu_ldsb_data_ra, uint32_t, int8_t, H1_4)
4049DO_LD1(sve_ld1bdu_r, cpu_ldub_data_ra, uint64_t, uint8_t, )
4050DO_LD1(sve_ld1bds_r, cpu_ldsb_data_ra, uint64_t, int8_t, )
4051
4052DO_LD1(sve_ld1hsu_r, cpu_lduw_data_ra, uint32_t, uint16_t, H1_4)
4053DO_LD1(sve_ld1hss_r, cpu_ldsw_data_ra, uint32_t, int16_t, H1_4)
4054DO_LD1(sve_ld1hdu_r, cpu_lduw_data_ra, uint64_t, uint16_t, )
4055DO_LD1(sve_ld1hds_r, cpu_ldsw_data_ra, uint64_t, int16_t, )
4056
4057DO_LD1(sve_ld1sdu_r, cpu_ldl_data_ra, uint64_t, uint32_t, )
4058DO_LD1(sve_ld1sds_r, cpu_ldl_data_ra, uint64_t, int32_t, )
4059
4060DO_LD1(sve_ld1bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1)
4061DO_LD2(sve_ld2bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1)
4062DO_LD3(sve_ld3bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1)
4063DO_LD4(sve_ld4bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1)
4064
4065DO_LD1(sve_ld1hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2)
4066DO_LD2(sve_ld2hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2)
4067DO_LD3(sve_ld3hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2)
4068DO_LD4(sve_ld4hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2)
4069
4070DO_LD1(sve_ld1ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4)
4071DO_LD2(sve_ld2ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4)
4072DO_LD3(sve_ld3ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4)
4073DO_LD4(sve_ld4ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4)
4074
4075DO_LD1(sve_ld1dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, )
4076DO_LD2(sve_ld2dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, )
4077DO_LD3(sve_ld3dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, )
4078DO_LD4(sve_ld4dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, )
4079
4080#undef DO_LD1
4081#undef DO_LD2
4082#undef DO_LD3
4083#undef DO_LD4
4084
4085/*
4086 * Load contiguous data, first-fault and no-fault.
4087 */
4088
4089#ifdef CONFIG_USER_ONLY
4090
4091/* Fault on byte I.  All bits in FFR from I are cleared.  The vector
4092 * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
4093 * option, which leaves subsequent data unchanged.
4094 */
4095static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
4096{
4097    uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
4098
4099    if (i & 63) {
4100        ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
4101        i = ROUND_UP(i, 64);
4102    }
4103    for (; i < oprsz; i += 64) {
4104        ffr[i / 64] = 0;
4105    }
4106}
4107
4108/* Hold the mmap lock during the operation so that there is no race
4109 * between page_check_range and the load operation.  We expect the
4110 * usual case to have no faults at all, so we check the whole range
4111 * first and if successful defer to the normal load operation.
4112 *
4113 * TODO: Change mmap_lock to a rwlock so that multiple readers
4114 * can run simultaneously.  This will probably help other uses
4115 * within QEMU as well.
4116 */
4117#define DO_LDFF1(PART, FN, TYPEE, TYPEM, H)                             \
4118static void do_sve_ldff1##PART(CPUARMState *env, void *vd, void *vg,    \
4119                               target_ulong addr, intptr_t oprsz,       \
4120                               bool first, uintptr_t ra)                \
4121{                                                                       \
4122    intptr_t i = 0;                                                     \
4123    do {                                                                \
4124        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
4125        do {                                                            \
4126            TYPEM m = 0;                                                \
4127            if (pg & 1) {                                               \
4128                if (!first &&                                           \
4129                    unlikely(page_check_range(addr, sizeof(TYPEM),      \
4130                                              PAGE_READ))) {            \
4131                    record_fault(env, i, oprsz);                        \
4132                    return;                                             \
4133                }                                                       \
4134                m = FN(env, addr, ra);                                  \
4135                first = false;                                          \
4136            }                                                           \
4137            *(TYPEE *)(vd + H(i)) = m;                                  \
4138            i += sizeof(TYPEE), pg >>= sizeof(TYPEE);                   \
4139            addr += sizeof(TYPEM);                                      \
4140        } while (i & 15);                                               \
4141    } while (i < oprsz);                                                \
4142}                                                                       \
4143void HELPER(sve_ldff1##PART)(CPUARMState *env, void *vg,                \
4144                             target_ulong addr, uint32_t desc)          \
4145{                                                                       \
4146    intptr_t oprsz = simd_oprsz(desc);                                  \
4147    unsigned rd = simd_data(desc);                                      \
4148    void *vd = &env->vfp.zregs[rd];                                     \
4149    mmap_lock();                                                        \
4150    if (likely(page_check_range(addr, oprsz, PAGE_READ) == 0)) {        \
4151        do_sve_ld1##PART(env, vd, vg, addr, oprsz, GETPC());            \
4152    } else {                                                            \
4153        do_sve_ldff1##PART(env, vd, vg, addr, oprsz, true, GETPC());    \
4154    }                                                                   \
4155    mmap_unlock();                                                      \
4156}
4157
4158/* No-fault loads are like first-fault loads without the
4159 * first faulting special case.
4160 */
4161#define DO_LDNF1(PART)                                                  \
4162void HELPER(sve_ldnf1##PART)(CPUARMState *env, void *vg,                \
4163                             target_ulong addr, uint32_t desc)          \
4164{                                                                       \
4165    intptr_t oprsz = simd_oprsz(desc);                                  \
4166    unsigned rd = simd_data(desc);                                      \
4167    void *vd = &env->vfp.zregs[rd];                                     \
4168    mmap_lock();                                                        \
4169    if (likely(page_check_range(addr, oprsz, PAGE_READ) == 0)) {        \
4170        do_sve_ld1##PART(env, vd, vg, addr, oprsz, GETPC());            \
4171    } else {                                                            \
4172        do_sve_ldff1##PART(env, vd, vg, addr, oprsz, false, GETPC());   \
4173    }                                                                   \
4174    mmap_unlock();                                                      \
4175}
4176
4177#else
4178
4179/* TODO: System mode is not yet supported.
4180 * This would probably use tlb_vaddr_to_host.
4181 */
4182#define DO_LDFF1(PART, FN, TYPEE, TYPEM, H)                     \
4183void HELPER(sve_ldff1##PART)(CPUARMState *env, void *vg,        \
4184                  target_ulong addr, uint32_t desc)             \
4185{                                                               \
4186    g_assert_not_reached();                                     \
4187}
4188
4189#define DO_LDNF1(PART)                                          \
4190void HELPER(sve_ldnf1##PART)(CPUARMState *env, void *vg,        \
4191                  target_ulong addr, uint32_t desc)             \
4192{                                                               \
4193    g_assert_not_reached();                                     \
4194}
4195
4196#endif
4197
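/* Helper naming: the letters before _r give the memory element size
 * and the register element size (b/h/s/d), with u/s selecting zero-
 * or sign-extension when the two differ.  Thus sve_ldff1bhu_r loads
 * bytes and zero-extends them into halfword elements; the _r suffix
 * marks the contiguous (register-addressed) forms.
 */
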
4198DO_LDFF1(bb_r,  cpu_ldub_data_ra, uint8_t, uint8_t, H1)
4199DO_LDFF1(bhu_r, cpu_ldub_data_ra, uint16_t, uint8_t, H1_2)
4200DO_LDFF1(bhs_r, cpu_ldsb_data_ra, uint16_t, int8_t, H1_2)
4201DO_LDFF1(bsu_r, cpu_ldub_data_ra, uint32_t, uint8_t, H1_4)
4202DO_LDFF1(bss_r, cpu_ldsb_data_ra, uint32_t, int8_t, H1_4)
4203DO_LDFF1(bdu_r, cpu_ldub_data_ra, uint64_t, uint8_t, )
4204DO_LDFF1(bds_r, cpu_ldsb_data_ra, uint64_t, int8_t, )
4205
4206DO_LDFF1(hh_r,  cpu_lduw_data_ra, uint16_t, uint16_t, H1_2)
4207DO_LDFF1(hsu_r, cpu_lduw_data_ra, uint32_t, uint16_t, H1_4)
4208DO_LDFF1(hss_r, cpu_ldsw_data_ra, uint32_t, int16_t, H1_4)
4209DO_LDFF1(hdu_r, cpu_lduw_data_ra, uint64_t, uint16_t, )
4210DO_LDFF1(hds_r, cpu_ldsw_data_ra, uint64_t, int16_t, )
4211
4212DO_LDFF1(ss_r,  cpu_ldl_data_ra, uint32_t, uint32_t, H1_4)
4213DO_LDFF1(sdu_r, cpu_ldl_data_ra, uint64_t, uint32_t, )
4214DO_LDFF1(sds_r, cpu_ldl_data_ra, uint64_t, int32_t, )
4215
4216DO_LDFF1(dd_r,  cpu_ldq_data_ra, uint64_t, uint64_t, )
4217
4218#undef DO_LDFF1
4219
4220DO_LDNF1(bb_r)
4221DO_LDNF1(bhu_r)
4222DO_LDNF1(bhs_r)
4223DO_LDNF1(bsu_r)
4224DO_LDNF1(bss_r)
4225DO_LDNF1(bdu_r)
4226DO_LDNF1(bds_r)
4227
4228DO_LDNF1(hh_r)
4229DO_LDNF1(hsu_r)
4230DO_LDNF1(hss_r)
4231DO_LDNF1(hdu_r)
4232DO_LDNF1(hds_r)
4233
4234DO_LDNF1(ss_r)
4235DO_LDNF1(sdu_r)
4236DO_LDNF1(sds_r)
4237
4238DO_LDNF1(dd_r)
4239
4240#undef DO_LDNF1
4241
4242/*
4243 * Store contiguous data, protected by a governing predicate.
4244 */
4245#define DO_ST1(NAME, FN, TYPEE, TYPEM, H)                  \
4246void HELPER(NAME)(CPUARMState *env, void *vg,              \
4247                  target_ulong addr, uint32_t desc)        \
4248{                                                          \
4249    intptr_t i, oprsz = simd_oprsz(desc);                  \
4250    intptr_t ra = GETPC();                                 \
4251    unsigned rd = simd_data(desc);                         \
4252    void *vd = &env->vfp.zregs[rd];                        \
4253    for (i = 0; i < oprsz; ) {                             \
4254        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));    \
4255        do {                                               \
4256            if (pg & 1) {                                  \
4257                TYPEM m = *(TYPEE *)(vd + H(i));           \
4258                FN(env, addr, m, ra);                      \
4259            }                                              \
4260            i += sizeof(TYPEE), pg >>= sizeof(TYPEE);      \
4261            addr += sizeof(TYPEM);                         \
4262        } while (i & 15);                                  \
4263    }                                                      \
4264}
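
/* Worked example (a sketch of the expansion, not additional code):
 * for sve_st1ss_r, TYPEE == TYPEM == uint32_t, so the inner loop
 * reduces to
 *
 *     if (pg & 1) {
 *         cpu_stl_data_ra(env, addr, *(uint32_t *)(vd + H1_4(i)), ra);
 *     }
 *     i += 4, pg >>= 4;
 *     addr += 4;
 *
 * i.e. a 4-byte element owns 4 predicate bits, of which only the
 * least significant is tested, so bits 0, 4, 8 and 12 of each 16-bit
 * predicate chunk govern the stores.  Note that addr advances for
 * inactive elements as well.
 */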
4265
4266#define DO_ST1_D(NAME, FN, TYPEM)                          \
4267void HELPER(NAME)(CPUARMState *env, void *vg,              \
4268                  target_ulong addr, uint32_t desc)        \
4269{                                                          \
4270    intptr_t i, oprsz = simd_oprsz(desc) / 8;              \
4271    intptr_t ra = GETPC();                                 \
4272    unsigned rd = simd_data(desc);                         \
4273    uint64_t *d = &env->vfp.zregs[rd].d[0];                \
4274    uint8_t *pg = vg;                                      \
4275    for (i = 0; i < oprsz; i += 1) {                       \
4276        if (pg[H1(i)] & 1) {                               \
4277            FN(env, addr, d[i], ra);                       \
4278        }                                                  \
4279        addr += sizeof(TYPEM);                             \
4280    }                                                      \
4281}
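
/* With 8-byte elements each element owns a whole predicate byte, so
 * DO_ST1_D indexes the governing predicate directly as pg[H1(i)]
 * instead of shifting through 16-bit chunks as DO_ST1 does.
 */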
4282
4283#define DO_ST2(NAME, FN, TYPEE, TYPEM, H)                  \
4284void HELPER(NAME)(CPUARMState *env, void *vg,              \
4285                  target_ulong addr, uint32_t desc)        \
4286{                                                          \
4287    intptr_t i, oprsz = simd_oprsz(desc);                  \
4288    intptr_t ra = GETPC();                                 \
4289    unsigned rd = simd_data(desc);                         \
4290    void *d1 = &env->vfp.zregs[rd];                        \
4291    void *d2 = &env->vfp.zregs[(rd + 1) & 31];             \
4292    for (i = 0; i < oprsz; ) {                             \
4293        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));    \
4294        do {                                               \
4295            if (pg & 1) {                                  \
4296                TYPEM m1 = *(TYPEE *)(d1 + H(i));          \
4297                TYPEM m2 = *(TYPEE *)(d2 + H(i));          \
4298                FN(env, addr, m1, ra);                     \
4299                FN(env, addr + sizeof(TYPEM), m2, ra);     \
4300            }                                              \
4301            i += sizeof(TYPEE), pg >>= sizeof(TYPEE);      \
4302            addr += 2 * sizeof(TYPEM);                     \
4303        } while (i & 15);                                  \
4304    }                                                      \
4305}
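
/* The resulting layout interleaves the two registers in memory
 * (shown for 32-bit elements); inactive elements leave holes, since
 * addr advances whether or not the predicate bit is set:
 *
 *     addr + 0: zd1[0]    addr + 4:  zd2[0]
 *     addr + 8: zd1[1]    addr + 12: zd2[1]   ...
 *
 * The (rd + 1) & 31 wrap implements the architectural z31 -> z0
 * wrap-around for the second register of the pair.
 */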
4306
4307#define DO_ST3(NAME, FN, TYPEE, TYPEM, H)                  \
4308void HELPER(NAME)(CPUARMState *env, void *vg,              \
4309                  target_ulong addr, uint32_t desc)        \
4310{                                                          \
4311    intptr_t i, oprsz = simd_oprsz(desc);                  \
4312    intptr_t ra = GETPC();                                 \
4313    unsigned rd = simd_data(desc);                         \
4314    void *d1 = &env->vfp.zregs[rd];                        \
4315    void *d2 = &env->vfp.zregs[(rd + 1) & 31];             \
4316    void *d3 = &env->vfp.zregs[(rd + 2) & 31];             \
4317    for (i = 0; i < oprsz; ) {                             \
4318        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));    \
4319        do {                                               \
4320            if (pg & 1) {                                  \
4321                TYPEM m1 = *(TYPEE *)(d1 + H(i));          \
4322                TYPEM m2 = *(TYPEE *)(d2 + H(i));          \
4323                TYPEM m3 = *(TYPEE *)(d3 + H(i));          \
4324                FN(env, addr, m1, ra);                     \
4325                FN(env, addr + sizeof(TYPEM), m2, ra);     \
4326                FN(env, addr + 2 * sizeof(TYPEM), m3, ra); \
4327            }                                              \
4328            i += sizeof(TYPEE), pg >>= sizeof(TYPEE);      \
4329            addr += 3 * sizeof(TYPEM);                     \
4330        } while (i & 15);                                  \
4331    }                                                      \
4332}
4333
4334#define DO_ST4(NAME, FN, TYPEE, TYPEM, H)                  \
4335void HELPER(NAME)(CPUARMState *env, void *vg,              \
4336                  target_ulong addr, uint32_t desc)        \
4337{                                                          \
4338    intptr_t i, oprsz = simd_oprsz(desc);                  \
4339    intptr_t ra = GETPC();                                 \
4340    unsigned rd = simd_data(desc);                         \
4341    void *d1 = &env->vfp.zregs[rd];                        \
4342    void *d2 = &env->vfp.zregs[(rd + 1) & 31];             \
4343    void *d3 = &env->vfp.zregs[(rd + 2) & 31];             \
4344    void *d4 = &env->vfp.zregs[(rd + 3) & 31];             \
4345    for (i = 0; i < oprsz; ) {                             \
4346        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));    \
4347        do {                                               \
4348            if (pg & 1) {                                  \
4349                TYPEM m1 = *(TYPEE *)(d1 + H(i));          \
4350                TYPEM m2 = *(TYPEE *)(d2 + H(i));          \
4351                TYPEM m3 = *(TYPEE *)(d3 + H(i));          \
4352                TYPEM m4 = *(TYPEE *)(d4 + H(i));          \
4353                FN(env, addr, m1, ra);                     \
4354                FN(env, addr + sizeof(TYPEM), m2, ra);     \
4355                FN(env, addr + 2 * sizeof(TYPEM), m3, ra); \
4356                FN(env, addr + 3 * sizeof(TYPEM), m4, ra); \
4357            }                                              \
4358            i += sizeof(TYPEE), pg >>= sizeof(TYPEE);      \
4359            addr += 4 * sizeof(TYPEM);                     \
4360        } while (i & 15);                                  \
4361    }                                                      \
4362}
4363
4364DO_ST1(sve_st1bh_r, cpu_stb_data_ra, uint16_t, uint8_t, H1_2)
4365DO_ST1(sve_st1bs_r, cpu_stb_data_ra, uint32_t, uint8_t, H1_4)
4366DO_ST1_D(sve_st1bd_r, cpu_stb_data_ra, uint8_t)
4367
4368DO_ST1(sve_st1hs_r, cpu_stw_data_ra, uint32_t, uint16_t, H1_4)
4369DO_ST1_D(sve_st1hd_r, cpu_stw_data_ra, uint16_t)
4370
4371DO_ST1_D(sve_st1sd_r, cpu_stl_data_ra, uint32_t)
4372
4373DO_ST1(sve_st1bb_r, cpu_stb_data_ra, uint8_t, uint8_t, H1)
4374DO_ST2(sve_st2bb_r, cpu_stb_data_ra, uint8_t, uint8_t, H1)
4375DO_ST3(sve_st3bb_r, cpu_stb_data_ra, uint8_t, uint8_t, H1)
4376DO_ST4(sve_st4bb_r, cpu_stb_data_ra, uint8_t, uint8_t, H1)
4377
4378DO_ST1(sve_st1hh_r, cpu_stw_data_ra, uint16_t, uint16_t, H1_2)
4379DO_ST2(sve_st2hh_r, cpu_stw_data_ra, uint16_t, uint16_t, H1_2)
4380DO_ST3(sve_st3hh_r, cpu_stw_data_ra, uint16_t, uint16_t, H1_2)
4381DO_ST4(sve_st4hh_r, cpu_stw_data_ra, uint16_t, uint16_t, H1_2)
4382
4383DO_ST1(sve_st1ss_r, cpu_stl_data_ra, uint32_t, uint32_t, H1_4)
4384DO_ST2(sve_st2ss_r, cpu_stl_data_ra, uint32_t, uint32_t, H1_4)
4385DO_ST3(sve_st3ss_r, cpu_stl_data_ra, uint32_t, uint32_t, H1_4)
4386DO_ST4(sve_st4ss_r, cpu_stl_data_ra, uint32_t, uint32_t, H1_4)
4387
4388DO_ST1_D(sve_st1dd_r, cpu_stq_data_ra, uint64_t)
4389
4390void HELPER(sve_st2dd_r)(CPUARMState *env, void *vg,
4391                         target_ulong addr, uint32_t desc)
4392{
4393    intptr_t i, oprsz = simd_oprsz(desc) / 8;
4394    intptr_t ra = GETPC();
4395    unsigned rd = simd_data(desc);
4396    uint64_t *d1 = &env->vfp.zregs[rd].d[0];
4397    uint64_t *d2 = &env->vfp.zregs[(rd + 1) & 31].d[0];
4398    uint8_t *pg = vg;
4399
4400    for (i = 0; i < oprsz; i += 1) {
4401        if (pg[H1(i)] & 1) {
4402            cpu_stq_data_ra(env, addr, d1[i], ra);
4403            cpu_stq_data_ra(env, addr + 8, d2[i], ra);
4404        }
4405        addr += 2 * 8;
4406    }
4407}
4408
4409void HELPER(sve_st3dd_r)(CPUARMState *env, void *vg,
4410                         target_ulong addr, uint32_t desc)
4411{
4412    intptr_t i, oprsz = simd_oprsz(desc) / 8;
4413    intptr_t ra = GETPC();
4414    unsigned rd = simd_data(desc);
4415    uint64_t *d1 = &env->vfp.zregs[rd].d[0];
4416    uint64_t *d2 = &env->vfp.zregs[(rd + 1) & 31].d[0];
4417    uint64_t *d3 = &env->vfp.zregs[(rd + 2) & 31].d[0];
4418    uint8_t *pg = vg;
4419
4420    for (i = 0; i < oprsz; i += 1) {
4421        if (pg[H1(i)] & 1) {
4422            cpu_stq_data_ra(env, addr, d1[i], ra);
4423            cpu_stq_data_ra(env, addr + 8, d2[i], ra);
4424            cpu_stq_data_ra(env, addr + 16, d3[i], ra);
4425        }
4426        addr += 3 * 8;
4427    }
4428}
4429
4430void HELPER(sve_st4dd_r)(CPUARMState *env, void *vg,
4431                         target_ulong addr, uint32_t desc)
4432{
4433    intptr_t i, oprsz = simd_oprsz(desc) / 8;
4434    intptr_t ra = GETPC();
4435    unsigned rd = simd_data(desc);
4436    uint64_t *d1 = &env->vfp.zregs[rd].d[0];
4437    uint64_t *d2 = &env->vfp.zregs[(rd + 1) & 31].d[0];
4438    uint64_t *d3 = &env->vfp.zregs[(rd + 2) & 31].d[0];
4439    uint64_t *d4 = &env->vfp.zregs[(rd + 3) & 31].d[0];
4440    uint8_t *pg = vg;
4441
4442    for (i = 0; i < oprsz; i += 1) {
4443        if (pg[H1(i)] & 1) {
4444            cpu_stq_data_ra(env, addr, d1[i], ra);
4445            cpu_stq_data_ra(env, addr + 8, d2[i], ra);
4446            cpu_stq_data_ra(env, addr + 16, d3[i], ra);
4447            cpu_stq_data_ra(env, addr + 24, d4[i], ra);
4448        }
4449        addr += 4 * 8;
4450    }
4451}
4452
4453/* Loads with a vector index.  */
4454
4455#define DO_LD1_ZPZ_S(NAME, TYPEI, TYPEM, FN)                            \
4456void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm,       \
4457                  target_ulong base, uint32_t desc)                     \
4458{                                                                       \
4459    intptr_t i, oprsz = simd_oprsz(desc);                               \
4460    unsigned scale = simd_data(desc);                                   \
4461    uintptr_t ra = GETPC();                                             \
4462    for (i = 0; i < oprsz; ) {                                          \
4463        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
4464        do {                                                            \
4465            TYPEM m = 0;                                                \
4466            if (pg & 1) {                                               \
4467                target_ulong off = *(TYPEI *)(vm + H1_4(i));            \
4468                m = FN(env, base + (off << scale), ra);                 \
4469            }                                                           \
4470            *(uint32_t *)(vd + H1_4(i)) = m;                            \
4471            i += 4, pg >>= 4;                                           \
4472        } while (i & 15);                                               \
4473    }                                                                   \
4474}
4475
4476#define DO_LD1_ZPZ_D(NAME, TYPEI, TYPEM, FN)                            \
4477void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm,       \
4478                  target_ulong base, uint32_t desc)                     \
4479{                                                                       \
4480    intptr_t i, oprsz = simd_oprsz(desc) / 8;                           \
4481    unsigned scale = simd_data(desc);                                   \
4482    uintptr_t ra = GETPC();                                             \
4483    uint64_t *d = vd, *m = vm; uint8_t *pg = vg;                        \
4484    for (i = 0; i < oprsz; i++) {                                       \
4485        TYPEM mm = 0;                                                   \
4486        if (pg[H1(i)] & 1) {                                            \
4487            target_ulong off = (TYPEI)m[i];                             \
4488            mm = FN(env, base + (off << scale), ra);                    \
4489        }                                                               \
4490        d[i] = mm;                                                      \
4491    }                                                                   \
4492}
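
/* Gather addressing example (illustrative): for sve_ldssu_zsu each
 * active 32-bit element is fetched from
 *
 *     off  = zm.s[i];                          // unsigned 32-bit index
 *     addr = base + ((target_ulong)off << scale);
 *
 * where scale comes from the translator via simd_data() (0 for
 * unscaled forms, 2 for word-scaled ones).  With base = 0x1000,
 * zm.s = { 0, 1, 2, ... } and scale = 2, elements load from 0x1000,
 * 0x1004, 0x1008, ...  Inactive elements write 0 to the destination
 * rather than preserving it.
 */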
4493
4494DO_LD1_ZPZ_S(sve_ldbsu_zsu, uint32_t, uint8_t,  cpu_ldub_data_ra)
4495DO_LD1_ZPZ_S(sve_ldhsu_zsu, uint32_t, uint16_t, cpu_lduw_data_ra)
4496DO_LD1_ZPZ_S(sve_ldssu_zsu, uint32_t, uint32_t, cpu_ldl_data_ra)
4497DO_LD1_ZPZ_S(sve_ldbss_zsu, uint32_t, int8_t,   cpu_ldub_data_ra)
4498DO_LD1_ZPZ_S(sve_ldhss_zsu, uint32_t, int16_t,  cpu_lduw_data_ra)
4499
4500DO_LD1_ZPZ_S(sve_ldbsu_zss, int32_t, uint8_t,  cpu_ldub_data_ra)
4501DO_LD1_ZPZ_S(sve_ldhsu_zss, int32_t, uint16_t, cpu_lduw_data_ra)
4502DO_LD1_ZPZ_S(sve_ldssu_zss, int32_t, uint32_t, cpu_ldl_data_ra)
4503DO_LD1_ZPZ_S(sve_ldbss_zss, int32_t, int8_t,   cpu_ldub_data_ra)
4504DO_LD1_ZPZ_S(sve_ldhss_zss, int32_t, int16_t,  cpu_lduw_data_ra)
4505
4506DO_LD1_ZPZ_D(sve_ldbdu_zsu, uint32_t, uint8_t,  cpu_ldub_data_ra)
4507DO_LD1_ZPZ_D(sve_ldhdu_zsu, uint32_t, uint16_t, cpu_lduw_data_ra)
4508DO_LD1_ZPZ_D(sve_ldsdu_zsu, uint32_t, uint32_t, cpu_ldl_data_ra)
4509DO_LD1_ZPZ_D(sve_ldddu_zsu, uint32_t, uint64_t, cpu_ldq_data_ra)
4510DO_LD1_ZPZ_D(sve_ldbds_zsu, uint32_t, int8_t,   cpu_ldub_data_ra)
4511DO_LD1_ZPZ_D(sve_ldhds_zsu, uint32_t, int16_t,  cpu_lduw_data_ra)
4512DO_LD1_ZPZ_D(sve_ldsds_zsu, uint32_t, int32_t,  cpu_ldl_data_ra)
4513
4514DO_LD1_ZPZ_D(sve_ldbdu_zss, int32_t, uint8_t,  cpu_ldub_data_ra)
4515DO_LD1_ZPZ_D(sve_ldhdu_zss, int32_t, uint16_t, cpu_lduw_data_ra)
4516DO_LD1_ZPZ_D(sve_ldsdu_zss, int32_t, uint32_t, cpu_ldl_data_ra)
4517DO_LD1_ZPZ_D(sve_ldddu_zss, int32_t, uint64_t, cpu_ldq_data_ra)
4518DO_LD1_ZPZ_D(sve_ldbds_zss, int32_t, int8_t,   cpu_ldub_data_ra)
4519DO_LD1_ZPZ_D(sve_ldhds_zss, int32_t, int16_t,  cpu_lduw_data_ra)
4520DO_LD1_ZPZ_D(sve_ldsds_zss, int32_t, int32_t,  cpu_ldl_data_ra)
4521
4522DO_LD1_ZPZ_D(sve_ldbdu_zd, uint64_t, uint8_t,  cpu_ldub_data_ra)
4523DO_LD1_ZPZ_D(sve_ldhdu_zd, uint64_t, uint16_t, cpu_lduw_data_ra)
4524DO_LD1_ZPZ_D(sve_ldsdu_zd, uint64_t, uint32_t, cpu_ldl_data_ra)
4525DO_LD1_ZPZ_D(sve_ldddu_zd, uint64_t, uint64_t, cpu_ldq_data_ra)
4526DO_LD1_ZPZ_D(sve_ldbds_zd, uint64_t, int8_t,   cpu_ldub_data_ra)
4527DO_LD1_ZPZ_D(sve_ldhds_zd, uint64_t, int16_t,  cpu_lduw_data_ra)
4528DO_LD1_ZPZ_D(sve_ldsds_zd, uint64_t, int32_t,  cpu_ldl_data_ra)
4529
4530/* First-fault loads with a vector index.  */
4531
4532#ifdef CONFIG_USER_ONLY
4533
4534#define DO_LDFF1_ZPZ(NAME, TYPEE, TYPEI, TYPEM, FN, H)                  \
4535void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm,       \
4536                  target_ulong base, uint32_t desc)                     \
4537{                                                                       \
4538    intptr_t i, oprsz = simd_oprsz(desc);                               \
4539    unsigned scale = simd_data(desc);                                   \
4540    uintptr_t ra = GETPC();                                             \
4541    bool first = true;                                                  \
4542    mmap_lock();                                                        \
4543    for (i = 0; i < oprsz; ) {                                          \
4544        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
4545        do {                                                            \
4546            TYPEM m = 0;                                                \
4547            if (pg & 1) {                                               \
4548                target_ulong off = *(TYPEI *)(vm + H(i));               \
4549                target_ulong addr = base + (off << scale);              \
4550                if (!first &&                                           \
4551                    page_check_range(addr, sizeof(TYPEM), PAGE_READ)) { \
4552                    record_fault(env, i, oprsz);                        \
4553                    goto exit;                                          \
4554                }                                                       \
4555                m = FN(env, addr, ra);                                  \
4556                first = false;                                          \
4557            }                                                           \
4558            *(TYPEE *)(vd + H(i)) = m;                                  \
4559            i += sizeof(TYPEE), pg >>= sizeof(TYPEE);                   \
4560        } while (i & 15);                                               \
4561    }                                                                   \
4562 exit:                                                                  \
4563    mmap_unlock();                                                      \
4564}
4565
4566#else
4567
4568#define DO_LDFF1_ZPZ(NAME, TYPEE, TYPEI, TYPEM, FN, H)                  \
4569void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm,       \
4570                  target_ulong base, uint32_t desc)                     \
4571{                                                                       \
4572    g_assert_not_reached();                                             \
4573}
4574
4575#endif
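
/* The first-fault contract, concretely: the first active element is
 * loaded unconditionally and may fault for real.  Each later active
 * element is probed with page_check_range() first; on failure,
 * record_fault() clears the FFR from that element onward and the
 * helper returns, leaving the remaining destination elements
 * unwritten.  No exception is raised for non-first elements.
 */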
4576
4577#define DO_LDFF1_ZPZ_S(NAME, TYPEI, TYPEM, FN) \
4578    DO_LDFF1_ZPZ(NAME, uint32_t, TYPEI, TYPEM, FN, H1_4)
4579#define DO_LDFF1_ZPZ_D(NAME, TYPEI, TYPEM, FN) \
4580    DO_LDFF1_ZPZ(NAME, uint64_t, TYPEI, TYPEM, FN, )
4581
4582DO_LDFF1_ZPZ_S(sve_ldffbsu_zsu, uint32_t, uint8_t,  cpu_ldub_data_ra)
4583DO_LDFF1_ZPZ_S(sve_ldffhsu_zsu, uint32_t, uint16_t, cpu_lduw_data_ra)
4584DO_LDFF1_ZPZ_S(sve_ldffssu_zsu, uint32_t, uint32_t, cpu_ldl_data_ra)
4585DO_LDFF1_ZPZ_S(sve_ldffbss_zsu, uint32_t, int8_t,   cpu_ldub_data_ra)
4586DO_LDFF1_ZPZ_S(sve_ldffhss_zsu, uint32_t, int16_t,  cpu_lduw_data_ra)
4587
4588DO_LDFF1_ZPZ_S(sve_ldffbsu_zss, int32_t, uint8_t,  cpu_ldub_data_ra)
4589DO_LDFF1_ZPZ_S(sve_ldffhsu_zss, int32_t, uint16_t, cpu_lduw_data_ra)
4590DO_LDFF1_ZPZ_S(sve_ldffssu_zss, int32_t, uint32_t, cpu_ldl_data_ra)
4591DO_LDFF1_ZPZ_S(sve_ldffbss_zss, int32_t, int8_t,   cpu_ldub_data_ra)
4592DO_LDFF1_ZPZ_S(sve_ldffhss_zss, int32_t, int16_t,  cpu_lduw_data_ra)
4593
4594DO_LDFF1_ZPZ_D(sve_ldffbdu_zsu, uint32_t, uint8_t,  cpu_ldub_data_ra)
4595DO_LDFF1_ZPZ_D(sve_ldffhdu_zsu, uint32_t, uint16_t, cpu_lduw_data_ra)
4596DO_LDFF1_ZPZ_D(sve_ldffsdu_zsu, uint32_t, uint32_t, cpu_ldl_data_ra)
4597DO_LDFF1_ZPZ_D(sve_ldffddu_zsu, uint32_t, uint64_t, cpu_ldq_data_ra)
4598DO_LDFF1_ZPZ_D(sve_ldffbds_zsu, uint32_t, int8_t,   cpu_ldub_data_ra)
4599DO_LDFF1_ZPZ_D(sve_ldffhds_zsu, uint32_t, int16_t,  cpu_lduw_data_ra)
4600DO_LDFF1_ZPZ_D(sve_ldffsds_zsu, uint32_t, int32_t,  cpu_ldl_data_ra)
4601
4602DO_LDFF1_ZPZ_D(sve_ldffbdu_zss, int32_t, uint8_t,  cpu_ldub_data_ra)
4603DO_LDFF1_ZPZ_D(sve_ldffhdu_zss, int32_t, uint16_t, cpu_lduw_data_ra)
4604DO_LDFF1_ZPZ_D(sve_ldffsdu_zss, int32_t, uint32_t, cpu_ldl_data_ra)
4605DO_LDFF1_ZPZ_D(sve_ldffddu_zss, int32_t, uint64_t, cpu_ldq_data_ra)
4606DO_LDFF1_ZPZ_D(sve_ldffbds_zss, int32_t, int8_t,   cpu_ldub_data_ra)
4607DO_LDFF1_ZPZ_D(sve_ldffhds_zss, int32_t, int16_t,  cpu_lduw_data_ra)
4608DO_LDFF1_ZPZ_D(sve_ldffsds_zss, int32_t, int32_t,  cpu_ldl_data_ra)
4609
4610DO_LDFF1_ZPZ_D(sve_ldffbdu_zd, uint64_t, uint8_t,  cpu_ldub_data_ra)
4611DO_LDFF1_ZPZ_D(sve_ldffhdu_zd, uint64_t, uint16_t, cpu_lduw_data_ra)
4612DO_LDFF1_ZPZ_D(sve_ldffsdu_zd, uint64_t, uint32_t, cpu_ldl_data_ra)
4613DO_LDFF1_ZPZ_D(sve_ldffddu_zd, uint64_t, uint64_t, cpu_ldq_data_ra)
4614DO_LDFF1_ZPZ_D(sve_ldffbds_zd, uint64_t, int8_t,   cpu_ldub_data_ra)
4615DO_LDFF1_ZPZ_D(sve_ldffhds_zd, uint64_t, int16_t,  cpu_lduw_data_ra)
4616DO_LDFF1_ZPZ_D(sve_ldffsds_zd, uint64_t, int32_t,  cpu_ldl_data_ra)
4617
4618/* Stores with a vector index.  */
4619
4620#define DO_ST1_ZPZ_S(NAME, TYPEI, FN)                                   \
4621void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm,       \
4622                  target_ulong base, uint32_t desc)                     \
4623{                                                                       \
4624    intptr_t i, oprsz = simd_oprsz(desc);                               \
4625    unsigned scale = simd_data(desc);                                   \
4626    uintptr_t ra = GETPC();                                             \
4627    for (i = 0; i < oprsz; ) {                                          \
4628        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
4629        do {                                                            \
4630            if (likely(pg & 1)) {                                       \
4631                target_ulong off = *(TYPEI *)(vm + H1_4(i));            \
4632                uint32_t d = *(uint32_t *)(vd + H1_4(i));               \
4633                FN(env, base + (off << scale), d, ra);                  \
4634            }                                                           \
4635            i += sizeof(uint32_t), pg >>= sizeof(uint32_t);             \
4636        } while (i & 15);                                               \
4637    }                                                                   \
4638}
4639
4640#define DO_ST1_ZPZ_D(NAME, TYPEI, FN)                                   \
4641void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm,       \
4642                  target_ulong base, uint32_t desc)                     \
4643{                                                                       \
4644    intptr_t i, oprsz = simd_oprsz(desc) / 8;                           \
4645    unsigned scale = simd_data(desc);                                   \
4646    uintptr_t ra = GETPC();                                             \
4647    uint64_t *d = vd, *m = vm; uint8_t *pg = vg;                        \
4648    for (i = 0; i < oprsz; i++) {                                       \
4649        if (likely(pg[H1(i)] & 1)) {                                    \
4650            target_ulong off = (target_ulong)(TYPEI)m[i] << scale;      \
4651            FN(env, base + off, d[i], ra);                              \
4652        }                                                               \
4653    }                                                                   \
4654}
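
/* Scatter example (illustrative): sve_stdd_zd with scale == 3 is a
 * classic indexed scatter of 64-bit elements,
 *
 *     for (i = 0; i < elements; i++) {
 *         if (pg[H1(i)] & 1) {
 *             cpu_stq_data_ra(env, base + (m[i] << 3), d[i], ra);
 *         }
 *     }
 *
 * Unlike the gathers above, inactive elements have no effect at all:
 * nothing is written and no address is formed.
 */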
4655
4656DO_ST1_ZPZ_S(sve_stbs_zsu, uint32_t, cpu_stb_data_ra)
4657DO_ST1_ZPZ_S(sve_sths_zsu, uint32_t, cpu_stw_data_ra)
4658DO_ST1_ZPZ_S(sve_stss_zsu, uint32_t, cpu_stl_data_ra)
4659
4660DO_ST1_ZPZ_S(sve_stbs_zss, int32_t, cpu_stb_data_ra)
4661DO_ST1_ZPZ_S(sve_sths_zss, int32_t, cpu_stw_data_ra)
4662DO_ST1_ZPZ_S(sve_stss_zss, int32_t, cpu_stl_data_ra)
4663
4664DO_ST1_ZPZ_D(sve_stbd_zsu, uint32_t, cpu_stb_data_ra)
4665DO_ST1_ZPZ_D(sve_sthd_zsu, uint32_t, cpu_stw_data_ra)
4666DO_ST1_ZPZ_D(sve_stsd_zsu, uint32_t, cpu_stl_data_ra)
4667DO_ST1_ZPZ_D(sve_stdd_zsu, uint32_t, cpu_stq_data_ra)
4668
4669DO_ST1_ZPZ_D(sve_stbd_zss, int32_t, cpu_stb_data_ra)
4670DO_ST1_ZPZ_D(sve_sthd_zss, int32_t, cpu_stw_data_ra)
4671DO_ST1_ZPZ_D(sve_stsd_zss, int32_t, cpu_stl_data_ra)
4672DO_ST1_ZPZ_D(sve_stdd_zss, int32_t, cpu_stq_data_ra)
4673
4674DO_ST1_ZPZ_D(sve_stbd_zd, uint64_t, cpu_stb_data_ra)
4675DO_ST1_ZPZ_D(sve_sthd_zd, uint64_t, cpu_stw_data_ra)
4676DO_ST1_ZPZ_D(sve_stsd_zd, uint64_t, cpu_stl_data_ra)
4677DO_ST1_ZPZ_D(sve_stdd_zd, uint64_t, cpu_stq_data_ra)
4678