qemu/target/arm/sve_helper.c
<<
>>
Prefs
   1/*
   2 * ARM SVE Operations
   3 *
   4 * Copyright (c) 2018 Linaro, Ltd.
   5 *
   6 * This library is free software; you can redistribute it and/or
   7 * modify it under the terms of the GNU Lesser General Public
   8 * License as published by the Free Software Foundation; either
   9 * version 2 of the License, or (at your option) any later version.
  10 *
  11 * This library is distributed in the hope that it will be useful,
  12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14 * Lesser General Public License for more details.
  15 *
  16 * You should have received a copy of the GNU Lesser General Public
  17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
  18 */
  19
  20#include "qemu/osdep.h"
  21#include "cpu.h"
  22#include "internals.h"
  23#include "exec/exec-all.h"
  24#include "exec/cpu_ldst.h"
  25#include "exec/helper-proto.h"
  26#include "tcg/tcg-gvec-desc.h"
  27#include "fpu/softfloat.h"
  28
  29
  30/* Note that vector data is stored in host-endian 64-bit chunks,
  31   so addressing units smaller than that needs a host-endian fixup.  */
  32#ifdef HOST_WORDS_BIGENDIAN
  33#define H1(x)   ((x) ^ 7)
  34#define H1_2(x) ((x) ^ 6)
  35#define H1_4(x) ((x) ^ 4)
  36#define H2(x)   ((x) ^ 3)
  37#define H4(x)   ((x) ^ 1)
  38#else
  39#define H1(x)   (x)
  40#define H1_2(x) (x)
  41#define H1_4(x) (x)
  42#define H2(x)   (x)
  43#define H4(x)   (x)
  44#endif
  45
  46/* Return a value for NZCV as per the ARM PredTest pseudofunction.
  47 *
  48 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
  49 * and bit 0 set if C is set.  Compare the definitions of these variables
  50 * within CPUARMState.
  51 */
  52
  53/* For no G bits set, NZCV = C.  */
  54#define PREDTEST_INIT  1
  55
  56/* This is an iterative function, called for each Pd and Pg word
  57 * moving forward.
  58 */
  59static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
  60{
  61    if (likely(g)) {
  62        /* Compute N from first D & G.
  63           Use bit 2 to signal first G bit seen.  */
  64        if (!(flags & 4)) {
  65            flags |= ((d & (g & -g)) != 0) << 31;
  66            flags |= 4;
  67        }
  68
  69        /* Accumulate Z from each D & G.  */
  70        flags |= ((d & g) != 0) << 1;
  71
  72        /* Compute C from last !(D & G).  Replace previous.  */
  73        flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
  74    }
  75    return flags;
  76}
  77
  78/* This is an iterative function, called for each Pd and Pg word
  79 * moving backward.
  80 */
  81static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
  82{
  83    if (likely(g)) {
  84        /* Compute C from first (i.e last) !(D & G).
  85           Use bit 2 to signal first G bit seen.  */
  86        if (!(flags & 4)) {
  87            flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
  88            flags |= (d & pow2floor(g)) == 0;
  89        }
  90
  91        /* Accumulate Z from each D & G.  */
  92        flags |= ((d & g) != 0) << 1;
  93
  94        /* Compute N from last (i.e first) D & G.  Replace previous.  */
  95        flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
  96    }
  97    return flags;
  98}
  99
 100/* The same for a single word predicate.  */
 101uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
 102{
 103    return iter_predtest_fwd(d, g, PREDTEST_INIT);
 104}
 105
 106/* The same for a multi-word predicate.  */
 107uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
 108{
 109    uint32_t flags = PREDTEST_INIT;
 110    uint64_t *d = vd, *g = vg;
 111    uintptr_t i = 0;
 112
 113    do {
 114        flags = iter_predtest_fwd(d[i], g[i], flags);
 115    } while (++i < words);
 116
 117    return flags;
 118}
 119
 120/* Expand active predicate bits to bytes, for byte elements.
 121 *  for (i = 0; i < 256; ++i) {
 122 *      unsigned long m = 0;
 123 *      for (j = 0; j < 8; j++) {
 124 *          if ((i >> j) & 1) {
 125 *              m |= 0xfful << (j << 3);
 126 *          }
 127 *      }
 128 *      printf("0x%016lx,\n", m);
 129 *  }
 130 */
 131static inline uint64_t expand_pred_b(uint8_t byte)
 132{
 133    static const uint64_t word[256] = {
 134        0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
 135        0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
 136        0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
 137        0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
 138        0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
 139        0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
 140        0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
 141        0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
 142        0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
 143        0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
 144        0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
 145        0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
 146        0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
 147        0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
 148        0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
 149        0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
 150        0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
 151        0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
 152        0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
 153        0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
 154        0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
 155        0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
 156        0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
 157        0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
 158        0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
 159        0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
 160        0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
 161        0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
 162        0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
 163        0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
 164        0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
 165        0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
 166        0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
 167        0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
 168        0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
 169        0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
 170        0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
 171        0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
 172        0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
 173        0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
 174        0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
 175        0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
 176        0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
 177        0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
 178        0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
 179        0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
 180        0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
 181        0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
 182        0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
 183        0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
 184        0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
 185        0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
 186        0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
 187        0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
 188        0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
 189        0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
 190        0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
 191        0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
 192        0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
 193        0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
 194        0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
 195        0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
 196        0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
 197        0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
 198        0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
 199        0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
 200        0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
 201        0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
 202        0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
 203        0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
 204        0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
 205        0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
 206        0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
 207        0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
 208        0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
 209        0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
 210        0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
 211        0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
 212        0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
 213        0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
 214        0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
 215        0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
 216        0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
 217        0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
 218        0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
 219        0xffffffffffffffff,
 220    };
 221    return word[byte];
 222}
 223
 224/* Similarly for half-word elements.
 225 *  for (i = 0; i < 256; ++i) {
 226 *      unsigned long m = 0;
 227 *      if (i & 0xaa) {
 228 *          continue;
 229 *      }
 230 *      for (j = 0; j < 8; j += 2) {
 231 *          if ((i >> j) & 1) {
 232 *              m |= 0xfffful << (j << 3);
 233 *          }
 234 *      }
 235 *      printf("[0x%x] = 0x%016lx,\n", i, m);
 236 *  }
 237 */
 238static inline uint64_t expand_pred_h(uint8_t byte)
 239{
 240    static const uint64_t word[] = {
 241        [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
 242        [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
 243        [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
 244        [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
 245        [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
 246        [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
 247        [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
 248        [0x55] = 0xffffffffffffffff,
 249    };
 250    return word[byte & 0x55];
 251}
 252
 253/* Similarly for single word elements.  */
 254static inline uint64_t expand_pred_s(uint8_t byte)
 255{
 256    static const uint64_t word[] = {
 257        [0x01] = 0x00000000ffffffffull,
 258        [0x10] = 0xffffffff00000000ull,
 259        [0x11] = 0xffffffffffffffffull,
 260    };
 261    return word[byte & 0x11];
 262}
 263
 264/* Swap 16-bit words within a 32-bit word.  */
 265static inline uint32_t hswap32(uint32_t h)
 266{
 267    return rol32(h, 16);
 268}
 269
 270/* Swap 16-bit words within a 64-bit word.  */
 271static inline uint64_t hswap64(uint64_t h)
 272{
 273    uint64_t m = 0x0000ffff0000ffffull;
 274    h = rol64(h, 32);
 275    return ((h & m) << 16) | ((h >> 16) & m);
 276}
 277
 278/* Swap 32-bit words within a 64-bit word.  */
 279static inline uint64_t wswap64(uint64_t h)
 280{
 281    return rol64(h, 32);
 282}
 283
 284#define LOGICAL_PPPP(NAME, FUNC) \
 285void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
 286{                                                                         \
 287    uintptr_t opr_sz = simd_oprsz(desc);                                  \
 288    uint64_t *d = vd, *n = vn, *m = vm, *g = vg;                          \
 289    uintptr_t i;                                                          \
 290    for (i = 0; i < opr_sz / 8; ++i) {                                    \
 291        d[i] = FUNC(n[i], m[i], g[i]);                                    \
 292    }                                                                     \
 293}
 294
 295#define DO_AND(N, M, G)  (((N) & (M)) & (G))
 296#define DO_BIC(N, M, G)  (((N) & ~(M)) & (G))
 297#define DO_EOR(N, M, G)  (((N) ^ (M)) & (G))
 298#define DO_ORR(N, M, G)  (((N) | (M)) & (G))
 299#define DO_ORN(N, M, G)  (((N) | ~(M)) & (G))
 300#define DO_NOR(N, M, G)  (~((N) | (M)) & (G))
 301#define DO_NAND(N, M, G) (~((N) & (M)) & (G))
 302#define DO_SEL(N, M, G)  (((N) & (G)) | ((M) & ~(G)))
 303
 304LOGICAL_PPPP(sve_and_pppp, DO_AND)
 305LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
 306LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
 307LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
 308LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
 309LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
 310LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
 311LOGICAL_PPPP(sve_nand_pppp, DO_NAND)
 312
 313#undef DO_AND
 314#undef DO_BIC
 315#undef DO_EOR
 316#undef DO_ORR
 317#undef DO_ORN
 318#undef DO_NOR
 319#undef DO_NAND
 320#undef DO_SEL
 321#undef LOGICAL_PPPP
 322
 323/* Fully general three-operand expander, controlled by a predicate.
 324 * This is complicated by the host-endian storage of the register file.
 325 */
 326/* ??? I don't expect the compiler could ever vectorize this itself.
 327 * With some tables we can convert bit masks to byte masks, and with
 328 * extra care wrt byte/word ordering we could use gcc generic vectors
 329 * and do 16 bytes at a time.
 330 */
 331#define DO_ZPZZ(NAME, TYPE, H, OP)                                       \
 332void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
 333{                                                                       \
 334    intptr_t i, opr_sz = simd_oprsz(desc);                              \
 335    for (i = 0; i < opr_sz; ) {                                         \
 336        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
 337        do {                                                            \
 338            if (pg & 1) {                                               \
 339                TYPE nn = *(TYPE *)(vn + H(i));                         \
 340                TYPE mm = *(TYPE *)(vm + H(i));                         \
 341                *(TYPE *)(vd + H(i)) = OP(nn, mm);                      \
 342            }                                                           \
 343            i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
 344        } while (i & 15);                                               \
 345    }                                                                   \
 346}
 347
 348/* Similarly, specialized for 64-bit operands.  */
 349#define DO_ZPZZ_D(NAME, TYPE, OP)                                \
 350void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
 351{                                                               \
 352    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
 353    TYPE *d = vd, *n = vn, *m = vm;                             \
 354    uint8_t *pg = vg;                                           \
 355    for (i = 0; i < opr_sz; i += 1) {                           \
 356        if (pg[H1(i)] & 1) {                                    \
 357            TYPE nn = n[i], mm = m[i];                          \
 358            d[i] = OP(nn, mm);                                  \
 359        }                                                       \
 360    }                                                           \
 361}
 362
 363#define DO_AND(N, M)  (N & M)
 364#define DO_EOR(N, M)  (N ^ M)
 365#define DO_ORR(N, M)  (N | M)
 366#define DO_BIC(N, M)  (N & ~M)
 367#define DO_ADD(N, M)  (N + M)
 368#define DO_SUB(N, M)  (N - M)
 369#define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
 370#define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
 371#define DO_ABD(N, M)  ((N) >= (M) ? (N) - (M) : (M) - (N))
 372#define DO_MUL(N, M)  (N * M)
 373
 374
 375/*
 376 * We must avoid the C undefined behaviour cases: division by
 377 * zero and signed division of INT_MIN by -1. Both of these
 378 * have architecturally defined required results for Arm.
 379 * We special case all signed divisions by -1 to avoid having
 380 * to deduce the minimum integer for the type involved.
 381 */
 382#define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
 383#define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
 384
 385DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
 386DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
 387DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
 388DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)
 389
 390DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
 391DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
 392DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
 393DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)
 394
 395DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
 396DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
 397DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
 398DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)
 399
 400DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
 401DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
 402DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
 403DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)
 404
 405DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
 406DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
 407DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
 408DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)
 409
 410DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
 411DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
 412DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
 413DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)
 414
 415DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
 416DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
 417DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
 418DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)
 419
 420DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
 421DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
 422DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
 423DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)
 424
 425DO_ZPZZ(sve_smin_zpzz_b, int8_t,  H1, DO_MIN)
 426DO_ZPZZ(sve_smin_zpzz_h, int16_t,  H1_2, DO_MIN)
 427DO_ZPZZ(sve_smin_zpzz_s, int32_t,  H1_4, DO_MIN)
 428DO_ZPZZ_D(sve_smin_zpzz_d, int64_t,  DO_MIN)
 429
 430DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
 431DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
 432DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
 433DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)
 434
 435DO_ZPZZ(sve_sabd_zpzz_b, int8_t,  H1, DO_ABD)
 436DO_ZPZZ(sve_sabd_zpzz_h, int16_t,  H1_2, DO_ABD)
 437DO_ZPZZ(sve_sabd_zpzz_s, int32_t,  H1_4, DO_ABD)
 438DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t,  DO_ABD)
 439
 440DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
 441DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
 442DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
 443DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
 444
 445/* Because the computation type is at least twice as large as required,
 446   these work for both signed and unsigned source types.  */
 447static inline uint8_t do_mulh_b(int32_t n, int32_t m)
 448{
 449    return (n * m) >> 8;
 450}
 451
 452static inline uint16_t do_mulh_h(int32_t n, int32_t m)
 453{
 454    return (n * m) >> 16;
 455}
 456
 457static inline uint32_t do_mulh_s(int64_t n, int64_t m)
 458{
 459    return (n * m) >> 32;
 460}
 461
 462static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
 463{
 464    uint64_t lo, hi;
 465    muls64(&lo, &hi, n, m);
 466    return hi;
 467}
 468
 469static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
 470{
 471    uint64_t lo, hi;
 472    mulu64(&lo, &hi, n, m);
 473    return hi;
 474}
 475
 476DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
 477DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
 478DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
 479DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)
 480
 481DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
 482DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
 483DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
 484DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)
 485
 486DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
 487DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
 488DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
 489DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)
 490
 491DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
 492DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)
 493
 494DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
 495DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)
 496
 497/* Note that all bits of the shift are significant
 498   and not modulo the element size.  */
 499#define DO_ASR(N, M)  (N >> MIN(M, sizeof(N) * 8 - 1))
 500#define DO_LSR(N, M)  (M < sizeof(N) * 8 ? N >> M : 0)
 501#define DO_LSL(N, M)  (M < sizeof(N) * 8 ? N << M : 0)
 502
 503DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
 504DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1_2, DO_LSR)
 505DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1_4, DO_LSL)
 506
 507DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1, DO_ASR)
 508DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
 509DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_4, DO_LSL)
 510
 511DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1, DO_ASR)
 512DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_2, DO_LSR)
 513DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
 514
 515DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
 516DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
 517DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
 518
 519#undef DO_ZPZZ
 520#undef DO_ZPZZ_D
 521
 522/* Three-operand expander, controlled by a predicate, in which the
 523 * third operand is "wide".  That is, for D = N op M, the same 64-bit
 524 * value of M is used with all of the narrower values of N.
 525 */
 526#define DO_ZPZW(NAME, TYPE, TYPEW, H, OP)                               \
 527void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
 528{                                                                       \
 529    intptr_t i, opr_sz = simd_oprsz(desc);                              \
 530    for (i = 0; i < opr_sz; ) {                                         \
 531        uint8_t pg = *(uint8_t *)(vg + H1(i >> 3));                     \
 532        TYPEW mm = *(TYPEW *)(vm + i);                                  \
 533        do {                                                            \
 534            if (pg & 1) {                                               \
 535                TYPE nn = *(TYPE *)(vn + H(i));                         \
 536                *(TYPE *)(vd + H(i)) = OP(nn, mm);                      \
 537            }                                                           \
 538            i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
 539        } while (i & 7);                                                \
 540    }                                                                   \
 541}
 542
 543DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
 544DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
 545DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)
 546
 547DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
 548DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
 549DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
 550
 551DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
 552DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
 553DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
 554
 555#undef DO_ZPZW
 556
 557/* Fully general two-operand expander, controlled by a predicate.
 558 */
 559#define DO_ZPZ(NAME, TYPE, H, OP)                               \
 560void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
 561{                                                               \
 562    intptr_t i, opr_sz = simd_oprsz(desc);                      \
 563    for (i = 0; i < opr_sz; ) {                                 \
 564        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
 565        do {                                                    \
 566            if (pg & 1) {                                       \
 567                TYPE nn = *(TYPE *)(vn + H(i));                 \
 568                *(TYPE *)(vd + H(i)) = OP(nn);                  \
 569            }                                                   \
 570            i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
 571        } while (i & 15);                                       \
 572    }                                                           \
 573}
 574
 575/* Similarly, specialized for 64-bit operands.  */
 576#define DO_ZPZ_D(NAME, TYPE, OP)                                \
 577void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
 578{                                                               \
 579    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
 580    TYPE *d = vd, *n = vn;                                      \
 581    uint8_t *pg = vg;                                           \
 582    for (i = 0; i < opr_sz; i += 1) {                           \
 583        if (pg[H1(i)] & 1) {                                    \
 584            TYPE nn = n[i];                                     \
 585            d[i] = OP(nn);                                      \
 586        }                                                       \
 587    }                                                           \
 588}
 589
 590#define DO_CLS_B(N)   (clrsb32(N) - 24)
 591#define DO_CLS_H(N)   (clrsb32(N) - 16)
 592
 593DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
 594DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
 595DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
 596DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)
 597
 598#define DO_CLZ_B(N)   (clz32(N) - 24)
 599#define DO_CLZ_H(N)   (clz32(N) - 16)
 600
 601DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
 602DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
 603DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
 604DO_ZPZ_D(sve_clz_d, uint64_t, clz64)
 605
 606DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
 607DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
 608DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
 609DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)
 610
 611#define DO_CNOT(N)    (N == 0)
 612
 613DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
 614DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
 615DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
 616DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)
 617
 618#define DO_FABS(N)    (N & ((__typeof(N))-1 >> 1))
 619
 620DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
 621DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
 622DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)
 623
 624#define DO_FNEG(N)    (N ^ ~((__typeof(N))-1 >> 1))
 625
 626DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
 627DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
 628DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)
 629
 630#define DO_NOT(N)    (~N)
 631
 632DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
 633DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
 634DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
 635DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)
 636
 637#define DO_SXTB(N)    ((int8_t)N)
 638#define DO_SXTH(N)    ((int16_t)N)
 639#define DO_SXTS(N)    ((int32_t)N)
 640#define DO_UXTB(N)    ((uint8_t)N)
 641#define DO_UXTH(N)    ((uint16_t)N)
 642#define DO_UXTS(N)    ((uint32_t)N)
 643
 644DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
 645DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
 646DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
 647DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
 648DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
 649DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)
 650
 651DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
 652DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
 653DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
 654DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
 655DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
 656DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)
 657
 658#define DO_ABS(N)    (N < 0 ? -N : N)
 659
 660DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
 661DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
 662DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
 663DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)
 664
 665#define DO_NEG(N)    (-N)
 666
 667DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
 668DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
 669DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
 670DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)
 671
 672DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
 673DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
 674DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)
 675
 676DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
 677DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)
 678
 679DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)
 680
 681DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
 682DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
 683DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
 684DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
 685
 686/* Three-operand expander, unpredicated, in which the third operand is "wide".
 687 */
 688#define DO_ZZW(NAME, TYPE, TYPEW, H, OP)                       \
 689void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
 690{                                                              \
 691    intptr_t i, opr_sz = simd_oprsz(desc);                     \
 692    for (i = 0; i < opr_sz; ) {                                \
 693        TYPEW mm = *(TYPEW *)(vm + i);                         \
 694        do {                                                   \
 695            TYPE nn = *(TYPE *)(vn + H(i));                    \
 696            *(TYPE *)(vd + H(i)) = OP(nn, mm);                 \
 697            i += sizeof(TYPE);                                 \
 698        } while (i & 7);                                       \
 699    }                                                          \
 700}
 701
 702DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
 703DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
 704DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)
 705
 706DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
 707DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
 708DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
 709
 710DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
 711DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
 712DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
 713
 714#undef DO_ZZW
 715
 716#undef DO_CLS_B
 717#undef DO_CLS_H
 718#undef DO_CLZ_B
 719#undef DO_CLZ_H
 720#undef DO_CNOT
 721#undef DO_FABS
 722#undef DO_FNEG
 723#undef DO_ABS
 724#undef DO_NEG
 725#undef DO_ZPZ
 726#undef DO_ZPZ_D
 727
 728/* Two-operand reduction expander, controlled by a predicate.
 729 * The difference between TYPERED and TYPERET has to do with
 730 * sign-extension.  E.g. for SMAX, TYPERED must be signed,
 731 * but TYPERET must be unsigned so that e.g. a 32-bit value
 732 * is not sign-extended to the ABI uint64_t return type.
 733 */
 734/* ??? If we were to vectorize this by hand the reduction ordering
 735 * would change.  For integer operands, this is perfectly fine.
 736 */
 737#define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
 738uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc)   \
 739{                                                          \
 740    intptr_t i, opr_sz = simd_oprsz(desc);                 \
 741    TYPERED ret = INIT;                                    \
 742    for (i = 0; i < opr_sz; ) {                            \
 743        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));    \
 744        do {                                               \
 745            if (pg & 1) {                                  \
 746                TYPEELT nn = *(TYPEELT *)(vn + H(i));      \
 747                ret = OP(ret, nn);                         \
 748            }                                              \
 749            i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT);  \
 750        } while (i & 15);                                  \
 751    }                                                      \
 752    return (TYPERET)ret;                                   \
 753}
 754
 755#define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP)             \
 756uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc)   \
 757{                                                          \
 758    intptr_t i, opr_sz = simd_oprsz(desc) / 8;             \
 759    TYPEE *n = vn;                                         \
 760    uint8_t *pg = vg;                                      \
 761    TYPER ret = INIT;                                      \
 762    for (i = 0; i < opr_sz; i += 1) {                      \
 763        if (pg[H1(i)] & 1) {                               \
 764            TYPEE nn = n[i];                               \
 765            ret = OP(ret, nn);                             \
 766        }                                                  \
 767    }                                                      \
 768    return ret;                                            \
 769}
 770
/* Reductions with DO_ORR, accumulator starting at 0.  */
DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)

/* Reductions with DO_EOR, accumulator starting at 0.  */
DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)

/* Reductions with DO_AND, accumulator starting at -1 (all bits set
 * once converted to the unsigned accumulator type).
 */
DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)

/* Sums of signed elements: elements are loaded with a signed type and
 * accumulated in uint64_t, so each one is sign-extended on conversion.
 */
DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)

/* Sums of unsigned elements, accumulated in uint64_t.  */
DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)

/* Signed DO_MAX reductions, starting from the type's minimum.  The
 * result passes through the unsigned TYPERET before being returned.
 */
DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)

/* Unsigned DO_MAX reductions, starting from 0.  */
DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)

/* Signed DO_MIN reductions, starting from the type's maximum.  */
DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)

/* Unsigned DO_MIN reductions, starting from -1, i.e. the largest value
 * of the unsigned accumulator type.
 */
DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)

/* The reduction expanders are not needed beyond this point.  */
#undef DO_VPZ
#undef DO_VPZ_D
 817
 818/* Two vector operand, one scalar operand, unpredicated.  */
 819#define DO_ZZI(NAME, TYPE, OP)                                       \
 820void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc)   \
 821{                                                                    \
 822    intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE);            \
 823    TYPE s = s64, *d = vd, *n = vn;                                  \
 824    for (i = 0; i < opr_sz; ++i) {                                   \
 825        d[i] = OP(n[i], s);                                          \
 826    }                                                                \
 827}
 828
 829#define DO_SUBR(X, Y)   (Y - X)
 830
/* Reversed subtract: with DO_SUBR each result is (scalar - element).  */
DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)

/* DO_MAX against the scalar, signed element types.  */
DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)

/* DO_MIN against the scalar, signed element types.  */
DO_ZZI(sve_smini_b, int8_t, DO_MIN)
DO_ZZI(sve_smini_h, int16_t, DO_MIN)
DO_ZZI(sve_smini_s, int32_t, DO_MIN)
DO_ZZI(sve_smini_d, int64_t, DO_MIN)

/* DO_MAX against the scalar, unsigned element types.  */
DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)

/* DO_MIN against the scalar, unsigned element types.  */
DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
DO_ZZI(sve_umini_d, uint64_t, DO_MIN)

#undef DO_ZZI

/* Retire the binary operation macros; apart from DO_SUBR their
 * definitions are presumably earlier in this file (not visible from
 * this excerpt).
 */
#undef DO_AND
#undef DO_ORR
#undef DO_EOR
#undef DO_BIC
#undef DO_ADD
#undef DO_SUB
#undef DO_MAX
#undef DO_MIN
#undef DO_ABD
#undef DO_MUL
#undef DO_DIV
#undef DO_ASR
#undef DO_LSR
#undef DO_LSL
#undef DO_SUBR
 873
 874/* Similar to the ARM LastActiveElement pseudocode function, except the
 875   result is multiplied by the element size.  This includes the not found
 876   indication; e.g. not found for esz=3 is -8.  */
 877static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
 878{
 879    uint64_t mask = pred_esz_masks[esz];
 880    intptr_t i = words;
 881
 882    do {
 883        uint64_t this_g = g[--i] & mask;
 884        if (this_g) {
 885            return i * 64 + (63 - clz64(this_g));
 886        }
 887    } while (i > 0);
 888    return (intptr_t)-1 << esz;
 889}
 890
 891uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t words)
 892{
 893    uint32_t flags = PREDTEST_INIT;
 894    uint64_t *d = vd, *g = vg;
 895    intptr_t i = 0;
 896
 897    do {
 898        uint64_t this_d = d[i];
 899        uint64_t this_g = g[i];
 900
 901        if (this_g) {
 902            if (!(flags & 4)) {
 903                /* Set in D the first bit of G.  */
 904                this_d |= this_g & -this_g;
 905                d[i] = this_d;
 906            }
 907            flags = iter_predtest_fwd(this_d, this_g, flags);
 908        }
 909    } while (++i < words);
 910
 911    return flags;
 912}
 913
/* Replace the predicate in vd with one that has at most a single bit
 * set: the first eligible bit of vg that lies after the last active
 * element currently in vd.  Returns the flags accumulated by
 * iter_predtest_fwd over the new vd.
 *
 * pred_desc carries the number of 64-bit predicate words in its low
 * SIMD_OPRSZ_BITS bits, and the element size selector esz (used below
 * as a shift count) in the two bits starting at SIMD_DATA_SHIFT.
 */
uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
{
    intptr_t words = extract32(pred_desc, 0, SIMD_OPRSZ_BITS);
    intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
    uint32_t flags = PREDTEST_INIT;
    uint64_t *d = vd, *g = vg, esz_mask;
    intptr_t i, next;

    /* Bit position one element past the last active element of d.
       When d has no active element, last_active_element returns
       -(1 << esz), so the search starts at bit 0.  */
    next = last_active_element(vd, words, esz) + (1 << esz);
    esz_mask = pred_esz_masks[esz];

    /* Similar to the pseudocode for pnext, but scaled by ESZ
       so that we find the correct bit.  */
    if (next < words * 64) {
        uint64_t mask = -1;

        /* Starting in the middle of a word: hide the bits below the
           starting position for the first word examined, and round
           next down to that word's first bit.  */
        if (next & 63) {
            mask = ~((1ull << (next & 63)) - 1);
            next &= -64;
        }
        /* Scan g one word at a time for the first eligible set bit.  */
        do {
            uint64_t this_g = g[next / 64] & esz_mask & mask;
            if (this_g != 0) {
                next = (next & -64) + ctz64(this_g);
                break;
            }
            next += 64;
            mask = -1;
        } while (next < words * 64);
    }

    /* Rewrite d: only the word containing bit `next` gets a bit set.
       If the search ran off the end (next >= words * 64), no word
       matches and d becomes all zeros.  */
    i = 0;
    do {
        uint64_t this_d = 0;
        if (i == next / 64) {
            this_d = 1ull << (next & 63);
        }
        d[i] = this_d;
        flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
    } while (++i < words);

    return flags;
}
 957
 958/* Store zero into every active element of Zd.  We will use this for two
 959 * and three-operand predicated instructions for which logic dictates a
 960 * zero result.  In particular, logical shift by element size, which is
 961 * otherwise undefined on the host.
 962 *
 963 * For element sizes smaller than uint64_t, we use tables to expand
 964 * the N bits of the controlling predicate to a byte mask, and clear
 965 * those bytes.
 966 */
 967void HELPER(sve_clr_b)(void *vd, void *vg, uint32_t desc)
 968{
 969    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
 970    uint64_t *d = vd;
 971    uint8_t *pg = vg;
 972    for (i = 0; i < opr_sz; i += 1) {
 973        d[i] &= ~expand_pred_b(pg[H1(i)]);
 974    }
 975}
 976
 977void HELPER(sve_clr_h)(void *vd, void *vg, uint32_t desc)
 978{
 979    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
 980    uint64_t *d = vd;
 981    uint8_t *pg = vg;
 982    for (i = 0; i < opr_sz; i += 1) {
 983        d[i] &= ~expand_pred_h(pg[H1(i)]);
 984    }
 985}
 986
 987void HELPER(sve_clr_s)(void *vd, void *vg, uint32_t desc)
 988{
 989    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
 990    uint64_t *d = vd;
 991    uint8_t *pg = vg;
 992    for (i = 0; i < opr_sz; i += 1) {
 993        d[i] &= ~expand_pred_s(pg[H1(i)]);
 994    }
 995}
 996
 997void HELPER(sve_clr_d)(void *vd, void *vg, uint32_t desc)
 998{
 999    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1000    uint64_t *d = vd;
1001    uint8_t *pg = vg;
1002    for (i = 0; i < opr_sz; i += 1) {
1003        if (pg[H1(i)] & 1) {
1004            d[i] = 0;
1005        }
1006    }
1007}
1008
1009/* Copy Zn into Zd, and store zero into inactive elements.  */
1010void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
1011{
1012    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1013    uint64_t *d = vd, *n = vn;
1014    uint8_t *pg = vg;
1015    for (i = 0; i < opr_sz; i += 1) {
1016        d[i] = n[i] & expand_pred_b(pg[H1(i)]);
1017    }
1018}
1019
1020void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
1021{
1022    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1023    uint64_t *d = vd, *n = vn;
1024    uint8_t *pg = vg;
1025    for (i = 0; i < opr_sz; i += 1) {
1026        d[i] = n[i] & expand_pred_h(pg[H1(i)]);
1027    }
1028}
1029
1030void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
1031{
1032    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1033    uint64_t *d = vd, *n = vn;
1034    uint8_t *pg = vg;
1035    for (i = 0; i < opr_sz; i += 1) {
1036        d[i] = n[i] & expand_pred_s(pg[H1(i)]);
1037    }
1038}
1039
1040void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
1041{
1042    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1043    uint64_t *d = vd, *n = vn;
1044    uint8_t *pg = vg;
1045    for (i = 0; i < opr_sz; i += 1) {
1046        d[i] = n[i] & -(uint64_t)(pg[H1(i)] & 1);
1047    }
1048}
1049
/* Predicated expander for "vector OP immediate".  The immediate is
 * taken from the data field of the descriptor and converted to TYPE.
 * Only active elements of Zd are written; inactive ones are left
 * untouched.
 */
#define DO_ZPZI(NAME, TYPE, H, OP)                              \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
{                                                               \
    intptr_t total = simd_oprsz(desc);                          \
    intptr_t off = 0;                                           \
    TYPE operand = simd_data(desc);                             \
    while (off < total) {                                       \
        /* 16 predicate bits govern 16 vector bytes.  */        \
        uint16_t bits = *(uint16_t *)(vg + H1_2(off >> 3));     \
        do {                                                    \
            if (bits & 1) {                                     \
                TYPE elt = *(TYPE *)(vn + H(off));              \
                *(TYPE *)(vd + H(off)) = OP(elt, operand);      \
            }                                                   \
            off += sizeof(TYPE);                                \
            bits >>= sizeof(TYPE);                              \
        } while (off & 15);                                     \
    }                                                           \
}
1068
/* The same predicated "vector OP immediate" expander, specialized for
 * 64-bit elements: bit 0 of each predicate byte governs one element.
 */
#define DO_ZPZI_D(NAME, TYPE, OP)                               \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
{                                                               \
    intptr_t count = simd_oprsz(desc) / 8;                      \
    TYPE *dst = vd;                                             \
    TYPE *src = vn;                                             \
    TYPE operand = simd_data(desc);                             \
    uint8_t *pred = vg;                                         \
    intptr_t e;                                                 \
    for (e = 0; e < count; e++) {                               \
        TYPE elt;                                               \
        if ((pred[H1(e)] & 1) == 0) {                           \
            continue;                                           \
        }                                                       \
        elt = src[e];                                           \
        dst[e] = OP(elt, operand);                              \
    }                                                           \
}
1084
/* Plain right and left shifts.  The arguments are parenthesized so
 * that an operand containing a lower-precedence operator (e.g. "a | b")
 * is shifted as a whole.
 */
#define DO_SHR(N, M)  ((N) >> (M))
#define DO_SHL(N, M)  ((N) << (M))

/* Arithmetic shift right for division.  This rounds negative numbers
   toward zero as per signed division.  Therefore before shifting,
   when N is negative, add 2**M-1.  N is evaluated more than once.  */
#define DO_ASRD(N, M) \
    (((N) + ((N) < 0 ? ((__typeof(N))1 << (M)) - 1 : 0)) >> (M))
1092
/* Right shift by immediate on signed element types.  */
DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)

/* Right shift by immediate on unsigned element types.  */
DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)

/* Left shift by immediate on unsigned element types.  */
DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)

/* Shift right for division (DO_ASRD) on signed element types.  */
DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)

/* The shift macros and immediate expanders end their scope here.  */
#undef DO_SHR
#undef DO_SHL
#undef DO_ASRD
#undef DO_ZPZI
#undef DO_ZPZI_D
1118
/* Predicated four-operand expander: for each active element,
 * Zd = OP(Za, Zn, Zm).  Inactive elements of Zd are left untouched.
 */
#define DO_ZPZZZ(NAME, TYPE, H, OP)                           \
void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,     \
                  void *vg, uint32_t desc)                    \
{                                                             \
    intptr_t total = simd_oprsz(desc);                        \
    intptr_t off = 0;                                         \
    while (off < total) {                                     \
        /* 16 predicate bits govern 16 vector bytes.  */      \
        uint16_t bits = *(uint16_t *)(vg + H1_2(off >> 3));   \
        do {                                                  \
            if (bits & 1) {                                   \
                TYPE n_elt = *(TYPE *)(vn + H(off));          \
                TYPE m_elt = *(TYPE *)(vm + H(off));          \
                TYPE a_elt = *(TYPE *)(va + H(off));          \
                *(TYPE *)(vd + H(off)) = OP(a_elt, n_elt, m_elt); \
            }                                                 \
            off += sizeof(TYPE);                              \
            bits >>= sizeof(TYPE);                            \
        } while (off & 15);                                   \
    }                                                         \
}
1139
/* The same predicated four-operand expander, specialized for 64-bit
 * elements: bit 0 of each predicate byte governs one element.
 */
#define DO_ZPZZZ_D(NAME, TYPE, OP)                            \
void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,     \
                  void *vg, uint32_t desc)                    \
{                                                             \
    intptr_t count = simd_oprsz(desc) / 8;                    \
    TYPE *dst = vd;                                           \
    TYPE *acc = va;                                           \
    TYPE *lhs = vn;                                           \
    TYPE *rhs = vm;                                           \
    uint8_t *pred = vg;                                       \
    intptr_t e;                                               \
    for (e = 0; e < count; e++) {                             \
        TYPE a_elt, n_elt, m_elt;                             \
        if ((pred[H1(e)] & 1) == 0) {                         \
            continue;                                         \
        }                                                     \
        a_elt = acc[e];                                       \
        n_elt = lhs[e];                                       \
        m_elt = rhs[e];                                       \
        dst[e] = OP(a_elt, n_elt, m_elt);                     \
    }                                                         \
}
1155
/* Multiply-add and multiply-subtract: A plus/minus the product N * M.
 * Arguments are parenthesized so that compound operand expressions are
 * multiplied and added as a whole.
 */
#define DO_MLA(A, N, M)  ((A) + (N) * (M))
#define DO_MLS(A, N, M)  ((A) - (N) * (M))
1158
/* Byte elements.  */
DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)

/* Halfword elements.  */
DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)

/* Word elements.  */
DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)

/* Doubleword elements use the 64-bit specialization.  */
DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)

/* The multiply-accumulate macros and expanders end their scope here.  */
#undef DO_MLA
#undef DO_MLS
#undef DO_ZPZZZ
#undef DO_ZPZZZ_D
1175
1176void HELPER(sve_index_b)(void *vd, uint32_t start,
1177                         uint32_t incr, uint32_t desc)
1178{
1179    intptr_t i, opr_sz = simd_oprsz(desc);
1180    uint8_t *d = vd;
1181    for (i = 0; i < opr_sz; i += 1) {
1182        d[H1(i)] = start + i * incr;
1183    }
1184}
1185
1186void HELPER(sve_index_h)(void *vd, uint32_t start,
1187                         uint32_t incr, uint32_t desc)
1188{
1189    intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1190    uint16_t *d = vd;
1191    for (i = 0; i < opr_sz; i += 1) {
1192        d[H2(i)] = start + i * incr;
1193    }
1194}
1195
1196void HELPER(sve_index_s)(void *vd, uint32_t start,
1197                         uint32_t incr, uint32_t desc)
1198{
1199    intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1200    uint32_t *d = vd;
1201    for (i = 0; i < opr_sz; i += 1) {
1202        d[H4(i)] = start + i * incr;
1203    }
1204}
1205
1206void HELPER(sve_index_d)(void *vd, uint64_t start,
1207                         uint64_t incr, uint32_t desc)
1208{
1209    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1210    uint64_t *d = vd;
1211    for (i = 0; i < opr_sz; i += 1) {
1212        d[i] = start + i * incr;
1213    }
1214}
1215
1216void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
1217{
1218    intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1219    uint32_t sh = simd_data(desc);
1220    uint32_t *d = vd, *n = vn, *m = vm;
1221    for (i = 0; i < opr_sz; i += 1) {
1222        d[i] = n[i] + (m[i] << sh);
1223    }
1224}
1225
1226void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
1227{
1228    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1229    uint64_t sh = simd_data(desc);
1230    uint64_t *d = vd, *n = vn, *m = vm;
1231    for (i = 0; i < opr_sz; i += 1) {
1232        d[i] = n[i] + (m[i] << sh);
1233    }
1234}
1235
1236void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
1237{
1238    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1239    uint64_t sh = simd_data(desc);
1240    uint64_t *d = vd, *n = vn, *m = vm;
1241    for (i = 0; i < opr_sz; i += 1) {
1242        d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
1243    }
1244}
1245
1246void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
1247{
1248    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1249    uint64_t sh = simd_data(desc);
1250    uint64_t *d = vd, *n = vn, *m = vm;
1251    for (i = 0; i < opr_sz; i += 1) {
1252        d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
1253    }
1254}
1255
1256void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
1257{
1258    /* These constants are cut-and-paste directly from the ARM pseudocode.  */
1259    static const uint16_t coeff[] = {
1260        0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
1261        0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
1262        0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
1263        0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
1264    };
1265    intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1266    uint16_t *d = vd, *n = vn;
1267
1268    for (i = 0; i < opr_sz; i++) {
1269        uint16_t nn = n[i];
1270        intptr_t idx = extract32(nn, 0, 5);
1271        uint16_t exp = extract32(nn, 5, 5);
1272        d[i] = coeff[idx] | (exp << 10);
1273    }
1274}
1275
1276void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
1277{
1278    /* These constants are cut-and-paste directly from the ARM pseudocode.  */
1279    static const uint32_t coeff[] = {
1280        0x000000, 0x0164d2, 0x02cd87, 0x043a29,
1281        0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
1282        0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
1283        0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
1284        0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
1285        0x1ef532, 0x20b051, 0x227043, 0x243516,
1286        0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
1287        0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
1288        0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
1289        0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
1290        0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
1291        0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
1292        0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
1293        0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
1294        0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
1295        0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
1296    };
1297    intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1298    uint32_t *d = vd, *n = vn;
1299
1300    for (i = 0; i < opr_sz; i++) {
1301        uint32_t nn = n[i];
1302        intptr_t idx = extract32(nn, 0, 6);
1303        uint32_t exp = extract32(nn, 6, 8);
1304        d[i] = coeff[idx] | (exp << 23);
1305    }
1306}
1307
1308void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
1309{
1310    /* These constants are cut-and-paste directly from the ARM pseudocode.  */
1311    static const uint64_t coeff[] = {
1312        0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
1313        0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
1314        0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
1315        0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
1316        0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
1317        0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
1318        0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
1319        0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
1320        0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
1321        0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
1322        0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
1323        0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
1324        0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
1325        0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
1326        0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
1327        0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
1328        0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
1329        0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
1330        0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
1331        0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
1332        0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
1333        0xFA7C1819E90D8ull,
1334    };
1335    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1336    uint64_t *d = vd, *n = vn;
1337
1338    for (i = 0; i < opr_sz; i++) {
1339        uint64_t nn = n[i];
1340        intptr_t idx = extract32(nn, 0, 6);
1341        uint64_t exp = extract32(nn, 6, 11);
1342        d[i] = coeff[idx] | (exp << 52);
1343    }
1344}
1345
1346void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
1347{
1348    intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1349    uint16_t *d = vd, *n = vn, *m = vm;
1350    for (i = 0; i < opr_sz; i += 1) {
1351        uint16_t nn = n[i];
1352        uint16_t mm = m[i];
1353        if (mm & 1) {
1354            nn = float16_one;
1355        }
1356        d[i] = nn ^ (mm & 2) << 14;
1357    }
1358}
1359
1360void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
1361{
1362    intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1363    uint32_t *d = vd, *n = vn, *m = vm;
1364    for (i = 0; i < opr_sz; i += 1) {
1365        uint32_t nn = n[i];
1366        uint32_t mm = m[i];
1367        if (mm & 1) {
1368            nn = float32_one;
1369        }
1370        d[i] = nn ^ (mm & 2) << 30;
1371    }
1372}
1373
1374void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
1375{
1376    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1377    uint64_t *d = vd, *n = vn, *m = vm;
1378    for (i = 0; i < opr_sz; i += 1) {
1379        uint64_t nn = n[i];
1380        uint64_t mm = m[i];
1381        if (mm & 1) {
1382            nn = float64_one;
1383        }
1384        d[i] = nn ^ (mm & 2) << 62;
1385    }
1386}
1387
1388/*
1389 * Signed saturating addition with scalar operand.
1390 */
1391
1392void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
1393{
1394    intptr_t i, oprsz = simd_oprsz(desc);
1395
1396    for (i = 0; i < oprsz; i += sizeof(int8_t)) {
1397        int r = *(int8_t *)(a + i) + b;
1398        if (r > INT8_MAX) {
1399            r = INT8_MAX;
1400        } else if (r < INT8_MIN) {
1401            r = INT8_MIN;
1402        }
1403        *(int8_t *)(d + i) = r;
1404    }
1405}
1406
1407void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
1408{
1409    intptr_t i, oprsz = simd_oprsz(desc);
1410
1411    for (i = 0; i < oprsz; i += sizeof(int16_t)) {
1412        int r = *(int16_t *)(a + i) + b;
1413        if (r > INT16_MAX) {
1414            r = INT16_MAX;
1415        } else if (r < INT16_MIN) {
1416            r = INT16_MIN;
1417        }
1418        *(int16_t *)(d + i) = r;
1419    }
1420}
1421
1422void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
1423{
1424    intptr_t i, oprsz = simd_oprsz(desc);
1425
1426    for (i = 0; i < oprsz; i += sizeof(int32_t)) {
1427        int64_t r = *(int32_t *)(a + i) + b;
1428        if (r > INT32_MAX) {
1429            r = INT32_MAX;
1430        } else if (r < INT32_MIN) {
1431            r = INT32_MIN;
1432        }
1433        *(int32_t *)(d + i) = r;
1434    }
1435}
1436
1437void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
1438{
1439    intptr_t i, oprsz = simd_oprsz(desc);
1440
1441    for (i = 0; i < oprsz; i += sizeof(int64_t)) {
1442        int64_t ai = *(int64_t *)(a + i);
1443        int64_t r = ai + b;
1444        if (((r ^ ai) & ~(ai ^ b)) < 0) {
1445            /* Signed overflow.  */
1446            r = (r < 0 ? INT64_MAX : INT64_MIN);
1447        }
1448        *(int64_t *)(d + i) = r;
1449    }
1450}
1451
1452/*
1453 * Unsigned saturating addition with scalar operand.
1454 */
1455
1456void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
1457{
1458    intptr_t i, oprsz = simd_oprsz(desc);
1459
1460    for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
1461        int r = *(uint8_t *)(a + i) + b;
1462        if (r > UINT8_MAX) {
1463            r = UINT8_MAX;
1464        } else if (r < 0) {
1465            r = 0;
1466        }
1467        *(uint8_t *)(d + i) = r;
1468    }
1469}
1470
1471void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
1472{
1473    intptr_t i, oprsz = simd_oprsz(desc);
1474
1475    for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
1476        int r = *(uint16_t *)(a + i) + b;
1477        if (r > UINT16_MAX) {
1478            r = UINT16_MAX;
1479        } else if (r < 0) {
1480            r = 0;
1481        }
1482        *(uint16_t *)(d + i) = r;
1483    }
1484}
1485
1486void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
1487{
1488    intptr_t i, oprsz = simd_oprsz(desc);
1489
1490    for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
1491        int64_t r = *(uint32_t *)(a + i) + b;
1492        if (r > UINT32_MAX) {
1493            r = UINT32_MAX;
1494        } else if (r < 0) {
1495            r = 0;
1496        }
1497        *(uint32_t *)(d + i) = r;
1498    }
1499}
1500
1501void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
1502{
1503    intptr_t i, oprsz = simd_oprsz(desc);
1504
1505    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1506        uint64_t r = *(uint64_t *)(a + i) + b;
1507        if (r < b) {
1508            r = UINT64_MAX;
1509        }
1510        *(uint64_t *)(d + i) = r;
1511    }
1512}
1513
1514void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
1515{
1516    intptr_t i, oprsz = simd_oprsz(desc);
1517
1518    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1519        uint64_t ai = *(uint64_t *)(a + i);
1520        *(uint64_t *)(d + i) = (ai < b ? 0 : ai - b);
1521    }
1522}
1523
1524/* Two operand predicated copy immediate with merge.  All valid immediates
1525 * can fit within 17 signed bits in the simd_data field.
1526 */
1527void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
1528                         uint64_t mm, uint32_t desc)
1529{
1530    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1531    uint64_t *d = vd, *n = vn;
1532    uint8_t *pg = vg;
1533
1534    mm = dup_const(MO_8, mm);
1535    for (i = 0; i < opr_sz; i += 1) {
1536        uint64_t nn = n[i];
1537        uint64_t pp = expand_pred_b(pg[H1(i)]);
1538        d[i] = (mm & pp) | (nn & ~pp);
1539    }
1540}
1541
1542void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
1543                         uint64_t mm, uint32_t desc)
1544{
1545    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1546    uint64_t *d = vd, *n = vn;
1547    uint8_t *pg = vg;
1548
1549    mm = dup_const(MO_16, mm);
1550    for (i = 0; i < opr_sz; i += 1) {
1551        uint64_t nn = n[i];
1552        uint64_t pp = expand_pred_h(pg[H1(i)]);
1553        d[i] = (mm & pp) | (nn & ~pp);
1554    }
1555}
1556
1557void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
1558                         uint64_t mm, uint32_t desc)
1559{
1560    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1561    uint64_t *d = vd, *n = vn;
1562    uint8_t *pg = vg;
1563
1564    mm = dup_const(MO_32, mm);
1565    for (i = 0; i < opr_sz; i += 1) {
1566        uint64_t nn = n[i];
1567        uint64_t pp = expand_pred_s(pg[H1(i)]);
1568        d[i] = (mm & pp) | (nn & ~pp);
1569    }
1570}
1571
1572void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
1573                         uint64_t mm, uint32_t desc)
1574{
1575    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1576    uint64_t *d = vd, *n = vn;
1577    uint8_t *pg = vg;
1578
1579    for (i = 0; i < opr_sz; i += 1) {
1580        uint64_t nn = n[i];
1581        d[i] = (pg[H1(i)] & 1 ? mm : nn);
1582    }
1583}
1584
1585void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
1586{
1587    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1588    uint64_t *d = vd;
1589    uint8_t *pg = vg;
1590
1591    val = dup_const(MO_8, val);
1592    for (i = 0; i < opr_sz; i += 1) {
1593        d[i] = val & expand_pred_b(pg[H1(i)]);
1594    }
1595}
1596
1597void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
1598{
1599    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1600    uint64_t *d = vd;
1601    uint8_t *pg = vg;
1602
1603    val = dup_const(MO_16, val);
1604    for (i = 0; i < opr_sz; i += 1) {
1605        d[i] = val & expand_pred_h(pg[H1(i)]);
1606    }
1607}
1608
1609void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
1610{
1611    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1612    uint64_t *d = vd;
1613    uint8_t *pg = vg;
1614
1615    val = dup_const(MO_32, val);
1616    for (i = 0; i < opr_sz; i += 1) {
1617        d[i] = val & expand_pred_s(pg[H1(i)]);
1618    }
1619}
1620
1621void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
1622{
1623    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1624    uint64_t *d = vd;
1625    uint8_t *pg = vg;
1626
1627    for (i = 0; i < opr_sz; i += 1) {
1628        d[i] = (pg[H1(i)] & 1 ? val : 0);
1629    }
1630}
1631
/* Big-endian hosts need to frob the byte indices.  If the copy
 * happens to be 8-byte aligned, then no frobbing is necessary.
 *
 * Copies N bytes from VS to VD, tolerating overlap.  On little-endian
 * hosts this always reduces to memmove().  On big-endian hosts the
 * combined alignment of destination, source and length picks the
 * widest unit (8, 4, 2 or 1 bytes) that can be moved, with each
 * sub-8-byte access going through the matching H1_* address fixup.
 */
static void swap_memmove(void *vd, void *vs, size_t n)
{
    const uintptr_t dst = (uintptr_t)vd;
    const uintptr_t src = (uintptr_t)vs;
    /* Walking upward is safe unless dst starts inside [src, src + n).  */
    const bool ascending = dst < src || dst >= src + n;
#ifdef HOST_WORDS_BIGENDIAN
    const uintptr_t misalign = (dst | src | n) & 7;
#else
    const uintptr_t misalign = 0;
#endif
    size_t k;

    if (misalign == 0) {
        memmove(vd, vs, n);
        return;
    }

    if (misalign == 4) {
        /* Everything is 4-byte aligned: move 32-bit units.  */
        if (ascending) {
            for (k = 0; k < n; k += 4) {
                *(uint32_t *)H1_4(dst + k) = *(uint32_t *)H1_4(src + k);
            }
        } else {
            k = n;
            while (k > 0) {
                k -= 4;
                *(uint32_t *)H1_4(dst + k) = *(uint32_t *)H1_4(src + k);
            }
        }
    } else if (misalign == 2 || misalign == 6) {
        /* Everything is 2-byte aligned: move 16-bit units.  */
        if (ascending) {
            for (k = 0; k < n; k += 2) {
                *(uint16_t *)H1_2(dst + k) = *(uint16_t *)H1_2(src + k);
            }
        } else {
            k = n;
            while (k > 0) {
                k -= 2;
                *(uint16_t *)H1_2(dst + k) = *(uint16_t *)H1_2(src + k);
            }
        }
    } else {
        /* Odd alignment: fall back to single bytes.  */
        if (ascending) {
            for (k = 0; k < n; k++) {
                *(uint8_t *)H1(dst + k) = *(uint8_t *)H1(src + k);
            }
        } else {
            k = n;
            while (k > 0) {
                k -= 1;
                *(uint8_t *)H1(dst + k) = *(uint8_t *)H1(src + k);
            }
        }
    }
}
1691
/* Similarly for memset of 0.
 *
 * Clears N bytes at VD.  On little-endian hosts this is memset(); on
 * big-endian hosts the combined alignment of address and length picks
 * the store width, with sub-8-byte stores going through the H1_*
 * address fixups.
 */
static void swap_memzero(void *vd, size_t n)
{
    const uintptr_t dst = (uintptr_t)vd;
#ifdef HOST_WORDS_BIGENDIAN
    const uintptr_t misalign = (dst | n) & 7;
#else
    const uintptr_t misalign = 0;
#endif
    size_t k;

    /* Usually, the first bit of a predicate is set, so N is 0.  */
    if (likely(n == 0)) {
        return;
    }

    if (misalign == 0) {
        memset(vd, 0, n);
    } else if (misalign == 4) {
        for (k = 0; k < n; k += 4) {
            *(uint32_t *)H1_4(dst + k) = 0;
        }
    } else if (misalign == 2 || misalign == 6) {
        for (k = 0; k < n; k += 2) {
            *(uint16_t *)H1_2(dst + k) = 0;
        }
    } else {
        for (k = 0; k < n; k++) {
            *(uint8_t *)H1(dst + k) = 0;
        }
    }
}
1732
1733void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
1734{
1735    intptr_t opr_sz = simd_oprsz(desc);
1736    size_t n_ofs = simd_data(desc);
1737    size_t n_siz = opr_sz - n_ofs;
1738
1739    if (vd != vm) {
1740        swap_memmove(vd, vn + n_ofs, n_siz);
1741        swap_memmove(vd + n_siz, vm, n_ofs);
1742    } else if (vd != vn) {
1743        swap_memmove(vd + n_siz, vd, n_ofs);
1744        swap_memmove(vd, vn + n_ofs, n_siz);
1745    } else {
1746        /* vd == vn == vm.  Need temp space.  */
1747        ARMVectorReg tmp;
1748        swap_memmove(&tmp, vm, n_ofs);
1749        swap_memmove(vd, vd + n_ofs, n_siz);
1750        memcpy(vd + n_siz, &tmp, n_ofs);
1751    }
1752}
1753
1754#define DO_INSR(NAME, TYPE, H) \
1755void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
1756{                                                                  \
1757    intptr_t opr_sz = simd_oprsz(desc);                            \
1758    swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE));    \
1759    *(TYPE *)(vd + H(0)) = val;                                    \
1760}
1761
1762DO_INSR(sve_insr_b, uint8_t, H1)
1763DO_INSR(sve_insr_h, uint16_t, H1_2)
1764DO_INSR(sve_insr_s, uint32_t, H1_4)
1765DO_INSR(sve_insr_d, uint64_t, )
1766
1767#undef DO_INSR
1768
1769void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
1770{
1771    intptr_t i, j, opr_sz = simd_oprsz(desc);
1772    for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1773        uint64_t f = *(uint64_t *)(vn + i);
1774        uint64_t b = *(uint64_t *)(vn + j);
1775        *(uint64_t *)(vd + i) = bswap64(b);
1776        *(uint64_t *)(vd + j) = bswap64(f);
1777    }
1778}
1779
1780void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
1781{
1782    intptr_t i, j, opr_sz = simd_oprsz(desc);
1783    for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1784        uint64_t f = *(uint64_t *)(vn + i);
1785        uint64_t b = *(uint64_t *)(vn + j);
1786        *(uint64_t *)(vd + i) = hswap64(b);
1787        *(uint64_t *)(vd + j) = hswap64(f);
1788    }
1789}
1790
1791void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
1792{
1793    intptr_t i, j, opr_sz = simd_oprsz(desc);
1794    for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1795        uint64_t f = *(uint64_t *)(vn + i);
1796        uint64_t b = *(uint64_t *)(vn + j);
1797        *(uint64_t *)(vd + i) = rol64(b, 32);
1798        *(uint64_t *)(vd + j) = rol64(f, 32);
1799    }
1800}
1801
1802void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
1803{
1804    intptr_t i, j, opr_sz = simd_oprsz(desc);
1805    for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1806        uint64_t f = *(uint64_t *)(vn + i);
1807        uint64_t b = *(uint64_t *)(vn + j);
1808        *(uint64_t *)(vd + i) = b;
1809        *(uint64_t *)(vd + j) = f;
1810    }
1811}
1812
1813#define DO_TBL(NAME, TYPE, H) \
1814void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1815{                                                              \
1816    intptr_t i, opr_sz = simd_oprsz(desc);                     \
1817    uintptr_t elem = opr_sz / sizeof(TYPE);                    \
1818    TYPE *d = vd, *n = vn, *m = vm;                            \
1819    ARMVectorReg tmp;                                          \
1820    if (unlikely(vd == vn)) {                                  \
1821        n = memcpy(&tmp, vn, opr_sz);                          \
1822    }                                                          \
1823    for (i = 0; i < elem; i++) {                               \
1824        TYPE j = m[H(i)];                                      \
1825        d[H(i)] = j < elem ? n[H(j)] : 0;                      \
1826    }                                                          \
1827}
1828
1829DO_TBL(sve_tbl_b, uint8_t, H1)
1830DO_TBL(sve_tbl_h, uint16_t, H2)
1831DO_TBL(sve_tbl_s, uint32_t, H4)
1832DO_TBL(sve_tbl_d, uint64_t, )
1833
1834#undef TBL
1835
1836#define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
1837void HELPER(NAME)(void *vd, void *vn, uint32_t desc)           \
1838{                                                              \
1839    intptr_t i, opr_sz = simd_oprsz(desc);                     \
1840    TYPED *d = vd;                                             \
1841    TYPES *n = vn;                                             \
1842    ARMVectorReg tmp;                                          \
1843    if (unlikely(vn - vd < opr_sz)) {                          \
1844        n = memcpy(&tmp, n, opr_sz / 2);                       \
1845    }                                                          \
1846    for (i = 0; i < opr_sz / sizeof(TYPED); i++) {             \
1847        d[HD(i)] = n[HS(i)];                                   \
1848    }                                                          \
1849}
1850
1851DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
1852DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
1853DO_UNPK(sve_sunpk_d, int64_t, int32_t, , H4)
1854
1855DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
1856DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
1857DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, , H4)
1858
1859#undef DO_UNPK
1860
/* Mask of bits included in the even numbered predicates of width esz.
 * We also use this for expand_bits/compress_bits, and so extend the
 * same pattern out to 16-bit units.
 *
 * Entry k selects every other group of 2**k bits, starting with the
 * lowest group.
 */
static const uint64_t even_bit_esz_masks[5] = {
    [0] = 0x5555555555555555ULL,    /*  1-bit groups */
    [1] = 0x3333333333333333ULL,    /*  2-bit groups */
    [2] = 0x0f0f0f0f0f0f0f0fULL,    /*  4-bit groups */
    [3] = 0x00ff00ff00ff00ffULL,    /*  8-bit groups */
    [4] = 0x0000ffff0000ffffULL,    /* 16-bit groups */
};
1872
1873/* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
1874 * For N==0, this corresponds to the operation that in qemu/bitops.h
1875 * we call half_shuffle64; this algorithm is from Hacker's Delight,
1876 * section 7-2 Shuffling Bits.
1877 */
1878static uint64_t expand_bits(uint64_t x, int n)
1879{
1880    int i;
1881
1882    x &= 0xffffffffu;
1883    for (i = 4; i >= n; i--) {
1884        int sh = 1 << i;
1885        x = ((x << sh) | x) & even_bit_esz_masks[i];
1886    }
1887    return x;
1888}
1889
1890/* Compress units of 2**(N+1) bits to units of 2**N bits.
1891 * For N==0, this corresponds to the operation that in qemu/bitops.h
1892 * we call half_unshuffle64; this algorithm is from Hacker's Delight,
1893 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
1894 */
1895static uint64_t compress_bits(uint64_t x, int n)
1896{
1897    int i;
1898
1899    for (i = n; i <= 4; i++) {
1900        int sh = 1 << i;
1901        x &= even_bit_esz_masks[i];
1902        x = (x >> sh) | x;
1903    }
1904    return x & 0xffffffffu;
1905}
1906
1907void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1908{
1909    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1910    int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1911    intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
1912    uint64_t *d = vd;
1913    intptr_t i;
1914
1915    if (oprsz <= 8) {
1916        uint64_t nn = *(uint64_t *)vn;
1917        uint64_t mm = *(uint64_t *)vm;
1918        int half = 4 * oprsz;
1919
1920        nn = extract64(nn, high * half, half);
1921        mm = extract64(mm, high * half, half);
1922        nn = expand_bits(nn, esz);
1923        mm = expand_bits(mm, esz);
1924        d[0] = nn + (mm << (1 << esz));
1925    } else {
1926        ARMPredicateReg tmp_n, tmp_m;
1927
1928        /* We produce output faster than we consume input.
1929           Therefore we must be mindful of possible overlap.  */
1930        if ((vn - vd) < (uintptr_t)oprsz) {
1931            vn = memcpy(&tmp_n, vn, oprsz);
1932        }
1933        if ((vm - vd) < (uintptr_t)oprsz) {
1934            vm = memcpy(&tmp_m, vm, oprsz);
1935        }
1936        if (high) {
1937            high = oprsz >> 1;
1938        }
1939
1940        if ((high & 3) == 0) {
1941            uint32_t *n = vn, *m = vm;
1942            high >>= 2;
1943
1944            for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
1945                uint64_t nn = n[H4(high + i)];
1946                uint64_t mm = m[H4(high + i)];
1947
1948                nn = expand_bits(nn, esz);
1949                mm = expand_bits(mm, esz);
1950                d[i] = nn + (mm << (1 << esz));
1951            }
1952        } else {
1953            uint8_t *n = vn, *m = vm;
1954            uint16_t *d16 = vd;
1955
1956            for (i = 0; i < oprsz / 2; i++) {
1957                uint16_t nn = n[H1(high + i)];
1958                uint16_t mm = m[H1(high + i)];
1959
1960                nn = expand_bits(nn, esz);
1961                mm = expand_bits(mm, esz);
1962                d16[H2(i)] = nn + (mm << (1 << esz));
1963            }
1964        }
1965    }
1966}
1967
1968void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1969{
1970    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1971    int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1972    int odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1) << esz;
1973    uint64_t *d = vd, *n = vn, *m = vm;
1974    uint64_t l, h;
1975    intptr_t i;
1976
1977    if (oprsz <= 8) {
1978        l = compress_bits(n[0] >> odd, esz);
1979        h = compress_bits(m[0] >> odd, esz);
1980        d[0] = extract64(l + (h << (4 * oprsz)), 0, 8 * oprsz);
1981    } else {
1982        ARMPredicateReg tmp_m;
1983        intptr_t oprsz_16 = oprsz / 16;
1984
1985        if ((vm - vd) < (uintptr_t)oprsz) {
1986            m = memcpy(&tmp_m, vm, oprsz);
1987        }
1988
1989        for (i = 0; i < oprsz_16; i++) {
1990            l = n[2 * i + 0];
1991            h = n[2 * i + 1];
1992            l = compress_bits(l >> odd, esz);
1993            h = compress_bits(h >> odd, esz);
1994            d[i] = l + (h << 32);
1995        }
1996
1997        /* For VL which is not a power of 2, the results from M do not
1998           align nicely with the uint64_t for D.  Put the aligned results
1999           from M into TMP_M and then copy it into place afterward.  */
2000        if (oprsz & 15) {
2001            d[i] = compress_bits(n[2 * i] >> odd, esz);
2002
2003            for (i = 0; i < oprsz_16; i++) {
2004                l = m[2 * i + 0];
2005                h = m[2 * i + 1];
2006                l = compress_bits(l >> odd, esz);
2007                h = compress_bits(h >> odd, esz);
2008                tmp_m.p[i] = l + (h << 32);
2009            }
2010            tmp_m.p[i] = compress_bits(m[2 * i] >> odd, esz);
2011
2012            swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
2013        } else {
2014            for (i = 0; i < oprsz_16; i++) {
2015                l = m[2 * i + 0];
2016                h = m[2 * i + 1];
2017                l = compress_bits(l >> odd, esz);
2018                h = compress_bits(h >> odd, esz);
2019                d[oprsz_16 + i] = l + (h << 32);
2020            }
2021        }
2022    }
2023}
2024
2025void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
2026{
2027    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2028    uintptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2029    bool odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
2030    uint64_t *d = vd, *n = vn, *m = vm;
2031    uint64_t mask;
2032    int shr, shl;
2033    intptr_t i;
2034
2035    shl = 1 << esz;
2036    shr = 0;
2037    mask = even_bit_esz_masks[esz];
2038    if (odd) {
2039        mask <<= shl;
2040        shr = shl;
2041        shl = 0;
2042    }
2043
2044    for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
2045        uint64_t nn = (n[i] & mask) >> shr;
2046        uint64_t mm = (m[i] & mask) << shl;
2047        d[i] = nn + mm;
2048    }
2049}
2050
2051/* Reverse units of 2**N bits.  */
2052static uint64_t reverse_bits_64(uint64_t x, int n)
2053{
2054    int i, sh;
2055
2056    x = bswap64(x);
2057    for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
2058        uint64_t mask = even_bit_esz_masks[i];
2059        x = ((x & mask) << sh) | ((x >> sh) & mask);
2060    }
2061    return x;
2062}
2063
/* Reverse units of 2**N bits within a single byte.  Neighbouring groups
 * are exchanged for every group size from a nibble down to 2**N bits;
 * for N == 3 the byte is returned unchanged.
 */
static uint8_t reverse_bits_8(uint8_t x, int n)
{
    static const uint8_t lane_mask[3] = { 0x55, 0x33, 0x0f };
    int level = 2;
    int dist = 4;

    while (level >= n) {
        const uint8_t keep = lane_mask[level];

        x = ((x & keep) << dist) | ((x >> dist) & keep);
        level--;
        dist >>= 1;
    }
    return x;
}
2074
2075void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
2076{
2077    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2078    int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2079    intptr_t i, oprsz_2 = oprsz / 2;
2080
2081    if (oprsz <= 8) {
2082        uint64_t l = *(uint64_t *)vn;
2083        l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
2084        *(uint64_t *)vd = l;
2085    } else if ((oprsz & 15) == 0) {
2086        for (i = 0; i < oprsz_2; i += 8) {
2087            intptr_t ih = oprsz - 8 - i;
2088            uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
2089            uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
2090            *(uint64_t *)(vd + i) = h;
2091            *(uint64_t *)(vd + ih) = l;
2092        }
2093    } else {
2094        for (i = 0; i < oprsz_2; i += 1) {
2095            intptr_t il = H1(i);
2096            intptr_t ih = H1(oprsz - 1 - i);
2097            uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
2098            uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
2099            *(uint8_t *)(vd + il) = h;
2100            *(uint8_t *)(vd + ih) = l;
2101        }
2102    }
2103}
2104
2105void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
2106{
2107    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2108    intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
2109    uint64_t *d = vd;
2110    intptr_t i;
2111
2112    if (oprsz <= 8) {
2113        uint64_t nn = *(uint64_t *)vn;
2114        int half = 4 * oprsz;
2115
2116        nn = extract64(nn, high * half, half);
2117        nn = expand_bits(nn, 0);
2118        d[0] = nn;
2119    } else {
2120        ARMPredicateReg tmp_n;
2121
2122        /* We produce output faster than we consume input.
2123           Therefore we must be mindful of possible overlap.  */
2124        if ((vn - vd) < (uintptr_t)oprsz) {
2125            vn = memcpy(&tmp_n, vn, oprsz);
2126        }
2127        if (high) {
2128            high = oprsz >> 1;
2129        }
2130
2131        if ((high & 3) == 0) {
2132            uint32_t *n = vn;
2133            high >>= 2;
2134
2135            for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
2136                uint64_t nn = n[H4(high + i)];
2137                d[i] = expand_bits(nn, 0);
2138            }
2139        } else {
2140            uint16_t *d16 = vd;
2141            uint8_t *n = vn;
2142
2143            for (i = 0; i < oprsz / 2; i++) {
2144                uint16_t nn = n[H1(high + i)];
2145                d16[H2(i)] = expand_bits(nn, 0);
2146            }
2147        }
2148    }
2149}
2150
2151#define DO_ZIP(NAME, TYPE, H) \
2152void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)       \
2153{                                                                    \
2154    intptr_t oprsz = simd_oprsz(desc);                               \
2155    intptr_t i, oprsz_2 = oprsz / 2;                                 \
2156    ARMVectorReg tmp_n, tmp_m;                                       \
2157    /* We produce output faster than we consume input.               \
2158       Therefore we must be mindful of possible overlap.  */         \
2159    if (unlikely((vn - vd) < (uintptr_t)oprsz)) {                    \
2160        vn = memcpy(&tmp_n, vn, oprsz_2);                            \
2161    }                                                                \
2162    if (unlikely((vm - vd) < (uintptr_t)oprsz)) {                    \
2163        vm = memcpy(&tmp_m, vm, oprsz_2);                            \
2164    }                                                                \
2165    for (i = 0; i < oprsz_2; i += sizeof(TYPE)) {                    \
2166        *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + H(i));         \
2167        *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = *(TYPE *)(vm + H(i)); \
2168    }                                                                \
2169}
2170
2171DO_ZIP(sve_zip_b, uint8_t, H1)
2172DO_ZIP(sve_zip_h, uint16_t, H1_2)
2173DO_ZIP(sve_zip_s, uint32_t, H1_4)
2174DO_ZIP(sve_zip_d, uint64_t, )
2175
2176#define DO_UZP(NAME, TYPE, H) \
2177void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)         \
2178{                                                                      \
2179    intptr_t oprsz = simd_oprsz(desc);                                 \
2180    intptr_t oprsz_2 = oprsz / 2;                                      \
2181    intptr_t odd_ofs = simd_data(desc);                                \
2182    intptr_t i;                                                        \
2183    ARMVectorReg tmp_m;                                                \
2184    if (unlikely((vm - vd) < (uintptr_t)oprsz)) {                      \
2185        vm = memcpy(&tmp_m, vm, oprsz);                                \
2186    }                                                                  \
2187    for (i = 0; i < oprsz_2; i += sizeof(TYPE)) {                      \
2188        *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(2 * i + odd_ofs));     \
2189    }                                                                  \
2190    for (i = 0; i < oprsz_2; i += sizeof(TYPE)) {                      \
2191        *(TYPE *)(vd + H(oprsz_2 + i)) = *(TYPE *)(vm + H(2 * i + odd_ofs)); \
2192    }                                                                  \
2193}
2194
2195DO_UZP(sve_uzp_b, uint8_t, H1)
2196DO_UZP(sve_uzp_h, uint16_t, H1_2)
2197DO_UZP(sve_uzp_s, uint32_t, H1_4)
2198DO_UZP(sve_uzp_d, uint64_t, )
2199
2200#define DO_TRN(NAME, TYPE, H) \
2201void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)         \
2202{                                                                      \
2203    intptr_t oprsz = simd_oprsz(desc);                                 \
2204    intptr_t odd_ofs = simd_data(desc);                                \
2205    intptr_t i;                                                        \
2206    for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) {                    \
2207        TYPE ae = *(TYPE *)(vn + H(i + odd_ofs));                      \
2208        TYPE be = *(TYPE *)(vm + H(i + odd_ofs));                      \
2209        *(TYPE *)(vd + H(i + 0)) = ae;                                 \
2210        *(TYPE *)(vd + H(i + sizeof(TYPE))) = be;                      \
2211    }                                                                  \
2212}
2213
2214DO_TRN(sve_trn_b, uint8_t, H1)
2215DO_TRN(sve_trn_h, uint16_t, H1_2)
2216DO_TRN(sve_trn_s, uint32_t, H1_4)
2217DO_TRN(sve_trn_d, uint64_t, )
2218
2219#undef DO_ZIP
2220#undef DO_UZP
2221#undef DO_TRN
2222
2223void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
2224{
2225    intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
2226    uint32_t *d = vd, *n = vn;
2227    uint8_t *pg = vg;
2228
2229    for (i = j = 0; i < opr_sz; i++) {
2230        if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
2231            d[H4(j)] = n[H4(i)];
2232            j++;
2233        }
2234    }
2235    for (; j < opr_sz; j++) {
2236        d[H4(j)] = 0;
2237    }
2238}
2239
2240void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
2241{
2242    intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
2243    uint64_t *d = vd, *n = vn;
2244    uint8_t *pg = vg;
2245
2246    for (i = j = 0; i < opr_sz; i++) {
2247        if (pg[H1(i)] & 1) {
2248            d[j] = n[i];
2249            j++;
2250        }
2251    }
2252    for (; j < opr_sz; j++) {
2253        d[j] = 0;
2254    }
2255}
2256
2257/* Similar to the ARM LastActiveElement pseudocode function, except the
2258 * result is multiplied by the element size.  This includes the not found
2259 * indication; e.g. not found for esz=3 is -8.
2260 */
2261int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
2262{
2263    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2264    intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2265
2266    return last_active_element(vg, DIV_ROUND_UP(oprsz, 8), esz);
2267}
2268
2269void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
2270{
2271    intptr_t opr_sz = simd_oprsz(desc) / 8;
2272    int esz = simd_data(desc);
2273    uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
2274    intptr_t i, first_i, last_i;
2275    ARMVectorReg tmp;
2276
2277    first_i = last_i = 0;
2278    first_g = last_g = 0;
2279
2280    /* Find the extent of the active elements within VG.  */
2281    for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
2282        pg = *(uint64_t *)(vg + i) & mask;
2283        if (pg) {
2284            if (last_g == 0) {
2285                last_g = pg;
2286                last_i = i;
2287            }
2288            first_g = pg;
2289            first_i = i;
2290        }
2291    }
2292
2293    len = 0;
2294    if (first_g != 0) {
2295        first_i = first_i * 8 + ctz64(first_g);
2296        last_i = last_i * 8 + 63 - clz64(last_g);
2297        len = last_i - first_i + (1 << esz);
2298        if (vd == vm) {
2299            vm = memcpy(&tmp, vm, opr_sz * 8);
2300        }
2301        swap_memmove(vd, vn + first_i, len);
2302    }
2303    swap_memmove(vd + len, vm, opr_sz * 8 - len);
2304}
2305
2306void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
2307                            void *vg, uint32_t desc)
2308{
2309    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2310    uint64_t *d = vd, *n = vn, *m = vm;
2311    uint8_t *pg = vg;
2312
2313    for (i = 0; i < opr_sz; i += 1) {
2314        uint64_t nn = n[i], mm = m[i];
2315        uint64_t pp = expand_pred_b(pg[H1(i)]);
2316        d[i] = (nn & pp) | (mm & ~pp);
2317    }
2318}
2319
2320void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
2321                            void *vg, uint32_t desc)
2322{
2323    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2324    uint64_t *d = vd, *n = vn, *m = vm;
2325    uint8_t *pg = vg;
2326
2327    for (i = 0; i < opr_sz; i += 1) {
2328        uint64_t nn = n[i], mm = m[i];
2329        uint64_t pp = expand_pred_h(pg[H1(i)]);
2330        d[i] = (nn & pp) | (mm & ~pp);
2331    }
2332}
2333
2334void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
2335                            void *vg, uint32_t desc)
2336{
2337    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2338    uint64_t *d = vd, *n = vn, *m = vm;
2339    uint8_t *pg = vg;
2340
2341    for (i = 0; i < opr_sz; i += 1) {
2342        uint64_t nn = n[i], mm = m[i];
2343        uint64_t pp = expand_pred_s(pg[H1(i)]);
2344        d[i] = (nn & pp) | (mm & ~pp);
2345    }
2346}
2347
2348void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
2349                            void *vg, uint32_t desc)
2350{
2351    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2352    uint64_t *d = vd, *n = vn, *m = vm;
2353    uint8_t *pg = vg;
2354
2355    for (i = 0; i < opr_sz; i += 1) {
2356        uint64_t nn = n[i], mm = m[i];
2357        d[i] = (pg[H1(i)] & 1 ? nn : mm);
2358    }
2359}
2360
2361/* Two operand comparison controlled by a predicate.
2362 * ??? It is very tempting to want to be able to expand this inline
2363 * with x86 instructions, e.g.
2364 *
2365 *    vcmpeqw    zm, zn, %ymm0
2366 *    vpmovmskb  %ymm0, %eax
2367 *    and        $0x5555, %eax
2368 *    and        pg, %eax
2369 *
2370 * or even aarch64, e.g.
2371 *
2372 *    // mask = 4000 1000 0400 0100 0040 0010 0004 0001
2373 *    cmeq       v0.8h, zn, zm
2374 *    and        v0.8h, v0.8h, mask
2375 *    addv       h0, v0.8h
2376 *    and        v0.8b, pg
2377 *
2378 * However, coming up with an abstraction that allows vector inputs and
2379 * a scalar output, and also handles the byte-ordering of sub-uint64_t
2380 * scalar outputs, is tricky.
2381 */
2382#define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK)                                 \
2383uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
2384{                                                                            \
2385    intptr_t opr_sz = simd_oprsz(desc);                                      \
2386    uint32_t flags = PREDTEST_INIT;                                          \
2387    intptr_t i = opr_sz;                                                     \
2388    do {                                                                     \
2389        uint64_t out = 0, pg;                                                \
2390        do {                                                                 \
2391            i -= sizeof(TYPE), out <<= sizeof(TYPE);                         \
2392            TYPE nn = *(TYPE *)(vn + H(i));                                  \
2393            TYPE mm = *(TYPE *)(vm + H(i));                                  \
2394            out |= nn OP mm;                                                 \
2395        } while (i & 63);                                                    \
2396        pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                            \
2397        out &= pg;                                                           \
2398        *(uint64_t *)(vd + (i >> 3)) = out;                                  \
2399        flags = iter_predtest_bwd(out, pg, flags);                           \
2400    } while (i > 0);                                                         \
2401    return flags;                                                            \
2402}
2403
2404#define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
2405    DO_CMP_PPZZ(NAME, TYPE, OP, H1,   0xffffffffffffffffull)
2406#define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
2407    DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
2408#define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
2409    DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
2410#define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
2411    DO_CMP_PPZZ(NAME, TYPE, OP,     , 0x0101010101010101ull)
2412
2413DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t,  ==)
2414DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
2415DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
2416DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)
2417
2418DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t,  !=)
2419DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
2420DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
2421DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)
2422
2423DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t,  >)
2424DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
2425DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
2426DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)
2427
2428DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t,  >=)
2429DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
2430DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
2431DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)
2432
2433DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t,  >)
2434DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
2435DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
2436DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)
2437
2438DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t,  >=)
2439DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
2440DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
2441DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)
2442
2443#undef DO_CMP_PPZZ_B
2444#undef DO_CMP_PPZZ_H
2445#undef DO_CMP_PPZZ_S
2446#undef DO_CMP_PPZZ_D
2447#undef DO_CMP_PPZZ
2448
/* Similar, but the second source is "wide".
 *
 * Expands to a helper that compares every TYPE element of VN, using OP,
 * against the TYPEW value read from the 8-byte chunk of VM at the same
 * offset.  The vector is walked backward from offset OPR_SZ to 0:
 *   - the outer loop produces one 64-bit predicate word per 64 bytes
 *     of vector data;
 *   - the middle loop loads one wide operand MM per 8 bytes;
 *   - the inner loop steps over the narrow elements of those 8 bytes,
 *     shifting OUT left by sizeof(TYPE) and or-ing in the comparison
 *     result, so that each result ends up at the bit whose index is the
 *     element's byte offset within the 64-byte chunk.
 * The word is then and-ed with the governing predicate word from VG
 * (itself reduced by MASK), stored to VD, and folded into the flags by
 * iter_predtest_bwd.  H is the host-endian addressing fixup applied to
 * offsets into VN.  The accumulated flags are returned.
 */
#define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK)                     \
uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                            \
    intptr_t opr_sz = simd_oprsz(desc);                                      \
    uint32_t flags = PREDTEST_INIT;                                          \
    intptr_t i = opr_sz;                                                     \
    do {                                                                     \
        uint64_t out = 0, pg;                                                \
        do {                                                                 \
            TYPEW mm = *(TYPEW *)(vm + i - 8);                               \
            do {                                                             \
                i -= sizeof(TYPE), out <<= sizeof(TYPE);                     \
                TYPE nn = *(TYPE *)(vn + H(i));                              \
                out |= nn OP mm;                                             \
            } while (i & 7);                                                 \
        } while (i & 63);                                                    \
        pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                            \
        out &= pg;                                                           \
        *(uint64_t *)(vd + (i >> 3)) = out;                                  \
        flags = iter_predtest_bwd(out, pg, flags);                           \
    } while (i > 0);                                                         \
    return flags;                                                            \
}
2473
/* Per-element-size wrappers: each selects the host-endian addressing
 * fixup and a predicate mask with one bit set per 1, 2 or 4 bytes.
 */
#define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
    DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1,   0xffffffffffffffffull)
#define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
    DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
#define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
    DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)

/* Equal / not equal: the narrow element has a signed type, so the usual
 * arithmetic conversions sign-extend it before it is compared with the
 * uint64_t wide operand.
 */
DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t,  uint64_t, ==)
DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==)
DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==)

DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t,  uint64_t, !=)
DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=)
DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=)

/* Signed greater than / greater than or equal against int64_t.  */
DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t,   int64_t, >)
DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t,  int64_t, >)
DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t,  int64_t, >)

DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t,   int64_t, >=)
DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t,  int64_t, >=)
DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t,  int64_t, >=)

/* Unsigned higher / higher or same against uint64_t.  */
DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t,  uint64_t, >)
DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)

DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t,  uint64_t, >=)
DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)

/* Signed less than / less than or equal against int64_t.  */
DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t,   int64_t, <)
DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t,  int64_t, <)
DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t,  int64_t, <)

DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t,   int64_t, <=)
DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t,  int64_t, <=)
DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t,  int64_t, <=)

/* Unsigned lower / lower or same against uint64_t.  */
DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t,  uint64_t, <)
DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)

DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t,  uint64_t, <=)
DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)

/* The expander macros are not needed beyond this point.  */
#undef DO_CMP_PPZW_B
#undef DO_CMP_PPZW_H
#undef DO_CMP_PPZW_S
#undef DO_CMP_PPZW
2525
/* Similar, but the second source is immediate.
 *
 * Expands to a helper that compares every TYPE element of VN, using OP,
 * against the immediate MM taken from simd_data(desc) and converted to
 * TYPE.  The vector is walked backward from offset OPR_SZ to 0; for each
 * 64 bytes of vector data the inner loop shifts OUT left by sizeof(TYPE)
 * and ors in one comparison result per element, so that each result ends
 * up at the bit whose index is the element's byte offset within the
 * chunk.  The word is then and-ed with the governing predicate word from
 * VG (itself reduced by MASK), stored to VD, and folded into the flags by
 * iter_predtest_bwd.  H is the host-endian addressing fixup applied to
 * offsets into VN.  The accumulated flags are returned.
 */
#define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK)                         \
uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)   \
{                                                                    \
    intptr_t opr_sz = simd_oprsz(desc);                              \
    uint32_t flags = PREDTEST_INIT;                                  \
    TYPE mm = simd_data(desc);                                       \
    intptr_t i = opr_sz;                                             \
    do {                                                             \
        uint64_t out = 0, pg;                                        \
        do {                                                         \
            i -= sizeof(TYPE), out <<= sizeof(TYPE);                 \
            TYPE nn = *(TYPE *)(vn + H(i));                          \
            out |= nn OP mm;                                         \
        } while (i & 63);                                            \
        pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                    \
        out &= pg;                                                   \
        *(uint64_t *)(vd + (i >> 3)) = out;                          \
        flags = iter_predtest_bwd(out, pg, flags);                   \
    } while (i > 0);                                                 \
    return flags;                                                    \
}
2548
/* Per-element-size wrappers: each selects the host-endian addressing
 * fixup (none for doublewords) and a predicate mask with one bit set
 * per 1, 2, 4 or 8 bytes.
 */
#define DO_CMP_PPZI_B(NAME, TYPE, OP) \
    DO_CMP_PPZI(NAME, TYPE, OP, H1,   0xffffffffffffffffull)
#define DO_CMP_PPZI_H(NAME, TYPE, OP) \
    DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
#define DO_CMP_PPZI_S(NAME, TYPE, OP) \
    DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
#define DO_CMP_PPZI_D(NAME, TYPE, OP) \
    DO_CMP_PPZI(NAME, TYPE, OP,     , 0x0101010101010101ull)

/* Equal / not equal, on unsigned element types.  */
DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t,  ==)
DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)

DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t,  !=)
DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)

/* Signed greater than / greater than or equal.  */
DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t,  >)
DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)

DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t,  >=)
DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)

/* Unsigned higher / higher or same.  */
DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t,  >)
DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)

DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t,  >=)
DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)

/* Signed less than / less than or equal.  */
DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t,  <)
DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)

DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t,  <=)
DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)

/* Unsigned lower / lower or same.  */
DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t,  <)
DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)

DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t,  <=)
DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)

/* The expander macros are not needed beyond this point.  */
#undef DO_CMP_PPZI_B
#undef DO_CMP_PPZI_H
#undef DO_CMP_PPZI_S
#undef DO_CMP_PPZI_D
#undef DO_CMP_PPZI
2613
/* Modelled on the ARM LastActive pseudocode function.
 *
 * Scan the governing predicate VG from its highest 64-bit word downward.
 * At the first non-zero word, return whether VD has the bit set that
 * corresponds to the highest set bit of that VG word.  Return false when
 * VG is entirely zero.  OPRSZ is the predicate size in bytes.
 */
static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
{
    intptr_t off;

    for (off = QEMU_ALIGN_UP(oprsz, 8); off > 0; ) {
        uint64_t guard, data;

        off -= 8;
        guard = *(uint64_t *)(vg + off);
        if (guard == 0) {
            continue;
        }
        data = *(uint64_t *)(vd + off);
        return (pow2floor(guard) & data) != 0;
    }
    return false;
}
2627
/* Compute one 64-bit word of a break mask into *RETB.
 *
 * The mask covers every bit of G up to and including (AFTER) or
 * excluding (!AFTER) the first bit set in both G and N.  BRK is the
 * "break already found" state carried in from lower words: when it is
 * set the mask is zero.  When no bit of G & N is set the mask is G.
 *
 * Returns true if a break has been found in this or an earlier word.
 */
static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
                        bool brk, bool after)
{
    uint64_t active = g & n;      /* guard true, pred true */
    uint64_t first;

    if (brk) {
        /* Break seen in an earlier word: nothing further is active.  */
        *retb = 0;
        return true;
    }
    if (active == 0) {
        /* For all G, no N are set; break not found.  */
        *retb = g;
        return false;
    }

    /* Isolate the lowest set bit, then extend the mask below it.  */
    first = active & -active;
    *retb = after ? (first | (first - 1)) : (first - 1);
    return true;
}
2657
/* Compute a zeroing BRK.
 *
 * For every 64-bit word of the OPRSZ-byte predicate, store into D the
 * break mask derived from N and G, and-ed with G so that bits outside
 * the governing predicate become zero.
 */
static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
                          intptr_t oprsz, bool after)
{
    const intptr_t words = DIV_ROUND_UP(oprsz, 8);
    bool found = false;
    intptr_t w = 0;

    while (w < words) {
        uint64_t guard = g[w];
        uint64_t mask;

        found = compute_brk(&mask, n[w], guard, found, after);
        d[w] = mask & guard;
        w++;
    }
}
2672
2673/* Likewise, but also compute flags.  */
2674static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
2675                               intptr_t oprsz, bool after)
2676{
2677    uint32_t flags = PREDTEST_INIT;
2678    bool brk = false;
2679    intptr_t i;
2680
2681    for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2682        uint64_t this_b, this_d, this_g = g[i];
2683
2684        brk = compute_brk(&this_b, n[i], this_g, brk, after);
2685        d[i] = this_d = this_b & this_g;
2686        flags = iter_predtest_fwd(this_d, this_g, flags);
2687    }
2688    return flags;
2689}
2690
/* Compute a merging BRK.
 *
 * For every 64-bit word of the OPRSZ-byte predicate, bits selected by G
 * receive the break mask derived from N and G; bits not selected by G
 * keep their previous value in D.
 */
static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
                          intptr_t oprsz, bool after)
{
    const intptr_t words = DIV_ROUND_UP(oprsz, 8);
    bool found = false;
    intptr_t w = 0;

    while (w < words) {
        uint64_t guard = g[w];
        uint64_t mask;

        found = compute_brk(&mask, n[w], guard, found, after);
        d[w] = (mask & guard) | (d[w] & ~guard);
        w++;
    }
}
2705
2706/* Likewise, but also compute flags.  */
2707static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
2708                               intptr_t oprsz, bool after)
2709{
2710    uint32_t flags = PREDTEST_INIT;
2711    bool brk = false;
2712    intptr_t i;
2713
2714    for (i = 0; i < oprsz / 8; ++i) {
2715        uint64_t this_b, this_d = d[i], this_g = g[i];
2716
2717        brk = compute_brk(&this_b, n[i], this_g, brk, after);
2718        d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
2719        flags = iter_predtest_fwd(this_d, this_g, flags);
2720    }
2721    return flags;
2722}
2723
2724static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
2725{
2726    /* It is quicker to zero the whole predicate than loop on OPRSZ.
2727     * The compiler should turn this into 4 64-bit integer stores.
2728     */
2729    memset(d, 0, sizeof(ARMPredicateReg));
2730    return PREDTEST_INIT;
2731}
2732
2733void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
2734                       uint32_t pred_desc)
2735{
2736    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2737    if (last_active_pred(vn, vg, oprsz)) {
2738        compute_brk_z(vd, vm, vg, oprsz, true);
2739    } else {
2740        do_zero(vd, oprsz);
2741    }
2742}
2743
2744uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
2745                            uint32_t pred_desc)
2746{
2747    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2748    if (last_active_pred(vn, vg, oprsz)) {
2749        return compute_brks_z(vd, vm, vg, oprsz, true);
2750    } else {
2751        return do_zero(vd, oprsz);
2752    }
2753}
2754
2755void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
2756                       uint32_t pred_desc)
2757{
2758    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2759    if (last_active_pred(vn, vg, oprsz)) {
2760        compute_brk_z(vd, vm, vg, oprsz, false);
2761    } else {
2762        do_zero(vd, oprsz);
2763    }
2764}
2765
2766uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
2767                            uint32_t pred_desc)
2768{
2769    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2770    if (last_active_pred(vn, vg, oprsz)) {
2771        return compute_brks_z(vd, vm, vg, oprsz, false);
2772    } else {
2773        return do_zero(vd, oprsz);
2774    }
2775}
2776
2777void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2778{
2779    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2780    compute_brk_z(vd, vn, vg, oprsz, true);
2781}
2782
2783uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2784{
2785    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2786    return compute_brks_z(vd, vn, vg, oprsz, true);
2787}
2788
2789void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2790{
2791    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2792    compute_brk_z(vd, vn, vg, oprsz, false);
2793}
2794
2795uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2796{
2797    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2798    return compute_brks_z(vd, vn, vg, oprsz, false);
2799}
2800
2801void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2802{
2803    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2804    compute_brk_m(vd, vn, vg, oprsz, true);
2805}
2806
2807uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2808{
2809    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2810    return compute_brks_m(vd, vn, vg, oprsz, true);
2811}
2812
2813void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2814{
2815    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2816    compute_brk_m(vd, vn, vg, oprsz, false);
2817}
2818
2819uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2820{
2821    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2822    return compute_brks_m(vd, vn, vg, oprsz, false);
2823}
2824
2825void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2826{
2827    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2828
2829    if (!last_active_pred(vn, vg, oprsz)) {
2830        do_zero(vd, oprsz);
2831    }
2832}
2833
2834/* As if PredTest(Ones(PL), D, esz).  */
2835static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
2836                              uint64_t esz_mask)
2837{
2838    uint32_t flags = PREDTEST_INIT;
2839    intptr_t i;
2840
2841    for (i = 0; i < oprsz / 8; i++) {
2842        flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
2843    }
2844    if (oprsz & 7) {
2845        uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
2846        flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
2847    }
2848    return flags;
2849}
2850
2851uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2852{
2853    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2854
2855    if (last_active_pred(vn, vg, oprsz)) {
2856        return predtest_ones(vd, oprsz, -1);
2857    } else {
2858        return do_zero(vd, oprsz);
2859    }
2860}
2861
2862uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
2863{
2864    intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2865    intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2866    uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
2867    intptr_t i;
2868
2869    for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2870        uint64_t t = n[i] & g[i] & mask;
2871        sum += ctpop64(t);
2872    }
2873    return sum;
2874}
2875
2876uint32_t HELPER(sve_while)(void *vd, uint32_t count, uint32_t pred_desc)
2877{
2878    uintptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2879    intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2880    uint64_t esz_mask = pred_esz_masks[esz];
2881    ARMPredicateReg *d = vd;
2882    uint32_t flags;
2883    intptr_t i;
2884
2885    /* Begin with a zero predicate register.  */
2886    flags = do_zero(d, oprsz);
2887    if (count == 0) {
2888        return flags;
2889    }
2890
2891    /* Set all of the requested bits.  */
2892    for (i = 0; i < count / 64; ++i) {
2893        d->p[i] = esz_mask;
2894    }
2895    if (count & 63) {
2896        d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
2897    }
2898
2899    return predtest_ones(d, oprsz, esz_mask);
2900}
2901
/* Recursive reduction on a function;
 * C.f. the ARM ARM function ReducePredicated.
 *
 * While it would be possible to write this without the DATA temporary,
 * it is much simpler to process the predicate register this way.
 * The recursion is bounded to depth 7 (128 fp16 elements), so there's
 * little to gain with a more complex non-recursive form.
 *
 * The macro expands to two functions:
 *   - NAME_reduce combines N elements of DATA pairwise by splitting the
 *     range in half and applying TYPE_FUNC to the two partial results.
 *     NOTE(review): the halving assumes N is a power of two -- confirm
 *     that maxsz / sizeof(TYPE) always is.
 *   - HELPER(NAME) copies VN into the DATA temporary, 16 bytes per
 *     16-bit predicate chunk, replacing each element whose predicate
 *     bit is clear with IDENT.  Elements from OPRSZ up to MAXSZ are
 *     also filled with IDENT, and the MAXSZ / sizeof(TYPE) elements are
 *     then reduced.  VS is passed through as the float_status.
 * H is the host-endian addressing fixup applied to offsets into VN.
 */
#define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT)                         \
static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
{                                                                     \
    if (n == 1) {                                                     \
        return *data;                                                 \
    } else {                                                          \
        uintptr_t half = n / 2;                                       \
        TYPE lo = NAME##_reduce(data, status, half);                  \
        TYPE hi = NAME##_reduce(data + half, status, half);           \
        return TYPE##_##FUNC(lo, hi, status);                         \
    }                                                                 \
}                                                                     \
uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc)    \
{                                                                     \
    uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_maxsz(desc);  \
    TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)];                   \
    for (i = 0; i < oprsz; ) {                                        \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));               \
        do {                                                          \
            TYPE nn = *(TYPE *)(vn + H(i));                           \
            *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT);      \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                   \
        } while (i & 15);                                             \
    }                                                                 \
    for (; i < maxsz; i += sizeof(TYPE)) {                            \
        *(TYPE *)((void *)data + i) = IDENT;                          \
    }                                                                 \
    return NAME##_reduce(data, vs, maxsz / sizeof(TYPE));             \
}
2939
/* Sum reduction: inactive elements contribute zero.  */
DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero)
DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero)
DO_REDUCE(sve_faddv_d, float64,     , add, float64_zero)

/* Identity is floatN_default_nan, without the function call.  */
DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00)
DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000)
DO_REDUCE(sve_fminnmv_d, float64,     , minnum, 0x7FF8000000000000ULL)

DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00)
DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000)
DO_REDUCE(sve_fmaxnmv_d, float64,     , maxnum, 0x7FF8000000000000ULL)

/* Minimum reduction: inactive elements contribute +infinity.  */
DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity)
DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity)
DO_REDUCE(sve_fminv_d, float64,     , min, float64_infinity)

/* Maximum reduction: inactive elements contribute -infinity.  */
DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity))
DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity))
DO_REDUCE(sve_fmaxv_d, float64,     , max, float64_chs(float64_infinity))

#undef DO_REDUCE
2962
2963uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
2964                             void *status, uint32_t desc)
2965{
2966    intptr_t i = 0, opr_sz = simd_oprsz(desc);
2967    float16 result = nn;
2968
2969    do {
2970        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
2971        do {
2972            if (pg & 1) {
2973                float16 mm = *(float16 *)(vm + H1_2(i));
2974                result = float16_add(result, mm, status);
2975            }
2976            i += sizeof(float16), pg >>= sizeof(float16);
2977        } while (i & 15);
2978    } while (i < opr_sz);
2979
2980    return result;
2981}
2982
2983uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
2984                             void *status, uint32_t desc)
2985{
2986    intptr_t i = 0, opr_sz = simd_oprsz(desc);
2987    float32 result = nn;
2988
2989    do {
2990        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
2991        do {
2992            if (pg & 1) {
2993                float32 mm = *(float32 *)(vm + H1_2(i));
2994                result = float32_add(result, mm, status);
2995            }
2996            i += sizeof(float32), pg >>= sizeof(float32);
2997        } while (i & 15);
2998    } while (i < opr_sz);
2999
3000    return result;
3001}
3002
3003uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
3004                             void *status, uint32_t desc)
3005{
3006    intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
3007    uint64_t *m = vm;
3008    uint8_t *pg = vg;
3009
3010    for (i = 0; i < opr_sz; i++) {
3011        if (pg[H1(i)] & 1) {
3012            nn = float64_add(nn, m[i], status);
3013        }
3014    }
3015
3016    return nn;
3017}
3018
/* Fully general three-operand expander, controlled by a predicate,
 * With the extra float_status parameter.
 *
 * The vector is walked backward from offset simd_oprsz(desc) to 0.  The
 * outer loop fetches the 64-bit predicate word that governs the current
 * 64 bytes of vector data; the inner loop steps down one TYPE element
 * at a time and tests the predicate bit whose index is the element's
 * byte offset within that 64-byte chunk.  For an active element,
 * VD[i] = OP(VN[i], VM[i], status); inactive elements of VD are left
 * unmodified.  H is the host-endian addressing fixup.
 */
#define DO_ZPZZ_FP(NAME, TYPE, H, OP)                           \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,       \
                  void *status, uint32_t desc)                  \
{                                                               \
    intptr_t i = simd_oprsz(desc);                              \
    uint64_t *g = vg;                                           \
    do {                                                        \
        uint64_t pg = g[(i - 1) >> 6];                          \
        do {                                                    \
            i -= sizeof(TYPE);                                  \
            if (likely((pg >> (i & 63)) & 1)) {                 \
                TYPE nn = *(TYPE *)(vn + H(i));                 \
                TYPE mm = *(TYPE *)(vm + H(i));                 \
                *(TYPE *)(vd + H(i)) = OP(nn, mm, status);      \
            }                                                   \
        } while (i & 63);                                       \
    } while (i != 0);                                           \
}
3040
/* Predicated element-wise add, subtract, multiply and divide.  */
DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
DO_ZPZZ_FP(sve_fadd_d, uint64_t,     , float64_add)

DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
DO_ZPZZ_FP(sve_fsub_d, uint64_t,     , float64_sub)

DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
DO_ZPZZ_FP(sve_fmul_d, uint64_t,     , float64_mul)

DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
DO_ZPZZ_FP(sve_fdiv_d, uint64_t,     , float64_div)

/* Predicated element-wise minimum and maximum (min/max forms).  */
DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
DO_ZPZZ_FP(sve_fmin_d, uint64_t,     , float64_min)

DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
DO_ZPZZ_FP(sve_fmax_d, uint64_t,     , float64_max)

/* Predicated element-wise minimum and maximum (minnum/maxnum forms).  */
DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
DO_ZPZZ_FP(sve_fminnum_d, uint64_t,     , float64_minnum)

DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t,     , float64_maxnum)
3072
3073static inline float16 abd_h(float16 a, float16 b, float_status *s)
3074{
3075    return float16_abs(float16_sub(a, b, s));
3076}
3077
3078static inline float32 abd_s(float32 a, float32 b, float_status *s)
3079{
3080    return float32_abs(float32_sub(a, b, s));
3081}
3082
3083static inline float64 abd_d(float64 a, float64 b, float_status *s)
3084{
3085    return float64_abs(float64_sub(a, b, s));
3086}
3087
/* Predicated element-wise absolute difference, via the abd_* wrappers.  */
DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
DO_ZPZZ_FP(sve_fabd_d, uint64_t,     , abd_d)
3091
3092static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
3093{
3094    int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
3095    return float64_scalbn(a, b_int, s);
3096}
3097
/* Predicated scale by power of two.  The element TYPE is signed here, so
 * the second operand is read as a signed integer.
 * NOTE(review): presumably so that the scale sign-extends when passed as
 * the int argument of floatN_scalbn -- confirm against softfloat.
 */
DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
DO_ZPZZ_FP(sve_fscalbn_d, int64_t,     , scalbn_d)

/* Predicated multiply-extended, reusing the existing mulx helpers.  */
DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
DO_ZPZZ_FP(sve_fmulx_d, uint64_t,     , helper_vfp_mulxd)

#undef DO_ZPZZ_FP
3107
/* Three-operand expander, with one scalar operand, controlled by
 * a predicate, with the extra float_status parameter.
 *
 * SCALAR is converted to TYPE once and used as the second operand for
 * every element.  The vector is walked backward from offset
 * simd_oprsz(desc) to 0; the outer loop fetches the 64-bit predicate
 * word governing the current 64 bytes, and the inner loop tests the
 * predicate bit whose index is the element's byte offset within that
 * chunk.  For an active element, VD[i] = OP(VN[i], scalar, status);
 * inactive elements of VD are left unmodified.  H is the host-endian
 * addressing fixup.
 */
#define DO_ZPZS_FP(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar,  \
                  void *status, uint32_t desc)                    \
{                                                                 \
    intptr_t i = simd_oprsz(desc);                                \
    uint64_t *g = vg;                                             \
    TYPE mm = scalar;                                             \
    do {                                                          \
        uint64_t pg = g[(i - 1) >> 6];                            \
        do {                                                      \
            i -= sizeof(TYPE);                                    \
            if (likely((pg >> (i & 63)) & 1)) {                   \
                TYPE nn = *(TYPE *)(vn + H(i));                   \
                *(TYPE *)(vd + H(i)) = OP(nn, mm, status);        \
            }                                                     \
        } while (i & 63);                                         \
    } while (i != 0);                                             \
}
3129
/* Predicated add, subtract and multiply of each element with a scalar.  */
DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
DO_ZPZS_FP(sve_fadds_d, float64,     , float64_add)

DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
DO_ZPZS_FP(sve_fsubs_d, float64,     , float64_sub)

DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
DO_ZPZS_FP(sve_fmuls_d, float64,     , float64_mul)
3141
3142static inline float16 subr_h(float16 a, float16 b, float_status *s)
3143{
3144    return float16_sub(b, a, s);
3145}
3146
3147static inline float32 subr_s(float32 a, float32 b, float_status *s)
3148{
3149    return float32_sub(b, a, s);
3150}
3151
3152static inline float64 subr_d(float64 a, float64 b, float_status *s)
3153{
3154    return float64_sub(b, a, s);
3155}
3156
/* Predicated reversed subtract: scalar minus element.  */
DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
DO_ZPZS_FP(sve_fsubrs_d, float64,     , subr_d)

/* Predicated maximum / minimum with a scalar (maxnum/minnum forms).  */
DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
DO_ZPZS_FP(sve_fmaxnms_d, float64,     , float64_maxnum)

DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
DO_ZPZS_FP(sve_fminnms_d, float64,     , float64_minnum)

/* Predicated maximum / minimum with a scalar (max/min forms).  */
DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
DO_ZPZS_FP(sve_fmaxs_d, float64,     , float64_max)

DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
DO_ZPZS_FP(sve_fmins_d, float64,     , float64_min)
3176
/* Fully general two-operand expander, controlled by a predicate,
 * With the extra float_status parameter.
 *
 * The vector is walked backward from offset simd_oprsz(desc) to 0.  The
 * outer loop fetches the 64-bit predicate word that governs the current
 * 64 bytes of vector data; the inner loop steps down one TYPE element
 * at a time and tests the predicate bit whose index is the element's
 * byte offset within that 64-byte chunk.  For an active element,
 * VD[i] = OP(VN[i], status); inactive elements of VD are left
 * unmodified.  H is the host-endian addressing fixup.
 */
#define DO_ZPZ_FP(NAME, TYPE, H, OP)                                  \
void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
{                                                                     \
    intptr_t i = simd_oprsz(desc);                                    \
    uint64_t *g = vg;                                                 \
    do {                                                              \
        uint64_t pg = g[(i - 1) >> 6];                                \
        do {                                                          \
            i -= sizeof(TYPE);                                        \
            if (likely((pg >> (i & 63)) & 1)) {                       \
                TYPE nn = *(TYPE *)(vn + H(i));                       \
                *(TYPE *)(vd + H(i)) = OP(nn, status);                \
            }                                                         \
        } while (i & 63);                                             \
    } while (i != 0);                                                 \
}
3196
3197/* SVE fp16 conversions always use IEEE mode.  Like AdvSIMD, they ignore
3198 * FZ16.  When converting from fp16, this affects flushing input denormals;
3199 * when converting to fp16, this affects flushing output denormals.
3200 */
3201static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
3202{
3203    flag save = get_flush_inputs_to_zero(fpst);
3204    float32 ret;
3205
3206    set_flush_inputs_to_zero(false, fpst);
3207    ret = float16_to_float32(f, true, fpst);
3208    set_flush_inputs_to_zero(save, fpst);
3209    return ret;
3210}
3211
3212static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
3213{
3214    flag save = get_flush_inputs_to_zero(fpst);
3215    float64 ret;
3216
3217    set_flush_inputs_to_zero(false, fpst);
3218    ret = float16_to_float64(f, true, fpst);
3219    set_flush_inputs_to_zero(save, fpst);
3220    return ret;
3221}
3222
3223static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
3224{
3225    flag save = get_flush_to_zero(fpst);
3226    float16 ret;
3227
3228    set_flush_to_zero(false, fpst);
3229    ret = float32_to_float16(f, true, fpst);
3230    set_flush_to_zero(save, fpst);
3231    return ret;
3232}
3233
3234static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
3235{
3236    flag save = get_flush_to_zero(fpst);
3237    float16 ret;
3238
3239    set_flush_to_zero(false, fpst);
3240    ret = float64_to_float16(f, true, fpst);
3241    set_flush_to_zero(save, fpst);
3242    return ret;
3243}
3244
3245static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
3246{
3247    if (float16_is_any_nan(f)) {
3248        float_raise(float_flag_invalid, s);
3249        return 0;
3250    }
3251    return float16_to_int16_round_to_zero(f, s);
3252}
3253
3254static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
3255{
3256    if (float16_is_any_nan(f)) {
3257        float_raise(float_flag_invalid, s);
3258        return 0;
3259    }
3260    return float16_to_int64_round_to_zero(f, s);
3261}
3262
3263static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
3264{
3265    if (float32_is_any_nan(f)) {
3266        float_raise(float_flag_invalid, s);
3267        return 0;
3268    }
3269    return float32_to_int64_round_to_zero(f, s);
3270}
3271
3272static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
3273{
3274    if (float64_is_any_nan(f)) {
3275        float_raise(float_flag_invalid, s);
3276        return 0;
3277    }
3278    return float64_to_int64_round_to_zero(f, s);
3279}
3280
3281static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
3282{
3283    if (float16_is_any_nan(f)) {
3284        float_raise(float_flag_invalid, s);
3285        return 0;
3286    }
3287    return float16_to_uint16_round_to_zero(f, s);
3288}
3289
3290static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
3291{
3292    if (float16_is_any_nan(f)) {
3293        float_raise(float_flag_invalid, s);
3294        return 0;
3295    }
3296    return float16_to_uint64_round_to_zero(f, s);
3297}
3298
3299static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
3300{
3301    if (float32_is_any_nan(f)) {
3302        float_raise(float_flag_invalid, s);
3303        return 0;
3304    }
3305    return float32_to_uint64_round_to_zero(f, s);
3306}
3307
3308static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
3309{
3310    if (float64_is_any_nan(f)) {
3311        float_raise(float_flag_invalid, s);
3312        return 0;
3313    }
3314    return float64_to_uint64_round_to_zero(f, s);
3315}
3316
/*
 * Instantiate the predicated unary FP helpers.  The arguments are the
 * helper name, the integer type that holds one element, the host-endian
 * index fixup for that element size (left empty for 64-bit elements),
 * and the per-element operation, which is invoked as OP(element, status).
 */

/* FCVT: conversions between floating-point formats.  */
DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
DO_ZPZ_FP(sve_fcvt_dh, uint64_t,     , sve_f64_to_f16)
DO_ZPZ_FP(sve_fcvt_hd, uint64_t,     , sve_f16_to_f64)
DO_ZPZ_FP(sve_fcvt_ds, uint64_t,     , float64_to_float32)
DO_ZPZ_FP(sve_fcvt_sd, uint64_t,     , float32_to_float64)

/* FCVTZS: floating-point to signed integer.  */
DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t,     , vfp_float16_to_int64_rtz)
DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t,     , vfp_float32_to_int64_rtz)
DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t,     , helper_vfp_tosizd)
DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t,     , vfp_float64_to_int64_rtz)

/* FCVTZU: floating-point to unsigned integer.  */
DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t,     , vfp_float16_to_uint64_rtz)
DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t,     , vfp_float32_to_uint64_rtz)
DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t,     , helper_vfp_touizd)
DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t,     , vfp_float64_to_uint64_rtz)

/* FRINT: implemented with the rint helpers.  */
DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
DO_ZPZ_FP(sve_frint_d, uint64_t,     , helper_rintd)

/* FRINTX: implemented with the softfloat round_to_int routines.  */
DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
DO_ZPZ_FP(sve_frintx_d, uint64_t,     , float64_round_to_int)

/* FRECPX.  */
DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
DO_ZPZ_FP(sve_frecpx_d, uint64_t,     , helper_frecpx_f64)

/* FSQRT.  */
DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
DO_ZPZ_FP(sve_fsqrt_d, uint64_t,     , float64_sqrt)

/* SCVTF: signed integer to floating-point.  */
DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
DO_ZPZ_FP(sve_scvt_sd, uint64_t,     , int32_to_float64)
DO_ZPZ_FP(sve_scvt_dh, uint64_t,     , int64_to_float16)
DO_ZPZ_FP(sve_scvt_ds, uint64_t,     , int64_to_float32)
DO_ZPZ_FP(sve_scvt_dd, uint64_t,     , int64_to_float64)

/* UCVTF: unsigned integer to floating-point.  */
DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
DO_ZPZ_FP(sve_ucvt_sd, uint64_t,     , uint32_to_float64)
DO_ZPZ_FP(sve_ucvt_dh, uint64_t,     , uint64_to_float16)
DO_ZPZ_FP(sve_ucvt_ds, uint64_t,     , uint64_to_float32)
DO_ZPZ_FP(sve_ucvt_dd, uint64_t,     , uint64_to_float64)

/* All unary FP helpers have been generated.  */
#undef DO_ZPZ_FP
3373
/* 4-operand predicated multiply-add.  This requires 7 operands to pass
 * "properly", so we need to encode some of the registers into DESC.
 *
 * The helpers below unpack four 5-bit register numbers (rd, rn, rm, ra)
 * from DESC starting at bit SIMD_DATA_SHIFT; the build-time check makes
 * sure those 20 bits fit inside the 32-bit descriptor.
 */
QEMU_BUILD_BUG_ON(SIMD_DATA_SHIFT + 20 > 32);
3378
3379static void do_fmla_zpzzz_h(CPUARMState *env, void *vg, uint32_t desc,
3380                            uint16_t neg1, uint16_t neg3)
3381{
3382    intptr_t i = simd_oprsz(desc);
3383    unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3384    unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3385    unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3386    unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3387    void *vd = &env->vfp.zregs[rd];
3388    void *vn = &env->vfp.zregs[rn];
3389    void *vm = &env->vfp.zregs[rm];
3390    void *va = &env->vfp.zregs[ra];
3391    uint64_t *g = vg;
3392
3393    do {
3394        uint64_t pg = g[(i - 1) >> 6];
3395        do {
3396            i -= 2;
3397            if (likely((pg >> (i & 63)) & 1)) {
3398                float16 e1, e2, e3, r;
3399
3400                e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
3401                e2 = *(uint16_t *)(vm + H1_2(i));
3402                e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
3403                r = float16_muladd(e1, e2, e3, 0, &env->vfp.fp_status_f16);
3404                *(uint16_t *)(vd + H1_2(i)) = r;
3405            }
3406        } while (i & 63);
3407    } while (i != 0);
3408}
3409
3410void HELPER(sve_fmla_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
3411{
3412    do_fmla_zpzzz_h(env, vg, desc, 0, 0);
3413}
3414
3415void HELPER(sve_fmls_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
3416{
3417    do_fmla_zpzzz_h(env, vg, desc, 0x8000, 0);
3418}
3419
3420void HELPER(sve_fnmla_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
3421{
3422    do_fmla_zpzzz_h(env, vg, desc, 0x8000, 0x8000);
3423}
3424
3425void HELPER(sve_fnmls_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
3426{
3427    do_fmla_zpzzz_h(env, vg, desc, 0, 0x8000);
3428}
3429
3430static void do_fmla_zpzzz_s(CPUARMState *env, void *vg, uint32_t desc,
3431                            uint32_t neg1, uint32_t neg3)
3432{
3433    intptr_t i = simd_oprsz(desc);
3434    unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3435    unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3436    unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3437    unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3438    void *vd = &env->vfp.zregs[rd];
3439    void *vn = &env->vfp.zregs[rn];
3440    void *vm = &env->vfp.zregs[rm];
3441    void *va = &env->vfp.zregs[ra];
3442    uint64_t *g = vg;
3443
3444    do {
3445        uint64_t pg = g[(i - 1) >> 6];
3446        do {
3447            i -= 4;
3448            if (likely((pg >> (i & 63)) & 1)) {
3449                float32 e1, e2, e3, r;
3450
3451                e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
3452                e2 = *(uint32_t *)(vm + H1_4(i));
3453                e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
3454                r = float32_muladd(e1, e2, e3, 0, &env->vfp.fp_status);
3455                *(uint32_t *)(vd + H1_4(i)) = r;
3456            }
3457        } while (i & 63);
3458    } while (i != 0);
3459}
3460
3461void HELPER(sve_fmla_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3462{
3463    do_fmla_zpzzz_s(env, vg, desc, 0, 0);
3464}
3465
3466void HELPER(sve_fmls_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3467{
3468    do_fmla_zpzzz_s(env, vg, desc, 0x80000000, 0);
3469}
3470
3471void HELPER(sve_fnmla_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3472{
3473    do_fmla_zpzzz_s(env, vg, desc, 0x80000000, 0x80000000);
3474}
3475
3476void HELPER(sve_fnmls_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3477{
3478    do_fmla_zpzzz_s(env, vg, desc, 0, 0x80000000);
3479}
3480
3481static void do_fmla_zpzzz_d(CPUARMState *env, void *vg, uint32_t desc,
3482                            uint64_t neg1, uint64_t neg3)
3483{
3484    intptr_t i = simd_oprsz(desc);
3485    unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3486    unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3487    unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3488    unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3489    void *vd = &env->vfp.zregs[rd];
3490    void *vn = &env->vfp.zregs[rn];
3491    void *vm = &env->vfp.zregs[rm];
3492    void *va = &env->vfp.zregs[ra];
3493    uint64_t *g = vg;
3494
3495    do {
3496        uint64_t pg = g[(i - 1) >> 6];
3497        do {
3498            i -= 8;
3499            if (likely((pg >> (i & 63)) & 1)) {
3500                float64 e1, e2, e3, r;
3501
3502                e1 = *(uint64_t *)(vn + i) ^ neg1;
3503                e2 = *(uint64_t *)(vm + i);
3504                e3 = *(uint64_t *)(va + i) ^ neg3;
3505                r = float64_muladd(e1, e2, e3, 0, &env->vfp.fp_status);
3506                *(uint64_t *)(vd + i) = r;
3507            }
3508        } while (i & 63);
3509    } while (i != 0);
3510}
3511
3512void HELPER(sve_fmla_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3513{
3514    do_fmla_zpzzz_d(env, vg, desc, 0, 0);
3515}
3516
3517void HELPER(sve_fmls_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3518{
3519    do_fmla_zpzzz_d(env, vg, desc, INT64_MIN, 0);
3520}
3521
3522void HELPER(sve_fnmla_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3523{
3524    do_fmla_zpzzz_d(env, vg, desc, INT64_MIN, INT64_MIN);
3525}
3526
3527void HELPER(sve_fnmls_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3528{
3529    do_fmla_zpzzz_d(env, vg, desc, 0, INT64_MIN);
3530}
3531
/* Predicated floating-point comparison of two vectors.
 *
 * The comparison may have side effects on the FPSR, so, unlike the
 * integer version, operands must not be compared optimistically: OP is
 * evaluated only for elements whose governing predicate bit is set.
 * Every 64-bit word of the destination predicate is rebuilt from zero,
 * from its topmost element downwards, so inactive elements yield 0.
 */
#define DO_FPCMP_PPZZ(NAME, TYPE, H, OP)                                \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,               \
                  void *status, uint32_t desc)                          \
{                                                                       \
    uint64_t *dst = vd, *pred = vg;                                     \
    intptr_t ofs = simd_oprsz(desc);                                    \
    intptr_t word = (ofs - 1) >> 6;                                     \
    do {                                                                \
        uint64_t active = pred[word];                                   \
        uint64_t result = 0;                                            \
        do {                                                            \
            ofs -= sizeof(TYPE);                                        \
            result <<= sizeof(TYPE);                                    \
            if (likely((active >> (ofs & 63)) & 1)) {                   \
                TYPE lhs = *(TYPE *)(vn + H(ofs));                      \
                TYPE rhs = *(TYPE *)(vm + H(ofs));                      \
                result |= OP(TYPE, lhs, rhs, status);                   \
            }                                                           \
        } while (ofs & 63);                                             \
        dst[word] = result;                                             \
        word--;                                                         \
    } while (ofs > 0);                                                  \
}
3556
/* Per-element-size instantiations of DO_FPCMP_PPZZ: the helper name gets
 * an _h/_s/_d suffix and the matching float type and index fixup (none
 * for 64-bit elements).  DO_FPCMP_PPZZ_ALL emits all three at once.
 */
#define DO_FPCMP_PPZZ_H(FN, CMP) \
    DO_FPCMP_PPZZ(FN##_h, float16, H1_2, CMP)
#define DO_FPCMP_PPZZ_S(FN, CMP) \
    DO_FPCMP_PPZZ(FN##_s, float32, H1_4, CMP)
#define DO_FPCMP_PPZZ_D(FN, CMP) \
    DO_FPCMP_PPZZ(FN##_d, float64,     , CMP)

#define DO_FPCMP_PPZZ_ALL(FN, CMP) \
    DO_FPCMP_PPZZ_H(FN, CMP)       \
    DO_FPCMP_PPZZ_S(FN, CMP)       \
    DO_FPCMP_PPZZ_D(FN, CMP)
3568
/* Per-element comparison operations: each yields 1 when the relation
 * between X and Y holds and 0 otherwise.  GE/GT are expressed by
 * swapping the operands of the signalling compare; EQ/NE/UO use the
 * quiet compare; FACGE/FACGT compare absolute values.
 *
 * Every expansion is fully parenthesized so that it behaves as a single
 * operand whatever expression it is embedded in (e.g. under '!', '+'
 * or '&').
 */
#define DO_FCMGE(TYPE, X, Y, ST)  (TYPE##_compare(Y, X, ST) <= 0)
#define DO_FCMGT(TYPE, X, Y, ST)  (TYPE##_compare(Y, X, ST) < 0)
#define DO_FCMLE(TYPE, X, Y, ST)  (TYPE##_compare(X, Y, ST) <= 0)
#define DO_FCMLT(TYPE, X, Y, ST)  (TYPE##_compare(X, Y, ST) < 0)
#define DO_FCMEQ(TYPE, X, Y, ST)  (TYPE##_compare_quiet(X, Y, ST) == 0)
#define DO_FCMNE(TYPE, X, Y, ST)  (TYPE##_compare_quiet(X, Y, ST) != 0)
#define DO_FCMUO(TYPE, X, Y, ST)  \
    (TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered)
#define DO_FACGE(TYPE, X, Y, ST)  \
    (TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0)
#define DO_FACGT(TYPE, X, Y, ST)  \
    (TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0)
3581
/* Generate the _h, _s and _d helpers for each two-operand comparison.  */
DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)

/* Only the generator macros are dropped here; the DO_FCM* and DO_FAC*
 * operation macros stay defined for the compare-against-zero helpers.
 */
#undef DO_FPCMP_PPZZ_ALL
#undef DO_FPCMP_PPZZ_D
#undef DO_FPCMP_PPZZ_S
#undef DO_FPCMP_PPZZ_H
#undef DO_FPCMP_PPZZ
3595
/* Predicated floating-point comparison of one vector against zero.
 *
 * OP is evaluated as OP(TYPE, element, 0, status), and only for elements
 * whose governing predicate bit is set.  Every 64-bit word of the
 * destination predicate is rebuilt from zero, from its topmost element
 * downwards, so inactive elements yield 0.
 */
#define DO_FPCMP_PPZ0(NAME, TYPE, H, OP)                   \
void HELPER(NAME)(void *vd, void *vn, void *vg,            \
                  void *status, uint32_t desc)             \
{                                                          \
    uint64_t *dst = vd, *pred = vg;                        \
    intptr_t ofs = simd_oprsz(desc);                       \
    intptr_t word = (ofs - 1) >> 6;                        \
    do {                                                   \
        uint64_t active = pred[word];                      \
        uint64_t result = 0;                               \
        do {                                               \
            ofs -= sizeof(TYPE);                           \
            result <<= sizeof(TYPE);                       \
            if ((active >> (ofs & 63)) & 1) {              \
                TYPE elt = *(TYPE *)(vn + H(ofs));         \
                result |= OP(TYPE, elt, 0, status);        \
            }                                              \
        } while (ofs & 63);                                \
        dst[word] = result;                                \
        word--;                                            \
    } while (ofs > 0);                                     \
}
3617
/* Per-element-size instantiations of DO_FPCMP_PPZ0: the helper name gets
 * an _h/_s/_d suffix and the matching float type and index fixup (none
 * for 64-bit elements).
 */
#define DO_FPCMP_PPZ0_H(NAME, OP) \
    DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
#define DO_FPCMP_PPZ0_S(NAME, OP) \
    DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
#define DO_FPCMP_PPZ0_D(NAME, OP) \
    DO_FPCMP_PPZ0(NAME##_d, float64,     , OP)

/* Emit all three element sizes for one comparison.  */
#define DO_FPCMP_PPZ0_ALL(NAME, OP) \
    DO_FPCMP_PPZ0_H(NAME, OP)   \
    DO_FPCMP_PPZ0_S(NAME, OP)   \
    DO_FPCMP_PPZ0_D(NAME, OP)

/* Compare-against-zero helpers; these reuse the two-operand operation
 * macros with 0 as the second operand.
 */
DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
3636
3637/* FP Trig Multiply-Add. */
3638
3639void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3640{
3641    static const float16 coeff[16] = {
3642        0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
3643        0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
3644    };
3645    intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
3646    intptr_t x = simd_data(desc);
3647    float16 *d = vd, *n = vn, *m = vm;
3648    for (i = 0; i < opr_sz; i++) {
3649        float16 mm = m[i];
3650        intptr_t xx = x;
3651        if (float16_is_neg(mm)) {
3652            mm = float16_abs(mm);
3653            xx += 8;
3654        }
3655        d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs);
3656    }
3657}
3658
3659void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3660{
3661    static const float32 coeff[16] = {
3662        0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
3663        0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
3664        0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
3665        0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
3666    };
3667    intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
3668    intptr_t x = simd_data(desc);
3669    float32 *d = vd, *n = vn, *m = vm;
3670    for (i = 0; i < opr_sz; i++) {
3671        float32 mm = m[i];
3672        intptr_t xx = x;
3673        if (float32_is_neg(mm)) {
3674            mm = float32_abs(mm);
3675            xx += 8;
3676        }
3677        d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs);
3678    }
3679}
3680
3681void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3682{
3683    static const float64 coeff[16] = {
3684        0x3ff0000000000000ull, 0xbfc5555555555543ull,
3685        0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
3686        0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
3687        0x3de5d8408868552full, 0x0000000000000000ull,
3688        0x3ff0000000000000ull, 0xbfe0000000000000ull,
3689        0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
3690        0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
3691        0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
3692    };
3693    intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
3694    intptr_t x = simd_data(desc);
3695    float64 *d = vd, *n = vn, *m = vm;
3696    for (i = 0; i < opr_sz; i++) {
3697        float64 mm = m[i];
3698        intptr_t xx = x;
3699        if (float64_is_neg(mm)) {
3700            mm = float64_abs(mm);
3701            xx += 8;
3702        }
3703        d[i] = float64_muladd(n[i], mm, coeff[xx], 0, vs);
3704    }
3705}
3706
3707/*
3708 * FP Complex Add
3709 */
3710
3711void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
3712                         void *vs, uint32_t desc)
3713{
3714    intptr_t j, i = simd_oprsz(desc);
3715    uint64_t *g = vg;
3716    float16 neg_imag = float16_set_sign(0, simd_data(desc));
3717    float16 neg_real = float16_chs(neg_imag);
3718
3719    do {
3720        uint64_t pg = g[(i - 1) >> 6];
3721        do {
3722            float16 e0, e1, e2, e3;
3723
3724            /* I holds the real index; J holds the imag index.  */
3725            j = i - sizeof(float16);
3726            i -= 2 * sizeof(float16);
3727
3728            e0 = *(float16 *)(vn + H1_2(i));
3729            e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real;
3730            e2 = *(float16 *)(vn + H1_2(j));
3731            e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag;
3732
3733            if (likely((pg >> (i & 63)) & 1)) {
3734                *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs);
3735            }
3736            if (likely((pg >> (j & 63)) & 1)) {
3737                *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs);
3738            }
3739        } while (i & 63);
3740    } while (i != 0);
3741}
3742
3743void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
3744                         void *vs, uint32_t desc)
3745{
3746    intptr_t j, i = simd_oprsz(desc);
3747    uint64_t *g = vg;
3748    float32 neg_imag = float32_set_sign(0, simd_data(desc));
3749    float32 neg_real = float32_chs(neg_imag);
3750
3751    do {
3752        uint64_t pg = g[(i - 1) >> 6];
3753        do {
3754            float32 e0, e1, e2, e3;
3755
3756            /* I holds the real index; J holds the imag index.  */
3757            j = i - sizeof(float32);
3758            i -= 2 * sizeof(float32);
3759
3760            e0 = *(float32 *)(vn + H1_2(i));
3761            e1 = *(float32 *)(vm + H1_2(j)) ^ neg_real;
3762            e2 = *(float32 *)(vn + H1_2(j));
3763            e3 = *(float32 *)(vm + H1_2(i)) ^ neg_imag;
3764
3765            if (likely((pg >> (i & 63)) & 1)) {
3766                *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, vs);
3767            }
3768            if (likely((pg >> (j & 63)) & 1)) {
3769                *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, vs);
3770            }
3771        } while (i & 63);
3772    } while (i != 0);
3773}
3774
3775void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
3776                         void *vs, uint32_t desc)
3777{
3778    intptr_t j, i = simd_oprsz(desc);
3779    uint64_t *g = vg;
3780    float64 neg_imag = float64_set_sign(0, simd_data(desc));
3781    float64 neg_real = float64_chs(neg_imag);
3782
3783    do {
3784        uint64_t pg = g[(i - 1) >> 6];
3785        do {
3786            float64 e0, e1, e2, e3;
3787
3788            /* I holds the real index; J holds the imag index.  */
3789            j = i - sizeof(float64);
3790            i -= 2 * sizeof(float64);
3791
3792            e0 = *(float64 *)(vn + H1_2(i));
3793            e1 = *(float64 *)(vm + H1_2(j)) ^ neg_real;
3794            e2 = *(float64 *)(vn + H1_2(j));
3795            e3 = *(float64 *)(vm + H1_2(i)) ^ neg_imag;
3796
3797            if (likely((pg >> (i & 63)) & 1)) {
3798                *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, vs);
3799            }
3800            if (likely((pg >> (j & 63)) & 1)) {
3801                *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, vs);
3802            }
3803        } while (i & 63);
3804    } while (i != 0);
3805}
3806
/*
 * FP Complex Multiply
 *
 * The helpers below unpack four 5-bit register numbers plus a 2-bit
 * rotation from DESC starting at bit SIMD_DATA_SHIFT; the build-time
 * check makes sure those 22 bits fit inside the 32-bit descriptor.
 */

QEMU_BUILD_BUG_ON(SIMD_DATA_SHIFT + 22 > 32);
3812
3813void HELPER(sve_fcmla_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
3814{
3815    intptr_t j, i = simd_oprsz(desc);
3816    unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3817    unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3818    unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3819    unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3820    unsigned rot = extract32(desc, SIMD_DATA_SHIFT + 20, 2);
3821    bool flip = rot & 1;
3822    float16 neg_imag, neg_real;
3823    void *vd = &env->vfp.zregs[rd];
3824    void *vn = &env->vfp.zregs[rn];
3825    void *vm = &env->vfp.zregs[rm];
3826    void *va = &env->vfp.zregs[ra];
3827    uint64_t *g = vg;
3828
3829    neg_imag = float16_set_sign(0, (rot & 2) != 0);
3830    neg_real = float16_set_sign(0, rot == 1 || rot == 2);
3831
3832    do {
3833        uint64_t pg = g[(i - 1) >> 6];
3834        do {
3835            float16 e1, e2, e3, e4, nr, ni, mr, mi, d;
3836
3837            /* I holds the real index; J holds the imag index.  */
3838            j = i - sizeof(float16);
3839            i -= 2 * sizeof(float16);
3840
3841            nr = *(float16 *)(vn + H1_2(i));
3842            ni = *(float16 *)(vn + H1_2(j));
3843            mr = *(float16 *)(vm + H1_2(i));
3844            mi = *(float16 *)(vm + H1_2(j));
3845
3846            e2 = (flip ? ni : nr);
3847            e1 = (flip ? mi : mr) ^ neg_real;
3848            e4 = e2;
3849            e3 = (flip ? mr : mi) ^ neg_imag;
3850
3851            if (likely((pg >> (i & 63)) & 1)) {
3852                d = *(float16 *)(va + H1_2(i));
3853                d = float16_muladd(e2, e1, d, 0, &env->vfp.fp_status_f16);
3854                *(float16 *)(vd + H1_2(i)) = d;
3855            }
3856            if (likely((pg >> (j & 63)) & 1)) {
3857                d = *(float16 *)(va + H1_2(j));
3858                d = float16_muladd(e4, e3, d, 0, &env->vfp.fp_status_f16);
3859                *(float16 *)(vd + H1_2(j)) = d;
3860            }
3861        } while (i & 63);
3862    } while (i != 0);
3863}
3864
3865void HELPER(sve_fcmla_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3866{
3867    intptr_t j, i = simd_oprsz(desc);
3868    unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3869    unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3870    unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3871    unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3872    unsigned rot = extract32(desc, SIMD_DATA_SHIFT + 20, 2);
3873    bool flip = rot & 1;
3874    float32 neg_imag, neg_real;
3875    void *vd = &env->vfp.zregs[rd];
3876    void *vn = &env->vfp.zregs[rn];
3877    void *vm = &env->vfp.zregs[rm];
3878    void *va = &env->vfp.zregs[ra];
3879    uint64_t *g = vg;
3880
3881    neg_imag = float32_set_sign(0, (rot & 2) != 0);
3882    neg_real = float32_set_sign(0, rot == 1 || rot == 2);
3883
3884    do {
3885        uint64_t pg = g[(i - 1) >> 6];
3886        do {
3887            float32 e1, e2, e3, e4, nr, ni, mr, mi, d;
3888
3889            /* I holds the real index; J holds the imag index.  */
3890            j = i - sizeof(float32);
3891            i -= 2 * sizeof(float32);
3892
3893            nr = *(float32 *)(vn + H1_2(i));
3894            ni = *(float32 *)(vn + H1_2(j));
3895            mr = *(float32 *)(vm + H1_2(i));
3896            mi = *(float32 *)(vm + H1_2(j));
3897
3898            e2 = (flip ? ni : nr);
3899            e1 = (flip ? mi : mr) ^ neg_real;
3900            e4 = e2;
3901            e3 = (flip ? mr : mi) ^ neg_imag;
3902
3903            if (likely((pg >> (i & 63)) & 1)) {
3904                d = *(float32 *)(va + H1_2(i));
3905                d = float32_muladd(e2, e1, d, 0, &env->vfp.fp_status);
3906                *(float32 *)(vd + H1_2(i)) = d;
3907            }
3908            if (likely((pg >> (j & 63)) & 1)) {
3909                d = *(float32 *)(va + H1_2(j));
3910                d = float32_muladd(e4, e3, d, 0, &env->vfp.fp_status);
3911                *(float32 *)(vd + H1_2(j)) = d;
3912            }
3913        } while (i & 63);
3914    } while (i != 0);
3915}
3916
3917void HELPER(sve_fcmla_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3918{
3919    intptr_t j, i = simd_oprsz(desc);
3920    unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3921    unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3922    unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3923    unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3924    unsigned rot = extract32(desc, SIMD_DATA_SHIFT + 20, 2);
3925    bool flip = rot & 1;
3926    float64 neg_imag, neg_real;
3927    void *vd = &env->vfp.zregs[rd];
3928    void *vn = &env->vfp.zregs[rn];
3929    void *vm = &env->vfp.zregs[rm];
3930    void *va = &env->vfp.zregs[ra];
3931    uint64_t *g = vg;
3932
3933    neg_imag = float64_set_sign(0, (rot & 2) != 0);
3934    neg_real = float64_set_sign(0, rot == 1 || rot == 2);
3935
3936    do {
3937        uint64_t pg = g[(i - 1) >> 6];
3938        do {
3939            float64 e1, e2, e3, e4, nr, ni, mr, mi, d;
3940
3941            /* I holds the real index; J holds the imag index.  */
3942            j = i - sizeof(float64);
3943            i -= 2 * sizeof(float64);
3944
3945            nr = *(float64 *)(vn + H1_2(i));
3946            ni = *(float64 *)(vn + H1_2(j));
3947            mr = *(float64 *)(vm + H1_2(i));
3948            mi = *(float64 *)(vm + H1_2(j));
3949
3950            e2 = (flip ? ni : nr);
3951            e1 = (flip ? mi : mr) ^ neg_real;
3952            e4 = e2;
3953            e3 = (flip ? mr : mi) ^ neg_imag;
3954
3955            if (likely((pg >> (i & 63)) & 1)) {
3956                d = *(float64 *)(va + H1_2(i));
3957                d = float64_muladd(e2, e1, d, 0, &env->vfp.fp_status);
3958                *(float64 *)(vd + H1_2(i)) = d;
3959            }
3960            if (likely((pg >> (j & 63)) & 1)) {
3961                d = *(float64 *)(va + H1_2(j));
3962                d = float64_muladd(e4, e3, d, 0, &env->vfp.fp_status);
3963                *(float64 *)(vd + H1_2(j)) = d;
3964            }
3965        } while (i & 63);
3966    } while (i != 0);
3967}
3968
3969/*
3970 * Load contiguous data, protected by a governing predicate.
3971 */
3972
/*
 * Load elements into @vd, controlled by @vg, from @host + @mem_ofs.
 * Memory is valid through @host + @mem_max.  The register element
 * indices are inferred from @mem_ofs, as modified by the types for
 * which the helper is built.  Return the @mem_ofs of the first element
 * not loaded (which is @mem_max if they are all loaded).
 *
 * For softmmu, we have fully validated the guest page.  For user-only,
 * we cannot fully validate without taking the mmap lock, but since we
 * know the access is within one host page, if any access is valid they
 * all must be valid.  However, when @vg is all false, it may be that
 * no access is valid.
 */
typedef intptr_t sve_ld1_host_fn(void *vd, void *vg, void *host,
                                 intptr_t mem_ofs, intptr_t mem_max);

/*
 * Load one element into @vd + @reg_off from (@env, @vaddr, @ra).
 * The controlling predicate is known to be true.
 */
typedef void sve_ld1_tlb_fn(CPUARMState *env, void *vd, intptr_t reg_off,
                            target_ulong vaddr, TCGMemOpIdx oi, uintptr_t ra);
/* The single-element store primitive has the same signature.  */
typedef sve_ld1_tlb_fn sve_st1_tlb_fn;
3996
3997/*
3998 * Generate the above primitives.
3999 */
4000
/*
 * Define sve_<NAME>_host(), the direct host-memory load primitive.
 *
 * TYPEM is the element type in memory and TYPEE the element type in the
 * register; H is the host-endian index fixup for TYPEE and HOST the
 * accessor used to read one TYPEM from a host address.  Starting at
 * mem_off, every element that fits entirely below mem_max is processed:
 * active elements are loaded and converted to TYPEE, inactive elements
 * are written as zero.  The memory offset reached is returned.
 */
#define DO_LD_HOST(NAME, H, TYPEE, TYPEM, HOST) \
static intptr_t sve_##NAME##_host(void *vd, void *vg, void *host,           \
                                  intptr_t mem_off, const intptr_t mem_max) \
{                                                                           \
    uint64_t *pred = vg;                                                    \
    intptr_t reg_off;                                                       \
    for (reg_off = mem_off * (sizeof(TYPEE) / sizeof(TYPEM));               \
         mem_off + sizeof(TYPEM) <= mem_max;                                \
         mem_off += sizeof(TYPEM), reg_off += sizeof(TYPEE)) {              \
        TYPEM val = 0;                                                      \
        if (likely((pred[reg_off >> 6] >> (reg_off & 63)) & 1)) {           \
            val = HOST(host + mem_off);                                     \
        }                                                                   \
        *(TYPEE *)(vd + H(reg_off)) = val;                                  \
    }                                                                       \
    return mem_off;                                                         \
}
4017
/*
 * Define sve_<NAME>_tlb(), the single-element load primitive.
 * With CONFIG_SOFTMMU the value is fetched through the TLB helper;
 * otherwise it is read with HOST from the host address g2h(addr).
 * The value is taken as TYPEM and then stored to the register as TYPEE.
 */
#ifdef CONFIG_SOFTMMU
#define DO_LD_TLB(NAME, H, TYPEE, TYPEM, HOST, MOEND, TLB)                  \
static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off,  \
                             target_ulong addr, TCGMemOpIdx oi,             \
                             uintptr_t ra)                                  \
{                                                                           \
    TYPEM elt = TLB(env, addr, oi, ra);                                     \
    *(TYPEE *)(vd + H(reg_off)) = elt;                                      \
}
#else
#define DO_LD_TLB(NAME, H, TYPEE, TYPEM, HOST, MOEND, TLB)                  \
static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off,  \
                             target_ulong addr, TCGMemOpIdx oi,             \
                             uintptr_t ra)                                  \
{                                                                           \
    TYPEM elt = HOST(g2h(addr));                                            \
    *(TYPEE *)(vd + H(reg_off)) = elt;                                      \
}
#endif
4035
4036#define DO_LD_PRIM_1(NAME, H, TE, TM)                   \
4037    DO_LD_HOST(NAME, H, TE, TM, ldub_p)                 \
4038    DO_LD_TLB(NAME, H, TE, TM, ldub_p, 0, helper_ret_ldub_mmu)
4039
4040DO_LD_PRIM_1(ld1bb,  H1,   uint8_t,  uint8_t)
4041DO_LD_PRIM_1(ld1bhu, H1_2, uint16_t, uint8_t)
4042DO_LD_PRIM_1(ld1bhs, H1_2, uint16_t,  int8_t)
4043DO_LD_PRIM_1(ld1bsu, H1_4, uint32_t, uint8_t)
4044DO_LD_PRIM_1(ld1bss, H1_4, uint32_t,  int8_t)
4045DO_LD_PRIM_1(ld1bdu,     , uint64_t, uint8_t)
4046DO_LD_PRIM_1(ld1bds,     , uint64_t,  int8_t)
4047
4048#define DO_LD_PRIM_2(NAME, end, MOEND, H, TE, TM, PH, PT)  \
4049    DO_LD_HOST(NAME##_##end, H, TE, TM, PH##_##end##_p)    \
4050    DO_LD_TLB(NAME##_##end, H, TE, TM, PH##_##end##_p,     \
4051              MOEND, helper_##end##_##PT##_mmu)
4052
4053DO_LD_PRIM_2(ld1hh,  le, MO_LE, H1_2, uint16_t, uint16_t, lduw, lduw)
4054DO_LD_PRIM_2(ld1hsu, le, MO_LE, H1_4, uint32_t, uint16_t, lduw, lduw)
4055DO_LD_PRIM_2(ld1hss, le, MO_LE, H1_4, uint32_t,  int16_t, lduw, lduw)
4056DO_LD_PRIM_2(ld1hdu, le, MO_LE,     , uint64_t, uint16_t, lduw, lduw)
4057DO_LD_PRIM_2(ld1hds, le, MO_LE,     , uint64_t,  int16_t, lduw, lduw)
4058
4059DO_LD_PRIM_2(ld1ss,  le, MO_LE, H1_4, uint32_t, uint32_t, ldl, ldul)
4060DO_LD_PRIM_2(ld1sdu, le, MO_LE,     , uint64_t, uint32_t, ldl, ldul)
4061DO_LD_PRIM_2(ld1sds, le, MO_LE,     , uint64_t,  int32_t, ldl, ldul)
4062
4063DO_LD_PRIM_2(ld1dd,  le, MO_LE,     , uint64_t, uint64_t, ldq, ldq)
4064
4065DO_LD_PRIM_2(ld1hh,  be, MO_BE, H1_2, uint16_t, uint16_t, lduw, lduw)
4066DO_LD_PRIM_2(ld1hsu, be, MO_BE, H1_4, uint32_t, uint16_t, lduw, lduw)
4067DO_LD_PRIM_2(ld1hss, be, MO_BE, H1_4, uint32_t,  int16_t, lduw, lduw)
4068DO_LD_PRIM_2(ld1hdu, be, MO_BE,     , uint64_t, uint16_t, lduw, lduw)
4069DO_LD_PRIM_2(ld1hds, be, MO_BE,     , uint64_t,  int16_t, lduw, lduw)
4070
4071DO_LD_PRIM_2(ld1ss,  be, MO_BE, H1_4, uint32_t, uint32_t, ldl, ldul)
4072DO_LD_PRIM_2(ld1sdu, be, MO_BE,     , uint64_t, uint32_t, ldl, ldul)
4073DO_LD_PRIM_2(ld1sds, be, MO_BE,     , uint64_t,  int32_t, ldl, ldul)
4074
4075DO_LD_PRIM_2(ld1dd,  be, MO_BE,     , uint64_t, uint64_t, ldq, ldq)
4076
4077#undef DO_LD_TLB
4078#undef DO_LD_HOST
4079#undef DO_LD_PRIM_1
4080#undef DO_LD_PRIM_2
4081
4082/*
4083 * Skip through a sequence of inactive elements in the guarding predicate @vg,
4084 * beginning at @reg_off bounded by @reg_max.  Return the offset of the active
4085 * element >= @reg_off, or @reg_max if there were no active elements at all.
4086 */
4087static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off,
4088                                 intptr_t reg_max, int esz)
4089{
4090    uint64_t pg_mask = pred_esz_masks[esz];
4091    uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63);
4092
4093    /* In normal usage, the first element is active.  */
4094    if (likely(pg & 1)) {
4095        return reg_off;
4096    }
4097
4098    if (pg == 0) {
4099        reg_off &= -64;
4100        do {
4101            reg_off += 64;
4102            if (unlikely(reg_off >= reg_max)) {
4103                /* The entire predicate was false.  */
4104                return reg_max;
4105            }
4106            pg = vg[reg_off >> 6] & pg_mask;
4107        } while (pg == 0);
4108    }
4109    reg_off += ctz64(pg);
4110
4111    /* We should never see an out of range predicate bit set.  */
4112    tcg_debug_assert(reg_off < reg_max);
4113    return reg_off;
4114}
4115
4116/*
4117 * Return the maximum offset <= @mem_max which is still within the page
4118 * referenced by @base + @mem_off.
4119 */
4120static intptr_t max_for_page(target_ulong base, intptr_t mem_off,
4121                             intptr_t mem_max)
4122{
4123    target_ulong addr = base + mem_off;
4124    intptr_t split = -(intptr_t)(addr | TARGET_PAGE_MASK);
4125    return MIN(split, mem_max - mem_off) + mem_off;
4126}
4127
/*
 * Record @ra in the global helper_retaddr for CONFIG_USER_ONLY builds;
 * compiles to nothing otherwise.  Callers in this file pass 0 to clear
 * it again once their memory accesses are done.
 */
static inline void set_helper_retaddr(uintptr_t ra)
{
#ifdef CONFIG_USER_ONLY
    helper_retaddr = ra;
#endif
}
4134
/*
 * The result of tlb_vaddr_to_host for user-only is just g2h(x),
 * which is always non-null.  Elide the useless test.
 *
 * Returns true when @host may be used for direct host access.
 */
static inline bool test_host_page(void *host)
{
#ifdef CONFIG_USER_ONLY
    return true;
#else
    return likely(host != NULL);
#endif
}
4147
4148/*
4149 * Common helper for all contiguous one-register predicated loads.
4150 */
4151static void sve_ld1_r(CPUARMState *env, void *vg, const target_ulong addr,
4152                      uint32_t desc, const uintptr_t retaddr,
4153                      const int esz, const int msz,
4154                      sve_ld1_host_fn *host_fn,
4155                      sve_ld1_tlb_fn *tlb_fn)
4156{
4157    const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
4158    const int mmu_idx = get_mmuidx(oi);
4159    const unsigned rd = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 5);
4160    void *vd = &env->vfp.zregs[rd];
4161    const int diffsz = esz - msz;
4162    const intptr_t reg_max = simd_oprsz(desc);
4163    const intptr_t mem_max = reg_max >> diffsz;
4164    ARMVectorReg scratch;
4165    void *host;
4166    intptr_t split, reg_off, mem_off;
4167
4168    /* Find the first active element.  */
4169    reg_off = find_next_active(vg, 0, reg_max, esz);
4170    if (unlikely(reg_off == reg_max)) {
4171        /* The entire predicate was false; no load occurs.  */
4172        memset(vd, 0, reg_max);
4173        return;
4174    }
4175    mem_off = reg_off >> diffsz;
4176    set_helper_retaddr(retaddr);
4177
4178    /*
4179     * If the (remaining) load is entirely within a single page, then:
4180     * For softmmu, and the tlb hits, then no faults will occur;
4181     * For user-only, either the first load will fault or none will.
4182     * We can thus perform the load directly to the destination and
4183     * Vd will be unmodified on any exception path.
4184     */
4185    split = max_for_page(addr, mem_off, mem_max);
4186    if (likely(split == mem_max)) {
4187        host = tlb_vaddr_to_host(env, addr + mem_off, MMU_DATA_LOAD, mmu_idx);
4188        if (test_host_page(host)) {
4189            mem_off = host_fn(vd, vg, host - mem_off, mem_off, mem_max);
4190            tcg_debug_assert(mem_off == mem_max);
4191            set_helper_retaddr(0);
4192            /* After having taken any fault, zero leading inactive elements. */
4193            swap_memzero(vd, reg_off);
4194            return;
4195        }
4196    }
4197
4198    /*
4199     * Perform the predicated read into a temporary, thus ensuring
4200     * if the load of the last element faults, Vd is not modified.
4201     */
4202#ifdef CONFIG_USER_ONLY
4203    swap_memzero(&scratch, reg_off);
4204    host_fn(&scratch, vg, g2h(addr), mem_off, mem_max);
4205#else
4206    memset(&scratch, 0, reg_max);
4207    goto start;
4208    while (1) {
4209        reg_off = find_next_active(vg, reg_off, reg_max, esz);
4210        if (reg_off >= reg_max) {
4211            break;
4212        }
4213        mem_off = reg_off >> diffsz;
4214        split = max_for_page(addr, mem_off, mem_max);
4215
4216    start:
4217        if (split - mem_off >= (1 << msz)) {
4218            /* At least one whole element on this page.  */
4219            host = tlb_vaddr_to_host(env, addr + mem_off,
4220                                     MMU_DATA_LOAD, mmu_idx);
4221            if (host) {
4222                mem_off = host_fn(&scratch, vg, host - mem_off,
4223                                  mem_off, split);
4224                reg_off = mem_off << diffsz;
4225                continue;
4226            }
4227        }
4228
4229        /*
4230         * Perform one normal read.  This may fault, longjmping out to the
4231         * main loop in order to raise an exception.  It may succeed, and
4232         * as a side-effect load the TLB entry for the next round.  Finally,
4233         * in the extremely unlikely case we're performing this operation
4234         * on I/O memory, it may succeed but not bring in the TLB entry.
4235         * But even then we have still made forward progress.
4236         */
4237        tlb_fn(env, &scratch, reg_off, addr + mem_off, oi, retaddr);
4238        reg_off += 1 << esz;
4239    }
4240#endif
4241
4242    set_helper_retaddr(0);
4243    memcpy(vd, &scratch, reg_max);
4244}
4245
4246#define DO_LD1_1(NAME, ESZ) \
4247void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg,        \
4248                            target_ulong addr, uint32_t desc)  \
4249{                                                              \
4250    sve_ld1_r(env, vg, addr, desc, GETPC(), ESZ, 0,            \
4251              sve_##NAME##_host, sve_##NAME##_tlb);            \
4252}
4253
4254#define DO_LD1_2(NAME, ESZ, MSZ) \
4255void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg,        \
4256                               target_ulong addr, uint32_t desc)  \
4257{                                                                 \
4258    sve_ld1_r(env, vg, addr, desc, GETPC(), ESZ, MSZ,             \
4259              sve_##NAME##_le_host, sve_##NAME##_le_tlb);         \
4260}                                                                 \
4261void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg,        \
4262                               target_ulong addr, uint32_t desc)  \
4263{                                                                 \
4264    sve_ld1_r(env, vg, addr, desc, GETPC(), ESZ, MSZ,             \
4265              sve_##NAME##_be_host, sve_##NAME##_be_tlb);         \
4266}
4267
4268DO_LD1_1(ld1bb,  0)
4269DO_LD1_1(ld1bhu, 1)
4270DO_LD1_1(ld1bhs, 1)
4271DO_LD1_1(ld1bsu, 2)
4272DO_LD1_1(ld1bss, 2)
4273DO_LD1_1(ld1bdu, 3)
4274DO_LD1_1(ld1bds, 3)
4275
4276DO_LD1_2(ld1hh,  1, 1)
4277DO_LD1_2(ld1hsu, 2, 1)
4278DO_LD1_2(ld1hss, 2, 1)
4279DO_LD1_2(ld1hdu, 3, 1)
4280DO_LD1_2(ld1hds, 3, 1)
4281
4282DO_LD1_2(ld1ss,  2, 2)
4283DO_LD1_2(ld1sdu, 3, 2)
4284DO_LD1_2(ld1sds, 3, 2)
4285
4286DO_LD1_2(ld1dd,  3, 3)
4287
4288#undef DO_LD1_1
4289#undef DO_LD1_2
4290
4291/*
4292 * Common helpers for all contiguous 2,3,4-register predicated loads.
4293 */
4294static void sve_ld2_r(CPUARMState *env, void *vg, target_ulong addr,
4295                      uint32_t desc, int size, uintptr_t ra,
4296                      sve_ld1_tlb_fn *tlb_fn)
4297{
4298    const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
4299    const unsigned rd = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 5);
4300    intptr_t i, oprsz = simd_oprsz(desc);
4301    ARMVectorReg scratch[2] = { };
4302
4303    set_helper_retaddr(ra);
4304    for (i = 0; i < oprsz; ) {
4305        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4306        do {
4307            if (pg & 1) {
4308                tlb_fn(env, &scratch[0], i, addr, oi, ra);
4309                tlb_fn(env, &scratch[1], i, addr + size, oi, ra);
4310            }
4311            i += size, pg >>= size;
4312            addr += 2 * size;
4313        } while (i & 15);
4314    }
4315    set_helper_retaddr(0);
4316
4317    /* Wait until all exceptions have been raised to write back.  */
4318    memcpy(&env->vfp.zregs[rd], &scratch[0], oprsz);
4319    memcpy(&env->vfp.zregs[(rd + 1) & 31], &scratch[1], oprsz);
4320}
4321
4322static void sve_ld3_r(CPUARMState *env, void *vg, target_ulong addr,
4323                      uint32_t desc, int size, uintptr_t ra,
4324                      sve_ld1_tlb_fn *tlb_fn)
4325{
4326    const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
4327    const unsigned rd = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 5);
4328    intptr_t i, oprsz = simd_oprsz(desc);
4329    ARMVectorReg scratch[3] = { };
4330
4331    set_helper_retaddr(ra);
4332    for (i = 0; i < oprsz; ) {
4333        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4334        do {
4335            if (pg & 1) {
4336                tlb_fn(env, &scratch[0], i, addr, oi, ra);
4337                tlb_fn(env, &scratch[1], i, addr + size, oi, ra);
4338                tlb_fn(env, &scratch[2], i, addr + 2 * size, oi, ra);
4339            }
4340            i += size, pg >>= size;
4341            addr += 3 * size;
4342        } while (i & 15);
4343    }
4344    set_helper_retaddr(0);
4345
4346    /* Wait until all exceptions have been raised to write back.  */
4347    memcpy(&env->vfp.zregs[rd], &scratch[0], oprsz);
4348    memcpy(&env->vfp.zregs[(rd + 1) & 31], &scratch[1], oprsz);
4349    memcpy(&env->vfp.zregs[(rd + 2) & 31], &scratch[2], oprsz);
4350}
4351
4352static void sve_ld4_r(CPUARMState *env, void *vg, target_ulong addr,
4353                      uint32_t desc, int size, uintptr_t ra,
4354                      sve_ld1_tlb_fn *tlb_fn)
4355{
4356    const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
4357    const unsigned rd = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 5);
4358    intptr_t i, oprsz = simd_oprsz(desc);
4359    ARMVectorReg scratch[4] = { };
4360
4361    set_helper_retaddr(ra);
4362    for (i = 0; i < oprsz; ) {
4363        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4364        do {
4365            if (pg & 1) {
4366                tlb_fn(env, &scratch[0], i, addr, oi, ra);
4367                tlb_fn(env, &scratch[1], i, addr + size, oi, ra);
4368                tlb_fn(env, &scratch[2], i, addr + 2 * size, oi, ra);
4369                tlb_fn(env, &scratch[3], i, addr + 3 * size, oi, ra);
4370            }
4371            i += size, pg >>= size;
4372            addr += 4 * size;
4373        } while (i & 15);
4374    }
4375    set_helper_retaddr(0);
4376
4377    /* Wait until all exceptions have been raised to write back.  */
4378    memcpy(&env->vfp.zregs[rd], &scratch[0], oprsz);
4379    memcpy(&env->vfp.zregs[(rd + 1) & 31], &scratch[1], oprsz);
4380    memcpy(&env->vfp.zregs[(rd + 2) & 31], &scratch[2], oprsz);
4381    memcpy(&env->vfp.zregs[(rd + 3) & 31], &scratch[3], oprsz);
4382}
4383
4384#define DO_LDN_1(N) \
4385void QEMU_FLATTEN HELPER(sve_ld##N##bb_r) \
4386    (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc)  \
4387{                                                                   \
4388    sve_ld##N##_r(env, vg, addr, desc, 1, GETPC(), sve_ld1bb_tlb);  \
4389}
4390
4391#define DO_LDN_2(N, SUFF, SIZE)                                       \
4392void QEMU_FLATTEN HELPER(sve_ld##N##SUFF##_le_r)                      \
4393    (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc)    \
4394{                                                                     \
4395    sve_ld##N##_r(env, vg, addr, desc, SIZE, GETPC(),                 \
4396                  sve_ld1##SUFF##_le_tlb);                            \
4397}                                                                     \
4398void QEMU_FLATTEN HELPER(sve_ld##N##SUFF##_be_r)                      \
4399    (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc)    \
4400{                                                                     \
4401    sve_ld##N##_r(env, vg, addr, desc, SIZE, GETPC(),                 \
4402                  sve_ld1##SUFF##_be_tlb);                            \
4403}
4404
4405DO_LDN_1(2)
4406DO_LDN_1(3)
4407DO_LDN_1(4)
4408
4409DO_LDN_2(2, hh, 2)
4410DO_LDN_2(3, hh, 2)
4411DO_LDN_2(4, hh, 2)
4412
4413DO_LDN_2(2, ss, 4)
4414DO_LDN_2(3, ss, 4)
4415DO_LDN_2(4, ss, 4)
4416
4417DO_LDN_2(2, dd, 8)
4418DO_LDN_2(3, dd, 8)
4419DO_LDN_2(4, dd, 8)
4420
4421#undef DO_LDN_1
4422#undef DO_LDN_2
4423
/*
 * Load contiguous data, first-fault and no-fault.
 *
 * For user-only, one could argue that we should hold the mmap_lock during
 * the operation so that there is no race between page_check_range and the
 * load operation.  However, unmapping pages out from under a running thread
 * is extraordinarily unlikely.  This theoretical race condition also affects
 * linux-user/ in its get_user/put_user macros.
 *
 * TODO: Construct some helpers, written in assembly, that interact with
 * handle_cpu_signal to produce memory ops which can properly report errors
 * without racing.
 */
4437
4438/* Fault on byte I.  All bits in FFR from I are cleared.  The vector
4439 * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
4440 * option, which leaves subsequent data unchanged.
4441 */
4442static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
4443{
4444    uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
4445
4446    if (i & 63) {
4447        ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
4448        i = ROUND_UP(i, 64);
4449    }
4450    for (; i < oprsz; i += 64) {
4451        ffr[i / 64] = 0;
4452    }
4453}
4454
4455/*
4456 * Common helper for all contiguous first-fault loads.
4457 */
4458static void sve_ldff1_r(CPUARMState *env, void *vg, const target_ulong addr,
4459                        uint32_t desc, const uintptr_t retaddr,
4460                        const int esz, const int msz,
4461                        sve_ld1_host_fn *host_fn,
4462                        sve_ld1_tlb_fn *tlb_fn)
4463{
4464    const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
4465    const int mmu_idx = get_mmuidx(oi);
4466    const unsigned rd = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 5);
4467    void *vd = &env->vfp.zregs[rd];
4468    const int diffsz = esz - msz;
4469    const intptr_t reg_max = simd_oprsz(desc);
4470    const intptr_t mem_max = reg_max >> diffsz;
4471    intptr_t split, reg_off, mem_off;
4472    void *host;
4473
4474    /* Skip to the first active element.  */
4475    reg_off = find_next_active(vg, 0, reg_max, esz);
4476    if (unlikely(reg_off == reg_max)) {
4477        /* The entire predicate was false; no load occurs.  */
4478        memset(vd, 0, reg_max);
4479        return;
4480    }
4481    mem_off = reg_off >> diffsz;
4482    set_helper_retaddr(retaddr);
4483
4484    /*
4485     * If the (remaining) load is entirely within a single page, then:
4486     * For softmmu, and the tlb hits, then no faults will occur;
4487     * For user-only, either the first load will fault or none will.
4488     * We can thus perform the load directly to the destination and
4489     * Vd will be unmodified on any exception path.
4490     */
4491    split = max_for_page(addr, mem_off, mem_max);
4492    if (likely(split == mem_max)) {
4493        host = tlb_vaddr_to_host(env, addr + mem_off, MMU_DATA_LOAD, mmu_idx);
4494        if (test_host_page(host)) {
4495            mem_off = host_fn(vd, vg, host - mem_off, mem_off, mem_max);
4496            tcg_debug_assert(mem_off == mem_max);
4497            set_helper_retaddr(0);
4498            /* After any fault, zero any leading inactive elements.  */
4499            swap_memzero(vd, reg_off);
4500            return;
4501        }
4502    }
4503
4504#ifdef CONFIG_USER_ONLY
4505    /*
4506     * The page(s) containing this first element at ADDR+MEM_OFF must
4507     * be valid.  Considering that this first element may be misaligned
4508     * and cross a page boundary itself, take the rest of the page from
4509     * the last byte of the element.
4510     */
4511    split = max_for_page(addr, mem_off + (1 << msz) - 1, mem_max);
4512    mem_off = host_fn(vd, vg, g2h(addr), mem_off, split);
4513
4514    /* After any fault, zero any leading inactive elements.  */
4515    swap_memzero(vd, reg_off);
4516    reg_off = mem_off << diffsz;
4517#else
4518    /*
4519     * Perform one normal read, which will fault or not.
4520     * But it is likely to bring the page into the tlb.
4521     */
4522    tlb_fn(env, vd, reg_off, addr + mem_off, oi, retaddr);
4523
4524    /* After any fault, zero any leading predicated false elts.  */
4525    swap_memzero(vd, reg_off);
4526    mem_off += 1 << msz;
4527    reg_off += 1 << esz;
4528
4529    /* Try again to read the balance of the page.  */
4530    split = max_for_page(addr, mem_off - 1, mem_max);
4531    if (split >= (1 << msz)) {
4532        host = tlb_vaddr_to_host(env, addr + mem_off, MMU_DATA_LOAD, mmu_idx);
4533        if (host) {
4534            mem_off = host_fn(vd, vg, host - mem_off, mem_off, split);
4535            reg_off = mem_off << diffsz;
4536        }
4537    }
4538#endif
4539
4540    set_helper_retaddr(0);
4541    record_fault(env, reg_off, reg_max);
4542}
4543
4544/*
4545 * Common helper for all contiguous no-fault loads.
4546 */
4547static void sve_ldnf1_r(CPUARMState *env, void *vg, const target_ulong addr,
4548                        uint32_t desc, const int esz, const int msz,
4549                        sve_ld1_host_fn *host_fn)
4550{
4551    const unsigned rd = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 5);
4552    void *vd = &env->vfp.zregs[rd];
4553    const int diffsz = esz - msz;
4554    const intptr_t reg_max = simd_oprsz(desc);
4555    const intptr_t mem_max = reg_max >> diffsz;
4556    const int mmu_idx = cpu_mmu_index(env, false);
4557    intptr_t split, reg_off, mem_off;
4558    void *host;
4559
4560#ifdef CONFIG_USER_ONLY
4561    host = tlb_vaddr_to_host(env, addr, MMU_DATA_LOAD, mmu_idx);
4562    if (likely(page_check_range(addr, mem_max, PAGE_READ) == 0)) {
4563        /* The entire operation is valid and will not fault.  */
4564        host_fn(vd, vg, host, 0, mem_max);
4565        return;
4566    }
4567#endif
4568
4569    /* There will be no fault, so we may modify in advance.  */
4570    memset(vd, 0, reg_max);
4571
4572    /* Skip to the first active element.  */
4573    reg_off = find_next_active(vg, 0, reg_max, esz);
4574    if (unlikely(reg_off == reg_max)) {
4575        /* The entire predicate was false; no load occurs.  */
4576        return;
4577    }
4578    mem_off = reg_off >> diffsz;
4579
4580#ifdef CONFIG_USER_ONLY
4581    if (page_check_range(addr + mem_off, 1 << msz, PAGE_READ) == 0) {
4582        /* At least one load is valid; take the rest of the page.  */
4583        split = max_for_page(addr, mem_off + (1 << msz) - 1, mem_max);
4584        mem_off = host_fn(vd, vg, host, mem_off, split);
4585        reg_off = mem_off << diffsz;
4586    }
4587#else
4588    /*
4589     * If the address is not in the TLB, we have no way to bring the
4590     * entry into the TLB without also risking a fault.  Note that
4591     * the corollary is that we never load from an address not in RAM.
4592     *
4593     * This last is out of spec, in a weird corner case.
4594     * Per the MemNF/MemSingleNF pseudocode, a NF load from Device memory
4595     * must not actually hit the bus -- it returns UNKNOWN data instead.
4596     * But if you map non-RAM with Normal memory attributes and do a NF
4597     * load then it should access the bus.  (Nobody ought actually do this
4598     * in the real world, obviously.)
4599     *
4600     * Then there are the annoying special cases with watchpoints...
4601     *
4602     * TODO: Add a form of tlb_fill that does not raise an exception,
4603     * with a form of tlb_vaddr_to_host and a set of loads to match.
4604     * The non_fault_vaddr_to_host would handle everything, usually,
4605     * and the loads would handle the iomem path for watchpoints.
4606     */
4607    host = tlb_vaddr_to_host(env, addr + mem_off, MMU_DATA_LOAD, mmu_idx);
4608    split = max_for_page(addr, mem_off, mem_max);
4609    if (host && split >= (1 << msz)) {
4610        mem_off = host_fn(vd, vg, host - mem_off, mem_off, split);
4611        reg_off = mem_off << diffsz;
4612    }
4613#endif
4614
4615    record_fault(env, reg_off, reg_max);
4616}
4617
4618#define DO_LDFF1_LDNF1_1(PART, ESZ) \
4619void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg,            \
4620                                 target_ulong addr, uint32_t desc)      \
4621{                                                                       \
4622    sve_ldff1_r(env, vg, addr, desc, GETPC(), ESZ, 0,                   \
4623                sve_ld1##PART##_host, sve_ld1##PART##_tlb);             \
4624}                                                                       \
4625void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg,            \
4626                                 target_ulong addr, uint32_t desc)      \
4627{                                                                       \
4628    sve_ldnf1_r(env, vg, addr, desc, ESZ, 0, sve_ld1##PART##_host);     \
4629}
4630
4631#define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \
4632void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg,         \
4633                                    target_ulong addr, uint32_t desc)   \
4634{                                                                       \
4635    sve_ldff1_r(env, vg, addr, desc, GETPC(), ESZ, MSZ,                 \
4636                sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb);       \
4637}                                                                       \
4638void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg,         \
4639                                    target_ulong addr, uint32_t desc)   \
4640{                                                                       \
4641    sve_ldnf1_r(env, vg, addr, desc, ESZ, MSZ, sve_ld1##PART##_le_host); \
4642}                                                                       \
4643void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg,         \
4644                                    target_ulong addr, uint32_t desc)   \
4645{                                                                       \
4646    sve_ldff1_r(env, vg, addr, desc, GETPC(), ESZ, MSZ,                 \
4647                sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb);       \
4648}                                                                       \
4649void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg,         \
4650                                    target_ulong addr, uint32_t desc)   \
4651{                                                                       \
4652    sve_ldnf1_r(env, vg, addr, desc, ESZ, MSZ, sve_ld1##PART##_be_host); \
4653}
4654
4655DO_LDFF1_LDNF1_1(bb,  0)
4656DO_LDFF1_LDNF1_1(bhu, 1)
4657DO_LDFF1_LDNF1_1(bhs, 1)
4658DO_LDFF1_LDNF1_1(bsu, 2)
4659DO_LDFF1_LDNF1_1(bss, 2)
4660DO_LDFF1_LDNF1_1(bdu, 3)
4661DO_LDFF1_LDNF1_1(bds, 3)
4662
4663DO_LDFF1_LDNF1_2(hh,  1, 1)
4664DO_LDFF1_LDNF1_2(hsu, 2, 1)
4665DO_LDFF1_LDNF1_2(hss, 2, 1)
4666DO_LDFF1_LDNF1_2(hdu, 3, 1)
4667DO_LDFF1_LDNF1_2(hds, 3, 1)
4668
4669DO_LDFF1_LDNF1_2(ss,  2, 2)
4670DO_LDFF1_LDNF1_2(sdu, 3, 2)
4671DO_LDFF1_LDNF1_2(sds, 3, 2)
4672
4673DO_LDFF1_LDNF1_2(dd,  3, 3)
4674
4675#undef DO_LDFF1_LDNF1_1
4676#undef DO_LDFF1_LDNF1_2
4677
/*
 * Store contiguous data, protected by a governing predicate.
 */

/*
 * DO_ST_TLB defines sve_<NAME>_tlb(): store the single TYPEM value found
 * at byte offset H(@reg_off) within @vd to guest address @addr.
 *
 * With CONFIG_SOFTMMU the store goes through TLB(env, addr, val, oi, ra).
 * Otherwise it is written with HOST(g2h(addr), val), and @env, @oi and
 * @ra are not used.  MOEND is accepted but not referenced by either
 * expansion.
 */
#ifdef CONFIG_SOFTMMU
#define DO_ST_TLB(NAME, H, TYPEM, HOST, MOEND, TLB) \
static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off,  \
                             target_ulong addr, TCGMemOpIdx oi, uintptr_t ra) \
{                                                                           \
    TLB(env, addr, *(TYPEM *)(vd + H(reg_off)), oi, ra);                    \
}
#else
#define DO_ST_TLB(NAME, H, TYPEM, HOST, MOEND, TLB) \
static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off,  \
                             target_ulong addr, TCGMemOpIdx oi, uintptr_t ra) \
{                                                                           \
    HOST(g2h(addr), *(TYPEM *)(vd + H(reg_off)));                           \
}
#endif
4697
4698DO_ST_TLB(st1bb,   H1,  uint8_t, stb_p, 0, helper_ret_stb_mmu)
4699DO_ST_TLB(st1bh, H1_2, uint16_t,