qemu/target/arm/tcg/vec_helper.c
   1/*
   2 * ARM AdvSIMD / SVE Vector Operations
   3 *
   4 * Copyright (c) 2018 Linaro
   5 *
   6 * This library is free software; you can redistribute it and/or
   7 * modify it under the terms of the GNU Lesser General Public
   8 * License as published by the Free Software Foundation; either
   9 * version 2.1 of the License, or (at your option) any later version.
  10 *
  11 * This library is distributed in the hope that it will be useful,
  12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14 * Lesser General Public License for more details.
  15 *
  16 * You should have received a copy of the GNU Lesser General Public
  17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
  18 */
  19
  20#include "qemu/osdep.h"
  21#include "cpu.h"
  22#include "exec/helper-proto.h"
  23#include "tcg/tcg-gvec-desc.h"
  24#include "fpu/softfloat.h"
  25#include "qemu/int128.h"
  26#include "vec_internal.h"
  27
  28/*
  29 * Data for expanding active predicate bits to bytes, for byte elements.
  30 *
  31 *  for (i = 0; i < 256; ++i) {
  32 *      unsigned long m = 0;
  33 *      for (j = 0; j < 8; j++) {
  34 *          if ((i >> j) & 1) {
  35 *              m |= 0xfful << (j << 3);
  36 *          }
  37 *      }
  38 *      printf("0x%016lx,\n", m);
  39 *  }
  40 */
  41const uint64_t expand_pred_b_data[256] = {
  42    0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
  43    0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
  44    0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
  45    0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
  46    0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
  47    0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
  48    0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
  49    0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
  50    0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
  51    0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
  52    0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
  53    0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
  54    0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
  55    0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
  56    0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
  57    0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
  58    0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
  59    0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
  60    0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
  61    0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
  62    0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
  63    0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
  64    0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
  65    0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
  66    0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
  67    0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
  68    0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
  69    0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
  70    0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
  71    0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
  72    0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
  73    0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
  74    0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
  75    0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
  76    0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
  77    0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
  78    0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
  79    0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
  80    0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
  81    0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
  82    0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
  83    0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
  84    0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
  85    0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
  86    0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
  87    0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
  88    0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
  89    0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
  90    0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
  91    0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
  92    0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
  93    0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
  94    0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
  95    0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
  96    0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
  97    0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
  98    0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
  99    0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
 100    0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
 101    0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
 102    0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
 103    0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
 104    0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
 105    0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
 106    0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
 107    0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
 108    0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
 109    0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
 110    0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
 111    0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
 112    0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
 113    0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
 114    0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
 115    0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
 116    0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
 117    0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
 118    0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
 119    0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
 120    0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
 121    0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
 122    0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
 123    0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
 124    0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
 125    0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
 126    0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
 127    0xffffffffffffffff,
 128};
 129
 130/*
 131 * Similarly for half-word elements.
 132 *  for (i = 0; i < 256; ++i) {
 133 *      unsigned long m = 0;
 134 *      if (i & 0xaa) {
 135 *          continue;
 136 *      }
 137 *      for (j = 0; j < 8; j += 2) {
 138 *          if ((i >> j) & 1) {
 139 *              m |= 0xfffful << (j << 3);
 140 *          }
 141 *      }
 142 *      printf("[0x%x] = 0x%016lx,\n", i, m);
 143 *  }
 144 */
 145const uint64_t expand_pred_h_data[0x55 + 1] = {
 146    [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
 147    [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
 148    [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
 149    [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
 150    [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
 151    [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
 152    [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
 153    [0x55] = 0xffffffffffffffff,
 154};
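/*
 * Example lookups: predicate byte 0x05 has bits 0 and 2 set.  For byte
 * elements that enables bytes 0 and 2 of the 64-bit chunk, so
 * expand_pred_b_data[0x05] == 0x0000000000ff00ff.  For halfword
 * elements only the even predicate bits are significant, and the same
 * byte enables halfwords 0 and 1, so
 * expand_pred_h_data[0x05] == 0x00000000ffffffff.
 */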
 155
 156/* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
 157int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
 158                     bool neg, bool round)
 159{
 160    /*
 161     * Simplify:
 162     * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8
 163     * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7
 164     */
 165    int32_t ret = (int32_t)src1 * src2;
 166    if (neg) {
 167        ret = -ret;
 168    }
 169    ret += ((int32_t)src3 << 7) + (round << 6);
 170    ret >>= 7;
 171
 172    if (ret != (int8_t)ret) {
 173        ret = (ret < 0 ? INT8_MIN : INT8_MAX);
 174    }
 175    return ret;
 176}
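/*
 * Worked example (SQRDMULH: round set, no accumulate): src1 = src2 = 0x40
 * gives ret = 64 * 64 + 64 = 4160, and 4160 >> 7 = 32 (0x20).  With
 * src1 = src2 = 0x80 (-128) the doubled product (32768) does not fit in
 * 8 bits, so the result saturates to INT8_MAX (0x7f).
 */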
 177
 178void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
 179                             void *va, uint32_t desc)
 180{
 181    intptr_t i, opr_sz = simd_oprsz(desc);
 182    int8_t *d = vd, *n = vn, *m = vm, *a = va;
 183
 184    for (i = 0; i < opr_sz; ++i) {
 185        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
 186    }
 187}
 188
 189void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
 190                             void *va, uint32_t desc)
 191{
 192    intptr_t i, opr_sz = simd_oprsz(desc);
 193    int8_t *d = vd, *n = vn, *m = vm, *a = va;
 194
 195    for (i = 0; i < opr_sz; ++i) {
 196        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
 197    }
 198}
 199
 200void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
 201{
 202    intptr_t i, opr_sz = simd_oprsz(desc);
 203    int8_t *d = vd, *n = vn, *m = vm;
 204
 205    for (i = 0; i < opr_sz; ++i) {
 206        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false);
 207    }
 208}
 209
 210void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
 211{
 212    intptr_t i, opr_sz = simd_oprsz(desc);
 213    int8_t *d = vd, *n = vn, *m = vm;
 214
 215    for (i = 0; i < opr_sz; ++i) {
 216        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true);
 217    }
 218}
 219
 220/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
 221int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
 222                      bool neg, bool round, uint32_t *sat)
 223{
 224    /* Simplify similarly to do_sqrdmlah_b above.  */
 225    int32_t ret = (int32_t)src1 * src2;
 226    if (neg) {
 227        ret = -ret;
 228    }
 229    ret += ((int32_t)src3 << 15) + (round << 14);
 230    ret >>= 15;
 231
 232    if (ret != (int16_t)ret) {
 233        *sat = 1;
 234        ret = (ret < 0 ? INT16_MIN : INT16_MAX);
 235    }
 236    return ret;
 237}
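/*
 * The *sat argument accumulates a "saturation happened" flag.  The Neon
 * helpers below point it at env->vfp.qc[0], so that saturation becomes
 * visible in the cumulative QC flag; the SVE2 helpers point it at a
 * local variable whose value is simply discarded.
 */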
 238
 239uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
 240                                  uint32_t src2, uint32_t src3)
 241{
 242    uint32_t *sat = &env->vfp.qc[0];
 243    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat);
 244    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
 245                                false, true, sat);
 246    return deposit32(e1, 16, 16, e2);
 247}
 248
 249void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
 250                              void *vq, uint32_t desc)
 251{
 252    uintptr_t opr_sz = simd_oprsz(desc);
 253    int16_t *d = vd;
 254    int16_t *n = vn;
 255    int16_t *m = vm;
 256    uintptr_t i;
 257
 258    for (i = 0; i < opr_sz / 2; ++i) {
 259        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
 260    }
 261    clear_tail(d, opr_sz, simd_maxsz(desc));
 262}
 263
 264uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
 265                                  uint32_t src2, uint32_t src3)
 266{
 267    uint32_t *sat = &env->vfp.qc[0];
 268    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat);
 269    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
 270                                true, true, sat);
 271    return deposit32(e1, 16, 16, e2);
 272}
 273
 274void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
 275                              void *vq, uint32_t desc)
 276{
 277    uintptr_t opr_sz = simd_oprsz(desc);
 278    int16_t *d = vd;
 279    int16_t *n = vn;
 280    int16_t *m = vm;
 281    uintptr_t i;
 282
 283    for (i = 0; i < opr_sz / 2; ++i) {
 284        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq);
 285    }
 286    clear_tail(d, opr_sz, simd_maxsz(desc));
 287}
 288
 289void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm,
 290                            void *vq, uint32_t desc)
 291{
 292    intptr_t i, opr_sz = simd_oprsz(desc);
 293    int16_t *d = vd, *n = vn, *m = vm;
 294
 295    for (i = 0; i < opr_sz / 2; ++i) {
 296        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq);
 297    }
 298    clear_tail(d, opr_sz, simd_maxsz(desc));
 299}
 300
 301void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm,
 302                             void *vq, uint32_t desc)
 303{
 304    intptr_t i, opr_sz = simd_oprsz(desc);
 305    int16_t *d = vd, *n = vn, *m = vm;
 306
 307    for (i = 0; i < opr_sz / 2; ++i) {
 308        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq);
 309    }
 310    clear_tail(d, opr_sz, simd_maxsz(desc));
 311}
 312
 313void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
 314                             void *va, uint32_t desc)
 315{
 316    intptr_t i, opr_sz = simd_oprsz(desc);
 317    int16_t *d = vd, *n = vn, *m = vm, *a = va;
 318    uint32_t discard;
 319
 320    for (i = 0; i < opr_sz / 2; ++i) {
 321        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
 322    }
 323}
 324
 325void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
 326                             void *va, uint32_t desc)
 327{
 328    intptr_t i, opr_sz = simd_oprsz(desc);
 329    int16_t *d = vd, *n = vn, *m = vm, *a = va;
 330    uint32_t discard;
 331
 332    for (i = 0; i < opr_sz / 2; ++i) {
 333        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
 334    }
 335}
 336
 337void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
 338{
 339    intptr_t i, opr_sz = simd_oprsz(desc);
 340    int16_t *d = vd, *n = vn, *m = vm;
 341    uint32_t discard;
 342
 343    for (i = 0; i < opr_sz / 2; ++i) {
 344        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard);
 345    }
 346}
 347
 348void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
 349{
 350    intptr_t i, opr_sz = simd_oprsz(desc);
 351    int16_t *d = vd, *n = vn, *m = vm;
 352    uint32_t discard;
 353
 354    for (i = 0; i < opr_sz / 2; ++i) {
 355        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard);
 356    }
 357}
 358
 359void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
 360{
 361    intptr_t i, j, opr_sz = simd_oprsz(desc);
 362    int idx = simd_data(desc);
 363    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
 364    uint32_t discard;
 365
 366    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
 367        int16_t mm = m[i];
 368        for (j = 0; j < 16 / 2; ++j) {
 369            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard);
 370        }
 371    }
 372}
 373
 374void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
 375{
 376    intptr_t i, j, opr_sz = simd_oprsz(desc);
 377    int idx = simd_data(desc);
 378    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
 379    uint32_t discard;
 380
 381    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
 382        int16_t mm = m[i];
 383        for (j = 0; j < 16 / 2; ++j) {
 384            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard);
 385        }
 386    }
 387}
 388
 389/* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
 390int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
 391                      bool neg, bool round, uint32_t *sat)
 392{
 393    /* Simplify similarly to do_sqrdmlah_b above.  */
 394    int64_t ret = (int64_t)src1 * src2;
 395    if (neg) {
 396        ret = -ret;
 397    }
 398    ret += ((int64_t)src3 << 31) + (round << 30);
 399    ret >>= 31;
 400
 401    if (ret != (int32_t)ret) {
 402        *sat = 1;
 403        ret = (ret < 0 ? INT32_MIN : INT32_MAX);
 404    }
 405    return ret;
 406}
 407
 408uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
 409                                  int32_t src2, int32_t src3)
 410{
 411    uint32_t *sat = &env->vfp.qc[0];
 412    return do_sqrdmlah_s(src1, src2, src3, false, true, sat);
 413}
 414
 415void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
 416                              void *vq, uint32_t desc)
 417{
 418    uintptr_t opr_sz = simd_oprsz(desc);
 419    int32_t *d = vd;
 420    int32_t *n = vn;
 421    int32_t *m = vm;
 422    uintptr_t i;
 423
 424    for (i = 0; i < opr_sz / 4; ++i) {
 425        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq);
 426    }
 427    clear_tail(d, opr_sz, simd_maxsz(desc));
 428}
 429
 430uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
 431                                  int32_t src2, int32_t src3)
 432{
 433    uint32_t *sat = &env->vfp.qc[0];
 434    return do_sqrdmlah_s(src1, src2, src3, true, true, sat);
 435}
 436
 437void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
 438                              void *vq, uint32_t desc)
 439{
 440    uintptr_t opr_sz = simd_oprsz(desc);
 441    int32_t *d = vd;
 442    int32_t *n = vn;
 443    int32_t *m = vm;
 444    uintptr_t i;
 445
 446    for (i = 0; i < opr_sz / 4; ++i) {
 447        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq);
 448    }
 449    clear_tail(d, opr_sz, simd_maxsz(desc));
 450}
 451
 452void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm,
 453                            void *vq, uint32_t desc)
 454{
 455    intptr_t i, opr_sz = simd_oprsz(desc);
 456    int32_t *d = vd, *n = vn, *m = vm;
 457
 458    for (i = 0; i < opr_sz / 4; ++i) {
 459        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq);
 460    }
 461    clear_tail(d, opr_sz, simd_maxsz(desc));
 462}
 463
 464void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm,
 465                             void *vq, uint32_t desc)
 466{
 467    intptr_t i, opr_sz = simd_oprsz(desc);
 468    int32_t *d = vd, *n = vn, *m = vm;
 469
 470    for (i = 0; i < opr_sz / 4; ++i) {
 471        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq);
 472    }
 473    clear_tail(d, opr_sz, simd_maxsz(desc));
 474}
 475
 476void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
 477                             void *va, uint32_t desc)
 478{
 479    intptr_t i, opr_sz = simd_oprsz(desc);
 480    int32_t *d = vd, *n = vn, *m = vm, *a = va;
 481    uint32_t discard;
 482
 483    for (i = 0; i < opr_sz / 4; ++i) {
 484        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
 485    }
 486}
 487
 488void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
 489                             void *va, uint32_t desc)
 490{
 491    intptr_t i, opr_sz = simd_oprsz(desc);
 492    int32_t *d = vd, *n = vn, *m = vm, *a = va;
 493    uint32_t discard;
 494
 495    for (i = 0; i < opr_sz / 4; ++i) {
 496        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
 497    }
 498}
 499
 500void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
 501{
 502    intptr_t i, opr_sz = simd_oprsz(desc);
 503    int32_t *d = vd, *n = vn, *m = vm;
 504    uint32_t discard;
 505
 506    for (i = 0; i < opr_sz / 4; ++i) {
 507        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard);
 508    }
 509}
 510
 511void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
 512{
 513    intptr_t i, opr_sz = simd_oprsz(desc);
 514    int32_t *d = vd, *n = vn, *m = vm;
 515    uint32_t discard;
 516
 517    for (i = 0; i < opr_sz / 4; ++i) {
 518        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard);
 519    }
 520}
 521
 522void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
 523{
 524    intptr_t i, j, opr_sz = simd_oprsz(desc);
 525    int idx = simd_data(desc);
 526    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
 527    uint32_t discard;
 528
 529    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
 530        int32_t mm = m[i];
 531        for (j = 0; j < 16 / 4; ++j) {
 532            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard);
 533        }
 534    }
 535}
 536
 537void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
 538{
 539    intptr_t i, j, opr_sz = simd_oprsz(desc);
 540    int idx = simd_data(desc);
 541    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
 542    uint32_t discard;
 543
 544    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
 545        int32_t mm = m[i];
 546        for (j = 0; j < 16 / 4; ++j) {
 547            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard);
 548        }
 549    }
 550}
 551
 552/* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
 553static int64_t do_sat128_d(Int128 r)
 554{
 555    int64_t ls = int128_getlo(r);
 556    int64_t hs = int128_gethi(r);
 557
 558    if (unlikely(hs != (ls >> 63))) {
 559        return hs < 0 ? INT64_MIN : INT64_MAX;
 560    }
 561    return ls;
 562}
 563
 564int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round)
 565{
 566    uint64_t l, h;
 567    Int128 r, t;
 568
 569    /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
 570    muls64(&l, &h, m, n);
 571    r = int128_make128(l, h);
 572    if (neg) {
 573        r = int128_neg(r);
 574    }
 575    if (a) {
 576        t = int128_exts64(a);
 577        t = int128_lshift(t, 63);
 578        r = int128_add(r, t);
 579    }
 580    if (round) {
 581        t = int128_exts64(1ll << 62);
 582        r = int128_add(r, t);
 583    }
 584    r = int128_rshift(r, 63);
 585
 586    return do_sat128_d(r);
 587}
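/*
 * I.e. ((a << 64) + ((n * m) << 1) + (round << 63)) >> 64, evaluated as
 * ((a << 63) + n * m + (round << 62)) >> 63 in 128-bit arithmetic and
 * then saturated to 64 bits, the same simplification described for
 * do_sqrdmlah_b above.
 */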
 588
 589void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
 590                             void *va, uint32_t desc)
 591{
 592    intptr_t i, opr_sz = simd_oprsz(desc);
 593    int64_t *d = vd, *n = vn, *m = vm, *a = va;
 594
 595    for (i = 0; i < opr_sz / 8; ++i) {
 596        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
 597    }
 598}
 599
 600void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
 601                             void *va, uint32_t desc)
 602{
 603    intptr_t i, opr_sz = simd_oprsz(desc);
 604    int64_t *d = vd, *n = vn, *m = vm, *a = va;
 605
 606    for (i = 0; i < opr_sz / 8; ++i) {
 607        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
 608    }
 609}
 610
 611void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
 612{
 613    intptr_t i, opr_sz = simd_oprsz(desc);
 614    int64_t *d = vd, *n = vn, *m = vm;
 615
 616    for (i = 0; i < opr_sz / 8; ++i) {
 617        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false);
 618    }
 619}
 620
 621void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
 622{
 623    intptr_t i, opr_sz = simd_oprsz(desc);
 624    int64_t *d = vd, *n = vn, *m = vm;
 625
 626    for (i = 0; i < opr_sz / 8; ++i) {
 627        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true);
 628    }
 629}
 630
 631void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
 632{
 633    intptr_t i, j, opr_sz = simd_oprsz(desc);
 634    int idx = simd_data(desc);
 635    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
 636
 637    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
 638        int64_t mm = m[i];
 639        for (j = 0; j < 16 / 8; ++j) {
 640            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false);
 641        }
 642    }
 643}
 644
 645void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
 646{
 647    intptr_t i, j, opr_sz = simd_oprsz(desc);
 648    int idx = simd_data(desc);
 649    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
 650
 651    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
 652        int64_t mm = m[i];
 653        for (j = 0; j < 16 / 8; ++j) {
 654            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true);
 655        }
 656    }
 657}
 658
 659/* Integer 8 and 16-bit dot-product.
 660 *
 661 * Note that for the loops herein, host endianness does not matter
 662 * with respect to the ordering of data within the quad-width lanes.
 663 * All elements are treated equally, no matter where they are.
 664 */
 665
 666#define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \
 667void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
 668{                                                                         \
 669    intptr_t i, opr_sz = simd_oprsz(desc);                                \
 670    TYPED *d = vd, *a = va;                                               \
 671    TYPEN *n = vn;                                                        \
 672    TYPEM *m = vm;                                                        \
 673    for (i = 0; i < opr_sz / sizeof(TYPED); ++i) {                        \
 674        d[i] = (a[i] +                                                    \
 675                (TYPED)n[i * 4 + 0] * m[i * 4 + 0] +                      \
 676                (TYPED)n[i * 4 + 1] * m[i * 4 + 1] +                      \
 677                (TYPED)n[i * 4 + 2] * m[i * 4 + 2] +                      \
 678                (TYPED)n[i * 4 + 3] * m[i * 4 + 3]);                      \
 679    }                                                                     \
 680    clear_tail(d, opr_sz, simd_maxsz(desc));                              \
 681}
 682
 683DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t)
 684DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t)
 685DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t)
 686DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t)
 687DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t)
 688
 689#define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \
 690void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
 691{                                                                         \
 692    intptr_t i = 0, opr_sz = simd_oprsz(desc);                            \
 693    intptr_t opr_sz_n = opr_sz / sizeof(TYPED);                           \
 694    intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n);                  \
 695    intptr_t index = simd_data(desc);                                     \
 696    TYPED *d = vd, *a = va;                                               \
 697    TYPEN *n = vn;                                                        \
 698    TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4;                       \
 699    do {                                                                  \
 700        TYPED m0 = m_indexed[i * 4 + 0];                                  \
 701        TYPED m1 = m_indexed[i * 4 + 1];                                  \
 702        TYPED m2 = m_indexed[i * 4 + 2];                                  \
 703        TYPED m3 = m_indexed[i * 4 + 3];                                  \
 704        do {                                                              \
 705            d[i] = (a[i] +                                                \
 706                    n[i * 4 + 0] * m0 +                                   \
 707                    n[i * 4 + 1] * m1 +                                   \
 708                    n[i * 4 + 2] * m2 +                                   \
 709                    n[i * 4 + 3] * m3);                                   \
 710        } while (++i < segend);                                           \
 711        segend = i + 4;                                                   \
 712    } while (i < opr_sz_n);                                               \
 713    clear_tail(d, opr_sz, simd_maxsz(desc));                              \
 714}
 715
 716DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4)
 717DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4)
 718DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4)
 719DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4)
 720DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8)
 721DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8)
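/*
 * For the indexed forms, the index selects one aligned group of four
 * elements within each 16-byte segment of vm.  E.g. for gvec_sdot_idx_b
 * with index 2 on a 32-byte vector (ignoring the host-endian HD()
 * adjustment), d[0..3] all use bytes m[8..11] of the first segment and
 * d[4..7] all use bytes m[24..27] of the second.
 */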
 722
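/*
 * Floating-point complex add (FCADD).  Elements are (real, imag) pairs;
 * depending on the single data bit in desc, each pair computes either
 * d[2k] = n[2k] - m[2k+1], d[2k+1] = n[2k+1] + m[2k], or the same with
 * the two signs swapped, corresponding to the two rotations the
 * instruction encodes.
 */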
 723void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
 724                         void *vfpst, uint32_t desc)
 725{
 726    uintptr_t opr_sz = simd_oprsz(desc);
 727    float16 *d = vd;
 728    float16 *n = vn;
 729    float16 *m = vm;
 730    float_status *fpst = vfpst;
 731    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
 732    uint32_t neg_imag = neg_real ^ 1;
 733    uintptr_t i;
 734
 735    /* Shift boolean to the sign bit so we can xor to negate.  */
 736    neg_real <<= 15;
 737    neg_imag <<= 15;
 738
 739    for (i = 0; i < opr_sz / 2; i += 2) {
 740        float16 e0 = n[H2(i)];
 741        float16 e1 = m[H2(i + 1)] ^ neg_imag;
 742        float16 e2 = n[H2(i + 1)];
 743        float16 e3 = m[H2(i)] ^ neg_real;
 744
 745        d[H2(i)] = float16_add(e0, e1, fpst);
 746        d[H2(i + 1)] = float16_add(e2, e3, fpst);
 747    }
 748    clear_tail(d, opr_sz, simd_maxsz(desc));
 749}
 750
 751void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
 752                         void *vfpst, uint32_t desc)
 753{
 754    uintptr_t opr_sz = simd_oprsz(desc);
 755    float32 *d = vd;
 756    float32 *n = vn;
 757    float32 *m = vm;
 758    float_status *fpst = vfpst;
 759    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
 760    uint32_t neg_imag = neg_real ^ 1;
 761    uintptr_t i;
 762
 763    /* Shift boolean to the sign bit so we can xor to negate.  */
 764    neg_real <<= 31;
 765    neg_imag <<= 31;
 766
 767    for (i = 0; i < opr_sz / 4; i += 2) {
 768        float32 e0 = n[H4(i)];
 769        float32 e1 = m[H4(i + 1)] ^ neg_imag;
 770        float32 e2 = n[H4(i + 1)];
 771        float32 e3 = m[H4(i)] ^ neg_real;
 772
 773        d[H4(i)] = float32_add(e0, e1, fpst);
 774        d[H4(i + 1)] = float32_add(e2, e3, fpst);
 775    }
 776    clear_tail(d, opr_sz, simd_maxsz(desc));
 777}
 778
 779void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
 780                         void *vfpst, uint32_t desc)
 781{
 782    uintptr_t opr_sz = simd_oprsz(desc);
 783    float64 *d = vd;
 784    float64 *n = vn;
 785    float64 *m = vm;
 786    float_status *fpst = vfpst;
 787    uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1);
 788    uint64_t neg_imag = neg_real ^ 1;
 789    uintptr_t i;
 790
 791    /* Shift boolean to the sign bit so we can xor to negate.  */
 792    neg_real <<= 63;
 793    neg_imag <<= 63;
 794
 795    for (i = 0; i < opr_sz / 8; i += 2) {
 796        float64 e0 = n[i];
 797        float64 e1 = m[i + 1] ^ neg_imag;
 798        float64 e2 = n[i + 1];
 799        float64 e3 = m[i] ^ neg_real;
 800
 801        d[i] = float64_add(e0, e1, fpst);
 802        d[i + 1] = float64_add(e2, e3, fpst);
 803    }
 804    clear_tail(d, opr_sz, simd_maxsz(desc));
 805}
 806
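/*
 * Floating-point complex multiply-accumulate (FCMLA).  The two data bits
 * in desc encode the rotation: 'flip' selects whether the real or the
 * imaginary part of n is used as the multiplicand, and neg_real/neg_imag
 * give the signs applied to the parts of m.  A full complex multiply is
 * built up from two FCMLA instructions with different rotations
 * accumulating into the same destination.
 */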
 807void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va,
 808                         void *vfpst, uint32_t desc)
 809{
 810    uintptr_t opr_sz = simd_oprsz(desc);
 811    float16 *d = vd, *n = vn, *m = vm, *a = va;
 812    float_status *fpst = vfpst;
 813    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
 814    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
 815    uint32_t neg_real = flip ^ neg_imag;
 816    uintptr_t i;
 817
 818    /* Shift boolean to the sign bit so we can xor to negate.  */
 819    neg_real <<= 15;
 820    neg_imag <<= 15;
 821
 822    for (i = 0; i < opr_sz / 2; i += 2) {
 823        float16 e2 = n[H2(i + flip)];
 824        float16 e1 = m[H2(i + flip)] ^ neg_real;
 825        float16 e4 = e2;
 826        float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag;
 827
 828        d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], 0, fpst);
 829        d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], 0, fpst);
 830    }
 831    clear_tail(d, opr_sz, simd_maxsz(desc));
 832}
 833
 834void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va,
 835                             void *vfpst, uint32_t desc)
 836{
 837    uintptr_t opr_sz = simd_oprsz(desc);
 838    float16 *d = vd, *n = vn, *m = vm, *a = va;
 839    float_status *fpst = vfpst;
 840    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
 841    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
 842    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
 843    uint32_t neg_real = flip ^ neg_imag;
 844    intptr_t elements = opr_sz / sizeof(float16);
 845    intptr_t eltspersegment = 16 / sizeof(float16);
 846    intptr_t i, j;
 847
 848    /* Shift boolean to the sign bit so we can xor to negate.  */
 849    neg_real <<= 15;
 850    neg_imag <<= 15;
 851
 852    for (i = 0; i < elements; i += eltspersegment) {
 853        float16 mr = m[H2(i + 2 * index + 0)];
 854        float16 mi = m[H2(i + 2 * index + 1)];
 855        float16 e1 = neg_real ^ (flip ? mi : mr);
 856        float16 e3 = neg_imag ^ (flip ? mr : mi);
 857
 858        for (j = i; j < i + eltspersegment; j += 2) {
 859            float16 e2 = n[H2(j + flip)];
 860            float16 e4 = e2;
 861
 862            d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst);
 863            d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst);
 864        }
 865    }
 866    clear_tail(d, opr_sz, simd_maxsz(desc));
 867}
 868
 869void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va,
 870                         void *vfpst, uint32_t desc)
 871{
 872    uintptr_t opr_sz = simd_oprsz(desc);
 873    float32 *d = vd, *n = vn, *m = vm, *a = va;
 874    float_status *fpst = vfpst;
 875    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
 876    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
 877    uint32_t neg_real = flip ^ neg_imag;
 878    uintptr_t i;
 879
 880    /* Shift boolean to the sign bit so we can xor to negate.  */
 881    neg_real <<= 31;
 882    neg_imag <<= 31;
 883
 884    for (i = 0; i < opr_sz / 4; i += 2) {
 885        float32 e2 = n[H4(i + flip)];
 886        float32 e1 = m[H4(i + flip)] ^ neg_real;
 887        float32 e4 = e2;
 888        float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag;
 889
 890        d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], 0, fpst);
 891        d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], 0, fpst);
 892    }
 893    clear_tail(d, opr_sz, simd_maxsz(desc));
 894}
 895
 896void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va,
 897                             void *vfpst, uint32_t desc)
 898{
 899    uintptr_t opr_sz = simd_oprsz(desc);
 900    float32 *d = vd, *n = vn, *m = vm, *a = va;
 901    float_status *fpst = vfpst;
 902    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
 903    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
 904    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
 905    uint32_t neg_real = flip ^ neg_imag;
 906    intptr_t elements = opr_sz / sizeof(float32);
 907    intptr_t eltspersegment = 16 / sizeof(float32);
 908    intptr_t i, j;
 909
 910    /* Shift boolean to the sign bit so we can xor to negate.  */
 911    neg_real <<= 31;
 912    neg_imag <<= 31;
 913
 914    for (i = 0; i < elements; i += eltspersegment) {
 915        float32 mr = m[H4(i + 2 * index + 0)];
 916        float32 mi = m[H4(i + 2 * index + 1)];
 917        float32 e1 = neg_real ^ (flip ? mi : mr);
 918        float32 e3 = neg_imag ^ (flip ? mr : mi);
 919
 920        for (j = i; j < i + eltspersegment; j += 2) {
 921            float32 e2 = n[H4(j + flip)];
 922            float32 e4 = e2;
 923
 924            d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst);
 925            d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst);
 926        }
 927    }
 928    clear_tail(d, opr_sz, simd_maxsz(desc));
 929}
 930
 931void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va,
 932                         void *vfpst, uint32_t desc)
 933{
 934    uintptr_t opr_sz = simd_oprsz(desc);
 935    float64 *d = vd, *n = vn, *m = vm, *a = va;
 936    float_status *fpst = vfpst;
 937    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
 938    uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
 939    uint64_t neg_real = flip ^ neg_imag;
 940    uintptr_t i;
 941
 942    /* Shift boolean to the sign bit so we can xor to negate.  */
 943    neg_real <<= 63;
 944    neg_imag <<= 63;
 945
 946    for (i = 0; i < opr_sz / 8; i += 2) {
 947        float64 e2 = n[i + flip];
 948        float64 e1 = m[i + flip] ^ neg_real;
 949        float64 e4 = e2;
 950        float64 e3 = m[i + 1 - flip] ^ neg_imag;
 951
 952        d[i] = float64_muladd(e2, e1, a[i], 0, fpst);
 953        d[i + 1] = float64_muladd(e4, e3, a[i + 1], 0, fpst);
 954    }
 955    clear_tail(d, opr_sz, simd_maxsz(desc));
 956}
 957
 958/*
 959 * Floating point comparisons producing an integer result (all 1s or all 0s).
 960 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
 961 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
 962 */
 963static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat)
 964{
 965    return -float16_eq_quiet(op1, op2, stat);
 966}
 967
 968static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat)
 969{
 970    return -float32_eq_quiet(op1, op2, stat);
 971}
 972
 973static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat)
 974{
 975    return -float16_le(op2, op1, stat);
 976}
 977
 978static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat)
 979{
 980    return -float32_le(op2, op1, stat);
 981}
 982
 983static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat)
 984{
 985    return -float16_lt(op2, op1, stat);
 986}
 987
 988static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat)
 989{
 990    return -float32_lt(op2, op1, stat);
 991}
 992
 993static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat)
 994{
 995    return -float16_le(float16_abs(op2), float16_abs(op1), stat);
 996}
 997
 998static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat)
 999{
1000    return -float32_le(float32_abs(op2), float32_abs(op1), stat);
1001}
1002
1003static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat)
1004{
1005    return -float16_lt(float16_abs(op2), float16_abs(op1), stat);
1006}
1007
1008static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat)
1009{
1010    return -float32_lt(float32_abs(op2), float32_abs(op1), stat);
1011}
1012
1013static int16_t vfp_tosszh(float16 x, void *fpstp)
1014{
1015    float_status *fpst = fpstp;
1016    if (float16_is_any_nan(x)) {
1017        float_raise(float_flag_invalid, fpst);
1018        return 0;
1019    }
1020    return float16_to_int16_round_to_zero(x, fpst);
1021}
1022
1023static uint16_t vfp_touszh(float16 x, void *fpstp)
1024{
1025    float_status *fpst = fpstp;
1026    if (float16_is_any_nan(x)) {
1027        float_raise(float_flag_invalid, fpst);
1028        return 0;
1029    }
1030    return float16_to_uint16_round_to_zero(x, fpst);
1031}
1032
1033#define DO_2OP(NAME, FUNC, TYPE) \
1034void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)  \
1035{                                                                 \
1036    intptr_t i, oprsz = simd_oprsz(desc);                         \
1037    TYPE *d = vd, *n = vn;                                        \
1038    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                  \
1039        d[i] = FUNC(n[i], stat);                                  \
1040    }                                                             \
1041    clear_tail(d, oprsz, simd_maxsz(desc));                       \
1042}
1043
1044DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
1045DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
1046DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)
1047
1048DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
1049DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
1050DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)
1051
1052DO_2OP(gvec_vrintx_h, float16_round_to_int, float16)
1053DO_2OP(gvec_vrintx_s, float32_round_to_int, float32)
1054
1055DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t)
1056DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t)
1057DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32)
1058DO_2OP(gvec_touizs, helper_vfp_touizs, float32)
1059DO_2OP(gvec_sstoh, int16_to_float16, int16_t)
1060DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t)
1061DO_2OP(gvec_tosszh, vfp_tosszh, float16)
1062DO_2OP(gvec_touszh, vfp_touszh, float16)
1063
1064#define WRAP_CMP0_FWD(FN, CMPOP, TYPE)                          \
1065    static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)     \
1066    {                                                           \
1067        return TYPE##_##CMPOP(op, TYPE##_zero, stat);           \
1068    }
1069
1070#define WRAP_CMP0_REV(FN, CMPOP, TYPE)                          \
1071    static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)    \
1072    {                                                           \
1073        return TYPE##_##CMPOP(TYPE##_zero, op, stat);           \
1074    }
1075
1076#define DO_2OP_CMP0(FN, CMPOP, DIRN)                    \
1077    WRAP_CMP0_##DIRN(FN, CMPOP, float16)                \
1078    WRAP_CMP0_##DIRN(FN, CMPOP, float32)                \
1079    DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16)   \
1080    DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32)
1081
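/*
 * E.g. DO_2OP_CMP0(clt, cgt, REV) defines float16_clt0()/float32_clt0()
 * as the cgt comparisons above with the operands reversed (op < 0 iff
 * 0 > op), and then instantiates DO_2OP to produce the gvec_fclt0_h and
 * gvec_fclt0_s helpers.
 */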
1082DO_2OP_CMP0(cgt, cgt, FWD)
1083DO_2OP_CMP0(cge, cge, FWD)
1084DO_2OP_CMP0(ceq, ceq, FWD)
1085DO_2OP_CMP0(clt, cgt, REV)
1086DO_2OP_CMP0(cle, cge, REV)
1087
1088#undef DO_2OP
1089#undef DO_2OP_CMP0
1090
1091/* Floating-point trigonometric starting value.
1092 * See the ARM ARM pseudocode function FPTrigSMul.
1093 */
1094static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
1095{
1096    float16 result = float16_mul(op1, op1, stat);
1097    if (!float16_is_any_nan(result)) {
1098        result = float16_set_sign(result, op2 & 1);
1099    }
1100    return result;
1101}
1102
1103static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
1104{
1105    float32 result = float32_mul(op1, op1, stat);
1106    if (!float32_is_any_nan(result)) {
1107        result = float32_set_sign(result, op2 & 1);
1108    }
1109    return result;
1110}
1111
1112static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
1113{
1114    float64 result = float64_mul(op1, op1, stat);
1115    if (!float64_is_any_nan(result)) {
1116        result = float64_set_sign(result, op2 & 1);
1117    }
1118    return result;
1119}
1120
1121static float16 float16_abd(float16 op1, float16 op2, float_status *stat)
1122{
1123    return float16_abs(float16_sub(op1, op2, stat));
1124}
1125
1126static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
1127{
1128    return float32_abs(float32_sub(op1, op2, stat));
1129}
1130
1131/*
1132 * Reciprocal step. These are the AArch32 version which uses a
1133 * non-fused multiply-and-subtract.
1134 */
1135static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat)
1136{
1137    op1 = float16_squash_input_denormal(op1, stat);
1138    op2 = float16_squash_input_denormal(op2, stat);
1139
1140    if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
1141        (float16_is_infinity(op2) && float16_is_zero(op1))) {
1142        return float16_two;
1143    }
1144    return float16_sub(float16_two, float16_mul(op1, op2, stat), stat);
1145}
1146
1147static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat)
1148{
1149    op1 = float32_squash_input_denormal(op1, stat);
1150    op2 = float32_squash_input_denormal(op2, stat);
1151
1152    if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
1153        (float32_is_infinity(op2) && float32_is_zero(op1))) {
1154        return float32_two;
1155    }
1156    return float32_sub(float32_two, float32_mul(op1, op2, stat), stat);
1157}
1158
1159/* Reciprocal square-root step. AArch32 non-fused semantics. */
1160static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat)
1161{
1162    op1 = float16_squash_input_denormal(op1, stat);
1163    op2 = float16_squash_input_denormal(op2, stat);
1164
1165    if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
1166        (float16_is_infinity(op2) && float16_is_zero(op1))) {
1167        return float16_one_point_five;
1168    }
1169    op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat);
1170    return float16_div(op1, float16_two, stat);
1171}
1172
1173static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat)
1174{
1175    op1 = float32_squash_input_denormal(op1, stat);
1176    op2 = float32_squash_input_denormal(op2, stat);
1177
1178    if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
1179        (float32_is_infinity(op2) && float32_is_zero(op1))) {
1180        return float32_one_point_five;
1181    }
1182    op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat);
1183    return float32_div(op1, float32_two, stat);
1184}
1185
1186#define DO_3OP(NAME, FUNC, TYPE) \
1187void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1188{                                                                          \
1189    intptr_t i, oprsz = simd_oprsz(desc);                                  \
1190    TYPE *d = vd, *n = vn, *m = vm;                                        \
1191    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
1192        d[i] = FUNC(n[i], m[i], stat);                                     \
1193    }                                                                      \
1194    clear_tail(d, oprsz, simd_maxsz(desc));                                \
1195}
1196
1197DO_3OP(gvec_fadd_h, float16_add, float16)
1198DO_3OP(gvec_fadd_s, float32_add, float32)
1199DO_3OP(gvec_fadd_d, float64_add, float64)
1200
1201DO_3OP(gvec_fsub_h, float16_sub, float16)
1202DO_3OP(gvec_fsub_s, float32_sub, float32)
1203DO_3OP(gvec_fsub_d, float64_sub, float64)
1204
1205DO_3OP(gvec_fmul_h, float16_mul, float16)
1206DO_3OP(gvec_fmul_s, float32_mul, float32)
1207DO_3OP(gvec_fmul_d, float64_mul, float64)
1208
1209DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
1210DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
1211DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)
1212
1213DO_3OP(gvec_fabd_h, float16_abd, float16)
1214DO_3OP(gvec_fabd_s, float32_abd, float32)
1215
1216DO_3OP(gvec_fceq_h, float16_ceq, float16)
1217DO_3OP(gvec_fceq_s, float32_ceq, float32)
1218
1219DO_3OP(gvec_fcge_h, float16_cge, float16)
1220DO_3OP(gvec_fcge_s, float32_cge, float32)
1221
1222DO_3OP(gvec_fcgt_h, float16_cgt, float16)
1223DO_3OP(gvec_fcgt_s, float32_cgt, float32)
1224
1225DO_3OP(gvec_facge_h, float16_acge, float16)
1226DO_3OP(gvec_facge_s, float32_acge, float32)
1227
1228DO_3OP(gvec_facgt_h, float16_acgt, float16)
1229DO_3OP(gvec_facgt_s, float32_acgt, float32)
1230
1231DO_3OP(gvec_fmax_h, float16_max, float16)
1232DO_3OP(gvec_fmax_s, float32_max, float32)
1233
1234DO_3OP(gvec_fmin_h, float16_min, float16)
1235DO_3OP(gvec_fmin_s, float32_min, float32)
1236
1237DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16)
1238DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32)
1239
1240DO_3OP(gvec_fminnum_h, float16_minnum, float16)
1241DO_3OP(gvec_fminnum_s, float32_minnum, float32)
1242
1243DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16)
1244DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32)
1245
1246DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16)
1247DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32)
1248
1249#ifdef TARGET_AARCH64
1250
1251DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
1252DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
1253DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)
1254
1255DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
1256DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
1257DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)
1258
1259#endif
1260#undef DO_3OP
1261
1262/* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */
1263static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2,
1264                                 float_status *stat)
1265{
1266    return float16_add(dest, float16_mul(op1, op2, stat), stat);
1267}
1268
1269static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2,
1270                                 float_status *stat)
1271{
1272    return float32_add(dest, float32_mul(op1, op2, stat), stat);
1273}
1274
1275static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2,
1276                                 float_status *stat)
1277{
1278    return float16_sub(dest, float16_mul(op1, op2, stat), stat);
1279}
1280
1281static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
1282                                 float_status *stat)
1283{
1284    return float32_sub(dest, float32_mul(op1, op2, stat), stat);
1285}
1286
1287/* Fused versions; these have the semantics Neon VFMA/VFMS want */
1288static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
1289                                float_status *stat)
1290{
1291    return float16_muladd(op1, op2, dest, 0, stat);
1292}
1293
1294static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
1295                                 float_status *stat)
1296{
1297    return float32_muladd(op1, op2, dest, 0, stat);
1298}
1299
1300static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
1301                                 float_status *stat)
1302{
1303    return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
1304}
1305
1306static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
1307                                 float_status *stat)
1308{
1309    return float32_muladd(float32_chs(op1), op2, dest, 0, stat);
1310}
1311
1312#define DO_MULADD(NAME, FUNC, TYPE)                                     \
1313void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1314{                                                                          \
1315    intptr_t i, oprsz = simd_oprsz(desc);                                  \
1316    TYPE *d = vd, *n = vn, *m = vm;                                        \
1317    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
1318        d[i] = FUNC(d[i], n[i], m[i], stat);                               \
1319    }                                                                      \
1320    clear_tail(d, oprsz, simd_maxsz(desc));                                \
1321}
1322
1323DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16)
1324DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32)
1325
1326DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16)
1327DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32)
1328
1329DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
1330DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)
1331
1332DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
1333DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
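/*
 * The _nf helpers above give the Neon VMLA/VMLS semantics: the product is
 * rounded and then separately added to or subtracted from the destination.
 * The fused helpers give VFMA/VFMS semantics: a single rounding, with the
 * subtracting form negating the product rather than the accumulator.
 */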
1334
1335/* For the indexed ops, SVE applies the index per 128-bit vector segment.
1336 * For AdvSIMD, there is of course only one such vector segment.
1337 */
1338
1339#define DO_MUL_IDX(NAME, TYPE, H) \
1340void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1341{                                                                          \
1342    intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1343    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1344    intptr_t idx = simd_data(desc);                                        \
1345    TYPE *d = vd, *n = vn, *m = vm;                                        \
1346    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1347        TYPE mm = m[H(i + idx)];                                           \
1348        for (j = 0; j < segment; j++) {                                    \
1349            d[i + j] = n[i + j] * mm;                                      \
1350        }                                                                  \
1351    }                                                                      \
1352    clear_tail(d, oprsz, simd_maxsz(desc));                                \
1353}
1354
1355DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2)
1356DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4)
1357DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8)
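/*
 * E.g. for gvec_mul_idx_s with idx 1 on a 32-byte vector, the first
 * 16-byte segment computes d[0..3] = n[0..3] * m[1] and the second
 * computes d[4..7] = n[4..7] * m[5] (modulo the host-endian H()
 * adjustment).
 */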
1358
1359#undef DO_MUL_IDX
1360
1361#define DO_MLA_IDX(NAME, TYPE, OP, H) \
1362void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)   \
1363{                                                                          \
1364    intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1365    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1366    intptr_t idx = simd_data(desc);                                        \
1367    TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
1368    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1369        TYPE mm = m[H(i + idx)];                                           \
1370        for (j = 0; j < segment; j++) {                                    \
1371            d[i + j] = a[i + j] OP n[i + j] * mm;                          \
1372        }                                                                  \
1373    }                                                                      \
1374    clear_tail(d, oprsz, simd_maxsz(desc));                                \
1375}
1376
1377DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2)
1378DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4)
1379DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8)
1380
1381DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
1382DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
1383DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8)
1384
1385#undef DO_MLA_IDX
1386
1387#define DO_FMUL_IDX(NAME, ADD, TYPE, H)                                    \
1388void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1389{                                                                          \
1390    intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1391    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1392    intptr_t idx = simd_data(desc);                                        \
1393    TYPE *d = vd, *n = vn, *m = vm;                                        \
1394    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1395        TYPE mm = m[H(i + idx)];                                           \
1396        for (j = 0; j < segment; j++) {                                    \
1397            d[i + j] = TYPE##_##ADD(d[i + j],                              \
1398                                    TYPE##_mul(n[i + j], mm, stat), stat); \
1399        }                                                                  \
1400    }                                                                      \
1401    clear_tail(d, oprsz, simd_maxsz(desc));                                \
1402}
1403
1404#define float16_nop(N, M, S) (M)
1405#define float32_nop(N, M, S) (M)
1406#define float64_nop(N, M, S) (M)
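/*
 * With the "nop" accumulate step, the DO_FMUL_IDX expansion reduces to
 * d[i + j] = n[i + j] * mm, i.e. a plain by-element multiply.
 */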
1407
1408DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16, H2)
1409DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32, H4)
1410DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64, H8)
1411
1412/*
1413 * Non-fused multiply-accumulate operations, for Neon. NB that unlike
1414 * the fused ops below they assume accumulate both from and into Vd.
1415 */
1416DO_FMUL_IDX(gvec_fmla_nf_idx_h, add, float16, H2)
1417DO_FMUL_IDX(gvec_fmla_nf_idx_s, add, float32, H4)
1418DO_FMUL_IDX(gvec_fmls_nf_idx_h, sub, float16, H2)
1419DO_FMUL_IDX(gvec_fmls_nf_idx_s, sub, float32, H4)
1420
1421#undef float16_nop
1422#undef float32_nop
1423#undef float64_nop
1424#undef DO_FMUL_IDX
1425
1426#define DO_FMLA_IDX(NAME, TYPE, H)                                         \
1427void HELPER(NAME)(void *vd, void *vn, void *vm, void *va,                  \
1428                  void *stat, uint32_t desc)                               \
1429{                                                                          \
1430    intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1431    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1432    TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1);                    \
1433    intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1);                          \
1434    TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
1435    op1_neg <<= (8 * sizeof(TYPE) - 1);                                    \
1436    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1437        TYPE mm = m[H(i + idx)];                                           \
1438        for (j = 0; j < segment; j++) {                                    \
1439            d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg,                   \
1440                                     mm, a[i + j], 0, stat);               \
1441        }                                                                  \
1442    }                                                                      \
1443    clear_tail(d, oprsz, simd_maxsz(desc));                                \
1444}
1445
1446DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2)
1447DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
1448DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8)
1449
1450#undef DO_FMLA_IDX
1451
1452#define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \
1453void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc)   \
1454{                                                                          \
1455    intptr_t i, oprsz = simd_oprsz(desc);                                  \
1456    TYPEN *d = vd, *n = vn; TYPEM *m = vm;                                 \
1457    bool q = false;                                                        \
1458    for (i = 0; i < oprsz / sizeof(TYPEN); i++) {                          \
1459        WTYPE dd = (WTYPE)n[i] OP m[i];                                    \
1460        if (dd < MIN) {                                                    \
1461            dd = MIN;                                                      \
1462            q = true;                                                      \
1463        } else if (dd > MAX) {                                             \
1464            dd = MAX;                                                      \
1465            q = true;                                                      \
1466        }                                                                  \
1467        d[i] = dd;                                                         \
1468    }                                                                      \
1469    if (q) {                                                               \
1470        uint32_t *qc = vq;                                                 \
1471        qc[0] = 1;                                                         \
1472    }                                                                      \
1473    clear_tail(d, oprsz, simd_maxsz(desc));                                \
1474}
1475
1476DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX)
1477DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX)
1478DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX)
1479
1480DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX)
1481DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX)
1482DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX)
1483
1484DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX)
1485DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX)
1486DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX)
1487
1488DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX)
1489DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX)
1490DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX)
1491
1492#undef DO_SAT
1493
1494void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn,
1495                          void *vm, uint32_t desc)
1496{
1497    intptr_t i, oprsz = simd_oprsz(desc);
1498    uint64_t *d = vd, *n = vn, *m = vm;
1499    bool q = false;
1500
1501    for (i = 0; i < oprsz / 8; i++) {
1502        uint64_t nn = n[i], mm = m[i], dd = nn + mm;
1503        if (dd < nn) {
1504            dd = UINT64_MAX;
1505            q = true;
1506        }
1507        d[i] = dd;
1508    }
1509    if (q) {
1510        uint32_t *qc = vq;
1511        qc[0] = 1;
1512    }
1513    clear_tail(d, oprsz, simd_maxsz(desc));
1514}
1515
1516void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn,
1517                          void *vm, uint32_t desc)
1518{
1519    intptr_t i, oprsz = simd_oprsz(desc);
1520    uint64_t *d = vd, *n = vn, *m = vm;
1521    bool q = false;
1522
1523    for (i = 0; i < oprsz / 8; i++) {
1524        uint64_t nn = n[i], mm = m[i], dd = nn - mm;
1525        if (nn < mm) {
1526            dd = 0;
1527            q = true;
1528        }
1529        d[i] = dd;
1530    }
1531    if (q) {
1532        uint32_t *qc = vq;
1533        qc[0] = 1;
1534    }
1535    clear_tail(d, oprsz, simd_maxsz(desc));
1536}
1537
1538void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn,
1539                          void *vm, uint32_t desc)
1540{
1541    intptr_t i, oprsz = simd_oprsz(desc);
1542    int64_t *d = vd, *n = vn, *m = vm;
1543    bool q = false;
1544
1545    for (i = 0; i < oprsz / 8; i++) {
1546        int64_t nn = n[i], mm = m[i], dd = nn + mm;
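        /* Signed overflow iff nn and mm have the same sign but dd does not. */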
1547        if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) {
1548            dd = (nn >> 63) ^ ~INT64_MIN;
1549            q = true;
1550        }
1551        d[i] = dd;
1552    }
1553    if (q) {
1554        uint32_t *qc = vq;
1555        qc[0] = 1;
1556    }
1557    clear_tail(d, oprsz, simd_maxsz(desc));
1558}
1559
1560void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn,
1561                          void *vm, uint32_t desc)
1562{
1563    intptr_t i, oprsz = simd_oprsz(desc);
1564    int64_t *d = vd, *n = vn, *m = vm;
1565    bool q = false;
1566
1567    for (i = 0; i < oprsz / 8; i++) {
1568        int64_t nn = n[i], mm = m[i], dd = nn - mm;
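        /* Signed overflow iff nn and mm differ in sign and dd does not take the sign of nn. */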
1569        if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) {
1570            dd = (nn >> 63) ^ ~INT64_MIN;
1571            q = true;
1572        }
1573        d[i] = dd;
1574    }
1575    if (q) {
1576        uint32_t *qc = vq;
1577        qc[0] = 1;
1578    }
1579    clear_tail(d, oprsz, simd_maxsz(desc));
1580}
1581
1582
1583#define DO_SRA(NAME, TYPE)                              \
1584void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1585{                                                       \
1586    intptr_t i, oprsz = simd_oprsz(desc);               \
1587    int shift = simd_data(desc);                        \
1588    TYPE *d = vd, *n = vn;                              \
1589    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1590        d[i] += n[i] >> shift;                          \
1591    }                                                   \
1592    clear_tail(d, oprsz, simd_maxsz(desc));             \
1593}
1594
1595DO_SRA(gvec_ssra_b, int8_t)
1596DO_SRA(gvec_ssra_h, int16_t)
1597DO_SRA(gvec_ssra_s, int32_t)
1598DO_SRA(gvec_ssra_d, int64_t)
1599
1600DO_SRA(gvec_usra_b, uint8_t)
1601DO_SRA(gvec_usra_h, uint16_t)
1602DO_SRA(gvec_usra_s, uint32_t)
1603DO_SRA(gvec_usra_d, uint64_t)
1604
1605#undef DO_SRA
1606
1607#define DO_RSHR(NAME, TYPE)                             \
1608void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1609{                                                       \
1610    intptr_t i, oprsz = simd_oprsz(desc);               \
1611    int shift = simd_data(desc);                        \
1612    TYPE *d = vd, *n = vn;                              \
1613    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1614        TYPE tmp = n[i] >> (shift - 1);                 \
1615        d[i] = (tmp >> 1) + (tmp & 1);                  \
1616    }                                                   \
1617    clear_tail(d, oprsz, simd_maxsz(desc));             \
1618}
1619
1620DO_RSHR(gvec_srshr_b, int8_t)
1621DO_RSHR(gvec_srshr_h, int16_t)
1622DO_RSHR(gvec_srshr_s, int32_t)
1623DO_RSHR(gvec_srshr_d, int64_t)
1624
1625DO_RSHR(gvec_urshr_b, uint8_t)
1626DO_RSHR(gvec_urshr_h, uint16_t)
1627DO_RSHR(gvec_urshr_s, uint32_t)
1628DO_RSHR(gvec_urshr_d, uint64_t)
1629
1630#undef DO_RSHR
1631
1632#define DO_RSRA(NAME, TYPE)                             \
1633void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1634{                                                       \
1635    intptr_t i, oprsz = simd_oprsz(desc);               \
1636    int shift = simd_data(desc);                        \
1637    TYPE *d = vd, *n = vn;                              \
1638    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1639        TYPE tmp = n[i] >> (shift - 1);                 \
1640        d[i] += (tmp >> 1) + (tmp & 1);                 \
1641    }                                                   \
1642    clear_tail(d, oprsz, simd_maxsz(desc));             \
1643}
1644
1645DO_RSRA(gvec_srsra_b, int8_t)
1646DO_RSRA(gvec_srsra_h, int16_t)
1647DO_RSRA(gvec_srsra_s, int32_t)
1648DO_RSRA(gvec_srsra_d, int64_t)
1649
1650DO_RSRA(gvec_ursra_b, uint8_t)
1651DO_RSRA(gvec_ursra_h, uint16_t)
1652DO_RSRA(gvec_ursra_s, uint32_t)
1653DO_RSRA(gvec_ursra_d, uint64_t)
1654
1655#undef DO_RSRA
1656
1657#define DO_SRI(NAME, TYPE)                              \
1658void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1659{                                                       \
1660    intptr_t i, oprsz = simd_oprsz(desc);               \
1661    int shift = simd_data(desc);                        \
1662    TYPE *d = vd, *n = vn;                              \
1663    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1664        d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \
1665    }                                                   \
1666    clear_tail(d, oprsz, simd_maxsz(desc));             \
1667}
1668
1669DO_SRI(gvec_sri_b, uint8_t)
1670DO_SRI(gvec_sri_h, uint16_t)
1671DO_SRI(gvec_sri_s, uint32_t)
1672DO_SRI(gvec_sri_d, uint64_t)
1673
1674#undef DO_SRI
1675
1676#define DO_SLI(NAME, TYPE)                              \
1677void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1678{                                                       \
1679    intptr_t i, oprsz = simd_oprsz(desc);               \
1680    int shift = simd_data(desc);                        \
1681    TYPE *d = vd, *n = vn;                              \
1682    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1683        d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \
1684    }                                                   \
1685    clear_tail(d, oprsz, simd_maxsz(desc));             \
1686}
1687
1688DO_SLI(gvec_sli_b, uint8_t)
1689DO_SLI(gvec_sli_h, uint16_t)
1690DO_SLI(gvec_sli_s, uint32_t)
1691DO_SLI(gvec_sli_d, uint64_t)
1692
1693#undef DO_SLI
1694
1695/*
1696 * Convert float16 to float32, raising no exceptions and
1697 * preserving exceptional values, including SNaN.
1698 * This is effectively an unpack+repack operation.
1699 */
1700static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
1701{
1702    const int f16_bias = 15;
1703    const int f32_bias = 127;
1704    uint32_t sign = extract32(f16, 15, 1);
1705    uint32_t exp = extract32(f16, 10, 5);
1706    uint32_t frac = extract32(f16, 0, 10);
1707
1708    if (exp == 0x1f) {
1709        /* Inf or NaN */
1710        exp = 0xff;
1711    } else if (exp == 0) {
1712        /* Zero or denormal.  */
1713        if (frac != 0) {
1714            if (fz16) {
1715                frac = 0;
1716            } else {
1717                /*
1718                 * Denormal; these are all normal float32.
1719                 * Shift the fraction so that the msb is at bit 11,
1720                 * then remove bit 11 as the implicit bit of the
1721                 * normalized float32.  Note that we still go through
1722                 * the shift for normal numbers below, to put the
1723                 * float32 fraction at the right place.
1724                 */
1725                int shift = clz32(frac) - 21;
1726                frac = (frac << shift) & 0x3ff;
1727                exp = f32_bias - f16_bias - shift + 1;
1728            }
1729        }
1730    } else {
1731        /* Normal number; adjust the bias.  */
1732        exp += f32_bias - f16_bias;
1733    }
1734    sign <<= 31;
1735    exp <<= 23;
1736    frac <<= 23 - 10;
1737
1738    return sign | exp | frac;
1739}
1740
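/*
 * Worked examples (illustrative): f16 0x7c01 is a signaling NaN; the
 * conversion above yields f32 0x7f802000, which is still an SNaN.
 * f16 0x0001 is the smallest denormal, 2^-24: clz32(1) = 31, so
 * shift = 10, frac becomes 0 and exp = 127 - 15 - 10 + 1 = 103,
 * giving f32 0x33800000 == 2^-24.
 */
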
1741static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
1742{
1743    /*
1744     * Branchless load of u32[0], u64[0], u32[1], or u64[1].
1745     * Load the 2nd qword iff is_q & is_2.
1746     * Shift to the 2nd dword iff !is_q & is_2.
1747     * For !is_q & !is_2, the upper bits of the result are garbage.
1748     */
1749    return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5);
1750}
1751
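/*
 * Spelled out, the four load4_f16() cases are:
 *   is_q=0 is_2=0: ptr[0] (low dword used)
 *   is_q=0 is_2=1: ptr[0] >> 32
 *   is_q=1 is_2=0: ptr[0]
 *   is_q=1 is_2=1: ptr[1]
 */
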
1752/*
1753 * Note that FMLAL requires oprsz == 8 or oprsz == 16,
1754 * as there are not yet SVE versions that might use blocking.
1755 */
1756
1757static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
1758                     uint32_t desc, bool fz16)
1759{
1760    intptr_t i, oprsz = simd_oprsz(desc);
1761    int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
1762    int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1763    int is_q = oprsz == 16;
1764    uint64_t n_4, m_4;
1765
1766    /* Pre-load all of the f16 data, avoiding overlap issues.  */
1767    n_4 = load4_f16(vn, is_q, is_2);
1768    m_4 = load4_f16(vm, is_q, is_2);
1769
1770    /* Negate all inputs for FMLSL at once.  */
1771    if (is_s) {
1772        n_4 ^= 0x8000800080008000ull;
1773    }
1774
1775    for (i = 0; i < oprsz / 4; i++) {
1776        float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
1777        float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
1778        d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
1779    }
1780    clear_tail(d, oprsz, simd_maxsz(desc));
1781}
1782
1783void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
1784                            void *venv, uint32_t desc)
1785{
1786    CPUARMState *env = venv;
1787    do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc,
1788             get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1789}
1790
1791void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
1792                            void *venv, uint32_t desc)
1793{
1794    CPUARMState *env = venv;
1795    do_fmlal(vd, vn, vm, &env->vfp.fp_status, desc,
1796             get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1797}
1798
1799void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va,
1800                               void *venv, uint32_t desc)
1801{
1802    intptr_t i, oprsz = simd_oprsz(desc);
1803    uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
1804    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
1805    CPUARMState *env = venv;
1806    float_status *status = &env->vfp.fp_status;
1807    bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);
1808
1809    for (i = 0; i < oprsz; i += sizeof(float32)) {
1810        float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negn;
1811        float16 mm_16 = *(float16 *)(vm + H1_2(i + sel));
1812        float32 nn = float16_to_float32_by_bits(nn_16, fz16);
1813        float32 mm = float16_to_float32_by_bits(mm_16, fz16);
1814        float32 aa = *(float32 *)(va + H1_4(i));
1815
1816        *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status);
1817    }
1818}
1819
1820static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst,
1821                         uint32_t desc, bool fz16)
1822{
1823    intptr_t i, oprsz = simd_oprsz(desc);
1824    int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
1825    int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1826    int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3);
1827    int is_q = oprsz == 16;
1828    uint64_t n_4;
1829    float32 m_1;
1830
1831    /* Pre-load all of the f16 data, avoiding overlap issues.  */
1832    n_4 = load4_f16(vn, is_q, is_2);
1833
1834    /* Negate all inputs for FMLSL at once.  */
1835    if (is_s) {
1836        n_4 ^= 0x8000800080008000ull;
1837    }
1838
1839    m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16);
1840
1841    for (i = 0; i < oprsz / 4; i++) {
1842        float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
1843        d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
1844    }
1845    clear_tail(d, oprsz, simd_maxsz(desc));
1846}
1847
1848void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm,
1849                                void *venv, uint32_t desc)
1850{
1851    CPUARMState *env = venv;
1852    do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc,
1853                 get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1854}
1855
1856void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm,
1857                                void *venv, uint32_t desc)
1858{
1859    CPUARMState *env = venv;
1860    do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status, desc,
1861                 get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1862}
1863
1864void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va,
1865                               void *venv, uint32_t desc)
1866{
1867    intptr_t i, j, oprsz = simd_oprsz(desc);
1868    uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
1869    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
1870    intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16);
1871    CPUARMState *env = venv;
1872    float_status *status = &env->vfp.fp_status;
1873    bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);
1874
1875    for (i = 0; i < oprsz; i += 16) {
1876        float16 mm_16 = *(float16 *)(vm + i + idx);
1877        float32 mm = float16_to_float32_by_bits(mm_16, fz16);
1878
1879        for (j = 0; j < 16; j += sizeof(float32)) {
1880            float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negn;
1881            float32 nn = float16_to_float32_by_bits(nn_16, fz16);
1882            float32 aa = *(float32 *)(va + H1_4(i + j));
1883
1884            *(float32 *)(vd + H1_4(i + j)) =
1885                float32_muladd(nn, mm, aa, 0, status);
1886        }
1887    }
1888}
1889
1890void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc)
1891{
1892    intptr_t i, opr_sz = simd_oprsz(desc);
1893    int8_t *d = vd, *n = vn, *m = vm;
1894
1895    for (i = 0; i < opr_sz; ++i) {
1896        int8_t mm = m[i];
1897        int8_t nn = n[i];
1898        int8_t res = 0;
1899        if (mm >= 0) {
1900            if (mm < 8) {
1901                res = nn << mm;
1902            }
1903        } else {
1904            res = nn >> (mm > -8 ? -mm : 7);
1905        }
1906        d[i] = res;
1907    }
1908    clear_tail(d, opr_sz, simd_maxsz(desc));
1909}
1910
1911void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc)
1912{
1913    intptr_t i, opr_sz = simd_oprsz(desc);
1914    int16_t *d = vd, *n = vn, *m = vm;
1915
1916    for (i = 0; i < opr_sz / 2; ++i) {
1917        int8_t mm = m[i];   /* only 8 bits of shift are significant */
1918        int16_t nn = n[i];
1919        int16_t res = 0;
1920        if (mm >= 0) {
1921            if (mm < 16) {
1922                res = nn << mm;
1923            }
1924        } else {
1925            res = nn >> (mm > -16 ? -mm : 15);
1926        }
1927        d[i] = res;
1928    }
1929    clear_tail(d, opr_sz, simd_maxsz(desc));
1930}
1931
1932void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
1933{
1934    intptr_t i, opr_sz = simd_oprsz(desc);
1935    uint8_t *d = vd, *n = vn, *m = vm;
1936
1937    for (i = 0; i < opr_sz; ++i) {
1938        int8_t mm = m[i];
1939        uint8_t nn = n[i];
1940        uint8_t res = 0;
1941        if (mm >= 0) {
1942            if (mm < 8) {
1943                res = nn << mm;
1944            }
1945        } else {
1946            if (mm > -8) {
1947                res = nn >> -mm;
1948            }
1949        }
1950        d[i] = res;
1951    }
1952    clear_tail(d, opr_sz, simd_maxsz(desc));
1953}
1954
1955void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
1956{
1957    intptr_t i, opr_sz = simd_oprsz(desc);
1958    uint16_t *d = vd, *n = vn, *m = vm;
1959
1960    for (i = 0; i < opr_sz / 2; ++i) {
1961        int8_t mm = m[i];   /* only 8 bits of shift are significant */
1962        uint16_t nn = n[i];
1963        uint16_t res = 0;
1964        if (mm >= 0) {
1965            if (mm < 16) {
1966                res = nn << mm;
1967            }
1968        } else {
1969            if (mm > -16) {
1970                res = nn >> -mm;
1971            }
1972        }
1973        d[i] = res;
1974    }
1975    clear_tail(d, opr_sz, simd_maxsz(desc));
1976}
1977
1978/*
1979 * 8x8->8 polynomial multiply.
1980 *
1981 * Polynomial multiplication is like integer multiplication except the
1982 * partial products are XORed, not added.
1983 *
1984 * TODO: expose this as a generic vector operation, as it is a common
1985 * crypto building block.
1986 */
1987void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
1988{
1989    intptr_t i, j, opr_sz = simd_oprsz(desc);
1990    uint64_t *d = vd, *n = vn, *m = vm;
1991
1992    for (i = 0; i < opr_sz / 8; ++i) {
1993        uint64_t nn = n[i];
1994        uint64_t mm = m[i];
1995        uint64_t rr = 0;
1996
1997        for (j = 0; j < 8; ++j) {
1998            uint64_t mask = (nn & 0x0101010101010101ull) * 0xff;
1999            rr ^= mm & mask;
2000            mm = (mm << 1) & 0xfefefefefefefefeull;
2001            nn >>= 1;
2002        }
2003        d[i] = rr;
2004    }
2005    clear_tail(d, opr_sz, simd_maxsz(desc));
2006}
2007
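/*
 * Worked example of the XOR rule: 0x03 * 0x07 as polynomials is
 * (x + 1)(x^2 + x + 1); the partial products 0b0111 and 0b1110 XOR
 * to 0b1001 = 0x09, whereas the integer product would be 21.
 */
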
2008/*
2009 * 64x64->128 polynomial multiply.
2010 * Because the lanes are not accessed in strict columns,
2011 * this probably cannot be turned into a generic helper.
2012 */
2013void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
2014{
2015    intptr_t i, j, opr_sz = simd_oprsz(desc);
2016    intptr_t hi = simd_data(desc);
2017    uint64_t *d = vd, *n = vn, *m = vm;
2018
2019    for (i = 0; i < opr_sz / 8; i += 2) {
2020        uint64_t nn = n[i + hi];
2021        uint64_t mm = m[i + hi];
2022        uint64_t rhi = 0;
2023        uint64_t rlo = 0;
2024
2025        /* Bit 0 can only influence the low 64-bit result.  */
2026        if (nn & 1) {
2027            rlo = mm;
2028        }
2029
2030        for (j = 1; j < 64; ++j) {
2031            uint64_t mask = -((nn >> j) & 1);
2032            rlo ^= (mm << j) & mask;
2033            rhi ^= (mm >> (64 - j)) & mask;
2034        }
2035        d[i] = rlo;
2036        d[i + 1] = rhi;
2037    }
2038    clear_tail(d, opr_sz, simd_maxsz(desc));
2039}
2040
2041/*
2042 * 8x8->16 polynomial multiply.
2043 *
2044 * The byte inputs are expanded to (or extracted from) half-words.
2045 * Note that Neon and SVE2 get the inputs from different positions.
2046 * This allows 4 bytes to be processed in parallel with uint64_t.
2047 */
2048
2049static uint64_t expand_byte_to_half(uint64_t x)
2050{
2051    return  (x & 0x000000ff)
2052         | ((x & 0x0000ff00) << 8)
2053         | ((x & 0x00ff0000) << 16)
2054         | ((x & 0xff000000) << 24);
2055}
2056
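/*
 * e.g. expand_byte_to_half(0xaabbccdd) == 0x00aa00bb00cc00dd, placing
 * each byte in the low half of a 16-bit lane as pmull_h() expects.
 */
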
2057uint64_t pmull_w(uint64_t op1, uint64_t op2)
2058{
2059    uint64_t result = 0;
2060    int i;
2061    for (i = 0; i < 16; ++i) {
2062        uint64_t mask = (op1 & 0x0000000100000001ull) * 0xffffffff;
2063        result ^= op2 & mask;
2064        op1 >>= 1;
2065        op2 <<= 1;
2066    }
2067    return result;
2068}
2069
2070uint64_t pmull_h(uint64_t op1, uint64_t op2)
2071{
2072    uint64_t result = 0;
2073    int i;
2074    for (i = 0; i < 8; ++i) {
2075        uint64_t mask = (op1 & 0x0001000100010001ull) * 0xffff;
2076        result ^= op2 & mask;
2077        op1 >>= 1;
2078        op2 <<= 1;
2079    }
2080    return result;
2081}
2082
2083void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2084{
2085    int hi = simd_data(desc);
2086    uint64_t *d = vd, *n = vn, *m = vm;
2087    uint64_t nn = n[hi], mm = m[hi];
2088
2089    d[0] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));
2090    nn >>= 32;
2091    mm >>= 32;
2092    d[1] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));
2093
2094    clear_tail(d, 16, simd_maxsz(desc));
2095}
2096
2097#ifdef TARGET_AARCH64
2098void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2099{
2100    int shift = simd_data(desc) * 8;
2101    intptr_t i, opr_sz = simd_oprsz(desc);
2102    uint64_t *d = vd, *n = vn, *m = vm;
2103
2104    for (i = 0; i < opr_sz / 8; ++i) {
2105        uint64_t nn = (n[i] >> shift) & 0x00ff00ff00ff00ffull;
2106        uint64_t mm = (m[i] >> shift) & 0x00ff00ff00ff00ffull;
2107
2108        d[i] = pmull_h(nn, mm);
2109    }
2110}
2111
2112static uint64_t pmull_d(uint64_t op1, uint64_t op2)
2113{
2114    uint64_t result = 0;
2115    int i;
2116
2117    for (i = 0; i < 32; ++i) {
2118        uint64_t mask = -((op1 >> i) & 1);
2119        result ^= (op2 << i) & mask;
2120    }
2121    return result;
2122}
2123
2124void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc)
2125{
2126    intptr_t sel = H4(simd_data(desc));
2127    intptr_t i, opr_sz = simd_oprsz(desc);
2128    uint32_t *n = vn, *m = vm;
2129    uint64_t *d = vd;
2130
2131    for (i = 0; i < opr_sz / 8; ++i) {
2132        d[i] = pmull_d(n[2 * i + sel], m[2 * i + sel]);
2133    }
2134}
2135#endif
2136
2137#define DO_CMP0(NAME, TYPE, OP)                         \
2138void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
2139{                                                       \
2140    intptr_t i, opr_sz = simd_oprsz(desc);              \
2141    for (i = 0; i < opr_sz; i += sizeof(TYPE)) {        \
2142        TYPE nn = *(TYPE *)(vn + i);                    \
2143        *(TYPE *)(vd + i) = -(nn OP 0);                 \
2144    }                                                   \
2145    clear_tail(vd, opr_sz, simd_maxsz(desc));           \
2146}
2147
2148DO_CMP0(gvec_ceq0_b, int8_t, ==)
2149DO_CMP0(gvec_clt0_b, int8_t, <)
2150DO_CMP0(gvec_cle0_b, int8_t, <=)
2151DO_CMP0(gvec_cgt0_b, int8_t, >)
2152DO_CMP0(gvec_cge0_b, int8_t, >=)
2153
2154DO_CMP0(gvec_ceq0_h, int16_t, ==)
2155DO_CMP0(gvec_clt0_h, int16_t, <)
2156DO_CMP0(gvec_cle0_h, int16_t, <=)
2157DO_CMP0(gvec_cgt0_h, int16_t, >)
2158DO_CMP0(gvec_cge0_h, int16_t, >=)
2159
2160#undef DO_CMP0
2161
2162#define DO_ABD(NAME, TYPE)                                      \
2163void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2164{                                                               \
2165    intptr_t i, opr_sz = simd_oprsz(desc);                      \
2166    TYPE *d = vd, *n = vn, *m = vm;                             \
2167                                                                \
2168    for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2169        d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];         \
2170    }                                                           \
2171    clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2172}
2173
2174DO_ABD(gvec_sabd_b, int8_t)
2175DO_ABD(gvec_sabd_h, int16_t)
2176DO_ABD(gvec_sabd_s, int32_t)
2177DO_ABD(gvec_sabd_d, int64_t)
2178
2179DO_ABD(gvec_uabd_b, uint8_t)
2180DO_ABD(gvec_uabd_h, uint16_t)
2181DO_ABD(gvec_uabd_s, uint32_t)
2182DO_ABD(gvec_uabd_d, uint64_t)
2183
2184#undef DO_ABD
2185
2186#define DO_ABA(NAME, TYPE)                                      \
2187void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2188{                                                               \
2189    intptr_t i, opr_sz = simd_oprsz(desc);                      \
2190    TYPE *d = vd, *n = vn, *m = vm;                             \
2191                                                                \
2192    for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2193        d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];        \
2194    }                                                           \
2195    clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2196}
2197
2198DO_ABA(gvec_saba_b, int8_t)
2199DO_ABA(gvec_saba_h, int16_t)
2200DO_ABA(gvec_saba_s, int32_t)
2201DO_ABA(gvec_saba_d, int64_t)
2202
2203DO_ABA(gvec_uaba_b, uint8_t)
2204DO_ABA(gvec_uaba_h, uint16_t)
2205DO_ABA(gvec_uaba_s, uint32_t)
2206DO_ABA(gvec_uaba_d, uint64_t)
2207
2208#undef DO_ABA
2209
2210#define DO_NEON_PAIRWISE(NAME, OP)                                      \
2211    void HELPER(NAME##s)(void *vd, void *vn, void *vm,                  \
2212                         void *stat, uint32_t oprsz)                    \
2213    {                                                                   \
2214        float_status *fpst = stat;                                      \
2215        float32 *d = vd;                                                \
2216        float32 *n = vn;                                                \
2217        float32 *m = vm;                                                \
2218        float32 r0, r1;                                                 \
2219                                                                        \
2220        /* Read all inputs before writing outputs in case vm == vd */   \
2221        r0 = float32_##OP(n[H4(0)], n[H4(1)], fpst);                    \
2222        r1 = float32_##OP(m[H4(0)], m[H4(1)], fpst);                    \
2223                                                                        \
2224        d[H4(0)] = r0;                                                  \
2225        d[H4(1)] = r1;                                                  \
2226    }                                                                   \
2227                                                                        \
2228    void HELPER(NAME##h)(void *vd, void *vn, void *vm,                  \
2229                         void *stat, uint32_t oprsz)                    \
2230    {                                                                   \
2231        float_status *fpst = stat;                                      \
2232        float16 *d = vd;                                                \
2233        float16 *n = vn;                                                \
2234        float16 *m = vm;                                                \
2235        float16 r0, r1, r2, r3;                                         \
2236                                                                        \
2237        /* Read all inputs before writing outputs in case vm == vd */   \
2238        r0 = float16_##OP(n[H2(0)], n[H2(1)], fpst);                    \
2239        r1 = float16_##OP(n[H2(2)], n[H2(3)], fpst);                    \
2240        r2 = float16_##OP(m[H2(0)], m[H2(1)], fpst);                    \
2241        r3 = float16_##OP(m[H2(2)], m[H2(3)], fpst);                    \
2242                                                                        \
2243        d[H2(0)] = r0;                                                  \
2244        d[H2(1)] = r1;                                                  \
2245        d[H2(2)] = r2;                                                  \
2246        d[H2(3)] = r3;                                                  \
2247    }
2248
2249DO_NEON_PAIRWISE(neon_padd, add)
2250DO_NEON_PAIRWISE(neon_pmax, max)
2251DO_NEON_PAIRWISE(neon_pmin, min)
2252
2253#undef DO_NEON_PAIRWISE
2254
2255#define DO_VCVT_FIXED(NAME, FUNC, TYPE)                                 \
2256    void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
2257    {                                                                   \
2258        intptr_t i, oprsz = simd_oprsz(desc);                           \
2259        int shift = simd_data(desc);                                    \
2260        TYPE *d = vd, *n = vn;                                          \
2261        float_status *fpst = stat;                                      \
2262        for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2263            d[i] = FUNC(n[i], shift, fpst);                             \
2264        }                                                               \
2265        clear_tail(d, oprsz, simd_maxsz(desc));                         \
2266    }
2267
2268DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t)
2269DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t)
2270DO_VCVT_FIXED(gvec_vcvt_fs, helper_vfp_tosls_round_to_zero, uint32_t)
2271DO_VCVT_FIXED(gvec_vcvt_fu, helper_vfp_touls_round_to_zero, uint32_t)
2272DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t)
2273DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t)
2274DO_VCVT_FIXED(gvec_vcvt_hs, helper_vfp_toshh_round_to_zero, uint16_t)
2275DO_VCVT_FIXED(gvec_vcvt_hu, helper_vfp_touhh_round_to_zero, uint16_t)
2276
2277#undef DO_VCVT_FIXED
2278
2279#define DO_VCVT_RMODE(NAME, FUNC, TYPE)                                 \
2280    void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
2281    {                                                                   \
2282        float_status *fpst = stat;                                      \
2283        intptr_t i, oprsz = simd_oprsz(desc);                           \
2284        uint32_t rmode = simd_data(desc);                               \
2285        uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
2286        TYPE *d = vd, *n = vn;                                          \
2287        set_float_rounding_mode(rmode, fpst);                           \
2288        for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2289            d[i] = FUNC(n[i], 0, fpst);                                 \
2290        }                                                               \
2291        set_float_rounding_mode(prev_rmode, fpst);                      \
2292        clear_tail(d, oprsz, simd_maxsz(desc));                         \
2293    }
2294
2295DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t)
2296DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t)
2297DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t)
2298DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t)
2299
2300#undef DO_VCVT_RMODE
2301
2302#define DO_VRINT_RMODE(NAME, FUNC, TYPE)                                \
2303    void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
2304    {                                                                   \
2305        float_status *fpst = stat;                                      \
2306        intptr_t i, oprsz = simd_oprsz(desc);                           \
2307        uint32_t rmode = simd_data(desc);                               \
2308        uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
2309        TYPE *d = vd, *n = vn;                                          \
2310        set_float_rounding_mode(rmode, fpst);                           \
2311        for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2312            d[i] = FUNC(n[i], fpst);                                    \
2313        }                                                               \
2314        set_float_rounding_mode(prev_rmode, fpst);                      \
2315        clear_tail(d, oprsz, simd_maxsz(desc));                         \
2316    }
2317
2318DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t)
2319DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t)
2320
2321#undef DO_VRINT_RMODE
2322
2323#ifdef TARGET_AARCH64
2324void HELPER(simd_tblx)(void *vd, void *vm, void *venv, uint32_t desc)
2325{
2326    const uint8_t *indices = vm;
2327    CPUARMState *env = venv;
2328    size_t oprsz = simd_oprsz(desc);
2329    uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5);
2330    bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1);
2331    uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6);
2332    union {
2333        uint8_t b[16];
2334        uint64_t d[2];
2335    } result;
2336
2337    /*
2338     * We must construct the final result in a temp, lest the output
2339     * overlaps the input table.  For TBL, begin with zero; for TBX,
2340     * begin with the original register contents.  Note that we always
2341     * copy 16 bytes here to avoid an extra branch; clearing the high
2342     * bits of the register for oprsz == 8 is handled below.
2343     */
2344    if (is_tbx) {
2345        memcpy(&result, vd, 16);
2346    } else {
2347        memset(&result, 0, 16);
2348    }
2349
2350    for (size_t i = 0; i < oprsz; ++i) {
2351        uint32_t index = indices[H1(i)];
2352
2353        if (index < table_len) {
2354            /*
2355             * Convert index (a byte offset into the virtual table
2356             * which is a series of 128-bit vectors concatenated)
2357             * into the correct register element, bearing in mind
2358             * that the table can wrap around from V31 to V0.
2359             */
2360            const uint8_t *table = (const uint8_t *)
2361                aa64_vfp_qreg(env, (rn + (index >> 4)) % 32);
2362            result.b[H1(i)] = table[H1(index % 16)];
2363        }
2364    }
2365
2366    memcpy(vd, &result, 16);
2367    clear_tail(vd, oprsz, simd_maxsz(desc));
2368}
2369#endif
2370
2371/*
2372 * NxN -> N highpart multiply
2373 *
2374 * TODO: expose this as a generic vector operation.
2375 */
2376
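/*
 * e.g. for the byte case below: (int32_t)-128 * 127 = -16256 (0xffffc080),
 * and -16256 >> 8 = -64, the high byte of the 16-bit product.
 */
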
2377void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2378{
2379    intptr_t i, opr_sz = simd_oprsz(desc);
2380    int8_t *d = vd, *n = vn, *m = vm;
2381
2382    for (i = 0; i < opr_sz; ++i) {
2383        d[i] = ((int32_t)n[i] * m[i]) >> 8;
2384    }
2385    clear_tail(d, opr_sz, simd_maxsz(desc));
2386}
2387
2388void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2389{
2390    intptr_t i, opr_sz = simd_oprsz(desc);
2391    int16_t *d = vd, *n = vn, *m = vm;
2392
2393    for (i = 0; i < opr_sz / 2; ++i) {
2394        d[i] = ((int32_t)n[i] * m[i]) >> 16;
2395    }
2396    clear_tail(d, opr_sz, simd_maxsz(desc));
2397}
2398
2399void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2400{
2401    intptr_t i, opr_sz = simd_oprsz(desc);
2402    int32_t *d = vd, *n = vn, *m = vm;
2403
2404    for (i = 0; i < opr_sz / 4; ++i) {
2405        d[i] = ((int64_t)n[i] * m[i]) >> 32;
2406    }
2407    clear_tail(d, opr_sz, simd_maxsz(desc));
2408}
2409
2410void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2411{
2412    intptr_t i, opr_sz = simd_oprsz(desc);
2413    uint64_t *d = vd, *n = vn, *m = vm;
2414    uint64_t discard;
2415
2416    for (i = 0; i < opr_sz / 8; ++i) {
2417        muls64(&discard, &d[i], n[i], m[i]);
2418    }
2419    clear_tail(d, opr_sz, simd_maxsz(desc));
2420}
2421
2422void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2423{
2424    intptr_t i, opr_sz = simd_oprsz(desc);
2425    uint8_t *d = vd, *n = vn, *m = vm;
2426
2427    for (i = 0; i < opr_sz; ++i) {
2428        d[i] = ((uint32_t)n[i] * m[i]) >> 8;
2429    }
2430    clear_tail(d, opr_sz, simd_maxsz(desc));
2431}
2432
2433void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2434{
2435    intptr_t i, opr_sz = simd_oprsz(desc);
2436    uint16_t *d = vd, *n = vn, *m = vm;
2437
2438    for (i = 0; i < opr_sz / 2; ++i) {
2439        d[i] = ((uint32_t)n[i] * m[i]) >> 16;
2440    }
2441    clear_tail(d, opr_sz, simd_maxsz(desc));
2442}
2443
2444void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2445{
2446    intptr_t i, opr_sz = simd_oprsz(desc);
2447    uint32_t *d = vd, *n = vn, *m = vm;
2448
2449    for (i = 0; i < opr_sz / 4; ++i) {
2450        d[i] = ((uint64_t)n[i] * m[i]) >> 32;
2451    }
2452    clear_tail(d, opr_sz, simd_maxsz(desc));
2453}
2454
2455void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2456{
2457    intptr_t i, opr_sz = simd_oprsz(desc);
2458    uint64_t *d = vd, *n = vn, *m = vm;
2459    uint64_t discard;
2460
2461    for (i = 0; i < opr_sz / 8; ++i) {
2462        mulu64(&discard, &d[i], n[i], m[i]);
2463    }
2464    clear_tail(d, opr_sz, simd_maxsz(desc));
2465}
2466
2467void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc)
2468{
2469    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2470    int shr = simd_data(desc);
2471    uint64_t *d = vd, *n = vn, *m = vm;
2472
2473    for (i = 0; i < opr_sz; ++i) {
2474        d[i] = ror64(n[i] ^ m[i], shr);
2475    }
2476    clear_tail(d, opr_sz * 8, simd_maxsz(desc));
2477}
2478
2479/*
2480 * Integer matrix-multiply accumulate
2481 */
2482
2483static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm)
2484{
2485    int8_t *n = vn, *m = vm;
2486
2487    for (intptr_t k = 0; k < 8; ++k) {
2488        sum += n[H1(k)] * m[H1(k)];
2489    }
2490    return sum;
2491}
2492
2493static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm)
2494{
2495    uint8_t *n = vn, *m = vm;
2496
2497    for (intptr_t k = 0; k < 8; ++k) {
2498        sum += n[H1(k)] * m[H1(k)];
2499    }
2500    return sum;
2501}
2502
2503static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm)
2504{
2505    uint8_t *n = vn;
2506    int8_t *m = vm;
2507
2508    for (intptr_t k = 0; k < 8; ++k) {
2509        sum += n[H1(k)] * m[H1(k)];
2510    }
2511    return sum;
2512}
2513
2514static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc,
2515                      uint32_t (*inner_loop)(uint32_t, void *, void *))
2516{
2517    intptr_t seg, opr_sz = simd_oprsz(desc);
2518
2519    for (seg = 0; seg < opr_sz; seg += 16) {
2520        uint32_t *d = vd + seg;
2521        uint32_t *a = va + seg;
2522        uint32_t sum0, sum1, sum2, sum3;
2523
2524        /*
2525         * Process the entire segment at once, writing back the
2526         * results only after we've consumed all of the inputs.
2527         *
2528         * Key to indices by column:
2529         *          i   j                  i             j
2530         */
2531        sum0 = a[H4(0 + 0)];
2532        sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0);
2533        sum1 = a[H4(0 + 1)];
2534        sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8);
2535        sum2 = a[H4(2 + 0)];
2536        sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0);
2537        sum3 = a[H4(2 + 1)];
2538        sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8);
2539
2540        d[H4(0)] = sum0;
2541        d[H4(1)] = sum1;
2542        d[H4(2)] = sum2;
2543        d[H4(3)] = sum3;
2544    }
2545    clear_tail(vd, opr_sz, simd_maxsz(desc));
2546}
2547
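/*
 * In matrix terms (for orientation only): each 16-byte segment of vn and
 * vm holds a 2x8 byte matrix stored row-major, and the four sums above
 * are d[i][j] = a[i][j] + dot(n_row_i, m_row_j), i.e. a 2x2 block of
 * 32-bit accumulators computing D = A + N * M^T.
 */
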
2548#define DO_MMLA_B(NAME, INNER) \
2549    void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
2550    { do_mmla_b(vd, vn, vm, va, desc, INNER); }
2551
2552DO_MMLA_B(gvec_smmla_b, do_smmla_b)
2553DO_MMLA_B(gvec_ummla_b, do_ummla_b)
2554DO_MMLA_B(gvec_usmmla_b, do_usmmla_b)
2555
2556/*
2557 * BFloat16 Dot Product
2558 */
2559
2560float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2)
2561{
2562    /* FPCR is ignored for BFDOT and BFMMLA. */
2563    float_status bf_status = {
2564        .tininess_before_rounding = float_tininess_before_rounding,
2565        .float_rounding_mode = float_round_to_odd_inf,
2566        .flush_to_zero = true,
2567        .flush_inputs_to_zero = true,
2568        .default_nan_mode = true,
2569    };
2570    float32 t1, t2;
2571
2572    /*
2573     * Extract each BFloat16 from the element pair, and shift
2574     * them such that they become float32.
2575     */
2576    t1 = float32_mul(e1 << 16, e2 << 16, &bf_status);
2577    t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, &bf_status);
2578    t1 = float32_add(t1, t2, &bf_status);
2579    t1 = float32_add(sum, t1, &bf_status);
2580
2581    return t1;
2582}
2583
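/*
 * A bfloat16 is simply the high 16 bits of a float32, so the "<< 16"
 * above reinterprets it directly; e.g. bf16 0x3f80 (1.0) becomes
 * f32 0x3f800000 (1.0f).
 */
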
2584void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
2585{
2586    intptr_t i, opr_sz = simd_oprsz(desc);
2587    float32 *d = vd, *a = va;
2588    uint32_t *n = vn, *m = vm;
2589
2590    for (i = 0; i < opr_sz / 4; ++i) {
2591        d[i] = bfdotadd(a[i], n[i], m[i]);
2592    }
2593    clear_tail(d, opr_sz, simd_maxsz(desc));
2594}
2595
2596void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm,
2597                            void *va, uint32_t desc)
2598{
2599    intptr_t i, j, opr_sz = simd_oprsz(desc);
2600    intptr_t index = simd_data(desc);
2601    intptr_t elements = opr_sz / 4;
2602    intptr_t eltspersegment = MIN(16 / 4, elements);
2603    float32 *d = vd, *a = va;
2604    uint32_t *n = vn, *m = vm;
2605
2606    for (i = 0; i < elements; i += eltspersegment) {
2607        uint32_t m_idx = m[i + H4(index)];
2608
2609        for (j = i; j < i + eltspersegment; j++) {
2610            d[j] = bfdotadd(a[j], n[j], m_idx);
2611        }
2612    }
2613    clear_tail(d, opr_sz, simd_maxsz(desc));
2614}
2615
2616void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
2617{
2618    intptr_t s, opr_sz = simd_oprsz(desc);
2619    float32 *d = vd, *a = va;
2620    uint32_t *n = vn, *m = vm;
2621
2622    for (s = 0; s < opr_sz / 4; s += 4) {
2623        float32 sum00, sum01, sum10, sum11;
2624
2625        /*
2626         * Process the entire segment at once, writing back the
2627         * results only after we've consumed all of the inputs.
2628         *
2629         * Key to indices by column:
2630         *               i   j           i   k             j   k
2631         */
2632        sum00 = a[s + H4(0 + 0)];
2633        sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)]);
2634        sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)]);
2635
2636        sum01 = a[s + H4(0 + 1)];
2637        sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)]);
2638        sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)]);
2639
2640        sum10 = a[s + H4(2 + 0)];
2641        sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)]);
2642        sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)]);
2643
2644        sum11 = a[s + H4(2 + 1)];
2645        sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)]);
2646        sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)]);
2647
2648        d[s + H4(0 + 0)] = sum00;
2649        d[s + H4(0 + 1)] = sum01;
2650        d[s + H4(2 + 0)] = sum10;
2651        d[s + H4(2 + 1)] = sum11;
2652    }
2653    clear_tail(d, opr_sz, simd_maxsz(desc));
2654}
2655
2656void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va,
2657                         void *stat, uint32_t desc)
2658{
2659    intptr_t i, opr_sz = simd_oprsz(desc);
2660    intptr_t sel = simd_data(desc);
2661    float32 *d = vd, *a = va;
2662    bfloat16 *n = vn, *m = vm;
2663
2664    for (i = 0; i < opr_sz / 4; ++i) {
2665        float32 nn = n[H2(i * 2 + sel)] << 16;
2666        float32 mm = m[H2(i * 2 + sel)] << 16;
2667        d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat);
2668    }
2669    clear_tail(d, opr_sz, simd_maxsz(desc));
2670}
2671
2672void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm,
2673                             void *va, void *stat, uint32_t desc)
2674{
2675    intptr_t i, j, opr_sz = simd_oprsz(desc);
2676    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1);
2677    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
2678    intptr_t elements = opr_sz / 4;
2679    intptr_t eltspersegment = MIN(16 / 4, elements);
2680    float32 *d = vd, *a = va;
2681    bfloat16 *n = vn, *m = vm;
2682
2683    for (i = 0; i < elements; i += eltspersegment) {
2684        float32 m_idx = m[H2(2 * i + index)] << 16;
2685
2686        for (j = i; j < i + eltspersegment; j++) {
2687            float32 n_j = n[H2(2 * j + sel)] << 16;
2688            d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat);
2689        }
2690    }
2691    clear_tail(d, opr_sz, simd_maxsz(desc));
2692}
2693
2694#define DO_CLAMP(NAME, TYPE) \
2695void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc)    \
2696{                                                                       \
2697    intptr_t i, opr_sz = simd_oprsz(desc);                              \
2698    for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                        \
2699        TYPE aa = *(TYPE *)(a + i);                                     \
2700        TYPE nn = *(TYPE *)(n + i);                                     \
2701        TYPE mm = *(TYPE *)(m + i);                                     \
2702        TYPE dd = MIN(MAX(aa, nn), mm);                                 \
2703        *(TYPE *)(d + i) = dd;                                          \
2704    }                                                                   \
2705    clear_tail(d, opr_sz, simd_maxsz(desc));                            \
2706}
2707
2708DO_CLAMP(gvec_sclamp_b, int8_t)
2709DO_CLAMP(gvec_sclamp_h, int16_t)
2710DO_CLAMP(gvec_sclamp_s, int32_t)
2711DO_CLAMP(gvec_sclamp_d, int64_t)
2712
2713DO_CLAMP(gvec_uclamp_b, uint8_t)
2714DO_CLAMP(gvec_uclamp_h, uint16_t)
2715DO_CLAMP(gvec_uclamp_s, uint32_t)
2716DO_CLAMP(gvec_uclamp_d, uint64_t)
2717