qemu/target/arm/vec_helper.c
   1/*
   2 * ARM AdvSIMD / SVE Vector Operations
   3 *
   4 * Copyright (c) 2018 Linaro
   5 *
   6 * This library is free software; you can redistribute it and/or
   7 * modify it under the terms of the GNU Lesser General Public
   8 * License as published by the Free Software Foundation; either
   9 * version 2.1 of the License, or (at your option) any later version.
  10 *
  11 * This library is distributed in the hope that it will be useful,
  12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14 * Lesser General Public License for more details.
  15 *
  16 * You should have received a copy of the GNU Lesser General Public
  17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
  18 */
  19
  20#include "qemu/osdep.h"
  21#include "cpu.h"
  22#include "exec/helper-proto.h"
  23#include "tcg/tcg-gvec-desc.h"
  24#include "fpu/softfloat.h"
  25#include "qemu/int128.h"
  26#include "vec_internal.h"
  27
  28/*
  29 * Data for expanding active predicate bits to bytes, for byte elements.
  30 *
  31 *  for (i = 0; i < 256; ++i) {
  32 *      unsigned long m = 0;
  33 *      for (j = 0; j < 8; j++) {
  34 *          if ((i >> j) & 1) {
  35 *              m |= 0xfful << (j << 3);
  36 *          }
  37 *      }
  38 *      printf("0x%016lx,\n", m);
  39 *  }
  40 */
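/*
 * For example, index 0x05 (bits 0 and 2 set) expands to
 * 0x0000000000ff00ff: each set predicate bit becomes an all-ones byte
 * in the corresponding byte lane.
 */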
  41const uint64_t expand_pred_b_data[256] = {
  42    0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
  43    0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
  44    0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
  45    0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
  46    0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
  47    0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
  48    0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
  49    0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
  50    0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
  51    0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
  52    0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
  53    0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
  54    0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
  55    0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
  56    0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
  57    0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
  58    0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
  59    0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
  60    0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
  61    0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
  62    0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
  63    0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
  64    0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
  65    0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
  66    0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
  67    0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
  68    0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
  69    0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
  70    0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
  71    0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
  72    0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
  73    0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
  74    0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
  75    0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
  76    0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
  77    0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
  78    0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
  79    0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
  80    0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
  81    0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
  82    0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
  83    0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
  84    0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
  85    0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
  86    0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
  87    0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
  88    0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
  89    0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
  90    0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
  91    0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
  92    0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
  93    0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
  94    0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
  95    0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
  96    0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
  97    0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
  98    0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
  99    0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
 100    0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
 101    0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
 102    0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
 103    0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
 104    0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
 105    0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
 106    0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
 107    0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
 108    0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
 109    0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
 110    0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
 111    0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
 112    0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
 113    0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
 114    0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
 115    0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
 116    0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
 117    0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
 118    0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
 119    0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
 120    0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
 121    0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
 122    0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
 123    0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
 124    0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
 125    0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
 126    0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
 127    0xffffffffffffffff,
 128};
 129
 130/* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
 131int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
 132                     bool neg, bool round)
 133{
 134    /*
 135     * Simplify:
 136     * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8
 137     * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7
 138     */
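    /*
     * The first form above is the architectural expression: double the
     * product, add the shifted accumulator and the rounding constant,
     * and keep the high half.  Illustrative example: src1 = src2 = 0x40,
     * src3 = 0, round = 1 gives (64 * 64) + 64 = 4160 and 4160 >> 7 = 32;
     * src1 = src2 = INT8_MIN yields +128 after the shift, which does not
     * fit in int8_t and so saturates to INT8_MAX.
     */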
 139    int32_t ret = (int32_t)src1 * src2;
 140    if (neg) {
 141        ret = -ret;
 142    }
 143    ret += ((int32_t)src3 << 7) + (round << 6);
 144    ret >>= 7;
 145
 146    if (ret != (int8_t)ret) {
 147        ret = (ret < 0 ? INT8_MIN : INT8_MAX);
 148    }
 149    return ret;
 150}
 151
 152void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
 153                             void *va, uint32_t desc)
 154{
 155    intptr_t i, opr_sz = simd_oprsz(desc);
 156    int8_t *d = vd, *n = vn, *m = vm, *a = va;
 157
 158    for (i = 0; i < opr_sz; ++i) {
 159        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
 160    }
 161}
 162
 163void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
 164                             void *va, uint32_t desc)
 165{
 166    intptr_t i, opr_sz = simd_oprsz(desc);
 167    int8_t *d = vd, *n = vn, *m = vm, *a = va;
 168
 169    for (i = 0; i < opr_sz; ++i) {
 170        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
 171    }
 172}
 173
 174void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
 175{
 176    intptr_t i, opr_sz = simd_oprsz(desc);
 177    int8_t *d = vd, *n = vn, *m = vm;
 178
 179    for (i = 0; i < opr_sz; ++i) {
 180        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false);
 181    }
 182}
 183
 184void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
 185{
 186    intptr_t i, opr_sz = simd_oprsz(desc);
 187    int8_t *d = vd, *n = vn, *m = vm;
 188
 189    for (i = 0; i < opr_sz; ++i) {
 190        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true);
 191    }
 192}
 193
 194/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
 195int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
 196                      bool neg, bool round, uint32_t *sat)
 197{
 198    /* Simplify similarly to do_sqrdmlah_b above.  */
 199    int32_t ret = (int32_t)src1 * src2;
 200    if (neg) {
 201        ret = -ret;
 202    }
 203    ret += ((int32_t)src3 << 15) + (round << 14);
 204    ret >>= 15;
 205
 206    if (ret != (int16_t)ret) {
 207        *sat = 1;
 208        ret = (ret < 0 ? INT16_MIN : INT16_MAX);
 209    }
 210    return ret;
 211}
 212
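/*
 * The Neon helpers below point 'sat' at env->vfp.qc[0], so saturation
 * accumulates into the cumulative saturation (QC) flag; the SVE2
 * helpers further down pass a local 'discard' variable instead.
 */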
 213uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
 214                                  uint32_t src2, uint32_t src3)
 215{
 216    uint32_t *sat = &env->vfp.qc[0];
 217    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat);
 218    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
 219                                false, true, sat);
 220    return deposit32(e1, 16, 16, e2);
 221}
 222
 223void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
 224                              void *vq, uint32_t desc)
 225{
 226    uintptr_t opr_sz = simd_oprsz(desc);
 227    int16_t *d = vd;
 228    int16_t *n = vn;
 229    int16_t *m = vm;
 230    uintptr_t i;
 231
 232    for (i = 0; i < opr_sz / 2; ++i) {
 233        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
 234    }
 235    clear_tail(d, opr_sz, simd_maxsz(desc));
 236}
 237
 238uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
 239                                  uint32_t src2, uint32_t src3)
 240{
 241    uint32_t *sat = &env->vfp.qc[0];
 242    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat);
 243    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
 244                                true, true, sat);
 245    return deposit32(e1, 16, 16, e2);
 246}
 247
 248void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
 249                              void *vq, uint32_t desc)
 250{
 251    uintptr_t opr_sz = simd_oprsz(desc);
 252    int16_t *d = vd;
 253    int16_t *n = vn;
 254    int16_t *m = vm;
 255    uintptr_t i;
 256
 257    for (i = 0; i < opr_sz / 2; ++i) {
 258        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq);
 259    }
 260    clear_tail(d, opr_sz, simd_maxsz(desc));
 261}
 262
 263void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm,
 264                            void *vq, uint32_t desc)
 265{
 266    intptr_t i, opr_sz = simd_oprsz(desc);
 267    int16_t *d = vd, *n = vn, *m = vm;
 268
 269    for (i = 0; i < opr_sz / 2; ++i) {
 270        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq);
 271    }
 272    clear_tail(d, opr_sz, simd_maxsz(desc));
 273}
 274
 275void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm,
 276                             void *vq, uint32_t desc)
 277{
 278    intptr_t i, opr_sz = simd_oprsz(desc);
 279    int16_t *d = vd, *n = vn, *m = vm;
 280
 281    for (i = 0; i < opr_sz / 2; ++i) {
 282        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq);
 283    }
 284    clear_tail(d, opr_sz, simd_maxsz(desc));
 285}
 286
 287void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
 288                             void *va, uint32_t desc)
 289{
 290    intptr_t i, opr_sz = simd_oprsz(desc);
 291    int16_t *d = vd, *n = vn, *m = vm, *a = va;
 292    uint32_t discard;
 293
 294    for (i = 0; i < opr_sz / 2; ++i) {
 295        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
 296    }
 297}
 298
 299void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
 300                             void *va, uint32_t desc)
 301{
 302    intptr_t i, opr_sz = simd_oprsz(desc);
 303    int16_t *d = vd, *n = vn, *m = vm, *a = va;
 304    uint32_t discard;
 305
 306    for (i = 0; i < opr_sz / 2; ++i) {
 307        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
 308    }
 309}
 310
 311void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
 312{
 313    intptr_t i, opr_sz = simd_oprsz(desc);
 314    int16_t *d = vd, *n = vn, *m = vm;
 315    uint32_t discard;
 316
 317    for (i = 0; i < opr_sz / 2; ++i) {
 318        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard);
 319    }
 320}
 321
 322void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
 323{
 324    intptr_t i, opr_sz = simd_oprsz(desc);
 325    int16_t *d = vd, *n = vn, *m = vm;
 326    uint32_t discard;
 327
 328    for (i = 0; i < opr_sz / 2; ++i) {
 329        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard);
 330    }
 331}
 332
 333void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
 334{
 335    intptr_t i, j, opr_sz = simd_oprsz(desc);
 336    int idx = simd_data(desc);
 337    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
 338    uint32_t discard;
 339
 340    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
 341        int16_t mm = m[i];
 342        for (j = 0; j < 16 / 2; ++j) {
 343            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard);
 344        }
 345    }
 346}
 347
 348void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
 349{
 350    intptr_t i, j, opr_sz = simd_oprsz(desc);
 351    int idx = simd_data(desc);
 352    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
 353    uint32_t discard;
 354
 355    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
 356        int16_t mm = m[i];
 357        for (j = 0; j < 16 / 2; ++j) {
 358            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard);
 359        }
 360    }
 361}
 362
 363/* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
 364int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
 365                      bool neg, bool round, uint32_t *sat)
 366{
 367    /* Simplify similarly to do_sqrdmlah_b above.  */
 368    int64_t ret = (int64_t)src1 * src2;
 369    if (neg) {
 370        ret = -ret;
 371    }
 372    ret += ((int64_t)src3 << 31) + (round << 30);
 373    ret >>= 31;
 374
 375    if (ret != (int32_t)ret) {
 376        *sat = 1;
 377        ret = (ret < 0 ? INT32_MIN : INT32_MAX);
 378    }
 379    return ret;
 380}
 381
 382uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
 383                                  int32_t src2, int32_t src3)
 384{
 385    uint32_t *sat = &env->vfp.qc[0];
 386    return do_sqrdmlah_s(src1, src2, src3, false, true, sat);
 387}
 388
 389void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
 390                              void *vq, uint32_t desc)
 391{
 392    uintptr_t opr_sz = simd_oprsz(desc);
 393    int32_t *d = vd;
 394    int32_t *n = vn;
 395    int32_t *m = vm;
 396    uintptr_t i;
 397
 398    for (i = 0; i < opr_sz / 4; ++i) {
 399        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq);
 400    }
 401    clear_tail(d, opr_sz, simd_maxsz(desc));
 402}
 403
 404uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
 405                                  int32_t src2, int32_t src3)
 406{
 407    uint32_t *sat = &env->vfp.qc[0];
 408    return do_sqrdmlah_s(src1, src2, src3, true, true, sat);
 409}
 410
 411void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
 412                              void *vq, uint32_t desc)
 413{
 414    uintptr_t opr_sz = simd_oprsz(desc);
 415    int32_t *d = vd;
 416    int32_t *n = vn;
 417    int32_t *m = vm;
 418    uintptr_t i;
 419
 420    for (i = 0; i < opr_sz / 4; ++i) {
 421        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq);
 422    }
 423    clear_tail(d, opr_sz, simd_maxsz(desc));
 424}
 425
 426void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm,
 427                            void *vq, uint32_t desc)
 428{
 429    intptr_t i, opr_sz = simd_oprsz(desc);
 430    int32_t *d = vd, *n = vn, *m = vm;
 431
 432    for (i = 0; i < opr_sz / 4; ++i) {
 433        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq);
 434    }
 435    clear_tail(d, opr_sz, simd_maxsz(desc));
 436}
 437
 438void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm,
 439                             void *vq, uint32_t desc)
 440{
 441    intptr_t i, opr_sz = simd_oprsz(desc);
 442    int32_t *d = vd, *n = vn, *m = vm;
 443
 444    for (i = 0; i < opr_sz / 4; ++i) {
 445        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq);
 446    }
 447    clear_tail(d, opr_sz, simd_maxsz(desc));
 448}
 449
 450void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
 451                             void *va, uint32_t desc)
 452{
 453    intptr_t i, opr_sz = simd_oprsz(desc);
 454    int32_t *d = vd, *n = vn, *m = vm, *a = va;
 455    uint32_t discard;
 456
 457    for (i = 0; i < opr_sz / 4; ++i) {
 458        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
 459    }
 460}
 461
 462void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
 463                             void *va, uint32_t desc)
 464{
 465    intptr_t i, opr_sz = simd_oprsz(desc);
 466    int32_t *d = vd, *n = vn, *m = vm, *a = va;
 467    uint32_t discard;
 468
 469    for (i = 0; i < opr_sz / 4; ++i) {
 470        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
 471    }
 472}
 473
 474void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
 475{
 476    intptr_t i, opr_sz = simd_oprsz(desc);
 477    int32_t *d = vd, *n = vn, *m = vm;
 478    uint32_t discard;
 479
 480    for (i = 0; i < opr_sz / 4; ++i) {
 481        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard);
 482    }
 483}
 484
 485void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
 486{
 487    intptr_t i, opr_sz = simd_oprsz(desc);
 488    int32_t *d = vd, *n = vn, *m = vm;
 489    uint32_t discard;
 490
 491    for (i = 0; i < opr_sz / 4; ++i) {
 492        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard);
 493    }
 494}
 495
 496void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
 497{
 498    intptr_t i, j, opr_sz = simd_oprsz(desc);
 499    int idx = simd_data(desc);
 500    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
 501    uint32_t discard;
 502
 503    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
 504        int32_t mm = m[i];
 505        for (j = 0; j < 16 / 4; ++j) {
 506            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard);
 507        }
 508    }
 509}
 510
 511void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
 512{
 513    intptr_t i, j, opr_sz = simd_oprsz(desc);
 514    int idx = simd_data(desc);
 515    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
 516    uint32_t discard;
 517
 518    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
 519        int32_t mm = m[i];
 520        for (j = 0; j < 16 / 4; ++j) {
 521            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard);
 522        }
 523    }
 524}
 525
 526/* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
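/*
 * Saturate the 128-bit shifted result back to int64_t: it fits exactly
 * when the high half equals the sign extension of the low half;
 * otherwise the sign of the high half selects INT64_MIN or INT64_MAX.
 */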
 527static int64_t do_sat128_d(Int128 r)
 528{
 529    int64_t ls = int128_getlo(r);
 530    int64_t hs = int128_gethi(r);
 531
 532    if (unlikely(hs != (ls >> 63))) {
 533        return hs < 0 ? INT64_MIN : INT64_MAX;
 534    }
 535    return ls;
 536}
 537
 538int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round)
 539{
 540    uint64_t l, h;
 541    Int128 r, t;
 542
 543    /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
 544    muls64(&l, &h, m, n);
 545    r = int128_make128(l, h);
 546    if (neg) {
 547        r = int128_neg(r);
 548    }
 549    if (a) {
 550        t = int128_exts64(a);
 551        t = int128_lshift(t, 63);
 552        r = int128_add(r, t);
 553    }
 554    if (round) {
 555        t = int128_exts64(1ll << 62);
 556        r = int128_add(r, t);
 557    }
 558    r = int128_rshift(r, 63);
 559
 560    return do_sat128_d(r);
 561}
 562
 563void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
 564                             void *va, uint32_t desc)
 565{
 566    intptr_t i, opr_sz = simd_oprsz(desc);
 567    int64_t *d = vd, *n = vn, *m = vm, *a = va;
 568
 569    for (i = 0; i < opr_sz / 8; ++i) {
 570        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
 571    }
 572}
 573
 574void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
 575                             void *va, uint32_t desc)
 576{
 577    intptr_t i, opr_sz = simd_oprsz(desc);
 578    int64_t *d = vd, *n = vn, *m = vm, *a = va;
 579
 580    for (i = 0; i < opr_sz / 8; ++i) {
 581        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
 582    }
 583}
 584
 585void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
 586{
 587    intptr_t i, opr_sz = simd_oprsz(desc);
 588    int64_t *d = vd, *n = vn, *m = vm;
 589
 590    for (i = 0; i < opr_sz / 8; ++i) {
 591        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false);
 592    }
 593}
 594
 595void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
 596{
 597    intptr_t i, opr_sz = simd_oprsz(desc);
 598    int64_t *d = vd, *n = vn, *m = vm;
 599
 600    for (i = 0; i < opr_sz / 8; ++i) {
 601        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true);
 602    }
 603}
 604
 605void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
 606{
 607    intptr_t i, j, opr_sz = simd_oprsz(desc);
 608    int idx = simd_data(desc);
 609    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
 610
 611    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
 612        int64_t mm = m[i];
 613        for (j = 0; j < 16 / 8; ++j) {
 614            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false);
 615        }
 616    }
 617}
 618
 619void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
 620{
 621    intptr_t i, j, opr_sz = simd_oprsz(desc);
 622    int idx = simd_data(desc);
 623    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
 624
 625    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
 626        int64_t mm = m[i];
 627        for (j = 0; j < 16 / 8; ++j) {
 628            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true);
 629        }
 630    }
 631}
 632
 633/* Integer 8 and 16-bit dot-product.
 634 *
 635 * Note that for the loops herein, host endianness does not matter
 636 * with respect to the ordering of data within the quad-width lanes.
 637 * All elements are treated equally, no matter where they are.
 638 */
 639
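/*
 * Each wide destination lane accumulates four adjacent narrow products;
 * e.g. for gvec_sdot_b each int32_t lane computes
 *   d[i] = a[i] + n[4i]*m[4i] + n[4i+1]*m[4i+1]
 *               + n[4i+2]*m[4i+2] + n[4i+3]*m[4i+3]
 * with the int8_t operands widened to int32_t first.
 */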
 640#define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \
 641void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
 642{                                                                         \
 643    intptr_t i, opr_sz = simd_oprsz(desc);                                \
 644    TYPED *d = vd, *a = va;                                               \
 645    TYPEN *n = vn;                                                        \
 646    TYPEM *m = vm;                                                        \
 647    for (i = 0; i < opr_sz / sizeof(TYPED); ++i) {                        \
 648        d[i] = (a[i] +                                                    \
 649                (TYPED)n[i * 4 + 0] * m[i * 4 + 0] +                      \
 650                (TYPED)n[i * 4 + 1] * m[i * 4 + 1] +                      \
 651                (TYPED)n[i * 4 + 2] * m[i * 4 + 2] +                      \
 652                (TYPED)n[i * 4 + 3] * m[i * 4 + 3]);                      \
 653    }                                                                     \
 654    clear_tail(d, opr_sz, simd_maxsz(desc));                              \
 655}
 656
 657DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t)
 658DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t)
 659DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t)
 660DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t)
 661DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t)
 662
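/*
 * Indexed dot-product: the index selects one group of four TYPEM elements
 * within each 128-bit segment of vm, and that group is shared by all the
 * TYPED lanes of the same segment.
 */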
 663#define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \
 664void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
 665{                                                                         \
 666    intptr_t i = 0, opr_sz = simd_oprsz(desc);                            \
 667    intptr_t opr_sz_n = opr_sz / sizeof(TYPED);                           \
 668    intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n);                  \
 669    intptr_t index = simd_data(desc);                                     \
 670    TYPED *d = vd, *a = va;                                               \
 671    TYPEN *n = vn;                                                        \
 672    TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4;                       \
 673    do {                                                                  \
 674        TYPED m0 = m_indexed[i * 4 + 0];                                  \
 675        TYPED m1 = m_indexed[i * 4 + 1];                                  \
 676        TYPED m2 = m_indexed[i * 4 + 2];                                  \
 677        TYPED m3 = m_indexed[i * 4 + 3];                                  \
 678        do {                                                              \
 679            d[i] = (a[i] +                                                \
 680                    n[i * 4 + 0] * m0 +                                   \
 681                    n[i * 4 + 1] * m1 +                                   \
 682                    n[i * 4 + 2] * m2 +                                   \
 683                    n[i * 4 + 3] * m3);                                   \
 684        } while (++i < segend);                                           \
 685        segend = i + 4;                                                   \
 686    } while (i < opr_sz_n);                                               \
 687    clear_tail(d, opr_sz, simd_maxsz(desc));                              \
 688}
 689
 690DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4)
 691DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4)
 692DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4)
 693DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4)
 694DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8)
 695DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8)
 696
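/*
 * FCADD: floating-point complex add with rotate.  Elements are
 * (real, imaginary) pairs; the single descriptor bit selects a rotation
 * of 90 or 270 degrees, i.e. which of the two vm elements has its sign
 * flipped before being added.
 */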
 697void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
 698                         void *vfpst, uint32_t desc)
 699{
 700    uintptr_t opr_sz = simd_oprsz(desc);
 701    float16 *d = vd;
 702    float16 *n = vn;
 703    float16 *m = vm;
 704    float_status *fpst = vfpst;
 705    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
 706    uint32_t neg_imag = neg_real ^ 1;
 707    uintptr_t i;
 708
 709    /* Shift boolean to the sign bit so we can xor to negate.  */
 710    neg_real <<= 15;
 711    neg_imag <<= 15;
 712
 713    for (i = 0; i < opr_sz / 2; i += 2) {
 714        float16 e0 = n[H2(i)];
 715        float16 e1 = m[H2(i + 1)] ^ neg_imag;
 716        float16 e2 = n[H2(i + 1)];
 717        float16 e3 = m[H2(i)] ^ neg_real;
 718
 719        d[H2(i)] = float16_add(e0, e1, fpst);
 720        d[H2(i + 1)] = float16_add(e2, e3, fpst);
 721    }
 722    clear_tail(d, opr_sz, simd_maxsz(desc));
 723}
 724
 725void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
 726                         void *vfpst, uint32_t desc)
 727{
 728    uintptr_t opr_sz = simd_oprsz(desc);
 729    float32 *d = vd;
 730    float32 *n = vn;
 731    float32 *m = vm;
 732    float_status *fpst = vfpst;
 733    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
 734    uint32_t neg_imag = neg_real ^ 1;
 735    uintptr_t i;
 736
 737    /* Shift boolean to the sign bit so we can xor to negate.  */
 738    neg_real <<= 31;
 739    neg_imag <<= 31;
 740
 741    for (i = 0; i < opr_sz / 4; i += 2) {
 742        float32 e0 = n[H4(i)];
 743        float32 e1 = m[H4(i + 1)] ^ neg_imag;
 744        float32 e2 = n[H4(i + 1)];
 745        float32 e3 = m[H4(i)] ^ neg_real;
 746
 747        d[H4(i)] = float32_add(e0, e1, fpst);
 748        d[H4(i + 1)] = float32_add(e2, e3, fpst);
 749    }
 750    clear_tail(d, opr_sz, simd_maxsz(desc));
 751}
 752
 753void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
 754                         void *vfpst, uint32_t desc)
 755{
 756    uintptr_t opr_sz = simd_oprsz(desc);
 757    float64 *d = vd;
 758    float64 *n = vn;
 759    float64 *m = vm;
 760    float_status *fpst = vfpst;
 761    uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1);
 762    uint64_t neg_imag = neg_real ^ 1;
 763    uintptr_t i;
 764
 765    /* Shift boolean to the sign bit so we can xor to negate.  */
 766    neg_real <<= 63;
 767    neg_imag <<= 63;
 768
 769    for (i = 0; i < opr_sz / 8; i += 2) {
 770        float64 e0 = n[i];
 771        float64 e1 = m[i + 1] ^ neg_imag;
 772        float64 e2 = n[i + 1];
 773        float64 e3 = m[i] ^ neg_real;
 774
 775        d[i] = float64_add(e0, e1, fpst);
 776        d[i + 1] = float64_add(e2, e3, fpst);
 777    }
 778    clear_tail(d, opr_sz, simd_maxsz(desc));
 779}
 780
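/*
 * FCMLA: floating-point complex multiply-accumulate.  The two descriptor
 * bits encode the rotation (0, 90, 180 or 270 degrees): 'flip' chooses
 * whether the real or imaginary element of vn feeds both products, and
 * neg_real/neg_imag give the signs applied to the two vm elements.
 */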
 781void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va,
 782                         void *vfpst, uint32_t desc)
 783{
 784    uintptr_t opr_sz = simd_oprsz(desc);
 785    float16 *d = vd, *n = vn, *m = vm, *a = va;
 786    float_status *fpst = vfpst;
 787    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
 788    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
 789    uint32_t neg_real = flip ^ neg_imag;
 790    uintptr_t i;
 791
 792    /* Shift boolean to the sign bit so we can xor to negate.  */
 793    neg_real <<= 15;
 794    neg_imag <<= 15;
 795
 796    for (i = 0; i < opr_sz / 2; i += 2) {
 797        float16 e2 = n[H2(i + flip)];
 798        float16 e1 = m[H2(i + flip)] ^ neg_real;
 799        float16 e4 = e2;
 800        float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag;
 801
 802        d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], 0, fpst);
 803        d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], 0, fpst);
 804    }
 805    clear_tail(d, opr_sz, simd_maxsz(desc));
 806}
 807
 808void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va,
 809                             void *vfpst, uint32_t desc)
 810{
 811    uintptr_t opr_sz = simd_oprsz(desc);
 812    float16 *d = vd, *n = vn, *m = vm, *a = va;
 813    float_status *fpst = vfpst;
 814    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
 815    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
 816    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
 817    uint32_t neg_real = flip ^ neg_imag;
 818    intptr_t elements = opr_sz / sizeof(float16);
 819    intptr_t eltspersegment = 16 / sizeof(float16);
 820    intptr_t i, j;
 821
 822    /* Shift boolean to the sign bit so we can xor to negate.  */
 823    neg_real <<= 15;
 824    neg_imag <<= 15;
 825
 826    for (i = 0; i < elements; i += eltspersegment) {
 827        float16 mr = m[H2(i + 2 * index + 0)];
 828        float16 mi = m[H2(i + 2 * index + 1)];
 829        float16 e1 = neg_real ^ (flip ? mi : mr);
 830        float16 e3 = neg_imag ^ (flip ? mr : mi);
 831
 832        for (j = i; j < i + eltspersegment; j += 2) {
 833            float16 e2 = n[H2(j + flip)];
 834            float16 e4 = e2;
 835
 836            d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst);
 837            d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst);
 838        }
 839    }
 840    clear_tail(d, opr_sz, simd_maxsz(desc));
 841}
 842
 843void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va,
 844                         void *vfpst, uint32_t desc)
 845{
 846    uintptr_t opr_sz = simd_oprsz(desc);
 847    float32 *d = vd, *n = vn, *m = vm, *a = va;
 848    float_status *fpst = vfpst;
 849    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
 850    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
 851    uint32_t neg_real = flip ^ neg_imag;
 852    uintptr_t i;
 853
 854    /* Shift boolean to the sign bit so we can xor to negate.  */
 855    neg_real <<= 31;
 856    neg_imag <<= 31;
 857
 858    for (i = 0; i < opr_sz / 4; i += 2) {
 859        float32 e2 = n[H4(i + flip)];
 860        float32 e1 = m[H4(i + flip)] ^ neg_real;
 861        float32 e4 = e2;
 862        float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag;
 863
 864        d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], 0, fpst);
 865        d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], 0, fpst);
 866    }
 867    clear_tail(d, opr_sz, simd_maxsz(desc));
 868}
 869
 870void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va,
 871                             void *vfpst, uint32_t desc)
 872{
 873    uintptr_t opr_sz = simd_oprsz(desc);
 874    float32 *d = vd, *n = vn, *m = vm, *a = va;
 875    float_status *fpst = vfpst;
 876    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
 877    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
 878    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
 879    uint32_t neg_real = flip ^ neg_imag;
 880    intptr_t elements = opr_sz / sizeof(float32);
 881    intptr_t eltspersegment = 16 / sizeof(float32);
 882    intptr_t i, j;
 883
 884    /* Shift boolean to the sign bit so we can xor to negate.  */
 885    neg_real <<= 31;
 886    neg_imag <<= 31;
 887
 888    for (i = 0; i < elements; i += eltspersegment) {
 889        float32 mr = m[H4(i + 2 * index + 0)];
 890        float32 mi = m[H4(i + 2 * index + 1)];
 891        float32 e1 = neg_real ^ (flip ? mi : mr);
 892        float32 e3 = neg_imag ^ (flip ? mr : mi);
 893
 894        for (j = i; j < i + eltspersegment; j += 2) {
 895            float32 e2 = n[H4(j + flip)];
 896            float32 e4 = e2;
 897
 898            d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst);
 899            d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst);
 900        }
 901    }
 902    clear_tail(d, opr_sz, simd_maxsz(desc));
 903}
 904
 905void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va,
 906                         void *vfpst, uint32_t desc)
 907{
 908    uintptr_t opr_sz = simd_oprsz(desc);
 909    float64 *d = vd, *n = vn, *m = vm, *a = va;
 910    float_status *fpst = vfpst;
 911    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
 912    uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
 913    uint64_t neg_real = flip ^ neg_imag;
 914    uintptr_t i;
 915
 916    /* Shift boolean to the sign bit so we can xor to negate.  */
 917    neg_real <<= 63;
 918    neg_imag <<= 63;
 919
 920    for (i = 0; i < opr_sz / 8; i += 2) {
 921        float64 e2 = n[i + flip];
 922        float64 e1 = m[i + flip] ^ neg_real;
 923        float64 e4 = e2;
 924        float64 e3 = m[i + 1 - flip] ^ neg_imag;
 925
 926        d[i] = float64_muladd(e2, e1, a[i], 0, fpst);
 927        d[i + 1] = float64_muladd(e4, e3, a[i + 1], 0, fpst);
 928    }
 929    clear_tail(d, opr_sz, simd_maxsz(desc));
 930}
 931
 932/*
 933 * Floating point comparisons producing an integer result (all 1s or all 0s).
 934 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
 935 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
 936 */
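/* Negating softfloat's 0/1 result in an unsigned type yields 0 or all-ones. */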
 937static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat)
 938{
 939    return -float16_eq_quiet(op1, op2, stat);
 940}
 941
 942static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat)
 943{
 944    return -float32_eq_quiet(op1, op2, stat);
 945}
 946
 947static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat)
 948{
 949    return -float16_le(op2, op1, stat);
 950}
 951
 952static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat)
 953{
 954    return -float32_le(op2, op1, stat);
 955}
 956
 957static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat)
 958{
 959    return -float16_lt(op2, op1, stat);
 960}
 961
 962static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat)
 963{
 964    return -float32_lt(op2, op1, stat);
 965}
 966
 967static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat)
 968{
 969    return -float16_le(float16_abs(op2), float16_abs(op1), stat);
 970}
 971
 972static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat)
 973{
 974    return -float32_le(float32_abs(op2), float32_abs(op1), stat);
 975}
 976
 977static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat)
 978{
 979    return -float16_lt(float16_abs(op2), float16_abs(op1), stat);
 980}
 981
 982static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat)
 983{
 984    return -float32_lt(float32_abs(op2), float32_abs(op1), stat);
 985}
 986
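/*
 * Float16 to 16-bit integer conversions, round to zero.  A NaN input
 * raises Invalid but converts to 0, as the Arm pseudocode requires.
 */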
 987static int16_t vfp_tosszh(float16 x, void *fpstp)
 988{
 989    float_status *fpst = fpstp;
 990    if (float16_is_any_nan(x)) {
 991        float_raise(float_flag_invalid, fpst);
 992        return 0;
 993    }
 994    return float16_to_int16_round_to_zero(x, fpst);
 995}
 996
 997static uint16_t vfp_touszh(float16 x, void *fpstp)
 998{
 999    float_status *fpst = fpstp;
1000    if (float16_is_any_nan(x)) {
1001        float_raise(float_flag_invalid, fpst);
1002        return 0;
1003    }
1004    return float16_to_uint16_round_to_zero(x, fpst);
1005}
1006
1007#define DO_2OP(NAME, FUNC, TYPE) \
1008void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)  \
1009{                                                                 \
1010    intptr_t i, oprsz = simd_oprsz(desc);                         \
1011    TYPE *d = vd, *n = vn;                                        \
1012    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                  \
1013        d[i] = FUNC(n[i], stat);                                  \
1014    }                                                             \
1015    clear_tail(d, oprsz, simd_maxsz(desc));                       \
1016}
1017
1018DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
1019DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
1020DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)
1021
1022DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
1023DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
1024DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)
1025
1026DO_2OP(gvec_vrintx_h, float16_round_to_int, float16)
1027DO_2OP(gvec_vrintx_s, float32_round_to_int, float32)
1028
1029DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t)
1030DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t)
1031DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32)
1032DO_2OP(gvec_touizs, helper_vfp_touizs, float32)
1033DO_2OP(gvec_sstoh, int16_to_float16, int16_t)
1034DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t)
1035DO_2OP(gvec_tosszh, vfp_tosszh, float16)
1036DO_2OP(gvec_touszh, vfp_touszh, float16)
1037
1038#define WRAP_CMP0_FWD(FN, CMPOP, TYPE)                          \
1039    static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)     \
1040    {                                                           \
1041        return TYPE##_##CMPOP(op, TYPE##_zero, stat);           \
1042    }
1043
1044#define WRAP_CMP0_REV(FN, CMPOP, TYPE)                          \
1045    static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)    \
1046    {                                                           \
1047        return TYPE##_##CMPOP(TYPE##_zero, op, stat);           \
1048    }
1049
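/*
 * Compare-against-zero: FWD compares the operand against +0.0 directly,
 * REV swaps the operands so that clt/cle can reuse the cgt/cge helpers.
 */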
1050#define DO_2OP_CMP0(FN, CMPOP, DIRN)                    \
1051    WRAP_CMP0_##DIRN(FN, CMPOP, float16)                \
1052    WRAP_CMP0_##DIRN(FN, CMPOP, float32)                \
1053    DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16)   \
1054    DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32)
1055
1056DO_2OP_CMP0(cgt, cgt, FWD)
1057DO_2OP_CMP0(cge, cge, FWD)
1058DO_2OP_CMP0(ceq, ceq, FWD)
1059DO_2OP_CMP0(clt, cgt, REV)
1060DO_2OP_CMP0(cle, cge, REV)
1061
1062#undef DO_2OP
1063#undef DO_2OP_CMP0
1064
1065/* Floating-point trigonometric starting value.
1066 * See the ARM ARM pseudocode function FPTrigSMul.
1067 */
1068static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
1069{
1070    float16 result = float16_mul(op1, op1, stat);
1071    if (!float16_is_any_nan(result)) {
1072        result = float16_set_sign(result, op2 & 1);
1073    }
1074    return result;
1075}
1076
1077static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
1078{
1079    float32 result = float32_mul(op1, op1, stat);
1080    if (!float32_is_any_nan(result)) {
1081        result = float32_set_sign(result, op2 & 1);
1082    }
1083    return result;
1084}
1085
1086static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
1087{
1088    float64 result = float64_mul(op1, op1, stat);
1089    if (!float64_is_any_nan(result)) {
1090        result = float64_set_sign(result, op2 & 1);
1091    }
1092    return result;
1093}
1094
1095static float16 float16_abd(float16 op1, float16 op2, float_status *stat)
1096{
1097    return float16_abs(float16_sub(op1, op2, stat));
1098}
1099
1100static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
1101{
1102    return float32_abs(float32_sub(op1, op2, stat));
1103}
1104
1105/*
 1106 * Reciprocal step. These are the AArch32 versions, which use a
 1107 * non-fused multiply-and-subtract.
1108 */
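/*
 * VRECPS computes 2 - op1 * op2, the Newton-Raphson refinement step for a
 * reciprocal estimate: x' = x * (2 - d * x).
 */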
1109static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat)
1110{
1111    op1 = float16_squash_input_denormal(op1, stat);
1112    op2 = float16_squash_input_denormal(op2, stat);
1113
1114    if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
1115        (float16_is_infinity(op2) && float16_is_zero(op1))) {
1116        return float16_two;
1117    }
1118    return float16_sub(float16_two, float16_mul(op1, op2, stat), stat);
1119}
1120
1121static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat)
1122{
1123    op1 = float32_squash_input_denormal(op1, stat);
1124    op2 = float32_squash_input_denormal(op2, stat);
1125
1126    if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
1127        (float32_is_infinity(op2) && float32_is_zero(op1))) {
1128        return float32_two;
1129    }
1130    return float32_sub(float32_two, float32_mul(op1, op2, stat), stat);
1131}
1132
1133/* Reciprocal square-root step. AArch32 non-fused semantics. */
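/*
 * VRSQRTS computes (3 - op1 * op2) / 2, the Newton-Raphson refinement step
 * for a reciprocal square-root estimate: x' = x * (3 - d * x * x) / 2.
 */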
1134static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat)
1135{
1136    op1 = float16_squash_input_denormal(op1, stat);
1137    op2 = float16_squash_input_denormal(op2, stat);
1138
1139    if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
1140        (float16_is_infinity(op2) && float16_is_zero(op1))) {
1141        return float16_one_point_five;
1142    }
1143    op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat);
1144    return float16_div(op1, float16_two, stat);
1145}
1146
1147static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat)
1148{
1149    op1 = float32_squash_input_denormal(op1, stat);
1150    op2 = float32_squash_input_denormal(op2, stat);
1151
1152    if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
1153        (float32_is_infinity(op2) && float32_is_zero(op1))) {
1154        return float32_one_point_five;
1155    }
1156    op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat);
1157    return float32_div(op1, float32_two, stat);
1158}
1159
1160#define DO_3OP(NAME, FUNC, TYPE) \
1161void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1162{                                                                          \
1163    intptr_t i, oprsz = simd_oprsz(desc);                                  \
1164    TYPE *d = vd, *n = vn, *m = vm;                                        \
1165    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
1166        d[i] = FUNC(n[i], m[i], stat);                                     \
1167    }                                                                      \
1168    clear_tail(d, oprsz, simd_maxsz(desc));                                \
1169}
1170
1171DO_3OP(gvec_fadd_h, float16_add, float16)
1172DO_3OP(gvec_fadd_s, float32_add, float32)
1173DO_3OP(gvec_fadd_d, float64_add, float64)
1174
1175DO_3OP(gvec_fsub_h, float16_sub, float16)
1176DO_3OP(gvec_fsub_s, float32_sub, float32)
1177DO_3OP(gvec_fsub_d, float64_sub, float64)
1178
1179DO_3OP(gvec_fmul_h, float16_mul, float16)
1180DO_3OP(gvec_fmul_s, float32_mul, float32)
1181DO_3OP(gvec_fmul_d, float64_mul, float64)
1182
1183DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
1184DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
1185DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)
1186
1187DO_3OP(gvec_fabd_h, float16_abd, float16)
1188DO_3OP(gvec_fabd_s, float32_abd, float32)
1189
1190DO_3OP(gvec_fceq_h, float16_ceq, float16)
1191DO_3OP(gvec_fceq_s, float32_ceq, float32)
1192
1193DO_3OP(gvec_fcge_h, float16_cge, float16)
1194DO_3OP(gvec_fcge_s, float32_cge, float32)
1195
1196DO_3OP(gvec_fcgt_h, float16_cgt, float16)
1197DO_3OP(gvec_fcgt_s, float32_cgt, float32)
1198
1199DO_3OP(gvec_facge_h, float16_acge, float16)
1200DO_3OP(gvec_facge_s, float32_acge, float32)
1201
1202DO_3OP(gvec_facgt_h, float16_acgt, float16)
1203DO_3OP(gvec_facgt_s, float32_acgt, float32)
1204
1205DO_3OP(gvec_fmax_h, float16_max, float16)
1206DO_3OP(gvec_fmax_s, float32_max, float32)
1207
1208DO_3OP(gvec_fmin_h, float16_min, float16)
1209DO_3OP(gvec_fmin_s, float32_min, float32)
1210
1211DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16)
1212DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32)
1213
1214DO_3OP(gvec_fminnum_h, float16_minnum, float16)
1215DO_3OP(gvec_fminnum_s, float32_minnum, float32)
1216
1217DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16)
1218DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32)
1219
1220DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16)
1221DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32)
1222
1223#ifdef TARGET_AARCH64
1224
1225DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
1226DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
1227DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)
1228
1229DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
1230DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
1231DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)
1232
1233#endif
1234#undef DO_3OP
1235
1236/* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */
1237static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2,
1238                                 float_status *stat)
1239{
1240    return float16_add(dest, float16_mul(op1, op2, stat), stat);
1241}
1242
1243static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2,
1244                                 float_status *stat)
1245{
1246    return float32_add(dest, float32_mul(op1, op2, stat), stat);
1247}
1248
1249static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2,
1250                                 float_status *stat)
1251{
1252    return float16_sub(dest, float16_mul(op1, op2, stat), stat);
1253}
1254
1255static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
1256                                 float_status *stat)
1257{
1258    return float32_sub(dest, float32_mul(op1, op2, stat), stat);
1259}
1260
1261/* Fused versions; these have the semantics Neon VFMA/VFMS want */
1262static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
1263                                float_status *stat)
1264{
1265    return float16_muladd(op1, op2, dest, 0, stat);
1266}
1267
1268static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
1269                                 float_status *stat)
1270{
1271    return float32_muladd(op1, op2, dest, 0, stat);
1272}
1273
1274static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
1275                                 float_status *stat)
1276{
1277    return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
1278}
1279
1280static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
1281                                 float_status *stat)
1282{
1283    return float32_muladd(float32_chs(op1), op2, dest, 0, stat);
1284}
1285
1286#define DO_MULADD(NAME, FUNC, TYPE)                                     \
1287void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1288{                                                                          \
1289    intptr_t i, oprsz = simd_oprsz(desc);                                  \
1290    TYPE *d = vd, *n = vn, *m = vm;                                        \
1291    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
1292        d[i] = FUNC(d[i], n[i], m[i], stat);                               \
1293    }                                                                      \
1294    clear_tail(d, oprsz, simd_maxsz(desc));                                \
1295}
1296
1297DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16)
1298DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32)
1299
1300DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16)
1301DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32)
1302
1303DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
1304DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)
1305
1306DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
1307DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
1308
1309/* For the indexed ops, SVE applies the index per 128-bit vector segment.
1310 * For AdvSIMD, there is of course only one such vector segment.
1311 */
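/*
 * For example, with a 256-bit SVE vector of 32-bit elements and index 1,
 * element 1 of the low 128-bit segment is used for lanes 0-3 and element 1
 * of the high segment for lanes 4-7.
 */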
1312
1313#define DO_MUL_IDX(NAME, TYPE, H) \
1314void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1315{                                                                          \
1316    intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1317    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1318    intptr_t idx = simd_data(desc);                                        \
1319    TYPE *d = vd, *n = vn, *m = vm;                                        \
1320    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1321        TYPE mm = m[H(i + idx)];                                           \
1322        for (j = 0; j < segment; j++) {                                    \
1323            d[i + j] = n[i + j] * mm;                                      \
1324        }                                                                  \
1325    }                                                                      \
1326    clear_tail(d, oprsz, simd_maxsz(desc));                                \
1327}
1328
1329DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2)
1330DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4)
1331DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8)
1332
1333#undef DO_MUL_IDX
1334
1335#define DO_MLA_IDX(NAME, TYPE, OP, H) \
1336void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)   \
1337{                                                                          \
1338    intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1339    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1340    intptr_t idx = simd_data(desc);                                        \
1341    TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
1342    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1343        TYPE mm = m[H(i + idx)];                                           \
1344        for (j = 0; j < segment; j++) {                                    \
1345            d[i + j] = a[i + j] OP n[i + j] * mm;                          \
1346        }                                                                  \
1347    }                                                                      \
1348    clear_tail(d, oprsz, simd_maxsz(desc));                                \
1349}
1350
1351DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2)
1352DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4)
1353DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8)
1354
1355DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
1356DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
1357DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8)
1358
1359#undef DO_MLA_IDX
1360
1361#define DO_FMUL_IDX(NAME, ADD, TYPE, H)                                    \
1362void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1363{                                                                          \
1364    intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1365    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1366    intptr_t idx = simd_data(desc);                                        \
1367    TYPE *d = vd, *n = vn, *m = vm;                                        \
1368    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1369        TYPE mm = m[H(i + idx)];                                           \
1370        for (j = 0; j < segment; j++) {                                    \
1371            d[i + j] = TYPE##_##ADD(d[i + j],                              \
1372                                    TYPE##_mul(n[i + j], mm, stat), stat); \
1373        }                                                                  \
1374    }                                                                      \
1375    clear_tail(d, oprsz, simd_maxsz(desc));                                \
1376}
1377
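/* With the 'nop' adder, DO_FMUL_IDX reduces to a plain indexed multiply. */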
1378#define float16_nop(N, M, S) (M)
1379#define float32_nop(N, M, S) (M)
1380#define float64_nop(N, M, S) (M)
1381
1382DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16, H2)
1383DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32, H4)
1384DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64, H8)
1385
1386/*
 1387 * Non-fused multiply-accumulate operations, for Neon. NB that unlike
 1388 * the fused ops below, these accumulate both from and into Vd.
1389 */
1390DO_FMUL_IDX(gvec_fmla_nf_idx_h, add, float16, H2)
1391DO_FMUL_IDX(gvec_fmla_nf_idx_s, add, float32, H4)
1392DO_FMUL_IDX(gvec_fmls_nf_idx_h, sub, float16, H2)
1393DO_FMUL_IDX(gvec_fmls_nf_idx_s, sub, float32, H4)
1394
1395#undef float16_nop
1396#undef float32_nop
1397#undef float64_nop
1398#undef DO_FMUL_IDX
1399
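/*
 * Fused multiply-accumulate by indexed element.  Descriptor data bit 0
 * negates op1 (used for the FMLS forms); the remaining bits hold the index.
 */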
1400#define DO_FMLA_IDX(NAME, TYPE, H)                                         \
1401void HELPER(NAME)(void *vd, void *vn, void *vm, void *va,                  \
1402                  void *stat, uint32_t desc)                               \
1403{                                                                          \
1404    intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1405    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1406    TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1);                    \
1407    intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1);                          \
1408    TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
1409    op1_neg <<= (8 * sizeof(TYPE) - 1);                                    \
1410    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1411        TYPE mm = m[H(i + idx)];                                           \
1412        for (j = 0; j < segment; j++) {                                    \
1413            d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg,                   \
1414                                     mm, a[i + j], 0, stat);               \
1415        }                                                                  \
1416    }                                                                      \
1417    clear_tail(d, oprsz, simd_maxsz(desc));                                \
1418}
1419
1420DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2)
1421DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
1422DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8)
1423
1424#undef DO_FMLA_IDX
1425
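/*
 * Saturating add/sub for elements narrower than 64 bits: compute in a
 * wider type (WTYPE) so the unsaturated result cannot overflow, clamp to
 * [MIN, MAX], and record any saturation in the low word of the QC flag.
 */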
1426#define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \
1427void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc)   \
1428{                                                                          \
1429    intptr_t i, oprsz = simd_oprsz(desc);                                  \
1430    TYPEN *d = vd, *n = vn; TYPEM *m = vm;                                 \
1431    bool q = false;                                                        \
1432    for (i = 0; i < oprsz / sizeof(TYPEN); i++) {                          \
1433        WTYPE dd = (WTYPE)n[i] OP m[i];                                    \
1434        if (dd < MIN) {                                                    \
1435            dd = MIN;                                                      \
1436            q = true;                                                      \
1437        } else if (dd > MAX) {                                             \
1438            dd = MAX;                                                      \
1439            q = true;                                                      \
1440        }                                                                  \
1441        d[i] = dd;                                                         \
1442    }                                                                      \
1443    if (q) {                                                               \
1444        uint32_t *qc = vq;                                                 \
1445        qc[0] = 1;                                                         \
1446    }                                                                      \
1447    clear_tail(d, oprsz, simd_maxsz(desc));                                \
1448}
1449
1450DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX)
1451DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX)
1452DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX)
1453
1454DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX)
1455DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX)
1456DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX)
1457
1458DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX)
1459DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX)
1460DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX)
1461
1462DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX)
1463DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX)
1464DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX)
1465
1466#undef DO_SAT
1467
1468void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn,
1469                          void *vm, uint32_t desc)
1470{
1471    intptr_t i, oprsz = simd_oprsz(desc);
1472    uint64_t *d = vd, *n = vn, *m = vm;
1473    bool q = false;
1474
1475    for (i = 0; i < oprsz / 8; i++) {
1476        uint64_t nn = n[i], mm = m[i], dd = nn + mm;
1477        if (dd < nn) {
1478            dd = UINT64_MAX;
1479            q = true;
1480        }
1481        d[i] = dd;
1482    }
1483    if (q) {
1484        uint32_t *qc = vq;
1485        qc[0] = 1;
1486    }
1487    clear_tail(d, oprsz, simd_maxsz(desc));
1488}
1489
1490void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn,
1491                          void *vm, uint32_t desc)
1492{
1493    intptr_t i, oprsz = simd_oprsz(desc);
1494    uint64_t *d = vd, *n = vn, *m = vm;
1495    bool q = false;
1496
1497    for (i = 0; i < oprsz / 8; i++) {
1498        uint64_t nn = n[i], mm = m[i], dd = nn - mm;
1499        if (nn < mm) {
1500            dd = 0;
1501            q = true;
1502        }
1503        d[i] = dd;
1504    }
1505    if (q) {
1506        uint32_t *qc = vq;
1507        qc[0] = 1;
1508    }
1509    clear_tail(d, oprsz, simd_maxsz(desc));
1510}
1511
1512void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn,
1513                          void *vm, uint32_t desc)
1514{
1515    intptr_t i, oprsz = simd_oprsz(desc);
1516    int64_t *d = vd, *n = vn, *m = vm;
1517    bool q = false;
1518
1519    for (i = 0; i < oprsz / 8; i++) {
1520        int64_t nn = n[i], mm = m[i], dd = nn + mm;
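        /*
         * Signed overflow iff the operands have the same sign and the
         * result's sign differs; saturate to INT64_MIN or INT64_MAX
         * according to the sign of nn.
         */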
1521        if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) {
1522            dd = (nn >> 63) ^ ~INT64_MIN;
1523            q = true;
1524        }
1525        d[i] = dd;
1526    }
1527    if (q) {
1528        uint32_t *qc = vq;
1529        qc[0] = 1;
1530    }
1531    clear_tail(d, oprsz, simd_maxsz(desc));
1532}
1533
1534void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn,
1535                          void *vm, uint32_t desc)
1536{
1537    intptr_t i, oprsz = simd_oprsz(desc);
1538    int64_t *d = vd, *n = vn, *m = vm;
1539    bool q = false;
1540
1541    for (i = 0; i < oprsz / 8; i++) {
1542        int64_t nn = n[i], mm = m[i], dd = nn - mm;
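        /*
         * For subtraction, overflow iff the operands differ in sign and
         * the result's sign differs from nn; saturate as for addition.
         */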
1543        if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) {
1544            dd = (nn >> 63) ^ ~INT64_MIN;
1545            q = true;
1546        }
1547        d[i] = dd;
1548    }
1549    if (q) {
1550        uint32_t *qc = vq;
1551        qc[0] = 1;
1552    }
1553    clear_tail(d, oprsz, simd_maxsz(desc));
1554}
1555
1556
1557#define DO_SRA(NAME, TYPE)                              \
1558void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1559{                                                       \
1560    intptr_t i, oprsz = simd_oprsz(desc);               \
1561    int shift = simd_data(desc);                        \
1562    TYPE *d = vd, *n = vn;                              \
1563    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1564        d[i] += n[i] >> shift;                          \
1565    }                                                   \
1566    clear_tail(d, oprsz, simd_maxsz(desc));             \
1567}
1568
1569DO_SRA(gvec_ssra_b, int8_t)
1570DO_SRA(gvec_ssra_h, int16_t)
1571DO_SRA(gvec_ssra_s, int32_t)
1572DO_SRA(gvec_ssra_d, int64_t)
1573
1574DO_SRA(gvec_usra_b, uint8_t)
1575DO_SRA(gvec_usra_h, uint16_t)
1576DO_SRA(gvec_usra_s, uint32_t)
1577DO_SRA(gvec_usra_d, uint64_t)
1578
1579#undef DO_SRA
1580
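/*
 * Rounding shift right: shift by one less than the full amount, then the
 * discarded low bit of the intermediate becomes the rounding increment.
 * DO_RSRA below uses the same trick but accumulates into the destination.
 */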
1581#define DO_RSHR(NAME, TYPE)                             \
1582void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1583{                                                       \
1584    intptr_t i, oprsz = simd_oprsz(desc);               \
1585    int shift = simd_data(desc);                        \
1586    TYPE *d = vd, *n = vn;                              \
1587    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1588        TYPE tmp = n[i] >> (shift - 1);                 \
1589        d[i] = (tmp >> 1) + (tmp & 1);                  \
1590    }                                                   \
1591    clear_tail(d, oprsz, simd_maxsz(desc));             \
1592}
1593
1594DO_RSHR(gvec_srshr_b, int8_t)
1595DO_RSHR(gvec_srshr_h, int16_t)
1596DO_RSHR(gvec_srshr_s, int32_t)
1597DO_RSHR(gvec_srshr_d, int64_t)
1598
1599DO_RSHR(gvec_urshr_b, uint8_t)
1600DO_RSHR(gvec_urshr_h, uint16_t)
1601DO_RSHR(gvec_urshr_s, uint32_t)
1602DO_RSHR(gvec_urshr_d, uint64_t)
1603
1604#undef DO_RSHR
1605
1606#define DO_RSRA(NAME, TYPE)                             \
1607void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1608{                                                       \
1609    intptr_t i, oprsz = simd_oprsz(desc);               \
1610    int shift = simd_data(desc);                        \
1611    TYPE *d = vd, *n = vn;                              \
1612    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1613        TYPE tmp = n[i] >> (shift - 1);                 \
1614        d[i] += (tmp >> 1) + (tmp & 1);                 \
1615    }                                                   \
1616    clear_tail(d, oprsz, simd_maxsz(desc));             \
1617}
1618
1619DO_RSRA(gvec_srsra_b, int8_t)
1620DO_RSRA(gvec_srsra_h, int16_t)
1621DO_RSRA(gvec_srsra_s, int32_t)
1622DO_RSRA(gvec_srsra_d, int64_t)
1623
1624DO_RSRA(gvec_ursra_b, uint8_t)
1625DO_RSRA(gvec_ursra_h, uint16_t)
1626DO_RSRA(gvec_ursra_s, uint32_t)
1627DO_RSRA(gvec_ursra_d, uint64_t)
1628
1629#undef DO_RSRA
1630
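/*
 * Shift-and-insert: deposit64 overwrites only the bits produced by the
 * shift, so SRI preserves the top 'shift' bits of each destination
 * element and SLI (below) preserves the bottom 'shift' bits.
 */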
1631#define DO_SRI(NAME, TYPE)                              \
1632void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1633{                                                       \
1634    intptr_t i, oprsz = simd_oprsz(desc);               \
1635    int shift = simd_data(desc);                        \
1636    TYPE *d = vd, *n = vn;                              \
1637    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1638        d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \
1639    }                                                   \
1640    clear_tail(d, oprsz, simd_maxsz(desc));             \
1641}
1642
1643DO_SRI(gvec_sri_b, uint8_t)
1644DO_SRI(gvec_sri_h, uint16_t)
1645DO_SRI(gvec_sri_s, uint32_t)
1646DO_SRI(gvec_sri_d, uint64_t)
1647
1648#undef DO_SRI
1649
1650#define DO_SLI(NAME, TYPE)                              \
1651void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1652{                                                       \
1653    intptr_t i, oprsz = simd_oprsz(desc);               \
1654    int shift = simd_data(desc);                        \
1655    TYPE *d = vd, *n = vn;                              \
1656    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1657        d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \
1658    }                                                   \
1659    clear_tail(d, oprsz, simd_maxsz(desc));             \
1660}
1661
1662DO_SLI(gvec_sli_b, uint8_t)
1663DO_SLI(gvec_sli_h, uint16_t)
1664DO_SLI(gvec_sli_s, uint32_t)
1665DO_SLI(gvec_sli_d, uint64_t)
1666
1667#undef DO_SLI
1668
1669/*
1670 * Convert float16 to float32, raising no exceptions and
1671 * preserving exceptional values, including SNaN.
1672 * This is effectively an unpack+repack operation.
1673 */
1674static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
1675{
1676    const int f16_bias = 15;
1677    const int f32_bias = 127;
1678    uint32_t sign = extract32(f16, 15, 1);
1679    uint32_t exp = extract32(f16, 10, 5);
1680    uint32_t frac = extract32(f16, 0, 10);
1681
1682    if (exp == 0x1f) {
1683        /* Inf or NaN */
1684        exp = 0xff;
1685    } else if (exp == 0) {
1686        /* Zero or denormal.  */
1687        if (frac != 0) {
1688            if (fz16) {
1689                frac = 0;
1690            } else {
1691                /*
1692                 * Denormal; these are all normal float32.
1693                 * Shift the fraction so that the msb is at bit 11,
1694                 * then remove bit 11 as the implicit bit of the
1695                 * normalized float32.  Note that we still go through
1696                 * the shift for normal numbers below, to put the
1697                 * float32 fraction at the right place.
1698                 */
1699                int shift = clz32(frac) - 21;
1700                frac = (frac << shift) & 0x3ff;
1701                exp = f32_bias - f16_bias - shift + 1;
1702            }
1703        }
1704    } else {
1705        /* Normal number; adjust the bias.  */
1706        exp += f32_bias - f16_bias;
1707    }
1708    sign <<= 31;
1709    exp <<= 23;
1710    frac <<= 23 - 10;
1711
1712    return sign | exp | frac;
1713}
1714
1715static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
1716{
1717    /*
1718     * Branchless load of u32[0], u64[0], u32[1], or u64[1].
1719     * Load the 2nd qword iff is_q & is_2.
1720     * Shift to the 2nd dword iff !is_q & is_2.
1721     * For !is_q & !is_2, the upper bits of the result are garbage.
1722     */
1723    return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5);
1724}
1725
1726/*
1727 * Note that FMLAL requires oprsz == 8 or oprsz == 16,
1728 * as there are not yet SVE versions that might use blocking.
1729 */
1730
1731static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
1732                     uint32_t desc, bool fz16)
1733{
1734    intptr_t i, oprsz = simd_oprsz(desc);
1735    int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
1736    int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1737    int is_q = oprsz == 16;
1738    uint64_t n_4, m_4;
1739
1740    /* Pre-load all of the f16 data, avoiding overlap issues.  */
1741    n_4 = load4_f16(vn, is_q, is_2);
1742    m_4 = load4_f16(vm, is_q, is_2);
1743
1744    /* Negate all inputs for FMLSL at once.  */
1745    if (is_s) {
1746        n_4 ^= 0x8000800080008000ull;
1747    }
1748
1749    for (i = 0; i < oprsz / 4; i++) {
1750        float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
1751        float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
1752        d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
1753    }
1754    clear_tail(d, oprsz, simd_maxsz(desc));
1755}
1756
1757void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
1758                            void *venv, uint32_t desc)
1759{
1760    CPUARMState *env = venv;
1761    do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc,
1762             get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1763}
1764
1765void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
1766                            void *venv, uint32_t desc)
1767{
1768    CPUARMState *env = venv;
1769    do_fmlal(vd, vn, vm, &env->vfp.fp_status, desc,
1770             get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1771}
1772
1773void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va,
1774                               void *venv, uint32_t desc)
1775{
1776    intptr_t i, oprsz = simd_oprsz(desc);
1777    uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
1778    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
1779    CPUARMState *env = venv;
1780    float_status *status = &env->vfp.fp_status;
1781    bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);
1782
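    /*
     * negn is the float16 sign bit when the FMLSL form is selected;
     * sel chooses the bottom or top float16 of each 32-bit source element.
     */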
1783    for (i = 0; i < oprsz; i += sizeof(float32)) {
1784        float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negn;
1785        float16 mm_16 = *(float16 *)(vm + H1_2(i + sel));
1786        float32 nn = float16_to_float32_by_bits(nn_16, fz16);
1787        float32 mm = float16_to_float32_by_bits(mm_16, fz16);
1788        float32 aa = *(float32 *)(va + H1_4(i));
1789
1790        *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status);
1791    }
1792}
1793
1794static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst,
1795                         uint32_t desc, bool fz16)
1796{
1797    intptr_t i, oprsz = simd_oprsz(desc);
1798    int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
1799    int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1800    int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3);
1801    int is_q = oprsz == 16;
1802    uint64_t n_4;
1803    float32 m_1;
1804
1805    /* Pre-load all of the f16 data, avoiding overlap issues.  */
1806    n_4 = load4_f16(vn, is_q, is_2);
1807
1808    /* Negate all inputs for FMLSL at once.  */
1809    if (is_s) {
1810        n_4 ^= 0x8000800080008000ull;
1811    }
1812
1813    m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16);
1814
1815    for (i = 0; i < oprsz / 4; i++) {
1816        float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
1817        d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
1818    }
1819    clear_tail(d, oprsz, simd_maxsz(desc));
1820}
1821
1822void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm,
1823                                void *venv, uint32_t desc)
1824{
1825    CPUARMState *env = venv;
1826    do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc,
1827                 get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1828}
1829
1830void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm,
1831                                void *venv, uint32_t desc)
1832{
1833    CPUARMState *env = venv;
1834    do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status, desc,
1835                 get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1836}
1837
1838void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va,
1839                               void *venv, uint32_t desc)
1840{
1841    intptr_t i, j, oprsz = simd_oprsz(desc);
1842    uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
1843    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
1844    intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16);
1845    CPUARMState *env = venv;
1846    float_status *status = &env->vfp.fp_status;
1847    bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);
1848
1849    for (i = 0; i < oprsz; i += 16) {
1850        float16 mm_16 = *(float16 *)(vm + i + idx);
1851        float32 mm = float16_to_float32_by_bits(mm_16, fz16);
1852
1853        for (j = 0; j < 16; j += sizeof(float32)) {
1854            float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negn;
1855            float32 nn = float16_to_float32_by_bits(nn_16, fz16);
1856            float32 aa = *(float32 *)(va + H1_4(i + j));
1857
1858            *(float32 *)(vd + H1_4(i + j)) =
1859                float32_muladd(nn, mm, aa, 0, status);
1860        }
1861    }
1862}
1863
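/*
 * Shifts by a signed, per-element count: a non-negative count shifts
 * left, a negative count shifts right.  Counts of the element size or
 * more produce zero for left shifts and unsigned right shifts; signed
 * right shifts clamp the count so the result is the replicated sign bit.
 */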
1864void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc)
1865{
1866    intptr_t i, opr_sz = simd_oprsz(desc);
1867    int8_t *d = vd, *n = vn, *m = vm;
1868
1869    for (i = 0; i < opr_sz; ++i) {
1870        int8_t mm = m[i];
1871        int8_t nn = n[i];
1872        int8_t res = 0;
1873        if (mm >= 0) {
1874            if (mm < 8) {
1875                res = nn << mm;
1876            }
1877        } else {
1878            res = nn >> (mm > -8 ? -mm : 7);
1879        }
1880        d[i] = res;
1881    }
1882    clear_tail(d, opr_sz, simd_maxsz(desc));
1883}
1884
1885void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc)
1886{
1887    intptr_t i, opr_sz = simd_oprsz(desc);
1888    int16_t *d = vd, *n = vn, *m = vm;
1889
1890    for (i = 0; i < opr_sz / 2; ++i) {
1891        int8_t mm = m[i];   /* only 8 bits of shift are significant */
1892        int16_t nn = n[i];
1893        int16_t res = 0;
1894        if (mm >= 0) {
1895            if (mm < 16) {
1896                res = nn << mm;
1897            }
1898        } else {
1899            res = nn >> (mm > -16 ? -mm : 15);
1900        }
1901        d[i] = res;
1902    }
1903    clear_tail(d, opr_sz, simd_maxsz(desc));
1904}
1905
1906void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
1907{
1908    intptr_t i, opr_sz = simd_oprsz(desc);
1909    uint8_t *d = vd, *n = vn, *m = vm;
1910
1911    for (i = 0; i < opr_sz; ++i) {
1912        int8_t mm = m[i];
1913        uint8_t nn = n[i];
1914        uint8_t res = 0;
1915        if (mm >= 0) {
1916            if (mm < 8) {
1917                res = nn << mm;
1918            }
1919        } else {
1920            if (mm > -8) {
1921                res = nn >> -mm;
1922            }
1923        }
1924        d[i] = res;
1925    }
1926    clear_tail(d, opr_sz, simd_maxsz(desc));
1927}
1928
1929void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
1930{
1931    intptr_t i, opr_sz = simd_oprsz(desc);
1932    uint16_t *d = vd, *n = vn, *m = vm;
1933
1934    for (i = 0; i < opr_sz / 2; ++i) {
1935        int8_t mm = m[i];   /* only 8 bits of shift are significant */
1936        uint16_t nn = n[i];
1937        uint16_t res = 0;
1938        if (mm >= 0) {
1939            if (mm < 16) {
1940                res = nn << mm;
1941            }
1942        } else {
1943            if (mm > -16) {
1944                res = nn >> -mm;
1945            }
1946        }
1947        d[i] = res;
1948    }
1949    clear_tail(d, opr_sz, simd_maxsz(desc));
1950}
1951
1952/*
1953 * 8x8->8 polynomial multiply.
1954 *
1955 * Polynomial multiplication is like integer multiplication except the
1956 * partial products are XORed, not added.
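 * For example, 0x3 * 0x3 = 0x3 ^ 0x6 = 0x5, whereas the integer
 * product would be 9.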
1957 *
1958 * TODO: expose this as a generic vector operation, as it is a common
1959 * crypto building block.
1960 */
1961void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
1962{
1963    intptr_t i, j, opr_sz = simd_oprsz(desc);
1964    uint64_t *d = vd, *n = vn, *m = vm;
1965
1966    for (i = 0; i < opr_sz / 8; ++i) {
1967        uint64_t nn = n[i];
1968        uint64_t mm = m[i];
1969        uint64_t rr = 0;
1970
1971        for (j = 0; j < 8; ++j) {
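            /*
             * Broadcast bit 0 of every byte of nn to a full-byte mask,
             * XOR in that partial product, then advance both operands
             * to the next bit position.
             */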
1972            uint64_t mask = (nn & 0x0101010101010101ull) * 0xff;
1973            rr ^= mm & mask;
1974            mm = (mm << 1) & 0xfefefefefefefefeull;
1975            nn >>= 1;
1976        }
1977        d[i] = rr;
1978    }
1979    clear_tail(d, opr_sz, simd_maxsz(desc));
1980}
1981
1982/*
1983 * 64x64->128 polynomial multiply.
1984 * Because the lanes are not accessed in strict columns,
1985 * this probably cannot be turned into a generic helper.
1986 */
1987void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
1988{
1989    intptr_t i, j, opr_sz = simd_oprsz(desc);
1990    intptr_t hi = simd_data(desc);
1991    uint64_t *d = vd, *n = vn, *m = vm;
1992
1993    for (i = 0; i < opr_sz / 8; i += 2) {
1994        uint64_t nn = n[i + hi];
1995        uint64_t mm = m[i + hi];
1996        uint64_t rhi = 0;
1997        uint64_t rlo = 0;
1998
1999        /* Bit 0 can only influence the low 64-bit result.  */
2000        if (nn & 1) {
2001            rlo = mm;
2002        }
2003
2004        for (j = 1; j < 64; ++j) {
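            /*
             * Each partial product mm << j straddles the 128-bit result:
             * accumulate its low part into rlo and the bits shifted out
             * of the top into rhi.
             */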
2005            uint64_t mask = -((nn >> j) & 1);
2006            rlo ^= (mm << j) & mask;
2007            rhi ^= (mm >> (64 - j)) & mask;
2008        }
2009        d[i] = rlo;
2010        d[i + 1] = rhi;
2011    }
2012    clear_tail(d, opr_sz, simd_maxsz(desc));
2013}
2014
2015/*
2016 * 8x8->16 polynomial multiply.
2017 *
2018 * The byte inputs are expanded to (or extracted from) half-words.
2019 * Note that Neon and SVE2 get the inputs from different positions.
2020 * This allows 4 bytes to be processed in parallel with uint64_t.
2021 */
2022
2023static uint64_t expand_byte_to_half(uint64_t x)
2024{
2025    return  (x & 0x000000ff)
2026         | ((x & 0x0000ff00) << 8)
2027         | ((x & 0x00ff0000) << 16)
2028         | ((x & 0xff000000) << 24);
2029}
2030
2031static uint64_t pmull_h(uint64_t op1, uint64_t op2)
2032{
2033    uint64_t result = 0;
2034    int i;
2035
2036    for (i = 0; i < 8; ++i) {
2037        uint64_t mask = (op1 & 0x0001000100010001ull) * 0xffff;
2038        result ^= op2 & mask;
2039        op1 >>= 1;
2040        op2 <<= 1;
2041    }
2042    return result;
2043}
2044
2045void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2046{
2047    int hi = simd_data(desc);
2048    uint64_t *d = vd, *n = vn, *m = vm;
2049    uint64_t nn = n[hi], mm = m[hi];
2050
2051    d[0] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));
2052    nn >>= 32;
2053    mm >>= 32;
2054    d[1] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));
2055
2056    clear_tail(d, 16, simd_maxsz(desc));
2057}
2058
2059#ifdef TARGET_AARCH64
2060void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2061{
2062    int shift = simd_data(desc) * 8;
2063    intptr_t i, opr_sz = simd_oprsz(desc);
2064    uint64_t *d = vd, *n = vn, *m = vm;
2065
2066    for (i = 0; i < opr_sz / 8; ++i) {
2067        uint64_t nn = (n[i] >> shift) & 0x00ff00ff00ff00ffull;
2068        uint64_t mm = (m[i] >> shift) & 0x00ff00ff00ff00ffull;
2069
2070        d[i] = pmull_h(nn, mm);
2071    }
2072}
2073
2074static uint64_t pmull_d(uint64_t op1, uint64_t op2)
2075{
2076    uint64_t result = 0;
2077    int i;
2078
2079    for (i = 0; i < 32; ++i) {
2080        uint64_t mask = -((op1 >> i) & 1);
2081        result ^= (op2 << i) & mask;
2082    }
2083    return result;
2084}
2085
2086void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc)
2087{
2088    intptr_t sel = H4(simd_data(desc));
2089    intptr_t i, opr_sz = simd_oprsz(desc);
2090    uint32_t *n = vn, *m = vm;
2091    uint64_t *d = vd;
2092
2093    for (i = 0; i < opr_sz / 8; ++i) {
2094        d[i] = pmull_d(n[2 * i + sel], m[2 * i + sel]);
2095    }
2096}
2097#endif
2098
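/*
 * Compare each element against zero, producing an all-ones mask for true
 * and zero for false (the negation of the 0/1 C comparison result).
 */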
2099#define DO_CMP0(NAME, TYPE, OP)                         \
2100void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
2101{                                                       \
2102    intptr_t i, opr_sz = simd_oprsz(desc);              \
2103    for (i = 0; i < opr_sz; i += sizeof(TYPE)) {        \
2104        TYPE nn = *(TYPE *)(vn + i);                    \
2105        *(TYPE *)(vd + i) = -(nn OP 0);                 \
2106    }                                                   \
2107    clear_tail(vd, opr_sz, simd_maxsz(desc));           \
2108}
2109
2110DO_CMP0(gvec_ceq0_b, int8_t, ==)
2111DO_CMP0(gvec_clt0_b, int8_t, <)
2112DO_CMP0(gvec_cle0_b, int8_t, <=)
2113DO_CMP0(gvec_cgt0_b, int8_t, >)
2114DO_CMP0(gvec_cge0_b, int8_t, >=)
2115
2116DO_CMP0(gvec_ceq0_h, int16_t, ==)
2117DO_CMP0(gvec_clt0_h, int16_t, <)
2118DO_CMP0(gvec_cle0_h, int16_t, <=)
2119DO_CMP0(gvec_cgt0_h, int16_t, >)
2120DO_CMP0(gvec_cge0_h, int16_t, >=)
2121
2122#undef DO_CMP0
2123
2124#define DO_ABD(NAME, TYPE)                                      \
2125void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2126{                                                               \
2127    intptr_t i, opr_sz = simd_oprsz(desc);                      \
2128    TYPE *d = vd, *n = vn, *m = vm;                             \
2129                                                                \
2130    for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2131        d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];         \
2132    }                                                           \
2133    clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2134}
2135
2136DO_ABD(gvec_sabd_b, int8_t)
2137DO_ABD(gvec_sabd_h, int16_t)
2138DO_ABD(gvec_sabd_s, int32_t)
2139DO_ABD(gvec_sabd_d, int64_t)
2140
2141DO_ABD(gvec_uabd_b, uint8_t)
2142DO_ABD(gvec_uabd_h, uint16_t)
2143DO_ABD(gvec_uabd_s, uint32_t)
2144DO_ABD(gvec_uabd_d, uint64_t)
2145
2146#undef DO_ABD
2147
2148#define DO_ABA(NAME, TYPE)                                      \
2149void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2150{                                                               \
2151    intptr_t i, opr_sz = simd_oprsz(desc);                      \
2152    TYPE *d = vd, *n = vn, *m = vm;                             \
2153                                                                \
2154    for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2155        d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];        \
2156    }                                                           \
2157    clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2158}
2159
2160DO_ABA(gvec_saba_b, int8_t)
2161DO_ABA(gvec_saba_h, int16_t)
2162DO_ABA(gvec_saba_s, int32_t)
2163DO_ABA(gvec_saba_d, int64_t)
2164
2165DO_ABA(gvec_uaba_b, uint8_t)
2166DO_ABA(gvec_uaba_h, uint16_t)
2167DO_ABA(gvec_uaba_s, uint32_t)
2168DO_ABA(gvec_uaba_d, uint64_t)
2169
2170#undef DO_ABA
2171
2172#define DO_NEON_PAIRWISE(NAME, OP)                                      \
2173    void HELPER(NAME##s)(void *vd, void *vn, void *vm,                  \
2174                         void *stat, uint32_t oprsz)                    \
2175    {                                                                   \
2176        float_status *fpst = stat;                                      \
2177        float32 *d = vd;                                                \
2178        float32 *n = vn;                                                \
2179        float32 *m = vm;                                                \
2180        float32 r0, r1;                                                 \
2181                                                                        \
2182        /* Read all inputs before writing outputs in case vm == vd */   \
2183        r0 = float32_##OP(n[H4(0)], n[H4(1)], fpst);                    \
2184        r1 = float32_##OP(m[H4(0)], m[H4(1)], fpst);                    \
2185                                                                        \
2186        d[H4(0)] = r0;                                                  \
2187        d[H4(1)] = r1;                                                  \
2188    }                                                                   \
2189                                                                        \
2190    void HELPER(NAME##h)(void *vd, void *vn, void *vm,                  \
2191                         void *stat, uint32_t oprsz)                    \
2192    {                                                                   \
2193        float_status *fpst = stat;                                      \
2194        float16 *d = vd;                                                \
2195        float16 *n = vn;                                                \
2196        float16 *m = vm;                                                \
2197        float16 r0, r1, r2, r3;                                         \
2198                                                                        \
2199        /* Read all inputs before writing outputs in case vm == vd */   \
2200        r0 = float16_##OP(n[H2(0)], n[H2(1)], fpst);                    \
2201        r1 = float16_##OP(n[H2(2)], n[H2(3)], fpst);                    \
2202        r2 = float16_##OP(m[H2(0)], m[H2(1)], fpst);                    \
2203        r3 = float16_##OP(m[H2(2)], m[H2(3)], fpst);                    \
2204                                                                        \
2205        d[H2(0)] = r0;                                                  \
2206        d[H2(1)] = r1;                                                  \
2207        d[H2(2)] = r2;                                                  \
2208        d[H2(3)] = r3;                                                  \
2209    }
2210
2211DO_NEON_PAIRWISE(neon_padd, add)
2212DO_NEON_PAIRWISE(neon_pmax, max)
2213DO_NEON_PAIRWISE(neon_pmin, min)
2214
2215#undef DO_NEON_PAIRWISE
2216
2217#define DO_VCVT_FIXED(NAME, FUNC, TYPE)                                 \
2218    void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
2219    {                                                                   \
2220        intptr_t i, oprsz = simd_oprsz(desc);                           \
2221        int shift = simd_data(desc);                                    \
2222        TYPE *d = vd, *n = vn;                                          \
2223        float_status *fpst = stat;                                      \
2224        for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2225            d[i] = FUNC(n[i], shift, fpst);                             \
2226        }                                                               \
2227        clear_tail(d, oprsz, simd_maxsz(desc));                         \
2228    }
2229
2230DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t)
2231DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t)
2232DO_VCVT_FIXED(gvec_vcvt_fs, helper_vfp_tosls_round_to_zero, uint32_t)
2233DO_VCVT_FIXED(gvec_vcvt_fu, helper_vfp_touls_round_to_zero, uint32_t)
2234DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t)
2235DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t)
2236DO_VCVT_FIXED(gvec_vcvt_hs, helper_vfp_toshh_round_to_zero, uint16_t)
2237DO_VCVT_FIXED(gvec_vcvt_hu, helper_vfp_touhh_round_to_zero, uint16_t)
2238
2239#undef DO_VCVT_FIXED
2240
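/*
 * The *_RMODE helpers temporarily install the rounding mode passed in
 * simd_data, run the operation across the vector, then restore the
 * caller's rounding mode.
 */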
2241#define DO_VCVT_RMODE(NAME, FUNC, TYPE)                                 \
2242    void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
2243    {                                                                   \
2244        float_status *fpst = stat;                                      \
2245        intptr_t i, oprsz = simd_oprsz(desc);                           \
2246        uint32_t rmode = simd_data(desc);                               \
2247        uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
2248        TYPE *d = vd, *n = vn;                                          \
2249        set_float_rounding_mode(rmode, fpst);                           \
2250        for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2251            d[i] = FUNC(n[i], 0, fpst);                                 \
2252        }                                                               \
2253        set_float_rounding_mode(prev_rmode, fpst);                      \
2254        clear_tail(d, oprsz, simd_maxsz(desc));                         \
2255    }
2256
2257DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t)
2258DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t)
2259DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t)
2260DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t)
2261
2262#undef DO_VCVT_RMODE
2263
2264#define DO_VRINT_RMODE(NAME, FUNC, TYPE)                                \
2265    void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
2266    {                                                                   \
2267        float_status *fpst = stat;                                      \
2268        intptr_t i, oprsz = simd_oprsz(desc);                           \
2269        uint32_t rmode = simd_data(desc);                               \
2270        uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
2271        TYPE *d = vd, *n = vn;                                          \
2272        set_float_rounding_mode(rmode, fpst);                           \
2273        for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2274            d[i] = FUNC(n[i], fpst);                                    \
2275        }                                                               \
2276        set_float_rounding_mode(prev_rmode, fpst);                      \
2277        clear_tail(d, oprsz, simd_maxsz(desc));                         \
2278    }
2279
2280DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t)
2281DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t)
2282
2283#undef DO_VRINT_RMODE
2284
2285#ifdef TARGET_AARCH64
2286void HELPER(simd_tblx)(void *vd, void *vm, void *venv, uint32_t desc)
2287{
2288    const uint8_t *indices = vm;
2289    CPUARMState *env = venv;
2290    size_t oprsz = simd_oprsz(desc);
2291    uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5);
2292    bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1);
2293    uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6);
2294    union {
2295        uint8_t b[16];
2296        uint64_t d[2];
2297    } result;
2298
2299    /*
2300     * We must construct the final result in a temp, lest the output
2301     * overlap the input table.  For TBL, begin with zero; for TBX,
2302     * begin with the original register contents.  Note that we always
2303     * copy 16 bytes here to avoid an extra branch; clearing the high
2304     * bits of the register for oprsz == 8 is handled below.
2305     */
2306    if (is_tbx) {
2307        memcpy(&result, vd, 16);
2308    } else {
2309        memset(&result, 0, 16);
2310    }
2311
2312    for (size_t i = 0; i < oprsz; ++i) {
2313        uint32_t index = indices[H1(i)];
2314
2315        if (index < table_len) {
2316            /*
2317             * Convert index (a byte offset into the virtual table
2318             * which is a series of 128-bit vectors concatenated)
2319             * into the correct register element, bearing in mind
2320             * that the table can wrap around from V31 to V0.
2321             */
2322            const uint8_t *table = (const uint8_t *)
2323                aa64_vfp_qreg(env, (rn + (index >> 4)) % 32);
2324            result.b[H1(i)] = table[H1(index % 16)];
2325        }
2326    }
2327
2328    memcpy(vd, &result, 16);
2329    clear_tail(vd, oprsz, simd_maxsz(desc));
2330}
2331#endif
2332
2333/*
2334 * NxN -> N highpart multiply
2335 *
2336 * TODO: expose this as a generic vector operation.
2337 */
2338
2339void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2340{
2341    intptr_t i, opr_sz = simd_oprsz(desc);
2342    int8_t *d = vd, *n = vn, *m = vm;
2343
2344    for (i = 0; i < opr_sz; ++i) {
2345        d[i] = ((int32_t)n[i] * m[i]) >> 8;
2346    }
2347    clear_tail(d, opr_sz, simd_maxsz(desc));
2348}
2349
2350void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2351{
2352    intptr_t i, opr_sz = simd_oprsz(desc);
2353    int16_t *d = vd, *n = vn, *m = vm;
2354
2355    for (i = 0; i < opr_sz / 2; ++i) {
2356        d[i] = ((int32_t)n[i] * m[i]) >> 16;
2357    }
2358    clear_tail(d, opr_sz, simd_maxsz(desc));
2359}
2360
2361void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2362{
2363    intptr_t i, opr_sz = simd_oprsz(desc);
2364    int32_t *d = vd, *n = vn, *m = vm;
2365
2366    for (i = 0; i < opr_sz / 4; ++i) {
2367        d[i] = ((int64_t)n[i] * m[i]) >> 32;
2368    }
2369    clear_tail(d, opr_sz, simd_maxsz(desc));
2370}
2371
2372void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2373{
2374    intptr_t i, opr_sz = simd_oprsz(desc);
2375    uint64_t *d = vd, *n = vn, *m = vm;
2376    uint64_t discard;
2377
2378    for (i = 0; i < opr_sz / 8; ++i) {
2379        muls64(&discard, &d[i], n[i], m[i]);
2380    }
2381    clear_tail(d, opr_sz, simd_maxsz(desc));
2382}
2383
2384void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2385{
2386    intptr_t i, opr_sz = simd_oprsz(desc);
2387    uint8_t *d = vd, *n = vn, *m = vm;
2388
2389    for (i = 0; i < opr_sz; ++i) {
2390        d[i] = ((uint32_t)n[i] * m[i]) >> 8;
2391    }
2392    clear_tail(d, opr_sz, simd_maxsz(desc));
2393}
2394
2395void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2396{
2397    intptr_t i, opr_sz = simd_oprsz(desc);
2398    uint16_t *d = vd, *n = vn, *m = vm;
2399
2400    for (i = 0; i < opr_sz / 2; ++i) {
2401        d[i] = ((uint32_t)n[i] * m[i]) >> 16;
2402    }
2403    clear_tail(d, opr_sz, simd_maxsz(desc));
2404}
2405
2406void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2407{
2408    intptr_t i, opr_sz = simd_oprsz(desc);
2409    uint32_t *d = vd, *n = vn, *m = vm;
2410
2411    for (i = 0; i < opr_sz / 4; ++i) {
2412        d[i] = ((uint64_t)n[i] * m[i]) >> 32;
2413    }
2414    clear_tail(d, opr_sz, simd_maxsz(desc));
2415}
2416
2417void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2418{
2419    intptr_t i, opr_sz = simd_oprsz(desc);
2420    uint64_t *d = vd, *n = vn, *m = vm;
2421    uint64_t discard;
2422
2423    for (i = 0; i < opr_sz / 8; ++i) {
2424        mulu64(&discard, &d[i], n[i], m[i]);
2425    }
2426    clear_tail(d, opr_sz, simd_maxsz(desc));
2427}
2428
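/* XAR: XOR the two inputs, then rotate each 64-bit element right by the immediate. */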
2429void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc)
2430{
2431    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2432    int shr = simd_data(desc);
2433    uint64_t *d = vd, *n = vn, *m = vm;
2434
2435    for (i = 0; i < opr_sz; ++i) {
2436        d[i] = ror64(n[i] ^ m[i], shr);
2437    }
2438    clear_tail(d, opr_sz * 8, simd_maxsz(desc));
2439}
2440
2441/*
2442 * Integer matrix-multiply accumulate
2443 */
2444
2445static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm)
2446{
2447    int8_t *n = vn, *m = vm;
2448
2449    for (intptr_t k = 0; k < 8; ++k) {
2450        sum += n[H1(k)] * m[H1(k)];
2451    }
2452    return sum;
2453}
2454
2455static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm)
2456{
2457    uint8_t *n = vn, *m = vm;
2458
2459    for (intptr_t k = 0; k < 8; ++k) {
2460        sum += n[H1(k)] * m[H1(k)];
2461    }
2462    return sum;
2463}
2464
2465static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm)
2466{
2467    uint8_t *n = vn;
2468    int8_t *m = vm;
2469
2470    for (intptr_t k = 0; k < 8; ++k) {
2471        sum += n[H1(k)] * m[H1(k)];
2472    }
2473    return sum;
2474}
2475
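/*
 * Each 16-byte segment holds a 2x2 matrix of int32 accumulators; the
 * inner_loop callback computes the dot product of one 8-byte row of 'n'
 * with one 8-byte row of 'm'.
 */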
2476static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc,
2477                      uint32_t (*inner_loop)(uint32_t, void *, void *))
2478{
2479    intptr_t seg, opr_sz = simd_oprsz(desc);
2480
2481    for (seg = 0; seg < opr_sz; seg += 16) {
2482        uint32_t *d = vd + seg;
2483        uint32_t *a = va + seg;
2484        uint32_t sum0, sum1, sum2, sum3;
2485
2486        /*
2487         * Process the entire segment at once, writing back the
2488         * results only after we've consumed all of the inputs.
2489         *
2490         * Key to indices by column:
2491         *          i   j                  i             j
2492         */
2493        sum0 = a[H4(0 + 0)];
2494        sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0);
2495        sum1 = a[H4(0 + 1)];
2496        sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8);
2497        sum2 = a[H4(2 + 0)];
2498        sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0);
2499        sum3 = a[H4(2 + 1)];
2500        sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8);
2501
2502        d[H4(0)] = sum0;
2503        d[H4(1)] = sum1;
2504        d[H4(2)] = sum2;
2505        d[H4(3)] = sum3;
2506    }
2507    clear_tail(vd, opr_sz, simd_maxsz(desc));
2508}
2509
2510#define DO_MMLA_B(NAME, INNER) \
2511    void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
2512    { do_mmla_b(vd, vn, vm, va, desc, INNER); }
2513
2514DO_MMLA_B(gvec_smmla_b, do_smmla_b)
2515DO_MMLA_B(gvec_ummla_b, do_ummla_b)
2516DO_MMLA_B(gvec_usmmla_b, do_usmmla_b)
2517
2518/*
2519 * BFloat16 Dot Product
2520 */
2521
2522static float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2)
2523{
2524    /* FPCR is ignored for BFDOT and BFMMLA. */
2525    float_status bf_status = {
2526        .tininess_before_rounding = float_tininess_before_rounding,
2527        .float_rounding_mode = float_round_to_odd_inf,
2528        .flush_to_zero = true,
2529        .flush_inputs_to_zero = true,
2530        .default_nan_mode = true,
2531    };
2532    float32 t1, t2;
2533
2534    /*
2535     * Extract each BFloat16 from the element pair, and shift
2536     * them such that they become float32.
2537     */
2538    t1 = float32_mul(e1 << 16, e2 << 16, &bf_status);
2539    t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, &bf_status);
2540    t1 = float32_add(t1, t2, &bf_status);
2541    t1 = float32_add(sum, t1, &bf_status);
2542
2543    return t1;
2544}
2545
2546void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
2547{
2548    intptr_t i, opr_sz = simd_oprsz(desc);
2549    float32 *d = vd, *a = va;
2550    uint32_t *n = vn, *m = vm;
2551
2552    for (i = 0; i < opr_sz / 4; ++i) {
2553        d[i] = bfdotadd(a[i], n[i], m[i]);
2554    }
2555    clear_tail(d, opr_sz, simd_maxsz(desc));
2556}
2557
2558void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm,
2559                            void *va, uint32_t desc)
2560{
2561    intptr_t i, j, opr_sz = simd_oprsz(desc);
2562    intptr_t index = simd_data(desc);
2563    intptr_t elements = opr_sz / 4;
2564    intptr_t eltspersegment = MIN(16 / 4, elements);
2565    float32 *d = vd, *a = va;
2566    uint32_t *n = vn, *m = vm;
2567
2568    for (i = 0; i < elements; i += eltspersegment) {
2569        uint32_t m_idx = m[i + H4(index)];
2570
2571        for (j = i; j < i + eltspersegment; j++) {
2572            d[j] = bfdotadd(a[j], n[j], m_idx);
2573        }
2574    }
2575    clear_tail(d, opr_sz, simd_maxsz(desc));
2576}
2577
2578void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
2579{
2580    intptr_t s, opr_sz = simd_oprsz(desc);
2581    float32 *d = vd, *a = va;
2582    uint32_t *n = vn, *m = vm;
2583
2584    for (s = 0; s < opr_sz / 4; s += 4) {
2585        float32 sum00, sum01, sum10, sum11;
2586
2587        /*
2588         * Process the entire segment at once, writing back the
2589         * results only after we've consumed all of the inputs.
2590         *
2591         * Key to indices by column:
2592         *               i   j           i   k             j   k
2593         */
2594        sum00 = a[s + H4(0 + 0)];
2595        sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)]);
2596        sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)]);
2597
2598        sum01 = a[s + H4(0 + 1)];
2599        sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)]);
2600        sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)]);
2601
2602        sum10 = a[s + H4(2 + 0)];
2603        sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)]);
2604        sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)]);
2605
2606        sum11 = a[s + H4(2 + 1)];
2607        sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)]);
2608        sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)]);
2609
2610        d[s + H4(0 + 0)] = sum00;
2611        d[s + H4(0 + 1)] = sum01;
2612        d[s + H4(2 + 0)] = sum10;
2613        d[s + H4(2 + 1)] = sum11;
2614    }
2615    clear_tail(d, opr_sz, simd_maxsz(desc));
2616}
2617
2618void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va,
2619                         void *stat, uint32_t desc)
2620{
2621    intptr_t i, opr_sz = simd_oprsz(desc);
2622    intptr_t sel = simd_data(desc);
2623    float32 *d = vd, *a = va;
2624    bfloat16 *n = vn, *m = vm;
2625
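    /*
     * A bfloat16 has the same layout as the high half of a float32, so
     * shifting left by 16 reinterprets it as float32; 'sel' selects the
     * even or odd bfloat16 of each 32-bit source element.
     */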
2626    for (i = 0; i < opr_sz / 4; ++i) {
2627        float32 nn = n[H2(i * 2 + sel)] << 16;
2628        float32 mm = m[H2(i * 2 + sel)] << 16;
2629        d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat);
2630    }
2631    clear_tail(d, opr_sz, simd_maxsz(desc));
2632}
2633
2634void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm,
2635                             void *va, void *stat, uint32_t desc)
2636{
2637    intptr_t i, j, opr_sz = simd_oprsz(desc);
2638    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1);
2639    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
2640    intptr_t elements = opr_sz / 4;
2641    intptr_t eltspersegment = MIN(16 / 4, elements);
2642    float32 *d = vd, *a = va;
2643    bfloat16 *n = vn, *m = vm;
2644
2645    for (i = 0; i < elements; i += eltspersegment) {
2646        float32 m_idx = m[H2(2 * i + index)] << 16;
2647
2648        for (j = i; j < i + eltspersegment; j++) {
2649            float32 n_j = n[H2(2 * j + sel)] << 16;
2650            d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat);
2651        }
2652    }
2653    clear_tail(d, opr_sz, simd_maxsz(desc));
2654}
2655