qemu/target/arm/vec_helper.c
   1/*
   2 * ARM AdvSIMD / SVE Vector Operations
   3 *
   4 * Copyright (c) 2018 Linaro
   5 *
   6 * This library is free software; you can redistribute it and/or
   7 * modify it under the terms of the GNU Lesser General Public
   8 * License as published by the Free Software Foundation; either
   9 * version 2.1 of the License, or (at your option) any later version.
  10 *
  11 * This library is distributed in the hope that it will be useful,
  12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14 * Lesser General Public License for more details.
  15 *
  16 * You should have received a copy of the GNU Lesser General Public
  17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
  18 */
  19
  20#include "qemu/osdep.h"
  21#include "cpu.h"
  22#include "exec/helper-proto.h"
  23#include "tcg/tcg-gvec-desc.h"
  24#include "fpu/softfloat.h"
  25#include "qemu/int128.h"
  26#include "vec_internal.h"
  27
  28/*
  29 * Data for expanding active predicate bits to bytes, for byte elements.
  30 *
  31 *  for (i = 0; i < 256; ++i) {
  32 *      unsigned long m = 0;
  33 *      for (j = 0; j < 8; j++) {
  34 *          if ((i >> j) & 1) {
  35 *              m |= 0xfful << (j << 3);
  36 *          }
  37 *      }
  38 *      printf("0x%016lx,\n", m);
  39 *  }
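 *
 *  For example, predicate byte 0x05 (elements 0 and 2 active) expands
 *  to expand_pred_b_data[0x05] == 0x0000000000ff00ff.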
  40 */
  41const uint64_t expand_pred_b_data[256] = {
  42    0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
  43    0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
  44    0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
  45    0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
  46    0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
  47    0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
  48    0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
  49    0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
  50    0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
  51    0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
  52    0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
  53    0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
  54    0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
  55    0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
  56    0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
  57    0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
  58    0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
  59    0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
  60    0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
  61    0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
  62    0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
  63    0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
  64    0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
  65    0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
  66    0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
  67    0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
  68    0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
  69    0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
  70    0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
  71    0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
  72    0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
  73    0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
  74    0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
  75    0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
  76    0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
  77    0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
  78    0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
  79    0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
  80    0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
  81    0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
  82    0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
  83    0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
  84    0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
  85    0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
  86    0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
  87    0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
  88    0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
  89    0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
  90    0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
  91    0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
  92    0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
  93    0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
  94    0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
  95    0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
  96    0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
  97    0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
  98    0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
  99    0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
 100    0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
 101    0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
 102    0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
 103    0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
 104    0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
 105    0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
 106    0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
 107    0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
 108    0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
 109    0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
 110    0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
 111    0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
 112    0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
 113    0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
 114    0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
 115    0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
 116    0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
 117    0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
 118    0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
 119    0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
 120    0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
 121    0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
 122    0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
 123    0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
 124    0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
 125    0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
 126    0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
 127    0xffffffffffffffff,
 128};
 129
 130/* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
 131int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
 132                     bool neg, bool round)
 133{
 134    /*
 135     * Simplify:
 136     * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8
 137     * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7
 138     */
 139    int32_t ret = (int32_t)src1 * src2;
 140    if (neg) {
 141        ret = -ret;
 142    }
 143    ret += ((int32_t)src3 << 7) + (round << 6);
 144    ret >>= 7;
 145
 146    if (ret != (int8_t)ret) {
 147        ret = (ret < 0 ? INT8_MIN : INT8_MAX);
 148    }
 149    return ret;
 150}
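/*
 * Worked example (illustrative): with neg=false, round=true and src3=0,
 * do_sqrdmlah_b(0x80, 0x80, 0, false, true) computes
 * ((-128 * -128) + (1 << 6)) >> 7 = 16448 >> 7 = 128.  That does not fit
 * in int8_t, so the result saturates to INT8_MAX (127), the usual
 * SQRDMULH corner case of -128 * -128.
 */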
 151
 152void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
 153                             void *va, uint32_t desc)
 154{
 155    intptr_t i, opr_sz = simd_oprsz(desc);
 156    int8_t *d = vd, *n = vn, *m = vm, *a = va;
 157
 158    for (i = 0; i < opr_sz; ++i) {
 159        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
 160    }
 161}
 162
 163void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
 164                             void *va, uint32_t desc)
 165{
 166    intptr_t i, opr_sz = simd_oprsz(desc);
 167    int8_t *d = vd, *n = vn, *m = vm, *a = va;
 168
 169    for (i = 0; i < opr_sz; ++i) {
 170        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
 171    }
 172}
 173
 174void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
 175{
 176    intptr_t i, opr_sz = simd_oprsz(desc);
 177    int8_t *d = vd, *n = vn, *m = vm;
 178
 179    for (i = 0; i < opr_sz; ++i) {
 180        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false);
 181    }
 182}
 183
 184void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
 185{
 186    intptr_t i, opr_sz = simd_oprsz(desc);
 187    int8_t *d = vd, *n = vn, *m = vm;
 188
 189    for (i = 0; i < opr_sz; ++i) {
 190        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true);
 191    }
 192}
 193
 194/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
 195int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
 196                      bool neg, bool round, uint32_t *sat)
 197{
 198    /* Simplify similarly to do_sqrdmlah_b above.  */
 199    int32_t ret = (int32_t)src1 * src2;
 200    if (neg) {
 201        ret = -ret;
 202    }
 203    ret += ((int32_t)src3 << 15) + (round << 14);
 204    ret >>= 15;
 205
 206    if (ret != (int16_t)ret) {
 207        *sat = 1;
 208        ret = (ret < 0 ? INT16_MIN : INT16_MAX);
 209    }
 210    return ret;
 211}
 212
 213uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
 214                                  uint32_t src2, uint32_t src3)
 215{
 216    uint32_t *sat = &env->vfp.qc[0];
 217    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat);
 218    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
 219                                false, true, sat);
 220    return deposit32(e1, 16, 16, e2);
 221}
 222
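/*
 * For the gvec forms, vq points at the QC (cumulative saturation) flag
 * word, which do_sqrdmlah_h sets to 1 whenever a lane saturates.
 */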
 223void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
 224                              void *vq, uint32_t desc)
 225{
 226    uintptr_t opr_sz = simd_oprsz(desc);
 227    int16_t *d = vd;
 228    int16_t *n = vn;
 229    int16_t *m = vm;
 230    uintptr_t i;
 231
 232    for (i = 0; i < opr_sz / 2; ++i) {
 233        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
 234    }
 235    clear_tail(d, opr_sz, simd_maxsz(desc));
 236}
 237
 238uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
 239                                  uint32_t src2, uint32_t src3)
 240{
 241    uint32_t *sat = &env->vfp.qc[0];
 242    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat);
 243    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
 244                                true, true, sat);
 245    return deposit32(e1, 16, 16, e2);
 246}
 247
 248void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
 249                              void *vq, uint32_t desc)
 250{
 251    uintptr_t opr_sz = simd_oprsz(desc);
 252    int16_t *d = vd;
 253    int16_t *n = vn;
 254    int16_t *m = vm;
 255    uintptr_t i;
 256
 257    for (i = 0; i < opr_sz / 2; ++i) {
 258        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq);
 259    }
 260    clear_tail(d, opr_sz, simd_maxsz(desc));
 261}
 262
 263void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm,
 264                            void *vq, uint32_t desc)
 265{
 266    intptr_t i, opr_sz = simd_oprsz(desc);
 267    int16_t *d = vd, *n = vn, *m = vm;
 268
 269    for (i = 0; i < opr_sz / 2; ++i) {
 270        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq);
 271    }
 272    clear_tail(d, opr_sz, simd_maxsz(desc));
 273}
 274
 275void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm,
 276                             void *vq, uint32_t desc)
 277{
 278    intptr_t i, opr_sz = simd_oprsz(desc);
 279    int16_t *d = vd, *n = vn, *m = vm;
 280
 281    for (i = 0; i < opr_sz / 2; ++i) {
 282        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq);
 283    }
 284    clear_tail(d, opr_sz, simd_maxsz(desc));
 285}
 286
 287void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
 288                             void *va, uint32_t desc)
 289{
 290    intptr_t i, opr_sz = simd_oprsz(desc);
 291    int16_t *d = vd, *n = vn, *m = vm, *a = va;
 292    uint32_t discard;
 293
 294    for (i = 0; i < opr_sz / 2; ++i) {
 295        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
 296    }
 297}
 298
 299void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
 300                             void *va, uint32_t desc)
 301{
 302    intptr_t i, opr_sz = simd_oprsz(desc);
 303    int16_t *d = vd, *n = vn, *m = vm, *a = va;
 304    uint32_t discard;
 305
 306    for (i = 0; i < opr_sz / 2; ++i) {
 307        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
 308    }
 309}
 310
 311void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
 312{
 313    intptr_t i, opr_sz = simd_oprsz(desc);
 314    int16_t *d = vd, *n = vn, *m = vm;
 315    uint32_t discard;
 316
 317    for (i = 0; i < opr_sz / 2; ++i) {
 318        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard);
 319    }
 320}
 321
 322void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
 323{
 324    intptr_t i, opr_sz = simd_oprsz(desc);
 325    int16_t *d = vd, *n = vn, *m = vm;
 326    uint32_t discard;
 327
 328    for (i = 0; i < opr_sz / 2; ++i) {
 329        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard);
 330    }
 331}
 332
 333void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
 334{
 335    intptr_t i, j, opr_sz = simd_oprsz(desc);
 336    int idx = simd_data(desc);
 337    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
 338    uint32_t discard;
 339
 340    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
 341        int16_t mm = m[i];
 342        for (j = 0; j < 16 / 2; ++j) {
 343            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard);
 344        }
 345    }
 346}
 347
 348void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
 349{
 350    intptr_t i, j, opr_sz = simd_oprsz(desc);
 351    int idx = simd_data(desc);
 352    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
 353    uint32_t discard;
 354
 355    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
 356        int16_t mm = m[i];
 357        for (j = 0; j < 16 / 2; ++j) {
 358            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard);
 359        }
 360    }
 361}
 362
 363/* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
 364int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
 365                      bool neg, bool round, uint32_t *sat)
 366{
 367    /* Simplify similarly to do_sqrdmlah_b above.  */
 368    int64_t ret = (int64_t)src1 * src2;
 369    if (neg) {
 370        ret = -ret;
 371    }
 372    ret += ((int64_t)src3 << 31) + (round << 30);
 373    ret >>= 31;
 374
 375    if (ret != (int32_t)ret) {
 376        *sat = 1;
 377        ret = (ret < 0 ? INT32_MIN : INT32_MAX);
 378    }
 379    return ret;
 380}
 381
 382uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
 383                                  int32_t src2, int32_t src3)
 384{
 385    uint32_t *sat = &env->vfp.qc[0];
 386    return do_sqrdmlah_s(src1, src2, src3, false, true, sat);
 387}
 388
 389void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
 390                              void *vq, uint32_t desc)
 391{
 392    uintptr_t opr_sz = simd_oprsz(desc);
 393    int32_t *d = vd;
 394    int32_t *n = vn;
 395    int32_t *m = vm;
 396    uintptr_t i;
 397
 398    for (i = 0; i < opr_sz / 4; ++i) {
 399        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq);
 400    }
 401    clear_tail(d, opr_sz, simd_maxsz(desc));
 402}
 403
 404uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
 405                                  int32_t src2, int32_t src3)
 406{
 407    uint32_t *sat = &env->vfp.qc[0];
 408    return do_sqrdmlah_s(src1, src2, src3, true, true, sat);
 409}
 410
 411void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
 412                              void *vq, uint32_t desc)
 413{
 414    uintptr_t opr_sz = simd_oprsz(desc);
 415    int32_t *d = vd;
 416    int32_t *n = vn;
 417    int32_t *m = vm;
 418    uintptr_t i;
 419
 420    for (i = 0; i < opr_sz / 4; ++i) {
 421        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq);
 422    }
 423    clear_tail(d, opr_sz, simd_maxsz(desc));
 424}
 425
 426void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm,
 427                            void *vq, uint32_t desc)
 428{
 429    intptr_t i, opr_sz = simd_oprsz(desc);
 430    int32_t *d = vd, *n = vn, *m = vm;
 431
 432    for (i = 0; i < opr_sz / 4; ++i) {
 433        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq);
 434    }
 435    clear_tail(d, opr_sz, simd_maxsz(desc));
 436}
 437
 438void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm,
 439                             void *vq, uint32_t desc)
 440{
 441    intptr_t i, opr_sz = simd_oprsz(desc);
 442    int32_t *d = vd, *n = vn, *m = vm;
 443
 444    for (i = 0; i < opr_sz / 4; ++i) {
 445        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq);
 446    }
 447    clear_tail(d, opr_sz, simd_maxsz(desc));
 448}
 449
 450void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
 451                             void *va, uint32_t desc)
 452{
 453    intptr_t i, opr_sz = simd_oprsz(desc);
 454    int32_t *d = vd, *n = vn, *m = vm, *a = va;
 455    uint32_t discard;
 456
 457    for (i = 0; i < opr_sz / 4; ++i) {
 458        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
 459    }
 460}
 461
 462void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
 463                             void *va, uint32_t desc)
 464{
 465    intptr_t i, opr_sz = simd_oprsz(desc);
 466    int32_t *d = vd, *n = vn, *m = vm, *a = va;
 467    uint32_t discard;
 468
 469    for (i = 0; i < opr_sz / 4; ++i) {
 470        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
 471    }
 472}
 473
 474void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
 475{
 476    intptr_t i, opr_sz = simd_oprsz(desc);
 477    int32_t *d = vd, *n = vn, *m = vm;
 478    uint32_t discard;
 479
 480    for (i = 0; i < opr_sz / 4; ++i) {
 481        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard);
 482    }
 483}
 484
 485void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
 486{
 487    intptr_t i, opr_sz = simd_oprsz(desc);
 488    int32_t *d = vd, *n = vn, *m = vm;
 489    uint32_t discard;
 490
 491    for (i = 0; i < opr_sz / 4; ++i) {
 492        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard);
 493    }
 494}
 495
 496void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
 497{
 498    intptr_t i, j, opr_sz = simd_oprsz(desc);
 499    int idx = simd_data(desc);
 500    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
 501    uint32_t discard;
 502
 503    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
 504        int32_t mm = m[i];
 505        for (j = 0; j < 16 / 4; ++j) {
 506            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard);
 507        }
 508    }
 509}
 510
 511void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
 512{
 513    intptr_t i, j, opr_sz = simd_oprsz(desc);
 514    int idx = simd_data(desc);
 515    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
 516    uint32_t discard;
 517
 518    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
 519        int32_t mm = m[i];
 520        for (j = 0; j < 16 / 4; ++j) {
 521            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard);
 522        }
 523    }
 524}
 525
 526/* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
 527static int64_t do_sat128_d(Int128 r)
 528{
 529    int64_t ls = int128_getlo(r);
 530    int64_t hs = int128_gethi(r);
 531
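    /*
     * The 128-bit value fits in int64_t only if the high half is just
     * the sign extension of the low half; otherwise saturate according
     * to the sign of the high half.
     */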
 532    if (unlikely(hs != (ls >> 63))) {
 533        return hs < 0 ? INT64_MIN : INT64_MAX;
 534    }
 535    return ls;
 536}
 537
 538int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round)
 539{
 540    uint64_t l, h;
 541    Int128 r, t;
 542
 543    /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
 544    muls64(&l, &h, m, n);
 545    r = int128_make128(l, h);
 546    if (neg) {
 547        r = int128_neg(r);
 548    }
 549    if (a) {
 550        t = int128_exts64(a);
 551        t = int128_lshift(t, 63);
 552        r = int128_add(r, t);
 553    }
 554    if (round) {
 555        t = int128_exts64(1ll << 62);
 556        r = int128_add(r, t);
 557    }
 558    r = int128_rshift(r, 63);
 559
 560    return do_sat128_d(r);
 561}
 562
 563void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
 564                             void *va, uint32_t desc)
 565{
 566    intptr_t i, opr_sz = simd_oprsz(desc);
 567    int64_t *d = vd, *n = vn, *m = vm, *a = va;
 568
 569    for (i = 0; i < opr_sz / 8; ++i) {
 570        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
 571    }
 572}
 573
 574void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
 575                             void *va, uint32_t desc)
 576{
 577    intptr_t i, opr_sz = simd_oprsz(desc);
 578    int64_t *d = vd, *n = vn, *m = vm, *a = va;
 579
 580    for (i = 0; i < opr_sz / 8; ++i) {
 581        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
 582    }
 583}
 584
 585void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
 586{
 587    intptr_t i, opr_sz = simd_oprsz(desc);
 588    int64_t *d = vd, *n = vn, *m = vm;
 589
 590    for (i = 0; i < opr_sz / 8; ++i) {
 591        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false);
 592    }
 593}
 594
 595void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
 596{
 597    intptr_t i, opr_sz = simd_oprsz(desc);
 598    int64_t *d = vd, *n = vn, *m = vm;
 599
 600    for (i = 0; i < opr_sz / 8; ++i) {
 601        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true);
 602    }
 603}
 604
 605void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
 606{
 607    intptr_t i, j, opr_sz = simd_oprsz(desc);
 608    int idx = simd_data(desc);
 609    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
 610
 611    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
 612        int64_t mm = m[i];
 613        for (j = 0; j < 16 / 8; ++j) {
 614            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false);
 615        }
 616    }
 617}
 618
 619void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
 620{
 621    intptr_t i, j, opr_sz = simd_oprsz(desc);
 622    int idx = simd_data(desc);
 623    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
 624
 625    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
 626        int64_t mm = m[i];
 627        for (j = 0; j < 16 / 8; ++j) {
 628            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true);
 629        }
 630    }
 631}
 632
 633/* Integer 8 and 16-bit dot-product.
 634 *
 635 * Note that for the loops herein, host endianness does not matter
 636 * with respect to the ordering of data within the quad-width lanes.
 637 * All elements are treated equally, no matter where they are.
 638 */
 639
 640#define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \
 641void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
 642{                                                                         \
 643    intptr_t i, opr_sz = simd_oprsz(desc);                                \
 644    TYPED *d = vd, *a = va;                                               \
 645    TYPEN *n = vn;                                                        \
 646    TYPEM *m = vm;                                                        \
 647    for (i = 0; i < opr_sz / sizeof(TYPED); ++i) {                        \
 648        d[i] = (a[i] +                                                    \
 649                (TYPED)n[i * 4 + 0] * m[i * 4 + 0] +                      \
 650                (TYPED)n[i * 4 + 1] * m[i * 4 + 1] +                      \
 651                (TYPED)n[i * 4 + 2] * m[i * 4 + 2] +                      \
 652                (TYPED)n[i * 4 + 3] * m[i * 4 + 3]);                      \
 653    }                                                                     \
 654    clear_tail(d, opr_sz, simd_maxsz(desc));                              \
 655}
 656
 657DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t)
 658DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t)
 659DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t)
 660DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t)
 661DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t)
 662
 663#define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \
 664void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
 665{                                                                         \
 666    intptr_t i = 0, opr_sz = simd_oprsz(desc);                            \
 667    intptr_t opr_sz_n = opr_sz / sizeof(TYPED);                           \
 668    intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n);                  \
 669    intptr_t index = simd_data(desc);                                     \
 670    TYPED *d = vd, *a = va;                                               \
 671    TYPEN *n = vn;                                                        \
 672    TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4;                       \
 673    do {                                                                  \
 674        TYPED m0 = m_indexed[i * 4 + 0];                                  \
 675        TYPED m1 = m_indexed[i * 4 + 1];                                  \
 676        TYPED m2 = m_indexed[i * 4 + 2];                                  \
 677        TYPED m3 = m_indexed[i * 4 + 3];                                  \
 678        do {                                                              \
 679            d[i] = (a[i] +                                                \
 680                    n[i * 4 + 0] * m0 +                                   \
 681                    n[i * 4 + 1] * m1 +                                   \
 682                    n[i * 4 + 2] * m2 +                                   \
 683                    n[i * 4 + 3] * m3);                                   \
 684        } while (++i < segend);                                           \
 685        segend = i + 4;                                                   \
 686    } while (i < opr_sz_n);                                               \
 687    clear_tail(d, opr_sz, simd_maxsz(desc));                              \
 688}
 689
 690DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4)
 691DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4)
 692DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4)
 693DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4)
 694DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8)
 695DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8)
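/*
 * For the indexed dot products, "index" selects one group of four TYPEM
 * elements within each 16-byte segment of vm; those four values
 * (m0..m3) are shared by every result element of that segment.
 */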
 696
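/*
 * FCADD: complex values are stored interleaved, real parts in even
 * lanes and imaginary parts in odd lanes.  The single desc bit selects
 * the rotation by choosing whether the real or the imaginary addend
 * taken from m is negated before the two additions.
 */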
 697void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
 698                         void *vfpst, uint32_t desc)
 699{
 700    uintptr_t opr_sz = simd_oprsz(desc);
 701    float16 *d = vd;
 702    float16 *n = vn;
 703    float16 *m = vm;
 704    float_status *fpst = vfpst;
 705    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
 706    uint32_t neg_imag = neg_real ^ 1;
 707    uintptr_t i;
 708
 709    /* Shift boolean to the sign bit so we can xor to negate.  */
 710    neg_real <<= 15;
 711    neg_imag <<= 15;
 712
 713    for (i = 0; i < opr_sz / 2; i += 2) {
 714        float16 e0 = n[H2(i)];
 715        float16 e1 = m[H2(i + 1)] ^ neg_imag;
 716        float16 e2 = n[H2(i + 1)];
 717        float16 e3 = m[H2(i)] ^ neg_real;
 718
 719        d[H2(i)] = float16_add(e0, e1, fpst);
 720        d[H2(i + 1)] = float16_add(e2, e3, fpst);
 721    }
 722    clear_tail(d, opr_sz, simd_maxsz(desc));
 723}
 724
 725void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
 726                         void *vfpst, uint32_t desc)
 727{
 728    uintptr_t opr_sz = simd_oprsz(desc);
 729    float32 *d = vd;
 730    float32 *n = vn;
 731    float32 *m = vm;
 732    float_status *fpst = vfpst;
 733    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
 734    uint32_t neg_imag = neg_real ^ 1;
 735    uintptr_t i;
 736
 737    /* Shift boolean to the sign bit so we can xor to negate.  */
 738    neg_real <<= 31;
 739    neg_imag <<= 31;
 740
 741    for (i = 0; i < opr_sz / 4; i += 2) {
 742        float32 e0 = n[H4(i)];
 743        float32 e1 = m[H4(i + 1)] ^ neg_imag;
 744        float32 e2 = n[H4(i + 1)];
 745        float32 e3 = m[H4(i)] ^ neg_real;
 746
 747        d[H4(i)] = float32_add(e0, e1, fpst);
 748        d[H4(i + 1)] = float32_add(e2, e3, fpst);
 749    }
 750    clear_tail(d, opr_sz, simd_maxsz(desc));
 751}
 752
 753void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
 754                         void *vfpst, uint32_t desc)
 755{
 756    uintptr_t opr_sz = simd_oprsz(desc);
 757    float64 *d = vd;
 758    float64 *n = vn;
 759    float64 *m = vm;
 760    float_status *fpst = vfpst;
 761    uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1);
 762    uint64_t neg_imag = neg_real ^ 1;
 763    uintptr_t i;
 764
 765    /* Shift boolean to the sign bit so we can xor to negate.  */
 766    neg_real <<= 63;
 767    neg_imag <<= 63;
 768
 769    for (i = 0; i < opr_sz / 8; i += 2) {
 770        float64 e0 = n[i];
 771        float64 e1 = m[i + 1] ^ neg_imag;
 772        float64 e2 = n[i + 1];
 773        float64 e3 = m[i] ^ neg_real;
 774
 775        d[i] = float64_add(e0, e1, fpst);
 776        d[i + 1] = float64_add(e2, e3, fpst);
 777    }
 778    clear_tail(d, opr_sz, simd_maxsz(desc));
 779}
 780
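/*
 * FCMLA: two desc bits encode the rotation.  'flip' selects which half
 * of each complex element of n feeds both products, while neg_real and
 * neg_imag negate the corresponding operands taken from m; together
 * these give the four 0/90/180/270 degree variants.
 */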
 781void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va,
 782                         void *vfpst, uint32_t desc)
 783{
 784    uintptr_t opr_sz = simd_oprsz(desc);
 785    float16 *d = vd, *n = vn, *m = vm, *a = va;
 786    float_status *fpst = vfpst;
 787    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
 788    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
 789    uint32_t neg_real = flip ^ neg_imag;
 790    uintptr_t i;
 791
 792    /* Shift boolean to the sign bit so we can xor to negate.  */
 793    neg_real <<= 15;
 794    neg_imag <<= 15;
 795
 796    for (i = 0; i < opr_sz / 2; i += 2) {
 797        float16 e2 = n[H2(i + flip)];
 798        float16 e1 = m[H2(i + flip)] ^ neg_real;
 799        float16 e4 = e2;
 800        float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag;
 801
 802        d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], 0, fpst);
 803        d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], 0, fpst);
 804    }
 805    clear_tail(d, opr_sz, simd_maxsz(desc));
 806}
 807
 808void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va,
 809                             void *vfpst, uint32_t desc)
 810{
 811    uintptr_t opr_sz = simd_oprsz(desc);
 812    float16 *d = vd, *n = vn, *m = vm, *a = va;
 813    float_status *fpst = vfpst;
 814    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
 815    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
 816    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
 817    uint32_t neg_real = flip ^ neg_imag;
 818    intptr_t elements = opr_sz / sizeof(float16);
 819    intptr_t eltspersegment = 16 / sizeof(float16);
 820    intptr_t i, j;
 821
 822    /* Shift boolean to the sign bit so we can xor to negate.  */
 823    neg_real <<= 15;
 824    neg_imag <<= 15;
 825
 826    for (i = 0; i < elements; i += eltspersegment) {
 827        float16 mr = m[H2(i + 2 * index + 0)];
 828        float16 mi = m[H2(i + 2 * index + 1)];
 829        float16 e1 = neg_real ^ (flip ? mi : mr);
 830        float16 e3 = neg_imag ^ (flip ? mr : mi);
 831
 832        for (j = i; j < i + eltspersegment; j += 2) {
 833            float16 e2 = n[H2(j + flip)];
 834            float16 e4 = e2;
 835
 836            d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst);
 837            d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst);
 838        }
 839    }
 840    clear_tail(d, opr_sz, simd_maxsz(desc));
 841}
 842
 843void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va,
 844                         void *vfpst, uint32_t desc)
 845{
 846    uintptr_t opr_sz = simd_oprsz(desc);
 847    float32 *d = vd, *n = vn, *m = vm, *a = va;
 848    float_status *fpst = vfpst;
 849    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
 850    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
 851    uint32_t neg_real = flip ^ neg_imag;
 852    uintptr_t i;
 853
 854    /* Shift boolean to the sign bit so we can xor to negate.  */
 855    neg_real <<= 31;
 856    neg_imag <<= 31;
 857
 858    for (i = 0; i < opr_sz / 4; i += 2) {
 859        float32 e2 = n[H4(i + flip)];
 860        float32 e1 = m[H4(i + flip)] ^ neg_real;
 861        float32 e4 = e2;
 862        float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag;
 863
 864        d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], 0, fpst);
 865        d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], 0, fpst);
 866    }
 867    clear_tail(d, opr_sz, simd_maxsz(desc));
 868}
 869
 870void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va,
 871                             void *vfpst, uint32_t desc)
 872{
 873    uintptr_t opr_sz = simd_oprsz(desc);
 874    float32 *d = vd, *n = vn, *m = vm, *a = va;
 875    float_status *fpst = vfpst;
 876    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
 877    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
 878    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
 879    uint32_t neg_real = flip ^ neg_imag;
 880    intptr_t elements = opr_sz / sizeof(float32);
 881    intptr_t eltspersegment = 16 / sizeof(float32);
 882    intptr_t i, j;
 883
 884    /* Shift boolean to the sign bit so we can xor to negate.  */
 885    neg_real <<= 31;
 886    neg_imag <<= 31;
 887
 888    for (i = 0; i < elements; i += eltspersegment) {
 889        float32 mr = m[H4(i + 2 * index + 0)];
 890        float32 mi = m[H4(i + 2 * index + 1)];
 891        float32 e1 = neg_real ^ (flip ? mi : mr);
 892        float32 e3 = neg_imag ^ (flip ? mr : mi);
 893
 894        for (j = i; j < i + eltspersegment; j += 2) {
 895            float32 e2 = n[H4(j + flip)];
 896            float32 e4 = e2;
 897
 898            d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst);
 899            d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst);
 900        }
 901    }
 902    clear_tail(d, opr_sz, simd_maxsz(desc));
 903}
 904
 905void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va,
 906                         void *vfpst, uint32_t desc)
 907{
 908    uintptr_t opr_sz = simd_oprsz(desc);
 909    float64 *d = vd, *n = vn, *m = vm, *a = va;
 910    float_status *fpst = vfpst;
 911    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
 912    uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
 913    uint64_t neg_real = flip ^ neg_imag;
 914    uintptr_t i;
 915
 916    /* Shift boolean to the sign bit so we can xor to negate.  */
 917    neg_real <<= 63;
 918    neg_imag <<= 63;
 919
 920    for (i = 0; i < opr_sz / 8; i += 2) {
 921        float64 e2 = n[i + flip];
 922        float64 e1 = m[i + flip] ^ neg_real;
 923        float64 e4 = e2;
 924        float64 e3 = m[i + 1 - flip] ^ neg_imag;
 925
 926        d[i] = float64_muladd(e2, e1, a[i], 0, fpst);
 927        d[i + 1] = float64_muladd(e4, e3, a[i + 1], 0, fpst);
 928    }
 929    clear_tail(d, opr_sz, simd_maxsz(desc));
 930}
 931
 932/*
 933 * Floating point comparisons producing an integer result (all 1s or all 0s).
 934 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
 935 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
 936 */
 937static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat)
 938{
 939    return -float16_eq_quiet(op1, op2, stat);
 940}
 941
 942static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat)
 943{
 944    return -float32_eq_quiet(op1, op2, stat);
 945}
 946
 947static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat)
 948{
 949    return -float16_le(op2, op1, stat);
 950}
 951
 952static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat)
 953{
 954    return -float32_le(op2, op1, stat);
 955}
 956
 957static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat)
 958{
 959    return -float16_lt(op2, op1, stat);
 960}
 961
 962static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat)
 963{
 964    return -float32_lt(op2, op1, stat);
 965}
 966
 967static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat)
 968{
 969    return -float16_le(float16_abs(op2), float16_abs(op1), stat);
 970}
 971
 972static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat)
 973{
 974    return -float32_le(float32_abs(op2), float32_abs(op1), stat);
 975}
 976
 977static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat)
 978{
 979    return -float16_lt(float16_abs(op2), float16_abs(op1), stat);
 980}
 981
 982static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat)
 983{
 984    return -float32_lt(float32_abs(op2), float32_abs(op1), stat);
 985}
 986
 987static int16_t vfp_tosszh(float16 x, void *fpstp)
 988{
 989    float_status *fpst = fpstp;
 990    if (float16_is_any_nan(x)) {
 991        float_raise(float_flag_invalid, fpst);
 992        return 0;
 993    }
 994    return float16_to_int16_round_to_zero(x, fpst);
 995}
 996
 997static uint16_t vfp_touszh(float16 x, void *fpstp)
 998{
 999    float_status *fpst = fpstp;
1000    if (float16_is_any_nan(x)) {
1001        float_raise(float_flag_invalid, fpst);
1002        return 0;
1003    }
1004    return float16_to_uint16_round_to_zero(x, fpst);
1005}
1006
1007#define DO_2OP(NAME, FUNC, TYPE) \
1008void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)  \
1009{                                                                 \
1010    intptr_t i, oprsz = simd_oprsz(desc);                         \
1011    TYPE *d = vd, *n = vn;                                        \
1012    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                  \
1013        d[i] = FUNC(n[i], stat);                                  \
1014    }                                                             \
1015    clear_tail(d, oprsz, simd_maxsz(desc));                       \
1016}
1017
1018DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
1019DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
1020DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)
1021
1022DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
1023DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
1024DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)
1025
1026DO_2OP(gvec_vrintx_h, float16_round_to_int, float16)
1027DO_2OP(gvec_vrintx_s, float32_round_to_int, float32)
1028
1029DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t)
1030DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t)
1031DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32)
1032DO_2OP(gvec_touizs, helper_vfp_touizs, float32)
1033DO_2OP(gvec_sstoh, int16_to_float16, int16_t)
1034DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t)
1035DO_2OP(gvec_tosszh, vfp_tosszh, float16)
1036DO_2OP(gvec_touszh, vfp_touszh, float16)
1037
1038#define WRAP_CMP0_FWD(FN, CMPOP, TYPE)                          \
1039    static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)     \
1040    {                                                           \
1041        return TYPE##_##CMPOP(op, TYPE##_zero, stat);           \
1042    }
1043
1044#define WRAP_CMP0_REV(FN, CMPOP, TYPE)                          \
 1045    static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)     \
1046    {                                                           \
1047        return TYPE##_##CMPOP(TYPE##_zero, op, stat);           \
1048    }
1049
1050#define DO_2OP_CMP0(FN, CMPOP, DIRN)                    \
1051    WRAP_CMP0_##DIRN(FN, CMPOP, float16)                \
1052    WRAP_CMP0_##DIRN(FN, CMPOP, float32)                \
1053    DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16)   \
1054    DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32)
1055
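/*
 * The REV direction builds the "less than / less or equal to zero"
 * forms by swapping operands; e.g. float32_clt0(op) expands to
 * float32_cgt(float32_zero, op, stat), i.e. -float32_lt(op,
 * float32_zero, stat).
 */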
1056DO_2OP_CMP0(cgt, cgt, FWD)
1057DO_2OP_CMP0(cge, cge, FWD)
1058DO_2OP_CMP0(ceq, ceq, FWD)
1059DO_2OP_CMP0(clt, cgt, REV)
1060DO_2OP_CMP0(cle, cge, REV)
1061
1062#undef DO_2OP
1063#undef DO_2OP_CMP0
1064
1065/* Floating-point trigonometric starting value.
1066 * See the ARM ARM pseudocode function FPTrigSMul.
1067 */
1068static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
1069{
1070    float16 result = float16_mul(op1, op1, stat);
1071    if (!float16_is_any_nan(result)) {
1072        result = float16_set_sign(result, op2 & 1);
1073    }
1074    return result;
1075}
1076
1077static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
1078{
1079    float32 result = float32_mul(op1, op1, stat);
1080    if (!float32_is_any_nan(result)) {
1081        result = float32_set_sign(result, op2 & 1);
1082    }
1083    return result;
1084}
1085
1086static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
1087{
1088    float64 result = float64_mul(op1, op1, stat);
1089    if (!float64_is_any_nan(result)) {
1090        result = float64_set_sign(result, op2 & 1);
1091    }
1092    return result;
1093}
1094
1095static float16 float16_abd(float16 op1, float16 op2, float_status *stat)
1096{
1097    return float16_abs(float16_sub(op1, op2, stat));
1098}
1099
1100static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
1101{
1102    return float32_abs(float32_sub(op1, op2, stat));
1103}
1104
1105/*
 1106 * Reciprocal step. These are the AArch32 versions, which use a
 1107 * non-fused multiply-and-subtract.
1108 */
1109static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat)
1110{
1111    op1 = float16_squash_input_denormal(op1, stat);
1112    op2 = float16_squash_input_denormal(op2, stat);
1113
1114    if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
1115        (float16_is_infinity(op2) && float16_is_zero(op1))) {
1116        return float16_two;
1117    }
1118    return float16_sub(float16_two, float16_mul(op1, op2, stat), stat);
1119}
1120
1121static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat)
1122{
1123    op1 = float32_squash_input_denormal(op1, stat);
1124    op2 = float32_squash_input_denormal(op2, stat);
1125
1126    if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
1127        (float32_is_infinity(op2) && float32_is_zero(op1))) {
1128        return float32_two;
1129    }
1130    return float32_sub(float32_two, float32_mul(op1, op2, stat), stat);
1131}
1132
1133/* Reciprocal square-root step. AArch32 non-fused semantics. */
1134static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat)
1135{
1136    op1 = float16_squash_input_denormal(op1, stat);
1137    op2 = float16_squash_input_denormal(op2, stat);
1138
1139    if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
1140        (float16_is_infinity(op2) && float16_is_zero(op1))) {
1141        return float16_one_point_five;
1142    }
1143    op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat);
1144    return float16_div(op1, float16_two, stat);
1145}
1146
1147static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat)
1148{
1149    op1 = float32_squash_input_denormal(op1, stat);
1150    op2 = float32_squash_input_denormal(op2, stat);
1151
1152    if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
1153        (float32_is_infinity(op2) && float32_is_zero(op1))) {
1154        return float32_one_point_five;
1155    }
1156    op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat);
1157    return float32_div(op1, float32_two, stat);
1158}
1159
1160#define DO_3OP(NAME, FUNC, TYPE) \
1161void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1162{                                                                          \
1163    intptr_t i, oprsz = simd_oprsz(desc);                                  \
1164    TYPE *d = vd, *n = vn, *m = vm;                                        \
1165    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
1166        d[i] = FUNC(n[i], m[i], stat);                                     \
1167    }                                                                      \
1168    clear_tail(d, oprsz, simd_maxsz(desc));                                \
1169}
1170
1171DO_3OP(gvec_fadd_h, float16_add, float16)
1172DO_3OP(gvec_fadd_s, float32_add, float32)
1173DO_3OP(gvec_fadd_d, float64_add, float64)
1174
1175DO_3OP(gvec_fsub_h, float16_sub, float16)
1176DO_3OP(gvec_fsub_s, float32_sub, float32)
1177DO_3OP(gvec_fsub_d, float64_sub, float64)
1178
1179DO_3OP(gvec_fmul_h, float16_mul, float16)
1180DO_3OP(gvec_fmul_s, float32_mul, float32)
1181DO_3OP(gvec_fmul_d, float64_mul, float64)
1182
1183DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
1184DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
1185DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)
1186
1187DO_3OP(gvec_fabd_h, float16_abd, float16)
1188DO_3OP(gvec_fabd_s, float32_abd, float32)
1189
1190DO_3OP(gvec_fceq_h, float16_ceq, float16)
1191DO_3OP(gvec_fceq_s, float32_ceq, float32)
1192
1193DO_3OP(gvec_fcge_h, float16_cge, float16)
1194DO_3OP(gvec_fcge_s, float32_cge, float32)
1195
1196DO_3OP(gvec_fcgt_h, float16_cgt, float16)
1197DO_3OP(gvec_fcgt_s, float32_cgt, float32)
1198
1199DO_3OP(gvec_facge_h, float16_acge, float16)
1200DO_3OP(gvec_facge_s, float32_acge, float32)
1201
1202DO_3OP(gvec_facgt_h, float16_acgt, float16)
1203DO_3OP(gvec_facgt_s, float32_acgt, float32)
1204
1205DO_3OP(gvec_fmax_h, float16_max, float16)
1206DO_3OP(gvec_fmax_s, float32_max, float32)
1207
1208DO_3OP(gvec_fmin_h, float16_min, float16)
1209DO_3OP(gvec_fmin_s, float32_min, float32)
1210
1211DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16)
1212DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32)
1213
1214DO_3OP(gvec_fminnum_h, float16_minnum, float16)
1215DO_3OP(gvec_fminnum_s, float32_minnum, float32)
1216
1217DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16)
1218DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32)
1219
1220DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16)
1221DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32)
1222
1223#ifdef TARGET_AARCH64
1224
1225DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
1226DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
1227DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)
1228
1229DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
1230DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
1231DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)
1232
1233#endif
1234#undef DO_3OP
1235
 1236/* Non-fused multiply-add (unlike float16_muladd etc., which are fused) */
1237static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2,
1238                                 float_status *stat)
1239{
1240    return float16_add(dest, float16_mul(op1, op2, stat), stat);
1241}
1242
1243static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2,
1244                                 float_status *stat)
1245{
1246    return float32_add(dest, float32_mul(op1, op2, stat), stat);
1247}
1248
1249static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2,
1250                                 float_status *stat)
1251{
1252    return float16_sub(dest, float16_mul(op1, op2, stat), stat);
1253}
1254
1255static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
1256                                 float_status *stat)
1257{
1258    return float32_sub(dest, float32_mul(op1, op2, stat), stat);
1259}
1260
1261/* Fused versions; these have the semantics Neon VFMA/VFMS want */
1262static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
1263                                float_status *stat)
1264{
1265    return float16_muladd(op1, op2, dest, 0, stat);
1266}
1267
1268static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
1269                                 float_status *stat)
1270{
1271    return float32_muladd(op1, op2, dest, 0, stat);
1272}
1273
1274static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
1275                                 float_status *stat)
1276{
1277    return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
1278}
1279
1280static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
1281                                 float_status *stat)
1282{
1283    return float32_muladd(float32_chs(op1), op2, dest, 0, stat);
1284}
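/*
 * Note that the fused VFMS forms negate the multiplicand op1 via
 * float*_chs() rather than negating the final product, as the Neon
 * VFMA/VFMS semantics mentioned above require.
 */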
1285
1286#define DO_MULADD(NAME, FUNC, TYPE)                                     \
1287void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1288{                                                                          \
1289    intptr_t i, oprsz = simd_oprsz(desc);                                  \
1290    TYPE *d = vd, *n = vn, *m = vm;                                        \
1291    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
1292        d[i] = FUNC(d[i], n[i], m[i], stat);                               \
1293    }                                                                      \
1294    clear_tail(d, oprsz, simd_maxsz(desc));                                \
1295}
1296
1297DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16)
1298DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32)
1299
1300DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16)
1301DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32)
1302
1303DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
1304DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)
1305
1306DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
1307DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
1308
1309/* For the indexed ops, SVE applies the index per 128-bit vector segment.
1310 * For AdvSIMD, there is of course only one such vector segment.
1311 */
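/*
 * For example, with TYPE = uint32_t and a 32-byte SVE vector, "segment"
 * below is 4, and mm = m[H(i + idx)] is re-fetched once per 16-byte
 * segment.
 */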
1312
1313#define DO_MUL_IDX(NAME, TYPE, H) \
1314void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1315{                                                                          \
1316    intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1317    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1318    intptr_t idx = simd_data(desc);                                        \
1319    TYPE *d = vd, *n = vn, *m = vm;                                        \
1320    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1321        TYPE mm = m[H(i + idx)];                                           \
1322        for (j = 0; j < segment; j++) {                                    \
1323            d[i + j] = n[i + j] * mm;                                      \
1324        }                                                                  \
1325    }                                                                      \
1326    clear_tail(d, oprsz, simd_maxsz(desc));                                \
1327}
1328
1329DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2)
1330DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4)
1331DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8)
1332
1333#undef DO_MUL_IDX
1334
1335#define DO_MLA_IDX(NAME, TYPE, OP, H) \
1336void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)   \
1337{                                                                          \
1338    intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1339    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1340    intptr_t idx = simd_data(desc);                                        \
1341    TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
1342    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1343        TYPE mm = m[H(i + idx)];                                           \
1344        for (j = 0; j < segment; j++) {                                    \
1345            d[i + j] = a[i + j] OP n[i + j] * mm;                          \
1346        }                                                                  \
1347    }                                                                      \
1348    clear_tail(d, oprsz, simd_maxsz(desc));                                \
1349}
1350
1351DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2)
1352DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4)
1353DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8)
1354
1355DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
1356DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
1357DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8)
1358
1359#undef DO_MLA_IDX
1360
1361#define DO_FMUL_IDX(NAME, ADD, TYPE, H)                                    \
1362void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1363{                                                                          \
1364    intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1365    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1366    intptr_t idx = simd_data(desc);                                        \
1367    TYPE *d = vd, *n = vn, *m = vm;                                        \
1368    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1369        TYPE mm = m[H(i + idx)];                                           \
1370        for (j = 0; j < segment; j++) {                                    \
1371            d[i + j] = TYPE##_##ADD(d[i + j],                              \
1372                                    TYPE##_mul(n[i + j], mm, stat), stat); \
1373        }                                                                  \
1374    }                                                                      \
1375    clear_tail(d, oprsz, simd_maxsz(desc));                                \
1376}
1377
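/*
 * For the plain FMUL-by-index helpers the ADD step is a no-op that
 * simply returns the product, so DO_FMUL_IDX serves both the multiply
 * and the non-fused multiply-accumulate forms.
 */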
1378#define float16_nop(N, M, S) (M)
1379#define float32_nop(N, M, S) (M)
1380#define float64_nop(N, M, S) (M)
1381
1382DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16, H2)
1383DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32, H4)
1384DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64, H8)
1385
1386/*
1387 * Non-fused multiply-accumulate operations, for Neon. NB that unlike
 1388 * the fused ops below, these accumulate both from and into Vd.
1389 */
1390DO_FMUL_IDX(gvec_fmla_nf_idx_h, add, float16, H2)
1391DO_FMUL_IDX(gvec_fmla_nf_idx_s, add, float32, H4)
1392DO_FMUL_IDX(gvec_fmls_nf_idx_h, sub, float16, H2)
1393DO_FMUL_IDX(gvec_fmls_nf_idx_s, sub, float32, H4)
1394
1395#undef float16_nop
1396#undef float32_nop
1397#undef float64_nop
1398#undef DO_FMUL_IDX
1399
1400#define DO_FMLA_IDX(NAME, TYPE, H)                                         \
1401void HELPER(NAME)(void *vd, void *vn, void *vm, void *va,                  \
1402                  void *stat, uint32_t desc)                               \
1403{                                                                          \
1404    intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1405    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1406    TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1);                    \
1407    intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1);                          \
1408    TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
1409    op1_neg <<= (8 * sizeof(TYPE) - 1);                                    \
1410    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1411        TYPE mm = m[H(i + idx)];                                           \
1412        for (j = 0; j < segment; j++) {                                    \
1413            d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg,                   \
1414                                     mm, a[i + j], 0, stat);               \
1415        }                                                                  \
1416    }                                                                      \
1417    clear_tail(d, oprsz, simd_maxsz(desc));                                \
1418}
1419
1420DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2)
1421DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
1422DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8)
1423
1424#undef DO_FMLA_IDX
1425
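/*
 * Saturating arithmetic: compute in a wider type, clamp to the range of
 * the narrow type, and record any clamping in the sticky QC flag.  For
 * example, gvec_uqadd_b applied to 200 + 100 yields 300 in the int
 * intermediate, which is clamped to UINT8_MAX (255) and sets vq[0] = 1.
 */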
1426#define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \
1427void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc)   \
1428{                                                                          \
1429    intptr_t i, oprsz = simd_oprsz(desc);                                  \
1430    TYPEN *d = vd, *n = vn; TYPEM *m = vm;                                 \
1431    bool q = false;                                                        \
1432    for (i = 0; i < oprsz / sizeof(TYPEN); i++) {                          \
1433        WTYPE dd = (WTYPE)n[i] OP m[i];                                    \
1434        if (dd < MIN) {                                                    \
1435            dd = MIN;                                                      \
1436            q = true;                                                      \
1437        } else if (dd > MAX) {                                             \
1438            dd = MAX;                                                      \
1439            q = true;                                                      \
1440        }                                                                  \
1441        d[i] = dd;                                                         \
1442    }                                                                      \
1443    if (q) {                                                               \
1444        uint32_t *qc = vq;                                                 \
1445        qc[0] = 1;                                                         \
1446    }                                                                      \
1447    clear_tail(d, oprsz, simd_maxsz(desc));                                \
1448}
1449
1450DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX)
1451DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX)
1452DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX)
1453
1454DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX)
1455DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX)
1456DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX)
1457
1458DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX)
1459DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX)
1460DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX)
1461
1462DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX)
1463DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX)
1464DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX)
1465
1466#undef DO_SAT
1467
1468void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn,
1469                          void *vm, uint32_t desc)
1470{
1471    intptr_t i, oprsz = simd_oprsz(desc);
1472    uint64_t *d = vd, *n = vn, *m = vm;
1473    bool q = false;
1474
1475    for (i = 0; i < oprsz / 8; i++) {
1476        uint64_t nn = n[i], mm = m[i], dd = nn + mm;
1477        if (dd < nn) {
1478            dd = UINT64_MAX;
1479            q = true;
1480        }
1481        d[i] = dd;
1482    }
1483    if (q) {
1484        uint32_t *qc = vq;
1485        qc[0] = 1;
1486    }
1487    clear_tail(d, oprsz, simd_maxsz(desc));
1488}
1489
1490void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn,
1491                          void *vm, uint32_t desc)
1492{
1493    intptr_t i, oprsz = simd_oprsz(desc);
1494    uint64_t *d = vd, *n = vn, *m = vm;
1495    bool q = false;
1496
1497    for (i = 0; i < oprsz / 8; i++) {
1498        uint64_t nn = n[i], mm = m[i], dd = nn - mm;
1499        if (nn < mm) {
1500            dd = 0;
1501            q = true;
1502        }
1503        d[i] = dd;
1504    }
1505    if (q) {
1506        uint32_t *qc = vq;
1507        qc[0] = 1;
1508    }
1509    clear_tail(d, oprsz, simd_maxsz(desc));
1510}
1511
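/*
 * The 64-bit signed forms have no wider type to compute in, so overflow
 * is detected from the sign bits: addition overflows iff both operands
 * have the same sign and the result's sign differs, i.e. exactly when
 * ((dd ^ nn) & ~(nn ^ mm)) has its sign bit set.  The saturated value
 * (nn >> 63) ^ ~INT64_MIN is INT64_MAX for nn >= 0 and INT64_MIN for
 * nn < 0.  Subtraction (below) overflows only when the operands' signs
 * differ, hence the (nn ^ mm) term instead.
 */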
1512void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn,
1513                          void *vm, uint32_t desc)
1514{
1515    intptr_t i, oprsz = simd_oprsz(desc);
1516    int64_t *d = vd, *n = vn, *m = vm;
1517    bool q = false;
1518
1519    for (i = 0; i < oprsz / 8; i++) {
1520        int64_t nn = n[i], mm = m[i], dd = nn + mm;
1521        if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) {
1522            dd = (nn >> 63) ^ ~INT64_MIN;
1523            q = true;
1524        }
1525        d[i] = dd;
1526    }
1527    if (q) {
1528        uint32_t *qc = vq;
1529        qc[0] = 1;
1530    }
1531    clear_tail(d, oprsz, simd_maxsz(desc));
1532}
1533
1534void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn,
1535                          void *vm, uint32_t desc)
1536{
1537    intptr_t i, oprsz = simd_oprsz(desc);
1538    int64_t *d = vd, *n = vn, *m = vm;
1539    bool q = false;
1540
1541    for (i = 0; i < oprsz / 8; i++) {
1542        int64_t nn = n[i], mm = m[i], dd = nn - mm;
1543        if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) {
1544            dd = (nn >> 63) ^ ~INT64_MIN;
1545            q = true;
1546        }
1547        d[i] = dd;
1548    }
1549    if (q) {
1550        uint32_t *qc = vq;
1551        qc[0] = 1;
1552    }
1553    clear_tail(d, oprsz, simd_maxsz(desc));
1554}
1555
1556
1557#define DO_SRA(NAME, TYPE)                              \
1558void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1559{                                                       \
1560    intptr_t i, oprsz = simd_oprsz(desc);               \
1561    int shift = simd_data(desc);                        \
1562    TYPE *d = vd, *n = vn;                              \
1563    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1564        d[i] += n[i] >> shift;                          \
1565    }                                                   \
1566    clear_tail(d, oprsz, simd_maxsz(desc));             \
1567}
1568
1569DO_SRA(gvec_ssra_b, int8_t)
1570DO_SRA(gvec_ssra_h, int16_t)
1571DO_SRA(gvec_ssra_s, int32_t)
1572DO_SRA(gvec_ssra_d, int64_t)
1573
1574DO_SRA(gvec_usra_b, uint8_t)
1575DO_SRA(gvec_usra_h, uint16_t)
1576DO_SRA(gvec_usra_s, uint32_t)
1577DO_SRA(gvec_usra_d, uint64_t)
1578
1579#undef DO_SRA
1580
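/*
 * Rounding shift right: shifting by (shift - 1) first and then adding
 * back the low bit of the intermediate is equivalent to
 * (n + (1 << (shift - 1))) >> shift, without the risk of overflowing
 * the intermediate sum.  For example, gvec_urshr_b with shift = 2 and
 * n = 7 gives tmp = 3 and d = (3 >> 1) + (3 & 1) = 2, i.e. 7/4 rounded
 * to nearest.
 */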
1581#define DO_RSHR(NAME, TYPE)                             \
1582void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1583{                                                       \
1584    intptr_t i, oprsz = simd_oprsz(desc);               \
1585    int shift = simd_data(desc);                        \
1586    TYPE *d = vd, *n = vn;                              \
1587    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1588        TYPE tmp = n[i] >> (shift - 1);                 \
1589        d[i] = (tmp >> 1) + (tmp & 1);                  \
1590    }                                                   \
1591    clear_tail(d, oprsz, simd_maxsz(desc));             \
1592}
1593
1594DO_RSHR(gvec_srshr_b, int8_t)
1595DO_RSHR(gvec_srshr_h, int16_t)
1596DO_RSHR(gvec_srshr_s, int32_t)
1597DO_RSHR(gvec_srshr_d, int64_t)
1598
1599DO_RSHR(gvec_urshr_b, uint8_t)
1600DO_RSHR(gvec_urshr_h, uint16_t)
1601DO_RSHR(gvec_urshr_s, uint32_t)
1602DO_RSHR(gvec_urshr_d, uint64_t)
1603
1604#undef DO_RSHR
1605
1606#define DO_RSRA(NAME, TYPE)                             \
1607void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1608{                                                       \
1609    intptr_t i, oprsz = simd_oprsz(desc);               \
1610    int shift = simd_data(desc);                        \
1611    TYPE *d = vd, *n = vn;                              \
1612    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1613        TYPE tmp = n[i] >> (shift - 1);                 \
1614        d[i] += (tmp >> 1) + (tmp & 1);                 \
1615    }                                                   \
1616    clear_tail(d, oprsz, simd_maxsz(desc));             \
1617}
1618
1619DO_RSRA(gvec_srsra_b, int8_t)
1620DO_RSRA(gvec_srsra_h, int16_t)
1621DO_RSRA(gvec_srsra_s, int32_t)
1622DO_RSRA(gvec_srsra_d, int64_t)
1623
1624DO_RSRA(gvec_ursra_b, uint8_t)
1625DO_RSRA(gvec_ursra_h, uint16_t)
1626DO_RSRA(gvec_ursra_s, uint32_t)
1627DO_RSRA(gvec_ursra_d, uint64_t)
1628
1629#undef DO_RSRA
1630
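/*
 * Shift-and-insert: SRI replaces the low (esize - shift) bits of D with
 * N >> shift, leaving the top 'shift' bits of D intact; SLI (below)
 * keeps the low 'shift' bits of D and inserts N into the bits above
 * them.  For example, gvec_sri_b with shift = 4, d = 0xab, n = 0x5c
 * gives deposit64(0xab, 0, 4, 0x05) = 0xa5.
 */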
1631#define DO_SRI(NAME, TYPE)                              \
1632void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1633{                                                       \
1634    intptr_t i, oprsz = simd_oprsz(desc);               \
1635    int shift = simd_data(desc);                        \
1636    TYPE *d = vd, *n = vn;                              \
1637    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1638        d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \
1639    }                                                   \
1640    clear_tail(d, oprsz, simd_maxsz(desc));             \
1641}
1642
1643DO_SRI(gvec_sri_b, uint8_t)
1644DO_SRI(gvec_sri_h, uint16_t)
1645DO_SRI(gvec_sri_s, uint32_t)
1646DO_SRI(gvec_sri_d, uint64_t)
1647
1648#undef DO_SRI
1649
1650#define DO_SLI(NAME, TYPE)                              \
1651void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1652{                                                       \
1653    intptr_t i, oprsz = simd_oprsz(desc);               \
1654    int shift = simd_data(desc);                        \
1655    TYPE *d = vd, *n = vn;                              \
1656    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1657        d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \
1658    }                                                   \
1659    clear_tail(d, oprsz, simd_maxsz(desc));             \
1660}
1661
1662DO_SLI(gvec_sli_b, uint8_t)
1663DO_SLI(gvec_sli_h, uint16_t)
1664DO_SLI(gvec_sli_s, uint32_t)
1665DO_SLI(gvec_sli_d, uint64_t)
1666
1667#undef DO_SLI
1668
1669/*
1670 * Convert float16 to float32, raising no exceptions and
1671 * preserving exceptional values, including SNaN.
1672 * This is effectively an unpack+repack operation.
1673 */
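/*
 * For example, f16 0x3c00 (1.0) maps to 0x3f800000, and with fz16 clear
 * the smallest f16 denormal 0x0001 (2^-24) renormalizes to 0x33800000:
 * shift = clz32(1) - 21 = 10, so the fraction becomes 0 and the
 * exponent 127 - 15 - 10 + 1 = 103.
 */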
1674static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
1675{
1676    const int f16_bias = 15;
1677    const int f32_bias = 127;
1678    uint32_t sign = extract32(f16, 15, 1);
1679    uint32_t exp = extract32(f16, 10, 5);
1680    uint32_t frac = extract32(f16, 0, 10);
1681
1682    if (exp == 0x1f) {
1683        /* Inf or NaN */
1684        exp = 0xff;
1685    } else if (exp == 0) {
1686        /* Zero or denormal.  */
1687        if (frac != 0) {
1688            if (fz16) {
1689                frac = 0;
1690            } else {
1691                /*
1692                 * Denormal; these are all normal float32.
1693                 * Shift the fraction so that the msb is at bit 11,
1694                 * then remove bit 11 as the implicit bit of the
1695                 * normalized float32.  Note that we still go through
1696                 * the shift for normal numbers below, to put the
1697                 * float32 fraction at the right place.
1698                 */
1699                int shift = clz32(frac) - 21;
1700                frac = (frac << shift) & 0x3ff;
1701                exp = f32_bias - f16_bias - shift + 1;
1702            }
1703        }
1704    } else {
1705        /* Normal number; adjust the bias.  */
1706        exp += f32_bias - f16_bias;
1707    }
1708    sign <<= 31;
1709    exp <<= 23;
1710    frac <<= 23 - 10;
1711
1712    return sign | exp | frac;
1713}
1714
1715static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
1716{
1717    /*
1718     * Branchless load of u32[0], u64[0], u32[1], or u64[1].
1719     * Load the 2nd qword iff is_q & is_2.
1720     * Shift to the 2nd dword iff !is_q & is_2.
1721     * For !is_q & !is_2, the upper bits of the result are garbage.
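     *
     * Concretely: (is_q, is_2) = (0, 0) -> ptr[0] (low 32 bits valid),
     * (0, 1) -> ptr[0] >> 32, (1, 0) -> ptr[0], (1, 1) -> ptr[1].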
1722     */
1723    return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5);
1724}
1725
1726/*
1727 * Note that FMLAL requires oprsz == 8 or oprsz == 16,
1728 * as there are not yet SVE versions that might use blocking.
1729 */
1730
1731static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
1732                     uint32_t desc, bool fz16)
1733{
1734    intptr_t i, oprsz = simd_oprsz(desc);
1735    int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
1736    int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1737    int is_q = oprsz == 16;
1738    uint64_t n_4, m_4;
1739
1740    /* Pre-load all of the f16 data, avoiding overlap issues.  */
1741    n_4 = load4_f16(vn, is_q, is_2);
1742    m_4 = load4_f16(vm, is_q, is_2);
1743
1744    /* Negate all inputs for FMLSL at once.  */
1745    if (is_s) {
1746        n_4 ^= 0x8000800080008000ull;
1747    }
1748
1749    for (i = 0; i < oprsz / 4; i++) {
1750        float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
1751        float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
1752        d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
1753    }
1754    clear_tail(d, oprsz, simd_maxsz(desc));
1755}
1756
1757void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
1758                            void *venv, uint32_t desc)
1759{
1760    CPUARMState *env = venv;
1761    do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc,
1762             get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1763}
1764
1765void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
1766                            void *venv, uint32_t desc)
1767{
1768    CPUARMState *env = venv;
1769    do_fmlal(vd, vn, vm, &env->vfp.fp_status, desc,
1770             get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1771}
1772
1773void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va,
1774                               void *venv, uint32_t desc)
1775{
1776    intptr_t i, oprsz = simd_oprsz(desc);
1777    uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
1778    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
1779    CPUARMState *env = venv;
1780    float_status *status = &env->vfp.fp_status;
1781    bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);
1782
1783    for (i = 0; i < oprsz; i += sizeof(float32)) {
1784        float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negn;
1785        float16 mm_16 = *(float16 *)(vm + H1_2(i + sel));
1786        float32 nn = float16_to_float32_by_bits(nn_16, fz16);
1787        float32 mm = float16_to_float32_by_bits(mm_16, fz16);
1788        float32 aa = *(float32 *)(va + H1_4(i));
1789
1790        *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status);
1791    }
1792}
1793
1794static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst,
1795                         uint32_t desc, bool fz16)
1796{
1797    intptr_t i, oprsz = simd_oprsz(desc);
1798    int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
1799    int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1800    int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3);
1801    int is_q = oprsz == 16;
1802    uint64_t n_4;
1803    float32 m_1;
1804
1805    /* Pre-load all of the f16 data, avoiding overlap issues.  */
1806    n_4 = load4_f16(vn, is_q, is_2);
1807
1808    /* Negate all inputs for FMLSL at once.  */
1809    if (is_s) {
1810        n_4 ^= 0x8000800080008000ull;
1811    }
1812
1813    m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16);
1814
1815    for (i = 0; i < oprsz / 4; i++) {
1816        float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
1817        d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
1818    }
1819    clear_tail(d, oprsz, simd_maxsz(desc));
1820}
1821
1822void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm,
1823                                void *venv, uint32_t desc)
1824{
1825    CPUARMState *env = venv;
1826    do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc,
1827                 get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1828}
1829
1830void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm,
1831                                void *venv, uint32_t desc)
1832{
1833    CPUARMState *env = venv;
1834    do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status, desc,
1835                 get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1836}
1837
1838void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va,
1839                               void *venv, uint32_t desc)
1840{
1841    intptr_t i, j, oprsz = simd_oprsz(desc);
1842    uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
1843    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
1844    intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16);
1845    CPUARMState *env = venv;
1846    float_status *status = &env->vfp.fp_status;
1847    bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);
1848
1849    for (i = 0; i < oprsz; i += 16) {
1850        float16 mm_16 = *(float16 *)(vm + i + idx);
1851        float32 mm = float16_to_float32_by_bits(mm_16, fz16);
1852
1853        for (j = 0; j < 16; j += sizeof(float32)) {
1854            float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negn;
1855            float32 nn = float16_to_float32_by_bits(nn_16, fz16);
1856            float32 aa = *(float32 *)(va + H1_4(i + j));
1857
1858            *(float32 *)(vd + H1_4(i + j)) =
1859                float32_muladd(nn, mm, aa, 0, status);
1860        }
1861    }
1862}
1863
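/*
 * Vector shift by signed vector: a non-negative count shifts left and a
 * negative count shifts right by -count.  Counts of esize or more yield
 * 0 for left and logical right shifts, while arithmetic right shifts
 * clamp the count to esize - 1, leaving only copies of the sign bit.
 */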
1864void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc)
1865{
1866    intptr_t i, opr_sz = simd_oprsz(desc);
1867    int8_t *d = vd, *n = vn, *m = vm;
1868
1869    for (i = 0; i < opr_sz; ++i) {
1870        int8_t mm = m[i];
1871        int8_t nn = n[i];
1872        int8_t res = 0;
1873        if (mm >= 0) {
1874            if (mm < 8) {
1875                res = nn << mm;
1876            }
1877        } else {
1878            res = nn >> (mm > -8 ? -mm : 7);
1879        }
1880        d[i] = res;
1881    }
1882    clear_tail(d, opr_sz, simd_maxsz(desc));
1883}
1884
1885void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc)
1886{
1887    intptr_t i, opr_sz = simd_oprsz(desc);
1888    int16_t *d = vd, *n = vn, *m = vm;
1889
1890    for (i = 0; i < opr_sz / 2; ++i) {
1891        int8_t mm = m[i];   /* only 8 bits of shift are significant */
1892        int16_t nn = n[i];
1893        int16_t res = 0;
1894        if (mm >= 0) {
1895            if (mm < 16) {
1896                res = nn << mm;
1897            }
1898        } else {
1899            res = nn >> (mm > -16 ? -mm : 15);
1900        }
1901        d[i] = res;
1902    }
1903    clear_tail(d, opr_sz, simd_maxsz(desc));
1904}
1905
1906void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
1907{
1908    intptr_t i, opr_sz = simd_oprsz(desc);
1909    uint8_t *d = vd, *n = vn, *m = vm;
1910
1911    for (i = 0; i < opr_sz; ++i) {
1912        int8_t mm = m[i];
1913        uint8_t nn = n[i];
1914        uint8_t res = 0;
1915        if (mm >= 0) {
1916            if (mm < 8) {
1917                res = nn << mm;
1918            }
1919        } else {
1920            if (mm > -8) {
1921                res = nn >> -mm;
1922            }
1923        }
1924        d[i] = res;
1925    }
1926    clear_tail(d, opr_sz, simd_maxsz(desc));
1927}
1928
1929void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
1930{
1931    intptr_t i, opr_sz = simd_oprsz(desc);
1932    uint16_t *d = vd, *n = vn, *m = vm;
1933
1934    for (i = 0; i < opr_sz / 2; ++i) {
1935        int8_t mm = m[i];   /* only 8 bits of shift are significant */
1936        uint16_t nn = n[i];
1937        uint16_t res = 0;
1938        if (mm >= 0) {
1939            if (mm < 16) {
1940                res = nn << mm;
1941            }
1942        } else {
1943            if (mm > -16) {
1944                res = nn >> -mm;
1945            }
1946        }
1947        d[i] = res;
1948    }
1949    clear_tail(d, opr_sz, simd_maxsz(desc));
1950}
1951
1952/*
1953 * 8x8->8 polynomial multiply.
1954 *
1955 * Polynomial multiplication is like integer multiplication except the
1956 * partial products are XORed, not added.
1957 *
1958 * TODO: expose this as a generic vector operation, as it is a common
1959 * crypto building block.
1960 */
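/*
 * For example, 0x06 * 0x03 over GF(2), i.e. (x^2 + x)(x + 1), is
 * 0x06 ^ 0x0c = 0x0a: the two x^2 partial products cancel under XOR.
 */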
1961void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
1962{
1963    intptr_t i, j, opr_sz = simd_oprsz(desc);
1964    uint64_t *d = vd, *n = vn, *m = vm;
1965
1966    for (i = 0; i < opr_sz / 8; ++i) {
1967        uint64_t nn = n[i];
1968        uint64_t mm = m[i];
1969        uint64_t rr = 0;
1970
1971        for (j = 0; j < 8; ++j) {
1972            uint64_t mask = (nn & 0x0101010101010101ull) * 0xff;
1973            rr ^= mm & mask;
1974            mm = (mm << 1) & 0xfefefefefefefefeull;
1975            nn >>= 1;
1976        }
1977        d[i] = rr;
1978    }
1979    clear_tail(d, opr_sz, simd_maxsz(desc));
1980}
1981
1982/*
1983 * 64x64->128 polynomial multiply.
1984 * Because the lanes are not accessed in strict columns,
1985 * this probably cannot be turned into a generic helper.
1986 */
1987void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
1988{
1989    intptr_t i, j, opr_sz = simd_oprsz(desc);
1990    intptr_t hi = simd_data(desc);
1991    uint64_t *d = vd, *n = vn, *m = vm;
1992
1993    for (i = 0; i < opr_sz / 8; i += 2) {
1994        uint64_t nn = n[i + hi];
1995        uint64_t mm = m[i + hi];
1996        uint64_t rhi = 0;
1997        uint64_t rlo = 0;
1998
1999        /* Bit 0 can only influence the low 64-bit result.  */
2000        if (nn & 1) {
2001            rlo = mm;
2002        }
2003
2004        for (j = 1; j < 64; ++j) {
2005            uint64_t mask = -((nn >> j) & 1);
2006            rlo ^= (mm << j) & mask;
2007            rhi ^= (mm >> (64 - j)) & mask;
2008        }
2009        d[i] = rlo;
2010        d[i + 1] = rhi;
2011    }
2012    clear_tail(d, opr_sz, simd_maxsz(desc));
2013}
2014
2015/*
2016 * 8x8->16 polynomial multiply.
2017 *
2018 * The byte inputs are expanded to (or extracted from) half-words.
2019 * Note that neon and sve2 get the inputs from different positions.
2020 * This allows 4 bytes to be processed in parallel with uint64_t.
2021 */
2022
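/*
 * expand_byte_to_half spreads the four bytes of the low word into
 * 16-bit lanes, e.g. 0x04030201 -> 0x0004000300020001, so that pmull_h
 * can compute four independent 8x8->16 products per 64-bit word.
 */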
2023static uint64_t expand_byte_to_half(uint64_t x)
2024{
2025    return  (x & 0x000000ff)
2026         | ((x & 0x0000ff00) << 8)
2027         | ((x & 0x00ff0000) << 16)
2028         | ((x & 0xff000000) << 24);
2029}
2030
2031uint64_t pmull_w(uint64_t op1, uint64_t op2)
2032{
2033    uint64_t result = 0;
2034    int i;
2035    for (i = 0; i < 16; ++i) {
2036        uint64_t mask = (op1 & 0x0000000100000001ull) * 0xffffffff;
2037        result ^= op2 & mask;
2038        op1 >>= 1;
2039        op2 <<= 1;
2040    }
2041    return result;
2042}
2043
2044uint64_t pmull_h(uint64_t op1, uint64_t op2)
2045{
2046    uint64_t result = 0;
2047    int i;
2048    for (i = 0; i < 8; ++i) {
2049        uint64_t mask = (op1 & 0x0001000100010001ull) * 0xffff;
2050        result ^= op2 & mask;
2051        op1 >>= 1;
2052        op2 <<= 1;
2053    }
2054    return result;
2055}
2056
2057void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2058{
2059    int hi = simd_data(desc);
2060    uint64_t *d = vd, *n = vn, *m = vm;
2061    uint64_t nn = n[hi], mm = m[hi];
2062
2063    d[0] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));
2064    nn >>= 32;
2065    mm >>= 32;
2066    d[1] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));
2067
2068    clear_tail(d, 16, simd_maxsz(desc));
2069}
2070
2071#ifdef TARGET_AARCH64
2072void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2073{
2074    int shift = simd_data(desc) * 8;
2075    intptr_t i, opr_sz = simd_oprsz(desc);
2076    uint64_t *d = vd, *n = vn, *m = vm;
2077
2078    for (i = 0; i < opr_sz / 8; ++i) {
2079        uint64_t nn = (n[i] >> shift) & 0x00ff00ff00ff00ffull;
2080        uint64_t mm = (m[i] >> shift) & 0x00ff00ff00ff00ffull;
2081
2082        d[i] = pmull_h(nn, mm);
2083    }
2084}
2085
2086static uint64_t pmull_d(uint64_t op1, uint64_t op2)
2087{
2088    uint64_t result = 0;
2089    int i;
2090
2091    for (i = 0; i < 32; ++i) {
2092        uint64_t mask = -((op1 >> i) & 1);
2093        result ^= (op2 << i) & mask;
2094    }
2095    return result;
2096}
2097
2098void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc)
2099{
2100    intptr_t sel = H4(simd_data(desc));
2101    intptr_t i, opr_sz = simd_oprsz(desc);
2102    uint32_t *n = vn, *m = vm;
2103    uint64_t *d = vd;
2104
2105    for (i = 0; i < opr_sz / 8; ++i) {
2106        d[i] = pmull_d(n[2 * i + sel], m[2 * i + sel]);
2107    }
2108}
2109#endif
2110
2111#define DO_CMP0(NAME, TYPE, OP)                         \
2112void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
2113{                                                       \
2114    intptr_t i, opr_sz = simd_oprsz(desc);              \
2115    for (i = 0; i < opr_sz; i += sizeof(TYPE)) {        \
2116        TYPE nn = *(TYPE *)(vn + i);                    \
2117        *(TYPE *)(vd + i) = -(nn OP 0);                 \
2118    }                                                   \
2119    clear_tail(vd, opr_sz, simd_maxsz(desc));           \
2120}
2121
2122DO_CMP0(gvec_ceq0_b, int8_t, ==)
2123DO_CMP0(gvec_clt0_b, int8_t, <)
2124DO_CMP0(gvec_cle0_b, int8_t, <=)
2125DO_CMP0(gvec_cgt0_b, int8_t, >)
2126DO_CMP0(gvec_cge0_b, int8_t, >=)
2127
2128DO_CMP0(gvec_ceq0_h, int16_t, ==)
2129DO_CMP0(gvec_clt0_h, int16_t, <)
2130DO_CMP0(gvec_cle0_h, int16_t, <=)
2131DO_CMP0(gvec_cgt0_h, int16_t, >)
2132DO_CMP0(gvec_cge0_h, int16_t, >=)
2133
2134#undef DO_CMP0
2135
2136#define DO_ABD(NAME, TYPE)                                      \
2137void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2138{                                                               \
2139    intptr_t i, opr_sz = simd_oprsz(desc);                      \
2140    TYPE *d = vd, *n = vn, *m = vm;                             \
2141                                                                \
2142    for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2143        d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];         \
2144    }                                                           \
2145    clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2146}
2147
2148DO_ABD(gvec_sabd_b, int8_t)
2149DO_ABD(gvec_sabd_h, int16_t)
2150DO_ABD(gvec_sabd_s, int32_t)
2151DO_ABD(gvec_sabd_d, int64_t)
2152
2153DO_ABD(gvec_uabd_b, uint8_t)
2154DO_ABD(gvec_uabd_h, uint16_t)
2155DO_ABD(gvec_uabd_s, uint32_t)
2156DO_ABD(gvec_uabd_d, uint64_t)
2157
2158#undef DO_ABD
2159
2160#define DO_ABA(NAME, TYPE)                                      \
2161void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2162{                                                               \
2163    intptr_t i, opr_sz = simd_oprsz(desc);                      \
2164    TYPE *d = vd, *n = vn, *m = vm;                             \
2165                                                                \
2166    for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2167        d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];        \
2168    }                                                           \
2169    clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2170}
2171
2172DO_ABA(gvec_saba_b, int8_t)
2173DO_ABA(gvec_saba_h, int16_t)
2174DO_ABA(gvec_saba_s, int32_t)
2175DO_ABA(gvec_saba_d, int64_t)
2176
2177DO_ABA(gvec_uaba_b, uint8_t)
2178DO_ABA(gvec_uaba_h, uint16_t)
2179DO_ABA(gvec_uaba_s, uint32_t)
2180DO_ABA(gvec_uaba_d, uint64_t)
2181
2182#undef DO_ABA
2183
2184#define DO_NEON_PAIRWISE(NAME, OP)                                      \
2185    void HELPER(NAME##s)(void *vd, void *vn, void *vm,                  \
2186                         void *stat, uint32_t oprsz)                    \
2187    {                                                                   \
2188        float_status *fpst = stat;                                      \
2189        float32 *d = vd;                                                \
2190        float32 *n = vn;                                                \
2191        float32 *m = vm;                                                \
2192        float32 r0, r1;                                                 \
2193                                                                        \
2194        /* Read all inputs before writing outputs in case vm == vd */   \
2195        r0 = float32_##OP(n[H4(0)], n[H4(1)], fpst);                    \
2196        r1 = float32_##OP(m[H4(0)], m[H4(1)], fpst);                    \
2197                                                                        \
2198        d[H4(0)] = r0;                                                  \
2199        d[H4(1)] = r1;                                                  \
2200    }                                                                   \
2201                                                                        \
2202    void HELPER(NAME##h)(void *vd, void *vn, void *vm,                  \
2203                         void *stat, uint32_t oprsz)                    \
2204    {                                                                   \
2205        float_status *fpst = stat;                                      \
2206        float16 *d = vd;                                                \
2207        float16 *n = vn;                                                \
2208        float16 *m = vm;                                                \
2209        float16 r0, r1, r2, r3;                                         \
2210                                                                        \
2211        /* Read all inputs before writing outputs in case vm == vd */   \
2212        r0 = float16_##OP(n[H2(0)], n[H2(1)], fpst);                    \
2213        r1 = float16_##OP(n[H2(2)], n[H2(3)], fpst);                    \
2214        r2 = float16_##OP(m[H2(0)], m[H2(1)], fpst);                    \
2215        r3 = float16_##OP(m[H2(2)], m[H2(3)], fpst);                    \
2216                                                                        \
2217        d[H2(0)] = r0;                                                  \
2218        d[H2(1)] = r1;                                                  \
2219        d[H2(2)] = r2;                                                  \
2220        d[H2(3)] = r3;                                                  \
2221    }
2222
2223DO_NEON_PAIRWISE(neon_padd, add)
2224DO_NEON_PAIRWISE(neon_pmax, max)
2225DO_NEON_PAIRWISE(neon_pmin, min)
2226
2227#undef DO_NEON_PAIRWISE
2228
2229#define DO_VCVT_FIXED(NAME, FUNC, TYPE)                                 \
2230    void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
2231    {                                                                   \
2232        intptr_t i, oprsz = simd_oprsz(desc);                           \
2233        int shift = simd_data(desc);                                    \
2234        TYPE *d = vd, *n = vn;                                          \
2235        float_status *fpst = stat;                                      \
2236        for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2237            d[i] = FUNC(n[i], shift, fpst);                             \
2238        }                                                               \
2239        clear_tail(d, oprsz, simd_maxsz(desc));                         \
2240    }
2241
2242DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t)
2243DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t)
2244DO_VCVT_FIXED(gvec_vcvt_fs, helper_vfp_tosls_round_to_zero, uint32_t)
2245DO_VCVT_FIXED(gvec_vcvt_fu, helper_vfp_touls_round_to_zero, uint32_t)
2246DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t)
2247DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t)
2248DO_VCVT_FIXED(gvec_vcvt_hs, helper_vfp_toshh_round_to_zero, uint16_t)
2249DO_VCVT_FIXED(gvec_vcvt_hu, helper_vfp_touhh_round_to_zero, uint16_t)
2250
2251#undef DO_VCVT_FIXED
2252
2253#define DO_VCVT_RMODE(NAME, FUNC, TYPE)                                 \
2254    void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
2255    {                                                                   \
2256        float_status *fpst = stat;                                      \
2257        intptr_t i, oprsz = simd_oprsz(desc);                           \
2258        uint32_t rmode = simd_data(desc);                               \
2259        uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
2260        TYPE *d = vd, *n = vn;                                          \
2261        set_float_rounding_mode(rmode, fpst);                           \
2262        for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2263            d[i] = FUNC(n[i], 0, fpst);                                 \
2264        }                                                               \
2265        set_float_rounding_mode(prev_rmode, fpst);                      \
2266        clear_tail(d, oprsz, simd_maxsz(desc));                         \
2267    }
2268
2269DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t)
2270DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t)
2271DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t)
2272DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t)
2273
2274#undef DO_VCVT_RMODE
2275
2276#define DO_VRINT_RMODE(NAME, FUNC, TYPE)                                \
2277    void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
2278    {                                                                   \
2279        float_status *fpst = stat;                                      \
2280        intptr_t i, oprsz = simd_oprsz(desc);                           \
2281        uint32_t rmode = simd_data(desc);                               \
2282        uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
2283        TYPE *d = vd, *n = vn;                                          \
2284        set_float_rounding_mode(rmode, fpst);                           \
2285        for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2286            d[i] = FUNC(n[i], fpst);                                    \
2287        }                                                               \
2288        set_float_rounding_mode(prev_rmode, fpst);                      \
2289        clear_tail(d, oprsz, simd_maxsz(desc));                         \
2290    }
2291
2292DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t)
2293DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t)
2294
2295#undef DO_VRINT_RMODE
2296
2297#ifdef TARGET_AARCH64
2298void HELPER(simd_tblx)(void *vd, void *vm, void *venv, uint32_t desc)
2299{
2300    const uint8_t *indices = vm;
2301    CPUARMState *env = venv;
2302    size_t oprsz = simd_oprsz(desc);
2303    uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5);
2304    bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1);
2305    uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6);
2306    union {
2307        uint8_t b[16];
2308        uint64_t d[2];
2309    } result;
2310
2311    /*
2312     * We must construct the final result in a temp, lest the output
2313     * overlap the input table.  For TBL, begin with zero; for TBX,
2314     * begin with the original register contents.  Note that we always
2315     * copy 16 bytes here to avoid an extra branch; clearing the high
2316     * bits of the register for oprsz == 8 is handled below.
2317     */
2318    if (is_tbx) {
2319        memcpy(&result, vd, 16);
2320    } else {
2321        memset(&result, 0, 16);
2322    }
2323
2324    for (size_t i = 0; i < oprsz; ++i) {
2325        uint32_t index = indices[H1(i)];
2326
2327        if (index < table_len) {
2328            /*
2329             * Convert index (a byte offset into the virtual table
2330             * which is a series of 128-bit vectors concatenated)
2331             * into the correct register element, bearing in mind
2332             * that the table can wrap around from V31 to V0.
2333             */
2334            const uint8_t *table = (const uint8_t *)
2335                aa64_vfp_qreg(env, (rn + (index >> 4)) % 32);
2336            result.b[H1(i)] = table[H1(index % 16)];
2337        }
2338    }
2339
2340    memcpy(vd, &result, 16);
2341    clear_tail(vd, oprsz, simd_maxsz(desc));
2342}
2343#endif
2344
2345/*
2346 * NxN -> N highpart multiply
2347 *
2348 * TODO: expose this as a generic vector operation.
2349 */
2350
2351void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2352{
2353    intptr_t i, opr_sz = simd_oprsz(desc);
2354    int8_t *d = vd, *n = vn, *m = vm;
2355
2356    for (i = 0; i < opr_sz; ++i) {
2357        d[i] = ((int32_t)n[i] * m[i]) >> 8;
2358    }
2359    clear_tail(d, opr_sz, simd_maxsz(desc));
2360}
2361
2362void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2363{
2364    intptr_t i, opr_sz = simd_oprsz(desc);
2365    int16_t *d = vd, *n = vn, *m = vm;
2366
2367    for (i = 0; i < opr_sz / 2; ++i) {
2368        d[i] = ((int32_t)n[i] * m[i]) >> 16;
2369    }
2370    clear_tail(d, opr_sz, simd_maxsz(desc));
2371}
2372
2373void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2374{
2375    intptr_t i, opr_sz = simd_oprsz(desc);
2376    int32_t *d = vd, *n = vn, *m = vm;
2377
2378    for (i = 0; i < opr_sz / 4; ++i) {
2379        d[i] = ((int64_t)n[i] * m[i]) >> 32;
2380    }
2381    clear_tail(d, opr_sz, simd_maxsz(desc));
2382}
2383
2384void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2385{
2386    intptr_t i, opr_sz = simd_oprsz(desc);
2387    uint64_t *d = vd, *n = vn, *m = vm;
2388    uint64_t discard;
2389
2390    for (i = 0; i < opr_sz / 8; ++i) {
2391        muls64(&discard, &d[i], n[i], m[i]);
2392    }
2393    clear_tail(d, opr_sz, simd_maxsz(desc));
2394}
2395
2396void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2397{
2398    intptr_t i, opr_sz = simd_oprsz(desc);
2399    uint8_t *d = vd, *n = vn, *m = vm;
2400
2401    for (i = 0; i < opr_sz; ++i) {
2402        d[i] = ((uint32_t)n[i] * m[i]) >> 8;
2403    }
2404    clear_tail(d, opr_sz, simd_maxsz(desc));
2405}
2406
2407void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2408{
2409    intptr_t i, opr_sz = simd_oprsz(desc);
2410    uint16_t *d = vd, *n = vn, *m = vm;
2411
2412    for (i = 0; i < opr_sz / 2; ++i) {
2413        d[i] = ((uint32_t)n[i] * m[i]) >> 16;
2414    }
2415    clear_tail(d, opr_sz, simd_maxsz(desc));
2416}
2417
2418void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2419{
2420    intptr_t i, opr_sz = simd_oprsz(desc);
2421    uint32_t *d = vd, *n = vn, *m = vm;
2422
2423    for (i = 0; i < opr_sz / 4; ++i) {
2424        d[i] = ((uint64_t)n[i] * m[i]) >> 32;
2425    }
2426    clear_tail(d, opr_sz, simd_maxsz(desc));
2427}
2428
2429void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2430{
2431    intptr_t i, opr_sz = simd_oprsz(desc);
2432    uint64_t *d = vd, *n = vn, *m = vm;
2433    uint64_t discard;
2434
2435    for (i = 0; i < opr_sz / 8; ++i) {
2436        mulu64(&discard, &d[i], n[i], m[i]);
2437    }
2438    clear_tail(d, opr_sz, simd_maxsz(desc));
2439}
2440
2441void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc)
2442{
2443    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2444    int shr = simd_data(desc);
2445    uint64_t *d = vd, *n = vn, *m = vm;
2446
2447    for (i = 0; i < opr_sz; ++i) {
2448        d[i] = ror64(n[i] ^ m[i], shr);
2449    }
2450    clear_tail(d, opr_sz * 8, simd_maxsz(desc));
2451}
2452
2453/*
2454 * Integer matrix-multiply accumulate
2455 */
2456
2457static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm)
2458{
2459    int8_t *n = vn, *m = vm;
2460
2461    for (intptr_t k = 0; k < 8; ++k) {
2462        sum += n[H1(k)] * m[H1(k)];
2463    }
2464    return sum;
2465}
2466
2467static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm)
2468{
2469    uint8_t *n = vn, *m = vm;
2470
2471    for (intptr_t k = 0; k < 8; ++k) {
2472        sum += n[H1(k)] * m[H1(k)];
2473    }
2474    return sum;
2475}
2476
2477static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm)
2478{
2479    uint8_t *n = vn;
2480    int8_t *m = vm;
2481
2482    for (intptr_t k = 0; k < 8; ++k) {
2483        sum += n[H1(k)] * m[H1(k)];
2484    }
2485    return sum;
2486}
2487
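/*
 * Each 128-bit segment holds a 2x2 matrix of int32 results:
 * result[i][j] = a[i][j] + dot(row i of N, column j of M), where each
 * row or column is 8 bytes wide.  Hence the four sums below combine the
 * vn/vm byte offsets 0 and 8 within the segment.
 */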
2488static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc,
2489                      uint32_t (*inner_loop)(uint32_t, void *, void *))
2490{
2491    intptr_t seg, opr_sz = simd_oprsz(desc);
2492
2493    for (seg = 0; seg < opr_sz; seg += 16) {
2494        uint32_t *d = vd + seg;
2495        uint32_t *a = va + seg;
2496        uint32_t sum0, sum1, sum2, sum3;
2497
2498        /*
2499         * Process the entire segment at once, writing back the
2500         * results only after we've consumed all of the inputs.
2501         *
2502         * Key to indices by column:
2503         *          i   j                  i             j
2504         */
2505        sum0 = a[H4(0 + 0)];
2506        sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0);
2507        sum1 = a[H4(0 + 1)];
2508        sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8);
2509        sum2 = a[H4(2 + 0)];
2510        sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0);
2511        sum3 = a[H4(2 + 1)];
2512        sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8);
2513
2514        d[H4(0)] = sum0;
2515        d[H4(1)] = sum1;
2516        d[H4(2)] = sum2;
2517        d[H4(3)] = sum3;
2518    }
2519    clear_tail(vd, opr_sz, simd_maxsz(desc));
2520}
2521
2522#define DO_MMLA_B(NAME, INNER) \
2523    void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
2524    { do_mmla_b(vd, vn, vm, va, desc, INNER); }
2525
2526DO_MMLA_B(gvec_smmla_b, do_smmla_b)
2527DO_MMLA_B(gvec_ummla_b, do_ummla_b)
2528DO_MMLA_B(gvec_usmmla_b, do_usmmla_b)
2529
2530/*
2531 * BFloat16 Dot Product
2532 */
2533
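/*
 * A bfloat16 value is the high half of the corresponding float32, so
 * conversion is just a 16-bit left shift (e.g. 0x3f80 -> 1.0f).  Each
 * 32-bit element holds a pair of bfloat16 values whose dot product is
 * accumulated into the float32 sum.
 */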
2534static float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2)
2535{
2536    /* FPCR is ignored for BFDOT and BFMMLA. */
2537    float_status bf_status = {
2538        .tininess_before_rounding = float_tininess_before_rounding,
2539        .float_rounding_mode = float_round_to_odd_inf,
2540        .flush_to_zero = true,
2541        .flush_inputs_to_zero = true,
2542        .default_nan_mode = true,
2543    };
2544    float32 t1, t2;
2545
2546    /*
2547     * Extract each BFloat16 from the element pair, and shift
2548     * them such that they become float32.
2549     */
2550    t1 = float32_mul(e1 << 16, e2 << 16, &bf_status);
2551    t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, &bf_status);
2552    t1 = float32_add(t1, t2, &bf_status);
2553    t1 = float32_add(sum, t1, &bf_status);
2554
2555    return t1;
2556}
2557
2558void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
2559{
2560    intptr_t i, opr_sz = simd_oprsz(desc);
2561    float32 *d = vd, *a = va;
2562    uint32_t *n = vn, *m = vm;
2563
2564    for (i = 0; i < opr_sz / 4; ++i) {
2565        d[i] = bfdotadd(a[i], n[i], m[i]);
2566    }
2567    clear_tail(d, opr_sz, simd_maxsz(desc));
2568}
2569
2570void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm,
2571                            void *va, uint32_t desc)
2572{
2573    intptr_t i, j, opr_sz = simd_oprsz(desc);
2574    intptr_t index = simd_data(desc);
2575    intptr_t elements = opr_sz / 4;
2576    intptr_t eltspersegment = MIN(16 / 4, elements);
2577    float32 *d = vd, *a = va;
2578    uint32_t *n = vn, *m = vm;
2579
2580    for (i = 0; i < elements; i += eltspersegment) {
2581        uint32_t m_idx = m[i + H4(index)];
2582
2583        for (j = i; j < i + eltspersegment; j++) {
2584            d[j] = bfdotadd(a[j], n[j], m_idx);
2585        }
2586    }
2587    clear_tail(d, opr_sz, simd_maxsz(desc));
2588}
2589
2590void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
2591{
2592    intptr_t s, opr_sz = simd_oprsz(desc);
2593    float32 *d = vd, *a = va;
2594    uint32_t *n = vn, *m = vm;
2595
2596    for (s = 0; s < opr_sz / 4; s += 4) {
2597        float32 sum00, sum01, sum10, sum11;
2598
2599        /*
2600         * Process the entire segment at once, writing back the
2601         * results only after we've consumed all of the inputs.
2602         *
2603         * Key to indices by column:
2604         *               i   j           i   k             j   k
2605         */
2606        sum00 = a[s + H4(0 + 0)];
2607        sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)]);
2608        sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)]);
2609
2610        sum01 = a[s + H4(0 + 1)];
2611        sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)]);
2612        sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)]);
2613
2614        sum10 = a[s + H4(2 + 0)];
2615        sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)]);
2616        sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)]);
2617
2618        sum11 = a[s + H4(2 + 1)];
2619        sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)]);
2620        sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)]);
2621
2622        d[s + H4(0 + 0)] = sum00;
2623        d[s + H4(0 + 1)] = sum01;
2624        d[s + H4(2 + 0)] = sum10;
2625        d[s + H4(2 + 1)] = sum11;
2626    }
2627    clear_tail(d, opr_sz, simd_maxsz(desc));
2628}
2629
2630void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va,
2631                         void *stat, uint32_t desc)
2632{
2633    intptr_t i, opr_sz = simd_oprsz(desc);
2634    intptr_t sel = simd_data(desc);
2635    float32 *d = vd, *a = va;
2636    bfloat16 *n = vn, *m = vm;
2637
2638    for (i = 0; i < opr_sz / 4; ++i) {
2639        float32 nn = n[H2(i * 2 + sel)] << 16;
2640        float32 mm = m[H2(i * 2 + sel)] << 16;
2641        d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat);
2642    }
2643    clear_tail(d, opr_sz, simd_maxsz(desc));
2644}
2645
2646void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm,
2647                             void *va, void *stat, uint32_t desc)
2648{
2649    intptr_t i, j, opr_sz = simd_oprsz(desc);
2650    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1);
2651    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
2652    intptr_t elements = opr_sz / 4;
2653    intptr_t eltspersegment = MIN(16 / 4, elements);
2654    float32 *d = vd, *a = va;
2655    bfloat16 *n = vn, *m = vm;
2656
2657    for (i = 0; i < elements; i += eltspersegment) {
2658        float32 m_idx = m[H2(2 * i + index)] << 16;
2659
2660        for (j = i; j < i + eltspersegment; j++) {
2661            float32 n_j = n[H2(2 * j + sel)] << 16;
2662            d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat);
2663        }
2664    }
2665    clear_tail(d, opr_sz, simd_maxsz(desc));
2666}
2667