/* qemu/target/arm/translate-neon.c */
   1/*
   2 *  ARM translation: AArch32 Neon instructions
   3 *
   4 *  Copyright (c) 2003 Fabrice Bellard
   5 *  Copyright (c) 2005-2007 CodeSourcery
   6 *  Copyright (c) 2007 OpenedHand, Ltd.
   7 *  Copyright (c) 2020 Linaro, Ltd.
   8 *
   9 * This library is free software; you can redistribute it and/or
  10 * modify it under the terms of the GNU Lesser General Public
  11 * License as published by the Free Software Foundation; either
  12 * version 2.1 of the License, or (at your option) any later version.
  13 *
  14 * This library is distributed in the hope that it will be useful,
  15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17 * Lesser General Public License for more details.
  18 *
  19 * You should have received a copy of the GNU Lesser General Public
  20 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
  21 */
  22
  23#include "qemu/osdep.h"
  24#include "tcg/tcg-op.h"
  25#include "tcg/tcg-op-gvec.h"
  26#include "exec/exec-all.h"
  27#include "exec/gen-icount.h"
  28#include "translate.h"
  29#include "translate-a32.h"
  30
  31/* Include the generated Neon decoder */
  32#include "decode-neon-dp.c.inc"
  33#include "decode-neon-ls.c.inc"
  34#include "decode-neon-shared.c.inc"
  35
  36static TCGv_ptr vfp_reg_ptr(bool dp, int reg)
  37{
  38    TCGv_ptr ret = tcg_temp_new_ptr();
  39    tcg_gen_addi_ptr(ret, cpu_env, vfp_reg_offset(dp, reg));
  40    return ret;
  41}
  42
  43static void neon_load_element(TCGv_i32 var, int reg, int ele, MemOp mop)
  44{
  45    long offset = neon_element_offset(reg, ele, mop & MO_SIZE);
  46
  47    switch (mop) {
  48    case MO_UB:
  49        tcg_gen_ld8u_i32(var, cpu_env, offset);
  50        break;
  51    case MO_UW:
  52        tcg_gen_ld16u_i32(var, cpu_env, offset);
  53        break;
  54    case MO_UL:
  55        tcg_gen_ld_i32(var, cpu_env, offset);
  56        break;
  57    default:
  58        g_assert_not_reached();
  59    }
  60}
  61
  62static void neon_load_element64(TCGv_i64 var, int reg, int ele, MemOp mop)
  63{
  64    long offset = neon_element_offset(reg, ele, mop & MO_SIZE);
  65
  66    switch (mop) {
  67    case MO_UB:
  68        tcg_gen_ld8u_i64(var, cpu_env, offset);
  69        break;
  70    case MO_UW:
  71        tcg_gen_ld16u_i64(var, cpu_env, offset);
  72        break;
  73    case MO_UL:
  74        tcg_gen_ld32u_i64(var, cpu_env, offset);
  75        break;
  76    case MO_UQ:
  77        tcg_gen_ld_i64(var, cpu_env, offset);
  78        break;
  79    default:
  80        g_assert_not_reached();
  81    }
  82}
  83
  84static void neon_store_element(int reg, int ele, MemOp size, TCGv_i32 var)
  85{
  86    long offset = neon_element_offset(reg, ele, size);
  87
  88    switch (size) {
  89    case MO_8:
  90        tcg_gen_st8_i32(var, cpu_env, offset);
  91        break;
  92    case MO_16:
  93        tcg_gen_st16_i32(var, cpu_env, offset);
  94        break;
  95    case MO_32:
  96        tcg_gen_st_i32(var, cpu_env, offset);
  97        break;
  98    default:
  99        g_assert_not_reached();
 100    }
 101}
 102
 103static void neon_store_element64(int reg, int ele, MemOp size, TCGv_i64 var)
 104{
 105    long offset = neon_element_offset(reg, ele, size);
 106
 107    switch (size) {
 108    case MO_8:
 109        tcg_gen_st8_i64(var, cpu_env, offset);
 110        break;
 111    case MO_16:
 112        tcg_gen_st16_i64(var, cpu_env, offset);
 113        break;
 114    case MO_32:
 115        tcg_gen_st32_i64(var, cpu_env, offset);
 116        break;
 117    case MO_64:
 118        tcg_gen_st_i64(var, cpu_env, offset);
 119        break;
 120    default:
 121        g_assert_not_reached();
 122    }
 123}
 124
/*
 * Common expansion for four-operand (vd, vn, vm, vd-as-input) Neon insns
 * using an out-of-line gvec helper.  Returns false to UNDEF the insn;
 * returns true once it has been handled (including the case where the
 * VFP access check itself raised an exception).
 */
static bool do_neon_ddda(DisasContext *s, int q, int vd, int vn, int vm,
                         int data, gen_helper_gvec_4 *fn_gvec)
{
    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (((vd | vn | vm) & 0x10) && !dc_isar_feature(aa32_simd_r32, s)) {
        return false;
    }

    /*
     * UNDEF accesses to odd registers for each bit of Q.
     * Q will be 0b111 for all Q-reg instructions, otherwise
     * when we have mixed Q- and D-reg inputs.
     */
    if (((vd & 1) * 4 | (vn & 1) * 2 | (vm & 1)) & q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    int opr_sz = q ? 16 : 8;   /* Q regs are 16 bytes, D regs 8 */
    tcg_gen_gvec_4_ool(vfp_reg_offset(1, vd),
                       vfp_reg_offset(1, vn),
                       vfp_reg_offset(1, vm),
                       vfp_reg_offset(1, vd), /* vd is also an input */
                       opr_sz, opr_sz, data, fn_gvec);
    return true;
}
 154
/*
 * As do_neon_ddda, but for helpers that additionally take a float-status
 * pointer of the requested @fp_flavour.  Returns false to UNDEF; true
 * once the insn has been handled.
 */
static bool do_neon_ddda_fpst(DisasContext *s, int q, int vd, int vn, int vm,
                              int data, ARMFPStatusFlavour fp_flavour,
                              gen_helper_gvec_4_ptr *fn_gvec_ptr)
{
    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (((vd | vn | vm) & 0x10) && !dc_isar_feature(aa32_simd_r32, s)) {
        return false;
    }

    /*
     * UNDEF accesses to odd registers for each bit of Q.
     * Q will be 0b111 for all Q-reg instructions, otherwise
     * when we have mixed Q- and D-reg inputs.
     */
    if (((vd & 1) * 4 | (vn & 1) * 2 | (vm & 1)) & q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    int opr_sz = q ? 16 : 8;   /* Q regs are 16 bytes, D regs 8 */
    TCGv_ptr fpst = fpstatus_ptr(fp_flavour);

    tcg_gen_gvec_4_ptr(vfp_reg_offset(1, vd),
                       vfp_reg_offset(1, vn),
                       vfp_reg_offset(1, vm),
                       vfp_reg_offset(1, vd), /* vd is also an input */
                       fpst, opr_sz, opr_sz, data, fn_gvec_ptr);
    tcg_temp_free_ptr(fpst);
    return true;
}
 188
 189static bool trans_VCMLA(DisasContext *s, arg_VCMLA *a)
 190{
 191    if (!dc_isar_feature(aa32_vcma, s)) {
 192        return false;
 193    }
 194    if (a->size == MO_16) {
 195        if (!dc_isar_feature(aa32_fp16_arith, s)) {
 196            return false;
 197        }
 198        return do_neon_ddda_fpst(s, a->q * 7, a->vd, a->vn, a->vm, a->rot,
 199                                 FPST_STD_F16, gen_helper_gvec_fcmlah);
 200    }
 201    return do_neon_ddda_fpst(s, a->q * 7, a->vd, a->vn, a->vm, a->rot,
 202                             FPST_STD, gen_helper_gvec_fcmlas);
 203}
 204
/*
 * VCADD: floating-point complex add.  a->rot is passed through as the
 * helper's data argument (rotation selector); the fp16 form additionally
 * requires the half-precision arithmetic extension.
 */
static bool trans_VCADD(DisasContext *s, arg_VCADD *a)
{
    int opr_sz;
    TCGv_ptr fpst;
    gen_helper_gvec_3_ptr *fn_gvec_ptr;

    if (!dc_isar_feature(aa32_vcma, s)
        || (a->size == MO_16 && !dc_isar_feature(aa32_fp16_arith, s))) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    /* UNDEF if any register number is odd for the Q-reg form. */
    if ((a->vn | a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    opr_sz = (1 + a->q) * 8;    /* 8 bytes for D regs, 16 for Q regs */
    fpst = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD);
    fn_gvec_ptr = (a->size == MO_16) ?
        gen_helper_gvec_fcaddh : gen_helper_gvec_fcadds;
    tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(1, a->vn),
                       vfp_reg_offset(1, a->vm),
                       fpst, opr_sz, opr_sz, a->rot,
                       fn_gvec_ptr);
    tcg_temp_free_ptr(fpst);
    return true;
}
 242
 243static bool trans_VSDOT(DisasContext *s, arg_VSDOT *a)
 244{
 245    if (!dc_isar_feature(aa32_dp, s)) {
 246        return false;
 247    }
 248    return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
 249                        gen_helper_gvec_sdot_b);
 250}
 251
 252static bool trans_VUDOT(DisasContext *s, arg_VUDOT *a)
 253{
 254    if (!dc_isar_feature(aa32_dp, s)) {
 255        return false;
 256    }
 257    return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
 258                        gen_helper_gvec_udot_b);
 259}
 260
 261static bool trans_VUSDOT(DisasContext *s, arg_VUSDOT *a)
 262{
 263    if (!dc_isar_feature(aa32_i8mm, s)) {
 264        return false;
 265    }
 266    return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
 267                        gen_helper_gvec_usdot_b);
 268}
 269
 270static bool trans_VDOT_b16(DisasContext *s, arg_VDOT_b16 *a)
 271{
 272    if (!dc_isar_feature(aa32_bf16, s)) {
 273        return false;
 274    }
 275    return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
 276                        gen_helper_gvec_bfdot);
 277}
 278
/*
 * VFMAL/VFMSL (vector): widening half- to single-precision multiply-add
 * (gated on the aa32_fhm feature).  The vn/vm source operands are half
 * the width of vd, hence their offsets are computed with dp == a->q.
 * Note the helper takes cpu_env (not a float-status pointer) directly.
 */
static bool trans_VFML(DisasContext *s, arg_VFML *a)
{
    int opr_sz;

    if (!dc_isar_feature(aa32_fhm, s)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        (a->vd & 0x10)) {
        return false;
    }

    /* UNDEF if vd is odd for the Q-reg form. */
    if (a->vd & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    opr_sz = (1 + a->q) * 8;    /* 8 bytes for D regs, 16 for Q regs */
    tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(a->q, a->vn),
                       vfp_reg_offset(a->q, a->vm),
                       cpu_env, opr_sz, opr_sz, a->s, /* is_2 == 0 */
                       gen_helper_gvec_fmlal_a32);
    return true;
}
 309
 310static bool trans_VCMLA_scalar(DisasContext *s, arg_VCMLA_scalar *a)
 311{
 312    int data = (a->index << 2) | a->rot;
 313
 314    if (!dc_isar_feature(aa32_vcma, s)) {
 315        return false;
 316    }
 317    if (a->size == MO_16) {
 318        if (!dc_isar_feature(aa32_fp16_arith, s)) {
 319            return false;
 320        }
 321        return do_neon_ddda_fpst(s, a->q * 6, a->vd, a->vn, a->vm, data,
 322                                 FPST_STD_F16, gen_helper_gvec_fcmlah_idx);
 323    }
 324    return do_neon_ddda_fpst(s, a->q * 6, a->vd, a->vn, a->vm, data,
 325                             FPST_STD, gen_helper_gvec_fcmlas_idx);
 326}
 327
 328static bool trans_VSDOT_scalar(DisasContext *s, arg_VSDOT_scalar *a)
 329{
 330    if (!dc_isar_feature(aa32_dp, s)) {
 331        return false;
 332    }
 333    return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
 334                        gen_helper_gvec_sdot_idx_b);
 335}
 336
 337static bool trans_VUDOT_scalar(DisasContext *s, arg_VUDOT_scalar *a)
 338{
 339    if (!dc_isar_feature(aa32_dp, s)) {
 340        return false;
 341    }
 342    return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
 343                        gen_helper_gvec_udot_idx_b);
 344}
 345
 346static bool trans_VUSDOT_scalar(DisasContext *s, arg_VUSDOT_scalar *a)
 347{
 348    if (!dc_isar_feature(aa32_i8mm, s)) {
 349        return false;
 350    }
 351    return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
 352                        gen_helper_gvec_usdot_idx_b);
 353}
 354
 355static bool trans_VSUDOT_scalar(DisasContext *s, arg_VSUDOT_scalar *a)
 356{
 357    if (!dc_isar_feature(aa32_i8mm, s)) {
 358        return false;
 359    }
 360    return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
 361                        gen_helper_gvec_sudot_idx_b);
 362}
 363
 364static bool trans_VDOT_b16_scal(DisasContext *s, arg_VDOT_b16_scal *a)
 365{
 366    if (!dc_isar_feature(aa32_bf16, s)) {
 367        return false;
 368    }
 369    return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
 370                        gen_helper_gvec_bfdot_idx);
 371}
 372
/*
 * VFMAL/VFMSL (by element): widening half- to single-precision
 * multiply-add with a scalar operand (gated on aa32_fhm).  As for
 * trans_VFML, the source operands are half the width of vd; the element
 * index and the add/sub selector are packed into the helper data word.
 */
static bool trans_VFML_scalar(DisasContext *s, arg_VFML_scalar *a)
{
    int opr_sz;

    if (!dc_isar_feature(aa32_fhm, s)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd & 0x10) || (a->q && (a->vn & 0x10)))) {
        return false;
    }

    /* UNDEF if vd is odd for the Q-reg form. */
    if (a->vd & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    opr_sz = (1 + a->q) * 8;    /* 8 bytes for D regs, 16 for Q regs */
    tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(a->q, a->vn),
                       vfp_reg_offset(a->q, a->rm),
                       cpu_env, opr_sz, opr_sz,
                       (a->index << 2) | a->s, /* is_2 == 0 */
                       gen_helper_gvec_fmlal_idx_a32);
    return true;
}
 404
/*
 * Properties of the VLD/VST "multiple structures" forms, indexed by the
 * instruction's itype field (0..10).  As consumed by trans_VLDST_multiple:
 * nregs is the number of register passes, interleave is the number of
 * registers whose elements are interleaved in memory within one pass,
 * and spacing is the D-register stride between those interleaved
 * registers.
 */
static struct {
    int nregs;
    int interleave;
    int spacing;
} const neon_ls_element_type[11] = {
    {1, 4, 1},
    {1, 4, 2},
    {4, 1, 1},
    {2, 2, 2},
    {1, 3, 1},
    {1, 3, 2},
    {3, 1, 1},
    {1, 1, 1},
    {1, 2, 1},
    {1, 2, 2},
    {2, 1, 1}
};
 422
 423static void gen_neon_ldst_base_update(DisasContext *s, int rm, int rn,
 424                                      int stride)
 425{
 426    if (rm != 15) {
 427        TCGv_i32 base;
 428
 429        base = load_reg(s, rn);
 430        if (rm == 13) {
 431            tcg_gen_addi_i32(base, base, stride);
 432        } else {
 433            TCGv_i32 index;
 434            index = load_reg(s, rm);
 435            tcg_gen_add_i32(base, base, index);
 436            tcg_temp_free_i32(index);
 437        }
 438        store_reg(s, rn, base);
 439    }
 440}
 441
/*
 * VLD<n>/VST<n> (multiple structures): transfer nregs passes of
 * interleave registers to/from memory, (de)interleaving elements of
 * 2^size bytes.  Returns false to UNDEF the insn, true once handled.
 */
static bool trans_VLDST_multiple(DisasContext *s, arg_VLDST_multiple *a)
{
    /* Neon load/store multiple structures */
    int nregs, interleave, spacing, reg, n;
    MemOp mop, align, endian;
    int mmu_idx = get_mem_index(s);
    int size = a->size;
    TCGv_i64 tmp64;
    TCGv_i32 addr;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist */
    if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
        return false;
    }
    /* Only itype values 0..10 index the element-type table. */
    if (a->itype > 10) {
        return false;
    }
    /* Catch UNDEF cases for bad values of align field */
    switch (a->itype & 0xc) {
    case 4:
        if (a->align >= 2) {
            return false;
        }
        break;
    case 8:
        if (a->align == 3) {
            return false;
        }
        break;
    default:
        break;
    }
    nregs = neon_ls_element_type[a->itype].nregs;
    interleave = neon_ls_element_type[a->itype].interleave;
    spacing = neon_ls_element_type[a->itype].spacing;
    /* size == 3 is only valid for the plain VLD1/VST1 forms. */
    if (size == 3 && (interleave | spacing) != 1) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /* For our purposes, bytes are always little-endian.  */
    endian = s->be_data;
    if (size == 0) {
        endian = MO_LE;
    }

    /* Enforce alignment requested by the instruction */
    if (a->align) {
        align = pow2_align(a->align + 2); /* 4 ** a->align */
    } else {
        align = s->align_mem ? MO_ALIGN : 0;
    }

    /*
     * Consecutive little-endian elements from a single register
     * can be promoted to a larger little-endian operation.
     */
    if (interleave == 1 && endian == MO_LE) {
        /* Retain any natural alignment. */
        if (align == MO_ALIGN) {
            align = pow2_align(size);
        }
        size = 3;
    }

    tmp64 = tcg_temp_new_i64();
    addr = tcg_temp_new_i32();
    load_reg_var(s, addr, a->rn);

    mop = endian | size | align;
    for (reg = 0; reg < nregs; reg++) {
        /* 8 >> size elements per 64-bit D register */
        for (n = 0; n < 8 >> size; n++) {
            int xs;
            for (xs = 0; xs < interleave; xs++) {
                int tt = a->vd + reg + spacing * xs;

                if (a->l) {
                    gen_aa32_ld_internal_i64(s, tmp64, addr, mmu_idx, mop);
                    neon_store_element64(tt, n, size, tmp64);
                } else {
                    neon_load_element64(tmp64, tt, n, size);
                    gen_aa32_st_internal_i64(s, tmp64, addr, mmu_idx, mop);
                }
                tcg_gen_addi_i32(addr, addr, 1 << size);

                /* Subsequent memory operations inherit alignment */
                mop &= ~MO_AMASK;
            }
        }
    }
    tcg_temp_free_i32(addr);
    tcg_temp_free_i64(tmp64);

    /* Total bytes transferred == nregs * interleave * 8. */
    gen_neon_ldst_base_update(s, a->rm, a->rn, nregs * interleave * 8);
    return true;
}
 545
/*
 * VLD<n> (single structure to all lanes): load one element per
 * structure and replicate it across every lane of the destination
 * register(s).  Returns false to UNDEF the insn, true once handled.
 */
static bool trans_VLD_all_lanes(DisasContext *s, arg_VLD_all_lanes *a)
{
    /* Neon load single structure to all lanes */
    int reg, stride, vec_size;
    int vd = a->vd;
    int size = a->size;
    int nregs = a->n + 1;
    TCGv_i32 addr, tmp;
    MemOp mop, align;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist */
    if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
        return false;
    }

    align = 0;
    if (size == 3) {
        if (nregs != 4 || a->a == 0) {
            return false;
        }
        /* For VLD4 size == 3 a == 1 means 32 bits at 16 byte alignment */
        size = MO_32;
        align = MO_ALIGN_16;
    } else if (a->a) {
        /* Alignment encodings for the other nregs/size combinations. */
        switch (nregs) {
        case 1:
            if (size == 0) {
                return false;
            }
            align = MO_ALIGN;
            break;
        case 2:
            align = pow2_align(size + 1);
            break;
        case 3:
            return false;
        case 4:
            align = pow2_align(size + 2);
            break;
        default:
            g_assert_not_reached();
        }
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /*
     * VLD1 to all lanes: T bit indicates how many Dregs to write.
     * VLD2/3/4 to all lanes: T bit indicates register stride.
     */
    stride = a->t ? 2 : 1;
    vec_size = nregs == 1 ? stride * 8 : 8;
    mop = size | align;
    tmp = tcg_temp_new_i32();
    addr = tcg_temp_new_i32();
    load_reg_var(s, addr, a->rn);
    for (reg = 0; reg < nregs; reg++) {
        gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s), mop);
        if ((vd & 1) && vec_size == 16) {
            /*
             * We cannot write 16 bytes at once because the
             * destination is unaligned.
             */
            tcg_gen_gvec_dup_i32(size, neon_full_reg_offset(vd),
                                 8, 8, tmp);
            tcg_gen_gvec_mov(0, neon_full_reg_offset(vd + 1),
                             neon_full_reg_offset(vd), 8, 8);
        } else {
            tcg_gen_gvec_dup_i32(size, neon_full_reg_offset(vd),
                                 vec_size, vec_size, tmp);
        }
        tcg_gen_addi_i32(addr, addr, 1 << size);
        vd += stride;

        /* Subsequent memory operations inherit alignment */
        mop &= ~MO_AMASK;
    }
    tcg_temp_free_i32(tmp);
    tcg_temp_free_i32(addr);

    gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << size) * nregs);

    return true;
}
 636
/*
 * VLD<n>/VST<n> (single structure to/from one lane): transfer one
 * element per register, at lane a->reg_idx.  Returns false to UNDEF
 * the insn, true once handled.
 */
static bool trans_VLDST_single(DisasContext *s, arg_VLDST_single *a)
{
    /* Neon load/store single structure to one lane */
    int reg;
    int nregs = a->n + 1;
    int vd = a->vd;
    TCGv_i32 addr, tmp;
    MemOp mop;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist */
    if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
        return false;
    }

    /* Catch the UNDEF cases. This is unavoidably a bit messy. */
    switch (nregs) {
    case 1:
        if (a->stride != 1) {
            return false;
        }
        if (((a->align & (1 << a->size)) != 0) ||
            (a->size == 2 && (a->align == 1 || a->align == 2))) {
            return false;
        }
        break;
    case 2:
        if (a->size == 2 && (a->align & 2) != 0) {
            return false;
        }
        break;
    case 3:
        if (a->align != 0) {
            return false;
        }
        break;
    case 4:
        if (a->size == 2 && a->align == 3) {
            return false;
        }
        break;
    default:
        g_assert_not_reached();
    }
    if ((vd + a->stride * (nregs - 1)) > 31) {
        /*
         * Attempts to write off the end of the register file are
         * UNPREDICTABLE; we choose to UNDEF because otherwise we would
         * access off the end of the array that holds the register data.
         */
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /* Pick up SCTLR settings */
    mop = finalize_memop(s, a->size);

    if (a->align) {
        MemOp align_op;

        switch (nregs) {
        case 1:
            /* For VLD1, use natural alignment. */
            align_op = MO_ALIGN;
            break;
        case 2:
            /* For VLD2, use double alignment. */
            align_op = pow2_align(a->size + 1);
            break;
        case 4:
            if (a->size == MO_32) {
                /*
                 * For VLD4.32, align = 1 is double alignment, align = 2 is
                 * quad alignment; align = 3 is rejected above.
                 */
                align_op = pow2_align(a->size + a->align);
            } else {
                /* For VLD4.8 and VLD.16, we want quad alignment. */
                align_op = pow2_align(a->size + 2);
            }
            break;
        default:
            /* For VLD3, the alignment field is zero and rejected above. */
            g_assert_not_reached();
        }

        mop = (mop & ~MO_AMASK) | align_op;
    }

    tmp = tcg_temp_new_i32();
    addr = tcg_temp_new_i32();
    load_reg_var(s, addr, a->rn);

    for (reg = 0; reg < nregs; reg++) {
        if (a->l) {
            gen_aa32_ld_internal_i32(s, tmp, addr, get_mem_index(s), mop);
            neon_store_element(vd, a->reg_idx, a->size, tmp);
        } else { /* Store */
            neon_load_element(tmp, vd, a->reg_idx, a->size);
            gen_aa32_st_internal_i32(s, tmp, addr, get_mem_index(s), mop);
        }
        vd += a->stride;
        tcg_gen_addi_i32(addr, addr, 1 << a->size);

        /* Subsequent memory operations inherit alignment */
        mop &= ~MO_AMASK;
    }
    tcg_temp_free_i32(addr);
    tcg_temp_free_i32(tmp);

    gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << a->size) * nregs);

    return true;
}
 757
 758static bool do_3same(DisasContext *s, arg_3same *a, GVecGen3Fn fn)
 759{
 760    int vec_size = a->q ? 16 : 8;
 761    int rd_ofs = neon_full_reg_offset(a->vd);
 762    int rn_ofs = neon_full_reg_offset(a->vn);
 763    int rm_ofs = neon_full_reg_offset(a->vm);
 764
 765    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
 766        return false;
 767    }
 768
 769    /* UNDEF accesses to D16-D31 if they don't exist. */
 770    if (!dc_isar_feature(aa32_simd_r32, s) &&
 771        ((a->vd | a->vn | a->vm) & 0x10)) {
 772        return false;
 773    }
 774
 775    if ((a->vn | a->vm | a->vd) & a->q) {
 776        return false;
 777    }
 778
 779    if (!vfp_access_check(s)) {
 780        return true;
 781    }
 782
 783    fn(a->size, rd_ofs, rn_ofs, rm_ofs, vec_size, vec_size);
 784    return true;
 785}
 786
/*
 * Define a trans_<INSN>_3s function that expands a 3-same insn via
 * do_3same using FUNC (a GVecGen3Fn-compatible expander).
 */
#define DO_3SAME(INSN, FUNC)                                            \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        return do_3same(s, a, FUNC);                                    \
    }

DO_3SAME(VADD, tcg_gen_gvec_add)
DO_3SAME(VSUB, tcg_gen_gvec_sub)
DO_3SAME(VAND, tcg_gen_gvec_and)
DO_3SAME(VBIC, tcg_gen_gvec_andc)
DO_3SAME(VORR, tcg_gen_gvec_or)
DO_3SAME(VORN, tcg_gen_gvec_orc)
DO_3SAME(VEOR, tcg_gen_gvec_xor)
DO_3SAME(VSHL_S, gen_gvec_sshl)
DO_3SAME(VSHL_U, gen_gvec_ushl)
DO_3SAME(VQADD_S, gen_gvec_sqadd_qc)
DO_3SAME(VQADD_U, gen_gvec_uqadd_qc)
DO_3SAME(VQSUB_S, gen_gvec_sqsub_qc)
DO_3SAME(VQSUB_U, gen_gvec_uqsub_qc)
 806
/*
 * These insns are all gvec_bitsel but with the inputs in various orders.
 * The three O1/O2/O3 arguments are the selector, true-input and
 * false-input offsets passed to tcg_gen_gvec_bitsel.
 */
#define DO_3SAME_BITSEL(INSN, O1, O2, O3)                               \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        tcg_gen_gvec_bitsel(vece, rd_ofs, O1, O2, O3, oprsz, maxsz);    \
    }                                                                   \
    DO_3SAME(INSN, gen_##INSN##_3s)

DO_3SAME_BITSEL(VBSL, rd_ofs, rn_ofs, rm_ofs)
DO_3SAME_BITSEL(VBIT, rm_ofs, rn_ofs, rd_ofs)
DO_3SAME_BITSEL(VBIF, rm_ofs, rd_ofs, rn_ofs)
 820
/*
 * As DO_3SAME, but for insns where size == 3 (64-bit elements) is not
 * a valid encoding and must UNDEF.
 */
#define DO_3SAME_NO_SZ_3(INSN, FUNC)                                    \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (a->size == 3) {                                             \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, FUNC);                                    \
    }

DO_3SAME_NO_SZ_3(VMAX_S, tcg_gen_gvec_smax)
DO_3SAME_NO_SZ_3(VMAX_U, tcg_gen_gvec_umax)
DO_3SAME_NO_SZ_3(VMIN_S, tcg_gen_gvec_smin)
DO_3SAME_NO_SZ_3(VMIN_U, tcg_gen_gvec_umin)
DO_3SAME_NO_SZ_3(VMUL, tcg_gen_gvec_mul)
DO_3SAME_NO_SZ_3(VMLA, gen_gvec_mla)
DO_3SAME_NO_SZ_3(VMLS, gen_gvec_mls)
DO_3SAME_NO_SZ_3(VTST, gen_gvec_cmtst)
DO_3SAME_NO_SZ_3(VABD_S, gen_gvec_sabd)
DO_3SAME_NO_SZ_3(VABA_S, gen_gvec_saba)
DO_3SAME_NO_SZ_3(VABD_U, gen_gvec_uabd)
DO_3SAME_NO_SZ_3(VABA_U, gen_gvec_uaba)
 842
/*
 * Integer element-wise comparisons, expanded with tcg_gen_gvec_cmp
 * using the given TCG condition.  size == 3 is invalid for these.
 */
#define DO_3SAME_CMP(INSN, COND)                                        \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        tcg_gen_gvec_cmp(COND, vece, rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz); \
    }                                                                   \
    DO_3SAME_NO_SZ_3(INSN, gen_##INSN##_3s)

DO_3SAME_CMP(VCGT_S, TCG_COND_GT)
DO_3SAME_CMP(VCGT_U, TCG_COND_GTU)
DO_3SAME_CMP(VCGE_S, TCG_COND_GE)
DO_3SAME_CMP(VCGE_U, TCG_COND_GEU)
DO_3SAME_CMP(VCEQ, TCG_COND_EQ)
 857
/*
 * Adapt an out-of-line gvec_3 helper to the GVecGen3Fn signature
 * expected by do_3same (vece is ignored, helper data is 0).
 */
#define WRAP_OOL_FN(WRAPNAME, FUNC)                                        \
    static void WRAPNAME(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,  \
                         uint32_t rm_ofs, uint32_t oprsz, uint32_t maxsz)  \
    {                                                                      \
        tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, 0, FUNC); \
    }

WRAP_OOL_FN(gen_VMUL_p_3s, gen_helper_gvec_pmul_b)
 866
 867static bool trans_VMUL_p_3s(DisasContext *s, arg_3same *a)
 868{
 869    if (a->size != 0) {
 870        return false;
 871    }
 872    return do_3same(s, a, gen_VMUL_p_3s);
 873}
 874
/*
 * VQRDMLAH/VQRDMLSH: gated on the RDM feature; only 16- and 32-bit
 * element sizes are valid encodings.
 */
#define DO_VQRDMLAH(INSN, FUNC)                                         \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (!dc_isar_feature(aa32_rdm, s)) {                            \
            return false;                                               \
        }                                                               \
        if (a->size != 1 && a->size != 2) {                             \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, FUNC);                                    \
    }

DO_VQRDMLAH(VQRDMLAH, gen_gvec_sqrdmlah_qc)
DO_VQRDMLAH(VQRDMLSH, gen_gvec_sqrdmlsh_qc)
 889
/*
 * SHA-1 crypto insns: expand via an out-of-line helper, gated on the
 * aa32_sha1 feature.
 */
#define DO_SHA1(NAME, FUNC)                                             \
    WRAP_OOL_FN(gen_##NAME##_3s, FUNC)                                  \
    static bool trans_##NAME##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (!dc_isar_feature(aa32_sha1, s)) {                           \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##NAME##_3s);                         \
    }

DO_SHA1(SHA1C, gen_helper_crypto_sha1c)
DO_SHA1(SHA1P, gen_helper_crypto_sha1p)
DO_SHA1(SHA1M, gen_helper_crypto_sha1m)
DO_SHA1(SHA1SU0, gen_helper_crypto_sha1su0)
 904
/*
 * SHA-256 crypto insns: expand via an out-of-line helper, gated on the
 * aa32_sha2 feature.
 */
#define DO_SHA2(NAME, FUNC)                                             \
    WRAP_OOL_FN(gen_##NAME##_3s, FUNC)                                  \
    static bool trans_##NAME##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (!dc_isar_feature(aa32_sha2, s)) {                           \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##NAME##_3s);                         \
    }

DO_SHA2(SHA256H, gen_helper_crypto_sha256h)
DO_SHA2(SHA256H2, gen_helper_crypto_sha256h2)
DO_SHA2(SHA256SU1, gen_helper_crypto_sha256su1)
 918
/*
 * Expand a 3-same insn with 64-bit elements: FUNC is a per-element
 * TCGv_i64 function plugged into a GVecGen3 via .fni8.
 */
#define DO_3SAME_64(INSN, FUNC)                                         \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        static const GVecGen3 op = { .fni8 = FUNC };                    \
        tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &op);      \
    }                                                                   \
    DO_3SAME(INSN, gen_##INSN##_3s)
 928
/* As DO_3SAME_64, but wrap FUNC so it also receives cpu_env. */
#define DO_3SAME_64_ENV(INSN, FUNC)                                     \
    static void gen_##INSN##_elt(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m)    \
    {                                                                   \
        FUNC(d, cpu_env, n, m);                                         \
    }                                                                   \
    DO_3SAME_64(INSN, gen_##INSN##_elt)
 935
/* 64-bit shifts, expanded one element at a time via .fni8. */
DO_3SAME_64(VRSHL_S64, gen_helper_neon_rshl_s64)
DO_3SAME_64(VRSHL_U64, gen_helper_neon_rshl_u64)
/* The qshl/qrshl helpers take cpu_env, so use the ENV variant. */
DO_3SAME_64_ENV(VQSHL_S64, gen_helper_neon_qshl_s64)
DO_3SAME_64_ENV(VQSHL_U64, gen_helper_neon_qshl_u64)
DO_3SAME_64_ENV(VQRSHL_S64, gen_helper_neon_qrshl_s64)
DO_3SAME_64_ENV(VQRSHL_U64, gen_helper_neon_qrshl_u64)
 942
/*
 * Expand a 3-same insn one 32-bit element at a time; helper names
 * are built as gen_helper_neon_<FUNC><esize>. Sizes 0..2 only: the
 * trans function rejects a->size > 2, so ops[3] is never used.
 */
#define DO_3SAME_32(INSN, FUNC)                                         \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        static const GVecGen3 ops[4] = {                                \
            { .fni4 = gen_helper_neon_##FUNC##8 },                      \
            { .fni4 = gen_helper_neon_##FUNC##16 },                     \
            { .fni4 = gen_helper_neon_##FUNC##32 },                     \
            { 0 },                                                      \
        };                                                              \
        tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece]); \
    }                                                                   \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (a->size > 2) {                                              \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##INSN##_3s);                         \
    }
 963
 964/*
 965 * Some helper functions need to be passed the cpu_env. In order
 966 * to use those with the gvec APIs like tcg_gen_gvec_3() we need
 967 * to create wrapper functions whose prototype is a NeonGenTwoOpFn()
 968 * and which call a NeonGenTwoOpEnvFn().
 969 */
 970#define WRAP_ENV_FN(WRAPNAME, FUNC)                                     \
 971    static void WRAPNAME(TCGv_i32 d, TCGv_i32 n, TCGv_i32 m)            \
 972    {                                                                   \
 973        FUNC(d, cpu_env, n, m);                                         \
 974    }
 975
/*
 * As DO_3SAME_32, but the helpers take cpu_env, so each element
 * size gets a WRAP_ENV_FN trampoline first.
 */
#define DO_3SAME_32_ENV(INSN, FUNC)                                     \
    WRAP_ENV_FN(gen_##INSN##_tramp8, gen_helper_neon_##FUNC##8);        \
    WRAP_ENV_FN(gen_##INSN##_tramp16, gen_helper_neon_##FUNC##16);      \
    WRAP_ENV_FN(gen_##INSN##_tramp32, gen_helper_neon_##FUNC##32);      \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        static const GVecGen3 ops[4] = {                                \
            { .fni4 = gen_##INSN##_tramp8 },                            \
            { .fni4 = gen_##INSN##_tramp16 },                           \
            { .fni4 = gen_##INSN##_tramp32 },                           \
            { 0 },                                                      \
        };                                                              \
        tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece]); \
    }                                                                   \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (a->size > 2) {                                              \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##INSN##_3s);                         \
    }
 999
/* Integer 3-same ops handled 32 bits at a time (sizes 0..2). */
DO_3SAME_32(VHADD_S, hadd_s)
DO_3SAME_32(VHADD_U, hadd_u)
DO_3SAME_32(VHSUB_S, hsub_s)
DO_3SAME_32(VHSUB_U, hsub_u)
DO_3SAME_32(VRHADD_S, rhadd_s)
DO_3SAME_32(VRHADD_U, rhadd_u)
DO_3SAME_32(VRSHL_S, rshl_s)
DO_3SAME_32(VRSHL_U, rshl_u)

/* These helpers take cpu_env, so use the ENV variant. */
DO_3SAME_32_ENV(VQSHL_S, qshl_s)
DO_3SAME_32_ENV(VQSHL_U, qshl_u)
DO_3SAME_32_ENV(VQRSHL_S, qrshl_s)
DO_3SAME_32_ENV(VQRSHL_U, qrshl_u)
1013
1014static bool do_3same_pair(DisasContext *s, arg_3same *a, NeonGenTwoOpFn *fn)
1015{
1016    /* Operations handled pairwise 32 bits at a time */
1017    TCGv_i32 tmp, tmp2, tmp3;
1018
1019    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1020        return false;
1021    }
1022
1023    /* UNDEF accesses to D16-D31 if they don't exist. */
1024    if (!dc_isar_feature(aa32_simd_r32, s) &&
1025        ((a->vd | a->vn | a->vm) & 0x10)) {
1026        return false;
1027    }
1028
1029    if (a->size == 3) {
1030        return false;
1031    }
1032
1033    if (!vfp_access_check(s)) {
1034        return true;
1035    }
1036
1037    assert(a->q == 0); /* enforced by decode patterns */
1038
1039    /*
1040     * Note that we have to be careful not to clobber the source operands
1041     * in the "vm == vd" case by storing the result of the first pass too
1042     * early. Since Q is 0 there are always just two passes, so instead
1043     * of a complicated loop over each pass we just unroll.
1044     */
1045    tmp = tcg_temp_new_i32();
1046    tmp2 = tcg_temp_new_i32();
1047    tmp3 = tcg_temp_new_i32();
1048
1049    read_neon_element32(tmp, a->vn, 0, MO_32);
1050    read_neon_element32(tmp2, a->vn, 1, MO_32);
1051    fn(tmp, tmp, tmp2);
1052
1053    read_neon_element32(tmp3, a->vm, 0, MO_32);
1054    read_neon_element32(tmp2, a->vm, 1, MO_32);
1055    fn(tmp3, tmp3, tmp2);
1056
1057    write_neon_element32(tmp, a->vd, 0, MO_32);
1058    write_neon_element32(tmp3, a->vd, 1, MO_32);
1059
1060    tcg_temp_free_i32(tmp);
1061    tcg_temp_free_i32(tmp2);
1062    tcg_temp_free_i32(tmp3);
1063    return true;
1064}
1065
/*
 * Build a pairwise trans function; fns[] is indexed by a->size,
 * so size > 2 must be rejected before the table lookup.
 */
#define DO_3SAME_PAIR(INSN, func)                                       \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        static NeonGenTwoOpFn * const fns[] = {                         \
            gen_helper_neon_##func##8,                                  \
            gen_helper_neon_##func##16,                                 \
            gen_helper_neon_##func##32,                                 \
        };                                                              \
        if (a->size > 2) {                                              \
            return false;                                               \
        }                                                               \
        return do_3same_pair(s, a, fns[a->size]);                       \
    }
1079
/* 32-bit pairwise ops end up the same as the elementwise versions.  */
#define gen_helper_neon_pmax_s32  tcg_gen_smax_i32
#define gen_helper_neon_pmax_u32  tcg_gen_umax_i32
#define gen_helper_neon_pmin_s32  tcg_gen_smin_i32
#define gen_helper_neon_pmin_u32  tcg_gen_umin_i32
#define gen_helper_neon_padd_u32  tcg_gen_add_i32

/*
 * DO_3SAME_PAIR builds helper names gen_helper_neon_<func><esize>;
 * the #defines above supply the 32-bit entries from plain TCG ops.
 */
DO_3SAME_PAIR(VPMAX_S, pmax_s)
DO_3SAME_PAIR(VPMIN_S, pmin_s)
DO_3SAME_PAIR(VPMAX_U, pmax_u)
DO_3SAME_PAIR(VPMIN_U, pmin_u)
DO_3SAME_PAIR(VPADD, padd_u)
1092
/*
 * VQDMULH/VQRDMULH exist only for 16- and 32-bit elements, so ops[]
 * has two entries and is indexed by (vece - 1); the trans function
 * rejects all other sizes. The helpers take cpu_env, hence the
 * WRAP_ENV_FN trampolines.
 */
#define DO_3SAME_VQDMULH(INSN, FUNC)                                    \
    WRAP_ENV_FN(gen_##INSN##_tramp16, gen_helper_neon_##FUNC##_s16);    \
    WRAP_ENV_FN(gen_##INSN##_tramp32, gen_helper_neon_##FUNC##_s32);    \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        static const GVecGen3 ops[2] = {                                \
            { .fni4 = gen_##INSN##_tramp16 },                           \
            { .fni4 = gen_##INSN##_tramp32 },                           \
        };                                                              \
        tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece - 1]); \
    }                                                                   \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (a->size != 1 && a->size != 2) {                             \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##INSN##_3s);                         \
    }
1113
/* Doubling multiply-high, plain and rounding variants. */
DO_3SAME_VQDMULH(VQDMULH, qdmulh)
DO_3SAME_VQDMULH(VQRDMULH, qrdmulh)
1116
/*
 * Wrap a gvec_3_ptr helper as a gvec-style expander that allocates
 * the requested fpstatus pointer (FPST) around the call.
 */
#define WRAP_FP_GVEC(WRAPNAME, FPST, FUNC)                              \
    static void WRAPNAME(unsigned vece, uint32_t rd_ofs,                \
                         uint32_t rn_ofs, uint32_t rm_ofs,              \
                         uint32_t oprsz, uint32_t maxsz)                \
    {                                                                   \
        TCGv_ptr fpst = fpstatus_ptr(FPST);                             \
        tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, fpst,                \
                           oprsz, maxsz, 0, FUNC);                      \
        tcg_temp_free_ptr(fpst);                                        \
    }
1127
/*
 * FP 3-same: pick the fp16 (HFUNC) or fp32 (SFUNC) expander by
 * a->size; the fp16 form UNDEFs without the fp16 arith extension.
 */
#define DO_3S_FP_GVEC(INSN,SFUNC,HFUNC)                                 \
    WRAP_FP_GVEC(gen_##INSN##_fp32_3s, FPST_STD, SFUNC)                 \
    WRAP_FP_GVEC(gen_##INSN##_fp16_3s, FPST_STD_F16, HFUNC)             \
    static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a)     \
    {                                                                   \
        if (a->size == MO_16) {                                         \
            if (!dc_isar_feature(aa32_fp16_arith, s)) {                 \
                return false;                                           \
            }                                                           \
            return do_3same(s, a, gen_##INSN##_fp16_3s);                \
        }                                                               \
        return do_3same(s, a, gen_##INSN##_fp32_3s);                    \
    }
1141
1142
/*
 * FP 3-same ops with both single- and half-precision gvec helpers;
 * the fp16 forms are gated on aa32_fp16_arith by DO_3S_FP_GVEC.
 */
DO_3S_FP_GVEC(VADD, gen_helper_gvec_fadd_s, gen_helper_gvec_fadd_h)
DO_3S_FP_GVEC(VSUB, gen_helper_gvec_fsub_s, gen_helper_gvec_fsub_h)
DO_3S_FP_GVEC(VABD, gen_helper_gvec_fabd_s, gen_helper_gvec_fabd_h)
DO_3S_FP_GVEC(VMUL, gen_helper_gvec_fmul_s, gen_helper_gvec_fmul_h)
DO_3S_FP_GVEC(VCEQ, gen_helper_gvec_fceq_s, gen_helper_gvec_fceq_h)
DO_3S_FP_GVEC(VCGE, gen_helper_gvec_fcge_s, gen_helper_gvec_fcge_h)
DO_3S_FP_GVEC(VCGT, gen_helper_gvec_fcgt_s, gen_helper_gvec_fcgt_h)
DO_3S_FP_GVEC(VACGE, gen_helper_gvec_facge_s, gen_helper_gvec_facge_h)
DO_3S_FP_GVEC(VACGT, gen_helper_gvec_facgt_s, gen_helper_gvec_facgt_h)
DO_3S_FP_GVEC(VMAX, gen_helper_gvec_fmax_s, gen_helper_gvec_fmax_h)
DO_3S_FP_GVEC(VMIN, gen_helper_gvec_fmin_s, gen_helper_gvec_fmin_h)
DO_3S_FP_GVEC(VMLA, gen_helper_gvec_fmla_s, gen_helper_gvec_fmla_h)
DO_3S_FP_GVEC(VMLS, gen_helper_gvec_fmls_s, gen_helper_gvec_fmls_h)
DO_3S_FP_GVEC(VFMA, gen_helper_gvec_vfma_s, gen_helper_gvec_vfma_h)
DO_3S_FP_GVEC(VFMS, gen_helper_gvec_vfms_s, gen_helper_gvec_vfms_h)
DO_3S_FP_GVEC(VRECPS, gen_helper_gvec_recps_nf_s, gen_helper_gvec_recps_nf_h)
DO_3S_FP_GVEC(VRSQRTS, gen_helper_gvec_rsqrts_nf_s, gen_helper_gvec_rsqrts_nf_h)
1160
/*
 * VMAXNM/VMINNM expanders; the hand-written trans functions below
 * add the v8 feature check, so DO_3S_FP_GVEC cannot be used here.
 */
WRAP_FP_GVEC(gen_VMAXNM_fp32_3s, FPST_STD, gen_helper_gvec_fmaxnum_s)
WRAP_FP_GVEC(gen_VMAXNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fmaxnum_h)
WRAP_FP_GVEC(gen_VMINNM_fp32_3s, FPST_STD, gen_helper_gvec_fminnum_s)
WRAP_FP_GVEC(gen_VMINNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fminnum_h)
1165
1166static bool trans_VMAXNM_fp_3s(DisasContext *s, arg_3same *a)
1167{
1168    if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
1169        return false;
1170    }
1171
1172    if (a->size == MO_16) {
1173        if (!dc_isar_feature(aa32_fp16_arith, s)) {
1174            return false;
1175        }
1176        return do_3same(s, a, gen_VMAXNM_fp16_3s);
1177    }
1178    return do_3same(s, a, gen_VMAXNM_fp32_3s);
1179}
1180
1181static bool trans_VMINNM_fp_3s(DisasContext *s, arg_3same *a)
1182{
1183    if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
1184        return false;
1185    }
1186
1187    if (a->size == MO_16) {
1188        if (!dc_isar_feature(aa32_fp16_arith, s)) {
1189            return false;
1190        }
1191        return do_3same(s, a, gen_VMINNM_fp16_3s);
1192    }
1193    return do_3same(s, a, gen_VMINNM_fp32_3s);
1194}
1195
1196static bool do_3same_fp_pair(DisasContext *s, arg_3same *a,
1197                             gen_helper_gvec_3_ptr *fn)
1198{
1199    /* FP pairwise operations */
1200    TCGv_ptr fpstatus;
1201
1202    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1203        return false;
1204    }
1205
1206    /* UNDEF accesses to D16-D31 if they don't exist. */
1207    if (!dc_isar_feature(aa32_simd_r32, s) &&
1208        ((a->vd | a->vn | a->vm) & 0x10)) {
1209        return false;
1210    }
1211
1212    if (!vfp_access_check(s)) {
1213        return true;
1214    }
1215
1216    assert(a->q == 0); /* enforced by decode patterns */
1217
1218
1219    fpstatus = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD);
1220    tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
1221                       vfp_reg_offset(1, a->vn),
1222                       vfp_reg_offset(1, a->vm),
1223                       fpstatus, 8, 8, 0, fn);
1224    tcg_temp_free_ptr(fpstatus);
1225
1226    return true;
1227}
1228
1229/*
1230 * For all the functions using this macro, size == 1 means fp16,
1231 * which is an architecture extension we don't implement yet.
1232 */
1233#define DO_3S_FP_PAIR(INSN,FUNC)                                    \
1234    static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a) \
1235    {                                                               \
1236        if (a->size == MO_16) {                                     \
1237            if (!dc_isar_feature(aa32_fp16_arith, s)) {             \
1238                return false;                                       \
1239            }                                                       \
1240            return do_3same_fp_pair(s, a, FUNC##h);                 \
1241        }                                                           \
1242        return do_3same_fp_pair(s, a, FUNC##s);                     \
1243    }
1244
1245DO_3S_FP_PAIR(VPADD, gen_helper_neon_padd)
1246DO_3S_FP_PAIR(VPMAX, gen_helper_neon_pmax)
1247DO_3S_FP_PAIR(VPMIN, gen_helper_neon_pmin)
1248
1249static bool do_vector_2sh(DisasContext *s, arg_2reg_shift *a, GVecGen2iFn *fn)
1250{
1251    /* Handle a 2-reg-shift insn which can be vectorized. */
1252    int vec_size = a->q ? 16 : 8;
1253    int rd_ofs = neon_full_reg_offset(a->vd);
1254    int rm_ofs = neon_full_reg_offset(a->vm);
1255
1256    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1257        return false;
1258    }
1259
1260    /* UNDEF accesses to D16-D31 if they don't exist. */
1261    if (!dc_isar_feature(aa32_simd_r32, s) &&
1262        ((a->vd | a->vm) & 0x10)) {
1263        return false;
1264    }
1265
1266    if ((a->vm | a->vd) & a->q) {
1267        return false;
1268    }
1269
1270    if (!vfp_access_check(s)) {
1271        return true;
1272    }
1273
1274    fn(a->size, rd_ofs, rm_ofs, a->shift, vec_size, vec_size);
1275    return true;
1276}
1277
/*
 * Trivial trans_* wrapper handing a 2-reg-shift insn straight to
 * the given gvec expander via do_vector_2sh(). Note: no trailing
 * line-continuation after the closing brace — the stray '\' here
 * previously pulled the following line into every expansion, and
 * the other trans-wrapper macros in this file end without one.
 */
#define DO_2SH(INSN, FUNC)                                              \
    static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
    {                                                                   \
        return do_vector_2sh(s, a, FUNC);                               \
    }
1283
/*
 * Shift-by-immediate ops with existing gvec expanders. VSHR is not
 * here: its out-of-range shift counts need special handling (below).
 */
DO_2SH(VSHL, tcg_gen_gvec_shli)
DO_2SH(VSLI, gen_gvec_sli)
DO_2SH(VSRI, gen_gvec_sri)
DO_2SH(VSRA_S, gen_gvec_ssra)
DO_2SH(VSRA_U, gen_gvec_usra)
DO_2SH(VRSHR_S, gen_gvec_srshr)
DO_2SH(VRSHR_U, gen_gvec_urshr)
DO_2SH(VRSRA_S, gen_gvec_srsra)
DO_2SH(VRSRA_U, gen_gvec_ursra)
1293
1294static bool trans_VSHR_S_2sh(DisasContext *s, arg_2reg_shift *a)
1295{
1296    /* Signed shift out of range results in all-sign-bits */
1297    a->shift = MIN(a->shift, (8 << a->size) - 1);
1298    return do_vector_2sh(s, a, tcg_gen_gvec_sari);
1299}
1300
/*
 * GVecGen2iFn-compatible expander that just zeroes the destination;
 * rm_ofs and shift are deliberately unused (see trans_VSHR_U_2sh).
 */
static void gen_zero_rd_2sh(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                            int64_t shift, uint32_t oprsz, uint32_t maxsz)
{
    tcg_gen_gvec_dup_imm(vece, rd_ofs, oprsz, maxsz, 0);
}
1306
1307static bool trans_VSHR_U_2sh(DisasContext *s, arg_2reg_shift *a)
1308{
1309    /* Shift out of range is architecturally valid and results in zero. */
1310    if (a->shift >= (8 << a->size)) {
1311        return do_vector_2sh(s, a, gen_zero_rd_2sh);
1312    } else {
1313        return do_vector_2sh(s, a, tcg_gen_gvec_shri);
1314    }
1315}
1316
1317static bool do_2shift_env_64(DisasContext *s, arg_2reg_shift *a,
1318                             NeonGenTwo64OpEnvFn *fn)
1319{
1320    /*
1321     * 2-reg-and-shift operations, size == 3 case, where the
1322     * function needs to be passed cpu_env.
1323     */
1324    TCGv_i64 constimm;
1325    int pass;
1326
1327    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1328        return false;
1329    }
1330
1331    /* UNDEF accesses to D16-D31 if they don't exist. */
1332    if (!dc_isar_feature(aa32_simd_r32, s) &&
1333        ((a->vd | a->vm) & 0x10)) {
1334        return false;
1335    }
1336
1337    if ((a->vm | a->vd) & a->q) {
1338        return false;
1339    }
1340
1341    if (!vfp_access_check(s)) {
1342        return true;
1343    }
1344
1345    /*
1346     * To avoid excessive duplication of ops we implement shift
1347     * by immediate using the variable shift operations.
1348     */
1349    constimm = tcg_constant_i64(dup_const(a->size, a->shift));
1350
1351    for (pass = 0; pass < a->q + 1; pass++) {
1352        TCGv_i64 tmp = tcg_temp_new_i64();
1353
1354        read_neon_element64(tmp, a->vm, pass, MO_64);
1355        fn(tmp, cpu_env, tmp, constimm);
1356        write_neon_element64(tmp, a->vd, pass, MO_64);
1357        tcg_temp_free_i64(tmp);
1358    }
1359    return true;
1360}
1361
1362static bool do_2shift_env_32(DisasContext *s, arg_2reg_shift *a,
1363                             NeonGenTwoOpEnvFn *fn)
1364{
1365    /*
1366     * 2-reg-and-shift operations, size < 3 case, where the
1367     * helper needs to be passed cpu_env.
1368     */
1369    TCGv_i32 constimm, tmp;
1370    int pass;
1371
1372    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1373        return false;
1374    }
1375
1376    /* UNDEF accesses to D16-D31 if they don't exist. */
1377    if (!dc_isar_feature(aa32_simd_r32, s) &&
1378        ((a->vd | a->vm) & 0x10)) {
1379        return false;
1380    }
1381
1382    if ((a->vm | a->vd) & a->q) {
1383        return false;
1384    }
1385
1386    if (!vfp_access_check(s)) {
1387        return true;
1388    }
1389
1390    /*
1391     * To avoid excessive duplication of ops we implement shift
1392     * by immediate using the variable shift operations.
1393     */
1394    constimm = tcg_constant_i32(dup_const(a->size, a->shift));
1395    tmp = tcg_temp_new_i32();
1396
1397    for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
1398        read_neon_element32(tmp, a->vm, pass, MO_32);
1399        fn(tmp, cpu_env, tmp, constimm);
1400        write_neon_element32(tmp, a->vd, pass, MO_32);
1401    }
1402    tcg_temp_free_i32(tmp);
1403    return true;
1404}
1405
/*
 * 2-reg-shift insns whose helpers need cpu_env: build a _64_2sh
 * trans function for size == 3 plus a _2sh one for the 8/16/32-bit
 * element sizes (fns[] indexed by a->size).
 */
#define DO_2SHIFT_ENV(INSN, FUNC)                                       \
    static bool trans_##INSN##_64_2sh(DisasContext *s, arg_2reg_shift *a) \
    {                                                                   \
        return do_2shift_env_64(s, a, gen_helper_neon_##FUNC##64);      \
    }                                                                   \
    static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
    {                                                                   \
        static NeonGenTwoOpEnvFn * const fns[] = {                      \
            gen_helper_neon_##FUNC##8,                                  \
            gen_helper_neon_##FUNC##16,                                 \
            gen_helper_neon_##FUNC##32,                                 \
        };                                                              \
        assert(a->size < ARRAY_SIZE(fns));                              \
        return do_2shift_env_32(s, a, fns[a->size]);                    \
    }
1421
/* Shift-left-by-immediate forms whose helpers take cpu_env. */
DO_2SHIFT_ENV(VQSHLU, qshlu_s)
DO_2SHIFT_ENV(VQSHL_U, qshl_u)
DO_2SHIFT_ENV(VQSHL_S, qshl_s)
1425
static bool do_2shift_narrow_64(DisasContext *s, arg_2reg_shift *a,
                                NeonGenTwo64OpFn *shiftfn,
                                NeonGenNarrowEnvFn *narrowfn)
{
    /* 2-reg-and-shift narrowing-shift operations, size == 3 case */
    TCGv_i64 constimm, rm1, rm2;
    TCGv_i32 rd;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    /* The source spans a Q register, so vm must be even. */
    if (a->vm & 1) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /*
     * This is always a right shift, and the shiftfn is always a
     * left-shift helper, which thus needs the negated shift count.
     */
    constimm = tcg_constant_i64(-a->shift);
    rm1 = tcg_temp_new_i64();
    rm2 = tcg_temp_new_i64();
    rd = tcg_temp_new_i32();

    /* Load both inputs first to avoid potential overwrite if rm == rd */
    read_neon_element64(rm1, a->vm, 0, MO_64);
    read_neon_element64(rm2, a->vm, 1, MO_64);

    /* cpu_env is ignored by the plain narrowfns (see gen_neon_narrow_*) */
    shiftfn(rm1, rm1, constimm);
    narrowfn(rd, cpu_env, rm1);
    write_neon_element32(rd, a->vd, 0, MO_32);

    shiftfn(rm2, rm2, constimm);
    narrowfn(rd, cpu_env, rm2);
    write_neon_element32(rd, a->vd, 1, MO_32);

    tcg_temp_free_i32(rd);
    tcg_temp_free_i64(rm1);
    tcg_temp_free_i64(rm2);

    return true;
}
1479
static bool do_2shift_narrow_32(DisasContext *s, arg_2reg_shift *a,
                                NeonGenTwoOpFn *shiftfn,
                                NeonGenNarrowEnvFn *narrowfn)
{
    /* 2-reg-and-shift narrowing-shift operations, size < 3 case */
    TCGv_i32 constimm, rm1, rm2, rm3, rm4;
    TCGv_i64 rtmp;
    uint32_t imm;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    /* The source spans a Q register, so vm must be even. */
    if (a->vm & 1) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /*
     * This is always a right shift, and the shiftfn is always a
     * left-shift helper, which thus needs the negated shift count
     * duplicated into each lane of the immediate value.
     */
    if (a->size == 1) {
        imm = (uint16_t)(-a->shift);
        imm |= imm << 16;
    } else {
        /* size == 2 */
        imm = -a->shift;
    }
    constimm = tcg_constant_i32(imm);

    /* Load all inputs first to avoid potential overwrite */
    rm1 = tcg_temp_new_i32();
    rm2 = tcg_temp_new_i32();
    rm3 = tcg_temp_new_i32();
    rm4 = tcg_temp_new_i32();
    read_neon_element32(rm1, a->vm, 0, MO_32);
    read_neon_element32(rm2, a->vm, 1, MO_32);
    read_neon_element32(rm3, a->vm, 2, MO_32);
    read_neon_element32(rm4, a->vm, 3, MO_32);
    rtmp = tcg_temp_new_i64();

    shiftfn(rm1, rm1, constimm);
    shiftfn(rm2, rm2, constimm);

    /* rm2 is dead once concatenated into the 64-bit temp. */
    tcg_gen_concat_i32_i64(rtmp, rm1, rm2);
    tcg_temp_free_i32(rm2);

    /* rm1 is reused as the 32-bit narrowed result. */
    narrowfn(rm1, cpu_env, rtmp);
    write_neon_element32(rm1, a->vd, 0, MO_32);
    tcg_temp_free_i32(rm1);

    shiftfn(rm3, rm3, constimm);
    shiftfn(rm4, rm4, constimm);

    tcg_gen_concat_i32_i64(rtmp, rm3, rm4);
    tcg_temp_free_i32(rm4);

    narrowfn(rm3, cpu_env, rtmp);
    tcg_temp_free_i64(rtmp);
    write_neon_element32(rm3, a->vd, 1, MO_32);
    tcg_temp_free_i32(rm3);
    return true;
}
1554
/*
 * Build a trans function for a narrowing shift: _64 for the
 * size == 3 implementation, _32 for the size < 3 one.
 */
#define DO_2SN_64(INSN, FUNC, NARROWFUNC)                               \
    static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
    {                                                                   \
        return do_2shift_narrow_64(s, a, FUNC, NARROWFUNC);             \
    }
#define DO_2SN_32(INSN, FUNC, NARROWFUNC)                               \
    static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
    {                                                                   \
        return do_2shift_narrow_32(s, a, FUNC, NARROWFUNC);             \
    }
1565
/*
 * Adapter giving plain 64->32 truncation the NeonGenNarrowEnvFn
 * signature; the env argument is unused.
 */
static void gen_neon_narrow_u32(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
{
    tcg_gen_extrl_i64_i32(dest, src);
}
1570
/* As above: env-taking adapter for the env-less u16 narrow helper. */
static void gen_neon_narrow_u16(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
{
    gen_helper_neon_narrow_u16(dest, src);
}
1575
/* As above: env-taking adapter for the env-less u8 narrow helper. */
static void gen_neon_narrow_u8(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
{
    gen_helper_neon_narrow_u8(dest, src);
}
1580
/* Plain shift-right-and-narrow: narrowfn simply truncates. */
DO_2SN_64(VSHRN_64, gen_ushl_i64, gen_neon_narrow_u32)
DO_2SN_32(VSHRN_32, gen_ushl_i32, gen_neon_narrow_u16)
DO_2SN_32(VSHRN_16, gen_helper_neon_shl_u16, gen_neon_narrow_u8)

/* Rounding variants use the rshl helpers for the shift. */
DO_2SN_64(VRSHRN_64, gen_helper_neon_rshl_u64, gen_neon_narrow_u32)
DO_2SN_32(VRSHRN_32, gen_helper_neon_rshl_u32, gen_neon_narrow_u16)
DO_2SN_32(VRSHRN_16, gen_helper_neon_rshl_u16, gen_neon_narrow_u8)

/* Saturating narrows from signed input to unsigned result. */
DO_2SN_64(VQSHRUN_64, gen_sshl_i64, gen_helper_neon_unarrow_sat32)
DO_2SN_32(VQSHRUN_32, gen_sshl_i32, gen_helper_neon_unarrow_sat16)
DO_2SN_32(VQSHRUN_16, gen_helper_neon_shl_s16, gen_helper_neon_unarrow_sat8)

DO_2SN_64(VQRSHRUN_64, gen_helper_neon_rshl_s64, gen_helper_neon_unarrow_sat32)
DO_2SN_32(VQRSHRUN_32, gen_helper_neon_rshl_s32, gen_helper_neon_unarrow_sat16)
DO_2SN_32(VQRSHRUN_16, gen_helper_neon_rshl_s16, gen_helper_neon_unarrow_sat8)
/* Signed saturating narrows. */
DO_2SN_64(VQSHRN_S64, gen_sshl_i64, gen_helper_neon_narrow_sat_s32)
DO_2SN_32(VQSHRN_S32, gen_sshl_i32, gen_helper_neon_narrow_sat_s16)
DO_2SN_32(VQSHRN_S16, gen_helper_neon_shl_s16, gen_helper_neon_narrow_sat_s8)

DO_2SN_64(VQRSHRN_S64, gen_helper_neon_rshl_s64, gen_helper_neon_narrow_sat_s32)
DO_2SN_32(VQRSHRN_S32, gen_helper_neon_rshl_s32, gen_helper_neon_narrow_sat_s16)
DO_2SN_32(VQRSHRN_S16, gen_helper_neon_rshl_s16, gen_helper_neon_narrow_sat_s8)

/* Unsigned saturating narrows. */
DO_2SN_64(VQSHRN_U64, gen_ushl_i64, gen_helper_neon_narrow_sat_u32)
DO_2SN_32(VQSHRN_U32, gen_ushl_i32, gen_helper_neon_narrow_sat_u16)
DO_2SN_32(VQSHRN_U16, gen_helper_neon_shl_u16, gen_helper_neon_narrow_sat_u8)

DO_2SN_64(VQRSHRN_U64, gen_helper_neon_rshl_u64, gen_helper_neon_narrow_sat_u32)
DO_2SN_32(VQRSHRN_U32, gen_helper_neon_rshl_u32, gen_helper_neon_narrow_sat_u16)
DO_2SN_32(VQRSHRN_U16, gen_helper_neon_rshl_u16, gen_helper_neon_narrow_sat_u8)
1611
static bool do_vshll_2sh(DisasContext *s, arg_2reg_shift *a,
                         NeonGenWidenFn *widenfn, bool u)
{
    /*
     * VSHLL: widen each element of the D-reg source and left-shift it
     * into the Q-reg destination. 'u' is true when widenfn is one of
     * the unsigned (zero-extending) variants; see trans_VSHLL_U_2sh.
     */
    TCGv_i64 tmp;
    TCGv_i32 rm0, rm1;
    uint64_t widen_mask = 0;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    /* The destination spans a Q register, so vd must be even. */
    if (a->vd & 1) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /*
     * This is a widen-and-shift operation. The shift is always less
     * than the width of the source type, so after widening the input
     * vector we can simply shift the whole 64-bit widened register,
     * and then clear the potential overflow bits resulting from left
     * bits of the narrow input appearing as right bits of the left
     * neighbour narrow input. Calculate a mask of bits to clear.
     */
    if ((a->shift != 0) && (a->size < 2 || u)) {
        int esize = 8 << a->size;
        widen_mask = MAKE_64BIT_MASK(0, esize);
        widen_mask >>= esize - a->shift;
        widen_mask = dup_const(a->size + 1, widen_mask);
    }

    rm0 = tcg_temp_new_i32();
    rm1 = tcg_temp_new_i32();
    read_neon_element32(rm0, a->vm, 0, MO_32);
    read_neon_element32(rm1, a->vm, 1, MO_32);
    tmp = tcg_temp_new_i64();

    widenfn(tmp, rm0);
    tcg_temp_free_i32(rm0);
    if (a->shift != 0) {
        tcg_gen_shli_i64(tmp, tmp, a->shift);
        tcg_gen_andi_i64(tmp, tmp, ~widen_mask);
    }
    write_neon_element64(tmp, a->vd, 0, MO_64);

    widenfn(tmp, rm1);
    tcg_temp_free_i32(rm1);
    if (a->shift != 0) {
        tcg_gen_shli_i64(tmp, tmp, a->shift);
        tcg_gen_andi_i64(tmp, tmp, ~widen_mask);
    }
    write_neon_element64(tmp, a->vd, 1, MO_64);
    tcg_temp_free_i64(tmp);
    return true;
}
1676
static bool trans_VSHLL_S_2sh(DisasContext *s, arg_2reg_shift *a)
{
    /* Signed VSHLL; the widen-fn table has entries for sizes 0..2. */
    static NeonGenWidenFn * const widenfn[] = {
        gen_helper_neon_widen_s8,
        gen_helper_neon_widen_s16,
        tcg_gen_ext_i32_i64,
    };
    return do_vshll_2sh(s, a, widenfn[a->size], false);
}
1686
static bool trans_VSHLL_U_2sh(DisasContext *s, arg_2reg_shift *a)
{
    /* Unsigned VSHLL; the widen-fn table has entries for sizes 0..2. */
    static NeonGenWidenFn * const widenfn[] = {
        gen_helper_neon_widen_u8,
        gen_helper_neon_widen_u16,
        tcg_gen_extu_i32_i64,
    };
    return do_vshll_2sh(s, a, widenfn[a->size], true);
}
1696
/*
 * Perform one of the FP insns in the 2-reg-and-shift group (the
 * fixed-point <-> FP VCVT variants) as a gvec operation; 'fn'
 * receives the shift amount as its gvec immediate.  Returns false
 * to UNDEF the insn, true once it has been handled (including the
 * case where vfp_access_check has raised an exception).
 */
static bool do_fp_2sh(DisasContext *s, arg_2reg_shift *a,
                      gen_helper_gvec_2_ptr *fn)
{
    /* FP operations in 2-reg-and-shift group */
    int vec_size = a->q ? 16 : 8;
    int rd_ofs = neon_full_reg_offset(a->vd);
    int rm_ofs = neon_full_reg_offset(a->vm);
    TCGv_ptr fpst;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* Half-precision forms need the fp16 arithmetic extension */
    if (a->size == MO_16) {
        if (!dc_isar_feature(aa32_fp16_arith, s)) {
            return false;
        }
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    /* Quad insns must use even-numbered registers */
    if ((a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /* Neon uses the "standard FPSCR" flavour of fp status */
    fpst = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD);
    tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, vec_size, vec_size, a->shift, fn);
    tcg_temp_free_ptr(fpst);
    return true;
}
1735
/*
 * Expand a trans function for each fixed-point <-> FP conversion;
 * they all share the do_fp_2sh() framework above.
 */
#define DO_FP_2SH(INSN, FUNC)                                           \
    static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
    {                                                                   \
        return do_fp_2sh(s, a, FUNC);                                   \
    }

/* Single-precision <-> fixed-point conversions */
DO_FP_2SH(VCVT_SF, gen_helper_gvec_vcvt_sf)
DO_FP_2SH(VCVT_UF, gen_helper_gvec_vcvt_uf)
DO_FP_2SH(VCVT_FS, gen_helper_gvec_vcvt_fs)
DO_FP_2SH(VCVT_FU, gen_helper_gvec_vcvt_fu)

/* Half-precision <-> fixed-point conversions */
DO_FP_2SH(VCVT_SH, gen_helper_gvec_vcvt_sh)
DO_FP_2SH(VCVT_UH, gen_helper_gvec_vcvt_uh)
DO_FP_2SH(VCVT_HS, gen_helper_gvec_vcvt_hs)
DO_FP_2SH(VCVT_HU, gen_helper_gvec_vcvt_hu)
1751
/*
 * One register plus modified-immediate (VMOV/VORR/VBIC immediate
 * forms).  The encoded imm/cmode/op fields are expanded into a full
 * 64-bit immediate by asimd_imm_const() and handed to 'fn' as the
 * gvec immediate operand.
 */
static bool do_1reg_imm(DisasContext *s, arg_1reg_imm *a,
                        GVecGen2iFn *fn)
{
    uint64_t imm;
    int reg_ofs, vec_size;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
        return false;
    }

    /* Quad forms must use an even-numbered register */
    if (a->vd & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    reg_ofs = neon_full_reg_offset(a->vd);
    vec_size = a->q ? 16 : 8;
    imm = asimd_imm_const(a->imm, a->cmode, a->op);

    fn(MO_64, reg_ofs, reg_ofs, imm, vec_size, vec_size);
    return true;
}
1782
/*
 * GVecGen2iFn-compatible expander for VMOV (immediate): 'c' already
 * holds the fully expanded 64-bit immediate, so simply replicate it
 * across the destination.  'vece' and 'aofs' are intentionally unused.
 */
static void gen_VMOV_1r(unsigned vece, uint32_t dofs, uint32_t aofs,
                        int64_t c, uint32_t oprsz, uint32_t maxsz)
{
    tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, c);
}
1788
1789static bool trans_Vimm_1r(DisasContext *s, arg_1reg_imm *a)
1790{
1791    /* Handle decode of cmode/op here between VORR/VBIC/VMOV */
1792    GVecGen2iFn *fn;
1793
1794    if ((a->cmode & 1) && a->cmode < 12) {
1795        /* for op=1, the imm will be inverted, so BIC becomes AND. */
1796        fn = a->op ? tcg_gen_gvec_andi : tcg_gen_gvec_ori;
1797    } else {
1798        /* There is one unallocated cmode/op combination in this space */
1799        if (a->cmode == 15 && a->op == 1) {
1800            return false;
1801        }
1802        fn = gen_VMOV_1r;
1803    }
1804    return do_1reg_imm(s, a, fn);
1805}
1806
/*
 * 3-regs-different-lengths, prewidening: VADDL/VSUBL (both operands
 * narrow and widened before the op) and VADDW/VSUBW (first operand
 * already wide).  srcN_mop is the MemOp for a direct 64-bit read of
 * operand N, or negative when the operand must be read as 32 bits
 * and passed through widenfn.
 */
static bool do_prewiden_3d(DisasContext *s, arg_3diff *a,
                           NeonGenWidenFn *widenfn,
                           NeonGenTwo64OpFn *opfn,
                           int src1_mop, int src2_mop)
{
    /* 3-regs different lengths, prewidening case (VADDL/VSUBL/VADDW/VSUBW) */
    TCGv_i64 rn0_64, rn1_64, rm_64;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if (!opfn) {
        /* size == 3 case, which is an entirely different insn group */
        return false;
    }

    /* Vd (and a wide Vn) are Q registers: must be even-numbered */
    if ((a->vd & 1) || (src1_mop == MO_UQ && (a->vn & 1))) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    rn0_64 = tcg_temp_new_i64();
    rn1_64 = tcg_temp_new_i64();
    rm_64 = tcg_temp_new_i64();

    if (src1_mop >= 0) {
        read_neon_element64(rn0_64, a->vn, 0, src1_mop);
    } else {
        TCGv_i32 tmp = tcg_temp_new_i32();
        read_neon_element32(tmp, a->vn, 0, MO_32);
        widenfn(rn0_64, tmp);
        tcg_temp_free_i32(tmp);
    }
    if (src2_mop >= 0) {
        read_neon_element64(rm_64, a->vm, 0, src2_mop);
    } else {
        TCGv_i32 tmp = tcg_temp_new_i32();
        read_neon_element32(tmp, a->vm, 0, MO_32);
        widenfn(rm_64, tmp);
        tcg_temp_free_i32(tmp);
    }

    opfn(rn0_64, rn0_64, rm_64);

    /*
     * Load second pass inputs before storing the first pass result, to
     * avoid incorrect results if a narrow input overlaps with the result.
     */
    if (src1_mop >= 0) {
        read_neon_element64(rn1_64, a->vn, 1, src1_mop);
    } else {
        TCGv_i32 tmp = tcg_temp_new_i32();
        read_neon_element32(tmp, a->vn, 1, MO_32);
        widenfn(rn1_64, tmp);
        tcg_temp_free_i32(tmp);
    }
    if (src2_mop >= 0) {
        read_neon_element64(rm_64, a->vm, 1, src2_mop);
    } else {
        TCGv_i32 tmp = tcg_temp_new_i32();
        read_neon_element32(tmp, a->vm, 1, MO_32);
        widenfn(rm_64, tmp);
        tcg_temp_free_i32(tmp);
    }

    write_neon_element64(rn0_64, a->vd, 0, MO_64);

    opfn(rn1_64, rn1_64, rm_64);
    write_neon_element64(rn1_64, a->vd, 1, MO_64);

    tcg_temp_free_i64(rn0_64);
    tcg_temp_free_i64(rn1_64);
    tcg_temp_free_i64(rm_64);

    return true;
}
1893
/*
 * Expand trans functions for the prewidening 3-reg-different-length
 * ops.  S is the signedness suffix for the widening helpers, OP is
 * the 64-bit arithmetic op, SRC1WIDE is true for the VADDW/VSUBW
 * forms (first operand already wide), and SIGN is OR'd into the
 * MemOp used for a direct widening load when size == MO_32.
 */
#define DO_PREWIDEN(INSN, S, OP, SRC1WIDE, SIGN)                        \
    static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
    {                                                                   \
        static NeonGenWidenFn * const widenfn[] = {                     \
            gen_helper_neon_widen_##S##8,                               \
            gen_helper_neon_widen_##S##16,                              \
            NULL, NULL,                                                 \
        };                                                              \
        static NeonGenTwo64OpFn * const addfn[] = {                     \
            gen_helper_neon_##OP##l_u16,                                \
            gen_helper_neon_##OP##l_u32,                                \
            tcg_gen_##OP##_i64,                                         \
            NULL,                                                       \
        };                                                              \
        int narrow_mop = a->size == MO_32 ? MO_32 | SIGN : -1;          \
        return do_prewiden_3d(s, a, widenfn[a->size], addfn[a->size],   \
                              SRC1WIDE ? MO_UQ : narrow_mop,             \
                              narrow_mop);                              \
    }

DO_PREWIDEN(VADDL_S, s, add, false, MO_SIGN)
DO_PREWIDEN(VADDL_U, u, add, false, 0)
DO_PREWIDEN(VSUBL_S, s, sub, false, MO_SIGN)
DO_PREWIDEN(VSUBL_U, u, sub, false, 0)
DO_PREWIDEN(VADDW_S, s, add, true, MO_SIGN)
DO_PREWIDEN(VADDW_U, u, add, true, 0)
DO_PREWIDEN(VSUBW_S, s, sub, true, MO_SIGN)
DO_PREWIDEN(VSUBW_U, u, sub, true, 0)
1922
/*
 * 3-regs-different-lengths, narrowing: perform the 64-bit op per
 * pass and narrow each result to 32 bits via narrowfn.  Both opfn
 * and narrowfn must be non-NULL (size == 3 decodes elsewhere).
 */
static bool do_narrow_3d(DisasContext *s, arg_3diff *a,
                         NeonGenTwo64OpFn *opfn, NeonGenNarrowFn *narrowfn)
{
    /* 3-regs different lengths, narrowing (VADDHN/VSUBHN/VRADDHN/VRSUBHN) */
    TCGv_i64 rn_64, rm_64;
    TCGv_i32 rd0, rd1;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if (!opfn || !narrowfn) {
        /* size == 3 case, which is an entirely different insn group */
        return false;
    }

    /* The wide sources are Q registers: must be even-numbered */
    if ((a->vn | a->vm) & 1) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    rn_64 = tcg_temp_new_i64();
    rm_64 = tcg_temp_new_i64();
    rd0 = tcg_temp_new_i32();
    rd1 = tcg_temp_new_i32();

    read_neon_element64(rn_64, a->vn, 0, MO_64);
    read_neon_element64(rm_64, a->vm, 0, MO_64);

    opfn(rn_64, rn_64, rm_64);

    narrowfn(rd0, rn_64);

    read_neon_element64(rn_64, a->vn, 1, MO_64);
    read_neon_element64(rm_64, a->vm, 1, MO_64);

    opfn(rn_64, rn_64, rm_64);

    narrowfn(rd1, rn_64);

    /* Write back only after both passes are computed: Vd may overlap */
    write_neon_element32(rd0, a->vd, 0, MO_32);
    write_neon_element32(rd1, a->vd, 1, MO_32);

    tcg_temp_free_i32(rd0);
    tcg_temp_free_i32(rd1);
    tcg_temp_free_i64(rn_64);
    tcg_temp_free_i64(rm_64);

    return true;
}
1982
/*
 * Expand trans functions for the narrowing 3-reg-different-length
 * ops.  OP selects add or sub; NARROWTYPE/EXTOP select the plain or
 * rounding take-the-high-half narrowing functions.
 */
#define DO_NARROW_3D(INSN, OP, NARROWTYPE, EXTOP)                       \
    static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
    {                                                                   \
        static NeonGenTwo64OpFn * const addfn[] = {                     \
            gen_helper_neon_##OP##l_u16,                                \
            gen_helper_neon_##OP##l_u32,                                \
            tcg_gen_##OP##_i64,                                         \
            NULL,                                                       \
        };                                                              \
        static NeonGenNarrowFn * const narrowfn[] = {                   \
            gen_helper_neon_##NARROWTYPE##_high_u8,                     \
            gen_helper_neon_##NARROWTYPE##_high_u16,                    \
            EXTOP,                                                      \
            NULL,                                                       \
        };                                                              \
        return do_narrow_3d(s, a, addfn[a->size], narrowfn[a->size]);   \
    }
2000
/*
 * Rounding narrow-to-high-half for 64-bit lanes: add the rounding
 * constant (half the weight of the result LSB) before extracting
 * the top 32 bits.  Note this modifies rn in place.
 */
static void gen_narrow_round_high_u32(TCGv_i32 rd, TCGv_i64 rn)
{
    tcg_gen_addi_i64(rn, rn, 1u << 31);
    tcg_gen_extrh_i64_i32(rd, rn);
}
2006
/* Plain and rounding variants of the narrowing add/sub-high insns */
DO_NARROW_3D(VADDHN, add, narrow, tcg_gen_extrh_i64_i32)
DO_NARROW_3D(VSUBHN, sub, narrow, tcg_gen_extrh_i64_i32)
DO_NARROW_3D(VRADDHN, add, narrow_round, gen_narrow_round_high_u32)
DO_NARROW_3D(VRSUBHN, sub, narrow_round, gen_narrow_round_high_u32)
2011
static bool do_long_3d(DisasContext *s, arg_3diff *a,
                       NeonGenTwoOpWidenFn *opfn,
                       NeonGenTwo64OpFn *accfn)
{
    /*
     * 3-regs different lengths, long operations.
     * These perform an operation on two inputs that returns a double-width
     * result, and then possibly perform an accumulation operation of
     * that result into the double-width destination.
     * accfn may be NULL for the non-accumulating forms (VABDL, VMULL,
     * VQDMULL); opfn must be non-NULL (size == 3 decodes elsewhere).
     */
    TCGv_i64 rd0, rd1, tmp;
    TCGv_i32 rn, rm;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if (!opfn) {
        /* size == 3 case, which is an entirely different insn group */
        return false;
    }

    /* Vd is a Q register: must be even-numbered */
    if (a->vd & 1) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    rd0 = tcg_temp_new_i64();
    rd1 = tcg_temp_new_i64();

    rn = tcg_temp_new_i32();
    rm = tcg_temp_new_i32();
    read_neon_element32(rn, a->vn, 0, MO_32);
    read_neon_element32(rm, a->vm, 0, MO_32);
    opfn(rd0, rn, rm);

    read_neon_element32(rn, a->vn, 1, MO_32);
    read_neon_element32(rm, a->vm, 1, MO_32);
    opfn(rd1, rn, rm);
    tcg_temp_free_i32(rn);
    tcg_temp_free_i32(rm);

    /* Don't store results until after all loads: they might overlap */
    if (accfn) {
        tmp = tcg_temp_new_i64();
        read_neon_element64(tmp, a->vd, 0, MO_64);
        accfn(rd0, tmp, rd0);
        read_neon_element64(tmp, a->vd, 1, MO_64);
        accfn(rd1, tmp, rd1);
        tcg_temp_free_i64(tmp);
    }

    write_neon_element64(rd0, a->vd, 0, MO_64);
    write_neon_element64(rd1, a->vd, 1, MO_64);
    tcg_temp_free_i64(rd0);
    tcg_temp_free_i64(rd1);

    return true;
}
2080
2081static bool trans_VABDL_S_3d(DisasContext *s, arg_3diff *a)
2082{
2083    static NeonGenTwoOpWidenFn * const opfn[] = {
2084        gen_helper_neon_abdl_s16,
2085        gen_helper_neon_abdl_s32,
2086        gen_helper_neon_abdl_s64,
2087        NULL,
2088    };
2089
2090    return do_long_3d(s, a, opfn[a->size], NULL);
2091}
2092
2093static bool trans_VABDL_U_3d(DisasContext *s, arg_3diff *a)
2094{
2095    static NeonGenTwoOpWidenFn * const opfn[] = {
2096        gen_helper_neon_abdl_u16,
2097        gen_helper_neon_abdl_u32,
2098        gen_helper_neon_abdl_u64,
2099        NULL,
2100    };
2101
2102    return do_long_3d(s, a, opfn[a->size], NULL);
2103}
2104
2105static bool trans_VABAL_S_3d(DisasContext *s, arg_3diff *a)
2106{
2107    static NeonGenTwoOpWidenFn * const opfn[] = {
2108        gen_helper_neon_abdl_s16,
2109        gen_helper_neon_abdl_s32,
2110        gen_helper_neon_abdl_s64,
2111        NULL,
2112    };
2113    static NeonGenTwo64OpFn * const addfn[] = {
2114        gen_helper_neon_addl_u16,
2115        gen_helper_neon_addl_u32,
2116        tcg_gen_add_i64,
2117        NULL,
2118    };
2119
2120    return do_long_3d(s, a, opfn[a->size], addfn[a->size]);
2121}
2122
2123static bool trans_VABAL_U_3d(DisasContext *s, arg_3diff *a)
2124{
2125    static NeonGenTwoOpWidenFn * const opfn[] = {
2126        gen_helper_neon_abdl_u16,
2127        gen_helper_neon_abdl_u32,
2128        gen_helper_neon_abdl_u64,
2129        NULL,
2130    };
2131    static NeonGenTwo64OpFn * const addfn[] = {
2132        gen_helper_neon_addl_u16,
2133        gen_helper_neon_addl_u32,
2134        tcg_gen_add_i64,
2135        NULL,
2136    };
2137
2138    return do_long_3d(s, a, opfn[a->size], addfn[a->size]);
2139}
2140
/*
 * Widening 32x32->64 signed multiply: compute the full product with
 * muls2 and concatenate the low/high halves into the 64-bit result.
 */
static void gen_mull_s32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
{
    TCGv_i32 lo = tcg_temp_new_i32();
    TCGv_i32 hi = tcg_temp_new_i32();

    tcg_gen_muls2_i32(lo, hi, rn, rm);
    tcg_gen_concat_i32_i64(rd, lo, hi);

    tcg_temp_free_i32(lo);
    tcg_temp_free_i32(hi);
}
2152
/* As gen_mull_s32, but an unsigned 32x32->64 multiply. */
static void gen_mull_u32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
{
    TCGv_i32 lo = tcg_temp_new_i32();
    TCGv_i32 hi = tcg_temp_new_i32();

    tcg_gen_mulu2_i32(lo, hi, rn, rm);
    tcg_gen_concat_i32_i64(rd, lo, hi);

    tcg_temp_free_i32(lo);
    tcg_temp_free_i32(hi);
}
2164
2165static bool trans_VMULL_S_3d(DisasContext *s, arg_3diff *a)
2166{
2167    static NeonGenTwoOpWidenFn * const opfn[] = {
2168        gen_helper_neon_mull_s8,
2169        gen_helper_neon_mull_s16,
2170        gen_mull_s32,
2171        NULL,
2172    };
2173
2174    return do_long_3d(s, a, opfn[a->size], NULL);
2175}
2176
2177static bool trans_VMULL_U_3d(DisasContext *s, arg_3diff *a)
2178{
2179    static NeonGenTwoOpWidenFn * const opfn[] = {
2180        gen_helper_neon_mull_u8,
2181        gen_helper_neon_mull_u16,
2182        gen_mull_u32,
2183        NULL,
2184    };
2185
2186    return do_long_3d(s, a, opfn[a->size], NULL);
2187}
2188
/*
 * Expand trans functions for the widening multiply-accumulate insns:
 * MULL selects the signed/unsigned widening multiply, ACC the add or
 * sub used to fold the product into the double-width destination.
 */
#define DO_VMLAL(INSN,MULL,ACC)                                         \
    static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
    {                                                                   \
        static NeonGenTwoOpWidenFn * const opfn[] = {                   \
            gen_helper_neon_##MULL##8,                                  \
            gen_helper_neon_##MULL##16,                                 \
            gen_##MULL##32,                                             \
            NULL,                                                       \
        };                                                              \
        static NeonGenTwo64OpFn * const accfn[] = {                     \
            gen_helper_neon_##ACC##l_u16,                               \
            gen_helper_neon_##ACC##l_u32,                               \
            tcg_gen_##ACC##_i64,                                        \
            NULL,                                                       \
        };                                                              \
        return do_long_3d(s, a, opfn[a->size], accfn[a->size]);         \
    }

DO_VMLAL(VMLAL_S,mull_s,add)
DO_VMLAL(VMLAL_U,mull_u,add)
DO_VMLAL(VMLSL_S,mull_s,sub)
DO_VMLAL(VMLSL_U,mull_u,sub)
2211
/*
 * VQDMULL (16-bit lanes): widening signed multiply, then double the
 * result by a saturating self-add (which also sets QC on saturation).
 */
static void gen_VQDMULL_16(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
{
    gen_helper_neon_mull_s16(rd, rn, rm);
    gen_helper_neon_addl_saturate_s32(rd, cpu_env, rd, rd);
}
2217
/* As gen_VQDMULL_16, but for 32-bit input lanes. */
static void gen_VQDMULL_32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
{
    gen_mull_s32(rd, rn, rm);
    gen_helper_neon_addl_saturate_s64(rd, cpu_env, rd, rd);
}
2223
2224static bool trans_VQDMULL_3d(DisasContext *s, arg_3diff *a)
2225{
2226    static NeonGenTwoOpWidenFn * const opfn[] = {
2227        NULL,
2228        gen_VQDMULL_16,
2229        gen_VQDMULL_32,
2230        NULL,
2231    };
2232
2233    return do_long_3d(s, a, opfn[a->size], NULL);
2234}
2235
/* Accumulate step for VQDMLAL (16-bit lanes): saturating 32-bit add. */
static void gen_VQDMLAL_acc_16(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
{
    gen_helper_neon_addl_saturate_s32(rd, cpu_env, rn, rm);
}
2240
/* Accumulate step for VQDMLAL (32-bit lanes): saturating 64-bit add. */
static void gen_VQDMLAL_acc_32(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
{
    gen_helper_neon_addl_saturate_s64(rd, cpu_env, rn, rm);
}
2245
2246static bool trans_VQDMLAL_3d(DisasContext *s, arg_3diff *a)
2247{
2248    static NeonGenTwoOpWidenFn * const opfn[] = {
2249        NULL,
2250        gen_VQDMULL_16,
2251        gen_VQDMULL_32,
2252        NULL,
2253    };
2254    static NeonGenTwo64OpFn * const accfn[] = {
2255        NULL,
2256        gen_VQDMLAL_acc_16,
2257        gen_VQDMLAL_acc_32,
2258        NULL,
2259    };
2260
2261    return do_long_3d(s, a, opfn[a->size], accfn[a->size]);
2262}
2263
/*
 * Accumulate step for VQDMLSL (16-bit lanes): saturating subtract,
 * implemented as negate-then-saturating-add.  Note rm is clobbered.
 */
static void gen_VQDMLSL_acc_16(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
{
    gen_helper_neon_negl_u32(rm, rm);
    gen_helper_neon_addl_saturate_s32(rd, cpu_env, rn, rm);
}
2269
/*
 * Accumulate step for VQDMLSL (32-bit lanes): negate then saturating
 * 64-bit add.  Note rm is clobbered.
 */
static void gen_VQDMLSL_acc_32(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
{
    tcg_gen_neg_i64(rm, rm);
    gen_helper_neon_addl_saturate_s64(rd, cpu_env, rn, rm);
}
2275
2276static bool trans_VQDMLSL_3d(DisasContext *s, arg_3diff *a)
2277{
2278    static NeonGenTwoOpWidenFn * const opfn[] = {
2279        NULL,
2280        gen_VQDMULL_16,
2281        gen_VQDMULL_32,
2282        NULL,
2283    };
2284    static NeonGenTwo64OpFn * const accfn[] = {
2285        NULL,
2286        gen_VQDMLSL_acc_16,
2287        gen_VQDMLSL_acc_32,
2288        NULL,
2289    };
2290
2291    return do_long_3d(s, a, opfn[a->size], accfn[a->size]);
2292}
2293
/*
 * VMULL (polynomial): size 0 is the 8x8->16 form; size 2 is the
 * 64x64->128 form, which additionally requires the PMULL extension.
 * The result is always a full 128-bit Q register, hence the fixed
 * oprsz/maxsz of 16 below.
 */
static bool trans_VMULL_P_3d(DisasContext *s, arg_3diff *a)
{
    gen_helper_gvec_3 *fn_gvec;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    /* Vd is a Q register: must be even-numbered */
    if (a->vd & 1) {
        return false;
    }

    switch (a->size) {
    case 0:
        fn_gvec = gen_helper_neon_pmull_h;
        break;
    case 2:
        if (!dc_isar_feature(aa32_pmull, s)) {
            return false;
        }
        fn_gvec = gen_helper_gvec_pmull_q;
        break;
    default:
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    tcg_gen_gvec_3_ool(neon_full_reg_offset(a->vd),
                       neon_full_reg_offset(a->vn),
                       neon_full_reg_offset(a->vm),
                       16, 16, 0, fn_gvec);
    return true;
}
2336
/* Duplicate the low 16 bits of var into both 16-bit halves, in place. */
static void gen_neon_dup_low16(TCGv_i32 var)
{
    TCGv_i32 tmp = tcg_temp_new_i32();
    tcg_gen_ext16u_i32(var, var);
    tcg_gen_shli_i32(tmp, var, 16);
    tcg_gen_or_i32(var, var, tmp);
    tcg_temp_free_i32(tmp);
}
2345
/* Duplicate the high 16 bits of var into both 16-bit halves, in place. */
static void gen_neon_dup_high16(TCGv_i32 var)
{
    TCGv_i32 tmp = tcg_temp_new_i32();
    tcg_gen_andi_i32(var, var, 0xffff0000);
    tcg_gen_shri_i32(tmp, var, 16);
    tcg_gen_or_i32(var, var, tmp);
    tcg_temp_free_i32(tmp);
}
2354
/*
 * Return a new temp holding the scalar operand of a 2-reg-scalar
 * insn.  'reg' packs both the D register number and the element
 * index: for 16-bit scalars, bits [2:0] are the register, bit 3
 * selects the high/low half and bit 4 the 32-bit word; the chosen
 * half is then duplicated into both halves so 32-bit helpers can
 * operate on it.  For 32-bit scalars, bits [3:0] are the register
 * and bit 4 the element.  Caller frees the returned temp.
 */
static inline TCGv_i32 neon_get_scalar(int size, int reg)
{
    TCGv_i32 tmp = tcg_temp_new_i32();
    if (size == MO_16) {
        read_neon_element32(tmp, reg & 7, reg >> 4, MO_32);
        if (reg & 8) {
            gen_neon_dup_high16(tmp);
        } else {
            gen_neon_dup_low16(tmp);
        }
    } else {
        read_neon_element32(tmp, reg & 15, reg >> 4, MO_32);
    }
    return tmp;
}
2370
static bool do_2scalar(DisasContext *s, arg_2scalar *a,
                       NeonGenTwoOpFn *opfn, NeonGenTwoOpFn *accfn)
{
    /*
     * Two registers and a scalar: perform an operation between
     * the input elements and the scalar, and then possibly
     * perform an accumulation operation of that result into the
     * destination.
     * accfn may be NULL for the non-accumulating forms; opfn must
     * be non-NULL.
     */
    TCGv_i32 scalar, tmp;
    int pass;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if (!opfn) {
        /* Bad size (including size == 3, which is a different insn group) */
        return false;
    }

    /* Q forms need even-numbered Vd/Vn */
    if (a->q && ((a->vd | a->vn) & 1)) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    scalar = neon_get_scalar(a->size, a->vm);
    tmp = tcg_temp_new_i32();

    /* Operate on each 32-bit element: 4 passes for Q, 2 for D */
    for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
        read_neon_element32(tmp, a->vn, pass, MO_32);
        opfn(tmp, tmp, scalar);
        if (accfn) {
            TCGv_i32 rd = tcg_temp_new_i32();
            read_neon_element32(rd, a->vd, pass, MO_32);
            accfn(tmp, rd, tmp);
            tcg_temp_free_i32(rd);
        }
        write_neon_element32(tmp, a->vd, pass, MO_32);
    }
    tcg_temp_free_i32(tmp);
    tcg_temp_free_i32(scalar);
    return true;
}
2424
2425static bool trans_VMUL_2sc(DisasContext *s, arg_2scalar *a)
2426{
2427    static NeonGenTwoOpFn * const opfn[] = {
2428        NULL,
2429        gen_helper_neon_mul_u16,
2430        tcg_gen_mul_i32,
2431        NULL,
2432    };
2433
2434    return do_2scalar(s, a, opfn[a->size], NULL);
2435}
2436
2437static bool trans_VMLA_2sc(DisasContext *s, arg_2scalar *a)
2438{
2439    static NeonGenTwoOpFn * const opfn[] = {
2440        NULL,
2441        gen_helper_neon_mul_u16,
2442        tcg_gen_mul_i32,
2443        NULL,
2444    };
2445    static NeonGenTwoOpFn * const accfn[] = {
2446        NULL,
2447        gen_helper_neon_add_u16,
2448        tcg_gen_add_i32,
2449        NULL,
2450    };
2451
2452    return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2453}
2454
2455static bool trans_VMLS_2sc(DisasContext *s, arg_2scalar *a)
2456{
2457    static NeonGenTwoOpFn * const opfn[] = {
2458        NULL,
2459        gen_helper_neon_mul_u16,
2460        tcg_gen_mul_i32,
2461        NULL,
2462    };
2463    static NeonGenTwoOpFn * const accfn[] = {
2464        NULL,
2465        gen_helper_neon_sub_u16,
2466        tcg_gen_sub_i32,
2467        NULL,
2468    };
2469
2470    return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2471}
2472
/*
 * FP two-registers-and-a-scalar, implemented as a gvec indexed op:
 * the element index is passed to 'fn' as the gvec immediate.
 */
static bool do_2scalar_fp_vec(DisasContext *s, arg_2scalar *a,
                              gen_helper_gvec_3_ptr *fn)
{
    /* Two registers and a scalar, using gvec */
    int vec_size = a->q ? 16 : 8;
    int rd_ofs = neon_full_reg_offset(a->vd);
    int rn_ofs = neon_full_reg_offset(a->vn);
    int rm_ofs;
    int idx;
    TCGv_ptr fpstatus;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if (!fn) {
        /* Bad size (including size == 3, which is a different insn group) */
        return false;
    }

    /* Q forms need even-numbered Vd/Vn */
    if (a->q && ((a->vd | a->vn) & 1)) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /* a->vm is M:Vm, which encodes both register and index */
    idx = extract32(a->vm, a->size + 2, 2);
    a->vm = extract32(a->vm, 0, a->size + 2);
    rm_ofs = neon_full_reg_offset(a->vm);

    /* Neon uses the "standard FPSCR" flavour of fp status */
    fpstatus = fpstatus_ptr(a->size == 1 ? FPST_STD_F16 : FPST_STD);
    tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, fpstatus,
                       vec_size, vec_size, idx, fn);
    tcg_temp_free_ptr(fpstatus);
    return true;
}
2518
/*
 * Expand trans functions for the FP by-scalar multiply (and
 * multiply-accumulate) insns; the half-precision form requires the
 * fp16 arithmetic extension.
 */
#define DO_VMUL_F_2sc(NAME, FUNC)                                       \
    static bool trans_##NAME##_F_2sc(DisasContext *s, arg_2scalar *a)   \
    {                                                                   \
        static gen_helper_gvec_3_ptr * const opfn[] = {                 \
            NULL,                                                       \
            gen_helper_##FUNC##_h,                                      \
            gen_helper_##FUNC##_s,                                      \
            NULL,                                                       \
        };                                                              \
        if (a->size == MO_16 && !dc_isar_feature(aa32_fp16_arith, s)) { \
            return false;                                               \
        }                                                               \
        return do_2scalar_fp_vec(s, a, opfn[a->size]);                  \
    }

DO_VMUL_F_2sc(VMUL, gvec_fmul_idx)
DO_VMUL_F_2sc(VMLA, gvec_fmla_nf_idx)
DO_VMUL_F_2sc(VMLS, gvec_fmls_nf_idx)
2537
/*
 * Adapt the env-taking saturating-doubling-multiply helpers to the
 * NeonGenTwoOpFn shape expected by do_2scalar (WRAP_ENV_FN is
 * defined elsewhere; presumably it supplies cpu_env — see its
 * definition).
 */
WRAP_ENV_FN(gen_VQDMULH_16, gen_helper_neon_qdmulh_s16)
WRAP_ENV_FN(gen_VQDMULH_32, gen_helper_neon_qdmulh_s32)
WRAP_ENV_FN(gen_VQRDMULH_16, gen_helper_neon_qrdmulh_s16)
WRAP_ENV_FN(gen_VQRDMULH_32, gen_helper_neon_qrdmulh_s32)
2542
2543static bool trans_VQDMULH_2sc(DisasContext *s, arg_2scalar *a)
2544{
2545    static NeonGenTwoOpFn * const opfn[] = {
2546        NULL,
2547        gen_VQDMULH_16,
2548        gen_VQDMULH_32,
2549        NULL,
2550    };
2551
2552    return do_2scalar(s, a, opfn[a->size], NULL);
2553}
2554
2555static bool trans_VQRDMULH_2sc(DisasContext *s, arg_2scalar *a)
2556{
2557    static NeonGenTwoOpFn * const opfn[] = {
2558        NULL,
2559        gen_VQRDMULH_16,
2560        gen_VQRDMULH_32,
2561        NULL,
2562    };
2563
2564    return do_2scalar(s, a, opfn[a->size], NULL);
2565}
2566
static bool do_vqrdmlah_2sc(DisasContext *s, arg_2scalar *a,
                            NeonGenThreeOpEnvFn *opfn)
{
    /*
     * VQRDMLAH/VQRDMLSH: this is like do_2scalar, but the opfn
     * performs a kind of fused op-then-accumulate using a helper
     * function that takes all of rd, rn and the scalar at once.
     * These insns also require the RDM (v8.1 round-double-multiply)
     * extension.
     */
    TCGv_i32 scalar, rn, rd;
    int pass;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    if (!dc_isar_feature(aa32_rdm, s)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if (!opfn) {
        /* Bad size (including size == 3, which is a different insn group) */
        return false;
    }

    /* Q forms need even-numbered Vd/Vn */
    if (a->q && ((a->vd | a->vn) & 1)) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    scalar = neon_get_scalar(a->size, a->vm);
    rn = tcg_temp_new_i32();
    rd = tcg_temp_new_i32();

    /* Operate on each 32-bit element: 4 passes for Q, 2 for D */
    for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
        read_neon_element32(rn, a->vn, pass, MO_32);
        read_neon_element32(rd, a->vd, pass, MO_32);
        opfn(rd, cpu_env, rn, scalar, rd);
        write_neon_element32(rd, a->vd, pass, MO_32);
    }
    tcg_temp_free_i32(rn);
    tcg_temp_free_i32(rd);
    tcg_temp_free_i32(scalar);

    return true;
}
2621
2622static bool trans_VQRDMLAH_2sc(DisasContext *s, arg_2scalar *a)
2623{
2624    static NeonGenThreeOpEnvFn *opfn[] = {
2625        NULL,
2626        gen_helper_neon_qrdmlah_s16,
2627        gen_helper_neon_qrdmlah_s32,
2628        NULL,
2629    };
2630    return do_vqrdmlah_2sc(s, a, opfn[a->size]);
2631}
2632
2633static bool trans_VQRDMLSH_2sc(DisasContext *s, arg_2scalar *a)
2634{
2635    static NeonGenThreeOpEnvFn *opfn[] = {
2636        NULL,
2637        gen_helper_neon_qrdmlsh_s16,
2638        gen_helper_neon_qrdmlsh_s32,
2639        NULL,
2640    };
2641    return do_vqrdmlah_2sc(s, a, opfn[a->size]);
2642}
2643
static bool do_2scalar_long(DisasContext *s, arg_2scalar *a,
                            NeonGenTwoOpWidenFn *opfn,
                            NeonGenTwo64OpFn *accfn)
{
    /*
     * Two registers and a scalar, long operations: perform an
     * operation on the input elements and the scalar which produces
     * a double-width result, and then possibly perform an accumulation
     * operation of that result into the destination.
     * accfn may be NULL for the non-accumulating forms; opfn must
     * be non-NULL.
     */
    TCGv_i32 scalar, rn;
    TCGv_i64 rn0_64, rn1_64;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if (!opfn) {
        /* Bad size (including size == 3, which is a different insn group) */
        return false;
    }

    /* Vd is a Q register: must be even-numbered */
    if (a->vd & 1) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    scalar = neon_get_scalar(a->size, a->vm);

    /* Load all inputs before writing any outputs, in case of overlap */
    rn = tcg_temp_new_i32();
    read_neon_element32(rn, a->vn, 0, MO_32);
    rn0_64 = tcg_temp_new_i64();
    opfn(rn0_64, rn, scalar);

    read_neon_element32(rn, a->vn, 1, MO_32);
    rn1_64 = tcg_temp_new_i64();
    opfn(rn1_64, rn, scalar);
    tcg_temp_free_i32(rn);
    tcg_temp_free_i32(scalar);

    if (accfn) {
        TCGv_i64 t64 = tcg_temp_new_i64();
        read_neon_element64(t64, a->vd, 0, MO_64);
        accfn(rn0_64, t64, rn0_64);
        read_neon_element64(t64, a->vd, 1, MO_64);
        accfn(rn1_64, t64, rn1_64);
        tcg_temp_free_i64(t64);
    }

    write_neon_element64(rn0_64, a->vd, 0, MO_64);
    write_neon_element64(rn1_64, a->vd, 1, MO_64);
    tcg_temp_free_i64(rn0_64);
    tcg_temp_free_i64(rn1_64);
    return true;
}
2709
2710static bool trans_VMULL_S_2sc(DisasContext *s, arg_2scalar *a)
2711{
2712    static NeonGenTwoOpWidenFn * const opfn[] = {
2713        NULL,
2714        gen_helper_neon_mull_s16,
2715        gen_mull_s32,
2716        NULL,
2717    };
2718
2719    return do_2scalar_long(s, a, opfn[a->size], NULL);
2720}
2721
2722static bool trans_VMULL_U_2sc(DisasContext *s, arg_2scalar *a)
2723{
2724    static NeonGenTwoOpWidenFn * const opfn[] = {
2725        NULL,
2726        gen_helper_neon_mull_u16,
2727        gen_mull_u32,
2728        NULL,
2729    };
2730
2731    return do_2scalar_long(s, a, opfn[a->size], NULL);
2732}
2733
/*
 * VMLAL/VMLSL (scalar): widening multiply by the scalar, then add to
 * or subtract from the double-width destination.  MULL selects the
 * widening-multiply helpers, ACC the accumulate step (add or sub).
 * The NULL table entries make do_2scalar_long() reject sizes 0 and 3.
 */
#define DO_VMLAL_2SC(INSN, MULL, ACC)                                   \
    static bool trans_##INSN##_2sc(DisasContext *s, arg_2scalar *a)     \
    {                                                                   \
        static NeonGenTwoOpWidenFn * const opfn[] = {                   \
            NULL,                                                       \
            gen_helper_neon_##MULL##16,                                 \
            gen_##MULL##32,                                             \
            NULL,                                                       \
        };                                                              \
        static NeonGenTwo64OpFn * const accfn[] = {                     \
            NULL,                                                       \
            gen_helper_neon_##ACC##l_u32,                               \
            tcg_gen_##ACC##_i64,                                        \
            NULL,                                                       \
        };                                                              \
        return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);    \
    }

DO_VMLAL_2SC(VMLAL_S, mull_s, add)
DO_VMLAL_2SC(VMLAL_U, mull_u, add)
DO_VMLAL_2SC(VMLSL_S, mull_s, sub)
DO_VMLAL_2SC(VMLSL_U, mull_u, sub)
2756
2757static bool trans_VQDMULL_2sc(DisasContext *s, arg_2scalar *a)
2758{
2759    static NeonGenTwoOpWidenFn * const opfn[] = {
2760        NULL,
2761        gen_VQDMULL_16,
2762        gen_VQDMULL_32,
2763        NULL,
2764    };
2765
2766    return do_2scalar_long(s, a, opfn[a->size], NULL);
2767}
2768
2769static bool trans_VQDMLAL_2sc(DisasContext *s, arg_2scalar *a)
2770{
2771    static NeonGenTwoOpWidenFn * const opfn[] = {
2772        NULL,
2773        gen_VQDMULL_16,
2774        gen_VQDMULL_32,
2775        NULL,
2776    };
2777    static NeonGenTwo64OpFn * const accfn[] = {
2778        NULL,
2779        gen_VQDMLAL_acc_16,
2780        gen_VQDMLAL_acc_32,
2781        NULL,
2782    };
2783
2784    return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);
2785}
2786
2787static bool trans_VQDMLSL_2sc(DisasContext *s, arg_2scalar *a)
2788{
2789    static NeonGenTwoOpWidenFn * const opfn[] = {
2790        NULL,
2791        gen_VQDMULL_16,
2792        gen_VQDMULL_32,
2793        NULL,
2794    };
2795    static NeonGenTwo64OpFn * const accfn[] = {
2796        NULL,
2797        gen_VQDMLSL_acc_16,
2798        gen_VQDMLSL_acc_32,
2799        NULL,
2800    };
2801
2802    return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);
2803}
2804
static bool trans_VEXT(DisasContext *s, arg_VEXT *a)
{
    /*
     * VEXT: extract a contiguous run of bytes starting at byte
     * offset imm from the concatenation <Vm:Vn>.
     */
    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    /* Q registers must be even-numbered. */
    if ((a->vn | a->vm | a->vd) & a->q) {
        return false;
    }

    /* In the D-register form only byte offsets 0..7 are valid. */
    if (a->imm > 7 && !a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    if (!a->q) {
        /* Extract 64 bits from <Vm:Vn> */
        TCGv_i64 left, right, dest;

        left = tcg_temp_new_i64();
        right = tcg_temp_new_i64();
        dest = tcg_temp_new_i64();

        read_neon_element64(right, a->vn, 0, MO_64);
        read_neon_element64(left, a->vm, 0, MO_64);
        /* Combine the two inputs at bit offset imm * 8. */
        tcg_gen_extract2_i64(dest, right, left, a->imm * 8);
        write_neon_element64(dest, a->vd, 0, MO_64);

        tcg_temp_free_i64(left);
        tcg_temp_free_i64(right);
        tcg_temp_free_i64(dest);
    } else {
        /* Extract 128 bits from <Vm+1:Vm:Vn+1:Vn> */
        TCGv_i64 left, middle, right, destleft, destright;

        left = tcg_temp_new_i64();
        middle = tcg_temp_new_i64();
        right = tcg_temp_new_i64();
        destleft = tcg_temp_new_i64();
        destright = tcg_temp_new_i64();

        if (a->imm < 8) {
            /* Result comes from <Vn+1:Vn> plus the low half of Vm. */
            read_neon_element64(right, a->vn, 0, MO_64);
            read_neon_element64(middle, a->vn, 1, MO_64);
            tcg_gen_extract2_i64(destright, right, middle, a->imm * 8);
            read_neon_element64(left, a->vm, 0, MO_64);
            tcg_gen_extract2_i64(destleft, middle, left, a->imm * 8);
        } else {
            /* Offset >= 8: the extraction starts in the high half of Vn. */
            read_neon_element64(right, a->vn, 1, MO_64);
            read_neon_element64(middle, a->vm, 0, MO_64);
            tcg_gen_extract2_i64(destright, right, middle, (a->imm - 8) * 8);
            read_neon_element64(left, a->vm, 1, MO_64);
            tcg_gen_extract2_i64(destleft, middle, left, (a->imm - 8) * 8);
        }

        /* All inputs were read above, so writing vd is safe on overlap. */
        write_neon_element64(destright, a->vd, 0, MO_64);
        write_neon_element64(destleft, a->vd, 1, MO_64);

        tcg_temp_free_i64(destright);
        tcg_temp_free_i64(destleft);
        tcg_temp_free_i64(right);
        tcg_temp_free_i64(middle);
        tcg_temp_free_i64(left);
    }
    return true;
}
2880
static bool trans_VTBL(DisasContext *s, arg_VTBL *a)
{
    /*
     * VTBL/VTBX: table lookup.  The table is the 'len' + 1
     * consecutive D registers starting at Vn; Vm supplies the
     * per-byte indices.
     */
    TCGv_i64 val, def;
    TCGv_i32 desc;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vn + a->len + 1) > 32) {
        /*
         * This is UNPREDICTABLE; we choose to UNDEF to avoid the
         * helper function running off the end of the register file.
         */
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /* Pack the table base register and length for the helper. */
    desc = tcg_constant_i32((a->vn << 2) | a->len);
    /*
     * 'def' supplies the result bytes for out-of-range indices:
     * the old Vd for VTBX (a->op set), zero for VTBL.
     */
    def = tcg_temp_new_i64();
    if (a->op) {
        read_neon_element64(def, a->vd, 0, MO_64);
    } else {
        tcg_gen_movi_i64(def, 0);
    }
    val = tcg_temp_new_i64();
    read_neon_element64(val, a->vm, 0, MO_64);

    gen_helper_neon_tbl(val, cpu_env, desc, val, def);
    write_neon_element64(val, a->vd, 0, MO_64);

    tcg_temp_free_i64(def);
    tcg_temp_free_i64(val);
    return true;
}
2925
static bool trans_VDUP_scalar(DisasContext *s, arg_VDUP_scalar *a)
{
    /* VDUP (scalar): replicate one element of Vm across all of Vd. */
    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    /* Q registers must be even-numbered. */
    if (a->vd & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /* Broadcast the selected element to the whole 8- or 16-byte vd. */
    tcg_gen_gvec_dup_mem(a->size, neon_full_reg_offset(a->vd),
                         neon_element_offset(a->vm, a->index, a->size),
                         a->q ? 16 : 8, a->q ? 16 : 8);
    return true;
}
2951
static bool trans_VREV64(DisasContext *s, arg_VREV64 *a)
{
    /*
     * VREV64: reverse the order of the size-wide elements within each
     * 64-bit chunk of the register, handled 32 bits at a time.
     */
    int pass, half;
    TCGv_i32 tmp[2];

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    /* Q registers must be even-numbered. */
    if ((a->vd | a->vm) & a->q) {
        return false;
    }

    /* 64-bit elements would be a no-op reversal; encoding is invalid. */
    if (a->size == 3) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    tmp[0] = tcg_temp_new_i32();
    tmp[1] = tcg_temp_new_i32();

    for (pass = 0; pass < (a->q ? 2 : 1); pass++) {
        for (half = 0; half < 2; half++) {
            read_neon_element32(tmp[half], a->vm, pass * 2 + half, MO_32);
            switch (a->size) {
            case 0:
                /* Byte elements: byte-reverse within each word. */
                tcg_gen_bswap32_i32(tmp[half], tmp[half]);
                break;
            case 1:
                /* Halfword elements: swap the two halfwords of each word. */
                gen_swap_half(tmp[half], tmp[half]);
                break;
            case 2:
                /* Word elements: the word swap on writeback is enough. */
                break;
            default:
                g_assert_not_reached();
            }
        }
        /* Write the two words back in swapped order. */
        write_neon_element32(tmp[1], a->vd, pass * 2, MO_32);
        write_neon_element32(tmp[0], a->vd, pass * 2 + 1, MO_32);
    }

    tcg_temp_free_i32(tmp[0]);
    tcg_temp_free_i32(tmp[1]);
    return true;
}
3006
static bool do_2misc_pairwise(DisasContext *s, arg_2misc *a,
                              NeonGenWidenFn *widenfn,
                              NeonGenTwo64OpFn *opfn,
                              NeonGenTwo64OpFn *accfn)
{
    /*
     * Pairwise long operations: widen both halves of the pair,
     * combine the pairs with the opfn, and then possibly accumulate
     * into the destination with the accfn.
     */
    int pass;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    /* Q registers must be even-numbered. */
    if ((a->vd | a->vm) & a->q) {
        return false;
    }

    /* A NULL widenfn marks an invalid element size. */
    if (!widenfn) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /* One pass per 64 bits: 1 for a D register, 2 for a Q register. */
    for (pass = 0; pass < a->q + 1; pass++) {
        TCGv_i32 tmp;
        TCGv_i64 rm0_64, rm1_64, rd_64;

        rm0_64 = tcg_temp_new_i64();
        rm1_64 = tcg_temp_new_i64();
        rd_64 = tcg_temp_new_i64();

        /* Widen the two 32-bit words of this pass separately. */
        tmp = tcg_temp_new_i32();
        read_neon_element32(tmp, a->vm, pass * 2, MO_32);
        widenfn(rm0_64, tmp);
        read_neon_element32(tmp, a->vm, pass * 2 + 1, MO_32);
        widenfn(rm1_64, tmp);
        tcg_temp_free_i32(tmp);

        opfn(rd_64, rm0_64, rm1_64);
        tcg_temp_free_i64(rm0_64);
        tcg_temp_free_i64(rm1_64);

        if (accfn) {
            /* Fold the pairwise result into the old destination value. */
            TCGv_i64 tmp64 = tcg_temp_new_i64();
            read_neon_element64(tmp64, a->vd, pass, MO_64);
            accfn(rd_64, tmp64, rd_64);
            tcg_temp_free_i64(tmp64);
        }
        write_neon_element64(rd_64, a->vd, pass, MO_64);
        tcg_temp_free_i64(rd_64);
    }
    return true;
}
3071
3072static bool trans_VPADDL_S(DisasContext *s, arg_2misc *a)
3073{
3074    static NeonGenWidenFn * const widenfn[] = {
3075        gen_helper_neon_widen_s8,
3076        gen_helper_neon_widen_s16,
3077        tcg_gen_ext_i32_i64,
3078        NULL,
3079    };
3080    static NeonGenTwo64OpFn * const opfn[] = {
3081        gen_helper_neon_paddl_u16,
3082        gen_helper_neon_paddl_u32,
3083        tcg_gen_add_i64,
3084        NULL,
3085    };
3086
3087    return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL);
3088}
3089
3090static bool trans_VPADDL_U(DisasContext *s, arg_2misc *a)
3091{
3092    static NeonGenWidenFn * const widenfn[] = {
3093        gen_helper_neon_widen_u8,
3094        gen_helper_neon_widen_u16,
3095        tcg_gen_extu_i32_i64,
3096        NULL,
3097    };
3098    static NeonGenTwo64OpFn * const opfn[] = {
3099        gen_helper_neon_paddl_u16,
3100        gen_helper_neon_paddl_u32,
3101        tcg_gen_add_i64,
3102        NULL,
3103    };
3104
3105    return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL);
3106}
3107
3108static bool trans_VPADAL_S(DisasContext *s, arg_2misc *a)
3109{
3110    static NeonGenWidenFn * const widenfn[] = {
3111        gen_helper_neon_widen_s8,
3112        gen_helper_neon_widen_s16,
3113        tcg_gen_ext_i32_i64,
3114        NULL,
3115    };
3116    static NeonGenTwo64OpFn * const opfn[] = {
3117        gen_helper_neon_paddl_u16,
3118        gen_helper_neon_paddl_u32,
3119        tcg_gen_add_i64,
3120        NULL,
3121    };
3122    static NeonGenTwo64OpFn * const accfn[] = {
3123        gen_helper_neon_addl_u16,
3124        gen_helper_neon_addl_u32,
3125        tcg_gen_add_i64,
3126        NULL,
3127    };
3128
3129    return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size],
3130                             accfn[a->size]);
3131}
3132
3133static bool trans_VPADAL_U(DisasContext *s, arg_2misc *a)
3134{
3135    static NeonGenWidenFn * const widenfn[] = {
3136        gen_helper_neon_widen_u8,
3137        gen_helper_neon_widen_u16,
3138        tcg_gen_extu_i32_i64,
3139        NULL,
3140    };
3141    static NeonGenTwo64OpFn * const opfn[] = {
3142        gen_helper_neon_paddl_u16,
3143        gen_helper_neon_paddl_u32,
3144        tcg_gen_add_i64,
3145        NULL,
3146    };
3147    static NeonGenTwo64OpFn * const accfn[] = {
3148        gen_helper_neon_addl_u16,
3149        gen_helper_neon_addl_u32,
3150        tcg_gen_add_i64,
3151        NULL,
3152    };
3153
3154    return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size],
3155                             accfn[a->size]);
3156}
3157
/* Out-of-line zip/unzip helper: takes pointers to the d and m registers. */
typedef void ZipFn(TCGv_ptr, TCGv_ptr);
3159
static bool do_zip_uzp(DisasContext *s, arg_2misc *a,
                       ZipFn *fn)
{
    /*
     * VZIP/VUZP: the permutation is done by an out-of-line helper
     * that is handed pointers to the two registers.
     */
    TCGv_ptr pd, pm;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    /* Q registers must be even-numbered. */
    if ((a->vd | a->vm) & a->q) {
        return false;
    }

    if (!fn) {
        /* Bad size or size/q combination */
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    pd = vfp_reg_ptr(true, a->vd);
    pm = vfp_reg_ptr(true, a->vm);
    fn(pd, pm);
    tcg_temp_free_ptr(pd);
    tcg_temp_free_ptr(pm);
    return true;
}
3195
3196static bool trans_VUZP(DisasContext *s, arg_2misc *a)
3197{
3198    static ZipFn * const fn[2][4] = {
3199        {
3200            gen_helper_neon_unzip8,
3201            gen_helper_neon_unzip16,
3202            NULL,
3203            NULL,
3204        }, {
3205            gen_helper_neon_qunzip8,
3206            gen_helper_neon_qunzip16,
3207            gen_helper_neon_qunzip32,
3208            NULL,
3209        }
3210    };
3211    return do_zip_uzp(s, a, fn[a->q][a->size]);
3212}
3213
3214static bool trans_VZIP(DisasContext *s, arg_2misc *a)
3215{
3216    static ZipFn * const fn[2][4] = {
3217        {
3218            gen_helper_neon_zip8,
3219            gen_helper_neon_zip16,
3220            NULL,
3221            NULL,
3222        }, {
3223            gen_helper_neon_qzip8,
3224            gen_helper_neon_qzip16,
3225            gen_helper_neon_qzip32,
3226            NULL,
3227        }
3228    };
3229    return do_zip_uzp(s, a, fn[a->q][a->size]);
3230}
3231
static bool do_vmovn(DisasContext *s, arg_2misc *a,
                     NeonGenNarrowEnvFn *narrowfn)
{
    /*
     * Narrowing 2-reg-misc: narrow each 64-bit half of Qm to 32 bits
     * with narrowfn (an env helper, so it may saturate) and write the
     * two results as the halves of Dd.
     */
    TCGv_i64 rm;
    TCGv_i32 rd0, rd1;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    /* The source is a Q register, so vm must be even. */
    if (a->vm & 1) {
        return false;
    }

    /* A NULL narrowfn marks an invalid element size. */
    if (!narrowfn) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    rm = tcg_temp_new_i64();
    rd0 = tcg_temp_new_i32();
    rd1 = tcg_temp_new_i32();

    /* Narrow both halves before writing, in case vd overlaps vm. */
    read_neon_element64(rm, a->vm, 0, MO_64);
    narrowfn(rd0, cpu_env, rm);
    read_neon_element64(rm, a->vm, 1, MO_64);
    narrowfn(rd1, cpu_env, rm);
    write_neon_element32(rd0, a->vd, 0, MO_32);
    write_neon_element32(rd1, a->vd, 1, MO_32);
    tcg_temp_free_i32(rd0);
    tcg_temp_free_i32(rd1);
    tcg_temp_free_i64(rm);
    return true;
}
3275
/*
 * VMOVN and the saturating narrows: FUNC##{8,16,32} are the env
 * helpers for each destination element size; the NULL entry makes
 * do_vmovn() reject size == 3.
 */
#define DO_VMOVN(INSN, FUNC)                                    \
    static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
    {                                                           \
        static NeonGenNarrowEnvFn * const narrowfn[] = {        \
            FUNC##8,                                            \
            FUNC##16,                                           \
            FUNC##32,                                           \
            NULL,                                               \
        };                                                      \
        return do_vmovn(s, a, narrowfn[a->size]);               \
    }

DO_VMOVN(VMOVN, gen_neon_narrow_u)
DO_VMOVN(VQMOVUN, gen_helper_neon_unarrow_sat)
DO_VMOVN(VQMOVN_S, gen_helper_neon_narrow_sat_s)
DO_VMOVN(VQMOVN_U, gen_helper_neon_narrow_sat_u)
3292
static bool trans_VSHLL(DisasContext *s, arg_2misc *a)
{
    /*
     * VSHLL (maximum shift): widen each element of Dm and shift it
     * left by the input element width (8 << size bits).
     */
    TCGv_i32 rm0, rm1;
    TCGv_i64 rd;
    static NeonGenWidenFn * const widenfns[] = {
        gen_helper_neon_widen_u8,
        gen_helper_neon_widen_u16,
        tcg_gen_extu_i32_i64,
        NULL,
    };
    NeonGenWidenFn *widenfn = widenfns[a->size];

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    /* The destination is a Q register, so vd must be even. */
    if (a->vd & 1) {
        return false;
    }

    /* A NULL widenfn marks an invalid element size. */
    if (!widenfn) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    rd = tcg_temp_new_i64();
    rm0 = tcg_temp_new_i32();
    rm1 = tcg_temp_new_i32();

    /* Read both source words up front, in case vd overlaps vm. */
    read_neon_element32(rm0, a->vm, 0, MO_32);
    read_neon_element32(rm1, a->vm, 1, MO_32);

    widenfn(rd, rm0);
    tcg_gen_shli_i64(rd, rd, 8 << a->size);
    write_neon_element64(rd, a->vd, 0, MO_64);
    widenfn(rd, rm1);
    tcg_gen_shli_i64(rd, rd, 8 << a->size);
    write_neon_element64(rd, a->vd, 1, MO_64);

    tcg_temp_free_i64(rd);
    tcg_temp_free_i32(rm0);
    tcg_temp_free_i32(rm1);
    return true;
}
3346
static bool trans_VCVT_B16_F32(DisasContext *s, arg_2misc *a)
{
    /*
     * VCVT (f32 -> bf16): convert the four single-precision values
     * in Qm to bfloat16, two per 64-bit input via the pair helper,
     * and write the packed results into Dd.
     */
    TCGv_ptr fpst;
    TCGv_i64 tmp;
    TCGv_i32 dst0, dst1;

    if (!dc_isar_feature(aa32_bf16, s)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    /* vm must name a Q register; size is fixed by the encoding. */
    if ((a->vm & 1) || (a->size != 1)) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    fpst = fpstatus_ptr(FPST_STD);
    tmp = tcg_temp_new_i64();
    dst0 = tcg_temp_new_i32();
    dst1 = tcg_temp_new_i32();

    /* Convert both input halves before writing, in case of overlap. */
    read_neon_element64(tmp, a->vm, 0, MO_64);
    gen_helper_bfcvt_pair(dst0, tmp, fpst);

    read_neon_element64(tmp, a->vm, 1, MO_64);
    gen_helper_bfcvt_pair(dst1, tmp, fpst);

    write_neon_element32(dst0, a->vd, 0, MO_32);
    write_neon_element32(dst1, a->vd, 1, MO_32);

    tcg_temp_free_i64(tmp);
    tcg_temp_free_i32(dst0);
    tcg_temp_free_i32(dst1);
    tcg_temp_free_ptr(fpst);
    return true;
}
3391
static bool trans_VCVT_F16_F32(DisasContext *s, arg_2misc *a)
{
    /*
     * VCVT (f32 -> f16): narrow the four single-precision values in
     * Qm to half precision, packing two results per 32-bit word of Dd.
     */
    TCGv_ptr fpst;
    TCGv_i32 ahp, tmp, tmp2, tmp3;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON) ||
        !dc_isar_feature(aa32_fp16_spconv, s)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    /* vm must name a Q register; size is fixed by the encoding. */
    if ((a->vm & 1) || (a->size != 1)) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    fpst = fpstatus_ptr(FPST_STD);
    ahp = get_ahp_flag();
    tmp = tcg_temp_new_i32();
    read_neon_element32(tmp, a->vm, 0, MO_32);
    gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp);
    tmp2 = tcg_temp_new_i32();
    read_neon_element32(tmp2, a->vm, 1, MO_32);
    gen_helper_vfp_fcvt_f32_to_f16(tmp2, tmp2, fpst, ahp);
    /* Pack converted elements 0 and 1 into the first result word. */
    tcg_gen_shli_i32(tmp2, tmp2, 16);
    tcg_gen_or_i32(tmp2, tmp2, tmp);
    read_neon_element32(tmp, a->vm, 2, MO_32);
    gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp);
    tmp3 = tcg_temp_new_i32();
    /* Read the last source element before the first destination write. */
    read_neon_element32(tmp3, a->vm, 3, MO_32);
    write_neon_element32(tmp2, a->vd, 0, MO_32);
    tcg_temp_free_i32(tmp2);
    gen_helper_vfp_fcvt_f32_to_f16(tmp3, tmp3, fpst, ahp);
    /* Pack converted elements 2 and 3 into the second result word. */
    tcg_gen_shli_i32(tmp3, tmp3, 16);
    tcg_gen_or_i32(tmp3, tmp3, tmp);
    write_neon_element32(tmp3, a->vd, 1, MO_32);
    tcg_temp_free_i32(tmp3);
    tcg_temp_free_i32(tmp);
    tcg_temp_free_i32(ahp);
    tcg_temp_free_ptr(fpst);

    return true;
}
3443
static bool trans_VCVT_F32_F16(DisasContext *s, arg_2misc *a)
{
    /*
     * VCVT (f16 -> f32): widen the four half-precision values packed
     * in Dm (two per 32-bit word) to single precision in Qd.
     */
    TCGv_ptr fpst;
    TCGv_i32 ahp, tmp, tmp2, tmp3;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON) ||
        !dc_isar_feature(aa32_fp16_spconv, s)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    /* vd must name a Q register; size is fixed by the encoding. */
    if ((a->vd & 1) || (a->size != 1)) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    fpst = fpstatus_ptr(FPST_STD);
    ahp = get_ahp_flag();
    tmp3 = tcg_temp_new_i32();
    tmp2 = tcg_temp_new_i32();
    tmp = tcg_temp_new_i32();
    /* Read both source words before any write, in case of overlap. */
    read_neon_element32(tmp, a->vm, 0, MO_32);
    read_neon_element32(tmp2, a->vm, 1, MO_32);
    /* Low half of word 0 -> element 0. */
    tcg_gen_ext16u_i32(tmp3, tmp);
    gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp);
    write_neon_element32(tmp3, a->vd, 0, MO_32);
    /* High half of word 0 -> element 1. */
    tcg_gen_shri_i32(tmp, tmp, 16);
    gen_helper_vfp_fcvt_f16_to_f32(tmp, tmp, fpst, ahp);
    write_neon_element32(tmp, a->vd, 1, MO_32);
    tcg_temp_free_i32(tmp);
    /* Low half of word 1 -> element 2. */
    tcg_gen_ext16u_i32(tmp3, tmp2);
    gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp);
    write_neon_element32(tmp3, a->vd, 2, MO_32);
    tcg_temp_free_i32(tmp3);
    /* High half of word 1 -> element 3. */
    tcg_gen_shri_i32(tmp2, tmp2, 16);
    gen_helper_vfp_fcvt_f16_to_f32(tmp2, tmp2, fpst, ahp);
    write_neon_element32(tmp2, a->vd, 3, MO_32);
    tcg_temp_free_i32(tmp2);
    tcg_temp_free_i32(ahp);
    tcg_temp_free_ptr(fpst);

    return true;
}
3495
static bool do_2misc_vec(DisasContext *s, arg_2misc *a, GVecGen2Fn *fn)
{
    /*
     * Whole-vector 2-reg-misc operation: after the common UNDEF
     * checks, expand the op with a single gvec call over the 8- or
     * 16-byte vector.
     */
    int vec_size = a->q ? 16 : 8;
    int rd_ofs = neon_full_reg_offset(a->vd);
    int rm_ofs = neon_full_reg_offset(a->vm);

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if (a->size == 3) {
        return false;
    }

    /* Q registers must be even-numbered. */
    if ((a->vd | a->vm) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    fn(a->size, rd_ofs, rm_ofs, vec_size, vec_size);

    return true;
}
3528
/* Trivial trans function for a 2-reg-misc op with a gvec expander. */
#define DO_2MISC_VEC(INSN, FN)                                  \
    static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
    {                                                           \
        return do_2misc_vec(s, a, FN);                          \
    }

DO_2MISC_VEC(VNEG, tcg_gen_gvec_neg)
DO_2MISC_VEC(VABS, tcg_gen_gvec_abs)
DO_2MISC_VEC(VCEQ0, gen_gvec_ceq0)
DO_2MISC_VEC(VCGT0, gen_gvec_cgt0)
DO_2MISC_VEC(VCLE0, gen_gvec_cle0)
DO_2MISC_VEC(VCGE0, gen_gvec_cge0)
DO_2MISC_VEC(VCLT0, gen_gvec_clt0)
3542
3543static bool trans_VMVN(DisasContext *s, arg_2misc *a)
3544{
3545    if (a->size != 0) {
3546        return false;
3547    }
3548    return do_2misc_vec(s, a, tcg_gen_gvec_not);
3549}
3550
/*
 * Adapt a 3-operand out-of-line helper (which also reads rd) to the
 * GVecGen2Fn signature used by do_2misc_vec().  DATA is the helper's
 * immediate and distinguishes paired operations (e.g. AESE/AESD).
 */
#define WRAP_2M_3_OOL_FN(WRAPNAME, FUNC, DATA)                          \
    static void WRAPNAME(unsigned vece, uint32_t rd_ofs,                \
                         uint32_t rm_ofs, uint32_t oprsz,               \
                         uint32_t maxsz)                                \
    {                                                                   \
        tcg_gen_gvec_3_ool(rd_ofs, rd_ofs, rm_ofs, oprsz, maxsz,        \
                           DATA, FUNC);                                 \
    }

/* As above but for a plain 2-operand out-of-line helper. */
#define WRAP_2M_2_OOL_FN(WRAPNAME, FUNC, DATA)                          \
    static void WRAPNAME(unsigned vece, uint32_t rd_ofs,                \
                         uint32_t rm_ofs, uint32_t oprsz,               \
                         uint32_t maxsz)                                \
    {                                                                   \
        tcg_gen_gvec_2_ool(rd_ofs, rm_ofs, oprsz, maxsz, DATA, FUNC);   \
    }

WRAP_2M_3_OOL_FN(gen_AESE, gen_helper_crypto_aese, 0)
WRAP_2M_3_OOL_FN(gen_AESD, gen_helper_crypto_aese, 1)
WRAP_2M_2_OOL_FN(gen_AESMC, gen_helper_crypto_aesmc, 0)
WRAP_2M_2_OOL_FN(gen_AESIMC, gen_helper_crypto_aesmc, 1)
WRAP_2M_2_OOL_FN(gen_SHA1H, gen_helper_crypto_sha1h, 0)
WRAP_2M_2_OOL_FN(gen_SHA1SU1, gen_helper_crypto_sha1su1, 0)
WRAP_2M_2_OOL_FN(gen_SHA256SU0, gen_helper_crypto_sha256su0, 0)
3575
/*
 * Crypto 2-reg-misc op: gate on the ISA feature bit and the single
 * valid size encoding, then expand via do_2misc_vec().
 */
#define DO_2M_CRYPTO(INSN, FEATURE, SIZE)                       \
    static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
    {                                                           \
        if (!dc_isar_feature(FEATURE, s) || a->size != SIZE) {  \
            return false;                                       \
        }                                                       \
        return do_2misc_vec(s, a, gen_##INSN);                  \
    }

DO_2M_CRYPTO(AESE, aa32_aes, 0)
DO_2M_CRYPTO(AESD, aa32_aes, 0)
DO_2M_CRYPTO(AESMC, aa32_aes, 0)
DO_2M_CRYPTO(AESIMC, aa32_aes, 0)
DO_2M_CRYPTO(SHA1H, aa32_sha1, 2)
DO_2M_CRYPTO(SHA1SU1, aa32_sha1, 2)
DO_2M_CRYPTO(SHA256SU0, aa32_sha2, 2)
3592
static bool do_2misc(DisasContext *s, arg_2misc *a, NeonGenOneOpFn *fn)
{
    TCGv_i32 tmp;
    int pass;

    /* Handle a 2-reg-misc operation by iterating 32 bits at a time */
    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    /* A NULL fn marks an invalid element size. */
    if (!fn) {
        return false;
    }

    /* Q registers must be even-numbered. */
    if ((a->vd | a->vm) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /* 2 passes for a D register, 4 for a Q register. */
    tmp = tcg_temp_new_i32();
    for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
        read_neon_element32(tmp, a->vm, pass, MO_32);
        fn(tmp, tmp);
        write_neon_element32(tmp, a->vd, pass, MO_32);
    }
    tcg_temp_free_i32(tmp);

    return true;
}
3631
3632static bool trans_VREV32(DisasContext *s, arg_2misc *a)
3633{
3634    static NeonGenOneOpFn * const fn[] = {
3635        tcg_gen_bswap32_i32,
3636        gen_swap_half,
3637        NULL,
3638        NULL,
3639    };
3640    return do_2misc(s, a, fn[a->size]);
3641}
3642
3643static bool trans_VREV16(DisasContext *s, arg_2misc *a)
3644{
3645    if (a->size != 0) {
3646        return false;
3647    }
3648    return do_2misc(s, a, gen_rev16);
3649}
3650
3651static bool trans_VCLS(DisasContext *s, arg_2misc *a)
3652{
3653    static NeonGenOneOpFn * const fn[] = {
3654        gen_helper_neon_cls_s8,
3655        gen_helper_neon_cls_s16,
3656        gen_helper_neon_cls_s32,
3657        NULL,
3658    };
3659    return do_2misc(s, a, fn[a->size]);
3660}
3661
/* 32-bit CLZ; the final argument is the result for a zero input. */
static void do_VCLZ_32(TCGv_i32 rd, TCGv_i32 rm)
{
    tcg_gen_clzi_i32(rd, rm, 32);
}
3666
3667static bool trans_VCLZ(DisasContext *s, arg_2misc *a)
3668{
3669    static NeonGenOneOpFn * const fn[] = {
3670        gen_helper_neon_clz_u8,
3671        gen_helper_neon_clz_u16,
3672        do_VCLZ_32,
3673        NULL,
3674    };
3675    return do_2misc(s, a, fn[a->size]);
3676}
3677
3678static bool trans_VCNT(DisasContext *s, arg_2misc *a)
3679{
3680    if (a->size != 0) {
3681        return false;
3682    }
3683    return do_2misc(s, a, gen_helper_neon_cnt_u8);
3684}
3685
/* Float absolute value: clear the sign bit of each f16/f32 element. */
static void gen_VABS_F(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                       uint32_t oprsz, uint32_t maxsz)
{
    tcg_gen_gvec_andi(vece, rd_ofs, rm_ofs,
                      vece == MO_16 ? 0x7fff : 0x7fffffff,
                      oprsz, maxsz);
}
3693
3694static bool trans_VABS_F(DisasContext *s, arg_2misc *a)
3695{
3696    if (a->size == MO_16) {
3697        if (!dc_isar_feature(aa32_fp16_arith, s)) {
3698            return false;
3699        }
3700    } else if (a->size != MO_32) {
3701        return false;
3702    }
3703    return do_2misc_vec(s, a, gen_VABS_F);
3704}
3705
3706static void gen_VNEG_F(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
3707                       uint32_t oprsz, uint32_t maxsz)
3708{
3709    tcg_gen_gvec_xori(vece, rd_ofs, rm_ofs,
3710                      vece == MO_16 ? 0x8000 : 0x80000000,
3711                      oprsz, maxsz);
3712}
3713
3714static bool trans_VNEG_F(DisasContext *s, arg_2misc *a)
3715{
3716    if (a->size == MO_16) {
3717        if (!dc_isar_feature(aa32_fp16_arith, s)) {
3718            return false;
3719        }
3720    } else if (a->size != MO_32) {
3721        return false;
3722    }
3723    return do_2misc_vec(s, a, gen_VNEG_F);
3724}
3725
3726static bool trans_VRECPE(DisasContext *s, arg_2misc *a)
3727{
3728    if (a->size != 2) {
3729        return false;
3730    }
3731    return do_2misc(s, a, gen_helper_recpe_u32);
3732}
3733
3734static bool trans_VRSQRTE(DisasContext *s, arg_2misc *a)
3735{
3736    if (a->size != 2) {
3737        return false;
3738    }
3739    return do_2misc(s, a, gen_helper_rsqrte_u32);
3740}
3741
/*
 * Adapt a helper with signature (dest, env, src) to the two-operand
 * NeonGenOneOpFn shape by supplying cpu_env as the middle argument.
 */
#define WRAP_1OP_ENV_FN(WRAPNAME, FUNC) \
    static void WRAPNAME(TCGv_i32 d, TCGv_i32 m)        \
    {                                                   \
        FUNC(d, cpu_env, m);                            \
    }

/* The saturating qabs/qneg helpers take the CPU env pointer. */
WRAP_1OP_ENV_FN(gen_VQABS_s8, gen_helper_neon_qabs_s8)
WRAP_1OP_ENV_FN(gen_VQABS_s16, gen_helper_neon_qabs_s16)
WRAP_1OP_ENV_FN(gen_VQABS_s32, gen_helper_neon_qabs_s32)
WRAP_1OP_ENV_FN(gen_VQNEG_s8, gen_helper_neon_qneg_s8)
WRAP_1OP_ENV_FN(gen_VQNEG_s16, gen_helper_neon_qneg_s16)
WRAP_1OP_ENV_FN(gen_VQNEG_s32, gen_helper_neon_qneg_s32)
3754
3755static bool trans_VQABS(DisasContext *s, arg_2misc *a)
3756{
3757    static NeonGenOneOpFn * const fn[] = {
3758        gen_VQABS_s8,
3759        gen_VQABS_s16,
3760        gen_VQABS_s32,
3761        NULL,
3762    };
3763    return do_2misc(s, a, fn[a->size]);
3764}
3765
3766static bool trans_VQNEG(DisasContext *s, arg_2misc *a)
3767{
3768    static NeonGenOneOpFn * const fn[] = {
3769        gen_VQNEG_s8,
3770        gen_VQNEG_s16,
3771        gen_VQNEG_s32,
3772        NULL,
3773    };
3774    return do_2misc(s, a, fn[a->size]);
3775}
3776
/*
 * Generate both the gvec expander and the trans function for a
 * floating-point 2-reg-misc insn that has a half-precision helper
 * (HFUNC) and a single-precision helper (SFUNC).  The trans function
 * UNDEFs every size except MO_16/MO_32 and requires the fp16
 * arithmetic extension for MO_16; the expander picks the standard
 * Neon fpst (FPST_STD_F16 for halfwords) and dispatches by vece.
 */
#define DO_2MISC_FP_VEC(INSN, HFUNC, SFUNC)                             \
    static void gen_##INSN(unsigned vece, uint32_t rd_ofs,              \
                           uint32_t rm_ofs,                             \
                           uint32_t oprsz, uint32_t maxsz)              \
    {                                                                   \
        static gen_helper_gvec_2_ptr * const fns[4] = {                 \
            NULL, HFUNC, SFUNC, NULL,                                   \
        };                                                              \
        TCGv_ptr fpst;                                                  \
        fpst = fpstatus_ptr(vece == MO_16 ? FPST_STD_F16 : FPST_STD);   \
        tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, oprsz, maxsz, 0,       \
                           fns[vece]);                                  \
        tcg_temp_free_ptr(fpst);                                        \
    }                                                                   \
    static bool trans_##INSN(DisasContext *s, arg_2misc *a)             \
    {                                                                   \
        if (a->size == MO_16) {                                         \
            if (!dc_isar_feature(aa32_fp16_arith, s)) {                 \
                return false;                                           \
            }                                                           \
        } else if (a->size != MO_32) {                                  \
            return false;                                               \
        }                                                               \
        return do_2misc_vec(s, a, gen_##INSN);                          \
    }

/* FP estimate and compare-against-zero operations. */
DO_2MISC_FP_VEC(VRECPE_F, gen_helper_gvec_frecpe_h, gen_helper_gvec_frecpe_s)
DO_2MISC_FP_VEC(VRSQRTE_F, gen_helper_gvec_frsqrte_h, gen_helper_gvec_frsqrte_s)
DO_2MISC_FP_VEC(VCGT0_F, gen_helper_gvec_fcgt0_h, gen_helper_gvec_fcgt0_s)
DO_2MISC_FP_VEC(VCGE0_F, gen_helper_gvec_fcge0_h, gen_helper_gvec_fcge0_s)
DO_2MISC_FP_VEC(VCEQ0_F, gen_helper_gvec_fceq0_h, gen_helper_gvec_fceq0_s)
DO_2MISC_FP_VEC(VCLT0_F, gen_helper_gvec_fclt0_h, gen_helper_gvec_fclt0_s)
DO_2MISC_FP_VEC(VCLE0_F, gen_helper_gvec_fcle0_h, gen_helper_gvec_fcle0_s)
/* Integer <-> FP conversions (signed/unsigned, both directions). */
DO_2MISC_FP_VEC(VCVT_FS, gen_helper_gvec_sstoh, gen_helper_gvec_sitos)
DO_2MISC_FP_VEC(VCVT_FU, gen_helper_gvec_ustoh, gen_helper_gvec_uitos)
DO_2MISC_FP_VEC(VCVT_SF, gen_helper_gvec_tosszh, gen_helper_gvec_tosizs)
DO_2MISC_FP_VEC(VCVT_UF, gen_helper_gvec_touszh, gen_helper_gvec_touizs)

/* "_impl" because trans_VRINTX adds a v8 feature check on top. */
DO_2MISC_FP_VEC(VRINTX_impl, gen_helper_gvec_vrintx_h, gen_helper_gvec_vrintx_s)
3816
3817static bool trans_VRINTX(DisasContext *s, arg_2misc *a)
3818{
3819    if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
3820        return false;
3821    }
3822    return trans_VRINTX_impl(s, a);
3823}
3824
/*
 * Generate the gvec expander and trans function for a v8 fp
 * 2-reg-misc insn that takes an explicit rounding mode RMODE: the
 * mode (converted with arm_rmode_to_sf) is passed to the helper as
 * the gvec 'data' argument.  Size handling matches DO_2MISC_FP_VEC,
 * with an additional ARM_FEATURE_V8 requirement.
 */
#define DO_VEC_RMODE(INSN, RMODE, OP)                                   \
    static void gen_##INSN(unsigned vece, uint32_t rd_ofs,              \
                           uint32_t rm_ofs,                             \
                           uint32_t oprsz, uint32_t maxsz)              \
    {                                                                   \
        static gen_helper_gvec_2_ptr * const fns[4] = {                 \
            NULL,                                                       \
            gen_helper_gvec_##OP##h,                                    \
            gen_helper_gvec_##OP##s,                                    \
            NULL,                                                       \
        };                                                              \
        TCGv_ptr fpst;                                                  \
        fpst = fpstatus_ptr(vece == 1 ? FPST_STD_F16 : FPST_STD);       \
        tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, oprsz, maxsz,          \
                           arm_rmode_to_sf(RMODE), fns[vece]);          \
        tcg_temp_free_ptr(fpst);                                        \
    }                                                                   \
    static bool trans_##INSN(DisasContext *s, arg_2misc *a)             \
    {                                                                   \
        if (!arm_dc_feature(s, ARM_FEATURE_V8)) {                       \
            return false;                                               \
        }                                                               \
        if (a->size == MO_16) {                                         \
            if (!dc_isar_feature(aa32_fp16_arith, s)) {                 \
                return false;                                           \
            }                                                           \
        } else if (a->size != MO_32) {                                  \
            return false;                                               \
        }                                                               \
        return do_2misc_vec(s, a, gen_##INSN);                          \
    }

/* FP -> integer conversions with explicit rounding mode. */
DO_VEC_RMODE(VCVTAU, FPROUNDING_TIEAWAY, vcvt_rm_u)
DO_VEC_RMODE(VCVTAS, FPROUNDING_TIEAWAY, vcvt_rm_s)
DO_VEC_RMODE(VCVTNU, FPROUNDING_TIEEVEN, vcvt_rm_u)
DO_VEC_RMODE(VCVTNS, FPROUNDING_TIEEVEN, vcvt_rm_s)
DO_VEC_RMODE(VCVTPU, FPROUNDING_POSINF, vcvt_rm_u)
DO_VEC_RMODE(VCVTPS, FPROUNDING_POSINF, vcvt_rm_s)
DO_VEC_RMODE(VCVTMU, FPROUNDING_NEGINF, vcvt_rm_u)
DO_VEC_RMODE(VCVTMS, FPROUNDING_NEGINF, vcvt_rm_s)

/* Round-to-integral with explicit rounding mode. */
DO_VEC_RMODE(VRINTN, FPROUNDING_TIEEVEN, vrint_rm_)
DO_VEC_RMODE(VRINTA, FPROUNDING_TIEAWAY, vrint_rm_)
DO_VEC_RMODE(VRINTZ, FPROUNDING_ZERO, vrint_rm_)
DO_VEC_RMODE(VRINTM, FPROUNDING_NEGINF, vrint_rm_)
DO_VEC_RMODE(VRINTP, FPROUNDING_POSINF, vrint_rm_)
3871
3872static bool trans_VSWP(DisasContext *s, arg_2misc *a)
3873{
3874    TCGv_i64 rm, rd;
3875    int pass;
3876
3877    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3878        return false;
3879    }
3880
3881    /* UNDEF accesses to D16-D31 if they don't exist. */
3882    if (!dc_isar_feature(aa32_simd_r32, s) &&
3883        ((a->vd | a->vm) & 0x10)) {
3884        return false;
3885    }
3886
3887    if (a->size != 0) {
3888        return false;
3889    }
3890
3891    if ((a->vd | a->vm) & a->q) {
3892        return false;
3893    }
3894
3895    if (!vfp_access_check(s)) {
3896        return true;
3897    }
3898
3899    rm = tcg_temp_new_i64();
3900    rd = tcg_temp_new_i64();
3901    for (pass = 0; pass < (a->q ? 2 : 1); pass++) {
3902        read_neon_element64(rm, a->vm, pass, MO_64);
3903        read_neon_element64(rd, a->vd, pass, MO_64);
3904        write_neon_element64(rm, a->vd, pass, MO_64);
3905        write_neon_element64(rd, a->vm, pass, MO_64);
3906    }
3907    tcg_temp_free_i64(rm);
3908    tcg_temp_free_i64(rd);
3909
3910    return true;
3911}
static void gen_neon_trn_u8(TCGv_i32 t0, TCGv_i32 t1)
{
    /*
     * One 32-bit chunk of VTRN.8: transpose byte pairs across the two
     * operands, in place.  With input bytes t0 = {a3,a2,a1,a0} and
     * t1 = {b3,b2,b1,b0} (MSB first), the result is
     *   t0 = {a2,b2,a0,b0} and t1 = {a3,b3,a1,b1}.
     */
    TCGv_i32 rd, tmp;

    rd = tcg_temp_new_i32();
    tmp = tcg_temp_new_i32();

    /* rd = even bytes of t0 (shifted up) merged with even bytes of t1 */
    tcg_gen_shli_i32(rd, t0, 8);
    tcg_gen_andi_i32(rd, rd, 0xff00ff00);
    tcg_gen_andi_i32(tmp, t1, 0x00ff00ff);
    tcg_gen_or_i32(rd, rd, tmp);

    /* t1 = odd bytes of t1 (shifted down) merged with odd bytes of t0 */
    tcg_gen_shri_i32(t1, t1, 8);
    tcg_gen_andi_i32(t1, t1, 0x00ff00ff);
    tcg_gen_andi_i32(tmp, t0, 0xff00ff00);
    tcg_gen_or_i32(t1, t1, tmp);
    /* t0 could only be copied after its original value was consumed */
    tcg_gen_mov_i32(t0, rd);

    tcg_temp_free_i32(tmp);
    tcg_temp_free_i32(rd);
}
3933
static void gen_neon_trn_u16(TCGv_i32 t0, TCGv_i32 t1)
{
    /*
     * One 32-bit chunk of VTRN.16: transpose the halfword pair across
     * the two operands, in place.  With input halfwords t0 = {a1,a0}
     * and t1 = {b1,b0} (MSB first), the result is
     *   t0 = {a0,b0} and t1 = {a1,b1}.
     */
    TCGv_i32 rd, tmp;

    rd = tcg_temp_new_i32();
    tmp = tcg_temp_new_i32();

    /* rd = low half of t0 (shifted up) merged with low half of t1 */
    tcg_gen_shli_i32(rd, t0, 16);
    tcg_gen_andi_i32(tmp, t1, 0xffff);
    tcg_gen_or_i32(rd, rd, tmp);
    /* t1 = high half of t1 (shifted down) merged with high half of t0 */
    tcg_gen_shri_i32(t1, t1, 16);
    tcg_gen_andi_i32(tmp, t0, 0xffff0000);
    tcg_gen_or_i32(t1, t1, tmp);
    /* t0 could only be copied after its original value was consumed */
    tcg_gen_mov_i32(t0, rd);

    tcg_temp_free_i32(tmp);
    tcg_temp_free_i32(rd);
}
3952
static bool trans_VTRN(DisasContext *s, arg_2misc *a)
{
    /*
     * VTRN: treat the elements of Vd and Vm as a 2x2 matrix per pair
     * and transpose it, i.e. swap Vd's odd-indexed elements with Vm's
     * even-indexed elements.
     */
    TCGv_i32 tmp, tmp2;
    int pass;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    /* Q regs must be even-numbered D-reg pairs. */
    if ((a->vd | a->vm) & a->q) {
        return false;
    }

    /* 64-bit elements (size 3) are UNDEF. */
    if (a->size == 3) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    tmp = tcg_temp_new_i32();
    tmp2 = tcg_temp_new_i32();
    if (a->size == MO_32) {
        /* Word elements: swap Vd[odd] with Vm[even] directly. */
        for (pass = 0; pass < (a->q ? 4 : 2); pass += 2) {
            read_neon_element32(tmp, a->vm, pass, MO_32);
            read_neon_element32(tmp2, a->vd, pass + 1, MO_32);
            write_neon_element32(tmp2, a->vm, pass, MO_32);
            write_neon_element32(tmp, a->vd, pass + 1, MO_32);
        }
    } else {
        /* Sub-word elements: transpose within each 32-bit chunk. */
        for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
            read_neon_element32(tmp, a->vm, pass, MO_32);
            read_neon_element32(tmp2, a->vd, pass, MO_32);
            if (a->size == MO_8) {
                gen_neon_trn_u8(tmp, tmp2);
            } else {
                gen_neon_trn_u16(tmp, tmp2);
            }
            write_neon_element32(tmp2, a->vm, pass, MO_32);
            write_neon_element32(tmp, a->vd, pass, MO_32);
        }
    }
    tcg_temp_free_i32(tmp);
    tcg_temp_free_i32(tmp2);
    return true;
}
4006
4007static bool trans_VSMMLA(DisasContext *s, arg_VSMMLA *a)
4008{
4009    if (!dc_isar_feature(aa32_i8mm, s)) {
4010        return false;
4011    }
4012    return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
4013                        gen_helper_gvec_smmla_b);
4014}
4015
4016static bool trans_VUMMLA(DisasContext *s, arg_VUMMLA *a)
4017{
4018    if (!dc_isar_feature(aa32_i8mm, s)) {
4019        return false;
4020    }
4021    return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
4022                        gen_helper_gvec_ummla_b);
4023}
4024
4025static bool trans_VUSMMLA(DisasContext *s, arg_VUSMMLA *a)
4026{
4027    if (!dc_isar_feature(aa32_i8mm, s)) {
4028        return false;
4029    }
4030    return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
4031                        gen_helper_gvec_usmmla_b);
4032}
4033
4034static bool trans_VMMLA_b16(DisasContext *s, arg_VMMLA_b16 *a)
4035{
4036    if (!dc_isar_feature(aa32_bf16, s)) {
4037        return false;
4038    }
4039    return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
4040                        gen_helper_gvec_bfmmla);
4041}
4042
4043static bool trans_VFMA_b16(DisasContext *s, arg_VFMA_b16 *a)
4044{
4045    if (!dc_isar_feature(aa32_bf16, s)) {
4046        return false;
4047    }
4048    return do_neon_ddda_fpst(s, 7, a->vd, a->vn, a->vm, a->q, FPST_STD,
4049                             gen_helper_gvec_bfmlal);
4050}
4051
4052static bool trans_VFMA_b16_scal(DisasContext *s, arg_VFMA_b16_scal *a)
4053{
4054    if (!dc_isar_feature(aa32_bf16, s)) {
4055        return false;
4056    }
4057    return do_neon_ddda_fpst(s, 6, a->vd, a->vn, a->vm,
4058                             (a->index << 1) | a->q, FPST_STD,
4059                             gen_helper_gvec_bfmlal_idx);
4060}
4061