qemu/target/arm/translate-neon.c
/*
 *  ARM translation: AArch32 Neon instructions
 *
 *  Copyright (c) 2003 Fabrice Bellard
 *  Copyright (c) 2005-2007 CodeSourcery
 *  Copyright (c) 2007 OpenedHand, Ltd.
 *  Copyright (c) 2020 Linaro, Ltd.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "tcg/tcg-op.h"
#include "tcg/tcg-op-gvec.h"
#include "exec/exec-all.h"
#include "exec/gen-icount.h"
#include "translate.h"
#include "translate-a32.h"

/* Include the generated Neon decoder */
#include "decode-neon-dp.c.inc"
#include "decode-neon-ls.c.inc"
#include "decode-neon-shared.c.inc"

static TCGv_ptr vfp_reg_ptr(bool dp, int reg)
{
    TCGv_ptr ret = tcg_temp_new_ptr();
    tcg_gen_addi_ptr(ret, cpu_env, vfp_reg_offset(dp, reg));
    return ret;
}

static void neon_load_element(TCGv_i32 var, int reg, int ele, MemOp mop)
{
    long offset = neon_element_offset(reg, ele, mop & MO_SIZE);

    switch (mop) {
    case MO_UB:
        tcg_gen_ld8u_i32(var, cpu_env, offset);
        break;
    case MO_UW:
        tcg_gen_ld16u_i32(var, cpu_env, offset);
        break;
    case MO_UL:
        tcg_gen_ld_i32(var, cpu_env, offset);
        break;
    default:
        g_assert_not_reached();
    }
}

static void neon_load_element64(TCGv_i64 var, int reg, int ele, MemOp mop)
{
    long offset = neon_element_offset(reg, ele, mop & MO_SIZE);

    switch (mop) {
    case MO_UB:
        tcg_gen_ld8u_i64(var, cpu_env, offset);
        break;
    case MO_UW:
        tcg_gen_ld16u_i64(var, cpu_env, offset);
        break;
    case MO_UL:
        tcg_gen_ld32u_i64(var, cpu_env, offset);
        break;
    case MO_Q:
        tcg_gen_ld_i64(var, cpu_env, offset);
        break;
    default:
        g_assert_not_reached();
    }
}

static void neon_store_element(int reg, int ele, MemOp size, TCGv_i32 var)
{
    long offset = neon_element_offset(reg, ele, size);

    switch (size) {
    case MO_8:
        tcg_gen_st8_i32(var, cpu_env, offset);
        break;
    case MO_16:
        tcg_gen_st16_i32(var, cpu_env, offset);
        break;
    case MO_32:
        tcg_gen_st_i32(var, cpu_env, offset);
        break;
    default:
        g_assert_not_reached();
    }
}

static void neon_store_element64(int reg, int ele, MemOp size, TCGv_i64 var)
{
    long offset = neon_element_offset(reg, ele, size);

    switch (size) {
    case MO_8:
        tcg_gen_st8_i64(var, cpu_env, offset);
        break;
    case MO_16:
        tcg_gen_st16_i64(var, cpu_env, offset);
        break;
    case MO_32:
        tcg_gen_st32_i64(var, cpu_env, offset);
        break;
    case MO_64:
        tcg_gen_st_i64(var, cpu_env, offset);
        break;
    default:
        g_assert_not_reached();
    }
}

static bool do_neon_ddda(DisasContext *s, int q, int vd, int vn, int vm,
                         int data, gen_helper_gvec_4 *fn_gvec)
{
    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (((vd | vn | vm) & 0x10) && !dc_isar_feature(aa32_simd_r32, s)) {
        return false;
    }

    /*
     * UNDEF accesses to odd registers for each bit of Q.
     * Q will be 0b111 for all Q-reg instructions, otherwise
     * when we have mixed Q- and D-reg inputs.
     */
    if (((vd & 1) * 4 | (vn & 1) * 2 | (vm & 1)) & q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    int opr_sz = q ? 16 : 8;
    tcg_gen_gvec_4_ool(vfp_reg_offset(1, vd),
                       vfp_reg_offset(1, vn),
                       vfp_reg_offset(1, vm),
                       vfp_reg_offset(1, vd),
                       opr_sz, opr_sz, data, fn_gvec);
    return true;
}
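
/*
 * Worked example of the Q check above: a Q-reg operand must map to an
 * even D register, and the all-Q forms pass q == 0b111. So vd == 3
 * gives (vd & 1) * 4 == 4, which intersects q and correctly UNDEFs.
 * The indexed variants pass q == 0b110 instead, leaving the scalar
 * D-reg operand vm free to be odd.
 */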

static bool do_neon_ddda_fpst(DisasContext *s, int q, int vd, int vn, int vm,
                              int data, ARMFPStatusFlavour fp_flavour,
                              gen_helper_gvec_4_ptr *fn_gvec_ptr)
{
    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (((vd | vn | vm) & 0x10) && !dc_isar_feature(aa32_simd_r32, s)) {
        return false;
    }

    /*
     * UNDEF accesses to odd registers for each bit of Q.
     * Q will be 0b111 for all Q-reg instructions, otherwise
     * when we have mixed Q- and D-reg inputs.
     */
    if (((vd & 1) * 4 | (vn & 1) * 2 | (vm & 1)) & q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    int opr_sz = q ? 16 : 8;
    TCGv_ptr fpst = fpstatus_ptr(fp_flavour);

    tcg_gen_gvec_4_ptr(vfp_reg_offset(1, vd),
                       vfp_reg_offset(1, vn),
                       vfp_reg_offset(1, vm),
                       vfp_reg_offset(1, vd),
                       fpst, opr_sz, opr_sz, data, fn_gvec_ptr);
    tcg_temp_free_ptr(fpst);
    return true;
}

static bool trans_VCMLA(DisasContext *s, arg_VCMLA *a)
{
    if (!dc_isar_feature(aa32_vcma, s)) {
        return false;
    }
    if (a->size == MO_16) {
        if (!dc_isar_feature(aa32_fp16_arith, s)) {
            return false;
        }
        return do_neon_ddda_fpst(s, a->q * 7, a->vd, a->vn, a->vm, a->rot,
                                 FPST_STD_F16, gen_helper_gvec_fcmlah);
    }
    return do_neon_ddda_fpst(s, a->q * 7, a->vd, a->vn, a->vm, a->rot,
                             FPST_STD, gen_helper_gvec_fcmlas);
}
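
/*
 * The two-bit rot field forwarded to the helper as 'data' selects the
 * rotation applied to the second operand: values 0..3 encode #0, #90,
 * #180 and #270 degrees, e.g. VCMLA.F32 d0, d1, d2, #90 has rot == 1.
 */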

static bool trans_VCADD(DisasContext *s, arg_VCADD *a)
{
    int opr_sz;
    TCGv_ptr fpst;
    gen_helper_gvec_3_ptr *fn_gvec_ptr;

    if (!dc_isar_feature(aa32_vcma, s)
        || (a->size == MO_16 && !dc_isar_feature(aa32_fp16_arith, s))) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vn | a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    opr_sz = (1 + a->q) * 8;
    fpst = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD);
    fn_gvec_ptr = (a->size == MO_16) ?
        gen_helper_gvec_fcaddh : gen_helper_gvec_fcadds;
    tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(1, a->vn),
                       vfp_reg_offset(1, a->vm),
                       fpst, opr_sz, opr_sz, a->rot,
                       fn_gvec_ptr);
    tcg_temp_free_ptr(fpst);
    return true;
}

static bool trans_VSDOT(DisasContext *s, arg_VSDOT *a)
{
    if (!dc_isar_feature(aa32_dp, s)) {
        return false;
    }
    return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
                        gen_helper_gvec_sdot_b);
}

static bool trans_VUDOT(DisasContext *s, arg_VUDOT *a)
{
    if (!dc_isar_feature(aa32_dp, s)) {
        return false;
    }
    return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
                        gen_helper_gvec_udot_b);
}

static bool trans_VUSDOT(DisasContext *s, arg_VUSDOT *a)
{
    if (!dc_isar_feature(aa32_i8mm, s)) {
        return false;
    }
    return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
                        gen_helper_gvec_usdot_b);
}

static bool trans_VDOT_b16(DisasContext *s, arg_VDOT_b16 *a)
{
    if (!dc_isar_feature(aa32_bf16, s)) {
        return false;
    }
    return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
                        gen_helper_gvec_bfdot);
}

static bool trans_VFML(DisasContext *s, arg_VFML *a)
{
    int opr_sz;

    if (!dc_isar_feature(aa32_fhm, s)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        (a->vd & 0x10)) {
        return false;
    }

    if (a->vd & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    opr_sz = (1 + a->q) * 8;
    tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(a->q, a->vn),
                       vfp_reg_offset(a->q, a->vm),
                       cpu_env, opr_sz, opr_sz, a->s, /* is_2 == 0 */
                       gen_helper_gvec_fmlal_a32);
    return true;
}

static bool trans_VCMLA_scalar(DisasContext *s, arg_VCMLA_scalar *a)
{
    int data = (a->index << 2) | a->rot;

    if (!dc_isar_feature(aa32_vcma, s)) {
        return false;
    }
    if (a->size == MO_16) {
        if (!dc_isar_feature(aa32_fp16_arith, s)) {
            return false;
        }
        return do_neon_ddda_fpst(s, a->q * 6, a->vd, a->vn, a->vm, data,
                                 FPST_STD_F16, gen_helper_gvec_fcmlah_idx);
    }
    return do_neon_ddda_fpst(s, a->q * 6, a->vd, a->vn, a->vm, data,
                             FPST_STD, gen_helper_gvec_fcmlas_idx);
}

static bool trans_VSDOT_scalar(DisasContext *s, arg_VSDOT_scalar *a)
{
    if (!dc_isar_feature(aa32_dp, s)) {
        return false;
    }
    return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
                        gen_helper_gvec_sdot_idx_b);
}

static bool trans_VUDOT_scalar(DisasContext *s, arg_VUDOT_scalar *a)
{
    if (!dc_isar_feature(aa32_dp, s)) {
        return false;
    }
    return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
                        gen_helper_gvec_udot_idx_b);
}

static bool trans_VUSDOT_scalar(DisasContext *s, arg_VUSDOT_scalar *a)
{
    if (!dc_isar_feature(aa32_i8mm, s)) {
        return false;
    }
    return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
                        gen_helper_gvec_usdot_idx_b);
}

static bool trans_VSUDOT_scalar(DisasContext *s, arg_VSUDOT_scalar *a)
{
    if (!dc_isar_feature(aa32_i8mm, s)) {
        return false;
    }
    return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
                        gen_helper_gvec_sudot_idx_b);
}

static bool trans_VDOT_b16_scal(DisasContext *s, arg_VDOT_b16_scal *a)
{
    if (!dc_isar_feature(aa32_bf16, s)) {
        return false;
    }
    return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
                        gen_helper_gvec_bfdot_idx);
}

static bool trans_VFML_scalar(DisasContext *s, arg_VFML_scalar *a)
{
    int opr_sz;

    if (!dc_isar_feature(aa32_fhm, s)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd & 0x10) || (a->q && (a->vn & 0x10)))) {
        return false;
    }

    if (a->vd & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    opr_sz = (1 + a->q) * 8;
    tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(a->q, a->vn),
                       vfp_reg_offset(a->q, a->rm),
                       cpu_env, opr_sz, opr_sz,
                       (a->index << 2) | a->s, /* is_2 == 0 */
                       gen_helper_gvec_fmlal_idx_a32);
    return true;
}

static struct {
    int nregs;
    int interleave;
    int spacing;
} const neon_ls_element_type[11] = {
    {1, 4, 1},
    {1, 4, 2},
    {4, 1, 1},
    {2, 2, 2},
    {1, 3, 1},
    {1, 3, 2},
    {3, 1, 1},
    {1, 1, 1},
    {1, 2, 1},
    {1, 2, 2},
    {2, 1, 1}
};
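
/*
 * The table above is indexed by the instruction's 4-bit "type" field;
 * for example itype 0 ({1, 4, 1}) is VLD4/VST4 with four interleaved
 * registers, itype 2 ({4, 1, 1}) is VLD1/VST1 with four registers, and
 * itype 7 ({1, 1, 1}) is VLD1/VST1 with a single register. The total
 * number of D registers transferred is nregs * interleave.
 */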

static void gen_neon_ldst_base_update(DisasContext *s, int rm, int rn,
                                      int stride)
{
    if (rm != 15) {
        TCGv_i32 base;

        base = load_reg(s, rn);
        if (rm == 13) {
            tcg_gen_addi_i32(base, base, stride);
        } else {
            TCGv_i32 index;
            index = load_reg(s, rm);
            tcg_gen_add_i32(base, base, index);
            tcg_temp_free_i32(index);
        }
        store_reg(s, rn, base);
    }
}
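
/*
 * This implements the usual post-indexed writeback forms: rm == 15
 * means no writeback (VLD1.8 {d0}, [r0]), rm == 13 adds the constant
 * transfer size (VLD1.8 {d0}, [r0]!), and any other rm adds that
 * register (VLD1.8 {d0}, [r0], r2).
 */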

static bool trans_VLDST_multiple(DisasContext *s, arg_VLDST_multiple *a)
{
    /* Neon load/store multiple structures */
    int nregs, interleave, spacing, reg, n;
    MemOp mop, align, endian;
    int mmu_idx = get_mem_index(s);
    int size = a->size;
    TCGv_i64 tmp64;
    TCGv_i32 addr, tmp;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist */
    if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
        return false;
    }
    if (a->itype > 10) {
        return false;
    }
    /* Catch UNDEF cases for bad values of align field */
    switch (a->itype & 0xc) {
    case 4:
        if (a->align >= 2) {
            return false;
        }
        break;
    case 8:
        if (a->align == 3) {
            return false;
        }
        break;
    default:
        break;
    }
    nregs = neon_ls_element_type[a->itype].nregs;
    interleave = neon_ls_element_type[a->itype].interleave;
    spacing = neon_ls_element_type[a->itype].spacing;
    if (size == 3 && (interleave | spacing) != 1) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /* For our purposes, bytes are always little-endian.  */
    endian = s->be_data;
    if (size == 0) {
        endian = MO_LE;
    }

    /* Enforce alignment requested by the instruction */
    if (a->align) {
        align = pow2_align(a->align + 2); /* 2 ** (a->align + 2) bytes */
    } else {
        align = s->align_mem ? MO_ALIGN : 0;
    }
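    /*
     * The align field encodes an alignment of 2 ** (align + 2) bytes:
     * e.g. align == 1 is the @64 annotation, for which pow2_align(3)
     * requests an 8-byte-aligned access.
     */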

    /*
     * Consecutive little-endian elements from a single register
     * can be promoted to a larger little-endian operation.
     */
    if (interleave == 1 && endian == MO_LE) {
        /* Retain any natural alignment. */
        if (align == MO_ALIGN) {
            align = pow2_align(size);
        }
        size = 3;
    }
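    /*
     * For example, a little-endian VLD1.16 {d0, d1}, [r0] has
     * interleave == 1, so it is emitted as two 64-bit loads rather
     * than eight 16-bit ones.
     */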

    tmp64 = tcg_temp_new_i64();
    addr = tcg_temp_new_i32();
    tmp = tcg_const_i32(1 << size);
    load_reg_var(s, addr, a->rn);

    mop = endian | size | align;
    for (reg = 0; reg < nregs; reg++) {
        for (n = 0; n < 8 >> size; n++) {
            int xs;
            for (xs = 0; xs < interleave; xs++) {
                int tt = a->vd + reg + spacing * xs;

                if (a->l) {
                    gen_aa32_ld_internal_i64(s, tmp64, addr, mmu_idx, mop);
                    neon_store_element64(tt, n, size, tmp64);
                } else {
                    neon_load_element64(tmp64, tt, n, size);
                    gen_aa32_st_internal_i64(s, tmp64, addr, mmu_idx, mop);
                }
                tcg_gen_add_i32(addr, addr, tmp);

                /* Subsequent memory operations inherit alignment */
                mop &= ~MO_AMASK;
            }
        }
    }
    tcg_temp_free_i32(addr);
    tcg_temp_free_i32(tmp);
    tcg_temp_free_i64(tmp64);

    gen_neon_ldst_base_update(s, a->rm, a->rn, nregs * interleave * 8);
    return true;
}

static bool trans_VLD_all_lanes(DisasContext *s, arg_VLD_all_lanes *a)
{
    /* Neon load single structure to all lanes */
    int reg, stride, vec_size;
    int vd = a->vd;
    int size = a->size;
    int nregs = a->n + 1;
    TCGv_i32 addr, tmp;
    MemOp mop, align;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist */
    if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
        return false;
    }

    align = 0;
    if (size == 3) {
        if (nregs != 4 || a->a == 0) {
            return false;
        }
        /* For VLD4 size == 3 a == 1 means 32 bits at 16 byte alignment */
        size = MO_32;
        align = MO_ALIGN_16;
    } else if (a->a) {
        switch (nregs) {
        case 1:
            if (size == 0) {
                return false;
            }
            align = MO_ALIGN;
            break;
        case 2:
            align = pow2_align(size + 1);
            break;
        case 3:
            return false;
        case 4:
            align = pow2_align(size + 2);
            break;
        default:
            g_assert_not_reached();
        }
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /*
     * VLD1 to all lanes: T bit indicates how many Dregs to write.
     * VLD2/3/4 to all lanes: T bit indicates register stride.
     */
    stride = a->t ? 2 : 1;
    vec_size = nregs == 1 ? stride * 8 : 8;
    mop = size | align;
    tmp = tcg_temp_new_i32();
    addr = tcg_temp_new_i32();
    load_reg_var(s, addr, a->rn);
    for (reg = 0; reg < nregs; reg++) {
        gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s), mop);
        if ((vd & 1) && vec_size == 16) {
            /*
             * We cannot write 16 bytes at once because the
             * destination is unaligned.
             */
            tcg_gen_gvec_dup_i32(size, neon_full_reg_offset(vd),
                                 8, 8, tmp);
            tcg_gen_gvec_mov(0, neon_full_reg_offset(vd + 1),
                             neon_full_reg_offset(vd), 8, 8);
        } else {
            tcg_gen_gvec_dup_i32(size, neon_full_reg_offset(vd),
                                 vec_size, vec_size, tmp);
        }
        tcg_gen_addi_i32(addr, addr, 1 << size);
        vd += stride;

        /* Subsequent memory operations inherit alignment */
        mop &= ~MO_AMASK;
    }
    tcg_temp_free_i32(tmp);
    tcg_temp_free_i32(addr);

    gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << size) * nregs);

    return true;
}
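
/*
 * For example, VLD1.32 {d0[], d1[]}, [r0] has t == 1 and nregs == 1:
 * a single 32-bit load is replicated across both d0 and d1
 * (vec_size == 16). VLD4.8 {d0[], d2[], d4[], d6[]}, [r0] instead uses
 * t == 1 as a register stride of 2 between its four destinations.
 */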

static bool trans_VLDST_single(DisasContext *s, arg_VLDST_single *a)
{
    /* Neon load/store single structure to one lane */
    int reg;
    int nregs = a->n + 1;
    int vd = a->vd;
    TCGv_i32 addr, tmp;
    MemOp mop;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist */
    if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
        return false;
    }

    /* Catch the UNDEF cases. This is unavoidably a bit messy. */
    switch (nregs) {
    case 1:
        if (((a->align & (1 << a->size)) != 0) ||
            (a->size == 2 && (a->align == 1 || a->align == 2))) {
            return false;
        }
        break;
    case 3:
        if ((a->align & 1) != 0) {
            return false;
        }
        /* fall through */
    case 2:
        if (a->size == 2 && (a->align & 2) != 0) {
            return false;
        }
        break;
    case 4:
        if (a->size == 2 && a->align == 3) {
            return false;
        }
        break;
    default:
        abort();
    }
    if ((vd + a->stride * (nregs - 1)) > 31) {
        /*
         * Attempts to write off the end of the register file are
         * UNPREDICTABLE; we choose to UNDEF because otherwise we would
         * access off the end of the array that holds the register data.
         */
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /* Pick up SCTLR settings */
    mop = finalize_memop(s, a->size);

    if (a->align) {
        MemOp align_op;

        switch (nregs) {
        case 1:
            /* For VLD1, use natural alignment. */
            align_op = MO_ALIGN;
            break;
        case 2:
            /* For VLD2, use double alignment. */
            align_op = pow2_align(a->size + 1);
            break;
        case 4:
            if (a->size == MO_32) {
                /*
                 * For VLD4.32, align = 1 is double alignment, align = 2 is
                 * quad alignment; align = 3 is rejected above.
                 */
                align_op = pow2_align(a->size + a->align);
            } else {
                /* For VLD4.8 and VLD4.16, we want quad alignment. */
                align_op = pow2_align(a->size + 2);
            }
            break;
        default:
            /* For VLD3, the alignment field is zero and rejected above. */
            g_assert_not_reached();
        }

        mop = (mop & ~MO_AMASK) | align_op;
    }

    tmp = tcg_temp_new_i32();
    addr = tcg_temp_new_i32();
    load_reg_var(s, addr, a->rn);

    for (reg = 0; reg < nregs; reg++) {
        if (a->l) {
            gen_aa32_ld_internal_i32(s, tmp, addr, get_mem_index(s), mop);
            neon_store_element(vd, a->reg_idx, a->size, tmp);
        } else { /* Store */
            neon_load_element(tmp, vd, a->reg_idx, a->size);
            gen_aa32_st_internal_i32(s, tmp, addr, get_mem_index(s), mop);
        }
        vd += a->stride;
        tcg_gen_addi_i32(addr, addr, 1 << a->size);

        /* Subsequent memory operations inherit alignment */
        mop &= ~MO_AMASK;
    }
    tcg_temp_free_i32(addr);
    tcg_temp_free_i32(tmp);

    gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << a->size) * nregs);

    return true;
}

static bool do_3same(DisasContext *s, arg_3same *a, GVecGen3Fn fn)
{
    int vec_size = a->q ? 16 : 8;
    int rd_ofs = neon_full_reg_offset(a->vd);
    int rn_ofs = neon_full_reg_offset(a->vn);
    int rm_ofs = neon_full_reg_offset(a->vm);

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vn | a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    fn(a->size, rd_ofs, rn_ofs, rm_ofs, vec_size, vec_size);
    return true;
}

#define DO_3SAME(INSN, FUNC)                                            \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        return do_3same(s, a, FUNC);                                    \
    }

DO_3SAME(VADD, tcg_gen_gvec_add)
DO_3SAME(VSUB, tcg_gen_gvec_sub)
DO_3SAME(VAND, tcg_gen_gvec_and)
DO_3SAME(VBIC, tcg_gen_gvec_andc)
DO_3SAME(VORR, tcg_gen_gvec_or)
DO_3SAME(VORN, tcg_gen_gvec_orc)
DO_3SAME(VEOR, tcg_gen_gvec_xor)
DO_3SAME(VSHL_S, gen_gvec_sshl)
DO_3SAME(VSHL_U, gen_gvec_ushl)
DO_3SAME(VQADD_S, gen_gvec_sqadd_qc)
DO_3SAME(VQADD_U, gen_gvec_uqadd_qc)
DO_3SAME(VQSUB_S, gen_gvec_sqsub_qc)
DO_3SAME(VQSUB_U, gen_gvec_uqsub_qc)
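
/*
 * For instance, DO_3SAME(VADD, tcg_gen_gvec_add) above expands to:
 *
 *     static bool trans_VADD_3s(DisasContext *s, arg_3same *a)
 *     {
 *         return do_3same(s, a, tcg_gen_gvec_add);
 *     }
 */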

/* These insns are all gvec_bitsel but with the inputs in various orders. */
#define DO_3SAME_BITSEL(INSN, O1, O2, O3)                               \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        tcg_gen_gvec_bitsel(vece, rd_ofs, O1, O2, O3, oprsz, maxsz);    \
    }                                                                   \
    DO_3SAME(INSN, gen_##INSN##_3s)

DO_3SAME_BITSEL(VBSL, rd_ofs, rn_ofs, rm_ofs)
DO_3SAME_BITSEL(VBIT, rm_ofs, rn_ofs, rd_ofs)
DO_3SAME_BITSEL(VBIF, rm_ofs, rd_ofs, rn_ofs)

#define DO_3SAME_NO_SZ_3(INSN, FUNC)                                    \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (a->size == 3) {                                             \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, FUNC);                                    \
    }

DO_3SAME_NO_SZ_3(VMAX_S, tcg_gen_gvec_smax)
DO_3SAME_NO_SZ_3(VMAX_U, tcg_gen_gvec_umax)
DO_3SAME_NO_SZ_3(VMIN_S, tcg_gen_gvec_smin)
DO_3SAME_NO_SZ_3(VMIN_U, tcg_gen_gvec_umin)
DO_3SAME_NO_SZ_3(VMUL, tcg_gen_gvec_mul)
DO_3SAME_NO_SZ_3(VMLA, gen_gvec_mla)
DO_3SAME_NO_SZ_3(VMLS, gen_gvec_mls)
DO_3SAME_NO_SZ_3(VTST, gen_gvec_cmtst)
DO_3SAME_NO_SZ_3(VABD_S, gen_gvec_sabd)
DO_3SAME_NO_SZ_3(VABA_S, gen_gvec_saba)
DO_3SAME_NO_SZ_3(VABD_U, gen_gvec_uabd)
DO_3SAME_NO_SZ_3(VABA_U, gen_gvec_uaba)

#define DO_3SAME_CMP(INSN, COND)                                        \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        tcg_gen_gvec_cmp(COND, vece, rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz); \
    }                                                                   \
    DO_3SAME_NO_SZ_3(INSN, gen_##INSN##_3s)

DO_3SAME_CMP(VCGT_S, TCG_COND_GT)
DO_3SAME_CMP(VCGT_U, TCG_COND_GTU)
DO_3SAME_CMP(VCGE_S, TCG_COND_GE)
DO_3SAME_CMP(VCGE_U, TCG_COND_GEU)
DO_3SAME_CMP(VCEQ, TCG_COND_EQ)

#define WRAP_OOL_FN(WRAPNAME, FUNC)                                        \
    static void WRAPNAME(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,  \
                         uint32_t rm_ofs, uint32_t oprsz, uint32_t maxsz)  \
    {                                                                      \
        tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, 0, FUNC); \
    }

WRAP_OOL_FN(gen_VMUL_p_3s, gen_helper_gvec_pmul_b)

static bool trans_VMUL_p_3s(DisasContext *s, arg_3same *a)
{
    if (a->size != 0) {
        return false;
    }
    return do_3same(s, a, gen_VMUL_p_3s);
}

#define DO_VQRDMLAH(INSN, FUNC)                                         \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (!dc_isar_feature(aa32_rdm, s)) {                            \
            return false;                                               \
        }                                                               \
        if (a->size != 1 && a->size != 2) {                             \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, FUNC);                                    \
    }

DO_VQRDMLAH(VQRDMLAH, gen_gvec_sqrdmlah_qc)
DO_VQRDMLAH(VQRDMLSH, gen_gvec_sqrdmlsh_qc)

#define DO_SHA1(NAME, FUNC)                                             \
    WRAP_OOL_FN(gen_##NAME##_3s, FUNC)                                  \
    static bool trans_##NAME##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (!dc_isar_feature(aa32_sha1, s)) {                           \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##NAME##_3s);                         \
    }

DO_SHA1(SHA1C, gen_helper_crypto_sha1c)
DO_SHA1(SHA1P, gen_helper_crypto_sha1p)
DO_SHA1(SHA1M, gen_helper_crypto_sha1m)
DO_SHA1(SHA1SU0, gen_helper_crypto_sha1su0)

#define DO_SHA2(NAME, FUNC)                                             \
    WRAP_OOL_FN(gen_##NAME##_3s, FUNC)                                  \
    static bool trans_##NAME##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (!dc_isar_feature(aa32_sha2, s)) {                           \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##NAME##_3s);                         \
    }

DO_SHA2(SHA256H, gen_helper_crypto_sha256h)
DO_SHA2(SHA256H2, gen_helper_crypto_sha256h2)
DO_SHA2(SHA256SU1, gen_helper_crypto_sha256su1)

#define DO_3SAME_64(INSN, FUNC)                                         \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        static const GVecGen3 op = { .fni8 = FUNC };                    \
        tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &op);      \
    }                                                                   \
    DO_3SAME(INSN, gen_##INSN##_3s)

#define DO_3SAME_64_ENV(INSN, FUNC)                                     \
    static void gen_##INSN##_elt(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m)    \
    {                                                                   \
        FUNC(d, cpu_env, n, m);                                         \
    }                                                                   \
    DO_3SAME_64(INSN, gen_##INSN##_elt)

DO_3SAME_64(VRSHL_S64, gen_helper_neon_rshl_s64)
DO_3SAME_64(VRSHL_U64, gen_helper_neon_rshl_u64)
DO_3SAME_64_ENV(VQSHL_S64, gen_helper_neon_qshl_s64)
DO_3SAME_64_ENV(VQSHL_U64, gen_helper_neon_qshl_u64)
DO_3SAME_64_ENV(VQRSHL_S64, gen_helper_neon_qrshl_s64)
DO_3SAME_64_ENV(VQRSHL_U64, gen_helper_neon_qrshl_u64)

#define DO_3SAME_32(INSN, FUNC)                                         \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        static const GVecGen3 ops[4] = {                                \
            { .fni4 = gen_helper_neon_##FUNC##8 },                      \
            { .fni4 = gen_helper_neon_##FUNC##16 },                     \
            { .fni4 = gen_helper_neon_##FUNC##32 },                     \
            { 0 },                                                      \
        };                                                              \
        tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece]); \
    }                                                                   \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (a->size > 2) {                                              \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##INSN##_3s);                         \
    }

/*
 * Some helper functions need to be passed the cpu_env. In order
 * to use those with the gvec APIs like tcg_gen_gvec_3() we need
 * to create wrapper functions whose prototype is a NeonGenTwoOpFn()
 * and which call a NeonGenTwoOpEnvFn().
 */
#define WRAP_ENV_FN(WRAPNAME, FUNC)                                     \
    static void WRAPNAME(TCGv_i32 d, TCGv_i32 n, TCGv_i32 m)            \
    {                                                                   \
        FUNC(d, cpu_env, n, m);                                         \
    }

#define DO_3SAME_32_ENV(INSN, FUNC)                                     \
    WRAP_ENV_FN(gen_##INSN##_tramp8, gen_helper_neon_##FUNC##8);        \
    WRAP_ENV_FN(gen_##INSN##_tramp16, gen_helper_neon_##FUNC##16);      \
    WRAP_ENV_FN(gen_##INSN##_tramp32, gen_helper_neon_##FUNC##32);      \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        static const GVecGen3 ops[4] = {                                \
            { .fni4 = gen_##INSN##_tramp8 },                            \
            { .fni4 = gen_##INSN##_tramp16 },                           \
            { .fni4 = gen_##INSN##_tramp32 },                           \
            { 0 },                                                      \
        };                                                              \
        tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece]); \
    }                                                                   \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (a->size > 2) {                                              \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##INSN##_3s);                         \
    }

DO_3SAME_32(VHADD_S, hadd_s)
DO_3SAME_32(VHADD_U, hadd_u)
DO_3SAME_32(VHSUB_S, hsub_s)
DO_3SAME_32(VHSUB_U, hsub_u)
DO_3SAME_32(VRHADD_S, rhadd_s)
DO_3SAME_32(VRHADD_U, rhadd_u)
DO_3SAME_32(VRSHL_S, rshl_s)
DO_3SAME_32(VRSHL_U, rshl_u)

DO_3SAME_32_ENV(VQSHL_S, qshl_s)
DO_3SAME_32_ENV(VQSHL_U, qshl_u)
DO_3SAME_32_ENV(VQRSHL_S, qrshl_s)
DO_3SAME_32_ENV(VQRSHL_U, qrshl_u)

static bool do_3same_pair(DisasContext *s, arg_3same *a, NeonGenTwoOpFn *fn)
{
    /* Operations handled pairwise 32 bits at a time */
    TCGv_i32 tmp, tmp2, tmp3;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if (a->size == 3) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    assert(a->q == 0); /* enforced by decode patterns */

    /*
     * Note that we have to be careful not to clobber the source operands
     * in the "vm == vd" case by storing the result of the first pass too
     * early. Since Q is 0 there are always just two passes, so instead
     * of a complicated loop over each pass we just unroll.
     */
    tmp = tcg_temp_new_i32();
    tmp2 = tcg_temp_new_i32();
    tmp3 = tcg_temp_new_i32();

    read_neon_element32(tmp, a->vn, 0, MO_32);
    read_neon_element32(tmp2, a->vn, 1, MO_32);
    fn(tmp, tmp, tmp2);

    read_neon_element32(tmp3, a->vm, 0, MO_32);
    read_neon_element32(tmp2, a->vm, 1, MO_32);
    fn(tmp3, tmp3, tmp2);

    write_neon_element32(tmp, a->vd, 0, MO_32);
    write_neon_element32(tmp3, a->vd, 1, MO_32);

    tcg_temp_free_i32(tmp);
    tcg_temp_free_i32(tmp2);
    tcg_temp_free_i32(tmp3);
    return true;
}

#define DO_3SAME_PAIR(INSN, func)                                       \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        static NeonGenTwoOpFn * const fns[] = {                         \
            gen_helper_neon_##func##8,                                  \
            gen_helper_neon_##func##16,                                 \
            gen_helper_neon_##func##32,                                 \
        };                                                              \
        if (a->size > 2) {                                              \
            return false;                                               \
        }                                                               \
        return do_3same_pair(s, a, fns[a->size]);                       \
    }

/* 32-bit pairwise ops end up the same as the elementwise versions.  */
#define gen_helper_neon_pmax_s32  tcg_gen_smax_i32
#define gen_helper_neon_pmax_u32  tcg_gen_umax_i32
#define gen_helper_neon_pmin_s32  tcg_gen_smin_i32
#define gen_helper_neon_pmin_u32  tcg_gen_umin_i32
#define gen_helper_neon_padd_u32  tcg_gen_add_i32

DO_3SAME_PAIR(VPMAX_S, pmax_s)
DO_3SAME_PAIR(VPMIN_S, pmin_s)
DO_3SAME_PAIR(VPMAX_U, pmax_u)
DO_3SAME_PAIR(VPMIN_U, pmin_u)
DO_3SAME_PAIR(VPADD, padd_u)
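
/*
 * For example, VPADD.I16 d0, d1, d2 computes
 *   d0 = { d1[0]+d1[1], d1[2]+d1[3], d2[0]+d2[1], d2[2]+d2[3] }
 * so each of the two 32-bit passes in do_3same_pair() produces half
 * of the result.
 */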

#define DO_3SAME_VQDMULH(INSN, FUNC)                                    \
    WRAP_ENV_FN(gen_##INSN##_tramp16, gen_helper_neon_##FUNC##_s16);    \
    WRAP_ENV_FN(gen_##INSN##_tramp32, gen_helper_neon_##FUNC##_s32);    \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        static const GVecGen3 ops[2] = {                                \
            { .fni4 = gen_##INSN##_tramp16 },                           \
            { .fni4 = gen_##INSN##_tramp32 },                           \
        };                                                              \
        tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece - 1]); \
    }                                                                   \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (a->size != 1 && a->size != 2) {                             \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##INSN##_3s);                         \
    }

DO_3SAME_VQDMULH(VQDMULH, qdmulh)
DO_3SAME_VQDMULH(VQRDMULH, qrdmulh)

#define WRAP_FP_GVEC(WRAPNAME, FPST, FUNC)                              \
    static void WRAPNAME(unsigned vece, uint32_t rd_ofs,                \
                         uint32_t rn_ofs, uint32_t rm_ofs,              \
                         uint32_t oprsz, uint32_t maxsz)                \
    {                                                                   \
        TCGv_ptr fpst = fpstatus_ptr(FPST);                             \
        tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, fpst,                \
                           oprsz, maxsz, 0, FUNC);                      \
        tcg_temp_free_ptr(fpst);                                        \
    }

#define DO_3S_FP_GVEC(INSN,SFUNC,HFUNC)                                 \
    WRAP_FP_GVEC(gen_##INSN##_fp32_3s, FPST_STD, SFUNC)                 \
    WRAP_FP_GVEC(gen_##INSN##_fp16_3s, FPST_STD_F16, HFUNC)             \
    static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a)     \
    {                                                                   \
        if (a->size == MO_16) {                                         \
            if (!dc_isar_feature(aa32_fp16_arith, s)) {                 \
                return false;                                           \
            }                                                           \
            return do_3same(s, a, gen_##INSN##_fp16_3s);                \
        }                                                               \
        return do_3same(s, a, gen_##INSN##_fp32_3s);                    \
    }

DO_3S_FP_GVEC(VADD, gen_helper_gvec_fadd_s, gen_helper_gvec_fadd_h)
DO_3S_FP_GVEC(VSUB, gen_helper_gvec_fsub_s, gen_helper_gvec_fsub_h)
DO_3S_FP_GVEC(VABD, gen_helper_gvec_fabd_s, gen_helper_gvec_fabd_h)
DO_3S_FP_GVEC(VMUL, gen_helper_gvec_fmul_s, gen_helper_gvec_fmul_h)
DO_3S_FP_GVEC(VCEQ, gen_helper_gvec_fceq_s, gen_helper_gvec_fceq_h)
DO_3S_FP_GVEC(VCGE, gen_helper_gvec_fcge_s, gen_helper_gvec_fcge_h)
DO_3S_FP_GVEC(VCGT, gen_helper_gvec_fcgt_s, gen_helper_gvec_fcgt_h)
DO_3S_FP_GVEC(VACGE, gen_helper_gvec_facge_s, gen_helper_gvec_facge_h)
DO_3S_FP_GVEC(VACGT, gen_helper_gvec_facgt_s, gen_helper_gvec_facgt_h)
DO_3S_FP_GVEC(VMAX, gen_helper_gvec_fmax_s, gen_helper_gvec_fmax_h)
DO_3S_FP_GVEC(VMIN, gen_helper_gvec_fmin_s, gen_helper_gvec_fmin_h)
DO_3S_FP_GVEC(VMLA, gen_helper_gvec_fmla_s, gen_helper_gvec_fmla_h)
DO_3S_FP_GVEC(VMLS, gen_helper_gvec_fmls_s, gen_helper_gvec_fmls_h)
DO_3S_FP_GVEC(VFMA, gen_helper_gvec_vfma_s, gen_helper_gvec_vfma_h)
DO_3S_FP_GVEC(VFMS, gen_helper_gvec_vfms_s, gen_helper_gvec_vfms_h)
DO_3S_FP_GVEC(VRECPS, gen_helper_gvec_recps_nf_s, gen_helper_gvec_recps_nf_h)
DO_3S_FP_GVEC(VRSQRTS, gen_helper_gvec_rsqrts_nf_s, gen_helper_gvec_rsqrts_nf_h)

WRAP_FP_GVEC(gen_VMAXNM_fp32_3s, FPST_STD, gen_helper_gvec_fmaxnum_s)
WRAP_FP_GVEC(gen_VMAXNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fmaxnum_h)
WRAP_FP_GVEC(gen_VMINNM_fp32_3s, FPST_STD, gen_helper_gvec_fminnum_s)
WRAP_FP_GVEC(gen_VMINNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fminnum_h)

static bool trans_VMAXNM_fp_3s(DisasContext *s, arg_3same *a)
{
    if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
        return false;
    }

    if (a->size == MO_16) {
        if (!dc_isar_feature(aa32_fp16_arith, s)) {
            return false;
        }
        return do_3same(s, a, gen_VMAXNM_fp16_3s);
    }
    return do_3same(s, a, gen_VMAXNM_fp32_3s);
}

static bool trans_VMINNM_fp_3s(DisasContext *s, arg_3same *a)
{
    if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
        return false;
    }

    if (a->size == MO_16) {
        if (!dc_isar_feature(aa32_fp16_arith, s)) {
            return false;
        }
        return do_3same(s, a, gen_VMINNM_fp16_3s);
    }
    return do_3same(s, a, gen_VMINNM_fp32_3s);
}

static bool do_3same_fp_pair(DisasContext *s, arg_3same *a,
                             gen_helper_gvec_3_ptr *fn)
{
    /* FP pairwise operations */
    TCGv_ptr fpstatus;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    assert(a->q == 0); /* enforced by decode patterns */

    fpstatus = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD);
    tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(1, a->vn),
                       vfp_reg_offset(1, a->vm),
                       fpstatus, 8, 8, 0, fn);
    tcg_temp_free_ptr(fpstatus);

    return true;
}

/*
 * For all the functions using this macro, size == 1 means fp16,
 * which requires the FP16 arithmetic extension (checked via
 * aa32_fp16_arith below).
 */
#define DO_3S_FP_PAIR(INSN,FUNC)                                    \
    static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a) \
    {                                                               \
        if (a->size == MO_16) {                                     \
            if (!dc_isar_feature(aa32_fp16_arith, s)) {             \
                return false;                                       \
            }                                                       \
            return do_3same_fp_pair(s, a, FUNC##h);                 \
        }                                                           \
        return do_3same_fp_pair(s, a, FUNC##s);                     \
    }

DO_3S_FP_PAIR(VPADD, gen_helper_neon_padd)
DO_3S_FP_PAIR(VPMAX, gen_helper_neon_pmax)
DO_3S_FP_PAIR(VPMIN, gen_helper_neon_pmin)

static bool do_vector_2sh(DisasContext *s, arg_2reg_shift *a, GVecGen2iFn *fn)
{
    /* Handle a 2-reg-shift insn which can be vectorized. */
    int vec_size = a->q ? 16 : 8;
    int rd_ofs = neon_full_reg_offset(a->vd);
    int rm_ofs = neon_full_reg_offset(a->vm);

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    fn(a->size, rd_ofs, rm_ofs, a->shift, vec_size, vec_size);
    return true;
}

#define DO_2SH(INSN, FUNC)                                              \
    static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
    {                                                                   \
        return do_vector_2sh(s, a, FUNC);                               \
    }

DO_2SH(VSHL, tcg_gen_gvec_shli)
DO_2SH(VSLI, gen_gvec_sli)
DO_2SH(VSRI, gen_gvec_sri)
DO_2SH(VSRA_S, gen_gvec_ssra)
DO_2SH(VSRA_U, gen_gvec_usra)
DO_2SH(VRSHR_S, gen_gvec_srshr)
DO_2SH(VRSHR_U, gen_gvec_urshr)
DO_2SH(VRSRA_S, gen_gvec_srsra)
DO_2SH(VRSRA_U, gen_gvec_ursra)

static bool trans_VSHR_S_2sh(DisasContext *s, arg_2reg_shift *a)
{
    /* Signed shift out of range results in all-sign-bits */
    a->shift = MIN(a->shift, (8 << a->size) - 1);
    return do_vector_2sh(s, a, tcg_gen_gvec_sari);
}
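
/*
 * For example, VSHR.S8 d0, d1, #8 encodes shift == 8; clamping it to 7
 * still fills every lane with sign bits, which is the architected
 * result for a signed shift by the full element width.
 */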

static void gen_zero_rd_2sh(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                            int64_t shift, uint32_t oprsz, uint32_t maxsz)
{
    tcg_gen_gvec_dup_imm(vece, rd_ofs, oprsz, maxsz, 0);
}

static bool trans_VSHR_U_2sh(DisasContext *s, arg_2reg_shift *a)
{
    /* Shift out of range is architecturally valid and results in zero. */
    if (a->shift >= (8 << a->size)) {
        return do_vector_2sh(s, a, gen_zero_rd_2sh);
    } else {
        return do_vector_2sh(s, a, tcg_gen_gvec_shri);
    }
}

static bool do_2shift_env_64(DisasContext *s, arg_2reg_shift *a,
                             NeonGenTwo64OpEnvFn *fn)
{
    /*
     * 2-reg-and-shift operations, size == 3 case, where the
     * function needs to be passed cpu_env.
     */
    TCGv_i64 constimm;
    int pass;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /*
     * To avoid excessive duplication of ops we implement shift
     * by immediate using the variable shift operations.
     */
    constimm = tcg_const_i64(dup_const(a->size, a->shift));

    for (pass = 0; pass < a->q + 1; pass++) {
        TCGv_i64 tmp = tcg_temp_new_i64();

        read_neon_element64(tmp, a->vm, pass, MO_64);
        fn(tmp, cpu_env, tmp, constimm);
        write_neon_element64(tmp, a->vd, pass, MO_64);
        tcg_temp_free_i64(tmp);
    }
    tcg_temp_free_i64(constimm);
    return true;
}

static bool do_2shift_env_32(DisasContext *s, arg_2reg_shift *a,
                             NeonGenTwoOpEnvFn *fn)
{
    /*
     * 2-reg-and-shift operations, size < 3 case, where the
     * helper needs to be passed cpu_env.
     */
    TCGv_i32 constimm, tmp;
    int pass;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /*
     * To avoid excessive duplication of ops we implement shift
     * by immediate using the variable shift operations.
     */
    constimm = tcg_const_i32(dup_const(a->size, a->shift));
    tmp = tcg_temp_new_i32();

    for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
        read_neon_element32(tmp, a->vm, pass, MO_32);
        fn(tmp, cpu_env, tmp, constimm);
        write_neon_element32(tmp, a->vd, pass, MO_32);
    }
    tcg_temp_free_i32(tmp);
    tcg_temp_free_i32(constimm);
    return true;
}
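
/*
 * For example, with size == MO_8 and shift == 3, dup_const() replicates
 * the count into every byte lane (0x03030303 in the 32-bit case above),
 * so the variable-shift helper shifts each element by the same amount.
 */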
1406
1407#define DO_2SHIFT_ENV(INSN, FUNC)                                       \
1408    static bool trans_##INSN##_64_2sh(DisasContext *s, arg_2reg_shift *a) \
1409    {                                                                   \
1410        return do_2shift_env_64(s, a, gen_helper_neon_##FUNC##64);      \
1411    }                                                                   \
1412    static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1413    {                                                                   \
1414        static NeonGenTwoOpEnvFn * const fns[] = {                      \
1415            gen_helper_neon_##FUNC##8,                                  \
1416            gen_helper_neon_##FUNC##16,                                 \
1417            gen_helper_neon_##FUNC##32,                                 \
1418        };                                                              \
1419        assert(a->size < ARRAY_SIZE(fns));                              \
1420        return do_2shift_env_32(s, a, fns[a->size]);                    \
1421    }
1422
1423DO_2SHIFT_ENV(VQSHLU, qshlu_s)
1424DO_2SHIFT_ENV(VQSHL_U, qshl_u)
1425DO_2SHIFT_ENV(VQSHL_S, qshl_s)
1426
1427static bool do_2shift_narrow_64(DisasContext *s, arg_2reg_shift *a,
1428                                NeonGenTwo64OpFn *shiftfn,
1429                                NeonGenNarrowEnvFn *narrowfn)
1430{
1431    /* 2-reg-and-shift narrowing-shift operations, size == 3 case */
1432    TCGv_i64 constimm, rm1, rm2;
1433    TCGv_i32 rd;
1434
1435    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1436        return false;
1437    }
1438
1439    /* UNDEF accesses to D16-D31 if they don't exist. */
1440    if (!dc_isar_feature(aa32_simd_r32, s) &&
1441        ((a->vd | a->vm) & 0x10)) {
1442        return false;
1443    }
1444
1445    if (a->vm & 1) {
1446        return false;
1447    }
1448
1449    if (!vfp_access_check(s)) {
1450        return true;
1451    }
1452
1453    /*
1454     * This is always a right shift, and the shiftfn is always a
1455     * left-shift helper, which thus needs the negated shift count.
1456     */
1457    constimm = tcg_const_i64(-a->shift);
1458    rm1 = tcg_temp_new_i64();
1459    rm2 = tcg_temp_new_i64();
1460    rd = tcg_temp_new_i32();
1461
1462    /* Load both inputs first to avoid potential overwrite if rm == rd */
1463    read_neon_element64(rm1, a->vm, 0, MO_64);
1464    read_neon_element64(rm2, a->vm, 1, MO_64);
1465
1466    shiftfn(rm1, rm1, constimm);
1467    narrowfn(rd, cpu_env, rm1);
1468    write_neon_element32(rd, a->vd, 0, MO_32);
1469
1470    shiftfn(rm2, rm2, constimm);
1471    narrowfn(rd, cpu_env, rm2);
1472    write_neon_element32(rd, a->vd, 1, MO_32);
1473
1474    tcg_temp_free_i32(rd);
1475    tcg_temp_free_i64(rm1);
1476    tcg_temp_free_i64(rm2);
1477    tcg_temp_free_i64(constimm);
1478
1479    return true;
1480}
1481
1482static bool do_2shift_narrow_32(DisasContext *s, arg_2reg_shift *a,
1483                                NeonGenTwoOpFn *shiftfn,
1484                                NeonGenNarrowEnvFn *narrowfn)
1485{
1486    /* 2-reg-and-shift narrowing-shift operations, size < 3 case */
1487    TCGv_i32 constimm, rm1, rm2, rm3, rm4;
1488    TCGv_i64 rtmp;
1489    uint32_t imm;
1490
1491    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1492        return false;
1493    }
1494
1495    /* UNDEF accesses to D16-D31 if they don't exist. */
1496    if (!dc_isar_feature(aa32_simd_r32, s) &&
1497        ((a->vd | a->vm) & 0x10)) {
1498        return false;
1499    }
1500
1501    if (a->vm & 1) {
1502        return false;
1503    }
1504
1505    if (!vfp_access_check(s)) {
1506        return true;
1507    }
1508
1509    /*
1510     * This is always a right shift, and the shiftfn is always a
1511     * left-shift helper, which thus needs the negated shift count
1512     * duplicated into each lane of the immediate value.
1513     */
1514    if (a->size == 1) {
1515        imm = (uint16_t)(-a->shift);
1516        imm |= imm << 16;
1517    } else {
1518        /* size == 2 */
1519        imm = -a->shift;
1520    }
1521    constimm = tcg_const_i32(imm);
1522
1523    /* Load all inputs first to avoid potential overwrite */
1524    rm1 = tcg_temp_new_i32();
1525    rm2 = tcg_temp_new_i32();
1526    rm3 = tcg_temp_new_i32();
1527    rm4 = tcg_temp_new_i32();
1528    read_neon_element32(rm1, a->vm, 0, MO_32);
1529    read_neon_element32(rm2, a->vm, 1, MO_32);
1530    read_neon_element32(rm3, a->vm, 2, MO_32);
1531    read_neon_element32(rm4, a->vm, 3, MO_32);
1532    rtmp = tcg_temp_new_i64();
1533
1534    shiftfn(rm1, rm1, constimm);
1535    shiftfn(rm2, rm2, constimm);
1536
1537    tcg_gen_concat_i32_i64(rtmp, rm1, rm2);
1538    tcg_temp_free_i32(rm2);
1539
1540    narrowfn(rm1, cpu_env, rtmp);
1541    write_neon_element32(rm1, a->vd, 0, MO_32);
1542    tcg_temp_free_i32(rm1);
1543
1544    shiftfn(rm3, rm3, constimm);
1545    shiftfn(rm4, rm4, constimm);
1546    tcg_temp_free_i32(constimm);
1547
1548    tcg_gen_concat_i32_i64(rtmp, rm3, rm4);
1549    tcg_temp_free_i32(rm4);
1550
1551    narrowfn(rm3, cpu_env, rtmp);
1552    tcg_temp_free_i64(rtmp);
1553    write_neon_element32(rm3, a->vd, 1, MO_32);
1554    tcg_temp_free_i32(rm3);
1555    return true;
1556}
1557
1558#define DO_2SN_64(INSN, FUNC, NARROWFUNC)                               \
1559    static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1560    {                                                                   \
1561        return do_2shift_narrow_64(s, a, FUNC, NARROWFUNC);             \
1562    }
1563#define DO_2SN_32(INSN, FUNC, NARROWFUNC)                               \
1564    static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1565    {                                                                   \
1566        return do_2shift_narrow_32(s, a, FUNC, NARROWFUNC);             \
1567    }
1568
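    /*
     * Thin wrappers that ignore the unused env argument, so that the
     * plain (non-saturating) narrowing ops fit the NeonGenNarrowEnvFn
     * signature shared with the saturating helpers used below.
     */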
1569static void gen_neon_narrow_u32(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
1570{
1571    tcg_gen_extrl_i64_i32(dest, src);
1572}
1573
1574static void gen_neon_narrow_u16(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
1575{
1576    gen_helper_neon_narrow_u16(dest, src);
1577}
1578
1579static void gen_neon_narrow_u8(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
1580{
1581    gen_helper_neon_narrow_u8(dest, src);
1582}
1583
1584DO_2SN_64(VSHRN_64, gen_ushl_i64, gen_neon_narrow_u32)
1585DO_2SN_32(VSHRN_32, gen_ushl_i32, gen_neon_narrow_u16)
1586DO_2SN_32(VSHRN_16, gen_helper_neon_shl_u16, gen_neon_narrow_u8)
1587
1588DO_2SN_64(VRSHRN_64, gen_helper_neon_rshl_u64, gen_neon_narrow_u32)
1589DO_2SN_32(VRSHRN_32, gen_helper_neon_rshl_u32, gen_neon_narrow_u16)
1590DO_2SN_32(VRSHRN_16, gen_helper_neon_rshl_u16, gen_neon_narrow_u8)
1591
1592DO_2SN_64(VQSHRUN_64, gen_sshl_i64, gen_helper_neon_unarrow_sat32)
1593DO_2SN_32(VQSHRUN_32, gen_sshl_i32, gen_helper_neon_unarrow_sat16)
1594DO_2SN_32(VQSHRUN_16, gen_helper_neon_shl_s16, gen_helper_neon_unarrow_sat8)
1595
1596DO_2SN_64(VQRSHRUN_64, gen_helper_neon_rshl_s64, gen_helper_neon_unarrow_sat32)
1597DO_2SN_32(VQRSHRUN_32, gen_helper_neon_rshl_s32, gen_helper_neon_unarrow_sat16)
1598DO_2SN_32(VQRSHRUN_16, gen_helper_neon_rshl_s16, gen_helper_neon_unarrow_sat8)
1599DO_2SN_64(VQSHRN_S64, gen_sshl_i64, gen_helper_neon_narrow_sat_s32)
1600DO_2SN_32(VQSHRN_S32, gen_sshl_i32, gen_helper_neon_narrow_sat_s16)
1601DO_2SN_32(VQSHRN_S16, gen_helper_neon_shl_s16, gen_helper_neon_narrow_sat_s8)
1602
1603DO_2SN_64(VQRSHRN_S64, gen_helper_neon_rshl_s64, gen_helper_neon_narrow_sat_s32)
1604DO_2SN_32(VQRSHRN_S32, gen_helper_neon_rshl_s32, gen_helper_neon_narrow_sat_s16)
1605DO_2SN_32(VQRSHRN_S16, gen_helper_neon_rshl_s16, gen_helper_neon_narrow_sat_s8)
1606
1607DO_2SN_64(VQSHRN_U64, gen_ushl_i64, gen_helper_neon_narrow_sat_u32)
1608DO_2SN_32(VQSHRN_U32, gen_ushl_i32, gen_helper_neon_narrow_sat_u16)
1609DO_2SN_32(VQSHRN_U16, gen_helper_neon_shl_u16, gen_helper_neon_narrow_sat_u8)
1610
1611DO_2SN_64(VQRSHRN_U64, gen_helper_neon_rshl_u64, gen_helper_neon_narrow_sat_u32)
1612DO_2SN_32(VQRSHRN_U32, gen_helper_neon_rshl_u32, gen_helper_neon_narrow_sat_u16)
1613DO_2SN_32(VQRSHRN_U16, gen_helper_neon_rshl_u16, gen_helper_neon_narrow_sat_u8)
1614
1615static bool do_vshll_2sh(DisasContext *s, arg_2reg_shift *a,
1616                         NeonGenWidenFn *widenfn, bool u)
1617{
1618    TCGv_i64 tmp;
1619    TCGv_i32 rm0, rm1;
1620    uint64_t widen_mask = 0;
1621
1622    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1623        return false;
1624    }
1625
1626    /* UNDEF accesses to D16-D31 if they don't exist. */
1627    if (!dc_isar_feature(aa32_simd_r32, s) &&
1628        ((a->vd | a->vm) & 0x10)) {
1629        return false;
1630    }
1631
1632    if (a->vd & 1) {
1633        return false;
1634    }
1635
1636    if (!vfp_access_check(s)) {
1637        return true;
1638    }
1639
1640    /*
1641     * This is a widen-and-shift operation. The shift is always less
1642     * than the width of the source type, so after widening the input
1643     * vector we can simply shift the whole 64-bit widened register,
1644     * and then clear the potential overflow bits resulting from left
1645     * bits of the narrow input appearing as right bits of the left
1646     * neighbour narrow input. Calculate a mask of bits to clear.
1647     */
1648    if ((a->shift != 0) && (a->size < 2 || u)) {
1649        int esize = 8 << a->size;
1650        widen_mask = MAKE_64BIT_MASK(0, esize);
1651        widen_mask >>= esize - a->shift;
1652        widen_mask = dup_const(a->size + 1, widen_mask);
1653    }
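        /*
         * Worked example (illustrative): size=1, shift=3 gives
         * esize=16, widen_mask = 0xffff >> 13 = 0x7, replicated per
         * 32-bit lane to 0x0000000700000007; the AND with ~widen_mask
         * then clears the three bits of each widened lane that slid
         * across from the extension bits of the lane below.
         */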
1654
1655    rm0 = tcg_temp_new_i32();
1656    rm1 = tcg_temp_new_i32();
1657    read_neon_element32(rm0, a->vm, 0, MO_32);
1658    read_neon_element32(rm1, a->vm, 1, MO_32);
1659    tmp = tcg_temp_new_i64();
1660
1661    widenfn(tmp, rm0);
1662    tcg_temp_free_i32(rm0);
1663    if (a->shift != 0) {
1664        tcg_gen_shli_i64(tmp, tmp, a->shift);
1665        tcg_gen_andi_i64(tmp, tmp, ~widen_mask);
1666    }
1667    write_neon_element64(tmp, a->vd, 0, MO_64);
1668
1669    widenfn(tmp, rm1);
1670    tcg_temp_free_i32(rm1);
1671    if (a->shift != 0) {
1672        tcg_gen_shli_i64(tmp, tmp, a->shift);
1673        tcg_gen_andi_i64(tmp, tmp, ~widen_mask);
1674    }
1675    write_neon_element64(tmp, a->vd, 1, MO_64);
1676    tcg_temp_free_i64(tmp);
1677    return true;
1678}
1679
1680static bool trans_VSHLL_S_2sh(DisasContext *s, arg_2reg_shift *a)
1681{
1682    static NeonGenWidenFn * const widenfn[] = {
1683        gen_helper_neon_widen_s8,
1684        gen_helper_neon_widen_s16,
1685        tcg_gen_ext_i32_i64,
1686    };
1687    return do_vshll_2sh(s, a, widenfn[a->size], false);
1688}
1689
1690static bool trans_VSHLL_U_2sh(DisasContext *s, arg_2reg_shift *a)
1691{
1692    static NeonGenWidenFn * const widenfn[] = {
1693        gen_helper_neon_widen_u8,
1694        gen_helper_neon_widen_u16,
1695        tcg_gen_extu_i32_i64,
1696    };
1697    return do_vshll_2sh(s, a, widenfn[a->size], true);
1698}
1699
1700static bool do_fp_2sh(DisasContext *s, arg_2reg_shift *a,
1701                      gen_helper_gvec_2_ptr *fn)
1702{
1703    /* FP operations in 2-reg-and-shift group */
1704    int vec_size = a->q ? 16 : 8;
1705    int rd_ofs = neon_full_reg_offset(a->vd);
1706    int rm_ofs = neon_full_reg_offset(a->vm);
1707    TCGv_ptr fpst;
1708
1709    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1710        return false;
1711    }
1712
1713    if (a->size == MO_16) {
1714        if (!dc_isar_feature(aa32_fp16_arith, s)) {
1715            return false;
1716        }
1717    }
1718
1719    /* UNDEF accesses to D16-D31 if they don't exist. */
1720    if (!dc_isar_feature(aa32_simd_r32, s) &&
1721        ((a->vd | a->vm) & 0x10)) {
1722        return false;
1723    }
1724
1725    if ((a->vm | a->vd) & a->q) {
1726        return false;
1727    }
1728
1729    if (!vfp_access_check(s)) {
1730        return true;
1731    }
1732
1733    fpst = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD);
1734    tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, vec_size, vec_size, a->shift, fn);
1735    tcg_temp_free_ptr(fpst);
1736    return true;
1737}
1738
1739#define DO_FP_2SH(INSN, FUNC)                                           \
1740    static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1741    {                                                                   \
1742        return do_fp_2sh(s, a, FUNC);                                   \
1743    }
1744
1745DO_FP_2SH(VCVT_SF, gen_helper_gvec_vcvt_sf)
1746DO_FP_2SH(VCVT_UF, gen_helper_gvec_vcvt_uf)
1747DO_FP_2SH(VCVT_FS, gen_helper_gvec_vcvt_fs)
1748DO_FP_2SH(VCVT_FU, gen_helper_gvec_vcvt_fu)
1749
1750DO_FP_2SH(VCVT_SH, gen_helper_gvec_vcvt_sh)
1751DO_FP_2SH(VCVT_UH, gen_helper_gvec_vcvt_uh)
1752DO_FP_2SH(VCVT_HS, gen_helper_gvec_vcvt_hs)
1753DO_FP_2SH(VCVT_HU, gen_helper_gvec_vcvt_hu)
1754
1755static bool do_1reg_imm(DisasContext *s, arg_1reg_imm *a,
1756                        GVecGen2iFn *fn)
1757{
1758    uint64_t imm;
1759    int reg_ofs, vec_size;
1760
1761    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1762        return false;
1763    }
1764
1765    /* UNDEF accesses to D16-D31 if they don't exist. */
1766    if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
1767        return false;
1768    }
1769
1770    if (a->vd & a->q) {
1771        return false;
1772    }
1773
1774    if (!vfp_access_check(s)) {
1775        return true;
1776    }
1777
1778    reg_ofs = neon_full_reg_offset(a->vd);
1779    vec_size = a->q ? 16 : 8;
1780    imm = asimd_imm_const(a->imm, a->cmode, a->op);
1781
1782    fn(MO_64, reg_ofs, reg_ofs, imm, vec_size, vec_size);
1783    return true;
1784}
1785
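    /*
     * asimd_imm_const has already replicated the immediate across the
     * 64-bit value, so the dup is always done at MO_64 and the vece
     * and aofs arguments are deliberately unused.
     */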
1786static void gen_VMOV_1r(unsigned vece, uint32_t dofs, uint32_t aofs,
1787                        int64_t c, uint32_t oprsz, uint32_t maxsz)
1788{
1789    tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, c);
1790}
1791
1792static bool trans_Vimm_1r(DisasContext *s, arg_1reg_imm *a)
1793{
1794    /* Handle decode of cmode/op here between VORR/VBIC/VMOV */
1795    GVecGen2iFn *fn;
1796
1797    if ((a->cmode & 1) && a->cmode < 12) {
1798        /* for op=1, the imm will be inverted, so BIC becomes AND. */
1799        fn = a->op ? tcg_gen_gvec_andi : tcg_gen_gvec_ori;
1800    } else {
1801        /* There is one unallocated cmode/op combination in this space */
1802        if (a->cmode == 15 && a->op == 1) {
1803            return false;
1804        }
1805        fn = gen_VMOV_1r;
1806    }
1807    return do_1reg_imm(s, a, fn);
1808}
1809
1810static bool do_prewiden_3d(DisasContext *s, arg_3diff *a,
1811                           NeonGenWidenFn *widenfn,
1812                           NeonGenTwo64OpFn *opfn,
1813                           int src1_mop, int src2_mop)
1814{
1815    /* 3-regs different lengths, prewidening case (VADDL/VSUBL/VADDW/VSUBW) */
1816    TCGv_i64 rn0_64, rn1_64, rm_64;
1817
1818    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1819        return false;
1820    }
1821
1822    /* UNDEF accesses to D16-D31 if they don't exist. */
1823    if (!dc_isar_feature(aa32_simd_r32, s) &&
1824        ((a->vd | a->vn | a->vm) & 0x10)) {
1825        return false;
1826    }
1827
1828    if (!opfn) {
1829        /* size == 3 case, which is an entirely different insn group */
1830        return false;
1831    }
1832
1833    if ((a->vd & 1) || (src1_mop == MO_Q && (a->vn & 1))) {
1834        return false;
1835    }
1836
1837    if (!vfp_access_check(s)) {
1838        return true;
1839    }
1840
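        /*
         * A non-negative src mop loads the 64-bit element directly:
         * MO_Q for the genuinely wide VADDW/VSUBW first operand, or a
         * (possibly sign-extending) 32-bit load for the size==2 case.
         * A negative mop marks a narrow input that needs widenfn.
         */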
1841    rn0_64 = tcg_temp_new_i64();
1842    rn1_64 = tcg_temp_new_i64();
1843    rm_64 = tcg_temp_new_i64();
1844
1845    if (src1_mop >= 0) {
1846        read_neon_element64(rn0_64, a->vn, 0, src1_mop);
1847    } else {
1848        TCGv_i32 tmp = tcg_temp_new_i32();
1849        read_neon_element32(tmp, a->vn, 0, MO_32);
1850        widenfn(rn0_64, tmp);
1851        tcg_temp_free_i32(tmp);
1852    }
1853    if (src2_mop >= 0) {
1854        read_neon_element64(rm_64, a->vm, 0, src2_mop);
1855    } else {
1856        TCGv_i32 tmp = tcg_temp_new_i32();
1857        read_neon_element32(tmp, a->vm, 0, MO_32);
1858        widenfn(rm_64, tmp);
1859        tcg_temp_free_i32(tmp);
1860    }
1861
1862    opfn(rn0_64, rn0_64, rm_64);
1863
1864    /*
1865     * Load second pass inputs before storing the first pass result, to
1866     * avoid incorrect results if a narrow input overlaps with the result.
1867     */
1868    if (src1_mop >= 0) {
1869        read_neon_element64(rn1_64, a->vn, 1, src1_mop);
1870    } else {
1871        TCGv_i32 tmp = tcg_temp_new_i32();
1872        read_neon_element32(tmp, a->vn, 1, MO_32);
1873        widenfn(rn1_64, tmp);
1874        tcg_temp_free_i32(tmp);
1875    }
1876    if (src2_mop >= 0) {
1877        read_neon_element64(rm_64, a->vm, 1, src2_mop);
1878    } else {
1879        TCGv_i32 tmp = tcg_temp_new_i32();
1880        read_neon_element32(tmp, a->vm, 1, MO_32);
1881        widenfn(rm_64, tmp);
1882        tcg_temp_free_i32(tmp);
1883    }
1884
1885    write_neon_element64(rn0_64, a->vd, 0, MO_64);
1886
1887    opfn(rn1_64, rn1_64, rm_64);
1888    write_neon_element64(rn1_64, a->vd, 1, MO_64);
1889
1890    tcg_temp_free_i64(rn0_64);
1891    tcg_temp_free_i64(rn1_64);
1892    tcg_temp_free_i64(rm_64);
1893
1894    return true;
1895}
1896
1897#define DO_PREWIDEN(INSN, S, OP, SRC1WIDE, SIGN)                        \
1898    static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
1899    {                                                                   \
1900        static NeonGenWidenFn * const widenfn[] = {                     \
1901            gen_helper_neon_widen_##S##8,                               \
1902            gen_helper_neon_widen_##S##16,                              \
1903            NULL, NULL,                                                 \
1904        };                                                              \
1905        static NeonGenTwo64OpFn * const addfn[] = {                     \
1906            gen_helper_neon_##OP##l_u16,                                \
1907            gen_helper_neon_##OP##l_u32,                                \
1908            tcg_gen_##OP##_i64,                                         \
1909            NULL,                                                       \
1910        };                                                              \
1911        int narrow_mop = a->size == MO_32 ? MO_32 | SIGN : -1;          \
1912        return do_prewiden_3d(s, a, widenfn[a->size], addfn[a->size],   \
1913                              SRC1WIDE ? MO_Q : narrow_mop,             \
1914                              narrow_mop);                              \
1915    }
1916
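    /*
     * Note that the addfn tables use the unsigned lane add/sub helpers
     * even for the signed insns: once the inputs have been
     * sign-extended, two's-complement addition and subtraction are the
     * same bitwise operation.
     */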
1917DO_PREWIDEN(VADDL_S, s, add, false, MO_SIGN)
1918DO_PREWIDEN(VADDL_U, u, add, false, 0)
1919DO_PREWIDEN(VSUBL_S, s, sub, false, MO_SIGN)
1920DO_PREWIDEN(VSUBL_U, u, sub, false, 0)
1921DO_PREWIDEN(VADDW_S, s, add, true, MO_SIGN)
1922DO_PREWIDEN(VADDW_U, u, add, true, 0)
1923DO_PREWIDEN(VSUBW_S, s, sub, true, MO_SIGN)
1924DO_PREWIDEN(VSUBW_U, u, sub, true, 0)
1925
1926static bool do_narrow_3d(DisasContext *s, arg_3diff *a,
1927                         NeonGenTwo64OpFn *opfn, NeonGenNarrowFn *narrowfn)
1928{
1929    /* 3-regs different lengths, narrowing (VADDHN/VSUBHN/VRADDHN/VRSUBHN) */
1930    TCGv_i64 rn_64, rm_64;
1931    TCGv_i32 rd0, rd1;
1932
1933    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1934        return false;
1935    }
1936
1937    /* UNDEF accesses to D16-D31 if they don't exist. */
1938    if (!dc_isar_feature(aa32_simd_r32, s) &&
1939        ((a->vd | a->vn | a->vm) & 0x10)) {
1940        return false;
1941    }
1942
1943    if (!opfn || !narrowfn) {
1944        /* size == 3 case, which is an entirely different insn group */
1945        return false;
1946    }
1947
1948    if ((a->vn | a->vm) & 1) {
1949        return false;
1950    }
1951
1952    if (!vfp_access_check(s)) {
1953        return true;
1954    }
1955
1956    rn_64 = tcg_temp_new_i64();
1957    rm_64 = tcg_temp_new_i64();
1958    rd0 = tcg_temp_new_i32();
1959    rd1 = tcg_temp_new_i32();
1960
1961    read_neon_element64(rn_64, a->vn, 0, MO_64);
1962    read_neon_element64(rm_64, a->vm, 0, MO_64);
1963
1964    opfn(rn_64, rn_64, rm_64);
1965
1966    narrowfn(rd0, rn_64);
1967
1968    read_neon_element64(rn_64, a->vn, 1, MO_64);
1969    read_neon_element64(rm_64, a->vm, 1, MO_64);
1970
1971    opfn(rn_64, rn_64, rm_64);
1972
1973    narrowfn(rd1, rn_64);
1974
1975    write_neon_element32(rd0, a->vd, 0, MO_32);
1976    write_neon_element32(rd1, a->vd, 1, MO_32);
1977
1978    tcg_temp_free_i32(rd0);
1979    tcg_temp_free_i32(rd1);
1980    tcg_temp_free_i64(rn_64);
1981    tcg_temp_free_i64(rm_64);
1982
1983    return true;
1984}
1985
1986#define DO_NARROW_3D(INSN, OP, NARROWTYPE, EXTOP)                       \
1987    static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
1988    {                                                                   \
1989        static NeonGenTwo64OpFn * const addfn[] = {                     \
1990            gen_helper_neon_##OP##l_u16,                                \
1991            gen_helper_neon_##OP##l_u32,                                \
1992            tcg_gen_##OP##_i64,                                         \
1993            NULL,                                                       \
1994        };                                                              \
1995        static NeonGenNarrowFn * const narrowfn[] = {                   \
1996            gen_helper_neon_##NARROWTYPE##_high_u8,                     \
1997            gen_helper_neon_##NARROWTYPE##_high_u16,                    \
1998            EXTOP,                                                      \
1999            NULL,                                                       \
2000        };                                                              \
2001        return do_narrow_3d(s, a, addfn[a->size], narrowfn[a->size]);   \
2002    }
2003
2004static void gen_narrow_round_high_u32(TCGv_i32 rd, TCGv_i64 rn)
2005{
2006    tcg_gen_addi_i64(rn, rn, 1u << 31);
2007    tcg_gen_extrh_i64_i32(rd, rn);
2008}
2009
2010DO_NARROW_3D(VADDHN, add, narrow, tcg_gen_extrh_i64_i32)
2011DO_NARROW_3D(VSUBHN, sub, narrow, tcg_gen_extrh_i64_i32)
2012DO_NARROW_3D(VRADDHN, add, narrow_round, gen_narrow_round_high_u32)
2013DO_NARROW_3D(VRSUBHN, sub, narrow_round, gen_narrow_round_high_u32)
2014
2015static bool do_long_3d(DisasContext *s, arg_3diff *a,
2016                       NeonGenTwoOpWidenFn *opfn,
2017                       NeonGenTwo64OpFn *accfn)
2018{
2019    /*
2020     * 3-regs different lengths, long operations.
2021     * These perform an operation on two inputs that returns a double-width
2022     * result, and then possibly perform an accumulation operation of
2023     * that result into the double-width destination.
2024     */
2025    TCGv_i64 rd0, rd1, tmp;
2026    TCGv_i32 rn, rm;
2027
2028    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2029        return false;
2030    }
2031
2032    /* UNDEF accesses to D16-D31 if they don't exist. */
2033    if (!dc_isar_feature(aa32_simd_r32, s) &&
2034        ((a->vd | a->vn | a->vm) & 0x10)) {
2035        return false;
2036    }
2037
2038    if (!opfn) {
2039        /* size == 3 case, which is an entirely different insn group */
2040        return false;
2041    }
2042
2043    if (a->vd & 1) {
2044        return false;
2045    }
2046
2047    if (!vfp_access_check(s)) {
2048        return true;
2049    }
2050
2051    rd0 = tcg_temp_new_i64();
2052    rd1 = tcg_temp_new_i64();
2053
2054    rn = tcg_temp_new_i32();
2055    rm = tcg_temp_new_i32();
2056    read_neon_element32(rn, a->vn, 0, MO_32);
2057    read_neon_element32(rm, a->vm, 0, MO_32);
2058    opfn(rd0, rn, rm);
2059
2060    read_neon_element32(rn, a->vn, 1, MO_32);
2061    read_neon_element32(rm, a->vm, 1, MO_32);
2062    opfn(rd1, rn, rm);
2063    tcg_temp_free_i32(rn);
2064    tcg_temp_free_i32(rm);
2065
2066    /* Don't store results until after all loads: they might overlap */
2067    if (accfn) {
2068        tmp = tcg_temp_new_i64();
2069        read_neon_element64(tmp, a->vd, 0, MO_64);
2070        accfn(rd0, tmp, rd0);
2071        read_neon_element64(tmp, a->vd, 1, MO_64);
2072        accfn(rd1, tmp, rd1);
2073        tcg_temp_free_i64(tmp);
2074    }
2075
2076    write_neon_element64(rd0, a->vd, 0, MO_64);
2077    write_neon_element64(rd1, a->vd, 1, MO_64);
2078    tcg_temp_free_i64(rd0);
2079    tcg_temp_free_i64(rd1);
2080
2081    return true;
2082}
2083
2084static bool trans_VABDL_S_3d(DisasContext *s, arg_3diff *a)
2085{
2086    static NeonGenTwoOpWidenFn * const opfn[] = {
2087        gen_helper_neon_abdl_s16,
2088        gen_helper_neon_abdl_s32,
2089        gen_helper_neon_abdl_s64,
2090        NULL,
2091    };
2092
2093    return do_long_3d(s, a, opfn[a->size], NULL);
2094}
2095
2096static bool trans_VABDL_U_3d(DisasContext *s, arg_3diff *a)
2097{
2098    static NeonGenTwoOpWidenFn * const opfn[] = {
2099        gen_helper_neon_abdl_u16,
2100        gen_helper_neon_abdl_u32,
2101        gen_helper_neon_abdl_u64,
2102        NULL,
2103    };
2104
2105    return do_long_3d(s, a, opfn[a->size], NULL);
2106}
2107
2108static bool trans_VABAL_S_3d(DisasContext *s, arg_3diff *a)
2109{
2110    static NeonGenTwoOpWidenFn * const opfn[] = {
2111        gen_helper_neon_abdl_s16,
2112        gen_helper_neon_abdl_s32,
2113        gen_helper_neon_abdl_s64,
2114        NULL,
2115    };
2116    static NeonGenTwo64OpFn * const addfn[] = {
2117        gen_helper_neon_addl_u16,
2118        gen_helper_neon_addl_u32,
2119        tcg_gen_add_i64,
2120        NULL,
2121    };
2122
2123    return do_long_3d(s, a, opfn[a->size], addfn[a->size]);
2124}
2125
2126static bool trans_VABAL_U_3d(DisasContext *s, arg_3diff *a)
2127{
2128    static NeonGenTwoOpWidenFn * const opfn[] = {
2129        gen_helper_neon_abdl_u16,
2130        gen_helper_neon_abdl_u32,
2131        gen_helper_neon_abdl_u64,
2132        NULL,
2133    };
2134    static NeonGenTwo64OpFn * const addfn[] = {
2135        gen_helper_neon_addl_u16,
2136        gen_helper_neon_addl_u32,
2137        tcg_gen_add_i64,
2138        NULL,
2139    };
2140
2141    return do_long_3d(s, a, opfn[a->size], addfn[a->size]);
2142}
2143
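    /*
     * Full 32x32->64 multiplies: compute low and high halves with
     * muls2/mulu2 and assemble them into the 64-bit result.
     */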
2144static void gen_mull_s32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2145{
2146    TCGv_i32 lo = tcg_temp_new_i32();
2147    TCGv_i32 hi = tcg_temp_new_i32();
2148
2149    tcg_gen_muls2_i32(lo, hi, rn, rm);
2150    tcg_gen_concat_i32_i64(rd, lo, hi);
2151
2152    tcg_temp_free_i32(lo);
2153    tcg_temp_free_i32(hi);
2154}
2155
2156static void gen_mull_u32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2157{
2158    TCGv_i32 lo = tcg_temp_new_i32();
2159    TCGv_i32 hi = tcg_temp_new_i32();
2160
2161    tcg_gen_mulu2_i32(lo, hi, rn, rm);
2162    tcg_gen_concat_i32_i64(rd, lo, hi);
2163
2164    tcg_temp_free_i32(lo);
2165    tcg_temp_free_i32(hi);
2166}
2167
2168static bool trans_VMULL_S_3d(DisasContext *s, arg_3diff *a)
2169{
2170    static NeonGenTwoOpWidenFn * const opfn[] = {
2171        gen_helper_neon_mull_s8,
2172        gen_helper_neon_mull_s16,
2173        gen_mull_s32,
2174        NULL,
2175    };
2176
2177    return do_long_3d(s, a, opfn[a->size], NULL);
2178}
2179
2180static bool trans_VMULL_U_3d(DisasContext *s, arg_3diff *a)
2181{
2182    static NeonGenTwoOpWidenFn * const opfn[] = {
2183        gen_helper_neon_mull_u8,
2184        gen_helper_neon_mull_u16,
2185        gen_mull_u32,
2186        NULL,
2187    };
2188
2189    return do_long_3d(s, a, opfn[a->size], NULL);
2190}
2191
2192#define DO_VMLAL(INSN,MULL,ACC)                                         \
2193    static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
2194    {                                                                   \
2195        static NeonGenTwoOpWidenFn * const opfn[] = {                   \
2196            gen_helper_neon_##MULL##8,                                  \
2197            gen_helper_neon_##MULL##16,                                 \
2198            gen_##MULL##32,                                             \
2199            NULL,                                                       \
2200        };                                                              \
2201        static NeonGenTwo64OpFn * const accfn[] = {                     \
2202            gen_helper_neon_##ACC##l_u16,                               \
2203            gen_helper_neon_##ACC##l_u32,                               \
2204            tcg_gen_##ACC##_i64,                                        \
2205            NULL,                                                       \
2206        };                                                              \
2207        return do_long_3d(s, a, opfn[a->size], accfn[a->size]);         \
2208    }
2209
2210DO_VMLAL(VMLAL_S,mull_s,add)
2211DO_VMLAL(VMLAL_U,mull_u,add)
2212DO_VMLAL(VMLSL_S,mull_s,sub)
2213DO_VMLAL(VMLSL_U,mull_u,sub)
2214
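    /*
     * VQDMULL: widening multiply, then pass the product twice to the
     * saturating-add helper, which doubles it and sets QC if the
     * doubling saturates.
     */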
2215static void gen_VQDMULL_16(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2216{
2217    gen_helper_neon_mull_s16(rd, rn, rm);
2218    gen_helper_neon_addl_saturate_s32(rd, cpu_env, rd, rd);
2219}
2220
2221static void gen_VQDMULL_32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2222{
2223    gen_mull_s32(rd, rn, rm);
2224    gen_helper_neon_addl_saturate_s64(rd, cpu_env, rd, rd);
2225}
2226
2227static bool trans_VQDMULL_3d(DisasContext *s, arg_3diff *a)
2228{
2229    static NeonGenTwoOpWidenFn * const opfn[] = {
2230        NULL,
2231        gen_VQDMULL_16,
2232        gen_VQDMULL_32,
2233        NULL,
2234    };
2235
2236    return do_long_3d(s, a, opfn[a->size], NULL);
2237}
2238
2239static void gen_VQDMLAL_acc_16(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2240{
2241    gen_helper_neon_addl_saturate_s32(rd, cpu_env, rn, rm);
2242}
2243
2244static void gen_VQDMLAL_acc_32(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2245{
2246    gen_helper_neon_addl_saturate_s64(rd, cpu_env, rn, rm);
2247}
2248
2249static bool trans_VQDMLAL_3d(DisasContext *s, arg_3diff *a)
2250{
2251    static NeonGenTwoOpWidenFn * const opfn[] = {
2252        NULL,
2253        gen_VQDMULL_16,
2254        gen_VQDMULL_32,
2255        NULL,
2256    };
2257    static NeonGenTwo64OpFn * const accfn[] = {
2258        NULL,
2259        gen_VQDMLAL_acc_16,
2260        gen_VQDMLAL_acc_32,
2261        NULL,
2262    };
2263
2264    return do_long_3d(s, a, opfn[a->size], accfn[a->size]);
2265}
2266
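    /*
     * VQDMLSL accumulation: negate the new product, then saturating-add
     * it into the accumulator, in effect a saturating subtract.
     */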
2267static void gen_VQDMLSL_acc_16(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2268{
2269    gen_helper_neon_negl_u32(rm, rm);
2270    gen_helper_neon_addl_saturate_s32(rd, cpu_env, rn, rm);
2271}
2272
2273static void gen_VQDMLSL_acc_32(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2274{
2275    tcg_gen_neg_i64(rm, rm);
2276    gen_helper_neon_addl_saturate_s64(rd, cpu_env, rn, rm);
2277}
2278
2279static bool trans_VQDMLSL_3d(DisasContext *s, arg_3diff *a)
2280{
2281    static NeonGenTwoOpWidenFn * const opfn[] = {
2282        NULL,
2283        gen_VQDMULL_16,
2284        gen_VQDMULL_32,
2285        NULL,
2286    };
2287    static NeonGenTwo64OpFn * const accfn[] = {
2288        NULL,
2289        gen_VQDMLSL_acc_16,
2290        gen_VQDMLSL_acc_32,
2291        NULL,
2292    };
2293
2294    return do_long_3d(s, a, opfn[a->size], accfn[a->size]);
2295}
2296
2297static bool trans_VMULL_P_3d(DisasContext *s, arg_3diff *a)
2298{
2299    gen_helper_gvec_3 *fn_gvec;
2300
2301    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2302        return false;
2303    }
2304
2305    /* UNDEF accesses to D16-D31 if they don't exist. */
2306    if (!dc_isar_feature(aa32_simd_r32, s) &&
2307        ((a->vd | a->vn | a->vm) & 0x10)) {
2308        return false;
2309    }
2310
2311    if (a->vd & 1) {
2312        return false;
2313    }
2314
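        /*
         * size 0 is the 8x8->16 polynomial multiply; size 2 is the
         * 64x64->128 VMULL.P64, gated on the aa32_pmull feature;
         * sizes 1 and 3 are unallocated encodings.
         */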
2315    switch (a->size) {
2316    case 0:
2317        fn_gvec = gen_helper_neon_pmull_h;
2318        break;
2319    case 2:
2320        if (!dc_isar_feature(aa32_pmull, s)) {
2321            return false;
2322        }
2323        fn_gvec = gen_helper_gvec_pmull_q;
2324        break;
2325    default:
2326        return false;
2327    }
2328
2329    if (!vfp_access_check(s)) {
2330        return true;
2331    }
2332
2333    tcg_gen_gvec_3_ool(neon_full_reg_offset(a->vd),
2334                       neon_full_reg_offset(a->vn),
2335                       neon_full_reg_offset(a->vm),
2336                       16, 16, 0, fn_gvec);
2337    return true;
2338}
2339
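    /*
     * Duplicate one 16-bit half of a 32-bit value into both halves,
     * used to splat a 16-bit scalar across a 32-bit working element.
     */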
2340static void gen_neon_dup_low16(TCGv_i32 var)
2341{
2342    TCGv_i32 tmp = tcg_temp_new_i32();
2343    tcg_gen_ext16u_i32(var, var);
2344    tcg_gen_shli_i32(tmp, var, 16);
2345    tcg_gen_or_i32(var, var, tmp);
2346    tcg_temp_free_i32(tmp);
2347}
2348
2349static void gen_neon_dup_high16(TCGv_i32 var)
2350{
2351    TCGv_i32 tmp = tcg_temp_new_i32();
2352    tcg_gen_andi_i32(var, var, 0xffff0000);
2353    tcg_gen_shri_i32(tmp, var, 16);
2354    tcg_gen_or_i32(var, var, tmp);
2355    tcg_temp_free_i32(tmp);
2356}
2357
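    /*
     * Fetch a scalar operand: reg is the 5-bit M:Vm value. For 16-bit
     * scalars, bits [2:0] select the D register, bit 4 the 32-bit
     * element and bit 3 the 16-bit half (which is then duplicated into
     * both halves); for 32-bit scalars, bits [3:0] select the register
     * and bit 4 the element.
     */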
2358static inline TCGv_i32 neon_get_scalar(int size, int reg)
2359{
2360    TCGv_i32 tmp = tcg_temp_new_i32();
2361    if (size == MO_16) {
2362        read_neon_element32(tmp, reg & 7, reg >> 4, MO_32);
2363        if (reg & 8) {
2364            gen_neon_dup_high16(tmp);
2365        } else {
2366            gen_neon_dup_low16(tmp);
2367        }
2368    } else {
2369        read_neon_element32(tmp, reg & 15, reg >> 4, MO_32);
2370    }
2371    return tmp;
2372}
2373
2374static bool do_2scalar(DisasContext *s, arg_2scalar *a,
2375                       NeonGenTwoOpFn *opfn, NeonGenTwoOpFn *accfn)
2376{
2377    /*
2378     * Two registers and a scalar: perform an operation between
2379     * the input elements and the scalar, and then possibly
2380     * perform an accumulation operation of that result into the
2381     * destination.
2382     */
2383    TCGv_i32 scalar, tmp;
2384    int pass;
2385
2386    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2387        return false;
2388    }
2389
2390    /* UNDEF accesses to D16-D31 if they don't exist. */
2391    if (!dc_isar_feature(aa32_simd_r32, s) &&
2392        ((a->vd | a->vn | a->vm) & 0x10)) {
2393        return false;
2394    }
2395
2396    if (!opfn) {
2397        /* Bad size (including size == 3, which is a different insn group) */
2398        return false;
2399    }
2400
2401    if (a->q && ((a->vd | a->vn) & 1)) {
2402        return false;
2403    }
2404
2405    if (!vfp_access_check(s)) {
2406        return true;
2407    }
2408
2409    scalar = neon_get_scalar(a->size, a->vm);
2410    tmp = tcg_temp_new_i32();
2411
2412    for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
2413        read_neon_element32(tmp, a->vn, pass, MO_32);
2414        opfn(tmp, tmp, scalar);
2415        if (accfn) {
2416            TCGv_i32 rd = tcg_temp_new_i32();
2417            read_neon_element32(rd, a->vd, pass, MO_32);
2418            accfn(tmp, rd, tmp);
2419            tcg_temp_free_i32(rd);
2420        }
2421        write_neon_element32(tmp, a->vd, pass, MO_32);
2422    }
2423    tcg_temp_free_i32(tmp);
2424    tcg_temp_free_i32(scalar);
2425    return true;
2426}
2427
2428static bool trans_VMUL_2sc(DisasContext *s, arg_2scalar *a)
2429{
2430    static NeonGenTwoOpFn * const opfn[] = {
2431        NULL,
2432        gen_helper_neon_mul_u16,
2433        tcg_gen_mul_i32,
2434        NULL,
2435    };
2436
2437    return do_2scalar(s, a, opfn[a->size], NULL);
2438}
2439
2440static bool trans_VMLA_2sc(DisasContext *s, arg_2scalar *a)
2441{
2442    static NeonGenTwoOpFn * const opfn[] = {
2443        NULL,
2444        gen_helper_neon_mul_u16,
2445        tcg_gen_mul_i32,
2446        NULL,
2447    };
2448    static NeonGenTwoOpFn * const accfn[] = {
2449        NULL,
2450        gen_helper_neon_add_u16,
2451        tcg_gen_add_i32,
2452        NULL,
2453    };
2454
2455    return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2456}
2457
2458static bool trans_VMLS_2sc(DisasContext *s, arg_2scalar *a)
2459{
2460    static NeonGenTwoOpFn * const opfn[] = {
2461        NULL,
2462        gen_helper_neon_mul_u16,
2463        tcg_gen_mul_i32,
2464        NULL,
2465    };
2466    static NeonGenTwoOpFn * const accfn[] = {
2467        NULL,
2468        gen_helper_neon_sub_u16,
2469        tcg_gen_sub_i32,
2470        NULL,
2471    };
2472
2473    return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2474}
2475
2476static bool do_2scalar_fp_vec(DisasContext *s, arg_2scalar *a,
2477                              gen_helper_gvec_3_ptr *fn)
2478{
2479    /* Two registers and a scalar, using gvec */
2480    int vec_size = a->q ? 16 : 8;
2481    int rd_ofs = neon_full_reg_offset(a->vd);
2482    int rn_ofs = neon_full_reg_offset(a->vn);
2483    int rm_ofs;
2484    int idx;
2485    TCGv_ptr fpstatus;
2486
2487    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2488        return false;
2489    }
2490
2491    /* UNDEF accesses to D16-D31 if they don't exist. */
2492    if (!dc_isar_feature(aa32_simd_r32, s) &&
2493        ((a->vd | a->vn | a->vm) & 0x10)) {
2494        return false;
2495    }
2496
2497    if (!fn) {
2498        /* Bad size (including size == 3, which is a different insn group) */
2499        return false;
2500    }
2501
2502    if (a->q && ((a->vd | a->vn) & 1)) {
2503        return false;
2504    }
2505
2506    if (!vfp_access_check(s)) {
2507        return true;
2508    }
2509
2510    /* a->vm is M:Vm, which encodes both register and index */
2511    idx = extract32(a->vm, a->size + 2, 2);
2512    a->vm = extract32(a->vm, 0, a->size + 2);
2513    rm_ofs = neon_full_reg_offset(a->vm);
2514
2515    fpstatus = fpstatus_ptr(a->size == 1 ? FPST_STD_F16 : FPST_STD);
2516    tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, fpstatus,
2517                       vec_size, vec_size, idx, fn);
2518    tcg_temp_free_ptr(fpstatus);
2519    return true;
2520}
2521
2522#define DO_VMUL_F_2sc(NAME, FUNC)                                       \
2523    static bool trans_##NAME##_F_2sc(DisasContext *s, arg_2scalar *a)   \
2524    {                                                                   \
2525        static gen_helper_gvec_3_ptr * const opfn[] = {                 \
2526            NULL,                                                       \
2527            gen_helper_##FUNC##_h,                                      \
2528            gen_helper_##FUNC##_s,                                      \
2529            NULL,                                                       \
2530        };                                                              \
2531        if (a->size == MO_16 && !dc_isar_feature(aa32_fp16_arith, s)) { \
2532            return false;                                               \
2533        }                                                               \
2534        return do_2scalar_fp_vec(s, a, opfn[a->size]);                  \
2535    }
2536
2537DO_VMUL_F_2sc(VMUL, gvec_fmul_idx)
2538DO_VMUL_F_2sc(VMLA, gvec_fmla_nf_idx)
2539DO_VMUL_F_2sc(VMLS, gvec_fmls_nf_idx)
2540
2541WRAP_ENV_FN(gen_VQDMULH_16, gen_helper_neon_qdmulh_s16)
2542WRAP_ENV_FN(gen_VQDMULH_32, gen_helper_neon_qdmulh_s32)
2543WRAP_ENV_FN(gen_VQRDMULH_16, gen_helper_neon_qrdmulh_s16)
2544WRAP_ENV_FN(gen_VQRDMULH_32, gen_helper_neon_qrdmulh_s32)
2545
2546static bool trans_VQDMULH_2sc(DisasContext *s, arg_2scalar *a)
2547{
2548    static NeonGenTwoOpFn * const opfn[] = {
2549        NULL,
2550        gen_VQDMULH_16,
2551        gen_VQDMULH_32,
2552        NULL,
2553    };
2554
2555    return do_2scalar(s, a, opfn[a->size], NULL);
2556}
2557
2558static bool trans_VQRDMULH_2sc(DisasContext *s, arg_2scalar *a)
2559{
2560    static NeonGenTwoOpFn * const opfn[] = {
2561        NULL,
2562        gen_VQRDMULH_16,
2563        gen_VQRDMULH_32,
2564        NULL,
2565    };
2566
2567    return do_2scalar(s, a, opfn[a->size], NULL);
2568}
2569
2570static bool do_vqrdmlah_2sc(DisasContext *s, arg_2scalar *a,
2571                            NeonGenThreeOpEnvFn *opfn)
2572{
2573    /*
2574     * VQRDMLAH/VQRDMLSH: this is like do_2scalar, but the opfn
2575     * performs a kind of fused op-then-accumulate using a helper
2576     * function that takes all of rd, rn and the scalar at once.
2577     */
2578    TCGv_i32 scalar, rn, rd;
2579    int pass;
2580
2581    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2582        return false;
2583    }
2584
2585    if (!dc_isar_feature(aa32_rdm, s)) {
2586        return false;
2587    }
2588
2589    /* UNDEF accesses to D16-D31 if they don't exist. */
2590    if (!dc_isar_feature(aa32_simd_r32, s) &&
2591        ((a->vd | a->vn | a->vm) & 0x10)) {
2592        return false;
2593    }
2594
2595    if (!opfn) {
2596        /* Bad size (including size == 3, which is a different insn group) */
2597        return false;
2598    }
2599
2600    if (a->q && ((a->vd | a->vn) & 1)) {
2601        return false;
2602    }
2603
2604    if (!vfp_access_check(s)) {
2605        return true;
2606    }
2607
2608    scalar = neon_get_scalar(a->size, a->vm);
2609    rn = tcg_temp_new_i32();
2610    rd = tcg_temp_new_i32();
2611
2612    for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
2613        read_neon_element32(rn, a->vn, pass, MO_32);
2614        read_neon_element32(rd, a->vd, pass, MO_32);
2615        opfn(rd, cpu_env, rn, scalar, rd);
2616        write_neon_element32(rd, a->vd, pass, MO_32);
2617    }
2618    tcg_temp_free_i32(rn);
2619    tcg_temp_free_i32(rd);
2620    tcg_temp_free_i32(scalar);
2621
2622    return true;
2623}
2624
2625static bool trans_VQRDMLAH_2sc(DisasContext *s, arg_2scalar *a)
2626{
2627    static NeonGenThreeOpEnvFn * const opfn[] = {
2628        NULL,
2629        gen_helper_neon_qrdmlah_s16,
2630        gen_helper_neon_qrdmlah_s32,
2631        NULL,
2632    };
2633    return do_vqrdmlah_2sc(s, a, opfn[a->size]);
2634}
2635
2636static bool trans_VQRDMLSH_2sc(DisasContext *s, arg_2scalar *a)
2637{
2638    static NeonGenThreeOpEnvFn * const opfn[] = {
2639        NULL,
2640        gen_helper_neon_qrdmlsh_s16,
2641        gen_helper_neon_qrdmlsh_s32,
2642        NULL,
2643    };
2644    return do_vqrdmlah_2sc(s, a, opfn[a->size]);
2645}
2646
2647static bool do_2scalar_long(DisasContext *s, arg_2scalar *a,
2648                            NeonGenTwoOpWidenFn *opfn,
2649                            NeonGenTwo64OpFn *accfn)
2650{
2651    /*
2652     * Two registers and a scalar, long operations: perform an
2653     * operation on the input elements and the scalar which produces
2654     * a double-width result, and then possibly perform an accumulation
2655     * operation of that result into the destination.
2656     */
2657    TCGv_i32 scalar, rn;
2658    TCGv_i64 rn0_64, rn1_64;
2659
2660    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2661        return false;
2662    }
2663
2664    /* UNDEF accesses to D16-D31 if they don't exist. */
2665    if (!dc_isar_feature(aa32_simd_r32, s) &&
2666        ((a->vd | a->vn | a->vm) & 0x10)) {
2667        return false;
2668    }
2669
2670    if (!opfn) {
2671        /* Bad size (including size == 3, which is a different insn group) */
2672        return false;
2673    }
2674
2675    if (a->vd & 1) {
2676        return false;
2677    }
2678
2679    if (!vfp_access_check(s)) {
2680        return true;
2681    }
2682
2683    scalar = neon_get_scalar(a->size, a->vm);
2684
2685    /* Load all inputs before writing any outputs, in case of overlap */
2686    rn = tcg_temp_new_i32();
2687    read_neon_element32(rn, a->vn, 0, MO_32);
2688    rn0_64 = tcg_temp_new_i64();
2689    opfn(rn0_64, rn, scalar);
2690
2691    read_neon_element32(rn, a->vn, 1, MO_32);
2692    rn1_64 = tcg_temp_new_i64();
2693    opfn(rn1_64, rn, scalar);
2694    tcg_temp_free_i32(rn);
2695    tcg_temp_free_i32(scalar);
2696
2697    if (accfn) {
2698        TCGv_i64 t64 = tcg_temp_new_i64();
2699        read_neon_element64(t64, a->vd, 0, MO_64);
2700        accfn(rn0_64, t64, rn0_64);
2701        read_neon_element64(t64, a->vd, 1, MO_64);
2702        accfn(rn1_64, t64, rn1_64);
2703        tcg_temp_free_i64(t64);
2704    }
2705
2706    write_neon_element64(rn0_64, a->vd, 0, MO_64);
2707    write_neon_element64(rn1_64, a->vd, 1, MO_64);
2708    tcg_temp_free_i64(rn0_64);
2709    tcg_temp_free_i64(rn1_64);
2710    return true;
2711}
2712
2713static bool trans_VMULL_S_2sc(DisasContext *s, arg_2scalar *a)
2714{
2715    static NeonGenTwoOpWidenFn * const opfn[] = {
2716        NULL,
2717        gen_helper_neon_mull_s16,
2718        gen_mull_s32,
2719        NULL,
2720    };
2721
2722    return do_2scalar_long(s, a, opfn[a->size], NULL);
2723}
2724
2725static bool trans_VMULL_U_2sc(DisasContext *s, arg_2scalar *a)
2726{
2727    static NeonGenTwoOpWidenFn * const opfn[] = {
2728        NULL,
2729        gen_helper_neon_mull_u16,
2730        gen_mull_u32,
2731        NULL,
2732    };
2733
2734    return do_2scalar_long(s, a, opfn[a->size], NULL);
2735}
2736
2737#define DO_VMLAL_2SC(INSN, MULL, ACC)                                   \
2738    static bool trans_##INSN##_2sc(DisasContext *s, arg_2scalar *a)     \
2739    {                                                                   \
2740        static NeonGenTwoOpWidenFn * const opfn[] = {                   \
2741            NULL,                                                       \
2742            gen_helper_neon_##MULL##16,                                 \
2743            gen_##MULL##32,                                             \
2744            NULL,                                                       \
2745        };                                                              \
2746        static NeonGenTwo64OpFn * const accfn[] = {                     \
2747            NULL,                                                       \
2748            gen_helper_neon_##ACC##l_u32,                               \
2749            tcg_gen_##ACC##_i64,                                        \
2750            NULL,                                                       \
2751        };                                                              \
2752        return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);    \
2753    }
2754
2755DO_VMLAL_2SC(VMLAL_S, mull_s, add)
2756DO_VMLAL_2SC(VMLAL_U, mull_u, add)
2757DO_VMLAL_2SC(VMLSL_S, mull_s, sub)
2758DO_VMLAL_2SC(VMLSL_U, mull_u, sub)
2759
2760static bool trans_VQDMULL_2sc(DisasContext *s, arg_2scalar *a)
2761{
2762    static NeonGenTwoOpWidenFn * const opfn[] = {
2763        NULL,
2764        gen_VQDMULL_16,
2765        gen_VQDMULL_32,
2766        NULL,
2767    };
2768
2769    return do_2scalar_long(s, a, opfn[a->size], NULL);
2770}
2771
2772static bool trans_VQDMLAL_2sc(DisasContext *s, arg_2scalar *a)
2773{
2774    static NeonGenTwoOpWidenFn * const opfn[] = {
2775        NULL,
2776        gen_VQDMULL_16,
2777        gen_VQDMULL_32,
2778        NULL,
2779    };
2780    static NeonGenTwo64OpFn * const accfn[] = {
2781        NULL,
2782        gen_VQDMLAL_acc_16,
2783        gen_VQDMLAL_acc_32,
2784        NULL,
2785    };
2786
2787    return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);
2788}
2789
2790static bool trans_VQDMLSL_2sc(DisasContext *s, arg_2scalar *a)
2791{
2792    static NeonGenTwoOpWidenFn * const opfn[] = {
2793        NULL,
2794        gen_VQDMULL_16,
2795        gen_VQDMULL_32,
2796        NULL,
2797    };
2798    static NeonGenTwo64OpFn * const accfn[] = {
2799        NULL,
2800        gen_VQDMLSL_acc_16,
2801        gen_VQDMLSL_acc_32,
2802        NULL,
2803    };
2804
2805    return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);
2806}
2807
2808static bool trans_VEXT(DisasContext *s, arg_VEXT *a)
2809{
2810    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2811        return false;
2812    }
2813
2814    /* UNDEF accesses to D16-D31 if they don't exist. */
2815    if (!dc_isar_feature(aa32_simd_r32, s) &&
2816        ((a->vd | a->vn | a->vm) & 0x10)) {
2817        return false;
2818    }
2819
2820    if ((a->vn | a->vm | a->vd) & a->q) {
2821        return false;
2822    }
2823
2824    if (a->imm > 7 && !a->q) {
2825        return false;
2826    }
2827
2828    if (!vfp_access_check(s)) {
2829        return true;
2830    }
2831
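        /*
         * VEXT yields bytes [imm .. imm+width-1] of the byte-wise
         * concatenation <Vm:Vn>; tcg_gen_extract2 at shift imm*8 is the
         * matching funnel shift for each 64-bit chunk of the result.
         */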
2832    if (!a->q) {
2833        /* Extract 64 bits from <Vm:Vn> */
2834        TCGv_i64 left, right, dest;
2835
2836        left = tcg_temp_new_i64();
2837        right = tcg_temp_new_i64();
2838        dest = tcg_temp_new_i64();
2839
2840        read_neon_element64(right, a->vn, 0, MO_64);
2841        read_neon_element64(left, a->vm, 0, MO_64);
2842        tcg_gen_extract2_i64(dest, right, left, a->imm * 8);
2843        write_neon_element64(dest, a->vd, 0, MO_64);
2844
2845        tcg_temp_free_i64(left);
2846        tcg_temp_free_i64(right);
2847        tcg_temp_free_i64(dest);
2848    } else {
2849        /* Extract 128 bits from <Vm+1:Vm:Vn+1:Vn> */
2850        TCGv_i64 left, middle, right, destleft, destright;
2851
2852        left = tcg_temp_new_i64();
2853        middle = tcg_temp_new_i64();
2854        right = tcg_temp_new_i64();
2855        destleft = tcg_temp_new_i64();
2856        destright = tcg_temp_new_i64();
2857
2858        if (a->imm < 8) {
2859            read_neon_element64(right, a->vn, 0, MO_64);
2860            read_neon_element64(middle, a->vn, 1, MO_64);
2861            tcg_gen_extract2_i64(destright, right, middle, a->imm * 8);
2862            read_neon_element64(left, a->vm, 0, MO_64);
2863            tcg_gen_extract2_i64(destleft, middle, left, a->imm * 8);
2864        } else {
2865            read_neon_element64(right, a->vn, 1, MO_64);
2866            read_neon_element64(middle, a->vm, 0, MO_64);
2867            tcg_gen_extract2_i64(destright, right, middle, (a->imm - 8) * 8);
2868            read_neon_element64(left, a->vm, 1, MO_64);
2869            tcg_gen_extract2_i64(destleft, middle, left, (a->imm - 8) * 8);
2870        }
2871
2872        write_neon_element64(destright, a->vd, 0, MO_64);
2873        write_neon_element64(destleft, a->vd, 1, MO_64);
2874
2875        tcg_temp_free_i64(destright);
2876        tcg_temp_free_i64(destleft);
2877        tcg_temp_free_i64(right);
2878        tcg_temp_free_i64(middle);
2879        tcg_temp_free_i64(left);
2880    }
2881    return true;
2882}
2883
2884static bool trans_VTBL(DisasContext *s, arg_VTBL *a)
2885{
2886    TCGv_i64 val, def;
2887    TCGv_i32 desc;
2888
2889    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2890        return false;
2891    }
2892
2893    /* UNDEF accesses to D16-D31 if they don't exist. */
2894    if (!dc_isar_feature(aa32_simd_r32, s) &&
2895        ((a->vd | a->vn | a->vm) & 0x10)) {
2896        return false;
2897    }
2898
2899    if ((a->vn + a->len + 1) > 32) {
2900        /*
2901         * This is UNPREDICTABLE; we choose to UNDEF to avoid the
2902         * helper function running off the end of the register file.
2903         */
2904        return false;
2905    }
2906
2907    if (!vfp_access_check(s)) {
2908        return true;
2909    }
2910
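        /*
         * The helper's descriptor packs the first table register index
         * (vn << 2) with the len field (table size minus one).
         */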
2911    desc = tcg_const_i32((a->vn << 2) | a->len);
2912    def = tcg_temp_new_i64();
2913    if (a->op) {
2914        read_neon_element64(def, a->vd, 0, MO_64);
2915    } else {
2916        tcg_gen_movi_i64(def, 0);
2917    }
2918    val = tcg_temp_new_i64();
2919    read_neon_element64(val, a->vm, 0, MO_64);
2920
2921    gen_helper_neon_tbl(val, cpu_env, desc, val, def);
2922    write_neon_element64(val, a->vd, 0, MO_64);
2923
2924    tcg_temp_free_i64(def);
2925    tcg_temp_free_i64(val);
2926    tcg_temp_free_i32(desc);
2927    return true;
2928}
2929
2930static bool trans_VDUP_scalar(DisasContext *s, arg_VDUP_scalar *a)
2931{
2932    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2933        return false;
2934    }
2935
2936    /* UNDEF accesses to D16-D31 if they don't exist. */
2937    if (!dc_isar_feature(aa32_simd_r32, s) &&
2938        ((a->vd | a->vm) & 0x10)) {
2939        return false;
2940    }
2941
2942    if (a->vd & a->q) {
2943        return false;
2944    }
2945
2946    if (!vfp_access_check(s)) {
2947        return true;
2948    }
2949
2950    tcg_gen_gvec_dup_mem(a->size, neon_full_reg_offset(a->vd),
2951                         neon_element_offset(a->vm, a->index, a->size),
2952                         a->q ? 16 : 8, a->q ? 16 : 8);
2953    return true;
2954}
2955
2956static bool trans_VREV64(DisasContext *s, arg_VREV64 *a)
2957{
2958    int pass, half;
2959    TCGv_i32 tmp[2];
2960
2961    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2962        return false;
2963    }
2964
2965    /* UNDEF accesses to D16-D31 if they don't exist. */
2966    if (!dc_isar_feature(aa32_simd_r32, s) &&
2967        ((a->vd | a->vm) & 0x10)) {
2968        return false;
2969    }
2970
2971    if ((a->vd | a->vm) & a->q) {
2972        return false;
2973    }
2974
2975    if (a->size == 3) {
2976        return false;
2977    }
2978
2979    if (!vfp_access_check(s)) {
2980        return true;
2981    }
2982
2983    tmp[0] = tcg_temp_new_i32();
2984    tmp[1] = tcg_temp_new_i32();
2985
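        /*
         * Reverse each 64-bit lane: reversal within a 32-bit half is a
         * byte or halfword swap (or a no-op for 32-bit elements), after
         * which the two halves are stored exchanged.
         */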
2986    for (pass = 0; pass < (a->q ? 2 : 1); pass++) {
2987        for (half = 0; half < 2; half++) {
2988            read_neon_element32(tmp[half], a->vm, pass * 2 + half, MO_32);
2989            switch (a->size) {
2990            case 0:
2991                tcg_gen_bswap32_i32(tmp[half], tmp[half]);
2992                break;
2993            case 1:
2994                gen_swap_half(tmp[half], tmp[half]);
2995                break;
2996            case 2:
2997                break;
2998            default:
2999                g_assert_not_reached();
3000            }
3001        }
3002        write_neon_element32(tmp[1], a->vd, pass * 2, MO_32);
3003        write_neon_element32(tmp[0], a->vd, pass * 2 + 1, MO_32);
3004    }
3005
3006    tcg_temp_free_i32(tmp[0]);
3007    tcg_temp_free_i32(tmp[1]);
3008    return true;
3009}
3010
3011static bool do_2misc_pairwise(DisasContext *s, arg_2misc *a,
3012                              NeonGenWidenFn *widenfn,
3013                              NeonGenTwo64OpFn *opfn,
3014                              NeonGenTwo64OpFn *accfn)
3015{
3016    /*
3017     * Pairwise long operations: widen both halves of the pair,
3018     * combine the pairs with the opfn, and then possibly accumulate
3019     * into the destination with the accfn.
3020     */
3021    int pass;
3022
3023    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3024        return false;
3025    }
3026
3027    /* UNDEF accesses to D16-D31 if they don't exist. */
3028    if (!dc_isar_feature(aa32_simd_r32, s) &&
3029        ((a->vd | a->vm) & 0x10)) {
3030        return false;
3031    }
3032
3033    if ((a->vd | a->vm) & a->q) {
3034        return false;
3035    }
3036
3037    if (!widenfn) {
3038        return false;
3039    }
3040
3041    if (!vfp_access_check(s)) {
3042        return true;
3043    }
3044
3045    for (pass = 0; pass < a->q + 1; pass++) {
3046        TCGv_i32 tmp;
3047        TCGv_i64 rm0_64, rm1_64, rd_64;
3048
3049        rm0_64 = tcg_temp_new_i64();
3050        rm1_64 = tcg_temp_new_i64();
3051        rd_64 = tcg_temp_new_i64();
3052
3053        tmp = tcg_temp_new_i32();
3054        read_neon_element32(tmp, a->vm, pass * 2, MO_32);
3055        widenfn(rm0_64, tmp);
3056        read_neon_element32(tmp, a->vm, pass * 2 + 1, MO_32);
3057        widenfn(rm1_64, tmp);
3058        tcg_temp_free_i32(tmp);
3059
3060        opfn(rd_64, rm0_64, rm1_64);
3061        tcg_temp_free_i64(rm0_64);
3062        tcg_temp_free_i64(rm1_64);
3063
3064        if (accfn) {
3065            TCGv_i64 tmp64 = tcg_temp_new_i64();
3066            read_neon_element64(tmp64, a->vd, pass, MO_64);
3067            accfn(rd_64, tmp64, rd_64);
3068            tcg_temp_free_i64(tmp64);
3069        }
3070        write_neon_element64(rd_64, a->vd, pass, MO_64);
3071        tcg_temp_free_i64(rd_64);
3072    }
3073    return true;
3074}
3075
3076static bool trans_VPADDL_S(DisasContext *s, arg_2misc *a)
3077{
3078    static NeonGenWidenFn * const widenfn[] = {
3079        gen_helper_neon_widen_s8,
3080        gen_helper_neon_widen_s16,
3081        tcg_gen_ext_i32_i64,
3082        NULL,
3083    };
3084    static NeonGenTwo64OpFn * const opfn[] = {
3085        gen_helper_neon_paddl_u16,
3086        gen_helper_neon_paddl_u32,
3087        tcg_gen_add_i64,
3088        NULL,
3089    };
3090
3091    return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL);
3092}
3093
3094static bool trans_VPADDL_U(DisasContext *s, arg_2misc *a)
3095{
3096    static NeonGenWidenFn * const widenfn[] = {
3097        gen_helper_neon_widen_u8,
3098        gen_helper_neon_widen_u16,
3099        tcg_gen_extu_i32_i64,
3100        NULL,
3101    };
3102    static NeonGenTwo64OpFn * const opfn[] = {
3103        gen_helper_neon_paddl_u16,
3104        gen_helper_neon_paddl_u32,
3105        tcg_gen_add_i64,
3106        NULL,
3107    };
3108
3109    return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL);
3110}
3111
3112static bool trans_VPADAL_S(DisasContext *s, arg_2misc *a)
3113{
3114    static NeonGenWidenFn * const widenfn[] = {
3115        gen_helper_neon_widen_s8,
3116        gen_helper_neon_widen_s16,
3117        tcg_gen_ext_i32_i64,
3118        NULL,
3119    };
3120    static NeonGenTwo64OpFn * const opfn[] = {
3121        gen_helper_neon_paddl_u16,
3122        gen_helper_neon_paddl_u32,
3123        tcg_gen_add_i64,
3124        NULL,
3125    };
3126    static NeonGenTwo64OpFn * const accfn[] = {
3127        gen_helper_neon_addl_u16,
3128        gen_helper_neon_addl_u32,
3129        tcg_gen_add_i64,
3130        NULL,
3131    };
3132
3133    return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size],
3134                             accfn[a->size]);
3135}
3136
3137static bool trans_VPADAL_U(DisasContext *s, arg_2misc *a)
3138{
3139    static NeonGenWidenFn * const widenfn[] = {
3140        gen_helper_neon_widen_u8,
3141        gen_helper_neon_widen_u16,
3142        tcg_gen_extu_i32_i64,
3143        NULL,
3144    };
3145    static NeonGenTwo64OpFn * const opfn[] = {
3146        gen_helper_neon_paddl_u16,
3147        gen_helper_neon_paddl_u32,
3148        tcg_gen_add_i64,
3149        NULL,
3150    };
3151    static NeonGenTwo64OpFn * const accfn[] = {
3152        gen_helper_neon_addl_u16,
3153        gen_helper_neon_addl_u32,
3154        tcg_gen_add_i64,
3155        NULL,
3156    };
3157
3158    return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size],
3159                             accfn[a->size]);
3160}
3161
3162typedef void ZipFn(TCGv_ptr, TCGv_ptr);
3163
3164static bool do_zip_uzp(DisasContext *s, arg_2misc *a,
3165                       ZipFn *fn)
3166{
3167    TCGv_ptr pd, pm;
3168
3169    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3170        return false;
3171    }
3172
3173    /* UNDEF accesses to D16-D31 if they don't exist. */
3174    if (!dc_isar_feature(aa32_simd_r32, s) &&
3175        ((a->vd | a->vm) & 0x10)) {
3176        return false;
3177    }
3178
3179    if ((a->vd | a->vm) & a->q) {
3180        return false;
3181    }
3182
3183    if (!fn) {
3184        /* Bad size or size/q combination */
3185        return false;
3186    }
3187
3188    if (!vfp_access_check(s)) {
3189        return true;
3190    }
3191
3192    pd = vfp_reg_ptr(true, a->vd);
3193    pm = vfp_reg_ptr(true, a->vm);
3194    fn(pd, pm);
3195    tcg_temp_free_ptr(pd);
3196    tcg_temp_free_ptr(pm);
3197    return true;
3198}
3199
3200static bool trans_VUZP(DisasContext *s, arg_2misc *a)
3201{
3202    static ZipFn * const fn[2][4] = {
3203        {
3204            gen_helper_neon_unzip8,
3205            gen_helper_neon_unzip16,
3206            NULL,
3207            NULL,
3208        }, {
3209            gen_helper_neon_qunzip8,
3210            gen_helper_neon_qunzip16,
3211            gen_helper_neon_qunzip32,
3212            NULL,
3213        }
3214    };
3215    return do_zip_uzp(s, a, fn[a->q][a->size]);
3216}
3217
3218static bool trans_VZIP(DisasContext *s, arg_2misc *a)
3219{
3220    static ZipFn * const fn[2][4] = {
3221        {
3222            gen_helper_neon_zip8,
3223            gen_helper_neon_zip16,
3224            NULL,
3225            NULL,
3226        }, {
3227            gen_helper_neon_qzip8,
3228            gen_helper_neon_qzip16,
3229            gen_helper_neon_qzip32,
3230            NULL,
3231        }
3232    };
3233    return do_zip_uzp(s, a, fn[a->q][a->size]);
3234}
3235
3236static bool do_vmovn(DisasContext *s, arg_2misc *a,
3237                     NeonGenNarrowEnvFn *narrowfn)
3238{
3239    TCGv_i64 rm;
3240    TCGv_i32 rd0, rd1;
3241
3242    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3243        return false;
3244    }
3245
3246    /* UNDEF accesses to D16-D31 if they don't exist. */
3247    if (!dc_isar_feature(aa32_simd_r32, s) &&
3248        ((a->vd | a->vm) & 0x10)) {
3249        return false;
3250    }
3251
3252    if (a->vm & 1) {
3253        return false;
3254    }
3255
3256    if (!narrowfn) {
3257        return false;
3258    }
3259
3260    if (!vfp_access_check(s)) {
3261        return true;
3262    }
3263
3264    rm = tcg_temp_new_i64();
3265    rd0 = tcg_temp_new_i32();
3266    rd1 = tcg_temp_new_i32();
3267
3268    read_neon_element64(rm, a->vm, 0, MO_64);
3269    narrowfn(rd0, cpu_env, rm);
3270    read_neon_element64(rm, a->vm, 1, MO_64);
3271    narrowfn(rd1, cpu_env, rm);
3272    write_neon_element32(rd0, a->vd, 0, MO_32);
3273    write_neon_element32(rd1, a->vd, 1, MO_32);
3274    tcg_temp_free_i32(rd0);
3275    tcg_temp_free_i32(rd1);
3276    tcg_temp_free_i64(rm);
3277    return true;
3278}
3279
3280#define DO_VMOVN(INSN, FUNC)                                    \
3281    static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
3282    {                                                           \
3283        static NeonGenNarrowEnvFn * const narrowfn[] = {        \
3284            FUNC##8,                                            \
3285            FUNC##16,                                           \
3286            FUNC##32,                                           \
3287            NULL,                                               \
3288        };                                                      \
3289        return do_vmovn(s, a, narrowfn[a->size]);               \
3290    }
3291
3292DO_VMOVN(VMOVN, gen_neon_narrow_u)
3293DO_VMOVN(VQMOVUN, gen_helper_neon_unarrow_sat)
3294DO_VMOVN(VQMOVN_S, gen_helper_neon_narrow_sat_s)
3295DO_VMOVN(VQMOVN_U, gen_helper_neon_narrow_sat_u)
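
/*
 * All four narrowing ops share do_vmovn() and differ only in the
 * narrowfn: VMOVN truncates, VQMOVN_S/VQMOVN_U saturate to the
 * signed/unsigned range, and VQMOVUN saturates a signed input to
 * the unsigned range.  The cpu_env argument lets the saturating
 * helpers set the cumulative QC flag; the plain VMOVN narrowing
 * functions ignore it.
 */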
3296
3297static bool trans_VSHLL(DisasContext *s, arg_2misc *a)
3298{
3299    TCGv_i32 rm0, rm1;
3300    TCGv_i64 rd;
3301    static NeonGenWidenFn * const widenfns[] = {
3302        gen_helper_neon_widen_u8,
3303        gen_helper_neon_widen_u16,
3304        tcg_gen_extu_i32_i64,
3305        NULL,
3306    };
3307    NeonGenWidenFn *widenfn = widenfns[a->size];
3308
3309    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3310        return false;
3311    }
3312
3313    /* UNDEF accesses to D16-D31 if they don't exist. */
3314    if (!dc_isar_feature(aa32_simd_r32, s) &&
3315        ((a->vd | a->vm) & 0x10)) {
3316        return false;
3317    }
3318
3319    if (a->vd & 1) {
3320        return false;
3321    }
3322
3323    if (!widenfn) {
3324        return false;
3325    }
3326
3327    if (!vfp_access_check(s)) {
3328        return true;
3329    }
3330
3331    rd = tcg_temp_new_i64();
3332    rm0 = tcg_temp_new_i32();
3333    rm1 = tcg_temp_new_i32();
3334
3335    read_neon_element32(rm0, a->vm, 0, MO_32);
3336    read_neon_element32(rm1, a->vm, 1, MO_32);
3337
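    /* Dm is fully read; writing Qd is safe even if it overlaps Dm. */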
3338    widenfn(rd, rm0);
3339    tcg_gen_shli_i64(rd, rd, 8 << a->size);
3340    write_neon_element64(rd, a->vd, 0, MO_64);
3341    widenfn(rd, rm1);
3342    tcg_gen_shli_i64(rd, rd, 8 << a->size);
3343    write_neon_element64(rd, a->vd, 1, MO_64);
3344
3345    tcg_temp_free_i64(rd);
3346    tcg_temp_free_i32(rm0);
3347    tcg_temp_free_i32(rm1);
3348    return true;
3349}
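
/*
 * This is the 2-reg-misc form of VSHLL, where the shift count is
 * implicitly the source element width (8 << size bits).  A shift by
 * exactly the element width cannot be encoded in the shift-immediate
 * VSHLL form, which is decoded separately earlier in this file.
 */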
3350
3351static bool trans_VCVT_B16_F32(DisasContext *s, arg_2misc *a)
3352{
3353    TCGv_ptr fpst;
3354    TCGv_i64 tmp;
3355    TCGv_i32 dst0, dst1;
3356
3357    if (!dc_isar_feature(aa32_bf16, s)) {
3358        return false;
3359    }
3360
3361    /* UNDEF accesses to D16-D31 if they don't exist. */
3362    if (!dc_isar_feature(aa32_simd_r32, s) &&
3363        ((a->vd | a->vm) & 0x10)) {
3364        return false;
3365    }
3366
3367    if ((a->vm & 1) || (a->size != 1)) {
3368        return false;
3369    }
3370
3371    if (!vfp_access_check(s)) {
3372        return true;
3373    }
3374
3375    fpst = fpstatus_ptr(FPST_STD);
3376    tmp = tcg_temp_new_i64();
3377    dst0 = tcg_temp_new_i32();
3378    dst1 = tcg_temp_new_i32();
3379
3380    read_neon_element64(tmp, a->vm, 0, MO_64);
3381    gen_helper_bfcvt_pair(dst0, tmp, fpst);
3382
3383    read_neon_element64(tmp, a->vm, 1, MO_64);
3384    gen_helper_bfcvt_pair(dst1, tmp, fpst);
3385
3386    write_neon_element32(dst0, a->vd, 0, MO_32);
3387    write_neon_element32(dst1, a->vd, 1, MO_32);
3388
3389    tcg_temp_free_i64(tmp);
3390    tcg_temp_free_i32(dst0);
3391    tcg_temp_free_i32(dst1);
3392    tcg_temp_free_ptr(fpst);
3393    return true;
3394}
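
/*
 * gen_helper_bfcvt_pair() converts two single-precision inputs to a
 * pair of bfloat16 results packed into one 32-bit value, so two
 * calls handle all four elements of Qm.  Both halves of Qm are read
 * before Dd is written, as Dd may overlap Qm.
 */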
3395
3396static bool trans_VCVT_F16_F32(DisasContext *s, arg_2misc *a)
3397{
3398    TCGv_ptr fpst;
3399    TCGv_i32 ahp, tmp, tmp2, tmp3;
3400
3401    if (!arm_dc_feature(s, ARM_FEATURE_NEON) ||
3402        !dc_isar_feature(aa32_fp16_spconv, s)) {
3403        return false;
3404    }
3405
3406    /* UNDEF accesses to D16-D31 if they don't exist. */
3407    if (!dc_isar_feature(aa32_simd_r32, s) &&
3408        ((a->vd | a->vm) & 0x10)) {
3409        return false;
3410    }
3411
3412    if ((a->vm & 1) || (a->size != 1)) {
3413        return false;
3414    }
3415
3416    if (!vfp_access_check(s)) {
3417        return true;
3418    }
3419
3420    fpst = fpstatus_ptr(FPST_STD);
3421    ahp = get_ahp_flag();
3422    tmp = tcg_temp_new_i32();
3423    read_neon_element32(tmp, a->vm, 0, MO_32);
3424    gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp);
3425    tmp2 = tcg_temp_new_i32();
3426    read_neon_element32(tmp2, a->vm, 1, MO_32);
3427    gen_helper_vfp_fcvt_f32_to_f16(tmp2, tmp2, fpst, ahp);
3428    tcg_gen_shli_i32(tmp2, tmp2, 16);
3429    tcg_gen_or_i32(tmp2, tmp2, tmp);
3430    read_neon_element32(tmp, a->vm, 2, MO_32);
3431    gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp);
3432    tmp3 = tcg_temp_new_i32();
3433    read_neon_element32(tmp3, a->vm, 3, MO_32);
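    /* All of Qm is now read; writing Dd is safe even if it overlaps Qm. */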
3434    write_neon_element32(tmp2, a->vd, 0, MO_32);
3435    tcg_temp_free_i32(tmp2);
3436    gen_helper_vfp_fcvt_f32_to_f16(tmp3, tmp3, fpst, ahp);
3437    tcg_gen_shli_i32(tmp3, tmp3, 16);
3438    tcg_gen_or_i32(tmp3, tmp3, tmp);
3439    write_neon_element32(tmp3, a->vd, 1, MO_32);
3440    tcg_temp_free_i32(tmp3);
3441    tcg_temp_free_i32(tmp);
3442    tcg_temp_free_i32(ahp);
3443    tcg_temp_free_ptr(fpst);
3444
3445    return true;
3446}
3447
3448static bool trans_VCVT_F32_F16(DisasContext *s, arg_2misc *a)
3449{
3450    TCGv_ptr fpst;
3451    TCGv_i32 ahp, tmp, tmp2, tmp3;
3452
3453    if (!arm_dc_feature(s, ARM_FEATURE_NEON) ||
3454        !dc_isar_feature(aa32_fp16_spconv, s)) {
3455        return false;
3456    }
3457
3458    /* UNDEF accesses to D16-D31 if they don't exist. */
3459    if (!dc_isar_feature(aa32_simd_r32, s) &&
3460        ((a->vd | a->vm) & 0x10)) {
3461        return false;
3462    }
3463
3464    if ((a->vd & 1) || (a->size != 1)) {
3465        return false;
3466    }
3467
3468    if (!vfp_access_check(s)) {
3469        return true;
3470    }
3471
3472    fpst = fpstatus_ptr(FPST_STD);
3473    ahp = get_ahp_flag();
3474    tmp3 = tcg_temp_new_i32();
3475    tmp2 = tcg_temp_new_i32();
3476    tmp = tcg_temp_new_i32();
3477    read_neon_element32(tmp, a->vm, 0, MO_32);
3478    read_neon_element32(tmp2, a->vm, 1, MO_32);
3479    tcg_gen_ext16u_i32(tmp3, tmp);
3480    gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp);
3481    write_neon_element32(tmp3, a->vd, 0, MO_32);
3482    tcg_gen_shri_i32(tmp, tmp, 16);
3483    gen_helper_vfp_fcvt_f16_to_f32(tmp, tmp, fpst, ahp);
3484    write_neon_element32(tmp, a->vd, 1, MO_32);
3485    tcg_temp_free_i32(tmp);
3486    tcg_gen_ext16u_i32(tmp3, tmp2);
3487    gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp);
3488    write_neon_element32(tmp3, a->vd, 2, MO_32);
3489    tcg_temp_free_i32(tmp3);
3490    tcg_gen_shri_i32(tmp2, tmp2, 16);
3491    gen_helper_vfp_fcvt_f16_to_f32(tmp2, tmp2, fpst, ahp);
3492    write_neon_element32(tmp2, a->vd, 3, MO_32);
3493    tcg_temp_free_i32(tmp2);
3494    tcg_temp_free_i32(ahp);
3495    tcg_temp_free_ptr(fpst);
3496
3497    return true;
3498}
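
/*
 * Both half-precision conversions above use the Neon "standard
 * FPSCR" float_status (FPST_STD) but still honour FPSCR.AHP via
 * get_ahp_flag(), which selects between IEEE half-precision and the
 * Arm alternative half-precision format.
 */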
3499
3500static bool do_2misc_vec(DisasContext *s, arg_2misc *a, GVecGen2Fn *fn)
3501{
3502    int vec_size = a->q ? 16 : 8;
3503    int rd_ofs = neon_full_reg_offset(a->vd);
3504    int rm_ofs = neon_full_reg_offset(a->vm);
3505
3506    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3507        return false;
3508    }
3509
3510    /* UNDEF accesses to D16-D31 if they don't exist. */
3511    if (!dc_isar_feature(aa32_simd_r32, s) &&
3512        ((a->vd | a->vm) & 0x10)) {
3513        return false;
3514    }
3515
3516    if (a->size == 3) {
3517        return false;
3518    }
3519
3520    if ((a->vd | a->vm) & a->q) {
3521        return false;
3522    }
3523
3524    if (!vfp_access_check(s)) {
3525        return true;
3526    }
3527
3528    fn(a->size, rd_ofs, rm_ofs, vec_size, vec_size);
3529
3530    return true;
3531}
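
/*
 * do_2misc_vec() factors out the operand checks shared by the
 * whole-vector 2-reg-misc ops.  fn is a standard gvec expander, so
 * oprsz and maxsz are both the full register size in bytes: 8 for a
 * D register, 16 for a Q register.
 */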
3532
3533#define DO_2MISC_VEC(INSN, FN)                                  \
3534    static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
3535    {                                                           \
3536        return do_2misc_vec(s, a, FN);                          \
3537    }
3538
3539DO_2MISC_VEC(VNEG, tcg_gen_gvec_neg)
3540DO_2MISC_VEC(VABS, tcg_gen_gvec_abs)
3541DO_2MISC_VEC(VCEQ0, gen_gvec_ceq0)
3542DO_2MISC_VEC(VCGT0, gen_gvec_cgt0)
3543DO_2MISC_VEC(VCLE0, gen_gvec_cle0)
3544DO_2MISC_VEC(VCGE0, gen_gvec_cge0)
3545DO_2MISC_VEC(VCLT0, gen_gvec_clt0)
3546
3547static bool trans_VMVN(DisasContext *s, arg_2misc *a)
3548{
3549    if (a->size != 0) {
3550        return false;
3551    }
3552    return do_2misc_vec(s, a, tcg_gen_gvec_not);
3553}
3554
3555#define WRAP_2M_3_OOL_FN(WRAPNAME, FUNC, DATA)                          \
3556    static void WRAPNAME(unsigned vece, uint32_t rd_ofs,                \
3557                         uint32_t rm_ofs, uint32_t oprsz,               \
3558                         uint32_t maxsz)                                \
3559    {                                                                   \
3560        tcg_gen_gvec_3_ool(rd_ofs, rd_ofs, rm_ofs, oprsz, maxsz,        \
3561                           DATA, FUNC);                                 \
3562    }
3563
3564#define WRAP_2M_2_OOL_FN(WRAPNAME, FUNC, DATA)                          \
3565    static void WRAPNAME(unsigned vece, uint32_t rd_ofs,                \
3566                         uint32_t rm_ofs, uint32_t oprsz,               \
3567                         uint32_t maxsz)                                \
3568    {                                                                   \
3569        tcg_gen_gvec_2_ool(rd_ofs, rm_ofs, oprsz, maxsz, DATA, FUNC);   \
3570    }
3571
3572WRAP_2M_3_OOL_FN(gen_AESE, gen_helper_crypto_aese, 0)
3573WRAP_2M_3_OOL_FN(gen_AESD, gen_helper_crypto_aese, 1)
3574WRAP_2M_2_OOL_FN(gen_AESMC, gen_helper_crypto_aesmc, 0)
3575WRAP_2M_2_OOL_FN(gen_AESIMC, gen_helper_crypto_aesmc, 1)
3576WRAP_2M_2_OOL_FN(gen_SHA1H, gen_helper_crypto_sha1h, 0)
3577WRAP_2M_2_OOL_FN(gen_SHA1SU1, gen_helper_crypto_sha1su1, 0)
3578WRAP_2M_2_OOL_FN(gen_SHA256SU0, gen_helper_crypto_sha256su0, 0)
3579
3580#define DO_2M_CRYPTO(INSN, FEATURE, SIZE)                       \
3581    static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
3582    {                                                           \
3583        if (!dc_isar_feature(FEATURE, s) || a->size != SIZE) {  \
3584            return false;                                       \
3585        }                                                       \
3586        return do_2misc_vec(s, a, gen_##INSN);                  \
3587    }
3588
3589DO_2M_CRYPTO(AESE, aa32_aes, 0)
3590DO_2M_CRYPTO(AESD, aa32_aes, 0)
3591DO_2M_CRYPTO(AESMC, aa32_aes, 0)
3592DO_2M_CRYPTO(AESIMC, aa32_aes, 0)
3593DO_2M_CRYPTO(SHA1H, aa32_sha1, 2)
3594DO_2M_CRYPTO(SHA1SU1, aa32_sha1, 2)
3595DO_2M_CRYPTO(SHA256SU0, aa32_sha2, 2)
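
/*
 * AESD and AESIMC reuse the AESE/AESMC helpers: the DATA value
 * passed through tcg_gen_gvec_*_ool() becomes the flag that selects
 * the inverse (decrypt) transformation.  For the crypto ops the size
 * bits are a fixed part of the encoding rather than an element size,
 * so each pattern accepts exactly one SIZE value.
 */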
3596
3597static bool do_2misc(DisasContext *s, arg_2misc *a, NeonGenOneOpFn *fn)
3598{
3599    TCGv_i32 tmp;
3600    int pass;
3601
3602    /* Handle a 2-reg-misc operation by iterating 32 bits at a time */
3603    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3604        return false;
3605    }
3606
3607    /* UNDEF accesses to D16-D31 if they don't exist. */
3608    if (!dc_isar_feature(aa32_simd_r32, s) &&
3609        ((a->vd | a->vm) & 0x10)) {
3610        return false;
3611    }
3612
3613    if (!fn) {
3614        return false;
3615    }
3616
3617    if ((a->vd | a->vm) & a->q) {
3618        return false;
3619    }
3620
3621    if (!vfp_access_check(s)) {
3622        return true;
3623    }
3624
3625    tmp = tcg_temp_new_i32();
3626    for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
3627        read_neon_element32(tmp, a->vm, pass, MO_32);
3628        fn(tmp, tmp);
3629        write_neon_element32(tmp, a->vd, pass, MO_32);
3630    }
3631    tcg_temp_free_i32(tmp);
3632
3633    return true;
3634}
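
/*
 * do_2misc() iterates in 32-bit chunks: two passes for a D register,
 * four for a Q register.  Since fn is invoked as fn(tmp, tmp), any
 * function used here must allow its destination to alias its source.
 */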
3635
3636static bool trans_VREV32(DisasContext *s, arg_2misc *a)
3637{
3638    static NeonGenOneOpFn * const fn[] = {
3639        tcg_gen_bswap32_i32,
3640        gen_swap_half,
3641        NULL,
3642        NULL,
3643    };
3644    return do_2misc(s, a, fn[a->size]);
3645}
3646
3647static bool trans_VREV16(DisasContext *s, arg_2misc *a)
3648{
3649    if (a->size != 0) {
3650        return false;
3651    }
3652    return do_2misc(s, a, gen_rev16);
3653}
3654
3655static bool trans_VCLS(DisasContext *s, arg_2misc *a)
3656{
3657    static NeonGenOneOpFn * const fn[] = {
3658        gen_helper_neon_cls_s8,
3659        gen_helper_neon_cls_s16,
3660        gen_helper_neon_cls_s32,
3661        NULL,
3662    };
3663    return do_2misc(s, a, fn[a->size]);
3664}
3665
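/*
 * tcg_gen_clzi_i32() takes an extra argument giving the result for a
 * zero input, so it cannot be used directly as a NeonGenOneOpFn;
 * this wrapper pins that argument to 32, as VCLZ.I32 requires.
 */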
3666static void do_VCLZ_32(TCGv_i32 rd, TCGv_i32 rm)
3667{
3668    tcg_gen_clzi_i32(rd, rm, 32);
3669}
3670
3671static bool trans_VCLZ(DisasContext *s, arg_2misc *a)
3672{
3673    static NeonGenOneOpFn * const fn[] = {
3674        gen_helper_neon_clz_u8,
3675        gen_helper_neon_clz_u16,
3676        do_VCLZ_32,
3677        NULL,
3678    };
3679    return do_2misc(s, a, fn[a->size]);
3680}
3681
3682static bool trans_VCNT(DisasContext *s, arg_2misc *a)
3683{
3684    if (a->size != 0) {
3685        return false;
3686    }
3687    return do_2misc(s, a, gen_helper_neon_cnt_u8);
3688}
3689
3690static void gen_VABS_F(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
3691                       uint32_t oprsz, uint32_t maxsz)
3692{
3693    tcg_gen_gvec_andi(vece, rd_ofs, rm_ofs,
3694                      vece == MO_16 ? 0x7fff : 0x7fffffff,
3695                      oprsz, maxsz);
3696}
3697
3698static bool trans_VABS_F(DisasContext *s, arg_2misc *a)
3699{
3700    if (a->size == MO_16) {
3701        if (!dc_isar_feature(aa32_fp16_arith, s)) {
3702            return false;
3703        }
3704    } else if (a->size != MO_32) {
3705        return false;
3706    }
3707    return do_2misc_vec(s, a, gen_VABS_F);
3708}
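
/*
 * Floating-point absolute value and negation only touch the sign
 * bit, so gen_VABS_F above and gen_VNEG_F below are plain integer
 * AND/XOR gvec ops: no float_status pointer is needed and no FP
 * exception flags can be raised.
 */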
3709
3710static void gen_VNEG_F(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
3711                       uint32_t oprsz, uint32_t maxsz)
3712{
3713    tcg_gen_gvec_xori(vece, rd_ofs, rm_ofs,
3714                      vece == MO_16 ? 0x8000 : 0x80000000,
3715                      oprsz, maxsz);
3716}
3717
3718static bool trans_VNEG_F(DisasContext *s, arg_2misc *a)
3719{
3720    if (a->size == MO_16) {
3721        if (!dc_isar_feature(aa32_fp16_arith, s)) {
3722            return false;
3723        }
3724    } else if (a->size != MO_32) {
3725        return false;
3726    }
3727    return do_2misc_vec(s, a, gen_VNEG_F);
3728}
3729
3730static bool trans_VRECPE(DisasContext *s, arg_2misc *a)
3731{
3732    if (a->size != 2) {
3733        return false;
3734    }
3735    return do_2misc(s, a, gen_helper_recpe_u32);
3736}
3737
3738static bool trans_VRSQRTE(DisasContext *s, arg_2misc *a)
3739{
3740    if (a->size != 2) {
3741        return false;
3742    }
3743    return do_2misc(s, a, gen_helper_rsqrte_u32);
3744}
3745
3746#define WRAP_1OP_ENV_FN(WRAPNAME, FUNC) \
3747    static void WRAPNAME(TCGv_i32 d, TCGv_i32 m)        \
3748    {                                                   \
3749        FUNC(d, cpu_env, m);                            \
3750    }
3751
3752WRAP_1OP_ENV_FN(gen_VQABS_s8, gen_helper_neon_qabs_s8)
3753WRAP_1OP_ENV_FN(gen_VQABS_s16, gen_helper_neon_qabs_s16)
3754WRAP_1OP_ENV_FN(gen_VQABS_s32, gen_helper_neon_qabs_s32)
3755WRAP_1OP_ENV_FN(gen_VQNEG_s8, gen_helper_neon_qneg_s8)
3756WRAP_1OP_ENV_FN(gen_VQNEG_s16, gen_helper_neon_qneg_s16)
3757WRAP_1OP_ENV_FN(gen_VQNEG_s32, gen_helper_neon_qneg_s32)
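
/*
 * The saturating qabs/qneg helpers take cpu_env so that they can set
 * the cumulative saturation (QC) flag; WRAP_1OP_ENV_FN adapts them
 * to the two-operand NeonGenOneOpFn shape that do_2misc() expects.
 */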
3758
3759static bool trans_VQABS(DisasContext *s, arg_2misc *a)
3760{
3761    static NeonGenOneOpFn * const fn[] = {
3762        gen_VQABS_s8,
3763        gen_VQABS_s16,
3764        gen_VQABS_s32,
3765        NULL,
3766    };
3767    return do_2misc(s, a, fn[a->size]);
3768}
3769
3770static bool trans_VQNEG(DisasContext *s, arg_2misc *a)
3771{
3772    static NeonGenOneOpFn * const fn[] = {
3773        gen_VQNEG_s8,
3774        gen_VQNEG_s16,
3775        gen_VQNEG_s32,
3776        NULL,
3777    };
3778    return do_2misc(s, a, fn[a->size]);
3779}
3780
3781#define DO_2MISC_FP_VEC(INSN, HFUNC, SFUNC)                             \
3782    static void gen_##INSN(unsigned vece, uint32_t rd_ofs,              \
3783                           uint32_t rm_ofs,                             \
3784                           uint32_t oprsz, uint32_t maxsz)              \
3785    {                                                                   \
3786        static gen_helper_gvec_2_ptr * const fns[4] = {                 \
3787            NULL, HFUNC, SFUNC, NULL,                                   \
3788        };                                                              \
3789        TCGv_ptr fpst;                                                  \
3790        fpst = fpstatus_ptr(vece == MO_16 ? FPST_STD_F16 : FPST_STD);   \
3791        tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, oprsz, maxsz, 0,       \
3792                           fns[vece]);                                  \
3793        tcg_temp_free_ptr(fpst);                                        \
3794    }                                                                   \
3795    static bool trans_##INSN(DisasContext *s, arg_2misc *a)             \
3796    {                                                                   \
3797        if (a->size == MO_16) {                                         \
3798            if (!dc_isar_feature(aa32_fp16_arith, s)) {                 \
3799                return false;                                           \
3800            }                                                           \
3801        } else if (a->size != MO_32) {                                  \
3802            return false;                                               \
3803        }                                                               \
3804        return do_2misc_vec(s, a, gen_##INSN);                          \
3805    }
3806
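/*
 * The fns[] table in DO_2MISC_FP_VEC is indexed by vece, with only
 * the MO_16 and MO_32 entries populated; the MO_16 entry is only
 * reachable when the aa32_fp16_arith check in the trans function
 * passes.  FPST_STD_F16 vs FPST_STD selects the standard-FPSCR
 * float_status matching the element size.
 */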
3807DO_2MISC_FP_VEC(VRECPE_F, gen_helper_gvec_frecpe_h, gen_helper_gvec_frecpe_s)
3808DO_2MISC_FP_VEC(VRSQRTE_F, gen_helper_gvec_frsqrte_h, gen_helper_gvec_frsqrte_s)
3809DO_2MISC_FP_VEC(VCGT0_F, gen_helper_gvec_fcgt0_h, gen_helper_gvec_fcgt0_s)
3810DO_2MISC_FP_VEC(VCGE0_F, gen_helper_gvec_fcge0_h, gen_helper_gvec_fcge0_s)
3811DO_2MISC_FP_VEC(VCEQ0_F, gen_helper_gvec_fceq0_h, gen_helper_gvec_fceq0_s)
3812DO_2MISC_FP_VEC(VCLT0_F, gen_helper_gvec_fclt0_h, gen_helper_gvec_fclt0_s)
3813DO_2MISC_FP_VEC(VCLE0_F, gen_helper_gvec_fcle0_h, gen_helper_gvec_fcle0_s)
3814DO_2MISC_FP_VEC(VCVT_FS, gen_helper_gvec_sstoh, gen_helper_gvec_sitos)
3815DO_2MISC_FP_VEC(VCVT_FU, gen_helper_gvec_ustoh, gen_helper_gvec_uitos)
3816DO_2MISC_FP_VEC(VCVT_SF, gen_helper_gvec_tosszh, gen_helper_gvec_tosizs)
3817DO_2MISC_FP_VEC(VCVT_UF, gen_helper_gvec_touszh, gen_helper_gvec_touizs)
3818
3819DO_2MISC_FP_VEC(VRINTX_impl, gen_helper_gvec_vrintx_h, gen_helper_gvec_vrintx_s)
3820
3821static bool trans_VRINTX(DisasContext *s, arg_2misc *a)
3822{
3823    if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
3824        return false;
3825    }
3826    return trans_VRINTX_impl(s, a);
3827}
3828
3829#define DO_VEC_RMODE(INSN, RMODE, OP)                                   \
3830    static void gen_##INSN(unsigned vece, uint32_t rd_ofs,              \
3831                           uint32_t rm_ofs,                             \
3832                           uint32_t oprsz, uint32_t maxsz)              \
3833    {                                                                   \
3834        static gen_helper_gvec_2_ptr * const fns[4] = {                 \
3835            NULL,                                                       \
3836            gen_helper_gvec_##OP##h,                                    \
3837            gen_helper_gvec_##OP##s,                                    \
3838            NULL,                                                       \
3839        };                                                              \
3840        TCGv_ptr fpst;                                                  \
3841        fpst = fpstatus_ptr(vece == MO_16 ? FPST_STD_F16 : FPST_STD);   \
3842        tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, oprsz, maxsz,          \
3843                           arm_rmode_to_sf(RMODE), fns[vece]);          \
3844        tcg_temp_free_ptr(fpst);                                        \
3845    }                                                                   \
3846    static bool trans_##INSN(DisasContext *s, arg_2misc *a)             \
3847    {                                                                   \
3848        if (!arm_dc_feature(s, ARM_FEATURE_V8)) {                       \
3849            return false;                                               \
3850        }                                                               \
3851        if (a->size == MO_16) {                                         \
3852            if (!dc_isar_feature(aa32_fp16_arith, s)) {                 \
3853                return false;                                           \
3854            }                                                           \
3855        } else if (a->size != MO_32) {                                  \
3856            return false;                                               \
3857        }                                                               \
3858        return do_2misc_vec(s, a, gen_##INSN);                          \
3859    }
3860
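/*
 * For these ops the required rounding mode is converted with
 * arm_rmode_to_sf() and passed to the helper as the gvec data value;
 * the helper applies it in place of the rounding mode currently in
 * FPSCR.
 */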
3861DO_VEC_RMODE(VCVTAU, FPROUNDING_TIEAWAY, vcvt_rm_u)
3862DO_VEC_RMODE(VCVTAS, FPROUNDING_TIEAWAY, vcvt_rm_s)
3863DO_VEC_RMODE(VCVTNU, FPROUNDING_TIEEVEN, vcvt_rm_u)
3864DO_VEC_RMODE(VCVTNS, FPROUNDING_TIEEVEN, vcvt_rm_s)
3865DO_VEC_RMODE(VCVTPU, FPROUNDING_POSINF, vcvt_rm_u)
3866DO_VEC_RMODE(VCVTPS, FPROUNDING_POSINF, vcvt_rm_s)
3867DO_VEC_RMODE(VCVTMU, FPROUNDING_NEGINF, vcvt_rm_u)
3868DO_VEC_RMODE(VCVTMS, FPROUNDING_NEGINF, vcvt_rm_s)
3869
3870DO_VEC_RMODE(VRINTN, FPROUNDING_TIEEVEN, vrint_rm_)
3871DO_VEC_RMODE(VRINTA, FPROUNDING_TIEAWAY, vrint_rm_)
3872DO_VEC_RMODE(VRINTZ, FPROUNDING_ZERO, vrint_rm_)
3873DO_VEC_RMODE(VRINTM, FPROUNDING_NEGINF, vrint_rm_)
3874DO_VEC_RMODE(VRINTP, FPROUNDING_POSINF, vrint_rm_)
3875
3876static bool trans_VSWP(DisasContext *s, arg_2misc *a)
3877{
3878    TCGv_i64 rm, rd;
3879    int pass;
3880
3881    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3882        return false;
3883    }
3884
3885    /* UNDEF accesses to D16-D31 if they don't exist. */
3886    if (!dc_isar_feature(aa32_simd_r32, s) &&
3887        ((a->vd | a->vm) & 0x10)) {
3888        return false;
3889    }
3890
3891    if (a->size != 0) {
3892        return false;
3893    }
3894
3895    if ((a->vd | a->vm) & a->q) {
3896        return false;
3897    }
3898
3899    if (!vfp_access_check(s)) {
3900        return true;
3901    }
3902
3903    rm = tcg_temp_new_i64();
3904    rd = tcg_temp_new_i64();
3905    for (pass = 0; pass < (a->q ? 2 : 1); pass++) {
3906        read_neon_element64(rm, a->vm, pass, MO_64);
3907        read_neon_element64(rd, a->vd, pass, MO_64);
3908        write_neon_element64(rm, a->vd, pass, MO_64);
3909        write_neon_element64(rd, a->vm, pass, MO_64);
3910    }
3911    tcg_temp_free_i64(rm);
3912    tcg_temp_free_i64(rd);
3913
3914    return true;
3915}
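
/*
 * Helpers for VTRN on 8-bit and 16-bit elements.  Together with the
 * crossed write-back in trans_VTRN() below (t0 is read from Vm but
 * written to Vd, and t1 vice versa), the net effect is the
 * architectural swap(Vd[2n+1], Vm[2n]).  For 32-bit elements no
 * bit-twiddling is needed and trans_VTRN() swaps whole elements
 * directly.
 */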
3916static void gen_neon_trn_u8(TCGv_i32 t0, TCGv_i32 t1)
3917{
3918    TCGv_i32 rd, tmp;
3919
3920    rd = tcg_temp_new_i32();
3921    tmp = tcg_temp_new_i32();
3922
3923    tcg_gen_shli_i32(rd, t0, 8);
3924    tcg_gen_andi_i32(rd, rd, 0xff00ff00);
3925    tcg_gen_andi_i32(tmp, t1, 0x00ff00ff);
3926    tcg_gen_or_i32(rd, rd, tmp);
3927
3928    tcg_gen_shri_i32(t1, t1, 8);
3929    tcg_gen_andi_i32(t1, t1, 0x00ff00ff);
3930    tcg_gen_andi_i32(tmp, t0, 0xff00ff00);
3931    tcg_gen_or_i32(t1, t1, tmp);
3932    tcg_gen_mov_i32(t0, rd);
3933
3934    tcg_temp_free_i32(tmp);
3935    tcg_temp_free_i32(rd);
3936}
3937
3938static void gen_neon_trn_u16(TCGv_i32 t0, TCGv_i32 t1)
3939{
3940    TCGv_i32 rd, tmp;
3941
3942    rd = tcg_temp_new_i32();
3943    tmp = tcg_temp_new_i32();
3944
3945    tcg_gen_shli_i32(rd, t0, 16);
3946    tcg_gen_andi_i32(tmp, t1, 0xffff);
3947    tcg_gen_or_i32(rd, rd, tmp);
3948    tcg_gen_shri_i32(t1, t1, 16);
3949    tcg_gen_andi_i32(tmp, t0, 0xffff0000);
3950    tcg_gen_or_i32(t1, t1, tmp);
3951    tcg_gen_mov_i32(t0, rd);
3952
3953    tcg_temp_free_i32(tmp);
3954    tcg_temp_free_i32(rd);
3955}
3956
3957static bool trans_VTRN(DisasContext *s, arg_2misc *a)
3958{
3959    TCGv_i32 tmp, tmp2;
3960    int pass;
3961
3962    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3963        return false;
3964    }
3965
3966    /* UNDEF accesses to D16-D31 if they don't exist. */
3967    if (!dc_isar_feature(aa32_simd_r32, s) &&
3968        ((a->vd | a->vm) & 0x10)) {
3969        return false;
3970    }
3971
3972    if ((a->vd | a->vm) & a->q) {
3973        return false;
3974    }
3975
3976    if (a->size == 3) {
3977        return false;
3978    }
3979
3980    if (!vfp_access_check(s)) {
3981        return true;
3982    }
3983
3984    tmp = tcg_temp_new_i32();
3985    tmp2 = tcg_temp_new_i32();
3986    if (a->size == MO_32) {
3987        for (pass = 0; pass < (a->q ? 4 : 2); pass += 2) {
3988            read_neon_element32(tmp, a->vm, pass, MO_32);
3989            read_neon_element32(tmp2, a->vd, pass + 1, MO_32);
3990            write_neon_element32(tmp2, a->vm, pass, MO_32);
3991            write_neon_element32(tmp, a->vd, pass + 1, MO_32);
3992        }
3993    } else {
3994        for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
3995            read_neon_element32(tmp, a->vm, pass, MO_32);
3996            read_neon_element32(tmp2, a->vd, pass, MO_32);
3997            if (a->size == MO_8) {
3998                gen_neon_trn_u8(tmp, tmp2);
3999            } else {
4000                gen_neon_trn_u16(tmp, tmp2);
4001            }
4002            write_neon_element32(tmp2, a->vm, pass, MO_32);
4003            write_neon_element32(tmp, a->vd, pass, MO_32);
4004        }
4005    }
4006    tcg_temp_free_i32(tmp);
4007    tcg_temp_free_i32(tmp2);
4008    return true;
4009}
4010
4011static bool trans_VSMMLA(DisasContext *s, arg_VSMMLA *a)
4012{
4013    if (!dc_isar_feature(aa32_i8mm, s)) {
4014        return false;
4015    }
4016    return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
4017                        gen_helper_gvec_smmla_b);
4018}
4019
4020static bool trans_VUMMLA(DisasContext *s, arg_VUMMLA *a)
4021{
4022    if (!dc_isar_feature(aa32_i8mm, s)) {
4023        return false;
4024    }
4025    return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
4026                        gen_helper_gvec_ummla_b);
4027}
4028
4029static bool trans_VUSMMLA(DisasContext *s, arg_VUSMMLA *a)
4030{
4031    if (!dc_isar_feature(aa32_i8mm, s)) {
4032        return false;
4033    }
4034    return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
4035                        gen_helper_gvec_usmmla_b);
4036}
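
/*
 * The 8-bit integer matrix-multiply ops are quadword-only and share
 * do_neon_ddda() (defined earlier in this file), which performs the
 * common operand checks and expands to a four-operand gvec helper
 * call; for VUSMMLA the first source (Vn) is unsigned and the
 * second (Vm) is signed.
 */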
4037
4038static bool trans_VMMLA_b16(DisasContext *s, arg_VMMLA_b16 *a)
4039{
4040    if (!dc_isar_feature(aa32_bf16, s)) {
4041        return false;
4042    }
4043    return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
4044                        gen_helper_gvec_bfmmla);
4045}
4046
4047static bool trans_VFMA_b16(DisasContext *s, arg_VFMA_b16 *a)
4048{
4049    if (!dc_isar_feature(aa32_bf16, s)) {
4050        return false;
4051    }
4052    return do_neon_ddda_fpst(s, 7, a->vd, a->vn, a->vm, a->q, FPST_STD,
4053                             gen_helper_gvec_bfmlal);
4054}
4055
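/*
 * For the by-scalar form the scalar index and the Q bit are packed
 * into the gvec data word, (a->index << 1) | a->q, for the
 * gen_helper_gvec_bfmlal_idx helper to unpack.
 */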
4056static bool trans_VFMA_b16_scal(DisasContext *s, arg_VFMA_b16_scal *a)
4057{
4058    if (!dc_isar_feature(aa32_bf16, s)) {
4059        return false;
4060    }
4061    return do_neon_ddda_fpst(s, 6, a->vd, a->vn, a->vm,
4062                             (a->index << 1) | a->q, FPST_STD,
4063                             gen_helper_gvec_bfmlal_idx);
4064}
4065