qemu/target/arm/translate-neon.c
/*
 *  ARM translation: AArch32 Neon instructions
 *
 *  Copyright (c) 2003 Fabrice Bellard
 *  Copyright (c) 2005-2007 CodeSourcery
 *  Copyright (c) 2007 OpenedHand, Ltd.
 *  Copyright (c) 2020 Linaro, Ltd.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "tcg/tcg-op.h"
#include "tcg/tcg-op-gvec.h"
#include "exec/exec-all.h"
#include "exec/gen-icount.h"
#include "translate.h"
#include "translate-a32.h"

static inline int neon_3same_fp_size(DisasContext *s, int x)
{
    /* Convert 0==fp32, 1==fp16 into a MO_* value */
    return MO_32 - x;
}

/* Include the generated Neon decoder */
#include "decode-neon-dp.c.inc"
#include "decode-neon-ls.c.inc"
#include "decode-neon-shared.c.inc"

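/*
 * Return a TCGv_ptr pointing at the given VFP register within
 * CPUARMState, for passing to out-of-line helpers.
 */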
static TCGv_ptr vfp_reg_ptr(bool dp, int reg)
{
    TCGv_ptr ret = tcg_temp_new_ptr();
    tcg_gen_addi_ptr(ret, cpu_env, vfp_reg_offset(dp, reg));
    return ret;
}

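/*
 * The four helpers below read or write a single element of a Neon
 * register, located via its offset within CPUARMState; the MemOp
 * (or size) argument selects the element width.
 */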
static void neon_load_element(TCGv_i32 var, int reg, int ele, MemOp mop)
{
    long offset = neon_element_offset(reg, ele, mop & MO_SIZE);

    switch (mop) {
    case MO_UB:
        tcg_gen_ld8u_i32(var, cpu_env, offset);
        break;
    case MO_UW:
        tcg_gen_ld16u_i32(var, cpu_env, offset);
        break;
    case MO_UL:
        tcg_gen_ld_i32(var, cpu_env, offset);
        break;
    default:
        g_assert_not_reached();
    }
}

static void neon_load_element64(TCGv_i64 var, int reg, int ele, MemOp mop)
{
    long offset = neon_element_offset(reg, ele, mop & MO_SIZE);

    switch (mop) {
    case MO_UB:
        tcg_gen_ld8u_i64(var, cpu_env, offset);
        break;
    case MO_UW:
        tcg_gen_ld16u_i64(var, cpu_env, offset);
        break;
    case MO_UL:
        tcg_gen_ld32u_i64(var, cpu_env, offset);
        break;
    case MO_Q:
        tcg_gen_ld_i64(var, cpu_env, offset);
        break;
    default:
        g_assert_not_reached();
    }
}

static void neon_store_element(int reg, int ele, MemOp size, TCGv_i32 var)
{
    long offset = neon_element_offset(reg, ele, size);

    switch (size) {
    case MO_8:
        tcg_gen_st8_i32(var, cpu_env, offset);
        break;
    case MO_16:
        tcg_gen_st16_i32(var, cpu_env, offset);
        break;
    case MO_32:
        tcg_gen_st_i32(var, cpu_env, offset);
        break;
    default:
        g_assert_not_reached();
    }
}

static void neon_store_element64(int reg, int ele, MemOp size, TCGv_i64 var)
{
    long offset = neon_element_offset(reg, ele, size);

    switch (size) {
    case MO_8:
        tcg_gen_st8_i64(var, cpu_env, offset);
        break;
    case MO_16:
        tcg_gen_st16_i64(var, cpu_env, offset);
        break;
    case MO_32:
        tcg_gen_st32_i64(var, cpu_env, offset);
        break;
    case MO_64:
        tcg_gen_st_i64(var, cpu_env, offset);
        break;
    default:
        g_assert_not_reached();
    }
}

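/*
 * Expand an out-of-line helper over three source registers plus an
 * accumulator: vd is passed both as the destination and as the fourth
 * input, which suits multiply-accumulate style operations.
 */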
static bool do_neon_ddda(DisasContext *s, int q, int vd, int vn, int vm,
                         int data, gen_helper_gvec_4 *fn_gvec)
{
    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (((vd | vn | vm) & 0x10) && !dc_isar_feature(aa32_simd_r32, s)) {
        return false;
    }

    /*
     * UNDEF accesses to odd registers for each bit of Q.
     * Q will be 0b111 for all-Q-reg instructions; other values
     * occur when we have a mix of Q- and D-reg inputs.
     */
    if (((vd & 1) * 4 | (vn & 1) * 2 | (vm & 1)) & q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    int opr_sz = q ? 16 : 8;
    tcg_gen_gvec_4_ool(vfp_reg_offset(1, vd),
                       vfp_reg_offset(1, vn),
                       vfp_reg_offset(1, vm),
                       vfp_reg_offset(1, vd),
                       opr_sz, opr_sz, data, fn_gvec);
    return true;
}

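/*
 * As do_neon_ddda(), but for helpers that also take a float_status
 * pointer of the requested flavour (standard FPSCR or its FP16 variant).
 */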
static bool do_neon_ddda_fpst(DisasContext *s, int q, int vd, int vn, int vm,
                              int data, ARMFPStatusFlavour fp_flavour,
                              gen_helper_gvec_4_ptr *fn_gvec_ptr)
{
    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (((vd | vn | vm) & 0x10) && !dc_isar_feature(aa32_simd_r32, s)) {
        return false;
    }

    /*
     * UNDEF accesses to odd registers for each bit of Q.
     * Q will be 0b111 for all-Q-reg instructions; other values
     * occur when we have a mix of Q- and D-reg inputs.
     */
    if (((vd & 1) * 4 | (vn & 1) * 2 | (vm & 1)) & q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    int opr_sz = q ? 16 : 8;
    TCGv_ptr fpst = fpstatus_ptr(fp_flavour);

    tcg_gen_gvec_4_ptr(vfp_reg_offset(1, vd),
                       vfp_reg_offset(1, vn),
                       vfp_reg_offset(1, vm),
                       vfp_reg_offset(1, vd),
                       fpst, opr_sz, opr_sz, data, fn_gvec_ptr);
    tcg_temp_free_ptr(fpst);
    return true;
}

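/*
 * VCMLA: a->rot encodes the rotation as a multiple of 90 degrees and
 * is passed straight through as the helper's data argument.
 */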
static bool trans_VCMLA(DisasContext *s, arg_VCMLA *a)
{
    if (!dc_isar_feature(aa32_vcma, s)) {
        return false;
    }
    if (a->size == MO_16) {
        if (!dc_isar_feature(aa32_fp16_arith, s)) {
            return false;
        }
        return do_neon_ddda_fpst(s, a->q * 7, a->vd, a->vn, a->vm, a->rot,
                                 FPST_STD_F16, gen_helper_gvec_fcmlah);
    }
    return do_neon_ddda_fpst(s, a->q * 7, a->vd, a->vn, a->vm, a->rot,
                             FPST_STD, gen_helper_gvec_fcmlas);
}

static bool trans_VCADD(DisasContext *s, arg_VCADD *a)
{
    int opr_sz;
    TCGv_ptr fpst;
    gen_helper_gvec_3_ptr *fn_gvec_ptr;

    if (!dc_isar_feature(aa32_vcma, s)
        || (a->size == MO_16 && !dc_isar_feature(aa32_fp16_arith, s))) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vn | a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    opr_sz = (1 + a->q) * 8;
    fpst = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD);
    fn_gvec_ptr = (a->size == MO_16) ?
        gen_helper_gvec_fcaddh : gen_helper_gvec_fcadds;
    tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(1, a->vn),
                       vfp_reg_offset(1, a->vm),
                       fpst, opr_sz, opr_sz, a->rot,
                       fn_gvec_ptr);
    tcg_temp_free_ptr(fpst);
    return true;
}

static bool trans_VSDOT(DisasContext *s, arg_VSDOT *a)
{
    if (!dc_isar_feature(aa32_dp, s)) {
        return false;
    }
    return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
                        gen_helper_gvec_sdot_b);
}

static bool trans_VUDOT(DisasContext *s, arg_VUDOT *a)
{
    if (!dc_isar_feature(aa32_dp, s)) {
        return false;
    }
    return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
                        gen_helper_gvec_udot_b);
}

static bool trans_VUSDOT(DisasContext *s, arg_VUSDOT *a)
{
    if (!dc_isar_feature(aa32_i8mm, s)) {
        return false;
    }
    return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
                        gen_helper_gvec_usdot_b);
}

static bool trans_VDOT_b16(DisasContext *s, arg_VDOT_b16 *a)
{
    if (!dc_isar_feature(aa32_bf16, s)) {
        return false;
    }
    return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
                        gen_helper_gvec_bfdot);
}

static bool trans_VFML(DisasContext *s, arg_VFML *a)
{
    int opr_sz;

    if (!dc_isar_feature(aa32_fhm, s)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        (a->vd & 0x10)) {
        return false;
    }

    if (a->vd & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    opr_sz = (1 + a->q) * 8;
    tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(a->q, a->vn),
                       vfp_reg_offset(a->q, a->vm),
                       cpu_env, opr_sz, opr_sz, a->s, /* is_2 == 0 */
                       gen_helper_gvec_fmlal_a32);
    return true;
}

static bool trans_VCMLA_scalar(DisasContext *s, arg_VCMLA_scalar *a)
{
    int data = (a->index << 2) | a->rot;

    if (!dc_isar_feature(aa32_vcma, s)) {
        return false;
    }
    if (a->size == MO_16) {
        if (!dc_isar_feature(aa32_fp16_arith, s)) {
            return false;
        }
        return do_neon_ddda_fpst(s, a->q * 6, a->vd, a->vn, a->vm, data,
                                 FPST_STD_F16, gen_helper_gvec_fcmlah_idx);
    }
    return do_neon_ddda_fpst(s, a->q * 6, a->vd, a->vn, a->vm, data,
                             FPST_STD, gen_helper_gvec_fcmlas_idx);
}

static bool trans_VSDOT_scalar(DisasContext *s, arg_VSDOT_scalar *a)
{
    if (!dc_isar_feature(aa32_dp, s)) {
        return false;
    }
    return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
                        gen_helper_gvec_sdot_idx_b);
}

static bool trans_VUDOT_scalar(DisasContext *s, arg_VUDOT_scalar *a)
{
    if (!dc_isar_feature(aa32_dp, s)) {
        return false;
    }
    return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
                        gen_helper_gvec_udot_idx_b);
}

static bool trans_VUSDOT_scalar(DisasContext *s, arg_VUSDOT_scalar *a)
{
    if (!dc_isar_feature(aa32_i8mm, s)) {
        return false;
    }
    return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
                        gen_helper_gvec_usdot_idx_b);
}

static bool trans_VSUDOT_scalar(DisasContext *s, arg_VSUDOT_scalar *a)
{
    if (!dc_isar_feature(aa32_i8mm, s)) {
        return false;
    }
    return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
                        gen_helper_gvec_sudot_idx_b);
}

static bool trans_VDOT_b16_scal(DisasContext *s, arg_VDOT_b16_scal *a)
{
    if (!dc_isar_feature(aa32_bf16, s)) {
        return false;
    }
    return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
                        gen_helper_gvec_bfdot_idx);
}

static bool trans_VFML_scalar(DisasContext *s, arg_VFML_scalar *a)
{
    int opr_sz;

    if (!dc_isar_feature(aa32_fhm, s)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd & 0x10) || (a->q && (a->vn & 0x10)))) {
        return false;
    }

    if (a->vd & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    opr_sz = (1 + a->q) * 8;
    tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(a->q, a->vn),
                       vfp_reg_offset(a->q, a->rm),
                       cpu_env, opr_sz, opr_sz,
                       (a->index << 2) | a->s, /* is_2 == 0 */
                       gen_helper_gvec_fmlal_idx_a32);
    return true;
}

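/*
 * Table indexed by the itype field of a Neon "load/store multiple
 * structures" insn, giving the register count, element interleave
 * factor and register spacing for each encoding.
 */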
static struct {
    int nregs;
    int interleave;
    int spacing;
} const neon_ls_element_type[11] = {
    {1, 4, 1},
    {1, 4, 2},
    {4, 1, 1},
    {2, 2, 2},
    {1, 3, 1},
    {1, 3, 2},
    {3, 1, 1},
    {1, 1, 1},
    {1, 2, 1},
    {1, 2, 2},
    {2, 1, 1}
};

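/*
 * Handle post-indexed writeback of the base register: rm == 15 (PC)
 * means no writeback, rm == 13 (SP) means advance the base by the
 * transfer size, and any other rm adds that index register instead.
 */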
static void gen_neon_ldst_base_update(DisasContext *s, int rm, int rn,
                                      int stride)
{
    if (rm != 15) {
        TCGv_i32 base;

        base = load_reg(s, rn);
        if (rm == 13) {
            tcg_gen_addi_i32(base, base, stride);
        } else {
            TCGv_i32 index;
            index = load_reg(s, rm);
            tcg_gen_add_i32(base, base, index);
            tcg_temp_free_i32(index);
        }
        store_reg(s, rn, base);
    }
}

static bool trans_VLDST_multiple(DisasContext *s, arg_VLDST_multiple *a)
{
    /* Neon load/store multiple structures */
    int nregs, interleave, spacing, reg, n;
    MemOp mop, align, endian;
    int mmu_idx = get_mem_index(s);
    int size = a->size;
    TCGv_i64 tmp64;
    TCGv_i32 addr, tmp;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist */
    if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
        return false;
    }
    if (a->itype > 10) {
        return false;
    }
    /* Catch UNDEF cases for bad values of align field */
    switch (a->itype & 0xc) {
    case 4:
        if (a->align >= 2) {
            return false;
        }
        break;
    case 8:
        if (a->align == 3) {
            return false;
        }
        break;
    default:
        break;
    }
    nregs = neon_ls_element_type[a->itype].nregs;
    interleave = neon_ls_element_type[a->itype].interleave;
    spacing = neon_ls_element_type[a->itype].spacing;
    if (size == 3 && (interleave | spacing) != 1) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /* For our purposes, bytes are always little-endian.  */
    endian = s->be_data;
    if (size == 0) {
        endian = MO_LE;
    }

    /* Enforce alignment requested by the instruction */
    if (a->align) {
        align = pow2_align(a->align + 2); /* (4 << a->align) bytes */
    } else {
        align = s->align_mem ? MO_ALIGN : 0;
    }

    /*
     * Consecutive little-endian elements from a single register
     * can be promoted to a larger little-endian operation.
     */
    if (interleave == 1 && endian == MO_LE) {
        /* Retain any natural alignment. */
        if (align == MO_ALIGN) {
            align = pow2_align(size);
        }
        size = 3;
    }
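    /*
     * For example, a VLD1.8 of eight byte elements is performed here
     * as a single 64-bit load, since bytes have no endianness.
     */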

    tmp64 = tcg_temp_new_i64();
    addr = tcg_temp_new_i32();
    tmp = tcg_const_i32(1 << size);
    load_reg_var(s, addr, a->rn);

    mop = endian | size | align;
    for (reg = 0; reg < nregs; reg++) {
        for (n = 0; n < 8 >> size; n++) {
            int xs;
            for (xs = 0; xs < interleave; xs++) {
                int tt = a->vd + reg + spacing * xs;

                if (a->l) {
                    gen_aa32_ld_internal_i64(s, tmp64, addr, mmu_idx, mop);
                    neon_store_element64(tt, n, size, tmp64);
                } else {
                    neon_load_element64(tmp64, tt, n, size);
                    gen_aa32_st_internal_i64(s, tmp64, addr, mmu_idx, mop);
                }
                tcg_gen_add_i32(addr, addr, tmp);

                /* Subsequent memory operations inherit alignment */
                mop &= ~MO_AMASK;
            }
        }
    }
    tcg_temp_free_i32(addr);
    tcg_temp_free_i32(tmp);
    tcg_temp_free_i64(tmp64);

    gen_neon_ldst_base_update(s, a->rm, a->rn, nregs * interleave * 8);
    return true;
}

static bool trans_VLD_all_lanes(DisasContext *s, arg_VLD_all_lanes *a)
{
    /* Neon load single structure to all lanes */
    int reg, stride, vec_size;
    int vd = a->vd;
    int size = a->size;
    int nregs = a->n + 1;
    TCGv_i32 addr, tmp;
    MemOp mop, align;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist */
    if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
        return false;
    }

    align = 0;
    if (size == 3) {
        if (nregs != 4 || a->a == 0) {
            return false;
        }
        /* For VLD4, size == 3 && a == 1 means 32 bits at 16-byte alignment */
        size = MO_32;
        align = MO_ALIGN_16;
    } else if (a->a) {
        switch (nregs) {
        case 1:
            if (size == 0) {
                return false;
            }
            align = MO_ALIGN;
            break;
        case 2:
            align = pow2_align(size + 1);
            break;
        case 3:
            return false;
        case 4:
            align = pow2_align(size + 2);
            break;
        default:
            g_assert_not_reached();
        }
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /*
     * VLD1 to all lanes: T bit indicates how many Dregs to write.
     * VLD2/3/4 to all lanes: T bit indicates register stride.
     */
    stride = a->t ? 2 : 1;
    vec_size = nregs == 1 ? stride * 8 : 8;
    mop = size | align;
    tmp = tcg_temp_new_i32();
    addr = tcg_temp_new_i32();
    load_reg_var(s, addr, a->rn);
    for (reg = 0; reg < nregs; reg++) {
        gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s), mop);
        if ((vd & 1) && vec_size == 16) {
            /*
             * We cannot write 16 bytes at once because the
             * destination is unaligned.
             */
            tcg_gen_gvec_dup_i32(size, neon_full_reg_offset(vd),
                                 8, 8, tmp);
            tcg_gen_gvec_mov(0, neon_full_reg_offset(vd + 1),
                             neon_full_reg_offset(vd), 8, 8);
        } else {
            tcg_gen_gvec_dup_i32(size, neon_full_reg_offset(vd),
                                 vec_size, vec_size, tmp);
        }
        tcg_gen_addi_i32(addr, addr, 1 << size);
        vd += stride;

        /* Subsequent memory operations inherit alignment */
        mop &= ~MO_AMASK;
    }
    tcg_temp_free_i32(tmp);
    tcg_temp_free_i32(addr);

    gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << size) * nregs);

    return true;
}

static bool trans_VLDST_single(DisasContext *s, arg_VLDST_single *a)
{
    /* Neon load/store single structure to one lane */
    int reg;
    int nregs = a->n + 1;
    int vd = a->vd;
    TCGv_i32 addr, tmp;
    MemOp mop;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist */
    if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
        return false;
    }

    /* Catch the UNDEF cases. This is unavoidably a bit messy. */
    switch (nregs) {
    case 1:
        if (((a->align & (1 << a->size)) != 0) ||
            (a->size == 2 && (a->align == 1 || a->align == 2))) {
            return false;
        }
        break;
    case 3:
        if ((a->align & 1) != 0) {
            return false;
        }
        /* fall through */
    case 2:
        if (a->size == 2 && (a->align & 2) != 0) {
            return false;
        }
        break;
    case 4:
        if (a->size == 2 && a->align == 3) {
            return false;
        }
        break;
    default:
        abort();
    }
    if ((vd + a->stride * (nregs - 1)) > 31) {
        /*
         * Attempts to write off the end of the register file are
         * UNPREDICTABLE; we choose to UNDEF because otherwise we would
         * access off the end of the array that holds the register data.
         */
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /* Pick up SCTLR settings */
    mop = finalize_memop(s, a->size);

    if (a->align) {
        MemOp align_op;

        switch (nregs) {
        case 1:
            /* For VLD1, use natural alignment. */
            align_op = MO_ALIGN;
            break;
        case 2:
            /* For VLD2, use double alignment. */
            align_op = pow2_align(a->size + 1);
            break;
        case 4:
            if (a->size == MO_32) {
                /*
                 * For VLD4.32, align = 1 is double alignment, align = 2 is
                 * quad alignment; align = 3 is rejected above.
                 */
                align_op = pow2_align(a->size + a->align);
            } else {
                /* For VLD4.8 and VLD4.16, we want quad alignment. */
                align_op = pow2_align(a->size + 2);
            }
            break;
        default:
            /* For VLD3, the alignment field is zero and rejected above. */
            g_assert_not_reached();
        }

        mop = (mop & ~MO_AMASK) | align_op;
    }

    tmp = tcg_temp_new_i32();
    addr = tcg_temp_new_i32();
    load_reg_var(s, addr, a->rn);

    for (reg = 0; reg < nregs; reg++) {
        if (a->l) {
            gen_aa32_ld_internal_i32(s, tmp, addr, get_mem_index(s), mop);
            neon_store_element(vd, a->reg_idx, a->size, tmp);
        } else { /* Store */
            neon_load_element(tmp, vd, a->reg_idx, a->size);
            gen_aa32_st_internal_i32(s, tmp, addr, get_mem_index(s), mop);
        }
        vd += a->stride;
        tcg_gen_addi_i32(addr, addr, 1 << a->size);

        /* Subsequent memory operations inherit alignment */
        mop &= ~MO_AMASK;
    }
    tcg_temp_free_i32(addr);
    tcg_temp_free_i32(tmp);

    gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << a->size) * nregs);

    return true;
}

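/*
 * Common expander for the Neon "3 registers same length" group:
 * validate the registers, then hand off to a gvec expansion over the
 * whole 64-bit or 128-bit vector.
 */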
static bool do_3same(DisasContext *s, arg_3same *a, GVecGen3Fn fn)
{
    int vec_size = a->q ? 16 : 8;
    int rd_ofs = neon_full_reg_offset(a->vd);
    int rn_ofs = neon_full_reg_offset(a->vn);
    int rm_ofs = neon_full_reg_offset(a->vm);

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vn | a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    fn(a->size, rd_ofs, rn_ofs, rm_ofs, vec_size, vec_size);
    return true;
}

#define DO_3SAME(INSN, FUNC)                                            \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        return do_3same(s, a, FUNC);                                    \
    }

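/*
 * For example, DO_3SAME(VADD, tcg_gen_gvec_add) expands to the
 * trans_VADD_3s() function that the generated decoder calls.
 */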
DO_3SAME(VADD, tcg_gen_gvec_add)
DO_3SAME(VSUB, tcg_gen_gvec_sub)
DO_3SAME(VAND, tcg_gen_gvec_and)
DO_3SAME(VBIC, tcg_gen_gvec_andc)
DO_3SAME(VORR, tcg_gen_gvec_or)
DO_3SAME(VORN, tcg_gen_gvec_orc)
DO_3SAME(VEOR, tcg_gen_gvec_xor)
DO_3SAME(VSHL_S, gen_gvec_sshl)
DO_3SAME(VSHL_U, gen_gvec_ushl)
DO_3SAME(VQADD_S, gen_gvec_sqadd_qc)
DO_3SAME(VQADD_U, gen_gvec_uqadd_qc)
DO_3SAME(VQSUB_S, gen_gvec_sqsub_qc)
DO_3SAME(VQSUB_U, gen_gvec_uqsub_qc)

/* These insns are all gvec_bitsel but with the inputs in various orders. */
#define DO_3SAME_BITSEL(INSN, O1, O2, O3)                               \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        tcg_gen_gvec_bitsel(vece, rd_ofs, O1, O2, O3, oprsz, maxsz);    \
    }                                                                   \
    DO_3SAME(INSN, gen_##INSN##_3s)

DO_3SAME_BITSEL(VBSL, rd_ofs, rn_ofs, rm_ofs)
DO_3SAME_BITSEL(VBIT, rm_ofs, rn_ofs, rd_ofs)
DO_3SAME_BITSEL(VBIF, rm_ofs, rd_ofs, rn_ofs)

#define DO_3SAME_NO_SZ_3(INSN, FUNC)                                    \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (a->size == 3) {                                             \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, FUNC);                                    \
    }

DO_3SAME_NO_SZ_3(VMAX_S, tcg_gen_gvec_smax)
DO_3SAME_NO_SZ_3(VMAX_U, tcg_gen_gvec_umax)
DO_3SAME_NO_SZ_3(VMIN_S, tcg_gen_gvec_smin)
DO_3SAME_NO_SZ_3(VMIN_U, tcg_gen_gvec_umin)
DO_3SAME_NO_SZ_3(VMUL, tcg_gen_gvec_mul)
DO_3SAME_NO_SZ_3(VMLA, gen_gvec_mla)
DO_3SAME_NO_SZ_3(VMLS, gen_gvec_mls)
DO_3SAME_NO_SZ_3(VTST, gen_gvec_cmtst)
DO_3SAME_NO_SZ_3(VABD_S, gen_gvec_sabd)
DO_3SAME_NO_SZ_3(VABA_S, gen_gvec_saba)
DO_3SAME_NO_SZ_3(VABD_U, gen_gvec_uabd)
DO_3SAME_NO_SZ_3(VABA_U, gen_gvec_uaba)

#define DO_3SAME_CMP(INSN, COND)                                        \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        tcg_gen_gvec_cmp(COND, vece, rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz); \
    }                                                                   \
    DO_3SAME_NO_SZ_3(INSN, gen_##INSN##_3s)

DO_3SAME_CMP(VCGT_S, TCG_COND_GT)
DO_3SAME_CMP(VCGT_U, TCG_COND_GTU)
DO_3SAME_CMP(VCGE_S, TCG_COND_GE)
DO_3SAME_CMP(VCGE_U, TCG_COND_GEU)
DO_3SAME_CMP(VCEQ, TCG_COND_EQ)

#define WRAP_OOL_FN(WRAPNAME, FUNC)                                        \
    static void WRAPNAME(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,  \
                         uint32_t rm_ofs, uint32_t oprsz, uint32_t maxsz)  \
    {                                                                      \
        tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, 0, FUNC); \
    }

WRAP_OOL_FN(gen_VMUL_p_3s, gen_helper_gvec_pmul_b)

static bool trans_VMUL_p_3s(DisasContext *s, arg_3same *a)
{
    if (a->size != 0) {
        return false;
    }
    return do_3same(s, a, gen_VMUL_p_3s);
}

#define DO_VQRDMLAH(INSN, FUNC)                                         \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (!dc_isar_feature(aa32_rdm, s)) {                            \
            return false;                                               \
        }                                                               \
        if (a->size != 1 && a->size != 2) {                             \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, FUNC);                                    \
    }

DO_VQRDMLAH(VQRDMLAH, gen_gvec_sqrdmlah_qc)
DO_VQRDMLAH(VQRDMLSH, gen_gvec_sqrdmlsh_qc)

#define DO_SHA1(NAME, FUNC)                                             \
    WRAP_OOL_FN(gen_##NAME##_3s, FUNC)                                  \
    static bool trans_##NAME##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (!dc_isar_feature(aa32_sha1, s)) {                           \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##NAME##_3s);                         \
    }

DO_SHA1(SHA1C, gen_helper_crypto_sha1c)
DO_SHA1(SHA1P, gen_helper_crypto_sha1p)
DO_SHA1(SHA1M, gen_helper_crypto_sha1m)
DO_SHA1(SHA1SU0, gen_helper_crypto_sha1su0)

#define DO_SHA2(NAME, FUNC)                                             \
    WRAP_OOL_FN(gen_##NAME##_3s, FUNC)                                  \
    static bool trans_##NAME##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (!dc_isar_feature(aa32_sha2, s)) {                           \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##NAME##_3s);                         \
    }

DO_SHA2(SHA256H, gen_helper_crypto_sha256h)
DO_SHA2(SHA256H2, gen_helper_crypto_sha256h2)
DO_SHA2(SHA256SU1, gen_helper_crypto_sha256su1)

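/*
 * Wrap a 64-bit element helper as the .fni8 expansion of a GVecGen3;
 * the _ENV variant additionally passes cpu_env, which the saturating
 * helpers need so they can update the QC flag.
 */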
#define DO_3SAME_64(INSN, FUNC)                                         \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        static const GVecGen3 op = { .fni8 = FUNC };                    \
        tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &op);      \
    }                                                                   \
    DO_3SAME(INSN, gen_##INSN##_3s)

#define DO_3SAME_64_ENV(INSN, FUNC)                                     \
    static void gen_##INSN##_elt(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m)    \
    {                                                                   \
        FUNC(d, cpu_env, n, m);                                         \
    }                                                                   \
    DO_3SAME_64(INSN, gen_##INSN##_elt)

DO_3SAME_64(VRSHL_S64, gen_helper_neon_rshl_s64)
DO_3SAME_64(VRSHL_U64, gen_helper_neon_rshl_u64)
DO_3SAME_64_ENV(VQSHL_S64, gen_helper_neon_qshl_s64)
DO_3SAME_64_ENV(VQSHL_U64, gen_helper_neon_qshl_u64)
DO_3SAME_64_ENV(VQRSHL_S64, gen_helper_neon_qrshl_s64)
DO_3SAME_64_ENV(VQRSHL_U64, gen_helper_neon_qrshl_u64)

#define DO_3SAME_32(INSN, FUNC)                                         \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        static const GVecGen3 ops[4] = {                                \
            { .fni4 = gen_helper_neon_##FUNC##8 },                      \
            { .fni4 = gen_helper_neon_##FUNC##16 },                     \
            { .fni4 = gen_helper_neon_##FUNC##32 },                     \
            { 0 },                                                      \
        };                                                              \
        tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece]); \
    }                                                                   \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (a->size > 2) {                                              \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##INSN##_3s);                         \
    }

/*
 * Some helper functions need to be passed the cpu_env. In order
 * to use those with the gvec APIs like tcg_gen_gvec_3() we need
 * to create wrapper functions whose prototype is a NeonGenTwoOpFn()
 * and which call a NeonGenTwoOpEnvFn().
 */
#define WRAP_ENV_FN(WRAPNAME, FUNC)                                     \
    static void WRAPNAME(TCGv_i32 d, TCGv_i32 n, TCGv_i32 m)            \
    {                                                                   \
        FUNC(d, cpu_env, n, m);                                         \
    }

#define DO_3SAME_32_ENV(INSN, FUNC)                                     \
    WRAP_ENV_FN(gen_##INSN##_tramp8, gen_helper_neon_##FUNC##8);        \
    WRAP_ENV_FN(gen_##INSN##_tramp16, gen_helper_neon_##FUNC##16);      \
    WRAP_ENV_FN(gen_##INSN##_tramp32, gen_helper_neon_##FUNC##32);      \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        static const GVecGen3 ops[4] = {                                \
            { .fni4 = gen_##INSN##_tramp8 },                            \
            { .fni4 = gen_##INSN##_tramp16 },                           \
            { .fni4 = gen_##INSN##_tramp32 },                           \
            { 0 },                                                      \
        };                                                              \
        tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece]); \
    }                                                                   \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (a->size > 2) {                                              \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##INSN##_3s);                         \
    }

DO_3SAME_32(VHADD_S, hadd_s)
DO_3SAME_32(VHADD_U, hadd_u)
DO_3SAME_32(VHSUB_S, hsub_s)
DO_3SAME_32(VHSUB_U, hsub_u)
DO_3SAME_32(VRHADD_S, rhadd_s)
DO_3SAME_32(VRHADD_U, rhadd_u)
DO_3SAME_32(VRSHL_S, rshl_s)
DO_3SAME_32(VRSHL_U, rshl_u)

DO_3SAME_32_ENV(VQSHL_S, qshl_s)
DO_3SAME_32_ENV(VQSHL_U, qshl_u)
DO_3SAME_32_ENV(VQRSHL_S, qrshl_s)
DO_3SAME_32_ENV(VQRSHL_U, qrshl_u)

static bool do_3same_pair(DisasContext *s, arg_3same *a, NeonGenTwoOpFn *fn)
{
    /* Operations handled pairwise 32 bits at a time */
    TCGv_i32 tmp, tmp2, tmp3;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if (a->size == 3) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    assert(a->q == 0); /* enforced by decode patterns */

    /*
     * Note that we have to be careful not to clobber the source operands
     * in the "vm == vd" case by storing the result of the first pass too
     * early. Since Q is 0 there are always just two passes, so instead
     * of a complicated loop over each pass we just unroll.
     */
    tmp = tcg_temp_new_i32();
    tmp2 = tcg_temp_new_i32();
    tmp3 = tcg_temp_new_i32();

    read_neon_element32(tmp, a->vn, 0, MO_32);
    read_neon_element32(tmp2, a->vn, 1, MO_32);
    fn(tmp, tmp, tmp2);

    read_neon_element32(tmp3, a->vm, 0, MO_32);
    read_neon_element32(tmp2, a->vm, 1, MO_32);
    fn(tmp3, tmp3, tmp2);

    write_neon_element32(tmp, a->vd, 0, MO_32);
    write_neon_element32(tmp3, a->vd, 1, MO_32);

    tcg_temp_free_i32(tmp);
    tcg_temp_free_i32(tmp2);
    tcg_temp_free_i32(tmp3);
    return true;
}

#define DO_3SAME_PAIR(INSN, func)                                       \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        static NeonGenTwoOpFn * const fns[] = {                         \
            gen_helper_neon_##func##8,                                  \
            gen_helper_neon_##func##16,                                 \
            gen_helper_neon_##func##32,                                 \
        };                                                              \
        if (a->size > 2) {                                              \
            return false;                                               \
        }                                                               \
        return do_3same_pair(s, a, fns[a->size]);                       \
    }

/* 32-bit pairwise ops end up the same as the elementwise versions.  */
#define gen_helper_neon_pmax_s32  tcg_gen_smax_i32
#define gen_helper_neon_pmax_u32  tcg_gen_umax_i32
#define gen_helper_neon_pmin_s32  tcg_gen_smin_i32
#define gen_helper_neon_pmin_u32  tcg_gen_umin_i32
#define gen_helper_neon_padd_u32  tcg_gen_add_i32

DO_3SAME_PAIR(VPMAX_S, pmax_s)
DO_3SAME_PAIR(VPMIN_S, pmin_s)
DO_3SAME_PAIR(VPMAX_U, pmax_u)
DO_3SAME_PAIR(VPMIN_U, pmin_u)
DO_3SAME_PAIR(VPADD, padd_u)

#define DO_3SAME_VQDMULH(INSN, FUNC)                                    \
    WRAP_ENV_FN(gen_##INSN##_tramp16, gen_helper_neon_##FUNC##_s16);    \
    WRAP_ENV_FN(gen_##INSN##_tramp32, gen_helper_neon_##FUNC##_s32);    \
    static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs,         \
                                uint32_t rn_ofs, uint32_t rm_ofs,       \
                                uint32_t oprsz, uint32_t maxsz)         \
    {                                                                   \
        static const GVecGen3 ops[2] = {                                \
            { .fni4 = gen_##INSN##_tramp16 },                           \
            { .fni4 = gen_##INSN##_tramp32 },                           \
        };                                                              \
        tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece - 1]); \
    }                                                                   \
    static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a)        \
    {                                                                   \
        if (a->size != 1 && a->size != 2) {                             \
            return false;                                               \
        }                                                               \
        return do_3same(s, a, gen_##INSN##_3s);                         \
    }

DO_3SAME_VQDMULH(VQDMULH, qdmulh)
DO_3SAME_VQDMULH(VQRDMULH, qrdmulh)

#define WRAP_FP_GVEC(WRAPNAME, FPST, FUNC)                              \
    static void WRAPNAME(unsigned vece, uint32_t rd_ofs,                \
                         uint32_t rn_ofs, uint32_t rm_ofs,              \
                         uint32_t oprsz, uint32_t maxsz)                \
    {                                                                   \
        TCGv_ptr fpst = fpstatus_ptr(FPST);                             \
        tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, fpst,                \
                           oprsz, maxsz, 0, FUNC);                      \
        tcg_temp_free_ptr(fpst);                                        \
    }

#define DO_3S_FP_GVEC(INSN,SFUNC,HFUNC)                                 \
    WRAP_FP_GVEC(gen_##INSN##_fp32_3s, FPST_STD, SFUNC)                 \
    WRAP_FP_GVEC(gen_##INSN##_fp16_3s, FPST_STD_F16, HFUNC)             \
    static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a)     \
    {                                                                   \
        if (a->size == MO_16) {                                         \
            if (!dc_isar_feature(aa32_fp16_arith, s)) {                 \
                return false;                                           \
            }                                                           \
            return do_3same(s, a, gen_##INSN##_fp16_3s);                \
        }                                                               \
        return do_3same(s, a, gen_##INSN##_fp32_3s);                    \
    }

DO_3S_FP_GVEC(VADD, gen_helper_gvec_fadd_s, gen_helper_gvec_fadd_h)
DO_3S_FP_GVEC(VSUB, gen_helper_gvec_fsub_s, gen_helper_gvec_fsub_h)
DO_3S_FP_GVEC(VABD, gen_helper_gvec_fabd_s, gen_helper_gvec_fabd_h)
DO_3S_FP_GVEC(VMUL, gen_helper_gvec_fmul_s, gen_helper_gvec_fmul_h)
DO_3S_FP_GVEC(VCEQ, gen_helper_gvec_fceq_s, gen_helper_gvec_fceq_h)
DO_3S_FP_GVEC(VCGE, gen_helper_gvec_fcge_s, gen_helper_gvec_fcge_h)
DO_3S_FP_GVEC(VCGT, gen_helper_gvec_fcgt_s, gen_helper_gvec_fcgt_h)
DO_3S_FP_GVEC(VACGE, gen_helper_gvec_facge_s, gen_helper_gvec_facge_h)
DO_3S_FP_GVEC(VACGT, gen_helper_gvec_facgt_s, gen_helper_gvec_facgt_h)
DO_3S_FP_GVEC(VMAX, gen_helper_gvec_fmax_s, gen_helper_gvec_fmax_h)
DO_3S_FP_GVEC(VMIN, gen_helper_gvec_fmin_s, gen_helper_gvec_fmin_h)
DO_3S_FP_GVEC(VMLA, gen_helper_gvec_fmla_s, gen_helper_gvec_fmla_h)
DO_3S_FP_GVEC(VMLS, gen_helper_gvec_fmls_s, gen_helper_gvec_fmls_h)
DO_3S_FP_GVEC(VFMA, gen_helper_gvec_vfma_s, gen_helper_gvec_vfma_h)
DO_3S_FP_GVEC(VFMS, gen_helper_gvec_vfms_s, gen_helper_gvec_vfms_h)
DO_3S_FP_GVEC(VRECPS, gen_helper_gvec_recps_nf_s, gen_helper_gvec_recps_nf_h)
DO_3S_FP_GVEC(VRSQRTS, gen_helper_gvec_rsqrts_nf_s, gen_helper_gvec_rsqrts_nf_h)

WRAP_FP_GVEC(gen_VMAXNM_fp32_3s, FPST_STD, gen_helper_gvec_fmaxnum_s)
WRAP_FP_GVEC(gen_VMAXNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fmaxnum_h)
WRAP_FP_GVEC(gen_VMINNM_fp32_3s, FPST_STD, gen_helper_gvec_fminnum_s)
WRAP_FP_GVEC(gen_VMINNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fminnum_h)

static bool trans_VMAXNM_fp_3s(DisasContext *s, arg_3same *a)
{
    if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
        return false;
    }

    if (a->size == MO_16) {
        if (!dc_isar_feature(aa32_fp16_arith, s)) {
            return false;
        }
        return do_3same(s, a, gen_VMAXNM_fp16_3s);
    }
    return do_3same(s, a, gen_VMAXNM_fp32_3s);
}

static bool trans_VMINNM_fp_3s(DisasContext *s, arg_3same *a)
{
    if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
        return false;
    }

    if (a->size == MO_16) {
        if (!dc_isar_feature(aa32_fp16_arith, s)) {
            return false;
        }
        return do_3same(s, a, gen_VMINNM_fp16_3s);
    }
    return do_3same(s, a, gen_VMINNM_fp32_3s);
}

static bool do_3same_fp_pair(DisasContext *s, arg_3same *a,
                             gen_helper_gvec_3_ptr *fn)
{
    /* FP pairwise operations */
    TCGv_ptr fpstatus;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vn | a->vm) & 0x10)) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    assert(a->q == 0); /* enforced by decode patterns */

    fpstatus = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD);
    tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
                       vfp_reg_offset(1, a->vn),
                       vfp_reg_offset(1, a->vm),
                       fpstatus, 8, 8, 0, fn);
    tcg_temp_free_ptr(fpstatus);

    return true;
}

/*
 * For all the functions using this macro, size == 1 means fp16,
 * which requires the FP16 arithmetic extension (checked below).
 */
#define DO_3S_FP_PAIR(INSN,FUNC)                                    \
    static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a) \
    {                                                               \
        if (a->size == MO_16) {                                     \
            if (!dc_isar_feature(aa32_fp16_arith, s)) {             \
                return false;                                       \
            }                                                       \
            return do_3same_fp_pair(s, a, FUNC##h);                 \
        }                                                           \
        return do_3same_fp_pair(s, a, FUNC##s);                     \
    }

DO_3S_FP_PAIR(VPADD, gen_helper_neon_padd)
DO_3S_FP_PAIR(VPMAX, gen_helper_neon_pmax)
DO_3S_FP_PAIR(VPMIN, gen_helper_neon_pmin)

static bool do_vector_2sh(DisasContext *s, arg_2reg_shift *a, GVecGen2iFn *fn)
{
    /* Handle a 2-reg-shift insn which can be vectorized. */
    int vec_size = a->q ? 16 : 8;
    int rd_ofs = neon_full_reg_offset(a->vd);
    int rm_ofs = neon_full_reg_offset(a->vm);

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if ((a->vm | a->vd) & a->q) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    fn(a->size, rd_ofs, rm_ofs, a->shift, vec_size, vec_size);
    return true;
}

#define DO_2SH(INSN, FUNC)                                              \
    static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
    {                                                                   \
        return do_vector_2sh(s, a, FUNC);                               \
    }

DO_2SH(VSHL, tcg_gen_gvec_shli)
DO_2SH(VSLI, gen_gvec_sli)
DO_2SH(VSRI, gen_gvec_sri)
DO_2SH(VSRA_S, gen_gvec_ssra)
DO_2SH(VSRA_U, gen_gvec_usra)
DO_2SH(VRSHR_S, gen_gvec_srshr)
DO_2SH(VRSHR_U, gen_gvec_urshr)
DO_2SH(VRSRA_S, gen_gvec_srsra)
DO_2SH(VRSRA_U, gen_gvec_ursra)

static bool trans_VSHR_S_2sh(DisasContext *s, arg_2reg_shift *a)
{
    /* Signed shift out of range results in all-sign-bits */
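    /* e.g. for size MO_8 an immediate shift of 8 is clamped to 7 */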
1302    a->shift = MIN(a->shift, (8 << a->size) - 1);
1303    return do_vector_2sh(s, a, tcg_gen_gvec_sari);
1304}
1305
1306static void gen_zero_rd_2sh(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
1307                            int64_t shift, uint32_t oprsz, uint32_t maxsz)
1308{
1309    tcg_gen_gvec_dup_imm(vece, rd_ofs, oprsz, maxsz, 0);
1310}
1311
1312static bool trans_VSHR_U_2sh(DisasContext *s, arg_2reg_shift *a)
1313{
1314    /* Shift out of range is architecturally valid and results in zero. */
1315    if (a->shift >= (8 << a->size)) {
1316        return do_vector_2sh(s, a, gen_zero_rd_2sh);
1317    } else {
1318        return do_vector_2sh(s, a, tcg_gen_gvec_shri);
1319    }
1320}
1321
1322static bool do_2shift_env_64(DisasContext *s, arg_2reg_shift *a,
1323                             NeonGenTwo64OpEnvFn *fn)
1324{
1325    /*
1326     * 2-reg-and-shift operations, size == 3 case, where the
1327     * function needs to be passed cpu_env.
1328     */
1329    TCGv_i64 constimm;
1330    int pass;
1331
1332    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1333        return false;
1334    }
1335
1336    /* UNDEF accesses to D16-D31 if they don't exist. */
1337    if (!dc_isar_feature(aa32_simd_r32, s) &&
1338        ((a->vd | a->vm) & 0x10)) {
1339        return false;
1340    }
1341
1342    if ((a->vm | a->vd) & a->q) {
1343        return false;
1344    }
1345
1346    if (!vfp_access_check(s)) {
1347        return true;
1348    }
1349
1350    /*
1351     * To avoid excessive duplication of ops we implement shift
1352     * by immediate using the variable shift operations.
1353     */
1354    constimm = tcg_const_i64(dup_const(a->size, a->shift));
1355
1356    for (pass = 0; pass < a->q + 1; pass++) {
1357        TCGv_i64 tmp = tcg_temp_new_i64();
1358
1359        read_neon_element64(tmp, a->vm, pass, MO_64);
1360        fn(tmp, cpu_env, tmp, constimm);
1361        write_neon_element64(tmp, a->vd, pass, MO_64);
1362        tcg_temp_free_i64(tmp);
1363    }
1364    tcg_temp_free_i64(constimm);
1365    return true;
1366}
1367
1368static bool do_2shift_env_32(DisasContext *s, arg_2reg_shift *a,
1369                             NeonGenTwoOpEnvFn *fn)
1370{
1371    /*
1372     * 2-reg-and-shift operations, size < 3 case, where the
1373     * helper needs to be passed cpu_env.
1374     */
1375    TCGv_i32 constimm, tmp;
1376    int pass;
1377
1378    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1379        return false;
1380    }
1381
1382    /* UNDEF accesses to D16-D31 if they don't exist. */
1383    if (!dc_isar_feature(aa32_simd_r32, s) &&
1384        ((a->vd | a->vm) & 0x10)) {
1385        return false;
1386    }
1387
1388    if ((a->vm | a->vd) & a->q) {
1389        return false;
1390    }
1391
1392    if (!vfp_access_check(s)) {
1393        return true;
1394    }
1395
1396    /*
1397     * To avoid excessive duplication of ops we implement shift
1398     * by immediate using the variable shift operations.
1399     */
1400    constimm = tcg_const_i32(dup_const(a->size, a->shift));
1401    tmp = tcg_temp_new_i32();
1402
1403    for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
1404        read_neon_element32(tmp, a->vm, pass, MO_32);
1405        fn(tmp, cpu_env, tmp, constimm);
1406        write_neon_element32(tmp, a->vd, pass, MO_32);
1407    }
1408    tcg_temp_free_i32(tmp);
1409    tcg_temp_free_i32(constimm);
1410    return true;
1411}
1412
1413#define DO_2SHIFT_ENV(INSN, FUNC)                                       \
1414    static bool trans_##INSN##_64_2sh(DisasContext *s, arg_2reg_shift *a) \
1415    {                                                                   \
1416        return do_2shift_env_64(s, a, gen_helper_neon_##FUNC##64);      \
1417    }                                                                   \
1418    static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1419    {                                                                   \
1420        static NeonGenTwoOpEnvFn * const fns[] = {                      \
1421            gen_helper_neon_##FUNC##8,                                  \
1422            gen_helper_neon_##FUNC##16,                                 \
1423            gen_helper_neon_##FUNC##32,                                 \
1424        };                                                              \
1425        assert(a->size < ARRAY_SIZE(fns));                              \
1426        return do_2shift_env_32(s, a, fns[a->size]);                    \
1427    }
1428
1429DO_2SHIFT_ENV(VQSHLU, qshlu_s)
1430DO_2SHIFT_ENV(VQSHL_U, qshl_u)
1431DO_2SHIFT_ENV(VQSHL_S, qshl_s)
1432
1433static bool do_2shift_narrow_64(DisasContext *s, arg_2reg_shift *a,
1434                                NeonGenTwo64OpFn *shiftfn,
1435                                NeonGenNarrowEnvFn *narrowfn)
1436{
1437    /* 2-reg-and-shift narrowing-shift operations, size == 3 case */
1438    TCGv_i64 constimm, rm1, rm2;
1439    TCGv_i32 rd;
1440
1441    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1442        return false;
1443    }
1444
1445    /* UNDEF accesses to D16-D31 if they don't exist. */
1446    if (!dc_isar_feature(aa32_simd_r32, s) &&
1447        ((a->vd | a->vm) & 0x10)) {
1448        return false;
1449    }
1450
1451    if (a->vm & 1) {
1452        return false;
1453    }
1454
1455    if (!vfp_access_check(s)) {
1456        return true;
1457    }
1458
1459    /*
1460     * This is always a right shift, and the shiftfn is always a
1461     * left-shift helper, which thus needs the negated shift count.
1462     */
1463    constimm = tcg_const_i64(-a->shift);
1464    rm1 = tcg_temp_new_i64();
1465    rm2 = tcg_temp_new_i64();
1466    rd = tcg_temp_new_i32();
1467
1468    /* Load both inputs first to avoid potential overwrite if rm == rd */
1469    read_neon_element64(rm1, a->vm, 0, MO_64);
1470    read_neon_element64(rm2, a->vm, 1, MO_64);
1471
1472    shiftfn(rm1, rm1, constimm);
1473    narrowfn(rd, cpu_env, rm1);
1474    write_neon_element32(rd, a->vd, 0, MO_32);
1475
1476    shiftfn(rm2, rm2, constimm);
1477    narrowfn(rd, cpu_env, rm2);
1478    write_neon_element32(rd, a->vd, 1, MO_32);
1479
1480    tcg_temp_free_i32(rd);
1481    tcg_temp_free_i64(rm1);
1482    tcg_temp_free_i64(rm2);
1483    tcg_temp_free_i64(constimm);
1484
1485    return true;
1486}
1487
1488static bool do_2shift_narrow_32(DisasContext *s, arg_2reg_shift *a,
1489                                NeonGenTwoOpFn *shiftfn,
1490                                NeonGenNarrowEnvFn *narrowfn)
1491{
1492    /* 2-reg-and-shift narrowing-shift operations, size < 3 case */
1493    TCGv_i32 constimm, rm1, rm2, rm3, rm4;
1494    TCGv_i64 rtmp;
1495    uint32_t imm;
1496
1497    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1498        return false;
1499    }
1500
1501    /* UNDEF accesses to D16-D31 if they don't exist. */
1502    if (!dc_isar_feature(aa32_simd_r32, s) &&
1503        ((a->vd | a->vm) & 0x10)) {
1504        return false;
1505    }
1506
1507    if (a->vm & 1) {
1508        return false;
1509    }
1510
1511    if (!vfp_access_check(s)) {
1512        return true;
1513    }
1514
1515    /*
1516     * This is always a right shift, and the shiftfn is always a
1517     * left-shift helper, which thus needs the negated shift count
1518     * duplicated into each lane of the immediate value.
1519     */
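        /* e.g. size 1 with shift 5: imm == 0xfffbfffb, -5 per 16-bit lane. */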
1520    if (a->size == 1) {
1521        imm = (uint16_t)(-a->shift);
1522        imm |= imm << 16;
1523    } else {
1524        /* size == 2 */
1525        imm = -a->shift;
1526    }
1527    constimm = tcg_const_i32(imm);
1528
1529    /* Load all inputs first to avoid potential overwrite */
1530    rm1 = tcg_temp_new_i32();
1531    rm2 = tcg_temp_new_i32();
1532    rm3 = tcg_temp_new_i32();
1533    rm4 = tcg_temp_new_i32();
1534    read_neon_element32(rm1, a->vm, 0, MO_32);
1535    read_neon_element32(rm2, a->vm, 1, MO_32);
1536    read_neon_element32(rm3, a->vm, 2, MO_32);
1537    read_neon_element32(rm4, a->vm, 3, MO_32);
1538    rtmp = tcg_temp_new_i64();
1539
1540    shiftfn(rm1, rm1, constimm);
1541    shiftfn(rm2, rm2, constimm);
1542
1543    tcg_gen_concat_i32_i64(rtmp, rm1, rm2);
1544    tcg_temp_free_i32(rm2);
1545
1546    narrowfn(rm1, cpu_env, rtmp);
1547    write_neon_element32(rm1, a->vd, 0, MO_32);
1548    tcg_temp_free_i32(rm1);
1549
1550    shiftfn(rm3, rm3, constimm);
1551    shiftfn(rm4, rm4, constimm);
1552    tcg_temp_free_i32(constimm);
1553
1554    tcg_gen_concat_i32_i64(rtmp, rm3, rm4);
1555    tcg_temp_free_i32(rm4);
1556
1557    narrowfn(rm3, cpu_env, rtmp);
1558    tcg_temp_free_i64(rtmp);
1559    write_neon_element32(rm3, a->vd, 1, MO_32);
1560    tcg_temp_free_i32(rm3);
1561    return true;
1562}
1563
1564#define DO_2SN_64(INSN, FUNC, NARROWFUNC)                               \
1565    static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1566    {                                                                   \
1567        return do_2shift_narrow_64(s, a, FUNC, NARROWFUNC);             \
1568    }
1569#define DO_2SN_32(INSN, FUNC, NARROWFUNC)                               \
1570    static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1571    {                                                                   \
1572        return do_2shift_narrow_32(s, a, FUNC, NARROWFUNC);             \
1573    }
1574
1575static void gen_neon_narrow_u32(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
1576{
1577    tcg_gen_extrl_i64_i32(dest, src);
1578}
1579
1580static void gen_neon_narrow_u16(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
1581{
1582    gen_helper_neon_narrow_u16(dest, src);
1583}
1584
1585static void gen_neon_narrow_u8(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
1586{
1587    gen_helper_neon_narrow_u8(dest, src);
1588}
1589
1590DO_2SN_64(VSHRN_64, gen_ushl_i64, gen_neon_narrow_u32)
1591DO_2SN_32(VSHRN_32, gen_ushl_i32, gen_neon_narrow_u16)
1592DO_2SN_32(VSHRN_16, gen_helper_neon_shl_u16, gen_neon_narrow_u8)
1593
1594DO_2SN_64(VRSHRN_64, gen_helper_neon_rshl_u64, gen_neon_narrow_u32)
1595DO_2SN_32(VRSHRN_32, gen_helper_neon_rshl_u32, gen_neon_narrow_u16)
1596DO_2SN_32(VRSHRN_16, gen_helper_neon_rshl_u16, gen_neon_narrow_u8)
1597
1598DO_2SN_64(VQSHRUN_64, gen_sshl_i64, gen_helper_neon_unarrow_sat32)
1599DO_2SN_32(VQSHRUN_32, gen_sshl_i32, gen_helper_neon_unarrow_sat16)
1600DO_2SN_32(VQSHRUN_16, gen_helper_neon_shl_s16, gen_helper_neon_unarrow_sat8)
1601
1602DO_2SN_64(VQRSHRUN_64, gen_helper_neon_rshl_s64, gen_helper_neon_unarrow_sat32)
1603DO_2SN_32(VQRSHRUN_32, gen_helper_neon_rshl_s32, gen_helper_neon_unarrow_sat16)
1604DO_2SN_32(VQRSHRUN_16, gen_helper_neon_rshl_s16, gen_helper_neon_unarrow_sat8)
1605DO_2SN_64(VQSHRN_S64, gen_sshl_i64, gen_helper_neon_narrow_sat_s32)
1606DO_2SN_32(VQSHRN_S32, gen_sshl_i32, gen_helper_neon_narrow_sat_s16)
1607DO_2SN_32(VQSHRN_S16, gen_helper_neon_shl_s16, gen_helper_neon_narrow_sat_s8)
1608
1609DO_2SN_64(VQRSHRN_S64, gen_helper_neon_rshl_s64, gen_helper_neon_narrow_sat_s32)
1610DO_2SN_32(VQRSHRN_S32, gen_helper_neon_rshl_s32, gen_helper_neon_narrow_sat_s16)
1611DO_2SN_32(VQRSHRN_S16, gen_helper_neon_rshl_s16, gen_helper_neon_narrow_sat_s8)
1612
1613DO_2SN_64(VQSHRN_U64, gen_ushl_i64, gen_helper_neon_narrow_sat_u32)
1614DO_2SN_32(VQSHRN_U32, gen_ushl_i32, gen_helper_neon_narrow_sat_u16)
1615DO_2SN_32(VQSHRN_U16, gen_helper_neon_shl_u16, gen_helper_neon_narrow_sat_u8)
1616
1617DO_2SN_64(VQRSHRN_U64, gen_helper_neon_rshl_u64, gen_helper_neon_narrow_sat_u32)
1618DO_2SN_32(VQRSHRN_U32, gen_helper_neon_rshl_u32, gen_helper_neon_narrow_sat_u16)
1619DO_2SN_32(VQRSHRN_U16, gen_helper_neon_rshl_u16, gen_helper_neon_narrow_sat_u8)
1620
1621static bool do_vshll_2sh(DisasContext *s, arg_2reg_shift *a,
1622                         NeonGenWidenFn *widenfn, bool u)
1623{
1624    TCGv_i64 tmp;
1625    TCGv_i32 rm0, rm1;
1626    uint64_t widen_mask = 0;
1627
1628    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1629        return false;
1630    }
1631
1632    /* UNDEF accesses to D16-D31 if they don't exist. */
1633    if (!dc_isar_feature(aa32_simd_r32, s) &&
1634        ((a->vd | a->vm) & 0x10)) {
1635        return false;
1636    }
1637
1638    if (a->vd & 1) {
1639        return false;
1640    }
1641
1642    if (!vfp_access_check(s)) {
1643        return true;
1644    }
1645
1646    /*
1647     * This is a widen-and-shift operation. The shift is always less
1648     * than the width of the source type, so after widening the input
1649     * vector we can simply shift the whole 64-bit widened register,
1650     * and then clear the bits where the top bits of one narrow input
1651     * have been shifted into the low bits of its more significant
1652     * neighbour's lane. Calculate a mask of bits to clear.
1653     */
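        /*
         * e.g. size 0 with shift 3: widen_mask == 0x0007000700070007,
         * clearing the low 3 bits of each 16-bit lane after the shift.
         */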
1654    if ((a->shift != 0) && (a->size < 2 || u)) {
1655        int esize = 8 << a->size;
1656        widen_mask = MAKE_64BIT_MASK(0, esize);
1657        widen_mask >>= esize - a->shift;
1658        widen_mask = dup_const(a->size + 1, widen_mask);
1659    }
1660
1661    rm0 = tcg_temp_new_i32();
1662    rm1 = tcg_temp_new_i32();
1663    read_neon_element32(rm0, a->vm, 0, MO_32);
1664    read_neon_element32(rm1, a->vm, 1, MO_32);
1665    tmp = tcg_temp_new_i64();
1666
1667    widenfn(tmp, rm0);
1668    tcg_temp_free_i32(rm0);
1669    if (a->shift != 0) {
1670        tcg_gen_shli_i64(tmp, tmp, a->shift);
1671        tcg_gen_andi_i64(tmp, tmp, ~widen_mask);
1672    }
1673    write_neon_element64(tmp, a->vd, 0, MO_64);
1674
1675    widenfn(tmp, rm1);
1676    tcg_temp_free_i32(rm1);
1677    if (a->shift != 0) {
1678        tcg_gen_shli_i64(tmp, tmp, a->shift);
1679        tcg_gen_andi_i64(tmp, tmp, ~widen_mask);
1680    }
1681    write_neon_element64(tmp, a->vd, 1, MO_64);
1682    tcg_temp_free_i64(tmp);
1683    return true;
1684}
1685
1686static bool trans_VSHLL_S_2sh(DisasContext *s, arg_2reg_shift *a)
1687{
1688    static NeonGenWidenFn * const widenfn[] = {
1689        gen_helper_neon_widen_s8,
1690        gen_helper_neon_widen_s16,
1691        tcg_gen_ext_i32_i64,
1692    };
1693    return do_vshll_2sh(s, a, widenfn[a->size], false);
1694}
1695
1696static bool trans_VSHLL_U_2sh(DisasContext *s, arg_2reg_shift *a)
1697{
1698    static NeonGenWidenFn * const widenfn[] = {
1699        gen_helper_neon_widen_u8,
1700        gen_helper_neon_widen_u16,
1701        tcg_gen_extu_i32_i64,
1702    };
1703    return do_vshll_2sh(s, a, widenfn[a->size], true);
1704}
1705
1706static bool do_fp_2sh(DisasContext *s, arg_2reg_shift *a,
1707                      gen_helper_gvec_2_ptr *fn)
1708{
1709    /* FP operations in 2-reg-and-shift group */
1710    int vec_size = a->q ? 16 : 8;
1711    int rd_ofs = neon_full_reg_offset(a->vd);
1712    int rm_ofs = neon_full_reg_offset(a->vm);
1713    TCGv_ptr fpst;
1714
1715    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1716        return false;
1717    }
1718
1719    if (a->size == MO_16) {
1720        if (!dc_isar_feature(aa32_fp16_arith, s)) {
1721            return false;
1722        }
1723    }
1724
1725    /* UNDEF accesses to D16-D31 if they don't exist. */
1726    if (!dc_isar_feature(aa32_simd_r32, s) &&
1727        ((a->vd | a->vm) & 0x10)) {
1728        return false;
1729    }
1730
1731    if ((a->vm | a->vd) & a->q) {
1732        return false;
1733    }
1734
1735    if (!vfp_access_check(s)) {
1736        return true;
1737    }
1738
1739    fpst = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD);
1740    tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, vec_size, vec_size, a->shift, fn);
1741    tcg_temp_free_ptr(fpst);
1742    return true;
1743}
1744
1745#define DO_FP_2SH(INSN, FUNC)                                           \
1746    static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a)  \
1747    {                                                                   \
1748        return do_fp_2sh(s, a, FUNC);                                   \
1749    }
1750
1751DO_FP_2SH(VCVT_SF, gen_helper_gvec_vcvt_sf)
1752DO_FP_2SH(VCVT_UF, gen_helper_gvec_vcvt_uf)
1753DO_FP_2SH(VCVT_FS, gen_helper_gvec_vcvt_fs)
1754DO_FP_2SH(VCVT_FU, gen_helper_gvec_vcvt_fu)
1755
1756DO_FP_2SH(VCVT_SH, gen_helper_gvec_vcvt_sh)
1757DO_FP_2SH(VCVT_UH, gen_helper_gvec_vcvt_uh)
1758DO_FP_2SH(VCVT_HS, gen_helper_gvec_vcvt_hs)
1759DO_FP_2SH(VCVT_HU, gen_helper_gvec_vcvt_hu)
1760
1761static bool do_1reg_imm(DisasContext *s, arg_1reg_imm *a,
1762                        GVecGen2iFn *fn)
1763{
1764    uint64_t imm;
1765    int reg_ofs, vec_size;
1766
1767    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1768        return false;
1769    }
1770
1771    /* UNDEF accesses to D16-D31 if they don't exist. */
1772    if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
1773        return false;
1774    }
1775
1776    if (a->vd & a->q) {
1777        return false;
1778    }
1779
1780    if (!vfp_access_check(s)) {
1781        return true;
1782    }
1783
1784    reg_ofs = neon_full_reg_offset(a->vd);
1785    vec_size = a->q ? 16 : 8;
1786    imm = asimd_imm_const(a->imm, a->cmode, a->op);
1787
1788    fn(MO_64, reg_ofs, reg_ofs, imm, vec_size, vec_size);
1789    return true;
1790}
1791
1792static void gen_VMOV_1r(unsigned vece, uint32_t dofs, uint32_t aofs,
1793                        int64_t c, uint32_t oprsz, uint32_t maxsz)
1794{
1795    tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, c);
1796}
1797
1798static bool trans_Vimm_1r(DisasContext *s, arg_1reg_imm *a)
1799{
1800    /* Decode cmode/op here to choose between VORR/VBIC/VMOV */
1801    GVecGen2iFn *fn;
1802
1803    if ((a->cmode & 1) && a->cmode < 12) {
1804        /* for op=1, the imm will be inverted, so BIC becomes AND. */
1805        fn = a->op ? tcg_gen_gvec_andi : tcg_gen_gvec_ori;
1806    } else {
1807        /* There is one unallocated cmode/op combination in this space */
1808        if (a->cmode == 15 && a->op == 1) {
1809            return false;
1810        }
1811        fn = gen_VMOV_1r;
1812    }
1813    return do_1reg_imm(s, a, fn);
1814}
1815
1816static bool do_prewiden_3d(DisasContext *s, arg_3diff *a,
1817                           NeonGenWidenFn *widenfn,
1818                           NeonGenTwo64OpFn *opfn,
1819                           int src1_mop, int src2_mop)
1820{
1821    /* 3-regs different lengths, prewidening case (VADDL/VSUBL/VADDW/VSUBW) */
1822    TCGv_i64 rn0_64, rn1_64, rm_64;
1823
1824    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1825        return false;
1826    }
1827
1828    /* UNDEF accesses to D16-D31 if they don't exist. */
1829    if (!dc_isar_feature(aa32_simd_r32, s) &&
1830        ((a->vd | a->vn | a->vm) & 0x10)) {
1831        return false;
1832    }
1833
1834    if (!opfn) {
1835        /* size == 3 case, which is an entirely different insn group */
1836        return false;
1837    }
1838
1839    if ((a->vd & 1) || (src1_mop == MO_Q && (a->vn & 1))) {
1840        return false;
1841    }
1842
1843    if (!vfp_access_check(s)) {
1844        return true;
1845    }
1846
1847    rn0_64 = tcg_temp_new_i64();
1848    rn1_64 = tcg_temp_new_i64();
1849    rm_64 = tcg_temp_new_i64();
1850
1851    if (src1_mop >= 0) {
1852        read_neon_element64(rn0_64, a->vn, 0, src1_mop);
1853    } else {
1854        TCGv_i32 tmp = tcg_temp_new_i32();
1855        read_neon_element32(tmp, a->vn, 0, MO_32);
1856        widenfn(rn0_64, tmp);
1857        tcg_temp_free_i32(tmp);
1858    }
1859    if (src2_mop >= 0) {
1860        read_neon_element64(rm_64, a->vm, 0, src2_mop);
1861    } else {
1862        TCGv_i32 tmp = tcg_temp_new_i32();
1863        read_neon_element32(tmp, a->vm, 0, MO_32);
1864        widenfn(rm_64, tmp);
1865        tcg_temp_free_i32(tmp);
1866    }
1867
1868    opfn(rn0_64, rn0_64, rm_64);
1869
1870    /*
1871     * Load second pass inputs before storing the first pass result, to
1872     * avoid incorrect results if a narrow input overlaps with the result.
1873     */
1874    if (src1_mop >= 0) {
1875        read_neon_element64(rn1_64, a->vn, 1, src1_mop);
1876    } else {
1877        TCGv_i32 tmp = tcg_temp_new_i32();
1878        read_neon_element32(tmp, a->vn, 1, MO_32);
1879        widenfn(rn1_64, tmp);
1880        tcg_temp_free_i32(tmp);
1881    }
1882    if (src2_mop >= 0) {
1883        read_neon_element64(rm_64, a->vm, 1, src2_mop);
1884    } else {
1885        TCGv_i32 tmp = tcg_temp_new_i32();
1886        read_neon_element32(tmp, a->vm, 1, MO_32);
1887        widenfn(rm_64, tmp);
1888        tcg_temp_free_i32(tmp);
1889    }
1890
1891    write_neon_element64(rn0_64, a->vd, 0, MO_64);
1892
1893    opfn(rn1_64, rn1_64, rm_64);
1894    write_neon_element64(rn1_64, a->vd, 1, MO_64);
1895
1896    tcg_temp_free_i64(rn0_64);
1897    tcg_temp_free_i64(rn1_64);
1898    tcg_temp_free_i64(rm_64);
1899
1900    return true;
1901}
1902
1903#define DO_PREWIDEN(INSN, S, OP, SRC1WIDE, SIGN)                        \
1904    static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
1905    {                                                                   \
1906        static NeonGenWidenFn * const widenfn[] = {                     \
1907            gen_helper_neon_widen_##S##8,                               \
1908            gen_helper_neon_widen_##S##16,                              \
1909            NULL, NULL,                                                 \
1910        };                                                              \
1911        static NeonGenTwo64OpFn * const addfn[] = {                     \
1912            gen_helper_neon_##OP##l_u16,                                \
1913            gen_helper_neon_##OP##l_u32,                                \
1914            tcg_gen_##OP##_i64,                                         \
1915            NULL,                                                       \
1916        };                                                              \
1917        int narrow_mop = a->size == MO_32 ? MO_32 | SIGN : -1;          \
1918        return do_prewiden_3d(s, a, widenfn[a->size], addfn[a->size],   \
1919                              SRC1WIDE ? MO_Q : narrow_mop,             \
1920                              narrow_mop);                              \
1921    }
1922
1923DO_PREWIDEN(VADDL_S, s, add, false, MO_SIGN)
1924DO_PREWIDEN(VADDL_U, u, add, false, 0)
1925DO_PREWIDEN(VSUBL_S, s, sub, false, MO_SIGN)
1926DO_PREWIDEN(VSUBL_U, u, sub, false, 0)
1927DO_PREWIDEN(VADDW_S, s, add, true, MO_SIGN)
1928DO_PREWIDEN(VADDW_U, u, add, true, 0)
1929DO_PREWIDEN(VSUBW_S, s, sub, true, MO_SIGN)
1930DO_PREWIDEN(VSUBW_U, u, sub, true, 0)
1931
1932static bool do_narrow_3d(DisasContext *s, arg_3diff *a,
1933                         NeonGenTwo64OpFn *opfn, NeonGenNarrowFn *narrowfn)
1934{
1935    /* 3-regs different lengths, narrowing (VADDHN/VSUBHN/VRADDHN/VRSUBHN) */
1936    TCGv_i64 rn_64, rm_64;
1937    TCGv_i32 rd0, rd1;
1938
1939    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1940        return false;
1941    }
1942
1943    /* UNDEF accesses to D16-D31 if they don't exist. */
1944    if (!dc_isar_feature(aa32_simd_r32, s) &&
1945        ((a->vd | a->vn | a->vm) & 0x10)) {
1946        return false;
1947    }
1948
1949    if (!opfn || !narrowfn) {
1950        /* size == 3 case, which is an entirely different insn group */
1951        return false;
1952    }
1953
1954    if ((a->vn | a->vm) & 1) {
1955        return false;
1956    }
1957
1958    if (!vfp_access_check(s)) {
1959        return true;
1960    }
1961
1962    rn_64 = tcg_temp_new_i64();
1963    rm_64 = tcg_temp_new_i64();
1964    rd0 = tcg_temp_new_i32();
1965    rd1 = tcg_temp_new_i32();
1966
1967    read_neon_element64(rn_64, a->vn, 0, MO_64);
1968    read_neon_element64(rm_64, a->vm, 0, MO_64);
1969
1970    opfn(rn_64, rn_64, rm_64);
1971
1972    narrowfn(rd0, rn_64);
1973
1974    read_neon_element64(rn_64, a->vn, 1, MO_64);
1975    read_neon_element64(rm_64, a->vm, 1, MO_64);
1976
1977    opfn(rn_64, rn_64, rm_64);
1978
1979    narrowfn(rd1, rn_64);
1980
1981    write_neon_element32(rd0, a->vd, 0, MO_32);
1982    write_neon_element32(rd1, a->vd, 1, MO_32);
1983
1984    tcg_temp_free_i32(rd0);
1985    tcg_temp_free_i32(rd1);
1986    tcg_temp_free_i64(rn_64);
1987    tcg_temp_free_i64(rm_64);
1988
1989    return true;
1990}
1991
1992#define DO_NARROW_3D(INSN, OP, NARROWTYPE, EXTOP)                       \
1993    static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
1994    {                                                                   \
1995        static NeonGenTwo64OpFn * const addfn[] = {                     \
1996            gen_helper_neon_##OP##l_u16,                                \
1997            gen_helper_neon_##OP##l_u32,                                \
1998            tcg_gen_##OP##_i64,                                         \
1999            NULL,                                                       \
2000        };                                                              \
2001        static NeonGenNarrowFn * const narrowfn[] = {                   \
2002            gen_helper_neon_##NARROWTYPE##_high_u8,                     \
2003            gen_helper_neon_##NARROWTYPE##_high_u16,                    \
2004            EXTOP,                                                      \
2005            NULL,                                                       \
2006        };                                                              \
2007        return do_narrow_3d(s, a, addfn[a->size], narrowfn[a->size]);   \
2008    }
2009
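    /* Round by adding 2^31 before taking the high half (round half up). */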
2010static void gen_narrow_round_high_u32(TCGv_i32 rd, TCGv_i64 rn)
2011{
2012    tcg_gen_addi_i64(rn, rn, 1u << 31);
2013    tcg_gen_extrh_i64_i32(rd, rn);
2014}
2015
2016DO_NARROW_3D(VADDHN, add, narrow, tcg_gen_extrh_i64_i32)
2017DO_NARROW_3D(VSUBHN, sub, narrow, tcg_gen_extrh_i64_i32)
2018DO_NARROW_3D(VRADDHN, add, narrow_round, gen_narrow_round_high_u32)
2019DO_NARROW_3D(VRSUBHN, sub, narrow_round, gen_narrow_round_high_u32)
2020
2021static bool do_long_3d(DisasContext *s, arg_3diff *a,
2022                       NeonGenTwoOpWidenFn *opfn,
2023                       NeonGenTwo64OpFn *accfn)
2024{
2025    /*
2026     * 3-regs different lengths, long operations.
2027     * These perform an operation on two inputs that returns a double-width
2028     * result, and then possibly perform an accumulation operation of
2029     * that result into the double-width destination.
2030     */
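        /* e.g. VMULL has no accfn; VMLAL adds the product into Vd. */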
2031    TCGv_i64 rd0, rd1, tmp;
2032    TCGv_i32 rn, rm;
2033
2034    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2035        return false;
2036    }
2037
2038    /* UNDEF accesses to D16-D31 if they don't exist. */
2039    if (!dc_isar_feature(aa32_simd_r32, s) &&
2040        ((a->vd | a->vn | a->vm) & 0x10)) {
2041        return false;
2042    }
2043
2044    if (!opfn) {
2045        /* size == 3 case, which is an entirely different insn group */
2046        return false;
2047    }
2048
2049    if (a->vd & 1) {
2050        return false;
2051    }
2052
2053    if (!vfp_access_check(s)) {
2054        return true;
2055    }
2056
2057    rd0 = tcg_temp_new_i64();
2058    rd1 = tcg_temp_new_i64();
2059
2060    rn = tcg_temp_new_i32();
2061    rm = tcg_temp_new_i32();
2062    read_neon_element32(rn, a->vn, 0, MO_32);
2063    read_neon_element32(rm, a->vm, 0, MO_32);
2064    opfn(rd0, rn, rm);
2065
2066    read_neon_element32(rn, a->vn, 1, MO_32);
2067    read_neon_element32(rm, a->vm, 1, MO_32);
2068    opfn(rd1, rn, rm);
2069    tcg_temp_free_i32(rn);
2070    tcg_temp_free_i32(rm);
2071
2072    /* Don't store results until after all loads: they might overlap */
2073    if (accfn) {
2074        tmp = tcg_temp_new_i64();
2075        read_neon_element64(tmp, a->vd, 0, MO_64);
2076        accfn(rd0, tmp, rd0);
2077        read_neon_element64(tmp, a->vd, 1, MO_64);
2078        accfn(rd1, tmp, rd1);
2079        tcg_temp_free_i64(tmp);
2080    }
2081
2082    write_neon_element64(rd0, a->vd, 0, MO_64);
2083    write_neon_element64(rd1, a->vd, 1, MO_64);
2084    tcg_temp_free_i64(rd0);
2085    tcg_temp_free_i64(rd1);
2086
2087    return true;
2088}
2089
2090static bool trans_VABDL_S_3d(DisasContext *s, arg_3diff *a)
2091{
2092    static NeonGenTwoOpWidenFn * const opfn[] = {
2093        gen_helper_neon_abdl_s16,
2094        gen_helper_neon_abdl_s32,
2095        gen_helper_neon_abdl_s64,
2096        NULL,
2097    };
2098
2099    return do_long_3d(s, a, opfn[a->size], NULL);
2100}
2101
2102static bool trans_VABDL_U_3d(DisasContext *s, arg_3diff *a)
2103{
2104    static NeonGenTwoOpWidenFn * const opfn[] = {
2105        gen_helper_neon_abdl_u16,
2106        gen_helper_neon_abdl_u32,
2107        gen_helper_neon_abdl_u64,
2108        NULL,
2109    };
2110
2111    return do_long_3d(s, a, opfn[a->size], NULL);
2112}
2113
2114static bool trans_VABAL_S_3d(DisasContext *s, arg_3diff *a)
2115{
2116    static NeonGenTwoOpWidenFn * const opfn[] = {
2117        gen_helper_neon_abdl_s16,
2118        gen_helper_neon_abdl_s32,
2119        gen_helper_neon_abdl_s64,
2120        NULL,
2121    };
2122    static NeonGenTwo64OpFn * const addfn[] = {
2123        gen_helper_neon_addl_u16,
2124        gen_helper_neon_addl_u32,
2125        tcg_gen_add_i64,
2126        NULL,
2127    };
2128
2129    return do_long_3d(s, a, opfn[a->size], addfn[a->size]);
2130}
2131
2132static bool trans_VABAL_U_3d(DisasContext *s, arg_3diff *a)
2133{
2134    static NeonGenTwoOpWidenFn * const opfn[] = {
2135        gen_helper_neon_abdl_u16,
2136        gen_helper_neon_abdl_u32,
2137        gen_helper_neon_abdl_u64,
2138        NULL,
2139    };
2140    static NeonGenTwo64OpFn * const addfn[] = {
2141        gen_helper_neon_addl_u16,
2142        gen_helper_neon_addl_u32,
2143        tcg_gen_add_i64,
2144        NULL,
2145    };
2146
2147    return do_long_3d(s, a, opfn[a->size], addfn[a->size]);
2148}
2149
2150static void gen_mull_s32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2151{
2152    TCGv_i32 lo = tcg_temp_new_i32();
2153    TCGv_i32 hi = tcg_temp_new_i32();
2154
2155    tcg_gen_muls2_i32(lo, hi, rn, rm);
2156    tcg_gen_concat_i32_i64(rd, lo, hi);
2157
2158    tcg_temp_free_i32(lo);
2159    tcg_temp_free_i32(hi);
2160}
2161
2162static void gen_mull_u32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2163{
2164    TCGv_i32 lo = tcg_temp_new_i32();
2165    TCGv_i32 hi = tcg_temp_new_i32();
2166
2167    tcg_gen_mulu2_i32(lo, hi, rn, rm);
2168    tcg_gen_concat_i32_i64(rd, lo, hi);
2169
2170    tcg_temp_free_i32(lo);
2171    tcg_temp_free_i32(hi);
2172}
2173
2174static bool trans_VMULL_S_3d(DisasContext *s, arg_3diff *a)
2175{
2176    static NeonGenTwoOpWidenFn * const opfn[] = {
2177        gen_helper_neon_mull_s8,
2178        gen_helper_neon_mull_s16,
2179        gen_mull_s32,
2180        NULL,
2181    };
2182
2183    return do_long_3d(s, a, opfn[a->size], NULL);
2184}
2185
2186static bool trans_VMULL_U_3d(DisasContext *s, arg_3diff *a)
2187{
2188    static NeonGenTwoOpWidenFn * const opfn[] = {
2189        gen_helper_neon_mull_u8,
2190        gen_helper_neon_mull_u16,
2191        gen_mull_u32,
2192        NULL,
2193    };
2194
2195    return do_long_3d(s, a, opfn[a->size], NULL);
2196}
2197
2198#define DO_VMLAL(INSN,MULL,ACC)                                         \
2199    static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a)        \
2200    {                                                                   \
2201        static NeonGenTwoOpWidenFn * const opfn[] = {                   \
2202            gen_helper_neon_##MULL##8,                                  \
2203            gen_helper_neon_##MULL##16,                                 \
2204            gen_##MULL##32,                                             \
2205            NULL,                                                       \
2206        };                                                              \
2207        static NeonGenTwo64OpFn * const accfn[] = {                     \
2208            gen_helper_neon_##ACC##l_u16,                               \
2209            gen_helper_neon_##ACC##l_u32,                               \
2210            tcg_gen_##ACC##_i64,                                        \
2211            NULL,                                                       \
2212        };                                                              \
2213        return do_long_3d(s, a, opfn[a->size], accfn[a->size]);         \
2214    }
2215
2216DO_VMLAL(VMLAL_S,mull_s,add)
2217DO_VMLAL(VMLAL_U,mull_u,add)
2218DO_VMLAL(VMLSL_S,mull_s,sub)
2219DO_VMLAL(VMLSL_U,mull_u,sub)
2220
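    /*
     * The doubling in VQDMULL is a saturating add of the product to
     * itself, so e.g. 0x8000 * 0x8000 saturates and sets QC.
     */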
2221static void gen_VQDMULL_16(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2222{
2223    gen_helper_neon_mull_s16(rd, rn, rm);
2224    gen_helper_neon_addl_saturate_s32(rd, cpu_env, rd, rd);
2225}
2226
2227static void gen_VQDMULL_32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2228{
2229    gen_mull_s32(rd, rn, rm);
2230    gen_helper_neon_addl_saturate_s64(rd, cpu_env, rd, rd);
2231}
2232
2233static bool trans_VQDMULL_3d(DisasContext *s, arg_3diff *a)
2234{
2235    static NeonGenTwoOpWidenFn * const opfn[] = {
2236        NULL,
2237        gen_VQDMULL_16,
2238        gen_VQDMULL_32,
2239        NULL,
2240    };
2241
2242    return do_long_3d(s, a, opfn[a->size], NULL);
2243}
2244
2245static void gen_VQDMLAL_acc_16(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2246{
2247    gen_helper_neon_addl_saturate_s32(rd, cpu_env, rn, rm);
2248}
2249
2250static void gen_VQDMLAL_acc_32(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2251{
2252    gen_helper_neon_addl_saturate_s64(rd, cpu_env, rn, rm);
2253}
2254
2255static bool trans_VQDMLAL_3d(DisasContext *s, arg_3diff *a)
2256{
2257    static NeonGenTwoOpWidenFn * const opfn[] = {
2258        NULL,
2259        gen_VQDMULL_16,
2260        gen_VQDMULL_32,
2261        NULL,
2262    };
2263    static NeonGenTwo64OpFn * const accfn[] = {
2264        NULL,
2265        gen_VQDMLAL_acc_16,
2266        gen_VQDMLAL_acc_32,
2267        NULL,
2268    };
2269
2270    return do_long_3d(s, a, opfn[a->size], accfn[a->size]);
2271}
2272
2273static void gen_VQDMLSL_acc_16(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2274{
2275    gen_helper_neon_negl_u32(rm, rm);
2276    gen_helper_neon_addl_saturate_s32(rd, cpu_env, rn, rm);
2277}
2278
2279static void gen_VQDMLSL_acc_32(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2280{
2281    tcg_gen_neg_i64(rm, rm);
2282    gen_helper_neon_addl_saturate_s64(rd, cpu_env, rn, rm);
2283}
2284
2285static bool trans_VQDMLSL_3d(DisasContext *s, arg_3diff *a)
2286{
2287    static NeonGenTwoOpWidenFn * const opfn[] = {
2288        NULL,
2289        gen_VQDMULL_16,
2290        gen_VQDMULL_32,
2291        NULL,
2292    };
2293    static NeonGenTwo64OpFn * const accfn[] = {
2294        NULL,
2295        gen_VQDMLSL_acc_16,
2296        gen_VQDMLSL_acc_32,
2297        NULL,
2298    };
2299
2300    return do_long_3d(s, a, opfn[a->size], accfn[a->size]);
2301}
2302
2303static bool trans_VMULL_P_3d(DisasContext *s, arg_3diff *a)
2304{
2305    gen_helper_gvec_3 *fn_gvec;
2306
2307    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2308        return false;
2309    }
2310
2311    /* UNDEF accesses to D16-D31 if they don't exist. */
2312    if (!dc_isar_feature(aa32_simd_r32, s) &&
2313        ((a->vd | a->vn | a->vm) & 0x10)) {
2314        return false;
2315    }
2316
2317    if (a->vd & 1) {
2318        return false;
2319    }
2320
2321    switch (a->size) {
2322    case 0:
2323        fn_gvec = gen_helper_neon_pmull_h;
2324        break;
2325    case 2:
2326        if (!dc_isar_feature(aa32_pmull, s)) {
2327            return false;
2328        }
2329        fn_gvec = gen_helper_gvec_pmull_q;
2330        break;
2331    default:
2332        return false;
2333    }
2334
2335    if (!vfp_access_check(s)) {
2336        return true;
2337    }
2338
2339    tcg_gen_gvec_3_ool(neon_full_reg_offset(a->vd),
2340                       neon_full_reg_offset(a->vn),
2341                       neon_full_reg_offset(a->vm),
2342                       16, 16, 0, fn_gvec);
2343    return true;
2344}
2345
2346static void gen_neon_dup_low16(TCGv_i32 var)
2347{
2348    TCGv_i32 tmp = tcg_temp_new_i32();
2349    tcg_gen_ext16u_i32(var, var);
2350    tcg_gen_shli_i32(tmp, var, 16);
2351    tcg_gen_or_i32(var, var, tmp);
2352    tcg_temp_free_i32(tmp);
2353}
2354
2355static void gen_neon_dup_high16(TCGv_i32 var)
2356{
2357    TCGv_i32 tmp = tcg_temp_new_i32();
2358    tcg_gen_andi_i32(var, var, 0xffff0000);
2359    tcg_gen_shri_i32(tmp, var, 16);
2360    tcg_gen_or_i32(var, var, tmp);
2361    tcg_temp_free_i32(tmp);
2362}
2363
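    /*
     * 'reg' here is the 5-bit M:Vm field. For MO_16 the chosen 16-bit
     * scalar is duplicated into both halves of the returned value; for
     * MO_32 a whole 32-bit element is returned.
     */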
2364static inline TCGv_i32 neon_get_scalar(int size, int reg)
2365{
2366    TCGv_i32 tmp = tcg_temp_new_i32();
2367    if (size == MO_16) {
2368        read_neon_element32(tmp, reg & 7, reg >> 4, MO_32);
2369        if (reg & 8) {
2370            gen_neon_dup_high16(tmp);
2371        } else {
2372            gen_neon_dup_low16(tmp);
2373        }
2374    } else {
2375        read_neon_element32(tmp, reg & 15, reg >> 4, MO_32);
2376    }
2377    return tmp;
2378}
2379
2380static bool do_2scalar(DisasContext *s, arg_2scalar *a,
2381                       NeonGenTwoOpFn *opfn, NeonGenTwoOpFn *accfn)
2382{
2383    /*
2384     * Two registers and a scalar: perform an operation between
2385     * the input elements and the scalar, and then possibly
2386     * perform an accumulation operation of that result into the
2387     * destination.
2388     */
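        /* e.g. VMLA by scalar: opfn is the multiply, accfn the add to Vd. */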
2389    TCGv_i32 scalar, tmp;
2390    int pass;
2391
2392    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2393        return false;
2394    }
2395
2396    /* UNDEF accesses to D16-D31 if they don't exist. */
2397    if (!dc_isar_feature(aa32_simd_r32, s) &&
2398        ((a->vd | a->vn | a->vm) & 0x10)) {
2399        return false;
2400    }
2401
2402    if (!opfn) {
2403        /* Bad size (including size == 3, which is a different insn group) */
2404        return false;
2405    }
2406
2407    if (a->q && ((a->vd | a->vn) & 1)) {
2408        return false;
2409    }
2410
2411    if (!vfp_access_check(s)) {
2412        return true;
2413    }
2414
2415    scalar = neon_get_scalar(a->size, a->vm);
2416    tmp = tcg_temp_new_i32();
2417
2418    for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
2419        read_neon_element32(tmp, a->vn, pass, MO_32);
2420        opfn(tmp, tmp, scalar);
2421        if (accfn) {
2422            TCGv_i32 rd = tcg_temp_new_i32();
2423            read_neon_element32(rd, a->vd, pass, MO_32);
2424            accfn(tmp, rd, tmp);
2425            tcg_temp_free_i32(rd);
2426        }
2427        write_neon_element32(tmp, a->vd, pass, MO_32);
2428    }
2429    tcg_temp_free_i32(tmp);
2430    tcg_temp_free_i32(scalar);
2431    return true;
2432}
2433
2434static bool trans_VMUL_2sc(DisasContext *s, arg_2scalar *a)
2435{
2436    static NeonGenTwoOpFn * const opfn[] = {
2437        NULL,
2438        gen_helper_neon_mul_u16,
2439        tcg_gen_mul_i32,
2440        NULL,
2441    };
2442
2443    return do_2scalar(s, a, opfn[a->size], NULL);
2444}
2445
2446static bool trans_VMLA_2sc(DisasContext *s, arg_2scalar *a)
2447{
2448    static NeonGenTwoOpFn * const opfn[] = {
2449        NULL,
2450        gen_helper_neon_mul_u16,
2451        tcg_gen_mul_i32,
2452        NULL,
2453    };
2454    static NeonGenTwoOpFn * const accfn[] = {
2455        NULL,
2456        gen_helper_neon_add_u16,
2457        tcg_gen_add_i32,
2458        NULL,
2459    };
2460
2461    return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2462}
2463
2464static bool trans_VMLS_2sc(DisasContext *s, arg_2scalar *a)
2465{
2466    static NeonGenTwoOpFn * const opfn[] = {
2467        NULL,
2468        gen_helper_neon_mul_u16,
2469        tcg_gen_mul_i32,
2470        NULL,
2471    };
2472    static NeonGenTwoOpFn * const accfn[] = {
2473        NULL,
2474        gen_helper_neon_sub_u16,
2475        tcg_gen_sub_i32,
2476        NULL,
2477    };
2478
2479    return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2480}
2481
2482static bool do_2scalar_fp_vec(DisasContext *s, arg_2scalar *a,
2483                              gen_helper_gvec_3_ptr *fn)
2484{
2485    /* Two registers and a scalar, using gvec */
2486    int vec_size = a->q ? 16 : 8;
2487    int rd_ofs = neon_full_reg_offset(a->vd);
2488    int rn_ofs = neon_full_reg_offset(a->vn);
2489    int rm_ofs;
2490    int idx;
2491    TCGv_ptr fpstatus;
2492
2493    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2494        return false;
2495    }
2496
2497    /* UNDEF accesses to D16-D31 if they don't exist. */
2498    if (!dc_isar_feature(aa32_simd_r32, s) &&
2499        ((a->vd | a->vn | a->vm) & 0x10)) {
2500        return false;
2501    }
2502
2503    if (!fn) {
2504        /* Bad size (including size == 3, which is a different insn group) */
2505        return false;
2506    }
2507
2508    if (a->q && ((a->vd | a->vn) & 1)) {
2509        return false;
2510    }
2511
2512    if (!vfp_access_check(s)) {
2513        return true;
2514    }
2515
2516    /* a->vm is M:Vm, which encodes both register and index */
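        /*
         * e.g. for MO_16 the index is 2 bits (M:Vm[3]) and the register
         * 3 bits; for MO_32 the index is just M and the register 4 bits.
         */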
2517    idx = extract32(a->vm, a->size + 2, 2);
2518    a->vm = extract32(a->vm, 0, a->size + 2);
2519    rm_ofs = neon_full_reg_offset(a->vm);
2520
2521    fpstatus = fpstatus_ptr(a->size == 1 ? FPST_STD_F16 : FPST_STD);
2522    tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, fpstatus,
2523                       vec_size, vec_size, idx, fn);
2524    tcg_temp_free_ptr(fpstatus);
2525    return true;
2526}
2527
2528#define DO_VMUL_F_2sc(NAME, FUNC)                                       \
2529    static bool trans_##NAME##_F_2sc(DisasContext *s, arg_2scalar *a)   \
2530    {                                                                   \
2531        static gen_helper_gvec_3_ptr * const opfn[] = {                 \
2532            NULL,                                                       \
2533            gen_helper_##FUNC##_h,                                      \
2534            gen_helper_##FUNC##_s,                                      \
2535            NULL,                                                       \
2536        };                                                              \
2537        if (a->size == MO_16 && !dc_isar_feature(aa32_fp16_arith, s)) { \
2538            return false;                                               \
2539        }                                                               \
2540        return do_2scalar_fp_vec(s, a, opfn[a->size]);                  \
2541    }
2542
2543DO_VMUL_F_2sc(VMUL, gvec_fmul_idx)
2544DO_VMUL_F_2sc(VMLA, gvec_fmla_nf_idx)
2545DO_VMUL_F_2sc(VMLS, gvec_fmls_nf_idx)
2546
2547WRAP_ENV_FN(gen_VQDMULH_16, gen_helper_neon_qdmulh_s16)
2548WRAP_ENV_FN(gen_VQDMULH_32, gen_helper_neon_qdmulh_s32)
2549WRAP_ENV_FN(gen_VQRDMULH_16, gen_helper_neon_qrdmulh_s16)
2550WRAP_ENV_FN(gen_VQRDMULH_32, gen_helper_neon_qrdmulh_s32)
2551
2552static bool trans_VQDMULH_2sc(DisasContext *s, arg_2scalar *a)
2553{
2554    static NeonGenTwoOpFn * const opfn[] = {
2555        NULL,
2556        gen_VQDMULH_16,
2557        gen_VQDMULH_32,
2558        NULL,
2559    };
2560
2561    return do_2scalar(s, a, opfn[a->size], NULL);
2562}
2563
2564static bool trans_VQRDMULH_2sc(DisasContext *s, arg_2scalar *a)
2565{
2566    static NeonGenTwoOpFn * const opfn[] = {
2567        NULL,
2568        gen_VQRDMULH_16,
2569        gen_VQRDMULH_32,
2570        NULL,
2571    };
2572
2573    return do_2scalar(s, a, opfn[a->size], NULL);
2574}
2575
2576static bool do_vqrdmlah_2sc(DisasContext *s, arg_2scalar *a,
2577                            NeonGenThreeOpEnvFn *opfn)
2578{
2579    /*
2580     * VQRDMLAH/VQRDMLSH: this is like do_2scalar, but the opfn
2581     * performs a kind of fused op-then-accumulate using a helper
2582     * function that takes all of rd, rn and the scalar at once.
2583     */
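        /* e.g. neon_qrdmlah_s16 is invoked as opfn(rd, env, rn, scalar, rd). */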
2584    TCGv_i32 scalar, rn, rd;
2585    int pass;
2586
2587    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2588        return false;
2589    }
2590
2591    if (!dc_isar_feature(aa32_rdm, s)) {
2592        return false;
2593    }
2594
2595    /* UNDEF accesses to D16-D31 if they don't exist. */
2596    if (!dc_isar_feature(aa32_simd_r32, s) &&
2597        ((a->vd | a->vn | a->vm) & 0x10)) {
2598        return false;
2599    }
2600
2601    if (!opfn) {
2602        /* Bad size (including size == 3, which is a different insn group) */
2603        return false;
2604    }
2605
2606    if (a->q && ((a->vd | a->vn) & 1)) {
2607        return false;
2608    }
2609
2610    if (!vfp_access_check(s)) {
2611        return true;
2612    }
2613
2614    scalar = neon_get_scalar(a->size, a->vm);
2615    rn = tcg_temp_new_i32();
2616    rd = tcg_temp_new_i32();
2617
2618    for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
2619        read_neon_element32(rn, a->vn, pass, MO_32);
2620        read_neon_element32(rd, a->vd, pass, MO_32);
2621        opfn(rd, cpu_env, rn, scalar, rd);
2622        write_neon_element32(rd, a->vd, pass, MO_32);
2623    }
2624    tcg_temp_free_i32(rn);
2625    tcg_temp_free_i32(rd);
2626    tcg_temp_free_i32(scalar);
2627
2628    return true;
2629}
2630
2631static bool trans_VQRDMLAH_2sc(DisasContext *s, arg_2scalar *a)
2632{
2633    static NeonGenThreeOpEnvFn *opfn[] = {
2634        NULL,
2635        gen_helper_neon_qrdmlah_s16,
2636        gen_helper_neon_qrdmlah_s32,
2637        NULL,
2638    };
2639    return do_vqrdmlah_2sc(s, a, opfn[a->size]);
2640}
2641
2642static bool trans_VQRDMLSH_2sc(DisasContext *s, arg_2scalar *a)
2643{
2644    static NeonGenThreeOpEnvFn *opfn[] = {
2645        NULL,
2646        gen_helper_neon_qrdmlsh_s16,
2647        gen_helper_neon_qrdmlsh_s32,
2648        NULL,
2649    };
2650    return do_vqrdmlah_2sc(s, a, opfn[a->size]);
2651}
2652
2653static bool do_2scalar_long(DisasContext *s, arg_2scalar *a,
2654                            NeonGenTwoOpWidenFn *opfn,
2655                            NeonGenTwo64OpFn *accfn)
2656{
2657    /*
2658     * Two registers and a scalar, long operations: perform an
2659     * operation on the input elements and the scalar which produces
2660     * a double-width result, and then possibly perform an accumulation
2661     * operation of that result into the destination.
2662     */
2663    TCGv_i32 scalar, rn;
2664    TCGv_i64 rn0_64, rn1_64;
2665
2666    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2667        return false;
2668    }
2669
2670    /* UNDEF accesses to D16-D31 if they don't exist. */
2671    if (!dc_isar_feature(aa32_simd_r32, s) &&
2672        ((a->vd | a->vn | a->vm) & 0x10)) {
2673        return false;
2674    }
2675
2676    if (!opfn) {
2677        /* Bad size (including size == 3, which is a different insn group) */
2678        return false;
2679    }
2680
2681    if (a->vd & 1) {
2682        return false;
2683    }
2684
2685    if (!vfp_access_check(s)) {
2686        return true;
2687    }
2688
2689    scalar = neon_get_scalar(a->size, a->vm);
2690
2691    /* Load all inputs before writing any outputs, in case of overlap */
2692    rn = tcg_temp_new_i32();
2693    read_neon_element32(rn, a->vn, 0, MO_32);
2694    rn0_64 = tcg_temp_new_i64();
2695    opfn(rn0_64, rn, scalar);
2696
2697    read_neon_element32(rn, a->vn, 1, MO_32);
2698    rn1_64 = tcg_temp_new_i64();
2699    opfn(rn1_64, rn, scalar);
2700    tcg_temp_free_i32(rn);
2701    tcg_temp_free_i32(scalar);
2702
2703    if (accfn) {
2704        TCGv_i64 t64 = tcg_temp_new_i64();
2705        read_neon_element64(t64, a->vd, 0, MO_64);
2706        accfn(rn0_64, t64, rn0_64);
2707        read_neon_element64(t64, a->vd, 1, MO_64);
2708        accfn(rn1_64, t64, rn1_64);
2709        tcg_temp_free_i64(t64);
2710    }
2711
2712    write_neon_element64(rn0_64, a->vd, 0, MO_64);
2713    write_neon_element64(rn1_64, a->vd, 1, MO_64);
2714    tcg_temp_free_i64(rn0_64);
2715    tcg_temp_free_i64(rn1_64);
2716    return true;
2717}
2718
2719static bool trans_VMULL_S_2sc(DisasContext *s, arg_2scalar *a)
2720{
2721    static NeonGenTwoOpWidenFn * const opfn[] = {
2722        NULL,
2723        gen_helper_neon_mull_s16,
2724        gen_mull_s32,
2725        NULL,
2726    };
2727
2728    return do_2scalar_long(s, a, opfn[a->size], NULL);
2729}
2730
2731static bool trans_VMULL_U_2sc(DisasContext *s, arg_2scalar *a)
2732{
2733    static NeonGenTwoOpWidenFn * const opfn[] = {
2734        NULL,
2735        gen_helper_neon_mull_u16,
2736        gen_mull_u32,
2737        NULL,
2738    };
2739
2740    return do_2scalar_long(s, a, opfn[a->size], NULL);
2741}
2742
2743#define DO_VMLAL_2SC(INSN, MULL, ACC)                                   \
2744    static bool trans_##INSN##_2sc(DisasContext *s, arg_2scalar *a)     \
2745    {                                                                   \
2746        static NeonGenTwoOpWidenFn * const opfn[] = {                   \
2747            NULL,                                                       \
2748            gen_helper_neon_##MULL##16,                                 \
2749            gen_##MULL##32,                                             \
2750            NULL,                                                       \
2751        };                                                              \
2752        static NeonGenTwo64OpFn * const accfn[] = {                     \
2753            NULL,                                                       \
2754            gen_helper_neon_##ACC##l_u32,                               \
2755            tcg_gen_##ACC##_i64,                                        \
2756            NULL,                                                       \
2757        };                                                              \
2758        return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);    \
2759    }
2760
2761DO_VMLAL_2SC(VMLAL_S, mull_s, add)
2762DO_VMLAL_2SC(VMLAL_U, mull_u, add)
2763DO_VMLAL_2SC(VMLSL_S, mull_s, sub)
2764DO_VMLAL_2SC(VMLSL_U, mull_u, sub)
2765
2766static bool trans_VQDMULL_2sc(DisasContext *s, arg_2scalar *a)
2767{
2768    static NeonGenTwoOpWidenFn * const opfn[] = {
2769        NULL,
2770        gen_VQDMULL_16,
2771        gen_VQDMULL_32,
2772        NULL,
2773    };
2774
2775    return do_2scalar_long(s, a, opfn[a->size], NULL);
2776}
2777
2778static bool trans_VQDMLAL_2sc(DisasContext *s, arg_2scalar *a)
2779{
2780    static NeonGenTwoOpWidenFn * const opfn[] = {
2781        NULL,
2782        gen_VQDMULL_16,
2783        gen_VQDMULL_32,
2784        NULL,
2785    };
2786    static NeonGenTwo64OpFn * const accfn[] = {
2787        NULL,
2788        gen_VQDMLAL_acc_16,
2789        gen_VQDMLAL_acc_32,
2790        NULL,
2791    };
2792
2793    return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);
2794}
2795
2796static bool trans_VQDMLSL_2sc(DisasContext *s, arg_2scalar *a)
2797{
2798    static NeonGenTwoOpWidenFn * const opfn[] = {
2799        NULL,
2800        gen_VQDMULL_16,
2801        gen_VQDMULL_32,
2802        NULL,
2803    };
2804    static NeonGenTwo64OpFn * const accfn[] = {
2805        NULL,
2806        gen_VQDMLSL_acc_16,
2807        gen_VQDMLSL_acc_32,
2808        NULL,
2809    };
2810
2811    return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);
2812}
2813
2814static bool trans_VEXT(DisasContext *s, arg_VEXT *a)
2815{
2816    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2817        return false;
2818    }
2819
2820    /* UNDEF accesses to D16-D31 if they don't exist. */
2821    if (!dc_isar_feature(aa32_simd_r32, s) &&
2822        ((a->vd | a->vn | a->vm) & 0x10)) {
2823        return false;
2824    }
2825
2826    if ((a->vn | a->vm | a->vd) & a->q) {
2827        return false;
2828    }
2829
2830    if (a->imm > 7 && !a->q) {
2831        return false;
2832    }
2833
2834    if (!vfp_access_check(s)) {
2835        return true;
2836    }
2837
2838    if (!a->q) {
2839        /* Extract 64 bits from <Vm:Vn> */
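            /* e.g. imm == 3: dest is bytes 3..7 of Vn then bytes 0..2 of Vm. */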
2840        TCGv_i64 left, right, dest;
2841
2842        left = tcg_temp_new_i64();
2843        right = tcg_temp_new_i64();
2844        dest = tcg_temp_new_i64();
2845
2846        read_neon_element64(right, a->vn, 0, MO_64);
2847        read_neon_element64(left, a->vm, 0, MO_64);
2848        tcg_gen_extract2_i64(dest, right, left, a->imm * 8);
2849        write_neon_element64(dest, a->vd, 0, MO_64);
2850
2851        tcg_temp_free_i64(left);
2852        tcg_temp_free_i64(right);
2853        tcg_temp_free_i64(dest);
2854    } else {
2855        /* Extract 128 bits from <Vm+1:Vm:Vn+1:Vn> */
2856        TCGv_i64 left, middle, right, destleft, destright;
2857
2858        left = tcg_temp_new_i64();
2859        middle = tcg_temp_new_i64();
2860        right = tcg_temp_new_i64();
2861        destleft = tcg_temp_new_i64();
2862        destright = tcg_temp_new_i64();
2863
2864        if (a->imm < 8) {
2865            read_neon_element64(right, a->vn, 0, MO_64);
2866            read_neon_element64(middle, a->vn, 1, MO_64);
2867            tcg_gen_extract2_i64(destright, right, middle, a->imm * 8);
2868            read_neon_element64(left, a->vm, 0, MO_64);
2869            tcg_gen_extract2_i64(destleft, middle, left, a->imm * 8);
2870        } else {
2871            read_neon_element64(right, a->vn, 1, MO_64);
2872            read_neon_element64(middle, a->vm, 0, MO_64);
2873            tcg_gen_extract2_i64(destright, right, middle, (a->imm - 8) * 8);
2874            read_neon_element64(left, a->vm, 1, MO_64);
2875            tcg_gen_extract2_i64(destleft, middle, left, (a->imm - 8) * 8);
2876        }
2877
2878        write_neon_element64(destright, a->vd, 0, MO_64);
2879        write_neon_element64(destleft, a->vd, 1, MO_64);
2880
2881        tcg_temp_free_i64(destright);
2882        tcg_temp_free_i64(destleft);
2883        tcg_temp_free_i64(right);
2884        tcg_temp_free_i64(middle);
2885        tcg_temp_free_i64(left);
2886    }
2887    return true;
2888}
2889
2890static bool trans_VTBL(DisasContext *s, arg_VTBL *a)
2891{
2892    TCGv_i64 val, def;
2893    TCGv_i32 desc;
2894
2895    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2896        return false;
2897    }
2898
2899    /* UNDEF accesses to D16-D31 if they don't exist. */
2900    if (!dc_isar_feature(aa32_simd_r32, s) &&
2901        ((a->vd | a->vn | a->vm) & 0x10)) {
2902        return false;
2903    }
2904
2905    if ((a->vn + a->len + 1) > 32) {
2906        /*
2907         * This is UNPREDICTABLE; we choose to UNDEF to avoid the
2908         * helper function running off the end of the register file.
2909         */
2910        return false;
2911    }
2912
2913    if (!vfp_access_check(s)) {
2914        return true;
2915    }
2916
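        /*
         * desc packs the first table register number and len (the
         * number of table registers minus 1) for the helper to decode.
         */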
2917    desc = tcg_const_i32((a->vn << 2) | a->len);
2918    def = tcg_temp_new_i64();
2919    if (a->op) {
2920        read_neon_element64(def, a->vd, 0, MO_64);
2921    } else {
2922        tcg_gen_movi_i64(def, 0);
2923    }
2924    val = tcg_temp_new_i64();
2925    read_neon_element64(val, a->vm, 0, MO_64);
2926
2927    gen_helper_neon_tbl(val, cpu_env, desc, val, def);
2928    write_neon_element64(val, a->vd, 0, MO_64);
2929
2930    tcg_temp_free_i64(def);
2931    tcg_temp_free_i64(val);
2932    tcg_temp_free_i32(desc);
2933    return true;
2934}
2935
2936static bool trans_VDUP_scalar(DisasContext *s, arg_VDUP_scalar *a)
2937{
2938    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2939        return false;
2940    }
2941
2942    /* UNDEF accesses to D16-D31 if they don't exist. */
2943    if (!dc_isar_feature(aa32_simd_r32, s) &&
2944        ((a->vd | a->vm) & 0x10)) {
2945        return false;
2946    }
2947
2948    if (a->vd & a->q) {
2949        return false;
2950    }
2951
2952    if (!vfp_access_check(s)) {
2953        return true;
2954    }
2955
2956    tcg_gen_gvec_dup_mem(a->size, neon_full_reg_offset(a->vd),
2957                         neon_element_offset(a->vm, a->index, a->size),
2958                         a->q ? 16 : 8, a->q ? 16 : 8);
2959    return true;
2960}
2961
2962static bool trans_VREV64(DisasContext *s, arg_VREV64 *a)
2963{
2964    int pass, half;
2965    TCGv_i32 tmp[2];
2966
2967    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2968        return false;
2969    }
2970
2971    /* UNDEF accesses to D16-D31 if they don't exist. */
2972    if (!dc_isar_feature(aa32_simd_r32, s) &&
2973        ((a->vd | a->vm) & 0x10)) {
2974        return false;
2975    }
2976
2977    if ((a->vd | a->vm) & a->q) {
2978        return false;
2979    }
2980
2981    if (a->size == 3) {
2982        return false;
2983    }
2984
2985    if (!vfp_access_check(s)) {
2986        return true;
2987    }
2988
2989    tmp[0] = tcg_temp_new_i32();
2990    tmp[1] = tcg_temp_new_i32();
2991
2992    for (pass = 0; pass < (a->q ? 2 : 1); pass++) {
2993        for (half = 0; half < 2; half++) {
2994            read_neon_element32(tmp[half], a->vm, pass * 2 + half, MO_32);
2995            switch (a->size) {
2996            case 0:
2997                tcg_gen_bswap32_i32(tmp[half], tmp[half]);
2998                break;
2999            case 1:
3000                gen_swap_half(tmp[half], tmp[half]);
3001                break;
3002            case 2:
3003                break;
3004            default:
3005                g_assert_not_reached();
3006            }
3007        }
3008        write_neon_element32(tmp[1], a->vd, pass * 2, MO_32);
3009        write_neon_element32(tmp[0], a->vd, pass * 2 + 1, MO_32);
3010    }
3011
3012    tcg_temp_free_i32(tmp[0]);
3013    tcg_temp_free_i32(tmp[1]);
3014    return true;
3015}
3016
3017static bool do_2misc_pairwise(DisasContext *s, arg_2misc *a,
3018                              NeonGenWidenFn *widenfn,
3019                              NeonGenTwo64OpFn *opfn,
3020                              NeonGenTwo64OpFn *accfn)
3021{
3022    /*
3023     * Pairwise long operations: widen both halves of the pair,
3024     * combine the pairs with the opfn, and then possibly accumulate
3025     * into the destination with the accfn.
3026     */
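        /* e.g. VPADDL.U8: each adjacent byte pair is widened and summed. */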
3027    int pass;
3028
3029    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3030        return false;
3031    }
3032
3033    /* UNDEF accesses to D16-D31 if they don't exist. */
3034    if (!dc_isar_feature(aa32_simd_r32, s) &&
3035        ((a->vd | a->vm) & 0x10)) {
3036        return false;
3037    }
3038
3039    if ((a->vd | a->vm) & a->q) {
3040        return false;
3041    }
3042
3043    if (!widenfn) {
3044        return false;
3045    }
3046
3047    if (!vfp_access_check(s)) {
3048        return true;
3049    }
3050
3051    for (pass = 0; pass < a->q + 1; pass++) {
3052        TCGv_i32 tmp;
3053        TCGv_i64 rm0_64, rm1_64, rd_64;
3054
3055        rm0_64 = tcg_temp_new_i64();
3056        rm1_64 = tcg_temp_new_i64();
3057        rd_64 = tcg_temp_new_i64();
3058
3059        tmp = tcg_temp_new_i32();
3060        read_neon_element32(tmp, a->vm, pass * 2, MO_32);
3061        widenfn(rm0_64, tmp);
3062        read_neon_element32(tmp, a->vm, pass * 2 + 1, MO_32);
3063        widenfn(rm1_64, tmp);
3064        tcg_temp_free_i32(tmp);
3065
3066        opfn(rd_64, rm0_64, rm1_64);
3067        tcg_temp_free_i64(rm0_64);
3068        tcg_temp_free_i64(rm1_64);
3069
3070        if (accfn) {
3071            TCGv_i64 tmp64 = tcg_temp_new_i64();
3072            read_neon_element64(tmp64, a->vd, pass, MO_64);
3073            accfn(rd_64, tmp64, rd_64);
3074            tcg_temp_free_i64(tmp64);
3075        }
3076        write_neon_element64(rd_64, a->vd, pass, MO_64);
3077        tcg_temp_free_i64(rd_64);
3078    }
3079    return true;
3080}
3081
3082static bool trans_VPADDL_S(DisasContext *s, arg_2misc *a)
3083{
3084    static NeonGenWidenFn * const widenfn[] = {
3085        gen_helper_neon_widen_s8,
3086        gen_helper_neon_widen_s16,
3087        tcg_gen_ext_i32_i64,
3088        NULL,
3089    };
3090    static NeonGenTwo64OpFn * const opfn[] = {
3091        gen_helper_neon_paddl_u16,
3092        gen_helper_neon_paddl_u32,
3093        tcg_gen_add_i64,
3094        NULL,
3095    };
3096
3097    return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL);
3098}
3099
3100static bool trans_VPADDL_U(DisasContext *s, arg_2misc *a)
3101{
3102    static NeonGenWidenFn * const widenfn[] = {
3103        gen_helper_neon_widen_u8,
3104        gen_helper_neon_widen_u16,
3105        tcg_gen_extu_i32_i64,
3106        NULL,
3107    };
3108    static NeonGenTwo64OpFn * const opfn[] = {
3109        gen_helper_neon_paddl_u16,
3110        gen_helper_neon_paddl_u32,
3111        tcg_gen_add_i64,
3112        NULL,
3113    };
3114
3115    return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL);
3116}
3117
3118static bool trans_VPADAL_S(DisasContext *s, arg_2misc *a)
3119{
3120    static NeonGenWidenFn * const widenfn[] = {
3121        gen_helper_neon_widen_s8,
3122        gen_helper_neon_widen_s16,
3123        tcg_gen_ext_i32_i64,
3124        NULL,
3125    };
3126    static NeonGenTwo64OpFn * const opfn[] = {
3127        gen_helper_neon_paddl_u16,
3128        gen_helper_neon_paddl_u32,
3129        tcg_gen_add_i64,
3130        NULL,
3131    };
3132    static NeonGenTwo64OpFn * const accfn[] = {
3133        gen_helper_neon_addl_u16,
3134        gen_helper_neon_addl_u32,
3135        tcg_gen_add_i64,
3136        NULL,
3137    };
3138
3139    return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size],
3140                             accfn[a->size]);
3141}
3142
3143static bool trans_VPADAL_U(DisasContext *s, arg_2misc *a)
3144{
3145    static NeonGenWidenFn * const widenfn[] = {
3146        gen_helper_neon_widen_u8,
3147        gen_helper_neon_widen_u16,
3148        tcg_gen_extu_i32_i64,
3149        NULL,
3150    };
3151    static NeonGenTwo64OpFn * const opfn[] = {
3152        gen_helper_neon_paddl_u16,
3153        gen_helper_neon_paddl_u32,
3154        tcg_gen_add_i64,
3155        NULL,
3156    };
3157    static NeonGenTwo64OpFn * const accfn[] = {
3158        gen_helper_neon_addl_u16,
3159        gen_helper_neon_addl_u32,
3160        tcg_gen_add_i64,
3161        NULL,
3162    };
3163
3164    return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size],
3165                             accfn[a->size]);
3166}
3167
3168typedef void ZipFn(TCGv_ptr, TCGv_ptr);
3169
3170static bool do_zip_uzp(DisasContext *s, arg_2misc *a,
3171                       ZipFn *fn)
3172{
3173    TCGv_ptr pd, pm;
3174
3175    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3176        return false;
3177    }
3178
3179    /* UNDEF accesses to D16-D31 if they don't exist. */
3180    if (!dc_isar_feature(aa32_simd_r32, s) &&
3181        ((a->vd | a->vm) & 0x10)) {
3182        return false;
3183    }
3184
3185    if ((a->vd | a->vm) & a->q) {
3186        return false;
3187    }
3188
3189    if (!fn) {
3190        /* Bad size or size/q combination */
3191        return false;
3192    }
3193
3194    if (!vfp_access_check(s)) {
3195        return true;
3196    }
3197
3198    pd = vfp_reg_ptr(true, a->vd);
3199    pm = vfp_reg_ptr(true, a->vm);
3200    fn(pd, pm);
3201    tcg_temp_free_ptr(pd);
3202    tcg_temp_free_ptr(pm);
3203    return true;
3204}
3205
3206static bool trans_VUZP(DisasContext *s, arg_2misc *a)
3207{
3208    static ZipFn * const fn[2][4] = {
3209        {
3210            gen_helper_neon_unzip8,
3211            gen_helper_neon_unzip16,
3212            NULL,
3213            NULL,
3214        }, {
3215            gen_helper_neon_qunzip8,
3216            gen_helper_neon_qunzip16,
3217            gen_helper_neon_qunzip32,
3218            NULL,
3219        }
3220    };
3221    return do_zip_uzp(s, a, fn[a->q][a->size]);
3222}
3223
3224static bool trans_VZIP(DisasContext *s, arg_2misc *a)
3225{
3226    static ZipFn * const fn[2][4] = {
3227        {
3228            gen_helper_neon_zip8,
3229            gen_helper_neon_zip16,
3230            NULL,
3231            NULL,
3232        }, {
3233            gen_helper_neon_qzip8,
3234            gen_helper_neon_qzip16,
3235            gen_helper_neon_qzip32,
3236            NULL,
3237        }
3238    };
3239    return do_zip_uzp(s, a, fn[a->q][a->size]);
3240}
3241
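    /*
     * Narrowing moves: read Qm as two 64-bit halves, narrow each half
     * to 32 bits with the supplied function and write the results to
     * the two halves of Dd.  The narrowing functions all take a cpu_env
     * argument so that the saturating VQMOVN forms can set FPSCR.QC.
     */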
3242static bool do_vmovn(DisasContext *s, arg_2misc *a,
3243                     NeonGenNarrowEnvFn *narrowfn)
3244{
3245    TCGv_i64 rm;
3246    TCGv_i32 rd0, rd1;
3247
3248    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3249        return false;
3250    }
3251
3252    /* UNDEF accesses to D16-D31 if they don't exist. */
3253    if (!dc_isar_feature(aa32_simd_r32, s) &&
3254        ((a->vd | a->vm) & 0x10)) {
3255        return false;
3256    }
3257
3258    if (a->vm & 1) {
3259        return false;
3260    }
3261
3262    if (!narrowfn) {
3263        return false;
3264    }
3265
3266    if (!vfp_access_check(s)) {
3267        return true;
3268    }
3269
3270    rm = tcg_temp_new_i64();
3271    rd0 = tcg_temp_new_i32();
3272    rd1 = tcg_temp_new_i32();
3273
3274    read_neon_element64(rm, a->vm, 0, MO_64);
3275    narrowfn(rd0, cpu_env, rm);
3276    read_neon_element64(rm, a->vm, 1, MO_64);
3277    narrowfn(rd1, cpu_env, rm);
3278    write_neon_element32(rd0, a->vd, 0, MO_32);
3279    write_neon_element32(rd1, a->vd, 1, MO_32);
3280    tcg_temp_free_i32(rd0);
3281    tcg_temp_free_i32(rd1);
3282    tcg_temp_free_i64(rm);
3283    return true;
3284}
3285
3286#define DO_VMOVN(INSN, FUNC)                                    \
3287    static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
3288    {                                                           \
3289        static NeonGenNarrowEnvFn * const narrowfn[] = {        \
3290            FUNC##8,                                            \
3291            FUNC##16,                                           \
3292            FUNC##32,                                           \
3293            NULL,                                               \
3294        };                                                      \
3295        return do_vmovn(s, a, narrowfn[a->size]);               \
3296    }
3297
3298DO_VMOVN(VMOVN, gen_neon_narrow_u)
3299DO_VMOVN(VQMOVUN, gen_helper_neon_unarrow_sat)
3300DO_VMOVN(VQMOVN_S, gen_helper_neon_narrow_sat_s)
3301DO_VMOVN(VQMOVN_U, gen_helper_neon_narrow_sat_u)
3302
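    /*
     * VSHLL (the 2-reg-misc encoding): widen each element of Dm and
     * shift it left by the element width, e.g. Vd.h[i] = Vm.b[i] << 8
     * for size 0.  Shifting the whole 64-bit value at once is safe
     * because the widened lanes all have zero top halves.
     */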
3303static bool trans_VSHLL(DisasContext *s, arg_2misc *a)
3304{
3305    TCGv_i32 rm0, rm1;
3306    TCGv_i64 rd;
3307    static NeonGenWidenFn * const widenfns[] = {
3308        gen_helper_neon_widen_u8,
3309        gen_helper_neon_widen_u16,
3310        tcg_gen_extu_i32_i64,
3311        NULL,
3312    };
3313    NeonGenWidenFn *widenfn = widenfns[a->size];
3314
3315    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3316        return false;
3317    }
3318
3319    /* UNDEF accesses to D16-D31 if they don't exist. */
3320    if (!dc_isar_feature(aa32_simd_r32, s) &&
3321        ((a->vd | a->vm) & 0x10)) {
3322        return false;
3323    }
3324
3325    if (a->vd & 1) {
3326        return false;
3327    }
3328
3329    if (!widenfn) {
3330        return false;
3331    }
3332
3333    if (!vfp_access_check(s)) {
3334        return true;
3335    }
3336
3337    rd = tcg_temp_new_i64();
3338    rm0 = tcg_temp_new_i32();
3339    rm1 = tcg_temp_new_i32();
3340
3341    read_neon_element32(rm0, a->vm, 0, MO_32);
3342    read_neon_element32(rm1, a->vm, 1, MO_32);
3343
3344    widenfn(rd, rm0);
3345    tcg_gen_shli_i64(rd, rd, 8 << a->size);
3346    write_neon_element64(rd, a->vd, 0, MO_64);
3347    widenfn(rd, rm1);
3348    tcg_gen_shli_i64(rd, rd, 8 << a->size);
3349    write_neon_element64(rd, a->vd, 1, MO_64);
3350
3351    tcg_temp_free_i64(rd);
3352    tcg_temp_free_i32(rm0);
3353    tcg_temp_free_i32(rm1);
3354    return true;
3355}
3356
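    /*
     * VCVT f32 -> bf16: each bfcvt_pair call converts a pair of f32
     * inputs (one 64-bit half of Qm) into two bf16 results packed
     * into a single 32-bit element of Dd.
     */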
3357static bool trans_VCVT_B16_F32(DisasContext *s, arg_2misc *a)
3358{
3359    TCGv_ptr fpst;
3360    TCGv_i64 tmp;
3361    TCGv_i32 dst0, dst1;
3362
3363    if (!dc_isar_feature(aa32_bf16, s)) {
3364        return false;
3365    }
3366
3367    /* UNDEF accesses to D16-D31 if they don't exist. */
3368    if (!dc_isar_feature(aa32_simd_r32, s) &&
3369        ((a->vd | a->vm) & 0x10)) {
3370        return false;
3371    }
3372
3373    if ((a->vm & 1) || (a->size != 1)) {
3374        return false;
3375    }
3376
3377    if (!vfp_access_check(s)) {
3378        return true;
3379    }
3380
3381    fpst = fpstatus_ptr(FPST_STD);
3382    tmp = tcg_temp_new_i64();
3383    dst0 = tcg_temp_new_i32();
3384    dst1 = tcg_temp_new_i32();
3385
3386    read_neon_element64(tmp, a->vm, 0, MO_64);
3387    gen_helper_bfcvt_pair(dst0, tmp, fpst);
3388
3389    read_neon_element64(tmp, a->vm, 1, MO_64);
3390    gen_helper_bfcvt_pair(dst1, tmp, fpst);
3391
3392    write_neon_element32(dst0, a->vd, 0, MO_32);
3393    write_neon_element32(dst1, a->vd, 1, MO_32);
3394
3395    tcg_temp_free_i64(tmp);
3396    tcg_temp_free_i32(dst0);
3397    tcg_temp_free_i32(dst1);
3398    tcg_temp_free_ptr(fpst);
3399    return true;
3400}
3401
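    /*
     * VCVT between f16 and f32 is done one lane at a time with the VFP
     * scalar conversion helpers, packing or unpacking two f16 values
     * per 32-bit element by hand; the FPSCR.AHP (alternative half
     * precision) flag is passed through to the helpers.
     */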
3402static bool trans_VCVT_F16_F32(DisasContext *s, arg_2misc *a)
3403{
3404    TCGv_ptr fpst;
3405    TCGv_i32 ahp, tmp, tmp2, tmp3;
3406
3407    if (!arm_dc_feature(s, ARM_FEATURE_NEON) ||
3408        !dc_isar_feature(aa32_fp16_spconv, s)) {
3409        return false;
3410    }
3411
3412    /* UNDEF accesses to D16-D31 if they don't exist. */
3413    if (!dc_isar_feature(aa32_simd_r32, s) &&
3414        ((a->vd | a->vm) & 0x10)) {
3415        return false;
3416    }
3417
3418    if ((a->vm & 1) || (a->size != 1)) {
3419        return false;
3420    }
3421
3422    if (!vfp_access_check(s)) {
3423        return true;
3424    }
3425
3426    fpst = fpstatus_ptr(FPST_STD);
3427    ahp = get_ahp_flag();
3428    tmp = tcg_temp_new_i32();
3429    read_neon_element32(tmp, a->vm, 0, MO_32);
3430    gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp);
3431    tmp2 = tcg_temp_new_i32();
3432    read_neon_element32(tmp2, a->vm, 1, MO_32);
3433    gen_helper_vfp_fcvt_f32_to_f16(tmp2, tmp2, fpst, ahp);
3434    tcg_gen_shli_i32(tmp2, tmp2, 16);
3435    tcg_gen_or_i32(tmp2, tmp2, tmp);
3436    read_neon_element32(tmp, a->vm, 2, MO_32);
3437    gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp);
3438    tmp3 = tcg_temp_new_i32();
3439    read_neon_element32(tmp3, a->vm, 3, MO_32);
3440    write_neon_element32(tmp2, a->vd, 0, MO_32);
3441    tcg_temp_free_i32(tmp2);
3442    gen_helper_vfp_fcvt_f32_to_f16(tmp3, tmp3, fpst, ahp);
3443    tcg_gen_shli_i32(tmp3, tmp3, 16);
3444    tcg_gen_or_i32(tmp3, tmp3, tmp);
3445    write_neon_element32(tmp3, a->vd, 1, MO_32);
3446    tcg_temp_free_i32(tmp3);
3447    tcg_temp_free_i32(tmp);
3448    tcg_temp_free_i32(ahp);
3449    tcg_temp_free_ptr(fpst);
3450
3451    return true;
3452}
3453
3454static bool trans_VCVT_F32_F16(DisasContext *s, arg_2misc *a)
3455{
3456    TCGv_ptr fpst;
3457    TCGv_i32 ahp, tmp, tmp2, tmp3;
3458
3459    if (!arm_dc_feature(s, ARM_FEATURE_NEON) ||
3460        !dc_isar_feature(aa32_fp16_spconv, s)) {
3461        return false;
3462    }
3463
3464    /* UNDEF accesses to D16-D31 if they don't exist. */
3465    if (!dc_isar_feature(aa32_simd_r32, s) &&
3466        ((a->vd | a->vm) & 0x10)) {
3467        return false;
3468    }
3469
3470    if ((a->vd & 1) || (a->size != 1)) {
3471        return false;
3472    }
3473
3474    if (!vfp_access_check(s)) {
3475        return true;
3476    }
3477
3478    fpst = fpstatus_ptr(FPST_STD);
3479    ahp = get_ahp_flag();
3480    tmp3 = tcg_temp_new_i32();
3481    tmp2 = tcg_temp_new_i32();
3482    tmp = tcg_temp_new_i32();
3483    read_neon_element32(tmp, a->vm, 0, MO_32);
3484    read_neon_element32(tmp2, a->vm, 1, MO_32);
3485    tcg_gen_ext16u_i32(tmp3, tmp);
3486    gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp);
3487    write_neon_element32(tmp3, a->vd, 0, MO_32);
3488    tcg_gen_shri_i32(tmp, tmp, 16);
3489    gen_helper_vfp_fcvt_f16_to_f32(tmp, tmp, fpst, ahp);
3490    write_neon_element32(tmp, a->vd, 1, MO_32);
3491    tcg_temp_free_i32(tmp);
3492    tcg_gen_ext16u_i32(tmp3, tmp2);
3493    gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp);
3494    write_neon_element32(tmp3, a->vd, 2, MO_32);
3495    tcg_temp_free_i32(tmp3);
3496    tcg_gen_shri_i32(tmp2, tmp2, 16);
3497    gen_helper_vfp_fcvt_f16_to_f32(tmp2, tmp2, fpst, ahp);
3498    write_neon_element32(tmp2, a->vd, 3, MO_32);
3499    tcg_temp_free_i32(tmp2);
3500    tcg_temp_free_i32(ahp);
3501    tcg_temp_free_ptr(fpst);
3502
3503    return true;
3504}
3505
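    /*
     * Checks common to all the 2-reg-misc operations that are expanded
     * with a single gvec call: size 3 is never a valid encoding here,
     * and any further per-insn size restrictions are applied by the
     * trans functions before they call this.
     */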
3506static bool do_2misc_vec(DisasContext *s, arg_2misc *a, GVecGen2Fn *fn)
3507{
3508    int vec_size = a->q ? 16 : 8;
3509    int rd_ofs = neon_full_reg_offset(a->vd);
3510    int rm_ofs = neon_full_reg_offset(a->vm);
3511
3512    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3513        return false;
3514    }
3515
3516    /* UNDEF accesses to D16-D31 if they don't exist. */
3517    if (!dc_isar_feature(aa32_simd_r32, s) &&
3518        ((a->vd | a->vm) & 0x10)) {
3519        return false;
3520    }
3521
3522    if (a->size == 3) {
3523        return false;
3524    }
3525
3526    if ((a->vd | a->vm) & a->q) {
3527        return false;
3528    }
3529
3530    if (!vfp_access_check(s)) {
3531        return true;
3532    }
3533
3534    fn(a->size, rd_ofs, rm_ofs, vec_size, vec_size);
3535
3536    return true;
3537}
3538
3539#define DO_2MISC_VEC(INSN, FN)                                  \
3540    static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
3541    {                                                           \
3542        return do_2misc_vec(s, a, FN);                          \
3543    }
3544
3545DO_2MISC_VEC(VNEG, tcg_gen_gvec_neg)
3546DO_2MISC_VEC(VABS, tcg_gen_gvec_abs)
3547DO_2MISC_VEC(VCEQ0, gen_gvec_ceq0)
3548DO_2MISC_VEC(VCGT0, gen_gvec_cgt0)
3549DO_2MISC_VEC(VCLE0, gen_gvec_cle0)
3550DO_2MISC_VEC(VCGE0, gen_gvec_cge0)
3551DO_2MISC_VEC(VCLT0, gen_gvec_clt0)
3552
3553static bool trans_VMVN(DisasContext *s, arg_2misc *a)
3554{
3555    if (a->size != 0) {
3556        return false;
3557    }
3558    return do_2misc_vec(s, a, tcg_gen_gvec_not);
3559}
3560
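    /*
     * The crypto insns share the 2-reg-misc encodings but are
     * implemented as out-of-line helpers; these wrappers adapt them to
     * the GVecGen2Fn signature used by do_2misc_vec().  The "3_ool"
     * form passes rd as both the destination and the first source, and
     * the DATA argument selects between the two operations handled by
     * a shared helper (e.g. AESE vs AESD).
     */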
3561#define WRAP_2M_3_OOL_FN(WRAPNAME, FUNC, DATA)                          \
3562    static void WRAPNAME(unsigned vece, uint32_t rd_ofs,                \
3563                         uint32_t rm_ofs, uint32_t oprsz,               \
3564                         uint32_t maxsz)                                \
3565    {                                                                   \
3566        tcg_gen_gvec_3_ool(rd_ofs, rd_ofs, rm_ofs, oprsz, maxsz,        \
3567                           DATA, FUNC);                                 \
3568    }
3569
3570#define WRAP_2M_2_OOL_FN(WRAPNAME, FUNC, DATA)                          \
3571    static void WRAPNAME(unsigned vece, uint32_t rd_ofs,                \
3572                         uint32_t rm_ofs, uint32_t oprsz,               \
3573                         uint32_t maxsz)                                \
3574    {                                                                   \
3575        tcg_gen_gvec_2_ool(rd_ofs, rm_ofs, oprsz, maxsz, DATA, FUNC);   \
3576    }
3577
3578WRAP_2M_3_OOL_FN(gen_AESE, gen_helper_crypto_aese, 0)
3579WRAP_2M_3_OOL_FN(gen_AESD, gen_helper_crypto_aese, 1)
3580WRAP_2M_2_OOL_FN(gen_AESMC, gen_helper_crypto_aesmc, 0)
3581WRAP_2M_2_OOL_FN(gen_AESIMC, gen_helper_crypto_aesmc, 1)
3582WRAP_2M_2_OOL_FN(gen_SHA1H, gen_helper_crypto_sha1h, 0)
3583WRAP_2M_2_OOL_FN(gen_SHA1SU1, gen_helper_crypto_sha1su1, 0)
3584WRAP_2M_2_OOL_FN(gen_SHA256SU0, gen_helper_crypto_sha256su0, 0)
3585
3586#define DO_2M_CRYPTO(INSN, FEATURE, SIZE)                       \
3587    static bool trans_##INSN(DisasContext *s, arg_2misc *a)     \
3588    {                                                           \
3589        if (!dc_isar_feature(FEATURE, s) || a->size != SIZE) {  \
3590            return false;                                       \
3591        }                                                       \
3592        return do_2misc_vec(s, a, gen_##INSN);                  \
3593    }
3594
3595DO_2M_CRYPTO(AESE, aa32_aes, 0)
3596DO_2M_CRYPTO(AESD, aa32_aes, 0)
3597DO_2M_CRYPTO(AESMC, aa32_aes, 0)
3598DO_2M_CRYPTO(AESIMC, aa32_aes, 0)
3599DO_2M_CRYPTO(SHA1H, aa32_sha1, 2)
3600DO_2M_CRYPTO(SHA1SU1, aa32_sha1, 2)
3601DO_2M_CRYPTO(SHA256SU0, aa32_sha2, 2)
3602
3603static bool do_2misc(DisasContext *s, arg_2misc *a, NeonGenOneOpFn *fn)
3604{
3605    TCGv_i32 tmp;
3606    int pass;
3607
3608    /* Handle a 2-reg-misc operation by iterating 32 bits at a time */
3609    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3610        return false;
3611    }
3612
3613    /* UNDEF accesses to D16-D31 if they don't exist. */
3614    if (!dc_isar_feature(aa32_simd_r32, s) &&
3615        ((a->vd | a->vm) & 0x10)) {
3616        return false;
3617    }
3618
3619    if (!fn) {
3620        return false;
3621    }
3622
3623    if ((a->vd | a->vm) & a->q) {
3624        return false;
3625    }
3626
3627    if (!vfp_access_check(s)) {
3628        return true;
3629    }
3630
3631    tmp = tcg_temp_new_i32();
3632    for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
3633        read_neon_element32(tmp, a->vm, pass, MO_32);
3634        fn(tmp, tmp);
3635        write_neon_element32(tmp, a->vd, pass, MO_32);
3636    }
3637    tcg_temp_free_i32(tmp);
3638
3639    return true;
3640}
3641
3642static bool trans_VREV32(DisasContext *s, arg_2misc *a)
3643{
3644    static NeonGenOneOpFn * const fn[] = {
3645        tcg_gen_bswap32_i32,
3646        gen_swap_half,
3647        NULL,
3648        NULL,
3649    };
3650    return do_2misc(s, a, fn[a->size]);
3651}
3652
3653static bool trans_VREV16(DisasContext *s, arg_2misc *a)
3654{
3655    if (a->size != 0) {
3656        return false;
3657    }
3658    return do_2misc(s, a, gen_rev16);
3659}
3660
3661static bool trans_VCLS(DisasContext *s, arg_2misc *a)
3662{
3663    static NeonGenOneOpFn * const fn[] = {
3664        gen_helper_neon_cls_s8,
3665        gen_helper_neon_cls_s16,
3666        gen_helper_neon_cls_s32,
3667        NULL,
3668    };
3669    return do_2misc(s, a, fn[a->size]);
3670}
3671
3672static void do_VCLZ_32(TCGv_i32 rd, TCGv_i32 rm)
3673{
3674    tcg_gen_clzi_i32(rd, rm, 32);
3675}
3676
3677static bool trans_VCLZ(DisasContext *s, arg_2misc *a)
3678{
3679    static NeonGenOneOpFn * const fn[] = {
3680        gen_helper_neon_clz_u8,
3681        gen_helper_neon_clz_u16,
3682        do_VCLZ_32,
3683        NULL,
3684    };
3685    return do_2misc(s, a, fn[a->size]);
3686}
3687
3688static bool trans_VCNT(DisasContext *s, arg_2misc *a)
3689{
3690    if (a->size != 0) {
3691        return false;
3692    }
3693    return do_2misc(s, a, gen_helper_neon_cnt_u8);
3694}
3695
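    /*
     * Float VABS and VNEG are pure bit-twiddling of the sign bit
     * (mask it off or flip it), so no float_status is needed and
     * no exception flags can be set.
     */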
3696static void gen_VABS_F(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
3697                       uint32_t oprsz, uint32_t maxsz)
3698{
3699    tcg_gen_gvec_andi(vece, rd_ofs, rm_ofs,
3700                      vece == MO_16 ? 0x7fff : 0x7fffffff,
3701                      oprsz, maxsz);
3702}
3703
3704static bool trans_VABS_F(DisasContext *s, arg_2misc *a)
3705{
3706    if (a->size == MO_16) {
3707        if (!dc_isar_feature(aa32_fp16_arith, s)) {
3708            return false;
3709        }
3710    } else if (a->size != MO_32) {
3711        return false;
3712    }
3713    return do_2misc_vec(s, a, gen_VABS_F);
3714}
3715
3716static void gen_VNEG_F(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
3717                       uint32_t oprsz, uint32_t maxsz)
3718{
3719    tcg_gen_gvec_xori(vece, rd_ofs, rm_ofs,
3720                      vece == MO_16 ? 0x8000 : 0x80000000,
3721                      oprsz, maxsz);
3722}
3723
3724static bool trans_VNEG_F(DisasContext *s, arg_2misc *a)
3725{
3726    if (a->size == MO_16) {
3727        if (!dc_isar_feature(aa32_fp16_arith, s)) {
3728            return false;
3729        }
3730    } else if (a->size != MO_32) {
3731        return false;
3732    }
3733    return do_2misc_vec(s, a, gen_VNEG_F);
3734}
3735
3736static bool trans_VRECPE(DisasContext *s, arg_2misc *a)
3737{
3738    if (a->size != 2) {
3739        return false;
3740    }
3741    return do_2misc(s, a, gen_helper_recpe_u32);
3742}
3743
3744static bool trans_VRSQRTE(DisasContext *s, arg_2misc *a)
3745{
3746    if (a->size != 2) {
3747        return false;
3748    }
3749    return do_2misc(s, a, gen_helper_rsqrte_u32);
3750}
3751
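    /*
     * The saturating qabs/qneg helpers need cpu_env so they can set
     * FPSCR.QC; wrap them to the NeonGenOneOpFn signature expected
     * by do_2misc().
     */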
3752#define WRAP_1OP_ENV_FN(WRAPNAME, FUNC) \
3753    static void WRAPNAME(TCGv_i32 d, TCGv_i32 m)        \
3754    {                                                   \
3755        FUNC(d, cpu_env, m);                            \
3756    }
3757
3758WRAP_1OP_ENV_FN(gen_VQABS_s8, gen_helper_neon_qabs_s8)
3759WRAP_1OP_ENV_FN(gen_VQABS_s16, gen_helper_neon_qabs_s16)
3760WRAP_1OP_ENV_FN(gen_VQABS_s32, gen_helper_neon_qabs_s32)
3761WRAP_1OP_ENV_FN(gen_VQNEG_s8, gen_helper_neon_qneg_s8)
3762WRAP_1OP_ENV_FN(gen_VQNEG_s16, gen_helper_neon_qneg_s16)
3763WRAP_1OP_ENV_FN(gen_VQNEG_s32, gen_helper_neon_qneg_s32)
3764
3765static bool trans_VQABS(DisasContext *s, arg_2misc *a)
3766{
3767    static NeonGenOneOpFn * const fn[] = {
3768        gen_VQABS_s8,
3769        gen_VQABS_s16,
3770        gen_VQABS_s32,
3771        NULL,
3772    };
3773    return do_2misc(s, a, fn[a->size]);
3774}
3775
3776static bool trans_VQNEG(DisasContext *s, arg_2misc *a)
3777{
3778    static NeonGenOneOpFn * const fn[] = {
3779        gen_VQNEG_s8,
3780        gen_VQNEG_s16,
3781        gen_VQNEG_s32,
3782        NULL,
3783    };
3784    return do_2misc(s, a, fn[a->size]);
3785}
3786
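    /*
     * Expand a 2-reg-misc FP operation via a gvec helper that takes a
     * float_status pointer: the Neon "standard FPSCR" status is used,
     * in its fp16 flavour when the elements are fp16.
     */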
3787#define DO_2MISC_FP_VEC(INSN, HFUNC, SFUNC)                             \
3788    static void gen_##INSN(unsigned vece, uint32_t rd_ofs,              \
3789                           uint32_t rm_ofs,                             \
3790                           uint32_t oprsz, uint32_t maxsz)              \
3791    {                                                                   \
3792        static gen_helper_gvec_2_ptr * const fns[4] = {                 \
3793            NULL, HFUNC, SFUNC, NULL,                                   \
3794        };                                                              \
3795        TCGv_ptr fpst;                                                  \
3796        fpst = fpstatus_ptr(vece == MO_16 ? FPST_STD_F16 : FPST_STD);   \
3797        tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, oprsz, maxsz, 0,       \
3798                           fns[vece]);                                  \
3799        tcg_temp_free_ptr(fpst);                                        \
3800    }                                                                   \
3801    static bool trans_##INSN(DisasContext *s, arg_2misc *a)             \
3802    {                                                                   \
3803        if (a->size == MO_16) {                                         \
3804            if (!dc_isar_feature(aa32_fp16_arith, s)) {                 \
3805                return false;                                           \
3806            }                                                           \
3807        } else if (a->size != MO_32) {                                  \
3808            return false;                                               \
3809        }                                                               \
3810        return do_2misc_vec(s, a, gen_##INSN);                          \
3811    }
3812
3813DO_2MISC_FP_VEC(VRECPE_F, gen_helper_gvec_frecpe_h, gen_helper_gvec_frecpe_s)
3814DO_2MISC_FP_VEC(VRSQRTE_F, gen_helper_gvec_frsqrte_h, gen_helper_gvec_frsqrte_s)
3815DO_2MISC_FP_VEC(VCGT0_F, gen_helper_gvec_fcgt0_h, gen_helper_gvec_fcgt0_s)
3816DO_2MISC_FP_VEC(VCGE0_F, gen_helper_gvec_fcge0_h, gen_helper_gvec_fcge0_s)
3817DO_2MISC_FP_VEC(VCEQ0_F, gen_helper_gvec_fceq0_h, gen_helper_gvec_fceq0_s)
3818DO_2MISC_FP_VEC(VCLT0_F, gen_helper_gvec_fclt0_h, gen_helper_gvec_fclt0_s)
3819DO_2MISC_FP_VEC(VCLE0_F, gen_helper_gvec_fcle0_h, gen_helper_gvec_fcle0_s)
3820DO_2MISC_FP_VEC(VCVT_FS, gen_helper_gvec_sstoh, gen_helper_gvec_sitos)
3821DO_2MISC_FP_VEC(VCVT_FU, gen_helper_gvec_ustoh, gen_helper_gvec_uitos)
3822DO_2MISC_FP_VEC(VCVT_SF, gen_helper_gvec_tosszh, gen_helper_gvec_tosizs)
3823DO_2MISC_FP_VEC(VCVT_UF, gen_helper_gvec_touszh, gen_helper_gvec_touizs)
3824
3825DO_2MISC_FP_VEC(VRINTX_impl, gen_helper_gvec_vrintx_h, gen_helper_gvec_vrintx_s)
3826
3827static bool trans_VRINTX(DisasContext *s, arg_2misc *a)
3828{
3829    if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
3830        return false;
3831    }
3832    return trans_VRINTX_impl(s, a);
3833}
3834
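    /*
     * Conversions and roundings which take an explicit rounding mode
     * from the encoding: the mode is translated to a softfloat rounding
     * mode with arm_rmode_to_sf() and handed to the helper as its data
     * argument.  These insns are all v8-only.
     */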
3835#define DO_VEC_RMODE(INSN, RMODE, OP)                                   \
3836    static void gen_##INSN(unsigned vece, uint32_t rd_ofs,              \
3837                           uint32_t rm_ofs,                             \
3838                           uint32_t oprsz, uint32_t maxsz)              \
3839    {                                                                   \
3840        static gen_helper_gvec_2_ptr * const fns[4] = {                 \
3841            NULL,                                                       \
3842            gen_helper_gvec_##OP##h,                                    \
3843            gen_helper_gvec_##OP##s,                                    \
3844            NULL,                                                       \
3845        };                                                              \
3846        TCGv_ptr fpst;                                                  \
3847        fpst = fpstatus_ptr(vece == MO_16 ? FPST_STD_F16 : FPST_STD);   \
3848        tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, oprsz, maxsz,          \
3849                           arm_rmode_to_sf(RMODE), fns[vece]);          \
3850        tcg_temp_free_ptr(fpst);                                        \
3851    }                                                                   \
3852    static bool trans_##INSN(DisasContext *s, arg_2misc *a)             \
3853    {                                                                   \
3854        if (!arm_dc_feature(s, ARM_FEATURE_V8)) {                       \
3855            return false;                                               \
3856        }                                                               \
3857        if (a->size == MO_16) {                                         \
3858            if (!dc_isar_feature(aa32_fp16_arith, s)) {                 \
3859                return false;                                           \
3860            }                                                           \
3861        } else if (a->size != MO_32) {                                  \
3862            return false;                                               \
3863        }                                                               \
3864        return do_2misc_vec(s, a, gen_##INSN);                          \
3865    }
3866
3867DO_VEC_RMODE(VCVTAU, FPROUNDING_TIEAWAY, vcvt_rm_u)
3868DO_VEC_RMODE(VCVTAS, FPROUNDING_TIEAWAY, vcvt_rm_s)
3869DO_VEC_RMODE(VCVTNU, FPROUNDING_TIEEVEN, vcvt_rm_u)
3870DO_VEC_RMODE(VCVTNS, FPROUNDING_TIEEVEN, vcvt_rm_s)
3871DO_VEC_RMODE(VCVTPU, FPROUNDING_POSINF, vcvt_rm_u)
3872DO_VEC_RMODE(VCVTPS, FPROUNDING_POSINF, vcvt_rm_s)
3873DO_VEC_RMODE(VCVTMU, FPROUNDING_NEGINF, vcvt_rm_u)
3874DO_VEC_RMODE(VCVTMS, FPROUNDING_NEGINF, vcvt_rm_s)
3875
3876DO_VEC_RMODE(VRINTN, FPROUNDING_TIEEVEN, vrint_rm_)
3877DO_VEC_RMODE(VRINTA, FPROUNDING_TIEAWAY, vrint_rm_)
3878DO_VEC_RMODE(VRINTZ, FPROUNDING_ZERO, vrint_rm_)
3879DO_VEC_RMODE(VRINTM, FPROUNDING_NEGINF, vrint_rm_)
3880DO_VEC_RMODE(VRINTP, FPROUNDING_POSINF, vrint_rm_)
3881
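    /*
     * VSWP exchanges Vd and Vm, one 64-bit register at a time;
     * size 0 is the only valid encoding.
     */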
3882static bool trans_VSWP(DisasContext *s, arg_2misc *a)
3883{
3884    TCGv_i64 rm, rd;
3885    int pass;
3886
3887    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3888        return false;
3889    }
3890
3891    /* UNDEF accesses to D16-D31 if they don't exist. */
3892    if (!dc_isar_feature(aa32_simd_r32, s) &&
3893        ((a->vd | a->vm) & 0x10)) {
3894        return false;
3895    }
3896
3897    if (a->size != 0) {
3898        return false;
3899    }
3900
3901    if ((a->vd | a->vm) & a->q) {
3902        return false;
3903    }
3904
3905    if (!vfp_access_check(s)) {
3906        return true;
3907    }
3908
3909    rm = tcg_temp_new_i64();
3910    rd = tcg_temp_new_i64();
3911    for (pass = 0; pass < (a->q ? 2 : 1); pass++) {
3912        read_neon_element64(rm, a->vm, pass, MO_64);
3913        read_neon_element64(rd, a->vd, pass, MO_64);
3914        write_neon_element64(rm, a->vd, pass, MO_64);
3915        write_neon_element64(rd, a->vm, pass, MO_64);
3916    }
3917    tcg_temp_free_i64(rm);
3918    tcg_temp_free_i64(rd);
3919
3920    return true;
3921}
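
    /*
     * In-place transpose helpers for VTRN on 8-bit and 16-bit elements:
     * each pair of elements across the two inputs is treated as a 2x2
     * matrix and transposed, i.e. the odd-numbered lanes of one input
     * are exchanged with the even-numbered lanes of the other.
     */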
3922static void gen_neon_trn_u8(TCGv_i32 t0, TCGv_i32 t1)
3923{
3924    TCGv_i32 rd, tmp;
3925
3926    rd = tcg_temp_new_i32();
3927    tmp = tcg_temp_new_i32();
3928
3929    tcg_gen_shli_i32(rd, t0, 8);
3930    tcg_gen_andi_i32(rd, rd, 0xff00ff00);
3931    tcg_gen_andi_i32(tmp, t1, 0x00ff00ff);
3932    tcg_gen_or_i32(rd, rd, tmp);
3933
3934    tcg_gen_shri_i32(t1, t1, 8);
3935    tcg_gen_andi_i32(t1, t1, 0x00ff00ff);
3936    tcg_gen_andi_i32(tmp, t0, 0xff00ff00);
3937    tcg_gen_or_i32(t1, t1, tmp);
3938    tcg_gen_mov_i32(t0, rd);
3939
3940    tcg_temp_free_i32(tmp);
3941    tcg_temp_free_i32(rd);
3942}
3943
3944static void gen_neon_trn_u16(TCGv_i32 t0, TCGv_i32 t1)
3945{
3946    TCGv_i32 rd, tmp;
3947
3948    rd = tcg_temp_new_i32();
3949    tmp = tcg_temp_new_i32();
3950
3951    tcg_gen_shli_i32(rd, t0, 16);
3952    tcg_gen_andi_i32(tmp, t1, 0xffff);
3953    tcg_gen_or_i32(rd, rd, tmp);
3954    tcg_gen_shri_i32(t1, t1, 16);
3955    tcg_gen_andi_i32(tmp, t0, 0xffff0000);
3956    tcg_gen_or_i32(t1, t1, tmp);
3957    tcg_gen_mov_i32(t0, rd);
3958
3959    tcg_temp_free_i32(tmp);
3960    tcg_temp_free_i32(rd);
3961}
3962
3963static bool trans_VTRN(DisasContext *s, arg_2misc *a)
3964{
3965    TCGv_i32 tmp, tmp2;
3966    int pass;
3967
3968    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3969        return false;
3970    }
3971
3972    /* UNDEF accesses to D16-D31 if they don't exist. */
3973    if (!dc_isar_feature(aa32_simd_r32, s) &&
3974        ((a->vd | a->vm) & 0x10)) {
3975        return false;
3976    }
3977
3978    if ((a->vd | a->vm) & a->q) {
3979        return false;
3980    }
3981
3982    if (a->size == 3) {
3983        return false;
3984    }
3985
3986    if (!vfp_access_check(s)) {
3987        return true;
3988    }
3989
3990    tmp = tcg_temp_new_i32();
3991    tmp2 = tcg_temp_new_i32();
3992    if (a->size == MO_32) {
3993        for (pass = 0; pass < (a->q ? 4 : 2); pass += 2) {
3994            read_neon_element32(tmp, a->vm, pass, MO_32);
3995            read_neon_element32(tmp2, a->vd, pass + 1, MO_32);
3996            write_neon_element32(tmp2, a->vm, pass, MO_32);
3997            write_neon_element32(tmp, a->vd, pass + 1, MO_32);
3998        }
3999    } else {
4000        for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
4001            read_neon_element32(tmp, a->vm, pass, MO_32);
4002            read_neon_element32(tmp2, a->vd, pass, MO_32);
4003            if (a->size == MO_8) {
4004                gen_neon_trn_u8(tmp, tmp2);
4005            } else {
4006                gen_neon_trn_u16(tmp, tmp2);
4007            }
4008            write_neon_element32(tmp2, a->vm, pass, MO_32);
4009            write_neon_element32(tmp, a->vd, pass, MO_32);
4010        }
4011    }
4012    tcg_temp_free_i32(tmp);
4013    tcg_temp_free_i32(tmp2);
4014    return true;
4015}
4016
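    /*
     * Matrix multiply-accumulate and BFloat16 insns from the I8MM and
     * BF16 extensions: these all expand to shared gvec helpers, so the
     * trans functions only need to make the feature checks.
     */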
4017static bool trans_VSMMLA(DisasContext *s, arg_VSMMLA *a)
4018{
4019    if (!dc_isar_feature(aa32_i8mm, s)) {
4020        return false;
4021    }
4022    return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
4023                        gen_helper_gvec_smmla_b);
4024}
4025
4026static bool trans_VUMMLA(DisasContext *s, arg_VUMMLA *a)
4027{
4028    if (!dc_isar_feature(aa32_i8mm, s)) {
4029        return false;
4030    }
4031    return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
4032                        gen_helper_gvec_ummla_b);
4033}
4034
4035static bool trans_VUSMMLA(DisasContext *s, arg_VUSMMLA *a)
4036{
4037    if (!dc_isar_feature(aa32_i8mm, s)) {
4038        return false;
4039    }
4040    return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
4041                        gen_helper_gvec_usmmla_b);
4042}
4043
4044static bool trans_VMMLA_b16(DisasContext *s, arg_VMMLA_b16 *a)
4045{
4046    if (!dc_isar_feature(aa32_bf16, s)) {
4047        return false;
4048    }
4049    return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
4050                        gen_helper_gvec_bfmmla);
4051}
4052
4053static bool trans_VFMA_b16(DisasContext *s, arg_VFMA_b16 *a)
4054{
4055    if (!dc_isar_feature(aa32_bf16, s)) {
4056        return false;
4057    }
4058    return do_neon_ddda_fpst(s, 7, a->vd, a->vn, a->vm, a->q, FPST_STD,
4059                             gen_helper_gvec_bfmlal);
4060}
4061
4062static bool trans_VFMA_b16_scal(DisasContext *s, arg_VFMA_b16_scal *a)
4063{
4064    if (!dc_isar_feature(aa32_bf16, s)) {
4065        return false;
4066    }
4067    return do_neon_ddda_fpst(s, 6, a->vd, a->vn, a->vm,
4068                             (a->index << 1) | a->q, FPST_STD,
4069                             gen_helper_gvec_bfmlal_idx);
4070}
4071