qemu/tcg/tcg-op-gvec.c
/*
 * Generic vector operation expansion
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "tcg.h"
#include "tcg-op.h"
#include "tcg-op-gvec.h"
#include "tcg-gvec-desc.h"

#define MAX_UNROLL  4

#ifdef CONFIG_DEBUG_TCG
static const TCGOpcode vecop_list_empty[1] = { 0 };
#else
#define vecop_list_empty NULL
#endif


/* Verify vector size and alignment rules.  OFS should be the OR of all
   of the operand offsets so that we can check them all at once.  */
static void check_size_align(uint32_t oprsz, uint32_t maxsz, uint32_t ofs)
{
    uint32_t opr_align = oprsz >= 16 ? 15 : 7;
    uint32_t max_align = maxsz >= 16 || oprsz >= 16 ? 15 : 7;
    tcg_debug_assert(oprsz > 0);
    tcg_debug_assert(oprsz <= maxsz);
    tcg_debug_assert((oprsz & opr_align) == 0);
    tcg_debug_assert((maxsz & max_align) == 0);
    tcg_debug_assert((ofs & max_align) == 0);
}
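
/*
 * Illustrative values: oprsz = 8, maxsz = 8 passes (sizes below 16 need
 * only 8-byte alignment); oprsz = 80, maxsz = 80 passes (multiples of 16);
 * oprsz = 24 is rejected, since once a size reaches 16 bytes it must be
 * a multiple of 16 (24 & 15 != 0).
 */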

/* Verify vector overlap rules for two operands.  */
static void check_overlap_2(uint32_t d, uint32_t a, uint32_t s)
{
    tcg_debug_assert(d == a || d + s <= a || a + s <= d);
}

/* Verify vector overlap rules for three operands.  */
static void check_overlap_3(uint32_t d, uint32_t a, uint32_t b, uint32_t s)
{
    check_overlap_2(d, a, s);
    check_overlap_2(d, b, s);
    check_overlap_2(a, b, s);
}

/* Verify vector overlap rules for four operands.  */
static void check_overlap_4(uint32_t d, uint32_t a, uint32_t b,
                            uint32_t c, uint32_t s)
{
    check_overlap_2(d, a, s);
    check_overlap_2(d, b, s);
    check_overlap_2(d, c, s);
    check_overlap_2(a, b, s);
    check_overlap_2(a, c, s);
    check_overlap_2(b, c, s);
}

/* Create a descriptor from components.  */
uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data)
{
    uint32_t desc = 0;

    assert(oprsz % 8 == 0 && oprsz <= (8 << SIMD_OPRSZ_BITS));
    assert(maxsz % 8 == 0 && maxsz <= (8 << SIMD_MAXSZ_BITS));
    assert(data == sextract32(data, 0, SIMD_DATA_BITS));

    oprsz = (oprsz / 8) - 1;
    maxsz = (maxsz / 8) - 1;
    desc = deposit32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS, oprsz);
    desc = deposit32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS, maxsz);
    desc = deposit32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS, data);

    return desc;
}
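
/*
 * Worked example (illustrative values): simd_desc(32, 64, 5) packs
 * (32 / 8) - 1 = 3 into the OPRSZ field, (64 / 8) - 1 = 7 into the
 * MAXSZ field, and 5 into the DATA field; helpers recover these with
 * the simd_oprsz(), simd_maxsz() and simd_data() accessors from
 * tcg-gvec-desc.h.
 */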

/* Generate a call to a gvec-style helper with two vector operands.  */
void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
                        uint32_t oprsz, uint32_t maxsz, int32_t data,
                        gen_helper_gvec_2 *fn)
{
    TCGv_ptr a0, a1;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);

    fn(a0, a1, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with two vector operands
   and one scalar operand.  */
void tcg_gen_gvec_2i_ool(uint32_t dofs, uint32_t aofs, TCGv_i64 c,
                         uint32_t oprsz, uint32_t maxsz, int32_t data,
                         gen_helper_gvec_2i *fn)
{
    TCGv_ptr a0, a1;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);

    fn(a0, a1, c, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with three vector operands.  */
void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t oprsz, uint32_t maxsz, int32_t data,
                        gen_helper_gvec_3 *fn)
{
    TCGv_ptr a0, a1, a2;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);

    fn(a0, a1, a2, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with four vector operands.  */
void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_4 *fn)
{
    TCGv_ptr a0, a1, a2, a3;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();
    a3 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);
    tcg_gen_addi_ptr(a3, cpu_env, cofs);

    fn(a0, a1, a2, a3, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with five vector operands.  */
void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, uint32_t xofs, uint32_t oprsz,
                        uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn)
{
    TCGv_ptr a0, a1, a2, a3, a4;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();
    a3 = tcg_temp_new_ptr();
    a4 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);
    tcg_gen_addi_ptr(a3, cpu_env, cofs);
    tcg_gen_addi_ptr(a4, cpu_env, xofs);

    fn(a0, a1, a2, a3, a4, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_ptr(a4);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with two vector operands
   and an extra pointer operand.  */
void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_2_ptr *fn)
{
    TCGv_ptr a0, a1;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);

    fn(a0, a1, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with three vector operands
   and an extra pointer operand.  */
void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_3_ptr *fn)
{
    TCGv_ptr a0, a1, a2;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);

    fn(a0, a1, a2, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with four vector operands
   and an extra pointer operand.  */
void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz,
                        uint32_t maxsz, int32_t data,
                        gen_helper_gvec_4_ptr *fn)
{
    TCGv_ptr a0, a1, a2, a3;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();
    a3 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);
    tcg_gen_addi_ptr(a3, cpu_env, cofs);

    fn(a0, a1, a2, a3, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_i32(desc);
}

/* Return true if we want to implement something of OPRSZ bytes
   in units of LNSZ.  This limits the expansion of inline code.  */
static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
{
    if (oprsz % lnsz == 0) {
        uint32_t lnct = oprsz / lnsz;
        return lnct >= 1 && lnct <= MAX_UNROLL;
    }
    return false;
}
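
/*
 * For example, oprsz = 32 in units of lnsz = 8 is 4 lines, within
 * MAX_UNROLL, so it may be expanded inline; oprsz = 40 in units of 8
 * is 5 lines and is refused, leaving the expansion to a wider unit
 * or an out-of-line helper.
 */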

static void expand_clr(uint32_t dofs, uint32_t maxsz);

/* Duplicate C as per VECE.  */
uint64_t (dup_const)(unsigned vece, uint64_t c)
{
    switch (vece) {
    case MO_8:
        return 0x0101010101010101ull * (uint8_t)c;
    case MO_16:
        return 0x0001000100010001ull * (uint16_t)c;
    case MO_32:
        return 0x0000000100000001ull * (uint32_t)c;
    case MO_64:
        return c;
    default:
        g_assert_not_reached();
    }
}
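
/*
 * E.g. dup_const(MO_8, 0x2a) == 0x2a2a2a2a2a2a2a2aull and
 * dup_const(MO_16, 0x1234) == 0x1234123412341234ull: the truncated
 * element times the 0x01...01 pattern replicates it into every lane.
 */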

/* Duplicate IN into OUT as per VECE.  */
static void gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in)
{
    switch (vece) {
    case MO_8:
        tcg_gen_ext8u_i32(out, in);
        tcg_gen_muli_i32(out, out, 0x01010101);
        break;
    case MO_16:
        tcg_gen_deposit_i32(out, in, in, 16, 16);
        break;
    case MO_32:
        tcg_gen_mov_i32(out, in);
        break;
    default:
        g_assert_not_reached();
    }
}

static void gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in)
{
    switch (vece) {
    case MO_8:
        tcg_gen_ext8u_i64(out, in);
        tcg_gen_muli_i64(out, out, 0x0101010101010101ull);
        break;
    case MO_16:
        tcg_gen_ext16u_i64(out, in);
        tcg_gen_muli_i64(out, out, 0x0001000100010001ull);
        break;
    case MO_32:
        tcg_gen_deposit_i64(out, in, in, 32, 32);
        break;
    case MO_64:
        tcg_gen_mov_i64(out, in);
        break;
    default:
        g_assert_not_reached();
    }
}

/* Select a supported vector type for implementing an operation on SIZE
 * bytes.  If OP is 0, assume that the real operation to be performed is
 * required by all backends.  Otherwise, make sure that OP can be performed
 * on elements of size VECE in the selected type.  Do not select V64 if
 * PREFER_I64 is true.  Return 0 if no vector type is selected.
 */
static TCGType choose_vector_type(const TCGOpcode *list, unsigned vece,
                                  uint32_t size, bool prefer_i64)
{
    if (TCG_TARGET_HAS_v256 && check_size_impl(size, 32)) {
        /*
         * Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         * It is hard to imagine a case in which v256 is supported
         * but v128 is not, but check anyway.
         */
        if (tcg_can_emit_vecop_list(list, TCG_TYPE_V256, vece)
            && (size % 32 == 0
                || tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece))) {
            return TCG_TYPE_V256;
        }
    }
    if (TCG_TARGET_HAS_v128 && check_size_impl(size, 16)
        && tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece)) {
        return TCG_TYPE_V128;
    }
    if (TCG_TARGET_HAS_v64 && !prefer_i64 && check_size_impl(size, 8)
        && tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)) {
        return TCG_TYPE_V64;
    }
    return 0;
}
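
/*
 * For example (assuming a host with both v256 and v128 support),
 * choose_vector_type(list, vece, 80, false) can return TCG_TYPE_V256
 * only if the ops in LIST are supported for v128 as well, because
 * 80 bytes leaves a 16-byte remainder after the 32-byte stores.
 */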

static void do_dup_store(TCGType type, uint32_t dofs, uint32_t oprsz,
                         uint32_t maxsz, TCGv_vec t_vec)
{
    uint32_t i = 0;

    switch (type) {
    case TCG_TYPE_V256:
        /*
         * Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        for (; i + 32 <= oprsz; i += 32) {
            tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V256);
        }
        /* fallthru */
    case TCG_TYPE_V128:
        for (; i + 16 <= oprsz; i += 16) {
            tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V128);
        }
        break;
    case TCG_TYPE_V64:
        for (; i < oprsz; i += 8) {
            tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
        }
        break;
    default:
        g_assert_not_reached();
    }

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}
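
/*
 * Continuing the SVE example above: with type == TCG_TYPE_V256 and
 * oprsz == 80, the loops emit two 32-byte stores at offsets 0 and 32,
 * fall through, and finish with one 16-byte store at offset 64.
 */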

/* Set OPRSZ bytes at DOFS to replications of IN_32, IN_64 or IN_C.
 * Only one of IN_32 or IN_64 may be set;
 * IN_C is used if IN_32 and IN_64 are unset.
 */
static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz,
                   uint32_t maxsz, TCGv_i32 in_32, TCGv_i64 in_64,
                   uint64_t in_c)
{
    TCGType type;
    TCGv_i64 t_64;
    TCGv_i32 t_32, t_desc;
    TCGv_ptr t_ptr;
    uint32_t i;

    assert(vece <= (in_32 ? MO_32 : MO_64));
    assert(in_32 == NULL || in_64 == NULL);

    /* If we're storing 0, expand oprsz to maxsz.  */
    if (in_32 == NULL && in_64 == NULL) {
        in_c = dup_const(vece, in_c);
        if (in_c == 0) {
            oprsz = maxsz;
        }
    }

    /* Implement inline with a vector type, if possible.
     * Prefer integer when 64-bit host and no variable dup.
     */
    type = choose_vector_type(NULL, vece, oprsz,
                              (TCG_TARGET_REG_BITS == 64 && in_32 == NULL
                               && (in_64 == NULL || vece == MO_64)));
    if (type != 0) {
        TCGv_vec t_vec = tcg_temp_new_vec(type);

        if (in_32) {
            tcg_gen_dup_i32_vec(vece, t_vec, in_32);
        } else if (in_64) {
            tcg_gen_dup_i64_vec(vece, t_vec, in_64);
        } else {
            tcg_gen_dupi_vec(vece, t_vec, in_c);
        }
        do_dup_store(type, dofs, oprsz, maxsz, t_vec);
        tcg_temp_free_vec(t_vec);
        return;
    }

    /* Otherwise, inline with an integer type, unless "large".  */
    if (check_size_impl(oprsz, TCG_TARGET_REG_BITS / 8)) {
        t_64 = NULL;
        t_32 = NULL;

        if (in_32) {
            /* We are given a 32-bit variable input.  For a 64-bit host,
               use a 64-bit operation unless the 32-bit operation would
               be simple enough.  */
            if (TCG_TARGET_REG_BITS == 64
                && (vece != MO_32 || !check_size_impl(oprsz, 4))) {
                t_64 = tcg_temp_new_i64();
                tcg_gen_extu_i32_i64(t_64, in_32);
                gen_dup_i64(vece, t_64, t_64);
            } else {
                t_32 = tcg_temp_new_i32();
                gen_dup_i32(vece, t_32, in_32);
            }
        } else if (in_64) {
            /* We are given a 64-bit variable input.  */
            t_64 = tcg_temp_new_i64();
            gen_dup_i64(vece, t_64, in_64);
        } else {
            /* We are given a constant input.  */
            /* For 64-bit hosts, use 64-bit constants for "simple" constants
               or when we'd need too many 32-bit stores, or when a 64-bit
               constant is really required.  */
            if (vece == MO_64
                || (TCG_TARGET_REG_BITS == 64
                    && (in_c == 0 || in_c == -1
                        || !check_size_impl(oprsz, 4)))) {
                t_64 = tcg_const_i64(in_c);
            } else {
                t_32 = tcg_const_i32(in_c);
            }
        }

        /* Implement inline if we picked an implementation size above.  */
        if (t_32) {
            for (i = 0; i < oprsz; i += 4) {
                tcg_gen_st_i32(t_32, cpu_env, dofs + i);
            }
            tcg_temp_free_i32(t_32);
            goto done;
        }
        if (t_64) {
            for (i = 0; i < oprsz; i += 8) {
                tcg_gen_st_i64(t_64, cpu_env, dofs + i);
            }
            tcg_temp_free_i64(t_64);
            goto done;
        }
    }

    /* Otherwise implement out of line.  */
    t_ptr = tcg_temp_new_ptr();
    tcg_gen_addi_ptr(t_ptr, cpu_env, dofs);
    t_desc = tcg_const_i32(simd_desc(oprsz, maxsz, 0));

    if (vece == MO_64) {
        if (in_64) {
            gen_helper_gvec_dup64(t_ptr, t_desc, in_64);
        } else {
            t_64 = tcg_const_i64(in_c);
            gen_helper_gvec_dup64(t_ptr, t_desc, t_64);
            tcg_temp_free_i64(t_64);
        }
    } else {
        typedef void dup_fn(TCGv_ptr, TCGv_i32, TCGv_i32);
        static dup_fn * const fns[3] = {
            gen_helper_gvec_dup8,
            gen_helper_gvec_dup16,
            gen_helper_gvec_dup32
        };

        if (in_32) {
            fns[vece](t_ptr, t_desc, in_32);
        } else {
            t_32 = tcg_temp_new_i32();
            if (in_64) {
                tcg_gen_extrl_i64_i32(t_32, in_64);
            } else if (vece == MO_8) {
                tcg_gen_movi_i32(t_32, in_c & 0xff);
            } else if (vece == MO_16) {
                tcg_gen_movi_i32(t_32, in_c & 0xffff);
            } else {
                tcg_gen_movi_i32(t_32, in_c);
            }
            fns[vece](t_ptr, t_desc, t_32);
            tcg_temp_free_i32(t_32);
        }
    }

    tcg_temp_free_ptr(t_ptr);
    tcg_temp_free_i32(t_desc);
    return;

 done:
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Likewise, but with zero.  */
static void expand_clr(uint32_t dofs, uint32_t maxsz)
{
    do_dup(MO_8, dofs, maxsz, maxsz, NULL, NULL, 0);
}

/* Expand OPRSZ bytes worth of two-operand operations using i32 elements.  */
static void expand_2_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                         void (*fni)(TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        fni(t0, t0);
        tcg_gen_st_i32(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
}

static void expand_2i_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          int32_t c, bool load_dest,
                          void (*fni)(TCGv_i32, TCGv_i32, int32_t))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_i32(t1, cpu_env, dofs + i);
        }
        fni(t1, t0, c);
        tcg_gen_st_i32(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
    tcg_temp_free_i32(t1);
}

static void expand_2s_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          TCGv_i32 c, bool scalar_first,
                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        if (scalar_first) {
            fni(t1, c, t0);
        } else {
            fni(t1, t0, c);
        }
        tcg_gen_st_i32(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
    tcg_temp_free_i32(t1);
}

/* Expand OPRSZ bytes worth of three-operand operations using i32 elements.  */
static void expand_3_i32(uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t oprsz, bool load_dest,
                         void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        tcg_gen_ld_i32(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i32(t2, cpu_env, dofs + i);
        }
        fni(t2, t0, t1);
        tcg_gen_st_i32(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t2);
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t0);
}

static void expand_3i_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                          uint32_t oprsz, int32_t c, bool load_dest,
                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, int32_t))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        tcg_gen_ld_i32(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i32(t2, cpu_env, dofs + i);
        }
        fni(t2, t0, t1, c);
        tcg_gen_st_i32(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t2);
}

/* Expand OPRSZ bytes worth of four-operand operations using i32 elements.  */
static void expand_4_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                         uint32_t cofs, uint32_t oprsz, bool write_aofs,
                         void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();
    TCGv_i32 t3 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t1, cpu_env, aofs + i);
        tcg_gen_ld_i32(t2, cpu_env, bofs + i);
        tcg_gen_ld_i32(t3, cpu_env, cofs + i);
        fni(t0, t1, t2, t3);
        tcg_gen_st_i32(t0, cpu_env, dofs + i);
        if (write_aofs) {
            tcg_gen_st_i32(t1, cpu_env, aofs + i);
        }
    }
    tcg_temp_free_i32(t3);
    tcg_temp_free_i32(t2);
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t0);
}

/* Expand OPRSZ bytes worth of two-operand operations using i64 elements.  */
static void expand_2_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                         void (*fni)(TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        fni(t0, t0);
        tcg_gen_st_i64(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
}

static void expand_2i_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          int64_t c, bool load_dest,
                          void (*fni)(TCGv_i64, TCGv_i64, int64_t))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t1, cpu_env, dofs + i);
        }
        fni(t1, t0, c);
        tcg_gen_st_i64(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
}

static void expand_2s_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          TCGv_i64 c, bool scalar_first,
                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        if (scalar_first) {
            fni(t1, c, t0);
        } else {
            fni(t1, t0, c);
        }
        tcg_gen_st_i64(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
}

/* Expand OPRSZ bytes worth of three-operand operations using i64 elements.  */
static void expand_3_i64(uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t oprsz, bool load_dest,
                         void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        tcg_gen_ld_i64(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t2, cpu_env, dofs + i);
        }
        fni(t2, t0, t1);
        tcg_gen_st_i64(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t0);
}

static void expand_3i_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                          uint32_t oprsz, int64_t c, bool load_dest,
                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, int64_t))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        tcg_gen_ld_i64(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t2, cpu_env, dofs + i);
        }
        fni(t2, t0, t1, c);
        tcg_gen_st_i64(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
}

/* Expand OPRSZ bytes worth of four-operand operations using i64 elements.  */
static void expand_4_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                         uint32_t cofs, uint32_t oprsz, bool write_aofs,
                         void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    TCGv_i64 t3 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t1, cpu_env, aofs + i);
        tcg_gen_ld_i64(t2, cpu_env, bofs + i);
        tcg_gen_ld_i64(t3, cpu_env, cofs + i);
        fni(t0, t1, t2, t3);
        tcg_gen_st_i64(t0, cpu_env, dofs + i);
        if (write_aofs) {
            tcg_gen_st_i64(t1, cpu_env, aofs + i);
        }
    }
    tcg_temp_free_i64(t3);
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t0);
}

/* Expand OPRSZ bytes worth of two-operand operations using host vectors.  */
static void expand_2_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                         uint32_t oprsz, uint32_t tysz, TCGType type,
                         void (*fni)(unsigned, TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        fni(vece, t0, t0);
        tcg_gen_st_vec(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t0);
}

/* Expand OPRSZ bytes worth of two-vector operands and an immediate operand
   using host vectors.  */
static void expand_2i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t oprsz, uint32_t tysz, TCGType type,
                          int64_t c, bool load_dest,
                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, int64_t))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_vec(t1, cpu_env, dofs + i);
        }
        fni(vece, t1, t0, c);
        tcg_gen_st_vec(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t0);
    tcg_temp_free_vec(t1);
}

static void expand_2s_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t oprsz, uint32_t tysz, TCGType type,
                          TCGv_vec c, bool scalar_first,
                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        if (scalar_first) {
            fni(vece, t1, c, t0);
        } else {
            fni(vece, t1, t0, c);
        }
        tcg_gen_st_vec(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t0);
    tcg_temp_free_vec(t1);
}

/* Expand OPRSZ bytes worth of three-operand operations using host vectors.  */
static void expand_3_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t oprsz,
                         uint32_t tysz, TCGType type, bool load_dest,
                         void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    TCGv_vec t2 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        tcg_gen_ld_vec(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_vec(t2, cpu_env, dofs + i);
        }
        fni(vece, t2, t0, t1);
        tcg_gen_st_vec(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t2);
    tcg_temp_free_vec(t1);
    tcg_temp_free_vec(t0);
}

/*
 * Expand OPRSZ bytes worth of three-vector operands and an immediate operand
 * using host vectors.
 */
static void expand_3i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t bofs, uint32_t oprsz, uint32_t tysz,
                          TCGType type, int64_t c, bool load_dest,
                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec,
                                      int64_t))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    TCGv_vec t2 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        tcg_gen_ld_vec(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_vec(t2, cpu_env, dofs + i);
        }
        fni(vece, t2, t0, t1, c);
        tcg_gen_st_vec(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t0);
    tcg_temp_free_vec(t1);
    tcg_temp_free_vec(t2);
}

/* Expand OPRSZ bytes worth of four-operand operations using host vectors.  */
static void expand_4_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t cofs, uint32_t oprsz,
                         uint32_t tysz, TCGType type, bool write_aofs,
                         void (*fni)(unsigned, TCGv_vec, TCGv_vec,
                                     TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    TCGv_vec t2 = tcg_temp_new_vec(type);
    TCGv_vec t3 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t1, cpu_env, aofs + i);
        tcg_gen_ld_vec(t2, cpu_env, bofs + i);
        tcg_gen_ld_vec(t3, cpu_env, cofs + i);
        fni(vece, t0, t1, t2, t3);
        tcg_gen_st_vec(t0, cpu_env, dofs + i);
        if (write_aofs) {
            tcg_gen_st_vec(t1, cpu_env, aofs + i);
        }
    }
    tcg_temp_free_vec(t3);
    tcg_temp_free_vec(t2);
    tcg_temp_free_vec(t1);
    tcg_temp_free_vec(t0);
}

/* Expand a vector two-operand operation.  */
void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen2 *g)
{
    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs);
    check_overlap_2(dofs, aofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_2_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256, g->fniv);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_2_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_2_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_2_i64(dofs, aofs, oprsz, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_2_i32(dofs, aofs, oprsz, g->fni4);
        } else {
            assert(g->fno != NULL);
            tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, g->data, g->fno);
            oprsz = maxsz;
        }
        break;

    default:
        g_assert_not_reached();
    }
    tcg_swap_vecop_list(hold_list);

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}
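
/*
 * Note the shared fallback order in the expanders above and below:
 * host vector code (fniv) when a suitable vector type exists, else
 * inline i64 (fni8) or i32 (fni4) expansion when the operation size
 * permits, else the out-of-line helper (fno).
 */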

/* Expand a vector operation with two vectors and an immediate.  */
void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                     uint32_t maxsz, int64_t c, const GVecGen2i *g)
{
    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs);
    check_overlap_2(dofs, aofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_2i_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
                      c, g->load_dest, g->fniv);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_2i_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
                      c, g->load_dest, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_2i_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
                      c, g->load_dest, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_2i_i64(dofs, aofs, oprsz, c, g->load_dest, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_2i_i32(dofs, aofs, oprsz, c, g->load_dest, g->fni4);
        } else {
            if (g->fno) {
                tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, c, g->fno);
            } else {
                TCGv_i64 tcg_c = tcg_const_i64(c);
                tcg_gen_gvec_2i_ool(dofs, aofs, tcg_c, oprsz,
                                    maxsz, c, g->fnoi);
                tcg_temp_free_i64(tcg_c);
            }
            oprsz = maxsz;
        }
        break;

    default:
        g_assert_not_reached();
    }
    tcg_swap_vecop_list(hold_list);

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Expand a vector operation with two vectors and a scalar.  */
void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                     uint32_t maxsz, TCGv_i64 c, const GVecGen2s *g)
{
    TCGType type;

    check_size_align(oprsz, maxsz, dofs | aofs);
    check_overlap_2(dofs, aofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    if (type != 0) {
        const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
        const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
        TCGv_vec t_vec = tcg_temp_new_vec(type);
        uint32_t some;

        tcg_gen_dup_i64_vec(g->vece, t_vec, c);

        switch (type) {
        case TCG_TYPE_V256:
            /* Recall that ARM SVE allows vector sizes that are not a
             * power of 2, but always a multiple of 16.  The intent is
             * that e.g. size == 80 would be expanded with 2x32 + 1x16.
             */
            some = QEMU_ALIGN_DOWN(oprsz, 32);
            expand_2s_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
                          t_vec, g->scalar_first, g->fniv);
            if (some == oprsz) {
                break;
            }
            dofs += some;
            aofs += some;
            oprsz -= some;
            maxsz -= some;
            /* fallthru */

        case TCG_TYPE_V128:
            expand_2s_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
                          t_vec, g->scalar_first, g->fniv);
            break;

        case TCG_TYPE_V64:
            expand_2s_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
                          t_vec, g->scalar_first, g->fniv);
            break;

        default:
            g_assert_not_reached();
        }
        tcg_temp_free_vec(t_vec);
        tcg_swap_vecop_list(hold_list);
    } else if (g->fni8 && check_size_impl(oprsz, 8)) {
        TCGv_i64 t64 = tcg_temp_new_i64();

        gen_dup_i64(g->vece, t64, c);
        expand_2s_i64(dofs, aofs, oprsz, t64, g->scalar_first, g->fni8);
        tcg_temp_free_i64(t64);
    } else if (g->fni4 && check_size_impl(oprsz, 4)) {
        TCGv_i32 t32 = tcg_temp_new_i32();

        tcg_gen_extrl_i64_i32(t32, c);
        gen_dup_i32(g->vece, t32, t32);
        expand_2s_i32(dofs, aofs, oprsz, t32, g->scalar_first, g->fni4);
        tcg_temp_free_i32(t32);
    } else {
        tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, 0, g->fno);
        return;
    }

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Expand a vector three-operand operation.  */
void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g)
{
    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs | bofs);
    check_overlap_3(dofs, aofs, bofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_3_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
                     g->load_dest, g->fniv);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        bofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
                     g->load_dest, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
                     g->load_dest, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_3_i64(dofs, aofs, bofs, oprsz, g->load_dest, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_3_i32(dofs, aofs, bofs, oprsz, g->load_dest, g->fni4);
        } else {
            assert(g->fno != NULL);
            tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz,
                               maxsz, g->data, g->fno);
            oprsz = maxsz;
        }
        break;

    default:
        g_assert_not_reached();
    }
    tcg_swap_vecop_list(hold_list);

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Expand a vector operation with three vectors and an immediate.  */
void tcg_gen_gvec_3i(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                     uint32_t oprsz, uint32_t maxsz, int64_t c,
                     const GVecGen3i *g)
{
    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs | bofs);
    check_overlap_3(dofs, aofs, bofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /*
         * Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_3i_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
                      c, g->load_dest, g->fniv);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        bofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
                      c, g->load_dest, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
                      c, g->load_dest, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_3i_i64(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_3i_i32(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni4);
        } else {
            assert(g->fno != NULL);
            tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, c, g->fno);
            oprsz = maxsz;
        }
        break;

    default:
        g_assert_not_reached();
    }
    tcg_swap_vecop_list(hold_list);

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Expand a vector four-operand operation.  */
void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen4 *g)
{
    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs);
    check_overlap_4(dofs, aofs, bofs, cofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, some,
                     32, TCG_TYPE_V256, g->write_aofs, g->fniv);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        bofs += some;
        cofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
                     16, TCG_TYPE_V128, g->write_aofs, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
                     8, TCG_TYPE_V64, g->write_aofs, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_4_i64(dofs, aofs, bofs, cofs, oprsz,
                         g->write_aofs, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_4_i32(dofs, aofs, bofs, cofs, oprsz,
                         g->write_aofs, g->fni4);
        } else {
            assert(g->fno != NULL);
            tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs,
                               oprsz, maxsz, g->data, g->fno);
            oprsz = maxsz;
        }
        break;

    default:
        g_assert_not_reached();
    }
    tcg_swap_vecop_list(hold_list);

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/*
 * Expand specific vector operations.
 */

static void vec_mov2(unsigned vece, TCGv_vec a, TCGv_vec b)
{
    tcg_gen_mov_vec(a, b);
}

void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2 g = {
        .fni8 = tcg_gen_mov_i64,
        .fniv = vec_mov2,
        .fno = gen_helper_gvec_mov,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
    if (dofs != aofs) {
        tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
    } else {
        check_size_align(oprsz, maxsz, dofs);
        if (oprsz < maxsz) {
            expand_clr(dofs + oprsz, maxsz - oprsz);
        }
    }
}

void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t oprsz,
                          uint32_t maxsz, TCGv_i32 in)
{
    check_size_align(oprsz, maxsz, dofs);
    tcg_debug_assert(vece <= MO_32);
    do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
}

void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t oprsz,
                          uint32_t maxsz, TCGv_i64 in)
{
    check_size_align(oprsz, maxsz, dofs);
    tcg_debug_assert(vece <= MO_64);
    do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
}

void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t oprsz, uint32_t maxsz)
{
    check_size_align(oprsz, maxsz, dofs);
    if (vece <= MO_64) {
        TCGType type = choose_vector_type(NULL, vece, oprsz, 0);
        if (type != 0) {
            TCGv_vec t_vec = tcg_temp_new_vec(type);
            tcg_gen_dup_mem_vec(vece, t_vec, cpu_env, aofs);
            do_dup_store(type, dofs, oprsz, maxsz, t_vec);
            tcg_temp_free_vec(t_vec);
        } else if (vece <= MO_32) {
            TCGv_i32 in = tcg_temp_new_i32();
            switch (vece) {
            case MO_8:
                tcg_gen_ld8u_i32(in, cpu_env, aofs);
                break;
            case MO_16:
                tcg_gen_ld16u_i32(in, cpu_env, aofs);
                break;
            default:
                tcg_gen_ld_i32(in, cpu_env, aofs);
                break;
            }
            do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
            tcg_temp_free_i32(in);
        } else {
            TCGv_i64 in = tcg_temp_new_i64();
            tcg_gen_ld_i64(in, cpu_env, aofs);
            do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
            tcg_temp_free_i64(in);
        }
    } else {
        /* 128-bit duplicate.  */
        /* ??? Dup to 256-bit vector.  */
        int i;

        tcg_debug_assert(vece == 4);
        tcg_debug_assert(oprsz >= 16);
        if (TCG_TARGET_HAS_v128) {
            TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V128);

            tcg_gen_ld_vec(in, cpu_env, aofs);
            for (i = 0; i < oprsz; i += 16) {
                tcg_gen_st_vec(in, cpu_env, dofs + i);
            }
            tcg_temp_free_vec(in);
        } else {
            TCGv_i64 in0 = tcg_temp_new_i64();
            TCGv_i64 in1 = tcg_temp_new_i64();

            tcg_gen_ld_i64(in0, cpu_env, aofs);
            tcg_gen_ld_i64(in1, cpu_env, aofs + 8);
            for (i = 0; i < oprsz; i += 16) {
                tcg_gen_st_i64(in0, cpu_env, dofs + i);
                tcg_gen_st_i64(in1, cpu_env, dofs + i + 8);
            }
            tcg_temp_free_i64(in0);
            tcg_temp_free_i64(in1);
        }
        if (oprsz < maxsz) {
            expand_clr(dofs + oprsz, maxsz - oprsz);
        }
    }
}

void tcg_gen_gvec_dup64i(uint32_t dofs, uint32_t oprsz,
                         uint32_t maxsz, uint64_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(MO_64, dofs, oprsz, maxsz, NULL, NULL, x);
}

void tcg_gen_gvec_dup32i(uint32_t dofs, uint32_t oprsz,
                         uint32_t maxsz, uint32_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(MO_32, dofs, oprsz, maxsz, NULL, NULL, x);
}

void tcg_gen_gvec_dup16i(uint32_t dofs, uint32_t oprsz,
                         uint32_t maxsz, uint16_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(MO_16, dofs, oprsz, maxsz, NULL, NULL, x);
}

void tcg_gen_gvec_dup8i(uint32_t dofs, uint32_t oprsz,
                         uint32_t maxsz, uint8_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(MO_8, dofs, oprsz, maxsz, NULL, NULL, x);
}

void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2 g = {
        .fni8 = tcg_gen_not_i64,
        .fniv = tcg_gen_not_vec,
        .fno = gen_helper_gvec_not,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
    tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
}

/* Perform a vector addition using normal addition and a mask.  The mask
   should be the sign bit of each lane.  This 6-operation form is more
   efficient than separate additions when there are 4 or more lanes in
   the 64-bit operation.  */
static void gen_addv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
{
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    TCGv_i64 t3 = tcg_temp_new_i64();

    tcg_gen_andc_i64(t1, a, m);
    tcg_gen_andc_i64(t2, b, m);
    tcg_gen_xor_i64(t3, a, b);
    tcg_gen_add_i64(d, t1, t2);
    tcg_gen_and_i64(t3, t3, m);
    tcg_gen_xor_i64(d, d, t3);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t3);
}
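
/*
 * Worked example for one MO_8 lane: with m == 0x80, a == 0xff, b == 0x01,
 * the masked add is 0x7f + 0x01 = 0x80, so the carry stops at the sign
 * bit rather than spilling into the next lane; xoring with
 * (a ^ b) & m == 0x80 then repairs the sign bit, giving 0x00, which is
 * (0xff + 0x01) & 0xff as desired.
 */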
1572
1573void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1574{
1575    TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
1576    gen_addv_mask(d, a, b, m);
1577    tcg_temp_free_i64(m);
1578}
1579
1580void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1581{
1582    TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
1583    gen_addv_mask(d, a, b, m);
1584    tcg_temp_free_i64(m);
1585}
1586
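/*
 * For 32-bit lanes the mask form above is not a win; instead compute
 * the 64-bit sum twice: once with a's low half cleared so the high
 * lane is exact, and once in full for the low lane, then merge the
 * two results with deposit.
 */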
1587void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1588{
1589    TCGv_i64 t1 = tcg_temp_new_i64();
1590    TCGv_i64 t2 = tcg_temp_new_i64();
1591
1592    tcg_gen_andi_i64(t1, a, ~0xffffffffull);
1593    tcg_gen_add_i64(t2, a, b);
1594    tcg_gen_add_i64(t1, t1, b);
1595    tcg_gen_deposit_i64(d, t1, t2, 0, 32);
1596
1597    tcg_temp_free_i64(t1);
1598    tcg_temp_free_i64(t2);
1599}
1600
1601static const TCGOpcode vecop_list_add[] = { INDEX_op_add_vec, 0 };
1602
1603void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs,
1604                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1605{
1606    static const GVecGen3 g[4] = {
1607        { .fni8 = tcg_gen_vec_add8_i64,
1608          .fniv = tcg_gen_add_vec,
1609          .fno = gen_helper_gvec_add8,
1610          .opt_opc = vecop_list_add,
1611          .vece = MO_8 },
1612        { .fni8 = tcg_gen_vec_add16_i64,
1613          .fniv = tcg_gen_add_vec,
1614          .fno = gen_helper_gvec_add16,
1615          .opt_opc = vecop_list_add,
1616          .vece = MO_16 },
1617        { .fni4 = tcg_gen_add_i32,
1618          .fniv = tcg_gen_add_vec,
1619          .fno = gen_helper_gvec_add32,
1620          .opt_opc = vecop_list_add,
1621          .vece = MO_32 },
1622        { .fni8 = tcg_gen_add_i64,
1623          .fniv = tcg_gen_add_vec,
1624          .fno = gen_helper_gvec_add64,
1625          .opt_opc = vecop_list_add,
1626          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1627          .vece = MO_64 },
1628    };
1629
1630    tcg_debug_assert(vece <= MO_64);
1631    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1632}
1633
1634void tcg_gen_gvec_adds(unsigned vece, uint32_t dofs, uint32_t aofs,
1635                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1636{
1637    static const GVecGen2s g[4] = {
1638        { .fni8 = tcg_gen_vec_add8_i64,
1639          .fniv = tcg_gen_add_vec,
1640          .fno = gen_helper_gvec_adds8,
1641          .opt_opc = vecop_list_add,
1642          .vece = MO_8 },
1643        { .fni8 = tcg_gen_vec_add16_i64,
1644          .fniv = tcg_gen_add_vec,
1645          .fno = gen_helper_gvec_adds16,
1646          .opt_opc = vecop_list_add,
1647          .vece = MO_16 },
1648        { .fni4 = tcg_gen_add_i32,
1649          .fniv = tcg_gen_add_vec,
1650          .fno = gen_helper_gvec_adds32,
1651          .opt_opc = vecop_list_add,
1652          .vece = MO_32 },
1653        { .fni8 = tcg_gen_add_i64,
1654          .fniv = tcg_gen_add_vec,
1655          .fno = gen_helper_gvec_adds64,
1656          .opt_opc = vecop_list_add,
1657          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1658          .vece = MO_64 },
1659    };
1660
1661    tcg_debug_assert(vece <= MO_64);
1662    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
1663}
1664
1665void tcg_gen_gvec_addi(unsigned vece, uint32_t dofs, uint32_t aofs,
1666                       int64_t c, uint32_t oprsz, uint32_t maxsz)
1667{
1668    TCGv_i64 tmp = tcg_const_i64(c);
1669    tcg_gen_gvec_adds(vece, dofs, aofs, tmp, oprsz, maxsz);
1670    tcg_temp_free_i64(tmp);
1671}
1672
1673static const TCGOpcode vecop_list_sub[] = { INDEX_op_sub_vec, 0 };
1674
1675void tcg_gen_gvec_subs(unsigned vece, uint32_t dofs, uint32_t aofs,
1676                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1677{
1678    static const GVecGen2s g[4] = {
1679        { .fni8 = tcg_gen_vec_sub8_i64,
1680          .fniv = tcg_gen_sub_vec,
1681          .fno = gen_helper_gvec_subs8,
1682          .opt_opc = vecop_list_sub,
1683          .vece = MO_8 },
1684        { .fni8 = tcg_gen_vec_sub16_i64,
1685          .fniv = tcg_gen_sub_vec,
1686          .fno = gen_helper_gvec_subs16,
1687          .opt_opc = vecop_list_sub,
1688          .vece = MO_16 },
1689        { .fni4 = tcg_gen_sub_i32,
1690          .fniv = tcg_gen_sub_vec,
1691          .fno = gen_helper_gvec_subs32,
1692          .opt_opc = vecop_list_sub,
1693          .vece = MO_32 },
1694        { .fni8 = tcg_gen_sub_i64,
1695          .fniv = tcg_gen_sub_vec,
1696          .fno = gen_helper_gvec_subs64,
1697          .opt_opc = vecop_list_sub,
1698          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1699          .vece = MO_64 },
1700    };
1701
1702    tcg_debug_assert(vece <= MO_64);
1703    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
1704}
1705
1706/* Perform a vector subtraction using normal subtraction and a mask.
1707   Compare gen_addv_mask above.  */
1708static void gen_subv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
1709{
1710    TCGv_i64 t1 = tcg_temp_new_i64();
1711    TCGv_i64 t2 = tcg_temp_new_i64();
1712    TCGv_i64 t3 = tcg_temp_new_i64();
1713
1714    tcg_gen_or_i64(t1, a, m);
1715    tcg_gen_andc_i64(t2, b, m);
1716    tcg_gen_eqv_i64(t3, a, b);
1717    tcg_gen_sub_i64(d, t1, t2);
1718    tcg_gen_and_i64(t3, t3, m);
1719    tcg_gen_xor_i64(d, d, t3);
1720
1721    tcg_temp_free_i64(t1);
1722    tcg_temp_free_i64(t2);
1723    tcg_temp_free_i64(t3);
1724}
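
/*
 * E.g. for MO_8, one lane with a = 0x00, b = 0x01: t1 = 0x80 and
 * t2 = 0x01, so t1 - t2 = 0x7f with no borrow leaving the lane;
 * ~(a ^ b) & m = 0x80 then corrects the msb, giving 0x7f ^ 0x80 =
 * 0xff, which is 0x00 - 0x01 mod 256.
 */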
1725
1726void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1727{
1728    TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
1729    gen_subv_mask(d, a, b, m);
1730    tcg_temp_free_i64(m);
1731}
1732
1733void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1734{
1735    TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
1736    gen_subv_mask(d, a, b, m);
1737    tcg_temp_free_i64(m);
1738}
1739
1740void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1741{
1742    TCGv_i64 t1 = tcg_temp_new_i64();
1743    TCGv_i64 t2 = tcg_temp_new_i64();
1744
1745    tcg_gen_andi_i64(t1, b, ~0xffffffffull);
1746    tcg_gen_sub_i64(t2, a, b);
1747    tcg_gen_sub_i64(t1, a, t1);
1748    tcg_gen_deposit_i64(d, t1, t2, 0, 32);
1749
1750    tcg_temp_free_i64(t1);
1751    tcg_temp_free_i64(t2);
1752}
1753
1754void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs,
1755                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1756{
1757    static const GVecGen3 g[4] = {
1758        { .fni8 = tcg_gen_vec_sub8_i64,
1759          .fniv = tcg_gen_sub_vec,
1760          .fno = gen_helper_gvec_sub8,
1761          .opt_opc = vecop_list_sub,
1762          .vece = MO_8 },
1763        { .fni8 = tcg_gen_vec_sub16_i64,
1764          .fniv = tcg_gen_sub_vec,
1765          .fno = gen_helper_gvec_sub16,
1766          .opt_opc = vecop_list_sub,
1767          .vece = MO_16 },
1768        { .fni4 = tcg_gen_sub_i32,
1769          .fniv = tcg_gen_sub_vec,
1770          .fno = gen_helper_gvec_sub32,
1771          .opt_opc = vecop_list_sub,
1772          .vece = MO_32 },
1773        { .fni8 = tcg_gen_sub_i64,
1774          .fniv = tcg_gen_sub_vec,
1775          .fno = gen_helper_gvec_sub64,
1776          .opt_opc = vecop_list_sub,
1777          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1778          .vece = MO_64 },
1779    };
1780
1781    tcg_debug_assert(vece <= MO_64);
1782    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1783}
1784
1785static const TCGOpcode vecop_list_mul[] = { INDEX_op_mul_vec, 0 };
1786
1787void tcg_gen_gvec_mul(unsigned vece, uint32_t dofs, uint32_t aofs,
1788                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1789{
1790    static const GVecGen3 g[4] = {
1791        { .fniv = tcg_gen_mul_vec,
1792          .fno = gen_helper_gvec_mul8,
1793          .opt_opc = vecop_list_mul,
1794          .vece = MO_8 },
1795        { .fniv = tcg_gen_mul_vec,
1796          .fno = gen_helper_gvec_mul16,
1797          .opt_opc = vecop_list_mul,
1798          .vece = MO_16 },
1799        { .fni4 = tcg_gen_mul_i32,
1800          .fniv = tcg_gen_mul_vec,
1801          .fno = gen_helper_gvec_mul32,
1802          .opt_opc = vecop_list_mul,
1803          .vece = MO_32 },
1804        { .fni8 = tcg_gen_mul_i64,
1805          .fniv = tcg_gen_mul_vec,
1806          .fno = gen_helper_gvec_mul64,
1807          .opt_opc = vecop_list_mul,
1808          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1809          .vece = MO_64 },
1810    };
1811
1812    tcg_debug_assert(vece <= MO_64);
1813    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1814}
1815
1816void tcg_gen_gvec_muls(unsigned vece, uint32_t dofs, uint32_t aofs,
1817                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1818{
1819    static const GVecGen2s g[4] = {
1820        { .fniv = tcg_gen_mul_vec,
1821          .fno = gen_helper_gvec_muls8,
1822          .opt_opc = vecop_list_mul,
1823          .vece = MO_8 },
1824        { .fniv = tcg_gen_mul_vec,
1825          .fno = gen_helper_gvec_muls16,
1826          .opt_opc = vecop_list_mul,
1827          .vece = MO_16 },
1828        { .fni4 = tcg_gen_mul_i32,
1829          .fniv = tcg_gen_mul_vec,
1830          .fno = gen_helper_gvec_muls32,
1831          .opt_opc = vecop_list_mul,
1832          .vece = MO_32 },
1833        { .fni8 = tcg_gen_mul_i64,
1834          .fniv = tcg_gen_mul_vec,
1835          .fno = gen_helper_gvec_muls64,
1836          .opt_opc = vecop_list_mul,
1837          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1838          .vece = MO_64 },
1839    };
1840
1841    tcg_debug_assert(vece <= MO_64);
1842    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
1843}
1844
1845void tcg_gen_gvec_muli(unsigned vece, uint32_t dofs, uint32_t aofs,
1846                       int64_t c, uint32_t oprsz, uint32_t maxsz)
1847{
1848    TCGv_i64 tmp = tcg_const_i64(c);
1849    tcg_gen_gvec_muls(vece, dofs, aofs, tmp, oprsz, maxsz);
1850    tcg_temp_free_i64(tmp);
1851}
1852
1853void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs,
1854                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1855{
1856    static const TCGOpcode vecop_list[] = { INDEX_op_ssadd_vec, 0 };
1857    static const GVecGen3 g[4] = {
1858        { .fniv = tcg_gen_ssadd_vec,
1859          .fno = gen_helper_gvec_ssadd8,
1860          .opt_opc = vecop_list,
1861          .vece = MO_8 },
1862        { .fniv = tcg_gen_ssadd_vec,
1863          .fno = gen_helper_gvec_ssadd16,
1864          .opt_opc = vecop_list,
1865          .vece = MO_16 },
1866        { .fniv = tcg_gen_ssadd_vec,
1867          .fno = gen_helper_gvec_ssadd32,
1868          .opt_opc = vecop_list,
1869          .vece = MO_32 },
1870        { .fniv = tcg_gen_ssadd_vec,
1871          .fno = gen_helper_gvec_ssadd64,
1872          .opt_opc = vecop_list,
1873          .vece = MO_64 },
1874    };
1875    tcg_debug_assert(vece <= MO_64);
1876    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1877}
1878
1879void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs,
1880                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1881{
1882    static const TCGOpcode vecop_list[] = { INDEX_op_sssub_vec, 0 };
1883    static const GVecGen3 g[4] = {
1884        { .fniv = tcg_gen_sssub_vec,
1885          .fno = gen_helper_gvec_sssub8,
1886          .opt_opc = vecop_list,
1887          .vece = MO_8 },
1888        { .fniv = tcg_gen_sssub_vec,
1889          .fno = gen_helper_gvec_sssub16,
1890          .opt_opc = vecop_list,
1891          .vece = MO_16 },
1892        { .fniv = tcg_gen_sssub_vec,
1893          .fno = gen_helper_gvec_sssub32,
1894          .opt_opc = vecop_list,
1895          .vece = MO_32 },
1896        { .fniv = tcg_gen_sssub_vec,
1897          .fno = gen_helper_gvec_sssub64,
1898          .opt_opc = vecop_list,
1899          .vece = MO_64 },
1900    };
1901    tcg_debug_assert(vece <= MO_64);
1902    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1903}
1904
1905static void tcg_gen_usadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1906{
1907    TCGv_i32 max = tcg_const_i32(-1);
1908    tcg_gen_add_i32(d, a, b);
1909    tcg_gen_movcond_i32(TCG_COND_LTU, d, d, a, max, d);
1910    tcg_temp_free_i32(max);
1911}
1912
1913static void tcg_gen_usadd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1914{
1915    TCGv_i64 max = tcg_const_i64(-1);
1916    tcg_gen_add_i64(d, a, b);
1917    tcg_gen_movcond_i64(TCG_COND_LTU, d, d, a, max, d);
1918    tcg_temp_free_i64(max);
1919}
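
/*
 * In both helpers above, unsigned overflow is detected by comparing
 * the sum with an addend: d < a iff the addition wrapped, in which
 * case movcond substitutes the all-ones maximum.
 */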
1920
1921void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs,
1922                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1923{
1924    static const TCGOpcode vecop_list[] = { INDEX_op_usadd_vec, 0 };
1925    static const GVecGen3 g[4] = {
1926        { .fniv = tcg_gen_usadd_vec,
1927          .fno = gen_helper_gvec_usadd8,
1928          .opt_opc = vecop_list,
1929          .vece = MO_8 },
1930        { .fniv = tcg_gen_usadd_vec,
1931          .fno = gen_helper_gvec_usadd16,
1932          .opt_opc = vecop_list,
1933          .vece = MO_16 },
1934        { .fni4 = tcg_gen_usadd_i32,
1935          .fniv = tcg_gen_usadd_vec,
1936          .fno = gen_helper_gvec_usadd32,
1937          .opt_opc = vecop_list,
1938          .vece = MO_32 },
1939        { .fni8 = tcg_gen_usadd_i64,
1940          .fniv = tcg_gen_usadd_vec,
1941          .fno = gen_helper_gvec_usadd64,
1942          .opt_opc = vecop_list,
1943          .vece = MO_64 }
1944    };
1945    tcg_debug_assert(vece <= MO_64);
1946    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1947}
1948
1949static void tcg_gen_ussub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1950{
1951    TCGv_i32 min = tcg_const_i32(0);
1952    tcg_gen_sub_i32(d, a, b);
1953    tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, min, d);
1954    tcg_temp_free_i32(min);
1955}
1956
1957static void tcg_gen_ussub_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1958{
1959    TCGv_i64 min = tcg_const_i64(0);
1960    tcg_gen_sub_i64(d, a, b);
1961    tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, min, d);
1962    tcg_temp_free_i64(min);
1963}
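
/*
 * Likewise for the saturating subtractions: a < b iff the difference
 * would be negative, in which case movcond clamps the result to zero.
 */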
1964
1965void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs,
1966                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1967{
1968    static const TCGOpcode vecop_list[] = { INDEX_op_ussub_vec, 0 };
1969    static const GVecGen3 g[4] = {
1970        { .fniv = tcg_gen_ussub_vec,
1971          .fno = gen_helper_gvec_ussub8,
1972          .opt_opc = vecop_list,
1973          .vece = MO_8 },
1974        { .fniv = tcg_gen_ussub_vec,
1975          .fno = gen_helper_gvec_ussub16,
1976          .opt_opc = vecop_list,
1977          .vece = MO_16 },
1978        { .fni4 = tcg_gen_ussub_i32,
1979          .fniv = tcg_gen_ussub_vec,
1980          .fno = gen_helper_gvec_ussub32,
1981          .opt_opc = vecop_list,
1982          .vece = MO_32 },
1983        { .fni8 = tcg_gen_ussub_i64,
1984          .fniv = tcg_gen_ussub_vec,
1985          .fno = gen_helper_gvec_ussub64,
1986          .opt_opc = vecop_list,
1987          .vece = MO_64 }
1988    };
1989    tcg_debug_assert(vece <= MO_64);
1990    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1991}
1992
1993void tcg_gen_gvec_smin(unsigned vece, uint32_t dofs, uint32_t aofs,
1994                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1995{
1996    static const TCGOpcode vecop_list[] = { INDEX_op_smin_vec, 0 };
1997    static const GVecGen3 g[4] = {
1998        { .fniv = tcg_gen_smin_vec,
1999          .fno = gen_helper_gvec_smin8,
2000          .opt_opc = vecop_list,
2001          .vece = MO_8 },
2002        { .fniv = tcg_gen_smin_vec,
2003          .fno = gen_helper_gvec_smin16,
2004          .opt_opc = vecop_list,
2005          .vece = MO_16 },
2006        { .fni4 = tcg_gen_smin_i32,
2007          .fniv = tcg_gen_smin_vec,
2008          .fno = gen_helper_gvec_smin32,
2009          .opt_opc = vecop_list,
2010          .vece = MO_32 },
2011        { .fni8 = tcg_gen_smin_i64,
2012          .fniv = tcg_gen_smin_vec,
2013          .fno = gen_helper_gvec_smin64,
2014          .opt_opc = vecop_list,
2015          .vece = MO_64 }
2016    };
2017    tcg_debug_assert(vece <= MO_64);
2018    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2019}
2020
2021void tcg_gen_gvec_umin(unsigned vece, uint32_t dofs, uint32_t aofs,
2022                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2023{
2024    static const TCGOpcode vecop_list[] = { INDEX_op_umin_vec, 0 };
2025    static const GVecGen3 g[4] = {
2026        { .fniv = tcg_gen_umin_vec,
2027          .fno = gen_helper_gvec_umin8,
2028          .opt_opc = vecop_list,
2029          .vece = MO_8 },
2030        { .fniv = tcg_gen_umin_vec,
2031          .fno = gen_helper_gvec_umin16,
2032          .opt_opc = vecop_list,
2033          .vece = MO_16 },
2034        { .fni4 = tcg_gen_umin_i32,
2035          .fniv = tcg_gen_umin_vec,
2036          .fno = gen_helper_gvec_umin32,
2037          .opt_opc = vecop_list,
2038          .vece = MO_32 },
2039        { .fni8 = tcg_gen_umin_i64,
2040          .fniv = tcg_gen_umin_vec,
2041          .fno = gen_helper_gvec_umin64,
2042          .opt_opc = vecop_list,
2043          .vece = MO_64 }
2044    };
2045    tcg_debug_assert(vece <= MO_64);
2046    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2047}
2048
2049void tcg_gen_gvec_smax(unsigned vece, uint32_t dofs, uint32_t aofs,
2050                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2051{
2052    static const TCGOpcode vecop_list[] = { INDEX_op_smax_vec, 0 };
2053    static const GVecGen3 g[4] = {
2054        { .fniv = tcg_gen_smax_vec,
2055          .fno = gen_helper_gvec_smax8,
2056          .opt_opc = vecop_list,
2057          .vece = MO_8 },
2058        { .fniv = tcg_gen_smax_vec,
2059          .fno = gen_helper_gvec_smax16,
2060          .opt_opc = vecop_list,
2061          .vece = MO_16 },
2062        { .fni4 = tcg_gen_smax_i32,
2063          .fniv = tcg_gen_smax_vec,
2064          .fno = gen_helper_gvec_smax32,
2065          .opt_opc = vecop_list,
2066          .vece = MO_32 },
2067        { .fni8 = tcg_gen_smax_i64,
2068          .fniv = tcg_gen_smax_vec,
2069          .fno = gen_helper_gvec_smax64,
2070          .opt_opc = vecop_list,
2071          .vece = MO_64 }
2072    };
2073    tcg_debug_assert(vece <= MO_64);
2074    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2075}
2076
2077void tcg_gen_gvec_umax(unsigned vece, uint32_t dofs, uint32_t aofs,
2078                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2079{
2080    static const TCGOpcode vecop_list[] = { INDEX_op_umax_vec, 0 };
2081    static const GVecGen3 g[4] = {
2082        { .fniv = tcg_gen_umax_vec,
2083          .fno = gen_helper_gvec_umax8,
2084          .opt_opc = vecop_list,
2085          .vece = MO_8 },
2086        { .fniv = tcg_gen_umax_vec,
2087          .fno = gen_helper_gvec_umax16,
2088          .opt_opc = vecop_list,
2089          .vece = MO_16 },
2090        { .fni4 = tcg_gen_umax_i32,
2091          .fniv = tcg_gen_umax_vec,
2092          .fno = gen_helper_gvec_umax32,
2093          .opt_opc = vecop_list,
2094          .vece = MO_32 },
2095        { .fni8 = tcg_gen_umax_i64,
2096          .fniv = tcg_gen_umax_vec,
2097          .fno = gen_helper_gvec_umax64,
2098          .opt_opc = vecop_list,
2099          .vece = MO_64 }
2100    };
2101    tcg_debug_assert(vece <= MO_64);
2102    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2103}
2104
2105/* Perform a vector negation using normal negation and a mask.
2106   Compare gen_subv_mask above.  */
2107static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m)
2108{
2109    TCGv_i64 t2 = tcg_temp_new_i64();
2110    TCGv_i64 t3 = tcg_temp_new_i64();
2111
2112    tcg_gen_andc_i64(t3, m, b);
2113    tcg_gen_andc_i64(t2, b, m);
2114    tcg_gen_sub_i64(d, m, t2);
2115    tcg_gen_xor_i64(d, d, t3);
2116
2117    tcg_temp_free_i64(t2);
2118    tcg_temp_free_i64(t3);
2119}
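
/*
 * Since b & ~m is at most 0x7f per MO_8 lane, the per-lane
 * subtraction from m = 0x80 can never borrow from the lane above.
 * E.g. for b = 0x01 in one lane: t2 = 0x01, t3 = 0x80, and
 * (0x80 - 0x01) ^ 0x80 = 0x7f ^ 0x80 = 0xff = -1.
 */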
2120
2121void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 b)
2122{
2123    TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
2124    gen_negv_mask(d, b, m);
2125    tcg_temp_free_i64(m);
2126}
2127
2128void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 b)
2129{
2130    TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
2131    gen_negv_mask(d, b, m);
2132    tcg_temp_free_i64(m);
2133}
2134
2135void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 b)
2136{
2137    TCGv_i64 t1 = tcg_temp_new_i64();
2138    TCGv_i64 t2 = tcg_temp_new_i64();
2139
2140    tcg_gen_andi_i64(t1, b, ~0xffffffffull);
2141    tcg_gen_neg_i64(t2, b);
2142    tcg_gen_neg_i64(t1, t1);
2143    tcg_gen_deposit_i64(d, t1, t2, 0, 32);
2144
2145    tcg_temp_free_i64(t1);
2146    tcg_temp_free_i64(t2);
2147}
2148
2149void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs,
2150                      uint32_t oprsz, uint32_t maxsz)
2151{
2152    static const TCGOpcode vecop_list[] = { INDEX_op_neg_vec, 0 };
2153    static const GVecGen2 g[4] = {
2154        { .fni8 = tcg_gen_vec_neg8_i64,
2155          .fniv = tcg_gen_neg_vec,
2156          .fno = gen_helper_gvec_neg8,
2157          .opt_opc = vecop_list,
2158          .vece = MO_8 },
2159        { .fni8 = tcg_gen_vec_neg16_i64,
2160          .fniv = tcg_gen_neg_vec,
2161          .fno = gen_helper_gvec_neg16,
2162          .opt_opc = vecop_list,
2163          .vece = MO_16 },
2164        { .fni4 = tcg_gen_neg_i32,
2165          .fniv = tcg_gen_neg_vec,
2166          .fno = gen_helper_gvec_neg32,
2167          .opt_opc = vecop_list,
2168          .vece = MO_32 },
2169        { .fni8 = tcg_gen_neg_i64,
2170          .fniv = tcg_gen_neg_vec,
2171          .fno = gen_helper_gvec_neg64,
2172          .opt_opc = vecop_list,
2173          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2174          .vece = MO_64 },
2175    };
2176
2177    tcg_debug_assert(vece <= MO_64);
2178    tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
2179}
2180
2181static void gen_absv_mask(TCGv_i64 d, TCGv_i64 b, unsigned vece)
2182{
2183    TCGv_i64 t = tcg_temp_new_i64();
2184    int nbit = 8 << vece;
2185
2186    /* Create -1 for each negative element.  */
2187    tcg_gen_shri_i64(t, b, nbit - 1);
2188    tcg_gen_andi_i64(t, t, dup_const(vece, 1));
2189    tcg_gen_muli_i64(t, t, (1 << nbit) - 1);
2190
2191    /*
2192     * Invert the negative lanes (via xor -1) and add one to each;
2193     * the xor clears each lane's msb, so the add can never carry.
2194     */
2195    tcg_gen_xor_i64(d, b, t);
2196    tcg_gen_andi_i64(t, t, dup_const(vece, 1));
2197    tcg_gen_add_i64(d, d, t);
2198
2199    tcg_temp_free_i64(t);
2200}
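
/*
 * E.g. for MO_8 with b = 0x80 (-128) in one lane: t = 0xff for that
 * lane, b ^ t = 0x7f, and the per-lane +1 gives 0x80, i.e. abs(-128)
 * modulo 256.  A plain 64-bit subtraction of the 0xff mask here would
 * instead borrow out of every negative lane and corrupt the lane
 * above it.
 */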
2201
2202static void tcg_gen_vec_abs8_i64(TCGv_i64 d, TCGv_i64 b)
2203{
2204    gen_absv_mask(d, b, MO_8);
2205}
2206
2207static void tcg_gen_vec_abs16_i64(TCGv_i64 d, TCGv_i64 b)
2208{
2209    gen_absv_mask(d, b, MO_16);
2210}
2211
2212void tcg_gen_gvec_abs(unsigned vece, uint32_t dofs, uint32_t aofs,
2213                      uint32_t oprsz, uint32_t maxsz)
2214{
2215    static const TCGOpcode vecop_list[] = { INDEX_op_abs_vec, 0 };
2216    static const GVecGen2 g[4] = {
2217        { .fni8 = tcg_gen_vec_abs8_i64,
2218          .fniv = tcg_gen_abs_vec,
2219          .fno = gen_helper_gvec_abs8,
2220          .opt_opc = vecop_list,
2221          .vece = MO_8 },
2222        { .fni8 = tcg_gen_vec_abs16_i64,
2223          .fniv = tcg_gen_abs_vec,
2224          .fno = gen_helper_gvec_abs16,
2225          .opt_opc = vecop_list,
2226          .vece = MO_16 },
2227        { .fni4 = tcg_gen_abs_i32,
2228          .fniv = tcg_gen_abs_vec,
2229          .fno = gen_helper_gvec_abs32,
2230          .opt_opc = vecop_list,
2231          .vece = MO_32 },
2232        { .fni8 = tcg_gen_abs_i64,
2233          .fniv = tcg_gen_abs_vec,
2234          .fno = gen_helper_gvec_abs64,
2235          .opt_opc = vecop_list,
2236          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2237          .vece = MO_64 },
2238    };
2239
2240    tcg_debug_assert(vece <= MO_64);
2241    tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
2242}
2243
2244void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs,
2245                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2246{
2247    static const GVecGen3 g = {
2248        .fni8 = tcg_gen_and_i64,
2249        .fniv = tcg_gen_and_vec,
2250        .fno = gen_helper_gvec_and,
2251        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2252    };
2253
2254    if (aofs == bofs) {
2255        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2256    } else {
2257        tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2258    }
2259}
2260
2261void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs,
2262                     uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2263{
2264    static const GVecGen3 g = {
2265        .fni8 = tcg_gen_or_i64,
2266        .fniv = tcg_gen_or_vec,
2267        .fno = gen_helper_gvec_or,
2268        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2269    };
2270
2271    if (aofs == bofs) {
2272        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2273    } else {
2274        tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2275    }
2276}
2277
2278void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs,
2279                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2280{
2281    static const GVecGen3 g = {
2282        .fni8 = tcg_gen_xor_i64,
2283        .fniv = tcg_gen_xor_vec,
2284        .fno = gen_helper_gvec_xor,
2285        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2286    };
2287
2288    if (aofs == bofs) {
2289        tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, 0);
2290    } else {
2291        tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2292    }
2293}
2294
2295void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs,
2296                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2297{
2298    static const GVecGen3 g = {
2299        .fni8 = tcg_gen_andc_i64,
2300        .fniv = tcg_gen_andc_vec,
2301        .fno = gen_helper_gvec_andc,
2302        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2303    };
2304
2305    if (aofs == bofs) {
2306        tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, 0);
2307    } else {
2308        tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2309    }
2310}
2311
2312void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs,
2313                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2314{
2315    static const GVecGen3 g = {
2316        .fni8 = tcg_gen_orc_i64,
2317        .fniv = tcg_gen_orc_vec,
2318        .fno = gen_helper_gvec_orc,
2319        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2320    };
2321
2322    if (aofs == bofs) {
2323        tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, -1);
2324    } else {
2325        tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2326    }
2327}
2328
2329void tcg_gen_gvec_nand(unsigned vece, uint32_t dofs, uint32_t aofs,
2330                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2331{
2332    static const GVecGen3 g = {
2333        .fni8 = tcg_gen_nand_i64,
2334        .fniv = tcg_gen_nand_vec,
2335        .fno = gen_helper_gvec_nand,
2336        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2337    };
2338
2339    if (aofs == bofs) {
2340        tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz);
2341    } else {
2342        tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2343    }
2344}
2345
2346void tcg_gen_gvec_nor(unsigned vece, uint32_t dofs, uint32_t aofs,
2347                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2348{
2349    static const GVecGen3 g = {
2350        .fni8 = tcg_gen_nor_i64,
2351        .fniv = tcg_gen_nor_vec,
2352        .fno = gen_helper_gvec_nor,
2353        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2354    };
2355
2356    if (aofs == bofs) {
2357        tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz);
2358    } else {
2359        tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2360    }
2361}
2362
2363void tcg_gen_gvec_eqv(unsigned vece, uint32_t dofs, uint32_t aofs,
2364                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2365{
2366    static const GVecGen3 g = {
2367        .fni8 = tcg_gen_eqv_i64,
2368        .fniv = tcg_gen_eqv_vec,
2369        .fno = gen_helper_gvec_eqv,
2370        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2371    };
2372
2373    if (aofs == bofs) {
2374        tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, -1);
2375    } else {
2376        tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2377    }
2378}
2379
2380static const GVecGen2s gop_ands = {
2381    .fni8 = tcg_gen_and_i64,
2382    .fniv = tcg_gen_and_vec,
2383    .fno = gen_helper_gvec_ands,
2384    .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2385    .vece = MO_64
2386};
2387
2388void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs,
2389                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2390{
2391    TCGv_i64 tmp = tcg_temp_new_i64();
2392    gen_dup_i64(vece, tmp, c);
2393    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
2394    tcg_temp_free_i64(tmp);
2395}
2396
2397void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs,
2398                       int64_t c, uint32_t oprsz, uint32_t maxsz)
2399{
2400    TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
2401    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
2402    tcg_temp_free_i64(tmp);
2403}
2404
2405static const GVecGen2s gop_xors = {
2406    .fni8 = tcg_gen_xor_i64,
2407    .fniv = tcg_gen_xor_vec,
2408    .fno = gen_helper_gvec_xors,
2409    .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2410    .vece = MO_64
2411};
2412
2413void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs,
2414                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2415{
2416    TCGv_i64 tmp = tcg_temp_new_i64();
2417    gen_dup_i64(vece, tmp, c);
2418    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
2419    tcg_temp_free_i64(tmp);
2420}
2421
2422void tcg_gen_gvec_xori(unsigned vece, uint32_t dofs, uint32_t aofs,
2423                       int64_t c, uint32_t oprsz, uint32_t maxsz)
2424{
2425    TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
2426    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
2427    tcg_temp_free_i64(tmp);
2428}
2429
2430static const GVecGen2s gop_ors = {
2431    .fni8 = tcg_gen_or_i64,
2432    .fniv = tcg_gen_or_vec,
2433    .fno = gen_helper_gvec_ors,
2434    .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2435    .vece = MO_64
2436};
2437
2438void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs,
2439                      TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2440{
2441    TCGv_i64 tmp = tcg_temp_new_i64();
2442    gen_dup_i64(vece, tmp, c);
2443    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
2444    tcg_temp_free_i64(tmp);
2445}
2446
2447void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs,
2448                      int64_t c, uint32_t oprsz, uint32_t maxsz)
2449{
2450    TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
2451    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
2452    tcg_temp_free_i64(tmp);
2453}
2454
2455void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2456{
2457    uint64_t mask = dup_const(MO_8, 0xff << c);
2458    tcg_gen_shli_i64(d, a, c);
2459    tcg_gen_andi_i64(d, d, mask);
2460}
2461
2462void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2463{
2464    uint64_t mask = dup_const(MO_16, 0xffff << c);
2465    tcg_gen_shli_i64(d, a, c);
2466    tcg_gen_andi_i64(d, d, mask);
2467}
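
/*
 * In both forms above, the shift is performed on the full 64 bits and
 * the mask then clears the bits each lane received from the lane
 * below; e.g. for MO_8 and c = 4 the mask is 0xf0 in every byte.
 */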
2468
2469void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs,
2470                       int64_t shift, uint32_t oprsz, uint32_t maxsz)
2471{
2472    static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 };
2473    static const GVecGen2i g[4] = {
2474        { .fni8 = tcg_gen_vec_shl8i_i64,
2475          .fniv = tcg_gen_shli_vec,
2476          .fno = gen_helper_gvec_shl8i,
2477          .opt_opc = vecop_list,
2478          .vece = MO_8 },
2479        { .fni8 = tcg_gen_vec_shl16i_i64,
2480          .fniv = tcg_gen_shli_vec,
2481          .fno = gen_helper_gvec_shl16i,
2482          .opt_opc = vecop_list,
2483          .vece = MO_16 },
2484        { .fni4 = tcg_gen_shli_i32,
2485          .fniv = tcg_gen_shli_vec,
2486          .fno = gen_helper_gvec_shl32i,
2487          .opt_opc = vecop_list,
2488          .vece = MO_32 },
2489        { .fni8 = tcg_gen_shli_i64,
2490          .fniv = tcg_gen_shli_vec,
2491          .fno = gen_helper_gvec_shl64i,
2492          .opt_opc = vecop_list,
2493          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2494          .vece = MO_64 },
2495    };
2496
2497    tcg_debug_assert(vece <= MO_64);
2498    tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2499    if (shift == 0) {
2500        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2501    } else {
2502        tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2503    }
2504}
2505
2506void tcg_gen_vec_shr8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2507{
2508    uint64_t mask = dup_const(MO_8, 0xff >> c);
2509    tcg_gen_shri_i64(d, a, c);
2510    tcg_gen_andi_i64(d, d, mask);
2511}
2512
2513void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2514{
2515    uint64_t mask = dup_const(MO_16, 0xffff >> c);
2516    tcg_gen_shri_i64(d, a, c);
2517    tcg_gen_andi_i64(d, d, mask);
2518}
2519
2520void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs,
2521                       int64_t shift, uint32_t oprsz, uint32_t maxsz)
2522{
2523    static const TCGOpcode vecop_list[] = { INDEX_op_shri_vec, 0 };
2524    static const GVecGen2i g[4] = {
2525        { .fni8 = tcg_gen_vec_shr8i_i64,
2526          .fniv = tcg_gen_shri_vec,
2527          .fno = gen_helper_gvec_shr8i,
2528          .opt_opc = vecop_list,
2529          .vece = MO_8 },
2530        { .fni8 = tcg_gen_vec_shr16i_i64,
2531          .fniv = tcg_gen_shri_vec,
2532          .fno = gen_helper_gvec_shr16i,
2533          .opt_opc = vecop_list,
2534          .vece = MO_16 },
2535        { .fni4 = tcg_gen_shri_i32,
2536          .fniv = tcg_gen_shri_vec,
2537          .fno = gen_helper_gvec_shr32i,
2538          .opt_opc = vecop_list,
2539          .vece = MO_32 },
2540        { .fni8 = tcg_gen_shri_i64,
2541          .fniv = tcg_gen_shri_vec,
2542          .fno = gen_helper_gvec_shr64i,
2543          .opt_opc = vecop_list,
2544          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2545          .vece = MO_64 },
2546    };
2547
2548    tcg_debug_assert(vece <= MO_64);
2549    tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2550    if (shift == 0) {
2551        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2552    } else {
2553        tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2554    }
2555}
2556
2557void tcg_gen_vec_sar8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2558{
2559    uint64_t s_mask = dup_const(MO_8, 0x80 >> c);
2560    uint64_t c_mask = dup_const(MO_8, 0xff >> c);
2561    TCGv_i64 s = tcg_temp_new_i64();
2562
2563    tcg_gen_shri_i64(d, a, c);
2564    tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
2565    tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
2566    tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign  */
2567    tcg_gen_or_i64(d, d, s);         /* include sign extension */
2568    tcg_temp_free_i64(s);
2569}
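
/*
 * E.g. with c = 1 and one MO_8 lane of a = 0x80: d = 0x40 after the
 * logical shift, s = 0x40 isolates the shifted sign bit, and
 * multiplying by (2 << 1) - 2 = 2 gives 0x80, so d = 0x40 | 0x80 =
 * 0xc0, the arithmetic -128 >> 1.
 */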
2570
2571void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2572{
2573    uint64_t s_mask = dup_const(MO_16, 0x8000 >> c);
2574    uint64_t c_mask = dup_const(MO_16, 0xffff >> c);
2575    TCGv_i64 s = tcg_temp_new_i64();
2576
2577    tcg_gen_shri_i64(d, a, c);
2578    tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
2579    tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign  */
2580    tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
2581    tcg_gen_or_i64(d, d, s);         /* include sign extension */
2582    tcg_temp_free_i64(s);
2583}
2584
2585void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs,
2586                       int64_t shift, uint32_t oprsz, uint32_t maxsz)
2587{
2588    static const TCGOpcode vecop_list[] = { INDEX_op_sari_vec, 0 };
2589    static const GVecGen2i g[4] = {
2590        { .fni8 = tcg_gen_vec_sar8i_i64,
2591          .fniv = tcg_gen_sari_vec,
2592          .fno = gen_helper_gvec_sar8i,
2593          .opt_opc = vecop_list,
2594          .vece = MO_8 },
2595        { .fni8 = tcg_gen_vec_sar16i_i64,
2596          .fniv = tcg_gen_sari_vec,
2597          .fno = gen_helper_gvec_sar16i,
2598          .opt_opc = vecop_list,
2599          .vece = MO_16 },
2600        { .fni4 = tcg_gen_sari_i32,
2601          .fniv = tcg_gen_sari_vec,
2602          .fno = gen_helper_gvec_sar32i,
2603          .opt_opc = vecop_list,
2604          .vece = MO_32 },
2605        { .fni8 = tcg_gen_sari_i64,
2606          .fniv = tcg_gen_sari_vec,
2607          .fno = gen_helper_gvec_sar64i,
2608          .opt_opc = vecop_list,
2609          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2610          .vece = MO_64 },
2611    };
2612
2613    tcg_debug_assert(vece <= MO_64);
2614    tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2615    if (shift == 0) {
2616        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2617    } else {
2618        tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2619    }
2620}
2621
2622/*
2623 * Specialized generation of vector shifts by a non-constant scalar.
2624 */
2625
2626typedef struct {
2627    void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32);
2628    void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64);
2629    void (*fniv_s)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32);
2630    void (*fniv_v)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec);
2631    gen_helper_gvec_2 *fno[4];
2632    TCGOpcode s_list[2];
2633    TCGOpcode v_list[2];
2634} GVecGen2sh;
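
/*
 * fno is indexed by vece.  s_list and v_list name the opcodes needed
 * by the vector-by-scalar and vector-by-vector expansions, so that
 * do_gvec_shifts below can probe backend support for each in turn.
 */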
2635
2636static void expand_2sh_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
2637                           uint32_t oprsz, uint32_t tysz, TCGType type,
2638                           TCGv_i32 shift,
2639                           void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32))
2640{
2641    TCGv_vec t0 = tcg_temp_new_vec(type);
2642    uint32_t i;
2643
2644    for (i = 0; i < oprsz; i += tysz) {
2645        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
2646        fni(vece, t0, t0, shift);
2647        tcg_gen_st_vec(t0, cpu_env, dofs + i);
2648    }
2649    tcg_temp_free_vec(t0);
2650}
2651
2652static void
2653do_gvec_shifts(unsigned vece, uint32_t dofs, uint32_t aofs, TCGv_i32 shift,
2654               uint32_t oprsz, uint32_t maxsz, const GVecGen2sh *g)
2655{
2656    TCGType type;
2657    uint32_t some;
2658
2659    check_size_align(oprsz, maxsz, dofs | aofs);
2660    check_overlap_2(dofs, aofs, maxsz);
2661
2662    /* If the backend has a scalar expansion, great.  */
2663    type = choose_vector_type(g->s_list, vece, oprsz, vece == MO_64);
2664    if (type) {
2665        const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);
2666        switch (type) {
2667        case TCG_TYPE_V256:
2668            some = QEMU_ALIGN_DOWN(oprsz, 32);
2669            expand_2sh_vec(vece, dofs, aofs, some, 32,
2670                           TCG_TYPE_V256, shift, g->fniv_s);
2671            if (some == oprsz) {
2672                break;
2673            }
2674            dofs += some;
2675            aofs += some;
2676            oprsz -= some;
2677            maxsz -= some;
2678            /* fallthru */
2679        case TCG_TYPE_V128:
2680            expand_2sh_vec(vece, dofs, aofs, oprsz, 16,
2681                           TCG_TYPE_V128, shift, g->fniv_s);
2682            break;
2683        case TCG_TYPE_V64:
2684            expand_2sh_vec(vece, dofs, aofs, oprsz, 8,
2685                           TCG_TYPE_V64, shift, g->fniv_s);
2686            break;
2687        default:
2688            g_assert_not_reached();
2689        }
2690        tcg_swap_vecop_list(hold_list);
2691        goto clear_tail;
2692    }
2693
2694    /* If the backend supports variable vector shifts, also cool.  */
2695    type = choose_vector_type(g->v_list, vece, oprsz, vece == MO_64);
2696    if (type) {
2697        const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);
2698        TCGv_vec v_shift = tcg_temp_new_vec(type);
2699
2700        if (vece == MO_64) {
2701            TCGv_i64 sh64 = tcg_temp_new_i64();
2702            tcg_gen_extu_i32_i64(sh64, shift);
2703            tcg_gen_dup_i64_vec(MO_64, v_shift, sh64);
2704            tcg_temp_free_i64(sh64);
2705        } else {
2706            tcg_gen_dup_i32_vec(vece, v_shift, shift);
2707        }
2708
2709        switch (type) {
2710        case TCG_TYPE_V256:
2711            some = QEMU_ALIGN_DOWN(oprsz, 32);
2712            expand_2s_vec(vece, dofs, aofs, some, 32, TCG_TYPE_V256,
2713                          v_shift, false, g->fniv_v);
2714            if (some == oprsz) {
2715                break;
2716            }
2717            dofs += some;
2718            aofs += some;
2719            oprsz -= some;
2720            maxsz -= some;
2721            /* fallthru */
2722        case TCG_TYPE_V128:
2723            expand_2s_vec(vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
2724                          v_shift, false, g->fniv_v);
2725            break;
2726        case TCG_TYPE_V64:
2727            expand_2s_vec(vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
2728                          v_shift, false, g->fniv_v);
2729            break;
2730        default:
2731            g_assert_not_reached();
2732        }
2733        tcg_temp_free_vec(v_shift);
2734        tcg_swap_vecop_list(hold_list);
2735        goto clear_tail;
2736    }
2737
2738    /* Otherwise fall back to integral... */
2739    if (vece == MO_32 && check_size_impl(oprsz, 4)) {
2740        expand_2s_i32(dofs, aofs, oprsz, shift, false, g->fni4);
2741    } else if (vece == MO_64 && check_size_impl(oprsz, 8)) {
2742        TCGv_i64 sh64 = tcg_temp_new_i64();
2743        tcg_gen_extu_i32_i64(sh64, shift);
2744        expand_2s_i64(dofs, aofs, oprsz, sh64, false, g->fni8);
2745        tcg_temp_free_i64(sh64);
2746    } else {
2747        TCGv_ptr a0 = tcg_temp_new_ptr();
2748        TCGv_ptr a1 = tcg_temp_new_ptr();
2749        TCGv_i32 desc = tcg_temp_new_i32();
2750
2751        tcg_gen_shli_i32(desc, shift, SIMD_DATA_SHIFT);
2752        tcg_gen_ori_i32(desc, desc, simd_desc(oprsz, maxsz, 0));
2753        tcg_gen_addi_ptr(a0, cpu_env, dofs);
2754        tcg_gen_addi_ptr(a1, cpu_env, aofs);
2755
2756        g->fno[vece](a0, a1, desc);
2757
2758        tcg_temp_free_ptr(a0);
2759        tcg_temp_free_ptr(a1);
2760        tcg_temp_free_i32(desc);
2761        return;
2762    }
2763
2764 clear_tail:
2765    if (oprsz < maxsz) {
2766        expand_clr(dofs + oprsz, maxsz - oprsz);
2767    }
2768}
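
/*
 * Note that the out-of-line fallback above folds the runtime shift
 * count into the SIMD_DATA field of the descriptor (shli + ori),
 * which is why it can reuse the immediate-shift helpers in fno.
 */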
2769
2770void tcg_gen_gvec_shls(unsigned vece, uint32_t dofs, uint32_t aofs,
2771                       TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
2772{
2773    static const GVecGen2sh g = {
2774        .fni4 = tcg_gen_shl_i32,
2775        .fni8 = tcg_gen_shl_i64,
2776        .fniv_s = tcg_gen_shls_vec,
2777        .fniv_v = tcg_gen_shlv_vec,
2778        .fno = {
2779            gen_helper_gvec_shl8i,
2780            gen_helper_gvec_shl16i,
2781            gen_helper_gvec_shl32i,
2782            gen_helper_gvec_shl64i,
2783        },
2784        .s_list = { INDEX_op_shls_vec, 0 },
2785        .v_list = { INDEX_op_shlv_vec, 0 },
2786    };
2787
2788    tcg_debug_assert(vece <= MO_64);
2789    do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
2790}
2791
2792void tcg_gen_gvec_shrs(unsigned vece, uint32_t dofs, uint32_t aofs,
2793                       TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
2794{
2795    static const GVecGen2sh g = {
2796        .fni4 = tcg_gen_shr_i32,
2797        .fni8 = tcg_gen_shr_i64,
2798        .fniv_s = tcg_gen_shrs_vec,
2799        .fniv_v = tcg_gen_shrv_vec,
2800        .fno = {
2801            gen_helper_gvec_shr8i,
2802            gen_helper_gvec_shr16i,
2803            gen_helper_gvec_shr32i,
2804            gen_helper_gvec_shr64i,
2805        },
2806        .s_list = { INDEX_op_shrs_vec, 0 },
2807        .v_list = { INDEX_op_shrv_vec, 0 },
2808    };
2809
2810    tcg_debug_assert(vece <= MO_64);
2811    do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
2812}
2813
2814void tcg_gen_gvec_sars(unsigned vece, uint32_t dofs, uint32_t aofs,
2815                       TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
2816{
2817    static const GVecGen2sh g = {
2818        .fni4 = tcg_gen_sar_i32,
2819        .fni8 = tcg_gen_sar_i64,
2820        .fniv_s = tcg_gen_sars_vec,
2821        .fniv_v = tcg_gen_sarv_vec,
2822        .fno = {
2823            gen_helper_gvec_sar8i,
2824            gen_helper_gvec_sar16i,
2825            gen_helper_gvec_sar32i,
2826            gen_helper_gvec_sar64i,
2827        },
2828        .s_list = { INDEX_op_sars_vec, 0 },
2829        .v_list = { INDEX_op_sarv_vec, 0 },
2830    };
2831
2832    tcg_debug_assert(vece <= MO_64);
2833    do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
2834}
2835
2836/*
2837 * Expand D = A << (B % element bits)
2838 *
2839 * Unlike scalar shifts, it is not easy for the target front end
2840 * to fold this modulo into the shift expansion itself.  If the
2841 * target naturally includes the modulo as part of the operation,
2842 * great!  If the target has some other behaviour for out-of-range
2843 * shifts, then it could not use this function anyway, and would
2844 * need to do its own expansion with custom functions.
2845 */
2846static void tcg_gen_shlv_mod_vec(unsigned vece, TCGv_vec d,
2847                                 TCGv_vec a, TCGv_vec b)
2848{
2849    TCGv_vec t = tcg_temp_new_vec_matching(d);
2850
2851    tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
2852    tcg_gen_and_vec(vece, t, t, b);
2853    tcg_gen_shlv_vec(vece, d, a, t);
2854    tcg_temp_free_vec(t);
2855}
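
/*
 * E.g. for MO_16 the modulo mask is 15, so a lane of b holding 18
 * shifts the corresponding lane of a left by 2.
 */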
2856
2857static void tcg_gen_shl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2858{
2859    TCGv_i32 t = tcg_temp_new_i32();
2860
2861    tcg_gen_andi_i32(t, b, 31);
2862    tcg_gen_shl_i32(d, a, t);
2863    tcg_temp_free_i32(t);
2864}
2865
2866static void tcg_gen_shl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2867{
2868    TCGv_i64 t = tcg_temp_new_i64();
2869
2870    tcg_gen_andi_i64(t, b, 63);
2871    tcg_gen_shl_i64(d, a, t);
2872    tcg_temp_free_i64(t);
2873}
2874
2875void tcg_gen_gvec_shlv(unsigned vece, uint32_t dofs, uint32_t aofs,
2876                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2877{
2878    static const TCGOpcode vecop_list[] = { INDEX_op_shlv_vec, 0 };
2879    static const GVecGen3 g[4] = {
2880        { .fniv = tcg_gen_shlv_mod_vec,
2881          .fno = gen_helper_gvec_shl8v,
2882          .opt_opc = vecop_list,
2883          .vece = MO_8 },
2884        { .fniv = tcg_gen_shlv_mod_vec,
2885          .fno = gen_helper_gvec_shl16v,
2886          .opt_opc = vecop_list,
2887          .vece = MO_16 },
2888        { .fni4 = tcg_gen_shl_mod_i32,
2889          .fniv = tcg_gen_shlv_mod_vec,
2890          .fno = gen_helper_gvec_shl32v,
2891          .opt_opc = vecop_list,
2892          .vece = MO_32 },
2893        { .fni8 = tcg_gen_shl_mod_i64,
2894          .fniv = tcg_gen_shlv_mod_vec,
2895          .fno = gen_helper_gvec_shl64v,
2896          .opt_opc = vecop_list,
2897          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2898          .vece = MO_64 },
2899    };
2900
2901    tcg_debug_assert(vece <= MO_64);
2902    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2903}
2904
2905/*
2906 * Similarly for logical right shifts.
2907 */
2908
2909static void tcg_gen_shrv_mod_vec(unsigned vece, TCGv_vec d,
2910                                 TCGv_vec a, TCGv_vec b)
2911{
2912    TCGv_vec t = tcg_temp_new_vec_matching(d);
2913
2914    tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
2915    tcg_gen_and_vec(vece, t, t, b);
2916    tcg_gen_shrv_vec(vece, d, a, t);
2917    tcg_temp_free_vec(t);
2918}
2919
2920static void tcg_gen_shr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2921{
2922    TCGv_i32 t = tcg_temp_new_i32();
2923
2924    tcg_gen_andi_i32(t, b, 31);
2925    tcg_gen_shr_i32(d, a, t);
2926    tcg_temp_free_i32(t);
2927}
2928
2929static void tcg_gen_shr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2930{
2931    TCGv_i64 t = tcg_temp_new_i64();
2932
2933    tcg_gen_andi_i64(t, b, 63);
2934    tcg_gen_shr_i64(d, a, t);
2935    tcg_temp_free_i64(t);
2936}
2937
2938void tcg_gen_gvec_shrv(unsigned vece, uint32_t dofs, uint32_t aofs,
2939                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2940{
2941    static const TCGOpcode vecop_list[] = { INDEX_op_shrv_vec, 0 };
2942    static const GVecGen3 g[4] = {
2943        { .fniv = tcg_gen_shrv_mod_vec,
2944          .fno = gen_helper_gvec_shr8v,
2945          .opt_opc = vecop_list,
2946          .vece = MO_8 },
2947        { .fniv = tcg_gen_shrv_mod_vec,
2948          .fno = gen_helper_gvec_shr16v,
2949          .opt_opc = vecop_list,
2950          .vece = MO_16 },
2951        { .fni4 = tcg_gen_shr_mod_i32,
2952          .fniv = tcg_gen_shrv_mod_vec,
2953          .fno = gen_helper_gvec_shr32v,
2954          .opt_opc = vecop_list,
2955          .vece = MO_32 },
2956        { .fni8 = tcg_gen_shr_mod_i64,
2957          .fniv = tcg_gen_shrv_mod_vec,
2958          .fno = gen_helper_gvec_shr64v,
2959          .opt_opc = vecop_list,
2960          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2961          .vece = MO_64 },
2962    };
2963
2964    tcg_debug_assert(vece <= MO_64);
2965    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2966}
2967
2968/*
2969 * Similarly for arithmetic right shifts.
2970 */
2971
2972static void tcg_gen_sarv_mod_vec(unsigned vece, TCGv_vec d,
2973                                 TCGv_vec a, TCGv_vec b)
2974{
2975    TCGv_vec t = tcg_temp_new_vec_matching(d);
2976
2977    tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
2978    tcg_gen_and_vec(vece, t, t, b);
2979    tcg_gen_sarv_vec(vece, d, a, t);
2980    tcg_temp_free_vec(t);
2981}
2982
2983static void tcg_gen_sar_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2984{
2985    TCGv_i32 t = tcg_temp_new_i32();
2986
2987    tcg_gen_andi_i32(t, b, 31);
2988    tcg_gen_sar_i32(d, a, t);
2989    tcg_temp_free_i32(t);
2990}
2991
2992static void tcg_gen_sar_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2993{
2994    TCGv_i64 t = tcg_temp_new_i64();
2995
2996    tcg_gen_andi_i64(t, b, 63);
2997    tcg_gen_sar_i64(d, a, t);
2998    tcg_temp_free_i64(t);
2999}
3000
3001void tcg_gen_gvec_sarv(unsigned vece, uint32_t dofs, uint32_t aofs,
3002                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3003{
3004    static const TCGOpcode vecop_list[] = { INDEX_op_sarv_vec, 0 };
3005    static const GVecGen3 g[4] = {
3006        { .fniv = tcg_gen_sarv_mod_vec,
3007          .fno = gen_helper_gvec_sar8v,
3008          .opt_opc = vecop_list,
3009          .vece = MO_8 },
3010        { .fniv = tcg_gen_sarv_mod_vec,
3011          .fno = gen_helper_gvec_sar16v,
3012          .opt_opc = vecop_list,
3013          .vece = MO_16 },
3014        { .fni4 = tcg_gen_sar_mod_i32,
3015          .fniv = tcg_gen_sarv_mod_vec,
3016          .fno = gen_helper_gvec_sar32v,
3017          .opt_opc = vecop_list,
3018          .vece = MO_32 },
3019        { .fni8 = tcg_gen_sar_mod_i64,
3020          .fniv = tcg_gen_sarv_mod_vec,
3021          .fno = gen_helper_gvec_sar64v,
3022          .opt_opc = vecop_list,
3023          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3024          .vece = MO_64 },
3025    };
3026
3027    tcg_debug_assert(vece <= MO_64);
3028    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3029}
3030
3031/* Expand OPRSZ bytes worth of three-operand comparisons using i32 elements.  */
3032static void expand_cmp_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
3033                           uint32_t oprsz, TCGCond cond)
3034{
3035    TCGv_i32 t0 = tcg_temp_new_i32();
3036    TCGv_i32 t1 = tcg_temp_new_i32();
3037    uint32_t i;
3038
3039    for (i = 0; i < oprsz; i += 4) {
3040        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
3041        tcg_gen_ld_i32(t1, cpu_env, bofs + i);
3042        tcg_gen_setcond_i32(cond, t0, t0, t1);
3043        tcg_gen_neg_i32(t0, t0);
3044        tcg_gen_st_i32(t0, cpu_env, dofs + i);
3045    }
3046    tcg_temp_free_i32(t1);
3047    tcg_temp_free_i32(t0);
3048}
3049
3050static void expand_cmp_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
3051                           uint32_t oprsz, TCGCond cond)
3052{
3053    TCGv_i64 t0 = tcg_temp_new_i64();
3054    TCGv_i64 t1 = tcg_temp_new_i64();
3055    uint32_t i;
3056
3057    for (i = 0; i < oprsz; i += 8) {
3058        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
3059        tcg_gen_ld_i64(t1, cpu_env, bofs + i);
3060        tcg_gen_setcond_i64(cond, t0, t0, t1);
3061        tcg_gen_neg_i64(t0, t0);
3062        tcg_gen_st_i64(t0, cpu_env, dofs + i);
3063    }
3064    tcg_temp_free_i64(t1);
3065    tcg_temp_free_i64(t0);
3066}
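
/*
 * In both expansions above, setcond produces 0 or 1 and the negation
 * widens that to the 0 or all-ones lane value that gvec comparisons
 * are defined to produce.
 */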
3067
3068static void expand_cmp_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
3069                           uint32_t bofs, uint32_t oprsz, uint32_t tysz,
3070                           TCGType type, TCGCond cond)
3071{
3072    TCGv_vec t0 = tcg_temp_new_vec(type);
3073    TCGv_vec t1 = tcg_temp_new_vec(type);
3074    uint32_t i;
3075
3076    for (i = 0; i < oprsz; i += tysz) {
3077        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
3078        tcg_gen_ld_vec(t1, cpu_env, bofs + i);
3079        tcg_gen_cmp_vec(cond, vece, t0, t0, t1);
3080        tcg_gen_st_vec(t0, cpu_env, dofs + i);
3081    }
3082    tcg_temp_free_vec(t1);
3083    tcg_temp_free_vec(t0);
3084}
3085
3086void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs,
3087                      uint32_t aofs, uint32_t bofs,
3088                      uint32_t oprsz, uint32_t maxsz)
3089{
3090    static const TCGOpcode cmp_list[] = { INDEX_op_cmp_vec, 0 };
3091    static gen_helper_gvec_3 * const eq_fn[4] = {
3092        gen_helper_gvec_eq8, gen_helper_gvec_eq16,
3093        gen_helper_gvec_eq32, gen_helper_gvec_eq64
3094    };
3095    static gen_helper_gvec_3 * const ne_fn[4] = {
3096        gen_helper_gvec_ne8, gen_helper_gvec_ne16,
3097        gen_helper_gvec_ne32, gen_helper_gvec_ne64
3098    };
3099    static gen_helper_gvec_3 * const lt_fn[4] = {
3100        gen_helper_gvec_lt8, gen_helper_gvec_lt16,
3101        gen_helper_gvec_lt32, gen_helper_gvec_lt64
3102    };
3103    static gen_helper_gvec_3 * const le_fn[4] = {
3104        gen_helper_gvec_le8, gen_helper_gvec_le16,
3105        gen_helper_gvec_le32, gen_helper_gvec_le64
3106    };
3107    static gen_helper_gvec_3 * const ltu_fn[4] = {
3108        gen_helper_gvec_ltu8, gen_helper_gvec_ltu16,
3109        gen_helper_gvec_ltu32, gen_helper_gvec_ltu64
3110    };
3111    static gen_helper_gvec_3 * const leu_fn[4] = {
3112        gen_helper_gvec_leu8, gen_helper_gvec_leu16,
3113        gen_helper_gvec_leu32, gen_helper_gvec_leu64
3114    };
3115    static gen_helper_gvec_3 * const * const fns[16] = {
3116        [TCG_COND_EQ] = eq_fn,
3117        [TCG_COND_NE] = ne_fn,
3118        [TCG_COND_LT] = lt_fn,
3119        [TCG_COND_LE] = le_fn,
3120        [TCG_COND_LTU] = ltu_fn,
3121        [TCG_COND_LEU] = leu_fn,
3122    };
3123
3124    const TCGOpcode *hold_list;
3125    TCGType type;
3126    uint32_t some;
3127
3128    check_size_align(oprsz, maxsz, dofs | aofs | bofs);
3129    check_overlap_3(dofs, aofs, bofs, maxsz);
3130
3131    if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) {
3132        do_dup(MO_8, dofs, oprsz, maxsz,
3133               NULL, NULL, -(cond == TCG_COND_ALWAYS));
3134        return;
3135    }
3136
3137    /*
3138     * Implement inline with a vector type, if possible.
3139     * Prefer integer when 64-bit host and 64-bit comparison.
3140     */
3141    hold_list = tcg_swap_vecop_list(cmp_list);
3142    type = choose_vector_type(cmp_list, vece, oprsz,
3143                              TCG_TARGET_REG_BITS == 64 && vece == MO_64);
3144    switch (type) {
3145    case TCG_TYPE_V256:
3146        /* Recall that ARM SVE allows vector sizes that are not a
3147         * power of 2, but always a multiple of 16.  The intent is
3148         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
3149         */
3150        some = QEMU_ALIGN_DOWN(oprsz, 32);
3151        expand_cmp_vec(vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, cond);
3152        if (some == oprsz) {
3153            break;
3154        }
3155        dofs += some;
3156        aofs += some;
3157        bofs += some;
3158        oprsz -= some;
3159        maxsz -= some;
3160        /* fallthru */
3161    case TCG_TYPE_V128:
3162        expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, cond);
3163        break;
3164    case TCG_TYPE_V64:
3165        expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, cond);
3166        break;
3167
3168    case 0:
3169        if (vece == MO_64 && check_size_impl(oprsz, 8)) {
3170            expand_cmp_i64(dofs, aofs, bofs, oprsz, cond);
3171        } else if (vece == MO_32 && check_size_impl(oprsz, 4)) {
3172            expand_cmp_i32(dofs, aofs, bofs, oprsz, cond);
3173        } else {
3174            gen_helper_gvec_3 * const *fn = fns[cond];
3175
3176            if (fn == NULL) {
3177                uint32_t tmp;
3178                tmp = aofs, aofs = bofs, bofs = tmp;
3179                cond = tcg_swap_cond(cond);
3180                fn = fns[cond];
3181                assert(fn != NULL);
3182            }
3183            tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, 0, fn[vece]);
3184            oprsz = maxsz;
3185        }
3186        break;
3187
3188    default:
3189        g_assert_not_reached();
3190    }
3191    tcg_swap_vecop_list(hold_list);
3192
3193    if (oprsz < maxsz) {
3194        expand_clr(dofs + oprsz, maxsz - oprsz);
3195    }
3196}
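
/*
 * The out-of-line path sets oprsz = maxsz because the helpers already
 * clear the tail, making the final expand_clr a no-op in that case.
 */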
3197
3198static void tcg_gen_bitsel_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 c)
3199{
3200    TCGv_i64 t = tcg_temp_new_i64();
3201
3202    tcg_gen_and_i64(t, b, a);
3203    tcg_gen_andc_i64(d, c, a);
3204    tcg_gen_or_i64(d, d, t);
3205    tcg_temp_free_i64(t);
3206}
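
/*
 * I.e. d = (b & a) | (c & ~a): each result bit is taken from b where
 * the corresponding bit of a is set, else from c.
 */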
3207
3208void tcg_gen_gvec_bitsel(unsigned vece, uint32_t dofs, uint32_t aofs,
3209                         uint32_t bofs, uint32_t cofs,
3210                         uint32_t oprsz, uint32_t maxsz)
3211{
3212    static const GVecGen4 g = {
3213        .fni8 = tcg_gen_bitsel_i64,
3214        .fniv = tcg_gen_bitsel_vec,
3215        .fno = gen_helper_gvec_bitsel,
3216    };
3217
3218    tcg_gen_gvec_4(dofs, aofs, bofs, cofs, oprsz, maxsz, &g);
3219}
3220