qemu/tcg/tcg-op-gvec.c
   1/*
   2 * Generic vector operation expansion
   3 *
   4 * Copyright (c) 2018 Linaro
   5 *
   6 * This library is free software; you can redistribute it and/or
   7 * modify it under the terms of the GNU Lesser General Public
   8 * License as published by the Free Software Foundation; either
   9 * version 2.1 of the License, or (at your option) any later version.
  10 *
  11 * This library is distributed in the hope that it will be useful,
  12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14 * Lesser General Public License for more details.
  15 *
  16 * You should have received a copy of the GNU Lesser General Public
  17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
  18 */
  19
  20#include "qemu/osdep.h"
  21#include "tcg/tcg.h"
  22#include "tcg/tcg-op.h"
  23#include "tcg/tcg-op-gvec.h"
  24#include "qemu/main-loop.h"
  25#include "tcg/tcg-gvec-desc.h"
  26
  27#define MAX_UNROLL  4
  28
  29#ifdef CONFIG_DEBUG_TCG
  30static const TCGOpcode vecop_list_empty[1] = { 0 };
  31#else
  32#define vecop_list_empty NULL
  33#endif
  34
  35
  36/* Verify vector size and alignment rules.  OFS should be the OR of all
  37   of the operand offsets so that we can check them all at once.  */
  38static void check_size_align(uint32_t oprsz, uint32_t maxsz, uint32_t ofs)
  39{
  40    uint32_t max_align;
  41
  42    switch (oprsz) {
  43    case 8:
  44    case 16:
  45    case 32:
  46        tcg_debug_assert(oprsz <= maxsz);
  47        break;
  48    default:
  49        tcg_debug_assert(oprsz == maxsz);
  50        break;
  51    }
  52    tcg_debug_assert(maxsz <= (8 << SIMD_MAXSZ_BITS));
  53
  54    max_align = maxsz >= 16 ? 15 : 7;
  55    tcg_debug_assert((maxsz & max_align) == 0);
  56    tcg_debug_assert((ofs & max_align) == 0);
  57}
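
/*
 * As a concrete reading of the checks above: oprsz == 16 with maxsz == 64
 * is accepted, because 16 is one of the special sizes that may be smaller
 * than maxsz, while oprsz == 80 is only accepted when maxsz == 80 as well.
 * Once maxsz >= 16, maxsz and every offset OR'ed into OFS must be a
 * multiple of 16; smaller operations need only 8-byte alignment.
 */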
  58
  59/* Verify vector overlap rules for two operands.  */
  60static void check_overlap_2(uint32_t d, uint32_t a, uint32_t s)
  61{
  62    tcg_debug_assert(d == a || d + s <= a || a + s <= d);
  63}
  64
  65/* Verify vector overlap rules for three operands.  */
  66static void check_overlap_3(uint32_t d, uint32_t a, uint32_t b, uint32_t s)
  67{
  68    check_overlap_2(d, a, s);
  69    check_overlap_2(d, b, s);
  70    check_overlap_2(a, b, s);
  71}
  72
  73/* Verify vector overlap rules for four operands.  */
  74static void check_overlap_4(uint32_t d, uint32_t a, uint32_t b,
  75                            uint32_t c, uint32_t s)
  76{
  77    check_overlap_2(d, a, s);
  78    check_overlap_2(d, b, s);
  79    check_overlap_2(d, c, s);
  80    check_overlap_2(a, b, s);
  81    check_overlap_2(a, c, s);
  82    check_overlap_2(b, c, s);
  83}
  84
  85/* Create a descriptor from components.  */
  86uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data)
  87{
  88    uint32_t desc = 0;
  89
  90    check_size_align(oprsz, maxsz, 0);
  91    tcg_debug_assert(data == sextract32(data, 0, SIMD_DATA_BITS));
  92
  93    oprsz = (oprsz / 8) - 1;
  94    maxsz = (maxsz / 8) - 1;
  95
  96    /*
  97     * We have just asserted in check_size_align that either
  98     * oprsz is {8,16,32} or matches maxsz.  Encode the final
  99     * case with '2', as that would otherwise map to 24.
 100     */
 101    if (oprsz == maxsz) {
 102        oprsz = 2;
 103    }
 104
 105    desc = deposit32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS, oprsz);
 106    desc = deposit32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS, maxsz);
 107    desc = deposit32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS, data);
 108
 109    return desc;
 110}
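
/*
 * As a worked example of the encoding above: simd_desc(16, 64, 5) stores
 * OPRSZ field 1 (decoded as (1 + 1) * 8 = 16), MAXSZ field 7 (decoded as
 * (7 + 1) * 8 = 64) and DATA 5, while simd_desc(80, 80, 0) stores MAXSZ
 * field 9 and OPRSZ field 2, the reserved value meaning "same as maxsz".
 */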
 111
 112/* Generate a call to a gvec-style helper with two vector operands.  */
 113void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
 114                        uint32_t oprsz, uint32_t maxsz, int32_t data,
 115                        gen_helper_gvec_2 *fn)
 116{
 117    TCGv_ptr a0, a1;
 118    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
 119
 120    a0 = tcg_temp_new_ptr();
 121    a1 = tcg_temp_new_ptr();
 122
 123    tcg_gen_addi_ptr(a0, cpu_env, dofs);
 124    tcg_gen_addi_ptr(a1, cpu_env, aofs);
 125
 126    fn(a0, a1, desc);
 127
 128    tcg_temp_free_ptr(a0);
 129    tcg_temp_free_ptr(a1);
 130}
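
/*
 * A typical call from a target translator looks like the sketch below,
 * where DOFS/AOFS are byte offsets of the destination and source vectors
 * within CPUState; the names s, rd, rn, vec_reg_offset and
 * gen_helper_my_op are illustrative only and not part of this file:
 *
 *     tcg_gen_gvec_2_ool(vec_reg_offset(s, rd), vec_reg_offset(s, rn),
 *                        16, 16, 0, gen_helper_my_op);
 */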
 131
 132/* Generate a call to a gvec-style helper with two vector operands
 133   and one scalar operand.  */
 134void tcg_gen_gvec_2i_ool(uint32_t dofs, uint32_t aofs, TCGv_i64 c,
 135                         uint32_t oprsz, uint32_t maxsz, int32_t data,
 136                         gen_helper_gvec_2i *fn)
 137{
 138    TCGv_ptr a0, a1;
 139    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
 140
 141    a0 = tcg_temp_new_ptr();
 142    a1 = tcg_temp_new_ptr();
 143
 144    tcg_gen_addi_ptr(a0, cpu_env, dofs);
 145    tcg_gen_addi_ptr(a1, cpu_env, aofs);
 146
 147    fn(a0, a1, c, desc);
 148
 149    tcg_temp_free_ptr(a0);
 150    tcg_temp_free_ptr(a1);
 151}
 152
 153/* Generate a call to a gvec-style helper with three vector operands.  */
 154void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 155                        uint32_t oprsz, uint32_t maxsz, int32_t data,
 156                        gen_helper_gvec_3 *fn)
 157{
 158    TCGv_ptr a0, a1, a2;
 159    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
 160
 161    a0 = tcg_temp_new_ptr();
 162    a1 = tcg_temp_new_ptr();
 163    a2 = tcg_temp_new_ptr();
 164
 165    tcg_gen_addi_ptr(a0, cpu_env, dofs);
 166    tcg_gen_addi_ptr(a1, cpu_env, aofs);
 167    tcg_gen_addi_ptr(a2, cpu_env, bofs);
 168
 169    fn(a0, a1, a2, desc);
 170
 171    tcg_temp_free_ptr(a0);
 172    tcg_temp_free_ptr(a1);
 173    tcg_temp_free_ptr(a2);
 174}
 175
 176/* Generate a call to a gvec-style helper with four vector operands.  */
 177void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 178                        uint32_t cofs, uint32_t oprsz, uint32_t maxsz,
 179                        int32_t data, gen_helper_gvec_4 *fn)
 180{
 181    TCGv_ptr a0, a1, a2, a3;
 182    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
 183
 184    a0 = tcg_temp_new_ptr();
 185    a1 = tcg_temp_new_ptr();
 186    a2 = tcg_temp_new_ptr();
 187    a3 = tcg_temp_new_ptr();
 188
 189    tcg_gen_addi_ptr(a0, cpu_env, dofs);
 190    tcg_gen_addi_ptr(a1, cpu_env, aofs);
 191    tcg_gen_addi_ptr(a2, cpu_env, bofs);
 192    tcg_gen_addi_ptr(a3, cpu_env, cofs);
 193
 194    fn(a0, a1, a2, a3, desc);
 195
 196    tcg_temp_free_ptr(a0);
 197    tcg_temp_free_ptr(a1);
 198    tcg_temp_free_ptr(a2);
 199    tcg_temp_free_ptr(a3);
 200}
 201
 202/* Generate a call to a gvec-style helper with five vector operands.  */
 203void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 204                        uint32_t cofs, uint32_t xofs, uint32_t oprsz,
 205                        uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn)
 206{
 207    TCGv_ptr a0, a1, a2, a3, a4;
 208    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
 209
 210    a0 = tcg_temp_new_ptr();
 211    a1 = tcg_temp_new_ptr();
 212    a2 = tcg_temp_new_ptr();
 213    a3 = tcg_temp_new_ptr();
 214    a4 = tcg_temp_new_ptr();
 215
 216    tcg_gen_addi_ptr(a0, cpu_env, dofs);
 217    tcg_gen_addi_ptr(a1, cpu_env, aofs);
 218    tcg_gen_addi_ptr(a2, cpu_env, bofs);
 219    tcg_gen_addi_ptr(a3, cpu_env, cofs);
 220    tcg_gen_addi_ptr(a4, cpu_env, xofs);
 221
 222    fn(a0, a1, a2, a3, a4, desc);
 223
 224    tcg_temp_free_ptr(a0);
 225    tcg_temp_free_ptr(a1);
 226    tcg_temp_free_ptr(a2);
 227    tcg_temp_free_ptr(a3);
 228    tcg_temp_free_ptr(a4);
 229}
 230
 231/* Generate a call to a gvec-style helper with two vector operands
 232   and an extra pointer operand.  */
 233void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
 234                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
 235                        int32_t data, gen_helper_gvec_2_ptr *fn)
 236{
 237    TCGv_ptr a0, a1;
 238    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
 239
 240    a0 = tcg_temp_new_ptr();
 241    a1 = tcg_temp_new_ptr();
 242
 243    tcg_gen_addi_ptr(a0, cpu_env, dofs);
 244    tcg_gen_addi_ptr(a1, cpu_env, aofs);
 245
 246    fn(a0, a1, ptr, desc);
 247
 248    tcg_temp_free_ptr(a0);
 249    tcg_temp_free_ptr(a1);
 250}
 251
 252/* Generate a call to a gvec-style helper with three vector operands
 253   and an extra pointer operand.  */
 254void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 255                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
 256                        int32_t data, gen_helper_gvec_3_ptr *fn)
 257{
 258    TCGv_ptr a0, a1, a2;
 259    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
 260
 261    a0 = tcg_temp_new_ptr();
 262    a1 = tcg_temp_new_ptr();
 263    a2 = tcg_temp_new_ptr();
 264
 265    tcg_gen_addi_ptr(a0, cpu_env, dofs);
 266    tcg_gen_addi_ptr(a1, cpu_env, aofs);
 267    tcg_gen_addi_ptr(a2, cpu_env, bofs);
 268
 269    fn(a0, a1, a2, ptr, desc);
 270
 271    tcg_temp_free_ptr(a0);
 272    tcg_temp_free_ptr(a1);
 273    tcg_temp_free_ptr(a2);
 274}
 275
 276/* Generate a call to a gvec-style helper with four vector operands
 277   and an extra pointer operand.  */
 278void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 279                        uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz,
 280                        uint32_t maxsz, int32_t data,
 281                        gen_helper_gvec_4_ptr *fn)
 282{
 283    TCGv_ptr a0, a1, a2, a3;
 284    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
 285
 286    a0 = tcg_temp_new_ptr();
 287    a1 = tcg_temp_new_ptr();
 288    a2 = tcg_temp_new_ptr();
 289    a3 = tcg_temp_new_ptr();
 290
 291    tcg_gen_addi_ptr(a0, cpu_env, dofs);
 292    tcg_gen_addi_ptr(a1, cpu_env, aofs);
 293    tcg_gen_addi_ptr(a2, cpu_env, bofs);
 294    tcg_gen_addi_ptr(a3, cpu_env, cofs);
 295
 296    fn(a0, a1, a2, a3, ptr, desc);
 297
 298    tcg_temp_free_ptr(a0);
 299    tcg_temp_free_ptr(a1);
 300    tcg_temp_free_ptr(a2);
 301    tcg_temp_free_ptr(a3);
 302}
 303
 304/* Generate a call to a gvec-style helper with five vector operands
 305   and an extra pointer operand.  */
 306void tcg_gen_gvec_5_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 307                        uint32_t cofs, uint32_t eofs, TCGv_ptr ptr,
 308                        uint32_t oprsz, uint32_t maxsz, int32_t data,
 309                        gen_helper_gvec_5_ptr *fn)
 310{
 311    TCGv_ptr a0, a1, a2, a3, a4;
 312    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
 313
 314    a0 = tcg_temp_new_ptr();
 315    a1 = tcg_temp_new_ptr();
 316    a2 = tcg_temp_new_ptr();
 317    a3 = tcg_temp_new_ptr();
 318    a4 = tcg_temp_new_ptr();
 319
 320    tcg_gen_addi_ptr(a0, cpu_env, dofs);
 321    tcg_gen_addi_ptr(a1, cpu_env, aofs);
 322    tcg_gen_addi_ptr(a2, cpu_env, bofs);
 323    tcg_gen_addi_ptr(a3, cpu_env, cofs);
 324    tcg_gen_addi_ptr(a4, cpu_env, eofs);
 325
 326    fn(a0, a1, a2, a3, a4, ptr, desc);
 327
 328    tcg_temp_free_ptr(a0);
 329    tcg_temp_free_ptr(a1);
 330    tcg_temp_free_ptr(a2);
 331    tcg_temp_free_ptr(a3);
 332    tcg_temp_free_ptr(a4);
 333}
 334
 335/* Return true if we want to implement something of OPRSZ bytes
 336   in units of LNSZ.  This limits the expansion of inline code.  */
 337static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
 338{
 339    uint32_t q, r;
 340
 341    if (oprsz < lnsz) {
 342        return false;
 343    }
 344
 345    q = oprsz / lnsz;
 346    r = oprsz % lnsz;
 347    tcg_debug_assert((r & 7) == 0);
 348
 349    if (lnsz < 16) {
 350        /* For sizes below 16, accept no remainder. */
 351        if (r != 0) {
 352            return false;
 353        }
 354    } else {
 355        /*
 356         * Recall that ARM SVE allows vector sizes that are not a
 357         * power of 2, but always a multiple of 16.  The intent is
 358         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
 359         * In addition, expand_clr needs to handle a multiple of 8.
 360         * Thus we can handle the tail with one more operation per
 361         * diminishing power of 2.
 362         */
 363        q += ctpop32(r);
 364    }
 365
 366    return q <= MAX_UNROLL;
 367}
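
/*
 * Worked example: oprsz == 80 with lnsz == 32 gives q == 2 and r == 16;
 * ctpop32(16) == 1 raises q to 3, within MAX_UNROLL, so the expansion is
 * accepted (2x32 + 1x16).  The same 80 bytes with lnsz == 8 would need
 * 10 stores and is rejected.
 */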
 368
 369static void expand_clr(uint32_t dofs, uint32_t maxsz);
 370
 371/* Duplicate C as per VECE.  */
 372uint64_t (dup_const)(unsigned vece, uint64_t c)
 373{
 374    switch (vece) {
 375    case MO_8:
 376        return 0x0101010101010101ull * (uint8_t)c;
 377    case MO_16:
 378        return 0x0001000100010001ull * (uint16_t)c;
 379    case MO_32:
 380        return 0x0000000100000001ull * (uint32_t)c;
 381    case MO_64:
 382        return c;
 383    default:
 384        g_assert_not_reached();
 385    }
 386}
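
/*
 * For example, dup_const(MO_8, 0xab) == 0xababababababababull and
 * dup_const(MO_16, 0x1234) == 0x1234123412341234ull; only the low
 * (1 << vece) bytes of C are significant.
 */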
 387
 388/* Duplicate IN into OUT as per VECE.  */
 389void tcg_gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in)
 390{
 391    switch (vece) {
 392    case MO_8:
 393        tcg_gen_ext8u_i32(out, in);
 394        tcg_gen_muli_i32(out, out, 0x01010101);
 395        break;
 396    case MO_16:
 397        tcg_gen_deposit_i32(out, in, in, 16, 16);
 398        break;
 399    case MO_32:
 400        tcg_gen_mov_i32(out, in);
 401        break;
 402    default:
 403        g_assert_not_reached();
 404    }
 405}
 406
 407void tcg_gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in)
 408{
 409    switch (vece) {
 410    case MO_8:
 411        tcg_gen_ext8u_i64(out, in);
 412        tcg_gen_muli_i64(out, out, 0x0101010101010101ull);
 413        break;
 414    case MO_16:
 415        tcg_gen_ext16u_i64(out, in);
 416        tcg_gen_muli_i64(out, out, 0x0001000100010001ull);
 417        break;
 418    case MO_32:
 419        tcg_gen_deposit_i64(out, in, in, 32, 32);
 420        break;
 421    case MO_64:
 422        tcg_gen_mov_i64(out, in);
 423        break;
 424    default:
 425        g_assert_not_reached();
 426    }
 427}
 428
 429/* Select a supported vector type for implementing an operation on SIZE
 430 * bytes.  If OP is 0, assume that the real operation to be performed is
 431 * required by all backends.  Otherwise, make sure that OP can be performed
 432 * on elements of size VECE in the selected type.  Do not select V64 if
 433 * PREFER_I64 is true.  Return 0 if no vector type is selected.
 434 */
 435static TCGType choose_vector_type(const TCGOpcode *list, unsigned vece,
 436                                  uint32_t size, bool prefer_i64)
 437{
 438    /*
 439     * Recall that ARM SVE allows vector sizes that are not a
 440     * power of 2, but always a multiple of 16.  The intent is
 441     * that e.g. size == 80 would be expanded with 2x32 + 1x16.
 442     * It is hard to imagine a case in which v256 is supported
 443     * but v128 is not, but check anyway.
 444     * In addition, expand_clr needs to handle a multiple of 8.
 445     */
 446    if (TCG_TARGET_HAS_v256 &&
 447        check_size_impl(size, 32) &&
 448        tcg_can_emit_vecop_list(list, TCG_TYPE_V256, vece) &&
 449        (!(size & 16) ||
 450         (TCG_TARGET_HAS_v128 &&
 451          tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece))) &&
 452        (!(size & 8) ||
 453         (TCG_TARGET_HAS_v64 &&
 454          tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) {
 455        return TCG_TYPE_V256;
 456    }
 457    if (TCG_TARGET_HAS_v128 &&
 458        check_size_impl(size, 16) &&
 459        tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece) &&
 460        (!(size & 8) ||
 461         (TCG_TARGET_HAS_v64 &&
 462          tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) {
 463        return TCG_TYPE_V128;
 464    }
 465    if (TCG_TARGET_HAS_v64 && !prefer_i64 && check_size_impl(size, 8)
 466        && tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)) {
 467        return TCG_TYPE_V64;
 468    }
 469    return 0;
 470}
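
/*
 * For example, with size == 80 on a host supporting V256, the 16-byte
 * remainder (size & 16) additionally requires a usable V128 before
 * TCG_TYPE_V256 is returned, giving a 2x32 + 1x16 expansion.  If only
 * V64 were usable, check_size_impl(80, 8) would need 10 steps and the
 * function would return 0.
 */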
 471
 472static void do_dup_store(TCGType type, uint32_t dofs, uint32_t oprsz,
 473                         uint32_t maxsz, TCGv_vec t_vec)
 474{
 475    uint32_t i = 0;
 476
 477    tcg_debug_assert(oprsz >= 8);
 478
 479    /*
 480     * This may be expand_clr for the tail of an operation, e.g.
 481     * oprsz == 8 && maxsz == 64.  The first 8 bytes of this store
 482     * are misaligned wrt the maximum vector size, so do that first.
 483     */
 484    if (dofs & 8) {
 485        tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
 486        i += 8;
 487    }
 488
 489    switch (type) {
 490    case TCG_TYPE_V256:
 491        /*
 492         * Recall that ARM SVE allows vector sizes that are not a
 493         * power of 2, but always a multiple of 16.  The intent is
 494         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
 495         */
 496        for (; i + 32 <= oprsz; i += 32) {
 497            tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V256);
 498        }
 499        /* fallthru */
 500    case TCG_TYPE_V128:
 501        for (; i + 16 <= oprsz; i += 16) {
 502            tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V128);
 503        }
 504        break;
 505    case TCG_TYPE_V64:
 506        for (; i < oprsz; i += 8) {
 507            tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
 508        }
 509        break;
 510    default:
 511        g_assert_not_reached();
 512    }
 513
 514    if (oprsz < maxsz) {
 515        expand_clr(dofs + oprsz, maxsz - oprsz);
 516    }
 517}
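
/*
 * For example, clearing the 56-byte tail of a 64-byte register after an
 * 8-byte operation leaves dofs 8-byte but not 16-byte aligned: the lone
 * V64 store above covers those first 8 bytes, and the remaining 48 bytes
 * are then written with whole V256/V128 stores.
 */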
 518
 519/* Set OPRSZ bytes at DOFS to replications of IN_32, IN_64 or IN_C.
 520 * Only one of IN_32 or IN_64 may be set;
 521 * IN_C is used if IN_32 and IN_64 are unset.
 522 */
 523static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz,
 524                   uint32_t maxsz, TCGv_i32 in_32, TCGv_i64 in_64,
 525                   uint64_t in_c)
 526{
 527    TCGType type;
 528    TCGv_i64 t_64;
 529    TCGv_i32 t_32, t_desc;
 530    TCGv_ptr t_ptr;
 531    uint32_t i;
 532
 533    assert(vece <= (in_32 ? MO_32 : MO_64));
 534    assert(in_32 == NULL || in_64 == NULL);
 535
 536    /* If we're storing 0, expand oprsz to maxsz.  */
 537    if (in_32 == NULL && in_64 == NULL) {
 538        in_c = dup_const(vece, in_c);
 539        if (in_c == 0) {
 540            oprsz = maxsz;
 541            vece = MO_8;
 542        } else if (in_c == dup_const(MO_8, in_c)) {
 543            vece = MO_8;
 544        }
 545    }
 546
 547    /* Implement inline with a vector type, if possible.
 548     * Prefer integer when 64-bit host and no variable dup.
 549     */
 550    type = choose_vector_type(NULL, vece, oprsz,
 551                              (TCG_TARGET_REG_BITS == 64 && in_32 == NULL
 552                               && (in_64 == NULL || vece == MO_64)));
 553    if (type != 0) {
 554        TCGv_vec t_vec = tcg_temp_new_vec(type);
 555
 556        if (in_32) {
 557            tcg_gen_dup_i32_vec(vece, t_vec, in_32);
 558        } else if (in_64) {
 559            tcg_gen_dup_i64_vec(vece, t_vec, in_64);
 560        } else {
 561            tcg_gen_dupi_vec(vece, t_vec, in_c);
 562        }
 563        do_dup_store(type, dofs, oprsz, maxsz, t_vec);
 564        tcg_temp_free_vec(t_vec);
 565        return;
 566    }
 567
 568    /* Otherwise, inline with an integer type, unless "large".  */
 569    if (check_size_impl(oprsz, TCG_TARGET_REG_BITS / 8)) {
 570        t_64 = NULL;
 571        t_32 = NULL;
 572
 573        if (in_32) {
 574            /* We are given a 32-bit variable input.  For a 64-bit host,
 575               use a 64-bit operation unless the 32-bit operation would
 576               be simple enough.  */
 577            if (TCG_TARGET_REG_BITS == 64
 578                && (vece != MO_32 || !check_size_impl(oprsz, 4))) {
 579                t_64 = tcg_temp_new_i64();
 580                tcg_gen_extu_i32_i64(t_64, in_32);
 581                tcg_gen_dup_i64(vece, t_64, t_64);
 582            } else {
 583                t_32 = tcg_temp_new_i32();
 584                tcg_gen_dup_i32(vece, t_32, in_32);
 585            }
 586        } else if (in_64) {
 587            /* We are given a 64-bit variable input.  */
 588            t_64 = tcg_temp_new_i64();
 589            tcg_gen_dup_i64(vece, t_64, in_64);
 590        } else {
 591            /* We are given a constant input.  */
 592            /* For 64-bit hosts, use 64-bit constants for "simple" constants
 593               or when we'd need too many 32-bit stores, or when a 64-bit
 594               constant is really required.  */
 595            if (vece == MO_64
 596                || (TCG_TARGET_REG_BITS == 64
 597                    && (in_c == 0 || in_c == -1
 598                        || !check_size_impl(oprsz, 4)))) {
 599                t_64 = tcg_constant_i64(in_c);
 600            } else {
 601                t_32 = tcg_constant_i32(in_c);
 602            }
 603        }
 604
 605        /* Implement inline if we picked an implementation size above.  */
 606        if (t_32) {
 607            for (i = 0; i < oprsz; i += 4) {
 608                tcg_gen_st_i32(t_32, cpu_env, dofs + i);
 609            }
 610            tcg_temp_free_i32(t_32);
 611            goto done;
 612        }
 613        if (t_64) {
 614            for (i = 0; i < oprsz; i += 8) {
 615                tcg_gen_st_i64(t_64, cpu_env, dofs + i);
 616            }
 617            tcg_temp_free_i64(t_64);
 618            goto done;
 619        }
 620    }
 621
 622    /* Otherwise implement out of line.  */
 623    t_ptr = tcg_temp_new_ptr();
 624    tcg_gen_addi_ptr(t_ptr, cpu_env, dofs);
 625
 626    /*
 627     * This may be expand_clr for the tail of an operation, e.g.
 628     * oprsz == 8 && maxsz == 64.  The size of the clear is misaligned
 629     * wrt simd_desc and will assert.  Simply pass all replicated byte
 630     * stores through to memset.
 631     */
 632    if (oprsz == maxsz && vece == MO_8) {
 633        TCGv_ptr t_size = tcg_const_ptr(oprsz);
 634        TCGv_i32 t_val;
 635
 636        if (in_32) {
 637            t_val = in_32;
 638        } else if (in_64) {
 639            t_val = tcg_temp_new_i32();
 640            tcg_gen_extrl_i64_i32(t_val, in_64);
 641        } else {
 642            t_val = tcg_constant_i32(in_c);
 643        }
 644        gen_helper_memset(t_ptr, t_ptr, t_val, t_size);
 645
 646        if (in_64) {
 647            tcg_temp_free_i32(t_val);
 648        }
 649        tcg_temp_free_ptr(t_size);
 650        tcg_temp_free_ptr(t_ptr);
 651        return;
 652    }
 653
 654    t_desc = tcg_constant_i32(simd_desc(oprsz, maxsz, 0));
 655
 656    if (vece == MO_64) {
 657        if (in_64) {
 658            gen_helper_gvec_dup64(t_ptr, t_desc, in_64);
 659        } else {
 660            t_64 = tcg_constant_i64(in_c);
 661            gen_helper_gvec_dup64(t_ptr, t_desc, t_64);
 662        }
 663    } else {
 664        typedef void dup_fn(TCGv_ptr, TCGv_i32, TCGv_i32);
 665        static dup_fn * const fns[3] = {
 666            gen_helper_gvec_dup8,
 667            gen_helper_gvec_dup16,
 668            gen_helper_gvec_dup32
 669        };
 670
 671        if (in_32) {
 672            fns[vece](t_ptr, t_desc, in_32);
 673        } else if (in_64) {
 674            t_32 = tcg_temp_new_i32();
 675            tcg_gen_extrl_i64_i32(t_32, in_64);
 676            fns[vece](t_ptr, t_desc, t_32);
 677            tcg_temp_free_i32(t_32);
 678        } else {
 679            if (vece == MO_8) {
 680                in_c &= 0xff;
 681            } else if (vece == MO_16) {
 682                in_c &= 0xffff;
 683            }
 684            t_32 = tcg_constant_i32(in_c);
 685            fns[vece](t_ptr, t_desc, t_32);
 686        }
 687    }
 688
 689    tcg_temp_free_ptr(t_ptr);
 690    return;
 691
 692 done:
 693    if (oprsz < maxsz) {
 694        expand_clr(dofs + oprsz, maxsz - oprsz);
 695    }
 696}
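
/*
 * The order of preference above is: host vector stores via do_dup_store,
 * then inline i32/i64 stores when the unroll limit allows, then out of
 * line, where a whole-register replicated-byte fill (oprsz == maxsz with
 * vece forced to MO_8) becomes a call to memset and anything else goes
 * through the gen_helper_gvec_dup* helpers.
 */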
 697
 698/* Likewise, but with zero.  */
 699static void expand_clr(uint32_t dofs, uint32_t maxsz)
 700{
 701    do_dup(MO_8, dofs, maxsz, maxsz, NULL, NULL, 0);
 702}
 703
 704/* Expand OPRSZ bytes worth of two-operand operations using i32 elements.  */
 705static void expand_2_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
 706                         bool load_dest, void (*fni)(TCGv_i32, TCGv_i32))
 707{
 708    TCGv_i32 t0 = tcg_temp_new_i32();
 709    TCGv_i32 t1 = tcg_temp_new_i32();
 710    uint32_t i;
 711
 712    for (i = 0; i < oprsz; i += 4) {
 713        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
 714        if (load_dest) {
 715            tcg_gen_ld_i32(t1, cpu_env, dofs + i);
 716        }
 717        fni(t1, t0);
 718        tcg_gen_st_i32(t1, cpu_env, dofs + i);
 719    }
 720    tcg_temp_free_i32(t0);
 721    tcg_temp_free_i32(t1);
 722}
 723
 724static void expand_2i_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
 725                          int32_t c, bool load_dest,
 726                          void (*fni)(TCGv_i32, TCGv_i32, int32_t))
 727{
 728    TCGv_i32 t0 = tcg_temp_new_i32();
 729    TCGv_i32 t1 = tcg_temp_new_i32();
 730    uint32_t i;
 731
 732    for (i = 0; i < oprsz; i += 4) {
 733        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
 734        if (load_dest) {
 735            tcg_gen_ld_i32(t1, cpu_env, dofs + i);
 736        }
 737        fni(t1, t0, c);
 738        tcg_gen_st_i32(t1, cpu_env, dofs + i);
 739    }
 740    tcg_temp_free_i32(t0);
 741    tcg_temp_free_i32(t1);
 742}
 743
 744static void expand_2s_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
 745                          TCGv_i32 c, bool scalar_first,
 746                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
 747{
 748    TCGv_i32 t0 = tcg_temp_new_i32();
 749    TCGv_i32 t1 = tcg_temp_new_i32();
 750    uint32_t i;
 751
 752    for (i = 0; i < oprsz; i += 4) {
 753        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
 754        if (scalar_first) {
 755            fni(t1, c, t0);
 756        } else {
 757            fni(t1, t0, c);
 758        }
 759        tcg_gen_st_i32(t1, cpu_env, dofs + i);
 760    }
 761    tcg_temp_free_i32(t0);
 762    tcg_temp_free_i32(t1);
 763}
 764
 765/* Expand OPRSZ bytes worth of three-operand operations using i32 elements.  */
 766static void expand_3_i32(uint32_t dofs, uint32_t aofs,
 767                         uint32_t bofs, uint32_t oprsz, bool load_dest,
 768                         void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
 769{
 770    TCGv_i32 t0 = tcg_temp_new_i32();
 771    TCGv_i32 t1 = tcg_temp_new_i32();
 772    TCGv_i32 t2 = tcg_temp_new_i32();
 773    uint32_t i;
 774
 775    for (i = 0; i < oprsz; i += 4) {
 776        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
 777        tcg_gen_ld_i32(t1, cpu_env, bofs + i);
 778        if (load_dest) {
 779            tcg_gen_ld_i32(t2, cpu_env, dofs + i);
 780        }
 781        fni(t2, t0, t1);
 782        tcg_gen_st_i32(t2, cpu_env, dofs + i);
 783    }
 784    tcg_temp_free_i32(t2);
 785    tcg_temp_free_i32(t1);
 786    tcg_temp_free_i32(t0);
 787}
 788
 789static void expand_3i_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 790                          uint32_t oprsz, int32_t c, bool load_dest,
 791                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, int32_t))
 792{
 793    TCGv_i32 t0 = tcg_temp_new_i32();
 794    TCGv_i32 t1 = tcg_temp_new_i32();
 795    TCGv_i32 t2 = tcg_temp_new_i32();
 796    uint32_t i;
 797
 798    for (i = 0; i < oprsz; i += 4) {
 799        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
 800        tcg_gen_ld_i32(t1, cpu_env, bofs + i);
 801        if (load_dest) {
 802            tcg_gen_ld_i32(t2, cpu_env, dofs + i);
 803        }
 804        fni(t2, t0, t1, c);
 805        tcg_gen_st_i32(t2, cpu_env, dofs + i);
 806    }
 807    tcg_temp_free_i32(t0);
 808    tcg_temp_free_i32(t1);
 809    tcg_temp_free_i32(t2);
 810}
 811
 812/* Expand OPRSZ bytes worth of four-operand operations using i32 elements.  */
 813static void expand_4_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 814                         uint32_t cofs, uint32_t oprsz, bool write_aofs,
 815                         void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32))
 816{
 817    TCGv_i32 t0 = tcg_temp_new_i32();
 818    TCGv_i32 t1 = tcg_temp_new_i32();
 819    TCGv_i32 t2 = tcg_temp_new_i32();
 820    TCGv_i32 t3 = tcg_temp_new_i32();
 821    uint32_t i;
 822
 823    for (i = 0; i < oprsz; i += 4) {
 824        tcg_gen_ld_i32(t1, cpu_env, aofs + i);
 825        tcg_gen_ld_i32(t2, cpu_env, bofs + i);
 826        tcg_gen_ld_i32(t3, cpu_env, cofs + i);
 827        fni(t0, t1, t2, t3);
 828        tcg_gen_st_i32(t0, cpu_env, dofs + i);
 829        if (write_aofs) {
 830            tcg_gen_st_i32(t1, cpu_env, aofs + i);
 831        }
 832    }
 833    tcg_temp_free_i32(t3);
 834    tcg_temp_free_i32(t2);
 835    tcg_temp_free_i32(t1);
 836    tcg_temp_free_i32(t0);
 837}
 838
 839static void expand_4i_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 840                          uint32_t cofs, uint32_t oprsz, int32_t c,
 841                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32,
 842                                      int32_t))
 843{
 844    TCGv_i32 t0 = tcg_temp_new_i32();
 845    TCGv_i32 t1 = tcg_temp_new_i32();
 846    TCGv_i32 t2 = tcg_temp_new_i32();
 847    TCGv_i32 t3 = tcg_temp_new_i32();
 848    uint32_t i;
 849
 850    for (i = 0; i < oprsz; i += 4) {
 851        tcg_gen_ld_i32(t1, cpu_env, aofs + i);
 852        tcg_gen_ld_i32(t2, cpu_env, bofs + i);
 853        tcg_gen_ld_i32(t3, cpu_env, cofs + i);
 854        fni(t0, t1, t2, t3, c);
 855        tcg_gen_st_i32(t0, cpu_env, dofs + i);
 856    }
 857    tcg_temp_free_i32(t3);
 858    tcg_temp_free_i32(t2);
 859    tcg_temp_free_i32(t1);
 860    tcg_temp_free_i32(t0);
 861}
 862
 863/* Expand OPRSZ bytes worth of two-operand operations using i64 elements.  */
 864static void expand_2_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
 865                         bool load_dest, void (*fni)(TCGv_i64, TCGv_i64))
 866{
 867    TCGv_i64 t0 = tcg_temp_new_i64();
 868    TCGv_i64 t1 = tcg_temp_new_i64();
 869    uint32_t i;
 870
 871    for (i = 0; i < oprsz; i += 8) {
 872        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
 873        if (load_dest) {
 874            tcg_gen_ld_i64(t1, cpu_env, dofs + i);
 875        }
 876        fni(t1, t0);
 877        tcg_gen_st_i64(t1, cpu_env, dofs + i);
 878    }
 879    tcg_temp_free_i64(t0);
 880    tcg_temp_free_i64(t1);
 881}
 882
 883static void expand_2i_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
 884                          int64_t c, bool load_dest,
 885                          void (*fni)(TCGv_i64, TCGv_i64, int64_t))
 886{
 887    TCGv_i64 t0 = tcg_temp_new_i64();
 888    TCGv_i64 t1 = tcg_temp_new_i64();
 889    uint32_t i;
 890
 891    for (i = 0; i < oprsz; i += 8) {
 892        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
 893        if (load_dest) {
 894            tcg_gen_ld_i64(t1, cpu_env, dofs + i);
 895        }
 896        fni(t1, t0, c);
 897        tcg_gen_st_i64(t1, cpu_env, dofs + i);
 898    }
 899    tcg_temp_free_i64(t0);
 900    tcg_temp_free_i64(t1);
 901}
 902
 903static void expand_2s_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
 904                          TCGv_i64 c, bool scalar_first,
 905                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
 906{
 907    TCGv_i64 t0 = tcg_temp_new_i64();
 908    TCGv_i64 t1 = tcg_temp_new_i64();
 909    uint32_t i;
 910
 911    for (i = 0; i < oprsz; i += 8) {
 912        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
 913        if (scalar_first) {
 914            fni(t1, c, t0);
 915        } else {
 916            fni(t1, t0, c);
 917        }
 918        tcg_gen_st_i64(t1, cpu_env, dofs + i);
 919    }
 920    tcg_temp_free_i64(t0);
 921    tcg_temp_free_i64(t1);
 922}
 923
 924/* Expand OPRSZ bytes worth of three-operand operations using i64 elements.  */
 925static void expand_3_i64(uint32_t dofs, uint32_t aofs,
 926                         uint32_t bofs, uint32_t oprsz, bool load_dest,
 927                         void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
 928{
 929    TCGv_i64 t0 = tcg_temp_new_i64();
 930    TCGv_i64 t1 = tcg_temp_new_i64();
 931    TCGv_i64 t2 = tcg_temp_new_i64();
 932    uint32_t i;
 933
 934    for (i = 0; i < oprsz; i += 8) {
 935        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
 936        tcg_gen_ld_i64(t1, cpu_env, bofs + i);
 937        if (load_dest) {
 938            tcg_gen_ld_i64(t2, cpu_env, dofs + i);
 939        }
 940        fni(t2, t0, t1);
 941        tcg_gen_st_i64(t2, cpu_env, dofs + i);
 942    }
 943    tcg_temp_free_i64(t2);
 944    tcg_temp_free_i64(t1);
 945    tcg_temp_free_i64(t0);
 946}
 947
 948static void expand_3i_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 949                          uint32_t oprsz, int64_t c, bool load_dest,
 950                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, int64_t))
 951{
 952    TCGv_i64 t0 = tcg_temp_new_i64();
 953    TCGv_i64 t1 = tcg_temp_new_i64();
 954    TCGv_i64 t2 = tcg_temp_new_i64();
 955    uint32_t i;
 956
 957    for (i = 0; i < oprsz; i += 8) {
 958        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
 959        tcg_gen_ld_i64(t1, cpu_env, bofs + i);
 960        if (load_dest) {
 961            tcg_gen_ld_i64(t2, cpu_env, dofs + i);
 962        }
 963        fni(t2, t0, t1, c);
 964        tcg_gen_st_i64(t2, cpu_env, dofs + i);
 965    }
 966    tcg_temp_free_i64(t0);
 967    tcg_temp_free_i64(t1);
 968    tcg_temp_free_i64(t2);
 969}
 970
 971/* Expand OPRSZ bytes worth of four-operand operations using i64 elements.  */
 972static void expand_4_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 973                         uint32_t cofs, uint32_t oprsz, bool write_aofs,
 974                         void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64))
 975{
 976    TCGv_i64 t0 = tcg_temp_new_i64();
 977    TCGv_i64 t1 = tcg_temp_new_i64();
 978    TCGv_i64 t2 = tcg_temp_new_i64();
 979    TCGv_i64 t3 = tcg_temp_new_i64();
 980    uint32_t i;
 981
 982    for (i = 0; i < oprsz; i += 8) {
 983        tcg_gen_ld_i64(t1, cpu_env, aofs + i);
 984        tcg_gen_ld_i64(t2, cpu_env, bofs + i);
 985        tcg_gen_ld_i64(t3, cpu_env, cofs + i);
 986        fni(t0, t1, t2, t3);
 987        tcg_gen_st_i64(t0, cpu_env, dofs + i);
 988        if (write_aofs) {
 989            tcg_gen_st_i64(t1, cpu_env, aofs + i);
 990        }
 991    }
 992    tcg_temp_free_i64(t3);
 993    tcg_temp_free_i64(t2);
 994    tcg_temp_free_i64(t1);
 995    tcg_temp_free_i64(t0);
 996}
 997
 998static void expand_4i_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 999                          uint32_t cofs, uint32_t oprsz, int64_t c,
1000                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64,
1001                                      int64_t))
1002{
1003    TCGv_i64 t0 = tcg_temp_new_i64();
1004    TCGv_i64 t1 = tcg_temp_new_i64();
1005    TCGv_i64 t2 = tcg_temp_new_i64();
1006    TCGv_i64 t3 = tcg_temp_new_i64();
1007    uint32_t i;
1008
1009    for (i = 0; i < oprsz; i += 8) {
1010        tcg_gen_ld_i64(t1, cpu_env, aofs + i);
1011        tcg_gen_ld_i64(t2, cpu_env, bofs + i);
1012        tcg_gen_ld_i64(t3, cpu_env, cofs + i);
1013        fni(t0, t1, t2, t3, c);
1014        tcg_gen_st_i64(t0, cpu_env, dofs + i);
1015    }
1016    tcg_temp_free_i64(t3);
1017    tcg_temp_free_i64(t2);
1018    tcg_temp_free_i64(t1);
1019    tcg_temp_free_i64(t0);
1020}
1021
1022/* Expand OPRSZ bytes worth of two-operand operations using host vectors.  */
1023static void expand_2_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1024                         uint32_t oprsz, uint32_t tysz, TCGType type,
1025                         bool load_dest,
1026                         void (*fni)(unsigned, TCGv_vec, TCGv_vec))
1027{
1028    TCGv_vec t0 = tcg_temp_new_vec(type);
1029    TCGv_vec t1 = tcg_temp_new_vec(type);
1030    uint32_t i;
1031
1032    for (i = 0; i < oprsz; i += tysz) {
1033        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
1034        if (load_dest) {
1035            tcg_gen_ld_vec(t1, cpu_env, dofs + i);
1036        }
1037        fni(vece, t1, t0);
1038        tcg_gen_st_vec(t1, cpu_env, dofs + i);
1039    }
1040    tcg_temp_free_vec(t0);
1041    tcg_temp_free_vec(t1);
1042}
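
/*
 * TYSZ is the byte width of TYPE (8 for V64, 16 for V128, 32 for V256);
 * the callers below always pass a matching pair, so each iteration of
 * these loops loads, transforms and stores one full host vector.
 */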
1043
1044/* Expand OPRSZ bytes worth of two-vector operands and an immediate operand
1045   using host vectors.  */
1046static void expand_2i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1047                          uint32_t oprsz, uint32_t tysz, TCGType type,
1048                          int64_t c, bool load_dest,
1049                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, int64_t))
1050{
1051    TCGv_vec t0 = tcg_temp_new_vec(type);
1052    TCGv_vec t1 = tcg_temp_new_vec(type);
1053    uint32_t i;
1054
1055    for (i = 0; i < oprsz; i += tysz) {
1056        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
1057        if (load_dest) {
1058            tcg_gen_ld_vec(t1, cpu_env, dofs + i);
1059        }
1060        fni(vece, t1, t0, c);
1061        tcg_gen_st_vec(t1, cpu_env, dofs + i);
1062    }
1063    tcg_temp_free_vec(t0);
1064    tcg_temp_free_vec(t1);
1065}
1066
1067static void expand_2s_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1068                          uint32_t oprsz, uint32_t tysz, TCGType type,
1069                          TCGv_vec c, bool scalar_first,
1070                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
1071{
1072    TCGv_vec t0 = tcg_temp_new_vec(type);
1073    TCGv_vec t1 = tcg_temp_new_vec(type);
1074    uint32_t i;
1075
1076    for (i = 0; i < oprsz; i += tysz) {
1077        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
1078        if (scalar_first) {
1079            fni(vece, t1, c, t0);
1080        } else {
1081            fni(vece, t1, t0, c);
1082        }
1083        tcg_gen_st_vec(t1, cpu_env, dofs + i);
1084    }
1085    tcg_temp_free_vec(t0);
1086    tcg_temp_free_vec(t1);
1087}
1088
1089/* Expand OPRSZ bytes worth of three-operand operations using host vectors.  */
1090static void expand_3_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1091                         uint32_t bofs, uint32_t oprsz,
1092                         uint32_t tysz, TCGType type, bool load_dest,
1093                         void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
1094{
1095    TCGv_vec t0 = tcg_temp_new_vec(type);
1096    TCGv_vec t1 = tcg_temp_new_vec(type);
1097    TCGv_vec t2 = tcg_temp_new_vec(type);
1098    uint32_t i;
1099
1100    for (i = 0; i < oprsz; i += tysz) {
1101        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
1102        tcg_gen_ld_vec(t1, cpu_env, bofs + i);
1103        if (load_dest) {
1104            tcg_gen_ld_vec(t2, cpu_env, dofs + i);
1105        }
1106        fni(vece, t2, t0, t1);
1107        tcg_gen_st_vec(t2, cpu_env, dofs + i);
1108    }
1109    tcg_temp_free_vec(t2);
1110    tcg_temp_free_vec(t1);
1111    tcg_temp_free_vec(t0);
1112}
1113
1114/*
1115 * Expand OPRSZ bytes worth of three-vector operands and an immediate operand
1116 * using host vectors.
1117 */
1118static void expand_3i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1119                          uint32_t bofs, uint32_t oprsz, uint32_t tysz,
1120                          TCGType type, int64_t c, bool load_dest,
1121                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec,
1122                                      int64_t))
1123{
1124    TCGv_vec t0 = tcg_temp_new_vec(type);
1125    TCGv_vec t1 = tcg_temp_new_vec(type);
1126    TCGv_vec t2 = tcg_temp_new_vec(type);
1127    uint32_t i;
1128
1129    for (i = 0; i < oprsz; i += tysz) {
1130        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
1131        tcg_gen_ld_vec(t1, cpu_env, bofs + i);
1132        if (load_dest) {
1133            tcg_gen_ld_vec(t2, cpu_env, dofs + i);
1134        }
1135        fni(vece, t2, t0, t1, c);
1136        tcg_gen_st_vec(t2, cpu_env, dofs + i);
1137    }
1138    tcg_temp_free_vec(t0);
1139    tcg_temp_free_vec(t1);
1140    tcg_temp_free_vec(t2);
1141}
1142
1143/* Expand OPRSZ bytes worth of four-operand operations using host vectors.  */
1144static void expand_4_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1145                         uint32_t bofs, uint32_t cofs, uint32_t oprsz,
1146                         uint32_t tysz, TCGType type, bool write_aofs,
1147                         void (*fni)(unsigned, TCGv_vec, TCGv_vec,
1148                                     TCGv_vec, TCGv_vec))
1149{
1150    TCGv_vec t0 = tcg_temp_new_vec(type);
1151    TCGv_vec t1 = tcg_temp_new_vec(type);
1152    TCGv_vec t2 = tcg_temp_new_vec(type);
1153    TCGv_vec t3 = tcg_temp_new_vec(type);
1154    uint32_t i;
1155
1156    for (i = 0; i < oprsz; i += tysz) {
1157        tcg_gen_ld_vec(t1, cpu_env, aofs + i);
1158        tcg_gen_ld_vec(t2, cpu_env, bofs + i);
1159        tcg_gen_ld_vec(t3, cpu_env, cofs + i);
1160        fni(vece, t0, t1, t2, t3);
1161        tcg_gen_st_vec(t0, cpu_env, dofs + i);
1162        if (write_aofs) {
1163            tcg_gen_st_vec(t1, cpu_env, aofs + i);
1164        }
1165    }
1166    tcg_temp_free_vec(t3);
1167    tcg_temp_free_vec(t2);
1168    tcg_temp_free_vec(t1);
1169    tcg_temp_free_vec(t0);
1170}
1171
1172/*
1173 * Expand OPRSZ bytes worth of four-vector operands and an immediate operand
1174 * using host vectors.
1175 */
1176static void expand_4i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1177                          uint32_t bofs, uint32_t cofs, uint32_t oprsz,
1178                          uint32_t tysz, TCGType type, int64_t c,
1179                          void (*fni)(unsigned, TCGv_vec, TCGv_vec,
1180                                     TCGv_vec, TCGv_vec, int64_t))
1181{
1182    TCGv_vec t0 = tcg_temp_new_vec(type);
1183    TCGv_vec t1 = tcg_temp_new_vec(type);
1184    TCGv_vec t2 = tcg_temp_new_vec(type);
1185    TCGv_vec t3 = tcg_temp_new_vec(type);
1186    uint32_t i;
1187
1188    for (i = 0; i < oprsz; i += tysz) {
1189        tcg_gen_ld_vec(t1, cpu_env, aofs + i);
1190        tcg_gen_ld_vec(t2, cpu_env, bofs + i);
1191        tcg_gen_ld_vec(t3, cpu_env, cofs + i);
1192        fni(vece, t0, t1, t2, t3, c);
1193        tcg_gen_st_vec(t0, cpu_env, dofs + i);
1194    }
1195    tcg_temp_free_vec(t3);
1196    tcg_temp_free_vec(t2);
1197    tcg_temp_free_vec(t1);
1198    tcg_temp_free_vec(t0);
1199}
1200
1201/* Expand a vector two-operand operation.  */
1202void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
1203                    uint32_t oprsz, uint32_t maxsz, const GVecGen2 *g)
1204{
1205    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1206    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1207    TCGType type;
1208    uint32_t some;
1209
1210    check_size_align(oprsz, maxsz, dofs | aofs);
1211    check_overlap_2(dofs, aofs, maxsz);
1212
1213    type = 0;
1214    if (g->fniv) {
1215        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1216    }
1217    switch (type) {
1218    case TCG_TYPE_V256:
1219        /* Recall that ARM SVE allows vector sizes that are not a
1220         * power of 2, but always a multiple of 16.  The intent is
1221         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1222         */
1223        some = QEMU_ALIGN_DOWN(oprsz, 32);
1224        expand_2_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
1225                     g->load_dest, g->fniv);
1226        if (some == oprsz) {
1227            break;
1228        }
1229        dofs += some;
1230        aofs += some;
1231        oprsz -= some;
1232        maxsz -= some;
1233        /* fallthru */
1234    case TCG_TYPE_V128:
1235        expand_2_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
1236                     g->load_dest, g->fniv);
1237        break;
1238    case TCG_TYPE_V64:
1239        expand_2_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
1240                     g->load_dest, g->fniv);
1241        break;
1242
1243    case 0:
1244        if (g->fni8 && check_size_impl(oprsz, 8)) {
1245            expand_2_i64(dofs, aofs, oprsz, g->load_dest, g->fni8);
1246        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1247            expand_2_i32(dofs, aofs, oprsz, g->load_dest, g->fni4);
1248        } else {
1249            assert(g->fno != NULL);
1250            tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, g->data, g->fno);
1251            oprsz = maxsz;
1252        }
1253        break;
1254
1255    default:
1256        g_assert_not_reached();
1257    }
1258    tcg_swap_vecop_list(hold_list);
1259
1260    if (oprsz < maxsz) {
1261        expand_clr(dofs + oprsz, maxsz - oprsz);
1262    }
1263}
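
/*
 * Putting the pieces together for oprsz == 80 on a V256-capable host:
 * "some" is 64, so two 32-byte vector iterations are emitted first, the
 * offsets and sizes are advanced, and the fallthrough V128 case emits one
 * 16-byte iteration for the remainder.  When no vector type is usable,
 * case 0 falls back to i64/i32 expansion or the out-of-line helper, and
 * expand_clr() zeroes any bytes between oprsz and maxsz.
 */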
1264
1265/* Expand a vector operation with two vectors and an immediate.  */
1266void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
1267                     uint32_t maxsz, int64_t c, const GVecGen2i *g)
1268{
1269    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1270    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1271    TCGType type;
1272    uint32_t some;
1273
1274    check_size_align(oprsz, maxsz, dofs | aofs);
1275    check_overlap_2(dofs, aofs, maxsz);
1276
1277    type = 0;
1278    if (g->fniv) {
1279        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1280    }
1281    switch (type) {
1282    case TCG_TYPE_V256:
1283        /* Recall that ARM SVE allows vector sizes that are not a
1284         * power of 2, but always a multiple of 16.  The intent is
1285         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1286         */
1287        some = QEMU_ALIGN_DOWN(oprsz, 32);
1288        expand_2i_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
1289                      c, g->load_dest, g->fniv);
1290        if (some == oprsz) {
1291            break;
1292        }
1293        dofs += some;
1294        aofs += some;
1295        oprsz -= some;
1296        maxsz -= some;
1297        /* fallthru */
1298    case TCG_TYPE_V128:
1299        expand_2i_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
1300                      c, g->load_dest, g->fniv);
1301        break;
1302    case TCG_TYPE_V64:
1303        expand_2i_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
1304                      c, g->load_dest, g->fniv);
1305        break;
1306
1307    case 0:
1308        if (g->fni8 && check_size_impl(oprsz, 8)) {
1309            expand_2i_i64(dofs, aofs, oprsz, c, g->load_dest, g->fni8);
1310        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1311            expand_2i_i32(dofs, aofs, oprsz, c, g->load_dest, g->fni4);
1312        } else {
1313            if (g->fno) {
1314                tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, c, g->fno);
1315            } else {
1316                TCGv_i64 tcg_c = tcg_constant_i64(c);
1317                tcg_gen_gvec_2i_ool(dofs, aofs, tcg_c, oprsz,
1318                                    maxsz, c, g->fnoi);
1319            }
1320            oprsz = maxsz;
1321        }
1322        break;
1323
1324    default:
1325        g_assert_not_reached();
1326    }
1327    tcg_swap_vecop_list(hold_list);
1328
1329    if (oprsz < maxsz) {
1330        expand_clr(dofs + oprsz, maxsz - oprsz);
1331    }
1332}
1333
1334/* Expand a vector operation with two vectors and a scalar.  */
1335void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
1336                     uint32_t maxsz, TCGv_i64 c, const GVecGen2s *g)
1337{
1338    TCGType type;
1339
1340    check_size_align(oprsz, maxsz, dofs | aofs);
1341    check_overlap_2(dofs, aofs, maxsz);
1342
1343    type = 0;
1344    if (g->fniv) {
1345        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1346    }
1347    if (type != 0) {
1348        const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1349        const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1350        TCGv_vec t_vec = tcg_temp_new_vec(type);
1351        uint32_t some;
1352
1353        tcg_gen_dup_i64_vec(g->vece, t_vec, c);
1354
1355        switch (type) {
1356        case TCG_TYPE_V256:
1357            /* Recall that ARM SVE allows vector sizes that are not a
1358             * power of 2, but always a multiple of 16.  The intent is
1359             * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1360             */
1361            some = QEMU_ALIGN_DOWN(oprsz, 32);
1362            expand_2s_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
1363                          t_vec, g->scalar_first, g->fniv);
1364            if (some == oprsz) {
1365                break;
1366            }
1367            dofs += some;
1368            aofs += some;
1369            oprsz -= some;
1370            maxsz -= some;
1371            /* fallthru */
1372
1373        case TCG_TYPE_V128:
1374            expand_2s_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
1375                          t_vec, g->scalar_first, g->fniv);
1376            break;
1377
1378        case TCG_TYPE_V64:
1379            expand_2s_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
1380                          t_vec, g->scalar_first, g->fniv);
1381            break;
1382
1383        default:
1384            g_assert_not_reached();
1385        }
1386        tcg_temp_free_vec(t_vec);
1387        tcg_swap_vecop_list(hold_list);
1388    } else if (g->fni8 && check_size_impl(oprsz, 8)) {
1389        TCGv_i64 t64 = tcg_temp_new_i64();
1390
1391        tcg_gen_dup_i64(g->vece, t64, c);
1392        expand_2s_i64(dofs, aofs, oprsz, t64, g->scalar_first, g->fni8);
1393        tcg_temp_free_i64(t64);
1394    } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1395        TCGv_i32 t32 = tcg_temp_new_i32();
1396
1397        tcg_gen_extrl_i64_i32(t32, c);
1398        tcg_gen_dup_i32(g->vece, t32, t32);
1399        expand_2s_i32(dofs, aofs, oprsz, t32, g->scalar_first, g->fni4);
1400        tcg_temp_free_i32(t32);
1401    } else {
1402        tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, 0, g->fno);
1403        return;
1404    }
1405
1406    if (oprsz < maxsz) {
1407        expand_clr(dofs + oprsz, maxsz - oprsz);
1408    }
1409}
1410
1411/* Expand a vector three-operand operation.  */
1412void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
1413                    uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g)
1414{
1415    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1416    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1417    TCGType type;
1418    uint32_t some;
1419
1420    check_size_align(oprsz, maxsz, dofs | aofs | bofs);
1421    check_overlap_3(dofs, aofs, bofs, maxsz);
1422
1423    type = 0;
1424    if (g->fniv) {
1425        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1426    }
1427    switch (type) {
1428    case TCG_TYPE_V256:
1429        /* Recall that ARM SVE allows vector sizes that are not a
1430         * power of 2, but always a multiple of 16.  The intent is
1431         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1432         */
1433        some = QEMU_ALIGN_DOWN(oprsz, 32);
1434        expand_3_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
1435                     g->load_dest, g->fniv);
1436        if (some == oprsz) {
1437            break;
1438        }
1439        dofs += some;
1440        aofs += some;
1441        bofs += some;
1442        oprsz -= some;
1443        maxsz -= some;
1444        /* fallthru */
1445    case TCG_TYPE_V128:
1446        expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
1447                     g->load_dest, g->fniv);
1448        break;
1449    case TCG_TYPE_V64:
1450        expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
1451                     g->load_dest, g->fniv);
1452        break;
1453
1454    case 0:
1455        if (g->fni8 && check_size_impl(oprsz, 8)) {
1456            expand_3_i64(dofs, aofs, bofs, oprsz, g->load_dest, g->fni8);
1457        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1458            expand_3_i32(dofs, aofs, bofs, oprsz, g->load_dest, g->fni4);
1459        } else {
1460            assert(g->fno != NULL);
1461            tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz,
1462                               maxsz, g->data, g->fno);
1463            oprsz = maxsz;
1464        }
1465        break;
1466
1467    default:
1468        g_assert_not_reached();
1469    }
1470    tcg_swap_vecop_list(hold_list);
1471
1472    if (oprsz < maxsz) {
1473        expand_clr(dofs + oprsz, maxsz - oprsz);
1474    }
1475}
1476
1477/* Expand a vector operation with three vectors and an immediate.  */
1478void tcg_gen_gvec_3i(uint32_t dofs, uint32_t aofs, uint32_t bofs,
1479                     uint32_t oprsz, uint32_t maxsz, int64_t c,
1480                     const GVecGen3i *g)
1481{
1482    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1483    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1484    TCGType type;
1485    uint32_t some;
1486
1487    check_size_align(oprsz, maxsz, dofs | aofs | bofs);
1488    check_overlap_3(dofs, aofs, bofs, maxsz);
1489
1490    type = 0;
1491    if (g->fniv) {
1492        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1493    }
1494    switch (type) {
1495    case TCG_TYPE_V256:
1496        /*
1497         * Recall that ARM SVE allows vector sizes that are not a
1498         * power of 2, but always a multiple of 16.  The intent is
1499         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1500         */
1501        some = QEMU_ALIGN_DOWN(oprsz, 32);
1502        expand_3i_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
1503                      c, g->load_dest, g->fniv);
1504        if (some == oprsz) {
1505            break;
1506        }
1507        dofs += some;
1508        aofs += some;
1509        bofs += some;
1510        oprsz -= some;
1511        maxsz -= some;
1512        /* fallthru */
1513    case TCG_TYPE_V128:
1514        expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
1515                      c, g->load_dest, g->fniv);
1516        break;
1517    case TCG_TYPE_V64:
1518        expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
1519                      c, g->load_dest, g->fniv);
1520        break;
1521
1522    case 0:
1523        if (g->fni8 && check_size_impl(oprsz, 8)) {
1524            expand_3i_i64(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni8);
1525        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1526            expand_3i_i32(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni4);
1527        } else {
1528            assert(g->fno != NULL);
1529            tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, c, g->fno);
1530            oprsz = maxsz;
1531        }
1532        break;
1533
1534    default:
1535        g_assert_not_reached();
1536    }
1537    tcg_swap_vecop_list(hold_list);
1538
1539    if (oprsz < maxsz) {
1540        expand_clr(dofs + oprsz, maxsz - oprsz);
1541    }
1542}
1543
1544/* Expand a vector four-operand operation.  */
1545void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
1546                    uint32_t oprsz, uint32_t maxsz, const GVecGen4 *g)
1547{
1548    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1549    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1550    TCGType type;
1551    uint32_t some;
1552
1553    check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs);
1554    check_overlap_4(dofs, aofs, bofs, cofs, maxsz);
1555
1556    type = 0;
1557    if (g->fniv) {
1558        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1559    }
1560    switch (type) {
1561    case TCG_TYPE_V256:
1562        /* Recall that ARM SVE allows vector sizes that are not a
1563         * power of 2, but always a multiple of 16.  The intent is
1564         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1565         */
1566        some = QEMU_ALIGN_DOWN(oprsz, 32);
1567        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, some,
1568                     32, TCG_TYPE_V256, g->write_aofs, g->fniv);
1569        if (some == oprsz) {
1570            break;
1571        }
1572        dofs += some;
1573        aofs += some;
1574        bofs += some;
1575        cofs += some;
1576        oprsz -= some;
1577        maxsz -= some;
1578        /* fallthru */
1579    case TCG_TYPE_V128:
1580        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
1581                     16, TCG_TYPE_V128, g->write_aofs, g->fniv);
1582        break;
1583    case TCG_TYPE_V64:
1584        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
1585                     8, TCG_TYPE_V64, g->write_aofs, g->fniv);
1586        break;
1587
1588    case 0:
1589        if (g->fni8 && check_size_impl(oprsz, 8)) {
1590            expand_4_i64(dofs, aofs, bofs, cofs, oprsz,
1591                         g->write_aofs, g->fni8);
1592        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1593            expand_4_i32(dofs, aofs, bofs, cofs, oprsz,
1594                         g->write_aofs, g->fni4);
1595        } else {
1596            assert(g->fno != NULL);
1597            tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs,
1598                               oprsz, maxsz, g->data, g->fno);
1599            oprsz = maxsz;
1600        }
1601        break;
1602
1603    default:
1604        g_assert_not_reached();
1605    }
1606    tcg_swap_vecop_list(hold_list);
1607
1608    if (oprsz < maxsz) {
1609        expand_clr(dofs + oprsz, maxsz - oprsz);
1610    }
1611}
1612
1613/* Expand a vector four-operand operation with an immediate operand.  */
1614void tcg_gen_gvec_4i(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
1615                     uint32_t oprsz, uint32_t maxsz, int64_t c,
1616                     const GVecGen4i *g)
1617{
1618    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1619    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1620    TCGType type;
1621    uint32_t some;
1622
1623    check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs);
1624    check_overlap_4(dofs, aofs, bofs, cofs, maxsz);
1625
1626    type = 0;
1627    if (g->fniv) {
1628        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1629    }
1630    switch (type) {
1631    case TCG_TYPE_V256:
1632        /*
1633         * Recall that ARM SVE allows vector sizes that are not a
1634         * power of 2, but always a multiple of 16.  The intent is
1635         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1636         */
1637        some = QEMU_ALIGN_DOWN(oprsz, 32);
1638        expand_4i_vec(g->vece, dofs, aofs, bofs, cofs, some,
1639                      32, TCG_TYPE_V256, c, g->fniv);
1640        if (some == oprsz) {
1641            break;
1642        }
1643        dofs += some;
1644        aofs += some;
1645        bofs += some;
1646        cofs += some;
1647        oprsz -= some;
1648        maxsz -= some;
1649        /* fallthru */
1650    case TCG_TYPE_V128:
1651        expand_4i_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
1652                      16, TCG_TYPE_V128, c, g->fniv);
1653        break;
1654    case TCG_TYPE_V64:
1655        expand_4i_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
1656                      8, TCG_TYPE_V64, c, g->fniv);
1657        break;
1658
1659    case 0:
1660        if (g->fni8 && check_size_impl(oprsz, 8)) {
1661            expand_4i_i64(dofs, aofs, bofs, cofs, oprsz, c, g->fni8);
1662        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1663            expand_4i_i32(dofs, aofs, bofs, cofs, oprsz, c, g->fni4);
1664        } else {
1665            assert(g->fno != NULL);
1666            tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs,
1667                               oprsz, maxsz, c, g->fno);
1668            oprsz = maxsz;
1669        }
1670        break;
1671
1672    default:
1673        g_assert_not_reached();
1674    }
1675    tcg_swap_vecop_list(hold_list);
1676
1677    if (oprsz < maxsz) {
1678        expand_clr(dofs + oprsz, maxsz - oprsz);
1679    }
1680}
1681
1682/*
1683 * Expand specific vector operations.
1684 */
1685
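    /* An fniv-style wrapper around tcg_gen_mov_vec; the element size
       is irrelevant for a move.  */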
1686static void vec_mov2(unsigned vece, TCGv_vec a, TCGv_vec b)
1687{
1688    tcg_gen_mov_vec(a, b);
1689}
1690
1691void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs,
1692                      uint32_t oprsz, uint32_t maxsz)
1693{
1694    static const GVecGen2 g = {
1695        .fni8 = tcg_gen_mov_i64,
1696        .fniv = vec_mov2,
1697        .fno = gen_helper_gvec_mov,
1698        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1699    };
1700    if (dofs != aofs) {
1701        tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
1702    } else {
1703        check_size_align(oprsz, maxsz, dofs);
1704        if (oprsz < maxsz) {
1705            expand_clr(dofs + oprsz, maxsz - oprsz);
1706        }
1707    }
1708}
1709
1710void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t oprsz,
1711                          uint32_t maxsz, TCGv_i32 in)
1712{
1713    check_size_align(oprsz, maxsz, dofs);
1714    tcg_debug_assert(vece <= MO_32);
1715    do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
1716}
1717
1718void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t oprsz,
1719                          uint32_t maxsz, TCGv_i64 in)
1720{
1721    check_size_align(oprsz, maxsz, dofs);
1722    tcg_debug_assert(vece <= MO_64);
1723    do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
1724}
1725
1726void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
1727                          uint32_t oprsz, uint32_t maxsz)
1728{
1729    check_size_align(oprsz, maxsz, dofs);
1730    if (vece <= MO_64) {
1731        TCGType type = choose_vector_type(NULL, vece, oprsz, 0);
1732        if (type != 0) {
1733            TCGv_vec t_vec = tcg_temp_new_vec(type);
1734            tcg_gen_dup_mem_vec(vece, t_vec, cpu_env, aofs);
1735            do_dup_store(type, dofs, oprsz, maxsz, t_vec);
1736            tcg_temp_free_vec(t_vec);
1737        } else if (vece <= MO_32) {
1738            TCGv_i32 in = tcg_temp_new_i32();
1739            switch (vece) {
1740            case MO_8:
1741                tcg_gen_ld8u_i32(in, cpu_env, aofs);
1742                break;
1743            case MO_16:
1744                tcg_gen_ld16u_i32(in, cpu_env, aofs);
1745                break;
1746            default:
1747                tcg_gen_ld_i32(in, cpu_env, aofs);
1748                break;
1749            }
1750            do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
1751            tcg_temp_free_i32(in);
1752        } else {
1753            TCGv_i64 in = tcg_temp_new_i64();
1754            tcg_gen_ld_i64(in, cpu_env, aofs);
1755            do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
1756            tcg_temp_free_i64(in);
1757        }
1758    } else if (vece == 4) {
1759        /* 128-bit duplicate.  */
1760        int i;
1761
1762        tcg_debug_assert(oprsz >= 16);
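            /*
             * If the source aliases the first destination element, that
             * element is already in place; the store loops below then
             * begin with the second element.
             */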
1763        if (TCG_TARGET_HAS_v128) {
1764            TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V128);
1765
1766            tcg_gen_ld_vec(in, cpu_env, aofs);
1767            for (i = (aofs == dofs) * 16; i < oprsz; i += 16) {
1768                tcg_gen_st_vec(in, cpu_env, dofs + i);
1769            }
1770            tcg_temp_free_vec(in);
1771        } else {
1772            TCGv_i64 in0 = tcg_temp_new_i64();
1773            TCGv_i64 in1 = tcg_temp_new_i64();
1774
1775            tcg_gen_ld_i64(in0, cpu_env, aofs);
1776            tcg_gen_ld_i64(in1, cpu_env, aofs + 8);
1777            for (i = (aofs == dofs) * 16; i < oprsz; i += 16) {
1778                tcg_gen_st_i64(in0, cpu_env, dofs + i);
1779                tcg_gen_st_i64(in1, cpu_env, dofs + i + 8);
1780            }
1781            tcg_temp_free_i64(in0);
1782            tcg_temp_free_i64(in1);
1783        }
1784        if (oprsz < maxsz) {
1785            expand_clr(dofs + oprsz, maxsz - oprsz);
1786        }
1787    } else if (vece == 5) {
1788        /* 256-bit duplicate.  */
1789        int i;
1790
1791        tcg_debug_assert(oprsz >= 32);
1792        tcg_debug_assert(oprsz % 32 == 0);
1793        if (TCG_TARGET_HAS_v256) {
1794            TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V256);
1795
1796            tcg_gen_ld_vec(in, cpu_env, aofs);
1797            for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
1798                tcg_gen_st_vec(in, cpu_env, dofs + i);
1799            }
1800            tcg_temp_free_vec(in);
1801        } else if (TCG_TARGET_HAS_v128) {
1802            TCGv_vec in0 = tcg_temp_new_vec(TCG_TYPE_V128);
1803            TCGv_vec in1 = tcg_temp_new_vec(TCG_TYPE_V128);
1804
1805            tcg_gen_ld_vec(in0, cpu_env, aofs);
1806            tcg_gen_ld_vec(in1, cpu_env, aofs + 16);
1807            for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
1808                tcg_gen_st_vec(in0, cpu_env, dofs + i);
1809                tcg_gen_st_vec(in1, cpu_env, dofs + i + 16);
1810            }
1811            tcg_temp_free_vec(in0);
1812            tcg_temp_free_vec(in1);
1813        } else {
1814            TCGv_i64 in[4];
1815            int j;
1816
1817            for (j = 0; j < 4; ++j) {
1818                in[j] = tcg_temp_new_i64();
1819                tcg_gen_ld_i64(in[j], cpu_env, aofs + j * 8);
1820            }
1821            for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
1822                for (j = 0; j < 4; ++j) {
1823                    tcg_gen_st_i64(in[j], cpu_env, dofs + i + j * 8);
1824                }
1825            }
1826            for (j = 0; j < 4; ++j) {
1827                tcg_temp_free_i64(in[j]);
1828            }
1829        }
1830        if (oprsz < maxsz) {
1831            expand_clr(dofs + oprsz, maxsz - oprsz);
1832        }
1833    } else {
1834        g_assert_not_reached();
1835    }
1836}
1837
1838void tcg_gen_gvec_dup_imm(unsigned vece, uint32_t dofs, uint32_t oprsz,
1839                          uint32_t maxsz, uint64_t x)
1840{
1841    check_size_align(oprsz, maxsz, dofs);
1842    do_dup(vece, dofs, oprsz, maxsz, NULL, NULL, x);
1843}
1844
1845void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs,
1846                      uint32_t oprsz, uint32_t maxsz)
1847{
1848    static const GVecGen2 g = {
1849        .fni8 = tcg_gen_not_i64,
1850        .fniv = tcg_gen_not_vec,
1851        .fno = gen_helper_gvec_not,
1852        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1853    };
1854    tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
1855}
1856
1857/* Perform a vector addition using normal addition and a mask.  The mask
1858   should be the sign bit of each lane.  This 6-operation form is more
1859   efficient than separate additions when there are 4 or more lanes in
1860   the 64-bit operation.  */
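    /* E.g. for MO_8 the mask is 0x80 replicated into every byte.  Clearing
       the sign bit of each lane in both addends ensures that the full-width
       add cannot carry across a lane boundary; the correct sign bit of each
       lane is a ^ b ^ carry-in, which the final xor with ((a ^ b) & m)
       restores.  */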
1861static void gen_addv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
1862{
1863    TCGv_i64 t1 = tcg_temp_new_i64();
1864    TCGv_i64 t2 = tcg_temp_new_i64();
1865    TCGv_i64 t3 = tcg_temp_new_i64();
1866
1867    tcg_gen_andc_i64(t1, a, m);
1868    tcg_gen_andc_i64(t2, b, m);
1869    tcg_gen_xor_i64(t3, a, b);
1870    tcg_gen_add_i64(d, t1, t2);
1871    tcg_gen_and_i64(t3, t3, m);
1872    tcg_gen_xor_i64(d, d, t3);
1873
1874    tcg_temp_free_i64(t1);
1875    tcg_temp_free_i64(t2);
1876    tcg_temp_free_i64(t3);
1877}
1878
1879void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1880{
1881    TCGv_i64 m = tcg_constant_i64(dup_const(MO_8, 0x80));
1882    gen_addv_mask(d, a, b, m);
1883}
1884
1885void tcg_gen_vec_add8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1886{
1887    TCGv_i32 m = tcg_constant_i32((int32_t)dup_const(MO_8, 0x80));
1888    TCGv_i32 t1 = tcg_temp_new_i32();
1889    TCGv_i32 t2 = tcg_temp_new_i32();
1890    TCGv_i32 t3 = tcg_temp_new_i32();
1891
1892    tcg_gen_andc_i32(t1, a, m);
1893    tcg_gen_andc_i32(t2, b, m);
1894    tcg_gen_xor_i32(t3, a, b);
1895    tcg_gen_add_i32(d, t1, t2);
1896    tcg_gen_and_i32(t3, t3, m);
1897    tcg_gen_xor_i32(d, d, t3);
1898
1899    tcg_temp_free_i32(t1);
1900    tcg_temp_free_i32(t2);
1901    tcg_temp_free_i32(t3);
1902}
1903
1904void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1905{
1906    TCGv_i64 m = tcg_constant_i64(dup_const(MO_16, 0x8000));
1907    gen_addv_mask(d, a, b, m);
1908}
1909
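    /* With only two lanes in the word, no mask is needed: the low lane
       comes from a plain add (the deposit discards its carry into the high
       half), and the high lane from adding b to a with a's low half
       cleared, which cannot carry out of the low half.  The other two-lane
       helpers below use the same scheme.  */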
1910void tcg_gen_vec_add16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1911{
1912    TCGv_i32 t1 = tcg_temp_new_i32();
1913    TCGv_i32 t2 = tcg_temp_new_i32();
1914
1915    tcg_gen_andi_i32(t1, a, ~0xffff);
1916    tcg_gen_add_i32(t2, a, b);
1917    tcg_gen_add_i32(t1, t1, b);
1918    tcg_gen_deposit_i32(d, t1, t2, 0, 16);
1919
1920    tcg_temp_free_i32(t1);
1921    tcg_temp_free_i32(t2);
1922}
1923
1924void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1925{
1926    TCGv_i64 t1 = tcg_temp_new_i64();
1927    TCGv_i64 t2 = tcg_temp_new_i64();
1928
1929    tcg_gen_andi_i64(t1, a, ~0xffffffffull);
1930    tcg_gen_add_i64(t2, a, b);
1931    tcg_gen_add_i64(t1, t1, b);
1932    tcg_gen_deposit_i64(d, t1, t2, 0, 32);
1933
1934    tcg_temp_free_i64(t1);
1935    tcg_temp_free_i64(t2);
1936}
1937
1938static const TCGOpcode vecop_list_add[] = { INDEX_op_add_vec, 0 };
1939
1940void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs,
1941                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1942{
1943    static const GVecGen3 g[4] = {
1944        { .fni8 = tcg_gen_vec_add8_i64,
1945          .fniv = tcg_gen_add_vec,
1946          .fno = gen_helper_gvec_add8,
1947          .opt_opc = vecop_list_add,
1948          .vece = MO_8 },
1949        { .fni8 = tcg_gen_vec_add16_i64,
1950          .fniv = tcg_gen_add_vec,
1951          .fno = gen_helper_gvec_add16,
1952          .opt_opc = vecop_list_add,
1953          .vece = MO_16 },
1954        { .fni4 = tcg_gen_add_i32,
1955          .fniv = tcg_gen_add_vec,
1956          .fno = gen_helper_gvec_add32,
1957          .opt_opc = vecop_list_add,
1958          .vece = MO_32 },
1959        { .fni8 = tcg_gen_add_i64,
1960          .fniv = tcg_gen_add_vec,
1961          .fno = gen_helper_gvec_add64,
1962          .opt_opc = vecop_list_add,
1963          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1964          .vece = MO_64 },
1965    };
1966
1967    tcg_debug_assert(vece <= MO_64);
1968    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1969}
1970
1971void tcg_gen_gvec_adds(unsigned vece, uint32_t dofs, uint32_t aofs,
1972                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1973{
1974    static const GVecGen2s g[4] = {
1975        { .fni8 = tcg_gen_vec_add8_i64,
1976          .fniv = tcg_gen_add_vec,
1977          .fno = gen_helper_gvec_adds8,
1978          .opt_opc = vecop_list_add,
1979          .vece = MO_8 },
1980        { .fni8 = tcg_gen_vec_add16_i64,
1981          .fniv = tcg_gen_add_vec,
1982          .fno = gen_helper_gvec_adds16,
1983          .opt_opc = vecop_list_add,
1984          .vece = MO_16 },
1985        { .fni4 = tcg_gen_add_i32,
1986          .fniv = tcg_gen_add_vec,
1987          .fno = gen_helper_gvec_adds32,
1988          .opt_opc = vecop_list_add,
1989          .vece = MO_32 },
1990        { .fni8 = tcg_gen_add_i64,
1991          .fniv = tcg_gen_add_vec,
1992          .fno = gen_helper_gvec_adds64,
1993          .opt_opc = vecop_list_add,
1994          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1995          .vece = MO_64 },
1996    };
1997
1998    tcg_debug_assert(vece <= MO_64);
1999    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
2000}
2001
2002void tcg_gen_gvec_addi(unsigned vece, uint32_t dofs, uint32_t aofs,
2003                       int64_t c, uint32_t oprsz, uint32_t maxsz)
2004{
2005    TCGv_i64 tmp = tcg_constant_i64(c);
2006    tcg_gen_gvec_adds(vece, dofs, aofs, tmp, oprsz, maxsz);
2007}
2008
2009static const TCGOpcode vecop_list_sub[] = { INDEX_op_sub_vec, 0 };
2010
2011void tcg_gen_gvec_subs(unsigned vece, uint32_t dofs, uint32_t aofs,
2012                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2013{
2014    static const GVecGen2s g[4] = {
2015        { .fni8 = tcg_gen_vec_sub8_i64,
2016          .fniv = tcg_gen_sub_vec,
2017          .fno = gen_helper_gvec_subs8,
2018          .opt_opc = vecop_list_sub,
2019          .vece = MO_8 },
2020        { .fni8 = tcg_gen_vec_sub16_i64,
2021          .fniv = tcg_gen_sub_vec,
2022          .fno = gen_helper_gvec_subs16,
2023          .opt_opc = vecop_list_sub,
2024          .vece = MO_16 },
2025        { .fni4 = tcg_gen_sub_i32,
2026          .fniv = tcg_gen_sub_vec,
2027          .fno = gen_helper_gvec_subs32,
2028          .opt_opc = vecop_list_sub,
2029          .vece = MO_32 },
2030        { .fni8 = tcg_gen_sub_i64,
2031          .fniv = tcg_gen_sub_vec,
2032          .fno = gen_helper_gvec_subs64,
2033          .opt_opc = vecop_list_sub,
2034          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2035          .vece = MO_64 },
2036    };
2037
2038    tcg_debug_assert(vece <= MO_64);
2039    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
2040}
2041
2042/* Perform a vector subtraction using normal subtraction and a mask.
2043   Compare gen_addv_mask above.  */
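    /* Forcing the sign bit of each minuend lane to 1 and clearing it in
       each subtrahend lane ensures the full-width subtract cannot borrow
       across a lane boundary; the sign bit of each lane is then off by
       ~(a ^ b), which the final xor with (eqv(a, b) & m) corrects.  */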
2044static void gen_subv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
2045{
2046    TCGv_i64 t1 = tcg_temp_new_i64();
2047    TCGv_i64 t2 = tcg_temp_new_i64();
2048    TCGv_i64 t3 = tcg_temp_new_i64();
2049
2050    tcg_gen_or_i64(t1, a, m);
2051    tcg_gen_andc_i64(t2, b, m);
2052    tcg_gen_eqv_i64(t3, a, b);
2053    tcg_gen_sub_i64(d, t1, t2);
2054    tcg_gen_and_i64(t3, t3, m);
2055    tcg_gen_xor_i64(d, d, t3);
2056
2057    tcg_temp_free_i64(t1);
2058    tcg_temp_free_i64(t2);
2059    tcg_temp_free_i64(t3);
2060}
2061
2062void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2063{
2064    TCGv_i64 m = tcg_constant_i64(dup_const(MO_8, 0x80));
2065    gen_subv_mask(d, a, b, m);
2066}
2067
2068void tcg_gen_vec_sub8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2069{
2070    TCGv_i32 m = tcg_constant_i32((int32_t)dup_const(MO_8, 0x80));
2071    TCGv_i32 t1 = tcg_temp_new_i32();
2072    TCGv_i32 t2 = tcg_temp_new_i32();
2073    TCGv_i32 t3 = tcg_temp_new_i32();
2074
2075    tcg_gen_or_i32(t1, a, m);
2076    tcg_gen_andc_i32(t2, b, m);
2077    tcg_gen_eqv_i32(t3, a, b);
2078    tcg_gen_sub_i32(d, t1, t2);
2079    tcg_gen_and_i32(t3, t3, m);
2080    tcg_gen_xor_i32(d, d, t3);
2081
2082    tcg_temp_free_i32(t1);
2083    tcg_temp_free_i32(t2);
2084    tcg_temp_free_i32(t3);
2085}
2086
2087void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2088{
2089    TCGv_i64 m = tcg_constant_i64(dup_const(MO_16, 0x8000));
2090    gen_subv_mask(d, a, b, m);
2091}
2092
2093void tcg_gen_vec_sub16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2094{
2095    TCGv_i32 t1 = tcg_temp_new_i32();
2096    TCGv_i32 t2 = tcg_temp_new_i32();
2097
2098    tcg_gen_andi_i32(t1, b, ~0xffff);
2099    tcg_gen_sub_i32(t2, a, b);
2100    tcg_gen_sub_i32(t1, a, t1);
2101    tcg_gen_deposit_i32(d, t1, t2, 0, 16);
2102
2103    tcg_temp_free_i32(t1);
2104    tcg_temp_free_i32(t2);
2105}
2106
2107void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2108{
2109    TCGv_i64 t1 = tcg_temp_new_i64();
2110    TCGv_i64 t2 = tcg_temp_new_i64();
2111
2112    tcg_gen_andi_i64(t1, b, ~0xffffffffull);
2113    tcg_gen_sub_i64(t2, a, b);
2114    tcg_gen_sub_i64(t1, a, t1);
2115    tcg_gen_deposit_i64(d, t1, t2, 0, 32);
2116
2117    tcg_temp_free_i64(t1);
2118    tcg_temp_free_i64(t2);
2119}
2120
2121void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs,
2122                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2123{
2124    static const GVecGen3 g[4] = {
2125        { .fni8 = tcg_gen_vec_sub8_i64,
2126          .fniv = tcg_gen_sub_vec,
2127          .fno = gen_helper_gvec_sub8,
2128          .opt_opc = vecop_list_sub,
2129          .vece = MO_8 },
2130        { .fni8 = tcg_gen_vec_sub16_i64,
2131          .fniv = tcg_gen_sub_vec,
2132          .fno = gen_helper_gvec_sub16,
2133          .opt_opc = vecop_list_sub,
2134          .vece = MO_16 },
2135        { .fni4 = tcg_gen_sub_i32,
2136          .fniv = tcg_gen_sub_vec,
2137          .fno = gen_helper_gvec_sub32,
2138          .opt_opc = vecop_list_sub,
2139          .vece = MO_32 },
2140        { .fni8 = tcg_gen_sub_i64,
2141          .fniv = tcg_gen_sub_vec,
2142          .fno = gen_helper_gvec_sub64,
2143          .opt_opc = vecop_list_sub,
2144          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2145          .vece = MO_64 },
2146    };
2147
2148    tcg_debug_assert(vece <= MO_64);
2149    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2150}
2151
2152static const TCGOpcode vecop_list_mul[] = { INDEX_op_mul_vec, 0 };
2153
2154void tcg_gen_gvec_mul(unsigned vece, uint32_t dofs, uint32_t aofs,
2155                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2156{
2157    static const GVecGen3 g[4] = {
2158        { .fniv = tcg_gen_mul_vec,
2159          .fno = gen_helper_gvec_mul8,
2160          .opt_opc = vecop_list_mul,
2161          .vece = MO_8 },
2162        { .fniv = tcg_gen_mul_vec,
2163          .fno = gen_helper_gvec_mul16,
2164          .opt_opc = vecop_list_mul,
2165          .vece = MO_16 },
2166        { .fni4 = tcg_gen_mul_i32,
2167          .fniv = tcg_gen_mul_vec,
2168          .fno = gen_helper_gvec_mul32,
2169          .opt_opc = vecop_list_mul,
2170          .vece = MO_32 },
2171        { .fni8 = tcg_gen_mul_i64,
2172          .fniv = tcg_gen_mul_vec,
2173          .fno = gen_helper_gvec_mul64,
2174          .opt_opc = vecop_list_mul,
2175          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2176          .vece = MO_64 },
2177    };
2178
2179    tcg_debug_assert(vece <= MO_64);
2180    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2181}
2182
2183void tcg_gen_gvec_muls(unsigned vece, uint32_t dofs, uint32_t aofs,
2184                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2185{
2186    static const GVecGen2s g[4] = {
2187        { .fniv = tcg_gen_mul_vec,
2188          .fno = gen_helper_gvec_muls8,
2189          .opt_opc = vecop_list_mul,
2190          .vece = MO_8 },
2191        { .fniv = tcg_gen_mul_vec,
2192          .fno = gen_helper_gvec_muls16,
2193          .opt_opc = vecop_list_mul,
2194          .vece = MO_16 },
2195        { .fni4 = tcg_gen_mul_i32,
2196          .fniv = tcg_gen_mul_vec,
2197          .fno = gen_helper_gvec_muls32,
2198          .opt_opc = vecop_list_mul,
2199          .vece = MO_32 },
2200        { .fni8 = tcg_gen_mul_i64,
2201          .fniv = tcg_gen_mul_vec,
2202          .fno = gen_helper_gvec_muls64,
2203          .opt_opc = vecop_list_mul,
2204          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2205          .vece = MO_64 },
2206    };
2207
2208    tcg_debug_assert(vece <= MO_64);
2209    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
2210}
2211
2212void tcg_gen_gvec_muli(unsigned vece, uint32_t dofs, uint32_t aofs,
2213                       int64_t c, uint32_t oprsz, uint32_t maxsz)
2214{
2215    TCGv_i64 tmp = tcg_constant_i64(c);
2216    tcg_gen_gvec_muls(vece, dofs, aofs, tmp, oprsz, maxsz);
2217}
2218
2219void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs,
2220                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2221{
2222    static const TCGOpcode vecop_list[] = { INDEX_op_ssadd_vec, 0 };
2223    static const GVecGen3 g[4] = {
2224        { .fniv = tcg_gen_ssadd_vec,
2225          .fno = gen_helper_gvec_ssadd8,
2226          .opt_opc = vecop_list,
2227          .vece = MO_8 },
2228        { .fniv = tcg_gen_ssadd_vec,
2229          .fno = gen_helper_gvec_ssadd16,
2230          .opt_opc = vecop_list,
2231          .vece = MO_16 },
2232        { .fniv = tcg_gen_ssadd_vec,
2233          .fno = gen_helper_gvec_ssadd32,
2234          .opt_opc = vecop_list,
2235          .vece = MO_32 },
2236        { .fniv = tcg_gen_ssadd_vec,
2237          .fno = gen_helper_gvec_ssadd64,
2238          .opt_opc = vecop_list,
2239          .vece = MO_64 },
2240    };
2241    tcg_debug_assert(vece <= MO_64);
2242    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2243}
2244
2245void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs,
2246                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2247{
2248    static const TCGOpcode vecop_list[] = { INDEX_op_sssub_vec, 0 };
2249    static const GVecGen3 g[4] = {
2250        { .fniv = tcg_gen_sssub_vec,
2251          .fno = gen_helper_gvec_sssub8,
2252          .opt_opc = vecop_list,
2253          .vece = MO_8 },
2254        { .fniv = tcg_gen_sssub_vec,
2255          .fno = gen_helper_gvec_sssub16,
2256          .opt_opc = vecop_list,
2257          .vece = MO_16 },
2258        { .fniv = tcg_gen_sssub_vec,
2259          .fno = gen_helper_gvec_sssub32,
2260          .opt_opc = vecop_list,
2261          .vece = MO_32 },
2262        { .fniv = tcg_gen_sssub_vec,
2263          .fno = gen_helper_gvec_sssub64,
2264          .opt_opc = vecop_list,
2265          .vece = MO_64 },
2266    };
2267    tcg_debug_assert(vece <= MO_64);
2268    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2269}
2270
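    /* Unsigned saturating addition: if the plain add wraps around (the
       result is less than one addend), clamp the result to all-ones.  */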
2271static void tcg_gen_usadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2272{
2273    TCGv_i32 max = tcg_constant_i32(-1);
2274    tcg_gen_add_i32(d, a, b);
2275    tcg_gen_movcond_i32(TCG_COND_LTU, d, d, a, max, d);
2276}
2277
2278static void tcg_gen_usadd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2279{
2280    TCGv_i64 max = tcg_constant_i64(-1);
2281    tcg_gen_add_i64(d, a, b);
2282    tcg_gen_movcond_i64(TCG_COND_LTU, d, d, a, max, d);
2283}
2284
2285void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs,
2286                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2287{
2288    static const TCGOpcode vecop_list[] = { INDEX_op_usadd_vec, 0 };
2289    static const GVecGen3 g[4] = {
2290        { .fniv = tcg_gen_usadd_vec,
2291          .fno = gen_helper_gvec_usadd8,
2292          .opt_opc = vecop_list,
2293          .vece = MO_8 },
2294        { .fniv = tcg_gen_usadd_vec,
2295          .fno = gen_helper_gvec_usadd16,
2296          .opt_opc = vecop_list,
2297          .vece = MO_16 },
2298        { .fni4 = tcg_gen_usadd_i32,
2299          .fniv = tcg_gen_usadd_vec,
2300          .fno = gen_helper_gvec_usadd32,
2301          .opt_opc = vecop_list,
2302          .vece = MO_32 },
2303        { .fni8 = tcg_gen_usadd_i64,
2304          .fniv = tcg_gen_usadd_vec,
2305          .fno = gen_helper_gvec_usadd64,
2306          .opt_opc = vecop_list,
2307          .vece = MO_64 }
2308    };
2309    tcg_debug_assert(vece <= MO_64);
2310    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2311}
2312
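    /* Unsigned saturating subtraction: if the subtrahend exceeds the
       minuend, clamp the result to zero.  */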
2313static void tcg_gen_ussub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2314{
2315    TCGv_i32 min = tcg_constant_i32(0);
2316    tcg_gen_sub_i32(d, a, b);
2317    tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, min, d);
2318}
2319
2320static void tcg_gen_ussub_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2321{
2322    TCGv_i64 min = tcg_constant_i64(0);
2323    tcg_gen_sub_i64(d, a, b);
2324    tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, min, d);
2325}
2326
2327void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs,
2328                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2329{
2330    static const TCGOpcode vecop_list[] = { INDEX_op_ussub_vec, 0 };
2331    static const GVecGen3 g[4] = {
2332        { .fniv = tcg_gen_ussub_vec,
2333          .fno = gen_helper_gvec_ussub8,
2334          .opt_opc = vecop_list,
2335          .vece = MO_8 },
2336        { .fniv = tcg_gen_ussub_vec,
2337          .fno = gen_helper_gvec_ussub16,
2338          .opt_opc = vecop_list,
2339          .vece = MO_16 },
2340        { .fni4 = tcg_gen_ussub_i32,
2341          .fniv = tcg_gen_ussub_vec,
2342          .fno = gen_helper_gvec_ussub32,
2343          .opt_opc = vecop_list,
2344          .vece = MO_32 },
2345        { .fni8 = tcg_gen_ussub_i64,
2346          .fniv = tcg_gen_ussub_vec,
2347          .fno = gen_helper_gvec_ussub64,
2348          .opt_opc = vecop_list,
2349          .vece = MO_64 }
2350    };
2351    tcg_debug_assert(vece <= MO_64);
2352    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2353}
2354
2355void tcg_gen_gvec_smin(unsigned vece, uint32_t dofs, uint32_t aofs,
2356                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2357{
2358    static const TCGOpcode vecop_list[] = { INDEX_op_smin_vec, 0 };
2359    static const GVecGen3 g[4] = {
2360        { .fniv = tcg_gen_smin_vec,
2361          .fno = gen_helper_gvec_smin8,
2362          .opt_opc = vecop_list,
2363          .vece = MO_8 },
2364        { .fniv = tcg_gen_smin_vec,
2365          .fno = gen_helper_gvec_smin16,
2366          .opt_opc = vecop_list,
2367          .vece = MO_16 },
2368        { .fni4 = tcg_gen_smin_i32,
2369          .fniv = tcg_gen_smin_vec,
2370          .fno = gen_helper_gvec_smin32,
2371          .opt_opc = vecop_list,
2372          .vece = MO_32 },
2373        { .fni8 = tcg_gen_smin_i64,
2374          .fniv = tcg_gen_smin_vec,
2375          .fno = gen_helper_gvec_smin64,
2376          .opt_opc = vecop_list,
2377          .vece = MO_64 }
2378    };
2379    tcg_debug_assert(vece <= MO_64);
2380    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2381}
2382
2383void tcg_gen_gvec_umin(unsigned vece, uint32_t dofs, uint32_t aofs,
2384                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2385{
2386    static const TCGOpcode vecop_list[] = { INDEX_op_umin_vec, 0 };
2387    static const GVecGen3 g[4] = {
2388        { .fniv = tcg_gen_umin_vec,
2389          .fno = gen_helper_gvec_umin8,
2390          .opt_opc = vecop_list,
2391          .vece = MO_8 },
2392        { .fniv = tcg_gen_umin_vec,
2393          .fno = gen_helper_gvec_umin16,
2394          .opt_opc = vecop_list,
2395          .vece = MO_16 },
2396        { .fni4 = tcg_gen_umin_i32,
2397          .fniv = tcg_gen_umin_vec,
2398          .fno = gen_helper_gvec_umin32,
2399          .opt_opc = vecop_list,
2400          .vece = MO_32 },
2401        { .fni8 = tcg_gen_umin_i64,
2402          .fniv = tcg_gen_umin_vec,
2403          .fno = gen_helper_gvec_umin64,
2404          .opt_opc = vecop_list,
2405          .vece = MO_64 }
2406    };
2407    tcg_debug_assert(vece <= MO_64);
2408    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2409}
2410
2411void tcg_gen_gvec_smax(unsigned vece, uint32_t dofs, uint32_t aofs,
2412                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2413{
2414    static const TCGOpcode vecop_list[] = { INDEX_op_smax_vec, 0 };
2415    static const GVecGen3 g[4] = {
2416        { .fniv = tcg_gen_smax_vec,
2417          .fno = gen_helper_gvec_smax8,
2418          .opt_opc = vecop_list,
2419          .vece = MO_8 },
2420        { .fniv = tcg_gen_smax_vec,
2421          .fno = gen_helper_gvec_smax16,
2422          .opt_opc = vecop_list,
2423          .vece = MO_16 },
2424        { .fni4 = tcg_gen_smax_i32,
2425          .fniv = tcg_gen_smax_vec,
2426          .fno = gen_helper_gvec_smax32,
2427          .opt_opc = vecop_list,
2428          .vece = MO_32 },
2429        { .fni8 = tcg_gen_smax_i64,
2430          .fniv = tcg_gen_smax_vec,
2431          .fno = gen_helper_gvec_smax64,
2432          .opt_opc = vecop_list,
2433          .vece = MO_64 }
2434    };
2435    tcg_debug_assert(vece <= MO_64);
2436    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2437}
2438
2439void tcg_gen_gvec_umax(unsigned vece, uint32_t dofs, uint32_t aofs,
2440                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2441{
2442    static const TCGOpcode vecop_list[] = { INDEX_op_umax_vec, 0 };
2443    static const GVecGen3 g[4] = {
2444        { .fniv = tcg_gen_umax_vec,
2445          .fno = gen_helper_gvec_umax8,
2446          .opt_opc = vecop_list,
2447          .vece = MO_8 },
2448        { .fniv = tcg_gen_umax_vec,
2449          .fno = gen_helper_gvec_umax16,
2450          .opt_opc = vecop_list,
2451          .vece = MO_16 },
2452        { .fni4 = tcg_gen_umax_i32,
2453          .fniv = tcg_gen_umax_vec,
2454          .fno = gen_helper_gvec_umax32,
2455          .opt_opc = vecop_list,
2456          .vece = MO_32 },
2457        { .fni8 = tcg_gen_umax_i64,
2458          .fniv = tcg_gen_umax_vec,
2459          .fno = gen_helper_gvec_umax64,
2460          .opt_opc = vecop_list,
2461          .vece = MO_64 }
2462    };
2463    tcg_debug_assert(vece <= MO_64);
2464    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2465}
2466
2467/* Perform a vector negation using normal negation and a mask.
2468   Compare gen_subv_mask above.  */
2469static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m)
2470{
2471    TCGv_i64 t2 = tcg_temp_new_i64();
2472    TCGv_i64 t3 = tcg_temp_new_i64();
2473
2474    tcg_gen_andc_i64(t3, m, b);
2475    tcg_gen_andc_i64(t2, b, m);
2476    tcg_gen_sub_i64(d, m, t2);
2477    tcg_gen_xor_i64(d, d, t3);
2478
2479    tcg_temp_free_i64(t2);
2480    tcg_temp_free_i64(t3);
2481}
2482
2483void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 b)
2484{
2485    TCGv_i64 m = tcg_constant_i64(dup_const(MO_8, 0x80));
2486    gen_negv_mask(d, b, m);
2487}
2488
2489void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 b)
2490{
2491    TCGv_i64 m = tcg_constant_i64(dup_const(MO_16, 0x8000));
2492    gen_negv_mask(d, b, m);
2493}
2494
2495void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 b)
2496{
2497    TCGv_i64 t1 = tcg_temp_new_i64();
2498    TCGv_i64 t2 = tcg_temp_new_i64();
2499
2500    tcg_gen_andi_i64(t1, b, ~0xffffffffull);
2501    tcg_gen_neg_i64(t2, b);
2502    tcg_gen_neg_i64(t1, t1);
2503    tcg_gen_deposit_i64(d, t1, t2, 0, 32);
2504
2505    tcg_temp_free_i64(t1);
2506    tcg_temp_free_i64(t2);
2507}
2508
2509void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs,
2510                      uint32_t oprsz, uint32_t maxsz)
2511{
2512    static const TCGOpcode vecop_list[] = { INDEX_op_neg_vec, 0 };
2513    static const GVecGen2 g[4] = {
2514        { .fni8 = tcg_gen_vec_neg8_i64,
2515          .fniv = tcg_gen_neg_vec,
2516          .fno = gen_helper_gvec_neg8,
2517          .opt_opc = vecop_list,
2518          .vece = MO_8 },
2519        { .fni8 = tcg_gen_vec_neg16_i64,
2520          .fniv = tcg_gen_neg_vec,
2521          .fno = gen_helper_gvec_neg16,
2522          .opt_opc = vecop_list,
2523          .vece = MO_16 },
2524        { .fni4 = tcg_gen_neg_i32,
2525          .fniv = tcg_gen_neg_vec,
2526          .fno = gen_helper_gvec_neg32,
2527          .opt_opc = vecop_list,
2528          .vece = MO_32 },
2529        { .fni8 = tcg_gen_neg_i64,
2530          .fniv = tcg_gen_neg_vec,
2531          .fno = gen_helper_gvec_neg64,
2532          .opt_opc = vecop_list,
2533          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2534          .vece = MO_64 },
2535    };
2536
2537    tcg_debug_assert(vece <= MO_64);
2538    tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
2539}
2540
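    /* Perform a per-lane absolute value, for lanes of 8 << VECE bits:
       build a lane-wide -1 for each negative lane, then conditionally
       negate those lanes via complement (xor) and an add of 1.  */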
2541static void gen_absv_mask(TCGv_i64 d, TCGv_i64 b, unsigned vece)
2542{
2543    TCGv_i64 t = tcg_temp_new_i64();
2544    int nbit = 8 << vece;
2545
2546    /* Create -1 for each negative element.  */
2547    tcg_gen_shri_i64(t, b, nbit - 1);
2548    tcg_gen_andi_i64(t, t, dup_const(vece, 1));
2549    tcg_gen_muli_i64(t, t, (1 << nbit) - 1);
2550
2551    /*
2552     * Invert (via xor -1) and add one.
2553     * The xor clears the msb of each negative lane,
2554     * so the increment never carries into the next element.
2555     */
2556    tcg_gen_xor_i64(d, b, t);
2557    tcg_gen_andi_i64(t, t, dup_const(vece, 1));
2558    tcg_gen_add_i64(d, d, t);
2559
2560    tcg_temp_free_i64(t);
2561}
2562
2563static void tcg_gen_vec_abs8_i64(TCGv_i64 d, TCGv_i64 b)
2564{
2565    gen_absv_mask(d, b, MO_8);
2566}
2567
2568static void tcg_gen_vec_abs16_i64(TCGv_i64 d, TCGv_i64 b)
2569{
2570    gen_absv_mask(d, b, MO_16);
2571}
2572
2573void tcg_gen_gvec_abs(unsigned vece, uint32_t dofs, uint32_t aofs,
2574                      uint32_t oprsz, uint32_t maxsz)
2575{
2576    static const TCGOpcode vecop_list[] = { INDEX_op_abs_vec, 0 };
2577    static const GVecGen2 g[4] = {
2578        { .fni8 = tcg_gen_vec_abs8_i64,
2579          .fniv = tcg_gen_abs_vec,
2580          .fno = gen_helper_gvec_abs8,
2581          .opt_opc = vecop_list,
2582          .vece = MO_8 },
2583        { .fni8 = tcg_gen_vec_abs16_i64,
2584          .fniv = tcg_gen_abs_vec,
2585          .fno = gen_helper_gvec_abs16,
2586          .opt_opc = vecop_list,
2587          .vece = MO_16 },
2588        { .fni4 = tcg_gen_abs_i32,
2589          .fniv = tcg_gen_abs_vec,
2590          .fno = gen_helper_gvec_abs32,
2591          .opt_opc = vecop_list,
2592          .vece = MO_32 },
2593        { .fni8 = tcg_gen_abs_i64,
2594          .fniv = tcg_gen_abs_vec,
2595          .fno = gen_helper_gvec_abs64,
2596          .opt_opc = vecop_list,
2597          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2598          .vece = MO_64 },
2599    };
2600
2601    tcg_debug_assert(vece <= MO_64);
2602    tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
2603}
2604
2605void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs,
2606                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2607{
2608    static const GVecGen3 g = {
2609        .fni8 = tcg_gen_and_i64,
2610        .fniv = tcg_gen_and_vec,
2611        .fno = gen_helper_gvec_and,
2612        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2613    };
2614
2615    if (aofs == bofs) {
2616        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2617    } else {
2618        tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2619    }
2620}
2621
2622void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs,
2623                     uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2624{
2625    static const GVecGen3 g = {
2626        .fni8 = tcg_gen_or_i64,
2627        .fniv = tcg_gen_or_vec,
2628        .fno = gen_helper_gvec_or,
2629        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2630    };
2631
2632    if (aofs == bofs) {
2633        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2634    } else {
2635        tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2636    }
2637}
2638
2639void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs,
2640                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2641{
2642    static const GVecGen3 g = {
2643        .fni8 = tcg_gen_xor_i64,
2644        .fniv = tcg_gen_xor_vec,
2645        .fno = gen_helper_gvec_xor,
2646        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2647    };
2648
2649    if (aofs == bofs) {
2650        tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, 0);
2651    } else {
2652        tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2653    }
2654}
2655
2656void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs,
2657                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2658{
2659    static const GVecGen3 g = {
2660        .fni8 = tcg_gen_andc_i64,
2661        .fniv = tcg_gen_andc_vec,
2662        .fno = gen_helper_gvec_andc,
2663        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2664    };
2665
2666    if (aofs == bofs) {
2667        tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, 0);
2668    } else {
2669        tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2670    }
2671}
2672
2673void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs,
2674                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2675{
2676    static const GVecGen3 g = {
2677        .fni8 = tcg_gen_orc_i64,
2678        .fniv = tcg_gen_orc_vec,
2679        .fno = gen_helper_gvec_orc,
2680        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2681    };
2682
2683    if (aofs == bofs) {
2684        tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, -1);
2685    } else {
2686        tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2687    }
2688}
2689
2690void tcg_gen_gvec_nand(unsigned vece, uint32_t dofs, uint32_t aofs,
2691                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2692{
2693    static const GVecGen3 g = {
2694        .fni8 = tcg_gen_nand_i64,
2695        .fniv = tcg_gen_nand_vec,
2696        .fno = gen_helper_gvec_nand,
2697        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2698    };
2699
2700    if (aofs == bofs) {
2701        tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz);
2702    } else {
2703        tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2704    }
2705}
2706
2707void tcg_gen_gvec_nor(unsigned vece, uint32_t dofs, uint32_t aofs,
2708                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2709{
2710    static const GVecGen3 g = {
2711        .fni8 = tcg_gen_nor_i64,
2712        .fniv = tcg_gen_nor_vec,
2713        .fno = gen_helper_gvec_nor,
2714        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2715    };
2716
2717    if (aofs == bofs) {
2718        tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz);
2719    } else {
2720        tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2721    }
2722}
2723
2724void tcg_gen_gvec_eqv(unsigned vece, uint32_t dofs, uint32_t aofs,
2725                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2726{
2727    static const GVecGen3 g = {
2728        .fni8 = tcg_gen_eqv_i64,
2729        .fniv = tcg_gen_eqv_vec,
2730        .fno = gen_helper_gvec_eqv,
2731        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2732    };
2733
2734    if (aofs == bofs) {
2735        tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, -1);
2736    } else {
2737        tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2738    }
2739}
2740
2741static const GVecGen2s gop_ands = {
2742    .fni8 = tcg_gen_and_i64,
2743    .fniv = tcg_gen_and_vec,
2744    .fno = gen_helper_gvec_ands,
2745    .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2746    .vece = MO_64
2747};
2748
2749void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs,
2750                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2751{
2752    TCGv_i64 tmp = tcg_temp_new_i64();
2753    tcg_gen_dup_i64(vece, tmp, c);
2754    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
2755    tcg_temp_free_i64(tmp);
2756}
2757
2758void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs,
2759                       int64_t c, uint32_t oprsz, uint32_t maxsz)
2760{
2761    TCGv_i64 tmp = tcg_constant_i64(dup_const(vece, c));
2762    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
2763}
2764
2765static const GVecGen2s gop_xors = {
2766    .fni8 = tcg_gen_xor_i64,
2767    .fniv = tcg_gen_xor_vec,
2768    .fno = gen_helper_gvec_xors,
2769    .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2770    .vece = MO_64
2771};
2772
2773void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs,
2774                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2775{
2776    TCGv_i64 tmp = tcg_temp_new_i64();
2777    tcg_gen_dup_i64(vece, tmp, c);
2778    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
2779    tcg_temp_free_i64(tmp);
2780}
2781
2782void tcg_gen_gvec_xori(unsigned vece, uint32_t dofs, uint32_t aofs,
2783                       int64_t c, uint32_t oprsz, uint32_t maxsz)
2784{
2785    TCGv_i64 tmp = tcg_constant_i64(dup_const(vece, c));
2786    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
2787}
2788
2789static const GVecGen2s gop_ors = {
2790    .fni8 = tcg_gen_or_i64,
2791    .fniv = tcg_gen_or_vec,
2792    .fno = gen_helper_gvec_ors,
2793    .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2794    .vece = MO_64
2795};
2796
2797void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs,
2798                      TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2799{
2800    TCGv_i64 tmp = tcg_temp_new_i64();
2801    tcg_gen_dup_i64(vece, tmp, c);
2802    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
2803    tcg_temp_free_i64(tmp);
2804}
2805
2806void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs,
2807                      int64_t c, uint32_t oprsz, uint32_t maxsz)
2808{
2809    TCGv_i64 tmp = tcg_constant_i64(dup_const(vece, c));
2810    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
2811}
2812
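    /* Per-lane shifts by an immediate, expanded on an integral type:
       shift the whole word, then mask off the bits that were shifted
       across a lane boundary.  */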
2813void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2814{
2815    uint64_t mask = dup_const(MO_8, 0xff << c);
2816    tcg_gen_shli_i64(d, a, c);
2817    tcg_gen_andi_i64(d, d, mask);
2818}
2819
2820void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2821{
2822    uint64_t mask = dup_const(MO_16, 0xffff << c);
2823    tcg_gen_shli_i64(d, a, c);
2824    tcg_gen_andi_i64(d, d, mask);
2825}
2826
2827void tcg_gen_vec_shl8i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2828{
2829    uint32_t mask = dup_const(MO_8, 0xff << c);
2830    tcg_gen_shli_i32(d, a, c);
2831    tcg_gen_andi_i32(d, d, mask);
2832}
2833
2834void tcg_gen_vec_shl16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2835{
2836    uint32_t mask = dup_const(MO_16, 0xffff << c);
2837    tcg_gen_shli_i32(d, a, c);
2838    tcg_gen_andi_i32(d, d, mask);
2839}
2840
2841void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs,
2842                       int64_t shift, uint32_t oprsz, uint32_t maxsz)
2843{
2844    static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 };
2845    static const GVecGen2i g[4] = {
2846        { .fni8 = tcg_gen_vec_shl8i_i64,
2847          .fniv = tcg_gen_shli_vec,
2848          .fno = gen_helper_gvec_shl8i,
2849          .opt_opc = vecop_list,
2850          .vece = MO_8 },
2851        { .fni8 = tcg_gen_vec_shl16i_i64,
2852          .fniv = tcg_gen_shli_vec,
2853          .fno = gen_helper_gvec_shl16i,
2854          .opt_opc = vecop_list,
2855          .vece = MO_16 },
2856        { .fni4 = tcg_gen_shli_i32,
2857          .fniv = tcg_gen_shli_vec,
2858          .fno = gen_helper_gvec_shl32i,
2859          .opt_opc = vecop_list,
2860          .vece = MO_32 },
2861        { .fni8 = tcg_gen_shli_i64,
2862          .fniv = tcg_gen_shli_vec,
2863          .fno = gen_helper_gvec_shl64i,
2864          .opt_opc = vecop_list,
2865          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2866          .vece = MO_64 },
2867    };
2868
2869    tcg_debug_assert(vece <= MO_64);
2870    tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2871    if (shift == 0) {
2872        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2873    } else {
2874        tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2875    }
2876}
2877
2878void tcg_gen_vec_shr8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2879{
2880    uint64_t mask = dup_const(MO_8, 0xff >> c);
2881    tcg_gen_shri_i64(d, a, c);
2882    tcg_gen_andi_i64(d, d, mask);
2883}
2884
2885void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2886{
2887    uint64_t mask = dup_const(MO_16, 0xffff >> c);
2888    tcg_gen_shri_i64(d, a, c);
2889    tcg_gen_andi_i64(d, d, mask);
2890}
2891
2892void tcg_gen_vec_shr8i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2893{
2894    uint32_t mask = dup_const(MO_8, 0xff >> c);
2895    tcg_gen_shri_i32(d, a, c);
2896    tcg_gen_andi_i32(d, d, mask);
2897}
2898
2899void tcg_gen_vec_shr16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2900{
2901    uint32_t mask = dup_const(MO_16, 0xffff >> c);
2902    tcg_gen_shri_i32(d, a, c);
2903    tcg_gen_andi_i32(d, d, mask);
2904}
2905
2906void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs,
2907                       int64_t shift, uint32_t oprsz, uint32_t maxsz)
2908{
2909    static const TCGOpcode vecop_list[] = { INDEX_op_shri_vec, 0 };
2910    static const GVecGen2i g[4] = {
2911        { .fni8 = tcg_gen_vec_shr8i_i64,
2912          .fniv = tcg_gen_shri_vec,
2913          .fno = gen_helper_gvec_shr8i,
2914          .opt_opc = vecop_list,
2915          .vece = MO_8 },
2916        { .fni8 = tcg_gen_vec_shr16i_i64,
2917          .fniv = tcg_gen_shri_vec,
2918          .fno = gen_helper_gvec_shr16i,
2919          .opt_opc = vecop_list,
2920          .vece = MO_16 },
2921        { .fni4 = tcg_gen_shri_i32,
2922          .fniv = tcg_gen_shri_vec,
2923          .fno = gen_helper_gvec_shr32i,
2924          .opt_opc = vecop_list,
2925          .vece = MO_32 },
2926        { .fni8 = tcg_gen_shri_i64,
2927          .fniv = tcg_gen_shri_vec,
2928          .fno = gen_helper_gvec_shr64i,
2929          .opt_opc = vecop_list,
2930          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2931          .vece = MO_64 },
2932    };
2933
2934    tcg_debug_assert(vece <= MO_64);
2935    tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2936    if (shift == 0) {
2937        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2938    } else {
2939        tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2940    }
2941}
2942
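    /* Per-lane arithmetic right shift on an integral type: do a logical
       right shift of the whole word, isolate each lane's shifted-down
       sign bit, and smear it across the vacated high bits with the
       multiply by (2 << c) - 2.  */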
2943void tcg_gen_vec_sar8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2944{
2945    uint64_t s_mask = dup_const(MO_8, 0x80 >> c);
2946    uint64_t c_mask = dup_const(MO_8, 0xff >> c);
2947    TCGv_i64 s = tcg_temp_new_i64();
2948
2949    tcg_gen_shri_i64(d, a, c);
2950    tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
2951    tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
2952    tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign  */
2953    tcg_gen_or_i64(d, d, s);         /* include sign extension */
2954    tcg_temp_free_i64(s);
2955}
2956
2957void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2958{
2959    uint64_t s_mask = dup_const(MO_16, 0x8000 >> c);
2960    uint64_t c_mask = dup_const(MO_16, 0xffff >> c);
2961    TCGv_i64 s = tcg_temp_new_i64();
2962
2963    tcg_gen_shri_i64(d, a, c);
2964    tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
2965    tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign  */
2966    tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
2967    tcg_gen_or_i64(d, d, s);         /* include sign extension */
2968    tcg_temp_free_i64(s);
2969}
2970
2971void tcg_gen_vec_sar8i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2972{
2973    uint32_t s_mask = dup_const(MO_8, 0x80 >> c);
2974    uint32_t c_mask = dup_const(MO_8, 0xff >> c);
2975    TCGv_i32 s = tcg_temp_new_i32();
2976
2977    tcg_gen_shri_i32(d, a, c);
2978    tcg_gen_andi_i32(s, d, s_mask);  /* isolate (shifted) sign bit */
2979    tcg_gen_muli_i32(s, s, (2 << c) - 2); /* replicate isolated signs */
2980    tcg_gen_andi_i32(d, d, c_mask);  /* clear out bits above sign  */
2981    tcg_gen_or_i32(d, d, s);         /* include sign extension */
2982    tcg_temp_free_i32(s);
2983}
2984
2985void tcg_gen_vec_sar16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2986{
2987    uint32_t s_mask = dup_const(MO_16, 0x8000 >> c);
2988    uint32_t c_mask = dup_const(MO_16, 0xffff >> c);
2989    TCGv_i32 s = tcg_temp_new_i32();
2990
2991    tcg_gen_shri_i32(d, a, c);
2992    tcg_gen_andi_i32(s, d, s_mask);  /* isolate (shifted) sign bit */
2993    tcg_gen_andi_i32(d, d, c_mask);  /* clear out bits above sign  */
2994    tcg_gen_muli_i32(s, s, (2 << c) - 2); /* replicate isolated signs */
2995    tcg_gen_or_i32(d, d, s);         /* include sign extension */
2996    tcg_temp_free_i32(s);
2997}
2998
2999void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs,
3000                       int64_t shift, uint32_t oprsz, uint32_t maxsz)
3001{
3002    static const TCGOpcode vecop_list[] = { INDEX_op_sari_vec, 0 };
3003    static const GVecGen2i g[4] = {
3004        { .fni8 = tcg_gen_vec_sar8i_i64,
3005          .fniv = tcg_gen_sari_vec,
3006          .fno = gen_helper_gvec_sar8i,
3007          .opt_opc = vecop_list,
3008          .vece = MO_8 },
3009        { .fni8 = tcg_gen_vec_sar16i_i64,
3010          .fniv = tcg_gen_sari_vec,
3011          .fno = gen_helper_gvec_sar16i,
3012          .opt_opc = vecop_list,
3013          .vece = MO_16 },
3014        { .fni4 = tcg_gen_sari_i32,
3015          .fniv = tcg_gen_sari_vec,
3016          .fno = gen_helper_gvec_sar32i,
3017          .opt_opc = vecop_list,
3018          .vece = MO_32 },
3019        { .fni8 = tcg_gen_sari_i64,
3020          .fniv = tcg_gen_sari_vec,
3021          .fno = gen_helper_gvec_sar64i,
3022          .opt_opc = vecop_list,
3023          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3024          .vece = MO_64 },
3025    };
3026
3027    tcg_debug_assert(vece <= MO_64);
3028    tcg_debug_assert(shift >= 0 && shift < (8 << vece));
3029    if (shift == 0) {
3030        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
3031    } else {
3032        tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
3033    }
3034}
3035
3036void tcg_gen_vec_rotl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
3037{
3038    uint64_t mask = dup_const(MO_8, 0xff << c);
3039
3040    tcg_gen_shli_i64(d, a, c);
3041    tcg_gen_shri_i64(a, a, 8 - c);
3042    tcg_gen_andi_i64(d, d, mask);
3043    tcg_gen_andi_i64(a, a, ~mask);
3044    tcg_gen_or_i64(d, d, a);
3045}
3046
3047void tcg_gen_vec_rotl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
3048{
3049    uint64_t mask = dup_const(MO_16, 0xffff << c);
3050
3051    tcg_gen_shli_i64(d, a, c);
3052    tcg_gen_shri_i64(a, a, 16 - c);
3053    tcg_gen_andi_i64(d, d, mask);
3054    tcg_gen_andi_i64(a, a, ~mask);
3055    tcg_gen_or_i64(d, d, a);
3056}
3057
3058void tcg_gen_gvec_rotli(unsigned vece, uint32_t dofs, uint32_t aofs,
3059                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
3060{
3061    static const TCGOpcode vecop_list[] = { INDEX_op_rotli_vec, 0 };
3062    static const GVecGen2i g[4] = {
3063        { .fni8 = tcg_gen_vec_rotl8i_i64,
3064          .fniv = tcg_gen_rotli_vec,
3065          .fno = gen_helper_gvec_rotl8i,
3066          .opt_opc = vecop_list,
3067          .vece = MO_8 },
3068        { .fni8 = tcg_gen_vec_rotl16i_i64,
3069          .fniv = tcg_gen_rotli_vec,
3070          .fno = gen_helper_gvec_rotl16i,
3071          .opt_opc = vecop_list,
3072          .vece = MO_16 },
3073        { .fni4 = tcg_gen_rotli_i32,
3074          .fniv = tcg_gen_rotli_vec,
3075          .fno = gen_helper_gvec_rotl32i,
3076          .opt_opc = vecop_list,
3077          .vece = MO_32 },
3078        { .fni8 = tcg_gen_rotli_i64,
3079          .fniv = tcg_gen_rotli_vec,
3080          .fno = gen_helper_gvec_rotl64i,
3081          .opt_opc = vecop_list,
3082          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3083          .vece = MO_64 },
3084    };
3085
3086    tcg_debug_assert(vece <= MO_64);
3087    tcg_debug_assert(shift >= 0 && shift < (8 << vece));
3088    if (shift == 0) {
3089        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
3090    } else {
3091        tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
3092    }
3093}
3094
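    /* Rotate right by a constant is rotate left by the element width
       minus that constant, reduced mod the width so that a zero shift
       stays in range.  */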
3095void tcg_gen_gvec_rotri(unsigned vece, uint32_t dofs, uint32_t aofs,
3096                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
3097{
3098    tcg_debug_assert(vece <= MO_64);
3099    tcg_debug_assert(shift >= 0 && shift < (8 << vece));
3100    tcg_gen_gvec_rotli(vece, dofs, aofs, -shift & ((8 << vece) - 1),
3101                       oprsz, maxsz);
3102}
3103
3104/*
3105 * Specialized generation of vector shifts by a non-constant scalar.
3106 */
3107
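    /*
     * Hooks for expanding a shift by a scalar held in a TCGv_i32:
     * fni4/fni8 are the integral fallbacks, fniv_s uses the backend's
     * shift-by-scalar vector ops (s_list), fniv_v uses its shift-by-vector
     * ops (v_list) after broadcasting the scalar, and fno holds the
     * out-of-line helpers indexed by vece.
     */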
3108typedef struct {
3109    void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32);
3110    void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64);
3111    void (*fniv_s)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32);
3112    void (*fniv_v)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec);
3113    gen_helper_gvec_2 *fno[4];
3114    TCGOpcode s_list[2];
3115    TCGOpcode v_list[2];
3116} GVecGen2sh;
3117
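    /* Expand OPRSZ bytes worth of shifts by an i32 scalar, TYSZ bytes of
       vector TYPE at a time.  */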
3118static void expand_2sh_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
3119                           uint32_t oprsz, uint32_t tysz, TCGType type,
3120                           TCGv_i32 shift,
3121                           void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32))
3122{
3123    TCGv_vec t0 = tcg_temp_new_vec(type);
3124    uint32_t i;
3125
3126    for (i = 0; i < oprsz; i += tysz) {
3127        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
3128        fni(vece, t0, t0, shift);
3129        tcg_gen_st_vec(t0, cpu_env, dofs + i);
3130    }
3131    tcg_temp_free_vec(t0);
3132}
3133
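    /*
     * Expand a shift by a scalar, trying in order: the backend's
     * shift-by-scalar vector ops, its shift-by-vector ops with the scalar
     * broadcast into a vector, an integral expansion for MO_32/MO_64,
     * and finally the out-of-line helper with the shift value passed in
     * the descriptor's data field.
     */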
3134static void
3135do_gvec_shifts(unsigned vece, uint32_t dofs, uint32_t aofs, TCGv_i32 shift,
3136               uint32_t oprsz, uint32_t maxsz, const GVecGen2sh *g)
3137{
3138    TCGType type;
3139    uint32_t some;
3140
3141    check_size_align(oprsz, maxsz, dofs | aofs);
3142    check_overlap_2(dofs, aofs, maxsz);
3143
3144    /* If the backend has a scalar expansion, great.  */
3145    type = choose_vector_type(g->s_list, vece, oprsz, vece == MO_64);
3146    if (type) {
3147        const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);
3148        switch (type) {
3149        case TCG_TYPE_V256:
3150            some = QEMU_ALIGN_DOWN(oprsz, 32);
3151            expand_2sh_vec(vece, dofs, aofs, some, 32,
3152                           TCG_TYPE_V256, shift, g->fniv_s);
3153            if (some == oprsz) {
3154                break;
3155            }
3156            dofs += some;
3157            aofs += some;
3158            oprsz -= some;
3159            maxsz -= some;
3160            /* fallthru */
3161        case TCG_TYPE_V128:
3162            expand_2sh_vec(vece, dofs, aofs, oprsz, 16,
3163                           TCG_TYPE_V128, shift, g->fniv_s);
3164            break;
3165        case TCG_TYPE_V64:
3166            expand_2sh_vec(vece, dofs, aofs, oprsz, 8,
3167                           TCG_TYPE_V64, shift, g->fniv_s);
3168            break;
3169        default:
3170            g_assert_not_reached();
3171        }
3172        tcg_swap_vecop_list(hold_list);
3173        goto clear_tail;
3174    }
3175
3176    /* If the backend supports variable vector shifts, also cool.  */
3177    type = choose_vector_type(g->v_list, vece, oprsz, vece == MO_64);
3178    if (type) {
3179        const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);
3180        TCGv_vec v_shift = tcg_temp_new_vec(type);
3181
3182        if (vece == MO_64) {
3183            TCGv_i64 sh64 = tcg_temp_new_i64();
3184            tcg_gen_extu_i32_i64(sh64, shift);
3185            tcg_gen_dup_i64_vec(MO_64, v_shift, sh64);
3186            tcg_temp_free_i64(sh64);
3187        } else {
3188            tcg_gen_dup_i32_vec(vece, v_shift, shift);
3189        }
3190
3191        switch (type) {
3192        case TCG_TYPE_V256:
3193            some = QEMU_ALIGN_DOWN(oprsz, 32);
3194            expand_2s_vec(vece, dofs, aofs, some, 32, TCG_TYPE_V256,
3195                          v_shift, false, g->fniv_v);
3196            if (some == oprsz) {
3197                break;
3198            }
3199            dofs += some;
3200            aofs += some;
3201            oprsz -= some;
3202            maxsz -= some;
3203            /* fallthru */
3204        case TCG_TYPE_V128:
3205            expand_2s_vec(vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
3206                          v_shift, false, g->fniv_v);
3207            break;
3208        case TCG_TYPE_V64:
3209            expand_2s_vec(vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
3210                          v_shift, false, g->fniv_v);
3211            break;
3212        default:
3213            g_assert_not_reached();
3214        }
3215        tcg_temp_free_vec(v_shift);
3216        tcg_swap_vecop_list(hold_list);
3217        goto clear_tail;
3218    }
3219
3220    /* Otherwise fall back to integral... */
3221    if (vece == MO_32 && check_size_impl(oprsz, 4)) {
3222        expand_2s_i32(dofs, aofs, oprsz, shift, false, g->fni4);
3223    } else if (vece == MO_64 && check_size_impl(oprsz, 8)) {
3224        TCGv_i64 sh64 = tcg_temp_new_i64();
3225        tcg_gen_extu_i32_i64(sh64, shift);
3226        expand_2s_i64(dofs, aofs, oprsz, sh64, false, g->fni8);
3227        tcg_temp_free_i64(sh64);
3228    } else {
3229        TCGv_ptr a0 = tcg_temp_new_ptr();
3230        TCGv_ptr a1 = tcg_temp_new_ptr();
3231        TCGv_i32 desc = tcg_temp_new_i32();
3232
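        /*
         * The out-of-line helpers are the ones used for shifts by
         * immediate; build the descriptor at run time so that the
         * scalar shift count ends up in the descriptor's data field.
         */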
3233        tcg_gen_shli_i32(desc, shift, SIMD_DATA_SHIFT);
3234        tcg_gen_ori_i32(desc, desc, simd_desc(oprsz, maxsz, 0));
3235        tcg_gen_addi_ptr(a0, cpu_env, dofs);
3236        tcg_gen_addi_ptr(a1, cpu_env, aofs);
3237
3238        g->fno[vece](a0, a1, desc);
3239
3240        tcg_temp_free_ptr(a0);
3241        tcg_temp_free_ptr(a1);
3242        tcg_temp_free_i32(desc);
3243        return;
3244    }
3245
3246 clear_tail:
3247    if (oprsz < maxsz) {
3248        expand_clr(dofs + oprsz, maxsz - oprsz);
3249    }
3250}
3251
3252void tcg_gen_gvec_shls(unsigned vece, uint32_t dofs, uint32_t aofs,
3253                       TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3254{
3255    static const GVecGen2sh g = {
3256        .fni4 = tcg_gen_shl_i32,
3257        .fni8 = tcg_gen_shl_i64,
3258        .fniv_s = tcg_gen_shls_vec,
3259        .fniv_v = tcg_gen_shlv_vec,
3260        .fno = {
3261            gen_helper_gvec_shl8i,
3262            gen_helper_gvec_shl16i,
3263            gen_helper_gvec_shl32i,
3264            gen_helper_gvec_shl64i,
3265        },
3266        .s_list = { INDEX_op_shls_vec, 0 },
3267        .v_list = { INDEX_op_shlv_vec, 0 },
3268    };
3269
3270    tcg_debug_assert(vece <= MO_64);
3271    do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3272}
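/*
 * Illustrative use (the offsets, sizes and the TCGv_i32 'count' are
 * hypothetical):
 *
 *     tcg_gen_gvec_shls(MO_16, dofs, aofs, count, 16, 16);
 *
 * shifts each of eight 16-bit elements left by the run-time value in
 * 'count'; as with the immediate forms, the count is presumably
 * expected to be less than the element width.
 */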
3273
3274void tcg_gen_gvec_shrs(unsigned vece, uint32_t dofs, uint32_t aofs,
3275                       TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3276{
3277    static const GVecGen2sh g = {
3278        .fni4 = tcg_gen_shr_i32,
3279        .fni8 = tcg_gen_shr_i64,
3280        .fniv_s = tcg_gen_shrs_vec,
3281        .fniv_v = tcg_gen_shrv_vec,
3282        .fno = {
3283            gen_helper_gvec_shr8i,
3284            gen_helper_gvec_shr16i,
3285            gen_helper_gvec_shr32i,
3286            gen_helper_gvec_shr64i,
3287        },
3288        .s_list = { INDEX_op_shrs_vec, 0 },
3289        .v_list = { INDEX_op_shrv_vec, 0 },
3290    };
3291
3292    tcg_debug_assert(vece <= MO_64);
3293    do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3294}
3295
3296void tcg_gen_gvec_sars(unsigned vece, uint32_t dofs, uint32_t aofs,
3297                       TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3298{
3299    static const GVecGen2sh g = {
3300        .fni4 = tcg_gen_sar_i32,
3301        .fni8 = tcg_gen_sar_i64,
3302        .fniv_s = tcg_gen_sars_vec,
3303        .fniv_v = tcg_gen_sarv_vec,
3304        .fno = {
3305            gen_helper_gvec_sar8i,
3306            gen_helper_gvec_sar16i,
3307            gen_helper_gvec_sar32i,
3308            gen_helper_gvec_sar64i,
3309        },
3310        .s_list = { INDEX_op_sars_vec, 0 },
3311        .v_list = { INDEX_op_sarv_vec, 0 },
3312    };
3313
3314    tcg_debug_assert(vece <= MO_64);
3315    do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3316}
3317
3318void tcg_gen_gvec_rotls(unsigned vece, uint32_t dofs, uint32_t aofs,
3319                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3320{
3321    static const GVecGen2sh g = {
3322        .fni4 = tcg_gen_rotl_i32,
3323        .fni8 = tcg_gen_rotl_i64,
3324        .fniv_s = tcg_gen_rotls_vec,
3325        .fniv_v = tcg_gen_rotlv_vec,
3326        .fno = {
3327            gen_helper_gvec_rotl8i,
3328            gen_helper_gvec_rotl16i,
3329            gen_helper_gvec_rotl32i,
3330            gen_helper_gvec_rotl64i,
3331        },
3332        .s_list = { INDEX_op_rotls_vec, 0 },
3333        .v_list = { INDEX_op_rotlv_vec, 0 },
3334    };
3335
3336    tcg_debug_assert(vece <= MO_64);
3337    do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3338}
3339
3340/*
3341 * Expand D = A << (B % element bits)
3342 *
3343 * Unlike scalar shifts, it is not easy for the target front end to
3344 * include the modulo as part of the expansion, so we do it here.
3345 * If the target naturally includes the modulo as part of the
3346 * operation, great!  If the target has some other behaviour for
3347 * out-of-range shifts, then it could not use this function anyway,
3348 * and would need to do its own expansion with custom functions.
3349 */
3350static void tcg_gen_shlv_mod_vec(unsigned vece, TCGv_vec d,
3351                                 TCGv_vec a, TCGv_vec b)
3352{
3353    TCGv_vec t = tcg_temp_new_vec_matching(d);
3354    TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3355
3356    tcg_gen_and_vec(vece, t, b, m);
3357    tcg_gen_shlv_vec(vece, d, a, t);
3358    tcg_temp_free_vec(t);
3359}
3360
3361static void tcg_gen_shl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3362{
3363    TCGv_i32 t = tcg_temp_new_i32();
3364
3365    tcg_gen_andi_i32(t, b, 31);
3366    tcg_gen_shl_i32(d, a, t);
3367    tcg_temp_free_i32(t);
3368}
3369
3370static void tcg_gen_shl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3371{
3372    TCGv_i64 t = tcg_temp_new_i64();
3373
3374    tcg_gen_andi_i64(t, b, 63);
3375    tcg_gen_shl_i64(d, a, t);
3376    tcg_temp_free_i64(t);
3377}
3378
3379void tcg_gen_gvec_shlv(unsigned vece, uint32_t dofs, uint32_t aofs,
3380                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3381{
3382    static const TCGOpcode vecop_list[] = { INDEX_op_shlv_vec, 0 };
3383    static const GVecGen3 g[4] = {
3384        { .fniv = tcg_gen_shlv_mod_vec,
3385          .fno = gen_helper_gvec_shl8v,
3386          .opt_opc = vecop_list,
3387          .vece = MO_8 },
3388        { .fniv = tcg_gen_shlv_mod_vec,
3389          .fno = gen_helper_gvec_shl16v,
3390          .opt_opc = vecop_list,
3391          .vece = MO_16 },
3392        { .fni4 = tcg_gen_shl_mod_i32,
3393          .fniv = tcg_gen_shlv_mod_vec,
3394          .fno = gen_helper_gvec_shl32v,
3395          .opt_opc = vecop_list,
3396          .vece = MO_32 },
3397        { .fni8 = tcg_gen_shl_mod_i64,
3398          .fniv = tcg_gen_shlv_mod_vec,
3399          .fno = gen_helper_gvec_shl64v,
3400          .opt_opc = vecop_list,
3401          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3402          .vece = MO_64 },
3403    };
3404
3405    tcg_debug_assert(vece <= MO_64);
3406    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3407}
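/*
 * Illustrative use (the offsets and sizes are hypothetical):
 *
 *     tcg_gen_gvec_shlv(MO_32, dofs, aofs, bofs, 16, 16);
 *
 * computes d[i] = a[i] << (b[i] & 31) for each of four 32-bit elements,
 * with the modulo applied as described above.
 */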
3408
3409/*
3410 * Similarly for logical right shifts.
3411 */
3412
3413static void tcg_gen_shrv_mod_vec(unsigned vece, TCGv_vec d,
3414                                 TCGv_vec a, TCGv_vec b)
3415{
3416    TCGv_vec t = tcg_temp_new_vec_matching(d);
3417    TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3418
3419    tcg_gen_and_vec(vece, t, b, m);
3420    tcg_gen_shrv_vec(vece, d, a, t);
3421    tcg_temp_free_vec(t);
3422}
3423
3424static void tcg_gen_shr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3425{
3426    TCGv_i32 t = tcg_temp_new_i32();
3427
3428    tcg_gen_andi_i32(t, b, 31);
3429    tcg_gen_shr_i32(d, a, t);
3430    tcg_temp_free_i32(t);
3431}
3432
3433static void tcg_gen_shr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3434{
3435    TCGv_i64 t = tcg_temp_new_i64();
3436
3437    tcg_gen_andi_i64(t, b, 63);
3438    tcg_gen_shr_i64(d, a, t);
3439    tcg_temp_free_i64(t);
3440}
3441
3442void tcg_gen_gvec_shrv(unsigned vece, uint32_t dofs, uint32_t aofs,
3443                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3444{
3445    static const TCGOpcode vecop_list[] = { INDEX_op_shrv_vec, 0 };
3446    static const GVecGen3 g[4] = {
3447        { .fniv = tcg_gen_shrv_mod_vec,
3448          .fno = gen_helper_gvec_shr8v,
3449          .opt_opc = vecop_list,
3450          .vece = MO_8 },
3451        { .fniv = tcg_gen_shrv_mod_vec,
3452          .fno = gen_helper_gvec_shr16v,
3453          .opt_opc = vecop_list,
3454          .vece = MO_16 },
3455        { .fni4 = tcg_gen_shr_mod_i32,
3456          .fniv = tcg_gen_shrv_mod_vec,
3457          .fno = gen_helper_gvec_shr32v,
3458          .opt_opc = vecop_list,
3459          .vece = MO_32 },
3460        { .fni8 = tcg_gen_shr_mod_i64,
3461          .fniv = tcg_gen_shrv_mod_vec,
3462          .fno = gen_helper_gvec_shr64v,
3463          .opt_opc = vecop_list,
3464          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3465          .vece = MO_64 },
3466    };
3467
3468    tcg_debug_assert(vece <= MO_64);
3469    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3470}
3471
3472/*
3473 * Similarly for arithmetic right shifts.
3474 */
3475
3476static void tcg_gen_sarv_mod_vec(unsigned vece, TCGv_vec d,
3477                                 TCGv_vec a, TCGv_vec b)
3478{
3479    TCGv_vec t = tcg_temp_new_vec_matching(d);
3480    TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3481
3482    tcg_gen_and_vec(vece, t, b, m);
3483    tcg_gen_sarv_vec(vece, d, a, t);
3484    tcg_temp_free_vec(t);
3485}
3486
3487static void tcg_gen_sar_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3488{
3489    TCGv_i32 t = tcg_temp_new_i32();
3490
3491    tcg_gen_andi_i32(t, b, 31);
3492    tcg_gen_sar_i32(d, a, t);
3493    tcg_temp_free_i32(t);
3494}
3495
3496static void tcg_gen_sar_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3497{
3498    TCGv_i64 t = tcg_temp_new_i64();
3499
3500    tcg_gen_andi_i64(t, b, 63);
3501    tcg_gen_sar_i64(d, a, t);
3502    tcg_temp_free_i64(t);
3503}
3504
3505void tcg_gen_gvec_sarv(unsigned vece, uint32_t dofs, uint32_t aofs,
3506                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3507{
3508    static const TCGOpcode vecop_list[] = { INDEX_op_sarv_vec, 0 };
3509    static const GVecGen3 g[4] = {
3510        { .fniv = tcg_gen_sarv_mod_vec,
3511          .fno = gen_helper_gvec_sar8v,
3512          .opt_opc = vecop_list,
3513          .vece = MO_8 },
3514        { .fniv = tcg_gen_sarv_mod_vec,
3515          .fno = gen_helper_gvec_sar16v,
3516          .opt_opc = vecop_list,
3517          .vece = MO_16 },
3518        { .fni4 = tcg_gen_sar_mod_i32,
3519          .fniv = tcg_gen_sarv_mod_vec,
3520          .fno = gen_helper_gvec_sar32v,
3521          .opt_opc = vecop_list,
3522          .vece = MO_32 },
3523        { .fni8 = tcg_gen_sar_mod_i64,
3524          .fniv = tcg_gen_sarv_mod_vec,
3525          .fno = gen_helper_gvec_sar64v,
3526          .opt_opc = vecop_list,
3527          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3528          .vece = MO_64 },
3529    };
3530
3531    tcg_debug_assert(vece <= MO_64);
3532    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3533}
3534
3535/*
3536 * Similarly for left rotates.
3537 */
3538
3539static void tcg_gen_rotlv_mod_vec(unsigned vece, TCGv_vec d,
3540                                  TCGv_vec a, TCGv_vec b)
3541{
3542    TCGv_vec t = tcg_temp_new_vec_matching(d);
3543    TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3544
3545    tcg_gen_and_vec(vece, t, b, m);
3546    tcg_gen_rotlv_vec(vece, d, a, t);
3547    tcg_temp_free_vec(t);
3548}
3549
3550static void tcg_gen_rotl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3551{
3552    TCGv_i32 t = tcg_temp_new_i32();
3553
3554    tcg_gen_andi_i32(t, b, 31);
3555    tcg_gen_rotl_i32(d, a, t);
3556    tcg_temp_free_i32(t);
3557}
3558
3559static void tcg_gen_rotl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3560{
3561    TCGv_i64 t = tcg_temp_new_i64();
3562
3563    tcg_gen_andi_i64(t, b, 63);
3564    tcg_gen_rotl_i64(d, a, t);
3565    tcg_temp_free_i64(t);
3566}
3567
3568void tcg_gen_gvec_rotlv(unsigned vece, uint32_t dofs, uint32_t aofs,
3569                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3570{
3571    static const TCGOpcode vecop_list[] = { INDEX_op_rotlv_vec, 0 };
3572    static const GVecGen3 g[4] = {
3573        { .fniv = tcg_gen_rotlv_mod_vec,
3574          .fno = gen_helper_gvec_rotl8v,
3575          .opt_opc = vecop_list,
3576          .vece = MO_8 },
3577        { .fniv = tcg_gen_rotlv_mod_vec,
3578          .fno = gen_helper_gvec_rotl16v,
3579          .opt_opc = vecop_list,
3580          .vece = MO_16 },
3581        { .fni4 = tcg_gen_rotl_mod_i32,
3582          .fniv = tcg_gen_rotlv_mod_vec,
3583          .fno = gen_helper_gvec_rotl32v,
3584          .opt_opc = vecop_list,
3585          .vece = MO_32 },
3586        { .fni8 = tcg_gen_rotl_mod_i64,
3587          .fniv = tcg_gen_rotlv_mod_vec,
3588          .fno = gen_helper_gvec_rotl64v,
3589          .opt_opc = vecop_list,
3590          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3591          .vece = MO_64 },
3592    };
3593
3594    tcg_debug_assert(vece <= MO_64);
3595    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3596}
3597
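/*
 * Similarly for right rotates.
 */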
3598static void tcg_gen_rotrv_mod_vec(unsigned vece, TCGv_vec d,
3599                                  TCGv_vec a, TCGv_vec b)
3600{
3601    TCGv_vec t = tcg_temp_new_vec_matching(d);
3602    TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3603
3604    tcg_gen_and_vec(vece, t, b, m);
3605    tcg_gen_rotrv_vec(vece, d, a, t);
3606    tcg_temp_free_vec(t);
3607}
3608
3609static void tcg_gen_rotr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3610{
3611    TCGv_i32 t = tcg_temp_new_i32();
3612
3613    tcg_gen_andi_i32(t, b, 31);
3614    tcg_gen_rotr_i32(d, a, t);
3615    tcg_temp_free_i32(t);
3616}
3617
3618static void tcg_gen_rotr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3619{
3620    TCGv_i64 t = tcg_temp_new_i64();
3621
3622    tcg_gen_andi_i64(t, b, 63);
3623    tcg_gen_rotr_i64(d, a, t);
3624    tcg_temp_free_i64(t);
3625}
3626
3627void tcg_gen_gvec_rotrv(unsigned vece, uint32_t dofs, uint32_t aofs,
3628                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3629{
3630    static const TCGOpcode vecop_list[] = { INDEX_op_rotrv_vec, 0 };
3631    static const GVecGen3 g[4] = {
3632        { .fniv = tcg_gen_rotrv_mod_vec,
3633          .fno = gen_helper_gvec_rotr8v,
3634          .opt_opc = vecop_list,
3635          .vece = MO_8 },
3636        { .fniv = tcg_gen_rotrv_mod_vec,
3637          .fno = gen_helper_gvec_rotr16v,
3638          .opt_opc = vecop_list,
3639          .vece = MO_16 },
3640        { .fni4 = tcg_gen_rotr_mod_i32,
3641          .fniv = tcg_gen_rotrv_mod_vec,
3642          .fno = gen_helper_gvec_rotr32v,
3643          .opt_opc = vecop_list,
3644          .vece = MO_32 },
3645        { .fni8 = tcg_gen_rotr_mod_i64,
3646          .fniv = tcg_gen_rotrv_mod_vec,
3647          .fno = gen_helper_gvec_rotr64v,
3648          .opt_opc = vecop_list,
3649          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3650          .vece = MO_64 },
3651    };
3652
3653    tcg_debug_assert(vece <= MO_64);
3654    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3655}
3656
3657/* Expand OPRSZ bytes worth of three-operand comparisons using i32 elements.  */
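/*
 * A true comparison yields -1 (all bits set in the element) and a false
 * one yields 0, hence setcond followed by neg.
 */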
3658static void expand_cmp_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
3659                           uint32_t oprsz, TCGCond cond)
3660{
3661    TCGv_i32 t0 = tcg_temp_new_i32();
3662    TCGv_i32 t1 = tcg_temp_new_i32();
3663    uint32_t i;
3664
3665    for (i = 0; i < oprsz; i += 4) {
3666        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
3667        tcg_gen_ld_i32(t1, cpu_env, bofs + i);
3668        tcg_gen_setcond_i32(cond, t0, t0, t1);
3669        tcg_gen_neg_i32(t0, t0);
3670        tcg_gen_st_i32(t0, cpu_env, dofs + i);
3671    }
3672    tcg_temp_free_i32(t1);
3673    tcg_temp_free_i32(t0);
3674}
3675
3676static void expand_cmp_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
3677                           uint32_t oprsz, TCGCond cond)
3678{
3679    TCGv_i64 t0 = tcg_temp_new_i64();
3680    TCGv_i64 t1 = tcg_temp_new_i64();
3681    uint32_t i;
3682
3683    for (i = 0; i < oprsz; i += 8) {
3684        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
3685        tcg_gen_ld_i64(t1, cpu_env, bofs + i);
3686        tcg_gen_setcond_i64(cond, t0, t0, t1);
3687        tcg_gen_neg_i64(t0, t0);
3688        tcg_gen_st_i64(t0, cpu_env, dofs + i);
3689    }
3690    tcg_temp_free_i64(t1);
3691    tcg_temp_free_i64(t0);
3692}
3693
3694static void expand_cmp_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
3695                           uint32_t bofs, uint32_t oprsz, uint32_t tysz,
3696                           TCGType type, TCGCond cond)
3697{
3698    TCGv_vec t0 = tcg_temp_new_vec(type);
3699    TCGv_vec t1 = tcg_temp_new_vec(type);
3700    uint32_t i;
3701
3702    for (i = 0; i < oprsz; i += tysz) {
3703        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
3704        tcg_gen_ld_vec(t1, cpu_env, bofs + i);
3705        tcg_gen_cmp_vec(cond, vece, t0, t0, t1);
3706        tcg_gen_st_vec(t0, cpu_env, dofs + i);
3707    }
3708    tcg_temp_free_vec(t1);
3709    tcg_temp_free_vec(t0);
3710}
3711
3712void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs,
3713                      uint32_t aofs, uint32_t bofs,
3714                      uint32_t oprsz, uint32_t maxsz)
3715{
3716    static const TCGOpcode cmp_list[] = { INDEX_op_cmp_vec, 0 };
3717    static gen_helper_gvec_3 * const eq_fn[4] = {
3718        gen_helper_gvec_eq8, gen_helper_gvec_eq16,
3719        gen_helper_gvec_eq32, gen_helper_gvec_eq64
3720    };
3721    static gen_helper_gvec_3 * const ne_fn[4] = {
3722        gen_helper_gvec_ne8, gen_helper_gvec_ne16,
3723        gen_helper_gvec_ne32, gen_helper_gvec_ne64
3724    };
3725    static gen_helper_gvec_3 * const lt_fn[4] = {
3726        gen_helper_gvec_lt8, gen_helper_gvec_lt16,
3727        gen_helper_gvec_lt32, gen_helper_gvec_lt64
3728    };
3729    static gen_helper_gvec_3 * const le_fn[4] = {
3730        gen_helper_gvec_le8, gen_helper_gvec_le16,
3731        gen_helper_gvec_le32, gen_helper_gvec_le64
3732    };
3733    static gen_helper_gvec_3 * const ltu_fn[4] = {
3734        gen_helper_gvec_ltu8, gen_helper_gvec_ltu16,
3735        gen_helper_gvec_ltu32, gen_helper_gvec_ltu64
3736    };
3737    static gen_helper_gvec_3 * const leu_fn[4] = {
3738        gen_helper_gvec_leu8, gen_helper_gvec_leu16,
3739        gen_helper_gvec_leu32, gen_helper_gvec_leu64
3740    };
3741    static gen_helper_gvec_3 * const * const fns[16] = {
3742        [TCG_COND_EQ] = eq_fn,
3743        [TCG_COND_NE] = ne_fn,
3744        [TCG_COND_LT] = lt_fn,
3745        [TCG_COND_LE] = le_fn,
3746        [TCG_COND_LTU] = ltu_fn,
3747        [TCG_COND_LEU] = leu_fn,
3748    };
3749
3750    const TCGOpcode *hold_list;
3751    TCGType type;
3752    uint32_t some;
3753
3754    check_size_align(oprsz, maxsz, dofs | aofs | bofs);
3755    check_overlap_3(dofs, aofs, bofs, maxsz);
3756
3757    if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) {
3758        do_dup(MO_8, dofs, oprsz, maxsz,
3759               NULL, NULL, -(cond == TCG_COND_ALWAYS));
3760        return;
3761    }
3762
3763    /*
3764     * Implement inline with a vector type, if possible.
3765     * Prefer integer when 64-bit host and 64-bit comparison.
3766     */
3767    hold_list = tcg_swap_vecop_list(cmp_list);
3768    type = choose_vector_type(cmp_list, vece, oprsz,
3769                              TCG_TARGET_REG_BITS == 64 && vece == MO_64);
3770    switch (type) {
3771    case TCG_TYPE_V256:
3772        /*
3773         * Recall that ARM SVE allows vector sizes that are not a power of 2,
3774         * but always a multiple of 16; e.g. size == 80 expands as 2x32 + 1x16.
3775         */
3776        some = QEMU_ALIGN_DOWN(oprsz, 32);
3777        expand_cmp_vec(vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, cond);
3778        if (some == oprsz) {
3779            break;
3780        }
3781        dofs += some;
3782        aofs += some;
3783        bofs += some;
3784        oprsz -= some;
3785        maxsz -= some;
3786        /* fallthru */
3787    case TCG_TYPE_V128:
3788        expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, cond);
3789        break;
3790    case TCG_TYPE_V64:
3791        expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, cond);
3792        break;
3793
3794    case 0:
3795        if (vece == MO_64 && check_size_impl(oprsz, 8)) {
3796            expand_cmp_i64(dofs, aofs, bofs, oprsz, cond);
3797        } else if (vece == MO_32 && check_size_impl(oprsz, 4)) {
3798            expand_cmp_i32(dofs, aofs, bofs, oprsz, cond);
3799        } else {
3800            gen_helper_gvec_3 * const *fn = fns[cond];
3801
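            /*
             * Only EQ, NE, LT, LE, LTU and LEU have out-of-line helpers;
             * the remaining orderings are handled by swapping the operands
             * and the condition, e.g. GT(a, b) -> LT(b, a).
             */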
3802            if (fn == NULL) {
3803                uint32_t tmp;
3804                tmp = aofs, aofs = bofs, bofs = tmp;
3805                cond = tcg_swap_cond(cond);
3806                fn = fns[cond];
3807                assert(fn != NULL);
3808            }
3809            tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, 0, fn[vece]);
3810            oprsz = maxsz;
3811        }
3812        break;
3813
3814    default:
3815        g_assert_not_reached();
3816    }
3817    tcg_swap_vecop_list(hold_list);
3818
3819    if (oprsz < maxsz) {
3820        expand_clr(dofs + oprsz, maxsz - oprsz);
3821    }
3822}
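/*
 * Illustrative use (the offsets and sizes are hypothetical):
 *
 *     tcg_gen_gvec_cmp(TCG_COND_GTU, MO_8, dofs, aofs, bofs, 16, 16);
 *
 * sets each byte of D to 0xff where the corresponding byte of A is
 * unsigned-greater than that of B, and to 0 otherwise.
 */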
3823
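/* Compute d = (b & a) | (c & ~a): select bits from b where a is set, else from c.  */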
3824static void tcg_gen_bitsel_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 c)
3825{
3826    TCGv_i64 t = tcg_temp_new_i64();
3827
3828    tcg_gen_and_i64(t, b, a);
3829    tcg_gen_andc_i64(d, c, a);
3830    tcg_gen_or_i64(d, d, t);
3831    tcg_temp_free_i64(t);
3832}
3833
3834void tcg_gen_gvec_bitsel(unsigned vece, uint32_t dofs, uint32_t aofs,
3835                         uint32_t bofs, uint32_t cofs,
3836                         uint32_t oprsz, uint32_t maxsz)
3837{
3838    static const GVecGen4 g = {
3839        .fni8 = tcg_gen_bitsel_i64,
3840        .fniv = tcg_gen_bitsel_vec,
3841        .fno = gen_helper_gvec_bitsel,
3842    };
3843
3844    tcg_gen_gvec_4(dofs, aofs, bofs, cofs, oprsz, maxsz, &g);
3845}
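/*
 * Illustrative use (the offsets and sizes are hypothetical):
 *
 *     tcg_gen_gvec_bitsel(MO_64, dofs, mofs, tofs, fofs, 16, 16);
 *
 * computes d = (t & m) | (f & ~m) bitwise across the whole vector;
 * since the operation is purely bitwise, the result does not depend
 * on vece.
 */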
3846