qemu/tcg/tcg-op-gvec.c
   1/*
   2 * Generic vector operation expansion
   3 *
   4 * Copyright (c) 2018 Linaro
   5 *
   6 * This library is free software; you can redistribute it and/or
   7 * modify it under the terms of the GNU Lesser General Public
   8 * License as published by the Free Software Foundation; either
   9 * version 2.1 of the License, or (at your option) any later version.
  10 *
  11 * This library is distributed in the hope that it will be useful,
  12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14 * Lesser General Public License for more details.
  15 *
  16 * You should have received a copy of the GNU Lesser General Public
  17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
  18 */
  19
  20#include "qemu/osdep.h"
  21#include "tcg/tcg.h"
  22#include "tcg/tcg-op.h"
  23#include "tcg/tcg-op-gvec.h"
  24#include "qemu/main-loop.h"
  25#include "tcg/tcg-gvec-desc.h"
  26
  27#define MAX_UNROLL  4
  28
  29#ifdef CONFIG_DEBUG_TCG
  30static const TCGOpcode vecop_list_empty[1] = { 0 };
  31#else
  32#define vecop_list_empty NULL
  33#endif
  34
  35
  36/* Verify vector size and alignment rules.  OFS should be the OR of all
  37   of the operand offsets so that we can check them all at once.  */
  38static void check_size_align(uint32_t oprsz, uint32_t maxsz, uint32_t ofs)
  39{
  40    uint32_t max_align;
  41
  42    switch (oprsz) {
  43    case 8:
  44    case 16:
  45    case 32:
  46        tcg_debug_assert(oprsz <= maxsz);
  47        break;
  48    default:
  49        tcg_debug_assert(oprsz == maxsz);
  50        break;
  51    }
  52    tcg_debug_assert(maxsz <= (8 << SIMD_MAXSZ_BITS));
  53
  54    max_align = maxsz >= 16 ? 15 : 7;
  55    tcg_debug_assert((maxsz & max_align) == 0);
  56    tcg_debug_assert((ofs & max_align) == 0);
  57}
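/*
 * For example, oprsz == maxsz == 80 passes: 80 is a multiple of 16, so any
 * operand offsets that are themselves 16-byte aligned satisfy the check.
 * oprsz == maxsz == 24 would assert, because for maxsz >= 16 the size must
 * be a multiple of 16.
 */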
  58
  59/* Verify vector overlap rules for two operands.  */
  60static void check_overlap_2(uint32_t d, uint32_t a, uint32_t s)
  61{
  62    tcg_debug_assert(d == a || d + s <= a || a + s <= d);
  63}
  64
  65/* Verify vector overlap rules for three operands.  */
  66static void check_overlap_3(uint32_t d, uint32_t a, uint32_t b, uint32_t s)
  67{
  68    check_overlap_2(d, a, s);
  69    check_overlap_2(d, b, s);
  70    check_overlap_2(a, b, s);
  71}
  72
  73/* Verify vector overlap rules for four operands.  */
  74static void check_overlap_4(uint32_t d, uint32_t a, uint32_t b,
  75                            uint32_t c, uint32_t s)
  76{
  77    check_overlap_2(d, a, s);
  78    check_overlap_2(d, b, s);
  79    check_overlap_2(d, c, s);
  80    check_overlap_2(a, b, s);
  81    check_overlap_2(a, c, s);
  82    check_overlap_2(b, c, s);
  83}
  84
  85/* Create a descriptor from components.  */
  86uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data)
  87{
  88    uint32_t desc = 0;
  89
  90    check_size_align(oprsz, maxsz, 0);
  91    tcg_debug_assert(data == sextract32(data, 0, SIMD_DATA_BITS));
  92
  93    oprsz = (oprsz / 8) - 1;
  94    maxsz = (maxsz / 8) - 1;
  95
  96    /*
  97     * We have just asserted in check_size_align that either
  98     * oprsz is {8,16,32} or matches maxsz.  Encode the final
  99     * case with '2', as that would otherwise map to 24.
 100     */
 101    if (oprsz == maxsz) {
 102        oprsz = 2;
 103    }
 104
 105    desc = deposit32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS, oprsz);
 106    desc = deposit32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS, maxsz);
 107    desc = deposit32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS, data);
 108
 109    return desc;
 110}
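/*
 * A worked example of the encoding above: simd_desc(16, 64, 9) stores
 * 16 / 8 - 1 = 1 in the oprsz field, 64 / 8 - 1 = 7 in the maxsz field,
 * and 9 in the data field.  simd_desc(80, 80, 0) reduces both sizes to
 * 80 / 8 - 1 = 9 and then replaces the oprsz encoding with 2, which is
 * otherwise unused because an operation size of 24 never passes
 * check_size_align.
 */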
 111
 112/* Generate a call to a gvec-style helper with two vector operands.  */
 113void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
 114                        uint32_t oprsz, uint32_t maxsz, int32_t data,
 115                        gen_helper_gvec_2 *fn)
 116{
 117    TCGv_ptr a0, a1;
 118    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
 119
 120    a0 = tcg_temp_new_ptr();
 121    a1 = tcg_temp_new_ptr();
 122
 123    tcg_gen_addi_ptr(a0, cpu_env, dofs);
 124    tcg_gen_addi_ptr(a1, cpu_env, aofs);
 125
 126    fn(a0, a1, desc);
 127
 128    tcg_temp_free_ptr(a0);
 129    tcg_temp_free_ptr(a1);
 130}
 131
 132/* Generate a call to a gvec-style helper with two vector operands
 133   and one scalar operand.  */
 134void tcg_gen_gvec_2i_ool(uint32_t dofs, uint32_t aofs, TCGv_i64 c,
 135                         uint32_t oprsz, uint32_t maxsz, int32_t data,
 136                         gen_helper_gvec_2i *fn)
 137{
 138    TCGv_ptr a0, a1;
 139    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
 140
 141    a0 = tcg_temp_new_ptr();
 142    a1 = tcg_temp_new_ptr();
 143
 144    tcg_gen_addi_ptr(a0, cpu_env, dofs);
 145    tcg_gen_addi_ptr(a1, cpu_env, aofs);
 146
 147    fn(a0, a1, c, desc);
 148
 149    tcg_temp_free_ptr(a0);
 150    tcg_temp_free_ptr(a1);
 151}
 152
 153/* Generate a call to a gvec-style helper with three vector operands.  */
 154void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 155                        uint32_t oprsz, uint32_t maxsz, int32_t data,
 156                        gen_helper_gvec_3 *fn)
 157{
 158    TCGv_ptr a0, a1, a2;
 159    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
 160
 161    a0 = tcg_temp_new_ptr();
 162    a1 = tcg_temp_new_ptr();
 163    a2 = tcg_temp_new_ptr();
 164
 165    tcg_gen_addi_ptr(a0, cpu_env, dofs);
 166    tcg_gen_addi_ptr(a1, cpu_env, aofs);
 167    tcg_gen_addi_ptr(a2, cpu_env, bofs);
 168
 169    fn(a0, a1, a2, desc);
 170
 171    tcg_temp_free_ptr(a0);
 172    tcg_temp_free_ptr(a1);
 173    tcg_temp_free_ptr(a2);
 174}
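/*
 * A sketch of a typical call, with a hypothetical helper name: a target
 * computing d = f(a, b) over 32 bytes of a 64-byte register might emit
 *
 *     tcg_gen_gvec_3_ool(dofs, aofs, bofs, 32, 64, 0, gen_helper_foo);
 *
 * The helper then receives env-relative pointers for dofs, aofs and bofs
 * plus the descriptor built by simd_desc(32, 64, 0).
 */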
 175
 176/* Generate a call to a gvec-style helper with four vector operands.  */
 177void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 178                        uint32_t cofs, uint32_t oprsz, uint32_t maxsz,
 179                        int32_t data, gen_helper_gvec_4 *fn)
 180{
 181    TCGv_ptr a0, a1, a2, a3;
 182    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
 183
 184    a0 = tcg_temp_new_ptr();
 185    a1 = tcg_temp_new_ptr();
 186    a2 = tcg_temp_new_ptr();
 187    a3 = tcg_temp_new_ptr();
 188
 189    tcg_gen_addi_ptr(a0, cpu_env, dofs);
 190    tcg_gen_addi_ptr(a1, cpu_env, aofs);
 191    tcg_gen_addi_ptr(a2, cpu_env, bofs);
 192    tcg_gen_addi_ptr(a3, cpu_env, cofs);
 193
 194    fn(a0, a1, a2, a3, desc);
 195
 196    tcg_temp_free_ptr(a0);
 197    tcg_temp_free_ptr(a1);
 198    tcg_temp_free_ptr(a2);
 199    tcg_temp_free_ptr(a3);
 200}
 201
 202/* Generate a call to a gvec-style helper with five vector operands.  */
 203void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 204                        uint32_t cofs, uint32_t xofs, uint32_t oprsz,
 205                        uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn)
 206{
 207    TCGv_ptr a0, a1, a2, a3, a4;
 208    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
 209
 210    a0 = tcg_temp_new_ptr();
 211    a1 = tcg_temp_new_ptr();
 212    a2 = tcg_temp_new_ptr();
 213    a3 = tcg_temp_new_ptr();
 214    a4 = tcg_temp_new_ptr();
 215
 216    tcg_gen_addi_ptr(a0, cpu_env, dofs);
 217    tcg_gen_addi_ptr(a1, cpu_env, aofs);
 218    tcg_gen_addi_ptr(a2, cpu_env, bofs);
 219    tcg_gen_addi_ptr(a3, cpu_env, cofs);
 220    tcg_gen_addi_ptr(a4, cpu_env, xofs);
 221
 222    fn(a0, a1, a2, a3, a4, desc);
 223
 224    tcg_temp_free_ptr(a0);
 225    tcg_temp_free_ptr(a1);
 226    tcg_temp_free_ptr(a2);
 227    tcg_temp_free_ptr(a3);
 228    tcg_temp_free_ptr(a4);
 229}
 230
  231/* Generate a call to a gvec-style helper with two vector operands
  232   and an extra pointer operand.  */
 233void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
 234                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
 235                        int32_t data, gen_helper_gvec_2_ptr *fn)
 236{
 237    TCGv_ptr a0, a1;
 238    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
 239
 240    a0 = tcg_temp_new_ptr();
 241    a1 = tcg_temp_new_ptr();
 242
 243    tcg_gen_addi_ptr(a0, cpu_env, dofs);
 244    tcg_gen_addi_ptr(a1, cpu_env, aofs);
 245
 246    fn(a0, a1, ptr, desc);
 247
 248    tcg_temp_free_ptr(a0);
 249    tcg_temp_free_ptr(a1);
 250}
 251
 252/* Generate a call to a gvec-style helper with three vector operands
 253   and an extra pointer operand.  */
 254void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 255                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
 256                        int32_t data, gen_helper_gvec_3_ptr *fn)
 257{
 258    TCGv_ptr a0, a1, a2;
 259    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
 260
 261    a0 = tcg_temp_new_ptr();
 262    a1 = tcg_temp_new_ptr();
 263    a2 = tcg_temp_new_ptr();
 264
 265    tcg_gen_addi_ptr(a0, cpu_env, dofs);
 266    tcg_gen_addi_ptr(a1, cpu_env, aofs);
 267    tcg_gen_addi_ptr(a2, cpu_env, bofs);
 268
 269    fn(a0, a1, a2, ptr, desc);
 270
 271    tcg_temp_free_ptr(a0);
 272    tcg_temp_free_ptr(a1);
 273    tcg_temp_free_ptr(a2);
 274}
 275
 276/* Generate a call to a gvec-style helper with four vector operands
 277   and an extra pointer operand.  */
 278void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 279                        uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz,
 280                        uint32_t maxsz, int32_t data,
 281                        gen_helper_gvec_4_ptr *fn)
 282{
 283    TCGv_ptr a0, a1, a2, a3;
 284    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
 285
 286    a0 = tcg_temp_new_ptr();
 287    a1 = tcg_temp_new_ptr();
 288    a2 = tcg_temp_new_ptr();
 289    a3 = tcg_temp_new_ptr();
 290
 291    tcg_gen_addi_ptr(a0, cpu_env, dofs);
 292    tcg_gen_addi_ptr(a1, cpu_env, aofs);
 293    tcg_gen_addi_ptr(a2, cpu_env, bofs);
 294    tcg_gen_addi_ptr(a3, cpu_env, cofs);
 295
 296    fn(a0, a1, a2, a3, ptr, desc);
 297
 298    tcg_temp_free_ptr(a0);
 299    tcg_temp_free_ptr(a1);
 300    tcg_temp_free_ptr(a2);
 301    tcg_temp_free_ptr(a3);
 302}
 303
 304/* Generate a call to a gvec-style helper with five vector operands
 305   and an extra pointer operand.  */
 306void tcg_gen_gvec_5_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 307                        uint32_t cofs, uint32_t eofs, TCGv_ptr ptr,
 308                        uint32_t oprsz, uint32_t maxsz, int32_t data,
 309                        gen_helper_gvec_5_ptr *fn)
 310{
 311    TCGv_ptr a0, a1, a2, a3, a4;
 312    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
 313
 314    a0 = tcg_temp_new_ptr();
 315    a1 = tcg_temp_new_ptr();
 316    a2 = tcg_temp_new_ptr();
 317    a3 = tcg_temp_new_ptr();
 318    a4 = tcg_temp_new_ptr();
 319
 320    tcg_gen_addi_ptr(a0, cpu_env, dofs);
 321    tcg_gen_addi_ptr(a1, cpu_env, aofs);
 322    tcg_gen_addi_ptr(a2, cpu_env, bofs);
 323    tcg_gen_addi_ptr(a3, cpu_env, cofs);
 324    tcg_gen_addi_ptr(a4, cpu_env, eofs);
 325
 326    fn(a0, a1, a2, a3, a4, ptr, desc);
 327
 328    tcg_temp_free_ptr(a0);
 329    tcg_temp_free_ptr(a1);
 330    tcg_temp_free_ptr(a2);
 331    tcg_temp_free_ptr(a3);
 332    tcg_temp_free_ptr(a4);
 333}
 334
 335/* Return true if we want to implement something of OPRSZ bytes
 336   in units of LNSZ.  This limits the expansion of inline code.  */
 337static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
 338{
 339    uint32_t q, r;
 340
 341    if (oprsz < lnsz) {
 342        return false;
 343    }
 344
 345    q = oprsz / lnsz;
 346    r = oprsz % lnsz;
 347    tcg_debug_assert((r & 7) == 0);
 348
 349    if (lnsz < 16) {
 350        /* For sizes below 16, accept no remainder. */
 351        if (r != 0) {
 352            return false;
 353        }
 354    } else {
 355        /*
 356         * Recall that ARM SVE allows vector sizes that are not a
 357         * power of 2, but always a multiple of 16.  The intent is
 358         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
 359         * In addition, expand_clr needs to handle a multiple of 8.
 360         * Thus we can handle the tail with one more operation per
 361         * diminishing power of 2.
 362         */
 363        q += ctpop32(r);
 364    }
 365
 366    return q <= MAX_UNROLL;
 367}
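/*
 * For example, with MAX_UNROLL == 4: check_size_impl(80, 32) gives
 * q = 2, r = 16, then q += ctpop32(16) = 1, so 3 <= 4 and we expand
 * inline; check_size_impl(80, 16) gives q = 5, r = 0, and 5 > 4, so
 * we do not.
 */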
 368
 369static void expand_clr(uint32_t dofs, uint32_t maxsz);
 370
 371/* Duplicate C as per VECE.  */
 372uint64_t (dup_const)(unsigned vece, uint64_t c)
 373{
 374    switch (vece) {
 375    case MO_8:
 376        return 0x0101010101010101ull * (uint8_t)c;
 377    case MO_16:
 378        return 0x0001000100010001ull * (uint16_t)c;
 379    case MO_32:
 380        return 0x0000000100000001ull * (uint32_t)c;
 381    case MO_64:
 382        return c;
 383    default:
 384        g_assert_not_reached();
 385    }
 386}
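/*
 * For example, dup_const(MO_16, 0xdead1234) truncates the constant to
 * 0x1234 and returns 0x1234123412341234, while dup_const(MO_8, 0x7f)
 * returns 0x7f7f7f7f7f7f7f7f.
 */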
 387
 388/* Duplicate IN into OUT as per VECE.  */
 389void tcg_gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in)
 390{
 391    switch (vece) {
 392    case MO_8:
 393        tcg_gen_ext8u_i32(out, in);
 394        tcg_gen_muli_i32(out, out, 0x01010101);
 395        break;
 396    case MO_16:
 397        tcg_gen_deposit_i32(out, in, in, 16, 16);
 398        break;
 399    case MO_32:
 400        tcg_gen_mov_i32(out, in);
 401        break;
 402    default:
 403        g_assert_not_reached();
 404    }
 405}
 406
 407void tcg_gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in)
 408{
 409    switch (vece) {
 410    case MO_8:
 411        tcg_gen_ext8u_i64(out, in);
 412        tcg_gen_muli_i64(out, out, 0x0101010101010101ull);
 413        break;
 414    case MO_16:
 415        tcg_gen_ext16u_i64(out, in);
 416        tcg_gen_muli_i64(out, out, 0x0001000100010001ull);
 417        break;
 418    case MO_32:
 419        tcg_gen_deposit_i64(out, in, in, 32, 32);
 420        break;
 421    case MO_64:
 422        tcg_gen_mov_i64(out, in);
 423        break;
 424    default:
 425        g_assert_not_reached();
 426    }
 427}
 428
  429/* Select a supported vector type for implementing an operation on SIZE
  430 * bytes.  If LIST is NULL, assume that the real operation to be performed
  431 * is supported by all backends.  Otherwise, make sure that the opcodes in
  432 * LIST can be performed on elements of size VECE in the selected type.
  433 * Do not select V64 if PREFER_I64 is true.  Return 0 if no vector type
  434 * is selected.  */
 435static TCGType choose_vector_type(const TCGOpcode *list, unsigned vece,
 436                                  uint32_t size, bool prefer_i64)
 437{
 438    /*
 439     * Recall that ARM SVE allows vector sizes that are not a
 440     * power of 2, but always a multiple of 16.  The intent is
 441     * that e.g. size == 80 would be expanded with 2x32 + 1x16.
 442     * It is hard to imagine a case in which v256 is supported
 443     * but v128 is not, but check anyway.
 444     * In addition, expand_clr needs to handle a multiple of 8.
 445     */
 446    if (TCG_TARGET_HAS_v256 &&
 447        check_size_impl(size, 32) &&
 448        tcg_can_emit_vecop_list(list, TCG_TYPE_V256, vece) &&
 449        (!(size & 16) ||
 450         (TCG_TARGET_HAS_v128 &&
 451          tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece))) &&
 452        (!(size & 8) ||
 453         (TCG_TARGET_HAS_v64 &&
 454          tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) {
 455        return TCG_TYPE_V256;
 456    }
 457    if (TCG_TARGET_HAS_v128 &&
 458        check_size_impl(size, 16) &&
 459        tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece) &&
 460        (!(size & 8) ||
 461         (TCG_TARGET_HAS_v64 &&
 462          tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) {
 463        return TCG_TYPE_V128;
 464    }
 465    if (TCG_TARGET_HAS_v64 && !prefer_i64 && check_size_impl(size, 8)
 466        && tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)) {
 467        return TCG_TYPE_V64;
 468    }
 469    return 0;
 470}
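/*
 * As an illustration, consider size == 80, i.e. five 16-byte granules.
 * check_size_impl(80, 32) passes, so V256 is selected when the backend
 * supports both V256 and V128 (the latter is needed for the 16-byte
 * tail).  A backend with only V128 then tries check_size_impl(80, 16),
 * which fails (5 > MAX_UNROLL), as does the V64 test, so the function
 * returns 0 and the caller falls back to integer or out-of-line
 * expansion.
 */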
 471
 472static void do_dup_store(TCGType type, uint32_t dofs, uint32_t oprsz,
 473                         uint32_t maxsz, TCGv_vec t_vec)
 474{
 475    uint32_t i = 0;
 476
 477    tcg_debug_assert(oprsz >= 8);
 478
 479    /*
 480     * This may be expand_clr for the tail of an operation, e.g.
 481     * oprsz == 8 && maxsz == 64.  The first 8 bytes of this store
 482     * are misaligned wrt the maximum vector size, so do that first.
 483     */
 484    if (dofs & 8) {
 485        tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
 486        i += 8;
 487    }
 488
 489    switch (type) {
 490    case TCG_TYPE_V256:
 491        /*
 492         * Recall that ARM SVE allows vector sizes that are not a
 493         * power of 2, but always a multiple of 16.  The intent is
 494         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
 495         */
 496        for (; i + 32 <= oprsz; i += 32) {
 497            tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V256);
 498        }
 499        /* fallthru */
 500    case TCG_TYPE_V128:
 501        for (; i + 16 <= oprsz; i += 16) {
 502            tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V128);
 503        }
 504        break;
 505    case TCG_TYPE_V64:
 506        for (; i < oprsz; i += 8) {
 507            tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
 508        }
 509        break;
 510    default:
 511        g_assert_not_reached();
 512    }
 513
 514    if (oprsz < maxsz) {
 515        expand_clr(dofs + oprsz, maxsz - oprsz);
 516    }
 517}
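/*
 * Continuing the example in the comment above: clearing the 56-byte tail
 * of an oprsz == 8, maxsz == 64 operation reaches here with
 * oprsz == maxsz == 56 and a dofs whose bit 3 is set.  The initial store
 * covers the odd 8 bytes; with type == TCG_TYPE_V128 the loop then emits
 * three 16-byte stores for the remaining 48, and no trailing clear is
 * needed.
 */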
 518
 519/* Set OPRSZ bytes at DOFS to replications of IN_32, IN_64 or IN_C.
 520 * Only one of IN_32 or IN_64 may be set;
 521 * IN_C is used if IN_32 and IN_64 are unset.
 522 */
 523static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz,
 524                   uint32_t maxsz, TCGv_i32 in_32, TCGv_i64 in_64,
 525                   uint64_t in_c)
 526{
 527    TCGType type;
 528    TCGv_i64 t_64;
 529    TCGv_i32 t_32, t_desc;
 530    TCGv_ptr t_ptr;
 531    uint32_t i;
 532
 533    assert(vece <= (in_32 ? MO_32 : MO_64));
 534    assert(in_32 == NULL || in_64 == NULL);
 535
 536    /* If we're storing 0, expand oprsz to maxsz.  */
 537    if (in_32 == NULL && in_64 == NULL) {
 538        in_c = dup_const(vece, in_c);
 539        if (in_c == 0) {
 540            oprsz = maxsz;
 541            vece = MO_8;
 542        } else if (in_c == dup_const(MO_8, in_c)) {
 543            vece = MO_8;
 544        }
 545    }
 546
 547    /* Implement inline with a vector type, if possible.
 548     * Prefer integer when 64-bit host and no variable dup.
 549     */
 550    type = choose_vector_type(NULL, vece, oprsz,
 551                              (TCG_TARGET_REG_BITS == 64 && in_32 == NULL
 552                               && (in_64 == NULL || vece == MO_64)));
 553    if (type != 0) {
 554        TCGv_vec t_vec = tcg_temp_new_vec(type);
 555
 556        if (in_32) {
 557            tcg_gen_dup_i32_vec(vece, t_vec, in_32);
 558        } else if (in_64) {
 559            tcg_gen_dup_i64_vec(vece, t_vec, in_64);
 560        } else {
 561            tcg_gen_dupi_vec(vece, t_vec, in_c);
 562        }
 563        do_dup_store(type, dofs, oprsz, maxsz, t_vec);
 564        tcg_temp_free_vec(t_vec);
 565        return;
 566    }
 567
 568    /* Otherwise, inline with an integer type, unless "large".  */
 569    if (check_size_impl(oprsz, TCG_TARGET_REG_BITS / 8)) {
 570        t_64 = NULL;
 571        t_32 = NULL;
 572
 573        if (in_32) {
 574            /* We are given a 32-bit variable input.  For a 64-bit host,
 575               use a 64-bit operation unless the 32-bit operation would
 576               be simple enough.  */
 577            if (TCG_TARGET_REG_BITS == 64
 578                && (vece != MO_32 || !check_size_impl(oprsz, 4))) {
 579                t_64 = tcg_temp_new_i64();
 580                tcg_gen_extu_i32_i64(t_64, in_32);
 581                tcg_gen_dup_i64(vece, t_64, t_64);
 582            } else {
 583                t_32 = tcg_temp_new_i32();
 584                tcg_gen_dup_i32(vece, t_32, in_32);
 585            }
 586        } else if (in_64) {
 587            /* We are given a 64-bit variable input.  */
 588            t_64 = tcg_temp_new_i64();
 589            tcg_gen_dup_i64(vece, t_64, in_64);
 590        } else {
 591            /* We are given a constant input.  */
 592            /* For 64-bit hosts, use 64-bit constants for "simple" constants
 593               or when we'd need too many 32-bit stores, or when a 64-bit
 594               constant is really required.  */
 595            if (vece == MO_64
 596                || (TCG_TARGET_REG_BITS == 64
 597                    && (in_c == 0 || in_c == -1
 598                        || !check_size_impl(oprsz, 4)))) {
 599                t_64 = tcg_constant_i64(in_c);
 600            } else {
 601                t_32 = tcg_constant_i32(in_c);
 602            }
 603        }
 604
 605        /* Implement inline if we picked an implementation size above.  */
 606        if (t_32) {
 607            for (i = 0; i < oprsz; i += 4) {
 608                tcg_gen_st_i32(t_32, cpu_env, dofs + i);
 609            }
 610            tcg_temp_free_i32(t_32);
 611            goto done;
 612        }
 613        if (t_64) {
 614            for (i = 0; i < oprsz; i += 8) {
 615                tcg_gen_st_i64(t_64, cpu_env, dofs + i);
 616            }
 617            tcg_temp_free_i64(t_64);
 618            goto done;
 619        }
 620    }
 621
 622    /* Otherwise implement out of line.  */
 623    t_ptr = tcg_temp_new_ptr();
 624    tcg_gen_addi_ptr(t_ptr, cpu_env, dofs);
 625
 626    /*
 627     * This may be expand_clr for the tail of an operation, e.g.
 628     * oprsz == 8 && maxsz == 64.  The size of the clear is misaligned
 629     * wrt simd_desc and will assert.  Simply pass all replicated byte
 630     * stores through to memset.
 631     */
 632    if (oprsz == maxsz && vece == MO_8) {
 633        TCGv_ptr t_size = tcg_const_ptr(oprsz);
 634        TCGv_i32 t_val;
 635
 636        if (in_32) {
 637            t_val = in_32;
 638        } else if (in_64) {
 639            t_val = tcg_temp_new_i32();
 640            tcg_gen_extrl_i64_i32(t_val, in_64);
 641        } else {
 642            t_val = tcg_constant_i32(in_c);
 643        }
 644        gen_helper_memset(t_ptr, t_ptr, t_val, t_size);
 645
 646        if (in_64) {
 647            tcg_temp_free_i32(t_val);
 648        }
 649        tcg_temp_free_ptr(t_size);
 650        tcg_temp_free_ptr(t_ptr);
 651        return;
 652    }
 653
 654    t_desc = tcg_constant_i32(simd_desc(oprsz, maxsz, 0));
 655
 656    if (vece == MO_64) {
 657        if (in_64) {
 658            gen_helper_gvec_dup64(t_ptr, t_desc, in_64);
 659        } else {
 660            t_64 = tcg_constant_i64(in_c);
 661            gen_helper_gvec_dup64(t_ptr, t_desc, t_64);
 662        }
 663    } else {
 664        typedef void dup_fn(TCGv_ptr, TCGv_i32, TCGv_i32);
 665        static dup_fn * const fns[3] = {
 666            gen_helper_gvec_dup8,
 667            gen_helper_gvec_dup16,
 668            gen_helper_gvec_dup32
 669        };
 670
 671        if (in_32) {
 672            fns[vece](t_ptr, t_desc, in_32);
 673        } else if (in_64) {
 674            t_32 = tcg_temp_new_i32();
 675            tcg_gen_extrl_i64_i32(t_32, in_64);
 676            fns[vece](t_ptr, t_desc, t_32);
 677            tcg_temp_free_i32(t_32);
 678        } else {
 679            if (vece == MO_8) {
 680                in_c &= 0xff;
 681            } else if (vece == MO_16) {
 682                in_c &= 0xffff;
 683            }
 684            t_32 = tcg_constant_i32(in_c);
 685            fns[vece](t_ptr, t_desc, t_32);
 686        }
 687    }
 688
 689    tcg_temp_free_ptr(t_ptr);
 690    return;
 691
 692 done:
 693    if (oprsz < maxsz) {
 694        expand_clr(dofs + oprsz, maxsz - oprsz);
 695    }
 696}
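/*
 * For example, do_dup(MO_8, dofs, 256, 256, NULL, NULL, 0) on a host with
 * no usable vector type falls through both inline paths (e.g. 256 / 8 = 32
 * on a 64-bit host exceeds MAX_UNROLL) and, since oprsz == maxsz and
 * vece == MO_8, becomes a single out-of-line call to the memset helper
 * with value 0.
 */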
 697
 698/* Likewise, but with zero.  */
 699static void expand_clr(uint32_t dofs, uint32_t maxsz)
 700{
 701    do_dup(MO_8, dofs, maxsz, maxsz, NULL, NULL, 0);
 702}
 703
 704/* Expand OPSZ bytes worth of two-operand operations using i32 elements.  */
 705static void expand_2_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
 706                         bool load_dest, void (*fni)(TCGv_i32, TCGv_i32))
 707{
 708    TCGv_i32 t0 = tcg_temp_new_i32();
 709    TCGv_i32 t1 = tcg_temp_new_i32();
 710    uint32_t i;
 711
 712    for (i = 0; i < oprsz; i += 4) {
 713        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
 714        if (load_dest) {
 715            tcg_gen_ld_i32(t1, cpu_env, dofs + i);
 716        }
 717        fni(t1, t0);
 718        tcg_gen_st_i32(t1, cpu_env, dofs + i);
 719    }
 720    tcg_temp_free_i32(t0);
 721    tcg_temp_free_i32(t1);
 722}
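/*
 * For example, with oprsz == 16 and load_dest false, this emits four
 * iterations of "load a[i], apply fni, store d[i]" on 4-byte lanes at
 * offsets 0, 4, 8 and 12.
 */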
 723
 724static void expand_2i_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
 725                          int32_t c, bool load_dest,
 726                          void (*fni)(TCGv_i32, TCGv_i32, int32_t))
 727{
 728    TCGv_i32 t0 = tcg_temp_new_i32();
 729    TCGv_i32 t1 = tcg_temp_new_i32();
 730    uint32_t i;
 731
 732    for (i = 0; i < oprsz; i += 4) {
 733        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
 734        if (load_dest) {
 735            tcg_gen_ld_i32(t1, cpu_env, dofs + i);
 736        }
 737        fni(t1, t0, c);
 738        tcg_gen_st_i32(t1, cpu_env, dofs + i);
 739    }
 740    tcg_temp_free_i32(t0);
 741    tcg_temp_free_i32(t1);
 742}
 743
 744static void expand_2s_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
 745                          TCGv_i32 c, bool scalar_first,
 746                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
 747{
 748    TCGv_i32 t0 = tcg_temp_new_i32();
 749    TCGv_i32 t1 = tcg_temp_new_i32();
 750    uint32_t i;
 751
 752    for (i = 0; i < oprsz; i += 4) {
 753        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
 754        if (scalar_first) {
 755            fni(t1, c, t0);
 756        } else {
 757            fni(t1, t0, c);
 758        }
 759        tcg_gen_st_i32(t1, cpu_env, dofs + i);
 760    }
 761    tcg_temp_free_i32(t0);
 762    tcg_temp_free_i32(t1);
 763}
 764
 765/* Expand OPSZ bytes worth of three-operand operations using i32 elements.  */
 766static void expand_3_i32(uint32_t dofs, uint32_t aofs,
 767                         uint32_t bofs, uint32_t oprsz, bool load_dest,
 768                         void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
 769{
 770    TCGv_i32 t0 = tcg_temp_new_i32();
 771    TCGv_i32 t1 = tcg_temp_new_i32();
 772    TCGv_i32 t2 = tcg_temp_new_i32();
 773    uint32_t i;
 774
 775    for (i = 0; i < oprsz; i += 4) {
 776        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
 777        tcg_gen_ld_i32(t1, cpu_env, bofs + i);
 778        if (load_dest) {
 779            tcg_gen_ld_i32(t2, cpu_env, dofs + i);
 780        }
 781        fni(t2, t0, t1);
 782        tcg_gen_st_i32(t2, cpu_env, dofs + i);
 783    }
 784    tcg_temp_free_i32(t2);
 785    tcg_temp_free_i32(t1);
 786    tcg_temp_free_i32(t0);
 787}
 788
 789static void expand_3i_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 790                          uint32_t oprsz, int32_t c, bool load_dest,
 791                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, int32_t))
 792{
 793    TCGv_i32 t0 = tcg_temp_new_i32();
 794    TCGv_i32 t1 = tcg_temp_new_i32();
 795    TCGv_i32 t2 = tcg_temp_new_i32();
 796    uint32_t i;
 797
 798    for (i = 0; i < oprsz; i += 4) {
 799        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
 800        tcg_gen_ld_i32(t1, cpu_env, bofs + i);
 801        if (load_dest) {
 802            tcg_gen_ld_i32(t2, cpu_env, dofs + i);
 803        }
 804        fni(t2, t0, t1, c);
 805        tcg_gen_st_i32(t2, cpu_env, dofs + i);
 806    }
 807    tcg_temp_free_i32(t0);
 808    tcg_temp_free_i32(t1);
 809    tcg_temp_free_i32(t2);
 810}
 811
  812/* Expand OPSZ bytes worth of four-operand operations using i32 elements.  */
 813static void expand_4_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 814                         uint32_t cofs, uint32_t oprsz, bool write_aofs,
 815                         void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32))
 816{
 817    TCGv_i32 t0 = tcg_temp_new_i32();
 818    TCGv_i32 t1 = tcg_temp_new_i32();
 819    TCGv_i32 t2 = tcg_temp_new_i32();
 820    TCGv_i32 t3 = tcg_temp_new_i32();
 821    uint32_t i;
 822
 823    for (i = 0; i < oprsz; i += 4) {
 824        tcg_gen_ld_i32(t1, cpu_env, aofs + i);
 825        tcg_gen_ld_i32(t2, cpu_env, bofs + i);
 826        tcg_gen_ld_i32(t3, cpu_env, cofs + i);
 827        fni(t0, t1, t2, t3);
 828        tcg_gen_st_i32(t0, cpu_env, dofs + i);
 829        if (write_aofs) {
 830            tcg_gen_st_i32(t1, cpu_env, aofs + i);
 831        }
 832    }
 833    tcg_temp_free_i32(t3);
 834    tcg_temp_free_i32(t2);
 835    tcg_temp_free_i32(t1);
 836    tcg_temp_free_i32(t0);
 837}
 838
 839/* Expand OPSZ bytes worth of two-operand operations using i64 elements.  */
 840static void expand_2_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
 841                         bool load_dest, void (*fni)(TCGv_i64, TCGv_i64))
 842{
 843    TCGv_i64 t0 = tcg_temp_new_i64();
 844    TCGv_i64 t1 = tcg_temp_new_i64();
 845    uint32_t i;
 846
 847    for (i = 0; i < oprsz; i += 8) {
 848        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
 849        if (load_dest) {
 850            tcg_gen_ld_i64(t1, cpu_env, dofs + i);
 851        }
 852        fni(t1, t0);
 853        tcg_gen_st_i64(t1, cpu_env, dofs + i);
 854    }
 855    tcg_temp_free_i64(t0);
 856    tcg_temp_free_i64(t1);
 857}
 858
 859static void expand_2i_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
 860                          int64_t c, bool load_dest,
 861                          void (*fni)(TCGv_i64, TCGv_i64, int64_t))
 862{
 863    TCGv_i64 t0 = tcg_temp_new_i64();
 864    TCGv_i64 t1 = tcg_temp_new_i64();
 865    uint32_t i;
 866
 867    for (i = 0; i < oprsz; i += 8) {
 868        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
 869        if (load_dest) {
 870            tcg_gen_ld_i64(t1, cpu_env, dofs + i);
 871        }
 872        fni(t1, t0, c);
 873        tcg_gen_st_i64(t1, cpu_env, dofs + i);
 874    }
 875    tcg_temp_free_i64(t0);
 876    tcg_temp_free_i64(t1);
 877}
 878
 879static void expand_2s_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
 880                          TCGv_i64 c, bool scalar_first,
 881                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
 882{
 883    TCGv_i64 t0 = tcg_temp_new_i64();
 884    TCGv_i64 t1 = tcg_temp_new_i64();
 885    uint32_t i;
 886
 887    for (i = 0; i < oprsz; i += 8) {
 888        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
 889        if (scalar_first) {
 890            fni(t1, c, t0);
 891        } else {
 892            fni(t1, t0, c);
 893        }
 894        tcg_gen_st_i64(t1, cpu_env, dofs + i);
 895    }
 896    tcg_temp_free_i64(t0);
 897    tcg_temp_free_i64(t1);
 898}
 899
 900/* Expand OPSZ bytes worth of three-operand operations using i64 elements.  */
 901static void expand_3_i64(uint32_t dofs, uint32_t aofs,
 902                         uint32_t bofs, uint32_t oprsz, bool load_dest,
 903                         void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
 904{
 905    TCGv_i64 t0 = tcg_temp_new_i64();
 906    TCGv_i64 t1 = tcg_temp_new_i64();
 907    TCGv_i64 t2 = tcg_temp_new_i64();
 908    uint32_t i;
 909
 910    for (i = 0; i < oprsz; i += 8) {
 911        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
 912        tcg_gen_ld_i64(t1, cpu_env, bofs + i);
 913        if (load_dest) {
 914            tcg_gen_ld_i64(t2, cpu_env, dofs + i);
 915        }
 916        fni(t2, t0, t1);
 917        tcg_gen_st_i64(t2, cpu_env, dofs + i);
 918    }
 919    tcg_temp_free_i64(t2);
 920    tcg_temp_free_i64(t1);
 921    tcg_temp_free_i64(t0);
 922}
 923
 924static void expand_3i_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 925                          uint32_t oprsz, int64_t c, bool load_dest,
 926                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, int64_t))
 927{
 928    TCGv_i64 t0 = tcg_temp_new_i64();
 929    TCGv_i64 t1 = tcg_temp_new_i64();
 930    TCGv_i64 t2 = tcg_temp_new_i64();
 931    uint32_t i;
 932
 933    for (i = 0; i < oprsz; i += 8) {
 934        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
 935        tcg_gen_ld_i64(t1, cpu_env, bofs + i);
 936        if (load_dest) {
 937            tcg_gen_ld_i64(t2, cpu_env, dofs + i);
 938        }
 939        fni(t2, t0, t1, c);
 940        tcg_gen_st_i64(t2, cpu_env, dofs + i);
 941    }
 942    tcg_temp_free_i64(t0);
 943    tcg_temp_free_i64(t1);
 944    tcg_temp_free_i64(t2);
 945}
 946
  947/* Expand OPSZ bytes worth of four-operand operations using i64 elements.  */
 948static void expand_4_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 949                         uint32_t cofs, uint32_t oprsz, bool write_aofs,
 950                         void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64))
 951{
 952    TCGv_i64 t0 = tcg_temp_new_i64();
 953    TCGv_i64 t1 = tcg_temp_new_i64();
 954    TCGv_i64 t2 = tcg_temp_new_i64();
 955    TCGv_i64 t3 = tcg_temp_new_i64();
 956    uint32_t i;
 957
 958    for (i = 0; i < oprsz; i += 8) {
 959        tcg_gen_ld_i64(t1, cpu_env, aofs + i);
 960        tcg_gen_ld_i64(t2, cpu_env, bofs + i);
 961        tcg_gen_ld_i64(t3, cpu_env, cofs + i);
 962        fni(t0, t1, t2, t3);
 963        tcg_gen_st_i64(t0, cpu_env, dofs + i);
 964        if (write_aofs) {
 965            tcg_gen_st_i64(t1, cpu_env, aofs + i);
 966        }
 967    }
 968    tcg_temp_free_i64(t3);
 969    tcg_temp_free_i64(t2);
 970    tcg_temp_free_i64(t1);
 971    tcg_temp_free_i64(t0);
 972}
 973
 974/* Expand OPSZ bytes worth of two-operand operations using host vectors.  */
 975static void expand_2_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
 976                         uint32_t oprsz, uint32_t tysz, TCGType type,
 977                         bool load_dest,
 978                         void (*fni)(unsigned, TCGv_vec, TCGv_vec))
 979{
 980    TCGv_vec t0 = tcg_temp_new_vec(type);
 981    TCGv_vec t1 = tcg_temp_new_vec(type);
 982    uint32_t i;
 983
 984    for (i = 0; i < oprsz; i += tysz) {
 985        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
 986        if (load_dest) {
 987            tcg_gen_ld_vec(t1, cpu_env, dofs + i);
 988        }
 989        fni(vece, t1, t0);
 990        tcg_gen_st_vec(t1, cpu_env, dofs + i);
 991    }
 992    tcg_temp_free_vec(t0);
 993    tcg_temp_free_vec(t1);
 994}
 995
 996/* Expand OPSZ bytes worth of two-vector operands and an immediate operand
 997   using host vectors.  */
 998static void expand_2i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
 999                          uint32_t oprsz, uint32_t tysz, TCGType type,
1000                          int64_t c, bool load_dest,
1001                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, int64_t))
1002{
1003    TCGv_vec t0 = tcg_temp_new_vec(type);
1004    TCGv_vec t1 = tcg_temp_new_vec(type);
1005    uint32_t i;
1006
1007    for (i = 0; i < oprsz; i += tysz) {
1008        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
1009        if (load_dest) {
1010            tcg_gen_ld_vec(t1, cpu_env, dofs + i);
1011        }
1012        fni(vece, t1, t0, c);
1013        tcg_gen_st_vec(t1, cpu_env, dofs + i);
1014    }
1015    tcg_temp_free_vec(t0);
1016    tcg_temp_free_vec(t1);
1017}
1018
1019static void expand_2s_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1020                          uint32_t oprsz, uint32_t tysz, TCGType type,
1021                          TCGv_vec c, bool scalar_first,
1022                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
1023{
1024    TCGv_vec t0 = tcg_temp_new_vec(type);
1025    TCGv_vec t1 = tcg_temp_new_vec(type);
1026    uint32_t i;
1027
1028    for (i = 0; i < oprsz; i += tysz) {
1029        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
1030        if (scalar_first) {
1031            fni(vece, t1, c, t0);
1032        } else {
1033            fni(vece, t1, t0, c);
1034        }
1035        tcg_gen_st_vec(t1, cpu_env, dofs + i);
1036    }
1037    tcg_temp_free_vec(t0);
1038    tcg_temp_free_vec(t1);
1039}
1040
1041/* Expand OPSZ bytes worth of three-operand operations using host vectors.  */
1042static void expand_3_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1043                         uint32_t bofs, uint32_t oprsz,
1044                         uint32_t tysz, TCGType type, bool load_dest,
1045                         void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
1046{
1047    TCGv_vec t0 = tcg_temp_new_vec(type);
1048    TCGv_vec t1 = tcg_temp_new_vec(type);
1049    TCGv_vec t2 = tcg_temp_new_vec(type);
1050    uint32_t i;
1051
1052    for (i = 0; i < oprsz; i += tysz) {
1053        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
1054        tcg_gen_ld_vec(t1, cpu_env, bofs + i);
1055        if (load_dest) {
1056            tcg_gen_ld_vec(t2, cpu_env, dofs + i);
1057        }
1058        fni(vece, t2, t0, t1);
1059        tcg_gen_st_vec(t2, cpu_env, dofs + i);
1060    }
1061    tcg_temp_free_vec(t2);
1062    tcg_temp_free_vec(t1);
1063    tcg_temp_free_vec(t0);
1064}
1065
1066/*
1067 * Expand OPSZ bytes worth of three-vector operands and an immediate operand
1068 * using host vectors.
1069 */
1070static void expand_3i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1071                          uint32_t bofs, uint32_t oprsz, uint32_t tysz,
1072                          TCGType type, int64_t c, bool load_dest,
1073                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec,
1074                                      int64_t))
1075{
1076    TCGv_vec t0 = tcg_temp_new_vec(type);
1077    TCGv_vec t1 = tcg_temp_new_vec(type);
1078    TCGv_vec t2 = tcg_temp_new_vec(type);
1079    uint32_t i;
1080
1081    for (i = 0; i < oprsz; i += tysz) {
1082        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
1083        tcg_gen_ld_vec(t1, cpu_env, bofs + i);
1084        if (load_dest) {
1085            tcg_gen_ld_vec(t2, cpu_env, dofs + i);
1086        }
1087        fni(vece, t2, t0, t1, c);
1088        tcg_gen_st_vec(t2, cpu_env, dofs + i);
1089    }
1090    tcg_temp_free_vec(t0);
1091    tcg_temp_free_vec(t1);
1092    tcg_temp_free_vec(t2);
1093}
1094
1095/* Expand OPSZ bytes worth of four-operand operations using host vectors.  */
1096static void expand_4_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1097                         uint32_t bofs, uint32_t cofs, uint32_t oprsz,
1098                         uint32_t tysz, TCGType type, bool write_aofs,
1099                         void (*fni)(unsigned, TCGv_vec, TCGv_vec,
1100                                     TCGv_vec, TCGv_vec))
1101{
1102    TCGv_vec t0 = tcg_temp_new_vec(type);
1103    TCGv_vec t1 = tcg_temp_new_vec(type);
1104    TCGv_vec t2 = tcg_temp_new_vec(type);
1105    TCGv_vec t3 = tcg_temp_new_vec(type);
1106    uint32_t i;
1107
1108    for (i = 0; i < oprsz; i += tysz) {
1109        tcg_gen_ld_vec(t1, cpu_env, aofs + i);
1110        tcg_gen_ld_vec(t2, cpu_env, bofs + i);
1111        tcg_gen_ld_vec(t3, cpu_env, cofs + i);
1112        fni(vece, t0, t1, t2, t3);
1113        tcg_gen_st_vec(t0, cpu_env, dofs + i);
1114        if (write_aofs) {
1115            tcg_gen_st_vec(t1, cpu_env, aofs + i);
1116        }
1117    }
1118    tcg_temp_free_vec(t3);
1119    tcg_temp_free_vec(t2);
1120    tcg_temp_free_vec(t1);
1121    tcg_temp_free_vec(t0);
1122}
1123
1124/* Expand a vector two-operand operation.  */
1125void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
1126                    uint32_t oprsz, uint32_t maxsz, const GVecGen2 *g)
1127{
1128    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1129    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1130    TCGType type;
1131    uint32_t some;
1132
1133    check_size_align(oprsz, maxsz, dofs | aofs);
1134    check_overlap_2(dofs, aofs, maxsz);
1135
1136    type = 0;
1137    if (g->fniv) {
1138        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1139    }
1140    switch (type) {
1141    case TCG_TYPE_V256:
1142        /* Recall that ARM SVE allows vector sizes that are not a
1143         * power of 2, but always a multiple of 16.  The intent is
1144         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1145         */
1146        some = QEMU_ALIGN_DOWN(oprsz, 32);
1147        expand_2_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
1148                     g->load_dest, g->fniv);
1149        if (some == oprsz) {
1150            break;
1151        }
1152        dofs += some;
1153        aofs += some;
1154        oprsz -= some;
1155        maxsz -= some;
1156        /* fallthru */
1157    case TCG_TYPE_V128:
1158        expand_2_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
1159                     g->load_dest, g->fniv);
1160        break;
1161    case TCG_TYPE_V64:
1162        expand_2_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
1163                     g->load_dest, g->fniv);
1164        break;
1165
1166    case 0:
1167        if (g->fni8 && check_size_impl(oprsz, 8)) {
1168            expand_2_i64(dofs, aofs, oprsz, g->load_dest, g->fni8);
1169        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1170            expand_2_i32(dofs, aofs, oprsz, g->load_dest, g->fni4);
1171        } else {
1172            assert(g->fno != NULL);
1173            tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, g->data, g->fno);
1174            oprsz = maxsz;
1175        }
1176        break;
1177
1178    default:
1179        g_assert_not_reached();
1180    }
1181    tcg_swap_vecop_list(hold_list);
1182
1183    if (oprsz < maxsz) {
1184        expand_clr(dofs + oprsz, maxsz - oprsz);
1185    }
1186}
1187
1188/* Expand a vector operation with two vectors and an immediate.  */
1189void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
1190                     uint32_t maxsz, int64_t c, const GVecGen2i *g)
1191{
1192    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1193    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1194    TCGType type;
1195    uint32_t some;
1196
1197    check_size_align(oprsz, maxsz, dofs | aofs);
1198    check_overlap_2(dofs, aofs, maxsz);
1199
1200    type = 0;
1201    if (g->fniv) {
1202        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1203    }
1204    switch (type) {
1205    case TCG_TYPE_V256:
1206        /* Recall that ARM SVE allows vector sizes that are not a
1207         * power of 2, but always a multiple of 16.  The intent is
1208         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1209         */
1210        some = QEMU_ALIGN_DOWN(oprsz, 32);
1211        expand_2i_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
1212                      c, g->load_dest, g->fniv);
1213        if (some == oprsz) {
1214            break;
1215        }
1216        dofs += some;
1217        aofs += some;
1218        oprsz -= some;
1219        maxsz -= some;
1220        /* fallthru */
1221    case TCG_TYPE_V128:
1222        expand_2i_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
1223                      c, g->load_dest, g->fniv);
1224        break;
1225    case TCG_TYPE_V64:
1226        expand_2i_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
1227                      c, g->load_dest, g->fniv);
1228        break;
1229
1230    case 0:
1231        if (g->fni8 && check_size_impl(oprsz, 8)) {
1232            expand_2i_i64(dofs, aofs, oprsz, c, g->load_dest, g->fni8);
1233        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1234            expand_2i_i32(dofs, aofs, oprsz, c, g->load_dest, g->fni4);
1235        } else {
1236            if (g->fno) {
1237                tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, c, g->fno);
1238            } else {
1239                TCGv_i64 tcg_c = tcg_constant_i64(c);
1240                tcg_gen_gvec_2i_ool(dofs, aofs, tcg_c, oprsz,
1241                                    maxsz, c, g->fnoi);
1242            }
1243            oprsz = maxsz;
1244        }
1245        break;
1246
1247    default:
1248        g_assert_not_reached();
1249    }
1250    tcg_swap_vecop_list(hold_list);
1251
1252    if (oprsz < maxsz) {
1253        expand_clr(dofs + oprsz, maxsz - oprsz);
1254    }
1255}
1256
1257/* Expand a vector operation with two vectors and a scalar.  */
1258void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
1259                     uint32_t maxsz, TCGv_i64 c, const GVecGen2s *g)
1260{
1261    TCGType type;
1262
1263    check_size_align(oprsz, maxsz, dofs | aofs);
1264    check_overlap_2(dofs, aofs, maxsz);
1265
1266    type = 0;
1267    if (g->fniv) {
1268        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1269    }
1270    if (type != 0) {
1271        const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1272        const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1273        TCGv_vec t_vec = tcg_temp_new_vec(type);
1274        uint32_t some;
1275
1276        tcg_gen_dup_i64_vec(g->vece, t_vec, c);
1277
1278        switch (type) {
1279        case TCG_TYPE_V256:
1280            /* Recall that ARM SVE allows vector sizes that are not a
1281             * power of 2, but always a multiple of 16.  The intent is
1282             * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1283             */
1284            some = QEMU_ALIGN_DOWN(oprsz, 32);
1285            expand_2s_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
1286                          t_vec, g->scalar_first, g->fniv);
1287            if (some == oprsz) {
1288                break;
1289            }
1290            dofs += some;
1291            aofs += some;
1292            oprsz -= some;
1293            maxsz -= some;
1294            /* fallthru */
1295
1296        case TCG_TYPE_V128:
1297            expand_2s_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
1298                          t_vec, g->scalar_first, g->fniv);
1299            break;
1300
1301        case TCG_TYPE_V64:
1302            expand_2s_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
1303                          t_vec, g->scalar_first, g->fniv);
1304            break;
1305
1306        default:
1307            g_assert_not_reached();
1308        }
1309        tcg_temp_free_vec(t_vec);
1310        tcg_swap_vecop_list(hold_list);
1311    } else if (g->fni8 && check_size_impl(oprsz, 8)) {
1312        TCGv_i64 t64 = tcg_temp_new_i64();
1313
1314        tcg_gen_dup_i64(g->vece, t64, c);
1315        expand_2s_i64(dofs, aofs, oprsz, t64, g->scalar_first, g->fni8);
1316        tcg_temp_free_i64(t64);
1317    } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1318        TCGv_i32 t32 = tcg_temp_new_i32();
1319
1320        tcg_gen_extrl_i64_i32(t32, c);
1321        tcg_gen_dup_i32(g->vece, t32, t32);
1322        expand_2s_i32(dofs, aofs, oprsz, t32, g->scalar_first, g->fni4);
1323        tcg_temp_free_i32(t32);
1324    } else {
1325        tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, 0, g->fno);
1326        return;
1327    }
1328
1329    if (oprsz < maxsz) {
1330        expand_clr(dofs + oprsz, maxsz - oprsz);
1331    }
1332}
1333
1334/* Expand a vector three-operand operation.  */
1335void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
1336                    uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g)
1337{
1338    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1339    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1340    TCGType type;
1341    uint32_t some;
1342
1343    check_size_align(oprsz, maxsz, dofs | aofs | bofs);
1344    check_overlap_3(dofs, aofs, bofs, maxsz);
1345
1346    type = 0;
1347    if (g->fniv) {
1348        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1349    }
1350    switch (type) {
1351    case TCG_TYPE_V256:
1352        /* Recall that ARM SVE allows vector sizes that are not a
1353         * power of 2, but always a multiple of 16.  The intent is
1354         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1355         */
1356        some = QEMU_ALIGN_DOWN(oprsz, 32);
1357        expand_3_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
1358                     g->load_dest, g->fniv);
1359        if (some == oprsz) {
1360            break;
1361        }
1362        dofs += some;
1363        aofs += some;
1364        bofs += some;
1365        oprsz -= some;
1366        maxsz -= some;
1367        /* fallthru */
1368    case TCG_TYPE_V128:
1369        expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
1370                     g->load_dest, g->fniv);
1371        break;
1372    case TCG_TYPE_V64:
1373        expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
1374                     g->load_dest, g->fniv);
1375        break;
1376
1377    case 0:
1378        if (g->fni8 && check_size_impl(oprsz, 8)) {
1379            expand_3_i64(dofs, aofs, bofs, oprsz, g->load_dest, g->fni8);
1380        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1381            expand_3_i32(dofs, aofs, bofs, oprsz, g->load_dest, g->fni4);
1382        } else {
1383            assert(g->fno != NULL);
1384            tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz,
1385                               maxsz, g->data, g->fno);
1386            oprsz = maxsz;
1387        }
1388        break;
1389
1390    default:
1391        g_assert_not_reached();
1392    }
1393    tcg_swap_vecop_list(hold_list);
1394
1395    if (oprsz < maxsz) {
1396        expand_clr(dofs + oprsz, maxsz - oprsz);
1397    }
1398}
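/*
 * A sketch of how a target typically drives this expander; the helper
 * name and opcode list below are illustrative rather than definitions
 * from this file:
 *
 *     static const TCGOpcode add_list[] = { INDEX_op_add_vec, 0 };
 *     static const GVecGen3 g = {
 *         .fni8 = tcg_gen_add_i64,
 *         .fniv = tcg_gen_add_vec,
 *         .fno = gen_helper_gvec_add64,
 *         .opt_opc = add_list,
 *         .vece = MO_64,
 *     };
 *     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
 *
 * Depending on host support this becomes per-lane vector adds, 64-bit
 * integer adds, or an out-of-line call to the fno helper.
 */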
1399
1400/* Expand a vector operation with three vectors and an immediate.  */
1401void tcg_gen_gvec_3i(uint32_t dofs, uint32_t aofs, uint32_t bofs,
1402                     uint32_t oprsz, uint32_t maxsz, int64_t c,
1403                     const GVecGen3i *g)
1404{
1405    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1406    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1407    TCGType type;
1408    uint32_t some;
1409
1410    check_size_align(oprsz, maxsz, dofs | aofs | bofs);
1411    check_overlap_3(dofs, aofs, bofs, maxsz);
1412
1413    type = 0;
1414    if (g->fniv) {
1415        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1416    }
1417    switch (type) {
1418    case TCG_TYPE_V256:
1419        /*
1420         * Recall that ARM SVE allows vector sizes that are not a
1421         * power of 2, but always a multiple of 16.  The intent is
1422         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1423         */
1424        some = QEMU_ALIGN_DOWN(oprsz, 32);
1425        expand_3i_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
1426                      c, g->load_dest, g->fniv);
1427        if (some == oprsz) {
1428            break;
1429        }
1430        dofs += some;
1431        aofs += some;
1432        bofs += some;
1433        oprsz -= some;
1434        maxsz -= some;
1435        /* fallthru */
1436    case TCG_TYPE_V128:
1437        expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
1438                      c, g->load_dest, g->fniv);
1439        break;
1440    case TCG_TYPE_V64:
1441        expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
1442                      c, g->load_dest, g->fniv);
1443        break;
1444
1445    case 0:
1446        if (g->fni8 && check_size_impl(oprsz, 8)) {
1447            expand_3i_i64(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni8);
1448        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1449            expand_3i_i32(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni4);
1450        } else {
1451            assert(g->fno != NULL);
1452            tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, c, g->fno);
1453            oprsz = maxsz;
1454        }
1455        break;
1456
1457    default:
1458        g_assert_not_reached();
1459    }
1460    tcg_swap_vecop_list(hold_list);
1461
1462    if (oprsz < maxsz) {
1463        expand_clr(dofs + oprsz, maxsz - oprsz);
1464    }
1465}
1466
1467/* Expand a vector four-operand operation.  */
1468void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
1469                    uint32_t oprsz, uint32_t maxsz, const GVecGen4 *g)
1470{
1471    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1472    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1473    TCGType type;
1474    uint32_t some;
1475
1476    check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs);
1477    check_overlap_4(dofs, aofs, bofs, cofs, maxsz);
1478
1479    type = 0;
1480    if (g->fniv) {
1481        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1482    }
1483    switch (type) {
1484    case TCG_TYPE_V256:
1485        /* Recall that ARM SVE allows vector sizes that are not a
1486         * power of 2, but always a multiple of 16.  The intent is
1487         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1488         */
1489        some = QEMU_ALIGN_DOWN(oprsz, 32);
1490        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, some,
1491                     32, TCG_TYPE_V256, g->write_aofs, g->fniv);
1492        if (some == oprsz) {
1493            break;
1494        }
1495        dofs += some;
1496        aofs += some;
1497        bofs += some;
1498        cofs += some;
1499        oprsz -= some;
1500        maxsz -= some;
1501        /* fallthru */
1502    case TCG_TYPE_V128:
1503        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
1504                     16, TCG_TYPE_V128, g->write_aofs, g->fniv);
1505        break;
1506    case TCG_TYPE_V64:
1507        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
1508                     8, TCG_TYPE_V64, g->write_aofs, g->fniv);
1509        break;
1510
1511    case 0:
1512        if (g->fni8 && check_size_impl(oprsz, 8)) {
1513            expand_4_i64(dofs, aofs, bofs, cofs, oprsz,
1514                         g->write_aofs, g->fni8);
1515        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1516            expand_4_i32(dofs, aofs, bofs, cofs, oprsz,
1517                         g->write_aofs, g->fni4);
1518        } else {
1519            assert(g->fno != NULL);
1520            tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs,
1521                               oprsz, maxsz, g->data, g->fno);
1522            oprsz = maxsz;
1523        }
1524        break;
1525
1526    default:
1527        g_assert_not_reached();
1528    }
1529    tcg_swap_vecop_list(hold_list);
1530
1531    if (oprsz < maxsz) {
1532        expand_clr(dofs + oprsz, maxsz - oprsz);
1533    }
1534}
1535
1536/*
1537 * Expand specific vector operations.
1538 */
1539
1540static void vec_mov2(unsigned vece, TCGv_vec a, TCGv_vec b)
1541{
1542    tcg_gen_mov_vec(a, b);
1543}
1544
1545void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs,
1546                      uint32_t oprsz, uint32_t maxsz)
1547{
1548    static const GVecGen2 g = {
1549        .fni8 = tcg_gen_mov_i64,
1550        .fniv = vec_mov2,
1551        .fno = gen_helper_gvec_mov,
1552        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1553    };
1554    if (dofs != aofs) {
1555        tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
1556    } else {
1557        check_size_align(oprsz, maxsz, dofs);
1558        if (oprsz < maxsz) {
1559            expand_clr(dofs + oprsz, maxsz - oprsz);
1560        }
1561    }
1562}
1563
1564void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t oprsz,
1565                          uint32_t maxsz, TCGv_i32 in)
1566{
1567    check_size_align(oprsz, maxsz, dofs);
1568    tcg_debug_assert(vece <= MO_32);
1569    do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
1570}
1571
1572void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t oprsz,
1573                          uint32_t maxsz, TCGv_i64 in)
1574{
1575    check_size_align(oprsz, maxsz, dofs);
1576    tcg_debug_assert(vece <= MO_64);
1577    do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
1578}
1579
1580void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
1581                          uint32_t oprsz, uint32_t maxsz)
1582{
1583    check_size_align(oprsz, maxsz, dofs);
1584    if (vece <= MO_64) {
1585        TCGType type = choose_vector_type(NULL, vece, oprsz, 0);
1586        if (type != 0) {
1587            TCGv_vec t_vec = tcg_temp_new_vec(type);
1588            tcg_gen_dup_mem_vec(vece, t_vec, cpu_env, aofs);
1589            do_dup_store(type, dofs, oprsz, maxsz, t_vec);
1590            tcg_temp_free_vec(t_vec);
1591        } else if (vece <= MO_32) {
1592            TCGv_i32 in = tcg_temp_new_i32();
1593            switch (vece) {
1594            case MO_8:
1595                tcg_gen_ld8u_i32(in, cpu_env, aofs);
1596                break;
1597            case MO_16:
1598                tcg_gen_ld16u_i32(in, cpu_env, aofs);
1599                break;
1600            default:
1601                tcg_gen_ld_i32(in, cpu_env, aofs);
1602                break;
1603            }
1604            do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
1605            tcg_temp_free_i32(in);
1606        } else {
1607            TCGv_i64 in = tcg_temp_new_i64();
1608            tcg_gen_ld_i64(in, cpu_env, aofs);
1609            do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
1610            tcg_temp_free_i64(in);
1611        }
1612    } else if (vece == 4) {
1613        /* 128-bit duplicate.  */
1614        int i;
1615
1616        tcg_debug_assert(oprsz >= 16);
1617        if (TCG_TARGET_HAS_v128) {
1618            TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V128);
1619
1620            tcg_gen_ld_vec(in, cpu_env, aofs);
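            /* With aofs == dofs the first element is already in place,
               so start the store loop at the second element.  */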
1621            for (i = (aofs == dofs) * 16; i < oprsz; i += 16) {
1622                tcg_gen_st_vec(in, cpu_env, dofs + i);
1623            }
1624            tcg_temp_free_vec(in);
1625        } else {
1626            TCGv_i64 in0 = tcg_temp_new_i64();
1627            TCGv_i64 in1 = tcg_temp_new_i64();
1628
1629            tcg_gen_ld_i64(in0, cpu_env, aofs);
1630            tcg_gen_ld_i64(in1, cpu_env, aofs + 8);
1631            for (i = (aofs == dofs) * 16; i < oprsz; i += 16) {
1632                tcg_gen_st_i64(in0, cpu_env, dofs + i);
1633                tcg_gen_st_i64(in1, cpu_env, dofs + i + 8);
1634            }
1635            tcg_temp_free_i64(in0);
1636            tcg_temp_free_i64(in1);
1637        }
1638        if (oprsz < maxsz) {
1639            expand_clr(dofs + oprsz, maxsz - oprsz);
1640        }
1641    } else if (vece == 5) {
1642        /* 256-bit duplicate.  */
1643        int i;
1644
1645        tcg_debug_assert(oprsz >= 32);
1646        tcg_debug_assert(oprsz % 32 == 0);
1647        if (TCG_TARGET_HAS_v256) {
1648            TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V256);
1649
1650            tcg_gen_ld_vec(in, cpu_env, aofs);
1651            for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
1652                tcg_gen_st_vec(in, cpu_env, dofs + i);
1653            }
1654            tcg_temp_free_vec(in);
1655        } else if (TCG_TARGET_HAS_v128) {
1656            TCGv_vec in0 = tcg_temp_new_vec(TCG_TYPE_V128);
1657            TCGv_vec in1 = tcg_temp_new_vec(TCG_TYPE_V128);
1658
1659            tcg_gen_ld_vec(in0, cpu_env, aofs);
1660            tcg_gen_ld_vec(in1, cpu_env, aofs + 16);
1661            for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
1662                tcg_gen_st_vec(in0, cpu_env, dofs + i);
1663                tcg_gen_st_vec(in1, cpu_env, dofs + i + 16);
1664            }
1665            tcg_temp_free_vec(in0);
1666            tcg_temp_free_vec(in1);
1667        } else {
1668            TCGv_i64 in[4];
1669            int j;
1670
1671            for (j = 0; j < 4; ++j) {
1672                in[j] = tcg_temp_new_i64();
1673                tcg_gen_ld_i64(in[j], cpu_env, aofs + j * 8);
1674            }
1675            for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
1676                for (j = 0; j < 4; ++j) {
1677                    tcg_gen_st_i64(in[j], cpu_env, dofs + i + j * 8);
1678                }
1679            }
1680            for (j = 0; j < 4; ++j) {
1681                tcg_temp_free_i64(in[j]);
1682            }
1683        }
1684        if (oprsz < maxsz) {
1685            expand_clr(dofs + oprsz, maxsz - oprsz);
1686        }
1687    } else {
1688        g_assert_not_reached();
1689    }
1690}
1691
1692void tcg_gen_gvec_dup_imm(unsigned vece, uint32_t dofs, uint32_t oprsz,
1693                          uint32_t maxsz, uint64_t x)
1694{
1695    check_size_align(oprsz, maxsz, dofs);
1696    do_dup(vece, dofs, oprsz, maxsz, NULL, NULL, x);
1697}
1698
1699void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs,
1700                      uint32_t oprsz, uint32_t maxsz)
1701{
1702    static const GVecGen2 g = {
1703        .fni8 = tcg_gen_not_i64,
1704        .fniv = tcg_gen_not_vec,
1705        .fno = gen_helper_gvec_not,
1706        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1707    };
1708    tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
1709}
1710
1711/* Perform a vector addition using normal addition and a mask.  The mask
1712   should be the sign bit of each lane.  This 6-operation form is more
1713   efficient than separate additions when there are 4 or more lanes in
1714   the 64-bit operation.  */
1715static void gen_addv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
1716{
1717    TCGv_i64 t1 = tcg_temp_new_i64();
1718    TCGv_i64 t2 = tcg_temp_new_i64();
1719    TCGv_i64 t3 = tcg_temp_new_i64();
1720
1721    tcg_gen_andc_i64(t1, a, m);
1722    tcg_gen_andc_i64(t2, b, m);
1723    tcg_gen_xor_i64(t3, a, b);
1724    tcg_gen_add_i64(d, t1, t2);
1725    tcg_gen_and_i64(t3, t3, m);
1726    tcg_gen_xor_i64(d, d, t3);
1727
1728    tcg_temp_free_i64(t1);
1729    tcg_temp_free_i64(t2);
1730    tcg_temp_free_i64(t3);
1731}
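/*
 * Editorial sketch (not part of the original source): the same six
 * operations on a host uint64_t with MO_8 lanes, to make the mask trick
 * concrete.  The helper name and the use of plain integer arithmetic are
 * illustrative assumptions only.
 *
 *     static uint64_t add8_lanes_sketch(uint64_t a, uint64_t b)
 *     {
 *         uint64_t m = 0x8080808080808080ull;   // sign bit of every byte lane
 *         uint64_t t1 = a & ~m;                 // a with lane msbs cleared
 *         uint64_t t2 = b & ~m;                 // b with lane msbs cleared
 *         uint64_t d = t1 + t2;                 // carries cannot cross a lane
 *         return d ^ ((a ^ b) & m);             // recompute each lane's msb
 *     }
 *
 * With every lane's msb forced to zero before the add, a carry out of bit 6
 * stops in bit 7; the final xor then restores the true value of each msb.
 */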
1732
1733void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1734{
1735    TCGv_i64 m = tcg_constant_i64(dup_const(MO_8, 0x80));
1736    gen_addv_mask(d, a, b, m);
1737}
1738
1739void tcg_gen_vec_add8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1740{
1741    TCGv_i32 m = tcg_constant_i32((int32_t)dup_const(MO_8, 0x80));
1742    TCGv_i32 t1 = tcg_temp_new_i32();
1743    TCGv_i32 t2 = tcg_temp_new_i32();
1744    TCGv_i32 t3 = tcg_temp_new_i32();
1745
1746    tcg_gen_andc_i32(t1, a, m);
1747    tcg_gen_andc_i32(t2, b, m);
1748    tcg_gen_xor_i32(t3, a, b);
1749    tcg_gen_add_i32(d, t1, t2);
1750    tcg_gen_and_i32(t3, t3, m);
1751    tcg_gen_xor_i32(d, d, t3);
1752
1753    tcg_temp_free_i32(t1);
1754    tcg_temp_free_i32(t2);
1755    tcg_temp_free_i32(t3);
1756}
1757
1758void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1759{
1760    TCGv_i64 m = tcg_constant_i64(dup_const(MO_16, 0x8000));
1761    gen_addv_mask(d, a, b, m);
1762}
1763
1764void tcg_gen_vec_add16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1765{
1766    TCGv_i32 t1 = tcg_temp_new_i32();
1767    TCGv_i32 t2 = tcg_temp_new_i32();
1768
1769    tcg_gen_andi_i32(t1, a, ~0xffff);
1770    tcg_gen_add_i32(t2, a, b);
1771    tcg_gen_add_i32(t1, t1, b);
1772    tcg_gen_deposit_i32(d, t1, t2, 0, 16);
1773
1774    tcg_temp_free_i32(t1);
1775    tcg_temp_free_i32(t2);
1776}
1777
1778void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1779{
1780    TCGv_i64 t1 = tcg_temp_new_i64();
1781    TCGv_i64 t2 = tcg_temp_new_i64();
1782
1783    tcg_gen_andi_i64(t1, a, ~0xffffffffull);
1784    tcg_gen_add_i64(t2, a, b);
1785    tcg_gen_add_i64(t1, t1, b);
1786    tcg_gen_deposit_i64(d, t1, t2, 0, 32);
1787
1788    tcg_temp_free_i64(t1);
1789    tcg_temp_free_i64(t2);
1790}
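/*
 * Editorial sketch (not part of the original source): the two-lane helpers
 * above compute one add that is correct in the low element and a second add,
 * with the low element of A cleared, that is correct in the high element,
 * then stitch the two with a deposit.  On host integers, for two 32-bit
 * lanes (illustrative name only):
 *
 *     static uint64_t add32_pair_sketch(uint64_t a, uint64_t b)
 *     {
 *         uint64_t hi = (a & ~0xffffffffull) + b;   // high lane correct
 *         uint64_t lo = a + b;                      // low lane correct
 *         return (hi & ~0xffffffffull) | (lo & 0xffffffffull);
 *     }
 *
 * Clearing the low lane of A first means no carry can reach the high lane,
 * so the upper 32 bits of HI are exactly a_hi + b_hi.
 */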
1791
1792static const TCGOpcode vecop_list_add[] = { INDEX_op_add_vec, 0 };
1793
1794void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs,
1795                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1796{
1797    static const GVecGen3 g[4] = {
1798        { .fni8 = tcg_gen_vec_add8_i64,
1799          .fniv = tcg_gen_add_vec,
1800          .fno = gen_helper_gvec_add8,
1801          .opt_opc = vecop_list_add,
1802          .vece = MO_8 },
1803        { .fni8 = tcg_gen_vec_add16_i64,
1804          .fniv = tcg_gen_add_vec,
1805          .fno = gen_helper_gvec_add16,
1806          .opt_opc = vecop_list_add,
1807          .vece = MO_16 },
1808        { .fni4 = tcg_gen_add_i32,
1809          .fniv = tcg_gen_add_vec,
1810          .fno = gen_helper_gvec_add32,
1811          .opt_opc = vecop_list_add,
1812          .vece = MO_32 },
1813        { .fni8 = tcg_gen_add_i64,
1814          .fniv = tcg_gen_add_vec,
1815          .fno = gen_helper_gvec_add64,
1816          .opt_opc = vecop_list_add,
1817          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1818          .vece = MO_64 },
1819    };
1820
1821    tcg_debug_assert(vece <= MO_64);
1822    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1823}
1824
1825void tcg_gen_gvec_adds(unsigned vece, uint32_t dofs, uint32_t aofs,
1826                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1827{
1828    static const GVecGen2s g[4] = {
1829        { .fni8 = tcg_gen_vec_add8_i64,
1830          .fniv = tcg_gen_add_vec,
1831          .fno = gen_helper_gvec_adds8,
1832          .opt_opc = vecop_list_add,
1833          .vece = MO_8 },
1834        { .fni8 = tcg_gen_vec_add16_i64,
1835          .fniv = tcg_gen_add_vec,
1836          .fno = gen_helper_gvec_adds16,
1837          .opt_opc = vecop_list_add,
1838          .vece = MO_16 },
1839        { .fni4 = tcg_gen_add_i32,
1840          .fniv = tcg_gen_add_vec,
1841          .fno = gen_helper_gvec_adds32,
1842          .opt_opc = vecop_list_add,
1843          .vece = MO_32 },
1844        { .fni8 = tcg_gen_add_i64,
1845          .fniv = tcg_gen_add_vec,
1846          .fno = gen_helper_gvec_adds64,
1847          .opt_opc = vecop_list_add,
1848          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1849          .vece = MO_64 },
1850    };
1851
1852    tcg_debug_assert(vece <= MO_64);
1853    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
1854}
1855
1856void tcg_gen_gvec_addi(unsigned vece, uint32_t dofs, uint32_t aofs,
1857                       int64_t c, uint32_t oprsz, uint32_t maxsz)
1858{
1859    TCGv_i64 tmp = tcg_constant_i64(c);
1860    tcg_gen_gvec_adds(vece, dofs, aofs, tmp, oprsz, maxsz);
1861}
1862
1863static const TCGOpcode vecop_list_sub[] = { INDEX_op_sub_vec, 0 };
1864
1865void tcg_gen_gvec_subs(unsigned vece, uint32_t dofs, uint32_t aofs,
1866                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1867{
1868    static const GVecGen2s g[4] = {
1869        { .fni8 = tcg_gen_vec_sub8_i64,
1870          .fniv = tcg_gen_sub_vec,
1871          .fno = gen_helper_gvec_subs8,
1872          .opt_opc = vecop_list_sub,
1873          .vece = MO_8 },
1874        { .fni8 = tcg_gen_vec_sub16_i64,
1875          .fniv = tcg_gen_sub_vec,
1876          .fno = gen_helper_gvec_subs16,
1877          .opt_opc = vecop_list_sub,
1878          .vece = MO_16 },
1879        { .fni4 = tcg_gen_sub_i32,
1880          .fniv = tcg_gen_sub_vec,
1881          .fno = gen_helper_gvec_subs32,
1882          .opt_opc = vecop_list_sub,
1883          .vece = MO_32 },
1884        { .fni8 = tcg_gen_sub_i64,
1885          .fniv = tcg_gen_sub_vec,
1886          .fno = gen_helper_gvec_subs64,
1887          .opt_opc = vecop_list_sub,
1888          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1889          .vece = MO_64 },
1890    };
1891
1892    tcg_debug_assert(vece <= MO_64);
1893    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
1894}
1895
1896/* Perform a vector subtraction using normal subtraction and a mask.
1897   Compare gen_addv_mask above.  */
1898static void gen_subv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
1899{
1900    TCGv_i64 t1 = tcg_temp_new_i64();
1901    TCGv_i64 t2 = tcg_temp_new_i64();
1902    TCGv_i64 t3 = tcg_temp_new_i64();
1903
1904    tcg_gen_or_i64(t1, a, m);
1905    tcg_gen_andc_i64(t2, b, m);
1906    tcg_gen_eqv_i64(t3, a, b);
1907    tcg_gen_sub_i64(d, t1, t2);
1908    tcg_gen_and_i64(t3, t3, m);
1909    tcg_gen_xor_i64(d, d, t3);
1910
1911    tcg_temp_free_i64(t1);
1912    tcg_temp_free_i64(t2);
1913    tcg_temp_free_i64(t3);
1914}
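/*
 * Editorial sketch (not part of the original source): the subtraction
 * analogue on a host uint64_t with MO_8 lanes (illustrative name only):
 *
 *     static uint64_t sub8_lanes_sketch(uint64_t a, uint64_t b)
 *     {
 *         uint64_t m = 0x8080808080808080ull;
 *         uint64_t d = (a | m) - (b & ~m);   // borrows stay within each lane
 *         return d ^ (~(a ^ b) & m);         // recompute each lane's msb
 *     }
 *
 * Forcing the minuend's lane msbs to one and the subtrahend's to zero
 * guarantees that no borrow propagates into the next lane.
 */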
1915
1916void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1917{
1918    TCGv_i64 m = tcg_constant_i64(dup_const(MO_8, 0x80));
1919    gen_subv_mask(d, a, b, m);
1920}
1921
1922void tcg_gen_vec_sub8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1923{
1924    TCGv_i32 m = tcg_constant_i32((int32_t)dup_const(MO_8, 0x80));
1925    TCGv_i32 t1 = tcg_temp_new_i32();
1926    TCGv_i32 t2 = tcg_temp_new_i32();
1927    TCGv_i32 t3 = tcg_temp_new_i32();
1928
1929    tcg_gen_or_i32(t1, a, m);
1930    tcg_gen_andc_i32(t2, b, m);
1931    tcg_gen_eqv_i32(t3, a, b);
1932    tcg_gen_sub_i32(d, t1, t2);
1933    tcg_gen_and_i32(t3, t3, m);
1934    tcg_gen_xor_i32(d, d, t3);
1935
1936    tcg_temp_free_i32(t1);
1937    tcg_temp_free_i32(t2);
1938    tcg_temp_free_i32(t3);
1939}
1940
1941void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1942{
1943    TCGv_i64 m = tcg_constant_i64(dup_const(MO_16, 0x8000));
1944    gen_subv_mask(d, a, b, m);
1945}
1946
1947void tcg_gen_vec_sub16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1948{
1949    TCGv_i32 t1 = tcg_temp_new_i32();
1950    TCGv_i32 t2 = tcg_temp_new_i32();
1951
1952    tcg_gen_andi_i32(t1, b, ~0xffff);
1953    tcg_gen_sub_i32(t2, a, b);
1954    tcg_gen_sub_i32(t1, a, t1);
1955    tcg_gen_deposit_i32(d, t1, t2, 0, 16);
1956
1957    tcg_temp_free_i32(t1);
1958    tcg_temp_free_i32(t2);
1959}
1960
1961void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1962{
1963    TCGv_i64 t1 = tcg_temp_new_i64();
1964    TCGv_i64 t2 = tcg_temp_new_i64();
1965
1966    tcg_gen_andi_i64(t1, b, ~0xffffffffull);
1967    tcg_gen_sub_i64(t2, a, b);
1968    tcg_gen_sub_i64(t1, a, t1);
1969    tcg_gen_deposit_i64(d, t1, t2, 0, 32);
1970
1971    tcg_temp_free_i64(t1);
1972    tcg_temp_free_i64(t2);
1973}
1974
1975void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs,
1976                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1977{
1978    static const GVecGen3 g[4] = {
1979        { .fni8 = tcg_gen_vec_sub8_i64,
1980          .fniv = tcg_gen_sub_vec,
1981          .fno = gen_helper_gvec_sub8,
1982          .opt_opc = vecop_list_sub,
1983          .vece = MO_8 },
1984        { .fni8 = tcg_gen_vec_sub16_i64,
1985          .fniv = tcg_gen_sub_vec,
1986          .fno = gen_helper_gvec_sub16,
1987          .opt_opc = vecop_list_sub,
1988          .vece = MO_16 },
1989        { .fni4 = tcg_gen_sub_i32,
1990          .fniv = tcg_gen_sub_vec,
1991          .fno = gen_helper_gvec_sub32,
1992          .opt_opc = vecop_list_sub,
1993          .vece = MO_32 },
1994        { .fni8 = tcg_gen_sub_i64,
1995          .fniv = tcg_gen_sub_vec,
1996          .fno = gen_helper_gvec_sub64,
1997          .opt_opc = vecop_list_sub,
1998          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1999          .vece = MO_64 },
2000    };
2001
2002    tcg_debug_assert(vece <= MO_64);
2003    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2004}
2005
2006static const TCGOpcode vecop_list_mul[] = { INDEX_op_mul_vec, 0 };
2007
2008void tcg_gen_gvec_mul(unsigned vece, uint32_t dofs, uint32_t aofs,
2009                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2010{
2011    static const GVecGen3 g[4] = {
2012        { .fniv = tcg_gen_mul_vec,
2013          .fno = gen_helper_gvec_mul8,
2014          .opt_opc = vecop_list_mul,
2015          .vece = MO_8 },
2016        { .fniv = tcg_gen_mul_vec,
2017          .fno = gen_helper_gvec_mul16,
2018          .opt_opc = vecop_list_mul,
2019          .vece = MO_16 },
2020        { .fni4 = tcg_gen_mul_i32,
2021          .fniv = tcg_gen_mul_vec,
2022          .fno = gen_helper_gvec_mul32,
2023          .opt_opc = vecop_list_mul,
2024          .vece = MO_32 },
2025        { .fni8 = tcg_gen_mul_i64,
2026          .fniv = tcg_gen_mul_vec,
2027          .fno = gen_helper_gvec_mul64,
2028          .opt_opc = vecop_list_mul,
2029          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2030          .vece = MO_64 },
2031    };
2032
2033    tcg_debug_assert(vece <= MO_64);
2034    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2035}
2036
2037void tcg_gen_gvec_muls(unsigned vece, uint32_t dofs, uint32_t aofs,
2038                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2039{
2040    static const GVecGen2s g[4] = {
2041        { .fniv = tcg_gen_mul_vec,
2042          .fno = gen_helper_gvec_muls8,
2043          .opt_opc = vecop_list_mul,
2044          .vece = MO_8 },
2045        { .fniv = tcg_gen_mul_vec,
2046          .fno = gen_helper_gvec_muls16,
2047          .opt_opc = vecop_list_mul,
2048          .vece = MO_16 },
2049        { .fni4 = tcg_gen_mul_i32,
2050          .fniv = tcg_gen_mul_vec,
2051          .fno = gen_helper_gvec_muls32,
2052          .opt_opc = vecop_list_mul,
2053          .vece = MO_32 },
2054        { .fni8 = tcg_gen_mul_i64,
2055          .fniv = tcg_gen_mul_vec,
2056          .fno = gen_helper_gvec_muls64,
2057          .opt_opc = vecop_list_mul,
2058          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2059          .vece = MO_64 },
2060    };
2061
2062    tcg_debug_assert(vece <= MO_64);
2063    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
2064}
2065
2066void tcg_gen_gvec_muli(unsigned vece, uint32_t dofs, uint32_t aofs,
2067                       int64_t c, uint32_t oprsz, uint32_t maxsz)
2068{
2069    TCGv_i64 tmp = tcg_constant_i64(c);
2070    tcg_gen_gvec_muls(vece, dofs, aofs, tmp, oprsz, maxsz);
2071}
2072
2073void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs,
2074                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2075{
2076    static const TCGOpcode vecop_list[] = { INDEX_op_ssadd_vec, 0 };
2077    static const GVecGen3 g[4] = {
2078        { .fniv = tcg_gen_ssadd_vec,
2079          .fno = gen_helper_gvec_ssadd8,
2080          .opt_opc = vecop_list,
2081          .vece = MO_8 },
2082        { .fniv = tcg_gen_ssadd_vec,
2083          .fno = gen_helper_gvec_ssadd16,
2084          .opt_opc = vecop_list,
2085          .vece = MO_16 },
2086        { .fniv = tcg_gen_ssadd_vec,
2087          .fno = gen_helper_gvec_ssadd32,
2088          .opt_opc = vecop_list,
2089          .vece = MO_32 },
2090        { .fniv = tcg_gen_ssadd_vec,
2091          .fno = gen_helper_gvec_ssadd64,
2092          .opt_opc = vecop_list,
2093          .vece = MO_64 },
2094    };
2095    tcg_debug_assert(vece <= MO_64);
2096    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2097}
2098
2099void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs,
2100                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2101{
2102    static const TCGOpcode vecop_list[] = { INDEX_op_sssub_vec, 0 };
2103    static const GVecGen3 g[4] = {
2104        { .fniv = tcg_gen_sssub_vec,
2105          .fno = gen_helper_gvec_sssub8,
2106          .opt_opc = vecop_list,
2107          .vece = MO_8 },
2108        { .fniv = tcg_gen_sssub_vec,
2109          .fno = gen_helper_gvec_sssub16,
2110          .opt_opc = vecop_list,
2111          .vece = MO_16 },
2112        { .fniv = tcg_gen_sssub_vec,
2113          .fno = gen_helper_gvec_sssub32,
2114          .opt_opc = vecop_list,
2115          .vece = MO_32 },
2116        { .fniv = tcg_gen_sssub_vec,
2117          .fno = gen_helper_gvec_sssub64,
2118          .opt_opc = vecop_list,
2119          .vece = MO_64 },
2120    };
2121    tcg_debug_assert(vece <= MO_64);
2122    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2123}
2124
2125static void tcg_gen_usadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2126{
2127    TCGv_i32 max = tcg_constant_i32(-1);
2128    tcg_gen_add_i32(d, a, b);
2129    tcg_gen_movcond_i32(TCG_COND_LTU, d, d, a, max, d);
2130}
2131
2132static void tcg_gen_usadd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2133{
2134    TCGv_i64 max = tcg_constant_i64(-1);
2135    tcg_gen_add_i64(d, a, b);
2136    tcg_gen_movcond_i64(TCG_COND_LTU, d, d, a, max, d);
2137}
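/*
 * Editorial sketch (not part of the original source): the movcond above is
 * the usual "unsigned overflow iff the wrapped sum is below an operand"
 * test.  In plain C (illustrative name only):
 *
 *     static uint32_t usadd32_sketch(uint32_t a, uint32_t b)
 *     {
 *         uint32_t d = a + b;
 *         return d < a ? UINT32_MAX : d;   // saturate on unsigned overflow
 *     }
 */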
2138
2139void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs,
2140                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2141{
2142    static const TCGOpcode vecop_list[] = { INDEX_op_usadd_vec, 0 };
2143    static const GVecGen3 g[4] = {
2144        { .fniv = tcg_gen_usadd_vec,
2145          .fno = gen_helper_gvec_usadd8,
2146          .opt_opc = vecop_list,
2147          .vece = MO_8 },
2148        { .fniv = tcg_gen_usadd_vec,
2149          .fno = gen_helper_gvec_usadd16,
2150          .opt_opc = vecop_list,
2151          .vece = MO_16 },
2152        { .fni4 = tcg_gen_usadd_i32,
2153          .fniv = tcg_gen_usadd_vec,
2154          .fno = gen_helper_gvec_usadd32,
2155          .opt_opc = vecop_list,
2156          .vece = MO_32 },
2157        { .fni8 = tcg_gen_usadd_i64,
2158          .fniv = tcg_gen_usadd_vec,
2159          .fno = gen_helper_gvec_usadd64,
2160          .opt_opc = vecop_list,
2161          .vece = MO_64 }
2162    };
2163    tcg_debug_assert(vece <= MO_64);
2164    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2165}
2166
2167static void tcg_gen_ussub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2168{
2169    TCGv_i32 min = tcg_constant_i32(0);
2170    tcg_gen_sub_i32(d, a, b);
2171    tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, min, d);
2172}
2173
2174static void tcg_gen_ussub_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2175{
2176    TCGv_i64 min = tcg_constant_i64(0);
2177    tcg_gen_sub_i64(d, a, b);
2178    tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, min, d);
2179}
2180
2181void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs,
2182                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2183{
2184    static const TCGOpcode vecop_list[] = { INDEX_op_ussub_vec, 0 };
2185    static const GVecGen3 g[4] = {
2186        { .fniv = tcg_gen_ussub_vec,
2187          .fno = gen_helper_gvec_ussub8,
2188          .opt_opc = vecop_list,
2189          .vece = MO_8 },
2190        { .fniv = tcg_gen_ussub_vec,
2191          .fno = gen_helper_gvec_ussub16,
2192          .opt_opc = vecop_list,
2193          .vece = MO_16 },
2194        { .fni4 = tcg_gen_ussub_i32,
2195          .fniv = tcg_gen_ussub_vec,
2196          .fno = gen_helper_gvec_ussub32,
2197          .opt_opc = vecop_list,
2198          .vece = MO_32 },
2199        { .fni8 = tcg_gen_ussub_i64,
2200          .fniv = tcg_gen_ussub_vec,
2201          .fno = gen_helper_gvec_ussub64,
2202          .opt_opc = vecop_list,
2203          .vece = MO_64 }
2204    };
2205    tcg_debug_assert(vece <= MO_64);
2206    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2207}
2208
2209void tcg_gen_gvec_smin(unsigned vece, uint32_t dofs, uint32_t aofs,
2210                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2211{
2212    static const TCGOpcode vecop_list[] = { INDEX_op_smin_vec, 0 };
2213    static const GVecGen3 g[4] = {
2214        { .fniv = tcg_gen_smin_vec,
2215          .fno = gen_helper_gvec_smin8,
2216          .opt_opc = vecop_list,
2217          .vece = MO_8 },
2218        { .fniv = tcg_gen_smin_vec,
2219          .fno = gen_helper_gvec_smin16,
2220          .opt_opc = vecop_list,
2221          .vece = MO_16 },
2222        { .fni4 = tcg_gen_smin_i32,
2223          .fniv = tcg_gen_smin_vec,
2224          .fno = gen_helper_gvec_smin32,
2225          .opt_opc = vecop_list,
2226          .vece = MO_32 },
2227        { .fni8 = tcg_gen_smin_i64,
2228          .fniv = tcg_gen_smin_vec,
2229          .fno = gen_helper_gvec_smin64,
2230          .opt_opc = vecop_list,
2231          .vece = MO_64 }
2232    };
2233    tcg_debug_assert(vece <= MO_64);
2234    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2235}
2236
2237void tcg_gen_gvec_umin(unsigned vece, uint32_t dofs, uint32_t aofs,
2238                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2239{
2240    static const TCGOpcode vecop_list[] = { INDEX_op_umin_vec, 0 };
2241    static const GVecGen3 g[4] = {
2242        { .fniv = tcg_gen_umin_vec,
2243          .fno = gen_helper_gvec_umin8,
2244          .opt_opc = vecop_list,
2245          .vece = MO_8 },
2246        { .fniv = tcg_gen_umin_vec,
2247          .fno = gen_helper_gvec_umin16,
2248          .opt_opc = vecop_list,
2249          .vece = MO_16 },
2250        { .fni4 = tcg_gen_umin_i32,
2251          .fniv = tcg_gen_umin_vec,
2252          .fno = gen_helper_gvec_umin32,
2253          .opt_opc = vecop_list,
2254          .vece = MO_32 },
2255        { .fni8 = tcg_gen_umin_i64,
2256          .fniv = tcg_gen_umin_vec,
2257          .fno = gen_helper_gvec_umin64,
2258          .opt_opc = vecop_list,
2259          .vece = MO_64 }
2260    };
2261    tcg_debug_assert(vece <= MO_64);
2262    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2263}
2264
2265void tcg_gen_gvec_smax(unsigned vece, uint32_t dofs, uint32_t aofs,
2266                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2267{
2268    static const TCGOpcode vecop_list[] = { INDEX_op_smax_vec, 0 };
2269    static const GVecGen3 g[4] = {
2270        { .fniv = tcg_gen_smax_vec,
2271          .fno = gen_helper_gvec_smax8,
2272          .opt_opc = vecop_list,
2273          .vece = MO_8 },
2274        { .fniv = tcg_gen_smax_vec,
2275          .fno = gen_helper_gvec_smax16,
2276          .opt_opc = vecop_list,
2277          .vece = MO_16 },
2278        { .fni4 = tcg_gen_smax_i32,
2279          .fniv = tcg_gen_smax_vec,
2280          .fno = gen_helper_gvec_smax32,
2281          .opt_opc = vecop_list,
2282          .vece = MO_32 },
2283        { .fni8 = tcg_gen_smax_i64,
2284          .fniv = tcg_gen_smax_vec,
2285          .fno = gen_helper_gvec_smax64,
2286          .opt_opc = vecop_list,
2287          .vece = MO_64 }
2288    };
2289    tcg_debug_assert(vece <= MO_64);
2290    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2291}
2292
2293void tcg_gen_gvec_umax(unsigned vece, uint32_t dofs, uint32_t aofs,
2294                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2295{
2296    static const TCGOpcode vecop_list[] = { INDEX_op_umax_vec, 0 };
2297    static const GVecGen3 g[4] = {
2298        { .fniv = tcg_gen_umax_vec,
2299          .fno = gen_helper_gvec_umax8,
2300          .opt_opc = vecop_list,
2301          .vece = MO_8 },
2302        { .fniv = tcg_gen_umax_vec,
2303          .fno = gen_helper_gvec_umax16,
2304          .opt_opc = vecop_list,
2305          .vece = MO_16 },
2306        { .fni4 = tcg_gen_umax_i32,
2307          .fniv = tcg_gen_umax_vec,
2308          .fno = gen_helper_gvec_umax32,
2309          .opt_opc = vecop_list,
2310          .vece = MO_32 },
2311        { .fni8 = tcg_gen_umax_i64,
2312          .fniv = tcg_gen_umax_vec,
2313          .fno = gen_helper_gvec_umax64,
2314          .opt_opc = vecop_list,
2315          .vece = MO_64 }
2316    };
2317    tcg_debug_assert(vece <= MO_64);
2318    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2319}
2320
2321/* Perform a vector negation using normal negation and a mask.
2322   Compare gen_subv_mask above.  */
2323static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m)
2324{
2325    TCGv_i64 t2 = tcg_temp_new_i64();
2326    TCGv_i64 t3 = tcg_temp_new_i64();
2327
2328    tcg_gen_andc_i64(t3, m, b);
2329    tcg_gen_andc_i64(t2, b, m);
2330    tcg_gen_sub_i64(d, m, t2);
2331    tcg_gen_xor_i64(d, d, t3);
2332
2333    tcg_temp_free_i64(t2);
2334    tcg_temp_free_i64(t3);
2335}
2336
2337void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 b)
2338{
2339    TCGv_i64 m = tcg_constant_i64(dup_const(MO_8, 0x80));
2340    gen_negv_mask(d, b, m);
2341}
2342
2343void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 b)
2344{
2345    TCGv_i64 m = tcg_constant_i64(dup_const(MO_16, 0x8000));
2346    gen_negv_mask(d, b, m);
2347}
2348
2349void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 b)
2350{
2351    TCGv_i64 t1 = tcg_temp_new_i64();
2352    TCGv_i64 t2 = tcg_temp_new_i64();
2353
2354    tcg_gen_andi_i64(t1, b, ~0xffffffffull);
2355    tcg_gen_neg_i64(t2, b);
2356    tcg_gen_neg_i64(t1, t1);
2357    tcg_gen_deposit_i64(d, t1, t2, 0, 32);
2358
2359    tcg_temp_free_i64(t1);
2360    tcg_temp_free_i64(t2);
2361}
2362
2363void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs,
2364                      uint32_t oprsz, uint32_t maxsz)
2365{
2366    static const TCGOpcode vecop_list[] = { INDEX_op_neg_vec, 0 };
2367    static const GVecGen2 g[4] = {
2368        { .fni8 = tcg_gen_vec_neg8_i64,
2369          .fniv = tcg_gen_neg_vec,
2370          .fno = gen_helper_gvec_neg8,
2371          .opt_opc = vecop_list,
2372          .vece = MO_8 },
2373        { .fni8 = tcg_gen_vec_neg16_i64,
2374          .fniv = tcg_gen_neg_vec,
2375          .fno = gen_helper_gvec_neg16,
2376          .opt_opc = vecop_list,
2377          .vece = MO_16 },
2378        { .fni4 = tcg_gen_neg_i32,
2379          .fniv = tcg_gen_neg_vec,
2380          .fno = gen_helper_gvec_neg32,
2381          .opt_opc = vecop_list,
2382          .vece = MO_32 },
2383        { .fni8 = tcg_gen_neg_i64,
2384          .fniv = tcg_gen_neg_vec,
2385          .fno = gen_helper_gvec_neg64,
2386          .opt_opc = vecop_list,
2387          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2388          .vece = MO_64 },
2389    };
2390
2391    tcg_debug_assert(vece <= MO_64);
2392    tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
2393}
2394
2395static void gen_absv_mask(TCGv_i64 d, TCGv_i64 b, unsigned vece)
2396{
2397    TCGv_i64 t = tcg_temp_new_i64();
2398    int nbit = 8 << vece;
2399
2400    /* Create -1 for each negative element.  */
2401    tcg_gen_shri_i64(t, b, nbit - 1);
2402    tcg_gen_andi_i64(t, t, dup_const(vece, 1));
2403    tcg_gen_muli_i64(t, t, (1 << nbit) - 1);
2404
2405    /*
2406     * Invert (via xor -1) and add one.
2407     * Because of the ordering, the msb is already clear,
2408     * so the increment never carries into the next element.
2409     */
2410    tcg_gen_xor_i64(d, b, t);
2411    tcg_gen_andi_i64(t, t, dup_const(vece, 1));
2412    tcg_gen_add_i64(d, d, t);
2413
2414    tcg_temp_free_i64(t);
2415}
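/*
 * Editorial sketch (not part of the original source): the same steps on a
 * host uint64_t with MO_8 lanes (illustrative name only):
 *
 *     static uint64_t abs8_lanes_sketch(uint64_t b)
 *     {
 *         uint64_t ones = 0x0101010101010101ull;
 *         uint64_t neg = (b >> 7) & ones;   // 1 in each negative lane
 *         uint64_t t = neg * 0xff;          // 0xff in each negative lane
 *         uint64_t d = b ^ t;               // conditionally invert
 *         return d + neg;                   // conditionally add one
 *     }
 *
 * After the xor, each negative lane's msb is clear, so the final increment
 * never carries into the next lane.
 */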
2416
2417static void tcg_gen_vec_abs8_i64(TCGv_i64 d, TCGv_i64 b)
2418{
2419    gen_absv_mask(d, b, MO_8);
2420}
2421
2422static void tcg_gen_vec_abs16_i64(TCGv_i64 d, TCGv_i64 b)
2423{
2424    gen_absv_mask(d, b, MO_16);
2425}
2426
2427void tcg_gen_gvec_abs(unsigned vece, uint32_t dofs, uint32_t aofs,
2428                      uint32_t oprsz, uint32_t maxsz)
2429{
2430    static const TCGOpcode vecop_list[] = { INDEX_op_abs_vec, 0 };
2431    static const GVecGen2 g[4] = {
2432        { .fni8 = tcg_gen_vec_abs8_i64,
2433          .fniv = tcg_gen_abs_vec,
2434          .fno = gen_helper_gvec_abs8,
2435          .opt_opc = vecop_list,
2436          .vece = MO_8 },
2437        { .fni8 = tcg_gen_vec_abs16_i64,
2438          .fniv = tcg_gen_abs_vec,
2439          .fno = gen_helper_gvec_abs16,
2440          .opt_opc = vecop_list,
2441          .vece = MO_16 },
2442        { .fni4 = tcg_gen_abs_i32,
2443          .fniv = tcg_gen_abs_vec,
2444          .fno = gen_helper_gvec_abs32,
2445          .opt_opc = vecop_list,
2446          .vece = MO_32 },
2447        { .fni8 = tcg_gen_abs_i64,
2448          .fniv = tcg_gen_abs_vec,
2449          .fno = gen_helper_gvec_abs64,
2450          .opt_opc = vecop_list,
2451          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2452          .vece = MO_64 },
2453    };
2454
2455    tcg_debug_assert(vece <= MO_64);
2456    tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
2457}
2458
2459void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs,
2460                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2461{
2462    static const GVecGen3 g = {
2463        .fni8 = tcg_gen_and_i64,
2464        .fniv = tcg_gen_and_vec,
2465        .fno = gen_helper_gvec_and,
2466        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2467    };
2468
2469    if (aofs == bofs) {
2470        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2471    } else {
2472        tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2473    }
2474}
2475
2476void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs,
2477                     uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2478{
2479    static const GVecGen3 g = {
2480        .fni8 = tcg_gen_or_i64,
2481        .fniv = tcg_gen_or_vec,
2482        .fno = gen_helper_gvec_or,
2483        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2484    };
2485
2486    if (aofs == bofs) {
2487        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2488    } else {
2489        tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2490    }
2491}
2492
2493void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs,
2494                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2495{
2496    static const GVecGen3 g = {
2497        .fni8 = tcg_gen_xor_i64,
2498        .fniv = tcg_gen_xor_vec,
2499        .fno = gen_helper_gvec_xor,
2500        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2501    };
2502
2503    if (aofs == bofs) {
2504        tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, 0);
2505    } else {
2506        tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2507    }
2508}
2509
2510void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs,
2511                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2512{
2513    static const GVecGen3 g = {
2514        .fni8 = tcg_gen_andc_i64,
2515        .fniv = tcg_gen_andc_vec,
2516        .fno = gen_helper_gvec_andc,
2517        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2518    };
2519
2520    if (aofs == bofs) {
2521        tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, 0);
2522    } else {
2523        tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2524    }
2525}
2526
2527void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs,
2528                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2529{
2530    static const GVecGen3 g = {
2531        .fni8 = tcg_gen_orc_i64,
2532        .fniv = tcg_gen_orc_vec,
2533        .fno = gen_helper_gvec_orc,
2534        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2535    };
2536
2537    if (aofs == bofs) {
2538        tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, -1);
2539    } else {
2540        tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2541    }
2542}
2543
2544void tcg_gen_gvec_nand(unsigned vece, uint32_t dofs, uint32_t aofs,
2545                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2546{
2547    static const GVecGen3 g = {
2548        .fni8 = tcg_gen_nand_i64,
2549        .fniv = tcg_gen_nand_vec,
2550        .fno = gen_helper_gvec_nand,
2551        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2552    };
2553
2554    if (aofs == bofs) {
2555        tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz);
2556    } else {
2557        tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2558    }
2559}
2560
2561void tcg_gen_gvec_nor(unsigned vece, uint32_t dofs, uint32_t aofs,
2562                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2563{
2564    static const GVecGen3 g = {
2565        .fni8 = tcg_gen_nor_i64,
2566        .fniv = tcg_gen_nor_vec,
2567        .fno = gen_helper_gvec_nor,
2568        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2569    };
2570
2571    if (aofs == bofs) {
2572        tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz);
2573    } else {
2574        tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2575    }
2576}
2577
2578void tcg_gen_gvec_eqv(unsigned vece, uint32_t dofs, uint32_t aofs,
2579                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2580{
2581    static const GVecGen3 g = {
2582        .fni8 = tcg_gen_eqv_i64,
2583        .fniv = tcg_gen_eqv_vec,
2584        .fno = gen_helper_gvec_eqv,
2585        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2586    };
2587
2588    if (aofs == bofs) {
2589        tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, -1);
2590    } else {
2591        tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2592    }
2593}
2594
2595static const GVecGen2s gop_ands = {
2596    .fni8 = tcg_gen_and_i64,
2597    .fniv = tcg_gen_and_vec,
2598    .fno = gen_helper_gvec_ands,
2599    .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2600    .vece = MO_64
2601};
2602
2603void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs,
2604                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2605{
2606    TCGv_i64 tmp = tcg_temp_new_i64();
2607    tcg_gen_dup_i64(vece, tmp, c);
2608    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
2609    tcg_temp_free_i64(tmp);
2610}
2611
2612void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs,
2613                       int64_t c, uint32_t oprsz, uint32_t maxsz)
2614{
2615    TCGv_i64 tmp = tcg_constant_i64(dup_const(vece, c));
2616    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
2617}
2618
2619static const GVecGen2s gop_xors = {
2620    .fni8 = tcg_gen_xor_i64,
2621    .fniv = tcg_gen_xor_vec,
2622    .fno = gen_helper_gvec_xors,
2623    .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2624    .vece = MO_64
2625};
2626
2627void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs,
2628                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2629{
2630    TCGv_i64 tmp = tcg_temp_new_i64();
2631    tcg_gen_dup_i64(vece, tmp, c);
2632    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
2633    tcg_temp_free_i64(tmp);
2634}
2635
2636void tcg_gen_gvec_xori(unsigned vece, uint32_t dofs, uint32_t aofs,
2637                       int64_t c, uint32_t oprsz, uint32_t maxsz)
2638{
2639    TCGv_i64 tmp = tcg_constant_i64(dup_const(vece, c));
2640    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
2641}
2642
2643static const GVecGen2s gop_ors = {
2644    .fni8 = tcg_gen_or_i64,
2645    .fniv = tcg_gen_or_vec,
2646    .fno = gen_helper_gvec_ors,
2647    .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2648    .vece = MO_64
2649};
2650
2651void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs,
2652                      TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2653{
2654    TCGv_i64 tmp = tcg_temp_new_i64();
2655    tcg_gen_dup_i64(vece, tmp, c);
2656    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
2657    tcg_temp_free_i64(tmp);
2658}
2659
2660void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs,
2661                      int64_t c, uint32_t oprsz, uint32_t maxsz)
2662{
2663    TCGv_i64 tmp = tcg_constant_i64(dup_const(vece, c));
2664    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
2665}
2666
2667void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2668{
2669    uint64_t mask = dup_const(MO_8, 0xff << c);
2670    tcg_gen_shli_i64(d, a, c);
2671    tcg_gen_andi_i64(d, d, mask);
2672}
2673
2674void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2675{
2676    uint64_t mask = dup_const(MO_16, 0xffff << c);
2677    tcg_gen_shli_i64(d, a, c);
2678    tcg_gen_andi_i64(d, d, mask);
2679}
2680
2681void tcg_gen_vec_shl8i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2682{
2683    uint32_t mask = dup_const(MO_8, 0xff << c);
2684    tcg_gen_shli_i32(d, a, c);
2685    tcg_gen_andi_i32(d, d, mask);
2686}
2687
2688void tcg_gen_vec_shl16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2689{
2690    uint32_t mask = dup_const(MO_16, 0xffff << c);
2691    tcg_gen_shli_i32(d, a, c);
2692    tcg_gen_andi_i32(d, d, mask);
2693}
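/*
 * Editorial sketch (not part of the original source): the per-lane immediate
 * shift above is a whole-register shift followed by a mask that discards the
 * bits shifted in from the lane below.  For MO_8 on a host uint64_t
 * (illustrative name only):
 *
 *     static uint64_t shl8i_lanes_sketch(uint64_t a, unsigned c)
 *     {
 *         uint64_t mask = 0x0101010101010101ull * (uint8_t)(0xff << c);
 *         return (a << c) & mask;
 *     }
 */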
2694
2695void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs,
2696                       int64_t shift, uint32_t oprsz, uint32_t maxsz)
2697{
2698    static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 };
2699    static const GVecGen2i g[4] = {
2700        { .fni8 = tcg_gen_vec_shl8i_i64,
2701          .fniv = tcg_gen_shli_vec,
2702          .fno = gen_helper_gvec_shl8i,
2703          .opt_opc = vecop_list,
2704          .vece = MO_8 },
2705        { .fni8 = tcg_gen_vec_shl16i_i64,
2706          .fniv = tcg_gen_shli_vec,
2707          .fno = gen_helper_gvec_shl16i,
2708          .opt_opc = vecop_list,
2709          .vece = MO_16 },
2710        { .fni4 = tcg_gen_shli_i32,
2711          .fniv = tcg_gen_shli_vec,
2712          .fno = gen_helper_gvec_shl32i,
2713          .opt_opc = vecop_list,
2714          .vece = MO_32 },
2715        { .fni8 = tcg_gen_shli_i64,
2716          .fniv = tcg_gen_shli_vec,
2717          .fno = gen_helper_gvec_shl64i,
2718          .opt_opc = vecop_list,
2719          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2720          .vece = MO_64 },
2721    };
2722
2723    tcg_debug_assert(vece <= MO_64);
2724    tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2725    if (shift == 0) {
2726        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2727    } else {
2728        tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2729    }
2730}
2731
2732void tcg_gen_vec_shr8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2733{
2734    uint64_t mask = dup_const(MO_8, 0xff >> c);
2735    tcg_gen_shri_i64(d, a, c);
2736    tcg_gen_andi_i64(d, d, mask);
2737}
2738
2739void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2740{
2741    uint64_t mask = dup_const(MO_16, 0xffff >> c);
2742    tcg_gen_shri_i64(d, a, c);
2743    tcg_gen_andi_i64(d, d, mask);
2744}
2745
2746void tcg_gen_vec_shr8i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2747{
2748    uint32_t mask = dup_const(MO_8, 0xff >> c);
2749    tcg_gen_shri_i32(d, a, c);
2750    tcg_gen_andi_i32(d, d, mask);
2751}
2752
2753void tcg_gen_vec_shr16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2754{
2755    uint32_t mask = dup_const(MO_16, 0xffff >> c);
2756    tcg_gen_shri_i32(d, a, c);
2757    tcg_gen_andi_i32(d, d, mask);
2758}
2759
2760void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs,
2761                       int64_t shift, uint32_t oprsz, uint32_t maxsz)
2762{
2763    static const TCGOpcode vecop_list[] = { INDEX_op_shri_vec, 0 };
2764    static const GVecGen2i g[4] = {
2765        { .fni8 = tcg_gen_vec_shr8i_i64,
2766          .fniv = tcg_gen_shri_vec,
2767          .fno = gen_helper_gvec_shr8i,
2768          .opt_opc = vecop_list,
2769          .vece = MO_8 },
2770        { .fni8 = tcg_gen_vec_shr16i_i64,
2771          .fniv = tcg_gen_shri_vec,
2772          .fno = gen_helper_gvec_shr16i,
2773          .opt_opc = vecop_list,
2774          .vece = MO_16 },
2775        { .fni4 = tcg_gen_shri_i32,
2776          .fniv = tcg_gen_shri_vec,
2777          .fno = gen_helper_gvec_shr32i,
2778          .opt_opc = vecop_list,
2779          .vece = MO_32 },
2780        { .fni8 = tcg_gen_shri_i64,
2781          .fniv = tcg_gen_shri_vec,
2782          .fno = gen_helper_gvec_shr64i,
2783          .opt_opc = vecop_list,
2784          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2785          .vece = MO_64 },
2786    };
2787
2788    tcg_debug_assert(vece <= MO_64);
2789    tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2790    if (shift == 0) {
2791        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2792    } else {
2793        tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2794    }
2795}
2796
2797void tcg_gen_vec_sar8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2798{
2799    uint64_t s_mask = dup_const(MO_8, 0x80 >> c);
2800    uint64_t c_mask = dup_const(MO_8, 0xff >> c);
2801    TCGv_i64 s = tcg_temp_new_i64();
2802
2803    tcg_gen_shri_i64(d, a, c);
2804    tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
2805    tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
2806    tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign  */
2807    tcg_gen_or_i64(d, d, s);         /* include sign extension */
2808    tcg_temp_free_i64(s);
2809}
2810
2811void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2812{
2813    uint64_t s_mask = dup_const(MO_16, 0x8000 >> c);
2814    uint64_t c_mask = dup_const(MO_16, 0xffff >> c);
2815    TCGv_i64 s = tcg_temp_new_i64();
2816
2817    tcg_gen_shri_i64(d, a, c);
2818    tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
2819    tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign  */
2820    tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
2821    tcg_gen_or_i64(d, d, s);         /* include sign extension */
2822    tcg_temp_free_i64(s);
2823}
2824
2825void tcg_gen_vec_sar8i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2826{
2827    uint32_t s_mask = dup_const(MO_8, 0x80 >> c);
2828    uint32_t c_mask = dup_const(MO_8, 0xff >> c);
2829    TCGv_i32 s = tcg_temp_new_i32();
2830
2831    tcg_gen_shri_i32(d, a, c);
2832    tcg_gen_andi_i32(s, d, s_mask);  /* isolate (shifted) sign bit */
2833    tcg_gen_muli_i32(s, s, (2 << c) - 2); /* replicate isolated signs */
2834    tcg_gen_andi_i32(d, d, c_mask);  /* clear out bits above sign  */
2835    tcg_gen_or_i32(d, d, s);         /* include sign extension */
2836    tcg_temp_free_i32(s);
2837}
2838
2839void tcg_gen_vec_sar16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2840{
2841    uint32_t s_mask = dup_const(MO_16, 0x8000 >> c);
2842    uint32_t c_mask = dup_const(MO_16, 0xffff >> c);
2843    TCGv_i32 s = tcg_temp_new_i32();
2844
2845    tcg_gen_shri_i32(d, a, c);
2846    tcg_gen_andi_i32(s, d, s_mask);  /* isolate (shifted) sign bit */
2847    tcg_gen_andi_i32(d, d, c_mask);  /* clear out bits above sign  */
2848    tcg_gen_muli_i32(s, s, (2 << c) - 2); /* replicate isolated signs */
2849    tcg_gen_or_i32(d, d, s);         /* include sign extension */
2850    tcg_temp_free_i32(s);
2851}
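/*
 * Editorial sketch (not part of the original source): the arithmetic-shift
 * helpers above perform a logical shift and then smear each lane's shifted
 * sign bit across the vacated positions.  For MO_8 on a host uint64_t
 * (illustrative name only, assuming 0 < c < 8):
 *
 *     static uint64_t sar8i_lanes_sketch(uint64_t a, unsigned c)
 *     {
 *         uint64_t ones = 0x0101010101010101ull;
 *         uint64_t s_mask = ones * (0x80 >> c);   // shifted sign bit per lane
 *         uint64_t c_mask = ones * (0xff >> c);   // bits a logical shift keeps
 *         uint64_t d = a >> c;
 *         uint64_t s = (d & s_mask) * ((2 << c) - 2);   // replicate the sign
 *         return (d & c_mask) | s;
 *     }
 */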
2852
2853void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs,
2854                       int64_t shift, uint32_t oprsz, uint32_t maxsz)
2855{
2856    static const TCGOpcode vecop_list[] = { INDEX_op_sari_vec, 0 };
2857    static const GVecGen2i g[4] = {
2858        { .fni8 = tcg_gen_vec_sar8i_i64,
2859          .fniv = tcg_gen_sari_vec,
2860          .fno = gen_helper_gvec_sar8i,
2861          .opt_opc = vecop_list,
2862          .vece = MO_8 },
2863        { .fni8 = tcg_gen_vec_sar16i_i64,
2864          .fniv = tcg_gen_sari_vec,
2865          .fno = gen_helper_gvec_sar16i,
2866          .opt_opc = vecop_list,
2867          .vece = MO_16 },
2868        { .fni4 = tcg_gen_sari_i32,
2869          .fniv = tcg_gen_sari_vec,
2870          .fno = gen_helper_gvec_sar32i,
2871          .opt_opc = vecop_list,
2872          .vece = MO_32 },
2873        { .fni8 = tcg_gen_sari_i64,
2874          .fniv = tcg_gen_sari_vec,
2875          .fno = gen_helper_gvec_sar64i,
2876          .opt_opc = vecop_list,
2877          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2878          .vece = MO_64 },
2879    };
2880
2881    tcg_debug_assert(vece <= MO_64);
2882    tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2883    if (shift == 0) {
2884        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2885    } else {
2886        tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2887    }
2888}
2889
2890void tcg_gen_vec_rotl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2891{
2892    uint64_t mask = dup_const(MO_8, 0xff << c);
2893
2894    tcg_gen_shli_i64(d, a, c);
2895    tcg_gen_shri_i64(a, a, 8 - c);
2896    tcg_gen_andi_i64(d, d, mask);
2897    tcg_gen_andi_i64(a, a, ~mask);
2898    tcg_gen_or_i64(d, d, a);
2899}
2900
2901void tcg_gen_vec_rotl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2902{
2903    uint64_t mask = dup_const(MO_16, 0xffff << c);
2904
2905    tcg_gen_shli_i64(d, a, c);
2906    tcg_gen_shri_i64(a, a, 16 - c);
2907    tcg_gen_andi_i64(d, d, mask);
2908    tcg_gen_andi_i64(a, a, ~mask);
2909    tcg_gen_or_i64(d, d, a);
2910}
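/*
 * Editorial sketch (not part of the original source): a per-lane rotate is
 * the or of two masked whole-register shifts.  For MO_8 on a host uint64_t
 * (illustrative name only, assuming 0 < c < 8; note that the helpers above
 * also use A as scratch):
 *
 *     static uint64_t rotl8i_lanes_sketch(uint64_t a, unsigned c)
 *     {
 *         uint64_t mask = 0x0101010101010101ull * (uint8_t)(0xff << c);
 *         return ((a << c) & mask) | ((a >> (8 - c)) & ~mask);
 *     }
 */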
2911
2912void tcg_gen_gvec_rotli(unsigned vece, uint32_t dofs, uint32_t aofs,
2913                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
2914{
2915    static const TCGOpcode vecop_list[] = { INDEX_op_rotli_vec, 0 };
2916    static const GVecGen2i g[4] = {
2917        { .fni8 = tcg_gen_vec_rotl8i_i64,
2918          .fniv = tcg_gen_rotli_vec,
2919          .fno = gen_helper_gvec_rotl8i,
2920          .opt_opc = vecop_list,
2921          .vece = MO_8 },
2922        { .fni8 = tcg_gen_vec_rotl16i_i64,
2923          .fniv = tcg_gen_rotli_vec,
2924          .fno = gen_helper_gvec_rotl16i,
2925          .opt_opc = vecop_list,
2926          .vece = MO_16 },
2927        { .fni4 = tcg_gen_rotli_i32,
2928          .fniv = tcg_gen_rotli_vec,
2929          .fno = gen_helper_gvec_rotl32i,
2930          .opt_opc = vecop_list,
2931          .vece = MO_32 },
2932        { .fni8 = tcg_gen_rotli_i64,
2933          .fniv = tcg_gen_rotli_vec,
2934          .fno = gen_helper_gvec_rotl64i,
2935          .opt_opc = vecop_list,
2936          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2937          .vece = MO_64 },
2938    };
2939
2940    tcg_debug_assert(vece <= MO_64);
2941    tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2942    if (shift == 0) {
2943        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2944    } else {
2945        tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2946    }
2947}
2948
2949void tcg_gen_gvec_rotri(unsigned vece, uint32_t dofs, uint32_t aofs,
2950                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
2951{
2952    tcg_debug_assert(vece <= MO_64);
2953    tcg_debug_assert(shift >= 0 && shift < (8 << vece));
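    /* A rotate right by SHIFT is a rotate left by (element bits - SHIFT),
       reduced modulo the element size so that SHIFT == 0 stays 0.  */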
2954    tcg_gen_gvec_rotli(vece, dofs, aofs, -shift & ((8 << vece) - 1),
2955                       oprsz, maxsz);
2956}
2957
2958/*
2959 * Specialized generation of vector shifts by a non-constant scalar.
2960 */
2961
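/*
 * This descriptor bundles the possible expansions of a shift by a scalar
 * held in a register: a vector shift-by-scalar op (s_list), a vector
 * shift-by-vector op applied to a duplicated scalar (v_list), and integer
 * or out-of-line fallbacks (fni4/fni8/fno).  do_gvec_shifts below tries
 * them in that order.
 */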
2962typedef struct {
2963    void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32);
2964    void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64);
2965    void (*fniv_s)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32);
2966    void (*fniv_v)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec);
2967    gen_helper_gvec_2 *fno[4];
2968    TCGOpcode s_list[2];
2969    TCGOpcode v_list[2];
2970} GVecGen2sh;
2971
2972static void expand_2sh_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
2973                           uint32_t oprsz, uint32_t tysz, TCGType type,
2974                           TCGv_i32 shift,
2975                           void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32))
2976{
2977    TCGv_vec t0 = tcg_temp_new_vec(type);
2978    uint32_t i;
2979
2980    for (i = 0; i < oprsz; i += tysz) {
2981        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
2982        fni(vece, t0, t0, shift);
2983        tcg_gen_st_vec(t0, cpu_env, dofs + i);
2984    }
2985    tcg_temp_free_vec(t0);
2986}
2987
2988static void
2989do_gvec_shifts(unsigned vece, uint32_t dofs, uint32_t aofs, TCGv_i32 shift,
2990               uint32_t oprsz, uint32_t maxsz, const GVecGen2sh *g)
2991{
2992    TCGType type;
2993    uint32_t some;
2994
2995    check_size_align(oprsz, maxsz, dofs | aofs);
2996    check_overlap_2(dofs, aofs, maxsz);
2997
2998    /* If the backend has a scalar expansion, great.  */
2999    type = choose_vector_type(g->s_list, vece, oprsz, vece == MO_64);
3000    if (type) {
3001        const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);
3002        switch (type) {
3003        case TCG_TYPE_V256:
3004            some = QEMU_ALIGN_DOWN(oprsz, 32);
3005            expand_2sh_vec(vece, dofs, aofs, some, 32,
3006                           TCG_TYPE_V256, shift, g->fniv_s);
3007            if (some == oprsz) {
3008                break;
3009            }
3010            dofs += some;
3011            aofs += some;
3012            oprsz -= some;
3013            maxsz -= some;
3014            /* fallthru */
3015        case TCG_TYPE_V128:
3016            expand_2sh_vec(vece, dofs, aofs, oprsz, 16,
3017                           TCG_TYPE_V128, shift, g->fniv_s);
3018            break;
3019        case TCG_TYPE_V64:
3020            expand_2sh_vec(vece, dofs, aofs, oprsz, 8,
3021                           TCG_TYPE_V64, shift, g->fniv_s);
3022            break;
3023        default:
3024            g_assert_not_reached();
3025        }
3026        tcg_swap_vecop_list(hold_list);
3027        goto clear_tail;
3028    }
3029
3030    /* If the backend supports variable vector shifts, also cool.  */
3031    type = choose_vector_type(g->v_list, vece, oprsz, vece == MO_64);
3032    if (type) {
3033        const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);
3034        TCGv_vec v_shift = tcg_temp_new_vec(type);
3035
3036        if (vece == MO_64) {
3037            TCGv_i64 sh64 = tcg_temp_new_i64();
3038            tcg_gen_extu_i32_i64(sh64, shift);
3039            tcg_gen_dup_i64_vec(MO_64, v_shift, sh64);
3040            tcg_temp_free_i64(sh64);
3041        } else {
3042            tcg_gen_dup_i32_vec(vece, v_shift, shift);
3043        }
3044
3045        switch (type) {
3046        case TCG_TYPE_V256:
3047            some = QEMU_ALIGN_DOWN(oprsz, 32);
3048            expand_2s_vec(vece, dofs, aofs, some, 32, TCG_TYPE_V256,
3049                          v_shift, false, g->fniv_v);
3050            if (some == oprsz) {
3051                break;
3052            }
3053            dofs += some;
3054            aofs += some;
3055            oprsz -= some;
3056            maxsz -= some;
3057            /* fallthru */
3058        case TCG_TYPE_V128:
3059            expand_2s_vec(vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
3060                          v_shift, false, g->fniv_v);
3061            break;
3062        case TCG_TYPE_V64:
3063            expand_2s_vec(vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
3064                          v_shift, false, g->fniv_v);
3065            break;
3066        default:
3067            g_assert_not_reached();
3068        }
3069        tcg_temp_free_vec(v_shift);
3070        tcg_swap_vecop_list(hold_list);
3071        goto clear_tail;
3072    }
3073
3074    /* Otherwise fall back to integer expansion or the out-of-line helper.  */
3075    if (vece == MO_32 && check_size_impl(oprsz, 4)) {
3076        expand_2s_i32(dofs, aofs, oprsz, shift, false, g->fni4);
3077    } else if (vece == MO_64 && check_size_impl(oprsz, 8)) {
3078        TCGv_i64 sh64 = tcg_temp_new_i64();
3079        tcg_gen_extu_i32_i64(sh64, shift);
3080        expand_2s_i64(dofs, aofs, oprsz, sh64, false, g->fni8);
3081        tcg_temp_free_i64(sh64);
3082    } else {
3083        TCGv_ptr a0 = tcg_temp_new_ptr();
3084        TCGv_ptr a1 = tcg_temp_new_ptr();
3085        TCGv_i32 desc = tcg_temp_new_i32();
3086
3087        tcg_gen_shli_i32(desc, shift, SIMD_DATA_SHIFT);
3088        tcg_gen_ori_i32(desc, desc, simd_desc(oprsz, maxsz, 0));
3089        tcg_gen_addi_ptr(a0, cpu_env, dofs);
3090        tcg_gen_addi_ptr(a1, cpu_env, aofs);
3091
3092        g->fno[vece](a0, a1, desc);
3093
3094        tcg_temp_free_ptr(a0);
3095        tcg_temp_free_ptr(a1);
3096        tcg_temp_free_i32(desc);
3097        return;
3098    }
3099
3100 clear_tail:
3101    if (oprsz < maxsz) {
3102        expand_clr(dofs + oprsz, maxsz - oprsz);
3103    }
3104}
3105
3106void tcg_gen_gvec_shls(unsigned vece, uint32_t dofs, uint32_t aofs,
3107                       TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3108{
3109    static const GVecGen2sh g = {
3110        .fni4 = tcg_gen_shl_i32,
3111        .fni8 = tcg_gen_shl_i64,
3112        .fniv_s = tcg_gen_shls_vec,
3113        .fniv_v = tcg_gen_shlv_vec,
3114        .fno = {
3115            gen_helper_gvec_shl8i,
3116            gen_helper_gvec_shl16i,
3117            gen_helper_gvec_shl32i,
3118            gen_helper_gvec_shl64i,
3119        },
3120        .s_list = { INDEX_op_shls_vec, 0 },
3121        .v_list = { INDEX_op_shlv_vec, 0 },
3122    };
3123
3124    tcg_debug_assert(vece <= MO_64);
3125    do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3126}
3127
3128void tcg_gen_gvec_shrs(unsigned vece, uint32_t dofs, uint32_t aofs,
3129                       TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3130{
3131    static const GVecGen2sh g = {
3132        .fni4 = tcg_gen_shr_i32,
3133        .fni8 = tcg_gen_shr_i64,
3134        .fniv_s = tcg_gen_shrs_vec,
3135        .fniv_v = tcg_gen_shrv_vec,
3136        .fno = {
3137            gen_helper_gvec_shr8i,
3138            gen_helper_gvec_shr16i,
3139            gen_helper_gvec_shr32i,
3140            gen_helper_gvec_shr64i,
3141        },
3142        .s_list = { INDEX_op_shrs_vec, 0 },
3143        .v_list = { INDEX_op_shrv_vec, 0 },
3144    };
3145
3146    tcg_debug_assert(vece <= MO_64);
3147    do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3148}
3149
3150void tcg_gen_gvec_sars(unsigned vece, uint32_t dofs, uint32_t aofs,
3151                       TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3152{
3153    static const GVecGen2sh g = {
3154        .fni4 = tcg_gen_sar_i32,
3155        .fni8 = tcg_gen_sar_i64,
3156        .fniv_s = tcg_gen_sars_vec,
3157        .fniv_v = tcg_gen_sarv_vec,
3158        .fno = {
3159            gen_helper_gvec_sar8i,
3160            gen_helper_gvec_sar16i,
3161            gen_helper_gvec_sar32i,
3162            gen_helper_gvec_sar64i,
3163        },
3164        .s_list = { INDEX_op_sars_vec, 0 },
3165        .v_list = { INDEX_op_sarv_vec, 0 },
3166    };
3167
3168    tcg_debug_assert(vece <= MO_64);
3169    do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3170}
3171
3172void tcg_gen_gvec_rotls(unsigned vece, uint32_t dofs, uint32_t aofs,
3173                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3174{
3175    static const GVecGen2sh g = {
3176        .fni4 = tcg_gen_rotl_i32,
3177        .fni8 = tcg_gen_rotl_i64,
3178        .fniv_s = tcg_gen_rotls_vec,
3179        .fniv_v = tcg_gen_rotlv_vec,
3180        .fno = {
3181            gen_helper_gvec_rotl8i,
3182            gen_helper_gvec_rotl16i,
3183            gen_helper_gvec_rotl32i,
3184            gen_helper_gvec_rotl64i,
3185        },
3186        .s_list = { INDEX_op_rotls_vec, 0 },
3187        .v_list = { INDEX_op_rotlv_vec, 0 },
3188    };
3189
3190    tcg_debug_assert(vece <= MO_64);
3191    do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3192}
3193
3194/*
3195 * Expand D = A << (B % element bits)
3196 *
3197 * Unlike scalar shifts, where it is easy for the target front end
3198 * to include the modulo as part of the expansion, for vectors we
3199 * apply the modulo here.  If the target naturally includes the modulo
3200 * as part of the operation, great!  If the target has some other
3201 * behaviour for out-of-range shifts, it could not use this function
3202 * anyway, and would need to do its own expansion with custom functions.
3203 */
3204static void tcg_gen_shlv_mod_vec(unsigned vece, TCGv_vec d,
3205                                 TCGv_vec a, TCGv_vec b)
3206{
3207    TCGv_vec t = tcg_temp_new_vec_matching(d);
3208    TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3209
3210    tcg_gen_and_vec(vece, t, b, m);
3211    tcg_gen_shlv_vec(vece, d, a, t);
3212    tcg_temp_free_vec(t);
3213}
3214
3215static void tcg_gen_shl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3216{
3217    TCGv_i32 t = tcg_temp_new_i32();
3218
3219    tcg_gen_andi_i32(t, b, 31);
3220    tcg_gen_shl_i32(d, a, t);
3221    tcg_temp_free_i32(t);
3222}
3223
3224static void tcg_gen_shl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3225{
3226    TCGv_i64 t = tcg_temp_new_i64();
3227
3228    tcg_gen_andi_i64(t, b, 63);
3229    tcg_gen_shl_i64(d, a, t);
3230    tcg_temp_free_i64(t);
3231}
3232
3233void tcg_gen_gvec_shlv(unsigned vece, uint32_t dofs, uint32_t aofs,
3234                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3235{
3236    static const TCGOpcode vecop_list[] = { INDEX_op_shlv_vec, 0 };
3237    static const GVecGen3 g[4] = {
3238        { .fniv = tcg_gen_shlv_mod_vec,
3239          .fno = gen_helper_gvec_shl8v,
3240          .opt_opc = vecop_list,
3241          .vece = MO_8 },
3242        { .fniv = tcg_gen_shlv_mod_vec,
3243          .fno = gen_helper_gvec_shl16v,
3244          .opt_opc = vecop_list,
3245          .vece = MO_16 },
3246        { .fni4 = tcg_gen_shl_mod_i32,
3247          .fniv = tcg_gen_shlv_mod_vec,
3248          .fno = gen_helper_gvec_shl32v,
3249          .opt_opc = vecop_list,
3250          .vece = MO_32 },
3251        { .fni8 = tcg_gen_shl_mod_i64,
3252          .fniv = tcg_gen_shlv_mod_vec,
3253          .fno = gen_helper_gvec_shl64v,
3254          .opt_opc = vecop_list,
3255          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3256          .vece = MO_64 },
3257    };
3258
3259    tcg_debug_assert(vece <= MO_64);
3260    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3261}
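
/*
 * With vece == MO_16 the mask above is 15, so a per-element count of 17
 * in B shifts the corresponding element of A left by 1.  An illustrative
 * call, assuming hypothetical offsets into cpu_env:
 *
 *     tcg_gen_gvec_shlv(MO_16, d_ofs, a_ofs, b_ofs, 16, 16);
 */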
3262
3263/*
3264 * Similarly for logical right shifts.
3265 */
3266
3267static void tcg_gen_shrv_mod_vec(unsigned vece, TCGv_vec d,
3268                                 TCGv_vec a, TCGv_vec b)
3269{
3270    TCGv_vec t = tcg_temp_new_vec_matching(d);
3271    TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3272
3273    tcg_gen_and_vec(vece, t, b, m);
3274    tcg_gen_shrv_vec(vece, d, a, t);
3275    tcg_temp_free_vec(t);
3276}
3277
3278static void tcg_gen_shr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3279{
3280    TCGv_i32 t = tcg_temp_new_i32();
3281
3282    tcg_gen_andi_i32(t, b, 31);
3283    tcg_gen_shr_i32(d, a, t);
3284    tcg_temp_free_i32(t);
3285}
3286
3287static void tcg_gen_shr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3288{
3289    TCGv_i64 t = tcg_temp_new_i64();
3290
3291    tcg_gen_andi_i64(t, b, 63);
3292    tcg_gen_shr_i64(d, a, t);
3293    tcg_temp_free_i64(t);
3294}
3295
3296void tcg_gen_gvec_shrv(unsigned vece, uint32_t dofs, uint32_t aofs,
3297                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3298{
3299    static const TCGOpcode vecop_list[] = { INDEX_op_shrv_vec, 0 };
3300    static const GVecGen3 g[4] = {
3301        { .fniv = tcg_gen_shrv_mod_vec,
3302          .fno = gen_helper_gvec_shr8v,
3303          .opt_opc = vecop_list,
3304          .vece = MO_8 },
3305        { .fniv = tcg_gen_shrv_mod_vec,
3306          .fno = gen_helper_gvec_shr16v,
3307          .opt_opc = vecop_list,
3308          .vece = MO_16 },
3309        { .fni4 = tcg_gen_shr_mod_i32,
3310          .fniv = tcg_gen_shrv_mod_vec,
3311          .fno = gen_helper_gvec_shr32v,
3312          .opt_opc = vecop_list,
3313          .vece = MO_32 },
3314        { .fni8 = tcg_gen_shr_mod_i64,
3315          .fniv = tcg_gen_shrv_mod_vec,
3316          .fno = gen_helper_gvec_shr64v,
3317          .opt_opc = vecop_list,
3318          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3319          .vece = MO_64 },
3320    };
3321
3322    tcg_debug_assert(vece <= MO_64);
3323    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3324}
3325
3326/*
3327 * Similarly for arithmetic right shifts.
3328 */
3329
3330static void tcg_gen_sarv_mod_vec(unsigned vece, TCGv_vec d,
3331                                 TCGv_vec a, TCGv_vec b)
3332{
3333    TCGv_vec t = tcg_temp_new_vec_matching(d);
3334    TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3335
3336    tcg_gen_and_vec(vece, t, b, m);
3337    tcg_gen_sarv_vec(vece, d, a, t);
3338    tcg_temp_free_vec(t);
3339}
3340
3341static void tcg_gen_sar_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3342{
3343    TCGv_i32 t = tcg_temp_new_i32();
3344
3345    tcg_gen_andi_i32(t, b, 31);
3346    tcg_gen_sar_i32(d, a, t);
3347    tcg_temp_free_i32(t);
3348}
3349
3350static void tcg_gen_sar_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3351{
3352    TCGv_i64 t = tcg_temp_new_i64();
3353
3354    tcg_gen_andi_i64(t, b, 63);
3355    tcg_gen_sar_i64(d, a, t);
3356    tcg_temp_free_i64(t);
3357}
3358
3359void tcg_gen_gvec_sarv(unsigned vece, uint32_t dofs, uint32_t aofs,
3360                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3361{
3362    static const TCGOpcode vecop_list[] = { INDEX_op_sarv_vec, 0 };
3363    static const GVecGen3 g[4] = {
3364        { .fniv = tcg_gen_sarv_mod_vec,
3365          .fno = gen_helper_gvec_sar8v,
3366          .opt_opc = vecop_list,
3367          .vece = MO_8 },
3368        { .fniv = tcg_gen_sarv_mod_vec,
3369          .fno = gen_helper_gvec_sar16v,
3370          .opt_opc = vecop_list,
3371          .vece = MO_16 },
3372        { .fni4 = tcg_gen_sar_mod_i32,
3373          .fniv = tcg_gen_sarv_mod_vec,
3374          .fno = gen_helper_gvec_sar32v,
3375          .opt_opc = vecop_list,
3376          .vece = MO_32 },
3377        { .fni8 = tcg_gen_sar_mod_i64,
3378          .fniv = tcg_gen_sarv_mod_vec,
3379          .fno = gen_helper_gvec_sar64v,
3380          .opt_opc = vecop_list,
3381          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3382          .vece = MO_64 },
3383    };
3384
3385    tcg_debug_assert(vece <= MO_64);
3386    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3387}
3388
3389/*
3390 * Similarly for rotates.
3391 */
3392
3393static void tcg_gen_rotlv_mod_vec(unsigned vece, TCGv_vec d,
3394                                  TCGv_vec a, TCGv_vec b)
3395{
3396    TCGv_vec t = tcg_temp_new_vec_matching(d);
3397    TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3398
3399    tcg_gen_and_vec(vece, t, b, m);
3400    tcg_gen_rotlv_vec(vece, d, a, t);
3401    tcg_temp_free_vec(t);
3402}
3403
3404static void tcg_gen_rotl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3405{
3406    TCGv_i32 t = tcg_temp_new_i32();
3407
3408    tcg_gen_andi_i32(t, b, 31);
3409    tcg_gen_rotl_i32(d, a, t);
3410    tcg_temp_free_i32(t);
3411}
3412
3413static void tcg_gen_rotl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3414{
3415    TCGv_i64 t = tcg_temp_new_i64();
3416
3417    tcg_gen_andi_i64(t, b, 63);
3418    tcg_gen_rotl_i64(d, a, t);
3419    tcg_temp_free_i64(t);
3420}
3421
3422void tcg_gen_gvec_rotlv(unsigned vece, uint32_t dofs, uint32_t aofs,
3423                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3424{
3425    static const TCGOpcode vecop_list[] = { INDEX_op_rotlv_vec, 0 };
3426    static const GVecGen3 g[4] = {
3427        { .fniv = tcg_gen_rotlv_mod_vec,
3428          .fno = gen_helper_gvec_rotl8v,
3429          .opt_opc = vecop_list,
3430          .vece = MO_8 },
3431        { .fniv = tcg_gen_rotlv_mod_vec,
3432          .fno = gen_helper_gvec_rotl16v,
3433          .opt_opc = vecop_list,
3434          .vece = MO_16 },
3435        { .fni4 = tcg_gen_rotl_mod_i32,
3436          .fniv = tcg_gen_rotlv_mod_vec,
3437          .fno = gen_helper_gvec_rotl32v,
3438          .opt_opc = vecop_list,
3439          .vece = MO_32 },
3440        { .fni8 = tcg_gen_rotl_mod_i64,
3441          .fniv = tcg_gen_rotlv_mod_vec,
3442          .fno = gen_helper_gvec_rotl64v,
3443          .opt_opc = vecop_list,
3444          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3445          .vece = MO_64 },
3446    };
3447
3448    tcg_debug_assert(vece <= MO_64);
3449    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3450}
3451
3452static void tcg_gen_rotrv_mod_vec(unsigned vece, TCGv_vec d,
3453                                  TCGv_vec a, TCGv_vec b)
3454{
3455    TCGv_vec t = tcg_temp_new_vec_matching(d);
3456    TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3457
3458    tcg_gen_and_vec(vece, t, b, m);
3459    tcg_gen_rotrv_vec(vece, d, a, t);
3460    tcg_temp_free_vec(t);
3461}
3462
3463static void tcg_gen_rotr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3464{
3465    TCGv_i32 t = tcg_temp_new_i32();
3466
3467    tcg_gen_andi_i32(t, b, 31);
3468    tcg_gen_rotr_i32(d, a, t);
3469    tcg_temp_free_i32(t);
3470}
3471
3472static void tcg_gen_rotr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3473{
3474    TCGv_i64 t = tcg_temp_new_i64();
3475
3476    tcg_gen_andi_i64(t, b, 63);
3477    tcg_gen_rotr_i64(d, a, t);
3478    tcg_temp_free_i64(t);
3479}
3480
3481void tcg_gen_gvec_rotrv(unsigned vece, uint32_t dofs, uint32_t aofs,
3482                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3483{
3484    static const TCGOpcode vecop_list[] = { INDEX_op_rotrv_vec, 0 };
3485    static const GVecGen3 g[4] = {
3486        { .fniv = tcg_gen_rotrv_mod_vec,
3487          .fno = gen_helper_gvec_rotr8v,
3488          .opt_opc = vecop_list,
3489          .vece = MO_8 },
3490        { .fniv = tcg_gen_rotrv_mod_vec,
3491          .fno = gen_helper_gvec_rotr16v,
3492          .opt_opc = vecop_list,
3493          .vece = MO_16 },
3494        { .fni4 = tcg_gen_rotr_mod_i32,
3495          .fniv = tcg_gen_rotrv_mod_vec,
3496          .fno = gen_helper_gvec_rotr32v,
3497          .opt_opc = vecop_list,
3498          .vece = MO_32 },
3499        { .fni8 = tcg_gen_rotr_mod_i64,
3500          .fniv = tcg_gen_rotrv_mod_vec,
3501          .fno = gen_helper_gvec_rotr64v,
3502          .opt_opc = vecop_list,
3503          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3504          .vece = MO_64 },
3505    };
3506
3507    tcg_debug_assert(vece <= MO_64);
3508    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3509}
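
/*
 * As with the shifts, per-element rotate counts are reduced modulo the
 * element width, so rotating a MO_8 element by 9 is the same as rotating
 * it by 1.  An illustrative call, with hypothetical cpu_env offsets:
 *
 *     tcg_gen_gvec_rotrv(MO_8, d_ofs, a_ofs, b_ofs, 16, 16);
 */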
3510
3511/* Expand OPRSZ bytes worth of three-operand comparisons using i32 elements.  */
3512static void expand_cmp_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
3513                           uint32_t oprsz, TCGCond cond)
3514{
3515    TCGv_i32 t0 = tcg_temp_new_i32();
3516    TCGv_i32 t1 = tcg_temp_new_i32();
3517    uint32_t i;
3518
3519    for (i = 0; i < oprsz; i += 4) {
3520        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
3521        tcg_gen_ld_i32(t1, cpu_env, bofs + i);
3522        tcg_gen_setcond_i32(cond, t0, t0, t1);
3523        tcg_gen_neg_i32(t0, t0);
3524        tcg_gen_st_i32(t0, cpu_env, dofs + i);
3525    }
3526    tcg_temp_free_i32(t1);
3527    tcg_temp_free_i32(t0);
3528}
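
/*
 * In expand_cmp_i32 above and expand_cmp_i64 below, setcond yields 0 or 1
 * per element; the negation turns that into 0 or -1, i.e. an all-zeros or
 * all-ones element, matching the mask produced by tcg_gen_cmp_vec in the
 * vector expansion.
 */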
3529
3530static void expand_cmp_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
3531                           uint32_t oprsz, TCGCond cond)
3532{
3533    TCGv_i64 t0 = tcg_temp_new_i64();
3534    TCGv_i64 t1 = tcg_temp_new_i64();
3535    uint32_t i;
3536
3537    for (i = 0; i < oprsz; i += 8) {
3538        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
3539        tcg_gen_ld_i64(t1, cpu_env, bofs + i);
3540        tcg_gen_setcond_i64(cond, t0, t0, t1);
3541        tcg_gen_neg_i64(t0, t0);
3542        tcg_gen_st_i64(t0, cpu_env, dofs + i);
3543    }
3544    tcg_temp_free_i64(t1);
3545    tcg_temp_free_i64(t0);
3546}
3547
3548static void expand_cmp_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
3549                           uint32_t bofs, uint32_t oprsz, uint32_t tysz,
3550                           TCGType type, TCGCond cond)
3551{
3552    TCGv_vec t0 = tcg_temp_new_vec(type);
3553    TCGv_vec t1 = tcg_temp_new_vec(type);
3554    uint32_t i;
3555
3556    for (i = 0; i < oprsz; i += tysz) {
3557        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
3558        tcg_gen_ld_vec(t1, cpu_env, bofs + i);
3559        tcg_gen_cmp_vec(cond, vece, t0, t0, t1);
3560        tcg_gen_st_vec(t0, cpu_env, dofs + i);
3561    }
3562    tcg_temp_free_vec(t1);
3563    tcg_temp_free_vec(t0);
3564}
3565
3566void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs,
3567                      uint32_t aofs, uint32_t bofs,
3568                      uint32_t oprsz, uint32_t maxsz)
3569{
3570    static const TCGOpcode cmp_list[] = { INDEX_op_cmp_vec, 0 };
3571    static gen_helper_gvec_3 * const eq_fn[4] = {
3572        gen_helper_gvec_eq8, gen_helper_gvec_eq16,
3573        gen_helper_gvec_eq32, gen_helper_gvec_eq64
3574    };
3575    static gen_helper_gvec_3 * const ne_fn[4] = {
3576        gen_helper_gvec_ne8, gen_helper_gvec_ne16,
3577        gen_helper_gvec_ne32, gen_helper_gvec_ne64
3578    };
3579    static gen_helper_gvec_3 * const lt_fn[4] = {
3580        gen_helper_gvec_lt8, gen_helper_gvec_lt16,
3581        gen_helper_gvec_lt32, gen_helper_gvec_lt64
3582    };
3583    static gen_helper_gvec_3 * const le_fn[4] = {
3584        gen_helper_gvec_le8, gen_helper_gvec_le16,
3585        gen_helper_gvec_le32, gen_helper_gvec_le64
3586    };
3587    static gen_helper_gvec_3 * const ltu_fn[4] = {
3588        gen_helper_gvec_ltu8, gen_helper_gvec_ltu16,
3589        gen_helper_gvec_ltu32, gen_helper_gvec_ltu64
3590    };
3591    static gen_helper_gvec_3 * const leu_fn[4] = {
3592        gen_helper_gvec_leu8, gen_helper_gvec_leu16,
3593        gen_helper_gvec_leu32, gen_helper_gvec_leu64
3594    };
3595    static gen_helper_gvec_3 * const * const fns[16] = {
3596        [TCG_COND_EQ] = eq_fn,
3597        [TCG_COND_NE] = ne_fn,
3598        [TCG_COND_LT] = lt_fn,
3599        [TCG_COND_LE] = le_fn,
3600        [TCG_COND_LTU] = ltu_fn,
3601        [TCG_COND_LEU] = leu_fn,
3602    };
3603
3604    const TCGOpcode *hold_list;
3605    TCGType type;
3606    uint32_t some;
3607
3608    check_size_align(oprsz, maxsz, dofs | aofs | bofs);
3609    check_overlap_3(dofs, aofs, bofs, maxsz);
3610
3611    if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) {
3612        do_dup(MO_8, dofs, oprsz, maxsz,
3613               NULL, NULL, -(cond == TCG_COND_ALWAYS));
3614        return;
3615    }
3616
3617    /*
3618     * Implement inline with a vector type, if possible.
3619     * Prefer integer when 64-bit host and 64-bit comparison.
3620     */
3621    hold_list = tcg_swap_vecop_list(cmp_list);
3622    type = choose_vector_type(cmp_list, vece, oprsz,
3623                              TCG_TARGET_REG_BITS == 64 && vece == MO_64);
3624    switch (type) {
3625    case TCG_TYPE_V256:
3626        /* Recall that ARM SVE allows vector sizes that are not a
3627         * power of 2, but always a multiple of 16.  The intent is
3628         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
3629         */
3630        some = QEMU_ALIGN_DOWN(oprsz, 32);
3631        expand_cmp_vec(vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, cond);
3632        if (some == oprsz) {
3633            break;
3634        }
3635        dofs += some;
3636        aofs += some;
3637        bofs += some;
3638        oprsz -= some;
3639        maxsz -= some;
3640        /* fallthru */
3641    case TCG_TYPE_V128:
3642        expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, cond);
3643        break;
3644    case TCG_TYPE_V64:
3645        expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, cond);
3646        break;
3647
3648    case 0:
3649        if (vece == MO_64 && check_size_impl(oprsz, 8)) {
3650            expand_cmp_i64(dofs, aofs, bofs, oprsz, cond);
3651        } else if (vece == MO_32 && check_size_impl(oprsz, 4)) {
3652            expand_cmp_i32(dofs, aofs, bofs, oprsz, cond);
3653        } else {
3654            gen_helper_gvec_3 * const *fn = fns[cond];
3655
3656            if (fn == NULL) {
3657                uint32_t tmp;
3658                tmp = aofs, aofs = bofs, bofs = tmp;
3659                cond = tcg_swap_cond(cond);
3660                fn = fns[cond];
3661                assert(fn != NULL);
3662            }
3663            tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, 0, fn[vece]);
3664            oprsz = maxsz;
3665        }
3666        break;
3667
3668    default:
3669        g_assert_not_reached();
3670    }
3671    tcg_swap_vecop_list(hold_list);
3672
3673    if (oprsz < maxsz) {
3674        expand_clr(dofs + oprsz, maxsz - oprsz);
3675    }
3676}
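
/*
 * An illustrative call, assuming hypothetical offsets into cpu_env:
 * compare the 16-bit elements of two 128-bit registers for unsigned
 * less-than, writing an all-ones/all-zeros mask per element:
 *
 *     tcg_gen_gvec_cmp(TCG_COND_LTU, MO_16, d_ofs, a_ofs, b_ofs, 16, 16);
 */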
3677
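/*
 * The sequence below computes d = (b & a) | (c & ~a): each result bit is
 * taken from b where the corresponding bit of a is set, and from c where
 * it is clear.  For example, a = 0xF0, b = 0xAB, c = 0xCD gives d = 0xAD.
 */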
3678static void tcg_gen_bitsel_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 c)
3679{
3680    TCGv_i64 t = tcg_temp_new_i64();
3681
3682    tcg_gen_and_i64(t, b, a);
3683    tcg_gen_andc_i64(d, c, a);
3684    tcg_gen_or_i64(d, d, t);
3685    tcg_temp_free_i64(t);
3686}
3687
3688void tcg_gen_gvec_bitsel(unsigned vece, uint32_t dofs, uint32_t aofs,
3689                         uint32_t bofs, uint32_t cofs,
3690                         uint32_t oprsz, uint32_t maxsz)
3691{
3692    static const GVecGen4 g = {
3693        .fni8 = tcg_gen_bitsel_i64,
3694        .fniv = tcg_gen_bitsel_vec,
3695        .fno = gen_helper_gvec_bitsel,
3696    };
3697
3698    tcg_gen_gvec_4(dofs, aofs, bofs, cofs, oprsz, maxsz, &g);
3699}
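
/*
 * An illustrative call, with hypothetical cpu_env offsets: select bits
 * from B where the mask vector A is set and from C where it is clear,
 * across a 128-bit register:
 *
 *     tcg_gen_gvec_bitsel(MO_64, d_ofs, a_ofs, b_ofs, c_ofs, 16, 16);
 */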
3700