qemu/tcg/tcg-op-gvec.c
   1/*
   2 * Generic vector operation expansion
   3 *
   4 * Copyright (c) 2018 Linaro
   5 *
   6 * This library is free software; you can redistribute it and/or
   7 * modify it under the terms of the GNU Lesser General Public
   8 * License as published by the Free Software Foundation; either
   9 * version 2.1 of the License, or (at your option) any later version.
  10 *
  11 * This library is distributed in the hope that it will be useful,
  12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14 * Lesser General Public License for more details.
  15 *
  16 * You should have received a copy of the GNU Lesser General Public
  17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
  18 */
  19
  20#include "qemu/osdep.h"
  21#include "tcg/tcg.h"
  22#include "tcg/tcg-op.h"
  23#include "tcg/tcg-op-gvec.h"
  24#include "qemu/main-loop.h"
  25#include "tcg/tcg-gvec-desc.h"
  26
  27#define MAX_UNROLL  4
  28
  29#ifdef CONFIG_DEBUG_TCG
  30static const TCGOpcode vecop_list_empty[1] = { 0 };
  31#else
  32#define vecop_list_empty NULL
  33#endif
  34
  35
  36/* Verify vector size and alignment rules.  OFS should be the OR of all
  37   of the operand offsets so that we can check them all at once.  */
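/* (If any single offset were misaligned, the OR would carry the same low
   bit, so the one mask test below covers every operand at once.)  */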
  38static void check_size_align(uint32_t oprsz, uint32_t maxsz, uint32_t ofs)
  39{
  40    uint32_t opr_align = oprsz >= 16 ? 15 : 7;
  41    uint32_t max_align = maxsz >= 16 || oprsz >= 16 ? 15 : 7;
  42    tcg_debug_assert(oprsz > 0);
  43    tcg_debug_assert(oprsz <= maxsz);
  44    tcg_debug_assert((oprsz & opr_align) == 0);
  45    tcg_debug_assert((maxsz & max_align) == 0);
  46    tcg_debug_assert((ofs & max_align) == 0);
  47}
  48
  49/* Verify vector overlap rules for two operands.  */
  50static void check_overlap_2(uint32_t d, uint32_t a, uint32_t s)
  51{
  52    tcg_debug_assert(d == a || d + s <= a || a + s <= d);
  53}
  54
  55/* Verify vector overlap rules for three operands.  */
  56static void check_overlap_3(uint32_t d, uint32_t a, uint32_t b, uint32_t s)
  57{
  58    check_overlap_2(d, a, s);
  59    check_overlap_2(d, b, s);
  60    check_overlap_2(a, b, s);
  61}
  62
  63/* Verify vector overlap rules for four operands.  */
  64static void check_overlap_4(uint32_t d, uint32_t a, uint32_t b,
  65                            uint32_t c, uint32_t s)
  66{
  67    check_overlap_2(d, a, s);
  68    check_overlap_2(d, b, s);
  69    check_overlap_2(d, c, s);
  70    check_overlap_2(a, b, s);
  71    check_overlap_2(a, c, s);
  72    check_overlap_2(b, c, s);
  73}
  74
  75/* Create a descriptor from components.  */
  76uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data)
  77{
  78    uint32_t desc = 0;
  79
  80    assert(oprsz % 8 == 0 && oprsz <= (8 << SIMD_OPRSZ_BITS));
  81    assert(maxsz % 8 == 0 && maxsz <= (8 << SIMD_MAXSZ_BITS));
  82    assert(data == sextract32(data, 0, SIMD_DATA_BITS));
  83
  84    oprsz = (oprsz / 8) - 1;
  85    maxsz = (maxsz / 8) - 1;
  86    desc = deposit32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS, oprsz);
  87    desc = deposit32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS, maxsz);
  88    desc = deposit32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS, data);
  89
  90    return desc;
  91}
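
/*
 * For example, simd_desc(16, 32, 3) stores (16 / 8) - 1 = 1 in the OPRSZ
 * field, (32 / 8) - 1 = 3 in the MAXSZ field and 3 in the DATA field;
 * a helper recovers the original values via simd_oprsz(), simd_maxsz()
 * and simd_data() from tcg-gvec-desc.h.
 */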
  92
  93/* Generate a call to a gvec-style helper with two vector operands.  */
  94void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
  95                        uint32_t oprsz, uint32_t maxsz, int32_t data,
  96                        gen_helper_gvec_2 *fn)
  97{
  98    TCGv_ptr a0, a1;
  99    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
 100
 101    a0 = tcg_temp_new_ptr();
 102    a1 = tcg_temp_new_ptr();
 103
 104    tcg_gen_addi_ptr(a0, cpu_env, dofs);
 105    tcg_gen_addi_ptr(a1, cpu_env, aofs);
 106
 107    fn(a0, a1, desc);
 108
 109    tcg_temp_free_ptr(a0);
 110    tcg_temp_free_ptr(a1);
 111    tcg_temp_free_i32(desc);
 112}
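
/*
 * As an illustration only: a front end passes env-relative byte offsets
 * of its guest vector registers, along the lines of
 *
 *     tcg_gen_gvec_2_ool(offsetof(CPUFooState, vreg[rd]),
 *                        offsetof(CPUFooState, vreg[rn]),
 *                        16, 16, 0, gen_helper_foo);
 *
 * where CPUFooState, vreg and gen_helper_foo are stand-ins for the
 * target's own state layout and out-of-line helper.
 */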
 113
 114/* Generate a call to a gvec-style helper with two vector operands
 115   and one scalar operand.  */
 116void tcg_gen_gvec_2i_ool(uint32_t dofs, uint32_t aofs, TCGv_i64 c,
 117                         uint32_t oprsz, uint32_t maxsz, int32_t data,
 118                         gen_helper_gvec_2i *fn)
 119{
 120    TCGv_ptr a0, a1;
 121    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
 122
 123    a0 = tcg_temp_new_ptr();
 124    a1 = tcg_temp_new_ptr();
 125
 126    tcg_gen_addi_ptr(a0, cpu_env, dofs);
 127    tcg_gen_addi_ptr(a1, cpu_env, aofs);
 128
 129    fn(a0, a1, c, desc);
 130
 131    tcg_temp_free_ptr(a0);
 132    tcg_temp_free_ptr(a1);
 133    tcg_temp_free_i32(desc);
 134}
 135
 136/* Generate a call to a gvec-style helper with three vector operands.  */
 137void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 138                        uint32_t oprsz, uint32_t maxsz, int32_t data,
 139                        gen_helper_gvec_3 *fn)
 140{
 141    TCGv_ptr a0, a1, a2;
 142    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
 143
 144    a0 = tcg_temp_new_ptr();
 145    a1 = tcg_temp_new_ptr();
 146    a2 = tcg_temp_new_ptr();
 147
 148    tcg_gen_addi_ptr(a0, cpu_env, dofs);
 149    tcg_gen_addi_ptr(a1, cpu_env, aofs);
 150    tcg_gen_addi_ptr(a2, cpu_env, bofs);
 151
 152    fn(a0, a1, a2, desc);
 153
 154    tcg_temp_free_ptr(a0);
 155    tcg_temp_free_ptr(a1);
 156    tcg_temp_free_ptr(a2);
 157    tcg_temp_free_i32(desc);
 158}
 159
 160/* Generate a call to a gvec-style helper with four vector operands.  */
 161void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 162                        uint32_t cofs, uint32_t oprsz, uint32_t maxsz,
 163                        int32_t data, gen_helper_gvec_4 *fn)
 164{
 165    TCGv_ptr a0, a1, a2, a3;
 166    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
 167
 168    a0 = tcg_temp_new_ptr();
 169    a1 = tcg_temp_new_ptr();
 170    a2 = tcg_temp_new_ptr();
 171    a3 = tcg_temp_new_ptr();
 172
 173    tcg_gen_addi_ptr(a0, cpu_env, dofs);
 174    tcg_gen_addi_ptr(a1, cpu_env, aofs);
 175    tcg_gen_addi_ptr(a2, cpu_env, bofs);
 176    tcg_gen_addi_ptr(a3, cpu_env, cofs);
 177
 178    fn(a0, a1, a2, a3, desc);
 179
 180    tcg_temp_free_ptr(a0);
 181    tcg_temp_free_ptr(a1);
 182    tcg_temp_free_ptr(a2);
 183    tcg_temp_free_ptr(a3);
 184    tcg_temp_free_i32(desc);
 185}
 186
 187/* Generate a call to a gvec-style helper with five vector operands.  */
 188void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 189                        uint32_t cofs, uint32_t xofs, uint32_t oprsz,
 190                        uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn)
 191{
 192    TCGv_ptr a0, a1, a2, a3, a4;
 193    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
 194
 195    a0 = tcg_temp_new_ptr();
 196    a1 = tcg_temp_new_ptr();
 197    a2 = tcg_temp_new_ptr();
 198    a3 = tcg_temp_new_ptr();
 199    a4 = tcg_temp_new_ptr();
 200
 201    tcg_gen_addi_ptr(a0, cpu_env, dofs);
 202    tcg_gen_addi_ptr(a1, cpu_env, aofs);
 203    tcg_gen_addi_ptr(a2, cpu_env, bofs);
 204    tcg_gen_addi_ptr(a3, cpu_env, cofs);
 205    tcg_gen_addi_ptr(a4, cpu_env, xofs);
 206
 207    fn(a0, a1, a2, a3, a4, desc);
 208
 209    tcg_temp_free_ptr(a0);
 210    tcg_temp_free_ptr(a1);
 211    tcg_temp_free_ptr(a2);
 212    tcg_temp_free_ptr(a3);
 213    tcg_temp_free_ptr(a4);
 214    tcg_temp_free_i32(desc);
 215}
 216
  217/* Generate a call to a gvec-style helper with two vector operands
 218   and an extra pointer operand.  */
 219void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
 220                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
 221                        int32_t data, gen_helper_gvec_2_ptr *fn)
 222{
 223    TCGv_ptr a0, a1;
 224    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
 225
 226    a0 = tcg_temp_new_ptr();
 227    a1 = tcg_temp_new_ptr();
 228
 229    tcg_gen_addi_ptr(a0, cpu_env, dofs);
 230    tcg_gen_addi_ptr(a1, cpu_env, aofs);
 231
 232    fn(a0, a1, ptr, desc);
 233
 234    tcg_temp_free_ptr(a0);
 235    tcg_temp_free_ptr(a1);
 236    tcg_temp_free_i32(desc);
 237}
 238
 239/* Generate a call to a gvec-style helper with three vector operands
 240   and an extra pointer operand.  */
 241void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 242                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
 243                        int32_t data, gen_helper_gvec_3_ptr *fn)
 244{
 245    TCGv_ptr a0, a1, a2;
 246    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
 247
 248    a0 = tcg_temp_new_ptr();
 249    a1 = tcg_temp_new_ptr();
 250    a2 = tcg_temp_new_ptr();
 251
 252    tcg_gen_addi_ptr(a0, cpu_env, dofs);
 253    tcg_gen_addi_ptr(a1, cpu_env, aofs);
 254    tcg_gen_addi_ptr(a2, cpu_env, bofs);
 255
 256    fn(a0, a1, a2, ptr, desc);
 257
 258    tcg_temp_free_ptr(a0);
 259    tcg_temp_free_ptr(a1);
 260    tcg_temp_free_ptr(a2);
 261    tcg_temp_free_i32(desc);
 262}
 263
 264/* Generate a call to a gvec-style helper with four vector operands
 265   and an extra pointer operand.  */
 266void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 267                        uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz,
 268                        uint32_t maxsz, int32_t data,
 269                        gen_helper_gvec_4_ptr *fn)
 270{
 271    TCGv_ptr a0, a1, a2, a3;
 272    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
 273
 274    a0 = tcg_temp_new_ptr();
 275    a1 = tcg_temp_new_ptr();
 276    a2 = tcg_temp_new_ptr();
 277    a3 = tcg_temp_new_ptr();
 278
 279    tcg_gen_addi_ptr(a0, cpu_env, dofs);
 280    tcg_gen_addi_ptr(a1, cpu_env, aofs);
 281    tcg_gen_addi_ptr(a2, cpu_env, bofs);
 282    tcg_gen_addi_ptr(a3, cpu_env, cofs);
 283
 284    fn(a0, a1, a2, a3, ptr, desc);
 285
 286    tcg_temp_free_ptr(a0);
 287    tcg_temp_free_ptr(a1);
 288    tcg_temp_free_ptr(a2);
 289    tcg_temp_free_ptr(a3);
 290    tcg_temp_free_i32(desc);
 291}
 292
 293/* Generate a call to a gvec-style helper with five vector operands
 294   and an extra pointer operand.  */
 295void tcg_gen_gvec_5_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 296                        uint32_t cofs, uint32_t eofs, TCGv_ptr ptr,
 297                        uint32_t oprsz, uint32_t maxsz, int32_t data,
 298                        gen_helper_gvec_5_ptr *fn)
 299{
 300    TCGv_ptr a0, a1, a2, a3, a4;
 301    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
 302
 303    a0 = tcg_temp_new_ptr();
 304    a1 = tcg_temp_new_ptr();
 305    a2 = tcg_temp_new_ptr();
 306    a3 = tcg_temp_new_ptr();
 307    a4 = tcg_temp_new_ptr();
 308
 309    tcg_gen_addi_ptr(a0, cpu_env, dofs);
 310    tcg_gen_addi_ptr(a1, cpu_env, aofs);
 311    tcg_gen_addi_ptr(a2, cpu_env, bofs);
 312    tcg_gen_addi_ptr(a3, cpu_env, cofs);
 313    tcg_gen_addi_ptr(a4, cpu_env, eofs);
 314
 315    fn(a0, a1, a2, a3, a4, ptr, desc);
 316
 317    tcg_temp_free_ptr(a0);
 318    tcg_temp_free_ptr(a1);
 319    tcg_temp_free_ptr(a2);
 320    tcg_temp_free_ptr(a3);
 321    tcg_temp_free_ptr(a4);
 322    tcg_temp_free_i32(desc);
 323}
 324
 325/* Return true if we want to implement something of OPRSZ bytes
 326   in units of LNSZ.  This limits the expansion of inline code.  */
 327static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
 328{
 329    if (oprsz % lnsz == 0) {
 330        uint32_t lnct = oprsz / lnsz;
 331        return lnct >= 1 && lnct <= MAX_UNROLL;
 332    }
 333    return false;
 334}
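
/*
 * For example, with LNSZ == 8 an OPRSZ of 32 bytes (four iterations) is
 * expanded inline, while 64 bytes would need eight iterations, which
 * exceeds MAX_UNROLL, so the caller typically falls back to an
 * out-of-line helper instead.
 */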
 335
 336static void expand_clr(uint32_t dofs, uint32_t maxsz);
 337
 338/* Duplicate C as per VECE.  */
 339uint64_t (dup_const)(unsigned vece, uint64_t c)
 340{
 341    switch (vece) {
 342    case MO_8:
 343        return 0x0101010101010101ull * (uint8_t)c;
 344    case MO_16:
 345        return 0x0001000100010001ull * (uint16_t)c;
 346    case MO_32:
 347        return 0x0000000100000001ull * (uint32_t)c;
 348    case MO_64:
 349        return c;
 350    default:
 351        g_assert_not_reached();
 352    }
 353}
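
/*
 * For example, dup_const(MO_16, 0x1234) yields 0x1234123412341234ull,
 * and dup_const(MO_8, 0xff) yields all-ones.
 */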
 354
 355/* Duplicate IN into OUT as per VECE.  */
 356static void gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in)
 357{
 358    switch (vece) {
 359    case MO_8:
 360        tcg_gen_ext8u_i32(out, in);
 361        tcg_gen_muli_i32(out, out, 0x01010101);
 362        break;
 363    case MO_16:
 364        tcg_gen_deposit_i32(out, in, in, 16, 16);
 365        break;
 366    case MO_32:
 367        tcg_gen_mov_i32(out, in);
 368        break;
 369    default:
 370        g_assert_not_reached();
 371    }
 372}
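
/*
 * For example, gen_dup_i32(MO_8, out, in) turns an input of 0x000000ab
 * into 0xabababab via the zero-extend and multiply above.
 */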
 373
 374static void gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in)
 375{
 376    switch (vece) {
 377    case MO_8:
 378        tcg_gen_ext8u_i64(out, in);
 379        tcg_gen_muli_i64(out, out, 0x0101010101010101ull);
 380        break;
 381    case MO_16:
 382        tcg_gen_ext16u_i64(out, in);
 383        tcg_gen_muli_i64(out, out, 0x0001000100010001ull);
 384        break;
 385    case MO_32:
 386        tcg_gen_deposit_i64(out, in, in, 32, 32);
 387        break;
 388    case MO_64:
 389        tcg_gen_mov_i64(out, in);
 390        break;
 391    default:
 392        g_assert_not_reached();
 393    }
 394}
 395
 396/* Select a supported vector type for implementing an operation on SIZE
  397 * bytes.  If LIST is NULL, assume that the real operation to be performed
  398 * is required by all backends.  Otherwise, make sure that the opcodes in LIST
  399 * can be performed on elements of size VECE in the selected type.  Do not
  400 * select V64 if PREFER_I64 is true.  Return 0 if no vector type is selected.
  401 */
 402static TCGType choose_vector_type(const TCGOpcode *list, unsigned vece,
 403                                  uint32_t size, bool prefer_i64)
 404{
 405    if (TCG_TARGET_HAS_v256 && check_size_impl(size, 32)) {
 406        /*
 407         * Recall that ARM SVE allows vector sizes that are not a
 408         * power of 2, but always a multiple of 16.  The intent is
 409         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
 410         * It is hard to imagine a case in which v256 is supported
 411         * but v128 is not, but check anyway.
 412         */
 413        if (tcg_can_emit_vecop_list(list, TCG_TYPE_V256, vece)
 414            && (size % 32 == 0
 415                || tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece))) {
 416            return TCG_TYPE_V256;
 417        }
 418    }
 419    if (TCG_TARGET_HAS_v128 && check_size_impl(size, 16)
 420        && tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece)) {
 421        return TCG_TYPE_V128;
 422    }
 423    if (TCG_TARGET_HAS_v64 && !prefer_i64 && check_size_impl(size, 8)
 424        && tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)) {
 425        return TCG_TYPE_V64;
 426    }
 427    return 0;
 428}
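
/*
 * For example, assuming the host can emit the required opcodes for each
 * type: oprsz == 64 selects V256 (two passes), oprsz == 48 is not a
 * multiple of 32 and falls back to V128 (three passes), and oprsz == 24
 * can only be handled with V64 (three passes).
 */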
 429
 430static void do_dup_store(TCGType type, uint32_t dofs, uint32_t oprsz,
 431                         uint32_t maxsz, TCGv_vec t_vec)
 432{
 433    uint32_t i = 0;
 434
 435    switch (type) {
 436    case TCG_TYPE_V256:
 437        /*
 438         * Recall that ARM SVE allows vector sizes that are not a
 439         * power of 2, but always a multiple of 16.  The intent is
 440         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
 441         */
 442        for (; i + 32 <= oprsz; i += 32) {
 443            tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V256);
 444        }
 445        /* fallthru */
 446    case TCG_TYPE_V128:
 447        for (; i + 16 <= oprsz; i += 16) {
 448            tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V128);
 449        }
 450        break;
 451    case TCG_TYPE_V64:
 452        for (; i < oprsz; i += 8) {
 453            tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
 454        }
 455        break;
 456    default:
 457        g_assert_not_reached();
 458    }
 459
 460    if (oprsz < maxsz) {
 461        expand_clr(dofs + oprsz, maxsz - oprsz);
 462    }
 463}
 464
 465/* Set OPRSZ bytes at DOFS to replications of IN_32, IN_64 or IN_C.
 466 * Only one of IN_32 or IN_64 may be set;
 467 * IN_C is used if IN_32 and IN_64 are unset.
 468 */
 469static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz,
 470                   uint32_t maxsz, TCGv_i32 in_32, TCGv_i64 in_64,
 471                   uint64_t in_c)
 472{
 473    TCGType type;
 474    TCGv_i64 t_64;
 475    TCGv_i32 t_32, t_desc;
 476    TCGv_ptr t_ptr;
 477    uint32_t i;
 478
 479    assert(vece <= (in_32 ? MO_32 : MO_64));
 480    assert(in_32 == NULL || in_64 == NULL);
 481
 482    /* If we're storing 0, expand oprsz to maxsz.  */
 483    if (in_32 == NULL && in_64 == NULL) {
 484        in_c = dup_const(vece, in_c);
 485        if (in_c == 0) {
 486            oprsz = maxsz;
 487        }
 488    }
 489
 490    /* Implement inline with a vector type, if possible.
 491     * Prefer integer when 64-bit host and no variable dup.
 492     */
 493    type = choose_vector_type(NULL, vece, oprsz,
 494                              (TCG_TARGET_REG_BITS == 64 && in_32 == NULL
 495                               && (in_64 == NULL || vece == MO_64)));
 496    if (type != 0) {
 497        TCGv_vec t_vec = tcg_temp_new_vec(type);
 498
 499        if (in_32) {
 500            tcg_gen_dup_i32_vec(vece, t_vec, in_32);
 501        } else if (in_64) {
 502            tcg_gen_dup_i64_vec(vece, t_vec, in_64);
 503        } else {
 504            tcg_gen_dupi_vec(vece, t_vec, in_c);
 505        }
 506        do_dup_store(type, dofs, oprsz, maxsz, t_vec);
 507        tcg_temp_free_vec(t_vec);
 508        return;
 509    }
 510
 511    /* Otherwise, inline with an integer type, unless "large".  */
 512    if (check_size_impl(oprsz, TCG_TARGET_REG_BITS / 8)) {
 513        t_64 = NULL;
 514        t_32 = NULL;
 515
 516        if (in_32) {
 517            /* We are given a 32-bit variable input.  For a 64-bit host,
 518               use a 64-bit operation unless the 32-bit operation would
 519               be simple enough.  */
 520            if (TCG_TARGET_REG_BITS == 64
 521                && (vece != MO_32 || !check_size_impl(oprsz, 4))) {
 522                t_64 = tcg_temp_new_i64();
 523                tcg_gen_extu_i32_i64(t_64, in_32);
 524                gen_dup_i64(vece, t_64, t_64);
 525            } else {
 526                t_32 = tcg_temp_new_i32();
 527                gen_dup_i32(vece, t_32, in_32);
 528            }
 529        } else if (in_64) {
 530            /* We are given a 64-bit variable input.  */
 531            t_64 = tcg_temp_new_i64();
 532            gen_dup_i64(vece, t_64, in_64);
 533        } else {
 534            /* We are given a constant input.  */
 535            /* For 64-bit hosts, use 64-bit constants for "simple" constants
 536               or when we'd need too many 32-bit stores, or when a 64-bit
 537               constant is really required.  */
 538            if (vece == MO_64
 539                || (TCG_TARGET_REG_BITS == 64
 540                    && (in_c == 0 || in_c == -1
 541                        || !check_size_impl(oprsz, 4)))) {
 542                t_64 = tcg_const_i64(in_c);
 543            } else {
 544                t_32 = tcg_const_i32(in_c);
 545            }
 546        }
 547
 548        /* Implement inline if we picked an implementation size above.  */
 549        if (t_32) {
 550            for (i = 0; i < oprsz; i += 4) {
 551                tcg_gen_st_i32(t_32, cpu_env, dofs + i);
 552            }
 553            tcg_temp_free_i32(t_32);
 554            goto done;
 555        }
 556        if (t_64) {
 557            for (i = 0; i < oprsz; i += 8) {
 558                tcg_gen_st_i64(t_64, cpu_env, dofs + i);
 559            }
 560            tcg_temp_free_i64(t_64);
 561            goto done;
 562        }
 563    }
 564
 565    /* Otherwise implement out of line.  */
 566    t_ptr = tcg_temp_new_ptr();
 567    tcg_gen_addi_ptr(t_ptr, cpu_env, dofs);
 568    t_desc = tcg_const_i32(simd_desc(oprsz, maxsz, 0));
 569
 570    if (vece == MO_64) {
 571        if (in_64) {
 572            gen_helper_gvec_dup64(t_ptr, t_desc, in_64);
 573        } else {
 574            t_64 = tcg_const_i64(in_c);
 575            gen_helper_gvec_dup64(t_ptr, t_desc, t_64);
 576            tcg_temp_free_i64(t_64);
 577        }
 578    } else {
 579        typedef void dup_fn(TCGv_ptr, TCGv_i32, TCGv_i32);
 580        static dup_fn * const fns[3] = {
 581            gen_helper_gvec_dup8,
 582            gen_helper_gvec_dup16,
 583            gen_helper_gvec_dup32
 584        };
 585
 586        if (in_32) {
 587            fns[vece](t_ptr, t_desc, in_32);
 588        } else {
 589            t_32 = tcg_temp_new_i32();
 590            if (in_64) {
 591                tcg_gen_extrl_i64_i32(t_32, in_64);
 592            } else if (vece == MO_8) {
 593                tcg_gen_movi_i32(t_32, in_c & 0xff);
 594            } else if (vece == MO_16) {
 595                tcg_gen_movi_i32(t_32, in_c & 0xffff);
 596            } else {
 597                tcg_gen_movi_i32(t_32, in_c);
 598            }
 599            fns[vece](t_ptr, t_desc, t_32);
 600            tcg_temp_free_i32(t_32);
 601        }
 602    }
 603
 604    tcg_temp_free_ptr(t_ptr);
 605    tcg_temp_free_i32(t_desc);
 606    return;
 607
 608 done:
 609    if (oprsz < maxsz) {
 610        expand_clr(dofs + oprsz, maxsz - oprsz);
 611    }
 612}
 613
 614/* Likewise, but with zero.  */
 615static void expand_clr(uint32_t dofs, uint32_t maxsz)
 616{
 617    do_dup(MO_8, dofs, maxsz, maxsz, NULL, NULL, 0);
 618}
 619
 620/* Expand OPSZ bytes worth of two-operand operations using i32 elements.  */
 621static void expand_2_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
 622                         void (*fni)(TCGv_i32, TCGv_i32))
 623{
 624    TCGv_i32 t0 = tcg_temp_new_i32();
 625    uint32_t i;
 626
 627    for (i = 0; i < oprsz; i += 4) {
 628        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
 629        fni(t0, t0);
 630        tcg_gen_st_i32(t0, cpu_env, dofs + i);
 631    }
 632    tcg_temp_free_i32(t0);
 633}
 634
 635static void expand_2i_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
 636                          int32_t c, bool load_dest,
 637                          void (*fni)(TCGv_i32, TCGv_i32, int32_t))
 638{
 639    TCGv_i32 t0 = tcg_temp_new_i32();
 640    TCGv_i32 t1 = tcg_temp_new_i32();
 641    uint32_t i;
 642
 643    for (i = 0; i < oprsz; i += 4) {
 644        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
 645        if (load_dest) {
 646            tcg_gen_ld_i32(t1, cpu_env, dofs + i);
 647        }
 648        fni(t1, t0, c);
 649        tcg_gen_st_i32(t1, cpu_env, dofs + i);
 650    }
 651    tcg_temp_free_i32(t0);
 652    tcg_temp_free_i32(t1);
 653}
 654
 655static void expand_2s_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
 656                          TCGv_i32 c, bool scalar_first,
 657                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
 658{
 659    TCGv_i32 t0 = tcg_temp_new_i32();
 660    TCGv_i32 t1 = tcg_temp_new_i32();
 661    uint32_t i;
 662
 663    for (i = 0; i < oprsz; i += 4) {
 664        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
 665        if (scalar_first) {
 666            fni(t1, c, t0);
 667        } else {
 668            fni(t1, t0, c);
 669        }
 670        tcg_gen_st_i32(t1, cpu_env, dofs + i);
 671    }
 672    tcg_temp_free_i32(t0);
 673    tcg_temp_free_i32(t1);
 674}
 675
 676/* Expand OPSZ bytes worth of three-operand operations using i32 elements.  */
 677static void expand_3_i32(uint32_t dofs, uint32_t aofs,
 678                         uint32_t bofs, uint32_t oprsz, bool load_dest,
 679                         void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
 680{
 681    TCGv_i32 t0 = tcg_temp_new_i32();
 682    TCGv_i32 t1 = tcg_temp_new_i32();
 683    TCGv_i32 t2 = tcg_temp_new_i32();
 684    uint32_t i;
 685
 686    for (i = 0; i < oprsz; i += 4) {
 687        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
 688        tcg_gen_ld_i32(t1, cpu_env, bofs + i);
 689        if (load_dest) {
 690            tcg_gen_ld_i32(t2, cpu_env, dofs + i);
 691        }
 692        fni(t2, t0, t1);
 693        tcg_gen_st_i32(t2, cpu_env, dofs + i);
 694    }
 695    tcg_temp_free_i32(t2);
 696    tcg_temp_free_i32(t1);
 697    tcg_temp_free_i32(t0);
 698}
 699
 700static void expand_3i_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 701                          uint32_t oprsz, int32_t c, bool load_dest,
 702                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, int32_t))
 703{
 704    TCGv_i32 t0 = tcg_temp_new_i32();
 705    TCGv_i32 t1 = tcg_temp_new_i32();
 706    TCGv_i32 t2 = tcg_temp_new_i32();
 707    uint32_t i;
 708
 709    for (i = 0; i < oprsz; i += 4) {
 710        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
 711        tcg_gen_ld_i32(t1, cpu_env, bofs + i);
 712        if (load_dest) {
 713            tcg_gen_ld_i32(t2, cpu_env, dofs + i);
 714        }
 715        fni(t2, t0, t1, c);
 716        tcg_gen_st_i32(t2, cpu_env, dofs + i);
 717    }
 718    tcg_temp_free_i32(t0);
 719    tcg_temp_free_i32(t1);
 720    tcg_temp_free_i32(t2);
 721}
 722
  723/* Expand OPSZ bytes worth of four-operand operations using i32 elements.  */
 724static void expand_4_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 725                         uint32_t cofs, uint32_t oprsz, bool write_aofs,
 726                         void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32))
 727{
 728    TCGv_i32 t0 = tcg_temp_new_i32();
 729    TCGv_i32 t1 = tcg_temp_new_i32();
 730    TCGv_i32 t2 = tcg_temp_new_i32();
 731    TCGv_i32 t3 = tcg_temp_new_i32();
 732    uint32_t i;
 733
 734    for (i = 0; i < oprsz; i += 4) {
 735        tcg_gen_ld_i32(t1, cpu_env, aofs + i);
 736        tcg_gen_ld_i32(t2, cpu_env, bofs + i);
 737        tcg_gen_ld_i32(t3, cpu_env, cofs + i);
 738        fni(t0, t1, t2, t3);
 739        tcg_gen_st_i32(t0, cpu_env, dofs + i);
 740        if (write_aofs) {
 741            tcg_gen_st_i32(t1, cpu_env, aofs + i);
 742        }
 743    }
 744    tcg_temp_free_i32(t3);
 745    tcg_temp_free_i32(t2);
 746    tcg_temp_free_i32(t1);
 747    tcg_temp_free_i32(t0);
 748}
 749
 750/* Expand OPSZ bytes worth of two-operand operations using i64 elements.  */
 751static void expand_2_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
 752                         void (*fni)(TCGv_i64, TCGv_i64))
 753{
 754    TCGv_i64 t0 = tcg_temp_new_i64();
 755    uint32_t i;
 756
 757    for (i = 0; i < oprsz; i += 8) {
 758        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
 759        fni(t0, t0);
 760        tcg_gen_st_i64(t0, cpu_env, dofs + i);
 761    }
 762    tcg_temp_free_i64(t0);
 763}
 764
 765static void expand_2i_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
 766                          int64_t c, bool load_dest,
 767                          void (*fni)(TCGv_i64, TCGv_i64, int64_t))
 768{
 769    TCGv_i64 t0 = tcg_temp_new_i64();
 770    TCGv_i64 t1 = tcg_temp_new_i64();
 771    uint32_t i;
 772
 773    for (i = 0; i < oprsz; i += 8) {
 774        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
 775        if (load_dest) {
 776            tcg_gen_ld_i64(t1, cpu_env, dofs + i);
 777        }
 778        fni(t1, t0, c);
 779        tcg_gen_st_i64(t1, cpu_env, dofs + i);
 780    }
 781    tcg_temp_free_i64(t0);
 782    tcg_temp_free_i64(t1);
 783}
 784
 785static void expand_2s_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
 786                          TCGv_i64 c, bool scalar_first,
 787                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
 788{
 789    TCGv_i64 t0 = tcg_temp_new_i64();
 790    TCGv_i64 t1 = tcg_temp_new_i64();
 791    uint32_t i;
 792
 793    for (i = 0; i < oprsz; i += 8) {
 794        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
 795        if (scalar_first) {
 796            fni(t1, c, t0);
 797        } else {
 798            fni(t1, t0, c);
 799        }
 800        tcg_gen_st_i64(t1, cpu_env, dofs + i);
 801    }
 802    tcg_temp_free_i64(t0);
 803    tcg_temp_free_i64(t1);
 804}
 805
 806/* Expand OPSZ bytes worth of three-operand operations using i64 elements.  */
 807static void expand_3_i64(uint32_t dofs, uint32_t aofs,
 808                         uint32_t bofs, uint32_t oprsz, bool load_dest,
 809                         void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
 810{
 811    TCGv_i64 t0 = tcg_temp_new_i64();
 812    TCGv_i64 t1 = tcg_temp_new_i64();
 813    TCGv_i64 t2 = tcg_temp_new_i64();
 814    uint32_t i;
 815
 816    for (i = 0; i < oprsz; i += 8) {
 817        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
 818        tcg_gen_ld_i64(t1, cpu_env, bofs + i);
 819        if (load_dest) {
 820            tcg_gen_ld_i64(t2, cpu_env, dofs + i);
 821        }
 822        fni(t2, t0, t1);
 823        tcg_gen_st_i64(t2, cpu_env, dofs + i);
 824    }
 825    tcg_temp_free_i64(t2);
 826    tcg_temp_free_i64(t1);
 827    tcg_temp_free_i64(t0);
 828}
 829
 830static void expand_3i_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 831                          uint32_t oprsz, int64_t c, bool load_dest,
 832                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, int64_t))
 833{
 834    TCGv_i64 t0 = tcg_temp_new_i64();
 835    TCGv_i64 t1 = tcg_temp_new_i64();
 836    TCGv_i64 t2 = tcg_temp_new_i64();
 837    uint32_t i;
 838
 839    for (i = 0; i < oprsz; i += 8) {
 840        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
 841        tcg_gen_ld_i64(t1, cpu_env, bofs + i);
 842        if (load_dest) {
 843            tcg_gen_ld_i64(t2, cpu_env, dofs + i);
 844        }
 845        fni(t2, t0, t1, c);
 846        tcg_gen_st_i64(t2, cpu_env, dofs + i);
 847    }
 848    tcg_temp_free_i64(t0);
 849    tcg_temp_free_i64(t1);
 850    tcg_temp_free_i64(t2);
 851}
 852
  853/* Expand OPSZ bytes worth of four-operand operations using i64 elements.  */
 854static void expand_4_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 855                         uint32_t cofs, uint32_t oprsz, bool write_aofs,
 856                         void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64))
 857{
 858    TCGv_i64 t0 = tcg_temp_new_i64();
 859    TCGv_i64 t1 = tcg_temp_new_i64();
 860    TCGv_i64 t2 = tcg_temp_new_i64();
 861    TCGv_i64 t3 = tcg_temp_new_i64();
 862    uint32_t i;
 863
 864    for (i = 0; i < oprsz; i += 8) {
 865        tcg_gen_ld_i64(t1, cpu_env, aofs + i);
 866        tcg_gen_ld_i64(t2, cpu_env, bofs + i);
 867        tcg_gen_ld_i64(t3, cpu_env, cofs + i);
 868        fni(t0, t1, t2, t3);
 869        tcg_gen_st_i64(t0, cpu_env, dofs + i);
 870        if (write_aofs) {
 871            tcg_gen_st_i64(t1, cpu_env, aofs + i);
 872        }
 873    }
 874    tcg_temp_free_i64(t3);
 875    tcg_temp_free_i64(t2);
 876    tcg_temp_free_i64(t1);
 877    tcg_temp_free_i64(t0);
 878}
 879
 880/* Expand OPSZ bytes worth of two-operand operations using host vectors.  */
 881static void expand_2_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
 882                         uint32_t oprsz, uint32_t tysz, TCGType type,
 883                         void (*fni)(unsigned, TCGv_vec, TCGv_vec))
 884{
 885    TCGv_vec t0 = tcg_temp_new_vec(type);
 886    uint32_t i;
 887
 888    for (i = 0; i < oprsz; i += tysz) {
 889        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
 890        fni(vece, t0, t0);
 891        tcg_gen_st_vec(t0, cpu_env, dofs + i);
 892    }
 893    tcg_temp_free_vec(t0);
 894}
 895
 896/* Expand OPSZ bytes worth of two-vector operands and an immediate operand
 897   using host vectors.  */
 898static void expand_2i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
 899                          uint32_t oprsz, uint32_t tysz, TCGType type,
 900                          int64_t c, bool load_dest,
 901                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, int64_t))
 902{
 903    TCGv_vec t0 = tcg_temp_new_vec(type);
 904    TCGv_vec t1 = tcg_temp_new_vec(type);
 905    uint32_t i;
 906
 907    for (i = 0; i < oprsz; i += tysz) {
 908        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
 909        if (load_dest) {
 910            tcg_gen_ld_vec(t1, cpu_env, dofs + i);
 911        }
 912        fni(vece, t1, t0, c);
 913        tcg_gen_st_vec(t1, cpu_env, dofs + i);
 914    }
 915    tcg_temp_free_vec(t0);
 916    tcg_temp_free_vec(t1);
 917}
 918
 919static void expand_2s_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
 920                          uint32_t oprsz, uint32_t tysz, TCGType type,
 921                          TCGv_vec c, bool scalar_first,
 922                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
 923{
 924    TCGv_vec t0 = tcg_temp_new_vec(type);
 925    TCGv_vec t1 = tcg_temp_new_vec(type);
 926    uint32_t i;
 927
 928    for (i = 0; i < oprsz; i += tysz) {
 929        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
 930        if (scalar_first) {
 931            fni(vece, t1, c, t0);
 932        } else {
 933            fni(vece, t1, t0, c);
 934        }
 935        tcg_gen_st_vec(t1, cpu_env, dofs + i);
 936    }
 937    tcg_temp_free_vec(t0);
 938    tcg_temp_free_vec(t1);
 939}
 940
 941/* Expand OPSZ bytes worth of three-operand operations using host vectors.  */
 942static void expand_3_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
 943                         uint32_t bofs, uint32_t oprsz,
 944                         uint32_t tysz, TCGType type, bool load_dest,
 945                         void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
 946{
 947    TCGv_vec t0 = tcg_temp_new_vec(type);
 948    TCGv_vec t1 = tcg_temp_new_vec(type);
 949    TCGv_vec t2 = tcg_temp_new_vec(type);
 950    uint32_t i;
 951
 952    for (i = 0; i < oprsz; i += tysz) {
 953        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
 954        tcg_gen_ld_vec(t1, cpu_env, bofs + i);
 955        if (load_dest) {
 956            tcg_gen_ld_vec(t2, cpu_env, dofs + i);
 957        }
 958        fni(vece, t2, t0, t1);
 959        tcg_gen_st_vec(t2, cpu_env, dofs + i);
 960    }
 961    tcg_temp_free_vec(t2);
 962    tcg_temp_free_vec(t1);
 963    tcg_temp_free_vec(t0);
 964}
 965
 966/*
 967 * Expand OPSZ bytes worth of three-vector operands and an immediate operand
 968 * using host vectors.
 969 */
 970static void expand_3i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
 971                          uint32_t bofs, uint32_t oprsz, uint32_t tysz,
 972                          TCGType type, int64_t c, bool load_dest,
 973                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec,
 974                                      int64_t))
 975{
 976    TCGv_vec t0 = tcg_temp_new_vec(type);
 977    TCGv_vec t1 = tcg_temp_new_vec(type);
 978    TCGv_vec t2 = tcg_temp_new_vec(type);
 979    uint32_t i;
 980
 981    for (i = 0; i < oprsz; i += tysz) {
 982        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
 983        tcg_gen_ld_vec(t1, cpu_env, bofs + i);
 984        if (load_dest) {
 985            tcg_gen_ld_vec(t2, cpu_env, dofs + i);
 986        }
 987        fni(vece, t2, t0, t1, c);
 988        tcg_gen_st_vec(t2, cpu_env, dofs + i);
 989    }
 990    tcg_temp_free_vec(t0);
 991    tcg_temp_free_vec(t1);
 992    tcg_temp_free_vec(t2);
 993}
 994
 995/* Expand OPSZ bytes worth of four-operand operations using host vectors.  */
 996static void expand_4_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
 997                         uint32_t bofs, uint32_t cofs, uint32_t oprsz,
 998                         uint32_t tysz, TCGType type, bool write_aofs,
 999                         void (*fni)(unsigned, TCGv_vec, TCGv_vec,
1000                                     TCGv_vec, TCGv_vec))
1001{
1002    TCGv_vec t0 = tcg_temp_new_vec(type);
1003    TCGv_vec t1 = tcg_temp_new_vec(type);
1004    TCGv_vec t2 = tcg_temp_new_vec(type);
1005    TCGv_vec t3 = tcg_temp_new_vec(type);
1006    uint32_t i;
1007
1008    for (i = 0; i < oprsz; i += tysz) {
1009        tcg_gen_ld_vec(t1, cpu_env, aofs + i);
1010        tcg_gen_ld_vec(t2, cpu_env, bofs + i);
1011        tcg_gen_ld_vec(t3, cpu_env, cofs + i);
1012        fni(vece, t0, t1, t2, t3);
1013        tcg_gen_st_vec(t0, cpu_env, dofs + i);
1014        if (write_aofs) {
1015            tcg_gen_st_vec(t1, cpu_env, aofs + i);
1016        }
1017    }
1018    tcg_temp_free_vec(t3);
1019    tcg_temp_free_vec(t2);
1020    tcg_temp_free_vec(t1);
1021    tcg_temp_free_vec(t0);
1022}
1023
1024/* Expand a vector two-operand operation.  */
1025void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
1026                    uint32_t oprsz, uint32_t maxsz, const GVecGen2 *g)
1027{
1028    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1029    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1030    TCGType type;
1031    uint32_t some;
1032
1033    check_size_align(oprsz, maxsz, dofs | aofs);
1034    check_overlap_2(dofs, aofs, maxsz);
1035
1036    type = 0;
1037    if (g->fniv) {
1038        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1039    }
1040    switch (type) {
1041    case TCG_TYPE_V256:
1042        /* Recall that ARM SVE allows vector sizes that are not a
1043         * power of 2, but always a multiple of 16.  The intent is
1044         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1045         */
1046        some = QEMU_ALIGN_DOWN(oprsz, 32);
1047        expand_2_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256, g->fniv);
1048        if (some == oprsz) {
1049            break;
1050        }
1051        dofs += some;
1052        aofs += some;
1053        oprsz -= some;
1054        maxsz -= some;
1055        /* fallthru */
1056    case TCG_TYPE_V128:
1057        expand_2_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, g->fniv);
1058        break;
1059    case TCG_TYPE_V64:
1060        expand_2_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, g->fniv);
1061        break;
1062
1063    case 0:
1064        if (g->fni8 && check_size_impl(oprsz, 8)) {
1065            expand_2_i64(dofs, aofs, oprsz, g->fni8);
1066        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1067            expand_2_i32(dofs, aofs, oprsz, g->fni4);
1068        } else {
1069            assert(g->fno != NULL);
1070            tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, g->data, g->fno);
1071            oprsz = maxsz;
1072        }
1073        break;
1074
1075    default:
1076        g_assert_not_reached();
1077    }
1078    tcg_swap_vecop_list(hold_list);
1079
1080    if (oprsz < maxsz) {
1081        expand_clr(dofs + oprsz, maxsz - oprsz);
1082    }
1083}
1084
1085/* Expand a vector operation with two vectors and an immediate.  */
1086void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
1087                     uint32_t maxsz, int64_t c, const GVecGen2i *g)
1088{
1089    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1090    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1091    TCGType type;
1092    uint32_t some;
1093
1094    check_size_align(oprsz, maxsz, dofs | aofs);
1095    check_overlap_2(dofs, aofs, maxsz);
1096
1097    type = 0;
1098    if (g->fniv) {
1099        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1100    }
1101    switch (type) {
1102    case TCG_TYPE_V256:
1103        /* Recall that ARM SVE allows vector sizes that are not a
1104         * power of 2, but always a multiple of 16.  The intent is
1105         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1106         */
1107        some = QEMU_ALIGN_DOWN(oprsz, 32);
1108        expand_2i_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
1109                      c, g->load_dest, g->fniv);
1110        if (some == oprsz) {
1111            break;
1112        }
1113        dofs += some;
1114        aofs += some;
1115        oprsz -= some;
1116        maxsz -= some;
1117        /* fallthru */
1118    case TCG_TYPE_V128:
1119        expand_2i_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
1120                      c, g->load_dest, g->fniv);
1121        break;
1122    case TCG_TYPE_V64:
1123        expand_2i_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
1124                      c, g->load_dest, g->fniv);
1125        break;
1126
1127    case 0:
1128        if (g->fni8 && check_size_impl(oprsz, 8)) {
1129            expand_2i_i64(dofs, aofs, oprsz, c, g->load_dest, g->fni8);
1130        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1131            expand_2i_i32(dofs, aofs, oprsz, c, g->load_dest, g->fni4);
1132        } else {
1133            if (g->fno) {
1134                tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, c, g->fno);
1135            } else {
1136                TCGv_i64 tcg_c = tcg_const_i64(c);
1137                tcg_gen_gvec_2i_ool(dofs, aofs, tcg_c, oprsz,
1138                                    maxsz, c, g->fnoi);
1139                tcg_temp_free_i64(tcg_c);
1140            }
1141            oprsz = maxsz;
1142        }
1143        break;
1144
1145    default:
1146        g_assert_not_reached();
1147    }
1148    tcg_swap_vecop_list(hold_list);
1149
1150    if (oprsz < maxsz) {
1151        expand_clr(dofs + oprsz, maxsz - oprsz);
1152    }
1153}
1154
1155/* Expand a vector operation with two vectors and a scalar.  */
1156void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
1157                     uint32_t maxsz, TCGv_i64 c, const GVecGen2s *g)
1158{
1159    TCGType type;
1160
1161    check_size_align(oprsz, maxsz, dofs | aofs);
1162    check_overlap_2(dofs, aofs, maxsz);
1163
1164    type = 0;
1165    if (g->fniv) {
1166        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1167    }
1168    if (type != 0) {
1169        const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1170        const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1171        TCGv_vec t_vec = tcg_temp_new_vec(type);
1172        uint32_t some;
1173
1174        tcg_gen_dup_i64_vec(g->vece, t_vec, c);
1175
1176        switch (type) {
1177        case TCG_TYPE_V256:
1178            /* Recall that ARM SVE allows vector sizes that are not a
1179             * power of 2, but always a multiple of 16.  The intent is
1180             * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1181             */
1182            some = QEMU_ALIGN_DOWN(oprsz, 32);
1183            expand_2s_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
1184                          t_vec, g->scalar_first, g->fniv);
1185            if (some == oprsz) {
1186                break;
1187            }
1188            dofs += some;
1189            aofs += some;
1190            oprsz -= some;
1191            maxsz -= some;
1192            /* fallthru */
1193
1194        case TCG_TYPE_V128:
1195            expand_2s_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
1196                          t_vec, g->scalar_first, g->fniv);
1197            break;
1198
1199        case TCG_TYPE_V64:
1200            expand_2s_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
1201                          t_vec, g->scalar_first, g->fniv);
1202            break;
1203
1204        default:
1205            g_assert_not_reached();
1206        }
1207        tcg_temp_free_vec(t_vec);
1208        tcg_swap_vecop_list(hold_list);
1209    } else if (g->fni8 && check_size_impl(oprsz, 8)) {
1210        TCGv_i64 t64 = tcg_temp_new_i64();
1211
1212        gen_dup_i64(g->vece, t64, c);
1213        expand_2s_i64(dofs, aofs, oprsz, t64, g->scalar_first, g->fni8);
1214        tcg_temp_free_i64(t64);
1215    } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1216        TCGv_i32 t32 = tcg_temp_new_i32();
1217
1218        tcg_gen_extrl_i64_i32(t32, c);
1219        gen_dup_i32(g->vece, t32, t32);
1220        expand_2s_i32(dofs, aofs, oprsz, t32, g->scalar_first, g->fni4);
1221        tcg_temp_free_i32(t32);
1222    } else {
1223        tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, 0, g->fno);
1224        return;
1225    }
1226
1227    if (oprsz < maxsz) {
1228        expand_clr(dofs + oprsz, maxsz - oprsz);
1229    }
1230}
1231
1232/* Expand a vector three-operand operation.  */
1233void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
1234                    uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g)
1235{
1236    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1237    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1238    TCGType type;
1239    uint32_t some;
1240
1241    check_size_align(oprsz, maxsz, dofs | aofs | bofs);
1242    check_overlap_3(dofs, aofs, bofs, maxsz);
1243
1244    type = 0;
1245    if (g->fniv) {
1246        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1247    }
1248    switch (type) {
1249    case TCG_TYPE_V256:
1250        /* Recall that ARM SVE allows vector sizes that are not a
1251         * power of 2, but always a multiple of 16.  The intent is
1252         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1253         */
1254        some = QEMU_ALIGN_DOWN(oprsz, 32);
1255        expand_3_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
1256                     g->load_dest, g->fniv);
1257        if (some == oprsz) {
1258            break;
1259        }
1260        dofs += some;
1261        aofs += some;
1262        bofs += some;
1263        oprsz -= some;
1264        maxsz -= some;
1265        /* fallthru */
1266    case TCG_TYPE_V128:
1267        expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
1268                     g->load_dest, g->fniv);
1269        break;
1270    case TCG_TYPE_V64:
1271        expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
1272                     g->load_dest, g->fniv);
1273        break;
1274
1275    case 0:
1276        if (g->fni8 && check_size_impl(oprsz, 8)) {
1277            expand_3_i64(dofs, aofs, bofs, oprsz, g->load_dest, g->fni8);
1278        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1279            expand_3_i32(dofs, aofs, bofs, oprsz, g->load_dest, g->fni4);
1280        } else {
1281            assert(g->fno != NULL);
1282            tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz,
1283                               maxsz, g->data, g->fno);
1284            oprsz = maxsz;
1285        }
1286        break;
1287
1288    default:
1289        g_assert_not_reached();
1290    }
1291    tcg_swap_vecop_list(hold_list);
1292
1293    if (oprsz < maxsz) {
1294        expand_clr(dofs + oprsz, maxsz - oprsz);
1295    }
1296}
1297
1298/* Expand a vector operation with three vectors and an immediate.  */
1299void tcg_gen_gvec_3i(uint32_t dofs, uint32_t aofs, uint32_t bofs,
1300                     uint32_t oprsz, uint32_t maxsz, int64_t c,
1301                     const GVecGen3i *g)
1302{
1303    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1304    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1305    TCGType type;
1306    uint32_t some;
1307
1308    check_size_align(oprsz, maxsz, dofs | aofs | bofs);
1309    check_overlap_3(dofs, aofs, bofs, maxsz);
1310
1311    type = 0;
1312    if (g->fniv) {
1313        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1314    }
1315    switch (type) {
1316    case TCG_TYPE_V256:
1317        /*
1318         * Recall that ARM SVE allows vector sizes that are not a
1319         * power of 2, but always a multiple of 16.  The intent is
1320         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1321         */
1322        some = QEMU_ALIGN_DOWN(oprsz, 32);
1323        expand_3i_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
1324                      c, g->load_dest, g->fniv);
1325        if (some == oprsz) {
1326            break;
1327        }
1328        dofs += some;
1329        aofs += some;
1330        bofs += some;
1331        oprsz -= some;
1332        maxsz -= some;
1333        /* fallthru */
1334    case TCG_TYPE_V128:
1335        expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
1336                      c, g->load_dest, g->fniv);
1337        break;
1338    case TCG_TYPE_V64:
1339        expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
1340                      c, g->load_dest, g->fniv);
1341        break;
1342
1343    case 0:
1344        if (g->fni8 && check_size_impl(oprsz, 8)) {
1345            expand_3i_i64(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni8);
1346        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1347            expand_3i_i32(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni4);
1348        } else {
1349            assert(g->fno != NULL);
1350            tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, c, g->fno);
1351            oprsz = maxsz;
1352        }
1353        break;
1354
1355    default:
1356        g_assert_not_reached();
1357    }
1358    tcg_swap_vecop_list(hold_list);
1359
1360    if (oprsz < maxsz) {
1361        expand_clr(dofs + oprsz, maxsz - oprsz);
1362    }
1363}
1364
1365/* Expand a vector four-operand operation.  */
1366void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
1367                    uint32_t oprsz, uint32_t maxsz, const GVecGen4 *g)
1368{
1369    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1370    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1371    TCGType type;
1372    uint32_t some;
1373
1374    check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs);
1375    check_overlap_4(dofs, aofs, bofs, cofs, maxsz);
1376
1377    type = 0;
1378    if (g->fniv) {
1379        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1380    }
1381    switch (type) {
1382    case TCG_TYPE_V256:
1383        /* Recall that ARM SVE allows vector sizes that are not a
1384         * power of 2, but always a multiple of 16.  The intent is
1385         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1386         */
1387        some = QEMU_ALIGN_DOWN(oprsz, 32);
1388        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, some,
1389                     32, TCG_TYPE_V256, g->write_aofs, g->fniv);
1390        if (some == oprsz) {
1391            break;
1392        }
1393        dofs += some;
1394        aofs += some;
1395        bofs += some;
1396        cofs += some;
1397        oprsz -= some;
1398        maxsz -= some;
1399        /* fallthru */
1400    case TCG_TYPE_V128:
1401        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
1402                     16, TCG_TYPE_V128, g->write_aofs, g->fniv);
1403        break;
1404    case TCG_TYPE_V64:
1405        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
1406                     8, TCG_TYPE_V64, g->write_aofs, g->fniv);
1407        break;
1408
1409    case 0:
1410        if (g->fni8 && check_size_impl(oprsz, 8)) {
1411            expand_4_i64(dofs, aofs, bofs, cofs, oprsz,
1412                         g->write_aofs, g->fni8);
1413        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1414            expand_4_i32(dofs, aofs, bofs, cofs, oprsz,
1415                         g->write_aofs, g->fni4);
1416        } else {
1417            assert(g->fno != NULL);
1418            tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs,
1419                               oprsz, maxsz, g->data, g->fno);
1420            oprsz = maxsz;
1421        }
1422        break;
1423
1424    default:
1425        g_assert_not_reached();
1426    }
1427    tcg_swap_vecop_list(hold_list);
1428
1429    if (oprsz < maxsz) {
1430        expand_clr(dofs + oprsz, maxsz - oprsz);
1431    }
1432}
1433
1434/*
1435 * Expand specific vector operations.
1436 */
1437
1438static void vec_mov2(unsigned vece, TCGv_vec a, TCGv_vec b)
1439{
1440    tcg_gen_mov_vec(a, b);
1441}
1442
1443void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs,
1444                      uint32_t oprsz, uint32_t maxsz)
1445{
1446    static const GVecGen2 g = {
1447        .fni8 = tcg_gen_mov_i64,
1448        .fniv = vec_mov2,
1449        .fno = gen_helper_gvec_mov,
1450        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1451    };
1452    if (dofs != aofs) {
1453        tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
1454    } else {
1455        check_size_align(oprsz, maxsz, dofs);
1456        if (oprsz < maxsz) {
1457            expand_clr(dofs + oprsz, maxsz - oprsz);
1458        }
1459    }
1460}
1461
1462void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t oprsz,
1463                          uint32_t maxsz, TCGv_i32 in)
1464{
1465    check_size_align(oprsz, maxsz, dofs);
1466    tcg_debug_assert(vece <= MO_32);
1467    do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
1468}
1469
1470void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t oprsz,
1471                          uint32_t maxsz, TCGv_i64 in)
1472{
1473    check_size_align(oprsz, maxsz, dofs);
1474    tcg_debug_assert(vece <= MO_64);
1475    do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
1476}
1477
1478void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
1479                          uint32_t oprsz, uint32_t maxsz)
1480{
1481    check_size_align(oprsz, maxsz, dofs);
1482    if (vece <= MO_64) {
1483        TCGType type = choose_vector_type(NULL, vece, oprsz, 0);
1484        if (type != 0) {
1485            TCGv_vec t_vec = tcg_temp_new_vec(type);
1486            tcg_gen_dup_mem_vec(vece, t_vec, cpu_env, aofs);
1487            do_dup_store(type, dofs, oprsz, maxsz, t_vec);
1488            tcg_temp_free_vec(t_vec);
1489        } else if (vece <= MO_32) {
1490            TCGv_i32 in = tcg_temp_new_i32();
1491            switch (vece) {
1492            case MO_8:
1493                tcg_gen_ld8u_i32(in, cpu_env, aofs);
1494                break;
1495            case MO_16:
1496                tcg_gen_ld16u_i32(in, cpu_env, aofs);
1497                break;
1498            default:
1499                tcg_gen_ld_i32(in, cpu_env, aofs);
1500                break;
1501            }
1502            do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
1503            tcg_temp_free_i32(in);
1504        } else {
1505            TCGv_i64 in = tcg_temp_new_i64();
1506            tcg_gen_ld_i64(in, cpu_env, aofs);
1507            do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
1508            tcg_temp_free_i64(in);
1509        }
1510    } else {
1511        /* 128-bit duplicate.  */
1512        /* ??? Dup to 256-bit vector.  */
1513        int i;
1514
1515        tcg_debug_assert(vece == 4);
1516        tcg_debug_assert(oprsz >= 16);
1517        if (TCG_TARGET_HAS_v128) {
1518            TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V128);
1519
1520            tcg_gen_ld_vec(in, cpu_env, aofs);
1521            for (i = 0; i < oprsz; i += 16) {
1522                tcg_gen_st_vec(in, cpu_env, dofs + i);
1523            }
1524            tcg_temp_free_vec(in);
1525        } else {
1526            TCGv_i64 in0 = tcg_temp_new_i64();
1527            TCGv_i64 in1 = tcg_temp_new_i64();
1528
1529            tcg_gen_ld_i64(in0, cpu_env, aofs);
1530            tcg_gen_ld_i64(in1, cpu_env, aofs + 8);
1531            for (i = 0; i < oprsz; i += 16) {
1532                tcg_gen_st_i64(in0, cpu_env, dofs + i);
1533                tcg_gen_st_i64(in1, cpu_env, dofs + i + 8);
1534            }
1535            tcg_temp_free_i64(in0);
1536            tcg_temp_free_i64(in1);
1537        }
1538        if (oprsz < maxsz) {
1539            expand_clr(dofs + oprsz, maxsz - oprsz);
1540        }
1541    }
1542}
1543
1544void tcg_gen_gvec_dup64i(uint32_t dofs, uint32_t oprsz,
1545                         uint32_t maxsz, uint64_t x)
1546{
1547    check_size_align(oprsz, maxsz, dofs);
1548    do_dup(MO_64, dofs, oprsz, maxsz, NULL, NULL, x);
1549}
1550
1551void tcg_gen_gvec_dup32i(uint32_t dofs, uint32_t oprsz,
1552                         uint32_t maxsz, uint32_t x)
1553{
1554    check_size_align(oprsz, maxsz, dofs);
1555    do_dup(MO_32, dofs, oprsz, maxsz, NULL, NULL, x);
1556}
1557
1558void tcg_gen_gvec_dup16i(uint32_t dofs, uint32_t oprsz,
1559                         uint32_t maxsz, uint16_t x)
1560{
1561    check_size_align(oprsz, maxsz, dofs);
1562    do_dup(MO_16, dofs, oprsz, maxsz, NULL, NULL, x);
1563}
1564
1565void tcg_gen_gvec_dup8i(uint32_t dofs, uint32_t oprsz,
1566                         uint32_t maxsz, uint8_t x)
1567{
1568    check_size_align(oprsz, maxsz, dofs);
1569    do_dup(MO_8, dofs, oprsz, maxsz, NULL, NULL, x);
1570}
1571
1572void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs,
1573                      uint32_t oprsz, uint32_t maxsz)
1574{
1575    static const GVecGen2 g = {
1576        .fni8 = tcg_gen_not_i64,
1577        .fniv = tcg_gen_not_vec,
1578        .fno = gen_helper_gvec_not,
1579        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1580    };
1581    tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
1582}
1583
1584/* Perform a vector addition using normal addition and a mask.  The mask
1585   should be the sign bit of each lane.  This 6-operation form is more
1586   efficient than separate additions when there are 4 or more lanes in
1587   the 64-bit operation.  */
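/* For example, with MO_8 and m = 0x80 in every byte:
     t1 = a & ~m            -- a with each lane's sign bit cleared
     t2 = b & ~m            -- b with each lane's sign bit cleared
     d  = t1 + t2           -- carries stop at the zeroed sign bits
     d ^= (a ^ b) & m       -- restore each lane's true sign bit
   In the masked addition the sign position receives only the carry-in
   and can never generate a carry-out, so lanes cannot contaminate each
   other; the final xor turns that carry into a ^ b ^ carry-in, which is
   the correct per-lane sum bit.  */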
1588static void gen_addv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
1589{
1590    TCGv_i64 t1 = tcg_temp_new_i64();
1591    TCGv_i64 t2 = tcg_temp_new_i64();
1592    TCGv_i64 t3 = tcg_temp_new_i64();
1593
1594    tcg_gen_andc_i64(t1, a, m);
1595    tcg_gen_andc_i64(t2, b, m);
1596    tcg_gen_xor_i64(t3, a, b);
1597    tcg_gen_add_i64(d, t1, t2);
1598    tcg_gen_and_i64(t3, t3, m);
1599    tcg_gen_xor_i64(d, d, t3);
1600
1601    tcg_temp_free_i64(t1);
1602    tcg_temp_free_i64(t2);
1603    tcg_temp_free_i64(t3);
1604}
1605
1606void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1607{
1608    TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
1609    gen_addv_mask(d, a, b, m);
1610    tcg_temp_free_i64(m);
1611}
1612
1613void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1614{
1615    TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
1616    gen_addv_mask(d, a, b, m);
1617    tcg_temp_free_i64(m);
1618}
1619
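/* Add two 32-bit lanes held in one i64.  The low lane comes from a full
   64-bit addition, whose carry-out only disturbs the high half that the
   deposit discards; the high lane comes from adding b to a with a's low
   half cleared, so no carry from the low half can reach it.  */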
1620void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1621{
1622    TCGv_i64 t1 = tcg_temp_new_i64();
1623    TCGv_i64 t2 = tcg_temp_new_i64();
1624
1625    tcg_gen_andi_i64(t1, a, ~0xffffffffull);
1626    tcg_gen_add_i64(t2, a, b);
1627    tcg_gen_add_i64(t1, t1, b);
1628    tcg_gen_deposit_i64(d, t1, t2, 0, 32);
1629
1630    tcg_temp_free_i64(t1);
1631    tcg_temp_free_i64(t2);
1632}
1633
1634static const TCGOpcode vecop_list_add[] = { INDEX_op_add_vec, 0 };
1635
1636void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs,
1637                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1638{
1639    static const GVecGen3 g[4] = {
1640        { .fni8 = tcg_gen_vec_add8_i64,
1641          .fniv = tcg_gen_add_vec,
1642          .fno = gen_helper_gvec_add8,
1643          .opt_opc = vecop_list_add,
1644          .vece = MO_8 },
1645        { .fni8 = tcg_gen_vec_add16_i64,
1646          .fniv = tcg_gen_add_vec,
1647          .fno = gen_helper_gvec_add16,
1648          .opt_opc = vecop_list_add,
1649          .vece = MO_16 },
1650        { .fni4 = tcg_gen_add_i32,
1651          .fniv = tcg_gen_add_vec,
1652          .fno = gen_helper_gvec_add32,
1653          .opt_opc = vecop_list_add,
1654          .vece = MO_32 },
1655        { .fni8 = tcg_gen_add_i64,
1656          .fniv = tcg_gen_add_vec,
1657          .fno = gen_helper_gvec_add64,
1658          .opt_opc = vecop_list_add,
1659          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1660          .vece = MO_64 },
1661    };
1662
1663    tcg_debug_assert(vece <= MO_64);
1664    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1665}
1666
1667void tcg_gen_gvec_adds(unsigned vece, uint32_t dofs, uint32_t aofs,
1668                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1669{
1670    static const GVecGen2s g[4] = {
1671        { .fni8 = tcg_gen_vec_add8_i64,
1672          .fniv = tcg_gen_add_vec,
1673          .fno = gen_helper_gvec_adds8,
1674          .opt_opc = vecop_list_add,
1675          .vece = MO_8 },
1676        { .fni8 = tcg_gen_vec_add16_i64,
1677          .fniv = tcg_gen_add_vec,
1678          .fno = gen_helper_gvec_adds16,
1679          .opt_opc = vecop_list_add,
1680          .vece = MO_16 },
1681        { .fni4 = tcg_gen_add_i32,
1682          .fniv = tcg_gen_add_vec,
1683          .fno = gen_helper_gvec_adds32,
1684          .opt_opc = vecop_list_add,
1685          .vece = MO_32 },
1686        { .fni8 = tcg_gen_add_i64,
1687          .fniv = tcg_gen_add_vec,
1688          .fno = gen_helper_gvec_adds64,
1689          .opt_opc = vecop_list_add,
1690          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1691          .vece = MO_64 },
1692    };
1693
1694    tcg_debug_assert(vece <= MO_64);
1695    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
1696}
1697
1698void tcg_gen_gvec_addi(unsigned vece, uint32_t dofs, uint32_t aofs,
1699                       int64_t c, uint32_t oprsz, uint32_t maxsz)
1700{
1701    TCGv_i64 tmp = tcg_const_i64(c);
1702    tcg_gen_gvec_adds(vece, dofs, aofs, tmp, oprsz, maxsz);
1703    tcg_temp_free_i64(tmp);
1704}
1705
1706static const TCGOpcode vecop_list_sub[] = { INDEX_op_sub_vec, 0 };
1707
1708void tcg_gen_gvec_subs(unsigned vece, uint32_t dofs, uint32_t aofs,
1709                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1710{
1711    static const GVecGen2s g[4] = {
1712        { .fni8 = tcg_gen_vec_sub8_i64,
1713          .fniv = tcg_gen_sub_vec,
1714          .fno = gen_helper_gvec_subs8,
1715          .opt_opc = vecop_list_sub,
1716          .vece = MO_8 },
1717        { .fni8 = tcg_gen_vec_sub16_i64,
1718          .fniv = tcg_gen_sub_vec,
1719          .fno = gen_helper_gvec_subs16,
1720          .opt_opc = vecop_list_sub,
1721          .vece = MO_16 },
1722        { .fni4 = tcg_gen_sub_i32,
1723          .fniv = tcg_gen_sub_vec,
1724          .fno = gen_helper_gvec_subs32,
1725          .opt_opc = vecop_list_sub,
1726          .vece = MO_32 },
1727        { .fni8 = tcg_gen_sub_i64,
1728          .fniv = tcg_gen_sub_vec,
1729          .fno = gen_helper_gvec_subs64,
1730          .opt_opc = vecop_list_sub,
1731          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1732          .vece = MO_64 },
1733    };
1734
1735    tcg_debug_assert(vece <= MO_64);
1736    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
1737}
1738
1739/* Perform a vector subtraction using normal subtraction and a mask.
1740   Compare gen_addv_mask above.  */
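/* Here the minuend's sign bits are forced to 1 and the subtrahend's to
   0, so no lane ever borrows from the lane above it.  Each sign position
   then holds 1 ^ 0 ^ borrow-in, and xoring with eqv(a, b) & m converts
   that into a ^ b ^ borrow-in, the correct per-lane difference bit.  */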
1741static void gen_subv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
1742{
1743    TCGv_i64 t1 = tcg_temp_new_i64();
1744    TCGv_i64 t2 = tcg_temp_new_i64();
1745    TCGv_i64 t3 = tcg_temp_new_i64();
1746
1747    tcg_gen_or_i64(t1, a, m);
1748    tcg_gen_andc_i64(t2, b, m);
1749    tcg_gen_eqv_i64(t3, a, b);
1750    tcg_gen_sub_i64(d, t1, t2);
1751    tcg_gen_and_i64(t3, t3, m);
1752    tcg_gen_xor_i64(d, d, t3);
1753
1754    tcg_temp_free_i64(t1);
1755    tcg_temp_free_i64(t2);
1756    tcg_temp_free_i64(t3);
1757}
1758
1759void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1760{
1761    TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
1762    gen_subv_mask(d, a, b, m);
1763    tcg_temp_free_i64(m);
1764}
1765
1766void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1767{
1768    TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
1769    gen_subv_mask(d, a, b, m);
1770    tcg_temp_free_i64(m);
1771}
1772
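/* Subtract two 32-bit lanes held in one i64.  The low lane comes from a
   full 64-bit subtraction; the high lane comes from subtracting b with
   its low half cleared from a, so the low half's borrow never reaches
   it.  */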
1773void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1774{
1775    TCGv_i64 t1 = tcg_temp_new_i64();
1776    TCGv_i64 t2 = tcg_temp_new_i64();
1777
1778    tcg_gen_andi_i64(t1, b, ~0xffffffffull);
1779    tcg_gen_sub_i64(t2, a, b);
1780    tcg_gen_sub_i64(t1, a, t1);
1781    tcg_gen_deposit_i64(d, t1, t2, 0, 32);
1782
1783    tcg_temp_free_i64(t1);
1784    tcg_temp_free_i64(t2);
1785}
1786
1787void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs,
1788                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1789{
1790    static const GVecGen3 g[4] = {
1791        { .fni8 = tcg_gen_vec_sub8_i64,
1792          .fniv = tcg_gen_sub_vec,
1793          .fno = gen_helper_gvec_sub8,
1794          .opt_opc = vecop_list_sub,
1795          .vece = MO_8 },
1796        { .fni8 = tcg_gen_vec_sub16_i64,
1797          .fniv = tcg_gen_sub_vec,
1798          .fno = gen_helper_gvec_sub16,
1799          .opt_opc = vecop_list_sub,
1800          .vece = MO_16 },
1801        { .fni4 = tcg_gen_sub_i32,
1802          .fniv = tcg_gen_sub_vec,
1803          .fno = gen_helper_gvec_sub32,
1804          .opt_opc = vecop_list_sub,
1805          .vece = MO_32 },
1806        { .fni8 = tcg_gen_sub_i64,
1807          .fniv = tcg_gen_sub_vec,
1808          .fno = gen_helper_gvec_sub64,
1809          .opt_opc = vecop_list_sub,
1810          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1811          .vece = MO_64 },
1812    };
1813
1814    tcg_debug_assert(vece <= MO_64);
1815    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1816}
1817
1818static const TCGOpcode vecop_list_mul[] = { INDEX_op_mul_vec, 0 };
1819
1820void tcg_gen_gvec_mul(unsigned vece, uint32_t dofs, uint32_t aofs,
1821                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1822{
1823    static const GVecGen3 g[4] = {
1824        { .fniv = tcg_gen_mul_vec,
1825          .fno = gen_helper_gvec_mul8,
1826          .opt_opc = vecop_list_mul,
1827          .vece = MO_8 },
1828        { .fniv = tcg_gen_mul_vec,
1829          .fno = gen_helper_gvec_mul16,
1830          .opt_opc = vecop_list_mul,
1831          .vece = MO_16 },
1832        { .fni4 = tcg_gen_mul_i32,
1833          .fniv = tcg_gen_mul_vec,
1834          .fno = gen_helper_gvec_mul32,
1835          .opt_opc = vecop_list_mul,
1836          .vece = MO_32 },
1837        { .fni8 = tcg_gen_mul_i64,
1838          .fniv = tcg_gen_mul_vec,
1839          .fno = gen_helper_gvec_mul64,
1840          .opt_opc = vecop_list_mul,
1841          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1842          .vece = MO_64 },
1843    };
1844
1845    tcg_debug_assert(vece <= MO_64);
1846    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1847}
1848
1849void tcg_gen_gvec_muls(unsigned vece, uint32_t dofs, uint32_t aofs,
1850                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1851{
1852    static const GVecGen2s g[4] = {
1853        { .fniv = tcg_gen_mul_vec,
1854          .fno = gen_helper_gvec_muls8,
1855          .opt_opc = vecop_list_mul,
1856          .vece = MO_8 },
1857        { .fniv = tcg_gen_mul_vec,
1858          .fno = gen_helper_gvec_muls16,
1859          .opt_opc = vecop_list_mul,
1860          .vece = MO_16 },
1861        { .fni4 = tcg_gen_mul_i32,
1862          .fniv = tcg_gen_mul_vec,
1863          .fno = gen_helper_gvec_muls32,
1864          .opt_opc = vecop_list_mul,
1865          .vece = MO_32 },
1866        { .fni8 = tcg_gen_mul_i64,
1867          .fniv = tcg_gen_mul_vec,
1868          .fno = gen_helper_gvec_muls64,
1869          .opt_opc = vecop_list_mul,
1870          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1871          .vece = MO_64 },
1872    };
1873
1874    tcg_debug_assert(vece <= MO_64);
1875    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
1876}
1877
1878void tcg_gen_gvec_muli(unsigned vece, uint32_t dofs, uint32_t aofs,
1879                       int64_t c, uint32_t oprsz, uint32_t maxsz)
1880{
1881    TCGv_i64 tmp = tcg_const_i64(c);
1882    tcg_gen_gvec_muls(vece, dofs, aofs, tmp, oprsz, maxsz);
1883    tcg_temp_free_i64(tmp);
1884}
1885
1886void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs,
1887                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1888{
1889    static const TCGOpcode vecop_list[] = { INDEX_op_ssadd_vec, 0 };
1890    static const GVecGen3 g[4] = {
1891        { .fniv = tcg_gen_ssadd_vec,
1892          .fno = gen_helper_gvec_ssadd8,
1893          .opt_opc = vecop_list,
1894          .vece = MO_8 },
1895        { .fniv = tcg_gen_ssadd_vec,
1896          .fno = gen_helper_gvec_ssadd16,
1897          .opt_opc = vecop_list,
1898          .vece = MO_16 },
1899        { .fniv = tcg_gen_ssadd_vec,
1900          .fno = gen_helper_gvec_ssadd32,
1901          .opt_opc = vecop_list,
1902          .vece = MO_32 },
1903        { .fniv = tcg_gen_ssadd_vec,
1904          .fno = gen_helper_gvec_ssadd64,
1905          .opt_opc = vecop_list,
1906          .vece = MO_64 },
1907    };
1908    tcg_debug_assert(vece <= MO_64);
1909    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1910}
1911
1912void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs,
1913                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1914{
1915    static const TCGOpcode vecop_list[] = { INDEX_op_sssub_vec, 0 };
1916    static const GVecGen3 g[4] = {
1917        { .fniv = tcg_gen_sssub_vec,
1918          .fno = gen_helper_gvec_sssub8,
1919          .opt_opc = vecop_list,
1920          .vece = MO_8 },
1921        { .fniv = tcg_gen_sssub_vec,
1922          .fno = gen_helper_gvec_sssub16,
1923          .opt_opc = vecop_list,
1924          .vece = MO_16 },
1925        { .fniv = tcg_gen_sssub_vec,
1926          .fno = gen_helper_gvec_sssub32,
1927          .opt_opc = vecop_list,
1928          .vece = MO_32 },
1929        { .fniv = tcg_gen_sssub_vec,
1930          .fno = gen_helper_gvec_sssub64,
1931          .opt_opc = vecop_list,
1932          .vece = MO_64 },
1933    };
1934    tcg_debug_assert(vece <= MO_64);
1935    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1936}
1937
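/* Unsigned saturating addition: if the sum wrapped around (d < a,
   unsigned), replace it with the all-ones maximum.  */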
1938static void tcg_gen_usadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1939{
1940    TCGv_i32 max = tcg_const_i32(-1);
1941    tcg_gen_add_i32(d, a, b);
1942    tcg_gen_movcond_i32(TCG_COND_LTU, d, d, a, max, d);
1943    tcg_temp_free_i32(max);
1944}
1945
1946static void tcg_gen_usadd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1947{
1948    TCGv_i64 max = tcg_const_i64(-1);
1949    tcg_gen_add_i64(d, a, b);
1950    tcg_gen_movcond_i64(TCG_COND_LTU, d, d, a, max, d);
1951    tcg_temp_free_i64(max);
1952}
1953
1954void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs,
1955                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1956{
1957    static const TCGOpcode vecop_list[] = { INDEX_op_usadd_vec, 0 };
1958    static const GVecGen3 g[4] = {
1959        { .fniv = tcg_gen_usadd_vec,
1960          .fno = gen_helper_gvec_usadd8,
1961          .opt_opc = vecop_list,
1962          .vece = MO_8 },
1963        { .fniv = tcg_gen_usadd_vec,
1964          .fno = gen_helper_gvec_usadd16,
1965          .opt_opc = vecop_list,
1966          .vece = MO_16 },
1967        { .fni4 = tcg_gen_usadd_i32,
1968          .fniv = tcg_gen_usadd_vec,
1969          .fno = gen_helper_gvec_usadd32,
1970          .opt_opc = vecop_list,
1971          .vece = MO_32 },
1972        { .fni8 = tcg_gen_usadd_i64,
1973          .fniv = tcg_gen_usadd_vec,
1974          .fno = gen_helper_gvec_usadd64,
1975          .opt_opc = vecop_list,
1976          .vece = MO_64 }
1977    };
1978    tcg_debug_assert(vece <= MO_64);
1979    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1980}
1981
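/* Unsigned saturating subtraction: if a < b the true result would be
   negative, so clamp it to zero.  */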
1982static void tcg_gen_ussub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1983{
1984    TCGv_i32 min = tcg_const_i32(0);
1985    tcg_gen_sub_i32(d, a, b);
1986    tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, min, d);
1987    tcg_temp_free_i32(min);
1988}
1989
1990static void tcg_gen_ussub_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1991{
1992    TCGv_i64 min = tcg_const_i64(0);
1993    tcg_gen_sub_i64(d, a, b);
1994    tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, min, d);
1995    tcg_temp_free_i64(min);
1996}
1997
1998void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs,
1999                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2000{
2001    static const TCGOpcode vecop_list[] = { INDEX_op_ussub_vec, 0 };
2002    static const GVecGen3 g[4] = {
2003        { .fniv = tcg_gen_ussub_vec,
2004          .fno = gen_helper_gvec_ussub8,
2005          .opt_opc = vecop_list,
2006          .vece = MO_8 },
2007        { .fniv = tcg_gen_ussub_vec,
2008          .fno = gen_helper_gvec_ussub16,
2009          .opt_opc = vecop_list,
2010          .vece = MO_16 },
2011        { .fni4 = tcg_gen_ussub_i32,
2012          .fniv = tcg_gen_ussub_vec,
2013          .fno = gen_helper_gvec_ussub32,
2014          .opt_opc = vecop_list,
2015          .vece = MO_32 },
2016        { .fni8 = tcg_gen_ussub_i64,
2017          .fniv = tcg_gen_ussub_vec,
2018          .fno = gen_helper_gvec_ussub64,
2019          .opt_opc = vecop_list,
2020          .vece = MO_64 }
2021    };
2022    tcg_debug_assert(vece <= MO_64);
2023    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2024}
2025
2026void tcg_gen_gvec_smin(unsigned vece, uint32_t dofs, uint32_t aofs,
2027                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2028{
2029    static const TCGOpcode vecop_list[] = { INDEX_op_smin_vec, 0 };
2030    static const GVecGen3 g[4] = {
2031        { .fniv = tcg_gen_smin_vec,
2032          .fno = gen_helper_gvec_smin8,
2033          .opt_opc = vecop_list,
2034          .vece = MO_8 },
2035        { .fniv = tcg_gen_smin_vec,
2036          .fno = gen_helper_gvec_smin16,
2037          .opt_opc = vecop_list,
2038          .vece = MO_16 },
2039        { .fni4 = tcg_gen_smin_i32,
2040          .fniv = tcg_gen_smin_vec,
2041          .fno = gen_helper_gvec_smin32,
2042          .opt_opc = vecop_list,
2043          .vece = MO_32 },
2044        { .fni8 = tcg_gen_smin_i64,
2045          .fniv = tcg_gen_smin_vec,
2046          .fno = gen_helper_gvec_smin64,
2047          .opt_opc = vecop_list,
2048          .vece = MO_64 }
2049    };
2050    tcg_debug_assert(vece <= MO_64);
2051    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2052}
2053
2054void tcg_gen_gvec_umin(unsigned vece, uint32_t dofs, uint32_t aofs,
2055                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2056{
2057    static const TCGOpcode vecop_list[] = { INDEX_op_umin_vec, 0 };
2058    static const GVecGen3 g[4] = {
2059        { .fniv = tcg_gen_umin_vec,
2060          .fno = gen_helper_gvec_umin8,
2061          .opt_opc = vecop_list,
2062          .vece = MO_8 },
2063        { .fniv = tcg_gen_umin_vec,
2064          .fno = gen_helper_gvec_umin16,
2065          .opt_opc = vecop_list,
2066          .vece = MO_16 },
2067        { .fni4 = tcg_gen_umin_i32,
2068          .fniv = tcg_gen_umin_vec,
2069          .fno = gen_helper_gvec_umin32,
2070          .opt_opc = vecop_list,
2071          .vece = MO_32 },
2072        { .fni8 = tcg_gen_umin_i64,
2073          .fniv = tcg_gen_umin_vec,
2074          .fno = gen_helper_gvec_umin64,
2075          .opt_opc = vecop_list,
2076          .vece = MO_64 }
2077    };
2078    tcg_debug_assert(vece <= MO_64);
2079    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2080}
2081
2082void tcg_gen_gvec_smax(unsigned vece, uint32_t dofs, uint32_t aofs,
2083                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2084{
2085    static const TCGOpcode vecop_list[] = { INDEX_op_smax_vec, 0 };
2086    static const GVecGen3 g[4] = {
2087        { .fniv = tcg_gen_smax_vec,
2088          .fno = gen_helper_gvec_smax8,
2089          .opt_opc = vecop_list,
2090          .vece = MO_8 },
2091        { .fniv = tcg_gen_smax_vec,
2092          .fno = gen_helper_gvec_smax16,
2093          .opt_opc = vecop_list,
2094          .vece = MO_16 },
2095        { .fni4 = tcg_gen_smax_i32,
2096          .fniv = tcg_gen_smax_vec,
2097          .fno = gen_helper_gvec_smax32,
2098          .opt_opc = vecop_list,
2099          .vece = MO_32 },
2100        { .fni8 = tcg_gen_smax_i64,
2101          .fniv = tcg_gen_smax_vec,
2102          .fno = gen_helper_gvec_smax64,
2103          .opt_opc = vecop_list,
2104          .vece = MO_64 }
2105    };
2106    tcg_debug_assert(vece <= MO_64);
2107    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2108}
2109
2110void tcg_gen_gvec_umax(unsigned vece, uint32_t dofs, uint32_t aofs,
2111                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2112{
2113    static const TCGOpcode vecop_list[] = { INDEX_op_umax_vec, 0 };
2114    static const GVecGen3 g[4] = {
2115        { .fniv = tcg_gen_umax_vec,
2116          .fno = gen_helper_gvec_umax8,
2117          .opt_opc = vecop_list,
2118          .vece = MO_8 },
2119        { .fniv = tcg_gen_umax_vec,
2120          .fno = gen_helper_gvec_umax16,
2121          .opt_opc = vecop_list,
2122          .vece = MO_16 },
2123        { .fni4 = tcg_gen_umax_i32,
2124          .fniv = tcg_gen_umax_vec,
2125          .fno = gen_helper_gvec_umax32,
2126          .opt_opc = vecop_list,
2127          .vece = MO_32 },
2128        { .fni8 = tcg_gen_umax_i64,
2129          .fniv = tcg_gen_umax_vec,
2130          .fno = gen_helper_gvec_umax64,
2131          .opt_opc = vecop_list,
2132          .vece = MO_64 }
2133    };
2134    tcg_debug_assert(vece <= MO_64);
2135    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2136}
2137
2138/* Perform a vector negation using normal negation and a mask.
2139   Compare gen_subv_mask above.  */
2140static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m)
2141{
2142    TCGv_i64 t2 = tcg_temp_new_i64();
2143    TCGv_i64 t3 = tcg_temp_new_i64();
2144
2145    tcg_gen_andc_i64(t3, m, b);
2146    tcg_gen_andc_i64(t2, b, m);
2147    tcg_gen_sub_i64(d, m, t2);
2148    tcg_gen_xor_i64(d, d, t3);
2149
2150    tcg_temp_free_i64(t2);
2151    tcg_temp_free_i64(t3);
2152}
2153
2154void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 b)
2155{
2156    TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
2157    gen_negv_mask(d, b, m);
2158    tcg_temp_free_i64(m);
2159}
2160
2161void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 b)
2162{
2163    TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
2164    gen_negv_mask(d, b, m);
2165    tcg_temp_free_i64(m);
2166}
2167
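/* Negate two 32-bit lanes held in one i64: the low lane from a full
   64-bit negation, the high lane from negating b with its low half
   cleared, so the two lanes cannot interact.  */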
2168void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 b)
2169{
2170    TCGv_i64 t1 = tcg_temp_new_i64();
2171    TCGv_i64 t2 = tcg_temp_new_i64();
2172
2173    tcg_gen_andi_i64(t1, b, ~0xffffffffull);
2174    tcg_gen_neg_i64(t2, b);
2175    tcg_gen_neg_i64(t1, t1);
2176    tcg_gen_deposit_i64(d, t1, t2, 0, 32);
2177
2178    tcg_temp_free_i64(t1);
2179    tcg_temp_free_i64(t2);
2180}
2181
2182void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs,
2183                      uint32_t oprsz, uint32_t maxsz)
2184{
2185    static const TCGOpcode vecop_list[] = { INDEX_op_neg_vec, 0 };
2186    static const GVecGen2 g[4] = {
2187        { .fni8 = tcg_gen_vec_neg8_i64,
2188          .fniv = tcg_gen_neg_vec,
2189          .fno = gen_helper_gvec_neg8,
2190          .opt_opc = vecop_list,
2191          .vece = MO_8 },
2192        { .fni8 = tcg_gen_vec_neg16_i64,
2193          .fniv = tcg_gen_neg_vec,
2194          .fno = gen_helper_gvec_neg16,
2195          .opt_opc = vecop_list,
2196          .vece = MO_16 },
2197        { .fni4 = tcg_gen_neg_i32,
2198          .fniv = tcg_gen_neg_vec,
2199          .fno = gen_helper_gvec_neg32,
2200          .opt_opc = vecop_list,
2201          .vece = MO_32 },
2202        { .fni8 = tcg_gen_neg_i64,
2203          .fniv = tcg_gen_neg_vec,
2204          .fno = gen_helper_gvec_neg64,
2205          .opt_opc = vecop_list,
2206          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2207          .vece = MO_64 },
2208    };
2209
2210    tcg_debug_assert(vece <= MO_64);
2211    tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
2212}
2213
2214static void gen_absv_mask(TCGv_i64 d, TCGv_i64 b, unsigned vece)
2215{
2216    TCGv_i64 t = tcg_temp_new_i64();
2217    int nbit = 8 << vece;
2218
2219    /* Create -1 for each negative element.  */
2220    tcg_gen_shri_i64(t, b, nbit - 1);
2221    tcg_gen_andi_i64(t, t, dup_const(vece, 1));
2222    tcg_gen_muli_i64(t, t, (1 << nbit) - 1);
2223
2224    /*
2225     * Invert (via xor -1) and add one (via sub -1).
2226     * Because of the ordering the msb is cleared,
2227     * so we never have carry into the next element.
2228     */
2229    tcg_gen_xor_i64(d, b, t);
2230    tcg_gen_sub_i64(d, d, t);
2231
2232    tcg_temp_free_i64(t);
2233}
2234
2235static void tcg_gen_vec_abs8_i64(TCGv_i64 d, TCGv_i64 b)
2236{
2237    gen_absv_mask(d, b, MO_8);
2238}
2239
2240static void tcg_gen_vec_abs16_i64(TCGv_i64 d, TCGv_i64 b)
2241{
2242    gen_absv_mask(d, b, MO_16);
2243}
2244
2245void tcg_gen_gvec_abs(unsigned vece, uint32_t dofs, uint32_t aofs,
2246                      uint32_t oprsz, uint32_t maxsz)
2247{
2248    static const TCGOpcode vecop_list[] = { INDEX_op_abs_vec, 0 };
2249    static const GVecGen2 g[4] = {
2250        { .fni8 = tcg_gen_vec_abs8_i64,
2251          .fniv = tcg_gen_abs_vec,
2252          .fno = gen_helper_gvec_abs8,
2253          .opt_opc = vecop_list,
2254          .vece = MO_8 },
2255        { .fni8 = tcg_gen_vec_abs16_i64,
2256          .fniv = tcg_gen_abs_vec,
2257          .fno = gen_helper_gvec_abs16,
2258          .opt_opc = vecop_list,
2259          .vece = MO_16 },
2260        { .fni4 = tcg_gen_abs_i32,
2261          .fniv = tcg_gen_abs_vec,
2262          .fno = gen_helper_gvec_abs32,
2263          .opt_opc = vecop_list,
2264          .vece = MO_32 },
2265        { .fni8 = tcg_gen_abs_i64,
2266          .fniv = tcg_gen_abs_vec,
2267          .fno = gen_helper_gvec_abs64,
2268          .opt_opc = vecop_list,
2269          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2270          .vece = MO_64 },
2271    };
2272
2273    tcg_debug_assert(vece <= MO_64);
2274    tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
2275}
2276
2277void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs,
2278                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2279{
2280    static const GVecGen3 g = {
2281        .fni8 = tcg_gen_and_i64,
2282        .fniv = tcg_gen_and_vec,
2283        .fno = gen_helper_gvec_and,
2284        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2285    };
2286
2287    if (aofs == bofs) {
2288        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2289    } else {
2290        tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2291    }
2292}
2293
2294void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs,
2295                     uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2296{
2297    static const GVecGen3 g = {
2298        .fni8 = tcg_gen_or_i64,
2299        .fniv = tcg_gen_or_vec,
2300        .fno = gen_helper_gvec_or,
2301        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2302    };
2303
2304    if (aofs == bofs) {
2305        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2306    } else {
2307        tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2308    }
2309}
2310
2311void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs,
2312                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2313{
2314    static const GVecGen3 g = {
2315        .fni8 = tcg_gen_xor_i64,
2316        .fniv = tcg_gen_xor_vec,
2317        .fno = gen_helper_gvec_xor,
2318        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2319    };
2320
2321    if (aofs == bofs) {
2322        tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, 0);
2323    } else {
2324        tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2325    }
2326}
2327
2328void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs,
2329                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2330{
2331    static const GVecGen3 g = {
2332        .fni8 = tcg_gen_andc_i64,
2333        .fniv = tcg_gen_andc_vec,
2334        .fno = gen_helper_gvec_andc,
2335        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2336    };
2337
2338    if (aofs == bofs) {
2339        tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, 0);
2340    } else {
2341        tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2342    }
2343}
2344
2345void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs,
2346                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2347{
2348    static const GVecGen3 g = {
2349        .fni8 = tcg_gen_orc_i64,
2350        .fniv = tcg_gen_orc_vec,
2351        .fno = gen_helper_gvec_orc,
2352        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2353    };
2354
2355    if (aofs == bofs) {
2356        tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, -1);
2357    } else {
2358        tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2359    }
2360}
2361
2362void tcg_gen_gvec_nand(unsigned vece, uint32_t dofs, uint32_t aofs,
2363                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2364{
2365    static const GVecGen3 g = {
2366        .fni8 = tcg_gen_nand_i64,
2367        .fniv = tcg_gen_nand_vec,
2368        .fno = gen_helper_gvec_nand,
2369        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2370    };
2371
2372    if (aofs == bofs) {
2373        tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz);
2374    } else {
2375        tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2376    }
2377}
2378
2379void tcg_gen_gvec_nor(unsigned vece, uint32_t dofs, uint32_t aofs,
2380                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2381{
2382    static const GVecGen3 g = {
2383        .fni8 = tcg_gen_nor_i64,
2384        .fniv = tcg_gen_nor_vec,
2385        .fno = gen_helper_gvec_nor,
2386        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2387    };
2388
2389    if (aofs == bofs) {
2390        tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz);
2391    } else {
2392        tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2393    }
2394}
2395
2396void tcg_gen_gvec_eqv(unsigned vece, uint32_t dofs, uint32_t aofs,
2397                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2398{
2399    static const GVecGen3 g = {
2400        .fni8 = tcg_gen_eqv_i64,
2401        .fniv = tcg_gen_eqv_vec,
2402        .fno = gen_helper_gvec_eqv,
2403        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2404    };
2405
2406    if (aofs == bofs) {
2407        tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, -1);
2408    } else {
2409        tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2410    }
2411}
2412
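/*
 * For the scalar and immediate logical operations below, the operand is
 * first replicated to a full 64-bit pattern (gen_dup_i64 or dup_const),
 * so a single MO_64 description serves every element size.
 */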
2413static const GVecGen2s gop_ands = {
2414    .fni8 = tcg_gen_and_i64,
2415    .fniv = tcg_gen_and_vec,
2416    .fno = gen_helper_gvec_ands,
2417    .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2418    .vece = MO_64
2419};
2420
2421void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs,
2422                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2423{
2424    TCGv_i64 tmp = tcg_temp_new_i64();
2425    gen_dup_i64(vece, tmp, c);
2426    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
2427    tcg_temp_free_i64(tmp);
2428}
2429
2430void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs,
2431                       int64_t c, uint32_t oprsz, uint32_t maxsz)
2432{
2433    TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
2434    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
2435    tcg_temp_free_i64(tmp);
2436}
2437
2438static const GVecGen2s gop_xors = {
2439    .fni8 = tcg_gen_xor_i64,
2440    .fniv = tcg_gen_xor_vec,
2441    .fno = gen_helper_gvec_xors,
2442    .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2443    .vece = MO_64
2444};
2445
2446void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs,
2447                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2448{
2449    TCGv_i64 tmp = tcg_temp_new_i64();
2450    gen_dup_i64(vece, tmp, c);
2451    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
2452    tcg_temp_free_i64(tmp);
2453}
2454
2455void tcg_gen_gvec_xori(unsigned vece, uint32_t dofs, uint32_t aofs,
2456                       int64_t c, uint32_t oprsz, uint32_t maxsz)
2457{
2458    TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
2459    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
2460    tcg_temp_free_i64(tmp);
2461}
2462
2463static const GVecGen2s gop_ors = {
2464    .fni8 = tcg_gen_or_i64,
2465    .fniv = tcg_gen_or_vec,
2466    .fno = gen_helper_gvec_ors,
2467    .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2468    .vece = MO_64
2469};
2470
2471void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs,
2472                      TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2473{
2474    TCGv_i64 tmp = tcg_temp_new_i64();
2475    gen_dup_i64(vece, tmp, c);
2476    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
2477    tcg_temp_free_i64(tmp);
2478}
2479
2480void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs,
2481                      int64_t c, uint32_t oprsz, uint32_t maxsz)
2482{
2483    TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
2484    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
2485    tcg_temp_free_i64(tmp);
2486}
2487
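/* Shift-left for sub-word lanes packed in an i64: shift the whole
   register, then mask away the bits that crossed in from the lane
   below.  */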
2488void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2489{
2490    uint64_t mask = dup_const(MO_8, 0xff << c);
2491    tcg_gen_shli_i64(d, a, c);
2492    tcg_gen_andi_i64(d, d, mask);
2493}
2494
2495void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2496{
2497    uint64_t mask = dup_const(MO_16, 0xffff << c);
2498    tcg_gen_shli_i64(d, a, c);
2499    tcg_gen_andi_i64(d, d, mask);
2500}
2501
2502void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs,
2503                       int64_t shift, uint32_t oprsz, uint32_t maxsz)
2504{
2505    static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 };
2506    static const GVecGen2i g[4] = {
2507        { .fni8 = tcg_gen_vec_shl8i_i64,
2508          .fniv = tcg_gen_shli_vec,
2509          .fno = gen_helper_gvec_shl8i,
2510          .opt_opc = vecop_list,
2511          .vece = MO_8 },
2512        { .fni8 = tcg_gen_vec_shl16i_i64,
2513          .fniv = tcg_gen_shli_vec,
2514          .fno = gen_helper_gvec_shl16i,
2515          .opt_opc = vecop_list,
2516          .vece = MO_16 },
2517        { .fni4 = tcg_gen_shli_i32,
2518          .fniv = tcg_gen_shli_vec,
2519          .fno = gen_helper_gvec_shl32i,
2520          .opt_opc = vecop_list,
2521          .vece = MO_32 },
2522        { .fni8 = tcg_gen_shli_i64,
2523          .fniv = tcg_gen_shli_vec,
2524          .fno = gen_helper_gvec_shl64i,
2525          .opt_opc = vecop_list,
2526          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2527          .vece = MO_64 },
2528    };
2529
2530    tcg_debug_assert(vece <= MO_64);
2531    tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2532    if (shift == 0) {
2533        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2534    } else {
2535        tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2536    }
2537}
2538
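/* Likewise for shift-right: shift the whole register, then mask away
   the bits that crossed in from the lane above.  */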
2539void tcg_gen_vec_shr8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2540{
2541    uint64_t mask = dup_const(MO_8, 0xff >> c);
2542    tcg_gen_shri_i64(d, a, c);
2543    tcg_gen_andi_i64(d, d, mask);
2544}
2545
2546void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2547{
2548    uint64_t mask = dup_const(MO_16, 0xffff >> c);
2549    tcg_gen_shri_i64(d, a, c);
2550    tcg_gen_andi_i64(d, d, mask);
2551}
2552
2553void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs,
2554                       int64_t shift, uint32_t oprsz, uint32_t maxsz)
2555{
2556    static const TCGOpcode vecop_list[] = { INDEX_op_shri_vec, 0 };
2557    static const GVecGen2i g[4] = {
2558        { .fni8 = tcg_gen_vec_shr8i_i64,
2559          .fniv = tcg_gen_shri_vec,
2560          .fno = gen_helper_gvec_shr8i,
2561          .opt_opc = vecop_list,
2562          .vece = MO_8 },
2563        { .fni8 = tcg_gen_vec_shr16i_i64,
2564          .fniv = tcg_gen_shri_vec,
2565          .fno = gen_helper_gvec_shr16i,
2566          .opt_opc = vecop_list,
2567          .vece = MO_16 },
2568        { .fni4 = tcg_gen_shri_i32,
2569          .fniv = tcg_gen_shri_vec,
2570          .fno = gen_helper_gvec_shr32i,
2571          .opt_opc = vecop_list,
2572          .vece = MO_32 },
2573        { .fni8 = tcg_gen_shri_i64,
2574          .fniv = tcg_gen_shri_vec,
2575          .fno = gen_helper_gvec_shr64i,
2576          .opt_opc = vecop_list,
2577          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2578          .vece = MO_64 },
2579    };
2580
2581    tcg_debug_assert(vece <= MO_64);
2582    tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2583    if (shift == 0) {
2584        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2585    } else {
2586        tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2587    }
2588}
2589
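/* Arithmetic shift-right for sub-word lanes: do a logical shift of the
   whole register, isolate each lane's shifted-down sign bit, and
   multiply it by a run of ones to smear it across the vacated high
   bits.  For MO_8 with c = 3, the sign bit lands at bit 4 of each lane
   and multiplying by (2 << 3) - 2 = 0b1110 turns it into bits 7:5.  */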
2590void tcg_gen_vec_sar8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2591{
2592    uint64_t s_mask = dup_const(MO_8, 0x80 >> c);
2593    uint64_t c_mask = dup_const(MO_8, 0xff >> c);
2594    TCGv_i64 s = tcg_temp_new_i64();
2595
2596    tcg_gen_shri_i64(d, a, c);
2597    tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
2598    tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
2599    tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign  */
2600    tcg_gen_or_i64(d, d, s);         /* include sign extension */
2601    tcg_temp_free_i64(s);
2602}
2603
2604void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2605{
2606    uint64_t s_mask = dup_const(MO_16, 0x8000 >> c);
2607    uint64_t c_mask = dup_const(MO_16, 0xffff >> c);
2608    TCGv_i64 s = tcg_temp_new_i64();
2609
2610    tcg_gen_shri_i64(d, a, c);
2611    tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
2612    tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign  */
2613    tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
2614    tcg_gen_or_i64(d, d, s);         /* include sign extension */
2615    tcg_temp_free_i64(s);
2616}
2617
2618void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs,
2619                       int64_t shift, uint32_t oprsz, uint32_t maxsz)
2620{
2621    static const TCGOpcode vecop_list[] = { INDEX_op_sari_vec, 0 };
2622    static const GVecGen2i g[4] = {
2623        { .fni8 = tcg_gen_vec_sar8i_i64,
2624          .fniv = tcg_gen_sari_vec,
2625          .fno = gen_helper_gvec_sar8i,
2626          .opt_opc = vecop_list,
2627          .vece = MO_8 },
2628        { .fni8 = tcg_gen_vec_sar16i_i64,
2629          .fniv = tcg_gen_sari_vec,
2630          .fno = gen_helper_gvec_sar16i,
2631          .opt_opc = vecop_list,
2632          .vece = MO_16 },
2633        { .fni4 = tcg_gen_sari_i32,
2634          .fniv = tcg_gen_sari_vec,
2635          .fno = gen_helper_gvec_sar32i,
2636          .opt_opc = vecop_list,
2637          .vece = MO_32 },
2638        { .fni8 = tcg_gen_sari_i64,
2639          .fniv = tcg_gen_sari_vec,
2640          .fno = gen_helper_gvec_sar64i,
2641          .opt_opc = vecop_list,
2642          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2643          .vece = MO_64 },
2644    };
2645
2646    tcg_debug_assert(vece <= MO_64);
2647    tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2648    if (shift == 0) {
2649        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2650    } else {
2651        tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2652    }
2653}
2654
2655/*
2656 * Specialized generation of vector shifts by a non-constant scalar.
2657 */
2658
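/*
 * Expander description for the shift-by-scalar operations below:
 * fni4/fni8 are the integer fallbacks, fniv_s uses the host's
 * shift-vector-by-i32-scalar opcode, fniv_v uses the shift-by-vector
 * opcode with the scalar broadcast into a vector, fno are the
 * out-of-line helpers indexed by vece, and s_list/v_list name the
 * opcodes the two vector strategies require.
 */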
2659typedef struct {
2660    void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32);
2661    void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64);
2662    void (*fniv_s)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32);
2663    void (*fniv_v)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec);
2664    gen_helper_gvec_2 *fno[4];
2665    TCGOpcode s_list[2];
2666    TCGOpcode v_list[2];
2667} GVecGen2sh;
2668
2669static void expand_2sh_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
2670                           uint32_t oprsz, uint32_t tysz, TCGType type,
2671                           TCGv_i32 shift,
2672                           void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32))
2673{
2674    TCGv_vec t0 = tcg_temp_new_vec(type);
2675    uint32_t i;
2676
2677    for (i = 0; i < oprsz; i += tysz) {
2678        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
2679        fni(vece, t0, t0, shift);
2680        tcg_gen_st_vec(t0, cpu_env, dofs + i);
2681    }
2682    tcg_temp_free_vec(t0);
2683}
2684
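/*
 * Expand a shift by a non-constant i32 scalar.  Try, in order: the
 * host's scalar-shift vector opcode; the variable-shift vector opcode
 * with the scalar duplicated into a vector; the plain i32/i64 integer
 * expansion; and finally an out-of-line helper, with the shift count
 * passed in the descriptor's data field.
 */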
2685static void
2686do_gvec_shifts(unsigned vece, uint32_t dofs, uint32_t aofs, TCGv_i32 shift,
2687               uint32_t oprsz, uint32_t maxsz, const GVecGen2sh *g)
2688{
2689    TCGType type;
2690    uint32_t some;
2691
2692    check_size_align(oprsz, maxsz, dofs | aofs);
2693    check_overlap_2(dofs, aofs, maxsz);
2694
2695    /* If the backend has a scalar expansion, great.  */
2696    type = choose_vector_type(g->s_list, vece, oprsz, vece == MO_64);
2697    if (type) {
2698        const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);
2699        switch (type) {
2700        case TCG_TYPE_V256:
2701            some = QEMU_ALIGN_DOWN(oprsz, 32);
2702            expand_2sh_vec(vece, dofs, aofs, some, 32,
2703                           TCG_TYPE_V256, shift, g->fniv_s);
2704            if (some == oprsz) {
2705                break;
2706            }
2707            dofs += some;
2708            aofs += some;
2709            oprsz -= some;
2710            maxsz -= some;
2711            /* fallthru */
2712        case TCG_TYPE_V128:
2713            expand_2sh_vec(vece, dofs, aofs, oprsz, 16,
2714                           TCG_TYPE_V128, shift, g->fniv_s);
2715            break;
2716        case TCG_TYPE_V64:
2717            expand_2sh_vec(vece, dofs, aofs, oprsz, 8,
2718                           TCG_TYPE_V64, shift, g->fniv_s);
2719            break;
2720        default:
2721            g_assert_not_reached();
2722        }
2723        tcg_swap_vecop_list(hold_list);
2724        goto clear_tail;
2725    }
2726
2727    /* If the backend supports variable vector shifts, also cool.  */
2728    type = choose_vector_type(g->v_list, vece, oprsz, vece == MO_64);
2729    if (type) {
2730        const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);
2731        TCGv_vec v_shift = tcg_temp_new_vec(type);
2732
2733        if (vece == MO_64) {
2734            TCGv_i64 sh64 = tcg_temp_new_i64();
2735            tcg_gen_extu_i32_i64(sh64, shift);
2736            tcg_gen_dup_i64_vec(MO_64, v_shift, sh64);
2737            tcg_temp_free_i64(sh64);
2738        } else {
2739            tcg_gen_dup_i32_vec(vece, v_shift, shift);
2740        }
2741
2742        switch (type) {
2743        case TCG_TYPE_V256:
2744            some = QEMU_ALIGN_DOWN(oprsz, 32);
2745            expand_2s_vec(vece, dofs, aofs, some, 32, TCG_TYPE_V256,
2746                          v_shift, false, g->fniv_v);
2747            if (some == oprsz) {
2748                break;
2749            }
2750            dofs += some;
2751            aofs += some;
2752            oprsz -= some;
2753            maxsz -= some;
2754            /* fallthru */
2755        case TCG_TYPE_V128:
2756            expand_2s_vec(vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
2757                          v_shift, false, g->fniv_v);
2758            break;
2759        case TCG_TYPE_V64:
2760            expand_2s_vec(vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
2761                          v_shift, false, g->fniv_v);
2762            break;
2763        default:
2764            g_assert_not_reached();
2765        }
2766        tcg_temp_free_vec(v_shift);
2767        tcg_swap_vecop_list(hold_list);
2768        goto clear_tail;
2769    }
2770
2771    /* Otherwise fall back to integral... */
2772    if (vece == MO_32 && check_size_impl(oprsz, 4)) {
2773        expand_2s_i32(dofs, aofs, oprsz, shift, false, g->fni4);
2774    } else if (vece == MO_64 && check_size_impl(oprsz, 8)) {
2775        TCGv_i64 sh64 = tcg_temp_new_i64();
2776        tcg_gen_extu_i32_i64(sh64, shift);
2777        expand_2s_i64(dofs, aofs, oprsz, sh64, false, g->fni8);
2778        tcg_temp_free_i64(sh64);
2779    } else {
2780        TCGv_ptr a0 = tcg_temp_new_ptr();
2781        TCGv_ptr a1 = tcg_temp_new_ptr();
2782        TCGv_i32 desc = tcg_temp_new_i32();
2783
2784        tcg_gen_shli_i32(desc, shift, SIMD_DATA_SHIFT);
2785        tcg_gen_ori_i32(desc, desc, simd_desc(oprsz, maxsz, 0));
2786        tcg_gen_addi_ptr(a0, cpu_env, dofs);
2787        tcg_gen_addi_ptr(a1, cpu_env, aofs);
2788
2789        g->fno[vece](a0, a1, desc);
2790
2791        tcg_temp_free_ptr(a0);
2792        tcg_temp_free_ptr(a1);
2793        tcg_temp_free_i32(desc);
2794        return;
2795    }
2796
2797 clear_tail:
2798    if (oprsz < maxsz) {
2799        expand_clr(dofs + oprsz, maxsz - oprsz);
2800    }
2801}
2802
2803void tcg_gen_gvec_shls(unsigned vece, uint32_t dofs, uint32_t aofs,
2804                       TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
2805{
2806    static const GVecGen2sh g = {
2807        .fni4 = tcg_gen_shl_i32,
2808        .fni8 = tcg_gen_shl_i64,
2809        .fniv_s = tcg_gen_shls_vec,
2810        .fniv_v = tcg_gen_shlv_vec,
2811        .fno = {
2812            gen_helper_gvec_shl8i,
2813            gen_helper_gvec_shl16i,
2814            gen_helper_gvec_shl32i,
2815            gen_helper_gvec_shl64i,
2816        },
2817        .s_list = { INDEX_op_shls_vec, 0 },
2818        .v_list = { INDEX_op_shlv_vec, 0 },
2819    };
2820
2821    tcg_debug_assert(vece <= MO_64);
2822    do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
2823}
2824
2825void tcg_gen_gvec_shrs(unsigned vece, uint32_t dofs, uint32_t aofs,
2826                       TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
2827{
2828    static const GVecGen2sh g = {
2829        .fni4 = tcg_gen_shr_i32,
2830        .fni8 = tcg_gen_shr_i64,
2831        .fniv_s = tcg_gen_shrs_vec,
2832        .fniv_v = tcg_gen_shrv_vec,
2833        .fno = {
2834            gen_helper_gvec_shr8i,
2835            gen_helper_gvec_shr16i,
2836            gen_helper_gvec_shr32i,
2837            gen_helper_gvec_shr64i,
2838        },
2839        .s_list = { INDEX_op_shrs_vec, 0 },
2840        .v_list = { INDEX_op_shrv_vec, 0 },
2841    };
2842
2843    tcg_debug_assert(vece <= MO_64);
2844    do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
2845}
2846
2847void tcg_gen_gvec_sars(unsigned vece, uint32_t dofs, uint32_t aofs,
2848                       TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
2849{
2850    static const GVecGen2sh g = {
2851        .fni4 = tcg_gen_sar_i32,
2852        .fni8 = tcg_gen_sar_i64,
2853        .fniv_s = tcg_gen_sars_vec,
2854        .fniv_v = tcg_gen_sarv_vec,
2855        .fno = {
2856            gen_helper_gvec_sar8i,
2857            gen_helper_gvec_sar16i,
2858            gen_helper_gvec_sar32i,
2859            gen_helper_gvec_sar64i,
2860        },
2861        .s_list = { INDEX_op_sars_vec, 0 },
2862        .v_list = { INDEX_op_sarv_vec, 0 },
2863    };
2864
2865    tcg_debug_assert(vece <= MO_64);
2866    do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
2867}
2868
2869/*
2870 * Expand D = A << (B % element bits)
2871 *
2872 * Unlike scalar shifts, where the target front end can easily fold
2873 * the modulo into its expansion, vector shifts apply the modulo here.
2874 * If the target naturally includes the modulo as part of the
2875 * operation, great!  If the target has some other behaviour for
2876 * out-of-range shifts, then it could not use this function anyway,
2877 * and would need to do its own expansion with custom functions.
2878 */
2879static void tcg_gen_shlv_mod_vec(unsigned vece, TCGv_vec d,
2880                                 TCGv_vec a, TCGv_vec b)
2881{
2882    TCGv_vec t = tcg_temp_new_vec_matching(d);
2883
2884    tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
2885    tcg_gen_and_vec(vece, t, t, b);
2886    tcg_gen_shlv_vec(vece, d, a, t);
2887    tcg_temp_free_vec(t);
2888}
2889
2890static void tcg_gen_shl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2891{
2892    TCGv_i32 t = tcg_temp_new_i32();
2893
2894    tcg_gen_andi_i32(t, b, 31);
2895    tcg_gen_shl_i32(d, a, t);
2896    tcg_temp_free_i32(t);
2897}
2898
2899static void tcg_gen_shl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2900{
2901    TCGv_i64 t = tcg_temp_new_i64();
2902
2903    tcg_gen_andi_i64(t, b, 63);
2904    tcg_gen_shl_i64(d, a, t);
2905    tcg_temp_free_i64(t);
2906}
2907
2908void tcg_gen_gvec_shlv(unsigned vece, uint32_t dofs, uint32_t aofs,
2909                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2910{
2911    static const TCGOpcode vecop_list[] = { INDEX_op_shlv_vec, 0 };
2912    static const GVecGen3 g[4] = {
2913        { .fniv = tcg_gen_shlv_mod_vec,
2914          .fno = gen_helper_gvec_shl8v,
2915          .opt_opc = vecop_list,
2916          .vece = MO_8 },
2917        { .fniv = tcg_gen_shlv_mod_vec,
2918          .fno = gen_helper_gvec_shl16v,
2919          .opt_opc = vecop_list,
2920          .vece = MO_16 },
2921        { .fni4 = tcg_gen_shl_mod_i32,
2922          .fniv = tcg_gen_shlv_mod_vec,
2923          .fno = gen_helper_gvec_shl32v,
2924          .opt_opc = vecop_list,
2925          .vece = MO_32 },
2926        { .fni8 = tcg_gen_shl_mod_i64,
2927          .fniv = tcg_gen_shlv_mod_vec,
2928          .fno = gen_helper_gvec_shl64v,
2929          .opt_opc = vecop_list,
2930          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2931          .vece = MO_64 },
2932    };
2933
2934    tcg_debug_assert(vece <= MO_64);
2935    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2936}
2937
2938/*
2939 * Similarly for logical right shifts.
2940 */
2941
2942static void tcg_gen_shrv_mod_vec(unsigned vece, TCGv_vec d,
2943                                 TCGv_vec a, TCGv_vec b)
2944{
2945    TCGv_vec t = tcg_temp_new_vec_matching(d);
2946
2947    tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
2948    tcg_gen_and_vec(vece, t, t, b);
2949    tcg_gen_shrv_vec(vece, d, a, t);
2950    tcg_temp_free_vec(t);
2951}
2952
2953static void tcg_gen_shr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2954{
2955    TCGv_i32 t = tcg_temp_new_i32();
2956
2957    tcg_gen_andi_i32(t, b, 31);
2958    tcg_gen_shr_i32(d, a, t);
2959    tcg_temp_free_i32(t);
2960}
2961
2962static void tcg_gen_shr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2963{
2964    TCGv_i64 t = tcg_temp_new_i64();
2965
2966    tcg_gen_andi_i64(t, b, 63);
2967    tcg_gen_shr_i64(d, a, t);
2968    tcg_temp_free_i64(t);
2969}
2970
2971void tcg_gen_gvec_shrv(unsigned vece, uint32_t dofs, uint32_t aofs,
2972                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2973{
2974    static const TCGOpcode vecop_list[] = { INDEX_op_shrv_vec, 0 };
2975    static const GVecGen3 g[4] = {
2976        { .fniv = tcg_gen_shrv_mod_vec,
2977          .fno = gen_helper_gvec_shr8v,
2978          .opt_opc = vecop_list,
2979          .vece = MO_8 },
2980        { .fniv = tcg_gen_shrv_mod_vec,
2981          .fno = gen_helper_gvec_shr16v,
2982          .opt_opc = vecop_list,
2983          .vece = MO_16 },
2984        { .fni4 = tcg_gen_shr_mod_i32,
2985          .fniv = tcg_gen_shrv_mod_vec,
2986          .fno = gen_helper_gvec_shr32v,
2987          .opt_opc = vecop_list,
2988          .vece = MO_32 },
2989        { .fni8 = tcg_gen_shr_mod_i64,
2990          .fniv = tcg_gen_shrv_mod_vec,
2991          .fno = gen_helper_gvec_shr64v,
2992          .opt_opc = vecop_list,
2993          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2994          .vece = MO_64 },
2995    };
2996
2997    tcg_debug_assert(vece <= MO_64);
2998    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2999}
3000
3001/*
3002 * Similarly for arithmetic right shifts.
3003 */
3004
3005static void tcg_gen_sarv_mod_vec(unsigned vece, TCGv_vec d,
3006                                 TCGv_vec a, TCGv_vec b)
3007{
3008    TCGv_vec t = tcg_temp_new_vec_matching(d);
3009
3010    tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
3011    tcg_gen_and_vec(vece, t, t, b);
3012    tcg_gen_sarv_vec(vece, d, a, t);
3013    tcg_temp_free_vec(t);
3014}
3015
3016static void tcg_gen_sar_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3017{
3018    TCGv_i32 t = tcg_temp_new_i32();
3019
3020    tcg_gen_andi_i32(t, b, 31);
3021    tcg_gen_sar_i32(d, a, t);
3022    tcg_temp_free_i32(t);
3023}
3024
3025static void tcg_gen_sar_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3026{
3027    TCGv_i64 t = tcg_temp_new_i64();
3028
3029    tcg_gen_andi_i64(t, b, 63);
3030    tcg_gen_sar_i64(d, a, t);
3031    tcg_temp_free_i64(t);
3032}
3033
3034void tcg_gen_gvec_sarv(unsigned vece, uint32_t dofs, uint32_t aofs,
3035                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3036{
3037    static const TCGOpcode vecop_list[] = { INDEX_op_sarv_vec, 0 };
3038    static const GVecGen3 g[4] = {
3039        { .fniv = tcg_gen_sarv_mod_vec,
3040          .fno = gen_helper_gvec_sar8v,
3041          .opt_opc = vecop_list,
3042          .vece = MO_8 },
3043        { .fniv = tcg_gen_sarv_mod_vec,
3044          .fno = gen_helper_gvec_sar16v,
3045          .opt_opc = vecop_list,
3046          .vece = MO_16 },
3047        { .fni4 = tcg_gen_sar_mod_i32,
3048          .fniv = tcg_gen_sarv_mod_vec,
3049          .fno = gen_helper_gvec_sar32v,
3050          .opt_opc = vecop_list,
3051          .vece = MO_32 },
3052        { .fni8 = tcg_gen_sar_mod_i64,
3053          .fniv = tcg_gen_sarv_mod_vec,
3054          .fno = gen_helper_gvec_sar64v,
3055          .opt_opc = vecop_list,
3056          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3057          .vece = MO_64 },
3058    };
3059
3060    tcg_debug_assert(vece <= MO_64);
3061    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3062}
3063
3064/* Expand OPSZ bytes worth of three-operand operations using i32 elements.  */
3065static void expand_cmp_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
3066                           uint32_t oprsz, TCGCond cond)
3067{
3068    TCGv_i32 t0 = tcg_temp_new_i32();
3069    TCGv_i32 t1 = tcg_temp_new_i32();
3070    uint32_t i;
3071
3072    for (i = 0; i < oprsz; i += 4) {
3073        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
3074        tcg_gen_ld_i32(t1, cpu_env, bofs + i);
3075        tcg_gen_setcond_i32(cond, t0, t0, t1);
3076        tcg_gen_neg_i32(t0, t0);
3077        tcg_gen_st_i32(t0, cpu_env, dofs + i);
3078    }
3079    tcg_temp_free_i32(t1);
3080    tcg_temp_free_i32(t0);
3081}
3082
3083static void expand_cmp_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
3084                           uint32_t oprsz, TCGCond cond)
3085{
3086    TCGv_i64 t0 = tcg_temp_new_i64();
3087    TCGv_i64 t1 = tcg_temp_new_i64();
3088    uint32_t i;
3089
3090    for (i = 0; i < oprsz; i += 8) {
3091        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
3092        tcg_gen_ld_i64(t1, cpu_env, bofs + i);
3093        tcg_gen_setcond_i64(cond, t0, t0, t1);
3094        tcg_gen_neg_i64(t0, t0);
3095        tcg_gen_st_i64(t0, cpu_env, dofs + i);
3096    }
3097    tcg_temp_free_i64(t1);
3098    tcg_temp_free_i64(t0);
3099}
3100
3101static void expand_cmp_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
3102                           uint32_t bofs, uint32_t oprsz, uint32_t tysz,
3103                           TCGType type, TCGCond cond)
3104{
3105    TCGv_vec t0 = tcg_temp_new_vec(type);
3106    TCGv_vec t1 = tcg_temp_new_vec(type);
3107    uint32_t i;
3108
3109    for (i = 0; i < oprsz; i += tysz) {
3110        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
3111        tcg_gen_ld_vec(t1, cpu_env, bofs + i);
3112        tcg_gen_cmp_vec(cond, vece, t0, t0, t1);
3113        tcg_gen_st_vec(t0, cpu_env, dofs + i);
3114    }
3115    tcg_temp_free_vec(t1);
3116    tcg_temp_free_vec(t0);
3117}
3118
3119void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs,
3120                      uint32_t aofs, uint32_t bofs,
3121                      uint32_t oprsz, uint32_t maxsz)
3122{
3123    static const TCGOpcode cmp_list[] = { INDEX_op_cmp_vec, 0 };
3124    static gen_helper_gvec_3 * const eq_fn[4] = {
3125        gen_helper_gvec_eq8, gen_helper_gvec_eq16,
3126        gen_helper_gvec_eq32, gen_helper_gvec_eq64
3127    };
3128    static gen_helper_gvec_3 * const ne_fn[4] = {
3129        gen_helper_gvec_ne8, gen_helper_gvec_ne16,
3130        gen_helper_gvec_ne32, gen_helper_gvec_ne64
3131    };
3132    static gen_helper_gvec_3 * const lt_fn[4] = {
3133        gen_helper_gvec_lt8, gen_helper_gvec_lt16,
3134        gen_helper_gvec_lt32, gen_helper_gvec_lt64
3135    };
3136    static gen_helper_gvec_3 * const le_fn[4] = {
3137        gen_helper_gvec_le8, gen_helper_gvec_le16,
3138        gen_helper_gvec_le32, gen_helper_gvec_le64
3139    };
3140    static gen_helper_gvec_3 * const ltu_fn[4] = {
3141        gen_helper_gvec_ltu8, gen_helper_gvec_ltu16,
3142        gen_helper_gvec_ltu32, gen_helper_gvec_ltu64
3143    };
3144    static gen_helper_gvec_3 * const leu_fn[4] = {
3145        gen_helper_gvec_leu8, gen_helper_gvec_leu16,
3146        gen_helper_gvec_leu32, gen_helper_gvec_leu64
3147    };
3148    static gen_helper_gvec_3 * const * const fns[16] = {
3149        [TCG_COND_EQ] = eq_fn,
3150        [TCG_COND_NE] = ne_fn,
3151        [TCG_COND_LT] = lt_fn,
3152        [TCG_COND_LE] = le_fn,
3153        [TCG_COND_LTU] = ltu_fn,
3154        [TCG_COND_LEU] = leu_fn,
3155    };
3156
3157    const TCGOpcode *hold_list;
3158    TCGType type;
3159    uint32_t some;
3160
3161    check_size_align(oprsz, maxsz, dofs | aofs | bofs);
3162    check_overlap_3(dofs, aofs, bofs, maxsz);
3163
3164    if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) {
3165        do_dup(MO_8, dofs, oprsz, maxsz,
3166               NULL, NULL, -(cond == TCG_COND_ALWAYS));
3167        return;
3168    }
3169
3170    /*
3171     * Implement inline with a vector type, if possible.
3172     * Prefer integer when 64-bit host and 64-bit comparison.
3173     */
3174    hold_list = tcg_swap_vecop_list(cmp_list);
3175    type = choose_vector_type(cmp_list, vece, oprsz,
3176                              TCG_TARGET_REG_BITS == 64 && vece == MO_64);
3177    switch (type) {
3178    case TCG_TYPE_V256:
3179        /* Recall that ARM SVE allows vector sizes that are not a
3180         * power of 2, but always a multiple of 16.  The intent is
3181         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
3182         */
3183        some = QEMU_ALIGN_DOWN(oprsz, 32);
3184        expand_cmp_vec(vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, cond);
3185        if (some == oprsz) {
3186            break;
3187        }
3188        dofs += some;
3189        aofs += some;
3190        bofs += some;
3191        oprsz -= some;
3192        maxsz -= some;
3193        /* fallthru */
3194    case TCG_TYPE_V128:
3195        expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, cond);
3196        break;
3197    case TCG_TYPE_V64:
3198        expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, cond);
3199        break;
3200
3201    case 0:
3202        if (vece == MO_64 && check_size_impl(oprsz, 8)) {
3203            expand_cmp_i64(dofs, aofs, bofs, oprsz, cond);
3204        } else if (vece == MO_32 && check_size_impl(oprsz, 4)) {
3205            expand_cmp_i32(dofs, aofs, bofs, oprsz, cond);
3206        } else {
3207            gen_helper_gvec_3 * const *fn = fns[cond];
3208
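            /*
             * Out-of-line helpers exist only for EQ, NE, LT, LE, LTU
             * and LEU; for the remaining conditions, swap the operands
             * and the condition (e.g. GT becomes LT).
             */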
3209            if (fn == NULL) {
3210                uint32_t tmp;
3211                tmp = aofs, aofs = bofs, bofs = tmp;
3212                cond = tcg_swap_cond(cond);
3213                fn = fns[cond];
3214                assert(fn != NULL);
3215            }
3216            tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, 0, fn[vece]);
3217            oprsz = maxsz;
3218        }
3219        break;
3220
3221    default:
3222        g_assert_not_reached();
3223    }
3224    tcg_swap_vecop_list(hold_list);
3225
3226    if (oprsz < maxsz) {
3227        expand_clr(dofs + oprsz, maxsz - oprsz);
3228    }
3229}
3230
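/* Bitwise select: d = (b & a) | (c & ~a), i.e. take each bit from b
   where a is 1 and from c where a is 0.  */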
3231static void tcg_gen_bitsel_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 c)
3232{
3233    TCGv_i64 t = tcg_temp_new_i64();
3234
3235    tcg_gen_and_i64(t, b, a);
3236    tcg_gen_andc_i64(d, c, a);
3237    tcg_gen_or_i64(d, d, t);
3238    tcg_temp_free_i64(t);
3239}
3240
3241void tcg_gen_gvec_bitsel(unsigned vece, uint32_t dofs, uint32_t aofs,
3242                         uint32_t bofs, uint32_t cofs,
3243                         uint32_t oprsz, uint32_t maxsz)
3244{
3245    static const GVecGen4 g = {
3246        .fni8 = tcg_gen_bitsel_i64,
3247        .fniv = tcg_gen_bitsel_vec,
3248        .fno = gen_helper_gvec_bitsel,
3249    };
3250
3251    tcg_gen_gvec_4(dofs, aofs, bofs, cofs, oprsz, maxsz, &g);
3252}
3253