qemu/tcg/tcg-op-gvec.c
/*
 * Generic vector operation expansion
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "qemu-common.h"
#include "tcg.h"
#include "tcg-op.h"
#include "tcg-op-gvec.h"
#include "tcg-gvec-desc.h"

#define MAX_UNROLL  4

/* Verify vector size and alignment rules.  OFS should be the OR of all
   of the operand offsets so that we can check them all at once.  */
static void check_size_align(uint32_t oprsz, uint32_t maxsz, uint32_t ofs)
{
    uint32_t opr_align = oprsz >= 16 ? 15 : 7;
    uint32_t max_align = maxsz >= 16 || oprsz >= 16 ? 15 : 7;
    tcg_debug_assert(oprsz > 0);
    tcg_debug_assert(oprsz <= maxsz);
    tcg_debug_assert((oprsz & opr_align) == 0);
    tcg_debug_assert((maxsz & max_align) == 0);
    tcg_debug_assert((ofs & max_align) == 0);
}
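
/* As an illustration of the rules above: OPRSZ must be a multiple of 8,
 * and a multiple of 16 once it reaches 16; MAXSZ and every operand
 * offset must satisfy the larger of the two alignments.  So oprsz = 8,
 * maxsz = 8 and oprsz = 32, maxsz = 64 pass, while oprsz = 24,
 * maxsz = 32 fails (24 is not a multiple of 16).
 */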

/* Verify vector overlap rules for two operands.  */
static void check_overlap_2(uint32_t d, uint32_t a, uint32_t s)
{
    tcg_debug_assert(d == a || d + s <= a || a + s <= d);
}

/* Verify vector overlap rules for three operands.  */
static void check_overlap_3(uint32_t d, uint32_t a, uint32_t b, uint32_t s)
{
    check_overlap_2(d, a, s);
    check_overlap_2(d, b, s);
    check_overlap_2(a, b, s);
}

/* Verify vector overlap rules for four operands.  */
static void check_overlap_4(uint32_t d, uint32_t a, uint32_t b,
                            uint32_t c, uint32_t s)
{
    check_overlap_2(d, a, s);
    check_overlap_2(d, b, s);
    check_overlap_2(d, c, s);
    check_overlap_2(a, b, s);
    check_overlap_2(a, c, s);
    check_overlap_2(b, c, s);
}

/* Create a descriptor from components.  */
uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data)
{
    uint32_t desc = 0;

    assert(oprsz % 8 == 0 && oprsz <= (8 << SIMD_OPRSZ_BITS));
    assert(maxsz % 8 == 0 && maxsz <= (8 << SIMD_MAXSZ_BITS));
    assert(data == sextract32(data, 0, SIMD_DATA_BITS));

    oprsz = (oprsz / 8) - 1;
    maxsz = (maxsz / 8) - 1;
    desc = deposit32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS, oprsz);
    desc = deposit32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS, maxsz);
    desc = deposit32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS, data);

    return desc;
}
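
/* For example, simd_desc(16, 16, 0) stores 16 / 8 - 1 = 1 in both size
 * fields; the helper side recovers the byte sizes as (field + 1) * 8
 * via simd_oprsz() and simd_maxsz().
 */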

/* Generate a call to a gvec-style helper with two vector operands.  */
void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
                        uint32_t oprsz, uint32_t maxsz, int32_t data,
                        gen_helper_gvec_2 *fn)
{
    TCGv_ptr a0, a1;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);

    fn(a0, a1, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with two vector operands
   and one scalar operand.  */
void tcg_gen_gvec_2i_ool(uint32_t dofs, uint32_t aofs, TCGv_i64 c,
                         uint32_t oprsz, uint32_t maxsz, int32_t data,
                         gen_helper_gvec_2i *fn)
{
    TCGv_ptr a0, a1;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);

    fn(a0, a1, c, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with three vector operands.  */
void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t oprsz, uint32_t maxsz, int32_t data,
                        gen_helper_gvec_3 *fn)
{
    TCGv_ptr a0, a1, a2;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);

    fn(a0, a1, a2, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with four vector operands.  */
void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_4 *fn)
{
    TCGv_ptr a0, a1, a2, a3;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();
    a3 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);
    tcg_gen_addi_ptr(a3, cpu_env, cofs);

    fn(a0, a1, a2, a3, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with five vector operands.  */
void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, uint32_t xofs, uint32_t oprsz,
                        uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn)
{
    TCGv_ptr a0, a1, a2, a3, a4;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();
    a3 = tcg_temp_new_ptr();
    a4 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);
    tcg_gen_addi_ptr(a3, cpu_env, cofs);
    tcg_gen_addi_ptr(a4, cpu_env, xofs);

    fn(a0, a1, a2, a3, a4, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_ptr(a4);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with two vector operands
   and an extra pointer operand.  */
void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_2_ptr *fn)
{
    TCGv_ptr a0, a1;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);

    fn(a0, a1, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with three vector operands
   and an extra pointer operand.  */
void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_3_ptr *fn)
{
    TCGv_ptr a0, a1, a2;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);

    fn(a0, a1, a2, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with four vector operands
   and an extra pointer operand.  */
void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz,
                        uint32_t maxsz, int32_t data,
                        gen_helper_gvec_4_ptr *fn)
{
    TCGv_ptr a0, a1, a2, a3;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();
    a3 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);
    tcg_gen_addi_ptr(a3, cpu_env, cofs);

    fn(a0, a1, a2, a3, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_i32(desc);
}

/* Return true if we want to implement something of OPRSZ bytes
   in units of LNSZ.  This limits the expansion of inline code.  */
static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
{
    if (oprsz % lnsz == 0) {
        uint32_t lnct = oprsz / lnsz;
        return lnct >= 1 && lnct <= MAX_UNROLL;
    }
    return false;
}
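
/* For example, oprsz == 32 with lnsz == 8 yields 4 lines, the most we
 * will unroll inline; oprsz == 40 with lnsz == 8 yields 5, so callers
 * fall back to another element size or an out-of-line helper.
 */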

static void expand_clr(uint32_t dofs, uint32_t maxsz);

/* Duplicate C as per VECE.  */
uint64_t (dup_const)(unsigned vece, uint64_t c)
{
    switch (vece) {
    case MO_8:
        return 0x0101010101010101ull * (uint8_t)c;
    case MO_16:
        return 0x0001000100010001ull * (uint16_t)c;
    case MO_32:
        return 0x0000000100000001ull * (uint32_t)c;
    case MO_64:
        return c;
    default:
        g_assert_not_reached();
    }
}
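
/* For example, dup_const(MO_8, 0x2a) yields 0x2a2a2a2a2a2a2a2aull and
 * dup_const(MO_16, 0x1234) yields 0x1234123412341234ull.
 */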

/* Duplicate IN into OUT as per VECE.  */
static void gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in)
{
    switch (vece) {
    case MO_8:
        tcg_gen_ext8u_i32(out, in);
        tcg_gen_muli_i32(out, out, 0x01010101);
        break;
    case MO_16:
        tcg_gen_deposit_i32(out, in, in, 16, 16);
        break;
    case MO_32:
        tcg_gen_mov_i32(out, in);
        break;
    default:
        g_assert_not_reached();
    }
}

static void gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in)
{
    switch (vece) {
    case MO_8:
        tcg_gen_ext8u_i64(out, in);
        tcg_gen_muli_i64(out, out, 0x0101010101010101ull);
        break;
    case MO_16:
        tcg_gen_ext16u_i64(out, in);
        tcg_gen_muli_i64(out, out, 0x0001000100010001ull);
        break;
    case MO_32:
        tcg_gen_deposit_i64(out, in, in, 32, 32);
        break;
    case MO_64:
        tcg_gen_mov_i64(out, in);
        break;
    default:
        g_assert_not_reached();
    }
}

/* Select a supported vector type for implementing an operation on SIZE
 * bytes.  If OP is 0, assume that the real operation to be performed is
 * required by all backends.  Otherwise, make sure that OP can be performed
 * on elements of size VECE in the selected type.  Do not select V64 if
 * PREFER_I64 is true.  Return 0 if no vector type is selected.
 */
static TCGType choose_vector_type(TCGOpcode op, unsigned vece, uint32_t size,
                                  bool prefer_i64)
{
    if (TCG_TARGET_HAS_v256 && check_size_impl(size, 32)) {
        if (op == 0) {
            return TCG_TYPE_V256;
        }
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         * It is hard to imagine a case in which v256 is supported
         * but v128 is not, but check anyway.
         */
        if (tcg_can_emit_vec_op(op, TCG_TYPE_V256, vece)
            && (size % 32 == 0
                || tcg_can_emit_vec_op(op, TCG_TYPE_V128, vece))) {
            return TCG_TYPE_V256;
        }
    }
    if (TCG_TARGET_HAS_v128 && check_size_impl(size, 16)
        && (op == 0 || tcg_can_emit_vec_op(op, TCG_TYPE_V128, vece))) {
        return TCG_TYPE_V128;
    }
    if (TCG_TARGET_HAS_v64 && !prefer_i64 && check_size_impl(size, 8)
        && (op == 0 || tcg_can_emit_vec_op(op, TCG_TYPE_V64, vece))) {
        return TCG_TYPE_V64;
    }
    return 0;
}
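
/* For example, on a host with usable 256-bit vectors, size == 64
 * selects TCG_TYPE_V256 (two 32-byte operations), while size == 16
 * falls through to TCG_TYPE_V128.  A size that would split into more
 * than MAX_UNROLL pieces at every width is rejected and 0 is returned.
 */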

/* Set OPRSZ bytes at DOFS to replications of IN_32, IN_64 or IN_C.
 * Only one of IN_32 or IN_64 may be set;
 * IN_C is used if IN_32 and IN_64 are unset.
 */
static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz,
                   uint32_t maxsz, TCGv_i32 in_32, TCGv_i64 in_64,
                   uint64_t in_c)
{
    TCGType type;
    TCGv_i64 t_64;
    TCGv_i32 t_32, t_desc;
    TCGv_ptr t_ptr;
    uint32_t i;

    assert(vece <= (in_32 ? MO_32 : MO_64));
    assert(in_32 == NULL || in_64 == NULL);

    /* If we're storing 0, expand oprsz to maxsz.  */
    if (in_32 == NULL && in_64 == NULL) {
        in_c = dup_const(vece, in_c);
        if (in_c == 0) {
            oprsz = maxsz;
        }
    }

    /* Implement inline with a vector type, if possible.
     * Prefer integer when 64-bit host and no variable dup.
     */
    type = choose_vector_type(0, vece, oprsz,
                              (TCG_TARGET_REG_BITS == 64 && in_32 == NULL
                               && (in_64 == NULL || vece == MO_64)));
    if (type != 0) {
        TCGv_vec t_vec = tcg_temp_new_vec(type);

        if (in_32) {
            tcg_gen_dup_i32_vec(vece, t_vec, in_32);
        } else if (in_64) {
            tcg_gen_dup_i64_vec(vece, t_vec, in_64);
        } else {
            switch (vece) {
            case MO_8:
                tcg_gen_dup8i_vec(t_vec, in_c);
                break;
            case MO_16:
                tcg_gen_dup16i_vec(t_vec, in_c);
                break;
            case MO_32:
                tcg_gen_dup32i_vec(t_vec, in_c);
                break;
            default:
                tcg_gen_dup64i_vec(t_vec, in_c);
                break;
            }
        }

        i = 0;
        switch (type) {
        case TCG_TYPE_V256:
            /* Recall that ARM SVE allows vector sizes that are not a
             * power of 2, but always a multiple of 16.  The intent is
             * that e.g. size == 80 would be expanded with 2x32 + 1x16.
             */
            for (; i + 32 <= oprsz; i += 32) {
                tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V256);
            }
            /* fallthru */
        case TCG_TYPE_V128:
            for (; i + 16 <= oprsz; i += 16) {
                tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V128);
            }
            break;
        case TCG_TYPE_V64:
            for (; i < oprsz; i += 8) {
                tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
            }
            break;
        default:
            g_assert_not_reached();
        }

        tcg_temp_free_vec(t_vec);
        goto done;
    }

    /* Otherwise, inline with an integer type, unless "large".  */
    if (check_size_impl(oprsz, TCG_TARGET_REG_BITS / 8)) {
        t_64 = NULL;
        t_32 = NULL;

        if (in_32) {
            /* We are given a 32-bit variable input.  For a 64-bit host,
               use a 64-bit operation unless the 32-bit operation would
               be simple enough.  */
            if (TCG_TARGET_REG_BITS == 64
                && (vece != MO_32 || !check_size_impl(oprsz, 4))) {
                t_64 = tcg_temp_new_i64();
                tcg_gen_extu_i32_i64(t_64, in_32);
                gen_dup_i64(vece, t_64, t_64);
            } else {
                t_32 = tcg_temp_new_i32();
                gen_dup_i32(vece, t_32, in_32);
            }
        } else if (in_64) {
            /* We are given a 64-bit variable input.  */
            t_64 = tcg_temp_new_i64();
            gen_dup_i64(vece, t_64, in_64);
        } else {
            /* We are given a constant input.  */
            /* For 64-bit hosts, use 64-bit constants for "simple" constants
               or when we'd need too many 32-bit stores, or when a 64-bit
               constant is really required.  */
            if (vece == MO_64
                || (TCG_TARGET_REG_BITS == 64
                    && (in_c == 0 || in_c == -1
                        || !check_size_impl(oprsz, 4)))) {
                t_64 = tcg_const_i64(in_c);
            } else {
                t_32 = tcg_const_i32(in_c);
            }
        }

        /* Implement inline if we picked an implementation size above.  */
        if (t_32) {
            for (i = 0; i < oprsz; i += 4) {
                tcg_gen_st_i32(t_32, cpu_env, dofs + i);
            }
            tcg_temp_free_i32(t_32);
            goto done;
        }
        if (t_64) {
            for (i = 0; i < oprsz; i += 8) {
                tcg_gen_st_i64(t_64, cpu_env, dofs + i);
            }
            tcg_temp_free_i64(t_64);
            goto done;
        }
    }

    /* Otherwise implement out of line.  */
    t_ptr = tcg_temp_new_ptr();
    tcg_gen_addi_ptr(t_ptr, cpu_env, dofs);
    t_desc = tcg_const_i32(simd_desc(oprsz, maxsz, 0));

    if (vece == MO_64) {
        if (in_64) {
            gen_helper_gvec_dup64(t_ptr, t_desc, in_64);
        } else {
            t_64 = tcg_const_i64(in_c);
            gen_helper_gvec_dup64(t_ptr, t_desc, t_64);
            tcg_temp_free_i64(t_64);
        }
    } else {
        typedef void dup_fn(TCGv_ptr, TCGv_i32, TCGv_i32);
        static dup_fn * const fns[3] = {
            gen_helper_gvec_dup8,
            gen_helper_gvec_dup16,
            gen_helper_gvec_dup32
        };

        if (in_32) {
            fns[vece](t_ptr, t_desc, in_32);
        } else {
            t_32 = tcg_temp_new_i32();
            if (in_64) {
                tcg_gen_extrl_i64_i32(t_32, in_64);
            } else if (vece == MO_8) {
                tcg_gen_movi_i32(t_32, in_c & 0xff);
            } else if (vece == MO_16) {
                tcg_gen_movi_i32(t_32, in_c & 0xffff);
            } else {
                tcg_gen_movi_i32(t_32, in_c);
            }
            fns[vece](t_ptr, t_desc, t_32);
            tcg_temp_free_i32(t_32);
        }
    }

    tcg_temp_free_ptr(t_ptr);
    tcg_temp_free_i32(t_desc);
    return;

 done:
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Likewise, but with zero.  */
static void expand_clr(uint32_t dofs, uint32_t maxsz)
{
    do_dup(MO_8, dofs, maxsz, maxsz, NULL, NULL, 0);
}

/* Expand OPRSZ bytes worth of two-operand operations using i32 elements.  */
static void expand_2_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                         void (*fni)(TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        fni(t0, t0);
        tcg_gen_st_i32(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
}

static void expand_2i_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          int32_t c, bool load_dest,
                          void (*fni)(TCGv_i32, TCGv_i32, int32_t))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_i32(t1, cpu_env, dofs + i);
        }
        fni(t1, t0, c);
        tcg_gen_st_i32(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
    tcg_temp_free_i32(t1);
}

static void expand_2s_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          TCGv_i32 c, bool scalar_first,
                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        if (scalar_first) {
            fni(t1, c, t0);
        } else {
            fni(t1, t0, c);
        }
        tcg_gen_st_i32(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
    tcg_temp_free_i32(t1);
}

/* Expand OPRSZ bytes worth of three-operand operations using i32 elements.  */
static void expand_3_i32(uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t oprsz, bool load_dest,
                         void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        tcg_gen_ld_i32(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i32(t2, cpu_env, dofs + i);
        }
        fni(t2, t0, t1);
        tcg_gen_st_i32(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t2);
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t0);
}

/* Expand OPRSZ bytes worth of four-operand operations using i32 elements.  */
static void expand_4_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                         uint32_t cofs, uint32_t oprsz, bool write_aofs,
                         void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();
    TCGv_i32 t3 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t1, cpu_env, aofs + i);
        tcg_gen_ld_i32(t2, cpu_env, bofs + i);
        tcg_gen_ld_i32(t3, cpu_env, cofs + i);
        fni(t0, t1, t2, t3);
        tcg_gen_st_i32(t0, cpu_env, dofs + i);
        if (write_aofs) {
            tcg_gen_st_i32(t1, cpu_env, aofs + i);
        }
    }
    tcg_temp_free_i32(t3);
    tcg_temp_free_i32(t2);
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t0);
}

/* Expand OPRSZ bytes worth of two-operand operations using i64 elements.  */
static void expand_2_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                         void (*fni)(TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        fni(t0, t0);
        tcg_gen_st_i64(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
}

static void expand_2i_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          int64_t c, bool load_dest,
                          void (*fni)(TCGv_i64, TCGv_i64, int64_t))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t1, cpu_env, dofs + i);
        }
        fni(t1, t0, c);
        tcg_gen_st_i64(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
}

static void expand_2s_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          TCGv_i64 c, bool scalar_first,
                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        if (scalar_first) {
            fni(t1, c, t0);
        } else {
            fni(t1, t0, c);
        }
        tcg_gen_st_i64(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
}

/* Expand OPRSZ bytes worth of three-operand operations using i64 elements.  */
static void expand_3_i64(uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t oprsz, bool load_dest,
                         void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        tcg_gen_ld_i64(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t2, cpu_env, dofs + i);
        }
        fni(t2, t0, t1);
        tcg_gen_st_i64(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t0);
}

/* Expand OPRSZ bytes worth of four-operand operations using i64 elements.  */
static void expand_4_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                         uint32_t cofs, uint32_t oprsz, bool write_aofs,
                         void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    TCGv_i64 t3 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t1, cpu_env, aofs + i);
        tcg_gen_ld_i64(t2, cpu_env, bofs + i);
        tcg_gen_ld_i64(t3, cpu_env, cofs + i);
        fni(t0, t1, t2, t3);
        tcg_gen_st_i64(t0, cpu_env, dofs + i);
        if (write_aofs) {
            tcg_gen_st_i64(t1, cpu_env, aofs + i);
        }
    }
    tcg_temp_free_i64(t3);
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t0);
}

/* Expand OPRSZ bytes worth of two-operand operations using host vectors.  */
static void expand_2_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                         uint32_t oprsz, uint32_t tysz, TCGType type,
                         void (*fni)(unsigned, TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        fni(vece, t0, t0);
        tcg_gen_st_vec(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t0);
}

/* Expand OPRSZ bytes worth of two-vector operands and an immediate operand
   using host vectors.  */
static void expand_2i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t oprsz, uint32_t tysz, TCGType type,
                          int64_t c, bool load_dest,
                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, int64_t))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_vec(t1, cpu_env, dofs + i);
        }
        fni(vece, t1, t0, c);
        tcg_gen_st_vec(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t0);
    tcg_temp_free_vec(t1);
}

static void expand_2s_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t oprsz, uint32_t tysz, TCGType type,
                          TCGv_vec c, bool scalar_first,
                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        if (scalar_first) {
            fni(vece, t1, c, t0);
        } else {
            fni(vece, t1, t0, c);
        }
        tcg_gen_st_vec(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t0);
    tcg_temp_free_vec(t1);
}

/* Expand OPRSZ bytes worth of three-operand operations using host vectors.  */
static void expand_3_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t oprsz,
                         uint32_t tysz, TCGType type, bool load_dest,
                         void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    TCGv_vec t2 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        tcg_gen_ld_vec(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_vec(t2, cpu_env, dofs + i);
        }
        fni(vece, t2, t0, t1);
        tcg_gen_st_vec(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t2);
    tcg_temp_free_vec(t1);
    tcg_temp_free_vec(t0);
}

/* Expand OPRSZ bytes worth of four-operand operations using host vectors.  */
static void expand_4_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t cofs, uint32_t oprsz,
                         uint32_t tysz, TCGType type, bool write_aofs,
                         void (*fni)(unsigned, TCGv_vec, TCGv_vec,
                                     TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    TCGv_vec t2 = tcg_temp_new_vec(type);
    TCGv_vec t3 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t1, cpu_env, aofs + i);
        tcg_gen_ld_vec(t2, cpu_env, bofs + i);
        tcg_gen_ld_vec(t3, cpu_env, cofs + i);
        fni(vece, t0, t1, t2, t3);
        tcg_gen_st_vec(t0, cpu_env, dofs + i);
        if (write_aofs) {
            tcg_gen_st_vec(t1, cpu_env, aofs + i);
        }
    }
    tcg_temp_free_vec(t3);
    tcg_temp_free_vec(t2);
    tcg_temp_free_vec(t1);
    tcg_temp_free_vec(t0);
}

/* Expand a vector two-operand operation.  */
void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen2 *g)
{
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs);
    check_overlap_2(dofs, aofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_2_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256, g->fniv);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_2_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_2_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_2_i64(dofs, aofs, oprsz, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_2_i32(dofs, aofs, oprsz, g->fni4);
        } else {
            assert(g->fno != NULL);
            tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, g->data, g->fno);
            return;
        }
        break;

    default:
        g_assert_not_reached();
    }

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Expand a vector operation with two vectors and an immediate.  */
void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                     uint32_t maxsz, int64_t c, const GVecGen2i *g)
{
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs);
    check_overlap_2(dofs, aofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_2i_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
                      c, g->load_dest, g->fniv);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_2i_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
                      c, g->load_dest, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_2i_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
                      c, g->load_dest, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_2i_i64(dofs, aofs, oprsz, c, g->load_dest, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_2i_i32(dofs, aofs, oprsz, c, g->load_dest, g->fni4);
        } else {
            if (g->fno) {
                tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, c, g->fno);
            } else {
                TCGv_i64 tcg_c = tcg_const_i64(c);
                tcg_gen_gvec_2i_ool(dofs, aofs, tcg_c, oprsz,
                                    maxsz, c, g->fnoi);
                tcg_temp_free_i64(tcg_c);
            }
            return;
        }
        break;

    default:
        g_assert_not_reached();
    }

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Expand a vector operation with two vectors and a scalar.  */
void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                     uint32_t maxsz, TCGv_i64 c, const GVecGen2s *g)
{
    TCGType type;

    check_size_align(oprsz, maxsz, dofs | aofs);
    check_overlap_2(dofs, aofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opc, g->vece, oprsz, g->prefer_i64);
    }
    if (type != 0) {
        TCGv_vec t_vec = tcg_temp_new_vec(type);
        uint32_t some;

        tcg_gen_dup_i64_vec(g->vece, t_vec, c);

        switch (type) {
        case TCG_TYPE_V256:
            /* Recall that ARM SVE allows vector sizes that are not a
             * power of 2, but always a multiple of 16.  The intent is
             * that e.g. size == 80 would be expanded with 2x32 + 1x16.
             */
            some = QEMU_ALIGN_DOWN(oprsz, 32);
            expand_2s_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
                          t_vec, g->scalar_first, g->fniv);
            if (some == oprsz) {
                break;
            }
            dofs += some;
            aofs += some;
            oprsz -= some;
            maxsz -= some;
            /* fallthru */

        case TCG_TYPE_V128:
            expand_2s_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
                          t_vec, g->scalar_first, g->fniv);
            break;

        case TCG_TYPE_V64:
            expand_2s_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
                          t_vec, g->scalar_first, g->fniv);
            break;

        default:
            g_assert_not_reached();
        }
        tcg_temp_free_vec(t_vec);
    } else if (g->fni8 && check_size_impl(oprsz, 8)) {
        TCGv_i64 t64 = tcg_temp_new_i64();

        gen_dup_i64(g->vece, t64, c);
        expand_2s_i64(dofs, aofs, oprsz, t64, g->scalar_first, g->fni8);
        tcg_temp_free_i64(t64);
    } else if (g->fni4 && check_size_impl(oprsz, 4)) {
        TCGv_i32 t32 = tcg_temp_new_i32();

        tcg_gen_extrl_i64_i32(t32, c);
        gen_dup_i32(g->vece, t32, t32);
        expand_2s_i32(dofs, aofs, oprsz, t32, g->scalar_first, g->fni4);
        tcg_temp_free_i32(t32);
    } else {
        tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, 0, g->fno);
        return;
    }

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Expand a vector three-operand operation.  */
void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g)
{
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs | bofs);
    check_overlap_3(dofs, aofs, bofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_3_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
                     g->load_dest, g->fniv);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        bofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
                     g->load_dest, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
                     g->load_dest, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_3_i64(dofs, aofs, bofs, oprsz, g->load_dest, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_3_i32(dofs, aofs, bofs, oprsz, g->load_dest, g->fni4);
        } else {
            assert(g->fno != NULL);
            tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz,
                               maxsz, g->data, g->fno);
            return;
        }
        break;

    default:
        g_assert_not_reached();
    }

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Expand a vector four-operand operation.  */
void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen4 *g)
{
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs);
    check_overlap_4(dofs, aofs, bofs, cofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, some,
                     32, TCG_TYPE_V256, g->write_aofs, g->fniv);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        bofs += some;
        cofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
                     16, TCG_TYPE_V128, g->write_aofs, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
                     8, TCG_TYPE_V64, g->write_aofs, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_4_i64(dofs, aofs, bofs, cofs, oprsz,
                         g->write_aofs, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_4_i32(dofs, aofs, bofs, cofs, oprsz,
                         g->write_aofs, g->fni4);
        } else {
            assert(g->fno != NULL);
            tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs,
                               oprsz, maxsz, g->data, g->fno);
            return;
        }
        break;

    default:
        g_assert_not_reached();
    }

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/*
 * Expand specific vector operations.
 */

static void vec_mov2(unsigned vece, TCGv_vec a, TCGv_vec b)
{
    tcg_gen_mov_vec(a, b);
}

void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2 g = {
        .fni8 = tcg_gen_mov_i64,
        .fniv = vec_mov2,
        .fno = gen_helper_gvec_mov,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
    if (dofs != aofs) {
        tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
    } else {
        check_size_align(oprsz, maxsz, dofs);
        if (oprsz < maxsz) {
            expand_clr(dofs + oprsz, maxsz - oprsz);
        }
    }
}

void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t oprsz,
                          uint32_t maxsz, TCGv_i32 in)
{
    check_size_align(oprsz, maxsz, dofs);
    tcg_debug_assert(vece <= MO_32);
    do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
}

void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t oprsz,
                          uint32_t maxsz, TCGv_i64 in)
{
    check_size_align(oprsz, maxsz, dofs);
    tcg_debug_assert(vece <= MO_64);
    do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
}

void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t oprsz, uint32_t maxsz)
{
    if (vece <= MO_32) {
        TCGv_i32 in = tcg_temp_new_i32();
        switch (vece) {
        case MO_8:
            tcg_gen_ld8u_i32(in, cpu_env, aofs);
            break;
        case MO_16:
            tcg_gen_ld16u_i32(in, cpu_env, aofs);
            break;
        case MO_32:
            tcg_gen_ld_i32(in, cpu_env, aofs);
            break;
        }
        tcg_gen_gvec_dup_i32(vece, dofs, oprsz, maxsz, in);
        tcg_temp_free_i32(in);
    } else if (vece == MO_64) {
        TCGv_i64 in = tcg_temp_new_i64();
        tcg_gen_ld_i64(in, cpu_env, aofs);
        tcg_gen_gvec_dup_i64(MO_64, dofs, oprsz, maxsz, in);
        tcg_temp_free_i64(in);
    } else {
        /* 128-bit duplicate.  */
        /* ??? Dup to 256-bit vector.  */
        int i;

        tcg_debug_assert(vece == 4);
        tcg_debug_assert(oprsz >= 16);
        if (TCG_TARGET_HAS_v128) {
            TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V128);

            tcg_gen_ld_vec(in, cpu_env, aofs);
            for (i = 0; i < oprsz; i += 16) {
                tcg_gen_st_vec(in, cpu_env, dofs + i);
            }
            tcg_temp_free_vec(in);
        } else {
            TCGv_i64 in0 = tcg_temp_new_i64();
            TCGv_i64 in1 = tcg_temp_new_i64();

            tcg_gen_ld_i64(in0, cpu_env, aofs);
            tcg_gen_ld_i64(in1, cpu_env, aofs + 8);
            for (i = 0; i < oprsz; i += 16) {
                tcg_gen_st_i64(in0, cpu_env, dofs + i);
                tcg_gen_st_i64(in1, cpu_env, dofs + i + 8);
            }
            tcg_temp_free_i64(in0);
            tcg_temp_free_i64(in1);
        }
    }
}

void tcg_gen_gvec_dup64i(uint32_t dofs, uint32_t oprsz,
                         uint32_t maxsz, uint64_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(MO_64, dofs, oprsz, maxsz, NULL, NULL, x);
}

void tcg_gen_gvec_dup32i(uint32_t dofs, uint32_t oprsz,
                         uint32_t maxsz, uint32_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(MO_32, dofs, oprsz, maxsz, NULL, NULL, x);
}

void tcg_gen_gvec_dup16i(uint32_t dofs, uint32_t oprsz,
                         uint32_t maxsz, uint16_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(MO_16, dofs, oprsz, maxsz, NULL, NULL, x);
}

void tcg_gen_gvec_dup8i(uint32_t dofs, uint32_t oprsz,
                        uint32_t maxsz, uint8_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(MO_8, dofs, oprsz, maxsz, NULL, NULL, x);
}

void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2 g = {
        .fni8 = tcg_gen_not_i64,
        .fniv = tcg_gen_not_vec,
        .fno = gen_helper_gvec_not,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
    tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
}

/* Perform a vector addition using normal addition and a mask.  The mask
   should be the sign bit of each lane.  This 6-operation form is more
   efficient than separate additions when there are 4 or more lanes in
   the 64-bit operation.  */
static void gen_addv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
{
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    TCGv_i64 t3 = tcg_temp_new_i64();

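    /* Clearing the sign bit of each lane guarantees that the per-lane
     * additions below cannot carry across a lane boundary.  After the
     * masked add, the sign bit of each lane of D holds only the carry
     * into that bit; XORing with (a ^ b) & m then restores the true
     * sign bit, a ^ b ^ carry-in.
     */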
    tcg_gen_andc_i64(t1, a, m);
    tcg_gen_andc_i64(t2, b, m);
    tcg_gen_xor_i64(t3, a, b);
    tcg_gen_add_i64(d, t1, t2);
    tcg_gen_and_i64(t3, t3, m);
    tcg_gen_xor_i64(d, d, t3);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t3);
}

void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
    gen_addv_mask(d, a, b, m);
    tcg_temp_free_i64(m);
}

void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
    gen_addv_mask(d, a, b, m);
    tcg_temp_free_i64(m);
}

void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();

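    /* The high halves can be summed in isolation: t1 adds B to A with
     * A's low 32 bits cleared, so no carry can propagate into the high
     * half.  t2 carries the correct low 32 bits of the full sum, and
     * the deposit recombines the two halves.
     */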
    tcg_gen_andi_i64(t1, a, ~0xffffffffull);
    tcg_gen_add_i64(t2, a, b);
    tcg_gen_add_i64(t1, t1, b);
    tcg_gen_deposit_i64(d, t1, t2, 0, 32);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
}

void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g[4] = {
        { .fni8 = tcg_gen_vec_add8_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_add8,
          .opc = INDEX_op_add_vec,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_add16_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_add16,
          .opc = INDEX_op_add_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_add_i32,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_add32,
          .opc = INDEX_op_add_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_add_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_add64,
          .opc = INDEX_op_add_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}
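
/* A minimal usage sketch, not part of this file: a target front end
 * whose vector registers live in CPUState would emit a 16-byte add of
 * 32-bit elements with something like
 *
 *     tcg_gen_gvec_add(MO_32, vreg_off(s, rd), vreg_off(s, rn),
 *                      vreg_off(s, rm), 16, 16);
 *
 * where vreg_off() is a hypothetical helper returning the register's
 * byte offset within CPUState.
 */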

void tcg_gen_gvec_adds(unsigned vece, uint32_t dofs, uint32_t aofs,
                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2s g[4] = {
        { .fni8 = tcg_gen_vec_add8_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_adds8,
          .opc = INDEX_op_add_vec,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_add16_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_adds16,
          .opc = INDEX_op_add_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_add_i32,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_adds32,
          .opc = INDEX_op_add_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_add_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_adds64,
          .opc = INDEX_op_add_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
}

void tcg_gen_gvec_addi(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t c, uint32_t oprsz, uint32_t maxsz)
{
    TCGv_i64 tmp = tcg_const_i64(c);
    tcg_gen_gvec_adds(vece, dofs, aofs, tmp, oprsz, maxsz);
    tcg_temp_free_i64(tmp);
}

void tcg_gen_gvec_subs(unsigned vece, uint32_t dofs, uint32_t aofs,
                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2s g[4] = {
        { .fni8 = tcg_gen_vec_sub8_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_subs8,
          .opc = INDEX_op_sub_vec,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_sub16_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_subs16,
          .opc = INDEX_op_sub_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_sub_i32,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_subs32,
          .opc = INDEX_op_sub_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_sub_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_subs64,
          .opc = INDEX_op_sub_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
}

/* Perform a vector subtraction using normal subtraction and a mask.
   Compare gen_addv_mask above.  */
static void gen_subv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
{
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    TCGv_i64 t3 = tcg_temp_new_i64();

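    /* Setting the sign bit in each lane of A and clearing it in each
     * lane of B guarantees that the per-lane subtraction below cannot
     * borrow across a lane boundary.  The final XOR with ~(a ^ b) & m
     * (hence the eqv) restores the true sign bit of each lane.
     */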
    tcg_gen_or_i64(t1, a, m);
    tcg_gen_andc_i64(t2, b, m);
    tcg_gen_eqv_i64(t3, a, b);
    tcg_gen_sub_i64(d, t1, t2);
    tcg_gen_and_i64(t3, t3, m);
    tcg_gen_xor_i64(d, d, t3);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t3);
}

void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
    gen_subv_mask(d, a, b, m);
    tcg_temp_free_i64(m);
}

void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
    gen_subv_mask(d, a, b, m);
    tcg_temp_free_i64(m);
}

void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();

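    /* Likewise for subtraction: t1 subtracts only B's high half from A,
     * so no borrow can propagate out of the low half, while t2 carries
     * the correct low 32 bits of the full difference; the deposit
     * recombines the two halves.
     */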
1571    tcg_gen_andi_i64(t1, b, ~0xffffffffull);
1572    tcg_gen_sub_i64(t2, a, b);
1573    tcg_gen_sub_i64(t1, a, t1);
1574    tcg_gen_deposit_i64(d, t1, t2, 0, 32);
1575
1576    tcg_temp_free_i64(t1);
1577    tcg_temp_free_i64(t2);
1578}
1579
1580void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs,
1581                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1582{
1583    static const GVecGen3 g[4] = {
1584        { .fni8 = tcg_gen_vec_sub8_i64,
1585          .fniv = tcg_gen_sub_vec,
1586          .fno = gen_helper_gvec_sub8,
1587          .opc = INDEX_op_sub_vec,
1588          .vece = MO_8 },
1589        { .fni8 = tcg_gen_vec_sub16_i64,
1590          .fniv = tcg_gen_sub_vec,
1591          .fno = gen_helper_gvec_sub16,
1592          .opc = INDEX_op_sub_vec,
1593          .vece = MO_16 },
1594        { .fni4 = tcg_gen_sub_i32,
1595          .fniv = tcg_gen_sub_vec,
1596          .fno = gen_helper_gvec_sub32,
1597          .opc = INDEX_op_sub_vec,
1598          .vece = MO_32 },
1599        { .fni8 = tcg_gen_sub_i64,
1600          .fniv = tcg_gen_sub_vec,
1601          .fno = gen_helper_gvec_sub64,
1602          .opc = INDEX_op_sub_vec,
1603          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1604          .vece = MO_64 },
1605    };
1606
1607    tcg_debug_assert(vece <= MO_64);
1608    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1609}
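
/* Editorial usage sketch (dofs/aofs/bofs are hypothetical env offsets):
   a front end subtracting the 32-bit lanes of two 16-byte vectors
   would emit

       tcg_gen_gvec_sub(MO_32, dofs, aofs, bofs, 16, 16);

   and the expander picks host vector ops, the inline i32 loop, or the
   out-of-line helper, roughly in that order of preference.  */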
1610
1611void tcg_gen_gvec_mul(unsigned vece, uint32_t dofs, uint32_t aofs,
1612                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1613{
1614    static const GVecGen3 g[4] = {
1615        { .fniv = tcg_gen_mul_vec,
1616          .fno = gen_helper_gvec_mul8,
1617          .opc = INDEX_op_mul_vec,
1618          .vece = MO_8 },
1619        { .fniv = tcg_gen_mul_vec,
1620          .fno = gen_helper_gvec_mul16,
1621          .opc = INDEX_op_mul_vec,
1622          .vece = MO_16 },
1623        { .fni4 = tcg_gen_mul_i32,
1624          .fniv = tcg_gen_mul_vec,
1625          .fno = gen_helper_gvec_mul32,
1626          .opc = INDEX_op_mul_vec,
1627          .vece = MO_32 },
1628        { .fni8 = tcg_gen_mul_i64,
1629          .fniv = tcg_gen_mul_vec,
1630          .fno = gen_helper_gvec_mul64,
1631          .opc = INDEX_op_mul_vec,
1632          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1633          .vece = MO_64 },
1634    };
1635
1636    tcg_debug_assert(vece <= MO_64);
1637    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1638}
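
/* Editorial note: the MO_8 and MO_16 entries above supply no .fni8 or
   .fni4 shortcut; there is no cheap SWAR multiply, so when the host
   provides no mul_vec support the expansion falls back to the
   out-of-line helper (.fno).  */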
1639
1640void tcg_gen_gvec_muls(unsigned vece, uint32_t dofs, uint32_t aofs,
1641                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1642{
1643    static const GVecGen2s g[4] = {
1644        { .fniv = tcg_gen_mul_vec,
1645          .fno = gen_helper_gvec_muls8,
1646          .opc = INDEX_op_mul_vec,
1647          .vece = MO_8 },
1648        { .fniv = tcg_gen_mul_vec,
1649          .fno = gen_helper_gvec_muls16,
1650          .opc = INDEX_op_mul_vec,
1651          .vece = MO_16 },
1652        { .fni4 = tcg_gen_mul_i32,
1653          .fniv = tcg_gen_mul_vec,
1654          .fno = gen_helper_gvec_muls32,
1655          .opc = INDEX_op_mul_vec,
1656          .vece = MO_32 },
1657        { .fni8 = tcg_gen_mul_i64,
1658          .fniv = tcg_gen_mul_vec,
1659          .fno = gen_helper_gvec_muls64,
1660          .opc = INDEX_op_mul_vec,
1661          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1662          .vece = MO_64 },
1663    };
1664
1665    tcg_debug_assert(vece <= MO_64);
1666    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
1667}
1668
1669void tcg_gen_gvec_muli(unsigned vece, uint32_t dofs, uint32_t aofs,
1670                       int64_t c, uint32_t oprsz, uint32_t maxsz)
1671{
1672    TCGv_i64 tmp = tcg_const_i64(c);
1673    tcg_gen_gvec_muls(vece, dofs, aofs, tmp, oprsz, maxsz);
1674    tcg_temp_free_i64(tmp);
1675}
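
/* Editorial usage sketch: multiply every 16-bit lane by an immediate;
   the wrapper simply materializes the constant and reuses the _muls
   expansion:

       tcg_gen_gvec_muli(MO_16, dofs, aofs, 3, oprsz, maxsz);
 */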
1676
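/* Editorial note: signed saturating add/sub have no cheap 64-bit SWAR
   shortcut, so the tables below supply only .fniv and .fno; hosts
   without ssadd_vec/sssub_vec support go straight to the helpers.  */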
1677void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs,
1678                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1679{
1680    static const GVecGen3 g[4] = {
1681        { .fniv = tcg_gen_ssadd_vec,
1682          .fno = gen_helper_gvec_ssadd8,
1683          .opc = INDEX_op_ssadd_vec,
1684          .vece = MO_8 },
1685        { .fniv = tcg_gen_ssadd_vec,
1686          .fno = gen_helper_gvec_ssadd16,
1687          .opc = INDEX_op_ssadd_vec,
1688          .vece = MO_16 },
1689        { .fniv = tcg_gen_ssadd_vec,
1690          .fno = gen_helper_gvec_ssadd32,
1691          .opc = INDEX_op_ssadd_vec,
1692          .vece = MO_32 },
1693        { .fniv = tcg_gen_ssadd_vec,
1694          .fno = gen_helper_gvec_ssadd64,
1695          .opc = INDEX_op_ssadd_vec,
1696          .vece = MO_64 },
1697    };
1698    tcg_debug_assert(vece <= MO_64);
1699    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1700}
1701
1702void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs,
1703                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1704{
1705    static const GVecGen3 g[4] = {
1706        { .fniv = tcg_gen_sssub_vec,
1707          .fno = gen_helper_gvec_sssub8,
1708          .opc = INDEX_op_sssub_vec,
1709          .vece = MO_8 },
1710        { .fniv = tcg_gen_sssub_vec,
1711          .fno = gen_helper_gvec_sssub16,
1712          .opc = INDEX_op_sssub_vec,
1713          .vece = MO_16 },
1714        { .fniv = tcg_gen_sssub_vec,
1715          .fno = gen_helper_gvec_sssub32,
1716          .opc = INDEX_op_sssub_vec,
1717          .vece = MO_32 },
1718        { .fniv = tcg_gen_sssub_vec,
1719          .fno = gen_helper_gvec_sssub64,
1720          .opc = INDEX_op_sssub_vec,
1721          .vece = MO_64 },
1722    };
1723    tcg_debug_assert(vece <= MO_64);
1724    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1725}
1726
1727static void tcg_gen_usadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1728{
1729    TCGv_i32 max = tcg_const_i32(-1);
1730    tcg_gen_add_i32(d, a, b);
1731    tcg_gen_movcond_i32(TCG_COND_LTU, d, d, a, max, d);
1732    tcg_temp_free_i32(max);
1733}
1734
1735static void tcg_gen_usadd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1736{
1737    TCGv_i64 max = tcg_const_i64(-1);
1738    tcg_gen_add_i64(d, a, b);
1739    tcg_gen_movcond_i64(TCG_COND_LTU, d, d, a, max, d);
1740    tcg_temp_free_i64(max);
1741}
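
/* Editorial note: unsigned addition overflows iff the result is
   (unsigned) less than an operand, which is what the movcond tests.
   On plain integers (hypothetical name):

   static inline uint32_t usadd32(uint32_t a, uint32_t b)
   {
       uint32_t d = a + b;
       return d < a ? UINT32_MAX : d;   // saturate on wrap-around
   }
 */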
1742
1743void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs,
1744                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1745{
1746    static const GVecGen3 g[4] = {
1747        { .fniv = tcg_gen_usadd_vec,
1748          .fno = gen_helper_gvec_usadd8,
1749          .opc = INDEX_op_usadd_vec,
1750          .vece = MO_8 },
1751        { .fniv = tcg_gen_usadd_vec,
1752          .fno = gen_helper_gvec_usadd16,
1753          .opc = INDEX_op_usadd_vec,
1754          .vece = MO_16 },
1755        { .fni4 = tcg_gen_usadd_i32,
1756          .fniv = tcg_gen_usadd_vec,
1757          .fno = gen_helper_gvec_usadd32,
1758          .opc = INDEX_op_usadd_vec,
1759          .vece = MO_32 },
1760        { .fni8 = tcg_gen_usadd_i64,
1761          .fniv = tcg_gen_usadd_vec,
1762          .fno = gen_helper_gvec_usadd64,
1763          .opc = INDEX_op_usadd_vec,
1764          .vece = MO_64 }
1765    };
1766    tcg_debug_assert(vece <= MO_64);
1767    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1768}
1769
1770static void tcg_gen_ussub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1771{
1772    TCGv_i32 min = tcg_const_i32(0);
1773    tcg_gen_sub_i32(d, a, b);
1774    tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, min, d);
1775    tcg_temp_free_i32(min);
1776}
1777
1778static void tcg_gen_ussub_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1779{
1780    TCGv_i64 min = tcg_const_i64(0);
1781    tcg_gen_sub_i64(d, a, b);
1782    tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, min, d);
1783    tcg_temp_free_i64(min);
1784}
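
/* Editorial note: unsigned subtraction borrows iff a < b, so the
   movcond clamps the result to zero in that case:

   static inline uint32_t ussub32(uint32_t a, uint32_t b)
   {
       return a < b ? 0 : a - b;        // saturate at zero
   }
 */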
1785
1786void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs,
1787                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1788{
1789    static const GVecGen3 g[4] = {
1790        { .fniv = tcg_gen_ussub_vec,
1791          .fno = gen_helper_gvec_ussub8,
1792          .opc = INDEX_op_ussub_vec,
1793          .vece = MO_8 },
1794        { .fniv = tcg_gen_ussub_vec,
1795          .fno = gen_helper_gvec_ussub16,
1796          .opc = INDEX_op_ussub_vec,
1797          .vece = MO_16 },
1798        { .fni4 = tcg_gen_ussub_i32,
1799          .fniv = tcg_gen_ussub_vec,
1800          .fno = gen_helper_gvec_ussub32,
1801          .opc = INDEX_op_ussub_vec,
1802          .vece = MO_32 },
1803        { .fni8 = tcg_gen_ussub_i64,
1804          .fniv = tcg_gen_ussub_vec,
1805          .fno = gen_helper_gvec_ussub64,
1806          .opc = INDEX_op_ussub_vec,
1807          .vece = MO_64 }
1808    };
1809    tcg_debug_assert(vece <= MO_64);
1810    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1811}
1812
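/* Editorial note: the four min/max expansions below follow the same
   table pattern; MO_8 and MO_16 again have no scalar shortcut and rely
   on host vector support or the out-of-line helpers.  */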
1813void tcg_gen_gvec_smin(unsigned vece, uint32_t dofs, uint32_t aofs,
1814                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1815{
1816    static const GVecGen3 g[4] = {
1817        { .fniv = tcg_gen_smin_vec,
1818          .fno = gen_helper_gvec_smin8,
1819          .opc = INDEX_op_smin_vec,
1820          .vece = MO_8 },
1821        { .fniv = tcg_gen_smin_vec,
1822          .fno = gen_helper_gvec_smin16,
1823          .opc = INDEX_op_smin_vec,
1824          .vece = MO_16 },
1825        { .fni4 = tcg_gen_smin_i32,
1826          .fniv = tcg_gen_smin_vec,
1827          .fno = gen_helper_gvec_smin32,
1828          .opc = INDEX_op_smin_vec,
1829          .vece = MO_32 },
1830        { .fni8 = tcg_gen_smin_i64,
1831          .fniv = tcg_gen_smin_vec,
1832          .fno = gen_helper_gvec_smin64,
1833          .opc = INDEX_op_smin_vec,
1834          .vece = MO_64 }
1835    };
1836    tcg_debug_assert(vece <= MO_64);
1837    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1838}
1839
1840void tcg_gen_gvec_umin(unsigned vece, uint32_t dofs, uint32_t aofs,
1841                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1842{
1843    static const GVecGen3 g[4] = {
1844        { .fniv = tcg_gen_umin_vec,
1845          .fno = gen_helper_gvec_umin8,
1846          .opc = INDEX_op_umin_vec,
1847          .vece = MO_8 },
1848        { .fniv = tcg_gen_umin_vec,
1849          .fno = gen_helper_gvec_umin16,
1850          .opc = INDEX_op_umin_vec,
1851          .vece = MO_16 },
1852        { .fni4 = tcg_gen_umin_i32,
1853          .fniv = tcg_gen_umin_vec,
1854          .fno = gen_helper_gvec_umin32,
1855          .opc = INDEX_op_umin_vec,
1856          .vece = MO_32 },
1857        { .fni8 = tcg_gen_umin_i64,
1858          .fniv = tcg_gen_umin_vec,
1859          .fno = gen_helper_gvec_umin64,
1860          .opc = INDEX_op_umin_vec,
1861          .vece = MO_64 }
1862    };
1863    tcg_debug_assert(vece <= MO_64);
1864    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1865}
1866
1867void tcg_gen_gvec_smax(unsigned vece, uint32_t dofs, uint32_t aofs,
1868                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1869{
1870    static const GVecGen3 g[4] = {
1871        { .fniv = tcg_gen_smax_vec,
1872          .fno = gen_helper_gvec_smax8,
1873          .opc = INDEX_op_smax_vec,
1874          .vece = MO_8 },
1875        { .fniv = tcg_gen_smax_vec,
1876          .fno = gen_helper_gvec_smax16,
1877          .opc = INDEX_op_smax_vec,
1878          .vece = MO_16 },
1879        { .fni4 = tcg_gen_smax_i32,
1880          .fniv = tcg_gen_smax_vec,
1881          .fno = gen_helper_gvec_smax32,
1882          .opc = INDEX_op_smax_vec,
1883          .vece = MO_32 },
1884        { .fni8 = tcg_gen_smax_i64,
1885          .fniv = tcg_gen_smax_vec,
1886          .fno = gen_helper_gvec_smax64,
1887          .opc = INDEX_op_smax_vec,
1888          .vece = MO_64 }
1889    };
1890    tcg_debug_assert(vece <= MO_64);
1891    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1892}
1893
1894void tcg_gen_gvec_umax(unsigned vece, uint32_t dofs, uint32_t aofs,
1895                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1896{
1897    static const GVecGen3 g[4] = {
1898        { .fniv = tcg_gen_umax_vec,
1899          .fno = gen_helper_gvec_umax8,
1900          .opc = INDEX_op_umax_vec,
1901          .vece = MO_8 },
1902        { .fniv = tcg_gen_umax_vec,
1903          .fno = gen_helper_gvec_umax16,
1904          .opc = INDEX_op_umax_vec,
1905          .vece = MO_16 },
1906        { .fni4 = tcg_gen_umax_i32,
1907          .fniv = tcg_gen_umax_vec,
1908          .fno = gen_helper_gvec_umax32,
1909          .opc = INDEX_op_umax_vec,
1910          .vece = MO_32 },
1911        { .fni8 = tcg_gen_umax_i64,
1912          .fniv = tcg_gen_umax_vec,
1913          .fno = gen_helper_gvec_umax64,
1914          .opc = INDEX_op_umax_vec,
1915          .vece = MO_64 }
1916    };
1917    tcg_debug_assert(vece <= MO_64);
1918    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1919}
1920
1921/* Perform a vector negation using normal negation and a mask.
1922   Compare gen_subv_mask above.  */
1923static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m)
1924{
1925    TCGv_i64 t2 = tcg_temp_new_i64();
1926    TCGv_i64 t3 = tcg_temp_new_i64();
1927
1928    tcg_gen_andc_i64(t3, m, b);
1929    tcg_gen_andc_i64(t2, b, m);
1930    tcg_gen_sub_i64(d, m, t2);
1931    tcg_gen_xor_i64(d, d, t3);
1932
1933    tcg_temp_free_i64(t2);
1934    tcg_temp_free_i64(t3);
1935}
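
/* Editorial note: this is gen_subv_mask specialized to a minuend of
   zero: a | m collapses to m and eqv(0, b) to ~b, giving
   d = (m - (b & ~m)) ^ (m & ~b).  */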
1936
1937void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 b)
1938{
1939    TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
1940    gen_negv_mask(d, b, m);
1941    tcg_temp_free_i64(m);
1942}
1943
1944void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 b)
1945{
1946    TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
1947    gen_negv_mask(d, b, m);
1948    tcg_temp_free_i64(m);
1949}
1950
1951void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 b)
1952{
1953    TCGv_i64 t1 = tcg_temp_new_i64();
1954    TCGv_i64 t2 = tcg_temp_new_i64();
1955
1956    tcg_gen_andi_i64(t1, b, ~0xffffffffull);
1957    tcg_gen_neg_i64(t2, b);
1958    tcg_gen_neg_i64(t1, t1);
1959    tcg_gen_deposit_i64(d, t1, t2, 0, 32);
1960
1961    tcg_temp_free_i64(t1);
1962    tcg_temp_free_i64(t2);
1963}
1964
1965void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs,
1966                      uint32_t oprsz, uint32_t maxsz)
1967{
1968    static const GVecGen2 g[4] = {
1969        { .fni8 = tcg_gen_vec_neg8_i64,
1970          .fniv = tcg_gen_neg_vec,
1971          .fno = gen_helper_gvec_neg8,
1972          .opc = INDEX_op_neg_vec,
1973          .vece = MO_8 },
1974        { .fni8 = tcg_gen_vec_neg16_i64,
1975          .fniv = tcg_gen_neg_vec,
1976          .fno = gen_helper_gvec_neg16,
1977          .opc = INDEX_op_neg_vec,
1978          .vece = MO_16 },
1979        { .fni4 = tcg_gen_neg_i32,
1980          .fniv = tcg_gen_neg_vec,
1981          .fno = gen_helper_gvec_neg32,
1982          .opc = INDEX_op_neg_vec,
1983          .vece = MO_32 },
1984        { .fni8 = tcg_gen_neg_i64,
1985          .fniv = tcg_gen_neg_vec,
1986          .fno = gen_helper_gvec_neg64,
1987          .opc = INDEX_op_neg_vec,
1988          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1989          .vece = MO_64 },
1990    };
1991
1992    tcg_debug_assert(vece <= MO_64);
1993    tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
1994}
1995
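/* Editorial note: for the logical ops below, aofs == bofs lets the
   expansion fold to a simpler form: x AND x == x OR x == x (mov);
   x XOR x == x ANDC x == 0 (dup 0); x ORC x == x EQV x == -1
   (dup -1); NAND and NOR of x with itself are NOT x.  */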
1996void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs,
1997                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1998{
1999    static const GVecGen3 g = {
2000        .fni8 = tcg_gen_and_i64,
2001        .fniv = tcg_gen_and_vec,
2002        .fno = gen_helper_gvec_and,
2003        .opc = INDEX_op_and_vec,
2004        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2005    };
2006
2007    if (aofs == bofs) {
2008        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2009    } else {
2010        tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2011    }
2012}
2013
2014void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs,
2015                     uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2016{
2017    static const GVecGen3 g = {
2018        .fni8 = tcg_gen_or_i64,
2019        .fniv = tcg_gen_or_vec,
2020        .fno = gen_helper_gvec_or,
2021        .opc = INDEX_op_or_vec,
2022        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2023    };
2024
2025    if (aofs == bofs) {
2026        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2027    } else {
2028        tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2029    }
2030}
2031
2032void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs,
2033                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2034{
2035    static const GVecGen3 g = {
2036        .fni8 = tcg_gen_xor_i64,
2037        .fniv = tcg_gen_xor_vec,
2038        .fno = gen_helper_gvec_xor,
2039        .opc = INDEX_op_xor_vec,
2040        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2041    };
2042
2043    if (aofs == bofs) {
2044        tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, 0);
2045    } else {
2046        tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2047    }
2048}
2049
2050void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs,
2051                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2052{
2053    static const GVecGen3 g = {
2054        .fni8 = tcg_gen_andc_i64,
2055        .fniv = tcg_gen_andc_vec,
2056        .fno = gen_helper_gvec_andc,
2057        .opc = INDEX_op_andc_vec,
2058        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2059    };
2060
2061    if (aofs == bofs) {
2062        tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, 0);
2063    } else {
2064        tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2065    }
2066}
2067
2068void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs,
2069                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2070{
2071    static const GVecGen3 g = {
2072        .fni8 = tcg_gen_orc_i64,
2073        .fniv = tcg_gen_orc_vec,
2074        .fno = gen_helper_gvec_orc,
2075        .opc = INDEX_op_orc_vec,
2076        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2077    };
2078
2079    if (aofs == bofs) {
2080        tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, -1);
2081    } else {
2082        tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2083    }
2084}
2085
2086void tcg_gen_gvec_nand(unsigned vece, uint32_t dofs, uint32_t aofs,
2087                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2088{
2089    static const GVecGen3 g = {
2090        .fni8 = tcg_gen_nand_i64,
2091        .fniv = tcg_gen_nand_vec,
2092        .fno = gen_helper_gvec_nand,
2093        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2094    };
2095
2096    if (aofs == bofs) {
2097        tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz);
2098    } else {
2099        tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2100    }
2101}
2102
2103void tcg_gen_gvec_nor(unsigned vece, uint32_t dofs, uint32_t aofs,
2104                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2105{
2106    static const GVecGen3 g = {
2107        .fni8 = tcg_gen_nor_i64,
2108        .fniv = tcg_gen_nor_vec,
2109        .fno = gen_helper_gvec_nor,
2110        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2111    };
2112
2113    if (aofs == bofs) {
2114        tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz);
2115    } else {
2116        tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2117    }
2118}
2119
2120void tcg_gen_gvec_eqv(unsigned vece, uint32_t dofs, uint32_t aofs,
2121                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2122{
2123    static const GVecGen3 g = {
2124        .fni8 = tcg_gen_eqv_i64,
2125        .fniv = tcg_gen_eqv_vec,
2126        .fno = gen_helper_gvec_eqv,
2127        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2128    };
2129
2130    if (aofs == bofs) {
2131        tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, -1);
2132    } else {
2133        tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2134    }
2135}
2136
/* The scalar operand is replicated to the full 64-bit width by the
   callers below, so the operation itself always runs at MO_64.  */
2137static const GVecGen2s gop_ands = {
2138    .fni8 = tcg_gen_and_i64,
2139    .fniv = tcg_gen_and_vec,
2140    .fno = gen_helper_gvec_ands,
2141    .opc = INDEX_op_and_vec,
2142    .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2143    .vece = MO_64
2144};
2145
2146void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs,
2147                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2148{
2149    TCGv_i64 tmp = tcg_temp_new_i64();
2150    gen_dup_i64(vece, tmp, c);
2151    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
2152    tcg_temp_free_i64(tmp);
2153}
2154
2155void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs,
2156                       int64_t c, uint32_t oprsz, uint32_t maxsz)
2157{
2158    TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
2159    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
2160    tcg_temp_free_i64(tmp);
2161}
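
/* Editorial usage sketch: clear the top bit of every byte; dup_const
   replicates the immediate per element size, so the 64-bit constant
   becomes 0x7f7f7f7f7f7f7f7f:

       tcg_gen_gvec_andi(MO_8, dofs, aofs, 0x7f, oprsz, maxsz);
 */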
2162
2163static const GVecGen2s gop_xors = {
2164    .fni8 = tcg_gen_xor_i64,
2165    .fniv = tcg_gen_xor_vec,
2166    .fno = gen_helper_gvec_xors,
2167    .opc = INDEX_op_xor_vec,
2168    .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2169    .vece = MO_64
2170};
2171
2172void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs,
2173                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2174{
2175    TCGv_i64 tmp = tcg_temp_new_i64();
2176    gen_dup_i64(vece, tmp, c);
2177    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
2178    tcg_temp_free_i64(tmp);
2179}
2180
2181void tcg_gen_gvec_xori(unsigned vece, uint32_t dofs, uint32_t aofs,
2182                       int64_t c, uint32_t oprsz, uint32_t maxsz)
2183{
2184    TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
2185    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
2186    tcg_temp_free_i64(tmp);
2187}
2188
2189static const GVecGen2s gop_ors = {
2190    .fni8 = tcg_gen_or_i64,
2191    .fniv = tcg_gen_or_vec,
2192    .fno = gen_helper_gvec_ors,
2193    .opc = INDEX_op_or_vec,
2194    .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2195    .vece = MO_64
2196};
2197
2198void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs,
2199                      TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2200{
2201    TCGv_i64 tmp = tcg_temp_new_i64();
2202    gen_dup_i64(vece, tmp, c);
2203    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
2204    tcg_temp_free_i64(tmp);
2205}
2206
2207void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs,
2208                      int64_t c, uint32_t oprsz, uint32_t maxsz)
2209{
2210    TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
2211    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
2212    tcg_temp_free_i64(tmp);
2213}
2214
2215void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2216{
2217    uint64_t mask = dup_const(MO_8, 0xff << c);
2218    tcg_gen_shli_i64(d, a, c);
2219    tcg_gen_andi_i64(d, d, mask);
2220}
2221
2222void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2223{
2224    uint64_t mask = dup_const(MO_16, 0xffff << c);
2225    tcg_gen_shli_i64(d, a, c);
2226    tcg_gen_andi_i64(d, d, mask);
2227}
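
/* Editorial note: shifting the full 64-bit word moves bits across lane
   boundaries; the dup_const mask clears exactly the bits that crossed
   in from the neighbouring lane.  On plain integers (hypothetical
   name):

   static inline uint64_t shl8x8(uint64_t a, unsigned c)
   {
       return (a << c) & dup_const(MO_8, 0xff << c);  // drop cross-lane bits
   }

   The right-shift pair below uses the same mask, built from
   0xff >> c instead.  */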
2228
2229void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs,
2230                       int64_t shift, uint32_t oprsz, uint32_t maxsz)
2231{
2232    static const GVecGen2i g[4] = {
2233        { .fni8 = tcg_gen_vec_shl8i_i64,
2234          .fniv = tcg_gen_shli_vec,
2235          .fno = gen_helper_gvec_shl8i,
2236          .opc = INDEX_op_shli_vec,
2237          .vece = MO_8 },
2238        { .fni8 = tcg_gen_vec_shl16i_i64,
2239          .fniv = tcg_gen_shli_vec,
2240          .fno = gen_helper_gvec_shl16i,
2241          .opc = INDEX_op_shli_vec,
2242          .vece = MO_16 },
2243        { .fni4 = tcg_gen_shli_i32,
2244          .fniv = tcg_gen_shli_vec,
2245          .fno = gen_helper_gvec_shl32i,
2246          .opc = INDEX_op_shli_vec,
2247          .vece = MO_32 },
2248        { .fni8 = tcg_gen_shli_i64,
2249          .fniv = tcg_gen_shli_vec,
2250          .fno = gen_helper_gvec_shl64i,
2251          .opc = INDEX_op_shli_vec,
2252          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2253          .vece = MO_64 },
2254    };
2255
2256    tcg_debug_assert(vece <= MO_64);
2257    tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2258    if (shift == 0) {
2259        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2260    } else {
2261        tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2262    }
2263}
2264
2265void tcg_gen_vec_shr8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2266{
2267    uint64_t mask = dup_const(MO_8, 0xff >> c);
2268    tcg_gen_shri_i64(d, a, c);
2269    tcg_gen_andi_i64(d, d, mask);
2270}
2271
2272void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2273{
2274    uint64_t mask = dup_const(MO_16, 0xffff >> c);
2275    tcg_gen_shri_i64(d, a, c);
2276    tcg_gen_andi_i64(d, d, mask);
2277}
2278
2279void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs,
2280                       int64_t shift, uint32_t oprsz, uint32_t maxsz)
2281{
2282    static const GVecGen2i g[4] = {
2283        { .fni8 = tcg_gen_vec_shr8i_i64,
2284          .fniv = tcg_gen_shri_vec,
2285          .fno = gen_helper_gvec_shr8i,
2286          .opc = INDEX_op_shri_vec,
2287          .vece = MO_8 },
2288        { .fni8 = tcg_gen_vec_shr16i_i64,
2289          .fniv = tcg_gen_shri_vec,
2290          .fno = gen_helper_gvec_shr16i,
2291          .opc = INDEX_op_shri_vec,
2292          .vece = MO_16 },
2293        { .fni4 = tcg_gen_shri_i32,
2294          .fniv = tcg_gen_shri_vec,
2295          .fno = gen_helper_gvec_shr32i,
2296          .opc = INDEX_op_shri_vec,
2297          .vece = MO_32 },
2298        { .fni8 = tcg_gen_shri_i64,
2299          .fniv = tcg_gen_shri_vec,
2300          .fno = gen_helper_gvec_shr64i,
2301          .opc = INDEX_op_shri_vec,
2302          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2303          .vece = MO_64 },
2304    };
2305
2306    tcg_debug_assert(vece <= MO_64);
2307    tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2308    if (shift == 0) {
2309        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2310    } else {
2311        tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2312    }
2313}
2314
2315void tcg_gen_vec_sar8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2316{
2317    uint64_t s_mask = dup_const(MO_8, 0x80 >> c);
2318    uint64_t c_mask = dup_const(MO_8, 0xff >> c);
2319    TCGv_i64 s = tcg_temp_new_i64();
2320
2321    tcg_gen_shri_i64(d, a, c);
2322    tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
2323    tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
2324    tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign  */
2325    tcg_gen_or_i64(d, d, s);         /* include sign extension */
2326    tcg_temp_free_i64(s);
2327}
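
/* Editorial note on the multiply above: after the logical shift each
   lane's sign bit sits at bit (7 - c), and
   (0x80 >> c) * ((2 << c) - 2) == 0x100 - (0x100 >> c), i.e. ones in
   bits (8 - c) .. 7.  The product therefore fills every bit above the
   shifted sign with a copy of it, and cannot spill into the next lane
   since it stays below 0x100.  The MO_16 variant below is identical
   with 0x8000 and 0xffff.  */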
2328
2329void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2330{
2331    uint64_t s_mask = dup_const(MO_16, 0x8000 >> c);
2332    uint64_t c_mask = dup_const(MO_16, 0xffff >> c);
2333    TCGv_i64 s = tcg_temp_new_i64();
2334
2335    tcg_gen_shri_i64(d, a, c);
2336    tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
2337    tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign  */
2338    tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
2339    tcg_gen_or_i64(d, d, s);         /* include sign extension */
2340    tcg_temp_free_i64(s);
2341}
2342
2343void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs,
2344                       int64_t shift, uint32_t oprsz, uint32_t maxsz)
2345{
2346    static const GVecGen2i g[4] = {
2347        { .fni8 = tcg_gen_vec_sar8i_i64,
2348          .fniv = tcg_gen_sari_vec,
2349          .fno = gen_helper_gvec_sar8i,
2350          .opc = INDEX_op_sari_vec,
2351          .vece = MO_8 },
2352        { .fni8 = tcg_gen_vec_sar16i_i64,
2353          .fniv = tcg_gen_sari_vec,
2354          .fno = gen_helper_gvec_sar16i,
2355          .opc = INDEX_op_sari_vec,
2356          .vece = MO_16 },
2357        { .fni4 = tcg_gen_sari_i32,
2358          .fniv = tcg_gen_sari_vec,
2359          .fno = gen_helper_gvec_sar32i,
2360          .opc = INDEX_op_sari_vec,
2361          .vece = MO_32 },
2362        { .fni8 = tcg_gen_sari_i64,
2363          .fniv = tcg_gen_sari_vec,
2364          .fno = gen_helper_gvec_sar64i,
2365          .opc = INDEX_op_sari_vec,
2366          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2367          .vece = MO_64 },
2368    };
2369
2370    tcg_debug_assert(vece <= MO_64);
2371    tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2372    if (shift == 0) {
2373        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2374    } else {
2375        tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2376    }
2377}
2378
2379/* Expand OPRSZ bytes worth of comparisons using i32 elements.  */
2380static void expand_cmp_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
2381                           uint32_t oprsz, TCGCond cond)
2382{
2383    TCGv_i32 t0 = tcg_temp_new_i32();
2384    TCGv_i32 t1 = tcg_temp_new_i32();
2385    uint32_t i;
2386
2387    for (i = 0; i < oprsz; i += 4) {
2388        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
2389        tcg_gen_ld_i32(t1, cpu_env, bofs + i);
2390        tcg_gen_setcond_i32(cond, t0, t0, t1);  /* 0 or 1 */
2391        tcg_gen_neg_i32(t0, t0);                /* 0 or -1 lane mask */
2392        tcg_gen_st_i32(t0, cpu_env, dofs + i);
2393    }
2394    tcg_temp_free_i32(t1);
2395    tcg_temp_free_i32(t0);
2396}
2397
2398static void expand_cmp_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
2399                           uint32_t oprsz, TCGCond cond)
2400{
2401    TCGv_i64 t0 = tcg_temp_new_i64();
2402    TCGv_i64 t1 = tcg_temp_new_i64();
2403    uint32_t i;
2404
2405    for (i = 0; i < oprsz; i += 8) {
2406        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
2407        tcg_gen_ld_i64(t1, cpu_env, bofs + i);
2408        tcg_gen_setcond_i64(cond, t0, t0, t1);
2409        tcg_gen_neg_i64(t0, t0);
2410        tcg_gen_st_i64(t0, cpu_env, dofs + i);
2411    }
2412    tcg_temp_free_i64(t1);
2413    tcg_temp_free_i64(t0);
2414}
2415
2416static void expand_cmp_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
2417                           uint32_t bofs, uint32_t oprsz, uint32_t tysz,
2418                           TCGType type, TCGCond cond)
2419{
2420    TCGv_vec t0 = tcg_temp_new_vec(type);
2421    TCGv_vec t1 = tcg_temp_new_vec(type);
2422    uint32_t i;
2423
2424    for (i = 0; i < oprsz; i += tysz) {
2425        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
2426        tcg_gen_ld_vec(t1, cpu_env, bofs + i);
2427        tcg_gen_cmp_vec(cond, vece, t0, t0, t1);
2428        tcg_gen_st_vec(t0, cpu_env, dofs + i);
2429    }
2430    tcg_temp_free_vec(t1);
2431    tcg_temp_free_vec(t0);
2432}
2433
2434void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs,
2435                      uint32_t aofs, uint32_t bofs,
2436                      uint32_t oprsz, uint32_t maxsz)
2437{
2438    static gen_helper_gvec_3 * const eq_fn[4] = {
2439        gen_helper_gvec_eq8, gen_helper_gvec_eq16,
2440        gen_helper_gvec_eq32, gen_helper_gvec_eq64
2441    };
2442    static gen_helper_gvec_3 * const ne_fn[4] = {
2443        gen_helper_gvec_ne8, gen_helper_gvec_ne16,
2444        gen_helper_gvec_ne32, gen_helper_gvec_ne64
2445    };
2446    static gen_helper_gvec_3 * const lt_fn[4] = {
2447        gen_helper_gvec_lt8, gen_helper_gvec_lt16,
2448        gen_helper_gvec_lt32, gen_helper_gvec_lt64
2449    };
2450    static gen_helper_gvec_3 * const le_fn[4] = {
2451        gen_helper_gvec_le8, gen_helper_gvec_le16,
2452        gen_helper_gvec_le32, gen_helper_gvec_le64
2453    };
2454    static gen_helper_gvec_3 * const ltu_fn[4] = {
2455        gen_helper_gvec_ltu8, gen_helper_gvec_ltu16,
2456        gen_helper_gvec_ltu32, gen_helper_gvec_ltu64
2457    };
2458    static gen_helper_gvec_3 * const leu_fn[4] = {
2459        gen_helper_gvec_leu8, gen_helper_gvec_leu16,
2460        gen_helper_gvec_leu32, gen_helper_gvec_leu64
2461    };
2462    static gen_helper_gvec_3 * const * const fns[16] = {
2463        [TCG_COND_EQ] = eq_fn,
2464        [TCG_COND_NE] = ne_fn,
2465        [TCG_COND_LT] = lt_fn,
2466        [TCG_COND_LE] = le_fn,
2467        [TCG_COND_LTU] = ltu_fn,
2468        [TCG_COND_LEU] = leu_fn,
2469    };
2470    TCGType type;
2471    uint32_t some;
2472
2473    check_size_align(oprsz, maxsz, dofs | aofs | bofs);
2474    check_overlap_3(dofs, aofs, bofs, maxsz);
2475
2476    if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) {
        /* ALWAYS yields all ones, NEVER all zeros.  */
2477        do_dup(MO_8, dofs, oprsz, maxsz,
2478               NULL, NULL, -(cond == TCG_COND_ALWAYS));
2479        return;
2480    }
2481
2482    /* Implement inline with a vector type, if possible.
2483     * Prefer integer when 64-bit host and 64-bit comparison.
2484     */
2485    type = choose_vector_type(INDEX_op_cmp_vec, vece, oprsz,
2486                              TCG_TARGET_REG_BITS == 64 && vece == MO_64);
2487    switch (type) {
2488    case TCG_TYPE_V256:
2489        /* Recall that ARM SVE allows vector sizes that are not a
2490         * power of 2, but always a multiple of 16.  The intent is
2491         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
2492         */
2493        some = QEMU_ALIGN_DOWN(oprsz, 32);
2494        expand_cmp_vec(vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, cond);
2495        if (some == oprsz) {
2496            break;
2497        }
2498        dofs += some;
2499        aofs += some;
2500        bofs += some;
2501        oprsz -= some;
2502        maxsz -= some;
2503        /* fallthru */
2504    case TCG_TYPE_V128:
2505        expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, cond);
2506        break;
2507    case TCG_TYPE_V64:
2508        expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, cond);
2509        break;
2510
2511    case 0:
2512        if (vece == MO_64 && check_size_impl(oprsz, 8)) {
2513            expand_cmp_i64(dofs, aofs, bofs, oprsz, cond);
2514        } else if (vece == MO_32 && check_size_impl(oprsz, 4)) {
2515            expand_cmp_i32(dofs, aofs, bofs, oprsz, cond);
2516        } else {
2517            gen_helper_gvec_3 * const *fn = fns[cond];
2518
            /* No helper entry for GT/GE/GTU/GEU: swap the operands
               and use the mirrored condition instead.  */
2519            if (fn == NULL) {
2520                uint32_t tmp;
2521                tmp = aofs, aofs = bofs, bofs = tmp;
2522                cond = tcg_swap_cond(cond);
2523                fn = fns[cond];
2524                assert(fn != NULL);
2525            }
2526            tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, 0, fn[vece]);
2527            return;
2528        }
2529        break;
2530
2531    default:
2532        g_assert_not_reached();
2533    }
2534
2535    if (oprsz < maxsz) {
2536        expand_clr(dofs + oprsz, maxsz - oprsz);
2537    }
2538}
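
/* Editorial usage sketch: the usual comparisons are all accepted; in
   the out-of-line fallback, conditions without a helper table entry
   (GT, GE, GTU, GEU) are handled by swapping the operands and
   mirroring the condition, so

       tcg_gen_gvec_cmp(TCG_COND_GT, MO_16, dofs, aofs, bofs, 16, 16);

   ends up calling gen_helper_gvec_lt16 with aofs and bofs
   exchanged.  */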
2539