qemu/tcg/tcg-op-gvec.c
/*
 * Generic vector operation expansion
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "qemu-common.h"
#include "tcg.h"
#include "tcg-op.h"
#include "tcg-op-gvec.h"
#include "tcg-gvec-desc.h"

#define MAX_UNROLL  4

/* Verify vector size and alignment rules.  OFS should be the OR of all
   of the operand offsets so that we can check them all at once.  */
static void check_size_align(uint32_t oprsz, uint32_t maxsz, uint32_t ofs)
{
    uint32_t opr_align = oprsz >= 16 ? 15 : 7;
    uint32_t max_align = maxsz >= 16 || oprsz >= 16 ? 15 : 7;
    tcg_debug_assert(oprsz > 0);
    tcg_debug_assert(oprsz <= maxsz);
    tcg_debug_assert((oprsz & opr_align) == 0);
    tcg_debug_assert((maxsz & max_align) == 0);
    tcg_debug_assert((ofs & max_align) == 0);
}

/* Verify vector overlap rules for two operands.  */
static void check_overlap_2(uint32_t d, uint32_t a, uint32_t s)
{
    tcg_debug_assert(d == a || d + s <= a || a + s <= d);
}

/* Verify vector overlap rules for three operands.  */
static void check_overlap_3(uint32_t d, uint32_t a, uint32_t b, uint32_t s)
{
    check_overlap_2(d, a, s);
    check_overlap_2(d, b, s);
    check_overlap_2(a, b, s);
}

/* Verify vector overlap rules for four operands.  */
static void check_overlap_4(uint32_t d, uint32_t a, uint32_t b,
                            uint32_t c, uint32_t s)
{
    check_overlap_2(d, a, s);
    check_overlap_2(d, b, s);
    check_overlap_2(d, c, s);
    check_overlap_2(a, b, s);
    check_overlap_2(a, c, s);
    check_overlap_2(b, c, s);
}

/* Create a descriptor from components.  */
uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data)
{
    uint32_t desc = 0;

    assert(oprsz % 8 == 0 && oprsz <= (8 << SIMD_OPRSZ_BITS));
    assert(maxsz % 8 == 0 && maxsz <= (8 << SIMD_MAXSZ_BITS));
    assert(data == sextract32(data, 0, SIMD_DATA_BITS));

    oprsz = (oprsz / 8) - 1;
    maxsz = (maxsz / 8) - 1;
    desc = deposit32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS, oprsz);
    desc = deposit32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS, maxsz);
    desc = deposit32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS, data);

    return desc;
}

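/*
 * Worked example (a sketch, assuming the field layout in tcg-gvec-desc.h:
 * oprsz in bits [4:0], maxsz in bits [9:5], data from bit 10 up):
 * simd_desc(16, 32, 3) stores 16/8 - 1 = 1 in the oprsz field,
 * 32/8 - 1 = 3 in the maxsz field, and 3 in the data field, i.e.
 * 0x1 | (0x3 << 5) | (0x3 << 10) == 0xc61.  The helper side recovers
 * the sizes with simd_oprsz()/simd_maxsz() and the data with simd_data().
 */
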
/* Generate a call to a gvec-style helper with two vector operands.  */
void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
                        uint32_t oprsz, uint32_t maxsz, int32_t data,
                        gen_helper_gvec_2 *fn)
{
    TCGv_ptr a0, a1;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);

    fn(a0, a1, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_i32(desc);
}

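/*
 * For reference, a minimal sketch of what a matching out-of-line helper
 * looks like on the other side of this call (hypothetical helper name;
 * the real helpers live in tcg-runtime-gvec.c).  The descriptor built
 * above arrives as the final argument:
 *
 *     void HELPER(gvec_foo)(void *d, void *a, uint32_t desc)
 *     {
 *         intptr_t oprsz = simd_oprsz(desc);
 *         intptr_t i;
 *
 *         for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
 *             *(uint64_t *)(d + i) = ~*(uint64_t *)(a + i);
 *         }
 *     }
 *
 * A real helper would also clear the bytes between simd_oprsz(desc)
 * and simd_maxsz(desc), as the helpers in tcg-runtime-gvec.c do.
 */
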
/* Generate a call to a gvec-style helper with two vector operands
   and one scalar operand.  */
void tcg_gen_gvec_2i_ool(uint32_t dofs, uint32_t aofs, TCGv_i64 c,
                         uint32_t oprsz, uint32_t maxsz, int32_t data,
                         gen_helper_gvec_2i *fn)
{
    TCGv_ptr a0, a1;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);

    fn(a0, a1, c, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with three vector operands.  */
void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t oprsz, uint32_t maxsz, int32_t data,
                        gen_helper_gvec_3 *fn)
{
    TCGv_ptr a0, a1, a2;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);

    fn(a0, a1, a2, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with four vector operands.  */
void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_4 *fn)
{
    TCGv_ptr a0, a1, a2, a3;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();
    a3 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);
    tcg_gen_addi_ptr(a3, cpu_env, cofs);

    fn(a0, a1, a2, a3, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with five vector operands.  */
void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, uint32_t xofs, uint32_t oprsz,
                        uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn)
{
    TCGv_ptr a0, a1, a2, a3, a4;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();
    a3 = tcg_temp_new_ptr();
    a4 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);
    tcg_gen_addi_ptr(a3, cpu_env, cofs);
    tcg_gen_addi_ptr(a4, cpu_env, xofs);

    fn(a0, a1, a2, a3, a4, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_ptr(a4);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with two vector operands
   and an extra pointer operand.  */
void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_2_ptr *fn)
{
    TCGv_ptr a0, a1;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);

    fn(a0, a1, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with three vector operands
   and an extra pointer operand.  */
void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_3_ptr *fn)
{
    TCGv_ptr a0, a1, a2;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);

    fn(a0, a1, a2, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with four vector operands
   and an extra pointer operand.  */
void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz,
                        uint32_t maxsz, int32_t data,
                        gen_helper_gvec_4_ptr *fn)
{
    TCGv_ptr a0, a1, a2, a3;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();
    a3 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);
    tcg_gen_addi_ptr(a3, cpu_env, cofs);

    fn(a0, a1, a2, a3, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_i32(desc);
}

/* Return true if we want to implement something of OPRSZ bytes
   in units of LNSZ.  This limits the expansion of inline code.  */
static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
{
    if (oprsz % lnsz == 0) {
        uint32_t lnct = oprsz / lnsz;
        return lnct >= 1 && lnct <= MAX_UNROLL;
    }
    return false;
}

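/*
 * For example, with MAX_UNROLL == 4: oprsz == 32 passes for lnsz == 8
 * (four lines) or lnsz == 16 (two lines), but oprsz == 64 with lnsz == 8
 * would need eight lines and is rejected, steering the caller toward a
 * larger line size or an out-of-line helper.
 */
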
static void expand_clr(uint32_t dofs, uint32_t maxsz);

/* Duplicate C as per VECE.  */
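/* Note: the parentheses around the function name below keep the
   dup_const() macro in tcg-op-gvec.h from expanding here; this defines
   the out-of-line fallback the macro uses for non-constant arguments.
   For example, dup_const(MO_16, 0x1234) == 0x1234123412341234ull.  */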
uint64_t (dup_const)(unsigned vece, uint64_t c)
{
    switch (vece) {
    case MO_8:
        return 0x0101010101010101ull * (uint8_t)c;
    case MO_16:
        return 0x0001000100010001ull * (uint16_t)c;
    case MO_32:
        return 0x0000000100000001ull * (uint32_t)c;
    case MO_64:
        return c;
    default:
        g_assert_not_reached();
    }
}

/* Duplicate IN into OUT as per VECE.  */
static void gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in)
{
    switch (vece) {
    case MO_8:
        tcg_gen_ext8u_i32(out, in);
        tcg_gen_muli_i32(out, out, 0x01010101);
        break;
    case MO_16:
        tcg_gen_deposit_i32(out, in, in, 16, 16);
        break;
    case MO_32:
        tcg_gen_mov_i32(out, in);
        break;
    default:
        g_assert_not_reached();
    }
}

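/*
 * E.g. for MO_16, the deposit above copies the low half of IN into the
 * high half: in = 0x0000abcd yields out = 0xabcdabcd.  The MO_8 case
 * instead multiplies the zero-extended byte by 0x01010101, replicating
 * it into all four byte lanes.
 */
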
static void gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in)
{
    switch (vece) {
    case MO_8:
        tcg_gen_ext8u_i64(out, in);
        tcg_gen_muli_i64(out, out, 0x0101010101010101ull);
        break;
    case MO_16:
        tcg_gen_ext16u_i64(out, in);
        tcg_gen_muli_i64(out, out, 0x0001000100010001ull);
        break;
    case MO_32:
        tcg_gen_deposit_i64(out, in, in, 32, 32);
        break;
    case MO_64:
        tcg_gen_mov_i64(out, in);
        break;
    default:
        g_assert_not_reached();
    }
}

/* Select a supported vector type for implementing an operation on SIZE
 * bytes.  If OP is 0, assume that the real operation to be performed is
 * required by all backends.  Otherwise, make sure that OP can be performed
 * on elements of size VECE in the selected type.  Do not select V64 if
 * PREFER_I64 is true.  Return 0 if no vector type is selected.
 */
static TCGType choose_vector_type(TCGOpcode op, unsigned vece, uint32_t size,
                                  bool prefer_i64)
{
    if (TCG_TARGET_HAS_v256 && check_size_impl(size, 32)) {
        if (op == 0) {
            return TCG_TYPE_V256;
        }
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         * It is hard to imagine a case in which v256 is supported
         * but v128 is not, but check anyway.
         */
        if (tcg_can_emit_vec_op(op, TCG_TYPE_V256, vece)
            && (size % 32 == 0
                || tcg_can_emit_vec_op(op, TCG_TYPE_V128, vece))) {
            return TCG_TYPE_V256;
        }
    }
    if (TCG_TARGET_HAS_v128 && check_size_impl(size, 16)
        && (op == 0 || tcg_can_emit_vec_op(op, TCG_TYPE_V128, vece))) {
        return TCG_TYPE_V128;
    }
    if (TCG_TARGET_HAS_v64 && !prefer_i64 && check_size_impl(size, 8)
        && (op == 0 || tcg_can_emit_vec_op(op, TCG_TYPE_V64, vece))) {
        return TCG_TYPE_V64;
    }
    return 0;
}

/* Set OPRSZ bytes at DOFS to replications of IN_32, IN_64 or IN_C.
 * Only one of IN_32 or IN_64 may be set;
 * IN_C is used if IN_32 and IN_64 are unset.
 */
static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz,
                   uint32_t maxsz, TCGv_i32 in_32, TCGv_i64 in_64,
                   uint64_t in_c)
{
    TCGType type;
    TCGv_i64 t_64;
    TCGv_i32 t_32, t_desc;
    TCGv_ptr t_ptr;
    uint32_t i;

    assert(vece <= (in_32 ? MO_32 : MO_64));
    assert(in_32 == NULL || in_64 == NULL);

    /* If we're storing 0, expand oprsz to maxsz.  */
    if (in_32 == NULL && in_64 == NULL) {
        in_c = dup_const(vece, in_c);
        if (in_c == 0) {
            oprsz = maxsz;
        }
    }

    /* Implement inline with a vector type, if possible.
     * Prefer integer when 64-bit host and no variable dup.
     */
    type = choose_vector_type(0, vece, oprsz,
                              (TCG_TARGET_REG_BITS == 64 && in_32 == NULL
                               && (in_64 == NULL || vece == MO_64)));
    if (type != 0) {
        TCGv_vec t_vec = tcg_temp_new_vec(type);

        if (in_32) {
            tcg_gen_dup_i32_vec(vece, t_vec, in_32);
        } else if (in_64) {
            tcg_gen_dup_i64_vec(vece, t_vec, in_64);
        } else {
            switch (vece) {
            case MO_8:
                tcg_gen_dup8i_vec(t_vec, in_c);
                break;
            case MO_16:
                tcg_gen_dup16i_vec(t_vec, in_c);
                break;
            case MO_32:
                tcg_gen_dup32i_vec(t_vec, in_c);
                break;
            default:
                tcg_gen_dup64i_vec(t_vec, in_c);
                break;
            }
        }

        i = 0;
        switch (type) {
        case TCG_TYPE_V256:
            /* Recall that ARM SVE allows vector sizes that are not a
             * power of 2, but always a multiple of 16.  The intent is
             * that e.g. size == 80 would be expanded with 2x32 + 1x16.
             */
            for (; i + 32 <= oprsz; i += 32) {
                tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V256);
            }
            /* fallthru */
        case TCG_TYPE_V128:
            for (; i + 16 <= oprsz; i += 16) {
                tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V128);
            }
            break;
        case TCG_TYPE_V64:
            for (; i < oprsz; i += 8) {
                tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
            }
            break;
        default:
            g_assert_not_reached();
        }

        tcg_temp_free_vec(t_vec);
        goto done;
    }

    /* Otherwise, inline with an integer type, unless "large".  */
    if (check_size_impl(oprsz, TCG_TARGET_REG_BITS / 8)) {
        t_64 = NULL;
        t_32 = NULL;

        if (in_32) {
            /* We are given a 32-bit variable input.  For a 64-bit host,
               use a 64-bit operation unless the 32-bit operation would
               be simple enough.  */
            if (TCG_TARGET_REG_BITS == 64
                && (vece != MO_32 || !check_size_impl(oprsz, 4))) {
                t_64 = tcg_temp_new_i64();
                tcg_gen_extu_i32_i64(t_64, in_32);
                gen_dup_i64(vece, t_64, t_64);
            } else {
                t_32 = tcg_temp_new_i32();
                gen_dup_i32(vece, t_32, in_32);
            }
        } else if (in_64) {
            /* We are given a 64-bit variable input.  */
            t_64 = tcg_temp_new_i64();
            gen_dup_i64(vece, t_64, in_64);
        } else {
            /* We are given a constant input.  */
            /* For 64-bit hosts, use 64-bit constants for "simple" constants
               or when we'd need too many 32-bit stores, or when a 64-bit
               constant is really required.  */
            if (vece == MO_64
                || (TCG_TARGET_REG_BITS == 64
                    && (in_c == 0 || in_c == -1
                        || !check_size_impl(oprsz, 4)))) {
                t_64 = tcg_const_i64(in_c);
            } else {
                t_32 = tcg_const_i32(in_c);
            }
        }

        /* Implement inline if we picked an implementation size above.  */
        if (t_32) {
            for (i = 0; i < oprsz; i += 4) {
                tcg_gen_st_i32(t_32, cpu_env, dofs + i);
            }
            tcg_temp_free_i32(t_32);
            goto done;
        }
        if (t_64) {
            for (i = 0; i < oprsz; i += 8) {
                tcg_gen_st_i64(t_64, cpu_env, dofs + i);
            }
            tcg_temp_free_i64(t_64);
            goto done;
        }
    }

    /* Otherwise implement out of line.  */
    t_ptr = tcg_temp_new_ptr();
    tcg_gen_addi_ptr(t_ptr, cpu_env, dofs);
    t_desc = tcg_const_i32(simd_desc(oprsz, maxsz, 0));

    if (vece == MO_64) {
        if (in_64) {
            gen_helper_gvec_dup64(t_ptr, t_desc, in_64);
        } else {
            t_64 = tcg_const_i64(in_c);
            gen_helper_gvec_dup64(t_ptr, t_desc, t_64);
            tcg_temp_free_i64(t_64);
        }
    } else {
        typedef void dup_fn(TCGv_ptr, TCGv_i32, TCGv_i32);
        static dup_fn * const fns[3] = {
            gen_helper_gvec_dup8,
            gen_helper_gvec_dup16,
            gen_helper_gvec_dup32
        };

        if (in_32) {
            fns[vece](t_ptr, t_desc, in_32);
        } else {
            t_32 = tcg_temp_new_i32();
            if (in_64) {
                tcg_gen_extrl_i64_i32(t_32, in_64);
            } else if (vece == MO_8) {
                tcg_gen_movi_i32(t_32, in_c & 0xff);
            } else if (vece == MO_16) {
                tcg_gen_movi_i32(t_32, in_c & 0xffff);
            } else {
                tcg_gen_movi_i32(t_32, in_c);
            }
            fns[vece](t_ptr, t_desc, t_32);
            tcg_temp_free_i32(t_32);
        }
    }

    tcg_temp_free_ptr(t_ptr);
    tcg_temp_free_i32(t_desc);
    return;

 done:
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Likewise, but with zero.  */
static void expand_clr(uint32_t dofs, uint32_t maxsz)
{
    do_dup(MO_8, dofs, maxsz, maxsz, NULL, NULL, 0);
}

/* Expand OPRSZ bytes worth of two-operand operations using i32 elements.  */
static void expand_2_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                         void (*fni)(TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        fni(t0, t0);
        tcg_gen_st_i32(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
}

static void expand_2i_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          int32_t c, bool load_dest,
                          void (*fni)(TCGv_i32, TCGv_i32, int32_t))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_i32(t1, cpu_env, dofs + i);
        }
        fni(t1, t0, c);
        tcg_gen_st_i32(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
    tcg_temp_free_i32(t1);
}

static void expand_2s_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          TCGv_i32 c, bool scalar_first,
                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        if (scalar_first) {
            fni(t1, c, t0);
        } else {
            fni(t1, t0, c);
        }
        tcg_gen_st_i32(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
    tcg_temp_free_i32(t1);
}

/* Expand OPRSZ bytes worth of three-operand operations using i32 elements.  */
static void expand_3_i32(uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t oprsz, bool load_dest,
                         void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        tcg_gen_ld_i32(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i32(t2, cpu_env, dofs + i);
        }
        fni(t2, t0, t1);
        tcg_gen_st_i32(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t2);
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t0);
}

/* Expand OPRSZ bytes worth of four-operand operations using i32 elements.  */
static void expand_4_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                         uint32_t cofs, uint32_t oprsz,
                         void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();
    TCGv_i32 t3 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t1, cpu_env, aofs + i);
        tcg_gen_ld_i32(t2, cpu_env, bofs + i);
        tcg_gen_ld_i32(t3, cpu_env, cofs + i);
        fni(t0, t1, t2, t3);
        tcg_gen_st_i32(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t3);
    tcg_temp_free_i32(t2);
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t0);
}

/* Expand OPRSZ bytes worth of two-operand operations using i64 elements.  */
static void expand_2_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                         void (*fni)(TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        fni(t0, t0);
        tcg_gen_st_i64(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
}

static void expand_2i_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          int64_t c, bool load_dest,
                          void (*fni)(TCGv_i64, TCGv_i64, int64_t))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t1, cpu_env, dofs + i);
        }
        fni(t1, t0, c);
        tcg_gen_st_i64(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
}

static void expand_2s_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          TCGv_i64 c, bool scalar_first,
                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        if (scalar_first) {
            fni(t1, c, t0);
        } else {
            fni(t1, t0, c);
        }
        tcg_gen_st_i64(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
}

/* Expand OPRSZ bytes worth of three-operand operations using i64 elements.  */
static void expand_3_i64(uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t oprsz, bool load_dest,
                         void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        tcg_gen_ld_i64(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t2, cpu_env, dofs + i);
        }
        fni(t2, t0, t1);
        tcg_gen_st_i64(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t0);
}

/* Expand OPRSZ bytes worth of four-operand operations using i64 elements.  */
static void expand_4_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                         uint32_t cofs, uint32_t oprsz,
                         void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    TCGv_i64 t3 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t1, cpu_env, aofs + i);
        tcg_gen_ld_i64(t2, cpu_env, bofs + i);
        tcg_gen_ld_i64(t3, cpu_env, cofs + i);
        fni(t0, t1, t2, t3);
        tcg_gen_st_i64(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t3);
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t0);
}

/* Expand OPRSZ bytes worth of two-operand operations using host vectors.  */
static void expand_2_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                         uint32_t oprsz, uint32_t tysz, TCGType type,
                         void (*fni)(unsigned, TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        fni(vece, t0, t0);
        tcg_gen_st_vec(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t0);
}

/* Expand OPRSZ bytes worth of two-vector operands and an immediate operand
   using host vectors.  */
static void expand_2i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t oprsz, uint32_t tysz, TCGType type,
                          int64_t c, bool load_dest,
                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, int64_t))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_vec(t1, cpu_env, dofs + i);
        }
        fni(vece, t1, t0, c);
        tcg_gen_st_vec(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t0);
    tcg_temp_free_vec(t1);
}

static void expand_2s_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t oprsz, uint32_t tysz, TCGType type,
                          TCGv_vec c, bool scalar_first,
                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        if (scalar_first) {
            fni(vece, t1, c, t0);
        } else {
            fni(vece, t1, t0, c);
        }
        tcg_gen_st_vec(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t0);
    tcg_temp_free_vec(t1);
}

/* Expand OPRSZ bytes worth of three-operand operations using host vectors.  */
static void expand_3_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t oprsz,
                         uint32_t tysz, TCGType type, bool load_dest,
                         void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    TCGv_vec t2 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        tcg_gen_ld_vec(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_vec(t2, cpu_env, dofs + i);
        }
        fni(vece, t2, t0, t1);
        tcg_gen_st_vec(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t2);
    tcg_temp_free_vec(t1);
    tcg_temp_free_vec(t0);
}

/* Expand OPRSZ bytes worth of four-operand operations using host vectors.  */
static void expand_4_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t cofs, uint32_t oprsz,
                         uint32_t tysz, TCGType type,
                         void (*fni)(unsigned, TCGv_vec, TCGv_vec,
                                     TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    TCGv_vec t2 = tcg_temp_new_vec(type);
    TCGv_vec t3 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t1, cpu_env, aofs + i);
        tcg_gen_ld_vec(t2, cpu_env, bofs + i);
        tcg_gen_ld_vec(t3, cpu_env, cofs + i);
        fni(vece, t0, t1, t2, t3);
        tcg_gen_st_vec(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t3);
    tcg_temp_free_vec(t2);
    tcg_temp_free_vec(t1);
    tcg_temp_free_vec(t0);
}

/* Expand a vector two-operand operation.  */
void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen2 *g)
{
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs);
    check_overlap_2(dofs, aofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_2_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256, g->fniv);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_2_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_2_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_2_i64(dofs, aofs, oprsz, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_2_i32(dofs, aofs, oprsz, g->fni4);
        } else {
            assert(g->fno != NULL);
            tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, g->data, g->fno);
            return;
        }
        break;

    default:
        g_assert_not_reached();
    }

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Expand a vector operation with two vectors and an immediate.  */
void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                     uint32_t maxsz, int64_t c, const GVecGen2i *g)
{
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs);
    check_overlap_2(dofs, aofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_2i_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
                      c, g->load_dest, g->fniv);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_2i_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
                      c, g->load_dest, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_2i_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
                      c, g->load_dest, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_2i_i64(dofs, aofs, oprsz, c, g->load_dest, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_2i_i32(dofs, aofs, oprsz, c, g->load_dest, g->fni4);
        } else {
            if (g->fno) {
                tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, c, g->fno);
            } else {
                TCGv_i64 tcg_c = tcg_const_i64(c);
                tcg_gen_gvec_2i_ool(dofs, aofs, tcg_c, oprsz,
                                    maxsz, c, g->fnoi);
                tcg_temp_free_i64(tcg_c);
            }
            return;
        }
        break;

    default:
        g_assert_not_reached();
    }

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Expand a vector operation with two vectors and a scalar.  */
void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                     uint32_t maxsz, TCGv_i64 c, const GVecGen2s *g)
{
    TCGType type;

    check_size_align(oprsz, maxsz, dofs | aofs);
    check_overlap_2(dofs, aofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opc, g->vece, oprsz, g->prefer_i64);
    }
    if (type != 0) {
        TCGv_vec t_vec = tcg_temp_new_vec(type);
        uint32_t some;

        tcg_gen_dup_i64_vec(g->vece, t_vec, c);

        switch (type) {
        case TCG_TYPE_V256:
            /* Recall that ARM SVE allows vector sizes that are not a
             * power of 2, but always a multiple of 16.  The intent is
             * that e.g. size == 80 would be expanded with 2x32 + 1x16.
             */
            some = QEMU_ALIGN_DOWN(oprsz, 32);
            expand_2s_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
                          t_vec, g->scalar_first, g->fniv);
            if (some == oprsz) {
                break;
            }
            dofs += some;
            aofs += some;
            oprsz -= some;
            maxsz -= some;
            /* fallthru */

        case TCG_TYPE_V128:
            expand_2s_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
                          t_vec, g->scalar_first, g->fniv);
            break;

        case TCG_TYPE_V64:
            expand_2s_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
                          t_vec, g->scalar_first, g->fniv);
            break;

        default:
            g_assert_not_reached();
        }
        tcg_temp_free_vec(t_vec);
    } else if (g->fni8 && check_size_impl(oprsz, 8)) {
        TCGv_i64 t64 = tcg_temp_new_i64();

        gen_dup_i64(g->vece, t64, c);
        expand_2s_i64(dofs, aofs, oprsz, t64, g->scalar_first, g->fni8);
        tcg_temp_free_i64(t64);
    } else if (g->fni4 && check_size_impl(oprsz, 4)) {
        TCGv_i32 t32 = tcg_temp_new_i32();

        tcg_gen_extrl_i64_i32(t32, c);
        gen_dup_i32(g->vece, t32, t32);
        expand_2s_i32(dofs, aofs, oprsz, t32, g->scalar_first, g->fni4);
        tcg_temp_free_i32(t32);
    } else {
        tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, 0, g->fno);
        return;
    }

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Expand a vector three-operand operation.  */
void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g)
{
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs | bofs);
    check_overlap_3(dofs, aofs, bofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_3_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
                     g->load_dest, g->fniv);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        bofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
                     g->load_dest, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
                     g->load_dest, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_3_i64(dofs, aofs, bofs, oprsz, g->load_dest, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_3_i32(dofs, aofs, bofs, oprsz, g->load_dest, g->fni4);
        } else {
            assert(g->fno != NULL);
            tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz,
                               maxsz, g->data, g->fno);
            return;
        }
        break;

    default:
        g_assert_not_reached();
    }

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Expand a vector four-operand operation.  */
void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen4 *g)
{
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs);
    check_overlap_4(dofs, aofs, bofs, cofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, some,
                     32, TCG_TYPE_V256, g->fniv);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        bofs += some;
        cofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
                     16, TCG_TYPE_V128, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
                     8, TCG_TYPE_V64, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_4_i64(dofs, aofs, bofs, cofs, oprsz, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_4_i32(dofs, aofs, bofs, cofs, oprsz, g->fni4);
        } else {
            assert(g->fno != NULL);
            tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs,
                               oprsz, maxsz, g->data, g->fno);
            return;
        }
        break;

    default:
        g_assert_not_reached();
    }

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/*
 * Expand specific vector operations.
 */

static void vec_mov2(unsigned vece, TCGv_vec a, TCGv_vec b)
{
    tcg_gen_mov_vec(a, b);
}

void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2 g = {
        .fni8 = tcg_gen_mov_i64,
        .fniv = vec_mov2,
        .fno = gen_helper_gvec_mov,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
    if (dofs != aofs) {
        tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
    } else {
        check_size_align(oprsz, maxsz, dofs);
        if (oprsz < maxsz) {
            expand_clr(dofs + oprsz, maxsz - oprsz);
        }
    }
}

void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t oprsz,
                          uint32_t maxsz, TCGv_i32 in)
{
    check_size_align(oprsz, maxsz, dofs);
    tcg_debug_assert(vece <= MO_32);
    do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
}

void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t oprsz,
                          uint32_t maxsz, TCGv_i64 in)
{
    check_size_align(oprsz, maxsz, dofs);
    tcg_debug_assert(vece <= MO_64);
    do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
}

void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t oprsz, uint32_t maxsz)
{
    if (vece <= MO_32) {
        TCGv_i32 in = tcg_temp_new_i32();
        switch (vece) {
        case MO_8:
            tcg_gen_ld8u_i32(in, cpu_env, aofs);
            break;
        case MO_16:
            tcg_gen_ld16u_i32(in, cpu_env, aofs);
            break;
        case MO_32:
            tcg_gen_ld_i32(in, cpu_env, aofs);
            break;
        }
        tcg_gen_gvec_dup_i32(vece, dofs, oprsz, maxsz, in);
        tcg_temp_free_i32(in);
    } else if (vece == MO_64) {
        TCGv_i64 in = tcg_temp_new_i64();
        tcg_gen_ld_i64(in, cpu_env, aofs);
        tcg_gen_gvec_dup_i64(MO_64, dofs, oprsz, maxsz, in);
        tcg_temp_free_i64(in);
    } else {
        /* 128-bit duplicate.  */
        /* ??? Dup to 256-bit vector.  */
        int i;

        tcg_debug_assert(vece == 4);
        tcg_debug_assert(oprsz >= 16);
        if (TCG_TARGET_HAS_v128) {
            TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V128);

            tcg_gen_ld_vec(in, cpu_env, aofs);
            for (i = 0; i < oprsz; i += 16) {
                tcg_gen_st_vec(in, cpu_env, dofs + i);
            }
            tcg_temp_free_vec(in);
        } else {
            TCGv_i64 in0 = tcg_temp_new_i64();
            TCGv_i64 in1 = tcg_temp_new_i64();

            tcg_gen_ld_i64(in0, cpu_env, aofs);
            tcg_gen_ld_i64(in1, cpu_env, aofs + 8);
            for (i = 0; i < oprsz; i += 16) {
                tcg_gen_st_i64(in0, cpu_env, dofs + i);
                tcg_gen_st_i64(in1, cpu_env, dofs + i + 8);
            }
            tcg_temp_free_i64(in0);
            tcg_temp_free_i64(in1);
        }
    }
}

void tcg_gen_gvec_dup64i(uint32_t dofs, uint32_t oprsz,
                         uint32_t maxsz, uint64_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(MO_64, dofs, oprsz, maxsz, NULL, NULL, x);
}

void tcg_gen_gvec_dup32i(uint32_t dofs, uint32_t oprsz,
                         uint32_t maxsz, uint32_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(MO_32, dofs, oprsz, maxsz, NULL, NULL, x);
}

void tcg_gen_gvec_dup16i(uint32_t dofs, uint32_t oprsz,
                         uint32_t maxsz, uint16_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(MO_16, dofs, oprsz, maxsz, NULL, NULL, x);
}

void tcg_gen_gvec_dup8i(uint32_t dofs, uint32_t oprsz,
                        uint32_t maxsz, uint8_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(MO_8, dofs, oprsz, maxsz, NULL, NULL, x);
}

void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2 g = {
        .fni8 = tcg_gen_not_i64,
        .fniv = tcg_gen_not_vec,
        .fno = gen_helper_gvec_not,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
    tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
}

/* Perform a vector addition using normal addition and a mask.  The mask
   should be the sign bit of each lane.  This 6-operation form is more
   efficient than separate additions when there are 4 or more lanes in
   the 64-bit operation.  */
static void gen_addv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
{
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    TCGv_i64 t3 = tcg_temp_new_i64();

    tcg_gen_andc_i64(t1, a, m);
    tcg_gen_andc_i64(t2, b, m);
    tcg_gen_xor_i64(t3, a, b);
    tcg_gen_add_i64(d, t1, t2);
    tcg_gen_and_i64(t3, t3, m);
    tcg_gen_xor_i64(d, d, t3);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t3);
}

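/*
 * Worked 8-bit lane example: with m = 0x80 per lane, clearing the sign
 * bits first means the per-lane sums in "d = t1 + t2" can never carry
 * into a neighbouring lane; the final xor with (a ^ b) & m then restores
 * the correct sign bit of each lane.  E.g. per-lane a = 0x7f, b = 0x01:
 * t1 = 0x7f, t2 = 0x01, d = 0x80, and since bit 7 of a ^ b is 0 the
 * result stays 0x80 == 0x7f + 0x01.
 */
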
void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
    gen_addv_mask(d, a, b, m);
    tcg_temp_free_i64(m);
}

void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
    gen_addv_mask(d, a, b, m);
    tcg_temp_free_i64(m);
}

void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();

    tcg_gen_andi_i64(t1, a, ~0xffffffffull);
    tcg_gen_add_i64(t2, a, b);
    tcg_gen_add_i64(t1, t1, b);
    tcg_gen_deposit_i64(d, t1, t2, 0, 32);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
}

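/*
 * For the two 32-bit lanes, two full 64-bit additions suffice: t2 has
 * the correct low lane (any carry out of bit 31 only pollutes its high
 * half), while t1 adds b to a-with-low-lane-cleared, so its high lane
 * sees no carry in from below.  The deposit stitches the two correct
 * halves together.
 */
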
void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g[4] = {
        { .fni8 = tcg_gen_vec_add8_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_add8,
          .opc = INDEX_op_add_vec,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_add16_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_add16,
          .opc = INDEX_op_add_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_add_i32,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_add32,
          .opc = INDEX_op_add_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_add_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_add64,
          .opc = INDEX_op_add_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}

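/*
 * Typical front-end usage is one call per guest instruction, e.g. (with
 * a hypothetical vreg_ofs() giving the env-relative offset of a guest
 * vector register) a 16-byte add of 32-bit lanes:
 *
 *     tcg_gen_gvec_add(MO_32, vreg_ofs(rd), vreg_ofs(rn), vreg_ofs(rm),
 *                      16, 16);
 *
 * The expansion then picks host vectors, the masked 64-bit integer add,
 * 32-bit adds, or the out-of-line helper, in that order of preference.
 */
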
void tcg_gen_gvec_adds(unsigned vece, uint32_t dofs, uint32_t aofs,
                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2s g[4] = {
        { .fni8 = tcg_gen_vec_add8_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_adds8,
          .opc = INDEX_op_add_vec,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_add16_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_adds16,
          .opc = INDEX_op_add_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_add_i32,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_adds32,
          .opc = INDEX_op_add_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_add_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_adds64,
          .opc = INDEX_op_add_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
}

void tcg_gen_gvec_addi(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t c, uint32_t oprsz, uint32_t maxsz)
{
    TCGv_i64 tmp = tcg_const_i64(c);
    tcg_gen_gvec_adds(vece, dofs, aofs, tmp, oprsz, maxsz);
    tcg_temp_free_i64(tmp);
}

void tcg_gen_gvec_subs(unsigned vece, uint32_t dofs, uint32_t aofs,
                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2s g[4] = {
        { .fni8 = tcg_gen_vec_sub8_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_subs8,
          .opc = INDEX_op_sub_vec,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_sub16_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_subs16,
          .opc = INDEX_op_sub_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_sub_i32,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_subs32,
          .opc = INDEX_op_sub_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_sub_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_subs64,
          .opc = INDEX_op_sub_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
}

/* Perform a vector subtraction using normal subtraction and a mask.
   Compare gen_addv_mask above.  */
static void gen_subv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
{
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    TCGv_i64 t3 = tcg_temp_new_i64();

    tcg_gen_or_i64(t1, a, m);
    tcg_gen_andc_i64(t2, b, m);
    tcg_gen_eqv_i64(t3, a, b);
    tcg_gen_sub_i64(d, t1, t2);
    tcg_gen_and_i64(t3, t3, m);
    tcg_gen_xor_i64(d, d, t3);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t3);
}

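/*
 * As with gen_addv_mask: forcing the minuend's sign bits on and the
 * subtrahend's off guarantees that no borrow can propagate across a
 * lane boundary in "d = t1 - t2", and the xor with eqv(a, b) & m then
 * recomputes each lane's true sign bit.
 */
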
1541void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1542{
1543    TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
1544    gen_subv_mask(d, a, b, m);
1545    tcg_temp_free_i64(m);
1546}
1547
1548void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1549{
1550    TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
1551    gen_subv_mask(d, a, b, m);
1552    tcg_temp_free_i64(m);
1553}
1554
1555void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1556{
1557    TCGv_i64 t1 = tcg_temp_new_i64();
1558    TCGv_i64 t2 = tcg_temp_new_i64();
1559
1560    tcg_gen_andi_i64(t1, b, ~0xffffffffull);
1561    tcg_gen_sub_i64(t2, a, b);
1562    tcg_gen_sub_i64(t1, a, t1);
1563    tcg_gen_deposit_i64(d, t1, t2, 0, 32);
1564
1565    tcg_temp_free_i64(t1);
1566    tcg_temp_free_i64(t2);
1567}
1568
1569void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs,
1570                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1571{
1572    static const GVecGen3 g[4] = {
1573        { .fni8 = tcg_gen_vec_sub8_i64,
1574          .fniv = tcg_gen_sub_vec,
1575          .fno = gen_helper_gvec_sub8,
1576          .opc = INDEX_op_sub_vec,
1577          .vece = MO_8 },
1578        { .fni8 = tcg_gen_vec_sub16_i64,
1579          .fniv = tcg_gen_sub_vec,
1580          .fno = gen_helper_gvec_sub16,
1581          .opc = INDEX_op_sub_vec,
1582          .vece = MO_16 },
1583        { .fni4 = tcg_gen_sub_i32,
1584          .fniv = tcg_gen_sub_vec,
1585          .fno = gen_helper_gvec_sub32,
1586          .opc = INDEX_op_sub_vec,
1587          .vece = MO_32 },
1588        { .fni8 = tcg_gen_sub_i64,
1589          .fniv = tcg_gen_sub_vec,
1590          .fno = gen_helper_gvec_sub64,
1591          .opc = INDEX_op_sub_vec,
1592          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1593          .vece = MO_64 },
1594    };
1595
1596    tcg_debug_assert(vece <= MO_64);
1597    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1598}
1599
1600void tcg_gen_gvec_mul(unsigned vece, uint32_t dofs, uint32_t aofs,
1601                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1602{
1603    static const GVecGen3 g[4] = {
1604        { .fniv = tcg_gen_mul_vec,
1605          .fno = gen_helper_gvec_mul8,
1606          .opc = INDEX_op_mul_vec,
1607          .vece = MO_8 },
1608        { .fniv = tcg_gen_mul_vec,
1609          .fno = gen_helper_gvec_mul16,
1610          .opc = INDEX_op_mul_vec,
1611          .vece = MO_16 },
1612        { .fni4 = tcg_gen_mul_i32,
1613          .fniv = tcg_gen_mul_vec,
1614          .fno = gen_helper_gvec_mul32,
1615          .opc = INDEX_op_mul_vec,
1616          .vece = MO_32 },
1617        { .fni8 = tcg_gen_mul_i64,
1618          .fniv = tcg_gen_mul_vec,
1619          .fno = gen_helper_gvec_mul64,
1620          .opc = INDEX_op_mul_vec,
1621          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1622          .vece = MO_64 },
1623    };
1624
1625    tcg_debug_assert(vece <= MO_64);
1626    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1627}
1628
1629void tcg_gen_gvec_muls(unsigned vece, uint32_t dofs, uint32_t aofs,
1630                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1631{
1632    static const GVecGen2s g[4] = {
1633        { .fniv = tcg_gen_mul_vec,
1634          .fno = gen_helper_gvec_muls8,
1635          .opc = INDEX_op_mul_vec,
1636          .vece = MO_8 },
1637        { .fniv = tcg_gen_mul_vec,
1638          .fno = gen_helper_gvec_muls16,
1639          .opc = INDEX_op_mul_vec,
1640          .vece = MO_16 },
1641        { .fni4 = tcg_gen_mul_i32,
1642          .fniv = tcg_gen_mul_vec,
1643          .fno = gen_helper_gvec_muls32,
1644          .opc = INDEX_op_mul_vec,
1645          .vece = MO_32 },
1646        { .fni8 = tcg_gen_mul_i64,
1647          .fniv = tcg_gen_mul_vec,
1648          .fno = gen_helper_gvec_muls64,
1649          .opc = INDEX_op_mul_vec,
1650          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1651          .vece = MO_64 },
1652    };
1653
1654    tcg_debug_assert(vece <= MO_64);
1655    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
1656}
1657
1658void tcg_gen_gvec_muli(unsigned vece, uint32_t dofs, uint32_t aofs,
1659                       int64_t c, uint32_t oprsz, uint32_t maxsz)
1660{
1661    TCGv_i64 tmp = tcg_const_i64(c);
1662    tcg_gen_gvec_muls(vece, dofs, aofs, tmp, oprsz, maxsz);
1663    tcg_temp_free_i64(tmp);
1664}
1665
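    /* Signed saturating add/sub have no inline expansion; they are
       implemented only by the out-of-line helpers.  */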
1666void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs,
1667                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1668{
1669    static const GVecGen3 g[4] = {
1670        { .fno = gen_helper_gvec_ssadd8, .vece = MO_8 },
1671        { .fno = gen_helper_gvec_ssadd16, .vece = MO_16 },
1672        { .fno = gen_helper_gvec_ssadd32, .vece = MO_32 },
1673        { .fno = gen_helper_gvec_ssadd64, .vece = MO_64 }
1674    };
1675    tcg_debug_assert(vece <= MO_64);
1676    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1677}
1678
1679void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs,
1680                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1681{
1682    static const GVecGen3 g[4] = {
1683        { .fno = gen_helper_gvec_sssub8, .vece = MO_8 },
1684        { .fno = gen_helper_gvec_sssub16, .vece = MO_16 },
1685        { .fno = gen_helper_gvec_sssub32, .vece = MO_32 },
1686        { .fno = gen_helper_gvec_sssub64, .vece = MO_64 }
1687    };
1688    tcg_debug_assert(vece <= MO_64);
1689    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1690}
1691
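    /* Unsigned saturating add: if the truncated sum is less than one
       of the addends, the addition wrapped around, so clamp the result
       to the all-ones maximum.  */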
1692static void tcg_gen_vec_usadd32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1693{
1694    TCGv_i32 max = tcg_const_i32(-1);
1695    tcg_gen_add_i32(d, a, b);
1696    tcg_gen_movcond_i32(TCG_COND_LTU, d, d, a, max, d);
1697    tcg_temp_free_i32(max);
1698}
1699
1700static void tcg_gen_vec_usadd64_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1701{
1702    TCGv_i64 max = tcg_const_i64(-1);
1703    tcg_gen_add_i64(d, a, b);
1704    tcg_gen_movcond_i64(TCG_COND_LTU, d, d, a, max, d);
1705    tcg_temp_free_i64(max);
1706}
1707
1708void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs,
1709                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1710{
1711    static const GVecGen3 g[4] = {
1712        { .fno = gen_helper_gvec_usadd8, .vece = MO_8 },
1713        { .fno = gen_helper_gvec_usadd16, .vece = MO_16 },
1714        { .fni4 = tcg_gen_vec_usadd32_i32,
1715          .fno = gen_helper_gvec_usadd32,
1716          .vece = MO_32 },
1717        { .fni8 = tcg_gen_vec_usadd64_i64,
1718          .fno = gen_helper_gvec_usadd64,
1719          .vece = MO_64 }
1720    };
1721    tcg_debug_assert(vece <= MO_64);
1722    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1723}
1724
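    /* Unsigned saturating subtract: if A is less than B the difference
       would wrap, so clamp the result to zero.  */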
1725static void tcg_gen_vec_ussub32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1726{
1727    TCGv_i32 min = tcg_const_i32(0);
1728    tcg_gen_sub_i32(d, a, b);
1729    tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, min, d);
1730    tcg_temp_free_i32(min);
1731}
1732
1733static void tcg_gen_vec_ussub64_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1734{
1735    TCGv_i64 min = tcg_const_i64(0);
1736    tcg_gen_sub_i64(d, a, b);
1737    tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, min, d);
1738    tcg_temp_free_i64(min);
1739}
1740
1741void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs,
1742                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1743{
1744    static const GVecGen3 g[4] = {
1745        { .fno = gen_helper_gvec_ussub8, .vece = MO_8 },
1746        { .fno = gen_helper_gvec_ussub16, .vece = MO_16 },
1747        { .fni4 = tcg_gen_vec_ussub32_i32,
1748          .fno = gen_helper_gvec_ussub32,
1749          .vece = MO_32 },
1750        { .fni8 = tcg_gen_vec_ussub64_i64,
1751          .fno = gen_helper_gvec_ussub64,
1752          .vece = MO_64 }
1753    };
1754    tcg_debug_assert(vece <= MO_64);
1755    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1756}
1757
1758/* Perform a vector negation using normal negation and a mask.
1759   Compare gen_subv_mask above.  */
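    /* This is gen_subv_mask with A == 0: (A | M) folds to M itself,
       and ~(A ^ B) & M folds to ANDC(M, B), saving two operations.  */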
1760static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m)
1761{
1762    TCGv_i64 t2 = tcg_temp_new_i64();
1763    TCGv_i64 t3 = tcg_temp_new_i64();
1764
1765    tcg_gen_andc_i64(t3, m, b);
1766    tcg_gen_andc_i64(t2, b, m);
1767    tcg_gen_sub_i64(d, m, t2);
1768    tcg_gen_xor_i64(d, d, t3);
1769
1770    tcg_temp_free_i64(t2);
1771    tcg_temp_free_i64(t3);
1772}
1773
1774void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 b)
1775{
1776    TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
1777    gen_negv_mask(d, b, m);
1778    tcg_temp_free_i64(m);
1779}
1780
1781void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 b)
1782{
1783    TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
1784    gen_negv_mask(d, b, m);
1785    tcg_temp_free_i64(m);
1786}
1787
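    /* As in tcg_gen_vec_sub32_i64: negate B for the low half, negate
       B with its low half cleared for the high half, and merge the two
       results with a deposit.  */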
1788void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 b)
1789{
1790    TCGv_i64 t1 = tcg_temp_new_i64();
1791    TCGv_i64 t2 = tcg_temp_new_i64();
1792
1793    tcg_gen_andi_i64(t1, b, ~0xffffffffull);
1794    tcg_gen_neg_i64(t2, b);
1795    tcg_gen_neg_i64(t1, t1);
1796    tcg_gen_deposit_i64(d, t1, t2, 0, 32);
1797
1798    tcg_temp_free_i64(t1);
1799    tcg_temp_free_i64(t2);
1800}
1801
1802void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs,
1803                      uint32_t oprsz, uint32_t maxsz)
1804{
1805    static const GVecGen2 g[4] = {
1806        { .fni8 = tcg_gen_vec_neg8_i64,
1807          .fniv = tcg_gen_neg_vec,
1808          .fno = gen_helper_gvec_neg8,
1809          .opc = INDEX_op_neg_vec,
1810          .vece = MO_8 },
1811        { .fni8 = tcg_gen_vec_neg16_i64,
1812          .fniv = tcg_gen_neg_vec,
1813          .fno = gen_helper_gvec_neg16,
1814          .opc = INDEX_op_neg_vec,
1815          .vece = MO_16 },
1816        { .fni4 = tcg_gen_neg_i32,
1817          .fniv = tcg_gen_neg_vec,
1818          .fno = gen_helper_gvec_neg32,
1819          .opc = INDEX_op_neg_vec,
1820          .vece = MO_32 },
1821        { .fni8 = tcg_gen_neg_i64,
1822          .fniv = tcg_gen_neg_vec,
1823          .fno = gen_helper_gvec_neg64,
1824          .opc = INDEX_op_neg_vec,
1825          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1826          .vece = MO_64 },
1827    };
1828
1829    tcg_debug_assert(vece <= MO_64);
1830    tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
1831}
1832
1833void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs,
1834                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1835{
1836    static const GVecGen3 g = {
1837        .fni8 = tcg_gen_and_i64,
1838        .fniv = tcg_gen_and_vec,
1839        .fno = gen_helper_gvec_and,
1840        .opc = INDEX_op_and_vec,
1841        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1842    };
1843    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
1844}
1845
1846void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs,
1847                     uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1848{
1849    static const GVecGen3 g = {
1850        .fni8 = tcg_gen_or_i64,
1851        .fniv = tcg_gen_or_vec,
1852        .fno = gen_helper_gvec_or,
1853        .opc = INDEX_op_or_vec,
1854        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1855    };
1856    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
1857}
1858
1859void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs,
1860                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1861{
1862    static const GVecGen3 g = {
1863        .fni8 = tcg_gen_xor_i64,
1864        .fniv = tcg_gen_xor_vec,
1865        .fno = gen_helper_gvec_xor,
1866        .opc = INDEX_op_xor_vec,
1867        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1868    };
1869    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
1870}
1871
1872void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs,
1873                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1874{
1875    static const GVecGen3 g = {
1876        .fni8 = tcg_gen_andc_i64,
1877        .fniv = tcg_gen_andc_vec,
1878        .fno = gen_helper_gvec_andc,
1879        .opc = INDEX_op_andc_vec,
1880        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1881    };
1882    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
1883}
1884
1885void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs,
1886                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1887{
1888    static const GVecGen3 g = {
1889        .fni8 = tcg_gen_orc_i64,
1890        .fniv = tcg_gen_orc_vec,
1891        .fno = gen_helper_gvec_orc,
1892        .opc = INDEX_op_orc_vec,
1893        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1894    };
1895    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
1896}
1897
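    /* For the logical operations with a scalar operand, the scalar is
       first replicated to the element size; the operation itself is
       then element-size agnostic, so a single MO_64 expander serves
       all values of vece.  */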
1898static const GVecGen2s gop_ands = {
1899    .fni8 = tcg_gen_and_i64,
1900    .fniv = tcg_gen_and_vec,
1901    .fno = gen_helper_gvec_ands,
1902    .opc = INDEX_op_and_vec,
1903    .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1904    .vece = MO_64
1905};
1906
1907void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs,
1908                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1909{
1910    TCGv_i64 tmp = tcg_temp_new_i64();
1911    gen_dup_i64(vece, tmp, c);
1912    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
1913    tcg_temp_free_i64(tmp);
1914}
1915
1916void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs,
1917                       int64_t c, uint32_t oprsz, uint32_t maxsz)
1918{
1919    TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
1920    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
1921    tcg_temp_free_i64(tmp);
1922}
1923
1924static const GVecGen2s gop_xors = {
1925    .fni8 = tcg_gen_xor_i64,
1926    .fniv = tcg_gen_xor_vec,
1927    .fno = gen_helper_gvec_xors,
1928    .opc = INDEX_op_xor_vec,
1929    .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1930    .vece = MO_64
1931};
1932
1933void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs,
1934                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1935{
1936    TCGv_i64 tmp = tcg_temp_new_i64();
1937    gen_dup_i64(vece, tmp, c);
1938    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
1939    tcg_temp_free_i64(tmp);
1940}
1941
1942void tcg_gen_gvec_xori(unsigned vece, uint32_t dofs, uint32_t aofs,
1943                       int64_t c, uint32_t oprsz, uint32_t maxsz)
1944{
1945    TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
1946    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
1947    tcg_temp_free_i64(tmp);
1948}
1949
1950static const GVecGen2s gop_ors = {
1951    .fni8 = tcg_gen_or_i64,
1952    .fniv = tcg_gen_or_vec,
1953    .fno = gen_helper_gvec_ors,
1954    .opc = INDEX_op_or_vec,
1955    .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1956    .vece = MO_64
1957};
1958
1959void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs,
1960                      TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1961{
1962    TCGv_i64 tmp = tcg_temp_new_i64();
1963    gen_dup_i64(vece, tmp, c);
1964    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
1965    tcg_temp_free_i64(tmp);
1966}
1967
1968void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs,
1969                      int64_t c, uint32_t oprsz, uint32_t maxsz)
1970{
1971    TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
1972    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
1973    tcg_temp_free_i64(tmp);
1974}
1975
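    /* Shifts of sub-word elements are implemented by shifting the full
       64-bit value and then masking off the bits that were shifted
       across an element boundary.  */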
1976void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
1977{
1978    uint64_t mask = dup_const(MO_8, 0xff << c);
1979    tcg_gen_shli_i64(d, a, c);
1980    tcg_gen_andi_i64(d, d, mask);
1981}
1982
1983void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
1984{
1985    uint64_t mask = dup_const(MO_16, 0xffff << c);
1986    tcg_gen_shli_i64(d, a, c);
1987    tcg_gen_andi_i64(d, d, mask);
1988}
1989
1990void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs,
1991                       int64_t shift, uint32_t oprsz, uint32_t maxsz)
1992{
1993    static const GVecGen2i g[4] = {
1994        { .fni8 = tcg_gen_vec_shl8i_i64,
1995          .fniv = tcg_gen_shli_vec,
1996          .fno = gen_helper_gvec_shl8i,
1997          .opc = INDEX_op_shli_vec,
1998          .vece = MO_8 },
1999        { .fni8 = tcg_gen_vec_shl16i_i64,
2000          .fniv = tcg_gen_shli_vec,
2001          .fno = gen_helper_gvec_shl16i,
2002          .opc = INDEX_op_shli_vec,
2003          .vece = MO_16 },
2004        { .fni4 = tcg_gen_shli_i32,
2005          .fniv = tcg_gen_shli_vec,
2006          .fno = gen_helper_gvec_shl32i,
2007          .opc = INDEX_op_shli_vec,
2008          .vece = MO_32 },
2009        { .fni8 = tcg_gen_shli_i64,
2010          .fniv = tcg_gen_shli_vec,
2011          .fno = gen_helper_gvec_shl64i,
2012          .opc = INDEX_op_shli_vec,
2013          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2014          .vece = MO_64 },
2015    };
2016
2017    tcg_debug_assert(vece <= MO_64);
2018    tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2019    if (shift == 0) {
2020        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2021    } else {
2022        tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2023    }
2024}
2025
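    /* As for shifts left: shift the full register right, then discard
       the bits that crossed into each element from its neighbour.  */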
2026void tcg_gen_vec_shr8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2027{
2028    uint64_t mask = dup_const(MO_8, 0xff >> c);
2029    tcg_gen_shri_i64(d, a, c);
2030    tcg_gen_andi_i64(d, d, mask);
2031}
2032
2033void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2034{
2035    uint64_t mask = dup_const(MO_16, 0xffff >> c);
2036    tcg_gen_shri_i64(d, a, c);
2037    tcg_gen_andi_i64(d, d, mask);
2038}
2039
2040void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs,
2041                       int64_t shift, uint32_t oprsz, uint32_t maxsz)
2042{
2043    static const GVecGen2i g[4] = {
2044        { .fni8 = tcg_gen_vec_shr8i_i64,
2045          .fniv = tcg_gen_shri_vec,
2046          .fno = gen_helper_gvec_shr8i,
2047          .opc = INDEX_op_shri_vec,
2048          .vece = MO_8 },
2049        { .fni8 = tcg_gen_vec_shr16i_i64,
2050          .fniv = tcg_gen_shri_vec,
2051          .fno = gen_helper_gvec_shr16i,
2052          .opc = INDEX_op_shri_vec,
2053          .vece = MO_16 },
2054        { .fni4 = tcg_gen_shri_i32,
2055          .fniv = tcg_gen_shri_vec,
2056          .fno = gen_helper_gvec_shr32i,
2057          .opc = INDEX_op_shri_vec,
2058          .vece = MO_32 },
2059        { .fni8 = tcg_gen_shri_i64,
2060          .fniv = tcg_gen_shri_vec,
2061          .fno = gen_helper_gvec_shr64i,
2062          .opc = INDEX_op_shri_vec,
2063          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2064          .vece = MO_64 },
2065    };
2066
2067    tcg_debug_assert(vece <= MO_64);
2068    tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2069    if (shift == 0) {
2070        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2071    } else {
2072        tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2073    }
2074}
2075
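    /* An arithmetic shift is built from a logical shift: isolate each
       element's (shifted) sign bit and multiply by (2 << c) - 2, which
       deposits a copy of that bit into each of the C positions vacated
       by the shift.  */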
2076void tcg_gen_vec_sar8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2077{
2078    uint64_t s_mask = dup_const(MO_8, 0x80 >> c);
2079    uint64_t c_mask = dup_const(MO_8, 0xff >> c);
2080    TCGv_i64 s = tcg_temp_new_i64();
2081
2082    tcg_gen_shri_i64(d, a, c);
2083    tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
2084    tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
2085    tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign  */
2086    tcg_gen_or_i64(d, d, s);         /* include sign extension */
2087    tcg_temp_free_i64(s);
2088}
2089
2090void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2091{
2092    uint64_t s_mask = dup_const(MO_16, 0x8000 >> c);
2093    uint64_t c_mask = dup_const(MO_16, 0xffff >> c);
2094    TCGv_i64 s = tcg_temp_new_i64();
2095
2096    tcg_gen_shri_i64(d, a, c);
2097    tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
2098    tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign  */
2099    tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
2100    tcg_gen_or_i64(d, d, s);         /* include sign extension */
2101    tcg_temp_free_i64(s);
2102}
2103
2104void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs,
2105                       int64_t shift, uint32_t oprsz, uint32_t maxsz)
2106{
2107    static const GVecGen2i g[4] = {
2108        { .fni8 = tcg_gen_vec_sar8i_i64,
2109          .fniv = tcg_gen_sari_vec,
2110          .fno = gen_helper_gvec_sar8i,
2111          .opc = INDEX_op_sari_vec,
2112          .vece = MO_8 },
2113        { .fni8 = tcg_gen_vec_sar16i_i64,
2114          .fniv = tcg_gen_sari_vec,
2115          .fno = gen_helper_gvec_sar16i,
2116          .opc = INDEX_op_sari_vec,
2117          .vece = MO_16 },
2118        { .fni4 = tcg_gen_sari_i32,
2119          .fniv = tcg_gen_sari_vec,
2120          .fno = gen_helper_gvec_sar32i,
2121          .opc = INDEX_op_sari_vec,
2122          .vece = MO_32 },
2123        { .fni8 = tcg_gen_sari_i64,
2124          .fniv = tcg_gen_sari_vec,
2125          .fno = gen_helper_gvec_sar64i,
2126          .opc = INDEX_op_sari_vec,
2127          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2128          .vece = MO_64 },
2129    };
2130
2131    tcg_debug_assert(vece <= MO_64);
2132    tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2133    if (shift == 0) {
2134        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2135    } else {
2136        tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2137    }
2138}
2139
2140/* Expand OPRSZ bytes worth of comparison operations using i32 elements.  */
2141static void expand_cmp_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
2142                           uint32_t oprsz, TCGCond cond)
2143{
2144    TCGv_i32 t0 = tcg_temp_new_i32();
2145    TCGv_i32 t1 = tcg_temp_new_i32();
2146    uint32_t i;
2147
2148    for (i = 0; i < oprsz; i += 4) {
2149        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
2150        tcg_gen_ld_i32(t1, cpu_env, bofs + i);
2151        tcg_gen_setcond_i32(cond, t0, t0, t1);
2152        tcg_gen_neg_i32(t0, t0);       /* 0/1 -> 0/-1 */
2153        tcg_gen_st_i32(t0, cpu_env, dofs + i);
2154    }
2155    tcg_temp_free_i32(t1);
2156    tcg_temp_free_i32(t0);
2157}
2158
2159static void expand_cmp_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
2160                           uint32_t oprsz, TCGCond cond)
2161{
2162    TCGv_i64 t0 = tcg_temp_new_i64();
2163    TCGv_i64 t1 = tcg_temp_new_i64();
2164    uint32_t i;
2165
2166    for (i = 0; i < oprsz; i += 8) {
2167        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
2168        tcg_gen_ld_i64(t1, cpu_env, bofs + i);
2169        tcg_gen_setcond_i64(cond, t0, t0, t1);
2170        tcg_gen_neg_i64(t0, t0);       /* 0/1 -> 0/-1 */
2171        tcg_gen_st_i64(t0, cpu_env, dofs + i);
2172    }
2173    tcg_temp_free_i64(t1);
2174    tcg_temp_free_i64(t0);
2175}
2176
2177static void expand_cmp_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
2178                           uint32_t bofs, uint32_t oprsz, uint32_t tysz,
2179                           TCGType type, TCGCond cond)
2180{
2181    TCGv_vec t0 = tcg_temp_new_vec(type);
2182    TCGv_vec t1 = tcg_temp_new_vec(type);
2183    uint32_t i;
2184
2185    for (i = 0; i < oprsz; i += tysz) {
2186        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
2187        tcg_gen_ld_vec(t1, cpu_env, bofs + i);
2188        tcg_gen_cmp_vec(cond, vece, t0, t0, t1);
2189        tcg_gen_st_vec(t0, cpu_env, dofs + i);
2190    }
2191    tcg_temp_free_vec(t1);
2192    tcg_temp_free_vec(t0);
2193}
2194
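    /* Expand a comparison between two vectors, producing the canonical
       vector boolean result: within each element, all ones for true
       and all zeros for false.  */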
2195void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs,
2196                      uint32_t aofs, uint32_t bofs,
2197                      uint32_t oprsz, uint32_t maxsz)
2198{
2199    static gen_helper_gvec_3 * const eq_fn[4] = {
2200        gen_helper_gvec_eq8, gen_helper_gvec_eq16,
2201        gen_helper_gvec_eq32, gen_helper_gvec_eq64
2202    };
2203    static gen_helper_gvec_3 * const ne_fn[4] = {
2204        gen_helper_gvec_ne8, gen_helper_gvec_ne16,
2205        gen_helper_gvec_ne32, gen_helper_gvec_ne64
2206    };
2207    static gen_helper_gvec_3 * const lt_fn[4] = {
2208        gen_helper_gvec_lt8, gen_helper_gvec_lt16,
2209        gen_helper_gvec_lt32, gen_helper_gvec_lt64
2210    };
2211    static gen_helper_gvec_3 * const le_fn[4] = {
2212        gen_helper_gvec_le8, gen_helper_gvec_le16,
2213        gen_helper_gvec_le32, gen_helper_gvec_le64
2214    };
2215    static gen_helper_gvec_3 * const ltu_fn[4] = {
2216        gen_helper_gvec_ltu8, gen_helper_gvec_ltu16,
2217        gen_helper_gvec_ltu32, gen_helper_gvec_ltu64
2218    };
2219    static gen_helper_gvec_3 * const leu_fn[4] = {
2220        gen_helper_gvec_leu8, gen_helper_gvec_leu16,
2221        gen_helper_gvec_leu32, gen_helper_gvec_leu64
2222    };
2223    static gen_helper_gvec_3 * const * const fns[16] = {
2224        [TCG_COND_EQ] = eq_fn,
2225        [TCG_COND_NE] = ne_fn,
2226        [TCG_COND_LT] = lt_fn,
2227        [TCG_COND_LE] = le_fn,
2228        [TCG_COND_LTU] = ltu_fn,
2229        [TCG_COND_LEU] = leu_fn,
2230    };
2231    TCGType type;
2232    uint32_t some;
2233
2234    check_size_align(oprsz, maxsz, dofs | aofs | bofs);
2235    check_overlap_3(dofs, aofs, bofs, maxsz);
2236
2237    if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) {
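        /* Degenerate conditions produce a constant: all ones for
           ALWAYS, all zeros for NEVER.  */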
2238        do_dup(MO_8, dofs, oprsz, maxsz,
2239               NULL, NULL, -(cond == TCG_COND_ALWAYS));
2240        return;
2241    }
2242
2243    /* Implement inline with a vector type, if possible.
2244     * Prefer integer when 64-bit host and 64-bit comparison.
2245     */
2246    type = choose_vector_type(INDEX_op_cmp_vec, vece, oprsz,
2247                              TCG_TARGET_REG_BITS == 64 && vece == MO_64);
2248    switch (type) {
2249    case TCG_TYPE_V256:
2250        /* Recall that ARM SVE allows vector sizes that are not a
2251         * power of 2, but always a multiple of 16.  The intent is
2252         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
2253         */
2254        some = QEMU_ALIGN_DOWN(oprsz, 32);
2255        expand_cmp_vec(vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, cond);
2256        if (some == oprsz) {
2257            break;
2258        }
2259        dofs += some;
2260        aofs += some;
2261        bofs += some;
2262        oprsz -= some;
2263        maxsz -= some;
2264        /* fallthru */
2265    case TCG_TYPE_V128:
2266        expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, cond);
2267        break;
2268    case TCG_TYPE_V64:
2269        expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, cond);
2270        break;
2271
2272    case 0:
2273        if (vece == MO_64 && check_size_impl(oprsz, 8)) {
2274            expand_cmp_i64(dofs, aofs, bofs, oprsz, cond);
2275        } else if (vece == MO_32 && check_size_impl(oprsz, 4)) {
2276            expand_cmp_i32(dofs, aofs, bofs, oprsz, cond);
2277        } else {
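                /* The helper table covers only EQ, NE, LT, LE, LTU and
                   LEU; the remaining conditions are obtained by
                   swapping the operands and the condition.  */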
2278            gen_helper_gvec_3 * const *fn = fns[cond];
2279
2280            if (fn == NULL) {
2281                uint32_t tmp;
2282                tmp = aofs, aofs = bofs, bofs = tmp;
2283                cond = tcg_swap_cond(cond);
2284                fn = fns[cond];
2285                assert(fn != NULL);
2286            }
2287            tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, 0, fn[vece]);
2288            return;
2289        }
2290        break;
2291
2292    default:
2293        g_assert_not_reached();
2294    }
2295
2296    if (oprsz < maxsz) {
2297        expand_clr(dofs + oprsz, maxsz - oprsz);
2298    }
2299}
2300
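    /* Usage sketch, with hypothetical offsets: a front end that keeps
       its vector registers in CPUState could emit a 16-byte bytewise
       signed less-than as

           tcg_gen_gvec_cmp(TCG_COND_LT, MO_8, dofs, aofs, bofs, 16, 16);

       where dofs/aofs/bofs are the env-relative byte offsets of the
       destination and source registers; each byte of the destination
       is set to all ones where the comparison held.  */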