qemu/tcg/tcg-op-gvec.c
/*
 * Generic vector operation expansion
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "qemu-common.h"
#include "tcg.h"
#include "tcg-op.h"
#include "tcg-op-gvec.h"
#include "tcg-gvec-desc.h"

#define MAX_UNROLL  4

/* Verify vector size and alignment rules.  OFS should be the OR of all
   of the operand offsets so that we can check them all at once.  */
static void check_size_align(uint32_t oprsz, uint32_t maxsz, uint32_t ofs)
{
    uint32_t opr_align = oprsz >= 16 ? 15 : 7;
    uint32_t max_align = maxsz >= 16 || oprsz >= 16 ? 15 : 7;
    tcg_debug_assert(oprsz > 0);
    tcg_debug_assert(oprsz <= maxsz);
    tcg_debug_assert((oprsz & opr_align) == 0);
    tcg_debug_assert((maxsz & max_align) == 0);
    tcg_debug_assert((ofs & max_align) == 0);
}
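
/* For example (illustrative values, not from any particular target):
 * oprsz = 16 with maxsz = 32 passes, since both are multiples of 16 and
 * 16 <= 32; oprsz = 8 with maxsz = 8 also passes, using the 8-byte
 * alignment that applies to vectors smaller than 16 bytes.  By contrast
 * oprsz = 24 fails: once oprsz >= 16, it must be a multiple of 16.
 */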

/* Verify vector overlap rules for two operands.  */
static void check_overlap_2(uint32_t d, uint32_t a, uint32_t s)
{
    tcg_debug_assert(d == a || d + s <= a || a + s <= d);
}

/* Verify vector overlap rules for three operands.  */
static void check_overlap_3(uint32_t d, uint32_t a, uint32_t b, uint32_t s)
{
    check_overlap_2(d, a, s);
    check_overlap_2(d, b, s);
    check_overlap_2(a, b, s);
}

/* Verify vector overlap rules for four operands.  */
static void check_overlap_4(uint32_t d, uint32_t a, uint32_t b,
                            uint32_t c, uint32_t s)
{
    check_overlap_2(d, a, s);
    check_overlap_2(d, b, s);
    check_overlap_2(d, c, s);
    check_overlap_2(a, b, s);
    check_overlap_2(a, c, s);
    check_overlap_2(b, c, s);
}

/* Create a descriptor from components.  */
uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data)
{
    uint32_t desc = 0;

    assert(oprsz % 8 == 0 && oprsz <= (8 << SIMD_OPRSZ_BITS));
    assert(maxsz % 8 == 0 && maxsz <= (8 << SIMD_MAXSZ_BITS));
    assert(data == sextract32(data, 0, SIMD_DATA_BITS));

    oprsz = (oprsz / 8) - 1;
    maxsz = (maxsz / 8) - 1;
    desc = deposit32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS, oprsz);
    desc = deposit32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS, maxsz);
    desc = deposit32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS, data);

    return desc;
}
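
/* As a worked example (a sketch; the field positions and widths come
 * from tcg-gvec-desc.h): simd_desc(16, 32, 5) stores (16 / 8) - 1 = 1
 * in the OPRSZ field, (32 / 8) - 1 = 3 in the MAXSZ field, and 5 in
 * the DATA field.  A helper later recovers the sizes with simd_oprsz()
 * and simd_maxsz(), which invert the encoding as (field + 1) * 8.
 */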

/* Generate a call to a gvec-style helper with two vector operands.  */
void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
                        uint32_t oprsz, uint32_t maxsz, int32_t data,
                        gen_helper_gvec_2 *fn)
{
    TCGv_ptr a0, a1;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);

    fn(a0, a1, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_i32(desc);
}
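
/* A minimal usage sketch (hypothetical target code, not part of this
 * file): a translator that has declared a two-operand out-of-line
 * helper, e.g.
 *
 *     DEF_HELPER_FLAGS_3(my_op, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
 *
 * could expand a guest instruction over a 16-byte register with
 *
 *     tcg_gen_gvec_2_ool(dest_off, src_off, 16, 16, 0,
 *                        gen_helper_my_op);
 *
 * where dest_off/src_off are offsets of the vector registers within
 * CPUArchState and "my_op" is an invented name for illustration.
 */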

/* Generate a call to a gvec-style helper with two vector operands
   and one scalar operand.  */
void tcg_gen_gvec_2i_ool(uint32_t dofs, uint32_t aofs, TCGv_i64 c,
                         uint32_t oprsz, uint32_t maxsz, int32_t data,
                         gen_helper_gvec_2i *fn)
{
    TCGv_ptr a0, a1;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);

    fn(a0, a1, c, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with three vector operands.  */
void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t oprsz, uint32_t maxsz, int32_t data,
                        gen_helper_gvec_3 *fn)
{
    TCGv_ptr a0, a1, a2;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);

    fn(a0, a1, a2, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with four vector operands.  */
void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_4 *fn)
{
    TCGv_ptr a0, a1, a2, a3;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();
    a3 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);
    tcg_gen_addi_ptr(a3, cpu_env, cofs);

    fn(a0, a1, a2, a3, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with five vector operands.  */
void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, uint32_t xofs, uint32_t oprsz,
                        uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn)
{
    TCGv_ptr a0, a1, a2, a3, a4;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();
    a3 = tcg_temp_new_ptr();
    a4 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);
    tcg_gen_addi_ptr(a3, cpu_env, cofs);
    tcg_gen_addi_ptr(a4, cpu_env, xofs);

    fn(a0, a1, a2, a3, a4, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_ptr(a4);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with two vector operands
   and an extra pointer operand.  */
void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_2_ptr *fn)
{
    TCGv_ptr a0, a1;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);

    fn(a0, a1, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with three vector operands
   and an extra pointer operand.  */
void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_3_ptr *fn)
{
    TCGv_ptr a0, a1, a2;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);

    fn(a0, a1, a2, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with four vector operands
   and an extra pointer operand.  */
void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz,
                        uint32_t maxsz, int32_t data,
                        gen_helper_gvec_4_ptr *fn)
{
    TCGv_ptr a0, a1, a2, a3;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();
    a3 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);
    tcg_gen_addi_ptr(a3, cpu_env, cofs);

    fn(a0, a1, a2, a3, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_i32(desc);
}

/* Return true if we want to implement something of OPRSZ bytes
   in units of LNSZ.  This limits the expansion of inline code.  */
static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
{
    uint32_t lnct = oprsz / lnsz;
    return lnct >= 1 && lnct <= MAX_UNROLL;
}
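
/* For instance, with MAX_UNROLL at 4: check_size_impl(32, 8) is true
 * (32 / 8 = 4 inline operations), while check_size_impl(64, 8) is
 * false (8 operations), pushing a 64-byte expansion toward wider
 * vector types or an out-of-line helper.  Illustrative numbers only.
 */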

static void expand_clr(uint32_t dofs, uint32_t maxsz);

/* Duplicate C as per VECE.  */
uint64_t (dup_const)(unsigned vece, uint64_t c)
{
    switch (vece) {
    case MO_8:
        return 0x0101010101010101ull * (uint8_t)c;
    case MO_16:
        return 0x0001000100010001ull * (uint16_t)c;
    case MO_32:
        return 0x0000000100000001ull * (uint32_t)c;
    case MO_64:
        return c;
    default:
        g_assert_not_reached();
    }
}
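
/* Worked example: dup_const(MO_16, 0x1234) multiplies the truncated
 * value by 0x0001000100010001 and yields 0x1234123412341234, i.e. the
 * 16-bit constant replicated into all four lanes of a 64-bit word.
 * (The parenthesized name above lets a same-named macro shadow this
 * out-of-line definition for constant arguments.)
 */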

/* Duplicate IN into OUT as per VECE.  */
static void gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in)
{
    switch (vece) {
    case MO_8:
        tcg_gen_ext8u_i32(out, in);
        tcg_gen_muli_i32(out, out, 0x01010101);
        break;
    case MO_16:
        tcg_gen_deposit_i32(out, in, in, 16, 16);
        break;
    case MO_32:
        tcg_gen_mov_i32(out, in);
        break;
    default:
        g_assert_not_reached();
    }
}

static void gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in)
{
    switch (vece) {
    case MO_8:
        tcg_gen_ext8u_i64(out, in);
        tcg_gen_muli_i64(out, out, 0x0101010101010101ull);
        break;
    case MO_16:
        tcg_gen_ext16u_i64(out, in);
        tcg_gen_muli_i64(out, out, 0x0001000100010001ull);
        break;
    case MO_32:
        tcg_gen_deposit_i64(out, in, in, 32, 32);
        break;
    case MO_64:
        tcg_gen_mov_i64(out, in);
        break;
    default:
        g_assert_not_reached();
    }
}

/* Select a supported vector type for implementing an operation on SIZE
 * bytes.  If OP is 0, assume that the real operation to be performed is
 * required by all backends.  Otherwise, make sure that OP can be performed
 * on elements of size VECE in the selected type.  Do not select V64 if
 * PREFER_I64 is true.  Return 0 if no vector type is selected.
 */
static TCGType choose_vector_type(TCGOpcode op, unsigned vece, uint32_t size,
                                  bool prefer_i64)
{
    if (TCG_TARGET_HAS_v256 && check_size_impl(size, 32)) {
        if (op == 0) {
            return TCG_TYPE_V256;
        }
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         * It is hard to imagine a case in which v256 is supported
         * but v128 is not, but check anyway.
         */
        if (tcg_can_emit_vec_op(op, TCG_TYPE_V256, vece)
            && (size % 32 == 0
                || tcg_can_emit_vec_op(op, TCG_TYPE_V128, vece))) {
            return TCG_TYPE_V256;
        }
    }
    if (TCG_TARGET_HAS_v128 && check_size_impl(size, 16)
        && (op == 0 || tcg_can_emit_vec_op(op, TCG_TYPE_V128, vece))) {
        return TCG_TYPE_V128;
    }
    if (TCG_TARGET_HAS_v64 && !prefer_i64 && check_size_impl(size, 8)
        && (op == 0 || tcg_can_emit_vec_op(op, TCG_TYPE_V64, vece))) {
        return TCG_TYPE_V64;
    }
    return 0;
}
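
/* Concretely (a sketch, assuming a host with both 256-bit and 128-bit
 * vectors): for size == 80, the V256 branch is taken only if the
 * backend can also emit OP on V128, since 80 % 32 == 16 leaves a
 * 16-byte tail; the expansion then issues two 32-byte operations plus
 * one 16-byte operation, matching the SVE note above.
 */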

/* Set OPRSZ bytes at DOFS to replications of IN_32, IN_64 or IN_C.
 * Only one of IN_32 or IN_64 may be set;
 * IN_C is used if IN_32 and IN_64 are unset.
 */
static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz,
                   uint32_t maxsz, TCGv_i32 in_32, TCGv_i64 in_64,
                   uint64_t in_c)
{
    TCGType type;
    TCGv_i64 t_64;
    TCGv_i32 t_32, t_desc;
    TCGv_ptr t_ptr;
    uint32_t i;

    assert(vece <= (in_32 ? MO_32 : MO_64));
    assert(in_32 == NULL || in_64 == NULL);

    /* If we're storing 0, expand oprsz to maxsz.  */
    if (in_32 == NULL && in_64 == NULL) {
        in_c = dup_const(vece, in_c);
        if (in_c == 0) {
            oprsz = maxsz;
        }
    }

    /* Implement inline with a vector type, if possible.
     * Prefer integer when 64-bit host and no variable dup.
     */
    type = choose_vector_type(0, vece, oprsz,
                              (TCG_TARGET_REG_BITS == 64 && in_32 == NULL
                               && (in_64 == NULL || vece == MO_64)));
    if (type != 0) {
        TCGv_vec t_vec = tcg_temp_new_vec(type);

        if (in_32) {
            tcg_gen_dup_i32_vec(vece, t_vec, in_32);
        } else if (in_64) {
            tcg_gen_dup_i64_vec(vece, t_vec, in_64);
        } else {
            switch (vece) {
            case MO_8:
                tcg_gen_dup8i_vec(t_vec, in_c);
                break;
            case MO_16:
                tcg_gen_dup16i_vec(t_vec, in_c);
                break;
            case MO_32:
                tcg_gen_dup32i_vec(t_vec, in_c);
                break;
            default:
                tcg_gen_dup64i_vec(t_vec, in_c);
                break;
            }
        }

        i = 0;
        switch (type) {
        case TCG_TYPE_V256:
            /* Recall that ARM SVE allows vector sizes that are not a
             * power of 2, but always a multiple of 16.  The intent is
             * that e.g. size == 80 would be expanded with 2x32 + 1x16.
             */
            for (; i + 32 <= oprsz; i += 32) {
                tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V256);
            }
            /* fallthru */
        case TCG_TYPE_V128:
            for (; i + 16 <= oprsz; i += 16) {
                tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V128);
            }
            break;
        case TCG_TYPE_V64:
            for (; i < oprsz; i += 8) {
                tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
            }
            break;
        default:
            g_assert_not_reached();
        }

        tcg_temp_free_vec(t_vec);
        goto done;
    }

    /* Otherwise, inline with an integer type, unless "large".  */
    if (check_size_impl(oprsz, TCG_TARGET_REG_BITS / 8)) {
        t_64 = NULL;
        t_32 = NULL;

        if (in_32) {
            /* We are given a 32-bit variable input.  For a 64-bit host,
               use a 64-bit operation unless the 32-bit operation would
               be simple enough.  */
            if (TCG_TARGET_REG_BITS == 64
                && (vece != MO_32 || !check_size_impl(oprsz, 4))) {
                t_64 = tcg_temp_new_i64();
                tcg_gen_extu_i32_i64(t_64, in_32);
                gen_dup_i64(vece, t_64, t_64);
            } else {
                t_32 = tcg_temp_new_i32();
                gen_dup_i32(vece, t_32, in_32);
            }
        } else if (in_64) {
            /* We are given a 64-bit variable input.  */
            t_64 = tcg_temp_new_i64();
            gen_dup_i64(vece, t_64, in_64);
        } else {
            /* We are given a constant input.  */
            /* For 64-bit hosts, use 64-bit constants for "simple" constants
               or when we'd need too many 32-bit stores, or when a 64-bit
               constant is really required.  */
            if (vece == MO_64
                || (TCG_TARGET_REG_BITS == 64
                    && (in_c == 0 || in_c == -1
                        || !check_size_impl(oprsz, 4)))) {
                t_64 = tcg_const_i64(in_c);
            } else {
                t_32 = tcg_const_i32(in_c);
            }
        }

        /* Implement inline if we picked an implementation size above.  */
        if (t_32) {
            for (i = 0; i < oprsz; i += 4) {
                tcg_gen_st_i32(t_32, cpu_env, dofs + i);
            }
            tcg_temp_free_i32(t_32);
            goto done;
        }
        if (t_64) {
            for (i = 0; i < oprsz; i += 8) {
                tcg_gen_st_i64(t_64, cpu_env, dofs + i);
            }
            tcg_temp_free_i64(t_64);
            goto done;
        }
    }

    /* Otherwise implement out of line.  */
    t_ptr = tcg_temp_new_ptr();
    tcg_gen_addi_ptr(t_ptr, cpu_env, dofs);
    t_desc = tcg_const_i32(simd_desc(oprsz, maxsz, 0));

    if (vece == MO_64) {
        if (in_64) {
            gen_helper_gvec_dup64(t_ptr, t_desc, in_64);
        } else {
            t_64 = tcg_const_i64(in_c);
            gen_helper_gvec_dup64(t_ptr, t_desc, t_64);
            tcg_temp_free_i64(t_64);
        }
    } else {
        typedef void dup_fn(TCGv_ptr, TCGv_i32, TCGv_i32);
        static dup_fn * const fns[3] = {
            gen_helper_gvec_dup8,
            gen_helper_gvec_dup16,
            gen_helper_gvec_dup32
        };

        if (in_32) {
            fns[vece](t_ptr, t_desc, in_32);
        } else {
            t_32 = tcg_temp_new_i32();
            if (in_64) {
                tcg_gen_extrl_i64_i32(t_32, in_64);
            } else if (vece == MO_8) {
                tcg_gen_movi_i32(t_32, in_c & 0xff);
            } else if (vece == MO_16) {
                tcg_gen_movi_i32(t_32, in_c & 0xffff);
            } else {
                tcg_gen_movi_i32(t_32, in_c);
            }
            fns[vece](t_ptr, t_desc, t_32);
            tcg_temp_free_i32(t_32);
        }
    }

    tcg_temp_free_ptr(t_ptr);
    tcg_temp_free_i32(t_desc);
    return;

 done:
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Likewise, but with zero.  */
static void expand_clr(uint32_t dofs, uint32_t maxsz)
{
    do_dup(MO_8, dofs, maxsz, maxsz, NULL, NULL, 0);
}

/* Expand OPRSZ bytes worth of two-operand operations using i32 elements.  */
static void expand_2_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                         void (*fni)(TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        fni(t0, t0);
        tcg_gen_st_i32(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
}

static void expand_2i_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          int32_t c, bool load_dest,
                          void (*fni)(TCGv_i32, TCGv_i32, int32_t))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_i32(t1, cpu_env, dofs + i);
        }
        fni(t1, t0, c);
        tcg_gen_st_i32(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
    tcg_temp_free_i32(t1);
}

static void expand_2s_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          TCGv_i32 c, bool scalar_first,
                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        if (scalar_first) {
            fni(t1, c, t0);
        } else {
            fni(t1, t0, c);
        }
        tcg_gen_st_i32(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
    tcg_temp_free_i32(t1);
}

/* Expand OPRSZ bytes worth of three-operand operations using i32 elements.  */
static void expand_3_i32(uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t oprsz, bool load_dest,
                         void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        tcg_gen_ld_i32(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i32(t2, cpu_env, dofs + i);
        }
        fni(t2, t0, t1);
        tcg_gen_st_i32(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t2);
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t0);
}

/* Expand OPRSZ bytes worth of four-operand operations using i32 elements.  */
static void expand_4_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                         uint32_t cofs, uint32_t oprsz,
                         void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();
    TCGv_i32 t3 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t1, cpu_env, aofs + i);
        tcg_gen_ld_i32(t2, cpu_env, bofs + i);
        tcg_gen_ld_i32(t3, cpu_env, cofs + i);
        fni(t0, t1, t2, t3);
        tcg_gen_st_i32(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t3);
    tcg_temp_free_i32(t2);
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t0);
}

/* Expand OPRSZ bytes worth of two-operand operations using i64 elements.  */
static void expand_2_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                         void (*fni)(TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        fni(t0, t0);
        tcg_gen_st_i64(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
}

static void expand_2i_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          int64_t c, bool load_dest,
                          void (*fni)(TCGv_i64, TCGv_i64, int64_t))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t1, cpu_env, dofs + i);
        }
        fni(t1, t0, c);
        tcg_gen_st_i64(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
}

static void expand_2s_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          TCGv_i64 c, bool scalar_first,
                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        if (scalar_first) {
            fni(t1, c, t0);
        } else {
            fni(t1, t0, c);
        }
        tcg_gen_st_i64(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
}

/* Expand OPRSZ bytes worth of three-operand operations using i64 elements.  */
static void expand_3_i64(uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t oprsz, bool load_dest,
                         void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        tcg_gen_ld_i64(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t2, cpu_env, dofs + i);
        }
        fni(t2, t0, t1);
        tcg_gen_st_i64(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t0);
}

/* Expand OPRSZ bytes worth of four-operand operations using i64 elements.  */
static void expand_4_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                         uint32_t cofs, uint32_t oprsz,
                         void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    TCGv_i64 t3 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t1, cpu_env, aofs + i);
        tcg_gen_ld_i64(t2, cpu_env, bofs + i);
        tcg_gen_ld_i64(t3, cpu_env, cofs + i);
        fni(t0, t1, t2, t3);
        tcg_gen_st_i64(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t3);
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t0);
}

/* Expand OPRSZ bytes worth of two-operand operations using host vectors.  */
static void expand_2_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                         uint32_t oprsz, uint32_t tysz, TCGType type,
                         void (*fni)(unsigned, TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        fni(vece, t0, t0);
        tcg_gen_st_vec(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t0);
}

/* Expand OPRSZ bytes worth of two-vector operands and an immediate operand
   using host vectors.  */
static void expand_2i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t oprsz, uint32_t tysz, TCGType type,
                          int64_t c, bool load_dest,
                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, int64_t))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_vec(t1, cpu_env, dofs + i);
        }
        fni(vece, t1, t0, c);
        tcg_gen_st_vec(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t0);
    tcg_temp_free_vec(t1);
}

static void expand_2s_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t oprsz, uint32_t tysz, TCGType type,
                          TCGv_vec c, bool scalar_first,
                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        if (scalar_first) {
            fni(vece, t1, c, t0);
        } else {
            fni(vece, t1, t0, c);
        }
        tcg_gen_st_vec(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t0);
    tcg_temp_free_vec(t1);
}

/* Expand OPRSZ bytes worth of three-operand operations using host vectors.  */
static void expand_3_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t oprsz,
                         uint32_t tysz, TCGType type, bool load_dest,
                         void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    TCGv_vec t2 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        tcg_gen_ld_vec(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_vec(t2, cpu_env, dofs + i);
        }
        fni(vece, t2, t0, t1);
        tcg_gen_st_vec(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t2);
    tcg_temp_free_vec(t1);
    tcg_temp_free_vec(t0);
}

/* Expand OPRSZ bytes worth of four-operand operations using host vectors.  */
static void expand_4_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t cofs, uint32_t oprsz,
                         uint32_t tysz, TCGType type,
                         void (*fni)(unsigned, TCGv_vec, TCGv_vec,
                                     TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    TCGv_vec t2 = tcg_temp_new_vec(type);
    TCGv_vec t3 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t1, cpu_env, aofs + i);
        tcg_gen_ld_vec(t2, cpu_env, bofs + i);
        tcg_gen_ld_vec(t3, cpu_env, cofs + i);
        fni(vece, t0, t1, t2, t3);
        tcg_gen_st_vec(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t3);
    tcg_temp_free_vec(t2);
    tcg_temp_free_vec(t1);
    tcg_temp_free_vec(t0);
}

/* Expand a vector two-operand operation.  */
void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen2 *g)
{
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs);
    check_overlap_2(dofs, aofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_2_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256, g->fniv);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_2_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_2_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_2_i64(dofs, aofs, oprsz, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_2_i32(dofs, aofs, oprsz, g->fni4);
        } else {
            assert(g->fno != NULL);
            tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, g->data, g->fno);
            return;
        }
        break;

    default:
        g_assert_not_reached();
    }

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}
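
/* An illustrative expansion descriptor (a sketch, not a definition from
 * this file): a target implementing a per-lane negate could fill in a
 * GVecGen2 with integer, vector, and out-of-line variants and let
 * tcg_gen_gvec_2 pick the best one for the host:
 *
 *     static const GVecGen2 g = {
 *         .fni8 = tcg_gen_neg_i64,
 *         .fniv = tcg_gen_neg_vec,
 *         .fno = gen_helper_my_neg64,    (hypothetical helper name)
 *         .opc = INDEX_op_neg_vec,
 *         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
 *         .vece = MO_64,
 *     };
 *     tcg_gen_gvec_2(dofs, aofs, 16, 16, &g);
 */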

/* Expand a vector operation with two vectors and an immediate.  */
void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                     uint32_t maxsz, int64_t c, const GVecGen2i *g)
{
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs);
    check_overlap_2(dofs, aofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_2i_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
                      c, g->load_dest, g->fniv);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_2i_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
                      c, g->load_dest, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_2i_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
                      c, g->load_dest, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_2i_i64(dofs, aofs, oprsz, c, g->load_dest, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_2i_i32(dofs, aofs, oprsz, c, g->load_dest, g->fni4);
        } else {
            if (g->fno) {
                tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, c, g->fno);
            } else {
                TCGv_i64 tcg_c = tcg_const_i64(c);
                tcg_gen_gvec_2i_ool(dofs, aofs, tcg_c, oprsz,
                                    maxsz, c, g->fnoi);
                tcg_temp_free_i64(tcg_c);
            }
            return;
        }
        break;

    default:
        g_assert_not_reached();
    }

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Expand a vector operation with two vectors and a scalar.  */
void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                     uint32_t maxsz, TCGv_i64 c, const GVecGen2s *g)
{
    TCGType type;

    check_size_align(oprsz, maxsz, dofs | aofs);
    check_overlap_2(dofs, aofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opc, g->vece, oprsz, g->prefer_i64);
    }
    if (type != 0) {
        TCGv_vec t_vec = tcg_temp_new_vec(type);
        uint32_t some;

        tcg_gen_dup_i64_vec(g->vece, t_vec, c);

        switch (type) {
        case TCG_TYPE_V256:
            /* Recall that ARM SVE allows vector sizes that are not a
             * power of 2, but always a multiple of 16.  The intent is
             * that e.g. size == 80 would be expanded with 2x32 + 1x16.
             */
            some = QEMU_ALIGN_DOWN(oprsz, 32);
            expand_2s_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
                          t_vec, g->scalar_first, g->fniv);
            if (some == oprsz) {
                break;
            }
            dofs += some;
            aofs += some;
            oprsz -= some;
            maxsz -= some;
            /* fallthru */

        case TCG_TYPE_V128:
            expand_2s_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
                          t_vec, g->scalar_first, g->fniv);
            break;

        case TCG_TYPE_V64:
            expand_2s_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
                          t_vec, g->scalar_first, g->fniv);
            break;

        default:
            g_assert_not_reached();
        }
        tcg_temp_free_vec(t_vec);
    } else if (g->fni8 && check_size_impl(oprsz, 8)) {
        TCGv_i64 t64 = tcg_temp_new_i64();

        gen_dup_i64(g->vece, t64, c);
        expand_2s_i64(dofs, aofs, oprsz, t64, g->scalar_first, g->fni8);
        tcg_temp_free_i64(t64);
    } else if (g->fni4 && check_size_impl(oprsz, 4)) {
        TCGv_i32 t32 = tcg_temp_new_i32();

        tcg_gen_extrl_i64_i32(t32, c);
        gen_dup_i32(g->vece, t32, t32);
        expand_2s_i32(dofs, aofs, oprsz, t32, g->scalar_first, g->fni4);
        tcg_temp_free_i32(t32);
    } else {
        tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, 0, g->fno);
        return;
    }

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Expand a vector three-operand operation.  */
void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g)
{
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs | bofs);
    check_overlap_3(dofs, aofs, bofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_3_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
                     g->load_dest, g->fniv);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        bofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
                     g->load_dest, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
                     g->load_dest, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_3_i64(dofs, aofs, bofs, oprsz, g->load_dest, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_3_i32(dofs, aofs, bofs, oprsz, g->load_dest, g->fni4);
        } else {
            assert(g->fno != NULL);
            tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz,
                               maxsz, g->data, g->fno);
            return;
        }
        break;

    default:
        g_assert_not_reached();
    }

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Expand a vector four-operand operation.  */
void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen4 *g)
{
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs);
    check_overlap_4(dofs, aofs, bofs, cofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, some,
                     32, TCG_TYPE_V256, g->fniv);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        bofs += some;
        cofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
                     16, TCG_TYPE_V128, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
                     8, TCG_TYPE_V64, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_4_i64(dofs, aofs, bofs, cofs, oprsz, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_4_i32(dofs, aofs, bofs, cofs, oprsz, g->fni4);
        } else {
            assert(g->fno != NULL);
            tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs,
                               oprsz, maxsz, g->data, g->fno);
            return;
        }
        break;

    default:
        g_assert_not_reached();
    }

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/*
 * Expand specific vector operations.
 */

static void vec_mov2(unsigned vece, TCGv_vec a, TCGv_vec b)
{
    tcg_gen_mov_vec(a, b);
}

void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2 g = {
        .fni8 = tcg_gen_mov_i64,
        .fniv = vec_mov2,
        .fno = gen_helper_gvec_mov,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
    if (dofs != aofs) {
        tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
    } else {
        check_size_align(oprsz, maxsz, dofs);
        if (oprsz < maxsz) {
            expand_clr(dofs + oprsz, maxsz - oprsz);
        }
    }
}

void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t oprsz,
                          uint32_t maxsz, TCGv_i32 in)
{
    check_size_align(oprsz, maxsz, dofs);
    tcg_debug_assert(vece <= MO_32);
    do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
}

void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t oprsz,
                          uint32_t maxsz, TCGv_i64 in)
{
    check_size_align(oprsz, maxsz, dofs);
    tcg_debug_assert(vece <= MO_64);
    do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
}

void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t oprsz, uint32_t maxsz)
{
    if (vece <= MO_32) {
        TCGv_i32 in = tcg_temp_new_i32();
        switch (vece) {
        case MO_8:
            tcg_gen_ld8u_i32(in, cpu_env, aofs);
            break;
        case MO_16:
            tcg_gen_ld16u_i32(in, cpu_env, aofs);
            break;
        case MO_32:
            tcg_gen_ld_i32(in, cpu_env, aofs);
            break;
        }
        tcg_gen_gvec_dup_i32(vece, dofs, oprsz, maxsz, in);
        tcg_temp_free_i32(in);
    } else if (vece == MO_64) {
        TCGv_i64 in = tcg_temp_new_i64();
        tcg_gen_ld_i64(in, cpu_env, aofs);
        tcg_gen_gvec_dup_i64(MO_64, dofs, oprsz, maxsz, in);
        tcg_temp_free_i64(in);
    } else {
        /* 128-bit duplicate.  */
        /* ??? Dup to 256-bit vector.  */
        int i;

        tcg_debug_assert(vece == 4);
        tcg_debug_assert(oprsz >= 16);
        if (TCG_TARGET_HAS_v128) {
            TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V128);

            tcg_gen_ld_vec(in, cpu_env, aofs);
            for (i = 0; i < oprsz; i += 16) {
                tcg_gen_st_vec(in, cpu_env, dofs + i);
            }
            tcg_temp_free_vec(in);
        } else {
            TCGv_i64 in0 = tcg_temp_new_i64();
            TCGv_i64 in1 = tcg_temp_new_i64();

            tcg_gen_ld_i64(in0, cpu_env, aofs);
            tcg_gen_ld_i64(in1, cpu_env, aofs + 8);
            for (i = 0; i < oprsz; i += 16) {
                tcg_gen_st_i64(in0, cpu_env, dofs + i);
                tcg_gen_st_i64(in1, cpu_env, dofs + i + 8);
            }
            tcg_temp_free_i64(in0);
            tcg_temp_free_i64(in1);
        }
    }
}

void tcg_gen_gvec_dup64i(uint32_t dofs, uint32_t oprsz,
                         uint32_t maxsz, uint64_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(MO_64, dofs, oprsz, maxsz, NULL, NULL, x);
}

void tcg_gen_gvec_dup32i(uint32_t dofs, uint32_t oprsz,
                         uint32_t maxsz, uint32_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(MO_32, dofs, oprsz, maxsz, NULL, NULL, x);
}

void tcg_gen_gvec_dup16i(uint32_t dofs, uint32_t oprsz,
                         uint32_t maxsz, uint16_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(MO_16, dofs, oprsz, maxsz, NULL, NULL, x);
}

void tcg_gen_gvec_dup8i(uint32_t dofs, uint32_t oprsz,
                        uint32_t maxsz, uint8_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(MO_8, dofs, oprsz, maxsz, NULL, NULL, x);
}
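
/* For example (illustrative values): tcg_gen_gvec_dup32i(dofs, 16, 16,
 * 0xdeadbeef) stores the 32-bit pattern into all four lanes of a
 * 16-byte register, while a zero constant with oprsz < maxsz clears
 * the entire maxsz region in one pass (see do_dup above).
 */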

void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2 g = {
        .fni8 = tcg_gen_not_i64,
        .fniv = tcg_gen_not_vec,
        .fno = gen_helper_gvec_not,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
    tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
}

/* Perform a vector addition using normal addition and a mask.  The mask
   should be the sign bit of each lane.  This 6-operation form is more
   efficient than separate additions when there are 4 or more lanes in
   the 64-bit operation.  */
static void gen_addv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
{
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    TCGv_i64 t3 = tcg_temp_new_i64();

    tcg_gen_andc_i64(t1, a, m);
    tcg_gen_andc_i64(t2, b, m);
    tcg_gen_xor_i64(t3, a, b);
    tcg_gen_add_i64(d, t1, t2);
    tcg_gen_and_i64(t3, t3, m);
    tcg_gen_xor_i64(d, d, t3);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t3);
}
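
/* To see why gen_addv_mask works, take MO_8 lanes with m = 0x80...80:
 * a & ~m and b & ~m drop each lane's top bit, so the 7-bit sums in
 * t1 + t2 cannot carry across lane boundaries.  The true top bit of
 * each lane is sign(a) ^ sign(b) ^ carry-in, which is exactly
 * ((a ^ b) & m) xored into the partial sum.  E.g. for one byte lane,
 * 0xff + 0x01: t1 = 0x7f, t2 = 0x01, t1 + t2 = 0x80, (a ^ b) & m =
 * 0x80, and 0x80 ^ 0x80 = 0x00 -- the correct wrapped result, with no
 * carry leaking into the neighboring lane.
 */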

void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
    gen_addv_mask(d, a, b, m);
    tcg_temp_free_i64(m);
}

void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
    gen_addv_mask(d, a, b, m);
    tcg_temp_free_i64(m);
}

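/* For the two 32-bit lanes we can use full 64-bit adds: t2 = a + b has
 * the correct low lane, and t1 = (a & ~0xffffffff) + b computes the
 * high lane with no carry out of the (cleared) low half of a; the
 * deposit then stitches the two correct halves together.
 */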
void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();

    tcg_gen_andi_i64(t1, a, ~0xffffffffull);
    tcg_gen_add_i64(t2, a, b);
    tcg_gen_add_i64(t1, t1, b);
    tcg_gen_deposit_i64(d, t1, t2, 0, 32);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
}

void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g[4] = {
        { .fni8 = tcg_gen_vec_add8_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_add8,
          .opc = INDEX_op_add_vec,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_add16_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_add16,
          .opc = INDEX_op_add_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_add_i32,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_add32,
          .opc = INDEX_op_add_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_add_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_add64,
          .opc = INDEX_op_add_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}
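
/* A hypothetical call site (illustrative offsets): adding two 16-byte
 * vector registers lane-wise as 32-bit elements would be
 *
 *     tcg_gen_gvec_add(MO_32, dofs, aofs, bofs, 16, 16);
 *
 * which uses add_vec on a vector-capable host, falls back to four
 * tcg_gen_add_i32 operations, or calls gen_helper_gvec_add32 out of
 * line.
 */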
1447
1448void tcg_gen_gvec_adds(unsigned vece, uint32_t dofs, uint32_t aofs,
1449                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1450{
1451    static const GVecGen2s g[4] = {
1452        { .fni8 = tcg_gen_vec_add8_i64,
1453          .fniv = tcg_gen_add_vec,
1454          .fno = gen_helper_gvec_adds8,
1455          .opc = INDEX_op_add_vec,
1456          .vece = MO_8 },
1457        { .fni8 = tcg_gen_vec_add16_i64,
1458          .fniv = tcg_gen_add_vec,
1459          .fno = gen_helper_gvec_adds16,
1460          .opc = INDEX_op_add_vec,
1461          .vece = MO_16 },
1462        { .fni4 = tcg_gen_add_i32,
1463          .fniv = tcg_gen_add_vec,
1464          .fno = gen_helper_gvec_adds32,
1465          .opc = INDEX_op_add_vec,
1466          .vece = MO_32 },
1467        { .fni8 = tcg_gen_add_i64,
1468          .fniv = tcg_gen_add_vec,
1469          .fno = gen_helper_gvec_adds64,
1470          .opc = INDEX_op_add_vec,
1471          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1472          .vece = MO_64 },
1473    };
1474
1475    tcg_debug_assert(vece <= MO_64);
1476    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
1477}
1478
1479void tcg_gen_gvec_addi(unsigned vece, uint32_t dofs, uint32_t aofs,
1480                       int64_t c, uint32_t oprsz, uint32_t maxsz)
1481{
1482    TCGv_i64 tmp = tcg_const_i64(c);
1483    tcg_gen_gvec_adds(vece, dofs, aofs, tmp, oprsz, maxsz);
1484    tcg_temp_free_i64(tmp);
1485}
1486
1487void tcg_gen_gvec_subs(unsigned vece, uint32_t dofs, uint32_t aofs,
1488                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1489{
1490    static const GVecGen2s g[4] = {
1491        { .fni8 = tcg_gen_vec_sub8_i64,
1492          .fniv = tcg_gen_sub_vec,
1493          .fno = gen_helper_gvec_subs8,
1494          .opc = INDEX_op_sub_vec,
1495          .vece = MO_8 },
1496        { .fni8 = tcg_gen_vec_sub16_i64,
1497          .fniv = tcg_gen_sub_vec,
1498          .fno = gen_helper_gvec_subs16,
1499          .opc = INDEX_op_sub_vec,
1500          .vece = MO_16 },
1501        { .fni4 = tcg_gen_sub_i32,
1502          .fniv = tcg_gen_sub_vec,
1503          .fno = gen_helper_gvec_subs32,
1504          .opc = INDEX_op_sub_vec,
1505          .vece = MO_32 },
1506        { .fni8 = tcg_gen_sub_i64,
1507          .fniv = tcg_gen_sub_vec,
1508          .fno = gen_helper_gvec_subs64,
1509          .opc = INDEX_op_sub_vec,
1510          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1511          .vece = MO_64 },
1512    };
1513
1514    tcg_debug_assert(vece <= MO_64);
1515    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
1516}
1517
1518/* Perform a vector subtraction using normal subtraction and a mask.
1519   Compare gen_addv_mask above.  */
1520static void gen_subv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
1521{
1522    TCGv_i64 t1 = tcg_temp_new_i64();
1523    TCGv_i64 t2 = tcg_temp_new_i64();
1524    TCGv_i64 t3 = tcg_temp_new_i64();
1525
1526    tcg_gen_or_i64(t1, a, m);
1527    tcg_gen_andc_i64(t2, b, m);
1528    tcg_gen_eqv_i64(t3, a, b);
1529    tcg_gen_sub_i64(d, t1, t2);
1530    tcg_gen_and_i64(t3, t3, m);
1531    tcg_gen_xor_i64(d, d, t3);
1532
1533    tcg_temp_free_i64(t1);
1534    tcg_temp_free_i64(t2);
1535    tcg_temp_free_i64(t3);
1536}
1537
1538void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1539{
1540    TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
1541    gen_subv_mask(d, a, b, m);
1542    tcg_temp_free_i64(m);
1543}
1544
1545void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1546{
1547    TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
1548    gen_subv_mask(d, a, b, m);
1549    tcg_temp_free_i64(m);
1550}
1551
1552void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1553{
1554    TCGv_i64 t1 = tcg_temp_new_i64();
1555    TCGv_i64 t2 = tcg_temp_new_i64();
1556
1557    tcg_gen_andi_i64(t1, b, ~0xffffffffull);
1558    tcg_gen_sub_i64(t2, a, b);
1559    tcg_gen_sub_i64(t1, a, t1);
1560    tcg_gen_deposit_i64(d, t1, t2, 0, 32);
1561
1562    tcg_temp_free_i64(t1);
1563    tcg_temp_free_i64(t2);
1564}
1565
1566void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs,
1567                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1568{
1569    static const GVecGen3 g[4] = {
1570        { .fni8 = tcg_gen_vec_sub8_i64,
1571          .fniv = tcg_gen_sub_vec,
1572          .fno = gen_helper_gvec_sub8,
1573          .opc = INDEX_op_sub_vec,
1574          .vece = MO_8 },
1575        { .fni8 = tcg_gen_vec_sub16_i64,
1576          .fniv = tcg_gen_sub_vec,
1577          .fno = gen_helper_gvec_sub16,
1578          .opc = INDEX_op_sub_vec,
1579          .vece = MO_16 },
1580        { .fni4 = tcg_gen_sub_i32,
1581          .fniv = tcg_gen_sub_vec,
1582          .fno = gen_helper_gvec_sub32,
1583          .opc = INDEX_op_sub_vec,
1584          .vece = MO_32 },
1585        { .fni8 = tcg_gen_sub_i64,
1586          .fniv = tcg_gen_sub_vec,
1587          .fno = gen_helper_gvec_sub64,
1588          .opc = INDEX_op_sub_vec,
1589          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1590          .vece = MO_64 },
1591    };
1592
1593    tcg_debug_assert(vece <= MO_64);
1594    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1595}
1596
1597void tcg_gen_gvec_mul(unsigned vece, uint32_t dofs, uint32_t aofs,
1598                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1599{
1600    static const GVecGen3 g[4] = {
1601        { .fniv = tcg_gen_mul_vec,
1602          .fno = gen_helper_gvec_mul8,
1603          .opc = INDEX_op_mul_vec,
1604          .vece = MO_8 },
1605        { .fniv = tcg_gen_mul_vec,
1606          .fno = gen_helper_gvec_mul16,
1607          .opc = INDEX_op_mul_vec,
1608          .vece = MO_16 },
1609        { .fni4 = tcg_gen_mul_i32,
1610          .fniv = tcg_gen_mul_vec,
1611          .fno = gen_helper_gvec_mul32,
1612          .opc = INDEX_op_mul_vec,
1613          .vece = MO_32 },
1614        { .fni8 = tcg_gen_mul_i64,
1615          .fniv = tcg_gen_mul_vec,
1616          .fno = gen_helper_gvec_mul64,
1617          .opc = INDEX_op_mul_vec,
1618          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1619          .vece = MO_64 },
1620    };
1621
1622    tcg_debug_assert(vece <= MO_64);
1623    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1624}
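
/*
 * Note that the MO_8 and MO_16 entries above supply no .fni8 fallback:
 * unlike add and sub, multiplication has no comparably cheap mask trick
 * for keeping sub-word lanes independent, so those element sizes use
 * the host vector multiply or go out of line to the helper.
 */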
1625
1626void tcg_gen_gvec_muls(unsigned vece, uint32_t dofs, uint32_t aofs,
1627                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1628{
1629    static const GVecGen2s g[4] = {
1630        { .fniv = tcg_gen_mul_vec,
1631          .fno = gen_helper_gvec_muls8,
1632          .opc = INDEX_op_mul_vec,
1633          .vece = MO_8 },
1634        { .fniv = tcg_gen_mul_vec,
1635          .fno = gen_helper_gvec_muls16,
1636          .opc = INDEX_op_mul_vec,
1637          .vece = MO_16 },
1638        { .fni4 = tcg_gen_mul_i32,
1639          .fniv = tcg_gen_mul_vec,
1640          .fno = gen_helper_gvec_muls32,
1641          .opc = INDEX_op_mul_vec,
1642          .vece = MO_32 },
1643        { .fni8 = tcg_gen_mul_i64,
1644          .fniv = tcg_gen_mul_vec,
1645          .fno = gen_helper_gvec_muls64,
1646          .opc = INDEX_op_mul_vec,
1647          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1648          .vece = MO_64 },
1649    };
1650
1651    tcg_debug_assert(vece <= MO_64);
1652    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
1653}
1654
1655void tcg_gen_gvec_muli(unsigned vece, uint32_t dofs, uint32_t aofs,
1656                       int64_t c, uint32_t oprsz, uint32_t maxsz)
1657{
1658    TCGv_i64 tmp = tcg_const_i64(c);
1659    tcg_gen_gvec_muls(vece, dofs, aofs, tmp, oprsz, maxsz);
1660    tcg_temp_free_i64(tmp);
1661}
1662
1663void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs,
1664                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1665{
1666    static const GVecGen3 g[4] = {
1667        { .fno = gen_helper_gvec_ssadd8, .vece = MO_8 },
1668        { .fno = gen_helper_gvec_ssadd16, .vece = MO_16 },
1669        { .fno = gen_helper_gvec_ssadd32, .vece = MO_32 },
1670        { .fno = gen_helper_gvec_ssadd64, .vece = MO_64 }
1671    };
1672    tcg_debug_assert(vece <= MO_64);
1673    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1674}
1675
1676void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs,
1677                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1678{
1679    static const GVecGen3 g[4] = {
1680        { .fno = gen_helper_gvec_sssub8, .vece = MO_8 },
1681        { .fno = gen_helper_gvec_sssub16, .vece = MO_16 },
1682        { .fno = gen_helper_gvec_sssub32, .vece = MO_32 },
1683        { .fno = gen_helper_gvec_sssub64, .vece = MO_64 }
1684    };
1685    tcg_debug_assert(vece <= MO_64);
1686    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1687}
1688
1689static void tcg_gen_vec_usadd32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1690{
1691    TCGv_i32 max = tcg_const_i32(-1);
1692    tcg_gen_add_i32(d, a, b);
1693    tcg_gen_movcond_i32(TCG_COND_LTU, d, d, a, max, d);
1694    tcg_temp_free_i32(max);
1695}
1696
1697static void tcg_gen_vec_usadd64_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1698{
1699    TCGv_i64 max = tcg_const_i64(-1);
1700    tcg_gen_add_i64(d, a, b);
1701    tcg_gen_movcond_i64(TCG_COND_LTU, d, d, a, max, d);
1702    tcg_temp_free_i64(max);
1703}
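
/*
 * What the movcond in the two helpers above computes, sketched on host
 * integers (illustrative only): unsigned addition wrapped exactly when
 * the sum is smaller than an addend, in which case the result saturates
 * to the all-ones maximum.
 */
static inline uint64_t usadd_example(uint64_t a, uint64_t b)
{
    uint64_t d = a + b;
    return d < a ? UINT64_MAX : d;      /* saturate on wrap-around */
}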
1704
1705void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs,
1706                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1707{
1708    static const GVecGen3 g[4] = {
1709        { .fno = gen_helper_gvec_usadd8, .vece = MO_8 },
1710        { .fno = gen_helper_gvec_usadd16, .vece = MO_16 },
1711        { .fni4 = tcg_gen_vec_usadd32_i32,
1712          .fno = gen_helper_gvec_usadd32,
1713          .vece = MO_32 },
1714        { .fni8 = tcg_gen_vec_usadd64_i64,
1715          .fno = gen_helper_gvec_usadd64,
1716          .vece = MO_64 }
1717    };
1718    tcg_debug_assert(vece <= MO_64);
1719    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1720}
1721
1722static void tcg_gen_vec_ussub32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1723{
1724    TCGv_i32 min = tcg_const_i32(0);
1725    tcg_gen_sub_i32(d, a, b);
1726    tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, min, d);
1727    tcg_temp_free_i32(min);
1728}
1729
1730static void tcg_gen_vec_ussub64_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1731{
1732    TCGv_i64 min = tcg_const_i64(0);
1733    tcg_gen_sub_i64(d, a, b);
1734    tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, min, d);
1735    tcg_temp_free_i64(min);
1736}
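
/*
 * Likewise for saturating subtraction (illustrative only): the
 * difference would wrap exactly when A is unsigned-less-than B,
 * in which case the result clamps to zero.
 */
static inline uint64_t ussub_example(uint64_t a, uint64_t b)
{
    return a < b ? 0 : a - b;           /* clamp at the minimum */
}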
1737
1738void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs,
1739                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1740{
1741    static const GVecGen3 g[4] = {
1742        { .fno = gen_helper_gvec_ussub8, .vece = MO_8 },
1743        { .fno = gen_helper_gvec_ussub16, .vece = MO_16 },
1744        { .fni4 = tcg_gen_vec_ussub32_i32,
1745          .fno = gen_helper_gvec_ussub32,
1746          .vece = MO_32 },
1747        { .fni8 = tcg_gen_vec_ussub64_i64,
1748          .fno = gen_helper_gvec_ussub64,
1749          .vece = MO_64 }
1750    };
1751    tcg_debug_assert(vece <= MO_64);
1752    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1753}
1754
1755/* Perform a vector negation using normal negation and a mask.
1756   Compare gen_subv_mask above.  */
1757static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m)
1758{
1759    TCGv_i64 t2 = tcg_temp_new_i64();
1760    TCGv_i64 t3 = tcg_temp_new_i64();
1761
1762    tcg_gen_andc_i64(t3, m, b);
1763    tcg_gen_andc_i64(t2, b, m);
1764    tcg_gen_sub_i64(d, m, t2);
1765    tcg_gen_xor_i64(d, d, t3);
1766
1767    tcg_temp_free_i64(t2);
1768    tcg_temp_free_i64(t3);
1769}
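
/*
 * Host-integer sketch (illustrative only): this is subv_mask_example
 * above with A == 0 folded through, since (0 | m) == m and
 * eqv(0, b) == ~b.
 */
static inline uint64_t negv_mask_example(uint64_t b, uint64_t m)
{
    return (m - (b & ~m)) ^ (~b & m);
}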
1770
1771void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 b)
1772{
1773    TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
1774    gen_negv_mask(d, b, m);
1775    tcg_temp_free_i64(m);
1776}
1777
1778void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 b)
1779{
1780    TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
1781    gen_negv_mask(d, b, m);
1782    tcg_temp_free_i64(m);
1783}
1784
1785void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 b)
1786{
1787    TCGv_i64 t1 = tcg_temp_new_i64();
1788    TCGv_i64 t2 = tcg_temp_new_i64();
1789
1790    tcg_gen_andi_i64(t1, b, ~0xffffffffull);
1791    tcg_gen_neg_i64(t2, b);
1792    tcg_gen_neg_i64(t1, t1);
1793    tcg_gen_deposit_i64(d, t1, t2, 0, 32);
1794
1795    tcg_temp_free_i64(t1);
1796    tcg_temp_free_i64(t2);
1797}
1798
1799void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs,
1800                      uint32_t oprsz, uint32_t maxsz)
1801{
1802    static const GVecGen2 g[4] = {
1803        { .fni8 = tcg_gen_vec_neg8_i64,
1804          .fniv = tcg_gen_neg_vec,
1805          .fno = gen_helper_gvec_neg8,
1806          .opc = INDEX_op_neg_vec,
1807          .vece = MO_8 },
1808        { .fni8 = tcg_gen_vec_neg16_i64,
1809          .fniv = tcg_gen_neg_vec,
1810          .fno = gen_helper_gvec_neg16,
1811          .opc = INDEX_op_neg_vec,
1812          .vece = MO_16 },
1813        { .fni4 = tcg_gen_neg_i32,
1814          .fniv = tcg_gen_neg_vec,
1815          .fno = gen_helper_gvec_neg32,
1816          .opc = INDEX_op_neg_vec,
1817          .vece = MO_32 },
1818        { .fni8 = tcg_gen_neg_i64,
1819          .fniv = tcg_gen_neg_vec,
1820          .fno = gen_helper_gvec_neg64,
1821          .opc = INDEX_op_neg_vec,
1822          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1823          .vece = MO_64 },
1824    };
1825
1826    tcg_debug_assert(vece <= MO_64);
1827    tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
1828}
1829
1830void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs,
1831                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1832{
1833    static const GVecGen3 g = {
1834        .fni8 = tcg_gen_and_i64,
1835        .fniv = tcg_gen_and_vec,
1836        .fno = gen_helper_gvec_and,
1837        .opc = INDEX_op_and_vec,
1838        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1839    };
1840    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
1841}
1842
1843void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs,
1844                     uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1845{
1846    static const GVecGen3 g = {
1847        .fni8 = tcg_gen_or_i64,
1848        .fniv = tcg_gen_or_vec,
1849        .fno = gen_helper_gvec_or,
1850        .opc = INDEX_op_or_vec,
1851        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1852    };
1853    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
1854}
1855
1856void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs,
1857                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1858{
1859    static const GVecGen3 g = {
1860        .fni8 = tcg_gen_xor_i64,
1861        .fniv = tcg_gen_xor_vec,
1862        .fno = gen_helper_gvec_xor,
1863        .opc = INDEX_op_xor_vec,
1864        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1865    };
1866    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
1867}
1868
1869void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs,
1870                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1871{
1872    static const GVecGen3 g = {
1873        .fni8 = tcg_gen_andc_i64,
1874        .fniv = tcg_gen_andc_vec,
1875        .fno = gen_helper_gvec_andc,
1876        .opc = INDEX_op_andc_vec,
1877        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1878    };
1879    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
1880}
1881
1882void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs,
1883                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1884{
1885    static const GVecGen3 g = {
1886        .fni8 = tcg_gen_orc_i64,
1887        .fniv = tcg_gen_orc_vec,
1888        .fno = gen_helper_gvec_orc,
1889        .opc = INDEX_op_orc_vec,
1890        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1891    };
1892    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
1893}
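
/*
 * Note that each of the bitwise expansions above shares one GVecGen3
 * for all element sizes: AND, OR, XOR, ANDC and ORC act identically on
 * every lane width, so the vece argument is accepted for interface
 * uniformity but otherwise unused.
 */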
1894
1895static const GVecGen2s gop_ands = {
1896    .fni8 = tcg_gen_and_i64,
1897    .fniv = tcg_gen_and_vec,
1898    .fno = gen_helper_gvec_ands,
1899    .opc = INDEX_op_and_vec,
1900    .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1901    .vece = MO_64
1902};
1903
1904void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs,
1905                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1906{
1907    TCGv_i64 tmp = tcg_temp_new_i64();
1908    gen_dup_i64(vece, tmp, c);
1909    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
1910    tcg_temp_free_i64(tmp);
1911}
1912
1913void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs,
1914                       int64_t c, uint32_t oprsz, uint32_t maxsz)
1915{
1916    TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
1917    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
1918    tcg_temp_free_i64(tmp);
1919}
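
/*
 * For reference, a sketch of the replication performed by dup_const
 * (illustrative only): the scalar is truncated to the element size and
 * copied across all 64 bits by a multiply.
 */
static inline uint64_t dup_const_example(unsigned vece, uint64_t c)
{
    switch (vece) {
    case MO_8:  return 0x0101010101010101ull * (uint8_t)c;
    case MO_16: return 0x0001000100010001ull * (uint16_t)c;
    case MO_32: return 0x0000000100000001ull * (uint32_t)c;
    default:    return c;               /* MO_64 is already full width */
    }
}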
1920
1921static const GVecGen2s gop_xors = {
1922    .fni8 = tcg_gen_xor_i64,
1923    .fniv = tcg_gen_xor_vec,
1924    .fno = gen_helper_gvec_xors,
1925    .opc = INDEX_op_xor_vec,
1926    .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1927    .vece = MO_64
1928};
1929
1930void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs,
1931                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1932{
1933    TCGv_i64 tmp = tcg_temp_new_i64();
1934    gen_dup_i64(vece, tmp, c);
1935    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
1936    tcg_temp_free_i64(tmp);
1937}
1938
1939void tcg_gen_gvec_xori(unsigned vece, uint32_t dofs, uint32_t aofs,
1940                       int64_t c, uint32_t oprsz, uint32_t maxsz)
1941{
1942    TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
1943    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
1944    tcg_temp_free_i64(tmp);
1945}
1946
1947static const GVecGen2s gop_ors = {
1948    .fni8 = tcg_gen_or_i64,
1949    .fniv = tcg_gen_or_vec,
1950    .fno = gen_helper_gvec_ors,
1951    .opc = INDEX_op_or_vec,
1952    .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1953    .vece = MO_64
1954};
1955
1956void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs,
1957                      TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1958{
1959    TCGv_i64 tmp = tcg_temp_new_i64();
1960    gen_dup_i64(vece, tmp, c);
1961    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
1962    tcg_temp_free_i64(tmp);
1963}
1964
1965void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs,
1966                      int64_t c, uint32_t oprsz, uint32_t maxsz)
1967{
1968    TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
1969    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
1970    tcg_temp_free_i64(tmp);
1971}
1972
1973void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
1974{
1975    uint64_t mask = dup_const(MO_8, 0xff << c);
1976    tcg_gen_shli_i64(d, a, c);
1977    tcg_gen_andi_i64(d, d, mask);
1978}
1979
1980void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
1981{
1982    uint64_t mask = dup_const(MO_16, 0xffff << c);
1983    tcg_gen_shli_i64(d, a, c);
1984    tcg_gen_andi_i64(d, d, mask);
1985}
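
/*
 * Host-integer sketch of the shift-and-mask trick (illustrative only):
 * after the full-width shift, the low C bits of each lane hold bits
 * that leaked in from the lane below, and the replicated mask clears
 * exactly those positions.
 */
static inline uint64_t shl8x8_example(uint64_t a, unsigned c)
{
    uint64_t mask = 0x0101010101010101ull * ((0xff << c) & 0xff);
    return (a << c) & mask;
}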
1986
1987void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs,
1988                       int64_t shift, uint32_t oprsz, uint32_t maxsz)
1989{
1990    static const GVecGen2i g[4] = {
1991        { .fni8 = tcg_gen_vec_shl8i_i64,
1992          .fniv = tcg_gen_shli_vec,
1993          .fno = gen_helper_gvec_shl8i,
1994          .opc = INDEX_op_shli_vec,
1995          .vece = MO_8 },
1996        { .fni8 = tcg_gen_vec_shl16i_i64,
1997          .fniv = tcg_gen_shli_vec,
1998          .fno = gen_helper_gvec_shl16i,
1999          .opc = INDEX_op_shli_vec,
2000          .vece = MO_16 },
2001        { .fni4 = tcg_gen_shli_i32,
2002          .fniv = tcg_gen_shli_vec,
2003          .fno = gen_helper_gvec_shl32i,
2004          .opc = INDEX_op_shli_vec,
2005          .vece = MO_32 },
2006        { .fni8 = tcg_gen_shli_i64,
2007          .fniv = tcg_gen_shli_vec,
2008          .fno = gen_helper_gvec_shl64i,
2009          .opc = INDEX_op_shli_vec,
2010          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2011          .vece = MO_64 },
2012    };
2013
2014    tcg_debug_assert(vece <= MO_64);
2015    tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2016    if (shift == 0) {
2017        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2018    } else {
2019        tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2020    }
2021}
2022
2023void tcg_gen_vec_shr8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2024{
2025    uint64_t mask = dup_const(MO_8, 0xff >> c);
2026    tcg_gen_shri_i64(d, a, c);
2027    tcg_gen_andi_i64(d, d, mask);
2028}
2029
2030void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2031{
2032    uint64_t mask = dup_const(MO_16, 0xffff >> c);
2033    tcg_gen_shri_i64(d, a, c);
2034    tcg_gen_andi_i64(d, d, mask);
2035}
2036
2037void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs,
2038                       int64_t shift, uint32_t oprsz, uint32_t maxsz)
2039{
2040    static const GVecGen2i g[4] = {
2041        { .fni8 = tcg_gen_vec_shr8i_i64,
2042          .fniv = tcg_gen_shri_vec,
2043          .fno = gen_helper_gvec_shr8i,
2044          .opc = INDEX_op_shri_vec,
2045          .vece = MO_8 },
2046        { .fni8 = tcg_gen_vec_shr16i_i64,
2047          .fniv = tcg_gen_shri_vec,
2048          .fno = gen_helper_gvec_shr16i,
2049          .opc = INDEX_op_shri_vec,
2050          .vece = MO_16 },
2051        { .fni4 = tcg_gen_shri_i32,
2052          .fniv = tcg_gen_shri_vec,
2053          .fno = gen_helper_gvec_shr32i,
2054          .opc = INDEX_op_shri_vec,
2055          .vece = MO_32 },
2056        { .fni8 = tcg_gen_shri_i64,
2057          .fniv = tcg_gen_shri_vec,
2058          .fno = gen_helper_gvec_shr64i,
2059          .opc = INDEX_op_shri_vec,
2060          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2061          .vece = MO_64 },
2062    };
2063
2064    tcg_debug_assert(vece <= MO_64);
2065    tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2066    if (shift == 0) {
2067        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2068    } else {
2069        tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2070    }
2071}
2072
2073void tcg_gen_vec_sar8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2074{
2075    uint64_t s_mask = dup_const(MO_8, 0x80 >> c);
2076    uint64_t c_mask = dup_const(MO_8, 0xff >> c);
2077    TCGv_i64 s = tcg_temp_new_i64();
2078
2079    tcg_gen_shri_i64(d, a, c);
2080    tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
2081    tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
2082    tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign  */
2083    tcg_gen_or_i64(d, d, s);         /* include sign extension */
2084    tcg_temp_free_i64(s);
2085}
2086
2087void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2088{
2089    uint64_t s_mask = dup_const(MO_16, 0x8000 >> c);
2090    uint64_t c_mask = dup_const(MO_16, 0xffff >> c);
2091    TCGv_i64 s = tcg_temp_new_i64();
2092
2093    tcg_gen_shri_i64(d, a, c);
2094    tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
2095    tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign  */
2096    tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
2097    tcg_gen_or_i64(d, d, s);         /* include sign extension */
2098    tcg_temp_free_i64(s);
2099}
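
/*
 * Host-integer sketch of the sign replication (illustrative only), for
 * eight 8-bit lanes and 0 <= c < 8.  Each isolated sign bit sits at
 * lane position 7 - c; multiplying by (2 << c) - 2, i.e. c one bits
 * followed by a zero, copies it into every position above it, and the
 * per-lane products fit in eight bits, so the multiply cannot carry
 * across lanes.
 */
static inline uint64_t sar8x8_example(uint64_t a, unsigned c)
{
    uint64_t s_mask = 0x0101010101010101ull * (0x80 >> c);
    uint64_t c_mask = 0x0101010101010101ull * (0xff >> c);
    uint64_t d = a >> c;
    uint64_t s = d & s_mask;            /* isolated per-lane sign bits */

    return (d & c_mask) | s * ((2 << c) - 2);
}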
2100
2101void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs,
2102                       int64_t shift, uint32_t oprsz, uint32_t maxsz)
2103{
2104    static const GVecGen2i g[4] = {
2105        { .fni8 = tcg_gen_vec_sar8i_i64,
2106          .fniv = tcg_gen_sari_vec,
2107          .fno = gen_helper_gvec_sar8i,
2108          .opc = INDEX_op_sari_vec,
2109          .vece = MO_8 },
2110        { .fni8 = tcg_gen_vec_sar16i_i64,
2111          .fniv = tcg_gen_sari_vec,
2112          .fno = gen_helper_gvec_sar16i,
2113          .opc = INDEX_op_sari_vec,
2114          .vece = MO_16 },
2115        { .fni4 = tcg_gen_sari_i32,
2116          .fniv = tcg_gen_sari_vec,
2117          .fno = gen_helper_gvec_sar32i,
2118          .opc = INDEX_op_sari_vec,
2119          .vece = MO_32 },
2120        { .fni8 = tcg_gen_sari_i64,
2121          .fniv = tcg_gen_sari_vec,
2122          .fno = gen_helper_gvec_sar64i,
2123          .opc = INDEX_op_sari_vec,
2124          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2125          .vece = MO_64 },
2126    };
2127
2128    tcg_debug_assert(vece <= MO_64);
2129    tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2130    if (shift == 0) {
2131        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2132    } else {
2133        tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2134    }
2135}
2136
2137/* Expand OPRSZ bytes worth of three-operand comparisons using i32 elements.  */
2138static void expand_cmp_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
2139                           uint32_t oprsz, TCGCond cond)
2140{
2141    TCGv_i32 t0 = tcg_temp_new_i32();
2142    TCGv_i32 t1 = tcg_temp_new_i32();
2143    uint32_t i;
2144
2145    for (i = 0; i < oprsz; i += 4) {
2146        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
2147        tcg_gen_ld_i32(t1, cpu_env, bofs + i);
2148        tcg_gen_setcond_i32(cond, t0, t0, t1); /* t0 = cond(a, b) ? 1 : 0 */
2149        tcg_gen_neg_i32(t0, t0);               /* 0/1 -> 0/-1 (all-ones true) */
2150        tcg_gen_st_i32(t0, cpu_env, dofs + i);
2151    }
2152    tcg_temp_free_i32(t1);
2153    tcg_temp_free_i32(t0);
2154}
2155
2156static void expand_cmp_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
2157                           uint32_t oprsz, TCGCond cond)
2158{
2159    TCGv_i64 t0 = tcg_temp_new_i64();
2160    TCGv_i64 t1 = tcg_temp_new_i64();
2161    uint32_t i;
2162
2163    for (i = 0; i < oprsz; i += 8) {
2164        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
2165        tcg_gen_ld_i64(t1, cpu_env, bofs + i);
2166        tcg_gen_setcond_i64(cond, t0, t0, t1);
2167        tcg_gen_neg_i64(t0, t0);
2168        tcg_gen_st_i64(t0, cpu_env, dofs + i);
2169    }
2170    tcg_temp_free_i64(t1);
2171    tcg_temp_free_i64(t0);
2172}
2173
2174static void expand_cmp_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
2175                           uint32_t bofs, uint32_t oprsz, uint32_t tysz,
2176                           TCGType type, TCGCond cond)
2177{
2178    TCGv_vec t0 = tcg_temp_new_vec(type);
2179    TCGv_vec t1 = tcg_temp_new_vec(type);
2180    uint32_t i;
2181
2182    for (i = 0; i < oprsz; i += tysz) {
2183        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
2184        tcg_gen_ld_vec(t1, cpu_env, bofs + i);
2185        tcg_gen_cmp_vec(cond, vece, t0, t0, t1);
2186        tcg_gen_st_vec(t0, cpu_env, dofs + i);
2187    }
2188    tcg_temp_free_vec(t1);
2189    tcg_temp_free_vec(t0);
2190}
2191
2192void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs,
2193                      uint32_t aofs, uint32_t bofs,
2194                      uint32_t oprsz, uint32_t maxsz)
2195{
2196    static gen_helper_gvec_3 * const eq_fn[4] = {
2197        gen_helper_gvec_eq8, gen_helper_gvec_eq16,
2198        gen_helper_gvec_eq32, gen_helper_gvec_eq64
2199    };
2200    static gen_helper_gvec_3 * const ne_fn[4] = {
2201        gen_helper_gvec_ne8, gen_helper_gvec_ne16,
2202        gen_helper_gvec_ne32, gen_helper_gvec_ne64
2203    };
2204    static gen_helper_gvec_3 * const lt_fn[4] = {
2205        gen_helper_gvec_lt8, gen_helper_gvec_lt16,
2206        gen_helper_gvec_lt32, gen_helper_gvec_lt64
2207    };
2208    static gen_helper_gvec_3 * const le_fn[4] = {
2209        gen_helper_gvec_le8, gen_helper_gvec_le16,
2210        gen_helper_gvec_le32, gen_helper_gvec_le64
2211    };
2212    static gen_helper_gvec_3 * const ltu_fn[4] = {
2213        gen_helper_gvec_ltu8, gen_helper_gvec_ltu16,
2214        gen_helper_gvec_ltu32, gen_helper_gvec_ltu64
2215    };
2216    static gen_helper_gvec_3 * const leu_fn[4] = {
2217        gen_helper_gvec_leu8, gen_helper_gvec_leu16,
2218        gen_helper_gvec_leu32, gen_helper_gvec_leu64
2219    };
2220    static gen_helper_gvec_3 * const * const fns[16] = {
2221        [TCG_COND_EQ] = eq_fn,
2222        [TCG_COND_NE] = ne_fn,
2223        [TCG_COND_LT] = lt_fn,
2224        [TCG_COND_LE] = le_fn,
2225        [TCG_COND_LTU] = ltu_fn,
2226        [TCG_COND_LEU] = leu_fn,
2227    };
2228    TCGType type;
2229    uint32_t some;
2230
2231    check_size_align(oprsz, maxsz, dofs | aofs | bofs);
2232    check_overlap_3(dofs, aofs, bofs, maxsz);
2233
2234    if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) {
2235        do_dup(MO_8, dofs, oprsz, maxsz,
2236               NULL, NULL, -(cond == TCG_COND_ALWAYS));
2237        return;
2238    }
2239
2240    /* Implement inline with a vector type, if possible.
2241     * Prefer integer when 64-bit host and 64-bit comparison.
2242     */
2243    type = choose_vector_type(INDEX_op_cmp_vec, vece, oprsz,
2244                              TCG_TARGET_REG_BITS == 64 && vece == MO_64);
2245    switch (type) {
2246    case TCG_TYPE_V256:
2247        /* Recall that ARM SVE allows vector sizes that are not a
2248         * power of 2, but always a multiple of 16.  The intent is
2249         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
2250         */
2251        some = QEMU_ALIGN_DOWN(oprsz, 32);
2252        expand_cmp_vec(vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, cond);
2253        if (some == oprsz) {
2254            break;
2255        }
2256        dofs += some;
2257        aofs += some;
2258        bofs += some;
2259        oprsz -= some;
2260        maxsz -= some;
2261        /* fallthru */
2262    case TCG_TYPE_V128:
2263        expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, cond);
2264        break;
2265    case TCG_TYPE_V64:
2266        expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, cond);
2267        break;
2268
2269    case 0:
2270        if (vece == MO_64 && check_size_impl(oprsz, 8)) {
2271            expand_cmp_i64(dofs, aofs, bofs, oprsz, cond);
2272        } else if (vece == MO_32 && check_size_impl(oprsz, 4)) {
2273            expand_cmp_i32(dofs, aofs, bofs, oprsz, cond);
2274        } else {
2275            gen_helper_gvec_3 * const *fn = fns[cond];
2276
2277            if (fn == NULL) {
2278                uint32_t tmp;
2279                tmp = aofs, aofs = bofs, bofs = tmp; /* swap the operands */
2280                cond = tcg_swap_cond(cond);  /* e.g. GT(a,b) == LT(b,a) */
2281                fn = fns[cond];
2282                assert(fn != NULL);
2283            }
2284            tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, 0, fn[vece]);
2285            return;
2286        }
2287        break;
2288
2289    default:
2290        g_assert_not_reached();
2291    }
2292
2293    if (oprsz < maxsz) {
2294        expand_clr(dofs + oprsz, maxsz - oprsz);
2295    }
2296}
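
/*
 * Usage sketch (hypothetical target code; offsets and sizes are
 * illustrative): set each 32-bit lane of the destination to all-ones
 * where the lane of A is unsigned-lower than the lane of B, over 16
 * operative bytes, clearing the register out to reg_max_sz bytes:
 *
 *     tcg_gen_gvec_cmp(TCG_COND_LTU, MO_32, dofs, aofs, bofs,
 *                      16, reg_max_sz);
 */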
2297