qemu/tcg/i386/tcg-target.inc.c
/*
 * Tiny Code Generator for QEMU
 *
 * Copyright (c) 2008 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "tcg-pool.inc.c"

#ifdef CONFIG_DEBUG_TCG
static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
#if TCG_TARGET_REG_BITS == 64
    "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
#else
    "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
#endif
    "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
    "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
#if TCG_TARGET_REG_BITS == 64
    "%xmm8", "%xmm9", "%xmm10", "%xmm11",
    "%xmm12", "%xmm13", "%xmm14", "%xmm15",
#endif
};
#endif

static const int tcg_target_reg_alloc_order[] = {
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_RBP,
    TCG_REG_RBX,
    TCG_REG_R12,
    TCG_REG_R13,
    TCG_REG_R14,
    TCG_REG_R15,
    TCG_REG_R10,
    TCG_REG_R11,
    TCG_REG_R9,
    TCG_REG_R8,
    TCG_REG_RCX,
    TCG_REG_RDX,
    TCG_REG_RSI,
    TCG_REG_RDI,
    TCG_REG_RAX,
#else
    TCG_REG_EBX,
    TCG_REG_ESI,
    TCG_REG_EDI,
    TCG_REG_EBP,
    TCG_REG_ECX,
    TCG_REG_EDX,
    TCG_REG_EAX,
#endif
    TCG_REG_XMM0,
    TCG_REG_XMM1,
    TCG_REG_XMM2,
    TCG_REG_XMM3,
    TCG_REG_XMM4,
    TCG_REG_XMM5,
#ifndef _WIN64
    /* The Win64 ABI has xmm6-xmm15 as callee-saves, and we do not save
       any of them.  Therefore only allow xmm0-xmm5 to be allocated.  */
    TCG_REG_XMM6,
    TCG_REG_XMM7,
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_XMM8,
    TCG_REG_XMM9,
    TCG_REG_XMM10,
    TCG_REG_XMM11,
    TCG_REG_XMM12,
    TCG_REG_XMM13,
    TCG_REG_XMM14,
    TCG_REG_XMM15,
#endif
#endif
};

static const int tcg_target_call_iarg_regs[] = {
#if TCG_TARGET_REG_BITS == 64
#if defined(_WIN64)
    TCG_REG_RCX,
    TCG_REG_RDX,
#else
    TCG_REG_RDI,
    TCG_REG_RSI,
    TCG_REG_RDX,
    TCG_REG_RCX,
#endif
    TCG_REG_R8,
    TCG_REG_R9,
#else
    /* 32 bit mode uses stack based calling convention (GCC default). */
#endif
};

static const int tcg_target_call_oarg_regs[] = {
    TCG_REG_EAX,
#if TCG_TARGET_REG_BITS == 32
    TCG_REG_EDX
#endif
};

/* Constants we accept.  */
#define TCG_CT_CONST_S32 0x100
#define TCG_CT_CONST_U32 0x200
#define TCG_CT_CONST_I32 0x400
#define TCG_CT_CONST_WSZ 0x800

/* Registers used with L constraint, which are the first argument
   registers on x86_64, and two random call clobbered registers on
   i386. */
#if TCG_TARGET_REG_BITS == 64
# define TCG_REG_L0 tcg_target_call_iarg_regs[0]
# define TCG_REG_L1 tcg_target_call_iarg_regs[1]
#else
# define TCG_REG_L0 TCG_REG_EAX
# define TCG_REG_L1 TCG_REG_EDX
#endif

/* The host compiler should supply <cpuid.h> to enable runtime feature
   detection, as we're not going to go so far as our own inline assembly.
   If not available, default values will be assumed.  */
#if defined(CONFIG_CPUID_H)
#include "qemu/cpuid.h"
#endif

/* For 64-bit, we always know that CMOV is available.  */
#if TCG_TARGET_REG_BITS == 64
# define have_cmov 1
#elif defined(CONFIG_CPUID_H)
static bool have_cmov;
#else
# define have_cmov 0
#endif

/* We need these symbols in tcg-target.h, and we can't properly conditionalize
   it there.  Therefore we always define the variable.  */
bool have_bmi1;
bool have_popcnt;
bool have_avx1;
bool have_avx2;

#ifdef CONFIG_CPUID_H
static bool have_movbe;
static bool have_bmi2;
static bool have_lzcnt;
#else
# define have_movbe 0
# define have_bmi2 0
# define have_lzcnt 0
#endif

static tcg_insn_unit *tb_ret_addr;

static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
                        intptr_t value, intptr_t addend)
{
    value += addend;
    switch(type) {
    case R_386_PC32:
        value -= (uintptr_t)code_ptr;
        if (value != (int32_t)value) {
            return false;
        }
        /* FALLTHRU */
    case R_386_32:
        tcg_patch32(code_ptr, value);
        break;
    case R_386_PC8:
        value -= (uintptr_t)code_ptr;
        if (value != (int8_t)value) {
            return false;
        }
        tcg_patch8(code_ptr, value);
        break;
    default:
        tcg_abort();
    }
    return true;
}

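/*
 * Worked example (illustration only, not part of the original source):
 * the R_386_PC32 relocations emitted below for branch displacements use
 * an addend of -4.  The CPU resolves a rel32 branch relative to the end
 * of the 4-byte field, while code_ptr above points to its start, so
 * target + (-4) - code_ptr yields the correct displacement.
 */
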
#if TCG_TARGET_REG_BITS == 64
#define ALL_GENERAL_REGS   0x0000ffffu
#define ALL_VECTOR_REGS    0xffff0000u
#else
#define ALL_GENERAL_REGS   0x000000ffu
#define ALL_VECTOR_REGS    0x00ff0000u
#endif

/* parse target specific constraints */
static const char *target_parse_constraint(TCGArgConstraint *ct,
                                           const char *ct_str, TCGType type)
{
    switch(*ct_str++) {
    case 'a':
        ct->ct |= TCG_CT_REG;
        tcg_regset_set_reg(ct->u.regs, TCG_REG_EAX);
        break;
    case 'b':
        ct->ct |= TCG_CT_REG;
        tcg_regset_set_reg(ct->u.regs, TCG_REG_EBX);
        break;
    case 'c':
        ct->ct |= TCG_CT_REG;
        tcg_regset_set_reg(ct->u.regs, TCG_REG_ECX);
        break;
    case 'd':
        ct->ct |= TCG_CT_REG;
        tcg_regset_set_reg(ct->u.regs, TCG_REG_EDX);
        break;
    case 'S':
        ct->ct |= TCG_CT_REG;
        tcg_regset_set_reg(ct->u.regs, TCG_REG_ESI);
        break;
    case 'D':
        ct->ct |= TCG_CT_REG;
        tcg_regset_set_reg(ct->u.regs, TCG_REG_EDI);
        break;
    case 'q':
        /* A register that can be used as a byte operand.  */
        ct->ct |= TCG_CT_REG;
        ct->u.regs = TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xf;
        break;
    case 'Q':
        /* A register with an addressable second byte (e.g. %ah).  */
        ct->ct |= TCG_CT_REG;
        ct->u.regs = 0xf;
        break;
    case 'r':
        /* A general register.  */
        ct->ct |= TCG_CT_REG;
        ct->u.regs |= ALL_GENERAL_REGS;
        break;
    case 'W':
        /* With TZCNT/LZCNT, we can have operand-size as an input.  */
        ct->ct |= TCG_CT_CONST_WSZ;
        break;
    case 'x':
        /* A vector register.  */
        ct->ct |= TCG_CT_REG;
        ct->u.regs |= ALL_VECTOR_REGS;
        break;

        /* qemu_ld/st address constraint */
    case 'L':
        ct->ct |= TCG_CT_REG;
        ct->u.regs = TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xff;
        tcg_regset_reset_reg(ct->u.regs, TCG_REG_L0);
        tcg_regset_reset_reg(ct->u.regs, TCG_REG_L1);
        break;

    case 'e':
        ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_S32);
        break;
    case 'Z':
        ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_U32);
        break;
    case 'I':
        ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_I32);
        break;

    default:
        return NULL;
    }
    return ct_str;
}

/* test if a constant matches the constraint */
static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
                                         const TCGArgConstraint *arg_ct)
{
    int ct = arg_ct->ct;
    if (ct & TCG_CT_CONST) {
        return 1;
    }
    if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
        return 1;
    }
    if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
        return 1;
    }
    if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
        return 1;
    }
    if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
        return 1;
    }
    return 0;
}

# define LOWREGMASK(x)  ((x) & 7)

#define P_EXT           0x100           /* 0x0f opcode prefix */
#define P_EXT38         0x200           /* 0x0f 0x38 opcode prefix */
#define P_DATA16        0x400           /* 0x66 opcode prefix */
#if TCG_TARGET_REG_BITS == 64
# define P_REXW         0x1000          /* Set REX.W = 1 */
# define P_REXB_R       0x2000          /* REG field as byte register */
# define P_REXB_RM      0x4000          /* R/M field as byte register */
# define P_GS           0x8000          /* gs segment override */
#else
# define P_REXW         0
# define P_REXB_R       0
# define P_REXB_RM      0
# define P_GS           0
#endif
#define P_EXT3A         0x10000         /* 0x0f 0x3a opcode prefix */
#define P_SIMDF3        0x20000         /* 0xf3 opcode prefix */
#define P_SIMDF2        0x40000         /* 0xf2 opcode prefix */
#define P_VEXL          0x80000         /* Set VEX.L = 1 */

#define OPC_ARITH_EvIz  (0x81)
#define OPC_ARITH_EvIb  (0x83)
#define OPC_ARITH_GvEv  (0x03)          /* ... plus (ARITH_FOO << 3) */
#define OPC_ANDN        (0xf2 | P_EXT38)
#define OPC_ADD_GvEv    (OPC_ARITH_GvEv | (ARITH_ADD << 3))
#define OPC_AND_GvEv    (OPC_ARITH_GvEv | (ARITH_AND << 3))
#define OPC_BLENDPS     (0x0c | P_EXT3A | P_DATA16)
#define OPC_BSF         (0xbc | P_EXT)
#define OPC_BSR         (0xbd | P_EXT)
#define OPC_BSWAP       (0xc8 | P_EXT)
#define OPC_CALL_Jz     (0xe8)
#define OPC_CMOVCC      (0x40 | P_EXT)  /* ... plus condition code */
#define OPC_CMP_GvEv    (OPC_ARITH_GvEv | (ARITH_CMP << 3))
#define OPC_DEC_r32     (0x48)
#define OPC_IMUL_GvEv   (0xaf | P_EXT)
#define OPC_IMUL_GvEvIb (0x6b)
#define OPC_IMUL_GvEvIz (0x69)
#define OPC_INC_r32     (0x40)
#define OPC_JCC_long    (0x80 | P_EXT)  /* ... plus condition code */
#define OPC_JCC_short   (0x70)          /* ... plus condition code */
#define OPC_JMP_long    (0xe9)
#define OPC_JMP_short   (0xeb)
#define OPC_LEA         (0x8d)
#define OPC_LZCNT       (0xbd | P_EXT | P_SIMDF3)
#define OPC_MOVB_EvGv   (0x88)          /* stores, more or less */
#define OPC_MOVL_EvGv   (0x89)          /* stores, more or less */
#define OPC_MOVL_GvEv   (0x8b)          /* loads, more or less */
#define OPC_MOVB_EvIz   (0xc6)
#define OPC_MOVL_EvIz   (0xc7)
#define OPC_MOVL_Iv     (0xb8)
#define OPC_MOVBE_GyMy  (0xf0 | P_EXT38)
#define OPC_MOVBE_MyGy  (0xf1 | P_EXT38)
#define OPC_MOVD_VyEy   (0x6e | P_EXT | P_DATA16)
#define OPC_MOVD_EyVy   (0x7e | P_EXT | P_DATA16)
#define OPC_MOVDDUP     (0x12 | P_EXT | P_SIMDF2)
#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
#define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3)
#define OPC_MOVQ_VqWq   (0x7e | P_EXT | P_SIMDF3)
#define OPC_MOVQ_WqVq   (0xd6 | P_EXT | P_DATA16)
#define OPC_MOVSBL      (0xbe | P_EXT)
#define OPC_MOVSWL      (0xbf | P_EXT)
#define OPC_MOVSLQ      (0x63 | P_REXW)
#define OPC_MOVZBL      (0xb6 | P_EXT)
#define OPC_MOVZWL      (0xb7 | P_EXT)
#define OPC_PACKSSDW    (0x6b | P_EXT | P_DATA16)
#define OPC_PACKSSWB    (0x63 | P_EXT | P_DATA16)
#define OPC_PACKUSDW    (0x2b | P_EXT38 | P_DATA16)
#define OPC_PACKUSWB    (0x67 | P_EXT | P_DATA16)
#define OPC_PADDB       (0xfc | P_EXT | P_DATA16)
#define OPC_PADDW       (0xfd | P_EXT | P_DATA16)
#define OPC_PADDD       (0xfe | P_EXT | P_DATA16)
#define OPC_PADDQ       (0xd4 | P_EXT | P_DATA16)
#define OPC_PADDSB      (0xec | P_EXT | P_DATA16)
#define OPC_PADDSW      (0xed | P_EXT | P_DATA16)
#define OPC_PADDUB      (0xdc | P_EXT | P_DATA16)
#define OPC_PADDUW      (0xdd | P_EXT | P_DATA16)
#define OPC_PAND        (0xdb | P_EXT | P_DATA16)
#define OPC_PANDN       (0xdf | P_EXT | P_DATA16)
#define OPC_PBLENDW     (0x0e | P_EXT3A | P_DATA16)
#define OPC_PCMPEQB     (0x74 | P_EXT | P_DATA16)
#define OPC_PCMPEQW     (0x75 | P_EXT | P_DATA16)
#define OPC_PCMPEQD     (0x76 | P_EXT | P_DATA16)
#define OPC_PCMPEQQ     (0x29 | P_EXT38 | P_DATA16)
#define OPC_PCMPGTB     (0x64 | P_EXT | P_DATA16)
#define OPC_PCMPGTW     (0x65 | P_EXT | P_DATA16)
#define OPC_PCMPGTD     (0x66 | P_EXT | P_DATA16)
#define OPC_PCMPGTQ     (0x37 | P_EXT38 | P_DATA16)
#define OPC_PMAXSB      (0x3c | P_EXT38 | P_DATA16)
#define OPC_PMAXSW      (0xee | P_EXT | P_DATA16)
#define OPC_PMAXSD      (0x3d | P_EXT38 | P_DATA16)
#define OPC_PMAXUB      (0xde | P_EXT | P_DATA16)
#define OPC_PMAXUW      (0x3e | P_EXT38 | P_DATA16)
#define OPC_PMAXUD      (0x3f | P_EXT38 | P_DATA16)
#define OPC_PMINSB      (0x38 | P_EXT38 | P_DATA16)
#define OPC_PMINSW      (0xea | P_EXT | P_DATA16)
#define OPC_PMINSD      (0x39 | P_EXT38 | P_DATA16)
#define OPC_PMINUB      (0xda | P_EXT | P_DATA16)
#define OPC_PMINUW      (0x3a | P_EXT38 | P_DATA16)
#define OPC_PMINUD      (0x3b | P_EXT38 | P_DATA16)
#define OPC_PMOVSXBW    (0x20 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXWD    (0x23 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXDQ    (0x25 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXBW    (0x30 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXWD    (0x33 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXDQ    (0x35 | P_EXT38 | P_DATA16)
#define OPC_PMULLW      (0xd5 | P_EXT | P_DATA16)
#define OPC_PMULLD      (0x40 | P_EXT38 | P_DATA16)
#define OPC_POR         (0xeb | P_EXT | P_DATA16)
#define OPC_PSHUFB      (0x00 | P_EXT38 | P_DATA16)
#define OPC_PSHUFD      (0x70 | P_EXT | P_DATA16)
#define OPC_PSHUFLW     (0x70 | P_EXT | P_SIMDF2)
#define OPC_PSHUFHW     (0x70 | P_EXT | P_SIMDF3)
#define OPC_PSHIFTW_Ib  (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSHIFTD_Ib  (0x72 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSHIFTQ_Ib  (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSUBB       (0xf8 | P_EXT | P_DATA16)
#define OPC_PSUBW       (0xf9 | P_EXT | P_DATA16)
#define OPC_PSUBD       (0xfa | P_EXT | P_DATA16)
#define OPC_PSUBQ       (0xfb | P_EXT | P_DATA16)
#define OPC_PSUBSB      (0xe8 | P_EXT | P_DATA16)
#define OPC_PSUBSW      (0xe9 | P_EXT | P_DATA16)
#define OPC_PSUBUB      (0xd8 | P_EXT | P_DATA16)
#define OPC_PSUBUW      (0xd9 | P_EXT | P_DATA16)
#define OPC_PUNPCKLBW   (0x60 | P_EXT | P_DATA16)
#define OPC_PUNPCKLWD   (0x61 | P_EXT | P_DATA16)
#define OPC_PUNPCKLDQ   (0x62 | P_EXT | P_DATA16)
#define OPC_PUNPCKLQDQ  (0x6c | P_EXT | P_DATA16)
#define OPC_PUNPCKHBW   (0x68 | P_EXT | P_DATA16)
#define OPC_PUNPCKHWD   (0x69 | P_EXT | P_DATA16)
#define OPC_PUNPCKHDQ   (0x6a | P_EXT | P_DATA16)
#define OPC_PUNPCKHQDQ  (0x6d | P_EXT | P_DATA16)
#define OPC_PXOR        (0xef | P_EXT | P_DATA16)
#define OPC_POP_r32     (0x58)
#define OPC_POPCNT      (0xb8 | P_EXT | P_SIMDF3)
#define OPC_PUSH_r32    (0x50)
#define OPC_PUSH_Iv     (0x68)
#define OPC_PUSH_Ib     (0x6a)
#define OPC_RET         (0xc3)
#define OPC_SETCC       (0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
#define OPC_SHIFT_1     (0xd1)
#define OPC_SHIFT_Ib    (0xc1)
#define OPC_SHIFT_cl    (0xd3)
#define OPC_SARX        (0xf7 | P_EXT38 | P_SIMDF3)
#define OPC_SHUFPS      (0xc6 | P_EXT)
#define OPC_SHLX        (0xf7 | P_EXT38 | P_DATA16)
#define OPC_SHRX        (0xf7 | P_EXT38 | P_SIMDF2)
#define OPC_TESTL       (0x85)
#define OPC_TZCNT       (0xbc | P_EXT | P_SIMDF3)
#define OPC_UD2         (0x0b | P_EXT)
#define OPC_VPBLENDD    (0x02 | P_EXT3A | P_DATA16)
#define OPC_VPBLENDVB   (0x4c | P_EXT3A | P_DATA16)
#define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
#define OPC_VPERMQ      (0x00 | P_EXT3A | P_DATA16 | P_REXW)
#define OPC_VPERM2I128  (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
#define OPC_VZEROUPPER  (0x77 | P_EXT)
#define OPC_XCHG_ax_r32 (0x90)

#define OPC_GRP3_Ev     (0xf7)
#define OPC_GRP5        (0xff)
#define OPC_GRP14       (0x73 | P_EXT | P_DATA16)

/* Group 1 opcode extensions for 0x80-0x83.
   These are also used as modifiers for OPC_ARITH.  */
#define ARITH_ADD 0
#define ARITH_OR  1
#define ARITH_ADC 2
#define ARITH_SBB 3
#define ARITH_AND 4
#define ARITH_SUB 5
#define ARITH_XOR 6
#define ARITH_CMP 7

/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3.  */
#define SHIFT_ROL 0
#define SHIFT_ROR 1
#define SHIFT_SHL 4
#define SHIFT_SHR 5
#define SHIFT_SAR 7

/* Group 3 opcode extensions for 0xf6, 0xf7.  To be used with OPC_GRP3.  */
#define EXT3_NOT   2
#define EXT3_NEG   3
#define EXT3_MUL   4
#define EXT3_IMUL  5
#define EXT3_DIV   6
#define EXT3_IDIV  7

/* Group 5 opcode extensions for 0xff.  To be used with OPC_GRP5.  */
#define EXT5_INC_Ev     0
#define EXT5_DEC_Ev     1
#define EXT5_CALLN_Ev   2
#define EXT5_JMPN_Ev    4

/* Condition codes to be added to OPC_JCC_{long,short}.  */
#define JCC_JMP (-1)
#define JCC_JO  0x0
#define JCC_JNO 0x1
#define JCC_JB  0x2
#define JCC_JAE 0x3
#define JCC_JE  0x4
#define JCC_JNE 0x5
#define JCC_JBE 0x6
#define JCC_JA  0x7
#define JCC_JS  0x8
#define JCC_JNS 0x9
#define JCC_JP  0xa
#define JCC_JNP 0xb
#define JCC_JL  0xc
#define JCC_JGE 0xd
#define JCC_JLE 0xe
#define JCC_JG  0xf

static const uint8_t tcg_cond_to_jcc[] = {
    [TCG_COND_EQ] = JCC_JE,
    [TCG_COND_NE] = JCC_JNE,
    [TCG_COND_LT] = JCC_JL,
    [TCG_COND_GE] = JCC_JGE,
    [TCG_COND_LE] = JCC_JLE,
    [TCG_COND_GT] = JCC_JG,
    [TCG_COND_LTU] = JCC_JB,
    [TCG_COND_GEU] = JCC_JAE,
    [TCG_COND_LEU] = JCC_JBE,
    [TCG_COND_GTU] = JCC_JA,
};

#if TCG_TARGET_REG_BITS == 64
static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
{
    int rex;

    if (opc & P_GS) {
        tcg_out8(s, 0x65);
    }
    if (opc & P_DATA16) {
        /* We should never be asking for both 16 and 64-bit operation.  */
        tcg_debug_assert((opc & P_REXW) == 0);
        tcg_out8(s, 0x66);
    }
    if (opc & P_SIMDF3) {
        tcg_out8(s, 0xf3);
    } else if (opc & P_SIMDF2) {
        tcg_out8(s, 0xf2);
    }

    rex = 0;
    rex |= (opc & P_REXW) ? 0x8 : 0x0;  /* REX.W */
    rex |= (r & 8) >> 1;                /* REX.R */
    rex |= (x & 8) >> 2;                /* REX.X */
    rex |= (rm & 8) >> 3;               /* REX.B */

    /* P_REXB_{R,RM} indicates that the given register is the low byte.
       For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
       as otherwise the encoding indicates %[abcd]h.  Note that the values
       that are ORed in merely indicate that the REX byte must be present;
       those bits get discarded in output.  */
    rex |= opc & (r >= 4 ? P_REXB_R : 0);
    rex |= opc & (rm >= 4 ? P_REXB_RM : 0);

    if (rex) {
        tcg_out8(s, (uint8_t)(rex | 0x40));
    }

    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
        tcg_out8(s, 0x0f);
        if (opc & P_EXT38) {
            tcg_out8(s, 0x38);
        } else if (opc & P_EXT3A) {
            tcg_out8(s, 0x3a);
        }
    }

    tcg_out8(s, opc);
}
#else
static void tcg_out_opc(TCGContext *s, int opc)
{
    if (opc & P_DATA16) {
        tcg_out8(s, 0x66);
    }
    if (opc & P_SIMDF3) {
        tcg_out8(s, 0xf3);
    } else if (opc & P_SIMDF2) {
        tcg_out8(s, 0xf2);
    }
    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
        tcg_out8(s, 0x0f);
        if (opc & P_EXT38) {
            tcg_out8(s, 0x38);
        } else if (opc & P_EXT3A) {
            tcg_out8(s, 0x3a);
        }
    }
    tcg_out8(s, opc);
}
/* Discard the register arguments to tcg_out_opc early, so as not to penalize
   the 32-bit compilation paths.  This method works with all versions of gcc,
   whereas relying on optimization may not be able to exclude them.  */
#define tcg_out_opc(s, opc, r, rm, x)  (tcg_out_opc)(s, opc)
#endif

static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
{
    tcg_out_opc(s, opc, r, rm, 0);
    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
}

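/*
 * Worked example (illustration only): tcg_out_modrm(s, OPC_MOVL_GvEv + P_REXW,
 * TCG_REG_RAX, TCG_REG_R12) emits 49 8b c4 -- REX.W plus REX.B from the high
 * bit of %r12, opcode 0x8b, and ModRM 0xc4 with %rax in the reg field and
 * %r12 in r/m: "movq %r12, %rax".
 */
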
static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
                            int rm, int index)
{
    int tmp;

    /* Use the two byte form if possible, which cannot encode
       VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT.  */
    if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_REXW)) == P_EXT
        && ((rm | index) & 8) == 0) {
        /* Two byte VEX prefix.  */
        tcg_out8(s, 0xc5);

        tmp = (r & 8 ? 0 : 0x80);              /* VEX.R */
    } else {
        /* Three byte VEX prefix.  */
        tcg_out8(s, 0xc4);

        /* VEX.m-mmmm */
        if (opc & P_EXT3A) {
            tmp = 3;
        } else if (opc & P_EXT38) {
            tmp = 2;
        } else if (opc & P_EXT) {
            tmp = 1;
        } else {
            g_assert_not_reached();
        }
        tmp |= (r & 8 ? 0 : 0x80);             /* VEX.R */
        tmp |= (index & 8 ? 0 : 0x40);         /* VEX.X */
        tmp |= (rm & 8 ? 0 : 0x20);            /* VEX.B */
        tcg_out8(s, tmp);

        tmp = (opc & P_REXW ? 0x80 : 0);       /* VEX.W */
    }

    tmp |= (opc & P_VEXL ? 0x04 : 0);      /* VEX.L */
    /* VEX.pp */
    if (opc & P_DATA16) {
        tmp |= 1;                          /* 0x66 */
    } else if (opc & P_SIMDF3) {
        tmp |= 2;                          /* 0xf3 */
    } else if (opc & P_SIMDF2) {
        tmp |= 3;                          /* 0xf2 */
    }
    tmp |= (~v & 15) << 3;                 /* VEX.vvvv */
    tcg_out8(s, tmp);
    tcg_out8(s, opc);
}

static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
{
    tcg_out_vex_opc(s, opc, r, v, rm, 0);
    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
}

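/*
 * Worked example (illustration only): tcg_out_vex_modrm(s, OPC_PXOR,
 * TCG_REG_XMM3, TCG_REG_XMM2, TCG_REG_XMM1) qualifies for the two-byte
 * form and emits c5 e9 ef d9 -- 0xe9 packs VEX.R, vvvv = ~xmm2, VEX.L = 0
 * and pp = 01 (0x66), giving "vpxor %xmm1, %xmm2, %xmm3".
 */
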
/* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
   Either RM or INDEX may be omitted by passing a negative value.  In 64-bit
   mode for absolute addresses, ~RM is the size of the immediate operand
   that will follow the instruction.  */

static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
                               int shift, intptr_t offset)
{
    int mod, len;

    if (index < 0 && rm < 0) {
        if (TCG_TARGET_REG_BITS == 64) {
            /* Try for a rip-relative addressing mode.  This has replaced
               the 32-bit-mode absolute addressing encoding.  */
            intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
            intptr_t disp = offset - pc;
            if (disp == (int32_t)disp) {
                tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
                tcg_out32(s, disp);
                return;
            }

            /* Try for an absolute address encoding.  This requires the
               use of the MODRM+SIB encoding and is therefore larger than
               rip-relative addressing.  */
            if (offset == (int32_t)offset) {
                tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
                tcg_out8(s, (4 << 3) | 5);
                tcg_out32(s, offset);
                return;
            }

            /* ??? The memory isn't directly addressable.  */
            g_assert_not_reached();
        } else {
            /* Absolute address.  */
            tcg_out8(s, (r << 3) | 5);
            tcg_out32(s, offset);
            return;
        }
    }

    /* Find the length of the immediate addend.  Note that the encoding
       that would be used for (%ebp) indicates absolute addressing.  */
    if (rm < 0) {
        mod = 0, len = 4, rm = 5;
    } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
        mod = 0, len = 0;
    } else if (offset == (int8_t)offset) {
        mod = 0x40, len = 1;
    } else {
        mod = 0x80, len = 4;
    }

    /* Use a single byte MODRM format if possible.  Note that the encoding
       that would be used for %esp is the escape to the two byte form.  */
    if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
        /* Single byte MODRM format.  */
        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
    } else {
        /* Two byte MODRM+SIB format.  */

        /* Note that the encoding that would place %esp into the index
           field indicates no index register.  In 64-bit mode, the REX.X
           bit counts, so %r12 can be used as the index.  */
        if (index < 0) {
            index = 4;
        } else {
            tcg_debug_assert(index != TCG_REG_ESP);
        }

        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
        tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
    }

    if (len == 1) {
        tcg_out8(s, offset);
    } else if (len == 4) {
        tcg_out32(s, offset);
    }
}

static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
                                     int index, int shift, intptr_t offset)
{
    tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
    tcg_out_sib_offset(s, r, rm, index, shift, offset);
}

static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
                                         int rm, int index, int shift,
                                         intptr_t offset)
{
    tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
    tcg_out_sib_offset(s, r, rm, index, shift, offset);
}

/* A simplification of the above with no index or shift.  */
static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
                                        int rm, intptr_t offset)
{
    tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
}

static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
                                            int v, int rm, intptr_t offset)
{
    tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
}

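/*
 * Worked example (illustration only): tcg_out_modrm_offset(s, OPC_MOVL_GvEv,
 * TCG_REG_EAX, TCG_REG_EBP, 8) emits 8b 45 08, using mod=01 with a one-byte
 * displacement because (%ebp) with mod=00 would mean absolute addressing:
 * "movl 8(%ebp), %eax".
 */
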
/* Output an opcode with an expected reference to the constant pool.  */
static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
{
    tcg_out_opc(s, opc, r, 0, 0);
    /* Absolute for 32-bit, pc-relative for 64-bit.  */
    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
    tcg_out32(s, 0);
}

/* Output an opcode with an expected reference to the constant pool.  */
static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
{
    tcg_out_vex_opc(s, opc, r, 0, 0, 0);
    /* Absolute for 32-bit, pc-relative for 64-bit.  */
    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
    tcg_out32(s, 0);
}

/* Generate dest op= src.  Uses the same ARITH_* codes as tgen_arithi.  */
static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
{
    /* Propagate an opcode prefix, such as P_REXW.  */
    int ext = subop & ~0x7;
    subop &= 0x7;

    tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
}

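/*
 * Worked example (illustration only): tgen_arithr(s, ARITH_SUB + P_REXW,
 * TCG_REG_RAX, TCG_REG_RBX) folds the subop into the opcode byte as
 * 0x03 + (5 << 3) = 0x2b and emits 48 2b c3: "subq %rbx, %rax".
 */
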
static void tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
{
    int rexw = 0;

    if (arg == ret) {
        return;
    }
    switch (type) {
    case TCG_TYPE_I64:
        rexw = P_REXW;
        /* fallthru */
    case TCG_TYPE_I32:
        if (ret < 16) {
            if (arg < 16) {
                tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
            } else {
                tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
            }
        } else {
            if (arg < 16) {
                tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
            } else {
                tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
            }
        }
        break;

    case TCG_TYPE_V64:
        tcg_debug_assert(ret >= 16 && arg >= 16);
        tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
        break;
    case TCG_TYPE_V128:
        tcg_debug_assert(ret >= 16 && arg >= 16);
        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
        break;
    case TCG_TYPE_V256:
        tcg_debug_assert(ret >= 16 && arg >= 16);
        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
        break;

    default:
        g_assert_not_reached();
    }
}

static void tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
                            TCGReg r, TCGReg a)
{
    if (have_avx2) {
        static const int dup_insn[4] = {
            OPC_VPBROADCASTB, OPC_VPBROADCASTW,
            OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
        };
        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
        tcg_out_vex_modrm(s, dup_insn[vece] + vex_l, r, 0, a);
    } else {
        switch (vece) {
        case MO_8:
            /* ??? With zero in a register, use PSHUFB.  */
            tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a);
            a = r;
            /* FALLTHRU */
        case MO_16:
            tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a);
            a = r;
            /* FALLTHRU */
        case MO_32:
            tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
            /* imm8 operand: all output lanes selected from input lane 0.  */
            tcg_out8(s, 0);
            break;
        case MO_64:
            tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a);
            break;
        default:
            g_assert_not_reached();
        }
    }
}

static void tcg_out_dupi_vec(TCGContext *s, TCGType type,
                             TCGReg ret, tcg_target_long arg)
{
    int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);

    if (arg == 0) {
        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
        return;
    }
    if (arg == -1) {
        tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
        return;
    }

    if (TCG_TARGET_REG_BITS == 64) {
        if (type == TCG_TYPE_V64) {
            tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
        } else if (have_avx2) {
            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
        } else {
            tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
        }
        new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
    } else if (have_avx2) {
        tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
    } else {
        tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy, ret);
        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
        tcg_out_dup_vec(s, type, MO_32, ret, ret);
    }
}

static void tcg_out_movi(TCGContext *s, TCGType type,
                         TCGReg ret, tcg_target_long arg)
{
    tcg_target_long diff;

    switch (type) {
    case TCG_TYPE_I32:
#if TCG_TARGET_REG_BITS == 64
    case TCG_TYPE_I64:
#endif
        if (ret < 16) {
            break;
        }
        /* fallthru */
    case TCG_TYPE_V64:
    case TCG_TYPE_V128:
    case TCG_TYPE_V256:
        tcg_debug_assert(ret >= 16);
        tcg_out_dupi_vec(s, type, ret, arg);
        return;
    default:
        g_assert_not_reached();
    }

    if (arg == 0) {
        tgen_arithr(s, ARITH_XOR, ret, ret);
        return;
    }
    if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
        tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
        tcg_out32(s, arg);
        return;
    }
    if (arg == (int32_t)arg) {
        tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
        tcg_out32(s, arg);
        return;
    }

    /* Try a 7 byte pc-relative lea before the 10 byte movq.  */
    diff = arg - ((uintptr_t)s->code_ptr + 7);
    if (diff == (int32_t)diff) {
        tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
        tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
        tcg_out32(s, diff);
        return;
    }

    tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
    tcg_out64(s, arg);
}

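/*
 * Summary of the size ladder above (illustration only): 2-byte xor for 0,
 * 5-byte movl for values that zero-extend, 7-byte movq with a sign-extended
 * imm32, a 7-byte rip-relative lea when the constant is near the code, and
 * finally the full 10-byte movabsq.
 */
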
static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
{
    if (val == (int8_t)val) {
        tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
        tcg_out8(s, val);
    } else if (val == (int32_t)val) {
        tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
        tcg_out32(s, val);
    } else {
        tcg_abort();
    }
}

static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
{
    /* Given the strength of x86 memory ordering, we need only care about
       store-load ordering.  Experimentally, "lock orl $0,0(%esp)" is
       faster than "mfence", so don't bother with the sse insn.  */
    if (a0 & TCG_MO_ST_LD) {
        tcg_out8(s, 0xf0);
        tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
        tcg_out8(s, 0);
    }
}

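/*
 * The barrier above assembles to f0 83 0c 24 00 (worked out by hand,
 * illustration only): LOCK prefix, imm8 OR opcode, ModRM+SIB selecting
 * (%esp), and the $0 immediate.
 */
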
static inline void tcg_out_push(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
}

static inline void tcg_out_pop(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
}

static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
                       TCGReg arg1, intptr_t arg2)
{
    switch (type) {
    case TCG_TYPE_I32:
        if (ret < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
        } else {
            tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
        }
        break;
    case TCG_TYPE_I64:
        if (ret < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
            break;
        }
        /* FALLTHRU */
    case TCG_TYPE_V64:
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
        break;
    case TCG_TYPE_V128:
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx, ret, 0, arg1, arg2);
        break;
    case TCG_TYPE_V256:
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
                                 ret, 0, arg1, arg2);
        break;
    default:
        g_assert_not_reached();
    }
}

static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
                       TCGReg arg1, intptr_t arg2)
{
    switch (type) {
    case TCG_TYPE_I32:
        if (arg < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
        } else {
            tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
        }
        break;
    case TCG_TYPE_I64:
        if (arg < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
            break;
        }
        /* FALLTHRU */
    case TCG_TYPE_V64:
        tcg_debug_assert(arg >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
        break;
    case TCG_TYPE_V128:
        tcg_debug_assert(arg >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx, arg, 0, arg1, arg2);
        break;
    case TCG_TYPE_V256:
        tcg_debug_assert(arg >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
                                 arg, 0, arg1, arg2);
        break;
    default:
        g_assert_not_reached();
    }
}

static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
                        TCGReg base, intptr_t ofs)
{
    int rexw = 0;
    if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
        if (val != (int32_t)val) {
            return false;
        }
        rexw = P_REXW;
    } else if (type != TCG_TYPE_I32) {
        return false;
    }
    tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
    tcg_out32(s, val);
    return true;
}

static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
{
    /* Propagate an opcode prefix, such as P_DATA16.  */
    int ext = subopc & ~0x7;
    subopc &= 0x7;

    if (count == 1) {
        tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
    } else {
        tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
        tcg_out8(s, count);
    }
}

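/*
 * Worked example (illustration only): tcg_out_shifti(s, SHIFT_SHL + P_REXW,
 * TCG_REG_RAX, 4) emits 48 c1 e0 04, "shlq $4, %rax"; a count of 1 would
 * use the shorter 0xd1 form with no immediate byte.
 */
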
static inline void tcg_out_bswap32(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
}

static inline void tcg_out_rolw_8(TCGContext *s, int reg)
{
    tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
}

static inline void tcg_out_ext8u(TCGContext *s, int dest, int src)
{
    /* movzbl */
    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
    tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
}

static void tcg_out_ext8s(TCGContext *s, int dest, int src, int rexw)
{
    /* movsbl */
    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
    tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
}

static inline void tcg_out_ext16u(TCGContext *s, int dest, int src)
{
    /* movzwl */
    tcg_out_modrm(s, OPC_MOVZWL, dest, src);
}

static inline void tcg_out_ext16s(TCGContext *s, int dest, int src, int rexw)
{
    /* movsw[lq] */
    tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
}

static inline void tcg_out_ext32u(TCGContext *s, int dest, int src)
{
    /* 32-bit mov zero extends.  */
    tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
}

static inline void tcg_out_ext32s(TCGContext *s, int dest, int src)
{
    tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
}

static inline void tcg_out_bswap64(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
}

static void tgen_arithi(TCGContext *s, int c, int r0,
                        tcg_target_long val, int cf)
{
    int rexw = 0;

    if (TCG_TARGET_REG_BITS == 64) {
        rexw = c & -8;
        c &= 7;
    }

    /* ??? While INC is 2 bytes shorter than ADDL $1, they also induce
       partial flags update stalls on Pentium4 and are not recommended
       by current Intel optimization manuals.  */
    if (!cf && (c == ARITH_ADD || c == ARITH_SUB) && (val == 1 || val == -1)) {
        int is_inc = (c == ARITH_ADD) ^ (val < 0);
        if (TCG_TARGET_REG_BITS == 64) {
            /* The single-byte increment encodings are re-tasked as the
               REX prefixes.  Use the MODRM encoding.  */
            tcg_out_modrm(s, OPC_GRP5 + rexw,
                          (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
        } else {
            tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
        }
        return;
    }

    if (c == ARITH_AND) {
        if (TCG_TARGET_REG_BITS == 64) {
            if (val == 0xffffffffu) {
                tcg_out_ext32u(s, r0, r0);
                return;
            }
            if (val == (uint32_t)val) {
                /* AND with no high bits set can use a 32-bit operation.  */
                rexw = 0;
            }
        }
        if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
            tcg_out_ext8u(s, r0, r0);
            return;
        }
        if (val == 0xffffu) {
            tcg_out_ext16u(s, r0, r0);
            return;
        }
    }

    if (val == (int8_t)val) {
        tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
        tcg_out8(s, val);
        return;
    }
    if (rexw == 0 || val == (int32_t)val) {
        tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
        tcg_out32(s, val);
        return;
    }

    tcg_abort();
}

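/*
 * Worked example (illustration only): tgen_arithi(s, ARITH_ADD, r0, 5, 0)
 * with r0 = %eax emits the sign-extended imm8 form 83 c0 05, while a value
 * such as 300 falls through to the imm32 form 81 c0 2c 01 00 00.
 */
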
static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
{
    if (val != 0) {
        tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
    }
}

/* Use SMALL != 0 to force a short forward branch.  */
static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, int small)
{
    int32_t val, val1;

    if (l->has_value) {
        val = tcg_pcrel_diff(s, l->u.value_ptr);
        val1 = val - 2;
        if ((int8_t)val1 == val1) {
            if (opc == -1) {
                tcg_out8(s, OPC_JMP_short);
            } else {
                tcg_out8(s, OPC_JCC_short + opc);
            }
            tcg_out8(s, val1);
        } else {
            if (small) {
                tcg_abort();
            }
            if (opc == -1) {
                tcg_out8(s, OPC_JMP_long);
                tcg_out32(s, val - 5);
            } else {
                tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
                tcg_out32(s, val - 6);
            }
        }
    } else if (small) {
        if (opc == -1) {
            tcg_out8(s, OPC_JMP_short);
        } else {
            tcg_out8(s, OPC_JCC_short + opc);
        }
        tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
        s->code_ptr += 1;
    } else {
        if (opc == -1) {
            tcg_out8(s, OPC_JMP_long);
        } else {
            tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
        }
        tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
        s->code_ptr += 4;
    }
}

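/*
 * The val-2, val-5 and val-6 adjustments above account for instruction
 * length: a short jump is 2 bytes, "jmp rel32" is 5 and "jcc rel32" is 6,
 * and the displacement is applied relative to the end of the instruction.
 */
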
static void tcg_out_cmp(TCGContext *s, TCGArg arg1, TCGArg arg2,
                        int const_arg2, int rexw)
{
    if (const_arg2) {
        if (arg2 == 0) {
            /* test r, r */
            tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
        } else {
            tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
        }
    } else {
        tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
    }
}

static void tcg_out_brcond32(TCGContext *s, TCGCond cond,
                             TCGArg arg1, TCGArg arg2, int const_arg2,
                             TCGLabel *label, int small)
{
    tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
    tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
}

#if TCG_TARGET_REG_BITS == 64
static void tcg_out_brcond64(TCGContext *s, TCGCond cond,
                             TCGArg arg1, TCGArg arg2, int const_arg2,
                             TCGLabel *label, int small)
{
    tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
    tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
}
#else
/* XXX: we implement it at the target level to avoid having to
   handle temporaries that live across basic blocks.  */
static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
                            const int *const_args, int small)
{
    TCGLabel *label_next = gen_new_label();
    TCGLabel *label_this = arg_label(args[5]);

    switch(args[4]) {
    case TCG_COND_EQ:
        tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
                         label_next, 1);
        tcg_out_brcond32(s, TCG_COND_EQ, args[1], args[3], const_args[3],
                         label_this, small);
        break;
    case TCG_COND_NE:
        tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
                         label_this, small);
        tcg_out_brcond32(s, TCG_COND_NE, args[1], args[3], const_args[3],
                         label_this, small);
        break;
    case TCG_COND_LT:
        tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_LE:
        tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_GT:
        tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_GE:
        tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_LTU:
        tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_LEU:
        tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_GTU:
        tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_GEU:
        tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    default:
        tcg_abort();
    }
    tcg_out_label(s, label_next, s->code_ptr);
}
#endif

static void tcg_out_setcond32(TCGContext *s, TCGCond cond, TCGArg dest,
                              TCGArg arg1, TCGArg arg2, int const_arg2)
{
    tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
    tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
    tcg_out_ext8u(s, dest, dest);
}

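/*
 * Worked example (illustration only): for TCG_COND_LTU with dest = %eax,
 * the sequence above is the compare followed by 0f 92 c0 ("setb %al") and
 * 0f b6 c0 ("movzbl %al, %eax") to clear the upper bits.
 */
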
#if TCG_TARGET_REG_BITS == 64
static void tcg_out_setcond64(TCGContext *s, TCGCond cond, TCGArg dest,
                              TCGArg arg1, TCGArg arg2, int const_arg2)
{
    tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
    tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
    tcg_out_ext8u(s, dest, dest);
}
#else
static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
                             const int *const_args)
{
    TCGArg new_args[6];
    TCGLabel *label_true, *label_over;

    memcpy(new_args, args+1, 5*sizeof(TCGArg));

    if (args[0] == args[1] || args[0] == args[2]
        || (!const_args[3] && args[0] == args[3])
        || (!const_args[4] && args[0] == args[4])) {
        /* When the destination overlaps with one of the argument
           registers, don't do anything tricky.  */
        label_true = gen_new_label();
        label_over = gen_new_label();

        new_args[5] = label_arg(label_true);
        tcg_out_brcond2(s, new_args, const_args+1, 1);

        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
        tcg_out_jxx(s, JCC_JMP, label_over, 1);
        tcg_out_label(s, label_true, s->code_ptr);

        tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
        tcg_out_label(s, label_over, s->code_ptr);
    } else {
        /* When the destination does not overlap one of the arguments,
           clear the destination first, jump if cond false, and emit an
           increment in the true case.  This results in smaller code.  */

        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);

        label_over = gen_new_label();
        new_args[4] = tcg_invert_cond(new_args[4]);
        new_args[5] = label_arg(label_over);
        tcg_out_brcond2(s, new_args, const_args+1, 1);

        tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
        tcg_out_label(s, label_over, s->code_ptr);
    }
}
#endif

static void tcg_out_cmov(TCGContext *s, TCGCond cond, int rexw,
                         TCGReg dest, TCGReg v1)
{
    if (have_cmov) {
        tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | rexw, dest, v1);
    } else {
        TCGLabel *over = gen_new_label();
        tcg_out_jxx(s, tcg_cond_to_jcc[tcg_invert_cond(cond)], over, 1);
        tcg_out_mov(s, TCG_TYPE_I32, dest, v1);
        tcg_out_label(s, over, s->code_ptr);
    }
}

static void tcg_out_movcond32(TCGContext *s, TCGCond cond, TCGReg dest,
                              TCGReg c1, TCGArg c2, int const_c2,
                              TCGReg v1)
{
    tcg_out_cmp(s, c1, c2, const_c2, 0);
    tcg_out_cmov(s, cond, 0, dest, v1);
}

#if TCG_TARGET_REG_BITS == 64
static void tcg_out_movcond64(TCGContext *s, TCGCond cond, TCGReg dest,
                              TCGReg c1, TCGArg c2, int const_c2,
                              TCGReg v1)
{
    tcg_out_cmp(s, c1, c2, const_c2, P_REXW);
    tcg_out_cmov(s, cond, P_REXW, dest, v1);
}
#endif

static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
                        TCGArg arg2, bool const_a2)
{
    if (have_bmi1) {
        tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
        if (const_a2) {
            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
        } else {
            tcg_debug_assert(dest != arg2);
            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
        }
    } else {
        tcg_debug_assert(dest != arg2);
        tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
    }
}

static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
                        TCGArg arg2, bool const_a2)
{
    if (have_lzcnt) {
        tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
        if (const_a2) {
            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
        } else {
            tcg_debug_assert(dest != arg2);
            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
        }
    } else {
        tcg_debug_assert(!const_a2);
        tcg_debug_assert(dest != arg1);
        tcg_debug_assert(dest != arg2);

        /* Recall that the output of BSR is the index not the count.  */
        tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
        tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);

        /* Since we have destroyed the flags from BSR, we have to re-test.  */
        tcg_out_cmp(s, arg1, 0, 1, rexw);
        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
    }
}

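/*
 * The BSR fallback relies on clz(x) == 31 - bsr(x) for non-zero x (63 for
 * 64-bit), and n - y == n ^ y whenever n is 2**k - 1 and y <= n; hence
 * the XOR with 31 or 63 above instead of a subtract.
 */
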
static void tcg_out_branch(TCGContext *s, int call, tcg_insn_unit *dest)
{
    intptr_t disp = tcg_pcrel_diff(s, dest) - 5;

    if (disp == (int32_t)disp) {
        tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
        tcg_out32(s, disp);
    } else {
        /* rip-relative addressing into the constant pool.
           This is 6 + 8 = 14 bytes, as compared to using an
           immediate load 10 + 6 = 16 bytes, plus we may
           be able to re-use the pool constant for more calls.  */
        tcg_out_opc(s, OPC_GRP5, 0, 0, 0);
        tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
        new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
        tcg_out32(s, 0);
    }
}
1547
1548static inline void tcg_out_call(TCGContext *s, tcg_insn_unit *dest)
1549{
1550    tcg_out_branch(s, 1, dest);
1551}
1552
1553static void tcg_out_jmp(TCGContext *s, tcg_insn_unit *dest)
1554{
1555    tcg_out_branch(s, 0, dest);
1556}
1557
1558static void tcg_out_nopn(TCGContext *s, int n)
1559{
1560    int i;
1561    /* Emit n-1 operand size prefixes for the standard one byte nop,
1562     * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the
1563     * duplicate prefixes, and all of the interesting recent cores can
1564     * decode and discard the duplicates in a single cycle.
1565     */
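        /* For example, n = 1 emits a plain 90, and n = 3 emits 66 66 90.  */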
1566    tcg_debug_assert(n >= 1);
1567    for (i = 1; i < n; ++i) {
1568        tcg_out8(s, 0x66);
1569    }
1570    tcg_out8(s, 0x90);
1571}
1572
1573#if defined(CONFIG_SOFTMMU)
1574#include "tcg-ldst.inc.c"
1575
1576/* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
1577 *                                     TCGMemOpIdx oi, uintptr_t ra)
1578 */
1579static void * const qemu_ld_helpers[16] = {
1580    [MO_UB]   = helper_ret_ldub_mmu,
1581    [MO_LEUW] = helper_le_lduw_mmu,
1582    [MO_LEUL] = helper_le_ldul_mmu,
1583    [MO_LEQ]  = helper_le_ldq_mmu,
1584    [MO_BEUW] = helper_be_lduw_mmu,
1585    [MO_BEUL] = helper_be_ldul_mmu,
1586    [MO_BEQ]  = helper_be_ldq_mmu,
1587};
1588
1589/* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
1590 *                                     uintxx_t val, TCGMemOpIdx oi, uintptr_t ra)
1591 */
1592static void * const qemu_st_helpers[16] = {
1593    [MO_UB]   = helper_ret_stb_mmu,
1594    [MO_LEUW] = helper_le_stw_mmu,
1595    [MO_LEUL] = helper_le_stl_mmu,
1596    [MO_LEQ]  = helper_le_stq_mmu,
1597    [MO_BEUW] = helper_be_stw_mmu,
1598    [MO_BEUL] = helper_be_stl_mmu,
1599    [MO_BEQ]  = helper_be_stq_mmu,
1600};
1601
1602/* Perform the TLB load and compare.
1603
1604   Inputs:
1605   ADDRLO and ADDRHI contain the low and high part of the address.
1606
1607   MEM_INDEX and S_BITS are the memory context and log2 size of the load.
1608
1609   WHICH is the offset into the CPUTLBEntry structure of the slot to read.
1610   This should be offsetof addr_read or addr_write.
1611
1612   Outputs:
1613   LABEL_PTRS is filled with one pointer (two if the guest address is wider
1614   than the host registers) to the displacement of each forward jump to the TLB miss case.
1615
1616   Second argument register is loaded with the low part of the address.
1617   In the TLB hit case, it has been adjusted as indicated by the TLB
1618   and so is a host address.  In the TLB miss case, it continues to
1619   hold a guest address.
1620
1621   First argument register is clobbered.  */
1622
1623static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
1624                                    int mem_index, TCGMemOp opc,
1625                                    tcg_insn_unit **label_ptr, int which)
1626{
1627    const TCGReg r0 = TCG_REG_L0;
1628    const TCGReg r1 = TCG_REG_L1;
1629    TCGType ttype = TCG_TYPE_I32;
1630    TCGType tlbtype = TCG_TYPE_I32;
1631    int trexw = 0, hrexw = 0, tlbrexw = 0;
1632    unsigned a_bits = get_alignment_bits(opc);
1633    unsigned s_bits = opc & MO_SIZE;
1634    unsigned a_mask = (1 << a_bits) - 1;
1635    unsigned s_mask = (1 << s_bits) - 1;
1636    target_ulong tlb_mask;
1637
1638    if (TCG_TARGET_REG_BITS == 64) {
1639        if (TARGET_LONG_BITS == 64) {
1640            ttype = TCG_TYPE_I64;
1641            trexw = P_REXW;
1642        }
1643        if (TCG_TYPE_PTR == TCG_TYPE_I64) {
1644            hrexw = P_REXW;
1645            if (TARGET_PAGE_BITS + CPU_TLB_DYN_MAX_BITS > 32) {
1646                tlbtype = TCG_TYPE_I64;
1647                tlbrexw = P_REXW;
1648            }
1649        }
1650    }
1651
1652    tcg_out_mov(s, tlbtype, r0, addrlo);
1653    tcg_out_shifti(s, SHIFT_SHR + tlbrexw, r0,
1654                   TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
1655
1656    tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, r0, TCG_AREG0,
1657                         offsetof(CPUArchState, tlb_mask[mem_index]));
1658
1659    tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r0, TCG_AREG0,
1660                         offsetof(CPUArchState, tlb_table[mem_index]));
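        /* r0 now points at the CPUTLBEntry for this access: tlb_mask is
           stored pre-shifted by CPU_TLB_ENTRY_BITS, so the single AND
           above both masks the TLB index and scales it to a byte offset
           into tlb_table[mem_index].  */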
1661
1662    /* If the required alignment is at least as large as the access, simply
1663       copy the address and mask.  For lesser alignments, check that we don't
1664       cross pages for the complete access.  */
1665    if (a_bits >= s_bits) {
1666        tcg_out_mov(s, ttype, r1, addrlo);
1667    } else {
1668        tcg_out_modrm_offset(s, OPC_LEA + trexw, r1, addrlo, s_mask - a_mask);
1669    }
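        /* For example, a byte-aligned 8-byte access has a_mask = 0 and
           s_mask = 7, so LEA adds 7 and the masked compare below also
           fails when the access crosses a page boundary.  */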
1670    tlb_mask = (target_ulong)TARGET_PAGE_MASK | a_mask;
1671    tgen_arithi(s, ARITH_AND + trexw, r1, tlb_mask, 0);
1672
1673    /* cmp which(r0), r1 */
1674    tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw, r1, r0, which);
1675
1676    /* Prepare for both the fast path add of the tlb addend, and the slow
1677       path function argument setup.  */
1678    tcg_out_mov(s, ttype, r1, addrlo);
1679
1680    /* jne slow_path */
1681    tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1682    label_ptr[0] = s->code_ptr;
1683    s->code_ptr += 4;
1684
1685    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1686        /* cmp which+4(r0), addrhi */
1687        tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, r0, which + 4);
1688
1689        /* jne slow_path */
1690        tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1691        label_ptr[1] = s->code_ptr;
1692        s->code_ptr += 4;
1693    }
1694
1695    /* TLB Hit.  */
1696
1697    /* add addend(r0), r1 */
1698    tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r1, r0,
1699                         offsetof(CPUTLBEntry, addend));
1700}
1701
1702/*
1703 * Record the context of a call to the out of line helper code for the slow path
1704 * for a load or store, so that we can later generate the correct helper code
1705 */
1706static void add_qemu_ldst_label(TCGContext *s, bool is_ld, bool is_64,
1707                                TCGMemOpIdx oi,
1708                                TCGReg datalo, TCGReg datahi,
1709                                TCGReg addrlo, TCGReg addrhi,
1710                                tcg_insn_unit *raddr,
1711                                tcg_insn_unit **label_ptr)
1712{
1713    TCGLabelQemuLdst *label = new_ldst_label(s);
1714
1715    label->is_ld = is_ld;
1716    label->oi = oi;
1717    label->type = is_64 ? TCG_TYPE_I64 : TCG_TYPE_I32;
1718    label->datalo_reg = datalo;
1719    label->datahi_reg = datahi;
1720    label->addrlo_reg = addrlo;
1721    label->addrhi_reg = addrhi;
1722    label->raddr = raddr;
1723    label->label_ptr[0] = label_ptr[0];
1724    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1725        label->label_ptr[1] = label_ptr[1];
1726    }
1727}
1728
1729/*
1730 * Generate code for the slow path for a load at the end of block
1731 */
1732static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1733{
1734    TCGMemOpIdx oi = l->oi;
1735    TCGMemOp opc = get_memop(oi);
1736    TCGReg data_reg;
1737    tcg_insn_unit **label_ptr = &l->label_ptr[0];
1738    int rexw = (l->type == TCG_TYPE_I64 ? P_REXW : 0);
1739
1740    /* resolve label address */
1741    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1742    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1743        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1744    }
1745
1746    if (TCG_TARGET_REG_BITS == 32) {
1747        int ofs = 0;
1748
1749        tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1750        ofs += 4;
1751
1752        tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1753        ofs += 4;
1754
1755        if (TARGET_LONG_BITS == 64) {
1756            tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1757            ofs += 4;
1758        }
1759
1760        tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
1761        ofs += 4;
1762
1763        tcg_out_sti(s, TCG_TYPE_PTR, (uintptr_t)l->raddr, TCG_REG_ESP, ofs);
1764    } else {
1765        tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1766        /* The second argument is already loaded with addrlo.  */
1767        tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], oi);
1768        tcg_out_movi(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[3],
1769                     (uintptr_t)l->raddr);
1770    }
1771
1772    tcg_out_call(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);
1773
1774    data_reg = l->datalo_reg;
1775    switch (opc & MO_SSIZE) {
1776    case MO_SB:
1777        tcg_out_ext8s(s, data_reg, TCG_REG_EAX, rexw);
1778        break;
1779    case MO_SW:
1780        tcg_out_ext16s(s, data_reg, TCG_REG_EAX, rexw);
1781        break;
1782#if TCG_TARGET_REG_BITS == 64
1783    case MO_SL:
1784        tcg_out_ext32s(s, data_reg, TCG_REG_EAX);
1785        break;
1786#endif
1787    case MO_UB:
1788    case MO_UW:
1789        /* Note that the helpers have zero-extended to tcg_target_long.  */
1790    case MO_UL:
1791        tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
1792        break;
1793    case MO_Q:
1794        if (TCG_TARGET_REG_BITS == 64) {
1795            tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX);
1796        } else if (data_reg == TCG_REG_EDX) {
1797            /* xchg %edx, %eax */
1798            tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0);
1799            tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EAX);
1800        } else {
1801            tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
1802            tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EDX);
1803        }
1804        break;
1805    default:
1806        tcg_abort();
1807    }
1808
1809    /* Jump to the code corresponding to the next IR of qemu_ld.  */
1810    tcg_out_jmp(s, l->raddr);
1811}
1812
1813/*
1814 * Generate code for the slow path for a store at the end of block
1815 */
1816static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1817{
1818    TCGMemOpIdx oi = l->oi;
1819    TCGMemOp opc = get_memop(oi);
1820    TCGMemOp s_bits = opc & MO_SIZE;
1821    tcg_insn_unit **label_ptr = &l->label_ptr[0];
1822    TCGReg retaddr;
1823
1824    /* resolve label address */
1825    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1826    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1827        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1828    }
1829
1830    if (TCG_TARGET_REG_BITS == 32) {
1831        int ofs = 0;
1832
1833        tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1834        ofs += 4;
1835
1836        tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1837        ofs += 4;
1838
1839        if (TARGET_LONG_BITS == 64) {
1840            tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1841            ofs += 4;
1842        }
1843
1844        tcg_out_st(s, TCG_TYPE_I32, l->datalo_reg, TCG_REG_ESP, ofs);
1845        ofs += 4;
1846
1847        if (s_bits == MO_64) {
1848            tcg_out_st(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_ESP, ofs);
1849            ofs += 4;
1850        }
1851
1852        tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
1853        ofs += 4;
1854
1855        retaddr = TCG_REG_EAX;
1856        tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1857        tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP, ofs);
1858    } else {
1859        tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1860        /* The second argument is already loaded with addrlo.  */
1861        tcg_out_mov(s, (s_bits == MO_64 ? TCG_TYPE_I64 : TCG_TYPE_I32),
1862                    tcg_target_call_iarg_regs[2], l->datalo_reg);
1863        tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[3], oi);
1864
1865        if (ARRAY_SIZE(tcg_target_call_iarg_regs) > 4) {
1866            retaddr = tcg_target_call_iarg_regs[4];
1867            tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1868        } else {
1869            retaddr = TCG_REG_RAX;
1870            tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1871            tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP,
1872                       TCG_TARGET_CALL_STACK_OFFSET);
1873        }
1874    }
1875
1876    /* "Tail call" to the helper, with the return address back inline.  */
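        /* Pushing the return address and jumping, rather than calling,
           means the helper's own RET returns directly to RADDR in the
           translated code, skipping a trip back through this stub.  */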
1877    tcg_out_push(s, retaddr);
1878    tcg_out_jmp(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
1879}
1880#elif TCG_TARGET_REG_BITS == 32
1881# define x86_guest_base_seg     0
1882# define x86_guest_base_index   -1
1883# define x86_guest_base_offset  guest_base
1884#else
1885static int x86_guest_base_seg;
1886static int x86_guest_base_index = -1;
1887static int32_t x86_guest_base_offset;
1888# if defined(__x86_64__) && defined(__linux__)
1889#  include <asm/prctl.h>
1890#  include <sys/prctl.h>
1891int arch_prctl(int code, unsigned long addr);
1892static inline int setup_guest_base_seg(void)
1893{
1894    if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
1895        return P_GS;
1896    }
1897    return 0;
1898}
1899# elif defined (__FreeBSD__) || defined (__FreeBSD_kernel__)
1900#  include <machine/sysarch.h>
1901static inline int setup_guest_base_seg(void)
1902{
1903    if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) {
1904        return P_GS;
1905    }
1906    return 0;
1907}
1908# else
1909static inline int setup_guest_base_seg(void)
1910{
1911    return 0;
1912}
1913# endif
1914#endif /* SOFTMMU */
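    /* For user-only emulation, a guest address is a host address offset by
       guest_base.  On x86_64 that offset may not fit in the 32-bit
       displacement of an addressing mode, in which case it can instead be
       installed as the %gs segment base above and applied to each access
       with a segment override.  */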
1915
1916static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
1917                                   TCGReg base, int index, intptr_t ofs,
1918                                   int seg, bool is64, TCGMemOp memop)
1919{
1920    const TCGMemOp real_bswap = memop & MO_BSWAP;
1921    TCGMemOp bswap = real_bswap;
1922    int rexw = is64 * P_REXW;
1923    int movop = OPC_MOVL_GvEv;
1924
1925    if (have_movbe && real_bswap) {
1926        bswap = 0;
1927        movop = OPC_MOVBE_GyMy;
1928    }
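        /* MOVBE folds the byte swap into the load itself, so bswap was
           cleared above and the separate BSWAP fixups below are skipped.  */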
1929
1930    switch (memop & MO_SSIZE) {
1931    case MO_UB:
1932        tcg_out_modrm_sib_offset(s, OPC_MOVZBL + seg, datalo,
1933                                 base, index, 0, ofs);
1934        break;
1935    case MO_SB:
1936        tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + seg, datalo,
1937                                 base, index, 0, ofs);
1938        break;
1939    case MO_UW:
1940        tcg_out_modrm_sib_offset(s, OPC_MOVZWL + seg, datalo,
1941                                 base, index, 0, ofs);
1942        if (real_bswap) {
1943            tcg_out_rolw_8(s, datalo);
1944        }
1945        break;
1946    case MO_SW:
1947        if (real_bswap) {
1948            if (have_movbe) {
1949                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
1950                                         datalo, base, index, 0, ofs);
1951            } else {
1952                tcg_out_modrm_sib_offset(s, OPC_MOVZWL + seg, datalo,
1953                                         base, index, 0, ofs);
1954                tcg_out_rolw_8(s, datalo);
1955            }
1956            tcg_out_modrm(s, OPC_MOVSWL + rexw, datalo, datalo);
1957        } else {
1958            tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + seg,
1959                                     datalo, base, index, 0, ofs);
1960        }
1961        break;
1962    case MO_UL:
1963        tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
1964        if (bswap) {
1965            tcg_out_bswap32(s, datalo);
1966        }
1967        break;
1968#if TCG_TARGET_REG_BITS == 64
1969    case MO_SL:
1970        if (real_bswap) {
1971            tcg_out_modrm_sib_offset(s, movop + seg, datalo,
1972                                     base, index, 0, ofs);
1973            if (bswap) {
1974                tcg_out_bswap32(s, datalo);
1975            }
1976            tcg_out_ext32s(s, datalo, datalo);
1977        } else {
1978            tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + seg, datalo,
1979                                     base, index, 0, ofs);
1980        }
1981        break;
1982#endif
1983    case MO_Q:
1984        if (TCG_TARGET_REG_BITS == 64) {
1985            tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
1986                                     base, index, 0, ofs);
1987            if (bswap) {
1988                tcg_out_bswap64(s, datalo);
1989            }
1990        } else {
1991            if (real_bswap) {
1992                int t = datalo;
1993                datalo = datahi;
1994                datahi = t;
1995            }
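                /* Order the two 32-bit loads so that the half aliasing the
                   base register, if any, is loaded last; otherwise the first
                   load would clobber the address.  */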
1996            if (base != datalo) {
1997                tcg_out_modrm_sib_offset(s, movop + seg, datalo,
1998                                         base, index, 0, ofs);
1999                tcg_out_modrm_sib_offset(s, movop + seg, datahi,
2000                                         base, index, 0, ofs + 4);
2001            } else {
2002                tcg_out_modrm_sib_offset(s, movop + seg, datahi,
2003                                         base, index, 0, ofs + 4);
2004                tcg_out_modrm_sib_offset(s, movop + seg, datalo,
2005                                         base, index, 0, ofs);
2006            }
2007            if (bswap) {
2008                tcg_out_bswap32(s, datalo);
2009                tcg_out_bswap32(s, datahi);
2010            }
2011        }
2012        break;
2013    default:
2014        tcg_abort();
2015    }
2016}
2017
2018/* XXX: qemu_ld and qemu_st could be modified to clobber only EDX and
2019   EAX.  That will be useful once fixed-register globals are less
2020   common. */
2021static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
2022{
2023    TCGReg datalo, datahi, addrlo;
2024    TCGReg addrhi __attribute__((unused));
2025    TCGMemOpIdx oi;
2026    TCGMemOp opc;
2027#if defined(CONFIG_SOFTMMU)
2028    int mem_index;
2029    tcg_insn_unit *label_ptr[2];
2030#endif
2031
2032    datalo = *args++;
2033    datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
2034    addrlo = *args++;
2035    addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
2036    oi = *args++;
2037    opc = get_memop(oi);
2038
2039#if defined(CONFIG_SOFTMMU)
2040    mem_index = get_mmuidx(oi);
2041
2042    tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
2043                     label_ptr, offsetof(CPUTLBEntry, addr_read));
2044
2045    /* TLB Hit.  */
2046    tcg_out_qemu_ld_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, is64, opc);
2047
2048    /* Record the current context of a load into ldst label */
2049    add_qemu_ldst_label(s, true, is64, oi, datalo, datahi, addrlo, addrhi,
2050                        s->code_ptr, label_ptr);
2051#else
2052    tcg_out_qemu_ld_direct(s, datalo, datahi, addrlo, x86_guest_base_index,
2053                           x86_guest_base_offset, x86_guest_base_seg,
2054                           is64, opc);
2055#endif
2056}
2057
2058static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2059                                   TCGReg base, int index, intptr_t ofs,
2060                                   int seg, TCGMemOp memop)
2061{
2062    /* ??? Ideally we wouldn't need a scratch register.  For user-only,
2063       we could perform the bswap twice to restore the original value
2064       instead of moving to the scratch.  But as it is, the L constraint
2065       means that TCG_REG_L0 is definitely free here.  */
2066    const TCGReg scratch = TCG_REG_L0;
2067    const TCGMemOp real_bswap = memop & MO_BSWAP;
2068    TCGMemOp bswap = real_bswap;
2069    int movop = OPC_MOVL_EvGv;
2070
2071    if (have_movbe && real_bswap) {
2072        bswap = 0;
2073        movop = OPC_MOVBE_MyGy;
2074    }
2075
2076    switch (memop & MO_SIZE) {
2077    case MO_8:
2078        /* In 32-bit mode, 8-bit stores can only happen from [abcd]x.
2079           Use the scratch register if necessary.  */
2080        if (TCG_TARGET_REG_BITS == 32 && datalo >= 4) {
2081            tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
2082            datalo = scratch;
2083        }
2084        tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + seg,
2085                                 datalo, base, index, 0, ofs);
2086        break;
2087    case MO_16:
2088        if (bswap) {
2089            tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
2090            tcg_out_rolw_8(s, scratch);
2091            datalo = scratch;
2092        }
2093        tcg_out_modrm_sib_offset(s, movop + P_DATA16 + seg, datalo,
2094                                 base, index, 0, ofs);
2095        break;
2096    case MO_32:
2097        if (bswap) {
2098            tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
2099            tcg_out_bswap32(s, scratch);
2100            datalo = scratch;
2101        }
2102        tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
2103        break;
2104    case MO_64:
2105        if (TCG_TARGET_REG_BITS == 64) {
2106            if (bswap) {
2107                tcg_out_mov(s, TCG_TYPE_I64, scratch, datalo);
2108                tcg_out_bswap64(s, scratch);
2109                datalo = scratch;
2110            }
2111            tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
2112                                     base, index, 0, ofs);
2113        } else if (bswap) {
2114            tcg_out_mov(s, TCG_TYPE_I32, scratch, datahi);
2115            tcg_out_bswap32(s, scratch);
2116            tcg_out_modrm_sib_offset(s, OPC_MOVL_EvGv + seg, scratch,
2117                                     base, index, 0, ofs);
2118            tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
2119            tcg_out_bswap32(s, scratch);
2120            tcg_out_modrm_sib_offset(s, OPC_MOVL_EvGv + seg, scratch,
2121                                     base, index, 0, ofs + 4);
2122        } else {
2123            if (real_bswap) {
2124                int t = datalo;
2125                datalo = datahi;
2126                datahi = t;
2127            }
2128            tcg_out_modrm_sib_offset(s, movop + seg, datalo,
2129                                     base, index, 0, ofs);
2130            tcg_out_modrm_sib_offset(s, movop + seg, datahi,
2131                                     base, index, 0, ofs + 4);
2132        }
2133        break;
2134    default:
2135        tcg_abort();
2136    }
2137}
2138
2139static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
2140{
2141    TCGReg datalo, datahi, addrlo;
2142    TCGReg addrhi __attribute__((unused));
2143    TCGMemOpIdx oi;
2144    TCGMemOp opc;
2145#if defined(CONFIG_SOFTMMU)
2146    int mem_index;
2147    tcg_insn_unit *label_ptr[2];
2148#endif
2149
2150    datalo = *args++;
2151    datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
2152    addrlo = *args++;
2153    addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
2154    oi = *args++;
2155    opc = get_memop(oi);
2156
2157#if defined(CONFIG_SOFTMMU)
2158    mem_index = get_mmuidx(oi);
2159
2160    tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
2161                     label_ptr, offsetof(CPUTLBEntry, addr_write));
2162
2163    /* TLB Hit.  */
2164    tcg_out_qemu_st_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, opc);
2165
2166    /* Record the current context of a store into ldst label */
2167    add_qemu_ldst_label(s, false, is64, oi, datalo, datahi, addrlo, addrhi,
2168                        s->code_ptr, label_ptr);
2169#else
2170    tcg_out_qemu_st_direct(s, datalo, datahi, addrlo, x86_guest_base_index,
2171                           x86_guest_base_offset, x86_guest_base_seg, opc);
2172#endif
2173}
2174
2175static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
2176                              const TCGArg *args, const int *const_args)
2177{
2178    TCGArg a0, a1, a2;
2179    int c, const_a2, vexop, rexw = 0;
2180
2181#if TCG_TARGET_REG_BITS == 64
2182# define OP_32_64(x) \
2183        case glue(glue(INDEX_op_, x), _i64): \
2184            rexw = P_REXW; /* FALLTHRU */    \
2185        case glue(glue(INDEX_op_, x), _i32)
2186#else
2187# define OP_32_64(x) \
2188        case glue(glue(INDEX_op_, x), _i32)
2189#endif
2190
2191    /* Hoist the loads of the most common arguments.  */
2192    a0 = args[0];
2193    a1 = args[1];
2194    a2 = args[2];
2195    const_a2 = const_args[2];
2196
2197    switch (opc) {
2198    case INDEX_op_exit_tb:
2199        /* Reuse the zeroing that exists for goto_ptr.  */
2200        if (a0 == 0) {
2201            tcg_out_jmp(s, s->code_gen_epilogue);
2202        } else {
2203            tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
2204            tcg_out_jmp(s, tb_ret_addr);
2205        }
2206        break;
2207    case INDEX_op_goto_tb:
2208        if (s->tb_jmp_insn_offset) {
2209            /* direct jump method */
2210            int gap;
2211            /* The jump displacement must be aligned for atomic patching;
2212             * see if we need to add extra nops before the jump.
2213             */
2214            gap = tcg_pcrel_diff(s, QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4));
2215            if (gap != 1) {
2216                tcg_out_nopn(s, gap - 1);
2217            }
2218            tcg_out8(s, OPC_JMP_long); /* jmp im */
2219            s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
2220            tcg_out32(s, 0);
2221        } else {
2222            /* indirect jump method */
2223            tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, -1,
2224                                 (intptr_t)(s->tb_jmp_target_addr + a0));
2225        }
2226        set_jmp_reset_offset(s, a0);
2227        break;
2228    case INDEX_op_goto_ptr:
2229        /* jmp to the given host address (could be epilogue) */
2230        tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0);
2231        break;
2232    case INDEX_op_br:
2233        tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
2234        break;
2235    OP_32_64(ld8u):
2236        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2237        tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2);
2238        break;
2239    OP_32_64(ld8s):
2240        tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2);
2241        break;
2242    OP_32_64(ld16u):
2243        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2244        tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2);
2245        break;
2246    OP_32_64(ld16s):
2247        tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2);
2248        break;
2249#if TCG_TARGET_REG_BITS == 64
2250    case INDEX_op_ld32u_i64:
2251#endif
2252    case INDEX_op_ld_i32:
2253        tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2);
2254        break;
2255
2256    OP_32_64(st8):
2257        if (const_args[0]) {
2258            tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2);
2259            tcg_out8(s, a0);
2260        } else {
2261            tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2);
2262        }
2263        break;
2264    OP_32_64(st16):
2265        if (const_args[0]) {
2266            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2);
2267            tcg_out16(s, a0);
2268        } else {
2269            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2);
2270        }
2271        break;
2272#if TCG_TARGET_REG_BITS == 64
2273    case INDEX_op_st32_i64:
2274#endif
2275    case INDEX_op_st_i32:
2276        if (const_args[0]) {
2277            tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2);
2278            tcg_out32(s, a0);
2279        } else {
2280            tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2);
2281        }
2282        break;
2283
2284    OP_32_64(add):
2285        /* For 3-operand addition, use LEA.  */
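            /* E.g. add a0,a1,a2 becomes lea (%a1,%a2),%a0, and with a
               constant a2 becomes lea a2(%a1),%a0.  */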
2286        if (a0 != a1) {
2287            TCGArg c3 = 0;
2288            if (const_a2) {
2289                c3 = a2, a2 = -1;
2290            } else if (a0 == a2) {
2291                /* Watch out for dest = src + dest, since we've removed
2292                   the matching constraint on the add.  */
2293                tgen_arithr(s, ARITH_ADD + rexw, a0, a1);
2294                break;
2295            }
2296
2297            tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3);
2298            break;
2299        }
2300        c = ARITH_ADD;
2301        goto gen_arith;
2302    OP_32_64(sub):
2303        c = ARITH_SUB;
2304        goto gen_arith;
2305    OP_32_64(and):
2306        c = ARITH_AND;
2307        goto gen_arith;
2308    OP_32_64(or):
2309        c = ARITH_OR;
2310        goto gen_arith;
2311    OP_32_64(xor):
2312        c = ARITH_XOR;
2313        goto gen_arith;
2314    gen_arith:
2315        if (const_a2) {
2316            tgen_arithi(s, c + rexw, a0, a2, 0);
2317        } else {
2318            tgen_arithr(s, c + rexw, a0, a2);
2319        }
2320        break;
2321
2322    OP_32_64(andc):
2323        if (const_a2) {
2324            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2325            tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0);
2326        } else {
2327            tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1);
2328        }
2329        break;
2330
2331    OP_32_64(mul):
2332        if (const_a2) {
2333            int32_t val;
2334            val = a2;
2335            if (val == (int8_t)val) {
2336                tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0);
2337                tcg_out8(s, val);
2338            } else {
2339                tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0);
2340                tcg_out32(s, val);
2341            }
2342        } else {
2343            tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2);
2344        }
2345        break;
2346
2347    OP_32_64(div2):
2348        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]);
2349        break;
2350    OP_32_64(divu2):
2351        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]);
2352        break;
2353
2354    OP_32_64(shl):
2355        /* For small constant 3-operand shift, use LEA.  */
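            /* TCGArg is unsigned, so (a2 - 1) < 3 checks 1 <= a2 <= 3: the
               counts LEA can express via a1+a1 (shift by 1) or an index
               scale of 4 or 8 (shifts by 2 and 3).  */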
2356        if (const_a2 && a0 != a1 && (a2 - 1) < 3) {
2357            if (a2 - 1 == 0) {
2358                /* shl $1,a1,a0 -> lea (a1,a1),a0 */
2359                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0);
2360            } else {
2361                /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */
2362                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0);
2363            }
2364            break;
2365        }
2366        c = SHIFT_SHL;
2367        vexop = OPC_SHLX;
2368        goto gen_shift_maybe_vex;
2369    OP_32_64(shr):
2370        c = SHIFT_SHR;
2371        vexop = OPC_SHRX;
2372        goto gen_shift_maybe_vex;
2373    OP_32_64(sar):
2374        c = SHIFT_SAR;
2375        vexop = OPC_SARX;
2376        goto gen_shift_maybe_vex;
2377    OP_32_64(rotl):
2378        c = SHIFT_ROL;
2379        goto gen_shift;
2380    OP_32_64(rotr):
2381        c = SHIFT_ROR;
2382        goto gen_shift;
2383    gen_shift_maybe_vex:
2384        if (have_bmi2) {
2385            if (!const_a2) {
2386                tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1);
2387                break;
2388            }
2389            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2390        }
2391        /* FALLTHRU */
2392    gen_shift:
2393        if (const_a2) {
2394            tcg_out_shifti(s, c + rexw, a0, a2);
2395        } else {
2396            tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0);
2397        }
2398        break;
2399
2400    OP_32_64(ctz):
2401        tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
2402        break;
2403    OP_32_64(clz):
2404        tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
2405        break;
2406    OP_32_64(ctpop):
2407        tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1);
2408        break;
2409
2410    case INDEX_op_brcond_i32:
2411        tcg_out_brcond32(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2412        break;
2413    case INDEX_op_setcond_i32:
2414        tcg_out_setcond32(s, args[3], a0, a1, a2, const_a2);
2415        break;
2416    case INDEX_op_movcond_i32:
2417        tcg_out_movcond32(s, args[5], a0, a1, a2, const_a2, args[3]);
2418        break;
2419
2420    OP_32_64(bswap16):
2421        tcg_out_rolw_8(s, a0);
2422        break;
2423    OP_32_64(bswap32):
2424        tcg_out_bswap32(s, a0);
2425        break;
2426
2427    OP_32_64(neg):
2428        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0);
2429        break;
2430    OP_32_64(not):
2431        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0);
2432        break;
2433
2434    OP_32_64(ext8s):
2435        tcg_out_ext8s(s, a0, a1, rexw);
2436        break;
2437    OP_32_64(ext16s):
2438        tcg_out_ext16s(s, a0, a1, rexw);
2439        break;
2440    OP_32_64(ext8u):
2441        tcg_out_ext8u(s, a0, a1);
2442        break;
2443    OP_32_64(ext16u):
2444        tcg_out_ext16u(s, a0, a1);
2445        break;
2446
2447    case INDEX_op_qemu_ld_i32:
2448        tcg_out_qemu_ld(s, args, 0);
2449        break;
2450    case INDEX_op_qemu_ld_i64:
2451        tcg_out_qemu_ld(s, args, 1);
2452        break;
2453    case INDEX_op_qemu_st_i32:
2454        tcg_out_qemu_st(s, args, 0);
2455        break;
2456    case INDEX_op_qemu_st_i64:
2457        tcg_out_qemu_st(s, args, 1);
2458        break;
2459
2460    OP_32_64(mulu2):
2461        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
2462        break;
2463    OP_32_64(muls2):
2464        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]);
2465        break;
2466    OP_32_64(add2):
2467        if (const_args[4]) {
2468            tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1);
2469        } else {
2470            tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]);
2471        }
2472        if (const_args[5]) {
2473            tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1);
2474        } else {
2475            tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]);
2476        }
2477        break;
2478    OP_32_64(sub2):
2479        if (const_args[4]) {
2480            tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1);
2481        } else {
2482            tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]);
2483        }
2484        if (const_args[5]) {
2485            tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1);
2486        } else {
2487            tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]);
2488        }
2489        break;
2490
2491#if TCG_TARGET_REG_BITS == 32
2492    case INDEX_op_brcond2_i32:
2493        tcg_out_brcond2(s, args, const_args, 0);
2494        break;
2495    case INDEX_op_setcond2_i32:
2496        tcg_out_setcond2(s, args, const_args);
2497        break;
2498#else /* TCG_TARGET_REG_BITS == 64 */
2499    case INDEX_op_ld32s_i64:
2500        tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2);
2501        break;
2502    case INDEX_op_ld_i64:
2503        tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2);
2504        break;
2505    case INDEX_op_st_i64:
2506        if (const_args[0]) {
2507            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2);
2508            tcg_out32(s, a0);
2509        } else {
2510            tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2);
2511        }
2512        break;
2513
2514    case INDEX_op_brcond_i64:
2515        tcg_out_brcond64(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2516        break;
2517    case INDEX_op_setcond_i64:
2518        tcg_out_setcond64(s, args[3], a0, a1, a2, const_a2);
2519        break;
2520    case INDEX_op_movcond_i64:
2521        tcg_out_movcond64(s, args[5], a0, a1, a2, const_a2, args[3]);
2522        break;
2523
2524    case INDEX_op_bswap64_i64:
2525        tcg_out_bswap64(s, a0);
2526        break;
2527    case INDEX_op_extu_i32_i64:
2528    case INDEX_op_ext32u_i64:
2529    case INDEX_op_extrl_i64_i32:
2530        tcg_out_ext32u(s, a0, a1);
2531        break;
2532    case INDEX_op_ext_i32_i64:
2533    case INDEX_op_ext32s_i64:
2534        tcg_out_ext32s(s, a0, a1);
2535        break;
2536    case INDEX_op_extrh_i64_i32:
2537        tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32);
2538        break;
2539#endif
2540
2541    OP_32_64(deposit):
2542        if (args[3] == 0 && args[4] == 8) {
2543            /* load bits 0..7 */
2544            tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
2545        } else if (args[3] == 8 && args[4] == 8) {
2546            /* load bits 8..15 */
2547            tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
2548        } else if (args[3] == 0 && args[4] == 16) {
2549            /* load bits 0..15 */
2550            tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
2551        } else {
2552            tcg_abort();
2553        }
2554        break;
2555
2556    case INDEX_op_extract_i64:
2557        if (a2 + args[3] == 32) {
2558            /* This is a 32-bit zero-extending right shift.  */
2559            tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
2560            tcg_out_shifti(s, SHIFT_SHR, a0, a2);
2561            break;
2562        }
2563        /* FALLTHRU */
2564    case INDEX_op_extract_i32:
2565        /* Use the high-byte registers on the off-chance that we can.
2566           Otherwise emit the same ext16 + shift pattern that we would
2567           have gotten from the normal tcg-op.c expansion.  */
2568        tcg_debug_assert(a2 == 8 && args[3] == 8);
2569        if (a1 < 4 && a0 < 8) {
2570            tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
2571        } else {
2572            tcg_out_ext16u(s, a0, a1);
2573            tcg_out_shifti(s, SHIFT_SHR, a0, 8);
2574        }
2575        break;
2576
2577    case INDEX_op_sextract_i32:
2578        /* We don't implement sextract_i64, as we cannot sign-extend to
2579           64-bits without using the REX prefix that explicitly excludes
2580           access to the high-byte registers.  */
2581        tcg_debug_assert(a2 == 8 && args[3] == 8);
2582        if (a1 < 4 && a0 < 8) {
2583            tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
2584        } else {
2585            tcg_out_ext16s(s, a0, a1, 0);
2586            tcg_out_shifti(s, SHIFT_SAR, a0, 8);
2587        }
2588        break;
2589
2590    case INDEX_op_mb:
2591        tcg_out_mb(s, a0);
2592        break;
2593    case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
2594    case INDEX_op_mov_i64:
2595    case INDEX_op_mov_vec:
2596    case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi.  */
2597    case INDEX_op_movi_i64:
2598    case INDEX_op_dupi_vec:
2599    case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
2600    default:
2601        tcg_abort();
2602    }
2603
2604#undef OP_32_64
2605}
2606
2607static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
2608                           unsigned vecl, unsigned vece,
2609                           const TCGArg *args, const int *const_args)
2610{
2611    static int const add_insn[4] = {
2612        OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ
2613    };
2614    static int const ssadd_insn[4] = {
2615        OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2
2616    };
2617    static int const usadd_insn[4] = {
2618        OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2
2619    };
2620    static int const sub_insn[4] = {
2621        OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ
2622    };
2623    static int const sssub_insn[4] = {
2624        OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2
2625    };
2626    static int const ussub_insn[4] = {
2627        OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2
2628    };
2629    static int const mul_insn[4] = {
2630        OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_UD2
2631    };
2632    static int const shift_imm_insn[4] = {
2633        OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib
2634    };
2635    static int const cmpeq_insn[4] = {
2636        OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ
2637    };
2638    static int const cmpgt_insn[4] = {
2639        OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ
2640    };
2641    static int const punpckl_insn[4] = {
2642        OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ
2643    };
2644    static int const punpckh_insn[4] = {
2645        OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ
2646    };
2647    static int const packss_insn[4] = {
2648        OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2
2649    };
2650    static int const packus_insn[4] = {
2651        OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2
2652    };
2653    static int const smin_insn[4] = {
2654        OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_UD2
2655    };
2656    static int const smax_insn[4] = {
2657        OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_UD2
2658    };
2659    static int const umin_insn[4] = {
2660        OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_UD2
2661    };
2662    static int const umax_insn[4] = {
2663        OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_UD2
2664    };
2665
2666    TCGType type = vecl + TCG_TYPE_V64;
2667    int insn, sub;
2668    TCGArg a0, a1, a2;
2669
2670    a0 = args[0];
2671    a1 = args[1];
2672    a2 = args[2];
2673
2674    switch (opc) {
2675    case INDEX_op_add_vec:
2676        insn = add_insn[vece];
2677        goto gen_simd;
2678    case INDEX_op_ssadd_vec:
2679        insn = ssadd_insn[vece];
2680        goto gen_simd;
2681    case INDEX_op_usadd_vec:
2682        insn = usadd_insn[vece];
2683        goto gen_simd;
2684    case INDEX_op_sub_vec:
2685        insn = sub_insn[vece];
2686        goto gen_simd;
2687    case INDEX_op_sssub_vec:
2688        insn = sssub_insn[vece];
2689        goto gen_simd;
2690    case INDEX_op_ussub_vec:
2691        insn = ussub_insn[vece];
2692        goto gen_simd;
2693    case INDEX_op_mul_vec:
2694        insn = mul_insn[vece];
2695        goto gen_simd;
2696    case INDEX_op_and_vec:
2697        insn = OPC_PAND;
2698        goto gen_simd;
2699    case INDEX_op_or_vec:
2700        insn = OPC_POR;
2701        goto gen_simd;
2702    case INDEX_op_xor_vec:
2703        insn = OPC_PXOR;
2704        goto gen_simd;
2705    case INDEX_op_smin_vec:
2706        insn = smin_insn[vece];
2707        goto gen_simd;
2708    case INDEX_op_umin_vec:
2709        insn = umin_insn[vece];
2710        goto gen_simd;
2711    case INDEX_op_smax_vec:
2712        insn = smax_insn[vece];
2713        goto gen_simd;
2714    case INDEX_op_umax_vec:
2715        insn = umax_insn[vece];
2716        goto gen_simd;
2717    case INDEX_op_x86_punpckl_vec:
2718        insn = punpckl_insn[vece];
2719        goto gen_simd;
2720    case INDEX_op_x86_punpckh_vec:
2721        insn = punpckh_insn[vece];
2722        goto gen_simd;
2723    case INDEX_op_x86_packss_vec:
2724        insn = packss_insn[vece];
2725        goto gen_simd;
2726    case INDEX_op_x86_packus_vec:
2727        insn = packus_insn[vece];
2728        goto gen_simd;
2729#if TCG_TARGET_REG_BITS == 32
2730    case INDEX_op_dup2_vec:
2731        /* Constraints have already placed both 32-bit inputs in xmm regs.  */
2732        insn = OPC_PUNPCKLDQ;
2733        goto gen_simd;
2734#endif
2735    gen_simd:
2736        tcg_debug_assert(insn != OPC_UD2);
2737        if (type == TCG_TYPE_V256) {
2738            insn |= P_VEXL;
2739        }
2740        tcg_out_vex_modrm(s, insn, a0, a1, a2);
2741        break;
2742
2743    case INDEX_op_cmp_vec:
2744        sub = args[3];
2745        if (sub == TCG_COND_EQ) {
2746            insn = cmpeq_insn[vece];
2747        } else if (sub == TCG_COND_GT) {
2748            insn = cmpgt_insn[vece];
2749        } else {
2750            g_assert_not_reached();
2751        }
2752        goto gen_simd;
2753
2754    case INDEX_op_andc_vec:
2755        insn = OPC_PANDN;
2756        if (type == TCG_TYPE_V256) {
2757            insn |= P_VEXL;
2758        }
2759        tcg_out_vex_modrm(s, insn, a0, a2, a1);
2760        break;
2761
2762    case INDEX_op_shli_vec:
2763        sub = 6;
2764        goto gen_shift;
2765    case INDEX_op_shri_vec:
2766        sub = 2;
2767        goto gen_shift;
2768    case INDEX_op_sari_vec:
2769        tcg_debug_assert(vece != MO_64);
2770        sub = 4;
2771    gen_shift:
2772        tcg_debug_assert(vece != MO_8);
2773        insn = shift_imm_insn[vece];
2774        if (type == TCG_TYPE_V256) {
2775            insn |= P_VEXL;
2776        }
2777        tcg_out_vex_modrm(s, insn, sub, a0, a1);
2778        tcg_out8(s, a2);
2779        break;
2780
2781    case INDEX_op_ld_vec:
2782        tcg_out_ld(s, type, a0, a1, a2);
2783        break;
2784    case INDEX_op_st_vec:
2785        tcg_out_st(s, type, a0, a1, a2);
2786        break;
2787    case INDEX_op_dup_vec:
2788        tcg_out_dup_vec(s, type, vece, a0, a1);
2789        break;
2790
2791    case INDEX_op_x86_shufps_vec:
2792        insn = OPC_SHUFPS;
2793        sub = args[3];
2794        goto gen_simd_imm8;
2795    case INDEX_op_x86_blend_vec:
2796        if (vece == MO_16) {
2797            insn = OPC_PBLENDW;
2798        } else if (vece == MO_32) {
2799            insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS);
2800        } else {
2801            g_assert_not_reached();
2802        }
2803        sub = args[3];
2804        goto gen_simd_imm8;
2805    case INDEX_op_x86_vperm2i128_vec:
2806        insn = OPC_VPERM2I128;
2807        sub = args[3];
2808        goto gen_simd_imm8;
2809    gen_simd_imm8:
2810        if (type == TCG_TYPE_V256) {
2811            insn |= P_VEXL;
2812        }
2813        tcg_out_vex_modrm(s, insn, a0, a1, a2);
2814        tcg_out8(s, sub);
2815        break;
2816
2817    case INDEX_op_x86_vpblendvb_vec:
2818        insn = OPC_VPBLENDVB;
2819        if (type == TCG_TYPE_V256) {
2820            insn |= P_VEXL;
2821        }
2822        tcg_out_vex_modrm(s, insn, a0, a1, a2);
2823        tcg_out8(s, args[3] << 4);
2824        break;
2825
2826    case INDEX_op_x86_psrldq_vec:
2827        tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1);
2828        tcg_out8(s, a2);
2829        break;
2830
2831    default:
2832        g_assert_not_reached();
2833    }
2834}
2835
2836static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
2837{
2838    static const TCGTargetOpDef r = { .args_ct_str = { "r" } };
2839    static const TCGTargetOpDef ri_r = { .args_ct_str = { "ri", "r" } };
2840    static const TCGTargetOpDef re_r = { .args_ct_str = { "re", "r" } };
2841    static const TCGTargetOpDef qi_r = { .args_ct_str = { "qi", "r" } };
2842    static const TCGTargetOpDef r_r = { .args_ct_str = { "r", "r" } };
2843    static const TCGTargetOpDef r_q = { .args_ct_str = { "r", "q" } };
2844    static const TCGTargetOpDef r_re = { .args_ct_str = { "r", "re" } };
2845    static const TCGTargetOpDef r_0 = { .args_ct_str = { "r", "0" } };
2846    static const TCGTargetOpDef r_r_ri = { .args_ct_str = { "r", "r", "ri" } };
2847    static const TCGTargetOpDef r_r_re = { .args_ct_str = { "r", "r", "re" } };
2848    static const TCGTargetOpDef r_0_re = { .args_ct_str = { "r", "0", "re" } };
2849    static const TCGTargetOpDef r_0_ci = { .args_ct_str = { "r", "0", "ci" } };
2850    static const TCGTargetOpDef r_L = { .args_ct_str = { "r", "L" } };
2851    static const TCGTargetOpDef L_L = { .args_ct_str = { "L", "L" } };
2852    static const TCGTargetOpDef r_L_L = { .args_ct_str = { "r", "L", "L" } };
2853    static const TCGTargetOpDef r_r_L = { .args_ct_str = { "r", "r", "L" } };
2854    static const TCGTargetOpDef L_L_L = { .args_ct_str = { "L", "L", "L" } };
2855    static const TCGTargetOpDef r_r_L_L
2856        = { .args_ct_str = { "r", "r", "L", "L" } };
2857    static const TCGTargetOpDef L_L_L_L
2858        = { .args_ct_str = { "L", "L", "L", "L" } };
2859    static const TCGTargetOpDef x_x = { .args_ct_str = { "x", "x" } };
2860    static const TCGTargetOpDef x_x_x = { .args_ct_str = { "x", "x", "x" } };
2861    static const TCGTargetOpDef x_x_x_x
2862        = { .args_ct_str = { "x", "x", "x", "x" } };
2863    static const TCGTargetOpDef x_r = { .args_ct_str = { "x", "r" } };
2864
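    /* The constraint letters below are decoded by target_parse_constraint
       earlier in this file.  Roughly: "r" is any GP register, "q" a
       byte-addressable GP register, "x" a vector register, "L" a register
       usable across the qemu_ld/st fast path (excluding the L0/L1
       scratches), "0"/"1" alias the corresponding output operand, and
       "e"/"i" are immediate forms.  */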
2865    switch (op) {
2866    case INDEX_op_goto_ptr:
2867        return &r;
2868
2869    case INDEX_op_ld8u_i32:
2870    case INDEX_op_ld8u_i64:
2871    case INDEX_op_ld8s_i32:
2872    case INDEX_op_ld8s_i64:
2873    case INDEX_op_ld16u_i32:
2874    case INDEX_op_ld16u_i64:
2875    case INDEX_op_ld16s_i32:
2876    case INDEX_op_ld16s_i64:
2877    case INDEX_op_ld_i32:
2878    case INDEX_op_ld32u_i64:
2879    case INDEX_op_ld32s_i64:
2880    case INDEX_op_ld_i64:
2881        return &r_r;
2882
2883    case INDEX_op_st8_i32:
2884    case INDEX_op_st8_i64:
2885        return &qi_r;
2886    case INDEX_op_st16_i32:
2887    case INDEX_op_st16_i64:
2888    case INDEX_op_st_i32:
2889    case INDEX_op_st32_i64:
2890        return &ri_r;
2891    case INDEX_op_st_i64:
2892        return &re_r;
2893
2894    case INDEX_op_add_i32:
2895    case INDEX_op_add_i64:
2896        return &r_r_re;
2897    case INDEX_op_sub_i32:
2898    case INDEX_op_sub_i64:
2899    case INDEX_op_mul_i32:
2900    case INDEX_op_mul_i64:
2901    case INDEX_op_or_i32:
2902    case INDEX_op_or_i64:
2903    case INDEX_op_xor_i32:
2904    case INDEX_op_xor_i64:
2905        return &r_0_re;
2906
2907    case INDEX_op_and_i32:
2908    case INDEX_op_and_i64:
2909        {
2910            static const TCGTargetOpDef and
2911                = { .args_ct_str = { "r", "0", "reZ" } };
2912            return &and;
2913        }
2914        break;
2915    case INDEX_op_andc_i32:
2916    case INDEX_op_andc_i64:
2917        {
2918            static const TCGTargetOpDef andc
2919                = { .args_ct_str = { "r", "r", "rI" } };
2920            return &andc;
2921        }
2922        break;
2923
2924    case INDEX_op_shl_i32:
2925    case INDEX_op_shl_i64:
2926    case INDEX_op_shr_i32:
2927    case INDEX_op_shr_i64:
2928    case INDEX_op_sar_i32:
2929    case INDEX_op_sar_i64:
2930        return have_bmi2 ? &r_r_ri : &r_0_ci;
2931    case INDEX_op_rotl_i32:
2932    case INDEX_op_rotl_i64:
2933    case INDEX_op_rotr_i32:
2934    case INDEX_op_rotr_i64:
2935        return &r_0_ci;
2936
2937    case INDEX_op_brcond_i32:
2938    case INDEX_op_brcond_i64:
2939        return &r_re;
2940
2941    case INDEX_op_bswap16_i32:
2942    case INDEX_op_bswap16_i64:
2943    case INDEX_op_bswap32_i32:
2944    case INDEX_op_bswap32_i64:
2945    case INDEX_op_bswap64_i64:
2946    case INDEX_op_neg_i32:
2947    case INDEX_op_neg_i64:
2948    case INDEX_op_not_i32:
2949    case INDEX_op_not_i64:
2950    case INDEX_op_extrh_i64_i32:
2951        return &r_0;
2952
2953    case INDEX_op_ext8s_i32:
2954    case INDEX_op_ext8s_i64:
2955    case INDEX_op_ext8u_i32:
2956    case INDEX_op_ext8u_i64:
2957        return &r_q;
2958    case INDEX_op_ext16s_i32:
2959    case INDEX_op_ext16s_i64:
2960    case INDEX_op_ext16u_i32:
2961    case INDEX_op_ext16u_i64:
2962    case INDEX_op_ext32s_i64:
2963    case INDEX_op_ext32u_i64:
2964    case INDEX_op_ext_i32_i64:
2965    case INDEX_op_extu_i32_i64:
2966    case INDEX_op_extrl_i64_i32:
2967    case INDEX_op_extract_i32:
2968    case INDEX_op_extract_i64:
2969    case INDEX_op_sextract_i32:
2970    case INDEX_op_ctpop_i32:
2971    case INDEX_op_ctpop_i64:
2972        return &r_r;
2973
2974    case INDEX_op_deposit_i32:
2975    case INDEX_op_deposit_i64:
2976        {
2977            static const TCGTargetOpDef dep
2978                = { .args_ct_str = { "Q", "0", "Q" } };
2979            return &dep;
2980        }
2981    case INDEX_op_setcond_i32:
2982    case INDEX_op_setcond_i64:
2983        {
2984            static const TCGTargetOpDef setc
2985                = { .args_ct_str = { "q", "r", "re" } };
2986            return &setc;
2987        }
2988    case INDEX_op_movcond_i32:
2989    case INDEX_op_movcond_i64:
2990        {
2991            static const TCGTargetOpDef movc
2992                = { .args_ct_str = { "r", "r", "re", "r", "0" } };
2993            return &movc;
2994        }
2995    case INDEX_op_div2_i32:
2996    case INDEX_op_div2_i64:
2997    case INDEX_op_divu2_i32:
2998    case INDEX_op_divu2_i64:
2999        {
3000            static const TCGTargetOpDef div2
3001                = { .args_ct_str = { "a", "d", "0", "1", "r" } };
3002            return &div2;
3003        }
3004    case INDEX_op_mulu2_i32:
3005    case INDEX_op_mulu2_i64:
3006    case INDEX_op_muls2_i32:
3007    case INDEX_op_muls2_i64:
3008        {
3009            static const TCGTargetOpDef mul2
3010                = { .args_ct_str = { "a", "d", "a", "r" } };
3011            return &mul2;
3012        }
3013    case INDEX_op_add2_i32:
3014    case INDEX_op_add2_i64:
3015    case INDEX_op_sub2_i32:
3016    case INDEX_op_sub2_i64:
3017        {
3018            static const TCGTargetOpDef arith2
3019                = { .args_ct_str = { "r", "r", "0", "1", "re", "re" } };
3020            return &arith2;
3021        }
3022    case INDEX_op_ctz_i32:
3023    case INDEX_op_ctz_i64:
3024        {
3025            static const TCGTargetOpDef ctz[2] = {
3026                { .args_ct_str = { "&r", "r", "r" } },
3027                { .args_ct_str = { "&r", "r", "rW" } },
3028            };
3029            return &ctz[have_bmi1];
3030        }
3031    case INDEX_op_clz_i32:
3032    case INDEX_op_clz_i64:
3033        {
3034            static const TCGTargetOpDef clz[2] = {
3035                { .args_ct_str = { "&r", "r", "r" } },
3036                { .args_ct_str = { "&r", "r", "rW" } },
3037            };
3038            return &clz[have_lzcnt];
3039        }
3040
3041    case INDEX_op_qemu_ld_i32:
3042        return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_L : &r_L_L;
3043    case INDEX_op_qemu_st_i32:
3044        return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &L_L : &L_L_L;
3045    case INDEX_op_qemu_ld_i64:
3046        return (TCG_TARGET_REG_BITS == 64 ? &r_L
3047                : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_r_L
3048                : &r_r_L_L);
3049    case INDEX_op_qemu_st_i64:
3050        return (TCG_TARGET_REG_BITS == 64 ? &L_L
3051                : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &L_L_L
3052                : &L_L_L_L);
3053
3054    case INDEX_op_brcond2_i32:
3055        {
3056            static const TCGTargetOpDef b2
3057                = { .args_ct_str = { "r", "r", "ri", "ri" } };
3058            return &b2;
3059        }
3060    case INDEX_op_setcond2_i32:
3061        {
3062            static const TCGTargetOpDef s2
3063                = { .args_ct_str = { "r", "r", "r", "ri", "ri" } };
3064            return &s2;
3065        }
3066
3067    case INDEX_op_ld_vec:
3068    case INDEX_op_st_vec:
3069        return &x_r;
3070
3071    case INDEX_op_add_vec:
3072    case INDEX_op_sub_vec:
3073    case INDEX_op_mul_vec:
3074    case INDEX_op_and_vec:
3075    case INDEX_op_or_vec:
3076    case INDEX_op_xor_vec:
3077    case INDEX_op_andc_vec:
3078    case INDEX_op_ssadd_vec:
3079    case INDEX_op_usadd_vec:
3080    case INDEX_op_sssub_vec:
3081    case INDEX_op_ussub_vec:
3082    case INDEX_op_smin_vec:
3083    case INDEX_op_umin_vec:
3084    case INDEX_op_smax_vec:
3085    case INDEX_op_umax_vec:
3086    case INDEX_op_cmp_vec:
3087    case INDEX_op_x86_shufps_vec:
3088    case INDEX_op_x86_blend_vec:
3089    case INDEX_op_x86_packss_vec:
3090    case INDEX_op_x86_packus_vec:
3091    case INDEX_op_x86_vperm2i128_vec:
3092    case INDEX_op_x86_punpckl_vec:
3093    case INDEX_op_x86_punpckh_vec:
3094#if TCG_TARGET_REG_BITS == 32
3095    case INDEX_op_dup2_vec:
3096#endif
3097        return &x_x_x;
3098    case INDEX_op_dup_vec:
3099    case INDEX_op_shli_vec:
3100    case INDEX_op_shri_vec:
3101    case INDEX_op_sari_vec:
3102    case INDEX_op_x86_psrldq_vec:
3103        return &x_x;
3104    case INDEX_op_x86_vpblendvb_vec:
3105        return &x_x_x_x;
3106
3107    default:
3108        break;
3109    }
3110    return NULL;
3111}
3112
3113int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
3114{
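        /* Return 1 if the op is supported as-is, 0 if it is not supported
           at all, and -1 if it can be implemented by expansion in
           tcg_expand_vec_op.  */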
3115    switch (opc) {
3116    case INDEX_op_add_vec:
3117    case INDEX_op_sub_vec:
3118    case INDEX_op_and_vec:
3119    case INDEX_op_or_vec:
3120    case INDEX_op_xor_vec:
3121    case INDEX_op_andc_vec:
3122        return 1;
3123    case INDEX_op_cmp_vec:
3124        return -1;
3125
3126    case INDEX_op_shli_vec:
3127    case INDEX_op_shri_vec:
3128        /* We must expand the operation for MO_8.  */
3129        return vece == MO_8 ? -1 : 1;
3130
3131    case INDEX_op_sari_vec:
3132        /* We must expand the operation for MO_8.  */
3133        if (vece == MO_8) {
3134            return -1;
3135        }
3136        /* We can emulate this for MO_64, but it does not pay off
3137           unless we're producing at least 4 values.  */
3138        if (vece == MO_64) {
3139            return type >= TCG_TYPE_V256 ? -1 : 0;
3140        }
3141        return 1;
3142
3143    case INDEX_op_mul_vec:
3144        if (vece == MO_8) {
3145            /* We can expand the operation for MO_8.  */
3146            return -1;
3147        }
3148        if (vece == MO_64) {
3149            return 0;
3150        }
3151        return 1;
3152
3153    case INDEX_op_ssadd_vec:
3154    case INDEX_op_usadd_vec:
3155    case INDEX_op_sssub_vec:
3156    case INDEX_op_ussub_vec:
3157        return vece <= MO_16;
3158    case INDEX_op_smin_vec:
3159    case INDEX_op_smax_vec:
3160    case INDEX_op_umin_vec:
3161    case INDEX_op_umax_vec:
3162        return vece <= MO_32 ? 1 : -1;
3163
3164    default:
3165        return 0;
3166    }
3167}
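
/*
 * The return values above follow the middle-end contract for
 * tcg_can_emit_vec_op: 1 means the opcode is supported directly,
 * 0 means it is not supported at all, and -1 means it is supported
 * via expansion, i.e. the middle-end will call tcg_expand_vec_op
 * (below) to rewrite it in terms of supported operations.  A minimal
 * sketch of the caller's logic, for illustration only:
 *
 *     switch (tcg_can_emit_vec_op(opc, type, vece)) {
 *     case 1:  emit the host instruction directly; break;
 *     case -1: tcg_expand_vec_op(opc, type, vece, ...); break;
 *     case 0:  fall back to integer/helper code; break;
 *     }
 */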

static void expand_vec_shi(TCGType type, unsigned vece, bool shr,
                           TCGv_vec v0, TCGv_vec v1, TCGArg imm)
{
    TCGv_vec t1, t2;

    tcg_debug_assert(vece == MO_8);

    t1 = tcg_temp_new_vec(type);
    t2 = tcg_temp_new_vec(type);

    /* Unpack to W, shift, and repack.  Tricky bits:
       (1) Use punpck*bw x,x to produce DDCCBBAA,
           i.e. duplicate in other half of the 16-bit lane.
       (2) For right-shift, add 8 so that the high half of
           the lane becomes zero.  For left-shift, we must
           shift up and down again.
       (3) Step 2 leaves high half zero such that PACKUSWB
           (pack with unsigned saturation) does not modify
           the quantity.  */
    vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
              tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
    vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
              tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));

    if (shr) {
        tcg_gen_shri_vec(MO_16, t1, t1, imm + 8);
        tcg_gen_shri_vec(MO_16, t2, t2, imm + 8);
    } else {
        tcg_gen_shli_vec(MO_16, t1, t1, imm + 8);
        tcg_gen_shli_vec(MO_16, t2, t2, imm + 8);
        tcg_gen_shri_vec(MO_16, t1, t1, 8);
        tcg_gen_shri_vec(MO_16, t2, t2, 8);
    }

    vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
              tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
    tcg_temp_free_vec(t1);
    tcg_temp_free_vec(t2);
}
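
/*
 * Worked example for expand_vec_shi, with an assumed byte lane
 * b = 0x81 and imm = 1: punpcklbw b,b yields the 16-bit lane 0x8181.
 * For a right shift, 0x8181 >> (1 + 8) = 0x0040 = b >> 1 with a zero
 * high byte.  For a left shift, (0x8181 << 9) & 0xffff = 0x0200, and
 * the subsequent >> 8 yields 0x0002 = (b << 1) & 0xff, again with a
 * zero high byte.  In both cases the zero high byte ensures PACKUSWB
 * passes the low byte through without saturating.
 */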

static void expand_vec_sari(TCGType type, unsigned vece,
                            TCGv_vec v0, TCGv_vec v1, TCGArg imm)
{
    TCGv_vec t1, t2;

    switch (vece) {
    case MO_8:
        /* Unpack to W, shift, and repack, as in expand_vec_shi.  */
        t1 = tcg_temp_new_vec(type);
        t2 = tcg_temp_new_vec(type);
        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
                  tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
        tcg_gen_sari_vec(MO_16, t1, t1, imm + 8);
        tcg_gen_sari_vec(MO_16, t2, t2, imm + 8);
        vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
        tcg_temp_free_vec(t1);
        tcg_temp_free_vec(t2);
        break;

    case MO_64:
        if (imm <= 32) {
            /* We can emulate a small sign extend by performing an arithmetic
             * 32-bit shift and overwriting the high half of a 64-bit logical
             * shift (note that the ISA says shift of 32 is valid).
             */
            t1 = tcg_temp_new_vec(type);
            tcg_gen_sari_vec(MO_32, t1, v1, imm);
            tcg_gen_shri_vec(MO_64, v0, v1, imm);
            vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
                      tcgv_vec_arg(v0), tcgv_vec_arg(v0),
                      tcgv_vec_arg(t1), 0xaa);
            tcg_temp_free_vec(t1);
        } else {
            /* Otherwise we will need to use a compare vs 0 to produce
             * the sign-extend, shift and merge.
             */
            t1 = tcg_const_zeros_vec(type);
            tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1, t1, v1);
            tcg_gen_shri_vec(MO_64, v0, v1, imm);
            tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm);
            tcg_gen_or_vec(MO_64, v0, v0, t1);
            tcg_temp_free_vec(t1);
        }
        break;

    default:
        g_assert_not_reached();
    }
}
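
/*
 * Worked example for the MO_64, imm <= 32 case, with an assumed lane
 * v1 = 0xffffffff00000000 (negative) and imm = 4: the 64-bit logical
 * shift puts 0x0ffffffff0000000 in v0, and the 32-bit arithmetic
 * shift puts 0xffffffff in the high 32-bit element of t1.  The MO_32
 * blend with immediate 0xaa (10101010b) then takes the odd-numbered
 * 32-bit elements, i.e. the high half of each 64-bit lane, from t1,
 * giving 0xfffffffff0000000, exactly the 64-bit arithmetic shift.
 */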

static void expand_vec_mul(TCGType type, unsigned vece,
                           TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
{
    TCGv_vec t1, t2, t3, t4;

    tcg_debug_assert(vece == MO_8);

    /*
     * Unpack v1 bytes to words, 0 | x.
     * Unpack v2 bytes to words, y | 0.
     * This leaves the 8-bit result, x * y, with 8 bits of right padding.
     * Shift logical right by 8 bits to clear the high 8 bits of each
     * word before using an unsigned saturated pack.
     *
     * The difference between the V64, V128 and V256 cases is merely how
     * we distribute the expansion between temporaries.
     */
    switch (type) {
    case TCG_TYPE_V64:
        t1 = tcg_temp_new_vec(TCG_TYPE_V128);
        t2 = tcg_temp_new_vec(TCG_TYPE_V128);
        tcg_gen_dup16i_vec(t2, 0);
        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(t2));
        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
                  tcgv_vec_arg(t2), tcgv_vec_arg(t2), tcgv_vec_arg(v2));
        tcg_gen_mul_vec(MO_16, t1, t1, t2);
        tcg_gen_shri_vec(MO_16, t1, t1, 8);
        vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1));
        tcg_temp_free_vec(t1);
        tcg_temp_free_vec(t2);
        break;

    case TCG_TYPE_V128:
    case TCG_TYPE_V256:
        t1 = tcg_temp_new_vec(type);
        t2 = tcg_temp_new_vec(type);
        t3 = tcg_temp_new_vec(type);
        t4 = tcg_temp_new_vec(type);
        tcg_gen_dup16i_vec(t4, 0);
        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(t4));
        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
                  tcgv_vec_arg(t2), tcgv_vec_arg(t4), tcgv_vec_arg(v2));
        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
                  tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(t4));
        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
                  tcgv_vec_arg(t4), tcgv_vec_arg(t4), tcgv_vec_arg(v2));
        tcg_gen_mul_vec(MO_16, t1, t1, t2);
        tcg_gen_mul_vec(MO_16, t3, t3, t4);
        tcg_gen_shri_vec(MO_16, t1, t1, 8);
        tcg_gen_shri_vec(MO_16, t3, t3, 8);
        vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
        tcg_temp_free_vec(t1);
        tcg_temp_free_vec(t2);
        tcg_temp_free_vec(t3);
        tcg_temp_free_vec(t4);
        break;

    default:
        g_assert_not_reached();
    }
}
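
/*
 * Worked example for expand_vec_mul, with assumed bytes x = 5 and
 * y = 7: the two unpacks produce the 16-bit lanes t1 = 0x0005 (0 | x)
 * and t2 = 0x0700 (y | 0).  Their 16-bit product is 0x2300, i.e.
 * (x * y) << 8 truncated to 16 bits; shifting right by 8 leaves
 * 0x0023 = 35 with a zero high byte, which the unsigned saturating
 * pack passes through unchanged.
 */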

static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
                           TCGv_vec v1, TCGv_vec v2, TCGCond cond)
{
    enum {
        NEED_SWAP = 1,
        NEED_INV  = 2,
        NEED_BIAS = 4
    };
    static const uint8_t fixups[16] = {
        [0 ... 15] = -1,
        [TCG_COND_EQ] = 0,
        [TCG_COND_NE] = NEED_INV,
        [TCG_COND_GT] = 0,
        [TCG_COND_LT] = NEED_SWAP,
        [TCG_COND_LE] = NEED_INV,
        [TCG_COND_GE] = NEED_SWAP | NEED_INV,
        [TCG_COND_GTU] = NEED_BIAS,
        [TCG_COND_LTU] = NEED_BIAS | NEED_SWAP,
        [TCG_COND_LEU] = NEED_BIAS | NEED_INV,
        [TCG_COND_GEU] = NEED_BIAS | NEED_SWAP | NEED_INV,
    };
    TCGv_vec t1, t2;
    uint8_t fixup;

    fixup = fixups[cond & 15];
    tcg_debug_assert(fixup != 0xff);

    if (fixup & NEED_INV) {
        cond = tcg_invert_cond(cond);
    }
    if (fixup & NEED_SWAP) {
        t1 = v1, v1 = v2, v2 = t1;
        cond = tcg_swap_cond(cond);
    }

    t1 = t2 = NULL;
    if (fixup & NEED_BIAS) {
        t1 = tcg_temp_new_vec(type);
        t2 = tcg_temp_new_vec(type);
        tcg_gen_dupi_vec(vece, t2, 1ull << ((8 << vece) - 1));
        tcg_gen_sub_vec(vece, t1, v1, t2);
        tcg_gen_sub_vec(vece, t2, v2, t2);
        v1 = t1;
        v2 = t2;
        cond = tcg_signed_cond(cond);
    }

    tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
    /* Expand directly; do not recurse.  */
    vec_gen_4(INDEX_op_cmp_vec, type, vece,
              tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v2), cond);

    if (t1) {
        tcg_temp_free_vec(t1);
        if (t2) {
            tcg_temp_free_vec(t2);
        }
    }
    if (fixup & NEED_INV) {
        tcg_gen_not_vec(vece, v0, v0);
    }
}
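
/*
 * Worked example: x86 vector compares provide only EQ and signed GT,
 * so TCG_COND_LEU (NEED_BIAS | NEED_INV) is rewritten as follows.
 * NEED_INV first turns LEU into GTU; NEED_BIAS then subtracts the
 * sign bit from both operands, turning GTU into signed GT.  For MO_8
 * with assumed values v1 = 0x01, v2 = 0xff: biased, v1 = 0x81 (-127)
 * and v2 = 0x7f (+127), so GT yields false; the final NOT yields
 * true, which matches 0x01 <= 0xff unsigned.
 */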

static void expand_vec_minmax(TCGType type, unsigned vece,
                              TCGCond cond, bool min,
                              TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
{
    TCGv_vec t1 = tcg_temp_new_vec(type);

    tcg_debug_assert(vece == MO_64);

    tcg_gen_cmp_vec(cond, vece, t1, v1, v2);
    if (min) {
        TCGv_vec t2;
        t2 = v1, v1 = v2, v2 = t2;
    }
    vec_gen_4(INDEX_op_x86_vpblendvb_vec, type, vece,
              tcgv_vec_arg(v0), tcgv_vec_arg(v1),
              tcgv_vec_arg(v2), tcgv_vec_arg(t1));
    tcg_temp_free_vec(t1);
}
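
/*
 * VPBLENDVB chooses each result byte from one of its two sources
 * according to the most significant bit of the corresponding mask
 * byte.  The compare above produces all-ones or all-zeros per 64-bit
 * element, so the blend selects whole elements: as ordered here the
 * result is the maximum, and swapping v1/v2 for "min" makes it the
 * minimum.  This synthesizes 64-bit min/max, which SSE/AVX2 lack as
 * native instructions (hence the -1 for MO_64 in tcg_can_emit_vec_op
 * above).
 */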

void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
                       TCGArg a0, ...)
{
    va_list va;
    TCGArg a2;
    TCGv_vec v0, v1, v2;

    va_start(va, a0);
    v0 = temp_tcgv_vec(arg_temp(a0));
    v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
    a2 = va_arg(va, TCGArg);

    switch (opc) {
    case INDEX_op_shli_vec:
    case INDEX_op_shri_vec:
        expand_vec_shi(type, vece, opc == INDEX_op_shri_vec, v0, v1, a2);
        break;

    case INDEX_op_sari_vec:
        expand_vec_sari(type, vece, v0, v1, a2);
        break;

    case INDEX_op_mul_vec:
        v2 = temp_tcgv_vec(arg_temp(a2));
        expand_vec_mul(type, vece, v0, v1, v2);
        break;

    case INDEX_op_cmp_vec:
        v2 = temp_tcgv_vec(arg_temp(a2));
        expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg));
        break;

    case INDEX_op_smin_vec:
        v2 = temp_tcgv_vec(arg_temp(a2));
        expand_vec_minmax(type, vece, TCG_COND_GT, true, v0, v1, v2);
        break;
    case INDEX_op_smax_vec:
        v2 = temp_tcgv_vec(arg_temp(a2));
        expand_vec_minmax(type, vece, TCG_COND_GT, false, v0, v1, v2);
        break;
    case INDEX_op_umin_vec:
        v2 = temp_tcgv_vec(arg_temp(a2));
        expand_vec_minmax(type, vece, TCG_COND_GTU, true, v0, v1, v2);
        break;
    case INDEX_op_umax_vec:
        v2 = temp_tcgv_vec(arg_temp(a2));
        expand_vec_minmax(type, vece, TCG_COND_GTU, false, v0, v1, v2);
        break;

    default:
        break;
    }

    va_end(va);
}

static const int tcg_target_callee_save_regs[] = {
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_RBP,
    TCG_REG_RBX,
#if defined(_WIN64)
    TCG_REG_RDI,
    TCG_REG_RSI,
#endif
    TCG_REG_R12,
    TCG_REG_R13,
    TCG_REG_R14, /* Currently used for the global env. */
    TCG_REG_R15,
#else
    TCG_REG_EBP, /* Currently used for the global env. */
    TCG_REG_EBX,
    TCG_REG_ESI,
    TCG_REG_EDI,
#endif
};

/* Compute frame size via macros, to share between tcg_target_qemu_prologue
   and tcg_register_jit.  */

#define PUSH_SIZE \
    ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \
     * (TCG_TARGET_REG_BITS / 8))

#define FRAME_SIZE \
    ((PUSH_SIZE \
      + TCG_STATIC_CALL_ARGS_SIZE \
      + CPU_TEMP_BUF_NLONGS * sizeof(long) \
      + TCG_TARGET_STACK_ALIGN - 1) \
     & ~(TCG_TARGET_STACK_ALIGN - 1))
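
/*
 * Worked example, assuming the usual values TCG_STATIC_CALL_ARGS_SIZE
 * = 128 and CPU_TEMP_BUF_NLONGS = 128: on a 64-bit non-Windows host
 * there are six callee-saved registers, so PUSH_SIZE = (1 + 6) * 8 =
 * 56 bytes, the +1 covering the return address.  FRAME_SIZE is then
 * (56 + 128 + 128 * 8 + 15) & ~15 = 1216, i.e. 1208 rounded up to the
 * 16-byte stack alignment.
 */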

/* Generate global QEMU prologue and epilogue code */
static void tcg_target_qemu_prologue(TCGContext *s)
{
    int i, stack_addend;

    /* TB prologue */

    /* Reserve some stack space, also for TCG temps.  */
    stack_addend = FRAME_SIZE - PUSH_SIZE;
    tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
                  CPU_TEMP_BUF_NLONGS * sizeof(long));

    /* Save all callee saved registers.  */
    for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
        tcg_out_push(s, tcg_target_callee_save_regs[i]);
    }

#if TCG_TARGET_REG_BITS == 32
    tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
               (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
    tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
    /* jmp *tb.  */
    tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
                         (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
                         + stack_addend);
#else
# if !defined(CONFIG_SOFTMMU) && TCG_TARGET_REG_BITS == 64
    if (guest_base) {
        int seg = setup_guest_base_seg();
        if (seg != 0) {
            x86_guest_base_seg = seg;
        } else if (guest_base == (int32_t)guest_base) {
            x86_guest_base_offset = guest_base;
        } else {
            /* Choose R12 because, as a base, it requires a SIB byte. */
            x86_guest_base_index = TCG_REG_R12;
            tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base_index, guest_base);
            tcg_regset_set_reg(s->reserved_regs, x86_guest_base_index);
        }
    }
# endif
    tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
    tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
    /* jmp *tb.  */
    tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
#endif

    /*
     * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
     * and fall through to the rest of the epilogue.
     */
    s->code_gen_epilogue = s->code_ptr;
    tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);

    /* TB epilogue */
    tb_ret_addr = s->code_ptr;

    tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);

    if (have_avx2) {
        tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
    }
    for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
        tcg_out_pop(s, tcg_target_callee_save_regs[i]);
    }
    tcg_out_opc(s, OPC_RET, 0, 0, 0);
}
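
/*
 * Sketch of the frame built above on a 64-bit host, top of stack
 * first (offsets per the worked example after FRAME_SIZE):
 *
 *   [rsp + FRAME_SIZE - 8]            return address
 *   [rsp + FRAME_SIZE - PUSH_SIZE]    callee-saved registers
 *   [rsp + TCG_STATIC_CALL_ARGS_SIZE] TCG temporary buffer
 *   [rsp]                             outgoing helper-call arguments
 *
 * The epilogue pops this apart in reverse; tb_ret_addr above is the
 * address that exit_tb jumps back to.
 */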

static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
{
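    /* 0x90 is the one-byte x86 NOP instruction. */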
    memset(p, 0x90, count);
}

static void tcg_target_init(TCGContext *s)
{
#ifdef CONFIG_CPUID_H
    unsigned a, b, c, d, b7 = 0;
    int max = __get_cpuid_max(0, 0);

    if (max >= 7) {
        /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs.  */
        __cpuid_count(7, 0, a, b7, c, d);
        have_bmi1 = (b7 & bit_BMI) != 0;
        have_bmi2 = (b7 & bit_BMI2) != 0;
    }

    if (max >= 1) {
        __cpuid(1, a, b, c, d);
#ifndef have_cmov
        /* For 32-bit, 99% certainty that we're running on hardware that
           supports cmov, but we still need to check.  In case cmov is not
           available, we'll use a small forward branch.  */
        have_cmov = (d & bit_CMOV) != 0;
#endif

        /* MOVBE is only available on Intel Atom and Haswell CPUs, so we
           need to probe for it.  */
        have_movbe = (c & bit_MOVBE) != 0;
        have_popcnt = (c & bit_POPCNT) != 0;

        /* There are a number of things we must check before we can be
           sure of not hitting invalid opcode.  */
        if (c & bit_OSXSAVE) {
            unsigned xcrl, xcrh;
            /* The xgetbv instruction is not available to older versions of
             * the assembler, so we encode the instruction manually.
             */
            asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcrl), "=d" (xcrh) : "c" (0));
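            /* XCR0 bit 1 is SSE state, bit 2 is AVX (YMM) state; the OS
               must have enabled both before AVX instructions can be
               used, hence the (xcrl & 6) == 6 check below.  */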
            if ((xcrl & 6) == 6) {
                have_avx1 = (c & bit_AVX) != 0;
                have_avx2 = (b7 & bit_AVX2) != 0;
            }
        }
    }

    max = __get_cpuid_max(0x80000000, 0);
    if (max >= 0x80000001) {
        __cpuid(0x80000001, a, b, c, d);
        /* LZCNT was introduced with AMD Barcelona and Intel Haswell CPUs.  */
        have_lzcnt = (c & bit_LZCNT) != 0;
    }
#endif /* CONFIG_CPUID_H */

    tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
    if (TCG_TARGET_REG_BITS == 64) {
        tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
    }
    if (have_avx1) {
        tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
        tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
    }
    if (have_avx2) {
        tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
    }

    tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
    if (TCG_TARGET_REG_BITS == 64) {
#if !defined(_WIN64)
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
#endif
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
    }

    s->reserved_regs = 0;
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
}

typedef struct {
    DebugFrameHeader h;
    uint8_t fde_def_cfa[4];
    uint8_t fde_reg_ofs[14];
} DebugFrame;

/* We're expecting a 2-byte uleb128 encoded value.  */
QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
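
/*
 * The frame size is emitted in fde_def_cfa below as a two-byte
 * uleb128: the low 7 bits with the continuation bit (0x80) set, then
 * the next 7 bits.  For an assumed FRAME_SIZE of 1216 this encodes as
 * 0xc0 0x09, since 1216 = 0x40 + (9 << 7); the build-time assertion
 * above guarantees the value really fits in 14 bits.
 */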

#if !defined(__ELF__)
    /* Host machine without ELF. */
#elif TCG_TARGET_REG_BITS == 64
#define ELF_HOST_MACHINE EM_X86_64
static const DebugFrame debug_frame = {
    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
    .h.cie.id = -1,
    .h.cie.version = 1,
    .h.cie.code_align = 1,
    .h.cie.data_align = 0x78,             /* sleb128 -8 */
    .h.cie.return_column = 16,

    /* Total FDE size does not include the "len" member.  */
    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),

    .fde_def_cfa = {
        12, 7,                          /* DW_CFA_def_cfa %rsp, ... */
        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
        (FRAME_SIZE >> 7)
    },
    .fde_reg_ofs = {
        0x90, 1,                        /* DW_CFA_offset, %rip, -8 */
        /* The following ordering must match tcg_target_callee_save_regs.  */
        0x86, 2,                        /* DW_CFA_offset, %rbp, -16 */
        0x83, 3,                        /* DW_CFA_offset, %rbx, -24 */
        0x8c, 4,                        /* DW_CFA_offset, %r12, -32 */
        0x8d, 5,                        /* DW_CFA_offset, %r13, -40 */
        0x8e, 6,                        /* DW_CFA_offset, %r14, -48 */
        0x8f, 7,                        /* DW_CFA_offset, %r15, -56 */
    }
};
#else
#define ELF_HOST_MACHINE EM_386
static const DebugFrame debug_frame = {
    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
    .h.cie.id = -1,
    .h.cie.version = 1,
    .h.cie.code_align = 1,
    .h.cie.data_align = 0x7c,             /* sleb128 -4 */
    .h.cie.return_column = 8,

    /* Total FDE size does not include the "len" member.  */
    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),

    .fde_def_cfa = {
        12, 4,                          /* DW_CFA_def_cfa %esp, ... */
        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
        (FRAME_SIZE >> 7)
    },
    .fde_reg_ofs = {
        0x88, 1,                        /* DW_CFA_offset, %eip, -4 */
        /* The following ordering must match tcg_target_callee_save_regs.  */
        0x85, 2,                        /* DW_CFA_offset, %ebp, -8 */
        0x83, 3,                        /* DW_CFA_offset, %ebx, -12 */
        0x86, 4,                        /* DW_CFA_offset, %esi, -16 */
        0x87, 5,                        /* DW_CFA_offset, %edi, -20 */
    }
};
#endif

#if defined(ELF_HOST_MACHINE)
void tcg_register_jit(void *buf, size_t buf_size)
{
    tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
}
#endif