qemu/tcg/i386/tcg-target.inc.c
   1/*
   2 * Tiny Code Generator for QEMU
   3 *
   4 * Copyright (c) 2008 Fabrice Bellard
   5 *
   6 * Permission is hereby granted, free of charge, to any person obtaining a copy
   7 * of this software and associated documentation files (the "Software"), to deal
   8 * in the Software without restriction, including without limitation the rights
   9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10 * copies of the Software, and to permit persons to whom the Software is
  11 * furnished to do so, subject to the following conditions:
  12 *
  13 * The above copyright notice and this permission notice shall be included in
  14 * all copies or substantial portions of the Software.
  15 *
  16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22 * THE SOFTWARE.
  23 */
  24
  25#include "tcg-pool.inc.c"
  26
  27#ifdef CONFIG_DEBUG_TCG
  28static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
  29#if TCG_TARGET_REG_BITS == 64
  30    "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
  31#else
  32    "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
  33#endif
  34    "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
  35    "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
  36#if TCG_TARGET_REG_BITS == 64
  37    "%xmm8", "%xmm9", "%xmm10", "%xmm11",
  38    "%xmm12", "%xmm13", "%xmm14", "%xmm15",
  39#endif
  40};
  41#endif
  42
  43static const int tcg_target_reg_alloc_order[] = {
  44#if TCG_TARGET_REG_BITS == 64
  45    TCG_REG_RBP,
  46    TCG_REG_RBX,
  47    TCG_REG_R12,
  48    TCG_REG_R13,
  49    TCG_REG_R14,
  50    TCG_REG_R15,
  51    TCG_REG_R10,
  52    TCG_REG_R11,
  53    TCG_REG_R9,
  54    TCG_REG_R8,
  55    TCG_REG_RCX,
  56    TCG_REG_RDX,
  57    TCG_REG_RSI,
  58    TCG_REG_RDI,
  59    TCG_REG_RAX,
  60#else
  61    TCG_REG_EBX,
  62    TCG_REG_ESI,
  63    TCG_REG_EDI,
  64    TCG_REG_EBP,
  65    TCG_REG_ECX,
  66    TCG_REG_EDX,
  67    TCG_REG_EAX,
  68#endif
  69    TCG_REG_XMM0,
  70    TCG_REG_XMM1,
  71    TCG_REG_XMM2,
  72    TCG_REG_XMM3,
  73    TCG_REG_XMM4,
  74    TCG_REG_XMM5,
  75#ifndef _WIN64
   76    /* The Win64 ABI has xmm6-xmm15 as callee-saves, and we do not save
   77       any of them.  Therefore only allow xmm0-xmm5 to be allocated.  */
  78    TCG_REG_XMM6,
  79    TCG_REG_XMM7,
  80#if TCG_TARGET_REG_BITS == 64
  81    TCG_REG_XMM8,
  82    TCG_REG_XMM9,
  83    TCG_REG_XMM10,
  84    TCG_REG_XMM11,
  85    TCG_REG_XMM12,
  86    TCG_REG_XMM13,
  87    TCG_REG_XMM14,
  88    TCG_REG_XMM15,
  89#endif
  90#endif
  91};
  92
  93static const int tcg_target_call_iarg_regs[] = {
  94#if TCG_TARGET_REG_BITS == 64
  95#if defined(_WIN64)
  96    TCG_REG_RCX,
  97    TCG_REG_RDX,
  98#else
  99    TCG_REG_RDI,
 100    TCG_REG_RSI,
 101    TCG_REG_RDX,
 102    TCG_REG_RCX,
 103#endif
 104    TCG_REG_R8,
 105    TCG_REG_R9,
 106#else
  107    /* 32-bit mode uses a stack-based calling convention (the GCC default). */
 108#endif
 109};
 110
 111static const int tcg_target_call_oarg_regs[] = {
 112    TCG_REG_EAX,
 113#if TCG_TARGET_REG_BITS == 32
 114    TCG_REG_EDX
 115#endif
 116};
 117
 118/* Constants we accept.  */
 119#define TCG_CT_CONST_S32 0x100
 120#define TCG_CT_CONST_U32 0x200
 121#define TCG_CT_CONST_I32 0x400
 122#define TCG_CT_CONST_WSZ 0x800
 123
  124/* Registers used with the L constraint, which are the first argument
  125   registers on x86_64, and two arbitrary call-clobbered registers on
  126   i386.  */
 127#if TCG_TARGET_REG_BITS == 64
 128# define TCG_REG_L0 tcg_target_call_iarg_regs[0]
 129# define TCG_REG_L1 tcg_target_call_iarg_regs[1]
 130#else
 131# define TCG_REG_L0 TCG_REG_EAX
 132# define TCG_REG_L1 TCG_REG_EDX
 133#endif
 134
  135/* The host compiler should supply <cpuid.h> to enable runtime feature
  136   detection, as we're not going to go so far as our own inline assembly.
  137   If it is not available, default values are assumed.  */
 138#if defined(CONFIG_CPUID_H)
 139#include "qemu/cpuid.h"
 140#endif
 141
 142/* For 64-bit, we always know that CMOV is available.  */
 143#if TCG_TARGET_REG_BITS == 64
 144# define have_cmov 1
 145#elif defined(CONFIG_CPUID_H)
 146static bool have_cmov;
 147#else
 148# define have_cmov 0
 149#endif
 150
  151/* We need these symbols in tcg-target.h, and we can't properly conditionalize
  152   them there.  Therefore we always define the variables.  */
 153bool have_bmi1;
 154bool have_popcnt;
 155bool have_avx1;
 156bool have_avx2;
 157
 158#ifdef CONFIG_CPUID_H
 159static bool have_movbe;
 160static bool have_bmi2;
 161static bool have_lzcnt;
 162#else
 163# define have_movbe 0
 164# define have_bmi2 0
 165# define have_lzcnt 0
 166#endif
 167
 168static tcg_insn_unit *tb_ret_addr;
 169
 170static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
 171                        intptr_t value, intptr_t addend)
 172{
 173    value += addend;
 174    switch(type) {
 175    case R_386_PC32:
 176        value -= (uintptr_t)code_ptr;
 177        if (value != (int32_t)value) {
 178            return false;
 179        }
 180        /* FALLTHRU */
 181    case R_386_32:
 182        tcg_patch32(code_ptr, value);
 183        break;
 184    case R_386_PC8:
 185        value -= (uintptr_t)code_ptr;
 186        if (value != (int8_t)value) {
 187            return false;
 188        }
 189        tcg_patch8(code_ptr, value);
 190        break;
 191    default:
 192        tcg_abort();
 193    }
 194    return true;
 195}
 196
 197#if TCG_TARGET_REG_BITS == 64
 198#define ALL_GENERAL_REGS   0x0000ffffu
 199#define ALL_VECTOR_REGS    0xffff0000u
 200#else
 201#define ALL_GENERAL_REGS   0x000000ffu
 202#define ALL_VECTOR_REGS    0x00ff0000u
 203#endif
 204
 205/* parse target specific constraints */
 206static const char *target_parse_constraint(TCGArgConstraint *ct,
 207                                           const char *ct_str, TCGType type)
 208{
 209    switch(*ct_str++) {
 210    case 'a':
 211        ct->ct |= TCG_CT_REG;
 212        tcg_regset_set_reg(ct->u.regs, TCG_REG_EAX);
 213        break;
 214    case 'b':
 215        ct->ct |= TCG_CT_REG;
 216        tcg_regset_set_reg(ct->u.regs, TCG_REG_EBX);
 217        break;
 218    case 'c':
 219        ct->ct |= TCG_CT_REG;
 220        tcg_regset_set_reg(ct->u.regs, TCG_REG_ECX);
 221        break;
 222    case 'd':
 223        ct->ct |= TCG_CT_REG;
 224        tcg_regset_set_reg(ct->u.regs, TCG_REG_EDX);
 225        break;
 226    case 'S':
 227        ct->ct |= TCG_CT_REG;
 228        tcg_regset_set_reg(ct->u.regs, TCG_REG_ESI);
 229        break;
 230    case 'D':
 231        ct->ct |= TCG_CT_REG;
 232        tcg_regset_set_reg(ct->u.regs, TCG_REG_EDI);
 233        break;
 234    case 'q':
 235        /* A register that can be used as a byte operand.  */
 236        ct->ct |= TCG_CT_REG;
 237        ct->u.regs = TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xf;
 238        break;
 239    case 'Q':
 240        /* A register with an addressable second byte (e.g. %ah).  */
 241        ct->ct |= TCG_CT_REG;
 242        ct->u.regs = 0xf;
 243        break;
 244    case 'r':
 245        /* A general register.  */
 246        ct->ct |= TCG_CT_REG;
 247        ct->u.regs |= ALL_GENERAL_REGS;
 248        break;
 249    case 'W':
 250        /* With TZCNT/LZCNT, we can have operand-size as an input.  */
 251        ct->ct |= TCG_CT_CONST_WSZ;
 252        break;
 253    case 'x':
 254        /* A vector register.  */
 255        ct->ct |= TCG_CT_REG;
 256        ct->u.regs |= ALL_VECTOR_REGS;
 257        break;
 258
 259        /* qemu_ld/st address constraint */
 260    case 'L':
 261        ct->ct |= TCG_CT_REG;
 262        ct->u.regs = TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xff;
 263        tcg_regset_reset_reg(ct->u.regs, TCG_REG_L0);
 264        tcg_regset_reset_reg(ct->u.regs, TCG_REG_L1);
 265        break;
 266
 267    case 'e':
 268        ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_S32);
 269        break;
 270    case 'Z':
 271        ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_U32);
 272        break;
 273    case 'I':
 274        ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_I32);
 275        break;
 276
 277    default:
 278        return NULL;
 279    }
 280    return ct_str;
 281}
 282
 283/* test if a constant matches the constraint */
 284static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
 285                                         const TCGArgConstraint *arg_ct)
 286{
 287    int ct = arg_ct->ct;
 288    if (ct & TCG_CT_CONST) {
 289        return 1;
 290    }
 291    if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
 292        return 1;
 293    }
 294    if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
 295        return 1;
 296    }
 297    if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
 298        return 1;
 299    }
 300    if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
 301        return 1;
 302    }
 303    return 0;
 304}
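/* Editor's note (illustrative, not from the original source): for a 64-bit
   operand, TCG_CT_CONST_S32 ('e') matches values that fit in a sign-extended
   32-bit immediate (e.g. -0x80000000), TCG_CT_CONST_U32 ('Z') matches values
   that fit zero-extended (e.g. 0xffffffff), and TCG_CT_CONST_I32 ('I')
   matches values whose bitwise complement fits sign-extended
   (e.g. ~0xff = 0xffffffffffffff00).  */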
 305
 306# define LOWREGMASK(x)  ((x) & 7)
 307
 308#define P_EXT           0x100           /* 0x0f opcode prefix */
 309#define P_EXT38         0x200           /* 0x0f 0x38 opcode prefix */
 310#define P_DATA16        0x400           /* 0x66 opcode prefix */
 311#if TCG_TARGET_REG_BITS == 64
 312# define P_REXW         0x1000          /* Set REX.W = 1 */
 313# define P_REXB_R       0x2000          /* REG field as byte register */
 314# define P_REXB_RM      0x4000          /* R/M field as byte register */
 315# define P_GS           0x8000          /* gs segment override */
 316#else
 317# define P_REXW         0
 318# define P_REXB_R       0
 319# define P_REXB_RM      0
 320# define P_GS           0
 321#endif
 322#define P_EXT3A         0x10000         /* 0x0f 0x3a opcode prefix */
 323#define P_SIMDF3        0x20000         /* 0xf3 opcode prefix */
 324#define P_SIMDF2        0x40000         /* 0xf2 opcode prefix */
 325#define P_VEXL          0x80000         /* Set VEX.L = 1 */
 326
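/* Editor's note (illustrative, not from the original source): each opcode
   constant below combines the raw opcode byte with the prefix flags above.
   For example, OPC_MOVZBL = 0xb6 | P_EXT is emitted as "0f b6", and an
   opcode marked P_DATA16 | P_EXT38 is emitted as "66 0f 38 <op>", with any
   REX or VEX prefix inserted as required by the operands.  */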
 327#define OPC_ARITH_EvIz  (0x81)
 328#define OPC_ARITH_EvIb  (0x83)
 329#define OPC_ARITH_GvEv  (0x03)          /* ... plus (ARITH_FOO << 3) */
 330#define OPC_ANDN        (0xf2 | P_EXT38)
 331#define OPC_ADD_GvEv    (OPC_ARITH_GvEv | (ARITH_ADD << 3))
 332#define OPC_AND_GvEv    (OPC_ARITH_GvEv | (ARITH_AND << 3))
 333#define OPC_BLENDPS     (0x0c | P_EXT3A | P_DATA16)
 334#define OPC_BSF         (0xbc | P_EXT)
 335#define OPC_BSR         (0xbd | P_EXT)
 336#define OPC_BSWAP       (0xc8 | P_EXT)
 337#define OPC_CALL_Jz     (0xe8)
 338#define OPC_CMOVCC      (0x40 | P_EXT)  /* ... plus condition code */
 339#define OPC_CMP_GvEv    (OPC_ARITH_GvEv | (ARITH_CMP << 3))
 340#define OPC_DEC_r32     (0x48)
 341#define OPC_IMUL_GvEv   (0xaf | P_EXT)
 342#define OPC_IMUL_GvEvIb (0x6b)
 343#define OPC_IMUL_GvEvIz (0x69)
 344#define OPC_INC_r32     (0x40)
 345#define OPC_JCC_long    (0x80 | P_EXT)  /* ... plus condition code */
 346#define OPC_JCC_short   (0x70)          /* ... plus condition code */
 347#define OPC_JMP_long    (0xe9)
 348#define OPC_JMP_short   (0xeb)
 349#define OPC_LEA         (0x8d)
 350#define OPC_LZCNT       (0xbd | P_EXT | P_SIMDF3)
 351#define OPC_MOVB_EvGv   (0x88)          /* stores, more or less */
 352#define OPC_MOVL_EvGv   (0x89)          /* stores, more or less */
 353#define OPC_MOVL_GvEv   (0x8b)          /* loads, more or less */
 354#define OPC_MOVB_EvIz   (0xc6)
 355#define OPC_MOVL_EvIz   (0xc7)
 356#define OPC_MOVL_Iv     (0xb8)
 357#define OPC_MOVBE_GyMy  (0xf0 | P_EXT38)
 358#define OPC_MOVBE_MyGy  (0xf1 | P_EXT38)
 359#define OPC_MOVD_VyEy   (0x6e | P_EXT | P_DATA16)
 360#define OPC_MOVD_EyVy   (0x7e | P_EXT | P_DATA16)
 361#define OPC_MOVDDUP     (0x12 | P_EXT | P_SIMDF2)
 362#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
 363#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
 364#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
 365#define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3)
 366#define OPC_MOVQ_VqWq   (0x7e | P_EXT | P_SIMDF3)
 367#define OPC_MOVQ_WqVq   (0xd6 | P_EXT | P_DATA16)
 368#define OPC_MOVSBL      (0xbe | P_EXT)
 369#define OPC_MOVSWL      (0xbf | P_EXT)
 370#define OPC_MOVSLQ      (0x63 | P_REXW)
 371#define OPC_MOVZBL      (0xb6 | P_EXT)
 372#define OPC_MOVZWL      (0xb7 | P_EXT)
 373#define OPC_PABSB       (0x1c | P_EXT38 | P_DATA16)
 374#define OPC_PABSW       (0x1d | P_EXT38 | P_DATA16)
 375#define OPC_PABSD       (0x1e | P_EXT38 | P_DATA16)
 376#define OPC_PACKSSDW    (0x6b | P_EXT | P_DATA16)
 377#define OPC_PACKSSWB    (0x63 | P_EXT | P_DATA16)
 378#define OPC_PACKUSDW    (0x2b | P_EXT38 | P_DATA16)
 379#define OPC_PACKUSWB    (0x67 | P_EXT | P_DATA16)
 380#define OPC_PADDB       (0xfc | P_EXT | P_DATA16)
 381#define OPC_PADDW       (0xfd | P_EXT | P_DATA16)
 382#define OPC_PADDD       (0xfe | P_EXT | P_DATA16)
 383#define OPC_PADDQ       (0xd4 | P_EXT | P_DATA16)
 384#define OPC_PADDSB      (0xec | P_EXT | P_DATA16)
 385#define OPC_PADDSW      (0xed | P_EXT | P_DATA16)
 386#define OPC_PADDUB      (0xdc | P_EXT | P_DATA16)
 387#define OPC_PADDUW      (0xdd | P_EXT | P_DATA16)
 388#define OPC_PAND        (0xdb | P_EXT | P_DATA16)
 389#define OPC_PANDN       (0xdf | P_EXT | P_DATA16)
 390#define OPC_PBLENDW     (0x0e | P_EXT3A | P_DATA16)
 391#define OPC_PCMPEQB     (0x74 | P_EXT | P_DATA16)
 392#define OPC_PCMPEQW     (0x75 | P_EXT | P_DATA16)
 393#define OPC_PCMPEQD     (0x76 | P_EXT | P_DATA16)
 394#define OPC_PCMPEQQ     (0x29 | P_EXT38 | P_DATA16)
 395#define OPC_PCMPGTB     (0x64 | P_EXT | P_DATA16)
 396#define OPC_PCMPGTW     (0x65 | P_EXT | P_DATA16)
 397#define OPC_PCMPGTD     (0x66 | P_EXT | P_DATA16)
 398#define OPC_PCMPGTQ     (0x37 | P_EXT38 | P_DATA16)
 399#define OPC_PMAXSB      (0x3c | P_EXT38 | P_DATA16)
 400#define OPC_PMAXSW      (0xee | P_EXT | P_DATA16)
 401#define OPC_PMAXSD      (0x3d | P_EXT38 | P_DATA16)
 402#define OPC_PMAXUB      (0xde | P_EXT | P_DATA16)
 403#define OPC_PMAXUW      (0x3e | P_EXT38 | P_DATA16)
 404#define OPC_PMAXUD      (0x3f | P_EXT38 | P_DATA16)
 405#define OPC_PMINSB      (0x38 | P_EXT38 | P_DATA16)
 406#define OPC_PMINSW      (0xea | P_EXT | P_DATA16)
 407#define OPC_PMINSD      (0x39 | P_EXT38 | P_DATA16)
 408#define OPC_PMINUB      (0xda | P_EXT | P_DATA16)
 409#define OPC_PMINUW      (0x3a | P_EXT38 | P_DATA16)
 410#define OPC_PMINUD      (0x3b | P_EXT38 | P_DATA16)
 411#define OPC_PMOVSXBW    (0x20 | P_EXT38 | P_DATA16)
 412#define OPC_PMOVSXWD    (0x23 | P_EXT38 | P_DATA16)
 413#define OPC_PMOVSXDQ    (0x25 | P_EXT38 | P_DATA16)
 414#define OPC_PMOVZXBW    (0x30 | P_EXT38 | P_DATA16)
 415#define OPC_PMOVZXWD    (0x33 | P_EXT38 | P_DATA16)
 416#define OPC_PMOVZXDQ    (0x35 | P_EXT38 | P_DATA16)
 417#define OPC_PMULLW      (0xd5 | P_EXT | P_DATA16)
 418#define OPC_PMULLD      (0x40 | P_EXT38 | P_DATA16)
 419#define OPC_POR         (0xeb | P_EXT | P_DATA16)
 420#define OPC_PSHUFB      (0x00 | P_EXT38 | P_DATA16)
 421#define OPC_PSHUFD      (0x70 | P_EXT | P_DATA16)
 422#define OPC_PSHUFLW     (0x70 | P_EXT | P_SIMDF2)
 423#define OPC_PSHUFHW     (0x70 | P_EXT | P_SIMDF3)
 424#define OPC_PSHIFTW_Ib  (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
 425#define OPC_PSHIFTD_Ib  (0x72 | P_EXT | P_DATA16) /* /2 /6 /4 */
 426#define OPC_PSHIFTQ_Ib  (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
 427#define OPC_PSLLW       (0xf1 | P_EXT | P_DATA16)
 428#define OPC_PSLLD       (0xf2 | P_EXT | P_DATA16)
 429#define OPC_PSLLQ       (0xf3 | P_EXT | P_DATA16)
 430#define OPC_PSRAW       (0xe1 | P_EXT | P_DATA16)
 431#define OPC_PSRAD       (0xe2 | P_EXT | P_DATA16)
 432#define OPC_PSRLW       (0xd1 | P_EXT | P_DATA16)
 433#define OPC_PSRLD       (0xd2 | P_EXT | P_DATA16)
 434#define OPC_PSRLQ       (0xd3 | P_EXT | P_DATA16)
 435#define OPC_PSUBB       (0xf8 | P_EXT | P_DATA16)
 436#define OPC_PSUBW       (0xf9 | P_EXT | P_DATA16)
 437#define OPC_PSUBD       (0xfa | P_EXT | P_DATA16)
 438#define OPC_PSUBQ       (0xfb | P_EXT | P_DATA16)
 439#define OPC_PSUBSB      (0xe8 | P_EXT | P_DATA16)
 440#define OPC_PSUBSW      (0xe9 | P_EXT | P_DATA16)
 441#define OPC_PSUBUB      (0xd8 | P_EXT | P_DATA16)
 442#define OPC_PSUBUW      (0xd9 | P_EXT | P_DATA16)
 443#define OPC_PUNPCKLBW   (0x60 | P_EXT | P_DATA16)
 444#define OPC_PUNPCKLWD   (0x61 | P_EXT | P_DATA16)
 445#define OPC_PUNPCKLDQ   (0x62 | P_EXT | P_DATA16)
 446#define OPC_PUNPCKLQDQ  (0x6c | P_EXT | P_DATA16)
 447#define OPC_PUNPCKHBW   (0x68 | P_EXT | P_DATA16)
 448#define OPC_PUNPCKHWD   (0x69 | P_EXT | P_DATA16)
 449#define OPC_PUNPCKHDQ   (0x6a | P_EXT | P_DATA16)
 450#define OPC_PUNPCKHQDQ  (0x6d | P_EXT | P_DATA16)
 451#define OPC_PXOR        (0xef | P_EXT | P_DATA16)
 452#define OPC_POP_r32     (0x58)
 453#define OPC_POPCNT      (0xb8 | P_EXT | P_SIMDF3)
 454#define OPC_PUSH_r32    (0x50)
 455#define OPC_PUSH_Iv     (0x68)
 456#define OPC_PUSH_Ib     (0x6a)
 457#define OPC_RET         (0xc3)
 458#define OPC_SETCC       (0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
 459#define OPC_SHIFT_1     (0xd1)
 460#define OPC_SHIFT_Ib    (0xc1)
 461#define OPC_SHIFT_cl    (0xd3)
 462#define OPC_SARX        (0xf7 | P_EXT38 | P_SIMDF3)
 463#define OPC_SHUFPS      (0xc6 | P_EXT)
 464#define OPC_SHLX        (0xf7 | P_EXT38 | P_DATA16)
 465#define OPC_SHRX        (0xf7 | P_EXT38 | P_SIMDF2)
 466#define OPC_SHRD_Ib     (0xac | P_EXT)
 467#define OPC_TESTL       (0x85)
 468#define OPC_TZCNT       (0xbc | P_EXT | P_SIMDF3)
 469#define OPC_UD2         (0x0b | P_EXT)
 470#define OPC_VPBLENDD    (0x02 | P_EXT3A | P_DATA16)
 471#define OPC_VPBLENDVB   (0x4c | P_EXT3A | P_DATA16)
 472#define OPC_VPINSRB     (0x20 | P_EXT3A | P_DATA16)
 473#define OPC_VPINSRW     (0xc4 | P_EXT | P_DATA16)
 474#define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16)
 475#define OPC_VBROADCASTSD (0x19 | P_EXT38 | P_DATA16)
 476#define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16)
 477#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
 478#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
 479#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
 480#define OPC_VPERMQ      (0x00 | P_EXT3A | P_DATA16 | P_REXW)
 481#define OPC_VPERM2I128  (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
 482#define OPC_VPSLLVD     (0x47 | P_EXT38 | P_DATA16)
 483#define OPC_VPSLLVQ     (0x47 | P_EXT38 | P_DATA16 | P_REXW)
 484#define OPC_VPSRAVD     (0x46 | P_EXT38 | P_DATA16)
 485#define OPC_VPSRLVD     (0x45 | P_EXT38 | P_DATA16)
 486#define OPC_VPSRLVQ     (0x45 | P_EXT38 | P_DATA16 | P_REXW)
 487#define OPC_VZEROUPPER  (0x77 | P_EXT)
 488#define OPC_XCHG_ax_r32 (0x90)
 489
 490#define OPC_GRP3_Ev     (0xf7)
 491#define OPC_GRP5        (0xff)
 492#define OPC_GRP14       (0x73 | P_EXT | P_DATA16)
 493
 494/* Group 1 opcode extensions for 0x80-0x83.
 495   These are also used as modifiers for OPC_ARITH.  */
 496#define ARITH_ADD 0
 497#define ARITH_OR  1
 498#define ARITH_ADC 2
 499#define ARITH_SBB 3
 500#define ARITH_AND 4
 501#define ARITH_SUB 5
 502#define ARITH_XOR 6
 503#define ARITH_CMP 7
 504
 505/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3.  */
 506#define SHIFT_ROL 0
 507#define SHIFT_ROR 1
 508#define SHIFT_SHL 4
 509#define SHIFT_SHR 5
 510#define SHIFT_SAR 7
 511
 512/* Group 3 opcode extensions for 0xf6, 0xf7.  To be used with OPC_GRP3.  */
 513#define EXT3_NOT   2
 514#define EXT3_NEG   3
 515#define EXT3_MUL   4
 516#define EXT3_IMUL  5
 517#define EXT3_DIV   6
 518#define EXT3_IDIV  7
 519
 520/* Group 5 opcode extensions for 0xff.  To be used with OPC_GRP5.  */
 521#define EXT5_INC_Ev     0
 522#define EXT5_DEC_Ev     1
 523#define EXT5_CALLN_Ev   2
 524#define EXT5_JMPN_Ev    4
 525
 526/* Condition codes to be added to OPC_JCC_{long,short}.  */
 527#define JCC_JMP (-1)
 528#define JCC_JO  0x0
 529#define JCC_JNO 0x1
 530#define JCC_JB  0x2
 531#define JCC_JAE 0x3
 532#define JCC_JE  0x4
 533#define JCC_JNE 0x5
 534#define JCC_JBE 0x6
 535#define JCC_JA  0x7
 536#define JCC_JS  0x8
 537#define JCC_JNS 0x9
 538#define JCC_JP  0xa
 539#define JCC_JNP 0xb
 540#define JCC_JL  0xc
 541#define JCC_JGE 0xd
 542#define JCC_JLE 0xe
 543#define JCC_JG  0xf
 544
 545static const uint8_t tcg_cond_to_jcc[] = {
 546    [TCG_COND_EQ] = JCC_JE,
 547    [TCG_COND_NE] = JCC_JNE,
 548    [TCG_COND_LT] = JCC_JL,
 549    [TCG_COND_GE] = JCC_JGE,
 550    [TCG_COND_LE] = JCC_JLE,
 551    [TCG_COND_GT] = JCC_JG,
 552    [TCG_COND_LTU] = JCC_JB,
 553    [TCG_COND_GEU] = JCC_JAE,
 554    [TCG_COND_LEU] = JCC_JBE,
 555    [TCG_COND_GTU] = JCC_JA,
 556};
 557
 558#if TCG_TARGET_REG_BITS == 64
 559static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
 560{
 561    int rex;
 562
 563    if (opc & P_GS) {
 564        tcg_out8(s, 0x65);
 565    }
 566    if (opc & P_DATA16) {
   567        /* We should never be asking for both 16-bit and 64-bit operation.  */
 568        tcg_debug_assert((opc & P_REXW) == 0);
 569        tcg_out8(s, 0x66);
 570    }
 571    if (opc & P_SIMDF3) {
 572        tcg_out8(s, 0xf3);
 573    } else if (opc & P_SIMDF2) {
 574        tcg_out8(s, 0xf2);
 575    }
 576
 577    rex = 0;
 578    rex |= (opc & P_REXW) ? 0x8 : 0x0;  /* REX.W */
 579    rex |= (r & 8) >> 1;                /* REX.R */
 580    rex |= (x & 8) >> 2;                /* REX.X */
 581    rex |= (rm & 8) >> 3;               /* REX.B */
 582
 583    /* P_REXB_{R,RM} indicates that the given register is the low byte.
 584       For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
 585       as otherwise the encoding indicates %[abcd]h.  Note that the values
 586       that are ORed in merely indicate that the REX byte must be present;
 587       those bits get discarded in output.  */
 588    rex |= opc & (r >= 4 ? P_REXB_R : 0);
 589    rex |= opc & (rm >= 4 ? P_REXB_RM : 0);
 590
 591    if (rex) {
 592        tcg_out8(s, (uint8_t)(rex | 0x40));
 593    }
 594
 595    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
 596        tcg_out8(s, 0x0f);
 597        if (opc & P_EXT38) {
 598            tcg_out8(s, 0x38);
 599        } else if (opc & P_EXT3A) {
 600            tcg_out8(s, 0x3a);
 601        }
 602    }
 603
 604    tcg_out8(s, opc);
 605}
 606#else
 607static void tcg_out_opc(TCGContext *s, int opc)
 608{
 609    if (opc & P_DATA16) {
 610        tcg_out8(s, 0x66);
 611    }
 612    if (opc & P_SIMDF3) {
 613        tcg_out8(s, 0xf3);
 614    } else if (opc & P_SIMDF2) {
 615        tcg_out8(s, 0xf2);
 616    }
 617    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
 618        tcg_out8(s, 0x0f);
 619        if (opc & P_EXT38) {
 620            tcg_out8(s, 0x38);
 621        } else if (opc & P_EXT3A) {
 622            tcg_out8(s, 0x3a);
 623        }
 624    }
 625    tcg_out8(s, opc);
 626}
  627/* Discard the register arguments to tcg_out_opc early, so as not to penalize
  628   the 32-bit compilation paths.  This macro works with all versions of gcc,
  629   whereas relying on the optimizer to eliminate the unused arguments may not.  */
 630#define tcg_out_opc(s, opc, r, rm, x)  (tcg_out_opc)(s, opc)
 631#endif
 632
 633static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
 634{
 635    tcg_out_opc(s, opc, r, rm, 0);
 636    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
 637}
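/* Worked example (editor's sketch, not part of the original source):
   tcg_out_modrm(s, OPC_MOVL_GvEv | P_REXW, TCG_REG_RAX, TCG_REG_RBX)
   emits 48 8b c3, i.e. "movq %rbx, %rax": a REX.W prefix from P_REXW,
   the opcode byte 0x8b, and ModRM 0xc0 | (0 << 3) | 3 for the
   register-direct form.  */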
 638
 639static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
 640                            int rm, int index)
 641{
 642    int tmp;
 643
 644    /* Use the two byte form if possible, which cannot encode
 645       VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT.  */
 646    if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_REXW)) == P_EXT
 647        && ((rm | index) & 8) == 0) {
 648        /* Two byte VEX prefix.  */
 649        tcg_out8(s, 0xc5);
 650
 651        tmp = (r & 8 ? 0 : 0x80);              /* VEX.R */
 652    } else {
 653        /* Three byte VEX prefix.  */
 654        tcg_out8(s, 0xc4);
 655
 656        /* VEX.m-mmmm */
 657        if (opc & P_EXT3A) {
 658            tmp = 3;
 659        } else if (opc & P_EXT38) {
 660            tmp = 2;
 661        } else if (opc & P_EXT) {
 662            tmp = 1;
 663        } else {
 664            g_assert_not_reached();
 665        }
 666        tmp |= (r & 8 ? 0 : 0x80);             /* VEX.R */
 667        tmp |= (index & 8 ? 0 : 0x40);         /* VEX.X */
 668        tmp |= (rm & 8 ? 0 : 0x20);            /* VEX.B */
 669        tcg_out8(s, tmp);
 670
 671        tmp = (opc & P_REXW ? 0x80 : 0);       /* VEX.W */
 672    }
 673
 674    tmp |= (opc & P_VEXL ? 0x04 : 0);      /* VEX.L */
 675    /* VEX.pp */
 676    if (opc & P_DATA16) {
 677        tmp |= 1;                          /* 0x66 */
 678    } else if (opc & P_SIMDF3) {
 679        tmp |= 2;                          /* 0xf3 */
 680    } else if (opc & P_SIMDF2) {
 681        tmp |= 3;                          /* 0xf2 */
 682    }
 683    tmp |= (~v & 15) << 3;                 /* VEX.vvvv */
 684    tcg_out8(s, tmp);
 685    tcg_out8(s, opc);
 686}
 687
 688static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
 689{
 690    tcg_out_vex_opc(s, opc, r, v, rm, 0);
 691    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
 692}
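/* Worked example (editor's sketch, not part of the original source):
   tcg_out_vex_modrm(s, OPC_PXOR, TCG_REG_XMM0, TCG_REG_XMM1, TCG_REG_XMM2)
   can use the two-byte VEX form and emits c5 f1 ef c2, i.e.
   "vpxor %xmm2, %xmm1, %xmm0": VEX.R set, VEX.vvvv = ~1, pp = 01 for the
   0x66 prefix implied by P_DATA16, then the opcode and ModRM bytes.  */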
 693
 694/* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
  695   We handle either RM or INDEX missing with a negative value.  In 64-bit
 696   mode for absolute addresses, ~RM is the size of the immediate operand
 697   that will follow the instruction.  */
 698
 699static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
 700                               int shift, intptr_t offset)
 701{
 702    int mod, len;
 703
 704    if (index < 0 && rm < 0) {
 705        if (TCG_TARGET_REG_BITS == 64) {
 706            /* Try for a rip-relative addressing mode.  This has replaced
 707               the 32-bit-mode absolute addressing encoding.  */
 708            intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
 709            intptr_t disp = offset - pc;
 710            if (disp == (int32_t)disp) {
 711                tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
 712                tcg_out32(s, disp);
 713                return;
 714            }
 715
 716            /* Try for an absolute address encoding.  This requires the
 717               use of the MODRM+SIB encoding and is therefore larger than
 718               rip-relative addressing.  */
 719            if (offset == (int32_t)offset) {
 720                tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
 721                tcg_out8(s, (4 << 3) | 5);
 722                tcg_out32(s, offset);
 723                return;
 724            }
 725
 726            /* ??? The memory isn't directly addressable.  */
 727            g_assert_not_reached();
 728        } else {
 729            /* Absolute address.  */
 730            tcg_out8(s, (r << 3) | 5);
 731            tcg_out32(s, offset);
 732            return;
 733        }
 734    }
 735
 736    /* Find the length of the immediate addend.  Note that the encoding
 737       that would be used for (%ebp) indicates absolute addressing.  */
 738    if (rm < 0) {
 739        mod = 0, len = 4, rm = 5;
 740    } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
 741        mod = 0, len = 0;
 742    } else if (offset == (int8_t)offset) {
 743        mod = 0x40, len = 1;
 744    } else {
 745        mod = 0x80, len = 4;
 746    }
 747
 748    /* Use a single byte MODRM format if possible.  Note that the encoding
 749       that would be used for %esp is the escape to the two byte form.  */
 750    if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
 751        /* Single byte MODRM format.  */
 752        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
 753    } else {
 754        /* Two byte MODRM+SIB format.  */
 755
 756        /* Note that the encoding that would place %esp into the index
 757           field indicates no index register.  In 64-bit mode, the REX.X
 758           bit counts, so %r12 can be used as the index.  */
 759        if (index < 0) {
 760            index = 4;
 761        } else {
 762            tcg_debug_assert(index != TCG_REG_ESP);
 763        }
 764
 765        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
 766        tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
 767    }
 768
 769    if (len == 1) {
 770        tcg_out8(s, offset);
 771    } else if (len == 4) {
 772        tcg_out32(s, offset);
 773    }
 774}
 775
 776static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
 777                                     int index, int shift, intptr_t offset)
 778{
 779    tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
 780    tcg_out_sib_offset(s, r, rm, index, shift, offset);
 781}
 782
 783static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
 784                                         int rm, int index, int shift,
 785                                         intptr_t offset)
 786{
 787    tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
 788    tcg_out_sib_offset(s, r, rm, index, shift, offset);
 789}
 790
 791/* A simplification of the above with no index or shift.  */
 792static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
 793                                        int rm, intptr_t offset)
 794{
 795    tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
 796}
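/* Worked example (editor's sketch, not part of the original source):
   tcg_out_modrm_offset(s, OPC_MOVL_GvEv, TCG_REG_EAX, TCG_REG_EBP, 8)
   emits 8b 45 08, i.e. "movl 8(%ebp), %eax" on i386: the offset fits in
   a signed byte, so mod = 0x40 with a one-byte displacement, and %ebp as
   the base needs no SIB byte.  */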
 797
 798static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
 799                                            int v, int rm, intptr_t offset)
 800{
 801    tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
 802}
 803
 804/* Output an opcode with an expected reference to the constant pool.  */
 805static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
 806{
 807    tcg_out_opc(s, opc, r, 0, 0);
 808    /* Absolute for 32-bit, pc-relative for 64-bit.  */
 809    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
 810    tcg_out32(s, 0);
 811}
 812
 813/* Output an opcode with an expected reference to the constant pool.  */
 814static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
 815{
 816    tcg_out_vex_opc(s, opc, r, 0, 0, 0);
 817    /* Absolute for 32-bit, pc-relative for 64-bit.  */
 818    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
 819    tcg_out32(s, 0);
 820}
 821
 822/* Generate dest op= src.  Uses the same ARITH_* codes as tgen_arithi.  */
 823static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
 824{
 825    /* Propagate an opcode prefix, such as P_REXW.  */
 826    int ext = subop & ~0x7;
 827    subop &= 0x7;
 828
 829    tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
 830}
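/* Worked example (editor's sketch, not part of the original source):
   tgen_arithr(s, ARITH_SUB + P_REXW, TCG_REG_RAX, TCG_REG_RBX) folds the
   group-1 code into the opcode: 0x03 + (ARITH_SUB << 3) = 0x2b, so the
   output is 48 2b c3, i.e. "subq %rbx, %rax".  */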
 831
 832static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
 833{
 834    int rexw = 0;
 835
 836    if (arg == ret) {
 837        return true;
 838    }
 839    switch (type) {
 840    case TCG_TYPE_I64:
 841        rexw = P_REXW;
 842        /* fallthru */
 843    case TCG_TYPE_I32:
 844        if (ret < 16) {
 845            if (arg < 16) {
 846                tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
 847            } else {
 848                tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
 849            }
 850        } else {
 851            if (arg < 16) {
 852                tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
 853            } else {
 854                tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
 855            }
 856        }
 857        break;
 858
 859    case TCG_TYPE_V64:
 860        tcg_debug_assert(ret >= 16 && arg >= 16);
 861        tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
 862        break;
 863    case TCG_TYPE_V128:
 864        tcg_debug_assert(ret >= 16 && arg >= 16);
 865        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
 866        break;
 867    case TCG_TYPE_V256:
 868        tcg_debug_assert(ret >= 16 && arg >= 16);
 869        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
 870        break;
 871
 872    default:
 873        g_assert_not_reached();
 874    }
 875    return true;
 876}
 877
 878static const int avx2_dup_insn[4] = {
 879    OPC_VPBROADCASTB, OPC_VPBROADCASTW,
 880    OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
 881};
 882
 883static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
 884                            TCGReg r, TCGReg a)
 885{
 886    if (have_avx2) {
 887        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
 888        tcg_out_vex_modrm(s, avx2_dup_insn[vece] + vex_l, r, 0, a);
 889    } else {
 890        switch (vece) {
 891        case MO_8:
 892            /* ??? With zero in a register, use PSHUFB.  */
 893            tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a);
 894            a = r;
 895            /* FALLTHRU */
 896        case MO_16:
 897            tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a);
 898            a = r;
 899            /* FALLTHRU */
 900        case MO_32:
 901            tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
 902            /* imm8 operand: all output lanes selected from input lane 0.  */
 903            tcg_out8(s, 0);
 904            break;
 905        case MO_64:
 906            tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a);
 907            break;
 908        default:
 909            g_assert_not_reached();
 910        }
 911    }
 912    return true;
 913}
 914
 915static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
 916                             TCGReg r, TCGReg base, intptr_t offset)
 917{
 918    if (have_avx2) {
 919        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
 920        tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l,
 921                                 r, 0, base, offset);
 922    } else {
 923        switch (vece) {
 924        case MO_64:
 925            tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset);
 926            break;
 927        case MO_32:
 928            tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset);
 929            break;
 930        case MO_16:
 931            tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset);
 932            tcg_out8(s, 0); /* imm8 */
 933            tcg_out_dup_vec(s, type, vece, r, r);
 934            break;
 935        case MO_8:
 936            tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset);
 937            tcg_out8(s, 0); /* imm8 */
 938            tcg_out_dup_vec(s, type, vece, r, r);
 939            break;
 940        default:
 941            g_assert_not_reached();
 942        }
 943    }
 944    return true;
 945}
 946
 947static void tcg_out_dupi_vec(TCGContext *s, TCGType type,
 948                             TCGReg ret, tcg_target_long arg)
 949{
 950    int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
 951
 952    if (arg == 0) {
 953        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
 954        return;
 955    }
 956    if (arg == -1) {
 957        tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
 958        return;
 959    }
 960
 961    if (TCG_TARGET_REG_BITS == 64) {
 962        if (type == TCG_TYPE_V64) {
 963            tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
 964        } else if (have_avx2) {
 965            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
 966        } else {
 967            tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
 968        }
 969        new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
 970    } else {
 971        if (have_avx2) {
 972            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTW + vex_l, ret);
 973        } else {
 974            tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret);
 975        }
 976        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
 977    }
 978}
 979
 980static void tcg_out_movi(TCGContext *s, TCGType type,
 981                         TCGReg ret, tcg_target_long arg)
 982{
 983    tcg_target_long diff;
 984
 985    switch (type) {
 986    case TCG_TYPE_I32:
 987#if TCG_TARGET_REG_BITS == 64
 988    case TCG_TYPE_I64:
 989#endif
 990        if (ret < 16) {
 991            break;
 992        }
 993        /* fallthru */
 994    case TCG_TYPE_V64:
 995    case TCG_TYPE_V128:
 996    case TCG_TYPE_V256:
 997        tcg_debug_assert(ret >= 16);
 998        tcg_out_dupi_vec(s, type, ret, arg);
 999        return;
1000    default:
1001        g_assert_not_reached();
1002    }
1003
1004    if (arg == 0) {
1005        tgen_arithr(s, ARITH_XOR, ret, ret);
1006        return;
1007    }
1008    if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
1009        tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
1010        tcg_out32(s, arg);
1011        return;
1012    }
1013    if (arg == (int32_t)arg) {
1014        tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
1015        tcg_out32(s, arg);
1016        return;
1017    }
1018
1019    /* Try a 7 byte pc-relative lea before the 10 byte movq.  */
1020    diff = arg - ((uintptr_t)s->code_ptr + 7);
1021    if (diff == (int32_t)diff) {
1022        tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
1023        tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
1024        tcg_out32(s, diff);
1025        return;
1026    }
1027
1028    tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
1029    tcg_out64(s, arg);
1030}
1031
1032static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
1033{
1034    if (val == (int8_t)val) {
1035        tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
1036        tcg_out8(s, val);
1037    } else if (val == (int32_t)val) {
1038        tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
1039        tcg_out32(s, val);
1040    } else {
1041        tcg_abort();
1042    }
1043}
1044
1045static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
1046{
 1047    /* Given the strength of x86 memory ordering, we need only care about
 1048       store-load ordering.  Experimentally, "lock orl $0,0(%esp)" is
 1049       faster than "mfence", so don't bother with the SSE insn.  */
1050    if (a0 & TCG_MO_ST_LD) {
1051        tcg_out8(s, 0xf0);
1052        tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
1053        tcg_out8(s, 0);
1054    }
1055}
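/* Worked example (editor's sketch, not part of the original source):
   with TCG_MO_ST_LD set, the sequence above assembles to f0 83 0c 24 00,
   i.e. "lock orl $0, (%esp)": the LOCK prefix, the group-1 OR /1 form
   with an imm8, and a SIB byte because %esp is the base register.  */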
1056
1057static inline void tcg_out_push(TCGContext *s, int reg)
1058{
1059    tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
1060}
1061
1062static inline void tcg_out_pop(TCGContext *s, int reg)
1063{
1064    tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
1065}
1066
1067static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
1068                       TCGReg arg1, intptr_t arg2)
1069{
1070    switch (type) {
1071    case TCG_TYPE_I32:
1072        if (ret < 16) {
1073            tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
1074        } else {
1075            tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
1076        }
1077        break;
1078    case TCG_TYPE_I64:
1079        if (ret < 16) {
1080            tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
1081            break;
1082        }
1083        /* FALLTHRU */
1084    case TCG_TYPE_V64:
1085        /* There is no instruction that can validate 8-byte alignment.  */
1086        tcg_debug_assert(ret >= 16);
1087        tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
1088        break;
1089    case TCG_TYPE_V128:
1090        /*
 1091         * The gvec infrastructure asserts that v128 vector loads
1092         * and stores use a 16-byte aligned offset.  Validate that the
1093         * final pointer is aligned by using an insn that will SIGSEGV.
1094         */
1095        tcg_debug_assert(ret >= 16);
1096        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
1097        break;
1098    case TCG_TYPE_V256:
1099        /*
1100         * The gvec infrastructure only requires 16-byte alignment,
1101         * so here we must use an unaligned load.
1102         */
1103        tcg_debug_assert(ret >= 16);
1104        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
1105                                 ret, 0, arg1, arg2);
1106        break;
1107    default:
1108        g_assert_not_reached();
1109    }
1110}
1111
1112static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
1113                       TCGReg arg1, intptr_t arg2)
1114{
1115    switch (type) {
1116    case TCG_TYPE_I32:
1117        if (arg < 16) {
1118            tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
1119        } else {
1120            tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
1121        }
1122        break;
1123    case TCG_TYPE_I64:
1124        if (arg < 16) {
1125            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
1126            break;
1127        }
1128        /* FALLTHRU */
1129    case TCG_TYPE_V64:
1130        /* There is no instruction that can validate 8-byte alignment.  */
1131        tcg_debug_assert(arg >= 16);
1132        tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
1133        break;
1134    case TCG_TYPE_V128:
1135        /*
 1136         * The gvec infrastructure asserts that v128 vector loads
1137         * and stores use a 16-byte aligned offset.  Validate that the
1138         * final pointer is aligned by using an insn that will SIGSEGV.
1139         */
1140        tcg_debug_assert(arg >= 16);
1141        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2);
1142        break;
1143    case TCG_TYPE_V256:
1144        /*
1145         * The gvec infrastructure only requires 16-byte alignment,
1146         * so here we must use an unaligned store.
1147         */
1148        tcg_debug_assert(arg >= 16);
1149        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
1150                                 arg, 0, arg1, arg2);
1151        break;
1152    default:
1153        g_assert_not_reached();
1154    }
1155}
1156
1157static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
1158                        TCGReg base, intptr_t ofs)
1159{
1160    int rexw = 0;
1161    if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
1162        if (val != (int32_t)val) {
1163            return false;
1164        }
1165        rexw = P_REXW;
1166    } else if (type != TCG_TYPE_I32) {
1167        return false;
1168    }
1169    tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
1170    tcg_out32(s, val);
1171    return true;
1172}
1173
1174static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
1175{
1176    /* Propagate an opcode prefix, such as P_DATA16.  */
1177    int ext = subopc & ~0x7;
1178    subopc &= 0x7;
1179
1180    if (count == 1) {
1181        tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
1182    } else {
1183        tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
1184        tcg_out8(s, count);
1185    }
1186}
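/* Worked example (editor's sketch, not part of the original source):
   tcg_out_shifti(s, SHIFT_SHL + P_REXW, TCG_REG_RAX, 3) selects the
   group-2 /4 encoding and emits 48 c1 e0 03, i.e. "shlq $3, %rax";
   a count of 1 would use the shorter OPC_SHIFT_1 (0xd1) form instead.  */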
1187
1188static inline void tcg_out_bswap32(TCGContext *s, int reg)
1189{
1190    tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
1191}
1192
1193static inline void tcg_out_rolw_8(TCGContext *s, int reg)
1194{
1195    tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
1196}
1197
1198static inline void tcg_out_ext8u(TCGContext *s, int dest, int src)
1199{
1200    /* movzbl */
1201    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1202    tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
1203}
1204
1205static void tcg_out_ext8s(TCGContext *s, int dest, int src, int rexw)
1206{
1207    /* movsbl */
1208    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1209    tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
1210}
1211
1212static inline void tcg_out_ext16u(TCGContext *s, int dest, int src)
1213{
1214    /* movzwl */
1215    tcg_out_modrm(s, OPC_MOVZWL, dest, src);
1216}
1217
1218static inline void tcg_out_ext16s(TCGContext *s, int dest, int src, int rexw)
1219{
1220    /* movsw[lq] */
1221    tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
1222}
1223
1224static inline void tcg_out_ext32u(TCGContext *s, int dest, int src)
1225{
1226    /* 32-bit mov zero extends.  */
1227    tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
1228}
1229
1230static inline void tcg_out_ext32s(TCGContext *s, int dest, int src)
1231{
1232    tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
1233}
1234
1235static inline void tcg_out_bswap64(TCGContext *s, int reg)
1236{
1237    tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
1238}
1239
1240static void tgen_arithi(TCGContext *s, int c, int r0,
1241                        tcg_target_long val, int cf)
1242{
1243    int rexw = 0;
1244
1245    if (TCG_TARGET_REG_BITS == 64) {
1246        rexw = c & -8;
1247        c &= 7;
1248    }
1249
 1250    /* ??? While INC is 2 bytes shorter than ADDL $1, it also induces
 1251       partial-flags-update stalls on Pentium 4 and is not recommended
 1252       by current Intel optimization manuals.  */
1253    if (!cf && (c == ARITH_ADD || c == ARITH_SUB) && (val == 1 || val == -1)) {
1254        int is_inc = (c == ARITH_ADD) ^ (val < 0);
1255        if (TCG_TARGET_REG_BITS == 64) {
1256            /* The single-byte increment encodings are re-tasked as the
1257               REX prefixes.  Use the MODRM encoding.  */
1258            tcg_out_modrm(s, OPC_GRP5 + rexw,
1259                          (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
1260        } else {
1261            tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
1262        }
1263        return;
1264    }
1265
1266    if (c == ARITH_AND) {
1267        if (TCG_TARGET_REG_BITS == 64) {
1268            if (val == 0xffffffffu) {
1269                tcg_out_ext32u(s, r0, r0);
1270                return;
1271            }
1272            if (val == (uint32_t)val) {
1273                /* AND with no high bits set can use a 32-bit operation.  */
1274                rexw = 0;
1275            }
1276        }
1277        if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
1278            tcg_out_ext8u(s, r0, r0);
1279            return;
1280        }
1281        if (val == 0xffffu) {
1282            tcg_out_ext16u(s, r0, r0);
1283            return;
1284        }
1285    }
1286
1287    if (val == (int8_t)val) {
1288        tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
1289        tcg_out8(s, val);
1290        return;
1291    }
1292    if (rexw == 0 || val == (int32_t)val) {
1293        tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
1294        tcg_out32(s, val);
1295        return;
1296    }
1297
1298    tcg_abort();
1299}
1300
1301static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
1302{
1303    if (val != 0) {
1304        tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
1305    }
1306}
1307
1308/* Use SMALL != 0 to force a short forward branch.  */
1309static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, int small)
1310{
1311    int32_t val, val1;
1312
1313    if (l->has_value) {
1314        val = tcg_pcrel_diff(s, l->u.value_ptr);
1315        val1 = val - 2;
1316        if ((int8_t)val1 == val1) {
1317            if (opc == -1) {
1318                tcg_out8(s, OPC_JMP_short);
1319            } else {
1320                tcg_out8(s, OPC_JCC_short + opc);
1321            }
1322            tcg_out8(s, val1);
1323        } else {
1324            if (small) {
1325                tcg_abort();
1326            }
1327            if (opc == -1) {
1328                tcg_out8(s, OPC_JMP_long);
1329                tcg_out32(s, val - 5);
1330            } else {
1331                tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1332                tcg_out32(s, val - 6);
1333            }
1334        }
1335    } else if (small) {
1336        if (opc == -1) {
1337            tcg_out8(s, OPC_JMP_short);
1338        } else {
1339            tcg_out8(s, OPC_JCC_short + opc);
1340        }
1341        tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
1342        s->code_ptr += 1;
1343    } else {
1344        if (opc == -1) {
1345            tcg_out8(s, OPC_JMP_long);
1346        } else {
1347            tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1348        }
1349        tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
1350        s->code_ptr += 4;
1351    }
1352}
1353
1354static void tcg_out_cmp(TCGContext *s, TCGArg arg1, TCGArg arg2,
1355                        int const_arg2, int rexw)
1356{
1357    if (const_arg2) {
1358        if (arg2 == 0) {
1359            /* test r, r */
1360            tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
1361        } else {
1362            tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
1363        }
1364    } else {
1365        tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
1366    }
1367}
1368
1369static void tcg_out_brcond32(TCGContext *s, TCGCond cond,
1370                             TCGArg arg1, TCGArg arg2, int const_arg2,
1371                             TCGLabel *label, int small)
1372{
1373    tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
1374    tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
1375}
1376
1377#if TCG_TARGET_REG_BITS == 64
1378static void tcg_out_brcond64(TCGContext *s, TCGCond cond,
1379                             TCGArg arg1, TCGArg arg2, int const_arg2,
1380                             TCGLabel *label, int small)
1381{
1382    tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
1383    tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
1384}
1385#else
 1386/* XXX: we implement it at the target level to avoid having to
 1387   handle temporaries that cross basic blocks */
1388static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
1389                            const int *const_args, int small)
1390{
1391    TCGLabel *label_next = gen_new_label();
1392    TCGLabel *label_this = arg_label(args[5]);
1393
1394    switch(args[4]) {
1395    case TCG_COND_EQ:
1396        tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
1397                         label_next, 1);
1398        tcg_out_brcond32(s, TCG_COND_EQ, args[1], args[3], const_args[3],
1399                         label_this, small);
1400        break;
1401    case TCG_COND_NE:
1402        tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
1403                         label_this, small);
1404        tcg_out_brcond32(s, TCG_COND_NE, args[1], args[3], const_args[3],
1405                         label_this, small);
1406        break;
1407    case TCG_COND_LT:
1408        tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
1409                         label_this, small);
1410        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1411        tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
1412                         label_this, small);
1413        break;
1414    case TCG_COND_LE:
1415        tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
1416                         label_this, small);
1417        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1418        tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
1419                         label_this, small);
1420        break;
1421    case TCG_COND_GT:
1422        tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
1423                         label_this, small);
1424        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1425        tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
1426                         label_this, small);
1427        break;
1428    case TCG_COND_GE:
1429        tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
1430                         label_this, small);
1431        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1432        tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
1433                         label_this, small);
1434        break;
1435    case TCG_COND_LTU:
1436        tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
1437                         label_this, small);
1438        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1439        tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
1440                         label_this, small);
1441        break;
1442    case TCG_COND_LEU:
1443        tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
1444                         label_this, small);
1445        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1446        tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
1447                         label_this, small);
1448        break;
1449    case TCG_COND_GTU:
1450        tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1451                         label_this, small);
1452        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1453        tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
1454                         label_this, small);
1455        break;
1456    case TCG_COND_GEU:
1457        tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1458                         label_this, small);
1459        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1460        tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
1461                         label_this, small);
1462        break;
1463    default:
1464        tcg_abort();
1465    }
1466    tcg_out_label(s, label_next, s->code_ptr);
1467}
1468#endif
1469
1470static void tcg_out_setcond32(TCGContext *s, TCGCond cond, TCGArg dest,
1471                              TCGArg arg1, TCGArg arg2, int const_arg2)
1472{
1473    tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
1474    tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1475    tcg_out_ext8u(s, dest, dest);
1476}
1477
1478#if TCG_TARGET_REG_BITS == 64
1479static void tcg_out_setcond64(TCGContext *s, TCGCond cond, TCGArg dest,
1480                              TCGArg arg1, TCGArg arg2, int const_arg2)
1481{
1482    tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
1483    tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1484    tcg_out_ext8u(s, dest, dest);
1485}
1486#else
1487static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
1488                             const int *const_args)
1489{
1490    TCGArg new_args[6];
1491    TCGLabel *label_true, *label_over;
1492
1493    memcpy(new_args, args+1, 5*sizeof(TCGArg));
1494
1495    if (args[0] == args[1] || args[0] == args[2]
1496        || (!const_args[3] && args[0] == args[3])
1497        || (!const_args[4] && args[0] == args[4])) {
1498        /* When the destination overlaps with one of the argument
1499           registers, don't do anything tricky.  */
1500        label_true = gen_new_label();
1501        label_over = gen_new_label();
1502
1503        new_args[5] = label_arg(label_true);
1504        tcg_out_brcond2(s, new_args, const_args+1, 1);
1505
1506        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1507        tcg_out_jxx(s, JCC_JMP, label_over, 1);
1508        tcg_out_label(s, label_true, s->code_ptr);
1509
1510        tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
1511        tcg_out_label(s, label_over, s->code_ptr);
1512    } else {
1513        /* When the destination does not overlap one of the arguments,
1514           clear the destination first, jump if cond false, and emit an
1515           increment in the true case.  This results in smaller code.  */
1516
1517        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1518
1519        label_over = gen_new_label();
1520        new_args[4] = tcg_invert_cond(new_args[4]);
1521        new_args[5] = label_arg(label_over);
1522        tcg_out_brcond2(s, new_args, const_args+1, 1);
1523
1524        tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
1525        tcg_out_label(s, label_over, s->code_ptr);
1526    }
1527}
1528#endif
1529
1530static void tcg_out_cmov(TCGContext *s, TCGCond cond, int rexw,
1531                         TCGReg dest, TCGReg v1)
1532{
1533    if (have_cmov) {
1534        tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | rexw, dest, v1);
1535    } else {
1536        TCGLabel *over = gen_new_label();
1537        tcg_out_jxx(s, tcg_cond_to_jcc[tcg_invert_cond(cond)], over, 1);
1538        tcg_out_mov(s, TCG_TYPE_I32, dest, v1);
1539        tcg_out_label(s, over, s->code_ptr);
1540    }
1541}
1542
1543static void tcg_out_movcond32(TCGContext *s, TCGCond cond, TCGReg dest,
1544                              TCGReg c1, TCGArg c2, int const_c2,
1545                              TCGReg v1)
1546{
1547    tcg_out_cmp(s, c1, c2, const_c2, 0);
1548    tcg_out_cmov(s, cond, 0, dest, v1);
1549}
1550
1551#if TCG_TARGET_REG_BITS == 64
1552static void tcg_out_movcond64(TCGContext *s, TCGCond cond, TCGReg dest,
1553                              TCGReg c1, TCGArg c2, int const_c2,
1554                              TCGReg v1)
1555{
1556    tcg_out_cmp(s, c1, c2, const_c2, P_REXW);
1557    tcg_out_cmov(s, cond, P_REXW, dest, v1);
1558}
1559#endif
1560
1561static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1562                        TCGArg arg2, bool const_a2)
1563{
1564    if (have_bmi1) {
1565        tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
1566        if (const_a2) {
1567            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1568        } else {
1569            tcg_debug_assert(dest != arg2);
1570            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1571        }
1572    } else {
1573        tcg_debug_assert(dest != arg2);
1574        tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
1575        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1576    }
1577}
1578
1579static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1580                        TCGArg arg2, bool const_a2)
1581{
1582    if (have_lzcnt) {
1583        tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
1584        if (const_a2) {
1585            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1586        } else {
1587            tcg_debug_assert(dest != arg2);
1588            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1589        }
1590    } else {
1591        tcg_debug_assert(!const_a2);
1592        tcg_debug_assert(dest != arg1);
1593        tcg_debug_assert(dest != arg2);
1594
1595        /* Recall that the output of BSR is the index not the count.  */
1596        tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
1597        tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);
1598
1599        /* Since we have destroyed the flags from BSR, we have to re-test.  */
1600        tcg_out_cmp(s, arg1, 0, 1, rexw);
1601        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1602    }
1603}
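
/* Worked example for the BSR fallback above: for arg1 = 0x00008000, BSR
 * yields bit index 15; since the index lies in [0, 31], 15 ^ 31 = 16,
 * which is clz32(0x00008000).  The final compare and CMOV substitute arg2
 * when arg1 was zero, because BSR leaves the destination undefined in
 * that case.
 */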
1604
1605static void tcg_out_branch(TCGContext *s, int call, tcg_insn_unit *dest)
1606{
1607    intptr_t disp = tcg_pcrel_diff(s, dest) - 5;
1608
1609    if (disp == (int32_t)disp) {
1610        tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
1611        tcg_out32(s, disp);
1612    } else {
1613        /* rip-relative addressing into the constant pool.
1614           This is 6 + 8 = 14 bytes, as compared to using an
1615           immediate load of 10 + 6 = 16 bytes, plus we may
1616           be able to re-use the pool constant for more calls.  */
1617        tcg_out_opc(s, OPC_GRP5, 0, 0, 0);
1618        tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
1619        new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
1620        tcg_out32(s, 0);
1621    }
1622}
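
/* For illustration, the two encodings produced above:
 *   in range:      e8/e9 <rel32>           call/jmp rel32, 5 bytes
 *   out of range:  ff 15/25 <disp32>       call/jmp *disp32(%rip),
 *                                          6 bytes + 8-byte pool entry
 * where the pool entry holds the absolute destination address.
 */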
1623
1624static inline void tcg_out_call(TCGContext *s, tcg_insn_unit *dest)
1625{
1626    tcg_out_branch(s, 1, dest);
1627}
1628
1629static void tcg_out_jmp(TCGContext *s, tcg_insn_unit *dest)
1630{
1631    tcg_out_branch(s, 0, dest);
1632}
1633
1634static void tcg_out_nopn(TCGContext *s, int n)
1635{
1636    int i;
1637    /* Emit 1 or 2 operand size prefixes for the standard one byte nop,
1638     * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the
1639     * duplicate prefix, and all of the interesting recent cores can
1640     * decode and discard the duplicates in a single cycle.
1641     */
1642    tcg_debug_assert(n >= 1);
1643    for (i = 1; i < n; ++i) {
1644        tcg_out8(s, 0x66);
1645    }
1646    tcg_out8(s, 0x90);
1647}
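
/* Example byte sequences produced above:
 *   n = 1:   90           nop
 *   n = 2:   66 90        xchg %ax,%ax
 *   n = 3:   66 66 90     the same, with a redundant prefix
 */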
1648
1649#if defined(CONFIG_SOFTMMU)
1650#include "tcg-ldst.inc.c"
1651
1652/* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
1653 *                                     TCGMemOpIdx oi, uintptr_t ra)
1654 */
1655static void * const qemu_ld_helpers[16] = {
1656    [MO_UB]   = helper_ret_ldub_mmu,
1657    [MO_LEUW] = helper_le_lduw_mmu,
1658    [MO_LEUL] = helper_le_ldul_mmu,
1659    [MO_LEQ]  = helper_le_ldq_mmu,
1660    [MO_BEUW] = helper_be_lduw_mmu,
1661    [MO_BEUL] = helper_be_ldul_mmu,
1662    [MO_BEQ]  = helper_be_ldq_mmu,
1663};
1664
1665/* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
1666 *                                     uintxx_t val, TCGMemOpIdx oi, uintptr_t ra)
1667 */
1668static void * const qemu_st_helpers[16] = {
1669    [MO_UB]   = helper_ret_stb_mmu,
1670    [MO_LEUW] = helper_le_stw_mmu,
1671    [MO_LEUL] = helper_le_stl_mmu,
1672    [MO_LEQ]  = helper_le_stq_mmu,
1673    [MO_BEUW] = helper_be_stw_mmu,
1674    [MO_BEUL] = helper_be_stl_mmu,
1675    [MO_BEQ]  = helper_be_stq_mmu,
1676};
1677
1678/* Perform the TLB load and compare.
1679
1680   Inputs:
1681   ADDRLO and ADDRHI contain the low and high part of the address.
1682
1683   MEM_INDEX and OPC are the memory context and memory operation of the access.
1684
1685   WHICH is the offset into the CPUTLBEntry structure of the slot to read.
1686   This should be offsetof addr_read or addr_write.
1687
1688   Outputs:
1689   LABEL_PTRS is filled with 1 (32-bit addresses) or 2 (64-bit addresses)
1690   positions of the displacements of forward jumps to the TLB miss case.
1691
1692   Second argument register is loaded with the low part of the address.
1693   In the TLB hit case, it has been adjusted as indicated by the TLB
1694   and so is a host address.  In the TLB miss case, it continues to
1695   hold a guest address.
1696
1697   First argument register is clobbered.  */
1698
1699static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
1700                                    int mem_index, TCGMemOp opc,
1701                                    tcg_insn_unit **label_ptr, int which)
1702{
1703    const TCGReg r0 = TCG_REG_L0;
1704    const TCGReg r1 = TCG_REG_L1;
1705    TCGType ttype = TCG_TYPE_I32;
1706    TCGType tlbtype = TCG_TYPE_I32;
1707    int trexw = 0, hrexw = 0, tlbrexw = 0;
1708    unsigned a_bits = get_alignment_bits(opc);
1709    unsigned s_bits = opc & MO_SIZE;
1710    unsigned a_mask = (1 << a_bits) - 1;
1711    unsigned s_mask = (1 << s_bits) - 1;
1712    target_ulong tlb_mask;
1713
1714    if (TCG_TARGET_REG_BITS == 64) {
1715        if (TARGET_LONG_BITS == 64) {
1716            ttype = TCG_TYPE_I64;
1717            trexw = P_REXW;
1718        }
1719        if (TCG_TYPE_PTR == TCG_TYPE_I64) {
1720            hrexw = P_REXW;
1721            if (TARGET_PAGE_BITS + CPU_TLB_DYN_MAX_BITS > 32) {
1722                tlbtype = TCG_TYPE_I64;
1723                tlbrexw = P_REXW;
1724            }
1725        }
1726    }
1727
1728    tcg_out_mov(s, tlbtype, r0, addrlo);
1729    tcg_out_shifti(s, SHIFT_SHR + tlbrexw, r0,
1730                   TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
1731
1732    tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, r0, TCG_AREG0,
1733                         TLB_MASK_TABLE_OFS(mem_index) +
1734                         offsetof(CPUTLBDescFast, mask));
1735
1736    tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r0, TCG_AREG0,
1737                         TLB_MASK_TABLE_OFS(mem_index) +
1738                         offsetof(CPUTLBDescFast, table));
1739
1740    /* If the required alignment is at least as large as the access, simply
1741       copy the address and mask.  For lesser alignments, check that we don't
1742       cross pages for the complete access.  */
1743    if (a_bits >= s_bits) {
1744        tcg_out_mov(s, ttype, r1, addrlo);
1745    } else {
1746        tcg_out_modrm_offset(s, OPC_LEA + trexw, r1, addrlo, s_mask - a_mask);
1747    }
1748    tlb_mask = (target_ulong)TARGET_PAGE_MASK | a_mask;
1749    tgen_arithi(s, ARITH_AND + trexw, r1, tlb_mask, 0);
1750
1751    /* cmp 0(r0), r1 */
1752    tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw, r1, r0, which);
1753
1754    /* Prepare for both the fast path add of the tlb addend, and the slow
1755       path function argument setup.  */
1756    tcg_out_mov(s, ttype, r1, addrlo);
1757
1758    /* jne slow_path */
1759    tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1760    label_ptr[0] = s->code_ptr;
1761    s->code_ptr += 4;
1762
1763    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1764        /* cmp 4(r0), addrhi */
1765        tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, r0, which + 4);
1766
1767        /* jne slow_path */
1768        tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1769        label_ptr[1] = s->code_ptr;
1770        s->code_ptr += 4;
1771    }
1772
1773    /* TLB Hit.  */
1774
1775    /* add addend(r0), r1 */
1776    tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r1, r0,
1777                         offsetof(CPUTLBEntry, addend));
1778}
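
/* For a 64-bit host and 64-bit guest the fast path above is roughly
 * (r0/r1 are TCG_REG_L0/L1; the offsets depend on mem_index):
 *     mov   addrlo, r0
 *     shr   $(TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS), r0
 *     and   mask_ofs(env), r0
 *     add   table_ofs(env), r0          ; r0 = &CPUTLBEntry
 *     lea   s_mask-a_mask(addrlo), r1   ; or a plain mov if aligned enough
 *     and   $tlb_mask, r1
 *     cmp   which(r0), r1
 *     mov   addrlo, r1
 *     jne   slow_path
 *     add   addend(r0), r1              ; r1 = host address
 */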
1779
1780/*
1781 * Record the context of a call to the out-of-line helper code for the slow
1782 * path of a load or store, so that we can later generate the correct helper code.
1783 */
1784static void add_qemu_ldst_label(TCGContext *s, bool is_ld, bool is_64,
1785                                TCGMemOpIdx oi,
1786                                TCGReg datalo, TCGReg datahi,
1787                                TCGReg addrlo, TCGReg addrhi,
1788                                tcg_insn_unit *raddr,
1789                                tcg_insn_unit **label_ptr)
1790{
1791    TCGLabelQemuLdst *label = new_ldst_label(s);
1792
1793    label->is_ld = is_ld;
1794    label->oi = oi;
1795    label->type = is_64 ? TCG_TYPE_I64 : TCG_TYPE_I32;
1796    label->datalo_reg = datalo;
1797    label->datahi_reg = datahi;
1798    label->addrlo_reg = addrlo;
1799    label->addrhi_reg = addrhi;
1800    label->raddr = raddr;
1801    label->label_ptr[0] = label_ptr[0];
1802    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1803        label->label_ptr[1] = label_ptr[1];
1804    }
1805}
1806
1807/*
1808 * Generate code for the slow path for a load at the end of block
1809 */
1810static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1811{
1812    TCGMemOpIdx oi = l->oi;
1813    TCGMemOp opc = get_memop(oi);
1814    TCGReg data_reg;
1815    tcg_insn_unit **label_ptr = &l->label_ptr[0];
1816    int rexw = (l->type == TCG_TYPE_I64 ? P_REXW : 0);
1817
1818    /* resolve label address */
1819    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1820    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1821        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1822    }
1823
1824    if (TCG_TARGET_REG_BITS == 32) {
1825        int ofs = 0;
1826
1827        tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1828        ofs += 4;
1829
1830        tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1831        ofs += 4;
1832
1833        if (TARGET_LONG_BITS == 64) {
1834            tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1835            ofs += 4;
1836        }
1837
1838        tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
1839        ofs += 4;
1840
1841        tcg_out_sti(s, TCG_TYPE_PTR, (uintptr_t)l->raddr, TCG_REG_ESP, ofs);
1842    } else {
1843        tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1844        /* The second argument is already loaded with addrlo.  */
1845        tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], oi);
1846        tcg_out_movi(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[3],
1847                     (uintptr_t)l->raddr);
1848    }
1849
1850    tcg_out_call(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);
1851
1852    data_reg = l->datalo_reg;
1853    switch (opc & MO_SSIZE) {
1854    case MO_SB:
1855        tcg_out_ext8s(s, data_reg, TCG_REG_EAX, rexw);
1856        break;
1857    case MO_SW:
1858        tcg_out_ext16s(s, data_reg, TCG_REG_EAX, rexw);
1859        break;
1860#if TCG_TARGET_REG_BITS == 64
1861    case MO_SL:
1862        tcg_out_ext32s(s, data_reg, TCG_REG_EAX);
1863        break;
1864#endif
1865    case MO_UB:
1866    case MO_UW:
1867        /* Note that the helpers have zero-extended to tcg_target_long.  */
1868    case MO_UL:
1869        tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
1870        break;
1871    case MO_Q:
1872        if (TCG_TARGET_REG_BITS == 64) {
1873            tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX);
1874        } else if (data_reg == TCG_REG_EDX) {
1875            /* xchg %edx, %eax */
1876            tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0);
1877            tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EAX);
1878        } else {
1879            tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
1880            tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EDX);
1881        }
1882        break;
1883    default:
1884        tcg_abort();
1885    }
1886
1887    /* Jump to the code corresponding to the next IR after the qemu_ld */
1888    tcg_out_jmp(s, l->raddr);
1889    return true;
1890}
1891
1892/*
1893 * Generate code for the slow path for a store at the end of block
1894 */
1895static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1896{
1897    TCGMemOpIdx oi = l->oi;
1898    TCGMemOp opc = get_memop(oi);
1899    TCGMemOp s_bits = opc & MO_SIZE;
1900    tcg_insn_unit **label_ptr = &l->label_ptr[0];
1901    TCGReg retaddr;
1902
1903    /* resolve label address */
1904    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1905    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1906        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1907    }
1908
1909    if (TCG_TARGET_REG_BITS == 32) {
1910        int ofs = 0;
1911
1912        tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1913        ofs += 4;
1914
1915        tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1916        ofs += 4;
1917
1918        if (TARGET_LONG_BITS == 64) {
1919            tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1920            ofs += 4;
1921        }
1922
1923        tcg_out_st(s, TCG_TYPE_I32, l->datalo_reg, TCG_REG_ESP, ofs);
1924        ofs += 4;
1925
1926        if (s_bits == MO_64) {
1927            tcg_out_st(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_ESP, ofs);
1928            ofs += 4;
1929        }
1930
1931        tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
1932        ofs += 4;
1933
1934        retaddr = TCG_REG_EAX;
1935        tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1936        tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP, ofs);
1937    } else {
1938        tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1939        /* The second argument is already loaded with addrlo.  */
1940        tcg_out_mov(s, (s_bits == MO_64 ? TCG_TYPE_I64 : TCG_TYPE_I32),
1941                    tcg_target_call_iarg_regs[2], l->datalo_reg);
1942        tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[3], oi);
1943
1944        if (ARRAY_SIZE(tcg_target_call_iarg_regs) > 4) {
1945            retaddr = tcg_target_call_iarg_regs[4];
1946            tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1947        } else {
1948            retaddr = TCG_REG_RAX;
1949            tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1950            tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP,
1951                       TCG_TARGET_CALL_STACK_OFFSET);
1952        }
1953    }
1954
1955    /* "Tail call" to the helper, with the return address back inline.  */
1956    tcg_out_push(s, retaddr);
1957    tcg_out_jmp(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
1958    return true;
1959}
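
/* The push/jmp pair above makes the store helper return straight to
 * l->raddr, exactly as if it had been called from the fast path:
 *     push  retaddr
 *     jmp   helper          ; the helper's ret pops retaddr
 */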
1960#elif TCG_TARGET_REG_BITS == 32
1961# define x86_guest_base_seg     0
1962# define x86_guest_base_index   -1
1963# define x86_guest_base_offset  guest_base
1964#else
1965static int x86_guest_base_seg;
1966static int x86_guest_base_index = -1;
1967static int32_t x86_guest_base_offset;
1968# if defined(__x86_64__) && defined(__linux__)
1969#  include <asm/prctl.h>
1970#  include <sys/prctl.h>
1971int arch_prctl(int code, unsigned long addr);
1972static inline int setup_guest_base_seg(void)
1973{
1974    if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
1975        return P_GS;
1976    }
1977    return 0;
1978}
1979# elif defined (__FreeBSD__) || defined (__FreeBSD_kernel__)
1980#  include <machine/sysarch.h>
1981static inline int setup_guest_base_seg(void)
1982{
1983    if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) {
1984        return P_GS;
1985    }
1986    return 0;
1987}
1988# else
1989static inline int setup_guest_base_seg(void)
1990{
1991    return 0;
1992}
1993# endif
1994#endif /* SOFTMMU */
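
/* In user-only mode the guest base is applied by the direct-access code
 * below: either as a %gs segment override (when setup_guest_base_seg()
 * succeeded), as the index register of the SIB addressing mode, or as a
 * 32-bit displacement, depending on which x86_guest_base_* value could be
 * set up.  Roughly:
 *     movzbl %gs:(addrlo), datalo          ; segment form
 *     movzbl ofs(addrlo,idx), datalo       ; index/offset form
 */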
1995
1996static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
1997                                   TCGReg base, int index, intptr_t ofs,
1998                                   int seg, bool is64, TCGMemOp memop)
1999{
2000    const TCGMemOp real_bswap = memop & MO_BSWAP;
2001    TCGMemOp bswap = real_bswap;
2002    int rexw = is64 * P_REXW;
2003    int movop = OPC_MOVL_GvEv;
2004
2005    if (have_movbe && real_bswap) {
2006        bswap = 0;
2007        movop = OPC_MOVBE_GyMy;
2008    }
2009
2010    switch (memop & MO_SSIZE) {
2011    case MO_UB:
2012        tcg_out_modrm_sib_offset(s, OPC_MOVZBL + seg, datalo,
2013                                 base, index, 0, ofs);
2014        break;
2015    case MO_SB:
2016        tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + seg, datalo,
2017                                 base, index, 0, ofs);
2018        break;
2019    case MO_UW:
2020        tcg_out_modrm_sib_offset(s, OPC_MOVZWL + seg, datalo,
2021                                 base, index, 0, ofs);
2022        if (real_bswap) {
2023            tcg_out_rolw_8(s, datalo);
2024        }
2025        break;
2026    case MO_SW:
2027        if (real_bswap) {
2028            if (have_movbe) {
2029                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
2030                                         datalo, base, index, 0, ofs);
2031            } else {
2032                tcg_out_modrm_sib_offset(s, OPC_MOVZWL + seg, datalo,
2033                                         base, index, 0, ofs);
2034                tcg_out_rolw_8(s, datalo);
2035            }
2036            tcg_out_modrm(s, OPC_MOVSWL + rexw, datalo, datalo);
2037        } else {
2038            tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + seg,
2039                                     datalo, base, index, 0, ofs);
2040        }
2041        break;
2042    case MO_UL:
2043        tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
2044        if (bswap) {
2045            tcg_out_bswap32(s, datalo);
2046        }
2047        break;
2048#if TCG_TARGET_REG_BITS == 64
2049    case MO_SL:
2050        if (real_bswap) {
2051            tcg_out_modrm_sib_offset(s, movop + seg, datalo,
2052                                     base, index, 0, ofs);
2053            if (bswap) {
2054                tcg_out_bswap32(s, datalo);
2055            }
2056            tcg_out_ext32s(s, datalo, datalo);
2057        } else {
2058            tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + seg, datalo,
2059                                     base, index, 0, ofs);
2060        }
2061        break;
2062#endif
2063    case MO_Q:
2064        if (TCG_TARGET_REG_BITS == 64) {
2065            tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
2066                                     base, index, 0, ofs);
2067            if (bswap) {
2068                tcg_out_bswap64(s, datalo);
2069            }
2070        } else {
2071            if (real_bswap) {
2072                int t = datalo;
2073                datalo = datahi;
2074                datahi = t;
2075            }
2076            if (base != datalo) {
2077                tcg_out_modrm_sib_offset(s, movop + seg, datalo,
2078                                         base, index, 0, ofs);
2079                tcg_out_modrm_sib_offset(s, movop + seg, datahi,
2080                                         base, index, 0, ofs + 4);
2081            } else {
2082                tcg_out_modrm_sib_offset(s, movop + seg, datahi,
2083                                         base, index, 0, ofs + 4);
2084                tcg_out_modrm_sib_offset(s, movop + seg, datalo,
2085                                         base, index, 0, ofs);
2086            }
2087            if (bswap) {
2088                tcg_out_bswap32(s, datalo);
2089                tcg_out_bswap32(s, datahi);
2090            }
2091        }
2092        break;
2093    default:
2094        tcg_abort();
2095    }
2096}
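
/* Sketch of the byte-swap handling above, e.g. a big-endian 32-bit load
 * on this little-endian host:
 *   with MOVBE:     movbe  (base,index), datalo     ; load and swap at once
 *   without MOVBE:  mov    (base,index), datalo
 *                   bswap  datalo
 */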
2097
2098/* XXX: qemu_ld and qemu_st could be modified to clobber only EDX and
2099   EAX.  This would be useful once fixed-register globals are less
2100   common. */
2101static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
2102{
2103    TCGReg datalo, datahi, addrlo;
2104    TCGReg addrhi __attribute__((unused));
2105    TCGMemOpIdx oi;
2106    TCGMemOp opc;
2107#if defined(CONFIG_SOFTMMU)
2108    int mem_index;
2109    tcg_insn_unit *label_ptr[2];
2110#endif
2111
2112    datalo = *args++;
2113    datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
2114    addrlo = *args++;
2115    addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
2116    oi = *args++;
2117    opc = get_memop(oi);
2118
2119#if defined(CONFIG_SOFTMMU)
2120    mem_index = get_mmuidx(oi);
2121
2122    tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
2123                     label_ptr, offsetof(CPUTLBEntry, addr_read));
2124
2125    /* TLB Hit.  */
2126    tcg_out_qemu_ld_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, is64, opc);
2127
2128    /* Record the current context of a load into ldst label */
2129    add_qemu_ldst_label(s, true, is64, oi, datalo, datahi, addrlo, addrhi,
2130                        s->code_ptr, label_ptr);
2131#else
2132    tcg_out_qemu_ld_direct(s, datalo, datahi, addrlo, x86_guest_base_index,
2133                           x86_guest_base_offset, x86_guest_base_seg,
2134                           is64, opc);
2135#endif
2136}
2137
2138static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2139                                   TCGReg base, int index, intptr_t ofs,
2140                                   int seg, TCGMemOp memop)
2141{
2142    /* ??? Ideally we wouldn't need a scratch register.  For user-only,
2143       we could perform the bswap twice to restore the original value
2144       instead of moving to the scratch.  But as it is, the L constraint
2145       means that TCG_REG_L0 is definitely free here.  */
2146    const TCGReg scratch = TCG_REG_L0;
2147    const TCGMemOp real_bswap = memop & MO_BSWAP;
2148    TCGMemOp bswap = real_bswap;
2149    int movop = OPC_MOVL_EvGv;
2150
2151    if (have_movbe && real_bswap) {
2152        bswap = 0;
2153        movop = OPC_MOVBE_MyGy;
2154    }
2155
2156    switch (memop & MO_SIZE) {
2157    case MO_8:
2158        /* In 32-bit mode, 8-bit stores can only happen from [abcd]x.
2159           Use the scratch register if necessary.  */
2160        if (TCG_TARGET_REG_BITS == 32 && datalo >= 4) {
2161            tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
2162            datalo = scratch;
2163        }
2164        tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + seg,
2165                                 datalo, base, index, 0, ofs);
2166        break;
2167    case MO_16:
2168        if (bswap) {
2169            tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
2170            tcg_out_rolw_8(s, scratch);
2171            datalo = scratch;
2172        }
2173        tcg_out_modrm_sib_offset(s, movop + P_DATA16 + seg, datalo,
2174                                 base, index, 0, ofs);
2175        break;
2176    case MO_32:
2177        if (bswap) {
2178            tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
2179            tcg_out_bswap32(s, scratch);
2180            datalo = scratch;
2181        }
2182        tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
2183        break;
2184    case MO_64:
2185        if (TCG_TARGET_REG_BITS == 64) {
2186            if (bswap) {
2187                tcg_out_mov(s, TCG_TYPE_I64, scratch, datalo);
2188                tcg_out_bswap64(s, scratch);
2189                datalo = scratch;
2190            }
2191            tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
2192                                     base, index, 0, ofs);
2193        } else if (bswap) {
2194            tcg_out_mov(s, TCG_TYPE_I32, scratch, datahi);
2195            tcg_out_bswap32(s, scratch);
2196            tcg_out_modrm_sib_offset(s, OPC_MOVL_EvGv + seg, scratch,
2197                                     base, index, 0, ofs);
2198            tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
2199            tcg_out_bswap32(s, scratch);
2200            tcg_out_modrm_sib_offset(s, OPC_MOVL_EvGv + seg, scratch,
2201                                     base, index, 0, ofs + 4);
2202        } else {
2203            if (real_bswap) {
2204                int t = datalo;
2205                datalo = datahi;
2206                datahi = t;
2207            }
2208            tcg_out_modrm_sib_offset(s, movop + seg, datalo,
2209                                     base, index, 0, ofs);
2210            tcg_out_modrm_sib_offset(s, movop + seg, datahi,
2211                                     base, index, 0, ofs + 4);
2212        }
2213        break;
2214    default:
2215        tcg_abort();
2216    }
2217}
2218
2219static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
2220{
2221    TCGReg datalo, datahi, addrlo;
2222    TCGReg addrhi __attribute__((unused));
2223    TCGMemOpIdx oi;
2224    TCGMemOp opc;
2225#if defined(CONFIG_SOFTMMU)
2226    int mem_index;
2227    tcg_insn_unit *label_ptr[2];
2228#endif
2229
2230    datalo = *args++;
2231    datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
2232    addrlo = *args++;
2233    addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
2234    oi = *args++;
2235    opc = get_memop(oi);
2236
2237#if defined(CONFIG_SOFTMMU)
2238    mem_index = get_mmuidx(oi);
2239
2240    tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
2241                     label_ptr, offsetof(CPUTLBEntry, addr_write));
2242
2243    /* TLB Hit.  */
2244    tcg_out_qemu_st_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, opc);
2245
2246    /* Record the current context of a store into ldst label */
2247    add_qemu_ldst_label(s, false, is64, oi, datalo, datahi, addrlo, addrhi,
2248                        s->code_ptr, label_ptr);
2249#else
2250    tcg_out_qemu_st_direct(s, datalo, datahi, addrlo, x86_guest_base_index,
2251                           x86_guest_base_offset, x86_guest_base_seg, opc);
2252#endif
2253}
2254
2255static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
2256                              const TCGArg *args, const int *const_args)
2257{
2258    TCGArg a0, a1, a2;
2259    int c, const_a2, vexop, rexw = 0;
2260
2261#if TCG_TARGET_REG_BITS == 64
2262# define OP_32_64(x) \
2263        case glue(glue(INDEX_op_, x), _i64): \
2264            rexw = P_REXW; /* FALLTHRU */    \
2265        case glue(glue(INDEX_op_, x), _i32)
2266#else
2267# define OP_32_64(x) \
2268        case glue(glue(INDEX_op_, x), _i32)
2269#endif
2270
2271    /* Hoist the loads of the most common arguments.  */
2272    a0 = args[0];
2273    a1 = args[1];
2274    a2 = args[2];
2275    const_a2 = const_args[2];
2276
2277    switch (opc) {
2278    case INDEX_op_exit_tb:
2279        /* Reuse the zeroing that exists for goto_ptr.  */
2280        if (a0 == 0) {
2281            tcg_out_jmp(s, s->code_gen_epilogue);
2282        } else {
2283            tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
2284            tcg_out_jmp(s, tb_ret_addr);
2285        }
2286        break;
2287    case INDEX_op_goto_tb:
2288        if (s->tb_jmp_insn_offset) {
2289            /* direct jump method */
2290            int gap;
2291            /* jump displacement must be aligned for atomic patching;
2292             * see if we need to add extra nops before the jump
2293             */
2294            gap = tcg_pcrel_diff(s, QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4));
2295            if (gap != 1) {
2296                tcg_out_nopn(s, gap - 1);
2297            }
2298            tcg_out8(s, OPC_JMP_long); /* jmp im */
2299            s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
2300            tcg_out32(s, 0);
2301        } else {
2302            /* indirect jump method */
2303            tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, -1,
2304                                 (intptr_t)(s->tb_jmp_target_addr + a0));
2305        }
2306        set_jmp_reset_offset(s, a0);
2307        break;
2308    case INDEX_op_goto_ptr:
2309        /* jmp to the given host address (could be epilogue) */
2310        tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0);
2311        break;
2312    case INDEX_op_br:
2313        tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
2314        break;
2315    OP_32_64(ld8u):
2316        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2317        tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2);
2318        break;
2319    OP_32_64(ld8s):
2320        tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2);
2321        break;
2322    OP_32_64(ld16u):
2323        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2324        tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2);
2325        break;
2326    OP_32_64(ld16s):
2327        tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2);
2328        break;
2329#if TCG_TARGET_REG_BITS == 64
2330    case INDEX_op_ld32u_i64:
2331#endif
2332    case INDEX_op_ld_i32:
2333        tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2);
2334        break;
2335
2336    OP_32_64(st8):
2337        if (const_args[0]) {
2338            tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2);
2339            tcg_out8(s, a0);
2340        } else {
2341            tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2);
2342        }
2343        break;
2344    OP_32_64(st16):
2345        if (const_args[0]) {
2346            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2);
2347            tcg_out16(s, a0);
2348        } else {
2349            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2);
2350        }
2351        break;
2352#if TCG_TARGET_REG_BITS == 64
2353    case INDEX_op_st32_i64:
2354#endif
2355    case INDEX_op_st_i32:
2356        if (const_args[0]) {
2357            tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2);
2358            tcg_out32(s, a0);
2359        } else {
2360            tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2);
2361        }
2362        break;
2363
2364    OP_32_64(add):
2365        /* For 3-operand addition, use LEA.  */
2366        if (a0 != a1) {
2367            TCGArg c3 = 0;
2368            if (const_a2) {
2369                c3 = a2, a2 = -1;
2370            } else if (a0 == a2) {
2371                /* Watch out for dest = src + dest, since we've removed
2372                   the matching constraint on the add.  */
2373                tgen_arithr(s, ARITH_ADD + rexw, a0, a1);
2374                break;
2375            }
2376
2377            tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3);
2378            break;
2379        }
2380        c = ARITH_ADD;
2381        goto gen_arith;
2382    OP_32_64(sub):
2383        c = ARITH_SUB;
2384        goto gen_arith;
2385    OP_32_64(and):
2386        c = ARITH_AND;
2387        goto gen_arith;
2388    OP_32_64(or):
2389        c = ARITH_OR;
2390        goto gen_arith;
2391    OP_32_64(xor):
2392        c = ARITH_XOR;
2393        goto gen_arith;
2394    gen_arith:
2395        if (const_a2) {
2396            tgen_arithi(s, c + rexw, a0, a2, 0);
2397        } else {
2398            tgen_arithr(s, c + rexw, a0, a2);
2399        }
2400        break;
2401
2402    OP_32_64(andc):
2403        if (const_a2) {
2404            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2405            tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0);
2406        } else {
2407            tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1);
2408        }
2409        break;
2410
2411    OP_32_64(mul):
2412        if (const_a2) {
2413            int32_t val;
2414            val = a2;
2415            if (val == (int8_t)val) {
2416                tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0);
2417                tcg_out8(s, val);
2418            } else {
2419                tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0);
2420                tcg_out32(s, val);
2421            }
2422        } else {
2423            tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2);
2424        }
2425        break;
2426
2427    OP_32_64(div2):
2428        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]);
2429        break;
2430    OP_32_64(divu2):
2431        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]);
2432        break;
2433
2434    OP_32_64(shl):
2435        /* For small constant 3-operand shift, use LEA.  */
2436        if (const_a2 && a0 != a1 && (a2 - 1) < 3) {
2437            if (a2 - 1 == 0) {
2438                /* shl $1,a1,a0 -> lea (a1,a1),a0 */
2439                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0);
2440            } else {
2441                /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */
2442                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0);
2443            }
2444            break;
2445        }
2446        c = SHIFT_SHL;
2447        vexop = OPC_SHLX;
2448        goto gen_shift_maybe_vex;
2449    OP_32_64(shr):
2450        c = SHIFT_SHR;
2451        vexop = OPC_SHRX;
2452        goto gen_shift_maybe_vex;
2453    OP_32_64(sar):
2454        c = SHIFT_SAR;
2455        vexop = OPC_SARX;
2456        goto gen_shift_maybe_vex;
2457    OP_32_64(rotl):
2458        c = SHIFT_ROL;
2459        goto gen_shift;
2460    OP_32_64(rotr):
2461        c = SHIFT_ROR;
2462        goto gen_shift;
2463    gen_shift_maybe_vex:
2464        if (have_bmi2) {
2465            if (!const_a2) {
2466                tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1);
2467                break;
2468            }
2469            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2470        }
2471        /* FALLTHRU */
2472    gen_shift:
2473        if (const_a2) {
2474            tcg_out_shifti(s, c + rexw, a0, a2);
2475        } else {
2476            tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0);
2477        }
2478        break;
2479
2480    OP_32_64(ctz):
2481        tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
2482        break;
2483    OP_32_64(clz):
2484        tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
2485        break;
2486    OP_32_64(ctpop):
2487        tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1);
2488        break;
2489
2490    case INDEX_op_brcond_i32:
2491        tcg_out_brcond32(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2492        break;
2493    case INDEX_op_setcond_i32:
2494        tcg_out_setcond32(s, args[3], a0, a1, a2, const_a2);
2495        break;
2496    case INDEX_op_movcond_i32:
2497        tcg_out_movcond32(s, args[5], a0, a1, a2, const_a2, args[3]);
2498        break;
2499
2500    OP_32_64(bswap16):
2501        tcg_out_rolw_8(s, a0);
2502        break;
2503    OP_32_64(bswap32):
2504        tcg_out_bswap32(s, a0);
2505        break;
2506
2507    OP_32_64(neg):
2508        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0);
2509        break;
2510    OP_32_64(not):
2511        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0);
2512        break;
2513
2514    OP_32_64(ext8s):
2515        tcg_out_ext8s(s, a0, a1, rexw);
2516        break;
2517    OP_32_64(ext16s):
2518        tcg_out_ext16s(s, a0, a1, rexw);
2519        break;
2520    OP_32_64(ext8u):
2521        tcg_out_ext8u(s, a0, a1);
2522        break;
2523    OP_32_64(ext16u):
2524        tcg_out_ext16u(s, a0, a1);
2525        break;
2526
2527    case INDEX_op_qemu_ld_i32:
2528        tcg_out_qemu_ld(s, args, 0);
2529        break;
2530    case INDEX_op_qemu_ld_i64:
2531        tcg_out_qemu_ld(s, args, 1);
2532        break;
2533    case INDEX_op_qemu_st_i32:
2534        tcg_out_qemu_st(s, args, 0);
2535        break;
2536    case INDEX_op_qemu_st_i64:
2537        tcg_out_qemu_st(s, args, 1);
2538        break;
2539
2540    OP_32_64(mulu2):
2541        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
2542        break;
2543    OP_32_64(muls2):
2544        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]);
2545        break;
2546    OP_32_64(add2):
2547        if (const_args[4]) {
2548            tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1);
2549        } else {
2550            tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]);
2551        }
2552        if (const_args[5]) {
2553            tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1);
2554        } else {
2555            tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]);
2556        }
2557        break;
2558    OP_32_64(sub2):
2559        if (const_args[4]) {
2560            tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1);
2561        } else {
2562            tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]);
2563        }
2564        if (const_args[5]) {
2565            tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1);
2566        } else {
2567            tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]);
2568        }
2569        break;
2570
2571#if TCG_TARGET_REG_BITS == 32
2572    case INDEX_op_brcond2_i32:
2573        tcg_out_brcond2(s, args, const_args, 0);
2574        break;
2575    case INDEX_op_setcond2_i32:
2576        tcg_out_setcond2(s, args, const_args);
2577        break;
2578#else /* TCG_TARGET_REG_BITS == 64 */
2579    case INDEX_op_ld32s_i64:
2580        tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2);
2581        break;
2582    case INDEX_op_ld_i64:
2583        tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2);
2584        break;
2585    case INDEX_op_st_i64:
2586        if (const_args[0]) {
2587            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2);
2588            tcg_out32(s, a0);
2589        } else {
2590            tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2);
2591        }
2592        break;
2593
2594    case INDEX_op_brcond_i64:
2595        tcg_out_brcond64(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2596        break;
2597    case INDEX_op_setcond_i64:
2598        tcg_out_setcond64(s, args[3], a0, a1, a2, const_a2);
2599        break;
2600    case INDEX_op_movcond_i64:
2601        tcg_out_movcond64(s, args[5], a0, a1, a2, const_a2, args[3]);
2602        break;
2603
2604    case INDEX_op_bswap64_i64:
2605        tcg_out_bswap64(s, a0);
2606        break;
2607    case INDEX_op_extu_i32_i64:
2608    case INDEX_op_ext32u_i64:
2609    case INDEX_op_extrl_i64_i32:
2610        tcg_out_ext32u(s, a0, a1);
2611        break;
2612    case INDEX_op_ext_i32_i64:
2613    case INDEX_op_ext32s_i64:
2614        tcg_out_ext32s(s, a0, a1);
2615        break;
2616    case INDEX_op_extrh_i64_i32:
2617        tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32);
2618        break;
2619#endif
2620
2621    OP_32_64(deposit):
2622        if (args[3] == 0 && args[4] == 8) {
2623            /* load bits 0..7 */
2624            tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
2625        } else if (args[3] == 8 && args[4] == 8) {
2626            /* load bits 8..15 */
2627            tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
2628        } else if (args[3] == 0 && args[4] == 16) {
2629            /* load bits 0..15 */
2630            tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
2631        } else {
2632            tcg_abort();
2633        }
2634        break;
2635
2636    case INDEX_op_extract_i64:
2637        if (a2 + args[3] == 32) {
2638            /* This is a 32-bit zero-extending right shift.  */
2639            tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
2640            tcg_out_shifti(s, SHIFT_SHR, a0, a2);
2641            break;
2642        }
2643        /* FALLTHRU */
2644    case INDEX_op_extract_i32:
2645        /* On the off-chance that we can use the high-byte registers, do so.
2646           Otherwise we emit the same ext16 + shift pattern that we
2647           would have gotten from the normal tcg-op.c expansion.  */
2648        tcg_debug_assert(a2 == 8 && args[3] == 8);
2649        if (a1 < 4 && a0 < 8) {
2650            tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
2651        } else {
2652            tcg_out_ext16u(s, a0, a1);
2653            tcg_out_shifti(s, SHIFT_SHR, a0, 8);
2654        }
2655        break;
2656
2657    case INDEX_op_sextract_i32:
2658        /* We don't implement sextract_i64, as we cannot sign-extend to
2659           64-bits without using the REX prefix that explicitly excludes
2660           access to the high-byte registers.  */
2661        tcg_debug_assert(a2 == 8 && args[3] == 8);
2662        if (a1 < 4 && a0 < 8) {
2663            tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
2664        } else {
2665            tcg_out_ext16s(s, a0, a1, 0);
2666            tcg_out_shifti(s, SHIFT_SAR, a0, 8);
2667        }
2668        break;
2669
2670    OP_32_64(extract2):
2671        /* Note that SHRD outputs to the r/m operand.  */
2672        tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0);
2673        tcg_out8(s, args[3]);
2674        break;
2675
2676    case INDEX_op_mb:
2677        tcg_out_mb(s, a0);
2678        break;
2679    case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
2680    case INDEX_op_mov_i64:
2681    case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi.  */
2682    case INDEX_op_movi_i64:
2683    case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
2684    default:
2685        tcg_abort();
2686    }
2687
2688#undef OP_32_64
2689}
2690
2691static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
2692                           unsigned vecl, unsigned vece,
2693                           const TCGArg *args, const int *const_args)
2694{
2695    static int const add_insn[4] = {
2696        OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ
2697    };
2698    static int const ssadd_insn[4] = {
2699        OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2
2700    };
2701    static int const usadd_insn[4] = {
2702        OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2
2703    };
2704    static int const sub_insn[4] = {
2705        OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ
2706    };
2707    static int const sssub_insn[4] = {
2708        OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2
2709    };
2710    static int const ussub_insn[4] = {
2711        OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2
2712    };
2713    static int const mul_insn[4] = {
2714        OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_UD2
2715    };
2716    static int const shift_imm_insn[4] = {
2717        OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib
2718    };
2719    static int const cmpeq_insn[4] = {
2720        OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ
2721    };
2722    static int const cmpgt_insn[4] = {
2723        OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ
2724    };
2725    static int const punpckl_insn[4] = {
2726        OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ
2727    };
2728    static int const punpckh_insn[4] = {
2729        OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ
2730    };
2731    static int const packss_insn[4] = {
2732        OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2
2733    };
2734    static int const packus_insn[4] = {
2735        OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2
2736    };
2737    static int const smin_insn[4] = {
2738        OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_UD2
2739    };
2740    static int const smax_insn[4] = {
2741        OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_UD2
2742    };
2743    static int const umin_insn[4] = {
2744        OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_UD2
2745    };
2746    static int const umax_insn[4] = {
2747        OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_UD2
2748    };
2749    static int const shlv_insn[4] = {
2750        /* TODO: AVX512 adds support for MO_16.  */
2751        OPC_UD2, OPC_UD2, OPC_VPSLLVD, OPC_VPSLLVQ
2752    };
2753    static int const shrv_insn[4] = {
2754        /* TODO: AVX512 adds support for MO_16.  */
2755        OPC_UD2, OPC_UD2, OPC_VPSRLVD, OPC_VPSRLVQ
2756    };
2757    static int const sarv_insn[4] = {
2758        /* TODO: AVX512 adds support for MO_16, MO_64.  */
2759        OPC_UD2, OPC_UD2, OPC_VPSRAVD, OPC_UD2
2760    };
2761    static int const shls_insn[4] = {
2762        OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ
2763    };
2764    static int const shrs_insn[4] = {
2765        OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ
2766    };
2767    static int const sars_insn[4] = {
2768        OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_UD2
2769    };
2770    static int const abs_insn[4] = {
2771        /* TODO: AVX512 adds support for MO_64.  */
2772        OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_UD2
2773    };
2774
2775    TCGType type = vecl + TCG_TYPE_V64;
2776    int insn, sub;
2777    TCGArg a0, a1, a2;
2778
2779    a0 = args[0];
2780    a1 = args[1];
2781    a2 = args[2];
2782
2783    switch (opc) {
2784    case INDEX_op_add_vec:
2785        insn = add_insn[vece];
2786        goto gen_simd;
2787    case INDEX_op_ssadd_vec:
2788        insn = ssadd_insn[vece];
2789        goto gen_simd;
2790    case INDEX_op_usadd_vec:
2791        insn = usadd_insn[vece];
2792        goto gen_simd;
2793    case INDEX_op_sub_vec:
2794        insn = sub_insn[vece];
2795        goto gen_simd;
2796    case INDEX_op_sssub_vec:
2797        insn = sssub_insn[vece];
2798        goto gen_simd;
2799    case INDEX_op_ussub_vec:
2800        insn = ussub_insn[vece];
2801        goto gen_simd;
2802    case INDEX_op_mul_vec:
2803        insn = mul_insn[vece];
2804        goto gen_simd;
2805    case INDEX_op_and_vec:
2806        insn = OPC_PAND;
2807        goto gen_simd;
2808    case INDEX_op_or_vec:
2809        insn = OPC_POR;
2810        goto gen_simd;
2811    case INDEX_op_xor_vec:
2812        insn = OPC_PXOR;
2813        goto gen_simd;
2814    case INDEX_op_smin_vec:
2815        insn = smin_insn[vece];
2816        goto gen_simd;
2817    case INDEX_op_umin_vec:
2818        insn = umin_insn[vece];
2819        goto gen_simd;
2820    case INDEX_op_smax_vec:
2821        insn = smax_insn[vece];
2822        goto gen_simd;
2823    case INDEX_op_umax_vec:
2824        insn = umax_insn[vece];
2825        goto gen_simd;
2826    case INDEX_op_shlv_vec:
2827        insn = shlv_insn[vece];
2828        goto gen_simd;
2829    case INDEX_op_shrv_vec:
2830        insn = shrv_insn[vece];
2831        goto gen_simd;
2832    case INDEX_op_sarv_vec:
2833        insn = sarv_insn[vece];
2834        goto gen_simd;
2835    case INDEX_op_shls_vec:
2836        insn = shls_insn[vece];
2837        goto gen_simd;
2838    case INDEX_op_shrs_vec:
2839        insn = shrs_insn[vece];
2840        goto gen_simd;
2841    case INDEX_op_sars_vec:
2842        insn = sars_insn[vece];
2843        goto gen_simd;
2844    case INDEX_op_x86_punpckl_vec:
2845        insn = punpckl_insn[vece];
2846        goto gen_simd;
2847    case INDEX_op_x86_punpckh_vec:
2848        insn = punpckh_insn[vece];
2849        goto gen_simd;
2850    case INDEX_op_x86_packss_vec:
2851        insn = packss_insn[vece];
2852        goto gen_simd;
2853    case INDEX_op_x86_packus_vec:
2854        insn = packus_insn[vece];
2855        goto gen_simd;
2856#if TCG_TARGET_REG_BITS == 32
2857    case INDEX_op_dup2_vec:
2858        /* Constraints have already placed both 32-bit inputs in xmm regs.  */
2859        insn = OPC_PUNPCKLDQ;
2860        goto gen_simd;
2861#endif
2862    case INDEX_op_abs_vec:
2863        insn = abs_insn[vece];
2864        a2 = a1;
2865        a1 = 0;
2866        goto gen_simd;
2867    gen_simd:
2868        tcg_debug_assert(insn != OPC_UD2);
2869        if (type == TCG_TYPE_V256) {
2870            insn |= P_VEXL;
2871        }
2872        tcg_out_vex_modrm(s, insn, a0, a1, a2);
2873        break;
2874
2875    case INDEX_op_cmp_vec:
2876        sub = args[3];
2877        if (sub == TCG_COND_EQ) {
2878            insn = cmpeq_insn[vece];
2879        } else if (sub == TCG_COND_GT) {
2880            insn = cmpgt_insn[vece];
2881        } else {
2882            g_assert_not_reached();
2883        }
2884        goto gen_simd;
2885
2886    case INDEX_op_andc_vec:
2887        insn = OPC_PANDN;
2888        if (type == TCG_TYPE_V256) {
2889            insn |= P_VEXL;
2890        }
2891        tcg_out_vex_modrm(s, insn, a0, a2, a1);
2892        break;
2893
2894    case INDEX_op_shli_vec:
2895        sub = 6;
2896        goto gen_shift;
2897    case INDEX_op_shri_vec:
2898        sub = 2;
2899        goto gen_shift;
2900    case INDEX_op_sari_vec:
2901        tcg_debug_assert(vece != MO_64);
2902        sub = 4;
2903    gen_shift:
2904        tcg_debug_assert(vece != MO_8);
2905        insn = shift_imm_insn[vece];
2906        if (type == TCG_TYPE_V256) {
2907            insn |= P_VEXL;
2908        }
2909        tcg_out_vex_modrm(s, insn, sub, a0, a1);
2910        tcg_out8(s, a2);
2911        break;
2912
2913    case INDEX_op_ld_vec:
2914        tcg_out_ld(s, type, a0, a1, a2);
2915        break;
2916    case INDEX_op_st_vec:
2917        tcg_out_st(s, type, a0, a1, a2);
2918        break;
2919    case INDEX_op_dupm_vec:
2920        tcg_out_dupm_vec(s, type, vece, a0, a1, a2);
2921        break;
2922
2923    case INDEX_op_x86_shufps_vec:
2924        insn = OPC_SHUFPS;
2925        sub = args[3];
2926        goto gen_simd_imm8;
2927    case INDEX_op_x86_blend_vec:
2928        if (vece == MO_16) {
2929            insn = OPC_PBLENDW;
2930        } else if (vece == MO_32) {
2931            insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS);
2932        } else {
2933            g_assert_not_reached();
2934        }
2935        sub = args[3];
2936        goto gen_simd_imm8;
2937    case INDEX_op_x86_vperm2i128_vec:
2938        insn = OPC_VPERM2I128;
2939        sub = args[3];
2940        goto gen_simd_imm8;
2941    gen_simd_imm8:
2942        if (type == TCG_TYPE_V256) {
2943            insn |= P_VEXL;
2944        }
2945        tcg_out_vex_modrm(s, insn, a0, a1, a2);
2946        tcg_out8(s, sub);
2947        break;
2948
2949    case INDEX_op_x86_vpblendvb_vec:
2950        insn = OPC_VPBLENDVB;
2951        if (type == TCG_TYPE_V256) {
2952            insn |= P_VEXL;
2953        }
2954        tcg_out_vex_modrm(s, insn, a0, a1, a2);
2955        tcg_out8(s, args[3] << 4);
2956        break;
2957
2958    case INDEX_op_x86_psrldq_vec:
2959        tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1);
2960        tcg_out8(s, a2);
2961        break;
2962
2963    case INDEX_op_mov_vec:  /* Always emitted via tcg_out_mov.  */
2964    case INDEX_op_dupi_vec: /* Always emitted via tcg_out_movi.  */
2965    case INDEX_op_dup_vec:  /* Always emitted via tcg_out_dup_vec.  */
2966    default:
2967        g_assert_not_reached();
2968    }
2969}
2970
2971static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
2972{
2973    static const TCGTargetOpDef r = { .args_ct_str = { "r" } };
2974    static const TCGTargetOpDef ri_r = { .args_ct_str = { "ri", "r" } };
2975    static const TCGTargetOpDef re_r = { .args_ct_str = { "re", "r" } };
2976    static const TCGTargetOpDef qi_r = { .args_ct_str = { "qi", "r" } };
2977    static const TCGTargetOpDef r_r = { .args_ct_str = { "r", "r" } };
2978    static const TCGTargetOpDef r_q = { .args_ct_str = { "r", "q" } };
2979    static const TCGTargetOpDef r_re = { .args_ct_str = { "r", "re" } };
2980    static const TCGTargetOpDef r_0 = { .args_ct_str = { "r", "0" } };
2981    static const TCGTargetOpDef r_r_ri = { .args_ct_str = { "r", "r", "ri" } };
2982    static const TCGTargetOpDef r_r_re = { .args_ct_str = { "r", "r", "re" } };
2983    static const TCGTargetOpDef r_0_r = { .args_ct_str = { "r", "0", "r" } };
2984    static const TCGTargetOpDef r_0_re = { .args_ct_str = { "r", "0", "re" } };
2985    static const TCGTargetOpDef r_0_ci = { .args_ct_str = { "r", "0", "ci" } };
2986    static const TCGTargetOpDef r_L = { .args_ct_str = { "r", "L" } };
2987    static const TCGTargetOpDef L_L = { .args_ct_str = { "L", "L" } };
2988    static const TCGTargetOpDef r_L_L = { .args_ct_str = { "r", "L", "L" } };
2989    static const TCGTargetOpDef r_r_L = { .args_ct_str = { "r", "r", "L" } };
2990    static const TCGTargetOpDef L_L_L = { .args_ct_str = { "L", "L", "L" } };
2991    static const TCGTargetOpDef r_r_L_L
2992        = { .args_ct_str = { "r", "r", "L", "L" } };
2993    static const TCGTargetOpDef L_L_L_L
2994        = { .args_ct_str = { "L", "L", "L", "L" } };
2995    static const TCGTargetOpDef x_x = { .args_ct_str = { "x", "x" } };
2996    static const TCGTargetOpDef x_x_x = { .args_ct_str = { "x", "x", "x" } };
2997    static const TCGTargetOpDef x_x_x_x
2998        = { .args_ct_str = { "x", "x", "x", "x" } };
2999    static const TCGTargetOpDef x_r = { .args_ct_str = { "x", "r" } };
3000
3001    switch (op) {
3002    case INDEX_op_goto_ptr:
3003        return &r;
3004
3005    case INDEX_op_ld8u_i32:
3006    case INDEX_op_ld8u_i64:
3007    case INDEX_op_ld8s_i32:
3008    case INDEX_op_ld8s_i64:
3009    case INDEX_op_ld16u_i32:
3010    case INDEX_op_ld16u_i64:
3011    case INDEX_op_ld16s_i32:
3012    case INDEX_op_ld16s_i64:
3013    case INDEX_op_ld_i32:
3014    case INDEX_op_ld32u_i64:
3015    case INDEX_op_ld32s_i64:
3016    case INDEX_op_ld_i64:
3017        return &r_r;
3018
3019    case INDEX_op_st8_i32:
3020    case INDEX_op_st8_i64:
3021        return &qi_r;
3022    case INDEX_op_st16_i32:
3023    case INDEX_op_st16_i64:
3024    case INDEX_op_st_i32:
3025    case INDEX_op_st32_i64:
3026        return &ri_r;
3027    case INDEX_op_st_i64:
3028        return &re_r;
3029
3030    case INDEX_op_add_i32:
3031    case INDEX_op_add_i64:
3032        return &r_r_re;
3033    case INDEX_op_sub_i32:
3034    case INDEX_op_sub_i64:
3035    case INDEX_op_mul_i32:
3036    case INDEX_op_mul_i64:
3037    case INDEX_op_or_i32:
3038    case INDEX_op_or_i64:
3039    case INDEX_op_xor_i32:
3040    case INDEX_op_xor_i64:
3041        return &r_0_re;
3042
3043    case INDEX_op_and_i32:
3044    case INDEX_op_and_i64:
3045        {
3046            static const TCGTargetOpDef and
3047                = { .args_ct_str = { "r", "0", "reZ" } };
3048            return &and;
3049        }
3050        break;
3051    case INDEX_op_andc_i32:
3052    case INDEX_op_andc_i64:
3053        {
3054            static const TCGTargetOpDef andc
3055                = { .args_ct_str = { "r", "r", "rI" } };
3056            return &andc;
3057        }
3058        break;
3059
3060    case INDEX_op_shl_i32:
3061    case INDEX_op_shl_i64:
3062    case INDEX_op_shr_i32:
3063    case INDEX_op_shr_i64:
3064    case INDEX_op_sar_i32:
3065    case INDEX_op_sar_i64:
3066        return have_bmi2 ? &r_r_ri : &r_0_ci;
3067    case INDEX_op_rotl_i32:
3068    case INDEX_op_rotl_i64:
3069    case INDEX_op_rotr_i32:
3070    case INDEX_op_rotr_i64:
3071        return &r_0_ci;
3072
3073    case INDEX_op_brcond_i32:
3074    case INDEX_op_brcond_i64:
3075        return &r_re;
3076
3077    case INDEX_op_bswap16_i32:
3078    case INDEX_op_bswap16_i64:
3079    case INDEX_op_bswap32_i32:
3080    case INDEX_op_bswap32_i64:
3081    case INDEX_op_bswap64_i64:
3082    case INDEX_op_neg_i32:
3083    case INDEX_op_neg_i64:
3084    case INDEX_op_not_i32:
3085    case INDEX_op_not_i64:
3086    case INDEX_op_extrh_i64_i32:
3087        return &r_0;
3088
3089    case INDEX_op_ext8s_i32:
3090    case INDEX_op_ext8s_i64:
3091    case INDEX_op_ext8u_i32:
3092    case INDEX_op_ext8u_i64:
3093        return &r_q;
3094    case INDEX_op_ext16s_i32:
3095    case INDEX_op_ext16s_i64:
3096    case INDEX_op_ext16u_i32:
3097    case INDEX_op_ext16u_i64:
3098    case INDEX_op_ext32s_i64:
3099    case INDEX_op_ext32u_i64:
3100    case INDEX_op_ext_i32_i64:
3101    case INDEX_op_extu_i32_i64:
3102    case INDEX_op_extrl_i64_i32:
3103    case INDEX_op_extract_i32:
3104    case INDEX_op_extract_i64:
3105    case INDEX_op_sextract_i32:
3106    case INDEX_op_ctpop_i32:
3107    case INDEX_op_ctpop_i64:
3108        return &r_r;
3109    case INDEX_op_extract2_i32:
3110    case INDEX_op_extract2_i64:
3111        return &r_0_r;
3112
3113    case INDEX_op_deposit_i32:
3114    case INDEX_op_deposit_i64:
3115        {
3116            static const TCGTargetOpDef dep
3117                = { .args_ct_str = { "Q", "0", "Q" } };
3118            return &dep;
3119        }
3120    case INDEX_op_setcond_i32:
3121    case INDEX_op_setcond_i64:
3122        {
3123            static const TCGTargetOpDef setc
3124                = { .args_ct_str = { "q", "r", "re" } };
3125            return &setc;
3126        }
3127    case INDEX_op_movcond_i32:
3128    case INDEX_op_movcond_i64:
3129        {
3130            static const TCGTargetOpDef movc
3131                = { .args_ct_str = { "r", "r", "re", "r", "0" } };
3132            return &movc;
3133        }
3134    case INDEX_op_div2_i32:
3135    case INDEX_op_div2_i64:
3136    case INDEX_op_divu2_i32:
3137    case INDEX_op_divu2_i64:
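            /* DIV/IDIV take the dividend in EDX:EAX and leave the quotient
               in EAX and the remainder in EDX, hence the fixed "a" and "d"
               outputs with the inputs aliased to them ("0", "1").  */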
3138        {
3139            static const TCGTargetOpDef div2
3140                = { .args_ct_str = { "a", "d", "0", "1", "r" } };
3141            return &div2;
3142        }
3143    case INDEX_op_mulu2_i32:
3144    case INDEX_op_mulu2_i64:
3145    case INDEX_op_muls2_i32:
3146    case INDEX_op_muls2_i64:
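            /* Widening MUL/IMUL take one multiplicand in EAX and produce
               the double-width product in EDX:EAX.  */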
3147        {
3148            static const TCGTargetOpDef mul2
3149                = { .args_ct_str = { "a", "d", "a", "r" } };
3150            return &mul2;
3151        }
3152    case INDEX_op_add2_i32:
3153    case INDEX_op_add2_i64:
3154    case INDEX_op_sub2_i32:
3155    case INDEX_op_sub2_i64:
3156        {
3157            static const TCGTargetOpDef arith2
3158                = { .args_ct_str = { "r", "r", "0", "1", "re", "re" } };
3159            return &arith2;
3160        }
3161    case INDEX_op_ctz_i32:
3162    case INDEX_op_ctz_i64:
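            /* BSF leaves its destination undefined for a zero input, so
               the fallback value must be in a register.  With BMI1's TZCNT
               a zero input yields the operand size, so a constant equal to
               the operand width ("W") is also accepted.  */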
3163        {
3164            static const TCGTargetOpDef ctz[2] = {
3165                { .args_ct_str = { "&r", "r", "r" } },
3166                { .args_ct_str = { "&r", "r", "rW" } },
3167            };
3168            return &ctz[have_bmi1];
3169        }
3170    case INDEX_op_clz_i32:
3171    case INDEX_op_clz_i64:
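            /* Likewise, with BSR vs LZCNT.  */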
3172        {
3173            static const TCGTargetOpDef clz[2] = {
3174                { .args_ct_str = { "&r", "r", "r" } },
3175                { .args_ct_str = { "&r", "r", "rW" } },
3176            };
3177            return &clz[have_lzcnt];
3178        }
3179
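        /* The guest address needs a second host register when
           TARGET_LONG_BITS exceeds the host register width.  */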
3180    case INDEX_op_qemu_ld_i32:
3181        return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_L : &r_L_L;
3182    case INDEX_op_qemu_st_i32:
3183        return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &L_L : &L_L_L;
3184    case INDEX_op_qemu_ld_i64:
3185        return (TCG_TARGET_REG_BITS == 64 ? &r_L
3186                : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_r_L
3187                : &r_r_L_L);
3188    case INDEX_op_qemu_st_i64:
3189        return (TCG_TARGET_REG_BITS == 64 ? &L_L
3190                : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &L_L_L
3191                : &L_L_L_L);
3192
3193    case INDEX_op_brcond2_i32:
3194        {
3195            static const TCGTargetOpDef b2
3196                = { .args_ct_str = { "r", "r", "ri", "ri" } };
3197            return &b2;
3198        }
3199    case INDEX_op_setcond2_i32:
3200        {
3201            static const TCGTargetOpDef s2
3202                = { .args_ct_str = { "r", "r", "r", "ri", "ri" } };
3203            return &s2;
3204        }
3205
3206    case INDEX_op_ld_vec:
3207    case INDEX_op_st_vec:
3208    case INDEX_op_dupm_vec:
3209        return &x_r;
3210
3211    case INDEX_op_add_vec:
3212    case INDEX_op_sub_vec:
3213    case INDEX_op_mul_vec:
3214    case INDEX_op_and_vec:
3215    case INDEX_op_or_vec:
3216    case INDEX_op_xor_vec:
3217    case INDEX_op_andc_vec:
3218    case INDEX_op_ssadd_vec:
3219    case INDEX_op_usadd_vec:
3220    case INDEX_op_sssub_vec:
3221    case INDEX_op_ussub_vec:
3222    case INDEX_op_smin_vec:
3223    case INDEX_op_umin_vec:
3224    case INDEX_op_smax_vec:
3225    case INDEX_op_umax_vec:
3226    case INDEX_op_shlv_vec:
3227    case INDEX_op_shrv_vec:
3228    case INDEX_op_sarv_vec:
3229    case INDEX_op_shls_vec:
3230    case INDEX_op_shrs_vec:
3231    case INDEX_op_sars_vec:
3232    case INDEX_op_cmp_vec:
3233    case INDEX_op_x86_shufps_vec:
3234    case INDEX_op_x86_blend_vec:
3235    case INDEX_op_x86_packss_vec:
3236    case INDEX_op_x86_packus_vec:
3237    case INDEX_op_x86_vperm2i128_vec:
3238    case INDEX_op_x86_punpckl_vec:
3239    case INDEX_op_x86_punpckh_vec:
3240#if TCG_TARGET_REG_BITS == 32
3241    case INDEX_op_dup2_vec:
3242#endif
3243        return &x_x_x;
3244    case INDEX_op_abs_vec:
3245    case INDEX_op_dup_vec:
3246    case INDEX_op_shli_vec:
3247    case INDEX_op_shri_vec:
3248    case INDEX_op_sari_vec:
3249    case INDEX_op_x86_psrldq_vec:
3250        return &x_x;
3251    case INDEX_op_x86_vpblendvb_vec:
3252        return &x_x_x_x;
3253
3254    default:
3255        break;
3256    }
3257    return NULL;
3258}
3259
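    /*
     * Report whether a vector opcode is supported for a given element
     * size: 1 if it can be emitted directly, 0 if it is not supported,
     * and -1 if it can be emitted via expansion in tcg_expand_vec_op
     * below.
     */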
3260int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
3261{
3262    switch (opc) {
3263    case INDEX_op_add_vec:
3264    case INDEX_op_sub_vec:
3265    case INDEX_op_and_vec:
3266    case INDEX_op_or_vec:
3267    case INDEX_op_xor_vec:
3268    case INDEX_op_andc_vec:
3269        return 1;
3270    case INDEX_op_cmp_vec:
3271    case INDEX_op_cmpsel_vec:
3272        return -1;
3273
3274    case INDEX_op_shli_vec:
3275    case INDEX_op_shri_vec:
3276        /* We must expand the operation for MO_8.  */
3277        return vece == MO_8 ? -1 : 1;
3278
3279    case INDEX_op_sari_vec:
3280        /* We must expand the operation for MO_8.  */
3281        if (vece == MO_8) {
3282            return -1;
3283        }
3284        /* We can emulate this for MO_64, but it does not pay off
3285           unless we're producing at least 4 values.  */
3286        if (vece == MO_64) {
3287            return type >= TCG_TYPE_V256 ? -1 : 0;
3288        }
3289        return 1;
3290
3291    case INDEX_op_shls_vec:
3292    case INDEX_op_shrs_vec:
3293        return vece >= MO_16;
3294    case INDEX_op_sars_vec:
3295        return vece >= MO_16 && vece <= MO_32;
3296
3297    case INDEX_op_shlv_vec:
3298    case INDEX_op_shrv_vec:
3299        return have_avx2 && vece >= MO_32;
3300    case INDEX_op_sarv_vec:
3301        return have_avx2 && vece == MO_32;
3302
3303    case INDEX_op_mul_vec:
3304        if (vece == MO_8) {
3305            /* We can expand the operation for MO_8.  */
3306            return -1;
3307        }
3308        if (vece == MO_64) {
3309            return 0;
3310        }
3311        return 1;
3312
3313    case INDEX_op_ssadd_vec:
3314    case INDEX_op_usadd_vec:
3315    case INDEX_op_sssub_vec:
3316    case INDEX_op_ussub_vec:
3317        return vece <= MO_16;
3318    case INDEX_op_smin_vec:
3319    case INDEX_op_smax_vec:
3320    case INDEX_op_umin_vec:
3321    case INDEX_op_umax_vec:
3322    case INDEX_op_abs_vec:
3323        return vece <= MO_32;
3324
3325    default:
3326        return 0;
3327    }
3328}
3329
3330static void expand_vec_shi(TCGType type, unsigned vece, bool shr,
3331                           TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3332{
3333    TCGv_vec t1, t2;
3334
3335    tcg_debug_assert(vece == MO_8);
3336
3337    t1 = tcg_temp_new_vec(type);
3338    t2 = tcg_temp_new_vec(type);
3339
3340    /* Unpack to W, shift, and repack.  Tricky bits:
3341       (1) Use punpck*bw x,x to produce DDCCBBAA,
3342           i.e. duplicate in other half of the 16-bit lane.
3343       (2) For right-shift, add 8 so that the high half of
3344           the lane becomes zero.  For left-shift, we must
3345           shift up and down again.
3346       (3) Step 2 leaves high half zero such that PACKUSWB
3347           (pack with unsigned saturation) does not modify
3348           the quantity.  */
3349    vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3350              tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3351    vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3352              tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3353
3354    if (shr) {
3355        tcg_gen_shri_vec(MO_16, t1, t1, imm + 8);
3356        tcg_gen_shri_vec(MO_16, t2, t2, imm + 8);
3357    } else {
3358        tcg_gen_shli_vec(MO_16, t1, t1, imm + 8);
3359        tcg_gen_shli_vec(MO_16, t2, t2, imm + 8);
3360        tcg_gen_shri_vec(MO_16, t1, t1, 8);
3361        tcg_gen_shri_vec(MO_16, t2, t2, 8);
3362    }
3363
3364    vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3365              tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3366    tcg_temp_free_vec(t1);
3367    tcg_temp_free_vec(t2);
3368}
3369
3370static void expand_vec_sari(TCGType type, unsigned vece,
3371                            TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3372{
3373    TCGv_vec t1, t2;
3374
3375    switch (vece) {
3376    case MO_8:
3377        /* Unpack to W, shift, and repack, as in expand_vec_shi.  */
3378        t1 = tcg_temp_new_vec(type);
3379        t2 = tcg_temp_new_vec(type);
3380        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3381                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3382        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3383                  tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3384        tcg_gen_sari_vec(MO_16, t1, t1, imm + 8);
3385        tcg_gen_sari_vec(MO_16, t2, t2, imm + 8);
3386        vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
3387                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3388        tcg_temp_free_vec(t1);
3389        tcg_temp_free_vec(t2);
3390        break;
3391
3392    case MO_64:
3393        if (imm <= 32) {
3394            /* We can emulate a small sign extend by performing an arithmetic
3395             * 32-bit shift and overwriting the high half of a 64-bit logical
3396             * shift (note that the ISA says shift of 32 is valid).
3397             */
3398            t1 = tcg_temp_new_vec(type);
3399            tcg_gen_sari_vec(MO_32, t1, v1, imm);
3400            tcg_gen_shri_vec(MO_64, v0, v1, imm);
3401            vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
3402                      tcgv_vec_arg(v0), tcgv_vec_arg(v0),
3403                      tcgv_vec_arg(t1), 0xaa);
3404            tcg_temp_free_vec(t1);
3405        } else {
3406            /* Otherwise we will need to use a compare vs 0 to produce
3407             * the sign-extend, shift and merge.
3408             */
3409            t1 = tcg_const_zeros_vec(type);
3410            tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1, t1, v1);
3411            tcg_gen_shri_vec(MO_64, v0, v1, imm);
3412            tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm);
3413            tcg_gen_or_vec(MO_64, v0, v0, t1);
3414            tcg_temp_free_vec(t1);
3415        }
3416        break;
3417
3418    default:
3419        g_assert_not_reached();
3420    }
3421}
3422
3423static void expand_vec_mul(TCGType type, unsigned vece,
3424                           TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
3425{
3426    TCGv_vec t1, t2, t3, t4;
3427
3428    tcg_debug_assert(vece == MO_8);
3429
3430    /*
3431     * Unpack v1 bytes to words, 0 | x.
3432     * Unpack v2 bytes to words, y | 0.
3433     * This leaves the 8-bit result, x * y, with 8 bits of right padding.
3434     * Shift logical right by 8 bits to clear the high 8 bits before
3435     * using an unsigned saturated pack.
3436     *
3437     * The difference between the V64, V128 and V256 cases is merely how
3438     * we distribute the expansion between temporaries.
3439     */
3440    switch (type) {
3441    case TCG_TYPE_V64:
3442        t1 = tcg_temp_new_vec(TCG_TYPE_V128);
3443        t2 = tcg_temp_new_vec(TCG_TYPE_V128);
3444        tcg_gen_dup16i_vec(t2, 0);
3445        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3446                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(t2));
3447        vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3448                  tcgv_vec_arg(t2), tcgv_vec_arg(t2), tcgv_vec_arg(v2));
3449        tcg_gen_mul_vec(MO_16, t1, t1, t2);
3450        tcg_gen_shri_vec(MO_16, t1, t1, 8);
3451        vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
3452                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1));
3453        tcg_temp_free_vec(t1);
3454        tcg_temp_free_vec(t2);
3455        break;
3456
3457    case TCG_TYPE_V128:
3458    case TCG_TYPE_V256:
3459        t1 = tcg_temp_new_vec(type);
3460        t2 = tcg_temp_new_vec(type);
3461        t3 = tcg_temp_new_vec(type);
3462        t4 = tcg_temp_new_vec(type);
3463        tcg_gen_dup16i_vec(t4, 0);
3464        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3465                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(t4));
3466        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3467                  tcgv_vec_arg(t2), tcgv_vec_arg(t4), tcgv_vec_arg(v2));
3468        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3469                  tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(t4));
3470        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3471                  tcgv_vec_arg(t4), tcgv_vec_arg(t4), tcgv_vec_arg(v2));
3472        tcg_gen_mul_vec(MO_16, t1, t1, t2);
3473        tcg_gen_mul_vec(MO_16, t3, t3, t4);
3474        tcg_gen_shri_vec(MO_16, t1, t1, 8);
3475        tcg_gen_shri_vec(MO_16, t3, t3, 8);
3476        vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3477                  tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
3478        tcg_temp_free_vec(t1);
3479        tcg_temp_free_vec(t2);
3480        tcg_temp_free_vec(t3);
3481        tcg_temp_free_vec(t4);
3482        break;
3483
3484    default:
3485        g_assert_not_reached();
3486    }
3487}
3488
3489static bool expand_vec_cmp_noinv(TCGType type, unsigned vece, TCGv_vec v0,
3490                                 TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3491{
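        /*
         * SSE/AVX integer compares provide only equality (PCMPEQ) and
         * signed greater-than (PCMPGT).  Other conditions are synthesized
         * by swapping operands, inverting the result, biasing both
         * operands by the sign bit so an unsigned compare becomes a
         * signed one, or, for elements up to 32 bits, using unsigned
         * min/max followed by an equality test.
         */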
3492    enum {
3493        NEED_INV  = 1,
3494        NEED_SWAP = 2,
3495        NEED_BIAS = 4,
3496        NEED_UMIN = 8,
3497        NEED_UMAX = 16,
3498    };
3499    TCGv_vec t1, t2;
3500    uint8_t fixup;
3501
3502    switch (cond) {
3503    case TCG_COND_EQ:
3504    case TCG_COND_GT:
3505        fixup = 0;
3506        break;
3507    case TCG_COND_NE:
3508    case TCG_COND_LE:
3509        fixup = NEED_INV;
3510        break;
3511    case TCG_COND_LT:
3512        fixup = NEED_SWAP;
3513        break;
3514    case TCG_COND_GE:
3515        fixup = NEED_SWAP | NEED_INV;
3516        break;
3517    case TCG_COND_LEU:
3518        if (vece <= MO_32) {
3519            fixup = NEED_UMIN;
3520        } else {
3521            fixup = NEED_BIAS | NEED_INV;
3522        }
3523        break;
3524    case TCG_COND_GTU:
3525        if (vece <= MO_32) {
3526            fixup = NEED_UMIN | NEED_INV;
3527        } else {
3528            fixup = NEED_BIAS;
3529        }
3530        break;
3531    case TCG_COND_GEU:
3532        if (vece <= MO_32) {
3533            fixup = NEED_UMAX;
3534        } else {
3535            fixup = NEED_BIAS | NEED_SWAP | NEED_INV;
3536        }
3537        break;
3538    case TCG_COND_LTU:
3539        if (vece <= MO_32) {
3540            fixup = NEED_UMAX | NEED_INV;
3541        } else {
3542            fixup = NEED_BIAS | NEED_SWAP;
3543        }
3544        break;
3545    default:
3546        g_assert_not_reached();
3547    }
3548
3549    if (fixup & NEED_INV) {
3550        cond = tcg_invert_cond(cond);
3551    }
3552    if (fixup & NEED_SWAP) {
3553        t1 = v1, v1 = v2, v2 = t1;
3554        cond = tcg_swap_cond(cond);
3555    }
3556
3557    t1 = t2 = NULL;
3558    if (fixup & (NEED_UMIN | NEED_UMAX)) {
3559        t1 = tcg_temp_new_vec(type);
3560        if (fixup & NEED_UMIN) {
3561            tcg_gen_umin_vec(vece, t1, v1, v2);
3562        } else {
3563            tcg_gen_umax_vec(vece, t1, v1, v2);
3564        }
3565        v2 = t1;
3566        cond = TCG_COND_EQ;
3567    } else if (fixup & NEED_BIAS) {
3568        t1 = tcg_temp_new_vec(type);
3569        t2 = tcg_temp_new_vec(type);
3570        tcg_gen_dupi_vec(vece, t2, 1ull << ((8 << vece) - 1));
3571        tcg_gen_sub_vec(vece, t1, v1, t2);
3572        tcg_gen_sub_vec(vece, t2, v2, t2);
3573        v1 = t1;
3574        v2 = t2;
3575        cond = tcg_signed_cond(cond);
3576    }
3577
3578    tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
3579    /* Expand directly; do not recurse.  */
3580    vec_gen_4(INDEX_op_cmp_vec, type, vece,
3581              tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v2), cond);
3582
3583    if (t1) {
3584        tcg_temp_free_vec(t1);
3585        if (t2) {
3586            tcg_temp_free_vec(t2);
3587        }
3588    }
3589    return fixup & NEED_INV;
3590}
3591
3592static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
3593                           TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3594{
3595    if (expand_vec_cmp_noinv(type, vece, v0, v1, v2, cond)) {
3596        tcg_gen_not_vec(vece, v0, v0);
3597    }
3598}
3599
3600static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGv_vec v0,
3601                              TCGv_vec c1, TCGv_vec c2,
3602                              TCGv_vec v3, TCGv_vec v4, TCGCond cond)
3603{
3604    TCGv_vec t = tcg_temp_new_vec(type);
3605
3606    if (expand_vec_cmp_noinv(type, vece, t, c1, c2, cond)) {
3607        /* Invert the sense of the compare by swapping arguments.  */
3608        TCGv_vec x;
3609        x = v3, v3 = v4, v4 = x;
3610    }
3611    vec_gen_4(INDEX_op_x86_vpblendvb_vec, type, vece,
3612              tcgv_vec_arg(v0), tcgv_vec_arg(v4),
3613              tcgv_vec_arg(v3), tcgv_vec_arg(t));
3614    tcg_temp_free_vec(t);
3615}
3616
3617void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
3618                       TCGArg a0, ...)
3619{
3620    va_list va;
3621    TCGArg a2;
3622    TCGv_vec v0, v1, v2, v3, v4;
3623
3624    va_start(va, a0);
3625    v0 = temp_tcgv_vec(arg_temp(a0));
3626    v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3627    a2 = va_arg(va, TCGArg);
3628
3629    switch (opc) {
3630    case INDEX_op_shli_vec:
3631    case INDEX_op_shri_vec:
3632        expand_vec_shi(type, vece, opc == INDEX_op_shri_vec, v0, v1, a2);
3633        break;
3634
3635    case INDEX_op_sari_vec:
3636        expand_vec_sari(type, vece, v0, v1, a2);
3637        break;
3638
3639    case INDEX_op_mul_vec:
3640        v2 = temp_tcgv_vec(arg_temp(a2));
3641        expand_vec_mul(type, vece, v0, v1, v2);
3642        break;
3643
3644    case INDEX_op_cmp_vec:
3645        v2 = temp_tcgv_vec(arg_temp(a2));
3646        expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg));
3647        break;
3648
3649    case INDEX_op_cmpsel_vec:
3650        v2 = temp_tcgv_vec(arg_temp(a2));
3651        v3 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3652        v4 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3653        expand_vec_cmpsel(type, vece, v0, v1, v2, v3, v4, va_arg(va, TCGArg));
3654        break;
3655
3656    default:
3657        break;
3658    }
3659
3660    va_end(va);
3661}
3662
3663static const int tcg_target_callee_save_regs[] = {
3664#if TCG_TARGET_REG_BITS == 64
3665    TCG_REG_RBP,
3666    TCG_REG_RBX,
3667#if defined(_WIN64)
3668    TCG_REG_RDI,
3669    TCG_REG_RSI,
3670#endif
3671    TCG_REG_R12,
3672    TCG_REG_R13,
3673    TCG_REG_R14, /* Currently used for the global env. */
3674    TCG_REG_R15,
3675#else
3676    TCG_REG_EBP, /* Currently used for the global env. */
3677    TCG_REG_EBX,
3678    TCG_REG_ESI,
3679    TCG_REG_EDI,
3680#endif
3681};
3682
3683/* Compute frame size via macros, to share between tcg_target_qemu_prologue
3684   and tcg_register_jit.  */
3685
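    /* The "1 +" below accounts for the return address pushed by our caller.  */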
3686#define PUSH_SIZE \
3687    ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \
3688     * (TCG_TARGET_REG_BITS / 8))
3689
3690#define FRAME_SIZE \
3691    ((PUSH_SIZE \
3692      + TCG_STATIC_CALL_ARGS_SIZE \
3693      + CPU_TEMP_BUF_NLONGS * sizeof(long) \
3694      + TCG_TARGET_STACK_ALIGN - 1) \
3695     & ~(TCG_TARGET_STACK_ALIGN - 1))
3696
3697/* Generate global QEMU prologue and epilogue code */
3698static void tcg_target_qemu_prologue(TCGContext *s)
3699{
3700    int i, stack_addend;
3701
3702    /* TB prologue */
3703
3704    /* Reserve some stack space, also for TCG temps.  */
3705    stack_addend = FRAME_SIZE - PUSH_SIZE;
3706    tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
3707                  CPU_TEMP_BUF_NLONGS * sizeof(long));
3708
3709    /* Save all callee saved registers.  */
3710    for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
3711        tcg_out_push(s, tcg_target_callee_save_regs[i]);
3712    }
3713
3714#if TCG_TARGET_REG_BITS == 32
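        /* In the 32-bit ABI the arguments are on the stack; load env (the
           first argument) from above the saved registers and return
           address.  */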
3715    tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
3716               (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
3717    tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
3718    /* jmp *tb.  */
3719    tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
3720                         (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
3721                         + stack_addend);
3722#else
3723# if !defined(CONFIG_SOFTMMU) && TCG_TARGET_REG_BITS == 64
3724    if (guest_base) {
3725        int seg = setup_guest_base_seg();
3726        if (seg != 0) {
3727            x86_guest_base_seg = seg;
3728        } else if (guest_base == (int32_t)guest_base) {
3729            x86_guest_base_offset = guest_base;
3730        } else {
3731            /* Choose R12 because, as a base, it requires a SIB byte. */
3732            x86_guest_base_index = TCG_REG_R12;
3733            tcg_out_mov(s, TCG_TYPE_PTR, x86_guest_base_index, guest_base);
3734            tcg_regset_set_reg(s->reserved_regs, x86_guest_base_index);
3735        }
3736    }
3737# endif
3738    tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
3739    tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
3740    /* jmp *tb.  */
3741    tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
3742#endif
3743
3744    /*
3745     * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
3746     * and fall through to the rest of the epilogue.
3747     */
3748    s->code_gen_epilogue = s->code_ptr;
3749    tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);
3750
3751    /* TB epilogue */
3752    tb_ret_addr = s->code_ptr;
3753
3754    tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);
3755
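        /* VZEROUPPER avoids the AVX-to-SSE transition penalty when the
           caller goes on to execute legacy SSE instructions.  */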
3756    if (have_avx2) {
3757        tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
3758    }
3759    for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
3760        tcg_out_pop(s, tcg_target_callee_save_regs[i]);
3761    }
3762    tcg_out_opc(s, OPC_RET, 0, 0, 0);
3763}
3764
3765static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
3766{
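        /* 0x90 is the single-byte x86 NOP instruction.  */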
3767    memset(p, 0x90, count);
3768}
3769
3770static void tcg_target_init(TCGContext *s)
3771{
3772#ifdef CONFIG_CPUID_H
3773    unsigned a, b, c, d, b7 = 0;
3774    int max = __get_cpuid_max(0, 0);
3775
3776    if (max >= 7) {
3777        /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs.  */
3778        __cpuid_count(7, 0, a, b7, c, d);
3779        have_bmi1 = (b7 & bit_BMI) != 0;
3780        have_bmi2 = (b7 & bit_BMI2) != 0;
3781    }
3782
3783    if (max >= 1) {
3784        __cpuid(1, a, b, c, d);
3785#ifndef have_cmov
3786        /* For 32-bit, 99% certainty that we're running on hardware that
3787           supports cmov, but we still need to check.  In case cmov is not
3788           available, we'll use a small forward branch.  */
3789        have_cmov = (d & bit_CMOV) != 0;
3790#endif
3791
3792        /* MOVBE is only available on some CPUs (e.g. Intel Atom and
3793           Haswell), so we need to probe for it.  */
3794        have_movbe = (c & bit_MOVBE) != 0;
3795        have_popcnt = (c & bit_POPCNT) != 0;
3796
3797        /* There are a number of things we must check before we can be
3798           sure of not hitting invalid opcode.  */
3799        if (c & bit_OSXSAVE) {
3800            unsigned xcrl, xcrh;
3801            /* The xgetbv instruction is not available to older versions of
3802             * the assembler, so we encode the instruction manually.
3803             */
3804            asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcrl), "=d" (xcrh) : "c" (0));
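                /* XCR0 bits 1 and 2 indicate that the OS has enabled SSE
                   (XMM) and AVX (YMM) state, respectively.  */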
3805            if ((xcrl & 6) == 6) {
3806                have_avx1 = (c & bit_AVX) != 0;
3807                have_avx2 = (b7 & bit_AVX2) != 0;
3808            }
3809        }
3810    }
3811
3812    max = __get_cpuid_max(0x80000000, 0);
3813    if (max >= 1) {
3814        __cpuid(0x80000001, a, b, c, d);
3815        /* LZCNT was introduced with AMD Barcelona and Intel Haswell CPUs.  */
3816        have_lzcnt = (c & bit_LZCNT) != 0;
3817    }
3818#endif /* CONFIG_CPUID_H */
3819
3820    tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
3821    if (TCG_TARGET_REG_BITS == 64) {
3822        tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
3823    }
3824    if (have_avx1) {
3825        tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
3826        tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
3827    }
3828    if (have_avx2) {
3829        tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
3830    }
3831
3832    tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
3833    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
3834    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
3835    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
3836    if (TCG_TARGET_REG_BITS == 64) {
3837#if !defined(_WIN64)
3838        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
3839        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
3840#endif
3841        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
3842        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
3843        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
3844        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
3845    }
3846
3847    s->reserved_regs = 0;
3848    tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
3849}
3850
3851typedef struct {
3852    DebugFrameHeader h;
3853    uint8_t fde_def_cfa[4];
3854    uint8_t fde_reg_ofs[14];
3855} DebugFrame;
3856
3857/* We're expecting a 2 byte uleb128 encoded value; two uleb128 bytes
       carry 14 bits of payload, hence the bound below.  */
3858QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
3859
3860#if !defined(__ELF__)
3861    /* Host machine without ELF. */
3862#elif TCG_TARGET_REG_BITS == 64
3863#define ELF_HOST_MACHINE EM_X86_64
3864static const DebugFrame debug_frame = {
3865    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
3866    .h.cie.id = -1,
3867    .h.cie.version = 1,
3868    .h.cie.code_align = 1,
3869    .h.cie.data_align = 0x78,             /* sleb128 -8 */
3870    .h.cie.return_column = 16,
3871
3872    /* Total FDE size does not include the "len" member.  */
3873    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
3874
3875    .fde_def_cfa = {
3876        12, 7,                          /* DW_CFA_def_cfa %rsp, ... */
3877        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
3878        (FRAME_SIZE >> 7)
3879    },
3880    .fde_reg_ofs = {
3881        0x90, 1,                        /* DW_CFA_offset, %rip, -8 */
3882        /* The following ordering must match tcg_target_callee_save_regs.  */
3883        0x86, 2,                        /* DW_CFA_offset, %rbp, -16 */
3884        0x83, 3,                        /* DW_CFA_offset, %rbx, -24 */
3885        0x8c, 4,                        /* DW_CFA_offset, %r12, -32 */
3886        0x8d, 5,                        /* DW_CFA_offset, %r13, -40 */
3887        0x8e, 6,                        /* DW_CFA_offset, %r14, -48 */
3888        0x8f, 7,                        /* DW_CFA_offset, %r15, -56 */
3889    }
3890};
3891#else
3892#define ELF_HOST_MACHINE EM_386
3893static const DebugFrame debug_frame = {
3894    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
3895    .h.cie.id = -1,
3896    .h.cie.version = 1,
3897    .h.cie.code_align = 1,
3898    .h.cie.data_align = 0x7c,             /* sleb128 -4 */
3899    .h.cie.return_column = 8,
3900
3901    /* Total FDE size does not include the "len" member.  */
3902    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
3903
3904    .fde_def_cfa = {
3905        12, 4,                          /* DW_CFA_def_cfa %esp, ... */
3906        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
3907        (FRAME_SIZE >> 7)
3908    },
3909    .fde_reg_ofs = {
3910        0x88, 1,                        /* DW_CFA_offset, %eip, -4 */
3911        /* The following ordering must match tcg_target_callee_save_regs.  */
3912        0x85, 2,                        /* DW_CFA_offset, %ebp, -8 */
3913        0x83, 3,                        /* DW_CFA_offset, %ebx, -12 */
3914        0x86, 4,                        /* DW_CFA_offset, %esi, -16 */
3915        0x87, 5,                        /* DW_CFA_offset, %edi, -20 */
3916    }
3917};
3918#endif
3919
3920#if defined(ELF_HOST_MACHINE)
3921void tcg_register_jit(void *buf, size_t buf_size)
3922{
3923    tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
3924}
3925#endif
3926