qemu/tcg/i386/tcg-target.inc.c
/*
 * Tiny Code Generator for QEMU
 *
 * Copyright (c) 2008 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "tcg-be-ldst.h"

#ifdef CONFIG_DEBUG_TCG
static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
#if TCG_TARGET_REG_BITS == 64
    "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
    "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
#else
    "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
#endif
};
#endif

static const int tcg_target_reg_alloc_order[] = {
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_RBP,
    TCG_REG_RBX,
    TCG_REG_R12,
    TCG_REG_R13,
    TCG_REG_R14,
    TCG_REG_R15,
    TCG_REG_R10,
    TCG_REG_R11,
    TCG_REG_R9,
    TCG_REG_R8,
    TCG_REG_RCX,
    TCG_REG_RDX,
    TCG_REG_RSI,
    TCG_REG_RDI,
    TCG_REG_RAX,
#else
    TCG_REG_EBX,
    TCG_REG_ESI,
    TCG_REG_EDI,
    TCG_REG_EBP,
    TCG_REG_ECX,
    TCG_REG_EDX,
    TCG_REG_EAX,
#endif
};

static const int tcg_target_call_iarg_regs[] = {
#if TCG_TARGET_REG_BITS == 64
#if defined(_WIN64)
    TCG_REG_RCX,
    TCG_REG_RDX,
#else
    TCG_REG_RDI,
    TCG_REG_RSI,
    TCG_REG_RDX,
    TCG_REG_RCX,
#endif
    TCG_REG_R8,
    TCG_REG_R9,
#else
    /* 32 bit mode uses stack based calling convention (GCC default). */
#endif
};

static const int tcg_target_call_oarg_regs[] = {
    TCG_REG_EAX,
#if TCG_TARGET_REG_BITS == 32
    TCG_REG_EDX
#endif
};

/* Constants we accept.  */
#define TCG_CT_CONST_S32 0x100
#define TCG_CT_CONST_U32 0x200
#define TCG_CT_CONST_I32 0x400
#define TCG_CT_CONST_WSZ 0x800

/* Registers used with the L constraint, which are the first two argument
   registers on x86_64, and two arbitrary call-clobbered registers on
   i386.  */
#if TCG_TARGET_REG_BITS == 64
# define TCG_REG_L0 tcg_target_call_iarg_regs[0]
# define TCG_REG_L1 tcg_target_call_iarg_regs[1]
#else
# define TCG_REG_L0 TCG_REG_EAX
# define TCG_REG_L1 TCG_REG_EDX
#endif

/* The host compiler should supply <cpuid.h> to enable runtime feature
   detection; we are not going to go so far as to write our own inline
   assembly.  If it is not available, default values are assumed.  */
#if defined(CONFIG_CPUID_H)
#include "qemu/cpuid.h"
#endif

/* For 64-bit, we always know that CMOV is available.  */
#if TCG_TARGET_REG_BITS == 64
# define have_cmov 1
#elif defined(CONFIG_CPUID_H)
static bool have_cmov;
#else
# define have_cmov 0
#endif

/* We need these symbols in tcg-target.h, and we can't properly conditionalize
   them there.  Therefore we always define the variables.  */
bool have_bmi1;
bool have_popcnt;

#ifdef CONFIG_CPUID_H
static bool have_movbe;
static bool have_bmi2;
static bool have_lzcnt;
#else
# define have_movbe 0
# define have_bmi2 0
# define have_lzcnt 0
#endif
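
/* A minimal sketch, assuming <cpuid.h> semantics, of how these feature
   flags can be probed at startup (the actual probing is done in
   tcg_target_init).  Bit positions follow the Intel SDM; __get_cpuid_count
   may need a fallback on older compilers, and have_cmov is assignable only
   where it is declared as a variable above:

       unsigned a, b, c, d;
       if (__get_cpuid(1, &a, &b, &c, &d)) {
           have_movbe  = (c >> 22) & 1;              MOVBE
           have_popcnt = (c >> 23) & 1;              POPCNT
       }
       if (__get_cpuid_count(7, 0, &a, &b, &c, &d)) {
           have_bmi1 = (b >> 3) & 1;                 BMI1: ANDN, TZCNT
           have_bmi2 = (b >> 8) & 1;                 BMI2: SHLX/SHRX/SARX
       }
       if (__get_cpuid(0x80000001, &a, &b, &c, &d)) {
           have_lzcnt = (c >> 5) & 1;                ABM: LZCNT
       }
*/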

static tcg_insn_unit *tb_ret_addr;

static void patch_reloc(tcg_insn_unit *code_ptr, int type,
                        intptr_t value, intptr_t addend)
{
    value += addend;
    switch(type) {
    case R_386_PC32:
        value -= (uintptr_t)code_ptr;
        if (value != (int32_t)value) {
            tcg_abort();
        }
        tcg_patch32(code_ptr, value);
        break;
    case R_386_PC8:
        value -= (uintptr_t)code_ptr;
        if (value != (int8_t)value) {
            tcg_abort();
        }
        tcg_patch8(code_ptr, value);
        break;
    default:
        tcg_abort();
    }
}
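
/* For illustration: the R_386_PC32 arithmetic above resolves a 4-byte
   pc-relative field.  The hardware counts the displacement from the end
   of the field, which is why callers such as tcg_out_jxx below pass
   addend = -4 (and -1 for R_386_PC8).  A hypothetical standalone
   equivalent of the 32-bit case:

       static void patch_pc32(uint8_t *field, const uint8_t *target)
       {
           int32_t disp = (int32_t)(target - (field + 4));
           memcpy(field, &disp, 4);
       }
*/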

/* parse target specific constraints */
static const char *target_parse_constraint(TCGArgConstraint *ct,
                                           const char *ct_str, TCGType type)
{
    switch(*ct_str++) {
    case 'a':
        ct->ct |= TCG_CT_REG;
        tcg_regset_set_reg(ct->u.regs, TCG_REG_EAX);
        break;
    case 'b':
        ct->ct |= TCG_CT_REG;
        tcg_regset_set_reg(ct->u.regs, TCG_REG_EBX);
        break;
    case 'c':
        ct->ct |= TCG_CT_REG;
        tcg_regset_set_reg(ct->u.regs, TCG_REG_ECX);
        break;
    case 'd':
        ct->ct |= TCG_CT_REG;
        tcg_regset_set_reg(ct->u.regs, TCG_REG_EDX);
        break;
    case 'S':
        ct->ct |= TCG_CT_REG;
        tcg_regset_set_reg(ct->u.regs, TCG_REG_ESI);
        break;
    case 'D':
        ct->ct |= TCG_CT_REG;
        tcg_regset_set_reg(ct->u.regs, TCG_REG_EDI);
        break;
    case 'q':
        ct->ct |= TCG_CT_REG;
        if (TCG_TARGET_REG_BITS == 64) {
            tcg_regset_set32(ct->u.regs, 0, 0xffff);
        } else {
            tcg_regset_set32(ct->u.regs, 0, 0xf);
        }
        break;
    case 'Q':
        ct->ct |= TCG_CT_REG;
        tcg_regset_set32(ct->u.regs, 0, 0xf);
        break;
    case 'r':
        ct->ct |= TCG_CT_REG;
        if (TCG_TARGET_REG_BITS == 64) {
            tcg_regset_set32(ct->u.regs, 0, 0xffff);
        } else {
            tcg_regset_set32(ct->u.regs, 0, 0xff);
        }
        break;
    case 'W':
        /* With TZCNT/LZCNT, we can have operand-size as an input.  */
        ct->ct |= TCG_CT_CONST_WSZ;
        break;

        /* qemu_ld/st address constraint */
    case 'L':
        ct->ct |= TCG_CT_REG;
        if (TCG_TARGET_REG_BITS == 64) {
            tcg_regset_set32(ct->u.regs, 0, 0xffff);
        } else {
            tcg_regset_set32(ct->u.regs, 0, 0xff);
        }
        tcg_regset_reset_reg(ct->u.regs, TCG_REG_L0);
        tcg_regset_reset_reg(ct->u.regs, TCG_REG_L1);
        break;

    case 'e':
        ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_S32);
        break;
    case 'Z':
        ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_U32);
        break;
    case 'I':
        ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_I32);
        break;

    default:
        return NULL;
    }
    return ct_str;
}

/* test if a constant matches the constraint */
static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
                                         const TCGArgConstraint *arg_ct)
{
    int ct = arg_ct->ct;
    if (ct & TCG_CT_CONST) {
        return 1;
    }
    if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
        return 1;
    }
    if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
        return 1;
    }
    if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
        return 1;
    }
    if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
        return 1;
    }
    return 0;
}

#if TCG_TARGET_REG_BITS == 64
# define LOWREGMASK(x)  ((x) & 7)
#else
# define LOWREGMASK(x)  (x)
#endif

#define P_EXT           0x100           /* 0x0f opcode prefix */
#define P_EXT38         0x200           /* 0x0f 0x38 opcode prefix */
#define P_DATA16        0x400           /* 0x66 opcode prefix */
#if TCG_TARGET_REG_BITS == 64
# define P_ADDR32       0x800           /* 0x67 opcode prefix */
# define P_REXW         0x1000          /* Set REX.W = 1 */
# define P_REXB_R       0x2000          /* REG field as byte register */
# define P_REXB_RM      0x4000          /* R/M field as byte register */
# define P_GS           0x8000          /* gs segment override */
#else
# define P_ADDR32       0
# define P_REXW         0
# define P_REXB_R       0
# define P_REXB_RM      0
# define P_GS           0
#endif
#define P_SIMDF3        0x10000         /* 0xf3 opcode prefix */
#define P_SIMDF2        0x20000         /* 0xf2 opcode prefix */

#define OPC_ARITH_EvIz  (0x81)
#define OPC_ARITH_EvIb  (0x83)
#define OPC_ARITH_GvEv  (0x03)          /* ... plus (ARITH_FOO << 3) */
#define OPC_ANDN        (0xf2 | P_EXT38)
#define OPC_ADD_GvEv    (OPC_ARITH_GvEv | (ARITH_ADD << 3))
#define OPC_BSF         (0xbc | P_EXT)
#define OPC_BSR         (0xbd | P_EXT)
#define OPC_BSWAP       (0xc8 | P_EXT)
#define OPC_CALL_Jz     (0xe8)
#define OPC_CMOVCC      (0x40 | P_EXT)  /* ... plus condition code */
#define OPC_CMP_GvEv    (OPC_ARITH_GvEv | (ARITH_CMP << 3))
#define OPC_DEC_r32     (0x48)
#define OPC_IMUL_GvEv   (0xaf | P_EXT)
#define OPC_IMUL_GvEvIb (0x6b)
#define OPC_IMUL_GvEvIz (0x69)
#define OPC_INC_r32     (0x40)
#define OPC_JCC_long    (0x80 | P_EXT)  /* ... plus condition code */
#define OPC_JCC_short   (0x70)          /* ... plus condition code */
#define OPC_JMP_long    (0xe9)
#define OPC_JMP_short   (0xeb)
#define OPC_LEA         (0x8d)
#define OPC_LZCNT       (0xbd | P_EXT | P_SIMDF3)
#define OPC_MOVB_EvGv   (0x88)          /* stores, more or less */
#define OPC_MOVL_EvGv   (0x89)          /* stores, more or less */
#define OPC_MOVL_GvEv   (0x8b)          /* loads, more or less */
#define OPC_MOVB_EvIz   (0xc6)
#define OPC_MOVL_EvIz   (0xc7)
#define OPC_MOVL_Iv     (0xb8)
#define OPC_MOVBE_GyMy  (0xf0 | P_EXT38)
#define OPC_MOVBE_MyGy  (0xf1 | P_EXT38)
#define OPC_MOVSBL      (0xbe | P_EXT)
#define OPC_MOVSWL      (0xbf | P_EXT)
#define OPC_MOVSLQ      (0x63 | P_REXW)
#define OPC_MOVZBL      (0xb6 | P_EXT)
#define OPC_MOVZWL      (0xb7 | P_EXT)
#define OPC_POP_r32     (0x58)
#define OPC_POPCNT      (0xb8 | P_EXT | P_SIMDF3)
#define OPC_PUSH_r32    (0x50)
#define OPC_PUSH_Iv     (0x68)
#define OPC_PUSH_Ib     (0x6a)
#define OPC_RET         (0xc3)
#define OPC_SETCC       (0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
#define OPC_SHIFT_1     (0xd1)
#define OPC_SHIFT_Ib    (0xc1)
#define OPC_SHIFT_cl    (0xd3)
#define OPC_SARX        (0xf7 | P_EXT38 | P_SIMDF3)
#define OPC_SHLX        (0xf7 | P_EXT38 | P_DATA16)
#define OPC_SHRX        (0xf7 | P_EXT38 | P_SIMDF2)
#define OPC_TESTL       (0x85)
#define OPC_TZCNT       (0xbc | P_EXT | P_SIMDF3)
#define OPC_XCHG_ax_r32 (0x90)

#define OPC_GRP3_Ev     (0xf7)
#define OPC_GRP5        (0xff)

/* Group 1 opcode extensions for 0x80-0x83.
   These are also used as modifiers for OPC_ARITH.  */
#define ARITH_ADD 0
#define ARITH_OR  1
#define ARITH_ADC 2
#define ARITH_SBB 3
#define ARITH_AND 4
#define ARITH_SUB 5
#define ARITH_XOR 6
#define ARITH_CMP 7

/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3.  */
#define SHIFT_ROL 0
#define SHIFT_ROR 1
#define SHIFT_SHL 4
#define SHIFT_SHR 5
#define SHIFT_SAR 7

/* Group 3 opcode extensions for 0xf6, 0xf7.  To be used with OPC_GRP3.  */
#define EXT3_NOT   2
#define EXT3_NEG   3
#define EXT3_MUL   4
#define EXT3_IMUL  5
#define EXT3_DIV   6
#define EXT3_IDIV  7

/* Group 5 opcode extensions for 0xff.  To be used with OPC_GRP5.  */
#define EXT5_INC_Ev     0
#define EXT5_DEC_Ev     1
#define EXT5_CALLN_Ev   2
#define EXT5_JMPN_Ev    4

/* Condition codes to be added to OPC_JCC_{long,short}.  */
#define JCC_JMP (-1)
#define JCC_JO  0x0
#define JCC_JNO 0x1
#define JCC_JB  0x2
#define JCC_JAE 0x3
#define JCC_JE  0x4
#define JCC_JNE 0x5
#define JCC_JBE 0x6
#define JCC_JA  0x7
#define JCC_JS  0x8
#define JCC_JNS 0x9
#define JCC_JP  0xa
#define JCC_JNP 0xb
#define JCC_JL  0xc
#define JCC_JGE 0xd
#define JCC_JLE 0xe
#define JCC_JG  0xf

static const uint8_t tcg_cond_to_jcc[] = {
    [TCG_COND_EQ] = JCC_JE,
    [TCG_COND_NE] = JCC_JNE,
    [TCG_COND_LT] = JCC_JL,
    [TCG_COND_GE] = JCC_JGE,
    [TCG_COND_LE] = JCC_JLE,
    [TCG_COND_GT] = JCC_JG,
    [TCG_COND_LTU] = JCC_JB,
    [TCG_COND_GEU] = JCC_JAE,
    [TCG_COND_LEU] = JCC_JBE,
    [TCG_COND_GTU] = JCC_JA,
};

#if TCG_TARGET_REG_BITS == 64
static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
{
    int rex;

    if (opc & P_GS) {
        tcg_out8(s, 0x65);
    }
    if (opc & P_DATA16) {
        /* We should never be asking for both 16 and 64-bit operation.  */
        tcg_debug_assert((opc & P_REXW) == 0);
        tcg_out8(s, 0x66);
    }
    if (opc & P_ADDR32) {
        tcg_out8(s, 0x67);
    }
    if (opc & P_SIMDF3) {
        tcg_out8(s, 0xf3);
    } else if (opc & P_SIMDF2) {
        tcg_out8(s, 0xf2);
    }

    rex = 0;
    rex |= (opc & P_REXW) ? 0x8 : 0x0;  /* REX.W */
    rex |= (r & 8) >> 1;                /* REX.R */
    rex |= (x & 8) >> 2;                /* REX.X */
    rex |= (rm & 8) >> 3;               /* REX.B */

    /* P_REXB_{R,RM} indicates that the given register is the low byte.
       For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
       as otherwise the encoding indicates %[abcd]h.  Note that the values
       that are ORed in merely indicate that the REX byte must be present;
       those bits get discarded in output.  */
    rex |= opc & (r >= 4 ? P_REXB_R : 0);
    rex |= opc & (rm >= 4 ? P_REXB_RM : 0);

    if (rex) {
        tcg_out8(s, (uint8_t)(rex | 0x40));
    }

    if (opc & (P_EXT | P_EXT38)) {
        tcg_out8(s, 0x0f);
        if (opc & P_EXT38) {
            tcg_out8(s, 0x38);
        }
    }

    tcg_out8(s, opc);
}
#else
static void tcg_out_opc(TCGContext *s, int opc)
{
    if (opc & P_DATA16) {
        tcg_out8(s, 0x66);
    }
    if (opc & P_SIMDF3) {
        tcg_out8(s, 0xf3);
    } else if (opc & P_SIMDF2) {
        tcg_out8(s, 0xf2);
    }
    if (opc & (P_EXT | P_EXT38)) {
        tcg_out8(s, 0x0f);
        if (opc & P_EXT38) {
            tcg_out8(s, 0x38);
        }
    }
    tcg_out8(s, opc);
}
/* Discard the register arguments to tcg_out_opc early, so as not to penalize
   the 32-bit compilation paths.  This method works with all versions of gcc,
   whereas relying on optimization may not reliably eliminate them.  */
#define tcg_out_opc(s, opc, r, rm, x)  (tcg_out_opc)(s, opc)
#endif

static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
{
    tcg_out_opc(s, opc, r, rm, 0);
    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
}
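
/* Worked example (for illustration): tgen_arithr(s, ARITH_ADD + P_REXW,
   TCG_REG_RAX, TCG_REG_R8) reaches here with opc = OPC_ARITH_GvEv | P_REXW,
   r = 0 (%rax), rm = 8 (%r8) and emits:

       rex    = 0x40 | 0x08 (REX.W) | 0x01 (REX.B, rm & 8)  = 0x49
       opcode = 0x03                                 (add Gv, Ev)
       modrm  = 0xc0 | (0 << 3) | LOWREGMASK(8)      = 0xc0

   i.e. the bytes 49 03 c0, which disassemble as "addq %r8, %rax".  */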

static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
{
    int tmp;

    if ((opc & (P_REXW | P_EXT | P_EXT38)) || (rm & 8)) {
        /* Three byte VEX prefix.  */
        tcg_out8(s, 0xc4);

        /* VEX.m-mmmm */
        if (opc & P_EXT38) {
            tmp = 2;
        } else if (opc & P_EXT) {
            tmp = 1;
        } else {
            tcg_abort();
        }
        tmp |= 0x40;                       /* VEX.X */
        tmp |= (r & 8 ? 0 : 0x80);         /* VEX.R */
        tmp |= (rm & 8 ? 0 : 0x20);        /* VEX.B */
        tcg_out8(s, tmp);

        tmp = (opc & P_REXW ? 0x80 : 0);   /* VEX.W */
    } else {
        /* Two byte VEX prefix.  */
        tcg_out8(s, 0xc5);

        tmp = (r & 8 ? 0 : 0x80);          /* VEX.R */
    }
    /* VEX.pp */
    if (opc & P_DATA16) {
        tmp |= 1;                          /* 0x66 */
    } else if (opc & P_SIMDF3) {
        tmp |= 2;                          /* 0xf3 */
    } else if (opc & P_SIMDF2) {
        tmp |= 3;                          /* 0xf2 */
    }
    tmp |= (~v & 15) << 3;                 /* VEX.vvvv */
    tcg_out8(s, tmp);
    tcg_out8(s, opc);
    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
}
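
/* Worked example (for illustration): OPC_ANDN is 0xf2 | P_EXT38, so with
   r = %eax, v = %ecx, rm = %edx this takes the three-byte form:

       0xc4                                     three-byte VEX escape
       0xe2   inverted R/X/B set, m-mmmm = 2    (0f 38 map, no reg >= 8)
       0x70   W = 0, vvvv = ~1 & 15, pp = 0     (second source %ecx)
       0xf2                                     ANDN opcode
       0xc2   modrm: reg = %eax, rm = %edx

   i.e. c4 e2 70 f2 c2, computing eax = ~ecx & edx.  */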

/* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
   We handle RM or INDEX missing by passing a negative value.  In 64-bit
   mode for absolute addresses, ~RM is the size of the immediate operand
   that will follow the instruction.  */

static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
                                     int index, int shift, intptr_t offset)
{
    int mod, len;

    if (index < 0 && rm < 0) {
        if (TCG_TARGET_REG_BITS == 64) {
            /* Try for a rip-relative addressing mode.  This has replaced
               the 32-bit-mode absolute addressing encoding.  */
            intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
            intptr_t disp = offset - pc;
            if (disp == (int32_t)disp) {
                tcg_out_opc(s, opc, r, 0, 0);
                tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
                tcg_out32(s, disp);
                return;
            }

            /* Try for an absolute address encoding.  This requires the
               use of the MODRM+SIB encoding and is therefore larger than
               rip-relative addressing.  */
            if (offset == (int32_t)offset) {
                tcg_out_opc(s, opc, r, 0, 0);
                tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
                tcg_out8(s, (4 << 3) | 5);
                tcg_out32(s, offset);
                return;
            }

            /* ??? The memory isn't directly addressable.  */
            tcg_abort();
        } else {
            /* Absolute address.  */
            tcg_out_opc(s, opc, r, 0, 0);
            tcg_out8(s, (r << 3) | 5);
            tcg_out32(s, offset);
            return;
        }
    }

    /* Find the length of the immediate addend.  Note that the encoding
       that would be used for (%ebp) indicates absolute addressing.  */
    if (rm < 0) {
        mod = 0, len = 4, rm = 5;
    } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
        mod = 0, len = 0;
    } else if (offset == (int8_t)offset) {
        mod = 0x40, len = 1;
    } else {
        mod = 0x80, len = 4;
    }

    /* Use a single byte MODRM format if possible.  Note that the encoding
       that would be used for %esp is the escape to the two byte form.  */
    if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
        /* Single byte MODRM format.  */
        tcg_out_opc(s, opc, r, rm, 0);
        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
    } else {
        /* Two byte MODRM+SIB format.  */

        /* Note that the encoding that would place %esp into the index
           field indicates no index register.  In 64-bit mode, the REX.X
           bit counts, so %r12 can be used as the index.  */
        if (index < 0) {
            index = 4;
        } else {
            tcg_debug_assert(index != TCG_REG_ESP);
        }

        tcg_out_opc(s, opc, r, rm, index);
        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
        tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
    }

    if (len == 1) {
        tcg_out8(s, offset);
    } else if (len == 4) {
        tcg_out32(s, offset);
    }
}
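
/* Worked example (for illustration): opc = OPC_MOVL_GvEv, r = %eax,
   rm = %ebx, index = %esi, shift = 2, offset = 0x10 takes the two-byte
   MODRM+SIB form with a disp8:

       opcode = 0x8b
       modrm  = 0x44   (mod = 0x40 disp8, reg = %eax, rm = 4 -> SIB)
       sib    = 0xb3   (scale = 2, index = %esi, base = %ebx)
       disp8  = 0x10

   i.e. 8b 44 b3 10, "movl 0x10(%ebx,%esi,4), %eax".  */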

/* A simplification of the above with no index or shift.  */
static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
                                        int rm, intptr_t offset)
{
    tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
}

/* Generate dest op= src.  Uses the same ARITH_* codes as tgen_arithi.  */
static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
{
    /* Propagate an opcode prefix, such as P_REXW.  */
    int ext = subop & ~0x7;
    subop &= 0x7;

    tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
}

static inline void tcg_out_mov(TCGContext *s, TCGType type,
                               TCGReg ret, TCGReg arg)
{
    if (arg != ret) {
        int opc = OPC_MOVL_GvEv + (type == TCG_TYPE_I64 ? P_REXW : 0);
        tcg_out_modrm(s, opc, ret, arg);
    }
}

static void tcg_out_movi(TCGContext *s, TCGType type,
                         TCGReg ret, tcg_target_long arg)
{
    tcg_target_long diff;

    if (arg == 0) {
        tgen_arithr(s, ARITH_XOR, ret, ret);
        return;
    }
    if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
        tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
        tcg_out32(s, arg);
        return;
    }
    if (arg == (int32_t)arg) {
        tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
        tcg_out32(s, arg);
        return;
    }

    /* Try a 7 byte pc-relative lea before the 10 byte movq.  */
    diff = arg - ((uintptr_t)s->code_ptr + 7);
    if (diff == (int32_t)diff) {
        tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
        tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
        tcg_out32(s, diff);
        return;
    }

    tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
    tcg_out64(s, arg);
}
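
/* For illustration, the encodings chosen above for %rax, smallest first
   (note the GvEv form of xor, 0x33):

       xorl %eax,%eax            33 c0                   arg == 0
       movl $imm32, %eax         b8 ii ii ii ii          32-bit zero-extended
       movq $simm32, %rax        48 c7 c0 ii ii ii ii    32-bit sign-extended
       leaq disp32(%rip), %rax   48 8d 05 dd dd dd dd    pc-relative
       movabsq $imm64, %rax      48 b8 ii x 8            full 64 bits

   e.g. tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_RAX, -1) emits
   48 c7 c0 ff ff ff ff.  */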

static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
{
    if (val == (int8_t)val) {
        tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
        tcg_out8(s, val);
    } else if (val == (int32_t)val) {
        tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
        tcg_out32(s, val);
    } else {
        tcg_abort();
    }
}

static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
{
    /* Given the strength of x86 memory ordering, we need only care about
       store-load ordering.  Experimentally, "lock orl $0,0(%esp)" is
       faster than "mfence", so don't bother with the sse insn.  */
    if (a0 & TCG_MO_ST_LD) {
        tcg_out8(s, 0xf0);
        tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
        tcg_out8(s, 0);
    }
}
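
/* For illustration, the barrier above assembles to f0 83 0c 24 00,
   "lock orl $0x0, (%esp)": 0xf0 lock prefix, 0x83 group-1 Ev,Ib,
   modrm 0x0c (reg = ARITH_OR, rm = 4 -> SIB), sib 0x24 (base %esp,
   no index), imm8 0x00.  */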

static inline void tcg_out_push(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
}

static inline void tcg_out_pop(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
}

static inline void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
                              TCGReg arg1, intptr_t arg2)
{
    int opc = OPC_MOVL_GvEv + (type == TCG_TYPE_I64 ? P_REXW : 0);
    tcg_out_modrm_offset(s, opc, ret, arg1, arg2);
}

static inline void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
                              TCGReg arg1, intptr_t arg2)
{
    int opc = OPC_MOVL_EvGv + (type == TCG_TYPE_I64 ? P_REXW : 0);
    tcg_out_modrm_offset(s, opc, arg, arg1, arg2);
}

static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
                        TCGReg base, intptr_t ofs)
{
    int rexw = 0;
    if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
        if (val != (int32_t)val) {
            return false;
        }
        rexw = P_REXW;
    }
    tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
    tcg_out32(s, val);
    return true;
}

static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
{
    /* Propagate an opcode prefix, such as P_DATA16.  */
    int ext = subopc & ~0x7;
    subopc &= 0x7;

    if (count == 1) {
        tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
    } else {
        tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
        tcg_out8(s, count);
    }
}

static inline void tcg_out_bswap32(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
}

static inline void tcg_out_rolw_8(TCGContext *s, int reg)
{
    tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
}

static inline void tcg_out_ext8u(TCGContext *s, int dest, int src)
{
    /* movzbl */
    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
    tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
}

static void tcg_out_ext8s(TCGContext *s, int dest, int src, int rexw)
{
    /* movsbl */
    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
    tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
}

static inline void tcg_out_ext16u(TCGContext *s, int dest, int src)
{
    /* movzwl */
    tcg_out_modrm(s, OPC_MOVZWL, dest, src);
}

static inline void tcg_out_ext16s(TCGContext *s, int dest, int src, int rexw)
{
    /* movsw[lq] */
    tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
}

static inline void tcg_out_ext32u(TCGContext *s, int dest, int src)
{
    /* 32-bit mov zero extends.  */
    tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
}

static inline void tcg_out_ext32s(TCGContext *s, int dest, int src)
{
    tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
}

static inline void tcg_out_bswap64(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
}

static void tgen_arithi(TCGContext *s, int c, int r0,
                        tcg_target_long val, int cf)
{
    int rexw = 0;

    if (TCG_TARGET_REG_BITS == 64) {
        rexw = c & -8;
        c &= 7;
    }

    /* ??? While INC is 2 bytes shorter than ADDL $1, it also induces
       partial-flags-update stalls on Pentium 4 and is not recommended
       by current Intel optimization manuals.  */
    if (!cf && (c == ARITH_ADD || c == ARITH_SUB) && (val == 1 || val == -1)) {
        int is_inc = (c == ARITH_ADD) ^ (val < 0);
        if (TCG_TARGET_REG_BITS == 64) {
            /* The single-byte increment encodings are re-tasked as the
               REX prefixes.  Use the MODRM encoding.  */
            tcg_out_modrm(s, OPC_GRP5 + rexw,
                          (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
        } else {
            tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
        }
        return;
    }

    if (c == ARITH_AND) {
        if (TCG_TARGET_REG_BITS == 64) {
            if (val == 0xffffffffu) {
                tcg_out_ext32u(s, r0, r0);
                return;
            }
            if (val == (uint32_t)val) {
                /* AND with no high bits set can use a 32-bit operation.  */
                rexw = 0;
            }
        }
        if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
            tcg_out_ext8u(s, r0, r0);
            return;
        }
        if (val == 0xffffu) {
            tcg_out_ext16u(s, r0, r0);
            return;
        }
    }

    if (val == (int8_t)val) {
        tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
        tcg_out8(s, val);
        return;
    }
    if (rexw == 0 || val == (int32_t)val) {
        tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
        tcg_out32(s, val);
        return;
    }

    tcg_abort();
}

static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
{
    if (val != 0) {
        tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
    }
}

/* Use SMALL != 0 to force a short forward branch.  */
static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, int small)
{
    int32_t val, val1;

    if (l->has_value) {
        val = tcg_pcrel_diff(s, l->u.value_ptr);
        val1 = val - 2;
        if ((int8_t)val1 == val1) {
            if (opc == -1) {
                tcg_out8(s, OPC_JMP_short);
            } else {
                tcg_out8(s, OPC_JCC_short + opc);
            }
            tcg_out8(s, val1);
        } else {
            if (small) {
                tcg_abort();
            }
            if (opc == -1) {
                tcg_out8(s, OPC_JMP_long);
                tcg_out32(s, val - 5);
            } else {
                tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
                tcg_out32(s, val - 6);
            }
        }
    } else if (small) {
        if (opc == -1) {
            tcg_out8(s, OPC_JMP_short);
        } else {
            tcg_out8(s, OPC_JCC_short + opc);
        }
        tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
        s->code_ptr += 1;
    } else {
        if (opc == -1) {
            tcg_out8(s, OPC_JMP_long);
        } else {
            tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
        }
        tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
        s->code_ptr += 4;
    }
}

static void tcg_out_cmp(TCGContext *s, TCGArg arg1, TCGArg arg2,
                        int const_arg2, int rexw)
{
    if (const_arg2) {
        if (arg2 == 0) {
            /* test r, r */
            tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
        } else {
            tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
        }
    } else {
        tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
    }
}

static void tcg_out_brcond32(TCGContext *s, TCGCond cond,
                             TCGArg arg1, TCGArg arg2, int const_arg2,
                             TCGLabel *label, int small)
{
    tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
    tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
}

#if TCG_TARGET_REG_BITS == 64
static void tcg_out_brcond64(TCGContext *s, TCGCond cond,
                             TCGArg arg1, TCGArg arg2, int const_arg2,
                             TCGLabel *label, int small)
{
    tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
    tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
}
#else
/* XXX: we implement it at the target level to avoid having to
   handle cross-basic-block temporaries.  */
static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
                            const int *const_args, int small)
{
    TCGLabel *label_next = gen_new_label();
    TCGLabel *label_this = arg_label(args[5]);

    switch(args[4]) {
    case TCG_COND_EQ:
        tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
                         label_next, 1);
        tcg_out_brcond32(s, TCG_COND_EQ, args[1], args[3], const_args[3],
                         label_this, small);
        break;
    case TCG_COND_NE:
        tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
                         label_this, small);
        tcg_out_brcond32(s, TCG_COND_NE, args[1], args[3], const_args[3],
                         label_this, small);
        break;
    case TCG_COND_LT:
        tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_LE:
        tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_GT:
        tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_GE:
        tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_LTU:
        tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_LEU:
        tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_GTU:
        tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_GEU:
        tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    default:
        tcg_abort();
    }
    tcg_out_label(s, label_next, s->code_ptr);
}
#endif

static void tcg_out_setcond32(TCGContext *s, TCGCond cond, TCGArg dest,
                              TCGArg arg1, TCGArg arg2, int const_arg2)
{
    tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
    tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
    tcg_out_ext8u(s, dest, dest);
}

#if TCG_TARGET_REG_BITS == 64
static void tcg_out_setcond64(TCGContext *s, TCGCond cond, TCGArg dest,
                              TCGArg arg1, TCGArg arg2, int const_arg2)
{
    tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
    tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
    tcg_out_ext8u(s, dest, dest);
}
#else
static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
                             const int *const_args)
{
    TCGArg new_args[6];
    TCGLabel *label_true, *label_over;

    memcpy(new_args, args+1, 5*sizeof(TCGArg));

    if (args[0] == args[1] || args[0] == args[2]
        || (!const_args[3] && args[0] == args[3])
        || (!const_args[4] && args[0] == args[4])) {
        /* When the destination overlaps with one of the argument
           registers, don't do anything tricky.  */
        label_true = gen_new_label();
        label_over = gen_new_label();

        new_args[5] = label_arg(label_true);
        tcg_out_brcond2(s, new_args, const_args+1, 1);

        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
        tcg_out_jxx(s, JCC_JMP, label_over, 1);
        tcg_out_label(s, label_true, s->code_ptr);

        tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
        tcg_out_label(s, label_over, s->code_ptr);
    } else {
        /* When the destination does not overlap one of the arguments,
           clear the destination first, jump if cond false, and emit an
           increment in the true case.  This results in smaller code.  */

        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);

        label_over = gen_new_label();
        new_args[4] = tcg_invert_cond(new_args[4]);
        new_args[5] = label_arg(label_over);
        tcg_out_brcond2(s, new_args, const_args+1, 1);

        tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
        tcg_out_label(s, label_over, s->code_ptr);
    }
}
#endif

static void tcg_out_cmov(TCGContext *s, TCGCond cond, int rexw,
                         TCGReg dest, TCGReg v1)
{
    if (have_cmov) {
        tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | rexw, dest, v1);
    } else {
        TCGLabel *over = gen_new_label();
        tcg_out_jxx(s, tcg_cond_to_jcc[tcg_invert_cond(cond)], over, 1);
        tcg_out_mov(s, TCG_TYPE_I32, dest, v1);
        tcg_out_label(s, over, s->code_ptr);
    }
}

static void tcg_out_movcond32(TCGContext *s, TCGCond cond, TCGReg dest,
                              TCGReg c1, TCGArg c2, int const_c2,
                              TCGReg v1)
{
    tcg_out_cmp(s, c1, c2, const_c2, 0);
    tcg_out_cmov(s, cond, 0, dest, v1);
}

#if TCG_TARGET_REG_BITS == 64
static void tcg_out_movcond64(TCGContext *s, TCGCond cond, TCGReg dest,
                              TCGReg c1, TCGArg c2, int const_c2,
                              TCGReg v1)
{
    tcg_out_cmp(s, c1, c2, const_c2, P_REXW);
    tcg_out_cmov(s, cond, P_REXW, dest, v1);
}
#endif

static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
                        TCGArg arg2, bool const_a2)
{
    if (have_bmi1) {
        tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
        if (const_a2) {
            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
        } else {
            tcg_debug_assert(dest != arg2);
            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
        }
    } else {
        tcg_debug_assert(dest != arg2);
        tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
    }
}

static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
                        TCGArg arg2, bool const_a2)
{
    if (have_lzcnt) {
        tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
        if (const_a2) {
            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
        } else {
            tcg_debug_assert(dest != arg2);
            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
        }
    } else {
        tcg_debug_assert(!const_a2);
        tcg_debug_assert(dest != arg1);
        tcg_debug_assert(dest != arg2);

        /* Recall that the output of BSR is the index not the count.  */
        tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
        tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);

        /* Since we have destroyed the flags from BSR, we have to re-test.  */
        tcg_out_cmp(s, arg1, 0, 1, rexw);
        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
    }
}

static void tcg_out_branch(TCGContext *s, int call, tcg_insn_unit *dest)
{
    intptr_t disp = tcg_pcrel_diff(s, dest) - 5;

    if (disp == (int32_t)disp) {
        tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
        tcg_out32(s, disp);
    } else {
        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R10, (uintptr_t)dest);
        tcg_out_modrm(s, OPC_GRP5,
                      call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev, TCG_REG_R10);
    }
}

static inline void tcg_out_call(TCGContext *s, tcg_insn_unit *dest)
{
    tcg_out_branch(s, 1, dest);
}

static void tcg_out_jmp(TCGContext *s, tcg_insn_unit *dest)
{
    tcg_out_branch(s, 0, dest);
}
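
/* For illustration: the near form above encodes as e8 (call) or e9 (jmp)
   plus a rel32 counted from the end of the 5-byte instruction, hence the
   "- 5" above; e.g. a call to the immediately following instruction is
   e8 00 00 00 00.  */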

static void tcg_out_nopn(TCGContext *s, int n)
{
    int i;
    /* Emit n - 1 operand-size prefixes before the standard one-byte nop,
     * "xchg %eax,%eax", forming e.g. "xchg %ax,%ax" for n = 2.  All cores
     * accept the duplicated prefixes, and all of the interesting recent
     * cores can decode and discard the duplicates in a single cycle.
     */
    tcg_debug_assert(n >= 1);
    for (i = 1; i < n; ++i) {
        tcg_out8(s, 0x66);
    }
    tcg_out8(s, 0x90);
}

#if defined(CONFIG_SOFTMMU)
/* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
 *                                     int mmu_idx, uintptr_t ra)
 */
static void * const qemu_ld_helpers[16] = {
    [MO_UB]   = helper_ret_ldub_mmu,
    [MO_LEUW] = helper_le_lduw_mmu,
    [MO_LEUL] = helper_le_ldul_mmu,
    [MO_LEQ]  = helper_le_ldq_mmu,
    [MO_BEUW] = helper_be_lduw_mmu,
    [MO_BEUL] = helper_be_ldul_mmu,
    [MO_BEQ]  = helper_be_ldq_mmu,
};

/* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
 *                                     uintxx_t val, int mmu_idx, uintptr_t ra)
 */
static void * const qemu_st_helpers[16] = {
    [MO_UB]   = helper_ret_stb_mmu,
    [MO_LEUW] = helper_le_stw_mmu,
    [MO_LEUL] = helper_le_stl_mmu,
    [MO_LEQ]  = helper_le_stq_mmu,
    [MO_BEUW] = helper_be_stw_mmu,
    [MO_BEUL] = helper_be_stl_mmu,
    [MO_BEQ]  = helper_be_stq_mmu,
};

/* Perform the TLB load and compare.

   Inputs:
   ADDRLO and ADDRHI contain the low and high part of the address.

   MEM_INDEX and S_BITS are the memory context and log2 size of the load.

   WHICH is the offset into the CPUTLBEntry structure of the slot to read.
   This should be offsetof addr_read or addr_write.

   Outputs:
   LABEL_PTRS is filled with 1 (32-bit addresses) or 2 (64-bit addresses)
   positions of the displacements of forward jumps to the TLB miss case.

   Second argument register is loaded with the low part of the address.
   In the TLB hit case, it has been adjusted as indicated by the TLB
   and so is a host address.  In the TLB miss case, it continues to
   hold a guest address.

   First argument register is clobbered.  */

static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
                                    int mem_index, TCGMemOp opc,
                                    tcg_insn_unit **label_ptr, int which)
{
    const TCGReg r0 = TCG_REG_L0;
    const TCGReg r1 = TCG_REG_L1;
    TCGType ttype = TCG_TYPE_I32;
    TCGType tlbtype = TCG_TYPE_I32;
    int trexw = 0, hrexw = 0, tlbrexw = 0;
    unsigned a_bits = get_alignment_bits(opc);
    unsigned s_bits = opc & MO_SIZE;
    unsigned a_mask = (1 << a_bits) - 1;
    unsigned s_mask = (1 << s_bits) - 1;
    target_ulong tlb_mask;

    if (TCG_TARGET_REG_BITS == 64) {
        if (TARGET_LONG_BITS == 64) {
            ttype = TCG_TYPE_I64;
            trexw = P_REXW;
        }
        if (TCG_TYPE_PTR == TCG_TYPE_I64) {
            hrexw = P_REXW;
            if (TARGET_PAGE_BITS + CPU_TLB_BITS > 32) {
                tlbtype = TCG_TYPE_I64;
                tlbrexw = P_REXW;
            }
        }
    }

    tcg_out_mov(s, tlbtype, r0, addrlo);
    /* If the required alignment is at least as large as the access, simply
       copy the address and mask.  For lesser alignments, check that we don't
       cross pages for the complete access.  */
    if (a_bits >= s_bits) {
        tcg_out_mov(s, ttype, r1, addrlo);
    } else {
        tcg_out_modrm_offset(s, OPC_LEA + trexw, r1, addrlo, s_mask - a_mask);
    }
    tlb_mask = (target_ulong)TARGET_PAGE_MASK | a_mask;

    tcg_out_shifti(s, SHIFT_SHR + tlbrexw, r0,
                   TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);

    tgen_arithi(s, ARITH_AND + trexw, r1, tlb_mask, 0);
    tgen_arithi(s, ARITH_AND + tlbrexw, r0,
                (CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS, 0);

    tcg_out_modrm_sib_offset(s, OPC_LEA + hrexw, r0, TCG_AREG0, r0, 0,
                             offsetof(CPUArchState, tlb_table[mem_index][0])
                             + which);

    /* cmp 0(r0), r1 */
    tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw, r1, r0, 0);

    /* Prepare for both the fast path add of the tlb addend, and the slow
       path function argument setup.  There are two cases worth note:
       For 32-bit guest and x86_64 host, MOVL zero-extends the guest address
       before the fastpath ADDQ below.  For 64-bit guest and x32 host, MOVQ
       copies the entire guest address for the slow path, while truncation
       for the 32-bit host happens with the fastpath ADDL below.  */
    tcg_out_mov(s, ttype, r1, addrlo);

    /* jne slow_path */
    tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
    label_ptr[0] = s->code_ptr;
    s->code_ptr += 4;

    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
        /* cmp 4(r0), addrhi */
        tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, r0, 4);

        /* jne slow_path */
        tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
        label_ptr[1] = s->code_ptr;
        s->code_ptr += 4;
    }

    /* TLB Hit.  */

    /* add addend(r0), r1 */
    tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r1, r0,
                         offsetof(CPUTLBEntry, addend) - which);
}
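
/* For illustration, on a 64-bit host with a 32-bit guest the fast path
   above comes out roughly as follows (r0/r1 per TCG_REG_L0/L1, env in
   TCG_AREG0):

       movl  addrlo, r0d
       leal  s_mask-a_mask(addrlo), r1d      ; or movl if a_bits >= s_bits
       shrl  $(TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS), r0d
       andl  $(TARGET_PAGE_MASK | a_mask), r1d
       andl  $((CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS), r0d
       leaq  tlb_table[mem_index]+which(env, r0), r0
       cmpl  (r0), r1d
       movl  addrlo, r1d
       jne   slow_path
       addq  addend-which(r0), r1            ; r1 is now the host address
*/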

/*
 * Record the context of a call to the out-of-line helper code for the slow
 * path of a load or store, so that we can later generate the correct
 * helper code.
 */
static void add_qemu_ldst_label(TCGContext *s, bool is_ld, TCGMemOpIdx oi,
                                TCGReg datalo, TCGReg datahi,
                                TCGReg addrlo, TCGReg addrhi,
                                tcg_insn_unit *raddr,
                                tcg_insn_unit **label_ptr)
{
    TCGLabelQemuLdst *label = new_ldst_label(s);

    label->is_ld = is_ld;
    label->oi = oi;
    label->datalo_reg = datalo;
    label->datahi_reg = datahi;
    label->addrlo_reg = addrlo;
    label->addrhi_reg = addrhi;
    label->raddr = raddr;
    label->label_ptr[0] = label_ptr[0];
    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
        label->label_ptr[1] = label_ptr[1];
    }
}

/*
 * Generate code for the slow path for a load at the end of the block.
 */
static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
{
    TCGMemOpIdx oi = l->oi;
    TCGMemOp opc = get_memop(oi);
    TCGReg data_reg;
    tcg_insn_unit **label_ptr = &l->label_ptr[0];

    /* resolve label address */
    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
    }

    if (TCG_TARGET_REG_BITS == 32) {
        int ofs = 0;

        tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
        ofs += 4;

        tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
        ofs += 4;

        if (TARGET_LONG_BITS == 64) {
            tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
            ofs += 4;
        }

        tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
        ofs += 4;

        tcg_out_sti(s, TCG_TYPE_PTR, (uintptr_t)l->raddr, TCG_REG_ESP, ofs);
    } else {
        tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
        /* The second argument is already loaded with addrlo.  */
        tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], oi);
        tcg_out_movi(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[3],
                     (uintptr_t)l->raddr);
    }

    tcg_out_call(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);

    data_reg = l->datalo_reg;
    switch (opc & MO_SSIZE) {
    case MO_SB:
        tcg_out_ext8s(s, data_reg, TCG_REG_EAX, P_REXW);
        break;
    case MO_SW:
        tcg_out_ext16s(s, data_reg, TCG_REG_EAX, P_REXW);
        break;
#if TCG_TARGET_REG_BITS == 64
    case MO_SL:
        tcg_out_ext32s(s, data_reg, TCG_REG_EAX);
        break;
#endif
    case MO_UB:
    case MO_UW:
        /* Note that the helpers have zero-extended to tcg_target_long.  */
    case MO_UL:
        tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
        break;
    case MO_Q:
        if (TCG_TARGET_REG_BITS == 64) {
            tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX);
        } else if (data_reg == TCG_REG_EDX) {
            /* xchg %edx, %eax */
            tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0);
            tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EAX);
        } else {
            tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
            tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EDX);
        }
        break;
    default:
        tcg_abort();
    }

    /* Jump to the code corresponding to the next IR of qemu_ld.  */
    tcg_out_jmp(s, l->raddr);
}

/*
 * Generate code for the slow path for a store at the end of the block.
 */
static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
{
    TCGMemOpIdx oi = l->oi;
    TCGMemOp opc = get_memop(oi);
    TCGMemOp s_bits = opc & MO_SIZE;
    tcg_insn_unit **label_ptr = &l->label_ptr[0];
    TCGReg retaddr;

    /* resolve label address */
    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
    }

    if (TCG_TARGET_REG_BITS == 32) {
        int ofs = 0;

        tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
        ofs += 4;

        tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
        ofs += 4;

        if (TARGET_LONG_BITS == 64) {
            tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
            ofs += 4;
        }

        tcg_out_st(s, TCG_TYPE_I32, l->datalo_reg, TCG_REG_ESP, ofs);
        ofs += 4;

        if (s_bits == MO_64) {
            tcg_out_st(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_ESP, ofs);
            ofs += 4;
        }

        tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
        ofs += 4;

        retaddr = TCG_REG_EAX;
        tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
        tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP, ofs);
    } else {
        tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
        /* The second argument is already loaded with addrlo.  */
        tcg_out_mov(s, (s_bits == MO_64 ? TCG_TYPE_I64 : TCG_TYPE_I32),
                    tcg_target_call_iarg_regs[2], l->datalo_reg);
        tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[3], oi);

        if (ARRAY_SIZE(tcg_target_call_iarg_regs) > 4) {
            retaddr = tcg_target_call_iarg_regs[4];
            tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
        } else {
            retaddr = TCG_REG_RAX;
            tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
            tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP,
                       TCG_TARGET_CALL_STACK_OFFSET);
        }
    }

    /* "Tail call" to the helper, with the return address back inline.  */
    tcg_out_push(s, retaddr);
    tcg_out_jmp(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
}
#elif defined(__x86_64__) && defined(__linux__)
# include <asm/prctl.h>
# include <sys/prctl.h>

int arch_prctl(int code, unsigned long addr);

static int guest_base_flags;
static inline void setup_guest_base_seg(void)
{
    if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
        guest_base_flags = P_GS;
    }
}
#else
# define guest_base_flags 0
static inline void setup_guest_base_seg(void) { }
#endif /* SOFTMMU */
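
/* With guest_base installed as the %gs segment base via ARCH_SET_GS above,
   a user-only guest access can be emitted as a %gs-prefixed instruction
   (the P_GS prefix flag), e.g. "movl %gs:(%esi), %eax", so guest_base need
   not occupy a register or an immediate offset in every memory access.  */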

static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
                                   TCGReg base, int index, intptr_t ofs,
                                   int seg, TCGMemOp memop)
{
    const TCGMemOp real_bswap = memop & MO_BSWAP;
    TCGMemOp bswap = real_bswap;
    int movop = OPC_MOVL_GvEv;

    if (have_movbe && real_bswap) {
        bswap = 0;
        movop = OPC_MOVBE_GyMy;
    }

    switch (memop & MO_SSIZE) {
    case MO_UB:
        tcg_out_modrm_sib_offset(s, OPC_MOVZBL + seg, datalo,
                                 base, index, 0, ofs);
        break;
    case MO_SB:
        tcg_out_modrm_sib_offset(s, OPC_MOVSBL + P_REXW + seg, datalo,
                                 base, index, 0, ofs);
        break;
    case MO_UW:
        tcg_out_modrm_sib_offset(s, OPC_MOVZWL + seg, datalo,
                                 base, index, 0, ofs);
        if (real_bswap) {
            tcg_out_rolw_8(s, datalo);
        }
        break;
    case MO_SW:
        if (real_bswap) {
            if (have_movbe) {
                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
                                         datalo, base, index, 0, ofs);
            } else {
                tcg_out_modrm_sib_offset(s, OPC_MOVZWL + seg, datalo,
                                         base, index, 0, ofs);
                tcg_out_rolw_8(s, datalo);
            }
            tcg_out_modrm(s, OPC_MOVSWL + P_REXW, datalo, datalo);
        } else {
            tcg_out_modrm_sib_offset(s, OPC_MOVSWL + P_REXW + seg,
                                     datalo, base, index, 0, ofs);
        }
        break;
    case MO_UL:
        tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
        if (bswap) {
            tcg_out_bswap32(s, datalo);
        }
        break;
#if TCG_TARGET_REG_BITS == 64
    case MO_SL:
        if (real_bswap) {
            tcg_out_modrm_sib_offset(s, movop + seg, datalo,
                                     base, index, 0, ofs);
            if (bswap) {
                tcg_out_bswap32(s, datalo);
            }
            tcg_out_ext32s(s, datalo, datalo);
        } else {
            tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + seg, datalo,
                                     base, index, 0, ofs);
        }
        break;
#endif
    case MO_Q:
        if (TCG_TARGET_REG_BITS == 64) {
            tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
                                     base, index, 0, ofs);
            if (bswap) {
                tcg_out_bswap64(s, datalo);
            }
        } else {
            if (real_bswap) {
                int t = datalo;
                datalo = datahi;
                datahi = t;
            }
            if (base != datalo) {
                tcg_out_modrm_sib_offset(s, movop + seg, datalo,
                                         base, index, 0, ofs);
                tcg_out_modrm_sib_offset(s, movop + seg, datahi,
                                         base, index, 0, ofs + 4);
            } else {
                tcg_out_modrm_sib_offset(s, movop + seg, datahi,
                                         base, index, 0, ofs + 4);
                tcg_out_modrm_sib_offset(s, movop + seg, datalo,
                                         base, index, 0, ofs);
            }
            if (bswap) {
                tcg_out_bswap32(s, datalo);
                tcg_out_bswap32(s, datahi);
            }
        }
        break;
    default:
        tcg_abort();
    }
}
1641
/* XXX: qemu_ld and qemu_st could be modified to clobber only EDX and
   EAX.  It will be useful once fixed register globals are less
   common.  */
static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
{
    TCGReg datalo, datahi, addrlo;
    TCGReg addrhi __attribute__((unused));
    TCGMemOpIdx oi;
    TCGMemOp opc;
#if defined(CONFIG_SOFTMMU)
    int mem_index;
    tcg_insn_unit *label_ptr[2];
#endif

    datalo = *args++;
    datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
    addrlo = *args++;
    addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
    oi = *args++;
    opc = get_memop(oi);

#if defined(CONFIG_SOFTMMU)
    mem_index = get_mmuidx(oi);

    tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
                     label_ptr, offsetof(CPUTLBEntry, addr_read));

    /* TLB Hit.  */
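    /* tcg_out_tlb_load has left the host address for the access (guest
       address plus the TLB addend) in TCG_REG_L1, so no further address
       arithmetic is needed here.  */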
    tcg_out_qemu_ld_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, opc);

    /* Record the current context of a load into the ldst label.  */
    add_qemu_ldst_label(s, true, oi, datalo, datahi, addrlo, addrhi,
                        s->code_ptr, label_ptr);
#else
    {
        int32_t offset = guest_base;
        TCGReg base = addrlo;
        int index = -1;
        int seg = 0;

        /* For a 32-bit guest, the high 32 bits of the address register
           may contain garbage.  We can ignore them with the ADDR32
           prefix if we're not using a guest base, or when using
           segmentation.  Otherwise we need to zero-extend manually.  */
        if (guest_base == 0 || guest_base_flags) {
            seg = guest_base_flags;
            offset = 0;
            if (TCG_TARGET_REG_BITS > TARGET_LONG_BITS) {
                seg |= P_ADDR32;
            }
        } else if (TCG_TARGET_REG_BITS == 64) {
            if (TARGET_LONG_BITS == 32) {
                tcg_out_ext32u(s, TCG_REG_L0, base);
                base = TCG_REG_L0;
            }
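            /* offset holds guest_base truncated to 32 bits; if the two
               differ, guest_base does not fit in a displacement and
               must be loaded into the index register instead.  */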
            if (offset != guest_base) {
                tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_L1, guest_base);
                index = TCG_REG_L1;
                offset = 0;
            }
        }

        tcg_out_qemu_ld_direct(s, datalo, datahi,
                               base, index, offset, seg, opc);
    }
#endif
}

static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
                                   TCGReg base, intptr_t ofs, int seg,
                                   TCGMemOp memop)
{
    /* ??? Ideally we wouldn't need a scratch register.  For user-only,
       we could perform the bswap twice to restore the original value
       instead of moving to the scratch.  But as it is, the L constraint
       means that TCG_REG_L0 is definitely free here.  */
    const TCGReg scratch = TCG_REG_L0;
    const TCGMemOp real_bswap = memop & MO_BSWAP;
    TCGMemOp bswap = real_bswap;
    int movop = OPC_MOVL_EvGv;

    if (have_movbe && real_bswap) {
        bswap = 0;
        movop = OPC_MOVBE_MyGy;
    }

    switch (memop & MO_SIZE) {
    case MO_8:
        /* In 32-bit mode, 8-bit stores can only happen from [abcd]x.
           Use the scratch register if necessary.  */
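        /* Register encodings 4..7 name %ah/%ch/%dh/%bh rather than a
           low byte of %esp/%ebp/%esi/%edi, hence the test below.  */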
        if (TCG_TARGET_REG_BITS == 32 && datalo >= 4) {
            tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
            datalo = scratch;
        }
        tcg_out_modrm_offset(s, OPC_MOVB_EvGv + P_REXB_R + seg,
                             datalo, base, ofs);
        break;
    case MO_16:
        if (bswap) {
            tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
            tcg_out_rolw_8(s, scratch);
            datalo = scratch;
        }
        tcg_out_modrm_offset(s, movop + P_DATA16 + seg, datalo, base, ofs);
        break;
    case MO_32:
        if (bswap) {
            tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
            tcg_out_bswap32(s, scratch);
            datalo = scratch;
        }
        tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
        break;
    case MO_64:
        if (TCG_TARGET_REG_BITS == 64) {
            if (bswap) {
                tcg_out_mov(s, TCG_TYPE_I64, scratch, datalo);
                tcg_out_bswap64(s, scratch);
                datalo = scratch;
            }
            tcg_out_modrm_offset(s, movop + P_REXW + seg, datalo, base, ofs);
        } else if (bswap) {
            tcg_out_mov(s, TCG_TYPE_I32, scratch, datahi);
            tcg_out_bswap32(s, scratch);
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, scratch, base, ofs);
            tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
            tcg_out_bswap32(s, scratch);
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, scratch, base, ofs+4);
        } else {
            if (real_bswap) {
                int t = datalo;
                datalo = datahi;
                datahi = t;
            }
            tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
            tcg_out_modrm_offset(s, movop + seg, datahi, base, ofs+4);
        }
        break;
    default:
        tcg_abort();
    }
}

static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
{
    TCGReg datalo, datahi, addrlo;
    TCGReg addrhi __attribute__((unused));
    TCGMemOpIdx oi;
    TCGMemOp opc;
#if defined(CONFIG_SOFTMMU)
    int mem_index;
    tcg_insn_unit *label_ptr[2];
#endif

    datalo = *args++;
    datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
    addrlo = *args++;
    addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
    oi = *args++;
    opc = get_memop(oi);

#if defined(CONFIG_SOFTMMU)
    mem_index = get_mmuidx(oi);

    tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
                     label_ptr, offsetof(CPUTLBEntry, addr_write));

    /* TLB Hit.  */
    tcg_out_qemu_st_direct(s, datalo, datahi, TCG_REG_L1, 0, 0, opc);

    /* Record the current context of a store into the ldst label.  */
    add_qemu_ldst_label(s, false, oi, datalo, datahi, addrlo, addrhi,
                        s->code_ptr, label_ptr);
#else
    {
        int32_t offset = guest_base;
        TCGReg base = addrlo;
        int seg = 0;

        /* See comment in tcg_out_qemu_ld re zero-extension of addrlo.  */
        if (guest_base == 0 || guest_base_flags) {
            seg = guest_base_flags;
            offset = 0;
            if (TCG_TARGET_REG_BITS > TARGET_LONG_BITS) {
                seg |= P_ADDR32;
            }
        } else if (TCG_TARGET_REG_BITS == 64) {
            /* ??? Note that we can't use the same SIB addressing scheme
               as for loads, since we require L0 free for bswap.  */
            if (offset != guest_base) {
                if (TARGET_LONG_BITS == 32) {
                    tcg_out_ext32u(s, TCG_REG_L0, base);
                    base = TCG_REG_L0;
                }
                tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_L1, guest_base);
                tgen_arithr(s, ARITH_ADD + P_REXW, TCG_REG_L1, base);
                base = TCG_REG_L1;
                offset = 0;
            } else if (TARGET_LONG_BITS == 32) {
                tcg_out_ext32u(s, TCG_REG_L1, base);
                base = TCG_REG_L1;
            }
        }

        tcg_out_qemu_st_direct(s, datalo, datahi, base, offset, seg, opc);
    }
#endif
}

static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
                              const TCGArg *args, const int *const_args)
{
    TCGArg a0, a1, a2;
    int c, const_a2, vexop, rexw = 0;

#if TCG_TARGET_REG_BITS == 64
# define OP_32_64(x) \
        case glue(glue(INDEX_op_, x), _i64): \
            rexw = P_REXW; /* FALLTHRU */    \
        case glue(glue(INDEX_op_, x), _i32)
#else
# define OP_32_64(x) \
        case glue(glue(INDEX_op_, x), _i32)
#endif

    /* Hoist the loads of the most common arguments.  */
    a0 = args[0];
    a1 = args[1];
    a2 = args[2];
    const_a2 = const_args[2];

    switch (opc) {
    case INDEX_op_exit_tb:
        /* Reuse the zeroing that exists for goto_ptr.  */
        if (a0 == 0) {
            tcg_out_jmp(s, s->code_gen_epilogue);
        } else {
            tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
            tcg_out_jmp(s, tb_ret_addr);
        }
        break;
    case INDEX_op_goto_tb:
        if (s->tb_jmp_insn_offset) {
            /* direct jump method */
            int gap;
            /* jump displacement must be aligned for atomic patching;
             * see if we need to add extra nops before jump
             */
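            /* gap is the distance from the current code pointer to the
               next 4-aligned displacement slot; emitting gap - 1 nops
               puts the 0xe9 opcode byte immediately before that slot.  */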
            gap = tcg_pcrel_diff(s, QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4));
            if (gap != 1) {
                tcg_out_nopn(s, gap - 1);
            }
            tcg_out8(s, OPC_JMP_long); /* jmp im */
            s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
            tcg_out32(s, 0);
        } else {
            /* indirect jump method */
            tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, -1,
                                 (intptr_t)(s->tb_jmp_target_addr + a0));
        }
        s->tb_jmp_reset_offset[a0] = tcg_current_code_size(s);
        break;
    case INDEX_op_goto_ptr:
        /* jmp to the given host address (could be epilogue) */
        tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0);
        break;
    case INDEX_op_br:
        tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
        break;
    OP_32_64(ld8u):
        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
        tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2);
        break;
    OP_32_64(ld8s):
        tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2);
        break;
    OP_32_64(ld16u):
        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
        tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2);
        break;
    OP_32_64(ld16s):
        tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2);
        break;
#if TCG_TARGET_REG_BITS == 64
    case INDEX_op_ld32u_i64:
#endif
    case INDEX_op_ld_i32:
        tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2);
        break;

    OP_32_64(st8):
        if (const_args[0]) {
            tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2);
            tcg_out8(s, a0);
        } else {
            tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2);
        }
        break;
    OP_32_64(st16):
        if (const_args[0]) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2);
            tcg_out16(s, a0);
        } else {
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2);
        }
        break;
#if TCG_TARGET_REG_BITS == 64
    case INDEX_op_st32_i64:
#endif
    case INDEX_op_st_i32:
        if (const_args[0]) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2);
            tcg_out32(s, a0);
        } else {
            tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2);
        }
        break;

    OP_32_64(add):
        /* For 3-operand addition, use LEA.  */
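        /* E.g. a0 = a1 + a2 becomes "lea (%a1,%a2),%a0", and
           a0 = a1 + imm becomes "lea imm(%a1),%a0".  */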
        if (a0 != a1) {
            TCGArg c3 = 0;
            if (const_a2) {
                c3 = a2, a2 = -1;
            } else if (a0 == a2) {
                /* Watch out for dest = src + dest, since we've removed
                   the matching constraint on the add.  */
                tgen_arithr(s, ARITH_ADD + rexw, a0, a1);
                break;
            }

            tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3);
            break;
        }
        c = ARITH_ADD;
        goto gen_arith;
    OP_32_64(sub):
        c = ARITH_SUB;
        goto gen_arith;
    OP_32_64(and):
        c = ARITH_AND;
        goto gen_arith;
    OP_32_64(or):
        c = ARITH_OR;
        goto gen_arith;
    OP_32_64(xor):
        c = ARITH_XOR;
        goto gen_arith;
    gen_arith:
        if (const_a2) {
            tgen_arithi(s, c + rexw, a0, a2, 0);
        } else {
            tgen_arithr(s, c + rexw, a0, a2);
        }
        break;

    OP_32_64(andc):
        if (const_a2) {
            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
            tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0);
        } else {
            tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1);
        }
        break;

    OP_32_64(mul):
        if (const_a2) {
            int32_t val;
            val = a2;
            if (val == (int8_t)val) {
                tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0);
                tcg_out8(s, val);
            } else {
                tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0);
                tcg_out32(s, val);
            }
        } else {
            tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2);
        }
        break;

    OP_32_64(div2):
        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]);
        break;
    OP_32_64(divu2):
        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]);
        break;

    OP_32_64(shl):
        /* For small constant 3-operand shift, use LEA.  */
        if (const_a2 && a0 != a1 && (a2 - 1) < 3) {
            if (a2 - 1 == 0) {
                /* shl $1,a1,a0 -> lea (a1,a1),a0 */
                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0);
            } else {
                /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */
                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0);
            }
            break;
        }
        c = SHIFT_SHL;
        vexop = OPC_SHLX;
        goto gen_shift_maybe_vex;
    OP_32_64(shr):
        c = SHIFT_SHR;
        vexop = OPC_SHRX;
        goto gen_shift_maybe_vex;
    OP_32_64(sar):
        c = SHIFT_SAR;
        vexop = OPC_SARX;
        goto gen_shift_maybe_vex;
    OP_32_64(rotl):
        c = SHIFT_ROL;
        goto gen_shift;
    OP_32_64(rotr):
        c = SHIFT_ROR;
        goto gen_shift;
    gen_shift_maybe_vex:
        if (have_bmi2) {
            if (!const_a2) {
                tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1);
                break;
            }
            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
        }
        /* FALLTHRU */
    gen_shift:
        if (const_a2) {
            tcg_out_shifti(s, c + rexw, a0, a2);
        } else {
            tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0);
        }
        break;

    OP_32_64(ctz):
        tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
        break;
    OP_32_64(clz):
        tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
        break;
    OP_32_64(ctpop):
        tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1);
        break;

    case INDEX_op_brcond_i32:
        tcg_out_brcond32(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
        break;
    case INDEX_op_setcond_i32:
        tcg_out_setcond32(s, args[3], a0, a1, a2, const_a2);
        break;
    case INDEX_op_movcond_i32:
        tcg_out_movcond32(s, args[5], a0, a1, a2, const_a2, args[3]);
        break;

    OP_32_64(bswap16):
        tcg_out_rolw_8(s, a0);
        break;
    OP_32_64(bswap32):
        tcg_out_bswap32(s, a0);
        break;

    OP_32_64(neg):
        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0);
        break;
    OP_32_64(not):
        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0);
        break;

    OP_32_64(ext8s):
        tcg_out_ext8s(s, a0, a1, rexw);
        break;
    OP_32_64(ext16s):
        tcg_out_ext16s(s, a0, a1, rexw);
        break;
    OP_32_64(ext8u):
        tcg_out_ext8u(s, a0, a1);
        break;
    OP_32_64(ext16u):
        tcg_out_ext16u(s, a0, a1);
        break;

    case INDEX_op_qemu_ld_i32:
        tcg_out_qemu_ld(s, args, 0);
        break;
    case INDEX_op_qemu_ld_i64:
        tcg_out_qemu_ld(s, args, 1);
        break;
    case INDEX_op_qemu_st_i32:
        tcg_out_qemu_st(s, args, 0);
        break;
    case INDEX_op_qemu_st_i64:
        tcg_out_qemu_st(s, args, 1);
        break;

    OP_32_64(mulu2):
        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
        break;
    OP_32_64(muls2):
        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]);
        break;
    OP_32_64(add2):
        if (const_args[4]) {
            tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1);
        } else {
            tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]);
        }
        if (const_args[5]) {
            tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1);
        } else {
            tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]);
        }
        break;
    OP_32_64(sub2):
        if (const_args[4]) {
            tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1);
        } else {
            tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]);
        }
        if (const_args[5]) {
            tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1);
        } else {
            tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]);
        }
        break;

#if TCG_TARGET_REG_BITS == 32
    case INDEX_op_brcond2_i32:
        tcg_out_brcond2(s, args, const_args, 0);
        break;
    case INDEX_op_setcond2_i32:
        tcg_out_setcond2(s, args, const_args);
        break;
#else /* TCG_TARGET_REG_BITS == 64 */
    case INDEX_op_ld32s_i64:
        tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2);
        break;
    case INDEX_op_ld_i64:
        tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2);
        break;
    case INDEX_op_st_i64:
        if (const_args[0]) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2);
            tcg_out32(s, a0);
        } else {
            tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2);
        }
        break;

    case INDEX_op_brcond_i64:
        tcg_out_brcond64(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
        break;
    case INDEX_op_setcond_i64:
        tcg_out_setcond64(s, args[3], a0, a1, a2, const_a2);
        break;
    case INDEX_op_movcond_i64:
        tcg_out_movcond64(s, args[5], a0, a1, a2, const_a2, args[3]);
        break;

    case INDEX_op_bswap64_i64:
        tcg_out_bswap64(s, a0);
        break;
    case INDEX_op_extu_i32_i64:
    case INDEX_op_ext32u_i64:
        tcg_out_ext32u(s, a0, a1);
        break;
    case INDEX_op_ext_i32_i64:
    case INDEX_op_ext32s_i64:
        tcg_out_ext32s(s, a0, a1);
        break;
#endif

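    /* Only deposits that map onto a single partial-register move are
       accepted here; the TCG_TARGET_deposit_*_valid macros in
       tcg-target.h steer every other case to the generic expansion.  */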
    OP_32_64(deposit):
        if (args[3] == 0 && args[4] == 8) {
            /* load bits 0..7 */
            tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
        } else if (args[3] == 8 && args[4] == 8) {
            /* load bits 8..15 */
            tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
        } else if (args[3] == 0 && args[4] == 16) {
            /* load bits 0..15 */
            tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
        } else {
            tcg_abort();
        }
        break;

    case INDEX_op_extract_i64:
        if (a2 + args[3] == 32) {
            /* This is a 32-bit zero-extending right shift.  */
            tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
            tcg_out_shifti(s, SHIFT_SHR, a0, a2);
            break;
        }
        /* FALLTHRU */
    case INDEX_op_extract_i32:
        /* Use the high-byte registers when we can; otherwise emit the
           same ext16 + shift pattern that we would have gotten from
           the normal tcg-op.c expansion.  */
        tcg_debug_assert(a2 == 8 && args[3] == 8);
        if (a1 < 4 && a0 < 8) {
            tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
        } else {
            tcg_out_ext16u(s, a0, a1);
            tcg_out_shifti(s, SHIFT_SHR, a0, 8);
        }
        break;

    case INDEX_op_sextract_i32:
        /* We don't implement sextract_i64, as we cannot sign-extend to
           64 bits without using the REX prefix that explicitly excludes
           access to the high-byte registers.  */
        tcg_debug_assert(a2 == 8 && args[3] == 8);
        if (a1 < 4 && a0 < 8) {
            tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
        } else {
            tcg_out_ext16s(s, a0, a1, 0);
            tcg_out_shifti(s, SHIFT_SAR, a0, 8);
        }
        break;

    case INDEX_op_mb:
        tcg_out_mb(s, a0);
        break;
    case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
    case INDEX_op_mov_i64:
    case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi.  */
    case INDEX_op_movi_i64:
    case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
    default:
        tcg_abort();
    }

#undef OP_32_64
}

static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
{
    static const TCGTargetOpDef r = { .args_ct_str = { "r" } };
    static const TCGTargetOpDef ri_r = { .args_ct_str = { "ri", "r" } };
    static const TCGTargetOpDef re_r = { .args_ct_str = { "re", "r" } };
    static const TCGTargetOpDef qi_r = { .args_ct_str = { "qi", "r" } };
    static const TCGTargetOpDef r_r = { .args_ct_str = { "r", "r" } };
    static const TCGTargetOpDef r_q = { .args_ct_str = { "r", "q" } };
    static const TCGTargetOpDef r_re = { .args_ct_str = { "r", "re" } };
    static const TCGTargetOpDef r_0 = { .args_ct_str = { "r", "0" } };
    static const TCGTargetOpDef r_r_ri = { .args_ct_str = { "r", "r", "ri" } };
    static const TCGTargetOpDef r_r_re = { .args_ct_str = { "r", "r", "re" } };
    static const TCGTargetOpDef r_0_re = { .args_ct_str = { "r", "0", "re" } };
    static const TCGTargetOpDef r_0_ci = { .args_ct_str = { "r", "0", "ci" } };
    static const TCGTargetOpDef r_L = { .args_ct_str = { "r", "L" } };
    static const TCGTargetOpDef L_L = { .args_ct_str = { "L", "L" } };
    static const TCGTargetOpDef r_L_L = { .args_ct_str = { "r", "L", "L" } };
    static const TCGTargetOpDef r_r_L = { .args_ct_str = { "r", "r", "L" } };
    static const TCGTargetOpDef L_L_L = { .args_ct_str = { "L", "L", "L" } };
    static const TCGTargetOpDef r_r_L_L
        = { .args_ct_str = { "r", "r", "L", "L" } };
    static const TCGTargetOpDef L_L_L_L
        = { .args_ct_str = { "L", "L", "L", "L" } };
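
    /* The constraint letters are decoded by target_parse_constraint
       earlier in this file: "r" is any register, "q" a byte-addressable
       register, "Q" a register with an addressable second byte, "L" a
       register usable by qemu_ld/st, "a"/"c"/"d" are EAX/ECX/EDX,
       "0"/"1" match the corresponding output operand, "&" marks an
       early-clobber output, and "e"/"Z"/"I"/"W" accept the constant
       classes defined near the top of the file.  */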

    switch (op) {
    case INDEX_op_goto_ptr:
        return &r;

    case INDEX_op_ld8u_i32:
    case INDEX_op_ld8u_i64:
    case INDEX_op_ld8s_i32:
    case INDEX_op_ld8s_i64:
    case INDEX_op_ld16u_i32:
    case INDEX_op_ld16u_i64:
    case INDEX_op_ld16s_i32:
    case INDEX_op_ld16s_i64:
    case INDEX_op_ld_i32:
    case INDEX_op_ld32u_i64:
    case INDEX_op_ld32s_i64:
    case INDEX_op_ld_i64:
        return &r_r;

    case INDEX_op_st8_i32:
    case INDEX_op_st8_i64:
        return &qi_r;
    case INDEX_op_st16_i32:
    case INDEX_op_st16_i64:
    case INDEX_op_st_i32:
    case INDEX_op_st32_i64:
        return &ri_r;
    case INDEX_op_st_i64:
        return &re_r;

    case INDEX_op_add_i32:
    case INDEX_op_add_i64:
        return &r_r_re;
    case INDEX_op_sub_i32:
    case INDEX_op_sub_i64:
    case INDEX_op_mul_i32:
    case INDEX_op_mul_i64:
    case INDEX_op_or_i32:
    case INDEX_op_or_i64:
    case INDEX_op_xor_i32:
    case INDEX_op_xor_i64:
        return &r_0_re;

    case INDEX_op_and_i32:
    case INDEX_op_and_i64:
        {
            static const TCGTargetOpDef and
                = { .args_ct_str = { "r", "0", "reZ" } };
            return &and;
        }
        break;
    case INDEX_op_andc_i32:
    case INDEX_op_andc_i64:
        {
            static const TCGTargetOpDef andc
                = { .args_ct_str = { "r", "r", "rI" } };
            return &andc;
        }
        break;

    case INDEX_op_shl_i32:
    case INDEX_op_shl_i64:
    case INDEX_op_shr_i32:
    case INDEX_op_shr_i64:
    case INDEX_op_sar_i32:
    case INDEX_op_sar_i64:
        return have_bmi2 ? &r_r_ri : &r_0_ci;
    case INDEX_op_rotl_i32:
    case INDEX_op_rotl_i64:
    case INDEX_op_rotr_i32:
    case INDEX_op_rotr_i64:
        return &r_0_ci;

    case INDEX_op_brcond_i32:
    case INDEX_op_brcond_i64:
        return &r_re;

    case INDEX_op_bswap16_i32:
    case INDEX_op_bswap16_i64:
    case INDEX_op_bswap32_i32:
    case INDEX_op_bswap32_i64:
    case INDEX_op_bswap64_i64:
    case INDEX_op_neg_i32:
    case INDEX_op_neg_i64:
    case INDEX_op_not_i32:
    case INDEX_op_not_i64:
        return &r_0;

    case INDEX_op_ext8s_i32:
    case INDEX_op_ext8s_i64:
    case INDEX_op_ext8u_i32:
    case INDEX_op_ext8u_i64:
        return &r_q;
    case INDEX_op_ext16s_i32:
    case INDEX_op_ext16s_i64:
    case INDEX_op_ext16u_i32:
    case INDEX_op_ext16u_i64:
    case INDEX_op_ext32s_i64:
    case INDEX_op_ext32u_i64:
    case INDEX_op_ext_i32_i64:
    case INDEX_op_extu_i32_i64:
    case INDEX_op_extract_i32:
    case INDEX_op_extract_i64:
    case INDEX_op_sextract_i32:
    case INDEX_op_ctpop_i32:
    case INDEX_op_ctpop_i64:
        return &r_r;

    case INDEX_op_deposit_i32:
    case INDEX_op_deposit_i64:
        {
            static const TCGTargetOpDef dep
                = { .args_ct_str = { "Q", "0", "Q" } };
            return &dep;
        }
    case INDEX_op_setcond_i32:
    case INDEX_op_setcond_i64:
        {
            static const TCGTargetOpDef setc
                = { .args_ct_str = { "q", "r", "re" } };
            return &setc;
        }
    case INDEX_op_movcond_i32:
    case INDEX_op_movcond_i64:
        {
            static const TCGTargetOpDef movc
                = { .args_ct_str = { "r", "r", "re", "r", "0" } };
            return &movc;
        }
    case INDEX_op_div2_i32:
    case INDEX_op_div2_i64:
    case INDEX_op_divu2_i32:
    case INDEX_op_divu2_i64:
        {
            static const TCGTargetOpDef div2
                = { .args_ct_str = { "a", "d", "0", "1", "r" } };
            return &div2;
        }
    case INDEX_op_mulu2_i32:
    case INDEX_op_mulu2_i64:
    case INDEX_op_muls2_i32:
    case INDEX_op_muls2_i64:
        {
            static const TCGTargetOpDef mul2
                = { .args_ct_str = { "a", "d", "a", "r" } };
            return &mul2;
        }
    case INDEX_op_add2_i32:
    case INDEX_op_add2_i64:
    case INDEX_op_sub2_i32:
    case INDEX_op_sub2_i64:
        {
            static const TCGTargetOpDef arith2
                = { .args_ct_str = { "r", "r", "0", "1", "re", "re" } };
            return &arith2;
        }
    case INDEX_op_ctz_i32:
    case INDEX_op_ctz_i64:
        {
            static const TCGTargetOpDef ctz[2] = {
                { .args_ct_str = { "&r", "r", "r" } },
                { .args_ct_str = { "&r", "r", "rW" } },
            };
            return &ctz[have_bmi1];
        }
    case INDEX_op_clz_i32:
    case INDEX_op_clz_i64:
        {
            static const TCGTargetOpDef clz[2] = {
                { .args_ct_str = { "&r", "r", "r" } },
                { .args_ct_str = { "&r", "r", "rW" } },
            };
            return &clz[have_lzcnt];
        }

    case INDEX_op_qemu_ld_i32:
        return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_L : &r_L_L;
    case INDEX_op_qemu_st_i32:
        return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &L_L : &L_L_L;
    case INDEX_op_qemu_ld_i64:
        return (TCG_TARGET_REG_BITS == 64 ? &r_L
                : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_r_L
                : &r_r_L_L);
    case INDEX_op_qemu_st_i64:
        return (TCG_TARGET_REG_BITS == 64 ? &L_L
                : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &L_L_L
                : &L_L_L_L);

    case INDEX_op_brcond2_i32:
        {
            static const TCGTargetOpDef b2
                = { .args_ct_str = { "r", "r", "ri", "ri" } };
            return &b2;
        }
    case INDEX_op_setcond2_i32:
        {
            static const TCGTargetOpDef s2
                = { .args_ct_str = { "r", "r", "r", "ri", "ri" } };
            return &s2;
        }

    default:
        break;
    }
    return NULL;
}

static const int tcg_target_callee_save_regs[] = {
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_RBP,
    TCG_REG_RBX,
#if defined(_WIN64)
    TCG_REG_RDI,
    TCG_REG_RSI,
#endif
    TCG_REG_R12,
    TCG_REG_R13,
    TCG_REG_R14, /* Currently used for the global env. */
    TCG_REG_R15,
#else
    TCG_REG_EBP, /* Currently used for the global env. */
    TCG_REG_EBX,
    TCG_REG_ESI,
    TCG_REG_EDI,
#endif
};

/* Compute frame size via macros, to share between tcg_target_qemu_prologue
   and tcg_register_jit.  */

#define PUSH_SIZE \
    ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \
     * (TCG_TARGET_REG_BITS / 8))

#define FRAME_SIZE \
    ((PUSH_SIZE \
      + TCG_STATIC_CALL_ARGS_SIZE \
      + CPU_TEMP_BUF_NLONGS * sizeof(long) \
      + TCG_TARGET_STACK_ALIGN - 1) \
     & ~(TCG_TARGET_STACK_ALIGN - 1))
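
/* For example, on 64-bit Linux hosts six registers are saved, so
   PUSH_SIZE is (1 + 6) * 8 = 56 bytes, the +1 counting the return
   address that the call into the code buffer has already pushed.  */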

/* Generate global QEMU prologue and epilogue code */
static void tcg_target_qemu_prologue(TCGContext *s)
{
    int i, stack_addend;

    /* TB prologue */

    /* Reserve some stack space, also for TCG temps.  */
    stack_addend = FRAME_SIZE - PUSH_SIZE;
    tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
                  CPU_TEMP_BUF_NLONGS * sizeof(long));
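    /* Helper call arguments occupy the bottom TCG_STATIC_CALL_ARGS_SIZE
       bytes of the frame, with the CPU_TEMP_BUF_NLONGS temporaries just
       above them, as registered with tcg_set_frame above.  */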

    /* Save all callee saved registers.  */
    for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
        tcg_out_push(s, tcg_target_callee_save_regs[i]);
    }

#if TCG_TARGET_REG_BITS == 32
    tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
               (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
    tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
    /* jmp *tb.  */
    tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
                         (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
                         + stack_addend);
#else
    tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
    tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
    /* jmp *tb.  */
    tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
#endif

    /*
     * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
     * and fall through to the rest of the epilogue.
     */
    s->code_gen_epilogue = s->code_ptr;
    tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);

    /* TB epilogue */
    tb_ret_addr = s->code_ptr;

    tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);

    for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
        tcg_out_pop(s, tcg_target_callee_save_regs[i]);
    }
    tcg_out_opc(s, OPC_RET, 0, 0, 0);

#if !defined(CONFIG_SOFTMMU)
    /* Try to set up a segment register to point to guest_base.  */
    if (guest_base) {
        setup_guest_base_seg();
    }
#endif
}

static void tcg_target_init(TCGContext *s)
{
#ifdef CONFIG_CPUID_H
    unsigned a, b, c, d;
    unsigned max = __get_cpuid_max(0, 0);

    if (max >= 1) {
        __cpuid(1, a, b, c, d);
#ifndef have_cmov
        /* For 32-bit, 99% certainty that we're running on hardware that
           supports cmov, but we still need to check.  In case cmov is not
           available, we'll use a small forward branch.  */
        have_cmov = (d & bit_CMOV) != 0;
#endif
        /* MOVBE is only available on Intel Atom and Haswell CPUs, so we
           need to probe for it.  */
        have_movbe = (c & bit_MOVBE) != 0;
        have_popcnt = (c & bit_POPCNT) != 0;
    }

    if (max >= 7) {
        /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs.  */
        __cpuid_count(7, 0, a, b, c, d);
        have_bmi1 = (b & bit_BMI) != 0;
        have_bmi2 = (b & bit_BMI2) != 0;
    }

    max = __get_cpuid_max(0x80000000, 0);
    if (max >= 0x80000001) {
        __cpuid(0x80000001, a, b, c, d);
        /* LZCNT was introduced with AMD Barcelona and Intel Haswell CPUs.  */
        have_lzcnt = (c & bit_LZCNT) != 0;
    }
#endif /* CONFIG_CPUID_H */

    if (TCG_TARGET_REG_BITS == 64) {
        tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_I32], 0, 0xffff);
        tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_I64], 0, 0xffff);
    } else {
        tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_I32], 0, 0xff);
    }

    tcg_regset_clear(tcg_target_call_clobber_regs);
    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
    if (TCG_TARGET_REG_BITS == 64) {
#if !defined(_WIN64)
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
#endif
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
    }

    tcg_regset_clear(s->reserved_regs);
    tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
}

typedef struct {
    DebugFrameHeader h;
    uint8_t fde_def_cfa[4];
    uint8_t fde_reg_ofs[14];
} DebugFrame;

/* We're expecting a 2-byte uleb128 encoded value.  */
QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
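
/* Each uleb128 byte holds 7 value bits, with the high bit flagging a
   continuation, so the two bytes emitted in fde_def_cfa below cover
   any FRAME_SIZE up to 1 << 14.  */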

#if !defined(__ELF__)
    /* Host machine without ELF. */
#elif TCG_TARGET_REG_BITS == 64
#define ELF_HOST_MACHINE EM_X86_64
static const DebugFrame debug_frame = {
    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
    .h.cie.id = -1,
    .h.cie.version = 1,
    .h.cie.code_align = 1,
    .h.cie.data_align = 0x78,             /* sleb128 -8 */
    .h.cie.return_column = 16,

    /* Total FDE size does not include the "len" member.  */
    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),

    .fde_def_cfa = {
        12, 7,                          /* DW_CFA_def_cfa %rsp, ... */
        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
        (FRAME_SIZE >> 7)
    },
    .fde_reg_ofs = {
        0x90, 1,                        /* DW_CFA_offset, %rip, -8 */
        /* The following ordering must match tcg_target_callee_save_regs.  */
        0x86, 2,                        /* DW_CFA_offset, %rbp, -16 */
        0x83, 3,                        /* DW_CFA_offset, %rbx, -24 */
        0x8c, 4,                        /* DW_CFA_offset, %r12, -32 */
        0x8d, 5,                        /* DW_CFA_offset, %r13, -40 */
        0x8e, 6,                        /* DW_CFA_offset, %r14, -48 */
        0x8f, 7,                        /* DW_CFA_offset, %r15, -56 */
    }
};
#else
#define ELF_HOST_MACHINE EM_386
static const DebugFrame debug_frame = {
    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
    .h.cie.id = -1,
    .h.cie.version = 1,
    .h.cie.code_align = 1,
    .h.cie.data_align = 0x7c,             /* sleb128 -4 */
    .h.cie.return_column = 8,

    /* Total FDE size does not include the "len" member.  */
    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),

    .fde_def_cfa = {
        12, 4,                          /* DW_CFA_def_cfa %esp, ... */
        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
        (FRAME_SIZE >> 7)
    },
    .fde_reg_ofs = {
        0x88, 1,                        /* DW_CFA_offset, %eip, -4 */
        /* The following ordering must match tcg_target_callee_save_regs.  */
        0x85, 2,                        /* DW_CFA_offset, %ebp, -8 */
        0x83, 3,                        /* DW_CFA_offset, %ebx, -12 */
        0x86, 4,                        /* DW_CFA_offset, %esi, -16 */
        0x87, 5,                        /* DW_CFA_offset, %edi, -20 */
    }
};
#endif

#if defined(ELF_HOST_MACHINE)
void tcg_register_jit(void *buf, size_t buf_size)
{
    tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
}
#endif