qemu/tcg/i386/tcg-target.inc.c
   1/*
   2 * Tiny Code Generator for QEMU
   3 *
   4 * Copyright (c) 2008 Fabrice Bellard
   5 *
   6 * Permission is hereby granted, free of charge, to any person obtaining a copy
   7 * of this software and associated documentation files (the "Software"), to deal
   8 * in the Software without restriction, including without limitation the rights
   9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10 * copies of the Software, and to permit persons to whom the Software is
  11 * furnished to do so, subject to the following conditions:
  12 *
  13 * The above copyright notice and this permission notice shall be included in
  14 * all copies or substantial portions of the Software.
  15 *
  16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22 * THE SOFTWARE.
  23 */
  24
  25#include "tcg-pool.inc.c"
  26
  27#ifdef CONFIG_DEBUG_TCG
  28static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
  29#if TCG_TARGET_REG_BITS == 64
  30    "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
  31#else
  32    "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
  33#endif
  34    "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
  35    "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
  36#if TCG_TARGET_REG_BITS == 64
  37    "%xmm8", "%xmm9", "%xmm10", "%xmm11",
  38    "%xmm12", "%xmm13", "%xmm14", "%xmm15",
  39#endif
  40};
  41#endif
  42
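/* Call-saved registers are listed first, so the register allocator
   prefers them and temporaries are more likely to survive calls to
   helper functions.  */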
  43static const int tcg_target_reg_alloc_order[] = {
  44#if TCG_TARGET_REG_BITS == 64
  45    TCG_REG_RBP,
  46    TCG_REG_RBX,
  47    TCG_REG_R12,
  48    TCG_REG_R13,
  49    TCG_REG_R14,
  50    TCG_REG_R15,
  51    TCG_REG_R10,
  52    TCG_REG_R11,
  53    TCG_REG_R9,
  54    TCG_REG_R8,
  55    TCG_REG_RCX,
  56    TCG_REG_RDX,
  57    TCG_REG_RSI,
  58    TCG_REG_RDI,
  59    TCG_REG_RAX,
  60#else
  61    TCG_REG_EBX,
  62    TCG_REG_ESI,
  63    TCG_REG_EDI,
  64    TCG_REG_EBP,
  65    TCG_REG_ECX,
  66    TCG_REG_EDX,
  67    TCG_REG_EAX,
  68#endif
  69    TCG_REG_XMM0,
  70    TCG_REG_XMM1,
  71    TCG_REG_XMM2,
  72    TCG_REG_XMM3,
  73    TCG_REG_XMM4,
  74    TCG_REG_XMM5,
  75#ifndef _WIN64
   76    /* The Win64 ABI has xmm6-xmm15 as callee-saved, and we do not save
  77       any of them.  Therefore only allow xmm0-xmm5 to be allocated.  */
  78    TCG_REG_XMM6,
  79    TCG_REG_XMM7,
  80#if TCG_TARGET_REG_BITS == 64
  81    TCG_REG_XMM8,
  82    TCG_REG_XMM9,
  83    TCG_REG_XMM10,
  84    TCG_REG_XMM11,
  85    TCG_REG_XMM12,
  86    TCG_REG_XMM13,
  87    TCG_REG_XMM14,
  88    TCG_REG_XMM15,
  89#endif
  90#endif
  91};
  92
  93static const int tcg_target_call_iarg_regs[] = {
  94#if TCG_TARGET_REG_BITS == 64
  95#if defined(_WIN64)
  96    TCG_REG_RCX,
  97    TCG_REG_RDX,
  98#else
  99    TCG_REG_RDI,
 100    TCG_REG_RSI,
 101    TCG_REG_RDX,
 102    TCG_REG_RCX,
 103#endif
 104    TCG_REG_R8,
 105    TCG_REG_R9,
 106#else
  107    /* 32-bit mode uses a stack-based calling convention (GCC default). */
 108#endif
 109};
 110
 111static const int tcg_target_call_oarg_regs[] = {
 112    TCG_REG_EAX,
 113#if TCG_TARGET_REG_BITS == 32
 114    TCG_REG_EDX
 115#endif
 116};
 117
 118/* Constants we accept.  */
 119#define TCG_CT_CONST_S32 0x100
 120#define TCG_CT_CONST_U32 0x200
 121#define TCG_CT_CONST_I32 0x400
 122#define TCG_CT_CONST_WSZ 0x800
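/* TCG_CT_CONST_S32 accepts values that fit in a sign-extended 32-bit
   immediate, TCG_CT_CONST_U32 a zero-extended 32-bit immediate,
   TCG_CT_CONST_I32 values whose complement (~val) fits in 32 bits, and
   TCG_CT_CONST_WSZ only the operand size itself (32 or 64).  See
   tcg_target_const_match() below.  */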
 123
 124/* Registers used with L constraint, which are the first argument
  125   registers on x86_64, and two random call-clobbered registers on
 126   i386. */
 127#if TCG_TARGET_REG_BITS == 64
 128# define TCG_REG_L0 tcg_target_call_iarg_regs[0]
 129# define TCG_REG_L1 tcg_target_call_iarg_regs[1]
 130#else
 131# define TCG_REG_L0 TCG_REG_EAX
 132# define TCG_REG_L1 TCG_REG_EDX
 133#endif
 134
  135/* The host compiler should supply <cpuid.h> to enable runtime feature
 136   detection, as we're not going to go so far as our own inline assembly.
 137   If not available, default values will be assumed.  */
 138#if defined(CONFIG_CPUID_H)
 139#include "qemu/cpuid.h"
 140#endif
 141
 142/* For 64-bit, we always know that CMOV is available.  */
 143#if TCG_TARGET_REG_BITS == 64
 144# define have_cmov 1
 145#elif defined(CONFIG_CPUID_H)
 146static bool have_cmov;
 147#else
 148# define have_cmov 0
 149#endif
 150
 151/* We need these symbols in tcg-target.h, and we can't properly conditionalize
 152   it there.  Therefore we always define the variable.  */
 153bool have_bmi1;
 154bool have_popcnt;
 155bool have_avx1;
 156bool have_avx2;
 157
 158#ifdef CONFIG_CPUID_H
 159static bool have_movbe;
 160static bool have_bmi2;
 161static bool have_lzcnt;
 162#else
 163# define have_movbe 0
 164# define have_bmi2 0
 165# define have_lzcnt 0
 166#endif
 167
 168static tcg_insn_unit *tb_ret_addr;
 169
 170static void patch_reloc(tcg_insn_unit *code_ptr, int type,
 171                        intptr_t value, intptr_t addend)
 172{
 173    value += addend;
 174    switch(type) {
 175    case R_386_PC32:
 176        value -= (uintptr_t)code_ptr;
 177        if (value != (int32_t)value) {
 178            tcg_abort();
 179        }
 180        /* FALLTHRU */
 181    case R_386_32:
 182        tcg_patch32(code_ptr, value);
 183        break;
 184    case R_386_PC8:
 185        value -= (uintptr_t)code_ptr;
 186        if (value != (int8_t)value) {
 187            tcg_abort();
 188        }
 189        tcg_patch8(code_ptr, value);
 190        break;
 191    default:
 192        tcg_abort();
 193    }
 194}
 195
 196#if TCG_TARGET_REG_BITS == 64
 197#define ALL_GENERAL_REGS   0x0000ffffu
 198#define ALL_VECTOR_REGS    0xffff0000u
 199#else
 200#define ALL_GENERAL_REGS   0x000000ffu
 201#define ALL_VECTOR_REGS    0x00ff0000u
 202#endif
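/* Register numbers 0..15 are the integer registers and 16..31 the xmm
   registers (0..7 and 16..23 on a 32-bit host), matching
   tcg_target_reg_names above.  */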
 203
 204/* parse target specific constraints */
 205static const char *target_parse_constraint(TCGArgConstraint *ct,
 206                                           const char *ct_str, TCGType type)
 207{
 208    switch(*ct_str++) {
 209    case 'a':
 210        ct->ct |= TCG_CT_REG;
 211        tcg_regset_set_reg(ct->u.regs, TCG_REG_EAX);
 212        break;
 213    case 'b':
 214        ct->ct |= TCG_CT_REG;
 215        tcg_regset_set_reg(ct->u.regs, TCG_REG_EBX);
 216        break;
 217    case 'c':
 218        ct->ct |= TCG_CT_REG;
 219        tcg_regset_set_reg(ct->u.regs, TCG_REG_ECX);
 220        break;
 221    case 'd':
 222        ct->ct |= TCG_CT_REG;
 223        tcg_regset_set_reg(ct->u.regs, TCG_REG_EDX);
 224        break;
 225    case 'S':
 226        ct->ct |= TCG_CT_REG;
 227        tcg_regset_set_reg(ct->u.regs, TCG_REG_ESI);
 228        break;
 229    case 'D':
 230        ct->ct |= TCG_CT_REG;
 231        tcg_regset_set_reg(ct->u.regs, TCG_REG_EDI);
 232        break;
 233    case 'q':
 234        /* A register that can be used as a byte operand.  */
 235        ct->ct |= TCG_CT_REG;
 236        ct->u.regs = TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xf;
 237        break;
 238    case 'Q':
 239        /* A register with an addressable second byte (e.g. %ah).  */
 240        ct->ct |= TCG_CT_REG;
 241        ct->u.regs = 0xf;
 242        break;
 243    case 'r':
 244        /* A general register.  */
 245        ct->ct |= TCG_CT_REG;
 246        ct->u.regs |= ALL_GENERAL_REGS;
 247        break;
 248    case 'W':
 249        /* With TZCNT/LZCNT, we can have operand-size as an input.  */
 250        ct->ct |= TCG_CT_CONST_WSZ;
 251        break;
 252    case 'x':
 253        /* A vector register.  */
 254        ct->ct |= TCG_CT_REG;
 255        ct->u.regs |= ALL_VECTOR_REGS;
 256        break;
 257
 258        /* qemu_ld/st address constraint */
 259    case 'L':
 260        ct->ct |= TCG_CT_REG;
 261        ct->u.regs = TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xff;
 262        tcg_regset_reset_reg(ct->u.regs, TCG_REG_L0);
 263        tcg_regset_reset_reg(ct->u.regs, TCG_REG_L1);
 264        break;
 265
 266    case 'e':
 267        ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_S32);
 268        break;
 269    case 'Z':
 270        ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_U32);
 271        break;
 272    case 'I':
 273        ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_I32);
 274        break;
 275
 276    default:
 277        return NULL;
 278    }
 279    return ct_str;
 280}
 281
 282/* test if a constant matches the constraint */
 283static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
 284                                         const TCGArgConstraint *arg_ct)
 285{
 286    int ct = arg_ct->ct;
 287    if (ct & TCG_CT_CONST) {
 288        return 1;
 289    }
 290    if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
 291        return 1;
 292    }
 293    if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
 294        return 1;
 295    }
 296    if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
 297        return 1;
 298    }
 299    if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
 300        return 1;
 301    }
 302    return 0;
 303}
 304
 305#if TCG_TARGET_REG_BITS == 64
 306# define LOWREGMASK(x)  ((x) & 7)
 307#else
 308# define LOWREGMASK(x)  (x)
 309#endif
 310
 311#define P_EXT           0x100           /* 0x0f opcode prefix */
 312#define P_EXT38         0x200           /* 0x0f 0x38 opcode prefix */
 313#define P_DATA16        0x400           /* 0x66 opcode prefix */
 314#if TCG_TARGET_REG_BITS == 64
 315# define P_ADDR32       0x800           /* 0x67 opcode prefix */
 316# define P_REXW         0x1000          /* Set REX.W = 1 */
 317# define P_REXB_R       0x2000          /* REG field as byte register */
 318# define P_REXB_RM      0x4000          /* R/M field as byte register */
 319# define P_GS           0x8000          /* gs segment override */
 320#else
 321# define P_ADDR32       0
 322# define P_REXW         0
 323# define P_REXB_R       0
 324# define P_REXB_RM      0
 325# define P_GS           0
 326#endif
 327#define P_EXT3A         0x10000         /* 0x0f 0x3a opcode prefix */
 328#define P_SIMDF3        0x20000         /* 0xf3 opcode prefix */
 329#define P_SIMDF2        0x40000         /* 0xf2 opcode prefix */
 330#define P_VEXL          0x80000         /* Set VEX.L = 1 */
 331
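/* In the OPC_* constants below, the low byte is the primary opcode byte
   and the P_* flags select the prefixes emitted by tcg_out_opc().  As a
   rough illustration, OPC_MOVZBL (0xb6 | P_EXT) is emitted as "0f b6 /r",
   and adding P_DATA16 would prepend a 0x66 prefix.  */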
 332#define OPC_ARITH_EvIz  (0x81)
 333#define OPC_ARITH_EvIb  (0x83)
 334#define OPC_ARITH_GvEv  (0x03)          /* ... plus (ARITH_FOO << 3) */
 335#define OPC_ANDN        (0xf2 | P_EXT38)
 336#define OPC_ADD_GvEv    (OPC_ARITH_GvEv | (ARITH_ADD << 3))
 337#define OPC_BLENDPS     (0x0c | P_EXT3A | P_DATA16)
 338#define OPC_BSF         (0xbc | P_EXT)
 339#define OPC_BSR         (0xbd | P_EXT)
 340#define OPC_BSWAP       (0xc8 | P_EXT)
 341#define OPC_CALL_Jz     (0xe8)
 342#define OPC_CMOVCC      (0x40 | P_EXT)  /* ... plus condition code */
 343#define OPC_CMP_GvEv    (OPC_ARITH_GvEv | (ARITH_CMP << 3))
 344#define OPC_DEC_r32     (0x48)
 345#define OPC_IMUL_GvEv   (0xaf | P_EXT)
 346#define OPC_IMUL_GvEvIb (0x6b)
 347#define OPC_IMUL_GvEvIz (0x69)
 348#define OPC_INC_r32     (0x40)
 349#define OPC_JCC_long    (0x80 | P_EXT)  /* ... plus condition code */
 350#define OPC_JCC_short   (0x70)          /* ... plus condition code */
 351#define OPC_JMP_long    (0xe9)
 352#define OPC_JMP_short   (0xeb)
 353#define OPC_LEA         (0x8d)
 354#define OPC_LZCNT       (0xbd | P_EXT | P_SIMDF3)
 355#define OPC_MOVB_EvGv   (0x88)          /* stores, more or less */
 356#define OPC_MOVL_EvGv   (0x89)          /* stores, more or less */
 357#define OPC_MOVL_GvEv   (0x8b)          /* loads, more or less */
 358#define OPC_MOVB_EvIz   (0xc6)
 359#define OPC_MOVL_EvIz   (0xc7)
 360#define OPC_MOVL_Iv     (0xb8)
 361#define OPC_MOVBE_GyMy  (0xf0 | P_EXT38)
 362#define OPC_MOVBE_MyGy  (0xf1 | P_EXT38)
 363#define OPC_MOVD_VyEy   (0x6e | P_EXT | P_DATA16)
 364#define OPC_MOVD_EyVy   (0x7e | P_EXT | P_DATA16)
 365#define OPC_MOVDDUP     (0x12 | P_EXT | P_SIMDF2)
 366#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
 367#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
 368#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
 369#define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3)
 370#define OPC_MOVQ_VqWq   (0x7e | P_EXT | P_SIMDF3)
 371#define OPC_MOVQ_WqVq   (0xd6 | P_EXT | P_DATA16)
 372#define OPC_MOVSBL      (0xbe | P_EXT)
 373#define OPC_MOVSWL      (0xbf | P_EXT)
 374#define OPC_MOVSLQ      (0x63 | P_REXW)
 375#define OPC_MOVZBL      (0xb6 | P_EXT)
 376#define OPC_MOVZWL      (0xb7 | P_EXT)
 377#define OPC_PACKSSDW    (0x6b | P_EXT | P_DATA16)
 378#define OPC_PACKSSWB    (0x63 | P_EXT | P_DATA16)
 379#define OPC_PACKUSDW    (0x2b | P_EXT38 | P_DATA16)
 380#define OPC_PACKUSWB    (0x67 | P_EXT | P_DATA16)
 381#define OPC_PADDB       (0xfc | P_EXT | P_DATA16)
 382#define OPC_PADDW       (0xfd | P_EXT | P_DATA16)
 383#define OPC_PADDD       (0xfe | P_EXT | P_DATA16)
 384#define OPC_PADDQ       (0xd4 | P_EXT | P_DATA16)
 385#define OPC_PAND        (0xdb | P_EXT | P_DATA16)
 386#define OPC_PANDN       (0xdf | P_EXT | P_DATA16)
 387#define OPC_PBLENDW     (0x0e | P_EXT3A | P_DATA16)
 388#define OPC_PCMPEQB     (0x74 | P_EXT | P_DATA16)
 389#define OPC_PCMPEQW     (0x75 | P_EXT | P_DATA16)
 390#define OPC_PCMPEQD     (0x76 | P_EXT | P_DATA16)
 391#define OPC_PCMPEQQ     (0x29 | P_EXT38 | P_DATA16)
 392#define OPC_PCMPGTB     (0x64 | P_EXT | P_DATA16)
 393#define OPC_PCMPGTW     (0x65 | P_EXT | P_DATA16)
 394#define OPC_PCMPGTD     (0x66 | P_EXT | P_DATA16)
 395#define OPC_PCMPGTQ     (0x37 | P_EXT38 | P_DATA16)
 396#define OPC_PMOVSXBW    (0x20 | P_EXT38 | P_DATA16)
 397#define OPC_PMOVSXWD    (0x23 | P_EXT38 | P_DATA16)
 398#define OPC_PMOVSXDQ    (0x25 | P_EXT38 | P_DATA16)
 399#define OPC_PMOVZXBW    (0x30 | P_EXT38 | P_DATA16)
 400#define OPC_PMOVZXWD    (0x33 | P_EXT38 | P_DATA16)
 401#define OPC_PMOVZXDQ    (0x35 | P_EXT38 | P_DATA16)
 402#define OPC_PMULLW      (0xd5 | P_EXT | P_DATA16)
 403#define OPC_PMULLD      (0x40 | P_EXT38 | P_DATA16)
 404#define OPC_POR         (0xeb | P_EXT | P_DATA16)
 405#define OPC_PSHUFB      (0x00 | P_EXT38 | P_DATA16)
 406#define OPC_PSHUFD      (0x70 | P_EXT | P_DATA16)
 407#define OPC_PSHUFLW     (0x70 | P_EXT | P_SIMDF2)
 408#define OPC_PSHUFHW     (0x70 | P_EXT | P_SIMDF3)
 409#define OPC_PSHIFTW_Ib  (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
 410#define OPC_PSHIFTD_Ib  (0x72 | P_EXT | P_DATA16) /* /2 /6 /4 */
 411#define OPC_PSHIFTQ_Ib  (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
 412#define OPC_PSUBB       (0xf8 | P_EXT | P_DATA16)
 413#define OPC_PSUBW       (0xf9 | P_EXT | P_DATA16)
 414#define OPC_PSUBD       (0xfa | P_EXT | P_DATA16)
 415#define OPC_PSUBQ       (0xfb | P_EXT | P_DATA16)
 416#define OPC_PUNPCKLBW   (0x60 | P_EXT | P_DATA16)
 417#define OPC_PUNPCKLWD   (0x61 | P_EXT | P_DATA16)
 418#define OPC_PUNPCKLDQ   (0x62 | P_EXT | P_DATA16)
 419#define OPC_PUNPCKLQDQ  (0x6c | P_EXT | P_DATA16)
 420#define OPC_PUNPCKHBW   (0x68 | P_EXT | P_DATA16)
 421#define OPC_PUNPCKHWD   (0x69 | P_EXT | P_DATA16)
 422#define OPC_PUNPCKHDQ   (0x6a | P_EXT | P_DATA16)
 423#define OPC_PUNPCKHQDQ  (0x6d | P_EXT | P_DATA16)
 424#define OPC_PXOR        (0xef | P_EXT | P_DATA16)
 425#define OPC_POP_r32     (0x58)
 426#define OPC_POPCNT      (0xb8 | P_EXT | P_SIMDF3)
 427#define OPC_PUSH_r32    (0x50)
 428#define OPC_PUSH_Iv     (0x68)
 429#define OPC_PUSH_Ib     (0x6a)
 430#define OPC_RET         (0xc3)
 431#define OPC_SETCC       (0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
 432#define OPC_SHIFT_1     (0xd1)
 433#define OPC_SHIFT_Ib    (0xc1)
 434#define OPC_SHIFT_cl    (0xd3)
 435#define OPC_SARX        (0xf7 | P_EXT38 | P_SIMDF3)
 436#define OPC_SHUFPS      (0xc6 | P_EXT)
 437#define OPC_SHLX        (0xf7 | P_EXT38 | P_DATA16)
 438#define OPC_SHRX        (0xf7 | P_EXT38 | P_SIMDF2)
 439#define OPC_TESTL       (0x85)
 440#define OPC_TZCNT       (0xbc | P_EXT | P_SIMDF3)
 441#define OPC_UD2         (0x0b | P_EXT)
 442#define OPC_VPBLENDD    (0x02 | P_EXT3A | P_DATA16)
 443#define OPC_VPBLENDVB   (0x4c | P_EXT3A | P_DATA16)
 444#define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16)
 445#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
 446#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
 447#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
 448#define OPC_VPERMQ      (0x00 | P_EXT3A | P_DATA16 | P_REXW)
 449#define OPC_VPERM2I128  (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
 450#define OPC_VZEROUPPER  (0x77 | P_EXT)
 451#define OPC_XCHG_ax_r32 (0x90)
 452
 453#define OPC_GRP3_Ev     (0xf7)
 454#define OPC_GRP5        (0xff)
 455#define OPC_GRP14       (0x73 | P_EXT | P_DATA16)
 456
 457/* Group 1 opcode extensions for 0x80-0x83.
 458   These are also used as modifiers for OPC_ARITH.  */
 459#define ARITH_ADD 0
 460#define ARITH_OR  1
 461#define ARITH_ADC 2
 462#define ARITH_SBB 3
 463#define ARITH_AND 4
 464#define ARITH_SUB 5
 465#define ARITH_XOR 6
 466#define ARITH_CMP 7
 467
 468/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3.  */
 469#define SHIFT_ROL 0
 470#define SHIFT_ROR 1
 471#define SHIFT_SHL 4
 472#define SHIFT_SHR 5
 473#define SHIFT_SAR 7
 474
 475/* Group 3 opcode extensions for 0xf6, 0xf7.  To be used with OPC_GRP3.  */
 476#define EXT3_NOT   2
 477#define EXT3_NEG   3
 478#define EXT3_MUL   4
 479#define EXT3_IMUL  5
 480#define EXT3_DIV   6
 481#define EXT3_IDIV  7
 482
 483/* Group 5 opcode extensions for 0xff.  To be used with OPC_GRP5.  */
 484#define EXT5_INC_Ev     0
 485#define EXT5_DEC_Ev     1
 486#define EXT5_CALLN_Ev   2
 487#define EXT5_JMPN_Ev    4
 488
 489/* Condition codes to be added to OPC_JCC_{long,short}.  */
 490#define JCC_JMP (-1)
 491#define JCC_JO  0x0
 492#define JCC_JNO 0x1
 493#define JCC_JB  0x2
 494#define JCC_JAE 0x3
 495#define JCC_JE  0x4
 496#define JCC_JNE 0x5
 497#define JCC_JBE 0x6
 498#define JCC_JA  0x7
 499#define JCC_JS  0x8
 500#define JCC_JNS 0x9
 501#define JCC_JP  0xa
 502#define JCC_JNP 0xb
 503#define JCC_JL  0xc
 504#define JCC_JGE 0xd
 505#define JCC_JLE 0xe
 506#define JCC_JG  0xf
 507
 508static const uint8_t tcg_cond_to_jcc[] = {
 509    [TCG_COND_EQ] = JCC_JE,
 510    [TCG_COND_NE] = JCC_JNE,
 511    [TCG_COND_LT] = JCC_JL,
 512    [TCG_COND_GE] = JCC_JGE,
 513    [TCG_COND_LE] = JCC_JLE,
 514    [TCG_COND_GT] = JCC_JG,
 515    [TCG_COND_LTU] = JCC_JB,
 516    [TCG_COND_GEU] = JCC_JAE,
 517    [TCG_COND_LEU] = JCC_JBE,
 518    [TCG_COND_GTU] = JCC_JA,
 519};
 520
 521#if TCG_TARGET_REG_BITS == 64
 522static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
 523{
 524    int rex;
 525
 526    if (opc & P_GS) {
 527        tcg_out8(s, 0x65);
 528    }
 529    if (opc & P_DATA16) {
 530        /* We should never be asking for both 16 and 64-bit operation.  */
 531        tcg_debug_assert((opc & P_REXW) == 0);
 532        tcg_out8(s, 0x66);
 533    }
 534    if (opc & P_ADDR32) {
 535        tcg_out8(s, 0x67);
 536    }
 537    if (opc & P_SIMDF3) {
 538        tcg_out8(s, 0xf3);
 539    } else if (opc & P_SIMDF2) {
 540        tcg_out8(s, 0xf2);
 541    }
 542
 543    rex = 0;
 544    rex |= (opc & P_REXW) ? 0x8 : 0x0;  /* REX.W */
 545    rex |= (r & 8) >> 1;                /* REX.R */
 546    rex |= (x & 8) >> 2;                /* REX.X */
 547    rex |= (rm & 8) >> 3;               /* REX.B */
 548
 549    /* P_REXB_{R,RM} indicates that the given register is the low byte.
 550       For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
 551       as otherwise the encoding indicates %[abcd]h.  Note that the values
 552       that are ORed in merely indicate that the REX byte must be present;
 553       those bits get discarded in output.  */
 554    rex |= opc & (r >= 4 ? P_REXB_R : 0);
 555    rex |= opc & (rm >= 4 ? P_REXB_RM : 0);
 556
 557    if (rex) {
 558        tcg_out8(s, (uint8_t)(rex | 0x40));
 559    }
 560
 561    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
 562        tcg_out8(s, 0x0f);
 563        if (opc & P_EXT38) {
 564            tcg_out8(s, 0x38);
 565        } else if (opc & P_EXT3A) {
 566            tcg_out8(s, 0x3a);
 567        }
 568    }
 569
 570    tcg_out8(s, opc);
 571}
 572#else
 573static void tcg_out_opc(TCGContext *s, int opc)
 574{
 575    if (opc & P_DATA16) {
 576        tcg_out8(s, 0x66);
 577    }
 578    if (opc & P_SIMDF3) {
 579        tcg_out8(s, 0xf3);
 580    } else if (opc & P_SIMDF2) {
 581        tcg_out8(s, 0xf2);
 582    }
 583    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
 584        tcg_out8(s, 0x0f);
 585        if (opc & P_EXT38) {
 586            tcg_out8(s, 0x38);
 587        } else if (opc & P_EXT3A) {
 588            tcg_out8(s, 0x3a);
 589        }
 590    }
 591    tcg_out8(s, opc);
 592}
 593/* Discard the register arguments to tcg_out_opc early, so as not to penalize
 594   the 32-bit compilation paths.  This method works with all versions of gcc,
 595   whereas relying on optimization may not be able to exclude them.  */
 596#define tcg_out_opc(s, opc, r, rm, x)  (tcg_out_opc)(s, opc)
 597#endif
 598
 599static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
 600{
 601    tcg_out_opc(s, opc, r, rm, 0);
 602    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
 603}
 604
 605static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
 606                            int rm, int index)
 607{
 608    int tmp;
 609
 610    /* Use the two byte form if possible, which cannot encode
 611       VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT.  */
 612    if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_REXW)) == P_EXT
 613        && ((rm | index) & 8) == 0) {
 614        /* Two byte VEX prefix.  */
 615        tcg_out8(s, 0xc5);
 616
 617        tmp = (r & 8 ? 0 : 0x80);              /* VEX.R */
 618    } else {
 619        /* Three byte VEX prefix.  */
 620        tcg_out8(s, 0xc4);
 621
 622        /* VEX.m-mmmm */
 623        if (opc & P_EXT3A) {
 624            tmp = 3;
 625        } else if (opc & P_EXT38) {
 626            tmp = 2;
 627        } else if (opc & P_EXT) {
 628            tmp = 1;
 629        } else {
 630            g_assert_not_reached();
 631        }
 632        tmp |= (r & 8 ? 0 : 0x80);             /* VEX.R */
 633        tmp |= (index & 8 ? 0 : 0x40);         /* VEX.X */
 634        tmp |= (rm & 8 ? 0 : 0x20);            /* VEX.B */
 635        tcg_out8(s, tmp);
 636
 637        tmp = (opc & P_REXW ? 0x80 : 0);       /* VEX.W */
 638    }
 639
 640    tmp |= (opc & P_VEXL ? 0x04 : 0);      /* VEX.L */
 641    /* VEX.pp */
 642    if (opc & P_DATA16) {
 643        tmp |= 1;                          /* 0x66 */
 644    } else if (opc & P_SIMDF3) {
 645        tmp |= 2;                          /* 0xf3 */
 646    } else if (opc & P_SIMDF2) {
 647        tmp |= 3;                          /* 0xf2 */
 648    }
 649    tmp |= (~v & 15) << 3;                 /* VEX.vvvv */
 650    tcg_out8(s, tmp);
 651    tcg_out8(s, opc);
 652}
 653
 654static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
 655{
 656    tcg_out_vex_opc(s, opc, r, v, rm, 0);
 657    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
 658}
 659
 660/* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
  661   We handle RM and/or INDEX missing by passing a negative value.  In 64-bit
 662   mode for absolute addresses, ~RM is the size of the immediate operand
 663   that will follow the instruction.  */
 664
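/* As an illustrative example, tcg_out_modrm_offset(s, OPC_MOVL_GvEv,
   TCG_REG_EAX, TCG_REG_EBX, 8) encodes movl 8(%ebx), %eax as the bytes
   "8b 43 08": the opcode, a mod=01/reg=eax/rm=ebx ModRM byte, and the
   8-bit displacement.  */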
 665static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
 666                               int shift, intptr_t offset)
 667{
 668    int mod, len;
 669
 670    if (index < 0 && rm < 0) {
 671        if (TCG_TARGET_REG_BITS == 64) {
 672            /* Try for a rip-relative addressing mode.  This has replaced
 673               the 32-bit-mode absolute addressing encoding.  */
 674            intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
 675            intptr_t disp = offset - pc;
 676            if (disp == (int32_t)disp) {
 677                tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
 678                tcg_out32(s, disp);
 679                return;
 680            }
 681
 682            /* Try for an absolute address encoding.  This requires the
 683               use of the MODRM+SIB encoding and is therefore larger than
 684               rip-relative addressing.  */
 685            if (offset == (int32_t)offset) {
 686                tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
 687                tcg_out8(s, (4 << 3) | 5);
 688                tcg_out32(s, offset);
 689                return;
 690            }
 691
 692            /* ??? The memory isn't directly addressable.  */
 693            g_assert_not_reached();
 694        } else {
 695            /* Absolute address.  */
 696            tcg_out8(s, (r << 3) | 5);
 697            tcg_out32(s, offset);
 698            return;
 699        }
 700    }
 701
 702    /* Find the length of the immediate addend.  Note that the encoding
 703       that would be used for (%ebp) indicates absolute addressing.  */
 704    if (rm < 0) {
 705        mod = 0, len = 4, rm = 5;
 706    } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
 707        mod = 0, len = 0;
 708    } else if (offset == (int8_t)offset) {
 709        mod = 0x40, len = 1;
 710    } else {
 711        mod = 0x80, len = 4;
 712    }
 713
 714    /* Use a single byte MODRM format if possible.  Note that the encoding
 715       that would be used for %esp is the escape to the two byte form.  */
 716    if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
 717        /* Single byte MODRM format.  */
 718        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
 719    } else {
 720        /* Two byte MODRM+SIB format.  */
 721
 722        /* Note that the encoding that would place %esp into the index
 723           field indicates no index register.  In 64-bit mode, the REX.X
 724           bit counts, so %r12 can be used as the index.  */
 725        if (index < 0) {
 726            index = 4;
 727        } else {
 728            tcg_debug_assert(index != TCG_REG_ESP);
 729        }
 730
 731        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
 732        tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
 733    }
 734
 735    if (len == 1) {
 736        tcg_out8(s, offset);
 737    } else if (len == 4) {
 738        tcg_out32(s, offset);
 739    }
 740}
 741
 742static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
 743                                     int index, int shift, intptr_t offset)
 744{
 745    tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
 746    tcg_out_sib_offset(s, r, rm, index, shift, offset);
 747}
 748
 749static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
 750                                         int rm, int index, int shift,
 751                                         intptr_t offset)
 752{
 753    tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
 754    tcg_out_sib_offset(s, r, rm, index, shift, offset);
 755}
 756
 757/* A simplification of the above with no index or shift.  */
 758static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
 759                                        int rm, intptr_t offset)
 760{
 761    tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
 762}
 763
 764static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
 765                                            int v, int rm, intptr_t offset)
 766{
 767    tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
 768}
 769
 770/* Output an opcode with an expected reference to the constant pool.  */
 771static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
 772{
 773    tcg_out_opc(s, opc, r, 0, 0);
 774    /* Absolute for 32-bit, pc-relative for 64-bit.  */
 775    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
 776    tcg_out32(s, 0);
 777}
 778
 779/* Output an opcode with an expected reference to the constant pool.  */
 780static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
 781{
 782    tcg_out_vex_opc(s, opc, r, 0, 0, 0);
 783    /* Absolute for 32-bit, pc-relative for 64-bit.  */
 784    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
 785    tcg_out32(s, 0);
 786}
 787
 788/* Generate dest op= src.  Uses the same ARITH_* codes as tgen_arithi.  */
 789static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
 790{
 791    /* Propagate an opcode prefix, such as P_REXW.  */
 792    int ext = subop & ~0x7;
 793    subop &= 0x7;
 794
 795    tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
 796}
 797
 798static void tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
 799{
 800    int rexw = 0;
 801
 802    if (arg == ret) {
 803        return;
 804    }
 805    switch (type) {
 806    case TCG_TYPE_I64:
 807        rexw = P_REXW;
 808        /* fallthru */
 809    case TCG_TYPE_I32:
 810        if (ret < 16) {
 811            if (arg < 16) {
 812                tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
 813            } else {
 814                tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
 815            }
 816        } else {
 817            if (arg < 16) {
 818                tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
 819            } else {
 820                tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
 821            }
 822        }
 823        break;
 824
 825    case TCG_TYPE_V64:
 826        tcg_debug_assert(ret >= 16 && arg >= 16);
 827        tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
 828        break;
 829    case TCG_TYPE_V128:
 830        tcg_debug_assert(ret >= 16 && arg >= 16);
 831        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
 832        break;
 833    case TCG_TYPE_V256:
 834        tcg_debug_assert(ret >= 16 && arg >= 16);
 835        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
 836        break;
 837
 838    default:
 839        g_assert_not_reached();
 840    }
 841}
 842
 843static void tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
 844                            TCGReg r, TCGReg a)
 845{
 846    if (have_avx2) {
 847        static const int dup_insn[4] = {
 848            OPC_VPBROADCASTB, OPC_VPBROADCASTW,
 849            OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
 850        };
 851        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
 852        tcg_out_vex_modrm(s, dup_insn[vece] + vex_l, r, 0, a);
 853    } else {
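        /* Without AVX2 broadcasts, interleave the element with itself to
           double its width (8 -> 16 -> 32 bits), then replicate the
           32-bit lane with PSHUFD, or the 64-bit lane with PUNPCKLQDQ.  */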
 854        switch (vece) {
 855        case MO_8:
 856            /* ??? With zero in a register, use PSHUFB.  */
 857            tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a);
 858            a = r;
 859            /* FALLTHRU */
 860        case MO_16:
 861            tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a);
 862            a = r;
 863            /* FALLTHRU */
 864        case MO_32:
 865            tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
 866            /* imm8 operand: all output lanes selected from input lane 0.  */
 867            tcg_out8(s, 0);
 868            break;
 869        case MO_64:
 870            tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a);
 871            break;
 872        default:
 873            g_assert_not_reached();
 874        }
 875    }
 876}
 877
 878static void tcg_out_dupi_vec(TCGContext *s, TCGType type,
 879                             TCGReg ret, tcg_target_long arg)
 880{
 881    int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
 882
 883    if (arg == 0) {
 884        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
 885        return;
 886    }
 887    if (arg == -1) {
 888        tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
 889        return;
 890    }
 891
 892    if (TCG_TARGET_REG_BITS == 64) {
 893        if (type == TCG_TYPE_V64) {
 894            tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
 895        } else if (have_avx2) {
 896            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
 897        } else {
 898            tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
 899        }
 900        new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
 901    } else if (have_avx2) {
 902        tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
 903        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
 904    } else {
 905        tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy, ret);
 906        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
 907        tcg_out_dup_vec(s, type, MO_32, ret, ret);
 908    }
 909}
 910
 911static void tcg_out_movi(TCGContext *s, TCGType type,
 912                         TCGReg ret, tcg_target_long arg)
 913{
 914    tcg_target_long diff;
 915
 916    switch (type) {
 917    case TCG_TYPE_I32:
 918#if TCG_TARGET_REG_BITS == 64
 919    case TCG_TYPE_I64:
 920#endif
 921        if (ret < 16) {
 922            break;
 923        }
 924        /* fallthru */
 925    case TCG_TYPE_V64:
 926    case TCG_TYPE_V128:
 927    case TCG_TYPE_V256:
 928        tcg_debug_assert(ret >= 16);
 929        tcg_out_dupi_vec(s, type, ret, arg);
 930        return;
 931    default:
 932        g_assert_not_reached();
 933    }
 934
 935    if (arg == 0) {
 936        tgen_arithr(s, ARITH_XOR, ret, ret);
 937        return;
 938    }
 939    if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
 940        tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
 941        tcg_out32(s, arg);
 942        return;
 943    }
 944    if (arg == (int32_t)arg) {
 945        tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
 946        tcg_out32(s, arg);
 947        return;
 948    }
 949
 950    /* Try a 7 byte pc-relative lea before the 10 byte movq.  */
 951    diff = arg - ((uintptr_t)s->code_ptr + 7);
 952    if (diff == (int32_t)diff) {
 953        tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
 954        tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
 955        tcg_out32(s, diff);
 956        return;
 957    }
 958
 959    tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
 960    tcg_out64(s, arg);
 961}
 962
 963static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
 964{
 965    if (val == (int8_t)val) {
 966        tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
 967        tcg_out8(s, val);
 968    } else if (val == (int32_t)val) {
 969        tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
 970        tcg_out32(s, val);
 971    } else {
 972        tcg_abort();
 973    }
 974}
 975
 976static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
 977{
  978    /* Given the strength of x86 memory ordering, we need only care about
 979       store-load ordering.  Experimentally, "lock orl $0,0(%esp)" is
 980       faster than "mfence", so don't bother with the sse insn.  */
 981    if (a0 & TCG_MO_ST_LD) {
 982        tcg_out8(s, 0xf0);
 983        tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
 984        tcg_out8(s, 0);
 985    }
 986}
 987
 988static inline void tcg_out_push(TCGContext *s, int reg)
 989{
 990    tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
 991}
 992
 993static inline void tcg_out_pop(TCGContext *s, int reg)
 994{
 995    tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
 996}
 997
 998static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
 999                       TCGReg arg1, intptr_t arg2)
1000{
1001    switch (type) {
1002    case TCG_TYPE_I32:
1003        if (ret < 16) {
1004            tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
1005        } else {
1006            tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
1007        }
1008        break;
1009    case TCG_TYPE_I64:
1010        if (ret < 16) {
1011            tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
1012            break;
1013        }
1014        /* FALLTHRU */
1015    case TCG_TYPE_V64:
1016        tcg_debug_assert(ret >= 16);
1017        tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
1018        break;
1019    case TCG_TYPE_V128:
1020        tcg_debug_assert(ret >= 16);
1021        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx, ret, 0, arg1, arg2);
1022        break;
1023    case TCG_TYPE_V256:
1024        tcg_debug_assert(ret >= 16);
1025        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
1026                                 ret, 0, arg1, arg2);
1027        break;
1028    default:
1029        g_assert_not_reached();
1030    }
1031}
1032
1033static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
1034                       TCGReg arg1, intptr_t arg2)
1035{
1036    switch (type) {
1037    case TCG_TYPE_I32:
1038        if (arg < 16) {
1039            tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
1040        } else {
1041            tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
1042        }
1043        break;
1044    case TCG_TYPE_I64:
1045        if (arg < 16) {
1046            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
1047            break;
1048        }
1049        /* FALLTHRU */
1050    case TCG_TYPE_V64:
1051        tcg_debug_assert(arg >= 16);
1052        tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
1053        break;
1054    case TCG_TYPE_V128:
1055        tcg_debug_assert(arg >= 16);
1056        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx, arg, 0, arg1, arg2);
1057        break;
1058    case TCG_TYPE_V256:
1059        tcg_debug_assert(arg >= 16);
1060        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
1061                                 arg, 0, arg1, arg2);
1062        break;
1063    default:
1064        g_assert_not_reached();
1065    }
1066}
1067
1068static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
1069                        TCGReg base, intptr_t ofs)
1070{
1071    int rexw = 0;
1072    if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
1073        if (val != (int32_t)val) {
1074            return false;
1075        }
1076        rexw = P_REXW;
1077    } else if (type != TCG_TYPE_I32) {
1078        return false;
1079    }
1080    tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
1081    tcg_out32(s, val);
1082    return true;
1083}
1084
1085static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
1086{
1087    /* Propagate an opcode prefix, such as P_DATA16.  */
1088    int ext = subopc & ~0x7;
1089    subopc &= 0x7;
1090
1091    if (count == 1) {
1092        tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
1093    } else {
1094        tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
1095        tcg_out8(s, count);
1096    }
1097}
1098
1099static inline void tcg_out_bswap32(TCGContext *s, int reg)
1100{
1101    tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
1102}
1103
1104static inline void tcg_out_rolw_8(TCGContext *s, int reg)
1105{
1106    tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
1107}
1108
1109static inline void tcg_out_ext8u(TCGContext *s, int dest, int src)
1110{
1111    /* movzbl */
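    /* On a 32-bit host only %al, %bl, %cl and %dl are usable as byte
       operands; x86-64 reaches the other low bytes via a REX prefix
       (P_REXB_RM).  */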
1112    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1113    tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
1114}
1115
1116static void tcg_out_ext8s(TCGContext *s, int dest, int src, int rexw)
1117{
1118    /* movsbl */
1119    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1120    tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
1121}
1122
1123static inline void tcg_out_ext16u(TCGContext *s, int dest, int src)
1124{
1125    /* movzwl */
1126    tcg_out_modrm(s, OPC_MOVZWL, dest, src);
1127}
1128
1129static inline void tcg_out_ext16s(TCGContext *s, int dest, int src, int rexw)
1130{
1131    /* movsw[lq] */
1132    tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
1133}
1134
1135static inline void tcg_out_ext32u(TCGContext *s, int dest, int src)
1136{
1137    /* 32-bit mov zero extends.  */
1138    tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
1139}
1140
1141static inline void tcg_out_ext32s(TCGContext *s, int dest, int src)
1142{
1143    tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
1144}
1145
1146static inline void tcg_out_bswap64(TCGContext *s, int reg)
1147{
1148    tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
1149}
1150
1151static void tgen_arithi(TCGContext *s, int c, int r0,
1152                        tcg_target_long val, int cf)
1153{
1154    int rexw = 0;
1155
1156    if (TCG_TARGET_REG_BITS == 64) {
1157        rexw = c & -8;
1158        c &= 7;
1159    }
1160
 1161    /* ??? While INC is 2 bytes shorter than ADDL $1, it also induces
 1162       partial flags update stalls on Pentium4 and is not recommended
1163       by current Intel optimization manuals.  */
1164    if (!cf && (c == ARITH_ADD || c == ARITH_SUB) && (val == 1 || val == -1)) {
1165        int is_inc = (c == ARITH_ADD) ^ (val < 0);
1166        if (TCG_TARGET_REG_BITS == 64) {
1167            /* The single-byte increment encodings are re-tasked as the
1168               REX prefixes.  Use the MODRM encoding.  */
1169            tcg_out_modrm(s, OPC_GRP5 + rexw,
1170                          (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
1171        } else {
1172            tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
1173        }
1174        return;
1175    }
1176
1177    if (c == ARITH_AND) {
1178        if (TCG_TARGET_REG_BITS == 64) {
1179            if (val == 0xffffffffu) {
1180                tcg_out_ext32u(s, r0, r0);
1181                return;
1182            }
1183            if (val == (uint32_t)val) {
1184                /* AND with no high bits set can use a 32-bit operation.  */
1185                rexw = 0;
1186            }
1187        }
1188        if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
1189            tcg_out_ext8u(s, r0, r0);
1190            return;
1191        }
1192        if (val == 0xffffu) {
1193            tcg_out_ext16u(s, r0, r0);
1194            return;
1195        }
1196    }
1197
1198    if (val == (int8_t)val) {
1199        tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
1200        tcg_out8(s, val);
1201        return;
1202    }
1203    if (rexw == 0 || val == (int32_t)val) {
1204        tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
1205        tcg_out32(s, val);
1206        return;
1207    }
1208
1209    tcg_abort();
1210}
1211
1212static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
1213{
1214    if (val != 0) {
1215        tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
1216    }
1217}
1218
1219/* Use SMALL != 0 to force a short forward branch.  */
1220static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, int small)
1221{
1222    int32_t val, val1;
1223
1224    if (l->has_value) {
1225        val = tcg_pcrel_diff(s, l->u.value_ptr);
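        /* The displacement is relative to the end of the branch insn:
           2 bytes for the short forms, 5 for JMP rel32 (e9) and 6 for
           Jcc rel32 (0f 8x), hence the adjustments below.  */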
1226        val1 = val - 2;
1227        if ((int8_t)val1 == val1) {
1228            if (opc == -1) {
1229                tcg_out8(s, OPC_JMP_short);
1230            } else {
1231                tcg_out8(s, OPC_JCC_short + opc);
1232            }
1233            tcg_out8(s, val1);
1234        } else {
1235            if (small) {
1236                tcg_abort();
1237            }
1238            if (opc == -1) {
1239                tcg_out8(s, OPC_JMP_long);
1240                tcg_out32(s, val - 5);
1241            } else {
1242                tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1243                tcg_out32(s, val - 6);
1244            }
1245        }
1246    } else if (small) {
1247        if (opc == -1) {
1248            tcg_out8(s, OPC_JMP_short);
1249        } else {
1250            tcg_out8(s, OPC_JCC_short + opc);
1251        }
1252        tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
1253        s->code_ptr += 1;
1254    } else {
1255        if (opc == -1) {
1256            tcg_out8(s, OPC_JMP_long);
1257        } else {
1258            tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1259        }
1260        tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
1261        s->code_ptr += 4;
1262    }
1263}
1264
1265static void tcg_out_cmp(TCGContext *s, TCGArg arg1, TCGArg arg2,
1266                        int const_arg2, int rexw)
1267{
1268    if (const_arg2) {
1269        if (arg2 == 0) {
1270            /* test r, r */
1271            tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
1272        } else {
1273            tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
1274        }
1275    } else {
1276        tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
1277    }
1278}
1279
1280static void tcg_out_brcond32(TCGContext *s, TCGCond cond,
1281                             TCGArg arg1, TCGArg arg2, int const_arg2,
1282                             TCGLabel *label, int small)
1283{
1284    tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
1285    tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
1286}
1287
1288#if TCG_TARGET_REG_BITS == 64
1289static void tcg_out_brcond64(TCGContext *s, TCGCond cond,
1290                             TCGArg arg1, TCGArg arg2, int const_arg2,
1291                             TCGLabel *label, int small)
1292{
1293    tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
1294    tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
1295}
1296#else
1297/* XXX: we implement it at the target level to avoid having to
 1298   handle temporaries that live across basic blocks.  */
1299static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
1300                            const int *const_args, int small)
1301{
1302    TCGLabel *label_next = gen_new_label();
1303    TCGLabel *label_this = arg_label(args[5]);
1304
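    /* Equality is tested word by word; for the ordered conditions,
       branch on the high words first and, when they are equal, decide
       with an unsigned comparison of the low words.  */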
1305    switch(args[4]) {
1306    case TCG_COND_EQ:
1307        tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
1308                         label_next, 1);
1309        tcg_out_brcond32(s, TCG_COND_EQ, args[1], args[3], const_args[3],
1310                         label_this, small);
1311        break;
1312    case TCG_COND_NE:
1313        tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
1314                         label_this, small);
1315        tcg_out_brcond32(s, TCG_COND_NE, args[1], args[3], const_args[3],
1316                         label_this, small);
1317        break;
1318    case TCG_COND_LT:
1319        tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
1320                         label_this, small);
1321        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1322        tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
1323                         label_this, small);
1324        break;
1325    case TCG_COND_LE:
1326        tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
1327                         label_this, small);
1328        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1329        tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
1330                         label_this, small);
1331        break;
1332    case TCG_COND_GT:
1333        tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
1334                         label_this, small);
1335        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1336        tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
1337                         label_this, small);
1338        break;
1339    case TCG_COND_GE:
1340        tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
1341                         label_this, small);
1342        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1343        tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
1344                         label_this, small);
1345        break;
1346    case TCG_COND_LTU:
1347        tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
1348                         label_this, small);
1349        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1350        tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
1351                         label_this, small);
1352        break;
1353    case TCG_COND_LEU:
1354        tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
1355                         label_this, small);
1356        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1357        tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
1358                         label_this, small);
1359        break;
1360    case TCG_COND_GTU:
1361        tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1362                         label_this, small);
1363        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1364        tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
1365                         label_this, small);
1366        break;
1367    case TCG_COND_GEU:
1368        tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1369                         label_this, small);
1370        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1371        tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
1372                         label_this, small);
1373        break;
1374    default:
1375        tcg_abort();
1376    }
1377    tcg_out_label(s, label_next, s->code_ptr);
1378}
1379#endif
1380
1381static void tcg_out_setcond32(TCGContext *s, TCGCond cond, TCGArg dest,
1382                              TCGArg arg1, TCGArg arg2, int const_arg2)
1383{
1384    tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
1385    tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1386    tcg_out_ext8u(s, dest, dest);
1387}
1388
1389#if TCG_TARGET_REG_BITS == 64
1390static void tcg_out_setcond64(TCGContext *s, TCGCond cond, TCGArg dest,
1391                              TCGArg arg1, TCGArg arg2, int const_arg2)
1392{
1393    tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
1394    tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1395    tcg_out_ext8u(s, dest, dest);
1396}
1397#else
1398static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
1399                             const int *const_args)
1400{
1401    TCGArg new_args[6];
1402    TCGLabel *label_true, *label_over;
1403
1404    memcpy(new_args, args+1, 5*sizeof(TCGArg));
1405
1406    if (args[0] == args[1] || args[0] == args[2]
1407        || (!const_args[3] && args[0] == args[3])
1408        || (!const_args[4] && args[0] == args[4])) {
1409        /* When the destination overlaps with one of the argument
1410           registers, don't do anything tricky.  */
1411        label_true = gen_new_label();
1412        label_over = gen_new_label();
1413
1414        new_args[5] = label_arg(label_true);
1415        tcg_out_brcond2(s, new_args, const_args+1, 1);
1416
1417        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1418        tcg_out_jxx(s, JCC_JMP, label_over, 1);
1419        tcg_out_label(s, label_true, s->code_ptr);
1420
1421        tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
1422        tcg_out_label(s, label_over, s->code_ptr);
1423    } else {
1424        /* When the destination does not overlap one of the arguments,
1425           clear the destination first, jump if cond false, and emit an
1426           increment in the true case.  This results in smaller code.  */
1427
1428        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1429
1430        label_over = gen_new_label();
1431        new_args[4] = tcg_invert_cond(new_args[4]);
1432        new_args[5] = label_arg(label_over);
1433        tcg_out_brcond2(s, new_args, const_args+1, 1);
1434
1435        tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
1436        tcg_out_label(s, label_over, s->code_ptr);
1437    }
1438}
1439#endif
1440
1441static void tcg_out_cmov(TCGContext *s, TCGCond cond, int rexw,
1442                         TCGReg dest, TCGReg v1)
1443{
1444    if (have_cmov) {
1445        tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | rexw, dest, v1);
1446    } else {
1447        TCGLabel *over = gen_new_label();
1448        tcg_out_jxx(s, tcg_cond_to_jcc[tcg_invert_cond(cond)], over, 1);
1449        tcg_out_mov(s, TCG_TYPE_I32, dest, v1);
1450        tcg_out_label(s, over, s->code_ptr);
1451    }
1452}
1453
1454static void tcg_out_movcond32(TCGContext *s, TCGCond cond, TCGReg dest,
1455                              TCGReg c1, TCGArg c2, int const_c2,
1456                              TCGReg v1)
1457{
1458    tcg_out_cmp(s, c1, c2, const_c2, 0);
1459    tcg_out_cmov(s, cond, 0, dest, v1);
1460}
1461
1462#if TCG_TARGET_REG_BITS == 64
1463static void tcg_out_movcond64(TCGContext *s, TCGCond cond, TCGReg dest,
1464                              TCGReg c1, TCGArg c2, int const_c2,
1465                              TCGReg v1)
1466{
1467    tcg_out_cmp(s, c1, c2, const_c2, P_REXW);
1468    tcg_out_cmov(s, cond, P_REXW, dest, v1);
1469}
1470#endif
1471
1472static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1473                        TCGArg arg2, bool const_a2)
1474{
1475    if (have_bmi1) {
1476        tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
1477        if (const_a2) {
1478            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1479        } else {
1480            tcg_debug_assert(dest != arg2);
1481            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1482        }
1483    } else {
1484        tcg_debug_assert(dest != arg2);
1485        tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
1486        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1487    }
1488}
1489
1490static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1491                        TCGArg arg2, bool const_a2)
1492{
1493    if (have_lzcnt) {
1494        tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
1495        if (const_a2) {
1496            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1497        } else {
1498            tcg_debug_assert(dest != arg2);
1499            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1500        }
1501    } else {
1502        tcg_debug_assert(!const_a2);
1503        tcg_debug_assert(dest != arg1);
1504        tcg_debug_assert(dest != arg2);
1505
1506        /* Recall that the output of BSR is the index not the count.  */
1507        tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
1508        tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);
1509
1510        /* Since we have destroyed the flags from BSR, we have to re-test.  */
1511        tcg_out_cmp(s, arg1, 0, 1, rexw);
1512        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1513    }
1514}
1515
1516static void tcg_out_branch(TCGContext *s, int call, tcg_insn_unit *dest)
1517{
1518    intptr_t disp = tcg_pcrel_diff(s, dest) - 5;
1519
1520    if (disp == (int32_t)disp) {
1521        tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
1522        tcg_out32(s, disp);
1523    } else {
1524        /* rip-relative addressing into the constant pool.
 1525           This is 6 + 8 = 14 bytes, as compared to using
 1526           an immediate load of 10 + 6 = 16 bytes, plus we may
1527           be able to re-use the pool constant for more calls.  */
1528        tcg_out_opc(s, OPC_GRP5, 0, 0, 0);
1529        tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
1530        new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
1531        tcg_out32(s, 0);
1532    }
1533}
1534
1535static inline void tcg_out_call(TCGContext *s, tcg_insn_unit *dest)
1536{
1537    tcg_out_branch(s, 1, dest);
1538}
1539
1540static void tcg_out_jmp(TCGContext *s, tcg_insn_unit *dest)
1541{
1542    tcg_out_branch(s, 0, dest);
1543}
1544
1545static void tcg_out_nopn(TCGContext *s, int n)
1546{
1547    int i;
1548    /* Emit 1 or 2 operand size prefixes for the standard one byte nop,
1549     * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the
1550     * duplicate prefix, and all of the interesting recent cores can
1551     * decode and discard the duplicates in a single cycle.
1552     */
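    /* In total this emits an n-byte nop: n - 1 prefix bytes followed by
       the one-byte xchg.  */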
1553    tcg_debug_assert(n >= 1);
1554    for (i = 1; i < n; ++i) {
1555        tcg_out8(s, 0x66);
1556    }
1557    tcg_out8(s, 0x90);
1558}
1559
1560#if defined(CONFIG_SOFTMMU)
1561#include "tcg-ldst.inc.c"
1562
1563/* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
1564 *                                     int mmu_idx, uintptr_t ra)
1565 */
1566static void * const qemu_ld_helpers[16] = {
1567    [MO_UB]   = helper_ret_ldub_mmu,
1568    [MO_LEUW] = helper_le_lduw_mmu,
1569    [MO_LEUL] = helper_le_ldul_mmu,
1570    [MO_LEQ]  = helper_le_ldq_mmu,
1571    [MO_BEUW] = helper_be_lduw_mmu,
1572    [MO_BEUL] = helper_be_ldul_mmu,
1573    [MO_BEQ]  = helper_be_ldq_mmu,
1574};
1575
1576/* helper signature: helper_ret_st_mmu(CPUArchState *env, target_ulong addr,
1577 *                                     uintxx_t val, TCGMemOpIdx oi, uintptr_t ra)
1578 */
1579static void * const qemu_st_helpers[16] = {
1580    [MO_UB]   = helper_ret_stb_mmu,
1581    [MO_LEUW] = helper_le_stw_mmu,
1582    [MO_LEUL] = helper_le_stl_mmu,
1583    [MO_LEQ]  = helper_le_stq_mmu,
1584    [MO_BEUW] = helper_be_stw_mmu,
1585    [MO_BEUL] = helper_be_stl_mmu,
1586    [MO_BEQ]  = helper_be_stq_mmu,
1587};
1588
1589/* Perform the TLB load and compare.
1590
1591   Inputs:
1592   ADDRLO and ADDRHI contain the low and high part of the address.
1593
1594   MEM_INDEX and OPC are the memory context and memory operation of the access.
1595
1596   WHICH is the offset into the CPUTLBEntry structure of the slot to read.
1597   This should be offsetof addr_read or addr_write.
1598
1599   Outputs:
1600   LABEL_PTRS is filled with 1 (32-bit addresses) or 2 (64-bit addresses)
1601   positions of the displacements of forward jumps to the TLB miss case.
1602
1603   Second argument register is loaded with the low part of the address.
1604   In the TLB hit case, it has been adjusted as indicated by the TLB
1605   and so is a host address.  In the TLB miss case, it continues to
1606   hold a guest address.
1607
1608   First argument register is clobbered.  */
1609
1610static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
1611                                    int mem_index, TCGMemOp opc,
1612                                    tcg_insn_unit **label_ptr, int which)
1613{
1614    const TCGReg r0 = TCG_REG_L0;
1615    const TCGReg r1 = TCG_REG_L1;
1616    TCGType ttype = TCG_TYPE_I32;
1617    TCGType tlbtype = TCG_TYPE_I32;
1618    int trexw = 0, hrexw = 0, tlbrexw = 0;
1619    unsigned a_bits = get_alignment_bits(opc);
1620    unsigned s_bits = opc & MO_SIZE;
1621    unsigned a_mask = (1 << a_bits) - 1;
1622    unsigned s_mask = (1 << s_bits) - 1;
1623    target_ulong tlb_mask;
1624
1625    if (TCG_TARGET_REG_BITS == 64) {
1626        if (TARGET_LONG_BITS == 64) {
1627            ttype = TCG_TYPE_I64;
1628            trexw = P_REXW;
1629        }
1630        if (TCG_TYPE_PTR == TCG_TYPE_I64) {
1631            hrexw = P_REXW;
1632            if (TARGET_PAGE_BITS + CPU_TLB_BITS > 32) {
1633                tlbtype = TCG_TYPE_I64;
1634                tlbrexw = P_REXW;
1635            }
1636        }
1637    }
1638
1639    tcg_out_mov(s, tlbtype, r0, addrlo);
1640    /* If the required alignment is at least as large as the access, simply
1641       copy the address and mask.  For lesser alignments, check that we don't
1642       cross pages for the complete access.  */
1643    if (a_bits >= s_bits) {
1644        tcg_out_mov(s, ttype, r1, addrlo);
1645    } else {
1646        tcg_out_modrm_offset(s, OPC_LEA + trexw, r1, addrlo, s_mask - a_mask);
1647    }
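        /* For example, a 4-byte access with byte alignment adds
           s_mask - a_mask = 3, so the page check below also fails whenever
           the access would spill onto the following page.  */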
1648    tlb_mask = (target_ulong)TARGET_PAGE_MASK | a_mask;
1649
1650    tcg_out_shifti(s, SHIFT_SHR + tlbrexw, r0,
1651                   TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
1652
1653    tgen_arithi(s, ARITH_AND + trexw, r1, tlb_mask, 0);
1654    tgen_arithi(s, ARITH_AND + tlbrexw, r0,
1655                (CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS, 0);
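        /* r0 now holds the byte offset of the TLB entry for this page:
           the page index wrapped to CPU_TLB_SIZE, pre-scaled by the
           entry size.  */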
1656
1657    tcg_out_modrm_sib_offset(s, OPC_LEA + hrexw, r0, TCG_AREG0, r0, 0,
1658                             offsetof(CPUArchState, tlb_table[mem_index][0])
1659                             + which);
1660
1661    /* cmp 0(r0), r1 */
1662    tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw, r1, r0, 0);
1663
1664    /* Prepare for both the fast path add of the tlb addend, and the slow
1665       path function argument setup.  There are two cases worth noting:
1666       For 32-bit guest and x86_64 host, MOVL zero-extends the guest address
1667       before the fastpath ADDQ below.  For 64-bit guest and x32 host, MOVQ
1668       copies the entire guest address for the slow path, while truncation
1669       for the 32-bit host happens with the fastpath ADDL below.  */
1670    tcg_out_mov(s, ttype, r1, addrlo);
1671
1672    /* jne slow_path */
1673    tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1674    label_ptr[0] = s->code_ptr;
1675    s->code_ptr += 4;
1676
1677    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1678        /* cmp 4(r0), addrhi */
1679        tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, r0, 4);
1680
1681        /* jne slow_path */
1682        tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1683        label_ptr[1] = s->code_ptr;
1684        s->code_ptr += 4;
1685    }
1686
1687    /* TLB Hit.  */
1688
1689    /* add addend(r0), r1 */
1690    tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r1, r0,
1691                         offsetof(CPUTLBEntry, addend) - which);
1692}
1693
1694/*
1695 * Record the context of a call to the out-of-line helper code for the slow
1696 * path of a load or store, so that we can later generate the correct helper code.
1697 */
1698static void add_qemu_ldst_label(TCGContext *s, bool is_ld, TCGMemOpIdx oi,
1699                                TCGReg datalo, TCGReg datahi,
1700                                TCGReg addrlo, TCGReg addrhi,
1701                                tcg_insn_unit *raddr,
1702                                tcg_insn_unit **label_ptr)
1703{
1704    TCGLabelQemuLdst *label = new_ldst_label(s);
1705
1706    label->is_ld = is_ld;
1707    label->oi = oi;
1708    label->datalo_reg = datalo;
1709    label->datahi_reg = datahi;
1710    label->addrlo_reg = addrlo;
1711    label->addrhi_reg = addrhi;
1712    label->raddr = raddr;
1713    label->label_ptr[0] = label_ptr[0];
1714    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1715        label->label_ptr[1] = label_ptr[1];
1716    }
1717}
1718
1719/*
1720 * Generate code for the slow path for a load at the end of block
1721 */
1722static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1723{
1724    TCGMemOpIdx oi = l->oi;
1725    TCGMemOp opc = get_memop(oi);
1726    TCGReg data_reg;
1727    tcg_insn_unit **label_ptr = &l->label_ptr[0];
1728
1729    /* resolve label address */
1730    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1731    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1732        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1733    }
1734
1735    if (TCG_TARGET_REG_BITS == 32) {
1736        int ofs = 0;
1737
1738        tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1739        ofs += 4;
1740
1741        tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1742        ofs += 4;
1743
1744        if (TARGET_LONG_BITS == 64) {
1745            tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1746            ofs += 4;
1747        }
1748
1749        tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
1750        ofs += 4;
1751
1752        tcg_out_sti(s, TCG_TYPE_PTR, (uintptr_t)l->raddr, TCG_REG_ESP, ofs);
1753    } else {
1754        tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1755        /* The second argument is already loaded with addrlo.  */
1756        tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], oi);
1757        tcg_out_movi(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[3],
1758                     (uintptr_t)l->raddr);
1759    }
1760
1761    tcg_out_call(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);
1762
1763    data_reg = l->datalo_reg;
1764    switch (opc & MO_SSIZE) {
1765    case MO_SB:
1766        tcg_out_ext8s(s, data_reg, TCG_REG_EAX, P_REXW);
1767        break;
1768    case MO_SW:
1769        tcg_out_ext16s(s, data_reg, TCG_REG_EAX, P_REXW);
1770        break;
1771#if TCG_TARGET_REG_BITS == 64
1772    case MO_SL:
1773        tcg_out_ext32s(s, data_reg, TCG_REG_EAX);
1774        break;
1775#endif
1776    case MO_UB:
1777    case MO_UW:
1778        /* Note that the helpers have zero-extended to tcg_target_long.  */
1779    case MO_UL:
1780        tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
1781        break;
1782    case MO_Q:
1783        if (TCG_TARGET_REG_BITS == 64) {
1784            tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX);
1785        } else if (data_reg == TCG_REG_EDX) {
1786            /* xchg %edx, %eax */
1787            tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0);
1788            tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EAX);
1789        } else {
1790            tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
1791            tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EDX);
1792        }
1793        break;
1794    default:
1795        tcg_abort();
1796    }
1797
1798    /* Jump to the code corresponding to the next IR of qemu_ld.  */
1799    tcg_out_jmp(s, l->raddr);
1800}
1801
1802/*
1803 * Generate code for the slow path for a store at the end of block
1804 */
1805static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1806{
1807    TCGMemOpIdx oi = l->oi;
1808    TCGMemOp opc = get_memop(oi);
1809    TCGMemOp s_bits = opc & MO_SIZE;
1810    tcg_insn_unit **label_ptr = &l->label_ptr[0];
1811    TCGReg retaddr;
1812
1813    /* resolve label address */
1814    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1815    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1816        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1817    }
1818
1819    if (TCG_TARGET_REG_BITS == 32) {
1820        int ofs = 0;
1821
1822        tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1823        ofs += 4;
1824
1825        tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1826        ofs += 4;
1827
1828        if (TARGET_LONG_BITS == 64) {
1829            tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1830            ofs += 4;
1831        }
1832
1833        tcg_out_st(s, TCG_TYPE_I32, l->datalo_reg, TCG_REG_ESP, ofs);
1834        ofs += 4;
1835
1836        if (s_bits == MO_64) {
1837            tcg_out_st(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_ESP, ofs);
1838            ofs += 4;
1839        }
1840
1841        tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
1842        ofs += 4;
1843
1844        retaddr = TCG_REG_EAX;
1845        tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1846        tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP, ofs);
1847    } else {
1848        tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1849        /* The second argument is already loaded with addrlo.  */
1850        tcg_out_mov(s, (s_bits == MO_64 ? TCG_TYPE_I64 : TCG_TYPE_I32),
1851                    tcg_target_call_iarg_regs[2], l->datalo_reg);
1852        tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[3], oi);
1853
1854        if (ARRAY_SIZE(tcg_target_call_iarg_regs) > 4) {
1855            retaddr = tcg_target_call_iarg_regs[4];
1856            tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1857        } else {
1858            retaddr = TCG_REG_RAX;
1859            tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1860            tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP,
1861                       TCG_TARGET_CALL_STACK_OFFSET);
1862        }
1863    }
1864
1865    /* "Tail call" to the helper, with the return address back inline.  */
1866    tcg_out_push(s, retaddr);
1867    tcg_out_jmp(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
1868}
1869#elif defined(__x86_64__) && defined(__linux__)
1870# include <asm/prctl.h>
1871# include <sys/prctl.h>
1872
1873int arch_prctl(int code, unsigned long addr);
1874
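    /* With ARCH_SET_GS the kernel points the %gs base at guest_base, so
       user-only memory accesses can use a %gs segment override instead of
       adding guest_base explicitly.  */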
1875static int guest_base_flags;
1876static inline void setup_guest_base_seg(void)
1877{
1878    if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
1879        guest_base_flags = P_GS;
1880    }
1881}
1882#else
1883# define guest_base_flags 0
1884static inline void setup_guest_base_seg(void) { }
1885#endif /* SOFTMMU */
1886
1887static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
1888                                   TCGReg base, int index, intptr_t ofs,
1889                                   int seg, TCGMemOp memop)
1890{
1891    const TCGMemOp real_bswap = memop & MO_BSWAP;
1892    TCGMemOp bswap = real_bswap;
1893    int movop = OPC_MOVL_GvEv;
1894
1895    if (have_movbe && real_bswap) {
1896        bswap = 0;
1897        movop = OPC_MOVBE_GyMy;
1898    }
1899
1900    switch (memop & MO_SSIZE) {
1901    case MO_UB:
1902        tcg_out_modrm_sib_offset(s, OPC_MOVZBL + seg, datalo,
1903                                 base, index, 0, ofs);
1904        break;
1905    case MO_SB:
1906        tcg_out_modrm_sib_offset(s, OPC_MOVSBL + P_REXW + seg, datalo,
1907                                 base, index, 0, ofs);
1908        break;
1909    case MO_UW:
1910        tcg_out_modrm_sib_offset(s, OPC_MOVZWL + seg, datalo,
1911                                 base, index, 0, ofs);
1912        if (real_bswap) {
1913            tcg_out_rolw_8(s, datalo);
1914        }
1915        break;
1916    case MO_SW:
1917        if (real_bswap) {
1918            if (have_movbe) {
1919                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
1920                                         datalo, base, index, 0, ofs);
1921            } else {
1922                tcg_out_modrm_sib_offset(s, OPC_MOVZWL + seg, datalo,
1923                                         base, index, 0, ofs);
1924                tcg_out_rolw_8(s, datalo);
1925            }
1926            tcg_out_modrm(s, OPC_MOVSWL + P_REXW, datalo, datalo);
1927        } else {
1928            tcg_out_modrm_sib_offset(s, OPC_MOVSWL + P_REXW + seg,
1929                                     datalo, base, index, 0, ofs);
1930        }
1931        break;
1932    case MO_UL:
1933        tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
1934        if (bswap) {
1935            tcg_out_bswap32(s, datalo);
1936        }
1937        break;
1938#if TCG_TARGET_REG_BITS == 64
1939    case MO_SL:
1940        if (real_bswap) {
1941            tcg_out_modrm_sib_offset(s, movop + seg, datalo,
1942                                     base, index, 0, ofs);
1943            if (bswap) {
1944                tcg_out_bswap32(s, datalo);
1945            }
1946            tcg_out_ext32s(s, datalo, datalo);
1947        } else {
1948            tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + seg, datalo,
1949                                     base, index, 0, ofs);
1950        }
1951        break;
1952#endif
1953    case MO_Q:
1954        if (TCG_TARGET_REG_BITS == 64) {
1955            tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
1956                                     base, index, 0, ofs);
1957            if (bswap) {
1958                tcg_out_bswap64(s, datalo);
1959            }
1960        } else {
1961            if (real_bswap) {
1962                int t = datalo;
1963                datalo = datahi;
1964                datahi = t;
1965            }
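                /* Load the half that does not overlap the base register
                   first, so the address remains valid for the second load.  */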
1966            if (base != datalo) {
1967                tcg_out_modrm_sib_offset(s, movop + seg, datalo,
1968                                         base, index, 0, ofs);
1969                tcg_out_modrm_sib_offset(s, movop + seg, datahi,
1970                                         base, index, 0, ofs + 4);
1971            } else {
1972                tcg_out_modrm_sib_offset(s, movop + seg, datahi,
1973                                         base, index, 0, ofs + 4);
1974                tcg_out_modrm_sib_offset(s, movop + seg, datalo,
1975                                         base, index, 0, ofs);
1976            }
1977            if (bswap) {
1978                tcg_out_bswap32(s, datalo);
1979                tcg_out_bswap32(s, datahi);
1980            }
1981        }
1982        break;
1983    default:
1984        tcg_abort();
1985    }
1986}
1987
1988/* XXX: qemu_ld and qemu_st could be modified to clobber only EDX and
1989   EAX.  That will become useful once fixed-register globals are less
1990   common. */
1991static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
1992{
1993    TCGReg datalo, datahi, addrlo;
1994    TCGReg addrhi __attribute__((unused));
1995    TCGMemOpIdx oi;
1996    TCGMemOp opc;
1997#if defined(CONFIG_SOFTMMU)
1998    int mem_index;
1999    tcg_insn_unit *label_ptr[2];
2000#endif
2001
2002    datalo = *args++;
2003    datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
2004    addrlo = *args++;
2005    addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
2006    oi = *args++;
2007    opc = get_memop(oi);
2008
2009#if defined(CONFIG_SOFTMMU)
2010    mem_index = get_mmuidx(oi);
2011
2012    tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
2013                     label_ptr, offsetof(CPUTLBEntry, addr_read));
2014
2015    /* TLB Hit.  */
2016    tcg_out_qemu_ld_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, opc);
2017
2018    /* Record the current context of a load into ldst label */
2019    add_qemu_ldst_label(s, true, oi, datalo, datahi, addrlo, addrhi,
2020                        s->code_ptr, label_ptr);
2021#else
2022    {
2023        int32_t offset = guest_base;
2024        TCGReg base = addrlo;
2025        int index = -1;
2026        int seg = 0;
2027
2028        /* For a 32-bit guest, the high 32 bits may contain garbage.
2029           We can ignore them via the ADDR32 prefix if we're not using
2030           a guest base, or when using segmentation.  Otherwise we
2031           need to zero-extend manually.  */
2032        if (guest_base == 0 || guest_base_flags) {
2033            seg = guest_base_flags;
2034            offset = 0;
2035            if (TCG_TARGET_REG_BITS > TARGET_LONG_BITS) {
2036                seg |= P_ADDR32;
2037            }
2038        } else if (TCG_TARGET_REG_BITS == 64) {
2039            if (TARGET_LONG_BITS == 32) {
2040                tcg_out_ext32u(s, TCG_REG_L0, base);
2041                base = TCG_REG_L0;
2042            }
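                /* offset is guest_base truncated to 32 bits; if that changed
                   the value, guest_base does not fit in a displacement and
                   must be carried in an index register instead.  */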
2043            if (offset != guest_base) {
2044                tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_L1, guest_base);
2045                index = TCG_REG_L1;
2046                offset = 0;
2047            }
2048        }
2049
2050        tcg_out_qemu_ld_direct(s, datalo, datahi,
2051                               base, index, offset, seg, opc);
2052    }
2053#endif
2054}
2055
2056static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2057                                   TCGReg base, intptr_t ofs, int seg,
2058                                   TCGMemOp memop)
2059{
2060    /* ??? Ideally we wouldn't need a scratch register.  For user-only,
2061       we could perform the bswap twice to restore the original value
2062       instead of moving to the scratch.  But as it is, the L constraint
2063       means that TCG_REG_L0 is definitely free here.  */
2064    const TCGReg scratch = TCG_REG_L0;
2065    const TCGMemOp real_bswap = memop & MO_BSWAP;
2066    TCGMemOp bswap = real_bswap;
2067    int movop = OPC_MOVL_EvGv;
2068
2069    if (have_movbe && real_bswap) {
2070        bswap = 0;
2071        movop = OPC_MOVBE_MyGy;
2072    }
2073
2074    switch (memop & MO_SIZE) {
2075    case MO_8:
2076        /* In 32-bit mode, 8-bit stores can only happen from [abcd]x.
2077           Use the scratch register if necessary.  */
2078        if (TCG_TARGET_REG_BITS == 32 && datalo >= 4) {
2079            tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
2080            datalo = scratch;
2081        }
2082        tcg_out_modrm_offset(s, OPC_MOVB_EvGv + P_REXB_R + seg,
2083                             datalo, base, ofs);
2084        break;
2085    case MO_16:
2086        if (bswap) {
2087            tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
2088            tcg_out_rolw_8(s, scratch);
2089            datalo = scratch;
2090        }
2091        tcg_out_modrm_offset(s, movop + P_DATA16 + seg, datalo, base, ofs);
2092        break;
2093    case MO_32:
2094        if (bswap) {
2095            tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
2096            tcg_out_bswap32(s, scratch);
2097            datalo = scratch;
2098        }
2099        tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
2100        break;
2101    case MO_64:
2102        if (TCG_TARGET_REG_BITS == 64) {
2103            if (bswap) {
2104                tcg_out_mov(s, TCG_TYPE_I64, scratch, datalo);
2105                tcg_out_bswap64(s, scratch);
2106                datalo = scratch;
2107            }
2108            tcg_out_modrm_offset(s, movop + P_REXW + seg, datalo, base, ofs);
2109        } else if (bswap) {
2110            tcg_out_mov(s, TCG_TYPE_I32, scratch, datahi);
2111            tcg_out_bswap32(s, scratch);
2112            tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, scratch, base, ofs);
2113            tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
2114            tcg_out_bswap32(s, scratch);
2115            tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, scratch, base, ofs+4);
2116        } else {
2117            if (real_bswap) {
2118                int t = datalo;
2119                datalo = datahi;
2120                datahi = t;
2121            }
2122            tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
2123            tcg_out_modrm_offset(s, movop + seg, datahi, base, ofs+4);
2124        }
2125        break;
2126    default:
2127        tcg_abort();
2128    }
2129}
2130
2131static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
2132{
2133    TCGReg datalo, datahi, addrlo;
2134    TCGReg addrhi __attribute__((unused));
2135    TCGMemOpIdx oi;
2136    TCGMemOp opc;
2137#if defined(CONFIG_SOFTMMU)
2138    int mem_index;
2139    tcg_insn_unit *label_ptr[2];
2140#endif
2141
2142    datalo = *args++;
2143    datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
2144    addrlo = *args++;
2145    addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
2146    oi = *args++;
2147    opc = get_memop(oi);
2148
2149#if defined(CONFIG_SOFTMMU)
2150    mem_index = get_mmuidx(oi);
2151
2152    tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
2153                     label_ptr, offsetof(CPUTLBEntry, addr_write));
2154
2155    /* TLB Hit.  */
2156    tcg_out_qemu_st_direct(s, datalo, datahi, TCG_REG_L1, 0, 0, opc);
2157
2158    /* Record the current context of a store into ldst label */
2159    add_qemu_ldst_label(s, false, oi, datalo, datahi, addrlo, addrhi,
2160                        s->code_ptr, label_ptr);
2161#else
2162    {
2163        int32_t offset = guest_base;
2164        TCGReg base = addrlo;
2165        int seg = 0;
2166
2167        /* See comment in tcg_out_qemu_ld re zero-extension of addrlo.  */
2168        if (guest_base == 0 || guest_base_flags) {
2169            seg = guest_base_flags;
2170            offset = 0;
2171            if (TCG_TARGET_REG_BITS > TARGET_LONG_BITS) {
2172                seg |= P_ADDR32;
2173            }
2174        } else if (TCG_TARGET_REG_BITS == 64) {
2175            /* ??? Note that we can't use the same SIB addressing scheme
2176               as for loads, since we require L0 free for bswap.  */
2177            if (offset != guest_base) {
2178                if (TARGET_LONG_BITS == 32) {
2179                    tcg_out_ext32u(s, TCG_REG_L0, base);
2180                    base = TCG_REG_L0;
2181                }
2182                tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_L1, guest_base);
2183                tgen_arithr(s, ARITH_ADD + P_REXW, TCG_REG_L1, base);
2184                base = TCG_REG_L1;
2185                offset = 0;
2186            } else if (TARGET_LONG_BITS == 32) {
2187                tcg_out_ext32u(s, TCG_REG_L1, base);
2188                base = TCG_REG_L1;
2189            }
2190        }
2191
2192        tcg_out_qemu_st_direct(s, datalo, datahi, base, offset, seg, opc);
2193    }
2194#endif
2195}
2196
2197static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
2198                              const TCGArg *args, const int *const_args)
2199{
2200    TCGArg a0, a1, a2;
2201    int c, const_a2, vexop, rexw = 0;
2202
2203#if TCG_TARGET_REG_BITS == 64
2204# define OP_32_64(x) \
2205        case glue(glue(INDEX_op_, x), _i64): \
2206            rexw = P_REXW; /* FALLTHRU */    \
2207        case glue(glue(INDEX_op_, x), _i32)
2208#else
2209# define OP_32_64(x) \
2210        case glue(glue(INDEX_op_, x), _i32)
2211#endif
2212
2213    /* Hoist the loads of the most common arguments.  */
2214    a0 = args[0];
2215    a1 = args[1];
2216    a2 = args[2];
2217    const_a2 = const_args[2];
2218
2219    switch (opc) {
2220    case INDEX_op_exit_tb:
2221        /* Reuse the zeroing that exists for goto_ptr.  */
2222        if (a0 == 0) {
2223            tcg_out_jmp(s, s->code_gen_epilogue);
2224        } else {
2225            tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
2226            tcg_out_jmp(s, tb_ret_addr);
2227        }
2228        break;
2229    case INDEX_op_goto_tb:
2230        if (s->tb_jmp_insn_offset) {
2231            /* direct jump method */
2232            int gap;
2233            /* jump displacement must be aligned for atomic patching;
2234             * see if we need to add extra nops before the jump
2235             */
2236            gap = tcg_pcrel_diff(s, QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4));
2237            if (gap != 1) {
2238                tcg_out_nopn(s, gap - 1);
2239            }
2240            tcg_out8(s, OPC_JMP_long); /* jmp im */
2241            s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
2242            tcg_out32(s, 0);
2243        } else {
2244            /* indirect jump method */
2245            tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, -1,
2246                                 (intptr_t)(s->tb_jmp_target_addr + a0));
2247        }
2248        set_jmp_reset_offset(s, a0);
2249        break;
2250    case INDEX_op_goto_ptr:
2251        /* jmp to the given host address (could be epilogue) */
2252        tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0);
2253        break;
2254    case INDEX_op_br:
2255        tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
2256        break;
2257    OP_32_64(ld8u):
2258        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2259        tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2);
2260        break;
2261    OP_32_64(ld8s):
2262        tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2);
2263        break;
2264    OP_32_64(ld16u):
2265        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2266        tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2);
2267        break;
2268    OP_32_64(ld16s):
2269        tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2);
2270        break;
2271#if TCG_TARGET_REG_BITS == 64
2272    case INDEX_op_ld32u_i64:
2273#endif
2274    case INDEX_op_ld_i32:
2275        tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2);
2276        break;
2277
2278    OP_32_64(st8):
2279        if (const_args[0]) {
2280            tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2);
2281            tcg_out8(s, a0);
2282        } else {
2283            tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2);
2284        }
2285        break;
2286    OP_32_64(st16):
2287        if (const_args[0]) {
2288            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2);
2289            tcg_out16(s, a0);
2290        } else {
2291            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2);
2292        }
2293        break;
2294#if TCG_TARGET_REG_BITS == 64
2295    case INDEX_op_st32_i64:
2296#endif
2297    case INDEX_op_st_i32:
2298        if (const_args[0]) {
2299            tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2);
2300            tcg_out32(s, a0);
2301        } else {
2302            tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2);
2303        }
2304        break;
2305
2306    OP_32_64(add):
2307        /* For 3-operand addition, use LEA.  */
2308        if (a0 != a1) {
2309            TCGArg c3 = 0;
2310            if (const_a2) {
2311                c3 = a2, a2 = -1;
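                    /* A constant addend is folded into the LEA displacement;
                       index -1 means no index register.  */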
2312            } else if (a0 == a2) {
2313                /* Watch out for dest = src + dest, since we've removed
2314                   the matching constraint on the add.  */
2315                tgen_arithr(s, ARITH_ADD + rexw, a0, a1);
2316                break;
2317            }
2318
2319            tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3);
2320            break;
2321        }
2322        c = ARITH_ADD;
2323        goto gen_arith;
2324    OP_32_64(sub):
2325        c = ARITH_SUB;
2326        goto gen_arith;
2327    OP_32_64(and):
2328        c = ARITH_AND;
2329        goto gen_arith;
2330    OP_32_64(or):
2331        c = ARITH_OR;
2332        goto gen_arith;
2333    OP_32_64(xor):
2334        c = ARITH_XOR;
2335        goto gen_arith;
2336    gen_arith:
2337        if (const_a2) {
2338            tgen_arithi(s, c + rexw, a0, a2, 0);
2339        } else {
2340            tgen_arithr(s, c + rexw, a0, a2);
2341        }
2342        break;
2343
2344    OP_32_64(andc):
2345        if (const_a2) {
2346            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2347            tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0);
2348        } else {
2349            tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1);
2350        }
2351        break;
2352
2353    OP_32_64(mul):
2354        if (const_a2) {
2355            int32_t val;
2356            val = a2;
2357            if (val == (int8_t)val) {
2358                tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0);
2359                tcg_out8(s, val);
2360            } else {
2361                tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0);
2362                tcg_out32(s, val);
2363            }
2364        } else {
2365            tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2);
2366        }
2367        break;
2368
2369    OP_32_64(div2):
2370        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]);
2371        break;
2372    OP_32_64(divu2):
2373        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]);
2374        break;
2375
2376    OP_32_64(shl):
2377        /* For small constant 3-operand shift, use LEA.  */
2378        if (const_a2 && a0 != a1 && (a2 - 1) < 3) {
2379            if (a2 - 1 == 0) {
2380                /* shl $1,a1,a0 -> lea (a1,a1),a0 */
2381                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0);
2382            } else {
2383                /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */
2384                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0);
2385            }
2386            break;
2387        }
2388        c = SHIFT_SHL;
2389        vexop = OPC_SHLX;
2390        goto gen_shift_maybe_vex;
2391    OP_32_64(shr):
2392        c = SHIFT_SHR;
2393        vexop = OPC_SHRX;
2394        goto gen_shift_maybe_vex;
2395    OP_32_64(sar):
2396        c = SHIFT_SAR;
2397        vexop = OPC_SARX;
2398        goto gen_shift_maybe_vex;
2399    OP_32_64(rotl):
2400        c = SHIFT_ROL;
2401        goto gen_shift;
2402    OP_32_64(rotr):
2403        c = SHIFT_ROR;
2404        goto gen_shift;
2405    gen_shift_maybe_vex:
2406        if (have_bmi2) {
2407            if (!const_a2) {
2408                tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1);
2409                break;
2410            }
2411            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2412        }
2413        /* FALLTHRU */
2414    gen_shift:
2415        if (const_a2) {
2416            tcg_out_shifti(s, c + rexw, a0, a2);
2417        } else {
2418            tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0);
2419        }
2420        break;
2421
2422    OP_32_64(ctz):
2423        tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
2424        break;
2425    OP_32_64(clz):
2426        tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
2427        break;
2428    OP_32_64(ctpop):
2429        tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1);
2430        break;
2431
2432    case INDEX_op_brcond_i32:
2433        tcg_out_brcond32(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2434        break;
2435    case INDEX_op_setcond_i32:
2436        tcg_out_setcond32(s, args[3], a0, a1, a2, const_a2);
2437        break;
2438    case INDEX_op_movcond_i32:
2439        tcg_out_movcond32(s, args[5], a0, a1, a2, const_a2, args[3]);
2440        break;
2441
2442    OP_32_64(bswap16):
2443        tcg_out_rolw_8(s, a0);
2444        break;
2445    OP_32_64(bswap32):
2446        tcg_out_bswap32(s, a0);
2447        break;
2448
2449    OP_32_64(neg):
2450        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0);
2451        break;
2452    OP_32_64(not):
2453        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0);
2454        break;
2455
2456    OP_32_64(ext8s):
2457        tcg_out_ext8s(s, a0, a1, rexw);
2458        break;
2459    OP_32_64(ext16s):
2460        tcg_out_ext16s(s, a0, a1, rexw);
2461        break;
2462    OP_32_64(ext8u):
2463        tcg_out_ext8u(s, a0, a1);
2464        break;
2465    OP_32_64(ext16u):
2466        tcg_out_ext16u(s, a0, a1);
2467        break;
2468
2469    case INDEX_op_qemu_ld_i32:
2470        tcg_out_qemu_ld(s, args, 0);
2471        break;
2472    case INDEX_op_qemu_ld_i64:
2473        tcg_out_qemu_ld(s, args, 1);
2474        break;
2475    case INDEX_op_qemu_st_i32:
2476        tcg_out_qemu_st(s, args, 0);
2477        break;
2478    case INDEX_op_qemu_st_i64:
2479        tcg_out_qemu_st(s, args, 1);
2480        break;
2481
2482    OP_32_64(mulu2):
2483        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
2484        break;
2485    OP_32_64(muls2):
2486        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]);
2487        break;
2488    OP_32_64(add2):
2489        if (const_args[4]) {
2490            tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1);
2491        } else {
2492            tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]);
2493        }
2494        if (const_args[5]) {
2495            tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1);
2496        } else {
2497            tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]);
2498        }
2499        break;
2500    OP_32_64(sub2):
2501        if (const_args[4]) {
2502            tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1);
2503        } else {
2504            tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]);
2505        }
2506        if (const_args[5]) {
2507            tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1);
2508        } else {
2509            tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]);
2510        }
2511        break;
2512
2513#if TCG_TARGET_REG_BITS == 32
2514    case INDEX_op_brcond2_i32:
2515        tcg_out_brcond2(s, args, const_args, 0);
2516        break;
2517    case INDEX_op_setcond2_i32:
2518        tcg_out_setcond2(s, args, const_args);
2519        break;
2520#else /* TCG_TARGET_REG_BITS == 64 */
2521    case INDEX_op_ld32s_i64:
2522        tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2);
2523        break;
2524    case INDEX_op_ld_i64:
2525        tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2);
2526        break;
2527    case INDEX_op_st_i64:
2528        if (const_args[0]) {
2529            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2);
2530            tcg_out32(s, a0);
2531        } else {
2532            tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2);
2533        }
2534        break;
2535
2536    case INDEX_op_brcond_i64:
2537        tcg_out_brcond64(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2538        break;
2539    case INDEX_op_setcond_i64:
2540        tcg_out_setcond64(s, args[3], a0, a1, a2, const_a2);
2541        break;
2542    case INDEX_op_movcond_i64:
2543        tcg_out_movcond64(s, args[5], a0, a1, a2, const_a2, args[3]);
2544        break;
2545
2546    case INDEX_op_bswap64_i64:
2547        tcg_out_bswap64(s, a0);
2548        break;
2549    case INDEX_op_extu_i32_i64:
2550    case INDEX_op_ext32u_i64:
2551        tcg_out_ext32u(s, a0, a1);
2552        break;
2553    case INDEX_op_ext_i32_i64:
2554    case INDEX_op_ext32s_i64:
2555        tcg_out_ext32s(s, a0, a1);
2556        break;
2557#endif
2558
2559    OP_32_64(deposit):
2560        if (args[3] == 0 && args[4] == 8) {
2561            /* load bits 0..7 */
2562            tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
2563        } else if (args[3] == 8 && args[4] == 8) {
2564            /* load bits 8..15 */
2565            tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
2566        } else if (args[3] == 0 && args[4] == 16) {
2567            /* load bits 0..15 */
2568            tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
2569        } else {
2570            tcg_abort();
2571        }
2572        break;
2573
2574    case INDEX_op_extract_i64:
2575        if (a2 + args[3] == 32) {
2576            /* This is a 32-bit zero-extending right shift.  */
2577            tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
2578            tcg_out_shifti(s, SHIFT_SHR, a0, a2);
2579            break;
2580        }
2581        /* FALLTHRU */
2582    case INDEX_op_extract_i32:
2583        /* On the off-chance that we can use the high-byte registers.
2584           Otherwise we emit the same ext16 + shift pattern that we
2585           would have gotten from the normal tcg-op.c expansion.  */
2586        tcg_debug_assert(a2 == 8 && args[3] == 8);
2587        if (a1 < 4 && a0 < 8) {
2588            tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
2589        } else {
2590            tcg_out_ext16u(s, a0, a1);
2591            tcg_out_shifti(s, SHIFT_SHR, a0, 8);
2592        }
2593        break;
2594
2595    case INDEX_op_sextract_i32:
2596        /* We don't implement sextract_i64, as we cannot sign-extend to
2597           64-bits without using the REX prefix that explicitly excludes
2598           access to the high-byte registers.  */
2599        tcg_debug_assert(a2 == 8 && args[3] == 8);
2600        if (a1 < 4 && a0 < 8) {
2601            tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
2602        } else {
2603            tcg_out_ext16s(s, a0, a1, 0);
2604            tcg_out_shifti(s, SHIFT_SAR, a0, 8);
2605        }
2606        break;
2607
2608    case INDEX_op_mb:
2609        tcg_out_mb(s, a0);
2610        break;
2611    case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
2612    case INDEX_op_mov_i64:
2613    case INDEX_op_mov_vec:
2614    case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi.  */
2615    case INDEX_op_movi_i64:
2616    case INDEX_op_dupi_vec:
2617    case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
2618    default:
2619        tcg_abort();
2620    }
2621
2622#undef OP_32_64
2623}
2624
2625static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
2626                           unsigned vecl, unsigned vece,
2627                           const TCGArg *args, const int *const_args)
2628{
2629    static int const add_insn[4] = {
2630        OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ
2631    };
2632    static int const sub_insn[4] = {
2633        OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ
2634    };
2635    static int const mul_insn[4] = {
2636        OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_UD2
2637    };
2638    static int const shift_imm_insn[4] = {
2639        OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib
2640    };
2641    static int const cmpeq_insn[4] = {
2642        OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ
2643    };
2644    static int const cmpgt_insn[4] = {
2645        OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ
2646    };
2647    static int const punpckl_insn[4] = {
2648        OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ
2649    };
2650    static int const punpckh_insn[4] = {
2651        OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ
2652    };
2653    static int const packss_insn[4] = {
2654        OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2
2655    };
2656    static int const packus_insn[4] = {
2657        OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2
2658    };
2659
2660    TCGType type = vecl + TCG_TYPE_V64;
2661    int insn, sub;
2662    TCGArg a0, a1, a2;
2663
2664    a0 = args[0];
2665    a1 = args[1];
2666    a2 = args[2];
2667
2668    switch (opc) {
2669    case INDEX_op_add_vec:
2670        insn = add_insn[vece];
2671        goto gen_simd;
2672    case INDEX_op_sub_vec:
2673        insn = sub_insn[vece];
2674        goto gen_simd;
2675    case INDEX_op_mul_vec:
2676        insn = mul_insn[vece];
2677        goto gen_simd;
2678    case INDEX_op_and_vec:
2679        insn = OPC_PAND;
2680        goto gen_simd;
2681    case INDEX_op_or_vec:
2682        insn = OPC_POR;
2683        goto gen_simd;
2684    case INDEX_op_xor_vec:
2685        insn = OPC_PXOR;
2686        goto gen_simd;
2687    case INDEX_op_x86_punpckl_vec:
2688        insn = punpckl_insn[vece];
2689        goto gen_simd;
2690    case INDEX_op_x86_punpckh_vec:
2691        insn = punpckh_insn[vece];
2692        goto gen_simd;
2693    case INDEX_op_x86_packss_vec:
2694        insn = packss_insn[vece];
2695        goto gen_simd;
2696    case INDEX_op_x86_packus_vec:
2697        insn = packus_insn[vece];
2698        goto gen_simd;
2699#if TCG_TARGET_REG_BITS == 32
2700    case INDEX_op_dup2_vec:
2701        /* Constraints have already placed both 32-bit inputs in xmm regs.  */
2702        insn = OPC_PUNPCKLDQ;
2703        goto gen_simd;
2704#endif
2705    gen_simd:
2706        tcg_debug_assert(insn != OPC_UD2);
2707        if (type == TCG_TYPE_V256) {
2708            insn |= P_VEXL;
2709        }
2710        tcg_out_vex_modrm(s, insn, a0, a1, a2);
2711        break;
2712
2713    case INDEX_op_cmp_vec:
2714        sub = args[3];
2715        if (sub == TCG_COND_EQ) {
2716            insn = cmpeq_insn[vece];
2717        } else if (sub == TCG_COND_GT) {
2718            insn = cmpgt_insn[vece];
2719        } else {
2720            g_assert_not_reached();
2721        }
2722        goto gen_simd;
2723
2724    case INDEX_op_andc_vec:
2725        insn = OPC_PANDN;
2726        if (type == TCG_TYPE_V256) {
2727            insn |= P_VEXL;
2728        }
2729        tcg_out_vex_modrm(s, insn, a0, a2, a1);
2730        break;
2731
2732    case INDEX_op_shli_vec:
2733        sub = 6;
2734        goto gen_shift;
2735    case INDEX_op_shri_vec:
2736        sub = 2;
2737        goto gen_shift;
2738    case INDEX_op_sari_vec:
2739        tcg_debug_assert(vece != MO_64);
2740        sub = 4;
2741    gen_shift:
2742        tcg_debug_assert(vece != MO_8);
2743        insn = shift_imm_insn[vece];
2744        if (type == TCG_TYPE_V256) {
2745            insn |= P_VEXL;
2746        }
2747        tcg_out_vex_modrm(s, insn, sub, a0, a1);
2748        tcg_out8(s, a2);
2749        break;
2750
2751    case INDEX_op_ld_vec:
2752        tcg_out_ld(s, type, a0, a1, a2);
2753        break;
2754    case INDEX_op_st_vec:
2755        tcg_out_st(s, type, a0, a1, a2);
2756        break;
2757    case INDEX_op_dup_vec:
2758        tcg_out_dup_vec(s, type, vece, a0, a1);
2759        break;
2760
2761    case INDEX_op_x86_shufps_vec:
2762        insn = OPC_SHUFPS;
2763        sub = args[3];
2764        goto gen_simd_imm8;
2765    case INDEX_op_x86_blend_vec:
2766        if (vece == MO_16) {
2767            insn = OPC_PBLENDW;
2768        } else if (vece == MO_32) {
2769            insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS);
2770        } else {
2771            g_assert_not_reached();
2772        }
2773        sub = args[3];
2774        goto gen_simd_imm8;
2775    case INDEX_op_x86_vperm2i128_vec:
2776        insn = OPC_VPERM2I128;
2777        sub = args[3];
2778        goto gen_simd_imm8;
2779    gen_simd_imm8:
2780        if (type == TCG_TYPE_V256) {
2781            insn |= P_VEXL;
2782        }
2783        tcg_out_vex_modrm(s, insn, a0, a1, a2);
2784        tcg_out8(s, sub);
2785        break;
2786
2787    case INDEX_op_x86_vpblendvb_vec:
2788        insn = OPC_VPBLENDVB;
2789        if (type == TCG_TYPE_V256) {
2790            insn |= P_VEXL;
2791        }
2792        tcg_out_vex_modrm(s, insn, a0, a1, a2);
2793        tcg_out8(s, args[3] << 4);
2794        break;
2795
2796    case INDEX_op_x86_psrldq_vec:
2797        tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1);
2798        tcg_out8(s, a2);
2799        break;
2800
2801    default:
2802        g_assert_not_reached();
2803    }
2804}
2805
2806static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
2807{
2808    static const TCGTargetOpDef r = { .args_ct_str = { "r" } };
2809    static const TCGTargetOpDef ri_r = { .args_ct_str = { "ri", "r" } };
2810    static const TCGTargetOpDef re_r = { .args_ct_str = { "re", "r" } };
2811    static const TCGTargetOpDef qi_r = { .args_ct_str = { "qi", "r" } };
2812    static const TCGTargetOpDef r_r = { .args_ct_str = { "r", "r" } };
2813    static const TCGTargetOpDef r_q = { .args_ct_str = { "r", "q" } };
2814    static const TCGTargetOpDef r_re = { .args_ct_str = { "r", "re" } };
2815    static const TCGTargetOpDef r_0 = { .args_ct_str = { "r", "0" } };
2816    static const TCGTargetOpDef r_r_ri = { .args_ct_str = { "r", "r", "ri" } };
2817    static const TCGTargetOpDef r_r_re = { .args_ct_str = { "r", "r", "re" } };
2818    static const TCGTargetOpDef r_0_re = { .args_ct_str = { "r", "0", "re" } };
2819    static const TCGTargetOpDef r_0_ci = { .args_ct_str = { "r", "0", "ci" } };
2820    static const TCGTargetOpDef r_L = { .args_ct_str = { "r", "L" } };
2821    static const TCGTargetOpDef L_L = { .args_ct_str = { "L", "L" } };
2822    static const TCGTargetOpDef r_L_L = { .args_ct_str = { "r", "L", "L" } };
2823    static const TCGTargetOpDef r_r_L = { .args_ct_str = { "r", "r", "L" } };
2824    static const TCGTargetOpDef L_L_L = { .args_ct_str = { "L", "L", "L" } };
2825    static const TCGTargetOpDef r_r_L_L
2826        = { .args_ct_str = { "r", "r", "L", "L" } };
2827    static const TCGTargetOpDef L_L_L_L
2828        = { .args_ct_str = { "L", "L", "L", "L" } };
2829    static const TCGTargetOpDef x_x = { .args_ct_str = { "x", "x" } };
2830    static const TCGTargetOpDef x_x_x = { .args_ct_str = { "x", "x", "x" } };
2831    static const TCGTargetOpDef x_x_x_x
2832        = { .args_ct_str = { "x", "x", "x", "x" } };
2833    static const TCGTargetOpDef x_r = { .args_ct_str = { "x", "r" } };
2834
2835    switch (op) {
2836    case INDEX_op_goto_ptr:
2837        return &r;
2838
2839    case INDEX_op_ld8u_i32:
2840    case INDEX_op_ld8u_i64:
2841    case INDEX_op_ld8s_i32:
2842    case INDEX_op_ld8s_i64:
2843    case INDEX_op_ld16u_i32:
2844    case INDEX_op_ld16u_i64:
2845    case INDEX_op_ld16s_i32:
2846    case INDEX_op_ld16s_i64:
2847    case INDEX_op_ld_i32:
2848    case INDEX_op_ld32u_i64:
2849    case INDEX_op_ld32s_i64:
2850    case INDEX_op_ld_i64:
2851        return &r_r;
2852
2853    case INDEX_op_st8_i32:
2854    case INDEX_op_st8_i64:
2855        return &qi_r;
2856    case INDEX_op_st16_i32:
2857    case INDEX_op_st16_i64:
2858    case INDEX_op_st_i32:
2859    case INDEX_op_st32_i64:
2860        return &ri_r;
2861    case INDEX_op_st_i64:
2862        return &re_r;
2863
2864    case INDEX_op_add_i32:
2865    case INDEX_op_add_i64:
2866        return &r_r_re;
2867    case INDEX_op_sub_i32:
2868    case INDEX_op_sub_i64:
2869    case INDEX_op_mul_i32:
2870    case INDEX_op_mul_i64:
2871    case INDEX_op_or_i32:
2872    case INDEX_op_or_i64:
2873    case INDEX_op_xor_i32:
2874    case INDEX_op_xor_i64:
2875        return &r_0_re;
2876
2877    case INDEX_op_and_i32:
2878    case INDEX_op_and_i64:
2879        {
2880            static const TCGTargetOpDef and
2881                = { .args_ct_str = { "r", "0", "reZ" } };
2882            return &and;
2883        }
2884        break;
2885    case INDEX_op_andc_i32:
2886    case INDEX_op_andc_i64:
2887        {
2888            static const TCGTargetOpDef andc
2889                = { .args_ct_str = { "r", "r", "rI" } };
2890            return &andc;
2891        }
2892        break;
2893
2894    case INDEX_op_shl_i32:
2895    case INDEX_op_shl_i64:
2896    case INDEX_op_shr_i32:
2897    case INDEX_op_shr_i64:
2898    case INDEX_op_sar_i32:
2899    case INDEX_op_sar_i64:
2900        return have_bmi2 ? &r_r_ri : &r_0_ci;
2901    case INDEX_op_rotl_i32:
2902    case INDEX_op_rotl_i64:
2903    case INDEX_op_rotr_i32:
2904    case INDEX_op_rotr_i64:
2905        return &r_0_ci;
2906
2907    case INDEX_op_brcond_i32:
2908    case INDEX_op_brcond_i64:
2909        return &r_re;
2910
2911    case INDEX_op_bswap16_i32:
2912    case INDEX_op_bswap16_i64:
2913    case INDEX_op_bswap32_i32:
2914    case INDEX_op_bswap32_i64:
2915    case INDEX_op_bswap64_i64:
2916    case INDEX_op_neg_i32:
2917    case INDEX_op_neg_i64:
2918    case INDEX_op_not_i32:
2919    case INDEX_op_not_i64:
2920        return &r_0;
2921
2922    case INDEX_op_ext8s_i32:
2923    case INDEX_op_ext8s_i64:
2924    case INDEX_op_ext8u_i32:
2925    case INDEX_op_ext8u_i64:
2926        return &r_q;
2927    case INDEX_op_ext16s_i32:
2928    case INDEX_op_ext16s_i64:
2929    case INDEX_op_ext16u_i32:
2930    case INDEX_op_ext16u_i64:
2931    case INDEX_op_ext32s_i64:
2932    case INDEX_op_ext32u_i64:
2933    case INDEX_op_ext_i32_i64:
2934    case INDEX_op_extu_i32_i64:
2935    case INDEX_op_extract_i32:
2936    case INDEX_op_extract_i64:
2937    case INDEX_op_sextract_i32:
2938    case INDEX_op_ctpop_i32:
2939    case INDEX_op_ctpop_i64:
2940        return &r_r;
2941
2942    case INDEX_op_deposit_i32:
2943    case INDEX_op_deposit_i64:
2944        {
2945            static const TCGTargetOpDef dep
2946                = { .args_ct_str = { "Q", "0", "Q" } };
2947            return &dep;
2948        }
2949    case INDEX_op_setcond_i32:
2950    case INDEX_op_setcond_i64:
2951        {
2952            static const TCGTargetOpDef setc
2953                = { .args_ct_str = { "q", "r", "re" } };
2954            return &setc;
2955        }
2956    case INDEX_op_movcond_i32:
2957    case INDEX_op_movcond_i64:
2958        {
2959            static const TCGTargetOpDef movc
2960                = { .args_ct_str = { "r", "r", "re", "r", "0" } };
2961            return &movc;
2962        }
2963    case INDEX_op_div2_i32:
2964    case INDEX_op_div2_i64:
2965    case INDEX_op_divu2_i32:
2966    case INDEX_op_divu2_i64:
2967        {
2968            static const TCGTargetOpDef div2
2969                = { .args_ct_str = { "a", "d", "0", "1", "r" } };
2970            return &div2;
2971        }
2972    case INDEX_op_mulu2_i32:
2973    case INDEX_op_mulu2_i64:
2974    case INDEX_op_muls2_i32:
2975    case INDEX_op_muls2_i64:
2976        {
2977            static const TCGTargetOpDef mul2
2978                = { .args_ct_str = { "a", "d", "a", "r" } };
2979            return &mul2;
2980        }
2981    case INDEX_op_add2_i32:
2982    case INDEX_op_add2_i64:
2983    case INDEX_op_sub2_i32:
2984    case INDEX_op_sub2_i64:
2985        {
2986            static const TCGTargetOpDef arith2
2987                = { .args_ct_str = { "r", "r", "0", "1", "re", "re" } };
2988            return &arith2;
2989        }
2990    case INDEX_op_ctz_i32:
2991    case INDEX_op_ctz_i64:
2992        {
2993            static const TCGTargetOpDef ctz[2] = {
2994                { .args_ct_str = { "&r", "r", "r" } },
2995                { .args_ct_str = { "&r", "r", "rW" } },
2996            };
2997            return &ctz[have_bmi1];
2998        }
2999    case INDEX_op_clz_i32:
3000    case INDEX_op_clz_i64:
3001        {
3002            static const TCGTargetOpDef clz[2] = {
3003                { .args_ct_str = { "&r", "r", "r" } },
3004                { .args_ct_str = { "&r", "r", "rW" } },
3005            };
3006            return &clz[have_lzcnt];
3007        }
3008
3009    case INDEX_op_qemu_ld_i32:
3010        return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_L : &r_L_L;
3011    case INDEX_op_qemu_st_i32:
3012        return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &L_L : &L_L_L;
3013    case INDEX_op_qemu_ld_i64:
3014        return (TCG_TARGET_REG_BITS == 64 ? &r_L
3015                : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_r_L
3016                : &r_r_L_L);
3017    case INDEX_op_qemu_st_i64:
3018        return (TCG_TARGET_REG_BITS == 64 ? &L_L
3019                : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &L_L_L
3020                : &L_L_L_L);
3021
3022    case INDEX_op_brcond2_i32:
3023        {
3024            static const TCGTargetOpDef b2
3025                = { .args_ct_str = { "r", "r", "ri", "ri" } };
3026            return &b2;
3027        }
3028    case INDEX_op_setcond2_i32:
3029        {
3030            static const TCGTargetOpDef s2
3031                = { .args_ct_str = { "r", "r", "r", "ri", "ri" } };
3032            return &s2;
3033        }
3034
3035    case INDEX_op_ld_vec:
3036    case INDEX_op_st_vec:
3037        return &x_r;
3038
3039    case INDEX_op_add_vec:
3040    case INDEX_op_sub_vec:
3041    case INDEX_op_mul_vec:
3042    case INDEX_op_and_vec:
3043    case INDEX_op_or_vec:
3044    case INDEX_op_xor_vec:
3045    case INDEX_op_andc_vec:
3046    case INDEX_op_cmp_vec:
3047    case INDEX_op_x86_shufps_vec:
3048    case INDEX_op_x86_blend_vec:
3049    case INDEX_op_x86_packss_vec:
3050    case INDEX_op_x86_packus_vec:
3051    case INDEX_op_x86_vperm2i128_vec:
3052    case INDEX_op_x86_punpckl_vec:
3053    case INDEX_op_x86_punpckh_vec:
3054#if TCG_TARGET_REG_BITS == 32
3055    case INDEX_op_dup2_vec:
3056#endif
3057        return &x_x_x;
3058    case INDEX_op_dup_vec:
3059    case INDEX_op_shli_vec:
3060    case INDEX_op_shri_vec:
3061    case INDEX_op_sari_vec:
3062    case INDEX_op_x86_psrldq_vec:
3063        return &x_x;
3064    case INDEX_op_x86_vpblendvb_vec:
3065        return &x_x_x_x;
3066
3067    default:
3068        break;
3069    }
3070    return NULL;
3071}
3072
3073int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
3074{
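        /* Return 1 if the opcode is supported directly, 0 if it is not
           supported, and -1 if it can be implemented by expansion via
           tcg_expand_vec_op.  */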
3075    switch (opc) {
3076    case INDEX_op_add_vec:
3077    case INDEX_op_sub_vec:
3078    case INDEX_op_and_vec:
3079    case INDEX_op_or_vec:
3080    case INDEX_op_xor_vec:
3081    case INDEX_op_andc_vec:
3082        return 1;
3083    case INDEX_op_cmp_vec:
3084        return -1;
3085
3086    case INDEX_op_shli_vec:
3087    case INDEX_op_shri_vec:
3088        /* We must expand the operation for MO_8.  */
3089        return vece == MO_8 ? -1 : 1;
3090
3091    case INDEX_op_sari_vec:
3092        /* We must expand the operation for MO_8.  */
3093        if (vece == MO_8) {
3094            return -1;
3095        }
3096        /* We can emulate this for MO_64, but it does not pay off
3097           unless we're producing at least 4 values.  */
3098        if (vece == MO_64) {
3099            return type >= TCG_TYPE_V256 ? -1 : 0;
3100        }
3101        return 1;
3102
3103    case INDEX_op_mul_vec:
3104        if (vece == MO_8) {
3105            /* We can expand the operation for MO_8.  */
3106            return -1;
3107        }
3108        if (vece == MO_64) {
3109            return 0;
3110        }
3111        return 1;
3112
3113    default:
3114        return 0;
3115    }
3116}
3117
3118void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
3119                       TCGArg a0, ...)
3120{
3121    va_list va;
3122    TCGArg a1, a2;
3123    TCGv_vec v0, t1, t2, t3, t4;
3124
3125    va_start(va, a0);
3126    v0 = temp_tcgv_vec(arg_temp(a0));
3127
3128    switch (opc) {
3129    case INDEX_op_shli_vec:
3130    case INDEX_op_shri_vec:
3131        tcg_debug_assert(vece == MO_8);
3132        a1 = va_arg(va, TCGArg);
3133        a2 = va_arg(va, TCGArg);
3134        /* Unpack to W, shift, and repack.  Tricky bits:
3135           (1) Use punpck*bw x,x to produce DDCCBBAA,
3136               i.e. duplicate in other half of the 16-bit lane.
3137           (2) For right-shift, add 8 so that the high half of
3138               the lane becomes zero.  For left-shift, we must
3139               shift up and down again.
3140           (3) Step 2 leaves high half zero such that PACKUSWB
3141               (pack with unsigned saturation) does not modify
3142               the quantity.  */
3143        t1 = tcg_temp_new_vec(type);
3144        t2 = tcg_temp_new_vec(type);
3145        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3146                  tcgv_vec_arg(t1), a1, a1);
3147        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3148                  tcgv_vec_arg(t2), a1, a1);
3149        if (opc == INDEX_op_shri_vec) {
3150            vec_gen_3(INDEX_op_shri_vec, type, MO_16,
3151                      tcgv_vec_arg(t1), tcgv_vec_arg(t1), a2 + 8);
3152            vec_gen_3(INDEX_op_shri_vec, type, MO_16,
3153                      tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2 + 8);
3154        } else {
3155            vec_gen_3(INDEX_op_shli_vec, type, MO_16,
3156                      tcgv_vec_arg(t1), tcgv_vec_arg(t1), a2 + 8);
3157            vec_gen_3(INDEX_op_shli_vec, type, MO_16,
3158                      tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2 + 8);
3159            vec_gen_3(INDEX_op_shri_vec, type, MO_16,
3160                      tcgv_vec_arg(t1), tcgv_vec_arg(t1), 8);
3161            vec_gen_3(INDEX_op_shri_vec, type, MO_16,
3162                      tcgv_vec_arg(t2), tcgv_vec_arg(t2), 8);
3163        }
3164        vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3165                  a0, tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3166        tcg_temp_free_vec(t1);
3167        tcg_temp_free_vec(t2);
3168        break;
3169
3170    case INDEX_op_sari_vec:
3171        a1 = va_arg(va, TCGArg);
3172        a2 = va_arg(va, TCGArg);
3173        if (vece == MO_8) {
3174            /* Unpack to W, shift, and repack, as above.  */
3175            t1 = tcg_temp_new_vec(type);
3176            t2 = tcg_temp_new_vec(type);
3177            vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3178                      tcgv_vec_arg(t1), a1, a1);
3179            vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3180                      tcgv_vec_arg(t2), a1, a1);
3181            vec_gen_3(INDEX_op_sari_vec, type, MO_16,
3182                      tcgv_vec_arg(t1), tcgv_vec_arg(t1), a2 + 8);
3183            vec_gen_3(INDEX_op_sari_vec, type, MO_16,
3184                      tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2 + 8);
3185            vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
3186                      a0, tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3187            tcg_temp_free_vec(t1);
3188            tcg_temp_free_vec(t2);
3189            break;
3190        }
3191        tcg_debug_assert(vece == MO_64);
3192        /* MO_64: If the shift is <= 32, we can emulate the sign extend by
3193           performing an arithmetic 32-bit shift and overwriting the high
3194           half of the result (note that the ISA says shift of 32 is valid). */
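        /* The 32-bit arithmetic shift leaves a correctly sign-extended upper
           dword in the odd 32-bit lanes of t1, the 64-bit logical shift
           computes the low part in a0, and the MO_32 blend with immediate
           0xaa (take the odd dwords from t1) merges the two halves.  */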
3195        if (a2 <= 32) {
3196            t1 = tcg_temp_new_vec(type);
3197            vec_gen_3(INDEX_op_sari_vec, type, MO_32, tcgv_vec_arg(t1), a1, a2);
3198            vec_gen_3(INDEX_op_shri_vec, type, MO_64, a0, a1, a2);
3199            vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
3200                      a0, a0, tcgv_vec_arg(t1), 0xaa);
3201            tcg_temp_free_vec(t1);
3202            break;
3203        }
3204        /* Otherwise we will need to use a compare vs 0 to produce the
3205           sign-extend, shift and merge.  */
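        /* cmpgt(0, a1) yields an all-ones lane wherever a1 is negative;
           shifting that mask left by 64 - a2 places it over the bits vacated
           by the logical right shift, and the final OR completes the
           arithmetic shift.  */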
3206        t1 = tcg_temp_new_vec(type);
3207        t2 = tcg_const_zeros_vec(type);
3208        vec_gen_4(INDEX_op_cmp_vec, type, MO_64,
3209                  tcgv_vec_arg(t1), tcgv_vec_arg(t2), a1, TCG_COND_GT);
3210        tcg_temp_free_vec(t2);
3211        vec_gen_3(INDEX_op_shri_vec, type, MO_64, a0, a1, a2);
3212        vec_gen_3(INDEX_op_shli_vec, type, MO_64,
3213                  tcgv_vec_arg(t1), tcgv_vec_arg(t1), 64 - a2);
3214        vec_gen_3(INDEX_op_or_vec, type, MO_64, a0, a0, tcgv_vec_arg(t1));
3215        tcg_temp_free_vec(t1);
3216        break;
3217
3218    case INDEX_op_mul_vec:
3219        tcg_debug_assert(vece == MO_8);
3220        a1 = va_arg(va, TCGArg);
3221        a2 = va_arg(va, TCGArg);
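        /* There is no x86 byte multiply, so widen to 16 bits.  The operands
           are interleaved with zeros in opposite orders, so one factor is
           zero-extended while the other arrives shifted left by 8; the low
           byte of each product therefore lands in the high byte of its
           16-bit lane, and a logical right shift by 8 moves it to the low
           byte with the high byte clear, ready for PACKUSWB.  */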
3222        switch (type) {
3223        case TCG_TYPE_V64:
3224            t1 = tcg_temp_new_vec(TCG_TYPE_V128);
3225            t2 = tcg_temp_new_vec(TCG_TYPE_V128);
3226            tcg_gen_dup16i_vec(t2, 0);
3227            vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3228                      tcgv_vec_arg(t1), a1, tcgv_vec_arg(t2));
3229            vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3230                      tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2);
3231            tcg_gen_mul_vec(MO_16, t1, t1, t2);
3232            tcg_gen_shri_vec(MO_16, t1, t1, 8);
3233            vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
3234                      a0, tcgv_vec_arg(t1), tcgv_vec_arg(t1));
3235            tcg_temp_free_vec(t1);
3236            tcg_temp_free_vec(t2);
3237            break;
3238
3239        case TCG_TYPE_V128:
3240            t1 = tcg_temp_new_vec(TCG_TYPE_V128);
3241            t2 = tcg_temp_new_vec(TCG_TYPE_V128);
3242            t3 = tcg_temp_new_vec(TCG_TYPE_V128);
3243            t4 = tcg_temp_new_vec(TCG_TYPE_V128);
3244            tcg_gen_dup16i_vec(t4, 0);
3245            vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3246                      tcgv_vec_arg(t1), a1, tcgv_vec_arg(t4));
3247            vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3248                      tcgv_vec_arg(t2), tcgv_vec_arg(t4), a2);
3249            vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V128, MO_8,
3250                      tcgv_vec_arg(t3), a1, tcgv_vec_arg(t4));
3251            vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V128, MO_8,
3252                      tcgv_vec_arg(t4), tcgv_vec_arg(t4), a2);
3253            tcg_gen_mul_vec(MO_16, t1, t1, t2);
3254            tcg_gen_mul_vec(MO_16, t3, t3, t4);
3255            tcg_gen_shri_vec(MO_16, t1, t1, 8);
3256            tcg_gen_shri_vec(MO_16, t3, t3, 8);
3257            vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
3258                      a0, tcgv_vec_arg(t1), tcgv_vec_arg(t3));
3259            tcg_temp_free_vec(t1);
3260            tcg_temp_free_vec(t2);
3261            tcg_temp_free_vec(t3);
3262            tcg_temp_free_vec(t4);
3263            break;
3264
3265        case TCG_TYPE_V256:
3266            t1 = tcg_temp_new_vec(TCG_TYPE_V256);
3267            t2 = tcg_temp_new_vec(TCG_TYPE_V256);
3268            t3 = tcg_temp_new_vec(TCG_TYPE_V256);
3269            t4 = tcg_temp_new_vec(TCG_TYPE_V256);
3270            tcg_gen_dup16i_vec(t4, 0);
3271            /* a1: A[0-7] ... D[0-7]; a2: W[0-7] ... Z[0-7]
3272               t1: extends of B[0-7], D[0-7]
3273               t2: extends of X[0-7], Z[0-7]
3274               t3: extends of A[0-7], C[0-7]
3275               t4: extends of W[0-7], Y[0-7].  */
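            /* The AVX2 punpck and pack instructions operate on each 128-bit
               lane independently; the groupings above describe the two lanes
               side by side, and the final per-lane PACKUSWB puts the products
               back into the original element order.  */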
3276            vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V256, MO_8,
3277                      tcgv_vec_arg(t1), a1, tcgv_vec_arg(t4));
3278            vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V256, MO_8,
3279                      tcgv_vec_arg(t2), tcgv_vec_arg(t4), a2);
3280            vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V256, MO_8,
3281                      tcgv_vec_arg(t3), a1, tcgv_vec_arg(t4));
3282            vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V256, MO_8,
3283                      tcgv_vec_arg(t4), tcgv_vec_arg(t4), a2);
3284            /* t1: BX DZ; t3: AW CY.  */
3285            tcg_gen_mul_vec(MO_16, t1, t1, t2);
3286            tcg_gen_mul_vec(MO_16, t3, t3, t4);
3287            tcg_gen_shri_vec(MO_16, t1, t1, 8);
3288            tcg_gen_shri_vec(MO_16, t3, t3, 8);
3289            /* a0: AW BX CY DZ.  */
3290            vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V256, MO_8,
3291                      a0, tcgv_vec_arg(t1), tcgv_vec_arg(t3));
3292            tcg_temp_free_vec(t1);
3293            tcg_temp_free_vec(t2);
3294            tcg_temp_free_vec(t3);
3295            tcg_temp_free_vec(t4);
3296            break;
3297
3298        default:
3299            g_assert_not_reached();
3300        }
3301        break;
3302
3303    case INDEX_op_cmp_vec:
3304        {
3305            enum {
3306                NEED_SWAP = 1,
3307                NEED_INV  = 2,
3308                NEED_BIAS = 4
3309            };
3310            static const uint8_t fixups[16] = {
3311                [0 ... 15] = -1,
3312                [TCG_COND_EQ] = 0,
3313                [TCG_COND_NE] = NEED_INV,
3314                [TCG_COND_GT] = 0,
3315                [TCG_COND_LT] = NEED_SWAP,
3316                [TCG_COND_LE] = NEED_INV,
3317                [TCG_COND_GE] = NEED_SWAP | NEED_INV,
3318                [TCG_COND_GTU] = NEED_BIAS,
3319                [TCG_COND_LTU] = NEED_BIAS | NEED_SWAP,
3320                [TCG_COND_LEU] = NEED_BIAS | NEED_INV,
3321                [TCG_COND_GEU] = NEED_BIAS | NEED_SWAP | NEED_INV,
3322            };
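            /* x86 only provides EQ and GT integer vector comparisons, so all
               other conditions are rewritten in terms of those.  E.g. for
               LTU: NEED_SWAP exchanges the operands (turning LTU into GTU)
               and NEED_BIAS subtracts the sign bit, 1 << (width - 1), from
               both operands, which converts the unsigned comparison into the
               signed GT that the hardware provides.  */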
3323
3324            TCGCond cond;
3325            uint8_t fixup;
3326
3327            a1 = va_arg(va, TCGArg);
3328            a2 = va_arg(va, TCGArg);
3329            cond = va_arg(va, TCGArg);
3330            fixup = fixups[cond & 15];
3331            tcg_debug_assert(fixup != 0xff);
3332
3333            if (fixup & NEED_INV) {
3334                cond = tcg_invert_cond(cond);
3335            }
3336            if (fixup & NEED_SWAP) {
3337                TCGArg t;
3338                t = a1, a1 = a2, a2 = t;
3339                cond = tcg_swap_cond(cond);
3340            }
3341
3342            t1 = t2 = NULL;
3343            if (fixup & NEED_BIAS) {
3344                t1 = tcg_temp_new_vec(type);
3345                t2 = tcg_temp_new_vec(type);
3346                tcg_gen_dupi_vec(vece, t2, 1ull << ((8 << vece) - 1));
3347                tcg_gen_sub_vec(vece, t1, temp_tcgv_vec(arg_temp(a1)), t2);
3348                tcg_gen_sub_vec(vece, t2, temp_tcgv_vec(arg_temp(a2)), t2);
3349                a1 = tcgv_vec_arg(t1);
3350                a2 = tcgv_vec_arg(t2);
3351                cond = tcg_signed_cond(cond);
3352            }
3353
3354            tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
3355            vec_gen_4(INDEX_op_cmp_vec, type, vece, a0, a1, a2, cond);
3356
3357            if (fixup & NEED_BIAS) {
3358                tcg_temp_free_vec(t1);
3359                tcg_temp_free_vec(t2);
3360            }
3361            if (fixup & NEED_INV) {
3362                tcg_gen_not_vec(vece, v0, v0);
3363            }
3364        }
3365        break;
3366
3367    default:
3368        break;
3369    }
3370
3371    va_end(va);
3372}
3373
3374static const int tcg_target_callee_save_regs[] = {
3375#if TCG_TARGET_REG_BITS == 64
3376    TCG_REG_RBP,
3377    TCG_REG_RBX,
3378#if defined(_WIN64)
3379    TCG_REG_RDI,
3380    TCG_REG_RSI,
3381#endif
3382    TCG_REG_R12,
3383    TCG_REG_R13,
3384    TCG_REG_R14, /* Currently used for the global env. */
3385    TCG_REG_R15,
3386#else
3387    TCG_REG_EBP, /* Currently used for the global env. */
3388    TCG_REG_EBX,
3389    TCG_REG_ESI,
3390    TCG_REG_EDI,
3391#endif
3392};
3393
3394/* Compute frame size via macros, to share between tcg_target_qemu_prologue
3395   and tcg_register_jit.  */
3396
3397#define PUSH_SIZE \
3398    ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \
3399     * (TCG_TARGET_REG_BITS / 8))
3400
3401#define FRAME_SIZE \
3402    ((PUSH_SIZE \
3403      + TCG_STATIC_CALL_ARGS_SIZE \
3404      + CPU_TEMP_BUF_NLONGS * sizeof(long) \
3405      + TCG_TARGET_STACK_ALIGN - 1) \
3406     & ~(TCG_TARGET_STACK_ALIGN - 1))
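
/* As a concrete illustration (the exact values depend on the build
   configuration): on 64-bit non-Windows hosts there are 6 callee-saved
   registers, so PUSH_SIZE is (1 + 6) * 8 = 56; with the usual
   TCG_STATIC_CALL_ARGS_SIZE of 128 and CPU_TEMP_BUF_NLONGS of 128 longs,
   FRAME_SIZE rounds up to (56 + 128 + 1024 + 15) & ~15 = 1216 bytes.  */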
3407
3408/* Generate global QEMU prologue and epilogue code */
3409static void tcg_target_qemu_prologue(TCGContext *s)
3410{
3411    int i, stack_addend;
3412
3413    /* TB prologue */
3414
3415    /* Reserve some stack space, also for TCG temps.  */
3416    stack_addend = FRAME_SIZE - PUSH_SIZE;
3417    tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
3418                  CPU_TEMP_BUF_NLONGS * sizeof(long));
3419
3420    /* Save all callee saved registers.  */
3421    for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
3422        tcg_out_push(s, tcg_target_callee_save_regs[i]);
3423    }
3424
3425#if TCG_TARGET_REG_BITS == 32
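    /* The prologue is entered via tcg_qemu_tb_exec(env, tb), so in 32-bit
       mode both arguments are on the stack: env lives in the first slot
       above the saved registers and the return address, and tb one slot
       higher, adjusted by stack_addend once the frame below is allocated. */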
3426    tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
3427               (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
3428    tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
3429    /* jmp *tb.  */
3430    tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
3431                         (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
3432                         + stack_addend);
3433#else
3434    tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
3435    tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
3436    /* jmp *tb.  */
3437    tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
3438#endif
3439
3440    /*
3441     * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
3442     * and fall through to the rest of the epilogue.
3443     */
3444    s->code_gen_epilogue = s->code_ptr;
3445    tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);
3446
3447    /* TB epilogue */
3448    tb_ret_addr = s->code_ptr;
3449
3450    tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);
3451
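    /* When AVX2 is available the generated code may have used the 256-bit
       registers; issue vzeroupper before returning so that the caller's
       SSE code does not pay the AVX-to-SSE transition penalty.  */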
3452    if (have_avx2) {
3453        tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
3454    }
3455    for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
3456        tcg_out_pop(s, tcg_target_callee_save_regs[i]);
3457    }
3458    tcg_out_opc(s, OPC_RET, 0, 0, 0);
3459
3460#if !defined(CONFIG_SOFTMMU)
3461    /* Try to set up a segment register to point to guest_base.  */
3462    if (guest_base) {
3463        setup_guest_base_seg();
3464    }
3465#endif
3466}
3467
3468static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
3469{
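    /* 0x90 is the one-byte x86 NOP, so the padding remains executable.  */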
3470    memset(p, 0x90, count);
3471}
3472
3473static void tcg_target_init(TCGContext *s)
3474{
3475#ifdef CONFIG_CPUID_H
3476    unsigned a, b, c, d, b7 = 0;
3477    int max = __get_cpuid_max(0, 0);
3478
3479    if (max >= 7) {
3480        /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs.  */
3481        __cpuid_count(7, 0, a, b7, c, d);
3482        have_bmi1 = (b7 & bit_BMI) != 0;
3483        have_bmi2 = (b7 & bit_BMI2) != 0;
3484    }
3485
3486    if (max >= 1) {
3487        __cpuid(1, a, b, c, d);
3488#ifndef have_cmov
3489        /* For 32-bit, 99% certainty that we're running on hardware that
3490           supports cmov, but we still need to check.  In case cmov is not
3491           available, we'll use a small forward branch.  */
3492        have_cmov = (d & bit_CMOV) != 0;
3493#endif
3494
3495        /* MOVBE is only available on Intel Atom and Haswell CPUs, so we
3496           need to probe for it.  */
3497        have_movbe = (c & bit_MOVBE) != 0;
3498        have_popcnt = (c & bit_POPCNT) != 0;
3499
3500        /* Several conditions must hold before we can be sure that using
3501           the AVX instructions will not raise an invalid-opcode fault.  */
3502        if (c & bit_OSXSAVE) {
3503            unsigned xcrl, xcrh;
3504            /* Older assemblers do not know the xgetbv instruction,
3505             * so we encode it manually.
3506             */
3507            asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcrl), "=d" (xcrh) : "c" (0));
3508            if ((xcrl & 6) == 6) {
3509                have_avx1 = (c & bit_AVX) != 0;
3510                have_avx2 = (b7 & bit_AVX2) != 0;
3511            }
3512        }
3513    }
3514
3515    max = __get_cpuid_max(0x80000000, 0);
3516    if (max >= 1) {
3517        __cpuid(0x80000001, a, b, c, d);
3518        /* LZCNT was introduced with AMD Barcelona and Intel Haswell CPUs.  */
3519        have_lzcnt = (c & bit_LZCNT) != 0;
3520    }
3521#endif /* CONFIG_CPUID_H */
3522
3523    tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
3524    if (TCG_TARGET_REG_BITS == 64) {
3525        tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
3526    }
3527    if (have_avx1) {
3528        tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
3529        tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
3530    }
3531    if (have_avx2) {
3532        tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
3533    }
3534
3535    tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
3536    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
3537    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
3538    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
3539    if (TCG_TARGET_REG_BITS == 64) {
3540#if !defined(_WIN64)
3541        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
3542        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
3543#endif
3544        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
3545        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
3546        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
3547        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
3548    }
3549
3550    s->reserved_regs = 0;
3551    tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
3552}
3553
3554typedef struct {
3555    DebugFrameHeader h;
3556    uint8_t fde_def_cfa[4];
3557    uint8_t fde_reg_ofs[14];
3558} DebugFrame;
3559
3560/* FRAME_SIZE must fit in a 2-byte uleb128 encoding, i.e. in 14 bits.  */
3561QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
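
/* The two uleb128 bytes in fde_def_cfa below encode the low 7 bits of
   FRAME_SIZE with the continuation bit set, followed by the remaining high
   bits; e.g. a frame size of 1216 (0x4c0) encodes as 0xc0, 0x09.  */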
3562
3563#if !defined(__ELF__)
3564    /* Host machine without ELF. */
3565#elif TCG_TARGET_REG_BITS == 64
3566#define ELF_HOST_MACHINE EM_X86_64
3567static const DebugFrame debug_frame = {
3568    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
3569    .h.cie.id = -1,
3570    .h.cie.version = 1,
3571    .h.cie.code_align = 1,
3572    .h.cie.data_align = 0x78,             /* sleb128 -8 */
3573    .h.cie.return_column = 16,
3574
3575    /* Total FDE size does not include the "len" member.  */
3576    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
3577
3578    .fde_def_cfa = {
3579        12, 7,                          /* DW_CFA_def_cfa %rsp, ... */
3580        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
3581        (FRAME_SIZE >> 7)
3582    },
3583    .fde_reg_ofs = {
3584        0x90, 1,                        /* DW_CFA_offset, %rip, -8 */
3585        /* The following ordering must match tcg_target_callee_save_regs.  */
3586        0x86, 2,                        /* DW_CFA_offset, %rbp, -16 */
3587        0x83, 3,                        /* DW_CFA_offset, %rbx, -24 */
3588        0x8c, 4,                        /* DW_CFA_offset, %r12, -32 */
3589        0x8d, 5,                        /* DW_CFA_offset, %r13, -40 */
3590        0x8e, 6,                        /* DW_CFA_offset, %r14, -48 */
3591        0x8f, 7,                        /* DW_CFA_offset, %r15, -56 */
3592    }
3593};
3594#else
3595#define ELF_HOST_MACHINE EM_386
3596static const DebugFrame debug_frame = {
3597    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
3598    .h.cie.id = -1,
3599    .h.cie.version = 1,
3600    .h.cie.code_align = 1,
3601    .h.cie.data_align = 0x7c,             /* sleb128 -4 */
3602    .h.cie.return_column = 8,
3603
3604    /* Total FDE size does not include the "len" member.  */
3605    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
3606
3607    .fde_def_cfa = {
3608        12, 4,                          /* DW_CFA_def_cfa %esp, ... */
3609        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
3610        (FRAME_SIZE >> 7)
3611    },
3612    .fde_reg_ofs = {
3613        0x88, 1,                        /* DW_CFA_offset, %eip, -4 */
3614        /* The following ordering must match tcg_target_callee_save_regs.  */
3615        0x85, 2,                        /* DW_CFA_offset, %ebp, -8 */
3616        0x83, 3,                        /* DW_CFA_offset, %ebx, -12 */
3617        0x86, 4,                        /* DW_CFA_offset, %esi, -16 */
3618        0x87, 5,                        /* DW_CFA_offset, %edi, -20 */
3619    }
3620};
3621#endif
3622
3623#if defined(ELF_HOST_MACHINE)
3624void tcg_register_jit(void *buf, size_t buf_size)
3625{
3626    tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
3627}
3628#endif
3629