qemu/tcg/i386/tcg-target.inc.c
   1/*
   2 * Tiny Code Generator for QEMU
   3 *
   4 * Copyright (c) 2008 Fabrice Bellard
   5 *
   6 * Permission is hereby granted, free of charge, to any person obtaining a copy
   7 * of this software and associated documentation files (the "Software"), to deal
   8 * in the Software without restriction, including without limitation the rights
   9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10 * copies of the Software, and to permit persons to whom the Software is
  11 * furnished to do so, subject to the following conditions:
  12 *
  13 * The above copyright notice and this permission notice shall be included in
  14 * all copies or substantial portions of the Software.
  15 *
  16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22 * THE SOFTWARE.
  23 */
  24
  25#include "tcg-pool.inc.c"
  26
  27#ifdef CONFIG_DEBUG_TCG
  28static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
  29#if TCG_TARGET_REG_BITS == 64
  30    "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
  31#else
  32    "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
  33#endif
  34    "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
  35    "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
  36#if TCG_TARGET_REG_BITS == 64
  37    "%xmm8", "%xmm9", "%xmm10", "%xmm11",
  38    "%xmm12", "%xmm13", "%xmm14", "%xmm15",
  39#endif
  40};
  41#endif
  42
  43static const int tcg_target_reg_alloc_order[] = {
  44#if TCG_TARGET_REG_BITS == 64
  45    TCG_REG_RBP,
  46    TCG_REG_RBX,
  47    TCG_REG_R12,
  48    TCG_REG_R13,
  49    TCG_REG_R14,
  50    TCG_REG_R15,
  51    TCG_REG_R10,
  52    TCG_REG_R11,
  53    TCG_REG_R9,
  54    TCG_REG_R8,
  55    TCG_REG_RCX,
  56    TCG_REG_RDX,
  57    TCG_REG_RSI,
  58    TCG_REG_RDI,
  59    TCG_REG_RAX,
  60#else
  61    TCG_REG_EBX,
  62    TCG_REG_ESI,
  63    TCG_REG_EDI,
  64    TCG_REG_EBP,
  65    TCG_REG_ECX,
  66    TCG_REG_EDX,
  67    TCG_REG_EAX,
  68#endif
  69    TCG_REG_XMM0,
  70    TCG_REG_XMM1,
  71    TCG_REG_XMM2,
  72    TCG_REG_XMM3,
  73    TCG_REG_XMM4,
  74    TCG_REG_XMM5,
  75#ifndef _WIN64
  76    /* The Win64 ABI has xmm6-xmm15 as callee-saved, and we do not save
  77       any of them.  Therefore only allow xmm0-xmm5 to be allocated.  */
  78    TCG_REG_XMM6,
  79    TCG_REG_XMM7,
  80#if TCG_TARGET_REG_BITS == 64
  81    TCG_REG_XMM8,
  82    TCG_REG_XMM9,
  83    TCG_REG_XMM10,
  84    TCG_REG_XMM11,
  85    TCG_REG_XMM12,
  86    TCG_REG_XMM13,
  87    TCG_REG_XMM14,
  88    TCG_REG_XMM15,
  89#endif
  90#endif
  91};
  92
  93static const int tcg_target_call_iarg_regs[] = {
  94#if TCG_TARGET_REG_BITS == 64
  95#if defined(_WIN64)
  96    TCG_REG_RCX,
  97    TCG_REG_RDX,
  98#else
  99    TCG_REG_RDI,
 100    TCG_REG_RSI,
 101    TCG_REG_RDX,
 102    TCG_REG_RCX,
 103#endif
 104    TCG_REG_R8,
 105    TCG_REG_R9,
 106#else
 107    /* 32-bit mode uses a stack-based calling convention (GCC default). */
 108#endif
 109};
 110
 111static const int tcg_target_call_oarg_regs[] = {
 112    TCG_REG_EAX,
 113#if TCG_TARGET_REG_BITS == 32
 114    TCG_REG_EDX
 115#endif
 116};
 117
 118/* Constants we accept.  */
 119#define TCG_CT_CONST_S32 0x100
 120#define TCG_CT_CONST_U32 0x200
 121#define TCG_CT_CONST_I32 0x400
 122#define TCG_CT_CONST_WSZ 0x800
 123
 124/* Registers used with L constraint, which are the first argument
 125   registers on x86_64, and two random call clobbered registers on
 126   i386. */
 127#if TCG_TARGET_REG_BITS == 64
 128# define TCG_REG_L0 tcg_target_call_iarg_regs[0]
 129# define TCG_REG_L1 tcg_target_call_iarg_regs[1]
 130#else
 131# define TCG_REG_L0 TCG_REG_EAX
 132# define TCG_REG_L1 TCG_REG_EDX
 133#endif
 134
 135/* The host compiler should supply <cpuid.h> to enable runtime feature
 136   detection, as we're not going to go so far as our own inline assembly.
 137   If not available, default values will be assumed.  */
 138#if defined(CONFIG_CPUID_H)
 139#include "qemu/cpuid.h"
 140#endif
 141
 142/* For 64-bit, we always know that CMOV is available.  */
 143#if TCG_TARGET_REG_BITS == 64
 144# define have_cmov 1
 145#elif defined(CONFIG_CPUID_H)
 146static bool have_cmov;
 147#else
 148# define have_cmov 0
 149#endif
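
    /* Illustrative sketch only (not code from this file): when CONFIG_CPUID_H
       is available, a flag such as have_cmov is filled in at startup from
       CPUID leaf 1 (see tcg_target_init further down), roughly like:

           unsigned a, b, c, d;
           if (__get_cpuid(1, &a, &b, &c, &d)) {
               have_cmov = (d & bit_CMOV) != 0;
           }

       where __get_cpuid and bit_CMOV come from the compiler's <cpuid.h>.  */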
 150
 151/* We need these symbols in tcg-target.h, and we can't properly conditionalize
 152   them there.  Therefore we always define the variables.  */
 153bool have_bmi1;
 154bool have_popcnt;
 155bool have_avx1;
 156bool have_avx2;
 157
 158#ifdef CONFIG_CPUID_H
 159static bool have_movbe;
 160static bool have_bmi2;
 161static bool have_lzcnt;
 162#else
 163# define have_movbe 0
 164# define have_bmi2 0
 165# define have_lzcnt 0
 166#endif
 167
 168static tcg_insn_unit *tb_ret_addr;
 169
 170static void patch_reloc(tcg_insn_unit *code_ptr, int type,
 171                        intptr_t value, intptr_t addend)
 172{
 173    value += addend;
 174    switch(type) {
 175    case R_386_PC32:
 176        value -= (uintptr_t)code_ptr;
 177        if (value != (int32_t)value) {
 178            tcg_abort();
 179        }
 180        /* FALLTHRU */
 181    case R_386_32:
 182        tcg_patch32(code_ptr, value);
 183        break;
 184    case R_386_PC8:
 185        value -= (uintptr_t)code_ptr;
 186        if (value != (int8_t)value) {
 187            tcg_abort();
 188        }
 189        tcg_patch8(code_ptr, value);
 190        break;
 191    default:
 192        tcg_abort();
 193    }
 194}
 195
 196#if TCG_TARGET_REG_BITS == 64
 197#define ALL_GENERAL_REGS   0x0000ffffu
 198#define ALL_VECTOR_REGS    0xffff0000u
 199#else
 200#define ALL_GENERAL_REGS   0x000000ffu
 201#define ALL_VECTOR_REGS    0x00ff0000u
 202#endif
 203
 204/* parse target specific constraints */
 205static const char *target_parse_constraint(TCGArgConstraint *ct,
 206                                           const char *ct_str, TCGType type)
 207{
 208    switch(*ct_str++) {
 209    case 'a':
 210        ct->ct |= TCG_CT_REG;
 211        tcg_regset_set_reg(ct->u.regs, TCG_REG_EAX);
 212        break;
 213    case 'b':
 214        ct->ct |= TCG_CT_REG;
 215        tcg_regset_set_reg(ct->u.regs, TCG_REG_EBX);
 216        break;
 217    case 'c':
 218        ct->ct |= TCG_CT_REG;
 219        tcg_regset_set_reg(ct->u.regs, TCG_REG_ECX);
 220        break;
 221    case 'd':
 222        ct->ct |= TCG_CT_REG;
 223        tcg_regset_set_reg(ct->u.regs, TCG_REG_EDX);
 224        break;
 225    case 'S':
 226        ct->ct |= TCG_CT_REG;
 227        tcg_regset_set_reg(ct->u.regs, TCG_REG_ESI);
 228        break;
 229    case 'D':
 230        ct->ct |= TCG_CT_REG;
 231        tcg_regset_set_reg(ct->u.regs, TCG_REG_EDI);
 232        break;
 233    case 'q':
 234        /* A register that can be used as a byte operand.  */
 235        ct->ct |= TCG_CT_REG;
 236        ct->u.regs = TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xf;
 237        break;
 238    case 'Q':
 239        /* A register with an addressable second byte (e.g. %ah).  */
 240        ct->ct |= TCG_CT_REG;
 241        ct->u.regs = 0xf;
 242        break;
 243    case 'r':
 244        /* A general register.  */
 245        ct->ct |= TCG_CT_REG;
 246        ct->u.regs |= ALL_GENERAL_REGS;
 247        break;
 248    case 'W':
 249        /* With TZCNT/LZCNT, we can have operand-size as an input.  */
 250        ct->ct |= TCG_CT_CONST_WSZ;
 251        break;
 252    case 'x':
 253        /* A vector register.  */
 254        ct->ct |= TCG_CT_REG;
 255        ct->u.regs |= ALL_VECTOR_REGS;
 256        break;
 257
 258        /* qemu_ld/st address constraint */
 259    case 'L':
 260        ct->ct |= TCG_CT_REG;
 261        ct->u.regs = TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xff;
 262        tcg_regset_reset_reg(ct->u.regs, TCG_REG_L0);
 263        tcg_regset_reset_reg(ct->u.regs, TCG_REG_L1);
 264        break;
 265
 266    case 'e':
 267        ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_S32);
 268        break;
 269    case 'Z':
 270        ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_U32);
 271        break;
 272    case 'I':
 273        ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_I32);
 274        break;
 275
 276    default:
 277        return NULL;
 278    }
 279    return ct_str;
 280}
 281
 282/* test if a constant matches the constraint */
 283static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
 284                                         const TCGArgConstraint *arg_ct)
 285{
 286    int ct = arg_ct->ct;
 287    if (ct & TCG_CT_CONST) {
 288        return 1;
 289    }
 290    if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
 291        return 1;
 292    }
 293    if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
 294        return 1;
 295    }
 296    if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
 297        return 1;
 298    }
 299    if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
 300        return 1;
 301    }
 302    return 0;
 303}
 304
 305# define LOWREGMASK(x)  ((x) & 7)
 306
 307#define P_EXT           0x100           /* 0x0f opcode prefix */
 308#define P_EXT38         0x200           /* 0x0f 0x38 opcode prefix */
 309#define P_DATA16        0x400           /* 0x66 opcode prefix */
 310#if TCG_TARGET_REG_BITS == 64
 311# define P_ADDR32       0x800           /* 0x67 opcode prefix */
 312# define P_REXW         0x1000          /* Set REX.W = 1 */
 313# define P_REXB_R       0x2000          /* REG field as byte register */
 314# define P_REXB_RM      0x4000          /* R/M field as byte register */
 315# define P_GS           0x8000          /* gs segment override */
 316#else
 317# define P_ADDR32       0
 318# define P_REXW         0
 319# define P_REXB_R       0
 320# define P_REXB_RM      0
 321# define P_GS           0
 322#endif
 323#define P_EXT3A         0x10000         /* 0x0f 0x3a opcode prefix */
 324#define P_SIMDF3        0x20000         /* 0xf3 opcode prefix */
 325#define P_SIMDF2        0x40000         /* 0xf2 opcode prefix */
 326#define P_VEXL          0x80000         /* Set VEX.L = 1 */
 327
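    /* For reference, a worked example of how these flags compose:
       OPC_MOVZBL below is (0xb6 | P_EXT), so it is emitted as the escape
       byte 0x0f followed by 0xb6; OPC_PCMPEQB is (0x74 | P_EXT | P_DATA16),
       which additionally gets a 0x66 prefix (or VEX.pp = 1 when VEX
       encoded).  REX and VEX prefixes are layered on top by tcg_out_opc
       and tcg_out_vex_opc as needed.  */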
 328#define OPC_ARITH_EvIz  (0x81)
 329#define OPC_ARITH_EvIb  (0x83)
 330#define OPC_ARITH_GvEv  (0x03)          /* ... plus (ARITH_FOO << 3) */
 331#define OPC_ANDN        (0xf2 | P_EXT38)
 332#define OPC_ADD_GvEv    (OPC_ARITH_GvEv | (ARITH_ADD << 3))
 333#define OPC_BLENDPS     (0x0c | P_EXT3A | P_DATA16)
 334#define OPC_BSF         (0xbc | P_EXT)
 335#define OPC_BSR         (0xbd | P_EXT)
 336#define OPC_BSWAP       (0xc8 | P_EXT)
 337#define OPC_CALL_Jz     (0xe8)
 338#define OPC_CMOVCC      (0x40 | P_EXT)  /* ... plus condition code */
 339#define OPC_CMP_GvEv    (OPC_ARITH_GvEv | (ARITH_CMP << 3))
 340#define OPC_DEC_r32     (0x48)
 341#define OPC_IMUL_GvEv   (0xaf | P_EXT)
 342#define OPC_IMUL_GvEvIb (0x6b)
 343#define OPC_IMUL_GvEvIz (0x69)
 344#define OPC_INC_r32     (0x40)
 345#define OPC_JCC_long    (0x80 | P_EXT)  /* ... plus condition code */
 346#define OPC_JCC_short   (0x70)          /* ... plus condition code */
 347#define OPC_JMP_long    (0xe9)
 348#define OPC_JMP_short   (0xeb)
 349#define OPC_LEA         (0x8d)
 350#define OPC_LZCNT       (0xbd | P_EXT | P_SIMDF3)
 351#define OPC_MOVB_EvGv   (0x88)          /* stores, more or less */
 352#define OPC_MOVL_EvGv   (0x89)          /* stores, more or less */
 353#define OPC_MOVL_GvEv   (0x8b)          /* loads, more or less */
 354#define OPC_MOVB_EvIz   (0xc6)
 355#define OPC_MOVL_EvIz   (0xc7)
 356#define OPC_MOVL_Iv     (0xb8)
 357#define OPC_MOVBE_GyMy  (0xf0 | P_EXT38)
 358#define OPC_MOVBE_MyGy  (0xf1 | P_EXT38)
 359#define OPC_MOVD_VyEy   (0x6e | P_EXT | P_DATA16)
 360#define OPC_MOVD_EyVy   (0x7e | P_EXT | P_DATA16)
 361#define OPC_MOVDDUP     (0x12 | P_EXT | P_SIMDF2)
 362#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
 363#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
 364#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
 365#define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3)
 366#define OPC_MOVQ_VqWq   (0x7e | P_EXT | P_SIMDF3)
 367#define OPC_MOVQ_WqVq   (0xd6 | P_EXT | P_DATA16)
 368#define OPC_MOVSBL      (0xbe | P_EXT)
 369#define OPC_MOVSWL      (0xbf | P_EXT)
 370#define OPC_MOVSLQ      (0x63 | P_REXW)
 371#define OPC_MOVZBL      (0xb6 | P_EXT)
 372#define OPC_MOVZWL      (0xb7 | P_EXT)
 373#define OPC_PACKSSDW    (0x6b | P_EXT | P_DATA16)
 374#define OPC_PACKSSWB    (0x63 | P_EXT | P_DATA16)
 375#define OPC_PACKUSDW    (0x2b | P_EXT38 | P_DATA16)
 376#define OPC_PACKUSWB    (0x67 | P_EXT | P_DATA16)
 377#define OPC_PADDB       (0xfc | P_EXT | P_DATA16)
 378#define OPC_PADDW       (0xfd | P_EXT | P_DATA16)
 379#define OPC_PADDD       (0xfe | P_EXT | P_DATA16)
 380#define OPC_PADDQ       (0xd4 | P_EXT | P_DATA16)
 381#define OPC_PAND        (0xdb | P_EXT | P_DATA16)
 382#define OPC_PANDN       (0xdf | P_EXT | P_DATA16)
 383#define OPC_PBLENDW     (0x0e | P_EXT3A | P_DATA16)
 384#define OPC_PCMPEQB     (0x74 | P_EXT | P_DATA16)
 385#define OPC_PCMPEQW     (0x75 | P_EXT | P_DATA16)
 386#define OPC_PCMPEQD     (0x76 | P_EXT | P_DATA16)
 387#define OPC_PCMPEQQ     (0x29 | P_EXT38 | P_DATA16)
 388#define OPC_PCMPGTB     (0x64 | P_EXT | P_DATA16)
 389#define OPC_PCMPGTW     (0x65 | P_EXT | P_DATA16)
 390#define OPC_PCMPGTD     (0x66 | P_EXT | P_DATA16)
 391#define OPC_PCMPGTQ     (0x37 | P_EXT38 | P_DATA16)
 392#define OPC_PMOVSXBW    (0x20 | P_EXT38 | P_DATA16)
 393#define OPC_PMOVSXWD    (0x23 | P_EXT38 | P_DATA16)
 394#define OPC_PMOVSXDQ    (0x25 | P_EXT38 | P_DATA16)
 395#define OPC_PMOVZXBW    (0x30 | P_EXT38 | P_DATA16)
 396#define OPC_PMOVZXWD    (0x33 | P_EXT38 | P_DATA16)
 397#define OPC_PMOVZXDQ    (0x35 | P_EXT38 | P_DATA16)
 398#define OPC_PMULLW      (0xd5 | P_EXT | P_DATA16)
 399#define OPC_PMULLD      (0x40 | P_EXT38 | P_DATA16)
 400#define OPC_POR         (0xeb | P_EXT | P_DATA16)
 401#define OPC_PSHUFB      (0x00 | P_EXT38 | P_DATA16)
 402#define OPC_PSHUFD      (0x70 | P_EXT | P_DATA16)
 403#define OPC_PSHUFLW     (0x70 | P_EXT | P_SIMDF2)
 404#define OPC_PSHUFHW     (0x70 | P_EXT | P_SIMDF3)
 405#define OPC_PSHIFTW_Ib  (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
 406#define OPC_PSHIFTD_Ib  (0x72 | P_EXT | P_DATA16) /* /2 /6 /4 */
 407#define OPC_PSHIFTQ_Ib  (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
 408#define OPC_PSUBB       (0xf8 | P_EXT | P_DATA16)
 409#define OPC_PSUBW       (0xf9 | P_EXT | P_DATA16)
 410#define OPC_PSUBD       (0xfa | P_EXT | P_DATA16)
 411#define OPC_PSUBQ       (0xfb | P_EXT | P_DATA16)
 412#define OPC_PUNPCKLBW   (0x60 | P_EXT | P_DATA16)
 413#define OPC_PUNPCKLWD   (0x61 | P_EXT | P_DATA16)
 414#define OPC_PUNPCKLDQ   (0x62 | P_EXT | P_DATA16)
 415#define OPC_PUNPCKLQDQ  (0x6c | P_EXT | P_DATA16)
 416#define OPC_PUNPCKHBW   (0x68 | P_EXT | P_DATA16)
 417#define OPC_PUNPCKHWD   (0x69 | P_EXT | P_DATA16)
 418#define OPC_PUNPCKHDQ   (0x6a | P_EXT | P_DATA16)
 419#define OPC_PUNPCKHQDQ  (0x6d | P_EXT | P_DATA16)
 420#define OPC_PXOR        (0xef | P_EXT | P_DATA16)
 421#define OPC_POP_r32     (0x58)
 422#define OPC_POPCNT      (0xb8 | P_EXT | P_SIMDF3)
 423#define OPC_PUSH_r32    (0x50)
 424#define OPC_PUSH_Iv     (0x68)
 425#define OPC_PUSH_Ib     (0x6a)
 426#define OPC_RET         (0xc3)
 427#define OPC_SETCC       (0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
 428#define OPC_SHIFT_1     (0xd1)
 429#define OPC_SHIFT_Ib    (0xc1)
 430#define OPC_SHIFT_cl    (0xd3)
 431#define OPC_SARX        (0xf7 | P_EXT38 | P_SIMDF3)
 432#define OPC_SHUFPS      (0xc6 | P_EXT)
 433#define OPC_SHLX        (0xf7 | P_EXT38 | P_DATA16)
 434#define OPC_SHRX        (0xf7 | P_EXT38 | P_SIMDF2)
 435#define OPC_TESTL       (0x85)
 436#define OPC_TZCNT       (0xbc | P_EXT | P_SIMDF3)
 437#define OPC_UD2         (0x0b | P_EXT)
 438#define OPC_VPBLENDD    (0x02 | P_EXT3A | P_DATA16)
 439#define OPC_VPBLENDVB   (0x4c | P_EXT3A | P_DATA16)
 440#define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16)
 441#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
 442#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
 443#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
 444#define OPC_VPERMQ      (0x00 | P_EXT3A | P_DATA16 | P_REXW)
 445#define OPC_VPERM2I128  (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
 446#define OPC_VZEROUPPER  (0x77 | P_EXT)
 447#define OPC_XCHG_ax_r32 (0x90)
 448
 449#define OPC_GRP3_Ev     (0xf7)
 450#define OPC_GRP5        (0xff)
 451#define OPC_GRP14       (0x73 | P_EXT | P_DATA16)
 452
 453/* Group 1 opcode extensions for 0x80-0x83.
 454   These are also used as modifiers for OPC_ARITH.  */
 455#define ARITH_ADD 0
 456#define ARITH_OR  1
 457#define ARITH_ADC 2
 458#define ARITH_SBB 3
 459#define ARITH_AND 4
 460#define ARITH_SUB 5
 461#define ARITH_XOR 6
 462#define ARITH_CMP 7
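    /* Example of the dual use, for reference: with an immediate operand,
       tgen_arithi emits 0x83 /subop, e.g. "addl $1, %eax" is 83 c0 01;
       with a register source, OPC_ARITH_GvEv | (ARITH_ADD << 3) = 0x03,
       e.g. "addl %ecx, %eax" is 03 c1.  */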
 463
 464/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3.  */
 465#define SHIFT_ROL 0
 466#define SHIFT_ROR 1
 467#define SHIFT_SHL 4
 468#define SHIFT_SHR 5
 469#define SHIFT_SAR 7
 470
 471/* Group 3 opcode extensions for 0xf6, 0xf7.  To be used with OPC_GRP3.  */
 472#define EXT3_NOT   2
 473#define EXT3_NEG   3
 474#define EXT3_MUL   4
 475#define EXT3_IMUL  5
 476#define EXT3_DIV   6
 477#define EXT3_IDIV  7
 478
 479/* Group 5 opcode extensions for 0xff.  To be used with OPC_GRP5.  */
 480#define EXT5_INC_Ev     0
 481#define EXT5_DEC_Ev     1
 482#define EXT5_CALLN_Ev   2
 483#define EXT5_JMPN_Ev    4
 484
 485/* Condition codes to be added to OPC_JCC_{long,short}.  */
 486#define JCC_JMP (-1)
 487#define JCC_JO  0x0
 488#define JCC_JNO 0x1
 489#define JCC_JB  0x2
 490#define JCC_JAE 0x3
 491#define JCC_JE  0x4
 492#define JCC_JNE 0x5
 493#define JCC_JBE 0x6
 494#define JCC_JA  0x7
 495#define JCC_JS  0x8
 496#define JCC_JNS 0x9
 497#define JCC_JP  0xa
 498#define JCC_JNP 0xb
 499#define JCC_JL  0xc
 500#define JCC_JGE 0xd
 501#define JCC_JLE 0xe
 502#define JCC_JG  0xf
 503
 504static const uint8_t tcg_cond_to_jcc[] = {
 505    [TCG_COND_EQ] = JCC_JE,
 506    [TCG_COND_NE] = JCC_JNE,
 507    [TCG_COND_LT] = JCC_JL,
 508    [TCG_COND_GE] = JCC_JGE,
 509    [TCG_COND_LE] = JCC_JLE,
 510    [TCG_COND_GT] = JCC_JG,
 511    [TCG_COND_LTU] = JCC_JB,
 512    [TCG_COND_GEU] = JCC_JAE,
 513    [TCG_COND_LEU] = JCC_JBE,
 514    [TCG_COND_GTU] = JCC_JA,
 515};
 516
 517#if TCG_TARGET_REG_BITS == 64
 518static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
 519{
 520    int rex;
 521
 522    if (opc & P_GS) {
 523        tcg_out8(s, 0x65);
 524    }
 525    if (opc & P_DATA16) {
 526        /* We should never be asking for both 16 and 64-bit operation.  */
 527        tcg_debug_assert((opc & P_REXW) == 0);
 528        tcg_out8(s, 0x66);
 529    }
 530    if (opc & P_ADDR32) {
 531        tcg_out8(s, 0x67);
 532    }
 533    if (opc & P_SIMDF3) {
 534        tcg_out8(s, 0xf3);
 535    } else if (opc & P_SIMDF2) {
 536        tcg_out8(s, 0xf2);
 537    }
 538
 539    rex = 0;
 540    rex |= (opc & P_REXW) ? 0x8 : 0x0;  /* REX.W */
 541    rex |= (r & 8) >> 1;                /* REX.R */
 542    rex |= (x & 8) >> 2;                /* REX.X */
 543    rex |= (rm & 8) >> 3;               /* REX.B */
 544
 545    /* P_REXB_{R,RM} indicates that the given register is the low byte.
 546       For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
 547       as otherwise the encoding indicates %[abcd]h.  Note that the values
 548       that are ORed in merely indicate that the REX byte must be present;
 549       those bits get discarded in output.  */
 550    rex |= opc & (r >= 4 ? P_REXB_R : 0);
 551    rex |= opc & (rm >= 4 ? P_REXB_RM : 0);
 552
 553    if (rex) {
 554        tcg_out8(s, (uint8_t)(rex | 0x40));
 555    }
 556
 557    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
 558        tcg_out8(s, 0x0f);
 559        if (opc & P_EXT38) {
 560            tcg_out8(s, 0x38);
 561        } else if (opc & P_EXT3A) {
 562            tcg_out8(s, 0x3a);
 563        }
 564    }
 565
 566    tcg_out8(s, opc);
 567}
 568#else
 569static void tcg_out_opc(TCGContext *s, int opc)
 570{
 571    if (opc & P_DATA16) {
 572        tcg_out8(s, 0x66);
 573    }
 574    if (opc & P_SIMDF3) {
 575        tcg_out8(s, 0xf3);
 576    } else if (opc & P_SIMDF2) {
 577        tcg_out8(s, 0xf2);
 578    }
 579    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
 580        tcg_out8(s, 0x0f);
 581        if (opc & P_EXT38) {
 582            tcg_out8(s, 0x38);
 583        } else if (opc & P_EXT3A) {
 584            tcg_out8(s, 0x3a);
 585        }
 586    }
 587    tcg_out8(s, opc);
 588}
 589/* Discard the register arguments to tcg_out_opc early, so as not to penalize
 590   the 32-bit compilation paths.  This method works with all versions of gcc,
 591   whereas relying on optimization may not be able to exclude them.  */
 592#define tcg_out_opc(s, opc, r, rm, x)  (tcg_out_opc)(s, opc)
 593#endif
 594
 595static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
 596{
 597    tcg_out_opc(s, opc, r, rm, 0);
 598    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
 599}
 600
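    /* For reference, the VEX prefix layout produced by tcg_out_vex_opc:
         2-byte form: c5 [R' vvvv' L pp]
         3-byte form: c4 [R' X' B' m-mmmm] [W vvvv' L pp]
       Primed fields are stored inverted; vvvv names the second source
       register, m-mmmm selects the 0f / 0f 38 / 0f 3a opcode map, L is
       the 256-bit vector length bit, and pp encodes the 66/f3/f2 prefix.  */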
 601static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
 602                            int rm, int index)
 603{
 604    int tmp;
 605
 606    /* Use the two byte form if possible, which cannot encode
 607       VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT.  */
 608    if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_REXW)) == P_EXT
 609        && ((rm | index) & 8) == 0) {
 610        /* Two byte VEX prefix.  */
 611        tcg_out8(s, 0xc5);
 612
 613        tmp = (r & 8 ? 0 : 0x80);              /* VEX.R */
 614    } else {
 615        /* Three byte VEX prefix.  */
 616        tcg_out8(s, 0xc4);
 617
 618        /* VEX.m-mmmm */
 619        if (opc & P_EXT3A) {
 620            tmp = 3;
 621        } else if (opc & P_EXT38) {
 622            tmp = 2;
 623        } else if (opc & P_EXT) {
 624            tmp = 1;
 625        } else {
 626            g_assert_not_reached();
 627        }
 628        tmp |= (r & 8 ? 0 : 0x80);             /* VEX.R */
 629        tmp |= (index & 8 ? 0 : 0x40);         /* VEX.X */
 630        tmp |= (rm & 8 ? 0 : 0x20);            /* VEX.B */
 631        tcg_out8(s, tmp);
 632
 633        tmp = (opc & P_REXW ? 0x80 : 0);       /* VEX.W */
 634    }
 635
 636    tmp |= (opc & P_VEXL ? 0x04 : 0);      /* VEX.L */
 637    /* VEX.pp */
 638    if (opc & P_DATA16) {
 639        tmp |= 1;                          /* 0x66 */
 640    } else if (opc & P_SIMDF3) {
 641        tmp |= 2;                          /* 0xf3 */
 642    } else if (opc & P_SIMDF2) {
 643        tmp |= 3;                          /* 0xf2 */
 644    }
 645    tmp |= (~v & 15) << 3;                 /* VEX.vvvv */
 646    tcg_out8(s, tmp);
 647    tcg_out8(s, opc);
 648}
 649
 650static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
 651{
 652    tcg_out_vex_opc(s, opc, r, v, rm, 0);
 653    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
 654}
 655
 656/* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
 657   Either RM or INDEX may be omitted, indicated by a negative value.  In
 658   64-bit mode for absolute addresses, ~RM is the size of the immediate
 659   operand that will follow the instruction.  */
 660
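    /* For reference, the byte layouts assembled below:
         ModRM: mod[7:6] reg[5:3] r/m[2:0]
         SIB:   scale[7:6] index[5:3] base[2:0]
       mod selects no / 8-bit / 32-bit displacement, and the %ebp and %esp
       slots are re-purposed (absolute or rip-relative addressing, and the
       SIB escape / "no index") as noted in the comments in the function.  */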
 661static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
 662                               int shift, intptr_t offset)
 663{
 664    int mod, len;
 665
 666    if (index < 0 && rm < 0) {
 667        if (TCG_TARGET_REG_BITS == 64) {
 668            /* Try for a rip-relative addressing mode.  This has replaced
 669               the 32-bit-mode absolute addressing encoding.  */
 670            intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
 671            intptr_t disp = offset - pc;
 672            if (disp == (int32_t)disp) {
 673                tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
 674                tcg_out32(s, disp);
 675                return;
 676            }
 677
 678            /* Try for an absolute address encoding.  This requires the
 679               use of the MODRM+SIB encoding and is therefore larger than
 680               rip-relative addressing.  */
 681            if (offset == (int32_t)offset) {
 682                tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
 683                tcg_out8(s, (4 << 3) | 5);
 684                tcg_out32(s, offset);
 685                return;
 686            }
 687
 688            /* ??? The memory isn't directly addressable.  */
 689            g_assert_not_reached();
 690        } else {
 691            /* Absolute address.  */
 692            tcg_out8(s, (r << 3) | 5);
 693            tcg_out32(s, offset);
 694            return;
 695        }
 696    }
 697
 698    /* Find the length of the immediate addend.  Note that the encoding
 699       that would be used for (%ebp) indicates absolute addressing.  */
 700    if (rm < 0) {
 701        mod = 0, len = 4, rm = 5;
 702    } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
 703        mod = 0, len = 0;
 704    } else if (offset == (int8_t)offset) {
 705        mod = 0x40, len = 1;
 706    } else {
 707        mod = 0x80, len = 4;
 708    }
 709
 710    /* Use a single byte MODRM format if possible.  Note that the encoding
 711       that would be used for %esp is the escape to the two byte form.  */
 712    if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
 713        /* Single byte MODRM format.  */
 714        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
 715    } else {
 716        /* Two byte MODRM+SIB format.  */
 717
 718        /* Note that the encoding that would place %esp into the index
 719           field indicates no index register.  In 64-bit mode, the REX.X
 720           bit counts, so %r12 can be used as the index.  */
 721        if (index < 0) {
 722            index = 4;
 723        } else {
 724            tcg_debug_assert(index != TCG_REG_ESP);
 725        }
 726
 727        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
 728        tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
 729    }
 730
 731    if (len == 1) {
 732        tcg_out8(s, offset);
 733    } else if (len == 4) {
 734        tcg_out32(s, offset);
 735    }
 736}
 737
 738static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
 739                                     int index, int shift, intptr_t offset)
 740{
 741    tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
 742    tcg_out_sib_offset(s, r, rm, index, shift, offset);
 743}
 744
 745static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
 746                                         int rm, int index, int shift,
 747                                         intptr_t offset)
 748{
 749    tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
 750    tcg_out_sib_offset(s, r, rm, index, shift, offset);
 751}
 752
 753/* A simplification of the above with no index or shift.  */
 754static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
 755                                        int rm, intptr_t offset)
 756{
 757    tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
 758}
 759
 760static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
 761                                            int v, int rm, intptr_t offset)
 762{
 763    tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
 764}
 765
 766/* Output an opcode with an expected reference to the constant pool.  */
 767static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
 768{
 769    tcg_out_opc(s, opc, r, 0, 0);
 770    /* Absolute for 32-bit, pc-relative for 64-bit.  */
 771    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
 772    tcg_out32(s, 0);
 773}
 774
 775/* Output an opcode with an expected reference to the constant pool.  */
 776static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
 777{
 778    tcg_out_vex_opc(s, opc, r, 0, 0, 0);
 779    /* Absolute for 32-bit, pc-relative for 64-bit.  */
 780    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
 781    tcg_out32(s, 0);
 782}
 783
 784/* Generate dest op= src.  Uses the same ARITH_* codes as tgen_arithi.  */
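    /* For instance, tgen_arithr(s, ARITH_SUB + P_REXW, a, b) emits
       "subq %b, %a": P_REXW is peeled off as a prefix flag and
       OPC_ARITH_GvEv + (ARITH_SUB << 3) yields opcode 0x2b.  */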
 785static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
 786{
 787    /* Propagate an opcode prefix, such as P_REXW.  */
 788    int ext = subop & ~0x7;
 789    subop &= 0x7;
 790
 791    tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
 792}
 793
 794static void tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
 795{
 796    int rexw = 0;
 797
 798    if (arg == ret) {
 799        return;
 800    }
 801    switch (type) {
 802    case TCG_TYPE_I64:
 803        rexw = P_REXW;
 804        /* fallthru */
 805    case TCG_TYPE_I32:
 806        if (ret < 16) {
 807            if (arg < 16) {
 808                tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
 809            } else {
 810                tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
 811            }
 812        } else {
 813            if (arg < 16) {
 814                tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
 815            } else {
 816                tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
 817            }
 818        }
 819        break;
 820
 821    case TCG_TYPE_V64:
 822        tcg_debug_assert(ret >= 16 && arg >= 16);
 823        tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
 824        break;
 825    case TCG_TYPE_V128:
 826        tcg_debug_assert(ret >= 16 && arg >= 16);
 827        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
 828        break;
 829    case TCG_TYPE_V256:
 830        tcg_debug_assert(ret >= 16 && arg >= 16);
 831        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
 832        break;
 833
 834    default:
 835        g_assert_not_reached();
 836    }
 837}
 838
 839static void tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
 840                            TCGReg r, TCGReg a)
 841{
 842    if (have_avx2) {
 843        static const int dup_insn[4] = {
 844            OPC_VPBROADCASTB, OPC_VPBROADCASTW,
 845            OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
 846        };
 847        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
 848        tcg_out_vex_modrm(s, dup_insn[vece] + vex_l, r, 0, a);
 849    } else {
 850        switch (vece) {
 851        case MO_8:
 852            /* ??? With zero in a register, use PSHUFB.  */
 853            tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a);
 854            a = r;
 855            /* FALLTHRU */
 856        case MO_16:
 857            tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a);
 858            a = r;
 859            /* FALLTHRU */
 860        case MO_32:
 861            tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
 862            /* imm8 operand: all output lanes selected from input lane 0.  */
 863            tcg_out8(s, 0);
 864            break;
 865        case MO_64:
 866            tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a);
 867            break;
 868        default:
 869            g_assert_not_reached();
 870        }
 871    }
 872}
 873
 874static void tcg_out_dupi_vec(TCGContext *s, TCGType type,
 875                             TCGReg ret, tcg_target_long arg)
 876{
 877    int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
 878
 879    if (arg == 0) {
 880        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
 881        return;
 882    }
 883    if (arg == -1) {
 884        tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
 885        return;
 886    }
 887
 888    if (TCG_TARGET_REG_BITS == 64) {
 889        if (type == TCG_TYPE_V64) {
 890            tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
 891        } else if (have_avx2) {
 892            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
 893        } else {
 894            tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
 895        }
 896        new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
 897    } else if (have_avx2) {
 898        tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
 899        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
 900    } else {
 901        tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy, ret);
 902        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
 903        tcg_out_dup_vec(s, type, MO_32, ret, ret);
 904    }
 905}
 906
 907static void tcg_out_movi(TCGContext *s, TCGType type,
 908                         TCGReg ret, tcg_target_long arg)
 909{
 910    tcg_target_long diff;
 911
 912    switch (type) {
 913    case TCG_TYPE_I32:
 914#if TCG_TARGET_REG_BITS == 64
 915    case TCG_TYPE_I64:
 916#endif
 917        if (ret < 16) {
 918            break;
 919        }
 920        /* fallthru */
 921    case TCG_TYPE_V64:
 922    case TCG_TYPE_V128:
 923    case TCG_TYPE_V256:
 924        tcg_debug_assert(ret >= 16);
 925        tcg_out_dupi_vec(s, type, ret, arg);
 926        return;
 927    default:
 928        g_assert_not_reached();
 929    }
 930
 931    if (arg == 0) {
 932        tgen_arithr(s, ARITH_XOR, ret, ret);
 933        return;
 934    }
 935    if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
 936        tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
 937        tcg_out32(s, arg);
 938        return;
 939    }
 940    if (arg == (int32_t)arg) {
 941        tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
 942        tcg_out32(s, arg);
 943        return;
 944    }
 945
 946    /* Try a 7 byte pc-relative lea before the 10 byte movq.  */
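        /* Byte counts, for reference: the pc-relative lea is REX.W + 8d +
           modrm + disp32 = 7 bytes, versus REX.W + b8+reg + imm64 = 10 bytes
           for the movq; the movl above is 5 bytes (plus 1 for REX.B when
           needed) and the sign-extended c7 /0 form is 7.  */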
 947    diff = arg - ((uintptr_t)s->code_ptr + 7);
 948    if (diff == (int32_t)diff) {
 949        tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
 950        tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
 951        tcg_out32(s, diff);
 952        return;
 953    }
 954
 955    tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
 956    tcg_out64(s, arg);
 957}
 958
 959static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
 960{
 961    if (val == (int8_t)val) {
 962        tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
 963        tcg_out8(s, val);
 964    } else if (val == (int32_t)val) {
 965        tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
 966        tcg_out32(s, val);
 967    } else {
 968        tcg_abort();
 969    }
 970}
 971
 972static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
 973{
 974    /* Given the strength of x86 memory ordering, we need only care about
 975       store-load ordering.  Experimentally, "lock orl $0,0(%esp)" is
 976       faster than "mfence", so don't bother with the sse insn.  */
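        /* The sequence emitted below is the 5-byte f0 83 0c 24 00,
           i.e. "lock orl $0, (%esp)".  */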
 977    if (a0 & TCG_MO_ST_LD) {
 978        tcg_out8(s, 0xf0);
 979        tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
 980        tcg_out8(s, 0);
 981    }
 982}
 983
 984static inline void tcg_out_push(TCGContext *s, int reg)
 985{
 986    tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
 987}
 988
 989static inline void tcg_out_pop(TCGContext *s, int reg)
 990{
 991    tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
 992}
 993
 994static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
 995                       TCGReg arg1, intptr_t arg2)
 996{
 997    switch (type) {
 998    case TCG_TYPE_I32:
 999        if (ret < 16) {
1000            tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
1001        } else {
1002            tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
1003        }
1004        break;
1005    case TCG_TYPE_I64:
1006        if (ret < 16) {
1007            tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
1008            break;
1009        }
1010        /* FALLTHRU */
1011    case TCG_TYPE_V64:
1012        tcg_debug_assert(ret >= 16);
1013        tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
1014        break;
1015    case TCG_TYPE_V128:
1016        tcg_debug_assert(ret >= 16);
1017        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx, ret, 0, arg1, arg2);
1018        break;
1019    case TCG_TYPE_V256:
1020        tcg_debug_assert(ret >= 16);
1021        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
1022                                 ret, 0, arg1, arg2);
1023        break;
1024    default:
1025        g_assert_not_reached();
1026    }
1027}
1028
1029static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
1030                       TCGReg arg1, intptr_t arg2)
1031{
1032    switch (type) {
1033    case TCG_TYPE_I32:
1034        if (arg < 16) {
1035            tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
1036        } else {
1037            tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
1038        }
1039        break;
1040    case TCG_TYPE_I64:
1041        if (arg < 16) {
1042            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
1043            break;
1044        }
1045        /* FALLTHRU */
1046    case TCG_TYPE_V64:
1047        tcg_debug_assert(arg >= 16);
1048        tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
1049        break;
1050    case TCG_TYPE_V128:
1051        tcg_debug_assert(arg >= 16);
1052        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx, arg, 0, arg1, arg2);
1053        break;
1054    case TCG_TYPE_V256:
1055        tcg_debug_assert(arg >= 16);
1056        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
1057                                 arg, 0, arg1, arg2);
1058        break;
1059    default:
1060        g_assert_not_reached();
1061    }
1062}
1063
1064static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
1065                        TCGReg base, intptr_t ofs)
1066{
1067    int rexw = 0;
1068    if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
1069        if (val != (int32_t)val) {
1070            return false;
1071        }
1072        rexw = P_REXW;
1073    } else if (type != TCG_TYPE_I32) {
1074        return false;
1075    }
1076    tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
1077    tcg_out32(s, val);
1078    return true;
1079}
1080
1081static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
1082{
1083    /* Propagate an opcode prefix, such as P_DATA16.  */
1084    int ext = subopc & ~0x7;
1085    subopc &= 0x7;
1086
1087    if (count == 1) {
1088        tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
1089    } else {
1090        tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
1091        tcg_out8(s, count);
1092    }
1093}
1094
1095static inline void tcg_out_bswap32(TCGContext *s, int reg)
1096{
1097    tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
1098}
1099
1100static inline void tcg_out_rolw_8(TCGContext *s, int reg)
1101{
1102    tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
1103}
1104
1105static inline void tcg_out_ext8u(TCGContext *s, int dest, int src)
1106{
1107    /* movzbl */
1108    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1109    tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
1110}
1111
1112static void tcg_out_ext8s(TCGContext *s, int dest, int src, int rexw)
1113{
1114    /* movsbl */
1115    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1116    tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
1117}
1118
1119static inline void tcg_out_ext16u(TCGContext *s, int dest, int src)
1120{
1121    /* movzwl */
1122    tcg_out_modrm(s, OPC_MOVZWL, dest, src);
1123}
1124
1125static inline void tcg_out_ext16s(TCGContext *s, int dest, int src, int rexw)
1126{
1127    /* movsw[lq] */
1128    tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
1129}
1130
1131static inline void tcg_out_ext32u(TCGContext *s, int dest, int src)
1132{
1133    /* 32-bit mov zero extends.  */
1134    tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
1135}
1136
1137static inline void tcg_out_ext32s(TCGContext *s, int dest, int src)
1138{
1139    tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
1140}
1141
1142static inline void tcg_out_bswap64(TCGContext *s, int reg)
1143{
1144    tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
1145}
1146
1147static void tgen_arithi(TCGContext *s, int c, int r0,
1148                        tcg_target_long val, int cf)
1149{
1150    int rexw = 0;
1151
1152    if (TCG_TARGET_REG_BITS == 64) {
1153        rexw = c & -8;
1154        c &= 7;
1155    }
1156
1157    /* ??? While INC is 2 bytes shorter than ADDL $1, they also induce
1158       partial flags update stalls on Pentium4 and are not recommended
1159       by current Intel optimization manuals.  */
1160    if (!cf && (c == ARITH_ADD || c == ARITH_SUB) && (val == 1 || val == -1)) {
1161        int is_inc = (c == ARITH_ADD) ^ (val < 0);
1162        if (TCG_TARGET_REG_BITS == 64) {
1163            /* The single-byte increment encodings are re-tasked as the
1164               REX prefixes.  Use the MODRM encoding.  */
1165            tcg_out_modrm(s, OPC_GRP5 + rexw,
1166                          (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
1167        } else {
1168            tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
1169        }
1170        return;
1171    }
1172
1173    if (c == ARITH_AND) {
1174        if (TCG_TARGET_REG_BITS == 64) {
1175            if (val == 0xffffffffu) {
1176                tcg_out_ext32u(s, r0, r0);
1177                return;
1178            }
1179            if (val == (uint32_t)val) {
1180                /* AND with no high bits set can use a 32-bit operation.  */
1181                rexw = 0;
1182            }
1183        }
1184        if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
1185            tcg_out_ext8u(s, r0, r0);
1186            return;
1187        }
1188        if (val == 0xffffu) {
1189            tcg_out_ext16u(s, r0, r0);
1190            return;
1191        }
1192    }
1193
1194    if (val == (int8_t)val) {
1195        tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
1196        tcg_out8(s, val);
1197        return;
1198    }
1199    if (rexw == 0 || val == (int32_t)val) {
1200        tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
1201        tcg_out32(s, val);
1202        return;
1203    }
1204
1205    tcg_abort();
1206}
1207
1208static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
1209{
1210    if (val != 0) {
1211        tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
1212    }
1213}
1214
1215/* Use SMALL != 0 to force a short forward branch.  */
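    /* Encoding sizes, for reference, which explain the adjustments below:
       short jmp/jcc are 2 bytes (opcode + rel8), hence "val - 2"; long jmp
       is e9 + rel32 = 5 bytes and long jcc is 0f 8x + rel32 = 6 bytes,
       hence "val - 5" and "val - 6".  */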
1216static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, int small)
1217{
1218    int32_t val, val1;
1219
1220    if (l->has_value) {
1221        val = tcg_pcrel_diff(s, l->u.value_ptr);
1222        val1 = val - 2;
1223        if ((int8_t)val1 == val1) {
1224            if (opc == -1) {
1225                tcg_out8(s, OPC_JMP_short);
1226            } else {
1227                tcg_out8(s, OPC_JCC_short + opc);
1228            }
1229            tcg_out8(s, val1);
1230        } else {
1231            if (small) {
1232                tcg_abort();
1233            }
1234            if (opc == -1) {
1235                tcg_out8(s, OPC_JMP_long);
1236                tcg_out32(s, val - 5);
1237            } else {
1238                tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1239                tcg_out32(s, val - 6);
1240            }
1241        }
1242    } else if (small) {
1243        if (opc == -1) {
1244            tcg_out8(s, OPC_JMP_short);
1245        } else {
1246            tcg_out8(s, OPC_JCC_short + opc);
1247        }
1248        tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
1249        s->code_ptr += 1;
1250    } else {
1251        if (opc == -1) {
1252            tcg_out8(s, OPC_JMP_long);
1253        } else {
1254            tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1255        }
1256        tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
1257        s->code_ptr += 4;
1258    }
1259}
1260
1261static void tcg_out_cmp(TCGContext *s, TCGArg arg1, TCGArg arg2,
1262                        int const_arg2, int rexw)
1263{
1264    if (const_arg2) {
1265        if (arg2 == 0) {
1266            /* test r, r */
1267            tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
1268        } else {
1269            tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
1270        }
1271    } else {
1272        tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
1273    }
1274}
1275
1276static void tcg_out_brcond32(TCGContext *s, TCGCond cond,
1277                             TCGArg arg1, TCGArg arg2, int const_arg2,
1278                             TCGLabel *label, int small)
1279{
1280    tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
1281    tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
1282}
1283
1284#if TCG_TARGET_REG_BITS == 64
1285static void tcg_out_brcond64(TCGContext *s, TCGCond cond,
1286                             TCGArg arg1, TCGArg arg2, int const_arg2,
1287                             TCGLabel *label, int small)
1288{
1289    tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
1290    tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
1291}
1292#else
1293/* XXX: we implement it at the target level to avoid having to
1294   handle cross-basic-block temporaries */
1295static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
1296                            const int *const_args, int small)
1297{
1298    TCGLabel *label_next = gen_new_label();
1299    TCGLabel *label_this = arg_label(args[5]);
1300
1301    switch(args[4]) {
1302    case TCG_COND_EQ:
1303        tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
1304                         label_next, 1);
1305        tcg_out_brcond32(s, TCG_COND_EQ, args[1], args[3], const_args[3],
1306                         label_this, small);
1307        break;
1308    case TCG_COND_NE:
1309        tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
1310                         label_this, small);
1311        tcg_out_brcond32(s, TCG_COND_NE, args[1], args[3], const_args[3],
1312                         label_this, small);
1313        break;
1314    case TCG_COND_LT:
1315        tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
1316                         label_this, small);
1317        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1318        tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
1319                         label_this, small);
1320        break;
1321    case TCG_COND_LE:
1322        tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
1323                         label_this, small);
1324        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1325        tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
1326                         label_this, small);
1327        break;
1328    case TCG_COND_GT:
1329        tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
1330                         label_this, small);
1331        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1332        tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
1333                         label_this, small);
1334        break;
1335    case TCG_COND_GE:
1336        tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
1337                         label_this, small);
1338        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1339        tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
1340                         label_this, small);
1341        break;
1342    case TCG_COND_LTU:
1343        tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
1344                         label_this, small);
1345        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1346        tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
1347                         label_this, small);
1348        break;
1349    case TCG_COND_LEU:
1350        tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
1351                         label_this, small);
1352        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1353        tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
1354                         label_this, small);
1355        break;
1356    case TCG_COND_GTU:
1357        tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1358                         label_this, small);
1359        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1360        tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
1361                         label_this, small);
1362        break;
1363    case TCG_COND_GEU:
1364        tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1365                         label_this, small);
1366        tcg_out_jxx(s, JCC_JNE, label_next, 1);
1367        tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
1368                         label_this, small);
1369        break;
1370    default:
1371        tcg_abort();
1372    }
1373    tcg_out_label(s, label_next, s->code_ptr);
1374}
1375#endif
1376
1377static void tcg_out_setcond32(TCGContext *s, TCGCond cond, TCGArg dest,
1378                              TCGArg arg1, TCGArg arg2, int const_arg2)
1379{
1380    tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
1381    tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1382    tcg_out_ext8u(s, dest, dest);
1383}
1384
1385#if TCG_TARGET_REG_BITS == 64
1386static void tcg_out_setcond64(TCGContext *s, TCGCond cond, TCGArg dest,
1387                              TCGArg arg1, TCGArg arg2, int const_arg2)
1388{
1389    tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
1390    tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1391    tcg_out_ext8u(s, dest, dest);
1392}
1393#else
1394static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
1395                             const int *const_args)
1396{
1397    TCGArg new_args[6];
1398    TCGLabel *label_true, *label_over;
1399
1400    memcpy(new_args, args+1, 5*sizeof(TCGArg));
1401
1402    if (args[0] == args[1] || args[0] == args[2]
1403        || (!const_args[3] && args[0] == args[3])
1404        || (!const_args[4] && args[0] == args[4])) {
1405        /* When the destination overlaps with one of the argument
1406           registers, don't do anything tricky.  */
1407        label_true = gen_new_label();
1408        label_over = gen_new_label();
1409
1410        new_args[5] = label_arg(label_true);
1411        tcg_out_brcond2(s, new_args, const_args+1, 1);
1412
1413        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1414        tcg_out_jxx(s, JCC_JMP, label_over, 1);
1415        tcg_out_label(s, label_true, s->code_ptr);
1416
1417        tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
1418        tcg_out_label(s, label_over, s->code_ptr);
1419    } else {
1420        /* When the destination does not overlap one of the arguments,
1421           clear the destination first, jump if cond false, and emit an
1422           increment in the true case.  This results in smaller code.  */
1423
1424        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1425
1426        label_over = gen_new_label();
1427        new_args[4] = tcg_invert_cond(new_args[4]);
1428        new_args[5] = label_arg(label_over);
1429        tcg_out_brcond2(s, new_args, const_args+1, 1);
1430
1431        tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
1432        tcg_out_label(s, label_over, s->code_ptr);
1433    }
1434}
1435#endif
1436
1437static void tcg_out_cmov(TCGContext *s, TCGCond cond, int rexw,
1438                         TCGReg dest, TCGReg v1)
1439{
1440    if (have_cmov) {
1441        tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | rexw, dest, v1);
1442    } else {
1443        TCGLabel *over = gen_new_label();
1444        tcg_out_jxx(s, tcg_cond_to_jcc[tcg_invert_cond(cond)], over, 1);
1445        tcg_out_mov(s, TCG_TYPE_I32, dest, v1);
1446        tcg_out_label(s, over, s->code_ptr);
1447    }
1448}
1449
1450static void tcg_out_movcond32(TCGContext *s, TCGCond cond, TCGReg dest,
1451                              TCGReg c1, TCGArg c2, int const_c2,
1452                              TCGReg v1)
1453{
1454    tcg_out_cmp(s, c1, c2, const_c2, 0);
1455    tcg_out_cmov(s, cond, 0, dest, v1);
1456}
1457
1458#if TCG_TARGET_REG_BITS == 64
1459static void tcg_out_movcond64(TCGContext *s, TCGCond cond, TCGReg dest,
1460                              TCGReg c1, TCGArg c2, int const_c2,
1461                              TCGReg v1)
1462{
1463    tcg_out_cmp(s, c1, c2, const_c2, P_REXW);
1464    tcg_out_cmov(s, cond, P_REXW, dest, v1);
1465}
1466#endif
1467
1468static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1469                        TCGArg arg2, bool const_a2)
1470{
1471    if (have_bmi1) {
1472        tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
1473        if (const_a2) {
1474            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1475        } else {
1476            tcg_debug_assert(dest != arg2);
1477            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1478        }
1479    } else {
1480        tcg_debug_assert(dest != arg2);
1481        tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
1482        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1483    }
1484}
1485
1486static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1487                        TCGArg arg2, bool const_a2)
1488{
1489    if (have_lzcnt) {
1490        tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
1491        if (const_a2) {
1492            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1493        } else {
1494            tcg_debug_assert(dest != arg2);
1495            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1496        }
1497    } else {
1498        tcg_debug_assert(!const_a2);
1499        tcg_debug_assert(dest != arg1);
1500        tcg_debug_assert(dest != arg2);
1501
1502        /* Recall that the output of BSR is the index not the count.  */
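            /* E.g. a 32-bit value whose highest set bit is bit 28 gives
               BSR = 28, and the xor below yields 28 ^ 31 = 3, the
               leading-zero count.  */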
1503        tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
1504        tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);
1505
1506        /* Since we have destroyed the flags from BSR, we have to re-test.  */
1507        tcg_out_cmp(s, arg1, 0, 1, rexw);
1508        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1509    }
1510}
1511
1512static void tcg_out_branch(TCGContext *s, int call, tcg_insn_unit *dest)
1513{
1514    intptr_t disp = tcg_pcrel_diff(s, dest) - 5;
1515
1516    if (disp == (int32_t)disp) {
1517        tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
1518        tcg_out32(s, disp);
1519    } else {
1520        /* rip-relative addressing into the constant pool.
1521           This is 6 + 8 = 14 bytes, as compared to using an
1522           immediate load, 10 + 6 = 16 bytes, plus we may
1523           be able to re-use the pool constant for more calls.  */
1524        tcg_out_opc(s, OPC_GRP5, 0, 0, 0);
1525        tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
1526        new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
1527        tcg_out32(s, 0);
1528    }
1529}
1530
1531static inline void tcg_out_call(TCGContext *s, tcg_insn_unit *dest)
1532{
1533    tcg_out_branch(s, 1, dest);
1534}
1535
1536static void tcg_out_jmp(TCGContext *s, tcg_insn_unit *dest)
1537{
1538    tcg_out_branch(s, 0, dest);
1539}
1540
1541static void tcg_out_nopn(TCGContext *s, int n)
1542{
1543    int i;
1544    /* Emit 1 or 2 operand size prefixes for the standard one byte nop,
1545     * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the
1546     * duplicate prefix, and all of the interesting recent cores can
1547     * decode and discard the duplicates in a single cycle.
1548     */
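        /* E.g. tcg_out_nopn(s, 3) emits 66 66 90.  */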
1549    tcg_debug_assert(n >= 1);
1550    for (i = 1; i < n; ++i) {
1551        tcg_out8(s, 0x66);
1552    }
1553    tcg_out8(s, 0x90);
1554}
1555
1556#if defined(CONFIG_SOFTMMU)
1557#include "tcg-ldst.inc.c"
1558
1559/* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
1560 *                                     int mmu_idx, uintptr_t ra)
1561 */
1562static void * const qemu_ld_helpers[16] = {
1563    [MO_UB]   = helper_ret_ldub_mmu,
1564    [MO_LEUW] = helper_le_lduw_mmu,
1565    [MO_LEUL] = helper_le_ldul_mmu,
1566    [MO_LEQ]  = helper_le_ldq_mmu,
1567    [MO_BEUW] = helper_be_lduw_mmu,
1568    [MO_BEUL] = helper_be_ldul_mmu,
1569    [MO_BEQ]  = helper_be_ldq_mmu,
1570};
1571
1572/* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
1573 *                                     uintxx_t val, int mmu_idx, uintptr_t ra)
1574 */
1575static void * const qemu_st_helpers[16] = {
1576    [MO_UB]   = helper_ret_stb_mmu,
1577    [MO_LEUW] = helper_le_stw_mmu,
1578    [MO_LEUL] = helper_le_stl_mmu,
1579    [MO_LEQ]  = helper_le_stq_mmu,
1580    [MO_BEUW] = helper_be_stw_mmu,
1581    [MO_BEUL] = helper_be_stl_mmu,
1582    [MO_BEQ]  = helper_be_stq_mmu,
1583};
1584
1585/* Perform the TLB load and compare.
1586
1587   Inputs:
1588   ADDRLO and ADDRHI contain the low and high part of the address.
1589
1590   MEM_INDEX and OPC are the memory context and memory operation of the load.
1591
1592   WHICH is the offset into the CPUTLBEntry structure of the slot to read.
1593   This should be offsetof addr_read or addr_write.
1594
1595   Outputs:
1596   LABEL_PTRS is filled with 1 (32-bit addresses) or 2 (64-bit addresses)
1597   positions of the displacements of forward jumps to the TLB miss case.
1598
1599   Second argument register is loaded with the low part of the address.
1600   In the TLB hit case, it has been adjusted as indicated by the TLB
1601   and so is a host address.  In the TLB miss case, it continues to
1602   hold a guest address.
1603
1604   First argument register is clobbered.  */
1605
1606static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
1607                                    int mem_index, TCGMemOp opc,
1608                                    tcg_insn_unit **label_ptr, int which)
1609{
1610    const TCGReg r0 = TCG_REG_L0;
1611    const TCGReg r1 = TCG_REG_L1;
1612    TCGType ttype = TCG_TYPE_I32;
1613    TCGType tlbtype = TCG_TYPE_I32;
1614    int trexw = 0, hrexw = 0, tlbrexw = 0;
1615    unsigned a_bits = get_alignment_bits(opc);
1616    unsigned s_bits = opc & MO_SIZE;
1617    unsigned a_mask = (1 << a_bits) - 1;
1618    unsigned s_mask = (1 << s_bits) - 1;
1619    target_ulong tlb_mask;
1620
1621    if (TCG_TARGET_REG_BITS == 64) {
1622        if (TARGET_LONG_BITS == 64) {
1623            ttype = TCG_TYPE_I64;
1624            trexw = P_REXW;
1625        }
1626        if (TCG_TYPE_PTR == TCG_TYPE_I64) {
1627            hrexw = P_REXW;
1628            if (TARGET_PAGE_BITS + CPU_TLB_BITS > 32) {
1629                tlbtype = TCG_TYPE_I64;
1630                tlbrexw = P_REXW;
1631            }
1632        }
1633    }
1634
1635    tcg_out_mov(s, tlbtype, r0, addrlo);
1636    /* If the required alignment is at least as large as the access, simply
1637       copy the address and mask.  For lesser alignments, check that we don't
1638       cross pages for the complete access.  */
1639    if (a_bits >= s_bits) {
1640        tcg_out_mov(s, ttype, r1, addrlo);
1641    } else {
1642        tcg_out_modrm_offset(s, OPC_LEA + trexw, r1, addrlo, s_mask - a_mask);
1643    }
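    /* When the required alignment is smaller than the access size, the LEA
       above added s_mask - a_mask, so an access that crosses a page boundary
       changes the page-number bits of r1 and the TLB compare below fails,
       forcing the slow path.  */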
1644    tlb_mask = (target_ulong)TARGET_PAGE_MASK | a_mask;
1645
1646    tcg_out_shifti(s, SHIFT_SHR + tlbrexw, r0,
1647                   TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
1648
1649    tgen_arithi(s, ARITH_AND + trexw, r1, tlb_mask, 0);
1650    tgen_arithi(s, ARITH_AND + tlbrexw, r0,
1651                (CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS, 0);
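    /* r0 now holds the page index masked to CPU_TLB_SIZE entries and scaled
       by sizeof(CPUTLBEntry): the byte offset of this page's entry within
       tlb_table[mem_index].  */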
1652
1653    tcg_out_modrm_sib_offset(s, OPC_LEA + hrexw, r0, TCG_AREG0, r0, 0,
1654                             offsetof(CPUArchState, tlb_table[mem_index][0])
1655                             + which);
1656
1657    /* cmp 0(r0), r1 */
1658    tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw, r1, r0, 0);
1659
1660    /* Prepare for both the fast path add of the tlb addend, and the slow
1661       path function argument setup.  There are two cases worth noting:
1662       For 32-bit guest and x86_64 host, MOVL zero-extends the guest address
1663       before the fastpath ADDQ below.  For 64-bit guest and x32 host, MOVQ
1664       copies the entire guest address for the slow path, while truncation
1665       for the 32-bit host happens with the fastpath ADDL below.  */
1666    tcg_out_mov(s, ttype, r1, addrlo);
1667
1668    /* jne slow_path */
1669    tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1670    label_ptr[0] = s->code_ptr;
1671    s->code_ptr += 4;
1672
1673    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1674        /* cmp 4(r0), addrhi */
1675        tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, r0, 4);
1676
1677        /* jne slow_path */
1678        tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1679        label_ptr[1] = s->code_ptr;
1680        s->code_ptr += 4;
1681    }
1682
1683    /* TLB Hit.  */
1684
1685    /* add addend(r0), r1 */
1686    tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r1, r0,
1687                         offsetof(CPUTLBEntry, addend) - which);
1688}
1689
1690/*
1691 * Record the context of a call to the out-of-line helper code for the slow
1692 * path of a load or store, so that we can later generate the correct helper code.
1693 */
1694static void add_qemu_ldst_label(TCGContext *s, bool is_ld, TCGMemOpIdx oi,
1695                                TCGReg datalo, TCGReg datahi,
1696                                TCGReg addrlo, TCGReg addrhi,
1697                                tcg_insn_unit *raddr,
1698                                tcg_insn_unit **label_ptr)
1699{
1700    TCGLabelQemuLdst *label = new_ldst_label(s);
1701
1702    label->is_ld = is_ld;
1703    label->oi = oi;
1704    label->datalo_reg = datalo;
1705    label->datahi_reg = datahi;
1706    label->addrlo_reg = addrlo;
1707    label->addrhi_reg = addrhi;
1708    label->raddr = raddr;
1709    label->label_ptr[0] = label_ptr[0];
1710    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1711        label->label_ptr[1] = label_ptr[1];
1712    }
1713}
1714
1715/*
1716 * Generate code for the slow path for a load at the end of the block
1717 */
1718static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1719{
1720    TCGMemOpIdx oi = l->oi;
1721    TCGMemOp opc = get_memop(oi);
1722    TCGReg data_reg;
1723    tcg_insn_unit **label_ptr = &l->label_ptr[0];
1724
1725    /* resolve label address */
1726    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1727    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1728        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1729    }
1730
1731    if (TCG_TARGET_REG_BITS == 32) {
1732        int ofs = 0;
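        /* On a 32-bit host all helper arguments are passed on the stack,
           in order: env, addrlo[, addrhi], oi, return address.  */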
1733
1734        tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1735        ofs += 4;
1736
1737        tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1738        ofs += 4;
1739
1740        if (TARGET_LONG_BITS == 64) {
1741            tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1742            ofs += 4;
1743        }
1744
1745        tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
1746        ofs += 4;
1747
1748        tcg_out_sti(s, TCG_TYPE_PTR, (uintptr_t)l->raddr, TCG_REG_ESP, ofs);
1749    } else {
1750        tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1751        /* The second argument is already loaded with addrlo.  */
1752        tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], oi);
1753        tcg_out_movi(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[3],
1754                     (uintptr_t)l->raddr);
1755    }
1756
1757    tcg_out_call(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);
1758
1759    data_reg = l->datalo_reg;
1760    switch (opc & MO_SSIZE) {
1761    case MO_SB:
1762        tcg_out_ext8s(s, data_reg, TCG_REG_EAX, P_REXW);
1763        break;
1764    case MO_SW:
1765        tcg_out_ext16s(s, data_reg, TCG_REG_EAX, P_REXW);
1766        break;
1767#if TCG_TARGET_REG_BITS == 64
1768    case MO_SL:
1769        tcg_out_ext32s(s, data_reg, TCG_REG_EAX);
1770        break;
1771#endif
1772    case MO_UB:
1773    case MO_UW:
1774        /* Note that the helpers have zero-extended to tcg_target_long.  */
1775    case MO_UL:
1776        tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
1777        break;
1778    case MO_Q:
1779        if (TCG_TARGET_REG_BITS == 64) {
1780            tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX);
1781        } else if (data_reg == TCG_REG_EDX) {
1782            /* xchg %edx, %eax */
1783            tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0);
1784            tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EAX);
1785        } else {
1786            tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
1787            tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EDX);
1788        }
1789        break;
1790    default:
1791        tcg_abort();
1792    }
1793
1794    /* Jump back to the code following the original qemu_ld.  */
1795    tcg_out_jmp(s, l->raddr);
1796}
1797
1798/*
1799 * Generate code for the slow path for a store at the end of the block
1800 */
1801static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1802{
1803    TCGMemOpIdx oi = l->oi;
1804    TCGMemOp opc = get_memop(oi);
1805    TCGMemOp s_bits = opc & MO_SIZE;
1806    tcg_insn_unit **label_ptr = &l->label_ptr[0];
1807    TCGReg retaddr;
1808
1809    /* resolve label address */
1810    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1811    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1812        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1813    }
1814
1815    if (TCG_TARGET_REG_BITS == 32) {
1816        int ofs = 0;
1817
1818        tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1819        ofs += 4;
1820
1821        tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1822        ofs += 4;
1823
1824        if (TARGET_LONG_BITS == 64) {
1825            tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1826            ofs += 4;
1827        }
1828
1829        tcg_out_st(s, TCG_TYPE_I32, l->datalo_reg, TCG_REG_ESP, ofs);
1830        ofs += 4;
1831
1832        if (s_bits == MO_64) {
1833            tcg_out_st(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_ESP, ofs);
1834            ofs += 4;
1835        }
1836
1837        tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
1838        ofs += 4;
1839
1840        retaddr = TCG_REG_EAX;
1841        tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1842        tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP, ofs);
1843    } else {
1844        tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1845        /* The second argument is already loaded with addrlo.  */
1846        tcg_out_mov(s, (s_bits == MO_64 ? TCG_TYPE_I64 : TCG_TYPE_I32),
1847                    tcg_target_call_iarg_regs[2], l->datalo_reg);
1848        tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[3], oi);
1849
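        /* The fifth argument, the return address, is passed in a register
           when the calling convention has enough argument registers;
           otherwise (e.g. Win64) it is stored to the stack argument area.  */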
1850        if (ARRAY_SIZE(tcg_target_call_iarg_regs) > 4) {
1851            retaddr = tcg_target_call_iarg_regs[4];
1852            tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1853        } else {
1854            retaddr = TCG_REG_RAX;
1855            tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1856            tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP,
1857                       TCG_TARGET_CALL_STACK_OFFSET);
1858        }
1859    }
1860
1861    /* "Tail call" to the helper, with the return address back inline.  */
1862    tcg_out_push(s, retaddr);
1863    tcg_out_jmp(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
1864}
1865#elif defined(__x86_64__) && defined(__linux__)
1866# include <asm/prctl.h>
1867# include <sys/prctl.h>
1868
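/* For 64-bit Linux user-only emulation, try to install guest_base as the
   %gs segment base so that guest memory can be addressed with a GS segment
   override instead of an explicit add of the base.  */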
1869int arch_prctl(int code, unsigned long addr);
1870
1871static int guest_base_flags;
1872static inline void setup_guest_base_seg(void)
1873{
1874    if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
1875        guest_base_flags = P_GS;
1876    }
1877}
1878#else
1879# define guest_base_flags 0
1880static inline void setup_guest_base_seg(void) { }
1881#endif /* SOFTMMU */
1882
1883static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
1884                                   TCGReg base, int index, intptr_t ofs,
1885                                   int seg, TCGMemOp memop)
1886{
1887    const TCGMemOp real_bswap = memop & MO_BSWAP;
1888    TCGMemOp bswap = real_bswap;
1889    int movop = OPC_MOVL_GvEv;
1890
1891    if (have_movbe && real_bswap) {
1892        bswap = 0;
1893        movop = OPC_MOVBE_GyMy;
1894    }
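    /* With MOVBE the byte swap is folded into the load itself; without it
       we load normally and byte-swap the result afterwards.  */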
1895
1896    switch (memop & MO_SSIZE) {
1897    case MO_UB:
1898        tcg_out_modrm_sib_offset(s, OPC_MOVZBL + seg, datalo,
1899                                 base, index, 0, ofs);
1900        break;
1901    case MO_SB:
1902        tcg_out_modrm_sib_offset(s, OPC_MOVSBL + P_REXW + seg, datalo,
1903                                 base, index, 0, ofs);
1904        break;
1905    case MO_UW:
1906        tcg_out_modrm_sib_offset(s, OPC_MOVZWL + seg, datalo,
1907                                 base, index, 0, ofs);
1908        if (real_bswap) {
1909            tcg_out_rolw_8(s, datalo);
1910        }
1911        break;
1912    case MO_SW:
1913        if (real_bswap) {
1914            if (have_movbe) {
1915                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
1916                                         datalo, base, index, 0, ofs);
1917            } else {
1918                tcg_out_modrm_sib_offset(s, OPC_MOVZWL + seg, datalo,
1919                                         base, index, 0, ofs);
1920                tcg_out_rolw_8(s, datalo);
1921            }
1922            tcg_out_modrm(s, OPC_MOVSWL + P_REXW, datalo, datalo);
1923        } else {
1924            tcg_out_modrm_sib_offset(s, OPC_MOVSWL + P_REXW + seg,
1925                                     datalo, base, index, 0, ofs);
1926        }
1927        break;
1928    case MO_UL:
1929        tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
1930        if (bswap) {
1931            tcg_out_bswap32(s, datalo);
1932        }
1933        break;
1934#if TCG_TARGET_REG_BITS == 64
1935    case MO_SL:
1936        if (real_bswap) {
1937            tcg_out_modrm_sib_offset(s, movop + seg, datalo,
1938                                     base, index, 0, ofs);
1939            if (bswap) {
1940                tcg_out_bswap32(s, datalo);
1941            }
1942            tcg_out_ext32s(s, datalo, datalo);
1943        } else {
1944            tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + seg, datalo,
1945                                     base, index, 0, ofs);
1946        }
1947        break;
1948#endif
1949    case MO_Q:
1950        if (TCG_TARGET_REG_BITS == 64) {
1951            tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
1952                                     base, index, 0, ofs);
1953            if (bswap) {
1954                tcg_out_bswap64(s, datalo);
1955            }
1956        } else {
1957            if (real_bswap) {
1958                int t = datalo;
1959                datalo = datahi;
1960                datahi = t;
1961            }
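            /* Issue the two loads in an order that does not clobber the
               base register before the second load has used it.  */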
1962            if (base != datalo) {
1963                tcg_out_modrm_sib_offset(s, movop + seg, datalo,
1964                                         base, index, 0, ofs);
1965                tcg_out_modrm_sib_offset(s, movop + seg, datahi,
1966                                         base, index, 0, ofs + 4);
1967            } else {
1968                tcg_out_modrm_sib_offset(s, movop + seg, datahi,
1969                                         base, index, 0, ofs + 4);
1970                tcg_out_modrm_sib_offset(s, movop + seg, datalo,
1971                                         base, index, 0, ofs);
1972            }
1973            if (bswap) {
1974                tcg_out_bswap32(s, datalo);
1975                tcg_out_bswap32(s, datahi);
1976            }
1977        }
1978        break;
1979    default:
1980        tcg_abort();
1981    }
1982}
1983
1984/* XXX: qemu_ld and qemu_st could be modified to clobber only EDX and
1985   EAX.  This will become useful once fixed-register globals are less
1986   common. */
1987static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
1988{
1989    TCGReg datalo, datahi, addrlo;
1990    TCGReg addrhi __attribute__((unused));
1991    TCGMemOpIdx oi;
1992    TCGMemOp opc;
1993#if defined(CONFIG_SOFTMMU)
1994    int mem_index;
1995    tcg_insn_unit *label_ptr[2];
1996#endif
1997
1998    datalo = *args++;
1999    datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
2000    addrlo = *args++;
2001    addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
2002    oi = *args++;
2003    opc = get_memop(oi);
2004
2005#if defined(CONFIG_SOFTMMU)
2006    mem_index = get_mmuidx(oi);
2007
2008    tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
2009                     label_ptr, offsetof(CPUTLBEntry, addr_read));
2010
2011    /* TLB Hit.  */
2012    tcg_out_qemu_ld_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, opc);
2013
2014    /* Record the current context of a load into ldst label */
2015    add_qemu_ldst_label(s, true, oi, datalo, datahi, addrlo, addrhi,
2016                        s->code_ptr, label_ptr);
2017#else
2018    {
2019        int32_t offset = guest_base;
2020        TCGReg base = addrlo;
2021        int index = -1;
2022        int seg = 0;
2023
2024        /* For a 32-bit guest, the high 32 bits of the address register
2025           may contain garbage; the address must be truncated to 32 bits.
2026           We can do that with the ADDR32 prefix if we're not using a guest
2027           base, or when using segmentation.  Otherwise zero-extend manually.  */
2028        if (guest_base == 0 || guest_base_flags) {
2029            seg = guest_base_flags;
2030            offset = 0;
2031            if (TCG_TARGET_REG_BITS > TARGET_LONG_BITS) {
2032                seg |= P_ADDR32;
2033            }
2034        } else if (TCG_TARGET_REG_BITS == 64) {
2035            if (TARGET_LONG_BITS == 32) {
2036                tcg_out_ext32u(s, TCG_REG_L0, base);
2037                base = TCG_REG_L0;
2038            }
2039            if (offset != guest_base) {
2040                tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_L1, guest_base);
2041                index = TCG_REG_L1;
2042                offset = 0;
2043            }
2044        }
2045
2046        tcg_out_qemu_ld_direct(s, datalo, datahi,
2047                               base, index, offset, seg, opc);
2048    }
2049#endif
2050}
2051
2052static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2053                                   TCGReg base, intptr_t ofs, int seg,
2054                                   TCGMemOp memop)
2055{
2056    /* ??? Ideally we wouldn't need a scratch register.  For user-only,
2057       we could perform the bswap twice to restore the original value
2058       instead of moving to the scratch.  But as it is, the L constraint
2059       means that TCG_REG_L0 is definitely free here.  */
2060    const TCGReg scratch = TCG_REG_L0;
2061    const TCGMemOp real_bswap = memop & MO_BSWAP;
2062    TCGMemOp bswap = real_bswap;
2063    int movop = OPC_MOVL_EvGv;
2064
2065    if (have_movbe && real_bswap) {
2066        bswap = 0;
2067        movop = OPC_MOVBE_MyGy;
2068    }
2069
2070    switch (memop & MO_SIZE) {
2071    case MO_8:
2072        /* In 32-bit mode, 8-bit stores can only happen from [abcd]x.
2073           Use the scratch register if necessary.  */
2074        if (TCG_TARGET_REG_BITS == 32 && datalo >= 4) {
2075            tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
2076            datalo = scratch;
2077        }
2078        tcg_out_modrm_offset(s, OPC_MOVB_EvGv + P_REXB_R + seg,
2079                             datalo, base, ofs);
2080        break;
2081    case MO_16:
2082        if (bswap) {
2083            tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
2084            tcg_out_rolw_8(s, scratch);
2085            datalo = scratch;
2086        }
2087        tcg_out_modrm_offset(s, movop + P_DATA16 + seg, datalo, base, ofs);
2088        break;
2089    case MO_32:
2090        if (bswap) {
2091            tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
2092            tcg_out_bswap32(s, scratch);
2093            datalo = scratch;
2094        }
2095        tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
2096        break;
2097    case MO_64:
2098        if (TCG_TARGET_REG_BITS == 64) {
2099            if (bswap) {
2100                tcg_out_mov(s, TCG_TYPE_I64, scratch, datalo);
2101                tcg_out_bswap64(s, scratch);
2102                datalo = scratch;
2103            }
2104            tcg_out_modrm_offset(s, movop + P_REXW + seg, datalo, base, ofs);
2105        } else if (bswap) {
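            /* 32-bit host: byte-swap each half and store the halves in
               exchanged order, producing the byte-reversed 64-bit value
               in memory.  */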
2106            tcg_out_mov(s, TCG_TYPE_I32, scratch, datahi);
2107            tcg_out_bswap32(s, scratch);
2108            tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, scratch, base, ofs);
2109            tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
2110            tcg_out_bswap32(s, scratch);
2111            tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, scratch, base, ofs+4);
2112        } else {
2113            if (real_bswap) {
2114                int t = datalo;
2115                datalo = datahi;
2116                datahi = t;
2117            }
2118            tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
2119            tcg_out_modrm_offset(s, movop + seg, datahi, base, ofs+4);
2120        }
2121        break;
2122    default:
2123        tcg_abort();
2124    }
2125}
2126
2127static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
2128{
2129    TCGReg datalo, datahi, addrlo;
2130    TCGReg addrhi __attribute__((unused));
2131    TCGMemOpIdx oi;
2132    TCGMemOp opc;
2133#if defined(CONFIG_SOFTMMU)
2134    int mem_index;
2135    tcg_insn_unit *label_ptr[2];
2136#endif
2137
2138    datalo = *args++;
2139    datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
2140    addrlo = *args++;
2141    addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
2142    oi = *args++;
2143    opc = get_memop(oi);
2144
2145#if defined(CONFIG_SOFTMMU)
2146    mem_index = get_mmuidx(oi);
2147
2148    tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
2149                     label_ptr, offsetof(CPUTLBEntry, addr_write));
2150
2151    /* TLB Hit.  */
2152    tcg_out_qemu_st_direct(s, datalo, datahi, TCG_REG_L1, 0, 0, opc);
2153
2154    /* Record the current context of a store into ldst label */
2155    add_qemu_ldst_label(s, false, oi, datalo, datahi, addrlo, addrhi,
2156                        s->code_ptr, label_ptr);
2157#else
2158    {
2159        int32_t offset = guest_base;
2160        TCGReg base = addrlo;
2161        int seg = 0;
2162
2163        /* See comment in tcg_out_qemu_ld re zero-extension of addrlo.  */
2164        if (guest_base == 0 || guest_base_flags) {
2165            seg = guest_base_flags;
2166            offset = 0;
2167            if (TCG_TARGET_REG_BITS > TARGET_LONG_BITS) {
2168                seg |= P_ADDR32;
2169            }
2170        } else if (TCG_TARGET_REG_BITS == 64) {
2171            /* ??? Note that we can't use the same SIB addressing scheme
2172               as for loads, since we require L0 free for bswap.  */
2173            if (offset != guest_base) {
2174                if (TARGET_LONG_BITS == 32) {
2175                    tcg_out_ext32u(s, TCG_REG_L0, base);
2176                    base = TCG_REG_L0;
2177                }
2178                tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_L1, guest_base);
2179                tgen_arithr(s, ARITH_ADD + P_REXW, TCG_REG_L1, base);
2180                base = TCG_REG_L1;
2181                offset = 0;
2182            } else if (TARGET_LONG_BITS == 32) {
2183                tcg_out_ext32u(s, TCG_REG_L1, base);
2184                base = TCG_REG_L1;
2185            }
2186        }
2187
2188        tcg_out_qemu_st_direct(s, datalo, datahi, base, offset, seg, opc);
2189    }
2190#endif
2191}
2192
2193static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
2194                              const TCGArg *args, const int *const_args)
2195{
2196    TCGArg a0, a1, a2;
2197    int c, const_a2, vexop, rexw = 0;
2198
2199#if TCG_TARGET_REG_BITS == 64
2200# define OP_32_64(x) \
2201        case glue(glue(INDEX_op_, x), _i64): \
2202            rexw = P_REXW; /* FALLTHRU */    \
2203        case glue(glue(INDEX_op_, x), _i32)
2204#else
2205# define OP_32_64(x) \
2206        case glue(glue(INDEX_op_, x), _i32)
2207#endif
2208
2209    /* Hoist the loads of the most common arguments.  */
2210    a0 = args[0];
2211    a1 = args[1];
2212    a2 = args[2];
2213    const_a2 = const_args[2];
2214
2215    switch (opc) {
2216    case INDEX_op_exit_tb:
2217        /* Reuse the zeroing that exists for goto_ptr.  */
2218        if (a0 == 0) {
2219            tcg_out_jmp(s, s->code_gen_epilogue);
2220        } else {
2221            tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
2222            tcg_out_jmp(s, tb_ret_addr);
2223        }
2224        break;
2225    case INDEX_op_goto_tb:
2226        if (s->tb_jmp_insn_offset) {
2227            /* direct jump method */
2228            int gap;
2229            /* jump displacement must be aligned for atomic patching;
2230             * see if we need to add extra nops before the jump
2231             */
2232            gap = tcg_pcrel_diff(s, QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4));
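            /* gap is in [1, 4]; after emitting gap - 1 bytes of nop, the
               one-byte JMP opcode ends just before a 4-byte boundary, so
               its 32-bit displacement is naturally aligned.  */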
2233            if (gap != 1) {
2234                tcg_out_nopn(s, gap - 1);
2235            }
2236            tcg_out8(s, OPC_JMP_long); /* jmp im */
2237            s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
2238            tcg_out32(s, 0);
2239        } else {
2240            /* indirect jump method */
2241            tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, -1,
2242                                 (intptr_t)(s->tb_jmp_target_addr + a0));
2243        }
2244        set_jmp_reset_offset(s, a0);
2245        break;
2246    case INDEX_op_goto_ptr:
2247        /* jmp to the given host address (could be epilogue) */
2248        tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0);
2249        break;
2250    case INDEX_op_br:
2251        tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
2252        break;
2253    OP_32_64(ld8u):
2254        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2255        tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2);
2256        break;
2257    OP_32_64(ld8s):
2258        tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2);
2259        break;
2260    OP_32_64(ld16u):
2261        /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2262        tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2);
2263        break;
2264    OP_32_64(ld16s):
2265        tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2);
2266        break;
2267#if TCG_TARGET_REG_BITS == 64
2268    case INDEX_op_ld32u_i64:
2269#endif
2270    case INDEX_op_ld_i32:
2271        tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2);
2272        break;
2273
2274    OP_32_64(st8):
2275        if (const_args[0]) {
2276            tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2);
2277            tcg_out8(s, a0);
2278        } else {
2279            tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2);
2280        }
2281        break;
2282    OP_32_64(st16):
2283        if (const_args[0]) {
2284            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2);
2285            tcg_out16(s, a0);
2286        } else {
2287            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2);
2288        }
2289        break;
2290#if TCG_TARGET_REG_BITS == 64
2291    case INDEX_op_st32_i64:
2292#endif
2293    case INDEX_op_st_i32:
2294        if (const_args[0]) {
2295            tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2);
2296            tcg_out32(s, a0);
2297        } else {
2298            tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2);
2299        }
2300        break;
2301
2302    OP_32_64(add):
2303        /* For 3-operand addition, use LEA.  */
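        /* For example, add dest, src, $imm can be emitted as a single
           "lea imm(%src), %dest", which also leaves the flags untouched.  */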
2304        if (a0 != a1) {
2305            TCGArg c3 = 0;
2306            if (const_a2) {
2307                c3 = a2, a2 = -1;
2308            } else if (a0 == a2) {
2309                /* Watch out for dest = src + dest, since we've removed
2310                   the matching constraint on the add.  */
2311                tgen_arithr(s, ARITH_ADD + rexw, a0, a1);
2312                break;
2313            }
2314
2315            tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3);
2316            break;
2317        }
2318        c = ARITH_ADD;
2319        goto gen_arith;
2320    OP_32_64(sub):
2321        c = ARITH_SUB;
2322        goto gen_arith;
2323    OP_32_64(and):
2324        c = ARITH_AND;
2325        goto gen_arith;
2326    OP_32_64(or):
2327        c = ARITH_OR;
2328        goto gen_arith;
2329    OP_32_64(xor):
2330        c = ARITH_XOR;
2331        goto gen_arith;
2332    gen_arith:
2333        if (const_a2) {
2334            tgen_arithi(s, c + rexw, a0, a2, 0);
2335        } else {
2336            tgen_arithr(s, c + rexw, a0, a2);
2337        }
2338        break;
2339
2340    OP_32_64(andc):
2341        if (const_a2) {
2342            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2343            tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0);
2344        } else {
2345            tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1);
2346        }
2347        break;
2348
2349    OP_32_64(mul):
2350        if (const_a2) {
2351            int32_t val;
2352            val = a2;
2353            if (val == (int8_t)val) {
2354                tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0);
2355                tcg_out8(s, val);
2356            } else {
2357                tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0);
2358                tcg_out32(s, val);
2359            }
2360        } else {
2361            tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2);
2362        }
2363        break;
2364
2365    OP_32_64(div2):
2366        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]);
2367        break;
2368    OP_32_64(divu2):
2369        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]);
2370        break;
2371
2372    OP_32_64(shl):
2373        /* For small constant 3-operand shift, use LEA.  */
2374        if (const_a2 && a0 != a1 && (a2 - 1) < 3) {
2375            if (a2 - 1 == 0) {
2376                /* shl $1,a1,a0 -> lea (a1,a1),a0 */
2377                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0);
2378            } else {
2379                /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */
2380                tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0);
2381            }
2382            break;
2383        }
2384        c = SHIFT_SHL;
2385        vexop = OPC_SHLX;
2386        goto gen_shift_maybe_vex;
2387    OP_32_64(shr):
2388        c = SHIFT_SHR;
2389        vexop = OPC_SHRX;
2390        goto gen_shift_maybe_vex;
2391    OP_32_64(sar):
2392        c = SHIFT_SAR;
2393        vexop = OPC_SARX;
2394        goto gen_shift_maybe_vex;
2395    OP_32_64(rotl):
2396        c = SHIFT_ROL;
2397        goto gen_shift;
2398    OP_32_64(rotr):
2399        c = SHIFT_ROR;
2400        goto gen_shift;
2401    gen_shift_maybe_vex:
2402        if (have_bmi2) {
2403            if (!const_a2) {
2404                tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1);
2405                break;
2406            }
2407            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2408        }
2409        /* FALLTHRU */
2410    gen_shift:
2411        if (const_a2) {
2412            tcg_out_shifti(s, c + rexw, a0, a2);
2413        } else {
2414            tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0);
2415        }
2416        break;
2417
2418    OP_32_64(ctz):
2419        tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
2420        break;
2421    OP_32_64(clz):
2422        tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
2423        break;
2424    OP_32_64(ctpop):
2425        tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1);
2426        break;
2427
2428    case INDEX_op_brcond_i32:
2429        tcg_out_brcond32(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2430        break;
2431    case INDEX_op_setcond_i32:
2432        tcg_out_setcond32(s, args[3], a0, a1, a2, const_a2);
2433        break;
2434    case INDEX_op_movcond_i32:
2435        tcg_out_movcond32(s, args[5], a0, a1, a2, const_a2, args[3]);
2436        break;
2437
2438    OP_32_64(bswap16):
2439        tcg_out_rolw_8(s, a0);
2440        break;
2441    OP_32_64(bswap32):
2442        tcg_out_bswap32(s, a0);
2443        break;
2444
2445    OP_32_64(neg):
2446        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0);
2447        break;
2448    OP_32_64(not):
2449        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0);
2450        break;
2451
2452    OP_32_64(ext8s):
2453        tcg_out_ext8s(s, a0, a1, rexw);
2454        break;
2455    OP_32_64(ext16s):
2456        tcg_out_ext16s(s, a0, a1, rexw);
2457        break;
2458    OP_32_64(ext8u):
2459        tcg_out_ext8u(s, a0, a1);
2460        break;
2461    OP_32_64(ext16u):
2462        tcg_out_ext16u(s, a0, a1);
2463        break;
2464
2465    case INDEX_op_qemu_ld_i32:
2466        tcg_out_qemu_ld(s, args, 0);
2467        break;
2468    case INDEX_op_qemu_ld_i64:
2469        tcg_out_qemu_ld(s, args, 1);
2470        break;
2471    case INDEX_op_qemu_st_i32:
2472        tcg_out_qemu_st(s, args, 0);
2473        break;
2474    case INDEX_op_qemu_st_i64:
2475        tcg_out_qemu_st(s, args, 1);
2476        break;
2477
2478    OP_32_64(mulu2):
2479        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
2480        break;
2481    OP_32_64(muls2):
2482        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]);
2483        break;
2484    OP_32_64(add2):
2485        if (const_args[4]) {
2486            tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1);
2487        } else {
2488            tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]);
2489        }
2490        if (const_args[5]) {
2491            tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1);
2492        } else {
2493            tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]);
2494        }
2495        break;
2496    OP_32_64(sub2):
2497        if (const_args[4]) {
2498            tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1);
2499        } else {
2500            tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]);
2501        }
2502        if (const_args[5]) {
2503            tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1);
2504        } else {
2505            tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]);
2506        }
2507        break;
2508
2509#if TCG_TARGET_REG_BITS == 32
2510    case INDEX_op_brcond2_i32:
2511        tcg_out_brcond2(s, args, const_args, 0);
2512        break;
2513    case INDEX_op_setcond2_i32:
2514        tcg_out_setcond2(s, args, const_args);
2515        break;
2516#else /* TCG_TARGET_REG_BITS == 64 */
2517    case INDEX_op_ld32s_i64:
2518        tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2);
2519        break;
2520    case INDEX_op_ld_i64:
2521        tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2);
2522        break;
2523    case INDEX_op_st_i64:
2524        if (const_args[0]) {
2525            tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2);
2526            tcg_out32(s, a0);
2527        } else {
2528            tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2);
2529        }
2530        break;
2531
2532    case INDEX_op_brcond_i64:
2533        tcg_out_brcond64(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2534        break;
2535    case INDEX_op_setcond_i64:
2536        tcg_out_setcond64(s, args[3], a0, a1, a2, const_a2);
2537        break;
2538    case INDEX_op_movcond_i64:
2539        tcg_out_movcond64(s, args[5], a0, a1, a2, const_a2, args[3]);
2540        break;
2541
2542    case INDEX_op_bswap64_i64:
2543        tcg_out_bswap64(s, a0);
2544        break;
2545    case INDEX_op_extu_i32_i64:
2546    case INDEX_op_ext32u_i64:
2547        tcg_out_ext32u(s, a0, a1);
2548        break;
2549    case INDEX_op_ext_i32_i64:
2550    case INDEX_op_ext32s_i64:
2551        tcg_out_ext32s(s, a0, a1);
2552        break;
2553#endif
2554
2555    OP_32_64(deposit):
2556        if (args[3] == 0 && args[4] == 8) {
2557            /* load bits 0..7 */
2558            tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
2559        } else if (args[3] == 8 && args[4] == 8) {
2560            /* load bits 8..15 */
2561            tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
2562        } else if (args[3] == 0 && args[4] == 16) {
2563            /* load bits 0..15 */
2564            tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
2565        } else {
2566            tcg_abort();
2567        }
2568        break;
2569
2570    case INDEX_op_extract_i64:
2571        if (a2 + args[3] == 32) {
2572            /* This is a 32-bit zero-extending right shift.  */
2573            tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
2574            tcg_out_shifti(s, SHIFT_SHR, a0, a2);
2575            break;
2576        }
2577        /* FALLTHRU */
2578    case INDEX_op_extract_i32:
2579        /* Use a high-byte register on the off-chance that one is usable here.
2580           Otherwise we emit the same ext16 + shift pattern that we
2581           would have gotten from the normal tcg-op.c expansion.  */
2582        tcg_debug_assert(a2 == 8 && args[3] == 8);
2583        if (a1 < 4 && a0 < 8) {
2584            tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
2585        } else {
2586            tcg_out_ext16u(s, a0, a1);
2587            tcg_out_shifti(s, SHIFT_SHR, a0, 8);
2588        }
2589        break;
2590
2591    case INDEX_op_sextract_i32:
2592        /* We don't implement sextract_i64, as we cannot sign-extend to
2593           64-bits without using the REX prefix that explicitly excludes
2594           access to the high-byte registers.  */
2595        tcg_debug_assert(a2 == 8 && args[3] == 8);
2596        if (a1 < 4 && a0 < 8) {
2597            tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
2598        } else {
2599            tcg_out_ext16s(s, a0, a1, 0);
2600            tcg_out_shifti(s, SHIFT_SAR, a0, 8);
2601        }
2602        break;
2603
2604    case INDEX_op_mb:
2605        tcg_out_mb(s, a0);
2606        break;
2607    case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
2608    case INDEX_op_mov_i64:
2609    case INDEX_op_mov_vec:
2610    case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi.  */
2611    case INDEX_op_movi_i64:
2612    case INDEX_op_dupi_vec:
2613    case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
2614    default:
2615        tcg_abort();
2616    }
2617
2618#undef OP_32_64
2619}
2620
2621static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
2622                           unsigned vecl, unsigned vece,
2623                           const TCGArg *args, const int *const_args)
2624{
2625    static int const add_insn[4] = {
2626        OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ
2627    };
2628    static int const sub_insn[4] = {
2629        OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ
2630    };
2631    static int const mul_insn[4] = {
2632        OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_UD2
2633    };
2634    static int const shift_imm_insn[4] = {
2635        OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib
2636    };
2637    static int const cmpeq_insn[4] = {
2638        OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ
2639    };
2640    static int const cmpgt_insn[4] = {
2641        OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ
2642    };
2643    static int const punpckl_insn[4] = {
2644        OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ
2645    };
2646    static int const punpckh_insn[4] = {
2647        OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ
2648    };
2649    static int const packss_insn[4] = {
2650        OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2
2651    };
2652    static int const packus_insn[4] = {
2653        OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2
2654    };
2655
2656    TCGType type = vecl + TCG_TYPE_V64;
2657    int insn, sub;
2658    TCGArg a0, a1, a2;
2659
2660    a0 = args[0];
2661    a1 = args[1];
2662    a2 = args[2];
2663
2664    switch (opc) {
2665    case INDEX_op_add_vec:
2666        insn = add_insn[vece];
2667        goto gen_simd;
2668    case INDEX_op_sub_vec:
2669        insn = sub_insn[vece];
2670        goto gen_simd;
2671    case INDEX_op_mul_vec:
2672        insn = mul_insn[vece];
2673        goto gen_simd;
2674    case INDEX_op_and_vec:
2675        insn = OPC_PAND;
2676        goto gen_simd;
2677    case INDEX_op_or_vec:
2678        insn = OPC_POR;
2679        goto gen_simd;
2680    case INDEX_op_xor_vec:
2681        insn = OPC_PXOR;
2682        goto gen_simd;
2683    case INDEX_op_x86_punpckl_vec:
2684        insn = punpckl_insn[vece];
2685        goto gen_simd;
2686    case INDEX_op_x86_punpckh_vec:
2687        insn = punpckh_insn[vece];
2688        goto gen_simd;
2689    case INDEX_op_x86_packss_vec:
2690        insn = packss_insn[vece];
2691        goto gen_simd;
2692    case INDEX_op_x86_packus_vec:
2693        insn = packus_insn[vece];
2694        goto gen_simd;
2695#if TCG_TARGET_REG_BITS == 32
2696    case INDEX_op_dup2_vec:
2697        /* Constraints have already placed both 32-bit inputs in xmm regs.  */
2698        insn = OPC_PUNPCKLDQ;
2699        goto gen_simd;
2700#endif
2701    gen_simd:
2702        tcg_debug_assert(insn != OPC_UD2);
2703        if (type == TCG_TYPE_V256) {
2704            insn |= P_VEXL;
2705        }
2706        tcg_out_vex_modrm(s, insn, a0, a1, a2);
2707        break;
2708
2709    case INDEX_op_cmp_vec:
2710        sub = args[3];
2711        if (sub == TCG_COND_EQ) {
2712            insn = cmpeq_insn[vece];
2713        } else if (sub == TCG_COND_GT) {
2714            insn = cmpgt_insn[vece];
2715        } else {
2716            g_assert_not_reached();
2717        }
2718        goto gen_simd;
2719
2720    case INDEX_op_andc_vec:
2721        insn = OPC_PANDN;
2722        if (type == TCG_TYPE_V256) {
2723            insn |= P_VEXL;
2724        }
2725        tcg_out_vex_modrm(s, insn, a0, a2, a1);
2726        break;
2727
2728    case INDEX_op_shli_vec:
2729        sub = 6;
2730        goto gen_shift;
2731    case INDEX_op_shri_vec:
2732        sub = 2;
2733        goto gen_shift;
2734    case INDEX_op_sari_vec:
2735        tcg_debug_assert(vece != MO_64);
2736        sub = 4;
2737    gen_shift:
2738        tcg_debug_assert(vece != MO_8);
2739        insn = shift_imm_insn[vece];
2740        if (type == TCG_TYPE_V256) {
2741            insn |= P_VEXL;
2742        }
2743        tcg_out_vex_modrm(s, insn, sub, a0, a1);
2744        tcg_out8(s, a2);
2745        break;
2746
2747    case INDEX_op_ld_vec:
2748        tcg_out_ld(s, type, a0, a1, a2);
2749        break;
2750    case INDEX_op_st_vec:
2751        tcg_out_st(s, type, a0, a1, a2);
2752        break;
2753    case INDEX_op_dup_vec:
2754        tcg_out_dup_vec(s, type, vece, a0, a1);
2755        break;
2756
2757    case INDEX_op_x86_shufps_vec:
2758        insn = OPC_SHUFPS;
2759        sub = args[3];
2760        goto gen_simd_imm8;
2761    case INDEX_op_x86_blend_vec:
2762        if (vece == MO_16) {
2763            insn = OPC_PBLENDW;
2764        } else if (vece == MO_32) {
2765            insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS);
2766        } else {
2767            g_assert_not_reached();
2768        }
2769        sub = args[3];
2770        goto gen_simd_imm8;
2771    case INDEX_op_x86_vperm2i128_vec:
2772        insn = OPC_VPERM2I128;
2773        sub = args[3];
2774        goto gen_simd_imm8;
2775    gen_simd_imm8:
2776        if (type == TCG_TYPE_V256) {
2777            insn |= P_VEXL;
2778        }
2779        tcg_out_vex_modrm(s, insn, a0, a1, a2);
2780        tcg_out8(s, sub);
2781        break;
2782
2783    case INDEX_op_x86_vpblendvb_vec:
2784        insn = OPC_VPBLENDVB;
2785        if (type == TCG_TYPE_V256) {
2786            insn |= P_VEXL;
2787        }
2788        tcg_out_vex_modrm(s, insn, a0, a1, a2);
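        /* VPBLENDVB takes its mask register in the high four bits of a
           trailing immediate byte (the is4 encoding).  */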
2789        tcg_out8(s, args[3] << 4);
2790        break;
2791
2792    case INDEX_op_x86_psrldq_vec:
2793        tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1);
2794        tcg_out8(s, a2);
2795        break;
2796
2797    default:
2798        g_assert_not_reached();
2799    }
2800}
2801
2802static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
2803{
2804    static const TCGTargetOpDef r = { .args_ct_str = { "r" } };
2805    static const TCGTargetOpDef ri_r = { .args_ct_str = { "ri", "r" } };
2806    static const TCGTargetOpDef re_r = { .args_ct_str = { "re", "r" } };
2807    static const TCGTargetOpDef qi_r = { .args_ct_str = { "qi", "r" } };
2808    static const TCGTargetOpDef r_r = { .args_ct_str = { "r", "r" } };
2809    static const TCGTargetOpDef r_q = { .args_ct_str = { "r", "q" } };
2810    static const TCGTargetOpDef r_re = { .args_ct_str = { "r", "re" } };
2811    static const TCGTargetOpDef r_0 = { .args_ct_str = { "r", "0" } };
2812    static const TCGTargetOpDef r_r_ri = { .args_ct_str = { "r", "r", "ri" } };
2813    static const TCGTargetOpDef r_r_re = { .args_ct_str = { "r", "r", "re" } };
2814    static const TCGTargetOpDef r_0_re = { .args_ct_str = { "r", "0", "re" } };
2815    static const TCGTargetOpDef r_0_ci = { .args_ct_str = { "r", "0", "ci" } };
2816    static const TCGTargetOpDef r_L = { .args_ct_str = { "r", "L" } };
2817    static const TCGTargetOpDef L_L = { .args_ct_str = { "L", "L" } };
2818    static const TCGTargetOpDef r_L_L = { .args_ct_str = { "r", "L", "L" } };
2819    static const TCGTargetOpDef r_r_L = { .args_ct_str = { "r", "r", "L" } };
2820    static const TCGTargetOpDef L_L_L = { .args_ct_str = { "L", "L", "L" } };
2821    static const TCGTargetOpDef r_r_L_L
2822        = { .args_ct_str = { "r", "r", "L", "L" } };
2823    static const TCGTargetOpDef L_L_L_L
2824        = { .args_ct_str = { "L", "L", "L", "L" } };
2825    static const TCGTargetOpDef x_x = { .args_ct_str = { "x", "x" } };
2826    static const TCGTargetOpDef x_x_x = { .args_ct_str = { "x", "x", "x" } };
2827    static const TCGTargetOpDef x_x_x_x
2828        = { .args_ct_str = { "x", "x", "x", "x" } };
2829    static const TCGTargetOpDef x_r = { .args_ct_str = { "x", "r" } };
2830
2831    switch (op) {
2832    case INDEX_op_goto_ptr:
2833        return &r;
2834
2835    case INDEX_op_ld8u_i32:
2836    case INDEX_op_ld8u_i64:
2837    case INDEX_op_ld8s_i32:
2838    case INDEX_op_ld8s_i64:
2839    case INDEX_op_ld16u_i32:
2840    case INDEX_op_ld16u_i64:
2841    case INDEX_op_ld16s_i32:
2842    case INDEX_op_ld16s_i64:
2843    case INDEX_op_ld_i32:
2844    case INDEX_op_ld32u_i64:
2845    case INDEX_op_ld32s_i64:
2846    case INDEX_op_ld_i64:
2847        return &r_r;
2848
2849    case INDEX_op_st8_i32:
2850    case INDEX_op_st8_i64:
2851        return &qi_r;
2852    case INDEX_op_st16_i32:
2853    case INDEX_op_st16_i64:
2854    case INDEX_op_st_i32:
2855    case INDEX_op_st32_i64:
2856        return &ri_r;
2857    case INDEX_op_st_i64:
2858        return &re_r;
2859
2860    case INDEX_op_add_i32:
2861    case INDEX_op_add_i64:
2862        return &r_r_re;
2863    case INDEX_op_sub_i32:
2864    case INDEX_op_sub_i64:
2865    case INDEX_op_mul_i32:
2866    case INDEX_op_mul_i64:
2867    case INDEX_op_or_i32:
2868    case INDEX_op_or_i64:
2869    case INDEX_op_xor_i32:
2870    case INDEX_op_xor_i64:
2871        return &r_0_re;
2872
2873    case INDEX_op_and_i32:
2874    case INDEX_op_and_i64:
2875        {
2876            static const TCGTargetOpDef and
2877                = { .args_ct_str = { "r", "0", "reZ" } };
2878            return &and;
2879        }
2880        break;
2881    case INDEX_op_andc_i32:
2882    case INDEX_op_andc_i64:
2883        {
2884            static const TCGTargetOpDef andc
2885                = { .args_ct_str = { "r", "r", "rI" } };
2886            return &andc;
2887        }
2888        break;
2889
2890    case INDEX_op_shl_i32:
2891    case INDEX_op_shl_i64:
2892    case INDEX_op_shr_i32:
2893    case INDEX_op_shr_i64:
2894    case INDEX_op_sar_i32:
2895    case INDEX_op_sar_i64:
2896        return have_bmi2 ? &r_r_ri : &r_0_ci;
2897    case INDEX_op_rotl_i32:
2898    case INDEX_op_rotl_i64:
2899    case INDEX_op_rotr_i32:
2900    case INDEX_op_rotr_i64:
2901        return &r_0_ci;
2902
2903    case INDEX_op_brcond_i32:
2904    case INDEX_op_brcond_i64:
2905        return &r_re;
2906
2907    case INDEX_op_bswap16_i32:
2908    case INDEX_op_bswap16_i64:
2909    case INDEX_op_bswap32_i32:
2910    case INDEX_op_bswap32_i64:
2911    case INDEX_op_bswap64_i64:
2912    case INDEX_op_neg_i32:
2913    case INDEX_op_neg_i64:
2914    case INDEX_op_not_i32:
2915    case INDEX_op_not_i64:
2916        return &r_0;
2917
2918    case INDEX_op_ext8s_i32:
2919    case INDEX_op_ext8s_i64:
2920    case INDEX_op_ext8u_i32:
2921    case INDEX_op_ext8u_i64:
2922        return &r_q;
2923    case INDEX_op_ext16s_i32:
2924    case INDEX_op_ext16s_i64:
2925    case INDEX_op_ext16u_i32:
2926    case INDEX_op_ext16u_i64:
2927    case INDEX_op_ext32s_i64:
2928    case INDEX_op_ext32u_i64:
2929    case INDEX_op_ext_i32_i64:
2930    case INDEX_op_extu_i32_i64:
2931    case INDEX_op_extract_i32:
2932    case INDEX_op_extract_i64:
2933    case INDEX_op_sextract_i32:
2934    case INDEX_op_ctpop_i32:
2935    case INDEX_op_ctpop_i64:
2936        return &r_r;
2937
2938    case INDEX_op_deposit_i32:
2939    case INDEX_op_deposit_i64:
2940        {
2941            static const TCGTargetOpDef dep
2942                = { .args_ct_str = { "Q", "0", "Q" } };
2943            return &dep;
2944        }
2945    case INDEX_op_setcond_i32:
2946    case INDEX_op_setcond_i64:
2947        {
2948            static const TCGTargetOpDef setc
2949                = { .args_ct_str = { "q", "r", "re" } };
2950            return &setc;
2951        }
2952    case INDEX_op_movcond_i32:
2953    case INDEX_op_movcond_i64:
2954        {
2955            static const TCGTargetOpDef movc
2956                = { .args_ct_str = { "r", "r", "re", "r", "0" } };
2957            return &movc;
2958        }
2959    case INDEX_op_div2_i32:
2960    case INDEX_op_div2_i64:
2961    case INDEX_op_divu2_i32:
2962    case INDEX_op_divu2_i64:
2963        {
2964            static const TCGTargetOpDef div2
2965                = { .args_ct_str = { "a", "d", "0", "1", "r" } };
2966            return &div2;
2967        }
2968    case INDEX_op_mulu2_i32:
2969    case INDEX_op_mulu2_i64:
2970    case INDEX_op_muls2_i32:
2971    case INDEX_op_muls2_i64:
2972        {
2973            static const TCGTargetOpDef mul2
2974                = { .args_ct_str = { "a", "d", "a", "r" } };
2975            return &mul2;
2976        }
2977    case INDEX_op_add2_i32:
2978    case INDEX_op_add2_i64:
2979    case INDEX_op_sub2_i32:
2980    case INDEX_op_sub2_i64:
2981        {
2982            static const TCGTargetOpDef arith2
2983                = { .args_ct_str = { "r", "r", "0", "1", "re", "re" } };
2984            return &arith2;
2985        }
2986    case INDEX_op_ctz_i32:
2987    case INDEX_op_ctz_i64:
2988        {
2989            static const TCGTargetOpDef ctz[2] = {
2990                { .args_ct_str = { "&r", "r", "r" } },
2991                { .args_ct_str = { "&r", "r", "rW" } },
2992            };
2993            return &ctz[have_bmi1];
2994        }
2995    case INDEX_op_clz_i32:
2996    case INDEX_op_clz_i64:
2997        {
2998            static const TCGTargetOpDef clz[2] = {
2999                { .args_ct_str = { "&r", "r", "r" } },
3000                { .args_ct_str = { "&r", "r", "rW" } },
3001            };
3002            return &clz[have_lzcnt];
3003        }
3004
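    /* The qemu_ld/st constraints depend on how many host registers the
       guest address and data require: one when the value fits in a host
       register, two otherwise.  */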
3005    case INDEX_op_qemu_ld_i32:
3006        return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_L : &r_L_L;
3007    case INDEX_op_qemu_st_i32:
3008        return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &L_L : &L_L_L;
3009    case INDEX_op_qemu_ld_i64:
3010        return (TCG_TARGET_REG_BITS == 64 ? &r_L
3011                : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_r_L
3012                : &r_r_L_L);
3013    case INDEX_op_qemu_st_i64:
3014        return (TCG_TARGET_REG_BITS == 64 ? &L_L
3015                : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &L_L_L
3016                : &L_L_L_L);
3017
3018    case INDEX_op_brcond2_i32:
3019        {
3020            static const TCGTargetOpDef b2
3021                = { .args_ct_str = { "r", "r", "ri", "ri" } };
3022            return &b2;
3023        }
3024    case INDEX_op_setcond2_i32:
3025        {
3026            static const TCGTargetOpDef s2
3027                = { .args_ct_str = { "r", "r", "r", "ri", "ri" } };
3028            return &s2;
3029        }
3030
3031    case INDEX_op_ld_vec:
3032    case INDEX_op_st_vec:
3033        return &x_r;
3034
3035    case INDEX_op_add_vec:
3036    case INDEX_op_sub_vec:
3037    case INDEX_op_mul_vec:
3038    case INDEX_op_and_vec:
3039    case INDEX_op_or_vec:
3040    case INDEX_op_xor_vec:
3041    case INDEX_op_andc_vec:
3042    case INDEX_op_cmp_vec:
3043    case INDEX_op_x86_shufps_vec:
3044    case INDEX_op_x86_blend_vec:
3045    case INDEX_op_x86_packss_vec:
3046    case INDEX_op_x86_packus_vec:
3047    case INDEX_op_x86_vperm2i128_vec:
3048    case INDEX_op_x86_punpckl_vec:
3049    case INDEX_op_x86_punpckh_vec:
3050#if TCG_TARGET_REG_BITS == 32
3051    case INDEX_op_dup2_vec:
3052#endif
3053        return &x_x_x;
3054    case INDEX_op_dup_vec:
3055    case INDEX_op_shli_vec:
3056    case INDEX_op_shri_vec:
3057    case INDEX_op_sari_vec:
3058    case INDEX_op_x86_psrldq_vec:
3059        return &x_x;
3060    case INDEX_op_x86_vpblendvb_vec:
3061        return &x_x_x_x;
3062
3063    default:
3064        break;
3065    }
3066    return NULL;
3067}
3068
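/* Return 1 if the vector opcode is supported directly for this element
   size, 0 if it is not supported at all, and -1 if it should instead be
   expanded via tcg_expand_vec_op below.  */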
3069int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
3070{
3071    switch (opc) {
3072    case INDEX_op_add_vec:
3073    case INDEX_op_sub_vec:
3074    case INDEX_op_and_vec:
3075    case INDEX_op_or_vec:
3076    case INDEX_op_xor_vec:
3077    case INDEX_op_andc_vec:
3078        return 1;
3079    case INDEX_op_cmp_vec:
3080        return -1;
3081
3082    case INDEX_op_shli_vec:
3083    case INDEX_op_shri_vec:
3084        /* We must expand the operation for MO_8.  */
3085        return vece == MO_8 ? -1 : 1;
3086
3087    case INDEX_op_sari_vec:
3088        /* We must expand the operation for MO_8.  */
3089        if (vece == MO_8) {
3090            return -1;
3091        }
3092        /* We can emulate this for MO_64, but it does not pay off
3093           unless we're producing at least 4 values.  */
3094        if (vece == MO_64) {
3095            return type >= TCG_TYPE_V256 ? -1 : 0;
3096        }
3097        return 1;
3098
3099    case INDEX_op_mul_vec:
3100        if (vece == MO_8) {
3101            /* We can expand the operation for MO_8.  */
3102            return -1;
3103        }
3104        if (vece == MO_64) {
3105            return 0;
3106        }
3107        return 1;
3108
3109    default:
3110        return 0;
3111    }
3112}
3113
3114void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
3115                       TCGArg a0, ...)
3116{
3117    va_list va;
3118    TCGArg a1, a2;
3119    TCGv_vec v0, t1, t2, t3, t4;
3120
3121    va_start(va, a0);
3122    v0 = temp_tcgv_vec(arg_temp(a0));
3123
3124    switch (opc) {
3125    case INDEX_op_shli_vec:
3126    case INDEX_op_shri_vec:
3127        tcg_debug_assert(vece == MO_8);
3128        a1 = va_arg(va, TCGArg);
3129        a2 = va_arg(va, TCGArg);
3130        /* Unpack to W, shift, and repack.  Tricky bits:
3131           (1) Use punpck*bw x,x to produce DDCCBBAA,
3132               i.e. duplicate into the other half of the 16-bit lane.
3133           (2) For right-shift, add 8 so that the high half of
3134               the lane becomes zero.  For left-shift, we must
3135               shift up and down again.
3136           (3) Step 2 leaves high half zero such that PACKUSWB
3137               (pack with unsigned saturation) does not modify
3138               the quantity.  */
3139        t1 = tcg_temp_new_vec(type);
3140        t2 = tcg_temp_new_vec(type);
3141        vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3142                  tcgv_vec_arg(t1), a1, a1);
3143        vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3144                  tcgv_vec_arg(t2), a1, a1);
3145        if (opc == INDEX_op_shri_vec) {
3146            vec_gen_3(INDEX_op_shri_vec, type, MO_16,
3147                      tcgv_vec_arg(t1), tcgv_vec_arg(t1), a2 + 8);
3148            vec_gen_3(INDEX_op_shri_vec, type, MO_16,
3149                      tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2 + 8);
3150        } else {
3151            vec_gen_3(INDEX_op_shli_vec, type, MO_16,
3152                      tcgv_vec_arg(t1), tcgv_vec_arg(t1), a2 + 8);
3153            vec_gen_3(INDEX_op_shli_vec, type, MO_16,
3154                      tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2 + 8);
3155            vec_gen_3(INDEX_op_shri_vec, type, MO_16,
3156                      tcgv_vec_arg(t1), tcgv_vec_arg(t1), 8);
3157            vec_gen_3(INDEX_op_shri_vec, type, MO_16,
3158                      tcgv_vec_arg(t2), tcgv_vec_arg(t2), 8);
3159        }
3160        vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3161                  a0, tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3162        tcg_temp_free_vec(t1);
3163        tcg_temp_free_vec(t2);
3164        break;
3165
3166    case INDEX_op_sari_vec:
3167        a1 = va_arg(va, TCGArg);
3168        a2 = va_arg(va, TCGArg);
3169        if (vece == MO_8) {
3170            /* Unpack to W, shift, and repack, as above.  */
3171            t1 = tcg_temp_new_vec(type);
3172            t2 = tcg_temp_new_vec(type);
3173            vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3174                      tcgv_vec_arg(t1), a1, a1);
3175            vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3176                      tcgv_vec_arg(t2), a1, a1);
3177            vec_gen_3(INDEX_op_sari_vec, type, MO_16,
3178                      tcgv_vec_arg(t1), tcgv_vec_arg(t1), a2 + 8);
3179            vec_gen_3(INDEX_op_sari_vec, type, MO_16,
3180                      tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2 + 8);
3181            vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
3182                      a0, tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3183            tcg_temp_free_vec(t1);
3184            tcg_temp_free_vec(t2);
3185            break;
3186        }
3187        tcg_debug_assert(vece == MO_64);
3188        /* MO_64: If the shift is <= 32, we can emulate the sign extend by
3189           performing an arithmetic 32-bit shift and overwriting the high
3190           half of the result (note that the ISA says shift of 32 is valid). */
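        /* The 64-bit logical shift already produces the correct low dword
           of each lane, and the 32-bit arithmetic shift produces the
           correct, sign-extended high dword.  The blend mask 0xaa
           (binary 10101010) then selects the odd (high) dwords from t1
           and keeps the even (low) dwords of the logically shifted a0.  */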
3191        if (a2 <= 32) {
3192            t1 = tcg_temp_new_vec(type);
3193            vec_gen_3(INDEX_op_sari_vec, type, MO_32, tcgv_vec_arg(t1), a1, a2);
3194            vec_gen_3(INDEX_op_shri_vec, type, MO_64, a0, a1, a2);
3195            vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
3196                      a0, a0, tcgv_vec_arg(t1), 0xaa);
3197            tcg_temp_free_vec(t1);
3198            break;
3199        }
3200        /* Otherwise we will need to use a compare vs 0 to produce the
3201           sign-extend, shift and merge.  */
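        /* t1 becomes all-ones in each lane where a1 is negative (0 > a1)
           and zero elsewhere; shifting it left by 64 - a2 places sign
           bits in the top a2 bit positions, which the OR merges into the
           logically shifted result.  */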
3202        t1 = tcg_temp_new_vec(type);
3203        t2 = tcg_const_zeros_vec(type);
3204        vec_gen_4(INDEX_op_cmp_vec, type, MO_64,
3205                  tcgv_vec_arg(t1), tcgv_vec_arg(t2), a1, TCG_COND_GT);
3206        tcg_temp_free_vec(t2);
3207        vec_gen_3(INDEX_op_shri_vec, type, MO_64, a0, a1, a2);
3208        vec_gen_3(INDEX_op_shli_vec, type, MO_64,
3209                  tcgv_vec_arg(t1), tcgv_vec_arg(t1), 64 - a2);
3210        vec_gen_3(INDEX_op_or_vec, type, MO_64, a0, a0, tcgv_vec_arg(t1));
3211        tcg_temp_free_vec(t1);
3212        break;
3213
3214    case INDEX_op_mul_vec:
3215        tcg_debug_assert(vece == MO_8);
3216        a1 = va_arg(va, TCGArg);
3217        a2 = va_arg(va, TCGArg);
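        /* Widen each byte to a 16-bit lane by interleaving with zero,
           placing one operand in the low byte and the other in the high
           byte of the lane.  The 16-bit product then carries the low
           8 bits of the byte product in its high byte (the remaining
           bits overflow out of the lane), so shifting right by 8 and
           repacking with unsigned saturation yields the byte multiply.  */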
3218        switch (type) {
3219        case TCG_TYPE_V64:
3220            t1 = tcg_temp_new_vec(TCG_TYPE_V128);
3221            t2 = tcg_temp_new_vec(TCG_TYPE_V128);
3222            tcg_gen_dup16i_vec(t2, 0);
3223            vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3224                      tcgv_vec_arg(t1), a1, tcgv_vec_arg(t2));
3225            vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3226                      tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2);
3227            tcg_gen_mul_vec(MO_16, t1, t1, t2);
3228            tcg_gen_shri_vec(MO_16, t1, t1, 8);
3229            vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
3230                      a0, tcgv_vec_arg(t1), tcgv_vec_arg(t1));
3231            tcg_temp_free_vec(t1);
3232            tcg_temp_free_vec(t2);
3233            break;
3234
3235        case TCG_TYPE_V128:
3236            t1 = tcg_temp_new_vec(TCG_TYPE_V128);
3237            t2 = tcg_temp_new_vec(TCG_TYPE_V128);
3238            t3 = tcg_temp_new_vec(TCG_TYPE_V128);
3239            t4 = tcg_temp_new_vec(TCG_TYPE_V128);
3240            tcg_gen_dup16i_vec(t4, 0);
3241            vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3242                      tcgv_vec_arg(t1), a1, tcgv_vec_arg(t4));
3243            vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3244                      tcgv_vec_arg(t2), tcgv_vec_arg(t4), a2);
3245            vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V128, MO_8,
3246                      tcgv_vec_arg(t3), a1, tcgv_vec_arg(t4));
3247            vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V128, MO_8,
3248                      tcgv_vec_arg(t4), tcgv_vec_arg(t4), a2);
3249            tcg_gen_mul_vec(MO_16, t1, t1, t2);
3250            tcg_gen_mul_vec(MO_16, t3, t3, t4);
3251            tcg_gen_shri_vec(MO_16, t1, t1, 8);
3252            tcg_gen_shri_vec(MO_16, t3, t3, 8);
3253            vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
3254                      a0, tcgv_vec_arg(t1), tcgv_vec_arg(t3));
3255            tcg_temp_free_vec(t1);
3256            tcg_temp_free_vec(t2);
3257            tcg_temp_free_vec(t3);
3258            tcg_temp_free_vec(t4);
3259            break;
3260
3261        case TCG_TYPE_V256:
3262            t1 = tcg_temp_new_vec(TCG_TYPE_V256);
3263            t2 = tcg_temp_new_vec(TCG_TYPE_V256);
3264            t3 = tcg_temp_new_vec(TCG_TYPE_V256);
3265            t4 = tcg_temp_new_vec(TCG_TYPE_V256);
3266            tcg_gen_dup16i_vec(t4, 0);
3267            /* a1: A[0-7] ... D[0-7]; a2: W[0-7] ... Z[0-7]
3268               t1: extends of B[0-7], D[0-7]
3269               t2: extends of X[0-7], Z[0-7]
3270               t3: extends of A[0-7], C[0-7]
3271               t4: extends of W[0-7], Y[0-7].  */
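            /* Note that the AVX2 byte unpack and pack instructions operate
               within each 128-bit lane independently, which is what
               produces the groupings described above.  */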
3272            vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V256, MO_8,
3273                      tcgv_vec_arg(t1), a1, tcgv_vec_arg(t4));
3274            vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V256, MO_8,
3275                      tcgv_vec_arg(t2), tcgv_vec_arg(t4), a2);
3276            vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V256, MO_8,
3277                      tcgv_vec_arg(t3), a1, tcgv_vec_arg(t4));
3278            vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V256, MO_8,
3279                      tcgv_vec_arg(t4), tcgv_vec_arg(t4), a2);
3280            /* t1: BX DZ; t3: AW CY.  */
3281            tcg_gen_mul_vec(MO_16, t1, t1, t2);
3282            tcg_gen_mul_vec(MO_16, t3, t3, t4);
3283            tcg_gen_shri_vec(MO_16, t1, t1, 8);
3284            tcg_gen_shri_vec(MO_16, t3, t3, 8);
3285            /* a0: AW BX CY DZ.  */
3286            vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V256, MO_8,
3287                      a0, tcgv_vec_arg(t1), tcgv_vec_arg(t3));
3288            tcg_temp_free_vec(t1);
3289            tcg_temp_free_vec(t2);
3290            tcg_temp_free_vec(t3);
3291            tcg_temp_free_vec(t4);
3292            break;
3293
3294        default:
3295            g_assert_not_reached();
3296        }
3297        break;
3298
3299    case INDEX_op_cmp_vec:
3300        {
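            /* SSE/AVX provide only equality (PCMPEQ) and signed
               greater-than (PCMPGT) vector compares, so the remaining
               conditions are synthesized by swapping the operands,
               inverting the result, and/or biasing unsigned operands
               into signed range.  */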
3301            enum {
3302                NEED_SWAP = 1,
3303                NEED_INV  = 2,
3304                NEED_BIAS = 4
3305            };
3306            static const uint8_t fixups[16] = {
3307                [0 ... 15] = -1,
3308                [TCG_COND_EQ] = 0,
3309                [TCG_COND_NE] = NEED_INV,
3310                [TCG_COND_GT] = 0,
3311                [TCG_COND_LT] = NEED_SWAP,
3312                [TCG_COND_LE] = NEED_INV,
3313                [TCG_COND_GE] = NEED_SWAP | NEED_INV,
3314                [TCG_COND_GTU] = NEED_BIAS,
3315                [TCG_COND_LTU] = NEED_BIAS | NEED_SWAP,
3316                [TCG_COND_LEU] = NEED_BIAS | NEED_INV,
3317                [TCG_COND_GEU] = NEED_BIAS | NEED_SWAP | NEED_INV,
3318            };
3319
3320            TCGCond cond;
3321            uint8_t fixup;
3322
3323            a1 = va_arg(va, TCGArg);
3324            a2 = va_arg(va, TCGArg);
3325            cond = va_arg(va, TCGArg);
3326            fixup = fixups[cond & 15];
3327            tcg_debug_assert(fixup != 0xff);
3328
3329            if (fixup & NEED_INV) {
3330                cond = tcg_invert_cond(cond);
3331            }
3332            if (fixup & NEED_SWAP) {
3333                TCGArg t;
3334                t = a1, a1 = a2, a2 = t;
3335                cond = tcg_swap_cond(cond);
3336            }
3337
3338            t1 = t2 = NULL;
3339            if (fixup & NEED_BIAS) {
3340                t1 = tcg_temp_new_vec(type);
3341                t2 = tcg_temp_new_vec(type);
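                /* Subtract the per-lane sign bit (e.g. 0x80 for MO_8)
                   from both operands; this maps an unsigned comparison
                   onto the equivalent signed comparison.  */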
3342                tcg_gen_dupi_vec(vece, t2, 1ull << ((8 << vece) - 1));
3343                tcg_gen_sub_vec(vece, t1, temp_tcgv_vec(arg_temp(a1)), t2);
3344                tcg_gen_sub_vec(vece, t2, temp_tcgv_vec(arg_temp(a2)), t2);
3345                a1 = tcgv_vec_arg(t1);
3346                a2 = tcgv_vec_arg(t2);
3347                cond = tcg_signed_cond(cond);
3348            }
3349
3350            tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
3351            vec_gen_4(INDEX_op_cmp_vec, type, vece, a0, a1, a2, cond);
3352
3353            if (fixup & NEED_BIAS) {
3354                tcg_temp_free_vec(t1);
3355                tcg_temp_free_vec(t2);
3356            }
3357            if (fixup & NEED_INV) {
3358                tcg_gen_not_vec(vece, v0, v0);
3359            }
3360        }
3361        break;
3362
3363    default:
3364        break;
3365    }
3366
3367    va_end(va);
3368}
3369
3370static const int tcg_target_callee_save_regs[] = {
3371#if TCG_TARGET_REG_BITS == 64
3372    TCG_REG_RBP,
3373    TCG_REG_RBX,
3374#if defined(_WIN64)
3375    TCG_REG_RDI,
3376    TCG_REG_RSI,
3377#endif
3378    TCG_REG_R12,
3379    TCG_REG_R13,
3380    TCG_REG_R14, /* Currently used for the global env. */
3381    TCG_REG_R15,
3382#else
3383    TCG_REG_EBP, /* Currently used for the global env. */
3384    TCG_REG_EBX,
3385    TCG_REG_ESI,
3386    TCG_REG_EDI,
3387#endif
3388};
3389
3390/* Compute frame size via macros, to share between tcg_target_qemu_prologue
3391   and tcg_register_jit.  */
3392
3393#define PUSH_SIZE \
3394    ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \
3395     * (TCG_TARGET_REG_BITS / 8))
3396
3397#define FRAME_SIZE \
3398    ((PUSH_SIZE \
3399      + TCG_STATIC_CALL_ARGS_SIZE \
3400      + CPU_TEMP_BUF_NLONGS * sizeof(long) \
3401      + TCG_TARGET_STACK_ALIGN - 1) \
3402     & ~(TCG_TARGET_STACK_ALIGN - 1))
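/* For example, on a 64-bit non-Windows host there are 6 callee-saved
   registers, so PUSH_SIZE = (1 + 6) * 8 = 56 bytes; the extra 1 accounts
   for the return address pushed by the call into the prologue.  */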
3403
3404/* Generate global QEMU prologue and epilogue code */
3405static void tcg_target_qemu_prologue(TCGContext *s)
3406{
3407    int i, stack_addend;
3408
3409    /* TB prologue */
3410
3411    /* Reserve some stack space, also for TCG temps.  */
3412    stack_addend = FRAME_SIZE - PUSH_SIZE;
3413    tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
3414                  CPU_TEMP_BUF_NLONGS * sizeof(long));
3415
3416    /* Save all callee saved registers.  */
3417    for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
3418        tcg_out_push(s, tcg_target_callee_save_regs[i]);
3419    }
3420
3421#if TCG_TARGET_REG_BITS == 32
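    /* With the 32-bit calling convention both arguments (env and the TB
       pointer) are passed on the stack, above the return address and the
       registers just pushed.  */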
3422    tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
3423               (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
3424    tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
3425    /* jmp *tb.  */
3426    tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
3427                         (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
3428                         + stack_addend);
3429#else
3430    tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
3431    tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
3432    /* jmp *tb.  */
3433    tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
3434#endif
3435
3436    /*
3437     * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
3438     * and fall through to the rest of the epilogue.
3439     */
3440    s->code_gen_epilogue = s->code_ptr;
3441    tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);
3442
3443    /* TB epilogue */
3444    tb_ret_addr = s->code_ptr;
3445
3446    tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);
3447
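    /* VZEROUPPER avoids AVX-to-SSE transition penalties in the caller,
       which may use legacy SSE encodings.  */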
3448    if (have_avx2) {
3449        tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
3450    }
3451    for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
3452        tcg_out_pop(s, tcg_target_callee_save_regs[i]);
3453    }
3454    tcg_out_opc(s, OPC_RET, 0, 0, 0);
3455
3456#if !defined(CONFIG_SOFTMMU)
3457    /* Try to set up a segment register to point to guest_base.  */
3458    if (guest_base) {
3459        setup_guest_base_seg();
3460    }
3461#endif
3462}
3463
3464static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
3465{
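    /* 0x90 is the single-byte x86 NOP instruction.  */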
3466    memset(p, 0x90, count);
3467}
3468
3469static void tcg_target_init(TCGContext *s)
3470{
3471#ifdef CONFIG_CPUID_H
3472    unsigned a, b, c, d, b7 = 0;
3473    int max = __get_cpuid_max(0, 0);
3474
3475    if (max >= 7) {
3476        /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs.  */
3477        __cpuid_count(7, 0, a, b7, c, d);
3478        have_bmi1 = (b7 & bit_BMI) != 0;
3479        have_bmi2 = (b7 & bit_BMI2) != 0;
3480    }
3481
3482    if (max >= 1) {
3483        __cpuid(1, a, b, c, d);
3484#ifndef have_cmov
3485        /* For 32-bit, 99% certainty that we're running on hardware that
3486           supports cmov, but we still need to check.  In case cmov is not
3487           available, we'll use a small forward branch.  */
3488        have_cmov = (d & bit_CMOV) != 0;
3489#endif
3490
3491        /* MOVBE is not available on all CPUs (it first appeared on Intel
3492           Atom and Haswell), so we need to probe for it.  */
3493        have_movbe = (c & bit_MOVBE) != 0;
3494        have_popcnt = (c & bit_POPCNT) != 0;
3495
3496        /* There are a number of things we must check before we can be
3497           sure of not hitting invalid opcode.  */
3498        if (c & bit_OSXSAVE) {
3499            unsigned xcrl, xcrh;
3500            /* The xgetbv instruction is not recognized by older
3501             * assemblers, so we encode the instruction manually.
3502             */
3503            asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcrl), "=d" (xcrh) : "c" (0));
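            /* Bits 1 and 2 of XCR0 indicate that the OS saves the SSE
               and AVX register state, respectively.  */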
3504            if ((xcrl & 6) == 6) {
3505                have_avx1 = (c & bit_AVX) != 0;
3506                have_avx2 = (b7 & bit_AVX2) != 0;
3507            }
3508        }
3509    }
3510
3511    max = __get_cpuid_max(0x80000000, 0);
3512    if (max >= 1) {
3513        __cpuid(0x80000001, a, b, c, d);
3514        /* LZCNT was introduced with AMD Barcelona and Intel Haswell CPUs.  */
3515        have_lzcnt = (c & bit_LZCNT) != 0;
3516    }
3517#endif /* CONFIG_CPUID_H */
3518
3519    tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
3520    if (TCG_TARGET_REG_BITS == 64) {
3521        tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
3522    }
3523    if (have_avx1) {
3524        tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
3525        tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
3526    }
3527    if (have_avx2) {
3528        tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
3529    }
3530
3531    tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
3532    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
3533    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
3534    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
3535    if (TCG_TARGET_REG_BITS == 64) {
3536#if !defined(_WIN64)
3537        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
3538        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
3539#endif
3540        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
3541        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
3542        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
3543        tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
3544    }
3545
3546    s->reserved_regs = 0;
3547    tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
3548}
3549
3550typedef struct {
3551    DebugFrameHeader h;
3552    uint8_t fde_def_cfa[4];
3553    uint8_t fde_reg_ofs[14];
3554} DebugFrame;
3555
3556/* We're expecting a 2 byte uleb128 encoded value.  */
3557QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
3558
3559#if !defined(__ELF__)
3560    /* Host machine without ELF. */
3561#elif TCG_TARGET_REG_BITS == 64
3562#define ELF_HOST_MACHINE EM_X86_64
3563static const DebugFrame debug_frame = {
3564    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
3565    .h.cie.id = -1,
3566    .h.cie.version = 1,
3567    .h.cie.code_align = 1,
3568    .h.cie.data_align = 0x78,             /* sleb128 -8 */
3569    .h.cie.return_column = 16,
3570
3571    /* Total FDE size does not include the "len" member.  */
3572    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
3573
3574    .fde_def_cfa = {
3575        12, 7,                          /* DW_CFA_def_cfa %rsp, ... */
3576        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
3577        (FRAME_SIZE >> 7)
3578    },
3579    .fde_reg_ofs = {
3580        0x90, 1,                        /* DW_CFA_offset, %rip, -8 */
3581        /* The following ordering must match tcg_target_callee_save_regs.  */
3582        0x86, 2,                        /* DW_CFA_offset, %rbp, -16 */
3583        0x83, 3,                        /* DW_CFA_offset, %rbx, -24 */
3584        0x8c, 4,                        /* DW_CFA_offset, %r12, -32 */
3585        0x8d, 5,                        /* DW_CFA_offset, %r13, -40 */
3586        0x8e, 6,                        /* DW_CFA_offset, %r14, -48 */
3587        0x8f, 7,                        /* DW_CFA_offset, %r15, -56 */
3588    }
3589};
3590#else
3591#define ELF_HOST_MACHINE EM_386
3592static const DebugFrame debug_frame = {
3593    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
3594    .h.cie.id = -1,
3595    .h.cie.version = 1,
3596    .h.cie.code_align = 1,
3597    .h.cie.data_align = 0x7c,             /* sleb128 -4 */
3598    .h.cie.return_column = 8,
3599
3600    /* Total FDE size does not include the "len" member.  */
3601    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
3602
3603    .fde_def_cfa = {
3604        12, 4,                          /* DW_CFA_def_cfa %esp, ... */
3605        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
3606        (FRAME_SIZE >> 7)
3607    },
3608    .fde_reg_ofs = {
3609        0x88, 1,                        /* DW_CFA_offset, %eip, -4 */
3610        /* The following ordering must match tcg_target_callee_save_regs.  */
3611        0x85, 2,                        /* DW_CFA_offset, %ebp, -8 */
3612        0x83, 3,                        /* DW_CFA_offset, %ebx, -12 */
3613        0x86, 4,                        /* DW_CFA_offset, %esi, -16 */
3614        0x87, 5,                        /* DW_CFA_offset, %edi, -20 */
3615    }
3616};
3617#endif
3618
3619#if defined(ELF_HOST_MACHINE)
3620void tcg_register_jit(void *buf, size_t buf_size)
3621{
3622    tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
3623}
3624#endif
3625