qemu/target/i386/kvm/kvm.c
   1/*
   2 * QEMU KVM support
   3 *
   4 * Copyright (C) 2006-2008 Qumranet Technologies
   5 * Copyright IBM, Corp. 2008
   6 *
   7 * Authors:
   8 *  Anthony Liguori   <aliguori@us.ibm.com>
   9 *
  10 * This work is licensed under the terms of the GNU GPL, version 2 or later.
  11 * See the COPYING file in the top-level directory.
  12 *
  13 */
  14
  15#include "qemu/osdep.h"
  16#include "qapi/qapi-events-run-state.h"
  17#include "qapi/error.h"
  18#include <sys/ioctl.h>
  19#include <sys/utsname.h>
  20
  21#include <linux/kvm.h>
  22#include "standard-headers/asm-x86/kvm_para.h"
  23
  24#include "cpu.h"
  25#include "host-cpu.h"
  26#include "sysemu/sysemu.h"
  27#include "sysemu/hw_accel.h"
  28#include "sysemu/kvm_int.h"
  29#include "sysemu/runstate.h"
  30#include "kvm_i386.h"
  31#include "sev.h"
  32#include "hyperv.h"
  33#include "hyperv-proto.h"
  34
  35#include "exec/gdbstub.h"
  36#include "qemu/host-utils.h"
  37#include "qemu/main-loop.h"
  38#include "qemu/config-file.h"
  39#include "qemu/error-report.h"
  40#include "hw/i386/x86.h"
  41#include "hw/i386/apic.h"
  42#include "hw/i386/apic_internal.h"
  43#include "hw/i386/apic-msidef.h"
  44#include "hw/i386/intel_iommu.h"
  45#include "hw/i386/x86-iommu.h"
  46#include "hw/i386/e820_memory_layout.h"
  47
  48#include "hw/pci/pci.h"
  49#include "hw/pci/msi.h"
  50#include "hw/pci/msix.h"
  51#include "migration/blocker.h"
  52#include "exec/memattrs.h"
  53#include "trace.h"
  54
  55//#define DEBUG_KVM
  56
  57#ifdef DEBUG_KVM
  58#define DPRINTF(fmt, ...) \
  59    do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
  60#else
  61#define DPRINTF(fmt, ...) \
  62    do { } while (0)
  63#endif
  64
  65/* From arch/x86/kvm/lapic.h */
  66#define KVM_APIC_BUS_CYCLE_NS       1
  67#define KVM_APIC_BUS_FREQUENCY      (1000000000ULL / KVM_APIC_BUS_CYCLE_NS)
  68
  69#define MSR_KVM_WALL_CLOCK  0x11
  70#define MSR_KVM_SYSTEM_TIME 0x12
  71
  72/* A 4096-byte buffer can hold the 8-byte kvm_msrs header, plus
  73 * 255 kvm_msr_entry structs */
  74#define MSR_BUF_SIZE 4096
  75
  76static void kvm_init_msrs(X86CPU *cpu);
  77
  78const KVMCapabilityInfo kvm_arch_required_capabilities[] = {
  79    KVM_CAP_INFO(SET_TSS_ADDR),
  80    KVM_CAP_INFO(EXT_CPUID),
  81    KVM_CAP_INFO(MP_STATE),
  82    KVM_CAP_LAST_INFO
  83};
  84
  85static bool has_msr_star;
  86static bool has_msr_hsave_pa;
  87static bool has_msr_tsc_aux;
  88static bool has_msr_tsc_adjust;
  89static bool has_msr_tsc_deadline;
  90static bool has_msr_feature_control;
  91static bool has_msr_misc_enable;
  92static bool has_msr_smbase;
  93static bool has_msr_bndcfgs;
  94static int lm_capable_kernel;
  95static bool has_msr_hv_hypercall;
  96static bool has_msr_hv_crash;
  97static bool has_msr_hv_reset;
  98static bool has_msr_hv_vpindex;
  99static bool hv_vpindex_settable;
 100static bool has_msr_hv_runtime;
 101static bool has_msr_hv_synic;
 102static bool has_msr_hv_stimer;
 103static bool has_msr_hv_frequencies;
 104static bool has_msr_hv_reenlightenment;
 105static bool has_msr_xss;
 106static bool has_msr_umwait;
 107static bool has_msr_spec_ctrl;
 108static bool has_tsc_scale_msr;
 109static bool has_msr_tsx_ctrl;
 110static bool has_msr_virt_ssbd;
 111static bool has_msr_smi_count;
 112static bool has_msr_arch_capabs;
 113static bool has_msr_core_capabs;
 114static bool has_msr_vmx_vmfunc;
 115static bool has_msr_ucode_rev;
 116static bool has_msr_vmx_procbased_ctls2;
 117static bool has_msr_perf_capabs;
 118static bool has_msr_pkrs;
 119
 120static uint32_t has_architectural_pmu_version;
 121static uint32_t num_architectural_pmu_gp_counters;
 122static uint32_t num_architectural_pmu_fixed_counters;
 123
 124static int has_xsave;
 125static int has_xcrs;
 126static int has_pit_state2;
 127static int has_exception_payload;
 128
 129static bool has_msr_mcg_ext_ctl;
 130
 131static struct kvm_cpuid2 *cpuid_cache;
 132static struct kvm_cpuid2 *hv_cpuid_cache;
 133static struct kvm_msr_list *kvm_feature_msrs;
 134
 135#define BUS_LOCK_SLICE_TIME 1000000000ULL /* ns */
 136static RateLimit bus_lock_ratelimit_ctrl;
 137
 138int kvm_has_pit_state2(void)
 139{
 140    return has_pit_state2;
 141}
 142
 143bool kvm_has_smm(void)
 144{
 145    return kvm_vm_check_extension(kvm_state, KVM_CAP_X86_SMM);
 146}
 147
 148bool kvm_has_adjust_clock_stable(void)
 149{
 150    int ret = kvm_check_extension(kvm_state, KVM_CAP_ADJUST_CLOCK);
 151
 152    return (ret == KVM_CLOCK_TSC_STABLE);
 153}
 154
 155bool kvm_has_adjust_clock(void)
 156{
 157    return kvm_check_extension(kvm_state, KVM_CAP_ADJUST_CLOCK);
 158}
 159
 160bool kvm_has_exception_payload(void)
 161{
 162    return has_exception_payload;
 163}
 164
 165static bool kvm_x2apic_api_set_flags(uint64_t flags)
 166{
 167    KVMState *s = KVM_STATE(current_accel());
 168
 169    return !kvm_vm_enable_cap(s, KVM_CAP_X2APIC_API, 0, flags);
 170}
 171
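/*
 * Evaluate @fn only once and cache the result in @_result; subsequent calls
 * return the cached value from the enclosing function.  Note that this
 * expands to a 'return' statement, so it can only be used directly inside a
 * function whose return type matches @_result.
 */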
 172#define MEMORIZE(fn, _result) \
 173    ({ \
 174        static bool _memorized; \
 175        \
 176        if (_memorized) { \
 177            return _result; \
 178        } \
 179        _memorized = true; \
 180        _result = fn; \
 181    })
 182
 183static bool has_x2apic_api;
 184
 185bool kvm_has_x2apic_api(void)
 186{
 187    return has_x2apic_api;
 188}
 189
 190bool kvm_enable_x2apic(void)
 191{
 192    return MEMORIZE(
 193             kvm_x2apic_api_set_flags(KVM_X2APIC_API_USE_32BIT_IDS |
 194                                      KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK),
 195             has_x2apic_api);
 196}
 197
 198bool kvm_hv_vpindex_settable(void)
 199{
 200    return hv_vpindex_settable;
 201}
 202
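/*
 * Read the guest TSC (MSR_IA32_TSC) from KVM into env->tsc.  The value is
 * cached while the VM is not running, so repeated calls are cheap.
 */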
 203static int kvm_get_tsc(CPUState *cs)
 204{
 205    X86CPU *cpu = X86_CPU(cs);
 206    CPUX86State *env = &cpu->env;
 207    struct {
 208        struct kvm_msrs info;
 209        struct kvm_msr_entry entries[1];
 210    } msr_data = {};
 211    int ret;
 212
 213    if (env->tsc_valid) {
 214        return 0;
 215    }
 216
 217    memset(&msr_data, 0, sizeof(msr_data));
 218    msr_data.info.nmsrs = 1;
 219    msr_data.entries[0].index = MSR_IA32_TSC;
 220    env->tsc_valid = !runstate_is_running();
 221
 222    ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_MSRS, &msr_data);
 223    if (ret < 0) {
 224        return ret;
 225    }
 226
 227    assert(ret == 1);
 228    env->tsc = msr_data.entries[0].data;
 229    return 0;
 230}
 231
 232static inline void do_kvm_synchronize_tsc(CPUState *cpu, run_on_cpu_data arg)
 233{
 234    kvm_get_tsc(cpu);
 235}
 236
 237void kvm_synchronize_all_tsc(void)
 238{
 239    CPUState *cpu;
 240
 241    if (kvm_enabled()) {
 242        CPU_FOREACH(cpu) {
 243            run_on_cpu(cpu, do_kvm_synchronize_tsc, RUN_ON_CPU_NULL);
 244        }
 245    }
 246}
 247
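/*
 * Issue KVM_GET_SUPPORTED_CPUID with room for @max entries.  Returns NULL
 * when the buffer is too small (-E2BIG) so that the caller can retry with a
 * larger one; any other error is fatal.
 */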
 248static struct kvm_cpuid2 *try_get_cpuid(KVMState *s, int max)
 249{
 250    struct kvm_cpuid2 *cpuid;
 251    int r, size;
 252
 253    size = sizeof(*cpuid) + max * sizeof(*cpuid->entries);
 254    cpuid = g_malloc0(size);
 255    cpuid->nent = max;
 256    r = kvm_ioctl(s, KVM_GET_SUPPORTED_CPUID, cpuid);
 257    if (r == 0 && cpuid->nent >= max) {
 258        r = -E2BIG;
 259    }
 260    if (r < 0) {
 261        if (r == -E2BIG) {
 262            g_free(cpuid);
 263            return NULL;
 264        } else {
 265            fprintf(stderr, "KVM_GET_SUPPORTED_CPUID failed: %s\n",
 266                    strerror(-r));
 267            exit(1);
 268        }
 269    }
 270    return cpuid;
 271}
 272
 273/* Run KVM_GET_SUPPORTED_CPUID ioctl(), allocating a buffer large enough
 274 * for all entries.
 275 */
 276static struct kvm_cpuid2 *get_supported_cpuid(KVMState *s)
 277{
 278    struct kvm_cpuid2 *cpuid;
 279    int max = 1;
 280
 281    if (cpuid_cache != NULL) {
 282        return cpuid_cache;
 283    }
 284    while ((cpuid = try_get_cpuid(s, max)) == NULL) {
 285        max *= 2;
 286    }
 287    cpuid_cache = cpuid;
 288    return cpuid;
 289}
 290
 291static bool host_tsx_broken(void)
 292{
 293    int family, model, stepping;
 294    char vendor[CPUID_VENDOR_SZ + 1];
 295
 296    host_cpu_vendor_fms(vendor, &family, &model, &stepping);
 297
 298    /* Check if we are running on a Haswell host known to have broken TSX */
 299    return !strcmp(vendor, CPUID_VENDOR_INTEL) &&
 300           (family == 6) &&
 301           ((model == 63 && stepping < 4) ||
 302            model == 60 || model == 69 || model == 70);
 303}
 304
 305/* Returns the value for a specific register on the cpuid entry
 306 */
 307static uint32_t cpuid_entry_get_reg(struct kvm_cpuid_entry2 *entry, int reg)
 308{
 309    uint32_t ret = 0;
 310    switch (reg) {
 311    case R_EAX:
 312        ret = entry->eax;
 313        break;
 314    case R_EBX:
 315        ret = entry->ebx;
 316        break;
 317    case R_ECX:
 318        ret = entry->ecx;
 319        break;
 320    case R_EDX:
 321        ret = entry->edx;
 322        break;
 323    }
 324    return ret;
 325}
 326
 327/* Find matching entry for function/index on kvm_cpuid2 struct
 328 */
 329static struct kvm_cpuid_entry2 *cpuid_find_entry(struct kvm_cpuid2 *cpuid,
 330                                                 uint32_t function,
 331                                                 uint32_t index)
 332{
 333    int i;
 334    for (i = 0; i < cpuid->nent; ++i) {
 335        if (cpuid->entries[i].function == function &&
 336            cpuid->entries[i].index == index) {
 337            return &cpuid->entries[i];
 338        }
 339    }
 340    /* not found: */
 341    return NULL;
 342}
 343
 344uint32_t kvm_arch_get_supported_cpuid(KVMState *s, uint32_t function,
 345                                      uint32_t index, int reg)
 346{
 347    struct kvm_cpuid2 *cpuid;
 348    uint32_t ret = 0;
 349    uint32_t cpuid_1_edx;
 350
 351    cpuid = get_supported_cpuid(s);
 352
 353    struct kvm_cpuid_entry2 *entry = cpuid_find_entry(cpuid, function, index);
 354    if (entry) {
 355        ret = cpuid_entry_get_reg(entry, reg);
 356    }
 357
 358    /* Fixups for the data returned by KVM, below */
 359
 360    if (function == 1 && reg == R_EDX) {
 361        /* KVM before 2.6.30 misreports the following features */
 362        ret |= CPUID_MTRR | CPUID_PAT | CPUID_MCE | CPUID_MCA;
 363    } else if (function == 1 && reg == R_ECX) {
 364        /* We can set the hypervisor flag, even if KVM does not return it on
 365         * GET_SUPPORTED_CPUID
 366         */
 367        ret |= CPUID_EXT_HYPERVISOR;
 368        /* tsc-deadline flag is not returned by GET_SUPPORTED_CPUID, but it
 369         * can be enabled if the kernel has KVM_CAP_TSC_DEADLINE_TIMER,
 370         * and the irqchip is in the kernel.
 371         */
 372        if (kvm_irqchip_in_kernel() &&
 373                kvm_check_extension(s, KVM_CAP_TSC_DEADLINE_TIMER)) {
 374            ret |= CPUID_EXT_TSC_DEADLINE_TIMER;
 375        }
 376
 377        /* x2apic is reported by GET_SUPPORTED_CPUID, but it can't be enabled
 378         * without the in-kernel irqchip
 379         */
 380        if (!kvm_irqchip_in_kernel()) {
 381            ret &= ~CPUID_EXT_X2APIC;
 382        }
 383
 384        if (enable_cpu_pm) {
 385            int disable_exits = kvm_check_extension(s,
 386                                                    KVM_CAP_X86_DISABLE_EXITS);
 387
 388            if (disable_exits & KVM_X86_DISABLE_EXITS_MWAIT) {
 389                ret |= CPUID_EXT_MONITOR;
 390            }
 391        }
 392    } else if (function == 6 && reg == R_EAX) {
 393        ret |= CPUID_6_EAX_ARAT; /* safe to allow because of emulated APIC */
 394    } else if (function == 7 && index == 0 && reg == R_EBX) {
 395        if (host_tsx_broken()) {
 396            ret &= ~(CPUID_7_0_EBX_RTM | CPUID_7_0_EBX_HLE);
 397        }
 398    } else if (function == 7 && index == 0 && reg == R_EDX) {
 399        /*
 400         * Linux v4.17-v4.20 incorrectly return ARCH_CAPABILITIES on SVM hosts.
 401         * We can detect the bug by checking if MSR_IA32_ARCH_CAPABILITIES is
 402         * returned by KVM_GET_MSR_INDEX_LIST.
 403         */
 404        if (!has_msr_arch_capabs) {
 405            ret &= ~CPUID_7_0_EDX_ARCH_CAPABILITIES;
 406        }
 407    } else if (function == 0x80000001 && reg == R_ECX) {
 408        /*
 409         * It's safe to enable TOPOEXT even if it's not returned by
 410         * GET_SUPPORTED_CPUID.  Unconditionally enabling TOPOEXT here allows
 411         * us to keep CPU models including TOPOEXT runnable on older kernels.
 412         */
 413        ret |= CPUID_EXT3_TOPOEXT;
 414    } else if (function == 0x80000001 && reg == R_EDX) {
 415        /* On Intel, kvm returns cpuid according to the Intel spec,
 416         * so add missing bits according to the AMD spec:
 417         */
 418        cpuid_1_edx = kvm_arch_get_supported_cpuid(s, 1, 0, R_EDX);
 419        ret |= cpuid_1_edx & CPUID_EXT2_AMD_ALIASES;
 420    } else if (function == KVM_CPUID_FEATURES && reg == R_EAX) {
 421        /* kvm_pv_unhalt is reported by GET_SUPPORTED_CPUID, but it can't
 422         * be enabled without the in-kernel irqchip
 423         */
 424        if (!kvm_irqchip_in_kernel()) {
 425            ret &= ~(1U << KVM_FEATURE_PV_UNHALT);
 426        }
 427        if (kvm_irqchip_is_split()) {
 428            ret |= 1U << KVM_FEATURE_MSI_EXT_DEST_ID;
 429        }
 430    } else if (function == KVM_CPUID_FEATURES && reg == R_EDX) {
 431        ret |= 1U << KVM_HINTS_REALTIME;
 432    }
 433
 434    return ret;
 435}
 436
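/*
 * Return the host-supported value of a feature MSR, queried through the
 * KVM_GET_MSRS system ioctl.  VMX control MSRs are additionally converted
 * into a mask of bits that are allowed to be one (see the switch below).
 */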
 437uint64_t kvm_arch_get_supported_msr_feature(KVMState *s, uint32_t index)
 438{
 439    struct {
 440        struct kvm_msrs info;
 441        struct kvm_msr_entry entries[1];
 442    } msr_data = {};
 443    uint64_t value;
 444    uint32_t ret, can_be_one, must_be_one;
 445
 446    if (kvm_feature_msrs == NULL) { /* Host doesn't support feature MSRs */
 447        return 0;
 448    }
 449
 450    /* Check if requested MSR is supported feature MSR */
 451    int i;
 452    for (i = 0; i < kvm_feature_msrs->nmsrs; i++)
 453        if (kvm_feature_msrs->indices[i] == index) {
 454            break;
 455        }
 456    if (i == kvm_feature_msrs->nmsrs) {
 457        return 0; /* if the feature MSR is not supported, simply return 0 */
 458    }
 459
 460    msr_data.info.nmsrs = 1;
 461    msr_data.entries[0].index = index;
 462
 463    ret = kvm_ioctl(s, KVM_GET_MSRS, &msr_data);
 464    if (ret != 1) {
 465        error_report("KVM get MSR (index=0x%x) feature failed, %s",
 466            index, strerror(-ret));
 467        exit(1);
 468    }
 469
 470    value = msr_data.entries[0].data;
 471    switch (index) {
 472    case MSR_IA32_VMX_PROCBASED_CTLS2:
 473        if (!has_msr_vmx_procbased_ctls2) {
 474            /* KVM forgot to add these bits for some time, do this ourselves. */
 475            if (kvm_arch_get_supported_cpuid(s, 0xD, 1, R_ECX) &
 476                CPUID_XSAVE_XSAVES) {
 477                value |= (uint64_t)VMX_SECONDARY_EXEC_XSAVES << 32;
 478            }
 479            if (kvm_arch_get_supported_cpuid(s, 1, 0, R_ECX) &
 480                CPUID_EXT_RDRAND) {
 481                value |= (uint64_t)VMX_SECONDARY_EXEC_RDRAND_EXITING << 32;
 482            }
 483            if (kvm_arch_get_supported_cpuid(s, 7, 0, R_EBX) &
 484                CPUID_7_0_EBX_INVPCID) {
 485                value |= (uint64_t)VMX_SECONDARY_EXEC_ENABLE_INVPCID << 32;
 486            }
 487            if (kvm_arch_get_supported_cpuid(s, 7, 0, R_EBX) &
 488                CPUID_7_0_EBX_RDSEED) {
 489                value |= (uint64_t)VMX_SECONDARY_EXEC_RDSEED_EXITING << 32;
 490            }
 491            if (kvm_arch_get_supported_cpuid(s, 0x80000001, 0, R_EDX) &
 492                CPUID_EXT2_RDTSCP) {
 493                value |= (uint64_t)VMX_SECONDARY_EXEC_RDTSCP << 32;
 494            }
 495        }
 496        /* fall through */
 497    case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
 498    case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
 499    case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
 500    case MSR_IA32_VMX_TRUE_EXIT_CTLS:
 501        /*
 502         * Return true for bits that can be one, but do not have to be one.
 503         * The SDM tells us which bits could have a "must be one" setting,
 504         * so we can do the opposite transformation in make_vmx_msr_value.
 505         */
 506        must_be_one = (uint32_t)value;
 507        can_be_one = (uint32_t)(value >> 32);
 508        return can_be_one & ~must_be_one;
 509
 510    default:
 511        return value;
 512    }
 513}
 514
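/* Query the number of supported MCE banks and the MCG capability bits. */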
 515static int kvm_get_mce_cap_supported(KVMState *s, uint64_t *mce_cap,
 516                                     int *max_banks)
 517{
 518    int r;
 519
 520    r = kvm_check_extension(s, KVM_CAP_MCE);
 521    if (r > 0) {
 522        *max_banks = r;
 523        return kvm_ioctl(s, KVM_X86_GET_MCE_CAP_SUPPORTED, mce_cap);
 524    }
 525    return -ENOSYS;
 526}
 527
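/*
 * Inject a machine check for the guest physical address @paddr:
 * BUS_MCEERR_AR becomes an action-required MCE, everything else an
 * action-optional one.  The MCE is broadcast when the CPU model supports
 * MCA broadcast, unless LMCE is enabled for this vCPU.
 */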
 528static void kvm_mce_inject(X86CPU *cpu, hwaddr paddr, int code)
 529{
 530    CPUState *cs = CPU(cpu);
 531    CPUX86State *env = &cpu->env;
 532    uint64_t status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN |
 533                      MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S;
 534    uint64_t mcg_status = MCG_STATUS_MCIP;
 535    int flags = 0;
 536
 537    if (code == BUS_MCEERR_AR) {
 538        status |= MCI_STATUS_AR | 0x134;
 539        mcg_status |= MCG_STATUS_EIPV;
 540    } else {
 541        status |= 0xc0;
 542        mcg_status |= MCG_STATUS_RIPV;
 543    }
 544
 545    flags = cpu_x86_support_mca_broadcast(env) ? MCE_INJECT_BROADCAST : 0;
 546    /* We need to read the value of MSR_EXT_MCG_CTL that was set by the
 547     * guest kernel back into env->mcg_ext_ctl.
 548     */
 549    cpu_synchronize_state(cs);
 550    if (env->mcg_ext_ctl & MCG_EXT_CTL_LMCE_EN) {
 551        mcg_status |= MCG_STATUS_LMCE;
 552        flags = 0;
 553    }
 554
 555    cpu_x86_inject_mce(NULL, cpu, 9, status, mcg_status, paddr,
 556                       (MCM_ADDR_PHYS << 6) | 0xc, flags);
 557}
 558
 559static void emit_hypervisor_memory_failure(MemoryFailureAction action, bool ar)
 560{
 561    MemoryFailureFlags mff = {.action_required = ar, .recursive = false};
 562
 563    qapi_event_send_memory_failure(MEMORY_FAILURE_RECIPIENT_HYPERVISOR, action,
 564                                   &mff);
 565}
 566
 567static void hardware_memory_error(void *host_addr)
 568{
 569    emit_hypervisor_memory_failure(MEMORY_FAILURE_ACTION_FATAL, true);
 570    error_report("QEMU got Hardware memory error at addr %p", host_addr);
 571    exit(1);
 572}
 573
 574void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr)
 575{
 576    X86CPU *cpu = X86_CPU(c);
 577    CPUX86State *env = &cpu->env;
 578    ram_addr_t ram_addr;
 579    hwaddr paddr;
 580
 581    /* If we get an action required MCE, it has been injected by KVM
 582     * while the VM was running.  An action optional MCE instead should
 583     * be coming from the main thread, which qemu_init_sigbus identifies
 584     * as the "early kill" thread.
 585     */
 586    assert(code == BUS_MCEERR_AR || code == BUS_MCEERR_AO);
 587
 588    if ((env->mcg_cap & MCG_SER_P) && addr) {
 589        ram_addr = qemu_ram_addr_from_host(addr);
 590        if (ram_addr != RAM_ADDR_INVALID &&
 591            kvm_physical_memory_addr_from_host(c->kvm_state, addr, &paddr)) {
 592            kvm_hwpoison_page_add(ram_addr);
 593            kvm_mce_inject(cpu, paddr, code);
 594
 595            /*
 596             * Use different logging severity based on error type.
 597             * If there is additional MCE reporting on the hypervisor, QEMU VA
 598             * could be another source to identify the PA and MCE details.
 599             */
 600            if (code == BUS_MCEERR_AR) {
 601                error_report("Guest MCE Memory Error at QEMU addr %p and "
 602                    "GUEST addr 0x%" HWADDR_PRIx " of type %s injected",
 603                    addr, paddr, "BUS_MCEERR_AR");
 604            } else {
 605                 warn_report("Guest MCE Memory Error at QEMU addr %p and "
 606                     "GUEST addr 0x%" HWADDR_PRIx " of type %s injected",
 607                     addr, paddr, "BUS_MCEERR_AO");
 608            }
 609
 610            return;
 611        }
 612
 613        if (code == BUS_MCEERR_AO) {
 614            warn_report("Hardware memory error at addr %p of type %s "
 615                "for memory used by QEMU itself instead of guest system!",
 616                 addr, "BUS_MCEERR_AO");
 617        }
 618    }
 619
 620    if (code == BUS_MCEERR_AR) {
 621        hardware_memory_error(addr);
 622    }
 623
 624    /* Hope we are lucky for AO MCE, just notify an event */
 625    emit_hypervisor_memory_failure(MEMORY_FAILURE_ACTION_IGNORE, false);
 626}
 627
 628static void kvm_reset_exception(CPUX86State *env)
 629{
 630    env->exception_nr = -1;
 631    env->exception_pending = 0;
 632    env->exception_injected = 0;
 633    env->exception_has_payload = false;
 634    env->exception_payload = 0;
 635}
 636
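/*
 * Queue an exception for injection.  When the kernel supports exception
 * payloads the exception is left "pending" with its payload; otherwise the
 * payload is applied to DR6/CR2 by hand and the exception is queued as
 * already "injected".
 */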
 637static void kvm_queue_exception(CPUX86State *env,
 638                                int32_t exception_nr,
 639                                uint8_t exception_has_payload,
 640                                uint64_t exception_payload)
 641{
 642    assert(env->exception_nr == -1);
 643    assert(!env->exception_pending);
 644    assert(!env->exception_injected);
 645    assert(!env->exception_has_payload);
 646
 647    env->exception_nr = exception_nr;
 648
 649    if (has_exception_payload) {
 650        env->exception_pending = 1;
 651
 652        env->exception_has_payload = exception_has_payload;
 653        env->exception_payload = exception_payload;
 654    } else {
 655        env->exception_injected = 1;
 656
 657        if (exception_nr == EXCP01_DB) {
 658            assert(exception_has_payload);
 659            env->dr[6] = exception_payload;
 660        } else if (exception_nr == EXCP0E_PAGE) {
 661            assert(exception_has_payload);
 662            env->cr[2] = exception_payload;
 663        } else {
 664            assert(!exception_has_payload);
 665        }
 666    }
 667}
 668
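/*
 * Legacy MCE delivery for kernels without VCPU events support: turn a
 * pending #MC exception into a KVM_X86_SET_MCE call based on the first
 * bank that has valid data.
 */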
 669static int kvm_inject_mce_oldstyle(X86CPU *cpu)
 670{
 671    CPUX86State *env = &cpu->env;
 672
 673    if (!kvm_has_vcpu_events() && env->exception_nr == EXCP12_MCHK) {
 674        unsigned int bank, bank_num = env->mcg_cap & 0xff;
 675        struct kvm_x86_mce mce;
 676
 677        kvm_reset_exception(env);
 678
 679        /*
 680         * There must be at least one bank in use if an MCE is pending.
 681         * Find it and use its values for the event injection.
 682         */
 683        for (bank = 0; bank < bank_num; bank++) {
 684            if (env->mce_banks[bank * 4 + 1] & MCI_STATUS_VAL) {
 685                break;
 686            }
 687        }
 688        assert(bank < bank_num);
 689
 690        mce.bank = bank;
 691        mce.status = env->mce_banks[bank * 4 + 1];
 692        mce.mcg_status = env->mcg_status;
 693        mce.addr = env->mce_banks[bank * 4 + 2];
 694        mce.misc = env->mce_banks[bank * 4 + 3];
 695
 696        return kvm_vcpu_ioctl(CPU(cpu), KVM_X86_SET_MCE, &mce);
 697    }
 698    return 0;
 699}
 700
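/* Run-state callback: a cached TSC value becomes stale once the VM runs. */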
 701static void cpu_update_state(void *opaque, bool running, RunState state)
 702{
 703    CPUX86State *env = opaque;
 704
 705    if (running) {
 706        env->tsc_valid = false;
 707    }
 708}
 709
 710unsigned long kvm_arch_vcpu_id(CPUState *cs)
 711{
 712    X86CPU *cpu = X86_CPU(cs);
 713    return cpu->apic_id;
 714}
 715
 716#ifndef KVM_CPUID_SIGNATURE_NEXT
 717#define KVM_CPUID_SIGNATURE_NEXT                0x40000100
 718#endif
 719
 720static bool hyperv_enabled(X86CPU *cpu)
 721{
 722    return kvm_check_extension(kvm_state, KVM_CAP_HYPERV) > 0 &&
 723        ((cpu->hyperv_spinlock_attempts != HYPERV_SPINLOCK_NEVER_NOTIFY) ||
 724         cpu->hyperv_features || cpu->hyperv_passthrough);
 725}
 726
 727/*
 728 * Check whether target_freq is within conservative
 729 * ntp correctable bounds (250ppm) of freq
 730 */
 731static inline bool freq_within_bounds(int freq, int target_freq)
 732{
 733    int max_freq = freq + (freq * 250 / 1000000);
 734    int min_freq = freq - (freq * 250 / 1000000);
 735
 736    if (target_freq >= min_freq && target_freq <= max_freq) {
 737        return true;
 738    }
 739
 740    return false;
 741}
 742
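/*
 * Try to make env->tsc_khz effective for this vCPU: use KVM_SET_TSC_KHZ when
 * TSC scaling is available or when the requested frequency is within NTP
 * tolerance of the host's.  A failure is only reported when the frequencies
 * really diverge.
 */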
 743static int kvm_arch_set_tsc_khz(CPUState *cs)
 744{
 745    X86CPU *cpu = X86_CPU(cs);
 746    CPUX86State *env = &cpu->env;
 747    int r, cur_freq;
 748    bool set_ioctl = false;
 749
 750    if (!env->tsc_khz) {
 751        return 0;
 752    }
 753
 754    cur_freq = kvm_check_extension(cs->kvm_state, KVM_CAP_GET_TSC_KHZ) ?
 755               kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ) : -ENOTSUP;
 756
 757    /*
 758     * If TSC scaling is supported, attempt to set TSC frequency.
 759     */
 760    if (kvm_check_extension(cs->kvm_state, KVM_CAP_TSC_CONTROL)) {
 761        set_ioctl = true;
 762    }
 763
 764    /*
 765     * If desired TSC frequency is within bounds of NTP correction,
 766     * attempt to set TSC frequency.
 767     */
 768    if (cur_freq != -ENOTSUP && freq_within_bounds(cur_freq, env->tsc_khz)) {
 769        set_ioctl = true;
 770    }
 771
 772    r = set_ioctl ?
 773        kvm_vcpu_ioctl(cs, KVM_SET_TSC_KHZ, env->tsc_khz) :
 774        -ENOTSUP;
 775
 776    if (r < 0) {
 777        /* When KVM_SET_TSC_KHZ fails, it's an error only if the current
 778         * TSC frequency doesn't match the one we want.
 779         */
 780        cur_freq = kvm_check_extension(cs->kvm_state, KVM_CAP_GET_TSC_KHZ) ?
 781                   kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ) :
 782                   -ENOTSUP;
 783        if (cur_freq <= 0 || cur_freq != env->tsc_khz) {
 784            warn_report("TSC frequency mismatch between "
 785                        "VM (%" PRId64 " kHz) and host (%d kHz), "
 786                        "and TSC scaling unavailable",
 787                        env->tsc_khz, cur_freq);
 788            return r;
 789        }
 790    }
 791
 792    return 0;
 793}
 794
 795static bool tsc_is_stable_and_known(CPUX86State *env)
 796{
 797    if (!env->tsc_khz) {
 798        return false;
 799    }
 800    return (env->features[FEAT_8000_0007_EDX] & CPUID_APM_INVTSC)
 801        || env->user_tsc_khz;
 802}
 803
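/*
 * Hyper-V enlightenments, the host CPUID bits (as reported by
 * KVM_GET_SUPPORTED_HV_CPUID) each of them requires, and the dependencies
 * between the features themselves.
 */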
 804static struct {
 805    const char *desc;
 806    struct {
 807        uint32_t func;
 808        int reg;
 809        uint32_t bits;
 810    } flags[2];
 811    uint64_t dependencies;
 812} kvm_hyperv_properties[] = {
 813    [HYPERV_FEAT_RELAXED] = {
 814        .desc = "relaxed timing (hv-relaxed)",
 815        .flags = {
 816            {.func = HV_CPUID_ENLIGHTMENT_INFO, .reg = R_EAX,
 817             .bits = HV_RELAXED_TIMING_RECOMMENDED}
 818        }
 819    },
 820    [HYPERV_FEAT_VAPIC] = {
 821        .desc = "virtual APIC (hv-vapic)",
 822        .flags = {
 823            {.func = HV_CPUID_FEATURES, .reg = R_EAX,
 824             .bits = HV_APIC_ACCESS_AVAILABLE}
 825        }
 826    },
 827    [HYPERV_FEAT_TIME] = {
 828        .desc = "clocksources (hv-time)",
 829        .flags = {
 830            {.func = HV_CPUID_FEATURES, .reg = R_EAX,
 831             .bits = HV_TIME_REF_COUNT_AVAILABLE | HV_REFERENCE_TSC_AVAILABLE}
 832        }
 833    },
 834    [HYPERV_FEAT_CRASH] = {
 835        .desc = "crash MSRs (hv-crash)",
 836        .flags = {
 837            {.func = HV_CPUID_FEATURES, .reg = R_EDX,
 838             .bits = HV_GUEST_CRASH_MSR_AVAILABLE}
 839        }
 840    },
 841    [HYPERV_FEAT_RESET] = {
 842        .desc = "reset MSR (hv-reset)",
 843        .flags = {
 844            {.func = HV_CPUID_FEATURES, .reg = R_EAX,
 845             .bits = HV_RESET_AVAILABLE}
 846        }
 847    },
 848    [HYPERV_FEAT_VPINDEX] = {
 849        .desc = "VP_INDEX MSR (hv-vpindex)",
 850        .flags = {
 851            {.func = HV_CPUID_FEATURES, .reg = R_EAX,
 852             .bits = HV_VP_INDEX_AVAILABLE}
 853        }
 854    },
 855    [HYPERV_FEAT_RUNTIME] = {
 856        .desc = "VP_RUNTIME MSR (hv-runtime)",
 857        .flags = {
 858            {.func = HV_CPUID_FEATURES, .reg = R_EAX,
 859             .bits = HV_VP_RUNTIME_AVAILABLE}
 860        }
 861    },
 862    [HYPERV_FEAT_SYNIC] = {
 863        .desc = "synthetic interrupt controller (hv-synic)",
 864        .flags = {
 865            {.func = HV_CPUID_FEATURES, .reg = R_EAX,
 866             .bits = HV_SYNIC_AVAILABLE}
 867        }
 868    },
 869    [HYPERV_FEAT_STIMER] = {
 870        .desc = "synthetic timers (hv-stimer)",
 871        .flags = {
 872            {.func = HV_CPUID_FEATURES, .reg = R_EAX,
 873             .bits = HV_SYNTIMERS_AVAILABLE}
 874        },
 875        .dependencies = BIT(HYPERV_FEAT_SYNIC) | BIT(HYPERV_FEAT_TIME)
 876    },
 877    [HYPERV_FEAT_FREQUENCIES] = {
 878        .desc = "frequency MSRs (hv-frequencies)",
 879        .flags = {
 880            {.func = HV_CPUID_FEATURES, .reg = R_EAX,
 881             .bits = HV_ACCESS_FREQUENCY_MSRS},
 882            {.func = HV_CPUID_FEATURES, .reg = R_EDX,
 883             .bits = HV_FREQUENCY_MSRS_AVAILABLE}
 884        }
 885    },
 886    [HYPERV_FEAT_REENLIGHTENMENT] = {
 887        .desc = "reenlightenment MSRs (hv-reenlightenment)",
 888        .flags = {
 889            {.func = HV_CPUID_FEATURES, .reg = R_EAX,
 890             .bits = HV_ACCESS_REENLIGHTENMENTS_CONTROL}
 891        }
 892    },
 893    [HYPERV_FEAT_TLBFLUSH] = {
 894        .desc = "paravirtualized TLB flush (hv-tlbflush)",
 895        .flags = {
 896            {.func = HV_CPUID_ENLIGHTMENT_INFO, .reg = R_EAX,
 897             .bits = HV_REMOTE_TLB_FLUSH_RECOMMENDED |
 898             HV_EX_PROCESSOR_MASKS_RECOMMENDED}
 899        },
 900        .dependencies = BIT(HYPERV_FEAT_VPINDEX)
 901    },
 902    [HYPERV_FEAT_EVMCS] = {
 903        .desc = "enlightened VMCS (hv-evmcs)",
 904        .flags = {
 905            {.func = HV_CPUID_ENLIGHTMENT_INFO, .reg = R_EAX,
 906             .bits = HV_ENLIGHTENED_VMCS_RECOMMENDED}
 907        },
 908        .dependencies = BIT(HYPERV_FEAT_VAPIC)
 909    },
 910    [HYPERV_FEAT_IPI] = {
 911        .desc = "paravirtualized IPI (hv-ipi)",
 912        .flags = {
 913            {.func = HV_CPUID_ENLIGHTMENT_INFO, .reg = R_EAX,
 914             .bits = HV_CLUSTER_IPI_RECOMMENDED |
 915             HV_EX_PROCESSOR_MASKS_RECOMMENDED}
 916        },
 917        .dependencies = BIT(HYPERV_FEAT_VPINDEX)
 918    },
 919    [HYPERV_FEAT_STIMER_DIRECT] = {
 920        .desc = "direct mode synthetic timers (hv-stimer-direct)",
 921        .flags = {
 922            {.func = HV_CPUID_FEATURES, .reg = R_EDX,
 923             .bits = HV_STIMER_DIRECT_MODE_AVAILABLE}
 924        },
 925        .dependencies = BIT(HYPERV_FEAT_STIMER)
 926    },
 927    [HYPERV_FEAT_AVIC] = {
 928        .desc = "AVIC/APICv support (hv-avic/hv-apicv)",
 929        .flags = {
 930            {.func = HV_CPUID_ENLIGHTMENT_INFO, .reg = R_EAX,
 931             .bits = HV_DEPRECATING_AEOI_RECOMMENDED}
 932        }
 933    },
 934};
 935
 936static struct kvm_cpuid2 *try_get_hv_cpuid(CPUState *cs, int max,
 937                                           bool do_sys_ioctl)
 938{
 939    struct kvm_cpuid2 *cpuid;
 940    int r, size;
 941
 942    size = sizeof(*cpuid) + max * sizeof(*cpuid->entries);
 943    cpuid = g_malloc0(size);
 944    cpuid->nent = max;
 945
 946    if (do_sys_ioctl) {
 947        r = kvm_ioctl(kvm_state, KVM_GET_SUPPORTED_HV_CPUID, cpuid);
 948    } else {
 949        r = kvm_vcpu_ioctl(cs, KVM_GET_SUPPORTED_HV_CPUID, cpuid);
 950    }
 951    if (r == 0 && cpuid->nent >= max) {
 952        r = -E2BIG;
 953    }
 954    if (r < 0) {
 955        if (r == -E2BIG) {
 956            g_free(cpuid);
 957            return NULL;
 958        } else {
 959            fprintf(stderr, "KVM_GET_SUPPORTED_HV_CPUID failed: %s\n",
 960                    strerror(-r));
 961            exit(1);
 962        }
 963    }
 964    return cpuid;
 965}
 966
 967/*
 968 * Run KVM_GET_SUPPORTED_HV_CPUID ioctl(), allocating a buffer large enough
 969 * for all entries.
 970 */
 971static struct kvm_cpuid2 *get_supported_hv_cpuid(CPUState *cs)
 972{
 973    struct kvm_cpuid2 *cpuid;
 974    /* 0x40000000..0x40000005, 0x4000000A, 0x40000080..0x40000082 leaves */
 975    int max = 10;
 976    int i;
 977    bool do_sys_ioctl;
 978
 979    do_sys_ioctl =
 980        kvm_check_extension(kvm_state, KVM_CAP_SYS_HYPERV_CPUID) > 0;
 981
 982    /*
 983     * A non-empty KVM context is needed when KVM_CAP_SYS_HYPERV_CPUID is
 984     * unsupported; kvm_hyperv_expand_features() checks for that.
 985     */
 986    assert(do_sys_ioctl || cs->kvm_state);
 987
 988    /*
 989     * When the buffer is too small, KVM_GET_SUPPORTED_HV_CPUID fails with
 990     * -E2BIG; however, it doesn't report back the required size. Keep
 991     * increasing the buffer and retrying until we succeed.
 992     */
 993    while ((cpuid = try_get_hv_cpuid(cs, max, do_sys_ioctl)) == NULL) {
 994        max++;
 995    }
 996
 997    /*
 998     * KVM_GET_SUPPORTED_HV_CPUID does not set the eVMCS CPUID bit before
 999     * KVM_CAP_HYPERV_ENLIGHTENED_VMCS is enabled, but we want to get the
1000     * information early, so just check for the capability and set the bit
1001     * manually.
1002     */
1003    if (!do_sys_ioctl && kvm_check_extension(cs->kvm_state,
1004                            KVM_CAP_HYPERV_ENLIGHTENED_VMCS) > 0) {
1005        for (i = 0; i < cpuid->nent; i++) {
1006            if (cpuid->entries[i].function == HV_CPUID_ENLIGHTMENT_INFO) {
1007                cpuid->entries[i].eax |= HV_ENLIGHTENED_VMCS_RECOMMENDED;
1008            }
1009        }
1010    }
1011
1012    return cpuid;
1013}
1014
1015/*
1016 * When KVM_GET_SUPPORTED_HV_CPUID is not supported, we fill the CPUID feature
1017 * leaves from the KVM_CAP_HYPERV* capabilities and the present Hyper-V MSRs.
1018 */
1019static struct kvm_cpuid2 *get_supported_hv_cpuid_legacy(CPUState *cs)
1020{
1021    X86CPU *cpu = X86_CPU(cs);
1022    struct kvm_cpuid2 *cpuid;
1023    struct kvm_cpuid_entry2 *entry_feat, *entry_recomm;
1024
1025    /* HV_CPUID_FEATURES, HV_CPUID_ENLIGHTMENT_INFO */
1026    cpuid = g_malloc0(sizeof(*cpuid) + 2 * sizeof(*cpuid->entries));
1027    cpuid->nent = 2;
1028
1029    /* HV_CPUID_FEATURES */
1030    entry_feat = &cpuid->entries[0];
1031    entry_feat->function = HV_CPUID_FEATURES;
1032
1033    entry_recomm = &cpuid->entries[1];
1034    entry_recomm->function = HV_CPUID_ENLIGHTMENT_INFO;
1035    entry_recomm->ebx = cpu->hyperv_spinlock_attempts;
1036
1037    if (kvm_check_extension(cs->kvm_state, KVM_CAP_HYPERV) > 0) {
1038        entry_feat->eax |= HV_HYPERCALL_AVAILABLE;
1039        entry_feat->eax |= HV_APIC_ACCESS_AVAILABLE;
1040        entry_feat->edx |= HV_CPU_DYNAMIC_PARTITIONING_AVAILABLE;
1041        entry_recomm->eax |= HV_RELAXED_TIMING_RECOMMENDED;
1042        entry_recomm->eax |= HV_APIC_ACCESS_RECOMMENDED;
1043    }
1044
1045    if (kvm_check_extension(cs->kvm_state, KVM_CAP_HYPERV_TIME) > 0) {
1046        entry_feat->eax |= HV_TIME_REF_COUNT_AVAILABLE;
1047        entry_feat->eax |= HV_REFERENCE_TSC_AVAILABLE;
1048    }
1049
1050    if (has_msr_hv_frequencies) {
1051        entry_feat->eax |= HV_ACCESS_FREQUENCY_MSRS;
1052        entry_feat->edx |= HV_FREQUENCY_MSRS_AVAILABLE;
1053    }
1054
1055    if (has_msr_hv_crash) {
1056        entry_feat->edx |= HV_GUEST_CRASH_MSR_AVAILABLE;
1057    }
1058
1059    if (has_msr_hv_reenlightenment) {
1060        entry_feat->eax |= HV_ACCESS_REENLIGHTENMENTS_CONTROL;
1061    }
1062
1063    if (has_msr_hv_reset) {
1064        entry_feat->eax |= HV_RESET_AVAILABLE;
1065    }
1066
1067    if (has_msr_hv_vpindex) {
1068        entry_feat->eax |= HV_VP_INDEX_AVAILABLE;
1069    }
1070
1071    if (has_msr_hv_runtime) {
1072        entry_feat->eax |= HV_VP_RUNTIME_AVAILABLE;
1073    }
1074
1075    if (has_msr_hv_synic) {
1076        unsigned int cap = cpu->hyperv_synic_kvm_only ?
1077            KVM_CAP_HYPERV_SYNIC : KVM_CAP_HYPERV_SYNIC2;
1078
1079        if (kvm_check_extension(cs->kvm_state, cap) > 0) {
1080            entry_feat->eax |= HV_SYNIC_AVAILABLE;
1081        }
1082    }
1083
1084    if (has_msr_hv_stimer) {
1085        entry_feat->eax |= HV_SYNTIMERS_AVAILABLE;
1086    }
1087
1088    if (kvm_check_extension(cs->kvm_state,
1089                            KVM_CAP_HYPERV_TLBFLUSH) > 0) {
1090        entry_recomm->eax |= HV_REMOTE_TLB_FLUSH_RECOMMENDED;
1091        entry_recomm->eax |= HV_EX_PROCESSOR_MASKS_RECOMMENDED;
1092    }
1093
1094    if (kvm_check_extension(cs->kvm_state,
1095                            KVM_CAP_HYPERV_ENLIGHTENED_VMCS) > 0) {
1096        entry_recomm->eax |= HV_ENLIGHTENED_VMCS_RECOMMENDED;
1097    }
1098
1099    if (kvm_check_extension(cs->kvm_state,
1100                            KVM_CAP_HYPERV_SEND_IPI) > 0) {
1101        entry_recomm->eax |= HV_CLUSTER_IPI_RECOMMENDED;
1102        entry_recomm->eax |= HV_EX_PROCESSOR_MASKS_RECOMMENDED;
1103    }
1104
1105    return cpuid;
1106}
1107
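/*
 * Return register @reg of the host-supported Hyper-V CPUID leaf @func,
 * filling and caching hv_cpuid_cache on first use.
 */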
1108static uint32_t hv_cpuid_get_host(CPUState *cs, uint32_t func, int reg)
1109{
1110    struct kvm_cpuid_entry2 *entry;
1111    struct kvm_cpuid2 *cpuid;
1112
1113    if (hv_cpuid_cache) {
1114        cpuid = hv_cpuid_cache;
1115    } else {
1116        if (kvm_check_extension(kvm_state, KVM_CAP_HYPERV_CPUID) > 0) {
1117            cpuid = get_supported_hv_cpuid(cs);
1118        } else {
1119            /*
1120             * 'cs->kvm_state' may be NULL when Hyper-V features are expanded
1121             * before the KVM context is created, but this is only done when
1122             * KVM_CAP_SYS_HYPERV_CPUID is supported, and that implies
1123             * KVM_CAP_HYPERV_CPUID.
1124             */
1125            assert(cs->kvm_state);
1126
1127            cpuid = get_supported_hv_cpuid_legacy(cs);
1128        }
1129        hv_cpuid_cache = cpuid;
1130    }
1131
1132    if (!cpuid) {
1133        return 0;
1134    }
1135
1136    entry = cpuid_find_entry(cpuid, func, 0);
1137    if (!entry) {
1138        return 0;
1139    }
1140
1141    return cpuid_entry_get_reg(entry, reg);
1142}
1143
1144static bool hyperv_feature_supported(CPUState *cs, int feature)
1145{
1146    uint32_t func, bits;
1147    int i, reg;
1148
1149    for (i = 0; i < ARRAY_SIZE(kvm_hyperv_properties[feature].flags); i++) {
1150
1151        func = kvm_hyperv_properties[feature].flags[i].func;
1152        reg = kvm_hyperv_properties[feature].flags[i].reg;
1153        bits = kvm_hyperv_properties[feature].flags[i].bits;
1154
1155        if (!func) {
1156            continue;
1157        }
1158
1159        if ((hv_cpuid_get_host(cs, func, reg) & bits) != bits) {
1160            return false;
1161        }
1162    }
1163
1164    return true;
1165}
1166
1167/* Checks that all feature dependencies are enabled */
1168static bool hv_feature_check_deps(X86CPU *cpu, int feature, Error **errp)
1169{
1170    uint64_t deps;
1171    int dep_feat;
1172
1173    deps = kvm_hyperv_properties[feature].dependencies;
1174    while (deps) {
1175        dep_feat = ctz64(deps);
1176        if (!(hyperv_feat_enabled(cpu, dep_feat))) {
1177            error_setg(errp, "Hyper-V %s requires Hyper-V %s",
1178                       kvm_hyperv_properties[feature].desc,
1179                       kvm_hyperv_properties[dep_feat].desc);
1180            return false;
1181        }
1182        deps &= ~(1ull << dep_feat);
1183    }
1184
1185    return true;
1186}
1187
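/*
 * Collect the bits of CPUID leaf @func, register @reg, contributed by all
 * Hyper-V features enabled on @cpu.
 */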
1188static uint32_t hv_build_cpuid_leaf(CPUState *cs, uint32_t func, int reg)
1189{
1190    X86CPU *cpu = X86_CPU(cs);
1191    uint32_t r = 0;
1192    int i, j;
1193
1194    for (i = 0; i < ARRAY_SIZE(kvm_hyperv_properties); i++) {
1195        if (!hyperv_feat_enabled(cpu, i)) {
1196            continue;
1197        }
1198
1199        for (j = 0; j < ARRAY_SIZE(kvm_hyperv_properties[i].flags); j++) {
1200            if (kvm_hyperv_properties[i].flags[j].func != func) {
1201                continue;
1202            }
1203            if (kvm_hyperv_properties[i].flags[j].reg != reg) {
1204                continue;
1205            }
1206
1207            r |= kvm_hyperv_properties[i].flags[j].bits;
1208        }
1209    }
1210
1211    return r;
1212}
1213
1214/*
1215 * Expand Hyper-V CPU features. In particular, check that all the requested
1216 * features are supported by the host and that the configuration is sane
1217 * (all the required dependencies are included). Also, this takes care
1218 * of 'hv-passthrough' mode and fills the environment with all supported
1219 * Hyper-V features.
1220 */
1221bool kvm_hyperv_expand_features(X86CPU *cpu, Error **errp)
1222{
1223    CPUState *cs = CPU(cpu);
1224    Error *local_err = NULL;
1225    int feat;
1226
1227    if (!hyperv_enabled(cpu))
1228        return true;
1229
1230    /*
1231     * When kvm_hyperv_expand_features is called at CPU feature expansion
1232     * time, per-CPU kvm_state is not available yet, so we can only proceed
1233     * when KVM_CAP_SYS_HYPERV_CPUID is supported.
1234     */
1235    if (!cs->kvm_state &&
1236        !kvm_check_extension(kvm_state, KVM_CAP_SYS_HYPERV_CPUID))
1237        return true;
1238
1239    if (cpu->hyperv_passthrough) {
1240        cpu->hyperv_vendor_id[0] =
1241            hv_cpuid_get_host(cs, HV_CPUID_VENDOR_AND_MAX_FUNCTIONS, R_EBX);
1242        cpu->hyperv_vendor_id[1] =
1243            hv_cpuid_get_host(cs, HV_CPUID_VENDOR_AND_MAX_FUNCTIONS, R_ECX);
1244        cpu->hyperv_vendor_id[2] =
1245            hv_cpuid_get_host(cs, HV_CPUID_VENDOR_AND_MAX_FUNCTIONS, R_EDX);
1246        cpu->hyperv_vendor = g_realloc(cpu->hyperv_vendor,
1247                                       sizeof(cpu->hyperv_vendor_id) + 1);
1248        memcpy(cpu->hyperv_vendor, cpu->hyperv_vendor_id,
1249               sizeof(cpu->hyperv_vendor_id));
1250        cpu->hyperv_vendor[sizeof(cpu->hyperv_vendor_id)] = 0;
1251
1252        cpu->hyperv_interface_id[0] =
1253            hv_cpuid_get_host(cs, HV_CPUID_INTERFACE, R_EAX);
1254        cpu->hyperv_interface_id[1] =
1255            hv_cpuid_get_host(cs, HV_CPUID_INTERFACE, R_EBX);
1256        cpu->hyperv_interface_id[2] =
1257            hv_cpuid_get_host(cs, HV_CPUID_INTERFACE, R_ECX);
1258        cpu->hyperv_interface_id[3] =
1259            hv_cpuid_get_host(cs, HV_CPUID_INTERFACE, R_EDX);
1260
1261        cpu->hyperv_ver_id_build =
1262            hv_cpuid_get_host(cs, HV_CPUID_VERSION, R_EAX);
1263        cpu->hyperv_ver_id_major =
1264            hv_cpuid_get_host(cs, HV_CPUID_VERSION, R_EBX) >> 16;
1265        cpu->hyperv_ver_id_minor =
1266            hv_cpuid_get_host(cs, HV_CPUID_VERSION, R_EBX) & 0xffff;
1267        cpu->hyperv_ver_id_sp =
1268            hv_cpuid_get_host(cs, HV_CPUID_VERSION, R_ECX);
1269        cpu->hyperv_ver_id_sb =
1270            hv_cpuid_get_host(cs, HV_CPUID_VERSION, R_EDX) >> 24;
1271        cpu->hyperv_ver_id_sn =
1272            hv_cpuid_get_host(cs, HV_CPUID_VERSION, R_EDX) & 0xffffff;
1273
1274        cpu->hv_max_vps = hv_cpuid_get_host(cs, HV_CPUID_IMPLEMENT_LIMITS,
1275                                            R_EAX);
1276        cpu->hyperv_limits[0] =
1277            hv_cpuid_get_host(cs, HV_CPUID_IMPLEMENT_LIMITS, R_EBX);
1278        cpu->hyperv_limits[1] =
1279            hv_cpuid_get_host(cs, HV_CPUID_IMPLEMENT_LIMITS, R_ECX);
1280        cpu->hyperv_limits[2] =
1281            hv_cpuid_get_host(cs, HV_CPUID_IMPLEMENT_LIMITS, R_EDX);
1282
1283        cpu->hyperv_spinlock_attempts =
1284            hv_cpuid_get_host(cs, HV_CPUID_ENLIGHTMENT_INFO, R_EBX);
1285
1286        /*
1287         * Mark feature as enabled in 'cpu->hyperv_features' as
1288         * hv_build_cpuid_leaf() uses this info to build guest CPUIDs.
1289         */
1290        for (feat = 0; feat < ARRAY_SIZE(kvm_hyperv_properties); feat++) {
1291            if (hyperv_feature_supported(cs, feat)) {
1292                cpu->hyperv_features |= BIT(feat);
1293            }
1294        }
1295    } else {
1296        /* Check features availability and dependencies */
1297        for (feat = 0; feat < ARRAY_SIZE(kvm_hyperv_properties); feat++) {
1298            /* If the feature was not requested skip it. */
1299            if (!hyperv_feat_enabled(cpu, feat)) {
1300                continue;
1301            }
1302
1303            /* Check if the feature is supported by KVM */
1304            if (!hyperv_feature_supported(cs, feat)) {
1305                error_setg(errp, "Hyper-V %s is not supported by kernel",
1306                           kvm_hyperv_properties[feat].desc);
1307                return false;
1308            }
1309
1310            /* Check dependencies */
1311            if (!hv_feature_check_deps(cpu, feat, &local_err)) {
1312                error_propagate(errp, local_err);
1313                return false;
1314            }
1315        }
1316    }
1317
1318    /* Additional dependencies not covered by kvm_hyperv_properties[] */
1319    if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC) &&
1320        !cpu->hyperv_synic_kvm_only &&
1321        !hyperv_feat_enabled(cpu, HYPERV_FEAT_VPINDEX)) {
1322        error_setg(errp, "Hyper-V %s requires Hyper-V %s",
1323                   kvm_hyperv_properties[HYPERV_FEAT_SYNIC].desc,
1324                   kvm_hyperv_properties[HYPERV_FEAT_VPINDEX].desc);
1325        return false;
1326    }
1327
1328    return true;
1329}
1330
1331/*
1332 * Fill in Hyper-V CPUIDs. Returns the number of entries filled in cpuid_ent.
1333 */
1334static int hyperv_fill_cpuids(CPUState *cs,
1335                              struct kvm_cpuid_entry2 *cpuid_ent)
1336{
1337    X86CPU *cpu = X86_CPU(cs);
1338    struct kvm_cpuid_entry2 *c;
1339    uint32_t cpuid_i = 0;
1340
1341    c = &cpuid_ent[cpuid_i++];
1342    c->function = HV_CPUID_VENDOR_AND_MAX_FUNCTIONS;
1343    c->eax = hyperv_feat_enabled(cpu, HYPERV_FEAT_EVMCS) ?
1344        HV_CPUID_NESTED_FEATURES : HV_CPUID_IMPLEMENT_LIMITS;
1345    c->ebx = cpu->hyperv_vendor_id[0];
1346    c->ecx = cpu->hyperv_vendor_id[1];
1347    c->edx = cpu->hyperv_vendor_id[2];
1348
1349    c = &cpuid_ent[cpuid_i++];
1350    c->function = HV_CPUID_INTERFACE;
1351    c->eax = cpu->hyperv_interface_id[0];
1352    c->ebx = cpu->hyperv_interface_id[1];
1353    c->ecx = cpu->hyperv_interface_id[2];
1354    c->edx = cpu->hyperv_interface_id[3];
1355
1356    c = &cpuid_ent[cpuid_i++];
1357    c->function = HV_CPUID_VERSION;
1358    c->eax = cpu->hyperv_ver_id_build;
1359    c->ebx = (uint32_t)cpu->hyperv_ver_id_major << 16 |
1360        cpu->hyperv_ver_id_minor;
1361    c->ecx = cpu->hyperv_ver_id_sp;
1362    c->edx = (uint32_t)cpu->hyperv_ver_id_sb << 24 |
1363        (cpu->hyperv_ver_id_sn & 0xffffff);
1364
1365    c = &cpuid_ent[cpuid_i++];
1366    c->function = HV_CPUID_FEATURES;
1367    c->eax = hv_build_cpuid_leaf(cs, HV_CPUID_FEATURES, R_EAX);
1368    c->ebx = hv_build_cpuid_leaf(cs, HV_CPUID_FEATURES, R_EBX);
1369    c->edx = hv_build_cpuid_leaf(cs, HV_CPUID_FEATURES, R_EDX);
1370
1371    /* Unconditionally required with any Hyper-V enlightenment */
1372    c->eax |= HV_HYPERCALL_AVAILABLE;
1373
1374    /* SynIC and Vmbus devices require messages/signals hypercalls */
1375    if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC) &&
1376        !cpu->hyperv_synic_kvm_only) {
1377        c->ebx |= HV_POST_MESSAGES | HV_SIGNAL_EVENTS;
1378    }
1379
1380
1381    /* Not exposed by KVM but needed to make CPU hotplug in Windows work */
1382    c->edx |= HV_CPU_DYNAMIC_PARTITIONING_AVAILABLE;
1383
1384    c = &cpuid_ent[cpuid_i++];
1385    c->function = HV_CPUID_ENLIGHTMENT_INFO;
1386    c->eax = hv_build_cpuid_leaf(cs, HV_CPUID_ENLIGHTMENT_INFO, R_EAX);
1387    c->ebx = cpu->hyperv_spinlock_attempts;
1388
1389    if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VAPIC) &&
1390        !hyperv_feat_enabled(cpu, HYPERV_FEAT_AVIC)) {
1391        c->eax |= HV_APIC_ACCESS_RECOMMENDED;
1392    }
1393
1394    if (cpu->hyperv_no_nonarch_cs == ON_OFF_AUTO_ON) {
1395        c->eax |= HV_NO_NONARCH_CORESHARING;
1396    } else if (cpu->hyperv_no_nonarch_cs == ON_OFF_AUTO_AUTO) {
1397        c->eax |= hv_cpuid_get_host(cs, HV_CPUID_ENLIGHTMENT_INFO, R_EAX) &
1398            HV_NO_NONARCH_CORESHARING;
1399    }
1400
1401    c = &cpuid_ent[cpuid_i++];
1402    c->function = HV_CPUID_IMPLEMENT_LIMITS;
1403    c->eax = cpu->hv_max_vps;
1404    c->ebx = cpu->hyperv_limits[0];
1405    c->ecx = cpu->hyperv_limits[1];
1406    c->edx = cpu->hyperv_limits[2];
1407
1408    if (hyperv_feat_enabled(cpu, HYPERV_FEAT_EVMCS)) {
1409        __u32 function;
1410
1411        /* Create zeroed 0x40000006..0x40000009 leaves */
1412        for (function = HV_CPUID_IMPLEMENT_LIMITS + 1;
1413             function < HV_CPUID_NESTED_FEATURES; function++) {
1414            c = &cpuid_ent[cpuid_i++];
1415            c->function = function;
1416        }
1417
1418        c = &cpuid_ent[cpuid_i++];
1419        c->function = HV_CPUID_NESTED_FEATURES;
1420        c->eax = cpu->hyperv_nested[0];
1421    }
1422
1423    return cpuid_i;
1424}
1425
1426static Error *hv_passthrough_mig_blocker;
1427static Error *hv_no_nonarch_cs_mig_blocker;
1428
1429/* Checks that the exposed eVMCS version range is supported by KVM */
1430static bool evmcs_version_supported(uint16_t evmcs_version,
1431                                    uint16_t supported_evmcs_version)
1432{
1433    uint8_t min_version = evmcs_version & 0xff;
1434    uint8_t max_version = evmcs_version >> 8;
1435    uint8_t min_supported_version = supported_evmcs_version & 0xff;
1436    uint8_t max_supported_version = supported_evmcs_version >> 8;
1437
1438    return (min_version >= min_supported_version) &&
1439        (max_version <= max_supported_version);
1440}
1441
1442#define DEFAULT_EVMCS_VERSION ((1 << 8) | 1)
1443
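/*
 * Per-vCPU Hyper-V setup: install migration blockers for 'hv-passthrough'
 * and auto 'hv-no-nonarch-coresharing', check that the kernel's VP_INDEX
 * matches QEMU's when it cannot be set, and enable SynIC, enlightened VMCS
 * and CPUID enforcement in KVM as requested.
 */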
1444static int hyperv_init_vcpu(X86CPU *cpu)
1445{
1446    CPUState *cs = CPU(cpu);
1447    Error *local_err = NULL;
1448    int ret;
1449
1450    if (cpu->hyperv_passthrough && hv_passthrough_mig_blocker == NULL) {
1451        error_setg(&hv_passthrough_mig_blocker,
1452                   "'hv-passthrough' CPU flag prevents migration, use explicit"
1453                   " set of hv-* flags instead");
1454        ret = migrate_add_blocker(hv_passthrough_mig_blocker, &local_err);
1455        if (ret < 0) {
1456            error_report_err(local_err);
1457            return ret;
1458        }
1459    }
1460
1461    if (cpu->hyperv_no_nonarch_cs == ON_OFF_AUTO_AUTO &&
1462        hv_no_nonarch_cs_mig_blocker == NULL) {
1463        error_setg(&hv_no_nonarch_cs_mig_blocker,
1464                   "'hv-no-nonarch-coresharing=auto' CPU flag prevents migration,"
1465                   " use explicit 'hv-no-nonarch-coresharing=on' instead (but"
1466                   " make sure SMT is disabled and/or that vCPUs are properly"
1467                   " pinned)");
1468        ret = migrate_add_blocker(hv_no_nonarch_cs_mig_blocker, &local_err);
1469        if (ret < 0) {
1470            error_report_err(local_err);
1471            return ret;
1472        }
1473    }
1474
1475    if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VPINDEX) && !hv_vpindex_settable) {
1476        /*
1477         * the kernel doesn't support setting vp_index; assert that its value
1478         * is in sync
1479         */
1480        struct {
1481            struct kvm_msrs info;
1482            struct kvm_msr_entry entries[1];
1483        } msr_data = {
1484            .info.nmsrs = 1,
1485            .entries[0].index = HV_X64_MSR_VP_INDEX,
1486        };
1487
1488        ret = kvm_vcpu_ioctl(cs, KVM_GET_MSRS, &msr_data);
1489        if (ret < 0) {
1490            return ret;
1491        }
1492        assert(ret == 1);
1493
1494        if (msr_data.entries[0].data != hyperv_vp_index(CPU(cpu))) {
1495            error_report("kernel's vp_index != QEMU's vp_index");
1496            return -ENXIO;
1497        }
1498    }
1499
1500    if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC)) {
1501        uint32_t synic_cap = cpu->hyperv_synic_kvm_only ?
1502            KVM_CAP_HYPERV_SYNIC : KVM_CAP_HYPERV_SYNIC2;
1503        ret = kvm_vcpu_enable_cap(cs, synic_cap, 0);
1504        if (ret < 0) {
1505            error_report("failed to turn on HyperV SynIC in KVM: %s",
1506                         strerror(-ret));
1507            return ret;
1508        }
1509
1510        if (!cpu->hyperv_synic_kvm_only) {
1511            ret = hyperv_x86_synic_add(cpu);
1512            if (ret < 0) {
1513                error_report("failed to create HyperV SynIC: %s",
1514                             strerror(-ret));
1515                return ret;
1516            }
1517        }
1518    }
1519
1520    if (hyperv_feat_enabled(cpu, HYPERV_FEAT_EVMCS)) {
1521        uint16_t evmcs_version = DEFAULT_EVMCS_VERSION;
1522        uint16_t supported_evmcs_version;
1523
1524        ret = kvm_vcpu_enable_cap(cs, KVM_CAP_HYPERV_ENLIGHTENED_VMCS, 0,
1525                                  (uintptr_t)&supported_evmcs_version);
1526
1527        /*
1528         * KVM is required to support eVMCS ver.1 as that's what 'hv-evmcs'
1529         * option sets. Note: we hardcode the maximum supported eVMCS version
1530         * to '1' as well so 'hv-evmcs' feature is migratable even when (and if)
1531         * ver.2 is implemented. A new option (e.g. 'hv-evmcs=2') will then have
1532         * to be added.
1533         */
1534        if (ret < 0) {
1535            error_report("Hyper-V %s is not supported by kernel",
1536                         kvm_hyperv_properties[HYPERV_FEAT_EVMCS].desc);
1537            return ret;
1538        }
1539
1540        if (!evmcs_version_supported(evmcs_version, supported_evmcs_version)) {
1541            error_report("eVMCS version range [%d..%d] is not supported by "
1542                         "kernel (supported: [%d..%d])", evmcs_version & 0xff,
1543                         evmcs_version >> 8, supported_evmcs_version & 0xff,
1544                         supported_evmcs_version >> 8);
1545            return -ENOTSUP;
1546        }
1547
1548        cpu->hyperv_nested[0] = evmcs_version;
1549    }
1550
1551    if (cpu->hyperv_enforce_cpuid) {
1552        ret = kvm_vcpu_enable_cap(cs, KVM_CAP_HYPERV_ENFORCE_CPUID, 0, 1);
1553        if (ret < 0) {
1554            error_report("failed to enable KVM_CAP_HYPERV_ENFORCE_CPUID: %s",
1555                         strerror(-ret));
1556            return ret;
1557        }
1558    }
1559
1560    return 0;
1561}
1562
1563static Error *invtsc_mig_blocker;
1564
1565#define KVM_MAX_CPUID_ENTRIES  100
1566
1567int kvm_arch_init_vcpu(CPUState *cs)
1568{
1569    struct {
1570        struct kvm_cpuid2 cpuid;
1571        struct kvm_cpuid_entry2 entries[KVM_MAX_CPUID_ENTRIES];
1572    } cpuid_data;
1573    /*
1574     * The kernel defines these structs with padding fields so there
1575     * should be no extra padding in our cpuid_data struct.
1576     */
1577    QEMU_BUILD_BUG_ON(sizeof(cpuid_data) !=
1578                      sizeof(struct kvm_cpuid2) +
1579                      sizeof(struct kvm_cpuid_entry2) * KVM_MAX_CPUID_ENTRIES);
1580
1581    X86CPU *cpu = X86_CPU(cs);
1582    CPUX86State *env = &cpu->env;
1583    uint32_t limit, i, j, cpuid_i;
1584    uint32_t unused;
1585    struct kvm_cpuid_entry2 *c;
1586    uint32_t signature[3];
1587    int kvm_base = KVM_CPUID_SIGNATURE;
1588    int max_nested_state_len;
1589    int r;
1590    Error *local_err = NULL;
1591
1592    memset(&cpuid_data, 0, sizeof(cpuid_data));
1593
1594    cpuid_i = 0;
1595
1596    r = kvm_arch_set_tsc_khz(cs);
1597    if (r < 0) {
1598        return r;
1599    }
1600
1601    /* The vCPU's TSC frequency is either specified by the user, or follows
1602     * the value used by KVM if the former is not present. In the
1603     * latter case, we query it from KVM and record it in env->tsc_khz,
1604     * so that the vCPU's TSC frequency can be migrated later via this field.
1605     */
1606    if (!env->tsc_khz) {
1607        r = kvm_check_extension(cs->kvm_state, KVM_CAP_GET_TSC_KHZ) ?
1608            kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ) :
1609            -ENOTSUP;
1610        if (r > 0) {
1611            env->tsc_khz = r;
1612        }
1613    }
1614
1615    env->apic_bus_freq = KVM_APIC_BUS_FREQUENCY;
1616
1617    /*
1618     * kvm_hyperv_expand_features() is called here for the second time in case
1619     * KVM_CAP_SYS_HYPERV_CPUID is not supported. While we can't possibly handle
1620     * 'query-cpu-model-expansion' in this case as we don't have a KVM vCPU to
1621     * check which Hyper-V enlightenments are supported and which are not, we
1622     * can still proceed and check/expand Hyper-V enlightenments here so legacy
1623     * behavior is preserved.
1624     */
1625    if (!kvm_hyperv_expand_features(cpu, &local_err)) {
1626        error_report_err(local_err);
1627        return -ENOSYS;
1628    }
1629
1630    if (hyperv_enabled(cpu)) {
1631        r = hyperv_init_vcpu(cpu);
1632        if (r) {
1633            return r;
1634        }
1635
1636        cpuid_i = hyperv_fill_cpuids(cs, cpuid_data.entries);
1637        kvm_base = KVM_CPUID_SIGNATURE_NEXT;
1638        has_msr_hv_hypercall = true;
1639    }
1640
1641    if (cpu->expose_kvm) {
1642        memcpy(signature, "KVMKVMKVM\0\0\0", 12);
1643        c = &cpuid_data.entries[cpuid_i++];
1644        c->function = KVM_CPUID_SIGNATURE | kvm_base;
1645        c->eax = KVM_CPUID_FEATURES | kvm_base;
1646        c->ebx = signature[0];
1647        c->ecx = signature[1];
1648        c->edx = signature[2];
1649
1650        c = &cpuid_data.entries[cpuid_i++];
1651        c->function = KVM_CPUID_FEATURES | kvm_base;
1652        c->eax = env->features[FEAT_KVM];
1653        c->edx = env->features[FEAT_KVM_HINTS];
1654    }
1655
1656    cpu_x86_cpuid(env, 0, 0, &limit, &unused, &unused, &unused);
1657
1658    if (cpu->kvm_pv_enforce_cpuid) {
1659        r = kvm_vcpu_enable_cap(cs, KVM_CAP_ENFORCE_PV_FEATURE_CPUID, 0, 1);
1660        if (r < 0) {
1661            fprintf(stderr,
1662                    "failed to enable KVM_CAP_ENFORCE_PV_FEATURE_CPUID: %s",
1663                    strerror(-r));
1664            abort();
1665        }
1666    }
1667
1668    for (i = 0; i <= limit; i++) {
1669        if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
1670            fprintf(stderr, "unsupported level value: 0x%x\n", limit);
1671            abort();
1672        }
1673        c = &cpuid_data.entries[cpuid_i++];
1674
1675        switch (i) {
1676        case 2: {
1677            /* Keep reading function 2 until all the input is received */
1678            int times;
1679
1680            c->function = i;
1681            c->flags = KVM_CPUID_FLAG_STATEFUL_FUNC |
1682                       KVM_CPUID_FLAG_STATE_READ_NEXT;
1683            cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
1684            times = c->eax & 0xff;
1685
1686            for (j = 1; j < times; ++j) {
1687                if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
1688                    fprintf(stderr, "cpuid_data is full, no space for "
1689                            "cpuid(eax:2):eax & 0xff = 0x%x\n", times);
1690                    abort();
1691                }
1692                c = &cpuid_data.entries[cpuid_i++];
1693                c->function = i;
1694                c->flags = KVM_CPUID_FLAG_STATEFUL_FUNC;
1695                cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
1696            }
1697            break;
1698        }
1699        case 0x1f:
1700            if (env->nr_dies < 2) {
1701                break;
1702            }
1703            /* fallthrough */
1704        case 4:
1705        case 0xb:
1706        case 0xd:
1707            for (j = 0; ; j++) {
1708                if (i == 0xd && j == 64) {
1709                    break;
1710                }
1711
1712                if (i == 0x1f && j == 64) {
1713                    break;
1714                }
1715
1716                c->function = i;
1717                c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1718                c->index = j;
1719                cpu_x86_cpuid(env, i, j, &c->eax, &c->ebx, &c->ecx, &c->edx);
1720
1721                if (i == 4 && c->eax == 0) {
1722                    break;
1723                }
1724                if (i == 0xb && !(c->ecx & 0xff00)) {
1725                    break;
1726                }
1727                if (i == 0x1f && !(c->ecx & 0xff00)) {
1728                    break;
1729                }
1730                if (i == 0xd && c->eax == 0) {
1731                    continue;
1732                }
1733                if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
1734                    fprintf(stderr, "cpuid_data is full, no space for "
1735                            "cpuid(eax:0x%x,ecx:0x%x)\n", i, j);
1736                    abort();
1737                }
1738                c = &cpuid_data.entries[cpuid_i++];
1739            }
1740            break;
1741        case 0x7:
1742        case 0x12:
1743            for (j = 0; ; j++) {
1744                c->function = i;
1745                c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1746                c->index = j;
1747                cpu_x86_cpuid(env, i, j, &c->eax, &c->ebx, &c->ecx, &c->edx);
1748
1749                if (j > 1 && (c->eax & 0xf) != 1) {
1750                    break;
1751                }
1752
1753                if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
1754                    fprintf(stderr, "cpuid_data is full, no space for "
1755                                "cpuid(eax:0x%x,ecx:0x%x)\n", i, j);
1756                    abort();
1757                }
1758                c = &cpuid_data.entries[cpuid_i++];
1759            }
1760            break;
1761        case 0x14: {
1762            uint32_t times;
1763
1764            c->function = i;
1765            c->index = 0;
1766            c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1767            cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
1768            times = c->eax;
1769
1770            for (j = 1; j <= times; ++j) {
1771                if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
1772                    fprintf(stderr, "cpuid_data is full, no space for "
1773                                "cpuid(eax:0x%x,ecx:0x%x)\n", i, j);
1774                    abort();
1775                }
1776                c = &cpuid_data.entries[cpuid_i++];
1777                c->function = i;
1778                c->index = j;
1779                c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1780                cpu_x86_cpuid(env, i, j, &c->eax, &c->ebx, &c->ecx, &c->edx);
1781            }
1782            break;
1783        }
1784        default:
1785            c->function = i;
1786            c->flags = 0;
1787            cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
1788            if (!c->eax && !c->ebx && !c->ecx && !c->edx) {
1789                /*
1790                 * KVM already returns all zeroes if a CPUID entry is missing,
1791                 * so we can omit it and avoid hitting KVM's 80-entry limit.
1792                 */
1793                cpuid_i--;
1794            }
1795            break;
1796        }
1797    }
1798
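        /*
         * CPUID leaf 0xA describes the architectural PMU: version in
         * EAX[7:0], number of general-purpose counters in EAX[15:8] and,
         * from version 2 on, number of fixed counters in EDX[4:0].  The
         * counts are clamped to QEMU's MAX_GP_COUNTERS/MAX_FIXED_COUNTERS.
         */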
1799    if (limit >= 0x0a) {
1800        uint32_t eax, edx;
1801
1802        cpu_x86_cpuid(env, 0x0a, 0, &eax, &unused, &unused, &edx);
1803
1804        has_architectural_pmu_version = eax & 0xff;
1805        if (has_architectural_pmu_version > 0) {
1806            num_architectural_pmu_gp_counters = (eax & 0xff00) >> 8;
1807
1808            /* Shouldn't be more than 32, since that's the number of bits
1809             * available in EBX to tell us _which_ counters are available.
1810             * Play it safe.
1811             */
1812            if (num_architectural_pmu_gp_counters > MAX_GP_COUNTERS) {
1813                num_architectural_pmu_gp_counters = MAX_GP_COUNTERS;
1814            }
1815
1816            if (has_architectural_pmu_version > 1) {
1817                num_architectural_pmu_fixed_counters = edx & 0x1f;
1818
1819                if (num_architectural_pmu_fixed_counters > MAX_FIXED_COUNTERS) {
1820                    num_architectural_pmu_fixed_counters = MAX_FIXED_COUNTERS;
1821                }
1822            }
1823        }
1824    }
1825
1826    cpu_x86_cpuid(env, 0x80000000, 0, &limit, &unused, &unused, &unused);
1827
1828    for (i = 0x80000000; i <= limit; i++) {
1829        if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
1830            fprintf(stderr, "unsupported xlevel value: 0x%x\n", limit);
1831            abort();
1832        }
1833        c = &cpuid_data.entries[cpuid_i++];
1834
1835        switch (i) {
1836        case 0x8000001d:
1837            /* Query for all AMD cache information leaves */
1838            for (j = 0; ; j++) {
1839                c->function = i;
1840                c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1841                c->index = j;
1842                cpu_x86_cpuid(env, i, j, &c->eax, &c->ebx, &c->ecx, &c->edx);
1843
1844                if (c->eax == 0) {
1845                    break;
1846                }
1847                if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
1848                    fprintf(stderr, "cpuid_data is full, no space for "
1849                            "cpuid(eax:0x%x,ecx:0x%x)\n", i, j);
1850                    abort();
1851                }
1852                c = &cpuid_data.entries[cpuid_i++];
1853            }
1854            break;
1855        default:
1856            c->function = i;
1857            c->flags = 0;
1858            cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
1859            if (!c->eax && !c->ebx && !c->ecx && !c->edx) {
1860                /*
1861                 * KVM already returns all zeroes if a CPUID entry is missing,
1862                 * so we can omit it and avoid hitting KVM's 80-entry limit.
1863                 */
1864                cpuid_i--;
1865            }
1866            break;
1867        }
1868    }
1869
1870    /* Call Centaur's CPUID instructions if they are supported. */
1871    if (env->cpuid_xlevel2 > 0) {
1872        cpu_x86_cpuid(env, 0xC0000000, 0, &limit, &unused, &unused, &unused);
1873
1874        for (i = 0xC0000000; i <= limit; i++) {
1875            if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
1876                fprintf(stderr, "unsupported xlevel2 value: 0x%x\n", limit);
1877                abort();
1878            }
1879            c = &cpuid_data.entries[cpuid_i++];
1880
1881            c->function = i;
1882            c->flags = 0;
1883            cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
1884        }
1885    }
1886
1887    cpuid_data.cpuid.nent = cpuid_i;
1888
1889    if (((env->cpuid_version >> 8)&0xF) >= 6
1890        && (env->features[FEAT_1_EDX] & (CPUID_MCE | CPUID_MCA)) ==
1891           (CPUID_MCE | CPUID_MCA)
1892        && kvm_check_extension(cs->kvm_state, KVM_CAP_MCE) > 0) {
1893        uint64_t mcg_cap, unsupported_caps;
1894        int banks;
1895        int ret;
1896
1897        ret = kvm_get_mce_cap_supported(cs->kvm_state, &mcg_cap, &banks);
1898        if (ret < 0) {
1899            fprintf(stderr, "kvm_get_mce_cap_supported: %s", strerror(-ret));
1900            return ret;
1901        }
1902
1903        if (banks < (env->mcg_cap & MCG_CAP_BANKS_MASK)) {
1904            error_report("kvm: Unsupported MCE bank count (QEMU = %d, KVM = %d)",
1905                         (int)(env->mcg_cap & MCG_CAP_BANKS_MASK), banks);
1906            return -ENOTSUP;
1907        }
1908
1909        unsupported_caps = env->mcg_cap & ~(mcg_cap | MCG_CAP_BANKS_MASK);
1910        if (unsupported_caps) {
1911            if (unsupported_caps & MCG_LMCE_P) {
1912                error_report("kvm: LMCE not supported");
1913                return -ENOTSUP;
1914            }
1915            warn_report("Unsupported MCG_CAP bits: 0x%" PRIx64,
1916                        unsupported_caps);
1917        }
1918
1919        env->mcg_cap &= mcg_cap | MCG_CAP_BANKS_MASK;
1920        ret = kvm_vcpu_ioctl(cs, KVM_X86_SETUP_MCE, &env->mcg_cap);
1921        if (ret < 0) {
1922            fprintf(stderr, "KVM_X86_SETUP_MCE: %s", strerror(-ret));
1923            return ret;
1924        }
1925    }
1926
1927    cpu->vmsentry = qemu_add_vm_change_state_handler(cpu_update_state, env);
1928
1929    c = cpuid_find_entry(&cpuid_data.cpuid, 1, 0);
1930    if (c) {
1931        has_msr_feature_control = !!(c->ecx & CPUID_EXT_VMX) ||
1932                                  !!(c->ecx & CPUID_EXT_SMX);
1933    }
1934
1935    c = cpuid_find_entry(&cpuid_data.cpuid, 7, 0);
1936    if (c && (c->ebx & CPUID_7_0_EBX_SGX)) {
1937        has_msr_feature_control = true;
1938    }
1939
1940    if (env->mcg_cap & MCG_LMCE_P) {
1941        has_msr_mcg_ext_ctl = has_msr_feature_control = true;
1942    }
1943
1944    if (!env->user_tsc_khz) {
1945        if ((env->features[FEAT_8000_0007_EDX] & CPUID_APM_INVTSC) &&
1946            invtsc_mig_blocker == NULL) {
1947            error_setg(&invtsc_mig_blocker,
1948                       "State blocked by non-migratable CPU device"
1949                       " (invtsc flag)");
1950            r = migrate_add_blocker(invtsc_mig_blocker, &local_err);
1951            if (r < 0) {
1952                error_report_err(local_err);
1953                return r;
1954            }
1955        }
1956    }
1957
1958    if (cpu->vmware_cpuid_freq
1959        /* Guests depend on 0x40000000 to detect this feature, so only expose
1960         * it if KVM exposes leaf 0x40000000. (Conflicts with Hyper-V) */
1961        && cpu->expose_kvm
1962        && kvm_base == KVM_CPUID_SIGNATURE
1963        /* TSC clock must be stable and known for this feature. */
1964        && tsc_is_stable_and_known(env)) {
1965
1966        c = &cpuid_data.entries[cpuid_i++];
1967        c->function = KVM_CPUID_SIGNATURE | 0x10;
1968        c->eax = env->tsc_khz;
1969        c->ebx = env->apic_bus_freq / 1000; /* Hz to KHz */
1970        c->ecx = c->edx = 0;
1971
1972        c = cpuid_find_entry(&cpuid_data.cpuid, kvm_base, 0);
1973        c->eax = MAX(c->eax, KVM_CPUID_SIGNATURE | 0x10);
1974    }
1975
1976    cpuid_data.cpuid.nent = cpuid_i;
1977
1978    cpuid_data.cpuid.padding = 0;
1979    r = kvm_vcpu_ioctl(cs, KVM_SET_CPUID2, &cpuid_data);
1980    if (r) {
1981        goto fail;
1982    }
1983
1984    if (has_xsave) {
1985        env->xsave_buf_len = sizeof(struct kvm_xsave);
1986        env->xsave_buf = qemu_memalign(4096, env->xsave_buf_len);
1987        memset(env->xsave_buf, 0, env->xsave_buf_len);
1988
1989        /*
1990         * The allocated storage must be large enough for all of the
1991         * possible XSAVE state components.
1992         */
1993        assert(kvm_arch_get_supported_cpuid(kvm_state, 0xd, 0, R_ECX)
1994               <= env->xsave_buf_len);
1995    }
1996
1997    max_nested_state_len = kvm_max_nested_state_length();
1998    if (max_nested_state_len > 0) {
1999        assert(max_nested_state_len >= offsetof(struct kvm_nested_state, data));
2000
2001        if (cpu_has_vmx(env) || cpu_has_svm(env)) {
2002            struct kvm_vmx_nested_state_hdr *vmx_hdr;
2003
2004            env->nested_state = g_malloc0(max_nested_state_len);
2005            env->nested_state->size = max_nested_state_len;
2006
2007            if (cpu_has_vmx(env)) {
2008                env->nested_state->format = KVM_STATE_NESTED_FORMAT_VMX;
2009                vmx_hdr = &env->nested_state->hdr.vmx;
2010                vmx_hdr->vmxon_pa = -1ull;
2011                vmx_hdr->vmcs12_pa = -1ull;
2012            } else {
2013                env->nested_state->format = KVM_STATE_NESTED_FORMAT_SVM;
2014            }
2015        }
2016    }
2017
2018    cpu->kvm_msr_buf = g_malloc0(MSR_BUF_SIZE);
2019
2020    if (!(env->features[FEAT_8000_0001_EDX] & CPUID_EXT2_RDTSCP)) {
2021        has_msr_tsc_aux = false;
2022    }
2023
2024    kvm_init_msrs(cpu);
2025
2026    return 0;
2027
2028 fail:
2029    migrate_del_blocker(invtsc_mig_blocker);
2030
2031    return r;
2032}
2033
2034int kvm_arch_destroy_vcpu(CPUState *cs)
2035{
2036    X86CPU *cpu = X86_CPU(cs);
2037    CPUX86State *env = &cpu->env;
2038
2039    if (cpu->kvm_msr_buf) {
2040        g_free(cpu->kvm_msr_buf);
2041        cpu->kvm_msr_buf = NULL;
2042    }
2043
2044    if (env->nested_state) {
2045        g_free(env->nested_state);
2046        env->nested_state = NULL;
2047    }
2048
2049    qemu_del_vm_change_state_handler(cpu->vmsentry);
2050
2051    return 0;
2052}
2053
2054void kvm_arch_reset_vcpu(X86CPU *cpu)
2055{
2056    CPUX86State *env = &cpu->env;
2057
2058    env->xcr0 = 1;
2059    if (kvm_irqchip_in_kernel()) {
2060        env->mp_state = cpu_is_bsp(cpu) ? KVM_MP_STATE_RUNNABLE :
2061                                          KVM_MP_STATE_UNINITIALIZED;
2062    } else {
2063        env->mp_state = KVM_MP_STATE_RUNNABLE;
2064    }
2065
2066    if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC)) {
2067        int i;
2068        for (i = 0; i < ARRAY_SIZE(env->msr_hv_synic_sint); i++) {
2069            env->msr_hv_synic_sint[i] = HV_SINT_MASKED;
2070        }
2071
2072        hyperv_x86_synic_reset(cpu);
2073    }
2074    /* enabled by default */
2075    env->poll_control_msr = 1;
2076
2077    sev_es_set_reset_vector(CPU(cpu));
2078}
2079
2080void kvm_arch_do_init_vcpu(X86CPU *cpu)
2081{
2082    CPUX86State *env = &cpu->env;
2083
2084    /* APs get directly into wait-for-SIPI state.  */
2085    if (env->mp_state == KVM_MP_STATE_UNINITIALIZED) {
2086        env->mp_state = KVM_MP_STATE_INIT_RECEIVED;
2087    }
2088}
2089
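    /*
     * Cache the feature MSR index list (KVM_GET_MSR_FEATURE_INDEX_LIST) in
     * kvm_feature_msrs.  The first ioctl is issued with nmsrs = 0 and is
     * expected to fail with E2BIG while filling in the required count; the
     * second call retrieves the actual indices.
     */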
2090static int kvm_get_supported_feature_msrs(KVMState *s)
2091{
2092    int ret = 0;
2093
2094    if (kvm_feature_msrs != NULL) {
2095        return 0;
2096    }
2097
2098    if (!kvm_check_extension(s, KVM_CAP_GET_MSR_FEATURES)) {
2099        return 0;
2100    }
2101
2102    struct kvm_msr_list msr_list;
2103
2104    msr_list.nmsrs = 0;
2105    ret = kvm_ioctl(s, KVM_GET_MSR_FEATURE_INDEX_LIST, &msr_list);
2106    if (ret < 0 && ret != -E2BIG) {
2107        error_report("Failed to fetch the KVM feature MSR list: %s",
2108            strerror(-ret));
2109        return ret;
2110    }
2111
2112    assert(msr_list.nmsrs > 0);
2113    kvm_feature_msrs = (struct kvm_msr_list *) \
2114        g_malloc0(sizeof(msr_list) +
2115                 msr_list.nmsrs * sizeof(msr_list.indices[0]));
2116
2117    kvm_feature_msrs->nmsrs = msr_list.nmsrs;
2118    ret = kvm_ioctl(s, KVM_GET_MSR_FEATURE_INDEX_LIST, kvm_feature_msrs);
2119
2120    if (ret < 0) {
2121        error_report("Failed to fetch the KVM feature MSR list: %s",
2122            strerror(-ret));
2123        g_free(kvm_feature_msrs);
2124        kvm_feature_msrs = NULL;
2125        return ret;
2126    }
2127
2128    return 0;
2129}
2130
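    /*
     * Probe which MSRs KVM can save/restore (KVM_GET_MSR_INDEX_LIST) and
     * record the optional ones in the has_msr_* / has_tsc_scale_msr flags
     * that kvm_init_msrs() and kvm_put_msrs() consult later.
     */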
2131static int kvm_get_supported_msrs(KVMState *s)
2132{
2133    int ret = 0;
2134    struct kvm_msr_list msr_list, *kvm_msr_list;
2135
2136    /*
2137     *  Obtain MSR list from KVM.  These are the MSRs that we must
2138     *  save/restore.
2139     */
2140    msr_list.nmsrs = 0;
2141    ret = kvm_ioctl(s, KVM_GET_MSR_INDEX_LIST, &msr_list);
2142    if (ret < 0 && ret != -E2BIG) {
2143        return ret;
2144    }
2145    /*
2146     * Old kernel modules had a bug and could write beyond the provided
2147     * memory. Allocate at least 1K to be safe.
2148     */
2149    kvm_msr_list = g_malloc0(MAX(1024, sizeof(msr_list) +
2150                                          msr_list.nmsrs *
2151                                          sizeof(msr_list.indices[0])));
2152
2153    kvm_msr_list->nmsrs = msr_list.nmsrs;
2154    ret = kvm_ioctl(s, KVM_GET_MSR_INDEX_LIST, kvm_msr_list);
2155    if (ret >= 0) {
2156        int i;
2157
2158        for (i = 0; i < kvm_msr_list->nmsrs; i++) {
2159            switch (kvm_msr_list->indices[i]) {
2160            case MSR_STAR:
2161                has_msr_star = true;
2162                break;
2163            case MSR_VM_HSAVE_PA:
2164                has_msr_hsave_pa = true;
2165                break;
2166            case MSR_TSC_AUX:
2167                has_msr_tsc_aux = true;
2168                break;
2169            case MSR_TSC_ADJUST:
2170                has_msr_tsc_adjust = true;
2171                break;
2172            case MSR_IA32_TSCDEADLINE:
2173                has_msr_tsc_deadline = true;
2174                break;
2175            case MSR_IA32_SMBASE:
2176                has_msr_smbase = true;
2177                break;
2178            case MSR_SMI_COUNT:
2179                has_msr_smi_count = true;
2180                break;
2181            case MSR_IA32_MISC_ENABLE:
2182                has_msr_misc_enable = true;
2183                break;
2184            case MSR_IA32_BNDCFGS:
2185                has_msr_bndcfgs = true;
2186                break;
2187            case MSR_IA32_XSS:
2188                has_msr_xss = true;
2189                break;
2190            case MSR_IA32_UMWAIT_CONTROL:
2191                has_msr_umwait = true;
2192                break;
2193            case HV_X64_MSR_CRASH_CTL:
2194                has_msr_hv_crash = true;
2195                break;
2196            case HV_X64_MSR_RESET:
2197                has_msr_hv_reset = true;
2198                break;
2199            case HV_X64_MSR_VP_INDEX:
2200                has_msr_hv_vpindex = true;
2201                break;
2202            case HV_X64_MSR_VP_RUNTIME:
2203                has_msr_hv_runtime = true;
2204                break;
2205            case HV_X64_MSR_SCONTROL:
2206                has_msr_hv_synic = true;
2207                break;
2208            case HV_X64_MSR_STIMER0_CONFIG:
2209                has_msr_hv_stimer = true;
2210                break;
2211            case HV_X64_MSR_TSC_FREQUENCY:
2212                has_msr_hv_frequencies = true;
2213                break;
2214            case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
2215                has_msr_hv_reenlightenment = true;
2216                break;
2217            case MSR_IA32_SPEC_CTRL:
2218                has_msr_spec_ctrl = true;
2219                break;
2220            case MSR_AMD64_TSC_RATIO:
2221                has_tsc_scale_msr = true;
2222                break;
2223            case MSR_IA32_TSX_CTRL:
2224                has_msr_tsx_ctrl = true;
2225                break;
2226            case MSR_VIRT_SSBD:
2227                has_msr_virt_ssbd = true;
2228                break;
2229            case MSR_IA32_ARCH_CAPABILITIES:
2230                has_msr_arch_capabs = true;
2231                break;
2232            case MSR_IA32_CORE_CAPABILITY:
2233                has_msr_core_capabs = true;
2234                break;
2235            case MSR_IA32_PERF_CAPABILITIES:
2236                has_msr_perf_capabs = true;
2237                break;
2238            case MSR_IA32_VMX_VMFUNC:
2239                has_msr_vmx_vmfunc = true;
2240                break;
2241            case MSR_IA32_UCODE_REV:
2242                has_msr_ucode_rev = true;
2243                break;
2244            case MSR_IA32_VMX_PROCBASED_CTLS2:
2245                has_msr_vmx_procbased_ctls2 = true;
2246                break;
2247            case MSR_IA32_PKRS:
2248                has_msr_pkrs = true;
2249                break;
2250            }
2251        }
2252    }
2253
2254    g_free(kvm_msr_list);
2255
2256    return ret;
2257}
2258
2259static Notifier smram_machine_done;
2260static KVMMemoryListener smram_listener;
2261static AddressSpace smram_address_space;
2262static MemoryRegion smram_as_root;
2263static MemoryRegion smram_as_mem;
2264
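    /*
     * Build a dedicated address space for SMM: an alias of system memory at
     * low priority with the "/machine/smram" region (if present) overlaid on
     * top, registered with KVM as address space 1.
     */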
2265static void register_smram_listener(Notifier *n, void *unused)
2266{
2267    MemoryRegion *smram =
2268        (MemoryRegion *) object_resolve_path("/machine/smram", NULL);
2269
2270    /* Outer container... */
2271    memory_region_init(&smram_as_root, OBJECT(kvm_state), "mem-container-smram", ~0ull);
2272    memory_region_set_enabled(&smram_as_root, true);
2273
2274    /* ... with two regions inside: normal system memory with low
2275     * priority, and...
2276     */
2277    memory_region_init_alias(&smram_as_mem, OBJECT(kvm_state), "mem-smram",
2278                             get_system_memory(), 0, ~0ull);
2279    memory_region_add_subregion_overlap(&smram_as_root, 0, &smram_as_mem, 0);
2280    memory_region_set_enabled(&smram_as_mem, true);
2281
2282    if (smram) {
2283        /* ... SMRAM with higher priority */
2284        memory_region_add_subregion_overlap(&smram_as_root, 0, smram, 10);
2285        memory_region_set_enabled(smram, true);
2286    }
2287
2288    address_space_init(&smram_address_space, &smram_as_root, "KVM-SMRAM");
2289    kvm_memory_listener_register(kvm_state, &smram_listener,
2290                                 &smram_address_space, 1, "kvm-smram");
2291}
2292
2293int kvm_arch_init(MachineState *ms, KVMState *s)
2294{
2295    uint64_t identity_base = 0xfffbc000;
2296    uint64_t shadow_mem;
2297    int ret;
2298    struct utsname utsname;
2299    Error *local_err = NULL;
2300
2301    /*
2302     * Initialize SEV context, if required
2303     *
2304     * If no memory encryption is requested (ms->cgs == NULL) this is
2305     * a no-op.
2306     *
2307     * It's also a no-op if a non-SEV confidential guest support
2308     * mechanism is selected.  SEV is the only mechanism available to
2309     * select on x86 at present, so this doesn't arise, but if new
2310     * mechanisms are supported in future (e.g. TDX), they'll need
2311     * their own initialization either here or elsewhere.
2312     */
2313    ret = sev_kvm_init(ms->cgs, &local_err);
2314    if (ret < 0) {
2315        error_report_err(local_err);
2316        return ret;
2317    }
2318
2319    if (!kvm_check_extension(s, KVM_CAP_IRQ_ROUTING)) {
2320        error_report("kvm: KVM_CAP_IRQ_ROUTING not supported by KVM");
2321        return -ENOTSUP;
2322    }
2323
2324    has_xsave = kvm_check_extension(s, KVM_CAP_XSAVE);
2325    has_xcrs = kvm_check_extension(s, KVM_CAP_XCRS);
2326    has_pit_state2 = kvm_check_extension(s, KVM_CAP_PIT_STATE2);
2327
2328    hv_vpindex_settable = kvm_check_extension(s, KVM_CAP_HYPERV_VP_INDEX);
2329
2330    has_exception_payload = kvm_check_extension(s, KVM_CAP_EXCEPTION_PAYLOAD);
2331    if (has_exception_payload) {
2332        ret = kvm_vm_enable_cap(s, KVM_CAP_EXCEPTION_PAYLOAD, 0, true);
2333        if (ret < 0) {
2334            error_report("kvm: Failed to enable exception payload cap: %s",
2335                         strerror(-ret));
2336            return ret;
2337        }
2338    }
2339
2340    ret = kvm_get_supported_msrs(s);
2341    if (ret < 0) {
2342        return ret;
2343    }
2344
2345    kvm_get_supported_feature_msrs(s);
2346
2347    uname(&utsname);
2348    lm_capable_kernel = strcmp(utsname.machine, "x86_64") == 0;
2349
2350    /*
2351     * On older Intel CPUs, KVM uses vm86 mode to emulate 16-bit code directly.
2352     * In order to use vm86 mode, an EPT identity map and a TSS are needed.
2353     * Since these must be part of guest physical memory, we need to allocate
2354     * them, both by setting their start addresses in the kernel and by
2355     * creating a corresponding e820 entry. We need 4 pages before the BIOS.
2356     *
2357     * Older KVM versions may not support setting the identity map base. In
2358     * that case we need to stick with the default, i.e. a 256K maximum BIOS
2359     * size.
2360     */
2361    if (kvm_check_extension(s, KVM_CAP_SET_IDENTITY_MAP_ADDR)) {
2362        /* Allows up to 16M BIOSes. */
2363        identity_base = 0xfeffc000;
2364
2365        ret = kvm_vm_ioctl(s, KVM_SET_IDENTITY_MAP_ADDR, &identity_base);
2366        if (ret < 0) {
2367            return ret;
2368        }
2369    }
2370
2371    /* Set TSS base one page after EPT identity map. */
2372    ret = kvm_vm_ioctl(s, KVM_SET_TSS_ADDR, identity_base + 0x1000);
2373    if (ret < 0) {
2374        return ret;
2375    }
2376
2377    /* Tell fw_cfg to notify the BIOS to reserve the range. */
2378    ret = e820_add_entry(identity_base, 0x4000, E820_RESERVED);
2379    if (ret < 0) {
2380        fprintf(stderr, "e820_add_entry() table is full\n");
2381        return ret;
2382    }
2383
2384    shadow_mem = object_property_get_int(OBJECT(s), "kvm-shadow-mem", &error_abort);
2385    if (shadow_mem != -1) {
2386        shadow_mem /= 4096;
2387        ret = kvm_vm_ioctl(s, KVM_SET_NR_MMU_PAGES, shadow_mem);
2388        if (ret < 0) {
2389            return ret;
2390        }
2391    }
2392
2393    if (kvm_check_extension(s, KVM_CAP_X86_SMM) &&
2394        object_dynamic_cast(OBJECT(ms), TYPE_X86_MACHINE) &&
2395        x86_machine_is_smm_enabled(X86_MACHINE(ms))) {
2396        smram_machine_done.notify = register_smram_listener;
2397        qemu_add_machine_init_done_notifier(&smram_machine_done);
2398    }
2399
2400    if (enable_cpu_pm) {
2401        int disable_exits = kvm_check_extension(s, KVM_CAP_X86_DISABLE_EXITS);
2402        int ret;
2403
2404/* Workaround for a kernel header with a typo. TODO: fix header and drop. */
2405#if defined(KVM_X86_DISABLE_EXITS_HTL) && !defined(KVM_X86_DISABLE_EXITS_HLT)
2406#define KVM_X86_DISABLE_EXITS_HLT KVM_X86_DISABLE_EXITS_HTL
2407#endif
2408        if (disable_exits) {
2409            disable_exits &= (KVM_X86_DISABLE_EXITS_MWAIT |
2410                              KVM_X86_DISABLE_EXITS_HLT |
2411                              KVM_X86_DISABLE_EXITS_PAUSE |
2412                              KVM_X86_DISABLE_EXITS_CSTATE);
2413        }
2414
2415        ret = kvm_vm_enable_cap(s, KVM_CAP_X86_DISABLE_EXITS, 0,
2416                                disable_exits);
2417        if (ret < 0) {
2418            error_report("kvm: guest stopping CPU not supported: %s",
2419                         strerror(-ret));
2420        }
2421    }
2422
2423    if (object_dynamic_cast(OBJECT(ms), TYPE_X86_MACHINE)) {
2424        X86MachineState *x86ms = X86_MACHINE(ms);
2425
2426        if (x86ms->bus_lock_ratelimit > 0) {
2427            ret = kvm_check_extension(s, KVM_CAP_X86_BUS_LOCK_EXIT);
2428            if (!(ret & KVM_BUS_LOCK_DETECTION_EXIT)) {
2429                error_report("kvm: bus lock detection unsupported");
2430                return -ENOTSUP;
2431            }
2432            ret = kvm_vm_enable_cap(s, KVM_CAP_X86_BUS_LOCK_EXIT, 0,
2433                                    KVM_BUS_LOCK_DETECTION_EXIT);
2434            if (ret < 0) {
2435                error_report("kvm: Failed to enable bus lock detection cap: %s",
2436                             strerror(-ret));
2437                return ret;
2438            }
2439            ratelimit_init(&bus_lock_ratelimit_ctrl);
2440            ratelimit_set_speed(&bus_lock_ratelimit_ctrl,
2441                                x86ms->bus_lock_ratelimit, BUS_LOCK_SLICE_TIME);
2442        }
2443    }
2444
2445    return 0;
2446}
2447
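    /*
     * In virtual-8086 mode every segment is a present, writable data segment
     * with DPL 3, so only selector/base/limit are taken from the QEMU cache
     * and the remaining attributes are hardcoded.
     */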
2448static void set_v8086_seg(struct kvm_segment *lhs, const SegmentCache *rhs)
2449{
2450    lhs->selector = rhs->selector;
2451    lhs->base = rhs->base;
2452    lhs->limit = rhs->limit;
2453    lhs->type = 3;
2454    lhs->present = 1;
2455    lhs->dpl = 3;
2456    lhs->db = 0;
2457    lhs->s = 1;
2458    lhs->l = 0;
2459    lhs->g = 0;
2460    lhs->avl = 0;
2461    lhs->unusable = 0;
2462}
2463
2464static void set_seg(struct kvm_segment *lhs, const SegmentCache *rhs)
2465{
2466    unsigned flags = rhs->flags;
2467    lhs->selector = rhs->selector;
2468    lhs->base = rhs->base;
2469    lhs->limit = rhs->limit;
2470    lhs->type = (flags >> DESC_TYPE_SHIFT) & 15;
2471    lhs->present = (flags & DESC_P_MASK) != 0;
2472    lhs->dpl = (flags >> DESC_DPL_SHIFT) & 3;
2473    lhs->db = (flags >> DESC_B_SHIFT) & 1;
2474    lhs->s = (flags & DESC_S_MASK) != 0;
2475    lhs->l = (flags >> DESC_L_SHIFT) & 1;
2476    lhs->g = (flags & DESC_G_MASK) != 0;
2477    lhs->avl = (flags & DESC_AVL_MASK) != 0;
2478    lhs->unusable = !lhs->present;
2479    lhs->padding = 0;
2480}
2481
2482static void get_seg(SegmentCache *lhs, const struct kvm_segment *rhs)
2483{
2484    lhs->selector = rhs->selector;
2485    lhs->base = rhs->base;
2486    lhs->limit = rhs->limit;
2487    lhs->flags = (rhs->type << DESC_TYPE_SHIFT) |
2488                 ((rhs->present && !rhs->unusable) * DESC_P_MASK) |
2489                 (rhs->dpl << DESC_DPL_SHIFT) |
2490                 (rhs->db << DESC_B_SHIFT) |
2491                 (rhs->s * DESC_S_MASK) |
2492                 (rhs->l << DESC_L_SHIFT) |
2493                 (rhs->g * DESC_G_MASK) |
2494                 (rhs->avl * DESC_AVL_MASK);
2495}
2496
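    /*
     * Copy one register between QEMU and KVM representations: set != 0
     * writes the QEMU value into the kvm_regs field, set == 0 reads it back.
     */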
2497static void kvm_getput_reg(__u64 *kvm_reg, target_ulong *qemu_reg, int set)
2498{
2499    if (set) {
2500        *kvm_reg = *qemu_reg;
2501    } else {
2502        *qemu_reg = *kvm_reg;
2503    }
2504}
2505
2506static int kvm_getput_regs(X86CPU *cpu, int set)
2507{
2508    CPUX86State *env = &cpu->env;
2509    struct kvm_regs regs;
2510    int ret = 0;
2511
2512    if (!set) {
2513        ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_REGS, &regs);
2514        if (ret < 0) {
2515            return ret;
2516        }
2517    }
2518
2519    kvm_getput_reg(&regs.rax, &env->regs[R_EAX], set);
2520    kvm_getput_reg(&regs.rbx, &env->regs[R_EBX], set);
2521    kvm_getput_reg(&regs.rcx, &env->regs[R_ECX], set);
2522    kvm_getput_reg(&regs.rdx, &env->regs[R_EDX], set);
2523    kvm_getput_reg(&regs.rsi, &env->regs[R_ESI], set);
2524    kvm_getput_reg(&regs.rdi, &env->regs[R_EDI], set);
2525    kvm_getput_reg(&regs.rsp, &env->regs[R_ESP], set);
2526    kvm_getput_reg(&regs.rbp, &env->regs[R_EBP], set);
2527#ifdef TARGET_X86_64
2528    kvm_getput_reg(&regs.r8, &env->regs[8], set);
2529    kvm_getput_reg(&regs.r9, &env->regs[9], set);
2530    kvm_getput_reg(&regs.r10, &env->regs[10], set);
2531    kvm_getput_reg(&regs.r11, &env->regs[11], set);
2532    kvm_getput_reg(&regs.r12, &env->regs[12], set);
2533    kvm_getput_reg(&regs.r13, &env->regs[13], set);
2534    kvm_getput_reg(&regs.r14, &env->regs[14], set);
2535    kvm_getput_reg(&regs.r15, &env->regs[15], set);
2536#endif
2537
2538    kvm_getput_reg(&regs.rflags, &env->eflags, set);
2539    kvm_getput_reg(&regs.rip, &env->eip, set);
2540
2541    if (set) {
2542        ret = kvm_vcpu_ioctl(CPU(cpu), KVM_SET_REGS, &regs);
2543    }
2544
2545    return ret;
2546}
2547
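    /*
     * Legacy FPU upload via KVM_SET_FPU, used when xsave is unavailable (see
     * kvm_put_xsave()).  QEMU's per-register tag state is condensed into the
     * abridged ftwx byte expected by struct kvm_fpu.
     */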
2548static int kvm_put_fpu(X86CPU *cpu)
2549{
2550    CPUX86State *env = &cpu->env;
2551    struct kvm_fpu fpu;
2552    int i;
2553
2554    memset(&fpu, 0, sizeof fpu);
2555    fpu.fsw = env->fpus & ~(7 << 11);
2556    fpu.fsw |= (env->fpstt & 7) << 11;
2557    fpu.fcw = env->fpuc;
2558    fpu.last_opcode = env->fpop;
2559    fpu.last_ip = env->fpip;
2560    fpu.last_dp = env->fpdp;
2561    for (i = 0; i < 8; ++i) {
2562        fpu.ftwx |= (!env->fptags[i]) << i;
2563    }
2564    memcpy(fpu.fpr, env->fpregs, sizeof env->fpregs);
2565    for (i = 0; i < CPU_NB_REGS; i++) {
2566        stq_p(&fpu.xmm[i][0], env->xmm_regs[i].ZMM_Q(0));
2567        stq_p(&fpu.xmm[i][8], env->xmm_regs[i].ZMM_Q(1));
2568    }
2569    fpu.mxcsr = env->mxcsr;
2570
2571    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_FPU, &fpu);
2572}
2573
2574static int kvm_put_xsave(X86CPU *cpu)
2575{
2576    CPUX86State *env = &cpu->env;
2577    void *xsave = env->xsave_buf;
2578
2579    if (!has_xsave) {
2580        return kvm_put_fpu(cpu);
2581    }
2582    x86_cpu_xsave_all_areas(cpu, xsave, env->xsave_buf_len);
2583
2584    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_XSAVE, xsave);
2585}
2586
2587static int kvm_put_xcrs(X86CPU *cpu)
2588{
2589    CPUX86State *env = &cpu->env;
2590    struct kvm_xcrs xcrs = {};
2591
2592    if (!has_xcrs) {
2593        return 0;
2594    }
2595
2596    xcrs.nr_xcrs = 1;
2597    xcrs.flags = 0;
2598    xcrs.xcrs[0].xcr = 0;
2599    xcrs.xcrs[0].value = env->xcr0;
2600    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_XCRS, &xcrs);
2601}
2602
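    /*
     * Translate QEMU's special register state (segments, descriptor tables,
     * control registers, EFER and the APIC base/TPR) into struct kvm_sregs
     * and upload it with KVM_SET_SREGS.  Virtual-8086 mode uses the fixed
     * segment layout from set_v8086_seg().
     */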
2603static int kvm_put_sregs(X86CPU *cpu)
2604{
2605    CPUX86State *env = &cpu->env;
2606    struct kvm_sregs sregs;
2607
2608    memset(sregs.interrupt_bitmap, 0, sizeof(sregs.interrupt_bitmap));
2609    if (env->interrupt_injected >= 0) {
2610        sregs.interrupt_bitmap[env->interrupt_injected / 64] |=
2611                (uint64_t)1 << (env->interrupt_injected % 64);
2612    }
2613
2614    if ((env->eflags & VM_MASK)) {
2615        set_v8086_seg(&sregs.cs, &env->segs[R_CS]);
2616        set_v8086_seg(&sregs.ds, &env->segs[R_DS]);
2617        set_v8086_seg(&sregs.es, &env->segs[R_ES]);
2618        set_v8086_seg(&sregs.fs, &env->segs[R_FS]);
2619        set_v8086_seg(&sregs.gs, &env->segs[R_GS]);
2620        set_v8086_seg(&sregs.ss, &env->segs[R_SS]);
2621    } else {
2622        set_seg(&sregs.cs, &env->segs[R_CS]);
2623        set_seg(&sregs.ds, &env->segs[R_DS]);
2624        set_seg(&sregs.es, &env->segs[R_ES]);
2625        set_seg(&sregs.fs, &env->segs[R_FS]);
2626        set_seg(&sregs.gs, &env->segs[R_GS]);
2627        set_seg(&sregs.ss, &env->segs[R_SS]);
2628    }
2629
2630    set_seg(&sregs.tr, &env->tr);
2631    set_seg(&sregs.ldt, &env->ldt);
2632
2633    sregs.idt.limit = env->idt.limit;
2634    sregs.idt.base = env->idt.base;
2635    memset(sregs.idt.padding, 0, sizeof sregs.idt.padding);
2636    sregs.gdt.limit = env->gdt.limit;
2637    sregs.gdt.base = env->gdt.base;
2638    memset(sregs.gdt.padding, 0, sizeof sregs.gdt.padding);
2639
2640    sregs.cr0 = env->cr[0];
2641    sregs.cr2 = env->cr[2];
2642    sregs.cr3 = env->cr[3];
2643    sregs.cr4 = env->cr[4];
2644
2645    sregs.cr8 = cpu_get_apic_tpr(cpu->apic_state);
2646    sregs.apic_base = cpu_get_apic_base(cpu->apic_state);
2647
2648    sregs.efer = env->efer;
2649
2650    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_SREGS, &sregs);
2651}
2652
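    /*
     * MSR accesses are batched through cpu->kvm_msr_buf: kvm_msr_buf_reset()
     * clears the buffer, kvm_msr_entry_add() appends one index/value pair,
     * and the whole buffer is then flushed with a single KVM_SET_MSRS (or
     * read back with KVM_GET_MSRS) ioctl.
     */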
2653static void kvm_msr_buf_reset(X86CPU *cpu)
2654{
2655    memset(cpu->kvm_msr_buf, 0, MSR_BUF_SIZE);
2656}
2657
2658static void kvm_msr_entry_add(X86CPU *cpu, uint32_t index, uint64_t value)
2659{
2660    struct kvm_msrs *msrs = cpu->kvm_msr_buf;
2661    void *limit = ((void *)msrs) + MSR_BUF_SIZE;
2662    struct kvm_msr_entry *entry = &msrs->entries[msrs->nmsrs];
2663
2664    assert((void *)(entry + 1) <= limit);
2665
2666    entry->index = index;
2667    entry->reserved = 0;
2668    entry->data = value;
2669    msrs->nmsrs++;
2670}
2671
2672static int kvm_put_one_msr(X86CPU *cpu, int index, uint64_t value)
2673{
2674    kvm_msr_buf_reset(cpu);
2675    kvm_msr_entry_add(cpu, index, value);
2676
2677    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MSRS, cpu->kvm_msr_buf);
2678}
2679
2680void kvm_put_apicbase(X86CPU *cpu, uint64_t value)
2681{
2682    int ret;
2683
2684    ret = kvm_put_one_msr(cpu, MSR_IA32_APICBASE, value);
2685    assert(ret == 1);
2686}
2687
2688static int kvm_put_tscdeadline_msr(X86CPU *cpu)
2689{
2690    CPUX86State *env = &cpu->env;
2691    int ret;
2692
2693    if (!has_msr_tsc_deadline) {
2694        return 0;
2695    }
2696
2697    ret = kvm_put_one_msr(cpu, MSR_IA32_TSCDEADLINE, env->tsc_deadline);
2698    if (ret < 0) {
2699        return ret;
2700    }
2701
2702    assert(ret == 1);
2703    return 0;
2704}
2705
2706/*
2707 * Provide a separate write service for the feature control MSR in order to
2708 * kick the VCPU out of VMXON or even guest mode on reset. This has to be done
2709 * before writing any other state because forcibly leaving nested mode
2710 * invalidates the VCPU state.
2711 */
2712static int kvm_put_msr_feature_control(X86CPU *cpu)
2713{
2714    int ret;
2715
2716    if (!has_msr_feature_control) {
2717        return 0;
2718    }
2719
2720    ret = kvm_put_one_msr(cpu, MSR_IA32_FEATURE_CONTROL,
2721                          cpu->env.msr_ia32_feature_control);
2722    if (ret < 0) {
2723        return ret;
2724    }
2725
2726    assert(ret == 1);
2727    return 0;
2728}
2729
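    /*
     * Compose the value of a VMX capability MSR from the guest's feature
     * bits.  The low 32 bits are the allowed-0 settings (a bit is 1 iff the
     * corresponding control must be 1), the high 32 bits are the allowed-1
     * settings (a bit is 1 iff the control may be 1).  default1 holds the
     * controls the architecture defines as default-to-1 for each MSR.
     */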
2730static uint64_t make_vmx_msr_value(uint32_t index, uint32_t features)
2731{
2732    uint32_t default1, can_be_one, can_be_zero;
2733    uint32_t must_be_one;
2734
2735    switch (index) {
2736    case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
2737        default1 = 0x00000016;
2738        break;
2739    case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
2740        default1 = 0x0401e172;
2741        break;
2742    case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
2743        default1 = 0x000011ff;
2744        break;
2745    case MSR_IA32_VMX_TRUE_EXIT_CTLS:
2746        default1 = 0x00036dff;
2747        break;
2748    case MSR_IA32_VMX_PROCBASED_CTLS2:
2749        default1 = 0;
2750        break;
2751    default:
2752        abort();
2753    }
2754
2755    /* If a feature bit is set, the control can be either set or clear.
2756     * Otherwise the value is limited to either 0 or 1 by default1.
2757     */
2758    can_be_one = features | default1;
2759    can_be_zero = features | ~default1;
2760    must_be_one = ~can_be_zero;
2761
2762    /*
2763     * Bit 0:31 -> 0 if the control bit can be zero (i.e. 1 if it must be one).
2764     * Bit 32:63 -> 1 if the control bit can be one.
2765     */
2766    return must_be_one | (((uint64_t)can_be_one) << 32);
2767}
2768
2769static void kvm_msr_entry_add_vmx(X86CPU *cpu, FeatureWordArray f)
2770{
2771    uint64_t kvm_vmx_basic =
2772        kvm_arch_get_supported_msr_feature(kvm_state,
2773                                           MSR_IA32_VMX_BASIC);
2774
2775    if (!kvm_vmx_basic) {
2776        /* If the kernel doesn't support the VMX feature (kvm_intel.nested=0),
2777         * then kvm_vmx_basic will be 0 and KVM_SET_MSR will fail.
2778         */
2779        return;
2780    }
2781
2782    uint64_t kvm_vmx_misc =
2783        kvm_arch_get_supported_msr_feature(kvm_state,
2784                                           MSR_IA32_VMX_MISC);
2785    uint64_t kvm_vmx_ept_vpid =
2786        kvm_arch_get_supported_msr_feature(kvm_state,
2787                                           MSR_IA32_VMX_EPT_VPID_CAP);
2788
2789    /*
2790     * If the guest is 64-bit, a value of 1 is allowed for the host address
2791     * space size vmexit control.
2792     */
2793    uint64_t fixed_vmx_exit = f[FEAT_8000_0001_EDX] & CPUID_EXT2_LM
2794        ? (uint64_t)VMX_VM_EXIT_HOST_ADDR_SPACE_SIZE << 32 : 0;
2795
2796    /*
2797     * Bits 0-30, 32-44 and 50-53 come from the host.  KVM should
2798     * not change them for backwards compatibility.
2799     */
2800    uint64_t fixed_vmx_basic = kvm_vmx_basic &
2801        (MSR_VMX_BASIC_VMCS_REVISION_MASK |
2802         MSR_VMX_BASIC_VMXON_REGION_SIZE_MASK |
2803         MSR_VMX_BASIC_VMCS_MEM_TYPE_MASK);
2804
2805    /*
2806     * Same for bits 0-4 and 25-27.  Bits 16-24 (CR3 target count) can
2807     * change in the future but are always zero for now; clear them to be
2808     * future proof.  Bits 32-63 in theory could change, though KVM does
2809     * not support dual-monitor treatment and probably never will; mask
2810     * them out as well.
2811     */
2812    uint64_t fixed_vmx_misc = kvm_vmx_misc &
2813        (MSR_VMX_MISC_PREEMPTION_TIMER_SHIFT_MASK |
2814         MSR_VMX_MISC_MAX_MSR_LIST_SIZE_MASK);
2815
2816    /*
2817     * EPT memory types should not change either, so we do not bother
2818     * adding features for them.
2819     */
2820    uint64_t fixed_vmx_ept_mask =
2821            (f[FEAT_VMX_SECONDARY_CTLS] & VMX_SECONDARY_EXEC_ENABLE_EPT ?
2822             MSR_VMX_EPT_UC | MSR_VMX_EPT_WB : 0);
2823    uint64_t fixed_vmx_ept_vpid = kvm_vmx_ept_vpid & fixed_vmx_ept_mask;
2824
2825    kvm_msr_entry_add(cpu, MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
2826                      make_vmx_msr_value(MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
2827                                         f[FEAT_VMX_PROCBASED_CTLS]));
2828    kvm_msr_entry_add(cpu, MSR_IA32_VMX_TRUE_PINBASED_CTLS,
2829                      make_vmx_msr_value(MSR_IA32_VMX_TRUE_PINBASED_CTLS,
2830                                         f[FEAT_VMX_PINBASED_CTLS]));
2831    kvm_msr_entry_add(cpu, MSR_IA32_VMX_TRUE_EXIT_CTLS,
2832                      make_vmx_msr_value(MSR_IA32_VMX_TRUE_EXIT_CTLS,
2833                                         f[FEAT_VMX_EXIT_CTLS]) | fixed_vmx_exit);
2834    kvm_msr_entry_add(cpu, MSR_IA32_VMX_TRUE_ENTRY_CTLS,
2835                      make_vmx_msr_value(MSR_IA32_VMX_TRUE_ENTRY_CTLS,
2836                                         f[FEAT_VMX_ENTRY_CTLS]));
2837    kvm_msr_entry_add(cpu, MSR_IA32_VMX_PROCBASED_CTLS2,
2838                      make_vmx_msr_value(MSR_IA32_VMX_PROCBASED_CTLS2,
2839                                         f[FEAT_VMX_SECONDARY_CTLS]));
2840    kvm_msr_entry_add(cpu, MSR_IA32_VMX_EPT_VPID_CAP,
2841                      f[FEAT_VMX_EPT_VPID_CAPS] | fixed_vmx_ept_vpid);
2842    kvm_msr_entry_add(cpu, MSR_IA32_VMX_BASIC,
2843                      f[FEAT_VMX_BASIC] | fixed_vmx_basic);
2844    kvm_msr_entry_add(cpu, MSR_IA32_VMX_MISC,
2845                      f[FEAT_VMX_MISC] | fixed_vmx_misc);
2846    if (has_msr_vmx_vmfunc) {
2847        kvm_msr_entry_add(cpu, MSR_IA32_VMX_VMFUNC, f[FEAT_VMX_VMFUNC]);
2848    }
2849
2850    /*
2851     * Just to be safe, write these with constant values.  The CRn_FIXED1
2852     * MSRs are generated by KVM based on the vCPU's CPUID.
2853     */
2854    kvm_msr_entry_add(cpu, MSR_IA32_VMX_CR0_FIXED0,
2855                      CR0_PE_MASK | CR0_PG_MASK | CR0_NE_MASK);
2856    kvm_msr_entry_add(cpu, MSR_IA32_VMX_CR4_FIXED0,
2857                      CR4_VMXE_MASK);
2858
2859    if (f[FEAT_VMX_SECONDARY_CTLS] & VMX_SECONDARY_EXEC_TSC_SCALING) {
2860        /* TSC multiplier (0x2032).  */
2861        kvm_msr_entry_add(cpu, MSR_IA32_VMX_VMCS_ENUM, 0x32);
2862    } else {
2863        /* Preemption timer (0x482E).  */
2864        kvm_msr_entry_add(cpu, MSR_IA32_VMX_VMCS_ENUM, 0x2E);
2865    }
2866}
2867
2868static void kvm_msr_entry_add_perf(X86CPU *cpu, FeatureWordArray f)
2869{
2870    uint64_t kvm_perf_cap =
2871        kvm_arch_get_supported_msr_feature(kvm_state,
2872                                           MSR_IA32_PERF_CAPABILITIES);
2873
2874    if (kvm_perf_cap) {
2875        kvm_msr_entry_add(cpu, MSR_IA32_PERF_CAPABILITIES,
2876                        kvm_perf_cap & f[FEAT_PERF_CAPABILITIES]);
2877    }
2878}
2879
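    /*
     * Flush the batched MSR buffer to KVM.  KVM_SET_MSRS returns the number
     * of entries it processed, so a short count identifies the first MSR
     * that could not be written; that entry is reported before asserting.
     */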
2880static int kvm_buf_set_msrs(X86CPU *cpu)
2881{
2882    int ret = kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MSRS, cpu->kvm_msr_buf);
2883    if (ret < 0) {
2884        return ret;
2885    }
2886
2887    if (ret < cpu->kvm_msr_buf->nmsrs) {
2888        struct kvm_msr_entry *e = &cpu->kvm_msr_buf->entries[ret];
2889        error_report("error: failed to set MSR 0x%" PRIx32 " to 0x%" PRIx64,
2890                     (uint32_t)e->index, (uint64_t)e->data);
2891    }
2892
2893    assert(ret == cpu->kvm_msr_buf->nmsrs);
2894    return 0;
2895}
2896
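    /*
     * One-time MSR setup at vCPU creation: IA32_ARCH_CAPABILITIES,
     * IA32_CORE_CAPABILITY, IA32_PERF_CAPABILITIES, the microcode revision
     * and the VMX capability MSRs, none of which are part of the runtime
     * state written by kvm_put_msrs().
     */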
2897static void kvm_init_msrs(X86CPU *cpu)
2898{
2899    CPUX86State *env = &cpu->env;
2900
2901    kvm_msr_buf_reset(cpu);
2902    if (has_msr_arch_capabs) {
2903        kvm_msr_entry_add(cpu, MSR_IA32_ARCH_CAPABILITIES,
2904                          env->features[FEAT_ARCH_CAPABILITIES]);
2905    }
2906
2907    if (has_msr_core_capabs) {
2908        kvm_msr_entry_add(cpu, MSR_IA32_CORE_CAPABILITY,
2909                          env->features[FEAT_CORE_CAPABILITY]);
2910    }
2911
2912    if (has_msr_perf_capabs && cpu->enable_pmu) {
2913        kvm_msr_entry_add_perf(cpu, env->features);
2914    }
2915
2916    if (has_msr_ucode_rev) {
2917        kvm_msr_entry_add(cpu, MSR_IA32_UCODE_REV, cpu->ucode_rev);
2918    }
2919
2920    /*
2921     * Older kernels do not include VMX MSRs in KVM_GET_MSR_INDEX_LIST, but
2922     * all kernels with MSR features should have them.
2923     */
2924    if (kvm_feature_msrs && cpu_has_vmx(env)) {
2925        kvm_msr_entry_add_vmx(cpu, env->features);
2926    }
2927
2928    assert(kvm_buf_set_msrs(cpu) == 0);
2929}
2930
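    /*
     * Write the vCPU's runtime MSR state to KVM in one batched KVM_SET_MSRS
     * call.  MSRs with guest-visible side effects or heavy writeback cost
     * (TSC, KVM paravirt MSRs, PMU, Hyper-V, MTRRs, Intel PT, SGX launch
     * enclave hashes) are only included when level >= KVM_PUT_RESET_STATE.
     */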
2931static int kvm_put_msrs(X86CPU *cpu, int level)
2932{
2933    CPUX86State *env = &cpu->env;
2934    int i;
2935
2936    kvm_msr_buf_reset(cpu);
2937
2938    kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_CS, env->sysenter_cs);
2939    kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_ESP, env->sysenter_esp);
2940    kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_EIP, env->sysenter_eip);
2941    kvm_msr_entry_add(cpu, MSR_PAT, env->pat);
2942    if (has_msr_star) {
2943        kvm_msr_entry_add(cpu, MSR_STAR, env->star);
2944    }
2945    if (has_msr_hsave_pa) {
2946        kvm_msr_entry_add(cpu, MSR_VM_HSAVE_PA, env->vm_hsave);
2947    }
2948    if (has_msr_tsc_aux) {
2949        kvm_msr_entry_add(cpu, MSR_TSC_AUX, env->tsc_aux);
2950    }
2951    if (has_msr_tsc_adjust) {
2952        kvm_msr_entry_add(cpu, MSR_TSC_ADJUST, env->tsc_adjust);
2953    }
2954    if (has_msr_misc_enable) {
2955        kvm_msr_entry_add(cpu, MSR_IA32_MISC_ENABLE,
2956                          env->msr_ia32_misc_enable);
2957    }
2958    if (has_msr_smbase) {
2959        kvm_msr_entry_add(cpu, MSR_IA32_SMBASE, env->smbase);
2960    }
2961    if (has_msr_smi_count) {
2962        kvm_msr_entry_add(cpu, MSR_SMI_COUNT, env->msr_smi_count);
2963    }
2964    if (has_msr_pkrs) {
2965        kvm_msr_entry_add(cpu, MSR_IA32_PKRS, env->pkrs);
2966    }
2967    if (has_msr_bndcfgs) {
2968        kvm_msr_entry_add(cpu, MSR_IA32_BNDCFGS, env->msr_bndcfgs);
2969    }
2970    if (has_msr_xss) {
2971        kvm_msr_entry_add(cpu, MSR_IA32_XSS, env->xss);
2972    }
2973    if (has_msr_umwait) {
2974        kvm_msr_entry_add(cpu, MSR_IA32_UMWAIT_CONTROL, env->umwait);
2975    }
2976    if (has_msr_spec_ctrl) {
2977        kvm_msr_entry_add(cpu, MSR_IA32_SPEC_CTRL, env->spec_ctrl);
2978    }
2979    if (has_tsc_scale_msr) {
2980        kvm_msr_entry_add(cpu, MSR_AMD64_TSC_RATIO, env->amd_tsc_scale_msr);
2981    }
2982
2983    if (has_msr_tsx_ctrl) {
2984        kvm_msr_entry_add(cpu, MSR_IA32_TSX_CTRL, env->tsx_ctrl);
2985    }
2986    if (has_msr_virt_ssbd) {
2987        kvm_msr_entry_add(cpu, MSR_VIRT_SSBD, env->virt_ssbd);
2988    }
2989
2990#ifdef TARGET_X86_64
2991    if (lm_capable_kernel) {
2992        kvm_msr_entry_add(cpu, MSR_CSTAR, env->cstar);
2993        kvm_msr_entry_add(cpu, MSR_KERNELGSBASE, env->kernelgsbase);
2994        kvm_msr_entry_add(cpu, MSR_FMASK, env->fmask);
2995        kvm_msr_entry_add(cpu, MSR_LSTAR, env->lstar);
2996    }
2997#endif
2998
2999    /*
3000     * The following MSRs have side effects on the guest or are too heavy
3001     * for normal writeback. Limit them to reset or full state updates.
3002     */
3003    if (level >= KVM_PUT_RESET_STATE) {
3004        kvm_msr_entry_add(cpu, MSR_IA32_TSC, env->tsc);
3005        kvm_msr_entry_add(cpu, MSR_KVM_SYSTEM_TIME, env->system_time_msr);
3006        kvm_msr_entry_add(cpu, MSR_KVM_WALL_CLOCK, env->wall_clock_msr);
3007        if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_ASYNC_PF_INT)) {
3008            kvm_msr_entry_add(cpu, MSR_KVM_ASYNC_PF_INT, env->async_pf_int_msr);
3009        }
3010        if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_ASYNC_PF)) {
3011            kvm_msr_entry_add(cpu, MSR_KVM_ASYNC_PF_EN, env->async_pf_en_msr);
3012        }
3013        if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_PV_EOI)) {
3014            kvm_msr_entry_add(cpu, MSR_KVM_PV_EOI_EN, env->pv_eoi_en_msr);
3015        }
3016        if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_STEAL_TIME)) {
3017            kvm_msr_entry_add(cpu, MSR_KVM_STEAL_TIME, env->steal_time_msr);
3018        }
3019
3020        if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_POLL_CONTROL)) {
3021            kvm_msr_entry_add(cpu, MSR_KVM_POLL_CONTROL, env->poll_control_msr);
3022        }
3023
3024        if (has_architectural_pmu_version > 0) {
3025            if (has_architectural_pmu_version > 1) {
3026                /* Stop the counter.  */
3027                kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
3028                kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_CTRL, 0);
3029            }
3030
3031            /* Set the counter values.  */
3032            for (i = 0; i < num_architectural_pmu_fixed_counters; i++) {
3033                kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR0 + i,
3034                                  env->msr_fixed_counters[i]);
3035            }
3036            for (i = 0; i < num_architectural_pmu_gp_counters; i++) {
3037                kvm_msr_entry_add(cpu, MSR_P6_PERFCTR0 + i,
3038                                  env->msr_gp_counters[i]);
3039                kvm_msr_entry_add(cpu, MSR_P6_EVNTSEL0 + i,
3040                                  env->msr_gp_evtsel[i]);
3041            }
3042            if (has_architectural_pmu_version > 1) {
3043                kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_STATUS,
3044                                  env->msr_global_status);
3045                kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_OVF_CTRL,
3046                                  env->msr_global_ovf_ctrl);
3047
3048                /* Now start the PMU.  */
3049                kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR_CTRL,
3050                                  env->msr_fixed_ctr_ctrl);
3051                kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_CTRL,
3052                                  env->msr_global_ctrl);
3053            }
3054        }
3055        /*
3056         * Hyper-V partition-wide MSRs: to avoid clearing them on CPU hot-add,
3057         * only sync them to KVM on the first CPU.
3058         */
3059        if (current_cpu == first_cpu) {
3060            if (has_msr_hv_hypercall) {
3061                kvm_msr_entry_add(cpu, HV_X64_MSR_GUEST_OS_ID,
3062                                  env->msr_hv_guest_os_id);
3063                kvm_msr_entry_add(cpu, HV_X64_MSR_HYPERCALL,
3064                                  env->msr_hv_hypercall);
3065            }
3066            if (hyperv_feat_enabled(cpu, HYPERV_FEAT_TIME)) {
3067                kvm_msr_entry_add(cpu, HV_X64_MSR_REFERENCE_TSC,
3068                                  env->msr_hv_tsc);
3069            }
3070            if (hyperv_feat_enabled(cpu, HYPERV_FEAT_REENLIGHTENMENT)) {
3071                kvm_msr_entry_add(cpu, HV_X64_MSR_REENLIGHTENMENT_CONTROL,
3072                                  env->msr_hv_reenlightenment_control);
3073                kvm_msr_entry_add(cpu, HV_X64_MSR_TSC_EMULATION_CONTROL,
3074                                  env->msr_hv_tsc_emulation_control);
3075                kvm_msr_entry_add(cpu, HV_X64_MSR_TSC_EMULATION_STATUS,
3076                                  env->msr_hv_tsc_emulation_status);
3077            }
3078        }
3079        if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VAPIC)) {
3080            kvm_msr_entry_add(cpu, HV_X64_MSR_APIC_ASSIST_PAGE,
3081                              env->msr_hv_vapic);
3082        }
3083        if (has_msr_hv_crash) {
3084            int j;
3085
3086            for (j = 0; j < HV_CRASH_PARAMS; j++) {
3087                kvm_msr_entry_add(cpu, HV_X64_MSR_CRASH_P0 + j,
3088                                  env->msr_hv_crash_params[j]);
                }
3089
3090            kvm_msr_entry_add(cpu, HV_X64_MSR_CRASH_CTL, HV_CRASH_CTL_NOTIFY);
3091        }
3092        if (has_msr_hv_runtime) {
3093            kvm_msr_entry_add(cpu, HV_X64_MSR_VP_RUNTIME, env->msr_hv_runtime);
3094        }
3095        if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VPINDEX)
3096            && hv_vpindex_settable) {
3097            kvm_msr_entry_add(cpu, HV_X64_MSR_VP_INDEX,
3098                              hyperv_vp_index(CPU(cpu)));
3099        }
3100        if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC)) {
3101            int j;
3102
3103            kvm_msr_entry_add(cpu, HV_X64_MSR_SVERSION, HV_SYNIC_VERSION);
3104
3105            kvm_msr_entry_add(cpu, HV_X64_MSR_SCONTROL,
3106                              env->msr_hv_synic_control);
3107            kvm_msr_entry_add(cpu, HV_X64_MSR_SIEFP,
3108                              env->msr_hv_synic_evt_page);
3109            kvm_msr_entry_add(cpu, HV_X64_MSR_SIMP,
3110                              env->msr_hv_synic_msg_page);
3111
3112            for (j = 0; j < ARRAY_SIZE(env->msr_hv_synic_sint); j++) {
3113                kvm_msr_entry_add(cpu, HV_X64_MSR_SINT0 + j,
3114                                  env->msr_hv_synic_sint[j]);
3115            }
3116        }
3117        if (has_msr_hv_stimer) {
3118            int j;
3119
3120            for (j = 0; j < ARRAY_SIZE(env->msr_hv_stimer_config); j++) {
3121                kvm_msr_entry_add(cpu, HV_X64_MSR_STIMER0_CONFIG + j * 2,
3122                                env->msr_hv_stimer_config[j]);
3123            }
3124
3125            for (j = 0; j < ARRAY_SIZE(env->msr_hv_stimer_count); j++) {
3126                kvm_msr_entry_add(cpu, HV_X64_MSR_STIMER0_COUNT + j * 2,
3127                                env->msr_hv_stimer_count[j]);
3128            }
3129        }
3130        if (env->features[FEAT_1_EDX] & CPUID_MTRR) {
3131            uint64_t phys_mask = MAKE_64BIT_MASK(0, cpu->phys_bits);
3132
3133            kvm_msr_entry_add(cpu, MSR_MTRRdefType, env->mtrr_deftype);
3134            kvm_msr_entry_add(cpu, MSR_MTRRfix64K_00000, env->mtrr_fixed[0]);
3135            kvm_msr_entry_add(cpu, MSR_MTRRfix16K_80000, env->mtrr_fixed[1]);
3136            kvm_msr_entry_add(cpu, MSR_MTRRfix16K_A0000, env->mtrr_fixed[2]);
3137            kvm_msr_entry_add(cpu, MSR_MTRRfix4K_C0000, env->mtrr_fixed[3]);
3138            kvm_msr_entry_add(cpu, MSR_MTRRfix4K_C8000, env->mtrr_fixed[4]);
3139            kvm_msr_entry_add(cpu, MSR_MTRRfix4K_D0000, env->mtrr_fixed[5]);
3140            kvm_msr_entry_add(cpu, MSR_MTRRfix4K_D8000, env->mtrr_fixed[6]);
3141            kvm_msr_entry_add(cpu, MSR_MTRRfix4K_E0000, env->mtrr_fixed[7]);
3142            kvm_msr_entry_add(cpu, MSR_MTRRfix4K_E8000, env->mtrr_fixed[8]);
3143            kvm_msr_entry_add(cpu, MSR_MTRRfix4K_F0000, env->mtrr_fixed[9]);
3144            kvm_msr_entry_add(cpu, MSR_MTRRfix4K_F8000, env->mtrr_fixed[10]);
3145            for (i = 0; i < MSR_MTRRcap_VCNT; i++) {
3146                /* The CPU GPs if we write to a bit above the physical limit of
3147                 * the host CPU (and KVM emulates that)
3148                 */
3149                uint64_t mask = env->mtrr_var[i].mask;
3150                mask &= phys_mask;
3151
3152                kvm_msr_entry_add(cpu, MSR_MTRRphysBase(i),
3153                                  env->mtrr_var[i].base);
3154                kvm_msr_entry_add(cpu, MSR_MTRRphysMask(i), mask);
3155            }
3156        }
3157        if (env->features[FEAT_7_0_EBX] & CPUID_7_0_EBX_INTEL_PT) {
3158            int addr_num = kvm_arch_get_supported_cpuid(kvm_state,
3159                                                    0x14, 1, R_EAX) & 0x7;
3160
3161            kvm_msr_entry_add(cpu, MSR_IA32_RTIT_CTL,
3162                            env->msr_rtit_ctrl);
3163            kvm_msr_entry_add(cpu, MSR_IA32_RTIT_STATUS,
3164                            env->msr_rtit_status);
3165            kvm_msr_entry_add(cpu, MSR_IA32_RTIT_OUTPUT_BASE,
3166                            env->msr_rtit_output_base);
3167            kvm_msr_entry_add(cpu, MSR_IA32_RTIT_OUTPUT_MASK,
3168                            env->msr_rtit_output_mask);
3169            kvm_msr_entry_add(cpu, MSR_IA32_RTIT_CR3_MATCH,
3170                            env->msr_rtit_cr3_match);
3171            for (i = 0; i < addr_num; i++) {
3172                kvm_msr_entry_add(cpu, MSR_IA32_RTIT_ADDR0_A + i,
3173                            env->msr_rtit_addrs[i]);
3174            }
3175        }
3176
3177        if (env->features[FEAT_7_0_ECX] & CPUID_7_0_ECX_SGX_LC) {
3178            kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH0,
3179                              env->msr_ia32_sgxlepubkeyhash[0]);
3180            kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH1,
3181                              env->msr_ia32_sgxlepubkeyhash[1]);
3182            kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH2,
3183                              env->msr_ia32_sgxlepubkeyhash[2]);
3184            kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH3,
3185                              env->msr_ia32_sgxlepubkeyhash[3]);
3186        }
3187
3188        /* Note: MSR_IA32_FEATURE_CONTROL is written separately, see
3189         *       kvm_put_msr_feature_control. */
3190    }
3191
3192    if (env->mcg_cap) {
3193        int i;
3194
3195        kvm_msr_entry_add(cpu, MSR_MCG_STATUS, env->mcg_status);
3196        kvm_msr_entry_add(cpu, MSR_MCG_CTL, env->mcg_ctl);
3197        if (has_msr_mcg_ext_ctl) {
3198            kvm_msr_entry_add(cpu, MSR_MCG_EXT_CTL, env->mcg_ext_ctl);
3199        }
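            /* MCG_CAP[7:0] is the bank count; each bank has CTL/STATUS/ADDR/MISC MSRs. */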
3200        for (i = 0; i < (env->mcg_cap & 0xff) * 4; i++) {
3201            kvm_msr_entry_add(cpu, MSR_MC0_CTL + i, env->mce_banks[i]);
3202        }
3203    }
3204
3205    return kvm_buf_set_msrs(cpu);
3206}
3207
3208
3209static int kvm_get_fpu(X86CPU *cpu)
3210{
3211    CPUX86State *env = &cpu->env;
3212    struct kvm_fpu fpu;
3213    int i, ret;
3214
3215    ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_FPU, &fpu);
3216    if (ret < 0) {
3217        return ret;
3218    }
3219
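        /* FSW bits 13..11 hold the x87 top-of-stack (TOP) pointer. */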
3220    env->fpstt = (fpu.fsw >> 11) & 7;
3221    env->fpus = fpu.fsw;
3222    env->fpuc = fpu.fcw;
3223    env->fpop = fpu.last_opcode;
3224    env->fpip = fpu.last_ip;
3225    env->fpdp = fpu.last_dp;
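        /* KVM's abridged tag word uses 1 == valid; QEMU's fptags uses 1 == empty. */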
3226    for (i = 0; i < 8; ++i) {
3227        env->fptags[i] = !((fpu.ftwx >> i) & 1);
3228    }
3229    memcpy(env->fpregs, fpu.fpr, sizeof env->fpregs);
3230    for (i = 0; i < CPU_NB_REGS; i++) {
3231        env->xmm_regs[i].ZMM_Q(0) = ldq_p(&fpu.xmm[i][0]);
3232        env->xmm_regs[i].ZMM_Q(1) = ldq_p(&fpu.xmm[i][8]);
3233    }
3234    env->mxcsr = fpu.mxcsr;
3235
3236    return 0;
3237}
3238
3239static int kvm_get_xsave(X86CPU *cpu)
3240{
3241    CPUX86State *env = &cpu->env;
3242    void *xsave = env->xsave_buf;
3243    int ret;
3244
3245    if (!has_xsave) {
3246        return kvm_get_fpu(cpu);
3247    }
3248
3249    ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_XSAVE, xsave);
3250    if (ret < 0) {
3251        return ret;
3252    }
3253    x86_cpu_xrstor_all_areas(cpu, xsave, env->xsave_buf_len);
3254
3255    return 0;
3256}
3257
3258static int kvm_get_xcrs(X86CPU *cpu)
3259{
3260    CPUX86State *env = &cpu->env;
3261    int i, ret;
3262    struct kvm_xcrs xcrs;
3263
3264    if (!has_xcrs) {
3265        return 0;
3266    }
3267
3268    ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_XCRS, &xcrs);
3269    if (ret < 0) {
3270        return ret;
3271    }
3272
3273    for (i = 0; i < xcrs.nr_xcrs; i++) {
3274        /* Only support xcr0 now */
3275        if (xcrs.xcrs[i].xcr == 0) {
3276            env->xcr0 = xcrs.xcrs[i].value;
3277            break;
3278        }
3279    }
3280    return 0;
3281}
3282
3283static int kvm_get_sregs(X86CPU *cpu)
3284{
3285    CPUX86State *env = &cpu->env;
3286    struct kvm_sregs sregs;
3287    int bit, i, ret;
3288
3289    ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_SREGS, &sregs);
3290    if (ret < 0) {
3291        return ret;
3292    }
3293
3294    /* There can only be one pending IRQ set in the bitmap at a time, so try
3295       to find it and save its number instead (-1 for none). */
3296    env->interrupt_injected = -1;
3297    for (i = 0; i < ARRAY_SIZE(sregs.interrupt_bitmap); i++) {
3298        if (sregs.interrupt_bitmap[i]) {
3299            bit = ctz64(sregs.interrupt_bitmap[i]);
3300            env->interrupt_injected = i * 64 + bit;
3301            break;
3302        }
3303    }
3304
3305    get_seg(&env->segs[R_CS], &sregs.cs);
3306    get_seg(&env->segs[R_DS], &sregs.ds);
3307    get_seg(&env->segs[R_ES], &sregs.es);
3308    get_seg(&env->segs[R_FS], &sregs.fs);
3309    get_seg(&env->segs[R_GS], &sregs.gs);
3310    get_seg(&env->segs[R_SS], &sregs.ss);
3311
3312    get_seg(&env->tr, &sregs.tr);
3313    get_seg(&env->ldt, &sregs.ldt);
3314
3315    env->idt.limit = sregs.idt.limit;
3316    env->idt.base = sregs.idt.base;
3317    env->gdt.limit = sregs.gdt.limit;
3318    env->gdt.base = sregs.gdt.base;
3319
3320    env->cr[0] = sregs.cr0;
3321    env->cr[2] = sregs.cr2;
3322    env->cr[3] = sregs.cr3;
3323    env->cr[4] = sregs.cr4;
3324
3325    env->efer = sregs.efer;
3326
3327    /* changes to apic base and cr8/tpr are read back via kvm_arch_post_run */
3328    x86_update_hflags(env);
3329
3330    return 0;
3331}
3332
3333static int kvm_get_msrs(X86CPU *cpu)
3334{
3335    CPUX86State *env = &cpu->env;
3336    struct kvm_msr_entry *msrs = cpu->kvm_msr_buf->entries;
3337    int ret, i;
3338    uint64_t mtrr_top_bits;
3339
3340    kvm_msr_buf_reset(cpu);
3341
3342    kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_CS, 0);
3343    kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_ESP, 0);
3344    kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_EIP, 0);
3345    kvm_msr_entry_add(cpu, MSR_PAT, 0);
3346    if (has_msr_star) {
3347        kvm_msr_entry_add(cpu, MSR_STAR, 0);
3348    }
3349    if (has_msr_hsave_pa) {
3350        kvm_msr_entry_add(cpu, MSR_VM_HSAVE_PA, 0);
3351    }
3352    if (has_msr_tsc_aux) {
3353        kvm_msr_entry_add(cpu, MSR_TSC_AUX, 0);
3354    }
3355    if (has_msr_tsc_adjust) {
3356        kvm_msr_entry_add(cpu, MSR_TSC_ADJUST, 0);
3357    }
3358    if (has_msr_tsc_deadline) {
3359        kvm_msr_entry_add(cpu, MSR_IA32_TSCDEADLINE, 0);
3360    }
3361    if (has_msr_misc_enable) {
3362        kvm_msr_entry_add(cpu, MSR_IA32_MISC_ENABLE, 0);
3363    }
3364    if (has_msr_smbase) {
3365        kvm_msr_entry_add(cpu, MSR_IA32_SMBASE, 0);
3366    }
3367    if (has_msr_smi_count) {
3368        kvm_msr_entry_add(cpu, MSR_SMI_COUNT, 0);
3369    }
3370    if (has_msr_feature_control) {
3371        kvm_msr_entry_add(cpu, MSR_IA32_FEATURE_CONTROL, 0);
3372    }
3373    if (has_msr_pkrs) {
3374        kvm_msr_entry_add(cpu, MSR_IA32_PKRS, 0);
3375    }
3376    if (has_msr_bndcfgs) {
3377        kvm_msr_entry_add(cpu, MSR_IA32_BNDCFGS, 0);
3378    }
3379    if (has_msr_xss) {
3380        kvm_msr_entry_add(cpu, MSR_IA32_XSS, 0);
3381    }
3382    if (has_msr_umwait) {
3383        kvm_msr_entry_add(cpu, MSR_IA32_UMWAIT_CONTROL, 0);
3384    }
3385    if (has_msr_spec_ctrl) {
3386        kvm_msr_entry_add(cpu, MSR_IA32_SPEC_CTRL, 0);
3387    }
3388    if (has_tsc_scale_msr) {
3389        kvm_msr_entry_add(cpu, MSR_AMD64_TSC_RATIO, 0);
3390    }
3391
3392    if (has_msr_tsx_ctrl) {
3393        kvm_msr_entry_add(cpu, MSR_IA32_TSX_CTRL, 0);
3394    }
3395    if (has_msr_virt_ssbd) {
3396        kvm_msr_entry_add(cpu, MSR_VIRT_SSBD, 0);
3397    }
3398    if (!env->tsc_valid) {
3399        kvm_msr_entry_add(cpu, MSR_IA32_TSC, 0);
3400        env->tsc_valid = !runstate_is_running();
3401    }
3402
3403#ifdef TARGET_X86_64
3404    if (lm_capable_kernel) {
3405        kvm_msr_entry_add(cpu, MSR_CSTAR, 0);
3406        kvm_msr_entry_add(cpu, MSR_KERNELGSBASE, 0);
3407        kvm_msr_entry_add(cpu, MSR_FMASK, 0);
3408        kvm_msr_entry_add(cpu, MSR_LSTAR, 0);
3409    }
3410#endif
3411    kvm_msr_entry_add(cpu, MSR_KVM_SYSTEM_TIME, 0);
3412    kvm_msr_entry_add(cpu, MSR_KVM_WALL_CLOCK, 0);
3413    if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_ASYNC_PF_INT)) {
3414        kvm_msr_entry_add(cpu, MSR_KVM_ASYNC_PF_INT, 0);
3415    }
3416    if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_ASYNC_PF)) {
3417        kvm_msr_entry_add(cpu, MSR_KVM_ASYNC_PF_EN, 0);
3418    }
3419    if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_PV_EOI)) {
3420        kvm_msr_entry_add(cpu, MSR_KVM_PV_EOI_EN, 0);
3421    }
3422    if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_STEAL_TIME)) {
3423        kvm_msr_entry_add(cpu, MSR_KVM_STEAL_TIME, 0);
3424    }
3425    if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_POLL_CONTROL)) {
3426        kvm_msr_entry_add(cpu, MSR_KVM_POLL_CONTROL, 1);
3427    }
3428    if (has_architectural_pmu_version > 0) {
3429        if (has_architectural_pmu_version > 1) {
3430            kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
3431            kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_CTRL, 0);
3432            kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_STATUS, 0);
3433            kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_OVF_CTRL, 0);
3434        }
3435        for (i = 0; i < num_architectural_pmu_fixed_counters; i++) {
3436            kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR0 + i, 0);
3437        }
3438        for (i = 0; i < num_architectural_pmu_gp_counters; i++) {
3439            kvm_msr_entry_add(cpu, MSR_P6_PERFCTR0 + i, 0);
3440            kvm_msr_entry_add(cpu, MSR_P6_EVNTSEL0 + i, 0);
3441        }
3442    }
3443
3444    if (env->mcg_cap) {
3445        kvm_msr_entry_add(cpu, MSR_MCG_STATUS, 0);
3446        kvm_msr_entry_add(cpu, MSR_MCG_CTL, 0);
3447        if (has_msr_mcg_ext_ctl) {
3448            kvm_msr_entry_add(cpu, MSR_MCG_EXT_CTL, 0);
3449        }
3450        for (i = 0; i < (env->mcg_cap & 0xff) * 4; i++) {
3451            kvm_msr_entry_add(cpu, MSR_MC0_CTL + i, 0);
3452        }
3453    }
3454
3455    if (has_msr_hv_hypercall) {
3456        kvm_msr_entry_add(cpu, HV_X64_MSR_HYPERCALL, 0);
3457        kvm_msr_entry_add(cpu, HV_X64_MSR_GUEST_OS_ID, 0);
3458    }
3459    if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VAPIC)) {
3460        kvm_msr_entry_add(cpu, HV_X64_MSR_APIC_ASSIST_PAGE, 0);
3461    }
3462    if (hyperv_feat_enabled(cpu, HYPERV_FEAT_TIME)) {
3463        kvm_msr_entry_add(cpu, HV_X64_MSR_REFERENCE_TSC, 0);
3464    }
3465    if (hyperv_feat_enabled(cpu, HYPERV_FEAT_REENLIGHTENMENT)) {
3466        kvm_msr_entry_add(cpu, HV_X64_MSR_REENLIGHTENMENT_CONTROL, 0);
3467        kvm_msr_entry_add(cpu, HV_X64_MSR_TSC_EMULATION_CONTROL, 0);
3468        kvm_msr_entry_add(cpu, HV_X64_MSR_TSC_EMULATION_STATUS, 0);
3469    }
3470    if (has_msr_hv_crash) {
3471        int j;
3472
3473        for (j = 0; j < HV_CRASH_PARAMS; j++) {
3474            kvm_msr_entry_add(cpu, HV_X64_MSR_CRASH_P0 + j, 0);
3475        }
3476    }
3477    if (has_msr_hv_runtime) {
3478        kvm_msr_entry_add(cpu, HV_X64_MSR_VP_RUNTIME, 0);
3479    }
3480    if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC)) {
3481        uint32_t msr;
3482
3483        kvm_msr_entry_add(cpu, HV_X64_MSR_SCONTROL, 0);
3484        kvm_msr_entry_add(cpu, HV_X64_MSR_SIEFP, 0);
3485        kvm_msr_entry_add(cpu, HV_X64_MSR_SIMP, 0);
3486        for (msr = HV_X64_MSR_SINT0; msr <= HV_X64_MSR_SINT15; msr++) {
3487            kvm_msr_entry_add(cpu, msr, 0);
3488        }
3489    }
3490    if (has_msr_hv_stimer) {
3491        uint32_t msr;
3492
3493        for (msr = HV_X64_MSR_STIMER0_CONFIG; msr <= HV_X64_MSR_STIMER3_COUNT;
3494             msr++) {
3495            kvm_msr_entry_add(cpu, msr, 0);
3496        }
3497    }
3498    if (env->features[FEAT_1_EDX] & CPUID_MTRR) {
3499        kvm_msr_entry_add(cpu, MSR_MTRRdefType, 0);
3500        kvm_msr_entry_add(cpu, MSR_MTRRfix64K_00000, 0);
3501        kvm_msr_entry_add(cpu, MSR_MTRRfix16K_80000, 0);
3502        kvm_msr_entry_add(cpu, MSR_MTRRfix16K_A0000, 0);
3503        kvm_msr_entry_add(cpu, MSR_MTRRfix4K_C0000, 0);
3504        kvm_msr_entry_add(cpu, MSR_MTRRfix4K_C8000, 0);
3505        kvm_msr_entry_add(cpu, MSR_MTRRfix4K_D0000, 0);
3506        kvm_msr_entry_add(cpu, MSR_MTRRfix4K_D8000, 0);
3507        kvm_msr_entry_add(cpu, MSR_MTRRfix4K_E0000, 0);
3508        kvm_msr_entry_add(cpu, MSR_MTRRfix4K_E8000, 0);
3509        kvm_msr_entry_add(cpu, MSR_MTRRfix4K_F0000, 0);
3510        kvm_msr_entry_add(cpu, MSR_MTRRfix4K_F8000, 0);
3511        for (i = 0; i < MSR_MTRRcap_VCNT; i++) {
3512            kvm_msr_entry_add(cpu, MSR_MTRRphysBase(i), 0);
3513            kvm_msr_entry_add(cpu, MSR_MTRRphysMask(i), 0);
3514        }
3515    }
3516
3517    if (env->features[FEAT_7_0_EBX] & CPUID_7_0_EBX_INTEL_PT) {
3518        int addr_num =
3519            kvm_arch_get_supported_cpuid(kvm_state, 0x14, 1, R_EAX) & 0x7;
3520
3521        kvm_msr_entry_add(cpu, MSR_IA32_RTIT_CTL, 0);
3522        kvm_msr_entry_add(cpu, MSR_IA32_RTIT_STATUS, 0);
3523        kvm_msr_entry_add(cpu, MSR_IA32_RTIT_OUTPUT_BASE, 0);
3524        kvm_msr_entry_add(cpu, MSR_IA32_RTIT_OUTPUT_MASK, 0);
3525        kvm_msr_entry_add(cpu, MSR_IA32_RTIT_CR3_MATCH, 0);
3526        for (i = 0; i < addr_num; i++) {
3527            kvm_msr_entry_add(cpu, MSR_IA32_RTIT_ADDR0_A + i, 0);
3528        }
3529    }
3530
3531    if (env->features[FEAT_7_0_ECX] & CPUID_7_0_ECX_SGX_LC) {
3532        kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH0, 0);
3533        kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH1, 0);
3534        kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH2, 0);
3535        kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH3, 0);
3536    }
3537
3538    ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_MSRS, cpu->kvm_msr_buf);
3539    if (ret < 0) {
3540        return ret;
3541    }
3542
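        /*
         * KVM_GET_MSRS returns the number of MSRs it processed; a short count
         * means entries[ret] is the first MSR the kernel refused to read.
         */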
3543    if (ret < cpu->kvm_msr_buf->nmsrs) {
3544        struct kvm_msr_entry *e = &cpu->kvm_msr_buf->entries[ret];
3545        error_report("error: failed to get MSR 0x%" PRIx32,
3546                     (uint32_t)e->index);
3547    }
3548
3549    assert(ret == cpu->kvm_msr_buf->nmsrs);
3550    /*
3551     * MTRR masks: Each mask consists of 5 parts
3552     * a   10..0: must be zero
3553     * b   11   : valid bit
3554     * c n-1..12: actual mask bits
3555     * d   51..n: reserved, must be zero
3556     * e  63..52: reserved, must be zero
3557     *
3558     * 'n' is the number of physical bits supported by the CPU and is
3559     * apparently always <= 52.  We know our 'n' but don't know what
3560     * the destination's 'n' is; it might be smaller, in which case
3561     * it masks (c) on loading. It might be larger, in which case
3562     * we fill 'd' so that d..c is consistent irrespective of the 'n'
3563     * we're migrating to.
3564     */
3565
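        /*
         * Illustrative example (assumed values, not taken from this code):
         * with cpu->phys_bits == 46, mtrr_top_bits becomes
         * MAKE_64BIT_MASK(46, 6) == 0x000fc00000000000, so bits 51..46 of
         * every variable MTRR mask read back below are filled with ones.
         */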
3566    if (cpu->fill_mtrr_mask) {
3567        QEMU_BUILD_BUG_ON(TARGET_PHYS_ADDR_SPACE_BITS > 52);
3568        assert(cpu->phys_bits <= TARGET_PHYS_ADDR_SPACE_BITS);
3569        mtrr_top_bits = MAKE_64BIT_MASK(cpu->phys_bits, 52 - cpu->phys_bits);
3570    } else {
3571        mtrr_top_bits = 0;
3572    }
3573
3574    for (i = 0; i < ret; i++) {
3575        uint32_t index = msrs[i].index;
3576        switch (index) {
3577        case MSR_IA32_SYSENTER_CS:
3578            env->sysenter_cs = msrs[i].data;
3579            break;
3580        case MSR_IA32_SYSENTER_ESP:
3581            env->sysenter_esp = msrs[i].data;
3582            break;
3583        case MSR_IA32_SYSENTER_EIP:
3584            env->sysenter_eip = msrs[i].data;
3585            break;
3586        case MSR_PAT:
3587            env->pat = msrs[i].data;
3588            break;
3589        case MSR_STAR:
3590            env->star = msrs[i].data;
3591            break;
3592#ifdef TARGET_X86_64
3593        case MSR_CSTAR:
3594            env->cstar = msrs[i].data;
3595            break;
3596        case MSR_KERNELGSBASE:
3597            env->kernelgsbase = msrs[i].data;
3598            break;
3599        case MSR_FMASK:
3600            env->fmask = msrs[i].data;
3601            break;
3602        case MSR_LSTAR:
3603            env->lstar = msrs[i].data;
3604            break;
3605#endif
3606        case MSR_IA32_TSC:
3607            env->tsc = msrs[i].data;
3608            break;
3609        case MSR_TSC_AUX:
3610            env->tsc_aux = msrs[i].data;
3611            break;
3612        case MSR_TSC_ADJUST:
3613            env->tsc_adjust = msrs[i].data;
3614            break;
3615        case MSR_IA32_TSCDEADLINE:
3616            env->tsc_deadline = msrs[i].data;
3617            break;
3618        case MSR_VM_HSAVE_PA:
3619            env->vm_hsave = msrs[i].data;
3620            break;
3621        case MSR_KVM_SYSTEM_TIME:
3622            env->system_time_msr = msrs[i].data;
3623            break;
3624        case MSR_KVM_WALL_CLOCK:
3625            env->wall_clock_msr = msrs[i].data;
3626            break;
3627        case MSR_MCG_STATUS:
3628            env->mcg_status = msrs[i].data;
3629            break;
3630        case MSR_MCG_CTL:
3631            env->mcg_ctl = msrs[i].data;
3632            break;
3633        case MSR_MCG_EXT_CTL:
3634            env->mcg_ext_ctl = msrs[i].data;
3635            break;
3636        case MSR_IA32_MISC_ENABLE:
3637            env->msr_ia32_misc_enable = msrs[i].data;
3638            break;
3639        case MSR_IA32_SMBASE:
3640            env->smbase = msrs[i].data;
3641            break;
3642        case MSR_SMI_COUNT:
3643            env->msr_smi_count = msrs[i].data;
3644            break;
3645        case MSR_IA32_FEATURE_CONTROL:
3646            env->msr_ia32_feature_control = msrs[i].data;
3647            break;
3648        case MSR_IA32_BNDCFGS:
3649            env->msr_bndcfgs = msrs[i].data;
3650            break;
3651        case MSR_IA32_XSS:
3652            env->xss = msrs[i].data;
3653            break;
3654        case MSR_IA32_UMWAIT_CONTROL:
3655            env->umwait = msrs[i].data;
3656            break;
3657        case MSR_IA32_PKRS:
3658            env->pkrs = msrs[i].data;
3659            break;
3660        default:
3661            if (msrs[i].index >= MSR_MC0_CTL &&
3662                msrs[i].index < MSR_MC0_CTL + (env->mcg_cap & 0xff) * 4) {
3663                env->mce_banks[msrs[i].index - MSR_MC0_CTL] = msrs[i].data;
3664            }
3665            break;
3666        case MSR_KVM_ASYNC_PF_EN:
3667            env->async_pf_en_msr = msrs[i].data;
3668            break;
3669        case MSR_KVM_ASYNC_PF_INT:
3670            env->async_pf_int_msr = msrs[i].data;
3671            break;
3672        case MSR_KVM_PV_EOI_EN:
3673            env->pv_eoi_en_msr = msrs[i].data;
3674            break;
3675        case MSR_KVM_STEAL_TIME:
3676            env->steal_time_msr = msrs[i].data;
3677            break;
3678        case MSR_KVM_POLL_CONTROL: {
3679            env->poll_control_msr = msrs[i].data;
3680            break;
3681        }
3682        case MSR_CORE_PERF_FIXED_CTR_CTRL:
3683            env->msr_fixed_ctr_ctrl = msrs[i].data;
3684            break;
3685        case MSR_CORE_PERF_GLOBAL_CTRL:
3686            env->msr_global_ctrl = msrs[i].data;
3687            break;
3688        case MSR_CORE_PERF_GLOBAL_STATUS:
3689            env->msr_global_status = msrs[i].data;
3690            break;
3691        case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
3692            env->msr_global_ovf_ctrl = msrs[i].data;
3693            break;
3694        case MSR_CORE_PERF_FIXED_CTR0 ... MSR_CORE_PERF_FIXED_CTR0 + MAX_FIXED_COUNTERS - 1:
3695            env->msr_fixed_counters[index - MSR_CORE_PERF_FIXED_CTR0] = msrs[i].data;
3696            break;
3697        case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR0 + MAX_GP_COUNTERS - 1:
3698            env->msr_gp_counters[index - MSR_P6_PERFCTR0] = msrs[i].data;
3699            break;
3700        case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL0 + MAX_GP_COUNTERS - 1:
3701            env->msr_gp_evtsel[index - MSR_P6_EVNTSEL0] = msrs[i].data;
3702            break;
3703        case HV_X64_MSR_HYPERCALL:
3704            env->msr_hv_hypercall = msrs[i].data;
3705            break;
3706        case HV_X64_MSR_GUEST_OS_ID:
3707            env->msr_hv_guest_os_id = msrs[i].data;
3708            break;
3709        case HV_X64_MSR_APIC_ASSIST_PAGE:
3710            env->msr_hv_vapic = msrs[i].data;
3711            break;
3712        case HV_X64_MSR_REFERENCE_TSC:
3713            env->msr_hv_tsc = msrs[i].data;
3714            break;
3715        case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
3716            env->msr_hv_crash_params[index - HV_X64_MSR_CRASH_P0] = msrs[i].data;
3717            break;
3718        case HV_X64_MSR_VP_RUNTIME:
3719            env->msr_hv_runtime = msrs[i].data;
3720            break;
3721        case HV_X64_MSR_SCONTROL:
3722            env->msr_hv_synic_control = msrs[i].data;
3723            break;
3724        case HV_X64_MSR_SIEFP:
3725            env->msr_hv_synic_evt_page = msrs[i].data;
3726            break;
3727        case HV_X64_MSR_SIMP:
3728            env->msr_hv_synic_msg_page = msrs[i].data;
3729            break;
3730        case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
3731            env->msr_hv_synic_sint[index - HV_X64_MSR_SINT0] = msrs[i].data;
3732            break;
3733        case HV_X64_MSR_STIMER0_CONFIG:
3734        case HV_X64_MSR_STIMER1_CONFIG:
3735        case HV_X64_MSR_STIMER2_CONFIG:
3736        case HV_X64_MSR_STIMER3_CONFIG:
3737            env->msr_hv_stimer_config[(index - HV_X64_MSR_STIMER0_CONFIG)/2] =
3738                                msrs[i].data;
3739            break;
3740        case HV_X64_MSR_STIMER0_COUNT:
3741        case HV_X64_MSR_STIMER1_COUNT:
3742        case HV_X64_MSR_STIMER2_COUNT:
3743        case HV_X64_MSR_STIMER3_COUNT:
3744            env->msr_hv_stimer_count[(index - HV_X64_MSR_STIMER0_COUNT)/2] =
3745                                msrs[i].data;
3746            break;
3747        case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
3748            env->msr_hv_reenlightenment_control = msrs[i].data;
3749            break;
3750        case HV_X64_MSR_TSC_EMULATION_CONTROL:
3751            env->msr_hv_tsc_emulation_control = msrs[i].data;
3752            break;
3753        case HV_X64_MSR_TSC_EMULATION_STATUS:
3754            env->msr_hv_tsc_emulation_status = msrs[i].data;
3755            break;
3756        case MSR_MTRRdefType:
3757            env->mtrr_deftype = msrs[i].data;
3758            break;
3759        case MSR_MTRRfix64K_00000:
3760            env->mtrr_fixed[0] = msrs[i].data;
3761            break;
3762        case MSR_MTRRfix16K_80000:
3763            env->mtrr_fixed[1] = msrs[i].data;
3764            break;
3765        case MSR_MTRRfix16K_A0000:
3766            env->mtrr_fixed[2] = msrs[i].data;
3767            break;
3768        case MSR_MTRRfix4K_C0000:
3769            env->mtrr_fixed[3] = msrs[i].data;
3770            break;
3771        case MSR_MTRRfix4K_C8000:
3772            env->mtrr_fixed[4] = msrs[i].data;
3773            break;
3774        case MSR_MTRRfix4K_D0000:
3775            env->mtrr_fixed[5] = msrs[i].data;
3776            break;
3777        case MSR_MTRRfix4K_D8000:
3778            env->mtrr_fixed[6] = msrs[i].data;
3779            break;
3780        case MSR_MTRRfix4K_E0000:
3781            env->mtrr_fixed[7] = msrs[i].data;
3782            break;
3783        case MSR_MTRRfix4K_E8000:
3784            env->mtrr_fixed[8] = msrs[i].data;
3785            break;
3786        case MSR_MTRRfix4K_F0000:
3787            env->mtrr_fixed[9] = msrs[i].data;
3788            break;
3789        case MSR_MTRRfix4K_F8000:
3790            env->mtrr_fixed[10] = msrs[i].data;
3791            break;
3792        case MSR_MTRRphysBase(0) ... MSR_MTRRphysMask(MSR_MTRRcap_VCNT - 1):
3793            if (index & 1) {
3794                env->mtrr_var[MSR_MTRRphysIndex(index)].mask = msrs[i].data |
3795                                                               mtrr_top_bits;
3796            } else {
3797                env->mtrr_var[MSR_MTRRphysIndex(index)].base = msrs[i].data;
3798            }
3799            break;
3800        case MSR_IA32_SPEC_CTRL:
3801            env->spec_ctrl = msrs[i].data;
3802            break;
3803        case MSR_AMD64_TSC_RATIO:
3804            env->amd_tsc_scale_msr = msrs[i].data;
3805            break;
3806        case MSR_IA32_TSX_CTRL:
3807            env->tsx_ctrl = msrs[i].data;
3808            break;
3809        case MSR_VIRT_SSBD:
3810            env->virt_ssbd = msrs[i].data;
3811            break;
3812        case MSR_IA32_RTIT_CTL:
3813            env->msr_rtit_ctrl = msrs[i].data;
3814            break;
3815        case MSR_IA32_RTIT_STATUS:
3816            env->msr_rtit_status = msrs[i].data;
3817            break;
3818        case MSR_IA32_RTIT_OUTPUT_BASE:
3819            env->msr_rtit_output_base = msrs[i].data;
3820            break;
3821        case MSR_IA32_RTIT_OUTPUT_MASK:
3822            env->msr_rtit_output_mask = msrs[i].data;
3823            break;
3824        case MSR_IA32_RTIT_CR3_MATCH:
3825            env->msr_rtit_cr3_match = msrs[i].data;
3826            break;
3827        case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
3828            env->msr_rtit_addrs[index - MSR_IA32_RTIT_ADDR0_A] = msrs[i].data;
3829            break;
3830        case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
3831            env->msr_ia32_sgxlepubkeyhash[index - MSR_IA32_SGXLEPUBKEYHASH0] =
3832                           msrs[i].data;
3833            break;
3834        }
3835    }
3836
3837    return 0;
3838}
3839
3840static int kvm_put_mp_state(X86CPU *cpu)
3841{
3842    struct kvm_mp_state mp_state = { .mp_state = cpu->env.mp_state };
3843
3844    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MP_STATE, &mp_state);
3845}
3846
3847static int kvm_get_mp_state(X86CPU *cpu)
3848{
3849    CPUState *cs = CPU(cpu);
3850    CPUX86State *env = &cpu->env;
3851    struct kvm_mp_state mp_state;
3852    int ret;
3853
3854    ret = kvm_vcpu_ioctl(cs, KVM_GET_MP_STATE, &mp_state);
3855    if (ret < 0) {
3856        return ret;
3857    }
3858    env->mp_state = mp_state.mp_state;
3859    if (kvm_irqchip_in_kernel()) {
3860        cs->halted = (mp_state.mp_state == KVM_MP_STATE_HALTED);
3861    }
3862    return 0;
3863}
3864
3865static int kvm_get_apic(X86CPU *cpu)
3866{
3867    DeviceState *apic = cpu->apic_state;
3868    struct kvm_lapic_state kapic;
3869    int ret;
3870
3871    if (apic && kvm_irqchip_in_kernel()) {
3872        ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_LAPIC, &kapic);
3873        if (ret < 0) {
3874            return ret;
3875        }
3876
3877        kvm_get_apic_state(apic, &kapic);
3878    }
3879    return 0;
3880}
3881
3882static int kvm_put_vcpu_events(X86CPU *cpu, int level)
3883{
3884    CPUState *cs = CPU(cpu);
3885    CPUX86State *env = &cpu->env;
3886    struct kvm_vcpu_events events = {};
3887
3888    if (!kvm_has_vcpu_events()) {
3889        return 0;
3890    }
3891
3892    events.flags = 0;
3893
3894    if (has_exception_payload) {
3895        events.flags |= KVM_VCPUEVENT_VALID_PAYLOAD;
3896        events.exception.pending = env->exception_pending;
3897        events.exception_has_payload = env->exception_has_payload;
3898        events.exception_payload = env->exception_payload;
3899    }
3900    events.exception.nr = env->exception_nr;
3901    events.exception.injected = env->exception_injected;
3902    events.exception.has_error_code = env->has_error_code;
3903    events.exception.error_code = env->error_code;
3904
3905    events.interrupt.injected = (env->interrupt_injected >= 0);
3906    events.interrupt.nr = env->interrupt_injected;
3907    events.interrupt.soft = env->soft_interrupt;
3908
3909    events.nmi.injected = env->nmi_injected;
3910    events.nmi.pending = env->nmi_pending;
3911    events.nmi.masked = !!(env->hflags2 & HF2_NMI_MASK);
3912
3913    events.sipi_vector = env->sipi_vector;
3914
3915    if (has_msr_smbase) {
3916        events.smi.smm = !!(env->hflags & HF_SMM_MASK);
3917        events.smi.smm_inside_nmi = !!(env->hflags2 & HF2_SMM_INSIDE_NMI_MASK);
3918        if (kvm_irqchip_in_kernel()) {
3919            /* As soon as these are moved to the kernel, remove them
3920             * from cs->interrupt_request.
3921             */
3922            events.smi.pending = cs->interrupt_request & CPU_INTERRUPT_SMI;
3923            events.smi.latched_init = cs->interrupt_request & CPU_INTERRUPT_INIT;
3924            cs->interrupt_request &= ~(CPU_INTERRUPT_INIT | CPU_INTERRUPT_SMI);
3925        } else {
3926            /* Keep these in cs->interrupt_request.  */
3927            events.smi.pending = 0;
3928            events.smi.latched_init = 0;
3929        }
3930        /* Stop SMI delivery on old machine types to avoid a reboot
3931         * on an incoming migration of an old VM.
3932         */
3933        if (!cpu->kvm_no_smi_migration) {
3934            events.flags |= KVM_VCPUEVENT_VALID_SMM;
3935        }
3936    }
3937
3938    if (level >= KVM_PUT_RESET_STATE) {
3939        events.flags |= KVM_VCPUEVENT_VALID_NMI_PENDING;
3940        if (env->mp_state == KVM_MP_STATE_SIPI_RECEIVED) {
3941            events.flags |= KVM_VCPUEVENT_VALID_SIPI_VECTOR;
3942        }
3943    }
3944
3945    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_VCPU_EVENTS, &events);
3946}
3947
3948static int kvm_get_vcpu_events(X86CPU *cpu)
3949{
3950    CPUX86State *env = &cpu->env;
3951    struct kvm_vcpu_events events;
3952    int ret;
3953
3954    if (!kvm_has_vcpu_events()) {
3955        return 0;
3956    }
3957
3958    memset(&events, 0, sizeof(events));
3959    ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_VCPU_EVENTS, &events);
3960    if (ret < 0) {
3961       return ret;
3962    }
3963
3964    if (events.flags & KVM_VCPUEVENT_VALID_PAYLOAD) {
3965        env->exception_pending = events.exception.pending;
3966        env->exception_has_payload = events.exception_has_payload;
3967        env->exception_payload = events.exception_payload;
3968    } else {
3969        env->exception_pending = 0;
3970        env->exception_has_payload = false;
3971    }
3972    env->exception_injected = events.exception.injected;
3973    env->exception_nr =
3974        (env->exception_pending || env->exception_injected) ?
3975        events.exception.nr : -1;
3976    env->has_error_code = events.exception.has_error_code;
3977    env->error_code = events.exception.error_code;
3978
3979    env->interrupt_injected =
3980        events.interrupt.injected ? events.interrupt.nr : -1;
3981    env->soft_interrupt = events.interrupt.soft;
3982
3983    env->nmi_injected = events.nmi.injected;
3984    env->nmi_pending = events.nmi.pending;
3985    if (events.nmi.masked) {
3986        env->hflags2 |= HF2_NMI_MASK;
3987    } else {
3988        env->hflags2 &= ~HF2_NMI_MASK;
3989    }
3990
3991    if (events.flags & KVM_VCPUEVENT_VALID_SMM) {
3992        if (events.smi.smm) {
3993            env->hflags |= HF_SMM_MASK;
3994        } else {
3995            env->hflags &= ~HF_SMM_MASK;
3996        }
3997        if (events.smi.pending) {
3998            cpu_interrupt(CPU(cpu), CPU_INTERRUPT_SMI);
3999        } else {
4000            cpu_reset_interrupt(CPU(cpu), CPU_INTERRUPT_SMI);
4001        }
4002        if (events.smi.smm_inside_nmi) {
4003            env->hflags2 |= HF2_SMM_INSIDE_NMI_MASK;
4004        } else {
4005            env->hflags2 &= ~HF2_SMM_INSIDE_NMI_MASK;
4006        }
4007        if (events.smi.latched_init) {
4008            cpu_interrupt(CPU(cpu), CPU_INTERRUPT_INIT);
4009        } else {
4010            cpu_reset_interrupt(CPU(cpu), CPU_INTERRUPT_INIT);
4011        }
4012    }
4013
4014    env->sipi_vector = events.sipi_vector;
4015
4016    return 0;
4017}
4018
4019static int kvm_guest_debug_workarounds(X86CPU *cpu)
4020{
4021    CPUState *cs = CPU(cpu);
4022    CPUX86State *env = &cpu->env;
4023    int ret = 0;
4024    unsigned long reinject_trap = 0;
4025
4026    if (!kvm_has_vcpu_events()) {
4027        if (env->exception_nr == EXCP01_DB) {
4028            reinject_trap = KVM_GUESTDBG_INJECT_DB;
4029        } else if (env->exception_injected == EXCP03_INT3) {
4030            reinject_trap = KVM_GUESTDBG_INJECT_BP;
4031        }
4032        kvm_reset_exception(env);
4033    }
4034
4035    /*
4036     * Kernels before KVM_CAP_X86_ROBUST_SINGLESTEP overwrote flags.TF
4037     * injected via SET_GUEST_DEBUG while updating GP regs. Work around this
4038     * by updating the debug state once again if single-stepping is on.
4039     * Another reason to call kvm_update_guest_debug here is a pending debug
4040     * trap raised by the guest. On kernels without SET_VCPU_EVENTS we have to
4041     * reinject them via SET_GUEST_DEBUG.
4042     */
4043    if (reinject_trap ||
4044        (!kvm_has_robust_singlestep() && cs->singlestep_enabled)) {
4045        ret = kvm_update_guest_debug(cs, reinject_trap);
4046    }
4047    return ret;
4048}
4049
4050static int kvm_put_debugregs(X86CPU *cpu)
4051{
4052    CPUX86State *env = &cpu->env;
4053    struct kvm_debugregs dbgregs;
4054    int i;
4055
4056    if (!kvm_has_debugregs()) {
4057        return 0;
4058    }
4059
4060    memset(&dbgregs, 0, sizeof(dbgregs));
4061    for (i = 0; i < 4; i++) {
4062        dbgregs.db[i] = env->dr[i];
4063    }
4064    dbgregs.dr6 = env->dr[6];
4065    dbgregs.dr7 = env->dr[7];
4066    dbgregs.flags = 0;
4067
4068    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_DEBUGREGS, &dbgregs);
4069}
4070
4071static int kvm_get_debugregs(X86CPU *cpu)
4072{
4073    CPUX86State *env = &cpu->env;
4074    struct kvm_debugregs dbgregs;
4075    int i, ret;
4076
4077    if (!kvm_has_debugregs()) {
4078        return 0;
4079    }
4080
4081    ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_DEBUGREGS, &dbgregs);
4082    if (ret < 0) {
4083        return ret;
4084    }
4085    for (i = 0; i < 4; i++) {
4086        env->dr[i] = dbgregs.db[i];
4087    }
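        /* DR4 and DR5 are architectural aliases of DR6 and DR7 (with CR4.DE clear). */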
4088    env->dr[4] = env->dr[6] = dbgregs.dr6;
4089    env->dr[5] = env->dr[7] = dbgregs.dr7;
4090
4091    return 0;
4092}
4093
4094static int kvm_put_nested_state(X86CPU *cpu)
4095{
4096    CPUX86State *env = &cpu->env;
4097    int max_nested_state_len = kvm_max_nested_state_length();
4098
4099    if (!env->nested_state) {
4100        return 0;
4101    }
4102
4103    /*
4104     * Copy flags that are affected by reset from env->hflags and env->hflags2.
4105     */
4106    if (env->hflags & HF_GUEST_MASK) {
4107        env->nested_state->flags |= KVM_STATE_NESTED_GUEST_MODE;
4108    } else {
4109        env->nested_state->flags &= ~KVM_STATE_NESTED_GUEST_MODE;
4110    }
4111
4112    /* Don't set KVM_STATE_NESTED_GIF_SET on VMX as it is illegal */
4113    if (cpu_has_svm(env) && (env->hflags2 & HF2_GIF_MASK)) {
4114        env->nested_state->flags |= KVM_STATE_NESTED_GIF_SET;
4115    } else {
4116        env->nested_state->flags &= ~KVM_STATE_NESTED_GIF_SET;
4117    }
4118
4119    assert(env->nested_state->size <= max_nested_state_len);
4120    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_NESTED_STATE, env->nested_state);
4121}
4122
4123static int kvm_get_nested_state(X86CPU *cpu)
4124{
4125    CPUX86State *env = &cpu->env;
4126    int max_nested_state_len = kvm_max_nested_state_length();
4127    int ret;
4128
4129    if (!env->nested_state) {
4130        return 0;
4131    }
4132
4133    /*
4134     * It is possible that migration restored a smaller size into
4135     * nested_state->hdr.size than what our kernel supports.
4136     * We preserve the migration origin's nested_state->hdr.size for
4137     * the call to KVM_SET_NESTED_STATE, but want our next call to
4138     * KVM_GET_NESTED_STATE to use the maximum size our kernel supports.
4139     */
4140    env->nested_state->size = max_nested_state_len;
4141
4142    ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_NESTED_STATE, env->nested_state);
4143    if (ret < 0) {
4144        return ret;
4145    }
4146
4147    /*
4148     * Copy flags that are affected by reset to env->hflags and env->hflags2.
4149     */
4150    if (env->nested_state->flags & KVM_STATE_NESTED_GUEST_MODE) {
4151        env->hflags |= HF_GUEST_MASK;
4152    } else {
4153        env->hflags &= ~HF_GUEST_MASK;
4154    }
4155
4156    /* Keep HF2_GIF_MASK set on !SVM as x86_cpu_pending_interrupt() needs it */
4157    if (cpu_has_svm(env)) {
4158        if (env->nested_state->flags & KVM_STATE_NESTED_GIF_SET) {
4159            env->hflags2 |= HF2_GIF_MASK;
4160        } else {
4161            env->hflags2 &= ~HF2_GIF_MASK;
4162        }
4163    }
4164
4165    return ret;
4166}
4167
4168int kvm_arch_put_registers(CPUState *cpu, int level)
4169{
4170    X86CPU *x86_cpu = X86_CPU(cpu);
4171    int ret;
4172
4173    assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));
4174
4175    /* must be before kvm_put_nested_state so that EFER.SVME is set */
4176    ret = kvm_put_sregs(x86_cpu);
4177    if (ret < 0) {
4178        return ret;
4179    }
4180
4181    if (level >= KVM_PUT_RESET_STATE) {
4182        ret = kvm_put_nested_state(x86_cpu);
4183        if (ret < 0) {
4184            return ret;
4185        }
4186
4187        ret = kvm_put_msr_feature_control(x86_cpu);
4188        if (ret < 0) {
4189            return ret;
4190        }
4191    }
4192
4193    if (level == KVM_PUT_FULL_STATE) {
4194        /* We don't check for kvm_arch_set_tsc_khz() errors here,
4195         * because TSC frequency mismatch shouldn't abort migration,
4196         * unless the user explicitly asked for a more strict TSC
4197         * setting (e.g. using an explicit "tsc-freq" option).
4198         */
4199        kvm_arch_set_tsc_khz(cpu);
4200    }
4201
4202    ret = kvm_getput_regs(x86_cpu, 1);
4203    if (ret < 0) {
4204        return ret;
4205    }
4206    ret = kvm_put_xsave(x86_cpu);
4207    if (ret < 0) {
4208        return ret;
4209    }
4210    ret = kvm_put_xcrs(x86_cpu);
4211    if (ret < 0) {
4212        return ret;
4213    }
4214    /* must be before kvm_put_msrs */
4215    ret = kvm_inject_mce_oldstyle(x86_cpu);
4216    if (ret < 0) {
4217        return ret;
4218    }
4219    ret = kvm_put_msrs(x86_cpu, level);
4220    if (ret < 0) {
4221        return ret;
4222    }
4223    ret = kvm_put_vcpu_events(x86_cpu, level);
4224    if (ret < 0) {
4225        return ret;
4226    }
4227    if (level >= KVM_PUT_RESET_STATE) {
4228        ret = kvm_put_mp_state(x86_cpu);
4229        if (ret < 0) {
4230            return ret;
4231        }
4232    }
4233
4234    ret = kvm_put_tscdeadline_msr(x86_cpu);
4235    if (ret < 0) {
4236        return ret;
4237    }
4238    ret = kvm_put_debugregs(x86_cpu);
4239    if (ret < 0) {
4240        return ret;
4241    }
4242    /* must be last */
4243    ret = kvm_guest_debug_workarounds(x86_cpu);
4244    if (ret < 0) {
4245        return ret;
4246    }
4247    return 0;
4248}
4249
4250int kvm_arch_get_registers(CPUState *cs)
4251{
4252    X86CPU *cpu = X86_CPU(cs);
4253    int ret;
4254
4255    assert(cpu_is_stopped(cs) || qemu_cpu_is_self(cs));
4256
4257    ret = kvm_get_vcpu_events(cpu);
4258    if (ret < 0) {
4259        goto out;
4260    }
4261    /*
4262     * KVM_GET_MPSTATE can modify CS and RIP, call it before
4263     * KVM_GET_REGS and KVM_GET_SREGS.
4264     */
4265    ret = kvm_get_mp_state(cpu);
4266    if (ret < 0) {
4267        goto out;
4268    }
4269    ret = kvm_getput_regs(cpu, 0);
4270    if (ret < 0) {
4271        goto out;
4272    }
4273    ret = kvm_get_xsave(cpu);
4274    if (ret < 0) {
4275        goto out;
4276    }
4277    ret = kvm_get_xcrs(cpu);
4278    if (ret < 0) {
4279        goto out;
4280    }
4281    ret = kvm_get_sregs(cpu);
4282    if (ret < 0) {
4283        goto out;
4284    }
4285    ret = kvm_get_msrs(cpu);
4286    if (ret < 0) {
4287        goto out;
4288    }
4289    ret = kvm_get_apic(cpu);
4290    if (ret < 0) {
4291        goto out;
4292    }
4293    ret = kvm_get_debugregs(cpu);
4294    if (ret < 0) {
4295        goto out;
4296    }
4297    ret = kvm_get_nested_state(cpu);
4298    if (ret < 0) {
4299        goto out;
4300    }
4301    ret = 0;
4302 out:
4303    cpu_sync_bndcs_hflags(&cpu->env);
4304    return ret;
4305}
4306
4307void kvm_arch_pre_run(CPUState *cpu, struct kvm_run *run)
4308{
4309    X86CPU *x86_cpu = X86_CPU(cpu);
4310    CPUX86State *env = &x86_cpu->env;
4311    int ret;
4312
4313    /* Inject NMI */
4314    if (cpu->interrupt_request & (CPU_INTERRUPT_NMI | CPU_INTERRUPT_SMI)) {
4315        if (cpu->interrupt_request & CPU_INTERRUPT_NMI) {
4316            qemu_mutex_lock_iothread();
4317            cpu->interrupt_request &= ~CPU_INTERRUPT_NMI;
4318            qemu_mutex_unlock_iothread();
4319            DPRINTF("injected NMI\n");
4320            ret = kvm_vcpu_ioctl(cpu, KVM_NMI);
4321            if (ret < 0) {
4322                fprintf(stderr, "KVM: injection failed, NMI lost (%s)\n",
4323                        strerror(-ret));
4324            }
4325        }
4326        if (cpu->interrupt_request & CPU_INTERRUPT_SMI) {
4327            qemu_mutex_lock_iothread();
4328            cpu->interrupt_request &= ~CPU_INTERRUPT_SMI;
4329            qemu_mutex_unlock_iothread();
4330            DPRINTF("injected SMI\n");
4331            ret = kvm_vcpu_ioctl(cpu, KVM_SMI);
4332            if (ret < 0) {
4333                fprintf(stderr, "KVM: injection failed, SMI lost (%s)\n",
4334                        strerror(-ret));
4335            }
4336        }
4337    }
4338
4339    if (!kvm_pic_in_kernel()) {
4340        qemu_mutex_lock_iothread();
4341    }
4342
4343    /* Force the VCPU out of its inner loop to process any INIT requests
4344     * or (for userspace APIC, but it is cheap to combine the checks here)
4345     * pending TPR access reports.
4346     */
4347    if (cpu->interrupt_request & (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) {
4348        if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
4349            !(env->hflags & HF_SMM_MASK)) {
4350            cpu->exit_request = 1;
4351        }
4352        if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
4353            cpu->exit_request = 1;
4354        }
4355    }
4356
4357    if (!kvm_pic_in_kernel()) {
4358        /* Try to inject an interrupt if the guest can accept it */
4359        if (run->ready_for_interrupt_injection &&
4360            (cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
4361            (env->eflags & IF_MASK)) {
4362            int irq;
4363
4364            cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
4365            irq = cpu_get_pic_interrupt(env);
4366            if (irq >= 0) {
4367                struct kvm_interrupt intr;
4368
4369                intr.irq = irq;
4370                DPRINTF("injected interrupt %d\n", irq);
4371                ret = kvm_vcpu_ioctl(cpu, KVM_INTERRUPT, &intr);
4372                if (ret < 0) {
4373                    fprintf(stderr,
4374                            "KVM: injection failed, interrupt lost (%s)\n",
4375                            strerror(-ret));
4376                }
4377            }
4378        }
4379
4380        /* If we have an interrupt but the guest is not ready to receive an
4381         * interrupt, request an interrupt window exit.  This will
4382         * cause a return to userspace as soon as the guest is ready to
4383         * receive interrupts. */
4384        if ((cpu->interrupt_request & CPU_INTERRUPT_HARD)) {
4385            run->request_interrupt_window = 1;
4386        } else {
4387            run->request_interrupt_window = 0;
4388        }
4389
4390        DPRINTF("setting tpr\n");
4391        run->cr8 = cpu_get_apic_tpr(x86_cpu->apic_state);
4392
4393        qemu_mutex_unlock_iothread();
4394    }
4395}
4396
4397static void kvm_rate_limit_on_bus_lock(void)
4398{
4399    uint64_t delay_ns = ratelimit_calculate_delay(&bus_lock_ratelimit_ctrl, 1);
4400
4401    if (delay_ns) {
4402        g_usleep(delay_ns / SCALE_US);
4403    }
4404}
4405
4406MemTxAttrs kvm_arch_post_run(CPUState *cpu, struct kvm_run *run)
4407{
4408    X86CPU *x86_cpu = X86_CPU(cpu);
4409    CPUX86State *env = &x86_cpu->env;
4410
4411    if (run->flags & KVM_RUN_X86_SMM) {
4412        env->hflags |= HF_SMM_MASK;
4413    } else {
4414        env->hflags &= ~HF_SMM_MASK;
4415    }
4416    if (run->if_flag) {
4417        env->eflags |= IF_MASK;
4418    } else {
4419        env->eflags &= ~IF_MASK;
4420    }
4421    if (run->flags & KVM_RUN_X86_BUS_LOCK) {
4422        kvm_rate_limit_on_bus_lock();
4423    }
4424
4425    /* We need to protect the apic state against concurrent accesses from
4426     * different threads in case the userspace irqchip is used. */
4427    if (!kvm_irqchip_in_kernel()) {
4428        qemu_mutex_lock_iothread();
4429    }
4430    cpu_set_apic_tpr(x86_cpu->apic_state, run->cr8);
4431    cpu_set_apic_base(x86_cpu->apic_state, run->apic_base);
4432    if (!kvm_irqchip_in_kernel()) {
4433        qemu_mutex_unlock_iothread();
4434    }
4435    return cpu_get_mem_attrs(env);
4436}
4437
4438int kvm_arch_process_async_events(CPUState *cs)
4439{
4440    X86CPU *cpu = X86_CPU(cs);
4441    CPUX86State *env = &cpu->env;
4442
4443    if (cs->interrupt_request & CPU_INTERRUPT_MCE) {
4444        /* We must not raise CPU_INTERRUPT_MCE if it's not supported. */
4445        assert(env->mcg_cap);
4446
4447        cs->interrupt_request &= ~CPU_INTERRUPT_MCE;
4448
4449        kvm_cpu_synchronize_state(cs);
4450
4451        if (env->exception_nr == EXCP08_DBLE) {
4452            /* this means triple fault */
4453            qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
4454            cs->exit_request = 1;
4455            return 0;
4456        }
4457        kvm_queue_exception(env, EXCP12_MCHK, 0, 0);
4458        env->has_error_code = 0;
4459
4460        cs->halted = 0;
4461        if (kvm_irqchip_in_kernel() && env->mp_state == KVM_MP_STATE_HALTED) {
4462            env->mp_state = KVM_MP_STATE_RUNNABLE;
4463        }
4464    }
4465
4466    if ((cs->interrupt_request & CPU_INTERRUPT_INIT) &&
4467        !(env->hflags & HF_SMM_MASK)) {
4468        kvm_cpu_synchronize_state(cs);
4469        do_cpu_init(cpu);
4470    }
4471
4472    if (kvm_irqchip_in_kernel()) {
4473        return 0;
4474    }
4475
4476    if (cs->interrupt_request & CPU_INTERRUPT_POLL) {
4477        cs->interrupt_request &= ~CPU_INTERRUPT_POLL;
4478        apic_poll_irq(cpu->apic_state);
4479    }
4480    if (((cs->interrupt_request & CPU_INTERRUPT_HARD) &&
4481         (env->eflags & IF_MASK)) ||
4482        (cs->interrupt_request & CPU_INTERRUPT_NMI)) {
4483        cs->halted = 0;
4484    }
4485    if (cs->interrupt_request & CPU_INTERRUPT_SIPI) {
4486        kvm_cpu_synchronize_state(cs);
4487        do_cpu_sipi(cpu);
4488    }
4489    if (cs->interrupt_request & CPU_INTERRUPT_TPR) {
4490        cs->interrupt_request &= ~CPU_INTERRUPT_TPR;
4491        kvm_cpu_synchronize_state(cs);
4492        apic_handle_tpr_access_report(cpu->apic_state, env->eip,
4493                                      env->tpr_access_type);
4494    }
4495
4496    return cs->halted;
4497}
4498
4499static int kvm_handle_halt(X86CPU *cpu)
4500{
4501    CPUState *cs = CPU(cpu);
4502    CPUX86State *env = &cpu->env;
4503
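        /* Keep the vCPU halted unless an unmasked interrupt or an NMI is pending. */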
4504    if (!((cs->interrupt_request & CPU_INTERRUPT_HARD) &&
4505          (env->eflags & IF_MASK)) &&
4506        !(cs->interrupt_request & CPU_INTERRUPT_NMI)) {
4507        cs->halted = 1;
4508        return EXCP_HLT;
4509    }
4510
4511    return 0;
4512}
4513
4514static int kvm_handle_tpr_access(X86CPU *cpu)
4515{
4516    CPUState *cs = CPU(cpu);
4517    struct kvm_run *run = cs->kvm_run;
4518
4519    apic_handle_tpr_access_report(cpu->apic_state, run->tpr_access.rip,
4520                                  run->tpr_access.is_write ? TPR_ACCESS_WRITE
4521                                                           : TPR_ACCESS_READ);
4522    return 1;
4523}
4524
4525int kvm_arch_insert_sw_breakpoint(CPUState *cs, struct kvm_sw_breakpoint *bp)
4526{
4527    static const uint8_t int3 = 0xcc;
4528
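        /* Save the original byte at bp->pc, then patch in a one-byte INT3 (0xcc). */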
4529    if (cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&bp->saved_insn, 1, 0) ||
4530        cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&int3, 1, 1)) {
4531        return -EINVAL;
4532    }
4533    return 0;
4534}
4535
4536int kvm_arch_remove_sw_breakpoint(CPUState *cs, struct kvm_sw_breakpoint *bp)
4537{
4538    uint8_t int3;
4539
4540    if (cpu_memory_rw_debug(cs, bp->pc, &int3, 1, 0)) {
4541        return -EINVAL;
4542    }
4543    if (int3 != 0xcc) {
4544        return 0;
4545    }
4546    if (cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&bp->saved_insn, 1, 1)) {
4547        return -EINVAL;
4548    }
4549    return 0;
4550}
4551
4552static struct {
4553    target_ulong addr;
4554    int len;
4555    int type;
4556} hw_breakpoint[4];
4557
4558static int nb_hw_breakpoint;
4559
4560static int find_hw_breakpoint(target_ulong addr, int len, int type)
4561{
4562    int n;
4563
4564    for (n = 0; n < nb_hw_breakpoint; n++) {
4565        if (hw_breakpoint[n].addr == addr && hw_breakpoint[n].type == type &&
4566            (hw_breakpoint[n].len == len || len == -1)) {
4567            return n;
4568        }
4569    }
4570    return -1;
4571}
4572
4573int kvm_arch_insert_hw_breakpoint(target_ulong addr,
4574                                  target_ulong len, int type)
4575{
4576    switch (type) {
4577    case GDB_BREAKPOINT_HW:
4578        len = 1;
4579        break;
4580    case GDB_WATCHPOINT_WRITE:
4581    case GDB_WATCHPOINT_ACCESS:
4582        switch (len) {
4583        case 1:
4584            break;
4585        case 2:
4586        case 4:
4587        case 8:
4588            if (addr & (len - 1)) {
4589                return -EINVAL;
4590            }
4591            break;
4592        default:
4593            return -EINVAL;
4594        }
4595        break;
4596    default:
4597        return -ENOSYS;
4598    }
4599
4600    if (nb_hw_breakpoint == 4) {
4601        return -ENOBUFS;
4602    }
4603    if (find_hw_breakpoint(addr, len, type) >= 0) {
4604        return -EEXIST;
4605    }
4606    hw_breakpoint[nb_hw_breakpoint].addr = addr;
4607    hw_breakpoint[nb_hw_breakpoint].len = len;
4608    hw_breakpoint[nb_hw_breakpoint].type = type;
4609    nb_hw_breakpoint++;
4610
4611    return 0;
4612}
4613
4614int kvm_arch_remove_hw_breakpoint(target_ulong addr,
4615                                  target_ulong len, int type)
4616{
4617    int n;
4618
4619    n = find_hw_breakpoint(addr, (type == GDB_BREAKPOINT_HW) ? 1 : len, type);
4620    if (n < 0) {
4621        return -ENOENT;
4622    }
4623    nb_hw_breakpoint--;
4624    hw_breakpoint[n] = hw_breakpoint[nb_hw_breakpoint];
4625
4626    return 0;
4627}
4628
4629void kvm_arch_remove_all_hw_breakpoints(void)
4630{
4631    nb_hw_breakpoint = 0;
4632}
4633
4634static CPUWatchpoint hw_watchpoint;
4635
4636static int kvm_handle_debug(X86CPU *cpu,
4637                            struct kvm_debug_exit_arch *arch_info)
4638{
4639    CPUState *cs = CPU(cpu);
4640    CPUX86State *env = &cpu->env;
4641    int ret = 0;
4642    int n;
4643
4644    if (arch_info->exception == EXCP01_DB) {
4645        if (arch_info->dr6 & DR6_BS) {
4646            if (cs->singlestep_enabled) {
4647                ret = EXCP_DEBUG;
4648            }
4649        } else {
4650            for (n = 0; n < 4; n++) {
4651                if (arch_info->dr6 & (1 << n)) {
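                        /* DR7 R/Wn field: 0 == execute, 1 == write, 3 == read/write. */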
4652                    switch ((arch_info->dr7 >> (16 + n*4)) & 0x3) {
4653                    case 0x0:
4654                        ret = EXCP_DEBUG;
4655                        break;
4656                    case 0x1:
4657                        ret = EXCP_DEBUG;
4658                        cs->watchpoint_hit = &hw_watchpoint;
4659                        hw_watchpoint.vaddr = hw_breakpoint[n].addr;
4660                        hw_watchpoint.flags = BP_MEM_WRITE;
4661                        break;
4662                    case 0x3:
4663                        ret = EXCP_DEBUG;
4664                        cs->watchpoint_hit = &hw_watchpoint;
4665                        hw_watchpoint.vaddr = hw_breakpoint[n].addr;
4666                        hw_watchpoint.flags = BP_MEM_ACCESS;
4667                        break;
4668                    }
4669                }
4670            }
4671        }
4672    } else if (kvm_find_sw_breakpoint(cs, arch_info->pc)) {
4673        ret = EXCP_DEBUG;
4674    }
4675    if (ret == 0) {
4676        cpu_synchronize_state(cs);
4677        assert(env->exception_nr == -1);
4678
4679        /* pass to guest */
4680        kvm_queue_exception(env, arch_info->exception,
4681                            arch_info->exception == EXCP01_DB,
4682                            arch_info->dr6);
4683        env->has_error_code = 0;
4684    }
4685
4686    return ret;
4687}
4688
4689void kvm_arch_update_guest_debug(CPUState *cpu, struct kvm_guest_debug *dbg)
4690{
4691    const uint8_t type_code[] = {
4692        [GDB_BREAKPOINT_HW] = 0x0,
4693        [GDB_WATCHPOINT_WRITE] = 0x1,
4694        [GDB_WATCHPOINT_ACCESS] = 0x3
4695    };
4696    const uint8_t len_code[] = {
4697        [1] = 0x0, [2] = 0x1, [4] = 0x3, [8] = 0x2
4698    };
4699    int n;
4700
4701    if (kvm_sw_breakpoints_active(cpu)) {
4702        dbg->control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP;
4703    }
4704    if (nb_hw_breakpoint > 0) {
4705        dbg->control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP;
4706        dbg->arch.debugreg[7] = 0x0600;
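            /*
             * 0x0600 sets GE (bit 9) and the reserved always-one bit 10.  For
             * each breakpoint n, bit 2n+1 is its global-enable bit, bits
             * 16+4n..17+4n encode the access type and bits 18+4n..19+4n the length.
             */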
4707        for (n = 0; n < nb_hw_breakpoint; n++) {
4708            dbg->arch.debugreg[n] = hw_breakpoint[n].addr;
4709            dbg->arch.debugreg[7] |= (2 << (n * 2)) |
4710                (type_code[hw_breakpoint[n].type] << (16 + n*4)) |
4711                ((uint32_t)len_code[hw_breakpoint[n].len] << (18 + n*4));
4712        }
4713    }
4714}
4715
4716static bool has_sgx_provisioning;
4717
4718static bool __kvm_enable_sgx_provisioning(KVMState *s)
4719{
4720    int fd, ret;
4721
4722    if (!kvm_vm_check_extension(s, KVM_CAP_SGX_ATTRIBUTE)) {
4723        return false;
4724    }
4725
4726    fd = qemu_open_old("/dev/sgx_provision", O_RDONLY);
4727    if (fd < 0) {
4728        return false;
4729    }
4730
4731    ret = kvm_vm_enable_cap(s, KVM_CAP_SGX_ATTRIBUTE, 0, fd);
4732    if (ret) {
4733        error_report("Could not enable SGX PROVISIONKEY: %s", strerror(-ret));
4734        exit(1);
4735    }
4736    close(fd);
4737    return true;
4738}
4739
4740bool kvm_enable_sgx_provisioning(KVMState *s)
4741{
4742    return MEMORIZE(__kvm_enable_sgx_provisioning(s), has_sgx_provisioning);
4743}
4744
4745static bool host_supports_vmx(void)
4746{
4747    uint32_t ecx, unused;
4748
4749    host_cpuid(1, 0, &unused, &unused, &ecx, &unused);
4750    return ecx & CPUID_EXT_VMX;
4751}
4752
4753#define VMX_INVALID_GUEST_STATE 0x80000021
4754
4755int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run)
4756{
4757    X86CPU *cpu = X86_CPU(cs);
4758    uint64_t code;
4759    int ret;
4760
4761    switch (run->exit_reason) {
4762    case KVM_EXIT_HLT:
4763        DPRINTF("handle_hlt\n");
4764        qemu_mutex_lock_iothread();
4765        ret = kvm_handle_halt(cpu);
4766        qemu_mutex_unlock_iothread();
4767        break;
4768    case KVM_EXIT_SET_TPR:
4769        ret = 0;
4770        break;
4771    case KVM_EXIT_TPR_ACCESS:
4772        qemu_mutex_lock_iothread();
4773        ret = kvm_handle_tpr_access(cpu);
4774        qemu_mutex_unlock_iothread();
4775        break;
4776    case KVM_EXIT_FAIL_ENTRY:
4777        code = run->fail_entry.hardware_entry_failure_reason;
4778        fprintf(stderr, "KVM: entry failed, hardware error 0x%" PRIx64 "\n",
4779                code);
4780        if (host_supports_vmx() && code == VMX_INVALID_GUEST_STATE) {
4781            fprintf(stderr,
4782                    "\nIf you're running a guest on an Intel machine without "
4783                        "unrestricted mode\n"
4784                    "support, the failure can be most likely due to the guest "
4785                        "entering an invalid\n"
4786                    "state for Intel VT. For example, the guest maybe running "
4787                        "in big real mode\n"
4788                    "which is not supported on less recent Intel processors."
4789                        "\n\n");
4790        }
4791        ret = -1;
4792        break;
4793    case KVM_EXIT_EXCEPTION:
4794        fprintf(stderr, "KVM: exception %d exit (error code 0x%x)\n",
4795                run->ex.exception, run->ex.error_code);
4796        ret = -1;
4797        break;
4798    case KVM_EXIT_DEBUG:
4799        DPRINTF("kvm_exit_debug\n");
4800        qemu_mutex_lock_iothread();
4801        ret = kvm_handle_debug(cpu, &run->debug.arch);
4802        qemu_mutex_unlock_iothread();
4803        break;
4804    case KVM_EXIT_HYPERV:
4805        ret = kvm_hv_handle_exit(cpu, &run->hyperv);
4806        break;
4807    case KVM_EXIT_IOAPIC_EOI:
4808        ioapic_eoi_broadcast(run->eoi.vector);
4809        ret = 0;
4810        break;
4811    case KVM_EXIT_X86_BUS_LOCK:
4812        /* already handled in kvm_arch_post_run */
4813        ret = 0;
4814        break;
4815    default:
4816        fprintf(stderr, "KVM: unknown exit reason %d\n", run->exit_reason);
4817        ret = -1;
4818        break;
4819    }
4820
4821    return ret;
4822}
4823
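/*
 * Decide whether an emulation failure is fatal to QEMU: stop only if
 * the guest was in real mode or running with CPL != 3, i.e. failures
 * in guest user space are left for the guest OS to deal with.
 */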
bool kvm_arch_stop_on_emulation_error(CPUState *cs)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    kvm_cpu_synchronize_state(cs);
    return !(env->cr[0] & CR0_PE_MASK) ||
           ((env->segs[R_CS].selector & 3) != 3);
}

void kvm_arch_init_irq_routing(KVMState *s)
{
    /* We know at this point that we're using the in-kernel
     * irqchip, so we can use irqfds, and on x86 we know
     * we can use msi via irqfd and GSI routing.
     */
    kvm_msi_via_irqfd_allowed = true;
    kvm_gsi_routing_allowed = true;

    if (kvm_irqchip_is_split()) {
        int i;

        /* If the ioapic is in QEMU and the lapics are in KVM, reserve
           MSI routes for signaling interrupts to the local apics. */
        for (i = 0; i < IOAPIC_NUM_PINS; i++) {
            if (kvm_irqchip_add_msi_route(s, 0, NULL) < 0) {
                error_report("Could not enable split IRQ mode.");
                exit(1);
            }
        }
    }
}

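/*
 * With a split irqchip the LAPICs stay in the kernel while the IOAPIC
 * is emulated by QEMU; the capability argument of 24 is the number of
 * IOAPIC pins reserved for userspace routing.
 */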
int kvm_arch_irqchip_create(KVMState *s)
{
    int ret;
    if (kvm_kernel_irqchip_split()) {
        ret = kvm_vm_enable_cap(s, KVM_CAP_SPLIT_IRQCHIP, 0, 24);
        if (ret) {
            error_report("Could not enable split irqchip mode: %s",
                         strerror(-ret));
            exit(1);
        } else {
            DPRINTF("Enabled KVM_CAP_SPLIT_IRQCHIP\n");
            kvm_split_irqchip = true;
            return 1;
        }
    } else {
        return 0;
    }
}

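/*
 * When the guest advertises KVM_FEATURE_MSI_EXT_DEST_ID, the extended
 * destination ID is carried in bits 11:5 of the compatibility
 * (non-remappable) MSI address format.  Shift those bits up into the
 * high half of the address so that CPUs with high APIC IDs can be
 * targeted without an interrupt remapping unit.  Leave the address
 * untouched if the remappable-format bit (bit 4) is set, the upper
 * 32 bits are already in use, or no extended bits are present.
 */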
uint64_t kvm_swizzle_msi_ext_dest_id(uint64_t address)
{
    CPUX86State *env;
    uint64_t ext_id;

    if (!first_cpu) {
        return address;
    }
    env = &X86_CPU(first_cpu)->env;
    if (!(env->features[FEAT_KVM] & (1 << KVM_FEATURE_MSI_EXT_DEST_ID))) {
        return address;
    }

    /*
     * If the remappable format bit is set, or the upper bits are
     * already set in address_hi, or the low extended bits aren't
     * there anyway, do nothing.
     */
    ext_id = address & (0xff << MSI_ADDR_DEST_IDX_SHIFT);
    if (!ext_id || (ext_id & (1 << MSI_ADDR_DEST_IDX_SHIFT)) || (address >> 32)) {
        return address;
    }

    address &= ~ext_id;
    address |= ext_id << 35;
    return address;
}

int kvm_arch_fixup_msi_route(struct kvm_irq_routing_entry *route,
                             uint64_t address, uint32_t data, PCIDevice *dev)
{
    X86IOMMUState *iommu = x86_iommu_get_default();

    if (iommu) {
        X86IOMMUClass *class = X86_IOMMU_DEVICE_GET_CLASS(iommu);

        if (class->int_remap) {
            int ret;
            MSIMessage src, dst;

            src.address = route->u.msi.address_hi;
            src.address <<= VTD_MSI_ADDR_HI_SHIFT;
            src.address |= route->u.msi.address_lo;
            src.data = route->u.msi.data;

            ret = class->int_remap(iommu, &src, &dst, dev ?
                                   pci_requester_id(dev) :
                                   X86_IOMMU_SID_INVALID);
            if (ret) {
                trace_kvm_x86_fixup_msi_error(route->gsi);
                return 1;
            }

            /*
             * Handle an untranslated compatibility format interrupt
             * with the extended destination ID in address bits 11-5.
             */
            dst.address = kvm_swizzle_msi_ext_dest_id(dst.address);

            route->u.msi.address_hi = dst.address >> VTD_MSI_ADDR_HI_SHIFT;
            route->u.msi.address_lo = dst.address & VTD_MSI_ADDR_LO_MASK;
            route->u.msi.data = dst.data;
            return 0;
        }
    }

    address = kvm_swizzle_msi_ext_dest_id(address);
    route->u.msi.address_hi = address >> VTD_MSI_ADDR_HI_SHIFT;
    route->u.msi.address_lo = address & VTD_MSI_ADDR_LO_MASK;
    return 0;
}

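/*
 * Track the KVM MSI routes added on behalf of PCI devices so that they
 * can be refreshed when the IOMMU invalidates its interrupt entry
 * cache (see kvm_update_msi_routes_all() below).
 */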
typedef struct MSIRouteEntry MSIRouteEntry;

struct MSIRouteEntry {
    PCIDevice *dev;             /* Device pointer */
    int vector;                 /* MSI/MSIX vector index */
    int virq;                   /* Virtual IRQ index */
    QLIST_ENTRY(MSIRouteEntry) list;
};

/* List of used GSI routes */
static QLIST_HEAD(, MSIRouteEntry) msi_route_list =
    QLIST_HEAD_INITIALIZER(msi_route_list);

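/*
 * IEC (interrupt entry cache) invalidation notifier: walk every tracked
 * MSI route and push the device's current, unmasked MSI/MSI-X message
 * back into KVM's routing table, then commit the whole table once.
 */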
static void kvm_update_msi_routes_all(void *private, bool global,
                                      uint32_t index, uint32_t mask)
{
    int cnt = 0, vector;
    MSIRouteEntry *entry;
    MSIMessage msg;
    PCIDevice *dev;

    /* TODO: explicit route update */
    QLIST_FOREACH(entry, &msi_route_list, list) {
        cnt++;
        vector = entry->vector;
        dev = entry->dev;
        if (msix_enabled(dev) && !msix_is_masked(dev, vector)) {
            msg = msix_get_message(dev, vector);
        } else if (msi_enabled(dev) && !msi_is_masked(dev, vector)) {
            msg = msi_get_message(dev, vector);
        } else {
            /*
             * Either MSI/MSIX is disabled for the device, or the
             * specific message was masked out.  Skip this one.
             */
            continue;
        }
        kvm_irqchip_update_msi_route(kvm_state, entry->virq, msg, dev);
    }
    kvm_irqchip_commit_routes(kvm_state);
    trace_kvm_x86_update_msi_routes(cnt);
}

int kvm_arch_add_msi_route_post(struct kvm_irq_routing_entry *route,
                                int vector, PCIDevice *dev)
{
    static bool notify_list_inited = false;
    MSIRouteEntry *entry;

    if (!dev) {
        /* Routes without a device are (most likely) IOAPIC routes that
         * are only used in split kernel irqchip mode; we only keep
         * track of PCI devices here. */
        return 0;
    }

    entry = g_new0(MSIRouteEntry, 1);
    entry->dev = dev;
    entry->vector = vector;
    entry->virq = route->gsi;
    QLIST_INSERT_HEAD(&msi_route_list, entry, list);

    trace_kvm_x86_add_msi_route(route->gsi);

    if (!notify_list_inited) {
        /* The first time a route is added, register ourselves with the
         * IOMMU's IEC notifier list, if there is an IOMMU. */
        X86IOMMUState *iommu = x86_iommu_get_default();
        if (iommu) {
            x86_iommu_iec_register_notifier(iommu,
                                            kvm_update_msi_routes_all,
                                            NULL);
        }
        notify_list_inited = true;
    }
    return 0;
}

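/* Drop the bookkeeping entry for a route once its virq is released. */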
int kvm_arch_release_virq_post(int virq)
{
    MSIRouteEntry *entry, *next;
    QLIST_FOREACH_SAFE(entry, &msi_route_list, list, next) {
        if (entry->virq == virq) {
            trace_kvm_x86_remove_msi_route(virq);
            QLIST_REMOVE(entry, list);
            g_free(entry);
            break;
        }
    }
    return 0;
}

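/* Never reached on x86; MSI data is not translated directly into a GSI here. */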
int kvm_arch_msi_data_to_gsi(uint32_t data)
{
    abort();
}

bool kvm_has_waitpkg(void)
{
    return has_msr_umwait;
}

bool kvm_arch_cpu_check_are_resettable(void)
{
    return !sev_es_enabled();
}
