qemu/target/i386/kvm.c
   1/*
   2 * QEMU KVM support
   3 *
   4 * Copyright (C) 2006-2008 Qumranet Technologies
   5 * Copyright IBM, Corp. 2008
   6 *
   7 * Authors:
   8 *  Anthony Liguori   <aliguori@us.ibm.com>
   9 *
  10 * This work is licensed under the terms of the GNU GPL, version 2 or later.
  11 * See the COPYING file in the top-level directory.
  12 *
  13 */
  14
  15#include "qemu/osdep.h"
  16#include "qapi/error.h"
  17#include <sys/ioctl.h>
  18#include <sys/utsname.h>
  19
  20#include <linux/kvm.h>
  21#include "standard-headers/asm-x86/kvm_para.h"
  22
  23#include "qemu-common.h"
  24#include "cpu.h"
  25#include "sysemu/sysemu.h"
  26#include "sysemu/hw_accel.h"
  27#include "sysemu/kvm_int.h"
  28#include "kvm_i386.h"
  29#include "hyperv.h"
  30#include "hyperv-proto.h"
  31
  32#include "exec/gdbstub.h"
  33#include "qemu/host-utils.h"
  34#include "qemu/config-file.h"
  35#include "qemu/error-report.h"
  36#include "hw/i386/pc.h"
  37#include "hw/i386/apic.h"
  38#include "hw/i386/apic_internal.h"
  39#include "hw/i386/apic-msidef.h"
  40#include "hw/i386/intel_iommu.h"
  41#include "hw/i386/x86-iommu.h"
  42
  43#include "hw/pci/pci.h"
  44#include "hw/pci/msi.h"
  45#include "hw/pci/msix.h"
  46#include "migration/blocker.h"
  47#include "exec/memattrs.h"
  48#include "trace.h"
  49
  50//#define DEBUG_KVM
  51
  52#ifdef DEBUG_KVM
  53#define DPRINTF(fmt, ...) \
  54    do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
  55#else
  56#define DPRINTF(fmt, ...) \
  57    do { } while (0)
  58#endif
  59
  60#define MSR_KVM_WALL_CLOCK  0x11
  61#define MSR_KVM_SYSTEM_TIME 0x12
  62
  63/* A 4096-byte buffer can hold the 8-byte kvm_msrs header, plus
  64 * 255 kvm_msr_entry structs */
  65#define MSR_BUF_SIZE 4096
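/* (sizeof(struct kvm_msrs) is 8 bytes and each struct kvm_msr_entry is 16,
 * so 8 + 255 * 16 = 4088 bytes, which fits in the 4096-byte buffer.) */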
  66
  67const KVMCapabilityInfo kvm_arch_required_capabilities[] = {
  68    KVM_CAP_INFO(SET_TSS_ADDR),
  69    KVM_CAP_INFO(EXT_CPUID),
  70    KVM_CAP_INFO(MP_STATE),
  71    KVM_CAP_LAST_INFO
  72};
  73
  74static bool has_msr_star;
  75static bool has_msr_hsave_pa;
  76static bool has_msr_tsc_aux;
  77static bool has_msr_tsc_adjust;
  78static bool has_msr_tsc_deadline;
  79static bool has_msr_feature_control;
  80static bool has_msr_misc_enable;
  81static bool has_msr_smbase;
  82static bool has_msr_bndcfgs;
  83static int lm_capable_kernel;
  84static bool has_msr_hv_hypercall;
  85static bool has_msr_hv_crash;
  86static bool has_msr_hv_reset;
  87static bool has_msr_hv_vpindex;
  88static bool hv_vpindex_settable;
  89static bool has_msr_hv_runtime;
  90static bool has_msr_hv_synic;
  91static bool has_msr_hv_stimer;
  92static bool has_msr_hv_frequencies;
  93static bool has_msr_hv_reenlightenment;
  94static bool has_msr_xss;
  95static bool has_msr_spec_ctrl;
  96static bool has_msr_virt_ssbd;
  97static bool has_msr_smi_count;
  98
  99static uint32_t has_architectural_pmu_version;
 100static uint32_t num_architectural_pmu_gp_counters;
 101static uint32_t num_architectural_pmu_fixed_counters;
 102
 103static int has_xsave;
 104static int has_xcrs;
 105static int has_pit_state2;
 106
 107static bool has_msr_mcg_ext_ctl;
 108
 109static struct kvm_cpuid2 *cpuid_cache;
 110
 111int kvm_has_pit_state2(void)
 112{
 113    return has_pit_state2;
 114}
 115
 116bool kvm_has_smm(void)
 117{
 118    return kvm_check_extension(kvm_state, KVM_CAP_X86_SMM);
 119}
 120
 121bool kvm_has_adjust_clock_stable(void)
 122{
 123    int ret = kvm_check_extension(kvm_state, KVM_CAP_ADJUST_CLOCK);
 124
 125    return (ret == KVM_CLOCK_TSC_STABLE);
 126}
 127
 128bool kvm_allows_irq0_override(void)
 129{
 130    return !kvm_irqchip_in_kernel() || kvm_has_gsi_routing();
 131}
 132
 133static bool kvm_x2apic_api_set_flags(uint64_t flags)
 134{
 135    KVMState *s = KVM_STATE(current_machine->accelerator);
 136
 137    return !kvm_vm_enable_cap(s, KVM_CAP_X2APIC_API, 0, flags);
 138}
 139
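/* Evaluate 'fn' only on the first call of the enclosing function and cache
 * the result in the global '_result'; later calls return the cached value
 * directly from the enclosing function without re-evaluating 'fn'. */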
 140#define MEMORIZE(fn, _result) \
 141    ({ \
 142        static bool _memorized; \
 143        \
 144        if (_memorized) { \
 145            return _result; \
 146        } \
 147        _memorized = true; \
 148        _result = fn; \
 149    })
 150
 151static bool has_x2apic_api;
 152
 153bool kvm_has_x2apic_api(void)
 154{
 155    return has_x2apic_api;
 156}
 157
 158bool kvm_enable_x2apic(void)
 159{
 160    return MEMORIZE(
 161             kvm_x2apic_api_set_flags(KVM_X2APIC_API_USE_32BIT_IDS |
 162                                      KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK),
 163             has_x2apic_api);
 164}
 165
 166bool kvm_hv_vpindex_settable(void)
 167{
 168    return hv_vpindex_settable;
 169}
 170
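/* Read the vCPU's current TSC (MSR_IA32_TSC) from KVM into env->tsc.  While
 * the VM is not running, env->tsc_valid caches the value so repeated calls
 * do not re-issue the KVM_GET_MSRS ioctl. */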
 171static int kvm_get_tsc(CPUState *cs)
 172{
 173    X86CPU *cpu = X86_CPU(cs);
 174    CPUX86State *env = &cpu->env;
 175    struct {
 176        struct kvm_msrs info;
 177        struct kvm_msr_entry entries[1];
 178    } msr_data;
 179    int ret;
 180
 181    if (env->tsc_valid) {
 182        return 0;
 183    }
 184
 185    msr_data.info.nmsrs = 1;
 186    msr_data.entries[0].index = MSR_IA32_TSC;
 187    env->tsc_valid = !runstate_is_running();
 188
 189    ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_MSRS, &msr_data);
 190    if (ret < 0) {
 191        return ret;
 192    }
 193
 194    assert(ret == 1);
 195    env->tsc = msr_data.entries[0].data;
 196    return 0;
 197}
 198
 199static inline void do_kvm_synchronize_tsc(CPUState *cpu, run_on_cpu_data arg)
 200{
 201    kvm_get_tsc(cpu);
 202}
 203
 204void kvm_synchronize_all_tsc(void)
 205{
 206    CPUState *cpu;
 207
 208    if (kvm_enabled()) {
 209        CPU_FOREACH(cpu) {
 210            run_on_cpu(cpu, do_kvm_synchronize_tsc, RUN_ON_CPU_NULL);
 211        }
 212    }
 213}
 214
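/* Issue KVM_GET_SUPPORTED_CPUID with room for 'max' entries.  Returns NULL
 * if the kernel wants a bigger buffer (E2BIG) so the caller can retry with
 * a larger size; any other error is fatal. */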
 215static struct kvm_cpuid2 *try_get_cpuid(KVMState *s, int max)
 216{
 217    struct kvm_cpuid2 *cpuid;
 218    int r, size;
 219
 220    size = sizeof(*cpuid) + max * sizeof(*cpuid->entries);
 221    cpuid = g_malloc0(size);
 222    cpuid->nent = max;
 223    r = kvm_ioctl(s, KVM_GET_SUPPORTED_CPUID, cpuid);
 224    if (r == 0 && cpuid->nent >= max) {
 225        r = -E2BIG;
 226    }
 227    if (r < 0) {
 228        if (r == -E2BIG) {
 229            g_free(cpuid);
 230            return NULL;
 231        } else {
 232            fprintf(stderr, "KVM_GET_SUPPORTED_CPUID failed: %s\n",
 233                    strerror(-r));
 234            exit(1);
 235        }
 236    }
 237    return cpuid;
 238}
 239
 240/* Run KVM_GET_SUPPORTED_CPUID ioctl(), allocating a buffer large enough
 241 * for all entries.
 242 */
 243static struct kvm_cpuid2 *get_supported_cpuid(KVMState *s)
 244{
 245    struct kvm_cpuid2 *cpuid;
 246    int max = 1;
 247
 248    if (cpuid_cache != NULL) {
 249        return cpuid_cache;
 250    }
 251    while ((cpuid = try_get_cpuid(s, max)) == NULL) {
 252        max *= 2;
 253    }
 254    cpuid_cache = cpuid;
 255    return cpuid;
 256}
 257
 258static const struct kvm_para_features {
 259    int cap;
 260    int feature;
 261} para_features[] = {
 262    { KVM_CAP_CLOCKSOURCE, KVM_FEATURE_CLOCKSOURCE },
 263    { KVM_CAP_NOP_IO_DELAY, KVM_FEATURE_NOP_IO_DELAY },
 264    { KVM_CAP_PV_MMU, KVM_FEATURE_MMU_OP },
 265    { KVM_CAP_ASYNC_PF, KVM_FEATURE_ASYNC_PF },
 266};
 267
 268static int get_para_features(KVMState *s)
 269{
 270    int i, features = 0;
 271
 272    for (i = 0; i < ARRAY_SIZE(para_features); i++) {
 273        if (kvm_check_extension(s, para_features[i].cap)) {
 274            features |= (1 << para_features[i].feature);
 275        }
 276    }
 277
 278    return features;
 279}
 280
 281static bool host_tsx_blacklisted(void)
 282{
  283    int family, model, stepping;
 284    char vendor[CPUID_VENDOR_SZ + 1];
 285
 286    host_vendor_fms(vendor, &family, &model, &stepping);
 287
 288    /* Check if we are running on a Haswell host known to have broken TSX */
 289    return !strcmp(vendor, CPUID_VENDOR_INTEL) &&
 290           (family == 6) &&
 291           ((model == 63 && stepping < 4) ||
 292            model == 60 || model == 69 || model == 70);
 293}
 294
 295/* Returns the value for a specific register on the cpuid entry
 296 */
 297static uint32_t cpuid_entry_get_reg(struct kvm_cpuid_entry2 *entry, int reg)
 298{
 299    uint32_t ret = 0;
 300    switch (reg) {
 301    case R_EAX:
 302        ret = entry->eax;
 303        break;
 304    case R_EBX:
 305        ret = entry->ebx;
 306        break;
 307    case R_ECX:
 308        ret = entry->ecx;
 309        break;
 310    case R_EDX:
 311        ret = entry->edx;
 312        break;
 313    }
 314    return ret;
 315}
 316
 317/* Find matching entry for function/index on kvm_cpuid2 struct
 318 */
 319static struct kvm_cpuid_entry2 *cpuid_find_entry(struct kvm_cpuid2 *cpuid,
 320                                                 uint32_t function,
 321                                                 uint32_t index)
 322{
 323    int i;
 324    for (i = 0; i < cpuid->nent; ++i) {
 325        if (cpuid->entries[i].function == function &&
 326            cpuid->entries[i].index == index) {
 327            return &cpuid->entries[i];
 328        }
 329    }
 330    /* not found: */
 331    return NULL;
 332}
 333
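/* Return the bits of the given CPUID register (function/index) that KVM can
 * virtualize, with the fixups applied below (features misreported by old
 * kernels, bits that depend on the in-kernel irqchip, etc.).  For
 * KVM_CPUID_FEATURES, kernels that do not report the leaf at all fall back
 * to probing individual capabilities. */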
 334uint32_t kvm_arch_get_supported_cpuid(KVMState *s, uint32_t function,
 335                                      uint32_t index, int reg)
 336{
 337    struct kvm_cpuid2 *cpuid;
 338    uint32_t ret = 0;
 339    uint32_t cpuid_1_edx;
 340    bool found = false;
 341
 342    cpuid = get_supported_cpuid(s);
 343
 344    struct kvm_cpuid_entry2 *entry = cpuid_find_entry(cpuid, function, index);
 345    if (entry) {
 346        found = true;
 347        ret = cpuid_entry_get_reg(entry, reg);
 348    }
 349
 350    /* Fixups for the data returned by KVM, below */
 351
 352    if (function == 1 && reg == R_EDX) {
 353        /* KVM before 2.6.30 misreports the following features */
 354        ret |= CPUID_MTRR | CPUID_PAT | CPUID_MCE | CPUID_MCA;
 355    } else if (function == 1 && reg == R_ECX) {
 356        /* We can set the hypervisor flag, even if KVM does not return it on
 357         * GET_SUPPORTED_CPUID
 358         */
 359        ret |= CPUID_EXT_HYPERVISOR;
 360        /* tsc-deadline flag is not returned by GET_SUPPORTED_CPUID, but it
 361         * can be enabled if the kernel has KVM_CAP_TSC_DEADLINE_TIMER,
 362         * and the irqchip is in the kernel.
 363         */
 364        if (kvm_irqchip_in_kernel() &&
 365                kvm_check_extension(s, KVM_CAP_TSC_DEADLINE_TIMER)) {
 366            ret |= CPUID_EXT_TSC_DEADLINE_TIMER;
 367        }
 368
 369        /* x2apic is reported by GET_SUPPORTED_CPUID, but it can't be enabled
 370         * without the in-kernel irqchip
 371         */
 372        if (!kvm_irqchip_in_kernel()) {
 373            ret &= ~CPUID_EXT_X2APIC;
 374        }
 375
 376        if (enable_cpu_pm) {
 377            int disable_exits = kvm_check_extension(s,
 378                                                    KVM_CAP_X86_DISABLE_EXITS);
 379
 380            if (disable_exits & KVM_X86_DISABLE_EXITS_MWAIT) {
 381                ret |= CPUID_EXT_MONITOR;
 382            }
 383        }
 384    } else if (function == 6 && reg == R_EAX) {
 385        ret |= CPUID_6_EAX_ARAT; /* safe to allow because of emulated APIC */
 386    } else if (function == 7 && index == 0 && reg == R_EBX) {
 387        if (host_tsx_blacklisted()) {
 388            ret &= ~(CPUID_7_0_EBX_RTM | CPUID_7_0_EBX_HLE);
 389        }
 390    } else if (function == 0x80000001 && reg == R_ECX) {
 391        /*
 392         * It's safe to enable TOPOEXT even if it's not returned by
 393         * GET_SUPPORTED_CPUID.  Unconditionally enabling TOPOEXT here allows
 394         * us to keep CPU models including TOPOEXT runnable on older kernels.
 395         */
 396        ret |= CPUID_EXT3_TOPOEXT;
 397    } else if (function == 0x80000001 && reg == R_EDX) {
 398        /* On Intel, kvm returns cpuid according to the Intel spec,
 399         * so add missing bits according to the AMD spec:
 400         */
 401        cpuid_1_edx = kvm_arch_get_supported_cpuid(s, 1, 0, R_EDX);
 402        ret |= cpuid_1_edx & CPUID_EXT2_AMD_ALIASES;
 403    } else if (function == KVM_CPUID_FEATURES && reg == R_EAX) {
 404        /* kvm_pv_unhalt is reported by GET_SUPPORTED_CPUID, but it can't
 405         * be enabled without the in-kernel irqchip
 406         */
 407        if (!kvm_irqchip_in_kernel()) {
 408            ret &= ~(1U << KVM_FEATURE_PV_UNHALT);
 409        }
 410    } else if (function == KVM_CPUID_FEATURES && reg == R_EDX) {
 411        ret |= 1U << KVM_HINTS_REALTIME;
  412        found = true;
 413    }
 414
 415    /* fallback for older kernels */
 416    if ((function == KVM_CPUID_FEATURES) && !found) {
 417        ret = get_para_features(s);
 418    }
 419
 420    return ret;
 421}
 422
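/* Guest RAM pages that the host reported as hardware-poisoned (via SIGBUS)
 * are tracked on this list so that kvm_unpoison_all() can remap them with
 * fresh pages when the VM is reset. */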
 423typedef struct HWPoisonPage {
 424    ram_addr_t ram_addr;
 425    QLIST_ENTRY(HWPoisonPage) list;
 426} HWPoisonPage;
 427
 428static QLIST_HEAD(, HWPoisonPage) hwpoison_page_list =
 429    QLIST_HEAD_INITIALIZER(hwpoison_page_list);
 430
 431static void kvm_unpoison_all(void *param)
 432{
 433    HWPoisonPage *page, *next_page;
 434
 435    QLIST_FOREACH_SAFE(page, &hwpoison_page_list, list, next_page) {
 436        QLIST_REMOVE(page, list);
 437        qemu_ram_remap(page->ram_addr, TARGET_PAGE_SIZE);
 438        g_free(page);
 439    }
 440}
 441
 442static void kvm_hwpoison_page_add(ram_addr_t ram_addr)
 443{
 444    HWPoisonPage *page;
 445
 446    QLIST_FOREACH(page, &hwpoison_page_list, list) {
 447        if (page->ram_addr == ram_addr) {
 448            return;
 449        }
 450    }
 451    page = g_new(HWPoisonPage, 1);
 452    page->ram_addr = ram_addr;
 453    QLIST_INSERT_HEAD(&hwpoison_page_list, page, list);
 454}
 455
 456static int kvm_get_mce_cap_supported(KVMState *s, uint64_t *mce_cap,
 457                                     int *max_banks)
 458{
 459    int r;
 460
 461    r = kvm_check_extension(s, KVM_CAP_MCE);
 462    if (r > 0) {
 463        *max_banks = r;
 464        return kvm_ioctl(s, KVM_X86_GET_MCE_CAP_SUPPORTED, mce_cap);
 465    }
 466    return -ENOSYS;
 467}
 468
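/* Inject a machine check for the given guest physical address: BUS_MCEERR_AR
 * becomes an action-required MCE, BUS_MCEERR_AO an action-optional one.  The
 * MCE is broadcast to all vCPUs unless the guest has enabled LMCE, in which
 * case it is delivered to this vCPU only. */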
 469static void kvm_mce_inject(X86CPU *cpu, hwaddr paddr, int code)
 470{
 471    CPUState *cs = CPU(cpu);
 472    CPUX86State *env = &cpu->env;
 473    uint64_t status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN |
 474                      MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S;
 475    uint64_t mcg_status = MCG_STATUS_MCIP;
 476    int flags = 0;
 477
 478    if (code == BUS_MCEERR_AR) {
 479        status |= MCI_STATUS_AR | 0x134;
 480        mcg_status |= MCG_STATUS_EIPV;
 481    } else {
 482        status |= 0xc0;
 483        mcg_status |= MCG_STATUS_RIPV;
 484    }
 485
 486    flags = cpu_x86_support_mca_broadcast(env) ? MCE_INJECT_BROADCAST : 0;
  487    /* We need to read the value of MSR_EXT_MCG_CTL that was set by the
  488     * guest kernel back into env->mcg_ext_ctl.
  489     */
 490    cpu_synchronize_state(cs);
 491    if (env->mcg_ext_ctl & MCG_EXT_CTL_LMCE_EN) {
 492        mcg_status |= MCG_STATUS_LMCE;
 493        flags = 0;
 494    }
 495
 496    cpu_x86_inject_mce(NULL, cpu, 9, status, mcg_status, paddr,
 497                       (MCM_ADDR_PHYS << 6) | 0xc, flags);
 498}
 499
 500static void hardware_memory_error(void)
 501{
 502    fprintf(stderr, "Hardware memory error!\n");
 503    exit(1);
 504}
 505
 506void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr)
 507{
 508    X86CPU *cpu = X86_CPU(c);
 509    CPUX86State *env = &cpu->env;
 510    ram_addr_t ram_addr;
 511    hwaddr paddr;
 512
 513    /* If we get an action required MCE, it has been injected by KVM
 514     * while the VM was running.  An action optional MCE instead should
 515     * be coming from the main thread, which qemu_init_sigbus identifies
 516     * as the "early kill" thread.
 517     */
 518    assert(code == BUS_MCEERR_AR || code == BUS_MCEERR_AO);
 519
 520    if ((env->mcg_cap & MCG_SER_P) && addr) {
 521        ram_addr = qemu_ram_addr_from_host(addr);
 522        if (ram_addr != RAM_ADDR_INVALID &&
 523            kvm_physical_memory_addr_from_host(c->kvm_state, addr, &paddr)) {
 524            kvm_hwpoison_page_add(ram_addr);
 525            kvm_mce_inject(cpu, paddr, code);
 526            return;
 527        }
 528
 529        fprintf(stderr, "Hardware memory error for memory used by "
 530                "QEMU itself instead of guest system!\n");
 531    }
 532
 533    if (code == BUS_MCEERR_AR) {
 534        hardware_memory_error();
 535    }
 536
 537    /* Hope we are lucky for AO MCE */
 538}
 539
 540static int kvm_inject_mce_oldstyle(X86CPU *cpu)
 541{
 542    CPUX86State *env = &cpu->env;
 543
 544    if (!kvm_has_vcpu_events() && env->exception_injected == EXCP12_MCHK) {
 545        unsigned int bank, bank_num = env->mcg_cap & 0xff;
 546        struct kvm_x86_mce mce;
 547
 548        env->exception_injected = -1;
 549
 550        /*
 551         * There must be at least one bank in use if an MCE is pending.
 552         * Find it and use its values for the event injection.
 553         */
 554        for (bank = 0; bank < bank_num; bank++) {
 555            if (env->mce_banks[bank * 4 + 1] & MCI_STATUS_VAL) {
 556                break;
 557            }
 558        }
 559        assert(bank < bank_num);
 560
 561        mce.bank = bank;
 562        mce.status = env->mce_banks[bank * 4 + 1];
 563        mce.mcg_status = env->mcg_status;
 564        mce.addr = env->mce_banks[bank * 4 + 2];
 565        mce.misc = env->mce_banks[bank * 4 + 3];
 566
 567        return kvm_vcpu_ioctl(CPU(cpu), KVM_X86_SET_MCE, &mce);
 568    }
 569    return 0;
 570}
 571
 572static void cpu_update_state(void *opaque, int running, RunState state)
 573{
 574    CPUX86State *env = opaque;
 575
 576    if (running) {
 577        env->tsc_valid = false;
 578    }
 579}
 580
 581unsigned long kvm_arch_vcpu_id(CPUState *cs)
 582{
 583    X86CPU *cpu = X86_CPU(cs);
 584    return cpu->apic_id;
 585}
 586
 587#ifndef KVM_CPUID_SIGNATURE_NEXT
 588#define KVM_CPUID_SIGNATURE_NEXT                0x40000100
 589#endif
 590
 591static bool hyperv_hypercall_available(X86CPU *cpu)
 592{
 593    return cpu->hyperv_vapic ||
 594           (cpu->hyperv_spinlock_attempts != HYPERV_SPINLOCK_NEVER_RETRY);
 595}
 596
 597static bool hyperv_enabled(X86CPU *cpu)
 598{
 599    CPUState *cs = CPU(cpu);
 600    return kvm_check_extension(cs->kvm_state, KVM_CAP_HYPERV) > 0 &&
 601           (hyperv_hypercall_available(cpu) ||
 602            cpu->hyperv_time  ||
 603            cpu->hyperv_relaxed_timing ||
 604            cpu->hyperv_crash ||
 605            cpu->hyperv_reset ||
 606            cpu->hyperv_vpindex ||
 607            cpu->hyperv_runtime ||
 608            cpu->hyperv_synic ||
 609            cpu->hyperv_stimer ||
 610            cpu->hyperv_reenlightenment ||
 611            cpu->hyperv_tlbflush);
 612}
 613
 614static int kvm_arch_set_tsc_khz(CPUState *cs)
 615{
 616    X86CPU *cpu = X86_CPU(cs);
 617    CPUX86State *env = &cpu->env;
 618    int r;
 619
 620    if (!env->tsc_khz) {
 621        return 0;
 622    }
 623
 624    r = kvm_check_extension(cs->kvm_state, KVM_CAP_TSC_CONTROL) ?
 625        kvm_vcpu_ioctl(cs, KVM_SET_TSC_KHZ, env->tsc_khz) :
 626        -ENOTSUP;
 627    if (r < 0) {
 628        /* When KVM_SET_TSC_KHZ fails, it's an error only if the current
 629         * TSC frequency doesn't match the one we want.
 630         */
 631        int cur_freq = kvm_check_extension(cs->kvm_state, KVM_CAP_GET_TSC_KHZ) ?
 632                       kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ) :
 633                       -ENOTSUP;
 634        if (cur_freq <= 0 || cur_freq != env->tsc_khz) {
 635            warn_report("TSC frequency mismatch between "
 636                        "VM (%" PRId64 " kHz) and host (%d kHz), "
 637                        "and TSC scaling unavailable",
 638                        env->tsc_khz, cur_freq);
 639            return r;
 640        }
 641    }
 642
 643    return 0;
 644}
 645
 646static bool tsc_is_stable_and_known(CPUX86State *env)
 647{
 648    if (!env->tsc_khz) {
 649        return false;
 650    }
 651    return (env->features[FEAT_8000_0007_EDX] & CPUID_APM_INVTSC)
 652        || env->user_tsc_khz;
 653}
 654
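/* Translate the hv-* CPU properties into the Hyper-V CPUID feature words
 * (the FEAT_HYPERV_* feature words), returning -ENOSYS when the kernel
 * lacks the corresponding capability or MSRs. */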
 655static int hyperv_handle_properties(CPUState *cs)
 656{
 657    X86CPU *cpu = X86_CPU(cs);
 658    CPUX86State *env = &cpu->env;
 659
 660    if (cpu->hyperv_relaxed_timing) {
 661        env->features[FEAT_HYPERV_EAX] |= HV_HYPERCALL_AVAILABLE;
 662    }
 663    if (cpu->hyperv_vapic) {
 664        env->features[FEAT_HYPERV_EAX] |= HV_HYPERCALL_AVAILABLE;
 665        env->features[FEAT_HYPERV_EAX] |= HV_APIC_ACCESS_AVAILABLE;
 666    }
 667    if (cpu->hyperv_time) {
 668        if (kvm_check_extension(cs->kvm_state, KVM_CAP_HYPERV_TIME) <= 0) {
 669            fprintf(stderr, "Hyper-V clocksources "
 670                    "(requested by 'hv-time' cpu flag) "
 671                    "are not supported by kernel\n");
 672            return -ENOSYS;
 673        }
 674        env->features[FEAT_HYPERV_EAX] |= HV_HYPERCALL_AVAILABLE;
 675        env->features[FEAT_HYPERV_EAX] |= HV_TIME_REF_COUNT_AVAILABLE;
 676        env->features[FEAT_HYPERV_EAX] |= HV_REFERENCE_TSC_AVAILABLE;
 677    }
 678    if (cpu->hyperv_frequencies) {
 679        if (!has_msr_hv_frequencies) {
 680            fprintf(stderr, "Hyper-V frequency MSRs "
 681                    "(requested by 'hv-frequencies' cpu flag) "
 682                    "are not supported by kernel\n");
 683            return -ENOSYS;
 684        }
 685        env->features[FEAT_HYPERV_EAX] |= HV_ACCESS_FREQUENCY_MSRS;
 686        env->features[FEAT_HYPERV_EDX] |= HV_FREQUENCY_MSRS_AVAILABLE;
 687    }
 688    if (cpu->hyperv_crash) {
 689        if (!has_msr_hv_crash) {
 690            fprintf(stderr, "Hyper-V crash MSRs "
 691                    "(requested by 'hv-crash' cpu flag) "
 692                    "are not supported by kernel\n");
 693            return -ENOSYS;
 694        }
 695        env->features[FEAT_HYPERV_EDX] |= HV_GUEST_CRASH_MSR_AVAILABLE;
 696    }
 697    if (cpu->hyperv_reenlightenment) {
 698        if (!has_msr_hv_reenlightenment) {
 699            fprintf(stderr,
 700                    "Hyper-V Reenlightenment MSRs "
 701                    "(requested by 'hv-reenlightenment' cpu flag) "
 702                    "are not supported by kernel\n");
 703            return -ENOSYS;
 704        }
 705        env->features[FEAT_HYPERV_EAX] |= HV_ACCESS_REENLIGHTENMENTS_CONTROL;
 706    }
 707    env->features[FEAT_HYPERV_EDX] |= HV_CPU_DYNAMIC_PARTITIONING_AVAILABLE;
 708    if (cpu->hyperv_reset) {
 709        if (!has_msr_hv_reset) {
 710            fprintf(stderr, "Hyper-V reset MSR "
 711                    "(requested by 'hv-reset' cpu flag) "
 712                    "is not supported by kernel\n");
 713            return -ENOSYS;
 714        }
 715        env->features[FEAT_HYPERV_EAX] |= HV_RESET_AVAILABLE;
 716    }
 717    if (cpu->hyperv_vpindex) {
 718        if (!has_msr_hv_vpindex) {
 719            fprintf(stderr, "Hyper-V VP_INDEX MSR "
 720                    "(requested by 'hv-vpindex' cpu flag) "
 721                    "is not supported by kernel\n");
 722            return -ENOSYS;
 723        }
 724        env->features[FEAT_HYPERV_EAX] |= HV_VP_INDEX_AVAILABLE;
 725    }
 726    if (cpu->hyperv_runtime) {
 727        if (!has_msr_hv_runtime) {
 728            fprintf(stderr, "Hyper-V VP_RUNTIME MSR "
 729                    "(requested by 'hv-runtime' cpu flag) "
 730                    "is not supported by kernel\n");
 731            return -ENOSYS;
 732        }
 733        env->features[FEAT_HYPERV_EAX] |= HV_VP_RUNTIME_AVAILABLE;
 734    }
 735    if (cpu->hyperv_synic) {
 736        if (!has_msr_hv_synic ||
 737            kvm_vcpu_enable_cap(cs, KVM_CAP_HYPERV_SYNIC, 0)) {
 738            fprintf(stderr, "Hyper-V SynIC is not supported by kernel\n");
 739            return -ENOSYS;
 740        }
 741
 742        env->features[FEAT_HYPERV_EAX] |= HV_SYNIC_AVAILABLE;
 743    }
 744    if (cpu->hyperv_stimer) {
 745        if (!has_msr_hv_stimer) {
 746            fprintf(stderr, "Hyper-V timers aren't supported by kernel\n");
 747            return -ENOSYS;
 748        }
 749        env->features[FEAT_HYPERV_EAX] |= HV_SYNTIMERS_AVAILABLE;
 750    }
 751    return 0;
 752}
 753
 754static int hyperv_init_vcpu(X86CPU *cpu)
 755{
 756    if (cpu->hyperv_vpindex && !hv_vpindex_settable) {
 757        /*
 758         * the kernel doesn't support setting vp_index; assert that its value
 759         * is in sync
 760         */
 761        int ret;
 762        struct {
 763            struct kvm_msrs info;
 764            struct kvm_msr_entry entries[1];
 765        } msr_data = {
 766            .info.nmsrs = 1,
 767            .entries[0].index = HV_X64_MSR_VP_INDEX,
 768        };
 769
 770        ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_MSRS, &msr_data);
 771        if (ret < 0) {
 772            return ret;
 773        }
 774        assert(ret == 1);
 775
 776        if (msr_data.entries[0].data != hyperv_vp_index(cpu)) {
 777            error_report("kernel's vp_index != QEMU's vp_index");
 778            return -ENXIO;
 779        }
 780    }
 781
 782    return 0;
 783}
 784
 785static Error *invtsc_mig_blocker;
 786
 787#define KVM_MAX_CPUID_ENTRIES  100
 788
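/* Build the CPUID table passed to KVM_SET_CPUID2: the Hyper-V and KVM
 * paravirtual leaves first (when enabled), then the basic, extended and
 * Centaur leaves taken from the CPU model.  Also sets up the TSC frequency,
 * MCE capabilities and the per-vCPU MSR/xsave buffers. */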
 789int kvm_arch_init_vcpu(CPUState *cs)
 790{
 791    struct {
 792        struct kvm_cpuid2 cpuid;
 793        struct kvm_cpuid_entry2 entries[KVM_MAX_CPUID_ENTRIES];
 794    } QEMU_PACKED cpuid_data;
 795    X86CPU *cpu = X86_CPU(cs);
 796    CPUX86State *env = &cpu->env;
 797    uint32_t limit, i, j, cpuid_i;
 798    uint32_t unused;
 799    struct kvm_cpuid_entry2 *c;
 800    uint32_t signature[3];
 801    int kvm_base = KVM_CPUID_SIGNATURE;
 802    int r;
 803    Error *local_err = NULL;
 804
 805    memset(&cpuid_data, 0, sizeof(cpuid_data));
 806
 807    cpuid_i = 0;
 808
 809    r = kvm_arch_set_tsc_khz(cs);
 810    if (r < 0) {
 811        goto fail;
 812    }
 813
  814    /* The vCPU's TSC frequency is either specified by the user or, if
  815     * absent, taken from the value KVM is already using. In the latter
  816     * case we query it from KVM and record it in env->tsc_khz, so that
  817     * the vCPU's TSC frequency can be migrated later via this field.
  818     */
 819    if (!env->tsc_khz) {
 820        r = kvm_check_extension(cs->kvm_state, KVM_CAP_GET_TSC_KHZ) ?
 821            kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ) :
 822            -ENOTSUP;
 823        if (r > 0) {
 824            env->tsc_khz = r;
 825        }
 826    }
 827
 828    /* Paravirtualization CPUIDs */
 829    if (hyperv_enabled(cpu)) {
 830        c = &cpuid_data.entries[cpuid_i++];
 831        c->function = HV_CPUID_VENDOR_AND_MAX_FUNCTIONS;
 832        if (!cpu->hyperv_vendor_id) {
 833            memcpy(signature, "Microsoft Hv", 12);
 834        } else {
 835            size_t len = strlen(cpu->hyperv_vendor_id);
 836
 837            if (len > 12) {
 838                error_report("hv-vendor-id truncated to 12 characters");
 839                len = 12;
 840            }
 841            memset(signature, 0, 12);
 842            memcpy(signature, cpu->hyperv_vendor_id, len);
 843        }
 844        c->eax = HV_CPUID_MIN;
 845        c->ebx = signature[0];
 846        c->ecx = signature[1];
 847        c->edx = signature[2];
 848
 849        c = &cpuid_data.entries[cpuid_i++];
 850        c->function = HV_CPUID_INTERFACE;
 851        memcpy(signature, "Hv#1\0\0\0\0\0\0\0\0", 12);
 852        c->eax = signature[0];
 853        c->ebx = 0;
 854        c->ecx = 0;
 855        c->edx = 0;
 856
 857        c = &cpuid_data.entries[cpuid_i++];
 858        c->function = HV_CPUID_VERSION;
 859        c->eax = 0x00001bbc;
 860        c->ebx = 0x00060001;
 861
 862        c = &cpuid_data.entries[cpuid_i++];
 863        c->function = HV_CPUID_FEATURES;
 864        r = hyperv_handle_properties(cs);
 865        if (r) {
 866            return r;
 867        }
 868        c->eax = env->features[FEAT_HYPERV_EAX];
 869        c->ebx = env->features[FEAT_HYPERV_EBX];
 870        c->edx = env->features[FEAT_HYPERV_EDX];
 871
 872        c = &cpuid_data.entries[cpuid_i++];
 873        c->function = HV_CPUID_ENLIGHTMENT_INFO;
 874        if (cpu->hyperv_relaxed_timing) {
 875            c->eax |= HV_RELAXED_TIMING_RECOMMENDED;
 876        }
 877        if (cpu->hyperv_vapic) {
 878            c->eax |= HV_APIC_ACCESS_RECOMMENDED;
 879        }
 880        if (cpu->hyperv_tlbflush) {
 881            if (kvm_check_extension(cs->kvm_state,
 882                                    KVM_CAP_HYPERV_TLBFLUSH) <= 0) {
 883                fprintf(stderr, "Hyper-V TLB flush support "
 884                        "(requested by 'hv-tlbflush' cpu flag) "
  885                        "is not supported by kernel\n");
 886                return -ENOSYS;
 887            }
 888            c->eax |= HV_REMOTE_TLB_FLUSH_RECOMMENDED;
 889            c->eax |= HV_EX_PROCESSOR_MASKS_RECOMMENDED;
 890        }
 891
 892        c->ebx = cpu->hyperv_spinlock_attempts;
 893
 894        c = &cpuid_data.entries[cpuid_i++];
 895        c->function = HV_CPUID_IMPLEMENT_LIMITS;
 896
 897        c->eax = cpu->hv_max_vps;
 898        c->ebx = 0x40;
 899
 900        kvm_base = KVM_CPUID_SIGNATURE_NEXT;
 901        has_msr_hv_hypercall = true;
 902    }
 903
 904    if (cpu->expose_kvm) {
 905        memcpy(signature, "KVMKVMKVM\0\0\0", 12);
 906        c = &cpuid_data.entries[cpuid_i++];
 907        c->function = KVM_CPUID_SIGNATURE | kvm_base;
 908        c->eax = KVM_CPUID_FEATURES | kvm_base;
 909        c->ebx = signature[0];
 910        c->ecx = signature[1];
 911        c->edx = signature[2];
 912
 913        c = &cpuid_data.entries[cpuid_i++];
 914        c->function = KVM_CPUID_FEATURES | kvm_base;
 915        c->eax = env->features[FEAT_KVM];
 916        c->edx = env->features[FEAT_KVM_HINTS];
 917    }
 918
 919    cpu_x86_cpuid(env, 0, 0, &limit, &unused, &unused, &unused);
 920
 921    for (i = 0; i <= limit; i++) {
 922        if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
 923            fprintf(stderr, "unsupported level value: 0x%x\n", limit);
 924            abort();
 925        }
 926        c = &cpuid_data.entries[cpuid_i++];
 927
 928        switch (i) {
 929        case 2: {
 930            /* Keep reading function 2 till all the input is received */
 931            int times;
 932
 933            c->function = i;
 934            c->flags = KVM_CPUID_FLAG_STATEFUL_FUNC |
 935                       KVM_CPUID_FLAG_STATE_READ_NEXT;
 936            cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
 937            times = c->eax & 0xff;
 938
 939            for (j = 1; j < times; ++j) {
 940                if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
 941                    fprintf(stderr, "cpuid_data is full, no space for "
  942                            "cpuid(eax:2):eax & 0xff = 0x%x\n", times);
 943                    abort();
 944                }
 945                c = &cpuid_data.entries[cpuid_i++];
 946                c->function = i;
 947                c->flags = KVM_CPUID_FLAG_STATEFUL_FUNC;
 948                cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
 949            }
 950            break;
 951        }
 952        case 4:
 953        case 0xb:
 954        case 0xd:
 955            for (j = 0; ; j++) {
 956                if (i == 0xd && j == 64) {
 957                    break;
 958                }
 959                c->function = i;
 960                c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
 961                c->index = j;
 962                cpu_x86_cpuid(env, i, j, &c->eax, &c->ebx, &c->ecx, &c->edx);
 963
 964                if (i == 4 && c->eax == 0) {
 965                    break;
 966                }
 967                if (i == 0xb && !(c->ecx & 0xff00)) {
 968                    break;
 969                }
 970                if (i == 0xd && c->eax == 0) {
 971                    continue;
 972                }
 973                if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
 974                    fprintf(stderr, "cpuid_data is full, no space for "
 975                            "cpuid(eax:0x%x,ecx:0x%x)\n", i, j);
 976                    abort();
 977                }
 978                c = &cpuid_data.entries[cpuid_i++];
 979            }
 980            break;
 981        case 0x14: {
 982            uint32_t times;
 983
 984            c->function = i;
 985            c->index = 0;
 986            c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
 987            cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
 988            times = c->eax;
 989
 990            for (j = 1; j <= times; ++j) {
 991                if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
 992                    fprintf(stderr, "cpuid_data is full, no space for "
 993                                "cpuid(eax:0x14,ecx:0x%x)\n", j);
 994                    abort();
 995                }
 996                c = &cpuid_data.entries[cpuid_i++];
 997                c->function = i;
 998                c->index = j;
 999                c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1000                cpu_x86_cpuid(env, i, j, &c->eax, &c->ebx, &c->ecx, &c->edx);
1001            }
1002            break;
1003        }
1004        default:
1005            c->function = i;
1006            c->flags = 0;
1007            cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
1008            break;
1009        }
1010    }
1011
1012    if (limit >= 0x0a) {
1013        uint32_t eax, edx;
1014
1015        cpu_x86_cpuid(env, 0x0a, 0, &eax, &unused, &unused, &edx);
1016
1017        has_architectural_pmu_version = eax & 0xff;
1018        if (has_architectural_pmu_version > 0) {
1019            num_architectural_pmu_gp_counters = (eax & 0xff00) >> 8;
1020
1021            /* Shouldn't be more than 32, since that's the number of bits
1022             * available in EBX to tell us _which_ counters are available.
1023             * Play it safe.
1024             */
1025            if (num_architectural_pmu_gp_counters > MAX_GP_COUNTERS) {
1026                num_architectural_pmu_gp_counters = MAX_GP_COUNTERS;
1027            }
1028
1029            if (has_architectural_pmu_version > 1) {
1030                num_architectural_pmu_fixed_counters = edx & 0x1f;
1031
1032                if (num_architectural_pmu_fixed_counters > MAX_FIXED_COUNTERS) {
1033                    num_architectural_pmu_fixed_counters = MAX_FIXED_COUNTERS;
1034                }
1035            }
1036        }
1037    }
1038
1039    cpu_x86_cpuid(env, 0x80000000, 0, &limit, &unused, &unused, &unused);
1040
1041    for (i = 0x80000000; i <= limit; i++) {
1042        if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
1043            fprintf(stderr, "unsupported xlevel value: 0x%x\n", limit);
1044            abort();
1045        }
1046        c = &cpuid_data.entries[cpuid_i++];
1047
1048        switch (i) {
1049        case 0x8000001d:
1050            /* Query for all AMD cache information leaves */
1051            for (j = 0; ; j++) {
1052                c->function = i;
1053                c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1054                c->index = j;
1055                cpu_x86_cpuid(env, i, j, &c->eax, &c->ebx, &c->ecx, &c->edx);
1056
1057                if (c->eax == 0) {
1058                    break;
1059                }
1060                if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
1061                    fprintf(stderr, "cpuid_data is full, no space for "
1062                            "cpuid(eax:0x%x,ecx:0x%x)\n", i, j);
1063                    abort();
1064                }
1065                c = &cpuid_data.entries[cpuid_i++];
1066            }
1067            break;
1068        default:
1069            c->function = i;
1070            c->flags = 0;
1071            cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
1072            break;
1073        }
1074    }
1075
 1076    /* Call Centaur's CPUID instructions if they are supported. */
1077    if (env->cpuid_xlevel2 > 0) {
1078        cpu_x86_cpuid(env, 0xC0000000, 0, &limit, &unused, &unused, &unused);
1079
1080        for (i = 0xC0000000; i <= limit; i++) {
1081            if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
1082                fprintf(stderr, "unsupported xlevel2 value: 0x%x\n", limit);
1083                abort();
1084            }
1085            c = &cpuid_data.entries[cpuid_i++];
1086
1087            c->function = i;
1088            c->flags = 0;
1089            cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
1090        }
1091    }
1092
1093    cpuid_data.cpuid.nent = cpuid_i;
1094
1095    if (((env->cpuid_version >> 8)&0xF) >= 6
1096        && (env->features[FEAT_1_EDX] & (CPUID_MCE | CPUID_MCA)) ==
1097           (CPUID_MCE | CPUID_MCA)
1098        && kvm_check_extension(cs->kvm_state, KVM_CAP_MCE) > 0) {
1099        uint64_t mcg_cap, unsupported_caps;
1100        int banks;
1101        int ret;
1102
1103        ret = kvm_get_mce_cap_supported(cs->kvm_state, &mcg_cap, &banks);
1104        if (ret < 0) {
1105            fprintf(stderr, "kvm_get_mce_cap_supported: %s", strerror(-ret));
1106            return ret;
1107        }
1108
1109        if (banks < (env->mcg_cap & MCG_CAP_BANKS_MASK)) {
1110            error_report("kvm: Unsupported MCE bank count (QEMU = %d, KVM = %d)",
1111                         (int)(env->mcg_cap & MCG_CAP_BANKS_MASK), banks);
1112            return -ENOTSUP;
1113        }
1114
1115        unsupported_caps = env->mcg_cap & ~(mcg_cap | MCG_CAP_BANKS_MASK);
1116        if (unsupported_caps) {
1117            if (unsupported_caps & MCG_LMCE_P) {
1118                error_report("kvm: LMCE not supported");
1119                return -ENOTSUP;
1120            }
1121            warn_report("Unsupported MCG_CAP bits: 0x%" PRIx64,
1122                        unsupported_caps);
1123        }
1124
1125        env->mcg_cap &= mcg_cap | MCG_CAP_BANKS_MASK;
1126        ret = kvm_vcpu_ioctl(cs, KVM_X86_SETUP_MCE, &env->mcg_cap);
1127        if (ret < 0) {
1128            fprintf(stderr, "KVM_X86_SETUP_MCE: %s", strerror(-ret));
1129            return ret;
1130        }
1131    }
1132
1133    qemu_add_vm_change_state_handler(cpu_update_state, env);
1134
1135    c = cpuid_find_entry(&cpuid_data.cpuid, 1, 0);
1136    if (c) {
1137        has_msr_feature_control = !!(c->ecx & CPUID_EXT_VMX) ||
1138                                  !!(c->ecx & CPUID_EXT_SMX);
1139    }
1140
1141    if (env->mcg_cap & MCG_LMCE_P) {
1142        has_msr_mcg_ext_ctl = has_msr_feature_control = true;
1143    }
1144
1145    if (!env->user_tsc_khz) {
1146        if ((env->features[FEAT_8000_0007_EDX] & CPUID_APM_INVTSC) &&
1147            invtsc_mig_blocker == NULL) {
1148            /* for migration */
1149            error_setg(&invtsc_mig_blocker,
1150                       "State blocked by non-migratable CPU device"
1151                       " (invtsc flag)");
1152            r = migrate_add_blocker(invtsc_mig_blocker, &local_err);
1153            if (local_err) {
1154                error_report_err(local_err);
1155                error_free(invtsc_mig_blocker);
1156                goto fail;
1157            }
1158            /* for savevm */
1159            vmstate_x86_cpu.unmigratable = 1;
1160        }
1161    }
1162
1163    if (cpu->vmware_cpuid_freq
1164        /* Guests depend on 0x40000000 to detect this feature, so only expose
1165         * it if KVM exposes leaf 0x40000000. (Conflicts with Hyper-V) */
1166        && cpu->expose_kvm
1167        && kvm_base == KVM_CPUID_SIGNATURE
1168        /* TSC clock must be stable and known for this feature. */
1169        && tsc_is_stable_and_known(env)) {
1170
1171        c = &cpuid_data.entries[cpuid_i++];
1172        c->function = KVM_CPUID_SIGNATURE | 0x10;
1173        c->eax = env->tsc_khz;
1174        /* LAPIC resolution of 1ns (freq: 1GHz) is hardcoded in KVM's
1175         * APIC_BUS_CYCLE_NS */
1176        c->ebx = 1000000;
1177        c->ecx = c->edx = 0;
1178
1179        c = cpuid_find_entry(&cpuid_data.cpuid, kvm_base, 0);
1180        c->eax = MAX(c->eax, KVM_CPUID_SIGNATURE | 0x10);
1181    }
1182
1183    cpuid_data.cpuid.nent = cpuid_i;
1184
1185    cpuid_data.cpuid.padding = 0;
1186    r = kvm_vcpu_ioctl(cs, KVM_SET_CPUID2, &cpuid_data);
1187    if (r) {
1188        goto fail;
1189    }
1190
1191    if (has_xsave) {
1192        env->kvm_xsave_buf = qemu_memalign(4096, sizeof(struct kvm_xsave));
1193    }
1194    cpu->kvm_msr_buf = g_malloc0(MSR_BUF_SIZE);
1195
1196    if (!(env->features[FEAT_8000_0001_EDX] & CPUID_EXT2_RDTSCP)) {
1197        has_msr_tsc_aux = false;
1198    }
1199
1200    r = hyperv_init_vcpu(cpu);
1201    if (r) {
1202        goto fail;
1203    }
1204
1205    return 0;
1206
1207 fail:
1208    migrate_del_blocker(invtsc_mig_blocker);
1209    return r;
1210}
1211
1212void kvm_arch_reset_vcpu(X86CPU *cpu)
1213{
1214    CPUX86State *env = &cpu->env;
1215
1216    env->xcr0 = 1;
1217    if (kvm_irqchip_in_kernel()) {
1218        env->mp_state = cpu_is_bsp(cpu) ? KVM_MP_STATE_RUNNABLE :
1219                                          KVM_MP_STATE_UNINITIALIZED;
1220    } else {
1221        env->mp_state = KVM_MP_STATE_RUNNABLE;
1222    }
1223
1224    if (cpu->hyperv_synic) {
1225        int i;
1226        for (i = 0; i < ARRAY_SIZE(env->msr_hv_synic_sint); i++) {
1227            env->msr_hv_synic_sint[i] = HV_SINT_MASKED;
1228        }
1229    }
1230}
1231
1232void kvm_arch_do_init_vcpu(X86CPU *cpu)
1233{
1234    CPUX86State *env = &cpu->env;
1235
1236    /* APs get directly into wait-for-SIPI state.  */
1237    if (env->mp_state == KVM_MP_STATE_UNINITIALIZED) {
1238        env->mp_state = KVM_MP_STATE_INIT_RECEIVED;
1239    }
1240}
1241
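/* Query KVM_GET_MSR_INDEX_LIST once (guarded by kvm_supported_msrs) and
 * record in the has_msr_* flags which optional MSRs this kernel is able to
 * save and restore. */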
1242static int kvm_get_supported_msrs(KVMState *s)
1243{
1244    static int kvm_supported_msrs;
1245    int ret = 0;
1246
1247    /* first time */
1248    if (kvm_supported_msrs == 0) {
1249        struct kvm_msr_list msr_list, *kvm_msr_list;
1250
1251        kvm_supported_msrs = -1;
1252
1253        /* Obtain MSR list from KVM.  These are the MSRs that we must
1254         * save/restore */
1255        msr_list.nmsrs = 0;
1256        ret = kvm_ioctl(s, KVM_GET_MSR_INDEX_LIST, &msr_list);
1257        if (ret < 0 && ret != -E2BIG) {
1258            return ret;
1259        }
 1260        /* Old kernel modules had a bug and could write beyond the provided
 1261           memory. Allocate at least 1K to be safe. */
1262        kvm_msr_list = g_malloc0(MAX(1024, sizeof(msr_list) +
1263                                              msr_list.nmsrs *
1264                                              sizeof(msr_list.indices[0])));
1265
1266        kvm_msr_list->nmsrs = msr_list.nmsrs;
1267        ret = kvm_ioctl(s, KVM_GET_MSR_INDEX_LIST, kvm_msr_list);
1268        if (ret >= 0) {
1269            int i;
1270
1271            for (i = 0; i < kvm_msr_list->nmsrs; i++) {
1272                switch (kvm_msr_list->indices[i]) {
1273                case MSR_STAR:
1274                    has_msr_star = true;
1275                    break;
1276                case MSR_VM_HSAVE_PA:
1277                    has_msr_hsave_pa = true;
1278                    break;
1279                case MSR_TSC_AUX:
1280                    has_msr_tsc_aux = true;
1281                    break;
1282                case MSR_TSC_ADJUST:
1283                    has_msr_tsc_adjust = true;
1284                    break;
1285                case MSR_IA32_TSCDEADLINE:
1286                    has_msr_tsc_deadline = true;
1287                    break;
1288                case MSR_IA32_SMBASE:
1289                    has_msr_smbase = true;
1290                    break;
1291                case MSR_SMI_COUNT:
1292                    has_msr_smi_count = true;
1293                    break;
1294                case MSR_IA32_MISC_ENABLE:
1295                    has_msr_misc_enable = true;
1296                    break;
1297                case MSR_IA32_BNDCFGS:
1298                    has_msr_bndcfgs = true;
1299                    break;
1300                case MSR_IA32_XSS:
1301                    has_msr_xss = true;
1302                    break;
1303                case HV_X64_MSR_CRASH_CTL:
1304                    has_msr_hv_crash = true;
1305                    break;
1306                case HV_X64_MSR_RESET:
1307                    has_msr_hv_reset = true;
1308                    break;
1309                case HV_X64_MSR_VP_INDEX:
1310                    has_msr_hv_vpindex = true;
1311                    break;
1312                case HV_X64_MSR_VP_RUNTIME:
1313                    has_msr_hv_runtime = true;
1314                    break;
1315                case HV_X64_MSR_SCONTROL:
1316                    has_msr_hv_synic = true;
1317                    break;
1318                case HV_X64_MSR_STIMER0_CONFIG:
1319                    has_msr_hv_stimer = true;
1320                    break;
1321                case HV_X64_MSR_TSC_FREQUENCY:
1322                    has_msr_hv_frequencies = true;
1323                    break;
1324                case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
1325                    has_msr_hv_reenlightenment = true;
1326                    break;
1327                case MSR_IA32_SPEC_CTRL:
1328                    has_msr_spec_ctrl = true;
1329                    break;
1330                case MSR_VIRT_SSBD:
1331                    has_msr_virt_ssbd = true;
1332                    break;
1333                }
1334            }
1335        }
1336
1337        g_free(kvm_msr_list);
1338    }
1339
1340    return ret;
1341}
1342
1343static Notifier smram_machine_done;
1344static KVMMemoryListener smram_listener;
1345static AddressSpace smram_address_space;
1346static MemoryRegion smram_as_root;
1347static MemoryRegion smram_as_mem;
1348
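/* Build a separate address space for SMM in which SMRAM (if the machine
 * provides it) overlays normal system memory with higher priority, and
 * register it with KVM as address space 1 so SMM accesses are routed
 * through it. */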
1349static void register_smram_listener(Notifier *n, void *unused)
1350{
1351    MemoryRegion *smram =
1352        (MemoryRegion *) object_resolve_path("/machine/smram", NULL);
1353
1354    /* Outer container... */
1355    memory_region_init(&smram_as_root, OBJECT(kvm_state), "mem-container-smram", ~0ull);
1356    memory_region_set_enabled(&smram_as_root, true);
1357
1358    /* ... with two regions inside: normal system memory with low
1359     * priority, and...
1360     */
1361    memory_region_init_alias(&smram_as_mem, OBJECT(kvm_state), "mem-smram",
1362                             get_system_memory(), 0, ~0ull);
1363    memory_region_add_subregion_overlap(&smram_as_root, 0, &smram_as_mem, 0);
1364    memory_region_set_enabled(&smram_as_mem, true);
1365
1366    if (smram) {
1367        /* ... SMRAM with higher priority */
1368        memory_region_add_subregion_overlap(&smram_as_root, 0, smram, 10);
1369        memory_region_set_enabled(smram, true);
1370    }
1371
1372    address_space_init(&smram_address_space, &smram_as_root, "KVM-SMRAM");
1373    kvm_memory_listener_register(kvm_state, &smram_listener,
1374                                 &smram_address_space, 1);
1375}
1376
1377int kvm_arch_init(MachineState *ms, KVMState *s)
1378{
1379    uint64_t identity_base = 0xfffbc000;
1380    uint64_t shadow_mem;
1381    int ret;
1382    struct utsname utsname;
1383
1384#ifdef KVM_CAP_XSAVE
1385    has_xsave = kvm_check_extension(s, KVM_CAP_XSAVE);
1386#endif
1387
1388#ifdef KVM_CAP_XCRS
1389    has_xcrs = kvm_check_extension(s, KVM_CAP_XCRS);
1390#endif
1391
1392#ifdef KVM_CAP_PIT_STATE2
1393    has_pit_state2 = kvm_check_extension(s, KVM_CAP_PIT_STATE2);
1394#endif
1395
1396    hv_vpindex_settable = kvm_check_extension(s, KVM_CAP_HYPERV_VP_INDEX);
1397
1398    ret = kvm_get_supported_msrs(s);
1399    if (ret < 0) {
1400        return ret;
1401    }
1402
1403    uname(&utsname);
1404    lm_capable_kernel = strcmp(utsname.machine, "x86_64") == 0;
1405
1406    /*
1407     * On older Intel CPUs, KVM uses vm86 mode to emulate 16-bit code directly.
 1408     * In order to use vm86 mode, an EPT identity map and a TSS are needed.
1409     * Since these must be part of guest physical memory, we need to allocate
1410     * them, both by setting their start addresses in the kernel and by
1411     * creating a corresponding e820 entry. We need 4 pages before the BIOS.
1412     *
1413     * Older KVM versions may not support setting the identity map base. In
1414     * that case we need to stick with the default, i.e. a 256K maximum BIOS
1415     * size.
1416     */
1417    if (kvm_check_extension(s, KVM_CAP_SET_IDENTITY_MAP_ADDR)) {
1418        /* Allows up to 16M BIOSes. */
1419        identity_base = 0xfeffc000;
1420
1421        ret = kvm_vm_ioctl(s, KVM_SET_IDENTITY_MAP_ADDR, &identity_base);
1422        if (ret < 0) {
1423            return ret;
1424        }
1425    }
1426
1427    /* Set TSS base one page after EPT identity map. */
1428    ret = kvm_vm_ioctl(s, KVM_SET_TSS_ADDR, identity_base + 0x1000);
1429    if (ret < 0) {
1430        return ret;
1431    }
1432
1433    /* Tell fw_cfg to notify the BIOS to reserve the range. */
1434    ret = e820_add_entry(identity_base, 0x4000, E820_RESERVED);
1435    if (ret < 0) {
1436        fprintf(stderr, "e820_add_entry() table is full\n");
1437        return ret;
1438    }
1439    qemu_register_reset(kvm_unpoison_all, NULL);
1440
1441    shadow_mem = machine_kvm_shadow_mem(ms);
1442    if (shadow_mem != -1) {
1443        shadow_mem /= 4096;
1444        ret = kvm_vm_ioctl(s, KVM_SET_NR_MMU_PAGES, shadow_mem);
1445        if (ret < 0) {
1446            return ret;
1447        }
1448    }
1449
1450    if (kvm_check_extension(s, KVM_CAP_X86_SMM) &&
1451        object_dynamic_cast(OBJECT(ms), TYPE_PC_MACHINE) &&
1452        pc_machine_is_smm_enabled(PC_MACHINE(ms))) {
1453        smram_machine_done.notify = register_smram_listener;
1454        qemu_add_machine_init_done_notifier(&smram_machine_done);
1455    }
1456
1457    if (enable_cpu_pm) {
1458        int disable_exits = kvm_check_extension(s, KVM_CAP_X86_DISABLE_EXITS);
1459        int ret;
1460
 1461/* Workaround for a kernel header with a typo. TODO: fix header and drop. */
1462#if defined(KVM_X86_DISABLE_EXITS_HTL) && !defined(KVM_X86_DISABLE_EXITS_HLT)
1463#define KVM_X86_DISABLE_EXITS_HLT KVM_X86_DISABLE_EXITS_HTL
1464#endif
1465        if (disable_exits) {
1466            disable_exits &= (KVM_X86_DISABLE_EXITS_MWAIT |
1467                              KVM_X86_DISABLE_EXITS_HLT |
1468                              KVM_X86_DISABLE_EXITS_PAUSE);
1469        }
1470
1471        ret = kvm_vm_enable_cap(s, KVM_CAP_X86_DISABLE_EXITS, 0,
1472                                disable_exits);
1473        if (ret < 0) {
1474            error_report("kvm: guest stopping CPU not supported: %s",
1475                         strerror(-ret));
1476        }
1477    }
1478
1479    return 0;
1480}
1481
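/* Helpers translating between QEMU's SegmentCache (descriptor attributes
 * packed into 'flags') and KVM's struct kvm_segment.  set_v8086_seg() fills
 * in the fixed attributes that virtual-8086 mode segments always have. */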
1482static void set_v8086_seg(struct kvm_segment *lhs, const SegmentCache *rhs)
1483{
1484    lhs->selector = rhs->selector;
1485    lhs->base = rhs->base;
1486    lhs->limit = rhs->limit;
1487    lhs->type = 3;
1488    lhs->present = 1;
1489    lhs->dpl = 3;
1490    lhs->db = 0;
1491    lhs->s = 1;
1492    lhs->l = 0;
1493    lhs->g = 0;
1494    lhs->avl = 0;
1495    lhs->unusable = 0;
1496}
1497
1498static void set_seg(struct kvm_segment *lhs, const SegmentCache *rhs)
1499{
1500    unsigned flags = rhs->flags;
1501    lhs->selector = rhs->selector;
1502    lhs->base = rhs->base;
1503    lhs->limit = rhs->limit;
1504    lhs->type = (flags >> DESC_TYPE_SHIFT) & 15;
1505    lhs->present = (flags & DESC_P_MASK) != 0;
1506    lhs->dpl = (flags >> DESC_DPL_SHIFT) & 3;
1507    lhs->db = (flags >> DESC_B_SHIFT) & 1;
1508    lhs->s = (flags & DESC_S_MASK) != 0;
1509    lhs->l = (flags >> DESC_L_SHIFT) & 1;
1510    lhs->g = (flags & DESC_G_MASK) != 0;
1511    lhs->avl = (flags & DESC_AVL_MASK) != 0;
1512    lhs->unusable = !lhs->present;
1513    lhs->padding = 0;
1514}
1515
1516static void get_seg(SegmentCache *lhs, const struct kvm_segment *rhs)
1517{
1518    lhs->selector = rhs->selector;
1519    lhs->base = rhs->base;
1520    lhs->limit = rhs->limit;
1521    lhs->flags = (rhs->type << DESC_TYPE_SHIFT) |
1522                 ((rhs->present && !rhs->unusable) * DESC_P_MASK) |
1523                 (rhs->dpl << DESC_DPL_SHIFT) |
1524                 (rhs->db << DESC_B_SHIFT) |
1525                 (rhs->s * DESC_S_MASK) |
1526                 (rhs->l << DESC_L_SHIFT) |
1527                 (rhs->g * DESC_G_MASK) |
1528                 (rhs->avl * DESC_AVL_MASK);
1529}
1530
1531static void kvm_getput_reg(__u64 *kvm_reg, target_ulong *qemu_reg, int set)
1532{
1533    if (set) {
1534        *kvm_reg = *qemu_reg;
1535    } else {
1536        *qemu_reg = *kvm_reg;
1537    }
1538}
1539
1540static int kvm_getput_regs(X86CPU *cpu, int set)
1541{
1542    CPUX86State *env = &cpu->env;
1543    struct kvm_regs regs;
1544    int ret = 0;
1545
1546    if (!set) {
1547        ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_REGS, &regs);
1548        if (ret < 0) {
1549            return ret;
1550        }
1551    }
1552
1553    kvm_getput_reg(&regs.rax, &env->regs[R_EAX], set);
1554    kvm_getput_reg(&regs.rbx, &env->regs[R_EBX], set);
1555    kvm_getput_reg(&regs.rcx, &env->regs[R_ECX], set);
1556    kvm_getput_reg(&regs.rdx, &env->regs[R_EDX], set);
1557    kvm_getput_reg(&regs.rsi, &env->regs[R_ESI], set);
1558    kvm_getput_reg(&regs.rdi, &env->regs[R_EDI], set);
1559    kvm_getput_reg(&regs.rsp, &env->regs[R_ESP], set);
1560    kvm_getput_reg(&regs.rbp, &env->regs[R_EBP], set);
1561#ifdef TARGET_X86_64
1562    kvm_getput_reg(&regs.r8, &env->regs[8], set);
1563    kvm_getput_reg(&regs.r9, &env->regs[9], set);
1564    kvm_getput_reg(&regs.r10, &env->regs[10], set);
1565    kvm_getput_reg(&regs.r11, &env->regs[11], set);
1566    kvm_getput_reg(&regs.r12, &env->regs[12], set);
1567    kvm_getput_reg(&regs.r13, &env->regs[13], set);
1568    kvm_getput_reg(&regs.r14, &env->regs[14], set);
1569    kvm_getput_reg(&regs.r15, &env->regs[15], set);
1570#endif
1571
1572    kvm_getput_reg(&regs.rflags, &env->eflags, set);
1573    kvm_getput_reg(&regs.rip, &env->eip, set);
1574
1575    if (set) {
1576        ret = kvm_vcpu_ioctl(CPU(cpu), KVM_SET_REGS, &regs);
1577    }
1578
1579    return ret;
1580}
1581
1582static int kvm_put_fpu(X86CPU *cpu)
1583{
1584    CPUX86State *env = &cpu->env;
1585    struct kvm_fpu fpu;
1586    int i;
1587
1588    memset(&fpu, 0, sizeof fpu);
1589    fpu.fsw = env->fpus & ~(7 << 11);
1590    fpu.fsw |= (env->fpstt & 7) << 11;
1591    fpu.fcw = env->fpuc;
1592    fpu.last_opcode = env->fpop;
1593    fpu.last_ip = env->fpip;
1594    fpu.last_dp = env->fpdp;
1595    for (i = 0; i < 8; ++i) {
1596        fpu.ftwx |= (!env->fptags[i]) << i;
1597    }
1598    memcpy(fpu.fpr, env->fpregs, sizeof env->fpregs);
1599    for (i = 0; i < CPU_NB_REGS; i++) {
1600        stq_p(&fpu.xmm[i][0], env->xmm_regs[i].ZMM_Q(0));
1601        stq_p(&fpu.xmm[i][8], env->xmm_regs[i].ZMM_Q(1));
1602    }
1603    fpu.mxcsr = env->mxcsr;
1604
1605    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_FPU, &fpu);
1606}
1607
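/* Offsets of the XSAVE components, expressed in 4-byte 'region' words of
 * struct kvm_xsave.  The ASSERT_OFFSET checks below verify at build time
 * that they match the layout of QEMU's X86XSaveArea. */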
1608#define XSAVE_FCW_FSW     0
1609#define XSAVE_FTW_FOP     1
1610#define XSAVE_CWD_RIP     2
1611#define XSAVE_CWD_RDP     4
1612#define XSAVE_MXCSR       6
1613#define XSAVE_ST_SPACE    8
1614#define XSAVE_XMM_SPACE   40
1615#define XSAVE_XSTATE_BV   128
1616#define XSAVE_YMMH_SPACE  144
1617#define XSAVE_BNDREGS     240
1618#define XSAVE_BNDCSR      256
1619#define XSAVE_OPMASK      272
1620#define XSAVE_ZMM_Hi256   288
1621#define XSAVE_Hi16_ZMM    416
1622#define XSAVE_PKRU        672
1623
1624#define XSAVE_BYTE_OFFSET(word_offset) \
1625    ((word_offset) * sizeof_field(struct kvm_xsave, region[0]))
1626
1627#define ASSERT_OFFSET(word_offset, field) \
1628    QEMU_BUILD_BUG_ON(XSAVE_BYTE_OFFSET(word_offset) != \
1629                      offsetof(X86XSaveArea, field))
1630
1631ASSERT_OFFSET(XSAVE_FCW_FSW, legacy.fcw);
1632ASSERT_OFFSET(XSAVE_FTW_FOP, legacy.ftw);
1633ASSERT_OFFSET(XSAVE_CWD_RIP, legacy.fpip);
1634ASSERT_OFFSET(XSAVE_CWD_RDP, legacy.fpdp);
1635ASSERT_OFFSET(XSAVE_MXCSR, legacy.mxcsr);
1636ASSERT_OFFSET(XSAVE_ST_SPACE, legacy.fpregs);
1637ASSERT_OFFSET(XSAVE_XMM_SPACE, legacy.xmm_regs);
1638ASSERT_OFFSET(XSAVE_XSTATE_BV, header.xstate_bv);
1639ASSERT_OFFSET(XSAVE_YMMH_SPACE, avx_state);
1640ASSERT_OFFSET(XSAVE_BNDREGS, bndreg_state);
1641ASSERT_OFFSET(XSAVE_BNDCSR, bndcsr_state);
1642ASSERT_OFFSET(XSAVE_OPMASK, opmask_state);
1643ASSERT_OFFSET(XSAVE_ZMM_Hi256, zmm_hi256_state);
1644ASSERT_OFFSET(XSAVE_Hi16_ZMM, hi16_zmm_state);
1645ASSERT_OFFSET(XSAVE_PKRU, pkru_state);
1646
1647static int kvm_put_xsave(X86CPU *cpu)
1648{
1649    CPUX86State *env = &cpu->env;
1650    X86XSaveArea *xsave = env->kvm_xsave_buf;
1651
1652    if (!has_xsave) {
1653        return kvm_put_fpu(cpu);
1654    }
1655    x86_cpu_xsave_all_areas(cpu, xsave);
1656
1657    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_XSAVE, xsave);
1658}
1659
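    /* Write XCR0, the only extended control register currently handled. */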
1660static int kvm_put_xcrs(X86CPU *cpu)
1661{
1662    CPUX86State *env = &cpu->env;
1663    struct kvm_xcrs xcrs = {};
1664
1665    if (!has_xcrs) {
1666        return 0;
1667    }
1668
1669    xcrs.nr_xcrs = 1;
1670    xcrs.flags = 0;
1671    xcrs.xcrs[0].xcr = 0;
1672    xcrs.xcrs[0].value = env->xcr0;
1673    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_XCRS, &xcrs);
1674}
1675
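    /*
     * Write the segment registers, descriptor tables, control registers,
     * EFER and the pending-interrupt bitmap with KVM_SET_SREGS.  Segments
     * use the v8086 helpers while the guest is in virtual-8086 mode.
     */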
1676static int kvm_put_sregs(X86CPU *cpu)
1677{
1678    CPUX86State *env = &cpu->env;
1679    struct kvm_sregs sregs;
1680
1681    memset(sregs.interrupt_bitmap, 0, sizeof(sregs.interrupt_bitmap));
1682    if (env->interrupt_injected >= 0) {
1683        sregs.interrupt_bitmap[env->interrupt_injected / 64] |=
1684                (uint64_t)1 << (env->interrupt_injected % 64);
1685    }
1686
1687    if ((env->eflags & VM_MASK)) {
1688        set_v8086_seg(&sregs.cs, &env->segs[R_CS]);
1689        set_v8086_seg(&sregs.ds, &env->segs[R_DS]);
1690        set_v8086_seg(&sregs.es, &env->segs[R_ES]);
1691        set_v8086_seg(&sregs.fs, &env->segs[R_FS]);
1692        set_v8086_seg(&sregs.gs, &env->segs[R_GS]);
1693        set_v8086_seg(&sregs.ss, &env->segs[R_SS]);
1694    } else {
1695        set_seg(&sregs.cs, &env->segs[R_CS]);
1696        set_seg(&sregs.ds, &env->segs[R_DS]);
1697        set_seg(&sregs.es, &env->segs[R_ES]);
1698        set_seg(&sregs.fs, &env->segs[R_FS]);
1699        set_seg(&sregs.gs, &env->segs[R_GS]);
1700        set_seg(&sregs.ss, &env->segs[R_SS]);
1701    }
1702
1703    set_seg(&sregs.tr, &env->tr);
1704    set_seg(&sregs.ldt, &env->ldt);
1705
1706    sregs.idt.limit = env->idt.limit;
1707    sregs.idt.base = env->idt.base;
1708    memset(sregs.idt.padding, 0, sizeof sregs.idt.padding);
1709    sregs.gdt.limit = env->gdt.limit;
1710    sregs.gdt.base = env->gdt.base;
1711    memset(sregs.gdt.padding, 0, sizeof sregs.gdt.padding);
1712
1713    sregs.cr0 = env->cr[0];
1714    sregs.cr2 = env->cr[2];
1715    sregs.cr3 = env->cr[3];
1716    sregs.cr4 = env->cr[4];
1717
1718    sregs.cr8 = cpu_get_apic_tpr(cpu->apic_state);
1719    sregs.apic_base = cpu_get_apic_base(cpu->apic_state);
1720
1721    sregs.efer = env->efer;
1722
1723    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_SREGS, &sregs);
1724}
1725
1726static void kvm_msr_buf_reset(X86CPU *cpu)
1727{
1728    memset(cpu->kvm_msr_buf, 0, MSR_BUF_SIZE);
1729}
1730
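    /*
     * Append one MSR to the vCPU's kvm_msrs scratch buffer; the buffer is
     * flushed to the kernel later in a single KVM_SET_MSRS or KVM_GET_MSRS
     * call.  The assert guards against overflowing MSR_BUF_SIZE.
     */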
1731static void kvm_msr_entry_add(X86CPU *cpu, uint32_t index, uint64_t value)
1732{
1733    struct kvm_msrs *msrs = cpu->kvm_msr_buf;
1734    void *limit = ((void *)msrs) + MSR_BUF_SIZE;
1735    struct kvm_msr_entry *entry = &msrs->entries[msrs->nmsrs];
1736
1737    assert((void *)(entry + 1) <= limit);
1738
1739    entry->index = index;
1740    entry->reserved = 0;
1741    entry->data = value;
1742    msrs->nmsrs++;
1743}
1744
1745static int kvm_put_one_msr(X86CPU *cpu, int index, uint64_t value)
1746{
1747    kvm_msr_buf_reset(cpu);
1748    kvm_msr_entry_add(cpu, index, value);
1749
1750    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MSRS, cpu->kvm_msr_buf);
1751}
1752
1753void kvm_put_apicbase(X86CPU *cpu, uint64_t value)
1754{
1755    int ret;
1756
1757    ret = kvm_put_one_msr(cpu, MSR_IA32_APICBASE, value);
1758    assert(ret == 1);
1759}
1760
1761static int kvm_put_tscdeadline_msr(X86CPU *cpu)
1762{
1763    CPUX86State *env = &cpu->env;
1764    int ret;
1765
1766    if (!has_msr_tsc_deadline) {
1767        return 0;
1768    }
1769
1770    ret = kvm_put_one_msr(cpu, MSR_IA32_TSCDEADLINE, env->tsc_deadline);
1771    if (ret < 0) {
1772        return ret;
1773    }
1774
1775    assert(ret == 1);
1776    return 0;
1777}
1778
1779/*
1780 * Provide a separate write service for the feature control MSR in order to
1781 * kick the VCPU out of VMXON or even guest mode on reset. This has to be done
1782 * before writing any other state because forcibly leaving nested mode
1783 * invalidates the VCPU state.
1784 */
1785static int kvm_put_msr_feature_control(X86CPU *cpu)
1786{
1787    int ret;
1788
1789    if (!has_msr_feature_control) {
1790        return 0;
1791    }
1792
1793    ret = kvm_put_one_msr(cpu, MSR_IA32_FEATURE_CONTROL,
1794                          cpu->env.msr_ia32_feature_control);
1795    if (ret < 0) {
1796        return ret;
1797    }
1798
1799    assert(ret == 1);
1800    return 0;
1801}
1802
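    /*
     * Write the guest MSR state to KVM.  Cheap, side-effect free MSRs are
     * written on every state sync; heavyweight or partition-wide ones (TSC,
     * kvmclock, PMU, Hyper-V, MTRRs, Intel PT) only for
     * level >= KVM_PUT_RESET_STATE, see below.
     */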
1803static int kvm_put_msrs(X86CPU *cpu, int level)
1804{
1805    CPUX86State *env = &cpu->env;
1806    int i;
1807    int ret;
1808
1809    kvm_msr_buf_reset(cpu);
1810
1811    kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_CS, env->sysenter_cs);
1812    kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_ESP, env->sysenter_esp);
1813    kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_EIP, env->sysenter_eip);
1814    kvm_msr_entry_add(cpu, MSR_PAT, env->pat);
1815    if (has_msr_star) {
1816        kvm_msr_entry_add(cpu, MSR_STAR, env->star);
1817    }
1818    if (has_msr_hsave_pa) {
1819        kvm_msr_entry_add(cpu, MSR_VM_HSAVE_PA, env->vm_hsave);
1820    }
1821    if (has_msr_tsc_aux) {
1822        kvm_msr_entry_add(cpu, MSR_TSC_AUX, env->tsc_aux);
1823    }
1824    if (has_msr_tsc_adjust) {
1825        kvm_msr_entry_add(cpu, MSR_TSC_ADJUST, env->tsc_adjust);
1826    }
1827    if (has_msr_misc_enable) {
1828        kvm_msr_entry_add(cpu, MSR_IA32_MISC_ENABLE,
1829                          env->msr_ia32_misc_enable);
1830    }
1831    if (has_msr_smbase) {
1832        kvm_msr_entry_add(cpu, MSR_IA32_SMBASE, env->smbase);
1833    }
1834    if (has_msr_smi_count) {
1835        kvm_msr_entry_add(cpu, MSR_SMI_COUNT, env->msr_smi_count);
1836    }
1837    if (has_msr_bndcfgs) {
1838        kvm_msr_entry_add(cpu, MSR_IA32_BNDCFGS, env->msr_bndcfgs);
1839    }
1840    if (has_msr_xss) {
1841        kvm_msr_entry_add(cpu, MSR_IA32_XSS, env->xss);
1842    }
1843    if (has_msr_spec_ctrl) {
1844        kvm_msr_entry_add(cpu, MSR_IA32_SPEC_CTRL, env->spec_ctrl);
1845    }
1846    if (has_msr_virt_ssbd) {
1847        kvm_msr_entry_add(cpu, MSR_VIRT_SSBD, env->virt_ssbd);
1848    }
1849
1850#ifdef TARGET_X86_64
1851    if (lm_capable_kernel) {
1852        kvm_msr_entry_add(cpu, MSR_CSTAR, env->cstar);
1853        kvm_msr_entry_add(cpu, MSR_KERNELGSBASE, env->kernelgsbase);
1854        kvm_msr_entry_add(cpu, MSR_FMASK, env->fmask);
1855        kvm_msr_entry_add(cpu, MSR_LSTAR, env->lstar);
1856    }
1857#endif
1858
1859    /*
1860     * The following MSRs have side effects on the guest or are too heavy
1861     * for normal writeback. Limit them to reset or full state updates.
1862     */
1863    if (level >= KVM_PUT_RESET_STATE) {
1864        kvm_msr_entry_add(cpu, MSR_IA32_TSC, env->tsc);
1865        kvm_msr_entry_add(cpu, MSR_KVM_SYSTEM_TIME, env->system_time_msr);
1866        kvm_msr_entry_add(cpu, MSR_KVM_WALL_CLOCK, env->wall_clock_msr);
1867        if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_ASYNC_PF)) {
1868            kvm_msr_entry_add(cpu, MSR_KVM_ASYNC_PF_EN, env->async_pf_en_msr);
1869        }
1870        if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_PV_EOI)) {
1871            kvm_msr_entry_add(cpu, MSR_KVM_PV_EOI_EN, env->pv_eoi_en_msr);
1872        }
1873        if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_STEAL_TIME)) {
1874            kvm_msr_entry_add(cpu, MSR_KVM_STEAL_TIME, env->steal_time_msr);
1875        }
1876        if (has_architectural_pmu_version > 0) {
1877            if (has_architectural_pmu_version > 1) {
1878                /* Stop the counter.  */
1879                kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
1880                kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_CTRL, 0);
1881            }
1882
1883            /* Set the counter values.  */
1884            for (i = 0; i < num_architectural_pmu_fixed_counters; i++) {
1885                kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR0 + i,
1886                                  env->msr_fixed_counters[i]);
1887            }
1888            for (i = 0; i < num_architectural_pmu_gp_counters; i++) {
1889                kvm_msr_entry_add(cpu, MSR_P6_PERFCTR0 + i,
1890                                  env->msr_gp_counters[i]);
1891                kvm_msr_entry_add(cpu, MSR_P6_EVNTSEL0 + i,
1892                                  env->msr_gp_evtsel[i]);
1893            }
1894            if (has_architectural_pmu_version > 1) {
1895                kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_STATUS,
1896                                  env->msr_global_status);
1897                kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_OVF_CTRL,
1898                                  env->msr_global_ovf_ctrl);
1899
1900                /* Now start the PMU.  */
1901                kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR_CTRL,
1902                                  env->msr_fixed_ctr_ctrl);
1903                kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_CTRL,
1904                                  env->msr_global_ctrl);
1905            }
1906        }
1907        /*
1908         * Hyper-V partition-wide MSRs: to avoid clearing them on cpu hot-add,
1909         * only sync them to KVM on the first cpu
1910         */
1911        if (current_cpu == first_cpu) {
1912            if (has_msr_hv_hypercall) {
1913                kvm_msr_entry_add(cpu, HV_X64_MSR_GUEST_OS_ID,
1914                                  env->msr_hv_guest_os_id);
1915                kvm_msr_entry_add(cpu, HV_X64_MSR_HYPERCALL,
1916                                  env->msr_hv_hypercall);
1917            }
1918            if (cpu->hyperv_time) {
1919                kvm_msr_entry_add(cpu, HV_X64_MSR_REFERENCE_TSC,
1920                                  env->msr_hv_tsc);
1921            }
1922            if (cpu->hyperv_reenlightenment) {
1923                kvm_msr_entry_add(cpu, HV_X64_MSR_REENLIGHTENMENT_CONTROL,
1924                                  env->msr_hv_reenlightenment_control);
1925                kvm_msr_entry_add(cpu, HV_X64_MSR_TSC_EMULATION_CONTROL,
1926                                  env->msr_hv_tsc_emulation_control);
1927                kvm_msr_entry_add(cpu, HV_X64_MSR_TSC_EMULATION_STATUS,
1928                                  env->msr_hv_tsc_emulation_status);
1929            }
1930        }
1931        if (cpu->hyperv_vapic) {
1932            kvm_msr_entry_add(cpu, HV_X64_MSR_APIC_ASSIST_PAGE,
1933                              env->msr_hv_vapic);
1934        }
1935        if (has_msr_hv_crash) {
1936            int j;
1937
1938            for (j = 0; j < HV_CRASH_PARAMS; j++) {
1939                kvm_msr_entry_add(cpu, HV_X64_MSR_CRASH_P0 + j,
1940                                  env->msr_hv_crash_params[j]);
                }
1941
1942            kvm_msr_entry_add(cpu, HV_X64_MSR_CRASH_CTL, HV_CRASH_CTL_NOTIFY);
1943        }
1944        if (has_msr_hv_runtime) {
1945            kvm_msr_entry_add(cpu, HV_X64_MSR_VP_RUNTIME, env->msr_hv_runtime);
1946        }
1947        if (cpu->hyperv_vpindex && hv_vpindex_settable) {
1948            kvm_msr_entry_add(cpu, HV_X64_MSR_VP_INDEX, hyperv_vp_index(cpu));
1949        }
1950        if (cpu->hyperv_synic) {
1951            int j;
1952
1953            kvm_msr_entry_add(cpu, HV_X64_MSR_SVERSION, HV_SYNIC_VERSION);
1954
1955            kvm_msr_entry_add(cpu, HV_X64_MSR_SCONTROL,
1956                              env->msr_hv_synic_control);
1957            kvm_msr_entry_add(cpu, HV_X64_MSR_SIEFP,
1958                              env->msr_hv_synic_evt_page);
1959            kvm_msr_entry_add(cpu, HV_X64_MSR_SIMP,
1960                              env->msr_hv_synic_msg_page);
1961
1962            for (j = 0; j < ARRAY_SIZE(env->msr_hv_synic_sint); j++) {
1963                kvm_msr_entry_add(cpu, HV_X64_MSR_SINT0 + j,
1964                                  env->msr_hv_synic_sint[j]);
1965            }
1966        }
1967        if (has_msr_hv_stimer) {
1968            int j;
1969
1970            for (j = 0; j < ARRAY_SIZE(env->msr_hv_stimer_config); j++) {
1971                kvm_msr_entry_add(cpu, HV_X64_MSR_STIMER0_CONFIG + j * 2,
1972                                env->msr_hv_stimer_config[j]);
1973            }
1974
1975            for (j = 0; j < ARRAY_SIZE(env->msr_hv_stimer_count); j++) {
1976                kvm_msr_entry_add(cpu, HV_X64_MSR_STIMER0_COUNT + j * 2,
1977                                env->msr_hv_stimer_count[j]);
1978            }
1979        }
1980        if (env->features[FEAT_1_EDX] & CPUID_MTRR) {
1981            uint64_t phys_mask = MAKE_64BIT_MASK(0, cpu->phys_bits);
1982
1983            kvm_msr_entry_add(cpu, MSR_MTRRdefType, env->mtrr_deftype);
1984            kvm_msr_entry_add(cpu, MSR_MTRRfix64K_00000, env->mtrr_fixed[0]);
1985            kvm_msr_entry_add(cpu, MSR_MTRRfix16K_80000, env->mtrr_fixed[1]);
1986            kvm_msr_entry_add(cpu, MSR_MTRRfix16K_A0000, env->mtrr_fixed[2]);
1987            kvm_msr_entry_add(cpu, MSR_MTRRfix4K_C0000, env->mtrr_fixed[3]);
1988            kvm_msr_entry_add(cpu, MSR_MTRRfix4K_C8000, env->mtrr_fixed[4]);
1989            kvm_msr_entry_add(cpu, MSR_MTRRfix4K_D0000, env->mtrr_fixed[5]);
1990            kvm_msr_entry_add(cpu, MSR_MTRRfix4K_D8000, env->mtrr_fixed[6]);
1991            kvm_msr_entry_add(cpu, MSR_MTRRfix4K_E0000, env->mtrr_fixed[7]);
1992            kvm_msr_entry_add(cpu, MSR_MTRRfix4K_E8000, env->mtrr_fixed[8]);
1993            kvm_msr_entry_add(cpu, MSR_MTRRfix4K_F0000, env->mtrr_fixed[9]);
1994            kvm_msr_entry_add(cpu, MSR_MTRRfix4K_F8000, env->mtrr_fixed[10]);
1995            for (i = 0; i < MSR_MTRRcap_VCNT; i++) {
1996                /* The CPU raises #GP if we write to a mask bit above the
1997                 * physical limit of the host CPU (and KVM emulates that),
1998                 * so clamp the mask to phys_mask. */
1999                uint64_t mask = env->mtrr_var[i].mask;
2000                mask &= phys_mask;
2001
2002                kvm_msr_entry_add(cpu, MSR_MTRRphysBase(i),
2003                                  env->mtrr_var[i].base);
2004                kvm_msr_entry_add(cpu, MSR_MTRRphysMask(i), mask);
2005            }
2006        }
2007        if (env->features[FEAT_7_0_EBX] & CPUID_7_0_EBX_INTEL_PT) {
2008            int addr_num = kvm_arch_get_supported_cpuid(kvm_state,
2009                                                    0x14, 1, R_EAX) & 0x7;
2010
2011            kvm_msr_entry_add(cpu, MSR_IA32_RTIT_CTL,
2012                            env->msr_rtit_ctrl);
2013            kvm_msr_entry_add(cpu, MSR_IA32_RTIT_STATUS,
2014                            env->msr_rtit_status);
2015            kvm_msr_entry_add(cpu, MSR_IA32_RTIT_OUTPUT_BASE,
2016                            env->msr_rtit_output_base);
2017            kvm_msr_entry_add(cpu, MSR_IA32_RTIT_OUTPUT_MASK,
2018                            env->msr_rtit_output_mask);
2019            kvm_msr_entry_add(cpu, MSR_IA32_RTIT_CR3_MATCH,
2020                            env->msr_rtit_cr3_match);
2021            for (i = 0; i < addr_num; i++) {
2022                kvm_msr_entry_add(cpu, MSR_IA32_RTIT_ADDR0_A + i,
2023                            env->msr_rtit_addrs[i]);
2024            }
2025        }
2026
2027        /* Note: MSR_IA32_FEATURE_CONTROL is written separately, see
2028         *       kvm_put_msr_feature_control. */
2029    }
2030    if (env->mcg_cap) {
2031        int i;
2032
2033        kvm_msr_entry_add(cpu, MSR_MCG_STATUS, env->mcg_status);
2034        kvm_msr_entry_add(cpu, MSR_MCG_CTL, env->mcg_ctl);
2035        if (has_msr_mcg_ext_ctl) {
2036            kvm_msr_entry_add(cpu, MSR_MCG_EXT_CTL, env->mcg_ext_ctl);
2037        }
2038        for (i = 0; i < (env->mcg_cap & 0xff) * 4; i++) {
2039            kvm_msr_entry_add(cpu, MSR_MC0_CTL + i, env->mce_banks[i]);
2040        }
2041    }
2042
2043    ret = kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MSRS, cpu->kvm_msr_buf);
2044    if (ret < 0) {
2045        return ret;
2046    }
2047
2048    if (ret < cpu->kvm_msr_buf->nmsrs) {
2049        struct kvm_msr_entry *e = &cpu->kvm_msr_buf->entries[ret];
2050        error_report("error: failed to set MSR 0x%" PRIx32 " to 0x%" PRIx64,
2051                     (uint32_t)e->index, (uint64_t)e->data);
2052    }
2053
2054    assert(ret == cpu->kvm_msr_buf->nmsrs);
2055    return 0;
2056}
2057
2058
2059static int kvm_get_fpu(X86CPU *cpu)
2060{
2061    CPUX86State *env = &cpu->env;
2062    struct kvm_fpu fpu;
2063    int i, ret;
2064
2065    ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_FPU, &fpu);
2066    if (ret < 0) {
2067        return ret;
2068    }
2069
2070    env->fpstt = (fpu.fsw >> 11) & 7;
2071    env->fpus = fpu.fsw;
2072    env->fpuc = fpu.fcw;
2073    env->fpop = fpu.last_opcode;
2074    env->fpip = fpu.last_ip;
2075    env->fpdp = fpu.last_dp;
2076    for (i = 0; i < 8; ++i) {
2077        env->fptags[i] = !((fpu.ftwx >> i) & 1);
2078    }
2079    memcpy(env->fpregs, fpu.fpr, sizeof env->fpregs);
2080    for (i = 0; i < CPU_NB_REGS; i++) {
2081        env->xmm_regs[i].ZMM_Q(0) = ldq_p(&fpu.xmm[i][0]);
2082        env->xmm_regs[i].ZMM_Q(1) = ldq_p(&fpu.xmm[i][8]);
2083    }
2084    env->mxcsr = fpu.mxcsr;
2085
2086    return 0;
2087}
2088
2089static int kvm_get_xsave(X86CPU *cpu)
2090{
2091    CPUX86State *env = &cpu->env;
2092    X86XSaveArea *xsave = env->kvm_xsave_buf;
2093    int ret;
2094
2095    if (!has_xsave) {
2096        return kvm_get_fpu(cpu);
2097    }
2098
2099    ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_XSAVE, xsave);
2100    if (ret < 0) {
2101        return ret;
2102    }
2103    x86_cpu_xrstor_all_areas(cpu, xsave);
2104
2105    return 0;
2106}
2107
2108static int kvm_get_xcrs(X86CPU *cpu)
2109{
2110    CPUX86State *env = &cpu->env;
2111    int i, ret;
2112    struct kvm_xcrs xcrs;
2113
2114    if (!has_xcrs) {
2115        return 0;
2116    }
2117
2118    ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_XCRS, &xcrs);
2119    if (ret < 0) {
2120        return ret;
2121    }
2122
2123    for (i = 0; i < xcrs.nr_xcrs; i++) {
2124        /* Only XCR0 is supported for now */
2125        if (xcrs.xcrs[i].xcr == 0) {
2126            env->xcr0 = xcrs.xcrs[i].value;
2127            break;
2128        }
2129    }
2130    return 0;
2131}
2132
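    /*
     * Read the segment registers, descriptor tables, control registers, EFER
     * and the pending-interrupt number back from KVM and recompute the cached
     * hflags; the APIC base and CR8/TPR are picked up separately in
     * kvm_arch_post_run().
     */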
2133static int kvm_get_sregs(X86CPU *cpu)
2134{
2135    CPUX86State *env = &cpu->env;
2136    struct kvm_sregs sregs;
2137    int bit, i, ret;
2138
2139    ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_SREGS, &sregs);
2140    if (ret < 0) {
2141        return ret;
2142    }
2143
2144    /* There can only be one pending IRQ set in the bitmap at a time, so try
2145       to find it and save its number instead (-1 for none). */
2146    env->interrupt_injected = -1;
2147    for (i = 0; i < ARRAY_SIZE(sregs.interrupt_bitmap); i++) {
2148        if (sregs.interrupt_bitmap[i]) {
2149            bit = ctz64(sregs.interrupt_bitmap[i]);
2150            env->interrupt_injected = i * 64 + bit;
2151            break;
2152        }
2153    }
2154
2155    get_seg(&env->segs[R_CS], &sregs.cs);
2156    get_seg(&env->segs[R_DS], &sregs.ds);
2157    get_seg(&env->segs[R_ES], &sregs.es);
2158    get_seg(&env->segs[R_FS], &sregs.fs);
2159    get_seg(&env->segs[R_GS], &sregs.gs);
2160    get_seg(&env->segs[R_SS], &sregs.ss);
2161
2162    get_seg(&env->tr, &sregs.tr);
2163    get_seg(&env->ldt, &sregs.ldt);
2164
2165    env->idt.limit = sregs.idt.limit;
2166    env->idt.base = sregs.idt.base;
2167    env->gdt.limit = sregs.gdt.limit;
2168    env->gdt.base = sregs.gdt.base;
2169
2170    env->cr[0] = sregs.cr0;
2171    env->cr[2] = sregs.cr2;
2172    env->cr[3] = sregs.cr3;
2173    env->cr[4] = sregs.cr4;
2174
2175    env->efer = sregs.efer;
2176
2177    /* changes to apic base and cr8/tpr are read back via kvm_arch_post_run */
2178    x86_update_hflags(env);
2179
2180    return 0;
2181}
2182
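    /*
     * Read back every MSR known to be in use on this vCPU: the index list
     * largely mirrors the one built in kvm_put_msrs(), the values are fetched
     * with a single KVM_GET_MSRS call and then scattered into CPUX86State
     * below.
     */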
2183static int kvm_get_msrs(X86CPU *cpu)
2184{
2185    CPUX86State *env = &cpu->env;
2186    struct kvm_msr_entry *msrs = cpu->kvm_msr_buf->entries;
2187    int ret, i;
2188    uint64_t mtrr_top_bits;
2189
2190    kvm_msr_buf_reset(cpu);
2191
2192    kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_CS, 0);
2193    kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_ESP, 0);
2194    kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_EIP, 0);
2195    kvm_msr_entry_add(cpu, MSR_PAT, 0);
2196    if (has_msr_star) {
2197        kvm_msr_entry_add(cpu, MSR_STAR, 0);
2198    }
2199    if (has_msr_hsave_pa) {
2200        kvm_msr_entry_add(cpu, MSR_VM_HSAVE_PA, 0);
2201    }
2202    if (has_msr_tsc_aux) {
2203        kvm_msr_entry_add(cpu, MSR_TSC_AUX, 0);
2204    }
2205    if (has_msr_tsc_adjust) {
2206        kvm_msr_entry_add(cpu, MSR_TSC_ADJUST, 0);
2207    }
2208    if (has_msr_tsc_deadline) {
2209        kvm_msr_entry_add(cpu, MSR_IA32_TSCDEADLINE, 0);
2210    }
2211    if (has_msr_misc_enable) {
2212        kvm_msr_entry_add(cpu, MSR_IA32_MISC_ENABLE, 0);
2213    }
2214    if (has_msr_smbase) {
2215        kvm_msr_entry_add(cpu, MSR_IA32_SMBASE, 0);
2216    }
2217    if (has_msr_smi_count) {
2218        kvm_msr_entry_add(cpu, MSR_SMI_COUNT, 0);
2219    }
2220    if (has_msr_feature_control) {
2221        kvm_msr_entry_add(cpu, MSR_IA32_FEATURE_CONTROL, 0);
2222    }
2223    if (has_msr_bndcfgs) {
2224        kvm_msr_entry_add(cpu, MSR_IA32_BNDCFGS, 0);
2225    }
2226    if (has_msr_xss) {
2227        kvm_msr_entry_add(cpu, MSR_IA32_XSS, 0);
2228    }
2229    if (has_msr_spec_ctrl) {
2230        kvm_msr_entry_add(cpu, MSR_IA32_SPEC_CTRL, 0);
2231    }
2232    if (has_msr_virt_ssbd) {
2233        kvm_msr_entry_add(cpu, MSR_VIRT_SSBD, 0);
2234    }
2235    if (!env->tsc_valid) {
2236        kvm_msr_entry_add(cpu, MSR_IA32_TSC, 0);
2237        env->tsc_valid = !runstate_is_running();
2238    }
2239
2240#ifdef TARGET_X86_64
2241    if (lm_capable_kernel) {
2242        kvm_msr_entry_add(cpu, MSR_CSTAR, 0);
2243        kvm_msr_entry_add(cpu, MSR_KERNELGSBASE, 0);
2244        kvm_msr_entry_add(cpu, MSR_FMASK, 0);
2245        kvm_msr_entry_add(cpu, MSR_LSTAR, 0);
2246    }
2247#endif
2248    kvm_msr_entry_add(cpu, MSR_KVM_SYSTEM_TIME, 0);
2249    kvm_msr_entry_add(cpu, MSR_KVM_WALL_CLOCK, 0);
2250    if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_ASYNC_PF)) {
2251        kvm_msr_entry_add(cpu, MSR_KVM_ASYNC_PF_EN, 0);
2252    }
2253    if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_PV_EOI)) {
2254        kvm_msr_entry_add(cpu, MSR_KVM_PV_EOI_EN, 0);
2255    }
2256    if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_STEAL_TIME)) {
2257        kvm_msr_entry_add(cpu, MSR_KVM_STEAL_TIME, 0);
2258    }
2259    if (has_architectural_pmu_version > 0) {
2260        if (has_architectural_pmu_version > 1) {
2261            kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
2262            kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_CTRL, 0);
2263            kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_STATUS, 0);
2264            kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_OVF_CTRL, 0);
2265        }
2266        for (i = 0; i < num_architectural_pmu_fixed_counters; i++) {
2267            kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR0 + i, 0);
2268        }
2269        for (i = 0; i < num_architectural_pmu_gp_counters; i++) {
2270            kvm_msr_entry_add(cpu, MSR_P6_PERFCTR0 + i, 0);
2271            kvm_msr_entry_add(cpu, MSR_P6_EVNTSEL0 + i, 0);
2272        }
2273    }
2274
2275    if (env->mcg_cap) {
2276        kvm_msr_entry_add(cpu, MSR_MCG_STATUS, 0);
2277        kvm_msr_entry_add(cpu, MSR_MCG_CTL, 0);
2278        if (has_msr_mcg_ext_ctl) {
2279            kvm_msr_entry_add(cpu, MSR_MCG_EXT_CTL, 0);
2280        }
2281        for (i = 0; i < (env->mcg_cap & 0xff) * 4; i++) {
2282            kvm_msr_entry_add(cpu, MSR_MC0_CTL + i, 0);
2283        }
2284    }
2285
2286    if (has_msr_hv_hypercall) {
2287        kvm_msr_entry_add(cpu, HV_X64_MSR_HYPERCALL, 0);
2288        kvm_msr_entry_add(cpu, HV_X64_MSR_GUEST_OS_ID, 0);
2289    }
2290    if (cpu->hyperv_vapic) {
2291        kvm_msr_entry_add(cpu, HV_X64_MSR_APIC_ASSIST_PAGE, 0);
2292    }
2293    if (cpu->hyperv_time) {
2294        kvm_msr_entry_add(cpu, HV_X64_MSR_REFERENCE_TSC, 0);
2295    }
2296    if (cpu->hyperv_reenlightenment) {
2297        kvm_msr_entry_add(cpu, HV_X64_MSR_REENLIGHTENMENT_CONTROL, 0);
2298        kvm_msr_entry_add(cpu, HV_X64_MSR_TSC_EMULATION_CONTROL, 0);
2299        kvm_msr_entry_add(cpu, HV_X64_MSR_TSC_EMULATION_STATUS, 0);
2300    }
2301    if (has_msr_hv_crash) {
2302        int j;
2303
2304        for (j = 0; j < HV_CRASH_PARAMS; j++) {
2305            kvm_msr_entry_add(cpu, HV_X64_MSR_CRASH_P0 + j, 0);
2306        }
2307    }
2308    if (has_msr_hv_runtime) {
2309        kvm_msr_entry_add(cpu, HV_X64_MSR_VP_RUNTIME, 0);
2310    }
2311    if (cpu->hyperv_synic) {
2312        uint32_t msr;
2313
2314        kvm_msr_entry_add(cpu, HV_X64_MSR_SCONTROL, 0);
2315        kvm_msr_entry_add(cpu, HV_X64_MSR_SIEFP, 0);
2316        kvm_msr_entry_add(cpu, HV_X64_MSR_SIMP, 0);
2317        for (msr = HV_X64_MSR_SINT0; msr <= HV_X64_MSR_SINT15; msr++) {
2318            kvm_msr_entry_add(cpu, msr, 0);
2319        }
2320    }
2321    if (has_msr_hv_stimer) {
2322        uint32_t msr;
2323
2324        for (msr = HV_X64_MSR_STIMER0_CONFIG; msr <= HV_X64_MSR_STIMER3_COUNT;
2325             msr++) {
2326            kvm_msr_entry_add(cpu, msr, 0);
2327        }
2328    }
2329    if (env->features[FEAT_1_EDX] & CPUID_MTRR) {
2330        kvm_msr_entry_add(cpu, MSR_MTRRdefType, 0);
2331        kvm_msr_entry_add(cpu, MSR_MTRRfix64K_00000, 0);
2332        kvm_msr_entry_add(cpu, MSR_MTRRfix16K_80000, 0);
2333        kvm_msr_entry_add(cpu, MSR_MTRRfix16K_A0000, 0);
2334        kvm_msr_entry_add(cpu, MSR_MTRRfix4K_C0000, 0);
2335        kvm_msr_entry_add(cpu, MSR_MTRRfix4K_C8000, 0);
2336        kvm_msr_entry_add(cpu, MSR_MTRRfix4K_D0000, 0);
2337        kvm_msr_entry_add(cpu, MSR_MTRRfix4K_D8000, 0);
2338        kvm_msr_entry_add(cpu, MSR_MTRRfix4K_E0000, 0);
2339        kvm_msr_entry_add(cpu, MSR_MTRRfix4K_E8000, 0);
2340        kvm_msr_entry_add(cpu, MSR_MTRRfix4K_F0000, 0);
2341        kvm_msr_entry_add(cpu, MSR_MTRRfix4K_F8000, 0);
2342        for (i = 0; i < MSR_MTRRcap_VCNT; i++) {
2343            kvm_msr_entry_add(cpu, MSR_MTRRphysBase(i), 0);
2344            kvm_msr_entry_add(cpu, MSR_MTRRphysMask(i), 0);
2345        }
2346    }
2347
2348    if (env->features[FEAT_7_0_EBX] & CPUID_7_0_EBX_INTEL_PT) {
2349        int addr_num =
2350            kvm_arch_get_supported_cpuid(kvm_state, 0x14, 1, R_EAX) & 0x7;
2351
2352        kvm_msr_entry_add(cpu, MSR_IA32_RTIT_CTL, 0);
2353        kvm_msr_entry_add(cpu, MSR_IA32_RTIT_STATUS, 0);
2354        kvm_msr_entry_add(cpu, MSR_IA32_RTIT_OUTPUT_BASE, 0);
2355        kvm_msr_entry_add(cpu, MSR_IA32_RTIT_OUTPUT_MASK, 0);
2356        kvm_msr_entry_add(cpu, MSR_IA32_RTIT_CR3_MATCH, 0);
2357        for (i = 0; i < addr_num; i++) {
2358            kvm_msr_entry_add(cpu, MSR_IA32_RTIT_ADDR0_A + i, 0);
2359        }
2360    }
2361
2362    ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_MSRS, cpu->kvm_msr_buf);
2363    if (ret < 0) {
2364        return ret;
2365    }
2366
2367    if (ret < cpu->kvm_msr_buf->nmsrs) {
2368        struct kvm_msr_entry *e = &cpu->kvm_msr_buf->entries[ret];
2369        error_report("error: failed to get MSR 0x%" PRIx32,
2370                     (uint32_t)e->index);
2371    }
2372
2373    assert(ret == cpu->kvm_msr_buf->nmsrs);
2374    /*
2375     * MTRR masks: Each mask consists of 5 parts
2376     * a  10..0: must be zero
2377     * b  11   : valid bit
2378     * c  n-1..12: actual mask bits
2379     * d  51..n: reserved, must be zero
2380     * e  63..52: reserved, must be zero
2381     *
2382     * 'n' is the number of physical bits supported by the CPU and is
2383     * apparently always <= 52.  We know our 'n' but don't know what
2384     * the destination's 'n' is; it might be smaller, in which case
2385     * it masks (c) on loading. It might be larger, in which case
2386     * we fill 'd' so that d..c is consistent irrespective of the 'n'
2387     * we're migrating to.
2388     */
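    /*
     * For example, with cpu->phys_bits == 40 and fill_mtrr_mask enabled,
     * mtrr_top_bits below is MAKE_64BIT_MASK(40, 12) == 0x000fff0000000000
     * (bits 51..40), which gets OR-ed into every variable-range MTRR mask
     * read back from KVM.
     */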
2389
2390    if (cpu->fill_mtrr_mask) {
2391        QEMU_BUILD_BUG_ON(TARGET_PHYS_ADDR_SPACE_BITS > 52);
2392        assert(cpu->phys_bits <= TARGET_PHYS_ADDR_SPACE_BITS);
2393        mtrr_top_bits = MAKE_64BIT_MASK(cpu->phys_bits, 52 - cpu->phys_bits);
2394    } else {
2395        mtrr_top_bits = 0;
2396    }
2397
2398    for (i = 0; i < ret; i++) {
2399        uint32_t index = msrs[i].index;
2400        switch (index) {
2401        case MSR_IA32_SYSENTER_CS:
2402            env->sysenter_cs = msrs[i].data;
2403            break;
2404        case MSR_IA32_SYSENTER_ESP:
2405            env->sysenter_esp = msrs[i].data;
2406            break;
2407        case MSR_IA32_SYSENTER_EIP:
2408            env->sysenter_eip = msrs[i].data;
2409            break;
2410        case MSR_PAT:
2411            env->pat = msrs[i].data;
2412            break;
2413        case MSR_STAR:
2414            env->star = msrs[i].data;
2415            break;
2416#ifdef TARGET_X86_64
2417        case MSR_CSTAR:
2418            env->cstar = msrs[i].data;
2419            break;
2420        case MSR_KERNELGSBASE:
2421            env->kernelgsbase = msrs[i].data;
2422            break;
2423        case MSR_FMASK:
2424            env->fmask = msrs[i].data;
2425            break;
2426        case MSR_LSTAR:
2427            env->lstar = msrs[i].data;
2428            break;
2429#endif
2430        case MSR_IA32_TSC:
2431            env->tsc = msrs[i].data;
2432            break;
2433        case MSR_TSC_AUX:
2434            env->tsc_aux = msrs[i].data;
2435            break;
2436        case MSR_TSC_ADJUST:
2437            env->tsc_adjust = msrs[i].data;
2438            break;
2439        case MSR_IA32_TSCDEADLINE:
2440            env->tsc_deadline = msrs[i].data;
2441            break;
2442        case MSR_VM_HSAVE_PA:
2443            env->vm_hsave = msrs[i].data;
2444            break;
2445        case MSR_KVM_SYSTEM_TIME:
2446            env->system_time_msr = msrs[i].data;
2447            break;
2448        case MSR_KVM_WALL_CLOCK:
2449            env->wall_clock_msr = msrs[i].data;
2450            break;
2451        case MSR_MCG_STATUS:
2452            env->mcg_status = msrs[i].data;
2453            break;
2454        case MSR_MCG_CTL:
2455            env->mcg_ctl = msrs[i].data;
2456            break;
2457        case MSR_MCG_EXT_CTL:
2458            env->mcg_ext_ctl = msrs[i].data;
2459            break;
2460        case MSR_IA32_MISC_ENABLE:
2461            env->msr_ia32_misc_enable = msrs[i].data;
2462            break;
2463        case MSR_IA32_SMBASE:
2464            env->smbase = msrs[i].data;
2465            break;
2466        case MSR_SMI_COUNT:
2467            env->msr_smi_count = msrs[i].data;
2468            break;
2469        case MSR_IA32_FEATURE_CONTROL:
2470            env->msr_ia32_feature_control = msrs[i].data;
2471            break;
2472        case MSR_IA32_BNDCFGS:
2473            env->msr_bndcfgs = msrs[i].data;
2474            break;
2475        case MSR_IA32_XSS:
2476            env->xss = msrs[i].data;
2477            break;
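            /* MCE bank MSRs (MSR_MC0_CTL and up) have no case label of their
             * own and are picked up by this default branch. */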
2478        default:
2479            if (msrs[i].index >= MSR_MC0_CTL &&
2480                msrs[i].index < MSR_MC0_CTL + (env->mcg_cap & 0xff) * 4) {
2481                env->mce_banks[msrs[i].index - MSR_MC0_CTL] = msrs[i].data;
2482            }
2483            break;
2484        case MSR_KVM_ASYNC_PF_EN:
2485            env->async_pf_en_msr = msrs[i].data;
2486            break;
2487        case MSR_KVM_PV_EOI_EN:
2488            env->pv_eoi_en_msr = msrs[i].data;
2489            break;
2490        case MSR_KVM_STEAL_TIME:
2491            env->steal_time_msr = msrs[i].data;
2492            break;
2493        case MSR_CORE_PERF_FIXED_CTR_CTRL:
2494            env->msr_fixed_ctr_ctrl = msrs[i].data;
2495            break;
2496        case MSR_CORE_PERF_GLOBAL_CTRL:
2497            env->msr_global_ctrl = msrs[i].data;
2498            break;
2499        case MSR_CORE_PERF_GLOBAL_STATUS:
2500            env->msr_global_status = msrs[i].data;
2501            break;
2502        case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
2503            env->msr_global_ovf_ctrl = msrs[i].data;
2504            break;
2505        case MSR_CORE_PERF_FIXED_CTR0 ... MSR_CORE_PERF_FIXED_CTR0 + MAX_FIXED_COUNTERS - 1:
2506            env->msr_fixed_counters[index - MSR_CORE_PERF_FIXED_CTR0] = msrs[i].data;
2507            break;
2508        case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR0 + MAX_GP_COUNTERS - 1:
2509            env->msr_gp_counters[index - MSR_P6_PERFCTR0] = msrs[i].data;
2510            break;
2511        case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL0 + MAX_GP_COUNTERS - 1:
2512            env->msr_gp_evtsel[index - MSR_P6_EVNTSEL0] = msrs[i].data;
2513            break;
2514        case HV_X64_MSR_HYPERCALL:
2515            env->msr_hv_hypercall = msrs[i].data;
2516            break;
2517        case HV_X64_MSR_GUEST_OS_ID:
2518            env->msr_hv_guest_os_id = msrs[i].data;
2519            break;
2520        case HV_X64_MSR_APIC_ASSIST_PAGE:
2521            env->msr_hv_vapic = msrs[i].data;
2522            break;
2523        case HV_X64_MSR_REFERENCE_TSC:
2524            env->msr_hv_tsc = msrs[i].data;
2525            break;
2526        case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
2527            env->msr_hv_crash_params[index - HV_X64_MSR_CRASH_P0] = msrs[i].data;
2528            break;
2529        case HV_X64_MSR_VP_RUNTIME:
2530            env->msr_hv_runtime = msrs[i].data;
2531            break;
2532        case HV_X64_MSR_SCONTROL:
2533            env->msr_hv_synic_control = msrs[i].data;
2534            break;
2535        case HV_X64_MSR_SIEFP:
2536            env->msr_hv_synic_evt_page = msrs[i].data;
2537            break;
2538        case HV_X64_MSR_SIMP:
2539            env->msr_hv_synic_msg_page = msrs[i].data;
2540            break;
2541        case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
2542            env->msr_hv_synic_sint[index - HV_X64_MSR_SINT0] = msrs[i].data;
2543            break;
2544        case HV_X64_MSR_STIMER0_CONFIG:
2545        case HV_X64_MSR_STIMER1_CONFIG:
2546        case HV_X64_MSR_STIMER2_CONFIG:
2547        case HV_X64_MSR_STIMER3_CONFIG:
2548            env->msr_hv_stimer_config[(index - HV_X64_MSR_STIMER0_CONFIG)/2] =
2549                                msrs[i].data;
2550            break;
2551        case HV_X64_MSR_STIMER0_COUNT:
2552        case HV_X64_MSR_STIMER1_COUNT:
2553        case HV_X64_MSR_STIMER2_COUNT:
2554        case HV_X64_MSR_STIMER3_COUNT:
2555            env->msr_hv_stimer_count[(index - HV_X64_MSR_STIMER0_COUNT)/2] =
2556                                msrs[i].data;
2557            break;
2558        case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
2559            env->msr_hv_reenlightenment_control = msrs[i].data;
2560            break;
2561        case HV_X64_MSR_TSC_EMULATION_CONTROL:
2562            env->msr_hv_tsc_emulation_control = msrs[i].data;
2563            break;
2564        case HV_X64_MSR_TSC_EMULATION_STATUS:
2565            env->msr_hv_tsc_emulation_status = msrs[i].data;
2566            break;
2567        case MSR_MTRRdefType:
2568            env->mtrr_deftype = msrs[i].data;
2569            break;
2570        case MSR_MTRRfix64K_00000:
2571            env->mtrr_fixed[0] = msrs[i].data;
2572            break;
2573        case MSR_MTRRfix16K_80000:
2574            env->mtrr_fixed[1] = msrs[i].data;
2575            break;
2576        case MSR_MTRRfix16K_A0000:
2577            env->mtrr_fixed[2] = msrs[i].data;
2578            break;
2579        case MSR_MTRRfix4K_C0000:
2580            env->mtrr_fixed[3] = msrs[i].data;
2581            break;
2582        case MSR_MTRRfix4K_C8000:
2583            env->mtrr_fixed[4] = msrs[i].data;
2584            break;
2585        case MSR_MTRRfix4K_D0000:
2586            env->mtrr_fixed[5] = msrs[i].data;
2587            break;
2588        case MSR_MTRRfix4K_D8000:
2589            env->mtrr_fixed[6] = msrs[i].data;
2590            break;
2591        case MSR_MTRRfix4K_E0000:
2592            env->mtrr_fixed[7] = msrs[i].data;
2593            break;
2594        case MSR_MTRRfix4K_E8000:
2595            env->mtrr_fixed[8] = msrs[i].data;
2596            break;
2597        case MSR_MTRRfix4K_F0000:
2598            env->mtrr_fixed[9] = msrs[i].data;
2599            break;
2600        case MSR_MTRRfix4K_F8000:
2601            env->mtrr_fixed[10] = msrs[i].data;
2602            break;
2603        case MSR_MTRRphysBase(0) ... MSR_MTRRphysMask(MSR_MTRRcap_VCNT - 1):
2604            if (index & 1) {
2605                env->mtrr_var[MSR_MTRRphysIndex(index)].mask = msrs[i].data |
2606                                                               mtrr_top_bits;
2607            } else {
2608                env->mtrr_var[MSR_MTRRphysIndex(index)].base = msrs[i].data;
2609            }
2610            break;
2611        case MSR_IA32_SPEC_CTRL:
2612            env->spec_ctrl = msrs[i].data;
2613            break;
2614        case MSR_VIRT_SSBD:
2615            env->virt_ssbd = msrs[i].data;
2616            break;
2617        case MSR_IA32_RTIT_CTL:
2618            env->msr_rtit_ctrl = msrs[i].data;
2619            break;
2620        case MSR_IA32_RTIT_STATUS:
2621            env->msr_rtit_status = msrs[i].data;
2622            break;
2623        case MSR_IA32_RTIT_OUTPUT_BASE:
2624            env->msr_rtit_output_base = msrs[i].data;
2625            break;
2626        case MSR_IA32_RTIT_OUTPUT_MASK:
2627            env->msr_rtit_output_mask = msrs[i].data;
2628            break;
2629        case MSR_IA32_RTIT_CR3_MATCH:
2630            env->msr_rtit_cr3_match = msrs[i].data;
2631            break;
2632        case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
2633            env->msr_rtit_addrs[index - MSR_IA32_RTIT_ADDR0_A] = msrs[i].data;
2634            break;
2635        }
2636    }
2637
2638    return 0;
2639}
2640
2641static int kvm_put_mp_state(X86CPU *cpu)
2642{
2643    struct kvm_mp_state mp_state = { .mp_state = cpu->env.mp_state };
2644
2645    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MP_STATE, &mp_state);
2646}
2647
2648static int kvm_get_mp_state(X86CPU *cpu)
2649{
2650    CPUState *cs = CPU(cpu);
2651    CPUX86State *env = &cpu->env;
2652    struct kvm_mp_state mp_state;
2653    int ret;
2654
2655    ret = kvm_vcpu_ioctl(cs, KVM_GET_MP_STATE, &mp_state);
2656    if (ret < 0) {
2657        return ret;
2658    }
2659    env->mp_state = mp_state.mp_state;
2660    if (kvm_irqchip_in_kernel()) {
2661        cs->halted = (mp_state.mp_state == KVM_MP_STATE_HALTED);
2662    }
2663    return 0;
2664}
2665
2666static int kvm_get_apic(X86CPU *cpu)
2667{
2668    DeviceState *apic = cpu->apic_state;
2669    struct kvm_lapic_state kapic;
2670    int ret;
2671
2672    if (apic && kvm_irqchip_in_kernel()) {
2673        ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_LAPIC, &kapic);
2674        if (ret < 0) {
2675            return ret;
2676        }
2677
2678        kvm_get_apic_state(apic, &kapic);
2679    }
2680    return 0;
2681}
2682
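    /*
     * Push the pending exception/interrupt/NMI/SMI injection state and the
     * SIPI vector to the kernel with KVM_SET_VCPU_EVENTS.
     */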
2683static int kvm_put_vcpu_events(X86CPU *cpu, int level)
2684{
2685    CPUState *cs = CPU(cpu);
2686    CPUX86State *env = &cpu->env;
2687    struct kvm_vcpu_events events = {};
2688
2689    if (!kvm_has_vcpu_events()) {
2690        return 0;
2691    }
2692
2693    events.exception.injected = (env->exception_injected >= 0);
2694    events.exception.nr = env->exception_injected;
2695    events.exception.has_error_code = env->has_error_code;
2696    events.exception.error_code = env->error_code;
2697    events.exception.pad = 0;
2698
2699    events.interrupt.injected = (env->interrupt_injected >= 0);
2700    events.interrupt.nr = env->interrupt_injected;
2701    events.interrupt.soft = env->soft_interrupt;
2702
2703    events.nmi.injected = env->nmi_injected;
2704    events.nmi.pending = env->nmi_pending;
2705    events.nmi.masked = !!(env->hflags2 & HF2_NMI_MASK);
2706    events.nmi.pad = 0;
2707
2708    events.sipi_vector = env->sipi_vector;
2709    events.flags = 0;
2710
2711    if (has_msr_smbase) {
2712        events.smi.smm = !!(env->hflags & HF_SMM_MASK);
2713        events.smi.smm_inside_nmi = !!(env->hflags2 & HF2_SMM_INSIDE_NMI_MASK);
2714        if (kvm_irqchip_in_kernel()) {
2715            /* As soon as these are moved to the kernel, remove them
2716             * from cs->interrupt_request.
2717             */
2718            events.smi.pending = cs->interrupt_request & CPU_INTERRUPT_SMI;
2719            events.smi.latched_init = cs->interrupt_request & CPU_INTERRUPT_INIT;
2720            cs->interrupt_request &= ~(CPU_INTERRUPT_INIT | CPU_INTERRUPT_SMI);
2721        } else {
2722            /* Keep these in cs->interrupt_request.  */
2723            events.smi.pending = 0;
2724            events.smi.latched_init = 0;
2725        }
2726        /* Stop SMI delivery on old machine types to avoid a reboot
2727         * on an incoming migration of an old VM.
2728         */
2729        if (!cpu->kvm_no_smi_migration) {
2730            events.flags |= KVM_VCPUEVENT_VALID_SMM;
2731        }
2732    }
2733
2734    if (level >= KVM_PUT_RESET_STATE) {
2735        events.flags |= KVM_VCPUEVENT_VALID_NMI_PENDING;
2736        if (env->mp_state == KVM_MP_STATE_SIPI_RECEIVED) {
2737            events.flags |= KVM_VCPUEVENT_VALID_SIPI_VECTOR;
2738        }
2739    }
2740
2741    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_VCPU_EVENTS, &events);
2742}
2743
2744static int kvm_get_vcpu_events(X86CPU *cpu)
2745{
2746    CPUX86State *env = &cpu->env;
2747    struct kvm_vcpu_events events;
2748    int ret;
2749
2750    if (!kvm_has_vcpu_events()) {
2751        return 0;
2752    }
2753
2754    memset(&events, 0, sizeof(events));
2755    ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_VCPU_EVENTS, &events);
2756    if (ret < 0) {
2757       return ret;
2758    }
2759    env->exception_injected =
2760       events.exception.injected ? events.exception.nr : -1;
2761    env->has_error_code = events.exception.has_error_code;
2762    env->error_code = events.exception.error_code;
2763
2764    env->interrupt_injected =
2765        events.interrupt.injected ? events.interrupt.nr : -1;
2766    env->soft_interrupt = events.interrupt.soft;
2767
2768    env->nmi_injected = events.nmi.injected;
2769    env->nmi_pending = events.nmi.pending;
2770    if (events.nmi.masked) {
2771        env->hflags2 |= HF2_NMI_MASK;
2772    } else {
2773        env->hflags2 &= ~HF2_NMI_MASK;
2774    }
2775
2776    if (events.flags & KVM_VCPUEVENT_VALID_SMM) {
2777        if (events.smi.smm) {
2778            env->hflags |= HF_SMM_MASK;
2779        } else {
2780            env->hflags &= ~HF_SMM_MASK;
2781        }
2782        if (events.smi.pending) {
2783            cpu_interrupt(CPU(cpu), CPU_INTERRUPT_SMI);
2784        } else {
2785            cpu_reset_interrupt(CPU(cpu), CPU_INTERRUPT_SMI);
2786        }
2787        if (events.smi.smm_inside_nmi) {
2788            env->hflags2 |= HF2_SMM_INSIDE_NMI_MASK;
2789        } else {
2790            env->hflags2 &= ~HF2_SMM_INSIDE_NMI_MASK;
2791        }
2792        if (events.smi.latched_init) {
2793            cpu_interrupt(CPU(cpu), CPU_INTERRUPT_INIT);
2794        } else {
2795            cpu_reset_interrupt(CPU(cpu), CPU_INTERRUPT_INIT);
2796        }
2797    }
2798
2799    env->sipi_vector = events.sipi_vector;
2800
2801    return 0;
2802}
2803
2804static int kvm_guest_debug_workarounds(X86CPU *cpu)
2805{
2806    CPUState *cs = CPU(cpu);
2807    CPUX86State *env = &cpu->env;
2808    int ret = 0;
2809    unsigned long reinject_trap = 0;
2810
2811    if (!kvm_has_vcpu_events()) {
2812        if (env->exception_injected == 1) {
2813            reinject_trap = KVM_GUESTDBG_INJECT_DB;
2814        } else if (env->exception_injected == 3) {
2815            reinject_trap = KVM_GUESTDBG_INJECT_BP;
2816        }
2817        env->exception_injected = -1;
2818    }
2819
2820    /*
2821     * Kernels before KVM_CAP_X86_ROBUST_SINGLESTEP overwrote flags.TF
2822     * injected via SET_GUEST_DEBUG while updating GP regs. Work around this
2823     * by updating the debug state once again if single-stepping is on.
2824     * Another reason to call kvm_update_guest_debug here is a pending debug
2825     * trap raised by the guest. On kernels without SET_VCPU_EVENTS we have to
2826     * reinject them via SET_GUEST_DEBUG.
2827     */
2828    if (reinject_trap ||
2829        (!kvm_has_robust_singlestep() && cs->singlestep_enabled)) {
2830        ret = kvm_update_guest_debug(cs, reinject_trap);
2831    }
2832    return ret;
2833}
2834
2835static int kvm_put_debugregs(X86CPU *cpu)
2836{
2837    CPUX86State *env = &cpu->env;
2838    struct kvm_debugregs dbgregs;
2839    int i;
2840
2841    if (!kvm_has_debugregs()) {
2842        return 0;
2843    }
2844
2845    for (i = 0; i < 4; i++) {
2846        dbgregs.db[i] = env->dr[i];
2847    }
2848    dbgregs.dr6 = env->dr[6];
2849    dbgregs.dr7 = env->dr[7];
2850    dbgregs.flags = 0;
2851
2852    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_DEBUGREGS, &dbgregs);
2853}
2854
2855static int kvm_get_debugregs(X86CPU *cpu)
2856{
2857    CPUX86State *env = &cpu->env;
2858    struct kvm_debugregs dbgregs;
2859    int i, ret;
2860
2861    if (!kvm_has_debugregs()) {
2862        return 0;
2863    }
2864
2865    ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_DEBUGREGS, &dbgregs);
2866    if (ret < 0) {
2867        return ret;
2868    }
2869    for (i = 0; i < 4; i++) {
2870        env->dr[i] = dbgregs.db[i];
2871    }
2872    env->dr[4] = env->dr[6] = dbgregs.dr6;
2873    env->dr[5] = env->dr[7] = dbgregs.dr7;
2874
2875    return 0;
2876}
2877
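    /*
     * Write the complete vCPU state to KVM.  Ordering matters: the feature
     * control MSR goes first so a reset leaves nested mode, old-style MCE
     * injection must precede the MSR bulk write, and the guest-debug
     * workaround must come last.
     */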
2878int kvm_arch_put_registers(CPUState *cpu, int level)
2879{
2880    X86CPU *x86_cpu = X86_CPU(cpu);
2881    int ret;
2882
2883    assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));
2884
2885    if (level >= KVM_PUT_RESET_STATE) {
2886        ret = kvm_put_msr_feature_control(x86_cpu);
2887        if (ret < 0) {
2888            return ret;
2889        }
2890    }
2891
2892    if (level == KVM_PUT_FULL_STATE) {
2893        /* We don't check for kvm_arch_set_tsc_khz() errors here,
2894         * because TSC frequency mismatch shouldn't abort migration,
2895         * unless the user explicitly asked for a more strict TSC
2896         * setting (e.g. using an explicit "tsc-freq" option).
2897         */
2898        kvm_arch_set_tsc_khz(cpu);
2899    }
2900
2901    ret = kvm_getput_regs(x86_cpu, 1);
2902    if (ret < 0) {
2903        return ret;
2904    }
2905    ret = kvm_put_xsave(x86_cpu);
2906    if (ret < 0) {
2907        return ret;
2908    }
2909    ret = kvm_put_xcrs(x86_cpu);
2910    if (ret < 0) {
2911        return ret;
2912    }
2913    ret = kvm_put_sregs(x86_cpu);
2914    if (ret < 0) {
2915        return ret;
2916    }
2917    /* must be before kvm_put_msrs */
2918    ret = kvm_inject_mce_oldstyle(x86_cpu);
2919    if (ret < 0) {
2920        return ret;
2921    }
2922    ret = kvm_put_msrs(x86_cpu, level);
2923    if (ret < 0) {
2924        return ret;
2925    }
2926    ret = kvm_put_vcpu_events(x86_cpu, level);
2927    if (ret < 0) {
2928        return ret;
2929    }
2930    if (level >= KVM_PUT_RESET_STATE) {
2931        ret = kvm_put_mp_state(x86_cpu);
2932        if (ret < 0) {
2933            return ret;
2934        }
2935    }
2936
2937    ret = kvm_put_tscdeadline_msr(x86_cpu);
2938    if (ret < 0) {
2939        return ret;
2940    }
2941    ret = kvm_put_debugregs(x86_cpu);
2942    if (ret < 0) {
2943        return ret;
2944    }
2945    /* must be last */
2946    ret = kvm_guest_debug_workarounds(x86_cpu);
2947    if (ret < 0) {
2948        return ret;
2949    }
2950    return 0;
2951}
2952
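    /*
     * Read the complete vCPU state back from KVM.  MP state is fetched before
     * the general registers because KVM_GET_MP_STATE can modify CS and RIP.
     */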
2953int kvm_arch_get_registers(CPUState *cs)
2954{
2955    X86CPU *cpu = X86_CPU(cs);
2956    int ret;
2957
2958    assert(cpu_is_stopped(cs) || qemu_cpu_is_self(cs));
2959
2960    ret = kvm_get_vcpu_events(cpu);
2961    if (ret < 0) {
2962        goto out;
2963    }
2964    /*
2965     * KVM_GET_MPSTATE can modify CS and RIP, call it before
2966     * KVM_GET_REGS and KVM_GET_SREGS.
2967     */
2968    ret = kvm_get_mp_state(cpu);
2969    if (ret < 0) {
2970        goto out;
2971    }
2972    ret = kvm_getput_regs(cpu, 0);
2973    if (ret < 0) {
2974        goto out;
2975    }
2976    ret = kvm_get_xsave(cpu);
2977    if (ret < 0) {
2978        goto out;
2979    }
2980    ret = kvm_get_xcrs(cpu);
2981    if (ret < 0) {
2982        goto out;
2983    }
2984    ret = kvm_get_sregs(cpu);
2985    if (ret < 0) {
2986        goto out;
2987    }
2988    ret = kvm_get_msrs(cpu);
2989    if (ret < 0) {
2990        goto out;
2991    }
2992    ret = kvm_get_apic(cpu);
2993    if (ret < 0) {
2994        goto out;
2995    }
2996    ret = kvm_get_debugregs(cpu);
2997    if (ret < 0) {
2998        goto out;
2999    }
3000    ret = 0;
3001 out:
3002    cpu_sync_bndcs_hflags(&cpu->env);
3003    return ret;
3004}
3005
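    /*
     * Runs immediately before KVM_RUN: injects any pending NMI/SMI, forces an
     * exit for INIT/TPR handling, and, with a userspace irqchip, injects PIC
     * interrupts and programs the interrupt-window request and CR8 in the
     * kvm_run structure.
     */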
3006void kvm_arch_pre_run(CPUState *cpu, struct kvm_run *run)
3007{
3008    X86CPU *x86_cpu = X86_CPU(cpu);
3009    CPUX86State *env = &x86_cpu->env;
3010    int ret;
3011
3012    /* Inject NMI and/or SMI */
3013    if (cpu->interrupt_request & (CPU_INTERRUPT_NMI | CPU_INTERRUPT_SMI)) {
3014        if (cpu->interrupt_request & CPU_INTERRUPT_NMI) {
3015            qemu_mutex_lock_iothread();
3016            cpu->interrupt_request &= ~CPU_INTERRUPT_NMI;
3017            qemu_mutex_unlock_iothread();
3018            DPRINTF("injected NMI\n");
3019            ret = kvm_vcpu_ioctl(cpu, KVM_NMI);
3020            if (ret < 0) {
3021                fprintf(stderr, "KVM: injection failed, NMI lost (%s)\n",
3022                        strerror(-ret));
3023            }
3024        }
3025        if (cpu->interrupt_request & CPU_INTERRUPT_SMI) {
3026            qemu_mutex_lock_iothread();
3027            cpu->interrupt_request &= ~CPU_INTERRUPT_SMI;
3028            qemu_mutex_unlock_iothread();
3029            DPRINTF("injected SMI\n");
3030            ret = kvm_vcpu_ioctl(cpu, KVM_SMI);
3031            if (ret < 0) {
3032                fprintf(stderr, "KVM: injection failed, SMI lost (%s)\n",
3033                        strerror(-ret));
3034            }
3035        }
3036    }
3037
3038    if (!kvm_pic_in_kernel()) {
3039        qemu_mutex_lock_iothread();
3040    }
3041
3042    /* Force the VCPU out of its inner loop to process any INIT requests
3043     * or (for userspace APIC, but it is cheap to combine the checks here)
3044     * pending TPR access reports.
3045     */
3046    if (cpu->interrupt_request & (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) {
3047        if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
3048            !(env->hflags & HF_SMM_MASK)) {
3049            cpu->exit_request = 1;
3050        }
3051        if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
3052            cpu->exit_request = 1;
3053        }
3054    }
3055
3056    if (!kvm_pic_in_kernel()) {
3057        /* Try to inject an interrupt if the guest can accept it */
3058        if (run->ready_for_interrupt_injection &&
3059            (cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
3060            (env->eflags & IF_MASK)) {
3061            int irq;
3062
3063            cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
3064            irq = cpu_get_pic_interrupt(env);
3065            if (irq >= 0) {
3066                struct kvm_interrupt intr;
3067
3068                intr.irq = irq;
3069                DPRINTF("injected interrupt %d\n", irq);
3070                ret = kvm_vcpu_ioctl(cpu, KVM_INTERRUPT, &intr);
3071                if (ret < 0) {
3072                    fprintf(stderr,
3073                            "KVM: injection failed, interrupt lost (%s)\n",
3074                            strerror(-ret));
3075                }
3076            }
3077        }
3078
3079        /* If we have an interrupt but the guest is not ready to receive an
3080         * interrupt, request an interrupt window exit.  This will
3081         * cause a return to userspace as soon as the guest is ready to
3082         * receive interrupts. */
3083        if ((cpu->interrupt_request & CPU_INTERRUPT_HARD)) {
3084            run->request_interrupt_window = 1;
3085        } else {
3086            run->request_interrupt_window = 0;
3087        }
3088
3089        DPRINTF("setting tpr\n");
3090        run->cr8 = cpu_get_apic_tpr(x86_cpu->apic_state);
3091
3092        qemu_mutex_unlock_iothread();
3093    }
3094}
3095
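    /*
     * Runs right after KVM_RUN returns: mirrors the SMM and IF flags reported
     * in kvm_run back into the CPU state and propagates the CR8/TPR and APIC
     * base values from kvm_run into the APIC device.
     */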
3096MemTxAttrs kvm_arch_post_run(CPUState *cpu, struct kvm_run *run)
3097{
3098    X86CPU *x86_cpu = X86_CPU(cpu);
3099    CPUX86State *env = &x86_cpu->env;
3100
3101    if (run->flags & KVM_RUN_X86_SMM) {
3102        env->hflags |= HF_SMM_MASK;
3103    } else {
3104        env->hflags &= ~HF_SMM_MASK;
3105    }
3106    if (run->if_flag) {
3107        env->eflags |= IF_MASK;
3108    } else {
3109        env->eflags &= ~IF_MASK;
3110    }
3111
3112    /* We need to protect the apic state against concurrent accesses from
3113     * different threads in case the userspace irqchip is used. */
3114    if (!kvm_irqchip_in_kernel()) {
3115        qemu_mutex_lock_iothread();
3116    }
3117    cpu_set_apic_tpr(x86_cpu->apic_state, run->cr8);
3118    cpu_set_apic_base(x86_cpu->apic_state, run->apic_base);
3119    if (!kvm_irqchip_in_kernel()) {
3120        qemu_mutex_unlock_iothread();
3121    }
3122    return cpu_get_mem_attrs(env);
3123}
3124
3125int kvm_arch_process_async_events(CPUState *cs)
3126{
3127    X86CPU *cpu = X86_CPU(cs);
3128    CPUX86State *env = &cpu->env;
3129
3130    if (cs->interrupt_request & CPU_INTERRUPT_MCE) {
3131        /* We must not raise CPU_INTERRUPT_MCE if it's not supported. */
3132        assert(env->mcg_cap);
3133
3134        cs->interrupt_request &= ~CPU_INTERRUPT_MCE;
3135
3136        kvm_cpu_synchronize_state(cs);
3137
3138        if (env->exception_injected == EXCP08_DBLE) {
3139            /* this means triple fault */
3140            qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
3141            cs->exit_request = 1;
3142            return 0;
3143        }
3144        env->exception_injected = EXCP12_MCHK;
3145        env->has_error_code = 0;
3146
3147        cs->halted = 0;
3148        if (kvm_irqchip_in_kernel() && env->mp_state == KVM_MP_STATE_HALTED) {
3149            env->mp_state = KVM_MP_STATE_RUNNABLE;
3150        }
3151    }
3152
3153    if ((cs->interrupt_request & CPU_INTERRUPT_INIT) &&
3154        !(env->hflags & HF_SMM_MASK)) {
3155        kvm_cpu_synchronize_state(cs);
3156        do_cpu_init(cpu);
3157    }
3158
3159    if (kvm_irqchip_in_kernel()) {
3160        return 0;
3161    }
3162
3163    if (cs->interrupt_request & CPU_INTERRUPT_POLL) {
3164        cs->interrupt_request &= ~CPU_INTERRUPT_POLL;
3165        apic_poll_irq(cpu->apic_state);
3166    }
3167    if (((cs->interrupt_request & CPU_INTERRUPT_HARD) &&
3168         (env->eflags & IF_MASK)) ||
3169        (cs->interrupt_request & CPU_INTERRUPT_NMI)) {
3170        cs->halted = 0;
3171    }
3172    if (cs->interrupt_request & CPU_INTERRUPT_SIPI) {
3173        kvm_cpu_synchronize_state(cs);
3174        do_cpu_sipi(cpu);
3175    }
3176    if (cs->interrupt_request & CPU_INTERRUPT_TPR) {
3177        cs->interrupt_request &= ~CPU_INTERRUPT_TPR;
3178        kvm_cpu_synchronize_state(cs);
3179        apic_handle_tpr_access_report(cpu->apic_state, env->eip,
3180                                      env->tpr_access_type);
3181    }
3182
3183    return cs->halted;
3184}
3185
3186static int kvm_handle_halt(X86CPU *cpu)
3187{
3188    CPUState *cs = CPU(cpu);
3189    CPUX86State *env = &cpu->env;
3190
3191    if (!((cs->interrupt_request & CPU_INTERRUPT_HARD) &&
3192          (env->eflags & IF_MASK)) &&
3193        !(cs->interrupt_request & CPU_INTERRUPT_NMI)) {
3194        cs->halted = 1;
3195        return EXCP_HLT;
3196    }
3197
3198    return 0;
3199}
3200
3201static int kvm_handle_tpr_access(X86CPU *cpu)
3202{
3203    CPUState *cs = CPU(cpu);
3204    struct kvm_run *run = cs->kvm_run;
3205
3206    apic_handle_tpr_access_report(cpu->apic_state, run->tpr_access.rip,
3207                                  run->tpr_access.is_write ? TPR_ACCESS_WRITE
3208                                                           : TPR_ACCESS_READ);
3209    return 1;
3210}
3211
3212int kvm_arch_insert_sw_breakpoint(CPUState *cs, struct kvm_sw_breakpoint *bp)
3213{
3214    static const uint8_t int3 = 0xcc;
3215
3216    if (cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&bp->saved_insn, 1, 0) ||
3217        cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&int3, 1, 1)) {
3218        return -EINVAL;
3219    }
3220    return 0;
3221}
3222
3223int kvm_arch_remove_sw_breakpoint(CPUState *cs, struct kvm_sw_breakpoint *bp)
3224{
3225    uint8_t int3;
3226
3227    if (cpu_memory_rw_debug(cs, bp->pc, &int3, 1, 0) || int3 != 0xcc ||
3228        cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&bp->saved_insn, 1, 1)) {
3229        return -EINVAL;
3230    }
3231    return 0;
3232}
3233
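/*
 * Mirror of the (at most four) hardware breakpoints/watchpoints requested by
 * the debugger; they are programmed into DR0-DR3 and DR7 via
 * kvm_arch_update_guest_debug() below.
 */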
3234static struct {
3235    target_ulong addr;
3236    int len;
3237    int type;
3238} hw_breakpoint[4];
3239
3240static int nb_hw_breakpoint;
3241
3242static int find_hw_breakpoint(target_ulong addr, int len, int type)
3243{
3244    int n;
3245
3246    for (n = 0; n < nb_hw_breakpoint; n++) {
3247        if (hw_breakpoint[n].addr == addr && hw_breakpoint[n].type == type &&
3248            (hw_breakpoint[n].len == len || len == -1)) {
3249            return n;
3250        }
3251    }
3252    return -1;
3253}
3254
3255int kvm_arch_insert_hw_breakpoint(target_ulong addr,
3256                                  target_ulong len, int type)
3257{
3258    switch (type) {
3259    case GDB_BREAKPOINT_HW:
3260        len = 1;
3261        break;
3262    case GDB_WATCHPOINT_WRITE:
3263    case GDB_WATCHPOINT_ACCESS:
3264        switch (len) {
3265        case 1:
3266            break;
3267        case 2:
3268        case 4:
3269        case 8:
3270            if (addr & (len - 1)) {
3271                return -EINVAL;
3272            }
3273            break;
3274        default:
3275            return -EINVAL;
3276        }
3277        break;
3278    default:
3279        return -ENOSYS;
3280    }
3281
3282    if (nb_hw_breakpoint == 4) {
3283        return -ENOBUFS;
3284    }
3285    if (find_hw_breakpoint(addr, len, type) >= 0) {
3286        return -EEXIST;
3287    }
3288    hw_breakpoint[nb_hw_breakpoint].addr = addr;
3289    hw_breakpoint[nb_hw_breakpoint].len = len;
3290    hw_breakpoint[nb_hw_breakpoint].type = type;
3291    nb_hw_breakpoint++;
3292
3293    return 0;
3294}
3295
3296int kvm_arch_remove_hw_breakpoint(target_ulong addr,
3297                                  target_ulong len, int type)
3298{
3299    int n;
3300
3301    n = find_hw_breakpoint(addr, (type == GDB_BREAKPOINT_HW) ? 1 : len, type);
3302    if (n < 0) {
3303        return -ENOENT;
3304    }
3305    nb_hw_breakpoint--;
3306    hw_breakpoint[n] = hw_breakpoint[nb_hw_breakpoint];
3307
3308    return 0;
3309}
3310
3311void kvm_arch_remove_all_hw_breakpoints(void)
3312{
3313    nb_hw_breakpoint = 0;
3314}
3315
3316static CPUWatchpoint hw_watchpoint;
3317
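/*
 * Debug exit: DR6 bit 14 (BS) indicates a single-step trap, while bits 0-3
 * indicate which hardware breakpoint slot fired; the matching DR7 R/W field
 * tells us whether it was an execution breakpoint or a data watchpoint.
 */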
3318static int kvm_handle_debug(X86CPU *cpu,
3319                            struct kvm_debug_exit_arch *arch_info)
3320{
3321    CPUState *cs = CPU(cpu);
3322    CPUX86State *env = &cpu->env;
3323    int ret = 0;
3324    int n;
3325
3326    if (arch_info->exception == 1) {
3327        if (arch_info->dr6 & (1 << 14)) {
3328            if (cs->singlestep_enabled) {
3329                ret = EXCP_DEBUG;
3330            }
3331        } else {
3332            for (n = 0; n < 4; n++) {
3333                if (arch_info->dr6 & (1 << n)) {
3334                    switch ((arch_info->dr7 >> (16 + n*4)) & 0x3) {
3335                    case 0x0:
3336                        ret = EXCP_DEBUG;
3337                        break;
3338                    case 0x1:
3339                        ret = EXCP_DEBUG;
3340                        cs->watchpoint_hit = &hw_watchpoint;
3341                        hw_watchpoint.vaddr = hw_breakpoint[n].addr;
3342                        hw_watchpoint.flags = BP_MEM_WRITE;
3343                        break;
3344                    case 0x3:
3345                        ret = EXCP_DEBUG;
3346                        cs->watchpoint_hit = &hw_watchpoint;
3347                        hw_watchpoint.vaddr = hw_breakpoint[n].addr;
3348                        hw_watchpoint.flags = BP_MEM_ACCESS;
3349                        break;
3350                    }
3351                }
3352            }
3353        }
3354    } else if (kvm_find_sw_breakpoint(cs, arch_info->pc)) {
3355        ret = EXCP_DEBUG;
3356    }
3357    if (ret == 0) {
3358        cpu_synchronize_state(cs);
3359        assert(env->exception_injected == -1);
3360
3361        /* pass to guest */
3362        env->exception_injected = arch_info->exception;
3363        env->has_error_code = 0;
3364    }
3365
3366    return ret;
3367}
3368
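/*
 * Translate the breakpoint list into x86 debug registers for
 * KVM_SET_GUEST_DEBUG.  For slot n, bit 2n+1 of DR7 is the global-enable
 * bit, bits 16+4n..17+4n encode the type and bits 18+4n..19+4n the length.
 * For example, a 4-byte write watchpoint in slot 1 sets bit 3 (G1),
 * R/W1 = 01b and LEN1 = 11b.
 */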
3369void kvm_arch_update_guest_debug(CPUState *cpu, struct kvm_guest_debug *dbg)
3370{
3371    const uint8_t type_code[] = {
3372        [GDB_BREAKPOINT_HW] = 0x0,
3373        [GDB_WATCHPOINT_WRITE] = 0x1,
3374        [GDB_WATCHPOINT_ACCESS] = 0x3
3375    };
3376    const uint8_t len_code[] = {
3377        [1] = 0x0, [2] = 0x1, [4] = 0x3, [8] = 0x2
3378    };
3379    int n;
3380
3381    if (kvm_sw_breakpoints_active(cpu)) {
3382        dbg->control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP;
3383    }
3384    if (nb_hw_breakpoint > 0) {
3385        dbg->control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP;
3386        dbg->arch.debugreg[7] = 0x0600;
3387        for (n = 0; n < nb_hw_breakpoint; n++) {
3388            dbg->arch.debugreg[n] = hw_breakpoint[n].addr;
3389            dbg->arch.debugreg[7] |= (2 << (n * 2)) |
3390                (type_code[hw_breakpoint[n].type] << (16 + n*4)) |
3391                ((uint32_t)len_code[hw_breakpoint[n].len] << (18 + n*4));
3392        }
3393    }
3394}
3395
3396static bool host_supports_vmx(void)
3397{
3398    uint32_t ecx, unused;
3399
3400    host_cpuid(1, 0, &unused, &unused, &ecx, &unused);
3401    return ecx & CPUID_EXT_VMX;
3402}
3403
3404#define VMX_INVALID_GUEST_STATE 0x80000021
3405
3406int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run)
3407{
3408    X86CPU *cpu = X86_CPU(cs);
3409    uint64_t code;
3410    int ret;
3411
3412    switch (run->exit_reason) {
3413    case KVM_EXIT_HLT:
3414        DPRINTF("handle_hlt\n");
3415        qemu_mutex_lock_iothread();
3416        ret = kvm_handle_halt(cpu);
3417        qemu_mutex_unlock_iothread();
3418        break;
3419    case KVM_EXIT_SET_TPR:
3420        ret = 0;
3421        break;
3422    case KVM_EXIT_TPR_ACCESS:
3423        qemu_mutex_lock_iothread();
3424        ret = kvm_handle_tpr_access(cpu);
3425        qemu_mutex_unlock_iothread();
3426        break;
3427    case KVM_EXIT_FAIL_ENTRY:
3428        code = run->fail_entry.hardware_entry_failure_reason;
3429        fprintf(stderr, "KVM: entry failed, hardware error 0x%" PRIx64 "\n",
3430                code);
3431        if (host_supports_vmx() && code == VMX_INVALID_GUEST_STATE) {
3432            fprintf(stderr,
3433                    "\nIf you're running a guest on an Intel machine without "
3434                        "unrestricted mode\n"
3435                    "support, the failure is most likely due to the guest "
3436                        "entering an invalid\n"
3437                    "state for Intel VT. For example, the guest may be running "
3438                        "in big real mode\n"
3439                    "which is not supported on older Intel processors."
3440                        "\n\n");
3441        }
3442        ret = -1;
3443        break;
3444    case KVM_EXIT_EXCEPTION:
3445        fprintf(stderr, "KVM: exception %d exit (error code 0x%x)\n",
3446                run->ex.exception, run->ex.error_code);
3447        ret = -1;
3448        break;
3449    case KVM_EXIT_DEBUG:
3450        DPRINTF("kvm_exit_debug\n");
3451        qemu_mutex_lock_iothread();
3452        ret = kvm_handle_debug(cpu, &run->debug.arch);
3453        qemu_mutex_unlock_iothread();
3454        break;
3455    case KVM_EXIT_HYPERV:
3456        ret = kvm_hv_handle_exit(cpu, &run->hyperv);
3457        break;
3458    case KVM_EXIT_IOAPIC_EOI:
3459        ioapic_eoi_broadcast(run->eoi.vector);
3460        ret = 0;
3461        break;
3462    default:
3463        fprintf(stderr, "KVM: unknown exit reason %d\n", run->exit_reason);
3464        ret = -1;
3465        break;
3466    }
3467
3468    return ret;
3469}
3470
3471bool kvm_arch_stop_on_emulation_error(CPUState *cs)
3472{
3473    X86CPU *cpu = X86_CPU(cs);
3474    CPUX86State *env = &cpu->env;
3475
3476    kvm_cpu_synchronize_state(cs);
3477    return !(env->cr[0] & CR0_PE_MASK) ||
3478           ((env->segs[R_CS].selector  & 3) != 3);
3479}
3480
3481void kvm_arch_init_irq_routing(KVMState *s)
3482{
3483    if (!kvm_check_extension(s, KVM_CAP_IRQ_ROUTING)) {
3484        /* If the kernel can't do IRQ routing, the interrupt source
3485         * override 0->2 required by the HPET cannot be set up,
3486         * so the HPET has to be disabled.
3487         */
3488        no_hpet = 1;
3489    }
3490    /* We know at this point that we're using the in-kernel
3491     * irqchip, so we can use irqfds, and on x86 we know
3492     * we can use msi via irqfd and GSI routing.
3493     */
3494    kvm_msi_via_irqfd_allowed = true;
3495    kvm_gsi_routing_allowed = true;
3496
3497    if (kvm_irqchip_is_split()) {
3498        int i;
3499
3500        /* If the ioapic is in QEMU and the lapics are in KVM, reserve
3501           MSI routes for signaling interrupts to the local apics. */
3502        for (i = 0; i < IOAPIC_NUM_PINS; i++) {
3503            if (kvm_irqchip_add_msi_route(s, 0, NULL) < 0) {
3504                error_report("Could not enable split IRQ mode.");
3505                exit(1);
3506            }
3507        }
3508    }
3509}
3510
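/*
 * With a split irqchip the LAPICs stay in the kernel while the IOAPIC and
 * PIC are emulated in QEMU.  The last argument to the capability enable
 * (24) is the number of IOAPIC routes reserved for userspace.
 */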
3511int kvm_arch_irqchip_create(MachineState *ms, KVMState *s)
3512{
3513    int ret;
3514    if (machine_kernel_irqchip_split(ms)) {
3515        ret = kvm_vm_enable_cap(s, KVM_CAP_SPLIT_IRQCHIP, 0, 24);
3516        if (ret) {
3517            error_report("Could not enable split irqchip mode: %s",
3518                         strerror(-ret));
3519            exit(1);
3520        } else {
3521            DPRINTF("Enabled KVM_CAP_SPLIT_IRQCHIP\n");
3522            kvm_split_irqchip = true;
3523            return 1;
3524        }
3525    } else {
3526        return 0;
3527    }
3528}
3529
3530/* Classic KVM device assignment interface. Will remain x86 only. */
3531int kvm_device_pci_assign(KVMState *s, PCIHostDeviceAddress *dev_addr,
3532                          uint32_t flags, uint32_t *dev_id)
3533{
3534    struct kvm_assigned_pci_dev dev_data = {
3535        .segnr = dev_addr->domain,
3536        .busnr = dev_addr->bus,
3537        .devfn = PCI_DEVFN(dev_addr->slot, dev_addr->function),
3538        .flags = flags,
3539    };
3540    int ret;
3541
3542    dev_data.assigned_dev_id =
3543        (dev_addr->domain << 16) | (dev_addr->bus << 8) | dev_data.devfn;
3544
3545    ret = kvm_vm_ioctl(s, KVM_ASSIGN_PCI_DEVICE, &dev_data);
3546    if (ret < 0) {
3547        return ret;
3548    }
3549
3550    *dev_id = dev_data.assigned_dev_id;
3551
3552    return 0;
3553}
3554
3555int kvm_device_pci_deassign(KVMState *s, uint32_t dev_id)
3556{
3557    struct kvm_assigned_pci_dev dev_data = {
3558        .assigned_dev_id = dev_id,
3559    };
3560
3561    return kvm_vm_ioctl(s, KVM_DEASSIGN_PCI_DEVICE, &dev_data);
3562}
3563
3564static int kvm_assign_irq_internal(KVMState *s, uint32_t dev_id,
3565                                   uint32_t irq_type, uint32_t guest_irq)
3566{
3567    struct kvm_assigned_irq assigned_irq = {
3568        .assigned_dev_id = dev_id,
3569        .guest_irq = guest_irq,
3570        .flags = irq_type,
3571    };
3572
3573    if (kvm_check_extension(s, KVM_CAP_ASSIGN_DEV_IRQ)) {
3574        return kvm_vm_ioctl(s, KVM_ASSIGN_DEV_IRQ, &assigned_irq);
3575    } else {
3576        return kvm_vm_ioctl(s, KVM_ASSIGN_IRQ, &assigned_irq);
3577    }
3578}
3579
3580int kvm_device_intx_assign(KVMState *s, uint32_t dev_id, bool use_host_msi,
3581                           uint32_t guest_irq)
3582{
3583    uint32_t irq_type = KVM_DEV_IRQ_GUEST_INTX |
3584        (use_host_msi ? KVM_DEV_IRQ_HOST_MSI : KVM_DEV_IRQ_HOST_INTX);
3585
3586    return kvm_assign_irq_internal(s, dev_id, irq_type, guest_irq);
3587}
3588
3589int kvm_device_intx_set_mask(KVMState *s, uint32_t dev_id, bool masked)
3590{
3591    struct kvm_assigned_pci_dev dev_data = {
3592        .assigned_dev_id = dev_id,
3593        .flags = masked ? KVM_DEV_ASSIGN_MASK_INTX : 0,
3594    };
3595
3596    return kvm_vm_ioctl(s, KVM_ASSIGN_SET_INTX_MASK, &dev_data);
3597}
3598
3599static int kvm_deassign_irq_internal(KVMState *s, uint32_t dev_id,
3600                                     uint32_t type)
3601{
3602    struct kvm_assigned_irq assigned_irq = {
3603        .assigned_dev_id = dev_id,
3604        .flags = type,
3605    };
3606
3607    return kvm_vm_ioctl(s, KVM_DEASSIGN_DEV_IRQ, &assigned_irq);
3608}
3609
3610int kvm_device_intx_deassign(KVMState *s, uint32_t dev_id, bool use_host_msi)
3611{
3612    return kvm_deassign_irq_internal(s, dev_id, KVM_DEV_IRQ_GUEST_INTX |
3613        (use_host_msi ? KVM_DEV_IRQ_HOST_MSI : KVM_DEV_IRQ_HOST_INTX));
3614}
3615
3616int kvm_device_msi_assign(KVMState *s, uint32_t dev_id, int virq)
3617{
3618    return kvm_assign_irq_internal(s, dev_id, KVM_DEV_IRQ_HOST_MSI |
3619                                              KVM_DEV_IRQ_GUEST_MSI, virq);
3620}
3621
3622int kvm_device_msi_deassign(KVMState *s, uint32_t dev_id)
3623{
3624    return kvm_deassign_irq_internal(s, dev_id, KVM_DEV_IRQ_GUEST_MSI |
3625                                                KVM_DEV_IRQ_HOST_MSI);
3626}
3627
3628bool kvm_device_msix_supported(KVMState *s)
3629{
3630    /* The kernel lacks a corresponding KVM_CAP, so we probe by calling
3631     * KVM_ASSIGN_SET_MSIX_NR with an invalid parameter. */
3632    return kvm_vm_ioctl(s, KVM_ASSIGN_SET_MSIX_NR, NULL) == -EFAULT;
3633}
3634
3635int kvm_device_msix_init_vectors(KVMState *s, uint32_t dev_id,
3636                                 uint32_t nr_vectors)
3637{
3638    struct kvm_assigned_msix_nr msix_nr = {
3639        .assigned_dev_id = dev_id,
3640        .entry_nr = nr_vectors,
3641    };
3642
3643    return kvm_vm_ioctl(s, KVM_ASSIGN_SET_MSIX_NR, &msix_nr);
3644}
3645
3646int kvm_device_msix_set_vector(KVMState *s, uint32_t dev_id, uint32_t vector,
3647                               int virq)
3648{
3649    struct kvm_assigned_msix_entry msix_entry = {
3650        .assigned_dev_id = dev_id,
3651        .gsi = virq,
3652        .entry = vector,
3653    };
3654
3655    return kvm_vm_ioctl(s, KVM_ASSIGN_SET_MSIX_ENTRY, &msix_entry);
3656}
3657
3658int kvm_device_msix_assign(KVMState *s, uint32_t dev_id)
3659{
3660    return kvm_assign_irq_internal(s, dev_id, KVM_DEV_IRQ_HOST_MSIX |
3661                                              KVM_DEV_IRQ_GUEST_MSIX, 0);
3662}
3663
3664int kvm_device_msix_deassign(KVMState *s, uint32_t dev_id)
3665{
3666    return kvm_deassign_irq_internal(s, dev_id, KVM_DEV_IRQ_GUEST_MSIX |
3667                                                KVM_DEV_IRQ_HOST_MSIX);
3668}
3669
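/*
 * If a vIOMMU with interrupt remapping is present, the MSI address/data
 * programmed by the guest must be translated through the IOMMU before the
 * route is installed in the KVM routing table.
 */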
3670int kvm_arch_fixup_msi_route(struct kvm_irq_routing_entry *route,
3671                             uint64_t address, uint32_t data, PCIDevice *dev)
3672{
3673    X86IOMMUState *iommu = x86_iommu_get_default();
3674
3675    if (iommu) {
3676        int ret;
3677        MSIMessage src, dst;
3678        X86IOMMUClass *class = X86_IOMMU_GET_CLASS(iommu);
3679
3680        src.address = route->u.msi.address_hi;
3681        src.address <<= VTD_MSI_ADDR_HI_SHIFT;
3682        src.address |= route->u.msi.address_lo;
3683        src.data = route->u.msi.data;
3684
3685        ret = class->int_remap(iommu, &src, &dst, dev ?
3686                               pci_requester_id(dev) :
3687                               X86_IOMMU_SID_INVALID);
3688        if (ret) {
3689            trace_kvm_x86_fixup_msi_error(route->gsi);
3690            return 1;
3691        }
3692
3693        route->u.msi.address_hi = dst.address >> VTD_MSI_ADDR_HI_SHIFT;
3694        route->u.msi.address_lo = dst.address & VTD_MSI_ADDR_LO_MASK;
3695        route->u.msi.data = dst.data;
3696    }
3697
3698    return 0;
3699}
3700
3701typedef struct MSIRouteEntry MSIRouteEntry;
3702
3703struct MSIRouteEntry {
3704    PCIDevice *dev;             /* Device pointer */
3705    int vector;                 /* MSI/MSIX vector index */
3706    int virq;                   /* Virtual IRQ index */
3707    QLIST_ENTRY(MSIRouteEntry) list;
3708};
3709
3710/* List of used GSI routes */
3711static QLIST_HEAD(, MSIRouteEntry) msi_route_list =
3712    QLIST_HEAD_INITIALIZER(msi_route_list);
3713
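/*
 * IEC (interrupt entry cache) invalidation notifier: re-translate and commit
 * every tracked MSI route, since the IOMMU's remapping entries may have
 * changed.
 */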
3714static void kvm_update_msi_routes_all(void *private, bool global,
3715                                      uint32_t index, uint32_t mask)
3716{
3717    int cnt = 0;
3718    MSIRouteEntry *entry;
3719    MSIMessage msg;
3720    PCIDevice *dev;
3721
3722    /* TODO: explicit route update */
3723    QLIST_FOREACH(entry, &msi_route_list, list) {
3724        cnt++;
3725        dev = entry->dev;
3726        if (!msix_enabled(dev) && !msi_enabled(dev)) {
3727            continue;
3728        }
3729        msg = pci_get_msi_message(dev, entry->vector);
3730        kvm_irqchip_update_msi_route(kvm_state, entry->virq, msg, dev);
3731    }
3732    kvm_irqchip_commit_routes(kvm_state);
3733    trace_kvm_x86_update_msi_routes(cnt);
3734}
3735
3736int kvm_arch_add_msi_route_post(struct kvm_irq_routing_entry *route,
3737                                int vector, PCIDevice *dev)
3738{
3739    static bool notify_list_inited = false;
3740    MSIRouteEntry *entry;
3741
3742    if (!dev) {
3743        /* Routes without a device are (most likely) IOAPIC routes used
3744         * only in split kernel-irqchip mode; we only keep track of
3745         * routes that belong to PCI devices. */
3746        return 0;
3747    }
3748
3749    entry = g_new0(MSIRouteEntry, 1);
3750    entry->dev = dev;
3751    entry->vector = vector;
3752    entry->virq = route->gsi;
3753    QLIST_INSERT_HEAD(&msi_route_list, entry, list);
3754
3755    trace_kvm_x86_add_msi_route(route->gsi);
3756
3757    if (!notify_list_inited) {
3758        /* The first time a route is added, register ourselves on the
3759         * IOMMU's IEC notifier list if needed. */
3760        X86IOMMUState *iommu = x86_iommu_get_default();
3761        if (iommu) {
3762            x86_iommu_iec_register_notifier(iommu,
3763                                            kvm_update_msi_routes_all,
3764                                            NULL);
3765        }
3766        notify_list_inited = true;
3767    }
3768    return 0;
3769}
3770
3771int kvm_arch_release_virq_post(int virq)
3772{
3773    MSIRouteEntry *entry, *next;
3774    QLIST_FOREACH_SAFE(entry, &msi_route_list, list, next) {
3775        if (entry->virq == virq) {
3776            trace_kvm_x86_remove_msi_route(virq);
3777            QLIST_REMOVE(entry, list);
3778            g_free(entry);
3779            break;
3780        }
3781    }
3782    return 0;
3783}
3784
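/*
 * On x86 the MSI data field does not map directly to a GSI; MSI routes are
 * always installed explicitly, so this should be unreachable.
 */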
3785int kvm_arch_msi_data_to_gsi(uint32_t data)
3786{
3787    abort();
3788}
3789