linux/arch/x86/kvm/x86.c
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Kernel-based Virtual Machine driver for Linux
   4 *
   5 * derived from drivers/kvm/kvm_main.c
   6 *
   7 * Copyright (C) 2006 Qumranet, Inc.
   8 * Copyright (C) 2008 Qumranet, Inc.
   9 * Copyright IBM Corporation, 2008
  10 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  11 *
  12 * Authors:
  13 *   Avi Kivity   <avi@qumranet.com>
  14 *   Yaniv Kamay  <yaniv@qumranet.com>
  15 *   Amit Shah    <amit.shah@qumranet.com>
  16 *   Ben-Ami Yassour <benami@il.ibm.com>
  17 */
  18
  19#include <linux/kvm_host.h>
  20#include "irq.h"
  21#include "ioapic.h"
  22#include "mmu.h"
  23#include "i8254.h"
  24#include "tss.h"
  25#include "kvm_cache_regs.h"
  26#include "kvm_emulate.h"
  27#include "x86.h"
  28#include "cpuid.h"
  29#include "pmu.h"
  30#include "hyperv.h"
  31#include "lapic.h"
  32
  33#include <linux/clocksource.h>
  34#include <linux/interrupt.h>
  35#include <linux/kvm.h>
  36#include <linux/fs.h>
  37#include <linux/vmalloc.h>
  38#include <linux/export.h>
  39#include <linux/moduleparam.h>
  40#include <linux/mman.h>
  41#include <linux/highmem.h>
  42#include <linux/iommu.h>
  43#include <linux/intel-iommu.h>
  44#include <linux/cpufreq.h>
  45#include <linux/user-return-notifier.h>
  46#include <linux/srcu.h>
  47#include <linux/slab.h>
  48#include <linux/perf_event.h>
  49#include <linux/uaccess.h>
  50#include <linux/hash.h>
  51#include <linux/pci.h>
  52#include <linux/timekeeper_internal.h>
  53#include <linux/pvclock_gtod.h>
  54#include <linux/kvm_irqfd.h>
  55#include <linux/irqbypass.h>
  56#include <linux/sched/stat.h>
  57#include <linux/sched/isolation.h>
  58#include <linux/mem_encrypt.h>
  59#include <linux/entry-kvm.h>
  60
  61#include <trace/events/kvm.h>
  62
  63#include <asm/debugreg.h>
  64#include <asm/msr.h>
  65#include <asm/desc.h>
  66#include <asm/mce.h>
  67#include <linux/kernel_stat.h>
  68#include <asm/fpu/internal.h> /* Ugh! */
  69#include <asm/pvclock.h>
  70#include <asm/div64.h>
  71#include <asm/irq_remapping.h>
  72#include <asm/mshyperv.h>
  73#include <asm/hypervisor.h>
  74#include <asm/tlbflush.h>
  75#include <asm/intel_pt.h>
  76#include <asm/emulate_prefix.h>
  77#include <clocksource/hyperv_timer.h>
  78
  79#define CREATE_TRACE_POINTS
  80#include "trace.h"
  81
  82#define MAX_IO_MSRS 256
  83#define KVM_MAX_MCE_BANKS 32
  84u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P;
  85EXPORT_SYMBOL_GPL(kvm_mce_cap_supported);
  86
  87#define emul_to_vcpu(ctxt) \
  88        ((struct kvm_vcpu *)(ctxt)->vcpu)
  89
  90/* EFER defaults:
   91 * - enable syscall by default because it's emulated by KVM
   92 * - enable LME and LMA by default on 64-bit KVM
  93 */
  94#ifdef CONFIG_X86_64
  95static
  96u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA));
  97#else
  98static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
  99#endif
 100
 101static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS;
 102
 103#define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \
 104                                    KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
 105
 106static void update_cr8_intercept(struct kvm_vcpu *vcpu);
 107static void process_nmi(struct kvm_vcpu *vcpu);
 108static void enter_smm(struct kvm_vcpu *vcpu);
 109static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
 110static void store_regs(struct kvm_vcpu *vcpu);
 111static int sync_regs(struct kvm_vcpu *vcpu);
 112
 113struct kvm_x86_ops kvm_x86_ops __read_mostly;
 114EXPORT_SYMBOL_GPL(kvm_x86_ops);
 115
 116static bool __read_mostly ignore_msrs = 0;
 117module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
 118
 119static bool __read_mostly report_ignored_msrs = true;
 120module_param(report_ignored_msrs, bool, S_IRUGO | S_IWUSR);
 121
 122unsigned int min_timer_period_us = 200;
 123module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
 124
 125static bool __read_mostly kvmclock_periodic_sync = true;
 126module_param(kvmclock_periodic_sync, bool, S_IRUGO);
 127
 128bool __read_mostly kvm_has_tsc_control;
 129EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
 130u32  __read_mostly kvm_max_guest_tsc_khz;
 131EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
 132u8   __read_mostly kvm_tsc_scaling_ratio_frac_bits;
 133EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits);
 134u64  __read_mostly kvm_max_tsc_scaling_ratio;
 135EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio);
 136u64 __read_mostly kvm_default_tsc_scaling_ratio;
 137EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio);
 138
 139/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
 140static u32 __read_mostly tsc_tolerance_ppm = 250;
 141module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
 142
 143/*
 144 * lapic timer advance (tscdeadline mode only) in nanoseconds.  '-1' enables
  145 * adaptive tuning starting from default advancement of 1000ns.  '0' disables
  146 * advancement entirely.  Any other value is used as-is and disables adaptive
  147 * tuning, i.e. allows privileged userspace to set an exact advancement time.
 148 */
 149static int __read_mostly lapic_timer_advance_ns = -1;
 150module_param(lapic_timer_advance_ns, int, S_IRUGO | S_IWUSR);
 151
 152static bool __read_mostly vector_hashing = true;
 153module_param(vector_hashing, bool, S_IRUGO);
 154
 155bool __read_mostly enable_vmware_backdoor = false;
 156module_param(enable_vmware_backdoor, bool, S_IRUGO);
 157EXPORT_SYMBOL_GPL(enable_vmware_backdoor);
 158
 159static bool __read_mostly force_emulation_prefix = false;
 160module_param(force_emulation_prefix, bool, S_IRUGO);
 161
 162int __read_mostly pi_inject_timer = -1;
 163module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR);
 164
 165/*
 166 * Restoring the host value for MSRs that are only consumed when running in
 167 * usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU
 168 * returns to userspace, i.e. the kernel can run with the guest's value.
 169 */
 170#define KVM_MAX_NR_USER_RETURN_MSRS 16
 171
 172struct kvm_user_return_msrs_global {
 173        int nr;
 174        u32 msrs[KVM_MAX_NR_USER_RETURN_MSRS];
 175};
 176
 177struct kvm_user_return_msrs {
 178        struct user_return_notifier urn;
 179        bool registered;
 180        struct kvm_user_return_msr_values {
 181                u64 host;
 182                u64 curr;
 183        } values[KVM_MAX_NR_USER_RETURN_MSRS];
 184};
 185
 186static struct kvm_user_return_msrs_global __read_mostly user_return_msrs_global;
 187static struct kvm_user_return_msrs __percpu *user_return_msrs;
 188
 189#define KVM_SUPPORTED_XCR0     (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
 190                                | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \
 191                                | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \
 192                                | XFEATURE_MASK_PKRU)
 193
 194u64 __read_mostly host_efer;
 195EXPORT_SYMBOL_GPL(host_efer);
 196
 197bool __read_mostly allow_smaller_maxphyaddr = 0;
 198EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr);
 199
 200static u64 __read_mostly host_xss;
 201u64 __read_mostly supported_xss;
 202EXPORT_SYMBOL_GPL(supported_xss);
 203
 204struct kvm_stats_debugfs_item debugfs_entries[] = {
 205        VCPU_STAT("pf_fixed", pf_fixed),
 206        VCPU_STAT("pf_guest", pf_guest),
 207        VCPU_STAT("tlb_flush", tlb_flush),
 208        VCPU_STAT("invlpg", invlpg),
 209        VCPU_STAT("exits", exits),
 210        VCPU_STAT("io_exits", io_exits),
 211        VCPU_STAT("mmio_exits", mmio_exits),
 212        VCPU_STAT("signal_exits", signal_exits),
 213        VCPU_STAT("irq_window", irq_window_exits),
 214        VCPU_STAT("nmi_window", nmi_window_exits),
 215        VCPU_STAT("halt_exits", halt_exits),
 216        VCPU_STAT("halt_successful_poll", halt_successful_poll),
 217        VCPU_STAT("halt_attempted_poll", halt_attempted_poll),
 218        VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
 219        VCPU_STAT("halt_wakeup", halt_wakeup),
 220        VCPU_STAT("hypercalls", hypercalls),
 221        VCPU_STAT("request_irq", request_irq_exits),
 222        VCPU_STAT("irq_exits", irq_exits),
 223        VCPU_STAT("host_state_reload", host_state_reload),
 224        VCPU_STAT("fpu_reload", fpu_reload),
 225        VCPU_STAT("insn_emulation", insn_emulation),
 226        VCPU_STAT("insn_emulation_fail", insn_emulation_fail),
 227        VCPU_STAT("irq_injections", irq_injections),
 228        VCPU_STAT("nmi_injections", nmi_injections),
 229        VCPU_STAT("req_event", req_event),
 230        VCPU_STAT("l1d_flush", l1d_flush),
 231        VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
 232        VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
 233        VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped),
 234        VM_STAT("mmu_pte_write", mmu_pte_write),
 235        VM_STAT("mmu_pte_updated", mmu_pte_updated),
 236        VM_STAT("mmu_pde_zapped", mmu_pde_zapped),
 237        VM_STAT("mmu_flooded", mmu_flooded),
 238        VM_STAT("mmu_recycled", mmu_recycled),
 239        VM_STAT("mmu_cache_miss", mmu_cache_miss),
 240        VM_STAT("mmu_unsync", mmu_unsync),
 241        VM_STAT("remote_tlb_flush", remote_tlb_flush),
 242        VM_STAT("largepages", lpages, .mode = 0444),
 243        VM_STAT("nx_largepages_splitted", nx_lpage_splits, .mode = 0444),
 244        VM_STAT("max_mmu_page_hash_collisions", max_mmu_page_hash_collisions),
 245        { NULL }
 246};
 247
 248u64 __read_mostly host_xcr0;
 249u64 __read_mostly supported_xcr0;
 250EXPORT_SYMBOL_GPL(supported_xcr0);
 251
 252static struct kmem_cache *x86_fpu_cache;
 253
 254static struct kmem_cache *x86_emulator_cache;
 255
 256/*
  257 * Called when the previous get/set of an MSR hit an invalid MSR.
  258 * Return true if this failed MSR access should be ignored/silenced.
 259 */
 260static bool kvm_msr_ignored_check(struct kvm_vcpu *vcpu, u32 msr,
 261                                  u64 data, bool write)
 262{
 263        const char *op = write ? "wrmsr" : "rdmsr";
 264
 265        if (ignore_msrs) {
 266                if (report_ignored_msrs)
 267                        kvm_pr_unimpl("ignored %s: 0x%x data 0x%llx\n",
 268                                      op, msr, data);
 269                /* Mask the error */
 270                return true;
 271        } else {
 272                kvm_debug_ratelimited("unhandled %s: 0x%x data 0x%llx\n",
 273                                      op, msr, data);
 274                return false;
 275        }
 276}
 277
 278static struct kmem_cache *kvm_alloc_emulator_cache(void)
 279{
 280        unsigned int useroffset = offsetof(struct x86_emulate_ctxt, src);
 281        unsigned int size = sizeof(struct x86_emulate_ctxt);
 282
 283        return kmem_cache_create_usercopy("x86_emulator", size,
 284                                          __alignof__(struct x86_emulate_ctxt),
 285                                          SLAB_ACCOUNT, useroffset,
 286                                          size - useroffset, NULL);
 287}
 288
 289static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
 290
 291static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
 292{
 293        int i;
 294        for (i = 0; i < ASYNC_PF_PER_VCPU; i++)
 295                vcpu->arch.apf.gfns[i] = ~0;
 296}
 297
 298static void kvm_on_user_return(struct user_return_notifier *urn)
 299{
 300        unsigned slot;
 301        struct kvm_user_return_msrs *msrs
 302                = container_of(urn, struct kvm_user_return_msrs, urn);
 303        struct kvm_user_return_msr_values *values;
 304        unsigned long flags;
 305
 306        /*
 307         * Disabling irqs at this point since the following code could be
 308         * interrupted and executed through kvm_arch_hardware_disable()
 309         */
 310        local_irq_save(flags);
 311        if (msrs->registered) {
 312                msrs->registered = false;
 313                user_return_notifier_unregister(urn);
 314        }
 315        local_irq_restore(flags);
 316        for (slot = 0; slot < user_return_msrs_global.nr; ++slot) {
 317                values = &msrs->values[slot];
 318                if (values->host != values->curr) {
 319                        wrmsrl(user_return_msrs_global.msrs[slot], values->host);
 320                        values->curr = values->host;
 321                }
 322        }
 323}
 324
 325void kvm_define_user_return_msr(unsigned slot, u32 msr)
 326{
 327        BUG_ON(slot >= KVM_MAX_NR_USER_RETURN_MSRS);
 328        user_return_msrs_global.msrs[slot] = msr;
 329        if (slot >= user_return_msrs_global.nr)
 330                user_return_msrs_global.nr = slot + 1;
 331}
 332EXPORT_SYMBOL_GPL(kvm_define_user_return_msr);
 333
 334static void kvm_user_return_msr_cpu_online(void)
 335{
 336        unsigned int cpu = smp_processor_id();
 337        struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
 338        u64 value;
 339        int i;
 340
 341        for (i = 0; i < user_return_msrs_global.nr; ++i) {
 342                rdmsrl_safe(user_return_msrs_global.msrs[i], &value);
 343                msrs->values[i].host = value;
 344                msrs->values[i].curr = value;
 345        }
 346}
 347
 348int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask)
 349{
 350        unsigned int cpu = smp_processor_id();
 351        struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
 352        int err;
 353
 354        value = (value & mask) | (msrs->values[slot].host & ~mask);
 355        if (value == msrs->values[slot].curr)
 356                return 0;
 357        err = wrmsrl_safe(user_return_msrs_global.msrs[slot], value);
 358        if (err)
 359                return 1;
 360
 361        msrs->values[slot].curr = value;
 362        if (!msrs->registered) {
 363                msrs->urn.on_user_return = kvm_on_user_return;
 364                user_return_notifier_register(&msrs->urn);
 365                msrs->registered = true;
 366        }
 367        return 0;
 368}
 369EXPORT_SYMBOL_GPL(kvm_set_user_return_msr);
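/*
 * Illustrative sketch only (not part of this file): a vendor module is
 * expected to claim a slot once at setup time and then load the guest
 * value lazily before entering the guest, e.g.
 *
 *	kvm_define_user_return_msr(0, MSR_TSC_AUX);
 *	...
 *	kvm_set_user_return_msr(0, guest_val, -1ull);
 *
 * The slot number, MSR and mask above are examples only.  The host value
 * is restored by kvm_on_user_return() when the CPU next returns to
 * userspace, or by drop_user_return_notifiers() when hardware is
 * disabled.
 */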
 370
 371static void drop_user_return_notifiers(void)
 372{
 373        unsigned int cpu = smp_processor_id();
 374        struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
 375
 376        if (msrs->registered)
 377                kvm_on_user_return(&msrs->urn);
 378}
 379
 380u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
 381{
 382        return vcpu->arch.apic_base;
 383}
 384EXPORT_SYMBOL_GPL(kvm_get_apic_base);
 385
 386enum lapic_mode kvm_get_apic_mode(struct kvm_vcpu *vcpu)
 387{
 388        return kvm_apic_mode(kvm_get_apic_base(vcpu));
 389}
 390EXPORT_SYMBOL_GPL(kvm_get_apic_mode);
 391
 392int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 393{
 394        enum lapic_mode old_mode = kvm_get_apic_mode(vcpu);
 395        enum lapic_mode new_mode = kvm_apic_mode(msr_info->data);
 396        u64 reserved_bits = ((~0ULL) << cpuid_maxphyaddr(vcpu)) | 0x2ff |
 397                (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) ? 0 : X2APIC_ENABLE);
 398
 399        if ((msr_info->data & reserved_bits) != 0 || new_mode == LAPIC_MODE_INVALID)
 400                return 1;
 401        if (!msr_info->host_initiated) {
 402                if (old_mode == LAPIC_MODE_X2APIC && new_mode == LAPIC_MODE_XAPIC)
 403                        return 1;
 404                if (old_mode == LAPIC_MODE_DISABLED && new_mode == LAPIC_MODE_X2APIC)
 405                        return 1;
 406        }
 407
 408        kvm_lapic_set_base(vcpu, msr_info->data);
 409        kvm_recalculate_apic_map(vcpu->kvm);
 410        return 0;
 411}
 412EXPORT_SYMBOL_GPL(kvm_set_apic_base);
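/*
 * Example of the reserved-bits check above (illustrative values): with
 * cpuid_maxphyaddr() == 36 and no x2APIC bit in guest CPUID, the mask is
 * (~0ULL << 36) | 0x2ff | X2APIC_ENABLE, so bits 8 (BSP) and 11 (APIC
 * global enable) remain writable, while an attempt to set bit 10
 * (x2APIC enable) or any bit above the physical address width is
 * rejected.
 */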
 413
 414asmlinkage __visible noinstr void kvm_spurious_fault(void)
 415{
 416        /* Fault while not rebooting.  We want the trace. */
 417        BUG_ON(!kvm_rebooting);
 418}
 419EXPORT_SYMBOL_GPL(kvm_spurious_fault);
 420
 421#define EXCPT_BENIGN            0
 422#define EXCPT_CONTRIBUTORY      1
 423#define EXCPT_PF                2
 424
 425static int exception_class(int vector)
 426{
 427        switch (vector) {
 428        case PF_VECTOR:
 429                return EXCPT_PF;
 430        case DE_VECTOR:
 431        case TS_VECTOR:
 432        case NP_VECTOR:
 433        case SS_VECTOR:
 434        case GP_VECTOR:
 435                return EXCPT_CONTRIBUTORY;
 436        default:
 437                break;
 438        }
 439        return EXCPT_BENIGN;
 440}
 441
 442#define EXCPT_FAULT             0
 443#define EXCPT_TRAP              1
 444#define EXCPT_ABORT             2
 445#define EXCPT_INTERRUPT         3
 446
 447static int exception_type(int vector)
 448{
 449        unsigned int mask;
 450
 451        if (WARN_ON(vector > 31 || vector == NMI_VECTOR))
 452                return EXCPT_INTERRUPT;
 453
 454        mask = 1 << vector;
 455
  456        /* #DB is a trap, as instruction watchpoints are handled elsewhere */
 457        if (mask & ((1 << DB_VECTOR) | (1 << BP_VECTOR) | (1 << OF_VECTOR)))
 458                return EXCPT_TRAP;
 459
 460        if (mask & ((1 << DF_VECTOR) | (1 << MC_VECTOR)))
 461                return EXCPT_ABORT;
 462
  463        /* Reserved exceptions will result in a fault */
 464        return EXCPT_FAULT;
 465}
 466
 467void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu)
 468{
 469        unsigned nr = vcpu->arch.exception.nr;
 470        bool has_payload = vcpu->arch.exception.has_payload;
 471        unsigned long payload = vcpu->arch.exception.payload;
 472
 473        if (!has_payload)
 474                return;
 475
 476        switch (nr) {
 477        case DB_VECTOR:
 478                /*
  479                 * "Certain debug exceptions may clear bits 0-3.  The
 480                 * remaining contents of the DR6 register are never
 481                 * cleared by the processor".
 482                 */
 483                vcpu->arch.dr6 &= ~DR_TRAP_BITS;
 484                /*
 485                 * DR6.RTM is set by all #DB exceptions that don't clear it.
 486                 */
 487                vcpu->arch.dr6 |= DR6_RTM;
 488                vcpu->arch.dr6 |= payload;
 489                /*
 490                 * Bit 16 should be set in the payload whenever the #DB
 491                 * exception should clear DR6.RTM. This makes the payload
 492                 * compatible with the pending debug exceptions under VMX.
 493                 * Though not currently documented in the SDM, this also
 494                 * makes the payload compatible with the exit qualification
 495                 * for #DB exceptions under VMX.
 496                 */
 497                vcpu->arch.dr6 ^= payload & DR6_RTM;
 498
 499                /*
 500                 * The #DB payload is defined as compatible with the 'pending
 501                 * debug exceptions' field under VMX, not DR6. While bit 12 is
 502                 * defined in the 'pending debug exceptions' field (enabled
 503                 * breakpoint), it is reserved and must be zero in DR6.
 504                 */
 505                vcpu->arch.dr6 &= ~BIT(12);
 506                break;
 507        case PF_VECTOR:
 508                vcpu->arch.cr2 = payload;
 509                break;
 510        }
 511
 512        vcpu->arch.exception.has_payload = false;
 513        vcpu->arch.exception.payload = 0;
 514}
 515EXPORT_SYMBOL_GPL(kvm_deliver_exception_payload);
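/*
 * Worked example (illustrative): for a single-step trap the #DB payload
 * carries just DR6_BS and leaves bit 16 clear.  The code above then
 * clears any stale B0-B3 bits, sets DR6.RTM, ORs in BS, and the final
 * "^= payload & DR6_RTM" is a no-op, so DR6 ends up with RTM and BS set
 * exactly as hardware reports a single-step #DB outside an RTM region.
 * A payload with bit 16 set would instead clear DR6.RTM, matching the
 * VMX "pending debug exceptions" encoding.
 */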
 516
 517static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
 518                unsigned nr, bool has_error, u32 error_code,
 519                bool has_payload, unsigned long payload, bool reinject)
 520{
 521        u32 prev_nr;
 522        int class1, class2;
 523
 524        kvm_make_request(KVM_REQ_EVENT, vcpu);
 525
 526        if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) {
 527        queue:
 528                if (has_error && !is_protmode(vcpu))
 529                        has_error = false;
 530                if (reinject) {
 531                        /*
 532                         * On vmentry, vcpu->arch.exception.pending is only
 533                         * true if an event injection was blocked by
 534                         * nested_run_pending.  In that case, however,
 535                         * vcpu_enter_guest requests an immediate exit,
 536                         * and the guest shouldn't proceed far enough to
 537                         * need reinjection.
 538                         */
 539                        WARN_ON_ONCE(vcpu->arch.exception.pending);
 540                        vcpu->arch.exception.injected = true;
 541                        if (WARN_ON_ONCE(has_payload)) {
 542                                /*
 543                                 * A reinjected event has already
 544                                 * delivered its payload.
 545                                 */
 546                                has_payload = false;
 547                                payload = 0;
 548                        }
 549                } else {
 550                        vcpu->arch.exception.pending = true;
 551                        vcpu->arch.exception.injected = false;
 552                }
 553                vcpu->arch.exception.has_error_code = has_error;
 554                vcpu->arch.exception.nr = nr;
 555                vcpu->arch.exception.error_code = error_code;
 556                vcpu->arch.exception.has_payload = has_payload;
 557                vcpu->arch.exception.payload = payload;
 558                if (!is_guest_mode(vcpu))
 559                        kvm_deliver_exception_payload(vcpu);
 560                return;
 561        }
 562
  563        /* An exception is already pending or injected; see how the new one combines. */
 564        prev_nr = vcpu->arch.exception.nr;
 565        if (prev_nr == DF_VECTOR) {
 566                /* triple fault -> shutdown */
 567                kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
 568                return;
 569        }
 570        class1 = exception_class(prev_nr);
 571        class2 = exception_class(nr);
 572        if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
 573                || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
 574                /*
 575                 * Generate double fault per SDM Table 5-5.  Set
 576                 * exception.pending = true so that the double fault
 577                 * can trigger a nested vmexit.
 578                 */
 579                vcpu->arch.exception.pending = true;
 580                vcpu->arch.exception.injected = false;
 581                vcpu->arch.exception.has_error_code = true;
 582                vcpu->arch.exception.nr = DF_VECTOR;
 583                vcpu->arch.exception.error_code = 0;
 584                vcpu->arch.exception.has_payload = false;
 585                vcpu->arch.exception.payload = 0;
 586        } else
  587                /* replace the previous exception with the new one in the
  588                   hope that instruction re-execution will regenerate the
  589                   lost exception */
 590                goto queue;
 591}
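/*
 * Illustrative examples of the classification above: a #GP raised while
 * a #PF is still pending escalates to #DF (class1 == EXCPT_PF, class2
 * contributory), whereas a #PF raised while a #GP is pending takes the
 * "goto queue" path and simply replaces the #GP, matching the
 * serially-handled cases of SDM Table 5-5.
 */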
 592
 593void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
 594{
 595        kvm_multiple_exception(vcpu, nr, false, 0, false, 0, false);
 596}
 597EXPORT_SYMBOL_GPL(kvm_queue_exception);
 598
 599void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
 600{
 601        kvm_multiple_exception(vcpu, nr, false, 0, false, 0, true);
 602}
 603EXPORT_SYMBOL_GPL(kvm_requeue_exception);
 604
 605void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr,
 606                           unsigned long payload)
 607{
 608        kvm_multiple_exception(vcpu, nr, false, 0, true, payload, false);
 609}
 610EXPORT_SYMBOL_GPL(kvm_queue_exception_p);
 611
 612static void kvm_queue_exception_e_p(struct kvm_vcpu *vcpu, unsigned nr,
 613                                    u32 error_code, unsigned long payload)
 614{
 615        kvm_multiple_exception(vcpu, nr, true, error_code,
 616                               true, payload, false);
 617}
 618
 619int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
 620{
 621        if (err)
 622                kvm_inject_gp(vcpu, 0);
 623        else
 624                return kvm_skip_emulated_instruction(vcpu);
 625
 626        return 1;
 627}
 628EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
 629
 630void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
 631{
 632        ++vcpu->stat.pf_guest;
 633        vcpu->arch.exception.nested_apf =
 634                is_guest_mode(vcpu) && fault->async_page_fault;
 635        if (vcpu->arch.exception.nested_apf) {
 636                vcpu->arch.apf.nested_apf_token = fault->address;
 637                kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
 638        } else {
 639                kvm_queue_exception_e_p(vcpu, PF_VECTOR, fault->error_code,
 640                                        fault->address);
 641        }
 642}
 643EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
 644
 645bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
 646                                    struct x86_exception *fault)
 647{
 648        struct kvm_mmu *fault_mmu;
 649        WARN_ON_ONCE(fault->vector != PF_VECTOR);
 650
 651        fault_mmu = fault->nested_page_fault ? vcpu->arch.mmu :
 652                                               vcpu->arch.walk_mmu;
 653
 654        /*
 655         * Invalidate the TLB entry for the faulting address, if it exists,
  656         * else the access will fault indefinitely (this also emulates hardware).
 657         */
 658        if ((fault->error_code & PFERR_PRESENT_MASK) &&
 659            !(fault->error_code & PFERR_RSVD_MASK))
 660                kvm_mmu_invalidate_gva(vcpu, fault_mmu, fault->address,
 661                                       fault_mmu->root_hpa);
 662
 663        fault_mmu->inject_page_fault(vcpu, fault);
 664        return fault->nested_page_fault;
 665}
 666EXPORT_SYMBOL_GPL(kvm_inject_emulated_page_fault);
 667
 668void kvm_inject_nmi(struct kvm_vcpu *vcpu)
 669{
 670        atomic_inc(&vcpu->arch.nmi_queued);
 671        kvm_make_request(KVM_REQ_NMI, vcpu);
 672}
 673EXPORT_SYMBOL_GPL(kvm_inject_nmi);
 674
 675void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
 676{
 677        kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, false);
 678}
 679EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
 680
 681void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
 682{
 683        kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, true);
 684}
 685EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);
 686
 687/*
 688 * Checks if cpl <= required_cpl; if true, return true.  Otherwise queue
 689 * a #GP and return false.
 690 */
 691bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
 692{
 693        if (kvm_x86_ops.get_cpl(vcpu) <= required_cpl)
 694                return true;
 695        kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
 696        return false;
 697}
 698EXPORT_SYMBOL_GPL(kvm_require_cpl);
 699
 700bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr)
 701{
 702        if ((dr != 4 && dr != 5) || !kvm_read_cr4_bits(vcpu, X86_CR4_DE))
 703                return true;
 704
 705        kvm_queue_exception(vcpu, UD_VECTOR);
 706        return false;
 707}
 708EXPORT_SYMBOL_GPL(kvm_require_dr);
 709
 710/*
  711 * This function is used to read from the physical memory of the currently
  712 * running guest.  Unlike kvm_vcpu_read_guest_page, it can read either from
  713 * the guest's physical memory or from a nested guest's physical memory.
 714 */
 715int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 716                            gfn_t ngfn, void *data, int offset, int len,
 717                            u32 access)
 718{
 719        struct x86_exception exception;
 720        gfn_t real_gfn;
 721        gpa_t ngpa;
 722
 723        ngpa     = gfn_to_gpa(ngfn);
 724        real_gfn = mmu->translate_gpa(vcpu, ngpa, access, &exception);
 725        if (real_gfn == UNMAPPED_GVA)
 726                return -EFAULT;
 727
 728        real_gfn = gpa_to_gfn(real_gfn);
 729
 730        return kvm_vcpu_read_guest_page(vcpu, real_gfn, data, offset, len);
 731}
 732EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
 733
 734static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
 735                               void *data, int offset, int len, u32 access)
 736{
 737        return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn,
 738                                       data, offset, len, access);
 739}
 740
 741static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu)
 742{
 743        return rsvd_bits(cpuid_maxphyaddr(vcpu), 63) | rsvd_bits(5, 8) |
 744               rsvd_bits(1, 2);
 745}
 746
 747/*
 748 * Load the pae pdptrs.  Return 1 if they are all valid, 0 otherwise.
 749 */
 750int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
 751{
 752        gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
 753        unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
 754        int i;
 755        int ret;
 756        u64 pdpte[ARRAY_SIZE(mmu->pdptrs)];
 757
 758        ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte,
 759                                      offset * sizeof(u64), sizeof(pdpte),
 760                                      PFERR_USER_MASK|PFERR_WRITE_MASK);
 761        if (ret < 0) {
 762                ret = 0;
 763                goto out;
 764        }
 765        for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
 766                if ((pdpte[i] & PT_PRESENT_MASK) &&
 767                    (pdpte[i] & pdptr_rsvd_bits(vcpu))) {
 768                        ret = 0;
 769                        goto out;
 770                }
 771        }
 772        ret = 1;
 773
 774        memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
 775        kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
 776
 777out:
 778
 779        return ret;
 780}
 781EXPORT_SYMBOL_GPL(load_pdptrs);
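/*
 * Example of the offset arithmetic above (illustrative value): with
 * cr3 == 0x7f3000a0, pdpt_gfn is 0x7f300 and cr3 & 0xfff is 0xa0, so
 * offset becomes (0xa0 >> 5) << 2 == 20 u64 entries, i.e. 20 * 8 ==
 * 0xa0 bytes, the 32-byte-aligned PDPT slot selected by CR3[11:5].
 */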
 782
 783bool pdptrs_changed(struct kvm_vcpu *vcpu)
 784{
 785        u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)];
 786        int offset;
 787        gfn_t gfn;
 788        int r;
 789
 790        if (!is_pae_paging(vcpu))
 791                return false;
 792
 793        if (!kvm_register_is_available(vcpu, VCPU_EXREG_PDPTR))
 794                return true;
 795
 796        gfn = (kvm_read_cr3(vcpu) & 0xffffffe0ul) >> PAGE_SHIFT;
 797        offset = (kvm_read_cr3(vcpu) & 0xffffffe0ul) & (PAGE_SIZE - 1);
 798        r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
 799                                       PFERR_USER_MASK | PFERR_WRITE_MASK);
 800        if (r < 0)
 801                return true;
 802
 803        return memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
 804}
 805EXPORT_SYMBOL_GPL(pdptrs_changed);
 806
 807int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 808{
 809        unsigned long old_cr0 = kvm_read_cr0(vcpu);
 810        unsigned long pdptr_bits = X86_CR0_CD | X86_CR0_NW | X86_CR0_PG;
 811        unsigned long update_bits = X86_CR0_PG | X86_CR0_WP;
 812
 813        cr0 |= X86_CR0_ET;
 814
 815#ifdef CONFIG_X86_64
 816        if (cr0 & 0xffffffff00000000UL)
 817                return 1;
 818#endif
 819
 820        cr0 &= ~CR0_RESERVED_BITS;
 821
 822        if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
 823                return 1;
 824
 825        if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
 826                return 1;
 827
 828#ifdef CONFIG_X86_64
 829        if ((vcpu->arch.efer & EFER_LME) && !is_paging(vcpu) &&
 830            (cr0 & X86_CR0_PG)) {
 831                int cs_db, cs_l;
 832
 833                if (!is_pae(vcpu))
 834                        return 1;
 835                kvm_x86_ops.get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
 836                if (cs_l)
 837                        return 1;
 838        }
 839#endif
 840        if (!(vcpu->arch.efer & EFER_LME) && (cr0 & X86_CR0_PG) &&
 841            is_pae(vcpu) && ((cr0 ^ old_cr0) & pdptr_bits) &&
 842            !load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)))
 843                return 1;
 844
 845        if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
 846                return 1;
 847
 848        kvm_x86_ops.set_cr0(vcpu, cr0);
 849
 850        if ((cr0 ^ old_cr0) & X86_CR0_PG) {
 851                kvm_clear_async_pf_completion_queue(vcpu);
 852                kvm_async_pf_hash_reset(vcpu);
 853        }
 854
 855        if ((cr0 ^ old_cr0) & update_bits)
 856                kvm_mmu_reset_context(vcpu);
 857
 858        if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
 859            kvm_arch_has_noncoherent_dma(vcpu->kvm) &&
 860            !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
 861                kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);
 862
 863        return 0;
 864}
 865EXPORT_SYMBOL_GPL(kvm_set_cr0);
 866
 867void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
 868{
 869        (void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
 870}
 871EXPORT_SYMBOL_GPL(kvm_lmsw);
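/*
 * Note on the masking above: LMSW can only modify CR0[3:0] and can set
 * but never clear CR0.PE; keeping the old CR0 with only MP/EM/TS (0x0e)
 * masked out and then ORing in (msw & 0x0f) preserves exactly that
 * architectural behaviour.
 */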
 872
 873void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
 874{
 875        if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) {
 876
 877                if (vcpu->arch.xcr0 != host_xcr0)
 878                        xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
 879
 880                if (vcpu->arch.xsaves_enabled &&
 881                    vcpu->arch.ia32_xss != host_xss)
 882                        wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss);
 883        }
 884
 885        if (static_cpu_has(X86_FEATURE_PKU) &&
 886            (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) ||
 887             (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU)) &&
 888            vcpu->arch.pkru != vcpu->arch.host_pkru)
 889                __write_pkru(vcpu->arch.pkru);
 890}
 891EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state);
 892
 893void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
 894{
 895        if (static_cpu_has(X86_FEATURE_PKU) &&
 896            (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) ||
 897             (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU))) {
 898                vcpu->arch.pkru = rdpkru();
 899                if (vcpu->arch.pkru != vcpu->arch.host_pkru)
 900                        __write_pkru(vcpu->arch.host_pkru);
 901        }
 902
 903        if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) {
 904
 905                if (vcpu->arch.xcr0 != host_xcr0)
 906                        xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
 907
 908                if (vcpu->arch.xsaves_enabled &&
 909                    vcpu->arch.ia32_xss != host_xss)
 910                        wrmsrl(MSR_IA32_XSS, host_xss);
 911        }
 912
 913}
 914EXPORT_SYMBOL_GPL(kvm_load_host_xsave_state);
 915
 916static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
 917{
 918        u64 xcr0 = xcr;
 919        u64 old_xcr0 = vcpu->arch.xcr0;
 920        u64 valid_bits;
 921
 922        /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now  */
 923        if (index != XCR_XFEATURE_ENABLED_MASK)
 924                return 1;
 925        if (!(xcr0 & XFEATURE_MASK_FP))
 926                return 1;
 927        if ((xcr0 & XFEATURE_MASK_YMM) && !(xcr0 & XFEATURE_MASK_SSE))
 928                return 1;
 929
 930        /*
 931         * Do not allow the guest to set bits that we do not support
 932         * saving.  However, xcr0 bit 0 is always set, even if the
 933         * emulated CPU does not support XSAVE (see fx_init).
 934         */
 935        valid_bits = vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FP;
 936        if (xcr0 & ~valid_bits)
 937                return 1;
 938
 939        if ((!(xcr0 & XFEATURE_MASK_BNDREGS)) !=
 940            (!(xcr0 & XFEATURE_MASK_BNDCSR)))
 941                return 1;
 942
 943        if (xcr0 & XFEATURE_MASK_AVX512) {
 944                if (!(xcr0 & XFEATURE_MASK_YMM))
 945                        return 1;
 946                if ((xcr0 & XFEATURE_MASK_AVX512) != XFEATURE_MASK_AVX512)
 947                        return 1;
 948        }
 949        vcpu->arch.xcr0 = xcr0;
 950
 951        if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND)
 952                kvm_update_cpuid_runtime(vcpu);
 953        return 0;
 954}
 955
 956int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
 957{
 958        if (kvm_x86_ops.get_cpl(vcpu) != 0 ||
 959            __kvm_set_xcr(vcpu, index, xcr)) {
 960                kvm_inject_gp(vcpu, 0);
 961                return 1;
 962        }
 963        return 0;
 964}
 965EXPORT_SYMBOL_GPL(kvm_set_xcr);
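/*
 * Illustrative examples of the __kvm_set_xcr() checks: XCR0 = FP|SSE|YMM
 * is accepted provided those bits are in guest_supported_xcr0; XCR0 =
 * FP|YMM without SSE, BNDREGS without BNDCSR, only part of the AVX512
 * component set, or AVX512 without YMM all fail, and a guest-initiated
 * XSETBV then takes a #GP.
 */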
 966
 967int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 968{
 969        if (cr4 & cr4_reserved_bits)
 970                return -EINVAL;
 971
 972        if (cr4 & vcpu->arch.cr4_guest_rsvd_bits)
 973                return -EINVAL;
 974
 975        return 0;
 976}
 977EXPORT_SYMBOL_GPL(kvm_valid_cr4);
 978
 979int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 980{
 981        unsigned long old_cr4 = kvm_read_cr4(vcpu);
 982        unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE |
 983                                   X86_CR4_SMEP;
 984        unsigned long mmu_role_bits = pdptr_bits | X86_CR4_SMAP | X86_CR4_PKE;
 985
 986        if (kvm_valid_cr4(vcpu, cr4))
 987                return 1;
 988
 989        if (is_long_mode(vcpu)) {
 990                if (!(cr4 & X86_CR4_PAE))
 991                        return 1;
 992                if ((cr4 ^ old_cr4) & X86_CR4_LA57)
 993                        return 1;
 994        } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
 995                   && ((cr4 ^ old_cr4) & pdptr_bits)
 996                   && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
 997                                   kvm_read_cr3(vcpu)))
 998                return 1;
 999
1000        if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) {
1001                if (!guest_cpuid_has(vcpu, X86_FEATURE_PCID))
1002                        return 1;
1003
 1004                /* PCID cannot be enabled when CR3[11:0] != 000H or EFER.LMA = 0 */
1005                if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
1006                        return 1;
1007        }
1008
1009        if (kvm_x86_ops.set_cr4(vcpu, cr4))
1010                return 1;
1011
1012        if (((cr4 ^ old_cr4) & mmu_role_bits) ||
1013            (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
1014                kvm_mmu_reset_context(vcpu);
1015
1016        if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
1017                kvm_update_cpuid_runtime(vcpu);
1018
1019        return 0;
1020}
1021EXPORT_SYMBOL_GPL(kvm_set_cr4);
1022
1023int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
1024{
1025        bool skip_tlb_flush = false;
1026#ifdef CONFIG_X86_64
1027        bool pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
1028
1029        if (pcid_enabled) {
1030                skip_tlb_flush = cr3 & X86_CR3_PCID_NOFLUSH;
1031                cr3 &= ~X86_CR3_PCID_NOFLUSH;
1032        }
1033#endif
1034
1035        if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
1036                if (!skip_tlb_flush) {
1037                        kvm_mmu_sync_roots(vcpu);
1038                        kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
1039                }
1040                return 0;
1041        }
1042
1043        if (is_long_mode(vcpu) &&
1044            (cr3 & vcpu->arch.cr3_lm_rsvd_bits))
1045                return 1;
1046        else if (is_pae_paging(vcpu) &&
1047                 !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
1048                return 1;
1049
1050        kvm_mmu_new_pgd(vcpu, cr3, skip_tlb_flush, skip_tlb_flush);
1051        vcpu->arch.cr3 = cr3;
1052        kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
1053
1054        return 0;
1055}
1056EXPORT_SYMBOL_GPL(kvm_set_cr3);
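/*
 * Note (64-bit only): when CR4.PCIDE is set, a MOV to CR3 with bit 63
 * (X86_CR3_PCID_NOFLUSH) set requests the switch without a TLB flush;
 * the code above latches that request into skip_tlb_flush and strips
 * the bit before the value is compared against and stored into
 * vcpu->arch.cr3.
 */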
1057
1058int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
1059{
1060        if (cr8 & CR8_RESERVED_BITS)
1061                return 1;
1062        if (lapic_in_kernel(vcpu))
1063                kvm_lapic_set_tpr(vcpu, cr8);
1064        else
1065                vcpu->arch.cr8 = cr8;
1066        return 0;
1067}
1068EXPORT_SYMBOL_GPL(kvm_set_cr8);
1069
1070unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
1071{
1072        if (lapic_in_kernel(vcpu))
1073                return kvm_lapic_get_cr8(vcpu);
1074        else
1075                return vcpu->arch.cr8;
1076}
1077EXPORT_SYMBOL_GPL(kvm_get_cr8);
1078
1079static void kvm_update_dr0123(struct kvm_vcpu *vcpu)
1080{
1081        int i;
1082
1083        if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
1084                for (i = 0; i < KVM_NR_DB_REGS; i++)
1085                        vcpu->arch.eff_db[i] = vcpu->arch.db[i];
1086                vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD;
1087        }
1088}
1089
1090void kvm_update_dr7(struct kvm_vcpu *vcpu)
1091{
1092        unsigned long dr7;
1093
1094        if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1095                dr7 = vcpu->arch.guest_debug_dr7;
1096        else
1097                dr7 = vcpu->arch.dr7;
1098        kvm_x86_ops.set_dr7(vcpu, dr7);
1099        vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED;
1100        if (dr7 & DR7_BP_EN_MASK)
1101                vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
1102}
1103EXPORT_SYMBOL_GPL(kvm_update_dr7);
1104
1105static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
1106{
1107        u64 fixed = DR6_FIXED_1;
1108
1109        if (!guest_cpuid_has(vcpu, X86_FEATURE_RTM))
1110                fixed |= DR6_RTM;
1111        return fixed;
1112}
1113
1114static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
1115{
1116        size_t size = ARRAY_SIZE(vcpu->arch.db);
1117
1118        switch (dr) {
1119        case 0 ... 3:
1120                vcpu->arch.db[array_index_nospec(dr, size)] = val;
1121                if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
1122                        vcpu->arch.eff_db[dr] = val;
1123                break;
1124        case 4:
1125        case 6:
1126                if (!kvm_dr6_valid(val))
1127                        return -1; /* #GP */
1128                vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu);
1129                break;
1130        case 5:
1131        default: /* 7 */
1132                if (!kvm_dr7_valid(val))
1133                        return -1; /* #GP */
1134                vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
1135                kvm_update_dr7(vcpu);
1136                break;
1137        }
1138
1139        return 0;
1140}
1141
1142int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
1143{
1144        if (__kvm_set_dr(vcpu, dr, val)) {
1145                kvm_inject_gp(vcpu, 0);
1146                return 1;
1147        }
1148        return 0;
1149}
1150EXPORT_SYMBOL_GPL(kvm_set_dr);
1151
1152int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
1153{
1154        size_t size = ARRAY_SIZE(vcpu->arch.db);
1155
1156        switch (dr) {
1157        case 0 ... 3:
1158                *val = vcpu->arch.db[array_index_nospec(dr, size)];
1159                break;
1160        case 4:
1161        case 6:
1162                *val = vcpu->arch.dr6;
1163                break;
1164        case 5:
1165        default: /* 7 */
1166                *val = vcpu->arch.dr7;
1167                break;
1168        }
1169        return 0;
1170}
1171EXPORT_SYMBOL_GPL(kvm_get_dr);
1172
1173bool kvm_rdpmc(struct kvm_vcpu *vcpu)
1174{
1175        u32 ecx = kvm_rcx_read(vcpu);
1176        u64 data;
1177        int err;
1178
1179        err = kvm_pmu_rdpmc(vcpu, ecx, &data);
1180        if (err)
1181                return err;
1182        kvm_rax_write(vcpu, (u32)data);
1183        kvm_rdx_write(vcpu, data >> 32);
1184        return err;
1185}
1186EXPORT_SYMBOL_GPL(kvm_rdpmc);
1187
1188/*
1189 * List of msr numbers which we expose to userspace through KVM_GET_MSRS
1190 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
1191 *
 1192 * The three MSR lists (msrs_to_save, emulated_msrs, msr_based_features)
 1193 * extract the supported MSRs from the related const lists.
 1194 * msrs_to_save is selected from msrs_to_save_all to reflect the
 1195 * capabilities of the host CPU. This capabilities test skips MSRs that are
 1196 * KVM-specific. Those are put in emulated_msrs_all; filtering of emulated_msrs
 1197 * may depend on host virtualization features rather than host CPU features.
1198 */
1199
1200static const u32 msrs_to_save_all[] = {
1201        MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
1202        MSR_STAR,
1203#ifdef CONFIG_X86_64
1204        MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
1205#endif
1206        MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
1207        MSR_IA32_FEAT_CTL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
1208        MSR_IA32_SPEC_CTRL,
1209        MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH,
1210        MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK,
1211        MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B,
1212        MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B,
1213        MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B,
1214        MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B,
1215        MSR_IA32_UMWAIT_CONTROL,
1216
1217        MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1,
1218        MSR_ARCH_PERFMON_FIXED_CTR0 + 2, MSR_ARCH_PERFMON_FIXED_CTR0 + 3,
1219        MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS,
1220        MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL,
1221        MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1,
1222        MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3,
1223        MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5,
1224        MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7,
1225        MSR_ARCH_PERFMON_PERFCTR0 + 8, MSR_ARCH_PERFMON_PERFCTR0 + 9,
1226        MSR_ARCH_PERFMON_PERFCTR0 + 10, MSR_ARCH_PERFMON_PERFCTR0 + 11,
1227        MSR_ARCH_PERFMON_PERFCTR0 + 12, MSR_ARCH_PERFMON_PERFCTR0 + 13,
1228        MSR_ARCH_PERFMON_PERFCTR0 + 14, MSR_ARCH_PERFMON_PERFCTR0 + 15,
1229        MSR_ARCH_PERFMON_PERFCTR0 + 16, MSR_ARCH_PERFMON_PERFCTR0 + 17,
1230        MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1,
1231        MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3,
1232        MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5,
1233        MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7,
1234        MSR_ARCH_PERFMON_EVENTSEL0 + 8, MSR_ARCH_PERFMON_EVENTSEL0 + 9,
1235        MSR_ARCH_PERFMON_EVENTSEL0 + 10, MSR_ARCH_PERFMON_EVENTSEL0 + 11,
1236        MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13,
1237        MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15,
1238        MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17,
1239};
1240
1241static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_all)];
1242static unsigned num_msrs_to_save;
1243
1244static const u32 emulated_msrs_all[] = {
1245        MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
1246        MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
1247        HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
1248        HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
1249        HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY,
1250        HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2,
1251        HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL,
1252        HV_X64_MSR_RESET,
1253        HV_X64_MSR_VP_INDEX,
1254        HV_X64_MSR_VP_RUNTIME,
1255        HV_X64_MSR_SCONTROL,
1256        HV_X64_MSR_STIMER0_CONFIG,
1257        HV_X64_MSR_VP_ASSIST_PAGE,
1258        HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL,
1259        HV_X64_MSR_TSC_EMULATION_STATUS,
1260        HV_X64_MSR_SYNDBG_OPTIONS,
1261        HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS,
1262        HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER,
1263        HV_X64_MSR_SYNDBG_PENDING_BUFFER,
1264
1265        MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
1266        MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK,
1267
1268        MSR_IA32_TSC_ADJUST,
1269        MSR_IA32_TSCDEADLINE,
1270        MSR_IA32_ARCH_CAPABILITIES,
1271        MSR_IA32_PERF_CAPABILITIES,
1272        MSR_IA32_MISC_ENABLE,
1273        MSR_IA32_MCG_STATUS,
1274        MSR_IA32_MCG_CTL,
1275        MSR_IA32_MCG_EXT_CTL,
1276        MSR_IA32_SMBASE,
1277        MSR_SMI_COUNT,
1278        MSR_PLATFORM_INFO,
1279        MSR_MISC_FEATURES_ENABLES,
1280        MSR_AMD64_VIRT_SPEC_CTRL,
1281        MSR_IA32_POWER_CTL,
1282        MSR_IA32_UCODE_REV,
1283
1284        /*
1285         * The following list leaves out MSRs whose values are determined
1286         * by arch/x86/kvm/vmx/nested.c based on CPUID or other MSRs.
1287         * We always support the "true" VMX control MSRs, even if the host
1288         * processor does not, so I am putting these registers here rather
1289         * than in msrs_to_save_all.
1290         */
1291        MSR_IA32_VMX_BASIC,
1292        MSR_IA32_VMX_TRUE_PINBASED_CTLS,
1293        MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
1294        MSR_IA32_VMX_TRUE_EXIT_CTLS,
1295        MSR_IA32_VMX_TRUE_ENTRY_CTLS,
1296        MSR_IA32_VMX_MISC,
1297        MSR_IA32_VMX_CR0_FIXED0,
1298        MSR_IA32_VMX_CR4_FIXED0,
1299        MSR_IA32_VMX_VMCS_ENUM,
1300        MSR_IA32_VMX_PROCBASED_CTLS2,
1301        MSR_IA32_VMX_EPT_VPID_CAP,
1302        MSR_IA32_VMX_VMFUNC,
1303
1304        MSR_K7_HWCR,
1305        MSR_KVM_POLL_CONTROL,
1306};
1307
1308static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)];
1309static unsigned num_emulated_msrs;
1310
1311/*
1312 * List of msr numbers which are used to expose MSR-based features that
1313 * can be used by a hypervisor to validate requested CPU features.
1314 */
1315static const u32 msr_based_features_all[] = {
1316        MSR_IA32_VMX_BASIC,
1317        MSR_IA32_VMX_TRUE_PINBASED_CTLS,
1318        MSR_IA32_VMX_PINBASED_CTLS,
1319        MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
1320        MSR_IA32_VMX_PROCBASED_CTLS,
1321        MSR_IA32_VMX_TRUE_EXIT_CTLS,
1322        MSR_IA32_VMX_EXIT_CTLS,
1323        MSR_IA32_VMX_TRUE_ENTRY_CTLS,
1324        MSR_IA32_VMX_ENTRY_CTLS,
1325        MSR_IA32_VMX_MISC,
1326        MSR_IA32_VMX_CR0_FIXED0,
1327        MSR_IA32_VMX_CR0_FIXED1,
1328        MSR_IA32_VMX_CR4_FIXED0,
1329        MSR_IA32_VMX_CR4_FIXED1,
1330        MSR_IA32_VMX_VMCS_ENUM,
1331        MSR_IA32_VMX_PROCBASED_CTLS2,
1332        MSR_IA32_VMX_EPT_VPID_CAP,
1333        MSR_IA32_VMX_VMFUNC,
1334
1335        MSR_F10H_DECFG,
1336        MSR_IA32_UCODE_REV,
1337        MSR_IA32_ARCH_CAPABILITIES,
1338        MSR_IA32_PERF_CAPABILITIES,
1339};
1340
1341static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all)];
1342static unsigned int num_msr_based_features;
1343
1344static u64 kvm_get_arch_capabilities(void)
1345{
1346        u64 data = 0;
1347
1348        if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
1349                rdmsrl(MSR_IA32_ARCH_CAPABILITIES, data);
1350
1351        /*
1352         * If nx_huge_pages is enabled, KVM's shadow paging will ensure that
1353         * the nested hypervisor runs with NX huge pages.  If it is not,
 1354         * L1 is anyway vulnerable to ITLB_MULTIHIT exploits from other
1355         * L1 guests, so it need not worry about its own (L2) guests.
1356         */
1357        data |= ARCH_CAP_PSCHANGE_MC_NO;
1358
1359        /*
1360         * If we're doing cache flushes (either "always" or "cond")
1361         * we will do one whenever the guest does a vmlaunch/vmresume.
1362         * If an outer hypervisor is doing the cache flush for us
1363         * (VMENTER_L1D_FLUSH_NESTED_VM), we can safely pass that
1364         * capability to the guest too, and if EPT is disabled we're not
1365         * vulnerable.  Overall, only VMENTER_L1D_FLUSH_NEVER will
1366         * require a nested hypervisor to do a flush of its own.
1367         */
1368        if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER)
1369                data |= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH;
1370
1371        if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
1372                data |= ARCH_CAP_RDCL_NO;
1373        if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
1374                data |= ARCH_CAP_SSB_NO;
1375        if (!boot_cpu_has_bug(X86_BUG_MDS))
1376                data |= ARCH_CAP_MDS_NO;
1377
1378        /*
1379         * On TAA affected systems:
1380         *      - nothing to do if TSX is disabled on the host.
1381         *      - we emulate TSX_CTRL if present on the host.
1382         *        This lets the guest use VERW to clear CPU buffers.
1383         */
1384        if (!boot_cpu_has(X86_FEATURE_RTM))
1385                data &= ~(ARCH_CAP_TAA_NO | ARCH_CAP_TSX_CTRL_MSR);
1386        else if (!boot_cpu_has_bug(X86_BUG_TAA))
1387                data |= ARCH_CAP_TAA_NO;
1388
1389        return data;
1390}
1391
1392static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
1393{
1394        switch (msr->index) {
1395        case MSR_IA32_ARCH_CAPABILITIES:
1396                msr->data = kvm_get_arch_capabilities();
1397                break;
1398        case MSR_IA32_UCODE_REV:
1399                rdmsrl_safe(msr->index, &msr->data);
1400                break;
1401        default:
1402                return kvm_x86_ops.get_msr_feature(msr);
1403        }
1404        return 0;
1405}
1406
1407static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
1408{
1409        struct kvm_msr_entry msr;
1410        int r;
1411
1412        msr.index = index;
1413        r = kvm_get_msr_feature(&msr);
1414
1415        if (r == KVM_MSR_RET_INVALID) {
1416                /* Unconditionally clear the output for simplicity */
1417                *data = 0;
1418                if (kvm_msr_ignored_check(vcpu, index, 0, false))
1419                        r = 0;
1420        }
1421
1422        if (r)
1423                return r;
1424
1425        *data = msr.data;
1426
1427        return 0;
1428}
1429
1430static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
1431{
1432        if (efer & EFER_FFXSR && !guest_cpuid_has(vcpu, X86_FEATURE_FXSR_OPT))
1433                return false;
1434
1435        if (efer & EFER_SVME && !guest_cpuid_has(vcpu, X86_FEATURE_SVM))
1436                return false;
1437
1438        if (efer & (EFER_LME | EFER_LMA) &&
1439            !guest_cpuid_has(vcpu, X86_FEATURE_LM))
1440                return false;
1441
1442        if (efer & EFER_NX && !guest_cpuid_has(vcpu, X86_FEATURE_NX))
1443                return false;
1444
1445        return true;
1446
1447}
1448bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
1449{
1450        if (efer & efer_reserved_bits)
1451                return false;
1452
1453        return __kvm_valid_efer(vcpu, efer);
1454}
1455EXPORT_SYMBOL_GPL(kvm_valid_efer);
1456
1457static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
1458{
1459        u64 old_efer = vcpu->arch.efer;
1460        u64 efer = msr_info->data;
1461        int r;
1462
1463        if (efer & efer_reserved_bits)
1464                return 1;
1465
1466        if (!msr_info->host_initiated) {
1467                if (!__kvm_valid_efer(vcpu, efer))
1468                        return 1;
1469
1470                if (is_paging(vcpu) &&
1471                    (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
1472                        return 1;
1473        }
1474
1475        efer &= ~EFER_LMA;
1476        efer |= vcpu->arch.efer & EFER_LMA;
1477
1478        r = kvm_x86_ops.set_efer(vcpu, efer);
1479        if (r) {
1480                WARN_ON(r > 0);
1481                return r;
1482        }
1483
1484        /* Update reserved bits */
1485        if ((efer ^ old_efer) & EFER_NX)
1486                kvm_mmu_reset_context(vcpu);
1487
1488        return 0;
1489}
1490
1491void kvm_enable_efer_bits(u64 mask)
1492{
1493       efer_reserved_bits &= ~mask;
1494}
1495EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
1496
1497bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type)
1498{
1499        struct kvm *kvm = vcpu->kvm;
1500        struct msr_bitmap_range *ranges = kvm->arch.msr_filter.ranges;
1501        u32 count = kvm->arch.msr_filter.count;
1502        u32 i;
1503        bool r = kvm->arch.msr_filter.default_allow;
1504        int idx;
1505
1506        /* MSR filtering not set up, or the MSR is in the x2APIC range; allow it */
1507        if (!count || (index >= 0x800 && index <= 0x8ff))
1508                return true;
1509
1510        /* Prevent collision with set_msr_filter */
1511        idx = srcu_read_lock(&kvm->srcu);
1512
1513        for (i = 0; i < count; i++) {
1514                u32 start = ranges[i].base;
1515                u32 end = start + ranges[i].nmsrs;
1516                u32 flags = ranges[i].flags;
1517                unsigned long *bitmap = ranges[i].bitmap;
1518
1519                if ((index >= start) && (index < end) && (flags & type)) {
1520                        r = !!test_bit(index - start, bitmap);
1521                        break;
1522                }
1523        }
1524
1525        srcu_read_unlock(&kvm->srcu, idx);
1526
1527        return r;
1528}
1529EXPORT_SYMBOL_GPL(kvm_msr_allowed);
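
/*
 * A worked instance of the lookup above, using an assumed filter purely for
 * illustration: a range with base = 0xc0000080 (MSR_EFER), nmsrs = 3 and
 * flags = KVM_MSR_FILTER_WRITE covers writes to MSRs 0xc0000080..0xc0000082.
 * A guest WRMSR to 0xc0000080 tests bit (0xc0000080 - base) = 0 of the
 * range's bitmap; the write is allowed if that bit is set and denied
 * otherwise.  MSRs that fall outside every range get the default_allow
 * policy instead.
 */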
1530
1531/*
1532 * Write @data into the MSR specified by @index.  Select MSR specific fault
1533 * checks are bypassed if @host_initiated is %true.
1534 * Returns 0 on success, non-0 otherwise.
1535 * Assumes vcpu_load() was already called.
1536 */
1537static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
1538                         bool host_initiated)
1539{
1540        struct msr_data msr;
1541
1542        if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE))
1543                return KVM_MSR_RET_FILTERED;
1544
1545        switch (index) {
1546        case MSR_FS_BASE:
1547        case MSR_GS_BASE:
1548        case MSR_KERNEL_GS_BASE:
1549        case MSR_CSTAR:
1550        case MSR_LSTAR:
1551                if (is_noncanonical_address(data, vcpu))
1552                        return 1;
1553                break;
1554        case MSR_IA32_SYSENTER_EIP:
1555        case MSR_IA32_SYSENTER_ESP:
1556                /*
1557                 * IA32_SYSENTER_ESP and IA32_SYSENTER_EIP cause #GP if
1558                 * non-canonical address is written on Intel but not on
1559                 * AMD (which ignores the top 32-bits, because it does
1560                 * not implement 64-bit SYSENTER).
1561                 *
1562                 * 64-bit code should hence be able to write a non-canonical
1563                 * value on AMD.  Making the address canonical ensures that
1564                 * vmentry does not fail on Intel after writing a non-canonical
1565                 * value, and that something deterministic happens if the guest
1566                 * invokes 64-bit SYSENTER.
1567                 */
1568                data = get_canonical(data, vcpu_virt_addr_bits(vcpu));
1569        }
1570
1571        msr.data = data;
1572        msr.index = index;
1573        msr.host_initiated = host_initiated;
1574
1575        return kvm_x86_ops.set_msr(vcpu, &msr);
1576}
1577
1578static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu,
1579                                     u32 index, u64 data, bool host_initiated)
1580{
1581        int ret = __kvm_set_msr(vcpu, index, data, host_initiated);
1582
1583        if (ret == KVM_MSR_RET_INVALID)
1584                if (kvm_msr_ignored_check(vcpu, index, data, true))
1585                        ret = 0;
1586
1587        return ret;
1588}
1589
1590/*
1591 * Read the MSR specified by @index into @data.  Select MSR specific fault
1592 * checks are bypassed if @host_initiated is %true.
1593 * Returns 0 on success, non-0 otherwise.
1594 * Assumes vcpu_load() was already called.
1595 */
1596int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
1597                  bool host_initiated)
1598{
1599        struct msr_data msr;
1600        int ret;
1601
1602        if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ))
1603                return KVM_MSR_RET_FILTERED;
1604
1605        msr.index = index;
1606        msr.host_initiated = host_initiated;
1607
1608        ret = kvm_x86_ops.get_msr(vcpu, &msr);
1609        if (!ret)
1610                *data = msr.data;
1611        return ret;
1612}
1613
1614static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu,
1615                                     u32 index, u64 *data, bool host_initiated)
1616{
1617        int ret = __kvm_get_msr(vcpu, index, data, host_initiated);
1618
1619        if (ret == KVM_MSR_RET_INVALID) {
1620                /* Unconditionally clear *data for simplicity */
1621                *data = 0;
1622                if (kvm_msr_ignored_check(vcpu, index, 0, false))
1623                        ret = 0;
1624        }
1625
1626        return ret;
1627}
1628
1629int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data)
1630{
1631        return kvm_get_msr_ignored_check(vcpu, index, data, false);
1632}
1633EXPORT_SYMBOL_GPL(kvm_get_msr);
1634
1635int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
1636{
1637        return kvm_set_msr_ignored_check(vcpu, index, data, false);
1638}
1639EXPORT_SYMBOL_GPL(kvm_set_msr);
1640
1641static int complete_emulated_msr(struct kvm_vcpu *vcpu, bool is_read)
1642{
1643        if (vcpu->run->msr.error) {
1644                kvm_inject_gp(vcpu, 0);
1645                return 1;
1646        } else if (is_read) {
1647                kvm_rax_write(vcpu, (u32)vcpu->run->msr.data);
1648                kvm_rdx_write(vcpu, vcpu->run->msr.data >> 32);
1649        }
1650
1651        return kvm_skip_emulated_instruction(vcpu);
1652}
1653
1654static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu)
1655{
1656        return complete_emulated_msr(vcpu, true);
1657}
1658
1659static int complete_emulated_wrmsr(struct kvm_vcpu *vcpu)
1660{
1661        return complete_emulated_msr(vcpu, false);
1662}
1663
1664static u64 kvm_msr_reason(int r)
1665{
1666        switch (r) {
1667        case KVM_MSR_RET_INVALID:
1668                return KVM_MSR_EXIT_REASON_UNKNOWN;
1669        case KVM_MSR_RET_FILTERED:
1670                return KVM_MSR_EXIT_REASON_FILTER;
1671        default:
1672                return KVM_MSR_EXIT_REASON_INVAL;
1673        }
1674}
1675
1676static int kvm_msr_user_space(struct kvm_vcpu *vcpu, u32 index,
1677                              u32 exit_reason, u64 data,
1678                              int (*completion)(struct kvm_vcpu *vcpu),
1679                              int r)
1680{
1681        u64 msr_reason = kvm_msr_reason(r);
1682
1683        /* Check if the user wanted to know about this MSR fault */
1684        if (!(vcpu->kvm->arch.user_space_msr_mask & msr_reason))
1685                return 0;
1686
1687        vcpu->run->exit_reason = exit_reason;
1688        vcpu->run->msr.error = 0;
1689        memset(vcpu->run->msr.pad, 0, sizeof(vcpu->run->msr.pad));
1690        vcpu->run->msr.reason = msr_reason;
1691        vcpu->run->msr.index = index;
1692        vcpu->run->msr.data = data;
1693        vcpu->arch.complete_userspace_io = completion;
1694
1695        return 1;
1696}
1697
1698static int kvm_get_msr_user_space(struct kvm_vcpu *vcpu, u32 index, int r)
1699{
1700        return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_RDMSR, 0,
1701                                   complete_emulated_rdmsr, r);
1702}
1703
1704static int kvm_set_msr_user_space(struct kvm_vcpu *vcpu, u32 index, u64 data, int r)
1705{
1706        return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_WRMSR, data,
1707                                   complete_emulated_wrmsr, r);
1708}
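
/*
 * A minimal user-space sketch (hypothetical VMM code, not built here) of how
 * the KVM_EXIT_X86_RDMSR/WRMSR exits raised above might be serviced.  It
 * assumes <linux/kvm.h> and an mmap'ed struct kvm_run; a non-zero
 * run->msr.error makes complete_emulated_msr() inject #GP into the guest.
 */
#if 0
#include <linux/kvm.h>

static void handle_msr_exit(struct kvm_run *run)
{
	switch (run->exit_reason) {
	case KVM_EXIT_X86_RDMSR:
		if (run->msr.index == 0x4b564dff) {	/* hypothetical MSR */
			run->msr.data = 0;		/* value for the guest */
			run->msr.error = 0;
		} else {
			run->msr.error = 1;		/* guest gets #GP */
		}
		break;
	case KVM_EXIT_X86_WRMSR:
		/* run->msr.data holds the value the guest tried to write. */
		run->msr.error = 0;			/* accept and ignore it */
		break;
	}
}
#endif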
1709
1710int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu)
1711{
1712        u32 ecx = kvm_rcx_read(vcpu);
1713        u64 data;
1714        int r;
1715
1716        r = kvm_get_msr(vcpu, ecx, &data);
1717
1718        /* MSR read failed? See if we should ask user space */
1719        if (r && kvm_get_msr_user_space(vcpu, ecx, r)) {
1720                /* Bounce to user space */
1721                return 0;
1722        }
1723
1724        /* MSR read failed? Inject a #GP */
1725        if (r) {
1726                trace_kvm_msr_read_ex(ecx);
1727                kvm_inject_gp(vcpu, 0);
1728                return 1;
1729        }
1730
1731        trace_kvm_msr_read(ecx, data);
1732
1733        kvm_rax_write(vcpu, data & -1u);
1734        kvm_rdx_write(vcpu, (data >> 32) & -1u);
1735        return kvm_skip_emulated_instruction(vcpu);
1736}
1737EXPORT_SYMBOL_GPL(kvm_emulate_rdmsr);
1738
1739int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
1740{
1741        u32 ecx = kvm_rcx_read(vcpu);
1742        u64 data = kvm_read_edx_eax(vcpu);
1743        int r;
1744
1745        r = kvm_set_msr(vcpu, ecx, data);
1746
1747        /* MSR write failed? See if we should ask user space */
1748        if (r && kvm_set_msr_user_space(vcpu, ecx, data, r))
1749                /* Bounce to user space */
1750                return 0;
1751
1752        /* Signal all other negative errors to userspace */
1753        if (r < 0)
1754                return r;
1755
1756        /* MSR write failed? Inject a #GP */
1757        if (r > 0) {
1758                trace_kvm_msr_write_ex(ecx, data);
1759                kvm_inject_gp(vcpu, 0);
1760                return 1;
1761        }
1762
1763        trace_kvm_msr_write(ecx, data);
1764        return kvm_skip_emulated_instruction(vcpu);
1765}
1766EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr);
1767
1768bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu)
1769{
1770        return vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) ||
1771                xfer_to_guest_mode_work_pending();
1772}
1773EXPORT_SYMBOL_GPL(kvm_vcpu_exit_request);
1774
1775/*
1776 * The fast path for frequent and performance-sensitive wrmsr emulation,
1777 * i.e. the sending of IPIs.  Handling the IPI early in the VM-Exit flow
1778 * reduces the latency of virtual IPIs by avoiding the expensive bits of
1779 * transitioning from guest to host, e.g. reacquiring KVM's SRCU lock, in
1780 * contrast to the other cases, which must run with host interrupts enabled.
1781 */
1782static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data)
1783{
1784        if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(vcpu->arch.apic))
1785                return 1;
1786
1787        if (((data & APIC_SHORT_MASK) == APIC_DEST_NOSHORT) &&
1788                ((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) &&
1789                ((data & APIC_MODE_MASK) == APIC_DM_FIXED) &&
1790                ((u32)(data >> 32) != X2APIC_BROADCAST)) {
1791
1792                data &= ~(1 << 12);     /* clear the ICR busy/delivery-status bit */
1793                kvm_apic_send_ipi(vcpu->arch.apic, (u32)data, (u32)(data >> 32));
1794                kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR2, (u32)(data >> 32));
1795                kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR, (u32)data);
1796                trace_kvm_apic_write(APIC_ICR, (u32)data);
1797                return 0;
1798        }
1799
1800        return 1;
1801}
1802
1803static int handle_fastpath_set_tscdeadline(struct kvm_vcpu *vcpu, u64 data)
1804{
1805        if (!kvm_can_use_hv_timer(vcpu))
1806                return 1;
1807
1808        kvm_set_lapic_tscdeadline_msr(vcpu, data);
1809        return 0;
1810}
1811
1812fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu)
1813{
1814        u32 msr = kvm_rcx_read(vcpu);
1815        u64 data;
1816        fastpath_t ret = EXIT_FASTPATH_NONE;
1817
1818        switch (msr) {
1819        case APIC_BASE_MSR + (APIC_ICR >> 4):
1820                data = kvm_read_edx_eax(vcpu);
1821                if (!handle_fastpath_set_x2apic_icr_irqoff(vcpu, data)) {
1822                        kvm_skip_emulated_instruction(vcpu);
1823                        ret = EXIT_FASTPATH_EXIT_HANDLED;
1824                }
1825                break;
1826        case MSR_IA32_TSCDEADLINE:
1827                data = kvm_read_edx_eax(vcpu);
1828                if (!handle_fastpath_set_tscdeadline(vcpu, data)) {
1829                        kvm_skip_emulated_instruction(vcpu);
1830                        ret = EXIT_FASTPATH_REENTER_GUEST;
1831                }
1832                break;
1833        default:
1834                break;
1835        }
1836
1837        if (ret != EXIT_FASTPATH_NONE)
1838                trace_kvm_msr_write(msr, data);
1839
1840        return ret;
1841}
1842EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff);
1843
1844/*
1845 * Adapt {get,set}_msr() to msr_io()'s calling convention
1846 */
1847static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
1848{
1849        return kvm_get_msr_ignored_check(vcpu, index, data, true);
1850}
1851
1852static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
1853{
1854        return kvm_set_msr_ignored_check(vcpu, index, *data, true);
1855}
1856
1857#ifdef CONFIG_X86_64
1858struct pvclock_clock {
1859        int vclock_mode;
1860        u64 cycle_last;
1861        u64 mask;
1862        u32 mult;
1863        u32 shift;
1864        u64 base_cycles;
1865        u64 offset;
1866};
1867
1868struct pvclock_gtod_data {
1869        seqcount_t      seq;
1870
1871        struct pvclock_clock clock; /* extract of a clocksource struct */
1872        struct pvclock_clock raw_clock; /* extract of a clocksource struct */
1873
1874        ktime_t         offs_boot;
1875        u64             wall_time_sec;
1876};
1877
1878static struct pvclock_gtod_data pvclock_gtod_data;
1879
1880static void update_pvclock_gtod(struct timekeeper *tk)
1881{
1882        struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
1883
1884        write_seqcount_begin(&vdata->seq);
1885
1886        /* copy pvclock gtod data */
1887        vdata->clock.vclock_mode        = tk->tkr_mono.clock->vdso_clock_mode;
1888        vdata->clock.cycle_last         = tk->tkr_mono.cycle_last;
1889        vdata->clock.mask               = tk->tkr_mono.mask;
1890        vdata->clock.mult               = tk->tkr_mono.mult;
1891        vdata->clock.shift              = tk->tkr_mono.shift;
1892        vdata->clock.base_cycles        = tk->tkr_mono.xtime_nsec;
1893        vdata->clock.offset             = tk->tkr_mono.base;
1894
1895        vdata->raw_clock.vclock_mode    = tk->tkr_raw.clock->vdso_clock_mode;
1896        vdata->raw_clock.cycle_last     = tk->tkr_raw.cycle_last;
1897        vdata->raw_clock.mask           = tk->tkr_raw.mask;
1898        vdata->raw_clock.mult           = tk->tkr_raw.mult;
1899        vdata->raw_clock.shift          = tk->tkr_raw.shift;
1900        vdata->raw_clock.base_cycles    = tk->tkr_raw.xtime_nsec;
1901        vdata->raw_clock.offset         = tk->tkr_raw.base;
1902
1903        vdata->wall_time_sec            = tk->xtime_sec;
1904
1905        vdata->offs_boot                = tk->offs_boot;
1906
1907        write_seqcount_end(&vdata->seq);
1908}
1909
1910static s64 get_kvmclock_base_ns(void)
1911{
1912        /* Count up from boot time, but with the frequency of the raw clock.  */
1913        return ktime_to_ns(ktime_add(ktime_get_raw(), pvclock_gtod_data.offs_boot));
1914}
1915#else
1916static s64 get_kvmclock_base_ns(void)
1917{
1918        /* Master clock not used, so we can just use CLOCK_BOOTTIME.  */
1919        return ktime_get_boottime_ns();
1920}
1921#endif
1922
1923static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
1924{
1925        int version;
1926        int r;
1927        struct pvclock_wall_clock wc;
1928        u64 wall_nsec;
1929
1930        kvm->arch.wall_clock = wall_clock;
1931
1932        if (!wall_clock)
1933                return;
1934
1935        r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version));
1936        if (r)
1937                return;
1938
1939        if (version & 1)
1940                ++version;  /* first time write, random junk */
1941
1942        ++version;
1943
1944        if (kvm_write_guest(kvm, wall_clock, &version, sizeof(version)))
1945                return;
1946
1947        /*
1948         * The guest calculates current wall clock time by adding
1949         * system time (updated by kvm_guest_time_update below) to the
1950         * wall clock specified here.  We do the reverse here.
1951         */
1952        wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm);
1953
1954        wc.nsec = do_div(wall_nsec, 1000000000);
1955        wc.sec = (u32)wall_nsec; /* overflow in 2106 guest time */
1956        wc.version = version;
1957
1958        kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
1959
1960        version++;
1961        kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
1962}
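
/*
 * A minimal sketch of the guest-side counterpart of the update above, using
 * struct pvclock_wall_clock from <asm/pvclock-abi.h> (already pulled in via
 * <asm/pvclock.h>).  The version field acts as a seqcount: it is odd while
 * the update is in progress, so the guest retries until it reads the same
 * even value before and after fetching sec/nsec.  Barriers are elided for
 * brevity.
 */
#if 0
static void read_wall_clock(volatile struct pvclock_wall_clock *wc,
			    u32 *sec, u32 *nsec)
{
	u32 version;

	do {
		version = wc->version;
		*sec  = wc->sec;
		*nsec = wc->nsec;
	} while ((version & 1) || version != wc->version);
}
#endif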
1963
1964static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time,
1965                                  bool old_msr, bool host_initiated)
1966{
1967        struct kvm_arch *ka = &vcpu->kvm->arch;
1968
1969        if (vcpu->vcpu_id == 0 && !host_initiated) {
1970                if (ka->boot_vcpu_runs_old_kvmclock != old_msr)
1971                        kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
1972
1973                ka->boot_vcpu_runs_old_kvmclock = old_msr;
1974        }
1975
1976        vcpu->arch.time = system_time;
1977        kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
1978
1979        /* we verify if the enable bit is set... */
1980        vcpu->arch.pv_time_enabled = false;
1981        if (!(system_time & 1))
1982                return;
1983
1984        if (!kvm_gfn_to_hva_cache_init(vcpu->kvm,
1985                                       &vcpu->arch.pv_time, system_time & ~1ULL,
1986                                       sizeof(struct pvclock_vcpu_time_info)))
1987                vcpu->arch.pv_time_enabled = true;
1990}
1991
1992static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
1993{
1994        do_shl32_div32(dividend, divisor);
1995        return dividend;
1996}
1997
1998static void kvm_get_time_scale(uint64_t scaled_hz, uint64_t base_hz,
1999                               s8 *pshift, u32 *pmultiplier)
2000{
2001        uint64_t scaled64;
2002        int32_t  shift = 0;
2003        uint64_t tps64;
2004        uint32_t tps32;
2005
2006        tps64 = base_hz;
2007        scaled64 = scaled_hz;
2008        while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) {
2009                tps64 >>= 1;
2010                shift--;
2011        }
2012
2013        tps32 = (uint32_t)tps64;
2014        while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) {
2015                if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000)
2016                        scaled64 >>= 1;
2017                else
2018                        tps32 <<= 1;
2019                shift++;
2020        }
2021
2022        *pshift = shift;
2023        *pmultiplier = div_frac(scaled64, tps32);
2024}
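
/*
 * Worked example (illustrative numbers): for a 2 GHz TSC,
 * kvm_get_time_scale(NSEC_PER_SEC, 2000000000LL, &shift, &mult) yields
 * shift = 0 and mult = 0x80000000, i.e. nanoseconds = (cycles * mult) >> 32
 * = cycles / 2, matching 0.5 ns per TSC cycle.
 */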
2025
2026#ifdef CONFIG_X86_64
2027static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);
2028#endif
2029
2030static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
2031static unsigned long max_tsc_khz;
2032
2033static u32 adjust_tsc_khz(u32 khz, s32 ppm)
2034{
2035        u64 v = (u64)khz * (1000000 + ppm);
2036        do_div(v, 1000000);
2037        return v;
2038}
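
/*
 * Example: adjust_tsc_khz(1000000, 250) = 1000250 and
 * adjust_tsc_khz(1000000, -250) = 999750, so with a tolerance of 250 ppm a
 * 1 GHz host TSC accepts requested guest rates in [999750, 1000250] kHz
 * without scaling (see kvm_set_tsc_khz() below).
 */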
2039
2040static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
2041{
2042        u64 ratio;
2043
2044        /* Guest TSC same frequency as host TSC? */
2045        if (!scale) {
2046                vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
2047                return 0;
2048        }
2049
2050        /* TSC scaling supported? */
2051        if (!kvm_has_tsc_control) {
2052                if (user_tsc_khz > tsc_khz) {
2053                        vcpu->arch.tsc_catchup = 1;
2054                        vcpu->arch.tsc_always_catchup = 1;
2055                        return 0;
2056                } else {
2057                        pr_warn_ratelimited("user requested TSC rate below hardware speed\n");
2058                        return -1;
2059                }
2060        }
2061
2062        /* TSC scaling required  - calculate ratio */
2063        ratio = mul_u64_u32_div(1ULL << kvm_tsc_scaling_ratio_frac_bits,
2064                                user_tsc_khz, tsc_khz);
2065
2066        if (ratio == 0 || ratio >= kvm_max_tsc_scaling_ratio) {
2067                pr_warn_ratelimited("Invalid TSC scaling ratio - virtual-tsc-khz=%u\n",
2068                                    user_tsc_khz);
2069                return -1;
2070        }
2071
2072        vcpu->arch.tsc_scaling_ratio = ratio;
2073        return 0;
2074}
2075
2076static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
2077{
2078        u32 thresh_lo, thresh_hi;
2079        int use_scaling = 0;
2080
2081        /* tsc_khz can be zero if TSC calibration fails */
2082        if (user_tsc_khz == 0) {
2083                /* set tsc_scaling_ratio to a safe value */
2084                vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
2085                return -1;
2086        }
2087
2088        /* Compute a scale to convert nanoseconds in TSC cycles */
2089        kvm_get_time_scale(user_tsc_khz * 1000LL, NSEC_PER_SEC,
2090                           &vcpu->arch.virtual_tsc_shift,
2091                           &vcpu->arch.virtual_tsc_mult);
2092        vcpu->arch.virtual_tsc_khz = user_tsc_khz;
2093
2094        /*
2095         * Compute the acceptable variation in TSC rate within the
2096         * tolerance range and decide whether the rate being applied is
2097         * within those bounds of the hardware rate.  If so, no scaling
2098         * or compensation needs to be done.
2099         */
2100        thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
2101        thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);
2102        if (user_tsc_khz < thresh_lo || user_tsc_khz > thresh_hi) {
2103                pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", user_tsc_khz, thresh_lo, thresh_hi);
2104                use_scaling = 1;
2105        }
2106        return set_tsc_khz(vcpu, user_tsc_khz, use_scaling);
2107}
2108
2109static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
2110{
2111        u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec,
2112                                      vcpu->arch.virtual_tsc_mult,
2113                                      vcpu->arch.virtual_tsc_shift);
2114        tsc += vcpu->arch.this_tsc_write;
2115        return tsc;
2116}
2117
2118static inline int gtod_is_based_on_tsc(int mode)
2119{
2120        return mode == VDSO_CLOCKMODE_TSC || mode == VDSO_CLOCKMODE_HVCLOCK;
2121}
2122
2123static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
2124{
2125#ifdef CONFIG_X86_64
2126        bool vcpus_matched;
2127        struct kvm_arch *ka = &vcpu->kvm->arch;
2128        struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
2129
2130        vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
2131                         atomic_read(&vcpu->kvm->online_vcpus));
2132
2133        /*
2134         * Once the masterclock is enabled, always perform the request in
2135         * order to update it.
2136         *
2137         * In order to enable masterclock, the host clocksource must be TSC
2138         * and the vcpus need to have matched TSCs.  When that happens,
2139         * perform request to enable masterclock.
2140         */
2141        if (ka->use_master_clock ||
2142            (gtod_is_based_on_tsc(gtod->clock.vclock_mode) && vcpus_matched))
2143                kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
2144
2145        trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc,
2146                            atomic_read(&vcpu->kvm->online_vcpus),
2147                            ka->use_master_clock, gtod->clock.vclock_mode);
2148#endif
2149}
2150
2151/*
2152 * Multiply tsc by a fixed point number represented by ratio.
2153 *
2154 * The most significant 64-N bits (mult) of ratio represent the
2155 * integral part of the fixed point number; the remaining N bits
2156 * (frac) represent the fractional part, i.e. ratio represents a fixed
2157 * point number (mult + frac * 2^(-N)).
2158 *
2159 * N equals kvm_tsc_scaling_ratio_frac_bits.
2160 */
2161static inline u64 __scale_tsc(u64 ratio, u64 tsc)
2162{
2163        return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits);
2164}
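
/*
 * Worked example (with an assumed N = 48 fractional bits): a guest meant to
 * run at twice the host TSC frequency uses ratio = 2ULL << 48, and scaling a
 * host TSC delta of 1000 cycles gives
 * mul_u64_u64_shr(1000, 2ULL << 48, 48) = (1000 * (2 << 48)) >> 48 = 2000.
 */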
2165
2166u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc)
2167{
2168        u64 _tsc = tsc;
2169        u64 ratio = vcpu->arch.tsc_scaling_ratio;
2170
2171        if (ratio != kvm_default_tsc_scaling_ratio)
2172                _tsc = __scale_tsc(ratio, tsc);
2173
2174        return _tsc;
2175}
2176EXPORT_SYMBOL_GPL(kvm_scale_tsc);
2177
2178static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
2179{
2180        u64 tsc;
2181
2182        tsc = kvm_scale_tsc(vcpu, rdtsc());
2183
2184        return target_tsc - tsc;
2185}
2186
2187u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
2188{
2189        return vcpu->arch.l1_tsc_offset + kvm_scale_tsc(vcpu, host_tsc);
2190}
2191EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
2192
2193static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
2194{
2195        vcpu->arch.l1_tsc_offset = offset;
2196        vcpu->arch.tsc_offset = kvm_x86_ops.write_l1_tsc_offset(vcpu, offset);
2197}
2198
2199static inline bool kvm_check_tsc_unstable(void)
2200{
2201#ifdef CONFIG_X86_64
2202        /*
2203         * The TSC is marked unstable when we're running on Hyper-V;
2204         * in that case the 'TSC page' clocksource is still good.
2205         */
2206        if (pvclock_gtod_data.clock.vclock_mode == VDSO_CLOCKMODE_HVCLOCK)
2207                return false;
2208#endif
2209        return check_tsc_unstable();
2210}
2211
2212static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
2213{
2214        struct kvm *kvm = vcpu->kvm;
2215        u64 offset, ns, elapsed;
2216        unsigned long flags;
2217        bool matched;
2218        bool already_matched;
2219        bool synchronizing = false;
2220
2221        raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
2222        offset = kvm_compute_tsc_offset(vcpu, data);
2223        ns = get_kvmclock_base_ns();
2224        elapsed = ns - kvm->arch.last_tsc_nsec;
2225
2226        if (vcpu->arch.virtual_tsc_khz) {
2227                if (data == 0) {
2228                        /*
2229                         * detection of vcpu initialization -- need to sync
2230                         * with other vCPUs. This particularly helps to keep
2231                         * kvm_clock stable after CPU hotplug
2232                         */
2233                        synchronizing = true;
2234                } else {
2235                        u64 tsc_exp = kvm->arch.last_tsc_write +
2236                                                nsec_to_cycles(vcpu, elapsed);
2237                        u64 tsc_hz = vcpu->arch.virtual_tsc_khz * 1000LL;
2238                        /*
2239                         * Special case: a TSC write that lands within one second's
2240                         * worth of virtual cycles of the expected value is
2241                         * interpreted as an attempt to synchronize the CPU.
2242                         */
2243                        synchronizing = data < tsc_exp + tsc_hz &&
2244                                        data + tsc_hz > tsc_exp;
2245                }
2246        }
2247
2248        /*
2249         * For a reliable TSC, we can match TSC offsets, and for an unstable
2250         * TSC, we add elapsed time in this computation.  We could let the
2251         * compensation code attempt to catch up if we fall behind, but
2252         * it's better to try to match offsets from the beginning.
2253         */
2254        if (synchronizing &&
2255            vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
2256                if (!kvm_check_tsc_unstable()) {
2257                        offset = kvm->arch.cur_tsc_offset;
2258                } else {
2259                        u64 delta = nsec_to_cycles(vcpu, elapsed);
2260                        data += delta;
2261                        offset = kvm_compute_tsc_offset(vcpu, data);
2262                }
2263                matched = true;
2264                already_matched = (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation);
2265        } else {
2266                /*
2267                 * We split periods of matched TSC writes into generations.
2268                 * For each generation, we track the original measured
2269                 * nanosecond time, offset, and write, so if TSCs are in
2270                 * sync, we can match exact offset, and if not, we can match
2271                 * exact software computation in compute_guest_tsc()
2272                 *
2273                 * These values are tracked in kvm->arch.cur_xxx variables.
2274                 */
2275                kvm->arch.cur_tsc_generation++;
2276                kvm->arch.cur_tsc_nsec = ns;
2277                kvm->arch.cur_tsc_write = data;
2278                kvm->arch.cur_tsc_offset = offset;
2279                matched = false;
2280        }
2281
2282        /*
2283         * We also track the most recent recorded KHZ, write and time to
2284         * allow the matching interval to be extended at each write.
2285         */
2286        kvm->arch.last_tsc_nsec = ns;
2287        kvm->arch.last_tsc_write = data;
2288        kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
2289
2290        vcpu->arch.last_guest_tsc = data;
2291
2292        /* Keep track of which generation this VCPU has synchronized to */
2293        vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;
2294        vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
2295        vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
2296
2297        kvm_vcpu_write_tsc_offset(vcpu, offset);
2298        raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
2299
2300        spin_lock(&kvm->arch.pvclock_gtod_sync_lock);
2301        if (!matched) {
2302                kvm->arch.nr_vcpus_matched_tsc = 0;
2303        } else if (!already_matched) {
2304                kvm->arch.nr_vcpus_matched_tsc++;
2305        }
2306
2307        kvm_track_tsc_matching(vcpu);
2308        spin_unlock(&kvm->arch.pvclock_gtod_sync_lock);
2309}
2310
2311static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
2312                                           s64 adjustment)
2313{
2314        u64 tsc_offset = vcpu->arch.l1_tsc_offset;
2315        kvm_vcpu_write_tsc_offset(vcpu, tsc_offset + adjustment);
2316}
2317
2318static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
2319{
2320        if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio)
2321                WARN_ON(adjustment < 0);
2322        adjustment = kvm_scale_tsc(vcpu, (u64) adjustment);
2323        adjust_tsc_offset_guest(vcpu, adjustment);
2324}
2325
2326#ifdef CONFIG_X86_64
2327
2328static u64 read_tsc(void)
2329{
2330        u64 ret = (u64)rdtsc_ordered();
2331        u64 last = pvclock_gtod_data.clock.cycle_last;
2332
2333        if (likely(ret >= last))
2334                return ret;
2335
2336        /*
2337         * GCC likes to generate cmov here, but this branch is extremely
2338         * predictable (it's just a function of time and the likely is
2339         * very likely) and there's a data dependence, so force GCC
2340         * to generate a branch instead.  I don't barrier() because
2341         * we don't actually need a barrier, and if this function
2342         * ever gets inlined it will generate worse code.
2343         */
2344        asm volatile ("");
2345        return last;
2346}
2347
2348static inline u64 vgettsc(struct pvclock_clock *clock, u64 *tsc_timestamp,
2349                          int *mode)
2350{
2351        long v;
2352        u64 tsc_pg_val;
2353
2354        switch (clock->vclock_mode) {
2355        case VDSO_CLOCKMODE_HVCLOCK:
2356                tsc_pg_val = hv_read_tsc_page_tsc(hv_get_tsc_page(),
2357                                                  tsc_timestamp);
2358                if (tsc_pg_val != U64_MAX) {
2359                        /* TSC page valid */
2360                        *mode = VDSO_CLOCKMODE_HVCLOCK;
2361                        v = (tsc_pg_val - clock->cycle_last) &
2362                                clock->mask;
2363                } else {
2364                        /* TSC page invalid */
2365                        *mode = VDSO_CLOCKMODE_NONE;
2366                }
2367                break;
2368        case VDSO_CLOCKMODE_TSC:
2369                *mode = VDSO_CLOCKMODE_TSC;
2370                *tsc_timestamp = read_tsc();
2371                v = (*tsc_timestamp - clock->cycle_last) &
2372                        clock->mask;
2373                break;
2374        default:
2375                *mode = VDSO_CLOCKMODE_NONE;
2376        }
2377
2378        if (*mode == VDSO_CLOCKMODE_NONE)
2379                *tsc_timestamp = v = 0;
2380
2381        return v * clock->mult;
2382}
2383
2384static int do_monotonic_raw(s64 *t, u64 *tsc_timestamp)
2385{
2386        struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
2387        unsigned long seq;
2388        int mode;
2389        u64 ns;
2390
2391        do {
2392                seq = read_seqcount_begin(&gtod->seq);
2393                ns = gtod->raw_clock.base_cycles;
2394                ns += vgettsc(&gtod->raw_clock, tsc_timestamp, &mode);
2395                ns >>= gtod->raw_clock.shift;
2396                ns += ktime_to_ns(ktime_add(gtod->raw_clock.offset, gtod->offs_boot));
2397        } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
2398        *t = ns;
2399
2400        return mode;
2401}
2402
2403static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp)
2404{
2405        struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
2406        unsigned long seq;
2407        int mode;
2408        u64 ns;
2409
2410        do {
2411                seq = read_seqcount_begin(&gtod->seq);
2412                ts->tv_sec = gtod->wall_time_sec;
2413                ns = gtod->clock.base_cycles;
2414                ns += vgettsc(&gtod->clock, tsc_timestamp, &mode);
2415                ns >>= gtod->clock.shift;
2416        } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
2417
2418        ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
2419        ts->tv_nsec = ns;
2420
2421        return mode;
2422}
2423
2424/* returns true if host is using TSC based clocksource */
2425static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp)
2426{
2427        /* checked again under seqlock below */
2428        if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
2429                return false;
2430
2431        return gtod_is_based_on_tsc(do_monotonic_raw(kernel_ns,
2432                                                      tsc_timestamp));
2433}
2434
2435/* returns true if host is using TSC based clocksource */
2436static bool kvm_get_walltime_and_clockread(struct timespec64 *ts,
2437                                           u64 *tsc_timestamp)
2438{
2439        /* checked again under seqlock below */
2440        if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
2441                return false;
2442
2443        return gtod_is_based_on_tsc(do_realtime(ts, tsc_timestamp));
2444}
2445#endif
2446
2447/*
2448 *
2449 * Assuming a stable TSC across physical CPUs, and a stable TSC
2450 * across virtual CPUs, the following condition is possible.
2451 * Each numbered line represents an event visible to both
2452 * CPUs at the next numbered event.
2453 *
2454 * "timespecX" represents host monotonic time. "tscX" represents
2455 * RDTSC value.
2456 *
2457 *              VCPU0 on CPU0           |       VCPU1 on CPU1
2458 *
2459 * 1.  read timespec0,tsc0
2460 * 2.                                   | timespec1 = timespec0 + N
2461 *                                      | tsc1 = tsc0 + M
2462 * 3. transition to guest               | transition to guest
2463 * 4. ret0 = timespec0 + (rdtsc - tsc0) |
2464 * 5.                                   | ret1 = timespec1 + (rdtsc - tsc1)
2465 *                                      | ret1 = timespec0 + N + (rdtsc - (tsc0 + M))
2466 *
2467 * Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity:
2468 *
2469 *      - ret0 < ret1
2470 *      - timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M))
2471 *              ...
2472 *      - 0 < N - M => M < N
2473 *
2474 * That is, when timespec0 != timespec1, M < N. Unfortunately that is not
2475 * always the case (the difference between two distinct xtime instances
2476 * might be smaller than the difference between corresponding TSC reads,
2477 * when updating guest vcpus' pvclock areas).
2478 *
2479 * To avoid that problem, do not allow visibility of distinct
2480 * system_timestamp/tsc_timestamp values simultaneously: use a master
2481 * copy of host monotonic time values. Update that master copy
2482 * in lockstep.
2483 *
2484 * Rely on synchronization of host TSCs and guest TSCs for monotonicity.
2485 *
2486 */
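
/*
 * Concrete instance of the problem (illustrative numbers): with
 * timespec0 = 100us, N = 10us and M = 30us, and the same rdtsc read landing
 * 40us worth of cycles after tsc0, VCPU0 computes ret0 = 100 + 40 = 140us
 * while VCPU1 computes ret1 = 110 + (40 - 30) = 120us < ret0, i.e. time
 * appears to go backwards between the two vcpus' pvclock areas.
 */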
2487
2488static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
2489{
2490#ifdef CONFIG_X86_64
2491        struct kvm_arch *ka = &kvm->arch;
2492        int vclock_mode;
2493        bool host_tsc_clocksource, vcpus_matched;
2494
2495        vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
2496                        atomic_read(&kvm->online_vcpus));
2497
2498        /*
2499         * If the host uses TSC clock, then passthrough TSC as stable
2500         * to the guest.
2501         */
2502        host_tsc_clocksource = kvm_get_time_and_clockread(
2503                                        &ka->master_kernel_ns,
2504                                        &ka->master_cycle_now);
2505
2506        ka->use_master_clock = host_tsc_clocksource && vcpus_matched
2507                                && !ka->backwards_tsc_observed
2508                                && !ka->boot_vcpu_runs_old_kvmclock;
2509
2510        if (ka->use_master_clock)
2511                atomic_set(&kvm_guest_has_master_clock, 1);
2512
2513        vclock_mode = pvclock_gtod_data.clock.vclock_mode;
2514        trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode,
2515                                        vcpus_matched);
2516#endif
2517}
2518
2519void kvm_make_mclock_inprogress_request(struct kvm *kvm)
2520{
2521        kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
2522}
2523
2524static void kvm_gen_update_masterclock(struct kvm *kvm)
2525{
2526#ifdef CONFIG_X86_64
2527        int i;
2528        struct kvm_vcpu *vcpu;
2529        struct kvm_arch *ka = &kvm->arch;
2530
2531        spin_lock(&ka->pvclock_gtod_sync_lock);
2532        kvm_make_mclock_inprogress_request(kvm);
2533        /* no guest entries from this point */
2534        pvclock_update_vm_gtod_copy(kvm);
2535
2536        kvm_for_each_vcpu(i, vcpu, kvm)
2537                kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
2538
2539        /* guest entries allowed */
2540        kvm_for_each_vcpu(i, vcpu, kvm)
2541                kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu);
2542
2543        spin_unlock(&ka->pvclock_gtod_sync_lock);
2544#endif
2545}
2546
2547u64 get_kvmclock_ns(struct kvm *kvm)
2548{
2549        struct kvm_arch *ka = &kvm->arch;
2550        struct pvclock_vcpu_time_info hv_clock;
2551        u64 ret;
2552
2553        spin_lock(&ka->pvclock_gtod_sync_lock);
2554        if (!ka->use_master_clock) {
2555                spin_unlock(&ka->pvclock_gtod_sync_lock);
2556                return get_kvmclock_base_ns() + ka->kvmclock_offset;
2557        }
2558
2559        hv_clock.tsc_timestamp = ka->master_cycle_now;
2560        hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
2561        spin_unlock(&ka->pvclock_gtod_sync_lock);
2562
2563        /* both __this_cpu_read() and rdtsc() should be on the same cpu */
2564        get_cpu();
2565
2566        if (__this_cpu_read(cpu_tsc_khz)) {
2567                kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL,
2568                                   &hv_clock.tsc_shift,
2569                                   &hv_clock.tsc_to_system_mul);
2570                ret = __pvclock_read_cycles(&hv_clock, rdtsc());
2571        } else
2572                ret = get_kvmclock_base_ns() + ka->kvmclock_offset;
2573
2574        put_cpu();
2575
2576        return ret;
2577}
2578
2579static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
2580{
2581        struct kvm_vcpu_arch *vcpu = &v->arch;
2582        struct pvclock_vcpu_time_info guest_hv_clock;
2583
2584        if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
2585                &guest_hv_clock, sizeof(guest_hv_clock))))
2586                return;
2587
2588        /* This VCPU is paused, but it's legal for a guest to read another
2589         * VCPU's kvmclock, so we really have to follow the specification where
2590         * it says that version is odd if data is being modified, and even after
2591         * it is consistent.
2592         *
2593         * Version field updates must be kept separate.  This is because
2594         * kvm_write_guest_cached might use a "rep movs" instruction, and
2595         * writes within a string instruction are weakly ordered.  So there
2596         * are three writes overall.
2597         *
2598         * As a small optimization, only write the version field in the first
2599         * and third write.  The vcpu->pv_time cache is still valid, because the
2600         * version field is the first in the struct.
2601         */
2602        BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
2603
2604        if (guest_hv_clock.version & 1)
2605                ++guest_hv_clock.version;  /* first time write, random junk */
2606
2607        vcpu->hv_clock.version = guest_hv_clock.version + 1;
2608        kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
2609                                &vcpu->hv_clock,
2610                                sizeof(vcpu->hv_clock.version));
2611
2612        smp_wmb();
2613
2614        /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
2615        vcpu->hv_clock.flags |= (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
2616
2617        if (vcpu->pvclock_set_guest_stopped_request) {
2618                vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED;
2619                vcpu->pvclock_set_guest_stopped_request = false;
2620        }
2621
2622        trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
2623
2624        kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
2625                                &vcpu->hv_clock,
2626                                sizeof(vcpu->hv_clock));
2627
2628        smp_wmb();
2629
2630        vcpu->hv_clock.version++;
2631        kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
2632                                &vcpu->hv_clock,
2633                                sizeof(vcpu->hv_clock.version));
2634}
2635
2636static int kvm_guest_time_update(struct kvm_vcpu *v)
2637{
2638        unsigned long flags, tgt_tsc_khz;
2639        struct kvm_vcpu_arch *vcpu = &v->arch;
2640        struct kvm_arch *ka = &v->kvm->arch;
2641        s64 kernel_ns;
2642        u64 tsc_timestamp, host_tsc;
2643        u8 pvclock_flags;
2644        bool use_master_clock;
2645
2646        kernel_ns = 0;
2647        host_tsc = 0;
2648
2649        /*
2650         * If the host uses TSC clock, then passthrough TSC as stable
2651         * to the guest.
2652         */
2653        spin_lock(&ka->pvclock_gtod_sync_lock);
2654        use_master_clock = ka->use_master_clock;
2655        if (use_master_clock) {
2656                host_tsc = ka->master_cycle_now;
2657                kernel_ns = ka->master_kernel_ns;
2658        }
2659        spin_unlock(&ka->pvclock_gtod_sync_lock);
2660
2661        /* Keep irq disabled to prevent changes to the clock */
2662        local_irq_save(flags);
2663        tgt_tsc_khz = __this_cpu_read(cpu_tsc_khz);
2664        if (unlikely(tgt_tsc_khz == 0)) {
2665                local_irq_restore(flags);
2666                kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
2667                return 1;
2668        }
2669        if (!use_master_clock) {
2670                host_tsc = rdtsc();
2671                kernel_ns = get_kvmclock_base_ns();
2672        }
2673
2674        tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);
2675
2676        /*
2677         * We may have to catch up the TSC to match elapsed wall clock
2678         * time for two reasons, even if kvmclock is used.
2679         *   1) CPU could have been running below the maximum TSC rate
2680         *   2) Broken TSC compensation resets the base at each VCPU
2681         *      entry to avoid unknown leaps of TSC even when running
2682         *      again on the same CPU.  This may cause apparent elapsed
2683         *      time to disappear, and the guest to stand still or run
2684         *      very slowly.
2685         */
2686        if (vcpu->tsc_catchup) {
2687                u64 tsc = compute_guest_tsc(v, kernel_ns);
2688                if (tsc > tsc_timestamp) {
2689                        adjust_tsc_offset_guest(v, tsc - tsc_timestamp);
2690                        tsc_timestamp = tsc;
2691                }
2692        }
2693
2694        local_irq_restore(flags);
2695
2696        /* With all the info we got, fill in the values */
2697
2698        if (kvm_has_tsc_control)
2699                tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz);
2700
2701        if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
2702                kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL,
2703                                   &vcpu->hv_clock.tsc_shift,
2704                                   &vcpu->hv_clock.tsc_to_system_mul);
2705                vcpu->hw_tsc_khz = tgt_tsc_khz;
2706        }
2707
2708        vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
2709        vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
2710        vcpu->last_guest_tsc = tsc_timestamp;
2711
2712        /* If the host uses TSC clocksource, then it is stable */
2713        pvclock_flags = 0;
2714        if (use_master_clock)
2715                pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
2716
2717        vcpu->hv_clock.flags = pvclock_flags;
2718
2719        if (vcpu->pv_time_enabled)
2720                kvm_setup_pvclock_page(v);
2721        if (v == kvm_get_vcpu(v->kvm, 0))
2722                kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
2723        return 0;
2724}
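
/*
 * For reference, the guest consumes the fields filled in above roughly as
 * follows (pseudocode mirroring __pvclock_read_cycles()):
 *
 *	delta = rdtsc() - hv_clock.tsc_timestamp;
 *	if (hv_clock.tsc_shift >= 0)
 *		delta <<= hv_clock.tsc_shift;
 *	else
 *		delta >>= -hv_clock.tsc_shift;
 *	ns = hv_clock.system_time + ((delta * hv_clock.tsc_to_system_mul) >> 32);
 */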
2725
2726/*
2727 * kvmclock updates which are isolated to a given vcpu, such as
2728 * vcpu->cpu migration, should not allow system_timestamp from
2729 * the rest of the vcpus to remain static. Otherwise ntp frequency
2730 * correction applies to one vcpu's system_timestamp but not
2731 * the others.
2732 *
2733 * So in those cases, request a kvmclock update for all vcpus.
2734 * We need to rate-limit these requests though, as they can
2735 * considerably slow guests that have a large number of vcpus.
2736 * The time for a remote vcpu to update its kvmclock is bound
2737 * by the delay we use to rate-limit the updates.
2738 */
2739
2740#define KVMCLOCK_UPDATE_DELAY msecs_to_jiffies(100)
2741
2742static void kvmclock_update_fn(struct work_struct *work)
2743{
2744        int i;
2745        struct delayed_work *dwork = to_delayed_work(work);
2746        struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
2747                                           kvmclock_update_work);
2748        struct kvm *kvm = container_of(ka, struct kvm, arch);
2749        struct kvm_vcpu *vcpu;
2750
2751        kvm_for_each_vcpu(i, vcpu, kvm) {
2752                kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
2753                kvm_vcpu_kick(vcpu);
2754        }
2755}
2756
2757static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
2758{
2759        struct kvm *kvm = v->kvm;
2760
2761        kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
2762        schedule_delayed_work(&kvm->arch.kvmclock_update_work,
2763                                        KVMCLOCK_UPDATE_DELAY);
2764}
2765
2766#define KVMCLOCK_SYNC_PERIOD (300 * HZ)
2767
2768static void kvmclock_sync_fn(struct work_struct *work)
2769{
2770        struct delayed_work *dwork = to_delayed_work(work);
2771        struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
2772                                           kvmclock_sync_work);
2773        struct kvm *kvm = container_of(ka, struct kvm, arch);
2774
2775        if (!kvmclock_periodic_sync)
2776                return;
2777
2778        schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0);
2779        schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
2780                                        KVMCLOCK_SYNC_PERIOD);
2781}
2782
2783/*
2784 * On AMD, HWCR[McStatusWrEn] controls whether setting MCi_STATUS results in #GP.
2785 */
2786static bool can_set_mci_status(struct kvm_vcpu *vcpu)
2787{
2788        /* McStatusWrEn enabled? */
2789        if (guest_cpuid_is_amd_or_hygon(vcpu))
2790                return !!(vcpu->arch.msr_hwcr & BIT_ULL(18));
2791
2792        return false;
2793}
2794
2795static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2796{
2797        u64 mcg_cap = vcpu->arch.mcg_cap;
2798        unsigned bank_num = mcg_cap & 0xff;
2799        u32 msr = msr_info->index;
2800        u64 data = msr_info->data;
2801
2802        switch (msr) {
2803        case MSR_IA32_MCG_STATUS:
2804                vcpu->arch.mcg_status = data;
2805                break;
2806        case MSR_IA32_MCG_CTL:
2807                if (!(mcg_cap & MCG_CTL_P) &&
2808                    (data || !msr_info->host_initiated))
2809                        return 1;
2810                if (data != 0 && data != ~(u64)0)
2811                        return 1;
2812                vcpu->arch.mcg_ctl = data;
2813                break;
2814        default:
2815                if (msr >= MSR_IA32_MC0_CTL &&
2816                    msr < MSR_IA32_MCx_CTL(bank_num)) {
2817                        u32 offset = array_index_nospec(
2818                                msr - MSR_IA32_MC0_CTL,
2819                                MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
2820
2821                        /* Only 0 or all 1s can be written to IA32_MCi_CTL.
2822                         * Some Linux kernels, though, clear bit 10 in bank 4 to
2823                         * work around a BIOS/GART TBL issue on AMD K8s; ignore
2824                         * such writes to avoid an uncaught #GP in the guest.
2825                         */
2826                        if ((offset & 0x3) == 0 &&
2827                            data != 0 && (data | (1 << 10)) != ~(u64)0)
2828                                return -1;
2829
2830                        /* MCi_STATUS */
2831                        if (!msr_info->host_initiated &&
2832                            (offset & 0x3) == 1 && data != 0) {
2833                                if (!can_set_mci_status(vcpu))
2834                                        return -1;
2835                        }
2836
2837                        vcpu->arch.mce_banks[offset] = data;
2838                        break;
2839                }
2840                return 1;
2841        }
2842        return 0;
2843}
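
/*
 * Worked example of the offset decoding above: MSR_IA32_MC1_STATUS is
 * MSR_IA32_MC0_CTL + 4 * 1 + 1, so offset = 5; offset / 4 identifies bank 1
 * and offset & 0x3 == 1 selects the MCi_STATUS slot within
 * vcpu->arch.mce_banks[].
 */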
2844
2845static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
2846{
2847        struct kvm *kvm = vcpu->kvm;
2848        int lm = is_long_mode(vcpu);
2849        u8 *blob_addr = lm ? (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_64
2850                : (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32;
2851        u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
2852                : kvm->arch.xen_hvm_config.blob_size_32;
2853        u32 page_num = data & ~PAGE_MASK;
2854        u64 page_addr = data & PAGE_MASK;
2855        u8 *page;
2856
2857        if (page_num >= blob_size)
2858                return 1;
2859
2860        page = memdup_user(blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE);
2861        if (IS_ERR(page))
2862                return PTR_ERR(page);
2863
2864        if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE)) {
2865                kfree(page);
2866                return 1;
2867        }
2868        return 0;
2869}
2870
2871static inline bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu)
2872{
2873        u64 mask = KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT;
2874
2875        return (vcpu->arch.apf.msr_en_val & mask) == mask;
2876}
2877
2878static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
2879{
2880        gpa_t gpa = data & ~0x3f;
2881
2882        /* Bits 4:5 are reserved and should be zero */
2883        if (data & 0x30)
2884                return 1;
2885
2886        if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_VMEXIT) &&
2887            (data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT))
2888                return 1;
2889
2890        if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT) &&
2891            (data & KVM_ASYNC_PF_DELIVERY_AS_INT))
2892                return 1;
2893
2894        if (!lapic_in_kernel(vcpu))
2895                return data ? 1 : 0;
2896
2897        vcpu->arch.apf.msr_en_val = data;
2898
2899        if (!kvm_pv_async_pf_enabled(vcpu)) {
2900                kvm_clear_async_pf_completion_queue(vcpu);
2901                kvm_async_pf_hash_reset(vcpu);
2902                return 0;
2903        }
2904
2905        if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa,
2906                                        sizeof(u64)))
2907                return 1;
2908
2909        vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
2910        vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
2911
2912        kvm_async_pf_wakeup_all(vcpu);
2913
2914        return 0;
2915}
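
/*
 * Summary of the MSR layout consumed above: bit 0 enables async page faults,
 * bit 1 (KVM_ASYNC_PF_SEND_ALWAYS) requests notifications even while the
 * guest is in kernel mode, bit 2 selects delivery as a #PF-style VM exit for
 * nested guests, bit 3 selects interrupt-based delivery, bits 4:5 are
 * reserved, and the remaining bits form the 64-byte aligned GPA of the
 * shared data area.
 */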
2916
2917static int kvm_pv_enable_async_pf_int(struct kvm_vcpu *vcpu, u64 data)
2918{
2919        /* Bits 8-63 are reserved */
2920        if (data >> 8)
2921                return 1;
2922
2923        if (!lapic_in_kernel(vcpu))
2924                return 1;
2925
2926        vcpu->arch.apf.msr_int_val = data;
2927
2928        vcpu->arch.apf.vec = data & KVM_ASYNC_PF_VEC_MASK;
2929
2930        return 0;
2931}
2932
2933static void kvmclock_reset(struct kvm_vcpu *vcpu)
2934{
2935        vcpu->arch.pv_time_enabled = false;
2936        vcpu->arch.time = 0;
2937}
2938
2939static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu)
2940{
2941        ++vcpu->stat.tlb_flush;
2942        kvm_x86_ops.tlb_flush_all(vcpu);
2943}
2944
2945static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
2946{
2947        ++vcpu->stat.tlb_flush;
2948        kvm_x86_ops.tlb_flush_guest(vcpu);
2949}
2950
2951static void record_steal_time(struct kvm_vcpu *vcpu)
2952{
2953        struct kvm_host_map map;
2954        struct kvm_steal_time *st;
2955
2956        if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
2957                return;
2958
2959        /* -EAGAIN is returned in atomic context so we can just return. */
2960        if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT,
2961                        &map, &vcpu->arch.st.cache, false))
2962                return;
2963
2964        st = map.hva +
2965                offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
2966
2967        /*
2968         * Doing a TLB flush here, on the guest's behalf, can avoid
2969         * expensive IPIs.
2970         */
2971        if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) {
2972                trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
2973                                       st->preempted & KVM_VCPU_FLUSH_TLB);
2974                if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB)
2975                        kvm_vcpu_flush_tlb_guest(vcpu);
2976        }
2977
2978        vcpu->arch.st.preempted = 0;
2979
2980        if (st->version & 1)
2981                st->version += 1;  /* first time write, random junk */
2982
2983        st->version += 1;
2984
2985        smp_wmb();
2986
2987        st->steal += current->sched_info.run_delay -
2988                vcpu->arch.st.last_steal;
2989        vcpu->arch.st.last_steal = current->sched_info.run_delay;
2990
2991        smp_wmb();
2992
2993        st->version += 1;
2994
2995        kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, false);
2996}
2997
2998int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2999{
3000        bool pr = false;
3001        u32 msr = msr_info->index;
3002        u64 data = msr_info->data;
3003
3004        switch (msr) {
3005        case MSR_AMD64_NB_CFG:
3006        case MSR_IA32_UCODE_WRITE:
3007        case MSR_VM_HSAVE_PA:
3008        case MSR_AMD64_PATCH_LOADER:
3009        case MSR_AMD64_BU_CFG2:
3010        case MSR_AMD64_DC_CFG:
3011        case MSR_F15H_EX_CFG:
3012                break;
3013
3014        case MSR_IA32_UCODE_REV:
3015                if (msr_info->host_initiated)
3016                        vcpu->arch.microcode_version = data;
3017                break;
3018        case MSR_IA32_ARCH_CAPABILITIES:
3019                if (!msr_info->host_initiated)
3020                        return 1;
3021                vcpu->arch.arch_capabilities = data;
3022                break;
3023        case MSR_IA32_PERF_CAPABILITIES: {
3024                struct kvm_msr_entry msr_ent = {.index = msr, .data = 0};
3025
3026                if (!msr_info->host_initiated)
3027                        return 1;
3028                if (guest_cpuid_has(vcpu, X86_FEATURE_PDCM) && kvm_get_msr_feature(&msr_ent))
3029                        return 1;
3030                if (data & ~msr_ent.data)
3031                        return 1;
3032
3033                vcpu->arch.perf_capabilities = data;
3034
3035                return 0;
3036                }
3037        case MSR_EFER:
3038                return set_efer(vcpu, msr_info);
3039        case MSR_K7_HWCR:
3040                data &= ~(u64)0x40;     /* ignore flush filter disable */
3041                data &= ~(u64)0x100;    /* ignore ignne emulation enable */
3042                data &= ~(u64)0x8;      /* ignore TLB cache disable */
3043
3044                /* Handle McStatusWrEn */
3045                if (data == BIT_ULL(18)) {
3046                        vcpu->arch.msr_hwcr = data;
3047                } else if (data != 0) {
3048                        vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
3049                                    data);
3050                        return 1;
3051                }
3052                break;
3053        case MSR_FAM10H_MMIO_CONF_BASE:
3054                if (data != 0) {
3055                        vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
3056                                    "0x%llx\n", data);
3057                        return 1;
3058                }
3059                break;
3060        case MSR_IA32_DEBUGCTLMSR:
3061                if (!data) {
3062                        /* We support the non-activated case already */
3063                        break;
3064                } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
3065                        /* Values other than LBR and BTF are vendor-specific,
3066                           thus reserved and should throw a #GP */
3067                        return 1;
3068                } else if (report_ignored_msrs)
3069                        vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
3070                                    __func__, data);
3071                break;
3072        case 0x200 ... 0x2ff:
3073                return kvm_mtrr_set_msr(vcpu, msr, data);
3074        case MSR_IA32_APICBASE:
3075                return kvm_set_apic_base(vcpu, msr_info);
3076        case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
3077                return kvm_x2apic_msr_write(vcpu, msr, data);
3078        case MSR_IA32_TSCDEADLINE:
3079                kvm_set_lapic_tscdeadline_msr(vcpu, data);
3080                break;
3081        case MSR_IA32_TSC_ADJUST:
3082                if (guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST)) {
3083                        if (!msr_info->host_initiated) {
3084                                s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr;
3085                                adjust_tsc_offset_guest(vcpu, adj);
3086                        }
3087                        vcpu->arch.ia32_tsc_adjust_msr = data;
3088                }
3089                break;
3090        case MSR_IA32_MISC_ENABLE:
3091                if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT) &&
3092                    ((vcpu->arch.ia32_misc_enable_msr ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) {
3093                        if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3))
3094                                return 1;
3095                        vcpu->arch.ia32_misc_enable_msr = data;
3096                        kvm_update_cpuid_runtime(vcpu);
3097                } else {
3098                        vcpu->arch.ia32_misc_enable_msr = data;
3099                }
3100                break;
3101        case MSR_IA32_SMBASE:
3102                if (!msr_info->host_initiated)
3103                        return 1;
3104                vcpu->arch.smbase = data;
3105                break;
3106        case MSR_IA32_POWER_CTL:
3107                vcpu->arch.msr_ia32_power_ctl = data;
3108                break;
3109        case MSR_IA32_TSC:
3110                if (msr_info->host_initiated) {
3111                        kvm_synchronize_tsc(vcpu, data);
3112                } else {
3113                        u64 adj = kvm_compute_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset;
3114                        adjust_tsc_offset_guest(vcpu, adj);
3115                        vcpu->arch.ia32_tsc_adjust_msr += adj;
3116                }
3117                break;
3118        case MSR_IA32_XSS:
3119                if (!msr_info->host_initiated &&
3120                    !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
3121                        return 1;
3122                /*
3123                 * KVM supports exposing PT to the guest, but does not support
3124                 * IA32_XSS[bit 8]. Guests have to use RDMSR/WRMSR rather than
3125                 * XSAVES/XRSTORS to save/restore PT MSRs.
3126                 */
3127                if (data & ~supported_xss)
3128                        return 1;
3129                vcpu->arch.ia32_xss = data;
3130                break;
3131        case MSR_SMI_COUNT:
3132                if (!msr_info->host_initiated)
3133                        return 1;
3134                vcpu->arch.smi_count = data;
3135                break;
3136        case MSR_KVM_WALL_CLOCK_NEW:
3137                if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
3138                        return 1;
3139
3140                kvm_write_wall_clock(vcpu->kvm, data);
3141                break;
3142        case MSR_KVM_WALL_CLOCK:
3143                if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
3144                        return 1;
3145
3146                kvm_write_wall_clock(vcpu->kvm, data);
3147                break;
3148        case MSR_KVM_SYSTEM_TIME_NEW:
3149                if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
3150                        return 1;
3151
3152                kvm_write_system_time(vcpu, data, false, msr_info->host_initiated);
3153                break;
3154        case MSR_KVM_SYSTEM_TIME:
3155                if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
3156                        return 1;
3157
3158                kvm_write_system_time(vcpu, data, true,  msr_info->host_initiated);
3159                break;
3160        case MSR_KVM_ASYNC_PF_EN:
3161                if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
3162                        return 1;
3163
3164                if (kvm_pv_enable_async_pf(vcpu, data))
3165                        return 1;
3166                break;
3167        case MSR_KVM_ASYNC_PF_INT:
3168                if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
3169                        return 1;
3170
3171                if (kvm_pv_enable_async_pf_int(vcpu, data))
3172                        return 1;
3173                break;
3174        case MSR_KVM_ASYNC_PF_ACK:
3175                if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
3176                        return 1;
3177                if (data & 0x1) {
3178                        vcpu->arch.apf.pageready_pending = false;
3179                        kvm_check_async_pf_completion(vcpu);
3180                }
3181                break;
3182        case MSR_KVM_STEAL_TIME:
3183                if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME))
3184                        return 1;
3185
3186                if (unlikely(!sched_info_on()))
3187                        return 1;
3188
3189                if (data & KVM_STEAL_RESERVED_MASK)
3190                        return 1;
3191
3192                vcpu->arch.st.msr_val = data;
3193
3194                if (!(data & KVM_MSR_ENABLED))
3195                        break;
3196
3197                kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
3198
3199                break;
3200        case MSR_KVM_PV_EOI_EN:
3201                if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI))
3202                        return 1;
3203
3204                if (kvm_lapic_enable_pv_eoi(vcpu, data, sizeof(u8)))
3205                        return 1;
3206                break;
3207
3208        case MSR_KVM_POLL_CONTROL:
3209                if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL))
3210                        return 1;
3211
3212                /* only enable bit supported */
3213                if (data & (-1ULL << 1))
3214                        return 1;
3215
3216                vcpu->arch.msr_kvm_poll_control = data;
3217                break;
3218
3219        case MSR_IA32_MCG_CTL:
3220        case MSR_IA32_MCG_STATUS:
3221        case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
3222                return set_msr_mce(vcpu, msr_info);
3223
3224        case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
3225        case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
3226                pr = true;
3227                fallthrough;
3228        case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
3229        case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
3230                if (kvm_pmu_is_valid_msr(vcpu, msr))
3231                        return kvm_pmu_set_msr(vcpu, msr_info);
3232
3233                if (pr || data != 0)
3234                        vcpu_unimpl(vcpu, "disabled perfctr wrmsr: "
3235                                    "0x%x data 0x%llx\n", msr, data);
3236                break;
3237        case MSR_K7_CLK_CTL:
3238                /*
3239                 * Ignore all writes to this no longer documented MSR.
3240                 * Writes are only relevant for old K7 processors,
3241                 * all pre-dating SVM, but a recommended workaround from
3242                 * AMD for these chips. It is possible to specify the
3243                 * affected processor models on the command line, so the
3244                 * guest may issue the write anyway; simply ignore it.
3245                 */
3246                break;
3247        case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
3248        case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
3249        case HV_X64_MSR_SYNDBG_OPTIONS:
3250        case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
3251        case HV_X64_MSR_CRASH_CTL:
3252        case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
3253        case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
3254        case HV_X64_MSR_TSC_EMULATION_CONTROL:
3255        case HV_X64_MSR_TSC_EMULATION_STATUS:
3256                return kvm_hv_set_msr_common(vcpu, msr, data,
3257                                             msr_info->host_initiated);
3258        case MSR_IA32_BBL_CR_CTL3:
3259                /* Drop writes to this legacy MSR -- see rdmsr
3260                 * counterpart for further detail.
3261                 */
3262                if (report_ignored_msrs)
3263                        vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n",
3264                                msr, data);
3265                break;
3266        case MSR_AMD64_OSVW_ID_LENGTH:
3267                if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
3268                        return 1;
3269                vcpu->arch.osvw.length = data;
3270                break;
3271        case MSR_AMD64_OSVW_STATUS:
3272                if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
3273                        return 1;
3274                vcpu->arch.osvw.status = data;
3275                break;
3276        case MSR_PLATFORM_INFO:
3277                if (!msr_info->host_initiated ||
3278                    (!(data & MSR_PLATFORM_INFO_CPUID_FAULT) &&
3279                     cpuid_fault_enabled(vcpu)))
3280                        return 1;
3281                vcpu->arch.msr_platform_info = data;
3282                break;
3283        case MSR_MISC_FEATURES_ENABLES:
3284                if (data & ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT ||
3285                    (data & MSR_MISC_FEATURES_ENABLES_CPUID_FAULT &&
3286                     !supports_cpuid_fault(vcpu)))
3287                        return 1;
3288                vcpu->arch.msr_misc_features_enables = data;
3289                break;
3290        default:
3291                if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
3292                        return xen_hvm_config(vcpu, data);
3293                if (kvm_pmu_is_valid_msr(vcpu, msr))
3294                        return kvm_pmu_set_msr(vcpu, msr_info);
3295                return KVM_MSR_RET_INVALID;
3296        }
3297        return 0;
3298}
3299EXPORT_SYMBOL_GPL(kvm_set_msr_common);
3300
3301static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
3302{
3303        u64 data;
3304        u64 mcg_cap = vcpu->arch.mcg_cap;
3305        unsigned bank_num = mcg_cap & 0xff;
3306
3307        switch (msr) {
3308        case MSR_IA32_P5_MC_ADDR:
3309        case MSR_IA32_P5_MC_TYPE:
3310                data = 0;
3311                break;
3312        case MSR_IA32_MCG_CAP:
3313                data = vcpu->arch.mcg_cap;
3314                break;
3315        case MSR_IA32_MCG_CTL:
3316                if (!(mcg_cap & MCG_CTL_P) && !host)
3317                        return 1;
3318                data = vcpu->arch.mcg_ctl;
3319                break;
3320        case MSR_IA32_MCG_STATUS:
3321                data = vcpu->arch.mcg_status;
3322                break;
3323        default:
3324                if (msr >= MSR_IA32_MC0_CTL &&
3325                    msr < MSR_IA32_MCx_CTL(bank_num)) {
3326                        u32 offset = array_index_nospec(
3327                                msr - MSR_IA32_MC0_CTL,
3328                                MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
3329
3330                        data = vcpu->arch.mce_banks[offset];
3331                        break;
3332                }
3333                return 1;
3334        }
3335        *pdata = data;
3336        return 0;
3337}
3338
3339int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
3340{
3341        switch (msr_info->index) {
3342        case MSR_IA32_PLATFORM_ID:
3343        case MSR_IA32_EBL_CR_POWERON:
3344        case MSR_IA32_DEBUGCTLMSR:
3345        case MSR_IA32_LASTBRANCHFROMIP:
3346        case MSR_IA32_LASTBRANCHTOIP:
3347        case MSR_IA32_LASTINTFROMIP:
3348        case MSR_IA32_LASTINTTOIP:
3349        case MSR_K8_SYSCFG:
3350        case MSR_K8_TSEG_ADDR:
3351        case MSR_K8_TSEG_MASK:
3352        case MSR_VM_HSAVE_PA:
3353        case MSR_K8_INT_PENDING_MSG:
3354        case MSR_AMD64_NB_CFG:
3355        case MSR_FAM10H_MMIO_CONF_BASE:
3356        case MSR_AMD64_BU_CFG2:
3357        case MSR_IA32_PERF_CTL:
3358        case MSR_AMD64_DC_CFG:
3359        case MSR_F15H_EX_CFG:
3360        /*
3361         * Intel Sandy Bridge CPUs must support the RAPL (running average power
3362         * limit) MSRs. Just return 0, as we do not want to expose the host
3363         * data here. Do not conditionalize this on CPUID, as KVM does not do
3364         * so for existing CPU-specific MSRs.
3365         */
3366        case MSR_RAPL_POWER_UNIT:
3367        case MSR_PP0_ENERGY_STATUS:     /* Power plane 0 (core) */
3368        case MSR_PP1_ENERGY_STATUS:     /* Power plane 1 (graphics uncore) */
3369        case MSR_PKG_ENERGY_STATUS:     /* Total package */
3370        case MSR_DRAM_ENERGY_STATUS:    /* DRAM controller */
3371                msr_info->data = 0;
3372                break;
3373        case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5:
3374        case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
3375        case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
3376        case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
3377        case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
3378                if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
3379                        return kvm_pmu_get_msr(vcpu, msr_info);
3380                msr_info->data = 0;
3381                break;
3382        case MSR_IA32_UCODE_REV:
3383                msr_info->data = vcpu->arch.microcode_version;
3384                break;
3385        case MSR_IA32_ARCH_CAPABILITIES:
3386                if (!msr_info->host_initiated &&
3387                    !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES))
3388                        return 1;
3389                msr_info->data = vcpu->arch.arch_capabilities;
3390                break;
3391        case MSR_IA32_PERF_CAPABILITIES:
3392                if (!msr_info->host_initiated &&
3393                    !guest_cpuid_has(vcpu, X86_FEATURE_PDCM))
3394                        return 1;
3395                msr_info->data = vcpu->arch.perf_capabilities;
3396                break;
3397        case MSR_IA32_POWER_CTL:
3398                msr_info->data = vcpu->arch.msr_ia32_power_ctl;
3399                break;
3400        case MSR_IA32_TSC: {
3401                /*
3402                 * Intel SDM states that MSR_IA32_TSC read adds the TSC offset
3403                 * even when not intercepted. AMD manual doesn't explicitly
3404                 * state this but appears to behave the same.
3405                 *
3406                 * On userspace reads and writes, however, we unconditionally
3407                 * return L1's TSC value to ensure backwards-compatible
3408                 * behavior for migration.
3409                 */
3410                u64 tsc_offset = msr_info->host_initiated ? vcpu->arch.l1_tsc_offset :
3411                                                            vcpu->arch.tsc_offset;
3412
3413                msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + tsc_offset;
3414                break;
3415        }
3416        case MSR_MTRRcap:
3417        case 0x200 ... 0x2ff:
3418                return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
3419        case 0xcd: /* fsb frequency */
3420                msr_info->data = 3;
3421                break;
3422                /*
3423                 * MSR_EBC_FREQUENCY_ID
3424                 * Conservative value valid for even the basic CPU models.
3425                 * Models 0,1: 000 in bits 23:21 indicating a bus speed of
3426                 * 100MHz, model 2: 000 in bits 18:16 indicating 100MHz,
3427                 * and 266MHz for models 3 and 4. Set the Core Clock
3428                 * Frequency to System Bus Frequency Ratio (bits 31:24)
3429                 * to 1 even though it is only valid for CPU models > 2;
3430                 * otherwise guests may end up dividing or multiplying
3431                 * by zero.
3432                 */
3433        case MSR_EBC_FREQUENCY_ID:
3434                msr_info->data = 1 << 24;
3435                break;
3436        case MSR_IA32_APICBASE:
3437                msr_info->data = kvm_get_apic_base(vcpu);
3438                break;
3439        case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
3440                return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data);
3441        case MSR_IA32_TSCDEADLINE:
3442                msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu);
3443                break;
3444        case MSR_IA32_TSC_ADJUST:
3445                msr_info->data = (u64)vcpu->arch.ia32_tsc_adjust_msr;
3446                break;
3447        case MSR_IA32_MISC_ENABLE:
3448                msr_info->data = vcpu->arch.ia32_misc_enable_msr;
3449                break;
3450        case MSR_IA32_SMBASE:
3451                if (!msr_info->host_initiated)
3452                        return 1;
3453                msr_info->data = vcpu->arch.smbase;
3454                break;
3455        case MSR_SMI_COUNT:
3456                msr_info->data = vcpu->arch.smi_count;
3457                break;
3458        case MSR_IA32_PERF_STATUS:
3459                /* TSC increment by tick */
3460                msr_info->data = 1000ULL;
3461                /* CPU multiplier */
3462                msr_info->data |= (4ULL << 40);
3463                break;
3464        case MSR_EFER:
3465                msr_info->data = vcpu->arch.efer;
3466                break;
3467        case MSR_KVM_WALL_CLOCK:
3468                if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
3469                        return 1;
3470
3471                msr_info->data = vcpu->kvm->arch.wall_clock;
3472                break;
3473        case MSR_KVM_WALL_CLOCK_NEW:
3474                if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
3475                        return 1;
3476
3477                msr_info->data = vcpu->kvm->arch.wall_clock;
3478                break;
3479        case MSR_KVM_SYSTEM_TIME:
3480                if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
3481                        return 1;
3482
3483                msr_info->data = vcpu->arch.time;
3484                break;
3485        case MSR_KVM_SYSTEM_TIME_NEW:
3486                if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
3487                        return 1;
3488
3489                msr_info->data = vcpu->arch.time;
3490                break;
3491        case MSR_KVM_ASYNC_PF_EN:
3492                if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
3493                        return 1;
3494
3495                msr_info->data = vcpu->arch.apf.msr_en_val;
3496                break;
3497        case MSR_KVM_ASYNC_PF_INT:
3498                if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
3499                        return 1;
3500
3501                msr_info->data = vcpu->arch.apf.msr_int_val;
3502                break;
3503        case MSR_KVM_ASYNC_PF_ACK:
3504                if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
3505                        return 1;
3506
3507                msr_info->data = 0;
3508                break;
3509        case MSR_KVM_STEAL_TIME:
3510                if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME))
3511                        return 1;
3512
3513                msr_info->data = vcpu->arch.st.msr_val;
3514                break;
3515        case MSR_KVM_PV_EOI_EN:
3516                if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI))
3517                        return 1;
3518
3519                msr_info->data = vcpu->arch.pv_eoi.msr_val;
3520                break;
3521        case MSR_KVM_POLL_CONTROL:
3522                if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL))
3523                        return 1;
3524
3525                msr_info->data = vcpu->arch.msr_kvm_poll_control;
3526                break;
3527        case MSR_IA32_P5_MC_ADDR:
3528        case MSR_IA32_P5_MC_TYPE:
3529        case MSR_IA32_MCG_CAP:
3530        case MSR_IA32_MCG_CTL:
3531        case MSR_IA32_MCG_STATUS:
3532        case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
3533                return get_msr_mce(vcpu, msr_info->index, &msr_info->data,
3534                                   msr_info->host_initiated);
3535        case MSR_IA32_XSS:
3536                if (!msr_info->host_initiated &&
3537                    !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
3538                        return 1;
3539                msr_info->data = vcpu->arch.ia32_xss;
3540                break;
3541        case MSR_K7_CLK_CTL:
3542                /*
3543                 * Provide the expected ramp-up count for K7. All other
3544                 * fields are set to zero, indicating minimum divisors for
3545                 * every field.
3546                 *
3547                 * This prevents guest kernels on AMD host with CPU
3548                 * type 6, model 8 and higher from exploding due to
3549                 * the rdmsr failing.
3550                 */
3551                msr_info->data = 0x20000000;
3552                break;
3553        case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
3554        case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
3555        case HV_X64_MSR_SYNDBG_OPTIONS:
3556        case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
3557        case HV_X64_MSR_CRASH_CTL:
3558        case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
3559        case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
3560        case HV_X64_MSR_TSC_EMULATION_CONTROL:
3561        case HV_X64_MSR_TSC_EMULATION_STATUS:
3562                return kvm_hv_get_msr_common(vcpu,
3563                                             msr_info->index, &msr_info->data,
3564                                             msr_info->host_initiated);
3565        case MSR_IA32_BBL_CR_CTL3:
3566                /* This legacy MSR exists but isn't fully documented in current
3567                 * silicon.  It is however accessed by winxp in very narrow
3568                 * scenarios where it sets bit #19, itself documented as
3569                 * a "reserved" bit.  Best effort attempt to source coherent
3570                 * read data here should the balance of the register be
3571                 * interpreted by the guest:
3572                 *
3573                 * L2 cache control register 3: 64GB range, 256KB size,
3574                 * enabled, latency 0x1, configured
3575                 */
3576                msr_info->data = 0xbe702111;
3577                break;
3578        case MSR_AMD64_OSVW_ID_LENGTH:
3579                if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
3580                        return 1;
3581                msr_info->data = vcpu->arch.osvw.length;
3582                break;
3583        case MSR_AMD64_OSVW_STATUS:
3584                if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
3585                        return 1;
3586                msr_info->data = vcpu->arch.osvw.status;
3587                break;
3588        case MSR_PLATFORM_INFO:
3589                if (!msr_info->host_initiated &&
3590                    !vcpu->kvm->arch.guest_can_read_msr_platform_info)
3591                        return 1;
3592                msr_info->data = vcpu->arch.msr_platform_info;
3593                break;
3594        case MSR_MISC_FEATURES_ENABLES:
3595                msr_info->data = vcpu->arch.msr_misc_features_enables;
3596                break;
3597        case MSR_K7_HWCR:
3598                msr_info->data = vcpu->arch.msr_hwcr;
3599                break;
3600        default:
3601                if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
3602                        return kvm_pmu_get_msr(vcpu, msr_info);
3603                return KVM_MSR_RET_INVALID;
3604        }
3605        return 0;
3606}
3607EXPORT_SYMBOL_GPL(kvm_get_msr_common);
3608
3609/*
3610 * Read or write a bunch of MSRs. All parameters are kernel addresses.
3611 *
3612 * @return number of MSRs processed successfully.
3613 */
3614static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
3615                    struct kvm_msr_entry *entries,
3616                    int (*do_msr)(struct kvm_vcpu *vcpu,
3617                                  unsigned index, u64 *data))
3618{
3619        int i;
3620
3621        for (i = 0; i < msrs->nmsrs; ++i)
3622                if (do_msr(vcpu, entries[i].index, &entries[i].data))
3623                        break;
3624
3625        return i;
3626}
3627
3628/*
3629 * Read or write a bunch of MSRs. Parameters are user addresses.
3630 *
3631 * @return number of MSRs processed successfully.
3632 */
3633static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
3634                  int (*do_msr)(struct kvm_vcpu *vcpu,
3635                                unsigned index, u64 *data),
3636                  int writeback)
3637{
3638        struct kvm_msrs msrs;
3639        struct kvm_msr_entry *entries;
3640        int r, n;
3641        unsigned size;
3642
3643        r = -EFAULT;
3644        if (copy_from_user(&msrs, user_msrs, sizeof(msrs)))
3645                goto out;
3646
3647        r = -E2BIG;
3648        if (msrs.nmsrs >= MAX_IO_MSRS)
3649                goto out;
3650
3651        size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
3652        entries = memdup_user(user_msrs->entries, size);
3653        if (IS_ERR(entries)) {
3654                r = PTR_ERR(entries);
3655                goto out;
3656        }
3657
3658        r = n = __msr_io(vcpu, &msrs, entries, do_msr);
3659        if (r < 0)
3660                goto out_free;
3661
3662        r = -EFAULT;
3663        if (writeback && copy_to_user(user_msrs->entries, entries, size))
3664                goto out_free;
3665
3666        r = n;
3667
3668out_free:
3669        kfree(entries);
3670out:
3671        return r;
3672}
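
    /*
     * msr_io() backs the KVM_GET_MSRS/KVM_SET_MSRS ioctls.  A userspace
     * read of a single MSR looks roughly like this (vcpu_fd being the
     * caller's vCPU file descriptor):
     *
     *        struct {
     *                struct kvm_msrs hdr;
     *                struct kvm_msr_entry entries[1];
     *        } buf = { .hdr.nmsrs = 1, .entries[0].index = MSR_IA32_TSC };
     *
     *        ret = ioctl(vcpu_fd, KVM_GET_MSRS, &buf);
     *
     * The return value is the number of MSRs processed, so ret == 1 means
     * the read succeeded and buf.entries[0].data holds the value.
     */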
3673
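    /*
     * MWAIT can only be handed to the guest when the host MONITOR/MWAIT
     * implementation is usable (MWAIT present, no X86_BUG_MONITOR) and the
     * APIC timer is always running (ARAT), so that host timers keep firing
     * while a vCPU idles in a deep C-state.
     */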
3674static inline bool kvm_can_mwait_in_guest(void)
3675{
3676        return boot_cpu_has(X86_FEATURE_MWAIT) &&
3677                !boot_cpu_has_bug(X86_BUG_MONITOR) &&
3678                boot_cpu_has(X86_FEATURE_ARAT);
3679}
3680
3681int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
3682{
3683        int r = 0;
3684
3685        switch (ext) {
3686        case KVM_CAP_IRQCHIP:
3687        case KVM_CAP_HLT:
3688        case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
3689        case KVM_CAP_SET_TSS_ADDR:
3690        case KVM_CAP_EXT_CPUID:
3691        case KVM_CAP_EXT_EMUL_CPUID:
3692        case KVM_CAP_CLOCKSOURCE:
3693        case KVM_CAP_PIT:
3694        case KVM_CAP_NOP_IO_DELAY:
3695        case KVM_CAP_MP_STATE:
3696        case KVM_CAP_SYNC_MMU:
3697        case KVM_CAP_USER_NMI:
3698        case KVM_CAP_REINJECT_CONTROL:
3699        case KVM_CAP_IRQ_INJECT_STATUS:
3700        case KVM_CAP_IOEVENTFD:
3701        case KVM_CAP_IOEVENTFD_NO_LENGTH:
3702        case KVM_CAP_PIT2:
3703        case KVM_CAP_PIT_STATE2:
3704        case KVM_CAP_SET_IDENTITY_MAP_ADDR:
3705        case KVM_CAP_XEN_HVM:
3706        case KVM_CAP_VCPU_EVENTS:
3707        case KVM_CAP_HYPERV:
3708        case KVM_CAP_HYPERV_VAPIC:
3709        case KVM_CAP_HYPERV_SPIN:
3710        case KVM_CAP_HYPERV_SYNIC:
3711        case KVM_CAP_HYPERV_SYNIC2:
3712        case KVM_CAP_HYPERV_VP_INDEX:
3713        case KVM_CAP_HYPERV_EVENTFD:
3714        case KVM_CAP_HYPERV_TLBFLUSH:
3715        case KVM_CAP_HYPERV_SEND_IPI:
3716        case KVM_CAP_HYPERV_CPUID:
3717        case KVM_CAP_PCI_SEGMENT:
3718        case KVM_CAP_DEBUGREGS:
3719        case KVM_CAP_X86_ROBUST_SINGLESTEP:
3720        case KVM_CAP_XSAVE:
3721        case KVM_CAP_ASYNC_PF:
3722        case KVM_CAP_ASYNC_PF_INT:
3723        case KVM_CAP_GET_TSC_KHZ:
3724        case KVM_CAP_KVMCLOCK_CTRL:
3725        case KVM_CAP_READONLY_MEM:
3726        case KVM_CAP_HYPERV_TIME:
3727        case KVM_CAP_IOAPIC_POLARITY_IGNORED:
3728        case KVM_CAP_TSC_DEADLINE_TIMER:
3729        case KVM_CAP_DISABLE_QUIRKS:
3730        case KVM_CAP_SET_BOOT_CPU_ID:
3731        case KVM_CAP_SPLIT_IRQCHIP:
3732        case KVM_CAP_IMMEDIATE_EXIT:
3733        case KVM_CAP_PMU_EVENT_FILTER:
3734        case KVM_CAP_GET_MSR_FEATURES:
3735        case KVM_CAP_MSR_PLATFORM_INFO:
3736        case KVM_CAP_EXCEPTION_PAYLOAD:
3737        case KVM_CAP_SET_GUEST_DEBUG:
3738        case KVM_CAP_LAST_CPU:
3739        case KVM_CAP_X86_USER_SPACE_MSR:
3740        case KVM_CAP_X86_MSR_FILTER:
3741        case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
3742                r = 1;
3743                break;
3744        case KVM_CAP_SYNC_REGS:
3745                r = KVM_SYNC_X86_VALID_FIELDS;
3746                break;
3747        case KVM_CAP_ADJUST_CLOCK:
3748                r = KVM_CLOCK_TSC_STABLE;
3749                break;
3750        case KVM_CAP_X86_DISABLE_EXITS:
3751                r |=  KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE |
3752                      KVM_X86_DISABLE_EXITS_CSTATE;
3753                if (kvm_can_mwait_in_guest())
3754                        r |= KVM_X86_DISABLE_EXITS_MWAIT;
3755                break;
3756        case KVM_CAP_X86_SMM:
3757                /* SMBASE is usually relocated above 1M on modern chipsets,
3758                 * and SMM handlers might indeed rely on 4G segment limits,
3759                 * so do not report SMM to be available if real mode is
3760                 * emulated via vm86 mode.  Still, do not go to great lengths
3761                 * to avoid userspace's usage of the feature, because it is a
3762                 * fringe case that is not enabled except via specific settings
3763                 * of the module parameters.
3764                 */
3765                r = kvm_x86_ops.has_emulated_msr(MSR_IA32_SMBASE);
3766                break;
3767        case KVM_CAP_VAPIC:
3768                r = !kvm_x86_ops.cpu_has_accelerated_tpr();
3769                break;
3770        case KVM_CAP_NR_VCPUS:
3771                r = KVM_SOFT_MAX_VCPUS;
3772                break;
3773        case KVM_CAP_MAX_VCPUS:
3774                r = KVM_MAX_VCPUS;
3775                break;
3776        case KVM_CAP_MAX_VCPU_ID:
3777                r = KVM_MAX_VCPU_ID;
3778                break;
3779        case KVM_CAP_PV_MMU:    /* obsolete */
3780                r = 0;
3781                break;
3782        case KVM_CAP_MCE:
3783                r = KVM_MAX_MCE_BANKS;
3784                break;
3785        case KVM_CAP_XCRS:
3786                r = boot_cpu_has(X86_FEATURE_XSAVE);
3787                break;
3788        case KVM_CAP_TSC_CONTROL:
3789                r = kvm_has_tsc_control;
3790                break;
3791        case KVM_CAP_X2APIC_API:
3792                r = KVM_X2APIC_API_VALID_FLAGS;
3793                break;
3794        case KVM_CAP_NESTED_STATE:
3795                r = kvm_x86_ops.nested_ops->get_state ?
3796                        kvm_x86_ops.nested_ops->get_state(NULL, NULL, 0) : 0;
3797                break;
3798        case KVM_CAP_HYPERV_DIRECT_TLBFLUSH:
3799                r = kvm_x86_ops.enable_direct_tlbflush != NULL;
3800                break;
3801        case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
3802                r = kvm_x86_ops.nested_ops->enable_evmcs != NULL;
3803                break;
3804        case KVM_CAP_SMALLER_MAXPHYADDR:
3805                r = (int) allow_smaller_maxphyaddr;
3806                break;
3807        case KVM_CAP_STEAL_TIME:
3808                r = sched_info_on();
3809                break;
3810        default:
3811                break;
3812        }
3813        return r;
3814
3815}
3816
3817long kvm_arch_dev_ioctl(struct file *filp,
3818                        unsigned int ioctl, unsigned long arg)
3819{
3820        void __user *argp = (void __user *)arg;
3821        long r;
3822
3823        switch (ioctl) {
3824        case KVM_GET_MSR_INDEX_LIST: {
3825                struct kvm_msr_list __user *user_msr_list = argp;
3826                struct kvm_msr_list msr_list;
3827                unsigned n;
3828
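                    /*
                     * Userspace typically calls this twice: once with nmsrs == 0
                     * to learn the required count (which is copied back even when
                     * -E2BIG is returned), then again with a buffer of that size.
                     */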
3829                r = -EFAULT;
3830                if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list)))
3831                        goto out;
3832                n = msr_list.nmsrs;
3833                msr_list.nmsrs = num_msrs_to_save + num_emulated_msrs;
3834                if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list)))
3835                        goto out;
3836                r = -E2BIG;
3837                if (n < msr_list.nmsrs)
3838                        goto out;
3839                r = -EFAULT;
3840                if (copy_to_user(user_msr_list->indices, &msrs_to_save,
3841                                 num_msrs_to_save * sizeof(u32)))
3842                        goto out;
3843                if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
3844                                 &emulated_msrs,
3845                                 num_emulated_msrs * sizeof(u32)))
3846                        goto out;
3847                r = 0;
3848                break;
3849        }
3850        case KVM_GET_SUPPORTED_CPUID:
3851        case KVM_GET_EMULATED_CPUID: {
3852                struct kvm_cpuid2 __user *cpuid_arg = argp;
3853                struct kvm_cpuid2 cpuid;
3854
3855                r = -EFAULT;
3856                if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
3857                        goto out;
3858
3859                r = kvm_dev_ioctl_get_cpuid(&cpuid, cpuid_arg->entries,
3860                                            ioctl);
3861                if (r)
3862                        goto out;
3863
3864                r = -EFAULT;
3865                if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
3866                        goto out;
3867                r = 0;
3868                break;
3869        }
3870        case KVM_X86_GET_MCE_CAP_SUPPORTED:
3871                r = -EFAULT;
3872                if (copy_to_user(argp, &kvm_mce_cap_supported,
3873                                 sizeof(kvm_mce_cap_supported)))
3874                        goto out;
3875                r = 0;
3876                break;
3877        case KVM_GET_MSR_FEATURE_INDEX_LIST: {
3878                struct kvm_msr_list __user *user_msr_list = argp;
3879                struct kvm_msr_list msr_list;
3880                unsigned int n;
3881
3882                r = -EFAULT;
3883                if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list)))
3884                        goto out;
3885                n = msr_list.nmsrs;
3886                msr_list.nmsrs = num_msr_based_features;
3887                if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list)))
3888                        goto out;
3889                r = -E2BIG;
3890                if (n < msr_list.nmsrs)
3891                        goto out;
3892                r = -EFAULT;
3893                if (copy_to_user(user_msr_list->indices, &msr_based_features,
3894                                 num_msr_based_features * sizeof(u32)))
3895                        goto out;
3896                r = 0;
3897                break;
3898        }
3899        case KVM_GET_MSRS:
3900                r = msr_io(NULL, argp, do_get_msr_feature, 1);
3901                break;
3902        default:
3903                r = -EINVAL;
3904                break;
3905        }
3906out:
3907        return r;
3908}
3909
3910static void wbinvd_ipi(void *garbage)
3911{
3912        wbinvd();
3913}
3914
3915static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
3916{
3917        return kvm_arch_has_noncoherent_dma(vcpu->kvm);
3918}
3919
3920void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
3921{
3922        /* WBINVD may be executed by the guest; handle that here */
3923        if (need_emulate_wbinvd(vcpu)) {
3924                if (kvm_x86_ops.has_wbinvd_exit())
3925                        cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
3926                else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
3927                        smp_call_function_single(vcpu->cpu,
3928                                        wbinvd_ipi, NULL, 1);
3929        }
3930
3931        kvm_x86_ops.vcpu_load(vcpu, cpu);
3932
3933        /* Save host pkru register if supported */
3934        vcpu->arch.host_pkru = read_pkru();
3935
3936        /* Apply any externally detected TSC adjustments (due to suspend) */
3937        if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
3938                adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
3939                vcpu->arch.tsc_offset_adjustment = 0;
3940                kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
3941        }
3942
3943        if (unlikely(vcpu->cpu != cpu) || kvm_check_tsc_unstable()) {
3944                s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
3945                                rdtsc() - vcpu->arch.last_host_tsc;
3946                if (tsc_delta < 0)
3947                        mark_tsc_unstable("KVM discovered backwards TSC");
3948
3949                if (kvm_check_tsc_unstable()) {
3950                        u64 offset = kvm_compute_tsc_offset(vcpu,
3951                                                vcpu->arch.last_guest_tsc);
3952                        kvm_vcpu_write_tsc_offset(vcpu, offset);
3953                        vcpu->arch.tsc_catchup = 1;
3954                }
3955
3956                if (kvm_lapic_hv_timer_in_use(vcpu))
3957                        kvm_lapic_restart_hv_timer(vcpu);
3958
3959                /*
3960                 * On a host with synchronized TSC, there is no need to update
3961                 * kvmclock on vcpu->cpu migration
3962                 */
3963                if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1)
3964                        kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
3965                if (vcpu->cpu != cpu)
3966                        kvm_make_request(KVM_REQ_MIGRATE_TIMER, vcpu);
3967                vcpu->cpu = cpu;
3968        }
3969
3970        kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
3971}
3972
3973static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
3974{
3975        struct kvm_host_map map;
3976        struct kvm_steal_time *st;
3977
3978        if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
3979                return;
3980
3981        if (vcpu->arch.st.preempted)
3982                return;
3983
3984        if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map,
3985                        &vcpu->arch.st.cache, true))
3986                return;
3987
3988        st = map.hva +
3989                offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
3990
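            /*
             * Mark the vCPU as preempted in the shared steal_time area; the
             * guest reads this flag (e.g. via vcpu_is_preempted()) to avoid
             * spinning on, or yielding to, a vCPU that is not actually running.
             */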
3991        st->preempted = vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
3992
3993        kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true);
3994}
3995
3996void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
3997{
3998        int idx;
3999
4000        if (vcpu->preempted)
4001                vcpu->arch.preempted_in_kernel = !kvm_x86_ops.get_cpl(vcpu);
4002
4003        /*
4004         * Disable page faults because we're in atomic context here.
4005         * kvm_write_guest_offset_cached() would call might_fault()
4006         * that relies on pagefault_disable() to tell if there's a
4007         * bug. NOTE: the write to guest memory may not go through if
4008         * during postcopy live migration or if there's heavy guest
4009         * paging.
4010         */
4011        pagefault_disable();
4012        /*
4013         * kvm_memslots() will be called by
4014         * kvm_write_guest_offset_cached() so take the srcu lock.
4015         */
4016        idx = srcu_read_lock(&vcpu->kvm->srcu);
4017        kvm_steal_time_set_preempted(vcpu);
4018        srcu_read_unlock(&vcpu->kvm->srcu, idx);
4019        pagefault_enable();
4020        kvm_x86_ops.vcpu_put(vcpu);
4021        vcpu->arch.last_host_tsc = rdtsc();
4022        /*
4023         * If userspace has set any breakpoints or watchpoints, dr6 is restored
4024         * on every vmexit, but if not, we might have a stale dr6 from the
4025         * guest. do_debug expects dr6 to be cleared after it runs; do the same here.
4026         */
4027        set_debugreg(0, 6);
4028}
4029
4030static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
4031                                    struct kvm_lapic_state *s)
4032{
4033        if (vcpu->arch.apicv_active)
4034                kvm_x86_ops.sync_pir_to_irr(vcpu);
4035
4036        return kvm_apic_get_state(vcpu, s);
4037}
4038
4039static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
4040                                    struct kvm_lapic_state *s)
4041{
4042        int r;
4043
4044        r = kvm_apic_set_state(vcpu, s);
4045        if (r)
4046                return r;
4047        update_cr8_intercept(vcpu);
4048
4049        return 0;
4050}
4051
4052static int kvm_cpu_accept_dm_intr(struct kvm_vcpu *vcpu)
4053{
4054        /*
4055         * We can accept userspace's request for interrupt injection
4056         * as long as we have a place to store the interrupt number.
4057         * The actual injection will happen when the CPU is able to
4058         * deliver the interrupt.
4059         */
4060        if (kvm_cpu_has_extint(vcpu))
4061                return false;
4062
4063        /* Acknowledging ExtINT does not happen if LINT0 is masked.  */
4064        return (!lapic_in_kernel(vcpu) ||
4065                kvm_apic_accept_pic_intr(vcpu));
4066}
4067
4068static int kvm_vcpu_ready_for_interrupt_injection(struct kvm_vcpu *vcpu)
4069{
4070        return kvm_arch_interrupt_allowed(vcpu) &&
4071                kvm_cpu_accept_dm_intr(vcpu);
4072}
4073
4074static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
4075                                    struct kvm_interrupt *irq)
4076{
4077        if (irq->irq >= KVM_NR_INTERRUPTS)
4078                return -EINVAL;
4079
4080        if (!irqchip_in_kernel(vcpu->kvm)) {
4081                kvm_queue_interrupt(vcpu, irq->irq, false);
4082                kvm_make_request(KVM_REQ_EVENT, vcpu);
4083                return 0;
4084        }
4085
4086        /*
4087         * With in-kernel LAPIC, we only use this to inject EXTINT, so
4088         * fail for in-kernel 8259.
4089         */
4090        if (pic_in_kernel(vcpu->kvm))
4091                return -ENXIO;
4092
4093        if (vcpu->arch.pending_external_vector != -1)
4094                return -EEXIST;
4095
4096        vcpu->arch.pending_external_vector = irq->irq;
4097        kvm_make_request(KVM_REQ_EVENT, vcpu);
4098        return 0;
4099}
4100
4101static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
4102{
4103        kvm_inject_nmi(vcpu);
4104
4105        return 0;
4106}
4107
4108static int kvm_vcpu_ioctl_smi(struct kvm_vcpu *vcpu)
4109{
4110        kvm_make_request(KVM_REQ_SMI, vcpu);
4111
4112        return 0;
4113}
4114
4115static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
4116                                           struct kvm_tpr_access_ctl *tac)
4117{
4118        if (tac->flags)
4119                return -EINVAL;
4120        vcpu->arch.tpr_access_reporting = !!tac->enabled;
4121        return 0;
4122}
4123
4124static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
4125                                        u64 mcg_cap)
4126{
4127        int r;
4128        unsigned bank_num = mcg_cap & 0xff, bank;
4129
4130        r = -EINVAL;
4131        if (!bank_num || bank_num > KVM_MAX_MCE_BANKS)
4132                goto out;
4133        if (mcg_cap & ~(kvm_mce_cap_supported | 0xff | 0xff0000))
4134                goto out;
4135        r = 0;
4136        vcpu->arch.mcg_cap = mcg_cap;
4137        /* Init IA32_MCG_CTL to all 1s */
4138        if (mcg_cap & MCG_CTL_P)
4139                vcpu->arch.mcg_ctl = ~(u64)0;
4140        /* Init IA32_MCi_CTL to all 1s */
4141        for (bank = 0; bank < bank_num; bank++)
4142                vcpu->arch.mce_banks[bank*4] = ~(u64)0;
4143
4144        kvm_x86_ops.setup_mce(vcpu);
4145out:
4146        return r;
4147}
4148
4149static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
4150                                      struct kvm_x86_mce *mce)
4151{
4152        u64 mcg_cap = vcpu->arch.mcg_cap;
4153        unsigned bank_num = mcg_cap & 0xff;
4154        u64 *banks = vcpu->arch.mce_banks;
4155
4156        if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
4157                return -EINVAL;
4158        /*
4159         * If IA32_MCG_CTL is not all 1s, uncorrected error
4160         * reporting is disabled.
4161         */
4162        if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
4163            vcpu->arch.mcg_ctl != ~(u64)0)
4164                return 0;
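            /*
             * Each bank occupies four consecutive u64s in mce_banks[],
             * matching the MSR layout: banks[0] = MCi_CTL, banks[1] =
             * MCi_STATUS, banks[2] = MCi_ADDR, banks[3] = MCi_MISC.
             */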
4165        banks += 4 * mce->bank;
4166        /*
4167         * If IA32_MCi_CTL is not all 1s, uncorrected error
4168         * reporting is disabled for the bank.
4169         */
4170        if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0)
4171                return 0;
4172        if (mce->status & MCI_STATUS_UC) {
4173                if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
4174                    !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) {
4175                        kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
4176                        return 0;
4177                }
4178                if (banks[1] & MCI_STATUS_VAL)
4179                        mce->status |= MCI_STATUS_OVER;
4180                banks[2] = mce->addr;
4181                banks[3] = mce->misc;
4182                vcpu->arch.mcg_status = mce->mcg_status;
4183                banks[1] = mce->status;
4184                kvm_queue_exception(vcpu, MC_VECTOR);
4185        } else if (!(banks[1] & MCI_STATUS_VAL)
4186                   || !(banks[1] & MCI_STATUS_UC)) {
4187                if (banks[1] & MCI_STATUS_VAL)
4188                        mce->status |= MCI_STATUS_OVER;
4189                banks[2] = mce->addr;
4190                banks[3] = mce->misc;
4191                banks[1] = mce->status;
4192        } else
4193                banks[1] |= MCI_STATUS_OVER;
4194        return 0;
4195}
4196
4197static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
4198                                               struct kvm_vcpu_events *events)
4199{
4200        process_nmi(vcpu);
4201
4202        /*
4203         * In guest mode, payload delivery should be deferred,
4204         * so that the L1 hypervisor can intercept #PF before
4205         * CR2 is modified (or intercept #DB before DR6 is
4206         * modified under nVMX). Unless the per-VM capability,
4207         * KVM_CAP_EXCEPTION_PAYLOAD, is set, we may not defer the delivery of
4208         * an exception payload and handle it after a KVM_GET_VCPU_EVENTS. Since we
4209         * opportunistically defer the exception payload, deliver it if the
4210         * capability hasn't been requested before processing a
4211         * KVM_GET_VCPU_EVENTS.
4212         */
4213        if (!vcpu->kvm->arch.exception_payload_enabled &&
4214            vcpu->arch.exception.pending && vcpu->arch.exception.has_payload)
4215                kvm_deliver_exception_payload(vcpu);
4216
4217        /*
4218         * The API doesn't provide the instruction length for software
4219         * exceptions, so don't report them. As long as the guest RIP
4220         * isn't advanced, we should expect to encounter the exception
4221         * again.
4222         */
4223        if (kvm_exception_is_soft(vcpu->arch.exception.nr)) {
4224                events->exception.injected = 0;
4225                events->exception.pending = 0;
4226        } else {
4227                events->exception.injected = vcpu->arch.exception.injected;
4228                events->exception.pending = vcpu->arch.exception.pending;
4229                /*
4230                 * For ABI compatibility, deliberately conflate
4231                 * pending and injected exceptions when
4232                 * KVM_CAP_EXCEPTION_PAYLOAD isn't enabled.
4233                 */
4234                if (!vcpu->kvm->arch.exception_payload_enabled)
4235                        events->exception.injected |=
4236                                vcpu->arch.exception.pending;
4237        }
4238        events->exception.nr = vcpu->arch.exception.nr;
4239        events->exception.has_error_code = vcpu->arch.exception.has_error_code;
4240        events->exception.error_code = vcpu->arch.exception.error_code;
4241        events->exception_has_payload = vcpu->arch.exception.has_payload;
4242        events->exception_payload = vcpu->arch.exception.payload;
4243
4244        events->interrupt.injected =
4245                vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft;
4246        events->interrupt.nr = vcpu->arch.interrupt.nr;
4247        events->interrupt.soft = 0;
4248        events->interrupt.shadow = kvm_x86_ops.get_interrupt_shadow(vcpu);
4249
4250        events->nmi.injected = vcpu->arch.nmi_injected;
4251        events->nmi.pending = vcpu->arch.nmi_pending != 0;
4252        events->nmi.masked = kvm_x86_ops.get_nmi_mask(vcpu);
4253        events->nmi.pad = 0;
4254
4255        events->sipi_vector = 0; /* never valid when reporting to user space */
4256
4257        events->smi.smm = is_smm(vcpu);
4258        events->smi.pending = vcpu->arch.smi_pending;
4259        events->smi.smm_inside_nmi =
4260                !!(vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK);
4261        events->smi.latched_init = kvm_lapic_latched_init(vcpu);
4262
4263        events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
4264                         | KVM_VCPUEVENT_VALID_SHADOW
4265                         | KVM_VCPUEVENT_VALID_SMM);
4266        if (vcpu->kvm->arch.exception_payload_enabled)
4267                events->flags |= KVM_VCPUEVENT_VALID_PAYLOAD;
4268
4269        memset(&events->reserved, 0, sizeof(events->reserved));
4270}
4271
4272static void kvm_smm_changed(struct kvm_vcpu *vcpu);
4273
4274static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
4275                                              struct kvm_vcpu_events *events)
4276{
4277        if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
4278                              | KVM_VCPUEVENT_VALID_SIPI_VECTOR
4279                              | KVM_VCPUEVENT_VALID_SHADOW
4280                              | KVM_VCPUEVENT_VALID_SMM
4281                              | KVM_VCPUEVENT_VALID_PAYLOAD))
4282                return -EINVAL;
4283
4284        if (events->flags & KVM_VCPUEVENT_VALID_PAYLOAD) {
4285                if (!vcpu->kvm->arch.exception_payload_enabled)
4286                        return -EINVAL;
4287                if (events->exception.pending)
4288                        events->exception.injected = 0;
4289                else
4290                        events->exception_has_payload = 0;
4291        } else {
4292                events->exception.pending = 0;
4293                events->exception_has_payload = 0;
4294        }
4295
4296        if ((events->exception.injected || events->exception.pending) &&
4297            (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR))
4298                return -EINVAL;
4299
4300        /* INITs are latched while in SMM */
4301        if (events->flags & KVM_VCPUEVENT_VALID_SMM &&
4302            (events->smi.smm || events->smi.pending) &&
4303            vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
4304                return -EINVAL;
4305
4306        process_nmi(vcpu);
4307        vcpu->arch.exception.injected = events->exception.injected;
4308        vcpu->arch.exception.pending = events->exception.pending;
4309        vcpu->arch.exception.nr = events->exception.nr;
4310        vcpu->arch.exception.has_error_code = events->exception.has_error_code;
4311        vcpu->arch.exception.error_code = events->exception.error_code;
4312        vcpu->arch.exception.has_payload = events->exception_has_payload;
4313        vcpu->arch.exception.payload = events->exception_payload;
4314
4315        vcpu->arch.interrupt.injected = events->interrupt.injected;
4316        vcpu->arch.interrupt.nr = events->interrupt.nr;
4317        vcpu->arch.interrupt.soft = events->interrupt.soft;
4318        if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
4319                kvm_x86_ops.set_interrupt_shadow(vcpu,
4320                                                  events->interrupt.shadow);
4321
4322        vcpu->arch.nmi_injected = events->nmi.injected;
4323        if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
4324                vcpu->arch.nmi_pending = events->nmi.pending;
4325        kvm_x86_ops.set_nmi_mask(vcpu, events->nmi.masked);
4326
4327        if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
4328            lapic_in_kernel(vcpu))
4329                vcpu->arch.apic->sipi_vector = events->sipi_vector;
4330
4331        if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
4332                if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) {
4333                        if (events->smi.smm)
4334                                vcpu->arch.hflags |= HF_SMM_MASK;
4335                        else
4336                                vcpu->arch.hflags &= ~HF_SMM_MASK;
4337                        kvm_smm_changed(vcpu);
4338                }
4339
4340                vcpu->arch.smi_pending = events->smi.pending;
4341
4342                if (events->smi.smm) {
4343                        if (events->smi.smm_inside_nmi)
4344                                vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
4345                        else
4346                                vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;
4347                }
4348
4349                if (lapic_in_kernel(vcpu)) {
4350                        if (events->smi.latched_init)
4351                                set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
4352                        else
4353                                clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
4354                }
4355        }
4356
4357        kvm_make_request(KVM_REQ_EVENT, vcpu);
4358
4359        return 0;
4360}
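
/*
 * Illustrative sketch only (not part of this file): userspace reaches the
 * function above through the KVM_SET_VCPU_EVENTS ioctl, for example to
 * queue an NMI while restoring vCPU state.  vcpu_fd is assumed to come
 * from KVM_CREATE_VCPU.
 *
 *     struct kvm_vcpu_events ev = { 0 };
 *     ev.flags = KVM_VCPUEVENT_VALID_NMI_PENDING;
 *     ev.nmi.pending = 1;
 *     ioctl(vcpu_fd, KVM_SET_VCPU_EVENTS, &ev);
 */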
4361
4362static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
4363                                             struct kvm_debugregs *dbgregs)
4364{
4365        unsigned long val;
4366
4367        memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
4368        kvm_get_dr(vcpu, 6, &val);
4369        dbgregs->dr6 = val;
4370        dbgregs->dr7 = vcpu->arch.dr7;
4371        dbgregs->flags = 0;
4372        memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved));
4373}
4374
4375static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
4376                                            struct kvm_debugregs *dbgregs)
4377{
4378        if (dbgregs->flags)
4379                return -EINVAL;
4380
4381        if (dbgregs->dr6 & ~0xffffffffull)
4382                return -EINVAL;
4383        if (dbgregs->dr7 & ~0xffffffffull)
4384                return -EINVAL;
4385
4386        memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
4387        kvm_update_dr0123(vcpu);
4388        vcpu->arch.dr6 = dbgregs->dr6;
4389        vcpu->arch.dr7 = dbgregs->dr7;
4390        kvm_update_dr7(vcpu);
4391
4392        return 0;
4393}
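
/*
 * Illustrative sketch only (not part of this file): the debug register
 * state above is exchanged with userspace via KVM_GET_DEBUGREGS and
 * KVM_SET_DEBUGREGS.  vcpu_fd is assumed to come from KVM_CREATE_VCPU;
 * the architectural reset values shown pass the 32-bit checks on dr6/dr7.
 *
 *     struct kvm_debugregs dbg = { 0 };
 *     dbg.dr6 = 0xffff0ff0;               // architectural DR6 reset value
 *     dbg.dr7 = 0x400;                    // architectural DR7 reset value
 *     ioctl(vcpu_fd, KVM_SET_DEBUGREGS, &dbg);
 */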
4394
4395#define XSTATE_COMPACTION_ENABLED (1ULL << 63)
4396
4397static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu)
4398{
4399        struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave;
4400        u64 xstate_bv = xsave->header.xfeatures;
4401        u64 valid;
4402
4403        /*
4404         * Copy legacy XSAVE area, to avoid complications with CPUID
4405         * leaves 0 and 1 in the loop below.
4406         */
4407        memcpy(dest, xsave, XSAVE_HDR_OFFSET);
4408
4409        /* Set XSTATE_BV */
4410        xstate_bv &= vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FPSSE;
4411        *(u64 *)(dest + XSAVE_HDR_OFFSET) = xstate_bv;
4412
4413        /*
4414         * Copy each region from the possibly compacted offset to the
4415         * non-compacted offset.
4416         */
4417        valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
4418        while (valid) {
4419                u64 xfeature_mask = valid & -valid;
4420                int xfeature_nr = fls64(xfeature_mask) - 1;
4421                void *src = get_xsave_addr(xsave, xfeature_nr);
4422
4423                if (src) {
4424                        u32 size, offset, ecx, edx;
4425                        cpuid_count(XSTATE_CPUID, xfeature_nr,
4426                                    &size, &offset, &ecx, &edx);
4427                        if (xfeature_nr == XFEATURE_PKRU)
4428                                memcpy(dest + offset, &vcpu->arch.pkru,
4429                                       sizeof(vcpu->arch.pkru));
4430                        else
4431                                memcpy(dest + offset, src, size);
4432
4433                }
4434
4435                valid -= xfeature_mask;
4436        }
4437}
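
/*
 * Note on the loop above: "valid & -valid" isolates the lowest set bit of
 * the remaining feature mask (e.g. 0b0110 & -0b0110 == 0b0010), and
 * fls64() - 1 converts that single-bit mask back into a feature number, so
 * each enabled xfeature is visited exactly once, lowest first.
 */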
4438
4439static void load_xsave(struct kvm_vcpu *vcpu, u8 *src)
4440{
4441        struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave;
4442        u64 xstate_bv = *(u64 *)(src + XSAVE_HDR_OFFSET);
4443        u64 valid;
4444
4445        /*
4446         * Copy legacy XSAVE area, to avoid complications with CPUID
4447         * leaves 0 and 1 in the loop below.
4448         */
4449        memcpy(xsave, src, XSAVE_HDR_OFFSET);
4450
4451        /* Set XSTATE_BV and possibly XCOMP_BV.  */
4452        xsave->header.xfeatures = xstate_bv;
4453        if (boot_cpu_has(X86_FEATURE_XSAVES))
4454                xsave->header.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED;
4455
4456        /*
4457         * Copy each region from the non-compacted offset to the
4458         * possibly compacted offset.
4459         */
4460        valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
4461        while (valid) {
4462                u64 xfeature_mask = valid & -valid;
4463                int xfeature_nr = fls64(xfeature_mask) - 1;
4464                void *dest = get_xsave_addr(xsave, xfeature_nr);
4465
4466                if (dest) {
4467                        u32 size, offset, ecx, edx;
4468                        cpuid_count(XSTATE_CPUID, xfeature_nr,
4469                                    &size, &offset, &ecx, &edx);
4470                        if (xfeature_nr == XFEATURE_PKRU)
4471                                memcpy(&vcpu->arch.pkru, src + offset,
4472                                       sizeof(vcpu->arch.pkru));
4473                        else
4474                                memcpy(dest, src + offset, size);
4475                }
4476
4477                valid -= xfeature_mask;
4478        }
4479}
4480
4481static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
4482                                         struct kvm_xsave *guest_xsave)
4483{
4484        if (boot_cpu_has(X86_FEATURE_XSAVE)) {
4485                memset(guest_xsave, 0, sizeof(struct kvm_xsave));
4486                fill_xsave((u8 *) guest_xsave->region, vcpu);
4487        } else {
4488                memcpy(guest_xsave->region,
4489                        &vcpu->arch.guest_fpu->state.fxsave,
4490                        sizeof(struct fxregs_state));
4491                *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] =
4492                        XFEATURE_MASK_FPSSE;
4493        }
4494}
4495
4496#define XSAVE_MXCSR_OFFSET 24
4497
4498static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
4499                                        struct kvm_xsave *guest_xsave)
4500{
4501        u64 xstate_bv =
4502                *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)];
4503        u32 mxcsr = *(u32 *)&guest_xsave->region[XSAVE_MXCSR_OFFSET / sizeof(u32)];
4504
4505        if (boot_cpu_has(X86_FEATURE_XSAVE)) {
4506                /*
4507                 * Here we allow setting states that are not present in
4508                 * CPUID leaf 0xD, index 0, EDX:EAX.  This is for compatibility
4509                 * with old userspace.
4510                 */
4511                if (xstate_bv & ~supported_xcr0 || mxcsr & ~mxcsr_feature_mask)
4512                        return -EINVAL;
4513                load_xsave(vcpu, (u8 *)guest_xsave->region);
4514        } else {
4515                if (xstate_bv & ~XFEATURE_MASK_FPSSE ||
4516                        mxcsr & ~mxcsr_feature_mask)
4517                        return -EINVAL;
4518                memcpy(&vcpu->arch.guest_fpu->state.fxsave,
4519                        guest_xsave->region, sizeof(struct fxregs_state));
4520        }
4521        return 0;
4522}
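
/*
 * Illustrative sketch only (not part of this file): the two helpers above
 * back the KVM_GET_XSAVE/KVM_SET_XSAVE ioctls, which a VMM uses to migrate
 * FPU/XSAVE state.  The vCPU fds are assumed to come from KVM_CREATE_VCPU
 * on the source and destination VMs.
 *
 *     struct kvm_xsave xs;
 *     ioctl(src_vcpu_fd, KVM_GET_XSAVE, &xs);   // save on the source
 *     ioctl(dst_vcpu_fd, KVM_SET_XSAVE, &xs);   // restore on the target
 */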
4523
4524static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
4525                                        struct kvm_xcrs *guest_xcrs)
4526{
4527        if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
4528                guest_xcrs->nr_xcrs = 0;
4529                return;
4530        }
4531
4532        guest_xcrs->nr_xcrs = 1;
4533        guest_xcrs->flags = 0;
4534        guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK;
4535        guest_xcrs->xcrs[0].value = vcpu->arch.xcr0;
4536}
4537
4538static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
4539                                       struct kvm_xcrs *guest_xcrs)
4540{
4541        int i, r = 0;
4542
4543        if (!boot_cpu_has(X86_FEATURE_XSAVE))
4544                return -EINVAL;
4545
4546        if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags)
4547                return -EINVAL;
4548
4549        for (i = 0; i < guest_xcrs->nr_xcrs; i++)
4550                /* Only support XCR0 currently */
4551                if (guest_xcrs->xcrs[i].xcr == XCR_XFEATURE_ENABLED_MASK) {
4552                        r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK,
4553                                guest_xcrs->xcrs[i].value);
4554                        break;
4555                }
4556        if (r)
4557                r = -EINVAL;
4558        return r;
4559}
4560
4561/*
4562 * kvm_set_guest_paused() indicates to the guest kernel that it has been
4563 * stopped by the hypervisor.  This function is called from the host side only.
4564 * -EINVAL is returned when the host attempts to set the flag for a guest that
4565 * does not support pv clocks.
4566 */
4567static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
4568{
4569        if (!vcpu->arch.pv_time_enabled)
4570                return -EINVAL;
4571        vcpu->arch.pvclock_set_guest_stopped_request = true;
4572        kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
4573        return 0;
4574}
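
/*
 * Illustrative sketch only (not part of this file): after resuming a guest
 * that the host had stopped (e.g. for a snapshot), a VMM issues
 * KVM_KVMCLOCK_CTRL on each vCPU so the guest's soft-lockup watchdog can
 * tell the pause was host-induced.  vcpu_fd is assumed to come from
 * KVM_CREATE_VCPU.
 *
 *     ioctl(vcpu_fd, KVM_KVMCLOCK_CTRL, 0);
 */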
4575
4576static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
4577                                     struct kvm_enable_cap *cap)
4578{
4579        int r;
4580        uint16_t vmcs_version;
4581        void __user *user_ptr;
4582
4583        if (cap->flags)
4584                return -EINVAL;
4585
4586        switch (cap->cap) {
4587        case KVM_CAP_HYPERV_SYNIC2:
4588                if (cap->args[0])
4589                        return -EINVAL;
4590                fallthrough;
4591
4592        case KVM_CAP_HYPERV_SYNIC:
4593                if (!irqchip_in_kernel(vcpu->kvm))
4594                        return -EINVAL;
4595                return kvm_hv_activate_synic(vcpu, cap->cap ==
4596                                             KVM_CAP_HYPERV_SYNIC2);
4597        case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
4598                if (!kvm_x86_ops.nested_ops->enable_evmcs)
4599                        return -ENOTTY;
4600                r = kvm_x86_ops.nested_ops->enable_evmcs(vcpu, &vmcs_version);
4601                if (!r) {
4602                        user_ptr = (void __user *)(uintptr_t)cap->args[0];
4603                        if (copy_to_user(user_ptr, &vmcs_version,
4604                                         sizeof(vmcs_version)))
4605                                r = -EFAULT;
4606                }
4607                return r;
4608        case KVM_CAP_HYPERV_DIRECT_TLBFLUSH:
4609                if (!kvm_x86_ops.enable_direct_tlbflush)
4610                        return -ENOTTY;
4611
4612                return kvm_x86_ops.enable_direct_tlbflush(vcpu);
4613
4614        case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
4615                vcpu->arch.pv_cpuid.enforce = cap->args[0];
4616                if (vcpu->arch.pv_cpuid.enforce)
4617                        kvm_update_pv_runtime(vcpu);
4618
4619                return 0;
4620
4621        default:
4622                return -EINVAL;
4623        }
4624}
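
/*
 * Illustrative sketch only (not part of this file): the per-vCPU
 * capabilities handled above are enabled with KVM_ENABLE_CAP on the vCPU
 * fd, e.g. to enforce the PV feature bits advertised via CPUID.  vcpu_fd
 * is assumed to come from KVM_CREATE_VCPU.
 *
 *     struct kvm_enable_cap cap = { 0 };
 *     cap.cap = KVM_CAP_ENFORCE_PV_FEATURE_CPUID;
 *     cap.args[0] = 1;
 *     ioctl(vcpu_fd, KVM_ENABLE_CAP, &cap);
 */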
4625
4626long kvm_arch_vcpu_ioctl(struct file *filp,
4627                         unsigned int ioctl, unsigned long arg)
4628{
4629        struct kvm_vcpu *vcpu = filp->private_data;
4630        void __user *argp = (void __user *)arg;
4631        int r;
4632        union {
4633                struct kvm_lapic_state *lapic;
4634                struct kvm_xsave *xsave;
4635                struct kvm_xcrs *xcrs;
4636                void *buffer;
4637        } u;
4638
4639        vcpu_load(vcpu);
4640
4641        u.buffer = NULL;
4642        switch (ioctl) {
4643        case KVM_GET_LAPIC: {
4644                r = -EINVAL;
4645                if (!lapic_in_kernel(vcpu))
4646                        goto out;
4647                u.lapic = kzalloc(sizeof(struct kvm_lapic_state),
4648                                GFP_KERNEL_ACCOUNT);
4649
4650                r = -ENOMEM;
4651                if (!u.lapic)
4652                        goto out;
4653                r = kvm_vcpu_ioctl_get_lapic(vcpu, u.lapic);
4654                if (r)
4655                        goto out;
4656                r = -EFAULT;
4657                if (copy_to_user(argp, u.lapic, sizeof(struct kvm_lapic_state)))
4658                        goto out;
4659                r = 0;
4660                break;
4661        }
4662        case KVM_SET_LAPIC: {
4663                r = -EINVAL;
4664                if (!lapic_in_kernel(vcpu))
4665                        goto out;
4666                u.lapic = memdup_user(argp, sizeof(*u.lapic));
4667                if (IS_ERR(u.lapic)) {
4668                        r = PTR_ERR(u.lapic);
4669                        goto out_nofree;
4670                }
4671
4672                r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic);
4673                break;
4674        }
4675        case KVM_INTERRUPT: {
4676                struct kvm_interrupt irq;
4677
4678                r = -EFAULT;
4679                if (copy_from_user(&irq, argp, sizeof(irq)))
4680                        goto out;
4681                r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
4682                break;
4683        }
4684        case KVM_NMI: {
4685                r = kvm_vcpu_ioctl_nmi(vcpu);
4686                break;
4687        }
4688        case KVM_SMI: {
4689                r = kvm_vcpu_ioctl_smi(vcpu);
4690                break;
4691        }
4692        case KVM_SET_CPUID: {
4693                struct kvm_cpuid __user *cpuid_arg = argp;
4694                struct kvm_cpuid cpuid;
4695
4696                r = -EFAULT;
4697                if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
4698                        goto out;
4699                r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
4700                break;
4701        }
4702        case KVM_SET_CPUID2: {
4703                struct kvm_cpuid2 __user *cpuid_arg = argp;
4704                struct kvm_cpuid2 cpuid;
4705
4706                r = -EFAULT;
4707                if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
4708                        goto out;
4709                r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
4710                                              cpuid_arg->entries);
4711                break;
4712        }
4713        case KVM_GET_CPUID2: {
4714                struct kvm_cpuid2 __user *cpuid_arg = argp;
4715                struct kvm_cpuid2 cpuid;
4716
4717                r = -EFAULT;
4718                if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
4719                        goto out;
4720                r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
4721                                              cpuid_arg->entries);
4722                if (r)
4723                        goto out;
4724                r = -EFAULT;
4725                if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
4726                        goto out;
4727                r = 0;
4728                break;
4729        }
4730        case KVM_GET_MSRS: {
4731                int idx = srcu_read_lock(&vcpu->kvm->srcu);
4732                r = msr_io(vcpu, argp, do_get_msr, 1);
4733                srcu_read_unlock(&vcpu->kvm->srcu, idx);
4734                break;
4735        }
4736        case KVM_SET_MSRS: {
4737                int idx = srcu_read_lock(&vcpu->kvm->srcu);
4738                r = msr_io(vcpu, argp, do_set_msr, 0);
4739                srcu_read_unlock(&vcpu->kvm->srcu, idx);
4740                break;
4741        }
4742        case KVM_TPR_ACCESS_REPORTING: {
4743                struct kvm_tpr_access_ctl tac;
4744
4745                r = -EFAULT;
4746                if (copy_from_user(&tac, argp, sizeof(tac)))
4747                        goto out;
4748                r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac);
4749                if (r)
4750                        goto out;
4751                r = -EFAULT;
4752                if (copy_to_user(argp, &tac, sizeof(tac)))
4753                        goto out;
4754                r = 0;
4755                break;
4756        }
4757        case KVM_SET_VAPIC_ADDR: {
4758                struct kvm_vapic_addr va;
4759                int idx;
4760
4761                r = -EINVAL;
4762                if (!lapic_in_kernel(vcpu))
4763                        goto out;
4764                r = -EFAULT;
4765                if (copy_from_user(&va, argp, sizeof(va)))
4766                        goto out;
4767                idx = srcu_read_lock(&vcpu->kvm->srcu);
4768                r = kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
4769                srcu_read_unlock(&vcpu->kvm->srcu, idx);
4770                break;
4771        }
4772        case KVM_X86_SETUP_MCE: {
4773                u64 mcg_cap;
4774
4775                r = -EFAULT;
4776                if (copy_from_user(&mcg_cap, argp, sizeof(mcg_cap)))
4777                        goto out;
4778                r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap);
4779                break;
4780        }
4781        case KVM_X86_SET_MCE: {
4782                struct kvm_x86_mce mce;
4783
4784                r = -EFAULT;
4785                if (copy_from_user(&mce, argp, sizeof(mce)))
4786                        goto out;
4787                r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
4788                break;
4789        }
4790        case KVM_GET_VCPU_EVENTS: {
4791                struct kvm_vcpu_events events;
4792
4793                kvm_vcpu_ioctl_x86_get_vcpu_events(vcpu, &events);
4794
4795                r = -EFAULT;
4796                if (copy_to_user(argp, &events, sizeof(struct kvm_vcpu_events)))
4797                        break;
4798                r = 0;
4799                break;
4800        }
4801        case KVM_SET_VCPU_EVENTS: {
4802                struct kvm_vcpu_events events;
4803
4804                r = -EFAULT;
4805                if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events)))
4806                        break;
4807
4808                r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events);
4809                break;
4810        }
4811        case KVM_GET_DEBUGREGS: {
4812                struct kvm_debugregs dbgregs;
4813
4814                kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs);
4815
4816                r = -EFAULT;
4817                if (copy_to_user(argp, &dbgregs,
4818                                 sizeof(struct kvm_debugregs)))
4819                        break;
4820                r = 0;
4821                break;
4822        }
4823        case KVM_SET_DEBUGREGS: {
4824                struct kvm_debugregs dbgregs;
4825
4826                r = -EFAULT;
4827                if (copy_from_user(&dbgregs, argp,
4828                                   sizeof(struct kvm_debugregs)))
4829                        break;
4830
4831                r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs);
4832                break;
4833        }
4834        case KVM_GET_XSAVE: {
4835                u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL_ACCOUNT);
4836                r = -ENOMEM;
4837                if (!u.xsave)
4838                        break;
4839
4840                kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave);
4841
4842                r = -EFAULT;
4843                if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave)))
4844                        break;
4845                r = 0;
4846                break;
4847        }
4848        case KVM_SET_XSAVE: {
4849                u.xsave = memdup_user(argp, sizeof(*u.xsave));
4850                if (IS_ERR(u.xsave)) {
4851                        r = PTR_ERR(u.xsave);
4852                        goto out_nofree;
4853                }
4854
4855                r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
4856                break;
4857        }
4858        case KVM_GET_XCRS: {
4859                u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL_ACCOUNT);
4860                r = -ENOMEM;
4861                if (!u.xcrs)
4862                        break;
4863
4864                kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs);
4865
4866                r = -EFAULT;
4867                if (copy_to_user(argp, u.xcrs,
4868                                 sizeof(struct kvm_xcrs)))
4869                        break;
4870                r = 0;
4871                break;
4872        }
4873        case KVM_SET_XCRS: {
4874                u.xcrs = memdup_user(argp, sizeof(*u.xcrs));
4875                if (IS_ERR(u.xcrs)) {
4876                        r = PTR_ERR(u.xcrs);
4877                        goto out_nofree;
4878                }
4879
4880                r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
4881                break;
4882        }
4883        case KVM_SET_TSC_KHZ: {
4884                u32 user_tsc_khz;
4885
4886                r = -EINVAL;
4887                user_tsc_khz = (u32)arg;
4888
4889                if (kvm_has_tsc_control &&
4890                    user_tsc_khz >= kvm_max_guest_tsc_khz)
4891                        goto out;
4892
4893                if (user_tsc_khz == 0)
4894                        user_tsc_khz = tsc_khz;
4895
4896                if (!kvm_set_tsc_khz(vcpu, user_tsc_khz))
4897                        r = 0;
4898
4899                goto out;
4900        }
4901        case KVM_GET_TSC_KHZ: {
4902                r = vcpu->arch.virtual_tsc_khz;
4903                goto out;
4904        }
4905        case KVM_KVMCLOCK_CTRL: {
4906                r = kvm_set_guest_paused(vcpu);
4907                goto out;
4908        }
4909        case KVM_ENABLE_CAP: {
4910                struct kvm_enable_cap cap;
4911
4912                r = -EFAULT;
4913                if (copy_from_user(&cap, argp, sizeof(cap)))
4914                        goto out;
4915                r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap);
4916                break;
4917        }
4918        case KVM_GET_NESTED_STATE: {
4919                struct kvm_nested_state __user *user_kvm_nested_state = argp;
4920                u32 user_data_size;
4921
4922                r = -EINVAL;
4923                if (!kvm_x86_ops.nested_ops->get_state)
4924                        break;
4925
4926                BUILD_BUG_ON(sizeof(user_data_size) != sizeof(user_kvm_nested_state->size));
4927                r = -EFAULT;
4928                if (get_user(user_data_size, &user_kvm_nested_state->size))
4929                        break;
4930
4931                r = kvm_x86_ops.nested_ops->get_state(vcpu, user_kvm_nested_state,
4932                                                     user_data_size);
4933                if (r < 0)
4934                        break;
4935
4936                if (r > user_data_size) {
4937                        if (put_user(r, &user_kvm_nested_state->size))
4938                                r = -EFAULT;
4939                        else
4940                                r = -E2BIG;
4941                        break;
4942                }
4943
4944                r = 0;
4945                break;
4946        }
4947        case KVM_SET_NESTED_STATE: {
4948                struct kvm_nested_state __user *user_kvm_nested_state = argp;
4949                struct kvm_nested_state kvm_state;
4950                int idx;
4951
4952                r = -EINVAL;
4953                if (!kvm_x86_ops.nested_ops->set_state)
4954                        break;
4955
4956                r = -EFAULT;
4957                if (copy_from_user(&kvm_state, user_kvm_nested_state, sizeof(kvm_state)))
4958                        break;
4959
4960                r = -EINVAL;
4961                if (kvm_state.size < sizeof(kvm_state))
4962                        break;
4963
4964                if (kvm_state.flags &
4965                    ~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE
4966                      | KVM_STATE_NESTED_EVMCS | KVM_STATE_NESTED_MTF_PENDING
4967                      | KVM_STATE_NESTED_GIF_SET))
4968                        break;
4969
4970                /* nested_run_pending implies guest_mode.  */
4971                if ((kvm_state.flags & KVM_STATE_NESTED_RUN_PENDING)
4972                    && !(kvm_state.flags & KVM_STATE_NESTED_GUEST_MODE))
4973                        break;
4974
4975                idx = srcu_read_lock(&vcpu->kvm->srcu);
4976                r = kvm_x86_ops.nested_ops->set_state(vcpu, user_kvm_nested_state, &kvm_state);
4977                srcu_read_unlock(&vcpu->kvm->srcu, idx);
4978                break;
4979        }
4980        case KVM_GET_SUPPORTED_HV_CPUID: {
4981                struct kvm_cpuid2 __user *cpuid_arg = argp;
4982                struct kvm_cpuid2 cpuid;
4983
4984                r = -EFAULT;
4985                if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
4986                        goto out;
4987
4988                r = kvm_vcpu_ioctl_get_hv_cpuid(vcpu, &cpuid,
4989                                                cpuid_arg->entries);
4990                if (r)
4991                        goto out;
4992
4993                r = -EFAULT;
4994                if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
4995                        goto out;
4996                r = 0;
4997                break;
4998        }
4999        default:
5000                r = -EINVAL;
5001        }
5002out:
5003        kfree(u.buffer);
5004out_nofree:
5005        vcpu_put(vcpu);
5006        return r;
5007}
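
/*
 * Note on the error handling above: buffers stored in the union 'u' are
 * freed once at the 'out' label, so most cases only set 'r' and break.
 * The memdup_user() cases jump to 'out_nofree' on failure instead, because
 * 'u' then holds an ERR_PTR value that must not be passed to kfree().
 */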
5008
5009vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
5010{
5011        return VM_FAULT_SIGBUS;
5012}
5013
5014static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
5015{
5016        int ret;
5017
5018        if (addr > (unsigned int)(-3 * PAGE_SIZE))
5019                return -EINVAL;
5020        ret = kvm_x86_ops.set_tss_addr(kvm, addr);
5021        return ret;
5022}
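
/*
 * The check above rejects any address whose three-page TSS region would
 * not fit entirely below 4GiB (KVM_SET_TSS_ADDR describes a three-page
 * region of guest physical address space).
 */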
5023
5024static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm,
5025                                              u64 ident_addr)
5026{
5027        return kvm_x86_ops.set_identity_map_addr(kvm, ident_addr);
5028}
5029
5030static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
5031                                         unsigned long kvm_nr_mmu_pages)
5032{
5033        if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
5034                return -EINVAL;
5035
5036        mutex_lock(&kvm->slots_lock);
5037
5038        kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
5039        kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
5040
5041        mutex_unlock(&kvm->slots_lock);
5042        return 0;
5043}
5044
5045static unsigned long kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
5046{
5047        return kvm->arch.n_max_mmu_pages;
5048}
5049
5050static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
5051{
5052        struct kvm_pic *pic = kvm->arch.vpic;
5053        int r;
5054
5055        r = 0;
5056        switch (chip->chip_id) {
5057        case KVM_IRQCHIP_PIC_MASTER:
5058                memcpy(&chip->chip.pic, &pic->pics[0],
5059                        sizeof(struct kvm_pic_state));
5060                break;
5061        case KVM_IRQCHIP_PIC_SLAVE:
5062                memcpy(&chip->chip.pic, &pic->pics[1],
5063                        sizeof(struct kvm_pic_state));
5064                break;
5065        case KVM_IRQCHIP_IOAPIC:
5066                kvm_get_ioapic(kvm, &chip->chip.ioapic);
5067                break;
5068        default:
5069                r = -EINVAL;
5070                break;
5071        }
5072        return r;
5073}
5074
5075static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
5076{
5077        struct kvm_pic *pic = kvm->arch.vpic;
5078        int r;
5079
5080        r = 0;
5081        switch (chip->chip_id) {
5082        case KVM_IRQCHIP_PIC_MASTER:
5083                spin_lock(&pic->lock);
5084                memcpy(&pic->pics[0], &chip->chip.pic,
5085                        sizeof(struct kvm_pic_state));
5086                spin_unlock(&pic->lock);
5087                break;
5088        case KVM_IRQCHIP_PIC_SLAVE:
5089                spin_lock(&pic->lock);
5090                memcpy(&pic->pics[1], &chip->chip.pic,
5091                        sizeof(struct kvm_pic_state));
5092                spin_unlock(&pic->lock);
5093                break;
5094        case KVM_IRQCHIP_IOAPIC:
5095                kvm_set_ioapic(kvm, &chip->chip.ioapic);
5096                break;
5097        default:
5098                r = -EINVAL;
5099                break;
5100        }
5101        kvm_pic_update_irq(pic);
5102        return r;
5103}
5104
5105static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
5106{
5107        struct kvm_kpit_state *kps = &kvm->arch.vpit->pit_state;
5108
5109        BUILD_BUG_ON(sizeof(*ps) != sizeof(kps->channels));
5110
5111        mutex_lock(&kps->lock);
5112        memcpy(ps, &kps->channels, sizeof(*ps));
5113        mutex_unlock(&kps->lock);
5114        return 0;
5115}
5116
5117static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
5118{
5119        int i;
5120        struct kvm_pit *pit = kvm->arch.vpit;
5121
5122        mutex_lock(&pit->pit_state.lock);
5123        memcpy(&pit->pit_state.channels, ps, sizeof(*ps));
5124        for (i = 0; i < 3; i++)
5125                kvm_pit_load_count(pit, i, ps->channels[i].count, 0);
5126        mutex_unlock(&pit->pit_state.lock);
5127        return 0;
5128}
5129
5130static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
5131{
5132        mutex_lock(&kvm->arch.vpit->pit_state.lock);
5133        memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
5134                sizeof(ps->channels));
5135        ps->flags = kvm->arch.vpit->pit_state.flags;
5136        mutex_unlock(&kvm->arch.vpit->pit_state.lock);
5137        memset(&ps->reserved, 0, sizeof(ps->reserved));
5138        return 0;
5139}
5140
5141static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
5142{
5143        int start = 0;
5144        int i;
5145        u32 prev_legacy, cur_legacy;
5146        struct kvm_pit *pit = kvm->arch.vpit;
5147
5148        mutex_lock(&pit->pit_state.lock);
5149        prev_legacy = pit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
5150        cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
5151        if (!prev_legacy && cur_legacy)
5152                start = 1;
5153        memcpy(&pit->pit_state.channels, &ps->channels,
5154               sizeof(pit->pit_state.channels));
5155        pit->pit_state.flags = ps->flags;
5156        for (i = 0; i < 3; i++)
5157                kvm_pit_load_count(pit, i, pit->pit_state.channels[i].count,
5158                                   start && i == 0);
5159        mutex_unlock(&pit->pit_state.lock);
5160        return 0;
5161}
5162
5163static int kvm_vm_ioctl_reinject(struct kvm *kvm,
5164                                 struct kvm_reinject_control *control)
5165{
5166        struct kvm_pit *pit = kvm->arch.vpit;
5167
5168        /* pit->pit_state.lock was overloaded to prevent userspace from getting
5169         * an inconsistent state after running multiple KVM_REINJECT_CONTROL
5170         * ioctls in parallel.  Use a separate lock if that ioctl isn't rare.
5171         */
5172        mutex_lock(&pit->pit_state.lock);
5173        kvm_pit_set_reinject(pit, control->pit_reinject);
5174        mutex_unlock(&pit->pit_state.lock);
5175
5176        return 0;
5177}
5178
5179void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
5180{
5181        /*
5182         * Flush potentially hardware-cached dirty pages to dirty_bitmap.
5183         */
5184        if (kvm_x86_ops.flush_log_dirty)
5185                kvm_x86_ops.flush_log_dirty(kvm);
5186}
5187
5188int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
5189                        bool line_status)
5190{
5191        if (!irqchip_in_kernel(kvm))
5192                return -ENXIO;
5193
5194        irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
5195                                        irq_event->irq, irq_event->level,
5196                                        line_status);
5197        return 0;
5198}
5199
5200int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
5201                            struct kvm_enable_cap *cap)
5202{
5203        int r;
5204
5205        if (cap->flags)
5206                return -EINVAL;
5207
5208        switch (cap->cap) {
5209        case KVM_CAP_DISABLE_QUIRKS:
5210                kvm->arch.disabled_quirks = cap->args[0];
5211                r = 0;
5212                break;
5213        case KVM_CAP_SPLIT_IRQCHIP: {
5214                mutex_lock(&kvm->lock);
5215                r = -EINVAL;
5216                if (cap->args[0] > MAX_NR_RESERVED_IOAPIC_PINS)
5217                        goto split_irqchip_unlock;
5218                r = -EEXIST;
5219                if (irqchip_in_kernel(kvm))
5220                        goto split_irqchip_unlock;
5221                if (kvm->created_vcpus)
5222                        goto split_irqchip_unlock;
5223                r = kvm_setup_empty_irq_routing(kvm);
5224                if (r)
5225                        goto split_irqchip_unlock;
5226                /* Pairs with irqchip_in_kernel. */
5227                smp_wmb();
5228                kvm->arch.irqchip_mode = KVM_IRQCHIP_SPLIT;
5229                kvm->arch.nr_reserved_ioapic_pins = cap->args[0];
5230                r = 0;
5231split_irqchip_unlock:
5232                mutex_unlock(&kvm->lock);
5233                break;
5234        }
5235        case KVM_CAP_X2APIC_API:
5236                r = -EINVAL;
5237                if (cap->args[0] & ~KVM_X2APIC_API_VALID_FLAGS)
5238                        break;
5239
5240                if (cap->args[0] & KVM_X2APIC_API_USE_32BIT_IDS)
5241                        kvm->arch.x2apic_format = true;
5242                if (cap->args[0] & KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
5243                        kvm->arch.x2apic_broadcast_quirk_disabled = true;
5244
5245                r = 0;
5246                break;
5247        case KVM_CAP_X86_DISABLE_EXITS:
5248                r = -EINVAL;
5249                if (cap->args[0] & ~KVM_X86_DISABLE_VALID_EXITS)
5250                        break;
5251
5252                if ((cap->args[0] & KVM_X86_DISABLE_EXITS_MWAIT) &&
5253                        kvm_can_mwait_in_guest())
5254                        kvm->arch.mwait_in_guest = true;
5255                if (cap->args[0] & KVM_X86_DISABLE_EXITS_HLT)
5256                        kvm->arch.hlt_in_guest = true;
5257                if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE)
5258                        kvm->arch.pause_in_guest = true;
5259                if (cap->args[0] & KVM_X86_DISABLE_EXITS_CSTATE)
5260                        kvm->arch.cstate_in_guest = true;
5261                r = 0;
5262                break;
5263        case KVM_CAP_MSR_PLATFORM_INFO:
5264                kvm->arch.guest_can_read_msr_platform_info = cap->args[0];
5265                r = 0;
5266                break;
5267        case KVM_CAP_EXCEPTION_PAYLOAD:
5268                kvm->arch.exception_payload_enabled = cap->args[0];
5269                r = 0;
5270                break;
5271        case KVM_CAP_X86_USER_SPACE_MSR:
5272                kvm->arch.user_space_msr_mask = cap->args[0];
5273                r = 0;
5274                break;
5275        default:
5276                r = -EINVAL;
5277                break;
5278        }
5279        return r;
5280}
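
/*
 * Illustrative sketch only (not part of this file): the VM-wide
 * capabilities handled above are enabled with KVM_ENABLE_CAP on the VM fd,
 * e.g. to let the guest execute HLT without a VM exit.  vm_fd is assumed
 * to come from KVM_CREATE_VM.
 *
 *     struct kvm_enable_cap cap = { 0 };
 *     cap.cap = KVM_CAP_X86_DISABLE_EXITS;
 *     cap.args[0] = KVM_X86_DISABLE_EXITS_HLT;
 *     ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
 */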
5281
5282static void kvm_clear_msr_filter(struct kvm *kvm)
5283{
5284        u32 i;
5285        u32 count = kvm->arch.msr_filter.count;
5286        struct msr_bitmap_range ranges[16];
5287
5288        mutex_lock(&kvm->lock);
5289        kvm->arch.msr_filter.count = 0;
5290        memcpy(ranges, kvm->arch.msr_filter.ranges, count * sizeof(ranges[0]));
5291        mutex_unlock(&kvm->lock);
5292        synchronize_srcu(&kvm->srcu);
5293
5294        for (i = 0; i < count; i++)
5295                kfree(ranges[i].bitmap);
5296}
5297
5298static int kvm_add_msr_filter(struct kvm *kvm, struct kvm_msr_filter_range *user_range)
5299{
5300        struct msr_bitmap_range *ranges = kvm->arch.msr_filter.ranges;
5301        struct msr_bitmap_range range;
5302        unsigned long *bitmap = NULL;
5303        size_t bitmap_size;
5304        int r;
5305
5306        if (!user_range->nmsrs)
5307                return 0;
5308
5309        bitmap_size = BITS_TO_LONGS(user_range->nmsrs) * sizeof(long);
5310        if (!bitmap_size || bitmap_size > KVM_MSR_FILTER_MAX_BITMAP_SIZE)
5311                return -EINVAL;
5312
5313        bitmap = memdup_user((__user u8*)user_range->bitmap, bitmap_size);
5314        if (IS_ERR(bitmap))
5315                return PTR_ERR(bitmap);
5316
5317        range = (struct msr_bitmap_range) {
5318                .flags = user_range->flags,
5319                .base = user_range->base,
5320                .nmsrs = user_range->nmsrs,
5321                .bitmap = bitmap,
5322        };
5323
5324        if (range.flags & ~(KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE)) {
5325                r = -EINVAL;
5326                goto err;
5327        }
5328
5329        if (!range.flags) {
5330                r = -EINVAL;
5331                goto err;
5332        }
5333
5334        /* Everything ok, add this range identifier to our global pool */
5335        ranges[kvm->arch.msr_filter.count] = range;
5336        /* Make sure we filled the array before we tell anyone to walk it */
5337        smp_wmb();
5338        kvm->arch.msr_filter.count++;
5339
5340        return 0;
5341err:
5342        kfree(bitmap);
5343        return r;
5344}
5345
5346static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp)
5347{
5348        struct kvm_msr_filter __user *user_msr_filter = argp;
5349        struct kvm_msr_filter filter;
5350        bool default_allow;
5351        int r = 0;
5352        bool empty = true;
5353        u32 i;
5354
5355        if (copy_from_user(&filter, user_msr_filter, sizeof(filter)))
5356                return -EFAULT;
5357
5358        for (i = 0; i < ARRAY_SIZE(filter.ranges); i++)
5359                empty &= !filter.ranges[i].nmsrs;
5360
5361        default_allow = !(filter.flags & KVM_MSR_FILTER_DEFAULT_DENY);
5362        if (empty && !default_allow)
5363                return -EINVAL;
5364
5365        kvm_clear_msr_filter(kvm);
5366
5367        kvm->arch.msr_filter.default_allow = default_allow;
5368
5369        /*
5370         * Protect from concurrent calls to this function that could trigger
5371         * a TOCTOU violation on kvm->arch.msr_filter.count.
5372         */
5373        mutex_lock(&kvm->lock);
5374        for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) {
5375                r = kvm_add_msr_filter(kvm, &filter.ranges[i]);
5376                if (r)
5377                        break;
5378        }
5379
5380        kvm_make_all_cpus_request(kvm, KVM_REQ_MSR_FILTER_CHANGED);
5381        mutex_unlock(&kvm->lock);
5382
5383        return r;
5384}
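
/*
 * Illustrative sketch only (not part of this file): the simplest valid use
 * of KVM_X86_SET_MSR_FILTER is clearing any installed filter; an empty
 * filter is only rejected when combined with KVM_MSR_FILTER_DEFAULT_DENY,
 * as checked above.  vm_fd is assumed to come from KVM_CREATE_VM; populated
 * ranges additionally carry read/write flags and a bitmap, which are
 * validated in kvm_add_msr_filter().
 *
 *     struct kvm_msr_filter filter = { 0 };    // no ranges, default allow
 *     ioctl(vm_fd, KVM_X86_SET_MSR_FILTER, &filter);
 */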
5385
5386long kvm_arch_vm_ioctl(struct file *filp,
5387                       unsigned int ioctl, unsigned long arg)
5388{
5389        struct kvm *kvm = filp->private_data;
5390        void __user *argp = (void __user *)arg;
5391        int r = -ENOTTY;
5392        /*
5393         * This union makes it completely explicit to gcc-3.x
5394         * that these two variables' stack usage should be
5395         * combined, not added together.
5396         */
5397        union {
5398                struct kvm_pit_state ps;
5399                struct kvm_pit_state2 ps2;
5400                struct kvm_pit_config pit_config;
5401        } u;
5402
5403        switch (ioctl) {
5404        case KVM_SET_TSS_ADDR:
5405                r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
5406                break;
5407        case KVM_SET_IDENTITY_MAP_ADDR: {
5408                u64 ident_addr;
5409
5410                mutex_lock(&kvm->lock);
5411                r = -EINVAL;
5412                if (kvm->created_vcpus)
5413                        goto set_identity_unlock;
5414                r = -EFAULT;
5415                if (copy_from_user(&ident_addr, argp, sizeof(ident_addr)))
5416                        goto set_identity_unlock;
5417                r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr);
5418set_identity_unlock:
5419                mutex_unlock(&kvm->lock);
5420                break;
5421        }
5422        case KVM_SET_NR_MMU_PAGES:
5423                r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
5424                break;
5425        case KVM_GET_NR_MMU_PAGES:
5426                r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
5427                break;
5428        case KVM_CREATE_IRQCHIP: {
5429                mutex_lock(&kvm->lock);
5430
5431                r = -EEXIST;
5432                if (irqchip_in_kernel(kvm))
5433                        goto create_irqchip_unlock;
5434
5435                r = -EINVAL;
5436                if (kvm->created_vcpus)
5437                        goto create_irqchip_unlock;
5438
5439                r = kvm_pic_init(kvm);
5440                if (r)
5441                        goto create_irqchip_unlock;
5442
5443                r = kvm_ioapic_init(kvm);
5444                if (r) {
5445                        kvm_pic_destroy(kvm);
5446                        goto create_irqchip_unlock;
5447                }
5448
5449                r = kvm_setup_default_irq_routing(kvm);
5450                if (r) {
5451                        kvm_ioapic_destroy(kvm);
5452                        kvm_pic_destroy(kvm);
5453                        goto create_irqchip_unlock;
5454                }
5455                /* Write kvm->irq_routing before enabling irqchip_in_kernel. */
5456                smp_wmb();
5457                kvm->arch.irqchip_mode = KVM_IRQCHIP_KERNEL;
5458        create_irqchip_unlock:
5459                mutex_unlock(&kvm->lock);
5460                break;
5461        }
5462        case KVM_CREATE_PIT:
5463                u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY;
5464                goto create_pit;
5465        case KVM_CREATE_PIT2:
5466                r = -EFAULT;
5467                if (copy_from_user(&u.pit_config, argp,
5468                                   sizeof(struct kvm_pit_config)))
5469                        goto out;
5470        create_pit:
5471                mutex_lock(&kvm->lock);
5472                r = -EEXIST;
5473                if (kvm->arch.vpit)
5474                        goto create_pit_unlock;
5475                r = -ENOMEM;
5476                kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags);
5477                if (kvm->arch.vpit)
5478                        r = 0;
5479        create_pit_unlock:
5480                mutex_unlock(&kvm->lock);
5481                break;
5482        case KVM_GET_IRQCHIP: {
5483                /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
5484                struct kvm_irqchip *chip;
5485
5486                chip = memdup_user(argp, sizeof(*chip));
5487                if (IS_ERR(chip)) {
5488                        r = PTR_ERR(chip);
5489                        goto out;
5490                }
5491
5492                r = -ENXIO;
5493                if (!irqchip_kernel(kvm))
5494                        goto get_irqchip_out;
5495                r = kvm_vm_ioctl_get_irqchip(kvm, chip);
5496                if (r)
5497                        goto get_irqchip_out;
5498                r = -EFAULT;
5499                if (copy_to_user(argp, chip, sizeof(*chip)))
5500                        goto get_irqchip_out;
5501                r = 0;
5502        get_irqchip_out:
5503                kfree(chip);
5504                break;
5505        }
5506        case KVM_SET_IRQCHIP: {
5507                /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
5508                struct kvm_irqchip *chip;
5509
5510                chip = memdup_user(argp, sizeof(*chip));
5511                if (IS_ERR(chip)) {
5512                        r = PTR_ERR(chip);
5513                        goto out;
5514                }
5515
5516                r = -ENXIO;
5517                if (!irqchip_kernel(kvm))
5518                        goto set_irqchip_out;
5519                r = kvm_vm_ioctl_set_irqchip(kvm, chip);
5520        set_irqchip_out:
5521                kfree(chip);
5522                break;
5523        }
5524        case KVM_GET_PIT: {
5525                r = -EFAULT;
5526                if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state)))
5527                        goto out;
5528                r = -ENXIO;
5529                if (!kvm->arch.vpit)
5530                        goto out;
5531                r = kvm_vm_ioctl_get_pit(kvm, &u.ps);
5532                if (r)
5533                        goto out;
5534                r = -EFAULT;
5535                if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state)))
5536                        goto out;
5537                r = 0;
5538                break;
5539        }
5540        case KVM_SET_PIT: {
5541                r = -EFAULT;
5542                if (copy_from_user(&u.ps, argp, sizeof(u.ps)))
5543                        goto out;
5544                mutex_lock(&kvm->lock);
5545                r = -ENXIO;
5546                if (!kvm->arch.vpit)
5547                        goto set_pit_out;
5548                r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
5549set_pit_out:
5550                mutex_unlock(&kvm->lock);
5551                break;
5552        }
5553        case KVM_GET_PIT2: {
5554                r = -ENXIO;
5555                if (!kvm->arch.vpit)
5556                        goto out;
5557                r = kvm_vm_ioctl_get_pit2(kvm, &u.ps2);
5558                if (r)
5559                        goto out;
5560                r = -EFAULT;
5561                if (copy_to_user(argp, &u.ps2, sizeof(u.ps2)))
5562                        goto out;
5563                r = 0;
5564                break;
5565        }
5566        case KVM_SET_PIT2: {
5567                r = -EFAULT;
5568                if (copy_from_user(&u.ps2, argp, sizeof(u.ps2)))
5569                        goto out;
5570                mutex_lock(&kvm->lock);
5571                r = -ENXIO;
5572                if (!kvm->arch.vpit)
5573                        goto set_pit2_out;
5574                r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2);
5575set_pit2_out:
5576                mutex_unlock(&kvm->lock);
5577                break;
5578        }
5579        case KVM_REINJECT_CONTROL: {
5580                struct kvm_reinject_control control;
5581                r = -EFAULT;
5582                if (copy_from_user(&control, argp, sizeof(control)))
5583                        goto out;
5584                r = -ENXIO;
5585                if (!kvm->arch.vpit)
5586                        goto out;
5587                r = kvm_vm_ioctl_reinject(kvm, &control);
5588                break;
5589        }
5590        case KVM_SET_BOOT_CPU_ID:
5591                r = 0;
5592                mutex_lock(&kvm->lock);
5593                if (kvm->created_vcpus)
5594                        r = -EBUSY;
5595                else
5596                        kvm->arch.bsp_vcpu_id = arg;
5597                mutex_unlock(&kvm->lock);
5598                break;
5599        case KVM_XEN_HVM_CONFIG: {
5600                struct kvm_xen_hvm_config xhc;
5601                r = -EFAULT;
5602                if (copy_from_user(&xhc, argp, sizeof(xhc)))
5603                        goto out;
5604                r = -EINVAL;
5605                if (xhc.flags)
5606                        goto out;
5607                memcpy(&kvm->arch.xen_hvm_config, &xhc, sizeof(xhc));
5608                r = 0;
5609                break;
5610        }
5611        case KVM_SET_CLOCK: {
5612                struct kvm_clock_data user_ns;
5613                u64 now_ns;
5614
5615                r = -EFAULT;
5616                if (copy_from_user(&user_ns, argp, sizeof(user_ns)))
5617                        goto out;
5618
5619                r = -EINVAL;
5620                if (user_ns.flags)
5621                        goto out;
5622
5623                r = 0;
5624                /*
5625                 * TODO: userspace has to take care of races with VCPU_RUN, so
5626                 * kvm_gen_update_masterclock() can be cut down to locked
5627                 * pvclock_update_vm_gtod_copy().
5628                 */
5629                kvm_gen_update_masterclock(kvm);
5630                now_ns = get_kvmclock_ns(kvm);
5631                kvm->arch.kvmclock_offset += user_ns.clock - now_ns;
5632                kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE);
5633                break;
5634        }
5635        case KVM_GET_CLOCK: {
5636                struct kvm_clock_data user_ns;
5637                u64 now_ns;
5638
5639                now_ns = get_kvmclock_ns(kvm);
5640                user_ns.clock = now_ns;
5641                user_ns.flags = kvm->arch.use_master_clock ? KVM_CLOCK_TSC_STABLE : 0;
5642                memset(&user_ns.pad, 0, sizeof(user_ns.pad));
5643
5644                r = -EFAULT;
5645                if (copy_to_user(argp, &user_ns, sizeof(user_ns)))
5646                        goto out;
5647                r = 0;
5648                break;
5649        }
5650        case KVM_MEMORY_ENCRYPT_OP: {
5651                r = -ENOTTY;
5652                if (kvm_x86_ops.mem_enc_op)
5653                        r = kvm_x86_ops.mem_enc_op(kvm, argp);
5654                break;
5655        }
5656        case KVM_MEMORY_ENCRYPT_REG_REGION: {
5657                struct kvm_enc_region region;
5658
5659                r = -EFAULT;
5660                if (copy_from_user(&region, argp, sizeof(region)))
5661                        goto out;
5662
5663                r = -ENOTTY;
5664                if (kvm_x86_ops.mem_enc_reg_region)
5665                        r = kvm_x86_ops.mem_enc_reg_region(kvm, &region);
5666                break;
5667        }
5668        case KVM_MEMORY_ENCRYPT_UNREG_REGION: {
5669                struct kvm_enc_region region;
5670
5671                r = -EFAULT;
5672                if (copy_from_user(&region, argp, sizeof(region)))
5673                        goto out;
5674
5675                r = -ENOTTY;
5676                if (kvm_x86_ops.mem_enc_unreg_region)
5677                        r = kvm_x86_ops.mem_enc_unreg_region(kvm, &region);
5678                break;
5679        }
5680        case KVM_HYPERV_EVENTFD: {
5681                struct kvm_hyperv_eventfd hvevfd;
5682
5683                r = -EFAULT;
5684                if (copy_from_user(&hvevfd, argp, sizeof(hvevfd)))
5685                        goto out;
5686                r = kvm_vm_ioctl_hv_eventfd(kvm, &hvevfd);
5687                break;
5688        }
5689        case KVM_SET_PMU_EVENT_FILTER:
5690                r = kvm_vm_ioctl_set_pmu_event_filter(kvm, argp);
5691                break;
5692        case KVM_X86_SET_MSR_FILTER:
5693                r = kvm_vm_ioctl_set_msr_filter(kvm, argp);
5694                break;
5695        default:
5696                r = -ENOTTY;
5697        }
5698out:
5699        return r;
5700}
5701
5702static void kvm_init_msr_list(void)
5703{
5704        struct x86_pmu_capability x86_pmu;
5705        u32 dummy[2];
5706        unsigned i;
5707
5708        BUILD_BUG_ON_MSG(INTEL_PMC_MAX_FIXED != 4,
5709                         "Please update the fixed PMCs in msrs_to_save_all[]");
5710
5711        perf_get_x86_pmu_capability(&x86_pmu);
5712
5713        num_msrs_to_save = 0;
5714        num_emulated_msrs = 0;
5715        num_msr_based_features = 0;
5716
5717        for (i = 0; i < ARRAY_SIZE(msrs_to_save_all); i++) {
5718                if (rdmsr_safe(msrs_to_save_all[i], &dummy[0], &dummy[1]) < 0)
5719                        continue;
5720
5721                /*
5722                 * Even MSRs that are valid in the host may not be exposed
5723                 * to the guests in some cases.
5724                 */
5725                switch (msrs_to_save_all[i]) {
5726                case MSR_IA32_BNDCFGS:
5727                        if (!kvm_mpx_supported())
5728                                continue;
5729                        break;
5730                case MSR_TSC_AUX:
5731                        if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP))
5732                                continue;
5733                        break;
5734                case MSR_IA32_UMWAIT_CONTROL:
5735                        if (!kvm_cpu_cap_has(X86_FEATURE_WAITPKG))
5736                                continue;
5737                        break;
5738                case MSR_IA32_RTIT_CTL:
5739                case MSR_IA32_RTIT_STATUS:
5740                        if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT))
5741                                continue;
5742                        break;
5743                case MSR_IA32_RTIT_CR3_MATCH:
5744                        if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
5745                            !intel_pt_validate_hw_cap(PT_CAP_cr3_filtering))
5746                                continue;
5747                        break;
5748                case MSR_IA32_RTIT_OUTPUT_BASE:
5749                case MSR_IA32_RTIT_OUTPUT_MASK:
5750                        if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
5751                                (!intel_pt_validate_hw_cap(PT_CAP_topa_output) &&
5752                                 !intel_pt_validate_hw_cap(PT_CAP_single_range_output)))
5753                                continue;
5754                        break;
5755                case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
5756                        if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
5757                                msrs_to_save_all[i] - MSR_IA32_RTIT_ADDR0_A >=
5758                                intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2)
5759                                continue;
5760                        break;
5761                case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 17:
5762                        if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >=
5763                            min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp))
5764                                continue;
5765                        break;
5766                case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0 + 17:
5767                        if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >=
5768                            min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp))
5769                                continue;
5770                        break;
5771                default:
5772                        break;
5773                }
5774
5775                msrs_to_save[num_msrs_to_save++] = msrs_to_save_all[i];
5776        }
5777
5778        for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) {
5779                if (!kvm_x86_ops.has_emulated_msr(emulated_msrs_all[i]))
5780                        continue;
5781
5782                emulated_msrs[num_emulated_msrs++] = emulated_msrs_all[i];
5783        }
5784
5785        for (i = 0; i < ARRAY_SIZE(msr_based_features_all); i++) {
5786                struct kvm_msr_entry msr;
5787
5788                msr.index = msr_based_features_all[i];
5789                if (kvm_get_msr_feature(&msr))
5790                        continue;
5791
5792                msr_based_features[num_msr_based_features++] = msr_based_features_all[i];
5793        }
5794}
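
/*
 * kvm_init_msr_list() trims the compile-time tables down to what the host
 * actually supports: MSRs that fault in rdmsr_safe() or lack their feature
 * bit are dropped from msrs_to_save, emulated MSRs are filtered through
 * kvm_x86_ops.has_emulated_msr(), and feature MSRs are kept only when
 * kvm_get_msr_feature() succeeds.  The trimmed lists are what the
 * KVM_GET_MSR_INDEX_LIST and KVM_GET_MSR_FEATURE_INDEX_LIST ioctls later
 * report to userspace.
 */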
5795
5796static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
5797                           const void *v)
5798{
5799        int handled = 0;
5800        int n;
5801
5802        do {
5803                n = min(len, 8);
5804                if (!(lapic_in_kernel(vcpu) &&
5805                      !kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev, addr, n, v))
5806                    && kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, n, v))
5807                        break;
5808                handled += n;
5809                addr += n;
5810                len -= n;
5811                v += n;
5812        } while (len);
5813
5814        return handled;
5815}
5816
5817static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
5818{
5819        int handled = 0;
5820        int n;
5821
5822        do {
5823                n = min(len, 8);
5824                if (!(lapic_in_kernel(vcpu) &&
5825                      !kvm_iodevice_read(vcpu, &vcpu->arch.apic->dev,
5826                                         addr, n, v))
5827                    && kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v))
5828                        break;
5829                trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, v);
5830                handled += n;
5831                addr += n;
5832                len -= n;
5833                v += n;
5834        } while (len);
5835
5836        return handled;
5837}
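
/*
 * vcpu_mmio_write()/vcpu_mmio_read() above try to complete an MMIO access
 * entirely in the kernel: each chunk of at most eight bytes is offered
 * first to the in-kernel local APIC page and then to devices on
 * KVM_MMIO_BUS.  They return the number of bytes handled; a short count
 * means the rest of the access must be forwarded to userspace as a
 * KVM_EXIT_MMIO.
 */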
5838
5839static void kvm_set_segment(struct kvm_vcpu *vcpu,
5840                        struct kvm_segment *var, int seg)
5841{
5842        kvm_x86_ops.set_segment(vcpu, var, seg);
5843}
5844
5845void kvm_get_segment(struct kvm_vcpu *vcpu,
5846                     struct kvm_segment *var, int seg)
5847{
5848        kvm_x86_ops.get_segment(vcpu, var, seg);
5849}
5850
5851gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
5852                           struct x86_exception *exception)
5853{
5854        gpa_t t_gpa;
5855
5856        BUG_ON(!mmu_is_nested(vcpu));
5857
5858        /* NPT walks are always user-walks */
5859        access |= PFERR_USER_MASK;
5860        t_gpa  = vcpu->arch.mmu->gva_to_gpa(vcpu, gpa, access, exception);
5861
5862        return t_gpa;
5863}
5864
5865gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
5866                              struct x86_exception *exception)
5867{
5868        u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
5869        return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
5870}
5871
5872gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
5873                               struct x86_exception *exception)
5874{
5875        u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
5876        access |= PFERR_FETCH_MASK;
5877        return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
5878}
5879
5880gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
5881                               struct x86_exception *exception)
5882{
5883        u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
5884        access |= PFERR_WRITE_MASK;
5885        return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
5886}
5887
5888/* used to access any guest's mapped memory without checking CPL */
5889gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
5890                                struct x86_exception *exception)
5891{
5892        return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, exception);
5893}
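
/*
 * Illustrative sketch only, not part of this file: how the gva_to_gpa helpers
 * above compose their "access" argument from #PF error-code style bits.  The
 * EX_PFERR_* values mirror the x86 page-fault error-code layout but are local
 * to this example; the kernel's PFERR_*_MASK definitions are authoritative.
 */
#define EX_PFERR_WRITE  (1u << 1)
#define EX_PFERR_USER   (1u << 2)
#define EX_PFERR_FETCH  (1u << 4)

static unsigned int ex_build_access(int cpl, int is_write, int is_fetch)
{
        unsigned int access = (cpl == 3) ? EX_PFERR_USER : 0;

        if (is_write)
                access |= EX_PFERR_WRITE;
        if (is_fetch)
                access |= EX_PFERR_FETCH;
        return access;  /* the "system" variant above simply passes 0 */
}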
5894
5895static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
5896                                      struct kvm_vcpu *vcpu, u32 access,
5897                                      struct x86_exception *exception)
5898{
5899        void *data = val;
5900        int r = X86EMUL_CONTINUE;
5901
5902        while (bytes) {
5903                gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access,
5904                                                            exception);
5905                unsigned offset = addr & (PAGE_SIZE-1);
5906                unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
5907                int ret;
5908
5909                if (gpa == UNMAPPED_GVA)
5910                        return X86EMUL_PROPAGATE_FAULT;
5911                ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, data,
5912                                               offset, toread);
5913                if (ret < 0) {
5914                        r = X86EMUL_IO_NEEDED;
5915                        goto out;
5916                }
5917
5918                bytes -= toread;
5919                data += toread;
5920                addr += toread;
5921        }
5922out:
5923        return r;
5924}
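
/*
 * Illustrative sketch only, not part of this file: the per-page splitting done
 * by kvm_read_guest_virt_helper() above.  EX_PAGE_SIZE stands in for
 * PAGE_SIZE.  For addr = 0x1000f80 and bytes = 0x100 the loop reads 0x80 bytes
 * from the first page and 0x80 bytes from the next one, translating each page
 * separately.
 */
#define EX_PAGE_SIZE    4096u

static unsigned int ex_first_chunk(unsigned long addr, unsigned int bytes)
{
        unsigned int offset = addr & (EX_PAGE_SIZE - 1);
        unsigned int room = EX_PAGE_SIZE - offset;      /* bytes left in this page */

        return bytes < room ? bytes : room;
}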
5925
5926/* used for instruction fetching */
5927static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
5928                                gva_t addr, void *val, unsigned int bytes,
5929                                struct x86_exception *exception)
5930{
5931        struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
5932        u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
5933        unsigned offset;
5934        int ret;
5935
5936        /* Inline kvm_read_guest_virt_helper for speed.  */
5937        gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access|PFERR_FETCH_MASK,
5938                                                    exception);
5939        if (unlikely(gpa == UNMAPPED_GVA))
5940                return X86EMUL_PROPAGATE_FAULT;
5941
5942        offset = addr & (PAGE_SIZE-1);
5943        if (WARN_ON(offset + bytes > PAGE_SIZE))
5944                bytes = (unsigned)PAGE_SIZE - offset;
5945        ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, val,
5946                                       offset, bytes);
5947        if (unlikely(ret < 0))
5948                return X86EMUL_IO_NEEDED;
5949
5950        return X86EMUL_CONTINUE;
5951}
5952
5953int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
5954                               gva_t addr, void *val, unsigned int bytes,
5955                               struct x86_exception *exception)
5956{
5957        u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
5958
5959        /*
5960         * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
5961         * is returned, but our callers are not ready for that and they blindly
5962         * call kvm_inject_page_fault.  Ensure that they at least do not leak
5963         * uninitialized kernel stack memory into cr2 and error code.
5964         */
5965        memset(exception, 0, sizeof(*exception));
5966        return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
5967                                          exception);
5968}
5969EXPORT_SYMBOL_GPL(kvm_read_guest_virt);
5970
5971static int emulator_read_std(struct x86_emulate_ctxt *ctxt,
5972                             gva_t addr, void *val, unsigned int bytes,
5973                             struct x86_exception *exception, bool system)
5974{
5975        struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
5976        u32 access = 0;
5977
5978        if (!system && kvm_x86_ops.get_cpl(vcpu) == 3)
5979                access |= PFERR_USER_MASK;
5980
5981        return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception);
5982}
5983
5984static int kvm_read_guest_phys_system(struct x86_emulate_ctxt *ctxt,
5985                unsigned long addr, void *val, unsigned int bytes)
5986{
5987        struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
5988        int r = kvm_vcpu_read_guest(vcpu, addr, val, bytes);
5989
5990        return r < 0 ? X86EMUL_IO_NEEDED : X86EMUL_CONTINUE;
5991}
5992
5993static int kvm_write_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
5994                                      struct kvm_vcpu *vcpu, u32 access,
5995                                      struct x86_exception *exception)
5996{
5997        void *data = val;
5998        int r = X86EMUL_CONTINUE;
5999
6000        while (bytes) {
6001                gpa_t gpa =  vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr,
6002                                                             access,
6003                                                             exception);
6004                unsigned offset = addr & (PAGE_SIZE-1);
6005                unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
6006                int ret;
6007
6008                if (gpa == UNMAPPED_GVA)
6009                        return X86EMUL_PROPAGATE_FAULT;
6010                ret = kvm_vcpu_write_guest(vcpu, gpa, data, towrite);
6011                if (ret < 0) {
6012                        r = X86EMUL_IO_NEEDED;
6013                        goto out;
6014                }
6015
6016                bytes -= towrite;
6017                data += towrite;
6018                addr += towrite;
6019        }
6020out:
6021        return r;
6022}
6023
6024static int emulator_write_std(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val,
6025                              unsigned int bytes, struct x86_exception *exception,
6026                              bool system)
6027{
6028        struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
6029        u32 access = PFERR_WRITE_MASK;
6030
6031        if (!system && kvm_x86_ops.get_cpl(vcpu) == 3)
6032                access |= PFERR_USER_MASK;
6033
6034        return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
6035                                           access, exception);
6036}
6037
6038int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val,
6039                                unsigned int bytes, struct x86_exception *exception)
6040{
6041        /* kvm_write_guest_virt_system can pull in tons of pages. */
6042        vcpu->arch.l1tf_flush_l1d = true;
6043
6044        return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
6045                                           PFERR_WRITE_MASK, exception);
6046}
6047EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
6048
6049int handle_ud(struct kvm_vcpu *vcpu)
6050{
6051        static const char kvm_emulate_prefix[] = { __KVM_EMULATE_PREFIX };
6052        int emul_type = EMULTYPE_TRAP_UD;
6053        char sig[5]; /* ud2; .ascii "kvm" */
6054        struct x86_exception e;
6055
6056        if (unlikely(!kvm_x86_ops.can_emulate_instruction(vcpu, NULL, 0)))
6057                return 1;
6058
6059        if (force_emulation_prefix &&
6060            kvm_read_guest_virt(vcpu, kvm_get_linear_rip(vcpu),
6061                                sig, sizeof(sig), &e) == 0 &&
6062            memcmp(sig, kvm_emulate_prefix, sizeof(sig)) == 0) {
6063                kvm_rip_write(vcpu, kvm_rip_read(vcpu) + sizeof(sig));
6064                emul_type = EMULTYPE_TRAP_UD_FORCED;
6065        }
6066
6067        return kvm_emulate_instruction(vcpu, emul_type);
6068}
6069EXPORT_SYMBOL_GPL(handle_ud);
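
/*
 * Illustrative sketch only, not part of this file: the signature check done by
 * handle_ud() above when force_emulation_prefix is set.  The prefix is assumed
 * here to be the two ud2 opcode bytes followed by the ASCII string "kvm"; the
 * authoritative definition is __KVM_EMULATE_PREFIX in asm/emulate_prefix.h.
 */
static int ex_is_forced_emulation_prefix(const unsigned char *insn)
{
        static const unsigned char sig[5] = { 0x0f, 0x0b, 'k', 'v', 'm' };
        int i;

        for (i = 0; i < 5; i++)
                if (insn[i] != sig[i])
                        return 0;
        return 1;       /* the caller then advances RIP past the 5-byte prefix */
}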
6070
6071static int vcpu_is_mmio_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
6072                            gpa_t gpa, bool write)
6073{
6074        /* For APIC access vmexit */
6075        if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
6076                return 1;
6077
6078        if (vcpu_match_mmio_gpa(vcpu, gpa)) {
6079                trace_vcpu_match_mmio(gva, gpa, write, true);
6080                return 1;
6081        }
6082
6083        return 0;
6084}
6085
6086static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
6087                                gpa_t *gpa, struct x86_exception *exception,
6088                                bool write)
6089{
6090        u32 access = ((kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0)
6091                | (write ? PFERR_WRITE_MASK : 0);
6092
6093        /*
6094         * Currently PKRU is only applied to EPT-enabled guests, so
6095         * there is no pkey in the EPT page table for an L1 guest or
6096         * in the EPT shadow page table for an L2 guest.
6097         */
6098        if (vcpu_match_mmio_gva(vcpu, gva)
6099            && !permission_fault(vcpu, vcpu->arch.walk_mmu,
6100                                 vcpu->arch.mmio_access, 0, access)) {
6101                *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
6102                                        (gva & (PAGE_SIZE - 1));
6103                trace_vcpu_match_mmio(gva, *gpa, write, false);
6104                return 1;
6105        }
6106
6107        *gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
6108
6109        if (*gpa == UNMAPPED_GVA)
6110                return -1;
6111
6112        return vcpu_is_mmio_gpa(vcpu, gva, *gpa, write);
6113}
6114
6115int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
6116                        const void *val, int bytes)
6117{
6118        int ret;
6119
6120        ret = kvm_vcpu_write_guest(vcpu, gpa, val, bytes);
6121        if (ret < 0)
6122                return 0;
6123        kvm_page_track_write(vcpu, gpa, val, bytes);
6124        return 1;
6125}
6126
6127struct read_write_emulator_ops {
6128        int (*read_write_prepare)(struct kvm_vcpu *vcpu, void *val,
6129                                  int bytes);
6130        int (*read_write_emulate)(struct kvm_vcpu *vcpu, gpa_t gpa,
6131                                  void *val, int bytes);
6132        int (*read_write_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa,
6133                               int bytes, void *val);
6134        int (*read_write_exit_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa,
6135                                    void *val, int bytes);
6136        bool write;
6137};
6138
6139static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes)
6140{
6141        if (vcpu->mmio_read_completed) {
6142                trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes,
6143                               vcpu->mmio_fragments[0].gpa, val);
6144                vcpu->mmio_read_completed = 0;
6145                return 1;
6146        }
6147
6148        return 0;
6149}
6150
6151static int read_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
6152                        void *val, int bytes)
6153{
6154        return !kvm_vcpu_read_guest(vcpu, gpa, val, bytes);
6155}
6156
6157static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
6158                         void *val, int bytes)
6159{
6160        return emulator_write_phys(vcpu, gpa, val, bytes);
6161}
6162
6163static int write_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes, void *val)
6164{
6165        trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, val);
6166        return vcpu_mmio_write(vcpu, gpa, bytes, val);
6167}
6168
6169static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
6170                          void *val, int bytes)
6171{
6172        trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, NULL);
6173        return X86EMUL_IO_NEEDED;
6174}
6175
6176static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
6177                           void *val, int bytes)
6178{
6179        struct kvm_mmio_fragment *frag = &vcpu->mmio_fragments[0];
6180
6181        memcpy(vcpu->run->mmio.data, frag->data, min(8u, frag->len));
6182        return X86EMUL_CONTINUE;
6183}
6184
6185static const struct read_write_emulator_ops read_emultor = {
6186        .read_write_prepare = read_prepare,
6187        .read_write_emulate = read_emulate,
6188        .read_write_mmio = vcpu_mmio_read,
6189        .read_write_exit_mmio = read_exit_mmio,
6190};
6191
6192static const struct read_write_emulator_ops write_emultor = {
6193        .read_write_emulate = write_emulate,
6194        .read_write_mmio = write_mmio,
6195        .read_write_exit_mmio = write_exit_mmio,
6196        .write = true,
6197};
6198
6199static int emulator_read_write_onepage(unsigned long addr, void *val,
6200                                       unsigned int bytes,
6201                                       struct x86_exception *exception,
6202                                       struct kvm_vcpu *vcpu,
6203                                       const struct read_write_emulator_ops *ops)
6204{
6205        gpa_t gpa;
6206        int handled, ret;
6207        bool write = ops->write;
6208        struct kvm_mmio_fragment *frag;
6209        struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
6210
6211        /*
6212         * If the exit was due to an NPF we may already have a GPA.
6213         * If the GPA is present, use it to avoid the GVA-to-GPA table walk.
6214         * Note, this cannot be used on string operations since a string
6215         * operation using rep will only have the initial GPA from the NPF
6216         * that occurred.
6217         */
6218        if (ctxt->gpa_available && emulator_can_use_gpa(ctxt) &&
6219            (addr & ~PAGE_MASK) == (ctxt->gpa_val & ~PAGE_MASK)) {
6220                gpa = ctxt->gpa_val;
6221                ret = vcpu_is_mmio_gpa(vcpu, addr, gpa, write);
6222        } else {
6223                ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write);
6224                if (ret < 0)
6225                        return X86EMUL_PROPAGATE_FAULT;
6226        }
6227
6228        if (!ret && ops->read_write_emulate(vcpu, gpa, val, bytes))
6229                return X86EMUL_CONTINUE;
6230
6231        /*
6232         * Is this MMIO handled locally?
6233         */
6234        handled = ops->read_write_mmio(vcpu, gpa, bytes, val);
6235        if (handled == bytes)
6236                return X86EMUL_CONTINUE;
6237
6238        gpa += handled;
6239        bytes -= handled;
6240        val += handled;
6241
6242        WARN_ON(vcpu->mmio_nr_fragments >= KVM_MAX_MMIO_FRAGMENTS);
6243        frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++];
6244        frag->gpa = gpa;
6245        frag->data = val;
6246        frag->len = bytes;
6247        return X86EMUL_CONTINUE;
6248}
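
/*
 * Illustrative sketch only, not part of this file: the fragment bookkeeping
 * used above when an MMIO access is not (fully) handled in the kernel.  The
 * structure mirrors struct kvm_mmio_fragment; EX_MAX_FRAGMENTS and
 * ex_record_fragment() are local to the example (a single emulated access can
 * cross at most one page boundary, hence at most two fragments).
 */
#define EX_MAX_FRAGMENTS        2

struct ex_mmio_fragment {
        unsigned long long gpa; /* where the unhandled piece starts */
        void *data;             /* buffer holding (or receiving) that piece */
        unsigned int len;
};

static int ex_record_fragment(struct ex_mmio_fragment *frags, int *nr,
                              unsigned long long gpa, void *data,
                              unsigned int len)
{
        if (*nr >= EX_MAX_FRAGMENTS)
                return -1;      /* the real code WARNs in this case */
        frags[*nr].gpa = gpa;
        frags[*nr].data = data;
        frags[*nr].len = len;
        (*nr)++;
        return 0;       /* userspace completes the fragments after KVM_EXIT_MMIO */
}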
6249
6250static int emulator_read_write(struct x86_emulate_ctxt *ctxt,
6251                        unsigned long addr,
6252                        void *val, unsigned int bytes,
6253                        struct x86_exception *exception,
6254                        const struct read_write_emulator_ops *ops)
6255{
6256        struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
6257        gpa_t gpa;
6258        int rc;
6259
6260        if (ops->read_write_prepare &&
6261                  ops->read_write_prepare(vcpu, val, bytes))
6262                return X86EMUL_CONTINUE;
6263
6264        vcpu->mmio_nr_fragments = 0;
6265
6266        /* Crossing a page boundary? */
6267        if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
6268                int now;
6269
6270                now = -addr & ~PAGE_MASK;
6271                rc = emulator_read_write_onepage(addr, val, now, exception,
6272                                                 vcpu, ops);
6273
6274                if (rc != X86EMUL_CONTINUE)
6275                        return rc;
6276                addr += now;
6277                if (ctxt->mode != X86EMUL_MODE_PROT64)
6278                        addr = (u32)addr;
6279                val += now;
6280                bytes -= now;
6281        }
6282
6283        rc = emulator_read_write_onepage(addr, val, bytes, exception,
6284                                         vcpu, ops);
6285        if (rc != X86EMUL_CONTINUE)
6286                return rc;
6287
6288        if (!vcpu->mmio_nr_fragments)
6289                return rc;
6290
6291        gpa = vcpu->mmio_fragments[0].gpa;
6292
6293        vcpu->mmio_needed = 1;
6294        vcpu->mmio_cur_fragment = 0;
6295
6296        vcpu->run->mmio.len = min(8u, vcpu->mmio_fragments[0].len);
6297        vcpu->run->mmio.is_write = vcpu->mmio_is_write = ops->write;
6298        vcpu->run->exit_reason = KVM_EXIT_MMIO;
6299        vcpu->run->mmio.phys_addr = gpa;
6300
6301        return ops->read_write_exit_mmio(vcpu, gpa, val, bytes);
6302}
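
/*
 * Illustrative sketch only, not part of this file: the "now = -addr &
 * ~PAGE_MASK" trick used above to split an access that crosses a page
 * boundary.  With 4 KiB pages and addr = 0x2ffa, ex_bytes_to_page_end()
 * returns 6, so a 16-byte access is emulated as 6 bytes in the current page
 * followed by 10 bytes in the next.  EX_PAGE_MASK stands in for PAGE_MASK.
 */
#define EX_PAGE_MASK    (~4095ul)

static unsigned long ex_bytes_to_page_end(unsigned long addr)
{
        /* two's complement negation yields (page_size - offset) mod page_size */
        return -addr & ~EX_PAGE_MASK;
}

static int ex_crosses_page(unsigned long addr, unsigned int bytes)
{
        return (((addr + bytes - 1) ^ addr) & EX_PAGE_MASK) != 0;
}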
6303
6304static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
6305                                  unsigned long addr,
6306                                  void *val,
6307                                  unsigned int bytes,
6308                                  struct x86_exception *exception)
6309{
6310        return emulator_read_write(ctxt, addr, val, bytes,
6311                                   exception, &read_emultor);
6312}
6313
6314static int emulator_write_emulated(struct x86_emulate_ctxt *ctxt,
6315                            unsigned long addr,
6316                            const void *val,
6317                            unsigned int bytes,
6318                            struct x86_exception *exception)
6319{
6320        return emulator_read_write(ctxt, addr, (void *)val, bytes,
6321                                   exception, &write_emultor);
6322}
6323
6324#define CMPXCHG_TYPE(t, ptr, old, new) \
6325        (cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old))
6326
6327#ifdef CONFIG_X86_64
6328#  define CMPXCHG64(ptr, old, new) CMPXCHG_TYPE(u64, ptr, old, new)
6329#else
6330#  define CMPXCHG64(ptr, old, new) \
6331        (cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old))
6332#endif
6333
6334static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
6335                                     unsigned long addr,
6336                                     const void *old,
6337                                     const void *new,
6338                                     unsigned int bytes,
6339                                     struct x86_exception *exception)
6340{
6341        struct kvm_host_map map;
6342        struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
6343        u64 page_line_mask;
6344        gpa_t gpa;
6345        char *kaddr;
6346        bool exchanged;
6347
6348        /* a guest's cmpxchg8b has to be emulated atomically */
6349        if (bytes > 8 || (bytes & (bytes - 1)))
6350                goto emul_write;
6351
6352        gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
6353
6354        if (gpa == UNMAPPED_GVA ||
6355            (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
6356                goto emul_write;
6357
6358        /*
6359         * Emulate the atomic as a straight write to avoid #AC if SLD is
6360         * enabled in the host and the access splits a cache line.
6361         */
6362        if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
6363                page_line_mask = ~(cache_line_size() - 1);
6364        else
6365                page_line_mask = PAGE_MASK;
6366
6367        if (((gpa + bytes - 1) & page_line_mask) != (gpa & page_line_mask))
6368                goto emul_write;
6369
6370        if (kvm_vcpu_map(vcpu, gpa_to_gfn(gpa), &map))
6371                goto emul_write;
6372
6373        kaddr = map.hva + offset_in_page(gpa);
6374
6375        switch (bytes) {
6376        case 1:
6377                exchanged = CMPXCHG_TYPE(u8, kaddr, old, new);
6378                break;
6379        case 2:
6380                exchanged = CMPXCHG_TYPE(u16, kaddr, old, new);
6381                break;
6382        case 4:
6383                exchanged = CMPXCHG_TYPE(u32, kaddr, old, new);
6384                break;
6385        case 8:
6386                exchanged = CMPXCHG64(kaddr, old, new);
6387                break;
6388        default:
6389                BUG();
6390        }
6391
6392        kvm_vcpu_unmap(vcpu, &map, true);
6393
6394        if (!exchanged)
6395                return X86EMUL_CMPXCHG_FAILED;
6396
6397        kvm_page_track_write(vcpu, gpa, new, bytes);
6398
6399        return X86EMUL_CONTINUE;
6400
6401emul_write:
6402        printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
6403
6404        return emulator_write_emulated(ctxt, addr, new, bytes, exception);
6405}
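
/*
 * Illustrative sketch only, not part of this file: the straddle check used
 * above before emulating a guest cmpxchg with a host cmpxchg.  "block_size" is
 * either the page size or, when split-lock detection is enabled, the cache
 * line size; the operand is rejected if its first and last bytes do not fall
 * within the same aligned block.
 */
static int ex_operand_straddles(unsigned long long gpa, unsigned int bytes,
                                unsigned long long block_size)
{
        unsigned long long mask = ~(block_size - 1);

        return ((gpa + bytes - 1) & mask) != (gpa & mask);
}
/*
 * Example: ex_operand_straddles(0x1ffc, 8, 4096) is non-zero, so an 8-byte
 * cmpxchg at guest physical address 0x1ffc would take the plain-write
 * fallback path above.
 */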
6406
6407static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
6408{
6409        int r = 0, i;
6410
6411        for (i = 0; i < vcpu->arch.pio.count; i++) {
6412                if (vcpu->arch.pio.in)
6413                        r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, vcpu->arch.pio.port,
6414                                            vcpu->arch.pio.size, pd);
6415                else
6416                        r = kvm_io_bus_write(vcpu, KVM_PIO_BUS,
6417                                             vcpu->arch.pio.port, vcpu->arch.pio.size,
6418                                             pd);
6419                if (r)
6420                        break;
6421                pd += vcpu->arch.pio.size;
6422        }
6423        return r;
6424}
6425
6426static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size,
6427                               unsigned short port, void *val,
6428                               unsigned int count, bool in)
6429{
6430        vcpu->arch.pio.port = port;
6431        vcpu->arch.pio.in = in;
6432        vcpu->arch.pio.count  = count;
6433        vcpu->arch.pio.size = size;
6434
6435        if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
6436                vcpu->arch.pio.count = 0;
6437                return 1;
6438        }
6439
6440        vcpu->run->exit_reason = KVM_EXIT_IO;
6441        vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
6442        vcpu->run->io.size = size;
6443        vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
6444        vcpu->run->io.count = count;
6445        vcpu->run->io.port = port;
6446
6447        return 0;
6448}
6449
6450static int emulator_pio_in(struct kvm_vcpu *vcpu, int size,
6451                           unsigned short port, void *val, unsigned int count)
6452{
6453        int ret;
6454
6455        if (vcpu->arch.pio.count)
6456                goto data_avail;
6457
6458        memset(vcpu->arch.pio_data, 0, size * count);
6459
6460        ret = emulator_pio_in_out(vcpu, size, port, val, count, true);
6461        if (ret) {
6462data_avail:
6463                memcpy(val, vcpu->arch.pio_data, size * count);
6464                trace_kvm_pio(KVM_PIO_IN, port, size, count, vcpu->arch.pio_data);
6465                vcpu->arch.pio.count = 0;
6466                return 1;
6467        }
6468
6469        return 0;
6470}
6471
6472static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
6473                                    int size, unsigned short port, void *val,
6474                                    unsigned int count)
6475{
6476        return emulator_pio_in(emul_to_vcpu(ctxt), size, port, val, count);
6477
6478}
6479
6480static int emulator_pio_out(struct kvm_vcpu *vcpu, int size,
6481                            unsigned short port, const void *val,
6482                            unsigned int count)
6483{
6484        memcpy(vcpu->arch.pio_data, val, size * count);
6485        trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data);
6486        return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false);
6487}
6488
6489static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
6490                                     int size, unsigned short port,
6491                                     const void *val, unsigned int count)
6492{
6493        return emulator_pio_out(emul_to_vcpu(ctxt), size, port, val, count);
6494}
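
/*
 * Illustrative sketch only, not part of this file: the two-phase port-IN flow
 * implemented by emulator_pio_in() above.  Phase one asks the in-kernel buses
 * (or bounces to userspace and returns 0); phase two, on re-entry, finds
 * "count" non-zero, copies the staged bytes out of the shared data buffer and
 * lets the instruction complete.  ex_pio_state and the callback are made up
 * for the example.
 */
struct ex_pio_state {
        unsigned int count;             /* non-zero: staged data is pending */
        unsigned char data[4096];       /* stand-in for vcpu->arch.pio_data */
};

static int ex_pio_in(struct ex_pio_state *pio, int size, unsigned int count,
                     void *val,
                     int (*try_kernel_pio)(struct ex_pio_state *pio))
{
        unsigned int i, n = size * count;

        if (pio->count)
                goto data_avail;        /* second phase: userspace filled data[] */

        if (!try_kernel_pio(pio))
                return 0;               /* exit to userspace with KVM_EXIT_IO */

data_avail:
        for (i = 0; i < n; i++)
                ((unsigned char *)val)[i] = pio->data[i];
        pio->count = 0;
        return 1;                       /* the emulated IN can now complete */
}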
6495
6496static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
6497{
6498        return kvm_x86_ops.get_segment_base(vcpu, seg);
6499}
6500
6501static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address)
6502{
6503        kvm_mmu_invlpg(emul_to_vcpu(ctxt), address);
6504}
6505
6506static int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu)
6507{
6508        if (!need_emulate_wbinvd(vcpu))
6509                return X86EMUL_CONTINUE;
6510
6511        if (kvm_x86_ops.has_wbinvd_exit()) {
6512                int cpu = get_cpu();
6513
6514                cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
6515                smp_call_function_many(vcpu->arch.wbinvd_dirty_mask,
6516                                wbinvd_ipi, NULL, 1);
6517                put_cpu();
6518                cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
6519        } else
6520                wbinvd();
6521        return X86EMUL_CONTINUE;
6522}
6523
6524int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
6525{
6526        kvm_emulate_wbinvd_noskip(vcpu);
6527        return kvm_skip_emulated_instruction(vcpu);
6528}
6529EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
6530
6531
6532
6533static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt)
6534{
6535        kvm_emulate_wbinvd_noskip(emul_to_vcpu(ctxt));
6536}
6537
6538static int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr,
6539                           unsigned long *dest)
6540{
6541        return kvm_get_dr(emul_to_vcpu(ctxt), dr, dest);
6542}
6543
6544static int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
6545                           unsigned long value)
6546{
6547
6548        return __kvm_set_dr(emul_to_vcpu(ctxt), dr, value);
6549}
6550
6551static u64 mk_cr_64(u64 curr_cr, u32 new_val)
6552{
6553        return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
6554}
6555
6556static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr)
6557{
6558        struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
6559        unsigned long value;
6560
6561        switch (cr) {
6562        case 0:
6563                value = kvm_read_cr0(vcpu);
6564                break;
6565        case 2:
6566                value = vcpu->arch.cr2;
6567                break;
6568        case 3:
6569                value = kvm_read_cr3(vcpu);
6570                break;
6571        case 4:
6572                value = kvm_read_cr4(vcpu);
6573                break;
6574        case 8:
6575                value = kvm_get_cr8(vcpu);
6576                break;
6577        default:
6578                kvm_err("%s: unexpected cr %u\n", __func__, cr);
6579                return 0;
6580        }
6581
6582        return value;
6583}
6584
6585static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
6586{
6587        struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
6588        int res = 0;
6589
6590        switch (cr) {
6591        case 0:
6592                res = kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
6593                break;
6594        case 2:
6595                vcpu->arch.cr2 = val;
6596                break;
6597        case 3:
6598                res = kvm_set_cr3(vcpu, val);
6599                break;
6600        case 4:
6601                res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
6602                break;
6603        case 8:
6604                res = kvm_set_cr8(vcpu, val);
6605                break;
6606        default:
6607                kvm_err("%s: unexpected cr %u\n", __func__, cr);
6608                res = -1;
6609        }
6610
6611        return res;
6612}
6613
6614static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
6615{
6616        return kvm_x86_ops.get_cpl(emul_to_vcpu(ctxt));
6617}
6618
6619static void emulator_get_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
6620{
6621        kvm_x86_ops.get_gdt(emul_to_vcpu(ctxt), dt);
6622}
6623
6624static void emulator_get_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
6625{
6626        kvm_x86_ops.get_idt(emul_to_vcpu(ctxt), dt);
6627}
6628
6629static void emulator_set_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
6630{
6631        kvm_x86_ops.set_gdt(emul_to_vcpu(ctxt), dt);
6632}
6633
6634static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
6635{
6636        kvm_x86_ops.set_idt(emul_to_vcpu(ctxt), dt);
6637}
6638
6639static unsigned long emulator_get_cached_segment_base(
6640        struct x86_emulate_ctxt *ctxt, int seg)
6641{
6642        return get_segment_base(emul_to_vcpu(ctxt), seg);
6643}
6644
6645static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector,
6646                                 struct desc_struct *desc, u32 *base3,
6647                                 int seg)
6648{
6649        struct kvm_segment var;
6650
6651        kvm_get_segment(emul_to_vcpu(ctxt), &var, seg);
6652        *selector = var.selector;
6653
6654        if (var.unusable) {
6655                memset(desc, 0, sizeof(*desc));
6656                if (base3)
6657                        *base3 = 0;
6658                return false;
6659        }
6660
6661        if (var.g)
6662                var.limit >>= 12;
6663        set_desc_limit(desc, var.limit);
6664        set_desc_base(desc, (unsigned long)var.base);
6665#ifdef CONFIG_X86_64
6666        if (base3)
6667                *base3 = var.base >> 32;
6668#endif
6669        desc->type = var.type;
6670        desc->s = var.s;
6671        desc->dpl = var.dpl;
6672        desc->p = var.present;
6673        desc->avl = var.avl;
6674        desc->l = var.l;
6675        desc->d = var.db;
6676        desc->g = var.g;
6677
6678        return true;
6679}
6680
6681static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector,
6682                                 struct desc_struct *desc, u32 base3,
6683                                 int seg)
6684{
6685        struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
6686        struct kvm_segment var;
6687
6688        var.selector = selector;
6689        var.base = get_desc_base(desc);
6690#ifdef CONFIG_X86_64
6691        var.base |= ((u64)base3) << 32;
6692#endif
6693        var.limit = get_desc_limit(desc);
6694        if (desc->g)
6695                var.limit = (var.limit << 12) | 0xfff;
6696        var.type = desc->type;
6697        var.dpl = desc->dpl;
6698        var.db = desc->d;
6699        var.s = desc->s;
6700        var.l = desc->l;
6701        var.g = desc->g;
6702        var.avl = desc->avl;
6703        var.present = desc->p;
6704        var.unusable = !var.present;
6705        var.padding = 0;
6706
6707        kvm_set_segment(vcpu, &var, seg);
6708        return;
6709}
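
/*
 * Illustrative sketch only, not part of this file: the limit-granularity
 * conversion performed by emulator_get_segment()/emulator_set_segment() above.
 * With the G bit set, a descriptor limit of 0xfffff expands to a byte limit of
 * 0xffffffff; the reverse conversion simply drops the low 12 bits.
 */
static unsigned int ex_desc_to_byte_limit(unsigned int desc_limit, int g_bit)
{
        return g_bit ? (desc_limit << 12) | 0xfff : desc_limit;
}

static unsigned int ex_byte_to_desc_limit(unsigned int byte_limit, int g_bit)
{
        return g_bit ? byte_limit >> 12 : byte_limit;
}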
6710
6711static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
6712                            u32 msr_index, u64 *pdata)
6713{
6714        struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
6715        int r;
6716
6717        r = kvm_get_msr(vcpu, msr_index, pdata);
6718
6719        if (r && kvm_get_msr_user_space(vcpu, msr_index, r)) {
6720                /* Bounce to user space */
6721                return X86EMUL_IO_NEEDED;
6722        }
6723
6724        return r;
6725}
6726
6727static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
6728                            u32 msr_index, u64 data)
6729{
6730        struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
6731        int r;
6732
6733        r = kvm_set_msr(vcpu, msr_index, data);
6734
6735        if (r && kvm_set_msr_user_space(vcpu, msr_index, data, r)) {
6736                /* Bounce to user space */
6737                return X86EMUL_IO_NEEDED;
6738        }
6739
6740        return r;
6741}
6742
6743static u64 emulator_get_smbase(struct x86_emulate_ctxt *ctxt)
6744{
6745        struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
6746
6747        return vcpu->arch.smbase;
6748}
6749
6750static void emulator_set_smbase(struct x86_emulate_ctxt *ctxt, u64 smbase)
6751{
6752        struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
6753
6754        vcpu->arch.smbase = smbase;
6755}
6756
6757static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt,
6758                              u32 pmc)
6759{
6760        return kvm_pmu_is_valid_rdpmc_ecx(emul_to_vcpu(ctxt), pmc);
6761}
6762
6763static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt,
6764                             u32 pmc, u64 *pdata)
6765{
6766        return kvm_pmu_rdpmc(emul_to_vcpu(ctxt), pmc, pdata);
6767}
6768
6769static void emulator_halt(struct x86_emulate_ctxt *ctxt)
6770{
6771        emul_to_vcpu(ctxt)->arch.halt_request = 1;
6772}
6773
6774static int emulator_intercept(struct x86_emulate_ctxt *ctxt,
6775                              struct x86_instruction_info *info,
6776                              enum x86_intercept_stage stage)
6777{
6778        return kvm_x86_ops.check_intercept(emul_to_vcpu(ctxt), info, stage,
6779                                            &ctxt->exception);
6780}
6781
6782static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
6783                              u32 *eax, u32 *ebx, u32 *ecx, u32 *edx,
6784                              bool exact_only)
6785{
6786        return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, exact_only);
6787}
6788
6789static bool emulator_guest_has_long_mode(struct x86_emulate_ctxt *ctxt)
6790{
6791        return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_LM);
6792}
6793
6794static bool emulator_guest_has_movbe(struct x86_emulate_ctxt *ctxt)
6795{
6796        return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_MOVBE);
6797}
6798
6799static bool emulator_guest_has_fxsr(struct x86_emulate_ctxt *ctxt)
6800{
6801        return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_FXSR);
6802}
6803
6804static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg)
6805{
6806        return kvm_register_read(emul_to_vcpu(ctxt), reg);
6807}
6808
6809static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulong val)
6810{
6811        kvm_register_write(emul_to_vcpu(ctxt), reg, val);
6812}
6813
6814static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked)
6815{
6816        kvm_x86_ops.set_nmi_mask(emul_to_vcpu(ctxt), masked);
6817}
6818
6819static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt)
6820{
6821        return emul_to_vcpu(ctxt)->arch.hflags;
6822}
6823
6824static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_flags)
6825{
6826        emul_to_vcpu(ctxt)->arch.hflags = emul_flags;
6827}
6828
6829static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt,
6830                                  const char *smstate)
6831{
6832        return kvm_x86_ops.pre_leave_smm(emul_to_vcpu(ctxt), smstate);
6833}
6834
6835static void emulator_post_leave_smm(struct x86_emulate_ctxt *ctxt)
6836{
6837        kvm_smm_changed(emul_to_vcpu(ctxt));
6838}
6839
6840static int emulator_set_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr)
6841{
6842        return __kvm_set_xcr(emul_to_vcpu(ctxt), index, xcr);
6843}
6844
6845static const struct x86_emulate_ops emulate_ops = {
6846        .read_gpr            = emulator_read_gpr,
6847        .write_gpr           = emulator_write_gpr,
6848        .read_std            = emulator_read_std,
6849        .write_std           = emulator_write_std,
6850        .read_phys           = kvm_read_guest_phys_system,
6851        .fetch               = kvm_fetch_guest_virt,
6852        .read_emulated       = emulator_read_emulated,
6853        .write_emulated      = emulator_write_emulated,
6854        .cmpxchg_emulated    = emulator_cmpxchg_emulated,
6855        .invlpg              = emulator_invlpg,
6856        .pio_in_emulated     = emulator_pio_in_emulated,
6857        .pio_out_emulated    = emulator_pio_out_emulated,
6858        .get_segment         = emulator_get_segment,
6859        .set_segment         = emulator_set_segment,
6860        .get_cached_segment_base = emulator_get_cached_segment_base,
6861        .get_gdt             = emulator_get_gdt,
6862        .get_idt             = emulator_get_idt,
6863        .set_gdt             = emulator_set_gdt,
6864        .set_idt             = emulator_set_idt,
6865        .get_cr              = emulator_get_cr,
6866        .set_cr              = emulator_set_cr,
6867        .cpl                 = emulator_get_cpl,
6868        .get_dr              = emulator_get_dr,
6869        .set_dr              = emulator_set_dr,
6870        .get_smbase          = emulator_get_smbase,
6871        .set_smbase          = emulator_set_smbase,
6872        .set_msr             = emulator_set_msr,
6873        .get_msr             = emulator_get_msr,
6874        .check_pmc           = emulator_check_pmc,
6875        .read_pmc            = emulator_read_pmc,
6876        .halt                = emulator_halt,
6877        .wbinvd              = emulator_wbinvd,
6878        .fix_hypercall       = emulator_fix_hypercall,
6879        .intercept           = emulator_intercept,
6880        .get_cpuid           = emulator_get_cpuid,
6881        .guest_has_long_mode = emulator_guest_has_long_mode,
6882        .guest_has_movbe     = emulator_guest_has_movbe,
6883        .guest_has_fxsr      = emulator_guest_has_fxsr,
6884        .set_nmi_mask        = emulator_set_nmi_mask,
6885        .get_hflags          = emulator_get_hflags,
6886        .set_hflags          = emulator_set_hflags,
6887        .pre_leave_smm       = emulator_pre_leave_smm,
6888        .post_leave_smm      = emulator_post_leave_smm,
6889        .set_xcr             = emulator_set_xcr,
6890};
6891
6892static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
6893{
6894        u32 int_shadow = kvm_x86_ops.get_interrupt_shadow(vcpu);
6895        /*
6896         * An sti; sti; sequence only disables interrupts for the first
6897         * instruction. So, if the last instruction, be it emulated or
6898         * not, left the system with the INT_STI flag enabled, it
6899         * means that the last instruction was an sti. We should not
6900         * leave the flag on in this case. The same goes for mov ss.
6901         */
6902        if (int_shadow & mask)
6903                mask = 0;
6904        if (unlikely(int_shadow || mask)) {
6905                kvm_x86_ops.set_interrupt_shadow(vcpu, mask);
6906                if (!mask)
6907                        kvm_make_request(KVM_REQ_EVENT, vcpu);
6908        }
6909}
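
/*
 * Illustrative sketch only, not part of this file: the rule applied by
 * toggle_interruptibility() above.  If the previous instruction already left
 * the requested shadow (STI or MOV SS) active, the shadow is dropped rather
 * than re-armed, so back-to-back STIs do not keep blocking interrupts; when a
 * previously set shadow is cleared, pending events are re-evaluated.
 */
static unsigned int ex_next_int_shadow(unsigned int old_shadow,
                                       unsigned int requested_mask,
                                       int *reevaluate_events)
{
        unsigned int mask = (old_shadow & requested_mask) ? 0 : requested_mask;

        *reevaluate_events = (old_shadow && !mask);
        return mask;    /* the shadow state the vCPU ends up with */
}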
6910
6911static bool inject_emulated_exception(struct kvm_vcpu *vcpu)
6912{
6913        struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
6914        if (ctxt->exception.vector == PF_VECTOR)
6915                return kvm_inject_emulated_page_fault(vcpu, &ctxt->exception);
6916
6917        if (ctxt->exception.error_code_valid)
6918                kvm_queue_exception_e(vcpu, ctxt->exception.vector,
6919                                      ctxt->exception.error_code);
6920        else
6921                kvm_queue_exception(vcpu, ctxt->exception.vector);
6922        return false;
6923}
6924
6925static struct x86_emulate_ctxt *alloc_emulate_ctxt(struct kvm_vcpu *vcpu)
6926{
6927        struct x86_emulate_ctxt *ctxt;
6928
6929        ctxt = kmem_cache_zalloc(x86_emulator_cache, GFP_KERNEL_ACCOUNT);
6930        if (!ctxt) {
6931                pr_err("kvm: failed to allocate vcpu's emulator\n");
6932                return NULL;
6933        }
6934
6935        ctxt->vcpu = vcpu;
6936        ctxt->ops = &emulate_ops;
6937        vcpu->arch.emulate_ctxt = ctxt;
6938
6939        return ctxt;
6940}
6941
6942static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
6943{
6944        struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
6945        int cs_db, cs_l;
6946
6947        kvm_x86_ops.get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
6948
6949        ctxt->gpa_available = false;
6950        ctxt->eflags = kvm_get_rflags(vcpu);
6951        ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0;
6952
6953        ctxt->eip = kvm_rip_read(vcpu);
6954        ctxt->mode = (!is_protmode(vcpu))               ? X86EMUL_MODE_REAL :
6955                     (ctxt->eflags & X86_EFLAGS_VM)     ? X86EMUL_MODE_VM86 :
6956                     (cs_l && is_long_mode(vcpu))       ? X86EMUL_MODE_PROT64 :
6957                     cs_db                              ? X86EMUL_MODE_PROT32 :
6958                                                          X86EMUL_MODE_PROT16;
6959        BUILD_BUG_ON(HF_GUEST_MASK != X86EMUL_GUEST_MASK);
6960        BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK);
6961        BUILD_BUG_ON(HF_SMM_INSIDE_NMI_MASK != X86EMUL_SMM_INSIDE_NMI_MASK);
6962
6963        init_decode_cache(ctxt);
6964        vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
6965}
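
/*
 * Illustrative sketch only, not part of this file: the mode-selection ladder
 * used by init_emulate_ctxt() above.  The EX_MODE_* values and parameter names
 * are local to the example; the kernel uses the X86EMUL_MODE_* constants.
 */
enum ex_cpu_mode {
        EX_MODE_REAL, EX_MODE_VM86, EX_MODE_PROT16, EX_MODE_PROT32, EX_MODE_PROT64
};

static enum ex_cpu_mode ex_pick_mode(int protected_mode, int eflags_vm,
                                     int long_mode_active, int cs_l, int cs_db)
{
        if (!protected_mode)
                return EX_MODE_REAL;
        if (eflags_vm)
                return EX_MODE_VM86;
        if (long_mode_active && cs_l)
                return EX_MODE_PROT64;
        return cs_db ? EX_MODE_PROT32 : EX_MODE_PROT16;
}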
6966
6967void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
6968{
6969        struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
6970        int ret;
6971
6972        init_emulate_ctxt(vcpu);
6973
6974        ctxt->op_bytes = 2;
6975        ctxt->ad_bytes = 2;
6976        ctxt->_eip = ctxt->eip + inc_eip;
6977        ret = emulate_int_real(ctxt, irq);
6978
6979        if (ret != X86EMUL_CONTINUE) {
6980                kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
6981        } else {
6982                ctxt->eip = ctxt->_eip;
6983                kvm_rip_write(vcpu, ctxt->eip);
6984                kvm_set_rflags(vcpu, ctxt->eflags);
6985        }
6986}
6987EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
6988
6989static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
6990{
6991        ++vcpu->stat.insn_emulation_fail;
6992        trace_kvm_emulate_insn_failed(vcpu);
6993
6994        if (emulation_type & EMULTYPE_VMWARE_GP) {
6995                kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
6996                return 1;
6997        }
6998
6999        if (emulation_type & EMULTYPE_SKIP) {
7000                vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
7001                vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
7002                vcpu->run->internal.ndata = 0;
7003                return 0;
7004        }
7005
7006        kvm_queue_exception(vcpu, UD_VECTOR);
7007
7008        if (!is_guest_mode(vcpu) && kvm_x86_ops.get_cpl(vcpu) == 0) {
7009                vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
7010                vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
7011                vcpu->run->internal.ndata = 0;
7012                return 0;
7013        }
7014
7015        return 1;
7016}
7017
7018static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
7019                                  bool write_fault_to_shadow_pgtable,
7020                                  int emulation_type)
7021{
7022        gpa_t gpa = cr2_or_gpa;
7023        kvm_pfn_t pfn;
7024
7025        if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF))
7026                return false;
7027
7028        if (WARN_ON_ONCE(is_guest_mode(vcpu)) ||
7029            WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF)))
7030                return false;
7031
7032        if (!vcpu->arch.mmu->direct_map) {
7033                /*
7034                 * Write permission should be allowed since only
7035                 * write accesses need to be emulated.
7036                 */
7037                gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL);
7038
7039                /*
7040                 * If the mapping is invalid in the guest, let the CPU
7041                 * retry it to generate a fault.
7042                 */
7043                if (gpa == UNMAPPED_GVA)
7044                        return true;
7045        }
7046
7047        /*
7048         * Do not retry the unhandleable instruction if it faults on
7049         * read-only host memory, otherwise it will go into an infinite loop:
7050         * retry instruction -> write #PF -> emulation failure -> retry
7051         * instruction -> ...
7052         */
7053        pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa));
7054
7055        /*
7056         * If the instruction failed on an error pfn, it cannot be fixed;
7057         * report the error to userspace.
7058         */
7059        if (is_error_noslot_pfn(pfn))
7060                return false;
7061
7062        kvm_release_pfn_clean(pfn);
7063
7064        /* The instructions are well-emulated on direct mmu. */
7065        if (vcpu->arch.mmu->direct_map) {
7066                unsigned int indirect_shadow_pages;
7067
7068                spin_lock(&vcpu->kvm->mmu_lock);
7069                indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages;
7070                spin_unlock(&vcpu->kvm->mmu_lock);
7071
7072                if (indirect_shadow_pages)
7073                        kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
7074
7075                return true;
7076        }
7077
7078        /*
7079         * If emulation was due to an access to a shadowed page table
7080         * and it failed, try to unshadow the page and re-enter the
7081         * guest to let the CPU execute the instruction.
7082         */
7083        kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
7084
7085        /*
7086         * If the access faults on its page table, it cannot
7087         * be fixed by unprotecting the shadow page and it should
7088         * be reported to userspace.
7089         */
7090        return !write_fault_to_shadow_pgtable;
7091}
7092
7093static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
7094                              gpa_t cr2_or_gpa,  int emulation_type)
7095{
7096        struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
7097        unsigned long last_retry_eip, last_retry_addr, gpa = cr2_or_gpa;
7098
7099        last_retry_eip = vcpu->arch.last_retry_eip;
7100        last_retry_addr = vcpu->arch.last_retry_addr;
7101
7102        /*
7103         * If the emulation is caused by #PF and it is a non-page-table-
7104         * writing instruction, it means the VM-EXIT was caused by shadow
7105         * page protection: we can zap the shadow page and retry this
7106         * instruction directly.
7107         *
7108         * Note: if the guest uses a non-page-table modifying instruction
7109         * on the PDE that points to the instruction, then we will unmap
7110         * the instruction and go into an infinite loop. So, we cache the
7111         * last retried eip and the last fault address; if we meet the eip
7112         * and the address again, we can break out of the potential infinite
7113         * loop.
7114         */
7115        vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0;
7116
7117        if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF))
7118                return false;
7119
7120        if (WARN_ON_ONCE(is_guest_mode(vcpu)) ||
7121            WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF)))
7122                return false;
7123
7124        if (x86_page_table_writing_insn(ctxt))
7125                return false;
7126
7127        if (ctxt->eip == last_retry_eip && last_retry_addr == cr2_or_gpa)
7128                return false;
7129
7130        vcpu->arch.last_retry_eip = ctxt->eip;
7131        vcpu->arch.last_retry_addr = cr2_or_gpa;
7132
7133        if (!vcpu->arch.mmu->direct_map)
7134                gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL);
7135
7136        kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
7137
7138        return true;
7139}
7140
7141static int complete_emulated_mmio(struct kvm_vcpu *vcpu);
7142static int complete_emulated_pio(struct kvm_vcpu *vcpu);
7143
7144static void kvm_smm_changed(struct kvm_vcpu *vcpu)
7145{
7146        if (!(vcpu->arch.hflags & HF_SMM_MASK)) {
7147                /* This is a good place to trace that we are exiting SMM.  */
7148                trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, false);
7149
7150                /* Process a latched INIT or SMI, if any.  */
7151                kvm_make_request(KVM_REQ_EVENT, vcpu);
7152        }
7153
7154        kvm_mmu_reset_context(vcpu);
7155}
7156
7157static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7,
7158                                unsigned long *db)
7159{
7160        u32 dr6 = 0;
7161        int i;
7162        u32 enable, rwlen;
7163
7164        enable = dr7;
7165        rwlen = dr7 >> 16;
7166        for (i = 0; i < 4; i++, enable >>= 2, rwlen >>= 4)
7167                if ((enable & 3) && (rwlen & 15) == type && db[i] == addr)
7168                        dr6 |= (1 << i);
7169        return dr6;
7170}
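
/*
 * Illustrative sketch only, not part of this file: the DR7 layout that
 * kvm_vcpu_check_hw_bp() above decodes.  ex_dr7_encode() builds the two
 * per-breakpoint fields the function consumes: an enable bit pair (2 bits per
 * breakpoint in the low byte) and a 4-bit R/W+LEN nibble per breakpoint
 * starting at bit 16.  "type" is the combined R/W+LEN value the caller passes,
 * e.g. 0 for an instruction breakpoint.
 */
static unsigned int ex_dr7_encode(int n, unsigned int type, int global)
{
        unsigned int enable = global ? 2u : 1u; /* Gn or Ln bit for breakpoint n */

        return (enable << (n * 2)) | ((type & 0xf) << (16 + n * 4));
}
/*
 * With dr7 = ex_dr7_encode(1, 0, 0), the loop above sees (enable & 3) != 0 and
 * (rwlen & 15) == 0 on its second iteration, so an instruction breakpoint on
 * db[1] sets bit 1 of the returned DR6 value.
 */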
7171
7172static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu)
7173{
7174        struct kvm_run *kvm_run = vcpu->run;
7175
7176        if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
7177                kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 | DR6_RTM;
7178                kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu);
7179                kvm_run->debug.arch.exception = DB_VECTOR;
7180                kvm_run->exit_reason = KVM_EXIT_DEBUG;
7181                return 0;
7182        }
7183        kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BS);
7184        return 1;
7185}
7186
7187int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
7188{
7189        unsigned long rflags = kvm_x86_ops.get_rflags(vcpu);
7190        int r;
7191
7192        r = kvm_x86_ops.skip_emulated_instruction(vcpu);
7193        if (unlikely(!r))
7194                return 0;
7195
7196        /*
7197         * rflags is the old, "raw" value of the flags.  The new value has
7198         * not been saved yet.
7199         *
7200         * This is correct even for TF set by the guest, because "the
7201         * processor will not generate this exception after the instruction
7202         * that sets the TF flag".
7203         */
7204        if (unlikely(rflags & X86_EFLAGS_TF))
7205                r = kvm_vcpu_do_singlestep(vcpu);
7206        return r;
7207}
7208EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction);
7209
7210static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
7211{
7212        if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) &&
7213            (vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) {
7214                struct kvm_run *kvm_run = vcpu->run;
7215                unsigned long eip = kvm_get_linear_rip(vcpu);
7216                u32 dr6 = kvm_vcpu_check_hw_bp(eip, 0,
7217                                           vcpu->arch.guest_debug_dr7,
7218                                           vcpu->arch.eff_db);
7219
7220                if (dr6 != 0) {
7221                        kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1 | DR6_RTM;
7222                        kvm_run->debug.arch.pc = eip;
7223                        kvm_run->debug.arch.exception = DB_VECTOR;
7224                        kvm_run->exit_reason = KVM_EXIT_DEBUG;
7225                        *r = 0;
7226                        return true;
7227                }
7228        }
7229
7230        if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK) &&
7231            !(kvm_get_rflags(vcpu) & X86_EFLAGS_RF)) {
7232                unsigned long eip = kvm_get_linear_rip(vcpu);
7233                u32 dr6 = kvm_vcpu_check_hw_bp(eip, 0,
7234                                           vcpu->arch.dr7,
7235                                           vcpu->arch.db);
7236
7237                if (dr6 != 0) {
7238                        kvm_queue_exception_p(vcpu, DB_VECTOR, dr6);
7239                        *r = 1;
7240                        return true;
7241                }
7242        }
7243
7244        return false;
7245}
7246
7247static bool is_vmware_backdoor_opcode(struct x86_emulate_ctxt *ctxt)
7248{
7249        switch (ctxt->opcode_len) {
7250        case 1:
7251                switch (ctxt->b) {
7252                case 0xe4:      /* IN */
7253                case 0xe5:
7254                case 0xec:
7255                case 0xed:
7256                case 0xe6:      /* OUT */
7257                case 0xe7:
7258                case 0xee:
7259                case 0xef:
7260                case 0x6c:      /* INS */
7261                case 0x6d:
7262                case 0x6e:      /* OUTS */
7263                case 0x6f:
7264                        return true;
7265                }
7266                break;
7267        case 2:
7268                switch (ctxt->b) {
7269                case 0x33:      /* RDPMC */
7270                        return true;
7271                }
7272                break;
7273        }
7274
7275        return false;
7276}
7277
7278int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
7279                            int emulation_type, void *insn, int insn_len)
7280{
7281        int r;
7282        struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
7283        bool writeback = true;
7284        bool write_fault_to_spt;
7285
7286        if (unlikely(!kvm_x86_ops.can_emulate_instruction(vcpu, insn, insn_len)))
7287                return 1;
7288
7289        vcpu->arch.l1tf_flush_l1d = true;
7290
7291        /*
7292         * Clear write_fault_to_shadow_pgtable here to ensure it is
7293         * never reused.
7294         */
7295        write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
7296        vcpu->arch.write_fault_to_shadow_pgtable = false;
7297        kvm_clear_exception_queue(vcpu);
7298
7299        if (!(emulation_type & EMULTYPE_NO_DECODE)) {
7300                init_emulate_ctxt(vcpu);
7301
7302                /*
7303                 * We will reenter on the same instruction since
7304                 * we do not set complete_userspace_io.  This does not
7305                 * handle watchpoints yet; those would be handled in
7306                 * the emulate_ops.
7307                 */
7308                if (!(emulation_type & EMULTYPE_SKIP) &&
7309                    kvm_vcpu_check_breakpoint(vcpu, &r))
7310                        return r;
7311
7312                ctxt->interruptibility = 0;
7313                ctxt->have_exception = false;
7314                ctxt->exception.vector = -1;
7315                ctxt->perm_ok = false;
7316
7317                ctxt->ud = emulation_type & EMULTYPE_TRAP_UD;
7318
7319                r = x86_decode_insn(ctxt, insn, insn_len);
7320
7321                trace_kvm_emulate_insn_start(vcpu);
7322                ++vcpu->stat.insn_emulation;
7323                if (r != EMULATION_OK)  {
7324                        if ((emulation_type & EMULTYPE_TRAP_UD) ||
7325                            (emulation_type & EMULTYPE_TRAP_UD_FORCED)) {
7326                                kvm_queue_exception(vcpu, UD_VECTOR);
7327                                return 1;
7328                        }
7329                        if (reexecute_instruction(vcpu, cr2_or_gpa,
7330                                                  write_fault_to_spt,
7331                                                  emulation_type))
7332                                return 1;
7333                        if (ctxt->have_exception) {
7334                                /*
7335                                 * #UD should result in just EMULATION_FAILED, and a trap-like
7336                                 * exception should not be encountered during decode.
7337                                 */
7338                                WARN_ON_ONCE(ctxt->exception.vector == UD_VECTOR ||
7339                                             exception_type(ctxt->exception.vector) == EXCPT_TRAP);
7340                                inject_emulated_exception(vcpu);
7341                                return 1;
7342                        }
7343                        return handle_emulation_failure(vcpu, emulation_type);
7344                }
7345        }
7346
7347        if ((emulation_type & EMULTYPE_VMWARE_GP) &&
7348            !is_vmware_backdoor_opcode(ctxt)) {
7349                kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
7350                return 1;
7351        }
7352
7353        /*
7354         * Note, EMULTYPE_SKIP is intended for use *only* by vendor callbacks
7355         * for kvm_skip_emulated_instruction().  The caller is responsible for
7356         * updating interruptibility state and injecting single-step #DBs.
7357         */
7358        if (emulation_type & EMULTYPE_SKIP) {
7359                kvm_rip_write(vcpu, ctxt->_eip);
7360                if (ctxt->eflags & X86_EFLAGS_RF)
7361                        kvm_set_rflags(vcpu, ctxt->eflags & ~X86_EFLAGS_RF);
7362                return 1;
7363        }
7364
7365        if (retry_instruction(ctxt, cr2_or_gpa, emulation_type))
7366                return 1;
7367
7368        /* This is needed for the VMware backdoor interface to work, since
7369           it changes register values during the I/O operation. */
7370        if (vcpu->arch.emulate_regs_need_sync_from_vcpu) {
7371                vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
7372                emulator_invalidate_register_cache(ctxt);
7373        }
7374
7375restart:
7376        if (emulation_type & EMULTYPE_PF) {
7377                /* Save the faulting GPA (cr2) in the address field */
7378                ctxt->exception.address = cr2_or_gpa;
7379
7380                /* With shadow page tables, cr2 contains a GVA or nGPA. */
7381                if (vcpu->arch.mmu->direct_map) {
7382                        ctxt->gpa_available = true;
7383                        ctxt->gpa_val = cr2_or_gpa;
7384                }
7385        } else {
7386                /* Sanitize the address out of an abundance of paranoia. */
7387                ctxt->exception.address = 0;
7388        }
7389
7390        r = x86_emulate_insn(ctxt);
7391
7392        if (r == EMULATION_INTERCEPTED)
7393                return 1;
7394
7395        if (r == EMULATION_FAILED) {
7396                if (reexecute_instruction(vcpu, cr2_or_gpa, write_fault_to_spt,
7397                                        emulation_type))
7398                        return 1;
7399
7400                return handle_emulation_failure(vcpu, emulation_type);
7401        }
7402
7403        if (ctxt->have_exception) {
7404                r = 1;
7405                if (inject_emulated_exception(vcpu))
7406                        return r;
7407        } else if (vcpu->arch.pio.count) {
7408                if (!vcpu->arch.pio.in) {
7409                        /* FIXME: return into emulator if single-stepping.  */
7410                        vcpu->arch.pio.count = 0;
7411                } else {
7412                        writeback = false;
7413                        vcpu->arch.complete_userspace_io = complete_emulated_pio;
7414                }
7415                r = 0;
7416        } else if (vcpu->mmio_needed) {
7417                ++vcpu->stat.mmio_exits;
7418
7419                if (!vcpu->mmio_is_write)
7420                        writeback = false;
7421                r = 0;
7422                vcpu->arch.complete_userspace_io = complete_emulated_mmio;
7423        } else if (r == EMULATION_RESTART)
7424                goto restart;
7425        else
7426                r = 1;
7427
7428        if (writeback) {
7429                unsigned long rflags = kvm_x86_ops.get_rflags(vcpu);
7430                toggle_interruptibility(vcpu, ctxt->interruptibility);
7431                vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
7432                if (!ctxt->have_exception ||
7433                    exception_type(ctxt->exception.vector) == EXCPT_TRAP) {
7434                        kvm_rip_write(vcpu, ctxt->eip);
7435                        if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
7436                                r = kvm_vcpu_do_singlestep(vcpu);
7437                        if (kvm_x86_ops.update_emulated_instruction)
7438                                kvm_x86_ops.update_emulated_instruction(vcpu);
7439                        __kvm_set_rflags(vcpu, ctxt->eflags);
7440                }
7441
7442                /*
7443                 * For STI, interrupts are shadowed; so KVM_REQ_EVENT will
7444                 * do nothing, and it will be requested again as soon as
7445                 * the shadow expires.  But we still need to check here,
7446                 * because POPF has no interrupt shadow.
7447                 */
7448                if (unlikely((ctxt->eflags & ~rflags) & X86_EFLAGS_IF))
7449                        kvm_make_request(KVM_REQ_EVENT, vcpu);
7450        } else
7451                vcpu->arch.emulate_regs_need_sync_to_vcpu = true;
7452
7453        return r;
7454}
7455
7456int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type)
7457{
7458        return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0);
7459}
7460EXPORT_SYMBOL_GPL(kvm_emulate_instruction);
7461
7462int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu,
7463                                        void *insn, int insn_len)
7464{
7465        return x86_emulate_instruction(vcpu, 0, 0, insn, insn_len);
7466}
7467EXPORT_SYMBOL_GPL(kvm_emulate_instruction_from_buffer);
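
As a hedged illustration of how these wrappers are typically consumed (not taken from this file): a vendor #UD intercept can defer to the emulator with EMULTYPE_TRAP_UD, so that a #UD is re-queued to the guest if the instruction is not one KVM chooses to emulate. The function name below is hypothetical; only the call and the 1/0 return convention come from the code above.

static int example_handle_ud_intercept(struct kvm_vcpu *vcpu)
{
        /* Returns 1 to resume the guest, 0 to exit to userspace. */
        return kvm_emulate_instruction(vcpu, EMULTYPE_TRAP_UD);
}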
7468
7469static int complete_fast_pio_out_port_0x7e(struct kvm_vcpu *vcpu)
7470{
7471        vcpu->arch.pio.count = 0;
7472        return 1;
7473}
7474
7475static int complete_fast_pio_out(struct kvm_vcpu *vcpu)
7476{
7477        vcpu->arch.pio.count = 0;
7478
7479        if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip)))
7480                return 1;
7481
7482        return kvm_skip_emulated_instruction(vcpu);
7483}
7484
7485static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size,
7486                            unsigned short port)
7487{
7488        unsigned long val = kvm_rax_read(vcpu);
7489        int ret = emulator_pio_out(vcpu, size, port, &val, 1);
7490
7491        if (ret)
7492                return ret;
7493
7494        /*
7495         * Work around userspace that relies on the old KVM behavior of %rip
7496         * being incremented prior to exiting to userspace to handle "OUT 0x7e".
7497         */
7498        if (port == 0x7e &&
7499            kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_OUT_7E_INC_RIP)) {
7500                vcpu->arch.complete_userspace_io =
7501                        complete_fast_pio_out_port_0x7e;
7502                kvm_skip_emulated_instruction(vcpu);
7503        } else {
7504                vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu);
7505                vcpu->arch.complete_userspace_io = complete_fast_pio_out;
7506        }
7507        return 0;
7508}
7509
7510static int complete_fast_pio_in(struct kvm_vcpu *vcpu)
7511{
7512        unsigned long val;
7513
7514        /* We should only ever be called with arch.pio.count equal to 1 */
7515        BUG_ON(vcpu->arch.pio.count != 1);
7516
7517        if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip))) {
7518                vcpu->arch.pio.count = 0;
7519                return 1;
7520        }
7521
7522        /* For size less than 4 we merge, else we zero extend */
7523        val = (vcpu->arch.pio.size < 4) ? kvm_rax_read(vcpu) : 0;
7524
7525        /*
7526         * Since vcpu->arch.pio.count == 1, let emulator_pio_in() perform
7527         * the copy and tracing.
7528         */
7529        emulator_pio_in(vcpu, vcpu->arch.pio.size, vcpu->arch.pio.port, &val, 1);
7530        kvm_rax_write(vcpu, val);
7531
7532        return kvm_skip_emulated_instruction(vcpu);
7533}
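
An editorial worked example of the merge rule above: a 1- or 2-byte IN architecturally writes only the low bytes of RAX and preserves the rest, while a 4-byte IN zero-extends into the full register.

/*
 * Example (not kernel code): with RAX = 0x1122334455667788, a 2-byte IN
 * returning 0xabcd leaves RAX = 0x112233445566abcd, whereas a 4-byte IN
 * returning 0xdeadbeef leaves RAX = 0x00000000deadbeef.
 */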
7534
7535static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size,
7536                           unsigned short port)
7537{
7538        unsigned long val;
7539        int ret;
7540
7541        /* For size less than 4 we merge, else we zero extend */
7542        val = (size < 4) ? kvm_rax_read(vcpu) : 0;
7543
7544        ret = emulator_pio_in(vcpu, size, port, &val, 1);
7545        if (ret) {
7546                kvm_rax_write(vcpu, val);
7547                return ret;
7548        }
7549
7550        vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu);
7551        vcpu->arch.complete_userspace_io = complete_fast_pio_in;
7552
7553        return 0;
7554}
7555
7556int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in)
7557{
7558        int ret;
7559
7560        if (in)
7561                ret = kvm_fast_pio_in(vcpu, size, port);
7562        else
7563                ret = kvm_fast_pio_out(vcpu, size, port);
7564        return ret && kvm_skip_emulated_instruction(vcpu);
7565}
7566EXPORT_SYMBOL_GPL(kvm_fast_pio);
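
Below is a minimal, hedged sketch of a caller, not taken from this file: a vendor I/O-exit handler is expected to decode size, direction and port from its exit information and hand off to kvm_fast_pio(). The function name and the exit_info bit layout are hypothetical; only the kvm_fast_pio() call and its return convention (non-zero: keep running the guest, zero: exit to userspace and let complete_userspace_io finish the access) reflect the code above.

static int example_handle_io_exit(struct kvm_vcpu *vcpu, u64 exit_info)
{
        int size = (exit_info & 0x7) + 1;        /* hypothetical size encoding */
        int in = !!(exit_info & 0x8);            /* hypothetical direction bit */
        unsigned short port = exit_info >> 16;   /* hypothetical port field    */

        return kvm_fast_pio(vcpu, size, port, in);
}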
7567
7568static int kvmclock_cpu_down_prep(unsigned int cpu)
7569{
7570        __this_cpu_write(cpu_tsc_khz, 0);
7571        return 0;
7572}
7573
7574static void tsc_khz_changed(void *data)
7575{
7576        struct cpufreq_freqs *freq = data;
7577        unsigned long khz = 0;
7578
7579        if (data)
7580                khz = freq->new;
7581        else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
7582                khz = cpufreq_quick_get(raw_smp_processor_id());
7583        if (!khz)
7584                khz = tsc_khz;
7585        __this_cpu_write(cpu_tsc_khz, khz);
7586}
7587
7588#ifdef CONFIG_X86_64
7589static void kvm_hyperv_tsc_notifier(void)
7590{
7591        struct kvm *kvm;
7592        struct kvm_vcpu *vcpu;
7593        int cpu;
7594
7595        mutex_lock(&kvm_lock);
7596        list_for_each_entry(kvm, &vm_list, vm_list)
7597                kvm_make_mclock_inprogress_request(kvm);
7598
7599        hyperv_stop_tsc_emulation();
7600
7601        /* TSC frequency always matches when on Hyper-V */
7602        for_each_present_cpu(cpu)
7603                per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
7604        kvm_max_guest_tsc_khz = tsc_khz;
7605
7606        list_for_each_entry(kvm, &vm_list, vm_list) {
7607                struct kvm_arch *ka = &kvm->arch;
7608
7609                spin_lock(&ka->pvclock_gtod_sync_lock);
7610
7611                pvclock_update_vm_gtod_copy(kvm);
7612
7613                kvm_for_each_vcpu(cpu, vcpu, kvm)
7614                        kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
7615
7616                kvm_for_each_vcpu(cpu, vcpu, kvm)
7617                        kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu);
7618
7619                spin_unlock(&ka->pvclock_gtod_sync_lock);
7620        }
7621        mutex_unlock(&kvm_lock);
7622}
7623#endif
7624
7625static void __kvmclock_cpufreq_notifier(struct cpufreq_freqs *freq, int cpu)
7626{
7627        struct kvm *kvm;
7628        struct kvm_vcpu *vcpu;
7629        int i, send_ipi = 0;
7630
7631        /*
7632         * We allow guests to temporarily run on slowing clocks,
7633         * provided we notify them after, or to run on accelerating
7634         * clocks, provided we notify them before.  Thus time never
7635         * goes backwards.
7636         *
7637         * However, we have a problem.  We can't atomically update
7638         * the frequency of a given CPU from this function; it is
7639         * merely a notifier, which can be called from any CPU.
7640         * Changing the TSC frequency at arbitrary points in time
7641         * requires a recomputation of local variables related to
7642         * the TSC for each VCPU.  We must flag these local variables
7643         * to be updated and be sure the update takes place with the
7644         * new frequency before any guests proceed.
7645         *
7646         * Unfortunately, the combination of hotplug CPU and frequency
7647         * change creates an intractable locking scenario; the order
7648         * of when these callouts happen is undefined with respect to
7649         * CPU hotplug, and they can race with each other.  As such,
7650         * merely setting per_cpu(cpu_tsc_khz) = X during a hotadd is
7651         * undefined; you can actually have a CPU frequency change take
7652         * place in between the computation of X and the setting of the
7653         * variable.  To protect against this problem, all updates of
7654         * the per_cpu tsc_khz variable are done in an interrupt
7655         * protected IPI, and all callers wishing to update the value
7656         * must wait for a synchronous IPI to complete (which is trivial
7657         * if the caller is on the CPU already).  This establishes the
7658         * necessary total order on variable updates.
7659         *
7660         * Note that because a guest time update may take place
7661         * anytime after the setting of the VCPU's request bit, the
7662         * correct TSC value must be set before the request.  However,
7663         * to ensure the update actually makes it to any guest which
7664         * starts running in hardware virtualization between the set
7665         * and the acquisition of the spinlock, we must also ping the
7666         * CPU after setting the request bit.
7667         *
7668         */
7669
7670        smp_call_function_single(cpu, tsc_khz_changed, freq, 1);
7671
7672        mutex_lock(&kvm_lock);
7673        list_for_each_entry(kvm, &vm_list, vm_list) {
7674                kvm_for_each_vcpu(i, vcpu, kvm) {
7675                        if (vcpu->cpu != cpu)
7676                                continue;
7677                        kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
7678                        if (vcpu->cpu != raw_smp_processor_id())
7679                                send_ipi = 1;
7680                }
7681        }
7682        mutex_unlock(&kvm_lock);
7683
7684        if (freq->old < freq->new && send_ipi) {
7685                /*
7686                 * We are raising the frequency.  We must make sure the
7687                 * guest doesn't see old kvmclock values while running with
7688                 * the new frequency, otherwise we risk the guest seeing
7689                 * time go backwards.
7690                 *
7691                 * In case we update the frequency for another CPU (which
7692                 * might be in guest context), send an interrupt to kick
7693                 * the CPU out of guest context.  The next time guest
7694                 * context is entered, kvmclock will be updated, so the
7695                 * guest will not see stale values.
7696                 */
7697                smp_call_function_single(cpu, tsc_khz_changed, freq, 1);
7698        }
7699}
7700
7701static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
7702                                     void *data)
7703{
7704        struct cpufreq_freqs *freq = data;
7705        int cpu;
7706
7707        if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
7708                return 0;
7709        if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
7710                return 0;
7711
7712        for_each_cpu(cpu, freq->policy->cpus)
7713                __kvmclock_cpufreq_notifier(freq, cpu);
7714
7715        return 0;
7716}
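
A worked example of the filtering above: on a frequency drop (freq->old > freq->new) the PRECHANGE notification is skipped and cpu_tsc_khz is updated at POSTCHANGE, so the guest briefly runs with a TSC slower than kvmclock assumes and its clock advances slightly slow, but never jumps backwards. On a rise the update happens at PRECHANGE and POSTCHANGE is skipped, so kvmclock is rescaled before the TSC actually speeds up, and again time never goes backwards.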
7717
7718static struct notifier_block kvmclock_cpufreq_notifier_block = {
7719        .notifier_call  = kvmclock_cpufreq_notifier
7720};
7721
7722static int kvmclock_cpu_online(unsigned int cpu)
7723{
7724        tsc_khz_changed(NULL);
7725        return 0;
7726}
7727
7728static void kvm_timer_init(void)
7729{
7730        max_tsc_khz = tsc_khz;
7731
7732        if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
7733#ifdef CONFIG_CPU_FREQ
7734                struct cpufreq_policy *policy;
7735                int cpu;
7736
7737                cpu = get_cpu();
7738                policy = cpufreq_cpu_get(cpu);
7739                if (policy) {
7740                        if (policy->cpuinfo.max_freq)
7741                                max_tsc_khz = policy->cpuinfo.max_freq;
7742                        cpufreq_cpu_put(policy);
7743                }
7744                put_cpu();
7745#endif
7746                cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
7747                                          CPUFREQ_TRANSITION_NOTIFIER);
7748        }
7749
7750        cpuhp_setup_state(CPUHP_AP_X86_KVM_CLK_ONLINE, "x86/kvm/clk:online",
7751                          kvmclock_cpu_online, kvmclock_cpu_down_prep);
7752}
7753
7754DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
7755EXPORT_PER_CPU_SYMBOL_GPL(current_vcpu);
7756
7757int kvm_is_in_guest(void)
7758{
7759        return __this_cpu_read(current_vcpu) != NULL;
7760}
7761
7762static int kvm_is_user_mode(void)
7763{
7764        int user_mode = 3;
7765
7766        if (__this_cpu_read(current_vcpu))
7767                user_mode = kvm_x86_ops.get_cpl(__this_cpu_read(current_vcpu));
7768
7769        return user_mode != 0;
7770}
7771
7772static unsigned long kvm_get_guest_ip(void)
7773{
7774        unsigned long ip = 0;
7775
7776        if (__this_cpu_read(current_vcpu))
7777                ip = kvm_rip_read(__this_cpu_read(current_vcpu));
7778
7779        return ip;
7780}
7781
7782static void kvm_handle_intel_pt_intr(void)
7783{
7784        struct kvm_vcpu *vcpu = __this_cpu_read(current_vcpu);
7785
7786        kvm_make_request(KVM_REQ_PMI, vcpu);
7787        __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT,
7788                        (unsigned long *)&vcpu->arch.pmu.global_status);
7789}
7790
7791static struct perf_guest_info_callbacks kvm_guest_cbs = {
7792        .is_in_guest            = kvm_is_in_guest,
7793        .is_user_mode           = kvm_is_user_mode,
7794        .get_guest_ip           = kvm_get_guest_ip,
7795        .handle_intel_pt_intr   = kvm_handle_intel_pt_intr,
7796};
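
A hedged sketch of how these callbacks are consumed (the real consumer is the perf core, via the registration done in kvm_arch_init() below; the function here is hypothetical): when a PMI sample fires while a vCPU is loaded, perf asks whether the sample hit guest context and, if so, records the guest RIP instead of the host one.

static unsigned long example_sample_instruction_pointer(struct pt_regs *regs)
{
        /* Attribute the sample to the guest if a vCPU was running. */
        if (kvm_guest_cbs.is_in_guest())
                return kvm_guest_cbs.get_guest_ip();

        return instruction_pointer(regs);
}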
7797
7798#ifdef CONFIG_X86_64
7799static void pvclock_gtod_update_fn(struct work_struct *work)
7800{
7801        struct kvm *kvm;
7802
7803        struct kvm_vcpu *vcpu;
7804        int i;
7805
7806        mutex_lock(&kvm_lock);
7807        list_for_each_entry(kvm, &vm_list, vm_list)
7808                kvm_for_each_vcpu(i, vcpu, kvm)
7809                        kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
7810        atomic_set(&kvm_guest_has_master_clock, 0);
7811        mutex_unlock(&kvm_lock);
7812}
7813
7814static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
7815
7816/*
7817 * Notification about pvclock gtod data update.
7818 */
7819static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
7820                               void *priv)
7821{
7822        struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
7823        struct timekeeper *tk = priv;
7824
7825        update_pvclock_gtod(tk);
7826
7827        /* Disable the master clock if the host does not trust, or does
7828         * not use, a TSC-based clocksource.
7829         */
7830        if (!gtod_is_based_on_tsc(gtod->clock.vclock_mode) &&
7831            atomic_read(&kvm_guest_has_master_clock) != 0)
7832                queue_work(system_long_wq, &pvclock_gtod_work);
7833
7834        return 0;
7835}
7836
7837static struct notifier_block pvclock_gtod_notifier = {
7838        .notifier_call = pvclock_gtod_notify,
7839};
7840#endif
7841
7842int kvm_arch_init(void *opaque)
7843{
7844        struct kvm_x86_init_ops *ops = opaque;
7845        int r;
7846
7847        if (kvm_x86_ops.hardware_enable) {
7848                printk(KERN_ERR "kvm: already loaded the other module\n");
7849                r = -EEXIST;
7850                goto out;
7851        }
7852
7853        if (!ops->cpu_has_kvm_support()) {
7854                pr_err_ratelimited("kvm: no hardware support\n");
7855                r = -EOPNOTSUPP;
7856                goto out;
7857        }
7858        if (ops->disabled_by_bios()) {
7859                pr_err_ratelimited("kvm: disabled by bios\n");
7860                r = -EOPNOTSUPP;
7861                goto out;
7862        }
7863
7864        /*
7865         * KVM explicitly assumes that the guest has an FPU and
7866         * FXSAVE/FXRSTOR.  For example, the KVM_GET_FPU ioctl explicitly casts
7867         * the vCPU's FPU state to a struct fxregs_state.
7868         */
7869        if (!boot_cpu_has(X86_FEATURE_FPU) || !boot_cpu_has(X86_FEATURE_FXSR)) {
7870                printk(KERN_ERR "kvm: inadequate fpu\n");
7871                r = -EOPNOTSUPP;
7872                goto out;
7873        }
7874
7875        r = -ENOMEM;
7876        x86_fpu_cache = kmem_cache_create("x86_fpu", sizeof(struct fpu),
7877                                          __alignof__(struct fpu), SLAB_ACCOUNT,
7878                                          NULL);
7879        if (!x86_fpu_cache) {
7880                printk(KERN_ERR "kvm: failed to allocate cache for x86 fpu\n");
7881                goto out;
7882        }
7883
7884        x86_emulator_cache = kvm_alloc_emulator_cache();
7885        if (!x86_emulator_cache) {
7886                pr_err("kvm: failed to allocate cache for x86 emulator\n");
7887                goto out_free_x86_fpu_cache;
7888        }
7889
7890        user_return_msrs = alloc_percpu(struct kvm_user_return_msrs);
7891        if (!user_return_msrs) {
7892                printk(KERN_ERR "kvm: failed to allocate percpu kvm_user_return_msrs\n");
7893                goto out_free_x86_emulator_cache;
7894        }
7895
7896        r = kvm_mmu_module_init();
7897        if (r)
7898                goto out_free_percpu;
7899
7900        kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
7901                        PT_DIRTY_MASK, PT64_NX_MASK, 0,
7902                        PT_PRESENT_MASK, 0, sme_me_mask);
7903        kvm_timer_init();
7904
7905        perf_register_guest_info_callbacks(&kvm_guest_cbs);
7906
7907        if (boot_cpu_has(X86_FEATURE_XSAVE)) {
7908                host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
7909                supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0;
7910        }
7911
7912        kvm_lapic_init();
7913        if (pi_inject_timer == -1)
7914                pi_inject_timer = housekeeping_enabled(HK_FLAG_TIMER);
7915#ifdef CONFIG_X86_64
7916        pvclock_gtod_register_notifier(&pvclock_gtod_notifier);
7917
7918        if (hypervisor_is_type(X86_HYPER_MS_HYPERV))
7919                set_hv_tscchange_cb(kvm_hyperv_tsc_notifier);
7920#endif
7921
7922        return 0;
7923
7924out_free_percpu:
7925        free_percpu(user_return_msrs);
7926out_free_x86_emulator_cache:
7927        kmem_cache_destroy(x86_emulator_cache);
7928out_free_x86_fpu_cache:
7929        kmem_cache_destroy(x86_fpu_cache);
7930out:
7931        return r;
7932}
7933
7934void kvm_arch_exit(void)
7935{
7936#ifdef CONFIG_X86_64
7937        if (hypervisor_is_type(X86_HYPER_MS_HYPERV))
7938                clear_hv_tscchange_cb();
7939#endif
7940        kvm_lapic_exit();
7941        perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
7942
7943        if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
7944                cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
7945                                            CPUFREQ_TRANSITION_NOTIFIER);
7946        cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE);
7947#ifdef CONFIG_X86_64
7948        pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
7949#endif
7950        kvm_x86_ops.hardware_enable = NULL;
7951        kvm_mmu_module_exit();
7952        free_percpu(user_return_msrs);
7953        kmem_cache_destroy(x86_fpu_cache);
7954}
7955
7956int kvm_vcpu_halt(struct kvm_vcpu *vcpu)
7957{
7958        ++vcpu->stat.halt_exits;
7959        if (lapic_in_kernel(vcpu)) {
7960                vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
7961                return 1;
7962        } else {
7963                vcpu->run->exit_reason = KVM_EXIT_HLT;
7964                return 0;
7965        }
7966}
7967EXPORT_SYMBOL_GPL(kvm_vcpu_halt);
7968
7969int kvm_emulate_halt(struct kvm_vcpu *vcpu)
7970{
7971        int ret = kvm_skip_emulated_instruction(vcpu);
7972        /*
7973         * TODO: we might be squashing a GUESTDBG_SINGLESTEP-triggered
7974         * KVM_EXIT_DEBUG here.
7975         */
7976        return kvm_vcpu_halt(vcpu) && ret;
7977}
7978EXPORT_SYMBOL_GPL(kvm_emulate_halt);
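
An editorial note on the combination above: kvm_skip_emulated_instruction() returns 0 when it needs an exit to userspace (e.g. a single-step KVM_EXIT_DEBUG), and kvm_vcpu_halt() returns 0 when the HLT itself must be reported as KVM_EXIT_HLT. The '&&' therefore resumes the guest only when both agree; the TODO above notes that when kvm_vcpu_halt() sets KVM_EXIT_HLT it can overwrite a KVM_EXIT_DEBUG already prepared by the skip.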
7979
7980#ifdef CONFIG_X86_64
7981static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr,
7982                                unsigned long clock_type)
7983{
7984        struct kvm_clock_pairing clock_pairing;
7985        struct timespec64 ts;
7986        u64 cycle;
7987        int ret;
7988
7989        if (clock_type != KVM_CLOCK_PAIRING_WALLCLOCK)
7990                return -KVM_EOPNOTSUPP;
7991
7992        if (kvm_get_walltime_and_clockread(&ts, &cycle) == false)
7993                return -KVM_EOPNOTSUPP;
7994
7995        clock_pairing.sec = ts.tv_sec;
7996        clock_pairing.nsec = ts.tv_nsec;
7997        clock_pairing.tsc = kvm_read_l1_tsc(vcpu, cycle);
7998        clock_pairing.flags = 0;
7999        memset(&clock_pairing.pad, 0, sizeof(clock_pairing.pad));
8000
8001        ret = 0;
8002        if (kvm_write_guest(vcpu->kvm, paddr, &clock_pairing,
8003                            sizeof(struct kvm_clock_pairing)))
8004                ret = -KVM_EFAULT;
8005
8006        return ret;
8007}
8008#endif
8009
8010/*
8011 * kvm_pv_kick_cpu_op:  Kick a vcpu.
8012 *
8013 * @apicid - APIC ID of the vCPU to be kicked.
8014 */
8015static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
8016{
8017        struct kvm_lapic_irq lapic_irq;
8018
8019        lapic_irq.shorthand = APIC_DEST_NOSHORT;
8020        lapic_irq.dest_mode = APIC_DEST_PHYSICAL;
8021        lapic_irq.level = 0;
8022        lapic_irq.dest_id = apicid;
8023        lapic_irq.msi_redir_hint = false;
8024
8025        lapic_irq.delivery_mode = APIC_DM_REMRD;
8026        kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL);
8027}
8028
8029bool kvm_apicv_activated(struct kvm *kvm)
8030{
8031        return (READ_ONCE(kvm->arch.apicv_inhibit_reasons) == 0);
8032}
8033EXPORT_SYMBOL_GPL(kvm_apicv_activated);
8034
8035void kvm_apicv_init(struct kvm *kvm, bool enable)
8036{
8037        if (enable)
8038                clear_bit(APICV_INHIBIT_REASON_DISABLE,
8039                          &kvm->arch.apicv_inhibit_reasons);
8040        else
8041                set_bit(APICV_INHIBIT_REASON_DISABLE,
8042                        &kvm->arch.apicv_inhibit_reasons);
8043}
8044EXPORT_SYMBOL_GPL(kvm_apicv_init);
8045
8046static void kvm_sched_yield(struct kvm *kvm, unsigned long dest_id)
8047{
8048        struct kvm_vcpu *target = NULL;
8049        struct kvm_apic_map *map;
8050
8051        rcu_read_lock();
8052        map = rcu_dereference(kvm->arch.apic_map);
8053
8054        if (likely(map) && dest_id <= map->max_apic_id && map->phys_map[dest_id])
8055                target = map->phys_map[dest_id]->vcpu;
8056
8057        rcu_read_unlock();
8058
8059        if (target && READ_ONCE(target->ready))
8060                kvm_vcpu_yield_to(target);
8061}
8062
8063int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
8064{
8065        unsigned long nr, a0, a1, a2, a3, ret;
8066        int op_64_bit;
8067
8068        if (kvm_hv_hypercall_enabled(vcpu->kvm))
8069                return kvm_hv_hypercall(vcpu);
8070
8071        nr = kvm_rax_read(vcpu);
8072        a0 = kvm_rbx_read(vcpu);
8073        a1 = kvm_rcx_read(vcpu);
8074        a2 = kvm_rdx_read(vcpu);
8075        a3 = kvm_rsi_read(vcpu);
8076
8077        trace_kvm_hypercall(nr, a0, a1, a2, a3);
8078
8079        op_64_bit = is_64_bit_mode(vcpu);
8080        if (!op_64_bit) {
8081                nr &= 0xFFFFFFFF;
8082                a0 &= 0xFFFFFFFF;
8083                a1 &= 0xFFFFFFFF;
8084                a2 &= 0xFFFFFFFF;
8085                a3 &= 0xFFFFFFFF;
8086        }
8087
8088        if (kvm_x86_ops.get_cpl(vcpu) != 0) {
8089                ret = -KVM_EPERM;
8090                goto out;
8091        }
8092
8093        ret = -KVM_ENOSYS;
8094
8095        switch (nr) {
8096        case KVM_HC_VAPIC_POLL_IRQ:
8097                ret = 0;
8098                break;
8099        case KVM_HC_KICK_CPU:
8100                if (!guest_pv_has(vcpu, KVM_FEATURE_PV_UNHALT))
8101                        break;
8102
8103                kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1);
8104                kvm_sched_yield(vcpu->kvm, a1);
8105                ret = 0;
8106                break;
8107#ifdef CONFIG_X86_64
8108        case KVM_HC_CLOCK_PAIRING:
8109                ret = kvm_pv_clock_pairing(vcpu, a0, a1);
8110                break;
8111#endif
8112        case KVM_HC_SEND_IPI:
8113                if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SEND_IPI))
8114                        break;
8115
8116                ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit);
8117                break;
8118        case KVM_HC_SCHED_YIELD:
8119                if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SCHED_YIELD))
8120                        break;
8121
8122                kvm_sched_yield(vcpu->kvm, a0);
8123                ret = 0;
8124                break;
8125        default:
8126                ret = -KVM_ENOSYS;
8127                break;
8128        }
8129out:
8130        if (!op_64_bit)
8131                ret = (u32)ret;
8132        kvm_rax_write(vcpu, ret);
8133
8134        ++vcpu->stat.hypercalls;
8135        return kvm_skip_emulated_instruction(vcpu);
8136}
8137EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
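
For reference, a hedged guest-side sketch of the ABI handled above (this helper is illustrative and not part of this file): the hypercall number travels in RAX and up to four arguments in RBX, RCX, RDX and RSI, with the return value coming back in RAX. Real guests patch in VMCALL or VMMCALL depending on the vendor; plain VMCALL is used here for brevity.

static inline long example_kvm_hypercall2(unsigned int nr, unsigned long p1,
                                          unsigned long p2)
{
        long ret;

        asm volatile("vmcall"
                     : "=a"(ret)
                     : "a"(nr), "b"(p1), "c"(p2)
                     : "memory");
        return ret;
}

Under these assumptions, example_kvm_hypercall2(KVM_HC_KICK_CPU, 0, apicid) would land in the KVM_HC_KICK_CPU case handled above.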
8138
8139static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
8140{
8141        struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
8142        char instruction[3];
8143        unsigned long rip = kvm_rip_read(vcpu);
8144
8145        kvm_x86_ops.patch_hypercall(vcpu, instruction);
8146
8147        return emulator_write_emulated(ctxt, rip, instruction, 3,
8148                &ctxt->exception);
8149}
8150
8151static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
8152{
8153        return vcpu->run->request_interrupt_window &&
8154                likely(!pic_in_kernel(vcpu->kvm));
8155}
8156
8157static void post_kvm_run_save(struct kvm_vcpu *vcpu)
8158{
8159        struct kvm_run *kvm_run = vcpu->run;
8160
8161        kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
8162        kvm_run->flags = is_smm(vcpu) ? KVM_RUN_X86_SMM : 0;
8163        kvm_run->cr8 = kvm_get_cr8(vcpu);
8164        kvm_run->apic_base = kvm_get_apic_base(vcpu);
8165        kvm_run->ready_for_interrupt_injection =
8166                pic_in_kernel(vcpu->kvm) ||
8167                kvm_vcpu_ready_for_interrupt_injection(vcpu);
8168}
8169
8170static void update_cr8_intercept(struct kvm_vcpu *vcpu)
8171{
8172        int max_irr, tpr;
8173
8174        if (!kvm_x86_ops.update_cr8_intercept)
8175                return;
8176
8177        if (!lapic_in_kernel(vcpu))
8178                return;
8179
8180        if (vcpu->arch.apicv_active)
8181                return;
8182
8183        if (!vcpu->arch.apic->vapic_addr)
8184                max_irr = kvm_lapic_find_highest_irr(vcpu);
8185        else
8186                max_irr = -1;
8187
8188        if (max_irr != -1)
8189                max_irr >>= 4;
8190
8191        tpr = kvm_lapic_get_cr8(vcpu);
8192
8193        kvm_x86_ops.update_cr8_intercept(vcpu, tpr, max_irr);
8194}
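
A worked example of the shift above: if the highest pending, not-yet-serviced vector is 0x51, max_irr >> 4 yields priority class 5. The vendor hook typically programs that class as a TPR threshold (e.g. the VMX TPR_THRESHOLD field), so a guest write that lowers its task priority below class 5 causes an exit and lets KVM reconsider interrupt delivery; the exact mechanism is vendor-specific and not shown here.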
8195
8196static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
8197{
8198        int r;
8199        bool can_inject = true;
8200
8201        /* try to reinject previous events if any */
8202
8203        if (vcpu->arch.exception.injected) {
8204                kvm_x86_ops.queue_exception(vcpu);
8205                can_inject = false;
8206        }
8207        /*
8208         * Do not inject an NMI or interrupt if there is a pending
8209         * exception.  Exceptions and interrupts are recognized at
8210         * instruction boundaries, i.e. the start of an instruction.
8211         * Trap-like exceptions, e.g. #DB, have higher priority than
8212         * NMIs and interrupts, i.e. traps are recognized before an
8213         * NMI/interrupt that's pending on the same instruction.
8214         * Fault-like exceptions, e.g. #GP and #PF, are the lowest
8215         * priority, but are only generated (pended) during instruction
8216         * execution, i.e. a pending fault-like exception means the
8217         * fault occurred on the *previous* instruction and must be
8218         * serviced prior to recognizing any new events in order to
8219         * fully complete the previous instruction.
8220         */
8221        else if (!vcpu->arch.exception.pending) {
8222                if (vcpu->arch.nmi_injected) {
8223                        kvm_x86_ops.set_nmi(vcpu);
8224                        can_inject = false;
8225                } else if (vcpu->arch.interrupt.injected) {
8226                        kvm_x86_ops.set_irq(vcpu);
8227                        can_inject = false;
8228                }
8229        }
8230
8231        WARN_ON_ONCE(vcpu->arch.exception.injected &&
8232                     vcpu->arch.exception.pending);
8233
8234        /*
8235         * Call check_nested_events() even if we reinjected a previous event
8236         * in order for the caller to determine if it should request an
8237         * immediate exit from L2 to L1 due to pending L1 events that
8238         * require an exit from L2 to L1.
8239         */
8240        if (is_guest_mode(vcpu)) {
8241                r = kvm_x86_ops.nested_ops->check_events(vcpu);
8242                if (r < 0)
8243                        goto busy;
8244        }
8245
8246        /* try to inject new event if pending */
8247        if (vcpu->arch.exception.pending) {
8248                trace_kvm_inj_exception(vcpu->arch.exception.nr,
8249                                        vcpu->arch.exception.has_error_code,
8250                                        vcpu->arch.exception.error_code);
8251
8252                vcpu->arch.exception.pending = false;
8253                vcpu->arch.exception.injected = true;
8254
8255                if (exception_type(vcpu->arch.exception.nr) == EXCPT_FAULT)
8256                        __kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) |
8257                                             X86_EFLAGS_RF);
8258
8259                if (vcpu->arch.exception.nr == DB_VECTOR) {
8260                        kvm_deliver_exception_payload(vcpu);
8261                        if (vcpu->arch.dr7 & DR7_GD) {
8262                                vcpu->arch.dr7 &= ~DR7_GD;
8263                                kvm_update_dr7(vcpu);
8264                        }
8265                }
8266
8267                kvm_x86_ops.queue_exception(vcpu);
8268                can_inject = false;
8269        }
8270
8271        /*
8272         * Finally, inject interrupt events.  If an event cannot be injected
8273         * due to architectural conditions (e.g. IF=0) a window-open exit
8274         * will re-request KVM_REQ_EVENT.  Sometimes, however, an event is pending
8275         * and can architecturally be injected, but we cannot do it right now:
8276         * an interrupt could have arrived just now and we have to inject it
8277         * as a vmexit, or there could already be an event in the queue, which is
8278         * indicated by can_inject.  In that case we request an immediate exit
8279         * in order to make progress and get back here for another iteration.
8280         * The kvm_x86_ops hooks communicate this by returning -EBUSY.
8281         */
8282        if (vcpu->arch.smi_pending) {
8283                r = can_inject ? kvm_x86_ops.smi_allowed(vcpu, true) : -EBUSY;
8284                if (r < 0)
8285                        goto busy;
8286                if (r) {
8287                        vcpu->arch.smi_pending = false;
8288                        ++vcpu->arch.smi_count;
8289                        enter_smm(vcpu);
8290                        can_inject = false;
8291                } else
8292                        kvm_x86_ops.enable_smi_window(vcpu);
8293        }
8294
8295        if (vcpu->arch.nmi_pending) {
8296                r = can_inject ? kvm_x86_ops.nmi_allowed(vcpu, true) : -EBUSY;
8297                if (r < 0)
8298                        goto busy;
8299                if (r) {
8300                        --vcpu->arch.nmi_pending;
8301                        vcpu->arch.nmi_injected = true;
8302                        kvm_x86_ops.set_nmi(vcpu);
8303                        can_inject = false;
8304                        WARN_ON(kvm_x86_ops.nmi_allowed(vcpu, true) < 0);
8305                }
8306                if (vcpu->arch.nmi_pending)
8307                        kvm_x86_ops.enable_nmi_window(vcpu);
8308        }
8309
8310        if (kvm_cpu_has_injectable_intr(vcpu)) {
8311                r = can_inject ? kvm_x86_ops.interrupt_allowed(vcpu, true) : -EBUSY;
8312                if (r < 0)
8313                        goto busy;
8314                if (r) {
8315                        kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false);
8316                        kvm_x86_ops.set_irq(vcpu);
8317                        WARN_ON(kvm_x86_ops.interrupt_allowed(vcpu, true) < 0);
8318                }
8319                if (kvm_cpu_has_injectable_intr(vcpu))
8320                        kvm_x86_ops.enable_irq_window(vcpu);
8321        }
8322
8323        if (is_guest_mode(vcpu) &&
8324            kvm_x86_ops.nested_ops->hv_timer_pending &&
8325            kvm_x86_ops.nested_ops->hv_timer_pending(vcpu))
8326                *req_immediate_exit = true;
8327
8328        WARN_ON(vcpu->arch.exception.pending);
8329        return;
8330
8331busy:
8332        *req_immediate_exit = true;
8333        return;
8334}
8335
8336static void process_nmi(struct kvm_vcpu *vcpu)
8337{
8338        unsigned limit = 2;
8339
8340        /*
8341         * x86 is limited to one NMI running, and one NMI pending after it.
8342         * If an NMI is already in progress, limit further NMIs to just one.
8343         * Otherwise, allow two (and we'll inject the first one immediately).
8344         */
8345        if (kvm_x86_ops.get_nmi_mask(vcpu) || vcpu->arch.nmi_injected)
8346                limit = 1;
8347
8348        vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
8349        vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit);
8350        kvm_make_request(KVM_REQ_EVENT, vcpu);
8351}
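
Worked example of the clamping above: if an NMI is currently masked or already injected and three more arrive via nmi_queued, nmi_pending is clamped to 1, so exactly one additional NMI is delivered once the current one completes. With no NMI in flight the limit is 2, matching real hardware's "one running plus one pending" behaviour, and the first of the two is injected immediately.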
8352
8353static u32 enter_smm_get_segment_flags(struct kvm_segment *seg)
8354{
8355        u32 flags = 0;
8356        flags |= seg->g       << 23;
8357        flags |= seg->db      << 22;
8358        flags |= seg->l       << 21;
8359        flags |= seg->avl     << 20;
8360        flags |= seg->present << 15;
8361        flags |= seg->dpl     << 13;
8362        flags |= seg->s       << 12;
8363        flags |= seg->type    << 8;
8364        return flags;
8365}
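
Worked example of the encoding above: a flat 32-bit code segment with type 0xb, s=1, dpl=0, present=1, db=1 and g=1 packs to 0x00c09b00; the 64-bit save helper below stores flags >> 8 as a 16-bit value, i.e. 0xc09b, which is the familiar descriptor attribute word for such a segment.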
8366
8367static void enter_smm_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n)
8368{
8369        struct kvm_segment seg;
8370        int offset;
8371
8372        kvm_get_segment(vcpu, &seg, n);
8373        put_smstate(u32, buf, 0x7fa8 + n * 4, seg.selector);
8374
8375        if (n < 3)
8376                offset = 0x7f84 + n * 12;
8377        else
8378                offset = 0x7f2c + (n - 3) * 12;
8379
8380        put_smstate(u32, buf, offset + 8, seg.base);
8381        put_smstate(u32, buf, offset + 4, seg.limit);
8382        put_smstate(u32, buf, offset, enter_smm_get_segment_flags(&seg));
8383}
8384
8385#ifdef CONFIG_X86_64
8386static void enter_smm_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
8387{
8388        struct kvm_segment seg;
8389        int offset;
8390        u16 flags;
8391
8392        kvm_get_segment(vcpu, &seg, n);
8393        offset = 0x7e00 + n * 16;
8394
8395        flags = enter_smm_get_segment_flags(&seg) >> 8;
8396        put_smstate(u16, buf, offset, seg.selector);
8397        put_smstate(u16, buf, offset + 2, flags);
8398        put_smstate(u32, buf, offset + 4, seg.limit);
8399        put_smstate(u64, buf, offset + 8, seg.base);
8400}
8401#endif
8402
8403static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, char *buf)
8404{
8405        struct desc_ptr dt;
8406        struct kvm_segment seg;
8407        unsigned long val;
8408        int i;
8409
8410        put_smstate(u32, buf, 0x7ffc, kvm_read_cr0(vcpu));
8411        put_smstate(u32, buf, 0x7ff8, kvm_read_cr3(vcpu));
8412        put_smstate(u32, buf, 0x7ff4, kvm_get_rflags(vcpu));
8413        put_smstate(u32, buf, 0x7ff0, kvm_rip_read(vcpu));
8414
8415        for (i = 0; i < 8; i++)
8416                put_smstate(u32, buf, 0x7fd0 + i * 4, kvm_register_read(vcpu, i));
8417
8418        kvm_get_dr(vcpu, 6, &val);
8419        put_smstate(u32, buf, 0x7fcc, (u32)val);
8420        kvm_get_dr(vcpu, 7, &val);
8421        put_smstate(u32, buf, 0x7fc8, (u32)val);
8422
8423        kvm_get_segment(vcpu, &seg, VCPU_SREG_TR);
8424        put_smstate(u32, buf, 0x7fc4, seg.selector);
8425        put_smstate(u32, buf, 0x7f64, seg.base);
8426        put_smstate(u32, buf, 0x7f60, seg.limit);
8427        put_smstate(u32, buf, 0x7f5c, enter_smm_get_segment_flags(&seg));
8428
8429        kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR);
8430        put_smstate(u32, buf, 0x7fc0, seg.selector);
8431        put_smstate(u32, buf, 0x7f80, seg.base);
8432        put_smstate(u32, buf, 0x7f7c, seg.limit);
8433        put_smstate(u32, buf, 0x7f78, enter_smm_get_segment_flags(&seg));
8434
8435        kvm_x86_ops.get_gdt(vcpu, &dt);
8436        put_smstate(u32, buf, 0x7f74, dt.address);
8437        put_smstate(u32, buf, 0x7f70, dt.size);
8438
8439        kvm_x86_ops.get_idt(vcpu, &dt);
8440        put_smstate(u32, buf, 0x7f58, dt.address);
8441        put_smstate(u32, buf, 0x7f54, dt.size);
8442
8443        for (i = 0; i < 6; i++)
8444                enter_smm_save_seg_32(vcpu, buf, i);
8445
8446        put_smstate(u32, buf, 0x7f14, kvm_read_cr4(vcpu));
8447
8448        /* revision id */
8449        put_smstate(u32, buf, 0x7efc, 0x00020000);
8450        put_smstate(u32, buf, 0x7ef8, vcpu->arch.smbase);
8451}
8452
8453#ifdef CONFIG_X86_64
8454static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf)
8455{
8456        struct desc_ptr dt;
8457        struct kvm_segment seg;
8458        unsigned long val;
8459        int i;
8460
8461        for (i = 0; i < 16; i++)
8462                put_smstate(u64, buf, 0x7ff8 - i * 8, kvm_register_read(vcpu, i));
8463
8464        put_smstate(u64, buf, 0x7f78, kvm_rip_read(vcpu));
8465        put_smstate(u32, buf, 0x7f70, kvm_get_rflags(vcpu));
8466
8467        kvm_get_dr(vcpu, 6, &val);
8468        put_smstate(u64, buf, 0x7f68, val);
8469        kvm_get_dr(vcpu, 7, &val);
8470        put_smstate(u64, buf, 0x7f60, val);
8471
8472        put_smstate(u64, buf, 0x7f58, kvm_read_cr0(vcpu));
8473        put_smstate(u64, buf, 0x7f50, kvm_read_cr3(vcpu));
8474        put_smstate(u64, buf, 0x7f48, kvm_read_cr4(vcpu));
8475
8476        put_smstate(u32, buf, 0x7f00, vcpu->arch.smbase);
8477
8478        /* revision id */
8479        put_smstate(u32, buf, 0x7efc, 0x00020064);
8480
8481        put_smstate(u64, buf, 0x7ed0, vcpu->arch.efer);
8482
8483        kvm_get_segment(vcpu, &seg, VCPU_SREG_TR);
8484        put_smstate(u16, buf, 0x7e90, seg.selector);
8485        put_smstate(u16, buf, 0x7e92, enter_smm_get_segment_flags(&seg) >> 8);
8486        put_smstate(u32, buf, 0x7e94, seg.limit);
8487        put_smstate(u64, buf, 0x7e98, seg.base);
8488
8489        kvm_x86_ops.get_idt(vcpu, &dt);
8490        put_smstate(u32, buf, 0x7e84, dt.size);
8491        put_smstate(u64, buf, 0x7e88, dt.address);
8492
8493        kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR);
8494        put_smstate(u16, buf, 0x7e70, seg.selector);
8495        put_smstate(u16, buf, 0x7e72, enter_smm_get_segment_flags(&seg) >> 8);
8496        put_smstate(u32, buf, 0x7e74, seg.limit);
8497        put_smstate(u64, buf, 0x7e78, seg.base);
8498
8499        kvm_x86_ops.get_gdt(vcpu, &dt);
8500        put_smstate(u32, buf, 0x7e64, dt.size);
8501        put_smstate(u64, buf, 0x7e68, dt.address);
8502
8503        for (i = 0; i < 6; i++)
8504                enter_smm_save_seg_64(vcpu, buf, i);
8505}
8506#endif
8507
8508static void enter_smm(struct kvm_vcpu *vcpu)
8509{
8510        struct kvm_segment cs, ds;
8511        struct desc_ptr dt;
8512        char buf[512];
8513        u32 cr0;
8514
8515        trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true);
8516        memset(buf, 0, 512);
8517#ifdef CONFIG_X86_64
8518        if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
8519                enter_smm_save_state_64(vcpu, buf);
8520        else
8521#endif
8522                enter_smm_save_state_32(vcpu, buf);
8523
8524        /*
8525         * Give pre_enter_smm() a chance to make ISA-specific changes to the
8526         * vCPU state (e.g. leave guest mode) after we've saved the state into
8527         * the SMM state-save area.
8528         */
8529        kvm_x86_ops.pre_enter_smm(vcpu, buf);
8530
8531        vcpu->arch.hflags |= HF_SMM_MASK;
8532        kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf));
8533
8534        if (kvm_x86_ops.get_nmi_mask(vcpu))
8535                vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
8536        else
8537                kvm_x86_ops.set_nmi_mask(vcpu, true);
8538
8539        kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
8540        kvm_rip_write(vcpu, 0x8000);
8541
8542        cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG);
8543        kvm_x86_ops.set_cr0(vcpu, cr0);
8544        vcpu->arch.cr0 = cr0;
8545
8546        kvm_x86_ops.set_cr4(vcpu, 0);
8547
8548        /* Undocumented: IDT limit is set to zero on entry to SMM.  */
8549        dt.address = dt.size = 0;
8550        kvm_x86_ops.set_idt(vcpu, &dt);
8551
8552        __kvm_set_dr(vcpu, 7, DR7_FIXED_1);
8553
8554        cs.selector = (vcpu->arch.smbase >> 4) & 0xffff;
8555        cs.base = vcpu->arch.smbase;
8556
8557        ds.selector = 0;
8558        ds.base = 0;
8559
8560        cs.limit    = ds.limit = 0xffffffff;
8561        cs.type     = ds.type = 0x3;
8562        cs.dpl      = ds.dpl = 0;
8563        cs.db       = ds.db = 0;
8564        cs.s        = ds.s = 1;
8565        cs.l        = ds.l = 0;
8566        cs.g        = ds.g = 1;
8567        cs.avl      = ds.avl = 0;
8568        cs.present  = ds.present = 1;
8569        cs.unusable = ds.unusable = 0;
8570        cs.padding  = ds.padding = 0;
8571
8572        kvm_set_segment(vcpu, &cs, VCPU_SREG_CS);
8573        kvm_set_segment(vcpu, &ds, VCPU_SREG_DS);
8574        kvm_set_segment(vcpu, &ds, VCPU_SREG_ES);
8575        kvm_set_segment(vcpu, &ds, VCPU_SREG_FS);
8576        kvm_set_segment(vcpu, &ds, VCPU_SREG_GS);
8577        kvm_set_segment(vcpu, &ds, VCPU_SREG_SS);
8578
8579#ifdef CONFIG_X86_64
8580        if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
8581                kvm_x86_ops.set_efer(vcpu, 0);
8582#endif
8583
8584        kvm_update_cpuid_runtime(vcpu);
8585        kvm_mmu_reset_context(vcpu);
8586}
8587
8588static void process_smi(struct kvm_vcpu *vcpu)
8589{
8590        vcpu->arch.smi_pending = true;
8591        kvm_make_request(KVM_REQ_EVENT, vcpu);
8592}
8593
8594void kvm_make_scan_ioapic_request_mask(struct kvm *kvm,
8595                                       unsigned long *vcpu_bitmap)
8596{
8597        cpumask_var_t cpus;
8598
8599        zalloc_cpumask_var(&cpus, GFP_ATOMIC);
8600
8601        kvm_make_vcpus_request_mask(kvm, KVM_REQ_SCAN_IOAPIC,
8602                                    NULL, vcpu_bitmap, cpus);
8603
8604        free_cpumask_var(cpus);
8605}
8606
8607void kvm_make_scan_ioapic_request(struct kvm *kvm)
8608{
8609        kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
8610}
8611
8612void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
8613{
8614        if (!lapic_in_kernel(vcpu))
8615                return;
8616
8617        vcpu->arch.apicv_active = kvm_apicv_activated(vcpu->kvm);
8618        kvm_apic_update_apicv(vcpu);
8619        kvm_x86_ops.refresh_apicv_exec_ctrl(vcpu);
8620}
8621EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv);
8622
8623/*
8624 * NOTE: Do not hold any lock prior to calling this.
8625 *
8626 * In particular, kvm_request_apicv_update() expects kvm->srcu not to be
8627 * locked, because it calls __x86_set_memory_region() which does
8628 * synchronize_srcu(&kvm->srcu).
8629 */
8630void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
8631{
8632        struct kvm_vcpu *except;
8633        unsigned long old, new, expected;
8634
8635        if (!kvm_x86_ops.check_apicv_inhibit_reasons ||
8636            !kvm_x86_ops.check_apicv_inhibit_reasons(bit))
8637                return;
8638
8639        old = READ_ONCE(kvm->arch.apicv_inhibit_reasons);
8640        do {
8641                expected = new = old;
8642                if (activate)
8643                        __clear_bit(bit, &new);
8644                else
8645                        __set_bit(bit, &new);
8646                if (new == old)
8647                        break;
8648                old = cmpxchg(&kvm->arch.apicv_inhibit_reasons, expected, new);
8649        } while (old != expected);
8650
8651        if (!!old == !!new)
8652                return;
8653
8654        trace_kvm_apicv_update_request(activate, bit);
8655        if (kvm_x86_ops.pre_update_apicv_exec_ctrl)
8656                kvm_x86_ops.pre_update_apicv_exec_ctrl(kvm, activate);
8657
8658        /*
8659         * Send a request to update APICv for all other vCPUs, while
8660         * updating the calling vCPU immediately instead of waiting
8661         * for another #VMEXIT to handle the request.
8662         */
8663        except = kvm_get_running_vcpu();
8664        kvm_make_all_cpus_request_except(kvm, KVM_REQ_APICV_UPDATE,
8665                                         except);
8666        if (except)
8667                kvm_vcpu_update_apicv(except);
8668}
8669EXPORT_SYMBOL_GPL(kvm_request_apicv_update);
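
A hedged usage sketch (the wrapper below is hypothetical, though APICV_INHIBIT_REASON_HYPERV is a real inhibit reason bit): a subsystem that is incompatible with APICv deactivates it by setting its reason bit and re-activates it by clearing the bit once the incompatibility goes away.

static void example_toggle_apicv_for_synic(struct kvm *kvm, bool synic_in_use)
{
        /* activate == false sets the inhibit bit, true clears it. */
        kvm_request_apicv_update(kvm, !synic_in_use,
                                 APICV_INHIBIT_REASON_HYPERV);
}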
8670
8671static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
8672{
8673        if (!kvm_apic_present(vcpu))
8674                return;
8675
8676        bitmap_zero(vcpu->arch.ioapic_handled_vectors, 256);
8677
8678        if (irqchip_split(vcpu->kvm))
8679                kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
8680        else {
8681                if (vcpu->arch.apicv_active)
8682                        kvm_x86_ops.sync_pir_to_irr(vcpu);
8683                if (ioapic_in_kernel(vcpu->kvm))
8684                        kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
8685        }
8686
8687        if (is_guest_mode(vcpu))
8688                vcpu->arch.load_eoi_exitmap_pending = true;
8689        else
8690                kvm_make_request(KVM_REQ_LOAD_EOI_EXITMAP, vcpu);
8691}
8692
8693static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu)
8694{
8695        u64 eoi_exit_bitmap[4];
8696
8697        if (!kvm_apic_hw_enabled(vcpu->arch.apic))
8698                return;
8699
8700        bitmap_or((ulong *)eoi_exit_bitmap, vcpu->arch.ioapic_handled_vectors,
8701                  vcpu_to_synic(vcpu)->vec_bitmap, 256);
8702        kvm_x86_ops.load_eoi_exitmap(vcpu, eoi_exit_bitmap);
8703}
8704
8705void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
8706                                            unsigned long start, unsigned long end)
8707{
8708        unsigned long apic_address;
8709
8710        /*
8711         * The physical address of the APIC access page is stored in the VMCS.
8712         * Update it when it becomes invalid.
8713         */
8714        apic_address = gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
8715        if (start <= apic_address && apic_address < end)
8716                kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
8717}
8718
8719void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
8720{
8721        if (!lapic_in_kernel(vcpu))
8722                return;
8723
8724        if (!kvm_x86_ops.set_apic_access_page_addr)
8725                return;
8726
8727        kvm_x86_ops.set_apic_access_page_addr(vcpu);
8728}
8729
8730void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu)
8731{
8732        smp_send_reschedule(vcpu->cpu);
8733}
8734EXPORT_SYMBOL_GPL(__kvm_request_immediate_exit);
8735
8736/*
8737 * Returns 1 to let vcpu_run() continue the guest execution loop without
8738 * exiting to userspace.  Otherwise, the value will be returned to
8739 * userspace.
8740 */
8741static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
8742{
8743        int r;
8744        bool req_int_win =
8745                dm_request_for_irq_injection(vcpu) &&
8746                kvm_cpu_accept_dm_intr(vcpu);
8747        fastpath_t exit_fastpath;
8748
8749        bool req_immediate_exit = false;
8750
8751        if (kvm_request_pending(vcpu)) {
8752                if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
8753                        if (unlikely(!kvm_x86_ops.nested_ops->get_nested_state_pages(vcpu))) {
8754                                r = 0;
8755                                goto out;
8756                        }
8757                }
8758                if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
8759                        kvm_mmu_unload(vcpu);
8760                if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
8761                        __kvm_migrate_timers(vcpu);
8762                if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu))
8763                        kvm_gen_update_masterclock(vcpu->kvm);
8764                if (kvm_check_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu))
8765                        kvm_gen_kvmclock_update(vcpu);
8766                if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
8767                        r = kvm_guest_time_update(vcpu);
8768                        if (unlikely(r))
8769                                goto out;
8770                }
8771                if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
8772                        kvm_mmu_sync_roots(vcpu);
8773                if (kvm_check_request(KVM_REQ_LOAD_MMU_PGD, vcpu))
8774                        kvm_mmu_load_pgd(vcpu);
8775                if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) {
8776                        kvm_vcpu_flush_tlb_all(vcpu);
8777
8778                        /* Flushing all ASIDs flushes the current ASID... */
8779                        kvm_clear_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
8780                }
8781                if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
8782                        kvm_vcpu_flush_tlb_current(vcpu);
8783                if (kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu))
8784                        kvm_vcpu_flush_tlb_guest(vcpu);
8785
8786                if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
8787                        vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
8788                        r = 0;
8789                        goto out;
8790                }
8791                if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
8792                        vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
8793                        vcpu->mmio_needed = 0;
8794                        r = 0;
8795                        goto out;
8796                }
8797                if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) {
8798                        /* Page is swapped out. Do synthetic halt */
8799                        vcpu->arch.apf.halted = true;
8800                        r = 1;
8801                        goto out;
8802                }
8803                if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
8804                        record_steal_time(vcpu);
8805                if (kvm_check_request(KVM_REQ_SMI, vcpu))
8806                        process_smi(vcpu);
8807                if (kvm_check_request(KVM_REQ_NMI, vcpu))
8808                        process_nmi(vcpu);
8809                if (kvm_check_request(KVM_REQ_PMU, vcpu))
8810                        kvm_pmu_handle_event(vcpu);
8811                if (kvm_check_request(KVM_REQ_PMI, vcpu))
8812                        kvm_pmu_deliver_pmi(vcpu);
8813                if (kvm_check_request(KVM_REQ_IOAPIC_EOI_EXIT, vcpu)) {
8814                        BUG_ON(vcpu->arch.pending_ioapic_eoi > 255);
8815                        if (test_bit(vcpu->arch.pending_ioapic_eoi,
8816                                     vcpu->arch.ioapic_handled_vectors)) {
8817                                vcpu->run->exit_reason = KVM_EXIT_IOAPIC_EOI;
8818                                vcpu->run->eoi.vector =
8819                                                vcpu->arch.pending_ioapic_eoi;
8820                                r = 0;
8821                                goto out;
8822                        }
8823                }
8824                if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu))
8825                        vcpu_scan_ioapic(vcpu);
8826                if (kvm_check_request(KVM_REQ_LOAD_EOI_EXITMAP, vcpu))
8827                        vcpu_load_eoi_exitmap(vcpu);
8828                if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu))
8829                        kvm_vcpu_reload_apic_access_page(vcpu);
8830                if (kvm_check_request(KVM_REQ_HV_CRASH, vcpu)) {
8831                        vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
8832                        vcpu->run->system_event.type = KVM_SYSTEM_EVENT_CRASH;
8833                        r = 0;
8834                        goto out;
8835                }
8836                if (kvm_check_request(KVM_REQ_HV_RESET, vcpu)) {
8837                        vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
8838                        vcpu->run->system_event.type = KVM_SYSTEM_EVENT_RESET;
8839                        r = 0;
8840                        goto out;
8841                }
8842                if (kvm_check_request(KVM_REQ_HV_EXIT, vcpu)) {
8843                        vcpu->run->exit_reason = KVM_EXIT_HYPERV;
8844                        vcpu->run->hyperv = vcpu->arch.hyperv.exit;
8845                        r = 0;
8846                        goto out;
8847                }
8848
8849                /*
8850                 * KVM_REQ_HV_STIMER has to be processed after
8851                 * KVM_REQ_CLOCK_UPDATE, because Hyper-V SynIC timers
8852                 * depend on the guest clock being up-to-date
8853                 */
8854                if (kvm_check_request(KVM_REQ_HV_STIMER, vcpu))
8855                        kvm_hv_process_stimers(vcpu);
8856                if (kvm_check_request(KVM_REQ_APICV_UPDATE, vcpu))
8857                        kvm_vcpu_update_apicv(vcpu);
8858                if (kvm_check_request(KVM_REQ_APF_READY, vcpu))
8859                        kvm_check_async_pf_completion(vcpu);
8860                if (kvm_check_request(KVM_REQ_MSR_FILTER_CHANGED, vcpu))
8861                        kvm_x86_ops.msr_filter_changed(vcpu);
8862        }
8863
8864        if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
8865                ++vcpu->stat.req_event;
8866                kvm_apic_accept_events(vcpu);
8867                if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
8868                        r = 1;
8869                        goto out;
8870                }
8871
8872                inject_pending_event(vcpu, &req_immediate_exit);
8873                if (req_int_win)
8874                        kvm_x86_ops.enable_irq_window(vcpu);
8875
8876                if (kvm_lapic_enabled(vcpu)) {
8877                        update_cr8_intercept(vcpu);
8878                        kvm_lapic_sync_to_vapic(vcpu);
8879                }
8880        }
8881
8882        r = kvm_mmu_reload(vcpu);
8883        if (unlikely(r)) {
8884                goto cancel_injection;
8885        }
8886
8887        preempt_disable();
8888
8889        kvm_x86_ops.prepare_guest_switch(vcpu);
8890
8891        /*
8892         * Disable IRQs before setting IN_GUEST_MODE.  Posted interrupt
8893         * IPIs are then delayed until after guest entry, which ensures
8894         * that they result in virtual interrupt delivery.
8895         */
8896        local_irq_disable();
8897        vcpu->mode = IN_GUEST_MODE;
8898
8899        srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
8900
8901        /*
8902         * 1) We should set ->mode before checking ->requests.  Please see
8903         * the comment in kvm_vcpu_exiting_guest_mode().
8904         *
8905         * 2) For APICv, we should set ->mode before checking PID.ON. This
8906         * pairs with the memory barrier implicit in pi_test_and_set_on
8907         * (see vmx_deliver_posted_interrupt).
8908         *
8909         * 3) This also orders the write to mode from any reads to the page
8910         * tables done while the VCPU is running.  Please see the comment
8911         * in kvm_flush_remote_tlbs.
8912         */
8913        smp_mb__after_srcu_read_unlock();
8914
8915        /*
8916         * This handles the case where a posted interrupt was
8917         * notified with kvm_vcpu_kick.
8918         */
8919        if (kvm_lapic_enabled(vcpu) && vcpu->arch.apicv_active)
8920                kvm_x86_ops.sync_pir_to_irr(vcpu);
8921
8922        if (kvm_vcpu_exit_request(vcpu)) {
8923                vcpu->mode = OUTSIDE_GUEST_MODE;
8924                smp_wmb();
8925                local_irq_enable();
8926                preempt_enable();
8927                vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
8928                r = 1;
8929                goto cancel_injection;
8930        }
8931
8932        if (req_immediate_exit) {
8933                kvm_make_request(KVM_REQ_EVENT, vcpu);
8934                kvm_x86_ops.request_immediate_exit(vcpu);
8935        }
8936
8937        trace_kvm_entry(vcpu);
8938
8939        fpregs_assert_state_consistent();
8940        if (test_thread_flag(TIF_NEED_FPU_LOAD))
8941                switch_fpu_return();
8942
8943        if (unlikely(vcpu->arch.switch_db_regs)) {
8944                set_debugreg(0, 7);
8945                set_debugreg(vcpu->arch.eff_db[0], 0);
8946                set_debugreg(vcpu->arch.eff_db[1], 1);
8947                set_debugreg(vcpu->arch.eff_db[2], 2);
8948                set_debugreg(vcpu->arch.eff_db[3], 3);
8949                set_debugreg(vcpu->arch.dr6, 6);
8950                vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
8951        }
8952
8953        exit_fastpath = kvm_x86_ops.run(vcpu);
8954
8955        /*
8956         * Do this here before restoring debug registers on the host.  And
8957         * since we do this before handling the vmexit, a DR access vmexit
8958         * can (a) read the correct value of the debug registers, (b) set
8959         * KVM_DEBUGREG_WONT_EXIT again.
8960         */
8961        if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) {
8962                WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP);
8963                kvm_x86_ops.sync_dirty_debug_regs(vcpu);
8964                kvm_update_dr0123(vcpu);
8965                kvm_update_dr7(vcpu);
8966                vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
8967        }
8968
8969        /*
8970         * If the guest has used debug registers, at least dr7
8971         * will be disabled while returning to the host.
8972         * If we don't have active breakpoints in the host, we don't
8973         * care about the messed up debug address registers. But if
8974         * we have some of them active, restore the old state.
8975         */
8976        if (hw_breakpoint_active())
8977                hw_breakpoint_restore();
8978
8979        vcpu->arch.last_vmentry_cpu = vcpu->cpu;
8980        vcpu->arch.last_guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
8981
8982        vcpu->mode = OUTSIDE_GUEST_MODE;
8983        smp_wmb();
8984
8985        kvm_x86_ops.handle_exit_irqoff(vcpu);
8986
8987        /*
8988         * Consume any pending interrupts, including the possible source of
8989         * VM-Exit on SVM and any ticks that occur between VM-Exit and now.
8990         * An instruction is required after local_irq_enable() to fully unblock
8991         * interrupts on processors that implement an interrupt shadow; the
8992         * stat.exits increment will do nicely.
8993         */
8994        kvm_before_interrupt(vcpu);
8995        local_irq_enable();
8996        ++vcpu->stat.exits;
8997        local_irq_disable();
8998        kvm_after_interrupt(vcpu);
8999
9000        if (lapic_in_kernel(vcpu)) {
9001                s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta;
9002                if (delta != S64_MIN) {
9003                        trace_kvm_wait_lapic_expire(vcpu->vcpu_id, delta);
9004                        vcpu->arch.apic->lapic_timer.advance_expire_delta = S64_MIN;
9005                }
9006        }
9007
9008        local_irq_enable();
9009        preempt_enable();
9010
9011        vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
9012
9013        /*
9014         * Profile KVM exit RIPs:
9015         */
9016        if (unlikely(prof_on == KVM_PROFILING)) {
9017                unsigned long rip = kvm_rip_read(vcpu);
9018                profile_hit(KVM_PROFILING, (void *)rip);
9019        }
9020
9021        if (unlikely(vcpu->arch.tsc_always_catchup))
9022                kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
9023
9024        if (vcpu->arch.apic_attention)
9025                kvm_lapic_sync_from_vapic(vcpu);
9026
9027        r = kvm_x86_ops.handle_exit(vcpu, exit_fastpath);
9028        return r;
9029
9030cancel_injection:
9031        if (req_immediate_exit)
9032                kvm_make_request(KVM_REQ_EVENT, vcpu);
9033        kvm_x86_ops.cancel_injection(vcpu);
9034        if (unlikely(vcpu->arch.apic_attention))
9035                kvm_lapic_sync_from_vapic(vcpu);
9036out:
9037        return r;
9038}
9039
9040static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
9041{
9042        if (!kvm_arch_vcpu_runnable(vcpu) &&
9043            (!kvm_x86_ops.pre_block || kvm_x86_ops.pre_block(vcpu) == 0)) {
9044                srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
9045                kvm_vcpu_block(vcpu);
9046                vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
9047
9048                if (kvm_x86_ops.post_block)
9049                        kvm_x86_ops.post_block(vcpu);
9050
9051                if (!kvm_check_request(KVM_REQ_UNHALT, vcpu))
9052                        return 1;
9053        }
9054
9055        kvm_apic_accept_events(vcpu);
9056        switch (vcpu->arch.mp_state) {
9057        case KVM_MP_STATE_HALTED:
9058                vcpu->arch.pv.pv_unhalted = false;
9059                vcpu->arch.mp_state =
9060                        KVM_MP_STATE_RUNNABLE;
9061                fallthrough;
9062        case KVM_MP_STATE_RUNNABLE:
9063                vcpu->arch.apf.halted = false;
9064                break;
9065        case KVM_MP_STATE_INIT_RECEIVED:
9066                break;
9067        default:
9068                return -EINTR;
9069        }
9070        return 1;
9071}
9072
9073static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
9074{
9075        if (is_guest_mode(vcpu))
9076                kvm_x86_ops.nested_ops->check_events(vcpu);
9077
9078        return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
9079                !vcpu->arch.apf.halted);
9080}
9081
9082static int vcpu_run(struct kvm_vcpu *vcpu)
9083{
9084        int r;
9085        struct kvm *kvm = vcpu->kvm;
9086
9087        vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
9088        vcpu->arch.l1tf_flush_l1d = true;
9089
9090        for (;;) {
9091                if (kvm_vcpu_running(vcpu)) {
9092                        r = vcpu_enter_guest(vcpu);
9093                } else {
9094                        r = vcpu_block(kvm, vcpu);
9095                }
9096
9097                if (r <= 0)
9098                        break;
9099
9100                kvm_clear_request(KVM_REQ_PENDING_TIMER, vcpu);
9101                if (kvm_cpu_has_pending_timer(vcpu))
9102                        kvm_inject_pending_timer_irqs(vcpu);
9103
9104                if (dm_request_for_irq_injection(vcpu) &&
9105                        kvm_vcpu_ready_for_interrupt_injection(vcpu)) {
9106                        r = 0;
9107                        vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
9108                        ++vcpu->stat.request_irq_exits;
9109                        break;
9110                }
9111
9112                if (__xfer_to_guest_mode_work_pending()) {
9113                        srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
9114                        r = xfer_to_guest_mode_handle_work(vcpu);
9115                        if (r)
9116                                return r;
9117                        vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
9118                }
9119        }
9120
9121        srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
9122
9123        return r;
9124}
9125
9126static inline int complete_emulated_io(struct kvm_vcpu *vcpu)
9127{
9128        int r;
9129
9130        vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
9131        r = kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
9132        srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
9133        return r;
9134}
9135
9136static int complete_emulated_pio(struct kvm_vcpu *vcpu)
9137{
9138        BUG_ON(!vcpu->arch.pio.count);
9139
9140        return complete_emulated_io(vcpu);
9141}
9142
9143/*
9144 * Implements the following, as a state machine:
9145 *
9146 * read:
9147 *   for each fragment
9148 *     for each mmio piece in the fragment
9149 *       write gpa, len
9150 *       exit
9151 *       copy data
9152 *   execute insn
9153 *
9154 * write:
9155 *   for each fragment
9156 *     for each mmio piece in the fragment
9157 *       write gpa, len
9158 *       copy data
9159 *       exit
9160 */
9161static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
9162{
9163        struct kvm_run *run = vcpu->run;
9164        struct kvm_mmio_fragment *frag;
9165        unsigned len;
9166
9167        BUG_ON(!vcpu->mmio_needed);
9168
9169        /* Complete previous fragment */
9170        frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment];
9171        len = min(8u, frag->len);
9172        if (!vcpu->mmio_is_write)
9173                memcpy(frag->data, run->mmio.data, len);
9174
9175        if (frag->len <= 8) {
9176                /* Switch to the next fragment. */
9177                frag++;
9178                vcpu->mmio_cur_fragment++;
9179        } else {
9180                /* Go forward to the next mmio piece. */
9181                frag->data += len;
9182                frag->gpa += len;
9183                frag->len -= len;
9184        }
9185
9186        if (vcpu->mmio_cur_fragment >= vcpu->mmio_nr_fragments) {
9187                vcpu->mmio_needed = 0;
9188
9189                /* FIXME: return into emulator if single-stepping.  */
9190                if (vcpu->mmio_is_write)
9191                        return 1;
9192                vcpu->mmio_read_completed = 1;
9193                return complete_emulated_io(vcpu);
9194        }
9195
9196        run->exit_reason = KVM_EXIT_MMIO;
9197        run->mmio.phys_addr = frag->gpa;
9198        if (vcpu->mmio_is_write)
9199                memcpy(run->mmio.data, frag->data, min(8u, frag->len));
9200        run->mmio.len = min(8u, frag->len);
9201        run->mmio.is_write = vcpu->mmio_is_write;
9202        vcpu->arch.complete_userspace_io = complete_emulated_mmio;
9203        return 0;
9204}
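
/*
 * Illustrative userspace sketch (not part of this file): the other half of
 * the MMIO state machine described above complete_emulated_mmio().  Each
 * KVM_EXIT_MMIO exit carries one piece of at most 8 bytes; userspace services
 * it and calls KVM_RUN again, which resumes in complete_emulated_mmio().
 * Assumes "vcpu_fd" is an open vCPU file descriptor, "run" is the mmap()ed
 * kvm_run area, and device_read()/device_write() are hypothetical VMM device
 * model helpers.
 */
#include <linux/kvm.h>
#include <sys/ioctl.h>

extern void device_read(__u64 gpa, void *data, __u32 len);        /* hypothetical */
extern void device_write(__u64 gpa, const void *data, __u32 len); /* hypothetical */

static int run_until_non_mmio(int vcpu_fd, struct kvm_run *run)
{
        for (;;) {
                if (ioctl(vcpu_fd, KVM_RUN, 0) < 0)
                        return -1;

                if (run->exit_reason != KVM_EXIT_MMIO)
                        return 0;               /* some other exit; caller decides */

                if (run->mmio.is_write)         /* guest stored to MMIO: consume data */
                        device_write(run->mmio.phys_addr, run->mmio.data,
                                     run->mmio.len);
                else                            /* guest loaded from MMIO: fill data */
                        device_read(run->mmio.phys_addr, run->mmio.data,
                                    run->mmio.len);
        }
}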
9205
9206static void kvm_save_current_fpu(struct fpu *fpu)
9207{
9208        /*
9209         * If the target FPU state is not resident in the CPU registers, just
9210         * memcpy() from current, else save CPU state directly to the target.
9211         */
9212        if (test_thread_flag(TIF_NEED_FPU_LOAD))
9213                memcpy(&fpu->state, &current->thread.fpu.state,
9214                       fpu_kernel_xstate_size);
9215        else
9216                copy_fpregs_to_fpstate(fpu);
9217}
9218
9219/* Swap (qemu) user FPU context for the guest FPU context. */
9220static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
9221{
9222        fpregs_lock();
9223
9224        kvm_save_current_fpu(vcpu->arch.user_fpu);
9225
9226        /* PKRU is separately restored in kvm_x86_ops.run.  */
9227        __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state,
9228                                ~XFEATURE_MASK_PKRU);
9229
9230        fpregs_mark_activate();
9231        fpregs_unlock();
9232
9233        trace_kvm_fpu(1);
9234}
9235
9236/* When vcpu_run ends, restore user space FPU context. */
9237static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
9238{
9239        fpregs_lock();
9240
9241        kvm_save_current_fpu(vcpu->arch.guest_fpu);
9242
9243        copy_kernel_to_fpregs(&vcpu->arch.user_fpu->state);
9244
9245        fpregs_mark_activate();
9246        fpregs_unlock();
9247
9248        ++vcpu->stat.fpu_reload;
9249        trace_kvm_fpu(0);
9250}
9251
9252int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
9253{
9254        struct kvm_run *kvm_run = vcpu->run;
9255        int r;
9256
9257        vcpu_load(vcpu);
9258        kvm_sigset_activate(vcpu);
9259        kvm_load_guest_fpu(vcpu);
9260
9261        if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
9262                if (kvm_run->immediate_exit) {
9263                        r = -EINTR;
9264                        goto out;
9265                }
9266                kvm_vcpu_block(vcpu);
9267                kvm_apic_accept_events(vcpu);
9268                kvm_clear_request(KVM_REQ_UNHALT, vcpu);
9269                r = -EAGAIN;
9270                if (signal_pending(current)) {
9271                        r = -EINTR;
9272                        kvm_run->exit_reason = KVM_EXIT_INTR;
9273                        ++vcpu->stat.signal_exits;
9274                }
9275                goto out;
9276        }
9277
9278        if (kvm_run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) {
9279                r = -EINVAL;
9280                goto out;
9281        }
9282
9283        if (kvm_run->kvm_dirty_regs) {
9284                r = sync_regs(vcpu);
9285                if (r != 0)
9286                        goto out;
9287        }
9288
9289        /* re-sync apic's tpr */
9290        if (!lapic_in_kernel(vcpu)) {
9291                if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
9292                        r = -EINVAL;
9293                        goto out;
9294                }
9295        }
9296
9297        if (unlikely(vcpu->arch.complete_userspace_io)) {
9298                int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io;
9299                vcpu->arch.complete_userspace_io = NULL;
9300                r = cui(vcpu);
9301                if (r <= 0)
9302                        goto out;
9303        } else
9304                WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed);
9305
9306        if (kvm_run->immediate_exit)
9307                r = -EINTR;
9308        else
9309                r = vcpu_run(vcpu);
9310
9311out:
9312        kvm_put_guest_fpu(vcpu);
9313        if (kvm_run->kvm_valid_regs)
9314                store_regs(vcpu);
9315        post_kvm_run_save(vcpu);
9316        kvm_sigset_deactivate(vcpu);
9317
9318        vcpu_put(vcpu);
9319        return r;
9320}
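
/*
 * Illustrative userspace sketch (not part of this file): a minimal loop around
 * the KVM_RUN ioctl served by kvm_arch_vcpu_ioctl_run() above.  Assumes
 * "sys_fd" is the open /dev/kvm file descriptor and "vcpu_fd" was returned by
 * KVM_CREATE_VCPU.
 */
#include <linux/kvm.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/mman.h>

static int drive_vcpu(int sys_fd, int vcpu_fd)
{
        int map_size = ioctl(sys_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
        struct kvm_run *run;

        if (map_size < 0)
                return -1;

        /* The shared kvm_run area (vcpu->run on the kernel side). */
        run = mmap(NULL, map_size, PROT_READ | PROT_WRITE, MAP_SHARED,
                   vcpu_fd, 0);
        if (run == MAP_FAILED)
                return -1;

        for (;;) {
                if (ioctl(vcpu_fd, KVM_RUN, 0) < 0)
                        return -1;      /* e.g. -EINTR when run->immediate_exit is set */

                switch (run->exit_reason) {
                case KVM_EXIT_HLT:
                        return 0;
                case KVM_EXIT_IO:
                case KVM_EXIT_MMIO:
                        /* hand off to the VMM's device models, then re-enter */
                        break;
                default:
                        fprintf(stderr, "unhandled exit %u\n", run->exit_reason);
                        return -1;
                }
        }
}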
9321
9322static void __get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
9323{
9324        if (vcpu->arch.emulate_regs_need_sync_to_vcpu) {
9325                /*
9326                 * We are here if userspace calls get_regs() in the middle of
9327                 * instruction emulation. Register state needs to be copied
9328                 * back from the emulation context to the vcpu. Userspace shouldn't
9329                 * usually do that, but some badly designed PV devices (the vmware
9330                 * backdoor interface) need this to work.
9331                 */
9332                emulator_writeback_register_cache(vcpu->arch.emulate_ctxt);
9333                vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
9334        }
9335        regs->rax = kvm_rax_read(vcpu);
9336        regs->rbx = kvm_rbx_read(vcpu);
9337        regs->rcx = kvm_rcx_read(vcpu);
9338        regs->rdx = kvm_rdx_read(vcpu);
9339        regs->rsi = kvm_rsi_read(vcpu);
9340        regs->rdi = kvm_rdi_read(vcpu);
9341        regs->rsp = kvm_rsp_read(vcpu);
9342        regs->rbp = kvm_rbp_read(vcpu);
9343#ifdef CONFIG_X86_64
9344        regs->r8 = kvm_r8_read(vcpu);
9345        regs->r9 = kvm_r9_read(vcpu);
9346        regs->r10 = kvm_r10_read(vcpu);
9347        regs->r11 = kvm_r11_read(vcpu);
9348        regs->r12 = kvm_r12_read(vcpu);
9349        regs->r13 = kvm_r13_read(vcpu);
9350        regs->r14 = kvm_r14_read(vcpu);
9351        regs->r15 = kvm_r15_read(vcpu);
9352#endif
9353
9354        regs->rip = kvm_rip_read(vcpu);
9355        regs->rflags = kvm_get_rflags(vcpu);
9356}
9357
9358int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
9359{
9360        vcpu_load(vcpu);
9361        __get_regs(vcpu, regs);
9362        vcpu_put(vcpu);
9363        return 0;
9364}
9365
9366static void __set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
9367{
9368        vcpu->arch.emulate_regs_need_sync_from_vcpu = true;
9369        vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
9370
9371        kvm_rax_write(vcpu, regs->rax);
9372        kvm_rbx_write(vcpu, regs->rbx);
9373        kvm_rcx_write(vcpu, regs->rcx);
9374        kvm_rdx_write(vcpu, regs->rdx);
9375        kvm_rsi_write(vcpu, regs->rsi);
9376        kvm_rdi_write(vcpu, regs->rdi);
9377        kvm_rsp_write(vcpu, regs->rsp);
9378        kvm_rbp_write(vcpu, regs->rbp);
9379#ifdef CONFIG_X86_64
9380        kvm_r8_write(vcpu, regs->r8);
9381        kvm_r9_write(vcpu, regs->r9);
9382        kvm_r10_write(vcpu, regs->r10);
9383        kvm_r11_write(vcpu, regs->r11);
9384        kvm_r12_write(vcpu, regs->r12);
9385        kvm_r13_write(vcpu, regs->r13);
9386        kvm_r14_write(vcpu, regs->r14);
9387        kvm_r15_write(vcpu, regs->r15);
9388#endif
9389
9390        kvm_rip_write(vcpu, regs->rip);
9391        kvm_set_rflags(vcpu, regs->rflags | X86_EFLAGS_FIXED);
9392
9393        vcpu->arch.exception.pending = false;
9394
9395        kvm_make_request(KVM_REQ_EVENT, vcpu);
9396}
9397
9398int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
9399{
9400        vcpu_load(vcpu);
9401        __set_regs(vcpu, regs);
9402        vcpu_put(vcpu);
9403        return 0;
9404}
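
/*
 * Illustrative userspace sketch (not part of this file): the userspace view of
 * the two ioctls above.  KVM_GET_REGS fills struct kvm_regs via __get_regs(),
 * and KVM_SET_REGS pushes it back through __set_regs().  Assumes "vcpu_fd" is
 * an open vCPU file descriptor.
 */
#include <linux/kvm.h>
#include <sys/ioctl.h>

static int set_entry_point(int vcpu_fd, __u64 rip)
{
        struct kvm_regs regs;

        if (ioctl(vcpu_fd, KVM_GET_REGS, &regs) < 0)
                return -1;

        regs.rip = rip;         /* where the guest will start executing */
        regs.rflags = 0x2;      /* __set_regs() keeps X86_EFLAGS_FIXED set */

        return ioctl(vcpu_fd, KVM_SET_REGS, &regs);
}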
9405
9406void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
9407{
9408        struct kvm_segment cs;
9409
9410        kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
9411        *db = cs.db;
9412        *l = cs.l;
9413}
9414EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
9415
9416static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
9417{
9418        struct desc_ptr dt;
9419
9420        kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
9421        kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
9422        kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
9423        kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
9424        kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
9425        kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
9426
9427        kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
9428        kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
9429
9430        kvm_x86_ops.get_idt(vcpu, &dt);
9431        sregs->idt.limit = dt.size;
9432        sregs->idt.base = dt.address;
9433        kvm_x86_ops.get_gdt(vcpu, &dt);
9434        sregs->gdt.limit = dt.size;
9435        sregs->gdt.base = dt.address;
9436
9437        sregs->cr0 = kvm_read_cr0(vcpu);
9438        sregs->cr2 = vcpu->arch.cr2;
9439        sregs->cr3 = kvm_read_cr3(vcpu);
9440        sregs->cr4 = kvm_read_cr4(vcpu);
9441        sregs->cr8 = kvm_get_cr8(vcpu);
9442        sregs->efer = vcpu->arch.efer;
9443        sregs->apic_base = kvm_get_apic_base(vcpu);
9444
9445        memset(sregs->interrupt_bitmap, 0, sizeof(sregs->interrupt_bitmap));
9446
9447        if (vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft)
9448                set_bit(vcpu->arch.interrupt.nr,
9449                        (unsigned long *)sregs->interrupt_bitmap);
9450}
9451
9452int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
9453                                  struct kvm_sregs *sregs)
9454{
9455        vcpu_load(vcpu);
9456        __get_sregs(vcpu, sregs);
9457        vcpu_put(vcpu);
9458        return 0;
9459}
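
/*
 * Illustrative userspace sketch (not part of this file): reading the special
 * registers gathered by __get_sregs() above.  Assumes "vcpu_fd" is an open
 * vCPU file descriptor.
 */
#include <linux/kvm.h>
#include <stdio.h>
#include <sys/ioctl.h>

static void dump_control_regs(int vcpu_fd)
{
        struct kvm_sregs sregs;

        if (ioctl(vcpu_fd, KVM_GET_SREGS, &sregs) < 0)
                return;

        printf("cr0=%llx cr3=%llx cr4=%llx efer=%llx cs.base=%llx\n",
               sregs.cr0, sregs.cr3, sregs.cr4, sregs.efer, sregs.cs.base);
}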
9460
9461int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
9462                                    struct kvm_mp_state *mp_state)
9463{
9464        vcpu_load(vcpu);
9465        if (kvm_mpx_supported())
9466                kvm_load_guest_fpu(vcpu);
9467
9468        kvm_apic_accept_events(vcpu);
9469        if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED &&
9470                                        vcpu->arch.pv.pv_unhalted)
9471                mp_state->mp_state = KVM_MP_STATE_RUNNABLE;
9472        else
9473                mp_state->mp_state = vcpu->arch.mp_state;
9474
9475        if (kvm_mpx_supported())
9476                kvm_put_guest_fpu(vcpu);
9477        vcpu_put(vcpu);
9478        return 0;
9479}
9480
9481int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
9482                                    struct kvm_mp_state *mp_state)
9483{
9484        int ret = -EINVAL;
9485
9486        vcpu_load(vcpu);
9487
9488        if (!lapic_in_kernel(vcpu) &&
9489            mp_state->mp_state != KVM_MP_STATE_RUNNABLE)
9490                goto out;
9491
9492        /*
9493         * KVM_MP_STATE_INIT_RECEIVED means the processor is in
9494         * INIT state; latched init should be reported using
9495         * KVM_SET_VCPU_EVENTS, so reject it here.
9496         */
9497        if ((kvm_vcpu_latch_init(vcpu) || vcpu->arch.smi_pending) &&
9498            (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED ||
9499             mp_state->mp_state == KVM_MP_STATE_INIT_RECEIVED))
9500                goto out;
9501
9502        if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) {
9503                vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
9504                set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events);
9505        } else
9506                vcpu->arch.mp_state = mp_state->mp_state;
9507        kvm_make_request(KVM_REQ_EVENT, vcpu);
9508
9509        ret = 0;
9510out:
9511        vcpu_put(vcpu);
9512        return ret;
9513}
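
/*
 * Illustrative userspace sketch (not part of this file): the mp_state ioctls
 * handled above.  Assumes "vcpu_fd" is an open vCPU file descriptor.
 */
#include <linux/kvm.h>
#include <sys/ioctl.h>

static int unhalt_vcpu(int vcpu_fd)
{
        struct kvm_mp_state mp;

        if (ioctl(vcpu_fd, KVM_GET_MP_STATE, &mp) < 0)
                return -1;

        if (mp.mp_state != KVM_MP_STATE_HALTED)
                return 0;

        mp.mp_state = KVM_MP_STATE_RUNNABLE;
        return ioctl(vcpu_fd, KVM_SET_MP_STATE, &mp);
}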
9514
9515int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
9516                    int reason, bool has_error_code, u32 error_code)
9517{
9518        struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
9519        int ret;
9520
9521        init_emulate_ctxt(vcpu);
9522
9523        ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason,
9524                                   has_error_code, error_code);
9525        if (ret) {
9526                vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
9527                vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
9528                vcpu->run->internal.ndata = 0;
9529                return 0;
9530        }
9531
9532        kvm_rip_write(vcpu, ctxt->eip);
9533        kvm_set_rflags(vcpu, ctxt->eflags);
9534        return 1;
9535}
9536EXPORT_SYMBOL_GPL(kvm_task_switch);
9537
9538static int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
9539{
9540        if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG)) {
9541                /*
9542                 * When EFER.LME and CR0.PG are set, the processor is in
9543                 * 64-bit mode (though maybe in a 32-bit code segment).
9544                 * CR4.PAE and EFER.LMA must be set.
9545                 */
9546                if (!(sregs->cr4 & X86_CR4_PAE)
9547                    || !(sregs->efer & EFER_LMA))
9548                        return -EINVAL;
9549        } else {
9550                /*
9551                 * Not in 64-bit mode: EFER.LMA is clear and the code
9552                 * segment cannot be 64-bit.
9553                 */
9554                if (sregs->efer & EFER_LMA || sregs->cs.l)
9555                        return -EINVAL;
9556        }
9557
9558        return kvm_valid_cr4(vcpu, sregs->cr4);
9559}
9560
9561static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
9562{
9563        struct msr_data apic_base_msr;
9564        int mmu_reset_needed = 0;
9565        int cpuid_update_needed = 0;
9566        int pending_vec, max_bits, idx;
9567        struct desc_ptr dt;
9568        int ret = -EINVAL;
9569
9570        if (kvm_valid_sregs(vcpu, sregs))
9571                goto out;
9572
9573        apic_base_msr.data = sregs->apic_base;
9574        apic_base_msr.host_initiated = true;
9575        if (kvm_set_apic_base(vcpu, &apic_base_msr))
9576                goto out;
9577
9578        dt.size = sregs->idt.limit;
9579        dt.address = sregs->idt.base;
9580        kvm_x86_ops.set_idt(vcpu, &dt);
9581        dt.size = sregs->gdt.limit;
9582        dt.address = sregs->gdt.base;
9583        kvm_x86_ops.set_gdt(vcpu, &dt);
9584
9585        vcpu->arch.cr2 = sregs->cr2;
9586        mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
9587        vcpu->arch.cr3 = sregs->cr3;
9588        kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
9589
9590        kvm_set_cr8(vcpu, sregs->cr8);
9591
9592        mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
9593        kvm_x86_ops.set_efer(vcpu, sregs->efer);
9594
9595        mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
9596        kvm_x86_ops.set_cr0(vcpu, sregs->cr0);
9597        vcpu->arch.cr0 = sregs->cr0;
9598
9599        mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
9600        cpuid_update_needed |= ((kvm_read_cr4(vcpu) ^ sregs->cr4) &
9601                                (X86_CR4_OSXSAVE | X86_CR4_PKE));
9602        kvm_x86_ops.set_cr4(vcpu, sregs->cr4);
9603        if (cpuid_update_needed)
9604                kvm_update_cpuid_runtime(vcpu);
9605
9606        idx = srcu_read_lock(&vcpu->kvm->srcu);
9607        if (is_pae_paging(vcpu)) {
9608                load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
9609                mmu_reset_needed = 1;
9610        }
9611        srcu_read_unlock(&vcpu->kvm->srcu, idx);
9612
9613        if (mmu_reset_needed)
9614                kvm_mmu_reset_context(vcpu);
9615
9616        max_bits = KVM_NR_INTERRUPTS;
9617        pending_vec = find_first_bit(
9618                (const unsigned long *)sregs->interrupt_bitmap, max_bits);
9619        if (pending_vec < max_bits) {
9620                kvm_queue_interrupt(vcpu, pending_vec, false);
9621                pr_debug("Set back pending irq %d\n", pending_vec);
9622        }
9623
9624        kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
9625        kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
9626        kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
9627        kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
9628        kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
9629        kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
9630
9631        kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
9632        kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
9633
9634        update_cr8_intercept(vcpu);
9635
9636        /* Older userspace won't unhalt the vcpu on reset. */
9637        if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 &&
9638            sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
9639            !is_protmode(vcpu))
9640                vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
9641
9642        kvm_make_request(KVM_REQ_EVENT, vcpu);
9643
9644        ret = 0;
9645out:
9646        return ret;
9647}
9648
9649int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
9650                                  struct kvm_sregs *sregs)
9651{
9652        int ret;
9653
9654        vcpu_load(vcpu);
9655        ret = __set_sregs(vcpu, sregs);
9656        vcpu_put(vcpu);
9657        return ret;
9658}
9659
9660int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
9661                                        struct kvm_guest_debug *dbg)
9662{
9663        unsigned long rflags;
9664        int i, r;
9665
9666        vcpu_load(vcpu);
9667
9668        if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
9669                r = -EBUSY;
9670                if (vcpu->arch.exception.pending)
9671                        goto out;
9672                if (dbg->control & KVM_GUESTDBG_INJECT_DB)
9673                        kvm_queue_exception(vcpu, DB_VECTOR);
9674                else
9675                        kvm_queue_exception(vcpu, BP_VECTOR);
9676        }
9677
9678        /*
9679         * Read rflags as long as potentially injected trace flags are still
9680         * filtered out.
9681         */
9682        rflags = kvm_get_rflags(vcpu);
9683
9684        vcpu->guest_debug = dbg->control;
9685        if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE))
9686                vcpu->guest_debug = 0;
9687
9688        if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
9689                for (i = 0; i < KVM_NR_DB_REGS; ++i)
9690                        vcpu->arch.eff_db[i] = dbg->arch.debugreg[i];
9691                vcpu->arch.guest_debug_dr7 = dbg->arch.debugreg[7];
9692        } else {
9693                for (i = 0; i < KVM_NR_DB_REGS; i++)
9694                        vcpu->arch.eff_db[i] = vcpu->arch.db[i];
9695        }
9696        kvm_update_dr7(vcpu);
9697
9698        if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
9699                vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) +
9700                        get_segment_base(vcpu, VCPU_SREG_CS);
9701
9702        /*
9703         * Trigger an rflags update that will inject or remove the trace
9704         * flags.
9705         */
9706        kvm_set_rflags(vcpu, rflags);
9707
9708        kvm_x86_ops.update_exception_bitmap(vcpu);
9709
9710        r = 0;
9711
9712out:
9713        vcpu_put(vcpu);
9714        return r;
9715}
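
/*
 * Illustrative userspace sketch (not part of this file): turning on
 * single-step through KVM_SET_GUEST_DEBUG, which lands in the function above.
 * Assumes "vcpu_fd" is an open vCPU file descriptor.
 */
#include <linux/kvm.h>
#include <string.h>
#include <sys/ioctl.h>

static int enable_single_step(int vcpu_fd)
{
        struct kvm_guest_debug dbg;

        memset(&dbg, 0, sizeof(dbg));
        dbg.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;

        /* Subsequent KVM_RUN calls exit with KVM_EXIT_DEBUG after one guest
         * instruction until the flags are cleared again. */
        return ioctl(vcpu_fd, KVM_SET_GUEST_DEBUG, &dbg);
}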
9716
9717/*
9718 * Translate a guest virtual address to a guest physical address.
9719 */
9720int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
9721                                    struct kvm_translation *tr)
9722{
9723        unsigned long vaddr = tr->linear_address;
9724        gpa_t gpa;
9725        int idx;
9726
9727        vcpu_load(vcpu);
9728
9729        idx = srcu_read_lock(&vcpu->kvm->srcu);
9730        gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL);
9731        srcu_read_unlock(&vcpu->kvm->srcu, idx);
9732        tr->physical_address = gpa;
9733        tr->valid = gpa != UNMAPPED_GVA;
9734        tr->writeable = 1;
9735        tr->usermode = 0;
9736
9737        vcpu_put(vcpu);
9738        return 0;
9739}
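
/*
 * Illustrative userspace sketch (not part of this file): exercising the
 * KVM_TRANSLATE ioctl implemented above.  Assumes "vcpu_fd" is an open vCPU
 * file descriptor and "gva" is a guest-virtual address of interest.
 */
#include <linux/kvm.h>
#include <stdio.h>
#include <sys/ioctl.h>

static void translate_gva(int vcpu_fd, __u64 gva)
{
        struct kvm_translation tr = { .linear_address = gva };

        if (ioctl(vcpu_fd, KVM_TRANSLATE, &tr) < 0)
                return;

        if (tr.valid)
                printf("gva 0x%llx -> gpa 0x%llx\n", gva, tr.physical_address);
        else
                printf("gva 0x%llx is unmapped\n", gva);
}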
9740
9741int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
9742{
9743        struct fxregs_state *fxsave;
9744
9745        vcpu_load(vcpu);
9746
9747        fxsave = &vcpu->arch.guest_fpu->state.fxsave;
9748        memcpy(fpu->fpr, fxsave->st_space, 128);
9749        fpu->fcw = fxsave->cwd;
9750        fpu->fsw = fxsave->swd;
9751        fpu->ftwx = fxsave->twd;
9752        fpu->last_opcode = fxsave->fop;
9753        fpu->last_ip = fxsave->rip;
9754        fpu->last_dp = fxsave->rdp;
9755        memcpy(fpu->xmm, fxsave->xmm_space, sizeof(fxsave->xmm_space));
9756
9757        vcpu_put(vcpu);
9758        return 0;
9759}
9760
9761int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
9762{
9763        struct fxregs_state *fxsave;
9764
9765        vcpu_load(vcpu);
9766
9767        fxsave = &vcpu->arch.guest_fpu->state.fxsave;
9768
9769        memcpy(fxsave->st_space, fpu->fpr, 128);
9770        fxsave->cwd = fpu->fcw;
9771        fxsave->swd = fpu->fsw;
9772        fxsave->twd = fpu->ftwx;
9773        fxsave->fop = fpu->last_opcode;
9774        fxsave->rip = fpu->last_ip;
9775        fxsave->rdp = fpu->last_dp;
9776        memcpy(fxsave->xmm_space, fpu->xmm, sizeof(fxsave->xmm_space));
9777
9778        vcpu_put(vcpu);
9779        return 0;
9780}
9781
9782static void store_regs(struct kvm_vcpu *vcpu)
9783{
9784        BUILD_BUG_ON(sizeof(struct kvm_sync_regs) > SYNC_REGS_SIZE_BYTES);
9785
9786        if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_REGS)
9787                __get_regs(vcpu, &vcpu->run->s.regs.regs);
9788
9789        if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_SREGS)
9790                __get_sregs(vcpu, &vcpu->run->s.regs.sregs);
9791
9792        if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_EVENTS)
9793                kvm_vcpu_ioctl_x86_get_vcpu_events(
9794                                vcpu, &vcpu->run->s.regs.events);
9795}
9796
9797static int sync_regs(struct kvm_vcpu *vcpu)
9798{
9799        if (vcpu->run->kvm_dirty_regs & ~KVM_SYNC_X86_VALID_FIELDS)
9800                return -EINVAL;
9801
9802        if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_REGS) {
9803                __set_regs(vcpu, &vcpu->run->s.regs.regs);
9804                vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_REGS;
9805        }
9806        if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_SREGS) {
9807                if (__set_sregs(vcpu, &vcpu->run->s.regs.sregs))
9808                        return -EINVAL;
9809                vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_SREGS;
9810        }
9811        if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_EVENTS) {
9812                if (kvm_vcpu_ioctl_x86_set_vcpu_events(
9813                                vcpu, &vcpu->run->s.regs.events))
9814                        return -EINVAL;
9815                vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_EVENTS;
9816        }
9817
9818        return 0;
9819}
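
/*
 * Illustrative userspace sketch (not part of this file): using the synced-regs
 * protocol that store_regs()/sync_regs() implement.  With KVM_CAP_SYNC_REGS,
 * registers travel through the mmap()ed kvm_run area instead of separate
 * KVM_GET_REGS/KVM_SET_REGS calls.  Assumes "vcpu_fd" is an open vCPU file
 * descriptor and "run" is the mmap()ed kvm_run structure.
 */
#include <linux/kvm.h>
#include <sys/ioctl.h>

static __u64 rip_after_next_exit(int vcpu_fd, struct kvm_run *run)
{
        /* Ask KVM to copy the GPRs out on exit (store_regs() above);
         * KVM_SYNC_X86_SREGS and KVM_SYNC_X86_EVENTS work the same way. */
        run->kvm_valid_regs = KVM_SYNC_X86_REGS;

        if (ioctl(vcpu_fd, KVM_RUN, 0) < 0)
                return 0;

        /* To write registers back, modify run->s.regs.regs and set
         * run->kvm_dirty_regs = KVM_SYNC_X86_REGS before the next KVM_RUN;
         * sync_regs() above consumes it on entry. */
        return run->s.regs.regs.rip;
}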
9820
9821static void fx_init(struct kvm_vcpu *vcpu)
9822{
9823        fpstate_init(&vcpu->arch.guest_fpu->state);
9824        if (boot_cpu_has(X86_FEATURE_XSAVES))
9825                vcpu->arch.guest_fpu->state.xsave.header.xcomp_bv =
9826                        host_xcr0 | XSTATE_COMPACTION_ENABLED;
9827
9828        /*
9829         * Ensure guest xcr0 is valid for loading
9830         */
9831        vcpu->arch.xcr0 = XFEATURE_MASK_FP;
9832
9833        vcpu->arch.cr0 |= X86_CR0_ET;
9834}
9835
9836int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
9837{
9838        if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
9839                pr_warn_once("kvm: SMP vm created on host with unstable TSC; "
9840                             "guest TSC will not be reliable\n");
9841
9842        return 0;
9843}
9844
9845int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
9846{
9847        struct page *page;
9848        int r;
9849
9850        if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
9851                vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
9852        else
9853                vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
9854
9855        kvm_set_tsc_khz(vcpu, max_tsc_khz);
9856
9857        r = kvm_mmu_create(vcpu);
9858        if (r < 0)
9859                return r;
9860
9861        if (irqchip_in_kernel(vcpu->kvm)) {
9862                r = kvm_create_lapic(vcpu, lapic_timer_advance_ns);
9863                if (r < 0)
9864                        goto fail_mmu_destroy;
9865                if (kvm_apicv_activated(vcpu->kvm))
9866                        vcpu->arch.apicv_active = true;
9867        } else
9868                static_key_slow_inc(&kvm_no_apic_vcpu);
9869
9870        r = -ENOMEM;
9871
9872        page = alloc_page(GFP_KERNEL | __GFP_ZERO);
9873        if (!page)
9874                goto fail_free_lapic;
9875        vcpu->arch.pio_data = page_address(page);
9876
9877        vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
9878                                       GFP_KERNEL_ACCOUNT);
9879        if (!vcpu->arch.mce_banks)
9880                goto fail_free_pio_data;
9881        vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
9882
9883        if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask,
9884                                GFP_KERNEL_ACCOUNT))
9885                goto fail_free_mce_banks;
9886
9887        if (!alloc_emulate_ctxt(vcpu))
9888                goto free_wbinvd_dirty_mask;
9889
9890        vcpu->arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache,
9891                                                GFP_KERNEL_ACCOUNT);
9892        if (!vcpu->arch.user_fpu) {
9893                pr_err("kvm: failed to allocate userspace's fpu\n");
9894                goto free_emulate_ctxt;
9895        }
9896
9897        vcpu->arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache,
9898                                                 GFP_KERNEL_ACCOUNT);
9899        if (!vcpu->arch.guest_fpu) {
9900                pr_err("kvm: failed to allocate vcpu's fpu\n");
9901                goto free_user_fpu;
9902        }
9903        fx_init(vcpu);
9904
9905        vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
9906
9907        vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
9908
9909        kvm_async_pf_hash_reset(vcpu);
9910        kvm_pmu_init(vcpu);
9911
9912        vcpu->arch.pending_external_vector = -1;
9913        vcpu->arch.preempted_in_kernel = false;
9914
9915        kvm_hv_vcpu_init(vcpu);
9916
9917        r = kvm_x86_ops.vcpu_create(vcpu);
9918        if (r)
9919                goto free_guest_fpu;
9920
9921        vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
9922        vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
9923        kvm_vcpu_mtrr_init(vcpu);
9924        vcpu_load(vcpu);
9925        kvm_vcpu_reset(vcpu, false);
9926        kvm_init_mmu(vcpu, false);
9927        vcpu_put(vcpu);
9928        return 0;
9929
9930free_guest_fpu:
9931        kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu);
9932free_user_fpu:
9933        kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu);
9934free_emulate_ctxt:
9935        kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
9936free_wbinvd_dirty_mask:
9937        free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
9938fail_free_mce_banks:
9939        kfree(vcpu->arch.mce_banks);
9940fail_free_pio_data:
9941        free_page((unsigned long)vcpu->arch.pio_data);
9942fail_free_lapic:
9943        kvm_free_lapic(vcpu);
9944fail_mmu_destroy:
9945        kvm_mmu_destroy(vcpu);
9946        return r;
9947}
9948
9949void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
9950{
9951        struct kvm *kvm = vcpu->kvm;
9952
9953        kvm_hv_vcpu_postcreate(vcpu);
9954
9955        if (mutex_lock_killable(&vcpu->mutex))
9956                return;
9957        vcpu_load(vcpu);
9958        kvm_synchronize_tsc(vcpu, 0);
9959        vcpu_put(vcpu);
9960
9961        /* poll control enabled by default */
9962        vcpu->arch.msr_kvm_poll_control = 1;
9963
9964        mutex_unlock(&vcpu->mutex);
9965
9966        if (kvmclock_periodic_sync && vcpu->vcpu_idx == 0)
9967                schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
9968                                                KVMCLOCK_SYNC_PERIOD);
9969}
9970
9971void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
9972{
9973        struct gfn_to_pfn_cache *cache = &vcpu->arch.st.cache;
9974        int idx;
9975
9976        kvm_release_pfn(cache->pfn, cache->dirty, cache);
9977
9978        kvmclock_reset(vcpu);
9979
9980        kvm_x86_ops.vcpu_free(vcpu);
9981
9982        kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
9983        free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
9984        kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu);
9985        kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu);
9986
9987        kvm_hv_vcpu_uninit(vcpu);
9988        kvm_pmu_destroy(vcpu);
9989        kfree(vcpu->arch.mce_banks);
9990        kvm_free_lapic(vcpu);
9991        idx = srcu_read_lock(&vcpu->kvm->srcu);
9992        kvm_mmu_destroy(vcpu);
9993        srcu_read_unlock(&vcpu->kvm->srcu, idx);
9994        free_page((unsigned long)vcpu->arch.pio_data);
9995        kvfree(vcpu->arch.cpuid_entries);
9996        if (!lapic_in_kernel(vcpu))
9997                static_key_slow_dec(&kvm_no_apic_vcpu);
9998}
9999
10000void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
10001{
10002        kvm_lapic_reset(vcpu, init_event);
10003
10004        vcpu->arch.hflags = 0;
10005
10006        vcpu->arch.smi_pending = 0;
10007        vcpu->arch.smi_count = 0;
10008        atomic_set(&vcpu->arch.nmi_queued, 0);
10009        vcpu->arch.nmi_pending = 0;
10010        vcpu->arch.nmi_injected = false;
10011        kvm_clear_interrupt_queue(vcpu);
10012        kvm_clear_exception_queue(vcpu);
10013
10014        memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
10015        kvm_update_dr0123(vcpu);
10016        vcpu->arch.dr6 = DR6_INIT;
10017        vcpu->arch.dr7 = DR7_FIXED_1;
10018        kvm_update_dr7(vcpu);
10019
10020        vcpu->arch.cr2 = 0;
10021
10022        kvm_make_request(KVM_REQ_EVENT, vcpu);
10023        vcpu->arch.apf.msr_en_val = 0;
10024        vcpu->arch.apf.msr_int_val = 0;
10025        vcpu->arch.st.msr_val = 0;
10026
10027        kvmclock_reset(vcpu);
10028
10029        kvm_clear_async_pf_completion_queue(vcpu);
10030        kvm_async_pf_hash_reset(vcpu);
10031        vcpu->arch.apf.halted = false;
10032
10033        if (kvm_mpx_supported()) {
10034                void *mpx_state_buffer;
10035
10036                /*
10037                 * Avoid reaching the INIT path from kvm_apic_has_events() with the
10038                 * guest FPU loaded, as that would not let userspace fix the state.
10039                 */
10040                if (init_event)
10041                        kvm_put_guest_fpu(vcpu);
10042                mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave,
10043                                        XFEATURE_BNDREGS);
10044                if (mpx_state_buffer)
10045                        memset(mpx_state_buffer, 0, sizeof(struct mpx_bndreg_state));
10046                mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave,
10047                                        XFEATURE_BNDCSR);
10048                if (mpx_state_buffer)
10049                        memset(mpx_state_buffer, 0, sizeof(struct mpx_bndcsr));
10050                if (init_event)
10051                        kvm_load_guest_fpu(vcpu);
10052        }
10053
10054        if (!init_event) {
10055                kvm_pmu_reset(vcpu);
10056                vcpu->arch.smbase = 0x30000;
10057
10058                vcpu->arch.msr_misc_features_enables = 0;
10059
10060                vcpu->arch.xcr0 = XFEATURE_MASK_FP;
10061        }
10062
10063        memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
10064        vcpu->arch.regs_avail = ~0;
10065        vcpu->arch.regs_dirty = ~0;
10066
10067        vcpu->arch.ia32_xss = 0;
10068
10069        kvm_x86_ops.vcpu_reset(vcpu, init_event);
10070}
10071
10072void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
10073{
10074        struct kvm_segment cs;
10075
10076        kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
10077        cs.selector = vector << 8;
10078        cs.base = vector << 12;
10079        kvm_set_segment(vcpu, &cs, VCPU_SREG_CS);
10080        kvm_rip_write(vcpu, 0);
10081}
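
/*
 * Illustrative sketch (not part of this file): the real-mode start address
 * implied by a SIPI vector, matching the shifts in
 * kvm_vcpu_deliver_sipi_vector() above.
 */
#include <stdio.h>

int main(void)
{
        unsigned int vector = 0x10;             /* hypothetical SIPI vector */
        unsigned int cs_selector = vector << 8; /* 0x1000                   */
        unsigned int cs_base = vector << 12;    /* 0x10000                  */

        /* The AP starts at CS:IP = 0x1000:0x0000, i.e. physical 0x10000. */
        printf("vector %#x -> CS sel %#x, CS base %#x, RIP 0\n",
               vector, cs_selector, cs_base);
        return 0;
}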
10082
10083int kvm_arch_hardware_enable(void)
10084{
10085        struct kvm *kvm;
10086        struct kvm_vcpu *vcpu;
10087        int i;
10088        int ret;
10089        u64 local_tsc;
10090        u64 max_tsc = 0;
10091        bool stable, backwards_tsc = false;
10092
10093        kvm_user_return_msr_cpu_online();
10094        ret = kvm_x86_ops.hardware_enable();
10095        if (ret != 0)
10096                return ret;
10097
10098        local_tsc = rdtsc();
10099        stable = !kvm_check_tsc_unstable();
10100        list_for_each_entry(kvm, &vm_list, vm_list) {
10101                kvm_for_each_vcpu(i, vcpu, kvm) {
10102                        if (!stable && vcpu->cpu == smp_processor_id())
10103                                kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
10104                        if (stable && vcpu->arch.last_host_tsc > local_tsc) {
10105                                backwards_tsc = true;
10106                                if (vcpu->arch.last_host_tsc > max_tsc)
10107                                        max_tsc = vcpu->arch.last_host_tsc;
10108                        }
10109                }
10110        }
10111
10112        /*
10113         * Sometimes, even reliable TSCs go backwards.  This happens on
10114         * platforms that reset TSC during suspend or hibernate actions, but
10115         * maintain synchronization.  We must compensate.  Fortunately, we can
10116         * detect that condition here, which happens early in CPU bringup,
10117         * before any KVM threads can be running.  Unfortunately, we can't
10118         * bring the TSCs fully up to date with real time, as we aren't yet far
10119         * enough into CPU bringup that we know how much real time has actually
10120         * elapsed; our helper function, ktime_get_boottime_ns() will be using boot
10121         * variables that haven't been updated yet.
10122         *
10123         * So we simply find the maximum observed TSC above, then record the
10124         * adjustment to TSC in each VCPU.  When the VCPU later gets loaded,
10125         * the adjustment will be applied.  Note that we accumulate
10126         * adjustments, in case multiple suspend cycles happen before some VCPU
10127         * gets a chance to run again.  In the event that no KVM threads get a
10128         * chance to run, we will miss the entire elapsed period, as we'll have
10129         * reset last_host_tsc, so VCPUs will not have the TSC adjusted and may
10130         * lose cycle time.  This isn't too big a deal, since the loss will be
10131         * uniform across all VCPUs (not to mention the scenario is extremely
10132         * unlikely). It is possible that a second hibernate recovery happens
10133         * much faster than a first, causing the observed TSC here to be
10134         * smaller; this would require additional padding adjustment, which is
10135         * why we set last_host_tsc to the local tsc observed here.
10136         *
10137         * N.B. - this code below runs only on platforms with reliable TSC,
10138         * as that is the only way backwards_tsc is set above.  Also note
10139         * that this runs for ALL vcpus, which is not a bug; all VCPUs should
10140         * have the same delta_cyc adjustment applied if backwards_tsc
10141         * is detected.  Note further, this adjustment is only done once,
10142         * as we reset last_host_tsc on all VCPUs to stop this from being
10143         * called multiple times (one for each physical CPU bringup).
10144         *
10145         * Platforms with unreliable TSCs don't have to deal with this, they
10146         * will be compensated by the logic in vcpu_load, which sets the TSC to
10147         * catchup mode.  This will catchup all VCPUs to real time, but cannot
10148         * guarantee that they stay in perfect synchronization.
10149         */
10150        if (backwards_tsc) {
10151                u64 delta_cyc = max_tsc - local_tsc;
10152                list_for_each_entry(kvm, &vm_list, vm_list) {
10153                        kvm->arch.backwards_tsc_observed = true;
10154                        kvm_for_each_vcpu(i, vcpu, kvm) {
10155                                vcpu->arch.tsc_offset_adjustment += delta_cyc;
10156                                vcpu->arch.last_host_tsc = local_tsc;
10157                                kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
10158                        }
10159
10160                        /*
10161                         * We have to disable TSC offset matching; if you were
10162                         * booting a VM while issuing an S4 host suspend,
10163                         * you may have a problem.  Solving this issue is
10164                         * left as an exercise to the reader.
10165                         */
10166                        kvm->arch.last_tsc_nsec = 0;
10167                        kvm->arch.last_tsc_write = 0;
10168                }
10169
10170        }
10171        return 0;
10172}
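
/*
 * Illustrative sketch (not part of this file): the compensation arithmetic
 * described in the long comment inside kvm_arch_hardware_enable(), using
 * made-up cycle counts.  Adjustments accumulate across suspend cycles, just
 * as vcpu->arch.tsc_offset_adjustment does.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t max_tsc   = 5000000000ULL; /* largest last_host_tsc before suspend */
        uint64_t local_tsc = 1000000000ULL; /* rdtsc() observed right after resume  */
        uint64_t adjustment = 0;

        uint64_t delta_cyc = max_tsc - local_tsc;
        adjustment += delta_cyc;            /* first backwards jump                 */
        adjustment += delta_cyc / 2;        /* hypothetical second, faster resume   */

        printf("accumulated TSC offset adjustment: %llu cycles\n",
               (unsigned long long)adjustment);
        return 0;
}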
10173
10174void kvm_arch_hardware_disable(void)
10175{
10176        kvm_x86_ops.hardware_disable();
10177        drop_user_return_notifiers();
10178}
10179
10180int kvm_arch_hardware_setup(void *opaque)
10181{
10182        struct kvm_x86_init_ops *ops = opaque;
10183        int r;
10184
10185        rdmsrl_safe(MSR_EFER, &host_efer);
10186
10187        if (boot_cpu_has(X86_FEATURE_XSAVES))
10188                rdmsrl(MSR_IA32_XSS, host_xss);
10189
10190        r = ops->hardware_setup();
10191        if (r != 0)
10192                return r;
10193
10194        memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops));
10195
10196        if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
10197                supported_xss = 0;
10198
10199#define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f)
10200        cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_);
10201#undef __kvm_cpu_cap_has
10202
10203        if (kvm_has_tsc_control) {
10204                /*
10205                 * Make sure the user can only configure tsc_khz values that
10206                 * fit into a signed integer.
10207                 * A min value is not calculated because it will always
10208                 * be 1 on all machines.
10209                 */
10210                u64 max = min(0x7fffffffULL,
10211                              __scale_tsc(kvm_max_tsc_scaling_ratio, tsc_khz));
10212                kvm_max_guest_tsc_khz = max;
10213
10214                kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits;
10215        }
10216
10217        kvm_init_msr_list();
10218        return 0;
10219}
10220
10221void kvm_arch_hardware_unsetup(void)
10222{
10223        kvm_x86_ops.hardware_unsetup();
10224}
10225
10226int kvm_arch_check_processor_compat(void *opaque)
10227{
10228        struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
10229        struct kvm_x86_init_ops *ops = opaque;
10230
10231        WARN_ON(!irqs_disabled());
10232
10233        if (__cr4_reserved_bits(cpu_has, c) !=
10234            __cr4_reserved_bits(cpu_has, &boot_cpu_data))
10235                return -EIO;
10236
10237        return ops->check_processor_compatibility();
10238}
10239
10240bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu)
10241{
10242        return vcpu->kvm->arch.bsp_vcpu_id == vcpu->vcpu_id;
10243}
10244EXPORT_SYMBOL_GPL(kvm_vcpu_is_reset_bsp);
10245
10246bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
10247{
10248        return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0;
10249}
10250
10251struct static_key kvm_no_apic_vcpu __read_mostly;
10252EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu);
10253
10254void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
10255{
10256        struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
10257
10258        vcpu->arch.l1tf_flush_l1d = true;
10259        if (pmu->version && unlikely(pmu->event_count)) {
10260                pmu->need_cleanup = true;
10261                kvm_make_request(KVM_REQ_PMU, vcpu);
10262        }
10263        kvm_x86_ops.sched_in(vcpu, cpu);
10264}
10265
10266void kvm_arch_free_vm(struct kvm *kvm)
10267{
10268        kfree(kvm->arch.hyperv.hv_pa_pg);
10269        vfree(kvm);
10270}
10271
10272
10273int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
10274{
10275        if (type)
10276                return -EINVAL;
10277
10278        INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
10279        INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
10280        INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
10281        INIT_LIST_HEAD(&kvm->arch.lpage_disallowed_mmu_pages);
10282        INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
10283        atomic_set(&kvm->arch.noncoherent_dma_count, 0);
10284
10285        /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
10286        set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
10287        /* Reserve bit 1 of irq_sources_bitmap for irqfd-resampler */
10288        set_bit(KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
10289                &kvm->arch.irq_sources_bitmap);
10290
10291        raw_spin_lock_init(&kvm->arch.tsc_write_lock);
10292        mutex_init(&kvm->arch.apic_map_lock);
10293        spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
10294
10295        kvm->arch.kvmclock_offset = -get_kvmclock_base_ns();
10296        pvclock_update_vm_gtod_copy(kvm);
10297
10298        kvm->arch.guest_can_read_msr_platform_info = true;
10299
10300        INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
10301        INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
10302
10303        kvm_hv_init_vm(kvm);
10304        kvm_page_track_init(kvm);
10305        kvm_mmu_init_vm(kvm);
10306
10307        return kvm_x86_ops.vm_init(kvm);
10308}
10309
10310int kvm_arch_post_init_vm(struct kvm *kvm)
10311{
10312        return kvm_mmu_post_init_vm(kvm);
10313}
10314
10315static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
10316{
10317        vcpu_load(vcpu);
10318        kvm_mmu_unload(vcpu);
10319        vcpu_put(vcpu);
10320}
10321
10322static void kvm_free_vcpus(struct kvm *kvm)
10323{
10324        unsigned int i;
10325        struct kvm_vcpu *vcpu;
10326
10327        /*
10328         * Unpin any mmu pages first.
10329         */
10330        kvm_for_each_vcpu(i, vcpu, kvm) {
10331                kvm_clear_async_pf_completion_queue(vcpu);
10332                kvm_unload_vcpu_mmu(vcpu);
10333        }
10334        kvm_for_each_vcpu(i, vcpu, kvm)
10335                kvm_vcpu_destroy(vcpu);
10336
10337        mutex_lock(&kvm->lock);
10338        for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
10339                kvm->vcpus[i] = NULL;
10340
10341        atomic_set(&kvm->online_vcpus, 0);
10342        mutex_unlock(&kvm->lock);
10343}
10344
10345void kvm_arch_sync_events(struct kvm *kvm)
10346{
10347        cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work);
10348        cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work);
10349        kvm_free_pit(kvm);
10350}
10351
10352int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
10353{
10354        int i, r;
10355        unsigned long hva, old_npages;
10356        struct kvm_memslots *slots = kvm_memslots(kvm);
10357        struct kvm_memory_slot *slot;
10358
10359        /* Called with kvm->slots_lock held.  */
10360        if (WARN_ON(id >= KVM_MEM_SLOTS_NUM))
10361                return -EINVAL;
10362
10363        slot = id_to_memslot(slots, id);
10364        if (size) {
10365                if (slot && slot->npages)
10366                        return -EEXIST;
10367
10368                /*
10369                 * MAP_SHARED to prevent internal slot pages from being moved
10370                 * by fork()/COW.
10371                 */
10372                hva = vm_mmap(NULL, 0, size, PROT_READ | PROT_WRITE,
10373                              MAP_SHARED | MAP_ANONYMOUS, 0);
10374                if (IS_ERR((void *)hva))
10375                        return PTR_ERR((void *)hva);
10376        } else {
10377                if (!slot || !slot->npages)
10378                        return 0;
10379
10380                old_npages = slot->npages;
10381                hva = 0;
10382        }
10383
10384        for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
10385                struct kvm_userspace_memory_region m;
10386
10387                m.slot = id | (i << 16);
10388                m.flags = 0;
10389                m.guest_phys_addr = gpa;
10390                m.userspace_addr = hva;
10391                m.memory_size = size;
10392                r = __kvm_set_memory_region(kvm, &m);
10393                if (r < 0)
10394                        return r;
10395        }
10396
10397        if (!size)
10398                vm_munmap(hva, old_npages * PAGE_SIZE);
10399
10400        return 0;
10401}
10402EXPORT_SYMBOL_GPL(__x86_set_memory_region);
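
/*
 * Illustrative use (a sketch, not a new caller): internal slots such as the
 * real-mode TSS are created by vendor code with a non-zero size and torn
 * down by passing size == 0, always under slots_lock, roughly:
 *
 *	mutex_lock(&kvm->slots_lock);
 *	r = __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr,
 *				    PAGE_SIZE * 3);
 *	mutex_unlock(&kvm->slots_lock);
 *
 * kvm_arch_destroy_vm() below shows the size == 0 teardown path.
 */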
10403
10404void kvm_arch_pre_destroy_vm(struct kvm *kvm)
10405{
10406        kvm_mmu_pre_destroy_vm(kvm);
10407}
10408
10409void kvm_arch_destroy_vm(struct kvm *kvm)
10410{
10411        u32 i;
10412
10413        if (current->mm == kvm->mm) {
10414                /*
10415                 * Free memory regions allocated on behalf of userspace,
10416                 * unless the memory map has changed due to process exit
10417                 * or fd copying.
10418                 */
10419                mutex_lock(&kvm->slots_lock);
10420                __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
10421                                        0, 0);
10422                __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
10423                                        0, 0);
10424                __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0);
10425                mutex_unlock(&kvm->slots_lock);
10426        }
10427        if (kvm_x86_ops.vm_destroy)
10428                kvm_x86_ops.vm_destroy(kvm);
10429        for (i = 0; i < kvm->arch.msr_filter.count; i++)
10430                kfree(kvm->arch.msr_filter.ranges[i].bitmap);
10431        kvm_pic_destroy(kvm);
10432        kvm_ioapic_destroy(kvm);
10433        kvm_free_vcpus(kvm);
10434        kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
10435        kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1));
10436        kvm_mmu_uninit_vm(kvm);
10437        kvm_page_track_cleanup(kvm);
10438        kvm_hv_destroy_vm(kvm);
10439}
10440
10441void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
10442{
10443        int i;
10444
10445        for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
10446                kvfree(slot->arch.rmap[i]);
10447                slot->arch.rmap[i] = NULL;
10448
10449                if (i == 0)
10450                        continue;
10451
10452                kvfree(slot->arch.lpage_info[i - 1]);
10453                slot->arch.lpage_info[i - 1] = NULL;
10454        }
10455
10456        kvm_page_track_free_memslot(slot);
10457}
10458
10459static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot,
10460                                      unsigned long npages)
10461{
10462        int i;
10463
10464        /*
10465         * Clear out the previous array pointers for the KVM_MR_MOVE case.  The
10466         * old arrays will be freed by __kvm_set_memory_region() if installing
10467         * the new memslot is successful.
10468         */
10469        memset(&slot->arch, 0, sizeof(slot->arch));
10470
10471        for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
10472                struct kvm_lpage_info *linfo;
10473                unsigned long ugfn;
10474                int lpages;
10475                int level = i + 1;
10476
10477                lpages = gfn_to_index(slot->base_gfn + npages - 1,
10478                                      slot->base_gfn, level) + 1;
10479
10480                slot->arch.rmap[i] =
10481                        kvcalloc(lpages, sizeof(*slot->arch.rmap[i]),
10482                                 GFP_KERNEL_ACCOUNT);
10483                if (!slot->arch.rmap[i])
10484                        goto out_free;
10485                if (i == 0)
10486                        continue;
10487
10488                linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL_ACCOUNT);
10489                if (!linfo)
10490                        goto out_free;
10491
10492                slot->arch.lpage_info[i - 1] = linfo;
10493
10494                if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
10495                        linfo[0].disallow_lpage = 1;
10496                if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
10497                        linfo[lpages - 1].disallow_lpage = 1;
10498                ugfn = slot->userspace_addr >> PAGE_SHIFT;
10499                /*
10500                 * If the gfn and userspace address are not aligned wrt each
10501                 * other, disable large page support for this slot.
10502                 */
10503                if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1)) {
10504                        unsigned long j;
10505
10506                        for (j = 0; j < lpages; ++j)
10507                                linfo[j].disallow_lpage = 1;
10508                }
10509        }
10510
10511        if (kvm_page_track_create_memslot(slot, npages))
10512                goto out_free;
10513
10514        return 0;
10515
10516out_free:
10517        for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
10518                kvfree(slot->arch.rmap[i]);
10519                slot->arch.rmap[i] = NULL;
10520                if (i == 0)
10521                        continue;
10522
10523                kvfree(slot->arch.lpage_info[i - 1]);
10524                slot->arch.lpage_info[i - 1] = NULL;
10525        }
10526        return -ENOMEM;
10527}
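
/*
 * Worked example (illustrative numbers): a slot with base_gfn == 0 and
 * npages == 0x800 (8 MiB) gets lpages == 2048 rmap entries at the 4K level,
 * 4 lpage_info entries at 2M and 1 at 1G.  With base_gfn == 0x100 instead,
 * the head and tail of the slot are not 2M-aligned, so linfo[0] and
 * linfo[lpages - 1] have disallow_lpage set and only the interior of the
 * slot can ever be mapped with huge pages.
 */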
10528
10529void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
10530{
10531        struct kvm_vcpu *vcpu;
10532        int i;
10533
10534        /*
10535         * memslots->generation has been incremented.
10536         * mmio generation may have reached its maximum value.
10537         */
10538        kvm_mmu_invalidate_mmio_sptes(kvm, gen);
10539
10540        /* Force re-initialization of steal_time cache */
10541        kvm_for_each_vcpu(i, vcpu, kvm)
10542                kvm_vcpu_kick(vcpu);
10543}
10544
10545int kvm_arch_prepare_memory_region(struct kvm *kvm,
10546                                struct kvm_memory_slot *memslot,
10547                                const struct kvm_userspace_memory_region *mem,
10548                                enum kvm_mr_change change)
10549{
10550        if (change == KVM_MR_CREATE || change == KVM_MR_MOVE)
10551                return kvm_alloc_memslot_metadata(memslot,
10552                                                  mem->memory_size >> PAGE_SHIFT);
10553        return 0;
10554}
10555
10556static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
10557                                     struct kvm_memory_slot *old,
10558                                     struct kvm_memory_slot *new,
10559                                     enum kvm_mr_change change)
10560{
10561        /*
10562         * Nothing to do for RO slots or CREATE/MOVE/DELETE of a slot.
10563         * See comments below.
10564         */
10565        if ((change != KVM_MR_FLAGS_ONLY) || (new->flags & KVM_MEM_READONLY))
10566                return;
10567
10568        /*
10569         * Dirty logging tracks sptes in 4k granularity, meaning that large
10570         * sptes have to be split.  If live migration is successful, the guest
10571         * in the source machine will be destroyed and large sptes will be
10572         * created in the destination. However, if the guest continues to run
10573         * in the source machine (for example if live migration fails), small
10574         * sptes will remain around and cause bad performance.
10575         *
10576         * Scan sptes if dirty logging has been stopped, dropping those
10577         * which can be collapsed into a single large-page spte.  Later
10578         * page faults will create the large-page sptes.
10579         *
10580         * There is no need to do this in any of the following cases:
10581         * CREATE:      No dirty mappings can exist yet.
10582         * MOVE/DELETE: The old mappings will already have been cleaned up by
10583         *              kvm_arch_flush_shadow_memslot().
10584         */
10585        if ((old->flags & KVM_MEM_LOG_DIRTY_PAGES) &&
10586            !(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
10587                kvm_mmu_zap_collapsible_sptes(kvm, new);
10588
10589        /*
10590         * Enable or disable dirty logging for the slot.
10591         *
10592         * For KVM_MR_DELETE and KVM_MR_MOVE, the shadow pages of the old
10593         * slot have been zapped so no dirty logging updates are needed for
10594         * the old slot.
10595         * For KVM_MR_CREATE and KVM_MR_MOVE, once the new slot is visible
10596         * any mappings that might be created in it will consume the
10597         * properties of the new slot and do not need to be updated here.
10598         *
10599         * When PML is enabled, the kvm_x86_ops dirty logging hooks are
10600         * called to enable/disable dirty logging.
10601         *
10602         * When disabling dirty logging with PML enabled, the D-bit is set
10603         * for sptes in the slot in order to prevent unnecessary GPA
10604         * logging in the PML buffer (and potential PML buffer full VMEXIT).
10605         * This guarantees leaving PML enabled for the guest's lifetime
10606         * won't have any additional overhead from PML when the guest is
10607         * running with dirty logging disabled.
10608         *
10609         * When enabling dirty logging, large sptes are write-protected
10610         * so they can be split on first write.  New large sptes cannot
10611         * be created for this slot until the end of the logging.
10612         * See the comments in fast_page_fault().
10613         * For small sptes, nothing is done if the dirty log is in the
10614         * initial-all-set state.  Otherwise, depending on whether pml
10615         * is enabled the D-bit or the W-bit will be cleared.
10616         */
10617        if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
10618                if (kvm_x86_ops.slot_enable_log_dirty) {
10619                        kvm_x86_ops.slot_enable_log_dirty(kvm, new);
10620                } else {
10621                        int level =
10622                                kvm_dirty_log_manual_protect_and_init_set(kvm) ?
10623                                PG_LEVEL_2M : PG_LEVEL_4K;
10624
10625                        /*
10626                         * If we're in initial-all-set mode, we don't need
10627                         * to write-protect any small pages because
10628                         * they're already reported as dirty.  However,
10629                         * we still need to write-protect huge pages
10630                         * so that they can be split lazily on the
10631                         * first write to the huge page.
10632                         */
10633                        kvm_mmu_slot_remove_write_access(kvm, new, level);
10634                }
10635        } else {
10636                if (kvm_x86_ops.slot_disable_log_dirty)
10637                        kvm_x86_ops.slot_disable_log_dirty(kvm, new);
10638        }
10639}
10640
10641void kvm_arch_commit_memory_region(struct kvm *kvm,
10642                                const struct kvm_userspace_memory_region *mem,
10643                                struct kvm_memory_slot *old,
10644                                const struct kvm_memory_slot *new,
10645                                enum kvm_mr_change change)
10646{
10647        if (!kvm->arch.n_requested_mmu_pages)
10648                kvm_mmu_change_mmu_pages(kvm,
10649                                kvm_mmu_calculate_default_mmu_pages(kvm));
10650
10651        /*
10652         * FIXME: const-ify all uses of struct kvm_memory_slot.
10653         */
10654        kvm_mmu_slot_apply_flags(kvm, old, (struct kvm_memory_slot *) new, change);
10655
10656        /* Free the arrays associated with the old memslot. */
10657        if (change == KVM_MR_MOVE)
10658                kvm_arch_free_memslot(kvm, old);
10659}
10660
10661void kvm_arch_flush_shadow_all(struct kvm *kvm)
10662{
10663        kvm_mmu_zap_all(kvm);
10664}
10665
10666void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
10667                                   struct kvm_memory_slot *slot)
10668{
10669        kvm_page_track_flush_slot(kvm, slot);
10670}
10671
10672static inline bool kvm_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
10673{
10674        return (is_guest_mode(vcpu) &&
10675                        kvm_x86_ops.guest_apic_has_interrupt &&
10676                        kvm_x86_ops.guest_apic_has_interrupt(vcpu));
10677}
10678
10679static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
10680{
10681        if (!list_empty_careful(&vcpu->async_pf.done))
10682                return true;
10683
10684        if (kvm_apic_has_events(vcpu))
10685                return true;
10686
10687        if (vcpu->arch.pv.pv_unhalted)
10688                return true;
10689
10690        if (vcpu->arch.exception.pending)
10691                return true;
10692
10693        if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
10694            (vcpu->arch.nmi_pending &&
10695             kvm_x86_ops.nmi_allowed(vcpu, false)))
10696                return true;
10697
10698        if (kvm_test_request(KVM_REQ_SMI, vcpu) ||
10699            (vcpu->arch.smi_pending &&
10700             kvm_x86_ops.smi_allowed(vcpu, false)))
10701                return true;
10702
10703        if (kvm_arch_interrupt_allowed(vcpu) &&
10704            (kvm_cpu_has_interrupt(vcpu) ||
10705            kvm_guest_apic_has_interrupt(vcpu)))
10706                return true;
10707
10708        if (kvm_hv_has_stimer_pending(vcpu))
10709                return true;
10710
10711        if (is_guest_mode(vcpu) &&
10712            kvm_x86_ops.nested_ops->hv_timer_pending &&
10713            kvm_x86_ops.nested_ops->hv_timer_pending(vcpu))
10714                return true;
10715
10716        return false;
10717}
10718
10719int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
10720{
10721        return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu);
10722}
10723
10724bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
10725{
10726        if (READ_ONCE(vcpu->arch.pv.pv_unhalted))
10727                return true;
10728
10729        if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
10730                kvm_test_request(KVM_REQ_SMI, vcpu) ||
10731                 kvm_test_request(KVM_REQ_EVENT, vcpu))
10732                return true;
10733
10734        if (vcpu->arch.apicv_active && kvm_x86_ops.dy_apicv_has_pending_interrupt(vcpu))
10735                return true;
10736
10737        return false;
10738}
10739
10740bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
10741{
10742        return vcpu->arch.preempted_in_kernel;
10743}
10744
10745int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
10746{
10747        return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
10748}
10749
10750int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
10751{
10752        return kvm_x86_ops.interrupt_allowed(vcpu, false);
10753}
10754
10755unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu)
10756{
10757        if (is_64_bit_mode(vcpu))
10758                return kvm_rip_read(vcpu);
10759        return (u32)(get_segment_base(vcpu, VCPU_SREG_CS) +
10760                     kvm_rip_read(vcpu));
10761}
10762EXPORT_SYMBOL_GPL(kvm_get_linear_rip);
10763
10764bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip)
10765{
10766        return kvm_get_linear_rip(vcpu) == linear_rip;
10767}
10768EXPORT_SYMBOL_GPL(kvm_is_linear_rip);
10769
10770unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
10771{
10772        unsigned long rflags;
10773
10774        rflags = kvm_x86_ops.get_rflags(vcpu);
10775        if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
10776                rflags &= ~X86_EFLAGS_TF;
10777        return rflags;
10778}
10779EXPORT_SYMBOL_GPL(kvm_get_rflags);
10780
10781static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
10782{
10783        if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
10784            kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
10785                rflags |= X86_EFLAGS_TF;
10786        kvm_x86_ops.set_rflags(vcpu, rflags);
10787}
10788
10789void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
10790{
10791        __kvm_set_rflags(vcpu, rflags);
10792        kvm_make_request(KVM_REQ_EVENT, vcpu);
10793}
10794EXPORT_SYMBOL_GPL(kvm_set_rflags);
10795
10796void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
10797{
10798        int r;
10799
10800        if ((vcpu->arch.mmu->direct_map != work->arch.direct_map) ||
10801              work->wakeup_all)
10802                return;
10803
10804        r = kvm_mmu_reload(vcpu);
10805        if (unlikely(r))
10806                return;
10807
10808        if (!vcpu->arch.mmu->direct_map &&
10809              work->arch.cr3 != vcpu->arch.mmu->get_guest_pgd(vcpu))
10810                return;
10811
10812        kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true);
10813}
10814
10815static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
10816{
10817        BUILD_BUG_ON(!is_power_of_2(ASYNC_PF_PER_VCPU));
10818
10819        return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU));
10820}
10821
10822static inline u32 kvm_async_pf_next_probe(u32 key)
10823{
10824        return (key + 1) & (ASYNC_PF_PER_VCPU - 1);
10825}
10826
10827static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
10828{
10829        u32 key = kvm_async_pf_hash_fn(gfn);
10830
10831        while (vcpu->arch.apf.gfns[key] != ~0)
10832                key = kvm_async_pf_next_probe(key);
10833
10834        vcpu->arch.apf.gfns[key] = gfn;
10835}
10836
10837static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn)
10838{
10839        int i;
10840        u32 key = kvm_async_pf_hash_fn(gfn);
10841
10842        for (i = 0; i < ASYNC_PF_PER_VCPU &&
10843                     (vcpu->arch.apf.gfns[key] != gfn &&
10844                      vcpu->arch.apf.gfns[key] != ~0); i++)
10845                key = kvm_async_pf_next_probe(key);
10846
10847        return key;
10848}
10849
10850bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
10851{
10852        return vcpu->arch.apf.gfns[kvm_async_pf_gfn_slot(vcpu, gfn)] == gfn;
10853}
10854
10855static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
10856{
10857        u32 i, j, k;
10858
10859        i = j = kvm_async_pf_gfn_slot(vcpu, gfn);
10860
10861        if (WARN_ON_ONCE(vcpu->arch.apf.gfns[i] != gfn))
10862                return;
10863
10864        while (true) {
10865                vcpu->arch.apf.gfns[i] = ~0;
10866                do {
10867                        j = kvm_async_pf_next_probe(j);
10868                        if (vcpu->arch.apf.gfns[j] == ~0)
10869                                return;
10870                        k = kvm_async_pf_hash_fn(vcpu->arch.apf.gfns[j]);
10871                        /*
10872                         * k lies cyclically in ]i,j]
10873                         * |    i.k.j |
10874                         * |....j i.k.| or  |.k..j i...|
10875                         */
10876                } while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j));
10877                vcpu->arch.apf.gfns[i] = vcpu->arch.apf.gfns[j];
10878                i = j;
10879        }
10880}
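
/*
 * vcpu->arch.apf.gfns[] above is a small open-addressed hash table with
 * linear probing; ~0 marks an empty slot and deletion uses backward-shift
 * rather than tombstones.  Illustrative sequence with two hypothetical gfns
 * A and B that both hash to key 5: add(A) fills slot 5, add(B) probes on to
 * slot 6; del(A) empties slot 5, notices that B's home slot (5) does not lie
 * in ]i, j], copies B back into slot 5 and empties slot 6, so a later
 * kvm_find_async_pf_gfn(vcpu, B) still finds it.
 */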
10881
10882static inline int apf_put_user_notpresent(struct kvm_vcpu *vcpu)
10883{
10884        u32 reason = KVM_PV_REASON_PAGE_NOT_PRESENT;
10885
10886        return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &reason,
10887                                      sizeof(reason));
10888}
10889
10890static inline int apf_put_user_ready(struct kvm_vcpu *vcpu, u32 token)
10891{
10892        unsigned int offset = offsetof(struct kvm_vcpu_pv_apf_data, token);
10893
10894        return kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.apf.data,
10895                                             &token, offset, sizeof(token));
10896}
10897
10898static inline bool apf_pageready_slot_free(struct kvm_vcpu *vcpu)
10899{
10900        unsigned int offset = offsetof(struct kvm_vcpu_pv_apf_data, token);
10901        u32 val;
10902
10903        if (kvm_read_guest_offset_cached(vcpu->kvm, &vcpu->arch.apf.data,
10904                                         &val, offset, sizeof(val)))
10905                return false;
10906
10907        return !val;
10908}
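
/*
 * Protocol note: the 'token' field of struct kvm_vcpu_pv_apf_data doubles as
 * the "page ready" slot.  apf_put_user_ready() stores a non-zero token and
 * an interrupt is injected; the guest side (arch/x86/kernel/kvm.c) is
 * expected to clear the field once it has handled the wakeup.  Until then
 * apf_pageready_slot_free() returns false and, via
 * kvm_arch_can_dequeue_async_page_present(), further "page ready" events
 * stay queued instead of overwriting an unconsumed token.
 */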
10909
10910static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu)
10911{
10912        if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu))
10913                return false;
10914
10915        if (!kvm_pv_async_pf_enabled(vcpu) ||
10916            (vcpu->arch.apf.send_user_only && kvm_x86_ops.get_cpl(vcpu) == 0))
10917                return false;
10918
10919        return true;
10920}
10921
10922bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
10923{
10924        if (unlikely(!lapic_in_kernel(vcpu) ||
10925                     kvm_event_needs_reinjection(vcpu) ||
10926                     vcpu->arch.exception.pending))
10927                return false;
10928
10929        if (kvm_hlt_in_guest(vcpu->kvm) && !kvm_can_deliver_async_pf(vcpu))
10930                return false;
10931
10932        /*
10933         * If interrupts are off we cannot even use an artificial
10934         * halt state.
10935         */
10936        return kvm_arch_interrupt_allowed(vcpu);
10937}
10938
10939bool kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
10940                                     struct kvm_async_pf *work)
10941{
10942        struct x86_exception fault;
10943
10944        trace_kvm_async_pf_not_present(work->arch.token, work->cr2_or_gpa);
10945        kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
10946
10947        if (kvm_can_deliver_async_pf(vcpu) &&
10948            !apf_put_user_notpresent(vcpu)) {
10949                fault.vector = PF_VECTOR;
10950                fault.error_code_valid = true;
10951                fault.error_code = 0;
10952                fault.nested_page_fault = false;
10953                fault.address = work->arch.token;
10954                fault.async_page_fault = true;
10955                kvm_inject_page_fault(vcpu, &fault);
10956                return true;
10957        } else {
10958                /*
10959                 * It is not possible to deliver a paravirtualized asynchronous
10960                 * page fault, but putting the guest in an artificial halt state
10961                 * can be beneficial nevertheless: if an interrupt arrives, we
10962                 * can deliver it timely and perhaps the guest will schedule
10963                 * another process.  When the instruction that triggered a page
10964                 * fault is retried, hopefully the page will be ready in the host.
10965                 */
10966                kvm_make_request(KVM_REQ_APF_HALT, vcpu);
10967                return false;
10968        }
10969}
10970
10971void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
10972                                 struct kvm_async_pf *work)
10973{
10974        struct kvm_lapic_irq irq = {
10975                .delivery_mode = APIC_DM_FIXED,
10976                .vector = vcpu->arch.apf.vec
10977        };
10978
10979        if (work->wakeup_all)
10980                work->arch.token = ~0; /* broadcast wakeup */
10981        else
10982                kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
10983        trace_kvm_async_pf_ready(work->arch.token, work->cr2_or_gpa);
10984
10985        if ((work->wakeup_all || work->notpresent_injected) &&
10986            kvm_pv_async_pf_enabled(vcpu) &&
10987            !apf_put_user_ready(vcpu, work->arch.token)) {
10988                vcpu->arch.apf.pageready_pending = true;
10989                kvm_apic_set_irq(vcpu, &irq, NULL);
10990        }
10991
10992        vcpu->arch.apf.halted = false;
10993        vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
10994}
10995
10996void kvm_arch_async_page_present_queued(struct kvm_vcpu *vcpu)
10997{
10998        kvm_make_request(KVM_REQ_APF_READY, vcpu);
10999        if (!vcpu->arch.apf.pageready_pending)
11000                kvm_vcpu_kick(vcpu);
11001}
11002
11003bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu)
11004{
11005        if (!kvm_pv_async_pf_enabled(vcpu))
11006                return true;
11007        else
11008                return apf_pageready_slot_free(vcpu);
11009}
11010
11011void kvm_arch_start_assignment(struct kvm *kvm)
11012{
11013        atomic_inc(&kvm->arch.assigned_device_count);
11014}
11015EXPORT_SYMBOL_GPL(kvm_arch_start_assignment);
11016
11017void kvm_arch_end_assignment(struct kvm *kvm)
11018{
11019        atomic_dec(&kvm->arch.assigned_device_count);
11020}
11021EXPORT_SYMBOL_GPL(kvm_arch_end_assignment);
11022
11023bool kvm_arch_has_assigned_device(struct kvm *kvm)
11024{
11025        return atomic_read(&kvm->arch.assigned_device_count);
11026}
11027EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device);
11028
11029void kvm_arch_register_noncoherent_dma(struct kvm *kvm)
11030{
11031        atomic_inc(&kvm->arch.noncoherent_dma_count);
11032}
11033EXPORT_SYMBOL_GPL(kvm_arch_register_noncoherent_dma);
11034
11035void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm)
11036{
11037        atomic_dec(&kvm->arch.noncoherent_dma_count);
11038}
11039EXPORT_SYMBOL_GPL(kvm_arch_unregister_noncoherent_dma);
11040
11041bool kvm_arch_has_noncoherent_dma(struct kvm *kvm)
11042{
11043        return atomic_read(&kvm->arch.noncoherent_dma_count);
11044}
11045EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma);
11046
11047bool kvm_arch_has_irq_bypass(void)
11048{
11049        return true;
11050}
11051
11052int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
11053                                      struct irq_bypass_producer *prod)
11054{
11055        struct kvm_kernel_irqfd *irqfd =
11056                container_of(cons, struct kvm_kernel_irqfd, consumer);
11057        int ret;
11058
11059        irqfd->producer = prod;
11060        kvm_arch_start_assignment(irqfd->kvm);
11061        ret = kvm_x86_ops.update_pi_irte(irqfd->kvm,
11062                                         prod->irq, irqfd->gsi, 1);
11063
11064        if (ret)
11065                kvm_arch_end_assignment(irqfd->kvm);
11066
11067        return ret;
11068}
11069
11070void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
11071                                      struct irq_bypass_producer *prod)
11072{
11073        int ret;
11074        struct kvm_kernel_irqfd *irqfd =
11075                container_of(cons, struct kvm_kernel_irqfd, consumer);
11076
11077        WARN_ON(irqfd->producer != prod);
11078        irqfd->producer = NULL;
11079
11080        /*
11081         * When the producer of a consumer is unregistered, we change back to
11082         * remapped mode, so we can re-use the current implementation
11083         * when the irq is masked/disabled or the consumer side (KVM
11084         * in this case) doesn't want to receive the interrupts.
11085         */
11086        ret = kvm_x86_ops.update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0);
11087        if (ret)
11088                printk(KERN_INFO "irq bypass consumer (token %p) unregistration"
11089                       " fails: %d\n", irqfd->consumer.token, ret);
11090
11091        kvm_arch_end_assignment(irqfd->kvm);
11092}
11093
11094int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
11095                                   uint32_t guest_irq, bool set)
11096{
11097        return kvm_x86_ops.update_pi_irte(kvm, host_irq, guest_irq, set);
11098}
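
/*
 * This is called when the GSI routing behind an irqfd with a registered
 * producer changes, so the vendor posted-interrupt configuration (VT-d IRTE
 * on Intel, AVIC on AMD) is re-programmed to follow the new route.
 */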
11099
11100bool kvm_vector_hashing_enabled(void)
11101{
11102        return vector_hashing;
11103}
11104
11105bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
11106{
11107        return (vcpu->arch.msr_kvm_poll_control & 1) == 0;
11108}
11109EXPORT_SYMBOL_GPL(kvm_arch_no_poll);
11110
11111
11112int kvm_spec_ctrl_test_value(u64 value)
11113{
11114        /*
11115         * Test that setting IA32_SPEC_CTRL to the given value
11116         * is allowed by the host processor.
11117         */
11118
11119        u64 saved_value;
11120        unsigned long flags;
11121        int ret = 0;
11122
11123        local_irq_save(flags);
11124
11125        if (rdmsrl_safe(MSR_IA32_SPEC_CTRL, &saved_value))
11126                ret = 1;
11127        else if (wrmsrl_safe(MSR_IA32_SPEC_CTRL, value))
11128                ret = 1;
11129        else
11130                wrmsrl(MSR_IA32_SPEC_CTRL, saved_value);
11131
11132        local_irq_restore(flags);
11133
11134        return ret;
11135}
11136EXPORT_SYMBOL_GPL(kvm_spec_ctrl_test_value);
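
/*
 * Illustrative caller (a sketch of how vendor code can use this when the
 * guest writes MSR_IA32_SPEC_CTRL before the MSR is passed through):
 *
 *	if (kvm_spec_ctrl_test_value(data))
 *		return 1;	// reject the write, guest gets #GP
 *
 * Probing with an actual WRMSR is simpler than enumerating every feature bit
 * that determines which IA32_SPEC_CTRL bits are valid on this host.
 */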
11137
11138void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code)
11139{
11140        struct x86_exception fault;
11141        u32 access = error_code &
11142                (PFERR_WRITE_MASK | PFERR_FETCH_MASK | PFERR_USER_MASK);
11143
11144        if (!(error_code & PFERR_PRESENT_MASK) ||
11145            vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, &fault) != UNMAPPED_GVA) {
11146                /*
11147                 * If vcpu->arch.walk_mmu->gva_to_gpa succeeded, the page
11148                 * tables probably do not match the TLB.  Just proceed
11149                 * with the error code that the processor gave.
11150                 */
11151                fault.vector = PF_VECTOR;
11152                fault.error_code_valid = true;
11153                fault.error_code = error_code;
11154                fault.nested_page_fault = false;
11155                fault.address = gva;
11156        }
11157        vcpu->arch.walk_mmu->inject_page_fault(vcpu, &fault);
11158}
11159EXPORT_SYMBOL_GPL(kvm_fixup_and_inject_pf_error);
11160
11161/*
11162 * Handles kvm_read/write_guest_virt*() result and either injects #PF or returns
11163 * KVM_EXIT_INTERNAL_ERROR for cases not currently handled by KVM. Return value
11164 * indicates whether exit to userspace is needed.
11165 */
11166int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
11167                              struct x86_exception *e)
11168{
11169        if (r == X86EMUL_PROPAGATE_FAULT) {
11170                kvm_inject_emulated_page_fault(vcpu, e);
11171                return 1;
11172        }
11173
11174        /*
11175         * In case kvm_read/write_guest_virt*() failed with X86EMUL_IO_NEEDED
11176         * while handling a VMX instruction, KVM could've handled the request
11177         * correctly by exiting to userspace and performing I/O, but there
11178         * doesn't seem to be a real use-case for such requests; just return
11179         * KVM_EXIT_INTERNAL_ERROR for now.
11180         */
11181        vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
11182        vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
11183        vcpu->run->internal.ndata = 0;
11184
11185        return 0;
11186}
11187EXPORT_SYMBOL_GPL(kvm_handle_memory_failure);
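
/*
 * The return value follows the exit-handler convention of the callers (e.g.
 * kvm_handle_invpcid() below): a positive value means the guest can be
 * resumed, 0 means vcpu->run has been filled in and the caller must exit to
 * userspace.
 */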
11188
11189int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
11190{
11191        bool pcid_enabled;
11192        struct x86_exception e;
11193        unsigned i;
11194        unsigned long roots_to_free = 0;
11195        struct {
11196                u64 pcid;
11197                u64 gla;
11198        } operand;
11199        int r;
11200
11201        r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
11202        if (r != X86EMUL_CONTINUE)
11203                return kvm_handle_memory_failure(vcpu, r, &e);
11204
11205        if (operand.pcid >> 12 != 0) {
11206                kvm_inject_gp(vcpu, 0);
11207                return 1;
11208        }
11209
11210        pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
11211
11212        switch (type) {
11213        case INVPCID_TYPE_INDIV_ADDR:
11214                if ((!pcid_enabled && (operand.pcid != 0)) ||
11215                    is_noncanonical_address(operand.gla, vcpu)) {
11216                        kvm_inject_gp(vcpu, 0);
11217                        return 1;
11218                }
11219                kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid);
11220                return kvm_skip_emulated_instruction(vcpu);
11221
11222        case INVPCID_TYPE_SINGLE_CTXT:
11223                if (!pcid_enabled && (operand.pcid != 0)) {
11224                        kvm_inject_gp(vcpu, 0);
11225                        return 1;
11226                }
11227
11228                if (kvm_get_active_pcid(vcpu) == operand.pcid) {
11229                        kvm_mmu_sync_roots(vcpu);
11230                        kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
11231                }
11232
11233                for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
11234                        if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].pgd)
11235                            == operand.pcid)
11236                                roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
11237
11238                kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free);
11239                /*
11240                 * If neither the current cr3 nor any of the prev_roots use the
11241                 * given PCID, then nothing needs to be done here because a
11242                 * resync will happen anyway before switching to any other CR3.
11243                 */
11244
11245                return kvm_skip_emulated_instruction(vcpu);
11246
11247        case INVPCID_TYPE_ALL_NON_GLOBAL:
11248                /*
11249                 * Currently, KVM doesn't mark global entries in the shadow
11250                 * page tables, so a non-global flush just degenerates to a
11251                 * global flush. If needed, we could optimize this later by
11252                 * keeping track of global entries in shadow page tables.
11253                 */
11254
11255                fallthrough;
11256        case INVPCID_TYPE_ALL_INCL_GLOBAL:
11257                kvm_mmu_unload(vcpu);
11258                return kvm_skip_emulated_instruction(vcpu);
11259
11260        default:
11261                BUG(); /* Callers have already checked that type <= 3 */
11262        }
11263}
11264EXPORT_SYMBOL_GPL(kvm_handle_invpcid);
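
/*
 * The 16-byte descriptor read at the top of kvm_handle_invpcid() follows the
 * INVPCID layout from the SDM: PCID in bits 0-11 of the first quadword
 * (hence the 'operand.pcid >> 12' reserved-bits check), the remaining bits
 * of that quadword reserved, and the linear address in the second quadword,
 * which is only consulted for the individual-address invalidation type.
 */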
11265
11266EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
11267EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
11268EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
11269EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
11270EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
11271EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr);
11272EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmrun);
11273EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit);
11274EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
11275EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
11276EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmenter_failed);
11277EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
11278EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
11279EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
11280EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);
11281EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window_update);
11282EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full);
11283EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update);
11284EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access);
11285EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi);
11286EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log);
11287EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_update_request);
11288