linux/arch/x86/kvm/x86.c
   1/*
   2 * Kernel-based Virtual Machine driver for Linux
   3 *
   4 * derived from drivers/kvm/kvm_main.c
   5 *
   6 * Copyright (C) 2006 Qumranet, Inc.
   7 * Copyright (C) 2008 Qumranet, Inc.
   8 * Copyright IBM Corporation, 2008
   9 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  10 *
  11 * Authors:
  12 *   Avi Kivity   <avi@qumranet.com>
  13 *   Yaniv Kamay  <yaniv@qumranet.com>
  14 *   Amit Shah    <amit.shah@qumranet.com>
  15 *   Ben-Ami Yassour <benami@il.ibm.com>
  16 *
  17 * This work is licensed under the terms of the GNU GPL, version 2.  See
  18 * the COPYING file in the top-level directory.
  19 *
  20 */
  21
  22#include <linux/kvm_host.h>
  23#include "irq.h"
  24#include "mmu.h"
  25#include "i8254.h"
  26#include "tss.h"
  27#include "kvm_cache_regs.h"
  28#include "x86.h"
  29#include "cpuid.h"
  30
  31#include <linux/clocksource.h>
  32#include <linux/interrupt.h>
  33#include <linux/kvm.h>
  34#include <linux/fs.h>
  35#include <linux/vmalloc.h>
  36#include <linux/module.h>
  37#include <linux/mman.h>
  38#include <linux/highmem.h>
  39#include <linux/iommu.h>
  40#include <linux/intel-iommu.h>
  41#include <linux/cpufreq.h>
  42#include <linux/user-return-notifier.h>
  43#include <linux/srcu.h>
  44#include <linux/slab.h>
  45#include <linux/perf_event.h>
  46#include <linux/uaccess.h>
  47#include <linux/hash.h>
  48#include <linux/pci.h>
  49#include <linux/timekeeper_internal.h>
  50#include <linux/pvclock_gtod.h>
  51#include <trace/events/kvm.h>
  52
  53#define CREATE_TRACE_POINTS
  54#include "trace.h"
  55
  56#include <asm/debugreg.h>
  57#include <asm/msr.h>
  58#include <asm/desc.h>
  59#include <asm/mtrr.h>
  60#include <asm/mce.h>
  61#include <asm/i387.h>
  62#include <asm/fpu-internal.h> /* Ugh! */
  63#include <asm/xcr.h>
  64#include <asm/pvclock.h>
  65#include <asm/div64.h>
  66
  67#define MAX_IO_MSRS 256
  68#define KVM_MAX_MCE_BANKS 32
  69#define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P)
  70
  71#define emul_to_vcpu(ctxt) \
  72        container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt)
  73
  74/* EFER defaults:
   75 * - enable syscall by default because it's emulated by KVM
   76 * - enable LME and LMA by default on 64-bit KVM
  77 */
  78#ifdef CONFIG_X86_64
  79static
  80u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA));
  81#else
  82static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
  83#endif
  84
  85#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
  86#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
  87
  88static void update_cr8_intercept(struct kvm_vcpu *vcpu);
  89static void process_nmi(struct kvm_vcpu *vcpu);
  90
  91struct kvm_x86_ops *kvm_x86_ops;
  92EXPORT_SYMBOL_GPL(kvm_x86_ops);
  93
   94static bool ignore_msrs = false;
  95module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
  96
  97unsigned int min_timer_period_us = 500;
  98module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
  99
 100bool kvm_has_tsc_control;
 101EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
 102u32  kvm_max_guest_tsc_khz;
 103EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
 104
 105/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
 106static u32 tsc_tolerance_ppm = 250;
 107module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
 108
 109static bool backwards_tsc_observed = false;
 110
 111#define KVM_NR_SHARED_MSRS 16
 112
 113struct kvm_shared_msrs_global {
 114        int nr;
 115        u32 msrs[KVM_NR_SHARED_MSRS];
 116};
 117
 118struct kvm_shared_msrs {
 119        struct user_return_notifier urn;
 120        bool registered;
 121        struct kvm_shared_msr_values {
 122                u64 host;
 123                u64 curr;
 124        } values[KVM_NR_SHARED_MSRS];
 125};
 126
 127static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
 128static struct kvm_shared_msrs __percpu *shared_msrs;
 129
 130struct kvm_stats_debugfs_item debugfs_entries[] = {
 131        { "pf_fixed", VCPU_STAT(pf_fixed) },
 132        { "pf_guest", VCPU_STAT(pf_guest) },
 133        { "tlb_flush", VCPU_STAT(tlb_flush) },
 134        { "invlpg", VCPU_STAT(invlpg) },
 135        { "exits", VCPU_STAT(exits) },
 136        { "io_exits", VCPU_STAT(io_exits) },
 137        { "mmio_exits", VCPU_STAT(mmio_exits) },
 138        { "signal_exits", VCPU_STAT(signal_exits) },
 139        { "irq_window", VCPU_STAT(irq_window_exits) },
 140        { "nmi_window", VCPU_STAT(nmi_window_exits) },
 141        { "halt_exits", VCPU_STAT(halt_exits) },
 142        { "halt_wakeup", VCPU_STAT(halt_wakeup) },
 143        { "hypercalls", VCPU_STAT(hypercalls) },
 144        { "request_irq", VCPU_STAT(request_irq_exits) },
 145        { "irq_exits", VCPU_STAT(irq_exits) },
 146        { "host_state_reload", VCPU_STAT(host_state_reload) },
 147        { "efer_reload", VCPU_STAT(efer_reload) },
 148        { "fpu_reload", VCPU_STAT(fpu_reload) },
 149        { "insn_emulation", VCPU_STAT(insn_emulation) },
 150        { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
 151        { "irq_injections", VCPU_STAT(irq_injections) },
 152        { "nmi_injections", VCPU_STAT(nmi_injections) },
 153        { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
 154        { "mmu_pte_write", VM_STAT(mmu_pte_write) },
 155        { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
 156        { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
 157        { "mmu_flooded", VM_STAT(mmu_flooded) },
 158        { "mmu_recycled", VM_STAT(mmu_recycled) },
 159        { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
 160        { "mmu_unsync", VM_STAT(mmu_unsync) },
 161        { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
 162        { "largepages", VM_STAT(lpages) },
 163        { NULL }
 164};
 165
 166u64 __read_mostly host_xcr0;
 167
 168static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
 169
 170static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
 171{
 172        int i;
 173        for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++)
 174                vcpu->arch.apf.gfns[i] = ~0;
 175}
 176
 177static void kvm_on_user_return(struct user_return_notifier *urn)
 178{
 179        unsigned slot;
 180        struct kvm_shared_msrs *locals
 181                = container_of(urn, struct kvm_shared_msrs, urn);
 182        struct kvm_shared_msr_values *values;
 183
 184        for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
 185                values = &locals->values[slot];
 186                if (values->host != values->curr) {
 187                        wrmsrl(shared_msrs_global.msrs[slot], values->host);
 188                        values->curr = values->host;
 189                }
 190        }
 191        locals->registered = false;
 192        user_return_notifier_unregister(urn);
 193}
 194
 195static void shared_msr_update(unsigned slot, u32 msr)
 196{
 197        u64 value;
 198        unsigned int cpu = smp_processor_id();
 199        struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
 200
  201        /* only read, and nobody should modify it at this time,
  202         * so no lock is needed */
 203        if (slot >= shared_msrs_global.nr) {
 204                printk(KERN_ERR "kvm: invalid MSR slot!");
 205                return;
 206        }
 207        rdmsrl_safe(msr, &value);
 208        smsr->values[slot].host = value;
 209        smsr->values[slot].curr = value;
 210}
 211
 212void kvm_define_shared_msr(unsigned slot, u32 msr)
 213{
 214        if (slot >= shared_msrs_global.nr)
 215                shared_msrs_global.nr = slot + 1;
 216        shared_msrs_global.msrs[slot] = msr;
  217        /* make sure shared_msrs_global has been updated before it is read */
 218        smp_wmb();
 219}
 220EXPORT_SYMBOL_GPL(kvm_define_shared_msr);
 221
 222static void kvm_shared_msr_cpu_online(void)
 223{
 224        unsigned i;
 225
 226        for (i = 0; i < shared_msrs_global.nr; ++i)
 227                shared_msr_update(i, shared_msrs_global.msrs[i]);
 228}
 229
 230void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
 231{
 232        unsigned int cpu = smp_processor_id();
 233        struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
 234
 235        if (((value ^ smsr->values[slot].curr) & mask) == 0)
 236                return;
 237        smsr->values[slot].curr = value;
 238        wrmsrl(shared_msrs_global.msrs[slot], value);
 239        if (!smsr->registered) {
 240                smsr->urn.on_user_return = kvm_on_user_return;
 241                user_return_notifier_register(&smsr->urn);
 242                smsr->registered = true;
 243        }
 244}
 245EXPORT_SYMBOL_GPL(kvm_set_shared_msr);
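
/*
 * Illustrative sketch, not used by KVM itself: roughly how a vendor module
 * (vmx.c/svm.c) is expected to drive the shared-MSR machinery above.  The
 * slot number 0, the choice of MSR_STAR and the all-ones mask are arbitrary
 * values picked for this example.
 */
static inline void shared_msr_usage_sketch(u64 guest_star)
{
        /* Once, at hardware setup time: claim slot 0 for MSR_STAR. */
        kvm_define_shared_msr(0, MSR_STAR);

        /*
         * On the guest-entry path: install the guest value.  The host value
         * is restored lazily by kvm_on_user_return() when the vcpu thread
         * returns to userspace, and the WRMSR is skipped entirely if the
         * masked value did not change.
         */
        kvm_set_shared_msr(0, guest_star, ~0ull);
}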
 246
 247static void drop_user_return_notifiers(void *ignore)
 248{
 249        unsigned int cpu = smp_processor_id();
 250        struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
 251
 252        if (smsr->registered)
 253                kvm_on_user_return(&smsr->urn);
 254}
 255
 256u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
 257{
 258        return vcpu->arch.apic_base;
 259}
 260EXPORT_SYMBOL_GPL(kvm_get_apic_base);
 261
 262int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 263{
 264        u64 old_state = vcpu->arch.apic_base &
 265                (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE);
 266        u64 new_state = msr_info->data &
 267                (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE);
 268        u64 reserved_bits = ((~0ULL) << cpuid_maxphyaddr(vcpu)) |
 269                0x2ff | (guest_cpuid_has_x2apic(vcpu) ? 0 : X2APIC_ENABLE);
 270
 271        if (!msr_info->host_initiated &&
 272            ((msr_info->data & reserved_bits) != 0 ||
 273             new_state == X2APIC_ENABLE ||
 274             (new_state == MSR_IA32_APICBASE_ENABLE &&
 275              old_state == (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) ||
 276             (new_state == (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE) &&
 277              old_state == 0)))
 278                return 1;
 279
 280        kvm_lapic_set_base(vcpu, msr_info->data);
 281        return 0;
 282}
 283EXPORT_SYMBOL_GPL(kvm_set_apic_base);
 284
 285asmlinkage __visible void kvm_spurious_fault(void)
 286{
 287        /* Fault while not rebooting.  We want the trace. */
 288        BUG();
 289}
 290EXPORT_SYMBOL_GPL(kvm_spurious_fault);
 291
 292#define EXCPT_BENIGN            0
 293#define EXCPT_CONTRIBUTORY      1
 294#define EXCPT_PF                2
 295
 296static int exception_class(int vector)
 297{
 298        switch (vector) {
 299        case PF_VECTOR:
 300                return EXCPT_PF;
 301        case DE_VECTOR:
 302        case TS_VECTOR:
 303        case NP_VECTOR:
 304        case SS_VECTOR:
 305        case GP_VECTOR:
 306                return EXCPT_CONTRIBUTORY;
 307        default:
 308                break;
 309        }
 310        return EXCPT_BENIGN;
 311}
 312
 313static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
 314                unsigned nr, bool has_error, u32 error_code,
 315                bool reinject)
 316{
 317        u32 prev_nr;
 318        int class1, class2;
 319
 320        kvm_make_request(KVM_REQ_EVENT, vcpu);
 321
 322        if (!vcpu->arch.exception.pending) {
 323        queue:
 324                vcpu->arch.exception.pending = true;
 325                vcpu->arch.exception.has_error_code = has_error;
 326                vcpu->arch.exception.nr = nr;
 327                vcpu->arch.exception.error_code = error_code;
 328                vcpu->arch.exception.reinject = reinject;
 329                return;
 330        }
 331
  332        /* an exception is already pending: decide how the two combine */
 333        prev_nr = vcpu->arch.exception.nr;
 334        if (prev_nr == DF_VECTOR) {
 335                /* triple fault -> shutdown */
 336                kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
 337                return;
 338        }
 339        class1 = exception_class(prev_nr);
 340        class2 = exception_class(nr);
 341        if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
 342                || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
 343                /* generate double fault per SDM Table 5-5 */
 344                vcpu->arch.exception.pending = true;
 345                vcpu->arch.exception.has_error_code = true;
 346                vcpu->arch.exception.nr = DF_VECTOR;
 347                vcpu->arch.exception.error_code = 0;
 348        } else
  349                /* replace previous exception with a new one in the hope
  350                   that instruction re-execution will regenerate the lost
  351                   exception */
 352                goto queue;
 353}
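
/*
 * Illustrative sketch, not called anywhere: the merging rules above in
 * action.  Queueing a second contributory exception while another
 * contributory one is still pending collapses the pair into a #DF, per
 * SDM Table 5-5; a benign exception would instead simply replace the
 * pending one via the "goto queue" path.
 */
static inline void exception_merge_sketch(struct kvm_vcpu *vcpu)
{
        kvm_queue_exception_e(vcpu, GP_VECTOR, 0);      /* #GP becomes pending */
        kvm_queue_exception_e(vcpu, GP_VECTOR, 0);      /* merged into #DF     */
}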
 354
 355void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
 356{
 357        kvm_multiple_exception(vcpu, nr, false, 0, false);
 358}
 359EXPORT_SYMBOL_GPL(kvm_queue_exception);
 360
 361void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
 362{
 363        kvm_multiple_exception(vcpu, nr, false, 0, true);
 364}
 365EXPORT_SYMBOL_GPL(kvm_requeue_exception);
 366
 367void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
 368{
 369        if (err)
 370                kvm_inject_gp(vcpu, 0);
 371        else
 372                kvm_x86_ops->skip_emulated_instruction(vcpu);
 373}
 374EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
 375
 376void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
 377{
 378        ++vcpu->stat.pf_guest;
 379        vcpu->arch.cr2 = fault->address;
 380        kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
 381}
 382EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
 383
 384void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
 385{
 386        if (mmu_is_nested(vcpu) && !fault->nested_page_fault)
 387                vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault);
 388        else
 389                vcpu->arch.mmu.inject_page_fault(vcpu, fault);
 390}
 391
 392void kvm_inject_nmi(struct kvm_vcpu *vcpu)
 393{
 394        atomic_inc(&vcpu->arch.nmi_queued);
 395        kvm_make_request(KVM_REQ_NMI, vcpu);
 396}
 397EXPORT_SYMBOL_GPL(kvm_inject_nmi);
 398
 399void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
 400{
 401        kvm_multiple_exception(vcpu, nr, true, error_code, false);
 402}
 403EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
 404
 405void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
 406{
 407        kvm_multiple_exception(vcpu, nr, true, error_code, true);
 408}
 409EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);
 410
 411/*
 412 * Checks if cpl <= required_cpl; if true, return true.  Otherwise queue
 413 * a #GP and return false.
 414 */
 415bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
 416{
 417        if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl)
 418                return true;
 419        kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
 420        return false;
 421}
 422EXPORT_SYMBOL_GPL(kvm_require_cpl);
 423
 424/*
 425 * This function will be used to read from the physical memory of the currently
  426 * running guest. The difference from kvm_read_guest_page() is that this function
  427 * can read from guest physical memory or from the guest's guest physical memory.
 428 */
 429int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 430                            gfn_t ngfn, void *data, int offset, int len,
 431                            u32 access)
 432{
 433        gfn_t real_gfn;
 434        gpa_t ngpa;
 435
 436        ngpa     = gfn_to_gpa(ngfn);
 437        real_gfn = mmu->translate_gpa(vcpu, ngpa, access);
 438        if (real_gfn == UNMAPPED_GVA)
 439                return -EFAULT;
 440
 441        real_gfn = gpa_to_gfn(real_gfn);
 442
 443        return kvm_read_guest_page(vcpu->kvm, real_gfn, data, offset, len);
 444}
 445EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
 446
 447int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
 448                               void *data, int offset, int len, u32 access)
 449{
 450        return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn,
 451                                       data, offset, len, access);
 452}
 453
 454/*
  455 * Load the PAE PDPTRs.  Return true if they are all valid.
 456 */
 457int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
 458{
 459        gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
 460        unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
 461        int i;
 462        int ret;
 463        u64 pdpte[ARRAY_SIZE(mmu->pdptrs)];
 464
 465        ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte,
 466                                      offset * sizeof(u64), sizeof(pdpte),
 467                                      PFERR_USER_MASK|PFERR_WRITE_MASK);
 468        if (ret < 0) {
 469                ret = 0;
 470                goto out;
 471        }
 472        for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
 473                if (is_present_gpte(pdpte[i]) &&
 474                    (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) {
 475                        ret = 0;
 476                        goto out;
 477                }
 478        }
 479        ret = 1;
 480
 481        memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
 482        __set_bit(VCPU_EXREG_PDPTR,
 483                  (unsigned long *)&vcpu->arch.regs_avail);
 484        __set_bit(VCPU_EXREG_PDPTR,
 485                  (unsigned long *)&vcpu->arch.regs_dirty);
 486out:
 487
 488        return ret;
 489}
 490EXPORT_SYMBOL_GPL(load_pdptrs);
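
/*
 * Worked example for the offset arithmetic above (illustrative value):
 * with cr3 = 0x12340a0 the PDPT sits at guest physical 0x12340a0, so
 * pdpt_gfn = 0x1234 and offset = (0x0a0 >> 5) << 2 = 0x14; the read then
 * starts at byte offset 0x14 * sizeof(u64) = 0xa0 within that page and
 * fetches the four 8-byte PDPTEs of the 32-byte-aligned table.
 */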
 491
 492static bool pdptrs_changed(struct kvm_vcpu *vcpu)
 493{
 494        u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)];
 495        bool changed = true;
 496        int offset;
 497        gfn_t gfn;
 498        int r;
 499
 500        if (is_long_mode(vcpu) || !is_pae(vcpu))
 501                return false;
 502
 503        if (!test_bit(VCPU_EXREG_PDPTR,
 504                      (unsigned long *)&vcpu->arch.regs_avail))
 505                return true;
 506
 507        gfn = (kvm_read_cr3(vcpu) & ~31u) >> PAGE_SHIFT;
 508        offset = (kvm_read_cr3(vcpu) & ~31u) & (PAGE_SIZE - 1);
 509        r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
 510                                       PFERR_USER_MASK | PFERR_WRITE_MASK);
 511        if (r < 0)
 512                goto out;
 513        changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
 514out:
 515
 516        return changed;
 517}
 518
 519int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 520{
 521        unsigned long old_cr0 = kvm_read_cr0(vcpu);
 522        unsigned long update_bits = X86_CR0_PG | X86_CR0_WP |
 523                                    X86_CR0_CD | X86_CR0_NW;
 524
 525        cr0 |= X86_CR0_ET;
 526
 527#ifdef CONFIG_X86_64
 528        if (cr0 & 0xffffffff00000000UL)
 529                return 1;
 530#endif
 531
 532        cr0 &= ~CR0_RESERVED_BITS;
 533
 534        if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
 535                return 1;
 536
 537        if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
 538                return 1;
 539
 540        if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
 541#ifdef CONFIG_X86_64
 542                if ((vcpu->arch.efer & EFER_LME)) {
 543                        int cs_db, cs_l;
 544
 545                        if (!is_pae(vcpu))
 546                                return 1;
 547                        kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
 548                        if (cs_l)
 549                                return 1;
 550                } else
 551#endif
 552                if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
 553                                                 kvm_read_cr3(vcpu)))
 554                        return 1;
 555        }
 556
 557        if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
 558                return 1;
 559
 560        kvm_x86_ops->set_cr0(vcpu, cr0);
 561
 562        if ((cr0 ^ old_cr0) & X86_CR0_PG) {
 563                kvm_clear_async_pf_completion_queue(vcpu);
 564                kvm_async_pf_hash_reset(vcpu);
 565        }
 566
 567        if ((cr0 ^ old_cr0) & update_bits)
 568                kvm_mmu_reset_context(vcpu);
 569        return 0;
 570}
 571EXPORT_SYMBOL_GPL(kvm_set_cr0);
 572
 573void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
 574{
 575        (void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
 576}
 577EXPORT_SYMBOL_GPL(kvm_lmsw);
 578
 579static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu)
 580{
 581        if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) &&
 582                        !vcpu->guest_xcr0_loaded) {
 583                /* kvm_set_xcr() also depends on this */
 584                xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
 585                vcpu->guest_xcr0_loaded = 1;
 586        }
 587}
 588
 589static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
 590{
 591        if (vcpu->guest_xcr0_loaded) {
 592                if (vcpu->arch.xcr0 != host_xcr0)
 593                        xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
 594                vcpu->guest_xcr0_loaded = 0;
 595        }
 596}
 597
 598int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
 599{
 600        u64 xcr0 = xcr;
 601        u64 old_xcr0 = vcpu->arch.xcr0;
 602        u64 valid_bits;
 603
 604        /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now  */
 605        if (index != XCR_XFEATURE_ENABLED_MASK)
 606                return 1;
 607        if (!(xcr0 & XSTATE_FP))
 608                return 1;
 609        if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE))
 610                return 1;
 611
 612        /*
 613         * Do not allow the guest to set bits that we do not support
 614         * saving.  However, xcr0 bit 0 is always set, even if the
 615         * emulated CPU does not support XSAVE (see fx_init).
 616         */
 617        valid_bits = vcpu->arch.guest_supported_xcr0 | XSTATE_FP;
 618        if (xcr0 & ~valid_bits)
 619                return 1;
 620
 621        if ((!(xcr0 & XSTATE_BNDREGS)) != (!(xcr0 & XSTATE_BNDCSR)))
 622                return 1;
 623
 624        kvm_put_guest_xcr0(vcpu);
 625        vcpu->arch.xcr0 = xcr0;
 626
 627        if ((xcr0 ^ old_xcr0) & XSTATE_EXTEND_MASK)
 628                kvm_update_cpuid(vcpu);
 629        return 0;
 630}
 631
 632int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
 633{
 634        if (kvm_x86_ops->get_cpl(vcpu) != 0 ||
 635            __kvm_set_xcr(vcpu, index, xcr)) {
 636                kvm_inject_gp(vcpu, 0);
 637                return 1;
 638        }
 639        return 0;
 640}
 641EXPORT_SYMBOL_GPL(kvm_set_xcr);
 642
 643int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 644{
 645        unsigned long old_cr4 = kvm_read_cr4(vcpu);
 646        unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE |
 647                                   X86_CR4_PAE | X86_CR4_SMEP;
 648        if (cr4 & CR4_RESERVED_BITS)
 649                return 1;
 650
 651        if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE))
 652                return 1;
 653
 654        if (!guest_cpuid_has_smep(vcpu) && (cr4 & X86_CR4_SMEP))
 655                return 1;
 656
 657        if (!guest_cpuid_has_smap(vcpu) && (cr4 & X86_CR4_SMAP))
 658                return 1;
 659
 660        if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_FSGSBASE))
 661                return 1;
 662
 663        if (is_long_mode(vcpu)) {
 664                if (!(cr4 & X86_CR4_PAE))
 665                        return 1;
 666        } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
 667                   && ((cr4 ^ old_cr4) & pdptr_bits)
 668                   && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
 669                                   kvm_read_cr3(vcpu)))
 670                return 1;
 671
 672        if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) {
 673                if (!guest_cpuid_has_pcid(vcpu))
 674                        return 1;
 675
  676                /* PCID cannot be enabled when cr3[11:0] != 000H or EFER.LMA = 0 */
 677                if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
 678                        return 1;
 679        }
 680
 681        if (kvm_x86_ops->set_cr4(vcpu, cr4))
 682                return 1;
 683
 684        if (((cr4 ^ old_cr4) & pdptr_bits) ||
 685            (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
 686                kvm_mmu_reset_context(vcpu);
 687
 688        if ((cr4 ^ old_cr4) & X86_CR4_SMAP)
 689                update_permission_bitmask(vcpu, vcpu->arch.walk_mmu, false);
 690
 691        if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE)
 692                kvm_update_cpuid(vcpu);
 693
 694        return 0;
 695}
 696EXPORT_SYMBOL_GPL(kvm_set_cr4);
 697
 698int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 699{
 700        if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
 701                kvm_mmu_sync_roots(vcpu);
 702                kvm_mmu_flush_tlb(vcpu);
 703                return 0;
 704        }
 705
 706        if (is_long_mode(vcpu)) {
 707                if (kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) {
 708                        if (cr3 & CR3_PCID_ENABLED_RESERVED_BITS)
 709                                return 1;
 710                } else
 711                        if (cr3 & CR3_L_MODE_RESERVED_BITS)
 712                                return 1;
 713        } else {
 714                if (is_pae(vcpu)) {
 715                        if (cr3 & CR3_PAE_RESERVED_BITS)
 716                                return 1;
 717                        if (is_paging(vcpu) &&
 718                            !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
 719                                return 1;
 720                }
 721                /*
  722                 * We don't check reserved bits in non-PAE mode, because
 723                 * this isn't enforced, and VMware depends on this.
 724                 */
 725        }
 726
 727        vcpu->arch.cr3 = cr3;
 728        __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
 729        kvm_mmu_new_cr3(vcpu);
 730        return 0;
 731}
 732EXPORT_SYMBOL_GPL(kvm_set_cr3);
 733
 734int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
 735{
 736        if (cr8 & CR8_RESERVED_BITS)
 737                return 1;
 738        if (irqchip_in_kernel(vcpu->kvm))
 739                kvm_lapic_set_tpr(vcpu, cr8);
 740        else
 741                vcpu->arch.cr8 = cr8;
 742        return 0;
 743}
 744EXPORT_SYMBOL_GPL(kvm_set_cr8);
 745
 746unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
 747{
 748        if (irqchip_in_kernel(vcpu->kvm))
 749                return kvm_lapic_get_cr8(vcpu);
 750        else
 751                return vcpu->arch.cr8;
 752}
 753EXPORT_SYMBOL_GPL(kvm_get_cr8);
 754
 755static void kvm_update_dr6(struct kvm_vcpu *vcpu)
 756{
 757        if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
 758                kvm_x86_ops->set_dr6(vcpu, vcpu->arch.dr6);
 759}
 760
 761static void kvm_update_dr7(struct kvm_vcpu *vcpu)
 762{
 763        unsigned long dr7;
 764
 765        if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
 766                dr7 = vcpu->arch.guest_debug_dr7;
 767        else
 768                dr7 = vcpu->arch.dr7;
 769        kvm_x86_ops->set_dr7(vcpu, dr7);
 770        vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED;
 771        if (dr7 & DR7_BP_EN_MASK)
 772                vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
 773}
 774
 775static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
 776{
 777        switch (dr) {
 778        case 0 ... 3:
 779                vcpu->arch.db[dr] = val;
 780                if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
 781                        vcpu->arch.eff_db[dr] = val;
 782                break;
 783        case 4:
 784                if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
 785                        return 1; /* #UD */
 786                /* fall through */
 787        case 6:
 788                if (val & 0xffffffff00000000ULL)
 789                        return -1; /* #GP */
 790                vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
 791                kvm_update_dr6(vcpu);
 792                break;
 793        case 5:
 794                if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
 795                        return 1; /* #UD */
 796                /* fall through */
 797        default: /* 7 */
 798                if (val & 0xffffffff00000000ULL)
 799                        return -1; /* #GP */
 800                vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
 801                kvm_update_dr7(vcpu);
 802                break;
 803        }
 804
 805        return 0;
 806}
 807
 808int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
 809{
 810        int res;
 811
 812        res = __kvm_set_dr(vcpu, dr, val);
 813        if (res > 0)
 814                kvm_queue_exception(vcpu, UD_VECTOR);
 815        else if (res < 0)
 816                kvm_inject_gp(vcpu, 0);
 817
 818        return res;
 819}
 820EXPORT_SYMBOL_GPL(kvm_set_dr);
 821
 822static int _kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
 823{
 824        switch (dr) {
 825        case 0 ... 3:
 826                *val = vcpu->arch.db[dr];
 827                break;
 828        case 4:
 829                if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
 830                        return 1;
 831                /* fall through */
 832        case 6:
 833                if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
 834                        *val = vcpu->arch.dr6;
 835                else
 836                        *val = kvm_x86_ops->get_dr6(vcpu);
 837                break;
 838        case 5:
 839                if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
 840                        return 1;
 841                /* fall through */
 842        default: /* 7 */
 843                *val = vcpu->arch.dr7;
 844                break;
 845        }
 846
 847        return 0;
 848}
 849
 850int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
 851{
 852        if (_kvm_get_dr(vcpu, dr, val)) {
 853                kvm_queue_exception(vcpu, UD_VECTOR);
 854                return 1;
 855        }
 856        return 0;
 857}
 858EXPORT_SYMBOL_GPL(kvm_get_dr);
 859
 860bool kvm_rdpmc(struct kvm_vcpu *vcpu)
 861{
 862        u32 ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
 863        u64 data;
 864        int err;
 865
 866        err = kvm_pmu_read_pmc(vcpu, ecx, &data);
 867        if (err)
 868                return err;
 869        kvm_register_write(vcpu, VCPU_REGS_RAX, (u32)data);
 870        kvm_register_write(vcpu, VCPU_REGS_RDX, data >> 32);
 871        return err;
 872}
 873EXPORT_SYMBOL_GPL(kvm_rdpmc);
 874
 875/*
  876 * List of MSR numbers which we expose to userspace through KVM_GET_MSRS,
  877 * KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
  878 *
  879 * This list is modified at module load time to reflect the
  880 * capabilities of the host cpu. The capability test skips MSRs that are
  881 * kvm-specific. Those are put at the beginning of the list.
 882 */
 883
 884#define KVM_SAVE_MSRS_BEGIN     12
 885static u32 msrs_to_save[] = {
 886        MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
 887        MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
 888        HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
 889        HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
 890        HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
 891        MSR_KVM_PV_EOI_EN,
 892        MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
 893        MSR_STAR,
 894#ifdef CONFIG_X86_64
 895        MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
 896#endif
 897        MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
 898        MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS
 899};
 900
 901static unsigned num_msrs_to_save;
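
/*
 * Illustrative userspace view, not kernel code: msrs_to_save (trimmed to
 * num_msrs_to_save at module load) together with emulated_msrs is what a
 * VMM gets back from KVM_GET_MSR_INDEX_LIST on the /dev/kvm fd.  The name
 * "kvm_fd" and the 256-entry buffer are assumptions made up for the
 * example:
 *
 *      struct kvm_msr_list *list =
 *              malloc(sizeof(*list) + 256 * sizeof(__u32));
 *      list->nmsrs = 256;
 *      ret = ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list);
 *
 * On success list->indices[0..list->nmsrs-1] holds the MSR numbers; if the
 * buffer is too small the ioctl fails with E2BIG and nmsrs is updated to
 * the required count.
 */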
 902
 903static const u32 emulated_msrs[] = {
 904        MSR_IA32_TSC_ADJUST,
 905        MSR_IA32_TSCDEADLINE,
 906        MSR_IA32_MISC_ENABLE,
 907        MSR_IA32_MCG_STATUS,
 908        MSR_IA32_MCG_CTL,
 909};
 910
 911bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
 912{
 913        if (efer & efer_reserved_bits)
 914                return false;
 915
 916        if (efer & EFER_FFXSR) {
 917                struct kvm_cpuid_entry2 *feat;
 918
 919                feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
 920                if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT)))
 921                        return false;
 922        }
 923
 924        if (efer & EFER_SVME) {
 925                struct kvm_cpuid_entry2 *feat;
 926
 927                feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
 928                if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM)))
 929                        return false;
 930        }
 931
 932        return true;
 933}
 934EXPORT_SYMBOL_GPL(kvm_valid_efer);
 935
 936static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
 937{
 938        u64 old_efer = vcpu->arch.efer;
 939
 940        if (!kvm_valid_efer(vcpu, efer))
 941                return 1;
 942
 943        if (is_paging(vcpu)
 944            && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
 945                return 1;
 946
 947        efer &= ~EFER_LMA;
 948        efer |= vcpu->arch.efer & EFER_LMA;
 949
 950        kvm_x86_ops->set_efer(vcpu, efer);
 951
 952        /* Update reserved bits */
 953        if ((efer ^ old_efer) & EFER_NX)
 954                kvm_mmu_reset_context(vcpu);
 955
 956        return 0;
 957}
 958
 959void kvm_enable_efer_bits(u64 mask)
 960{
  961        efer_reserved_bits &= ~mask;
 962}
 963EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
 964
 965
 966/*
  967 * Writes the MSR value into the appropriate "register".
 968 * Returns 0 on success, non-0 otherwise.
 969 * Assumes vcpu_load() was already called.
 970 */
 971int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
 972{
 973        return kvm_x86_ops->set_msr(vcpu, msr);
 974}
 975
 976/*
 977 * Adapt set_msr() to msr_io()'s calling convention
 978 */
 979static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
 980{
 981        struct msr_data msr;
 982
 983        msr.data = *data;
 984        msr.index = index;
 985        msr.host_initiated = true;
 986        return kvm_set_msr(vcpu, &msr);
 987}
 988
 989#ifdef CONFIG_X86_64
 990struct pvclock_gtod_data {
 991        seqcount_t      seq;
 992
 993        struct { /* extract of a clocksource struct */
 994                int vclock_mode;
 995                cycle_t cycle_last;
 996                cycle_t mask;
 997                u32     mult;
 998                u32     shift;
 999        } clock;
1000
1001        /* open coded 'struct timespec' */
1002        u64             monotonic_time_snsec;
1003        time_t          monotonic_time_sec;
1004};
1005
1006static struct pvclock_gtod_data pvclock_gtod_data;
1007
1008static void update_pvclock_gtod(struct timekeeper *tk)
1009{
1010        struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
1011
1012        write_seqcount_begin(&vdata->seq);
1013
1014        /* copy pvclock gtod data */
1015        vdata->clock.vclock_mode        = tk->clock->archdata.vclock_mode;
1016        vdata->clock.cycle_last         = tk->clock->cycle_last;
1017        vdata->clock.mask               = tk->clock->mask;
1018        vdata->clock.mult               = tk->mult;
1019        vdata->clock.shift              = tk->shift;
1020
1021        vdata->monotonic_time_sec       = tk->xtime_sec
1022                                        + tk->wall_to_monotonic.tv_sec;
1023        vdata->monotonic_time_snsec     = tk->xtime_nsec
1024                                        + (tk->wall_to_monotonic.tv_nsec
1025                                                << tk->shift);
1026        while (vdata->monotonic_time_snsec >=
1027                                        (((u64)NSEC_PER_SEC) << tk->shift)) {
1028                vdata->monotonic_time_snsec -=
1029                                        ((u64)NSEC_PER_SEC) << tk->shift;
1030                vdata->monotonic_time_sec++;
1031        }
1032
1033        write_seqcount_end(&vdata->seq);
1034}
1035#endif
1036
1037
1038static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
1039{
1040        int version;
1041        int r;
1042        struct pvclock_wall_clock wc;
1043        struct timespec boot;
1044
1045        if (!wall_clock)
1046                return;
1047
1048        r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version));
1049        if (r)
1050                return;
1051
1052        if (version & 1)
1053                ++version;  /* first time write, random junk */
1054
1055        ++version;
1056
1057        kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
1058
1059        /*
1060         * The guest calculates current wall clock time by adding
1061         * system time (updated by kvm_guest_time_update below) to the
1062         * wall clock specified here.  guest system time equals host
1063         * system time for us, thus we must fill in host boot time here.
1064         */
1065        getboottime(&boot);
1066
1067        if (kvm->arch.kvmclock_offset) {
1068                struct timespec ts = ns_to_timespec(kvm->arch.kvmclock_offset);
1069                boot = timespec_sub(boot, ts);
1070        }
1071        wc.sec = boot.tv_sec;
1072        wc.nsec = boot.tv_nsec;
1073        wc.version = version;
1074
1075        kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
1076
1077        version++;
1078        kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
1079}
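
/*
 * Illustrative guest-side counterpart, not used by KVM: the version field
 * written above acts as a sequence counter that is odd while the update is
 * in flight.  A guest is expected to read the structure roughly like the
 * sketch below (pvclock_read_wallclock() in the guest kernel is the real
 * implementation); this helper only exists to document the protocol.
 */
static inline void wall_clock_read_sketch(struct pvclock_wall_clock *wc,
                                          u32 *sec, u32 *nsec)
{
        u32 version;

        do {
                version = ACCESS_ONCE(wc->version);
                rmb();          /* read version before the payload  */
                *sec  = wc->sec;
                *nsec = wc->nsec;
                rmb();          /* read payload before the re-check */
        } while ((version & 1) || version != ACCESS_ONCE(wc->version));
}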
1080
1081static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
1082{
1083        uint32_t quotient, remainder;
1084
 1085        /* Don't try to replace this with do_div(); this one calculates
 1086         * "(dividend << 32) / divisor". */
1087        __asm__ ( "divl %4"
1088                  : "=a" (quotient), "=d" (remainder)
1089                  : "0" (0), "1" (dividend), "r" (divisor) );
1090        return quotient;
1091}
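
/*
 * Worked example (illustrative): div_frac() yields the 0.32 fixed-point
 * fraction dividend/divisor (dividend < divisor), e.g.
 *
 *      div_frac(1000000, 3000000) == 0x55555555
 *
 * because (1000000 << 32) / 3000000 == 2^32 / 3 == 1431655765, i.e. the
 * binary fraction ~1/3.
 */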
1092
1093static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz,
1094                               s8 *pshift, u32 *pmultiplier)
1095{
1096        uint64_t scaled64;
1097        int32_t  shift = 0;
1098        uint64_t tps64;
1099        uint32_t tps32;
1100
1101        tps64 = base_khz * 1000LL;
1102        scaled64 = scaled_khz * 1000LL;
1103        while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) {
1104                tps64 >>= 1;
1105                shift--;
1106        }
1107
1108        tps32 = (uint32_t)tps64;
1109        while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) {
1110                if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000)
1111                        scaled64 >>= 1;
1112                else
1113                        tps32 <<= 1;
1114                shift++;
1115        }
1116
1117        *pshift = shift;
1118        *pmultiplier = div_frac(scaled64, tps32);
1119
1120        pr_debug("%s: base_khz %u => %u, shift %d, mul %u\n",
1121                 __func__, base_khz, scaled_khz, shift, *pmultiplier);
1122}
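
/*
 * Worked example (illustrative numbers): kvm_get_time_scale(2600000, 1000000,
 * &shift, &mult), which rescales a 1 GHz base (nanoseconds) to a 2.6 GHz TSC,
 * ends with shift = 2 and mult = div_frac(2600000000, 4000000000) =
 * 2791728742 (0.65 in 0.32 fixed point).  pvclock_scale_delta(ns, mult, shift)
 * then returns ((ns * mult) >> 32) << 2 == ns * 2.6, converting nanoseconds
 * into 2.6 GHz TSC cycles; nsec_to_cycles() and compute_guest_tsc() below use
 * the virtual_tsc_{mult,shift} pair produced this way.
 */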
1123
1124static inline u64 get_kernel_ns(void)
1125{
1126        struct timespec ts;
1127
1128        ktime_get_ts(&ts);
1129        monotonic_to_bootbased(&ts);
1130        return timespec_to_ns(&ts);
1131}
1132
1133#ifdef CONFIG_X86_64
1134static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);
1135#endif
1136
1137static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
1138unsigned long max_tsc_khz;
1139
1140static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
1141{
1142        return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult,
1143                                   vcpu->arch.virtual_tsc_shift);
1144}
1145
1146static u32 adjust_tsc_khz(u32 khz, s32 ppm)
1147{
1148        u64 v = (u64)khz * (1000000 + ppm);
1149        do_div(v, 1000000);
1150        return v;
1151}
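
/*
 * Worked example (illustrative): with the default tsc_tolerance_ppm of 250
 * and a 2600000 kHz host TSC, adjust_tsc_khz(2600000, -250) = 2599350 and
 * adjust_tsc_khz(2600000, 250) = 2600650, so kvm_set_tsc_khz() below only
 * falls back to scaling/catchup when the requested guest rate lies outside
 * the [2599350, 2600650] kHz window.
 */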
1152
1153static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
1154{
1155        u32 thresh_lo, thresh_hi;
1156        int use_scaling = 0;
1157
1158        /* tsc_khz can be zero if TSC calibration fails */
1159        if (this_tsc_khz == 0)
1160                return;
1161
1162        /* Compute a scale to convert nanoseconds in TSC cycles */
1163        kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000,
1164                           &vcpu->arch.virtual_tsc_shift,
1165                           &vcpu->arch.virtual_tsc_mult);
1166        vcpu->arch.virtual_tsc_khz = this_tsc_khz;
1167
1168        /*
 1169         * Compute the variation in TSC rate which is acceptable
 1170         * within the tolerance range, and decide whether the
 1171         * requested rate falls within those bounds of the hardware
 1172         * rate.  If so, no scaling or compensation needs to be done.
1173         */
1174        thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
1175        thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);
1176        if (this_tsc_khz < thresh_lo || this_tsc_khz > thresh_hi) {
1177                pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", this_tsc_khz, thresh_lo, thresh_hi);
1178                use_scaling = 1;
1179        }
1180        kvm_x86_ops->set_tsc_khz(vcpu, this_tsc_khz, use_scaling);
1181}
1182
1183static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
1184{
1185        u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec,
1186                                      vcpu->arch.virtual_tsc_mult,
1187                                      vcpu->arch.virtual_tsc_shift);
1188        tsc += vcpu->arch.this_tsc_write;
1189        return tsc;
1190}
1191
1192void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
1193{
1194#ifdef CONFIG_X86_64
1195        bool vcpus_matched;
1196        bool do_request = false;
1197        struct kvm_arch *ka = &vcpu->kvm->arch;
1198        struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
1199
1200        vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
1201                         atomic_read(&vcpu->kvm->online_vcpus));
1202
1203        if (vcpus_matched && gtod->clock.vclock_mode == VCLOCK_TSC)
1204                if (!ka->use_master_clock)
1205                        do_request = 1;
1206
1207        if (!vcpus_matched && ka->use_master_clock)
 1208                do_request = 1;
1209
1210        if (do_request)
1211                kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
1212
1213        trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc,
1214                            atomic_read(&vcpu->kvm->online_vcpus),
1215                            ka->use_master_clock, gtod->clock.vclock_mode);
1216#endif
1217}
1218
1219static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset)
1220{
1221        u64 curr_offset = kvm_x86_ops->read_tsc_offset(vcpu);
1222        vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset;
1223}
1224
1225void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
1226{
1227        struct kvm *kvm = vcpu->kvm;
1228        u64 offset, ns, elapsed;
1229        unsigned long flags;
1230        s64 usdiff;
1231        bool matched;
1232        u64 data = msr->data;
1233
1234        raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
1235        offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
1236        ns = get_kernel_ns();
1237        elapsed = ns - kvm->arch.last_tsc_nsec;
1238
1239        if (vcpu->arch.virtual_tsc_khz) {
1240                int faulted = 0;
1241
1242                /* n.b - signed multiplication and division required */
1243                usdiff = data - kvm->arch.last_tsc_write;
1244#ifdef CONFIG_X86_64
1245                usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz;
1246#else
1247                /* do_div() only does unsigned */
1248                asm("1: idivl %[divisor]\n"
1249                    "2: xor %%edx, %%edx\n"
1250                    "   movl $0, %[faulted]\n"
1251                    "3:\n"
1252                    ".section .fixup,\"ax\"\n"
1253                    "4: movl $1, %[faulted]\n"
1254                    "   jmp  3b\n"
1255                    ".previous\n"
1256
1257                _ASM_EXTABLE(1b, 4b)
1258
1259                : "=A"(usdiff), [faulted] "=r" (faulted)
1260                : "A"(usdiff * 1000), [divisor] "rm"(vcpu->arch.virtual_tsc_khz));
1261
1262#endif
1263                do_div(elapsed, 1000);
1264                usdiff -= elapsed;
1265                if (usdiff < 0)
1266                        usdiff = -usdiff;
1267
1268                /* idivl overflow => difference is larger than USEC_PER_SEC */
1269                if (faulted)
1270                        usdiff = USEC_PER_SEC;
1271        } else
1272                usdiff = USEC_PER_SEC; /* disable TSC match window below */
1273
1274        /*
1275         * Special case: TSC write with a small delta (1 second) of virtual
1276         * cycle time against real time is interpreted as an attempt to
1277         * synchronize the CPU.
1278         *
1279         * For a reliable TSC, we can match TSC offsets, and for an unstable
1280         * TSC, we add elapsed time in this computation.  We could let the
1281         * compensation code attempt to catch up if we fall behind, but
1282         * it's better to try to match offsets from the beginning.
1283         */
1284        if (usdiff < USEC_PER_SEC &&
1285            vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
1286                if (!check_tsc_unstable()) {
1287                        offset = kvm->arch.cur_tsc_offset;
1288                        pr_debug("kvm: matched tsc offset for %llu\n", data);
1289                } else {
1290                        u64 delta = nsec_to_cycles(vcpu, elapsed);
1291                        data += delta;
1292                        offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
1293                        pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
1294                }
1295                matched = true;
1296        } else {
1297                /*
1298                 * We split periods of matched TSC writes into generations.
1299                 * For each generation, we track the original measured
1300                 * nanosecond time, offset, and write, so if TSCs are in
1301                 * sync, we can match exact offset, and if not, we can match
1302                 * exact software computation in compute_guest_tsc()
1303                 *
1304                 * These values are tracked in kvm->arch.cur_xxx variables.
1305                 */
1306                kvm->arch.cur_tsc_generation++;
1307                kvm->arch.cur_tsc_nsec = ns;
1308                kvm->arch.cur_tsc_write = data;
1309                kvm->arch.cur_tsc_offset = offset;
1310                matched = false;
1311                pr_debug("kvm: new tsc generation %u, clock %llu\n",
1312                         kvm->arch.cur_tsc_generation, data);
1313        }
1314
1315        /*
 1316         * We also track the most recent recorded kHz, write and time to
 1317         * allow the matching interval to be extended at each write.
1318         */
1319        kvm->arch.last_tsc_nsec = ns;
1320        kvm->arch.last_tsc_write = data;
1321        kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
1322
1323        vcpu->arch.last_guest_tsc = data;
1324
1325        /* Keep track of which generation this VCPU has synchronized to */
1326        vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;
1327        vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
1328        vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
1329
1330        if (guest_cpuid_has_tsc_adjust(vcpu) && !msr->host_initiated)
1331                update_ia32_tsc_adjust_msr(vcpu, offset);
1332        kvm_x86_ops->write_tsc_offset(vcpu, offset);
1333        raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
1334
1335        spin_lock(&kvm->arch.pvclock_gtod_sync_lock);
1336        if (matched)
1337                kvm->arch.nr_vcpus_matched_tsc++;
1338        else
1339                kvm->arch.nr_vcpus_matched_tsc = 0;
1340
1341        kvm_track_tsc_matching(vcpu);
1342        spin_unlock(&kvm->arch.pvclock_gtod_sync_lock);
1343}
1344
1345EXPORT_SYMBOL_GPL(kvm_write_tsc);
1346
1347#ifdef CONFIG_X86_64
1348
1349static cycle_t read_tsc(void)
1350{
1351        cycle_t ret;
1352        u64 last;
1353
1354        /*
1355         * Empirically, a fence (of type that depends on the CPU)
1356         * before rdtsc is enough to ensure that rdtsc is ordered
1357         * with respect to loads.  The various CPU manuals are unclear
1358         * as to whether rdtsc can be reordered with later loads,
1359         * but no one has ever seen it happen.
1360         */
1361        rdtsc_barrier();
1362        ret = (cycle_t)vget_cycles();
1363
1364        last = pvclock_gtod_data.clock.cycle_last;
1365
1366        if (likely(ret >= last))
1367                return ret;
1368
1369        /*
1370         * GCC likes to generate cmov here, but this branch is extremely
 1371         * predictable (it's just a function of time and the likely is
1372         * very likely) and there's a data dependence, so force GCC
1373         * to generate a branch instead.  I don't barrier() because
1374         * we don't actually need a barrier, and if this function
1375         * ever gets inlined it will generate worse code.
1376         */
1377        asm volatile ("");
1378        return last;
1379}
1380
1381static inline u64 vgettsc(cycle_t *cycle_now)
1382{
1383        long v;
1384        struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
1385
1386        *cycle_now = read_tsc();
1387
1388        v = (*cycle_now - gtod->clock.cycle_last) & gtod->clock.mask;
1389        return v * gtod->clock.mult;
1390}
1391
1392static int do_monotonic(struct timespec *ts, cycle_t *cycle_now)
1393{
1394        unsigned long seq;
1395        u64 ns;
1396        int mode;
1397        struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
1398
1399        ts->tv_nsec = 0;
1400        do {
1401                seq = read_seqcount_begin(&gtod->seq);
1402                mode = gtod->clock.vclock_mode;
1403                ts->tv_sec = gtod->monotonic_time_sec;
1404                ns = gtod->monotonic_time_snsec;
1405                ns += vgettsc(cycle_now);
1406                ns >>= gtod->clock.shift;
1407        } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
1408        timespec_add_ns(ts, ns);
1409
1410        return mode;
1411}
1412
1413/* returns true if host is using tsc clocksource */
1414static bool kvm_get_time_and_clockread(s64 *kernel_ns, cycle_t *cycle_now)
1415{
1416        struct timespec ts;
1417
1418        /* checked again under seqlock below */
1419        if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC)
1420                return false;
1421
1422        if (do_monotonic(&ts, cycle_now) != VCLOCK_TSC)
1423                return false;
1424
1425        monotonic_to_bootbased(&ts);
1426        *kernel_ns = timespec_to_ns(&ts);
1427
1428        return true;
1429}
1430#endif
1431
1432/*
1433 *
 1434 * Assuming a stable TSC across physical CPUs, and a stable TSC
1435 * across virtual CPUs, the following condition is possible.
1436 * Each numbered line represents an event visible to both
1437 * CPUs at the next numbered event.
1438 *
1439 * "timespecX" represents host monotonic time. "tscX" represents
1440 * RDTSC value.
1441 *
1442 *              VCPU0 on CPU0           |       VCPU1 on CPU1
1443 *
1444 * 1.  read timespec0,tsc0
1445 * 2.                                   | timespec1 = timespec0 + N
1446 *                                      | tsc1 = tsc0 + M
1447 * 3. transition to guest               | transition to guest
1448 * 4. ret0 = timespec0 + (rdtsc - tsc0) |
1449 * 5.                                   | ret1 = timespec1 + (rdtsc - tsc1)
1450 *                                      | ret1 = timespec0 + N + (rdtsc - (tsc0 + M))
1451 *
1452 * Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity:
1453 *
1454 *      - ret0 < ret1
1455 *      - timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M))
1456 *              ...
1457 *      - 0 < N - M => M < N
1458 *
1459 * That is, when timespec0 != timespec1, M < N. Unfortunately that is not
1460 * always the case (the difference between two distinct xtime instances
 1461 * might be smaller than the difference between corresponding TSC reads,
 1462 * when updating guest vcpus' pvclock areas).
1463 *
1464 * To avoid that problem, do not allow visibility of distinct
1465 * system_timestamp/tsc_timestamp values simultaneously: use a master
1466 * copy of host monotonic time values. Update that master copy
1467 * in lockstep.
1468 *
1469 * Rely on synchronization of host TSCs and guest TSCs for monotonicity.
1470 *
1471 */
1472
1473static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
1474{
1475#ifdef CONFIG_X86_64
1476        struct kvm_arch *ka = &kvm->arch;
1477        int vclock_mode;
1478        bool host_tsc_clocksource, vcpus_matched;
1479
1480        vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
1481                        atomic_read(&kvm->online_vcpus));
1482
1483        /*
 1484         * If the host uses the TSC clocksource, then pass through the TSC as stable
1485         * to the guest.
1486         */
1487        host_tsc_clocksource = kvm_get_time_and_clockread(
1488                                        &ka->master_kernel_ns,
1489                                        &ka->master_cycle_now);
1490
1491        ka->use_master_clock = host_tsc_clocksource && vcpus_matched
1492                                && !backwards_tsc_observed;
1493
1494        if (ka->use_master_clock)
1495                atomic_set(&kvm_guest_has_master_clock, 1);
1496
1497        vclock_mode = pvclock_gtod_data.clock.vclock_mode;
1498        trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode,
1499                                        vcpus_matched);
1500#endif
1501}
1502
1503static void kvm_gen_update_masterclock(struct kvm *kvm)
1504{
1505#ifdef CONFIG_X86_64
1506        int i;
1507        struct kvm_vcpu *vcpu;
1508        struct kvm_arch *ka = &kvm->arch;
1509
1510        spin_lock(&ka->pvclock_gtod_sync_lock);
1511        kvm_make_mclock_inprogress_request(kvm);
1512        /* no guest entries from this point */
1513        pvclock_update_vm_gtod_copy(kvm);
1514
1515        kvm_for_each_vcpu(i, vcpu, kvm)
1516                set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
1517
1518        /* guest entries allowed */
1519        kvm_for_each_vcpu(i, vcpu, kvm)
1520                clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests);
1521
1522        spin_unlock(&ka->pvclock_gtod_sync_lock);
1523#endif
1524}
1525
1526static int kvm_guest_time_update(struct kvm_vcpu *v)
1527{
1528        unsigned long flags, this_tsc_khz;
1529        struct kvm_vcpu_arch *vcpu = &v->arch;
1530        struct kvm_arch *ka = &v->kvm->arch;
1531        s64 kernel_ns;
1532        u64 tsc_timestamp, host_tsc;
1533        struct pvclock_vcpu_time_info guest_hv_clock;
1534        u8 pvclock_flags;
1535        bool use_master_clock;
1536
1537        kernel_ns = 0;
1538        host_tsc = 0;
1539
1540        /*
 1541         * If the host uses the TSC clocksource, then pass through the TSC as stable
1542         * to the guest.
1543         */
1544        spin_lock(&ka->pvclock_gtod_sync_lock);
1545        use_master_clock = ka->use_master_clock;
1546        if (use_master_clock) {
1547                host_tsc = ka->master_cycle_now;
1548                kernel_ns = ka->master_kernel_ns;
1549        }
1550        spin_unlock(&ka->pvclock_gtod_sync_lock);
1551
1552        /* Keep irq disabled to prevent changes to the clock */
1553        local_irq_save(flags);
1554        this_tsc_khz = __get_cpu_var(cpu_tsc_khz);
1555        if (unlikely(this_tsc_khz == 0)) {
1556                local_irq_restore(flags);
1557                kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
1558                return 1;
1559        }
1560        if (!use_master_clock) {
1561                host_tsc = native_read_tsc();
1562                kernel_ns = get_kernel_ns();
1563        }
1564
1565        tsc_timestamp = kvm_x86_ops->read_l1_tsc(v, host_tsc);
1566
1567        /*
1568         * We may have to catch up the TSC to match elapsed wall clock
1569         * time for two reasons, even if kvmclock is used.
1570         *   1) CPU could have been running below the maximum TSC rate
1571         *   2) Broken TSC compensation resets the base at each VCPU
1572         *      entry to avoid unknown leaps of TSC even when running
1573         *      again on the same CPU.  This may cause apparent elapsed
1574         *      time to disappear, and the guest to stand still or run
1575         *      very slowly.
1576         */
1577        if (vcpu->tsc_catchup) {
1578                u64 tsc = compute_guest_tsc(v, kernel_ns);
1579                if (tsc > tsc_timestamp) {
1580                        adjust_tsc_offset_guest(v, tsc - tsc_timestamp);
1581                        tsc_timestamp = tsc;
1582                }
1583        }
1584
1585        local_irq_restore(flags);
1586
1587        if (!vcpu->pv_time_enabled)
1588                return 0;
1589
1590        if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) {
1591                kvm_get_time_scale(NSEC_PER_SEC / 1000, this_tsc_khz,
1592                                   &vcpu->hv_clock.tsc_shift,
1593                                   &vcpu->hv_clock.tsc_to_system_mul);
1594                vcpu->hw_tsc_khz = this_tsc_khz;
1595        }
1596
1597        /* With all the info we got, fill in the values */
1598        vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
1599        vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
1600        vcpu->last_guest_tsc = tsc_timestamp;
1601
1602        /*
1603         * The interface expects us to write an even number signaling that the
1604         * update is finished. Since the guest won't see the intermediate
1605         * state, we just increase by 2 at the end.
1606         */
1607        vcpu->hv_clock.version += 2;
1608
1609        if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
1610                &guest_hv_clock, sizeof(guest_hv_clock))))
1611                return 0;
1612
1613        /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
1614        pvclock_flags = (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
1615
1616        if (vcpu->pvclock_set_guest_stopped_request) {
1617                pvclock_flags |= PVCLOCK_GUEST_STOPPED;
1618                vcpu->pvclock_set_guest_stopped_request = false;
1619        }
1620
1621        /* If the host uses TSC clocksource, then it is stable */
1622        if (use_master_clock)
1623                pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
1624
1625        vcpu->hv_clock.flags = pvclock_flags;
1626
1627        kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
1628                                &vcpu->hv_clock,
1629                                sizeof(vcpu->hv_clock));
1630        return 0;
1631}
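
/*
 * For reference, the structure written above is consumed by the guest with
 * a seqcount-like retry loop keyed on the version field, which is even
 * whenever the contents are stable.  A minimal guest-side sketch, assuming
 * hv points at the guest's mapping of the same pvclock_vcpu_time_info
 * layout (the in-tree consumer is pvclock_clocksource_read()):
 *
 *	do {
 *		version = hv->version;
 *		rmb();
 *		delta = native_read_tsc() - hv->tsc_timestamp;
 *		if (hv->tsc_shift >= 0)
 *			delta <<= hv->tsc_shift;
 *		else
 *			delta >>= -hv->tsc_shift;
 *		ns = hv->system_time +
 *		     mul_u64_u32_shr(delta, hv->tsc_to_system_mul, 32);
 *		rmb();
 *	} while ((version & 1) || version != hv->version);
 */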
1632
1633/*
1634 * kvmclock updates which are isolated to a given vcpu, such as
1635 * vcpu->cpu migration, should not leave the system_timestamp of
1636 * the rest of the vcpus static. Otherwise NTP frequency
1637 * correction applies to one vcpu's system_timestamp but not
1638 * the others.
1639 *
1640 * So in those cases, request a kvmclock update for all vcpus.
1641 * We need to rate-limit these requests though, as they can
1642 * considerably slow guests that have a large number of vcpus.
1643 * The time for a remote vcpu to update its kvmclock is bound
1644 * by the delay we use to rate-limit the updates.
1645 */
1646
1647#define KVMCLOCK_UPDATE_DELAY msecs_to_jiffies(100)
1648
1649static void kvmclock_update_fn(struct work_struct *work)
1650{
1651        int i;
1652        struct delayed_work *dwork = to_delayed_work(work);
1653        struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
1654                                           kvmclock_update_work);
1655        struct kvm *kvm = container_of(ka, struct kvm, arch);
1656        struct kvm_vcpu *vcpu;
1657
1658        kvm_for_each_vcpu(i, vcpu, kvm) {
1659                set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
1660                kvm_vcpu_kick(vcpu);
1661        }
1662}
1663
1664static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
1665{
1666        struct kvm *kvm = v->kvm;
1667
1668        set_bit(KVM_REQ_CLOCK_UPDATE, &v->requests);
1669        schedule_delayed_work(&kvm->arch.kvmclock_update_work,
1670                                        KVMCLOCK_UPDATE_DELAY);
1671}
1672
1673#define KVMCLOCK_SYNC_PERIOD (300 * HZ)
1674
1675static void kvmclock_sync_fn(struct work_struct *work)
1676{
1677        struct delayed_work *dwork = to_delayed_work(work);
1678        struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
1679                                           kvmclock_sync_work);
1680        struct kvm *kvm = container_of(ka, struct kvm, arch);
1681
1682        schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0);
1683        schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
1684                                        KVMCLOCK_SYNC_PERIOD);
1685}
1686
1687static bool msr_mtrr_valid(unsigned msr)
1688{
1689        switch (msr) {
1690        case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1:
1691        case MSR_MTRRfix64K_00000:
1692        case MSR_MTRRfix16K_80000:
1693        case MSR_MTRRfix16K_A0000:
1694        case MSR_MTRRfix4K_C0000:
1695        case MSR_MTRRfix4K_C8000:
1696        case MSR_MTRRfix4K_D0000:
1697        case MSR_MTRRfix4K_D8000:
1698        case MSR_MTRRfix4K_E0000:
1699        case MSR_MTRRfix4K_E8000:
1700        case MSR_MTRRfix4K_F0000:
1701        case MSR_MTRRfix4K_F8000:
1702        case MSR_MTRRdefType:
1703        case MSR_IA32_CR_PAT:
1704                return true;
1705        case 0x2f8:
1706                return true;
1707        }
1708        return false;
1709}
1710
1711static bool valid_pat_type(unsigned t)
1712{
1713        return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */
1714}
1715
1716static bool valid_mtrr_type(unsigned t)
1717{
1718        return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */
1719}
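
/*
 * Both helpers above use a constant as a small set-membership bitmap: bit
 * t is set iff memory type t is acceptable, so 0xf3 (binary 11110011)
 * encodes {0, 1, 4, 5, 6, 7} for PAT and 0x73 (binary 01110011) encodes
 * {0, 1, 4, 5, 6} for MTRRs.  For example, a hypothetical helper that only
 * accepted UC (0) and WB (6) would test (1 << t) & 0x41.
 */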
1720
1721static bool mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1722{
1723        int i;
1724
1725        if (!msr_mtrr_valid(msr))
1726                return false;
1727
1728        if (msr == MSR_IA32_CR_PAT) {
1729                for (i = 0; i < 8; i++)
1730                        if (!valid_pat_type((data >> (i * 8)) & 0xff))
1731                                return false;
1732                return true;
1733        } else if (msr == MSR_MTRRdefType) {
1734                if (data & ~0xcff)
1735                        return false;
1736                return valid_mtrr_type(data & 0xff);
1737        } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) {
1738                for (i = 0; i < 8 ; i++)
1739                        if (!valid_mtrr_type((data >> (i * 8)) & 0xff))
1740                                return false;
1741                return true;
1742        }
1743
1744        /* variable MTRRs */
1745        return valid_mtrr_type(data & 0xff);
1746}
1747
1748static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1749{
1750        u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
1751
1752        if (!mtrr_valid(vcpu, msr, data))
1753                return 1;
1754
1755        if (msr == MSR_MTRRdefType) {
1756                vcpu->arch.mtrr_state.def_type = data;
1757                vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10;
1758        } else if (msr == MSR_MTRRfix64K_00000)
1759                p[0] = data;
1760        else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
1761                p[1 + msr - MSR_MTRRfix16K_80000] = data;
1762        else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
1763                p[3 + msr - MSR_MTRRfix4K_C0000] = data;
1764        else if (msr == MSR_IA32_CR_PAT)
1765                vcpu->arch.pat = data;
1766        else {  /* Variable MTRRs */
1767                int idx, is_mtrr_mask;
1768                u64 *pt;
1769
1770                idx = (msr - 0x200) / 2;
1771                is_mtrr_mask = msr - 0x200 - 2 * idx;
1772                if (!is_mtrr_mask)
1773                        pt =
1774                          (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
1775                else
1776                        pt =
1777                          (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
1778                *pt = data;
1779        }
1780
1781        kvm_mmu_reset_context(vcpu);
1782        return 0;
1783}
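
/*
 * The variable-range decode above follows the architectural MSR layout:
 * IA32_MTRR_PHYSBASEn lives at 0x200 + 2*n and IA32_MTRR_PHYSMASKn at
 * 0x201 + 2*n.  As a worked example of the arithmetic (nothing beyond
 * what the code already does), a write to MSR 0x203 yields
 *
 *	idx          = (0x203 - 0x200) / 2     = 1
 *	is_mtrr_mask = 0x203 - 0x200 - 2 * idx = 1
 *
 * i.e. it lands in vcpu->arch.mtrr_state.var_ranges[1].mask_lo/mask_hi,
 * the guest's PHYSMASK1.
 */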
1784
1785static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1786{
1787        u64 mcg_cap = vcpu->arch.mcg_cap;
1788        unsigned bank_num = mcg_cap & 0xff;
1789
1790        switch (msr) {
1791        case MSR_IA32_MCG_STATUS:
1792                vcpu->arch.mcg_status = data;
1793                break;
1794        case MSR_IA32_MCG_CTL:
1795                if (!(mcg_cap & MCG_CTL_P))
1796                        return 1;
1797                if (data != 0 && data != ~(u64)0)
1798                        return -1;
1799                vcpu->arch.mcg_ctl = data;
1800                break;
1801        default:
1802                if (msr >= MSR_IA32_MC0_CTL &&
1803                    msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
1804                        u32 offset = msr - MSR_IA32_MC0_CTL;
1805                        /* only 0 or all 1s can be written to IA32_MCi_CTL
1806                         * some Linux kernels though clear bit 10 in bank 4 to
1807                         * work around a BIOS/GART TBL issue on AMD K8s; ignore
1808                         * this to avoid an uncaught #GP in the guest
1809                         */
1810                        if ((offset & 0x3) == 0 &&
1811                            data != 0 && (data | (1 << 10)) != ~(u64)0)
1812                                return -1;
1813                        vcpu->arch.mce_banks[offset] = data;
1814                        break;
1815                }
1816                return 1;
1817        }
1818        return 0;
1819}
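
/*
 * The bank arithmetic above mirrors the architectural MCA layout: bank i
 * owns the four consecutive MSRs starting at MSR_IA32_MC0_CTL + 4*i, in
 * the order CTL, STATUS, ADDR, MISC.  vcpu->arch.mce_banks[] uses the same
 * flat layout, so "offset" indexes it directly and (offset & 0x3) == 0
 * identifies a write to one of the IA32_MCi_CTL registers.
 */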
1820
1821static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
1822{
1823        struct kvm *kvm = vcpu->kvm;
1824        int lm = is_long_mode(vcpu);
1825        u8 *blob_addr = lm ? (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_64
1826                : (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32;
1827        u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
1828                : kvm->arch.xen_hvm_config.blob_size_32;
1829        u32 page_num = data & ~PAGE_MASK;
1830        u64 page_addr = data & PAGE_MASK;
1831        u8 *page;
1832        int r;
1833
1834        r = -E2BIG;
1835        if (page_num >= blob_size)
1836                goto out;
1837        r = -ENOMEM;
1838        page = memdup_user(blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE);
1839        if (IS_ERR(page)) {
1840                r = PTR_ERR(page);
1841                goto out;
1842        }
1843        if (kvm_write_guest(kvm, page_addr, page, PAGE_SIZE))
1844                goto out_free;
1845        r = 0;
1846out_free:
1847        kfree(page);
1848out:
1849        return r;
1850}
1851
1852static bool kvm_hv_hypercall_enabled(struct kvm *kvm)
1853{
1854        return kvm->arch.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE;
1855}
1856
1857static bool kvm_hv_msr_partition_wide(u32 msr)
1858{
1859        bool r = false;
1860        switch (msr) {
1861        case HV_X64_MSR_GUEST_OS_ID:
1862        case HV_X64_MSR_HYPERCALL:
1863        case HV_X64_MSR_REFERENCE_TSC:
1864        case HV_X64_MSR_TIME_REF_COUNT:
1865                r = true;
1866                break;
1867        }
1868
1869        return r;
1870}
1871
1872static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1873{
1874        struct kvm *kvm = vcpu->kvm;
1875
1876        switch (msr) {
1877        case HV_X64_MSR_GUEST_OS_ID:
1878                kvm->arch.hv_guest_os_id = data;
1879                /* setting guest os id to zero disables hypercall page */
1880                if (!kvm->arch.hv_guest_os_id)
1881                        kvm->arch.hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE;
1882                break;
1883        case HV_X64_MSR_HYPERCALL: {
1884                u64 gfn;
1885                unsigned long addr;
1886                u8 instructions[4];
1887
1888                /* if the guest os id is not set, the hypercall page should remain disabled */
1889                if (!kvm->arch.hv_guest_os_id)
1890                        break;
1891                if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) {
1892                        kvm->arch.hv_hypercall = data;
1893                        break;
1894                }
1895                gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT;
1896                addr = gfn_to_hva(kvm, gfn);
1897                if (kvm_is_error_hva(addr))
1898                        return 1;
1899                kvm_x86_ops->patch_hypercall(vcpu, instructions);
1900                ((unsigned char *)instructions)[3] = 0xc3; /* ret */
1901                if (__copy_to_user((void __user *)addr, instructions, 4))
1902                        return 1;
1903                kvm->arch.hv_hypercall = data;
1904                mark_page_dirty(kvm, gfn);
1905                break;
1906        }
1907        case HV_X64_MSR_REFERENCE_TSC: {
1908                u64 gfn;
1909                HV_REFERENCE_TSC_PAGE tsc_ref;
1910                memset(&tsc_ref, 0, sizeof(tsc_ref));
1911                kvm->arch.hv_tsc_page = data;
1912                if (!(data & HV_X64_MSR_TSC_REFERENCE_ENABLE))
1913                        break;
1914                gfn = data >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
1915                if (kvm_write_guest(kvm, data,
1916                        &tsc_ref, sizeof(tsc_ref)))
1917                        return 1;
1918                mark_page_dirty(kvm, gfn);
1919                break;
1920        }
1921        default:
1922                vcpu_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
1923                            "data 0x%llx\n", msr, data);
1924                return 1;
1925        }
1926        return 0;
1927}
1928
1929static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1930{
1931        switch (msr) {
1932        case HV_X64_MSR_APIC_ASSIST_PAGE: {
1933                u64 gfn;
1934                unsigned long addr;
1935
1936                if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) {
1937                        vcpu->arch.hv_vapic = data;
1938                        break;
1939                }
1940                gfn = data >> HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT;
1941                addr = gfn_to_hva(vcpu->kvm, gfn);
1942                if (kvm_is_error_hva(addr))
1943                        return 1;
1944                if (__clear_user((void __user *)addr, PAGE_SIZE))
1945                        return 1;
1946                vcpu->arch.hv_vapic = data;
1947                mark_page_dirty(vcpu->kvm, gfn);
1948                break;
1949        }
1950        case HV_X64_MSR_EOI:
1951                return kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data);
1952        case HV_X64_MSR_ICR:
1953                return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data);
1954        case HV_X64_MSR_TPR:
1955                return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data);
1956        default:
1957                vcpu_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
1958                            "data 0x%llx\n", msr, data);
1959                return 1;
1960        }
1961
1962        return 0;
1963}
1964
1965static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
1966{
1967        gpa_t gpa = data & ~0x3f;
1968
1969        /* Bits 2:5 are reserved, should be zero */
1970        if (data & 0x3c)
1971                return 1;
1972
1973        vcpu->arch.apf.msr_val = data;
1974
1975        if (!(data & KVM_ASYNC_PF_ENABLED)) {
1976                kvm_clear_async_pf_completion_queue(vcpu);
1977                kvm_async_pf_hash_reset(vcpu);
1978                return 0;
1979        }
1980
1981        if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa,
1982                                        sizeof(u32)))
1983                return 1;
1984
1985        vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
1986        kvm_async_pf_wakeup_all(vcpu);
1987        return 0;
1988}
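
/*
 * On the guest side, this MSR is written with the physical address of a
 * 64-byte aligned per-cpu area combined with the enable bits validated
 * above.  A minimal sketch of that enable path, where apf_reason stands
 * in for the guest's real per-cpu structure:
 *
 *	u64 pa = __pa(&apf_reason);
 *
 *	wrmsrl(MSR_KVM_ASYNC_PF_EN,
 *	       pa | KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_SEND_ALWAYS);
 *
 * Writing 0 takes the !KVM_ASYNC_PF_ENABLED branch above and tears the
 * async page fault machinery down again.
 */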
1989
1990static void kvmclock_reset(struct kvm_vcpu *vcpu)
1991{
1992        vcpu->arch.pv_time_enabled = false;
1993}
1994
1995static void accumulate_steal_time(struct kvm_vcpu *vcpu)
1996{
1997        u64 delta;
1998
1999        if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
2000                return;
2001
2002        delta = current->sched_info.run_delay - vcpu->arch.st.last_steal;
2003        vcpu->arch.st.last_steal = current->sched_info.run_delay;
2004        vcpu->arch.st.accum_steal = delta;
2005}
2006
2007static void record_steal_time(struct kvm_vcpu *vcpu)
2008{
2009        if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
2010                return;
2011
2012        if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
2013                &vcpu->arch.st.steal, sizeof(struct kvm_steal_time))))
2014                return;
2015
2016        vcpu->arch.st.steal.steal += vcpu->arch.st.accum_steal;
2017        vcpu->arch.st.steal.version += 2;
2018        vcpu->arch.st.accum_steal = 0;
2019
2020        kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
2021                &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
2022}
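
/*
 * The guest consumes this area with the same even/odd version protocol
 * used for kvmclock: read the version, then the steal value, and retry if
 * the version is odd or has changed underneath.  A minimal guest-side
 * sketch, where src points at the shared kvm_steal_time mapping (compare
 * kvm_steal_clock() on the guest side):
 *
 *	do {
 *		version = src->version;
 *		rmb();
 *		steal = src->steal;
 *		rmb();
 *	} while ((version & 1) || version != src->version);
 */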
2023
2024int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2025{
2026        bool pr = false;
2027        u32 msr = msr_info->index;
2028        u64 data = msr_info->data;
2029
2030        switch (msr) {
2031        case MSR_AMD64_NB_CFG:
2032        case MSR_IA32_UCODE_REV:
2033        case MSR_IA32_UCODE_WRITE:
2034        case MSR_VM_HSAVE_PA:
2035        case MSR_AMD64_PATCH_LOADER:
2036        case MSR_AMD64_BU_CFG2:
2037                break;
2038
2039        case MSR_EFER:
2040                return set_efer(vcpu, data);
2041        case MSR_K7_HWCR:
2042                data &= ~(u64)0x40;     /* ignore flush filter disable */
2043                data &= ~(u64)0x100;    /* ignore ignne emulation enable */
2044                data &= ~(u64)0x8;      /* ignore TLB cache disable */
2045                if (data != 0) {
2046                        vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
2047                                    data);
2048                        return 1;
2049                }
2050                break;
2051        case MSR_FAM10H_MMIO_CONF_BASE:
2052                if (data != 0) {
2053                        vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
2054                                    "0x%llx\n", data);
2055                        return 1;
2056                }
2057                break;
2058        case MSR_IA32_DEBUGCTLMSR:
2059                if (!data) {
2060                        /* We support the non-activated case already */
2061                        break;
2062                } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
2063                        /* Values other than LBR and BTF are vendor-specific,
2064                           thus reserved and should throw a #GP */
2065                        return 1;
2066                }
2067                vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
2068                            __func__, data);
2069                break;
2070        case 0x200 ... 0x2ff:
2071                return set_msr_mtrr(vcpu, msr, data);
2072        case MSR_IA32_APICBASE:
2073                return kvm_set_apic_base(vcpu, msr_info);
2074        case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
2075                return kvm_x2apic_msr_write(vcpu, msr, data);
2076        case MSR_IA32_TSCDEADLINE:
2077                kvm_set_lapic_tscdeadline_msr(vcpu, data);
2078                break;
2079        case MSR_IA32_TSC_ADJUST:
2080                if (guest_cpuid_has_tsc_adjust(vcpu)) {
2081                        if (!msr_info->host_initiated) {
2082                                u64 adj = data - vcpu->arch.ia32_tsc_adjust_msr;
2083                                kvm_x86_ops->adjust_tsc_offset(vcpu, adj, true);
2084                        }
2085                        vcpu->arch.ia32_tsc_adjust_msr = data;
2086                }
2087                break;
2088        case MSR_IA32_MISC_ENABLE:
2089                vcpu->arch.ia32_misc_enable_msr = data;
2090                break;
2091        case MSR_KVM_WALL_CLOCK_NEW:
2092        case MSR_KVM_WALL_CLOCK:
2093                vcpu->kvm->arch.wall_clock = data;
2094                kvm_write_wall_clock(vcpu->kvm, data);
2095                break;
2096        case MSR_KVM_SYSTEM_TIME_NEW:
2097        case MSR_KVM_SYSTEM_TIME: {
2098                u64 gpa_offset;
2099                kvmclock_reset(vcpu);
2100
2101                vcpu->arch.time = data;
2102                kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
2103
2104                /* we verify if the enable bit is set... */
2105                if (!(data & 1))
2106                        break;
2107
2108                gpa_offset = data & ~(PAGE_MASK | 1);
2109
2110                if (kvm_gfn_to_hva_cache_init(vcpu->kvm,
2111                     &vcpu->arch.pv_time, data & ~1ULL,
2112                     sizeof(struct pvclock_vcpu_time_info)))
2113                        vcpu->arch.pv_time_enabled = false;
2114                else
2115                        vcpu->arch.pv_time_enabled = true;
2116
2117                break;
2118        }
2119        case MSR_KVM_ASYNC_PF_EN:
2120                if (kvm_pv_enable_async_pf(vcpu, data))
2121                        return 1;
2122                break;
2123        case MSR_KVM_STEAL_TIME:
2124
2125                if (unlikely(!sched_info_on()))
2126                        return 1;
2127
2128                if (data & KVM_STEAL_RESERVED_MASK)
2129                        return 1;
2130
2131                if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime,
2132                                                data & KVM_STEAL_VALID_BITS,
2133                                                sizeof(struct kvm_steal_time)))
2134                        return 1;
2135
2136                vcpu->arch.st.msr_val = data;
2137
2138                if (!(data & KVM_MSR_ENABLED))
2139                        break;
2140
2141                vcpu->arch.st.last_steal = current->sched_info.run_delay;
2142
2143                preempt_disable();
2144                accumulate_steal_time(vcpu);
2145                preempt_enable();
2146
2147                kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
2148
2149                break;
2150        case MSR_KVM_PV_EOI_EN:
2151                if (kvm_lapic_enable_pv_eoi(vcpu, data))
2152                        return 1;
2153                break;
2154
2155        case MSR_IA32_MCG_CTL:
2156        case MSR_IA32_MCG_STATUS:
2157        case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
2158                return set_msr_mce(vcpu, msr, data);
2159
2160        /* Performance counters are not protected by a CPUID bit,
2161         * so we should check all of them in the generic path for the sake of
2162         * cross-vendor migration.
2163         * Writing a zero into the event select MSRs disables them,
2164         * which we perfectly emulate ;-). Any other value should at least
2165         * be reported, as some guests depend on them.
2166         */
2167        case MSR_K7_EVNTSEL0:
2168        case MSR_K7_EVNTSEL1:
2169        case MSR_K7_EVNTSEL2:
2170        case MSR_K7_EVNTSEL3:
2171                if (data != 0)
2172                        vcpu_unimpl(vcpu, "unimplemented perfctr wrmsr: "
2173                                    "0x%x data 0x%llx\n", msr, data);
2174                break;
2175        /* at least RHEL 4 unconditionally writes to the perfctr registers,
2176         * so we ignore writes to make it happy.
2177         */
2178        case MSR_K7_PERFCTR0:
2179        case MSR_K7_PERFCTR1:
2180        case MSR_K7_PERFCTR2:
2181        case MSR_K7_PERFCTR3:
2182                vcpu_unimpl(vcpu, "unimplemented perfctr wrmsr: "
2183                            "0x%x data 0x%llx\n", msr, data);
2184                break;
2185        case MSR_P6_PERFCTR0:
2186        case MSR_P6_PERFCTR1:
2187                pr = true;
2188        case MSR_P6_EVNTSEL0:
2189        case MSR_P6_EVNTSEL1:
2190                if (kvm_pmu_msr(vcpu, msr))
2191                        return kvm_pmu_set_msr(vcpu, msr_info);
2192
2193                if (pr || data != 0)
2194                        vcpu_unimpl(vcpu, "disabled perfctr wrmsr: "
2195                                    "0x%x data 0x%llx\n", msr, data);
2196                break;
2197        case MSR_K7_CLK_CTL:
2198                /*
2199                 * Ignore all writes to this no longer documented MSR.
2200                 * Writes are only relevant for old K7 processors, all of
2201                 * which pre-date SVM, but writing to it is a workaround
2202                 * recommended by AMD for those chips. Since the affected
2203                 * processor models can be specified on the command line,
2204                 * the writes simply need to be ignored here.
2205                 */
2206                break;
2207        case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
2208                if (kvm_hv_msr_partition_wide(msr)) {
2209                        int r;
2210                        mutex_lock(&vcpu->kvm->lock);
2211                        r = set_msr_hyperv_pw(vcpu, msr, data);
2212                        mutex_unlock(&vcpu->kvm->lock);
2213                        return r;
2214                } else
2215                        return set_msr_hyperv(vcpu, msr, data);
2216                break;
2217        case MSR_IA32_BBL_CR_CTL3:
2218                /* Drop writes to this legacy MSR -- see rdmsr
2219                 * counterpart for further detail.
2220                 */
2221                vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data);
2222                break;
2223        case MSR_AMD64_OSVW_ID_LENGTH:
2224                if (!guest_cpuid_has_osvw(vcpu))
2225                        return 1;
2226                vcpu->arch.osvw.length = data;
2227                break;
2228        case MSR_AMD64_OSVW_STATUS:
2229                if (!guest_cpuid_has_osvw(vcpu))
2230                        return 1;
2231                vcpu->arch.osvw.status = data;
2232                break;
2233        default:
2234                if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
2235                        return xen_hvm_config(vcpu, data);
2236                if (kvm_pmu_msr(vcpu, msr))
2237                        return kvm_pmu_set_msr(vcpu, msr_info);
2238                if (!ignore_msrs) {
2239                        vcpu_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
2240                                    msr, data);
2241                        return 1;
2242                } else {
2243                        vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n",
2244                                    msr, data);
2245                        break;
2246                }
2247        }
2248        return 0;
2249}
2250EXPORT_SYMBOL_GPL(kvm_set_msr_common);
2251
2252
2253/*
2254 * Reads an msr value (of 'msr_index') into 'pdata'.
2255 * Returns 0 on success, non-0 otherwise.
2256 * Assumes vcpu_load() was already called.
2257 */
2258int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
2259{
2260        return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
2261}
2262
2263static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
2264{
2265        u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
2266
2267        if (!msr_mtrr_valid(msr))
2268                return 1;
2269
2270        if (msr == MSR_MTRRdefType)
2271                *pdata = vcpu->arch.mtrr_state.def_type +
2272                         (vcpu->arch.mtrr_state.enabled << 10);
2273        else if (msr == MSR_MTRRfix64K_00000)
2274                *pdata = p[0];
2275        else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
2276                *pdata = p[1 + msr - MSR_MTRRfix16K_80000];
2277        else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
2278                *pdata = p[3 + msr - MSR_MTRRfix4K_C0000];
2279        else if (msr == MSR_IA32_CR_PAT)
2280                *pdata = vcpu->arch.pat;
2281        else {  /* Variable MTRRs */
2282                int idx, is_mtrr_mask;
2283                u64 *pt;
2284
2285                idx = (msr - 0x200) / 2;
2286                is_mtrr_mask = msr - 0x200 - 2 * idx;
2287                if (!is_mtrr_mask)
2288                        pt =
2289                          (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
2290                else
2291                        pt =
2292                          (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
2293                *pdata = *pt;
2294        }
2295
2296        return 0;
2297}
2298
2299static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
2300{
2301        u64 data;
2302        u64 mcg_cap = vcpu->arch.mcg_cap;
2303        unsigned bank_num = mcg_cap & 0xff;
2304
2305        switch (msr) {
2306        case MSR_IA32_P5_MC_ADDR:
2307        case MSR_IA32_P5_MC_TYPE:
2308                data = 0;
2309                break;
2310        case MSR_IA32_MCG_CAP:
2311                data = vcpu->arch.mcg_cap;
2312                break;
2313        case MSR_IA32_MCG_CTL:
2314                if (!(mcg_cap & MCG_CTL_P))
2315                        return 1;
2316                data = vcpu->arch.mcg_ctl;
2317                break;
2318        case MSR_IA32_MCG_STATUS:
2319                data = vcpu->arch.mcg_status;
2320                break;
2321        default:
2322                if (msr >= MSR_IA32_MC0_CTL &&
2323                    msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
2324                        u32 offset = msr - MSR_IA32_MC0_CTL;
2325                        data = vcpu->arch.mce_banks[offset];
2326                        break;
2327                }
2328                return 1;
2329        }
2330        *pdata = data;
2331        return 0;
2332}
2333
2334static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
2335{
2336        u64 data = 0;
2337        struct kvm *kvm = vcpu->kvm;
2338
2339        switch (msr) {
2340        case HV_X64_MSR_GUEST_OS_ID:
2341                data = kvm->arch.hv_guest_os_id;
2342                break;
2343        case HV_X64_MSR_HYPERCALL:
2344                data = kvm->arch.hv_hypercall;
2345                break;
2346        case HV_X64_MSR_TIME_REF_COUNT: {
2347                data =
2348                     div_u64(get_kernel_ns() + kvm->arch.kvmclock_offset, 100);
2349                break;
2350        }
2351        case HV_X64_MSR_REFERENCE_TSC:
2352                data = kvm->arch.hv_tsc_page;
2353                break;
2354        default:
2355                vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
2356                return 1;
2357        }
2358
2359        *pdata = data;
2360        return 0;
2361}
2362
2363static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
2364{
2365        u64 data = 0;
2366
2367        switch (msr) {
2368        case HV_X64_MSR_VP_INDEX: {
2369                int r;
2370                struct kvm_vcpu *v;
2371                kvm_for_each_vcpu(r, v, vcpu->kvm) {
2372                        if (v == vcpu) {
2373                                data = r;
2374                                break;
2375                        }
2376                }
2377                break;
2378        }
2379        case HV_X64_MSR_EOI:
2380                return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata);
2381        case HV_X64_MSR_ICR:
2382                return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata);
2383        case HV_X64_MSR_TPR:
2384                return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata);
2385        case HV_X64_MSR_APIC_ASSIST_PAGE:
2386                data = vcpu->arch.hv_vapic;
2387                break;
2388        default:
2389                vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
2390                return 1;
2391        }
2392        *pdata = data;
2393        return 0;
2394}
2395
2396int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
2397{
2398        u64 data;
2399
2400        switch (msr) {
2401        case MSR_IA32_PLATFORM_ID:
2402        case MSR_IA32_EBL_CR_POWERON:
2403        case MSR_IA32_DEBUGCTLMSR:
2404        case MSR_IA32_LASTBRANCHFROMIP:
2405        case MSR_IA32_LASTBRANCHTOIP:
2406        case MSR_IA32_LASTINTFROMIP:
2407        case MSR_IA32_LASTINTTOIP:
2408        case MSR_K8_SYSCFG:
2409        case MSR_K7_HWCR:
2410        case MSR_VM_HSAVE_PA:
2411        case MSR_K7_EVNTSEL0:
2412        case MSR_K7_PERFCTR0:
2413        case MSR_K8_INT_PENDING_MSG:
2414        case MSR_AMD64_NB_CFG:
2415        case MSR_FAM10H_MMIO_CONF_BASE:
2416        case MSR_AMD64_BU_CFG2:
2417                data = 0;
2418                break;
2419        case MSR_P6_PERFCTR0:
2420        case MSR_P6_PERFCTR1:
2421        case MSR_P6_EVNTSEL0:
2422        case MSR_P6_EVNTSEL1:
2423                if (kvm_pmu_msr(vcpu, msr))
2424                        return kvm_pmu_get_msr(vcpu, msr, pdata);
2425                data = 0;
2426                break;
2427        case MSR_IA32_UCODE_REV:
2428                data = 0x100000000ULL;
2429                break;
2430        case MSR_MTRRcap:
2431                data = 0x500 | KVM_NR_VAR_MTRR;
2432                break;
2433        case 0x200 ... 0x2ff:
2434                return get_msr_mtrr(vcpu, msr, pdata);
2435        case 0xcd: /* fsb frequency */
2436                data = 3;
2437                break;
2438                /*
2439                 * MSR_EBC_FREQUENCY_ID
2440                 * Conservative value valid for even the basic CPU models.
2441                 * Models 0,1: 000 in bits 23:21 indicating a bus speed of
2442                 * 100MHz, model 2: 000 in bits 18:16 indicating 100MHz,
2443                 * and 266MHz for models 3 and 4. Set Core Clock
2444                 * Frequency to System Bus Frequency Ratio to 1 (bits
2445                 * 31:24) even though these bits are only valid for CPU
2446                 * models > 2; otherwise guests may end up dividing or
2447                 * multiplying by zero.
2448                 */
2449        case MSR_EBC_FREQUENCY_ID:
2450                data = 1 << 24;
2451                break;
2452        case MSR_IA32_APICBASE:
2453                data = kvm_get_apic_base(vcpu);
2454                break;
2455        case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
2456                return kvm_x2apic_msr_read(vcpu, msr, pdata);
2457                break;
2458        case MSR_IA32_TSCDEADLINE:
2459                data = kvm_get_lapic_tscdeadline_msr(vcpu);
2460                break;
2461        case MSR_IA32_TSC_ADJUST:
2462                data = (u64)vcpu->arch.ia32_tsc_adjust_msr;
2463                break;
2464        case MSR_IA32_MISC_ENABLE:
2465                data = vcpu->arch.ia32_misc_enable_msr;
2466                break;
2467        case MSR_IA32_PERF_STATUS:
2468                /* TSC increment by tick */
2469                data = 1000ULL;
2470                /* CPU multiplier */
2471                data |= (((uint64_t)4ULL) << 40);
2472                break;
2473        case MSR_EFER:
2474                data = vcpu->arch.efer;
2475                break;
2476        case MSR_KVM_WALL_CLOCK:
2477        case MSR_KVM_WALL_CLOCK_NEW:
2478                data = vcpu->kvm->arch.wall_clock;
2479                break;
2480        case MSR_KVM_SYSTEM_TIME:
2481        case MSR_KVM_SYSTEM_TIME_NEW:
2482                data = vcpu->arch.time;
2483                break;
2484        case MSR_KVM_ASYNC_PF_EN:
2485                data = vcpu->arch.apf.msr_val;
2486                break;
2487        case MSR_KVM_STEAL_TIME:
2488                data = vcpu->arch.st.msr_val;
2489                break;
2490        case MSR_KVM_PV_EOI_EN:
2491                data = vcpu->arch.pv_eoi.msr_val;
2492                break;
2493        case MSR_IA32_P5_MC_ADDR:
2494        case MSR_IA32_P5_MC_TYPE:
2495        case MSR_IA32_MCG_CAP:
2496        case MSR_IA32_MCG_CTL:
2497        case MSR_IA32_MCG_STATUS:
2498        case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
2499                return get_msr_mce(vcpu, msr, pdata);
2500        case MSR_K7_CLK_CTL:
2501                /*
2502                 * Provide the expected ramp-up count for K7. All other
2503                 * fields are set to zero, indicating minimum divisors for
2504                 * every field.
2505                 *
2506                 * This prevents guest kernels on an AMD host with CPU
2507                 * type 6, model 8 and higher from exploding due to
2508                 * the rdmsr failing.
2509                 */
2510                data = 0x20000000;
2511                break;
2512        case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
2513                if (kvm_hv_msr_partition_wide(msr)) {
2514                        int r;
2515                        mutex_lock(&vcpu->kvm->lock);
2516                        r = get_msr_hyperv_pw(vcpu, msr, pdata);
2517                        mutex_unlock(&vcpu->kvm->lock);
2518                        return r;
2519                } else
2520                        return get_msr_hyperv(vcpu, msr, pdata);
2521                break;
2522        case MSR_IA32_BBL_CR_CTL3:
2523                /* This legacy MSR exists but isn't fully documented in current
2524                 * silicon.  It is however accessed by winxp in very narrow
2525                 * scenarios where it sets bit #19, itself documented as
2526                 * a "reserved" bit.  Best effort attempt to source coherent
2527                 * read data here should the balance of the register be
2528                 * interpreted by the guest:
2529                 *
2530                 * L2 cache control register 3: 64GB range, 256KB size,
2531                 * enabled, latency 0x1, configured
2532                 */
2533                data = 0xbe702111;
2534                break;
2535        case MSR_AMD64_OSVW_ID_LENGTH:
2536                if (!guest_cpuid_has_osvw(vcpu))
2537                        return 1;
2538                data = vcpu->arch.osvw.length;
2539                break;
2540        case MSR_AMD64_OSVW_STATUS:
2541                if (!guest_cpuid_has_osvw(vcpu))
2542                        return 1;
2543                data = vcpu->arch.osvw.status;
2544                break;
2545        default:
2546                if (kvm_pmu_msr(vcpu, msr))
2547                        return kvm_pmu_get_msr(vcpu, msr, pdata);
2548                if (!ignore_msrs) {
2549                        vcpu_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
2550                        return 1;
2551                } else {
2552                        vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr);
2553                        data = 0;
2554                }
2555                break;
2556        }
2557        *pdata = data;
2558        return 0;
2559}
2560EXPORT_SYMBOL_GPL(kvm_get_msr_common);
2561
2562/*
2563 * Read or write a bunch of msrs. All parameters are kernel addresses.
2564 *
2565 * @return number of msrs set successfully.
2566 */
2567static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
2568                    struct kvm_msr_entry *entries,
2569                    int (*do_msr)(struct kvm_vcpu *vcpu,
2570                                  unsigned index, u64 *data))
2571{
2572        int i, idx;
2573
2574        idx = srcu_read_lock(&vcpu->kvm->srcu);
2575        for (i = 0; i < msrs->nmsrs; ++i)
2576                if (do_msr(vcpu, entries[i].index, &entries[i].data))
2577                        break;
2578        srcu_read_unlock(&vcpu->kvm->srcu, idx);
2579
2580        return i;
2581}
2582
2583/*
2584 * Read or write a bunch of msrs. Parameters are user addresses.
2585 *
2586 * @return number of msrs set successfully.
2587 */
2588static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
2589                  int (*do_msr)(struct kvm_vcpu *vcpu,
2590                                unsigned index, u64 *data),
2591                  int writeback)
2592{
2593        struct kvm_msrs msrs;
2594        struct kvm_msr_entry *entries;
2595        int r, n;
2596        unsigned size;
2597
2598        r = -EFAULT;
2599        if (copy_from_user(&msrs, user_msrs, sizeof msrs))
2600                goto out;
2601
2602        r = -E2BIG;
2603        if (msrs.nmsrs >= MAX_IO_MSRS)
2604                goto out;
2605
2606        size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
2607        entries = memdup_user(user_msrs->entries, size);
2608        if (IS_ERR(entries)) {
2609                r = PTR_ERR(entries);
2610                goto out;
2611        }
2612
2613        r = n = __msr_io(vcpu, &msrs, entries, do_msr);
2614        if (r < 0)
2615                goto out_free;
2616
2617        r = -EFAULT;
2618        if (writeback && copy_to_user(user_msrs->entries, entries, size))
2619                goto out_free;
2620
2621        r = n;
2622
2623out_free:
2624        kfree(entries);
2625out:
2626        return r;
2627}
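
/*
 * msr_io() backs the KVM_GET_MSRS and KVM_SET_MSRS vcpu ioctls.  A minimal
 * user-space sketch of writing a value of 1 to MSR 0x1a0 (IA32_MISC_ENABLE),
 * with error handling omitted and vcpu_fd assumed to be an already created
 * vcpu file descriptor:
 *
 *	struct {
 *		struct kvm_msrs hdr;
 *		struct kvm_msr_entry entry;
 *	} msrs = {
 *		.hdr.nmsrs   = 1,
 *		.entry.index = 0x1a0,
 *		.entry.data  = 1,
 *	};
 *	int done = ioctl(vcpu_fd, KVM_SET_MSRS, &msrs);
 *
 * The return value is the number of entries actually processed, matching
 * the "return i" in __msr_io() above.
 */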
2628
2629int kvm_dev_ioctl_check_extension(long ext)
2630{
2631        int r;
2632
2633        switch (ext) {
2634        case KVM_CAP_IRQCHIP:
2635        case KVM_CAP_HLT:
2636        case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
2637        case KVM_CAP_SET_TSS_ADDR:
2638        case KVM_CAP_EXT_CPUID:
2639        case KVM_CAP_EXT_EMUL_CPUID:
2640        case KVM_CAP_CLOCKSOURCE:
2641        case KVM_CAP_PIT:
2642        case KVM_CAP_NOP_IO_DELAY:
2643        case KVM_CAP_MP_STATE:
2644        case KVM_CAP_SYNC_MMU:
2645        case KVM_CAP_USER_NMI:
2646        case KVM_CAP_REINJECT_CONTROL:
2647        case KVM_CAP_IRQ_INJECT_STATUS:
2648        case KVM_CAP_IRQFD:
2649        case KVM_CAP_IOEVENTFD:
2650        case KVM_CAP_PIT2:
2651        case KVM_CAP_PIT_STATE2:
2652        case KVM_CAP_SET_IDENTITY_MAP_ADDR:
2653        case KVM_CAP_XEN_HVM:
2654        case KVM_CAP_ADJUST_CLOCK:
2655        case KVM_CAP_VCPU_EVENTS:
2656        case KVM_CAP_HYPERV:
2657        case KVM_CAP_HYPERV_VAPIC:
2658        case KVM_CAP_HYPERV_SPIN:
2659        case KVM_CAP_PCI_SEGMENT:
2660        case KVM_CAP_DEBUGREGS:
2661        case KVM_CAP_X86_ROBUST_SINGLESTEP:
2662        case KVM_CAP_XSAVE:
2663        case KVM_CAP_ASYNC_PF:
2664        case KVM_CAP_GET_TSC_KHZ:
2665        case KVM_CAP_KVMCLOCK_CTRL:
2666        case KVM_CAP_READONLY_MEM:
2667        case KVM_CAP_HYPERV_TIME:
2668        case KVM_CAP_IOAPIC_POLARITY_IGNORED:
2669#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
2670        case KVM_CAP_ASSIGN_DEV_IRQ:
2671        case KVM_CAP_PCI_2_3:
2672#endif
2673                r = 1;
2674                break;
2675        case KVM_CAP_COALESCED_MMIO:
2676                r = KVM_COALESCED_MMIO_PAGE_OFFSET;
2677                break;
2678        case KVM_CAP_VAPIC:
2679                r = !kvm_x86_ops->cpu_has_accelerated_tpr();
2680                break;
2681        case KVM_CAP_NR_VCPUS:
2682                r = KVM_SOFT_MAX_VCPUS;
2683                break;
2684        case KVM_CAP_MAX_VCPUS:
2685                r = KVM_MAX_VCPUS;
2686                break;
2687        case KVM_CAP_NR_MEMSLOTS:
2688                r = KVM_USER_MEM_SLOTS;
2689                break;
2690        case KVM_CAP_PV_MMU:    /* obsolete */
2691                r = 0;
2692                break;
2693#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
2694        case KVM_CAP_IOMMU:
2695                r = iommu_present(&pci_bus_type);
2696                break;
2697#endif
2698        case KVM_CAP_MCE:
2699                r = KVM_MAX_MCE_BANKS;
2700                break;
2701        case KVM_CAP_XCRS:
2702                r = cpu_has_xsave;
2703                break;
2704        case KVM_CAP_TSC_CONTROL:
2705                r = kvm_has_tsc_control;
2706                break;
2707        case KVM_CAP_TSC_DEADLINE_TIMER:
2708                r = boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER);
2709                break;
2710        default:
2711                r = 0;
2712                break;
2713        }
2714        return r;
2715
2716}
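
/*
 * User space reaches this through the KVM_CHECK_EXTENSION ioctl on the
 * /dev/kvm file descriptor.  A minimal sketch of the probing pattern,
 * where the return value follows the switch above (0 means unsupported,
 * positive values carry capability-specific data such as limits):
 *
 *	int kvm_fd = open("/dev/kvm", O_RDWR);
 *	int nr_slots = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_NR_MEMSLOTS);
 *	bool has_tsc_ctrl =
 *		ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_TSC_CONTROL) > 0;
 */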
2717
2718long kvm_arch_dev_ioctl(struct file *filp,
2719                        unsigned int ioctl, unsigned long arg)
2720{
2721        void __user *argp = (void __user *)arg;
2722        long r;
2723
2724        switch (ioctl) {
2725        case KVM_GET_MSR_INDEX_LIST: {
2726                struct kvm_msr_list __user *user_msr_list = argp;
2727                struct kvm_msr_list msr_list;
2728                unsigned n;
2729
2730                r = -EFAULT;
2731                if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
2732                        goto out;
2733                n = msr_list.nmsrs;
2734                msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
2735                if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
2736                        goto out;
2737                r = -E2BIG;
2738                if (n < msr_list.nmsrs)
2739                        goto out;
2740                r = -EFAULT;
2741                if (copy_to_user(user_msr_list->indices, &msrs_to_save,
2742                                 num_msrs_to_save * sizeof(u32)))
2743                        goto out;
2744                if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
2745                                 &emulated_msrs,
2746                                 ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
2747                        goto out;
2748                r = 0;
2749                break;
2750        }
2751        case KVM_GET_SUPPORTED_CPUID:
2752        case KVM_GET_EMULATED_CPUID: {
2753                struct kvm_cpuid2 __user *cpuid_arg = argp;
2754                struct kvm_cpuid2 cpuid;
2755
2756                r = -EFAULT;
2757                if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
2758                        goto out;
2759
2760                r = kvm_dev_ioctl_get_cpuid(&cpuid, cpuid_arg->entries,
2761                                            ioctl);
2762                if (r)
2763                        goto out;
2764
2765                r = -EFAULT;
2766                if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
2767                        goto out;
2768                r = 0;
2769                break;
2770        }
2771        case KVM_X86_GET_MCE_CAP_SUPPORTED: {
2772                u64 mce_cap;
2773
2774                mce_cap = KVM_MCE_CAP_SUPPORTED;
2775                r = -EFAULT;
2776                if (copy_to_user(argp, &mce_cap, sizeof mce_cap))
2777                        goto out;
2778                r = 0;
2779                break;
2780        }
2781        default:
2782                r = -EINVAL;
2783        }
2784out:
2785        return r;
2786}
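
/*
 * KVM_GET_MSR_INDEX_LIST above expects a two-pass caller: a first call
 * with nmsrs == 0 fails with E2BIG but still writes back the required
 * count, and a second call with a big enough buffer retrieves the
 * indices.  A minimal user-space sketch, error handling omitted:
 *
 *	struct kvm_msr_list probe = { .nmsrs = 0 };
 *	struct kvm_msr_list *list;
 *
 *	ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, &probe);
 *	list = malloc(sizeof(*list) + probe.nmsrs * sizeof(__u32));
 *	list->nmsrs = probe.nmsrs;
 *	ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list);
 */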
2787
2788static void wbinvd_ipi(void *garbage)
2789{
2790        wbinvd();
2791}
2792
2793static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
2794{
2795        return kvm_arch_has_noncoherent_dma(vcpu->kvm);
2796}
2797
2798void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
2799{
2800        /* Address the case where WBINVD may be executed by the guest */
2801        if (need_emulate_wbinvd(vcpu)) {
2802                if (kvm_x86_ops->has_wbinvd_exit())
2803                        cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
2804                else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
2805                        smp_call_function_single(vcpu->cpu,
2806                                        wbinvd_ipi, NULL, 1);
2807        }
2808
2809        kvm_x86_ops->vcpu_load(vcpu, cpu);
2810
2811        /* Apply any externally detected TSC adjustments (due to suspend) */
2812        if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
2813                adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
2814                vcpu->arch.tsc_offset_adjustment = 0;
2815                set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
2816        }
2817
2818        if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
2819                s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
2820                                native_read_tsc() - vcpu->arch.last_host_tsc;
2821                if (tsc_delta < 0)
2822                        mark_tsc_unstable("KVM discovered backwards TSC");
2823                if (check_tsc_unstable()) {
2824                        u64 offset = kvm_x86_ops->compute_tsc_offset(vcpu,
2825                                                vcpu->arch.last_guest_tsc);
2826                        kvm_x86_ops->write_tsc_offset(vcpu, offset);
2827                        vcpu->arch.tsc_catchup = 1;
2828                }
2829                /*
2830                 * On a host with synchronized TSC, there is no need to update
2831                 * kvmclock on vcpu->cpu migration
2832                 */
2833                if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1)
2834                        kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
2835                if (vcpu->cpu != cpu)
2836                        kvm_migrate_timers(vcpu);
2837                vcpu->cpu = cpu;
2838        }
2839
2840        accumulate_steal_time(vcpu);
2841        kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
2842}
2843
2844void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
2845{
2846        kvm_x86_ops->vcpu_put(vcpu);
2847        kvm_put_guest_fpu(vcpu);
2848        vcpu->arch.last_host_tsc = native_read_tsc();
2849}
2850
2851static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
2852                                    struct kvm_lapic_state *s)
2853{
2854        kvm_x86_ops->sync_pir_to_irr(vcpu);
2855        memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
2856
2857        return 0;
2858}
2859
2860static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
2861                                    struct kvm_lapic_state *s)
2862{
2863        kvm_apic_post_state_restore(vcpu, s);
2864        update_cr8_intercept(vcpu);
2865
2866        return 0;
2867}
2868
2869static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
2870                                    struct kvm_interrupt *irq)
2871{
2872        if (irq->irq >= KVM_NR_INTERRUPTS)
2873                return -EINVAL;
2874        if (irqchip_in_kernel(vcpu->kvm))
2875                return -ENXIO;
2876
2877        kvm_queue_interrupt(vcpu, irq->irq, false);
2878        kvm_make_request(KVM_REQ_EVENT, vcpu);
2879
2880        return 0;
2881}
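
/*
 * This backs the KVM_INTERRUPT vcpu ioctl, which is only meaningful when
 * user space emulates the interrupt controller itself (hence the -ENXIO
 * once the in-kernel irqchip is active).  A minimal sketch injecting
 * vector 0x20 from user space:
 *
 *	struct kvm_interrupt irq = { .irq = 0x20 };
 *
 *	ioctl(vcpu_fd, KVM_INTERRUPT, &irq);
 */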
2882
2883static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
2884{
2885        kvm_inject_nmi(vcpu);
2886
2887        return 0;
2888}
2889
2890static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
2891                                           struct kvm_tpr_access_ctl *tac)
2892{
2893        if (tac->flags)
2894                return -EINVAL;
2895        vcpu->arch.tpr_access_reporting = !!tac->enabled;
2896        return 0;
2897}
2898
2899static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
2900                                        u64 mcg_cap)
2901{
2902        int r;
2903        unsigned bank_num = mcg_cap & 0xff, bank;
2904
2905        r = -EINVAL;
2906        if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
2907                goto out;
2908        if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000))
2909                goto out;
2910        r = 0;
2911        vcpu->arch.mcg_cap = mcg_cap;
2912        /* Init IA32_MCG_CTL to all 1s */
2913        if (mcg_cap & MCG_CTL_P)
2914                vcpu->arch.mcg_ctl = ~(u64)0;
2915        /* Init IA32_MCi_CTL to all 1s */
2916        for (bank = 0; bank < bank_num; bank++)
2917                vcpu->arch.mce_banks[bank*4] = ~(u64)0;
2918out:
2919        return r;
2920}
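
/*
 * User space drives this with two ioctls: KVM_X86_GET_MCE_CAP_SUPPORTED on
 * the /dev/kvm fd (handled in kvm_arch_dev_ioctl() above) to learn what
 * the host offers, and KVM_X86_SETUP_MCE on the vcpu fd with the mcg_cap
 * it actually wants.  A minimal sketch requesting ten banks, a count
 * chosen purely for illustration:
 *
 *	__u64 cap;
 *
 *	ioctl(kvm_fd, KVM_X86_GET_MCE_CAP_SUPPORTED, &cap);
 *	cap = (cap & ~0xffULL) | 10;
 *	ioctl(vcpu_fd, KVM_X86_SETUP_MCE, &cap);
 */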
2921
2922static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
2923                                      struct kvm_x86_mce *mce)
2924{
2925        u64 mcg_cap = vcpu->arch.mcg_cap;
2926        unsigned bank_num = mcg_cap & 0xff;
2927        u64 *banks = vcpu->arch.mce_banks;
2928
2929        if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
2930                return -EINVAL;
2931        /*
2932         * if IA32_MCG_CTL is not all 1s, the uncorrected error
2933         * reporting is disabled
2934         */
2935        if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
2936            vcpu->arch.mcg_ctl != ~(u64)0)
2937                return 0;
2938        banks += 4 * mce->bank;
2939        /*
2940         * if IA32_MCi_CTL is not all 1s, the uncorrected error
2941         * reporting is disabled for the bank
2942         */
2943        if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0)
2944                return 0;
2945        if (mce->status & MCI_STATUS_UC) {
2946                if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
2947                    !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) {
2948                        kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2949                        return 0;
2950                }
2951                if (banks[1] & MCI_STATUS_VAL)
2952                        mce->status |= MCI_STATUS_OVER;
2953                banks[2] = mce->addr;
2954                banks[3] = mce->misc;
2955                vcpu->arch.mcg_status = mce->mcg_status;
2956                banks[1] = mce->status;
2957                kvm_queue_exception(vcpu, MC_VECTOR);
2958        } else if (!(banks[1] & MCI_STATUS_VAL)
2959                   || !(banks[1] & MCI_STATUS_UC)) {
2960                if (banks[1] & MCI_STATUS_VAL)
2961                        mce->status |= MCI_STATUS_OVER;
2962                banks[2] = mce->addr;
2963                banks[3] = mce->misc;
2964                banks[1] = mce->status;
2965        } else
2966                banks[1] |= MCI_STATUS_OVER;
2967        return 0;
2968}
2969
2970static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
2971                                               struct kvm_vcpu_events *events)
2972{
2973        process_nmi(vcpu);
2974        events->exception.injected =
2975                vcpu->arch.exception.pending &&
2976                !kvm_exception_is_soft(vcpu->arch.exception.nr);
2977        events->exception.nr = vcpu->arch.exception.nr;
2978        events->exception.has_error_code = vcpu->arch.exception.has_error_code;
2979        events->exception.pad = 0;
2980        events->exception.error_code = vcpu->arch.exception.error_code;
2981
2982        events->interrupt.injected =
2983                vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft;
2984        events->interrupt.nr = vcpu->arch.interrupt.nr;
2985        events->interrupt.soft = 0;
2986        events->interrupt.shadow =
2987                kvm_x86_ops->get_interrupt_shadow(vcpu,
2988                        KVM_X86_SHADOW_INT_MOV_SS | KVM_X86_SHADOW_INT_STI);
2989
2990        events->nmi.injected = vcpu->arch.nmi_injected;
2991        events->nmi.pending = vcpu->arch.nmi_pending != 0;
2992        events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu);
2993        events->nmi.pad = 0;
2994
2995        events->sipi_vector = 0; /* never valid when reporting to user space */
2996
2997        events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
2998                         | KVM_VCPUEVENT_VALID_SHADOW);
2999        memset(&events->reserved, 0, sizeof(events->reserved));
3000}
3001
3002static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
3003                                              struct kvm_vcpu_events *events)
3004{
3005        if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
3006                              | KVM_VCPUEVENT_VALID_SIPI_VECTOR
3007                              | KVM_VCPUEVENT_VALID_SHADOW))
3008                return -EINVAL;
3009
3010        process_nmi(vcpu);
3011        vcpu->arch.exception.pending = events->exception.injected;
3012        vcpu->arch.exception.nr = events->exception.nr;
3013        vcpu->arch.exception.has_error_code = events->exception.has_error_code;
3014        vcpu->arch.exception.error_code = events->exception.error_code;
3015
3016        vcpu->arch.interrupt.pending = events->interrupt.injected;
3017        vcpu->arch.interrupt.nr = events->interrupt.nr;
3018        vcpu->arch.interrupt.soft = events->interrupt.soft;
3019        if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
3020                kvm_x86_ops->set_interrupt_shadow(vcpu,
3021                                                  events->interrupt.shadow);
3022
3023        vcpu->arch.nmi_injected = events->nmi.injected;
3024        if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
3025                vcpu->arch.nmi_pending = events->nmi.pending;
3026        kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked);
3027
3028        if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
3029            kvm_vcpu_has_lapic(vcpu))
3030                vcpu->arch.apic->sipi_vector = events->sipi_vector;
3031
3032        kvm_make_request(KVM_REQ_EVENT, vcpu);
3033
3034        return 0;
3035}
3036
3037static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
3038                                             struct kvm_debugregs *dbgregs)
3039{
3040        unsigned long val;
3041
3042        memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
3043        _kvm_get_dr(vcpu, 6, &val);
3044        dbgregs->dr6 = val;
3045        dbgregs->dr7 = vcpu->arch.dr7;
3046        dbgregs->flags = 0;
3047        memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved));
3048}
3049
3050static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
3051                                            struct kvm_debugregs *dbgregs)
3052{
3053        if (dbgregs->flags)
3054                return -EINVAL;
3055
3056        memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
3057        vcpu->arch.dr6 = dbgregs->dr6;
3058        kvm_update_dr6(vcpu);
3059        vcpu->arch.dr7 = dbgregs->dr7;
3060        kvm_update_dr7(vcpu);
3061
3062        return 0;
3063}
3064
3065static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
3066                                         struct kvm_xsave *guest_xsave)
3067{
3068        if (cpu_has_xsave) {
3069                memcpy(guest_xsave->region,
3070                        &vcpu->arch.guest_fpu.state->xsave,
3071                        vcpu->arch.guest_xstate_size);
3072                *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] &=
3073                        vcpu->arch.guest_supported_xcr0 | XSTATE_FPSSE;
3074        } else {
3075                memcpy(guest_xsave->region,
3076                        &vcpu->arch.guest_fpu.state->fxsave,
3077                        sizeof(struct i387_fxsave_struct));
3078                *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] =
3079                        XSTATE_FPSSE;
3080        }
3081}
3082
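/*
 * KVM_SET_XSAVE: reject xstate feature bits that KVM cannot handle on this
 * host, then load the region into the guest FPU area (XSAVE or legacy
 * FXSAVE layout).
 */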
3083static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
3084                                        struct kvm_xsave *guest_xsave)
3085{
3086        u64 xstate_bv =
3087                *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)];
3088
3089        if (cpu_has_xsave) {
3090                /*
3091                 * Here we allow setting states that are not present in
3092                 * CPUID leaf 0xD, index 0, EDX:EAX.  This is for compatibility
3093                 * with old userspace.
3094                 */
3095                if (xstate_bv & ~kvm_supported_xcr0())
3096                        return -EINVAL;
3097                memcpy(&vcpu->arch.guest_fpu.state->xsave,
3098                        guest_xsave->region, vcpu->arch.guest_xstate_size);
3099        } else {
3100                if (xstate_bv & ~XSTATE_FPSSE)
3101                        return -EINVAL;
3102                memcpy(&vcpu->arch.guest_fpu.state->fxsave,
3103                        guest_xsave->region, sizeof(struct i387_fxsave_struct));
3104        }
3105        return 0;
3106}
3107
3108static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
3109                                        struct kvm_xcrs *guest_xcrs)
3110{
3111        if (!cpu_has_xsave) {
3112                guest_xcrs->nr_xcrs = 0;
3113                return;
3114        }
3115
3116        guest_xcrs->nr_xcrs = 1;
3117        guest_xcrs->flags = 0;
3118        guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK;
3119        guest_xcrs->xcrs[0].value = vcpu->arch.xcr0;
3120}
3121
3122static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
3123                                       struct kvm_xcrs *guest_xcrs)
3124{
3125        int i, r = 0;
3126
3127        if (!cpu_has_xsave)
3128                return -EINVAL;
3129
3130        if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags)
3131                return -EINVAL;
3132
3133        for (i = 0; i < guest_xcrs->nr_xcrs; i++)
3134                /* Only support XCR0 currently */
3135                if (guest_xcrs->xcrs[i].xcr == XCR_XFEATURE_ENABLED_MASK) {
3136                        r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK,
3137                                guest_xcrs->xcrs[i].value);
3138                        break;
3139                }
3140        if (r)
3141                r = -EINVAL;
3142        return r;
3143}
3144
3145/*
3146 * kvm_set_guest_paused() indicates to the guest kernel that it has been
3147 * stopped by the hypervisor.  This function will be called from the host only.
3148 * EINVAL is returned when the host attempts to set the flag for a guest that
3149 * does not support pv clocks.
3150 */
3151static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
3152{
3153        if (!vcpu->arch.pv_time_enabled)
3154                return -EINVAL;
3155        vcpu->arch.pvclock_set_guest_stopped_request = true;
3156        kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
3157        return 0;
3158}
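
/*
 * Rough userspace sketch for the KVM_KVMCLOCK_CTRL ioctl served by the
 * function above (not from this file; vcpu_fd is an assumed, already-open
 * vCPU file descriptor): after pausing the vCPUs a VMM can issue
 *
 *	ioctl(vcpu_fd, KVM_KVMCLOCK_CTRL, 0);
 *
 * and treat an EINVAL error as "this guest has no pv clock, nothing to do".
 */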
3159
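/*
 * Dispatcher for vCPU-scoped ioctls.  The union below holds whichever
 * heap-allocated payload the selected command uses, so a single kfree()
 * at 'out' covers every path.
 */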
3160long kvm_arch_vcpu_ioctl(struct file *filp,
3161                         unsigned int ioctl, unsigned long arg)
3162{
3163        struct kvm_vcpu *vcpu = filp->private_data;
3164        void __user *argp = (void __user *)arg;
3165        int r;
3166        union {
3167                struct kvm_lapic_state *lapic;
3168                struct kvm_xsave *xsave;
3169                struct kvm_xcrs *xcrs;
3170                void *buffer;
3171        } u;
3172
3173        u.buffer = NULL;
3174        switch (ioctl) {
3175        case KVM_GET_LAPIC: {
3176                r = -EINVAL;
3177                if (!vcpu->arch.apic)
3178                        goto out;
3179                u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
3180
3181                r = -ENOMEM;
3182                if (!u.lapic)
3183                        goto out;
3184                r = kvm_vcpu_ioctl_get_lapic(vcpu, u.lapic);
3185                if (r)
3186                        goto out;
3187                r = -EFAULT;
3188                if (copy_to_user(argp, u.lapic, sizeof(struct kvm_lapic_state)))
3189                        goto out;
3190                r = 0;
3191                break;
3192        }
3193        case KVM_SET_LAPIC: {
3194                r = -EINVAL;
3195                if (!vcpu->arch.apic)
3196                        goto out;
3197                u.lapic = memdup_user(argp, sizeof(*u.lapic));
3198                if (IS_ERR(u.lapic))
3199                        return PTR_ERR(u.lapic);
3200
3201                r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic);
3202                break;
3203        }
3204        case KVM_INTERRUPT: {
3205                struct kvm_interrupt irq;
3206
3207                r = -EFAULT;
3208                if (copy_from_user(&irq, argp, sizeof irq))
3209                        goto out;
3210                r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
3211                break;
3212        }
3213        case KVM_NMI: {
3214                r = kvm_vcpu_ioctl_nmi(vcpu);
3215                break;
3216        }
3217        case KVM_SET_CPUID: {
3218                struct kvm_cpuid __user *cpuid_arg = argp;
3219                struct kvm_cpuid cpuid;
3220
3221                r = -EFAULT;
3222                if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
3223                        goto out;
3224                r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
3225                break;
3226        }
3227        case KVM_SET_CPUID2: {
3228                struct kvm_cpuid2 __user *cpuid_arg = argp;
3229                struct kvm_cpuid2 cpuid;
3230
3231                r = -EFAULT;
3232                if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
3233                        goto out;
3234                r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
3235                                              cpuid_arg->entries);
3236                break;
3237        }
3238        case KVM_GET_CPUID2: {
3239                struct kvm_cpuid2 __user *cpuid_arg = argp;
3240                struct kvm_cpuid2 cpuid;
3241
3242                r = -EFAULT;
3243                if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
3244                        goto out;
3245                r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
3246                                              cpuid_arg->entries);
3247                if (r)
3248                        goto out;
3249                r = -EFAULT;
3250                if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
3251                        goto out;
3252                r = 0;
3253                break;
3254        }
3255        case KVM_GET_MSRS:
3256                r = msr_io(vcpu, argp, kvm_get_msr, 1);
3257                break;
3258        case KVM_SET_MSRS:
3259                r = msr_io(vcpu, argp, do_set_msr, 0);
3260                break;
3261        case KVM_TPR_ACCESS_REPORTING: {
3262                struct kvm_tpr_access_ctl tac;
3263
3264                r = -EFAULT;
3265                if (copy_from_user(&tac, argp, sizeof tac))
3266                        goto out;
3267                r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac);
3268                if (r)
3269                        goto out;
3270                r = -EFAULT;
3271                if (copy_to_user(argp, &tac, sizeof tac))
3272                        goto out;
3273                r = 0;
3274                break;
3275        }
3276        case KVM_SET_VAPIC_ADDR: {
3277                struct kvm_vapic_addr va;
3278
3279                r = -EINVAL;
3280                if (!irqchip_in_kernel(vcpu->kvm))
3281                        goto out;
3282                r = -EFAULT;
3283                if (copy_from_user(&va, argp, sizeof va))
3284                        goto out;
3285                r = kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
3286                break;
3287        }
3288        case KVM_X86_SETUP_MCE: {
3289                u64 mcg_cap;
3290
3291                r = -EFAULT;
3292                if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap))
3293                        goto out;
3294                r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap);
3295                break;
3296        }
3297        case KVM_X86_SET_MCE: {
3298                struct kvm_x86_mce mce;
3299
3300                r = -EFAULT;
3301                if (copy_from_user(&mce, argp, sizeof mce))
3302                        goto out;
3303                r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
3304                break;
3305        }
3306        case KVM_GET_VCPU_EVENTS: {
3307                struct kvm_vcpu_events events;
3308
3309                kvm_vcpu_ioctl_x86_get_vcpu_events(vcpu, &events);
3310
3311                r = -EFAULT;
3312                if (copy_to_user(argp, &events, sizeof(struct kvm_vcpu_events)))
3313                        break;
3314                r = 0;
3315                break;
3316        }
3317        case KVM_SET_VCPU_EVENTS: {
3318                struct kvm_vcpu_events events;
3319
3320                r = -EFAULT;
3321                if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events)))
3322                        break;
3323
3324                r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events);
3325                break;
3326        }
3327        case KVM_GET_DEBUGREGS: {
3328                struct kvm_debugregs dbgregs;
3329
3330                kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs);
3331
3332                r = -EFAULT;
3333                if (copy_to_user(argp, &dbgregs,
3334                                 sizeof(struct kvm_debugregs)))
3335                        break;
3336                r = 0;
3337                break;
3338        }
3339        case KVM_SET_DEBUGREGS: {
3340                struct kvm_debugregs dbgregs;
3341
3342                r = -EFAULT;
3343                if (copy_from_user(&dbgregs, argp,
3344                                   sizeof(struct kvm_debugregs)))
3345                        break;
3346
3347                r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs);
3348                break;
3349        }
3350        case KVM_GET_XSAVE: {
3351                u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
3352                r = -ENOMEM;
3353                if (!u.xsave)
3354                        break;
3355
3356                kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave);
3357
3358                r = -EFAULT;
3359                if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave)))
3360                        break;
3361                r = 0;
3362                break;
3363        }
3364        case KVM_SET_XSAVE: {
3365                u.xsave = memdup_user(argp, sizeof(*u.xsave));
3366                if (IS_ERR(u.xsave))
3367                        return PTR_ERR(u.xsave);
3368
3369                r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
3370                break;
3371        }
3372        case KVM_GET_XCRS: {
3373                u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
3374                r = -ENOMEM;
3375                if (!u.xcrs)
3376                        break;
3377
3378                kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs);
3379
3380                r = -EFAULT;
3381                if (copy_to_user(argp, u.xcrs,
3382                                 sizeof(struct kvm_xcrs)))
3383                        break;
3384                r = 0;
3385                break;
3386        }
3387        case KVM_SET_XCRS: {
3388                u.xcrs = memdup_user(argp, sizeof(*u.xcrs));
3389                if (IS_ERR(u.xcrs))
3390                        return PTR_ERR(u.xcrs);
3391
3392                r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
3393                break;
3394        }
3395        case KVM_SET_TSC_KHZ: {
3396                u32 user_tsc_khz;
3397
3398                r = -EINVAL;
3399                user_tsc_khz = (u32)arg;
3400
3401                if (user_tsc_khz >= kvm_max_guest_tsc_khz)
3402                        goto out;
3403
3404                if (user_tsc_khz == 0)
3405                        user_tsc_khz = tsc_khz;
3406
3407                kvm_set_tsc_khz(vcpu, user_tsc_khz);
3408
3409                r = 0;
3410                goto out;
3411        }
3412        case KVM_GET_TSC_KHZ: {
3413                r = vcpu->arch.virtual_tsc_khz;
3414                goto out;
3415        }
3416        case KVM_KVMCLOCK_CTRL: {
3417                r = kvm_set_guest_paused(vcpu);
3418                goto out;
3419        }
3420        default:
3421                r = -EINVAL;
3422        }
3423out:
3424        kfree(u.buffer);
3425        return r;
3426}
3427
3428int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
3429{
3430        return VM_FAULT_SIGBUS;
3431}
3432
3433static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
3434{
3435        int ret;
3436
3437        if (addr > (unsigned int)(-3 * PAGE_SIZE))
3438                return -EINVAL;
3439        ret = kvm_x86_ops->set_tss_addr(kvm, addr);
3440        return ret;
3441}
3442
3443static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm,
3444                                              u64 ident_addr)
3445{
3446        kvm->arch.ept_identity_map_addr = ident_addr;
3447        return 0;
3448}
3449
3450static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
3451                                          u32 kvm_nr_mmu_pages)
3452{
3453        if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
3454                return -EINVAL;
3455
3456        mutex_lock(&kvm->slots_lock);
3457
3458        kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
3459        kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
3460
3461        mutex_unlock(&kvm->slots_lock);
3462        return 0;
3463}
3464
3465static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
3466{
3467        return kvm->arch.n_max_mmu_pages;
3468}
3469
3470static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
3471{
3472        int r;
3473
3474        r = 0;
3475        switch (chip->chip_id) {
3476        case KVM_IRQCHIP_PIC_MASTER:
3477                memcpy(&chip->chip.pic,
3478                        &pic_irqchip(kvm)->pics[0],
3479                        sizeof(struct kvm_pic_state));
3480                break;
3481        case KVM_IRQCHIP_PIC_SLAVE:
3482                memcpy(&chip->chip.pic,
3483                        &pic_irqchip(kvm)->pics[1],
3484                        sizeof(struct kvm_pic_state));
3485                break;
3486        case KVM_IRQCHIP_IOAPIC:
3487                r = kvm_get_ioapic(kvm, &chip->chip.ioapic);
3488                break;
3489        default:
3490                r = -EINVAL;
3491                break;
3492        }
3493        return r;
3494}
3495
3496static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
3497{
3498        int r;
3499
3500        r = 0;
3501        switch (chip->chip_id) {
3502        case KVM_IRQCHIP_PIC_MASTER:
3503                spin_lock(&pic_irqchip(kvm)->lock);
3504                memcpy(&pic_irqchip(kvm)->pics[0],
3505                        &chip->chip.pic,
3506                        sizeof(struct kvm_pic_state));
3507                spin_unlock(&pic_irqchip(kvm)->lock);
3508                break;
3509        case KVM_IRQCHIP_PIC_SLAVE:
3510                spin_lock(&pic_irqchip(kvm)->lock);
3511                memcpy(&pic_irqchip(kvm)->pics[1],
3512                        &chip->chip.pic,
3513                        sizeof(struct kvm_pic_state));
3514                spin_unlock(&pic_irqchip(kvm)->lock);
3515                break;
3516        case KVM_IRQCHIP_IOAPIC:
3517                r = kvm_set_ioapic(kvm, &chip->chip.ioapic);
3518                break;
3519        default:
3520                r = -EINVAL;
3521                break;
3522        }
3523        kvm_pic_update_irq(pic_irqchip(kvm));
3524        return r;
3525}
3526
3527static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
3528{
3529        int r = 0;
3530
3531        mutex_lock(&kvm->arch.vpit->pit_state.lock);
3532        memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state));
3533        mutex_unlock(&kvm->arch.vpit->pit_state.lock);
3534        return r;
3535}
3536
3537static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
3538{
3539        int r = 0;
3540
3541        mutex_lock(&kvm->arch.vpit->pit_state.lock);
3542        memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
3543        kvm_pit_load_count(kvm, 0, ps->channels[0].count, 0);
3544        mutex_unlock(&kvm->arch.vpit->pit_state.lock);
3545        return r;
3546}
3547
3548static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
3549{
3550        int r = 0;
3551
3552        mutex_lock(&kvm->arch.vpit->pit_state.lock);
3553        memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
3554                sizeof(ps->channels));
3555        ps->flags = kvm->arch.vpit->pit_state.flags;
3556        mutex_unlock(&kvm->arch.vpit->pit_state.lock);
3557        memset(&ps->reserved, 0, sizeof(ps->reserved));
3558        return r;
3559}
3560
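/*
 * KVM_SET_PIT2: install the new channel state; a 0 -> 1 transition of the
 * HPET legacy flag additionally forces channel 0 to be reloaded.
 */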
3561static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
3562{
3563        int r = 0, start = 0;
3564        u32 prev_legacy, cur_legacy;
3565        mutex_lock(&kvm->arch.vpit->pit_state.lock);
3566        prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
3567        cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
3568        if (!prev_legacy && cur_legacy)
3569                start = 1;
3570        memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels,
3571               sizeof(kvm->arch.vpit->pit_state.channels));
3572        kvm->arch.vpit->pit_state.flags = ps->flags;
3573        kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count, start);
3574        mutex_unlock(&kvm->arch.vpit->pit_state.lock);
3575        return r;
3576}
3577
3578static int kvm_vm_ioctl_reinject(struct kvm *kvm,
3579                                 struct kvm_reinject_control *control)
3580{
3581        if (!kvm->arch.vpit)
3582                return -ENXIO;
3583        mutex_lock(&kvm->arch.vpit->pit_state.lock);
3584        kvm->arch.vpit->pit_state.reinject = control->pit_reinject;
3585        mutex_unlock(&kvm->arch.vpit->pit_state.lock);
3586        return 0;
3587}
3588
3589/**
3590 * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
3591 * @kvm: kvm instance
3592 * @log: slot id and address to which we copy the log
3593 *
3594 * Keep in mind that VCPU threads can write to the bitmap
3595 * concurrently.  So, to avoid losing data, we keep the following order for
3596 * each bit:
3597 *
3598 *   1. Take a snapshot of the bit and clear it if needed.
3599 *   2. Write protect the corresponding page.
3600 *   3. Flush TLBs if needed.
3601 *   4. Copy the snapshot to userspace.
3602 *
3603 * Between 2 and 3, the guest may write to the page using the remaining TLB
3604 * entry.  This is not a problem because the page will be reported dirty at
3605 * step 4 using the snapshot taken before and step 3 ensures that successive
3606 * writes will be logged for the next call.
3607 */
3608int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
3609{
3610        int r;
3611        struct kvm_memory_slot *memslot;
3612        unsigned long n, i;
3613        unsigned long *dirty_bitmap;
3614        unsigned long *dirty_bitmap_buffer;
3615        bool is_dirty = false;
3616
3617        mutex_lock(&kvm->slots_lock);
3618
3619        r = -EINVAL;
3620        if (log->slot >= KVM_USER_MEM_SLOTS)
3621                goto out;
3622
3623        memslot = id_to_memslot(kvm->memslots, log->slot);
3624
3625        dirty_bitmap = memslot->dirty_bitmap;
3626        r = -ENOENT;
3627        if (!dirty_bitmap)
3628                goto out;
3629
3630        n = kvm_dirty_bitmap_bytes(memslot);
3631
3632        dirty_bitmap_buffer = dirty_bitmap + n / sizeof(long);
3633        memset(dirty_bitmap_buffer, 0, n);
3634
3635        spin_lock(&kvm->mmu_lock);
3636
3637        for (i = 0; i < n / sizeof(long); i++) {
3638                unsigned long mask;
3639                gfn_t offset;
3640
3641                if (!dirty_bitmap[i])
3642                        continue;
3643
3644                is_dirty = true;
3645
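                /* Steps 1-2 above: snapshot+clear the bits, then write protect. */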
3646                mask = xchg(&dirty_bitmap[i], 0);
3647                dirty_bitmap_buffer[i] = mask;
3648
3649                offset = i * BITS_PER_LONG;
3650                kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, mask);
3651        }
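        /* Step 3: flush TLBs if any page was write protected above. */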
3652        if (is_dirty)
3653                kvm_flush_remote_tlbs(kvm);
3654
3655        spin_unlock(&kvm->mmu_lock);
3656
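        /* Step 4: copy the snapshot to userspace. */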
3657        r = -EFAULT;
3658        if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
3659                goto out;
3660
3661        r = 0;
3662out:
3663        mutex_unlock(&kvm->slots_lock);
3664        return r;
3665}
3666
3667int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
3668                        bool line_status)
3669{
3670        if (!irqchip_in_kernel(kvm))
3671                return -ENXIO;
3672
3673        irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
3674                                        irq_event->irq, irq_event->level,
3675                                        line_status);
3676        return 0;
3677}
3678
3679long kvm_arch_vm_ioctl(struct file *filp,
3680                       unsigned int ioctl, unsigned long arg)
3681{
3682        struct kvm *kvm = filp->private_data;
3683        void __user *argp = (void __user *)arg;
3684        int r = -ENOTTY;
3685        /*
3686         * This union makes it completely explicit to gcc-3.x
3687         * that these variables' stack usage should be
3688         * combined, not added together.
3689         */
3690        union {
3691                struct kvm_pit_state ps;
3692                struct kvm_pit_state2 ps2;
3693                struct kvm_pit_config pit_config;
3694        } u;
3695
3696        switch (ioctl) {
3697        case KVM_SET_TSS_ADDR:
3698                r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
3699                break;
3700        case KVM_SET_IDENTITY_MAP_ADDR: {
3701                u64 ident_addr;
3702
3703                r = -EFAULT;
3704                if (copy_from_user(&ident_addr, argp, sizeof ident_addr))
3705                        goto out;
3706                r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr);
3707                break;
3708        }
3709        case KVM_SET_NR_MMU_PAGES:
3710                r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
3711                break;
3712        case KVM_GET_NR_MMU_PAGES:
3713                r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
3714                break;
3715        case KVM_CREATE_IRQCHIP: {
3716                struct kvm_pic *vpic;
3717
3718                mutex_lock(&kvm->lock);
3719                r = -EEXIST;
3720                if (kvm->arch.vpic)
3721                        goto create_irqchip_unlock;
3722                r = -EINVAL;
3723                if (atomic_read(&kvm->online_vcpus))
3724                        goto create_irqchip_unlock;
3725                r = -ENOMEM;
3726                vpic = kvm_create_pic(kvm);
3727                if (vpic) {
3728                        r = kvm_ioapic_init(kvm);
3729                        if (r) {
3730                                mutex_lock(&kvm->slots_lock);
3731                                kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
3732                                                          &vpic->dev_master);
3733                                kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
3734                                                          &vpic->dev_slave);
3735                                kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
3736                                                          &vpic->dev_eclr);
3737                                mutex_unlock(&kvm->slots_lock);
3738                                kfree(vpic);
3739                                goto create_irqchip_unlock;
3740                        }
3741                } else
3742                        goto create_irqchip_unlock;
3743                smp_wmb();
3744                kvm->arch.vpic = vpic;
3745                smp_wmb();
3746                r = kvm_setup_default_irq_routing(kvm);
3747                if (r) {
3748                        mutex_lock(&kvm->slots_lock);
3749                        mutex_lock(&kvm->irq_lock);
3750                        kvm_ioapic_destroy(kvm);
3751                        kvm_destroy_pic(kvm);
3752                        mutex_unlock(&kvm->irq_lock);
3753                        mutex_unlock(&kvm->slots_lock);
3754                }
3755        create_irqchip_unlock:
3756                mutex_unlock(&kvm->lock);
3757                break;
3758        }
3759        case KVM_CREATE_PIT:
3760                u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY;
3761                goto create_pit;
3762        case KVM_CREATE_PIT2:
3763                r = -EFAULT;
3764                if (copy_from_user(&u.pit_config, argp,
3765                                   sizeof(struct kvm_pit_config)))
3766                        goto out;
3767        create_pit:
3768                mutex_lock(&kvm->slots_lock);
3769                r = -EEXIST;
3770                if (kvm->arch.vpit)
3771                        goto create_pit_unlock;
3772                r = -ENOMEM;
3773                kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags);
3774                if (kvm->arch.vpit)
3775                        r = 0;
3776        create_pit_unlock:
3777                mutex_unlock(&kvm->slots_lock);
3778                break;
3779        case KVM_GET_IRQCHIP: {
3780                /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
3781                struct kvm_irqchip *chip;
3782
3783                chip = memdup_user(argp, sizeof(*chip));
3784                if (IS_ERR(chip)) {
3785                        r = PTR_ERR(chip);
3786                        goto out;
3787                }
3788
3789                r = -ENXIO;
3790                if (!irqchip_in_kernel(kvm))
3791                        goto get_irqchip_out;
3792                r = kvm_vm_ioctl_get_irqchip(kvm, chip);
3793                if (r)
3794                        goto get_irqchip_out;
3795                r = -EFAULT;
3796                if (copy_to_user(argp, chip, sizeof *chip))
3797                        goto get_irqchip_out;
3798                r = 0;
3799        get_irqchip_out:
3800                kfree(chip);
3801                break;
3802        }
3803        case KVM_SET_IRQCHIP: {
3804                /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
3805                struct kvm_irqchip *chip;
3806
3807                chip = memdup_user(argp, sizeof(*chip));
3808                if (IS_ERR(chip)) {
3809                        r = PTR_ERR(chip);
3810                        goto out;
3811                }
3812
3813                r = -ENXIO;
3814                if (!irqchip_in_kernel(kvm))
3815                        goto set_irqchip_out;
3816                r = kvm_vm_ioctl_set_irqchip(kvm, chip);
3817                if (r)
3818                        goto set_irqchip_out;
3819                r = 0;
3820        set_irqchip_out:
3821                kfree(chip);
3822                break;
3823        }
3824        case KVM_GET_PIT: {
3825                r = -EFAULT;
3826                if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state)))
3827                        goto out;
3828                r = -ENXIO;
3829                if (!kvm->arch.vpit)
3830                        goto out;
3831                r = kvm_vm_ioctl_get_pit(kvm, &u.ps);
3832                if (r)
3833                        goto out;
3834                r = -EFAULT;
3835                if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state)))
3836                        goto out;
3837                r = 0;
3838                break;
3839        }
3840        case KVM_SET_PIT: {
3841                r = -EFAULT;
3842                if (copy_from_user(&u.ps, argp, sizeof u.ps))
3843                        goto out;
3844                r = -ENXIO;
3845                if (!kvm->arch.vpit)
3846                        goto out;
3847                r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
3848                break;
3849        }
3850        case KVM_GET_PIT2: {
3851                r = -ENXIO;
3852                if (!kvm->arch.vpit)
3853                        goto out;
3854                r = kvm_vm_ioctl_get_pit2(kvm, &u.ps2);
3855                if (r)
3856                        goto out;
3857                r = -EFAULT;
3858                if (copy_to_user(argp, &u.ps2, sizeof(u.ps2)))
3859                        goto out;
3860                r = 0;
3861                break;
3862        }
3863        case KVM_SET_PIT2: {
3864                r = -EFAULT;
3865                if (copy_from_user(&u.ps2, argp, sizeof(u.ps2)))
3866                        goto out;
3867                r = -ENXIO;
3868                if (!kvm->arch.vpit)
3869                        goto out;
3870                r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2);
3871                break;
3872        }
3873        case KVM_REINJECT_CONTROL: {
3874                struct kvm_reinject_control control;
3875                r = -EFAULT;
3876                if (copy_from_user(&control, argp, sizeof(control)))
3877                        goto out;
3878                r = kvm_vm_ioctl_reinject(kvm, &control);
3879                break;
3880        }
3881        case KVM_XEN_HVM_CONFIG: {
3882                r = -EFAULT;
3883                if (copy_from_user(&kvm->arch.xen_hvm_config, argp,
3884                                   sizeof(struct kvm_xen_hvm_config)))
3885                        goto out;
3886                r = -EINVAL;
3887                if (kvm->arch.xen_hvm_config.flags)
3888                        goto out;
3889                r = 0;
3890                break;
3891        }
3892        case KVM_SET_CLOCK: {
3893                struct kvm_clock_data user_ns;
3894                u64 now_ns;
3895                s64 delta;
3896
3897                r = -EFAULT;
3898                if (copy_from_user(&user_ns, argp, sizeof(user_ns)))
3899                        goto out;
3900
3901                r = -EINVAL;
3902                if (user_ns.flags)
3903                        goto out;
3904
3905                r = 0;
3906                local_irq_disable();
3907                now_ns = get_kernel_ns();
3908                delta = user_ns.clock - now_ns;
3909                local_irq_enable();
3910                kvm->arch.kvmclock_offset = delta;
3911                kvm_gen_update_masterclock(kvm);
3912                break;
3913        }
3914        case KVM_GET_CLOCK: {
3915                struct kvm_clock_data user_ns;
3916                u64 now_ns;
3917
3918                local_irq_disable();
3919                now_ns = get_kernel_ns();
3920                user_ns.clock = kvm->arch.kvmclock_offset + now_ns;
3921                local_irq_enable();
3922                user_ns.flags = 0;
3923                memset(&user_ns.pad, 0, sizeof(user_ns.pad));
3924
3925                r = -EFAULT;
3926                if (copy_to_user(argp, &user_ns, sizeof(user_ns)))
3927                        goto out;
3928                r = 0;
3929                break;
3930        }
3931
3932        default:
3933                break;
3934        }
3935out:
3936        return r;
3937}
3938
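/*
 * Prune msrs_to_save down to the MSRs that are usable on this host: probe
 * each one with rdmsr_safe() and drop entries that fault or that the vendor
 * module cannot expose to guests (e.g. MSR_IA32_BNDCFGS without MPX).
 */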
3939static void kvm_init_msr_list(void)
3940{
3941        u32 dummy[2];
3942        unsigned i, j;
3943
3944        /* Skip the first MSRs in the list; they are KVM-specific. */
3945        for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) {
3946                if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
3947                        continue;
3948
3949                /*
3950                 * Even MSRs that are valid in the host may not be exposed
3951                 * to the guests in some cases.  We could work around this
3952                 * in VMX with the generic MSR save/load machinery, but it
3953                 * is not really worthwhile since it will really only
3954                 * is not worthwhile since it will only
3955                 */
3956                switch (msrs_to_save[i]) {
3957                case MSR_IA32_BNDCFGS:
3958                        if (!kvm_x86_ops->mpx_supported())
3959                                continue;
3960                        break;
3961                default:
3962                        break;
3963                }
3964
3965                if (j < i)
3966                        msrs_to_save[j] = msrs_to_save[i];
3967                j++;
3968        }
3969        num_msrs_to_save = j;
3970}
3971
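/*
 * Write 'len' bytes of an MMIO access, trying the in-kernel local APIC page
 * first and then the MMIO bus.  Returns the number of bytes handled in the
 * kernel; anything left over has to be completed in userspace.
 */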
3972static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
3973                           const void *v)
3974{
3975        int handled = 0;
3976        int n;
3977
3978        do {
3979                n = min(len, 8);
3980                if (!(vcpu->arch.apic &&
3981                      !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, n, v))
3982                    && kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, n, v))
3983                        break;
3984                handled += n;
3985                addr += n;
3986                len -= n;
3987                v += n;
3988        } while (len);
3989
3990        return handled;
3991}
3992
3993static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
3994{
3995        int handled = 0;
3996        int n;
3997
3998        do {
3999                n = min(len, 8);
4000                if (!(vcpu->arch.apic &&
4001                      !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, n, v))
4002                    && kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, n, v))
4003                        break;
4004                trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, *(u64 *)v);
4005                handled += n;
4006                addr += n;
4007                len -= n;
4008                v += n;
4009        } while (len);
4010
4011        return handled;
4012}
4013
4014static void kvm_set_segment(struct kvm_vcpu *vcpu,
4015                        struct kvm_segment *var, int seg)
4016{
4017        kvm_x86_ops->set_segment(vcpu, var, seg);
4018}
4019
4020void kvm_get_segment(struct kvm_vcpu *vcpu,
4021                     struct kvm_segment *var, int seg)
4022{
4023        kvm_x86_ops->get_segment(vcpu, var, seg);
4024}
4025
4026gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
4027{
4028        gpa_t t_gpa;
4029        struct x86_exception exception;
4030
4031        BUG_ON(!mmu_is_nested(vcpu));
4032
4033        /* NPT walks are always user-walks */
4034        access |= PFERR_USER_MASK;
4035        t_gpa  = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &exception);
4036
4037        return t_gpa;
4038}
4039
4040gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
4041                              struct x86_exception *exception)
4042{
4043        u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
4044        return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
4045}
4046
4047gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
4048                                struct x86_exception *exception)
4049{
4050        u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
4051        access |= PFERR_FETCH_MASK;
4052        return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
4053}
4054
4055gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
4056                               struct x86_exception *exception)
4057{
4058        u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
4059        access |= PFERR_WRITE_MASK;
4060        return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
4061}
4062
4063/* used to access any guest's mapped memory without checking CPL */
4064gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
4065                                struct x86_exception *exception)
4066{
4067        return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, exception);
4068}
4069
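/*
 * Copy guest-virtual memory into 'val', translating and copying page by
 * page so that reads crossing page boundaries are handled correctly.
 */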
4070static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
4071                                      struct kvm_vcpu *vcpu, u32 access,
4072                                      struct x86_exception *exception)
4073{
4074        void *data = val;
4075        int r = X86EMUL_CONTINUE;
4076
4077        while (bytes) {
4078                gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access,
4079                                                            exception);
4080                unsigned offset = addr & (PAGE_SIZE-1);
4081                unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
4082                int ret;
4083
4084                if (gpa == UNMAPPED_GVA)
4085                        return X86EMUL_PROPAGATE_FAULT;
4086                ret = kvm_read_guest(vcpu->kvm, gpa, data, toread);
4087                if (ret < 0) {
4088                        r = X86EMUL_IO_NEEDED;
4089                        goto out;
4090                }
4091
4092                bytes -= toread;
4093                data += toread;
4094                addr += toread;
4095        }
4096out:
4097        return r;
4098}
4099
4100/* used for instruction fetching */
4101static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
4102                                gva_t addr, void *val, unsigned int bytes,
4103                                struct x86_exception *exception)
4104{
4105        struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4106        u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
4107
4108        return kvm_read_guest_virt_helper(addr, val, bytes, vcpu,
4109                                          access | PFERR_FETCH_MASK,
4110                                          exception);
4111}
4112
4113int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
4114                               gva_t addr, void *val, unsigned int bytes,
4115                               struct x86_exception *exception)
4116{
4117        struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4118        u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
4119
4120        return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
4121                                          exception);
4122}
4123EXPORT_SYMBOL_GPL(kvm_read_guest_virt);
4124
4125static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt,
4126                                      gva_t addr, void *val, unsigned int bytes,
4127                                      struct x86_exception *exception)
4128{
4129        struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4130        return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception);
4131}
4132
4133int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
4134                                       gva_t addr, void *val,
4135                                       unsigned int bytes,
4136                                       struct x86_exception *exception)
4137{
4138        struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4139        void *data = val;
4140        int r = X86EMUL_CONTINUE;
4141
4142        while (bytes) {
4143                gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr,
4144                                                             PFERR_WRITE_MASK,
4145                                                             exception);
4146                unsigned offset = addr & (PAGE_SIZE-1);
4147                unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
4148                int ret;
4149
4150                if (gpa == UNMAPPED_GVA)
4151                        return X86EMUL_PROPAGATE_FAULT;
4152                ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite);
4153                if (ret < 0) {
4154                        r = X86EMUL_IO_NEEDED;
4155                        goto out;
4156                }
4157
4158                bytes -= towrite;
4159                data += towrite;
4160                addr += towrite;
4161        }
4162out:
4163        return r;
4164}
4165EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
4166
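/*
 * Translate a guest virtual address for an emulated access.  Returns -1 on
 * a translation fault, 1 if the access is known to be MMIO (cached MMIO
 * gva/gpa or the APIC-access page), and 0 if it should first be tried as a
 * normal guest memory access.
 */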
4167static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
4168                                gpa_t *gpa, struct x86_exception *exception,
4169                                bool write)
4170{
4171        u32 access = ((kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0)
4172                | (write ? PFERR_WRITE_MASK : 0);
4173
4174        if (vcpu_match_mmio_gva(vcpu, gva)
4175            && !permission_fault(vcpu, vcpu->arch.walk_mmu,
4176                                 vcpu->arch.access, access)) {
4177                *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
4178                                        (gva & (PAGE_SIZE - 1));
4179                trace_vcpu_match_mmio(gva, *gpa, write, false);
4180                return 1;
4181        }
4182
4183        *gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
4184
4185        if (*gpa == UNMAPPED_GVA)
4186                return -1;
4187
4188        /* For APIC access vmexit */
4189        if ((*gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
4190                return 1;
4191
4192        if (vcpu_match_mmio_gpa(vcpu, *gpa)) {
4193                trace_vcpu_match_mmio(gva, *gpa, write, true);
4194                return 1;
4195        }
4196
4197        return 0;
4198}
4199
4200int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
4201                        const void *val, int bytes)
4202{
4203        int ret;
4204
4205        ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
4206        if (ret < 0)
4207                return 0;
4208        kvm_mmu_pte_write(vcpu, gpa, val, bytes);
4209        return 1;
4210}
4211
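/*
 * Hooks that factor the differences between emulated reads and writes out
 * of the common MMIO/memory access path below (see read_emultor and
 * write_emultor).
 */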
4212struct read_write_emulator_ops {
4213        int (*read_write_prepare)(struct kvm_vcpu *vcpu, void *val,
4214                                  int bytes);
4215        int (*read_write_emulate)(struct kvm_vcpu *vcpu, gpa_t gpa,
4216                                  void *val, int bytes);
4217        int (*read_write_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa,
4218                               int bytes, void *val);
4219        int (*read_write_exit_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa,
4220                                    void *val, int bytes);
4221        bool write;
4222};
4223
4224static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes)
4225{
4226        if (vcpu->mmio_read_completed) {
4227                trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes,
4228                               vcpu->mmio_fragments[0].gpa, *(u64 *)val);
4229                vcpu->mmio_read_completed = 0;
4230                return 1;
4231        }
4232
4233        return 0;
4234}
4235
4236static int read_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
4237                        void *val, int bytes)
4238{
4239        return !kvm_read_guest(vcpu->kvm, gpa, val, bytes);
4240}
4241
4242static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
4243                         void *val, int bytes)
4244{
4245        return emulator_write_phys(vcpu, gpa, val, bytes);
4246}
4247
4248static int write_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes, void *val)
4249{
4250        trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
4251        return vcpu_mmio_write(vcpu, gpa, bytes, val);
4252}
4253
4254static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
4255                          void *val, int bytes)
4256{
4257        trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0);
4258        return X86EMUL_IO_NEEDED;
4259}
4260
4261static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
4262                           void *val, int bytes)
4263{
4264        struct kvm_mmio_fragment *frag = &vcpu->mmio_fragments[0];
4265
4266        memcpy(vcpu->run->mmio.data, frag->data, min(8u, frag->len));
4267        return X86EMUL_CONTINUE;
4268}
4269
4270static const struct read_write_emulator_ops read_emultor = {
4271        .read_write_prepare = read_prepare,
4272        .read_write_emulate = read_emulate,
4273        .read_write_mmio = vcpu_mmio_read,
4274        .read_write_exit_mmio = read_exit_mmio,
4275};
4276
4277static const struct read_write_emulator_ops write_emultor = {
4278        .read_write_emulate = write_emulate,
4279        .read_write_mmio = write_mmio,
4280        .read_write_exit_mmio = write_exit_mmio,
4281        .write = true,
4282};
4283
4284static int emulator_read_write_onepage(unsigned long addr, void *val,
4285                                       unsigned int bytes,
4286                                       struct x86_exception *exception,
4287                                       struct kvm_vcpu *vcpu,
4288                                       const struct read_write_emulator_ops *ops)
4289{
4290        gpa_t gpa;
4291        int handled, ret;
4292        bool write = ops->write;
4293        struct kvm_mmio_fragment *frag;
4294
4295        ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write);
4296
4297        if (ret < 0)
4298                return X86EMUL_PROPAGATE_FAULT;
4299
4300        /* For APIC access vmexit */
4301        if (ret)
4302                goto mmio;
4303
4304        if (ops->read_write_emulate(vcpu, gpa, val, bytes))
4305                return X86EMUL_CONTINUE;
4306
4307mmio:
4308        /*
4309         * Is this MMIO handled locally?
4310         */
4311        handled = ops->read_write_mmio(vcpu, gpa, bytes, val);
4312        if (handled == bytes)
4313                return X86EMUL_CONTINUE;
4314
4315        gpa += handled;
4316        bytes -= handled;
4317        val += handled;
4318
4319        WARN_ON(vcpu->mmio_nr_fragments >= KVM_MAX_MMIO_FRAGMENTS);
4320        frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++];
4321        frag->gpa = gpa;
4322        frag->data = val;
4323        frag->len = bytes;
4324        return X86EMUL_CONTINUE;
4325}
4326
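/*
 * Top-level emulated read/write: accesses crossing a page boundary are
 * split per page, and if any part ended up as MMIO a KVM_EXIT_MMIO exit is
 * prepared for the first recorded fragment.
 */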
4327int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr,
4328                        void *val, unsigned int bytes,
4329                        struct x86_exception *exception,
4330                        const struct read_write_emulator_ops *ops)
4331{
4332        struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4333        gpa_t gpa;
4334        int rc;
4335
4336        if (ops->read_write_prepare &&
4337                  ops->read_write_prepare(vcpu, val, bytes))
4338                return X86EMUL_CONTINUE;
4339
4340        vcpu->mmio_nr_fragments = 0;
4341
4342        /* Crossing a page boundary? */
4343        if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
4344                int now;
4345
4346                now = -addr & ~PAGE_MASK;
4347                rc = emulator_read_write_onepage(addr, val, now, exception,
4348                                                 vcpu, ops);
4349
4350                if (rc != X86EMUL_CONTINUE)
4351                        return rc;
4352                addr += now;
4353                val += now;
4354                bytes -= now;
4355        }
4356
4357        rc = emulator_read_write_onepage(addr, val, bytes, exception,
4358                                         vcpu, ops);
4359        if (rc != X86EMUL_CONTINUE)
4360                return rc;
4361
4362        if (!vcpu->mmio_nr_fragments)
4363                return rc;
4364
4365        gpa = vcpu->mmio_fragments[0].gpa;
4366
4367        vcpu->mmio_needed = 1;
4368        vcpu->mmio_cur_fragment = 0;
4369
4370        vcpu->run->mmio.len = min(8u, vcpu->mmio_fragments[0].len);
4371        vcpu->run->mmio.is_write = vcpu->mmio_is_write = ops->write;
4372        vcpu->run->exit_reason = KVM_EXIT_MMIO;
4373        vcpu->run->mmio.phys_addr = gpa;
4374
4375        return ops->read_write_exit_mmio(vcpu, gpa, val, bytes);
4376}
4377
4378static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
4379                                  unsigned long addr,
4380                                  void *val,
4381                                  unsigned int bytes,
4382                                  struct x86_exception *exception)
4383{
4384        return emulator_read_write(ctxt, addr, val, bytes,
4385                                   exception, &read_emultor);
4386}
4387
4388int emulator_write_emulated(struct x86_emulate_ctxt *ctxt,
4389                            unsigned long addr,
4390                            const void *val,
4391                            unsigned int bytes,
4392                            struct x86_exception *exception)
4393{
4394        return emulator_read_write(ctxt, addr, (void *)val, bytes,
4395                                   exception, &write_emultor);
4396}
4397
4398#define CMPXCHG_TYPE(t, ptr, old, new) \
4399        (cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old))
4400
4401#ifdef CONFIG_X86_64
4402#  define CMPXCHG64(ptr, old, new) CMPXCHG_TYPE(u64, ptr, old, new)
4403#else
4404#  define CMPXCHG64(ptr, old, new) \
4405        (cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old))
4406#endif
4407
4408static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
4409                                     unsigned long addr,
4410                                     const void *old,
4411                                     const void *new,
4412                                     unsigned int bytes,
4413                                     struct x86_exception *exception)
4414{
4415        struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4416        gpa_t gpa;
4417        struct page *page;
4418        char *kaddr;
4419        bool exchanged;
4420
4421        /* the guest's cmpxchg8b has to be emulated atomically */
4422        if (bytes > 8 || (bytes & (bytes - 1)))
4423                goto emul_write;
4424
4425        gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
4426
4427        if (gpa == UNMAPPED_GVA ||
4428            (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
4429                goto emul_write;
4430
4431        if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
4432                goto emul_write;
4433
4434        page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
4435        if (is_error_page(page))
4436                goto emul_write;
4437
4438        kaddr = kmap_atomic(page);
4439        kaddr += offset_in_page(gpa);
4440        switch (bytes) {
4441        case 1:
4442                exchanged = CMPXCHG_TYPE(u8, kaddr, old, new);
4443                break;
4444        case 2:
4445                exchanged = CMPXCHG_TYPE(u16, kaddr, old, new);
4446                break;
4447        case 4:
4448                exchanged = CMPXCHG_TYPE(u32, kaddr, old, new);
4449                break;
4450        case 8:
4451                exchanged = CMPXCHG64(kaddr, old, new);
4452                break;
4453        default:
4454                BUG();
4455        }
4456        kunmap_atomic(kaddr);
4457        kvm_release_page_dirty(page);
4458
4459        if (!exchanged)
4460                return X86EMUL_CMPXCHG_FAILED;
4461
4462        mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT);
4463        kvm_mmu_pte_write(vcpu, gpa, new, bytes);
4464
4465        return X86EMUL_CONTINUE;
4466
4467emul_write:
4468        printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
4469
4470        return emulator_write_emulated(ctxt, addr, new, bytes, exception);
4471}
4472
4473static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
4474{
4475        /* TODO: String I/O for in-kernel device */
4476        int r;
4477
4478        if (vcpu->arch.pio.in)
4479                r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port,
4480                                    vcpu->arch.pio.size, pd);
4481        else
4482                r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
4483                                     vcpu->arch.pio.port, vcpu->arch.pio.size,
4484                                     pd);
4485        return r;
4486}
4487
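/*
 * Record a port I/O request.  Returns 1 if an in-kernel device handled the
 * port (for "in", the data is already in pio_data), or 0 after setting up a
 * KVM_EXIT_IO exit that userspace must complete.
 */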
4488static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size,
4489                               unsigned short port, void *val,
4490                               unsigned int count, bool in)
4491{
4492        trace_kvm_pio(!in, port, size, count);
4493
4494        vcpu->arch.pio.port = port;
4495        vcpu->arch.pio.in = in;
4496        vcpu->arch.pio.count = count;
4497        vcpu->arch.pio.size = size;
4498
4499        if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
4500                vcpu->arch.pio.count = 0;
4501                return 1;
4502        }
4503
4504        vcpu->run->exit_reason = KVM_EXIT_IO;
4505        vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
4506        vcpu->run->io.size = size;
4507        vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
4508        vcpu->run->io.count = count;
4509        vcpu->run->io.port = port;
4510
4511        return 0;
4512}
4513
4514static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
4515                                    int size, unsigned short port, void *val,
4516                                    unsigned int count)
4517{
4518        struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4519        int ret;
4520
4521        if (vcpu->arch.pio.count)
4522                goto data_avail;
4523
4524        ret = emulator_pio_in_out(vcpu, size, port, val, count, true);
4525        if (ret) {
4526data_avail:
4527                memcpy(val, vcpu->arch.pio_data, size * count);
4528                vcpu->arch.pio.count = 0;
4529                return 1;
4530        }
4531
4532        return 0;
4533}
4534
4535static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
4536                                     int size, unsigned short port,
4537                                     const void *val, unsigned int count)
4538{
4539        struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4540
4541        memcpy(vcpu->arch.pio_data, val, size * count);
4542        return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false);
4543}
4544
4545static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
4546{
4547        return kvm_x86_ops->get_segment_base(vcpu, seg);
4548}
4549
4550static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address)
4551{
4552        kvm_mmu_invlpg(emul_to_vcpu(ctxt), address);
4553}
4554
4555int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
4556{
4557        if (!need_emulate_wbinvd(vcpu))
4558                return X86EMUL_CONTINUE;
4559
4560        if (kvm_x86_ops->has_wbinvd_exit()) {
4561                int cpu = get_cpu();
4562
4563                cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
4564                smp_call_function_many(vcpu->arch.wbinvd_dirty_mask,
4565                                wbinvd_ipi, NULL, 1);
4566                put_cpu();
4567                cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
4568        } else
4569                wbinvd();
4570        return X86EMUL_CONTINUE;
4571}
4572EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
4573
4574static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt)
4575{
4576        kvm_emulate_wbinvd(emul_to_vcpu(ctxt));
4577}
4578
4579static int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
4580{
4581        return _kvm_get_dr(emul_to_vcpu(ctxt), dr, dest);
4582}
4583
4584static int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
4585{
4586
4587        return __kvm_set_dr(emul_to_vcpu(ctxt), dr, value);
4588}
4589
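/*
 * mk_cr_64() merges a 32-bit value written by the guest into the current
 * 64-bit control register value, preserving the upper half.  With purely
 * illustrative values, curr_cr == 0x0000000100050033 and
 * new_val == 0x80050031 give 0x0000000180050031.
 */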
4590static u64 mk_cr_64(u64 curr_cr, u32 new_val)
4591{
4592        return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
4593}
4594
4595static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr)
4596{
4597        struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4598        unsigned long value;
4599
4600        switch (cr) {
4601        case 0:
4602                value = kvm_read_cr0(vcpu);
4603                break;
4604        case 2:
4605                value = vcpu->arch.cr2;
4606                break;
4607        case 3:
4608                value = kvm_read_cr3(vcpu);
4609                break;
4610        case 4:
4611                value = kvm_read_cr4(vcpu);
4612                break;
4613        case 8:
4614                value = kvm_get_cr8(vcpu);
4615                break;
4616        default:
4617                kvm_err("%s: unexpected cr %u\n", __func__, cr);
4618                return 0;
4619        }
4620
4621        return value;
4622}
4623
4624static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
4625{
4626        struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4627        int res = 0;
4628
4629        switch (cr) {
4630        case 0:
4631                res = kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
4632                break;
4633        case 2:
4634                vcpu->arch.cr2 = val;
4635                break;
4636        case 3:
4637                res = kvm_set_cr3(vcpu, val);
4638                break;
4639        case 4:
4640                res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
4641                break;
4642        case 8:
4643                res = kvm_set_cr8(vcpu, val);
4644                break;
4645        default:
4646                kvm_err("%s: unexpected cr %u\n", __func__, cr);
4647                res = -1;
4648        }
4649
4650        return res;
4651}
4652
4653static void emulator_set_rflags(struct x86_emulate_ctxt *ctxt, ulong val)
4654{
4655        kvm_set_rflags(emul_to_vcpu(ctxt), val);
4656}
4657
4658static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
4659{
4660        return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt));
4661}
4662
4663static void emulator_get_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
4664{
4665        kvm_x86_ops->get_gdt(emul_to_vcpu(ctxt), dt);
4666}
4667
4668static void emulator_get_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
4669{
4670        kvm_x86_ops->get_idt(emul_to_vcpu(ctxt), dt);
4671}
4672
4673static void emulator_set_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
4674{
4675        kvm_x86_ops->set_gdt(emul_to_vcpu(ctxt), dt);
4676}
4677
4678static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
4679{
4680        kvm_x86_ops->set_idt(emul_to_vcpu(ctxt), dt);
4681}
4682
4683static unsigned long emulator_get_cached_segment_base(
4684        struct x86_emulate_ctxt *ctxt, int seg)
4685{
4686        return get_segment_base(emul_to_vcpu(ctxt), seg);
4687}
4688
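/*
 * The two helpers below translate between KVM's expanded segment
 * representation and the packed descriptor layout the emulator uses.
 * With the granularity bit set the descriptor limit is in 4K units, so
 * e.g. a flat 4 GiB segment (limit 0xffffffff, G=1) is stored as 0xfffff
 * here and expanded back as (0xfffff << 12) | 0xfff == 0xffffffff in
 * emulator_set_segment().
 */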
4689static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector,
4690                                 struct desc_struct *desc, u32 *base3,
4691                                 int seg)
4692{
4693        struct kvm_segment var;
4694
4695        kvm_get_segment(emul_to_vcpu(ctxt), &var, seg);
4696        *selector = var.selector;
4697
4698        if (var.unusable) {
4699                memset(desc, 0, sizeof(*desc));
4700                return false;
4701        }
4702
4703        if (var.g)
4704                var.limit >>= 12;
4705        set_desc_limit(desc, var.limit);
4706        set_desc_base(desc, (unsigned long)var.base);
4707#ifdef CONFIG_X86_64
4708        if (base3)
4709                *base3 = var.base >> 32;
4710#endif
4711        desc->type = var.type;
4712        desc->s = var.s;
4713        desc->dpl = var.dpl;
4714        desc->p = var.present;
4715        desc->avl = var.avl;
4716        desc->l = var.l;
4717        desc->d = var.db;
4718        desc->g = var.g;
4719
4720        return true;
4721}
4722
4723static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector,
4724                                 struct desc_struct *desc, u32 base3,
4725                                 int seg)
4726{
4727        struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4728        struct kvm_segment var;
4729
4730        var.selector = selector;
4731        var.base = get_desc_base(desc);
4732#ifdef CONFIG_X86_64
4733        var.base |= ((u64)base3) << 32;
4734#endif
4735        var.limit = get_desc_limit(desc);
4736        if (desc->g)
4737                var.limit = (var.limit << 12) | 0xfff;
4738        var.type = desc->type;
4740        var.dpl = desc->dpl;
4741        var.db = desc->d;
4742        var.s = desc->s;
4743        var.l = desc->l;
4744        var.g = desc->g;
4745        var.avl = desc->avl;
4746        var.present = desc->p;
4747        var.unusable = !var.present;
4748        var.padding = 0;
4749
4750        kvm_set_segment(vcpu, &var, seg);
4751        return;
4752}
4753
4754static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
4755                            u32 msr_index, u64 *pdata)
4756{
4757        return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata);
4758}
4759
4760static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
4761                            u32 msr_index, u64 data)
4762{
4763        struct msr_data msr;
4764
4765        msr.data = data;
4766        msr.index = msr_index;
4767        msr.host_initiated = false;
4768        return kvm_set_msr(emul_to_vcpu(ctxt), &msr);
4769}
4770
4771static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt,
4772                             u32 pmc, u64 *pdata)
4773{
4774        return kvm_pmu_read_pmc(emul_to_vcpu(ctxt), pmc, pdata);
4775}
4776
4777static void emulator_halt(struct x86_emulate_ctxt *ctxt)
4778{
4779        emul_to_vcpu(ctxt)->arch.halt_request = 1;
4780}
4781
4782static void emulator_get_fpu(struct x86_emulate_ctxt *ctxt)
4783{
4784        preempt_disable();
4785        kvm_load_guest_fpu(emul_to_vcpu(ctxt));
4786        /*
4787         * CR0.TS may reference the host fpu state, not the guest fpu state,
4788         * so it may be set at this point.
4789         */
4790        clts();
4791}
4792
4793static void emulator_put_fpu(struct x86_emulate_ctxt *ctxt)
4794{
4795        preempt_enable();
4796}
4797
4798static int emulator_intercept(struct x86_emulate_ctxt *ctxt,
4799                              struct x86_instruction_info *info,
4800                              enum x86_intercept_stage stage)
4801{
4802        return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage);
4803}
4804
4805static void emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
4806                               u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
4807{
4808        kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx);
4809}
4810
4811static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg)
4812{
4813        return kvm_register_read(emul_to_vcpu(ctxt), reg);
4814}
4815
4816static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulong val)
4817{
4818        kvm_register_write(emul_to_vcpu(ctxt), reg, val);
4819}
4820
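/*
 * Callback table handed to the shared x86 instruction emulator.  Every
 * access the emulator needs (registers, memory, port I/O, MSRs, control
 * and debug registers, segments, FPU, ...) is routed back into per-vcpu
 * KVM state through the small wrappers above.
 */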
4821static const struct x86_emulate_ops emulate_ops = {
4822        .read_gpr            = emulator_read_gpr,
4823        .write_gpr           = emulator_write_gpr,
4824        .read_std            = kvm_read_guest_virt_system,
4825        .write_std           = kvm_write_guest_virt_system,
4826        .fetch               = kvm_fetch_guest_virt,
4827        .read_emulated       = emulator_read_emulated,
4828        .write_emulated      = emulator_write_emulated,
4829        .cmpxchg_emulated    = emulator_cmpxchg_emulated,
4830        .invlpg              = emulator_invlpg,
4831        .pio_in_emulated     = emulator_pio_in_emulated,
4832        .pio_out_emulated    = emulator_pio_out_emulated,
4833        .get_segment         = emulator_get_segment,
4834        .set_segment         = emulator_set_segment,
4835        .get_cached_segment_base = emulator_get_cached_segment_base,
4836        .get_gdt             = emulator_get_gdt,
4837        .get_idt             = emulator_get_idt,
4838        .set_gdt             = emulator_set_gdt,
4839        .set_idt             = emulator_set_idt,
4840        .get_cr              = emulator_get_cr,
4841        .set_cr              = emulator_set_cr,
4842        .set_rflags          = emulator_set_rflags,
4843        .cpl                 = emulator_get_cpl,
4844        .get_dr              = emulator_get_dr,
4845        .set_dr              = emulator_set_dr,
4846        .set_msr             = emulator_set_msr,
4847        .get_msr             = emulator_get_msr,
4848        .read_pmc            = emulator_read_pmc,
4849        .halt                = emulator_halt,
4850        .wbinvd              = emulator_wbinvd,
4851        .fix_hypercall       = emulator_fix_hypercall,
4852        .get_fpu             = emulator_get_fpu,
4853        .put_fpu             = emulator_put_fpu,
4854        .intercept           = emulator_intercept,
4855        .get_cpuid           = emulator_get_cpuid,
4856};
4857
4858static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
4859{
4860        u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu, mask);
4861        /*
4862         * An sti; sti sequence only disables interrupts for the first
4863         * instruction.  So, if the last instruction, be it emulated or
4864         * not, left the system with the INT_STI flag enabled, it
4865         * means that the last instruction was an sti.  We should not
4866         * leave the flag on in this case.  The same goes for mov ss.
4867         */
4868        if (!(int_shadow & mask))
4869                kvm_x86_ops->set_interrupt_shadow(vcpu, mask);
4870}
4871
4872static void inject_emulated_exception(struct kvm_vcpu *vcpu)
4873{
4874        struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
4875        if (ctxt->exception.vector == PF_VECTOR)
4876                kvm_propagate_fault(vcpu, &ctxt->exception);
4877        else if (ctxt->exception.error_code_valid)
4878                kvm_queue_exception_e(vcpu, ctxt->exception.vector,
4879                                      ctxt->exception.error_code);
4880        else
4881                kvm_queue_exception(vcpu, ctxt->exception.vector);
4882}
4883
4884static void init_decode_cache(struct x86_emulate_ctxt *ctxt)
4885{
4886        memset(&ctxt->opcode_len, 0,
4887               (void *)&ctxt->_regs - (void *)&ctxt->opcode_len);
4888
4889        ctxt->fetch.start = 0;
4890        ctxt->fetch.end = 0;
4891        ctxt->io_read.pos = 0;
4892        ctxt->io_read.end = 0;
4893        ctxt->mem_read.pos = 0;
4894        ctxt->mem_read.end = 0;
4895}
4896
4897static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
4898{
4899        struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
4900        int cs_db, cs_l;
4901
4902        kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
4903
4904        ctxt->eflags = kvm_get_rflags(vcpu);
4905        ctxt->eip = kvm_rip_read(vcpu);
4906        ctxt->mode = (!is_protmode(vcpu))               ? X86EMUL_MODE_REAL :
4907                     (ctxt->eflags & X86_EFLAGS_VM)     ? X86EMUL_MODE_VM86 :
4908                     cs_l                               ? X86EMUL_MODE_PROT64 :
4909                     cs_db                              ? X86EMUL_MODE_PROT32 :
4910                                                          X86EMUL_MODE_PROT16;
4911        ctxt->guest_mode = is_guest_mode(vcpu);
4912
4913        init_decode_cache(ctxt);
4914        vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
4915}
4916
4917int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
4918{
4919        struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
4920        int ret;
4921
4922        init_emulate_ctxt(vcpu);
4923
4924        ctxt->op_bytes = 2;
4925        ctxt->ad_bytes = 2;
4926        ctxt->_eip = ctxt->eip + inc_eip;
4927        ret = emulate_int_real(ctxt, irq);
4928
4929        if (ret != X86EMUL_CONTINUE)
4930                return EMULATE_FAIL;
4931
4932        ctxt->eip = ctxt->_eip;
4933        kvm_rip_write(vcpu, ctxt->eip);
4934        kvm_set_rflags(vcpu, ctxt->eflags);
4935
4936        if (irq == NMI_VECTOR)
4937                vcpu->arch.nmi_pending = 0;
4938        else
4939                vcpu->arch.interrupt.pending = false;
4940
4941        return EMULATE_DONE;
4942}
4943EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
4944
4945static int handle_emulation_failure(struct kvm_vcpu *vcpu)
4946{
4947        int r = EMULATE_DONE;
4948
4949        ++vcpu->stat.insn_emulation_fail;
4950        trace_kvm_emulate_insn_failed(vcpu);
4951        if (!is_guest_mode(vcpu)) {
4952                vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
4953                vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
4954                vcpu->run->internal.ndata = 0;
4955                r = EMULATE_FAIL;
4956        }
4957        kvm_queue_exception(vcpu, UD_VECTOR);
4958
4959        return r;
4960}
4961
4962static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2,
4963                                  bool write_fault_to_shadow_pgtable,
4964                                  int emulation_type)
4965{
4966        gpa_t gpa = cr2;
4967        pfn_t pfn;
4968
4969        if (emulation_type & EMULTYPE_NO_REEXECUTE)
4970                return false;
4971
4972        if (!vcpu->arch.mmu.direct_map) {
4973                /*
4974                 * Write permission should be allowed since only
4975                 * write accesses need to be emulated.
4976                 */
4977                gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
4978
4979                /*
4980                 * If the mapping is invalid in the guest, let the CPU
4981                 * retry it to generate the fault.
4982                 */
4983                if (gpa == UNMAPPED_GVA)
4984                        return true;
4985        }
4986
4987        /*
4988         * Do not retry the unhandleable instruction if it faults on the
4989         * readonly host memory, otherwise it will go into an infinite loop:
4990         * retry instruction -> write #PF -> emulation fail -> retry
4991         * instruction -> ...
4992         */
4993        pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa));
4994
4995        /*
4996         * If the instruction failed on the error pfn, it cannot be fixed;
4997         * report the error to userspace.
4998         */
4999        if (is_error_noslot_pfn(pfn))
5000                return false;
5001
5002        kvm_release_pfn_clean(pfn);
5003
5004        /* The instructions are well-emulated on direct mmu. */
5005        if (vcpu->arch.mmu.direct_map) {
5006                unsigned int indirect_shadow_pages;
5007
5008                spin_lock(&vcpu->kvm->mmu_lock);
5009                indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages;
5010                spin_unlock(&vcpu->kvm->mmu_lock);
5011
5012                if (indirect_shadow_pages)
5013                        kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
5014
5015                return true;
5016        }
5017
5018        /*
5019         * If emulation was due to an access to a shadowed page table
5020         * and it failed, try to unshadow the page and re-enter the
5021         * guest to let the CPU execute the instruction.
5022         */
5023        kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
5024
5025        /*
5026         * If the access faults on its own page table, it cannot
5027         * be fixed by unprotecting the shadow page, and it should
5028         * be reported to userspace.
5029         */
5030        return !write_fault_to_shadow_pgtable;
5031}
5032
5033static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
5034                              unsigned long cr2,  int emulation_type)
5035{
5036        struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
5037        unsigned long last_retry_eip, last_retry_addr, gpa = cr2;
5038
5039        last_retry_eip = vcpu->arch.last_retry_eip;
5040        last_retry_addr = vcpu->arch.last_retry_addr;
5041
5042        /*
5043         * If the emulation is caused by #PF and it is not a page-table
5044         * writing instruction, it means the VM-EXIT is caused by shadow
5045         * page protection; we can zap the shadow page and retry this
5046         * instruction directly.
5047         *
5048         * Note: if the guest uses a non-page-table modifying instruction
5049         * on the PDE that points to the instruction, then we will unmap
5050         * the instruction and go into an infinite loop.  So we cache the
5051         * last retried eip and the last fault address; if we meet the eip
5052         * and the address again, we can break out of the potential infinite
5053         * loop.
5054         */
5055        vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0;
5056
5057        if (!(emulation_type & EMULTYPE_RETRY))
5058                return false;
5059
5060        if (x86_page_table_writing_insn(ctxt))
5061                return false;
5062
5063        if (ctxt->eip == last_retry_eip && last_retry_addr == cr2)
5064                return false;
5065
5066        vcpu->arch.last_retry_eip = ctxt->eip;
5067        vcpu->arch.last_retry_addr = cr2;
5068
5069        if (!vcpu->arch.mmu.direct_map)
5070                gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
5071
5072        kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
5073
5074        return true;
5075}
5076
5077static int complete_emulated_mmio(struct kvm_vcpu *vcpu);
5078static int complete_emulated_pio(struct kvm_vcpu *vcpu);
5079
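/*
 * Scan the four hardware breakpoints described by dr7/db for slots that
 * are enabled, whose R/W+LEN nibble equals @type and whose address in
 * db[] equals @addr, and return the matching DR6.B0-B3 bits.  DR7 is
 * decoded as in hardware: the local/global enable pair for breakpoint n
 * lives in bits 2n and 2n+1, and its R/W+LEN nibble in bits 16+4n..19+4n.
 * For example, an enabled execute breakpoint in slot 0 (dr7 bit 0 set,
 * R/W0 == LEN0 == 0) that matches an instruction address yields dr6 == 1.
 */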
5080static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7,
5081                                unsigned long *db)
5082{
5083        u32 dr6 = 0;
5084        int i;
5085        u32 enable, rwlen;
5086
5087        enable = dr7;
5088        rwlen = dr7 >> 16;
5089        for (i = 0; i < 4; i++, enable >>= 2, rwlen >>= 4)
5090                if ((enable & 3) && (rwlen & 15) == type && db[i] == addr)
5091                        dr6 |= (1 << i);
5092        return dr6;
5093}
5094
5095static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, int *r)
5096{
5097        struct kvm_run *kvm_run = vcpu->run;
5098
5099        /*
5100         * Use the "raw" value to see if TF was passed to the processor.
5101         * Note that the new value of the flags has not been saved yet.
5102         *
5103         * This is correct even for TF set by the guest, because "the
5104         * processor will not generate this exception after the instruction
5105         * that sets the TF flag".
5106         */
5107        unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
5108
5109        if (unlikely(rflags & X86_EFLAGS_TF)) {
5110                if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
5111                        kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1;
5112                        kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip;
5113                        kvm_run->debug.arch.exception = DB_VECTOR;
5114                        kvm_run->exit_reason = KVM_EXIT_DEBUG;
5115                        *r = EMULATE_USER_EXIT;
5116                } else {
5117                        vcpu->arch.emulate_ctxt.eflags &= ~X86_EFLAGS_TF;
5118                        /*
5119                         * "Certain debug exceptions may clear bits 0-3.  The
5120                         * remaining contents of the DR6 register are never
5121                         * cleared by the processor".
5122                         */
5123                        vcpu->arch.dr6 &= ~15;
5124                        vcpu->arch.dr6 |= DR6_BS;
5125                        kvm_queue_exception(vcpu, DB_VECTOR);
5126                }
5127        }
5128}
5129
5130static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
5131{
5132        struct kvm_run *kvm_run = vcpu->run;
5133        unsigned long eip = vcpu->arch.emulate_ctxt.eip;
5134        u32 dr6 = 0;
5135
5136        if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) &&
5137            (vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) {
5138                dr6 = kvm_vcpu_check_hw_bp(eip, 0,
5139                                           vcpu->arch.guest_debug_dr7,
5140                                           vcpu->arch.eff_db);
5141
5142                if (dr6 != 0) {
5143                        kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1;
5144                        kvm_run->debug.arch.pc = kvm_rip_read(vcpu) +
5145                                get_segment_base(vcpu, VCPU_SREG_CS);
5146
5147                        kvm_run->debug.arch.exception = DB_VECTOR;
5148                        kvm_run->exit_reason = KVM_EXIT_DEBUG;
5149                        *r = EMULATE_USER_EXIT;
5150                        return true;
5151                }
5152        }
5153
5154        if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK)) {
5155                dr6 = kvm_vcpu_check_hw_bp(eip, 0,
5156                                           vcpu->arch.dr7,
5157                                           vcpu->arch.db);
5158
5159                if (dr6 != 0) {
5160                        vcpu->arch.dr6 &= ~15;
5161                        vcpu->arch.dr6 |= dr6;
5162                        kvm_queue_exception(vcpu, DB_VECTOR);
5163                        *r = EMULATE_DONE;
5164                        return true;
5165                }
5166        }
5167
5168        return false;
5169}
5170
5171int x86_emulate_instruction(struct kvm_vcpu *vcpu,
5172                            unsigned long cr2,
5173                            int emulation_type,
5174                            void *insn,
5175                            int insn_len)
5176{
5177        int r;
5178        struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
5179        bool writeback = true;
5180        bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
5181
5182        /*
5183         * Clear write_fault_to_shadow_pgtable here to ensure it is
5184         * never reused.
5185         */
5186        vcpu->arch.write_fault_to_shadow_pgtable = false;
5187        kvm_clear_exception_queue(vcpu);
5188
5189        if (!(emulation_type & EMULTYPE_NO_DECODE)) {
5190                init_emulate_ctxt(vcpu);
5191
5192                /*
5193                 * We will reenter on the same instruction since
5194                 * we do not set complete_userspace_io.  This does not
5195                 * handle watchpoints yet, those would be handled in
5196                 * the emulate_ops.
5197                 */
5198                if (kvm_vcpu_check_breakpoint(vcpu, &r))
5199                        return r;
5200
5201                ctxt->interruptibility = 0;
5202                ctxt->have_exception = false;
5203                ctxt->perm_ok = false;
5204
5205                ctxt->ud = emulation_type & EMULTYPE_TRAP_UD;
5206
5207                r = x86_decode_insn(ctxt, insn, insn_len);
5208
5209                trace_kvm_emulate_insn_start(vcpu);
5210                ++vcpu->stat.insn_emulation;
5211                if (r != EMULATION_OK)  {
5212                        if (emulation_type & EMULTYPE_TRAP_UD)
5213                                return EMULATE_FAIL;
5214                        if (reexecute_instruction(vcpu, cr2, write_fault_to_spt,
5215                                                emulation_type))
5216                                return EMULATE_DONE;
5217                        if (emulation_type & EMULTYPE_SKIP)
5218                                return EMULATE_FAIL;
5219                        return handle_emulation_failure(vcpu);
5220                }
5221        }
5222
5223        if (emulation_type & EMULTYPE_SKIP) {
5224                kvm_rip_write(vcpu, ctxt->_eip);
5225                return EMULATE_DONE;
5226        }
5227
5228        if (retry_instruction(ctxt, cr2, emulation_type))
5229                return EMULATE_DONE;
5230
5231        /* this is needed for the vmware backdoor interface to work since it
5232           changes register values during the IO operation */
5233        if (vcpu->arch.emulate_regs_need_sync_from_vcpu) {
5234                vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
5235                emulator_invalidate_register_cache(ctxt);
5236        }
5237
5238restart:
5239        r = x86_emulate_insn(ctxt);
5240
5241        if (r == EMULATION_INTERCEPTED)
5242                return EMULATE_DONE;
5243
5244        if (r == EMULATION_FAILED) {
5245                if (reexecute_instruction(vcpu, cr2, write_fault_to_spt,
5246                                        emulation_type))
5247                        return EMULATE_DONE;
5248
5249                return handle_emulation_failure(vcpu);
5250        }
5251
5252        if (ctxt->have_exception) {
5253                inject_emulated_exception(vcpu);
5254                r = EMULATE_DONE;
5255        } else if (vcpu->arch.pio.count) {
5256                if (!vcpu->arch.pio.in) {
5257                        /* FIXME: return into emulator if single-stepping.  */
5258                        vcpu->arch.pio.count = 0;
5259                } else {
5260                        writeback = false;
5261                        vcpu->arch.complete_userspace_io = complete_emulated_pio;
5262                }
5263                r = EMULATE_USER_EXIT;
5264        } else if (vcpu->mmio_needed) {
5265                if (!vcpu->mmio_is_write)
5266                        writeback = false;
5267                r = EMULATE_USER_EXIT;
5268                vcpu->arch.complete_userspace_io = complete_emulated_mmio;
5269        } else if (r == EMULATION_RESTART)
5270                goto restart;
5271        else
5272                r = EMULATE_DONE;
5273
5274        if (writeback) {
5275                toggle_interruptibility(vcpu, ctxt->interruptibility);
5276                kvm_make_request(KVM_REQ_EVENT, vcpu);
5277                vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
5278                kvm_rip_write(vcpu, ctxt->eip);
5279                if (r == EMULATE_DONE)
5280                        kvm_vcpu_check_singlestep(vcpu, &r);
5281                kvm_set_rflags(vcpu, ctxt->eflags);
5282        } else
5283                vcpu->arch.emulate_regs_need_sync_to_vcpu = true;
5284
5285        return r;
5286}
5287EXPORT_SYMBOL_GPL(x86_emulate_instruction);
5288
5289int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port)
5290{
5291        unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX);
5292        int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt,
5293                                            size, port, &val, 1);
5294        /* do not return to emulator after return from userspace */
5295        vcpu->arch.pio.count = 0;
5296        return ret;
5297}
5298EXPORT_SYMBOL_GPL(kvm_fast_pio_out);
5299
5300static void tsc_bad(void *info)
5301{
5302        __this_cpu_write(cpu_tsc_khz, 0);
5303}
5304
5305static void tsc_khz_changed(void *data)
5306{
5307        struct cpufreq_freqs *freq = data;
5308        unsigned long khz = 0;
5309
5310        if (data)
5311                khz = freq->new;
5312        else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
5313                khz = cpufreq_quick_get(raw_smp_processor_id());
5314        if (!khz)
5315                khz = tsc_khz;
5316        __this_cpu_write(cpu_tsc_khz, khz);
5317}
5318
5319static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
5320                                     void *data)
5321{
5322        struct cpufreq_freqs *freq = data;
5323        struct kvm *kvm;
5324        struct kvm_vcpu *vcpu;
5325        int i, send_ipi = 0;
5326
5327        /*
5328         * We allow guests to temporarily run on slowing clocks,
5329         * provided we notify them after, or to run on accelerating
5330         * clocks, provided we notify them before.  Thus time never
5331         * goes backwards.
5332         *
5333         * However, we have a problem.  We can't atomically update
5334         * the frequency of a given CPU from this function; it is
5335         * merely a notifier, which can be called from any CPU.
5336         * Changing the TSC frequency at arbitrary points in time
5337         * requires a recomputation of local variables related to
5338         * the TSC for each VCPU.  We must flag these local variables
5339         * to be updated and be sure the update takes place with the
5340         * new frequency before any guests proceed.
5341         *
5342         * Unfortunately, the combination of hotplug CPU and frequency
5343         * change creates an intractable locking scenario; the order
5344         * of when these callouts happen is undefined with respect to
5345         * CPU hotplug, and they can race with each other.  As such,
5346         * merely setting per_cpu(cpu_tsc_khz) = X during a hotadd is
5347         * undefined; you can actually have a CPU frequency change take
5348         * place in between the computation of X and the setting of the
5349         * variable.  To protect against this problem, all updates of
5350         * the per_cpu tsc_khz variable are done in an interrupt
5351         * protected IPI, and all callers wishing to update the value
5352         * must wait for a synchronous IPI to complete (which is trivial
5353         * if the caller is on the CPU already).  This establishes the
5354         * necessary total order on variable updates.
5355         *
5356         * Note that because a guest time update may take place
5357         * anytime after the setting of the VCPU's request bit, the
5358         * correct TSC value must be set before the request.  However,
5359         * to ensure the update actually makes it to any guest which
5360         * starts running in hardware virtualization between the set
5361         * and the acquisition of the spinlock, we must also ping the
5362         * CPU after setting the request bit.
5363         *
5364         */
5365
5366        if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
5367                return 0;
5368        if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
5369                return 0;
5370
5371        smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
5372
5373        spin_lock(&kvm_lock);
5374        list_for_each_entry(kvm, &vm_list, vm_list) {
5375                kvm_for_each_vcpu(i, vcpu, kvm) {
5376                        if (vcpu->cpu != freq->cpu)
5377                                continue;
5378                        kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
5379                        if (vcpu->cpu != smp_processor_id())
5380                                send_ipi = 1;
5381                }
5382        }
5383        spin_unlock(&kvm_lock);
5384
5385        if (freq->old < freq->new && send_ipi) {
5386                /*
5387                 * We upscale the frequency.  We must make sure the guest
5388                 * doesn't see old kvmclock values while running with
5389                 * the new frequency, otherwise we risk the guest seeing
5390                 * time go backwards.
5391                 *
5392                 * In case we update the frequency for another cpu
5393                 * (which might be in guest context) send an interrupt
5394                 * to kick the cpu out of guest context.  Next time
5395                 * guest context is entered kvmclock will be updated,
5396                 * so the guest will not see stale values.
5397                 */
5398                smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
5399        }
5400        return 0;
5401}
5402
5403static struct notifier_block kvmclock_cpufreq_notifier_block = {
5404        .notifier_call  = kvmclock_cpufreq_notifier
5405};
5406
5407static int kvmclock_cpu_notifier(struct notifier_block *nfb,
5408                                        unsigned long action, void *hcpu)
5409{
5410        unsigned int cpu = (unsigned long)hcpu;
5411
5412        switch (action) {
5413                case CPU_ONLINE:
5414                case CPU_DOWN_FAILED:
5415                        smp_call_function_single(cpu, tsc_khz_changed, NULL, 1);
5416                        break;
5417                case CPU_DOWN_PREPARE:
5418                        smp_call_function_single(cpu, tsc_bad, NULL, 1);
5419                        break;
5420        }
5421        return NOTIFY_OK;
5422}
5423
5424static struct notifier_block kvmclock_cpu_notifier_block = {
5425        .notifier_call  = kvmclock_cpu_notifier,
5426        .priority = -INT_MAX
5427};
5428
5429static void kvm_timer_init(void)
5430{
5431        int cpu;
5432
5433        max_tsc_khz = tsc_khz;
5434
5435        cpu_notifier_register_begin();
5436        if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
5437#ifdef CONFIG_CPU_FREQ
5438                struct cpufreq_policy policy;
5439                memset(&policy, 0, sizeof(policy));
5440                cpu = get_cpu();
5441                cpufreq_get_policy(&policy, cpu);
5442                if (policy.cpuinfo.max_freq)
5443                        max_tsc_khz = policy.cpuinfo.max_freq;
5444                put_cpu();
5445#endif
5446                cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
5447                                          CPUFREQ_TRANSITION_NOTIFIER);
5448        }
5449        pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz);
5450        for_each_online_cpu(cpu)
5451                smp_call_function_single(cpu, tsc_khz_changed, NULL, 1);
5452
5453        __register_hotcpu_notifier(&kvmclock_cpu_notifier_block);
5454        cpu_notifier_register_done();
5455
5456}
5457
5458static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
5459
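/*
 * current_vcpu is set around guest NMI handling (see
 * kvm_before_handle_nmi()/kvm_after_handle_nmi() below) so that the perf
 * callbacks in kvm_guest_cbs can attribute PMI samples taken while a
 * vcpu was running to guest context and to the guest RIP.
 */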
5460int kvm_is_in_guest(void)
5461{
5462        return __this_cpu_read(current_vcpu) != NULL;
5463}
5464
5465static int kvm_is_user_mode(void)
5466{
5467        int user_mode = 3;
5468
5469        if (__this_cpu_read(current_vcpu))
5470                user_mode = kvm_x86_ops->get_cpl(__this_cpu_read(current_vcpu));
5471
5472        return user_mode != 0;
5473}
5474
5475static unsigned long kvm_get_guest_ip(void)
5476{
5477        unsigned long ip = 0;
5478
5479        if (__this_cpu_read(current_vcpu))
5480                ip = kvm_rip_read(__this_cpu_read(current_vcpu));
5481
5482        return ip;
5483}
5484
5485static struct perf_guest_info_callbacks kvm_guest_cbs = {
5486        .is_in_guest            = kvm_is_in_guest,
5487        .is_user_mode           = kvm_is_user_mode,
5488        .get_guest_ip           = kvm_get_guest_ip,
5489};
5490
5491void kvm_before_handle_nmi(struct kvm_vcpu *vcpu)
5492{
5493        __this_cpu_write(current_vcpu, vcpu);
5494}
5495EXPORT_SYMBOL_GPL(kvm_before_handle_nmi);
5496
5497void kvm_after_handle_nmi(struct kvm_vcpu *vcpu)
5498{
5499        __this_cpu_write(current_vcpu, NULL);
5500}
5501EXPORT_SYMBOL_GPL(kvm_after_handle_nmi);
5502
5503static void kvm_set_mmio_spte_mask(void)
5504{
5505        u64 mask;
5506        int maxphyaddr = boot_cpu_data.x86_phys_bits;
5507
5508        /*
5509         * Set the reserved bits and the present bit of a paging-structure
5510         * entry to generate a page fault with PFER.RSV = 1.
5511         */
5512        /* Mask the reserved physical address bits. */
5513        mask = ((1ull << (51 - maxphyaddr + 1)) - 1) << maxphyaddr;
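        /*
         * For example, with maxphyaddr == 36 the expression above sets
         * bits 51:36, i.e. all reserved physical-address bits below
         * bit 52.
         */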
5514
5515        /* Bit 62 is always reserved for 32bit host. */
5516        mask |= 0x3ull << 62;
5517
5518        /* Set the present bit. */
5519        mask |= 1ull;
5520
5521#ifdef CONFIG_X86_64
5522        /*
5523         * If the reserved bit is not supported, clear the present bit to disable
5524         * mmio page fault.
5525         */
5526        if (maxphyaddr == 52)
5527                mask &= ~1ull;
5528#endif
5529
5530        kvm_mmu_set_mmio_spte_mask(mask);
5531}
5532
5533#ifdef CONFIG_X86_64
5534static void pvclock_gtod_update_fn(struct work_struct *work)
5535{
5536        struct kvm *kvm;
5537
5538        struct kvm_vcpu *vcpu;
5539        int i;
5540
5541        spin_lock(&kvm_lock);
5542        list_for_each_entry(kvm, &vm_list, vm_list)
5543                kvm_for_each_vcpu(i, vcpu, kvm)
5544                        set_bit(KVM_REQ_MASTERCLOCK_UPDATE, &vcpu->requests);
5545        atomic_set(&kvm_guest_has_master_clock, 0);
5546        spin_unlock(&kvm_lock);
5547}
5548
5549static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
5550
5551/*
5552 * Notification about pvclock gtod data update.
5553 */
5554static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
5555                               void *priv)
5556{
5557        struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
5558        struct timekeeper *tk = priv;
5559
5560        update_pvclock_gtod(tk);
5561
5562        /* disable master clock if host does not trust, or does not
5563         * use, TSC clocksource
5564         */
5565        if (gtod->clock.vclock_mode != VCLOCK_TSC &&
5566            atomic_read(&kvm_guest_has_master_clock) != 0)
5567                queue_work(system_long_wq, &pvclock_gtod_work);
5568
5569        return 0;
5570}
5571
5572static struct notifier_block pvclock_gtod_notifier = {
5573        .notifier_call = pvclock_gtod_notify,
5574};
5575#endif
5576
5577int kvm_arch_init(void *opaque)
5578{
5579        int r;
5580        struct kvm_x86_ops *ops = opaque;
5581
5582        if (kvm_x86_ops) {
5583                printk(KERN_ERR "kvm: already loaded the other module\n");
5584                r = -EEXIST;
5585                goto out;
5586        }
5587
5588        if (!ops->cpu_has_kvm_support()) {
5589                printk(KERN_ERR "kvm: no hardware support\n");
5590                r = -EOPNOTSUPP;
5591                goto out;
5592        }
5593        if (ops->disabled_by_bios()) {
5594                printk(KERN_ERR "kvm: disabled by bios\n");
5595                r = -EOPNOTSUPP;
5596                goto out;
5597        }
5598
5599        r = -ENOMEM;
5600        shared_msrs = alloc_percpu(struct kvm_shared_msrs);
5601        if (!shared_msrs) {
5602                printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n");
5603                goto out;
5604        }
5605
5606        r = kvm_mmu_module_init();
5607        if (r)
5608                goto out_free_percpu;
5609
5610        kvm_set_mmio_spte_mask();
5611
5612        kvm_x86_ops = ops;
5613        kvm_init_msr_list();
5614
5615        kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
5616                        PT_DIRTY_MASK, PT64_NX_MASK, 0);
5617
5618        kvm_timer_init();
5619
5620        perf_register_guest_info_callbacks(&kvm_guest_cbs);
5621
5622        if (cpu_has_xsave)
5623                host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
5624
5625        kvm_lapic_init();
5626#ifdef CONFIG_X86_64
5627        pvclock_gtod_register_notifier(&pvclock_gtod_notifier);
5628#endif
5629
5630        return 0;
5631
5632out_free_percpu:
5633        free_percpu(shared_msrs);
5634out:
5635        return r;
5636}
5637
5638void kvm_arch_exit(void)
5639{
5640        perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
5641
5642        if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
5643                cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
5644                                            CPUFREQ_TRANSITION_NOTIFIER);
5645        unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block);
5646#ifdef CONFIG_X86_64
5647        pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
5648#endif
5649        kvm_x86_ops = NULL;
5650        kvm_mmu_module_exit();
5651        free_percpu(shared_msrs);
5652}
5653
5654int kvm_emulate_halt(struct kvm_vcpu *vcpu)
5655{
5656        ++vcpu->stat.halt_exits;
5657        if (irqchip_in_kernel(vcpu->kvm)) {
5658                vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
5659                return 1;
5660        } else {
5661                vcpu->run->exit_reason = KVM_EXIT_HLT;
5662                return 0;
5663        }
5664}
5665EXPORT_SYMBOL_GPL(kvm_emulate_halt);
5666
5667int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
5668{
5669        u64 param, ingpa, outgpa, ret;
5670        uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0;
5671        bool fast, longmode;
5672        int cs_db, cs_l;
5673
5674        /*
5675         * A hypercall generates a #UD from non-zero CPL or real mode,
5676         * per the HYPER-V spec.
5677         */
5678        if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) {
5679                kvm_queue_exception(vcpu, UD_VECTOR);
5680                return 0;
5681        }
5682
5683        kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
5684        longmode = is_long_mode(vcpu) && cs_l == 1;
5685
5686        if (!longmode) {
5687                param = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) |
5688                        (kvm_register_read(vcpu, VCPU_REGS_RAX) & 0xffffffff);
5689                ingpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RBX) << 32) |
5690                        (kvm_register_read(vcpu, VCPU_REGS_RCX) & 0xffffffff);
5691                outgpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDI) << 32) |
5692                        (kvm_register_read(vcpu, VCPU_REGS_RSI) & 0xffffffff);
5693        }
5694#ifdef CONFIG_X86_64
5695        else {
5696                param = kvm_register_read(vcpu, VCPU_REGS_RCX);
5697                ingpa = kvm_register_read(vcpu, VCPU_REGS_RDX);
5698                outgpa = kvm_register_read(vcpu, VCPU_REGS_R8);
5699        }
5700#endif
5701
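        /*
         * Decode the hypercall input value: bits 15:0 hold the call code,
         * bit 16 the "fast" flag, bits 43:32 the rep count and bits 59:48
         * the rep start index.
         */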
5702        code = param & 0xffff;
5703        fast = (param >> 16) & 0x1;
5704        rep_cnt = (param >> 32) & 0xfff;
5705        rep_idx = (param >> 48) & 0xfff;
5706
5707        trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa);
5708
5709        switch (code) {
5710        case HV_X64_HV_NOTIFY_LONG_SPIN_WAIT:
5711                kvm_vcpu_on_spin(vcpu);
5712                break;
5713        default:
5714                res = HV_STATUS_INVALID_HYPERCALL_CODE;
5715                break;
5716        }
5717
5718        ret = res | (((u64)rep_done & 0xfff) << 32);
5719        if (longmode) {
5720                kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
5721        } else {
5722                kvm_register_write(vcpu, VCPU_REGS_RDX, ret >> 32);
5723                kvm_register_write(vcpu, VCPU_REGS_RAX, ret & 0xffffffff);
5724        }
5725
5726        return 1;
5727}
5728
5729/*
5730 * kvm_pv_kick_cpu_op:  Kick a vcpu.
5731 *
5732 * @apicid - apicid of vcpu to be kicked.
5733 */
5734static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
5735{
5736        struct kvm_lapic_irq lapic_irq;
5737
5738        lapic_irq.shorthand = 0;
5739        lapic_irq.dest_mode = 0;
5740        lapic_irq.dest_id = apicid;
5741
5742        lapic_irq.delivery_mode = APIC_DM_REMRD;
5743        kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL);
5744}
5745
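/*
 * KVM hypercall entry point: the hypercall number is read from RAX and
 * up to four arguments from RBX, RCX, RDX and RSI (truncated to 32 bits
 * outside long mode); the return value is written back to RAX before
 * resuming the guest.
 */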
5746int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
5747{
5748        unsigned long nr, a0, a1, a2, a3, ret;
5749        int r = 1;
5750
5751        if (kvm_hv_hypercall_enabled(vcpu->kvm))
5752                return kvm_hv_hypercall(vcpu);
5753
5754        nr = kvm_register_read(vcpu, VCPU_REGS_RAX);
5755        a0 = kvm_register_read(vcpu, VCPU_REGS_RBX);
5756        a1 = kvm_register_read(vcpu, VCPU_REGS_RCX);
5757        a2 = kvm_register_read(vcpu, VCPU_REGS_RDX);
5758        a3 = kvm_register_read(vcpu, VCPU_REGS_RSI);
5759
5760        trace_kvm_hypercall(nr, a0, a1, a2, a3);
5761
5762        if (!is_long_mode(vcpu)) {
5763                nr &= 0xFFFFFFFF;
5764                a0 &= 0xFFFFFFFF;
5765                a1 &= 0xFFFFFFFF;
5766                a2 &= 0xFFFFFFFF;
5767                a3 &= 0xFFFFFFFF;
5768        }
5769
5770        if (kvm_x86_ops->get_cpl(vcpu) != 0) {
5771                ret = -KVM_EPERM;
5772                goto out;
5773        }
5774
5775        switch (nr) {
5776        case KVM_HC_VAPIC_POLL_IRQ:
5777                ret = 0;
5778                break;
5779        case KVM_HC_KICK_CPU:
5780                kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1);
5781                ret = 0;
5782                break;
5783        default:
5784                ret = -KVM_ENOSYS;
5785                break;
5786        }
5787out:
5788        kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
5789        ++vcpu->stat.hypercalls;
5790        return r;
5791}
5792EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
5793
5794static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
5795{
5796        struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
5797        char instruction[3];
5798        unsigned long rip = kvm_rip_read(vcpu);
5799
5800        kvm_x86_ops->patch_hypercall(vcpu, instruction);
5801
5802        return emulator_write_emulated(ctxt, rip, instruction, 3, NULL);
5803}
5804
5805/*
5806 * Check if userspace requested an interrupt window, and that the
5807 * interrupt window is open.
5808 *
5809 * No need to exit to userspace if we already have an interrupt queued.
5810 */
5811static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
5812{
5813        return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) &&
5814                vcpu->run->request_interrupt_window &&
5815                kvm_arch_interrupt_allowed(vcpu));
5816}
5817
5818static void post_kvm_run_save(struct kvm_vcpu *vcpu)
5819{
5820        struct kvm_run *kvm_run = vcpu->run;
5821
5822        kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
5823        kvm_run->cr8 = kvm_get_cr8(vcpu);
5824        kvm_run->apic_base = kvm_get_apic_base(vcpu);
5825        if (irqchip_in_kernel(vcpu->kvm))
5826                kvm_run->ready_for_interrupt_injection = 1;
5827        else
5828                kvm_run->ready_for_interrupt_injection =
5829                        kvm_arch_interrupt_allowed(vcpu) &&
5830                        !kvm_cpu_has_interrupt(vcpu) &&
5831                        !kvm_event_needs_reinjection(vcpu);
5832}
5833
5834static void update_cr8_intercept(struct kvm_vcpu *vcpu)
5835{
5836        int max_irr, tpr;
5837
5838        if (!kvm_x86_ops->update_cr8_intercept)
5839                return;
5840
5841        if (!vcpu->arch.apic)
5842                return;
5843
5844        if (!vcpu->arch.apic->vapic_addr)
5845                max_irr = kvm_lapic_find_highest_irr(vcpu);
5846        else
5847                max_irr = -1;
5848
5849        if (max_irr != -1)
5850                max_irr >>= 4;
5851
5852        tpr = kvm_lapic_get_cr8(vcpu);
5853
5854        kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
5855}
5856
5857static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
5858{
5859        int r;
5860
5861        /* try to reinject previous events if any */
5862        if (vcpu->arch.exception.pending) {
5863                trace_kvm_inj_exception(vcpu->arch.exception.nr,
5864                                        vcpu->arch.exception.has_error_code,
5865                                        vcpu->arch.exception.error_code);
5866                kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
5867                                          vcpu->arch.exception.has_error_code,
5868                                          vcpu->arch.exception.error_code,
5869                                          vcpu->arch.exception.reinject);
5870                return 0;
5871        }
5872
5873        if (vcpu->arch.nmi_injected) {
5874                kvm_x86_ops->set_nmi(vcpu);
5875                return 0;
5876        }
5877
5878        if (vcpu->arch.interrupt.pending) {
5879                kvm_x86_ops->set_irq(vcpu);
5880                return 0;
5881        }
5882
5883        if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
5884                r = kvm_x86_ops->check_nested_events(vcpu, req_int_win);
5885                if (r != 0)
5886                        return r;
5887        }
5888
5889        /* try to inject new event if pending */
5890        if (vcpu->arch.nmi_pending) {
5891                if (kvm_x86_ops->nmi_allowed(vcpu)) {
5892                        --vcpu->arch.nmi_pending;
5893                        vcpu->arch.nmi_injected = true;
5894                        kvm_x86_ops->set_nmi(vcpu);
5895                }
5896        } else if (kvm_cpu_has_injectable_intr(vcpu)) {
5897                if (kvm_x86_ops->interrupt_allowed(vcpu)) {
5898                        kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
5899                                            false);
5900                        kvm_x86_ops->set_irq(vcpu);
5901                }
5902        }
5903        return 0;
5904}
5905
5906static void process_nmi(struct kvm_vcpu *vcpu)
5907{
5908        unsigned limit = 2;
5909
5910        /*
5911         * x86 is limited to one NMI running, and one NMI pending after it.
5912         * If an NMI is already in progress, limit further NMIs to just one.
5913         * Otherwise, allow two (and we'll inject the first one immediately).
5914         */
5915        if (kvm_x86_ops->get_nmi_mask(vcpu) || vcpu->arch.nmi_injected)
5916                limit = 1;
5917
5918        vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
5919        vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit);
5920        kvm_make_request(KVM_REQ_EVENT, vcpu);
5921}
5922
5923static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
5924{
5925        u64 eoi_exit_bitmap[4];
5926        u32 tmr[8];
5927
5928        if (!kvm_apic_hw_enabled(vcpu->arch.apic))
5929                return;
5930
5931        memset(eoi_exit_bitmap, 0, 32);
5932        memset(tmr, 0, 32);
5933
5934        kvm_ioapic_scan_entry(vcpu, eoi_exit_bitmap, tmr);
5935        kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap);
5936        kvm_apic_update_tmr(vcpu, tmr);
5937}
5938
5939/*
5940 * Returns 1 to let __vcpu_run() continue the guest execution loop without
5941 * exiting to userspace.  Otherwise, the value will be returned to
5942 * userspace.
5943 */
5944static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5945{
5946        int r;
5947        bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
5948                vcpu->run->request_interrupt_window;
5949        bool req_immediate_exit = false;
5950
5951        if (vcpu->requests) {
5952                if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
5953                        kvm_mmu_unload(vcpu);
5954                if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
5955                        __kvm_migrate_timers(vcpu);
5956                if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu))
5957                        kvm_gen_update_masterclock(vcpu->kvm);
5958                if (kvm_check_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu))
5959                        kvm_gen_kvmclock_update(vcpu);
5960                if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
5961                        r = kvm_guest_time_update(vcpu);
5962                        if (unlikely(r))
5963                                goto out;
5964                }
5965                if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
5966                        kvm_mmu_sync_roots(vcpu);
5967                if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
5968                        kvm_x86_ops->tlb_flush(vcpu);
5969                if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
5970                        vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
5971                        r = 0;
5972                        goto out;
5973                }
5974                if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
5975                        vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
5976                        r = 0;
5977                        goto out;
5978                }
5979                if (kvm_check_request(KVM_REQ_DEACTIVATE_FPU, vcpu)) {
5980                        vcpu->fpu_active = 0;
5981                        kvm_x86_ops->fpu_deactivate(vcpu);
5982                }
5983                if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) {
5984                        /* Page is swapped out. Do synthetic halt */
5985                        vcpu->arch.apf.halted = true;
5986                        r = 1;
5987                        goto out;
5988                }
5989                if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
5990                        record_steal_time(vcpu);
5991                if (kvm_check_request(KVM_REQ_NMI, vcpu))
5992                        process_nmi(vcpu);
5993                if (kvm_check_request(KVM_REQ_PMU, vcpu))
5994                        kvm_handle_pmu_event(vcpu);
5995                if (kvm_check_request(KVM_REQ_PMI, vcpu))
5996                        kvm_deliver_pmi(vcpu);
5997                if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu))
5998                        vcpu_scan_ioapic(vcpu);
5999        }
6000
6001        if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
6002                kvm_apic_accept_events(vcpu);
6003                if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
6004                        r = 1;
6005                        goto out;
6006                }
6007
6008                if (inject_pending_event(vcpu, req_int_win) != 0)
6009                        req_immediate_exit = true;
6010                /* enable NMI/IRQ window open exits if needed */
6011                else if (vcpu->arch.nmi_pending)
6012                        kvm_x86_ops->enable_nmi_window(vcpu);
6013                else if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
6014                        kvm_x86_ops->enable_irq_window(vcpu);
6015
6016                if (kvm_lapic_enabled(vcpu)) {
6017                        /*
6018                         * Update architecture specific hints for APIC
6019                         * virtual interrupt delivery.
6020                         */
6021                        if (kvm_x86_ops->hwapic_irr_update)
6022                                kvm_x86_ops->hwapic_irr_update(vcpu,
6023                                        kvm_lapic_find_highest_irr(vcpu));
6024                        update_cr8_intercept(vcpu);
6025                        kvm_lapic_sync_to_vapic(vcpu);
6026                }
6027        }
6028
6029        r = kvm_mmu_reload(vcpu);
6030        if (unlikely(r)) {
6031                goto cancel_injection;
6032        }
6033
6034        preempt_disable();
6035
6036        kvm_x86_ops->prepare_guest_switch(vcpu);
6037        if (vcpu->fpu_active)
6038                kvm_load_guest_fpu(vcpu);
6039        kvm_load_guest_xcr0(vcpu);
6040
6041        vcpu->mode = IN_GUEST_MODE;
6042
6043        srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
6044
6045        /* We should set ->mode before checking ->requests;
6046         * see the comment in make_all_cpus_request().
6047         */
6048        smp_mb__after_srcu_read_unlock();
6049
6050        local_irq_disable();
6051
6052        if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests
6053            || need_resched() || signal_pending(current)) {
6054                vcpu->mode = OUTSIDE_GUEST_MODE;
6055                smp_wmb();
6056                local_irq_enable();
6057                preempt_enable();
6058                vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
6059                r = 1;
6060                goto cancel_injection;
6061        }
6062
6063        if (req_immediate_exit)
6064                smp_send_reschedule(vcpu->cpu);
6065
6066        kvm_guest_enter();
6067
6068        if (unlikely(vcpu->arch.switch_db_regs)) {
6069                set_debugreg(0, 7);
6070                set_debugreg(vcpu->arch.eff_db[0], 0);
6071                set_debugreg(vcpu->arch.eff_db[1], 1);
6072                set_debugreg(vcpu->arch.eff_db[2], 2);
6073                set_debugreg(vcpu->arch.eff_db[3], 3);
6074                set_debugreg(vcpu->arch.dr6, 6);
6075        }
6076
6077        trace_kvm_entry(vcpu->vcpu_id);
6078        kvm_x86_ops->run(vcpu);
6079
6080        /*
6081         * Sync the dirty debug registers here, before the host's debug
6082         * registers are restored.  Since this runs before the vmexit is
6083         * handled, a DR access vmexit can (a) read the correct value of
6084         * the debug registers and (b) set KVM_DEBUGREG_WONT_EXIT again.
6085         */
6086        if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) {
6087                int i;
6088
6089                WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP);
6090                kvm_x86_ops->sync_dirty_debug_regs(vcpu);
6091                for (i = 0; i < KVM_NR_DB_REGS; i++)
6092                        vcpu->arch.eff_db[i] = vcpu->arch.db[i];
6093        }
6094
6095        /*
6096         * If the guest has used debug registers, at least dr7
6097         * will be disabled while returning to the host.
6098         * If we don't have active breakpoints in the host, we don't
6099         * care about the messed up debug address registers. But if
6100         * we have some of them active, restore the old state.
6101         */
6102        if (hw_breakpoint_active())
6103                hw_breakpoint_restore();
6104
6105        vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu,
6106                                                           native_read_tsc());
6107
6108        vcpu->mode = OUTSIDE_GUEST_MODE;
6109        smp_wmb();
6110
6111        /* Interrupts are enabled by handle_external_intr() */
6112        kvm_x86_ops->handle_external_intr(vcpu);
6113
6114        ++vcpu->stat.exits;
6115
6116        /*
6117         * We must have an instruction between local_irq_enable() and
6118         * kvm_guest_exit(), so the timer interrupt isn't delayed by
6119         * the interrupt shadow.  The stat.exits increment will do nicely.
6120         * But we need to prevent reordering, hence this barrier():
6121         */
6122        barrier();
6123
6124        kvm_guest_exit();
6125
6126        preempt_enable();
6127
6128        vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
6129
6130        /*
6131         * Profile KVM exit RIPs:
6132         */
6133        if (unlikely(prof_on == KVM_PROFILING)) {
6134                unsigned long rip = kvm_rip_read(vcpu);
6135                profile_hit(KVM_PROFILING, (void *)rip);
6136        }
6137
6138        if (unlikely(vcpu->arch.tsc_always_catchup))
6139                kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
6140
6141        if (vcpu->arch.apic_attention)
6142                kvm_lapic_sync_from_vapic(vcpu);
6143
6144        r = kvm_x86_ops->handle_exit(vcpu);
6145        return r;
6146
6147cancel_injection:
6148        kvm_x86_ops->cancel_injection(vcpu);
6149        if (unlikely(vcpu->arch.apic_attention))
6150                kvm_lapic_sync_from_vapic(vcpu);
6151out:
6152        return r;
6153}
6154
6155
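/*
 * Outer run loop.  As long as vcpu_enter_guest() and the checks below keep
 * returning a value greater than zero, the vcpu keeps executing inside the
 * kernel; zero or a negative errno (for example -EINTR when a signal is
 * pending) drops back to userspace through kvm_arch_vcpu_ioctl_run().  A
 * halted vcpu waits in kvm_vcpu_block() until an event makes it runnable
 * again.
 */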
6156static int __vcpu_run(struct kvm_vcpu *vcpu)
6157{
6158        int r;
6159        struct kvm *kvm = vcpu->kvm;
6160
6161        vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
6162
6163        r = 1;
6164        while (r > 0) {
6165                if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
6166                    !vcpu->arch.apf.halted)
6167                        r = vcpu_enter_guest(vcpu);
6168                else {
6169                        srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
6170                        kvm_vcpu_block(vcpu);
6171                        vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
6172                        if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) {
6173                                kvm_apic_accept_events(vcpu);
6174                                switch (vcpu->arch.mp_state) {
6175                                case KVM_MP_STATE_HALTED:
6176                                        vcpu->arch.pv.pv_unhalted = false;
6177                                        vcpu->arch.mp_state =
6178                                                KVM_MP_STATE_RUNNABLE;
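                                        /* fall through */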
6179                                case KVM_MP_STATE_RUNNABLE:
6180                                        vcpu->arch.apf.halted = false;
6181                                        break;
6182                                case KVM_MP_STATE_INIT_RECEIVED:
6183                                        break;
6184                                default:
6185                                        r = -EINTR;
6186                                        break;
6187                                }
6188                        }
6189                }
6190
6191                if (r <= 0)
6192                        break;
6193
6194                clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
6195                if (kvm_cpu_has_pending_timer(vcpu))
6196                        kvm_inject_pending_timer_irqs(vcpu);
6197
6198                if (dm_request_for_irq_injection(vcpu)) {
6199                        r = -EINTR;
6200                        vcpu->run->exit_reason = KVM_EXIT_INTR;
6201                        ++vcpu->stat.request_irq_exits;
6202                }
6203
6204                kvm_check_async_pf_completion(vcpu);
6205
6206                if (signal_pending(current)) {
6207                        r = -EINTR;
6208                        vcpu->run->exit_reason = KVM_EXIT_INTR;
6209                        ++vcpu->stat.signal_exits;
6210                }
6211                if (need_resched()) {
6212                        srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
6213                        cond_resched();
6214                        vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
6215                }
6216        }
6217
6218        srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
6219
6220        return r;
6221}
6222
6223static inline int complete_emulated_io(struct kvm_vcpu *vcpu)
6224{
6225        int r;
6226        vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
6227        r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
6228        srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
6229        if (r != EMULATE_DONE)
6230                return 0;
6231        return 1;
6232}
6233
6234static int complete_emulated_pio(struct kvm_vcpu *vcpu)
6235{
6236        BUG_ON(!vcpu->arch.pio.count);
6237
6238        return complete_emulated_io(vcpu);
6239}
6240
6241/*
6242 * Implements the following, as a state machine:
6243 *
6244 * read:
6245 *   for each fragment
6246 *     for each mmio piece in the fragment
6247 *       write gpa, len
6248 *       exit
6249 *       copy data
6250 *   execute insn
6251 *
6252 * write:
6253 *   for each fragment
6254 *     for each mmio piece in the fragment
6255 *       write gpa, len
6256 *       copy data
6257 *       exit
6258 */
6259static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
6260{
6261        struct kvm_run *run = vcpu->run;
6262        struct kvm_mmio_fragment *frag;
6263        unsigned len;
6264
6265        BUG_ON(!vcpu->mmio_needed);
6266
6267        /* Complete previous fragment */
6268        frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment];
6269        len = min(8u, frag->len);
6270        if (!vcpu->mmio_is_write)
6271                memcpy(frag->data, run->mmio.data, len);
6272
6273        if (frag->len <= 8) {
6274                /* Switch to the next fragment. */
6275                frag++;
6276                vcpu->mmio_cur_fragment++;
6277        } else {
6278                /* Go forward to the next mmio piece. */
6279                frag->data += len;
6280                frag->gpa += len;
6281                frag->len -= len;
6282        }
6283
6284        if (vcpu->mmio_cur_fragment >= vcpu->mmio_nr_fragments) {
6285                vcpu->mmio_needed = 0;
6286
6287                /* FIXME: return into emulator if single-stepping.  */
6288                if (vcpu->mmio_is_write)
6289                        return 1;
6290                vcpu->mmio_read_completed = 1;
6291                return complete_emulated_io(vcpu);
6292        }
6293
6294        run->exit_reason = KVM_EXIT_MMIO;
6295        run->mmio.phys_addr = frag->gpa;
6296        if (vcpu->mmio_is_write)
6297                memcpy(run->mmio.data, frag->data, min(8u, frag->len));
6298        run->mmio.len = min(8u, frag->len);
6299        run->mmio.is_write = vcpu->mmio_is_write;
6300        vcpu->arch.complete_userspace_io = complete_emulated_mmio;
6301        return 0;
6302}
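/*
 * Worked example of the state machine above: a 16-byte MMIO access is
 * described by a single fragment with frag->len == 16 and is handed to
 * userspace as two 8-byte KVM_EXIT_MMIO exits.  Each time userspace
 * re-enters, this function runs again via vcpu->arch.complete_userspace_io;
 * because frag->len > 8 on the first completion, gpa and data are advanced
 * by 8 and len shrinks to 8 before the second exit is emitted.  Only the
 * final completion resumes the emulator (for reads) or lets the vcpu
 * re-enter the guest (for writes).
 */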
6303
6304
6305int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
6306{
6307        int r;
6308        sigset_t sigsaved;
6309
6310        if (!tsk_used_math(current) && init_fpu(current))
6311                return -ENOMEM;
6312
6313        if (vcpu->sigset_active)
6314                sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
6315
6316        if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
6317                kvm_vcpu_block(vcpu);
6318                kvm_apic_accept_events(vcpu);
6319                clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
6320                r = -EAGAIN;
6321                goto out;
6322        }
6323
6324        /* re-sync apic's tpr */
6325        if (!irqchip_in_kernel(vcpu->kvm)) {
6326                if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
6327                        r = -EINVAL;
6328                        goto out;
6329                }
6330        }
6331
6332        if (unlikely(vcpu->arch.complete_userspace_io)) {
6333                int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io;
6334                vcpu->arch.complete_userspace_io = NULL;
6335                r = cui(vcpu);
6336                if (r <= 0)
6337                        goto out;
6338        } else
6339                WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed);
6340
6341        r = __vcpu_run(vcpu);
6342
6343out:
6344        post_kvm_run_save(vcpu);
6345        if (vcpu->sigset_active)
6346                sigprocmask(SIG_SETMASK, &sigsaved, NULL);
6347
6348        return r;
6349}
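/*
 * For reference, a minimal userspace driver of this ioctl looks roughly
 * like the sketch below (error handling omitted; mmap_size comes from the
 * KVM_GET_VCPU_MMAP_SIZE ioctl):
 *
 *	struct kvm_run *run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
 *				   MAP_SHARED, vcpu_fd, 0);
 *	for (;;) {
 *		ioctl(vcpu_fd, KVM_RUN, 0);
 *		switch (run->exit_reason) {
 *		case KVM_EXIT_MMIO:
 *			handle the access, then loop; the next KVM_RUN
 *			completes it through complete_userspace_io;
 *			break;
 *		case KVM_EXIT_INTR:
 *			break;	(the -EINTR case above: a signal was pending)
 *		}
 *	}
 */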
6350
6351int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
6352{
6353        if (vcpu->arch.emulate_regs_need_sync_to_vcpu) {
6354                /*
6355                 * We are here if userspace calls get_regs() in the middle of
6356                 * instruction emulation. Register state needs to be copied
6357                 * back from the emulation context to the vcpu. Userspace
6358                 * shouldn't usually do that, but some badly designed PV
6359                 * devices (the vmware backdoor interface) need this to work.
6360                 */
6361                emulator_writeback_register_cache(&vcpu->arch.emulate_ctxt);
6362                vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
6363        }
6364        regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
6365        regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);
6366        regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX);
6367        regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX);
6368        regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI);
6369        regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI);
6370        regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
6371        regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP);
6372#ifdef CONFIG_X86_64
6373        regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8);
6374        regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9);
6375        regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10);
6376        regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11);
6377        regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12);
6378        regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13);
6379        regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14);
6380        regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15);
6381#endif
6382
6383        regs->rip = kvm_rip_read(vcpu);
6384        regs->rflags = kvm_get_rflags(vcpu);
6385
6386        return 0;
6387}
6388
6389int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
6390{
6391        vcpu->arch.emulate_regs_need_sync_from_vcpu = true;
6392        vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
6393
6394        kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
6395        kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
6396        kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
6397        kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx);
6398        kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi);
6399        kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi);
6400        kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp);
6401        kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp);
6402#ifdef CONFIG_X86_64
6403        kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8);
6404        kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9);
6405        kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10);
6406        kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11);
6407        kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12);
6408        kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13);
6409        kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14);
6410        kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15);
6411#endif
6412
6413        kvm_rip_write(vcpu, regs->rip);
6414        kvm_set_rflags(vcpu, regs->rflags);
6415
6416        vcpu->arch.exception.pending = false;
6417
6418        kvm_make_request(KVM_REQ_EVENT, vcpu);
6419
6420        return 0;
6421}
6422
6423void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
6424{
6425        struct kvm_segment cs;
6426
6427        kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
6428        *db = cs.db;
6429        *l = cs.l;
6430}
6431EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
6432
6433int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
6434                                  struct kvm_sregs *sregs)
6435{
6436        struct desc_ptr dt;
6437
6438        kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
6439        kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
6440        kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
6441        kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
6442        kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
6443        kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
6444
6445        kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
6446        kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
6447
6448        kvm_x86_ops->get_idt(vcpu, &dt);
6449        sregs->idt.limit = dt.size;
6450        sregs->idt.base = dt.address;
6451        kvm_x86_ops->get_gdt(vcpu, &dt);
6452        sregs->gdt.limit = dt.size;
6453        sregs->gdt.base = dt.address;
6454
6455        sregs->cr0 = kvm_read_cr0(vcpu);
6456        sregs->cr2 = vcpu->arch.cr2;
6457        sregs->cr3 = kvm_read_cr3(vcpu);
6458        sregs->cr4 = kvm_read_cr4(vcpu);
6459        sregs->cr8 = kvm_get_cr8(vcpu);
6460        sregs->efer = vcpu->arch.efer;
6461        sregs->apic_base = kvm_get_apic_base(vcpu);
6462
6463        memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap);
6464
6465        if (vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft)
6466                set_bit(vcpu->arch.interrupt.nr,
6467                        (unsigned long *)sregs->interrupt_bitmap);
6468
6469        return 0;
6470}
6471
6472int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
6473                                    struct kvm_mp_state *mp_state)
6474{
6475        kvm_apic_accept_events(vcpu);
6476        if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED &&
6477                                        vcpu->arch.pv.pv_unhalted)
6478                mp_state->mp_state = KVM_MP_STATE_RUNNABLE;
6479        else
6480                mp_state->mp_state = vcpu->arch.mp_state;
6481
6482        return 0;
6483}
6484
6485int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
6486                                    struct kvm_mp_state *mp_state)
6487{
6488        if (!kvm_vcpu_has_lapic(vcpu) &&
6489            mp_state->mp_state != KVM_MP_STATE_RUNNABLE)
6490                return -EINVAL;
6491
6492        if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) {
6493                vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
6494                set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events);
6495        } else
6496                vcpu->arch.mp_state = mp_state->mp_state;
6497        kvm_make_request(KVM_REQ_EVENT, vcpu);
6498        return 0;
6499}
6500
6501int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
6502                    int reason, bool has_error_code, u32 error_code)
6503{
6504        struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
6505        int ret;
6506
6507        init_emulate_ctxt(vcpu);
6508
6509        ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason,
6510                                   has_error_code, error_code);
6511
6512        if (ret)
6513                return EMULATE_FAIL;
6514
6515        kvm_rip_write(vcpu, ctxt->eip);
6516        kvm_set_rflags(vcpu, ctxt->eflags);
6517        kvm_make_request(KVM_REQ_EVENT, vcpu);
6518        return EMULATE_DONE;
6519}
6520EXPORT_SYMBOL_GPL(kvm_task_switch);
6521
6522int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
6523                                  struct kvm_sregs *sregs)
6524{
6525        struct msr_data apic_base_msr;
6526        int mmu_reset_needed = 0;
6527        int pending_vec, max_bits, idx;
6528        struct desc_ptr dt;
6529
6530        if (!guest_cpuid_has_xsave(vcpu) && (sregs->cr4 & X86_CR4_OSXSAVE))
6531                return -EINVAL;
6532
6533        dt.size = sregs->idt.limit;
6534        dt.address = sregs->idt.base;
6535        kvm_x86_ops->set_idt(vcpu, &dt);
6536        dt.size = sregs->gdt.limit;
6537        dt.address = sregs->gdt.base;
6538        kvm_x86_ops->set_gdt(vcpu, &dt);
6539
6540        vcpu->arch.cr2 = sregs->cr2;
6541        mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
6542        vcpu->arch.cr3 = sregs->cr3;
6543        __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
6544
6545        kvm_set_cr8(vcpu, sregs->cr8);
6546
6547        mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
6548        kvm_x86_ops->set_efer(vcpu, sregs->efer);
6549        apic_base_msr.data = sregs->apic_base;
6550        apic_base_msr.host_initiated = true;
6551        kvm_set_apic_base(vcpu, &apic_base_msr);
6552
6553        mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
6554        kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
6555        vcpu->arch.cr0 = sregs->cr0;
6556
6557        mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
6558        kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
6559        if (sregs->cr4 & X86_CR4_OSXSAVE)
6560                kvm_update_cpuid(vcpu);
6561
6562        idx = srcu_read_lock(&vcpu->kvm->srcu);
6563        if (!is_long_mode(vcpu) && is_pae(vcpu)) {
6564                load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
6565                mmu_reset_needed = 1;
6566        }
6567        srcu_read_unlock(&vcpu->kvm->srcu, idx);
6568
6569        if (mmu_reset_needed)
6570                kvm_mmu_reset_context(vcpu);
6571
6572        max_bits = KVM_NR_INTERRUPTS;
6573        pending_vec = find_first_bit(
6574                (const unsigned long *)sregs->interrupt_bitmap, max_bits);
6575        if (pending_vec < max_bits) {
6576                kvm_queue_interrupt(vcpu, pending_vec, false);
6577                pr_debug("Set back pending irq %d\n", pending_vec);
6578        }
6579
6580        kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
6581        kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
6582        kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
6583        kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
6584        kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
6585        kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
6586
6587        kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
6588        kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
6589
6590        update_cr8_intercept(vcpu);
6591
6592        /* Older userspace won't unhalt the vcpu on reset. */
6593        if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 &&
6594            sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
6595            !is_protmode(vcpu))
6596                vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
6597
6598        kvm_make_request(KVM_REQ_EVENT, vcpu);
6599
6600        return 0;
6601}
6602
6603int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
6604                                        struct kvm_guest_debug *dbg)
6605{
6606        unsigned long rflags;
6607        int i, r;
6608
6609        if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
6610                r = -EBUSY;
6611                if (vcpu->arch.exception.pending)
6612                        goto out;
6613                if (dbg->control & KVM_GUESTDBG_INJECT_DB)
6614                        kvm_queue_exception(vcpu, DB_VECTOR);
6615                else
6616                        kvm_queue_exception(vcpu, BP_VECTOR);
6617        }
6618
6619        /*
6620         * Read rflags as long as potentially injected trace flags are still
6621         * filtered out.
6622         */
6623        rflags = kvm_get_rflags(vcpu);
6624
6625        vcpu->guest_debug = dbg->control;
6626        if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE))
6627                vcpu->guest_debug = 0;
6628
6629        if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
6630                for (i = 0; i < KVM_NR_DB_REGS; ++i)
6631                        vcpu->arch.eff_db[i] = dbg->arch.debugreg[i];
6632                vcpu->arch.guest_debug_dr7 = dbg->arch.debugreg[7];
6633        } else {
6634                for (i = 0; i < KVM_NR_DB_REGS; i++)
6635                        vcpu->arch.eff_db[i] = vcpu->arch.db[i];
6636        }
6637        kvm_update_dr7(vcpu);
6638
6639        if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
6640                vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) +
6641                        get_segment_base(vcpu, VCPU_SREG_CS);
6642
6643        /*
6644         * Trigger an rflags update that will inject or remove the trace
6645         * flags.
6646         */
6647        kvm_set_rflags(vcpu, rflags);
6648
6649        kvm_x86_ops->update_db_bp_intercept(vcpu);
6650
6651        r = 0;
6652
6653out:
6654
6655        return r;
6656}
6657
6658/*
6659 * Translate a guest virtual address to a guest physical address.
6660 */
6661int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
6662                                    struct kvm_translation *tr)
6663{
6664        unsigned long vaddr = tr->linear_address;
6665        gpa_t gpa;
6666        int idx;
6667
6668        idx = srcu_read_lock(&vcpu->kvm->srcu);
6669        gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL);
6670        srcu_read_unlock(&vcpu->kvm->srcu, idx);
6671        tr->physical_address = gpa;
6672        tr->valid = gpa != UNMAPPED_GVA;
6673        tr->writeable = 1;
6674        tr->usermode = 0;
6675
6676        return 0;
6677}
6678
6679int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
6680{
6681        struct i387_fxsave_struct *fxsave =
6682                        &vcpu->arch.guest_fpu.state->fxsave;
6683
6684        memcpy(fpu->fpr, fxsave->st_space, 128);
6685        fpu->fcw = fxsave->cwd;
6686        fpu->fsw = fxsave->swd;
6687        fpu->ftwx = fxsave->twd;
6688        fpu->last_opcode = fxsave->fop;
6689        fpu->last_ip = fxsave->rip;
6690        fpu->last_dp = fxsave->rdp;
6691        memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
6692
6693        return 0;
6694}
6695
6696int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
6697{
6698        struct i387_fxsave_struct *fxsave =
6699                        &vcpu->arch.guest_fpu.state->fxsave;
6700
6701        memcpy(fxsave->st_space, fpu->fpr, 128);
6702        fxsave->cwd = fpu->fcw;
6703        fxsave->swd = fpu->fsw;
6704        fxsave->twd = fpu->ftwx;
6705        fxsave->fop = fpu->last_opcode;
6706        fxsave->rip = fpu->last_ip;
6707        fxsave->rdp = fpu->last_dp;
6708        memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
6709
6710        return 0;
6711}
6712
6713int fx_init(struct kvm_vcpu *vcpu)
6714{
6715        int err;
6716
6717        err = fpu_alloc(&vcpu->arch.guest_fpu);
6718        if (err)
6719                return err;
6720
6721        fpu_finit(&vcpu->arch.guest_fpu);
6722
6723        /*
6724         * Ensure guest xcr0 is valid for loading
6725         */
6726        vcpu->arch.xcr0 = XSTATE_FP;
6727
6728        vcpu->arch.cr0 |= X86_CR0_ET;
6729
6730        return 0;
6731}
6732EXPORT_SYMBOL_GPL(fx_init);
6733
6734static void fx_free(struct kvm_vcpu *vcpu)
6735{
6736        fpu_free(&vcpu->arch.guest_fpu);
6737}
6738
6739void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
6740{
6741        if (vcpu->guest_fpu_loaded)
6742                return;
6743
6744        /*
6745         * Restore all possible guest FPU state, and assume that the host
6746         * may be using all of the available state bits.  The guest xcr0
6747         * will be loaded later.
6748         */
6749        kvm_put_guest_xcr0(vcpu);
6750        vcpu->guest_fpu_loaded = 1;
6751        __kernel_fpu_begin();
6752        fpu_restore_checking(&vcpu->arch.guest_fpu);
6753        trace_kvm_fpu(1);
6754}
6755
6756void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
6757{
6758        kvm_put_guest_xcr0(vcpu);
6759
6760        if (!vcpu->guest_fpu_loaded)
6761                return;
6762
6763        vcpu->guest_fpu_loaded = 0;
6764        fpu_save_init(&vcpu->arch.guest_fpu);
6765        __kernel_fpu_end();
6766        ++vcpu->stat.fpu_reload;
6767        kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
6768        trace_kvm_fpu(0);
6769}
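/*
 * kvm_put_guest_fpu() undoes kvm_load_guest_fpu() above.  The
 * KVM_REQ_DEACTIVATE_FPU request raised here is consumed at the top of
 * vcpu_enter_guest(), where fpu_active is cleared and
 * kvm_x86_ops->fpu_deactivate() is called, so the guest FPU state is only
 * loaded again when the guest actually needs it (lazy FPU switching).
 */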
6770
6771void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
6772{
6773        kvmclock_reset(vcpu);
6774
6775        free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
6776        fx_free(vcpu);
6777        kvm_x86_ops->vcpu_free(vcpu);
6778}
6779
6780struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
6781                                                unsigned int id)
6782{
6783        if (check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
6784                printk_once(KERN_WARNING
6785                            "kvm: SMP vm created on host with unstable TSC; "
6786                            "guest TSC will not be reliable\n");
6787        return kvm_x86_ops->vcpu_create(kvm, id);
6788}
6789
6790int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
6791{
6792        int r;
6793
6794        vcpu->arch.mtrr_state.have_fixed = 1;
6795        r = vcpu_load(vcpu);
6796        if (r)
6797                return r;
6798        kvm_vcpu_reset(vcpu);
6799        kvm_mmu_setup(vcpu);
6800        vcpu_put(vcpu);
6801
6802        return r;
6803}
6804
6805int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
6806{
6807        int r;
6808        struct msr_data msr;
6809        struct kvm *kvm = vcpu->kvm;
6810
6811        r = vcpu_load(vcpu);
6812        if (r)
6813                return r;
6814        msr.data = 0x0;
6815        msr.index = MSR_IA32_TSC;
6816        msr.host_initiated = true;
6817        kvm_write_tsc(vcpu, &msr);
6818        vcpu_put(vcpu);
6819
6820        schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
6821                                        KVMCLOCK_SYNC_PERIOD);
6822
6823        return r;
6824}
6825
6826void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
6827{
6828        int r;
6829        vcpu->arch.apf.msr_val = 0;
6830
6831        r = vcpu_load(vcpu);
6832        BUG_ON(r);
6833        kvm_mmu_unload(vcpu);
6834        vcpu_put(vcpu);
6835
6836        fx_free(vcpu);
6837        kvm_x86_ops->vcpu_free(vcpu);
6838}
6839
6840void kvm_vcpu_reset(struct kvm_vcpu *vcpu)
6841{
6842        atomic_set(&vcpu->arch.nmi_queued, 0);
6843        vcpu->arch.nmi_pending = 0;
6844        vcpu->arch.nmi_injected = false;
6845
6846        memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
6847        vcpu->arch.dr6 = DR6_FIXED_1;
6848        kvm_update_dr6(vcpu);
6849        vcpu->arch.dr7 = DR7_FIXED_1;
6850        kvm_update_dr7(vcpu);
6851
6852        kvm_make_request(KVM_REQ_EVENT, vcpu);
6853        vcpu->arch.apf.msr_val = 0;
6854        vcpu->arch.st.msr_val = 0;
6855
6856        kvmclock_reset(vcpu);
6857
6858        kvm_clear_async_pf_completion_queue(vcpu);
6859        kvm_async_pf_hash_reset(vcpu);
6860        vcpu->arch.apf.halted = false;
6861
6862        kvm_pmu_reset(vcpu);
6863
6864        memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
6865        vcpu->arch.regs_avail = ~0;
6866        vcpu->arch.regs_dirty = ~0;
6867
6868        kvm_x86_ops->vcpu_reset(vcpu);
6869}
6870
6871void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, unsigned int vector)
6872{
6873        struct kvm_segment cs;
6874
6875        kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
6876        cs.selector = vector << 8;
6877        cs.base = vector << 12;
6878        kvm_set_segment(vcpu, &cs, VCPU_SREG_CS);
6879        kvm_rip_write(vcpu, 0);
6880}
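/*
 * Example: a SIPI with vector 0x10 yields cs.selector = 0x1000 and
 * cs.base = 0x10000, with RIP set to 0, so the AP starts fetching
 * real-mode code at physical address 0x10000 (selector << 4 == base,
 * as real mode requires).
 */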
6881
6882int kvm_arch_hardware_enable(void *garbage)
6883{
6884        struct kvm *kvm;
6885        struct kvm_vcpu *vcpu;
6886        int i;
6887        int ret;
6888        u64 local_tsc;
6889        u64 max_tsc = 0;
6890        bool stable, backwards_tsc = false;
6891
6892        kvm_shared_msr_cpu_online();
6893        ret = kvm_x86_ops->hardware_enable(garbage);
6894        if (ret != 0)
6895                return ret;
6896
6897        local_tsc = native_read_tsc();
6898        stable = !check_tsc_unstable();
6899        list_for_each_entry(kvm, &vm_list, vm_list) {
6900                kvm_for_each_vcpu(i, vcpu, kvm) {
6901                        if (!stable && vcpu->cpu == smp_processor_id())
6902                                set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
6903                        if (stable && vcpu->arch.last_host_tsc > local_tsc) {
6904                                backwards_tsc = true;
6905                                if (vcpu->arch.last_host_tsc > max_tsc)
6906                                        max_tsc = vcpu->arch.last_host_tsc;
6907                        }
6908                }
6909        }
6910
6911        /*
6912         * Sometimes, even reliable TSCs go backwards.  This happens on
6913         * platforms that reset TSC during suspend or hibernate actions, but
6914         * maintain synchronization.  We must compensate.  Fortunately, we can
6915         * detect that condition here, which happens early in CPU bringup,
6916         * before any KVM threads can be running.  Unfortunately, we can't
6917         * bring the TSCs fully up to date with real time, as we aren't yet far
6918         * enough into CPU bringup that we know how much real time has actually
6919         * elapsed; our helper function, get_kernel_ns() will be using boot
6920         * variables that haven't been updated yet.
6921         *
6922         * So we simply find the maximum observed TSC above, then record the
6923         * adjustment to TSC in each VCPU.  When the VCPU later gets loaded,
6924         * the adjustment will be applied.  Note that we accumulate
6925         * adjustments, in case multiple suspend cycles happen before some VCPU
6926         * gets a chance to run again.  In the event that no KVM threads get a
6927         * chance to run, we will miss the entire elapsed period, as we'll have
6928         * reset last_host_tsc, so VCPUs will not have the TSC adjusted and may
6929         * lose cycle time.  This isn't too big a deal, since the loss will be
6930         * uniform across all VCPUs (not to mention the scenario is extremely
6931         * unlikely). It is possible that a second hibernate recovery happens
6932         * much faster than a first, causing the observed TSC here to be
6933         * smaller; this would require additional padding adjustment, which is
6934         * why we set last_host_tsc to the local tsc observed here.
6935         *
6936         * N.B. - this code below runs only on platforms with reliable TSC,
6937         * as that is the only way backwards_tsc is set above.  Also note
6938         * that this runs for ALL vcpus, which is not a bug; all VCPUs should
6939         * have the same delta_cyc adjustment applied if backwards_tsc
6940         * is detected.  Note further, this adjustment is only done once,
6941         * as we reset last_host_tsc on all VCPUs to stop this from being
6942         * called multiple times (one for each physical CPU bringup).
6943         *
6944         * Platforms with unreliable TSCs don't have to deal with this, they
6945         * will be compensated by the logic in vcpu_load, which sets the TSC to
6946         * catchup mode.  This will catchup all VCPUs to real time, but cannot
6947         * guarantee that they stay in perfect synchronization.
6948         */
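        /*
         * Numeric sketch of the compensation: if local_tsc reads 1,000,000
         * after resume but one vcpu recorded last_host_tsc == 4,000,000
         * before suspend, then delta_cyc == 3,000,000 is added to every
         * vcpu's tsc_offset_adjustment below, so guest TSC values never
         * appear to jump backwards even though the host TSC was reset.
         */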
6949        if (backwards_tsc) {
6950                u64 delta_cyc = max_tsc - local_tsc;
6951                backwards_tsc_observed = true;
6952                list_for_each_entry(kvm, &vm_list, vm_list) {
6953                        kvm_for_each_vcpu(i, vcpu, kvm) {
6954                                vcpu->arch.tsc_offset_adjustment += delta_cyc;
6955                                vcpu->arch.last_host_tsc = local_tsc;
6956                                set_bit(KVM_REQ_MASTERCLOCK_UPDATE,
6957                                        &vcpu->requests);
6958                        }
6959
6960                        /*
6961                         * We have to disable TSC offset matching here.  If
6962                         * you were booting a VM while issuing an S4 host
6963                         * suspend, you may have a problem.  Solving this is
6964                         * left as an exercise to the reader.
6965                         */
6966                        kvm->arch.last_tsc_nsec = 0;
6967                        kvm->arch.last_tsc_write = 0;
6968                }
6969
6970        }
6971        return 0;
6972}
6973
6974void kvm_arch_hardware_disable(void *garbage)
6975{
6976        kvm_x86_ops->hardware_disable(garbage);
6977        drop_user_return_notifiers(garbage);
6978}
6979
6980int kvm_arch_hardware_setup(void)
6981{
6982        return kvm_x86_ops->hardware_setup();
6983}
6984
6985void kvm_arch_hardware_unsetup(void)
6986{
6987        kvm_x86_ops->hardware_unsetup();
6988}
6989
6990void kvm_arch_check_processor_compat(void *rtn)
6991{
6992        kvm_x86_ops->check_processor_compatibility(rtn);
6993}
6994
6995bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
6996{
6997        return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL);
6998}
6999
7000struct static_key kvm_no_apic_vcpu __read_mostly;
7001
7002int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
7003{
7004        struct page *page;
7005        struct kvm *kvm;
7006        int r;
7007
7008        BUG_ON(vcpu->kvm == NULL);
7009        kvm = vcpu->kvm;
7010
7011        vcpu->arch.pv.pv_unhalted = false;
7012        vcpu->arch.emulate_ctxt.ops = &emulate_ops;
7013        if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
7014                vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
7015        else
7016                vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
7017
7018        page = alloc_page(GFP_KERNEL | __GFP_ZERO);
7019        if (!page) {
7020                r = -ENOMEM;
7021                goto fail;
7022        }
7023        vcpu->arch.pio_data = page_address(page);
7024
7025        kvm_set_tsc_khz(vcpu, max_tsc_khz);
7026
7027        r = kvm_mmu_create(vcpu);
7028        if (r < 0)
7029                goto fail_free_pio_data;
7030
7031        if (irqchip_in_kernel(kvm)) {
7032                r = kvm_create_lapic(vcpu);
7033                if (r < 0)
7034                        goto fail_mmu_destroy;
7035        } else
7036                static_key_slow_inc(&kvm_no_apic_vcpu);
7037
7038        vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
7039                                       GFP_KERNEL);
7040        if (!vcpu->arch.mce_banks) {
7041                r = -ENOMEM;
7042                goto fail_free_lapic;
7043        }
7044        vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
7045
7046        if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) {
7047                r = -ENOMEM;
7048                goto fail_free_mce_banks;
7049        }
7050
7051        r = fx_init(vcpu);
7052        if (r)
7053                goto fail_free_wbinvd_dirty_mask;
7054
7055        vcpu->arch.ia32_tsc_adjust_msr = 0x0;
7056        vcpu->arch.pv_time_enabled = false;
7057
7058        vcpu->arch.guest_supported_xcr0 = 0;
7059        vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
7060
7061        kvm_async_pf_hash_reset(vcpu);
7062        kvm_pmu_init(vcpu);
7063
7064        return 0;
7065fail_free_wbinvd_dirty_mask:
7066        free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
7067fail_free_mce_banks:
7068        kfree(vcpu->arch.mce_banks);
7069fail_free_lapic:
7070        kvm_free_lapic(vcpu);
7071fail_mmu_destroy:
7072        kvm_mmu_destroy(vcpu);
7073fail_free_pio_data:
7074        free_page((unsigned long)vcpu->arch.pio_data);
7075fail:
7076        return r;
7077}
7078
7079void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
7080{
7081        int idx;
7082
7083        kvm_pmu_destroy(vcpu);
7084        kfree(vcpu->arch.mce_banks);
7085        kvm_free_lapic(vcpu);
7086        idx = srcu_read_lock(&vcpu->kvm->srcu);
7087        kvm_mmu_destroy(vcpu);
7088        srcu_read_unlock(&vcpu->kvm->srcu, idx);
7089        free_page((unsigned long)vcpu->arch.pio_data);
7090        if (!irqchip_in_kernel(vcpu->kvm))
7091                static_key_slow_dec(&kvm_no_apic_vcpu);
7092}
7093
7094int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
7095{
7096        if (type)
7097                return -EINVAL;
7098
7099        INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
7100        INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
7101        INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
7102        atomic_set(&kvm->arch.noncoherent_dma_count, 0);
7103
7104        /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
7105        set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
7106        /* Reserve bit 1 of irq_sources_bitmap for irqfd-resampler */
7107        set_bit(KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
7108                &kvm->arch.irq_sources_bitmap);
7109
7110        raw_spin_lock_init(&kvm->arch.tsc_write_lock);
7111        mutex_init(&kvm->arch.apic_map_lock);
7112        spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
7113
7114        pvclock_update_vm_gtod_copy(kvm);
7115
7116        INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
7117        INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
7118
7119        return 0;
7120}
7121
7122static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
7123{
7124        int r;
7125        r = vcpu_load(vcpu);
7126        BUG_ON(r);
7127        kvm_mmu_unload(vcpu);
7128        vcpu_put(vcpu);
7129}
7130
7131static void kvm_free_vcpus(struct kvm *kvm)
7132{
7133        unsigned int i;
7134        struct kvm_vcpu *vcpu;
7135
7136        /*
7137         * Unpin any mmu pages first.
7138         */
7139        kvm_for_each_vcpu(i, vcpu, kvm) {
7140                kvm_clear_async_pf_completion_queue(vcpu);
7141                kvm_unload_vcpu_mmu(vcpu);
7142        }
7143        kvm_for_each_vcpu(i, vcpu, kvm)
7144                kvm_arch_vcpu_free(vcpu);
7145
7146        mutex_lock(&kvm->lock);
7147        for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
7148                kvm->vcpus[i] = NULL;
7149
7150        atomic_set(&kvm->online_vcpus, 0);
7151        mutex_unlock(&kvm->lock);
7152}
7153
7154void kvm_arch_sync_events(struct kvm *kvm)
7155{
7156        cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work);
7157        cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work);
7158        kvm_free_all_assigned_devices(kvm);
7159        kvm_free_pit(kvm);
7160}
7161
7162void kvm_arch_destroy_vm(struct kvm *kvm)
7163{
7164        if (current->mm == kvm->mm) {
7165                /*
7166                 * Free memory regions allocated on behalf of userspace,
7167                 * unless the memory map has changed due to process exit
7168                 * or fd copying.
7169                 */
7170                struct kvm_userspace_memory_region mem;
7171                memset(&mem, 0, sizeof(mem));
7172                mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
7173                kvm_set_memory_region(kvm, &mem);
7174
7175                mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT;
7176                kvm_set_memory_region(kvm, &mem);
7177
7178                mem.slot = TSS_PRIVATE_MEMSLOT;
7179                kvm_set_memory_region(kvm, &mem);
7180        }
7181        kvm_iommu_unmap_guest(kvm);
7182        kfree(kvm->arch.vpic);
7183        kfree(kvm->arch.vioapic);
7184        kvm_free_vcpus(kvm);
7185        if (kvm->arch.apic_access_page)
7186                put_page(kvm->arch.apic_access_page);
7187        if (kvm->arch.ept_identity_pagetable)
7188                put_page(kvm->arch.ept_identity_pagetable);
7189        kfree(rcu_dereference_check(kvm->arch.apic_map, 1));
7190}
7191
7192void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
7193                           struct kvm_memory_slot *dont)
7194{
7195        int i;
7196
7197        for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
7198                if (!dont || free->arch.rmap[i] != dont->arch.rmap[i]) {
7199                        kvm_kvfree(free->arch.rmap[i]);
7200                        free->arch.rmap[i] = NULL;
7201                }
7202                if (i == 0)
7203                        continue;
7204
7205                if (!dont || free->arch.lpage_info[i - 1] !=
7206                             dont->arch.lpage_info[i - 1]) {
7207                        kvm_kvfree(free->arch.lpage_info[i - 1]);
7208                        free->arch.lpage_info[i - 1] = NULL;
7209                }
7210        }
7211}
7212
7213int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
7214                            unsigned long npages)
7215{
7216        int i;
7217
7218        for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
7219                unsigned long ugfn;
7220                int lpages;
7221                int level = i + 1;
7222
7223                lpages = gfn_to_index(slot->base_gfn + npages - 1,
7224                                      slot->base_gfn, level) + 1;
7225
7226                slot->arch.rmap[i] =
7227                        kvm_kvzalloc(lpages * sizeof(*slot->arch.rmap[i]));
7228                if (!slot->arch.rmap[i])
7229                        goto out_free;
7230                if (i == 0)
7231                        continue;
7232
7233                slot->arch.lpage_info[i - 1] = kvm_kvzalloc(lpages *
7234                                        sizeof(*slot->arch.lpage_info[i - 1]));
7235                if (!slot->arch.lpage_info[i - 1])
7236                        goto out_free;
7237
7238                if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
7239                        slot->arch.lpage_info[i - 1][0].write_count = 1;
7240                if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
7241                        slot->arch.lpage_info[i - 1][lpages - 1].write_count = 1;
7242                ugfn = slot->userspace_addr >> PAGE_SHIFT;
7243                /*
7244                 * If the gfn and the userspace address are not aligned with
7245                 * respect to each other, or if we have explicitly been asked
7246                 * to, disable large page support for this slot.
7247                 */
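                /*
                 * Example with 2MB pages (level 2, KVM_PAGES_PER_HPAGE ==
                 * 512): a slot with base_gfn == 0x200 whose userspace
                 * address starts at ugfn == 0x300 gives
                 * (0x200 ^ 0x300) & 511 == 0x100, so guest and host 2MB
                 * boundaries are offset by 1MB and large pages are disabled
                 * for the whole slot (write_count == 1 for every index).
                 */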
7248                if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) ||
7249                    !kvm_largepages_enabled()) {
7250                        unsigned long j;
7251
7252                        for (j = 0; j < lpages; ++j)
7253                                slot->arch.lpage_info[i - 1][j].write_count = 1;
7254                }
7255        }
7256
7257        return 0;
7258
7259out_free:
7260        for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
7261                kvm_kvfree(slot->arch.rmap[i]);
7262                slot->arch.rmap[i] = NULL;
7263                if (i == 0)
7264                        continue;
7265
7266                kvm_kvfree(slot->arch.lpage_info[i - 1]);
7267                slot->arch.lpage_info[i - 1] = NULL;
7268        }
7269        return -ENOMEM;
7270}
7271
7272void kvm_arch_memslots_updated(struct kvm *kvm)
7273{
7274        /*
7275         * memslots->generation has been incremented.
7276         * mmio generation may have reached its maximum value.
7277         */
7278        kvm_mmu_invalidate_mmio_sptes(kvm);
7279}
7280
7281int kvm_arch_prepare_memory_region(struct kvm *kvm,
7282                                struct kvm_memory_slot *memslot,
7283                                struct kvm_userspace_memory_region *mem,
7284                                enum kvm_mr_change change)
7285{
7286        /*
7287         * Only private memory slots need to be mapped here since
7288         * the KVM_SET_MEMORY_REGION ioctl is no longer supported.
7289         */
7290        if ((memslot->id >= KVM_USER_MEM_SLOTS) && (change == KVM_MR_CREATE)) {
7291                unsigned long userspace_addr;
7292
7293                /*
7294                 * MAP_SHARED to prevent internal slot pages from being moved
7295                 * by fork()/COW.
7296                 */
7297                userspace_addr = vm_mmap(NULL, 0, memslot->npages * PAGE_SIZE,
7298                                         PROT_READ | PROT_WRITE,
7299                                         MAP_SHARED | MAP_ANONYMOUS, 0);
7300
7301                if (IS_ERR((void *)userspace_addr))
7302                        return PTR_ERR((void *)userspace_addr);
7303
7304                memslot->userspace_addr = userspace_addr;
7305        }
7306
7307        return 0;
7308}
7309
7310void kvm_arch_commit_memory_region(struct kvm *kvm,
7311                                struct kvm_userspace_memory_region *mem,
7312                                const struct kvm_memory_slot *old,
7313                                enum kvm_mr_change change)
7314{
7315
7316        int nr_mmu_pages = 0;
7317
7318        if ((mem->slot >= KVM_USER_MEM_SLOTS) && (change == KVM_MR_DELETE)) {
7319                int ret;
7320
7321                ret = vm_munmap(old->userspace_addr,
7322                                old->npages * PAGE_SIZE);
7323                if (ret < 0)
7324                        printk(KERN_WARNING
7325                               "kvm_vm_ioctl_set_memory_region: "
7326                               "failed to munmap memory\n");
7327        }
7328
7329        if (!kvm->arch.n_requested_mmu_pages)
7330                nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
7331
7332        if (nr_mmu_pages)
7333                kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
7334        /*
7335         * Write protect all pages for dirty logging.
7336         * Existing largepage mappings are destroyed here and new ones will
7337         * not be created until the end of the logging.
7338         */
7339        if ((change != KVM_MR_DELETE) && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES))
7340                kvm_mmu_slot_remove_write_access(kvm, mem->slot);
7341}
7342
7343void kvm_arch_flush_shadow_all(struct kvm *kvm)
7344{
7345        kvm_mmu_invalidate_zap_all_pages(kvm);
7346}
7347
7348void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
7349                                   struct kvm_memory_slot *slot)
7350{
7351        kvm_mmu_invalidate_zap_all_pages(kvm);
7352}
7353
7354int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
7355{
7356        if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events)
7357                kvm_x86_ops->check_nested_events(vcpu, false);
7358
7359        return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
7360                !vcpu->arch.apf.halted)
7361                || !list_empty_careful(&vcpu->async_pf.done)
7362                || kvm_apic_has_events(vcpu)
7363                || vcpu->arch.pv.pv_unhalted
7364                || atomic_read(&vcpu->arch.nmi_queued) ||
7365                (kvm_arch_interrupt_allowed(vcpu) &&
7366                 kvm_cpu_has_interrupt(vcpu));
7367}
7368
7369int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
7370{
7371        return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
7372}
7373
7374int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
7375{
7376        return kvm_x86_ops->interrupt_allowed(vcpu);
7377}
7378
7379bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip)
7380{
7381        unsigned long current_rip = kvm_rip_read(vcpu) +
7382                get_segment_base(vcpu, VCPU_SREG_CS);
7383
7384        return current_rip == linear_rip;
7385}
7386EXPORT_SYMBOL_GPL(kvm_is_linear_rip);
7387
7388unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
7389{
7390        unsigned long rflags;
7391
7392        rflags = kvm_x86_ops->get_rflags(vcpu);
7393        if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
7394                rflags &= ~X86_EFLAGS_TF;
7395        return rflags;
7396}
7397EXPORT_SYMBOL_GPL(kvm_get_rflags);
7398
7399void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
7400{
7401        if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
7402            kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
7403                rflags |= X86_EFLAGS_TF;
7404        kvm_x86_ops->set_rflags(vcpu, rflags);
7405        kvm_make_request(KVM_REQ_EVENT, vcpu);
7406}
7407EXPORT_SYMBOL_GPL(kvm_set_rflags);
7408
7409void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
7410{
7411        int r;
7412
7413        if ((vcpu->arch.mmu.direct_map != work->arch.direct_map) ||
7414              work->wakeup_all)
7415                return;
7416
7417        r = kvm_mmu_reload(vcpu);
7418        if (unlikely(r))
7419                return;
7420
7421        if (!vcpu->arch.mmu.direct_map &&
7422              work->arch.cr3 != vcpu->arch.mmu.get_cr3(vcpu))
7423                return;
7424
7425        vcpu->arch.mmu.page_fault(vcpu, work->gva, 0, true);
7426}
7427
7428static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
7429{
7430        return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU));
7431}
7432
7433static inline u32 kvm_async_pf_next_probe(u32 key)
7434{
7435        return (key + 1) & (roundup_pow_of_two(ASYNC_PF_PER_VCPU) - 1);
7436}
7437
7438static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
7439{
7440        u32 key = kvm_async_pf_hash_fn(gfn);
7441
7442        while (vcpu->arch.apf.gfns[key] != ~0)
7443                key = kvm_async_pf_next_probe(key);
7444
7445        vcpu->arch.apf.gfns[key] = gfn;
7446}
7447
7448static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn)
7449{
7450        int i;
7451        u32 key = kvm_async_pf_hash_fn(gfn);
7452
7453        for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU) &&
7454                     (vcpu->arch.apf.gfns[key] != gfn &&
7455                      vcpu->arch.apf.gfns[key] != ~0); i++)
7456                key = kvm_async_pf_next_probe(key);
7457
7458        return key;
7459}
7460
7461bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
7462{
7463        return vcpu->arch.apf.gfns[kvm_async_pf_gfn_slot(vcpu, gfn)] == gfn;
7464}
7465
7466static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
7467{
7468        u32 i, j, k;
7469
7470        i = j = kvm_async_pf_gfn_slot(vcpu, gfn);
7471        while (true) {
7472                vcpu->arch.apf.gfns[i] = ~0;
7473                do {
7474                        j = kvm_async_pf_next_probe(j);
7475                        if (vcpu->arch.apf.gfns[j] == ~0)
7476                                return;
7477                        k = kvm_async_pf_hash_fn(vcpu->arch.apf.gfns[j]);
7478                        /*
7479                         * k lies cyclically in ]i,j]
7480                         * |    i.k.j |
7481                         * |....j i.k.| or  |.k..j i...|
7482                         */
7483                } while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j));
7484                vcpu->arch.apf.gfns[i] = vcpu->arch.apf.gfns[j];
7485                i = j;
7486        }
7487}
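/*
 * The loop above is the standard open-addressing "backshift" deletion.
 * Example: gfns A and B both hash to slot 0 and sit in slots 0 and 1.
 * Deleting A empties slot 0 and then finds B at j == 1 whose home slot
 * k == 0 is not cyclically inside (i, j] == (0, 1], so B is moved back
 * into slot 0 and the scan continues from slot 1.  A later lookup of B,
 * which starts probing at slot 0, therefore never runs into the hole.
 */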
7488
7489static int apf_put_user(struct kvm_vcpu *vcpu, u32 val)
7490{
7491
7492        return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val,
7493                                      sizeof(val));
7494}
7495
7496void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
7497                                     struct kvm_async_pf *work)
7498{
7499        struct x86_exception fault;
7500
7501        trace_kvm_async_pf_not_present(work->arch.token, work->gva);
7502        kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
7503
7504        if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) ||
7505            (vcpu->arch.apf.send_user_only &&
7506             kvm_x86_ops->get_cpl(vcpu) == 0))
7507                kvm_make_request(KVM_REQ_APF_HALT, vcpu);
7508        else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) {
7509                fault.vector = PF_VECTOR;
7510                fault.error_code_valid = true;
7511                fault.error_code = 0;
7512                fault.nested_page_fault = false;
7513                fault.address = work->arch.token;
7514                kvm_inject_page_fault(vcpu, &fault);
7515        }
7516}
7517
7518void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
7519                                 struct kvm_async_pf *work)
7520{
7521        struct x86_exception fault;
7522
7523        trace_kvm_async_pf_ready(work->arch.token, work->gva);
7524        if (work->wakeup_all)
7525                work->arch.token = ~0; /* broadcast wakeup */
7526        else
7527                kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
7528
7529        if ((vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) &&
7530            !apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) {
7531                fault.vector = PF_VECTOR;
7532                fault.error_code_valid = true;
7533                fault.error_code = 0;
7534                fault.nested_page_fault = false;
7535                fault.address = work->arch.token;
7536                kvm_inject_page_fault(vcpu, &fault);
7537        }
7538        vcpu->arch.apf.halted = false;
7539        vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
7540}
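/*
 * Guest-side view of the two functions above, roughly as implemented by
 * the Linux paravirtual guest: its #PF handler reads the per-cpu reason
 * word written by apf_put_user() and the token from CR2 (fault.address
 * above).  KVM_PV_REASON_PAGE_NOT_PRESENT puts the faulting task to sleep
 * keyed on that token; KVM_PV_REASON_PAGE_READY wakes it up again (or
 * wakes all sleepers for the ~0 broadcast token) and execution resumes.
 */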
7541
7542bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
7543{
7544        if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED))
7545                return true;
7546        else
7547                return !kvm_event_needs_reinjection(vcpu) &&
7548                        kvm_x86_ops->interrupt_allowed(vcpu);
7549}
7550
7551void kvm_arch_register_noncoherent_dma(struct kvm *kvm)
7552{
7553        atomic_inc(&kvm->arch.noncoherent_dma_count);
7554}
7555EXPORT_SYMBOL_GPL(kvm_arch_register_noncoherent_dma);
7556
7557void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm)
7558{
7559        atomic_dec(&kvm->arch.noncoherent_dma_count);
7560}
7561EXPORT_SYMBOL_GPL(kvm_arch_unregister_noncoherent_dma);
7562
7563bool kvm_arch_has_noncoherent_dma(struct kvm *kvm)
7564{
7565        return atomic_read(&kvm->arch.noncoherent_dma_count);
7566}
7567EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma);
7568
7569EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
7570EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
7571EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
7572EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
7573EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr);
7574EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmrun);
7575EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit);
7576EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
7577EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
7578EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
7579EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
7580EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
7581EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);
7582