linux/arch/x86/kvm/x86.c
   1/*
   2 * Kernel-based Virtual Machine driver for Linux
   3 *
   4 * derived from drivers/kvm/kvm_main.c
   5 *
   6 * Copyright (C) 2006 Qumranet, Inc.
   7 * Copyright (C) 2008 Qumranet, Inc.
   8 * Copyright IBM Corporation, 2008
   9 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  10 *
  11 * Authors:
  12 *   Avi Kivity   <avi@qumranet.com>
  13 *   Yaniv Kamay  <yaniv@qumranet.com>
  14 *   Amit Shah    <amit.shah@qumranet.com>
  15 *   Ben-Ami Yassour <benami@il.ibm.com>
  16 *
  17 * This work is licensed under the terms of the GNU GPL, version 2.  See
  18 * the COPYING file in the top-level directory.
  19 *
  20 */
  21
  22#include <linux/kvm_host.h>
  23#include "irq.h"
  24#include "mmu.h"
  25#include "i8254.h"
  26#include "tss.h"
  27#include "kvm_cache_regs.h"
  28#include "x86.h"
  29#include "cpuid.h"
  30
  31#include <linux/clocksource.h>
  32#include <linux/interrupt.h>
  33#include <linux/kvm.h>
  34#include <linux/fs.h>
  35#include <linux/vmalloc.h>
  36#include <linux/module.h>
  37#include <linux/mman.h>
  38#include <linux/highmem.h>
  39#include <linux/iommu.h>
  40#include <linux/intel-iommu.h>
  41#include <linux/cpufreq.h>
  42#include <linux/user-return-notifier.h>
  43#include <linux/srcu.h>
  44#include <linux/slab.h>
  45#include <linux/perf_event.h>
  46#include <linux/uaccess.h>
  47#include <linux/hash.h>
  48#include <linux/pci.h>
  49#include <linux/timekeeper_internal.h>
  50#include <linux/pvclock_gtod.h>
  51#include <trace/events/kvm.h>
  52
  53#define CREATE_TRACE_POINTS
  54#include "trace.h"
  55
  56#include <asm/debugreg.h>
  57#include <asm/msr.h>
  58#include <asm/desc.h>
  59#include <asm/mtrr.h>
  60#include <asm/mce.h>
  61#include <asm/i387.h>
  62#include <asm/fpu-internal.h> /* Ugh! */
  63#include <asm/xcr.h>
  64#include <asm/pvclock.h>
  65#include <asm/div64.h>
  66
  67#define MAX_IO_MSRS 256
  68#define KVM_MAX_MCE_BANKS 32
  69#define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P)
  70
  71#define emul_to_vcpu(ctxt) \
  72        container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt)
  73
  74/* EFER defaults:
   75 * - enable syscall by default because it is emulated by KVM
   76 * - enable LME and LMA by default on 64-bit KVM
  77 */
  78#ifdef CONFIG_X86_64
  79static
  80u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA));
  81#else
  82static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
  83#endif
  84
  85#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
  86#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
  87
  88static void update_cr8_intercept(struct kvm_vcpu *vcpu);
  89static void process_nmi(struct kvm_vcpu *vcpu);
  90
  91struct kvm_x86_ops *kvm_x86_ops;
  92EXPORT_SYMBOL_GPL(kvm_x86_ops);
  93
  94static bool ignore_msrs = 0;
  95module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
  96
  97unsigned int min_timer_period_us = 500;
  98module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
  99
 100bool kvm_has_tsc_control;
 101EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
 102u32  kvm_max_guest_tsc_khz;
 103EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
 104
 105/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
 106static u32 tsc_tolerance_ppm = 250;
 107module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
 108
 109#define KVM_NR_SHARED_MSRS 16
 110
 111struct kvm_shared_msrs_global {
 112        int nr;
 113        u32 msrs[KVM_NR_SHARED_MSRS];
 114};
 115
 116struct kvm_shared_msrs {
 117        struct user_return_notifier urn;
 118        bool registered;
 119        struct kvm_shared_msr_values {
 120                u64 host;
 121                u64 curr;
 122        } values[KVM_NR_SHARED_MSRS];
 123};
 124
 125static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
 126static struct kvm_shared_msrs __percpu *shared_msrs;
 127
 128struct kvm_stats_debugfs_item debugfs_entries[] = {
 129        { "pf_fixed", VCPU_STAT(pf_fixed) },
 130        { "pf_guest", VCPU_STAT(pf_guest) },
 131        { "tlb_flush", VCPU_STAT(tlb_flush) },
 132        { "invlpg", VCPU_STAT(invlpg) },
 133        { "exits", VCPU_STAT(exits) },
 134        { "io_exits", VCPU_STAT(io_exits) },
 135        { "mmio_exits", VCPU_STAT(mmio_exits) },
 136        { "signal_exits", VCPU_STAT(signal_exits) },
 137        { "irq_window", VCPU_STAT(irq_window_exits) },
 138        { "nmi_window", VCPU_STAT(nmi_window_exits) },
 139        { "halt_exits", VCPU_STAT(halt_exits) },
 140        { "halt_wakeup", VCPU_STAT(halt_wakeup) },
 141        { "hypercalls", VCPU_STAT(hypercalls) },
 142        { "request_irq", VCPU_STAT(request_irq_exits) },
 143        { "irq_exits", VCPU_STAT(irq_exits) },
 144        { "host_state_reload", VCPU_STAT(host_state_reload) },
 145        { "efer_reload", VCPU_STAT(efer_reload) },
 146        { "fpu_reload", VCPU_STAT(fpu_reload) },
 147        { "insn_emulation", VCPU_STAT(insn_emulation) },
 148        { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
 149        { "irq_injections", VCPU_STAT(irq_injections) },
 150        { "nmi_injections", VCPU_STAT(nmi_injections) },
 151        { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
 152        { "mmu_pte_write", VM_STAT(mmu_pte_write) },
 153        { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
 154        { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
 155        { "mmu_flooded", VM_STAT(mmu_flooded) },
 156        { "mmu_recycled", VM_STAT(mmu_recycled) },
 157        { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
 158        { "mmu_unsync", VM_STAT(mmu_unsync) },
 159        { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
 160        { "largepages", VM_STAT(lpages) },
 161        { NULL }
 162};
 163
 164u64 __read_mostly host_xcr0;
 165
 166static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
 167
 168static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
 169{
 170        int i;
 171        for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++)
 172                vcpu->arch.apf.gfns[i] = ~0;
 173}
 174
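     /*
      * "Shared" MSRs are host MSRs whose host values only matter once the
      * CPU is back in host userspace (typically the SYSCALL family such as
      * MSR_STAR and MSR_LSTAR, registered by the vendor module through
      * kvm_define_shared_msr()).  kvm_set_shared_msr() loads the guest
      * value and registers a user_return_notifier, so the host value is
      * restored lazily in kvm_on_user_return() below rather than on every
      * VM exit.
      */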
 175static void kvm_on_user_return(struct user_return_notifier *urn)
 176{
 177        unsigned slot;
 178        struct kvm_shared_msrs *locals
 179                = container_of(urn, struct kvm_shared_msrs, urn);
 180        struct kvm_shared_msr_values *values;
 181
 182        for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
 183                values = &locals->values[slot];
 184                if (values->host != values->curr) {
 185                        wrmsrl(shared_msrs_global.msrs[slot], values->host);
 186                        values->curr = values->host;
 187                }
 188        }
 189        locals->registered = false;
 190        user_return_notifier_unregister(urn);
 191}
 192
 193static void shared_msr_update(unsigned slot, u32 msr)
 194{
 195        u64 value;
 196        unsigned int cpu = smp_processor_id();
 197        struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
 198
  199        /* shared_msrs_global is only read here; nobody should be
  200         * modifying it at this time, so no lock is needed */
 201        if (slot >= shared_msrs_global.nr) {
 202                printk(KERN_ERR "kvm: invalid MSR slot!");
 203                return;
 204        }
 205        rdmsrl_safe(msr, &value);
 206        smsr->values[slot].host = value;
 207        smsr->values[slot].curr = value;
 208}
 209
 210void kvm_define_shared_msr(unsigned slot, u32 msr)
 211{
 212        if (slot >= shared_msrs_global.nr)
 213                shared_msrs_global.nr = slot + 1;
 214        shared_msrs_global.msrs[slot] = msr;
  215        /* make sure shared_msrs_global has been updated before it is used */
 216        smp_wmb();
 217}
 218EXPORT_SYMBOL_GPL(kvm_define_shared_msr);
 219
 220static void kvm_shared_msr_cpu_online(void)
 221{
 222        unsigned i;
 223
 224        for (i = 0; i < shared_msrs_global.nr; ++i)
 225                shared_msr_update(i, shared_msrs_global.msrs[i]);
 226}
 227
 228void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
 229{
 230        unsigned int cpu = smp_processor_id();
 231        struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
 232
 233        if (((value ^ smsr->values[slot].curr) & mask) == 0)
 234                return;
 235        smsr->values[slot].curr = value;
 236        wrmsrl(shared_msrs_global.msrs[slot], value);
 237        if (!smsr->registered) {
 238                smsr->urn.on_user_return = kvm_on_user_return;
 239                user_return_notifier_register(&smsr->urn);
 240                smsr->registered = true;
 241        }
 242}
 243EXPORT_SYMBOL_GPL(kvm_set_shared_msr);
 244
 245static void drop_user_return_notifiers(void *ignore)
 246{
 247        unsigned int cpu = smp_processor_id();
 248        struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
 249
 250        if (smsr->registered)
 251                kvm_on_user_return(&smsr->urn);
 252}
 253
 254u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
 255{
 256        return vcpu->arch.apic_base;
 257}
 258EXPORT_SYMBOL_GPL(kvm_get_apic_base);
 259
 260int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 261{
 262        u64 old_state = vcpu->arch.apic_base &
 263                (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE);
 264        u64 new_state = msr_info->data &
 265                (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE);
 266        u64 reserved_bits = ((~0ULL) << cpuid_maxphyaddr(vcpu)) |
 267                0x2ff | (guest_cpuid_has_x2apic(vcpu) ? 0 : X2APIC_ENABLE);
 268
 269        if (!msr_info->host_initiated &&
 270            ((msr_info->data & reserved_bits) != 0 ||
 271             new_state == X2APIC_ENABLE ||
 272             (new_state == MSR_IA32_APICBASE_ENABLE &&
 273              old_state == (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) ||
 274             (new_state == (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE) &&
 275              old_state == 0)))
 276                return 1;
 277
 278        kvm_lapic_set_base(vcpu, msr_info->data);
 279        return 0;
 280}
 281EXPORT_SYMBOL_GPL(kvm_set_apic_base);
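
     /*
      * The checks above enforce the architectural xAPIC/x2APIC rules for
      * guest writes (host-initiated writes bypass them): reserved bits must
      * be clear, the x2APIC bit may not be set while the enable bit is
      * clear, a disabled APIC may not be switched directly to x2APIC mode,
      * and x2APIC mode may not be switched directly back to xAPIC mode; the
      * APIC has to be disabled in between.
      */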
 282
 283asmlinkage void kvm_spurious_fault(void)
 284{
 285        /* Fault while not rebooting.  We want the trace. */
 286        BUG();
 287}
 288EXPORT_SYMBOL_GPL(kvm_spurious_fault);
 289
 290#define EXCPT_BENIGN            0
 291#define EXCPT_CONTRIBUTORY      1
 292#define EXCPT_PF                2
 293
 294static int exception_class(int vector)
 295{
 296        switch (vector) {
 297        case PF_VECTOR:
 298                return EXCPT_PF;
 299        case DE_VECTOR:
 300        case TS_VECTOR:
 301        case NP_VECTOR:
 302        case SS_VECTOR:
 303        case GP_VECTOR:
 304                return EXCPT_CONTRIBUTORY;
 305        default:
 306                break;
 307        }
 308        return EXCPT_BENIGN;
 309}
 310
 311static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
 312                unsigned nr, bool has_error, u32 error_code,
 313                bool reinject)
 314{
 315        u32 prev_nr;
 316        int class1, class2;
 317
 318        kvm_make_request(KVM_REQ_EVENT, vcpu);
 319
 320        if (!vcpu->arch.exception.pending) {
 321        queue:
 322                vcpu->arch.exception.pending = true;
 323                vcpu->arch.exception.has_error_code = has_error;
 324                vcpu->arch.exception.nr = nr;
 325                vcpu->arch.exception.error_code = error_code;
 326                vcpu->arch.exception.reinject = reinject;
 327                return;
 328        }
 329
  330        /* an exception is already pending; decide how to combine it with the new one */
 331        prev_nr = vcpu->arch.exception.nr;
 332        if (prev_nr == DF_VECTOR) {
 333                /* triple fault -> shutdown */
 334                kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
 335                return;
 336        }
 337        class1 = exception_class(prev_nr);
 338        class2 = exception_class(nr);
 339        if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
 340                || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
 341                /* generate double fault per SDM Table 5-5 */
 342                vcpu->arch.exception.pending = true;
 343                vcpu->arch.exception.has_error_code = true;
 344                vcpu->arch.exception.nr = DF_VECTOR;
 345                vcpu->arch.exception.error_code = 0;
 346        } else
  347                /* replace the previous exception with the new one in the
  348                   hope that instruction re-execution will regenerate the
  349                   lost exception */
 350                goto queue;
 351}
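
     /*
      * Examples of the merging rules above: a #GP raised while a #PF is
      * still pending (EXCPT_PF + EXCPT_CONTRIBUTORY) is folded into a #DF
      * with error code 0; any exception raised while a #DF is pending
      * escalates to a triple fault and shuts the guest down; a benign
      * exception such as #DB simply replaces the pending one.
      */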
 352
 353void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
 354{
 355        kvm_multiple_exception(vcpu, nr, false, 0, false);
 356}
 357EXPORT_SYMBOL_GPL(kvm_queue_exception);
 358
 359void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
 360{
 361        kvm_multiple_exception(vcpu, nr, false, 0, true);
 362}
 363EXPORT_SYMBOL_GPL(kvm_requeue_exception);
 364
 365void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
 366{
 367        if (err)
 368                kvm_inject_gp(vcpu, 0);
 369        else
 370                kvm_x86_ops->skip_emulated_instruction(vcpu);
 371}
 372EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
 373
 374void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
 375{
 376        ++vcpu->stat.pf_guest;
 377        vcpu->arch.cr2 = fault->address;
 378        kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
 379}
 380EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
 381
 382void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
 383{
 384        if (mmu_is_nested(vcpu) && !fault->nested_page_fault)
 385                vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault);
 386        else
 387                vcpu->arch.mmu.inject_page_fault(vcpu, fault);
 388}
 389
 390void kvm_inject_nmi(struct kvm_vcpu *vcpu)
 391{
 392        atomic_inc(&vcpu->arch.nmi_queued);
 393        kvm_make_request(KVM_REQ_NMI, vcpu);
 394}
 395EXPORT_SYMBOL_GPL(kvm_inject_nmi);
 396
 397void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
 398{
 399        kvm_multiple_exception(vcpu, nr, true, error_code, false);
 400}
 401EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
 402
 403void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
 404{
 405        kvm_multiple_exception(vcpu, nr, true, error_code, true);
 406}
 407EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);
 408
 409/*
  410 * Check whether cpl <= required_cpl; if so, return true.  Otherwise queue
  411 * a #GP and return false.
 412 */
 413bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
 414{
 415        if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl)
 416                return true;
 417        kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
 418        return false;
 419}
 420EXPORT_SYMBOL_GPL(kvm_require_cpl);
 421
 422/*
  423 * This function is used to read from the physical memory of the currently
  424 * running guest. It differs from kvm_read_guest_page in that it can read
  425 * from guest physical memory or from the guest's nested guest physical memory.
 426 */
 427int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 428                            gfn_t ngfn, void *data, int offset, int len,
 429                            u32 access)
 430{
 431        gfn_t real_gfn;
 432        gpa_t ngpa;
 433
 434        ngpa     = gfn_to_gpa(ngfn);
 435        real_gfn = mmu->translate_gpa(vcpu, ngpa, access);
 436        if (real_gfn == UNMAPPED_GVA)
 437                return -EFAULT;
 438
 439        real_gfn = gpa_to_gfn(real_gfn);
 440
 441        return kvm_read_guest_page(vcpu->kvm, real_gfn, data, offset, len);
 442}
 443EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
 444
 445int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
 446                               void *data, int offset, int len, u32 access)
 447{
 448        return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn,
 449                                       data, offset, len, access);
 450}
 451
 452/*
  453 * Load the PAE PDPTRs.  Return 1 if they are all valid.
 454 */
 455int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
 456{
 457        gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
 458        unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
 459        int i;
 460        int ret;
 461        u64 pdpte[ARRAY_SIZE(mmu->pdptrs)];
 462
 463        ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte,
 464                                      offset * sizeof(u64), sizeof(pdpte),
 465                                      PFERR_USER_MASK|PFERR_WRITE_MASK);
 466        if (ret < 0) {
 467                ret = 0;
 468                goto out;
 469        }
 470        for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
 471                if (is_present_gpte(pdpte[i]) &&
 472                    (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) {
 473                        ret = 0;
 474                        goto out;
 475                }
 476        }
 477        ret = 1;
 478
 479        memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
 480        __set_bit(VCPU_EXREG_PDPTR,
 481                  (unsigned long *)&vcpu->arch.regs_avail);
 482        __set_bit(VCPU_EXREG_PDPTR,
 483                  (unsigned long *)&vcpu->arch.regs_dirty);
 484out:
 485
 486        return ret;
 487}
 488EXPORT_SYMBOL_GPL(load_pdptrs);
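
     /*
      * Worked example for the offset arithmetic in load_pdptrs(): in PAE
      * mode CR3 bits 31:5 point at a 32-byte-aligned table of four 64-bit
      * PDPTEs.  For cr3 = 0x12345fe0, pdpt_gfn is 0x12345 and
      * ((cr3 & 0xfff) >> 5) << 2 = 0x1fc u64 entries, i.e. a byte offset
      * of 0x1fc * 8 = 0xfe0 into that page, which is just cr3 & 0xfe0.
      */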
 489
 490static bool pdptrs_changed(struct kvm_vcpu *vcpu)
 491{
 492        u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)];
 493        bool changed = true;
 494        int offset;
 495        gfn_t gfn;
 496        int r;
 497
 498        if (is_long_mode(vcpu) || !is_pae(vcpu))
 499                return false;
 500
 501        if (!test_bit(VCPU_EXREG_PDPTR,
 502                      (unsigned long *)&vcpu->arch.regs_avail))
 503                return true;
 504
 505        gfn = (kvm_read_cr3(vcpu) & ~31u) >> PAGE_SHIFT;
 506        offset = (kvm_read_cr3(vcpu) & ~31u) & (PAGE_SIZE - 1);
 507        r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
 508                                       PFERR_USER_MASK | PFERR_WRITE_MASK);
 509        if (r < 0)
 510                goto out;
 511        changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
 512out:
 513
 514        return changed;
 515}
 516
 517int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 518{
 519        unsigned long old_cr0 = kvm_read_cr0(vcpu);
 520        unsigned long update_bits = X86_CR0_PG | X86_CR0_WP |
 521                                    X86_CR0_CD | X86_CR0_NW;
 522
 523        cr0 |= X86_CR0_ET;
 524
 525#ifdef CONFIG_X86_64
 526        if (cr0 & 0xffffffff00000000UL)
 527                return 1;
 528#endif
 529
 530        cr0 &= ~CR0_RESERVED_BITS;
 531
 532        if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
 533                return 1;
 534
 535        if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
 536                return 1;
 537
 538        if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
 539#ifdef CONFIG_X86_64
 540                if ((vcpu->arch.efer & EFER_LME)) {
 541                        int cs_db, cs_l;
 542
 543                        if (!is_pae(vcpu))
 544                                return 1;
 545                        kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
 546                        if (cs_l)
 547                                return 1;
 548                } else
 549#endif
 550                if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
 551                                                 kvm_read_cr3(vcpu)))
 552                        return 1;
 553        }
 554
 555        if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
 556                return 1;
 557
 558        kvm_x86_ops->set_cr0(vcpu, cr0);
 559
 560        if ((cr0 ^ old_cr0) & X86_CR0_PG) {
 561                kvm_clear_async_pf_completion_queue(vcpu);
 562                kvm_async_pf_hash_reset(vcpu);
 563        }
 564
 565        if ((cr0 ^ old_cr0) & update_bits)
 566                kvm_mmu_reset_context(vcpu);
 567        return 0;
 568}
 569EXPORT_SYMBOL_GPL(kvm_set_cr0);
 570
 571void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
 572{
 573        (void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
 574}
 575EXPORT_SYMBOL_GPL(kvm_lmsw);
 576
 577static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu)
 578{
 579        if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) &&
 580                        !vcpu->guest_xcr0_loaded) {
 581                /* kvm_set_xcr() also depends on this */
 582                xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
 583                vcpu->guest_xcr0_loaded = 1;
 584        }
 585}
 586
 587static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
 588{
 589        if (vcpu->guest_xcr0_loaded) {
 590                if (vcpu->arch.xcr0 != host_xcr0)
 591                        xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
 592                vcpu->guest_xcr0_loaded = 0;
 593        }
 594}
 595
 596int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
 597{
 598        u64 xcr0;
 599        u64 valid_bits;
 600
  601        /* Only XCR_XFEATURE_ENABLED_MASK (i.e. xcr0) is supported for now */
 602        if (index != XCR_XFEATURE_ENABLED_MASK)
 603                return 1;
 604        xcr0 = xcr;
 605        if (!(xcr0 & XSTATE_FP))
 606                return 1;
 607        if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE))
 608                return 1;
 609
 610        /*
 611         * Do not allow the guest to set bits that we do not support
 612         * saving.  However, xcr0 bit 0 is always set, even if the
 613         * emulated CPU does not support XSAVE (see fx_init).
 614         */
 615        valid_bits = vcpu->arch.guest_supported_xcr0 | XSTATE_FP;
 616        if (xcr0 & ~valid_bits)
 617                return 1;
 618
 619        kvm_put_guest_xcr0(vcpu);
 620        vcpu->arch.xcr0 = xcr0;
 621        return 0;
 622}
 623
 624int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
 625{
 626        if (kvm_x86_ops->get_cpl(vcpu) != 0 ||
 627            __kvm_set_xcr(vcpu, index, xcr)) {
 628                kvm_inject_gp(vcpu, 0);
 629                return 1;
 630        }
 631        return 0;
 632}
 633EXPORT_SYMBOL_GPL(kvm_set_xcr);
 634
 635int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 636{
 637        unsigned long old_cr4 = kvm_read_cr4(vcpu);
 638        unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE |
 639                                   X86_CR4_PAE | X86_CR4_SMEP;
 640        if (cr4 & CR4_RESERVED_BITS)
 641                return 1;
 642
 643        if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE))
 644                return 1;
 645
 646        if (!guest_cpuid_has_smep(vcpu) && (cr4 & X86_CR4_SMEP))
 647                return 1;
 648
 649        if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_FSGSBASE))
 650                return 1;
 651
 652        if (is_long_mode(vcpu)) {
 653                if (!(cr4 & X86_CR4_PAE))
 654                        return 1;
 655        } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
 656                   && ((cr4 ^ old_cr4) & pdptr_bits)
 657                   && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
 658                                   kvm_read_cr3(vcpu)))
 659                return 1;
 660
 661        if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) {
 662                if (!guest_cpuid_has_pcid(vcpu))
 663                        return 1;
 664
  665                /* PCID cannot be enabled when cr3[11:0] != 000H or EFER.LMA = 0 */
 666                if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
 667                        return 1;
 668        }
 669
 670        if (kvm_x86_ops->set_cr4(vcpu, cr4))
 671                return 1;
 672
 673        if (((cr4 ^ old_cr4) & pdptr_bits) ||
 674            (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
 675                kvm_mmu_reset_context(vcpu);
 676
 677        if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE)
 678                kvm_update_cpuid(vcpu);
 679
 680        return 0;
 681}
 682EXPORT_SYMBOL_GPL(kvm_set_cr4);
 683
 684int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 685{
 686        if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
 687                kvm_mmu_sync_roots(vcpu);
 688                kvm_mmu_flush_tlb(vcpu);
 689                return 0;
 690        }
 691
 692        if (is_long_mode(vcpu)) {
 693                if (kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) {
 694                        if (cr3 & CR3_PCID_ENABLED_RESERVED_BITS)
 695                                return 1;
 696                } else
 697                        if (cr3 & CR3_L_MODE_RESERVED_BITS)
 698                                return 1;
 699        } else {
 700                if (is_pae(vcpu)) {
 701                        if (cr3 & CR3_PAE_RESERVED_BITS)
 702                                return 1;
 703                        if (is_paging(vcpu) &&
 704                            !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
 705                                return 1;
 706                }
 707                /*
 708                 * We don't check reserved bits in nonpae mode, because
 709                 * this isn't enforced, and VMware depends on this.
 710                 */
 711        }
 712
 713        vcpu->arch.cr3 = cr3;
 714        __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
 715        kvm_mmu_new_cr3(vcpu);
 716        return 0;
 717}
 718EXPORT_SYMBOL_GPL(kvm_set_cr3);
 719
 720int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
 721{
 722        if (cr8 & CR8_RESERVED_BITS)
 723                return 1;
 724        if (irqchip_in_kernel(vcpu->kvm))
 725                kvm_lapic_set_tpr(vcpu, cr8);
 726        else
 727                vcpu->arch.cr8 = cr8;
 728        return 0;
 729}
 730EXPORT_SYMBOL_GPL(kvm_set_cr8);
 731
 732unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
 733{
 734        if (irqchip_in_kernel(vcpu->kvm))
 735                return kvm_lapic_get_cr8(vcpu);
 736        else
 737                return vcpu->arch.cr8;
 738}
 739EXPORT_SYMBOL_GPL(kvm_get_cr8);
 740
 741static void kvm_update_dr6(struct kvm_vcpu *vcpu)
 742{
 743        if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
 744                kvm_x86_ops->set_dr6(vcpu, vcpu->arch.dr6);
 745}
 746
 747static void kvm_update_dr7(struct kvm_vcpu *vcpu)
 748{
 749        unsigned long dr7;
 750
 751        if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
 752                dr7 = vcpu->arch.guest_debug_dr7;
 753        else
 754                dr7 = vcpu->arch.dr7;
 755        kvm_x86_ops->set_dr7(vcpu, dr7);
 756        vcpu->arch.switch_db_regs = (dr7 & DR7_BP_EN_MASK);
 757}
 758
 759static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
 760{
 761        switch (dr) {
 762        case 0 ... 3:
 763                vcpu->arch.db[dr] = val;
 764                if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
 765                        vcpu->arch.eff_db[dr] = val;
 766                break;
 767        case 4:
 768                if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
 769                        return 1; /* #UD */
 770                /* fall through */
 771        case 6:
 772                if (val & 0xffffffff00000000ULL)
 773                        return -1; /* #GP */
 774                vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
 775                kvm_update_dr6(vcpu);
 776                break;
 777        case 5:
 778                if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
 779                        return 1; /* #UD */
 780                /* fall through */
 781        default: /* 7 */
 782                if (val & 0xffffffff00000000ULL)
 783                        return -1; /* #GP */
 784                vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
 785                kvm_update_dr7(vcpu);
 786                break;
 787        }
 788
 789        return 0;
 790}
 791
 792int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
 793{
 794        int res;
 795
 796        res = __kvm_set_dr(vcpu, dr, val);
 797        if (res > 0)
 798                kvm_queue_exception(vcpu, UD_VECTOR);
 799        else if (res < 0)
 800                kvm_inject_gp(vcpu, 0);
 801
 802        return res;
 803}
 804EXPORT_SYMBOL_GPL(kvm_set_dr);
 805
 806static int _kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
 807{
 808        switch (dr) {
 809        case 0 ... 3:
 810                *val = vcpu->arch.db[dr];
 811                break;
 812        case 4:
 813                if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
 814                        return 1;
 815                /* fall through */
 816        case 6:
 817                if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
 818                        *val = vcpu->arch.dr6;
 819                else
 820                        *val = kvm_x86_ops->get_dr6(vcpu);
 821                break;
 822        case 5:
 823                if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
 824                        return 1;
 825                /* fall through */
 826        default: /* 7 */
 827                *val = vcpu->arch.dr7;
 828                break;
 829        }
 830
 831        return 0;
 832}
 833
 834int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
 835{
 836        if (_kvm_get_dr(vcpu, dr, val)) {
 837                kvm_queue_exception(vcpu, UD_VECTOR);
 838                return 1;
 839        }
 840        return 0;
 841}
 842EXPORT_SYMBOL_GPL(kvm_get_dr);
 843
 844bool kvm_rdpmc(struct kvm_vcpu *vcpu)
 845{
 846        u32 ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
 847        u64 data;
 848        int err;
 849
 850        err = kvm_pmu_read_pmc(vcpu, ecx, &data);
 851        if (err)
 852                return err;
 853        kvm_register_write(vcpu, VCPU_REGS_RAX, (u32)data);
 854        kvm_register_write(vcpu, VCPU_REGS_RDX, data >> 32);
 855        return err;
 856}
 857EXPORT_SYMBOL_GPL(kvm_rdpmc);
 858
 859/*
  860 * List of MSR numbers which we expose to userspace through KVM_GET_MSRS,
  861 * KVM_SET_MSRS and KVM_GET_MSR_INDEX_LIST.
  862 *
  863 * This list is modified at module load time to reflect the
  864 * capabilities of the host CPU. The capability test skips MSRs that are
  865 * kvm-specific; those are put at the beginning of the list.
 866 */
 867
 868#define KVM_SAVE_MSRS_BEGIN     12
 869static u32 msrs_to_save[] = {
 870        MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
 871        MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
 872        HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
 873        HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
 874        HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
 875        MSR_KVM_PV_EOI_EN,
 876        MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
 877        MSR_STAR,
 878#ifdef CONFIG_X86_64
 879        MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
 880#endif
 881        MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
 882        MSR_IA32_FEATURE_CONTROL
 883};
 884
 885static unsigned num_msrs_to_save;
 886
 887static const u32 emulated_msrs[] = {
 888        MSR_IA32_TSC_ADJUST,
 889        MSR_IA32_TSCDEADLINE,
 890        MSR_IA32_MISC_ENABLE,
 891        MSR_IA32_MCG_STATUS,
 892        MSR_IA32_MCG_CTL,
 893};
 894
 895bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
 896{
 897        if (efer & efer_reserved_bits)
 898                return false;
 899
 900        if (efer & EFER_FFXSR) {
 901                struct kvm_cpuid_entry2 *feat;
 902
 903                feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
 904                if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT)))
 905                        return false;
 906        }
 907
 908        if (efer & EFER_SVME) {
 909                struct kvm_cpuid_entry2 *feat;
 910
 911                feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
 912                if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM)))
 913                        return false;
 914        }
 915
 916        return true;
 917}
 918EXPORT_SYMBOL_GPL(kvm_valid_efer);
 919
 920static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
 921{
 922        u64 old_efer = vcpu->arch.efer;
 923
 924        if (!kvm_valid_efer(vcpu, efer))
 925                return 1;
 926
 927        if (is_paging(vcpu)
 928            && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
 929                return 1;
 930
 931        efer &= ~EFER_LMA;
 932        efer |= vcpu->arch.efer & EFER_LMA;
 933
 934        kvm_x86_ops->set_efer(vcpu, efer);
 935
 936        /* Update reserved bits */
 937        if ((efer ^ old_efer) & EFER_NX)
 938                kvm_mmu_reset_context(vcpu);
 939
 940        return 0;
 941}
 942
 943void kvm_enable_efer_bits(u64 mask)
 944{
 945       efer_reserved_bits &= ~mask;
 946}
 947EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
 948
 949
 950/*
  951 * Writes the MSR value into the appropriate "register".
 952 * Returns 0 on success, non-0 otherwise.
 953 * Assumes vcpu_load() was already called.
 954 */
 955int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
 956{
 957        return kvm_x86_ops->set_msr(vcpu, msr);
 958}
 959
 960/*
 961 * Adapt set_msr() to msr_io()'s calling convention
 962 */
 963static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
 964{
 965        struct msr_data msr;
 966
 967        msr.data = *data;
 968        msr.index = index;
 969        msr.host_initiated = true;
 970        return kvm_set_msr(vcpu, &msr);
 971}
 972
 973#ifdef CONFIG_X86_64
 974struct pvclock_gtod_data {
 975        seqcount_t      seq;
 976
 977        struct { /* extract of a clocksource struct */
 978                int vclock_mode;
 979                cycle_t cycle_last;
 980                cycle_t mask;
 981                u32     mult;
 982                u32     shift;
 983        } clock;
 984
 985        /* open coded 'struct timespec' */
 986        u64             monotonic_time_snsec;
 987        time_t          monotonic_time_sec;
 988};
 989
 990static struct pvclock_gtod_data pvclock_gtod_data;
 991
 992static void update_pvclock_gtod(struct timekeeper *tk)
 993{
 994        struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
 995
 996        write_seqcount_begin(&vdata->seq);
 997
 998        /* copy pvclock gtod data */
 999        vdata->clock.vclock_mode        = tk->clock->archdata.vclock_mode;
1000        vdata->clock.cycle_last         = tk->clock->cycle_last;
1001        vdata->clock.mask               = tk->clock->mask;
1002        vdata->clock.mult               = tk->mult;
1003        vdata->clock.shift              = tk->shift;
1004
1005        vdata->monotonic_time_sec       = tk->xtime_sec
1006                                        + tk->wall_to_monotonic.tv_sec;
1007        vdata->monotonic_time_snsec     = tk->xtime_nsec
1008                                        + (tk->wall_to_monotonic.tv_nsec
1009                                                << tk->shift);
1010        while (vdata->monotonic_time_snsec >=
1011                                        (((u64)NSEC_PER_SEC) << tk->shift)) {
1012                vdata->monotonic_time_snsec -=
1013                                        ((u64)NSEC_PER_SEC) << tk->shift;
1014                vdata->monotonic_time_sec++;
1015        }
1016
1017        write_seqcount_end(&vdata->seq);
1018}
1019#endif
1020
1021
1022static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
1023{
1024        int version;
1025        int r;
1026        struct pvclock_wall_clock wc;
1027        struct timespec boot;
1028
1029        if (!wall_clock)
1030                return;
1031
1032        r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version));
1033        if (r)
1034                return;
1035
1036        if (version & 1)
1037                ++version;  /* first time write, random junk */
1038
1039        ++version;
1040
1041        kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
1042
1043        /*
1044         * The guest calculates current wall clock time by adding
1045         * system time (updated by kvm_guest_time_update below) to the
 1046         * wall clock specified here.  Guest system time equals host
 1047         * system time for us, thus we must fill in the host boot time here.
1048         */
1049        getboottime(&boot);
1050
1051        if (kvm->arch.kvmclock_offset) {
1052                struct timespec ts = ns_to_timespec(kvm->arch.kvmclock_offset);
1053                boot = timespec_sub(boot, ts);
1054        }
1055        wc.sec = boot.tv_sec;
1056        wc.nsec = boot.tv_nsec;
1057        wc.version = version;
1058
1059        kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
1060
1061        version++;
1062        kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
1063}
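
     /*
      * The odd/even dance above mirrors the pvclock ABI: the version field
      * is odd while the structure is being updated and even once the update
      * is complete.  A guest is expected to read version, re-read it after
      * copying the data, and retry if the value was odd or changed in
      * between, roughly:
      *
      *     do {
      *             v = wc->version;
      *             rmb();
      *             sec = wc->sec; nsec = wc->nsec;
      *             rmb();
      *     } while ((v & 1) || v != wc->version);
      */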
1064
1065static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
1066{
1067        uint32_t quotient, remainder;
1068
 1069        /* Don't try to replace this with do_div(); it calculates
 1070         * "(dividend << 32) / divisor" */
1071        __asm__ ( "divl %4"
1072                  : "=a" (quotient), "=d" (remainder)
1073                  : "0" (0), "1" (dividend), "r" (divisor) );
1074        return quotient;
1075}
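
     /*
      * div_frac() therefore returns dividend/divisor as a 32-bit binary
      * fraction (for dividend < divisor).  For example, div_frac(1, 3) ==
      * 0x55555555 (~1/3 * 2^32) and div_frac(1000, 2000) == 0x80000000
      * (exactly one half), so a later "(delta * mult) >> 32" scales delta
      * by dividend/divisor.
      */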
1076
1077static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz,
1078                               s8 *pshift, u32 *pmultiplier)
1079{
1080        uint64_t scaled64;
1081        int32_t  shift = 0;
1082        uint64_t tps64;
1083        uint32_t tps32;
1084
1085        tps64 = base_khz * 1000LL;
1086        scaled64 = scaled_khz * 1000LL;
1087        while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) {
1088                tps64 >>= 1;
1089                shift--;
1090        }
1091
1092        tps32 = (uint32_t)tps64;
1093        while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) {
1094                if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000)
1095                        scaled64 >>= 1;
1096                else
1097                        tps32 <<= 1;
1098                shift++;
1099        }
1100
1101        *pshift = shift;
1102        *pmultiplier = div_frac(scaled64, tps32);
1103
1104        pr_debug("%s: base_khz %u => %u, shift %d, mul %u\n",
1105                 __func__, base_khz, scaled_khz, shift, *pmultiplier);
1106}
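
     /*
      * Example: scaling a 2 GHz TSC to nanoseconds is a call of
      * kvm_get_time_scale(1000000, 2000000, &shift, &mult).  Neither loop
      * body executes, so shift stays 0 and mult = div_frac(1000000000,
      * 2000000000) = 0x80000000: each TSC tick contributes half a
      * nanosecond, as expected for a 2 GHz clock.
      */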
1107
1108static inline u64 get_kernel_ns(void)
1109{
1110        struct timespec ts;
1111
1112        WARN_ON(preemptible());
1113        ktime_get_ts(&ts);
1114        monotonic_to_bootbased(&ts);
1115        return timespec_to_ns(&ts);
1116}
1117
1118#ifdef CONFIG_X86_64
1119static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);
1120#endif
1121
1122static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
1123unsigned long max_tsc_khz;
1124
1125static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
1126{
1127        return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult,
1128                                   vcpu->arch.virtual_tsc_shift);
1129}
1130
1131static u32 adjust_tsc_khz(u32 khz, s32 ppm)
1132{
1133        u64 v = (u64)khz * (1000000 + ppm);
1134        do_div(v, 1000000);
1135        return v;
1136}
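
     /*
      * E.g. adjust_tsc_khz(1000000, 250) = 1000250 and
      * adjust_tsc_khz(1000000, -250) = 999750, so with the default
      * tsc_tolerance_ppm of 250 a host TSC of exactly 1 GHz accommodates
      * guest rates in [999750 kHz, 1000250 kHz] without scaling.
      */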
1137
1138static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
1139{
1140        u32 thresh_lo, thresh_hi;
1141        int use_scaling = 0;
1142
1143        /* tsc_khz can be zero if TSC calibration fails */
1144        if (this_tsc_khz == 0)
1145                return;
1146
1147        /* Compute a scale to convert nanoseconds in TSC cycles */
1148        kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000,
1149                           &vcpu->arch.virtual_tsc_shift,
1150                           &vcpu->arch.virtual_tsc_mult);
1151        vcpu->arch.virtual_tsc_khz = this_tsc_khz;
1152
1153        /*
 1154         * Compute the acceptable variation in TSC rate and decide
 1155         * whether the rate being requested is within those bounds of
 1156         * the hardware rate.  If so, no scaling or compensation is
 1157         * needed.
1158         */
1159        thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
1160        thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);
1161        if (this_tsc_khz < thresh_lo || this_tsc_khz > thresh_hi) {
1162                pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", this_tsc_khz, thresh_lo, thresh_hi);
1163                use_scaling = 1;
1164        }
1165        kvm_x86_ops->set_tsc_khz(vcpu, this_tsc_khz, use_scaling);
1166}
1167
1168static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
1169{
1170        u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec,
1171                                      vcpu->arch.virtual_tsc_mult,
1172                                      vcpu->arch.virtual_tsc_shift);
1173        tsc += vcpu->arch.this_tsc_write;
1174        return tsc;
1175}
1176
1177void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
1178{
1179#ifdef CONFIG_X86_64
1180        bool vcpus_matched;
1181        bool do_request = false;
1182        struct kvm_arch *ka = &vcpu->kvm->arch;
1183        struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
1184
1185        vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
1186                         atomic_read(&vcpu->kvm->online_vcpus));
1187
1188        if (vcpus_matched && gtod->clock.vclock_mode == VCLOCK_TSC)
1189                if (!ka->use_master_clock)
1190                        do_request = 1;
1191
1192        if (!vcpus_matched && ka->use_master_clock)
1193                        do_request = 1;
1194
1195        if (do_request)
1196                kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
1197
1198        trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc,
1199                            atomic_read(&vcpu->kvm->online_vcpus),
1200                            ka->use_master_clock, gtod->clock.vclock_mode);
1201#endif
1202}
1203
1204static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset)
1205{
1206        u64 curr_offset = kvm_x86_ops->read_tsc_offset(vcpu);
1207        vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset;
1208}
1209
1210void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
1211{
1212        struct kvm *kvm = vcpu->kvm;
1213        u64 offset, ns, elapsed;
1214        unsigned long flags;
1215        s64 usdiff;
1216        bool matched;
1217        u64 data = msr->data;
1218
1219        raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
1220        offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
1221        ns = get_kernel_ns();
1222        elapsed = ns - kvm->arch.last_tsc_nsec;
1223
1224        if (vcpu->arch.virtual_tsc_khz) {
1225                int faulted = 0;
1226
1227                /* n.b - signed multiplication and division required */
1228                usdiff = data - kvm->arch.last_tsc_write;
1229#ifdef CONFIG_X86_64
1230                usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz;
1231#else
1232                /* do_div() only does unsigned */
1233                asm("1: idivl %[divisor]\n"
1234                    "2: xor %%edx, %%edx\n"
1235                    "   movl $0, %[faulted]\n"
1236                    "3:\n"
1237                    ".section .fixup,\"ax\"\n"
1238                    "4: movl $1, %[faulted]\n"
1239                    "   jmp  3b\n"
1240                    ".previous\n"
1241
1242                _ASM_EXTABLE(1b, 4b)
1243
1244                : "=A"(usdiff), [faulted] "=r" (faulted)
1245                : "A"(usdiff * 1000), [divisor] "rm"(vcpu->arch.virtual_tsc_khz));
1246
1247#endif
1248                do_div(elapsed, 1000);
1249                usdiff -= elapsed;
1250                if (usdiff < 0)
1251                        usdiff = -usdiff;
1252
1253                /* idivl overflow => difference is larger than USEC_PER_SEC */
1254                if (faulted)
1255                        usdiff = USEC_PER_SEC;
1256        } else
1257                usdiff = USEC_PER_SEC; /* disable TSC match window below */
1258
1259        /*
1260         * Special case: TSC write with a small delta (1 second) of virtual
1261         * cycle time against real time is interpreted as an attempt to
1262         * synchronize the CPU.
1263         *
1264         * For a reliable TSC, we can match TSC offsets, and for an unstable
1265         * TSC, we add elapsed time in this computation.  We could let the
1266         * compensation code attempt to catch up if we fall behind, but
1267         * it's better to try to match offsets from the beginning.
1268         */
1269        if (usdiff < USEC_PER_SEC &&
1270            vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
1271                if (!check_tsc_unstable()) {
1272                        offset = kvm->arch.cur_tsc_offset;
1273                        pr_debug("kvm: matched tsc offset for %llu\n", data);
1274                } else {
1275                        u64 delta = nsec_to_cycles(vcpu, elapsed);
1276                        data += delta;
1277                        offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
1278                        pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
1279                }
1280                matched = true;
1281        } else {
1282                /*
1283                 * We split periods of matched TSC writes into generations.
1284                 * For each generation, we track the original measured
1285                 * nanosecond time, offset, and write, so if TSCs are in
1286                 * sync, we can match exact offset, and if not, we can match
1287                 * exact software computation in compute_guest_tsc()
1288                 *
1289                 * These values are tracked in kvm->arch.cur_xxx variables.
1290                 */
1291                kvm->arch.cur_tsc_generation++;
1292                kvm->arch.cur_tsc_nsec = ns;
1293                kvm->arch.cur_tsc_write = data;
1294                kvm->arch.cur_tsc_offset = offset;
1295                matched = false;
1296                pr_debug("kvm: new tsc generation %u, clock %llu\n",
1297                         kvm->arch.cur_tsc_generation, data);
1298        }
1299
1300        /*
 1301         * We also track the most recent recorded KHz, write and time to
1302         * allow the matching interval to be extended at each write.
1303         */
1304        kvm->arch.last_tsc_nsec = ns;
1305        kvm->arch.last_tsc_write = data;
1306        kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
1307
1308        vcpu->arch.last_guest_tsc = data;
1309
1310        /* Keep track of which generation this VCPU has synchronized to */
1311        vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;
1312        vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
1313        vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
1314
1315        if (guest_cpuid_has_tsc_adjust(vcpu) && !msr->host_initiated)
1316                update_ia32_tsc_adjust_msr(vcpu, offset);
1317        kvm_x86_ops->write_tsc_offset(vcpu, offset);
1318        raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
1319
1320        spin_lock(&kvm->arch.pvclock_gtod_sync_lock);
1321        if (matched)
1322                kvm->arch.nr_vcpus_matched_tsc++;
1323        else
1324                kvm->arch.nr_vcpus_matched_tsc = 0;
1325
1326        kvm_track_tsc_matching(vcpu);
1327        spin_unlock(&kvm->arch.pvclock_gtod_sync_lock);
1328}
1329
1330EXPORT_SYMBOL_GPL(kvm_write_tsc);
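
     /*
      * Net effect of the matching logic above: when the vCPUs write TSC
      * values that agree to within a second of guest time (the typical
      * "every vCPU writes 0 at boot" pattern), they all land in the same
      * TSC generation, nr_vcpus_matched_tsc reaches online_vcpus - 1 and
      * kvm_track_tsc_matching() can request use of the masterclock.  An
      * out-of-window write starts a new generation and resets the match
      * count, disabling that optimization until the TSCs resynchronize.
      */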
1331
1332#ifdef CONFIG_X86_64
1333
1334static cycle_t read_tsc(void)
1335{
1336        cycle_t ret;
1337        u64 last;
1338
1339        /*
1340         * Empirically, a fence (of type that depends on the CPU)
1341         * before rdtsc is enough to ensure that rdtsc is ordered
1342         * with respect to loads.  The various CPU manuals are unclear
1343         * as to whether rdtsc can be reordered with later loads,
1344         * but no one has ever seen it happen.
1345         */
1346        rdtsc_barrier();
1347        ret = (cycle_t)vget_cycles();
1348
1349        last = pvclock_gtod_data.clock.cycle_last;
1350
1351        if (likely(ret >= last))
1352                return ret;
1353
1354        /*
1355         * GCC likes to generate cmov here, but this branch is extremely
 1356         * predictable (it's just a function of time and the likely is
1357         * very likely) and there's a data dependence, so force GCC
1358         * to generate a branch instead.  I don't barrier() because
1359         * we don't actually need a barrier, and if this function
1360         * ever gets inlined it will generate worse code.
1361         */
1362        asm volatile ("");
1363        return last;
1364}
1365
1366static inline u64 vgettsc(cycle_t *cycle_now)
1367{
1368        long v;
1369        struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
1370
1371        *cycle_now = read_tsc();
1372
1373        v = (*cycle_now - gtod->clock.cycle_last) & gtod->clock.mask;
1374        return v * gtod->clock.mult;
1375}
1376
1377static int do_monotonic(struct timespec *ts, cycle_t *cycle_now)
1378{
1379        unsigned long seq;
1380        u64 ns;
1381        int mode;
1382        struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
1383
1384        ts->tv_nsec = 0;
1385        do {
1386                seq = read_seqcount_begin(&gtod->seq);
1387                mode = gtod->clock.vclock_mode;
1388                ts->tv_sec = gtod->monotonic_time_sec;
1389                ns = gtod->monotonic_time_snsec;
1390                ns += vgettsc(cycle_now);
1391                ns >>= gtod->clock.shift;
1392        } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
1393        timespec_add_ns(ts, ns);
1394
1395        return mode;
1396}
1397
1398/* returns true if host is using tsc clocksource */
1399static bool kvm_get_time_and_clockread(s64 *kernel_ns, cycle_t *cycle_now)
1400{
1401        struct timespec ts;
1402
1403        /* checked again under seqlock below */
1404        if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC)
1405                return false;
1406
1407        if (do_monotonic(&ts, cycle_now) != VCLOCK_TSC)
1408                return false;
1409
1410        monotonic_to_bootbased(&ts);
1411        *kernel_ns = timespec_to_ns(&ts);
1412
1413        return true;
1414}
1415#endif
1416
1417/*
1418 *
 1419 * Assuming a stable TSC across physical CPUs, and a stable TSC
1420 * across virtual CPUs, the following condition is possible.
1421 * Each numbered line represents an event visible to both
1422 * CPUs at the next numbered event.
1423 *
1424 * "timespecX" represents host monotonic time. "tscX" represents
1425 * RDTSC value.
1426 *
1427 *              VCPU0 on CPU0           |       VCPU1 on CPU1
1428 *
1429 * 1.  read timespec0,tsc0
1430 * 2.                                   | timespec1 = timespec0 + N
1431 *                                      | tsc1 = tsc0 + M
1432 * 3. transition to guest               | transition to guest
1433 * 4. ret0 = timespec0 + (rdtsc - tsc0) |
1434 * 5.                                   | ret1 = timespec1 + (rdtsc - tsc1)
1435 *                                      | ret1 = timespec0 + N + (rdtsc - (tsc0 + M))
1436 *
1437 * Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity:
1438 *
1439 *      - ret0 < ret1
1440 *      - timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M))
1441 *              ...
1442 *      - 0 < N - M => M < N
1443 *
1444 * That is, when timespec0 != timespec1, M < N. Unfortunately that is not
1445 * always the case (the difference between two distinct xtime instances
 1446 * might be smaller than the difference between corresponding TSC reads,
 1447 * when updating guest vcpus' pvclock areas).
1448 *
1449 * To avoid that problem, do not allow visibility of distinct
1450 * system_timestamp/tsc_timestamp values simultaneously: use a master
1451 * copy of host monotonic time values. Update that master copy
1452 * in lockstep.
1453 *
1454 * Rely on synchronization of host TSCs and guest TSCs for monotonicity.
1455 *
1456 */
1457
1458static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
1459{
1460#ifdef CONFIG_X86_64
1461        struct kvm_arch *ka = &kvm->arch;
1462        int vclock_mode;
1463        bool host_tsc_clocksource, vcpus_matched;
1464
1465        vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
1466                        atomic_read(&kvm->online_vcpus));
1467
1468        /*
 1469         * If the host uses the TSC clocksource, then pass the TSC through
 1470         * as stable to the guest.
1471         */
1472        host_tsc_clocksource = kvm_get_time_and_clockread(
1473                                        &ka->master_kernel_ns,
1474                                        &ka->master_cycle_now);
1475
1476        ka->use_master_clock = host_tsc_clocksource & vcpus_matched;
1477
1478        if (ka->use_master_clock)
1479                atomic_set(&kvm_guest_has_master_clock, 1);
1480
1481        vclock_mode = pvclock_gtod_data.clock.vclock_mode;
1482        trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode,
1483                                        vcpus_matched);
1484#endif
1485}
1486
1487static void kvm_gen_update_masterclock(struct kvm *kvm)
1488{
1489#ifdef CONFIG_X86_64
1490        int i;
1491        struct kvm_vcpu *vcpu;
1492        struct kvm_arch *ka = &kvm->arch;
1493
1494        spin_lock(&ka->pvclock_gtod_sync_lock);
1495        kvm_make_mclock_inprogress_request(kvm);
1496        /* no guest entries from this point */
1497        pvclock_update_vm_gtod_copy(kvm);
1498
1499        kvm_for_each_vcpu(i, vcpu, kvm)
1500                set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
1501
1502        /* guest entries allowed */
1503        kvm_for_each_vcpu(i, vcpu, kvm)
1504                clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests);
1505
1506        spin_unlock(&ka->pvclock_gtod_sync_lock);
1507#endif
1508}
1509
1510static int kvm_guest_time_update(struct kvm_vcpu *v)
1511{
1512        unsigned long flags, this_tsc_khz;
1513        struct kvm_vcpu_arch *vcpu = &v->arch;
1514        struct kvm_arch *ka = &v->kvm->arch;
1515        s64 kernel_ns;
1516        u64 tsc_timestamp, host_tsc;
1517        struct pvclock_vcpu_time_info guest_hv_clock;
1518        u8 pvclock_flags;
1519        bool use_master_clock;
1520
1521        kernel_ns = 0;
1522        host_tsc = 0;
1523
1524        /*
 1525         * If the host uses the TSC clocksource, then pass the TSC through
 1526         * as stable to the guest.
1527         */
1528        spin_lock(&ka->pvclock_gtod_sync_lock);
1529        use_master_clock = ka->use_master_clock;
1530        if (use_master_clock) {
1531                host_tsc = ka->master_cycle_now;
1532                kernel_ns = ka->master_kernel_ns;
1533        }
1534        spin_unlock(&ka->pvclock_gtod_sync_lock);
1535
1536        /* Keep irq disabled to prevent changes to the clock */
1537        local_irq_save(flags);
1538        this_tsc_khz = __get_cpu_var(cpu_tsc_khz);
1539        if (unlikely(this_tsc_khz == 0)) {
1540                local_irq_restore(flags);
1541                kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
1542                return 1;
1543        }
1544        if (!use_master_clock) {
1545                host_tsc = native_read_tsc();
1546                kernel_ns = get_kernel_ns();
1547        }
1548
1549        tsc_timestamp = kvm_x86_ops->read_l1_tsc(v, host_tsc);
1550
1551        /*
1552         * We may have to catch up the TSC to match elapsed wall clock
1553         * time for two reasons, even if kvmclock is used.
1554         *   1) CPU could have been running below the maximum TSC rate
1555         *   2) Broken TSC compensation resets the base at each VCPU
1556         *      entry to avoid unknown leaps of TSC even when running
1557         *      again on the same CPU.  This may cause apparent elapsed
1558         *      time to disappear, and the guest to stand still or run
1559         *      very slowly.
1560         */
1561        if (vcpu->tsc_catchup) {
1562                u64 tsc = compute_guest_tsc(v, kernel_ns);
1563                if (tsc > tsc_timestamp) {
1564                        adjust_tsc_offset_guest(v, tsc - tsc_timestamp);
1565                        tsc_timestamp = tsc;
1566                }
1567        }
1568
1569        local_irq_restore(flags);
1570
1571        if (!vcpu->pv_time_enabled)
1572                return 0;
1573
1574        if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) {
1575                kvm_get_time_scale(NSEC_PER_SEC / 1000, this_tsc_khz,
1576                                   &vcpu->hv_clock.tsc_shift,
1577                                   &vcpu->hv_clock.tsc_to_system_mul);
1578                vcpu->hw_tsc_khz = this_tsc_khz;
1579        }
1580
1581        /* With all the info we got, fill in the values */
1582        vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
1583        vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
1584        vcpu->last_kernel_ns = kernel_ns;
1585        vcpu->last_guest_tsc = tsc_timestamp;
1586
1587        /*
1588         * The interface expects us to write an even number signaling that the
1589         * update is finished. Since the guest won't see the intermediate
1590         * state, we just increase by 2 at the end.
1591         */
1592        vcpu->hv_clock.version += 2;
1593
1594        if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
1595                &guest_hv_clock, sizeof(guest_hv_clock))))
1596                return 0;
1597
1598        /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
1599        pvclock_flags = (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
1600
1601        if (vcpu->pvclock_set_guest_stopped_request) {
1602                pvclock_flags |= PVCLOCK_GUEST_STOPPED;
1603                vcpu->pvclock_set_guest_stopped_request = false;
1604        }
1605
1606        /* If the host uses TSC clocksource, then it is stable */
1607        if (use_master_clock)
1608                pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
1609
1610        vcpu->hv_clock.flags = pvclock_flags;
1611
1612        kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
1613                                &vcpu->hv_clock,
1614                                sizeof(vcpu->hv_clock));
1615        return 0;
1616}
1617
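/*
 * For illustration: a simplified sketch of how a pvclock consumer reads the
 * structure written above.  The version field acts as a seqcount: the reader
 * retries while the version is odd or has changed, and the writer above keeps
 * the guest-visible version even by increasing it by 2 per completed update.
 * The real guest-side implementation lives in arch/x86/kernel/pvclock.c; this
 * sketch is not compiled.
 */
#if 0
static u64 example_pvclock_read(struct pvclock_vcpu_time_info *src)
{
        u32 version;
        u64 tsc_delta, ns;

        do {
                version = src->version;
                rmb();          /* fetch the payload only after the version */
                tsc_delta = native_read_tsc() - src->tsc_timestamp;
                ns = src->system_time +
                     pvclock_scale_delta(tsc_delta, src->tsc_to_system_mul,
                                         src->tsc_shift);
                rmb();          /* re-read the version only after the payload */
        } while ((version & 1) || version != src->version);

        return ns;
}
#endif
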
1618/*
1619 * kvmclock updates which are isolated to a given vcpu, such as
1620 * vcpu->cpu migration, should not allow the system_timestamp of
1621 * the rest of the vcpus to remain static. Otherwise NTP frequency
1622 * correction applies to one vcpu's system_timestamp but not to
1623 * the others.
1624 *
1625 * So in those cases, request a kvmclock update for all vcpus.
1626 * The worst-case time for a remote vcpu to update its kvmclock
1627 * is then bounded by the maximum nohz sleep latency.
1628 */
1629
1630static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
1631{
1632        int i;
1633        struct kvm *kvm = v->kvm;
1634        struct kvm_vcpu *vcpu;
1635
1636        kvm_for_each_vcpu(i, vcpu, kvm) {
1637                set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
1638                kvm_vcpu_kick(vcpu);
1639        }
1640}
1641
1642static bool msr_mtrr_valid(unsigned msr)
1643{
1644        switch (msr) {
1645        case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1:
1646        case MSR_MTRRfix64K_00000:
1647        case MSR_MTRRfix16K_80000:
1648        case MSR_MTRRfix16K_A0000:
1649        case MSR_MTRRfix4K_C0000:
1650        case MSR_MTRRfix4K_C8000:
1651        case MSR_MTRRfix4K_D0000:
1652        case MSR_MTRRfix4K_D8000:
1653        case MSR_MTRRfix4K_E0000:
1654        case MSR_MTRRfix4K_E8000:
1655        case MSR_MTRRfix4K_F0000:
1656        case MSR_MTRRfix4K_F8000:
1657        case MSR_MTRRdefType:
1658        case MSR_IA32_CR_PAT:
1659                return true;
1660        case 0x2f8:
1661                return true;
1662        }
1663        return false;
1664}
1665
1666static bool valid_pat_type(unsigned t)
1667{
1668        return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */
1669}
1670
1671static bool valid_mtrr_type(unsigned t)
1672{
1673        return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */
1674}
1675
1676static bool mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1677{
1678        int i;
1679
1680        if (!msr_mtrr_valid(msr))
1681                return false;
1682
1683        if (msr == MSR_IA32_CR_PAT) {
1684                for (i = 0; i < 8; i++)
1685                        if (!valid_pat_type((data >> (i * 8)) & 0xff))
1686                                return false;
1687                return true;
1688        } else if (msr == MSR_MTRRdefType) {
1689                if (data & ~0xcff)
1690                        return false;
1691                return valid_mtrr_type(data & 0xff);
1692        } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) {
1693                for (i = 0; i < 8 ; i++)
1694                        if (!valid_mtrr_type((data >> (i * 8)) & 0xff))
1695                                return false;
1696                return true;
1697        }
1698
1699        /* variable MTRRs */
1700        return valid_mtrr_type(data & 0xff);
1701}
1702
1703static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1704{
1705        u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
1706
1707        if (!mtrr_valid(vcpu, msr, data))
1708                return 1;
1709
1710        if (msr == MSR_MTRRdefType) {
1711                vcpu->arch.mtrr_state.def_type = data;
1712                vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10;
1713        } else if (msr == MSR_MTRRfix64K_00000)
1714                p[0] = data;
1715        else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
1716                p[1 + msr - MSR_MTRRfix16K_80000] = data;
1717        else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
1718                p[3 + msr - MSR_MTRRfix4K_C0000] = data;
1719        else if (msr == MSR_IA32_CR_PAT)
1720                vcpu->arch.pat = data;
1721        else {  /* Variable MTRRs */
1722                int idx, is_mtrr_mask;
1723                u64 *pt;
1724
1725                idx = (msr - 0x200) / 2;
1726                is_mtrr_mask = msr - 0x200 - 2 * idx;
1727                if (!is_mtrr_mask)
1728                        pt =
1729                          (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
1730                else
1731                        pt =
1732                          (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
1733                *pt = data;
1734        }
1735
1736        kvm_mmu_reset_context(vcpu);
1737        return 0;
1738}
1739
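/*
 * Worked example of the variable-range decode above (illustration only):
 * a write to MSR 0x203 (MTRRphysMask1) yields idx = (0x203 - 0x200) / 2 = 1
 * and is_mtrr_mask = 0x203 - 0x200 - 2 * 1 = 1, so the 64-bit value is
 * written over var_ranges[1].mask_lo/mask_hi, while MSR 0x202
 * (MTRRphysBase1) would select the same range's base_lo/base_hi instead.
 */
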
1740static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1741{
1742        u64 mcg_cap = vcpu->arch.mcg_cap;
1743        unsigned bank_num = mcg_cap & 0xff;
1744
1745        switch (msr) {
1746        case MSR_IA32_MCG_STATUS:
1747                vcpu->arch.mcg_status = data;
1748                break;
1749        case MSR_IA32_MCG_CTL:
1750                if (!(mcg_cap & MCG_CTL_P))
1751                        return 1;
1752                if (data != 0 && data != ~(u64)0)
1753                        return -1;
1754                vcpu->arch.mcg_ctl = data;
1755                break;
1756        default:
1757                if (msr >= MSR_IA32_MC0_CTL &&
1758                    msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
1759                        u32 offset = msr - MSR_IA32_MC0_CTL;
1760                        /* Only 0 or all 1s can be written to IA32_MCi_CTL.
1761                         * Some Linux kernels, though, clear bit 10 in bank 4 to
1762                         * work around a BIOS/GART TBL issue on AMD K8s; ignore
1763                         * this to avoid an uncaught #GP in the guest.
1764                         */
1765                        if ((offset & 0x3) == 0 &&
1766                            data != 0 && (data | (1 << 10)) != ~(u64)0)
1767                                return -1;
1768                        vcpu->arch.mce_banks[offset] = data;
1769                        break;
1770                }
1771                return 1;
1772        }
1773        return 0;
1774}
1775
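/*
 * Illustration only: with the 4-register bank stride above, (offset & 0x3) == 0
 * selects the IA32_MCi_CTL register itself (offsets 1-3 are STATUS, ADDR and
 * MISC).  A write of 0xfffffffffffffbff (all 1s except bit 10) to MC4_CTL is
 * therefore still accepted, because data | (1 << 10) equals ~(u64)0.
 */
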
1776static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
1777{
1778        struct kvm *kvm = vcpu->kvm;
1779        int lm = is_long_mode(vcpu);
1780        u8 *blob_addr = lm ? (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_64
1781                : (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32;
1782        u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
1783                : kvm->arch.xen_hvm_config.blob_size_32;
1784        u32 page_num = data & ~PAGE_MASK;
1785        u64 page_addr = data & PAGE_MASK;
1786        u8 *page;
1787        int r;
1788
1789        r = -E2BIG;
1790        if (page_num >= blob_size)
1791                goto out;
1792        r = -ENOMEM;
1793        page = memdup_user(blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE);
1794        if (IS_ERR(page)) {
1795                r = PTR_ERR(page);
1796                goto out;
1797        }
1798        if (kvm_write_guest(kvm, page_addr, page, PAGE_SIZE))
1799                goto out_free;
1800        r = 0;
1801out_free:
1802        kfree(page);
1803out:
1804        return r;
1805}
1806
1807static bool kvm_hv_hypercall_enabled(struct kvm *kvm)
1808{
1809        return kvm->arch.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE;
1810}
1811
1812static bool kvm_hv_msr_partition_wide(u32 msr)
1813{
1814        bool r = false;
1815        switch (msr) {
1816        case HV_X64_MSR_GUEST_OS_ID:
1817        case HV_X64_MSR_HYPERCALL:
1818        case HV_X64_MSR_REFERENCE_TSC:
1819        case HV_X64_MSR_TIME_REF_COUNT:
1820                r = true;
1821                break;
1822        }
1823
1824        return r;
1825}
1826
1827static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1828{
1829        struct kvm *kvm = vcpu->kvm;
1830
1831        switch (msr) {
1832        case HV_X64_MSR_GUEST_OS_ID:
1833                kvm->arch.hv_guest_os_id = data;
1834                /* setting guest os id to zero disables hypercall page */
1835                if (!kvm->arch.hv_guest_os_id)
1836                        kvm->arch.hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE;
1837                break;
1838        case HV_X64_MSR_HYPERCALL: {
1839                u64 gfn;
1840                unsigned long addr;
1841                u8 instructions[4];
1842
1843                /* if guest os id is not set, the hypercall page should remain disabled */
1844                if (!kvm->arch.hv_guest_os_id)
1845                        break;
1846                if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) {
1847                        kvm->arch.hv_hypercall = data;
1848                        break;
1849                }
1850                gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT;
1851                addr = gfn_to_hva(kvm, gfn);
1852                if (kvm_is_error_hva(addr))
1853                        return 1;
1854                kvm_x86_ops->patch_hypercall(vcpu, instructions);
1855                ((unsigned char *)instructions)[3] = 0xc3; /* ret */
1856                if (__copy_to_user((void __user *)addr, instructions, 4))
1857                        return 1;
1858                kvm->arch.hv_hypercall = data;
1859                mark_page_dirty(kvm, gfn);
1860                break;
1861        }
1862        case HV_X64_MSR_REFERENCE_TSC: {
1863                u64 gfn;
1864                HV_REFERENCE_TSC_PAGE tsc_ref;
1865                memset(&tsc_ref, 0, sizeof(tsc_ref));
1866                kvm->arch.hv_tsc_page = data;
1867                if (!(data & HV_X64_MSR_TSC_REFERENCE_ENABLE))
1868                        break;
1869                gfn = data >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
1870                if (kvm_write_guest(kvm, data,
1871                        &tsc_ref, sizeof(tsc_ref)))
1872                        return 1;
1873                mark_page_dirty(kvm, gfn);
1874                break;
1875        }
1876        default:
1877                vcpu_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
1878                            "data 0x%llx\n", msr, data);
1879                return 1;
1880        }
1881        return 0;
1882}
1883
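/*
 * Illustration only: for HV_X64_MSR_HYPERCALL as handled above, a guest
 * enables the hypercall page by writing
 * (gfn << HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT) | HV_X64_MSR_HYPERCALL_ENABLE,
 * after which KVM patches a vmcall/vmmcall + ret sequence into that guest page.
 */
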
1884static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1885{
1886        switch (msr) {
1887        case HV_X64_MSR_APIC_ASSIST_PAGE: {
1888                u64 gfn;
1889                unsigned long addr;
1890
1891                if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) {
1892                        vcpu->arch.hv_vapic = data;
1893                        break;
1894                }
1895                gfn = data >> HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT;
1896                addr = gfn_to_hva(vcpu->kvm, gfn);
1897                if (kvm_is_error_hva(addr))
1898                        return 1;
1899                if (__clear_user((void __user *)addr, PAGE_SIZE))
1900                        return 1;
1901                vcpu->arch.hv_vapic = data;
1902                mark_page_dirty(vcpu->kvm, gfn);
1903                break;
1904        }
1905        case HV_X64_MSR_EOI:
1906                return kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data);
1907        case HV_X64_MSR_ICR:
1908                return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data);
1909        case HV_X64_MSR_TPR:
1910                return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data);
1911        default:
1912                vcpu_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
1913                            "data 0x%llx\n", msr, data);
1914                return 1;
1915        }
1916
1917        return 0;
1918}
1919
1920static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
1921{
1922        gpa_t gpa = data & ~0x3f;
1923
1924        /* Bits 2:5 are reserved and should be zero */
1925        if (data & 0x3c)
1926                return 1;
1927
1928        vcpu->arch.apf.msr_val = data;
1929
1930        if (!(data & KVM_ASYNC_PF_ENABLED)) {
1931                kvm_clear_async_pf_completion_queue(vcpu);
1932                kvm_async_pf_hash_reset(vcpu);
1933                return 0;
1934        }
1935
1936        if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa,
1937                                        sizeof(u32)))
1938                return 1;
1939
1940        vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
1941        kvm_async_pf_wakeup_all(vcpu);
1942        return 0;
1943}
1944
1945static void kvmclock_reset(struct kvm_vcpu *vcpu)
1946{
1947        vcpu->arch.pv_time_enabled = false;
1948}
1949
1950static void accumulate_steal_time(struct kvm_vcpu *vcpu)
1951{
1952        u64 delta;
1953
1954        if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
1955                return;
1956
1957        delta = current->sched_info.run_delay - vcpu->arch.st.last_steal;
1958        vcpu->arch.st.last_steal = current->sched_info.run_delay;
1959        vcpu->arch.st.accum_steal = delta;
1960}
1961
1962static void record_steal_time(struct kvm_vcpu *vcpu)
1963{
1964        if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
1965                return;
1966
1967        if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
1968                &vcpu->arch.st.steal, sizeof(struct kvm_steal_time))))
1969                return;
1970
1971        vcpu->arch.st.steal.steal += vcpu->arch.st.accum_steal;
1972        vcpu->arch.st.steal.version += 2;
1973        vcpu->arch.st.accum_steal = 0;
1974
1975        kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
1976                &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
1977}
1978
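/*
 * Illustration only: steal time is the delta in current->sched_info.run_delay
 * (time this vcpu thread spent runnable but not running) accumulated above and
 * published into the guest's kvm_steal_time area; its version field follows
 * the same even/odd convention as the pvclock structure.
 */
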
1979int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
1980{
1981        bool pr = false;
1982        u32 msr = msr_info->index;
1983        u64 data = msr_info->data;
1984
1985        switch (msr) {
1986        case MSR_AMD64_NB_CFG:
1987        case MSR_IA32_UCODE_REV:
1988        case MSR_IA32_UCODE_WRITE:
1989        case MSR_VM_HSAVE_PA:
1990        case MSR_AMD64_PATCH_LOADER:
1991        case MSR_AMD64_BU_CFG2:
1992                break;
1993
1994        case MSR_EFER:
1995                return set_efer(vcpu, data);
1996        case MSR_K7_HWCR:
1997                data &= ~(u64)0x40;     /* ignore flush filter disable */
1998                data &= ~(u64)0x100;    /* ignore ignne emulation enable */
1999                data &= ~(u64)0x8;      /* ignore TLB cache disable */
2000                if (data != 0) {
2001                        vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
2002                                    data);
2003                        return 1;
2004                }
2005                break;
2006        case MSR_FAM10H_MMIO_CONF_BASE:
2007                if (data != 0) {
2008                        vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
2009                                    "0x%llx\n", data);
2010                        return 1;
2011                }
2012                break;
2013        case MSR_IA32_DEBUGCTLMSR:
2014                if (!data) {
2015                        /* We support the non-activated case already */
2016                        break;
2017                } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
2018                        /* Values other than LBR and BTF are vendor-specific,
2019                           thus reserved and should throw a #GP */
2020                        return 1;
2021                }
2022                vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
2023                            __func__, data);
2024                break;
2025        case 0x200 ... 0x2ff:
2026                return set_msr_mtrr(vcpu, msr, data);
2027        case MSR_IA32_APICBASE:
2028                return kvm_set_apic_base(vcpu, msr_info);
2029        case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
2030                return kvm_x2apic_msr_write(vcpu, msr, data);
2031        case MSR_IA32_TSCDEADLINE:
2032                kvm_set_lapic_tscdeadline_msr(vcpu, data);
2033                break;
2034        case MSR_IA32_TSC_ADJUST:
2035                if (guest_cpuid_has_tsc_adjust(vcpu)) {
2036                        if (!msr_info->host_initiated) {
2037                                u64 adj = data - vcpu->arch.ia32_tsc_adjust_msr;
2038                                kvm_x86_ops->adjust_tsc_offset(vcpu, adj, true);
2039                        }
2040                        vcpu->arch.ia32_tsc_adjust_msr = data;
2041                }
2042                break;
2043        case MSR_IA32_MISC_ENABLE:
2044                vcpu->arch.ia32_misc_enable_msr = data;
2045                break;
2046        case MSR_KVM_WALL_CLOCK_NEW:
2047        case MSR_KVM_WALL_CLOCK:
2048                vcpu->kvm->arch.wall_clock = data;
2049                kvm_write_wall_clock(vcpu->kvm, data);
2050                break;
2051        case MSR_KVM_SYSTEM_TIME_NEW:
2052        case MSR_KVM_SYSTEM_TIME: {
2053                u64 gpa_offset;
2054                kvmclock_reset(vcpu);
2055
2056                vcpu->arch.time = data;
2057                kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
2058
2059                /* check whether the enable bit is set... */
2060                if (!(data & 1))
2061                        break;
2062
2063                gpa_offset = data & ~(PAGE_MASK | 1);
2064
2065                if (kvm_gfn_to_hva_cache_init(vcpu->kvm,
2066                     &vcpu->arch.pv_time, data & ~1ULL,
2067                     sizeof(struct pvclock_vcpu_time_info)))
2068                        vcpu->arch.pv_time_enabled = false;
2069                else
2070                        vcpu->arch.pv_time_enabled = true;
2071
2072                break;
2073        }
2074        case MSR_KVM_ASYNC_PF_EN:
2075                if (kvm_pv_enable_async_pf(vcpu, data))
2076                        return 1;
2077                break;
2078        case MSR_KVM_STEAL_TIME:
2079
2080                if (unlikely(!sched_info_on()))
2081                        return 1;
2082
2083                if (data & KVM_STEAL_RESERVED_MASK)
2084                        return 1;
2085
2086                if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime,
2087                                                data & KVM_STEAL_VALID_BITS,
2088                                                sizeof(struct kvm_steal_time)))
2089                        return 1;
2090
2091                vcpu->arch.st.msr_val = data;
2092
2093                if (!(data & KVM_MSR_ENABLED))
2094                        break;
2095
2096                vcpu->arch.st.last_steal = current->sched_info.run_delay;
2097
2098                preempt_disable();
2099                accumulate_steal_time(vcpu);
2100                preempt_enable();
2101
2102                kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
2103
2104                break;
2105        case MSR_KVM_PV_EOI_EN:
2106                if (kvm_lapic_enable_pv_eoi(vcpu, data))
2107                        return 1;
2108                break;
2109
2110        case MSR_IA32_MCG_CTL:
2111        case MSR_IA32_MCG_STATUS:
2112        case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
2113                return set_msr_mce(vcpu, msr, data);
2114
2115        /* Performance counters are not protected by a CPUID bit,
2116         * so we should check all of them in the generic path for the sake of
2117         * cross-vendor migration.
2118         * Writing a zero into the event select MSRs disables them,
2119         * which we perfectly emulate ;-). Any other value should at least be
2120         * reported; some guests depend on them.
2121         */
2122        case MSR_K7_EVNTSEL0:
2123        case MSR_K7_EVNTSEL1:
2124        case MSR_K7_EVNTSEL2:
2125        case MSR_K7_EVNTSEL3:
2126                if (data != 0)
2127                        vcpu_unimpl(vcpu, "unimplemented perfctr wrmsr: "
2128                                    "0x%x data 0x%llx\n", msr, data);
2129                break;
2130        /* at least RHEL 4 unconditionally writes to the perfctr registers,
2131         * so we ignore writes to make it happy.
2132         */
2133        case MSR_K7_PERFCTR0:
2134        case MSR_K7_PERFCTR1:
2135        case MSR_K7_PERFCTR2:
2136        case MSR_K7_PERFCTR3:
2137                vcpu_unimpl(vcpu, "unimplemented perfctr wrmsr: "
2138                            "0x%x data 0x%llx\n", msr, data);
2139                break;
2140        case MSR_P6_PERFCTR0:
2141        case MSR_P6_PERFCTR1:
2142                pr = true;
2143        case MSR_P6_EVNTSEL0:
2144        case MSR_P6_EVNTSEL1:
2145                if (kvm_pmu_msr(vcpu, msr))
2146                        return kvm_pmu_set_msr(vcpu, msr_info);
2147
2148                if (pr || data != 0)
2149                        vcpu_unimpl(vcpu, "disabled perfctr wrmsr: "
2150                                    "0x%x data 0x%llx\n", msr, data);
2151                break;
2152        case MSR_K7_CLK_CTL:
2153                /*
2154                 * Ignore all writes to this no-longer-documented MSR.
2155                 * The writes are only relevant for old K7 processors,
2156                 * all pre-dating SVM, where they are a workaround
2157                 * recommended by AMD. Since the affected processor models
2158                 * can be specified on the command line, a guest may still
2159                 * attempt the workaround here, hence the need to ignore it.
2160                 */
2161                break;
2162        case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
2163                if (kvm_hv_msr_partition_wide(msr)) {
2164                        int r;
2165                        mutex_lock(&vcpu->kvm->lock);
2166                        r = set_msr_hyperv_pw(vcpu, msr, data);
2167                        mutex_unlock(&vcpu->kvm->lock);
2168                        return r;
2169                } else
2170                        return set_msr_hyperv(vcpu, msr, data);
2171                break;
2172        case MSR_IA32_BBL_CR_CTL3:
2173                /* Drop writes to this legacy MSR -- see rdmsr
2174                 * counterpart for further detail.
2175                 */
2176                vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data);
2177                break;
2178        case MSR_AMD64_OSVW_ID_LENGTH:
2179                if (!guest_cpuid_has_osvw(vcpu))
2180                        return 1;
2181                vcpu->arch.osvw.length = data;
2182                break;
2183        case MSR_AMD64_OSVW_STATUS:
2184                if (!guest_cpuid_has_osvw(vcpu))
2185                        return 1;
2186                vcpu->arch.osvw.status = data;
2187                break;
2188        default:
2189                if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
2190                        return xen_hvm_config(vcpu, data);
2191                if (kvm_pmu_msr(vcpu, msr))
2192                        return kvm_pmu_set_msr(vcpu, msr_info);
2193                if (!ignore_msrs) {
2194                        vcpu_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
2195                                    msr, data);
2196                        return 1;
2197                } else {
2198                        vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n",
2199                                    msr, data);
2200                        break;
2201                }
2202        }
2203        return 0;
2204}
2205EXPORT_SYMBOL_GPL(kvm_set_msr_common);
2206
2207
2208/*
2209 * Reads an msr value (of 'msr_index') into 'pdata'.
2210 * Returns 0 on success, non-0 otherwise.
2211 * Assumes vcpu_load() was already called.
2212 */
2213int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
2214{
2215        return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
2216}
2217
2218static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
2219{
2220        u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
2221
2222        if (!msr_mtrr_valid(msr))
2223                return 1;
2224
2225        if (msr == MSR_MTRRdefType)
2226                *pdata = vcpu->arch.mtrr_state.def_type +
2227                         (vcpu->arch.mtrr_state.enabled << 10);
2228        else if (msr == MSR_MTRRfix64K_00000)
2229                *pdata = p[0];
2230        else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
2231                *pdata = p[1 + msr - MSR_MTRRfix16K_80000];
2232        else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
2233                *pdata = p[3 + msr - MSR_MTRRfix4K_C0000];
2234        else if (msr == MSR_IA32_CR_PAT)
2235                *pdata = vcpu->arch.pat;
2236        else {  /* Variable MTRRs */
2237                int idx, is_mtrr_mask;
2238                u64 *pt;
2239
2240                idx = (msr - 0x200) / 2;
2241                is_mtrr_mask = msr - 0x200 - 2 * idx;
2242                if (!is_mtrr_mask)
2243                        pt =
2244                          (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
2245                else
2246                        pt =
2247                          (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
2248                *pdata = *pt;
2249        }
2250
2251        return 0;
2252}
2253
2254static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
2255{
2256        u64 data;
2257        u64 mcg_cap = vcpu->arch.mcg_cap;
2258        unsigned bank_num = mcg_cap & 0xff;
2259
2260        switch (msr) {
2261        case MSR_IA32_P5_MC_ADDR:
2262        case MSR_IA32_P5_MC_TYPE:
2263                data = 0;
2264                break;
2265        case MSR_IA32_MCG_CAP:
2266                data = vcpu->arch.mcg_cap;
2267                break;
2268        case MSR_IA32_MCG_CTL:
2269                if (!(mcg_cap & MCG_CTL_P))
2270                        return 1;
2271                data = vcpu->arch.mcg_ctl;
2272                break;
2273        case MSR_IA32_MCG_STATUS:
2274                data = vcpu->arch.mcg_status;
2275                break;
2276        default:
2277                if (msr >= MSR_IA32_MC0_CTL &&
2278                    msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
2279                        u32 offset = msr - MSR_IA32_MC0_CTL;
2280                        data = vcpu->arch.mce_banks[offset];
2281                        break;
2282                }
2283                return 1;
2284        }
2285        *pdata = data;
2286        return 0;
2287}
2288
2289static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
2290{
2291        u64 data = 0;
2292        struct kvm *kvm = vcpu->kvm;
2293
2294        switch (msr) {
2295        case HV_X64_MSR_GUEST_OS_ID:
2296                data = kvm->arch.hv_guest_os_id;
2297                break;
2298        case HV_X64_MSR_HYPERCALL:
2299                data = kvm->arch.hv_hypercall;
2300                break;
2301        case HV_X64_MSR_TIME_REF_COUNT: {
2302                data =
2303                     div_u64(get_kernel_ns() + kvm->arch.kvmclock_offset, 100);
2304                break;
2305        }
2306        case HV_X64_MSR_REFERENCE_TSC:
2307                data = kvm->arch.hv_tsc_page;
2308                break;
2309        default:
2310                vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
2311                return 1;
2312        }
2313
2314        *pdata = data;
2315        return 0;
2316}
2317
2318static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
2319{
2320        u64 data = 0;
2321
2322        switch (msr) {
2323        case HV_X64_MSR_VP_INDEX: {
2324                int r;
2325                struct kvm_vcpu *v;
2326                kvm_for_each_vcpu(r, v, vcpu->kvm)
2327                        if (v == vcpu)
2328                                data = r;
2329                break;
2330        }
2331        case HV_X64_MSR_EOI:
2332                return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata);
2333        case HV_X64_MSR_ICR:
2334                return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata);
2335        case HV_X64_MSR_TPR:
2336                return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata);
2337        case HV_X64_MSR_APIC_ASSIST_PAGE:
2338                data = vcpu->arch.hv_vapic;
2339                break;
2340        default:
2341                vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
2342                return 1;
2343        }
2344        *pdata = data;
2345        return 0;
2346}
2347
2348int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
2349{
2350        u64 data;
2351
2352        switch (msr) {
2353        case MSR_IA32_PLATFORM_ID:
2354        case MSR_IA32_EBL_CR_POWERON:
2355        case MSR_IA32_DEBUGCTLMSR:
2356        case MSR_IA32_LASTBRANCHFROMIP:
2357        case MSR_IA32_LASTBRANCHTOIP:
2358        case MSR_IA32_LASTINTFROMIP:
2359        case MSR_IA32_LASTINTTOIP:
2360        case MSR_K8_SYSCFG:
2361        case MSR_K7_HWCR:
2362        case MSR_VM_HSAVE_PA:
2363        case MSR_K7_EVNTSEL0:
2364        case MSR_K7_PERFCTR0:
2365        case MSR_K8_INT_PENDING_MSG:
2366        case MSR_AMD64_NB_CFG:
2367        case MSR_FAM10H_MMIO_CONF_BASE:
2368        case MSR_AMD64_BU_CFG2:
2369                data = 0;
2370                break;
2371        case MSR_P6_PERFCTR0:
2372        case MSR_P6_PERFCTR1:
2373        case MSR_P6_EVNTSEL0:
2374        case MSR_P6_EVNTSEL1:
2375                if (kvm_pmu_msr(vcpu, msr))
2376                        return kvm_pmu_get_msr(vcpu, msr, pdata);
2377                data = 0;
2378                break;
2379        case MSR_IA32_UCODE_REV:
2380                data = 0x100000000ULL;
2381                break;
2382        case MSR_MTRRcap:
2383                data = 0x500 | KVM_NR_VAR_MTRR;
2384                break;
2385        case 0x200 ... 0x2ff:
2386                return get_msr_mtrr(vcpu, msr, pdata);
2387        case 0xcd: /* fsb frequency */
2388                data = 3;
2389                break;
2390                /*
2391                 * MSR_EBC_FREQUENCY_ID
2392                 * Conservative value, valid even for the basic CPU models.
2393                 * Models 0,1: 000 in bits 23:21 indicating a bus speed of
2394                 * 100MHz; model 2: 000 in bits 18:16 indicating 100MHz,
2395                 * or 266MHz for models 3 and 4. Set the Core Clock
2396                 * Frequency to System Bus Frequency Ratio to 1 (bits
2397                 * 31:24) even though these bits are only valid for CPU
2398                 * models > 2; otherwise guests may end up dividing or
2399                 * multiplying by zero.
2400                 */
2401        case MSR_EBC_FREQUENCY_ID:
2402                data = 1 << 24;
2403                break;
2404        case MSR_IA32_APICBASE:
2405                data = kvm_get_apic_base(vcpu);
2406                break;
2407        case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
2408                return kvm_x2apic_msr_read(vcpu, msr, pdata);
2409                break;
2410        case MSR_IA32_TSCDEADLINE:
2411                data = kvm_get_lapic_tscdeadline_msr(vcpu);
2412                break;
2413        case MSR_IA32_TSC_ADJUST:
2414                data = (u64)vcpu->arch.ia32_tsc_adjust_msr;
2415                break;
2416        case MSR_IA32_MISC_ENABLE:
2417                data = vcpu->arch.ia32_misc_enable_msr;
2418                break;
2419        case MSR_IA32_PERF_STATUS:
2420                /* TSC increment by tick */
2421                data = 1000ULL;
2422                /* CPU multiplier */
2423                data |= (((uint64_t)4ULL) << 40);
2424                break;
2425        case MSR_EFER:
2426                data = vcpu->arch.efer;
2427                break;
2428        case MSR_KVM_WALL_CLOCK:
2429        case MSR_KVM_WALL_CLOCK_NEW:
2430                data = vcpu->kvm->arch.wall_clock;
2431                break;
2432        case MSR_KVM_SYSTEM_TIME:
2433        case MSR_KVM_SYSTEM_TIME_NEW:
2434                data = vcpu->arch.time;
2435                break;
2436        case MSR_KVM_ASYNC_PF_EN:
2437                data = vcpu->arch.apf.msr_val;
2438                break;
2439        case MSR_KVM_STEAL_TIME:
2440                data = vcpu->arch.st.msr_val;
2441                break;
2442        case MSR_KVM_PV_EOI_EN:
2443                data = vcpu->arch.pv_eoi.msr_val;
2444                break;
2445        case MSR_IA32_P5_MC_ADDR:
2446        case MSR_IA32_P5_MC_TYPE:
2447        case MSR_IA32_MCG_CAP:
2448        case MSR_IA32_MCG_CTL:
2449        case MSR_IA32_MCG_STATUS:
2450        case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
2451                return get_msr_mce(vcpu, msr, pdata);
2452        case MSR_K7_CLK_CTL:
2453                /*
2454                 * Provide the expected ramp-up count for K7. All other
2455                 * fields are set to zero, indicating minimum divisors for
2456                 * every field.
2457                 *
2458                 * This prevents guest kernels on an AMD host with CPU
2459                 * family 6, model 8 and higher from exploding due to
2460                 * the rdmsr failing.
2461                 */
2462                data = 0x20000000;
2463                break;
2464        case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
2465                if (kvm_hv_msr_partition_wide(msr)) {
2466                        int r;
2467                        mutex_lock(&vcpu->kvm->lock);
2468                        r = get_msr_hyperv_pw(vcpu, msr, pdata);
2469                        mutex_unlock(&vcpu->kvm->lock);
2470                        return r;
2471                } else
2472                        return get_msr_hyperv(vcpu, msr, pdata);
2473                break;
2474        case MSR_IA32_BBL_CR_CTL3:
2475                /* This legacy MSR exists but isn't fully documented in current
2476                 * silicon.  It is, however, accessed by Windows XP in very narrow
2477                 * scenarios where it sets bit #19, itself documented as
2478                 * a "reserved" bit.  Make a best-effort attempt to return coherent
2479                 * read data here in case the rest of the register is
2480                 * interpreted by the guest:
2481                 *
2482                 * L2 cache control register 3: 64GB range, 256KB size,
2483                 * enabled, latency 0x1, configured
2484                 */
2485                data = 0xbe702111;
2486                break;
2487        case MSR_AMD64_OSVW_ID_LENGTH:
2488                if (!guest_cpuid_has_osvw(vcpu))
2489                        return 1;
2490                data = vcpu->arch.osvw.length;
2491                break;
2492        case MSR_AMD64_OSVW_STATUS:
2493                if (!guest_cpuid_has_osvw(vcpu))
2494                        return 1;
2495                data = vcpu->arch.osvw.status;
2496                break;
2497        default:
2498                if (kvm_pmu_msr(vcpu, msr))
2499                        return kvm_pmu_get_msr(vcpu, msr, pdata);
2500                if (!ignore_msrs) {
2501                        vcpu_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
2502                        return 1;
2503                } else {
2504                        vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr);
2505                        data = 0;
2506                }
2507                break;
2508        }
2509        *pdata = data;
2510        return 0;
2511}
2512EXPORT_SYMBOL_GPL(kvm_get_msr_common);
2513
2514/*
2515 * Read or write a bunch of msrs. All parameters are kernel addresses.
2516 *
2517 * @return number of msrs set successfully.
2518 */
2519static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
2520                    struct kvm_msr_entry *entries,
2521                    int (*do_msr)(struct kvm_vcpu *vcpu,
2522                                  unsigned index, u64 *data))
2523{
2524        int i, idx;
2525
2526        idx = srcu_read_lock(&vcpu->kvm->srcu);
2527        for (i = 0; i < msrs->nmsrs; ++i)
2528                if (do_msr(vcpu, entries[i].index, &entries[i].data))
2529                        break;
2530        srcu_read_unlock(&vcpu->kvm->srcu, idx);
2531
2532        return i;
2533}
2534
2535/*
2536 * Read or write a bunch of msrs. Parameters are user addresses.
2537 *
2538 * @return number of msrs set successfully.
2539 */
2540static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
2541                  int (*do_msr)(struct kvm_vcpu *vcpu,
2542                                unsigned index, u64 *data),
2543                  int writeback)
2544{
2545        struct kvm_msrs msrs;
2546        struct kvm_msr_entry *entries;
2547        int r, n;
2548        unsigned size;
2549
2550        r = -EFAULT;
2551        if (copy_from_user(&msrs, user_msrs, sizeof msrs))
2552                goto out;
2553
2554        r = -E2BIG;
2555        if (msrs.nmsrs >= MAX_IO_MSRS)
2556                goto out;
2557
2558        size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
2559        entries = memdup_user(user_msrs->entries, size);
2560        if (IS_ERR(entries)) {
2561                r = PTR_ERR(entries);
2562                goto out;
2563        }
2564
2565        r = n = __msr_io(vcpu, &msrs, entries, do_msr);
2566        if (r < 0)
2567                goto out_free;
2568
2569        r = -EFAULT;
2570        if (writeback && copy_to_user(user_msrs->entries, entries, size))
2571                goto out_free;
2572
2573        r = n;
2574
2575out_free:
2576        kfree(entries);
2577out:
2578        return r;
2579}
2580
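/*
 * Userspace sketch (illustration only): how the msr_io() path above is
 * typically driven through the KVM_GET_MSRS vcpu ioctl.  Error handling is
 * elided and "vcpu_fd" is an assumed, already-created vcpu file descriptor.
 */
#if 0
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static __u64 example_read_one_msr(int vcpu_fd, __u32 index)
{
        struct {
                struct kvm_msrs hdr;
                struct kvm_msr_entry entries[1];
        } req;

        memset(&req, 0, sizeof(req));
        req.hdr.nmsrs = 1;
        req.entries[0].index = index;

        /* the ioctl returns the number of MSRs processed, 1 on success */
        ioctl(vcpu_fd, KVM_GET_MSRS, &req);

        return req.entries[0].data;
}
#endif
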
2581int kvm_dev_ioctl_check_extension(long ext)
2582{
2583        int r;
2584
2585        switch (ext) {
2586        case KVM_CAP_IRQCHIP:
2587        case KVM_CAP_HLT:
2588        case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
2589        case KVM_CAP_SET_TSS_ADDR:
2590        case KVM_CAP_EXT_CPUID:
2591        case KVM_CAP_EXT_EMUL_CPUID:
2592        case KVM_CAP_CLOCKSOURCE:
2593        case KVM_CAP_PIT:
2594        case KVM_CAP_NOP_IO_DELAY:
2595        case KVM_CAP_MP_STATE:
2596        case KVM_CAP_SYNC_MMU:
2597        case KVM_CAP_USER_NMI:
2598        case KVM_CAP_REINJECT_CONTROL:
2599        case KVM_CAP_IRQ_INJECT_STATUS:
2600        case KVM_CAP_IRQFD:
2601        case KVM_CAP_IOEVENTFD:
2602        case KVM_CAP_PIT2:
2603        case KVM_CAP_PIT_STATE2:
2604        case KVM_CAP_SET_IDENTITY_MAP_ADDR:
2605        case KVM_CAP_XEN_HVM:
2606        case KVM_CAP_ADJUST_CLOCK:
2607        case KVM_CAP_VCPU_EVENTS:
2608        case KVM_CAP_HYPERV:
2609        case KVM_CAP_HYPERV_VAPIC:
2610        case KVM_CAP_HYPERV_SPIN:
2611        case KVM_CAP_PCI_SEGMENT:
2612        case KVM_CAP_DEBUGREGS:
2613        case KVM_CAP_X86_ROBUST_SINGLESTEP:
2614        case KVM_CAP_XSAVE:
2615        case KVM_CAP_ASYNC_PF:
2616        case KVM_CAP_GET_TSC_KHZ:
2617        case KVM_CAP_KVMCLOCK_CTRL:
2618        case KVM_CAP_READONLY_MEM:
2619        case KVM_CAP_HYPERV_TIME:
2620#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
2621        case KVM_CAP_ASSIGN_DEV_IRQ:
2622        case KVM_CAP_PCI_2_3:
2623#endif
2624                r = 1;
2625                break;
2626        case KVM_CAP_COALESCED_MMIO:
2627                r = KVM_COALESCED_MMIO_PAGE_OFFSET;
2628                break;
2629        case KVM_CAP_VAPIC:
2630                r = !kvm_x86_ops->cpu_has_accelerated_tpr();
2631                break;
2632        case KVM_CAP_NR_VCPUS:
2633                r = KVM_SOFT_MAX_VCPUS;
2634                break;
2635        case KVM_CAP_MAX_VCPUS:
2636                r = KVM_MAX_VCPUS;
2637                break;
2638        case KVM_CAP_NR_MEMSLOTS:
2639                r = KVM_USER_MEM_SLOTS;
2640                break;
2641        case KVM_CAP_PV_MMU:    /* obsolete */
2642                r = 0;
2643                break;
2644#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
2645        case KVM_CAP_IOMMU:
2646                r = iommu_present(&pci_bus_type);
2647                break;
2648#endif
2649        case KVM_CAP_MCE:
2650                r = KVM_MAX_MCE_BANKS;
2651                break;
2652        case KVM_CAP_XCRS:
2653                r = cpu_has_xsave;
2654                break;
2655        case KVM_CAP_TSC_CONTROL:
2656                r = kvm_has_tsc_control;
2657                break;
2658        case KVM_CAP_TSC_DEADLINE_TIMER:
2659                r = boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER);
2660                break;
2661        default:
2662                r = 0;
2663                break;
2664        }
2665        return r;
2666
2667}
2668
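/*
 * Userspace sketch (illustration only): the capabilities reported above are
 * queried with KVM_CHECK_EXTENSION; "kvm_fd" is an assumed open /dev/kvm
 * descriptor.
 */
#if 0
int has_tsc_control = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_TSC_CONTROL);
int max_vcpus       = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_MAX_VCPUS);
#endif
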
2669long kvm_arch_dev_ioctl(struct file *filp,
2670                        unsigned int ioctl, unsigned long arg)
2671{
2672        void __user *argp = (void __user *)arg;
2673        long r;
2674
2675        switch (ioctl) {
2676        case KVM_GET_MSR_INDEX_LIST: {
2677                struct kvm_msr_list __user *user_msr_list = argp;
2678                struct kvm_msr_list msr_list;
2679                unsigned n;
2680
2681                r = -EFAULT;
2682                if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
2683                        goto out;
2684                n = msr_list.nmsrs;
2685                msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
2686                if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
2687                        goto out;
2688                r = -E2BIG;
2689                if (n < msr_list.nmsrs)
2690                        goto out;
2691                r = -EFAULT;
2692                if (copy_to_user(user_msr_list->indices, &msrs_to_save,
2693                                 num_msrs_to_save * sizeof(u32)))
2694                        goto out;
2695                if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
2696                                 &emulated_msrs,
2697                                 ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
2698                        goto out;
2699                r = 0;
2700                break;
2701        }
2702        case KVM_GET_SUPPORTED_CPUID:
2703        case KVM_GET_EMULATED_CPUID: {
2704                struct kvm_cpuid2 __user *cpuid_arg = argp;
2705                struct kvm_cpuid2 cpuid;
2706
2707                r = -EFAULT;
2708                if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
2709                        goto out;
2710
2711                r = kvm_dev_ioctl_get_cpuid(&cpuid, cpuid_arg->entries,
2712                                            ioctl);
2713                if (r)
2714                        goto out;
2715
2716                r = -EFAULT;
2717                if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
2718                        goto out;
2719                r = 0;
2720                break;
2721        }
2722        case KVM_X86_GET_MCE_CAP_SUPPORTED: {
2723                u64 mce_cap;
2724
2725                mce_cap = KVM_MCE_CAP_SUPPORTED;
2726                r = -EFAULT;
2727                if (copy_to_user(argp, &mce_cap, sizeof mce_cap))
2728                        goto out;
2729                r = 0;
2730                break;
2731        }
2732        default:
2733                r = -EINVAL;
2734        }
2735out:
2736        return r;
2737}
2738
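/*
 * Userspace sketch (illustration only): KVM_GET_MSR_INDEX_LIST as handled
 * above is usually a two-step call: probe with nmsrs = 0 (the kernel fills in
 * the required count and fails with E2BIG), then allocate and call again.
 * "kvm_fd" is an assumed open /dev/kvm descriptor; error handling is elided.
 */
#if 0
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static struct kvm_msr_list *example_get_msr_index_list(int kvm_fd)
{
        struct kvm_msr_list probe = { .nmsrs = 0 };
        struct kvm_msr_list *list;

        ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, &probe);
        list = malloc(sizeof(*list) + probe.nmsrs * sizeof(__u32));
        list->nmsrs = probe.nmsrs;
        ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list);
        return list;
}
#endif
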
2739static void wbinvd_ipi(void *garbage)
2740{
2741        wbinvd();
2742}
2743
2744static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
2745{
2746        return kvm_arch_has_noncoherent_dma(vcpu->kvm);
2747}
2748
2749void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
2750{
2751        /* Handle the case where WBINVD may be executed by the guest */
2752        if (need_emulate_wbinvd(vcpu)) {
2753                if (kvm_x86_ops->has_wbinvd_exit())
2754                        cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
2755                else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
2756                        smp_call_function_single(vcpu->cpu,
2757                                        wbinvd_ipi, NULL, 1);
2758        }
2759
2760        kvm_x86_ops->vcpu_load(vcpu, cpu);
2761
2762        /* Apply any externally detected TSC adjustments (due to suspend) */
2763        if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
2764                adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
2765                vcpu->arch.tsc_offset_adjustment = 0;
2766                set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
2767        }
2768
2769        if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
2770                s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
2771                                native_read_tsc() - vcpu->arch.last_host_tsc;
2772                if (tsc_delta < 0)
2773                        mark_tsc_unstable("KVM discovered backwards TSC");
2774                if (check_tsc_unstable()) {
2775                        u64 offset = kvm_x86_ops->compute_tsc_offset(vcpu,
2776                                                vcpu->arch.last_guest_tsc);
2777                        kvm_x86_ops->write_tsc_offset(vcpu, offset);
2778                        vcpu->arch.tsc_catchup = 1;
2779                }
2780                /*
2781                 * On a host with synchronized TSC, there is no need to update
2782                 * kvmclock on vcpu->cpu migration
2783                 */
2784                if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1)
2785                        kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
2786                if (vcpu->cpu != cpu)
2787                        kvm_migrate_timers(vcpu);
2788                vcpu->cpu = cpu;
2789        }
2790
2791        accumulate_steal_time(vcpu);
2792        kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
2793}
2794
2795void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
2796{
2797        kvm_x86_ops->vcpu_put(vcpu);
2798        kvm_put_guest_fpu(vcpu);
2799        vcpu->arch.last_host_tsc = native_read_tsc();
2800}
2801
2802static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
2803                                    struct kvm_lapic_state *s)
2804{
2805        kvm_x86_ops->sync_pir_to_irr(vcpu);
2806        memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
2807
2808        return 0;
2809}
2810
2811static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
2812                                    struct kvm_lapic_state *s)
2813{
2814        kvm_apic_post_state_restore(vcpu, s);
2815        update_cr8_intercept(vcpu);
2816
2817        return 0;
2818}
2819
2820static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
2821                                    struct kvm_interrupt *irq)
2822{
2823        if (irq->irq >= KVM_NR_INTERRUPTS)
2824                return -EINVAL;
2825        if (irqchip_in_kernel(vcpu->kvm))
2826                return -ENXIO;
2827
2828        kvm_queue_interrupt(vcpu, irq->irq, false);
2829        kvm_make_request(KVM_REQ_EVENT, vcpu);
2830
2831        return 0;
2832}
2833
2834static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
2835{
2836        kvm_inject_nmi(vcpu);
2837
2838        return 0;
2839}
2840
2841static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
2842                                           struct kvm_tpr_access_ctl *tac)
2843{
2844        if (tac->flags)
2845                return -EINVAL;
2846        vcpu->arch.tpr_access_reporting = !!tac->enabled;
2847        return 0;
2848}
2849
2850static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
2851                                        u64 mcg_cap)
2852{
2853        int r;
2854        unsigned bank_num = mcg_cap & 0xff, bank;
2855
2856        r = -EINVAL;
2857        if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
2858                goto out;
2859        if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000))
2860                goto out;
2861        r = 0;
2862        vcpu->arch.mcg_cap = mcg_cap;
2863        /* Init IA32_MCG_CTL to all 1s */
2864        if (mcg_cap & MCG_CTL_P)
2865                vcpu->arch.mcg_ctl = ~(u64)0;
2866        /* Init IA32_MCi_CTL to all 1s */
2867        for (bank = 0; bank < bank_num; bank++)
2868                vcpu->arch.mce_banks[bank*4] = ~(u64)0;
2869out:
2870        return r;
2871}
2872
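/*
 * Userspace sketch (illustration only): the ioctl served by the function above
 * takes a u64 mcg_cap whose low byte is the requested bank count, e.g. ten
 * banks plus MCG_CTL support (bit 8).  "vcpu_fd" is an assumed vcpu descriptor.
 */
#if 0
__u64 mcg_cap = (1ULL << 8) /* MCG_CTL_P */ | 10;
ioctl(vcpu_fd, KVM_X86_SETUP_MCE, &mcg_cap);
#endif
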
2873static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
2874                                      struct kvm_x86_mce *mce)
2875{
2876        u64 mcg_cap = vcpu->arch.mcg_cap;
2877        unsigned bank_num = mcg_cap & 0xff;
2878        u64 *banks = vcpu->arch.mce_banks;
2879
2880        if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
2881                return -EINVAL;
2882        /*
2883         * if IA32_MCG_CTL is not all 1s, the uncorrected error
2884         * reporting is disabled
2885         */
2886        if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
2887            vcpu->arch.mcg_ctl != ~(u64)0)
2888                return 0;
2889        banks += 4 * mce->bank;
2890        /*
2891         * if IA32_MCi_CTL is not all 1s, the uncorrected error
2892         * reporting is disabled for the bank
2893         */
2894        if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0)
2895                return 0;
2896        if (mce->status & MCI_STATUS_UC) {
2897                if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
2898                    !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) {
2899                        kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2900                        return 0;
2901                }
2902                if (banks[1] & MCI_STATUS_VAL)
2903                        mce->status |= MCI_STATUS_OVER;
2904                banks[2] = mce->addr;
2905                banks[3] = mce->misc;
2906                vcpu->arch.mcg_status = mce->mcg_status;
2907                banks[1] = mce->status;
2908                kvm_queue_exception(vcpu, MC_VECTOR);
2909        } else if (!(banks[1] & MCI_STATUS_VAL)
2910                   || !(banks[1] & MCI_STATUS_UC)) {
2911                if (banks[1] & MCI_STATUS_VAL)
2912                        mce->status |= MCI_STATUS_OVER;
2913                banks[2] = mce->addr;
2914                banks[3] = mce->misc;
2915                banks[1] = mce->status;
2916        } else
2917                banks[1] |= MCI_STATUS_OVER;
2918        return 0;
2919}
2920
2921static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
2922                                               struct kvm_vcpu_events *events)
2923{
2924        process_nmi(vcpu);
2925        events->exception.injected =
2926                vcpu->arch.exception.pending &&
2927                !kvm_exception_is_soft(vcpu->arch.exception.nr);
2928        events->exception.nr = vcpu->arch.exception.nr;
2929        events->exception.has_error_code = vcpu->arch.exception.has_error_code;
2930        events->exception.pad = 0;
2931        events->exception.error_code = vcpu->arch.exception.error_code;
2932
2933        events->interrupt.injected =
2934                vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft;
2935        events->interrupt.nr = vcpu->arch.interrupt.nr;
2936        events->interrupt.soft = 0;
2937        events->interrupt.shadow =
2938                kvm_x86_ops->get_interrupt_shadow(vcpu,
2939                        KVM_X86_SHADOW_INT_MOV_SS | KVM_X86_SHADOW_INT_STI);
2940
2941        events->nmi.injected = vcpu->arch.nmi_injected;
2942        events->nmi.pending = vcpu->arch.nmi_pending != 0;
2943        events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu);
2944        events->nmi.pad = 0;
2945
2946        events->sipi_vector = 0; /* never valid when reporting to user space */
2947
2948        events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
2949                         | KVM_VCPUEVENT_VALID_SHADOW);
2950        memset(&events->reserved, 0, sizeof(events->reserved));
2951}
2952
2953static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
2954                                              struct kvm_vcpu_events *events)
2955{
2956        if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
2957                              | KVM_VCPUEVENT_VALID_SIPI_VECTOR
2958                              | KVM_VCPUEVENT_VALID_SHADOW))
2959                return -EINVAL;
2960
2961        process_nmi(vcpu);
2962        vcpu->arch.exception.pending = events->exception.injected;
2963        vcpu->arch.exception.nr = events->exception.nr;
2964        vcpu->arch.exception.has_error_code = events->exception.has_error_code;
2965        vcpu->arch.exception.error_code = events->exception.error_code;
2966
2967        vcpu->arch.interrupt.pending = events->interrupt.injected;
2968        vcpu->arch.interrupt.nr = events->interrupt.nr;
2969        vcpu->arch.interrupt.soft = events->interrupt.soft;
2970        if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
2971                kvm_x86_ops->set_interrupt_shadow(vcpu,
2972                                                  events->interrupt.shadow);
2973
2974        vcpu->arch.nmi_injected = events->nmi.injected;
2975        if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
2976                vcpu->arch.nmi_pending = events->nmi.pending;
2977        kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked);
2978
2979        if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
2980            kvm_vcpu_has_lapic(vcpu))
2981                vcpu->arch.apic->sipi_vector = events->sipi_vector;
2982
2983        kvm_make_request(KVM_REQ_EVENT, vcpu);
2984
2985        return 0;
2986}
2987
2988static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
2989                                             struct kvm_debugregs *dbgregs)
2990{
2991        unsigned long val;
2992
2993        memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
2994        _kvm_get_dr(vcpu, 6, &val);
2995        dbgregs->dr6 = val;
2996        dbgregs->dr7 = vcpu->arch.dr7;
2997        dbgregs->flags = 0;
2998        memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved));
2999}
3000
3001static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
3002                                            struct kvm_debugregs *dbgregs)
3003{
3004        if (dbgregs->flags)
3005                return -EINVAL;
3006
3007        memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
3008        vcpu->arch.dr6 = dbgregs->dr6;
3009        kvm_update_dr6(vcpu);
3010        vcpu->arch.dr7 = dbgregs->dr7;
3011        kvm_update_dr7(vcpu);
3012
3013        return 0;
3014}
3015
3016static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
3017                                         struct kvm_xsave *guest_xsave)
3018{
3019        if (cpu_has_xsave) {
3020                memcpy(guest_xsave->region,
3021                        &vcpu->arch.guest_fpu.state->xsave,
3022                        vcpu->arch.guest_xstate_size);
3023                *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] &=
3024                        vcpu->arch.guest_supported_xcr0 | XSTATE_FPSSE;
3025        } else {
3026                memcpy(guest_xsave->region,
3027                        &vcpu->arch.guest_fpu.state->fxsave,
3028                        sizeof(struct i387_fxsave_struct));
3029                *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] =
3030                        XSTATE_FPSSE;
3031        }
3032}
3033
3034static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
3035                                        struct kvm_xsave *guest_xsave)
3036{
3037        u64 xstate_bv =
3038                *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)];
3039
3040        if (cpu_has_xsave) {
3041                /*
3042                 * Here we allow setting states that are not present in
3043                 * CPUID leaf 0xD, index 0, EDX:EAX.  This is for compatibility
3044                 * with old userspace.
3045                 */
3046                if (xstate_bv & ~KVM_SUPPORTED_XCR0)
3047                        return -EINVAL;
3048                if (xstate_bv & ~host_xcr0)
3049                        return -EINVAL;
3050                memcpy(&vcpu->arch.guest_fpu.state->xsave,
3051                        guest_xsave->region, vcpu->arch.guest_xstate_size);
3052        } else {
3053                if (xstate_bv & ~XSTATE_FPSSE)
3054                        return -EINVAL;
3055                memcpy(&vcpu->arch.guest_fpu.state->fxsave,
3056                        guest_xsave->region, sizeof(struct i387_fxsave_struct));
3057        }
3058        return 0;
3059}
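
/*
 * Illustrative userspace sketch (not part of this file): the
 * KVM_GET_XSAVE/KVM_SET_XSAVE round trip handled by the two helpers above,
 * e.g. for save/restore; "vcpu_fd" is a hypothetical KVM_CREATE_VCPU
 * descriptor.
 *
 *	struct kvm_xsave xsave;			// 4 KiB region incl. the xstate header
 *
 *	ioctl(vcpu_fd, KVM_GET_XSAVE, &xsave);
 *	... store or transfer xsave.region ...
 *	ioctl(vcpu_fd, KVM_SET_XSAVE, &xsave);	// -EINVAL if xstate_bv has unsupported bits
 */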
3060
3061static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
3062                                        struct kvm_xcrs *guest_xcrs)
3063{
3064        if (!cpu_has_xsave) {
3065                guest_xcrs->nr_xcrs = 0;
3066                return;
3067        }
3068
3069        guest_xcrs->nr_xcrs = 1;
3070        guest_xcrs->flags = 0;
3071        guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK;
3072        guest_xcrs->xcrs[0].value = vcpu->arch.xcr0;
3073}
3074
3075static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
3076                                       struct kvm_xcrs *guest_xcrs)
3077{
3078        int i, r = 0;
3079
3080        if (!cpu_has_xsave)
3081                return -EINVAL;
3082
3083        if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags)
3084                return -EINVAL;
3085
3086        for (i = 0; i < guest_xcrs->nr_xcrs; i++)
3087                /* Only support XCR0 currently */
3088                if (guest_xcrs->xcrs[i].xcr == XCR_XFEATURE_ENABLED_MASK) {
3089                        r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK,
3090                                guest_xcrs->xcrs[i].value);
3091                        break;
3092                }
3093        if (r)
3094                r = -EINVAL;
3095        return r;
3096}
3097
3098/*
3099 * kvm_set_guest_paused() indicates to the guest kernel that it has been
3100 * stopped by the hypervisor.  This function will be called from the host only.
3101 * -EINVAL is returned when the host attempts to set the flag for a guest that
3102 * has not enabled pv clocks.
3103 */
3104static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
3105{
3106        if (!vcpu->arch.pv_time_enabled)
3107                return -EINVAL;
3108        vcpu->arch.pvclock_set_guest_stopped_request = true;
3109        kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
3110        return 0;
3111}
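
/*
 * Illustrative userspace sketch (not part of this file): a VMM that has
 * kept a vcpu stopped for a while tells the guest so before resuming it,
 * which lets the guest's soft-lockup watchdog ignore the stopped time.
 * "vcpu_fd" is a hypothetical KVM_CREATE_VCPU descriptor; the ioctl takes
 * no argument.
 *
 *	if (ioctl(vcpu_fd, KVM_KVMCLOCK_CTRL, 0) < 0 && errno == EINVAL)
 *		;	// guest never registered a pvclock area
 */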
3112
3113long kvm_arch_vcpu_ioctl(struct file *filp,
3114                         unsigned int ioctl, unsigned long arg)
3115{
3116        struct kvm_vcpu *vcpu = filp->private_data;
3117        void __user *argp = (void __user *)arg;
3118        int r;
3119        union {
3120                struct kvm_lapic_state *lapic;
3121                struct kvm_xsave *xsave;
3122                struct kvm_xcrs *xcrs;
3123                void *buffer;
3124        } u;
3125
3126        u.buffer = NULL;
3127        switch (ioctl) {
3128        case KVM_GET_LAPIC: {
3129                r = -EINVAL;
3130                if (!vcpu->arch.apic)
3131                        goto out;
3132                u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
3133
3134                r = -ENOMEM;
3135                if (!u.lapic)
3136                        goto out;
3137                r = kvm_vcpu_ioctl_get_lapic(vcpu, u.lapic);
3138                if (r)
3139                        goto out;
3140                r = -EFAULT;
3141                if (copy_to_user(argp, u.lapic, sizeof(struct kvm_lapic_state)))
3142                        goto out;
3143                r = 0;
3144                break;
3145        }
3146        case KVM_SET_LAPIC: {
3147                r = -EINVAL;
3148                if (!vcpu->arch.apic)
3149                        goto out;
3150                u.lapic = memdup_user(argp, sizeof(*u.lapic));
3151                if (IS_ERR(u.lapic))
3152                        return PTR_ERR(u.lapic);
3153
3154                r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic);
3155                break;
3156        }
3157        case KVM_INTERRUPT: {
3158                struct kvm_interrupt irq;
3159
3160                r = -EFAULT;
3161                if (copy_from_user(&irq, argp, sizeof irq))
3162                        goto out;
3163                r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
3164                break;
3165        }
3166        case KVM_NMI: {
3167                r = kvm_vcpu_ioctl_nmi(vcpu);
3168                break;
3169        }
3170        case KVM_SET_CPUID: {
3171                struct kvm_cpuid __user *cpuid_arg = argp;
3172                struct kvm_cpuid cpuid;
3173
3174                r = -EFAULT;
3175                if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
3176                        goto out;
3177                r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
3178                break;
3179        }
3180        case KVM_SET_CPUID2: {
3181                struct kvm_cpuid2 __user *cpuid_arg = argp;
3182                struct kvm_cpuid2 cpuid;
3183
3184                r = -EFAULT;
3185                if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
3186                        goto out;
3187                r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
3188                                              cpuid_arg->entries);
3189                break;
3190        }
3191        case KVM_GET_CPUID2: {
3192                struct kvm_cpuid2 __user *cpuid_arg = argp;
3193                struct kvm_cpuid2 cpuid;
3194
3195                r = -EFAULT;
3196                if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
3197                        goto out;
3198                r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
3199                                              cpuid_arg->entries);
3200                if (r)
3201                        goto out;
3202                r = -EFAULT;
3203                if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
3204                        goto out;
3205                r = 0;
3206                break;
3207        }
3208        case KVM_GET_MSRS:
3209                r = msr_io(vcpu, argp, kvm_get_msr, 1);
3210                break;
3211        case KVM_SET_MSRS:
3212                r = msr_io(vcpu, argp, do_set_msr, 0);
3213                break;
3214        case KVM_TPR_ACCESS_REPORTING: {
3215                struct kvm_tpr_access_ctl tac;
3216
3217                r = -EFAULT;
3218                if (copy_from_user(&tac, argp, sizeof tac))
3219                        goto out;
3220                r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac);
3221                if (r)
3222                        goto out;
3223                r = -EFAULT;
3224                if (copy_to_user(argp, &tac, sizeof tac))
3225                        goto out;
3226                r = 0;
3227                break;
3228        }
3229        case KVM_SET_VAPIC_ADDR: {
3230                struct kvm_vapic_addr va;
3231
3232                r = -EINVAL;
3233                if (!irqchip_in_kernel(vcpu->kvm))
3234                        goto out;
3235                r = -EFAULT;
3236                if (copy_from_user(&va, argp, sizeof va))
3237                        goto out;
3238                r = kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
3239                break;
3240        }
3241        case KVM_X86_SETUP_MCE: {
3242                u64 mcg_cap;
3243
3244                r = -EFAULT;
3245                if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap))
3246                        goto out;
3247                r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap);
3248                break;
3249        }
3250        case KVM_X86_SET_MCE: {
3251                struct kvm_x86_mce mce;
3252
3253                r = -EFAULT;
3254                if (copy_from_user(&mce, argp, sizeof mce))
3255                        goto out;
3256                r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
3257                break;
3258        }
3259        case KVM_GET_VCPU_EVENTS: {
3260                struct kvm_vcpu_events events;
3261
3262                kvm_vcpu_ioctl_x86_get_vcpu_events(vcpu, &events);
3263
3264                r = -EFAULT;
3265                if (copy_to_user(argp, &events, sizeof(struct kvm_vcpu_events)))
3266                        break;
3267                r = 0;
3268                break;
3269        }
3270        case KVM_SET_VCPU_EVENTS: {
3271                struct kvm_vcpu_events events;
3272
3273                r = -EFAULT;
3274                if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events)))
3275                        break;
3276
3277                r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events);
3278                break;
3279        }
3280        case KVM_GET_DEBUGREGS: {
3281                struct kvm_debugregs dbgregs;
3282
3283                kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs);
3284
3285                r = -EFAULT;
3286                if (copy_to_user(argp, &dbgregs,
3287                                 sizeof(struct kvm_debugregs)))
3288                        break;
3289                r = 0;
3290                break;
3291        }
3292        case KVM_SET_DEBUGREGS: {
3293                struct kvm_debugregs dbgregs;
3294
3295                r = -EFAULT;
3296                if (copy_from_user(&dbgregs, argp,
3297                                   sizeof(struct kvm_debugregs)))
3298                        break;
3299
3300                r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs);
3301                break;
3302        }
3303        case KVM_GET_XSAVE: {
3304                u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
3305                r = -ENOMEM;
3306                if (!u.xsave)
3307                        break;
3308
3309                kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave);
3310
3311                r = -EFAULT;
3312                if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave)))
3313                        break;
3314                r = 0;
3315                break;
3316        }
3317        case KVM_SET_XSAVE: {
3318                u.xsave = memdup_user(argp, sizeof(*u.xsave));
3319                if (IS_ERR(u.xsave))
3320                        return PTR_ERR(u.xsave);
3321
3322                r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
3323                break;
3324        }
3325        case KVM_GET_XCRS: {
3326                u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
3327                r = -ENOMEM;
3328                if (!u.xcrs)
3329                        break;
3330
3331                kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs);
3332
3333                r = -EFAULT;
3334                if (copy_to_user(argp, u.xcrs,
3335                                 sizeof(struct kvm_xcrs)))
3336                        break;
3337                r = 0;
3338                break;
3339        }
3340        case KVM_SET_XCRS: {
3341                u.xcrs = memdup_user(argp, sizeof(*u.xcrs));
3342                if (IS_ERR(u.xcrs))
3343                        return PTR_ERR(u.xcrs);
3344
3345                r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
3346                break;
3347        }
3348        case KVM_SET_TSC_KHZ: {
3349                u32 user_tsc_khz;
3350
3351                r = -EINVAL;
3352                user_tsc_khz = (u32)arg;
3353
3354                if (user_tsc_khz >= kvm_max_guest_tsc_khz)
3355                        goto out;
3356
3357                if (user_tsc_khz == 0)
3358                        user_tsc_khz = tsc_khz;
3359
3360                kvm_set_tsc_khz(vcpu, user_tsc_khz);
3361
3362                r = 0;
3363                goto out;
3364        }
3365        case KVM_GET_TSC_KHZ: {
3366                r = vcpu->arch.virtual_tsc_khz;
3367                goto out;
3368        }
3369        case KVM_KVMCLOCK_CTRL: {
3370                r = kvm_set_guest_paused(vcpu);
3371                goto out;
3372        }
3373        default:
3374                r = -EINVAL;
3375        }
3376out:
3377        kfree(u.buffer);
3378        return r;
3379}
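
/*
 * Illustrative userspace sketch (not part of this file): unlike most vcpu
 * ioctls dispatched above, KVM_SET_TSC_KHZ takes its value directly in the
 * ioctl argument and KVM_GET_TSC_KHZ reports it via the ioctl return value.
 * "vcpu_fd" is a hypothetical KVM_CREATE_VCPU descriptor.
 *
 *	int khz = ioctl(vcpu_fd, KVM_GET_TSC_KHZ, 0);
 *
 *	if (khz > 0)
 *		ioctl(vcpu_fd, KVM_SET_TSC_KHZ, (unsigned long)khz);
 */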
3380
3381int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
3382{
3383        return VM_FAULT_SIGBUS;
3384}
3385
3386static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
3387{
3388        int ret;
3389
3390        if (addr > (unsigned int)(-3 * PAGE_SIZE))
3391                return -EINVAL;
3392        ret = kvm_x86_ops->set_tss_addr(kvm, addr);
3393        return ret;
3394}
3395
3396static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm,
3397                                              u64 ident_addr)
3398{
3399        kvm->arch.ept_identity_map_addr = ident_addr;
3400        return 0;
3401}
3402
3403static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
3404                                          u32 kvm_nr_mmu_pages)
3405{
3406        if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
3407                return -EINVAL;
3408
3409        mutex_lock(&kvm->slots_lock);
3410
3411        kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
3412        kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
3413
3414        mutex_unlock(&kvm->slots_lock);
3415        return 0;
3416}
3417
3418static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
3419{
3420        return kvm->arch.n_max_mmu_pages;
3421}
3422
3423static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
3424{
3425        int r;
3426
3427        r = 0;
3428        switch (chip->chip_id) {
3429        case KVM_IRQCHIP_PIC_MASTER:
3430                memcpy(&chip->chip.pic,
3431                        &pic_irqchip(kvm)->pics[0],
3432                        sizeof(struct kvm_pic_state));
3433                break;
3434        case KVM_IRQCHIP_PIC_SLAVE:
3435                memcpy(&chip->chip.pic,
3436                        &pic_irqchip(kvm)->pics[1],
3437                        sizeof(struct kvm_pic_state));
3438                break;
3439        case KVM_IRQCHIP_IOAPIC:
3440                r = kvm_get_ioapic(kvm, &chip->chip.ioapic);
3441                break;
3442        default:
3443                r = -EINVAL;
3444                break;
3445        }
3446        return r;
3447}
3448
3449static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
3450{
3451        int r;
3452
3453        r = 0;
3454        switch (chip->chip_id) {
3455        case KVM_IRQCHIP_PIC_MASTER:
3456                spin_lock(&pic_irqchip(kvm)->lock);
3457                memcpy(&pic_irqchip(kvm)->pics[0],
3458                        &chip->chip.pic,
3459                        sizeof(struct kvm_pic_state));
3460                spin_unlock(&pic_irqchip(kvm)->lock);
3461                break;
3462        case KVM_IRQCHIP_PIC_SLAVE:
3463                spin_lock(&pic_irqchip(kvm)->lock);
3464                memcpy(&pic_irqchip(kvm)->pics[1],
3465                        &chip->chip.pic,
3466                        sizeof(struct kvm_pic_state));
3467                spin_unlock(&pic_irqchip(kvm)->lock);
3468                break;
3469        case KVM_IRQCHIP_IOAPIC:
3470                r = kvm_set_ioapic(kvm, &chip->chip.ioapic);
3471                break;
3472        default:
3473                r = -EINVAL;
3474                break;
3475        }
3476        kvm_pic_update_irq(pic_irqchip(kvm));
3477        return r;
3478}
3479
3480static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
3481{
3482        int r = 0;
3483
3484        mutex_lock(&kvm->arch.vpit->pit_state.lock);
3485        memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state));
3486        mutex_unlock(&kvm->arch.vpit->pit_state.lock);
3487        return r;
3488}
3489
3490static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
3491{
3492        int r = 0;
3493
3494        mutex_lock(&kvm->arch.vpit->pit_state.lock);
3495        memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
3496        kvm_pit_load_count(kvm, 0, ps->channels[0].count, 0);
3497        mutex_unlock(&kvm->arch.vpit->pit_state.lock);
3498        return r;
3499}
3500
3501static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
3502{
3503        int r = 0;
3504
3505        mutex_lock(&kvm->arch.vpit->pit_state.lock);
3506        memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
3507                sizeof(ps->channels));
3508        ps->flags = kvm->arch.vpit->pit_state.flags;
3509        mutex_unlock(&kvm->arch.vpit->pit_state.lock);
3510        memset(&ps->reserved, 0, sizeof(ps->reserved));
3511        return r;
3512}
3513
3514static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
3515{
3516        int r = 0, start = 0;
3517        u32 prev_legacy, cur_legacy;
3518        mutex_lock(&kvm->arch.vpit->pit_state.lock);
3519        prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
3520        cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
3521        if (!prev_legacy && cur_legacy)
3522                start = 1;
3523        memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels,
3524               sizeof(kvm->arch.vpit->pit_state.channels));
3525        kvm->arch.vpit->pit_state.flags = ps->flags;
3526        kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count, start);
3527        mutex_unlock(&kvm->arch.vpit->pit_state.lock);
3528        return r;
3529}
3530
3531static int kvm_vm_ioctl_reinject(struct kvm *kvm,
3532                                 struct kvm_reinject_control *control)
3533{
3534        if (!kvm->arch.vpit)
3535                return -ENXIO;
3536        mutex_lock(&kvm->arch.vpit->pit_state.lock);
3537        kvm->arch.vpit->pit_state.reinject = control->pit_reinject;
3538        mutex_unlock(&kvm->arch.vpit->pit_state.lock);
3539        return 0;
3540}
3541
3542/**
3543 * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
3544 * @kvm: kvm instance
3545 * @log: slot id and address to which we copy the log
3546 *
3547 * We need to keep in mind that VCPU threads can write to the bitmap
3548 * concurrently.  So, to avoid losing data, we keep the following order for
3549 * each bit:
3550 *
3551 *   1. Take a snapshot of the bit and clear it if needed.
3552 *   2. Write protect the corresponding page.
3553 *   3. Flush TLBs if needed.
3554 *   4. Copy the snapshot to userspace.
3555 *
3556 * Between 2 and 3, the guest may write to the page using the remaining TLB
3557 * entry.  This is not a problem because the page will be reported dirty at
3558 * step 4 using the snapshot taken before and step 3 ensures that successive
3559 * writes will be logged for the next call.
3560 */
3561int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
3562{
3563        int r;
3564        struct kvm_memory_slot *memslot;
3565        unsigned long n, i;
3566        unsigned long *dirty_bitmap;
3567        unsigned long *dirty_bitmap_buffer;
3568        bool is_dirty = false;
3569
3570        mutex_lock(&kvm->slots_lock);
3571
3572        r = -EINVAL;
3573        if (log->slot >= KVM_USER_MEM_SLOTS)
3574                goto out;
3575
3576        memslot = id_to_memslot(kvm->memslots, log->slot);
3577
3578        dirty_bitmap = memslot->dirty_bitmap;
3579        r = -ENOENT;
3580        if (!dirty_bitmap)
3581                goto out;
3582
3583        n = kvm_dirty_bitmap_bytes(memslot);
3584
3585        dirty_bitmap_buffer = dirty_bitmap + n / sizeof(long);
3586        memset(dirty_bitmap_buffer, 0, n);
3587
3588        spin_lock(&kvm->mmu_lock);
3589
3590        for (i = 0; i < n / sizeof(long); i++) {
3591                unsigned long mask;
3592                gfn_t offset;
3593
3594                if (!dirty_bitmap[i])
3595                        continue;
3596
3597                is_dirty = true;
3598
3599                mask = xchg(&dirty_bitmap[i], 0);
3600                dirty_bitmap_buffer[i] = mask;
3601
3602                offset = i * BITS_PER_LONG;
3603                kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, mask);
3604        }
3605        if (is_dirty)
3606                kvm_flush_remote_tlbs(kvm);
3607
3608        spin_unlock(&kvm->mmu_lock);
3609
3610        r = -EFAULT;
3611        if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
3612                goto out;
3613
3614        r = 0;
3615out:
3616        mutex_unlock(&kvm->slots_lock);
3617        return r;
3618}
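
/*
 * Illustrative userspace sketch (not part of this file): a dirty-page
 * tracking loop (e.g. live migration) drains the log produced above.
 * "vm_fd" is a hypothetical KVM_CREATE_VM descriptor, "slot"/"npages"
 * describe an existing memslot with dirty logging enabled, and the bitmap
 * carries one bit per page, rounded up to a multiple of 64 bits.
 *
 *	struct kvm_dirty_log log = { .slot = slot };
 *	unsigned long *bitmap = calloc((npages + 63) / 64, 8);
 *
 *	log.dirty_bitmap = bitmap;
 *	if (ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log) == 0)
 *		... walk the set bits and re-send those pages ...
 */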
3619
3620int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
3621                        bool line_status)
3622{
3623        if (!irqchip_in_kernel(kvm))
3624                return -ENXIO;
3625
3626        irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
3627                                        irq_event->irq, irq_event->level,
3628                                        line_status);
3629        return 0;
3630}
3631
3632long kvm_arch_vm_ioctl(struct file *filp,
3633                       unsigned int ioctl, unsigned long arg)
3634{
3635        struct kvm *kvm = filp->private_data;
3636        void __user *argp = (void __user *)arg;
3637        int r = -ENOTTY;
3638        /*
3639         * This union makes it completely explicit to gcc-3.x
3640         * that these variables' stack usage should be
3641         * combined, not added together.
3642         */
3643        union {
3644                struct kvm_pit_state ps;
3645                struct kvm_pit_state2 ps2;
3646                struct kvm_pit_config pit_config;
3647        } u;
3648
3649        switch (ioctl) {
3650        case KVM_SET_TSS_ADDR:
3651                r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
3652                break;
3653        case KVM_SET_IDENTITY_MAP_ADDR: {
3654                u64 ident_addr;
3655
3656                r = -EFAULT;
3657                if (copy_from_user(&ident_addr, argp, sizeof ident_addr))
3658                        goto out;
3659                r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr);
3660                break;
3661        }
3662        case KVM_SET_NR_MMU_PAGES:
3663                r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
3664                break;
3665        case KVM_GET_NR_MMU_PAGES:
3666                r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
3667                break;
3668        case KVM_CREATE_IRQCHIP: {
3669                struct kvm_pic *vpic;
3670
3671                mutex_lock(&kvm->lock);
3672                r = -EEXIST;
3673                if (kvm->arch.vpic)
3674                        goto create_irqchip_unlock;
3675                r = -EINVAL;
3676                if (atomic_read(&kvm->online_vcpus))
3677                        goto create_irqchip_unlock;
3678                r = -ENOMEM;
3679                vpic = kvm_create_pic(kvm);
3680                if (vpic) {
3681                        r = kvm_ioapic_init(kvm);
3682                        if (r) {
3683                                mutex_lock(&kvm->slots_lock);
3684                                kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
3685                                                          &vpic->dev_master);
3686                                kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
3687                                                          &vpic->dev_slave);
3688                                kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
3689                                                          &vpic->dev_eclr);
3690                                mutex_unlock(&kvm->slots_lock);
3691                                kfree(vpic);
3692                                goto create_irqchip_unlock;
3693                        }
3694                } else
3695                        goto create_irqchip_unlock;
3696                smp_wmb();
3697                kvm->arch.vpic = vpic;
3698                smp_wmb();
3699                r = kvm_setup_default_irq_routing(kvm);
3700                if (r) {
3701                        mutex_lock(&kvm->slots_lock);
3702                        mutex_lock(&kvm->irq_lock);
3703                        kvm_ioapic_destroy(kvm);
3704                        kvm_destroy_pic(kvm);
3705                        mutex_unlock(&kvm->irq_lock);
3706                        mutex_unlock(&kvm->slots_lock);
3707                }
3708        create_irqchip_unlock:
3709                mutex_unlock(&kvm->lock);
3710                break;
3711        }
3712        case KVM_CREATE_PIT:
3713                u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY;
3714                goto create_pit;
3715        case KVM_CREATE_PIT2:
3716                r = -EFAULT;
3717                if (copy_from_user(&u.pit_config, argp,
3718                                   sizeof(struct kvm_pit_config)))
3719                        goto out;
3720        create_pit:
3721                mutex_lock(&kvm->slots_lock);
3722                r = -EEXIST;
3723                if (kvm->arch.vpit)
3724                        goto create_pit_unlock;
3725                r = -ENOMEM;
3726                kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags);
3727                if (kvm->arch.vpit)
3728                        r = 0;
3729        create_pit_unlock:
3730                mutex_unlock(&kvm->slots_lock);
3731                break;
3732        case KVM_GET_IRQCHIP: {
3733                /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
3734                struct kvm_irqchip *chip;
3735
3736                chip = memdup_user(argp, sizeof(*chip));
3737                if (IS_ERR(chip)) {
3738                        r = PTR_ERR(chip);
3739                        goto out;
3740                }
3741
3742                r = -ENXIO;
3743                if (!irqchip_in_kernel(kvm))
3744                        goto get_irqchip_out;
3745                r = kvm_vm_ioctl_get_irqchip(kvm, chip);
3746                if (r)
3747                        goto get_irqchip_out;
3748                r = -EFAULT;
3749                if (copy_to_user(argp, chip, sizeof *chip))
3750                        goto get_irqchip_out;
3751                r = 0;
3752        get_irqchip_out:
3753                kfree(chip);
3754                break;
3755        }
3756        case KVM_SET_IRQCHIP: {
3757                /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
3758                struct kvm_irqchip *chip;
3759
3760                chip = memdup_user(argp, sizeof(*chip));
3761                if (IS_ERR(chip)) {
3762                        r = PTR_ERR(chip);
3763                        goto out;
3764                }
3765
3766                r = -ENXIO;
3767                if (!irqchip_in_kernel(kvm))
3768                        goto set_irqchip_out;
3769                r = kvm_vm_ioctl_set_irqchip(kvm, chip);
3770                if (r)
3771                        goto set_irqchip_out;
3772                r = 0;
3773        set_irqchip_out:
3774                kfree(chip);
3775                break;
3776        }
3777        case KVM_GET_PIT: {
3778                r = -EFAULT;
3779                if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state)))
3780                        goto out;
3781                r = -ENXIO;
3782                if (!kvm->arch.vpit)
3783                        goto out;
3784                r = kvm_vm_ioctl_get_pit(kvm, &u.ps);
3785                if (r)
3786                        goto out;
3787                r = -EFAULT;
3788                if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state)))
3789                        goto out;
3790                r = 0;
3791                break;
3792        }
3793        case KVM_SET_PIT: {
3794                r = -EFAULT;
3795                if (copy_from_user(&u.ps, argp, sizeof u.ps))
3796                        goto out;
3797                r = -ENXIO;
3798                if (!kvm->arch.vpit)
3799                        goto out;
3800                r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
3801                break;
3802        }
3803        case KVM_GET_PIT2: {
3804                r = -ENXIO;
3805                if (!kvm->arch.vpit)
3806                        goto out;
3807                r = kvm_vm_ioctl_get_pit2(kvm, &u.ps2);
3808                if (r)
3809                        goto out;
3810                r = -EFAULT;
3811                if (copy_to_user(argp, &u.ps2, sizeof(u.ps2)))
3812                        goto out;
3813                r = 0;
3814                break;
3815        }
3816        case KVM_SET_PIT2: {
3817                r = -EFAULT;
3818                if (copy_from_user(&u.ps2, argp, sizeof(u.ps2)))
3819                        goto out;
3820                r = -ENXIO;
3821                if (!kvm->arch.vpit)
3822                        goto out;
3823                r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2);
3824                break;
3825        }
3826        case KVM_REINJECT_CONTROL: {
3827                struct kvm_reinject_control control;
3828                r = -EFAULT;
3829                if (copy_from_user(&control, argp, sizeof(control)))
3830                        goto out;
3831                r = kvm_vm_ioctl_reinject(kvm, &control);
3832                break;
3833        }
3834        case KVM_XEN_HVM_CONFIG: {
3835                r = -EFAULT;
3836                if (copy_from_user(&kvm->arch.xen_hvm_config, argp,
3837                                   sizeof(struct kvm_xen_hvm_config)))
3838                        goto out;
3839                r = -EINVAL;
3840                if (kvm->arch.xen_hvm_config.flags)
3841                        goto out;
3842                r = 0;
3843                break;
3844        }
3845        case KVM_SET_CLOCK: {
3846                struct kvm_clock_data user_ns;
3847                u64 now_ns;
3848                s64 delta;
3849
3850                r = -EFAULT;
3851                if (copy_from_user(&user_ns, argp, sizeof(user_ns)))
3852                        goto out;
3853
3854                r = -EINVAL;
3855                if (user_ns.flags)
3856                        goto out;
3857
3858                r = 0;
3859                local_irq_disable();
3860                now_ns = get_kernel_ns();
3861                delta = user_ns.clock - now_ns;
3862                local_irq_enable();
3863                kvm->arch.kvmclock_offset = delta;
3864                kvm_gen_update_masterclock(kvm);
3865                break;
3866        }
3867        case KVM_GET_CLOCK: {
3868                struct kvm_clock_data user_ns;
3869                u64 now_ns;
3870
3871                local_irq_disable();
3872                now_ns = get_kernel_ns();
3873                user_ns.clock = kvm->arch.kvmclock_offset + now_ns;
3874                local_irq_enable();
3875                user_ns.flags = 0;
3876                memset(&user_ns.pad, 0, sizeof(user_ns.pad));
3877
3878                r = -EFAULT;
3879                if (copy_to_user(argp, &user_ns, sizeof(user_ns)))
3880                        goto out;
3881                r = 0;
3882                break;
3883        }
3884
3885        default:
3886                ;       /* r stays -ENOTTY for unhandled ioctls */
3887        }
3888out:
3889        return r;
3890}
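
/*
 * Illustrative userspace sketch (not part of this file): typical bring-up
 * order for the VM ioctls dispatched above; "vm_fd" is a hypothetical
 * KVM_CREATE_VM descriptor.  The in-kernel irqchip has to be created
 * before any vcpus exist (see the online_vcpus check above).
 *
 *	struct kvm_pit_config pit = { .flags = KVM_PIT_SPEAKER_DUMMY };
 *	struct kvm_clock_data clock;
 *
 *	ioctl(vm_fd, KVM_CREATE_IRQCHIP, 0);
 *	ioctl(vm_fd, KVM_CREATE_PIT2, &pit);
 *	ioctl(vm_fd, KVM_GET_CLOCK, &clock);	// clock.clock = current kvmclock, in ns
 */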
3891
3892static void kvm_init_msr_list(void)
3893{
3894        u32 dummy[2];
3895        unsigned i, j;
3896
3897        /* Skip the KVM-specific MSRs at the head of the list: they don't exist on the host CPU. */
3898        for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) {
3899                if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
3900                        continue;
3901                if (j < i)
3902                        msrs_to_save[j] = msrs_to_save[i];
3903                j++;
3904        }
3905        num_msrs_to_save = j;
3906}
3907
3908static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
3909                           const void *v)
3910{
3911        int handled = 0;
3912        int n;
3913
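        /* Try the local APIC page first, then the in-kernel MMIO bus, in chunks of at most 8 bytes. */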
3914        do {
3915                n = min(len, 8);
3916                if (!(vcpu->arch.apic &&
3917                      !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, n, v))
3918                    && kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, n, v))
3919                        break;
3920                handled += n;
3921                addr += n;
3922                len -= n;
3923                v += n;
3924        } while (len);
3925
3926        return handled;
3927}
3928
3929static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
3930{
3931        int handled = 0;
3932        int n;
3933
3934        do {
3935                n = min(len, 8);
3936                if (!(vcpu->arch.apic &&
3937                      !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, n, v))
3938                    && kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, n, v))
3939                        break;
3940                trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, *(u64 *)v);
3941                handled += n;
3942                addr += n;
3943                len -= n;
3944                v += n;
3945        } while (len);
3946
3947        return handled;
3948}
3949
3950static void kvm_set_segment(struct kvm_vcpu *vcpu,
3951                        struct kvm_segment *var, int seg)
3952{
3953        kvm_x86_ops->set_segment(vcpu, var, seg);
3954}
3955
3956void kvm_get_segment(struct kvm_vcpu *vcpu,
3957                     struct kvm_segment *var, int seg)
3958{
3959        kvm_x86_ops->get_segment(vcpu, var, seg);
3960}
3961
3962gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
3963{
3964        gpa_t t_gpa;
3965        struct x86_exception exception;
3966
3967        BUG_ON(!mmu_is_nested(vcpu));
3968
3969        /* NPT walks are always user-walks */
3970        access |= PFERR_USER_MASK;
3971        t_gpa  = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &exception);
3972
3973        return t_gpa;
3974}
3975
3976gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
3977                              struct x86_exception *exception)
3978{
3979        u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3980        return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
3981}
3982
3983gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
3984                               struct x86_exception *exception)
3985{
3986        u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3987        access |= PFERR_FETCH_MASK;
3988        return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
3989}
3990
3991gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
3992                               struct x86_exception *exception)
3993{
3994        u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3995        access |= PFERR_WRITE_MASK;
3996        return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
3997}
3998
3999/* used to access any of the guest's mapped memory without checking the CPL */
4000gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
4001                                struct x86_exception *exception)
4002{
4003        return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, exception);
4004}
4005
4006static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
4007                                      struct kvm_vcpu *vcpu, u32 access,
4008                                      struct x86_exception *exception)
4009{
4010        void *data = val;
4011        int r = X86EMUL_CONTINUE;
4012
4013        while (bytes) {
4014                gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access,
4015                                                            exception);
4016                unsigned offset = addr & (PAGE_SIZE-1);
4017                unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
4018                int ret;
4019
4020                if (gpa == UNMAPPED_GVA)
4021                        return X86EMUL_PROPAGATE_FAULT;
4022                ret = kvm_read_guest(vcpu->kvm, gpa, data, toread);
4023                if (ret < 0) {
4024                        r = X86EMUL_IO_NEEDED;
4025                        goto out;
4026                }
4027
4028                bytes -= toread;
4029                data += toread;
4030                addr += toread;
4031        }
4032out:
4033        return r;
4034}
4035
4036/* used for instruction fetching */
4037static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
4038                                gva_t addr, void *val, unsigned int bytes,
4039                                struct x86_exception *exception)
4040{
4041        struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4042        u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
4043
4044        return kvm_read_guest_virt_helper(addr, val, bytes, vcpu,
4045                                          access | PFERR_FETCH_MASK,
4046                                          exception);
4047}
4048
4049int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
4050                               gva_t addr, void *val, unsigned int bytes,
4051                               struct x86_exception *exception)
4052{
4053        struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4054        u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
4055
4056        return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
4057                                          exception);
4058}
4059EXPORT_SYMBOL_GPL(kvm_read_guest_virt);
4060
4061static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt,
4062                                      gva_t addr, void *val, unsigned int bytes,
4063                                      struct x86_exception *exception)
4064{
4065        struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4066        return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception);
4067}
4068
4069int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
4070                                       gva_t addr, void *val,
4071                                       unsigned int bytes,
4072                                       struct x86_exception *exception)
4073{
4074        struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4075        void *data = val;
4076        int r = X86EMUL_CONTINUE;
4077
4078        while (bytes) {
4079                gpa_t gpa =  vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr,
4080                                                             PFERR_WRITE_MASK,
4081                                                             exception);
4082                unsigned offset = addr & (PAGE_SIZE-1);
4083                unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
4084                int ret;
4085
4086                if (gpa == UNMAPPED_GVA)
4087                        return X86EMUL_PROPAGATE_FAULT;
4088                ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite);
4089                if (ret < 0) {
4090                        r = X86EMUL_IO_NEEDED;
4091                        goto out;
4092                }
4093
4094                bytes -= towrite;
4095                data += towrite;
4096                addr += towrite;
4097        }
4098out:
4099        return r;
4100}
4101EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
4102
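/*
 * Translate a guest virtual address for an emulated data access.
 * Returns -1 if the page walk faults (*exception is filled in), 1 if the
 * access must be handled as MMIO (cached MMIO match or the APIC access
 * page), and 0 if it targets ordinary guest memory.
 */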
4103static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
4104                                gpa_t *gpa, struct x86_exception *exception,
4105                                bool write)
4106{
4107        u32 access = ((kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0)
4108                | (write ? PFERR_WRITE_MASK : 0);
4109
4110        if (vcpu_match_mmio_gva(vcpu, gva)
4111            && !permission_fault(vcpu->arch.walk_mmu, vcpu->arch.access, access)) {
4112                *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
4113                                        (gva & (PAGE_SIZE - 1));
4114                trace_vcpu_match_mmio(gva, *gpa, write, false);
4115                return 1;
4116        }
4117
4118        *gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
4119
4120        if (*gpa == UNMAPPED_GVA)
4121                return -1;
4122
4123        /* For APIC access vmexit */
4124        if ((*gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
4125                return 1;
4126
4127        if (vcpu_match_mmio_gpa(vcpu, *gpa)) {
4128                trace_vcpu_match_mmio(gva, *gpa, write, true);
4129                return 1;
4130        }
4131
4132        return 0;
4133}
4134
4135int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
4136                        const void *val, int bytes)
4137{
4138        int ret;
4139
4140        ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
4141        if (ret < 0)
4142                return 0;
4143        kvm_mmu_pte_write(vcpu, gpa, val, bytes);
4144        return 1;
4145}
4146
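/*
 * Hooks that let emulator_read_write() share one path between reads and
 * writes: read_write_prepare() may satisfy the access up front (e.g. from
 * an already completed MMIO read), read_write_emulate() tries ordinary
 * guest memory, read_write_mmio() tries in-kernel MMIO handlers, and
 * read_write_exit_mmio() sets up the exit to userspace for what is left.
 */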
4147struct read_write_emulator_ops {
4148        int (*read_write_prepare)(struct kvm_vcpu *vcpu, void *val,
4149                                  int bytes);
4150        int (*read_write_emulate)(struct kvm_vcpu *vcpu, gpa_t gpa,
4151                                  void *val, int bytes);
4152        int (*read_write_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa,
4153                               int bytes, void *val);
4154        int (*read_write_exit_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa,
4155                                    void *val, int bytes);
4156        bool write;
4157};
4158
4159static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes)
4160{
4161        if (vcpu->mmio_read_completed) {
4162                trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes,
4163                               vcpu->mmio_fragments[0].gpa, *(u64 *)val);
4164                vcpu->mmio_read_completed = 0;
4165                return 1;
4166        }
4167
4168        return 0;
4169}
4170
4171static int read_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
4172                        void *val, int bytes)
4173{
4174        return !kvm_read_guest(vcpu->kvm, gpa, val, bytes);
4175}
4176
4177static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
4178                         void *val, int bytes)
4179{
4180        return emulator_write_phys(vcpu, gpa, val, bytes);
4181}
4182
4183static int write_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes, void *val)
4184{
4185        trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
4186        return vcpu_mmio_write(vcpu, gpa, bytes, val);
4187}
4188
4189static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
4190                          void *val, int bytes)
4191{
4192        trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0);
4193        return X86EMUL_IO_NEEDED;
4194}
4195
4196static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
4197                           void *val, int bytes)
4198{
4199        struct kvm_mmio_fragment *frag = &vcpu->mmio_fragments[0];
4200
4201        memcpy(vcpu->run->mmio.data, frag->data, min(8u, frag->len));
4202        return X86EMUL_CONTINUE;
4203}
4204
4205static const struct read_write_emulator_ops read_emultor = {
4206        .read_write_prepare = read_prepare,
4207        .read_write_emulate = read_emulate,
4208        .read_write_mmio = vcpu_mmio_read,
4209        .read_write_exit_mmio = read_exit_mmio,
4210};
4211
4212static const struct read_write_emulator_ops write_emultor = {
4213        .read_write_emulate = write_emulate,
4214        .read_write_mmio = write_mmio,
4215        .read_write_exit_mmio = write_exit_mmio,
4216        .write = true,
4217};
4218
4219static int emulator_read_write_onepage(unsigned long addr, void *val,
4220                                       unsigned int bytes,
4221                                       struct x86_exception *exception,
4222                                       struct kvm_vcpu *vcpu,
4223                                       const struct read_write_emulator_ops *ops)
4224{
4225        gpa_t gpa;
4226        int handled, ret;
4227        bool write = ops->write;
4228        struct kvm_mmio_fragment *frag;
4229
4230        ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write);
4231
4232        if (ret < 0)
4233                return X86EMUL_PROPAGATE_FAULT;
4234
4235        /* For APIC access vmexit */
4236        if (ret)
4237                goto mmio;
4238
4239        if (ops->read_write_emulate(vcpu, gpa, val, bytes))
4240                return X86EMUL_CONTINUE;
4241
4242mmio:
4243        /*
4244         * Is this MMIO handled locally?
4245         */
4246        handled = ops->read_write_mmio(vcpu, gpa, bytes, val);
4247        if (handled == bytes)
4248                return X86EMUL_CONTINUE;
4249
4250        gpa += handled;
4251        bytes -= handled;
4252        val += handled;
4253
4254        WARN_ON(vcpu->mmio_nr_fragments >= KVM_MAX_MMIO_FRAGMENTS);
4255        frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++];
4256        frag->gpa = gpa;
4257        frag->data = val;
4258        frag->len = bytes;
4259        return X86EMUL_CONTINUE;
4260}
4261
4262int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr,
4263                        void *val, unsigned int bytes,
4264                        struct x86_exception *exception,
4265                        const struct read_write_emulator_ops *ops)
4266{
4267        struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4268        gpa_t gpa;
4269        int rc;
4270
4271        if (ops->read_write_prepare &&
4272                  ops->read_write_prepare(vcpu, val, bytes))
4273                return X86EMUL_CONTINUE;
4274
4275        vcpu->mmio_nr_fragments = 0;
4276
4277        /* Crossing a page boundary? */
4278        if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
4279                int now;
4280
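                /* Bytes up to the end of the first page. */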
4281                now = -addr & ~PAGE_MASK;
4282                rc = emulator_read_write_onepage(addr, val, now, exception,
4283                                                 vcpu, ops);
4284
4285                if (rc != X86EMUL_CONTINUE)
4286                        return rc;
4287                addr += now;
4288                val += now;
4289                bytes -= now;
4290        }
4291
4292        rc = emulator_read_write_onepage(addr, val, bytes, exception,
4293                                         vcpu, ops);
4294        if (rc != X86EMUL_CONTINUE)
4295                return rc;
4296
4297        if (!vcpu->mmio_nr_fragments)
4298                return rc;
4299
4300        gpa = vcpu->mmio_fragments[0].gpa;
4301
4302        vcpu->mmio_needed = 1;
4303        vcpu->mmio_cur_fragment = 0;
4304
4305        vcpu->run->mmio.len = min(8u, vcpu->mmio_fragments[0].len);
4306        vcpu->run->mmio.is_write = vcpu->mmio_is_write = ops->write;
4307        vcpu->run->exit_reason = KVM_EXIT_MMIO;
4308        vcpu->run->mmio.phys_addr = gpa;
4309
4310        return ops->read_write_exit_mmio(vcpu, gpa, val, bytes);
4311}
4312
4313static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
4314                                  unsigned long addr,
4315                                  void *val,
4316                                  unsigned int bytes,
4317                                  struct x86_exception *exception)
4318{
4319        return emulator_read_write(ctxt, addr, val, bytes,
4320                                   exception, &read_emultor);
4321}
4322
4323int emulator_write_emulated(struct x86_emulate_ctxt *ctxt,
4324                            unsigned long addr,
4325                            const void *val,
4326                            unsigned int bytes,
4327                            struct x86_exception *exception)
4328{
4329        return emulator_read_write(ctxt, addr, (void *)val, bytes,
4330                                   exception, &write_emultor);
4331}
4332
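/* Both helpers evaluate to true when the compare+exchange succeeded. */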
4333#define CMPXCHG_TYPE(t, ptr, old, new) \
4334        (cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old))
4335
4336#ifdef CONFIG_X86_64
4337#  define CMPXCHG64(ptr, old, new) CMPXCHG_TYPE(u64, ptr, old, new)
4338#else
4339#  define CMPXCHG64(ptr, old, new) \
4340        (cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old))
4341#endif
4342
4343static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
4344                                     unsigned long addr,
4345                                     const void *old,
4346                                     const void *new,
4347                                     unsigned int bytes,
4348                                     struct x86_exception *exception)
4349{
4350        struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4351        gpa_t gpa;
4352        struct page *page;
4353        char *kaddr;
4354        bool exchanged;
4355
4356        /* a guest's cmpxchg8b has to be emulated atomically */
4357        if (bytes > 8 || (bytes & (bytes - 1)))
4358                goto emul_write;
4359
4360        gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
4361
4362        if (gpa == UNMAPPED_GVA ||
4363            (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
4364                goto emul_write;
4365
4366        if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
4367                goto emul_write;
4368
4369        page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
4370        if (is_error_page(page))
4371                goto emul_write;
4372
4373        kaddr = kmap_atomic(page);
4374        kaddr += offset_in_page(gpa);
4375        switch (bytes) {
4376        case 1:
4377                exchanged = CMPXCHG_TYPE(u8, kaddr, old, new);
4378                break;
4379        case 2:
4380                exchanged = CMPXCHG_TYPE(u16, kaddr, old, new);
4381                break;
4382        case 4:
4383                exchanged = CMPXCHG_TYPE(u32, kaddr, old, new);
4384                break;
4385        case 8:
4386                exchanged = CMPXCHG64(kaddr, old, new);
4387                break;
4388        default:
4389                BUG();
4390        }
4391        kunmap_atomic(kaddr);
4392        kvm_release_page_dirty(page);
4393
4394        if (!exchanged)
4395                return X86EMUL_CMPXCHG_FAILED;
4396
4397        kvm_mmu_pte_write(vcpu, gpa, new, bytes);
4398
4399        return X86EMUL_CONTINUE;
4400
4401emul_write:
4402        printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
4403
4404        return emulator_write_emulated(ctxt, addr, new, bytes, exception);
4405}
4406
4407static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
4408{
4409        /* TODO: string I/O for in-kernel devices */
4410        int r;
4411
4412        if (vcpu->arch.pio.in)
4413                r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port,
4414                                    vcpu->arch.pio.size, pd);
4415        else
4416                r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
4417                                     vcpu->arch.pio.port, vcpu->arch.pio.size,
4418                                     pd);
4419        return r;
4420}
4421
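/*
 * Common PIO helper: returns 1 if the access was fully handled by an
 * in-kernel device, 0 if vcpu->run has been set up for a KVM_EXIT_IO
 * round trip to userspace (the shared page at KVM_PIO_PAGE_OFFSET
 * carries the data).
 */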
4422static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size,
4423                               unsigned short port, void *val,
4424                               unsigned int count, bool in)
4425{
4426        trace_kvm_pio(!in, port, size, count);
4427
4428        vcpu->arch.pio.port = port;
4429        vcpu->arch.pio.in = in;
4430        vcpu->arch.pio.count = count;
4431        vcpu->arch.pio.size = size;
4432
4433        if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
4434                vcpu->arch.pio.count = 0;
4435                return 1;
4436        }
4437
4438        vcpu->run->exit_reason = KVM_EXIT_IO;
4439        vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
4440        vcpu->run->io.size = size;
4441        vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
4442        vcpu->run->io.count = count;
4443        vcpu->run->io.port = port;
4444
4445        return 0;
4446}
4447
4448static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
4449                                    int size, unsigned short port, void *val,
4450                                    unsigned int count)
4451{
4452        struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4453        int ret;
4454
4455        if (vcpu->arch.pio.count)
4456                goto data_avail;
4457
4458        ret = emulator_pio_in_out(vcpu, size, port, val, count, true);
4459        if (ret) {
4460data_avail:
4461                memcpy(val, vcpu->arch.pio_data, size * count);
4462                vcpu->arch.pio.count = 0;
4463                return 1;
4464        }
4465
4466        return 0;
4467}
4468
4469static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
4470                                     int size, unsigned short port,
4471                                     const void *val, unsigned int count)
4472{
4473        struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4474
4475        memcpy(vcpu->arch.pio_data, val, size * count);
4476        return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false);
4477}
4478
4479static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
4480{
4481        return kvm_x86_ops->get_segment_base(vcpu, seg);
4482}
4483
4484static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address)
4485{
4486        kvm_mmu_invlpg(emul_to_vcpu(ctxt), address);
4487}
4488
4489int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
4490{
4491        if (!need_emulate_wbinvd(vcpu))
4492                return X86EMUL_CONTINUE;
4493
4494        if (kvm_x86_ops->has_wbinvd_exit()) {
4495                int cpu = get_cpu();
4496
4497                cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
4498                smp_call_function_many(vcpu->arch.wbinvd_dirty_mask,
4499                                wbinvd_ipi, NULL, 1);
4500                put_cpu();
4501                cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
4502        } else
4503                wbinvd();
4504        return X86EMUL_CONTINUE;
4505}
4506EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
4507
4508static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt)
4509{
4510        kvm_emulate_wbinvd(emul_to_vcpu(ctxt));
4511}
4512
4513int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
4514{
4515        return _kvm_get_dr(emul_to_vcpu(ctxt), dr, dest);
4516}
4517
4518int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
4519{
4520
4521        return __kvm_set_dr(emul_to_vcpu(ctxt), dr, value);
4522}
4523
4524static u64 mk_cr_64(u64 curr_cr, u32 new_val)
4525{
4526        return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
4527}
4528
4529static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr)
4530{
4531        struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4532        unsigned long value;
4533
4534        switch (cr) {
4535        case 0:
4536                value = kvm_read_cr0(vcpu);
4537                break;
4538        case 2:
4539                value = vcpu->arch.cr2;
4540                break;
4541        case 3:
4542                value = kvm_read_cr3(vcpu);
4543                break;
4544        case 4:
4545                value = kvm_read_cr4(vcpu);
4546                break;
4547        case 8:
4548                value = kvm_get_cr8(vcpu);
4549                break;
4550        default:
4551                kvm_err("%s: unexpected cr %u\n", __func__, cr);
4552                return 0;
4553        }
4554
4555        return value;
4556}
4557
4558static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
4559{
4560        struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4561        int res = 0;
4562
4563        switch (cr) {
4564        case 0:
4565                res = kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
4566                break;
4567        case 2:
4568                vcpu->arch.cr2 = val;
4569                break;
4570        case 3:
4571                res = kvm_set_cr3(vcpu, val);
4572                break;
4573        case 4:
4574                res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
4575                break;
4576        case 8:
4577                res = kvm_set_cr8(vcpu, val);
4578                break;
4579        default:
4580                kvm_err("%s: unexpected cr %u\n", __func__, cr);
4581                res = -1;
4582        }
4583
4584        return res;
4585}
4586
4587static void emulator_set_rflags(struct x86_emulate_ctxt *ctxt, ulong val)
4588{
4589        kvm_set_rflags(emul_to_vcpu(ctxt), val);
4590}
4591
4592static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
4593{
4594        return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt));
4595}
4596
4597static void emulator_get_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
4598{
4599        kvm_x86_ops->get_gdt(emul_to_vcpu(ctxt), dt);
4600}
4601
4602static void emulator_get_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
4603{
4604        kvm_x86_ops->get_idt(emul_to_vcpu(ctxt), dt);
4605}
4606
4607static void emulator_set_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
4608{
4609        kvm_x86_ops->set_gdt(emul_to_vcpu(ctxt), dt);
4610}
4611
4612static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
4613{
4614        kvm_x86_ops->set_idt(emul_to_vcpu(ctxt), dt);
4615}
4616
4617static unsigned long emulator_get_cached_segment_base(
4618        struct x86_emulate_ctxt *ctxt, int seg)
4619{
4620        return get_segment_base(emul_to_vcpu(ctxt), seg);
4621}
4622
4623static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector,
4624                                 struct desc_struct *desc, u32 *base3,
4625                                 int seg)
4626{
4627        struct kvm_segment var;
4628
4629        kvm_get_segment(emul_to_vcpu(ctxt), &var, seg);
4630        *selector = var.selector;
4631
4632        if (var.unusable) {
4633                memset(desc, 0, sizeof(*desc));
4634                return false;
4635        }
4636
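            /*
             * kvm_segment keeps a byte-granular limit; when G is set, the
             * descriptor limit field is in 4K units, so convert it back
             * (emulator_set_segment() performs the inverse conversion).
             */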
4637        if (var.g)
4638                var.limit >>= 12;
4639        set_desc_limit(desc, var.limit);
4640        set_desc_base(desc, (unsigned long)var.base);
4641#ifdef CONFIG_X86_64
4642        if (base3)
4643                *base3 = var.base >> 32;
4644#endif
4645        desc->type = var.type;
4646        desc->s = var.s;
4647        desc->dpl = var.dpl;
4648        desc->p = var.present;
4649        desc->avl = var.avl;
4650        desc->l = var.l;
4651        desc->d = var.db;
4652        desc->g = var.g;
4653
4654        return true;
4655}
4656
4657static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector,
4658                                 struct desc_struct *desc, u32 base3,
4659                                 int seg)
4660{
4661        struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4662        struct kvm_segment var;
4663
4664        var.selector = selector;
4665        var.base = get_desc_base(desc);
4666#ifdef CONFIG_X86_64
4667        var.base |= ((u64)base3) << 32;
4668#endif
4669        var.limit = get_desc_limit(desc);
4670        if (desc->g)
4671                var.limit = (var.limit << 12) | 0xfff;
4672        var.type = desc->type;
4673        var.present = desc->p;
4674        var.dpl = desc->dpl;
4675        var.db = desc->d;
4676        var.s = desc->s;
4677        var.l = desc->l;
4678        var.g = desc->g;
4679        var.avl = desc->avl;
4681        var.unusable = !var.present;
4682        var.padding = 0;
4683
4684        kvm_set_segment(vcpu, &var, seg);
4686}
4687
4688static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
4689                            u32 msr_index, u64 *pdata)
4690{
4691        return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata);
4692}
4693
4694static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
4695                            u32 msr_index, u64 data)
4696{
4697        struct msr_data msr;
4698
4699        msr.data = data;
4700        msr.index = msr_index;
4701        msr.host_initiated = false;
4702        return kvm_set_msr(emul_to_vcpu(ctxt), &msr);
4703}
4704
4705static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt,
4706                             u32 pmc, u64 *pdata)
4707{
4708        return kvm_pmu_read_pmc(emul_to_vcpu(ctxt), pmc, pdata);
4709}
4710
4711static void emulator_halt(struct x86_emulate_ctxt *ctxt)
4712{
4713        emul_to_vcpu(ctxt)->arch.halt_request = 1;
4714}
4715
4716static void emulator_get_fpu(struct x86_emulate_ctxt *ctxt)
4717{
4718        preempt_disable();
4719        kvm_load_guest_fpu(emul_to_vcpu(ctxt));
4720        /*
4721         * CR0.TS in hardware may reflect the host FPU state, not the guest
4722         * FPU state, so it may be set here; clear it before emulating FPU accesses.
4723         */
4724        clts();
4725}
4726
4727static void emulator_put_fpu(struct x86_emulate_ctxt *ctxt)
4728{
4729        preempt_enable();
4730}
4731
4732static int emulator_intercept(struct x86_emulate_ctxt *ctxt,
4733                              struct x86_instruction_info *info,
4734                              enum x86_intercept_stage stage)
4735{
4736        return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage);
4737}
4738
4739static void emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
4740                               u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
4741{
4742        kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx);
4743}
4744
4745static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg)
4746{
4747        return kvm_register_read(emul_to_vcpu(ctxt), reg);
4748}
4749
4750static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulong val)
4751{
4752        kvm_register_write(emul_to_vcpu(ctxt), reg, val);
4753}
4754
4755static const struct x86_emulate_ops emulate_ops = {
4756        .read_gpr            = emulator_read_gpr,
4757        .write_gpr           = emulator_write_gpr,
4758        .read_std            = kvm_read_guest_virt_system,
4759        .write_std           = kvm_write_guest_virt_system,
4760        .fetch               = kvm_fetch_guest_virt,
4761        .read_emulated       = emulator_read_emulated,
4762        .write_emulated      = emulator_write_emulated,
4763        .cmpxchg_emulated    = emulator_cmpxchg_emulated,
4764        .invlpg              = emulator_invlpg,
4765        .pio_in_emulated     = emulator_pio_in_emulated,
4766        .pio_out_emulated    = emulator_pio_out_emulated,
4767        .get_segment         = emulator_get_segment,
4768        .set_segment         = emulator_set_segment,
4769        .get_cached_segment_base = emulator_get_cached_segment_base,
4770        .get_gdt             = emulator_get_gdt,
4771        .get_idt             = emulator_get_idt,
4772        .set_gdt             = emulator_set_gdt,
4773        .set_idt             = emulator_set_idt,
4774        .get_cr              = emulator_get_cr,
4775        .set_cr              = emulator_set_cr,
4776        .set_rflags          = emulator_set_rflags,
4777        .cpl                 = emulator_get_cpl,
4778        .get_dr              = emulator_get_dr,
4779        .set_dr              = emulator_set_dr,
4780        .set_msr             = emulator_set_msr,
4781        .get_msr             = emulator_get_msr,
4782        .read_pmc            = emulator_read_pmc,
4783        .halt                = emulator_halt,
4784        .wbinvd              = emulator_wbinvd,
4785        .fix_hypercall       = emulator_fix_hypercall,
4786        .get_fpu             = emulator_get_fpu,
4787        .put_fpu             = emulator_put_fpu,
4788        .intercept           = emulator_intercept,
4789        .get_cpuid           = emulator_get_cpuid,
4790};
4791
4792static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
4793{
4794        u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu, mask);
4795        /*
4796         * An sti; sti sequence only disables interrupts for the first
4797         * instruction.  So, if the last instruction, be it emulated or
4798         * not, left the system with the INT_STI flag enabled, it
4799         * means that the last instruction was an sti.  We should not
4800         * leave the flag on in this case.  The same goes for mov ss.
4801         */
4802        if (!(int_shadow & mask))
4803                kvm_x86_ops->set_interrupt_shadow(vcpu, mask);
4804}
4805
4806static void inject_emulated_exception(struct kvm_vcpu *vcpu)
4807{
4808        struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
4809        if (ctxt->exception.vector == PF_VECTOR)
4810                kvm_propagate_fault(vcpu, &ctxt->exception);
4811        else if (ctxt->exception.error_code_valid)
4812                kvm_queue_exception_e(vcpu, ctxt->exception.vector,
4813                                      ctxt->exception.error_code);
4814        else
4815                kvm_queue_exception(vcpu, ctxt->exception.vector);
4816}
4817
4818static void init_decode_cache(struct x86_emulate_ctxt *ctxt)
4819{
4820        memset(&ctxt->opcode_len, 0,
4821               (void *)&ctxt->_regs - (void *)&ctxt->opcode_len);
4822
4823        ctxt->fetch.start = 0;
4824        ctxt->fetch.end = 0;
4825        ctxt->io_read.pos = 0;
4826        ctxt->io_read.end = 0;
4827        ctxt->mem_read.pos = 0;
4828        ctxt->mem_read.end = 0;
4829}
4830
4831static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
4832{
4833        struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
4834        int cs_db, cs_l;
4835
4836        kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
4837
4838        ctxt->eflags = kvm_get_rflags(vcpu);
4839        ctxt->eip = kvm_rip_read(vcpu);
4840        ctxt->mode = (!is_protmode(vcpu))               ? X86EMUL_MODE_REAL :
4841                     (ctxt->eflags & X86_EFLAGS_VM)     ? X86EMUL_MODE_VM86 :
4842                     cs_l                               ? X86EMUL_MODE_PROT64 :
4843                     cs_db                              ? X86EMUL_MODE_PROT32 :
4844                                                          X86EMUL_MODE_PROT16;
4845        ctxt->guest_mode = is_guest_mode(vcpu);
4846
4847        init_decode_cache(ctxt);
4848        vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
4849}
4850
4851int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
4852{
4853        struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
4854        int ret;
4855
4856        init_emulate_ctxt(vcpu);
4857
4858        ctxt->op_bytes = 2;
4859        ctxt->ad_bytes = 2;
4860        ctxt->_eip = ctxt->eip + inc_eip;
4861        ret = emulate_int_real(ctxt, irq);
4862
4863        if (ret != X86EMUL_CONTINUE)
4864                return EMULATE_FAIL;
4865
4866        ctxt->eip = ctxt->_eip;
4867        kvm_rip_write(vcpu, ctxt->eip);
4868        kvm_set_rflags(vcpu, ctxt->eflags);
4869
4870        if (irq == NMI_VECTOR)
4871                vcpu->arch.nmi_pending = 0;
4872        else
4873                vcpu->arch.interrupt.pending = false;
4874
4875        return EMULATE_DONE;
4876}
4877EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
4878
4879static int handle_emulation_failure(struct kvm_vcpu *vcpu)
4880{
4881        int r = EMULATE_DONE;
4882
4883        ++vcpu->stat.insn_emulation_fail;
4884        trace_kvm_emulate_insn_failed(vcpu);
4885        if (!is_guest_mode(vcpu)) {
4886                vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
4887                vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
4888                vcpu->run->internal.ndata = 0;
4889                r = EMULATE_FAIL;
4890        }
4891        kvm_queue_exception(vcpu, UD_VECTOR);
4892
4893        return r;
4894}
4895
4896static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2,
4897                                  bool write_fault_to_shadow_pgtable,
4898                                  int emulation_type)
4899{
4900        gpa_t gpa = cr2;
4901        pfn_t pfn;
4902
4903        if (emulation_type & EMULTYPE_NO_REEXECUTE)
4904                return false;
4905
4906        if (!vcpu->arch.mmu.direct_map) {
4907                /*
4908                 * Write permission should be allowed since only
4909                 * write accesses need to be emulated.
4910                 */
4911                gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
4912
4913                /*
4914                 * If the mapping is invalid in the guest, let the CPU
4915                 * retry it to generate a fault.
4916                 */
4917                if (gpa == UNMAPPED_GVA)
4918                        return true;
4919        }
4920
4921        /*
4922         * Do not retry an unhandleable instruction if it faults on
4923         * read-only host memory; otherwise it will go into an infinite loop:
4924         * retry instruction -> write #PF -> emulation fail -> retry
4925         * instruction -> ...
4926         */
4927        pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa));
4928
4929        /*
4930         * If the instruction faulted on an error pfn, it cannot be fixed;
4931         * report the error to userspace.
4932         */
4933        if (is_error_noslot_pfn(pfn))
4934                return false;
4935
4936        kvm_release_pfn_clean(pfn);
4937
4938        /* The instruction can safely be re-executed with a direct MMU. */
4939        if (vcpu->arch.mmu.direct_map) {
4940                unsigned int indirect_shadow_pages;
4941
4942                spin_lock(&vcpu->kvm->mmu_lock);
4943                indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages;
4944                spin_unlock(&vcpu->kvm->mmu_lock);
4945
4946                if (indirect_shadow_pages)
4947                        kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
4948
4949                return true;
4950        }
4951
4952        /*
4953         * If emulation failed due to an access to a shadowed page table,
4954         * try to unshadow the page and re-enter the guest to let the CPU
4955         * execute the instruction.
4956         */
4957        kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
4958
4959        /*
4960         * If the access faults on its own page table, it cannot
4961         * be fixed by unprotecting the shadow page, and it should
4962         * be reported to userspace.
4963         */
4964        return !write_fault_to_shadow_pgtable;
4965}
4966
4967static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
4968                              unsigned long cr2,  int emulation_type)
4969{
4970        struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4971        unsigned long last_retry_eip, last_retry_addr, gpa = cr2;
4972
4973        last_retry_eip = vcpu->arch.last_retry_eip;
4974        last_retry_addr = vcpu->arch.last_retry_addr;
4975
4976        /*
4977         * If the emulation was caused by a #PF and the faulting instruction
4978         * is not a page-table-writing instruction, the VM exit was caused
4979         * by a write-protected shadow page; we can zap the shadow page and
4980         * retry the instruction directly.
4981         *
4982         * Note: if the guest uses a non-page-table-modifying instruction
4983         * on the PDE that points to the instruction, then we will unmap
4984         * the instruction and go into an infinite loop.  So we cache the
4985         * last retried eip and the last fault address; if we meet the same
4986         * eip and address again, we can break out of the potential infinite
4987         * loop.
4988         */
4989        vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0;
4990
4991        if (!(emulation_type & EMULTYPE_RETRY))
4992                return false;
4993
4994        if (x86_page_table_writing_insn(ctxt))
4995                return false;
4996
4997        if (ctxt->eip == last_retry_eip && last_retry_addr == cr2)
4998                return false;
4999
5000        vcpu->arch.last_retry_eip = ctxt->eip;
5001        vcpu->arch.last_retry_addr = cr2;
5002
5003        if (!vcpu->arch.mmu.direct_map)
5004                gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
5005
5006        kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
5007
5008        return true;
5009}
5010
5011static int complete_emulated_mmio(struct kvm_vcpu *vcpu);
5012static int complete_emulated_pio(struct kvm_vcpu *vcpu);
5013
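    /*
     * For illustration: dr7 == 0x2 (G0 set, R/W0 == LEN0 == 0) describes an
     * execute breakpoint (type 0) in db[0]; if db[0] == addr, bit 0 is set
     * in the returned dr6 value.
     */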
5014static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7,
5015                                unsigned long *db)
5016{
5017        u32 dr6 = 0;
5018        int i;
5019        u32 enable, rwlen;
5020
5021        enable = dr7;
5022        rwlen = dr7 >> 16;
5023        for (i = 0; i < 4; i++, enable >>= 2, rwlen >>= 4)
5024                if ((enable & 3) && (rwlen & 15) == type && db[i] == addr)
5025                        dr6 |= (1 << i);
5026        return dr6;
5027}
5028
5029static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, int *r)
5030{
5031        struct kvm_run *kvm_run = vcpu->run;
5032
5033        /*
5034         * Use the "raw" value to see if TF was passed to the processor.
5035         * Note that the new value of the flags has not been saved yet.
5036         *
5037         * This is correct even for TF set by the guest, because "the
5038         * processor will not generate this exception after the instruction
5039         * that sets the TF flag".
5040         */
5041        unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
5042
5043        if (unlikely(rflags & X86_EFLAGS_TF)) {
5044                if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
5045                        kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1;
5046                        kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip;
5047                        kvm_run->debug.arch.exception = DB_VECTOR;
5048                        kvm_run->exit_reason = KVM_EXIT_DEBUG;
5049                        *r = EMULATE_USER_EXIT;
5050                } else {
5051                        vcpu->arch.emulate_ctxt.eflags &= ~X86_EFLAGS_TF;
5052                        /*
5053                         * "Certain debug exceptions may clear bits 0-3.  The
5054                         * remaining contents of the DR6 register are never
5055                         * cleared by the processor".
5056                         */
5057                        vcpu->arch.dr6 &= ~15;
5058                        vcpu->arch.dr6 |= DR6_BS;
5059                        kvm_queue_exception(vcpu, DB_VECTOR);
5060                }
5061        }
5062}
5063
5064static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
5065{
5066        struct kvm_run *kvm_run = vcpu->run;
5067        unsigned long eip = vcpu->arch.emulate_ctxt.eip;
5068        u32 dr6 = 0;
5069
5070        if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) &&
5071            (vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) {
5072                dr6 = kvm_vcpu_check_hw_bp(eip, 0,
5073                                           vcpu->arch.guest_debug_dr7,
5074                                           vcpu->arch.eff_db);
5075
5076                if (dr6 != 0) {
5077                        kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1;
5078                        kvm_run->debug.arch.pc = kvm_rip_read(vcpu) +
5079                                get_segment_base(vcpu, VCPU_SREG_CS);
5080
5081                        kvm_run->debug.arch.exception = DB_VECTOR;
5082                        kvm_run->exit_reason = KVM_EXIT_DEBUG;
5083                        *r = EMULATE_USER_EXIT;
5084                        return true;
5085                }
5086        }
5087
5088        if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK)) {
5089                dr6 = kvm_vcpu_check_hw_bp(eip, 0,
5090                                           vcpu->arch.dr7,
5091                                           vcpu->arch.db);
5092
5093                if (dr6 != 0) {
5094                        vcpu->arch.dr6 &= ~15;
5095                        vcpu->arch.dr6 |= dr6;
5096                        kvm_queue_exception(vcpu, DB_VECTOR);
5097                        *r = EMULATE_DONE;
5098                        return true;
5099                }
5100        }
5101
5102        return false;
5103}
5104
5105int x86_emulate_instruction(struct kvm_vcpu *vcpu,
5106                            unsigned long cr2,
5107                            int emulation_type,
5108                            void *insn,
5109                            int insn_len)
5110{
5111        int r;
5112        struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
5113        bool writeback = true;
5114        bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
5115
5116        /*
5117         * Clear write_fault_to_shadow_pgtable here to ensure it is
5118         * never reused.
5119         */
5120        vcpu->arch.write_fault_to_shadow_pgtable = false;
5121        kvm_clear_exception_queue(vcpu);
5122
5123        if (!(emulation_type & EMULTYPE_NO_DECODE)) {
5124                init_emulate_ctxt(vcpu);
5125
5126                /*
5127                 * We will reenter on the same instruction since
5128                 * we do not set complete_userspace_io.  This does not
5129                 * handle watchpoints yet; those would be handled in
5130                 * the emulate_ops.
5131                 */
5132                if (kvm_vcpu_check_breakpoint(vcpu, &r))
5133                        return r;
5134
5135                ctxt->interruptibility = 0;
5136                ctxt->have_exception = false;
5137                ctxt->perm_ok = false;
5138
5139                ctxt->ud = emulation_type & EMULTYPE_TRAP_UD;
5140
5141                r = x86_decode_insn(ctxt, insn, insn_len);
5142
5143                trace_kvm_emulate_insn_start(vcpu);
5144                ++vcpu->stat.insn_emulation;
5145                if (r != EMULATION_OK)  {
5146                        if (emulation_type & EMULTYPE_TRAP_UD)
5147                                return EMULATE_FAIL;
5148                        if (reexecute_instruction(vcpu, cr2, write_fault_to_spt,
5149                                                emulation_type))
5150                                return EMULATE_DONE;
5151                        if (emulation_type & EMULTYPE_SKIP)
5152                                return EMULATE_FAIL;
5153                        return handle_emulation_failure(vcpu);
5154                }
5155        }
5156
5157        if (emulation_type & EMULTYPE_SKIP) {
5158                kvm_rip_write(vcpu, ctxt->_eip);
5159                return EMULATE_DONE;
5160        }
5161
5162        if (retry_instruction(ctxt, cr2, emulation_type))
5163                return EMULATE_DONE;
5164
5165        /* This is needed for the vmware backdoor interface to work, since it
5166           changes register values during the IO operation. */
5167        if (vcpu->arch.emulate_regs_need_sync_from_vcpu) {
5168                vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
5169                emulator_invalidate_register_cache(ctxt);
5170        }
5171
5172restart:
5173        r = x86_emulate_insn(ctxt);
5174
5175        if (r == EMULATION_INTERCEPTED)
5176                return EMULATE_DONE;
5177
5178        if (r == EMULATION_FAILED) {
5179                if (reexecute_instruction(vcpu, cr2, write_fault_to_spt,
5180                                        emulation_type))
5181                        return EMULATE_DONE;
5182
5183                return handle_emulation_failure(vcpu);
5184        }
5185
5186        if (ctxt->have_exception) {
5187                inject_emulated_exception(vcpu);
5188                r = EMULATE_DONE;
5189        } else if (vcpu->arch.pio.count) {
5190                if (!vcpu->arch.pio.in) {
5191                        /* FIXME: return into emulator if single-stepping.  */
5192                        vcpu->arch.pio.count = 0;
5193                } else {
5194                        writeback = false;
5195                        vcpu->arch.complete_userspace_io = complete_emulated_pio;
5196                }
5197                r = EMULATE_USER_EXIT;
5198        } else if (vcpu->mmio_needed) {
5199                if (!vcpu->mmio_is_write)
5200                        writeback = false;
5201                r = EMULATE_USER_EXIT;
5202                vcpu->arch.complete_userspace_io = complete_emulated_mmio;
5203        } else if (r == EMULATION_RESTART)
5204                goto restart;
5205        else
5206                r = EMULATE_DONE;
5207
5208        if (writeback) {
5209                toggle_interruptibility(vcpu, ctxt->interruptibility);
5210                kvm_make_request(KVM_REQ_EVENT, vcpu);
5211                vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
5212                kvm_rip_write(vcpu, ctxt->eip);
5213                if (r == EMULATE_DONE)
5214                        kvm_vcpu_check_singlestep(vcpu, &r);
5215                kvm_set_rflags(vcpu, ctxt->eflags);
5216        } else
5217                vcpu->arch.emulate_regs_need_sync_to_vcpu = true;
5218
5219        return r;
5220}
5221EXPORT_SYMBOL_GPL(x86_emulate_instruction);
5222
5223int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port)
5224{
5225        unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX);
5226        int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt,
5227                                            size, port, &val, 1);
5228        /* do not return to emulator after return from userspace */
5229        vcpu->arch.pio.count = 0;
5230        return ret;
5231}
5232EXPORT_SYMBOL_GPL(kvm_fast_pio_out);
5233
5234static void tsc_bad(void *info)
5235{
5236        __this_cpu_write(cpu_tsc_khz, 0);
5237}
5238
5239static void tsc_khz_changed(void *data)
5240{
5241        struct cpufreq_freqs *freq = data;
5242        unsigned long khz = 0;
5243
5244        if (data)
5245                khz = freq->new;
5246        else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
5247                khz = cpufreq_quick_get(raw_smp_processor_id());
5248        if (!khz)
5249                khz = tsc_khz;
5250        __this_cpu_write(cpu_tsc_khz, khz);
5251}
5252
5253static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
5254                                     void *data)
5255{
5256        struct cpufreq_freqs *freq = data;
5257        struct kvm *kvm;
5258        struct kvm_vcpu *vcpu;
5259        int i, send_ipi = 0;
5260
5261        /*
5262         * We allow guests to temporarily run on slowing clocks,
5263         * provided we notify them after, or to run on accelerating
5264         * clocks, provided we notify them before.  Thus time never
5265         * goes backwards.
5266         *
5267         * However, we have a problem.  We can't atomically update
5268         * the frequency of a given CPU from this function; it is
5269         * merely a notifier, which can be called from any CPU.
5270         * Changing the TSC frequency at arbitrary points in time
5271         * requires a recomputation of local variables related to
5272         * the TSC for each VCPU.  We must flag these local variables
5273         * to be updated and be sure the update takes place with the
5274         * new frequency before any guests proceed.
5275         *
5276         * Unfortunately, the combination of hotplug CPU and frequency
5277         * change creates an intractable locking scenario; the order
5278         * of when these callouts happen is undefined with respect to
5279         * CPU hotplug, and they can race with each other.  As such,
5280         * merely setting per_cpu(cpu_tsc_khz) = X during a hotadd is
5281         * undefined; you can actually have a CPU frequency change take
5282         * place in between the computation of X and the setting of the
5283         * variable.  To protect against this problem, all updates of
5284         * the per_cpu tsc_khz variable are done in an interrupt
5285         * protected IPI, and all callers wishing to update the value
5286         * must wait for a synchronous IPI to complete (which is trivial
5287         * if the caller is on the CPU already).  This establishes the
5288         * necessary total order on variable updates.
5289         *
5290         * Note that because a guest time update may take place
5291         * anytime after the setting of the VCPU's request bit, the
5292         * correct TSC value must be set before the request.  However,
5293         * to ensure the update actually makes it to any guest which
5294         * starts running in hardware virtualization between the set
5295         * and the acquisition of the spinlock, we must also ping the
5296         * CPU after setting the request bit.
5297         *
5298         */
5299
5300        if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
5301                return 0;
5302        if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
5303                return 0;
5304
5305        smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
5306
5307        spin_lock(&kvm_lock);
5308        list_for_each_entry(kvm, &vm_list, vm_list) {
5309                kvm_for_each_vcpu(i, vcpu, kvm) {
5310                        if (vcpu->cpu != freq->cpu)
5311                                continue;
5312                        kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
5313                        if (vcpu->cpu != smp_processor_id())
5314                                send_ipi = 1;
5315                }
5316        }
5317        spin_unlock(&kvm_lock);
5318
5319        if (freq->old < freq->new && send_ipi) {
5320                /*
5321                 * We upscale the frequency.  Must make sure the guest
5322                 * doesn't see old kvmclock values while running with
5323                 * the new frequency, otherwise we risk the guest seeing
5324                 * time go backwards.
5325                 *
5326                 * In case we update the frequency for another cpu
5327                 * (which might be in guest context) send an interrupt
5328                 * to kick the cpu out of guest context.  Next time
5329                 * guest context is entered kvmclock will be updated,
5330                 * so the guest will not see stale values.
5331                 */
5332                smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
5333        }
5334        return 0;
5335}
5336
5337static struct notifier_block kvmclock_cpufreq_notifier_block = {
5338        .notifier_call  = kvmclock_cpufreq_notifier
5339};
5340
5341static int kvmclock_cpu_notifier(struct notifier_block *nfb,
5342                                        unsigned long action, void *hcpu)
5343{
5344        unsigned int cpu = (unsigned long)hcpu;
5345
5346        switch (action) {
5347                case CPU_ONLINE:
5348                case CPU_DOWN_FAILED:
5349                        smp_call_function_single(cpu, tsc_khz_changed, NULL, 1);
5350                        break;
5351                case CPU_DOWN_PREPARE:
5352                        smp_call_function_single(cpu, tsc_bad, NULL, 1);
5353                        break;
5354        }
5355        return NOTIFY_OK;
5356}
5357
5358static struct notifier_block kvmclock_cpu_notifier_block = {
5359        .notifier_call  = kvmclock_cpu_notifier,
5360        .priority = -INT_MAX
5361};
5362
5363static void kvm_timer_init(void)
5364{
5365        int cpu;
5366
5367        max_tsc_khz = tsc_khz;
5368        register_hotcpu_notifier(&kvmclock_cpu_notifier_block);
5369        if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
5370#ifdef CONFIG_CPU_FREQ
5371                struct cpufreq_policy policy;
5372                memset(&policy, 0, sizeof(policy));
5373                cpu = get_cpu();
5374                cpufreq_get_policy(&policy, cpu);
5375                if (policy.cpuinfo.max_freq)
5376                        max_tsc_khz = policy.cpuinfo.max_freq;
5377                put_cpu();
5378#endif
5379                cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
5380                                          CPUFREQ_TRANSITION_NOTIFIER);
5381        }
5382        pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz);
5383        for_each_online_cpu(cpu)
5384                smp_call_function_single(cpu, tsc_khz_changed, NULL, 1);
5385}
5386
5387static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
5388
5389int kvm_is_in_guest(void)
5390{
5391        return __this_cpu_read(current_vcpu) != NULL;
5392}
5393
5394static int kvm_is_user_mode(void)
5395{
5396        int user_mode = 3;
5397
5398        if (__this_cpu_read(current_vcpu))
5399                user_mode = kvm_x86_ops->get_cpl(__this_cpu_read(current_vcpu));
5400
5401        return user_mode != 0;
5402}
5403
5404static unsigned long kvm_get_guest_ip(void)
5405{
5406        unsigned long ip = 0;
5407
5408        if (__this_cpu_read(current_vcpu))
5409                ip = kvm_rip_read(__this_cpu_read(current_vcpu));
5410
5411        return ip;
5412}
5413
5414static struct perf_guest_info_callbacks kvm_guest_cbs = {
5415        .is_in_guest            = kvm_is_in_guest,
5416        .is_user_mode           = kvm_is_user_mode,
5417        .get_guest_ip           = kvm_get_guest_ip,
5418};
5419
5420void kvm_before_handle_nmi(struct kvm_vcpu *vcpu)
5421{
5422        __this_cpu_write(current_vcpu, vcpu);
5423}
5424EXPORT_SYMBOL_GPL(kvm_before_handle_nmi);
5425
5426void kvm_after_handle_nmi(struct kvm_vcpu *vcpu)
5427{
5428        __this_cpu_write(current_vcpu, NULL);
5429}
5430EXPORT_SYMBOL_GPL(kvm_after_handle_nmi);
5431
5432static void kvm_set_mmio_spte_mask(void)
5433{
5434        u64 mask;
5435        int maxphyaddr = boot_cpu_data.x86_phys_bits;
5436
5437        /*
5438         * Set the reserved bits and the present bit of a paging-structure
5439         * entry to generate a page fault with PFERR.RSVD = 1.
5440         */
5441        /* Mask the reserved physical address bits. */
5442        mask = ((1ull << (51 - maxphyaddr + 1)) - 1) << maxphyaddr;
5443
5444        /* Bit 62 is always reserved for a 32-bit host. */
5445        mask |= 0x3ull << 62;
5446
5447        /* Set the present bit. */
5448        mask |= 1ull;
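            /*
             * For illustration, with maxphyaddr == 46 the mask now has bits
             * 46-51, bits 62-63 and bit 0 set, i.e. 0xc00fc00000000001.
             */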
5449
5450#ifdef CONFIG_X86_64
5451        /*
5452         * If no reserved physical address bits are available (maxphyaddr
5453         * is 52), clear the present bit to disable MMIO page faults.
5454         */
5455        if (maxphyaddr == 52)
5456                mask &= ~1ull;
5457#endif
5458
5459        kvm_mmu_set_mmio_spte_mask(mask);
5460}
5461
5462#ifdef CONFIG_X86_64
5463static void pvclock_gtod_update_fn(struct work_struct *work)
5464{
5465        struct kvm *kvm;
5466
5467        struct kvm_vcpu *vcpu;
5468        int i;
5469
5470        spin_lock(&kvm_lock);
5471        list_for_each_entry(kvm, &vm_list, vm_list)
5472                kvm_for_each_vcpu(i, vcpu, kvm)
5473                        set_bit(KVM_REQ_MASTERCLOCK_UPDATE, &vcpu->requests);
5474        atomic_set(&kvm_guest_has_master_clock, 0);
5475        spin_unlock(&kvm_lock);
5476}
5477
5478static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
5479
5480/*
5481 * Notification about pvclock gtod data update.
5482 */
5483static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
5484                               void *priv)
5485{
5486        struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
5487        struct timekeeper *tk = priv;
5488
5489        update_pvclock_gtod(tk);
5490
5491        /* disable master clock if host does not trust, or does not
5492         * use, TSC clocksource
5493         */
5494        if (gtod->clock.vclock_mode != VCLOCK_TSC &&
5495            atomic_read(&kvm_guest_has_master_clock) != 0)
5496                queue_work(system_long_wq, &pvclock_gtod_work);
5497
5498        return 0;
5499}
5500
5501static struct notifier_block pvclock_gtod_notifier = {
5502        .notifier_call = pvclock_gtod_notify,
5503};
5504#endif
5505
5506int kvm_arch_init(void *opaque)
5507{
5508        int r;
5509        struct kvm_x86_ops *ops = opaque;
5510
5511        if (kvm_x86_ops) {
5512                printk(KERN_ERR "kvm: already loaded the other module\n");
5513                r = -EEXIST;
5514                goto out;
5515        }
5516
5517        if (!ops->cpu_has_kvm_support()) {
5518                printk(KERN_ERR "kvm: no hardware support\n");
5519                r = -EOPNOTSUPP;
5520                goto out;
5521        }
5522        if (ops->disabled_by_bios()) {
5523                printk(KERN_ERR "kvm: disabled by bios\n");
5524                r = -EOPNOTSUPP;
5525                goto out;
5526        }
5527
5528        r = -ENOMEM;
5529        shared_msrs = alloc_percpu(struct kvm_shared_msrs);
5530        if (!shared_msrs) {
5531                printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n");
5532                goto out;
5533        }
5534
5535        r = kvm_mmu_module_init();
5536        if (r)
5537                goto out_free_percpu;
5538
5539        kvm_set_mmio_spte_mask();
5540        kvm_init_msr_list();
5541
5542        kvm_x86_ops = ops;
5543        kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
5544                        PT_DIRTY_MASK, PT64_NX_MASK, 0);
5545
5546        kvm_timer_init();
5547
5548        perf_register_guest_info_callbacks(&kvm_guest_cbs);
5549
5550        if (cpu_has_xsave)
5551                host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
5552
5553        kvm_lapic_init();
5554#ifdef CONFIG_X86_64
5555        pvclock_gtod_register_notifier(&pvclock_gtod_notifier);
5556#endif
5557
5558        return 0;
5559
5560out_free_percpu:
5561        free_percpu(shared_msrs);
5562out:
5563        return r;
5564}
5565
5566void kvm_arch_exit(void)
5567{
5568        perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
5569
5570        if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
5571                cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
5572                                            CPUFREQ_TRANSITION_NOTIFIER);
5573        unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block);
5574#ifdef CONFIG_X86_64
5575        pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
5576#endif
5577        kvm_x86_ops = NULL;
5578        kvm_mmu_module_exit();
5579        free_percpu(shared_msrs);
5580}
5581
5582int kvm_emulate_halt(struct kvm_vcpu *vcpu)
5583{
5584        ++vcpu->stat.halt_exits;
5585        if (irqchip_in_kernel(vcpu->kvm)) {
5586                vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
5587                return 1;
5588        } else {
5589                vcpu->run->exit_reason = KVM_EXIT_HLT;
5590                return 0;
5591        }
5592}
5593EXPORT_SYMBOL_GPL(kvm_emulate_halt);
5594
5595int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
5596{
5597        u64 param, ingpa, outgpa, ret;
5598        uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0;
5599        bool fast, longmode;
5600        int cs_db, cs_l;
5601
5602        /*
5603         * Per the Hyper-V spec, a hypercall generates #UD from non-zero
5604         * CPL or from real mode.
5605         */
5606        if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) {
5607                kvm_queue_exception(vcpu, UD_VECTOR);
5608                return 0;
5609        }
5610
5611        kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
5612        longmode = is_long_mode(vcpu) && cs_l == 1;
5613
5614        if (!longmode) {
5615                param = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) |
5616                        (kvm_register_read(vcpu, VCPU_REGS_RAX) & 0xffffffff);
5617                ingpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RBX) << 32) |
5618                        (kvm_register_read(vcpu, VCPU_REGS_RCX) & 0xffffffff);
5619                outgpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDI) << 32) |
5620                        (kvm_register_read(vcpu, VCPU_REGS_RSI) & 0xffffffff);
5621        }
5622#ifdef CONFIG_X86_64
5623        else {
5624                param = kvm_register_read(vcpu, VCPU_REGS_RCX);
5625                ingpa = kvm_register_read(vcpu, VCPU_REGS_RDX);
5626                outgpa = kvm_register_read(vcpu, VCPU_REGS_R8);
5627        }
5628#endif
5629
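            /*
             * Hypercall input value layout, as decoded below: bits 0-15 are
             * the call code, bit 16 the "fast" flag, bits 32-43 the rep count
             * and bits 48-59 the rep start index.
             */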
5630        code = param & 0xffff;
5631        fast = (param >> 16) & 0x1;
5632        rep_cnt = (param >> 32) & 0xfff;
5633        rep_idx = (param >> 48) & 0xfff;
5634
5635        trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa);
5636
5637        switch (code) {
5638        case HV_X64_HV_NOTIFY_LONG_SPIN_WAIT:
5639                kvm_vcpu_on_spin(vcpu);
5640                break;
5641        default:
5642                res = HV_STATUS_INVALID_HYPERCALL_CODE;
5643                break;
5644        }
5645
5646        ret = res | (((u64)rep_done & 0xfff) << 32);
5647        if (longmode) {
5648                kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
5649        } else {
5650                kvm_register_write(vcpu, VCPU_REGS_RDX, ret >> 32);
5651                kvm_register_write(vcpu, VCPU_REGS_RAX, ret & 0xffffffff);
5652        }
5653
5654        return 1;
5655}
5656
5657/*
5658 * kvm_pv_kick_cpu_op:  Kick a vcpu.
5659 *
5660 * @apicid - apicid of vcpu to be kicked.
5661 */
5662static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
5663{
5664        struct kvm_lapic_irq lapic_irq;
5665
5666        lapic_irq.shorthand = 0;
5667        lapic_irq.dest_mode = 0;
5668        lapic_irq.dest_id = apicid;
5669
5670        lapic_irq.delivery_mode = APIC_DM_REMRD;
5671        kvm_irq_delivery_to_apic(kvm, 0, &lapic_irq, NULL);
5672}
5673
5674int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
5675{
5676        unsigned long nr, a0, a1, a2, a3, ret;
5677        int r = 1;
5678
5679        if (kvm_hv_hypercall_enabled(vcpu->kvm))
5680                return kvm_hv_hypercall(vcpu);
5681
5682        nr = kvm_register_read(vcpu, VCPU_REGS_RAX);
5683        a0 = kvm_register_read(vcpu, VCPU_REGS_RBX);
5684        a1 = kvm_register_read(vcpu, VCPU_REGS_RCX);
5685        a2 = kvm_register_read(vcpu, VCPU_REGS_RDX);
5686        a3 = kvm_register_read(vcpu, VCPU_REGS_RSI);
5687
5688        trace_kvm_hypercall(nr, a0, a1, a2, a3);
5689
5690        if (!is_long_mode(vcpu)) {
5691                nr &= 0xFFFFFFFF;
5692                a0 &= 0xFFFFFFFF;
5693                a1 &= 0xFFFFFFFF;
5694                a2 &= 0xFFFFFFFF;
5695                a3 &= 0xFFFFFFFF;
5696        }
5697
5698        if (kvm_x86_ops->get_cpl(vcpu) != 0) {
5699                ret = -KVM_EPERM;
5700                goto out;
5701        }
5702
5703        switch (nr) {
5704        case KVM_HC_VAPIC_POLL_IRQ:
5705                ret = 0;
5706                break;
5707        case KVM_HC_KICK_CPU:
5708                kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1);
5709                ret = 0;
5710                break;
5711        default:
5712                ret = -KVM_ENOSYS;
5713                break;
5714        }
5715out:
5716        kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
5717        ++vcpu->stat.hypercalls;
5718        return r;
5719}
5720EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
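    /*
     * Guest-side sketch (illustrative only, not part of this file): a guest
     * would issue e.g. KVM_HC_KICK_CPU roughly as
     *
     *      long nr = KVM_HC_KICK_CPU;
     *      asm volatile("vmcall"
     *                   : "+a" (nr)
     *                   : "b" (flags), "c" (apicid)
     *                   : "memory");
     *
     * matching the RAX/RBX/RCX/RDX/RSI register convention read above.  (AMD
     * guests use vmmcall; the kvm_hypercall*() helpers in kvm_para.h patch in
     * the right instruction at runtime.)
     */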
5721
5722static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
5723{
5724        struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
5725        char instruction[3];
5726        unsigned long rip = kvm_rip_read(vcpu);
5727
5728        kvm_x86_ops->patch_hypercall(vcpu, instruction);
5729
5730        return emulator_write_emulated(ctxt, rip, instruction, 3, NULL);
5731}
5732
5733/*
5734 * Check if userspace requested an interrupt window, and that the
5735 * interrupt window is open.
5736 *
5737 * No need to exit to userspace if we already have an interrupt queued.
5738 */
5739static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
5740{
5741        return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) &&
5742                vcpu->run->request_interrupt_window &&
5743                kvm_arch_interrupt_allowed(vcpu));
5744}
5745
5746static void post_kvm_run_save(struct kvm_vcpu *vcpu)
5747{
5748        struct kvm_run *kvm_run = vcpu->run;
5749
5750        kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
5751        kvm_run->cr8 = kvm_get_cr8(vcpu);
5752        kvm_run->apic_base = kvm_get_apic_base(vcpu);
5753        if (irqchip_in_kernel(vcpu->kvm))
5754                kvm_run->ready_for_interrupt_injection = 1;
5755        else
5756                kvm_run->ready_for_interrupt_injection =
5757                        kvm_arch_interrupt_allowed(vcpu) &&
5758                        !kvm_cpu_has_interrupt(vcpu) &&
5759                        !kvm_event_needs_reinjection(vcpu);
5760}
5761
5762static void update_cr8_intercept(struct kvm_vcpu *vcpu)
5763{
5764        int max_irr, tpr;
5765
5766        if (!kvm_x86_ops->update_cr8_intercept)
5767                return;
5768
5769        if (!vcpu->arch.apic)
5770                return;
5771
5772        if (!vcpu->arch.apic->vapic_addr)
5773                max_irr = kvm_lapic_find_highest_irr(vcpu);
5774        else
5775                max_irr = -1;
5776
5777        if (max_irr != -1)
5778                max_irr >>= 4;
5779
5780        tpr = kvm_lapic_get_cr8(vcpu);
5781
5782        kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
5783}
5784
5785static void inject_pending_event(struct kvm_vcpu *vcpu)
5786{
5787        /* try to reinject previous events if any */
5788        if (vcpu->arch.exception.pending) {
5789                trace_kvm_inj_exception(vcpu->arch.exception.nr,
5790                                        vcpu->arch.exception.has_error_code,
5791                                        vcpu->arch.exception.error_code);
5792                kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
5793                                          vcpu->arch.exception.has_error_code,
5794                                          vcpu->arch.exception.error_code,
5795                                          vcpu->arch.exception.reinject);
5796                return;
5797        }
5798
5799        if (vcpu->arch.nmi_injected) {
5800                kvm_x86_ops->set_nmi(vcpu);
5801                return;
5802        }
5803
5804        if (vcpu->arch.interrupt.pending) {
5805                kvm_x86_ops->set_irq(vcpu);
5806                return;
5807        }
5808
5809        /* try to inject new event if pending */
5810        if (vcpu->arch.nmi_pending) {
5811                if (kvm_x86_ops->nmi_allowed(vcpu)) {
5812                        --vcpu->arch.nmi_pending;
5813                        vcpu->arch.nmi_injected = true;
5814                        kvm_x86_ops->set_nmi(vcpu);
5815                }
5816        } else if (kvm_cpu_has_injectable_intr(vcpu)) {
5817                if (kvm_x86_ops->interrupt_allowed(vcpu)) {
5818                        kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
5819                                            false);
5820                        kvm_x86_ops->set_irq(vcpu);
5821                }
5822        }
5823}
5824
5825static void process_nmi(struct kvm_vcpu *vcpu)
5826{
5827        unsigned limit = 2;
5828
5829        /*
5830         * x86 is limited to one NMI running, and one NMI pending after it.
5831         * If an NMI is already in progress, limit further NMIs to just one.
5832         * Otherwise, allow two (and we'll inject the first one immediately).
5833         */
5834        if (kvm_x86_ops->get_nmi_mask(vcpu) || vcpu->arch.nmi_injected)
5835                limit = 1;
5836
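            /*
             * For illustration: with no NMI in flight (limit == 2) and
             * nmi_queued == 3, nmi_pending becomes min(0 + 3, 2) == 2.
             */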
5837        vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
5838        vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit);
5839        kvm_make_request(KVM_REQ_EVENT, vcpu);
5840}
5841
5842static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
5843{
5844        u64 eoi_exit_bitmap[4];
5845        u32 tmr[8];
5846
5847        if (!kvm_apic_hw_enabled(vcpu->arch.apic))
5848                return;
5849
5850        memset(eoi_exit_bitmap, 0, sizeof(eoi_exit_bitmap));
5851        memset(tmr, 0, sizeof(tmr));
5852
5853        kvm_ioapic_scan_entry(vcpu, eoi_exit_bitmap, tmr);
5854        kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap);
5855        kvm_apic_update_tmr(vcpu, tmr);
5856}
5857
5858/*
5859 * Returns 1 to let __vcpu_run() continue the guest execution loop without
5860 * exiting to userspace.  Otherwise, the value will be returned to
5861 * userspace.
5862 */
5863static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5864{
5865        int r;
5866        bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
5867                vcpu->run->request_interrupt_window;
5868        bool req_immediate_exit = false;
5869
5870        if (vcpu->requests) {
5871                if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
5872                        kvm_mmu_unload(vcpu);
5873                if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
5874                        __kvm_migrate_timers(vcpu);
5875                if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu))
5876                        kvm_gen_update_masterclock(vcpu->kvm);
5877                if (kvm_check_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu))
5878                        kvm_gen_kvmclock_update(vcpu);
5879                if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
5880                        r = kvm_guest_time_update(vcpu);
5881                        if (unlikely(r))
5882                                goto out;
5883                }
5884                if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
5885                        kvm_mmu_sync_roots(vcpu);
5886                if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
5887                        kvm_x86_ops->tlb_flush(vcpu);
5888                if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
5889                        vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
5890                        r = 0;
5891                        goto out;
5892                }
5893                if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
5894                        vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
5895                        r = 0;
5896                        goto out;
5897                }
5898                if (kvm_check_request(KVM_REQ_DEACTIVATE_FPU, vcpu)) {
5899                        vcpu->fpu_active = 0;
5900                        kvm_x86_ops->fpu_deactivate(vcpu);
5901                }
5902                if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) {
5903                        /* Page is swapped out. Do synthetic halt */
5904                        vcpu->arch.apf.halted = true;
5905                        r = 1;
5906                        goto out;
5907                }
5908                if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
5909                        record_steal_time(vcpu);
5910                if (kvm_check_request(KVM_REQ_NMI, vcpu))
5911                        process_nmi(vcpu);
5912                if (kvm_check_request(KVM_REQ_PMU, vcpu))
5913                        kvm_handle_pmu_event(vcpu);
5914                if (kvm_check_request(KVM_REQ_PMI, vcpu))
5915                        kvm_deliver_pmi(vcpu);
5916                if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu))
5917                        vcpu_scan_ioapic(vcpu);
5918        }
5919
5920        if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
5921                kvm_apic_accept_events(vcpu);
5922                if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
5923                        r = 1;
5924                        goto out;
5925                }
5926
5927                inject_pending_event(vcpu);
5928
5929                /* enable NMI/IRQ window open exits if needed */
5930                if (vcpu->arch.nmi_pending)
5931                        req_immediate_exit =
5932                                kvm_x86_ops->enable_nmi_window(vcpu) != 0;
5933                else if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
5934                        req_immediate_exit =
5935                                kvm_x86_ops->enable_irq_window(vcpu) != 0;
5936
5937                if (kvm_lapic_enabled(vcpu)) {
5938                        /*
5939                         * Update architecture specific hints for APIC
5940                         * virtual interrupt delivery.
5941                         */
5942                        if (kvm_x86_ops->hwapic_irr_update)
5943                                kvm_x86_ops->hwapic_irr_update(vcpu,
5944                                        kvm_lapic_find_highest_irr(vcpu));
5945                        update_cr8_intercept(vcpu);
5946                        kvm_lapic_sync_to_vapic(vcpu);
5947                }
5948        }
5949
5950        r = kvm_mmu_reload(vcpu);
5951        if (unlikely(r))
5952                goto cancel_injection;
5954
5955        preempt_disable();
5956
5957        kvm_x86_ops->prepare_guest_switch(vcpu);
5958        if (vcpu->fpu_active)
5959                kvm_load_guest_fpu(vcpu);
5960        kvm_load_guest_xcr0(vcpu);
5961
5962        vcpu->mode = IN_GUEST_MODE;
5963
5964        srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
5965
5966        /* We should set ->mode before checking ->requests;
5967         * see the comment in make_all_cpus_request().
5968         */
5969        smp_mb__after_srcu_read_unlock();
5970
5971        local_irq_disable();
5972
5973        if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests
5974            || need_resched() || signal_pending(current)) {
5975                vcpu->mode = OUTSIDE_GUEST_MODE;
5976                smp_wmb();
5977                local_irq_enable();
5978                preempt_enable();
5979                vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
5980                r = 1;
5981                goto cancel_injection;
5982        }
5983
5984        if (req_immediate_exit)
5985                smp_send_reschedule(vcpu->cpu);
5986
5987        kvm_guest_enter();
5988
5989        if (unlikely(vcpu->arch.switch_db_regs)) {
5990                set_debugreg(0, 7);
5991                set_debugreg(vcpu->arch.eff_db[0], 0);
5992                set_debugreg(vcpu->arch.eff_db[1], 1);
5993                set_debugreg(vcpu->arch.eff_db[2], 2);
5994                set_debugreg(vcpu->arch.eff_db[3], 3);
5995        }
5996
5997        trace_kvm_entry(vcpu->vcpu_id);
5998        kvm_x86_ops->run(vcpu);
5999
6000        /*
6001         * If the guest has used debug registers, at least dr7
6002         * will be disabled while returning to the host.
6003         * If we don't have active breakpoints in the host, we don't
6004         * care about the messed up debug address registers. But if
6005         * we have some of them active, restore the old state.
6006         */
6007        if (hw_breakpoint_active())
6008                hw_breakpoint_restore();
6009
6010        vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu,
6011                                                           native_read_tsc());
6012
6013        vcpu->mode = OUTSIDE_GUEST_MODE;
6014        smp_wmb();
6015
6016        /* Interrupts are enabled by handle_external_intr(). */
6017        kvm_x86_ops->handle_external_intr(vcpu);
6018
6019        ++vcpu->stat.exits;
6020
6021        /*
6022         * We must have an instruction between local_irq_enable() and
6023         * kvm_guest_exit(), so the timer interrupt isn't delayed by
6024         * the interrupt shadow.  The stat.exits increment will do nicely.
6025         * But we need to prevent reordering, hence this barrier():
6026         */
6027        barrier();
6028
6029        kvm_guest_exit();
6030
6031        preempt_enable();
6032
6033        vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
6034
6035        /*
6036         * Profile KVM exit RIPs:
6037         */
6038        if (unlikely(prof_on == KVM_PROFILING)) {
6039                unsigned long rip = kvm_rip_read(vcpu);
6040                profile_hit(KVM_PROFILING, (void *)rip);
6041        }
6042
6043        if (unlikely(vcpu->arch.tsc_always_catchup))
6044                kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
6045
6046        if (vcpu->arch.apic_attention)
6047                kvm_lapic_sync_from_vapic(vcpu);
6048
6049        r = kvm_x86_ops->handle_exit(vcpu);
6050        return r;
6051
6052cancel_injection:
6053        kvm_x86_ops->cancel_injection(vcpu);
6054        if (unlikely(vcpu->arch.apic_attention))
6055                kvm_lapic_sync_from_vapic(vcpu);
6056out:
6057        return r;
6058}
6059
6060
6061static int __vcpu_run(struct kvm_vcpu *vcpu)
6062{
6063        int r;
6064        struct kvm *kvm = vcpu->kvm;
6065
6066        vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
6067
6068        r = 1;
6069        while (r > 0) {
6070                if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
6071                    !vcpu->arch.apf.halted)
6072                        r = vcpu_enter_guest(vcpu);
6073                else {
6074                        srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
6075                        kvm_vcpu_block(vcpu);
6076                        vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
6077                        if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) {
6078                                kvm_apic_accept_events(vcpu);
6079                                switch(vcpu->arch.mp_state) {
6080                                case KVM_MP_STATE_HALTED:
6081                                        vcpu->arch.pv.pv_unhalted = false;
6082                                        vcpu->arch.mp_state =
6083                                                KVM_MP_STATE_RUNNABLE;
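                                        /* fall through */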
6084                                case KVM_MP_STATE_RUNNABLE:
6085                                        vcpu->arch.apf.halted = false;
6086                                        break;
6087                                case KVM_MP_STATE_INIT_RECEIVED:
6088                                        break;
6089                                default:
6090                                        r = -EINTR;
6091                                        break;
6092                                }
6093                        }
6094                }
6095
6096                if (r <= 0)
6097                        break;
6098
6099                clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
6100                if (kvm_cpu_has_pending_timer(vcpu))
6101                        kvm_inject_pending_timer_irqs(vcpu);
6102
6103                if (dm_request_for_irq_injection(vcpu)) {
6104                        r = -EINTR;
6105                        vcpu->run->exit_reason = KVM_EXIT_INTR;
6106                        ++vcpu->stat.request_irq_exits;
6107                }
6108
6109                kvm_check_async_pf_completion(vcpu);
6110
6111                if (signal_pending(current)) {
6112                        r = -EINTR;
6113                        vcpu->run->exit_reason = KVM_EXIT_INTR;
6114                        ++vcpu->stat.signal_exits;
6115                }
6116                if (need_resched()) {
6117                        srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
6118                        cond_resched();
6119                        vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
6120                }
6121        }
6122
6123        srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
6124
6125        return r;
6126}
6127
6128static inline int complete_emulated_io(struct kvm_vcpu *vcpu)
6129{
6130        int r;
6131        vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
6132        r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
6133        srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
6134        if (r != EMULATE_DONE)
6135                return 0;
6136        return 1;
6137}
6138
6139static int complete_emulated_pio(struct kvm_vcpu *vcpu)
6140{
6141        BUG_ON(!vcpu->arch.pio.count);
6142
6143        return complete_emulated_io(vcpu);
6144}
6145
6146/*
6147 * Implements the following, as a state machine:
6148 *
6149 * read:
6150 *   for each fragment
6151 *     for each mmio piece in the fragment
6152 *       write gpa, len
6153 *       exit
6154 *       copy data
6155 *   execute insn
6156 *
6157 * write:
6158 *   for each fragment
6159 *     for each mmio piece in the fragment
6160 *       write gpa, len
6161 *       copy data
6162 *       exit
6163 */
6164static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
6165{
6166        struct kvm_run *run = vcpu->run;
6167        struct kvm_mmio_fragment *frag;
6168        unsigned len;
6169
6170        BUG_ON(!vcpu->mmio_needed);
6171
6172        /* Complete previous fragment */
6173        frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment];
6174        len = min(8u, frag->len);
6175        if (!vcpu->mmio_is_write)
6176                memcpy(frag->data, run->mmio.data, len);
6177
6178        if (frag->len <= 8) {
6179                /* Switch to the next fragment. */
6180                frag++;
6181                vcpu->mmio_cur_fragment++;
6182        } else {
6183                /* Go forward to the next mmio piece. */
6184                frag->data += len;
6185                frag->gpa += len;
6186                frag->len -= len;
6187        }
6188
6189        if (vcpu->mmio_cur_fragment >= vcpu->mmio_nr_fragments) {
6190                vcpu->mmio_needed = 0;
6191
6192                /* FIXME: return into emulator if single-stepping.  */
6193                if (vcpu->mmio_is_write)
6194                        return 1;
6195                vcpu->mmio_read_completed = 1;
6196                return complete_emulated_io(vcpu);
6197        }
6198
6199        run->exit_reason = KVM_EXIT_MMIO;
6200        run->mmio.phys_addr = frag->gpa;
6201        if (vcpu->mmio_is_write)
6202                memcpy(run->mmio.data, frag->data, min(8u, frag->len));
6203        run->mmio.len = min(8u, frag->len);
6204        run->mmio.is_write = vcpu->mmio_is_write;
6205        vcpu->arch.complete_userspace_io = complete_emulated_mmio;
6206        return 0;
6207}
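
/*
 * Illustrative userspace-side sketch of the MMIO exit protocol produced by
 * complete_emulated_mmio() (not kernel code).  The struct kvm_run mmio
 * fields are the real KVM UAPI; handle_mmio_read()/handle_mmio_write() are
 * hypothetical device-model hooks a VMM would provide.
 *
 *	if (run->exit_reason == KVM_EXIT_MMIO) {
 *		// One exit per <= 8 byte piece of a fragment: for writes the
 *		// data is already in run->mmio.data, for reads the VMM fills
 *		// it in before re-entering the guest.
 *		if (run->mmio.is_write)
 *			handle_mmio_write(run->mmio.phys_addr,
 *					  run->mmio.data, run->mmio.len);
 *		else
 *			handle_mmio_read(run->mmio.phys_addr,
 *					 run->mmio.data, run->mmio.len);
 *		ioctl(vcpu_fd, KVM_RUN, 0);	// resumes via complete_userspace_io
 *	}
 */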
6208
6209
6210int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
6211{
6212        int r;
6213        sigset_t sigsaved;
6214
6215        if (!tsk_used_math(current) && init_fpu(current))
6216                return -ENOMEM;
6217
6218        if (vcpu->sigset_active)
6219                sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
6220
6221        if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
6222                kvm_vcpu_block(vcpu);
6223                kvm_apic_accept_events(vcpu);
6224                clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
6225                r = -EAGAIN;
6226                goto out;
6227        }
6228
6229        /* re-sync apic's tpr */
6230        if (!irqchip_in_kernel(vcpu->kvm)) {
6231                if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
6232                        r = -EINVAL;
6233                        goto out;
6234                }
6235        }
6236
6237        if (unlikely(vcpu->arch.complete_userspace_io)) {
6238                int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io;
6239                vcpu->arch.complete_userspace_io = NULL;
6240                r = cui(vcpu);
6241                if (r <= 0)
6242                        goto out;
6243        } else
6244                WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed);
6245
6246        r = __vcpu_run(vcpu);
6247
6248out:
6249        post_kvm_run_save(vcpu);
6250        if (vcpu->sigset_active)
6251                sigprocmask(SIG_SETMASK, &sigsaved, NULL);
6252
6253        return r;
6254}
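
/*
 * Minimal userspace sketch of how this ioctl is normally driven (not kernel
 * code; error handling omitted).  The mmap of struct kvm_run on the vcpu fd
 * and the ioctls shown are the standard KVM UAPI.
 *
 *	int size = ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
 *	struct kvm_run *run = mmap(NULL, size, PROT_READ | PROT_WRITE,
 *				   MAP_SHARED, vcpu_fd, 0);
 *
 *	for (;;) {
 *		ioctl(vcpu_fd, KVM_RUN, 0);	// enters __vcpu_run() above
 *		switch (run->exit_reason) {
 *		case KVM_EXIT_IO:
 *		case KVM_EXIT_MMIO:
 *			// emulate the access, then loop back into KVM_RUN
 *			break;
 *		case KVM_EXIT_HLT:
 *		default:
 *			return;
 *		}
 *	}
 */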
6255
6256int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
6257{
6258        if (vcpu->arch.emulate_regs_need_sync_to_vcpu) {
6259                /*
6260                 * We are here if userspace calls get_regs() in the middle of
6261                 * instruction emulation.  Register state needs to be copied
6262                 * back from the emulation context to the vcpu.  Userspace
6263                 * shouldn't usually do that, but some badly designed PV
6264                 * devices (the vmware backdoor interface) need this to work.
6265                 */
6266                emulator_writeback_register_cache(&vcpu->arch.emulate_ctxt);
6267                vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
6268        }
6269        regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
6270        regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);
6271        regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX);
6272        regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX);
6273        regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI);
6274        regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI);
6275        regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
6276        regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP);
6277#ifdef CONFIG_X86_64
6278        regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8);
6279        regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9);
6280        regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10);
6281        regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11);
6282        regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12);
6283        regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13);
6284        regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14);
6285        regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15);
6286#endif
6287
6288        regs->rip = kvm_rip_read(vcpu);
6289        regs->rflags = kvm_get_rflags(vcpu);
6290
6291        return 0;
6292}
6293
6294int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
6295{
6296        vcpu->arch.emulate_regs_need_sync_from_vcpu = true;
6297        vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
6298
6299        kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
6300        kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
6301        kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
6302        kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx);
6303        kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi);
6304        kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi);
6305        kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp);
6306        kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp);
6307#ifdef CONFIG_X86_64
6308        kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8);
6309        kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9);
6310        kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10);
6311        kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11);
6312        kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12);
6313        kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13);
6314        kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14);
6315        kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15);
6316#endif
6317
6318        kvm_rip_write(vcpu, regs->rip);
6319        kvm_set_rflags(vcpu, regs->rflags);
6320
6321        vcpu->arch.exception.pending = false;
6322
6323        kvm_make_request(KVM_REQ_EVENT, vcpu);
6324
6325        return 0;
6326}
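
/*
 * Userspace view of the two ioctls above (not kernel code).  KVM_GET_REGS
 * and KVM_SET_REGS exchange the same struct kvm_regs that is copied field
 * by field here; saved_rip below is a hypothetical value kept by the VMM.
 *
 *	struct kvm_regs regs;
 *
 *	ioctl(vcpu_fd, KVM_GET_REGS, &regs);
 *	regs.rip = saved_rip;
 *	ioctl(vcpu_fd, KVM_SET_REGS, &regs);
 *
 * Note that setting registers also clears any pending exception and raises
 * KVM_REQ_EVENT, as done at the end of kvm_arch_vcpu_ioctl_set_regs().
 */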
6327
6328void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
6329{
6330        struct kvm_segment cs;
6331
6332        kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
6333        *db = cs.db;
6334        *l = cs.l;
6335}
6336EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
6337
6338int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
6339                                  struct kvm_sregs *sregs)
6340{
6341        struct desc_ptr dt;
6342
6343        kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
6344        kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
6345        kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
6346        kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
6347        kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
6348        kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
6349
6350        kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
6351        kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
6352
6353        kvm_x86_ops->get_idt(vcpu, &dt);
6354        sregs->idt.limit = dt.size;
6355        sregs->idt.base = dt.address;
6356        kvm_x86_ops->get_gdt(vcpu, &dt);
6357        sregs->gdt.limit = dt.size;
6358        sregs->gdt.base = dt.address;
6359
6360        sregs->cr0 = kvm_read_cr0(vcpu);
6361        sregs->cr2 = vcpu->arch.cr2;
6362        sregs->cr3 = kvm_read_cr3(vcpu);
6363        sregs->cr4 = kvm_read_cr4(vcpu);
6364        sregs->cr8 = kvm_get_cr8(vcpu);
6365        sregs->efer = vcpu->arch.efer;
6366        sregs->apic_base = kvm_get_apic_base(vcpu);
6367
6368        memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap);
6369
6370        if (vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft)
6371                set_bit(vcpu->arch.interrupt.nr,
6372                        (unsigned long *)sregs->interrupt_bitmap);
6373
6374        return 0;
6375}
6376
6377int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
6378                                    struct kvm_mp_state *mp_state)
6379{
6380        kvm_apic_accept_events(vcpu);
6381        if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED &&
6382                                        vcpu->arch.pv.pv_unhalted)
6383                mp_state->mp_state = KVM_MP_STATE_RUNNABLE;
6384        else
6385                mp_state->mp_state = vcpu->arch.mp_state;
6386
6387        return 0;
6388}
6389
6390int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
6391                                    struct kvm_mp_state *mp_state)
6392{
6393        if (!kvm_vcpu_has_lapic(vcpu) &&
6394            mp_state->mp_state != KVM_MP_STATE_RUNNABLE)
6395                return -EINVAL;
6396
6397        if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) {
6398                vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
6399                set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events);
6400        } else
6401                vcpu->arch.mp_state = mp_state->mp_state;
6402        kvm_make_request(KVM_REQ_EVENT, vcpu);
6403        return 0;
6404}
6405
6406int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
6407                    int reason, bool has_error_code, u32 error_code)
6408{
6409        struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
6410        int ret;
6411
6412        init_emulate_ctxt(vcpu);
6413
6414        ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason,
6415                                   has_error_code, error_code);
6416
6417        if (ret)
6418                return EMULATE_FAIL;
6419
6420        kvm_rip_write(vcpu, ctxt->eip);
6421        kvm_set_rflags(vcpu, ctxt->eflags);
6422        kvm_make_request(KVM_REQ_EVENT, vcpu);
6423        return EMULATE_DONE;
6424}
6425EXPORT_SYMBOL_GPL(kvm_task_switch);
6426
6427int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
6428                                  struct kvm_sregs *sregs)
6429{
6430        struct msr_data apic_base_msr;
6431        int mmu_reset_needed = 0;
6432        int pending_vec, max_bits, idx;
6433        struct desc_ptr dt;
6434
6435        if (!guest_cpuid_has_xsave(vcpu) && (sregs->cr4 & X86_CR4_OSXSAVE))
6436                return -EINVAL;
6437
6438        dt.size = sregs->idt.limit;
6439        dt.address = sregs->idt.base;
6440        kvm_x86_ops->set_idt(vcpu, &dt);
6441        dt.size = sregs->gdt.limit;
6442        dt.address = sregs->gdt.base;
6443        kvm_x86_ops->set_gdt(vcpu, &dt);
6444
6445        vcpu->arch.cr2 = sregs->cr2;
6446        mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
6447        vcpu->arch.cr3 = sregs->cr3;
6448        __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
6449
6450        kvm_set_cr8(vcpu, sregs->cr8);
6451
6452        mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
6453        kvm_x86_ops->set_efer(vcpu, sregs->efer);
6454        apic_base_msr.data = sregs->apic_base;
6455        apic_base_msr.host_initiated = true;
6456        kvm_set_apic_base(vcpu, &apic_base_msr);
6457
6458        mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
6459        kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
6460        vcpu->arch.cr0 = sregs->cr0;
6461
6462        mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
6463        kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
6464        if (sregs->cr4 & X86_CR4_OSXSAVE)
6465                kvm_update_cpuid(vcpu);
6466
6467        idx = srcu_read_lock(&vcpu->kvm->srcu);
6468        if (!is_long_mode(vcpu) && is_pae(vcpu)) {
6469                load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
6470                mmu_reset_needed = 1;
6471        }
6472        srcu_read_unlock(&vcpu->kvm->srcu, idx);
6473
6474        if (mmu_reset_needed)
6475                kvm_mmu_reset_context(vcpu);
6476
6477        max_bits = KVM_NR_INTERRUPTS;
6478        pending_vec = find_first_bit(
6479                (const unsigned long *)sregs->interrupt_bitmap, max_bits);
6480        if (pending_vec < max_bits) {
6481                kvm_queue_interrupt(vcpu, pending_vec, false);
6482                pr_debug("Set back pending irq %d\n", pending_vec);
6483        }
6484
6485        kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
6486        kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
6487        kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
6488        kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
6489        kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
6490        kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
6491
6492        kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
6493        kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
6494
6495        update_cr8_intercept(vcpu);
6496
6497        /* Older userspace won't unhalt the vcpu on reset. */
6498        if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 &&
6499            sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
6500            !is_protmode(vcpu))
6501                vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
6502
6503        kvm_make_request(KVM_REQ_EVENT, vcpu);
6504
6505        return 0;
6506}
6507
6508int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
6509                                        struct kvm_guest_debug *dbg)
6510{
6511        unsigned long rflags;
6512        int i, r;
6513
6514        if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
6515                r = -EBUSY;
6516                if (vcpu->arch.exception.pending)
6517                        goto out;
6518                if (dbg->control & KVM_GUESTDBG_INJECT_DB)
6519                        kvm_queue_exception(vcpu, DB_VECTOR);
6520                else
6521                        kvm_queue_exception(vcpu, BP_VECTOR);
6522        }
6523
6524        /*
6525         * Read rflags as long as potentially injected trace flags are still
6526         * filtered out.
6527         */
6528        rflags = kvm_get_rflags(vcpu);
6529
6530        vcpu->guest_debug = dbg->control;
6531        if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE))
6532                vcpu->guest_debug = 0;
6533
6534        if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
6535                for (i = 0; i < KVM_NR_DB_REGS; ++i)
6536                        vcpu->arch.eff_db[i] = dbg->arch.debugreg[i];
6537                vcpu->arch.guest_debug_dr7 = dbg->arch.debugreg[7];
6538        } else {
6539                for (i = 0; i < KVM_NR_DB_REGS; i++)
6540                        vcpu->arch.eff_db[i] = vcpu->arch.db[i];
6541        }
6542        kvm_update_dr7(vcpu);
6543
6544        if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
6545                vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) +
6546                        get_segment_base(vcpu, VCPU_SREG_CS);
6547
6548        /*
6549         * Trigger an rflags update that will inject or remove the trace
6550         * flags.
6551         */
6552        kvm_set_rflags(vcpu, rflags);
6553
6554        kvm_x86_ops->update_db_bp_intercept(vcpu);
6555
6556        r = 0;
6557
6558out:
6559
6560        return r;
6561}
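
/*
 * Userspace sketch for the ioctl above (not kernel code): arming a single
 * hardware execute breakpoint at a hypothetical guest address bp_addr.
 * The flags and struct layout are the real KVM UAPI.
 *
 *	struct kvm_guest_debug dbg = {
 *		.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP,
 *	};
 *
 *	dbg.arch.debugreg[0] = bp_addr;
 *	dbg.arch.debugreg[7] = 1;	// DR7.L0: locally enable breakpoint 0
 *	ioctl(vcpu_fd, KVM_SET_GUEST_DEBUG, &dbg);
 */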
6562
6563/*
6564 * Translate a guest virtual address to a guest physical address.
6565 */
6566int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
6567                                    struct kvm_translation *tr)
6568{
6569        unsigned long vaddr = tr->linear_address;
6570        gpa_t gpa;
6571        int idx;
6572
6573        idx = srcu_read_lock(&vcpu->kvm->srcu);
6574        gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL);
6575        srcu_read_unlock(&vcpu->kvm->srcu, idx);
6576        tr->physical_address = gpa;
6577        tr->valid = gpa != UNMAPPED_GVA;
6578        tr->writeable = 1;
6579        tr->usermode = 0;
6580
6581        return 0;
6582}
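
/*
 * Userspace sketch of the ioctl above (not kernel code).  Only
 * linear_address is an input; the kernel fills in the rest.  guest_va is a
 * hypothetical guest-virtual address supplied by the caller.
 *
 *	struct kvm_translation tr = { .linear_address = guest_va };
 *
 *	ioctl(vcpu_fd, KVM_TRANSLATE, &tr);
 *	if (tr.valid)
 *		printf("gva %llx -> gpa %llx\n", guest_va, tr.physical_address);
 */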
6583
6584int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
6585{
6586        struct i387_fxsave_struct *fxsave =
6587                        &vcpu->arch.guest_fpu.state->fxsave;
6588
6589        memcpy(fpu->fpr, fxsave->st_space, 128);
6590        fpu->fcw = fxsave->cwd;
6591        fpu->fsw = fxsave->swd;
6592        fpu->ftwx = fxsave->twd;
6593        fpu->last_opcode = fxsave->fop;
6594        fpu->last_ip = fxsave->rip;
6595        fpu->last_dp = fxsave->rdp;
6596        memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
6597
6598        return 0;
6599}
6600
6601int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
6602{
6603        struct i387_fxsave_struct *fxsave =
6604                        &vcpu->arch.guest_fpu.state->fxsave;
6605
6606        memcpy(fxsave->st_space, fpu->fpr, 128);
6607        fxsave->cwd = fpu->fcw;
6608        fxsave->swd = fpu->fsw;
6609        fxsave->twd = fpu->ftwx;
6610        fxsave->fop = fpu->last_opcode;
6611        fxsave->rip = fpu->last_ip;
6612        fxsave->rdp = fpu->last_dp;
6613        memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
6614
6615        return 0;
6616}
6617
6618int fx_init(struct kvm_vcpu *vcpu)
6619{
6620        int err;
6621
6622        err = fpu_alloc(&vcpu->arch.guest_fpu);
6623        if (err)
6624                return err;
6625
6626        fpu_finit(&vcpu->arch.guest_fpu);
6627
6628        /*
6629         * Ensure guest xcr0 is valid for loading
6630         */
6631        vcpu->arch.xcr0 = XSTATE_FP;
6632
6633        vcpu->arch.cr0 |= X86_CR0_ET;
6634
6635        return 0;
6636}
6637EXPORT_SYMBOL_GPL(fx_init);
6638
6639static void fx_free(struct kvm_vcpu *vcpu)
6640{
6641        fpu_free(&vcpu->arch.guest_fpu);
6642}
6643
6644void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
6645{
6646        if (vcpu->guest_fpu_loaded)
6647                return;
6648
6649        /*
6650         * Restore all possible states in the guest;
6651         * assume the host will use all available bits.
6652         * The guest xcr0 will be loaded later.
6653         */
6654        kvm_put_guest_xcr0(vcpu);
6655        vcpu->guest_fpu_loaded = 1;
6656        __kernel_fpu_begin();
6657        fpu_restore_checking(&vcpu->arch.guest_fpu);
6658        trace_kvm_fpu(1);
6659}
6660
6661void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
6662{
6663        kvm_put_guest_xcr0(vcpu);
6664
6665        if (!vcpu->guest_fpu_loaded)
6666                return;
6667
6668        vcpu->guest_fpu_loaded = 0;
6669        fpu_save_init(&vcpu->arch.guest_fpu);
6670        __kernel_fpu_end();
6671        ++vcpu->stat.fpu_reload;
6672        kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
6673        trace_kvm_fpu(0);
6674}
6675
6676void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
6677{
6678        kvmclock_reset(vcpu);
6679
6680        free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
6681        fx_free(vcpu);
6682        kvm_x86_ops->vcpu_free(vcpu);
6683}
6684
6685struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
6686                                                unsigned int id)
6687{
6688        if (check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
6689                printk_once(KERN_WARNING
6690                "kvm: SMP vm created on host with unstable TSC; "
6691                "guest TSC will not be reliable\n");
6692        return kvm_x86_ops->vcpu_create(kvm, id);
6693}
6694
6695int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
6696{
6697        int r;
6698
6699        vcpu->arch.mtrr_state.have_fixed = 1;
6700        r = vcpu_load(vcpu);
6701        if (r)
6702                return r;
6703        kvm_vcpu_reset(vcpu);
6704        kvm_mmu_setup(vcpu);
6705        vcpu_put(vcpu);
6706
6707        return r;
6708}
6709
6710int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
6711{
6712        int r;
6713        struct msr_data msr;
6714
6715        r = vcpu_load(vcpu);
6716        if (r)
6717                return r;
6718        msr.data = 0x0;
6719        msr.index = MSR_IA32_TSC;
6720        msr.host_initiated = true;
6721        kvm_write_tsc(vcpu, &msr);
6722        vcpu_put(vcpu);
6723
6724        return r;
6725}
6726
6727void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
6728{
6729        int r;
6730        vcpu->arch.apf.msr_val = 0;
6731
6732        r = vcpu_load(vcpu);
6733        BUG_ON(r);
6734        kvm_mmu_unload(vcpu);
6735        vcpu_put(vcpu);
6736
6737        fx_free(vcpu);
6738        kvm_x86_ops->vcpu_free(vcpu);
6739}
6740
6741void kvm_vcpu_reset(struct kvm_vcpu *vcpu)
6742{
6743        atomic_set(&vcpu->arch.nmi_queued, 0);
6744        vcpu->arch.nmi_pending = 0;
6745        vcpu->arch.nmi_injected = false;
6746
6747        memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
6748        vcpu->arch.dr6 = DR6_FIXED_1;
6749        kvm_update_dr6(vcpu);
6750        vcpu->arch.dr7 = DR7_FIXED_1;
6751        kvm_update_dr7(vcpu);
6752
6753        kvm_make_request(KVM_REQ_EVENT, vcpu);
6754        vcpu->arch.apf.msr_val = 0;
6755        vcpu->arch.st.msr_val = 0;
6756
6757        kvmclock_reset(vcpu);
6758
6759        kvm_clear_async_pf_completion_queue(vcpu);
6760        kvm_async_pf_hash_reset(vcpu);
6761        vcpu->arch.apf.halted = false;
6762
6763        kvm_pmu_reset(vcpu);
6764
6765        memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
6766        vcpu->arch.regs_avail = ~0;
6767        vcpu->arch.regs_dirty = ~0;
6768
6769        kvm_x86_ops->vcpu_reset(vcpu);
6770}
6771
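/*
 * A SIPI vector is an 8-bit page number: vector V starts the target vcpu in
 * real mode with CS selector V << 8, CS base V << 12 and IP 0, i.e. at
 * physical address V * 4096.  For example, a SIPI with vector 0x9A begins
 * execution at 0x9A000.
 */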
6772void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, unsigned int vector)
6773{
6774        struct kvm_segment cs;
6775
6776        kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
6777        cs.selector = vector << 8;
6778        cs.base = vector << 12;
6779        kvm_set_segment(vcpu, &cs, VCPU_SREG_CS);
6780        kvm_rip_write(vcpu, 0);
6781}
6782
6783int kvm_arch_hardware_enable(void *garbage)
6784{
6785        struct kvm *kvm;
6786        struct kvm_vcpu *vcpu;
6787        int i;
6788        int ret;
6789        u64 local_tsc;
6790        u64 max_tsc = 0;
6791        bool stable, backwards_tsc = false;
6792
6793        kvm_shared_msr_cpu_online();
6794        ret = kvm_x86_ops->hardware_enable(garbage);
6795        if (ret != 0)
6796                return ret;
6797
6798        local_tsc = native_read_tsc();
6799        stable = !check_tsc_unstable();
6800        list_for_each_entry(kvm, &vm_list, vm_list) {
6801                kvm_for_each_vcpu(i, vcpu, kvm) {
6802                        if (!stable && vcpu->cpu == smp_processor_id())
6803                                set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
6804                        if (stable && vcpu->arch.last_host_tsc > local_tsc) {
6805                                backwards_tsc = true;
6806                                if (vcpu->arch.last_host_tsc > max_tsc)
6807                                        max_tsc = vcpu->arch.last_host_tsc;
6808                        }
6809                }
6810        }
6811
6812        /*
6813         * Sometimes, even reliable TSCs go backwards.  This happens on
6814         * platforms that reset TSC during suspend or hibernate actions, but
6815         * maintain synchronization.  We must compensate.  Fortunately, we can
6816         * detect that condition here, which happens early in CPU bringup,
6817         * before any KVM threads can be running.  Unfortunately, we can't
6818         * bring the TSCs fully up to date with real time, as we aren't yet far
6819         * enough into CPU bringup that we know how much real time has actually
6820         * elapsed; our helper function, get_kernel_ns() will be using boot
6821         * variables that haven't been updated yet.
6822         *
6823         * So we simply find the maximum observed TSC above, then record the
6824         * adjustment to TSC in each VCPU.  When the VCPU later gets loaded,
6825         * the adjustment will be applied.  Note that we accumulate
6826         * adjustments, in case multiple suspend cycles happen before some VCPU
6827         * gets a chance to run again.  In the event that no KVM threads get a
6828         * chance to run, we will miss the entire elapsed period, as we'll have
6829         * reset last_host_tsc, so VCPUs will not have the TSC adjusted and may
6830         * lose cycle time.  This isn't too big a deal, since the loss will be
6831         * uniform across all VCPUs (not to mention the scenario is extremely
6832         * unlikely). It is possible that a second hibernate recovery happens
6833         * much faster than a first, causing the observed TSC here to be
6834         * smaller; this would require additional padding adjustment, which is
6835         * why we set last_host_tsc to the local tsc observed here.
6836         *
6837         * N.B. - this code below runs only on platforms with reliable TSC,
6838         * as that is the only way backwards_tsc is set above.  Also note
6839         * that this runs for ALL vcpus, which is not a bug; all VCPUs should
6840         * have the same delta_cyc adjustment applied if backwards_tsc
6841         * is detected.  Note further, this adjustment is only done once,
6842         * as we reset last_host_tsc on all VCPUs to stop this from being
6843         * called multiple times (one for each physical CPU bringup).
6844         *
6845         * Platforms with unreliable TSCs don't have to deal with this, they
6846         * will be compensated by the logic in vcpu_load, which sets the TSC to
6847         * catchup mode.  This will catchup all VCPUs to real time, but cannot
6848         * guarantee that they stay in perfect synchronization.
6849         */
6850        if (backwards_tsc) {
6851                u64 delta_cyc = max_tsc - local_tsc;
6852                list_for_each_entry(kvm, &vm_list, vm_list) {
6853                        kvm_for_each_vcpu(i, vcpu, kvm) {
6854                                vcpu->arch.tsc_offset_adjustment += delta_cyc;
6855                                vcpu->arch.last_host_tsc = local_tsc;
6856                                set_bit(KVM_REQ_MASTERCLOCK_UPDATE,
6857                                        &vcpu->requests);
6858                        }
6859
6860                        /*
6861                         * We have to disable TSC offset matching; if you were
6862                         * booting a VM while issuing an S4 host suspend,
6863                         * you may have a problem.  Solving this issue is
6864                         * left as an exercise to the reader.
6865                         */
6866                        kvm->arch.last_tsc_nsec = 0;
6867                        kvm->arch.last_tsc_write = 0;
6868                }
6869
6870        }
6871        return 0;
6872}
6873
6874void kvm_arch_hardware_disable(void *garbage)
6875{
6876        kvm_x86_ops->hardware_disable(garbage);
6877        drop_user_return_notifiers(garbage);
6878}
6879
6880int kvm_arch_hardware_setup(void)
6881{
6882        return kvm_x86_ops->hardware_setup();
6883}
6884
6885void kvm_arch_hardware_unsetup(void)
6886{
6887        kvm_x86_ops->hardware_unsetup();
6888}
6889
6890void kvm_arch_check_processor_compat(void *rtn)
6891{
6892        kvm_x86_ops->check_processor_compatibility(rtn);
6893}
6894
6895bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
6896{
6897        return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL);
6898}
6899
6900struct static_key kvm_no_apic_vcpu __read_mostly;
6901
6902int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
6903{
6904        struct page *page;
6905        struct kvm *kvm;
6906        int r;
6907
6908        BUG_ON(vcpu->kvm == NULL);
6909        kvm = vcpu->kvm;
6910
6911        vcpu->arch.pv.pv_unhalted = false;
6912        vcpu->arch.emulate_ctxt.ops = &emulate_ops;
6913        if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
6914                vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
6915        else
6916                vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
6917
6918        page = alloc_page(GFP_KERNEL | __GFP_ZERO);
6919        if (!page) {
6920                r = -ENOMEM;
6921                goto fail;
6922        }
6923        vcpu->arch.pio_data = page_address(page);
6924
6925        kvm_set_tsc_khz(vcpu, max_tsc_khz);
6926
6927        r = kvm_mmu_create(vcpu);
6928        if (r < 0)
6929                goto fail_free_pio_data;
6930
6931        if (irqchip_in_kernel(kvm)) {
6932                r = kvm_create_lapic(vcpu);
6933                if (r < 0)
6934                        goto fail_mmu_destroy;
6935        } else
6936                static_key_slow_inc(&kvm_no_apic_vcpu);
6937
6938        vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
6939                                       GFP_KERNEL);
6940        if (!vcpu->arch.mce_banks) {
6941                r = -ENOMEM;
6942                goto fail_free_lapic;
6943        }
6944        vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
6945
6946        if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) {
6947                r = -ENOMEM;
6948                goto fail_free_mce_banks;
6949        }
6950
6951        r = fx_init(vcpu);
6952        if (r)
6953                goto fail_free_wbinvd_dirty_mask;
6954
6955        vcpu->arch.ia32_tsc_adjust_msr = 0x0;
6956        vcpu->arch.pv_time_enabled = false;
6957
6958        vcpu->arch.guest_supported_xcr0 = 0;
6959        vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
6960
6961        kvm_async_pf_hash_reset(vcpu);
6962        kvm_pmu_init(vcpu);
6963
6964        return 0;
6965fail_free_wbinvd_dirty_mask:
6966        free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
6967fail_free_mce_banks:
6968        kfree(vcpu->arch.mce_banks);
6969fail_free_lapic:
6970        kvm_free_lapic(vcpu);
6971fail_mmu_destroy:
6972        kvm_mmu_destroy(vcpu);
6973fail_free_pio_data:
6974        free_page((unsigned long)vcpu->arch.pio_data);
6975fail:
6976        return r;
6977}
6978
6979void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
6980{
6981        int idx;
6982
6983        kvm_pmu_destroy(vcpu);
6984        kfree(vcpu->arch.mce_banks);
6985        kvm_free_lapic(vcpu);
6986        idx = srcu_read_lock(&vcpu->kvm->srcu);
6987        kvm_mmu_destroy(vcpu);
6988        srcu_read_unlock(&vcpu->kvm->srcu, idx);
6989        free_page((unsigned long)vcpu->arch.pio_data);
6990        if (!irqchip_in_kernel(vcpu->kvm))
6991                static_key_slow_dec(&kvm_no_apic_vcpu);
6992}
6993
6994int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
6995{
6996        if (type)
6997                return -EINVAL;
6998
6999        INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
7000        INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
7001        INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
7002        atomic_set(&kvm->arch.noncoherent_dma_count, 0);
7003
7004        /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
7005        set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
7006        /* Reserve bit 1 of irq_sources_bitmap for irqfd-resampler */
7007        set_bit(KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
7008                &kvm->arch.irq_sources_bitmap);
7009
7010        raw_spin_lock_init(&kvm->arch.tsc_write_lock);
7011        mutex_init(&kvm->arch.apic_map_lock);
7012        spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
7013
7014        pvclock_update_vm_gtod_copy(kvm);
7015
7016        return 0;
7017}
7018
7019static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
7020{
7021        int r;
7022        r = vcpu_load(vcpu);
7023        BUG_ON(r);
7024        kvm_mmu_unload(vcpu);
7025        vcpu_put(vcpu);
7026}
7027
7028static void kvm_free_vcpus(struct kvm *kvm)
7029{
7030        unsigned int i;
7031        struct kvm_vcpu *vcpu;
7032
7033        /*
7034         * Unpin any mmu pages first.
7035         */
7036        kvm_for_each_vcpu(i, vcpu, kvm) {
7037                kvm_clear_async_pf_completion_queue(vcpu);
7038                kvm_unload_vcpu_mmu(vcpu);
7039        }
7040        kvm_for_each_vcpu(i, vcpu, kvm)
7041                kvm_arch_vcpu_free(vcpu);
7042
7043        mutex_lock(&kvm->lock);
7044        for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
7045                kvm->vcpus[i] = NULL;
7046
7047        atomic_set(&kvm->online_vcpus, 0);
7048        mutex_unlock(&kvm->lock);
7049}
7050
7051void kvm_arch_sync_events(struct kvm *kvm)
7052{
7053        kvm_free_all_assigned_devices(kvm);
7054        kvm_free_pit(kvm);
7055}
7056
7057void kvm_arch_destroy_vm(struct kvm *kvm)
7058{
7059        if (current->mm == kvm->mm) {
7060                /*
7061                 * Free memory regions allocated on behalf of userspace,
7062                 * unless the memory map has changed due to process exit
7063                 * or fd copying.
7064                 */
7065                struct kvm_userspace_memory_region mem;
7066                memset(&mem, 0, sizeof(mem));
7067                mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
7068                kvm_set_memory_region(kvm, &mem);
7069
7070                mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT;
7071                kvm_set_memory_region(kvm, &mem);
7072
7073                mem.slot = TSS_PRIVATE_MEMSLOT;
7074                kvm_set_memory_region(kvm, &mem);
7075        }
7076        kvm_iommu_unmap_guest(kvm);
7077        kfree(kvm->arch.vpic);
7078        kfree(kvm->arch.vioapic);
7079        kvm_free_vcpus(kvm);
7080        if (kvm->arch.apic_access_page)
7081                put_page(kvm->arch.apic_access_page);
7082        if (kvm->arch.ept_identity_pagetable)
7083                put_page(kvm->arch.ept_identity_pagetable);
7084        kfree(rcu_dereference_check(kvm->arch.apic_map, 1));
7085}
7086
7087void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
7088                           struct kvm_memory_slot *dont)
7089{
7090        int i;
7091
7092        for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
7093                if (!dont || free->arch.rmap[i] != dont->arch.rmap[i]) {
7094                        kvm_kvfree(free->arch.rmap[i]);
7095                        free->arch.rmap[i] = NULL;
7096                }
7097                if (i == 0)
7098                        continue;
7099
7100                if (!dont || free->arch.lpage_info[i - 1] !=
7101                             dont->arch.lpage_info[i - 1]) {
7102                        kvm_kvfree(free->arch.lpage_info[i - 1]);
7103                        free->arch.lpage_info[i - 1] = NULL;
7104                }
7105        }
7106}
7107
7108int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
7109                            unsigned long npages)
7110{
7111        int i;
7112
7113        for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
7114                unsigned long ugfn;
7115                int lpages;
7116                int level = i + 1;
7117
7118                lpages = gfn_to_index(slot->base_gfn + npages - 1,
7119                                      slot->base_gfn, level) + 1;
7120
7121                slot->arch.rmap[i] =
7122                        kvm_kvzalloc(lpages * sizeof(*slot->arch.rmap[i]));
7123                if (!slot->arch.rmap[i])
7124                        goto out_free;
7125                if (i == 0)
7126                        continue;
7127
7128                slot->arch.lpage_info[i - 1] = kvm_kvzalloc(lpages *
7129                                        sizeof(*slot->arch.lpage_info[i - 1]));
7130                if (!slot->arch.lpage_info[i - 1])
7131                        goto out_free;
7132
7133                if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
7134                        slot->arch.lpage_info[i - 1][0].write_count = 1;
7135                if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
7136                        slot->arch.lpage_info[i - 1][lpages - 1].write_count = 1;
7137                ugfn = slot->userspace_addr >> PAGE_SHIFT;
7138                /*
7139                 * If the gfn and userspace address are not aligned wrt each
7140                 * other, or if explicitly asked to, disable large page
7141                 * support for this slot
7142                 */
7143                if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) ||
7144                    !kvm_largepages_enabled()) {
7145                        unsigned long j;
7146
7147                        for (j = 0; j < lpages; ++j)
7148                                slot->arch.lpage_info[i - 1][j].write_count = 1;
7149                }
7150        }
7151
7152        return 0;
7153
7154out_free:
7155        for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
7156                kvm_kvfree(slot->arch.rmap[i]);
7157                slot->arch.rmap[i] = NULL;
7158                if (i == 0)
7159                        continue;
7160
7161                kvm_kvfree(slot->arch.lpage_info[i - 1]);
7162                slot->arch.lpage_info[i - 1] = NULL;
7163        }
7164        return -ENOMEM;
7165}
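
/*
 * Worked example for the bookkeeping above, using the 2MiB level where one
 * large page covers 512 gfns (illustrative only; the exact geometry comes
 * from KVM_PAGES_PER_HPAGE()).  A slot with base_gfn 256 and npages 512
 * covers gfns 256..767 and straddles two 512-gfn regions, so lpages == 2.
 * Neither boundary is 512-aligned, so both lpage_info entries get
 * write_count = 1 and those partially covered regions are never mapped
 * with a large page; only fully covered, aligned regions stay eligible.
 */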
7166
7167void kvm_arch_memslots_updated(struct kvm *kvm)
7168{
7169        /*
7170         * memslots->generation has been incremented.
7171         * mmio generation may have reached its maximum value.
7172         */
7173        kvm_mmu_invalidate_mmio_sptes(kvm);
7174}
7175
7176int kvm_arch_prepare_memory_region(struct kvm *kvm,
7177                                struct kvm_memory_slot *memslot,
7178                                struct kvm_userspace_memory_region *mem,
7179                                enum kvm_mr_change change)
7180{
7181        /*
7182         * Only private memory slots need to be mapped here since
7183         * KVM_SET_MEMORY_REGION ioctl is no longer supported.
7184         */
7185        if ((memslot->id >= KVM_USER_MEM_SLOTS) && (change == KVM_MR_CREATE)) {
7186                unsigned long userspace_addr;
7187
7188                /*
7189                 * MAP_SHARED to prevent internal slot pages from being moved
7190                 * by fork()/COW.
7191                 */
7192                userspace_addr = vm_mmap(NULL, 0, memslot->npages * PAGE_SIZE,
7193                                         PROT_READ | PROT_WRITE,
7194                                         MAP_SHARED | MAP_ANONYMOUS, 0);
7195
7196                if (IS_ERR((void *)userspace_addr))
7197                        return PTR_ERR((void *)userspace_addr);
7198
7199                memslot->userspace_addr = userspace_addr;
7200        }
7201
7202        return 0;
7203}
7204
7205void kvm_arch_commit_memory_region(struct kvm *kvm,
7206                                struct kvm_userspace_memory_region *mem,
7207                                const struct kvm_memory_slot *old,
7208                                enum kvm_mr_change change)
7209{
7210
7211        int nr_mmu_pages = 0;
7212
7213        if ((mem->slot >= KVM_USER_MEM_SLOTS) && (change == KVM_MR_DELETE)) {
7214                int ret;
7215
7216                ret = vm_munmap(old->userspace_addr,
7217                                old->npages * PAGE_SIZE);
7218                if (ret < 0)
7219                        printk(KERN_WARNING
7220                               "kvm_vm_ioctl_set_memory_region: "
7221                               "failed to munmap memory\n");
7222        }
7223
7224        if (!kvm->arch.n_requested_mmu_pages)
7225                nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
7226
7227        if (nr_mmu_pages)
7228                kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
7229        /*
7230         * Write protect all pages for dirty logging.
7231         * Existing largepage mappings are destroyed here and new ones will
7232         * not be created until the end of the logging.
7233         */
7234        if ((change != KVM_MR_DELETE) && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES))
7235                kvm_mmu_slot_remove_write_access(kvm, mem->slot);
7236}
7237
7238void kvm_arch_flush_shadow_all(struct kvm *kvm)
7239{
7240        kvm_mmu_invalidate_zap_all_pages(kvm);
7241}
7242
7243void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
7244                                   struct kvm_memory_slot *slot)
7245{
7246        kvm_mmu_invalidate_zap_all_pages(kvm);
7247}
7248
7249int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
7250{
7251        return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
7252                !vcpu->arch.apf.halted)
7253                || !list_empty_careful(&vcpu->async_pf.done)
7254                || kvm_apic_has_events(vcpu)
7255                || vcpu->arch.pv.pv_unhalted
7256                || atomic_read(&vcpu->arch.nmi_queued) ||
7257                (kvm_arch_interrupt_allowed(vcpu) &&
7258                 kvm_cpu_has_interrupt(vcpu));
7259}
7260
7261int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
7262{
7263        return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
7264}
7265
7266int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
7267{
7268        return kvm_x86_ops->interrupt_allowed(vcpu);
7269}
7270
7271bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip)
7272{
7273        unsigned long current_rip = kvm_rip_read(vcpu) +
7274                get_segment_base(vcpu, VCPU_SREG_CS);
7275
7276        return current_rip == linear_rip;
7277}
7278EXPORT_SYMBOL_GPL(kvm_is_linear_rip);
7279
7280unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
7281{
7282        unsigned long rflags;
7283
7284        rflags = kvm_x86_ops->get_rflags(vcpu);
7285        if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
7286                rflags &= ~X86_EFLAGS_TF;
7287        return rflags;
7288}
7289EXPORT_SYMBOL_GPL(kvm_get_rflags);
7290
7291void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
7292{
7293        if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
7294            kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
7295                rflags |= X86_EFLAGS_TF;
7296        kvm_x86_ops->set_rflags(vcpu, rflags);
7297        kvm_make_request(KVM_REQ_EVENT, vcpu);
7298}
7299EXPORT_SYMBOL_GPL(kvm_set_rflags);
7300
7301void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
7302{
7303        int r;
7304
7305        if ((vcpu->arch.mmu.direct_map != work->arch.direct_map) ||
7306              work->wakeup_all)
7307                return;
7308
7309        r = kvm_mmu_reload(vcpu);
7310        if (unlikely(r))
7311                return;
7312
7313        if (!vcpu->arch.mmu.direct_map &&
7314              work->arch.cr3 != vcpu->arch.mmu.get_cr3(vcpu))
7315                return;
7316
7317        vcpu->arch.mmu.page_fault(vcpu, work->gva, 0, true);
7318}
7319
7320static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
7321{
7322        return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU));
7323}
7324
7325static inline u32 kvm_async_pf_next_probe(u32 key)
7326{
7327        return (key + 1) & (roundup_pow_of_two(ASYNC_PF_PER_VCPU) - 1);
7328}
7329
7330static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
7331{
7332        u32 key = kvm_async_pf_hash_fn(gfn);
7333
7334        while (vcpu->arch.apf.gfns[key] != ~0)
7335                key = kvm_async_pf_next_probe(key);
7336
7337        vcpu->arch.apf.gfns[key] = gfn;
7338}
7339
7340static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn)
7341{
7342        int i;
7343        u32 key = kvm_async_pf_hash_fn(gfn);
7344
7345        for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU) &&
7346                     (vcpu->arch.apf.gfns[key] != gfn &&
7347                      vcpu->arch.apf.gfns[key] != ~0); i++)
7348                key = kvm_async_pf_next_probe(key);
7349
7350        return key;
7351}
7352
7353bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
7354{
7355        return vcpu->arch.apf.gfns[kvm_async_pf_gfn_slot(vcpu, gfn)] == gfn;
7356}
7357
7358static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
7359{
7360        u32 i, j, k;
7361
7362        i = j = kvm_async_pf_gfn_slot(vcpu, gfn);
7363        while (true) {
7364                vcpu->arch.apf.gfns[i] = ~0;
7365                do {
7366                        j = kvm_async_pf_next_probe(j);
7367                        if (vcpu->arch.apf.gfns[j] == ~0)
7368                                return;
7369                        k = kvm_async_pf_hash_fn(vcpu->arch.apf.gfns[j]);
7370                        /*
7371                         * k lies cyclically in ]i,j]
7372                         * |    i.k.j |
7373                         * |....j i.k.| or  |.k..j i...|
7374                         */
7375                } while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j));
7376                vcpu->arch.apf.gfns[i] = vcpu->arch.apf.gfns[j];
7377                i = j;
7378        }
7379}
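
/*
 * Example of the deletion above on a small table (illustrative; the real
 * table has roundup_pow_of_two(ASYNC_PF_PER_VCPU) slots).  With 8 slots,
 * let gfns A and B both hash to 2 and C hash to 3; linear probing on
 * insert places A in slot 2, B in slot 3 and C in slot 4.  Deleting A
 * leaves a hole at 2; B's home slot 2 is not in ]2,3], so B moves back to
 * slot 2, then C's home slot 3 is not in ]3,4], so C moves to slot 3, and
 * the empty slot 5 ends the scan.  Lookups by pure linear probing keep
 * working without any tombstone markers.
 */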
7380
7381static int apf_put_user(struct kvm_vcpu *vcpu, u32 val)
7382{
7383
7384        return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val,
7385                                      sizeof(val));
7386}
7387
7388void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
7389                                     struct kvm_async_pf *work)
7390{
7391        struct x86_exception fault;
7392
7393        trace_kvm_async_pf_not_present(work->arch.token, work->gva);
7394        kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
7395
7396        if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) ||
7397            (vcpu->arch.apf.send_user_only &&
7398             kvm_x86_ops->get_cpl(vcpu) == 0))
7399                kvm_make_request(KVM_REQ_APF_HALT, vcpu);
7400        else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) {
7401                fault.vector = PF_VECTOR;
7402                fault.error_code_valid = true;
7403                fault.error_code = 0;
7404                fault.nested_page_fault = false;
7405                fault.address = work->arch.token;
7406                kvm_inject_page_fault(vcpu, &fault);
7407        }
7408}
7409
7410void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
7411                                 struct kvm_async_pf *work)
7412{
7413        struct x86_exception fault;
7414
7415        trace_kvm_async_pf_ready(work->arch.token, work->gva);
7416        if (work->wakeup_all)
7417                work->arch.token = ~0; /* broadcast wakeup */
7418        else
7419                kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
7420
7421        if ((vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) &&
7422            !apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) {
7423                fault.vector = PF_VECTOR;
7424                fault.error_code_valid = true;
7425                fault.error_code = 0;
7426                fault.nested_page_fault = false;
7427                fault.address = work->arch.token;
7428                kvm_inject_page_fault(vcpu, &fault);
7429        }
7430        vcpu->arch.apf.halted = false;
7431        vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
7432}
7433
7434bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
7435{
7436        if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED))
7437                return true;
7438        else
7439                return !kvm_event_needs_reinjection(vcpu) &&
7440                        kvm_x86_ops->interrupt_allowed(vcpu);
7441}
7442
7443void kvm_arch_register_noncoherent_dma(struct kvm *kvm)
7444{
7445        atomic_inc(&kvm->arch.noncoherent_dma_count);
7446}
7447EXPORT_SYMBOL_GPL(kvm_arch_register_noncoherent_dma);
7448
7449void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm)
7450{
7451        atomic_dec(&kvm->arch.noncoherent_dma_count);
7452}
7453EXPORT_SYMBOL_GPL(kvm_arch_unregister_noncoherent_dma);
7454
7455bool kvm_arch_has_noncoherent_dma(struct kvm *kvm)
7456{
7457        return atomic_read(&kvm->arch.noncoherent_dma_count);
7458}
7459EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma);
7460
7461EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
7462EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
7463EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
7464EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
7465EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr);
7466EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmrun);
7467EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit);
7468EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
7469EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
7470EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
7471EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
7472EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
7473EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);
7474