linux/arch/x86/kvm/vmx.c
<<
>>
Prefs
   1/*
   2 * Kernel-based Virtual Machine driver for Linux
   3 *
   4 * This module enables machines with Intel VT-x extensions to run virtual
   5 * machines without emulation or binary translation.
   6 *
   7 * Copyright (C) 2006 Qumranet, Inc.
   8 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
   9 *
  10 * Authors:
  11 *   Avi Kivity   <avi@qumranet.com>
  12 *   Yaniv Kamay  <yaniv@qumranet.com>
  13 *
  14 * This work is licensed under the terms of the GNU GPL, version 2.  See
  15 * the COPYING file in the top-level directory.
  16 *
  17 */
  18
  19#include "irq.h"
  20#include "mmu.h"
  21
  22#include <linux/kvm_host.h>
  23#include <linux/module.h>
  24#include <linux/kernel.h>
  25#include <linux/mm.h>
  26#include <linux/highmem.h>
  27#include <linux/sched.h>
  28#include <linux/moduleparam.h>
  29#include <linux/ftrace_event.h>
  30#include <linux/slab.h>
  31#include <linux/tboot.h>
  32#include "kvm_cache_regs.h"
  33#include "x86.h"
  34
  35#include <asm/io.h>
  36#include <asm/desc.h>
  37#include <asm/vmx.h>
  38#include <asm/virtext.h>
  39#include <asm/mce.h>
  40#include <asm/i387.h>
  41#include <asm/xcr.h>
  42
  43#include "trace.h"
  44
  45#define __ex(x) __kvm_handle_fault_on_reboot(x)
  46
  47MODULE_AUTHOR("Qumranet");
  48MODULE_LICENSE("GPL");
  49
  50static int __read_mostly bypass_guest_pf = 1;
  51module_param(bypass_guest_pf, bool, S_IRUGO);
  52
  53static int __read_mostly enable_vpid = 1;
  54module_param_named(vpid, enable_vpid, bool, 0444);
  55
  56static int __read_mostly flexpriority_enabled = 1;
  57module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
  58
  59static int __read_mostly enable_ept = 1;
  60module_param_named(ept, enable_ept, bool, S_IRUGO);
  61
  62static int __read_mostly enable_unrestricted_guest = 1;
  63module_param_named(unrestricted_guest,
  64                        enable_unrestricted_guest, bool, S_IRUGO);
  65
  66static int __read_mostly emulate_invalid_guest_state = 0;
  67module_param(emulate_invalid_guest_state, bool, S_IRUGO);
  68
  69static int __read_mostly vmm_exclusive = 1;
  70module_param(vmm_exclusive, bool, S_IRUGO);
  71
  72static int __read_mostly yield_on_hlt = 1;
  73module_param(yield_on_hlt, bool, S_IRUGO);
  74
  75#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST                           \
  76        (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
  77#define KVM_GUEST_CR0_MASK                                              \
  78        (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
  79#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST                         \
  80        (X86_CR0_WP | X86_CR0_NE)
  81#define KVM_VM_CR0_ALWAYS_ON                                            \
  82        (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
  83#define KVM_CR4_GUEST_OWNED_BITS                                      \
  84        (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR      \
  85         | X86_CR4_OSXMMEXCPT)
  86
  87#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
  88#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
  89
  90#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))
  91
  92/*
  93 * These 2 parameters are used to config the controls for Pause-Loop Exiting:
  94 * ple_gap:    upper bound on the amount of time between two successive
  95 *             executions of PAUSE in a loop. Also indicate if ple enabled.
  96 *             According to test, this time is usually small than 41 cycles.
  97 * ple_window: upper bound on the amount of time a guest is allowed to execute
  98 *             in a PAUSE loop. Tests indicate that most spinlocks are held for
  99 *             less than 2^12 cycles
 100 * Time is measured based on a counter that runs at the same rate as the TSC,
 101 * refer SDM volume 3b section 21.6.13 & 22.1.3.
 102 */
 103#define KVM_VMX_DEFAULT_PLE_GAP    41
 104#define KVM_VMX_DEFAULT_PLE_WINDOW 4096
 105static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP;
 106module_param(ple_gap, int, S_IRUGO);
 107
 108static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
 109module_param(ple_window, int, S_IRUGO);
 110
 111#define NR_AUTOLOAD_MSRS 1
 112
 113struct vmcs {
 114        u32 revision_id;
 115        u32 abort;
 116        char data[0];
 117};
 118
 119struct shared_msr_entry {
 120        unsigned index;
 121        u64 data;
 122        u64 mask;
 123};
 124
 125struct vcpu_vmx {
 126        struct kvm_vcpu       vcpu;
 127        struct list_head      local_vcpus_link;
 128        unsigned long         host_rsp;
 129        int                   launched;
 130        u8                    fail;
 131        u32                   exit_intr_info;
 132        u32                   idt_vectoring_info;
 133        struct shared_msr_entry *guest_msrs;
 134        int                   nmsrs;
 135        int                   save_nmsrs;
 136#ifdef CONFIG_X86_64
 137        u64                   msr_host_kernel_gs_base;
 138        u64                   msr_guest_kernel_gs_base;
 139#endif
 140        struct vmcs          *vmcs;
 141        struct msr_autoload {
 142                unsigned nr;
 143                struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS];
 144                struct vmx_msr_entry host[NR_AUTOLOAD_MSRS];
 145        } msr_autoload;
 146        struct {
 147                int           loaded;
 148                u16           fs_sel, gs_sel, ldt_sel;
 149                int           gs_ldt_reload_needed;
 150                int           fs_reload_needed;
 151        } host_state;
 152        struct {
 153                int vm86_active;
 154                ulong save_rflags;
 155                struct kvm_save_segment {
 156                        u16 selector;
 157                        unsigned long base;
 158                        u32 limit;
 159                        u32 ar;
 160                } tr, es, ds, fs, gs;
 161        } rmode;
 162        int vpid;
 163        bool emulation_required;
 164
 165        /* Support for vnmi-less CPUs */
 166        int soft_vnmi_blocked;
 167        ktime_t entry_time;
 168        s64 vnmi_blocked_time;
 169        u32 exit_reason;
 170
 171        bool rdtscp_enabled;
 172};
 173
 174static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
 175{
 176        return container_of(vcpu, struct vcpu_vmx, vcpu);
 177}
 178
 179static int init_rmode(struct kvm *kvm);
 180static u64 construct_eptp(unsigned long root_hpa);
 181static void kvm_cpu_vmxon(u64 addr);
 182static void kvm_cpu_vmxoff(void);
 183static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
 184
 185static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 186static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
 187static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu);
 188static DEFINE_PER_CPU(struct desc_ptr, host_gdt);
 189
 190static unsigned long *vmx_io_bitmap_a;
 191static unsigned long *vmx_io_bitmap_b;
 192static unsigned long *vmx_msr_bitmap_legacy;
 193static unsigned long *vmx_msr_bitmap_longmode;
 194
 195static bool cpu_has_load_ia32_efer;
 196
 197static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
 198static DEFINE_SPINLOCK(vmx_vpid_lock);
 199
 200static struct vmcs_config {
 201        int size;
 202        int order;
 203        u32 revision_id;
 204        u32 pin_based_exec_ctrl;
 205        u32 cpu_based_exec_ctrl;
 206        u32 cpu_based_2nd_exec_ctrl;
 207        u32 vmexit_ctrl;
 208        u32 vmentry_ctrl;
 209} vmcs_config;
 210
 211static struct vmx_capability {
 212        u32 ept;
 213        u32 vpid;
 214} vmx_capability;
 215
 216#define VMX_SEGMENT_FIELD(seg)                                  \
 217        [VCPU_SREG_##seg] = {                                   \
 218                .selector = GUEST_##seg##_SELECTOR,             \
 219                .base = GUEST_##seg##_BASE,                     \
 220                .limit = GUEST_##seg##_LIMIT,                   \
 221                .ar_bytes = GUEST_##seg##_AR_BYTES,             \
 222        }
 223
 224static struct kvm_vmx_segment_field {
 225        unsigned selector;
 226        unsigned base;
 227        unsigned limit;
 228        unsigned ar_bytes;
 229} kvm_vmx_segment_fields[] = {
 230        VMX_SEGMENT_FIELD(CS),
 231        VMX_SEGMENT_FIELD(DS),
 232        VMX_SEGMENT_FIELD(ES),
 233        VMX_SEGMENT_FIELD(FS),
 234        VMX_SEGMENT_FIELD(GS),
 235        VMX_SEGMENT_FIELD(SS),
 236        VMX_SEGMENT_FIELD(TR),
 237        VMX_SEGMENT_FIELD(LDTR),
 238};
 239
 240static u64 host_efer;
 241
 242static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
 243
 244/*
 245 * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it
 246 * away by decrementing the array size.
 247 */
 248static const u32 vmx_msr_index[] = {
 249#ifdef CONFIG_X86_64
 250        MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
 251#endif
 252        MSR_EFER, MSR_TSC_AUX, MSR_STAR,
 253};
 254#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
 255
 256static inline bool is_page_fault(u32 intr_info)
 257{
 258        return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
 259                             INTR_INFO_VALID_MASK)) ==
 260                (INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
 261}
 262
 263static inline bool is_no_device(u32 intr_info)
 264{
 265        return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
 266                             INTR_INFO_VALID_MASK)) ==
 267                (INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
 268}
 269
 270static inline bool is_invalid_opcode(u32 intr_info)
 271{
 272        return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
 273                             INTR_INFO_VALID_MASK)) ==
 274                (INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
 275}
 276
 277static inline bool is_external_interrupt(u32 intr_info)
 278{
 279        return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
 280                == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
 281}
 282
 283static inline bool is_machine_check(u32 intr_info)
 284{
 285        return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
 286                             INTR_INFO_VALID_MASK)) ==
 287                (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK);
 288}
 289
 290static inline bool cpu_has_vmx_msr_bitmap(void)
 291{
 292        return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS;
 293}
 294
 295static inline bool cpu_has_vmx_tpr_shadow(void)
 296{
 297        return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
 298}
 299
 300static inline bool vm_need_tpr_shadow(struct kvm *kvm)
 301{
 302        return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm));
 303}
 304
 305static inline bool cpu_has_secondary_exec_ctrls(void)
 306{
 307        return vmcs_config.cpu_based_exec_ctrl &
 308                CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
 309}
 310
 311static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
 312{
 313        return vmcs_config.cpu_based_2nd_exec_ctrl &
 314                SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
 315}
 316
 317static inline bool cpu_has_vmx_flexpriority(void)
 318{
 319        return cpu_has_vmx_tpr_shadow() &&
 320                cpu_has_vmx_virtualize_apic_accesses();
 321}
 322
 323static inline bool cpu_has_vmx_ept_execute_only(void)
 324{
 325        return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT;
 326}
 327
 328static inline bool cpu_has_vmx_eptp_uncacheable(void)
 329{
 330        return vmx_capability.ept & VMX_EPTP_UC_BIT;
 331}
 332
 333static inline bool cpu_has_vmx_eptp_writeback(void)
 334{
 335        return vmx_capability.ept & VMX_EPTP_WB_BIT;
 336}
 337
 338static inline bool cpu_has_vmx_ept_2m_page(void)
 339{
 340        return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT;
 341}
 342
 343static inline bool cpu_has_vmx_ept_1g_page(void)
 344{
 345        return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT;
 346}
 347
 348static inline bool cpu_has_vmx_ept_4levels(void)
 349{
 350        return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT;
 351}
 352
 353static inline bool cpu_has_vmx_invept_individual_addr(void)
 354{
 355        return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT;
 356}
 357
 358static inline bool cpu_has_vmx_invept_context(void)
 359{
 360        return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT;
 361}
 362
 363static inline bool cpu_has_vmx_invept_global(void)
 364{
 365        return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
 366}
 367
 368static inline bool cpu_has_vmx_invvpid_single(void)
 369{
 370        return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT;
 371}
 372
 373static inline bool cpu_has_vmx_invvpid_global(void)
 374{
 375        return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
 376}
 377
 378static inline bool cpu_has_vmx_ept(void)
 379{
 380        return vmcs_config.cpu_based_2nd_exec_ctrl &
 381                SECONDARY_EXEC_ENABLE_EPT;
 382}
 383
 384static inline bool cpu_has_vmx_unrestricted_guest(void)
 385{
 386        return vmcs_config.cpu_based_2nd_exec_ctrl &
 387                SECONDARY_EXEC_UNRESTRICTED_GUEST;
 388}
 389
 390static inline bool cpu_has_vmx_ple(void)
 391{
 392        return vmcs_config.cpu_based_2nd_exec_ctrl &
 393                SECONDARY_EXEC_PAUSE_LOOP_EXITING;
 394}
 395
 396static inline bool vm_need_virtualize_apic_accesses(struct kvm *kvm)
 397{
 398        return flexpriority_enabled && irqchip_in_kernel(kvm);
 399}
 400
 401static inline bool cpu_has_vmx_vpid(void)
 402{
 403        return vmcs_config.cpu_based_2nd_exec_ctrl &
 404                SECONDARY_EXEC_ENABLE_VPID;
 405}
 406
 407static inline bool cpu_has_vmx_rdtscp(void)
 408{
 409        return vmcs_config.cpu_based_2nd_exec_ctrl &
 410                SECONDARY_EXEC_RDTSCP;
 411}
 412
 413static inline bool cpu_has_virtual_nmis(void)
 414{
 415        return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
 416}
 417
 418static inline bool cpu_has_vmx_wbinvd_exit(void)
 419{
 420        return vmcs_config.cpu_based_2nd_exec_ctrl &
 421                SECONDARY_EXEC_WBINVD_EXITING;
 422}
 423
 424static inline bool report_flexpriority(void)
 425{
 426        return flexpriority_enabled;
 427}
 428
 429static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
 430{
 431        int i;
 432
 433        for (i = 0; i < vmx->nmsrs; ++i)
 434                if (vmx_msr_index[vmx->guest_msrs[i].index] == msr)
 435                        return i;
 436        return -1;
 437}
 438
 439static inline void __invvpid(int ext, u16 vpid, gva_t gva)
 440{
 441    struct {
 442        u64 vpid : 16;
 443        u64 rsvd : 48;
 444        u64 gva;
 445    } operand = { vpid, 0, gva };
 446
 447    asm volatile (__ex(ASM_VMX_INVVPID)
 448                  /* CF==1 or ZF==1 --> rc = -1 */
 449                  "; ja 1f ; ud2 ; 1:"
 450                  : : "a"(&operand), "c"(ext) : "cc", "memory");
 451}
 452
 453static inline void __invept(int ext, u64 eptp, gpa_t gpa)
 454{
 455        struct {
 456                u64 eptp, gpa;
 457        } operand = {eptp, gpa};
 458
 459        asm volatile (__ex(ASM_VMX_INVEPT)
 460                        /* CF==1 or ZF==1 --> rc = -1 */
 461                        "; ja 1f ; ud2 ; 1:\n"
 462                        : : "a" (&operand), "c" (ext) : "cc", "memory");
 463}
 464
 465static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
 466{
 467        int i;
 468
 469        i = __find_msr_index(vmx, msr);
 470        if (i >= 0)
 471                return &vmx->guest_msrs[i];
 472        return NULL;
 473}
 474
 475static void vmcs_clear(struct vmcs *vmcs)
 476{
 477        u64 phys_addr = __pa(vmcs);
 478        u8 error;
 479
 480        asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0"
 481                      : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
 482                      : "cc", "memory");
 483        if (error)
 484                printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
 485                       vmcs, phys_addr);
 486}
 487
 488static void vmcs_load(struct vmcs *vmcs)
 489{
 490        u64 phys_addr = __pa(vmcs);
 491        u8 error;
 492
 493        asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
 494                        : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
 495                        : "cc", "memory");
 496        if (error)
 497                printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
 498                       vmcs, phys_addr);
 499}
 500
 501static void __vcpu_clear(void *arg)
 502{
 503        struct vcpu_vmx *vmx = arg;
 504        int cpu = raw_smp_processor_id();
 505
 506        if (vmx->vcpu.cpu == cpu)
 507                vmcs_clear(vmx->vmcs);
 508        if (per_cpu(current_vmcs, cpu) == vmx->vmcs)
 509                per_cpu(current_vmcs, cpu) = NULL;
 510        list_del(&vmx->local_vcpus_link);
 511        vmx->vcpu.cpu = -1;
 512        vmx->launched = 0;
 513}
 514
 515static void vcpu_clear(struct vcpu_vmx *vmx)
 516{
 517        if (vmx->vcpu.cpu == -1)
 518                return;
 519        smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1);
 520}
 521
 522static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx)
 523{
 524        if (vmx->vpid == 0)
 525                return;
 526
 527        if (cpu_has_vmx_invvpid_single())
 528                __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0);
 529}
 530
 531static inline void vpid_sync_vcpu_global(void)
 532{
 533        if (cpu_has_vmx_invvpid_global())
 534                __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
 535}
 536
 537static inline void vpid_sync_context(struct vcpu_vmx *vmx)
 538{
 539        if (cpu_has_vmx_invvpid_single())
 540                vpid_sync_vcpu_single(vmx);
 541        else
 542                vpid_sync_vcpu_global();
 543}
 544
 545static inline void ept_sync_global(void)
 546{
 547        if (cpu_has_vmx_invept_global())
 548                __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
 549}
 550
 551static inline void ept_sync_context(u64 eptp)
 552{
 553        if (enable_ept) {
 554                if (cpu_has_vmx_invept_context())
 555                        __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
 556                else
 557                        ept_sync_global();
 558        }
 559}
 560
 561static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa)
 562{
 563        if (enable_ept) {
 564                if (cpu_has_vmx_invept_individual_addr())
 565                        __invept(VMX_EPT_EXTENT_INDIVIDUAL_ADDR,
 566                                        eptp, gpa);
 567                else
 568                        ept_sync_context(eptp);
 569        }
 570}
 571
 572static unsigned long vmcs_readl(unsigned long field)
 573{
 574        unsigned long value = 0;
 575
 576        asm volatile (__ex(ASM_VMX_VMREAD_RDX_RAX)
 577                      : "+a"(value) : "d"(field) : "cc");
 578        return value;
 579}
 580
 581static u16 vmcs_read16(unsigned long field)
 582{
 583        return vmcs_readl(field);
 584}
 585
 586static u32 vmcs_read32(unsigned long field)
 587{
 588        return vmcs_readl(field);
 589}
 590
 591static u64 vmcs_read64(unsigned long field)
 592{
 593#ifdef CONFIG_X86_64
 594        return vmcs_readl(field);
 595#else
 596        return vmcs_readl(field) | ((u64)vmcs_readl(field+1) << 32);
 597#endif
 598}
 599
 600static noinline void vmwrite_error(unsigned long field, unsigned long value)
 601{
 602        printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n",
 603               field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
 604        dump_stack();
 605}
 606
 607static void vmcs_writel(unsigned long field, unsigned long value)
 608{
 609        u8 error;
 610
 611        asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) "; setna %0"
 612                       : "=q"(error) : "a"(value), "d"(field) : "cc");
 613        if (unlikely(error))
 614                vmwrite_error(field, value);
 615}
 616
 617static void vmcs_write16(unsigned long field, u16 value)
 618{
 619        vmcs_writel(field, value);
 620}
 621
 622static void vmcs_write32(unsigned long field, u32 value)
 623{
 624        vmcs_writel(field, value);
 625}
 626
 627static void vmcs_write64(unsigned long field, u64 value)
 628{
 629        vmcs_writel(field, value);
 630#ifndef CONFIG_X86_64
 631        asm volatile ("");
 632        vmcs_writel(field+1, value >> 32);
 633#endif
 634}
 635
 636static void vmcs_clear_bits(unsigned long field, u32 mask)
 637{
 638        vmcs_writel(field, vmcs_readl(field) & ~mask);
 639}
 640
 641static void vmcs_set_bits(unsigned long field, u32 mask)
 642{
 643        vmcs_writel(field, vmcs_readl(field) | mask);
 644}
 645
 646static void update_exception_bitmap(struct kvm_vcpu *vcpu)
 647{
 648        u32 eb;
 649
 650        eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
 651             (1u << NM_VECTOR) | (1u << DB_VECTOR);
 652        if ((vcpu->guest_debug &
 653             (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
 654            (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
 655                eb |= 1u << BP_VECTOR;
 656        if (to_vmx(vcpu)->rmode.vm86_active)
 657                eb = ~0;
 658        if (enable_ept)
 659                eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
 660        if (vcpu->fpu_active)
 661                eb &= ~(1u << NM_VECTOR);
 662        vmcs_write32(EXCEPTION_BITMAP, eb);
 663}
 664
 665static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
 666{
 667        unsigned i;
 668        struct msr_autoload *m = &vmx->msr_autoload;
 669
 670        if (msr == MSR_EFER && cpu_has_load_ia32_efer) {
 671                vmcs_clear_bits(VM_ENTRY_CONTROLS, VM_ENTRY_LOAD_IA32_EFER);
 672                vmcs_clear_bits(VM_EXIT_CONTROLS, VM_EXIT_LOAD_IA32_EFER);
 673                return;
 674        }
 675
 676        for (i = 0; i < m->nr; ++i)
 677                if (m->guest[i].index == msr)
 678                        break;
 679
 680        if (i == m->nr)
 681                return;
 682        --m->nr;
 683        m->guest[i] = m->guest[m->nr];
 684        m->host[i] = m->host[m->nr];
 685        vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
 686        vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
 687}
 688
 689static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
 690                                  u64 guest_val, u64 host_val)
 691{
 692        unsigned i;
 693        struct msr_autoload *m = &vmx->msr_autoload;
 694
 695        if (msr == MSR_EFER && cpu_has_load_ia32_efer) {
 696                vmcs_write64(GUEST_IA32_EFER, guest_val);
 697                vmcs_write64(HOST_IA32_EFER, host_val);
 698                vmcs_set_bits(VM_ENTRY_CONTROLS, VM_ENTRY_LOAD_IA32_EFER);
 699                vmcs_set_bits(VM_EXIT_CONTROLS, VM_EXIT_LOAD_IA32_EFER);
 700                return;
 701        }
 702
 703        for (i = 0; i < m->nr; ++i)
 704                if (m->guest[i].index == msr)
 705                        break;
 706
 707        if (i == m->nr) {
 708                ++m->nr;
 709                vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
 710                vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
 711        }
 712
 713        m->guest[i].index = msr;
 714        m->guest[i].value = guest_val;
 715        m->host[i].index = msr;
 716        m->host[i].value = host_val;
 717}
 718
 719static void reload_tss(void)
 720{
 721        /*
 722         * VT restores TR but not its size.  Useless.
 723         */
 724        struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
 725        struct desc_struct *descs;
 726
 727        descs = (void *)gdt->address;
 728        descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
 729        load_TR_desc();
 730}
 731
 732static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
 733{
 734        u64 guest_efer;
 735        u64 ignore_bits;
 736
 737        guest_efer = vmx->vcpu.arch.efer;
 738
 739        /*
 740         * NX is emulated; LMA and LME handled by hardware; SCE meaninless
 741         * outside long mode
 742         */
 743        ignore_bits = EFER_NX | EFER_SCE;
 744#ifdef CONFIG_X86_64
 745        ignore_bits |= EFER_LMA | EFER_LME;
 746        /* SCE is meaningful only in long mode on Intel */
 747        if (guest_efer & EFER_LMA)
 748                ignore_bits &= ~(u64)EFER_SCE;
 749#endif
 750        guest_efer &= ~ignore_bits;
 751        guest_efer |= host_efer & ignore_bits;
 752        vmx->guest_msrs[efer_offset].data = guest_efer;
 753        vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
 754
 755        clear_atomic_switch_msr(vmx, MSR_EFER);
 756        /* On ept, can't emulate nx, and must switch nx atomically */
 757        if (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX)) {
 758                guest_efer = vmx->vcpu.arch.efer;
 759                if (!(guest_efer & EFER_LMA))
 760                        guest_efer &= ~EFER_LME;
 761                add_atomic_switch_msr(vmx, MSR_EFER, guest_efer, host_efer);
 762                return false;
 763        }
 764
 765        return true;
 766}
 767
 768static unsigned long segment_base(u16 selector)
 769{
 770        struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
 771        struct desc_struct *d;
 772        unsigned long table_base;
 773        unsigned long v;
 774
 775        if (!(selector & ~3))
 776                return 0;
 777
 778        table_base = gdt->address;
 779
 780        if (selector & 4) {           /* from ldt */
 781                u16 ldt_selector = kvm_read_ldt();
 782
 783                if (!(ldt_selector & ~3))
 784                        return 0;
 785
 786                table_base = segment_base(ldt_selector);
 787        }
 788        d = (struct desc_struct *)(table_base + (selector & ~7));
 789        v = get_desc_base(d);
 790#ifdef CONFIG_X86_64
 791       if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
 792               v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
 793#endif
 794        return v;
 795}
 796
 797static inline unsigned long kvm_read_tr_base(void)
 798{
 799        u16 tr;
 800        asm("str %0" : "=g"(tr));
 801        return segment_base(tr);
 802}
 803
 804static void vmx_save_host_state(struct kvm_vcpu *vcpu)
 805{
 806        struct vcpu_vmx *vmx = to_vmx(vcpu);
 807        int i;
 808
 809        if (vmx->host_state.loaded)
 810                return;
 811
 812        vmx->host_state.loaded = 1;
 813        /*
 814         * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
 815         * allow segment selectors with cpl > 0 or ti == 1.
 816         */
 817        vmx->host_state.ldt_sel = kvm_read_ldt();
 818        vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
 819        savesegment(fs, vmx->host_state.fs_sel);
 820        if (!(vmx->host_state.fs_sel & 7)) {
 821                vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
 822                vmx->host_state.fs_reload_needed = 0;
 823        } else {
 824                vmcs_write16(HOST_FS_SELECTOR, 0);
 825                vmx->host_state.fs_reload_needed = 1;
 826        }
 827        savesegment(gs, vmx->host_state.gs_sel);
 828        if (!(vmx->host_state.gs_sel & 7))
 829                vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel);
 830        else {
 831                vmcs_write16(HOST_GS_SELECTOR, 0);
 832                vmx->host_state.gs_ldt_reload_needed = 1;
 833        }
 834
 835#ifdef CONFIG_X86_64
 836        vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
 837        vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
 838#else
 839        vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel));
 840        vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel));
 841#endif
 842
 843#ifdef CONFIG_X86_64
 844        rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
 845        if (is_long_mode(&vmx->vcpu))
 846                wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
 847#endif
 848        for (i = 0; i < vmx->save_nmsrs; ++i)
 849                kvm_set_shared_msr(vmx->guest_msrs[i].index,
 850                                   vmx->guest_msrs[i].data,
 851                                   vmx->guest_msrs[i].mask);
 852}
 853
 854static void __vmx_load_host_state(struct vcpu_vmx *vmx)
 855{
 856        if (!vmx->host_state.loaded)
 857                return;
 858
 859        ++vmx->vcpu.stat.host_state_reload;
 860        vmx->host_state.loaded = 0;
 861#ifdef CONFIG_X86_64
 862        if (is_long_mode(&vmx->vcpu))
 863                rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
 864#endif
 865        if (vmx->host_state.gs_ldt_reload_needed) {
 866                kvm_load_ldt(vmx->host_state.ldt_sel);
 867#ifdef CONFIG_X86_64
 868                load_gs_index(vmx->host_state.gs_sel);
 869#else
 870                loadsegment(gs, vmx->host_state.gs_sel);
 871#endif
 872        }
 873        if (vmx->host_state.fs_reload_needed)
 874                loadsegment(fs, vmx->host_state.fs_sel);
 875        reload_tss();
 876#ifdef CONFIG_X86_64
 877        wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
 878#endif
 879        if (current_thread_info()->status & TS_USEDFPU)
 880                clts();
 881        load_gdt(&__get_cpu_var(host_gdt));
 882}
 883
 884static void vmx_load_host_state(struct vcpu_vmx *vmx)
 885{
 886        preempt_disable();
 887        __vmx_load_host_state(vmx);
 888        preempt_enable();
 889}
 890
 891/*
 892 * Switches to specified vcpu, until a matching vcpu_put(), but assumes
 893 * vcpu mutex is already taken.
 894 */
 895static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 896{
 897        struct vcpu_vmx *vmx = to_vmx(vcpu);
 898        u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
 899
 900        if (!vmm_exclusive)
 901                kvm_cpu_vmxon(phys_addr);
 902        else if (vcpu->cpu != cpu)
 903                vcpu_clear(vmx);
 904
 905        if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
 906                per_cpu(current_vmcs, cpu) = vmx->vmcs;
 907                vmcs_load(vmx->vmcs);
 908        }
 909
 910        if (vcpu->cpu != cpu) {
 911                struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
 912                unsigned long sysenter_esp;
 913
 914                kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
 915                local_irq_disable();
 916                list_add(&vmx->local_vcpus_link,
 917                         &per_cpu(vcpus_on_cpu, cpu));
 918                local_irq_enable();
 919
 920                /*
 921                 * Linux uses per-cpu TSS and GDT, so set these when switching
 922                 * processors.
 923                 */
 924                vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */
 925                vmcs_writel(HOST_GDTR_BASE, gdt->address);   /* 22.2.4 */
 926
 927                rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
 928                vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
 929        }
 930}
 931
 932static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
 933{
 934        __vmx_load_host_state(to_vmx(vcpu));
 935        if (!vmm_exclusive) {
 936                __vcpu_clear(to_vmx(vcpu));
 937                kvm_cpu_vmxoff();
 938        }
 939}
 940
 941static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
 942{
 943        ulong cr0;
 944
 945        if (vcpu->fpu_active)
 946                return;
 947        vcpu->fpu_active = 1;
 948        cr0 = vmcs_readl(GUEST_CR0);
 949        cr0 &= ~(X86_CR0_TS | X86_CR0_MP);
 950        cr0 |= kvm_read_cr0_bits(vcpu, X86_CR0_TS | X86_CR0_MP);
 951        vmcs_writel(GUEST_CR0, cr0);
 952        update_exception_bitmap(vcpu);
 953        vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
 954        vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
 955}
 956
 957static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
 958
 959static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
 960{
 961        vmx_decache_cr0_guest_bits(vcpu);
 962        vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP);
 963        update_exception_bitmap(vcpu);
 964        vcpu->arch.cr0_guest_owned_bits = 0;
 965        vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
 966        vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
 967}
 968
 969static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
 970{
 971        unsigned long rflags, save_rflags;
 972
 973        rflags = vmcs_readl(GUEST_RFLAGS);
 974        if (to_vmx(vcpu)->rmode.vm86_active) {
 975                rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
 976                save_rflags = to_vmx(vcpu)->rmode.save_rflags;
 977                rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
 978        }
 979        return rflags;
 980}
 981
 982static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
 983{
 984        if (to_vmx(vcpu)->rmode.vm86_active) {
 985                to_vmx(vcpu)->rmode.save_rflags = rflags;
 986                rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
 987        }
 988        vmcs_writel(GUEST_RFLAGS, rflags);
 989}
 990
 991static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
 992{
 993        u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
 994        int ret = 0;
 995
 996        if (interruptibility & GUEST_INTR_STATE_STI)
 997                ret |= KVM_X86_SHADOW_INT_STI;
 998        if (interruptibility & GUEST_INTR_STATE_MOV_SS)
 999                ret |= KVM_X86_SHADOW_INT_MOV_SS;
1000
1001        return ret & mask;
1002}
1003
1004static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
1005{
1006        u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
1007        u32 interruptibility = interruptibility_old;
1008
1009        interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
1010
1011        if (mask & KVM_X86_SHADOW_INT_MOV_SS)
1012                interruptibility |= GUEST_INTR_STATE_MOV_SS;
1013        else if (mask & KVM_X86_SHADOW_INT_STI)
1014                interruptibility |= GUEST_INTR_STATE_STI;
1015
1016        if ((interruptibility != interruptibility_old))
1017                vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
1018}
1019
1020static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
1021{
1022        unsigned long rip;
1023
1024        rip = kvm_rip_read(vcpu);
1025        rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
1026        kvm_rip_write(vcpu, rip);
1027
1028        /* skipping an emulated instruction also counts */
1029        vmx_set_interrupt_shadow(vcpu, 0);
1030}
1031
1032static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
1033{
1034        /* Ensure that we clear the HLT state in the VMCS.  We don't need to
1035         * explicitly skip the instruction because if the HLT state is set, then
1036         * the instruction is already executing and RIP has already been
1037         * advanced. */
1038        if (!yield_on_hlt &&
1039            vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
1040                vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
1041}
1042
1043static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
1044                                bool has_error_code, u32 error_code,
1045                                bool reinject)
1046{
1047        struct vcpu_vmx *vmx = to_vmx(vcpu);
1048        u32 intr_info = nr | INTR_INFO_VALID_MASK;
1049
1050        if (has_error_code) {
1051                vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
1052                intr_info |= INTR_INFO_DELIVER_CODE_MASK;
1053        }
1054
1055        if (vmx->rmode.vm86_active) {
1056                if (kvm_inject_realmode_interrupt(vcpu, nr) != EMULATE_DONE)
1057                        kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
1058                return;
1059        }
1060
1061        if (kvm_exception_is_soft(nr)) {
1062                vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
1063                             vmx->vcpu.arch.event_exit_inst_len);
1064                intr_info |= INTR_TYPE_SOFT_EXCEPTION;
1065        } else
1066                intr_info |= INTR_TYPE_HARD_EXCEPTION;
1067
1068        vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
1069        vmx_clear_hlt(vcpu);
1070}
1071
1072static bool vmx_rdtscp_supported(void)
1073{
1074        return cpu_has_vmx_rdtscp();
1075}
1076
1077/*
1078 * Swap MSR entry in host/guest MSR entry array.
1079 */
1080static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
1081{
1082        struct shared_msr_entry tmp;
1083
1084        tmp = vmx->guest_msrs[to];
1085        vmx->guest_msrs[to] = vmx->guest_msrs[from];
1086        vmx->guest_msrs[from] = tmp;
1087}
1088
1089/*
1090 * Set up the vmcs to automatically save and restore system
1091 * msrs.  Don't touch the 64-bit msrs if the guest is in legacy
1092 * mode, as fiddling with msrs is very expensive.
1093 */
1094static void setup_msrs(struct vcpu_vmx *vmx)
1095{
1096        int save_nmsrs, index;
1097        unsigned long *msr_bitmap;
1098
1099        vmx_load_host_state(vmx);
1100        save_nmsrs = 0;
1101#ifdef CONFIG_X86_64
1102        if (is_long_mode(&vmx->vcpu)) {
1103                index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
1104                if (index >= 0)
1105                        move_msr_up(vmx, index, save_nmsrs++);
1106                index = __find_msr_index(vmx, MSR_LSTAR);
1107                if (index >= 0)
1108                        move_msr_up(vmx, index, save_nmsrs++);
1109                index = __find_msr_index(vmx, MSR_CSTAR);
1110                if (index >= 0)
1111                        move_msr_up(vmx, index, save_nmsrs++);
1112                index = __find_msr_index(vmx, MSR_TSC_AUX);
1113                if (index >= 0 && vmx->rdtscp_enabled)
1114                        move_msr_up(vmx, index, save_nmsrs++);
1115                /*
1116                 * MSR_STAR is only needed on long mode guests, and only
1117                 * if efer.sce is enabled.
1118                 */
1119                index = __find_msr_index(vmx, MSR_STAR);
1120                if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE))
1121                        move_msr_up(vmx, index, save_nmsrs++);
1122        }
1123#endif
1124        index = __find_msr_index(vmx, MSR_EFER);
1125        if (index >= 0 && update_transition_efer(vmx, index))
1126                move_msr_up(vmx, index, save_nmsrs++);
1127
1128        vmx->save_nmsrs = save_nmsrs;
1129
1130        if (cpu_has_vmx_msr_bitmap()) {
1131                if (is_long_mode(&vmx->vcpu))
1132                        msr_bitmap = vmx_msr_bitmap_longmode;
1133                else
1134                        msr_bitmap = vmx_msr_bitmap_legacy;
1135
1136                vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
1137        }
1138}
1139
1140/*
1141 * reads and returns guest's timestamp counter "register"
1142 * guest_tsc = host_tsc + tsc_offset    -- 21.3
1143 */
1144static u64 guest_read_tsc(void)
1145{
1146        u64 host_tsc, tsc_offset;
1147
1148        rdtscll(host_tsc);
1149        tsc_offset = vmcs_read64(TSC_OFFSET);
1150        return host_tsc + tsc_offset;
1151}
1152
1153/*
1154 * writes 'offset' into guest's timestamp counter offset register
1155 */
1156static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
1157{
1158        vmcs_write64(TSC_OFFSET, offset);
1159}
1160
1161static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
1162{
1163        u64 offset = vmcs_read64(TSC_OFFSET);
1164        vmcs_write64(TSC_OFFSET, offset + adjustment);
1165}
1166
1167/*
1168 * Reads an msr value (of 'msr_index') into 'pdata'.
1169 * Returns 0 on success, non-0 otherwise.
1170 * Assumes vcpu_load() was already called.
1171 */
1172static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
1173{
1174        u64 data;
1175        struct shared_msr_entry *msr;
1176
1177        if (!pdata) {
1178                printk(KERN_ERR "BUG: get_msr called with NULL pdata\n");
1179                return -EINVAL;
1180        }
1181
1182        switch (msr_index) {
1183#ifdef CONFIG_X86_64
1184        case MSR_FS_BASE:
1185                data = vmcs_readl(GUEST_FS_BASE);
1186                break;
1187        case MSR_GS_BASE:
1188                data = vmcs_readl(GUEST_GS_BASE);
1189                break;
1190        case MSR_KERNEL_GS_BASE:
1191                vmx_load_host_state(to_vmx(vcpu));
1192                data = to_vmx(vcpu)->msr_guest_kernel_gs_base;
1193                break;
1194#endif
1195        case MSR_EFER:
1196                return kvm_get_msr_common(vcpu, msr_index, pdata);
1197        case MSR_IA32_TSC:
1198                data = guest_read_tsc();
1199                break;
1200        case MSR_IA32_SYSENTER_CS:
1201                data = vmcs_read32(GUEST_SYSENTER_CS);
1202                break;
1203        case MSR_IA32_SYSENTER_EIP:
1204                data = vmcs_readl(GUEST_SYSENTER_EIP);
1205                break;
1206        case MSR_IA32_SYSENTER_ESP:
1207                data = vmcs_readl(GUEST_SYSENTER_ESP);
1208                break;
1209        case MSR_TSC_AUX:
1210                if (!to_vmx(vcpu)->rdtscp_enabled)
1211                        return 1;
1212                /* Otherwise falls through */
1213        default:
1214                vmx_load_host_state(to_vmx(vcpu));
1215                msr = find_msr_entry(to_vmx(vcpu), msr_index);
1216                if (msr) {
1217                        vmx_load_host_state(to_vmx(vcpu));
1218                        data = msr->data;
1219                        break;
1220                }
1221                return kvm_get_msr_common(vcpu, msr_index, pdata);
1222        }
1223
1224        *pdata = data;
1225        return 0;
1226}
1227
1228/*
1229 * Writes msr value into into the appropriate "register".
1230 * Returns 0 on success, non-0 otherwise.
1231 * Assumes vcpu_load() was already called.
1232 */
1233static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1234{
1235        struct vcpu_vmx *vmx = to_vmx(vcpu);
1236        struct shared_msr_entry *msr;
1237        int ret = 0;
1238
1239        switch (msr_index) {
1240        case MSR_EFER:
1241                vmx_load_host_state(vmx);
1242                ret = kvm_set_msr_common(vcpu, msr_index, data);
1243                break;
1244#ifdef CONFIG_X86_64
1245        case MSR_FS_BASE:
1246                vmcs_writel(GUEST_FS_BASE, data);
1247                break;
1248        case MSR_GS_BASE:
1249                vmcs_writel(GUEST_GS_BASE, data);
1250                break;
1251        case MSR_KERNEL_GS_BASE:
1252                vmx_load_host_state(vmx);
1253                vmx->msr_guest_kernel_gs_base = data;
1254                break;
1255#endif
1256        case MSR_IA32_SYSENTER_CS:
1257                vmcs_write32(GUEST_SYSENTER_CS, data);
1258                break;
1259        case MSR_IA32_SYSENTER_EIP:
1260                vmcs_writel(GUEST_SYSENTER_EIP, data);
1261                break;
1262        case MSR_IA32_SYSENTER_ESP:
1263                vmcs_writel(GUEST_SYSENTER_ESP, data);
1264                break;
1265        case MSR_IA32_TSC:
1266                kvm_write_tsc(vcpu, data);
1267                break;
1268        case MSR_IA32_CR_PAT:
1269                if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
1270                        vmcs_write64(GUEST_IA32_PAT, data);
1271                        vcpu->arch.pat = data;
1272                        break;
1273                }
1274                ret = kvm_set_msr_common(vcpu, msr_index, data);
1275                break;
1276        case MSR_TSC_AUX:
1277                if (!vmx->rdtscp_enabled)
1278                        return 1;
1279                /* Check reserved bit, higher 32 bits should be zero */
1280                if ((data >> 32) != 0)
1281                        return 1;
1282                /* Otherwise falls through */
1283        default:
1284                msr = find_msr_entry(vmx, msr_index);
1285                if (msr) {
1286                        vmx_load_host_state(vmx);
1287                        msr->data = data;
1288                        break;
1289                }
1290                ret = kvm_set_msr_common(vcpu, msr_index, data);
1291        }
1292
1293        return ret;
1294}
1295
1296static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
1297{
1298        __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
1299        switch (reg) {
1300        case VCPU_REGS_RSP:
1301                vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
1302                break;
1303        case VCPU_REGS_RIP:
1304                vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
1305                break;
1306        case VCPU_EXREG_PDPTR:
1307                if (enable_ept)
1308                        ept_save_pdptrs(vcpu);
1309                break;
1310        default:
1311                break;
1312        }
1313}
1314
1315static void set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
1316{
1317        if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1318                vmcs_writel(GUEST_DR7, dbg->arch.debugreg[7]);
1319        else
1320                vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
1321
1322        update_exception_bitmap(vcpu);
1323}
1324
1325static __init int cpu_has_kvm_support(void)
1326{
1327        return cpu_has_vmx();
1328}
1329
1330static __init int vmx_disabled_by_bios(void)
1331{
1332        u64 msr;
1333
1334        rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
1335        if (msr & FEATURE_CONTROL_LOCKED) {
1336                if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
1337                        && tboot_enabled())
1338                        return 1;
1339                if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
1340                        && !tboot_enabled()) {
1341                        printk(KERN_WARNING "kvm: disable TXT in the BIOS or "
1342                                " activate TXT before enabling KVM\n");
1343                        return 1;
1344                }
1345        }
1346
1347        return 0;
1348        /* locked but not enabled */
1349}
1350
1351static void kvm_cpu_vmxon(u64 addr)
1352{
1353        asm volatile (ASM_VMX_VMXON_RAX
1354                        : : "a"(&addr), "m"(addr)
1355                        : "memory", "cc");
1356}
1357
1358static int hardware_enable(void *garbage)
1359{
1360        int cpu = raw_smp_processor_id();
1361        u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
1362        u64 old, test_bits;
1363
1364        if (read_cr4() & X86_CR4_VMXE)
1365                return -EBUSY;
1366
1367        INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu));
1368        rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
1369
1370        test_bits = FEATURE_CONTROL_LOCKED;
1371        test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
1372        if (tboot_enabled())
1373                test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX;
1374
1375        if ((old & test_bits) != test_bits) {
1376                /* enable and lock */
1377                wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
1378        }
1379        write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
1380
1381        if (vmm_exclusive) {
1382                kvm_cpu_vmxon(phys_addr);
1383                ept_sync_global();
1384        }
1385
1386        store_gdt(&__get_cpu_var(host_gdt));
1387
1388        return 0;
1389}
1390
1391static void vmclear_local_vcpus(void)
1392{
1393        int cpu = raw_smp_processor_id();
1394        struct vcpu_vmx *vmx, *n;
1395
1396        list_for_each_entry_safe(vmx, n, &per_cpu(vcpus_on_cpu, cpu),
1397                                 local_vcpus_link)
1398                __vcpu_clear(vmx);
1399}
1400
1401
1402/* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
1403 * tricks.
1404 */
1405static void kvm_cpu_vmxoff(void)
1406{
1407        asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
1408}
1409
1410static void hardware_disable(void *garbage)
1411{
1412        if (vmm_exclusive) {
1413                vmclear_local_vcpus();
1414                kvm_cpu_vmxoff();
1415        }
1416        write_cr4(read_cr4() & ~X86_CR4_VMXE);
1417}
1418
1419static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
1420                                      u32 msr, u32 *result)
1421{
1422        u32 vmx_msr_low, vmx_msr_high;
1423        u32 ctl = ctl_min | ctl_opt;
1424
1425        rdmsr(msr, vmx_msr_low, vmx_msr_high);
1426
1427        ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
1428        ctl |= vmx_msr_low;  /* bit == 1 in low word  ==> must be one  */
1429
1430        /* Ensure minimum (required) set of control bits are supported. */
1431        if (ctl_min & ~ctl)
1432                return -EIO;
1433
1434        *result = ctl;
1435        return 0;
1436}
1437
1438static __init bool allow_1_setting(u32 msr, u32 ctl)
1439{
1440        u32 vmx_msr_low, vmx_msr_high;
1441
1442        rdmsr(msr, vmx_msr_low, vmx_msr_high);
1443        return vmx_msr_high & ctl;
1444}
1445
1446static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1447{
1448        u32 vmx_msr_low, vmx_msr_high;
1449        u32 min, opt, min2, opt2;
1450        u32 _pin_based_exec_control = 0;
1451        u32 _cpu_based_exec_control = 0;
1452        u32 _cpu_based_2nd_exec_control = 0;
1453        u32 _vmexit_control = 0;
1454        u32 _vmentry_control = 0;
1455
1456        min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
1457        opt = PIN_BASED_VIRTUAL_NMIS;
1458        if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
1459                                &_pin_based_exec_control) < 0)
1460                return -EIO;
1461
1462        min =
1463#ifdef CONFIG_X86_64
1464              CPU_BASED_CR8_LOAD_EXITING |
1465              CPU_BASED_CR8_STORE_EXITING |
1466#endif
1467              CPU_BASED_CR3_LOAD_EXITING |
1468              CPU_BASED_CR3_STORE_EXITING |
1469              CPU_BASED_USE_IO_BITMAPS |
1470              CPU_BASED_MOV_DR_EXITING |
1471              CPU_BASED_USE_TSC_OFFSETING |
1472              CPU_BASED_MWAIT_EXITING |
1473              CPU_BASED_MONITOR_EXITING |
1474              CPU_BASED_INVLPG_EXITING;
1475
1476        if (yield_on_hlt)
1477                min |= CPU_BASED_HLT_EXITING;
1478
1479        opt = CPU_BASED_TPR_SHADOW |
1480              CPU_BASED_USE_MSR_BITMAPS |
1481              CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
1482        if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
1483                                &_cpu_based_exec_control) < 0)
1484                return -EIO;
1485#ifdef CONFIG_X86_64
1486        if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
1487                _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
1488                                           ~CPU_BASED_CR8_STORE_EXITING;
1489#endif
1490        if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
1491                min2 = 0;
1492                opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
1493                        SECONDARY_EXEC_WBINVD_EXITING |
1494                        SECONDARY_EXEC_ENABLE_VPID |
1495                        SECONDARY_EXEC_ENABLE_EPT |
1496                        SECONDARY_EXEC_UNRESTRICTED_GUEST |
1497                        SECONDARY_EXEC_PAUSE_LOOP_EXITING |
1498                        SECONDARY_EXEC_RDTSCP;
1499                if (adjust_vmx_controls(min2, opt2,
1500                                        MSR_IA32_VMX_PROCBASED_CTLS2,
1501                                        &_cpu_based_2nd_exec_control) < 0)
1502                        return -EIO;
1503        }
1504#ifndef CONFIG_X86_64
1505        if (!(_cpu_based_2nd_exec_control &
1506                                SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
1507                _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
1508#endif
1509        if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
1510                /* CR3 accesses and invlpg don't need to cause VM Exits when EPT
1511                   enabled */
1512                _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
1513                                             CPU_BASED_CR3_STORE_EXITING |
1514                                             CPU_BASED_INVLPG_EXITING);
1515                rdmsr(MSR_IA32_VMX_EPT_VPID_CAP,
1516                      vmx_capability.ept, vmx_capability.vpid);
1517        }
1518
1519        min = 0;
1520#ifdef CONFIG_X86_64
1521        min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
1522#endif
1523        opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT;
1524        if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
1525                                &_vmexit_control) < 0)
1526                return -EIO;
1527
1528        min = 0;
1529        opt = VM_ENTRY_LOAD_IA32_PAT;
1530        if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
1531                                &_vmentry_control) < 0)
1532                return -EIO;
1533
1534        rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
1535
1536        /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
1537        if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
1538                return -EIO;
1539
1540#ifdef CONFIG_X86_64
1541        /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
1542        if (vmx_msr_high & (1u<<16))
1543                return -EIO;
1544#endif
1545
1546        /* Require Write-Back (WB) memory type for VMCS accesses. */
1547        if (((vmx_msr_high >> 18) & 15) != 6)
1548                return -EIO;
1549
1550        vmcs_conf->size = vmx_msr_high & 0x1fff;
1551        vmcs_conf->order = get_order(vmcs_config.size);
1552        vmcs_conf->revision_id = vmx_msr_low;
1553
1554        vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
1555        vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
1556        vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
1557        vmcs_conf->vmexit_ctrl         = _vmexit_control;
1558        vmcs_conf->vmentry_ctrl        = _vmentry_control;
1559
1560        cpu_has_load_ia32_efer =
1561                allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
1562                                VM_ENTRY_LOAD_IA32_EFER)
1563                && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
1564                                   VM_EXIT_LOAD_IA32_EFER);
1565
1566        return 0;
1567}
1568
1569static struct vmcs *alloc_vmcs_cpu(int cpu)
1570{
1571        int node = cpu_to_node(cpu);
1572        struct page *pages;
1573        struct vmcs *vmcs;
1574
1575        pages = alloc_pages_exact_node(node, GFP_KERNEL, vmcs_config.order);
1576        if (!pages)
1577                return NULL;
1578        vmcs = page_address(pages);
1579        memset(vmcs, 0, vmcs_config.size);
1580        vmcs->revision_id = vmcs_config.revision_id; /* vmcs revision id */
1581        return vmcs;
1582}
1583
/* Allocates a VMCS region for the cpu this code is currently running on. */
static struct vmcs *alloc_vmcs(void)
{
        int this_cpu = raw_smp_processor_id();

        return alloc_vmcs_cpu(this_cpu);
}
1588
1589static void free_vmcs(struct vmcs *vmcs)
1590{
1591        free_pages((unsigned long)vmcs, vmcs_config.order);
1592}
1593
1594static void free_kvm_area(void)
1595{
1596        int cpu;
1597
1598        for_each_possible_cpu(cpu) {
1599                free_vmcs(per_cpu(vmxarea, cpu));
1600                per_cpu(vmxarea, cpu) = NULL;
1601        }
1602}
1603
1604static __init int alloc_kvm_area(void)
1605{
1606        int cpu;
1607
1608        for_each_possible_cpu(cpu) {
1609                struct vmcs *vmcs;
1610
1611                vmcs = alloc_vmcs_cpu(cpu);
1612                if (!vmcs) {
1613                        free_kvm_area();
1614                        return -ENOMEM;
1615                }
1616
1617                per_cpu(vmxarea, cpu) = vmcs;
1618        }
1619        return 0;
1620}
1621
1622static __init int hardware_setup(void)
1623{
1624        if (setup_vmcs_config(&vmcs_config) < 0)
1625                return -EIO;
1626
1627        if (boot_cpu_has(X86_FEATURE_NX))
1628                kvm_enable_efer_bits(EFER_NX);
1629
1630        if (!cpu_has_vmx_vpid())
1631                enable_vpid = 0;
1632
1633        if (!cpu_has_vmx_ept() ||
1634            !cpu_has_vmx_ept_4levels()) {
1635                enable_ept = 0;
1636                enable_unrestricted_guest = 0;
1637        }
1638
1639        if (!cpu_has_vmx_unrestricted_guest())
1640                enable_unrestricted_guest = 0;
1641
1642        if (!cpu_has_vmx_flexpriority())
1643                flexpriority_enabled = 0;
1644
1645        if (!cpu_has_vmx_tpr_shadow())
1646                kvm_x86_ops->update_cr8_intercept = NULL;
1647
1648        if (enable_ept && !cpu_has_vmx_ept_2m_page())
1649                kvm_disable_largepages();
1650
1651        if (!cpu_has_vmx_ple())
1652                ple_gap = 0;
1653
1654        return alloc_kvm_area();
1655}
1656
1657static __exit void hardware_unsetup(void)
1658{
1659        free_kvm_area();
1660}
1661
1662static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save)
1663{
1664        struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1665
1666        if (vmcs_readl(sf->base) == save->base && (save->base & AR_S_MASK)) {
1667                vmcs_write16(sf->selector, save->selector);
1668                vmcs_writel(sf->base, save->base);
1669                vmcs_write32(sf->limit, save->limit);
1670                vmcs_write32(sf->ar_bytes, save->ar);
1671        } else {
1672                u32 dpl = (vmcs_read16(sf->selector) & SELECTOR_RPL_MASK)
1673                        << AR_DPL_SHIFT;
1674                vmcs_write32(sf->ar_bytes, 0x93 | dpl);
1675        }
1676}
1677
1678static void enter_pmode(struct kvm_vcpu *vcpu)
1679{
1680        unsigned long flags;
1681        struct vcpu_vmx *vmx = to_vmx(vcpu);
1682
1683        vmx->emulation_required = 1;
1684        vmx->rmode.vm86_active = 0;
1685
1686        vmcs_writel(GUEST_TR_BASE, vmx->rmode.tr.base);
1687        vmcs_write32(GUEST_TR_LIMIT, vmx->rmode.tr.limit);
1688        vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar);
1689
1690        flags = vmcs_readl(GUEST_RFLAGS);
1691        flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
1692        flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
1693        vmcs_writel(GUEST_RFLAGS, flags);
1694
1695        vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
1696                        (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
1697
1698        update_exception_bitmap(vcpu);
1699
1700        if (emulate_invalid_guest_state)
1701                return;
1702
1703        fix_pmode_dataseg(VCPU_SREG_ES, &vmx->rmode.es);
1704        fix_pmode_dataseg(VCPU_SREG_DS, &vmx->rmode.ds);
1705        fix_pmode_dataseg(VCPU_SREG_GS, &vmx->rmode.gs);
1706        fix_pmode_dataseg(VCPU_SREG_FS, &vmx->rmode.fs);
1707
1708        vmcs_write16(GUEST_SS_SELECTOR, 0);
1709        vmcs_write32(GUEST_SS_AR_BYTES, 0x93);
1710
1711        vmcs_write16(GUEST_CS_SELECTOR,
1712                     vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK);
1713        vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
1714}
1715
1716static gva_t rmode_tss_base(struct kvm *kvm)
1717{
1718        if (!kvm->arch.tss_addr) {
1719                struct kvm_memslots *slots;
1720                gfn_t base_gfn;
1721
1722                slots = kvm_memslots(kvm);
1723                base_gfn = slots->memslots[0].base_gfn +
1724                                 kvm->memslots->memslots[0].npages - 3;
1725                return base_gfn << PAGE_SHIFT;
1726        }
1727        return kvm->arch.tss_addr;
1728}
1729
1730static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
1731{
1732        struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1733
1734        save->selector = vmcs_read16(sf->selector);
1735        save->base = vmcs_readl(sf->base);
1736        save->limit = vmcs_read32(sf->limit);
1737        save->ar = vmcs_read32(sf->ar_bytes);
1738        vmcs_write16(sf->selector, save->base >> 4);
1739        vmcs_write32(sf->base, save->base & 0xffff0);
1740        vmcs_write32(sf->limit, 0xffff);
1741        vmcs_write32(sf->ar_bytes, 0xf3);
1742        if (save->base & 0xf)
1743                printk_once(KERN_WARNING "kvm: segment base is not paragraph"
1744                            " aligned when entering protected mode (seg=%d)",
1745                            seg);
1746}
1747
1748static void enter_rmode(struct kvm_vcpu *vcpu)
1749{
1750        unsigned long flags;
1751        struct vcpu_vmx *vmx = to_vmx(vcpu);
1752
1753        if (enable_unrestricted_guest)
1754                return;
1755
1756        vmx->emulation_required = 1;
1757        vmx->rmode.vm86_active = 1;
1758
1759        vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
1760        vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
1761
1762        vmx->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
1763        vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
1764
1765        vmx->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
1766        vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
1767
1768        flags = vmcs_readl(GUEST_RFLAGS);
1769        vmx->rmode.save_rflags = flags;
1770
1771        flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
1772
1773        vmcs_writel(GUEST_RFLAGS, flags);
1774        vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
1775        update_exception_bitmap(vcpu);
1776
1777        if (emulate_invalid_guest_state)
1778                goto continue_rmode;
1779
1780        vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4);
1781        vmcs_write32(GUEST_SS_LIMIT, 0xffff);
1782        vmcs_write32(GUEST_SS_AR_BYTES, 0xf3);
1783
1784        vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
1785        vmcs_write32(GUEST_CS_LIMIT, 0xffff);
1786        if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000)
1787                vmcs_writel(GUEST_CS_BASE, 0xf0000);
1788        vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);
1789
1790        fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es);
1791        fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds);
1792        fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs);
1793        fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs);
1794
1795continue_rmode:
1796        kvm_mmu_reset_context(vcpu);
1797        init_rmode(vcpu->kvm);
1798}
1799
1800static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
1801{
1802        struct vcpu_vmx *vmx = to_vmx(vcpu);
1803        struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
1804
1805        if (!msr)
1806                return;
1807
1808        /*
1809         * Force kernel_gs_base reloading before EFER changes, as control
1810         * of this msr depends on is_long_mode().
1811         */
1812        vmx_load_host_state(to_vmx(vcpu));
1813        vcpu->arch.efer = efer;
1814        if (efer & EFER_LMA) {
1815                vmcs_write32(VM_ENTRY_CONTROLS,
1816                             vmcs_read32(VM_ENTRY_CONTROLS) |
1817                             VM_ENTRY_IA32E_MODE);
1818                msr->data = efer;
1819        } else {
1820                vmcs_write32(VM_ENTRY_CONTROLS,
1821                             vmcs_read32(VM_ENTRY_CONTROLS) &
1822                             ~VM_ENTRY_IA32E_MODE);
1823
1824                msr->data = efer & ~EFER_LME;
1825        }
1826        setup_msrs(vmx);
1827}
1828
1829#ifdef CONFIG_X86_64
1830
1831static void enter_lmode(struct kvm_vcpu *vcpu)
1832{
1833        u32 guest_tr_ar;
1834
1835        guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
1836        if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
1837                printk(KERN_DEBUG "%s: tss fixup for long mode. \n",
1838                       __func__);
1839                vmcs_write32(GUEST_TR_AR_BYTES,
1840                             (guest_tr_ar & ~AR_TYPE_MASK)
1841                             | AR_TYPE_BUSY_64_TSS);
1842        }
1843        vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA);
1844}
1845
1846static void exit_lmode(struct kvm_vcpu *vcpu)
1847{
1848        vmcs_write32(VM_ENTRY_CONTROLS,
1849                     vmcs_read32(VM_ENTRY_CONTROLS)
1850                     & ~VM_ENTRY_IA32E_MODE);
1851        vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);
1852}
1853
1854#endif
1855
1856static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
1857{
1858        vpid_sync_context(to_vmx(vcpu));
1859        if (enable_ept) {
1860                if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
1861                        return;
1862                ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa));
1863        }
1864}
1865
1866static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
1867{
1868        ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
1869
1870        vcpu->arch.cr0 &= ~cr0_guest_owned_bits;
1871        vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits;
1872}
1873
1874static void vmx_decache_cr3(struct kvm_vcpu *vcpu)
1875{
1876        if (enable_ept && is_paging(vcpu))
1877                vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
1878        __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
1879}
1880
1881static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
1882{
1883        ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
1884
1885        vcpu->arch.cr4 &= ~cr4_guest_owned_bits;
1886        vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits;
1887}
1888
1889static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
1890{
1891        if (!test_bit(VCPU_EXREG_PDPTR,
1892                      (unsigned long *)&vcpu->arch.regs_dirty))
1893                return;
1894
1895        if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
1896                vmcs_write64(GUEST_PDPTR0, vcpu->arch.mmu.pdptrs[0]);
1897                vmcs_write64(GUEST_PDPTR1, vcpu->arch.mmu.pdptrs[1]);
1898                vmcs_write64(GUEST_PDPTR2, vcpu->arch.mmu.pdptrs[2]);
1899                vmcs_write64(GUEST_PDPTR3, vcpu->arch.mmu.pdptrs[3]);
1900        }
1901}
1902
1903static void ept_save_pdptrs(struct kvm_vcpu *vcpu)
1904{
1905        if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
1906                vcpu->arch.mmu.pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
1907                vcpu->arch.mmu.pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
1908                vcpu->arch.mmu.pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
1909                vcpu->arch.mmu.pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
1910        }
1911
1912        __set_bit(VCPU_EXREG_PDPTR,
1913                  (unsigned long *)&vcpu->arch.regs_avail);
1914        __set_bit(VCPU_EXREG_PDPTR,
1915                  (unsigned long *)&vcpu->arch.regs_dirty);
1916}
1917
1918static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
1919
1920static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
1921                                        unsigned long cr0,
1922                                        struct kvm_vcpu *vcpu)
1923{
1924        vmx_decache_cr3(vcpu);
1925        if (!(cr0 & X86_CR0_PG)) {
1926                /* From paging/starting to nonpaging */
1927                vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
1928                             vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) |
1929                             (CPU_BASED_CR3_LOAD_EXITING |
1930                              CPU_BASED_CR3_STORE_EXITING));
1931                vcpu->arch.cr0 = cr0;
1932                vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
1933        } else if (!is_paging(vcpu)) {
1934                /* From nonpaging to paging */
1935                vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
1936                             vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) &
1937                             ~(CPU_BASED_CR3_LOAD_EXITING |
1938                               CPU_BASED_CR3_STORE_EXITING));
1939                vcpu->arch.cr0 = cr0;
1940                vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
1941        }
1942
1943        if (!(cr0 & X86_CR0_WP))
1944                *hw_cr0 &= ~X86_CR0_WP;
1945}
1946
1947static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1948{
1949        struct vcpu_vmx *vmx = to_vmx(vcpu);
1950        unsigned long hw_cr0;
1951
1952        if (enable_unrestricted_guest)
1953                hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST)
1954                        | KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
1955        else
1956                hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON;
1957
1958        if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
1959                enter_pmode(vcpu);
1960
1961        if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))
1962                enter_rmode(vcpu);
1963
1964#ifdef CONFIG_X86_64
1965        if (vcpu->arch.efer & EFER_LME) {
1966                if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
1967                        enter_lmode(vcpu);
1968                if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
1969                        exit_lmode(vcpu);
1970        }
1971#endif
1972
1973        if (enable_ept)
1974                ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
1975
1976        if (!vcpu->fpu_active)
1977                hw_cr0 |= X86_CR0_TS | X86_CR0_MP;
1978
1979        vmcs_writel(CR0_READ_SHADOW, cr0);
1980        vmcs_writel(GUEST_CR0, hw_cr0);
1981        vcpu->arch.cr0 = cr0;
1982}
1983
1984static u64 construct_eptp(unsigned long root_hpa)
1985{
1986        u64 eptp;
1987
1988        /* TODO write the value reading from MSR */
1989        eptp = VMX_EPT_DEFAULT_MT |
1990                VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT;
1991        eptp |= (root_hpa & PAGE_MASK);
1992
1993        return eptp;
1994}
1995
1996static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
1997{
1998        unsigned long guest_cr3;
1999        u64 eptp;
2000
2001        guest_cr3 = cr3;
2002        if (enable_ept) {
2003                eptp = construct_eptp(cr3);
2004                vmcs_write64(EPT_POINTER, eptp);
2005                guest_cr3 = is_paging(vcpu) ? kvm_read_cr3(vcpu) :
2006                        vcpu->kvm->arch.ept_identity_map_addr;
2007                ept_load_pdptrs(vcpu);
2008        }
2009
2010        vmx_flush_tlb(vcpu);
2011        vmcs_writel(GUEST_CR3, guest_cr3);
2012}
2013
2014static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
2015{
2016        unsigned long hw_cr4 = cr4 | (to_vmx(vcpu)->rmode.vm86_active ?
2017                    KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
2018
2019        vcpu->arch.cr4 = cr4;
2020        if (enable_ept) {
2021                if (!is_paging(vcpu)) {
2022                        hw_cr4 &= ~X86_CR4_PAE;
2023                        hw_cr4 |= X86_CR4_PSE;
2024                } else if (!(cr4 & X86_CR4_PAE)) {
2025                        hw_cr4 &= ~X86_CR4_PAE;
2026                }
2027        }
2028
2029        vmcs_writel(CR4_READ_SHADOW, cr4);
2030        vmcs_writel(GUEST_CR4, hw_cr4);
2031}
2032
2033static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
2034{
2035        struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
2036
2037        return vmcs_readl(sf->base);
2038}
2039
2040static void vmx_get_segment(struct kvm_vcpu *vcpu,
2041                            struct kvm_segment *var, int seg)
2042{
2043        struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
2044        u32 ar;
2045
2046        var->base = vmcs_readl(sf->base);
2047        var->limit = vmcs_read32(sf->limit);
2048        var->selector = vmcs_read16(sf->selector);
2049        ar = vmcs_read32(sf->ar_bytes);
2050        if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state)
2051                ar = 0;
2052        var->type = ar & 15;
2053        var->s = (ar >> 4) & 1;
2054        var->dpl = (ar >> 5) & 3;
2055        var->present = (ar >> 7) & 1;
2056        var->avl = (ar >> 12) & 1;
2057        var->l = (ar >> 13) & 1;
2058        var->db = (ar >> 14) & 1;
2059        var->g = (ar >> 15) & 1;
2060        var->unusable = (ar >> 16) & 1;
2061}
2062
2063static int vmx_get_cpl(struct kvm_vcpu *vcpu)
2064{
2065        if (!is_protmode(vcpu))
2066                return 0;
2067
2068        if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */
2069                return 3;
2070
2071        return vmcs_read16(GUEST_CS_SELECTOR) & 3;
2072}
2073
2074static u32 vmx_segment_access_rights(struct kvm_segment *var)
2075{
2076        u32 ar;
2077
2078        if (var->unusable)
2079                ar = 1 << 16;
2080        else {
2081                ar = var->type & 15;
2082                ar |= (var->s & 1) << 4;
2083                ar |= (var->dpl & 3) << 5;
2084                ar |= (var->present & 1) << 7;
2085                ar |= (var->avl & 1) << 12;
2086                ar |= (var->l & 1) << 13;
2087                ar |= (var->db & 1) << 14;
2088                ar |= (var->g & 1) << 15;
2089        }
2090        if (ar == 0) /* a 0 value means unusable */
2091                ar = AR_UNUSABLE_MASK;
2092
2093        return ar;
2094}
2095
2096static void vmx_set_segment(struct kvm_vcpu *vcpu,
2097                            struct kvm_segment *var, int seg)
2098{
2099        struct vcpu_vmx *vmx = to_vmx(vcpu);
2100        struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
2101        u32 ar;
2102
2103        if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) {
2104                vmx->rmode.tr.selector = var->selector;
2105                vmx->rmode.tr.base = var->base;
2106                vmx->rmode.tr.limit = var->limit;
2107                vmx->rmode.tr.ar = vmx_segment_access_rights(var);
2108                return;
2109        }
2110        vmcs_writel(sf->base, var->base);
2111        vmcs_write32(sf->limit, var->limit);
2112        vmcs_write16(sf->selector, var->selector);
2113        if (vmx->rmode.vm86_active && var->s) {
2114                /*
2115                 * Hack real-mode segments into vm86 compatibility.
2116                 */
2117                if (var->base == 0xffff0000 && var->selector == 0xf000)
2118                        vmcs_writel(sf->base, 0xf0000);
2119                ar = 0xf3;
2120        } else
2121                ar = vmx_segment_access_rights(var);
2122
2123        /*
2124         *   Fix the "Accessed" bit in AR field of segment registers for older
2125         * qemu binaries.
2126         *   IA32 arch specifies that at the time of processor reset the
2127         * "Accessed" bit in the AR field of segment registers is 1. And qemu
2128         * is setting it to 0 in the usedland code. This causes invalid guest
2129         * state vmexit when "unrestricted guest" mode is turned on.
2130         *    Fix for this setup issue in cpu_reset is being pushed in the qemu
2131         * tree. Newer qemu binaries with that qemu fix would not need this
2132         * kvm hack.
2133         */
2134        if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR))
2135                ar |= 0x1; /* Accessed */
2136
2137        vmcs_write32(sf->ar_bytes, ar);
2138}
2139
2140static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
2141{
2142        u32 ar = vmcs_read32(GUEST_CS_AR_BYTES);
2143
2144        *db = (ar >> 14) & 1;
2145        *l = (ar >> 13) & 1;
2146}
2147
2148static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
2149{
2150        dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
2151        dt->address = vmcs_readl(GUEST_IDTR_BASE);
2152}
2153
2154static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
2155{
2156        vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
2157        vmcs_writel(GUEST_IDTR_BASE, dt->address);
2158}
2159
2160static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
2161{
2162        dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
2163        dt->address = vmcs_readl(GUEST_GDTR_BASE);
2164}
2165
2166static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
2167{
2168        vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
2169        vmcs_writel(GUEST_GDTR_BASE, dt->address);
2170}
2171
2172static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
2173{
2174        struct kvm_segment var;
2175        u32 ar;
2176
2177        vmx_get_segment(vcpu, &var, seg);
2178        ar = vmx_segment_access_rights(&var);
2179
2180        if (var.base != (var.selector << 4))
2181                return false;
2182        if (var.limit != 0xffff)
2183                return false;
2184        if (ar != 0xf3)
2185                return false;
2186
2187        return true;
2188}
2189
2190static bool code_segment_valid(struct kvm_vcpu *vcpu)
2191{
2192        struct kvm_segment cs;
2193        unsigned int cs_rpl;
2194
2195        vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
2196        cs_rpl = cs.selector & SELECTOR_RPL_MASK;
2197
2198        if (cs.unusable)
2199                return false;
2200        if (~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_ACCESSES_MASK))
2201                return false;
2202        if (!cs.s)
2203                return false;
2204        if (cs.type & AR_TYPE_WRITEABLE_MASK) {
2205                if (cs.dpl > cs_rpl)
2206                        return false;
2207        } else {
2208                if (cs.dpl != cs_rpl)
2209                        return false;
2210        }
2211        if (!cs.present)
2212                return false;
2213
2214        /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */
2215        return true;
2216}
2217
2218static bool stack_segment_valid(struct kvm_vcpu *vcpu)
2219{
2220        struct kvm_segment ss;
2221        unsigned int ss_rpl;
2222
2223        vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
2224        ss_rpl = ss.selector & SELECTOR_RPL_MASK;
2225
2226        if (ss.unusable)
2227                return true;
2228        if (ss.type != 3 && ss.type != 7)
2229                return false;
2230        if (!ss.s)
2231                return false;
2232        if (ss.dpl != ss_rpl) /* DPL != RPL */
2233                return false;
2234        if (!ss.present)
2235                return false;
2236
2237        return true;
2238}
2239
2240static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
2241{
2242        struct kvm_segment var;
2243        unsigned int rpl;
2244
2245        vmx_get_segment(vcpu, &var, seg);
2246        rpl = var.selector & SELECTOR_RPL_MASK;
2247
2248        if (var.unusable)
2249                return true;
2250        if (!var.s)
2251                return false;
2252        if (!var.present)
2253                return false;
2254        if (~var.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK)) {
2255                if (var.dpl < rpl) /* DPL < RPL */
2256                        return false;
2257        }
2258
2259        /* TODO: Add other members to kvm_segment_field to allow checking for other access
2260         * rights flags
2261         */
2262        return true;
2263}
2264
2265static bool tr_valid(struct kvm_vcpu *vcpu)
2266{
2267        struct kvm_segment tr;
2268
2269        vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);
2270
2271        if (tr.unusable)
2272                return false;
2273        if (tr.selector & SELECTOR_TI_MASK)     /* TI = 1 */
2274                return false;
2275        if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */
2276                return false;
2277        if (!tr.present)
2278                return false;
2279
2280        return true;
2281}
2282
2283static bool ldtr_valid(struct kvm_vcpu *vcpu)
2284{
2285        struct kvm_segment ldtr;
2286
2287        vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);
2288
2289        if (ldtr.unusable)
2290                return true;
2291        if (ldtr.selector & SELECTOR_TI_MASK)   /* TI = 1 */
2292                return false;
2293        if (ldtr.type != 2)
2294                return false;
2295        if (!ldtr.present)
2296                return false;
2297
2298        return true;
2299}
2300
2301static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
2302{
2303        struct kvm_segment cs, ss;
2304
2305        vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
2306        vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
2307
2308        return ((cs.selector & SELECTOR_RPL_MASK) ==
2309                 (ss.selector & SELECTOR_RPL_MASK));
2310}
2311
2312/*
2313 * Check if guest state is valid. Returns true if valid, false if
2314 * not.
2315 * We assume that registers are always usable
2316 */
2317static bool guest_state_valid(struct kvm_vcpu *vcpu)
2318{
2319        /* real mode guest state checks */
2320        if (!is_protmode(vcpu)) {
2321                if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
2322                        return false;
2323                if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
2324                        return false;
2325                if (!rmode_segment_valid(vcpu, VCPU_SREG_DS))
2326                        return false;
2327                if (!rmode_segment_valid(vcpu, VCPU_SREG_ES))
2328                        return false;
2329                if (!rmode_segment_valid(vcpu, VCPU_SREG_FS))
2330                        return false;
2331                if (!rmode_segment_valid(vcpu, VCPU_SREG_GS))
2332                        return false;
2333        } else {
2334        /* protected mode guest state checks */
2335                if (!cs_ss_rpl_check(vcpu))
2336                        return false;
2337                if (!code_segment_valid(vcpu))
2338                        return false;
2339                if (!stack_segment_valid(vcpu))
2340                        return false;
2341                if (!data_segment_valid(vcpu, VCPU_SREG_DS))
2342                        return false;
2343                if (!data_segment_valid(vcpu, VCPU_SREG_ES))
2344                        return false;
2345                if (!data_segment_valid(vcpu, VCPU_SREG_FS))
2346                        return false;
2347                if (!data_segment_valid(vcpu, VCPU_SREG_GS))
2348                        return false;
2349                if (!tr_valid(vcpu))
2350                        return false;
2351                if (!ldtr_valid(vcpu))
2352                        return false;
2353        }
2354        /* TODO:
2355         * - Add checks on RIP
2356         * - Add checks on RFLAGS
2357         */
2358
2359        return true;
2360}
2361
2362static int init_rmode_tss(struct kvm *kvm)
2363{
2364        gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
2365        u16 data = 0;
2366        int ret = 0;
2367        int r;
2368
2369        r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
2370        if (r < 0)
2371                goto out;
2372        data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
2373        r = kvm_write_guest_page(kvm, fn++, &data,
2374                        TSS_IOPB_BASE_OFFSET, sizeof(u16));
2375        if (r < 0)
2376                goto out;
2377        r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
2378        if (r < 0)
2379                goto out;
2380        r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
2381        if (r < 0)
2382                goto out;
2383        data = ~0;
2384        r = kvm_write_guest_page(kvm, fn, &data,
2385                                 RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
2386                                 sizeof(u8));
2387        if (r < 0)
2388                goto out;
2389
2390        ret = 1;
2391out:
2392        return ret;
2393}
2394
2395static int init_rmode_identity_map(struct kvm *kvm)
2396{
2397        int i, r, ret;
2398        pfn_t identity_map_pfn;
2399        u32 tmp;
2400
2401        if (!enable_ept)
2402                return 1;
2403        if (unlikely(!kvm->arch.ept_identity_pagetable)) {
2404                printk(KERN_ERR "EPT: identity-mapping pagetable "
2405                        "haven't been allocated!\n");
2406                return 0;
2407        }
2408        if (likely(kvm->arch.ept_identity_pagetable_done))
2409                return 1;
2410        ret = 0;
2411        identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT;
2412        r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
2413        if (r < 0)
2414                goto out;
2415        /* Set up identity-mapping pagetable for EPT in real mode */
2416        for (i = 0; i < PT32_ENT_PER_PAGE; i++) {
2417                tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
2418                        _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
2419                r = kvm_write_guest_page(kvm, identity_map_pfn,
2420                                &tmp, i * sizeof(tmp), sizeof(tmp));
2421                if (r < 0)
2422                        goto out;
2423        }
2424        kvm->arch.ept_identity_pagetable_done = true;
2425        ret = 1;
2426out:
2427        return ret;
2428}
2429
2430static void seg_setup(int seg)
2431{
2432        struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
2433        unsigned int ar;
2434
2435        vmcs_write16(sf->selector, 0);
2436        vmcs_writel(sf->base, 0);
2437        vmcs_write32(sf->limit, 0xffff);
2438        if (enable_unrestricted_guest) {
2439                ar = 0x93;
2440                if (seg == VCPU_SREG_CS)
2441                        ar |= 0x08; /* code segment */
2442        } else
2443                ar = 0xf3;
2444
2445        vmcs_write32(sf->ar_bytes, ar);
2446}
2447
2448static int alloc_apic_access_page(struct kvm *kvm)
2449{
2450        struct kvm_userspace_memory_region kvm_userspace_mem;
2451        int r = 0;
2452
2453        mutex_lock(&kvm->slots_lock);
2454        if (kvm->arch.apic_access_page)
2455                goto out;
2456        kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
2457        kvm_userspace_mem.flags = 0;
2458        kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL;
2459        kvm_userspace_mem.memory_size = PAGE_SIZE;
2460        r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0);
2461        if (r)
2462                goto out;
2463
2464        kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00);
2465out:
2466        mutex_unlock(&kvm->slots_lock);
2467        return r;
2468}
2469
2470static int alloc_identity_pagetable(struct kvm *kvm)
2471{
2472        struct kvm_userspace_memory_region kvm_userspace_mem;
2473        int r = 0;
2474
2475        mutex_lock(&kvm->slots_lock);
2476        if (kvm->arch.ept_identity_pagetable)
2477                goto out;
2478        kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT;
2479        kvm_userspace_mem.flags = 0;
2480        kvm_userspace_mem.guest_phys_addr =
2481                kvm->arch.ept_identity_map_addr;
2482        kvm_userspace_mem.memory_size = PAGE_SIZE;
2483        r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0);
2484        if (r)
2485                goto out;
2486
2487        kvm->arch.ept_identity_pagetable = gfn_to_page(kvm,
2488                        kvm->arch.ept_identity_map_addr >> PAGE_SHIFT);
2489out:
2490        mutex_unlock(&kvm->slots_lock);
2491        return r;
2492}
2493
2494static void allocate_vpid(struct vcpu_vmx *vmx)
2495{
2496        int vpid;
2497
2498        vmx->vpid = 0;
2499        if (!enable_vpid)
2500                return;
2501        spin_lock(&vmx_vpid_lock);
2502        vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
2503        if (vpid < VMX_NR_VPIDS) {
2504                vmx->vpid = vpid;
2505                __set_bit(vpid, vmx_vpid_bitmap);
2506        }
2507        spin_unlock(&vmx_vpid_lock);
2508}
2509
2510static void free_vpid(struct vcpu_vmx *vmx)
2511{
2512        if (!enable_vpid)
2513                return;
2514        spin_lock(&vmx_vpid_lock);
2515        if (vmx->vpid != 0)
2516                __clear_bit(vmx->vpid, vmx_vpid_bitmap);
2517        spin_unlock(&vmx_vpid_lock);
2518}
2519
2520static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
2521{
2522        int f = sizeof(unsigned long);
2523
2524        if (!cpu_has_vmx_msr_bitmap())
2525                return;
2526
2527        /*
2528         * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
2529         * have the write-low and read-high bitmap offsets the wrong way round.
2530         * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
2531         */
2532        if (msr <= 0x1fff) {
2533                __clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */
2534                __clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */
2535        } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
2536                msr &= 0x1fff;
2537                __clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */
2538                __clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */
2539        }
2540}
2541
2542static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
2543{
2544        if (!longmode_only)
2545                __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, msr);
2546        __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, msr);
2547}
2548
2549/*
2550 * Sets up the vmcs for emulated real mode.
2551 */
2552static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2553{
2554        u32 host_sysenter_cs, msr_low, msr_high;
2555        u32 junk;
2556        u64 host_pat;
2557        unsigned long a;
2558        struct desc_ptr dt;
2559        int i;
2560        unsigned long kvm_vmx_return;
2561        u32 exec_control;
2562
2563        /* I/O */
2564        vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a));
2565        vmcs_write64(IO_BITMAP_B, __pa(vmx_io_bitmap_b));
2566
2567        if (cpu_has_vmx_msr_bitmap())
2568                vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy));
2569
2570        vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
2571
2572        /* Control */
2573        vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
2574                vmcs_config.pin_based_exec_ctrl);
2575
2576        exec_control = vmcs_config.cpu_based_exec_ctrl;
2577        if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) {
2578                exec_control &= ~CPU_BASED_TPR_SHADOW;
2579#ifdef CONFIG_X86_64
2580                exec_control |= CPU_BASED_CR8_STORE_EXITING |
2581                                CPU_BASED_CR8_LOAD_EXITING;
2582#endif
2583        }
2584        if (!enable_ept)
2585                exec_control |= CPU_BASED_CR3_STORE_EXITING |
2586                                CPU_BASED_CR3_LOAD_EXITING  |
2587                                CPU_BASED_INVLPG_EXITING;
2588        vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
2589
2590        if (cpu_has_secondary_exec_ctrls()) {
2591                exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
2592                if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
2593                        exec_control &=
2594                                ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
2595                if (vmx->vpid == 0)
2596                        exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
2597                if (!enable_ept) {
2598                        exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
2599                        enable_unrestricted_guest = 0;
2600                }
2601                if (!enable_unrestricted_guest)
2602                        exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
2603                if (!ple_gap)
2604                        exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
2605                vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
2606        }
2607
2608        if (ple_gap) {
2609                vmcs_write32(PLE_GAP, ple_gap);
2610                vmcs_write32(PLE_WINDOW, ple_window);
2611        }
2612
2613        vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf);
2614        vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
2615        vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
2616
2617        vmcs_writel(HOST_CR0, read_cr0() | X86_CR0_TS);  /* 22.2.3 */
2618        vmcs_writel(HOST_CR4, read_cr4());  /* 22.2.3, 22.2.5 */
2619        vmcs_writel(HOST_CR3, read_cr3());  /* 22.2.3  FIXME: shadow tables */
2620
2621        vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
2622        vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
2623        vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
2624        vmcs_write16(HOST_FS_SELECTOR, 0);            /* 22.2.4 */
2625        vmcs_write16(HOST_GS_SELECTOR, 0);            /* 22.2.4 */
2626        vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
2627#ifdef CONFIG_X86_64
2628        rdmsrl(MSR_FS_BASE, a);
2629        vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
2630        rdmsrl(MSR_GS_BASE, a);
2631        vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */
2632#else
2633        vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
2634        vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
2635#endif
2636
2637        vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */
2638
2639        native_store_idt(&dt);
2640        vmcs_writel(HOST_IDTR_BASE, dt.address);   /* 22.2.4 */
2641
2642        asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
2643        vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */
2644        vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
2645        vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
2646        vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
2647        vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
2648        vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
2649
2650        rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk);
2651        vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
2652        rdmsrl(MSR_IA32_SYSENTER_ESP, a);
2653        vmcs_writel(HOST_IA32_SYSENTER_ESP, a);   /* 22.2.3 */
2654        rdmsrl(MSR_IA32_SYSENTER_EIP, a);
2655        vmcs_writel(HOST_IA32_SYSENTER_EIP, a);   /* 22.2.3 */
2656
2657        if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
2658                rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
2659                host_pat = msr_low | ((u64) msr_high << 32);
2660                vmcs_write64(HOST_IA32_PAT, host_pat);
2661        }
2662        if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
2663                rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
2664                host_pat = msr_low | ((u64) msr_high << 32);
2665                /* Write the default value follow host pat */
2666                vmcs_write64(GUEST_IA32_PAT, host_pat);
2667                /* Keep arch.pat sync with GUEST_IA32_PAT */
2668                vmx->vcpu.arch.pat = host_pat;
2669        }
2670
2671        for (i = 0; i < NR_VMX_MSR; ++i) {
2672                u32 index = vmx_msr_index[i];
2673                u32 data_low, data_high;
2674                int j = vmx->nmsrs;
2675
2676                if (rdmsr_safe(index, &data_low, &data_high) < 0)
2677                        continue;
2678                if (wrmsr_safe(index, data_low, data_high) < 0)
2679                        continue;
2680                vmx->guest_msrs[j].index = i;
2681                vmx->guest_msrs[j].data = 0;
2682                vmx->guest_msrs[j].mask = -1ull;
2683                ++vmx->nmsrs;
2684        }
2685
2686        vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
2687
2688        /* 22.2.1, 20.8.1 */
2689        vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
2690
2691        vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
2692        vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS;
2693        if (enable_ept)
2694                vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
2695        vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
2696
2697        kvm_write_tsc(&vmx->vcpu, 0);
2698
2699        return 0;
2700}
2701
2702static int init_rmode(struct kvm *kvm)
2703{
2704        int idx, ret = 0;
2705
2706        idx = srcu_read_lock(&kvm->srcu);
2707        if (!init_rmode_tss(kvm))
2708                goto exit;
2709        if (!init_rmode_identity_map(kvm))
2710                goto exit;
2711
2712        ret = 1;
2713exit:
2714        srcu_read_unlock(&kvm->srcu, idx);
2715        return ret;
2716}
2717
2718static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2719{
2720        struct vcpu_vmx *vmx = to_vmx(vcpu);
2721        u64 msr;
2722        int ret;
2723
2724        vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
2725        if (!init_rmode(vmx->vcpu.kvm)) {
2726                ret = -ENOMEM;
2727                goto out;
2728        }
2729
2730        vmx->rmode.vm86_active = 0;
2731
2732        vmx->soft_vnmi_blocked = 0;
2733
2734        vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
2735        kvm_set_cr8(&vmx->vcpu, 0);
2736        msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
2737        if (kvm_vcpu_is_bsp(&vmx->vcpu))
2738                msr |= MSR_IA32_APICBASE_BSP;
2739        kvm_set_apic_base(&vmx->vcpu, msr);
2740
2741        ret = fx_init(&vmx->vcpu);
2742        if (ret != 0)
2743                goto out;
2744
2745        seg_setup(VCPU_SREG_CS);
2746        /*
2747         * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
2748         * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4.  Sigh.
2749         */
2750        if (kvm_vcpu_is_bsp(&vmx->vcpu)) {
2751                vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
2752                vmcs_writel(GUEST_CS_BASE, 0x000f0000);
2753        } else {
2754                vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8);
2755                vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12);
2756        }
2757
2758        seg_setup(VCPU_SREG_DS);
2759        seg_setup(VCPU_SREG_ES);
2760        seg_setup(VCPU_SREG_FS);
2761        seg_setup(VCPU_SREG_GS);
2762        seg_setup(VCPU_SREG_SS);
2763
2764        vmcs_write16(GUEST_TR_SELECTOR, 0);
2765        vmcs_writel(GUEST_TR_BASE, 0);
2766        vmcs_write32(GUEST_TR_LIMIT, 0xffff);
2767        vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
2768
2769        vmcs_write16(GUEST_LDTR_SELECTOR, 0);
2770        vmcs_writel(GUEST_LDTR_BASE, 0);
2771        vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
2772        vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
2773
2774        vmcs_write32(GUEST_SYSENTER_CS, 0);
2775        vmcs_writel(GUEST_SYSENTER_ESP, 0);
2776        vmcs_writel(GUEST_SYSENTER_EIP, 0);
2777
2778        vmcs_writel(GUEST_RFLAGS, 0x02);
2779        if (kvm_vcpu_is_bsp(&vmx->vcpu))
2780                kvm_rip_write(vcpu, 0xfff0);
2781        else
2782                kvm_rip_write(vcpu, 0);
2783        kvm_register_write(vcpu, VCPU_REGS_RSP, 0);
2784
2785        vmcs_writel(GUEST_DR7, 0x400);
2786
2787        vmcs_writel(GUEST_GDTR_BASE, 0);
2788        vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
2789
2790        vmcs_writel(GUEST_IDTR_BASE, 0);
2791        vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
2792
2793        vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
2794        vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
2795        vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
2796
2797        /* Special registers */
2798        vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
2799
2800        setup_msrs(vmx);
2801
2802        vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */
2803
2804        if (cpu_has_vmx_tpr_shadow()) {
2805                vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
2806                if (vm_need_tpr_shadow(vmx->vcpu.kvm))
2807                        vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
2808                                page_to_phys(vmx->vcpu.arch.apic->regs_page));
2809                vmcs_write32(TPR_THRESHOLD, 0);
2810        }
2811
2812        if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
2813                vmcs_write64(APIC_ACCESS_ADDR,
2814                             page_to_phys(vmx->vcpu.kvm->arch.apic_access_page));
2815
2816        if (vmx->vpid != 0)
2817                vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
2818
2819        vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
2820        vmx_set_cr0(&vmx->vcpu, kvm_read_cr0(vcpu)); /* enter rmode */
2821        vmx_set_cr4(&vmx->vcpu, 0);
2822        vmx_set_efer(&vmx->vcpu, 0);
2823        vmx_fpu_activate(&vmx->vcpu);
2824        update_exception_bitmap(&vmx->vcpu);
2825
2826        vpid_sync_context(vmx);
2827
2828        ret = 0;
2829
2830        /* HACK: Don't enable emulation on guest boot/reset */
2831        vmx->emulation_required = 0;
2832
2833out:
2834        return ret;
2835}
2836
2837static void enable_irq_window(struct kvm_vcpu *vcpu)
2838{
2839        u32 cpu_based_vm_exec_control;
2840
2841        cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
2842        cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
2843        vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
2844}
2845
2846static void enable_nmi_window(struct kvm_vcpu *vcpu)
2847{
2848        u32 cpu_based_vm_exec_control;
2849
2850        if (!cpu_has_virtual_nmis()) {
2851                enable_irq_window(vcpu);
2852                return;
2853        }
2854
2855        if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
2856                enable_irq_window(vcpu);
2857                return;
2858        }
2859        cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
2860        cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
2861        vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
2862}
2863
2864static void vmx_inject_irq(struct kvm_vcpu *vcpu)
2865{
2866        struct vcpu_vmx *vmx = to_vmx(vcpu);
2867        uint32_t intr;
2868        int irq = vcpu->arch.interrupt.nr;
2869
2870        trace_kvm_inj_virq(irq);
2871
2872        ++vcpu->stat.irq_injections;
2873        if (vmx->rmode.vm86_active) {
2874                if (kvm_inject_realmode_interrupt(vcpu, irq) != EMULATE_DONE)
2875                        kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2876                return;
2877        }
2878        intr = irq | INTR_INFO_VALID_MASK;
2879        if (vcpu->arch.interrupt.soft) {
2880                intr |= INTR_TYPE_SOFT_INTR;
2881                vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
2882                             vmx->vcpu.arch.event_exit_inst_len);
2883        } else
2884                intr |= INTR_TYPE_EXT_INTR;
2885        vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
2886        vmx_clear_hlt(vcpu);
2887}
2888
2889static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
2890{
2891        struct vcpu_vmx *vmx = to_vmx(vcpu);
2892
2893        if (!cpu_has_virtual_nmis()) {
2894                /*
2895                 * Tracking the NMI-blocked state in software is built upon
2896                 * finding the next open IRQ window. This, in turn, depends on
2897                 * well-behaving guests: They have to keep IRQs disabled at
2898                 * least as long as the NMI handler runs. Otherwise we may
2899                 * cause NMI nesting, maybe breaking the guest. But as this is
2900                 * highly unlikely, we can live with the residual risk.
2901                 */
2902                vmx->soft_vnmi_blocked = 1;
2903                vmx->vnmi_blocked_time = 0;
2904        }
2905
2906        ++vcpu->stat.nmi_injections;
2907        if (vmx->rmode.vm86_active) {
2908                if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR) != EMULATE_DONE)
2909                        kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2910                return;
2911        }
2912        vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2913                        INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
2914        vmx_clear_hlt(vcpu);
2915}
2916
2917static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
2918{
2919        if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked)
2920                return 0;
2921
2922        return  !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
2923                  (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI
2924                   | GUEST_INTR_STATE_NMI));
2925}
2926
2927static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
2928{
2929        if (!cpu_has_virtual_nmis())
2930                return to_vmx(vcpu)->soft_vnmi_blocked;
2931        return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
2932}
2933
2934static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
2935{
2936        struct vcpu_vmx *vmx = to_vmx(vcpu);
2937
2938        if (!cpu_has_virtual_nmis()) {
2939                if (vmx->soft_vnmi_blocked != masked) {
2940                        vmx->soft_vnmi_blocked = masked;
2941                        vmx->vnmi_blocked_time = 0;
2942                }
2943        } else {
2944                if (masked)
2945                        vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
2946                                      GUEST_INTR_STATE_NMI);
2947                else
2948                        vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
2949                                        GUEST_INTR_STATE_NMI);
2950        }
2951}
2952
2953static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
2954{
2955        return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
2956                !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
2957                        (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
2958}
2959
2960static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
2961{
2962        int ret;
2963        struct kvm_userspace_memory_region tss_mem = {
2964                .slot = TSS_PRIVATE_MEMSLOT,
2965                .guest_phys_addr = addr,
2966                .memory_size = PAGE_SIZE * 3,
2967                .flags = 0,
2968        };
2969
2970        ret = kvm_set_memory_region(kvm, &tss_mem, 0);
2971        if (ret)
2972                return ret;
2973        kvm->arch.tss_addr = addr;
2974        return 0;
2975}
2976
2977static int handle_rmode_exception(struct kvm_vcpu *vcpu,
2978                                  int vec, u32 err_code)
2979{
2980        /*
2981         * Instruction with address size override prefix opcode 0x67
2982         * Cause the #SS fault with 0 error code in VM86 mode.
2983         */
2984        if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
2985                if (emulate_instruction(vcpu, 0) == EMULATE_DONE)
2986                        return 1;
2987        /*
2988         * Forward all other exceptions that are valid in real mode.
2989         * FIXME: Breaks guest debugging in real mode, needs to be fixed with
2990         *        the required debugging infrastructure rework.
2991         */
2992        switch (vec) {
2993        case DB_VECTOR:
2994                if (vcpu->guest_debug &
2995                    (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
2996                        return 0;
2997                kvm_queue_exception(vcpu, vec);
2998                return 1;
2999        case BP_VECTOR:
3000                /*
3001                 * Update instruction length as we may reinject the exception
3002                 * from user space while in guest debugging mode.
3003                 */
3004                to_vmx(vcpu)->vcpu.arch.event_exit_inst_len =
3005                        vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
3006                if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
3007                        return 0;
3008                /* fall through */
3009        case DE_VECTOR:
3010        case OF_VECTOR:
3011        case BR_VECTOR:
3012        case UD_VECTOR:
3013        case DF_VECTOR:
3014        case SS_VECTOR:
3015        case GP_VECTOR:
3016        case MF_VECTOR:
3017                kvm_queue_exception(vcpu, vec);
3018                return 1;
3019        }
3020        return 0;
3021}
3022
/*
 * Raise a machine check on the host.
 *
 * This assumes the CPU has already filled in all the MSRs and that we are
 * still running on the CPU where the MCE occurred. The handler is given a
 * fake register set so that the guest is always treated like user space,
 * whatever context it was actually running in.
 *
 * Compiles to an empty function unless both CONFIG_X86_MCE and
 * CONFIG_X86_64 are defined.
 */
static void kvm_machine_check(void)
{
#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64)
	struct pt_regs fake_regs = {
		/* Fake ring 3 no matter what the guest ran on */
		.cs = 3,
		.flags = X86_EFLAGS_IF,
	};

	do_machine_check(&fake_regs, 0);
#endif
}
3041
/*
 * Exit handler for machine checks. Nothing is done here because the event
 * was already handled by vcpu_run; always returns 1.
 */
static int handle_machine_check(struct kvm_vcpu *vcpu)
{
	return 1;
}
3047
3048static int handle_exception(struct kvm_vcpu *vcpu)
3049{
3050        struct vcpu_vmx *vmx = to_vmx(vcpu);
3051        struct kvm_run *kvm_run = vcpu->run;
3052        u32 intr_info, ex_no, error_code;
3053        unsigned long cr2, rip, dr6;
3054        u32 vect_info;
3055        enum emulation_result er;
3056
3057        vect_info = vmx->idt_vectoring_info;
3058        intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
3059
3060        if (is_machine_check(intr_info))
3061                return handle_machine_check(vcpu);
3062
3063        if ((vect_info & VECTORING_INFO_VALID_MASK) &&
3064            !is_page_fault(intr_info)) {
3065                vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3066                vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
3067                vcpu->run->internal.ndata = 2;
3068                vcpu->run->internal.data[0] = vect_info;
3069                vcpu->run->internal.data[1] = intr_info;
3070                return 0;
3071        }
3072
3073        if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
3074                return 1;  /* already handled by vmx_vcpu_run() */
3075
3076        if (is_no_device(intr_info)) {
3077                vmx_fpu_activate(vcpu);
3078                return 1;
3079        }
3080
3081        if (is_invalid_opcode(intr_info)) {
3082                er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD);
3083                if (er != EMULATE_DONE)
3084                        kvm_queue_exception(vcpu, UD_VECTOR);
3085                return 1;
3086        }
3087
3088        error_code = 0;
3089        rip = kvm_rip_read(vcpu);
3090        if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
3091                error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
3092        if (is_page_fault(intr_info)) {
3093                /* EPT won't cause page fault directly */
3094                if (enable_ept)
3095                        BUG();
3096                cr2 = vmcs_readl(EXIT_QUALIFICATION);
3097                trace_kvm_page_fault(cr2, error_code);
3098
3099                if (kvm_event_needs_reinjection(vcpu))
3100                        kvm_mmu_unprotect_page_virt(vcpu, cr2);
3101                return kvm_mmu_page_fault(vcpu, cr2, error_code, NULL, 0);
3102        }
3103
3104        if (vmx->rmode.vm86_active &&
3105            handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
3106                                                                error_code)) {
3107                if (vcpu->arch.halt_request) {
3108                        vcpu->arch.halt_request = 0;
3109                        return kvm_emulate_halt(vcpu);
3110                }
3111                return 1;
3112        }
3113
3114        ex_no = intr_info & INTR_INFO_VECTOR_MASK;
3115        switch (ex_no) {
3116        case DB_VECTOR:
3117                dr6 = vmcs_readl(EXIT_QUALIFICATION);
3118                if (!(vcpu->guest_debug &
3119                      (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
3120                        vcpu->arch.dr6 = dr6 | DR6_FIXED_1;
3121                        kvm_queue_exception(vcpu, DB_VECTOR);
3122                        return 1;
3123                }
3124                kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1;
3125                kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
3126                /* fall through */
3127        case BP_VECTOR:
3128                /*
3129                 * Update instruction length as we may reinject #BP from
3130                 * user space while in guest debugging mode. Reading it for
3131                 * #DB as well causes no harm, it is not used in that case.
3132                 */
3133                vmx->vcpu.arch.event_exit_inst_len =
3134                        vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
3135                kvm_run->exit_reason = KVM_EXIT_DEBUG;
3136                kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip;
3137                kvm_run->debug.arch.exception = ex_no;
3138                break;
3139        default:
3140                kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
3141                kvm_run->ex.exception = ex_no;
3142                kvm_run->ex.error_code = error_code;
3143                break;
3144        }
3145        return 0;
3146}
3147
3148static int handle_external_interrupt(struct kvm_vcpu *vcpu)
3149{
3150        ++vcpu->stat.irq_exits;
3151        return 1;
3152}
3153
3154static int handle_triple_fault(struct kvm_vcpu *vcpu)
3155{
3156        vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
3157        return 0;
3158}
3159
3160static int handle_io(struct kvm_vcpu *vcpu)
3161{
3162        unsigned long exit_qualification;
3163        int size, in, string;
3164        unsigned port;
3165
3166        exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
3167        string = (exit_qualification & 16) != 0;
3168        in = (exit_qualification & 8) != 0;
3169
3170        ++vcpu->stat.io_exits;
3171
3172        if (string || in)
3173                return emulate_instruction(vcpu, 0) == EMULATE_DONE;
3174
3175        port = exit_qualification >> 16;
3176        size = (exit_qualification & 7) + 1;
3177        skip_emulated_instruction(vcpu);
3178
3179        return kvm_fast_pio_out(vcpu, size, port);
3180}
3181
/*
 * Write the three-byte VMCALL instruction (0f 01 c1) to the start of
 * @hypercall. @vcpu is not used.
 */
static void
vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
{
	static const unsigned char vmcall_insn[] = { 0x0f, 0x01, 0xc1 };
	unsigned int i;

	for (i = 0; i < sizeof(vmcall_insn); i++)
		hypercall[i] = vmcall_insn[i];
}
3192
3193static int handle_cr(struct kvm_vcpu *vcpu)
3194{
3195        unsigned long exit_qualification, val;
3196        int cr;
3197        int reg;
3198        int err;
3199
3200        exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
3201        cr = exit_qualification & 15;
3202        reg = (exit_qualification >> 8) & 15;
3203        switch ((exit_qualification >> 4) & 3) {
3204        case 0: /* mov to cr */
3205                val = kvm_register_read(vcpu, reg);
3206                trace_kvm_cr_write(cr, val);
3207                switch (cr) {
3208                case 0:
3209                        err = kvm_set_cr0(vcpu, val);
3210                        kvm_complete_insn_gp(vcpu, err);
3211                        return 1;
3212                case 3:
3213                        err = kvm_set_cr3(vcpu, val);
3214                        kvm_complete_insn_gp(vcpu, err);
3215                        return 1;
3216                case 4:
3217                        err = kvm_set_cr4(vcpu, val);
3218                        kvm_complete_insn_gp(vcpu, err);
3219                        return 1;
3220                case 8: {
3221                                u8 cr8_prev = kvm_get_cr8(vcpu);
3222                                u8 cr8 = kvm_register_read(vcpu, reg);
3223                                err = kvm_set_cr8(vcpu, cr8);
3224                                kvm_complete_insn_gp(vcpu, err);
3225                                if (irqchip_in_kernel(vcpu->kvm))
3226                                        return 1;
3227                                if (cr8_prev <= cr8)
3228                                        return 1;
3229                                vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
3230                                return 0;
3231                        }
3232                };
3233                break;
3234        case 2: /* clts */
3235                vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
3236                trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
3237                skip_emulated_instruction(vcpu);
3238                vmx_fpu_activate(vcpu);
3239                return 1;
3240        case 1: /*mov from cr*/
3241                switch (cr) {
3242                case 3:
3243                        val = kvm_read_cr3(vcpu);
3244                        kvm_register_write(vcpu, reg, val);
3245                        trace_kvm_cr_read(cr, val);
3246                        skip_emulated_instruction(vcpu);
3247                        return 1;
3248                case 8:
3249                        val = kvm_get_cr8(vcpu);
3250                        kvm_register_write(vcpu, reg, val);
3251                        trace_kvm_cr_read(cr, val);
3252                        skip_emulated_instruction(vcpu);
3253                        return 1;
3254                }
3255                break;
3256        case 3: /* lmsw */
3257                val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
3258                trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val);
3259                kvm_lmsw(vcpu, val);
3260
3261                skip_emulated_instruction(vcpu);
3262                return 1;
3263        default:
3264                break;
3265        }
3266        vcpu->run->exit_reason = 0;
3267        pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
3268               (int)(exit_qualification >> 4) & 3, cr);
3269        return 0;
3270}
3271
3272static int handle_dr(struct kvm_vcpu *vcpu)
3273{
3274        unsigned long exit_qualification;
3275        int dr, reg;
3276
3277        /* Do not handle if the CPL > 0, will trigger GP on re-entry */
3278        if (!kvm_require_cpl(vcpu, 0))
3279                return 1;
3280        dr = vmcs_readl(GUEST_DR7);
3281        if (dr & DR7_GD) {
3282                /*
3283                 * As the vm-exit takes precedence over the debug trap, we
3284                 * need to emulate the latter, either for the host or the
3285                 * guest debugging itself.
3286                 */
3287                if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
3288                        vcpu->run->debug.arch.dr6 = vcpu->arch.dr6;
3289                        vcpu->run->debug.arch.dr7 = dr;
3290                        vcpu->run->debug.arch.pc =
3291                                vmcs_readl(GUEST_CS_BASE) +
3292                                vmcs_readl(GUEST_RIP);
3293                        vcpu->run->debug.arch.exception = DB_VECTOR;
3294                        vcpu->run->exit_reason = KVM_EXIT_DEBUG;
3295                        return 0;
3296                } else {
3297                        vcpu->arch.dr7 &= ~DR7_GD;
3298                        vcpu->arch.dr6 |= DR6_BD;
3299                        vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
3300                        kvm_queue_exception(vcpu, DB_VECTOR);
3301                        return 1;
3302                }
3303        }
3304
3305        exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
3306        dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
3307        reg = DEBUG_REG_ACCESS_REG(exit_qualification);
3308        if (exit_qualification & TYPE_MOV_FROM_DR) {
3309                unsigned long val;
3310                if (!kvm_get_dr(vcpu, dr, &val))
3311                        kvm_register_write(vcpu, reg, val);
3312        } else
3313                kvm_set_dr(vcpu, dr, vcpu->arch.regs[reg]);
3314        skip_emulated_instruction(vcpu);
3315        return 1;
3316}
3317
3318static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
3319{
3320        vmcs_writel(GUEST_DR7, val);
3321}
3322
/* Exit handler for CPUID: delegate to kvm_emulate_cpuid() and return 1. */
static int handle_cpuid(struct kvm_vcpu *vcpu)
{
	kvm_emulate_cpuid(vcpu);

	return 1;
}
3328
3329static int handle_rdmsr(struct kvm_vcpu *vcpu)
3330{
3331        u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
3332        u64 data;
3333
3334        if (vmx_get_msr(vcpu, ecx, &data)) {
3335                trace_kvm_msr_read_ex(ecx);
3336                kvm_inject_gp(vcpu, 0);
3337                return 1;
3338        }
3339
3340        trace_kvm_msr_read(ecx, data);
3341
3342        /* FIXME: handling of bits 32:63 of rax, rdx */
3343        vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u;
3344        vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
3345        skip_emulated_instruction(vcpu);
3346        return 1;
3347}
3348
3349static int handle_wrmsr(struct kvm_vcpu *vcpu)
3350{
3351        u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
3352        u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
3353                | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
3354
3355        if (vmx_set_msr(vcpu, ecx, data) != 0) {
3356                trace_kvm_msr_write_ex(ecx, data);
3357                kvm_inject_gp(vcpu, 0);
3358                return 1;
3359        }
3360
3361        trace_kvm_msr_write(ecx, data);
3362        skip_emulated_instruction(vcpu);
3363        return 1;
3364}
3365
3366static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
3367{
3368        kvm_make_request(KVM_REQ_EVENT, vcpu);
3369        return 1;
3370}
3371
3372static int handle_interrupt_window(struct kvm_vcpu *vcpu)
3373{
3374        u32 cpu_based_vm_exec_control;
3375
3376        /* clear pending irq */
3377        cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
3378        cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
3379        vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
3380
3381        kvm_make_request(KVM_REQ_EVENT, vcpu);
3382
3383        ++vcpu->stat.irq_window_exits;
3384
3385        /*
3386         * If the user space waits to inject interrupts, exit as soon as
3387         * possible
3388         */
3389        if (!irqchip_in_kernel(vcpu->kvm) &&
3390            vcpu->run->request_interrupt_window &&
3391            !kvm_cpu_has_interrupt(vcpu)) {
3392                vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
3393                return 0;
3394        }
3395        return 1;
3396}
3397
/*
 * Exit handler for HLT: skip the instruction, then return the result of
 * kvm_emulate_halt().
 */
static int handle_halt(struct kvm_vcpu *vcpu)
{
	int ret;

	skip_emulated_instruction(vcpu);
	ret = kvm_emulate_halt(vcpu);

	return ret;
}
3403
/*
 * Exit handler for VMCALL: skip the instruction first, then run
 * kvm_emulate_hypercall(). Always returns 1.
 */
static int handle_vmcall(struct kvm_vcpu *vcpu)
{
	skip_emulated_instruction(vcpu);
	kvm_emulate_hypercall(vcpu);

	return 1;
}
3410
3411static int handle_vmx_insn(struct kvm_vcpu *vcpu)
3412{
3413        kvm_queue_exception(vcpu, UD_VECTOR);
3414        return 1;
3415}
3416
3417static int handle_invd(struct kvm_vcpu *vcpu)
3418{
3419        return emulate_instruction(vcpu, 0) == EMULATE_DONE;
3420}
3421
3422static int handle_invlpg(struct kvm_vcpu *vcpu)
3423{
3424        unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
3425
3426        kvm_mmu_invlpg(vcpu, exit_qualification);
3427        skip_emulated_instruction(vcpu);
3428        return 1;
3429}
3430
/*
 * Exit handler for WBINVD: skip the instruction first, then run
 * kvm_emulate_wbinvd(). Always returns 1.
 */
static int handle_wbinvd(struct kvm_vcpu *vcpu)
{
	skip_emulated_instruction(vcpu);
	kvm_emulate_wbinvd(vcpu);

	return 1;
}
3437
3438static int handle_xsetbv(struct kvm_vcpu *vcpu)
3439{
3440        u64 new_bv = kvm_read_edx_eax(vcpu);
3441        u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX);
3442
3443        if (kvm_set_xcr(vcpu, index, new_bv) == 0)
3444                skip_emulated_instruction(vcpu);
3445        return 1;
3446}
3447
3448static int handle_apic_access(struct kvm_vcpu *vcpu)
3449{
3450        return emulate_instruction(vcpu, 0) == EMULATE_DONE;
3451}
3452
3453static int handle_task_switch(struct kvm_vcpu *vcpu)
3454{
3455        struct vcpu_vmx *vmx = to_vmx(vcpu);
3456        unsigned long exit_qualification;
3457        bool has_error_code = false;
3458        u32 error_code = 0;
3459        u16 tss_selector;
3460        int reason, type, idt_v;
3461
3462        idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
3463        type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
3464
3465        exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
3466
3467        reason = (u32)exit_qualification >> 30;
3468        if (reason == TASK_SWITCH_GATE && idt_v) {
3469                switch (type) {
3470                case INTR_TYPE_NMI_INTR:
3471                        vcpu->arch.nmi_injected = false;
3472                        if (cpu_has_virtual_nmis())
3473                                vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
3474                                              GUEST_INTR_STATE_NMI);
3475                        break;
3476                case INTR_TYPE_EXT_INTR:
3477                case INTR_TYPE_SOFT_INTR:
3478                        kvm_clear_interrupt_queue(vcpu);
3479                        break;
3480                case INTR_TYPE_HARD_EXCEPTION:
3481                        if (vmx->idt_vectoring_info &
3482                            VECTORING_INFO_DELIVER_CODE_MASK) {
3483                                has_error_code = true;
3484                                error_code =
3485                                        vmcs_read32(IDT_VECTORING_ERROR_CODE);
3486                        }
3487                        /* fall through */
3488                case INTR_TYPE_SOFT_EXCEPTION:
3489                        kvm_clear_exception_queue(vcpu);
3490                        break;
3491                default:
3492                        break;
3493                }
3494        }
3495        tss_selector = exit_qualification;
3496
3497        if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION &&
3498                       type != INTR_TYPE_EXT_INTR &&
3499                       type != INTR_TYPE_NMI_INTR))
3500                skip_emulated_instruction(vcpu);
3501
3502        if (kvm_task_switch(vcpu, tss_selector, reason,
3503                                has_error_code, error_code) == EMULATE_FAIL) {
3504                vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3505                vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
3506                vcpu->run->internal.ndata = 0;
3507                return 0;
3508        }
3509
3510        /* clear all local breakpoint enable flags */
3511        vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~55);
3512
3513        /*
3514         * TODO: What about debug traps on tss switch?
3515         *       Are we supposed to inject them and update dr6?
3516         */
3517
3518        return 1;
3519}
3520
3521static int handle_ept_violation(struct kvm_vcpu *vcpu)
3522{
3523        unsigned long exit_qualification;
3524        gpa_t gpa;
3525        int gla_validity;
3526
3527        exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
3528
3529        if (exit_qualification & (1 << 6)) {
3530                printk(KERN_ERR "EPT: GPA exceeds GAW!\n");
3531                return -EINVAL;
3532        }
3533
3534        gla_validity = (exit_qualification >> 7) & 0x3;
3535        if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) {
3536                printk(KERN_ERR "EPT: Handling EPT violation failed!\n");
3537                printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n",
3538                        (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS),
3539                        vmcs_readl(GUEST_LINEAR_ADDRESS));
3540                printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n",
3541                        (long unsigned int)exit_qualification);
3542                vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
3543                vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION;
3544                return 0;
3545        }
3546
3547        gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
3548        trace_kvm_page_fault(gpa, exit_qualification);
3549        return kvm_mmu_page_fault(vcpu, gpa, exit_qualification & 0x3, NULL, 0);
3550}
3551
3552static u64 ept_rsvd_mask(u64 spte, int level)
3553{
3554        int i;
3555        u64 mask = 0;
3556
3557        for (i = 51; i > boot_cpu_data.x86_phys_bits; i--)
3558                mask |= (1ULL << i);
3559
3560        if (level > 2)
3561                /* bits 7:3 reserved */
3562                mask |= 0xf8;
3563        else if (level == 2) {
3564                if (spte & (1ULL << 7))
3565                        /* 2MB ref, bits 20:12 reserved */
3566                        mask |= 0x1ff000;
3567                else
3568                        /* bits 6:3 reserved */
3569                        mask |= 0x78;
3570        }
3571
3572        return mask;
3573}
3574
3575static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte,
3576                                       int level)
3577{
3578        printk(KERN_ERR "%s: spte 0x%llx level %d\n", __func__, spte, level);
3579
3580        /* 010b (write-only) */
3581        WARN_ON((spte & 0x7) == 0x2);
3582
3583        /* 110b (write/execute) */
3584        WARN_ON((spte & 0x7) == 0x6);
3585
3586        /* 100b (execute-only) and value not supported by logical processor */
3587        if (!cpu_has_vmx_ept_execute_only())
3588                WARN_ON((spte & 0x7) == 0x4);
3589
3590        /* not 000b */
3591        if ((spte & 0x7)) {
3592                u64 rsvd_bits = spte & ept_rsvd_mask(spte, level);
3593
3594                if (rsvd_bits != 0) {
3595                        printk(KERN_ERR "%s: rsvd_bits = 0x%llx\n",
3596                                         __func__, rsvd_bits);
3597                        WARN_ON(1);
3598                }
3599
3600                if (level == 1 || (level == 2 && (spte & (1ULL << 7)))) {
3601                        u64 ept_mem_type = (spte & 0x38) >> 3;
3602
3603                        if (ept_mem_type == 2 || ept_mem_type == 3 ||
3604                            ept_mem_type == 7) {
3605                                printk(KERN_ERR "%s: ept_mem_type=0x%llx\n",
3606                                                __func__, ept_mem_type);
3607                                WARN_ON(1);
3608                        }
3609                }
3610        }
3611}
3612
3613static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
3614{
3615        u64 sptes[4];
3616        int nr_sptes, i;
3617        gpa_t gpa;
3618
3619        gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
3620
3621        printk(KERN_ERR "EPT: Misconfiguration.\n");
3622        printk(KERN_ERR "EPT: GPA: 0x%llx\n", gpa);
3623
3624        nr_sptes = kvm_mmu_get_spte_hierarchy(vcpu, gpa, sptes);
3625
3626        for (i = PT64_ROOT_LEVEL; i > PT64_ROOT_LEVEL - nr_sptes; --i)
3627                ept_misconfig_inspect_spte(vcpu, sptes[i-1], i);
3628
3629        vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
3630        vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG;
3631
3632        return 0;
3633}
3634
3635static int handle_nmi_window(struct kvm_vcpu *vcpu)
3636{
3637        u32 cpu_based_vm_exec_control;
3638
3639        /* clear pending NMI */
3640        cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
3641        cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
3642        vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
3643        ++vcpu->stat.nmi_window_exits;
3644        kvm_make_request(KVM_REQ_EVENT, vcpu);
3645
3646        return 1;
3647}
3648
3649static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
3650{
3651        struct vcpu_vmx *vmx = to_vmx(vcpu);
3652        enum emulation_result err = EMULATE_DONE;
3653        int ret = 1;
3654        u32 cpu_exec_ctrl;
3655        bool intr_window_requested;
3656
3657        cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
3658        intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING;
3659
3660        while (!guest_state_valid(vcpu)) {
3661                if (intr_window_requested
3662                    && (kvm_get_rflags(&vmx->vcpu) & X86_EFLAGS_IF))
3663                        return handle_interrupt_window(&vmx->vcpu);
3664
3665                err = emulate_instruction(vcpu, 0);
3666
3667                if (err == EMULATE_DO_MMIO) {
3668                        ret = 0;
3669                        goto out;
3670                }
3671
3672                if (err != EMULATE_DONE)
3673                        return 0;
3674
3675                if (signal_pending(current))
3676                        goto out;
3677                if (need_resched())
3678                        schedule();
3679        }
3680
3681        vmx->emulation_required = 0;
3682out:
3683        return ret;
3684}
3685
/*
 * A vcpu is busy-waiting, typically on a spinlock.  Plain PAUSE exiting is
 * not enabled, so this handler is only reached on CPUs with
 * PAUSE-Loop-Exiting.
 *
 * Skips the instruction, calls kvm_vcpu_on_spin(), and returns 1.
 */
static int handle_pause(struct kvm_vcpu *vcpu)
{
	skip_emulated_instruction(vcpu);
	kvm_vcpu_on_spin(vcpu);

	return 1;
}
3697
3698static int handle_invalid_op(struct kvm_vcpu *vcpu)
3699{
3700        kvm_queue_exception(vcpu, UD_VECTOR);
3701        return 1;
3702}
3703
/*
 * The exit handlers return 1 if the exit was handled fully and guest execution
 * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
 * to be done to userspace and return 0.
 *
 * The table is indexed by exit reason.  Slots without an initializer are
 * NULL; vmx_handle_exit() checks for that before dispatching and reports
 * such exits as KVM_EXIT_UNKNOWN.
 */
static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
	[EXIT_REASON_EXCEPTION_NMI]           = handle_exception,
	[EXIT_REASON_EXTERNAL_INTERRUPT]      = handle_external_interrupt,
	[EXIT_REASON_TRIPLE_FAULT]            = handle_triple_fault,
	[EXIT_REASON_NMI_WINDOW]              = handle_nmi_window,
	[EXIT_REASON_IO_INSTRUCTION]          = handle_io,
	[EXIT_REASON_CR_ACCESS]               = handle_cr,
	[EXIT_REASON_DR_ACCESS]               = handle_dr,
	[EXIT_REASON_CPUID]                   = handle_cpuid,
	[EXIT_REASON_MSR_READ]                = handle_rdmsr,
	[EXIT_REASON_MSR_WRITE]               = handle_wrmsr,
	[EXIT_REASON_PENDING_INTERRUPT]       = handle_interrupt_window,
	[EXIT_REASON_HLT]                     = handle_halt,
	[EXIT_REASON_INVD]                    = handle_invd,
	[EXIT_REASON_INVLPG]                  = handle_invlpg,
	[EXIT_REASON_VMCALL]                  = handle_vmcall,
	/* All VMX instructions executed by the guest share one handler. */
	[EXIT_REASON_VMCLEAR]                 = handle_vmx_insn,
	[EXIT_REASON_VMLAUNCH]                = handle_vmx_insn,
	[EXIT_REASON_VMPTRLD]                 = handle_vmx_insn,
	[EXIT_REASON_VMPTRST]                 = handle_vmx_insn,
	[EXIT_REASON_VMREAD]                  = handle_vmx_insn,
	[EXIT_REASON_VMRESUME]                = handle_vmx_insn,
	[EXIT_REASON_VMWRITE]                 = handle_vmx_insn,
	[EXIT_REASON_VMOFF]                   = handle_vmx_insn,
	[EXIT_REASON_VMON]                    = handle_vmx_insn,
	[EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
	[EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
	[EXIT_REASON_WBINVD]                  = handle_wbinvd,
	[EXIT_REASON_XSETBV]                  = handle_xsetbv,
	[EXIT_REASON_TASK_SWITCH]             = handle_task_switch,
	[EXIT_REASON_MCE_DURING_VMENTRY]      = handle_machine_check,
	[EXIT_REASON_EPT_VIOLATION]           = handle_ept_violation,
	[EXIT_REASON_EPT_MISCONFIG]           = handle_ept_misconfig,
	[EXIT_REASON_PAUSE_INSTRUCTION]       = handle_pause,
	/* MWAIT and MONITOR exits inject #UD via handle_invalid_op(). */
	[EXIT_REASON_MWAIT_INSTRUCTION]       = handle_invalid_op,
	[EXIT_REASON_MONITOR_INSTRUCTION]     = handle_invalid_op,
};

/* Number of slots in the table; used as the bounds check before dispatch. */
static const int kvm_vmx_max_exit_handlers =
	ARRAY_SIZE(kvm_vmx_exit_handlers);
3749
3750static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
3751{
3752        *info1 = vmcs_readl(EXIT_QUALIFICATION);
3753        *info2 = vmcs_read32(VM_EXIT_INTR_INFO);
3754}
3755
3756/*
3757 * The guest has exited.  See if we can fix it or if we need userspace
3758 * assistance.
3759 */
3760static int vmx_handle_exit(struct kvm_vcpu *vcpu)
3761{
3762        struct vcpu_vmx *vmx = to_vmx(vcpu);
3763        u32 exit_reason = vmx->exit_reason;
3764        u32 vectoring_info = vmx->idt_vectoring_info;
3765
3766        trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX);
3767
3768        /* If guest state is invalid, start emulating */
3769        if (vmx->emulation_required && emulate_invalid_guest_state)
3770                return handle_invalid_guest_state(vcpu);
3771
3772        if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
3773                vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
3774                vcpu->run->fail_entry.hardware_entry_failure_reason
3775                        = exit_reason;
3776                return 0;
3777        }
3778
3779        if (unlikely(vmx->fail)) {
3780                vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
3781                vcpu->run->fail_entry.hardware_entry_failure_reason
3782                        = vmcs_read32(VM_INSTRUCTION_ERROR);
3783                return 0;
3784        }
3785
3786        if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
3787                        (exit_reason != EXIT_REASON_EXCEPTION_NMI &&
3788                        exit_reason != EXIT_REASON_EPT_VIOLATION &&
3789                        exit_reason != EXIT_REASON_TASK_SWITCH))
3790                printk(KERN_WARNING "%s: unexpected, valid vectoring info "
3791                       "(0x%x) and exit reason is 0x%x\n",
3792                       __func__, vectoring_info, exit_reason);
3793
3794        if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) {
3795                if (vmx_interrupt_allowed(vcpu)) {
3796                        vmx->soft_vnmi_blocked = 0;
3797                } else if (vmx->vnmi_blocked_time > 1000000000LL &&
3798                           vcpu->arch.nmi_pending) {
3799                        /*
3800                         * This CPU don't support us in finding the end of an
3801                         * NMI-blocked window if the guest runs with IRQs
3802                         * disabled. So we pull the trigger after 1 s of
3803                         * futile waiting, but inform the user about this.
3804                         */
3805                        printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
3806                               "state on VCPU %d after 1 s timeout\n",
3807                               __func__, vcpu->vcpu_id);
3808                        vmx->soft_vnmi_blocked = 0;
3809                }
3810        }
3811
3812        if (exit_reason < kvm_vmx_max_exit_handlers
3813            && kvm_vmx_exit_handlers[exit_reason])
3814                return kvm_vmx_exit_handlers[exit_reason](vcpu);
3815        else {
3816                vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
3817                vcpu->run->hw.hardware_exit_reason = exit_reason;
3818        }
3819        return 0;
3820}
3821
3822static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
3823{
3824        if (irr == -1 || tpr < irr) {
3825                vmcs_write32(TPR_THRESHOLD, 0);
3826                return;
3827        }
3828
3829        vmcs_write32(TPR_THRESHOLD, irr);
3830}
3831
3832static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
3833{
3834        u32 exit_intr_info = vmx->exit_intr_info;
3835
3836        /* Handle machine checks before interrupts are enabled */
3837        if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)
3838            || (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI
3839                && is_machine_check(exit_intr_info)))
3840                kvm_machine_check();
3841
3842        /* We need to handle NMIs before interrupts are enabled */
3843        if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
3844            (exit_intr_info & INTR_INFO_VALID_MASK)) {
3845                kvm_before_handle_nmi(&vmx->vcpu);
3846                asm("int $2");
3847                kvm_after_handle_nmi(&vmx->vcpu);
3848        }
3849}
3850
3851static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
3852{
3853        u32 exit_intr_info = vmx->exit_intr_info;
3854        bool unblock_nmi;
3855        u8 vector;
3856        bool idtv_info_valid;
3857
3858        idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;
3859
3860        if (cpu_has_virtual_nmis()) {
3861                unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
3862                vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
3863                /*
3864                 * SDM 3: 27.7.1.2 (September 2008)
3865                 * Re-set bit "block by NMI" before VM entry if vmexit caused by
3866                 * a guest IRET fault.
3867                 * SDM 3: 23.2.2 (September 2008)
3868                 * Bit 12 is undefined in any of the following cases:
3869                 *  If the VM exit sets the valid bit in the IDT-vectoring
3870                 *   information field.
3871                 *  If the VM exit is due to a double fault.
3872                 */
3873                if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
3874                    vector != DF_VECTOR && !idtv_info_valid)
3875                        vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
3876                                      GUEST_INTR_STATE_NMI);
3877        } else if (unlikely(vmx->soft_vnmi_blocked))
3878                vmx->vnmi_blocked_time +=
3879                        ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time));
3880}
3881
3882static void __vmx_complete_interrupts(struct vcpu_vmx *vmx,
3883                                      u32 idt_vectoring_info,
3884                                      int instr_len_field,
3885                                      int error_code_field)
3886{
3887        u8 vector;
3888        int type;
3889        bool idtv_info_valid;
3890
3891        idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
3892
3893        vmx->vcpu.arch.nmi_injected = false;
3894        kvm_clear_exception_queue(&vmx->vcpu);
3895        kvm_clear_interrupt_queue(&vmx->vcpu);
3896
3897        if (!idtv_info_valid)
3898                return;
3899
3900        kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
3901
3902        vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
3903        type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
3904
3905        switch (type) {
3906        case INTR_TYPE_NMI_INTR:
3907                vmx->vcpu.arch.nmi_injected = true;
3908                /*
3909                 * SDM 3: 27.7.1.2 (September 2008)
3910                 * Clear bit "block by NMI" before VM entry if a NMI
3911                 * delivery faulted.
3912                 */
3913                vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
3914                                GUEST_INTR_STATE_NMI);
3915                break;
3916        case INTR_TYPE_SOFT_EXCEPTION:
3917                vmx->vcpu.arch.event_exit_inst_len =
3918                        vmcs_read32(instr_len_field);
3919                /* fall through */
3920        case INTR_TYPE_HARD_EXCEPTION:
3921                if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
3922                        u32 err = vmcs_read32(error_code_field);
3923                        kvm_queue_exception_e(&vmx->vcpu, vector, err);
3924                } else
3925                        kvm_queue_exception(&vmx->vcpu, vector);
3926                break;
3927        case INTR_TYPE_SOFT_INTR:
3928                vmx->vcpu.arch.event_exit_inst_len =
3929                        vmcs_read32(instr_len_field);
3930                /* fall through */
3931        case INTR_TYPE_EXT_INTR:
3932                kvm_queue_interrupt(&vmx->vcpu, vector,
3933                        type == INTR_TYPE_SOFT_INTR);
3934                break;
3935        default:
3936                break;
3937        }
3938}
3939
3940static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
3941{
3942        __vmx_complete_interrupts(vmx, vmx->idt_vectoring_info,
3943                                  VM_EXIT_INSTRUCTION_LEN,
3944                                  IDT_VECTORING_ERROR_CODE);
3945}
3946
3947static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
3948{
3949        __vmx_complete_interrupts(to_vmx(vcpu),
3950                                  vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
3951                                  VM_ENTRY_INSTRUCTION_LEN,
3952                                  VM_ENTRY_EXCEPTION_ERROR_CODE);
3953
3954        vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
3955}
3956
/*
 * R is the register-name prefix and Q the operand-size suffix spliced into
 * the inline assembly below, so one asm body serves 32- and 64-bit builds.
 * Both are undefined again right after vmx_vcpu_run().
 */
#ifdef CONFIG_X86_64
#define R "r"
#define Q "q"
#else
#define R "e"
#define Q "l"
#endif

/*
 * Enter the guest and return after the next VM exit.
 *
 * Flushes dirty RSP/RIP to the VMCS, switches general-purpose registers and
 * CR2 between host and guest around VMLAUNCH/VMRESUME, then records the exit
 * reason, interruption info and IDT-vectoring info in @vmx and runs the
 * post-exit fixups.  Returns early, without entering the guest, when invalid
 * guest state is to be emulated instead.
 */
static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/* Record the guest's net vcpu time for enforced NMI injections. */
	if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
		vmx->entry_time = ktime_get();

	/* Don't enter VMX if guest state is invalid, let the exit handler
	   start emulation until we arrive back to a valid state */
	if (vmx->emulation_required && emulate_invalid_guest_state)
		return;

	/* Write back RSP/RIP only if the cached copies were modified. */
	if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
		vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
	if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
		vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);

	/* When single-stepping over STI and MOV SS, we must clear the
	 * corresponding interruptibility bits in the guest state. Otherwise
	 * vmentry fails as it then expects bit 14 (BS) in pending debug
	 * exceptions being set, but that's not correct for the guest debugging
	 * case. */
	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
		vmx_set_interrupt_shadow(vcpu, 0);

	/*
	 * Operand %0 is the vmx pointer (in cx); dx holds the HOST_RSP field
	 * encoding.  All other operands are struct offsets used as
	 * displacements from %0.
	 */
	asm(
		/* Store host registers */
		"push %%"R"dx; push %%"R"bp;"
		"push %%"R"cx \n\t"
		/* Rewrite HOST_RSP only when sp differs from the cached value */
		"cmp %%"R"sp, %c[host_rsp](%0) \n\t"
		"je 1f \n\t"
		"mov %%"R"sp, %c[host_rsp](%0) \n\t"
		__ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
		"1: \n\t"
		/* Reload cr2 if changed */
		"mov %c[cr2](%0), %%"R"ax \n\t"
		"mov %%cr2, %%"R"dx \n\t"
		"cmp %%"R"ax, %%"R"dx \n\t"
		"je 2f \n\t"
		"mov %%"R"ax, %%cr2 \n\t"
		"2: \n\t"
		/* Check if vmlaunch or vmresume is needed */
		"cmpl $0, %c[launched](%0) \n\t"
		/* Load guest registers.  Don't clobber flags. */
		"mov %c[rax](%0), %%"R"ax \n\t"
		"mov %c[rbx](%0), %%"R"bx \n\t"
		"mov %c[rdx](%0), %%"R"dx \n\t"
		"mov %c[rsi](%0), %%"R"si \n\t"
		"mov %c[rdi](%0), %%"R"di \n\t"
		"mov %c[rbp](%0), %%"R"bp \n\t"
#ifdef CONFIG_X86_64
		"mov %c[r8](%0),  %%r8  \n\t"
		"mov %c[r9](%0),  %%r9  \n\t"
		"mov %c[r10](%0), %%r10 \n\t"
		"mov %c[r11](%0), %%r11 \n\t"
		"mov %c[r12](%0), %%r12 \n\t"
		"mov %c[r13](%0), %%r13 \n\t"
		"mov %c[r14](%0), %%r14 \n\t"
		"mov %c[r15](%0), %%r15 \n\t"
#endif
		"mov %c[rcx](%0), %%"R"cx \n\t" /* kills %0 (ecx) */

		/* Enter guest mode */
		/* The jne consumes the flags of the "launched" compare above */
		"jne .Llaunched \n\t"
		__ex(ASM_VMX_VMLAUNCH) "\n\t"
		"jmp .Lkvm_vmx_return \n\t"
		".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t"
		".Lkvm_vmx_return: "
		/* Save guest registers, load host registers, keep flags */
		/*
		 * Swap guest cx with the vmx pointer pushed before entry:
		 * %0 is usable again and guest cx sits on top of the stack.
		 */
		"xchg %0,     (%%"R"sp) \n\t"
		"mov %%"R"ax, %c[rax](%0) \n\t"
		"mov %%"R"bx, %c[rbx](%0) \n\t"
		/* Copy guest cx from the stack top into the register array */
		"push"Q" (%%"R"sp); pop"Q" %c[rcx](%0) \n\t"
		"mov %%"R"dx, %c[rdx](%0) \n\t"
		"mov %%"R"si, %c[rsi](%0) \n\t"
		"mov %%"R"di, %c[rdi](%0) \n\t"
		"mov %%"R"bp, %c[rbp](%0) \n\t"
#ifdef CONFIG_X86_64
		"mov %%r8,  %c[r8](%0) \n\t"
		"mov %%r9,  %c[r9](%0) \n\t"
		"mov %%r10, %c[r10](%0) \n\t"
		"mov %%r11, %c[r11](%0) \n\t"
		"mov %%r12, %c[r12](%0) \n\t"
		"mov %%r13, %c[r13](%0) \n\t"
		"mov %%r14, %c[r14](%0) \n\t"
		"mov %%r15, %c[r15](%0) \n\t"
#endif
		"mov %%cr2, %%"R"ax   \n\t"
		"mov %%"R"ax, %c[cr2](%0) \n\t"

		/*
		 * The first pop discards the guest-cx slot; the next two
		 * restore the host bp and dx pushed at the top.
		 */
		"pop  %%"R"bp; pop  %%"R"bp; pop  %%"R"dx \n\t"
		/* Record CF|ZF, still preserved from the entry instruction */
		"setbe %c[fail](%0) \n\t"
	      : : "c"(vmx), "d"((unsigned long)HOST_RSP),
		[launched]"i"(offsetof(struct vcpu_vmx, launched)),
		[fail]"i"(offsetof(struct vcpu_vmx, fail)),
		[host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
		[rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
		[rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
		[rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
		[rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])),
		[rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])),
		[rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])),
		[rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])),
#ifdef CONFIG_X86_64
		[r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])),
		[r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])),
		[r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])),
		[r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])),
		[r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])),
		[r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])),
		[r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
		[r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
#endif
		[cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2))
	      : "cc", "memory"
		, R"ax", R"bx", R"di", R"si"
#ifdef CONFIG_X86_64
		, "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
#endif
	      );

	/*
	 * Everything except RIP, RSP, the PDPTRs and CR3 is now marked
	 * available in the register cache, and nothing is dirty.
	 */
	vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
				  | (1 << VCPU_EXREG_PDPTR)
				  | (1 << VCPU_EXREG_CR3));
	vcpu->arch.regs_dirty = 0;

	vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);

	/* Reload the DS and ES selectors with __USER_DS. */
	asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
	/* From now on the asm above takes the VMRESUME path. */
	vmx->launched = 1;

	vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
	vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);

	vmx_complete_atomic_exit(vmx);
	vmx_recover_nmi_blocking(vmx);
	vmx_complete_interrupts(vmx);
}

#undef R
#undef Q
4107
4108static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
4109{
4110        struct vcpu_vmx *vmx = to_vmx(vcpu);
4111
4112        if (vmx->vmcs) {
4113                vcpu_clear(vmx);
4114                free_vmcs(vmx->vmcs);
4115                vmx->vmcs = NULL;
4116        }
4117}
4118
4119static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
4120{
4121        struct vcpu_vmx *vmx = to_vmx(vcpu);
4122
4123        free_vpid(vmx);
4124        vmx_free_vmcs(vcpu);
4125        kfree(vmx->guest_msrs);
4126        kvm_vcpu_uninit(vcpu);
4127        kmem_cache_free(kvm_vcpu_cache, vmx);
4128}
4129
4130static inline void vmcs_init(struct vmcs *vmcs)
4131{
4132        u64 phys_addr = __pa(per_cpu(vmxarea, raw_smp_processor_id()));
4133
4134        if (!vmm_exclusive)
4135                kvm_cpu_vmxon(phys_addr);
4136
4137        vmcs_clear(vmcs);
4138
4139        if (!vmm_exclusive)
4140                kvm_cpu_vmxoff();
4141}
4142
4143static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
4144{
4145        int err;
4146        struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
4147        int cpu;
4148
4149        if (!vmx)
4150                return ERR_PTR(-ENOMEM);
4151
4152        allocate_vpid(vmx);
4153
4154        err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
4155        if (err)
4156                goto free_vcpu;
4157
4158        vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
4159        if (!vmx->guest_msrs) {
4160                err = -ENOMEM;
4161                goto uninit_vcpu;
4162        }
4163
4164        vmx->vmcs = alloc_vmcs();
4165        if (!vmx->vmcs)
4166                goto free_msrs;
4167
4168        vmcs_init(vmx->vmcs);
4169
4170        cpu = get_cpu();
4171        vmx_vcpu_load(&vmx->vcpu, cpu);
4172        vmx->vcpu.cpu = cpu;
4173        err = vmx_vcpu_setup(vmx);
4174        vmx_vcpu_put(&vmx->vcpu);
4175        put_cpu();
4176        if (err)
4177                goto free_vmcs;
4178        if (vm_need_virtualize_apic_accesses(kvm))
4179                if (alloc_apic_access_page(kvm) != 0)
4180                        goto free_vmcs;
4181
4182        if (enable_ept) {
4183                if (!kvm->arch.ept_identity_map_addr)
4184                        kvm->arch.ept_identity_map_addr =
4185                                VMX_EPT_IDENTITY_PAGETABLE_ADDR;
4186                if (alloc_identity_pagetable(kvm) != 0)
4187                        goto free_vmcs;
4188        }
4189
4190        return &vmx->vcpu;
4191
4192free_vmcs:
4193        free_vmcs(vmx->vmcs);
4194free_msrs:
4195        kfree(vmx->guest_msrs);
4196uninit_vcpu:
4197        kvm_vcpu_uninit(&vmx->vcpu);
4198free_vcpu:
4199        free_vpid(vmx);
4200        kmem_cache_free(kvm_vcpu_cache, vmx);
4201        return ERR_PTR(err);
4202}
4203
4204static void __init vmx_check_processor_compat(void *rtn)
4205{
4206        struct vmcs_config vmcs_conf;
4207
4208        *(int *)rtn = 0;
4209        if (setup_vmcs_config(&vmcs_conf) < 0)
4210                *(int *)rtn = -EIO;
4211        if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
4212                printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
4213                                smp_processor_id());
4214                *(int *)rtn = -EIO;
4215        }
4216}
4217
4218static int get_ept_level(void)
4219{
4220        return VMX_EPT_DEFAULT_GAW + 1;
4221}
4222
4223static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
4224{
4225        u64 ret;
4226
4227        /* For VT-d and EPT combination
4228         * 1. MMIO: always map as UC
4229         * 2. EPT with VT-d:
4230         *   a. VT-d without snooping control feature: can't guarantee the
4231         *      result, try to trust guest.
4232         *   b. VT-d with snooping control feature: snooping control feature of
4233         *      VT-d engine can guarantee the cache correctness. Just set it
4234         *      to WB to keep consistent with host. So the same as item 3.
4235         * 3. EPT without VT-d: always map as WB and set IPAT=1 to keep
4236         *    consistent with host MTRR
4237         */
4238        if (is_mmio)
4239                ret = MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
4240        else if (vcpu->kvm->arch.iommu_domain &&
4241                !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY))
4242                ret = kvm_get_guest_memory_type(vcpu, gfn) <<
4243                      VMX_EPT_MT_EPTE_SHIFT;
4244        else
4245                ret = (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT)
4246                        | VMX_EPT_IPAT_BIT;
4247
4248        return ret;
4249}
4250
/* Build one { value, name } entry from an EXIT_REASON_* suffix. */
#define _ER(x) { EXIT_REASON_##x, #x }

/*
 * Exit-reason values paired with their printable names.  The array is
 * terminated by a { -1, NULL } sentinel.
 */
static const struct trace_print_flags vmx_exit_reasons_str[] = {
	_ER(EXCEPTION_NMI),
	_ER(EXTERNAL_INTERRUPT),
	_ER(TRIPLE_FAULT),
	_ER(PENDING_INTERRUPT),
	_ER(NMI_WINDOW),
	_ER(TASK_SWITCH),
	_ER(CPUID),
	_ER(HLT),
	_ER(INVLPG),
	_ER(RDPMC),
	_ER(RDTSC),
	_ER(VMCALL),
	_ER(VMCLEAR),
	_ER(VMLAUNCH),
	_ER(VMPTRLD),
	_ER(VMPTRST),
	_ER(VMREAD),
	_ER(VMRESUME),
	_ER(VMWRITE),
	_ER(VMOFF),
	_ER(VMON),
	_ER(CR_ACCESS),
	_ER(DR_ACCESS),
	_ER(IO_INSTRUCTION),
	_ER(MSR_READ),
	_ER(MSR_WRITE),
	_ER(MWAIT_INSTRUCTION),
	_ER(MONITOR_INSTRUCTION),
	_ER(PAUSE_INSTRUCTION),
	_ER(MCE_DURING_VMENTRY),
	_ER(TPR_BELOW_THRESHOLD),
	_ER(APIC_ACCESS),
	_ER(EPT_VIOLATION),
	_ER(EPT_MISCONFIG),
	_ER(WBINVD),
	/* end-of-table sentinel */
	{ -1, NULL }
};

#undef _ER
4293
4294static int vmx_get_lpage_level(void)
4295{
4296        if (enable_ept && !cpu_has_vmx_ept_1g_page())
4297                return PT_DIRECTORY_LEVEL;
4298        else
4299                /* For shadow and EPT supported 1GB page */
4300                return PT_PDPE_LEVEL;
4301}
4302
4303static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
4304{
4305        struct kvm_cpuid_entry2 *best;
4306        struct vcpu_vmx *vmx = to_vmx(vcpu);
4307        u32 exec_control;
4308
4309        vmx->rdtscp_enabled = false;
4310        if (vmx_rdtscp_supported()) {
4311                exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
4312                if (exec_control & SECONDARY_EXEC_RDTSCP) {
4313                        best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
4314                        if (best && (best->edx & bit(X86_FEATURE_RDTSCP)))
4315                                vmx->rdtscp_enabled = true;
4316                        else {
4317                                exec_control &= ~SECONDARY_EXEC_RDTSCP;
4318                                vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
4319                                                exec_control);
4320                        }
4321                }
4322        }
4323}
4324
/*
 * Hook installed as vmx_x86_ops.set_supported_cpuid.  The VMX backend makes
 * no changes to @entry for any @func, so the body is empty.
 */
static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
{
}
4328
/*
 * VMX implementation of the kvm_x86_ops callback table.  It is handed to
 * kvm_init() from vmx_init().
 */
static struct kvm_x86_ops vmx_x86_ops = {
	/* hardware detection, setup and per-cpu enable/disable */
	.cpu_has_kvm_support = cpu_has_kvm_support,
	.disabled_by_bios = vmx_disabled_by_bios,
	.hardware_setup = hardware_setup,
	.hardware_unsetup = hardware_unsetup,
	.check_processor_compatibility = vmx_check_processor_compat,
	.hardware_enable = hardware_enable,
	.hardware_disable = hardware_disable,
	.cpu_has_accelerated_tpr = report_flexpriority,

	/* vcpu lifetime */
	.vcpu_create = vmx_create_vcpu,
	.vcpu_free = vmx_free_vcpu,
	.vcpu_reset = vmx_vcpu_reset,

	.prepare_guest_switch = vmx_save_host_state,
	.vcpu_load = vmx_vcpu_load,
	.vcpu_put = vmx_vcpu_put,

	/* guest register, segment, descriptor-table and MSR accessors */
	.set_guest_debug = set_guest_debug,
	.get_msr = vmx_get_msr,
	.set_msr = vmx_set_msr,
	.get_segment_base = vmx_get_segment_base,
	.get_segment = vmx_get_segment,
	.set_segment = vmx_set_segment,
	.get_cpl = vmx_get_cpl,
	.get_cs_db_l_bits = vmx_get_cs_db_l_bits,
	.decache_cr0_guest_bits = vmx_decache_cr0_guest_bits,
	.decache_cr3 = vmx_decache_cr3,
	.decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
	.set_cr0 = vmx_set_cr0,
	.set_cr3 = vmx_set_cr3,
	.set_cr4 = vmx_set_cr4,
	.set_efer = vmx_set_efer,
	.get_idt = vmx_get_idt,
	.set_idt = vmx_set_idt,
	.get_gdt = vmx_get_gdt,
	.set_gdt = vmx_set_gdt,
	.set_dr7 = vmx_set_dr7,
	.cache_reg = vmx_cache_reg,
	.get_rflags = vmx_get_rflags,
	.set_rflags = vmx_set_rflags,
	.fpu_activate = vmx_fpu_activate,
	.fpu_deactivate = vmx_fpu_deactivate,

	.tlb_flush = vmx_flush_tlb,

	/* guest entry/exit and event injection */
	.run = vmx_vcpu_run,
	.handle_exit = vmx_handle_exit,
	.skip_emulated_instruction = skip_emulated_instruction,
	.set_interrupt_shadow = vmx_set_interrupt_shadow,
	.get_interrupt_shadow = vmx_get_interrupt_shadow,
	.patch_hypercall = vmx_patch_hypercall,
	.set_irq = vmx_inject_irq,
	.set_nmi = vmx_inject_nmi,
	.queue_exception = vmx_queue_exception,
	.cancel_injection = vmx_cancel_injection,
	.interrupt_allowed = vmx_interrupt_allowed,
	.nmi_allowed = vmx_nmi_allowed,
	.get_nmi_mask = vmx_get_nmi_mask,
	.set_nmi_mask = vmx_set_nmi_mask,
	.enable_nmi_window = enable_nmi_window,
	.enable_irq_window = enable_irq_window,
	.update_cr8_intercept = update_cr8_intercept,

	.set_tss_addr = vmx_set_tss_addr,
	.get_tdp_level = get_ept_level,
	.get_mt_mask = vmx_get_mt_mask,

	/* exit information and the exit-reason name table defined in this file */
	.get_exit_info = vmx_get_exit_info,
	.exit_reasons_str = vmx_exit_reasons_str,

	.get_lpage_level = vmx_get_lpage_level,

	.cpuid_update = vmx_cpuid_update,

	.rdtscp_supported = vmx_rdtscp_supported,

	.set_supported_cpuid = vmx_set_supported_cpuid,

	.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,

	.write_tsc_offset = vmx_write_tsc_offset,
	.adjust_tsc_offset = vmx_adjust_tsc_offset,

	/* same handler as .set_cr3 above */
	.set_tdp_cr3 = vmx_set_cr3,
};
4415
4416static int __init vmx_init(void)
4417{
4418        int r, i;
4419
4420        rdmsrl_safe(MSR_EFER, &host_efer);
4421
4422        for (i = 0; i < NR_VMX_MSR; ++i)
4423                kvm_define_shared_msr(i, vmx_msr_index[i]);
4424
4425        vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL);
4426        if (!vmx_io_bitmap_a)
4427                return -ENOMEM;
4428
4429        vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL);
4430        if (!vmx_io_bitmap_b) {
4431                r = -ENOMEM;
4432                goto out;
4433        }
4434
4435        vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL);
4436        if (!vmx_msr_bitmap_legacy) {
4437                r = -ENOMEM;
4438                goto out1;
4439        }
4440
4441        vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL);
4442        if (!vmx_msr_bitmap_longmode) {
4443                r = -ENOMEM;
4444                goto out2;
4445        }
4446
4447        /*
4448         * Allow direct access to the PC debug port (it is often used for I/O
4449         * delays, but the vmexits simply slow things down).
4450         */
4451        memset(vmx_io_bitmap_a, 0xff, PAGE_SIZE);
4452        clear_bit(0x80, vmx_io_bitmap_a);
4453
4454        memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE);
4455
4456        memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
4457        memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
4458
4459        set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
4460
4461        r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
4462                     __alignof__(struct vcpu_vmx), THIS_MODULE);
4463        if (r)
4464                goto out3;
4465
4466        vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
4467        vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
4468        vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
4469        vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
4470        vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
4471        vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
4472
4473        if (enable_ept) {
4474                bypass_guest_pf = 0;
4475                kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
4476                                VMX_EPT_EXECUTABLE_MASK);
4477                kvm_enable_tdp();
4478        } else
4479                kvm_disable_tdp();
4480
4481        if (bypass_guest_pf)
4482                kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
4483
4484        return 0;
4485
4486out3:
4487        free_page((unsigned long)vmx_msr_bitmap_longmode);
4488out2:
4489        free_page((unsigned long)vmx_msr_bitmap_legacy);
4490out1:
4491        free_page((unsigned long)vmx_io_bitmap_b);
4492out:
4493        free_page((unsigned long)vmx_io_bitmap_a);
4494        return r;
4495}
4496
4497static void __exit vmx_exit(void)
4498{
4499        free_page((unsigned long)vmx_msr_bitmap_legacy);
4500        free_page((unsigned long)vmx_msr_bitmap_longmode);
4501        free_page((unsigned long)vmx_io_bitmap_b);
4502        free_page((unsigned long)vmx_io_bitmap_a);
4503
4504        kvm_exit();
4505}
4506
/* Register vmx_init() and vmx_exit() as the module's init and exit handlers. */
module_init(vmx_init)
module_exit(vmx_exit)
4509