linux/arch/x86/kvm/vmx.c
   1/*
   2 * Kernel-based Virtual Machine driver for Linux
   3 *
   4 * This module enables machines with Intel VT-x extensions to run virtual
   5 * machines without emulation or binary translation.
   6 *
   7 * Copyright (C) 2006 Qumranet, Inc.
   8 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
   9 *
  10 * Authors:
  11 *   Avi Kivity   <avi@qumranet.com>
  12 *   Yaniv Kamay  <yaniv@qumranet.com>
  13 *
  14 * This work is licensed under the terms of the GNU GPL, version 2.  See
  15 * the COPYING file in the top-level directory.
  16 *
  17 */
  18
  19#include "irq.h"
  20#include "mmu.h"
  21#include "cpuid.h"
  22
  23#include <linux/kvm_host.h>
  24#include <linux/module.h>
  25#include <linux/kernel.h>
  26#include <linux/mm.h>
  27#include <linux/highmem.h>
  28#include <linux/sched.h>
  29#include <linux/moduleparam.h>
  30#include <linux/mod_devicetable.h>
  31#include <linux/ftrace_event.h>
  32#include <linux/slab.h>
  33#include <linux/tboot.h>
  34#include <linux/hrtimer.h>
  35#include "kvm_cache_regs.h"
  36#include "x86.h"
  37
  38#include <asm/io.h>
  39#include <asm/desc.h>
  40#include <asm/vmx.h>
  41#include <asm/virtext.h>
  42#include <asm/mce.h>
  43#include <asm/i387.h>
  44#include <asm/xcr.h>
  45#include <asm/perf_event.h>
  46#include <asm/debugreg.h>
  47#include <asm/kexec.h>
  48#include <asm/apic.h>
  49
  50#include "trace.h"
  51
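/*
 * These wrappers attach an exception fixup to a VMX instruction so that a
 * fault taken while VMX is being torn down (e.g. across reboot or kexec)
 * does not crash the host; __ex_clear() additionally clears the named
 * output register as part of that fixup.
 */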
  52#define __ex(x) __kvm_handle_fault_on_reboot(x)
  53#define __ex_clear(x, reg) \
  54        ____kvm_handle_fault_on_reboot(x, "xor " reg " , " reg)
  55
  56MODULE_AUTHOR("Qumranet");
  57MODULE_LICENSE("GPL");
  58
  59static const struct x86_cpu_id vmx_cpu_id[] = {
  60        X86_FEATURE_MATCH(X86_FEATURE_VMX),
  61        {}
  62};
  63MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
  64
  65static bool __read_mostly enable_vpid = 1;
  66module_param_named(vpid, enable_vpid, bool, 0444);
  67
  68static bool __read_mostly flexpriority_enabled = 1;
  69module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
  70
  71static bool __read_mostly enable_ept = 1;
  72module_param_named(ept, enable_ept, bool, S_IRUGO);
  73
  74static bool __read_mostly enable_unrestricted_guest = 1;
  75module_param_named(unrestricted_guest,
  76                        enable_unrestricted_guest, bool, S_IRUGO);
  77
  78static bool __read_mostly enable_ept_ad_bits = 1;
  79module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);
  80
  81static bool __read_mostly emulate_invalid_guest_state = true;
  82module_param(emulate_invalid_guest_state, bool, S_IRUGO);
  83
  84static bool __read_mostly vmm_exclusive = 1;
  85module_param(vmm_exclusive, bool, S_IRUGO);
  86
  87static bool __read_mostly fasteoi = 1;
  88module_param(fasteoi, bool, S_IRUGO);
  89
  90static bool __read_mostly enable_apicv = 1;
  91module_param(enable_apicv, bool, S_IRUGO);
  92
  93static bool __read_mostly enable_shadow_vmcs = 1;
  94module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
  95/*
  96 * If nested=1, nested virtualization is supported, i.e., guests may use
   97 * VMX and act as hypervisors for their own guests. If nested=0, guests
   98 * may not use VMX instructions.
  99 */
 100static bool __read_mostly nested = 0;
 101module_param(nested, bool, S_IRUGO);
 102
 103static u64 __read_mostly host_xss;
 104
 105static bool __read_mostly enable_pml = 1;
 106module_param_named(pml, enable_pml, bool, S_IRUGO);
 107
 108#define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
 109#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE)
 110#define KVM_VM_CR0_ALWAYS_ON                                            \
 111        (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
 112#define KVM_CR4_GUEST_OWNED_BITS                                      \
 113        (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR      \
 114         | X86_CR4_OSXMMEXCPT | X86_CR4_TSD)
 115
 116#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
 117#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
 118
 119#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))
 120
 121#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5
 122
  123/*
  124 * These two parameters configure the controls for Pause-Loop Exiting:
  125 * ple_gap:    upper bound on the amount of time between two successive
  126 *             executions of PAUSE in a loop. Also indicates whether PLE is
  127 *             enabled. Testing shows this time is usually below 128 cycles.
  128 * ple_window: upper bound on the amount of time a guest is allowed to execute
  129 *             in a PAUSE loop. Tests indicate that most spinlocks are held for
  130 *             less than 2^12 cycles.
  131 * Time is measured on a counter that runs at the same rate as the TSC; refer
  132 * to SDM volume 3B, sections 21.6.13 and 22.1.3.
  133 */
 134#define KVM_VMX_DEFAULT_PLE_GAP           128
 135#define KVM_VMX_DEFAULT_PLE_WINDOW        4096
 136#define KVM_VMX_DEFAULT_PLE_WINDOW_GROW   2
 137#define KVM_VMX_DEFAULT_PLE_WINDOW_SHRINK 0
 138#define KVM_VMX_DEFAULT_PLE_WINDOW_MAX    \
 139                INT_MAX / KVM_VMX_DEFAULT_PLE_WINDOW_GROW
 140
 141static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP;
 142module_param(ple_gap, int, S_IRUGO);
 143
 144static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
 145module_param(ple_window, int, S_IRUGO);
 146
 147/* Default doubles per-vcpu window every exit. */
 148static int ple_window_grow = KVM_VMX_DEFAULT_PLE_WINDOW_GROW;
 149module_param(ple_window_grow, int, S_IRUGO);
 150
 151/* Default resets per-vcpu window every exit to ple_window. */
 152static int ple_window_shrink = KVM_VMX_DEFAULT_PLE_WINDOW_SHRINK;
 153module_param(ple_window_shrink, int, S_IRUGO);
 154
 155/* Default is to compute the maximum so we can never overflow. */
 156static int ple_window_actual_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
 157static int ple_window_max        = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
 158module_param(ple_window_max, int, S_IRUGO);
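/*
 * For illustration only (the real helpers appear further down in this file):
 * on each PLE exit the per-vcpu window is conceptually grown and clamped so
 * that it can never overflow, roughly along the lines of the sketch below,
 * where example_grow_ple_window() is a hypothetical name:
 *
 *	static int example_grow_ple_window(int old)
 *	{
 *		if (ple_window_grow < 1)
 *			return ple_window;
 *
 *		old = min(old, ple_window_actual_max);
 *		return old * ple_window_grow;
 *	}
 *
 * The shrink side works in the opposite direction, falling back to resetting
 * the window to ple_window when ple_window_shrink is 0 (the default).
 */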
 159
 160extern const ulong vmx_return;
 161
 162#define NR_AUTOLOAD_MSRS 8
 163#define VMCS02_POOL_SIZE 1
 164
 165struct vmcs {
 166        u32 revision_id;
 167        u32 abort;
 168        char data[0];
 169};
 170
 171/*
 172 * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also
 173 * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs
 174 * loaded on this CPU (so we can clear them if the CPU goes down).
 175 */
 176struct loaded_vmcs {
 177        struct vmcs *vmcs;
 178        int cpu;
 179        int launched;
 180        struct list_head loaded_vmcss_on_cpu_link;
 181};
 182
 183struct shared_msr_entry {
 184        unsigned index;
 185        u64 data;
 186        u64 mask;
 187};
 188
 189/*
 190 * struct vmcs12 describes the state that our guest hypervisor (L1) keeps for a
 191 * single nested guest (L2), hence the name vmcs12. Any VMX implementation has
 192 * a VMCS structure, and vmcs12 is our emulated VMX's VMCS. This structure is
 193 * stored in guest memory specified by VMPTRLD, but is opaque to the guest,
 194 * which must access it using VMREAD/VMWRITE/VMCLEAR instructions.
 195 * More than one of these structures may exist, if L1 runs multiple L2 guests.
 196 * nested_vmx_run() will use the data here to build a vmcs02: a VMCS for the
 197 * underlying hardware which will be used to run L2.
 198 * This structure is packed to ensure that its layout is identical across
 199 * machines (necessary for live migration).
 200 * If there are changes in this struct, VMCS12_REVISION must be changed.
 201 */
 202typedef u64 natural_width;
 203struct __packed vmcs12 {
 204        /* According to the Intel spec, a VMCS region must start with the
 205         * following two fields. Then follow implementation-specific data.
 206         */
 207        u32 revision_id;
 208        u32 abort;
 209
 210        u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */
 211        u32 padding[7]; /* room for future expansion */
 212
 213        u64 io_bitmap_a;
 214        u64 io_bitmap_b;
 215        u64 msr_bitmap;
 216        u64 vm_exit_msr_store_addr;
 217        u64 vm_exit_msr_load_addr;
 218        u64 vm_entry_msr_load_addr;
 219        u64 tsc_offset;
 220        u64 virtual_apic_page_addr;
 221        u64 apic_access_addr;
 222        u64 posted_intr_desc_addr;
 223        u64 ept_pointer;
 224        u64 eoi_exit_bitmap0;
 225        u64 eoi_exit_bitmap1;
 226        u64 eoi_exit_bitmap2;
 227        u64 eoi_exit_bitmap3;
 228        u64 xss_exit_bitmap;
 229        u64 guest_physical_address;
 230        u64 vmcs_link_pointer;
 231        u64 guest_ia32_debugctl;
 232        u64 guest_ia32_pat;
 233        u64 guest_ia32_efer;
 234        u64 guest_ia32_perf_global_ctrl;
 235        u64 guest_pdptr0;
 236        u64 guest_pdptr1;
 237        u64 guest_pdptr2;
 238        u64 guest_pdptr3;
 239        u64 guest_bndcfgs;
 240        u64 host_ia32_pat;
 241        u64 host_ia32_efer;
 242        u64 host_ia32_perf_global_ctrl;
 243        u64 padding64[8]; /* room for future expansion */
 244        /*
 245         * To allow migration of L1 (complete with its L2 guests) between
 246         * machines of different natural widths (32 or 64 bit), we cannot have
  247         * unsigned long fields with no explicit size. We use u64 (aliased
 248         * natural_width) instead. Luckily, x86 is little-endian.
 249         */
 250        natural_width cr0_guest_host_mask;
 251        natural_width cr4_guest_host_mask;
 252        natural_width cr0_read_shadow;
 253        natural_width cr4_read_shadow;
 254        natural_width cr3_target_value0;
 255        natural_width cr3_target_value1;
 256        natural_width cr3_target_value2;
 257        natural_width cr3_target_value3;
 258        natural_width exit_qualification;
 259        natural_width guest_linear_address;
 260        natural_width guest_cr0;
 261        natural_width guest_cr3;
 262        natural_width guest_cr4;
 263        natural_width guest_es_base;
 264        natural_width guest_cs_base;
 265        natural_width guest_ss_base;
 266        natural_width guest_ds_base;
 267        natural_width guest_fs_base;
 268        natural_width guest_gs_base;
 269        natural_width guest_ldtr_base;
 270        natural_width guest_tr_base;
 271        natural_width guest_gdtr_base;
 272        natural_width guest_idtr_base;
 273        natural_width guest_dr7;
 274        natural_width guest_rsp;
 275        natural_width guest_rip;
 276        natural_width guest_rflags;
 277        natural_width guest_pending_dbg_exceptions;
 278        natural_width guest_sysenter_esp;
 279        natural_width guest_sysenter_eip;
 280        natural_width host_cr0;
 281        natural_width host_cr3;
 282        natural_width host_cr4;
 283        natural_width host_fs_base;
 284        natural_width host_gs_base;
 285        natural_width host_tr_base;
 286        natural_width host_gdtr_base;
 287        natural_width host_idtr_base;
 288        natural_width host_ia32_sysenter_esp;
 289        natural_width host_ia32_sysenter_eip;
 290        natural_width host_rsp;
 291        natural_width host_rip;
 292        natural_width paddingl[8]; /* room for future expansion */
 293        u32 pin_based_vm_exec_control;
 294        u32 cpu_based_vm_exec_control;
 295        u32 exception_bitmap;
 296        u32 page_fault_error_code_mask;
 297        u32 page_fault_error_code_match;
 298        u32 cr3_target_count;
 299        u32 vm_exit_controls;
 300        u32 vm_exit_msr_store_count;
 301        u32 vm_exit_msr_load_count;
 302        u32 vm_entry_controls;
 303        u32 vm_entry_msr_load_count;
 304        u32 vm_entry_intr_info_field;
 305        u32 vm_entry_exception_error_code;
 306        u32 vm_entry_instruction_len;
 307        u32 tpr_threshold;
 308        u32 secondary_vm_exec_control;
 309        u32 vm_instruction_error;
 310        u32 vm_exit_reason;
 311        u32 vm_exit_intr_info;
 312        u32 vm_exit_intr_error_code;
 313        u32 idt_vectoring_info_field;
 314        u32 idt_vectoring_error_code;
 315        u32 vm_exit_instruction_len;
 316        u32 vmx_instruction_info;
 317        u32 guest_es_limit;
 318        u32 guest_cs_limit;
 319        u32 guest_ss_limit;
 320        u32 guest_ds_limit;
 321        u32 guest_fs_limit;
 322        u32 guest_gs_limit;
 323        u32 guest_ldtr_limit;
 324        u32 guest_tr_limit;
 325        u32 guest_gdtr_limit;
 326        u32 guest_idtr_limit;
 327        u32 guest_es_ar_bytes;
 328        u32 guest_cs_ar_bytes;
 329        u32 guest_ss_ar_bytes;
 330        u32 guest_ds_ar_bytes;
 331        u32 guest_fs_ar_bytes;
 332        u32 guest_gs_ar_bytes;
 333        u32 guest_ldtr_ar_bytes;
 334        u32 guest_tr_ar_bytes;
 335        u32 guest_interruptibility_info;
 336        u32 guest_activity_state;
 337        u32 guest_sysenter_cs;
 338        u32 host_ia32_sysenter_cs;
 339        u32 vmx_preemption_timer_value;
 340        u32 padding32[7]; /* room for future expansion */
 341        u16 virtual_processor_id;
 342        u16 posted_intr_nv;
 343        u16 guest_es_selector;
 344        u16 guest_cs_selector;
 345        u16 guest_ss_selector;
 346        u16 guest_ds_selector;
 347        u16 guest_fs_selector;
 348        u16 guest_gs_selector;
 349        u16 guest_ldtr_selector;
 350        u16 guest_tr_selector;
 351        u16 guest_intr_status;
 352        u16 host_es_selector;
 353        u16 host_cs_selector;
 354        u16 host_ss_selector;
 355        u16 host_ds_selector;
 356        u16 host_fs_selector;
 357        u16 host_gs_selector;
 358        u16 host_tr_selector;
 359};
 360
 361/*
 362 * VMCS12_REVISION is an arbitrary id that should be changed if the content or
 363 * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and
 364 * VMPTRLD verifies that the VMCS region that L1 is loading contains this id.
 365 */
 366#define VMCS12_REVISION 0x11e57ed0
 367
 368/*
 369 * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON region
  370 * and any VMCS region. Although only sizeof(struct vmcs12) bytes are used
  371 * by the current implementation, 4K is reserved to avoid future complications.
 372 */
 373#define VMCS12_SIZE 0x1000
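/*
 * A compile-time check such as the hypothetical one below, placed inside any
 * function in this file, would catch struct vmcs12 accidentally outgrowing
 * the 4K region that L1 allocates for it:
 *
 *	BUILD_BUG_ON(sizeof(struct vmcs12) > VMCS12_SIZE);
 */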
 374
 375/* Used to remember the last vmcs02 used for some recently used vmcs12s */
 376struct vmcs02_list {
 377        struct list_head list;
 378        gpa_t vmptr;
 379        struct loaded_vmcs vmcs02;
 380};
 381
 382/*
 383 * The nested_vmx structure is part of vcpu_vmx, and holds information we need
 384 * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
 385 */
 386struct nested_vmx {
 387        /* Has the level1 guest done vmxon? */
 388        bool vmxon;
 389        gpa_t vmxon_ptr;
 390
 391        /* The guest-physical address of the current VMCS L1 keeps for L2 */
 392        gpa_t current_vmptr;
 393        /* The host-usable pointer to the above */
 394        struct page *current_vmcs12_page;
 395        struct vmcs12 *current_vmcs12;
 396        struct vmcs *current_shadow_vmcs;
 397        /*
  398         * Indicates whether the shadow vmcs must be updated with the
  399         * data held by vmcs12.
 400         */
 401        bool sync_shadow_vmcs;
 402
 403        /* vmcs02_list cache of VMCSs recently used to run L2 guests */
 404        struct list_head vmcs02_pool;
 405        int vmcs02_num;
 406        u64 vmcs01_tsc_offset;
 407        /* L2 must run next, and mustn't decide to exit to L1. */
 408        bool nested_run_pending;
 409        /*
 410         * Guest pages referred to in vmcs02 with host-physical pointers, so
 411         * we must keep them pinned while L2 runs.
 412         */
 413        struct page *apic_access_page;
 414        struct page *virtual_apic_page;
 415        struct page *pi_desc_page;
 416        struct pi_desc *pi_desc;
 417        bool pi_pending;
 418        u16 posted_intr_nv;
 419        u64 msr_ia32_feature_control;
 420
 421        struct hrtimer preemption_timer;
 422        bool preemption_timer_expired;
 423
 424        /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
 425        u64 vmcs01_debugctl;
 426
 427        u32 nested_vmx_procbased_ctls_low;
 428        u32 nested_vmx_procbased_ctls_high;
 429        u32 nested_vmx_true_procbased_ctls_low;
 430        u32 nested_vmx_secondary_ctls_low;
 431        u32 nested_vmx_secondary_ctls_high;
 432        u32 nested_vmx_pinbased_ctls_low;
 433        u32 nested_vmx_pinbased_ctls_high;
 434        u32 nested_vmx_exit_ctls_low;
 435        u32 nested_vmx_exit_ctls_high;
 436        u32 nested_vmx_true_exit_ctls_low;
 437        u32 nested_vmx_entry_ctls_low;
 438        u32 nested_vmx_entry_ctls_high;
 439        u32 nested_vmx_true_entry_ctls_low;
 440        u32 nested_vmx_misc_low;
 441        u32 nested_vmx_misc_high;
 442        u32 nested_vmx_ept_caps;
 443};
 444
 445#define POSTED_INTR_ON  0
 446/* Posted-Interrupt Descriptor */
 447struct pi_desc {
 448        u32 pir[8];     /* Posted interrupt requested */
 449        u32 control;    /* bit 0 of control is outstanding notification bit */
 450        u32 rsvd[7];
 451} __aligned(64);
 452
 453static bool pi_test_and_set_on(struct pi_desc *pi_desc)
 454{
 455        return test_and_set_bit(POSTED_INTR_ON,
 456                        (unsigned long *)&pi_desc->control);
 457}
 458
 459static bool pi_test_and_clear_on(struct pi_desc *pi_desc)
 460{
 461        return test_and_clear_bit(POSTED_INTR_ON,
 462                        (unsigned long *)&pi_desc->control);
 463}
 464
 465static int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
 466{
 467        return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
 468}
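/*
 * The helpers above implement the sender's half of the posted-interrupt
 * protocol: record the vector in the PIR first, then set the outstanding-
 * notification bit to decide whether a notification IPI is still required.
 * An illustrative sketch (send_notification_ipi() is a hypothetical
 * placeholder, not the in-tree delivery path):
 *
 *	static void example_post_interrupt(struct pi_desc *desc, int vector)
 *	{
 *		if (pi_test_and_set_pir(vector, desc))
 *			return;
 *		if (!pi_test_and_set_on(desc))
 *			send_notification_ipi();
 *	}
 */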
 469
 470struct vcpu_vmx {
 471        struct kvm_vcpu       vcpu;
 472        unsigned long         host_rsp;
 473        u8                    fail;
 474        bool                  nmi_known_unmasked;
 475        u32                   exit_intr_info;
 476        u32                   idt_vectoring_info;
 477        ulong                 rflags;
 478        struct shared_msr_entry *guest_msrs;
 479        int                   nmsrs;
 480        int                   save_nmsrs;
 481        unsigned long         host_idt_base;
 482#ifdef CONFIG_X86_64
 483        u64                   msr_host_kernel_gs_base;
 484        u64                   msr_guest_kernel_gs_base;
 485#endif
 486        u32 vm_entry_controls_shadow;
 487        u32 vm_exit_controls_shadow;
 488        /*
 489         * loaded_vmcs points to the VMCS currently used in this vcpu. For a
 490         * non-nested (L1) guest, it always points to vmcs01. For a nested
 491         * guest (L2), it points to a different VMCS.
 492         */
 493        struct loaded_vmcs    vmcs01;
 494        struct loaded_vmcs   *loaded_vmcs;
 495        bool                  __launched; /* temporary, used in vmx_vcpu_run */
 496        struct msr_autoload {
 497                unsigned nr;
 498                struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS];
 499                struct vmx_msr_entry host[NR_AUTOLOAD_MSRS];
 500        } msr_autoload;
 501        struct {
 502                int           loaded;
 503                u16           fs_sel, gs_sel, ldt_sel;
 504#ifdef CONFIG_X86_64
 505                u16           ds_sel, es_sel;
 506#endif
 507                int           gs_ldt_reload_needed;
 508                int           fs_reload_needed;
 509                u64           msr_host_bndcfgs;
 510                unsigned long vmcs_host_cr4;    /* May not match real cr4 */
 511        } host_state;
 512        struct {
 513                int vm86_active;
 514                ulong save_rflags;
 515                struct kvm_segment segs[8];
 516        } rmode;
 517        struct {
 518                u32 bitmask; /* 4 bits per segment (1 bit per field) */
 519                struct kvm_save_segment {
 520                        u16 selector;
 521                        unsigned long base;
 522                        u32 limit;
 523                        u32 ar;
 524                } seg[8];
 525        } segment_cache;
 526        int vpid;
 527        bool emulation_required;
 528
 529        /* Support for vnmi-less CPUs */
 530        int soft_vnmi_blocked;
 531        ktime_t entry_time;
 532        s64 vnmi_blocked_time;
 533        u32 exit_reason;
 534
 535        bool rdtscp_enabled;
 536
 537        /* Posted interrupt descriptor */
 538        struct pi_desc pi_desc;
 539
 540        /* Support for a guest hypervisor (nested VMX) */
 541        struct nested_vmx nested;
 542
 543        /* Dynamic PLE window. */
 544        int ple_window;
 545        bool ple_window_dirty;
 546
 547        /* Support for PML */
 548#define PML_ENTITY_NUM          512
 549        struct page *pml_pg;
 550};
 551
 552enum segment_cache_field {
 553        SEG_FIELD_SEL = 0,
 554        SEG_FIELD_BASE = 1,
 555        SEG_FIELD_LIMIT = 2,
 556        SEG_FIELD_AR = 3,
 557
 558        SEG_FIELD_NR = 4
 559};
 560
 561static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
 562{
 563        return container_of(vcpu, struct vcpu_vmx, vcpu);
 564}
 565
 566#define VMCS12_OFFSET(x) offsetof(struct vmcs12, x)
 567#define FIELD(number, name)     [number] = VMCS12_OFFSET(name)
 568#define FIELD64(number, name)   [number] = VMCS12_OFFSET(name), \
 569                                [number##_HIGH] = VMCS12_OFFSET(name)+4
 570
 571
 572static unsigned long shadow_read_only_fields[] = {
 573        /*
 574         * We do NOT shadow fields that are modified when L0
 575         * traps and emulates any vmx instruction (e.g. VMPTRLD,
 576         * VMXON...) executed by L1.
 577         * For example, VM_INSTRUCTION_ERROR is read
 578         * by L1 if a vmx instruction fails (part of the error path).
 579         * Note the code assumes this logic. If for some reason
 580         * we start shadowing these fields then we need to
 581         * force a shadow sync when L0 emulates vmx instructions
 582         * (e.g. force a sync if VM_INSTRUCTION_ERROR is modified
 583         * by nested_vmx_failValid)
 584         */
 585        VM_EXIT_REASON,
 586        VM_EXIT_INTR_INFO,
 587        VM_EXIT_INSTRUCTION_LEN,
 588        IDT_VECTORING_INFO_FIELD,
 589        IDT_VECTORING_ERROR_CODE,
 590        VM_EXIT_INTR_ERROR_CODE,
 591        EXIT_QUALIFICATION,
 592        GUEST_LINEAR_ADDRESS,
 593        GUEST_PHYSICAL_ADDRESS
 594};
 595static int max_shadow_read_only_fields =
 596        ARRAY_SIZE(shadow_read_only_fields);
 597
 598static unsigned long shadow_read_write_fields[] = {
 599        TPR_THRESHOLD,
 600        GUEST_RIP,
 601        GUEST_RSP,
 602        GUEST_CR0,
 603        GUEST_CR3,
 604        GUEST_CR4,
 605        GUEST_INTERRUPTIBILITY_INFO,
 606        GUEST_RFLAGS,
 607        GUEST_CS_SELECTOR,
 608        GUEST_CS_AR_BYTES,
 609        GUEST_CS_LIMIT,
 610        GUEST_CS_BASE,
 611        GUEST_ES_BASE,
 612        GUEST_BNDCFGS,
 613        CR0_GUEST_HOST_MASK,
 614        CR0_READ_SHADOW,
 615        CR4_READ_SHADOW,
 616        TSC_OFFSET,
 617        EXCEPTION_BITMAP,
 618        CPU_BASED_VM_EXEC_CONTROL,
 619        VM_ENTRY_EXCEPTION_ERROR_CODE,
 620        VM_ENTRY_INTR_INFO_FIELD,
 621        VM_ENTRY_INSTRUCTION_LEN,
 623        HOST_FS_BASE,
 624        HOST_GS_BASE,
 625        HOST_FS_SELECTOR,
 626        HOST_GS_SELECTOR
 627};
 628static int max_shadow_read_write_fields =
 629        ARRAY_SIZE(shadow_read_write_fields);
 630
 631static const unsigned short vmcs_field_to_offset_table[] = {
 632        FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
 633        FIELD(POSTED_INTR_NV, posted_intr_nv),
 634        FIELD(GUEST_ES_SELECTOR, guest_es_selector),
 635        FIELD(GUEST_CS_SELECTOR, guest_cs_selector),
 636        FIELD(GUEST_SS_SELECTOR, guest_ss_selector),
 637        FIELD(GUEST_DS_SELECTOR, guest_ds_selector),
 638        FIELD(GUEST_FS_SELECTOR, guest_fs_selector),
 639        FIELD(GUEST_GS_SELECTOR, guest_gs_selector),
 640        FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector),
 641        FIELD(GUEST_TR_SELECTOR, guest_tr_selector),
 642        FIELD(GUEST_INTR_STATUS, guest_intr_status),
 643        FIELD(HOST_ES_SELECTOR, host_es_selector),
 644        FIELD(HOST_CS_SELECTOR, host_cs_selector),
 645        FIELD(HOST_SS_SELECTOR, host_ss_selector),
 646        FIELD(HOST_DS_SELECTOR, host_ds_selector),
 647        FIELD(HOST_FS_SELECTOR, host_fs_selector),
 648        FIELD(HOST_GS_SELECTOR, host_gs_selector),
 649        FIELD(HOST_TR_SELECTOR, host_tr_selector),
 650        FIELD64(IO_BITMAP_A, io_bitmap_a),
 651        FIELD64(IO_BITMAP_B, io_bitmap_b),
 652        FIELD64(MSR_BITMAP, msr_bitmap),
 653        FIELD64(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr),
 654        FIELD64(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr),
 655        FIELD64(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr),
 656        FIELD64(TSC_OFFSET, tsc_offset),
 657        FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
 658        FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
 659        FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr),
 660        FIELD64(EPT_POINTER, ept_pointer),
 661        FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0),
 662        FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1),
 663        FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2),
 664        FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3),
 665        FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
 666        FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
 667        FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
 668        FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl),
 669        FIELD64(GUEST_IA32_PAT, guest_ia32_pat),
 670        FIELD64(GUEST_IA32_EFER, guest_ia32_efer),
 671        FIELD64(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl),
 672        FIELD64(GUEST_PDPTR0, guest_pdptr0),
 673        FIELD64(GUEST_PDPTR1, guest_pdptr1),
 674        FIELD64(GUEST_PDPTR2, guest_pdptr2),
 675        FIELD64(GUEST_PDPTR3, guest_pdptr3),
 676        FIELD64(GUEST_BNDCFGS, guest_bndcfgs),
 677        FIELD64(HOST_IA32_PAT, host_ia32_pat),
 678        FIELD64(HOST_IA32_EFER, host_ia32_efer),
 679        FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl),
 680        FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control),
 681        FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control),
 682        FIELD(EXCEPTION_BITMAP, exception_bitmap),
 683        FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask),
 684        FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match),
 685        FIELD(CR3_TARGET_COUNT, cr3_target_count),
 686        FIELD(VM_EXIT_CONTROLS, vm_exit_controls),
 687        FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count),
 688        FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count),
 689        FIELD(VM_ENTRY_CONTROLS, vm_entry_controls),
 690        FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count),
 691        FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field),
 692        FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, vm_entry_exception_error_code),
 693        FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len),
 694        FIELD(TPR_THRESHOLD, tpr_threshold),
 695        FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control),
 696        FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error),
 697        FIELD(VM_EXIT_REASON, vm_exit_reason),
 698        FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info),
 699        FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code),
 700        FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field),
 701        FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code),
 702        FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len),
 703        FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info),
 704        FIELD(GUEST_ES_LIMIT, guest_es_limit),
 705        FIELD(GUEST_CS_LIMIT, guest_cs_limit),
 706        FIELD(GUEST_SS_LIMIT, guest_ss_limit),
 707        FIELD(GUEST_DS_LIMIT, guest_ds_limit),
 708        FIELD(GUEST_FS_LIMIT, guest_fs_limit),
 709        FIELD(GUEST_GS_LIMIT, guest_gs_limit),
 710        FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit),
 711        FIELD(GUEST_TR_LIMIT, guest_tr_limit),
 712        FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit),
 713        FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit),
 714        FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes),
 715        FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes),
 716        FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes),
 717        FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes),
 718        FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes),
 719        FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes),
 720        FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes),
 721        FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes),
 722        FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info),
 723        FIELD(GUEST_ACTIVITY_STATE, guest_activity_state),
 724        FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs),
 725        FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs),
 726        FIELD(VMX_PREEMPTION_TIMER_VALUE, vmx_preemption_timer_value),
 727        FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask),
 728        FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask),
 729        FIELD(CR0_READ_SHADOW, cr0_read_shadow),
 730        FIELD(CR4_READ_SHADOW, cr4_read_shadow),
 731        FIELD(CR3_TARGET_VALUE0, cr3_target_value0),
 732        FIELD(CR3_TARGET_VALUE1, cr3_target_value1),
 733        FIELD(CR3_TARGET_VALUE2, cr3_target_value2),
 734        FIELD(CR3_TARGET_VALUE3, cr3_target_value3),
 735        FIELD(EXIT_QUALIFICATION, exit_qualification),
 736        FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address),
 737        FIELD(GUEST_CR0, guest_cr0),
 738        FIELD(GUEST_CR3, guest_cr3),
 739        FIELD(GUEST_CR4, guest_cr4),
 740        FIELD(GUEST_ES_BASE, guest_es_base),
 741        FIELD(GUEST_CS_BASE, guest_cs_base),
 742        FIELD(GUEST_SS_BASE, guest_ss_base),
 743        FIELD(GUEST_DS_BASE, guest_ds_base),
 744        FIELD(GUEST_FS_BASE, guest_fs_base),
 745        FIELD(GUEST_GS_BASE, guest_gs_base),
 746        FIELD(GUEST_LDTR_BASE, guest_ldtr_base),
 747        FIELD(GUEST_TR_BASE, guest_tr_base),
 748        FIELD(GUEST_GDTR_BASE, guest_gdtr_base),
 749        FIELD(GUEST_IDTR_BASE, guest_idtr_base),
 750        FIELD(GUEST_DR7, guest_dr7),
 751        FIELD(GUEST_RSP, guest_rsp),
 752        FIELD(GUEST_RIP, guest_rip),
 753        FIELD(GUEST_RFLAGS, guest_rflags),
 754        FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions),
 755        FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp),
 756        FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip),
 757        FIELD(HOST_CR0, host_cr0),
 758        FIELD(HOST_CR3, host_cr3),
 759        FIELD(HOST_CR4, host_cr4),
 760        FIELD(HOST_FS_BASE, host_fs_base),
 761        FIELD(HOST_GS_BASE, host_gs_base),
 762        FIELD(HOST_TR_BASE, host_tr_base),
 763        FIELD(HOST_GDTR_BASE, host_gdtr_base),
 764        FIELD(HOST_IDTR_BASE, host_idtr_base),
 765        FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp),
 766        FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip),
 767        FIELD(HOST_RSP, host_rsp),
 768        FIELD(HOST_RIP, host_rip),
 769};
 770
 771static inline short vmcs_field_to_offset(unsigned long field)
 772{
 773        BUILD_BUG_ON(ARRAY_SIZE(vmcs_field_to_offset_table) > SHRT_MAX);
 774
 775        if (field >= ARRAY_SIZE(vmcs_field_to_offset_table) ||
 776            vmcs_field_to_offset_table[field] == 0)
 777                return -ENOENT;
 778
 779        return vmcs_field_to_offset_table[field];
 780}
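/*
 * The table above turns a VMCS field encoding into a byte offset within
 * struct vmcs12, so one helper can service any field. An illustrative
 * sketch for a 16-bit field (read_vmcs12_u16() is a hypothetical name, not
 * an in-tree helper):
 *
 *	static u16 read_vmcs12_u16(struct vmcs12 *vmcs12, unsigned long field)
 *	{
 *		short offset = vmcs_field_to_offset(field);
 *
 *		if (offset < 0)
 *			return 0;
 *		return *(u16 *)((char *)vmcs12 + offset);
 *	}
 */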
 781
 782static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
 783{
 784        return to_vmx(vcpu)->nested.current_vmcs12;
 785}
 786
 787static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr)
 788{
 789        struct page *page = gfn_to_page(vcpu->kvm, addr >> PAGE_SHIFT);
 790        if (is_error_page(page))
 791                return NULL;
 792
 793        return page;
 794}
 795
 796static void nested_release_page(struct page *page)
 797{
 798        kvm_release_page_dirty(page);
 799}
 800
 801static void nested_release_page_clean(struct page *page)
 802{
 803        kvm_release_page_clean(page);
 804}
 805
 806static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
 807static u64 construct_eptp(unsigned long root_hpa);
 808static void kvm_cpu_vmxon(u64 addr);
 809static void kvm_cpu_vmxoff(void);
 810static bool vmx_mpx_supported(void);
 811static bool vmx_xsaves_supported(void);
 812static int vmx_vm_has_apicv(struct kvm *kvm);
 813static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
 814static void vmx_set_segment(struct kvm_vcpu *vcpu,
 815                            struct kvm_segment *var, int seg);
 816static void vmx_get_segment(struct kvm_vcpu *vcpu,
 817                            struct kvm_segment *var, int seg);
 818static bool guest_state_valid(struct kvm_vcpu *vcpu);
 819static u32 vmx_segment_access_rights(struct kvm_segment *var);
 820static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu);
 821static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx);
 822static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
 823static int alloc_identity_pagetable(struct kvm *kvm);
 824
 825static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 826static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
 827/*
  828 * We maintain a per-CPU linked-list of VMCSs loaded on that CPU. This is needed
 829 * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
 830 */
 831static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
 832static DEFINE_PER_CPU(struct desc_ptr, host_gdt);
 833
 834static unsigned long *vmx_io_bitmap_a;
 835static unsigned long *vmx_io_bitmap_b;
 836static unsigned long *vmx_msr_bitmap_legacy;
 837static unsigned long *vmx_msr_bitmap_longmode;
 838static unsigned long *vmx_msr_bitmap_legacy_x2apic;
 839static unsigned long *vmx_msr_bitmap_longmode_x2apic;
 840static unsigned long *vmx_msr_bitmap_nested;
 841static unsigned long *vmx_vmread_bitmap;
 842static unsigned long *vmx_vmwrite_bitmap;
 843
 844static bool cpu_has_load_ia32_efer;
 845static bool cpu_has_load_perf_global_ctrl;
 846
 847static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
 848static DEFINE_SPINLOCK(vmx_vpid_lock);
 849
 850static struct vmcs_config {
 851        int size;
 852        int order;
 853        u32 revision_id;
 854        u32 pin_based_exec_ctrl;
 855        u32 cpu_based_exec_ctrl;
 856        u32 cpu_based_2nd_exec_ctrl;
 857        u32 vmexit_ctrl;
 858        u32 vmentry_ctrl;
 859} vmcs_config;
 860
 861static struct vmx_capability {
 862        u32 ept;
 863        u32 vpid;
 864} vmx_capability;
 865
 866#define VMX_SEGMENT_FIELD(seg)                                  \
 867        [VCPU_SREG_##seg] = {                                   \
 868                .selector = GUEST_##seg##_SELECTOR,             \
 869                .base = GUEST_##seg##_BASE,                     \
 870                .limit = GUEST_##seg##_LIMIT,                   \
 871                .ar_bytes = GUEST_##seg##_AR_BYTES,             \
 872        }
 873
 874static const struct kvm_vmx_segment_field {
 875        unsigned selector;
 876        unsigned base;
 877        unsigned limit;
 878        unsigned ar_bytes;
 879} kvm_vmx_segment_fields[] = {
 880        VMX_SEGMENT_FIELD(CS),
 881        VMX_SEGMENT_FIELD(DS),
 882        VMX_SEGMENT_FIELD(ES),
 883        VMX_SEGMENT_FIELD(FS),
 884        VMX_SEGMENT_FIELD(GS),
 885        VMX_SEGMENT_FIELD(SS),
 886        VMX_SEGMENT_FIELD(TR),
 887        VMX_SEGMENT_FIELD(LDTR),
 888};
 889
 890static u64 host_efer;
 891
 892static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
 893
 894/*
 895 * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it
 896 * away by decrementing the array size.
 897 */
 898static const u32 vmx_msr_index[] = {
 899#ifdef CONFIG_X86_64
 900        MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
 901#endif
 902        MSR_EFER, MSR_TSC_AUX, MSR_STAR,
 903};
 904
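/*
 * The predicates below decode the VM-exit interruption-information field:
 * bits 7:0 hold the vector, bits 10:8 the event type and bit 31 the valid
 * bit, so each helper matches vector, type and validity in one comparison.
 */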
 905static inline bool is_page_fault(u32 intr_info)
 906{
 907        return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
 908                             INTR_INFO_VALID_MASK)) ==
 909                (INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
 910}
 911
 912static inline bool is_no_device(u32 intr_info)
 913{
 914        return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
 915                             INTR_INFO_VALID_MASK)) ==
 916                (INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
 917}
 918
 919static inline bool is_invalid_opcode(u32 intr_info)
 920{
 921        return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
 922                             INTR_INFO_VALID_MASK)) ==
 923                (INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
 924}
 925
 926static inline bool is_external_interrupt(u32 intr_info)
 927{
 928        return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
 929                == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
 930}
 931
 932static inline bool is_machine_check(u32 intr_info)
 933{
 934        return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
 935                             INTR_INFO_VALID_MASK)) ==
 936                (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK);
 937}
 938
 939static inline bool cpu_has_vmx_msr_bitmap(void)
 940{
 941        return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS;
 942}
 943
 944static inline bool cpu_has_vmx_tpr_shadow(void)
 945{
 946        return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
 947}
 948
 949static inline bool vm_need_tpr_shadow(struct kvm *kvm)
 950{
 951        return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm));
 952}
 953
 954static inline bool cpu_has_secondary_exec_ctrls(void)
 955{
 956        return vmcs_config.cpu_based_exec_ctrl &
 957                CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
 958}
 959
 960static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
 961{
 962        return vmcs_config.cpu_based_2nd_exec_ctrl &
 963                SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
 964}
 965
 966static inline bool cpu_has_vmx_virtualize_x2apic_mode(void)
 967{
 968        return vmcs_config.cpu_based_2nd_exec_ctrl &
 969                SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
 970}
 971
 972static inline bool cpu_has_vmx_apic_register_virt(void)
 973{
 974        return vmcs_config.cpu_based_2nd_exec_ctrl &
 975                SECONDARY_EXEC_APIC_REGISTER_VIRT;
 976}
 977
 978static inline bool cpu_has_vmx_virtual_intr_delivery(void)
 979{
 980        return vmcs_config.cpu_based_2nd_exec_ctrl &
 981                SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
 982}
 983
 984static inline bool cpu_has_vmx_posted_intr(void)
 985{
 986        return vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR;
 987}
 988
 989static inline bool cpu_has_vmx_apicv(void)
 990{
 991        return cpu_has_vmx_apic_register_virt() &&
 992                cpu_has_vmx_virtual_intr_delivery() &&
 993                cpu_has_vmx_posted_intr();
 994}
 995
 996static inline bool cpu_has_vmx_flexpriority(void)
 997{
 998        return cpu_has_vmx_tpr_shadow() &&
 999                cpu_has_vmx_virtualize_apic_accesses();
1000}
1001
1002static inline bool cpu_has_vmx_ept_execute_only(void)
1003{
1004        return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT;
1005}
1006
1007static inline bool cpu_has_vmx_ept_2m_page(void)
1008{
1009        return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT;
1010}
1011
1012static inline bool cpu_has_vmx_ept_1g_page(void)
1013{
1014        return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT;
1015}
1016
1017static inline bool cpu_has_vmx_ept_4levels(void)
1018{
1019        return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT;
1020}
1021
1022static inline bool cpu_has_vmx_ept_ad_bits(void)
1023{
1024        return vmx_capability.ept & VMX_EPT_AD_BIT;
1025}
1026
1027static inline bool cpu_has_vmx_invept_context(void)
1028{
1029        return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT;
1030}
1031
1032static inline bool cpu_has_vmx_invept_global(void)
1033{
1034        return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
1035}
1036
1037static inline bool cpu_has_vmx_invvpid_single(void)
1038{
1039        return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT;
1040}
1041
1042static inline bool cpu_has_vmx_invvpid_global(void)
1043{
1044        return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
1045}
1046
1047static inline bool cpu_has_vmx_ept(void)
1048{
1049        return vmcs_config.cpu_based_2nd_exec_ctrl &
1050                SECONDARY_EXEC_ENABLE_EPT;
1051}
1052
1053static inline bool cpu_has_vmx_unrestricted_guest(void)
1054{
1055        return vmcs_config.cpu_based_2nd_exec_ctrl &
1056                SECONDARY_EXEC_UNRESTRICTED_GUEST;
1057}
1058
1059static inline bool cpu_has_vmx_ple(void)
1060{
1061        return vmcs_config.cpu_based_2nd_exec_ctrl &
1062                SECONDARY_EXEC_PAUSE_LOOP_EXITING;
1063}
1064
1065static inline bool vm_need_virtualize_apic_accesses(struct kvm *kvm)
1066{
1067        return flexpriority_enabled && irqchip_in_kernel(kvm);
1068}
1069
1070static inline bool cpu_has_vmx_vpid(void)
1071{
1072        return vmcs_config.cpu_based_2nd_exec_ctrl &
1073                SECONDARY_EXEC_ENABLE_VPID;
1074}
1075
1076static inline bool cpu_has_vmx_rdtscp(void)
1077{
1078        return vmcs_config.cpu_based_2nd_exec_ctrl &
1079                SECONDARY_EXEC_RDTSCP;
1080}
1081
1082static inline bool cpu_has_vmx_invpcid(void)
1083{
1084        return vmcs_config.cpu_based_2nd_exec_ctrl &
1085                SECONDARY_EXEC_ENABLE_INVPCID;
1086}
1087
1088static inline bool cpu_has_virtual_nmis(void)
1089{
1090        return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
1091}
1092
1093static inline bool cpu_has_vmx_wbinvd_exit(void)
1094{
1095        return vmcs_config.cpu_based_2nd_exec_ctrl &
1096                SECONDARY_EXEC_WBINVD_EXITING;
1097}
1098
1099static inline bool cpu_has_vmx_shadow_vmcs(void)
1100{
1101        u64 vmx_msr;
1102        rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
1103        /* check if the cpu supports writing r/o exit information fields */
1104        if (!(vmx_msr & MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS))
1105                return false;
1106
1107        return vmcs_config.cpu_based_2nd_exec_ctrl &
1108                SECONDARY_EXEC_SHADOW_VMCS;
1109}
1110
1111static inline bool cpu_has_vmx_pml(void)
1112{
1113        return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML;
1114}
1115
1116static inline bool report_flexpriority(void)
1117{
1118        return flexpriority_enabled;
1119}
1120
1121static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit)
1122{
1123        return vmcs12->cpu_based_vm_exec_control & bit;
1124}
1125
1126static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit)
1127{
1128        return (vmcs12->cpu_based_vm_exec_control &
1129                        CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
1130                (vmcs12->secondary_vm_exec_control & bit);
1131}
1132
1133static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12)
1134{
1135        return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
1136}
1137
1138static inline bool nested_cpu_has_preemption_timer(struct vmcs12 *vmcs12)
1139{
1140        return vmcs12->pin_based_vm_exec_control &
1141                PIN_BASED_VMX_PREEMPTION_TIMER;
1142}
1143
1144static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
1145{
1146        return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT);
1147}
1148
1149static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12)
1150{
1151        return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES) &&
1152                vmx_xsaves_supported();
1153}
1154
1155static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12)
1156{
1157        return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
1158}
1159
1160static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12)
1161{
1162        return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT);
1163}
1164
1165static inline bool nested_cpu_has_vid(struct vmcs12 *vmcs12)
1166{
1167        return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
1168}
1169
1170static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12)
1171{
1172        return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR;
1173}
1174
1175static inline bool is_exception(u32 intr_info)
1176{
1177        return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
1178                == (INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK);
1179}
1180
1181static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
1182                              u32 exit_intr_info,
1183                              unsigned long exit_qualification);
1184static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
1185                        struct vmcs12 *vmcs12,
1186                        u32 reason, unsigned long qualification);
1187
1188static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
1189{
1190        int i;
1191
1192        for (i = 0; i < vmx->nmsrs; ++i)
1193                if (vmx_msr_index[vmx->guest_msrs[i].index] == msr)
1194                        return i;
1195        return -1;
1196}
1197
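/*
 * __invvpid() and __invept() build the 128-bit memory descriptors that the
 * INVVPID and INVEPT instructions expect; the "ja 1f; ud2" sequence turns an
 * instruction failure (CF or ZF set) into a #UD instead of continuing
 * silently.
 */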
1198static inline void __invvpid(int ext, u16 vpid, gva_t gva)
1199{
 1200        struct {
 1201                u64 vpid : 16;
 1202                u64 rsvd : 48;
 1203                u64 gva;
 1204        } operand = { vpid, 0, gva };
 1205
 1206        asm volatile (__ex(ASM_VMX_INVVPID)
 1207                      /* CF==1 or ZF==1 --> rc = -1 */
 1208                      "; ja 1f ; ud2 ; 1:"
 1209                      : : "a"(&operand), "c"(ext) : "cc", "memory");
1210}
1211
1212static inline void __invept(int ext, u64 eptp, gpa_t gpa)
1213{
1214        struct {
1215                u64 eptp, gpa;
1216        } operand = {eptp, gpa};
1217
1218        asm volatile (__ex(ASM_VMX_INVEPT)
1219                        /* CF==1 or ZF==1 --> rc = -1 */
1220                        "; ja 1f ; ud2 ; 1:\n"
1221                        : : "a" (&operand), "c" (ext) : "cc", "memory");
1222}
1223
1224static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
1225{
1226        int i;
1227
1228        i = __find_msr_index(vmx, msr);
1229        if (i >= 0)
1230                return &vmx->guest_msrs[i];
1231        return NULL;
1232}
1233
1234static void vmcs_clear(struct vmcs *vmcs)
1235{
1236        u64 phys_addr = __pa(vmcs);
1237        u8 error;
1238
1239        asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0"
1240                      : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
1241                      : "cc", "memory");
1242        if (error)
1243                printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
1244                       vmcs, phys_addr);
1245}
1246
1247static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
1248{
1249        vmcs_clear(loaded_vmcs->vmcs);
1250        loaded_vmcs->cpu = -1;
1251        loaded_vmcs->launched = 0;
1252}
1253
1254static void vmcs_load(struct vmcs *vmcs)
1255{
1256        u64 phys_addr = __pa(vmcs);
1257        u8 error;
1258
1259        asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
1260                        : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
1261                        : "cc", "memory");
1262        if (error)
1263                printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n",
1264                       vmcs, phys_addr);
1265}
1266
1267#ifdef CONFIG_KEXEC
 1268/*
 1269 * This bitmap indicates, for each CPU, whether the crash-time vmclear
 1270 * operation is enabled on that CPU. All CPUs are disabled by
 1271 * default.
 1272 */
1273static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE;
1274
1275static inline void crash_enable_local_vmclear(int cpu)
1276{
1277        cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap);
1278}
1279
1280static inline void crash_disable_local_vmclear(int cpu)
1281{
1282        cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap);
1283}
1284
1285static inline int crash_local_vmclear_enabled(int cpu)
1286{
1287        return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap);
1288}
1289
1290static void crash_vmclear_local_loaded_vmcss(void)
1291{
1292        int cpu = raw_smp_processor_id();
1293        struct loaded_vmcs *v;
1294
1295        if (!crash_local_vmclear_enabled(cpu))
1296                return;
1297
1298        list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
1299                            loaded_vmcss_on_cpu_link)
1300                vmcs_clear(v->vmcs);
1301}
1302#else
1303static inline void crash_enable_local_vmclear(int cpu) { }
1304static inline void crash_disable_local_vmclear(int cpu) { }
1305#endif /* CONFIG_KEXEC */
1306
1307static void __loaded_vmcs_clear(void *arg)
1308{
1309        struct loaded_vmcs *loaded_vmcs = arg;
1310        int cpu = raw_smp_processor_id();
1311
1312        if (loaded_vmcs->cpu != cpu)
1313                return; /* vcpu migration can race with cpu offline */
1314        if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
1315                per_cpu(current_vmcs, cpu) = NULL;
1316        crash_disable_local_vmclear(cpu);
1317        list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
1318
 1319        /*
 1320         * Ensure that the update of loaded_vmcs->loaded_vmcss_on_cpu_link
 1321         * happens before loaded_vmcs->cpu is set to -1 in loaded_vmcs_init.
 1322         * Otherwise, another cpu could see cpu == -1 first and then add the
 1323         * vmcs to its percpu list before this one is deleted.
 1324         */
1325        smp_wmb();
1326
1327        loaded_vmcs_init(loaded_vmcs);
1328        crash_enable_local_vmclear(cpu);
1329}
1330
1331static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
1332{
1333        int cpu = loaded_vmcs->cpu;
1334
1335        if (cpu != -1)
1336                smp_call_function_single(cpu,
1337                         __loaded_vmcs_clear, loaded_vmcs, 1);
1338}
1339
1340static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx)
1341{
1342        if (vmx->vpid == 0)
1343                return;
1344
1345        if (cpu_has_vmx_invvpid_single())
1346                __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0);
1347}
1348
1349static inline void vpid_sync_vcpu_global(void)
1350{
1351        if (cpu_has_vmx_invvpid_global())
1352                __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
1353}
1354
1355static inline void vpid_sync_context(struct vcpu_vmx *vmx)
1356{
1357        if (cpu_has_vmx_invvpid_single())
1358                vpid_sync_vcpu_single(vmx);
1359        else
1360                vpid_sync_vcpu_global();
1361}
1362
1363static inline void ept_sync_global(void)
1364{
1365        if (cpu_has_vmx_invept_global())
1366                __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
1367}
1368
1369static inline void ept_sync_context(u64 eptp)
1370{
1371        if (enable_ept) {
1372                if (cpu_has_vmx_invept_context())
1373                        __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
1374                else
1375                        ept_sync_global();
1376        }
1377}
1378
1379static __always_inline unsigned long vmcs_readl(unsigned long field)
1380{
1381        unsigned long value;
1382
1383        asm volatile (__ex_clear(ASM_VMX_VMREAD_RDX_RAX, "%0")
1384                      : "=a"(value) : "d"(field) : "cc");
1385        return value;
1386}
1387
1388static __always_inline u16 vmcs_read16(unsigned long field)
1389{
1390        return vmcs_readl(field);
1391}
1392
1393static __always_inline u32 vmcs_read32(unsigned long field)
1394{
1395        return vmcs_readl(field);
1396}
1397
1398static __always_inline u64 vmcs_read64(unsigned long field)
1399{
1400#ifdef CONFIG_X86_64
1401        return vmcs_readl(field);
1402#else
1403        return vmcs_readl(field) | ((u64)vmcs_readl(field+1) << 32);
1404#endif
1405}
1406
1407static noinline void vmwrite_error(unsigned long field, unsigned long value)
1408{
1409        printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n",
1410               field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
1411        dump_stack();
1412}
1413
1414static void vmcs_writel(unsigned long field, unsigned long value)
1415{
1416        u8 error;
1417
1418        asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) "; setna %0"
1419                       : "=q"(error) : "a"(value), "d"(field) : "cc");
1420        if (unlikely(error))
1421                vmwrite_error(field, value);
1422}
1423
1424static void vmcs_write16(unsigned long field, u16 value)
1425{
1426        vmcs_writel(field, value);
1427}
1428
1429static void vmcs_write32(unsigned long field, u32 value)
1430{
1431        vmcs_writel(field, value);
1432}
1433
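/*
 * On 32-bit hosts a 64-bit VMCS field is accessed as two 32-bit halves, the
 * high half living at the field encoding plus one; vmcs_read64() above and
 * vmcs_write64() below handle that split, the empty asm presumably serving
 * as a barrier between the two writes.
 */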
1434static void vmcs_write64(unsigned long field, u64 value)
1435{
1436        vmcs_writel(field, value);
1437#ifndef CONFIG_X86_64
1438        asm volatile ("");
1439        vmcs_writel(field+1, value >> 32);
1440#endif
1441}
1442
1443static void vmcs_clear_bits(unsigned long field, u32 mask)
1444{
1445        vmcs_writel(field, vmcs_readl(field) & ~mask);
1446}
1447
1448static void vmcs_set_bits(unsigned long field, u32 mask)
1449{
1450        vmcs_writel(field, vmcs_readl(field) | mask);
1451}
1452
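/*
 * The VM-entry and VM-exit controls are mirrored in software shadows
 * (vm_entry_controls_shadow/vm_exit_controls_shadow) so the helpers below
 * can skip the VMWRITE when the requested value already matches, and so
 * individual control bits can be set or cleared cheaply.
 */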
1453static inline void vm_entry_controls_init(struct vcpu_vmx *vmx, u32 val)
1454{
1455        vmcs_write32(VM_ENTRY_CONTROLS, val);
1456        vmx->vm_entry_controls_shadow = val;
1457}
1458
1459static inline void vm_entry_controls_set(struct vcpu_vmx *vmx, u32 val)
1460{
1461        if (vmx->vm_entry_controls_shadow != val)
1462                vm_entry_controls_init(vmx, val);
1463}
1464
1465static inline u32 vm_entry_controls_get(struct vcpu_vmx *vmx)
1466{
1467        return vmx->vm_entry_controls_shadow;
1468}
1469
1470
1471static inline void vm_entry_controls_setbit(struct vcpu_vmx *vmx, u32 val)
1472{
1473        vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) | val);
1474}
1475
1476static inline void vm_entry_controls_clearbit(struct vcpu_vmx *vmx, u32 val)
1477{
1478        vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) & ~val);
1479}
1480
1481static inline void vm_exit_controls_init(struct vcpu_vmx *vmx, u32 val)
1482{
1483        vmcs_write32(VM_EXIT_CONTROLS, val);
1484        vmx->vm_exit_controls_shadow = val;
1485}
1486
1487static inline void vm_exit_controls_set(struct vcpu_vmx *vmx, u32 val)
1488{
1489        if (vmx->vm_exit_controls_shadow != val)
1490                vm_exit_controls_init(vmx, val);
1491}
1492
1493static inline u32 vm_exit_controls_get(struct vcpu_vmx *vmx)
1494{
1495        return vmx->vm_exit_controls_shadow;
1496}
1497
1498
1499static inline void vm_exit_controls_setbit(struct vcpu_vmx *vmx, u32 val)
1500{
1501        vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) | val);
1502}
1503
1504static inline void vm_exit_controls_clearbit(struct vcpu_vmx *vmx, u32 val)
1505{
1506        vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) & ~val);
1507}
1508
1509static void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
1510{
1511        vmx->segment_cache.bitmask = 0;
1512}
1513
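/*
 * vmx_segment_cache_test_set() reports whether the given segment field was
 * already cached and marks it cached either way, so the readers below only
 * issue a VMREAD on a cache miss; the whole cache is invalidated when
 * VCPU_EXREG_SEGMENTS is cleared from regs_avail.
 */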
1514static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
1515                                       unsigned field)
1516{
1517        bool ret;
1518        u32 mask = 1 << (seg * SEG_FIELD_NR + field);
1519
1520        if (!(vmx->vcpu.arch.regs_avail & (1 << VCPU_EXREG_SEGMENTS))) {
1521                vmx->vcpu.arch.regs_avail |= (1 << VCPU_EXREG_SEGMENTS);
1522                vmx->segment_cache.bitmask = 0;
1523        }
1524        ret = vmx->segment_cache.bitmask & mask;
1525        vmx->segment_cache.bitmask |= mask;
1526        return ret;
1527}
1528
1529static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)
1530{
1531        u16 *p = &vmx->segment_cache.seg[seg].selector;
1532
1533        if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))
1534                *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector);
1535        return *p;
1536}
1537
1538static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)
1539{
1540        ulong *p = &vmx->segment_cache.seg[seg].base;
1541
1542        if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))
1543                *p = vmcs_readl(kvm_vmx_segment_fields[seg].base);
1544        return *p;
1545}
1546
1547static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)
1548{
1549        u32 *p = &vmx->segment_cache.seg[seg].limit;
1550
1551        if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))
1552                *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit);
1553        return *p;
1554}
1555
1556static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
1557{
1558        u32 *p = &vmx->segment_cache.seg[seg].ar;
1559
1560        if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))
1561                *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);
1562        return *p;
1563}
1564
1565static void update_exception_bitmap(struct kvm_vcpu *vcpu)
1566{
1567        u32 eb;
1568
1569        eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
1570             (1u << NM_VECTOR) | (1u << DB_VECTOR);
1571        if ((vcpu->guest_debug &
1572             (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
1573            (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
1574                eb |= 1u << BP_VECTOR;
1575        if (to_vmx(vcpu)->rmode.vm86_active)
1576                eb = ~0;
1577        if (enable_ept)
1578                eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
1579        if (vcpu->fpu_active)
1580                eb &= ~(1u << NM_VECTOR);
1581
1582        /* When we are running a nested L2 guest and L1 specified for it a
1583         * certain exception bitmap, we must trap the same exceptions and pass
1584         * them to L1. When running L2, we will only handle the exceptions
1585         * specified above if L1 did not want them.
1586         */
1587        if (is_guest_mode(vcpu))
1588                eb |= get_vmcs12(vcpu)->exception_bitmap;
1589
1590        vmcs_write32(EXCEPTION_BITMAP, eb);
1591}
1592
1593static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
1594                unsigned long entry, unsigned long exit)
1595{
1596        vm_entry_controls_clearbit(vmx, entry);
1597        vm_exit_controls_clearbit(vmx, exit);
1598}
1599
1600static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
1601{
1602        unsigned i;
1603        struct msr_autoload *m = &vmx->msr_autoload;
1604
1605        switch (msr) {
1606        case MSR_EFER:
1607                if (cpu_has_load_ia32_efer) {
1608                        clear_atomic_switch_msr_special(vmx,
1609                                        VM_ENTRY_LOAD_IA32_EFER,
1610                                        VM_EXIT_LOAD_IA32_EFER);
1611                        return;
1612                }
1613                break;
1614        case MSR_CORE_PERF_GLOBAL_CTRL:
1615                if (cpu_has_load_perf_global_ctrl) {
1616                        clear_atomic_switch_msr_special(vmx,
1617                                        VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
1618                                        VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
1619                        return;
1620                }
1621                break;
1622        }
1623
1624        for (i = 0; i < m->nr; ++i)
1625                if (m->guest[i].index == msr)
1626                        break;
1627
1628        if (i == m->nr)
1629                return;
1630        --m->nr;
1631        m->guest[i] = m->guest[m->nr];
1632        m->host[i] = m->host[m->nr];
1633        vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
1634        vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
1635}
1636
1637static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
1638                unsigned long entry, unsigned long exit,
1639                unsigned long guest_val_vmcs, unsigned long host_val_vmcs,
1640                u64 guest_val, u64 host_val)
1641{
1642        vmcs_write64(guest_val_vmcs, guest_val);
1643        vmcs_write64(host_val_vmcs, host_val);
1644        vm_entry_controls_setbit(vmx, entry);
1645        vm_exit_controls_setbit(vmx, exit);
1646}
1647
1648static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
1649                                  u64 guest_val, u64 host_val)
1650{
1651        unsigned i;
1652        struct msr_autoload *m = &vmx->msr_autoload;
1653
1654        switch (msr) {
1655        case MSR_EFER:
1656                if (cpu_has_load_ia32_efer) {
1657                        add_atomic_switch_msr_special(vmx,
1658                                        VM_ENTRY_LOAD_IA32_EFER,
1659                                        VM_EXIT_LOAD_IA32_EFER,
1660                                        GUEST_IA32_EFER,
1661                                        HOST_IA32_EFER,
1662                                        guest_val, host_val);
1663                        return;
1664                }
1665                break;
1666        case MSR_CORE_PERF_GLOBAL_CTRL:
1667                if (cpu_has_load_perf_global_ctrl) {
1668                        add_atomic_switch_msr_special(vmx,
1669                                        VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
1670                                        VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
1671                                        GUEST_IA32_PERF_GLOBAL_CTRL,
1672                                        HOST_IA32_PERF_GLOBAL_CTRL,
1673                                        guest_val, host_val);
1674                        return;
1675                }
1676                break;
1677        }
1678
1679        for (i = 0; i < m->nr; ++i)
1680                if (m->guest[i].index == msr)
1681                        break;
1682
1683        if (i == NR_AUTOLOAD_MSRS) {
1684                printk_once(KERN_WARNING "Not enough msr switch entries. "
1685                                "Can't add msr %x\n", msr);
1686                return;
1687        } else if (i == m->nr) {
1688                ++m->nr;
1689                vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
1690                vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
1691        }
1692
1693        m->guest[i].index = msr;
1694        m->guest[i].value = guest_val;
1695        m->host[i].index = msr;
1696        m->host[i].value = host_val;
1697}
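/*
 * Typical usage is a guest/host value pair that must flip across VM entry
 * and VM exit, e.g. (sketched from the IA32_XSS handling further below):
 *
 *     if (vcpu->arch.ia32_xss != host_xss)
 *             add_atomic_switch_msr(vmx, MSR_IA32_XSS,
 *                                   vcpu->arch.ia32_xss, host_xss);
 *     else
 *             clear_atomic_switch_msr(vmx, MSR_IA32_XSS);
 *
 * Requests beyond NR_AUTOLOAD_MSRS entries are dropped with a one-time
 * warning.
 */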
1698
1699static void reload_tss(void)
1700{
1701        /*
1702         * VT restores TR but not its size.  Useless.
1703         */
1704        struct desc_ptr *gdt = this_cpu_ptr(&host_gdt);
1705        struct desc_struct *descs;
1706
1707        descs = (void *)gdt->address;
1708        descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
1709        load_TR_desc();
1710}
1711
1712static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
1713{
1714        u64 guest_efer;
1715        u64 ignore_bits;
1716
1717        guest_efer = vmx->vcpu.arch.efer;
1718
1719        /*
1720         * NX is emulated; LMA and LME handled by hardware; SCE meaningless
1721         * outside long mode
1722         */
1723        ignore_bits = EFER_NX | EFER_SCE;
1724#ifdef CONFIG_X86_64
1725        ignore_bits |= EFER_LMA | EFER_LME;
1726        /* SCE is meaningful only in long mode on Intel */
1727        if (guest_efer & EFER_LMA)
1728                ignore_bits &= ~(u64)EFER_SCE;
1729#endif
1730        guest_efer &= ~ignore_bits;
1731        guest_efer |= host_efer & ignore_bits;
1732        vmx->guest_msrs[efer_offset].data = guest_efer;
1733        vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
1734
1735        clear_atomic_switch_msr(vmx, MSR_EFER);
1736
1737        /*
1738         * On EPT, we can't emulate NX, so we must switch EFER atomically.
1739         * On CPUs that support "load IA32_EFER", always switch EFER
1740         * atomically, since it's faster than switching it manually.
1741         */
1742        if (cpu_has_load_ia32_efer ||
1743            (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
1744                guest_efer = vmx->vcpu.arch.efer;
1745                if (!(guest_efer & EFER_LMA))
1746                        guest_efer &= ~EFER_LME;
1747                if (guest_efer != host_efer)
1748                        add_atomic_switch_msr(vmx, MSR_EFER,
1749                                              guest_efer, host_efer);
1750                return false;
1751        }
1752
1753        return true;
1754}
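/*
 * The return value is consumed by setup_msrs() below: true means the EFER
 * entry stays in guest_msrs[] and is switched through the shared-MSR
 * machinery; false means EFER is already handled here, either via the
 * dedicated "load IA32_EFER" entry/exit controls, via the atomic-switch
 * MSR lists, or not at all when guest and host values match.
 */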
1755
1756static unsigned long segment_base(u16 selector)
1757{
1758        struct desc_ptr *gdt = this_cpu_ptr(&host_gdt);
1759        struct desc_struct *d;
1760        unsigned long table_base;
1761        unsigned long v;
1762
1763        if (!(selector & ~3))
1764                return 0;
1765
1766        table_base = gdt->address;
1767
1768        if (selector & 4) {           /* from ldt */
1769                u16 ldt_selector = kvm_read_ldt();
1770
1771                if (!(ldt_selector & ~3))
1772                        return 0;
1773
1774                table_base = segment_base(ldt_selector);
1775        }
1776        d = (struct desc_struct *)(table_base + (selector & ~7));
1777        v = get_desc_base(d);
1778#ifdef CONFIG_X86_64
1779        if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
1780                v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
1781#endif
1782        return v;
1783}
1784
1785static inline unsigned long kvm_read_tr_base(void)
1786{
1787        u16 tr;
1788        asm("str %0" : "=g"(tr));
1789        return segment_base(tr);
1790}
1791
1792static void vmx_save_host_state(struct kvm_vcpu *vcpu)
1793{
1794        struct vcpu_vmx *vmx = to_vmx(vcpu);
1795        int i;
1796
1797        if (vmx->host_state.loaded)
1798                return;
1799
1800        vmx->host_state.loaded = 1;
1801        /*
1802         * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
1803         * allow segment selectors with cpl > 0 or ti == 1.
1804         */
1805        vmx->host_state.ldt_sel = kvm_read_ldt();
1806        vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
1807        savesegment(fs, vmx->host_state.fs_sel);
1808        if (!(vmx->host_state.fs_sel & 7)) {
1809                vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
1810                vmx->host_state.fs_reload_needed = 0;
1811        } else {
1812                vmcs_write16(HOST_FS_SELECTOR, 0);
1813                vmx->host_state.fs_reload_needed = 1;
1814        }
1815        savesegment(gs, vmx->host_state.gs_sel);
1816        if (!(vmx->host_state.gs_sel & 7))
1817                vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel);
1818        else {
1819                vmcs_write16(HOST_GS_SELECTOR, 0);
1820                vmx->host_state.gs_ldt_reload_needed = 1;
1821        }
1822
1823#ifdef CONFIG_X86_64
1824        savesegment(ds, vmx->host_state.ds_sel);
1825        savesegment(es, vmx->host_state.es_sel);
1826#endif
1827
1828#ifdef CONFIG_X86_64
1829        vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
1830        vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
1831#else
1832        vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel));
1833        vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel));
1834#endif
1835
1836#ifdef CONFIG_X86_64
1837        rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
1838        if (is_long_mode(&vmx->vcpu))
1839                wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
1840#endif
1841        if (boot_cpu_has(X86_FEATURE_MPX))
1842                rdmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs);
1843        for (i = 0; i < vmx->save_nmsrs; ++i)
1844                kvm_set_shared_msr(vmx->guest_msrs[i].index,
1845                                   vmx->guest_msrs[i].data,
1846                                   vmx->guest_msrs[i].mask);
1847}
1848
1849static void __vmx_load_host_state(struct vcpu_vmx *vmx)
1850{
1851        if (!vmx->host_state.loaded)
1852                return;
1853
1854        ++vmx->vcpu.stat.host_state_reload;
1855        vmx->host_state.loaded = 0;
1856#ifdef CONFIG_X86_64
1857        if (is_long_mode(&vmx->vcpu))
1858                rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
1859#endif
1860        if (vmx->host_state.gs_ldt_reload_needed) {
1861                kvm_load_ldt(vmx->host_state.ldt_sel);
1862#ifdef CONFIG_X86_64
1863                load_gs_index(vmx->host_state.gs_sel);
1864#else
1865                loadsegment(gs, vmx->host_state.gs_sel);
1866#endif
1867        }
1868        if (vmx->host_state.fs_reload_needed)
1869                loadsegment(fs, vmx->host_state.fs_sel);
1870#ifdef CONFIG_X86_64
1871        if (unlikely(vmx->host_state.ds_sel | vmx->host_state.es_sel)) {
1872                loadsegment(ds, vmx->host_state.ds_sel);
1873                loadsegment(es, vmx->host_state.es_sel);
1874        }
1875#endif
1876        reload_tss();
1877#ifdef CONFIG_X86_64
1878        wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
1879#endif
1880        if (vmx->host_state.msr_host_bndcfgs)
1881                wrmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs);
1882        /*
1883         * If the FPU is not active (through the host task or
1884         * the guest vcpu), then restore the cr0.TS bit.
1885         */
1886        if (!user_has_fpu() && !vmx->vcpu.guest_fpu_loaded)
1887                stts();
1888        load_gdt(this_cpu_ptr(&host_gdt));
1889}
1890
1891static void vmx_load_host_state(struct vcpu_vmx *vmx)
1892{
1893        preempt_disable();
1894        __vmx_load_host_state(vmx);
1895        preempt_enable();
1896}
1897
1898/*
1899 * Switches to specified vcpu, until a matching vcpu_put(), but assumes
1900 * vcpu mutex is already taken.
1901 */
1902static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1903{
1904        struct vcpu_vmx *vmx = to_vmx(vcpu);
1905        u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
1906
1907        if (!vmm_exclusive)
1908                kvm_cpu_vmxon(phys_addr);
1909        else if (vmx->loaded_vmcs->cpu != cpu)
1910                loaded_vmcs_clear(vmx->loaded_vmcs);
1911
1912        if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
1913                per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
1914                vmcs_load(vmx->loaded_vmcs->vmcs);
1915        }
1916
1917        if (vmx->loaded_vmcs->cpu != cpu) {
1918                struct desc_ptr *gdt = this_cpu_ptr(&host_gdt);
1919                unsigned long sysenter_esp;
1920
1921                kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
1922                local_irq_disable();
1923                crash_disable_local_vmclear(cpu);
1924
1925                /*
1926                 * The read of loaded_vmcs->cpu must happen before fetching
1927                 * loaded_vmcs->loaded_vmcss_on_cpu_link.
1928                 * See the comments in __loaded_vmcs_clear().
1929                 */
1930                smp_rmb();
1931
1932                list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
1933                         &per_cpu(loaded_vmcss_on_cpu, cpu));
1934                crash_enable_local_vmclear(cpu);
1935                local_irq_enable();
1936
1937                /*
1938                 * Linux uses per-cpu TSS and GDT, so set these when switching
1939                 * processors.
1940                 */
1941                vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */
1942                vmcs_writel(HOST_GDTR_BASE, gdt->address);   /* 22.2.4 */
1943
1944                rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
1945                vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
1946                vmx->loaded_vmcs->cpu = cpu;
1947        }
1948}
1949
1950static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
1951{
1952        __vmx_load_host_state(to_vmx(vcpu));
1953        if (!vmm_exclusive) {
1954                __loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs);
1955                vcpu->cpu = -1;
1956                kvm_cpu_vmxoff();
1957        }
1958}
1959
1960static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
1961{
1962        ulong cr0;
1963
1964        if (vcpu->fpu_active)
1965                return;
1966        vcpu->fpu_active = 1;
1967        cr0 = vmcs_readl(GUEST_CR0);
1968        cr0 &= ~(X86_CR0_TS | X86_CR0_MP);
1969        cr0 |= kvm_read_cr0_bits(vcpu, X86_CR0_TS | X86_CR0_MP);
1970        vmcs_writel(GUEST_CR0, cr0);
1971        update_exception_bitmap(vcpu);
1972        vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
1973        if (is_guest_mode(vcpu))
1974                vcpu->arch.cr0_guest_owned_bits &=
1975                        ~get_vmcs12(vcpu)->cr0_guest_host_mask;
1976        vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
1977}
1978
1979static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
1980
1981/*
1982 * Return the cr0 value that a nested guest would read. This is a combination
1983 * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by
1984 * its hypervisor (cr0_read_shadow).
1985 */
1986static inline unsigned long nested_read_cr0(struct vmcs12 *fields)
1987{
1988        return (fields->guest_cr0 & ~fields->cr0_guest_host_mask) |
1989                (fields->cr0_read_shadow & fields->cr0_guest_host_mask);
1990}
1991static inline unsigned long nested_read_cr4(struct vmcs12 *fields)
1992{
1993        return (fields->guest_cr4 & ~fields->cr4_guest_host_mask) |
1994                (fields->cr4_read_shadow & fields->cr4_guest_host_mask);
1995}
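/*
 * Put differently: for every bit set in cr0_guest_host_mask (owned by the
 * hypervisor, L1), the value L2 reads comes from cr0_read_shadow; for every
 * clear bit it comes from the real guest_cr0.  A sketch for one bit, CR0.TS:
 *
 *     if (fields->cr0_guest_host_mask & X86_CR0_TS)
 *             ts = fields->cr0_read_shadow & X86_CR0_TS;
 *     else
 *             ts = fields->guest_cr0 & X86_CR0_TS;
 */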
1996
1997static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
1998{
1999        /* Note that there is no vcpu->fpu_active = 0 here. The caller must
2000         * set this *before* calling this function.
2001         */
2002        vmx_decache_cr0_guest_bits(vcpu);
2003        vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP);
2004        update_exception_bitmap(vcpu);
2005        vcpu->arch.cr0_guest_owned_bits = 0;
2006        vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
2007        if (is_guest_mode(vcpu)) {
2008                /*
2009                 * L1's specified read shadow might not contain the TS bit,
2010                 * so now that we turned on shadowing of this bit, we need to
2011                 * set this bit of the shadow. Like in nested_vmx_run we need
2012                 * nested_read_cr0(vmcs12), but vmcs12->guest_cr0 is not yet
2013                 * up-to-date here because we just decached cr0.TS (and we'll
2014                 * only update vmcs12->guest_cr0 on nested exit).
2015                 */
2016                struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
2017                vmcs12->guest_cr0 = (vmcs12->guest_cr0 & ~X86_CR0_TS) |
2018                        (vcpu->arch.cr0 & X86_CR0_TS);
2019                vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
2020        } else
2021                vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
2022}
2023
2024static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
2025{
2026        unsigned long rflags, save_rflags;
2027
2028        if (!test_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail)) {
2029                __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
2030                rflags = vmcs_readl(GUEST_RFLAGS);
2031                if (to_vmx(vcpu)->rmode.vm86_active) {
2032                        rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
2033                        save_rflags = to_vmx(vcpu)->rmode.save_rflags;
2034                        rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
2035                }
2036                to_vmx(vcpu)->rflags = rflags;
2037        }
2038        return to_vmx(vcpu)->rflags;
2039}
2040
2041static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
2042{
2043        __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
2044        to_vmx(vcpu)->rflags = rflags;
2045        if (to_vmx(vcpu)->rmode.vm86_active) {
2046                to_vmx(vcpu)->rmode.save_rflags = rflags;
2047                rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
2048        }
2049        vmcs_writel(GUEST_RFLAGS, rflags);
2050}
2051
2052static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
2053{
2054        u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
2055        int ret = 0;
2056
2057        if (interruptibility & GUEST_INTR_STATE_STI)
2058                ret |= KVM_X86_SHADOW_INT_STI;
2059        if (interruptibility & GUEST_INTR_STATE_MOV_SS)
2060                ret |= KVM_X86_SHADOW_INT_MOV_SS;
2061
2062        return ret;
2063}
2064
2065static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
2066{
2067        u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
2068        u32 interruptibility = interruptibility_old;
2069
2070        interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
2071
2072        if (mask & KVM_X86_SHADOW_INT_MOV_SS)
2073                interruptibility |= GUEST_INTR_STATE_MOV_SS;
2074        else if (mask & KVM_X86_SHADOW_INT_STI)
2075                interruptibility |= GUEST_INTR_STATE_STI;
2076
2077        if (interruptibility != interruptibility_old)
2078                vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
2079}
2080
2081static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
2082{
2083        unsigned long rip;
2084
2085        rip = kvm_rip_read(vcpu);
2086        rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
2087        kvm_rip_write(vcpu, rip);
2088
2089        /* skipping an emulated instruction also counts */
2090        vmx_set_interrupt_shadow(vcpu, 0);
2091}
2092
2093/*
2094 * KVM wants to inject into the guest the page faults it intercepted. This
2095 * function checks whether, for a nested guest, they should go to L1 or L2.
2096 */
2097static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned nr)
2098{
2099        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
2100
2101        if (!(vmcs12->exception_bitmap & (1u << nr)))
2102                return 0;
2103
2104        nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason,
2105                          vmcs_read32(VM_EXIT_INTR_INFO),
2106                          vmcs_readl(EXIT_QUALIFICATION));
2107        return 1;
2108}
2109
2110static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
2111                                bool has_error_code, u32 error_code,
2112                                bool reinject)
2113{
2114        struct vcpu_vmx *vmx = to_vmx(vcpu);
2115        u32 intr_info = nr | INTR_INFO_VALID_MASK;
2116
2117        if (!reinject && is_guest_mode(vcpu) &&
2118            nested_vmx_check_exception(vcpu, nr))
2119                return;
2120
2121        if (has_error_code) {
2122                vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
2123                intr_info |= INTR_INFO_DELIVER_CODE_MASK;
2124        }
2125
2126        if (vmx->rmode.vm86_active) {
2127                int inc_eip = 0;
2128                if (kvm_exception_is_soft(nr))
2129                        inc_eip = vcpu->arch.event_exit_inst_len;
2130                if (kvm_inject_realmode_interrupt(vcpu, nr, inc_eip) != EMULATE_DONE)
2131                        kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2132                return;
2133        }
2134
2135        if (kvm_exception_is_soft(nr)) {
2136                vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
2137                             vmx->vcpu.arch.event_exit_inst_len);
2138                intr_info |= INTR_TYPE_SOFT_EXCEPTION;
2139        } else
2140                intr_info |= INTR_TYPE_HARD_EXCEPTION;
2141
2142        vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
2143}
2144
2145static bool vmx_rdtscp_supported(void)
2146{
2147        return cpu_has_vmx_rdtscp();
2148}
2149
2150static bool vmx_invpcid_supported(void)
2151{
2152        return cpu_has_vmx_invpcid() && enable_ept;
2153}
2154
2155/*
2156 * Swap MSR entry in host/guest MSR entry array.
2157 */
2158static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
2159{
2160        struct shared_msr_entry tmp;
2161
2162        tmp = vmx->guest_msrs[to];
2163        vmx->guest_msrs[to] = vmx->guest_msrs[from];
2164        vmx->guest_msrs[from] = tmp;
2165}
2166
2167static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
2168{
2169        unsigned long *msr_bitmap;
2170
2171        if (is_guest_mode(vcpu))
2172                msr_bitmap = vmx_msr_bitmap_nested;
2173        else if (irqchip_in_kernel(vcpu->kvm) &&
2174                apic_x2apic_mode(vcpu->arch.apic)) {
2175                if (is_long_mode(vcpu))
2176                        msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
2177                else
2178                        msr_bitmap = vmx_msr_bitmap_legacy_x2apic;
2179        } else {
2180                if (is_long_mode(vcpu))
2181                        msr_bitmap = vmx_msr_bitmap_longmode;
2182                else
2183                        msr_bitmap = vmx_msr_bitmap_legacy;
2184        }
2185
2186        vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
2187}
2188
2189/*
2190 * Set up the vmcs to automatically save and restore system
2191 * msrs.  Don't touch the 64-bit msrs if the guest is in legacy
2192 * mode, as fiddling with msrs is very expensive.
2193 */
2194static void setup_msrs(struct vcpu_vmx *vmx)
2195{
2196        int save_nmsrs, index;
2197
2198        save_nmsrs = 0;
2199#ifdef CONFIG_X86_64
2200        if (is_long_mode(&vmx->vcpu)) {
2201                index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
2202                if (index >= 0)
2203                        move_msr_up(vmx, index, save_nmsrs++);
2204                index = __find_msr_index(vmx, MSR_LSTAR);
2205                if (index >= 0)
2206                        move_msr_up(vmx, index, save_nmsrs++);
2207                index = __find_msr_index(vmx, MSR_CSTAR);
2208                if (index >= 0)
2209                        move_msr_up(vmx, index, save_nmsrs++);
2210                index = __find_msr_index(vmx, MSR_TSC_AUX);
2211                if (index >= 0 && vmx->rdtscp_enabled)
2212                        move_msr_up(vmx, index, save_nmsrs++);
2213                /*
2214                 * MSR_STAR is only needed on long mode guests, and only
2215                 * if efer.sce is enabled.
2216                 */
2217                index = __find_msr_index(vmx, MSR_STAR);
2218                if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE))
2219                        move_msr_up(vmx, index, save_nmsrs++);
2220        }
2221#endif
2222        index = __find_msr_index(vmx, MSR_EFER);
2223        if (index >= 0 && update_transition_efer(vmx, index))
2224                move_msr_up(vmx, index, save_nmsrs++);
2225
2226        vmx->save_nmsrs = save_nmsrs;
2227
2228        if (cpu_has_vmx_msr_bitmap())
2229                vmx_set_msr_bitmap(&vmx->vcpu);
2230}
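/*
 * After this runs, guest_msrs[0..save_nmsrs-1] are the entries that
 * vmx_save_host_state() actually hands to kvm_set_shared_msr(); entries at
 * or past save_nmsrs remain in the array but are not switched.
 */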
2231
2232/*
2233 * reads and returns guest's timestamp counter "register"
2234 * guest_tsc = host_tsc + tsc_offset    -- 21.3
2235 */
2236static u64 guest_read_tsc(void)
2237{
2238        u64 host_tsc, tsc_offset;
2239
2240        rdtscll(host_tsc);
2241        tsc_offset = vmcs_read64(TSC_OFFSET);
2242        return host_tsc + tsc_offset;
2243}
2244
2245/*
2246 * Like guest_read_tsc, but always returns L1's notion of the timestamp
2247 * counter, even if a nested guest (L2) is currently running.
2248 */
2249static u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
2250{
2251        u64 tsc_offset;
2252
2253        tsc_offset = is_guest_mode(vcpu) ?
2254                to_vmx(vcpu)->nested.vmcs01_tsc_offset :
2255                vmcs_read64(TSC_OFFSET);
2256        return host_tsc + tsc_offset;
2257}
2258
2259/*
2260 * Engage any workarounds for mis-matched TSC rates.  Currently limited to
2261 * software catchup for faster rates on slower CPUs.
2262 */
2263static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
2264{
2265        if (!scale)
2266                return;
2267
2268        if (user_tsc_khz > tsc_khz) {
2269                vcpu->arch.tsc_catchup = 1;
2270                vcpu->arch.tsc_always_catchup = 1;
2271        } else
2272                WARN(1, "user requested TSC rate below hardware speed\n");
2273}
2274
2275static u64 vmx_read_tsc_offset(struct kvm_vcpu *vcpu)
2276{
2277        return vmcs_read64(TSC_OFFSET);
2278}
2279
2280/*
2281 * writes 'offset' into guest's timestamp counter offset register
2282 */
2283static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
2284{
2285        if (is_guest_mode(vcpu)) {
2286                /*
2287                 * We're here if L1 chose not to trap WRMSR to TSC. According
2288                 * to the spec, this should set L1's TSC; the offset that L1
2289                 * set for L2 remains unchanged, and still needs to be added
2290                 * to the newly set TSC to get L2's TSC.
2291                 */
2292                struct vmcs12 *vmcs12;
2293                to_vmx(vcpu)->nested.vmcs01_tsc_offset = offset;
2294                /* recalculate vmcs02.TSC_OFFSET: */
2295                vmcs12 = get_vmcs12(vcpu);
2296                vmcs_write64(TSC_OFFSET, offset +
2297                        (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING) ?
2298                         vmcs12->tsc_offset : 0));
2299        } else {
2300                trace_kvm_write_tsc_offset(vcpu->vcpu_id,
2301                                           vmcs_read64(TSC_OFFSET), offset);
2302                vmcs_write64(TSC_OFFSET, offset);
2303        }
2304}
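/*
 * The resulting relationship while L2 runs, assuming L1 enabled TSC
 * offsetting for L2, is:
 *
 *     vmcs02.TSC_OFFSET = vmcs01_tsc_offset + vmcs12->tsc_offset
 *     L2's TSC          = host TSC + vmcs02.TSC_OFFSET
 */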
2305
2306static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host)
2307{
2308        u64 offset = vmcs_read64(TSC_OFFSET);
2309
2310        vmcs_write64(TSC_OFFSET, offset + adjustment);
2311        if (is_guest_mode(vcpu)) {
2312                /* Even when running L2, the adjustment needs to apply to L1 */
2313                to_vmx(vcpu)->nested.vmcs01_tsc_offset += adjustment;
2314        } else
2315                trace_kvm_write_tsc_offset(vcpu->vcpu_id, offset,
2316                                           offset + adjustment);
2317}
2318
2319static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
2320{
2321        return target_tsc - native_read_tsc();
2322}
2323
2324static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu)
2325{
2326        struct kvm_cpuid_entry2 *best = kvm_find_cpuid_entry(vcpu, 1, 0);
2327        return best && (best->ecx & (1 << (X86_FEATURE_VMX & 31)));
2328}
2329
2330/*
2331 * nested_vmx_allowed() checks whether a guest should be allowed to use VMX
2332 * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
2333 * all guests if the "nested" module option is off, and can also be disabled
2334 * for a single guest by disabling its VMX cpuid bit.
2335 */
2336static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
2337{
2338        return nested && guest_cpuid_has_vmx(vcpu);
2339}
2340
2341/*
2342 * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
2343 * returned for the various VMX controls MSRs when nested VMX is enabled.
2344 * The same values should also be used to verify that vmcs12 control fields are
2345 * valid during nested entry from L1 to L2.
2346 * Each of these control msrs has a low and high 32-bit half: A low bit is on
2347 * if the corresponding bit in the (32-bit) control field *must* be on, and a
2348 * bit in the high half is on if the corresponding bit in the control field
2349 * may be on. See also vmx_control_verify().
2350 */
2351static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
2352{
2353        /*
2354         * Note that as a general rule, the high half of the MSRs (bits in
2355         * the control fields which may be 1) should be initialized by the
2356         * intersection of the underlying hardware's MSR (i.e., features which
2357         * can be supported) and the list of features we want to expose -
2358         * because they are known to be properly supported in our code.
2359         * Also, usually, the low half of the MSRs (bits which must be 1) can
2360         * be set to 0, meaning that L1 may turn off any of these bits. The
2361         * reason is that if one of these bits is necessary, it will appear
2362         * in vmcs01; prepare_vmcs02, which bitwise-or's the control fields of
2363         * vmcs01 and vmcs12, then keeps it set in vmcs02 even if L1 cleared
2364         * it - and nested_vmx_exit_handled() will not pass such exits to L1.
2365         * These rules have exceptions below.
2366         */
2367
2368        /* pin-based controls */
2369        rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
2370                vmx->nested.nested_vmx_pinbased_ctls_low,
2371                vmx->nested.nested_vmx_pinbased_ctls_high);
2372        vmx->nested.nested_vmx_pinbased_ctls_low |=
2373                PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
2374        vmx->nested.nested_vmx_pinbased_ctls_high &=
2375                PIN_BASED_EXT_INTR_MASK |
2376                PIN_BASED_NMI_EXITING |
2377                PIN_BASED_VIRTUAL_NMIS;
2378        vmx->nested.nested_vmx_pinbased_ctls_high |=
2379                PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
2380                PIN_BASED_VMX_PREEMPTION_TIMER;
2381        if (vmx_vm_has_apicv(vmx->vcpu.kvm))
2382                vmx->nested.nested_vmx_pinbased_ctls_high |=
2383                        PIN_BASED_POSTED_INTR;
2384
2385        /* exit controls */
2386        rdmsr(MSR_IA32_VMX_EXIT_CTLS,
2387                vmx->nested.nested_vmx_exit_ctls_low,
2388                vmx->nested.nested_vmx_exit_ctls_high);
2389        vmx->nested.nested_vmx_exit_ctls_low =
2390                VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
2391
2392        vmx->nested.nested_vmx_exit_ctls_high &=
2393#ifdef CONFIG_X86_64
2394                VM_EXIT_HOST_ADDR_SPACE_SIZE |
2395#endif
2396                VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
2397        vmx->nested.nested_vmx_exit_ctls_high |=
2398                VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
2399                VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
2400                VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
2401
2402        if (vmx_mpx_supported())
2403                vmx->nested.nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
2404
2405        /* We support free control of debug control saving. */
2406        vmx->nested.nested_vmx_true_exit_ctls_low =
2407                vmx->nested.nested_vmx_exit_ctls_low &
2408                ~VM_EXIT_SAVE_DEBUG_CONTROLS;
2409
2410        /* entry controls */
2411        rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
2412                vmx->nested.nested_vmx_entry_ctls_low,
2413                vmx->nested.nested_vmx_entry_ctls_high);
2414        vmx->nested.nested_vmx_entry_ctls_low =
2415                VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
2416        vmx->nested.nested_vmx_entry_ctls_high &=
2417#ifdef CONFIG_X86_64
2418                VM_ENTRY_IA32E_MODE |
2419#endif
2420                VM_ENTRY_LOAD_IA32_PAT;
2421        vmx->nested.nested_vmx_entry_ctls_high |=
2422                (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
2423        if (vmx_mpx_supported())
2424                vmx->nested.nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
2425
2426        /* We support free control of debug control loading. */
2427        vmx->nested.nested_vmx_true_entry_ctls_low =
2428                vmx->nested.nested_vmx_entry_ctls_low &
2429                ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
2430
2431        /* cpu-based controls */
2432        rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
2433                vmx->nested.nested_vmx_procbased_ctls_low,
2434                vmx->nested.nested_vmx_procbased_ctls_high);
2435        vmx->nested.nested_vmx_procbased_ctls_low =
2436                CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
2437        vmx->nested.nested_vmx_procbased_ctls_high &=
2438                CPU_BASED_VIRTUAL_INTR_PENDING |
2439                CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING |
2440                CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
2441                CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
2442                CPU_BASED_CR3_STORE_EXITING |
2443#ifdef CONFIG_X86_64
2444                CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
2445#endif
2446                CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
2447                CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING |
2448                CPU_BASED_RDPMC_EXITING | CPU_BASED_RDTSC_EXITING |
2449                CPU_BASED_PAUSE_EXITING | CPU_BASED_TPR_SHADOW |
2450                CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
2451        /*
2452         * We can allow some features even when not supported by the
2453         * hardware. For example, L1 can specify an MSR bitmap - and we
2454         * can use it to avoid exits to L1 - even when L0 runs L2
2455         * without MSR bitmaps.
2456         */
2457        vmx->nested.nested_vmx_procbased_ctls_high |=
2458                CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
2459                CPU_BASED_USE_MSR_BITMAPS;
2460
2461        /* We support free control of CR3 access interception. */
2462        vmx->nested.nested_vmx_true_procbased_ctls_low =
2463                vmx->nested.nested_vmx_procbased_ctls_low &
2464                ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
2465
2466        /* secondary cpu-based controls */
2467        rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
2468                vmx->nested.nested_vmx_secondary_ctls_low,
2469                vmx->nested.nested_vmx_secondary_ctls_high);
2470        vmx->nested.nested_vmx_secondary_ctls_low = 0;
2471        vmx->nested.nested_vmx_secondary_ctls_high &=
2472                SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
2473                SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
2474                SECONDARY_EXEC_APIC_REGISTER_VIRT |
2475                SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
2476                SECONDARY_EXEC_WBINVD_EXITING |
2477                SECONDARY_EXEC_XSAVES;
2478
2479        if (enable_ept) {
2480                /* nested EPT: emulate EPT also to L1 */
2481                vmx->nested.nested_vmx_secondary_ctls_high |=
2482                        SECONDARY_EXEC_ENABLE_EPT;
2483                vmx->nested.nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
2484                         VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT |
2485                         VMX_EPT_INVEPT_BIT;
2486                vmx->nested.nested_vmx_ept_caps &= vmx_capability.ept;
2487                /*
2488                 * For nested guests, we don't do anything specific
2489                 * for single context invalidation. Hence, only advertise
2490                 * support for global context invalidation.
2491                 */
2492                vmx->nested.nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT;
2493        } else
2494                vmx->nested.nested_vmx_ept_caps = 0;
2495
2496        if (enable_unrestricted_guest)
2497                vmx->nested.nested_vmx_secondary_ctls_high |=
2498                        SECONDARY_EXEC_UNRESTRICTED_GUEST;
2499
2500        /* miscellaneous data */
2501        rdmsr(MSR_IA32_VMX_MISC,
2502                vmx->nested.nested_vmx_misc_low,
2503                vmx->nested.nested_vmx_misc_high);
2504        vmx->nested.nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA;
2505        vmx->nested.nested_vmx_misc_low |=
2506                VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
2507                VMX_MISC_ACTIVITY_HLT;
2508        vmx->nested.nested_vmx_misc_high = 0;
2509}
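/*
 * The nested_vmx_*_ctls_{low,high} pairs computed above are what
 * vmx_get_vmx_msr() below reports to L1 when it reads the corresponding
 * MSR_IA32_VMX_* capability MSRs, packed by vmx_control_msr(low, high)
 * into a single 64-bit value.
 */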
2510
2511static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
2512{
2513        /*
2514         * Bits that are 0 in 'high' must be 0, and bits that are 1 in 'low' must be 1.
2515         */
2516        return ((control & high) | low) == control;
2517}
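/*
 * A worked example with illustrative numbers: for low = 0x1 (bit 0
 * required) and high = 0x7 (bits 0-2 allowed),
 *
 *     vmx_control_verify(0x5, 0x1, 0x7) -> true   (bit 0 set, bit 2 allowed)
 *     vmx_control_verify(0x6, 0x1, 0x7) -> false  (required bit 0 is clear)
 *     vmx_control_verify(0x9, 0x1, 0x7) -> false  (bit 3 is not allowed)
 */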
2518
2519static inline u64 vmx_control_msr(u32 low, u32 high)
2520{
2521        return low | ((u64)high << 32);
2522}
2523
2524/* Returns 0 on success, non-0 otherwise. */
2525static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
2526{
2527        struct vcpu_vmx *vmx = to_vmx(vcpu);
2528
2529        switch (msr_index) {
2530        case MSR_IA32_VMX_BASIC:
2531                /*
2532                 * This MSR reports some information about VMX support. We
2533                 * should return information about the VMX we emulate for the
2534                 * guest, and the VMCS structure we give it - not about the
2535                 * VMX support of the underlying hardware.
2536                 */
2537                *pdata = VMCS12_REVISION | VMX_BASIC_TRUE_CTLS |
2538                           ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
2539                           (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
2540                break;
2541        case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
2542        case MSR_IA32_VMX_PINBASED_CTLS:
2543                *pdata = vmx_control_msr(
2544                        vmx->nested.nested_vmx_pinbased_ctls_low,
2545                        vmx->nested.nested_vmx_pinbased_ctls_high);
2546                break;
2547        case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
2548                *pdata = vmx_control_msr(
2549                        vmx->nested.nested_vmx_true_procbased_ctls_low,
2550                        vmx->nested.nested_vmx_procbased_ctls_high);
2551                break;
2552        case MSR_IA32_VMX_PROCBASED_CTLS:
2553                *pdata = vmx_control_msr(
2554                        vmx->nested.nested_vmx_procbased_ctls_low,
2555                        vmx->nested.nested_vmx_procbased_ctls_high);
2556                break;
2557        case MSR_IA32_VMX_TRUE_EXIT_CTLS:
2558                *pdata = vmx_control_msr(
2559                        vmx->nested.nested_vmx_true_exit_ctls_low,
2560                        vmx->nested.nested_vmx_exit_ctls_high);
2561                break;
2562        case MSR_IA32_VMX_EXIT_CTLS:
2563                *pdata = vmx_control_msr(
2564                        vmx->nested.nested_vmx_exit_ctls_low,
2565                        vmx->nested.nested_vmx_exit_ctls_high);
2566                break;
2567        case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
2568                *pdata = vmx_control_msr(
2569                        vmx->nested.nested_vmx_true_entry_ctls_low,
2570                        vmx->nested.nested_vmx_entry_ctls_high);
2571                break;
2572        case MSR_IA32_VMX_ENTRY_CTLS:
2573                *pdata = vmx_control_msr(
2574                        vmx->nested.nested_vmx_entry_ctls_low,
2575                        vmx->nested.nested_vmx_entry_ctls_high);
2576                break;
2577        case MSR_IA32_VMX_MISC:
2578                *pdata = vmx_control_msr(
2579                        vmx->nested.nested_vmx_misc_low,
2580                        vmx->nested.nested_vmx_misc_high);
2581                break;
2582        /*
2583         * These MSRs specify bits which the guest must keep fixed (on or off)
2584         * while L1 is in VMXON mode (in L1's root mode, or running an L2).
2585         * We picked the standard core2 setting.
2586         */
2587#define VMXON_CR0_ALWAYSON      (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
2588#define VMXON_CR4_ALWAYSON      X86_CR4_VMXE
2589        case MSR_IA32_VMX_CR0_FIXED0:
2590                *pdata = VMXON_CR0_ALWAYSON;
2591                break;
2592        case MSR_IA32_VMX_CR0_FIXED1:
2593                *pdata = -1ULL;
2594                break;
2595        case MSR_IA32_VMX_CR4_FIXED0:
2596                *pdata = VMXON_CR4_ALWAYSON;
2597                break;
2598        case MSR_IA32_VMX_CR4_FIXED1:
2599                *pdata = -1ULL;
2600                break;
2601        case MSR_IA32_VMX_VMCS_ENUM:
2602                *pdata = 0x2e; /* highest index: VMX_PREEMPTION_TIMER_VALUE */
2603                break;
2604        case MSR_IA32_VMX_PROCBASED_CTLS2:
2605                *pdata = vmx_control_msr(
2606                        vmx->nested.nested_vmx_secondary_ctls_low,
2607                        vmx->nested.nested_vmx_secondary_ctls_high);
2608                break;
2609        case MSR_IA32_VMX_EPT_VPID_CAP:
2610                /* Currently, no nested vpid support */
2611                *pdata = vmx->nested.nested_vmx_ept_caps;
2612                break;
2613        default:
2614                return 1;
2615        }
2616
2617        return 0;
2618}
2619
2620/*
2621 * Reads an msr value (of 'msr_index') into 'pdata'.
2622 * Returns 0 on success, non-0 otherwise.
2623 * Assumes vcpu_load() was already called.
2624 */
2625static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
2626{
2627        u64 data;
2628        struct shared_msr_entry *msr;
2629
2630        if (!pdata) {
2631                printk(KERN_ERR "BUG: get_msr called with NULL pdata\n");
2632                return -EINVAL;
2633        }
2634
2635        switch (msr_index) {
2636#ifdef CONFIG_X86_64
2637        case MSR_FS_BASE:
2638                data = vmcs_readl(GUEST_FS_BASE);
2639                break;
2640        case MSR_GS_BASE:
2641                data = vmcs_readl(GUEST_GS_BASE);
2642                break;
2643        case MSR_KERNEL_GS_BASE:
2644                vmx_load_host_state(to_vmx(vcpu));
2645                data = to_vmx(vcpu)->msr_guest_kernel_gs_base;
2646                break;
2647#endif
2648        case MSR_EFER:
2649                return kvm_get_msr_common(vcpu, msr_index, pdata);
2650        case MSR_IA32_TSC:
2651                data = guest_read_tsc();
2652                break;
2653        case MSR_IA32_SYSENTER_CS:
2654                data = vmcs_read32(GUEST_SYSENTER_CS);
2655                break;
2656        case MSR_IA32_SYSENTER_EIP:
2657                data = vmcs_readl(GUEST_SYSENTER_EIP);
2658                break;
2659        case MSR_IA32_SYSENTER_ESP:
2660                data = vmcs_readl(GUEST_SYSENTER_ESP);
2661                break;
2662        case MSR_IA32_BNDCFGS:
2663                if (!vmx_mpx_supported())
2664                        return 1;
2665                data = vmcs_read64(GUEST_BNDCFGS);
2666                break;
2667        case MSR_IA32_FEATURE_CONTROL:
2668                if (!nested_vmx_allowed(vcpu))
2669                        return 1;
2670                data = to_vmx(vcpu)->nested.msr_ia32_feature_control;
2671                break;
2672        case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
2673                if (!nested_vmx_allowed(vcpu))
2674                        return 1;
2675                return vmx_get_vmx_msr(vcpu, msr_index, pdata);
2676        case MSR_IA32_XSS:
2677                if (!vmx_xsaves_supported())
2678                        return 1;
2679                data = vcpu->arch.ia32_xss;
2680                break;
2681        case MSR_TSC_AUX:
2682                if (!to_vmx(vcpu)->rdtscp_enabled)
2683                        return 1;
2684                /* Otherwise falls through */
2685        default:
2686                msr = find_msr_entry(to_vmx(vcpu), msr_index);
2687                if (msr) {
2688                        data = msr->data;
2689                        break;
2690                }
2691                return kvm_get_msr_common(vcpu, msr_index, pdata);
2692        }
2693
2694        *pdata = data;
2695        return 0;
2696}
2697
2698static void vmx_leave_nested(struct kvm_vcpu *vcpu);
2699
2700/*
2701 * Writes msr value into the appropriate "register".
2702 * Returns 0 on success, non-0 otherwise.
2703 * Assumes vcpu_load() was already called.
2704 */
2705static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2706{
2707        struct vcpu_vmx *vmx = to_vmx(vcpu);
2708        struct shared_msr_entry *msr;
2709        int ret = 0;
2710        u32 msr_index = msr_info->index;
2711        u64 data = msr_info->data;
2712
2713        switch (msr_index) {
2714        case MSR_EFER:
2715                ret = kvm_set_msr_common(vcpu, msr_info);
2716                break;
2717#ifdef CONFIG_X86_64
2718        case MSR_FS_BASE:
2719                vmx_segment_cache_clear(vmx);
2720                vmcs_writel(GUEST_FS_BASE, data);
2721                break;
2722        case MSR_GS_BASE:
2723                vmx_segment_cache_clear(vmx);
2724                vmcs_writel(GUEST_GS_BASE, data);
2725                break;
2726        case MSR_KERNEL_GS_BASE:
2727                vmx_load_host_state(vmx);
2728                vmx->msr_guest_kernel_gs_base = data;
2729                break;
2730#endif
2731        case MSR_IA32_SYSENTER_CS:
2732                vmcs_write32(GUEST_SYSENTER_CS, data);
2733                break;
2734        case MSR_IA32_SYSENTER_EIP:
2735                vmcs_writel(GUEST_SYSENTER_EIP, data);
2736                break;
2737        case MSR_IA32_SYSENTER_ESP:
2738                vmcs_writel(GUEST_SYSENTER_ESP, data);
2739                break;
2740        case MSR_IA32_BNDCFGS:
2741                if (!vmx_mpx_supported())
2742                        return 1;
2743                vmcs_write64(GUEST_BNDCFGS, data);
2744                break;
2745        case MSR_IA32_TSC:
2746                kvm_write_tsc(vcpu, msr_info);
2747                break;
2748        case MSR_IA32_CR_PAT:
2749                if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
2750                        if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
2751                                return 1;
2752                        vmcs_write64(GUEST_IA32_PAT, data);
2753                        vcpu->arch.pat = data;
2754                        break;
2755                }
2756                ret = kvm_set_msr_common(vcpu, msr_info);
2757                break;
2758        case MSR_IA32_TSC_ADJUST:
2759                ret = kvm_set_msr_common(vcpu, msr_info);
2760                break;
2761        case MSR_IA32_FEATURE_CONTROL:
2762                if (!nested_vmx_allowed(vcpu) ||
2763                    (to_vmx(vcpu)->nested.msr_ia32_feature_control &
2764                     FEATURE_CONTROL_LOCKED && !msr_info->host_initiated))
2765                        return 1;
2766                vmx->nested.msr_ia32_feature_control = data;
2767                if (msr_info->host_initiated && data == 0)
2768                        vmx_leave_nested(vcpu);
2769                break;
2770        case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
2771                return 1; /* they are read-only */
2772        case MSR_IA32_XSS:
2773                if (!vmx_xsaves_supported())
2774                        return 1;
2775                /*
2776                 * The only supported bit as of Skylake is bit 8, but
2777                 * it is not supported in KVM.
2778                 */
2779                if (data != 0)
2780                        return 1;
2781                vcpu->arch.ia32_xss = data;
2782                if (vcpu->arch.ia32_xss != host_xss)
2783                        add_atomic_switch_msr(vmx, MSR_IA32_XSS,
2784                                vcpu->arch.ia32_xss, host_xss);
2785                else
2786                        clear_atomic_switch_msr(vmx, MSR_IA32_XSS);
2787                break;
2788        case MSR_TSC_AUX:
2789                if (!vmx->rdtscp_enabled)
2790                        return 1;
2791                /* Check reserved bit, higher 32 bits should be zero */
2792                if ((data >> 32) != 0)
2793                        return 1;
2794                /* Otherwise falls through */
2795        default:
2796                msr = find_msr_entry(vmx, msr_index);
2797                if (msr) {
2798                        u64 old_msr_data = msr->data;
2799                        msr->data = data;
2800                        if (msr - vmx->guest_msrs < vmx->save_nmsrs) {
2801                                preempt_disable();
2802                                ret = kvm_set_shared_msr(msr->index, msr->data,
2803                                                         msr->mask);
2804                                preempt_enable();
2805                                if (ret)
2806                                        msr->data = old_msr_data;
2807                        }
2808                        break;
2809                }
2810                ret = kvm_set_msr_common(vcpu, msr_info);
2811        }
2812
2813        return ret;
2814}
2815
2816static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
2817{
2818        __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
2819        switch (reg) {
2820        case VCPU_REGS_RSP:
2821                vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
2822                break;
2823        case VCPU_REGS_RIP:
2824                vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
2825                break;
2826        case VCPU_EXREG_PDPTR:
2827                if (enable_ept)
2828                        ept_save_pdptrs(vcpu);
2829                break;
2830        default:
2831                break;
2832        }
2833}
2834
2835static __init int cpu_has_kvm_support(void)
2836{
2837        return cpu_has_vmx();
2838}
2839
2840static __init int vmx_disabled_by_bios(void)
2841{
2842        u64 msr;
2843
2844        rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
2845        if (msr & FEATURE_CONTROL_LOCKED) {
2846                /* launched w/ TXT and VMX disabled */
2847                if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
2848                        && tboot_enabled())
2849                        return 1;
2850                /* launched w/o TXT and VMX only enabled w/ TXT */
2851                if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
2852                        && (msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
2853                        && !tboot_enabled()) {
2854                        printk(KERN_WARNING "kvm: disable TXT in the BIOS or "
2855                                "activate TXT before enabling KVM\n");
2856                        return 1;
2857                }
2858                /* launched w/o TXT and VMX disabled */
2859                if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
2860                        && !tboot_enabled())
2861                        return 1;
2862        }
2863
2864        return 0;
2865}
2866
2867static void kvm_cpu_vmxon(u64 addr)
2868{
2869        asm volatile (ASM_VMX_VMXON_RAX
2870                        : : "a"(&addr), "m"(addr)
2871                        : "memory", "cc");
2872}
2873
2874static int hardware_enable(void)
2875{
2876        int cpu = raw_smp_processor_id();
2877        u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
2878        u64 old, test_bits;
2879
2880        if (cr4_read_shadow() & X86_CR4_VMXE)
2881                return -EBUSY;
2882
2883        INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
2884
2885        /*
2886         * Now we can enable the vmclear operation in kdump
2887         * since the loaded_vmcss_on_cpu list on this cpu
2888         * has been initialized.
2889         *
2890         * Though the cpu is not in VMX operation now, it is
2891         * safe to enable the vmclear operation here because
2892         * the loaded_vmcss_on_cpu list is empty!
2893         */
2894        crash_enable_local_vmclear(cpu);
2895
2896        rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
2897
2898        test_bits = FEATURE_CONTROL_LOCKED;
2899        test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
2900        if (tboot_enabled())
2901                test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX;
2902
2903        if ((old & test_bits) != test_bits) {
2904                /* enable and lock */
2905                wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
2906        }
2907        cr4_set_bits(X86_CR4_VMXE);
2908
2909        if (vmm_exclusive) {
2910                kvm_cpu_vmxon(phys_addr);
2911                ept_sync_global();
2912        }
2913
2914        native_store_gdt(this_cpu_ptr(&host_gdt));
2915
2916        return 0;
2917}
2918
2919static void vmclear_local_loaded_vmcss(void)
2920{
2921        int cpu = raw_smp_processor_id();
2922        struct loaded_vmcs *v, *n;
2923
2924        list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
2925                                 loaded_vmcss_on_cpu_link)
2926                __loaded_vmcs_clear(v);
2927}
2928
2929
2930/* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
2931 * tricks.
2932 */
2933static void kvm_cpu_vmxoff(void)
2934{
2935        asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
2936}
2937
2938static void hardware_disable(void)
2939{
2940        if (vmm_exclusive) {
2941                vmclear_local_loaded_vmcss();
2942                kvm_cpu_vmxoff();
2943        }
2944        cr4_clear_bits(X86_CR4_VMXE);
2945}
2946
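    /*
     * Combine the required (ctl_min) and optional (ctl_opt) controls with
     * what the capability MSR allows: bits clear in the MSR's high word
     * must be 0, bits set in its low word must be 1. Fails with -EIO if a
     * required control is not supported by the hardware.
     */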
2947static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
2948                                      u32 msr, u32 *result)
2949{
2950        u32 vmx_msr_low, vmx_msr_high;
2951        u32 ctl = ctl_min | ctl_opt;
2952
2953        rdmsr(msr, vmx_msr_low, vmx_msr_high);
2954
2955        ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
2956        ctl |= vmx_msr_low;  /* bit == 1 in low word  ==> must be one  */
2957
2958        /* Ensure minimum (required) set of control bits are supported. */
2959        if (ctl_min & ~ctl)
2960                return -EIO;
2961
2962        *result = ctl;
2963        return 0;
2964}
2965
2966static __init bool allow_1_setting(u32 msr, u32 ctl)
2967{
2968        u32 vmx_msr_low, vmx_msr_high;
2969
2970        rdmsr(msr, vmx_msr_low, vmx_msr_high);
2971        return vmx_msr_high & ctl;
2972}
2973
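    /*
     * Probe the VMX capability MSRs and compute the VMCS configuration:
     * pin-based, primary and secondary processor-based, VM-exit and
     * VM-entry controls, plus the VMCS size, allocation order and
     * revision id.
     */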
2974static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
2975{
2976        u32 vmx_msr_low, vmx_msr_high;
2977        u32 min, opt, min2, opt2;
2978        u32 _pin_based_exec_control = 0;
2979        u32 _cpu_based_exec_control = 0;
2980        u32 _cpu_based_2nd_exec_control = 0;
2981        u32 _vmexit_control = 0;
2982        u32 _vmentry_control = 0;
2983
2984        min = CPU_BASED_HLT_EXITING |
2985#ifdef CONFIG_X86_64
2986              CPU_BASED_CR8_LOAD_EXITING |
2987              CPU_BASED_CR8_STORE_EXITING |
2988#endif
2989              CPU_BASED_CR3_LOAD_EXITING |
2990              CPU_BASED_CR3_STORE_EXITING |
2991              CPU_BASED_USE_IO_BITMAPS |
2992              CPU_BASED_MOV_DR_EXITING |
2993              CPU_BASED_USE_TSC_OFFSETING |
2994              CPU_BASED_MWAIT_EXITING |
2995              CPU_BASED_MONITOR_EXITING |
2996              CPU_BASED_INVLPG_EXITING |
2997              CPU_BASED_RDPMC_EXITING;
2998
2999        opt = CPU_BASED_TPR_SHADOW |
3000              CPU_BASED_USE_MSR_BITMAPS |
3001              CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
3002        if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
3003                                &_cpu_based_exec_control) < 0)
3004                return -EIO;
3005#ifdef CONFIG_X86_64
3006        if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
3007                _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
3008                                           ~CPU_BASED_CR8_STORE_EXITING;
3009#endif
3010        if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
3011                min2 = 0;
3012                opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
3013                        SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
3014                        SECONDARY_EXEC_WBINVD_EXITING |
3015                        SECONDARY_EXEC_ENABLE_VPID |
3016                        SECONDARY_EXEC_ENABLE_EPT |
3017                        SECONDARY_EXEC_UNRESTRICTED_GUEST |
3018                        SECONDARY_EXEC_PAUSE_LOOP_EXITING |
3019                        SECONDARY_EXEC_RDTSCP |
3020                        SECONDARY_EXEC_ENABLE_INVPCID |
3021                        SECONDARY_EXEC_APIC_REGISTER_VIRT |
3022                        SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
3023                        SECONDARY_EXEC_SHADOW_VMCS |
3024                        SECONDARY_EXEC_XSAVES |
3025                        SECONDARY_EXEC_ENABLE_PML;
3026                if (adjust_vmx_controls(min2, opt2,
3027                                        MSR_IA32_VMX_PROCBASED_CTLS2,
3028                                        &_cpu_based_2nd_exec_control) < 0)
3029                        return -EIO;
3030        }
3031#ifndef CONFIG_X86_64
3032        if (!(_cpu_based_2nd_exec_control &
3033                                SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
3034                _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
3035#endif
3036
3037        if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
3038                _cpu_based_2nd_exec_control &= ~(
3039                                SECONDARY_EXEC_APIC_REGISTER_VIRT |
3040                                SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
3041                                SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
3042
3043        if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
3044                /* CR3 accesses and invlpg don't need to cause VM exits
3045                   when EPT is enabled */
3046                _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
3047                                             CPU_BASED_CR3_STORE_EXITING |
3048                                             CPU_BASED_INVLPG_EXITING);
3049                rdmsr(MSR_IA32_VMX_EPT_VPID_CAP,
3050                      vmx_capability.ept, vmx_capability.vpid);
3051        }
3052
3053        min = VM_EXIT_SAVE_DEBUG_CONTROLS;
3054#ifdef CONFIG_X86_64
3055        min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
3056#endif
3057        opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT |
3058                VM_EXIT_ACK_INTR_ON_EXIT | VM_EXIT_CLEAR_BNDCFGS;
3059        if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
3060                                &_vmexit_control) < 0)
3061                return -EIO;
3062
3063        min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
3064        opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR;
3065        if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
3066                                &_pin_based_exec_control) < 0)
3067                return -EIO;
3068
3069        if (!(_cpu_based_2nd_exec_control &
3070                SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) ||
3071                !(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT))
3072                _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
3073
3074        min = VM_ENTRY_LOAD_DEBUG_CONTROLS;
3075        opt = VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS;
3076        if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
3077                                &_vmentry_control) < 0)
3078                return -EIO;
3079
3080        rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
3081
3082        /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
3083        if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
3084                return -EIO;
3085
3086#ifdef CONFIG_X86_64
3087        /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
3088        if (vmx_msr_high & (1u<<16))
3089                return -EIO;
3090#endif
3091
3092        /* Require Write-Back (WB) memory type for VMCS accesses. */
3093        if (((vmx_msr_high >> 18) & 15) != 6)
3094                return -EIO;
3095
3096        vmcs_conf->size = vmx_msr_high & 0x1fff;
3097        vmcs_conf->order = get_order(vmcs_conf->size);
3098        vmcs_conf->revision_id = vmx_msr_low;
3099
3100        vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
3101        vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
3102        vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
3103        vmcs_conf->vmexit_ctrl         = _vmexit_control;
3104        vmcs_conf->vmentry_ctrl        = _vmentry_control;
3105
3106        cpu_has_load_ia32_efer =
3107                allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
3108                                VM_ENTRY_LOAD_IA32_EFER)
3109                && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
3110                                   VM_EXIT_LOAD_IA32_EFER);
3111
3112        cpu_has_load_perf_global_ctrl =
3113                allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
3114                                VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
3115                && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
3116                                   VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
3117
3118        /*
3119         * Some cpus support VM_{ENTRY,EXIT}_LOAD_IA32_PERF_GLOBAL_CTRL,
3120         * but due to the errata below it can't be used. The workaround is
3121         * to use the MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL.
3122         *
3123         * VM Exit May Incorrectly Clear IA32_PERF_GLOBAL_CTRL [34:32]
3124         *
3125         * AAK155             (model 26)
3126         * AAP115             (model 30)
3127         * AAT100             (model 37)
3128         * BC86,AAY89,BD102   (model 44)
3129         * BA97               (model 46)
3130         *
3131         */
3132        if (cpu_has_load_perf_global_ctrl && boot_cpu_data.x86 == 0x6) {
3133                switch (boot_cpu_data.x86_model) {
3134                case 26:
3135                case 30:
3136                case 37:
3137                case 44:
3138                case 46:
3139                        cpu_has_load_perf_global_ctrl = false;
3140                        printk_once(KERN_WARNING "kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
3141                                        "does not work properly. Using workaround\n");
3142                        break;
3143                default:
3144                        break;
3145                }
3146        }
3147
3148        if (cpu_has_xsaves)
3149                rdmsrl(MSR_IA32_XSS, host_xss);
3150
3151        return 0;
3152}
3153
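    /*
     * Allocate a zeroed VMCS region from @cpu's NUMA node and stamp it
     * with the hardware revision id from vmcs_config.
     */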
3154static struct vmcs *alloc_vmcs_cpu(int cpu)
3155{
3156        int node = cpu_to_node(cpu);
3157        struct page *pages;
3158        struct vmcs *vmcs;
3159
3160        pages = alloc_pages_exact_node(node, GFP_KERNEL, vmcs_config.order);
3161        if (!pages)
3162                return NULL;
3163        vmcs = page_address(pages);
3164        memset(vmcs, 0, vmcs_config.size);
3165        vmcs->revision_id = vmcs_config.revision_id; /* vmcs revision id */
3166        return vmcs;
3167}
3168
3169static struct vmcs *alloc_vmcs(void)
3170{
3171        return alloc_vmcs_cpu(raw_smp_processor_id());
3172}
3173
3174static void free_vmcs(struct vmcs *vmcs)
3175{
3176        free_pages((unsigned long)vmcs, vmcs_config.order);
3177}
3178
3179/*
3180 * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded
3181 */
3182static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
3183{
3184        if (!loaded_vmcs->vmcs)
3185                return;
3186        loaded_vmcs_clear(loaded_vmcs);
3187        free_vmcs(loaded_vmcs->vmcs);
3188        loaded_vmcs->vmcs = NULL;
3189}
3190
3191static void free_kvm_area(void)
3192{
3193        int cpu;
3194
3195        for_each_possible_cpu(cpu) {
3196                free_vmcs(per_cpu(vmxarea, cpu));
3197                per_cpu(vmxarea, cpu) = NULL;
3198        }
3199}
3200
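    /*
     * Drop shadowed read/write fields that this CPU does not support
     * (e.g. GUEST_BNDCFGS without MPX) and clear the shadowed fields'
     * bits in the vmread/vmwrite bitmaps (read-only fields in the vmread
     * bitmap only) so that L1 accesses to them do not cause a vmexit.
     */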
3201static void init_vmcs_shadow_fields(void)
3202{
3203        int i, j;
3204
3205        /* No checks for read only fields yet */
3206
3207        for (i = j = 0; i < max_shadow_read_write_fields; i++) {
3208                switch (shadow_read_write_fields[i]) {
3209                case GUEST_BNDCFGS:
3210                        if (!vmx_mpx_supported())
3211                                continue;
3212                        break;
3213                default:
3214                        break;
3215                }
3216
3217                if (j < i)
3218                        shadow_read_write_fields[j] =
3219                                shadow_read_write_fields[i];
3220                j++;
3221        }
3222        max_shadow_read_write_fields = j;
3223
3224        /* Shadowed fields are accessed by the guest without a vmexit */
3225        for (i = 0; i < max_shadow_read_write_fields; i++) {
3226                clear_bit(shadow_read_write_fields[i],
3227                          vmx_vmwrite_bitmap);
3228                clear_bit(shadow_read_write_fields[i],
3229                          vmx_vmread_bitmap);
3230        }
3231        for (i = 0; i < max_shadow_read_only_fields; i++)
3232                clear_bit(shadow_read_only_fields[i],
3233                          vmx_vmread_bitmap);
3234}
3235
3236static __init int alloc_kvm_area(void)
3237{
3238        int cpu;
3239
3240        for_each_possible_cpu(cpu) {
3241                struct vmcs *vmcs;
3242
3243                vmcs = alloc_vmcs_cpu(cpu);
3244                if (!vmcs) {
3245                        free_kvm_area();
3246                        return -ENOMEM;
3247                }
3248
3249                per_cpu(vmxarea, cpu) = vmcs;
3250        }
3251        return 0;
3252}
3253
3254static bool emulation_required(struct kvm_vcpu *vcpu)
3255{
3256        return emulate_invalid_guest_state && !guest_state_valid(vcpu);
3257}
3258
3259static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
3260                struct kvm_segment *save)
3261{
3262        if (!emulate_invalid_guest_state) {
3263                /*
3264                 * CS and SS RPL should be equal during guest entry according
3265                 * to VMX spec, but in reality it is not always so. Since vcpu
3266                 * is in the middle of the transition from real mode to
3267                 * protected mode it is safe to assume that RPL 0 is a good
3268                 * default value.
3269                 */
3270                if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS)
3271                        save->selector &= ~SELECTOR_RPL_MASK;
3272                save->dpl = save->selector & SELECTOR_RPL_MASK;
3273                save->s = 1;
3274        }
3275        vmx_set_segment(vcpu, save, seg);
3276}
3277
3278static void enter_pmode(struct kvm_vcpu *vcpu)
3279{
3280        unsigned long flags;
3281        struct vcpu_vmx *vmx = to_vmx(vcpu);
3282
3283        /*
3284         * Update the real mode segment cache. It may not be up to date if a
3285         * segment register was written while the vcpu was in guest mode.
3286         */
3287        vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
3288        vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
3289        vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
3290        vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
3291        vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
3292        vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
3293
3294        vmx->rmode.vm86_active = 0;
3295
3296        vmx_segment_cache_clear(vmx);
3297
3298        vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
3299
3300        flags = vmcs_readl(GUEST_RFLAGS);
3301        flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
3302        flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
3303        vmcs_writel(GUEST_RFLAGS, flags);
3304
3305        vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
3306                        (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
3307
3308        update_exception_bitmap(vcpu);
3309
3310        fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
3311        fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
3312        fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
3313        fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
3314        fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
3315        fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
3316}
3317
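    /*
     * Force a segment into the shape expected by vm86-based real-mode
     * emulation: a selector derived from the paragraph-aligned base, a
     * 64K limit and a writable, accessed type with DPL 3.
     */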
3318static void fix_rmode_seg(int seg, struct kvm_segment *save)
3319{
3320        const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
3321        struct kvm_segment var = *save;
3322
3323        var.dpl = 0x3;
3324        if (seg == VCPU_SREG_CS)
3325                var.type = 0x3;
3326
3327        if (!emulate_invalid_guest_state) {
3328                var.selector = var.base >> 4;
3329                var.base = var.base & 0xffff0;
3330                var.limit = 0xffff;
3331                var.g = 0;
3332                var.db = 0;
3333                var.present = 1;
3334                var.s = 1;
3335                var.l = 0;
3336                var.unusable = 0;
3337                var.type = 0x3;
3338                var.avl = 0;
3339                if (save->base & 0xf)
3340                        printk_once(KERN_WARNING "kvm: segment base is not "
3341                                        "paragraph aligned when entering "
3342                                        "protected mode (seg=%d)", seg);
3343        }
3344
3345        vmcs_write16(sf->selector, var.selector);
3346        vmcs_write32(sf->base, var.base);
3347        vmcs_write32(sf->limit, var.limit);
3348        vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var));
3349}
3350
3351static void enter_rmode(struct kvm_vcpu *vcpu)
3352{
3353        unsigned long flags;
3354        struct vcpu_vmx *vmx = to_vmx(vcpu);
3355
3356        vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
3357        vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
3358        vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
3359        vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
3360        vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
3361        vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
3362        vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
3363
3364        vmx->rmode.vm86_active = 1;
3365
3366        /*
3367         * Very old userspace does not call KVM_SET_TSS_ADDR before entering
3368         * vcpu. Warn the user that an update is overdue.
3369         */
3370        if (!vcpu->kvm->arch.tss_addr)
3371                printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR needs to be "
3372                             "called before entering vcpu\n");
3373
3374        vmx_segment_cache_clear(vmx);
3375
3376        vmcs_writel(GUEST_TR_BASE, vcpu->kvm->arch.tss_addr);
3377        vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
3378        vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
3379
3380        flags = vmcs_readl(GUEST_RFLAGS);
3381        vmx->rmode.save_rflags = flags;
3382
3383        flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
3384
3385        vmcs_writel(GUEST_RFLAGS, flags);
3386        vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
3387        update_exception_bitmap(vcpu);
3388
3389        fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
3390        fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
3391        fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
3392        fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
3393        fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
3394        fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
3395
3396        kvm_mmu_reset_context(vcpu);
3397}
3398
3399static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
3400{
3401        struct vcpu_vmx *vmx = to_vmx(vcpu);
3402        struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
3403
3404        if (!msr)
3405                return;
3406
3407        /*
3408         * Force kernel_gs_base reloading before EFER changes, as control
3409         * of this msr depends on is_long_mode().
3410         */
3411        vmx_load_host_state(to_vmx(vcpu));
3412        vcpu->arch.efer = efer;
3413        if (efer & EFER_LMA) {
3414                vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
3415                msr->data = efer;
3416        } else {
3417                vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
3418
3419                msr->data = efer & ~EFER_LME;
3420        }
3421        setup_msrs(vmx);
3422}
3423
3424#ifdef CONFIG_X86_64
3425
3426static void enter_lmode(struct kvm_vcpu *vcpu)
3427{
3428        u32 guest_tr_ar;
3429
3430        vmx_segment_cache_clear(to_vmx(vcpu));
3431
3432        guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
3433        if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
3434                pr_debug_ratelimited("%s: tss fixup for long mode.\n",
3435                                     __func__);
3436                vmcs_write32(GUEST_TR_AR_BYTES,
3437                             (guest_tr_ar & ~AR_TYPE_MASK)
3438                             | AR_TYPE_BUSY_64_TSS);
3439        }
3440        vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA);
3441}
3442
3443static void exit_lmode(struct kvm_vcpu *vcpu)
3444{
3445        vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
3446        vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);
3447}
3448
3449#endif
3450
3451static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
3452{
3453        vpid_sync_context(to_vmx(vcpu));
3454        if (enable_ept) {
3455                if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
3456                        return;
3457                ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa));
3458        }
3459}
3460
3461static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
3462{
3463        ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
3464
3465        vcpu->arch.cr0 &= ~cr0_guest_owned_bits;
3466        vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits;
3467}
3468
3469static void vmx_decache_cr3(struct kvm_vcpu *vcpu)
3470{
3471        if (enable_ept && is_paging(vcpu))
3472                vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
3473        __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
3474}
3475
3476static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
3477{
3478        ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
3479
3480        vcpu->arch.cr4 &= ~cr4_guest_owned_bits;
3481        vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits;
3482}
3483
3484static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
3485{
3486        struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
3487
3488        if (!test_bit(VCPU_EXREG_PDPTR,
3489                      (unsigned long *)&vcpu->arch.regs_dirty))
3490                return;
3491
3492        if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
3493                vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]);
3494                vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]);
3495                vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]);
3496                vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]);
3497        }
3498}
3499
3500static void ept_save_pdptrs(struct kvm_vcpu *vcpu)
3501{
3502        struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
3503
3504        if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
3505                mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
3506                mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
3507                mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
3508                mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
3509        }
3510
3511        __set_bit(VCPU_EXREG_PDPTR,
3512                  (unsigned long *)&vcpu->arch.regs_avail);
3513        __set_bit(VCPU_EXREG_PDPTR,
3514                  (unsigned long *)&vcpu->arch.regs_dirty);
3515}
3516
3517static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
3518
3519static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
3520                                        unsigned long cr0,
3521                                        struct kvm_vcpu *vcpu)
3522{
3523        if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
3524                vmx_decache_cr3(vcpu);
3525        if (!(cr0 & X86_CR0_PG)) {
3526                /* From paging/starting to nonpaging */
3527                vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
3528                             vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) |
3529                             (CPU_BASED_CR3_LOAD_EXITING |
3530                              CPU_BASED_CR3_STORE_EXITING));
3531                vcpu->arch.cr0 = cr0;
3532                vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
3533        } else if (!is_paging(vcpu)) {
3534                /* From nonpaging to paging */
3535                vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
3536                             vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) &
3537                             ~(CPU_BASED_CR3_LOAD_EXITING |
3538                               CPU_BASED_CR3_STORE_EXITING));
3539                vcpu->arch.cr0 = cr0;
3540                vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
3541        }
3542
3543        if (!(cr0 & X86_CR0_WP))
3544                *hw_cr0 &= ~X86_CR0_WP;
3545}
3546
3547static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
3548{
3549        struct vcpu_vmx *vmx = to_vmx(vcpu);
3550        unsigned long hw_cr0;
3551
3552        hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK);
3553        if (enable_unrestricted_guest)
3554                hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
3555        else {
3556                hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
3557
3558                if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
3559                        enter_pmode(vcpu);
3560
3561                if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))
3562                        enter_rmode(vcpu);
3563        }
3564
3565#ifdef CONFIG_X86_64
3566        if (vcpu->arch.efer & EFER_LME) {
3567                if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
3568                        enter_lmode(vcpu);
3569                if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
3570                        exit_lmode(vcpu);
3571        }
3572#endif
3573
3574        if (enable_ept)
3575                ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
3576
3577        if (!vcpu->fpu_active)
3578                hw_cr0 |= X86_CR0_TS | X86_CR0_MP;
3579
3580        vmcs_writel(CR0_READ_SHADOW, cr0);
3581        vmcs_writel(GUEST_CR0, hw_cr0);
3582        vcpu->arch.cr0 = cr0;
3583
3584        /* depends on vcpu->arch.cr0 to be set to a new value */
3585        vmx->emulation_required = emulation_required(vcpu);
3586}
3587
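    /*
     * Build the EPT pointer: default memory type, default guest address
     * width (page-walk length), optionally the accessed/dirty bits, and
     * the physical address of the EPT root table.
     */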
3588static u64 construct_eptp(unsigned long root_hpa)
3589{
3590        u64 eptp;
3591
3592        /* TODO: write the value read from the MSR */
3593        eptp = VMX_EPT_DEFAULT_MT |
3594                VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT;
3595        if (enable_ept_ad_bits)
3596                eptp |= VMX_EPT_AD_ENABLE_BIT;
3597        eptp |= (root_hpa & PAGE_MASK);
3598
3599        return eptp;
3600}
3601
3602static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
3603{
3604        unsigned long guest_cr3;
3605        u64 eptp;
3606
3607        guest_cr3 = cr3;
3608        if (enable_ept) {
3609                eptp = construct_eptp(cr3);
3610                vmcs_write64(EPT_POINTER, eptp);
3611                if (is_paging(vcpu) || is_guest_mode(vcpu))
3612                        guest_cr3 = kvm_read_cr3(vcpu);
3613                else
3614                        guest_cr3 = vcpu->kvm->arch.ept_identity_map_addr;
3615                ept_load_pdptrs(vcpu);
3616        }
3617
3618        vmx_flush_tlb(vcpu);
3619        vmcs_writel(GUEST_CR3, guest_cr3);
3620}
3621
3622static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
3623{
3624        unsigned long hw_cr4 = cr4 | (to_vmx(vcpu)->rmode.vm86_active ?
3625                    KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
3626
3627        if (cr4 & X86_CR4_VMXE) {
3628                /*
3629                 * To use VMXON (and later other VMX instructions), a guest
3630                 * must first be able to turn on cr4.VMXE (see handle_vmon()).
3631                 * So basically the check on whether to allow nested VMX
3632                 * is here.
3633                 */
3634                if (!nested_vmx_allowed(vcpu))
3635                        return 1;
3636        }
3637        if (to_vmx(vcpu)->nested.vmxon &&
3638            ((cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON))
3639                return 1;
3640
3641        vcpu->arch.cr4 = cr4;
3642        if (enable_ept) {
3643                if (!is_paging(vcpu)) {
3644                        hw_cr4 &= ~X86_CR4_PAE;
3645                        hw_cr4 |= X86_CR4_PSE;
3646                        /*
3647                         * SMEP/SMAP is disabled when the CPU is in
3648                         * non-paging mode in hardware. However, KVM always
3649                         * uses paging mode to emulate guest non-paging
3650                         * mode with TDP. To emulate this behavior,
3651                         * SMEP/SMAP needs to be manually disabled when the
3652                         * guest switches to non-paging mode.
3653                         */
3654                        hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP);
3655                } else if (!(cr4 & X86_CR4_PAE)) {
3656                        hw_cr4 &= ~X86_CR4_PAE;
3657                }
3658        }
3659
3660        vmcs_writel(CR4_READ_SHADOW, cr4);
3661        vmcs_writel(GUEST_CR4, hw_cr4);
3662        return 0;
3663}
3664
3665static void vmx_get_segment(struct kvm_vcpu *vcpu,
3666                            struct kvm_segment *var, int seg)
3667{
3668        struct vcpu_vmx *vmx = to_vmx(vcpu);
3669        u32 ar;
3670
3671        if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
3672                *var = vmx->rmode.segs[seg];
3673                if (seg == VCPU_SREG_TR
3674                    || var->selector == vmx_read_guest_seg_selector(vmx, seg))
3675                        return;
3676                var->base = vmx_read_guest_seg_base(vmx, seg);
3677                var->selector = vmx_read_guest_seg_selector(vmx, seg);
3678                return;
3679        }
3680        var->base = vmx_read_guest_seg_base(vmx, seg);
3681        var->limit = vmx_read_guest_seg_limit(vmx, seg);
3682        var->selector = vmx_read_guest_seg_selector(vmx, seg);
3683        ar = vmx_read_guest_seg_ar(vmx, seg);
3684        var->unusable = (ar >> 16) & 1;
3685        var->type = ar & 15;
3686        var->s = (ar >> 4) & 1;
3687        var->dpl = (ar >> 5) & 3;
3688        /*
3689         * Some userspaces do not preserve the unusable property. Since a
3690         * usable segment has to be present according to the VMX spec, we can
3691         * use the present property to work around this userspace bug by making
3692         * an unusable segment always non-present. vmx_segment_access_rights()
3693         * already marks a non-present segment as unusable.
3694         */
3695        var->present = !var->unusable;
3696        var->avl = (ar >> 12) & 1;
3697        var->l = (ar >> 13) & 1;
3698        var->db = (ar >> 14) & 1;
3699        var->g = (ar >> 15) & 1;
3700}
3701
3702static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
3703{
3704        struct kvm_segment s;
3705
3706        if (to_vmx(vcpu)->rmode.vm86_active) {
3707                vmx_get_segment(vcpu, &s, seg);
3708                return s.base;
3709        }
3710        return vmx_read_guest_seg_base(to_vmx(vcpu), seg);
3711}
3712
3713static int vmx_get_cpl(struct kvm_vcpu *vcpu)
3714{
3715        struct vcpu_vmx *vmx = to_vmx(vcpu);
3716
3717        if (unlikely(vmx->rmode.vm86_active))
3718                return 0;
3719        else {
3720                int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS);
3721                return AR_DPL(ar);
3722        }
3723}
3724
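    /*
     * Pack a struct kvm_segment's attributes into the VMCS access-rights
     * format. An unusable or non-present segment is encoded with only the
     * "unusable" bit (bit 16) set.
     */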
3725static u32 vmx_segment_access_rights(struct kvm_segment *var)
3726{
3727        u32 ar;
3728
3729        if (var->unusable || !var->present)
3730                ar = 1 << 16;
3731        else {
3732                ar = var->type & 15;
3733                ar |= (var->s & 1) << 4;
3734                ar |= (var->dpl & 3) << 5;
3735                ar |= (var->present & 1) << 7;
3736                ar |= (var->avl & 1) << 12;
3737                ar |= (var->l & 1) << 13;
3738                ar |= (var->db & 1) << 14;
3739                ar |= (var->g & 1) << 15;
3740        }
3741
3742        return ar;
3743}
3744
3745static void vmx_set_segment(struct kvm_vcpu *vcpu,
3746                            struct kvm_segment *var, int seg)
3747{
3748        struct vcpu_vmx *vmx = to_vmx(vcpu);
3749        const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
3750
3751        vmx_segment_cache_clear(vmx);
3752
3753        if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
3754                vmx->rmode.segs[seg] = *var;
3755                if (seg == VCPU_SREG_TR)
3756                        vmcs_write16(sf->selector, var->selector);
3757                else if (var->s)
3758                        fix_rmode_seg(seg, &vmx->rmode.segs[seg]);
3759                goto out;
3760        }
3761
3762        vmcs_writel(sf->base, var->base);
3763        vmcs_write32(sf->limit, var->limit);
3764        vmcs_write16(sf->selector, var->selector);
3765
3766        /*
3767         *   Fix the "Accessed" bit in the AR field of segment registers for
3768         * older qemu binaries.
3769         *   The IA-32 architecture specifies that at processor reset the
3770         * "Accessed" bit in the AR field of segment registers is 1, but qemu
3771         * sets it to 0 in its userland code. This causes an invalid guest
3772         * state vmexit when "unrestricted guest" mode is turned on.
3773         *   A fix for this setup issue in cpu_reset has been pushed to the
3774         * qemu tree. Newer qemu binaries with that fix will not need this
3775         * kvm hack.
3776         */
3777        if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR))
3778                var->type |= 0x1; /* Accessed */
3779
3780        vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
3781
3782out:
3783        vmx->emulation_required = emulation_required(vcpu);
3784}
3785
3786static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
3787{
3788        u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS);
3789
3790        *db = (ar >> 14) & 1;
3791        *l = (ar >> 13) & 1;
3792}
3793
3794static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3795{
3796        dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
3797        dt->address = vmcs_readl(GUEST_IDTR_BASE);
3798}
3799
3800static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3801{
3802        vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
3803        vmcs_writel(GUEST_IDTR_BASE, dt->address);
3804}
3805
3806static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3807{
3808        dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
3809        dt->address = vmcs_readl(GUEST_GDTR_BASE);
3810}
3811
3812static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3813{
3814        vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
3815        vmcs_writel(GUEST_GDTR_BASE, dt->address);
3816}
3817
3818static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
3819{
3820        struct kvm_segment var;
3821        u32 ar;
3822
3823        vmx_get_segment(vcpu, &var, seg);
3824        var.dpl = 0x3;
3825        if (seg == VCPU_SREG_CS)
3826                var.type = 0x3;
3827        ar = vmx_segment_access_rights(&var);
3828
3829        if (var.base != (var.selector << 4))
3830                return false;
3831        if (var.limit != 0xffff)
3832                return false;
3833        if (ar != 0xf3)
3834                return false;
3835
3836        return true;
3837}
3838
3839static bool code_segment_valid(struct kvm_vcpu *vcpu)
3840{
3841        struct kvm_segment cs;
3842        unsigned int cs_rpl;
3843
3844        vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
3845        cs_rpl = cs.selector & SELECTOR_RPL_MASK;
3846
3847        if (cs.unusable)
3848                return false;
3849        if (~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_ACCESSES_MASK))
3850                return false;
3851        if (!cs.s)
3852                return false;
3853        if (cs.type & AR_TYPE_WRITEABLE_MASK) {
3854                if (cs.dpl > cs_rpl)
3855                        return false;
3856        } else {
3857                if (cs.dpl != cs_rpl)
3858                        return false;
3859        }
3860        if (!cs.present)
3861                return false;
3862
3863        /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */
3864        return true;
3865}
3866
3867static bool stack_segment_valid(struct kvm_vcpu *vcpu)
3868{
3869        struct kvm_segment ss;
3870        unsigned int ss_rpl;
3871
3872        vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
3873        ss_rpl = ss.selector & SELECTOR_RPL_MASK;
3874
3875        if (ss.unusable)
3876                return true;
3877        if (ss.type != 3 && ss.type != 7)
3878                return false;
3879        if (!ss.s)
3880                return false;
3881        if (ss.dpl != ss_rpl) /* DPL != RPL */
3882                return false;
3883        if (!ss.present)
3884                return false;
3885
3886        return true;
3887}
3888
3889static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
3890{
3891        struct kvm_segment var;
3892        unsigned int rpl;
3893
3894        vmx_get_segment(vcpu, &var, seg);
3895        rpl = var.selector & SELECTOR_RPL_MASK;
3896
3897        if (var.unusable)
3898                return true;
3899        if (!var.s)
3900                return false;
3901        if (!var.present)
3902                return false;
3903        if (~var.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK)) {
3904                if (var.dpl < rpl) /* DPL < RPL */
3905                        return false;
3906        }
3907
3908        /* TODO: Add other members to kvm_segment_field to allow checking for other access
3909         * rights flags
3910         */
3911        return true;
3912}
3913
3914static bool tr_valid(struct kvm_vcpu *vcpu)
3915{
3916        struct kvm_segment tr;
3917
3918        vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);
3919
3920        if (tr.unusable)
3921                return false;
3922        if (tr.selector & SELECTOR_TI_MASK)     /* TI = 1 */
3923                return false;
3924        if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */
3925                return false;
3926        if (!tr.present)
3927                return false;
3928
3929        return true;
3930}
3931
3932static bool ldtr_valid(struct kvm_vcpu *vcpu)
3933{
3934        struct kvm_segment ldtr;
3935
3936        vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);
3937
3938        if (ldtr.unusable)
3939                return true;
3940        if (ldtr.selector & SELECTOR_TI_MASK)   /* TI = 1 */
3941                return false;
3942        if (ldtr.type != 2)
3943                return false;
3944        if (!ldtr.present)
3945                return false;
3946
3947        return true;
3948}
3949
3950static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
3951{
3952        struct kvm_segment cs, ss;
3953
3954        vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
3955        vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
3956
3957        return ((cs.selector & SELECTOR_RPL_MASK) ==
3958                 (ss.selector & SELECTOR_RPL_MASK));
3959}
3960
3961/*
3962 * Check if the guest state is valid. Returns true if valid, false if
3963 * not.
3964 * We assume that registers are always usable.
3965 */
3966static bool guest_state_valid(struct kvm_vcpu *vcpu)
3967{
3968        if (enable_unrestricted_guest)
3969                return true;
3970
3971        /* real mode guest state checks */
3972        if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
3973                if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
3974                        return false;
3975                if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
3976                        return false;
3977                if (!rmode_segment_valid(vcpu, VCPU_SREG_DS))
3978                        return false;
3979                if (!rmode_segment_valid(vcpu, VCPU_SREG_ES))
3980                        return false;
3981                if (!rmode_segment_valid(vcpu, VCPU_SREG_FS))
3982                        return false;
3983                if (!rmode_segment_valid(vcpu, VCPU_SREG_GS))
3984                        return false;
3985        } else {
3986                /* protected mode guest state checks */
3987                if (!cs_ss_rpl_check(vcpu))
3988                        return false;
3989                if (!code_segment_valid(vcpu))
3990                        return false;
3991                if (!stack_segment_valid(vcpu))
3992                        return false;
3993                if (!data_segment_valid(vcpu, VCPU_SREG_DS))
3994                        return false;
3995                if (!data_segment_valid(vcpu, VCPU_SREG_ES))
3996                        return false;
3997                if (!data_segment_valid(vcpu, VCPU_SREG_FS))
3998                        return false;
3999                if (!data_segment_valid(vcpu, VCPU_SREG_GS))
4000                        return false;
4001                if (!tr_valid(vcpu))
4002                        return false;
4003                if (!ldtr_valid(vcpu))
4004                        return false;
4005        }
4006        /* TODO:
4007         * - Add checks on RIP
4008         * - Add checks on RFLAGS
4009         */
4010
4011        return true;
4012}
4013
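    /*
     * Initialize the real-mode TSS at kvm->arch.tss_addr: clear its pages,
     * point the I/O bitmap base past the interrupt redirection map and
     * write the terminating 0xff byte at the end of the I/O bitmap.
     */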
4014static int init_rmode_tss(struct kvm *kvm)
4015{
4016        gfn_t fn;
4017        u16 data = 0;
4018        int idx, r;
4019
4020        idx = srcu_read_lock(&kvm->srcu);
4021        fn = kvm->arch.tss_addr >> PAGE_SHIFT;
4022        r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
4023        if (r < 0)
4024                goto out;
4025        data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
4026        r = kvm_write_guest_page(kvm, fn++, &data,
4027                        TSS_IOPB_BASE_OFFSET, sizeof(u16));
4028        if (r < 0)
4029                goto out;
4030        r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
4031        if (r < 0)
4032                goto out;
4033        r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
4034        if (r < 0)
4035                goto out;
4036        data = ~0;
4037        r = kvm_write_guest_page(kvm, fn, &data,
4038                                 RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
4039                                 sizeof(u8));
4040out:
4041        srcu_read_unlock(&kvm->srcu, idx);
4042        return r;
4043}
4044
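    /*
     * Build the identity-mapped page table used when EPT is enabled and
     * the guest runs without paging: a single page of 4MB PSE entries
     * mapping the low 4GB of guest physical memory one-to-one.
     */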
4045static int init_rmode_identity_map(struct kvm *kvm)
4046{
4047        int i, idx, r = 0;
4048        pfn_t identity_map_pfn;
4049        u32 tmp;
4050
4051        if (!enable_ept)
4052                return 0;
4053
4054        /* Protect kvm->arch.ept_identity_pagetable_done. */
4055        mutex_lock(&kvm->slots_lock);
4056
4057        if (likely(kvm->arch.ept_identity_pagetable_done))
4058                goto out2;
4059
4060        identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT;
4061
4062        r = alloc_identity_pagetable(kvm);
4063        if (r < 0)
4064                goto out2;
4065
4066        idx = srcu_read_lock(&kvm->srcu);
4067        r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
4068        if (r < 0)
4069                goto out;
4070        /* Set up identity-mapping pagetable for EPT in real mode */
4071        for (i = 0; i < PT32_ENT_PER_PAGE; i++) {
4072                tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
4073                        _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
4074                r = kvm_write_guest_page(kvm, identity_map_pfn,
4075                                &tmp, i * sizeof(tmp), sizeof(tmp));
4076                if (r < 0)
4077                        goto out;
4078        }
4079        kvm->arch.ept_identity_pagetable_done = true;
4080
4081out:
4082        srcu_read_unlock(&kvm->srcu, idx);
4083
4084out2:
4085        mutex_unlock(&kvm->slots_lock);
4086        return r;
4087}
4088
4089static void seg_setup(int seg)
4090{
4091        const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
4092        unsigned int ar;
4093
4094        vmcs_write16(sf->selector, 0);
4095        vmcs_writel(sf->base, 0);
4096        vmcs_write32(sf->limit, 0xffff);
4097        ar = 0x93;
4098        if (seg == VCPU_SREG_CS)
4099                ar |= 0x08; /* code segment */
4100
4101        vmcs_write32(sf->ar_bytes, ar);
4102}
4103
4104static int alloc_apic_access_page(struct kvm *kvm)
4105{
4106        struct page *page;
4107        struct kvm_userspace_memory_region kvm_userspace_mem;
4108        int r = 0;
4109
4110        mutex_lock(&kvm->slots_lock);
4111        if (kvm->arch.apic_access_page_done)
4112                goto out;
4113        kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
4114        kvm_userspace_mem.flags = 0;
4115        kvm_userspace_mem.guest_phys_addr = APIC_DEFAULT_PHYS_BASE;
4116        kvm_userspace_mem.memory_size = PAGE_SIZE;
4117        r = __kvm_set_memory_region(kvm, &kvm_userspace_mem);
4118        if (r)
4119                goto out;
4120
4121        page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
4122        if (is_error_page(page)) {
4123                r = -EFAULT;
4124                goto out;
4125        }
4126
4127        /*
4128         * Do not pin the page in memory, so that memory hot-unplug
4129         * is able to migrate it.
4130         */
4131        put_page(page);
4132        kvm->arch.apic_access_page_done = true;
4133out:
4134        mutex_unlock(&kvm->slots_lock);
4135        return r;
4136}
4137
4138static int alloc_identity_pagetable(struct kvm *kvm)
4139{
4140        /* Called with kvm->slots_lock held. */
4141
4142        struct kvm_userspace_memory_region kvm_userspace_mem;
4143        int r = 0;
4144
4145        BUG_ON(kvm->arch.ept_identity_pagetable_done);
4146
4147        kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT;
4148        kvm_userspace_mem.flags = 0;
4149        kvm_userspace_mem.guest_phys_addr =
4150                kvm->arch.ept_identity_map_addr;
4151        kvm_userspace_mem.memory_size = PAGE_SIZE;
4152        r = __kvm_set_memory_region(kvm, &kvm_userspace_mem);
4153
4154        return r;
4155}
4156
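    /*
     * Assign the vcpu a free VPID from the global bitmap. vmx->vpid is
     * left at 0 (no dedicated VPID) if VPIDs are disabled or exhausted.
     */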
4157static void allocate_vpid(struct vcpu_vmx *vmx)
4158{
4159        int vpid;
4160
4161        vmx->vpid = 0;
4162        if (!enable_vpid)
4163                return;
4164        spin_lock(&vmx_vpid_lock);
4165        vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
4166        if (vpid < VMX_NR_VPIDS) {
4167                vmx->vpid = vpid;
4168                __set_bit(vpid, vmx_vpid_bitmap);
4169        }
4170        spin_unlock(&vmx_vpid_lock);
4171}
4172
4173static void free_vpid(struct vcpu_vmx *vmx)
4174{
4175        if (!enable_vpid)
4176                return;
4177        spin_lock(&vmx_vpid_lock);
4178        if (vmx->vpid != 0)
4179                __clear_bit(vmx->vpid, vmx_vpid_bitmap);
4180        spin_unlock(&vmx_vpid_lock);
4181}
4182
4183#define MSR_TYPE_R      1
4184#define MSR_TYPE_W      2
4185static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
4186                                                u32 msr, int type)
4187{
4188        int f = sizeof(unsigned long);
4189
4190        if (!cpu_has_vmx_msr_bitmap())
4191                return;
4192
4193        /*
4194         * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
4195         * have the write-low and read-high bitmap offsets the wrong way round.
4196         * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
4197         */
4198        if (msr <= 0x1fff) {
4199                if (type & MSR_TYPE_R)
4200                        /* read-low */
4201                        __clear_bit(msr, msr_bitmap + 0x000 / f);
4202
4203                if (type & MSR_TYPE_W)
4204                        /* write-low */
4205                        __clear_bit(msr, msr_bitmap + 0x800 / f);
4206
4207        } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
4208                msr &= 0x1fff;
4209                if (type & MSR_TYPE_R)
4210                        /* read-high */
4211                        __clear_bit(msr, msr_bitmap + 0x400 / f);
4212
4213                if (type & MSR_TYPE_W)
4214                        /* write-high */
4215                        __clear_bit(msr, msr_bitmap + 0xc00 / f);
4216
4217        }
4218}
4219
4220static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
4221                                                u32 msr, int type)
4222{
4223        int f = sizeof(unsigned long);
4224
4225        if (!cpu_has_vmx_msr_bitmap())
4226                return;
4227
4228        /*
4229         * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
4230         * have the write-low and read-high bitmap offsets the wrong way round.
4231         * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
4232         */
4233        if (msr <= 0x1fff) {
4234                if (type & MSR_TYPE_R)
4235                        /* read-low */
4236                        __set_bit(msr, msr_bitmap + 0x000 / f);
4237
4238                if (type & MSR_TYPE_W)
4239                        /* write-low */
4240                        __set_bit(msr, msr_bitmap + 0x800 / f);
4241
4242        } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
4243                msr &= 0x1fff;
4244                if (type & MSR_TYPE_R)
4245                        /* read-high */
4246                        __set_bit(msr, msr_bitmap + 0x400 / f);
4247
4248                if (type & MSR_TYPE_W)
4249                        /* write-high */
4250                        __set_bit(msr, msr_bitmap + 0xc00 / f);
4251
4252        }
4253}
4254
4255/*
4256 * If an MSR is allowed by L0, we should check whether it is also allowed
4257 * by L1. The corresponding bit will be cleared unless both L0 and L1 allow it.
4258 */
4259static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
4260                                               unsigned long *msr_bitmap_nested,
4261                                               u32 msr, int type)
4262{
4263        int f = sizeof(unsigned long);
4264
4265        if (!cpu_has_vmx_msr_bitmap()) {
4266                WARN_ON(1);
4267                return;
4268        }
4269
4270        /*
4271         * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
4272         * have the write-low and read-high bitmap offsets the wrong way round.
4273         * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
4274         */
4275        if (msr <= 0x1fff) {
4276                if (type & MSR_TYPE_R &&
4277                   !test_bit(msr, msr_bitmap_l1 + 0x000 / f))
4278                        /* read-low */
4279                        __clear_bit(msr, msr_bitmap_nested + 0x000 / f);
4280
4281                if (type & MSR_TYPE_W &&
4282                   !test_bit(msr, msr_bitmap_l1 + 0x800 / f))
4283                        /* write-low */
4284                        __clear_bit(msr, msr_bitmap_nested + 0x800 / f);
4285
4286        } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
4287                msr &= 0x1fff;
4288                if (type & MSR_TYPE_R &&
4289                   !test_bit(msr, msr_bitmap_l1 + 0x400 / f))
4290                        /* read-high */
4291                        __clear_bit(msr, msr_bitmap_nested + 0x400 / f);
4292
4293                if (type & MSR_TYPE_W &&
4294                   !test_bit(msr, msr_bitmap_l1 + 0xc00 / f))
4295                        /* write-high */
4296                        __clear_bit(msr, msr_bitmap_nested + 0xc00 / f);
4297
4298        }
4299}
4300
4301static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
4302{
4303        if (!longmode_only)
4304                __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy,
4305                                                msr, MSR_TYPE_R | MSR_TYPE_W);
4306        __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode,
4307                                                msr, MSR_TYPE_R | MSR_TYPE_W);
4308}
4309
4310static void vmx_enable_intercept_msr_read_x2apic(u32 msr)
4311{
4312        __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
4313                        msr, MSR_TYPE_R);
4314        __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
4315                        msr, MSR_TYPE_R);
4316}
4317
4318static void vmx_disable_intercept_msr_read_x2apic(u32 msr)
4319{
4320        __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
4321                        msr, MSR_TYPE_R);
4322        __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
4323                        msr, MSR_TYPE_R);
4324}
4325
4326static void vmx_disable_intercept_msr_write_x2apic(u32 msr)
4327{
4328        __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
4329                        msr, MSR_TYPE_W);
4330        __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
4331                        msr, MSR_TYPE_W);
4332}
4333
4334static int vmx_vm_has_apicv(struct kvm *kvm)
4335{
4336        return enable_apicv && irqchip_in_kernel(kvm);
4337}
4338
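    /*
     * Complete delivery of a pending nested posted interrupt: if the
     * descriptor's ON bit is set, merge L1's PIR into the virtual-APIC
     * page and raise the guest interrupt status (RVI) if necessary.
     */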
4339static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
4340{
4341        struct vcpu_vmx *vmx = to_vmx(vcpu);
4342        int max_irr;
4343        void *vapic_page;
4344        u16 status;
4345
4346        if (vmx->nested.pi_desc &&
4347            vmx->nested.pi_pending) {
4348                vmx->nested.pi_pending = false;
4349                if (!pi_test_and_clear_on(vmx->nested.pi_desc))
4350                        return 0;
4351
4352                max_irr = find_last_bit(
4353                        (unsigned long *)vmx->nested.pi_desc->pir, 256);
4354
4355                if (max_irr == 256)
4356                        return 0;
4357
4358                vapic_page = kmap(vmx->nested.virtual_apic_page);
4359                if (!vapic_page) {
4360                        WARN_ON(1);
4361                        return -ENOMEM;
4362                }
4363                __kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page);
4364                kunmap(vmx->nested.virtual_apic_page);
4365
4366                status = vmcs_read16(GUEST_INTR_STATUS);
4367                if ((u8)max_irr > ((u8)status & 0xff)) {
4368                        status &= ~0xff;
4369                        status |= (u8)max_irr;
4370                        vmcs_write16(GUEST_INTR_STATUS, status);
4371                }
4372        }
4373        return 0;
4374}
4375
4376static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu)
4377{
4378#ifdef CONFIG_SMP
4379        if (vcpu->mode == IN_GUEST_MODE) {
4380                apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
4381                                POSTED_INTR_VECTOR);
4382                return true;
4383        }
4384#endif
4385        return false;
4386}
4387
4388static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
4389                                                int vector)
4390{
4391        struct vcpu_vmx *vmx = to_vmx(vcpu);
4392
4393        if (is_guest_mode(vcpu) &&
4394            vector == vmx->nested.posted_intr_nv) {
4395                /* the PIR and ON have been set by L1. */
4396                kvm_vcpu_trigger_posted_interrupt(vcpu);
4397                /*
4398                 * If the posted interrupt is not recognized by hardware,
4399                 * it will be delivered on the next vmentry.
4400                 */
4401                vmx->nested.pi_pending = true;
4402                kvm_make_request(KVM_REQ_EVENT, vcpu);
4403                return 0;
4404        }
4405        return -1;
4406}
4407/*
4408 * Send an interrupt to the vcpu via the posted-interrupt mechanism.
4409 * 1. If the target vcpu is running (non-root mode), send a posted-interrupt
4410 * notification and the hardware will sync PIR to vIRR atomically.
4411 * 2. If the target vcpu isn't running (root mode), kick it so that it picks
4412 * up the interrupt from PIR on the next vmentry.
4413 */
4414static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
4415{
4416        struct vcpu_vmx *vmx = to_vmx(vcpu);
4417        int r;
4418
4419        r = vmx_deliver_nested_posted_interrupt(vcpu, vector);
4420        if (!r)
4421                return;
4422
4423        if (pi_test_and_set_pir(vector, &vmx->pi_desc))
4424                return;
4425
4426        r = pi_test_and_set_on(&vmx->pi_desc);
4427        kvm_make_request(KVM_REQ_EVENT, vcpu);
4428        if (r || !kvm_vcpu_trigger_posted_interrupt(vcpu))
4429                kvm_vcpu_kick(vcpu);
4430}
4431
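    /*
     * If the posted-interrupt descriptor's ON bit is set, clear it and
     * merge the posted-interrupt requests (PIR) into the virtual IRR.
     */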
4432static void vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
4433{
4434        struct vcpu_vmx *vmx = to_vmx(vcpu);
4435
4436        if (!pi_test_and_clear_on(&vmx->pi_desc))
4437                return;
4438
4439        kvm_apic_update_irr(vcpu, vmx->pi_desc.pir);
4440}
4441
4442static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu)
4443{
4444        return;
4445}
4446
4447/*
4448 * Set up the vmcs's constant host-state fields, i.e., host-state fields that
4449 * will not change in the lifetime of the guest.
4450 * Note that host-state that does change is set elsewhere. E.g., host-state
4451 * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
4452 */
4453static void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
4454{
4455        u32 low32, high32;
4456        unsigned long tmpl;
4457        struct desc_ptr dt;
4458        unsigned long cr4;
4459
4460        vmcs_writel(HOST_CR0, read_cr0() & ~X86_CR0_TS);  /* 22.2.3 */
4461        vmcs_writel(HOST_CR3, read_cr3());  /* 22.2.3  FIXME: shadow tables */
4462
4463        /* Save the most likely value for this task's CR4 in the VMCS. */
4464        cr4 = cr4_read_shadow();
4465        vmcs_writel(HOST_CR4, cr4);                     /* 22.2.3, 22.2.5 */
4466        vmx->host_state.vmcs_host_cr4 = cr4;
4467
4468        vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
4469#ifdef CONFIG_X86_64
4470        /*
4471         * Load null selectors, so we can avoid reloading them in
4472         * __vmx_load_host_state(), in case userspace uses the null selectors
4473         * too (the expected case).
4474         */
4475        vmcs_write16(HOST_DS_SELECTOR, 0);
4476        vmcs_write16(HOST_ES_SELECTOR, 0);
4477#else
4478        vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
4479        vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
4480#endif
4481        vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
4482        vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */
4483
4484        native_store_idt(&dt);
4485        vmcs_writel(HOST_IDTR_BASE, dt.address);   /* 22.2.4 */
4486        vmx->host_idt_base = dt.address;
4487
4488        vmcs_writel(HOST_RIP, vmx_return); /* 22.2.5 */
4489
4490        rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
4491        vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
4492        rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
4493        vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl);   /* 22.2.3 */
4494
4495        if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
4496                rdmsr(MSR_IA32_CR_PAT, low32, high32);
4497                vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32));
4498        }
4499}
4500
4501static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
4502{
4503        vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS;
4504        if (enable_ept)
4505                vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
4506        if (is_guest_mode(&vmx->vcpu))
4507                vmx->vcpu.arch.cr4_guest_owned_bits &=
4508                        ~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask;
4509        vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
4510}
4511
4512static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
4513{
4514        u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
4515
4516        if (!vmx_vm_has_apicv(vmx->vcpu.kvm))
4517                pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
4518        return pin_based_exec_ctrl;
4519}
4520
4521static u32 vmx_exec_control(struct vcpu_vmx *vmx)
4522{
4523        u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
4524
4525        if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)
4526                exec_control &= ~CPU_BASED_MOV_DR_EXITING;
4527
4528        if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) {
4529                exec_control &= ~CPU_BASED_TPR_SHADOW;
4530#ifdef CONFIG_X86_64
4531                exec_control |= CPU_BASED_CR8_STORE_EXITING |
4532                                CPU_BASED_CR8_LOAD_EXITING;
4533#endif
4534        }
4535        if (!enable_ept)
4536                exec_control |= CPU_BASED_CR3_STORE_EXITING |
4537                                CPU_BASED_CR3_LOAD_EXITING  |
4538                                CPU_BASED_INVLPG_EXITING;
4539        return exec_control;
4540}
4541
4542static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
4543{
4544        u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
4545        if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
4546                exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
4547        if (vmx->vpid == 0)
4548                exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
4549        if (!enable_ept) {
4550                exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
4551                enable_unrestricted_guest = 0;
4552                /* Enabling INVPCID for non-EPT guests may cause a performance regression. */
4553                exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
4554        }
4555        if (!enable_unrestricted_guest)
4556                exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
4557        if (!ple_gap)
4558                exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
4559        if (!vmx_vm_has_apicv(vmx->vcpu.kvm))
4560                exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
4561                                  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
4562        exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
4563        /* SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD
4564           (handle_vmptrld).
4565           We can NOT enable shadow_vmcs here because we do not yet have
4566           a current VMCS12.
4567        */
4568        exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
4569        /* PML is enabled/disabled when creating/destroying the vcpu */
4570        exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
4571
4572        return exec_control;
4573}
4574
4575static void ept_set_mmio_spte_mask(void)
4576{
4577        /*
4578         * EPT Misconfigurations can be generated if the value of bits 2:0
4579         * of an EPT paging-structure entry is 110b (write/execute).
4580         * Also, the magic bits (0x3ull << 62) are set so that an MMIO spte
4581         * can be identified quickly.
4582         */
4583        kvm_mmu_set_mmio_spte_mask((0x3ull << 62) | 0x6ull);
4584}
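
/*
 * Illustrative sketch, not part of the original vmx.c: the constant that
 * ept_set_mmio_spte_mask() installs above, spelled out bit by bit.
 */
static inline u64 example_ept_mmio_spte_mask(void)
{
	const u64 misconfig_bits = 0x6ull;	/* bits 2:0 = 110b: write/execute,
						 * no read -> guaranteed EPT
						 * misconfiguration */
	const u64 magic_bits = 0x3ull << 62;	/* bits 63:62 tag the spte as MMIO */

	return magic_bits | misconfig_bits;	/* 0xC000000000000006 */
}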
4585
4586#define VMX_XSS_EXIT_BITMAP 0
4587/*
4588 * Sets up the vmcs for emulated real mode.
4589 */
4590static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
4591{
4592#ifdef CONFIG_X86_64
4593        unsigned long a;
4594#endif
4595        int i;
4596
4597        /* I/O */
4598        vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a));
4599        vmcs_write64(IO_BITMAP_B, __pa(vmx_io_bitmap_b));
4600
4601        if (enable_shadow_vmcs) {
4602                vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
4603                vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
4604        }
4605        if (cpu_has_vmx_msr_bitmap())
4606                vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy));
4607
4608        vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
4609
4610        /* Control */
4611        vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx));
4612
4613        vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
4614
4615        if (cpu_has_secondary_exec_ctrls()) {
4616                vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
4617                                vmx_secondary_exec_control(vmx));
4618        }
4619
4620        if (vmx_vm_has_apicv(vmx->vcpu.kvm)) {
4621                vmcs_write64(EOI_EXIT_BITMAP0, 0);
4622                vmcs_write64(EOI_EXIT_BITMAP1, 0);
4623                vmcs_write64(EOI_EXIT_BITMAP2, 0);
4624                vmcs_write64(EOI_EXIT_BITMAP3, 0);
4625
4626                vmcs_write16(GUEST_INTR_STATUS, 0);
4627
4628                vmcs_write64(POSTED_INTR_NV, POSTED_INTR_VECTOR);
4629                vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc)));
4630        }
4631
4632        if (ple_gap) {
4633                vmcs_write32(PLE_GAP, ple_gap);
4634                vmx->ple_window = ple_window;
4635                vmx->ple_window_dirty = true;
4636        }
4637
4638        vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
4639        vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
4640        vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
4641
4642        vmcs_write16(HOST_FS_SELECTOR, 0);            /* 22.2.4 */
4643        vmcs_write16(HOST_GS_SELECTOR, 0);            /* 22.2.4 */
4644        vmx_set_constant_host_state(vmx);
4645#ifdef CONFIG_X86_64
4646        rdmsrl(MSR_FS_BASE, a);
4647        vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
4648        rdmsrl(MSR_GS_BASE, a);
4649        vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */
4650#else
4651        vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
4652        vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
4653#endif
4654
4655        vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
4656        vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
4657        vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
4658        vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
4659        vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
4660
4661        if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
4662                u32 msr_low, msr_high;
4663                u64 host_pat;
4664                rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
4665                host_pat = msr_low | ((u64) msr_high << 32);
4666                /* Write the default value following the host PAT */
4667                vmcs_write64(GUEST_IA32_PAT, host_pat);
4668                /* Keep arch.pat in sync with GUEST_IA32_PAT */
4669                vmx->vcpu.arch.pat = host_pat;
4670        }
4671
4672        for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) {
4673                u32 index = vmx_msr_index[i];
4674                u32 data_low, data_high;
4675                int j = vmx->nmsrs;
4676
4677                if (rdmsr_safe(index, &data_low, &data_high) < 0)
4678                        continue;
4679                if (wrmsr_safe(index, data_low, data_high) < 0)
4680                        continue;
4681                vmx->guest_msrs[j].index = i;
4682                vmx->guest_msrs[j].data = 0;
4683                vmx->guest_msrs[j].mask = -1ull;
4684                ++vmx->nmsrs;
4685        }
4686
4687
4688        vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl);
4689
4690        /* 22.2.1, 20.8.1 */
4691        vm_entry_controls_init(vmx, vmcs_config.vmentry_ctrl);
4692
4693        vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
4694        set_cr4_guest_host_mask(vmx);
4695
4696        if (vmx_xsaves_supported())
4697                vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP);
4698
4699        return 0;
4700}
4701
4702static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
4703{
4704        struct vcpu_vmx *vmx = to_vmx(vcpu);
4705        struct msr_data apic_base_msr;
4706
4707        vmx->rmode.vm86_active = 0;
4708
4709        vmx->soft_vnmi_blocked = 0;
4710
4711        vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
4712        kvm_set_cr8(&vmx->vcpu, 0);
4713        apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE;
4714        if (kvm_vcpu_is_bsp(&vmx->vcpu))
4715                apic_base_msr.data |= MSR_IA32_APICBASE_BSP;
4716        apic_base_msr.host_initiated = true;
4717        kvm_set_apic_base(&vmx->vcpu, &apic_base_msr);
4718
4719        vmx_segment_cache_clear(vmx);
4720
4721        seg_setup(VCPU_SREG_CS);
4722        vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
4723        vmcs_write32(GUEST_CS_BASE, 0xffff0000);
4724
4725        seg_setup(VCPU_SREG_DS);
4726        seg_setup(VCPU_SREG_ES);
4727        seg_setup(VCPU_SREG_FS);
4728        seg_setup(VCPU_SREG_GS);
4729        seg_setup(VCPU_SREG_SS);
4730
4731        vmcs_write16(GUEST_TR_SELECTOR, 0);
4732        vmcs_writel(GUEST_TR_BASE, 0);
4733        vmcs_write32(GUEST_TR_LIMIT, 0xffff);
4734        vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
4735
4736        vmcs_write16(GUEST_LDTR_SELECTOR, 0);
4737        vmcs_writel(GUEST_LDTR_BASE, 0);
4738        vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
4739        vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
4740
4741        vmcs_write32(GUEST_SYSENTER_CS, 0);
4742        vmcs_writel(GUEST_SYSENTER_ESP, 0);
4743        vmcs_writel(GUEST_SYSENTER_EIP, 0);
4744
4745        vmcs_writel(GUEST_RFLAGS, 0x02);
4746        kvm_rip_write(vcpu, 0xfff0);
4747
4748        vmcs_writel(GUEST_GDTR_BASE, 0);
4749        vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
4750
4751        vmcs_writel(GUEST_IDTR_BASE, 0);
4752        vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
4753
4754        vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
4755        vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
4756        vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
4757
4758        /* Special registers */
4759        vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
4760
4761        setup_msrs(vmx);
4762
4763        vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */
4764
4765        if (cpu_has_vmx_tpr_shadow()) {
4766                vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
4767                if (vm_need_tpr_shadow(vmx->vcpu.kvm))
4768                        vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
4769                                     __pa(vmx->vcpu.arch.apic->regs));
4770                vmcs_write32(TPR_THRESHOLD, 0);
4771        }
4772
4773        kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
4774
4775        if (vmx_vm_has_apicv(vcpu->kvm))
4776                memset(&vmx->pi_desc, 0, sizeof(struct pi_desc));
4777
4778        if (vmx->vpid != 0)
4779                vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
4780
4781        vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
4782        vmx_set_cr0(&vmx->vcpu, kvm_read_cr0(vcpu)); /* enter rmode */
4783        vmx_set_cr4(&vmx->vcpu, 0);
4784        vmx_set_efer(&vmx->vcpu, 0);
4785        vmx_fpu_activate(&vmx->vcpu);
4786        update_exception_bitmap(&vmx->vcpu);
4787
4788        vpid_sync_context(vmx);
4789}
4790
4791/*
4792 * In nested virtualization, check if L1 asked to exit on external interrupts.
4793 * For most existing hypervisors, this will always return true.
4794 */
4795static bool nested_exit_on_intr(struct kvm_vcpu *vcpu)
4796{
4797        return get_vmcs12(vcpu)->pin_based_vm_exec_control &
4798                PIN_BASED_EXT_INTR_MASK;
4799}
4800
4801/*
4802 * In nested virtualization, check if L1 has set
4803 * VM_EXIT_ACK_INTR_ON_EXIT
4804 */
4805static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
4806{
4807        return get_vmcs12(vcpu)->vm_exit_controls &
4808                VM_EXIT_ACK_INTR_ON_EXIT;
4809}
4810
4811static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu)
4812{
4813        return get_vmcs12(vcpu)->pin_based_vm_exec_control &
4814                PIN_BASED_NMI_EXITING;
4815}
4816
4817static void enable_irq_window(struct kvm_vcpu *vcpu)
4818{
4819        u32 cpu_based_vm_exec_control;
4820
4821        cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
4822        cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
4823        vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
4824}
4825
4826static void enable_nmi_window(struct kvm_vcpu *vcpu)
4827{
4828        u32 cpu_based_vm_exec_control;
4829
4830        if (!cpu_has_virtual_nmis() ||
4831            vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
4832                enable_irq_window(vcpu);
4833                return;
4834        }
4835
4836        cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
4837        cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
4838        vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
4839}
4840
4841static void vmx_inject_irq(struct kvm_vcpu *vcpu)
4842{
4843        struct vcpu_vmx *vmx = to_vmx(vcpu);
4844        uint32_t intr;
4845        int irq = vcpu->arch.interrupt.nr;
4846
4847        trace_kvm_inj_virq(irq);
4848
4849        ++vcpu->stat.irq_injections;
4850        if (vmx->rmode.vm86_active) {
4851                int inc_eip = 0;
4852                if (vcpu->arch.interrupt.soft)
4853                        inc_eip = vcpu->arch.event_exit_inst_len;
4854                if (kvm_inject_realmode_interrupt(vcpu, irq, inc_eip) != EMULATE_DONE)
4855                        kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
4856                return;
4857        }
4858        intr = irq | INTR_INFO_VALID_MASK;
4859        if (vcpu->arch.interrupt.soft) {
4860                intr |= INTR_TYPE_SOFT_INTR;
4861                vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
4862                             vmx->vcpu.arch.event_exit_inst_len);
4863        } else
4864                intr |= INTR_TYPE_EXT_INTR;
4865        vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
4866}
4867
4868static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
4869{
4870        struct vcpu_vmx *vmx = to_vmx(vcpu);
4871
4872        if (is_guest_mode(vcpu))
4873                return;
4874
4875        if (!cpu_has_virtual_nmis()) {
4876                /*
4877                 * Tracking the NMI-blocked state in software is built upon
4878                 * finding the next open IRQ window. This, in turn, depends on
4879                 * well-behaving guests: They have to keep IRQs disabled at
4880                 * least as long as the NMI handler runs. Otherwise we may
4881                 * cause NMI nesting, maybe breaking the guest. But as this is
4882                 * highly unlikely, we can live with the residual risk.
4883                 */
4884                vmx->soft_vnmi_blocked = 1;
4885                vmx->vnmi_blocked_time = 0;
4886        }
4887
4888        ++vcpu->stat.nmi_injections;
4889        vmx->nmi_known_unmasked = false;
4890        if (vmx->rmode.vm86_active) {
4891                if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE)
4892                        kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
4893                return;
4894        }
4895        vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
4896                        INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
4897}
4898
4899static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
4900{
4901        if (!cpu_has_virtual_nmis())
4902                return to_vmx(vcpu)->soft_vnmi_blocked;
4903        if (to_vmx(vcpu)->nmi_known_unmasked)
4904                return false;
4905        return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
4906}
4907
4908static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
4909{
4910        struct vcpu_vmx *vmx = to_vmx(vcpu);
4911
4912        if (!cpu_has_virtual_nmis()) {
4913                if (vmx->soft_vnmi_blocked != masked) {
4914                        vmx->soft_vnmi_blocked = masked;
4915                        vmx->vnmi_blocked_time = 0;
4916                }
4917        } else {
4918                vmx->nmi_known_unmasked = !masked;
4919                if (masked)
4920                        vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
4921                                      GUEST_INTR_STATE_NMI);
4922                else
4923                        vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
4924                                        GUEST_INTR_STATE_NMI);
4925        }
4926}
4927
4928static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
4929{
4930        if (to_vmx(vcpu)->nested.nested_run_pending)
4931                return 0;
4932
4933        if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked)
4934                return 0;
4935
4936        return  !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
4937                  (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI
4938                   | GUEST_INTR_STATE_NMI));
4939}
4940
4941static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
4942{
4943        return (!to_vmx(vcpu)->nested.nested_run_pending &&
4944                vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
4945                !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
4946                        (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
4947}
4948
4949static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
4950{
4951        int ret;
4952        struct kvm_userspace_memory_region tss_mem = {
4953                .slot = TSS_PRIVATE_MEMSLOT,
4954                .guest_phys_addr = addr,
4955                .memory_size = PAGE_SIZE * 3,
4956                .flags = 0,
4957        };
4958
4959        ret = kvm_set_memory_region(kvm, &tss_mem);
4960        if (ret)
4961                return ret;
4962        kvm->arch.tss_addr = addr;
4963        return init_rmode_tss(kvm);
4964}
4965
4966static bool rmode_exception(struct kvm_vcpu *vcpu, int vec)
4967{
4968        switch (vec) {
4969        case BP_VECTOR:
4970                /*
4971                 * Update instruction length as we may reinject the exception
4972                 * from user space while in guest debugging mode.
4973                 */
4974                to_vmx(vcpu)->vcpu.arch.event_exit_inst_len =
4975                        vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
4976                if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
4977                        return false;
4978                /* fall through */
4979        case DB_VECTOR:
4980                if (vcpu->guest_debug &
4981                        (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
4982                        return false;
4983                /* fall through */
4984        case DE_VECTOR:
4985        case OF_VECTOR:
4986        case BR_VECTOR:
4987        case UD_VECTOR:
4988        case DF_VECTOR:
4989        case SS_VECTOR:
4990        case GP_VECTOR:
4991        case MF_VECTOR:
4992                return true;
4993        break;
4994        }
4995        return false;
4996}
4997
4998static int handle_rmode_exception(struct kvm_vcpu *vcpu,
4999                                  int vec, u32 err_code)
5000{
5001        /*
5002         * An instruction with the address-size override prefix (opcode 0x67)
5003         * causes a #SS fault with error code 0 in VM86 mode.
5004         */
5005        if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) {
5006                if (emulate_instruction(vcpu, 0) == EMULATE_DONE) {
5007                        if (vcpu->arch.halt_request) {
5008                                vcpu->arch.halt_request = 0;
5009                                return kvm_emulate_halt(vcpu);
5010                        }
5011                        return 1;
5012                }
5013                return 0;
5014        }
5015
5016        /*
5017         * Forward all other exceptions that are valid in real mode.
5018         * FIXME: Breaks guest debugging in real mode, needs to be fixed with
5019         *        the required debugging infrastructure rework.
5020         */
5021        kvm_queue_exception(vcpu, vec);
5022        return 1;
5023}
5024
5025/*
5026 * Trigger a machine check on the host. We assume all the MSRs are already set
5027 * up by the CPU and that we still run on the same CPU the MCE occurred on.
5028 * We pass a fake environment to the machine check handler because we want
5029 * the guest to always be treated like user space, no matter what context
5030 * it used internally.
5031 */
5032static void kvm_machine_check(void)
5033{
5034#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64)
5035        struct pt_regs regs = {
5036                .cs = 3, /* Fake ring 3 no matter what the guest ran on */
5037                .flags = X86_EFLAGS_IF,
5038        };
5039
5040        do_machine_check(&regs, 0);
5041#endif
5042}
5043
5044static int handle_machine_check(struct kvm_vcpu *vcpu)
5045{
5046        /* already handled by vcpu_run */
5047        return 1;
5048}
5049
5050static int handle_exception(struct kvm_vcpu *vcpu)
5051{
5052        struct vcpu_vmx *vmx = to_vmx(vcpu);
5053        struct kvm_run *kvm_run = vcpu->run;
5054        u32 intr_info, ex_no, error_code;
5055        unsigned long cr2, rip, dr6;
5056        u32 vect_info;
5057        enum emulation_result er;
5058
5059        vect_info = vmx->idt_vectoring_info;
5060        intr_info = vmx->exit_intr_info;
5061
5062        if (is_machine_check(intr_info))
5063                return handle_machine_check(vcpu);
5064
5065        if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
5066                return 1;  /* already handled by vmx_vcpu_run() */
5067
5068        if (is_no_device(intr_info)) {
5069                vmx_fpu_activate(vcpu);
5070                return 1;
5071        }
5072
5073        if (is_invalid_opcode(intr_info)) {
5074                er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD);
5075                if (er != EMULATE_DONE)
5076                        kvm_queue_exception(vcpu, UD_VECTOR);
5077                return 1;
5078        }
5079
5080        error_code = 0;
5081        if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
5082                error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
5083
5084        /*
5085         * A #PF with PFEC.RSVD = 1 indicates the guest is accessing
5086         * MMIO; it is better to report an internal error.
5087         * See the comments in vmx_handle_exit.
5088         */
5089        if ((vect_info & VECTORING_INFO_VALID_MASK) &&
5090            !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) {
5091                vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
5092                vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
5093                vcpu->run->internal.ndata = 2;
5094                vcpu->run->internal.data[0] = vect_info;
5095                vcpu->run->internal.data[1] = intr_info;
5096                return 0;
5097        }
5098
5099        if (is_page_fault(intr_info)) {
5100                /* EPT won't cause page fault directly */
5101                BUG_ON(enable_ept);
5102                cr2 = vmcs_readl(EXIT_QUALIFICATION);
5103                trace_kvm_page_fault(cr2, error_code);
5104
5105                if (kvm_event_needs_reinjection(vcpu))
5106                        kvm_mmu_unprotect_page_virt(vcpu, cr2);
5107                return kvm_mmu_page_fault(vcpu, cr2, error_code, NULL, 0);
5108        }
5109
5110        ex_no = intr_info & INTR_INFO_VECTOR_MASK;
5111
5112        if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no))
5113                return handle_rmode_exception(vcpu, ex_no, error_code);
5114
5115        switch (ex_no) {
5116        case DB_VECTOR:
5117                dr6 = vmcs_readl(EXIT_QUALIFICATION);
5118                if (!(vcpu->guest_debug &
5119                      (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
5120                        vcpu->arch.dr6 &= ~15;
5121                        vcpu->arch.dr6 |= dr6 | DR6_RTM;
5122                        if (!(dr6 & ~DR6_RESERVED)) /* icebp */
5123                                skip_emulated_instruction(vcpu);
5124
5125                        kvm_queue_exception(vcpu, DB_VECTOR);
5126                        return 1;
5127                }
5128                kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1;
5129                kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
5130                /* fall through */
5131        case BP_VECTOR:
5132                /*
5133                 * Update instruction length as we may reinject #BP from
5134                 * user space while in guest debugging mode. Reading it for
5135                 * #DB as well causes no harm; it is not used in that case.
5136                 */
5137                vmx->vcpu.arch.event_exit_inst_len =
5138                        vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
5139                kvm_run->exit_reason = KVM_EXIT_DEBUG;
5140                rip = kvm_rip_read(vcpu);
5141                kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip;
5142                kvm_run->debug.arch.exception = ex_no;
5143                break;
5144        default:
5145                kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
5146                kvm_run->ex.exception = ex_no;
5147                kvm_run->ex.error_code = error_code;
5148                break;
5149        }
5150        return 0;
5151}
5152
5153static int handle_external_interrupt(struct kvm_vcpu *vcpu)
5154{
5155        ++vcpu->stat.irq_exits;
5156        return 1;
5157}
5158
5159static int handle_triple_fault(struct kvm_vcpu *vcpu)
5160{
5161        vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
5162        return 0;
5163}
5164
5165static int handle_io(struct kvm_vcpu *vcpu)
5166{
5167        unsigned long exit_qualification;
5168        int size, in, string;
5169        unsigned port;
5170
5171        exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5172        string = (exit_qualification & 16) != 0;
5173        in = (exit_qualification & 8) != 0;
5174
5175        ++vcpu->stat.io_exits;
5176
5177        if (string || in)
5178                return emulate_instruction(vcpu, 0) == EMULATE_DONE;
5179
5180        port = exit_qualification >> 16;
5181        size = (exit_qualification & 7) + 1;
5182        skip_emulated_instruction(vcpu);
5183
5184        return kvm_fast_pio_out(vcpu, size, port);
5185}
5186
5187static void
5188vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
5189{
5190        /*
5191         * Patch in the VMCALL instruction:
5192         */
5193        hypercall[0] = 0x0f;
5194        hypercall[1] = 0x01;
5195        hypercall[2] = 0xc1;
5196}
5197
5198static bool nested_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
5199{
5200        unsigned long always_on = VMXON_CR0_ALWAYSON;
5201        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5202
5203        if (to_vmx(vcpu)->nested.nested_vmx_secondary_ctls_high &
5204                SECONDARY_EXEC_UNRESTRICTED_GUEST &&
5205            nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
5206                always_on &= ~(X86_CR0_PE | X86_CR0_PG);
5207        return (val & always_on) == always_on;
5208}
5209
5210/* called to set cr0 as appropriate for a mov-to-cr0 exit. */
5211static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
5212{
5213        if (is_guest_mode(vcpu)) {
5214                struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5215                unsigned long orig_val = val;
5216
5217                /*
5218                 * We get here when L2 changed cr0 in a way that did not change
5219                 * any of L1's shadowed bits (see nested_vmx_exit_handled_cr),
5220                 * but did change L0 shadowed bits. So we first calculate the
5221                 * effective cr0 value that L1 would like to write into the
5222                 * hardware. It consists of the L2-owned bits from the new
5223                 * value combined with the L1-owned bits from L1's guest_cr0.
5224                 */
5225                val = (val & ~vmcs12->cr0_guest_host_mask) |
5226                        (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);
5227
5228                if (!nested_cr0_valid(vcpu, val))
5229                        return 1;
5230
5231                if (kvm_set_cr0(vcpu, val))
5232                        return 1;
5233                vmcs_writel(CR0_READ_SHADOW, orig_val);
5234                return 0;
5235        } else {
5236                if (to_vmx(vcpu)->nested.vmxon &&
5237                    ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON))
5238                        return 1;
5239                return kvm_set_cr0(vcpu, val);
5240        }
5241}
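
/*
 * Illustrative sketch, not part of the original vmx.c: the bit merge
 * described in the comment inside handle_set_cr0().  Bits that L1 owns
 * (set in cr0_guest_host_mask) are taken from L1's guest_cr0; bits that
 * L2 owns come from the value L2 just tried to write.
 */
static inline unsigned long example_nested_effective_cr0(unsigned long l2_val,
							 unsigned long l1_guest_cr0,
							 unsigned long l1_mask)
{
	return (l2_val & ~l1_mask) | (l1_guest_cr0 & l1_mask);
}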
5242
5243static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
5244{
5245        if (is_guest_mode(vcpu)) {
5246                struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5247                unsigned long orig_val = val;
5248
5249                /* analogously to handle_set_cr0 */
5250                val = (val & ~vmcs12->cr4_guest_host_mask) |
5251                        (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask);
5252                if (kvm_set_cr4(vcpu, val))
5253                        return 1;
5254                vmcs_writel(CR4_READ_SHADOW, orig_val);
5255                return 0;
5256        } else
5257                return kvm_set_cr4(vcpu, val);
5258}
5259
5260/* called to set cr0 as appropriate for a clts instruction exit. */
5261static void handle_clts(struct kvm_vcpu *vcpu)
5262{
5263        if (is_guest_mode(vcpu)) {
5264                /*
5265                 * We get here when L2 did CLTS, and L1 didn't shadow CR0.TS
5266                 * but we did (!fpu_active). We need to keep GUEST_CR0.TS on,
5267                 * but pretend it is off (also in arch.cr0, for fpu_activate).
5268                 */
5269                vmcs_writel(CR0_READ_SHADOW,
5270                        vmcs_readl(CR0_READ_SHADOW) & ~X86_CR0_TS);
5271                vcpu->arch.cr0 &= ~X86_CR0_TS;
5272        } else
5273                vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
5274}
5275
5276static int handle_cr(struct kvm_vcpu *vcpu)
5277{
5278        unsigned long exit_qualification, val;
5279        int cr;
5280        int reg;
5281        int err;
5282
5283        exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5284        cr = exit_qualification & 15;
5285        reg = (exit_qualification >> 8) & 15;
5286        switch ((exit_qualification >> 4) & 3) {
5287        case 0: /* mov to cr */
5288                val = kvm_register_readl(vcpu, reg);
5289                trace_kvm_cr_write(cr, val);
5290                switch (cr) {
5291                case 0:
5292                        err = handle_set_cr0(vcpu, val);
5293                        kvm_complete_insn_gp(vcpu, err);
5294                        return 1;
5295                case 3:
5296                        err = kvm_set_cr3(vcpu, val);
5297                        kvm_complete_insn_gp(vcpu, err);
5298                        return 1;
5299                case 4:
5300                        err = handle_set_cr4(vcpu, val);
5301                        kvm_complete_insn_gp(vcpu, err);
5302                        return 1;
5303                case 8: {
5304                                u8 cr8_prev = kvm_get_cr8(vcpu);
5305                                u8 cr8 = (u8)val;
5306                                err = kvm_set_cr8(vcpu, cr8);
5307                                kvm_complete_insn_gp(vcpu, err);
5308                                if (irqchip_in_kernel(vcpu->kvm))
5309                                        return 1;
5310                                if (cr8_prev <= cr8)
5311                                        return 1;
5312                                vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
5313                                return 0;
5314                        }
5315                }
5316                break;
5317        case 2: /* clts */
5318                handle_clts(vcpu);
5319                trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
5320                skip_emulated_instruction(vcpu);
5321                vmx_fpu_activate(vcpu);
5322                return 1;
5323        case 1: /* mov from cr */
5324                switch (cr) {
5325                case 3:
5326                        val = kvm_read_cr3(vcpu);
5327                        kvm_register_write(vcpu, reg, val);
5328                        trace_kvm_cr_read(cr, val);
5329                        skip_emulated_instruction(vcpu);
5330                        return 1;
5331                case 8:
5332                        val = kvm_get_cr8(vcpu);
5333                        kvm_register_write(vcpu, reg, val);
5334                        trace_kvm_cr_read(cr, val);
5335                        skip_emulated_instruction(vcpu);
5336                        return 1;
5337                }
5338                break;
5339        case 3: /* lmsw */
5340                val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
5341                trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val);
5342                kvm_lmsw(vcpu, val);
5343
5344                skip_emulated_instruction(vcpu);
5345                return 1;
5346        default:
5347                break;
5348        }
5349        vcpu->run->exit_reason = 0;
5350        vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
5351               (int)(exit_qualification >> 4) & 3, cr);
5352        return 0;
5353}
5354
5355static int handle_dr(struct kvm_vcpu *vcpu)
5356{
5357        unsigned long exit_qualification;
5358        int dr, dr7, reg;
5359
5360        exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5361        dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
5362
5363        /* First, if DR does not exist, trigger UD */
5364        if (!kvm_require_dr(vcpu, dr))
5365                return 1;
5366
5367        /* Do not handle if CPL > 0; a #GP will be triggered on re-entry */
5368        if (!kvm_require_cpl(vcpu, 0))
5369                return 1;
5370        dr7 = vmcs_readl(GUEST_DR7);
5371        if (dr7 & DR7_GD) {
5372                /*
5373                 * As the vm-exit takes precedence over the debug trap, we
5374                 * need to emulate the latter, either for the host or the
5375                 * guest debugging itself.
5376                 */
5377                if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
5378                        vcpu->run->debug.arch.dr6 = vcpu->arch.dr6;
5379                        vcpu->run->debug.arch.dr7 = dr7;
5380                        vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu);
5381                        vcpu->run->debug.arch.exception = DB_VECTOR;
5382                        vcpu->run->exit_reason = KVM_EXIT_DEBUG;
5383                        return 0;
5384                } else {
5385                        vcpu->arch.dr6 &= ~15;
5386                        vcpu->arch.dr6 |= DR6_BD | DR6_RTM;
5387                        kvm_queue_exception(vcpu, DB_VECTOR);
5388                        return 1;
5389                }
5390        }
5391
5392        if (vcpu->guest_debug == 0) {
5393                u32 cpu_based_vm_exec_control;
5394
5395                cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
5396                cpu_based_vm_exec_control &= ~CPU_BASED_MOV_DR_EXITING;
5397                vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
5398
5399                /*
5400                 * No more DR vmexits; force a reload of the debug registers
5401                 * and reenter on this instruction.  The next vmexit will
5402                 * retrieve the full state of the debug registers.
5403                 */
5404                vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
5405                return 1;
5406        }
5407
5408        reg = DEBUG_REG_ACCESS_REG(exit_qualification);
5409        if (exit_qualification & TYPE_MOV_FROM_DR) {
5410                unsigned long val;
5411
5412                if (kvm_get_dr(vcpu, dr, &val))
5413                        return 1;
5414                kvm_register_write(vcpu, reg, val);
5415        } else
5416                if (kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg)))
5417                        return 1;
5418
5419        skip_emulated_instruction(vcpu);
5420        return 1;
5421}
5422
5423static u64 vmx_get_dr6(struct kvm_vcpu *vcpu)
5424{
5425        return vcpu->arch.dr6;
5426}
5427
5428static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val)
5429{
5430}
5431
5432static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
5433{
5434        u32 cpu_based_vm_exec_control;
5435
5436        get_debugreg(vcpu->arch.db[0], 0);
5437        get_debugreg(vcpu->arch.db[1], 1);
5438        get_debugreg(vcpu->arch.db[2], 2);
5439        get_debugreg(vcpu->arch.db[3], 3);
5440        get_debugreg(vcpu->arch.dr6, 6);
5441        vcpu->arch.dr7 = vmcs_readl(GUEST_DR7);
5442
5443        vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
5444
5445        cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
5446        cpu_based_vm_exec_control |= CPU_BASED_MOV_DR_EXITING;
5447        vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
5448}
5449
5450static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
5451{
5452        vmcs_writel(GUEST_DR7, val);
5453}
5454
5455static int handle_cpuid(struct kvm_vcpu *vcpu)
5456{
5457        kvm_emulate_cpuid(vcpu);
5458        return 1;
5459}
5460
5461static int handle_rdmsr(struct kvm_vcpu *vcpu)
5462{
5463        u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
5464        u64 data;
5465
5466        if (vmx_get_msr(vcpu, ecx, &data)) {
5467                trace_kvm_msr_read_ex(ecx);
5468                kvm_inject_gp(vcpu, 0);
5469                return 1;
5470        }
5471
5472        trace_kvm_msr_read(ecx, data);
5473
5474        /* FIXME: handling of bits 32:63 of rax, rdx */
5475        vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u;
5476        vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
5477        skip_emulated_instruction(vcpu);
5478        return 1;
5479}
5480
5481static int handle_wrmsr(struct kvm_vcpu *vcpu)
5482{
5483        struct msr_data msr;
5484        u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
5485        u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
5486                | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
5487
5488        msr.data = data;
5489        msr.index = ecx;
5490        msr.host_initiated = false;
5491        if (kvm_set_msr(vcpu, &msr) != 0) {
5492                trace_kvm_msr_write_ex(ecx, data);
5493                kvm_inject_gp(vcpu, 0);
5494                return 1;
5495        }
5496
5497        trace_kvm_msr_write(ecx, data);
5498        skip_emulated_instruction(vcpu);
5499        return 1;
5500}
5501
5502static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
5503{
5504        kvm_make_request(KVM_REQ_EVENT, vcpu);
5505        return 1;
5506}
5507
5508static int handle_interrupt_window(struct kvm_vcpu *vcpu)
5509{
5510        u32 cpu_based_vm_exec_control;
5511
5512        /* clear pending irq */
5513        cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
5514        cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
5515        vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
5516
5517        kvm_make_request(KVM_REQ_EVENT, vcpu);
5518
5519        ++vcpu->stat.irq_window_exits;
5520
5521        /*
5522         * If user space is waiting to inject interrupts, exit as soon as
5523         * possible.
5524         */
5525        if (!irqchip_in_kernel(vcpu->kvm) &&
5526            vcpu->run->request_interrupt_window &&
5527            !kvm_cpu_has_interrupt(vcpu)) {
5528                vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
5529                return 0;
5530        }
5531        return 1;
5532}
5533
5534static int handle_halt(struct kvm_vcpu *vcpu)
5535{
5536        skip_emulated_instruction(vcpu);
5537        return kvm_emulate_halt(vcpu);
5538}
5539
5540static int handle_vmcall(struct kvm_vcpu *vcpu)
5541{
5542        skip_emulated_instruction(vcpu);
5543        kvm_emulate_hypercall(vcpu);
5544        return 1;
5545}
5546
5547static int handle_invd(struct kvm_vcpu *vcpu)
5548{
5549        return emulate_instruction(vcpu, 0) == EMULATE_DONE;
5550}
5551
5552static int handle_invlpg(struct kvm_vcpu *vcpu)
5553{
5554        unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5555
5556        kvm_mmu_invlpg(vcpu, exit_qualification);
5557        skip_emulated_instruction(vcpu);
5558        return 1;
5559}
5560
5561static int handle_rdpmc(struct kvm_vcpu *vcpu)
5562{
5563        int err;
5564
5565        err = kvm_rdpmc(vcpu);
5566        kvm_complete_insn_gp(vcpu, err);
5567
5568        return 1;
5569}
5570
5571static int handle_wbinvd(struct kvm_vcpu *vcpu)
5572{
5573        skip_emulated_instruction(vcpu);
5574        kvm_emulate_wbinvd(vcpu);
5575        return 1;
5576}
5577
5578static int handle_xsetbv(struct kvm_vcpu *vcpu)
5579{
5580        u64 new_bv = kvm_read_edx_eax(vcpu);
5581        u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX);
5582
5583        if (kvm_set_xcr(vcpu, index, new_bv) == 0)
5584                skip_emulated_instruction(vcpu);
5585        return 1;
5586}
5587
5588static int handle_xsaves(struct kvm_vcpu *vcpu)
5589{
5590        skip_emulated_instruction(vcpu);
5591        WARN(1, "this should never happen\n");
5592        return 1;
5593}
5594
5595static int handle_xrstors(struct kvm_vcpu *vcpu)
5596{
5597        skip_emulated_instruction(vcpu);
5598        WARN(1, "this should never happen\n");
5599        return 1;
5600}
5601
5602static int handle_apic_access(struct kvm_vcpu *vcpu)
5603{
5604        if (likely(fasteoi)) {
5605                unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5606                int access_type, offset;
5607
5608                access_type = exit_qualification & APIC_ACCESS_TYPE;
5609                offset = exit_qualification & APIC_ACCESS_OFFSET;
5610                /*
5611                 * A sane guest uses MOV to write the EOI register, and the
5612                 * written value does not matter. Short-circuit here to avoid
5613                 * heavy instruction emulation.
5614                 */
5615                if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) &&
5616                    (offset == APIC_EOI)) {
5617                        kvm_lapic_set_eoi(vcpu);
5618                        skip_emulated_instruction(vcpu);
5619                        return 1;
5620                }
5621        }
5622        return emulate_instruction(vcpu, 0) == EMULATE_DONE;
5623}
5624
5625static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
5626{
5627        unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5628        int vector = exit_qualification & 0xff;
5629
5630        /* EOI-induced VM exit is trap-like and thus no need to adjust IP */
5631        kvm_apic_set_eoi_accelerated(vcpu, vector);
5632        return 1;
5633}
5634
5635static int handle_apic_write(struct kvm_vcpu *vcpu)
5636{
5637        unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5638        u32 offset = exit_qualification & 0xfff;
5639
5640        /* APIC-write VM exit is trap-like and thus no need to adjust IP */
5641        kvm_apic_write_nodecode(vcpu, offset);
5642        return 1;
5643}
5644
5645static int handle_task_switch(struct kvm_vcpu *vcpu)
5646{
5647        struct vcpu_vmx *vmx = to_vmx(vcpu);
5648        unsigned long exit_qualification;
5649        bool has_error_code = false;
5650        u32 error_code = 0;
5651        u16 tss_selector;
5652        int reason, type, idt_v, idt_index;
5653
5654        idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
5655        idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK);
5656        type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
5657
5658        exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5659
5660        reason = (u32)exit_qualification >> 30;
5661        if (reason == TASK_SWITCH_GATE && idt_v) {
5662                switch (type) {
5663                case INTR_TYPE_NMI_INTR:
5664                        vcpu->arch.nmi_injected = false;
5665                        vmx_set_nmi_mask(vcpu, true);
5666                        break;
5667                case INTR_TYPE_EXT_INTR:
5668                case INTR_TYPE_SOFT_INTR:
5669                        kvm_clear_interrupt_queue(vcpu);
5670                        break;
5671                case INTR_TYPE_HARD_EXCEPTION:
5672                        if (vmx->idt_vectoring_info &
5673                            VECTORING_INFO_DELIVER_CODE_MASK) {
5674                                has_error_code = true;
5675                                error_code =
5676                                        vmcs_read32(IDT_VECTORING_ERROR_CODE);
5677                        }
5678                        /* fall through */
5679                case INTR_TYPE_SOFT_EXCEPTION:
5680                        kvm_clear_exception_queue(vcpu);
5681                        break;
5682                default:
5683                        break;
5684                }
5685        }
5686        tss_selector = exit_qualification;
5687
5688        if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION &&
5689                       type != INTR_TYPE_EXT_INTR &&
5690                       type != INTR_TYPE_NMI_INTR))
5691                skip_emulated_instruction(vcpu);
5692
5693        if (kvm_task_switch(vcpu, tss_selector,
5694                            type == INTR_TYPE_SOFT_INTR ? idt_index : -1, reason,
5695                            has_error_code, error_code) == EMULATE_FAIL) {
5696                vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
5697                vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
5698                vcpu->run->internal.ndata = 0;
5699                return 0;
5700        }
5701
5702        /* clear all local breakpoint enable flags */
5703        vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~0x155);
5704
5705        /*
5706         * TODO: What about debug traps on tss switch?
5707         *       Are we supposed to inject them and update dr6?
5708         */
5709
5710        return 1;
5711}
5712
5713static int handle_ept_violation(struct kvm_vcpu *vcpu)
5714{
5715        unsigned long exit_qualification;
5716        gpa_t gpa;
5717        u32 error_code;
5718        int gla_validity;
5719
5720        exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5721
5722        gla_validity = (exit_qualification >> 7) & 0x3;
5723        if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) {
5724                printk(KERN_ERR "EPT: Handling EPT violation failed!\n");
5725                printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n",
5726                        (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS),
5727                        vmcs_readl(GUEST_LINEAR_ADDRESS));
5728                printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n",
5729                        (long unsigned int)exit_qualification);
5730                vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
5731                vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION;
5732                return 0;
5733        }
5734
5735        /*
5736         * If the EPT violation happened while executing IRET from an NMI,
5737         * the "blocked by NMI" bit has to be set before the next VM entry.
5738         * There are errata that may cause this bit not to be set:
5739         * AAK134, BY25.
5740         */
5741        if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
5742                        cpu_has_virtual_nmis() &&
5743                        (exit_qualification & INTR_INFO_UNBLOCK_NMI))
5744                vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI);
5745
5746        gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
5747        trace_kvm_page_fault(gpa, exit_qualification);
5748
5749        /* Is it a write fault? */
5750        error_code = exit_qualification & PFERR_WRITE_MASK;
5751        /* Is it a fetch fault? */
5752        error_code |= (exit_qualification << 2) & PFERR_FETCH_MASK;
5753        /* Is the EPT page-table entry present? */
5754        error_code |= (exit_qualification >> 3) & PFERR_PRESENT_MASK;
5755
5756        vcpu->arch.exit_qualification = exit_qualification;
5757
5758        return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
5759}
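
/*
 * Illustrative sketch, not part of the original vmx.c: the error-code
 * construction performed in handle_ept_violation(), with the relevant
 * exit-qualification bits written out explicitly (bit 1: the access was
 * a write, bit 2: the access was an instruction fetch, bit 3: the
 * translation was present/readable).
 */
static inline u32 example_ept_violation_error_code(unsigned long exit_qual)
{
	u32 error_code = 0;

	if (exit_qual & (1 << 1))		/* write access */
		error_code |= PFERR_WRITE_MASK;
	if (exit_qual & (1 << 2))		/* instruction fetch */
		error_code |= PFERR_FETCH_MASK;
	if (exit_qual & (1 << 3))		/* entry was present/readable */
		error_code |= PFERR_PRESENT_MASK;

	return error_code;
}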
5760
5761static u64 ept_rsvd_mask(u64 spte, int level)
5762{
5763        int i;
5764        u64 mask = 0;
5765
5766        for (i = 51; i > boot_cpu_data.x86_phys_bits; i--)
5767                mask |= (1ULL << i);
5768
5769        if (level == 4)
5770                /* bits 7:3 reserved */
5771                mask |= 0xf8;
5772        else if (spte & (1ULL << 7))
5773                /*
5774                 * 1GB/2MB page, bits 29:12 or 20:12 reserved respectively,
5775                 * level == 1 if the hypervisor is using the ignored bit 7.
5776                 */
5777                mask |= (PAGE_SIZE << ((level - 1) * 9)) - PAGE_SIZE;
5778        else if (level > 1)
5779                /* bits 6:3 reserved */
5780                mask |= 0x78;
5781
5782        return mask;
5783}
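
/*
 * Worked example, not part of the original vmx.c, for the large-page
 * branch of ept_rsvd_mask() above: for a 2MB page (level == 2 with
 * bit 7 set), (PAGE_SIZE << 9) - PAGE_SIZE = 0x200000 - 0x1000 =
 * 0x1ff000, i.e. bits 20:12 are reserved; for a 1GB page (level == 3),
 * (PAGE_SIZE << 18) - PAGE_SIZE = 0x40000000 - 0x1000 = 0x3ffff000,
 * i.e. bits 29:12 are reserved, matching the comment in the function.
 */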
5784
5785static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte,
5786                                       int level)
5787{
5788        printk(KERN_ERR "%s: spte 0x%llx level %d\n", __func__, spte, level);
5789
5790        /* 010b (write-only) */
5791        WARN_ON((spte & 0x7) == 0x2);
5792
5793        /* 110b (write/execute) */
5794        WARN_ON((spte & 0x7) == 0x6);
5795
5796        /* 100b (execute-only) and value not supported by logical processor */
5797        if (!cpu_has_vmx_ept_execute_only())
5798                WARN_ON((spte & 0x7) == 0x4);
5799
5800        /* not 000b */
5801        if ((spte & 0x7)) {
5802                u64 rsvd_bits = spte & ept_rsvd_mask(spte, level);
5803
5804                if (rsvd_bits != 0) {
5805                        printk(KERN_ERR "%s: rsvd_bits = 0x%llx\n",
5806                                         __func__, rsvd_bits);
5807                        WARN_ON(1);
5808                }
5809
5810                /* bits 5:3 are _not_ reserved for large page or leaf page */
5811                if ((rsvd_bits & 0x38) == 0) {
5812                        u64 ept_mem_type = (spte & 0x38) >> 3;
5813
5814                        if (ept_mem_type == 2 || ept_mem_type == 3 ||
5815                            ept_mem_type == 7) {
5816                                printk(KERN_ERR "%s: ept_mem_type=0x%llx\n",
5817                                                __func__, ept_mem_type);
5818                                WARN_ON(1);
5819                        }
5820                }
5821        }
5822}
5823
5824static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
5825{
5826        u64 sptes[4];
5827        int nr_sptes, i, ret;
5828        gpa_t gpa;
5829
5830        gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
5831        if (!kvm_io_bus_write(vcpu->kvm, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
5832                skip_emulated_instruction(vcpu);
5833                return 1;
5834        }
5835
5836        ret = handle_mmio_page_fault_common(vcpu, gpa, true);
5837        if (likely(ret == RET_MMIO_PF_EMULATE))
5838                return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) ==
5839                                              EMULATE_DONE;
5840
5841        if (unlikely(ret == RET_MMIO_PF_INVALID))
5842                return kvm_mmu_page_fault(vcpu, gpa, 0, NULL, 0);
5843
5844        if (unlikely(ret == RET_MMIO_PF_RETRY))
5845                return 1;
5846
5847        /* It is a real EPT misconfig */
5848        printk(KERN_ERR "EPT: Misconfiguration.\n");
5849        printk(KERN_ERR "EPT: GPA: 0x%llx\n", gpa);
5850
5851        nr_sptes = kvm_mmu_get_spte_hierarchy(vcpu, gpa, sptes);
5852
5853        for (i = PT64_ROOT_LEVEL; i > PT64_ROOT_LEVEL - nr_sptes; --i)
5854                ept_misconfig_inspect_spte(vcpu, sptes[i-1], i);
5855
5856        vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
5857        vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG;
5858
5859        return 0;
5860}
5861
5862static int handle_nmi_window(struct kvm_vcpu *vcpu)
5863{
5864        u32 cpu_based_vm_exec_control;
5865
5866        /* clear pending NMI */
5867        cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
5868        cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
5869        vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
5870        ++vcpu->stat.nmi_window_exits;
5871        kvm_make_request(KVM_REQ_EVENT, vcpu);
5872
5873        return 1;
5874}
5875
5876static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
5877{
5878        struct vcpu_vmx *vmx = to_vmx(vcpu);
5879        enum emulation_result err = EMULATE_DONE;
5880        int ret = 1;
5881        u32 cpu_exec_ctrl;
5882        bool intr_window_requested;
5883        unsigned count = 130;
5884
5885        cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
5886        intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING;
5887
5888        while (vmx->emulation_required && count-- != 0) {
5889                if (intr_window_requested && vmx_interrupt_allowed(vcpu))
5890                        return handle_interrupt_window(&vmx->vcpu);
5891
5892                if (test_bit(KVM_REQ_EVENT, &vcpu->requests))
5893                        return 1;
5894
5895                err = emulate_instruction(vcpu, EMULTYPE_NO_REEXECUTE);
5896
5897                if (err == EMULATE_USER_EXIT) {
5898                        ++vcpu->stat.mmio_exits;
5899                        ret = 0;
5900                        goto out;
5901                }
5902
5903                if (err != EMULATE_DONE) {
5904                        vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
5905                        vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
5906                        vcpu->run->internal.ndata = 0;
5907                        return 0;
5908                }
5909
5910                if (vcpu->arch.halt_request) {
5911                        vcpu->arch.halt_request = 0;
5912                        ret = kvm_emulate_halt(vcpu);
5913                        goto out;
5914                }
5915
5916                if (signal_pending(current))
5917                        goto out;
5918                if (need_resched())
5919                        schedule();
5920        }
5921
5922out:
5923        return ret;
5924}
5925
5926static int __grow_ple_window(int val)
5927{
5928        if (ple_window_grow < 1)
5929                return ple_window;
5930
5931        val = min(val, ple_window_actual_max);
5932
5933        if (ple_window_grow < ple_window)
5934                val *= ple_window_grow;
5935        else
5936                val += ple_window_grow;
5937
5938        return val;
5939}
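    /*
     * Worked example, assuming typical module defaults where ple_window_grow
     * is a small factor (e.g. 2, i.e. less than ple_window): a window of 4096
     * grows to 8192, then 16384, ..., capped at ple_window_actual_max.  A
     * ple_window_grow value larger than ple_window would instead be treated
     * as an additive increment.
     */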
5940
5941static int __shrink_ple_window(int val, int modifier, int minimum)
5942{
5943        if (modifier < 1)
5944                return ple_window;
5945
5946        if (modifier < ple_window)
5947                val /= modifier;
5948        else
5949                val -= modifier;
5950
5951        return max(val, minimum);
5952}
5953
5954static void grow_ple_window(struct kvm_vcpu *vcpu)
5955{
5956        struct vcpu_vmx *vmx = to_vmx(vcpu);
5957        int old = vmx->ple_window;
5958
5959        vmx->ple_window = __grow_ple_window(old);
5960
5961        if (vmx->ple_window != old)
5962                vmx->ple_window_dirty = true;
5963
5964        trace_kvm_ple_window_grow(vcpu->vcpu_id, vmx->ple_window, old);
5965}
5966
5967static void shrink_ple_window(struct kvm_vcpu *vcpu)
5968{
5969        struct vcpu_vmx *vmx = to_vmx(vcpu);
5970        int old = vmx->ple_window;
5971
5972        vmx->ple_window = __shrink_ple_window(old,
5973                                              ple_window_shrink, ple_window);
5974
5975        if (vmx->ple_window != old)
5976                vmx->ple_window_dirty = true;
5977
5978        trace_kvm_ple_window_shrink(vcpu->vcpu_id, vmx->ple_window, old);
5979}
5980
5981/*
5982 * ple_window_actual_max is computed to be one grow_ple_window() below
5983 * ple_window_max. (See __grow_ple_window for the reason.)
5984 * This prevents overflows, because ple_window_max is an int.
5985 * ple_window_max is effectively rounded down to a multiple of ple_window_grow
5986 * in this process.
5987 * ple_window_max is also prevented from pushing vmx->ple_window below ple_window.
5988 */
5989static void update_ple_window_actual_max(void)
5990{
5991        ple_window_actual_max =
5992                        __shrink_ple_window(max(ple_window_max, ple_window),
5993                                            ple_window_grow, INT_MIN);
5994}
5995
5996static __init int hardware_setup(void)
5997{
5998        int r = -ENOMEM, i, msr;
5999
6000        rdmsrl_safe(MSR_EFER, &host_efer);
6001
6002        for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i)
6003                kvm_define_shared_msr(i, vmx_msr_index[i]);
6004
6005        vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL);
6006        if (!vmx_io_bitmap_a)
6007                return r;
6008
6009        vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL);
6010        if (!vmx_io_bitmap_b)
6011                goto out;
6012
6013        vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL);
6014        if (!vmx_msr_bitmap_legacy)
6015                goto out1;
6016
6017        vmx_msr_bitmap_legacy_x2apic =
6018                                (unsigned long *)__get_free_page(GFP_KERNEL);
6019        if (!vmx_msr_bitmap_legacy_x2apic)
6020                goto out2;
6021
6022        vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL);
6023        if (!vmx_msr_bitmap_longmode)
6024                goto out3;
6025
6026        vmx_msr_bitmap_longmode_x2apic =
6027                                (unsigned long *)__get_free_page(GFP_KERNEL);
6028        if (!vmx_msr_bitmap_longmode_x2apic)
6029                goto out4;
6030
6031        if (nested) {
6032                vmx_msr_bitmap_nested =
6033                        (unsigned long *)__get_free_page(GFP_KERNEL);
6034                if (!vmx_msr_bitmap_nested)
6035                        goto out5;
6036        }
6037
6038        vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
6039        if (!vmx_vmread_bitmap)
6040                goto out6;
6041
6042        vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
6043        if (!vmx_vmwrite_bitmap)
6044                goto out7;
6045
6046        memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
6047        memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
6048
6049        /*
6050         * Allow direct access to the PC debug port (it is often used for I/O
6051         * delays, but the vmexits simply slow things down).
6052         */
6053        memset(vmx_io_bitmap_a, 0xff, PAGE_SIZE);
6054        clear_bit(0x80, vmx_io_bitmap_a);
6055
6056        memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE);
6057
6058        memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
6059        memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
6060        if (nested)
6061                memset(vmx_msr_bitmap_nested, 0xff, PAGE_SIZE);
6062
6063        if (setup_vmcs_config(&vmcs_config) < 0) {
6064                r = -EIO;
6065                goto out8;
6066        }
6067
6068        if (boot_cpu_has(X86_FEATURE_NX))
6069                kvm_enable_efer_bits(EFER_NX);
6070
6071        if (!cpu_has_vmx_vpid())
6072                enable_vpid = 0;
6073        if (!cpu_has_vmx_shadow_vmcs())
6074                enable_shadow_vmcs = 0;
6075        if (enable_shadow_vmcs)
6076                init_vmcs_shadow_fields();
6077
6078        if (!cpu_has_vmx_ept() ||
6079            !cpu_has_vmx_ept_4levels()) {
6080                enable_ept = 0;
6081                enable_unrestricted_guest = 0;
6082                enable_ept_ad_bits = 0;
6083        }
6084
6085        if (!cpu_has_vmx_ept_ad_bits())
6086                enable_ept_ad_bits = 0;
6087
6088        if (!cpu_has_vmx_unrestricted_guest())
6089                enable_unrestricted_guest = 0;
6090
6091        if (!cpu_has_vmx_flexpriority())
6092                flexpriority_enabled = 0;
6093
6094        /*
6095         * set_apic_access_page_addr() is used to reload apic access
6096         * page upon invalidation.  No need to do anything if not
6097         * using the APIC_ACCESS_ADDR VMCS field.
6098         */
6099        if (!flexpriority_enabled)
6100                kvm_x86_ops->set_apic_access_page_addr = NULL;
6101
6102        if (!cpu_has_vmx_tpr_shadow())
6103                kvm_x86_ops->update_cr8_intercept = NULL;
6104
6105        if (enable_ept && !cpu_has_vmx_ept_2m_page())
6106                kvm_disable_largepages();
6107
6108        if (!cpu_has_vmx_ple())
6109                ple_gap = 0;
6110
6111        if (!cpu_has_vmx_apicv())
6112                enable_apicv = 0;
6113
6114        if (enable_apicv)
6115                kvm_x86_ops->update_cr8_intercept = NULL;
6116        else {
6117                kvm_x86_ops->hwapic_irr_update = NULL;
6118                kvm_x86_ops->hwapic_isr_update = NULL;
6119                kvm_x86_ops->deliver_posted_interrupt = NULL;
6120                kvm_x86_ops->sync_pir_to_irr = vmx_sync_pir_to_irr_dummy;
6121        }
6122
6123        vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
6124        vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
6125        vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
6126        vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
6127        vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
6128        vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
6129        vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true);
6130
6131        memcpy(vmx_msr_bitmap_legacy_x2apic,
6132                        vmx_msr_bitmap_legacy, PAGE_SIZE);
6133        memcpy(vmx_msr_bitmap_longmode_x2apic,
6134                        vmx_msr_bitmap_longmode, PAGE_SIZE);
6135
6136        if (enable_apicv) {
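                    /* MSRs 0x800-0x8ff form the x2APIC register address range. */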
6137                for (msr = 0x800; msr <= 0x8ff; msr++)
6138                        vmx_disable_intercept_msr_read_x2apic(msr);
6139
6140                /* According to the SDM, in x2apic mode the whole ID register
6141                 * is used, but KVM only uses the highest eight bits, so reads
6142                 * of the ID register must still be intercepted. */
6143                vmx_enable_intercept_msr_read_x2apic(0x802);
6144                /* TMCCT */
6145                vmx_enable_intercept_msr_read_x2apic(0x839);
6146                /* TPR */
6147                vmx_disable_intercept_msr_write_x2apic(0x808);
6148                /* EOI */
6149                vmx_disable_intercept_msr_write_x2apic(0x80b);
6150                /* SELF-IPI */
6151                vmx_disable_intercept_msr_write_x2apic(0x83f);
6152        }
6153
6154        if (enable_ept) {
6155                kvm_mmu_set_mask_ptes(0ull,
6156                        (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull,
6157                        (enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull,
6158                        0ull, VMX_EPT_EXECUTABLE_MASK);
6159                ept_set_mmio_spte_mask();
6160                kvm_enable_tdp();
6161        } else
6162                kvm_disable_tdp();
6163
6164        update_ple_window_actual_max();
6165
6166        /*
6167         * Only enable PML when hardware supports PML feature, and both EPT
6168         * and EPT A/D bit features are enabled -- PML depends on them to work.
6169         */
6170        if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml())
6171                enable_pml = 0;
6172
6173        if (!enable_pml) {
6174                kvm_x86_ops->slot_enable_log_dirty = NULL;
6175                kvm_x86_ops->slot_disable_log_dirty = NULL;
6176                kvm_x86_ops->flush_log_dirty = NULL;
6177                kvm_x86_ops->enable_log_dirty_pt_masked = NULL;
6178        }
6179
6180        return alloc_kvm_area();
6181
6182out8:
6183        free_page((unsigned long)vmx_vmwrite_bitmap);
6184out7:
6185        free_page((unsigned long)vmx_vmread_bitmap);
6186out6:
6187        if (nested)
6188                free_page((unsigned long)vmx_msr_bitmap_nested);
6189out5:
6190        free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
6191out4:
6192        free_page((unsigned long)vmx_msr_bitmap_longmode);
6193out3:
6194        free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
6195out2:
6196        free_page((unsigned long)vmx_msr_bitmap_legacy);
6197out1:
6198        free_page((unsigned long)vmx_io_bitmap_b);
6199out:
6200        free_page((unsigned long)vmx_io_bitmap_a);
6201
6202        return r;
6203}
6204
6205static __exit void hardware_unsetup(void)
6206{
6207        free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
6208        free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
6209        free_page((unsigned long)vmx_msr_bitmap_legacy);
6210        free_page((unsigned long)vmx_msr_bitmap_longmode);
6211        free_page((unsigned long)vmx_io_bitmap_b);
6212        free_page((unsigned long)vmx_io_bitmap_a);
6213        free_page((unsigned long)vmx_vmwrite_bitmap);
6214        free_page((unsigned long)vmx_vmread_bitmap);
6215        if (nested)
6216                free_page((unsigned long)vmx_msr_bitmap_nested);
6217
6218        free_kvm_area();
6219}
6220
6221/*
6222 * Indicate that a vcpu is busy-waiting in a spinlock. Plain PAUSE exiting is
6223 * not enabled, so we only get here on CPUs with PAUSE-Loop-Exiting.
6224 */
6225static int handle_pause(struct kvm_vcpu *vcpu)
6226{
6227        if (ple_gap)
6228                grow_ple_window(vcpu);
6229
6230        skip_emulated_instruction(vcpu);
6231        kvm_vcpu_on_spin(vcpu);
6232
6233        return 1;
6234}
6235
6236static int handle_nop(struct kvm_vcpu *vcpu)
6237{
6238        skip_emulated_instruction(vcpu);
6239        return 1;
6240}
6241
6242static int handle_mwait(struct kvm_vcpu *vcpu)
6243{
6244        printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
6245        return handle_nop(vcpu);
6246}
6247
6248static int handle_monitor(struct kvm_vcpu *vcpu)
6249{
6250        printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
6251        return handle_nop(vcpu);
6252}
6253
6254/*
6255 * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12.
6256 * We could reuse a single VMCS for all the L2 guests, but we also want the
6257 * option to allocate a separate vmcs02 for each separate loaded vmcs12 - this
6258 * allows keeping them loaded on the processor, and in the future will allow
6259 * optimizations where prepare_vmcs02 doesn't need to set all the fields on
6260 * every entry if they never change.
6261 * So we keep, in vmx->nested.vmcs02_pool, a cache of size VMCS02_POOL_SIZE
6262 * (>=0) with a vmcs02 for each recently loaded vmcs12, most recent first.
6263 *
6264 * The following functions allocate and free a vmcs02 in this pool.
6265 */
6266
6267/* Get a VMCS from the pool to use as vmcs02 for the current vmcs12. */
6268static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx)
6269{
6270        struct vmcs02_list *item;
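            /*
             * The pool is kept in most-recently-used order: a hit is moved to
             * the head of the list, so the tail entry is the one recycled
             * below when the pool is full.
             */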
6271        list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
6272                if (item->vmptr == vmx->nested.current_vmptr) {
6273                        list_move(&item->list, &vmx->nested.vmcs02_pool);
6274                        return &item->vmcs02;
6275                }
6276
6277        if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) {
6278                /* Recycle the least recently used VMCS. */
6279                item = list_entry(vmx->nested.vmcs02_pool.prev,
6280                        struct vmcs02_list, list);
6281                item->vmptr = vmx->nested.current_vmptr;
6282                list_move(&item->list, &vmx->nested.vmcs02_pool);
6283                return &item->vmcs02;
6284        }
6285
6286        /* Create a new VMCS */
6287        item = kmalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
6288        if (!item)
6289                return NULL;
6290        item->vmcs02.vmcs = alloc_vmcs();
6291        if (!item->vmcs02.vmcs) {
6292                kfree(item);
6293                return NULL;
6294        }
6295        loaded_vmcs_init(&item->vmcs02);
6296        item->vmptr = vmx->nested.current_vmptr;
6297        list_add(&(item->list), &(vmx->nested.vmcs02_pool));
6298        vmx->nested.vmcs02_num++;
6299        return &item->vmcs02;
6300}
6301
6302/* Free and remove from pool a vmcs02 saved for a vmcs12 (if there is one) */
6303static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr)
6304{
6305        struct vmcs02_list *item;
6306        list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
6307                if (item->vmptr == vmptr) {
6308                        free_loaded_vmcs(&item->vmcs02);
6309                        list_del(&item->list);
6310                        kfree(item);
6311                        vmx->nested.vmcs02_num--;
6312                        return;
6313                }
6314}
6315
6316/*
6317 * Free all VMCSs saved for this vcpu, except the one pointed by
6318 * vmx->loaded_vmcs. We must be running L1, so vmx->loaded_vmcs
6319 * must be &vmx->vmcs01.
6320 */
6321static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx)
6322{
6323        struct vmcs02_list *item, *n;
6324
6325        WARN_ON(vmx->loaded_vmcs != &vmx->vmcs01);
6326        list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) {
6327                /*
6328                 * Something will leak if the above WARN triggers.  Better than
6329                 * a use-after-free.
6330                 */
6331                if (vmx->loaded_vmcs == &item->vmcs02)
6332                        continue;
6333
6334                free_loaded_vmcs(&item->vmcs02);
6335                list_del(&item->list);
6336                kfree(item);
6337                vmx->nested.vmcs02_num--;
6338        }
6339}
6340
6341/*
6342 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
6343 * set the success or error code of an emulated VMX instruction, as specified
6344 * by Vol 2B, VMX Instruction Reference, "Conventions".
6345 */
6346static void nested_vmx_succeed(struct kvm_vcpu *vcpu)
6347{
6348        vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
6349                        & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
6350                            X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
6351}
6352
6353static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
6354{
6355        vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
6356                        & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
6357                            X86_EFLAGS_SF | X86_EFLAGS_OF))
6358                        | X86_EFLAGS_CF);
6359}
6360
6361static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
6362                                        u32 vm_instruction_error)
6363{
6364        if (to_vmx(vcpu)->nested.current_vmptr == -1ull) {
6365                /*
6366                 * failValid writes the error number to the current VMCS, which
6367                 * can't be done if there isn't a current VMCS.
6368                 */
6369                nested_vmx_failInvalid(vcpu);
6370                return;
6371        }
6372        vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
6373                        & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
6374                            X86_EFLAGS_SF | X86_EFLAGS_OF))
6375                        | X86_EFLAGS_ZF);
6376        get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
6377        /*
6378         * We don't need to force a shadow sync because
6379         * VM_INSTRUCTION_ERROR is not shadowed
6380         */
6381}
6382
6383static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
6384{
6385        /* TODO: don't simply reset the guest here. */
6386        kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
6387        pr_warn("kvm: nested vmx abort, indicator %d\n", indicator);
6388}
6389
6390static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
6391{
6392        struct vcpu_vmx *vmx =
6393                container_of(timer, struct vcpu_vmx, nested.preemption_timer);
6394
6395        vmx->nested.preemption_timer_expired = true;
6396        kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
6397        kvm_vcpu_kick(&vmx->vcpu);
6398
6399        return HRTIMER_NORESTART;
6400}
6401
6402/*
6403 * Decode the memory-address operand of a vmx instruction, as recorded on an
6404 * exit caused by such an instruction (run by a guest hypervisor).
6405 * On success, returns 0. When the operand is invalid, returns 1 and throws
6406 * #UD or #GP.
6407 */
6408static int get_vmx_mem_address(struct kvm_vcpu *vcpu,
6409                                 unsigned long exit_qualification,
6410                                 u32 vmx_instruction_info, gva_t *ret)
6411{
6412        /*
6413         * According to Vol. 3B, "Information for VM Exits Due to Instruction
6414         * Execution", on an exit, vmx_instruction_info holds most of the
6415         * addressing components of the operand. Only the displacement part
6416         * is put in exit_qualification (see 3B, "Basic VM-Exit Information").
6417         * For how an actual address is calculated from all these components,
6418         * refer to Vol. 1, "Operand Addressing".
6419         */
6420        int  scaling = vmx_instruction_info & 3;
6421        int  addr_size = (vmx_instruction_info >> 7) & 7;
6422        bool is_reg = vmx_instruction_info & (1u << 10);
6423        int  seg_reg = (vmx_instruction_info >> 15) & 7;
6424        int  index_reg = (vmx_instruction_info >> 18) & 0xf;
6425        bool index_is_valid = !(vmx_instruction_info & (1u << 22));
6426        int  base_reg       = (vmx_instruction_info >> 23) & 0xf;
6427        bool base_is_valid  = !(vmx_instruction_info & (1u << 27));
6428
6429        if (is_reg) {
6430                kvm_queue_exception(vcpu, UD_VECTOR);
6431                return 1;
6432        }
6433
6434        /* Addr = segment_base + offset */
6435        /* offset = base + [index * scale] + displacement */
6436        *ret = vmx_get_segment_base(vcpu, seg_reg);
6437        if (base_is_valid)
6438                *ret += kvm_register_read(vcpu, base_reg);
6439        if (index_is_valid)
6440                *ret += kvm_register_read(vcpu, index_reg) << scaling;
6441        *ret += exit_qualification; /* holds the displacement */
6442
6443        if (addr_size == 1) /* 32 bit */
6444                *ret &= 0xffffffff;
6445
6446        /*
6447         * TODO: throw #GP (and return 1) in various cases that the VM*
6448         * instructions require it - e.g., offset beyond segment limit,
6449         * unusable or unreadable/unwritable segment, non-canonical 64-bit
6450         * address, and so on. Currently these are not checked.
6451         */
6452        return 0;
6453}
6454
6455/*
6456 * This function checks the given vmpointer, including that
6457 * - it is 4KB aligned
6458 * - no bits beyond the physical address width are set
6459 * Returns 0 on success, 1 otherwise.
6460 * (Intel SDM Section 30.3)
6461 */
6462static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason,
6463                                  gpa_t *vmpointer)
6464{
6465        gva_t gva;
6466        gpa_t vmptr;
6467        struct x86_exception e;
6468        struct page *page;
6469        struct vcpu_vmx *vmx = to_vmx(vcpu);
6470        int maxphyaddr = cpuid_maxphyaddr(vcpu);
6471
6472        if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
6473                        vmcs_read32(VMX_INSTRUCTION_INFO), &gva))
6474                return 1;
6475
6476        if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr,
6477                                sizeof(vmptr), &e)) {
6478                kvm_inject_page_fault(vcpu, &e);
6479                return 1;
6480        }
6481
6482        switch (exit_reason) {
6483        case EXIT_REASON_VMON:
6484                /*
6485                 * SDM 3: 24.11.5
6486                 * The first 4 bytes of VMXON region contain the supported
6487                 * VMCS revision identifier
6488                 *
6489                 * Note - IA32_VMX_BASIC[48] will never be 1
6490                 * for the nested case; if it were set, physical
6491                 * addresses of the VMXON region and VMCSs would be
6492                 * limited to 32 bits.
6493                 */
6494                if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) {
6495                        nested_vmx_failInvalid(vcpu);
6496                        skip_emulated_instruction(vcpu);
6497                        return 1;
6498                }
6499
6500                page = nested_get_page(vcpu, vmptr);
6501                if (page == NULL ||
6502                    *(u32 *)kmap(page) != VMCS12_REVISION) {
6503                        nested_vmx_failInvalid(vcpu);
6504                        kunmap(page);
6505                        skip_emulated_instruction(vcpu);
6506                        return 1;
6507                }
6508                kunmap(page);
6509                vmx->nested.vmxon_ptr = vmptr;
6510                break;
6511        case EXIT_REASON_VMCLEAR:
6512                if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) {
6513                        nested_vmx_failValid(vcpu,
6514                                             VMXERR_VMCLEAR_INVALID_ADDRESS);
6515                        skip_emulated_instruction(vcpu);
6516                        return 1;
6517                }
6518
6519                if (vmptr == vmx->nested.vmxon_ptr) {
6520                        nested_vmx_failValid(vcpu,
6521                                             VMXERR_VMCLEAR_VMXON_POINTER);
6522                        skip_emulated_instruction(vcpu);
6523                        return 1;
6524                }
6525                break;
6526        case EXIT_REASON_VMPTRLD:
6527                if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) {
6528                        nested_vmx_failValid(vcpu,
6529                                             VMXERR_VMPTRLD_INVALID_ADDRESS);
6530                        skip_emulated_instruction(vcpu);
6531                        return 1;
6532                }
6533
6534                if (vmptr == vmx->nested.vmxon_ptr) {
6535                        nested_vmx_failValid(vcpu,
6536                                             VMXERR_VMCLEAR_VMXON_POINTER);
6537                        skip_emulated_instruction(vcpu);
6538                        return 1;
6539                }
6540                break;
6541        default:
6542                return 1; /* shouldn't happen */
6543        }
6544
6545        if (vmpointer)
6546                *vmpointer = vmptr;
6547        return 0;
6548}
6549
6550/*
6551 * Emulate the VMXON instruction.
6552 * Currently, we just remember that VMX is active, and do not save or even
6553 * inspect the argument to VMXON (the so-called "VMXON pointer") because we
6554 * do not currently need to store anything in that guest-allocated memory
6555 * region. Consequently, VMCLEAR and VMPTRLD also do not verify that their
6556 * argument is different from the VMXON pointer (which the spec says they do).
6557 */
6558static int handle_vmon(struct kvm_vcpu *vcpu)
6559{
6560        struct kvm_segment cs;
6561        struct vcpu_vmx *vmx = to_vmx(vcpu);
6562        struct vmcs *shadow_vmcs;
6563        const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED
6564                | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
6565
6566        /* The Intel VMX Instruction Reference lists a bunch of bits that
6567         * are prerequisite to running VMXON, most notably cr4.VMXE must be
6568         * set to 1 (see vmx_set_cr4() for when we allow the guest to set this).
6569         * Otherwise, we should fail with #UD. We test these now:
6570         */
6571        if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE) ||
6572            !kvm_read_cr0_bits(vcpu, X86_CR0_PE) ||
6573            (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
6574                kvm_queue_exception(vcpu, UD_VECTOR);
6575                return 1;
6576        }
6577
6578        vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
6579        if (is_long_mode(vcpu) && !cs.l) {
6580                kvm_queue_exception(vcpu, UD_VECTOR);
6581                return 1;
6582        }
6583
6584        if (vmx_get_cpl(vcpu)) {
6585                kvm_inject_gp(vcpu, 0);
6586                return 1;
6587        }
6588
6589        if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMON, NULL))
6590                return 1;
6591
6592        if (vmx->nested.vmxon) {
6593                nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
6594                skip_emulated_instruction(vcpu);
6595                return 1;
6596        }
6597
6598        if ((vmx->nested.msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
6599                        != VMXON_NEEDED_FEATURES) {
6600                kvm_inject_gp(vcpu, 0);
6601                return 1;
6602        }
6603
6604        if (enable_shadow_vmcs) {
6605                shadow_vmcs = alloc_vmcs();
6606                if (!shadow_vmcs)
6607                        return -ENOMEM;
6608                /* mark vmcs as shadow */
6609                shadow_vmcs->revision_id |= (1u << 31);
6610                /* init shadow vmcs */
6611                vmcs_clear(shadow_vmcs);
6612                vmx->nested.current_shadow_vmcs = shadow_vmcs;
6613        }
6614
6615        INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
6616        vmx->nested.vmcs02_num = 0;
6617
6618        hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
6619                     HRTIMER_MODE_REL);
6620        vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
6621
6622        vmx->nested.vmxon = true;
6623
6624        skip_emulated_instruction(vcpu);
6625        nested_vmx_succeed(vcpu);
6626        return 1;
6627}
6628
6629/*
6630 * Intel's VMX Instruction Reference specifies a common set of prerequisites
6631 * for running VMX instructions (except VMXON, whose prerequisites are
6632 * slightly different). It also specifies what exception to inject otherwise.
6633 */
6634static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
6635{
6636        struct kvm_segment cs;
6637        struct vcpu_vmx *vmx = to_vmx(vcpu);
6638
6639        if (!vmx->nested.vmxon) {
6640                kvm_queue_exception(vcpu, UD_VECTOR);
6641                return 0;
6642        }
6643
6644        vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
6645        if ((vmx_get_rflags(vcpu) & X86_EFLAGS_VM) ||
6646            (is_long_mode(vcpu) && !cs.l)) {
6647                kvm_queue_exception(vcpu, UD_VECTOR);
6648                return 0;
6649        }
6650
6651        if (vmx_get_cpl(vcpu)) {
6652                kvm_inject_gp(vcpu, 0);
6653                return 0;
6654        }
6655
6656        return 1;
6657}
6658
6659static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
6660{
6661        u32 exec_control;
6662        if (vmx->nested.current_vmptr == -1ull)
6663                return;
6664
6665        /* current_vmptr and current_vmcs12 are always set/reset together */
6666        if (WARN_ON(vmx->nested.current_vmcs12 == NULL))
6667                return;
6668
6669        if (enable_shadow_vmcs) {
6670                /* copy to memory all shadowed fields in case
6671                   they were modified */
6672                copy_shadow_to_vmcs12(vmx);
6673                vmx->nested.sync_shadow_vmcs = false;
6674                exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
6675                exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
6676                vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
6677                vmcs_write64(VMCS_LINK_POINTER, -1ull);
6678        }
6679        vmx->nested.posted_intr_nv = -1;
6680        kunmap(vmx->nested.current_vmcs12_page);
6681        nested_release_page(vmx->nested.current_vmcs12_page);
6682        vmx->nested.current_vmptr = -1ull;
6683        vmx->nested.current_vmcs12 = NULL;
6684}
6685
6686/*
6687 * Free whatever needs to be freed from vmx->nested when L1 goes down, or
6688 * just stops using VMX.
6689 */
6690static void free_nested(struct vcpu_vmx *vmx)
6691{
6692        if (!vmx->nested.vmxon)
6693                return;
6694
6695        vmx->nested.vmxon = false;
6696        nested_release_vmcs12(vmx);
6697        if (enable_shadow_vmcs)
6698                free_vmcs(vmx->nested.current_shadow_vmcs);
6699        /* Unpin physical memory we referred to in current vmcs02 */
6700        if (vmx->nested.apic_access_page) {
6701                nested_release_page(vmx->nested.apic_access_page);
6702                vmx->nested.apic_access_page = NULL;
6703        }
6704        if (vmx->nested.virtual_apic_page) {
6705                nested_release_page(vmx->nested.virtual_apic_page);
6706                vmx->nested.virtual_apic_page = NULL;
6707        }
6708        if (vmx->nested.pi_desc_page) {
6709                kunmap(vmx->nested.pi_desc_page);
6710                nested_release_page(vmx->nested.pi_desc_page);
6711                vmx->nested.pi_desc_page = NULL;
6712                vmx->nested.pi_desc = NULL;
6713        }
6714
6715        nested_free_all_saved_vmcss(vmx);
6716}
6717
6718/* Emulate the VMXOFF instruction */
6719static int handle_vmoff(struct kvm_vcpu *vcpu)
6720{
6721        if (!nested_vmx_check_permission(vcpu))
6722                return 1;
6723        free_nested(to_vmx(vcpu));
6724        skip_emulated_instruction(vcpu);
6725        nested_vmx_succeed(vcpu);
6726        return 1;
6727}
6728
6729/* Emulate the VMCLEAR instruction */
6730static int handle_vmclear(struct kvm_vcpu *vcpu)
6731{
6732        struct vcpu_vmx *vmx = to_vmx(vcpu);
6733        gpa_t vmptr;
6734        struct vmcs12 *vmcs12;
6735        struct page *page;
6736
6737        if (!nested_vmx_check_permission(vcpu))
6738                return 1;
6739
6740        if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMCLEAR, &vmptr))
6741                return 1;
6742
6743        if (vmptr == vmx->nested.current_vmptr)
6744                nested_release_vmcs12(vmx);
6745
6746        page = nested_get_page(vcpu, vmptr);
6747        if (page == NULL) {
6748                /*
6749                 * For accurate processor emulation, VMCLEAR beyond available
6750                 * physical memory should do nothing at all. However, it is
6751                 * possible that a nested vmx bug, not a guest hypervisor bug,
6752                 * resulted in this case, so let's shut down before doing any
6753                 * more damage:
6754                 */
6755                kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
6756                return 1;
6757        }
6758        vmcs12 = kmap(page);
6759        vmcs12->launch_state = 0;
6760        kunmap(page);
6761        nested_release_page(page);
6762
6763        nested_free_vmcs02(vmx, vmptr);
6764
6765        skip_emulated_instruction(vcpu);
6766        nested_vmx_succeed(vcpu);
6767        return 1;
6768}
6769
6770static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch);
6771
6772/* Emulate the VMLAUNCH instruction */
6773static int handle_vmlaunch(struct kvm_vcpu *vcpu)
6774{
6775        return nested_vmx_run(vcpu, true);
6776}
6777
6778/* Emulate the VMRESUME instruction */
6779static int handle_vmresume(struct kvm_vcpu *vcpu)
6780{
6781
6782        return nested_vmx_run(vcpu, false);
6783}
6784
6785enum vmcs_field_type {
6786        VMCS_FIELD_TYPE_U16 = 0,
6787        VMCS_FIELD_TYPE_U64 = 1,
6788        VMCS_FIELD_TYPE_U32 = 2,
6789        VMCS_FIELD_TYPE_NATURAL_WIDTH = 3
6790};
6791
6792static inline int vmcs_field_type(unsigned long field)
6793{
6794        if (0x1 & field)        /* the *_HIGH fields are all 32 bit */
6795                return VMCS_FIELD_TYPE_U32;
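            /* bits 14:13 of the field encoding select the field width */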
6796        return (field >> 13) & 0x3;
6797}
6798
6799static inline int vmcs_field_readonly(unsigned long field)
6800{
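            /* bits 11:10 of the encoding are the access type; 1 is read-only */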
6801        return (((field >> 10) & 0x3) == 1);
6802}
6803
6804/*
6805 * Read a vmcs12 field. Since these can have varying lengths and we return
6806 * one type, we choose the biggest type (u64) and zero-extend the return value
6807 * to that size. Note that the caller, handle_vmread, might need to use only
6808 * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of
6809 * 64-bit fields are to be returned).
6810 */
6811static inline int vmcs12_read_any(struct kvm_vcpu *vcpu,
6812                                  unsigned long field, u64 *ret)
6813{
6814        short offset = vmcs_field_to_offset(field);
6815        char *p;
6816
6817        if (offset < 0)
6818                return offset;
6819
6820        p = ((char *)(get_vmcs12(vcpu))) + offset;
6821
6822        switch (vmcs_field_type(field)) {
6823        case VMCS_FIELD_TYPE_NATURAL_WIDTH:
6824                *ret = *((natural_width *)p);
6825                return 0;
6826        case VMCS_FIELD_TYPE_U16:
6827                *ret = *((u16 *)p);
6828                return 0;
6829        case VMCS_FIELD_TYPE_U32:
6830                *ret = *((u32 *)p);
6831                return 0;
6832        case VMCS_FIELD_TYPE_U64:
6833                *ret = *((u64 *)p);
6834                return 0;
6835        default:
6836                WARN_ON(1);
6837                return -ENOENT;
6838        }
6839}
6840
6841
6842static inline int vmcs12_write_any(struct kvm_vcpu *vcpu,
6843                                   unsigned long field, u64 field_value){
6844        short offset = vmcs_field_to_offset(field);
6845        char *p = ((char *) get_vmcs12(vcpu)) + offset;
6846        if (offset < 0)
6847                return offset;
6848
6849        switch (vmcs_field_type(field)) {
6850        case VMCS_FIELD_TYPE_U16:
6851                *(u16 *)p = field_value;
6852                return 0;
6853        case VMCS_FIELD_TYPE_U32:
6854                *(u32 *)p = field_value;
6855                return 0;
6856        case VMCS_FIELD_TYPE_U64:
6857                *(u64 *)p = field_value;
6858                return 0;
6859        case VMCS_FIELD_TYPE_NATURAL_WIDTH:
6860                *(natural_width *)p = field_value;
6861                return 0;
6862        default:
6863                WARN_ON(1);
6864                return -ENOENT;
6865        }
6866
6867}
6868
6869static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
6870{
6871        int i;
6872        unsigned long field;
6873        u64 field_value;
6874        struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs;
6875        const unsigned long *fields = shadow_read_write_fields;
6876        const int num_fields = max_shadow_read_write_fields;
6877
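            /*
             * Make the shadow VMCS current so the vmcs_read*() calls below
             * fetch the shadowed read/write fields; preemption is disabled
             * while a VMCS other than the vcpu's loaded_vmcs is current on
             * this CPU.
             */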
6878        preempt_disable();
6879
6880        vmcs_load(shadow_vmcs);
6881
6882        for (i = 0; i < num_fields; i++) {
6883                field = fields[i];
6884                switch (vmcs_field_type(field)) {
6885                case VMCS_FIELD_TYPE_U16:
6886                        field_value = vmcs_read16(field);
6887                        break;
6888                case VMCS_FIELD_TYPE_U32:
6889                        field_value = vmcs_read32(field);
6890                        break;
6891                case VMCS_FIELD_TYPE_U64:
6892                        field_value = vmcs_read64(field);
6893                        break;
6894                case VMCS_FIELD_TYPE_NATURAL_WIDTH:
6895                        field_value = vmcs_readl(field);
6896                        break;
6897                default:
6898                        WARN_ON(1);
6899                        continue;
6900                }
6901                vmcs12_write_any(&vmx->vcpu, field, field_value);
6902        }
6903
6904        vmcs_clear(shadow_vmcs);
6905        vmcs_load(vmx->loaded_vmcs->vmcs);
6906
6907        preempt_enable();
6908}
6909
6910static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
6911{
6912        const unsigned long *fields[] = {
6913                shadow_read_write_fields,
6914                shadow_read_only_fields
6915        };
6916        const int max_fields[] = {
6917                max_shadow_read_write_fields,
6918                max_shadow_read_only_fields
6919        };
6920        int i, q;
6921        unsigned long field;
6922        u64 field_value = 0;
6923        struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs;
6924
6925        vmcs_load(shadow_vmcs);
6926
6927        for (q = 0; q < ARRAY_SIZE(fields); q++) {
6928                for (i = 0; i < max_fields[q]; i++) {
6929                        field = fields[q][i];
6930                        vmcs12_read_any(&vmx->vcpu, field, &field_value);
6931
6932                        switch (vmcs_field_type(field)) {
6933                        case VMCS_FIELD_TYPE_U16:
6934                                vmcs_write16(field, (u16)field_value);
6935                                break;
6936                        case VMCS_FIELD_TYPE_U32:
6937                                vmcs_write32(field, (u32)field_value);
6938                                break;
6939                        case VMCS_FIELD_TYPE_U64:
6940                                vmcs_write64(field, (u64)field_value);
6941                                break;
6942                        case VMCS_FIELD_TYPE_NATURAL_WIDTH:
6943                                vmcs_writel(field, (long)field_value);
6944                                break;
6945                        default:
6946                                WARN_ON(1);
6947                                break;
6948                        }
6949                }
6950        }
6951
6952        vmcs_clear(shadow_vmcs);
6953        vmcs_load(vmx->loaded_vmcs->vmcs);
6954}
6955
6956/*
6957 * VMX instructions which assume a current vmcs12 (i.e., that VMPTRLD was
6958 * used before) all generate the same failure when it is missing.
6959 */
6960static int nested_vmx_check_vmcs12(struct kvm_vcpu *vcpu)
6961{
6962        struct vcpu_vmx *vmx = to_vmx(vcpu);
6963        if (vmx->nested.current_vmptr == -1ull) {
6964                nested_vmx_failInvalid(vcpu);
6965                skip_emulated_instruction(vcpu);
6966                return 0;
6967        }
6968        return 1;
6969}
6970
6971static int handle_vmread(struct kvm_vcpu *vcpu)
6972{
6973        unsigned long field;
6974        u64 field_value;
6975        unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
6976        u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
6977        gva_t gva = 0;
6978
6979        if (!nested_vmx_check_permission(vcpu) ||
6980            !nested_vmx_check_vmcs12(vcpu))
6981                return 1;
6982
6983        /* Decode instruction info and find the field to read */
6984        field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
6985        /* Read the field, zero-extended to a u64 field_value */
6986        if (vmcs12_read_any(vcpu, field, &field_value) < 0) {
6987                nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
6988                skip_emulated_instruction(vcpu);
6989                return 1;
6990        }
6991        /*
6992         * Now copy part of this value to register or memory, as requested.
6993         * Note that the number of bits actually copied is 32 or 64 depending
6994         * on the guest's mode (32 or 64 bit), not on the given field's length.
6995         */
6996        if (vmx_instruction_info & (1u << 10)) {
6997                kvm_register_writel(vcpu, (((vmx_instruction_info) >> 3) & 0xf),
6998                        field_value);
6999        } else {
7000                if (get_vmx_mem_address(vcpu, exit_qualification,
7001                                vmx_instruction_info, &gva))
7002                        return 1;
7003                /* _system ok, as nested_vmx_check_permission verified cpl=0 */
7004                kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, gva,
7005                             &field_value, (is_long_mode(vcpu) ? 8 : 4), NULL);
7006        }
7007
7008        nested_vmx_succeed(vcpu);
7009        skip_emulated_instruction(vcpu);
7010        return 1;
7011}
7012
7013
7014static int handle_vmwrite(struct kvm_vcpu *vcpu)
7015{
7016        unsigned long field;
7017        gva_t gva;
7018        unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
7019        u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
7020        /* The value to write might be 32 or 64 bits, depending on L1's long
7021         * mode, and eventually we need to write that into a field of several
7022         * possible lengths. The code below first zero-extends the value to 64
7023         * bit (field_value), and then copies only the appropriate number of
7024         * bits into the vmcs12 field.
7025         */
7026        u64 field_value = 0;
7027        struct x86_exception e;
7028
7029        if (!nested_vmx_check_permission(vcpu) ||
7030            !nested_vmx_check_vmcs12(vcpu))
7031                return 1;
7032
7033        if (vmx_instruction_info & (1u << 10))
7034                field_value = kvm_register_readl(vcpu,
7035                        (((vmx_instruction_info) >> 3) & 0xf));
7036        else {
7037                if (get_vmx_mem_address(vcpu, exit_qualification,
7038                                vmx_instruction_info, &gva))
7039                        return 1;
7040                if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva,
7041                           &field_value, (is_64_bit_mode(vcpu) ? 8 : 4), &e)) {
7042                        kvm_inject_page_fault(vcpu, &e);
7043                        return 1;
7044                }
7045        }
7046
7047
7048        field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
7049        if (vmcs_field_readonly(field)) {
7050                nested_vmx_failValid(vcpu,
7051                        VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
7052                skip_emulated_instruction(vcpu);
7053                return 1;
7054        }
7055
7056        if (vmcs12_write_any(vcpu, field, field_value) < 0) {
7057                nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
7058                skip_emulated_instruction(vcpu);
7059                return 1;
7060        }
7061
7062        nested_vmx_succeed(vcpu);
7063        skip_emulated_instruction(vcpu);
7064        return 1;
7065}
7066
7067/* Emulate the VMPTRLD instruction */
7068static int handle_vmptrld(struct kvm_vcpu *vcpu)
7069{
7070        struct vcpu_vmx *vmx = to_vmx(vcpu);
7071        gpa_t vmptr;
7072        u32 exec_control;
7073
7074        if (!nested_vmx_check_permission(vcpu))
7075                return 1;
7076
7077        if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMPTRLD, &vmptr))
7078                return 1;
7079
7080        if (vmx->nested.current_vmptr != vmptr) {
7081                struct vmcs12 *new_vmcs12;
7082                struct page *page;
7083                page = nested_get_page(vcpu, vmptr);
7084                if (page == NULL) {
7085                        nested_vmx_failInvalid(vcpu);
7086                        skip_emulated_instruction(vcpu);
7087                        return 1;
7088                }
7089                new_vmcs12 = kmap(page);
7090                if (new_vmcs12->revision_id != VMCS12_REVISION) {
7091                        kunmap(page);
7092                        nested_release_page_clean(page);
7093                        nested_vmx_failValid(vcpu,
7094                                VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
7095                        skip_emulated_instruction(vcpu);
7096                        return 1;
7097                }
7098
7099                nested_release_vmcs12(vmx);
7100                vmx->nested.current_vmptr = vmptr;
7101                vmx->nested.current_vmcs12 = new_vmcs12;
7102                vmx->nested.current_vmcs12_page = page;
7103                if (enable_shadow_vmcs) {
7104                        exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
7105                        exec_control |= SECONDARY_EXEC_SHADOW_VMCS;
7106                        vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
7107                        vmcs_write64(VMCS_LINK_POINTER,
7108                                     __pa(vmx->nested.current_shadow_vmcs));
7109                        vmx->nested.sync_shadow_vmcs = true;
7110                }
7111        }
7112
7113        nested_vmx_succeed(vcpu);
7114        skip_emulated_instruction(vcpu);
7115        return 1;
7116}
7117
7118/* Emulate the VMPTRST instruction */
7119static int handle_vmptrst(struct kvm_vcpu *vcpu)
7120{
7121        unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
7122        u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
7123        gva_t vmcs_gva;
7124        struct x86_exception e;
7125
7126        if (!nested_vmx_check_permission(vcpu))
7127                return 1;
7128
7129        if (get_vmx_mem_address(vcpu, exit_qualification,
7130                        vmx_instruction_info, &vmcs_gva))
7131                return 1;
7132        /* ok to use *_system, as nested_vmx_check_permission verified cpl=0 */
7133        if (kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, vmcs_gva,
7134                                 (void *)&to_vmx(vcpu)->nested.current_vmptr,
7135                                 sizeof(u64), &e)) {
7136                kvm_inject_page_fault(vcpu, &e);
7137                return 1;
7138        }
7139        nested_vmx_succeed(vcpu);
7140        skip_emulated_instruction(vcpu);
7141        return 1;
7142}
7143
7144/* Emulate the INVEPT instruction */
7145static int handle_invept(struct kvm_vcpu *vcpu)
7146{
7147        struct vcpu_vmx *vmx = to_vmx(vcpu);
7148        u32 vmx_instruction_info, types;
7149        unsigned long type;
7150        gva_t gva;
7151        struct x86_exception e;
7152        struct {
7153                u64 eptp, gpa;
7154        } operand;
7155
7156        if (!(vmx->nested.nested_vmx_secondary_ctls_high &
7157              SECONDARY_EXEC_ENABLE_EPT) ||
7158            !(vmx->nested.nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) {
7159                kvm_queue_exception(vcpu, UD_VECTOR);
7160                return 1;
7161        }
7162
7163        if (!nested_vmx_check_permission(vcpu))
7164                return 1;
7165
7166        if (!kvm_read_cr0_bits(vcpu, X86_CR0_PE)) {
7167                kvm_queue_exception(vcpu, UD_VECTOR);
7168                return 1;
7169        }
7170
7171        vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
7172        type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
7173
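            /*
             * Bits 25 and 26 of the EPT/VPID capability MSR advertise support
             * for single-context (type 1) and all-context (type 2) INVEPT;
             * the "& 6" keeps just those two bits as the set of valid types.
             */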
7174        types = (vmx->nested.nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
7175
7176        if (!(types & (1UL << type))) {
7177                nested_vmx_failValid(vcpu,
7178                                VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
7179                return 1;
7180        }
7181
7182        /* According to the Intel VMX instruction reference, the memory
7183         * operand is read even if it isn't needed (e.g., for type==global)
7184         */
7185        if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
7186                        vmx_instruction_info, &gva))
7187                return 1;
7188        if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand,
7189                                sizeof(operand), &e)) {
7190                kvm_inject_page_fault(vcpu, &e);
7191                return 1;
7192        }
7193
7194        switch (type) {
7195        case VMX_EPT_EXTENT_GLOBAL:
7196                kvm_mmu_sync_roots(vcpu);
7197                kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
7198                nested_vmx_succeed(vcpu);
7199                break;
7200        default:
7201                /* Trap single context invalidation invept calls */
7202                BUG_ON(1);
7203                break;
7204        }
7205
7206        skip_emulated_instruction(vcpu);
7207        return 1;
7208}
7209
7210static int handle_invvpid(struct kvm_vcpu *vcpu)
7211{
7212        kvm_queue_exception(vcpu, UD_VECTOR);
7213        return 1;
7214}
7215
7216static int handle_pml_full(struct kvm_vcpu *vcpu)
7217{
7218        unsigned long exit_qualification;
7219
7220        trace_kvm_pml_full(vcpu->vcpu_id);
7221
7222        exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
7223
7224        /*
7225         * If the PML-buffer-full exit happened while executing IRET from an
7226         * NMI, the "blocked by NMI" bit has to be set before the next VM entry.
7227         */
7228        if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
7229                        cpu_has_virtual_nmis() &&
7230                        (exit_qualification & INTR_INFO_UNBLOCK_NMI))
7231                vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
7232                                GUEST_INTR_STATE_NMI);
7233
7234        /*
7235         * The PML buffer was already flushed at the beginning of the VMEXIT.
7236         * Nothing to do here, and no userspace involvement is needed for PML.
7237         */
7238        return 1;
7239}
7240
7241/*
7242 * The exit handlers return 1 if the exit was handled fully and guest execution
7243 * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
7244 * to be done to userspace and return 0.
7245 */
7246static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
7247        [EXIT_REASON_EXCEPTION_NMI]           = handle_exception,
7248        [EXIT_REASON_EXTERNAL_INTERRUPT]      = handle_external_interrupt,
7249        [EXIT_REASON_TRIPLE_FAULT]            = handle_triple_fault,
7250        [EXIT_REASON_NMI_WINDOW]              = handle_nmi_window,
7251        [EXIT_REASON_IO_INSTRUCTION]          = handle_io,
7252        [EXIT_REASON_CR_ACCESS]               = handle_cr,
7253        [EXIT_REASON_DR_ACCESS]               = handle_dr,
7254        [EXIT_REASON_CPUID]                   = handle_cpuid,
7255        [EXIT_REASON_MSR_READ]                = handle_rdmsr,
7256        [EXIT_REASON_MSR_WRITE]               = handle_wrmsr,
7257        [EXIT_REASON_PENDING_INTERRUPT]       = handle_interrupt_window,
7258        [EXIT_REASON_HLT]                     = handle_halt,
7259        [EXIT_REASON_INVD]                    = handle_invd,
7260        [EXIT_REASON_INVLPG]                  = handle_invlpg,
7261        [EXIT_REASON_RDPMC]                   = handle_rdpmc,
7262        [EXIT_REASON_VMCALL]                  = handle_vmcall,
7263        [EXIT_REASON_VMCLEAR]                 = handle_vmclear,
7264        [EXIT_REASON_VMLAUNCH]                = handle_vmlaunch,
7265        [EXIT_REASON_VMPTRLD]                 = handle_vmptrld,
7266        [EXIT_REASON_VMPTRST]                 = handle_vmptrst,
7267        [EXIT_REASON_VMREAD]                  = handle_vmread,
7268        [EXIT_REASON_VMRESUME]                = handle_vmresume,
7269        [EXIT_REASON_VMWRITE]                 = handle_vmwrite,
7270        [EXIT_REASON_VMOFF]                   = handle_vmoff,
7271        [EXIT_REASON_VMON]                    = handle_vmon,
7272        [EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
7273        [EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
7274        [EXIT_REASON_APIC_WRITE]              = handle_apic_write,
7275        [EXIT_REASON_EOI_INDUCED]             = handle_apic_eoi_induced,
7276        [EXIT_REASON_WBINVD]                  = handle_wbinvd,
7277        [EXIT_REASON_XSETBV]                  = handle_xsetbv,
7278        [EXIT_REASON_TASK_SWITCH]             = handle_task_switch,
7279        [EXIT_REASON_MCE_DURING_VMENTRY]      = handle_machine_check,
7280        [EXIT_REASON_EPT_VIOLATION]           = handle_ept_violation,
7281        [EXIT_REASON_EPT_MISCONFIG]           = handle_ept_misconfig,
7282        [EXIT_REASON_PAUSE_INSTRUCTION]       = handle_pause,
7283        [EXIT_REASON_MWAIT_INSTRUCTION]       = handle_mwait,
7284        [EXIT_REASON_MONITOR_INSTRUCTION]     = handle_monitor,
7285        [EXIT_REASON_INVEPT]                  = handle_invept,
7286        [EXIT_REASON_INVVPID]                 = handle_invvpid,
7287        [EXIT_REASON_XSAVES]                  = handle_xsaves,
7288        [EXIT_REASON_XRSTORS]                 = handle_xrstors,
7289        [EXIT_REASON_PML_FULL]                = handle_pml_full,
7290};
7291
7292static const int kvm_vmx_max_exit_handlers =
7293        ARRAY_SIZE(kvm_vmx_exit_handlers);
7294
7295static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
7296                                       struct vmcs12 *vmcs12)
7297{
7298        unsigned long exit_qualification;
7299        gpa_t bitmap, last_bitmap;
7300        unsigned int port;
7301        int size;
7302        u8 b;
7303
7304        if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
7305                return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
7306
7307        exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
7308
7309        port = exit_qualification >> 16;
7310        size = (exit_qualification & 7) + 1;
7311
7312        last_bitmap = (gpa_t)-1;
7313        b = -1;
7314
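            /*
             * I/O bitmap A covers ports 0x0000-0x7fff and bitmap B covers
             * ports 0x8000-0xffff, one bit per port; a wide access may
             * straddle the two bitmaps, hence the loop.
             */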
7315        while (size > 0) {
7316                if (port < 0x8000)
7317                        bitmap = vmcs12->io_bitmap_a;
7318                else if (port < 0x10000)
7319                        bitmap = vmcs12->io_bitmap_b;
7320                else
7321                        return 1;
7322                bitmap += (port & 0x7fff) / 8;
7323
7324                if (last_bitmap != bitmap)
7325                        if (kvm_read_guest(vcpu->kvm, bitmap, &b, 1))
7326                                return 1;
7327                if (b & (1 << (port & 7)))
7328                        return 1;
7329
7330                port++;
7331                size--;
7332                last_bitmap = bitmap;
7333        }
7334
7335        return 0;
7336}
7337
7338/*
7339 * Return 1 if we should exit from L2 to L1 to handle an MSR access,
7340 * rather than handle it ourselves in L0. I.e., check whether L1 asked to
7341 * intercept the current event (a read or write of a specific MSR) by using
7342 * an MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps.
7343 */
7344static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
7345        struct vmcs12 *vmcs12, u32 exit_reason)
7346{
7347        u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX];
7348        gpa_t bitmap;
7349
7350        if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
7351                return 1;
7352
7353        /*
7354         * The MSR_BITMAP page is divided into four 1024-byte bitmaps,
7355         * for the four combinations of read/write and low/high MSR numbers.
7356         * First we need to figure out which of the four to use:
7357         */
7358        bitmap = vmcs12->msr_bitmap;
7359        if (exit_reason == EXIT_REASON_MSR_WRITE)
7360                bitmap += 2048;
7361        if (msr_index >= 0xc0000000) {
7362                msr_index -= 0xc0000000;
7363                bitmap += 1024;
7364        }
7365
7366        /* Then read the msr_index'th bit from this bitmap: */
7367        if (msr_index < 1024*8) {
7368                unsigned char b;
7369                if (kvm_read_guest(vcpu->kvm, bitmap + msr_index/8, &b, 1))
7370                        return 1;
7371                return 1 & (b >> (msr_index & 7));
7372        } else
7373                return 1; /* let L1 handle the wrong parameter */
7374}
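
/*
 * Worked example for nested_vmx_exit_handled_msr(): a WRMSR of MSR_EFER
 * (0xc0000080) selects the "write, high" quadrant (2048 + 1024), leaving
 * msr_index == 0x80, so the bit tested is bit (0x80 & 7) == 0 of the byte
 * at offset 3072 + 0x80 / 8 == 3088 into L1's MSR bitmap page.
 */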
7375
7376/*
7377 * Return 1 if we should exit from L2 to L1 to handle a CR access exit,
7378 * rather than handle it ourselves in L0. I.e., check if L1 wanted to
7379 * intercept (via guest_host_mask etc.) the current event.
7380 */
7381static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
7382        struct vmcs12 *vmcs12)
7383{
7384        unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
7385        int cr = exit_qualification & 15;
7386        int reg = (exit_qualification >> 8) & 15;
7387        unsigned long val = kvm_register_readl(vcpu, reg);
7388
7389        switch ((exit_qualification >> 4) & 3) {
7390        case 0: /* mov to cr */
7391                switch (cr) {
7392                case 0:
7393                        if (vmcs12->cr0_guest_host_mask &
7394                            (val ^ vmcs12->cr0_read_shadow))
7395                                return 1;
7396                        break;
7397                case 3:
7398                        if ((vmcs12->cr3_target_count >= 1 &&
7399                                        vmcs12->cr3_target_value0 == val) ||
7400                                (vmcs12->cr3_target_count >= 2 &&
7401                                        vmcs12->cr3_target_value1 == val) ||
7402                                (vmcs12->cr3_target_count >= 3 &&
7403                                        vmcs12->cr3_target_value2 == val) ||
7404                                (vmcs12->cr3_target_count >= 4 &&
7405                                        vmcs12->cr3_target_value3 == val))
7406                                return 0;
7407                        if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
7408                                return 1;
7409                        break;
7410                case 4:
7411                        if (vmcs12->cr4_guest_host_mask &
7412                            (vmcs12->cr4_read_shadow ^ val))
7413                                return 1;
7414                        break;
7415                case 8:
7416                        if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING))
7417                                return 1;
7418                        break;
7419                }
7420                break;
7421        case 2: /* clts */
7422                if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) &&
7423                    (vmcs12->cr0_read_shadow & X86_CR0_TS))
7424                        return 1;
7425                break;
7426        case 1: /* mov from cr */
7427                switch (cr) {
7428                case 3:
7429                        if (vmcs12->cpu_based_vm_exec_control &
7430                            CPU_BASED_CR3_STORE_EXITING)
7431                                return 1;
7432                        break;
7433                case 8:
7434                        if (vmcs12->cpu_based_vm_exec_control &
7435                            CPU_BASED_CR8_STORE_EXITING)
7436                                return 1;
7437                        break;
7438                }
7439                break;
7440        case 3: /* lmsw */
7441                /*
7442                 * lmsw can change bits 1..3 of cr0, and can only set (never
7443                 * clear) bit 0. Other attempted changes are ignored, with no exit.
7444                 */
7445                if (vmcs12->cr0_guest_host_mask & 0xe &
7446                    (val ^ vmcs12->cr0_read_shadow))
7447                        return 1;
7448                if ((vmcs12->cr0_guest_host_mask & 0x1) &&
7449                    !(vmcs12->cr0_read_shadow & 0x1) &&
7450                    (val & 0x1))
7451                        return 1;
7452                break;
7453        }
7454        return 0;
7455}
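
/*
 * Example for nested_vmx_exit_handled_cr(): for "mov %rax, %cr0" the exit
 * qualification encodes cr == 0, access type == 0 and reg == RAX.  The
 * write is reflected to L1 only if it flips a bit that L1 owns, i.e. a bit
 * set in cr0_guest_host_mask whose new value differs from cr0_read_shadow;
 * otherwise L0 completes the write on L1's behalf.
 */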
7456
7457/*
7458 * Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we
7459 * should handle it ourselves in L0 (and then continue L2). Only call this
7460 * when in is_guest_mode (L2).
7461 */
7462static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
7463{
7464        u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
7465        struct vcpu_vmx *vmx = to_vmx(vcpu);
7466        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
7467        u32 exit_reason = vmx->exit_reason;
7468
7469        trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason,
7470                                vmcs_readl(EXIT_QUALIFICATION),
7471                                vmx->idt_vectoring_info,
7472                                intr_info,
7473                                vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
7474                                KVM_ISA_VMX);
7475
7476        if (vmx->nested.nested_run_pending)
7477                return 0;
7478
7479        if (unlikely(vmx->fail)) {
7480                pr_info_ratelimited("%s failed vm entry %x\n", __func__,
7481                                    vmcs_read32(VM_INSTRUCTION_ERROR));
7482                return 1;
7483        }
7484
7485        switch (exit_reason) {
7486        case EXIT_REASON_EXCEPTION_NMI:
7487                if (!is_exception(intr_info))
7488                        return 0;
7489                else if (is_page_fault(intr_info))
7490                        return enable_ept;
7491                else if (is_no_device(intr_info) &&
7492                         !(vmcs12->guest_cr0 & X86_CR0_TS))
7493                        return 0;
7494                return vmcs12->exception_bitmap &
7495                                (1u << (intr_info & INTR_INFO_VECTOR_MASK));
7496        case EXIT_REASON_EXTERNAL_INTERRUPT:
7497                return 0;
7498        case EXIT_REASON_TRIPLE_FAULT:
7499                return 1;
7500        case EXIT_REASON_PENDING_INTERRUPT:
7501                return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_INTR_PENDING);
7502        case EXIT_REASON_NMI_WINDOW:
7503                return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING);
7504        case EXIT_REASON_TASK_SWITCH:
7505                return 1;
7506        case EXIT_REASON_CPUID:
7507                if (kvm_register_read(vcpu, VCPU_REGS_RAX) == 0xa)
7508                        return 0;
7509                return 1;
7510        case EXIT_REASON_HLT:
7511                return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
7512        case EXIT_REASON_INVD:
7513                return 1;
7514        case EXIT_REASON_INVLPG:
7515                return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
7516        case EXIT_REASON_RDPMC:
7517                return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
7518        case EXIT_REASON_RDTSC:
7519                return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
7520        case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
7521        case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
7522        case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD:
7523        case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE:
7524        case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
7525        case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID:
7526                /*
7527                 * VMX instructions trap unconditionally. This allows L1 to
7528                 * emulate them for its L2 guest, i.e., allows 3-level nesting!
7529                 */
7530                return 1;
7531        case EXIT_REASON_CR_ACCESS:
7532                return nested_vmx_exit_handled_cr(vcpu, vmcs12);
7533        case EXIT_REASON_DR_ACCESS:
7534                return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
7535        case EXIT_REASON_IO_INSTRUCTION:
7536                return nested_vmx_exit_handled_io(vcpu, vmcs12);
7537        case EXIT_REASON_MSR_READ:
7538        case EXIT_REASON_MSR_WRITE:
7539                return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
7540        case EXIT_REASON_INVALID_STATE:
7541                return 1;
7542        case EXIT_REASON_MWAIT_INSTRUCTION:
7543                return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
7544        case EXIT_REASON_MONITOR_INSTRUCTION:
7545                return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING);
7546        case EXIT_REASON_PAUSE_INSTRUCTION:
7547                return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) ||
7548                        nested_cpu_has2(vmcs12,
7549                                SECONDARY_EXEC_PAUSE_LOOP_EXITING);
7550        case EXIT_REASON_MCE_DURING_VMENTRY:
7551                return 0;
7552        case EXIT_REASON_TPR_BELOW_THRESHOLD:
7553                return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW);
7554        case EXIT_REASON_APIC_ACCESS:
7555                return nested_cpu_has2(vmcs12,
7556                        SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
7557        case EXIT_REASON_APIC_WRITE:
7558        case EXIT_REASON_EOI_INDUCED:
7559                /* apic_write and eoi_induced should exit unconditionally. */
7560                return 1;
7561        case EXIT_REASON_EPT_VIOLATION:
7562                /*
7563                 * L0 always deals with the EPT violation. If nested EPT is
7564                 * used, and the nested mmu code discovers that the address is
7565                 * missing in the guest EPT table (EPT12), the EPT violation
7566                 * will be injected with nested_ept_inject_page_fault().
7567                 */
7568                return 0;
7569        case EXIT_REASON_EPT_MISCONFIG:
7570                /*
7571                 * L2 never directly uses L1's EPT, but rather L0's own EPT
7572                 * table (shadow on EPT) or a merged EPT table that L0 built
7573                 * (EPT on EPT). So any problem with the structure of the
7574                 * table is L0's fault.
7575                 */
7576                return 0;
7577        case EXIT_REASON_WBINVD:
7578                return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
7579        case EXIT_REASON_XSETBV:
7580                return 1;
7581        case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS:
7582                /*
7583                 * This should never happen, since it is not possible to
7584                 * set XSS to a non-zero value---neither in L1 nor in L2.
7585                 * If it were, XSS would have to be checked against
7586                 * the XSS exit bitmap in vmcs12.
7587                 */
7588                return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
7589        default:
7590                return 1;
7591        }
7592}
7593
7594static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
7595{
7596        *info1 = vmcs_readl(EXIT_QUALIFICATION);
7597        *info2 = vmcs_read32(VM_EXIT_INTR_INFO);
7598}
7599
7600static int vmx_enable_pml(struct vcpu_vmx *vmx)
7601{
7602        struct page *pml_pg;
7603        u32 exec_control;
7604
7605        pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO);
7606        if (!pml_pg)
7607                return -ENOMEM;
7608
7609        vmx->pml_pg = pml_pg;
7610
7611        vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
7612        vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
7613
7614        exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
7615        exec_control |= SECONDARY_EXEC_ENABLE_PML;
7616        vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
7617
7618        return 0;
7619}
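
/*
 * With SECONDARY_EXEC_ENABLE_PML set, the CPU logs the GPA of each dirtied
 * guest page into the page programmed at PML_ADDRESS, decrementing
 * GUEST_PML_INDEX as it goes; once all PML_ENTITY_NUM entries are used the
 * guest exits with EXIT_REASON_PML_FULL (handled by handle_pml_full above)
 * and the buffer is drained by vmx_flush_pml_buffer() below.
 */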
7620
7621static void vmx_disable_pml(struct vcpu_vmx *vmx)
7622{
7623        u32 exec_control;
7624
7625        ASSERT(vmx->pml_pg);
7626        __free_page(vmx->pml_pg);
7627        vmx->pml_pg = NULL;
7628
7629        exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
7630        exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
7631        vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
7632}
7633
7634static void vmx_flush_pml_buffer(struct vcpu_vmx *vmx)
7635{
7636        struct kvm *kvm = vmx->vcpu.kvm;
7637        u64 *pml_buf;
7638        u16 pml_idx;
7639
7640        pml_idx = vmcs_read16(GUEST_PML_INDEX);
7641
7642        /* Do nothing if PML buffer is empty */
7643        if (pml_idx == (PML_ENTITY_NUM - 1))
7644                return;
7645
7646        /* PML index always points to next available PML buffer entity */
7647        if (pml_idx >= PML_ENTITY_NUM)
7648                pml_idx = 0;
7649        else
7650                pml_idx++;
7651
7652        pml_buf = page_address(vmx->pml_pg);
7653        for (; pml_idx < PML_ENTITY_NUM; pml_idx++) {
7654                u64 gpa;
7655
7656                gpa = pml_buf[pml_idx];
7657                WARN_ON(gpa & (PAGE_SIZE - 1));
7658                mark_page_dirty(kvm, gpa >> PAGE_SHIFT);
7659        }
7660
7661        /* reset PML index */
7662        vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
7663}
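
/*
 * Example: if GUEST_PML_INDEX reads back as 509, only entries 510 and 511
 * of the PML page hold logged GPAs; if the index has wrapped past zero
 * (>= PML_ENTITY_NUM when read as a u16), the buffer is completely full
 * and the scan starts at entry 0.
 */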
7664
7665/*
7666 * Flush all vcpus' PML buffer and update logged GPAs to dirty_bitmap.
7667 * Called before reporting dirty_bitmap to userspace.
7668 */
7669static void kvm_flush_pml_buffers(struct kvm *kvm)
7670{
7671        int i;
7672        struct kvm_vcpu *vcpu;
7673        /*
7674         * We only need to kick vcpus out of guest mode here, as the PML
7675         * buffer is flushed at the beginning of every VMEXIT; only vcpus
7676         * currently running in guest mode can have unflushed GPAs in their
7677         * PML buffer.
7678         */
7679        kvm_for_each_vcpu(i, vcpu, kvm)
7680                kvm_vcpu_kick(vcpu);
7681}
7682
7683/*
7684 * The guest has exited.  See if we can fix it or if we need userspace
7685 * assistance.
7686 */
7687static int vmx_handle_exit(struct kvm_vcpu *vcpu)
7688{
7689        struct vcpu_vmx *vmx = to_vmx(vcpu);
7690        u32 exit_reason = vmx->exit_reason;
7691        u32 vectoring_info = vmx->idt_vectoring_info;
7692
7693        /*
7694         * Flush the PML buffer of logged GPAs; this keeps dirty_bitmap up to
7695         * date. Another benefit: in kvm_vm_ioctl_get_dirty_log, before
7696         * querying dirty_bitmap, we only need to kick all vcpus out of guest
7697         * mode, since once a vcpu is back in root mode its PML buffer has
7698         * already been flushed.
7699         */
7700        if (enable_pml)
7701                vmx_flush_pml_buffer(vmx);
7702
7703        /* If guest state is invalid, start emulating */
7704        if (vmx->emulation_required)
7705                return handle_invalid_guest_state(vcpu);
7706
7707        if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) {
7708                nested_vmx_vmexit(vcpu, exit_reason,
7709                                  vmcs_read32(VM_EXIT_INTR_INFO),
7710                                  vmcs_readl(EXIT_QUALIFICATION));
7711                return 1;
7712        }
7713
7714        if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
7715                vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
7716                vcpu->run->fail_entry.hardware_entry_failure_reason
7717                        = exit_reason;
7718                return 0;
7719        }
7720
7721        if (unlikely(vmx->fail)) {
7722                vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
7723                vcpu->run->fail_entry.hardware_entry_failure_reason
7724                        = vmcs_read32(VM_INSTRUCTION_ERROR);
7725                return 0;
7726        }
7727
7728        /*
7729         * Note:
7730         * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it is caused by
7731         * event delivery, since it indicates the guest is accessing MMIO.
7732         * The vm-exit could be triggered again after returning to the guest,
7733         * which would cause an infinite loop.
7734         */
7735        if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
7736                        (exit_reason != EXIT_REASON_EXCEPTION_NMI &&
7737                        exit_reason != EXIT_REASON_EPT_VIOLATION &&
7738                        exit_reason != EXIT_REASON_TASK_SWITCH)) {
7739                vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
7740                vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
7741                vcpu->run->internal.ndata = 2;
7742                vcpu->run->internal.data[0] = vectoring_info;
7743                vcpu->run->internal.data[1] = exit_reason;
7744                return 0;
7745        }
7746
7747        if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked &&
7748            !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis(
7749                                        get_vmcs12(vcpu))))) {
7750                if (vmx_interrupt_allowed(vcpu)) {
7751                        vmx->soft_vnmi_blocked = 0;
7752                } else if (vmx->vnmi_blocked_time > 1000000000LL &&
7753                           vcpu->arch.nmi_pending) {
7754                        /*
7755                         * This CPU doesn't help us find the end of an
7756                         * NMI-blocked window if the guest runs with IRQs
7757                         * disabled. So we pull the trigger after 1 s of
7758                         * futile waiting, but inform the user about this.
7759                         */
7760                        printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
7761                               "state on VCPU %d after 1 s timeout\n",
7762                               __func__, vcpu->vcpu_id);
7763                        vmx->soft_vnmi_blocked = 0;
7764                }
7765        }
7766
7767        if (exit_reason < kvm_vmx_max_exit_handlers
7768            && kvm_vmx_exit_handlers[exit_reason])
7769                return kvm_vmx_exit_handlers[exit_reason](vcpu);
7770        else {
7771                WARN_ONCE(1, "vmx: unexpected exit reason 0x%x\n", exit_reason);
7772                kvm_queue_exception(vcpu, UD_VECTOR);
7773                return 1;
7774        }
7775}
7776
7777static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
7778{
7779        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
7780
7781        if (is_guest_mode(vcpu) &&
7782                nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
7783                return;
7784
7785        if (irr == -1 || tpr < irr) {
7786                vmcs_write32(TPR_THRESHOLD, 0);
7787                return;
7788        }
7789
7790        vmcs_write32(TPR_THRESHOLD, irr);
7791}
7792
7793static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
7794{
7795        u32 sec_exec_control;
7796
7797        /*
7798         * There is no point in enabling virtualized x2APIC mode without
7799         * also enabling APICv.
7800         */
7801        if (!cpu_has_vmx_virtualize_x2apic_mode() ||
7802                                !vmx_vm_has_apicv(vcpu->kvm))
7803                return;
7804
7805        if (!vm_need_tpr_shadow(vcpu->kvm))
7806                return;
7807
7808        sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
7809
7810        if (set) {
7811                sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
7812                sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
7813        } else {
7814                sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
7815                sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
7816        }
7817        vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control);
7818
7819        vmx_set_msr_bitmap(vcpu);
7820}
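
/*
 * SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES (trap accesses to the xAPIC MMIO
 * page) and SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE (virtualize the x2APIC
 * MSR range) are mutually exclusive, so vmx_set_virtual_x2apic_mode() above
 * swaps one for the other and then rewrites the MSR bitmap to match.
 */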
7821
7822static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa)
7823{
7824        struct vcpu_vmx *vmx = to_vmx(vcpu);
7825
7826        /*
7827         * Currently we do not handle the nested case where L2 has an
7828         * APIC access page of its own; that page is still pinned.
7829         * Hence, we skip the case where the VCPU is in guest mode _and_
7830         * L1 prepared an APIC access page for L2.
7831         *
7832         * For the case where L1 and L2 share the same APIC access page
7833         * (flexpriority=Y but SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES clear
7834         * in the vmcs12), this function will only update either the vmcs01
7835         * or the vmcs02.  If the former, the vmcs02 will be updated by
7836         * prepare_vmcs02.  If the latter, the vmcs01 will be updated in
7837         * the next L2->L1 exit.
7838         */
7839        if (!is_guest_mode(vcpu) ||
7840            !nested_cpu_has2(vmx->nested.current_vmcs12,
7841                             SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
7842                vmcs_write64(APIC_ACCESS_ADDR, hpa);
7843}
7844
7845static void vmx_hwapic_isr_update(struct kvm *kvm, int isr)
7846{
7847        u16 status;
7848        u8 old;
7849
7850        if (isr == -1)
7851                isr = 0;
7852
7853        status = vmcs_read16(GUEST_INTR_STATUS);
7854        old = status >> 8;
7855        if (isr != old) {
7856                status &= 0xff;
7857                status |= isr << 8;
7858                vmcs_write16(GUEST_INTR_STATUS, status);
7859        }
7860}
7861
7862static void vmx_set_rvi(int vector)
7863{
7864        u16 status;
7865        u8 old;
7866
7867        if (vector == -1)
7868                vector = 0;
7869
7870        status = vmcs_read16(GUEST_INTR_STATUS);
7871        old = (u8)status & 0xff;
7872        if ((u8)vector != old) {
7873                status &= ~0xff;
7874                status |= (u8)vector;
7875                vmcs_write16(GUEST_INTR_STATUS, status);
7876        }
7877}
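
/*
 * GUEST_INTR_STATUS is a 16-bit field: the low byte is RVI, the highest
 * vector requesting service (written by vmx_set_rvi()), and the high byte
 * is SVI, the highest vector in service (written by vmx_hwapic_isr_update()).
 */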
7878
7879static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
7880{
7881        if (!is_guest_mode(vcpu)) {
7882                vmx_set_rvi(max_irr);
7883                return;
7884        }
7885
7886        if (max_irr == -1)
7887                return;
7888
7889        /*
7890         * In guest mode.  If a vmexit is needed, vmx_check_nested_events
7891         * handles it.
7892         */
7893        if (nested_exit_on_intr(vcpu))
7894                return;
7895
7896        /*
7897         * Else, fall back to pre-APICv interrupt injection since L2
7898         * is run without virtual interrupt delivery.
7899         */
7900        if (!kvm_event_needs_reinjection(vcpu) &&
7901            vmx_interrupt_allowed(vcpu)) {
7902                kvm_queue_interrupt(vcpu, max_irr, false);
7903                vmx_inject_irq(vcpu);
7904        }
7905}
7906
7907static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
7908{
7909        if (!vmx_vm_has_apicv(vcpu->kvm))
7910                return;
7911
7912        vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);
7913        vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]);
7914        vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]);
7915        vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
7916}
7917
7918static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
7919{
7920        u32 exit_intr_info;
7921
7922        if (!(vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY
7923              || vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI))
7924                return;
7925
7926        vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
7927        exit_intr_info = vmx->exit_intr_info;
7928
7929        /* Handle machine checks before interrupts are enabled */
7930        if (is_machine_check(exit_intr_info))
7931                kvm_machine_check();
7932
7933        /* We need to handle NMIs before interrupts are enabled */
7934        if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
7935            (exit_intr_info & INTR_INFO_VALID_MASK)) {
7936                kvm_before_handle_nmi(&vmx->vcpu);
7937                asm("int $2");
7938                kvm_after_handle_nmi(&vmx->vcpu);
7939        }
7940}
7941
7942static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
7943{
7944        u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
7945
7946        /*
7947         * If an external interrupt exists, the IF bit is set in rflags/eflags
7948         * on the interrupt stack frame, so interrupts will be re-enabled on
7949         * return from the interrupt handler.
7950         */
7951        if ((exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK))
7952                        == (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR)) {
7953                unsigned int vector;
7954                unsigned long entry;
7955                gate_desc *desc;
7956                struct vcpu_vmx *vmx = to_vmx(vcpu);
7957#ifdef CONFIG_X86_64
7958                unsigned long tmp;
7959#endif
7960
7961                vector =  exit_intr_info & INTR_INFO_VECTOR_MASK;
7962                desc = (gate_desc *)vmx->host_idt_base + vector;
7963                entry = gate_offset(*desc);
7964                asm volatile(
7965#ifdef CONFIG_X86_64
7966                        "mov %%" _ASM_SP ", %[sp]\n\t"
7967                        "and $0xfffffffffffffff0, %%" _ASM_SP "\n\t"
7968                        "push $%c[ss]\n\t"
7969                        "push %[sp]\n\t"
7970#endif
7971                        "pushf\n\t"
7972                        "orl $0x200, (%%" _ASM_SP ")\n\t"
7973                        __ASM_SIZE(push) " $%c[cs]\n\t"
7974                        "call *%[entry]\n\t"
7975                        :
7976#ifdef CONFIG_X86_64
7977                        [sp]"=&r"(tmp)
7978#endif
7979                        :
7980                        [entry]"r"(entry),
7981                        [ss]"i"(__KERNEL_DS),
7982                        [cs]"i"(__KERNEL_CS)
7983                        );
7984        } else
7985                local_irq_enable();
7986}
7987
7988static bool vmx_mpx_supported(void)
7989{
7990        return (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_BNDCFGS) &&
7991                (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS);
7992}
7993
7994static bool vmx_xsaves_supported(void)
7995{
7996        return vmcs_config.cpu_based_2nd_exec_ctrl &
7997                SECONDARY_EXEC_XSAVES;
7998}
7999
8000static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
8001{
8002        u32 exit_intr_info;
8003        bool unblock_nmi;
8004        u8 vector;
8005        bool idtv_info_valid;
8006
8007        idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;
8008
8009        if (cpu_has_virtual_nmis()) {
8010                if (vmx->nmi_known_unmasked)
8011                        return;
8012                /*
8013                 * Can't use vmx->exit_intr_info since we're not sure what
8014                 * the exit reason is.
8015                 */
8016                exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
8017                unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
8018                vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
8019                /*
8020                 * SDM 3: 27.7.1.2 (September 2008)
8021                 * Re-set bit "block by NMI" before VM entry if vmexit caused by
8022                 * a guest IRET fault.
8023                 * SDM 3: 23.2.2 (September 2008)
8024                 * Bit 12 is undefined in any of the following cases:
8025                 *  If the VM exit sets the valid bit in the IDT-vectoring
8026                 *   information field.
8027                 *  If the VM exit is due to a double fault.
8028                 */
8029                if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
8030                    vector != DF_VECTOR && !idtv_info_valid)
8031                        vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
8032                                      GUEST_INTR_STATE_NMI);
8033                else
8034                        vmx->nmi_known_unmasked =
8035                                !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
8036                                  & GUEST_INTR_STATE_NMI);
8037        } else if (unlikely(vmx->soft_vnmi_blocked))
8038                vmx->vnmi_blocked_time +=
8039                        ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time));
8040}
8041
8042static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
8043                                      u32 idt_vectoring_info,
8044                                      int instr_len_field,
8045                                      int error_code_field)
8046{
8047        u8 vector;
8048        int type;
8049        bool idtv_info_valid;
8050
8051        idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
8052
8053        vcpu->arch.nmi_injected = false;
8054        kvm_clear_exception_queue(vcpu);
8055        kvm_clear_interrupt_queue(vcpu);
8056
8057        if (!idtv_info_valid)
8058                return;
8059
8060        kvm_make_request(KVM_REQ_EVENT, vcpu);
8061
8062        vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
8063        type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
8064
8065        switch (type) {
8066        case INTR_TYPE_NMI_INTR:
8067                vcpu->arch.nmi_injected = true;
8068                /*
8069                 * SDM 3: 27.7.1.2 (September 2008)
8070                 * Clear bit "block by NMI" before VM entry if an NMI
8071                 * delivery faulted.
8072                 */
8073                vmx_set_nmi_mask(vcpu, false);
8074                break;
8075        case INTR_TYPE_SOFT_EXCEPTION:
8076                vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
8077                /* fall through */
8078        case INTR_TYPE_HARD_EXCEPTION:
8079                if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
8080                        u32 err = vmcs_read32(error_code_field);
8081                        kvm_requeue_exception_e(vcpu, vector, err);
8082                } else
8083                        kvm_requeue_exception(vcpu, vector);
8084                break;
8085        case INTR_TYPE_SOFT_INTR:
8086                vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
8087                /* fall through */
8088        case INTR_TYPE_EXT_INTR:
8089                kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR);
8090                break;
8091        default:
8092                break;
8093        }
8094}
8095
8096static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
8097{
8098        __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info,
8099                                  VM_EXIT_INSTRUCTION_LEN,
8100                                  IDT_VECTORING_ERROR_CODE);
8101}
8102
8103static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
8104{
8105        __vmx_complete_interrupts(vcpu,
8106                                  vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
8107                                  VM_ENTRY_INSTRUCTION_LEN,
8108                                  VM_ENTRY_EXCEPTION_ERROR_CODE);
8109
8110        vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
8111}
8112
8113static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
8114{
8115        int i, nr_msrs;
8116        struct perf_guest_switch_msr *msrs;
8117
8118        msrs = perf_guest_get_msrs(&nr_msrs);
8119
8120        if (!msrs)
8121                return;
8122
8123        for (i = 0; i < nr_msrs; i++)
8124                if (msrs[i].host == msrs[i].guest)
8125                        clear_atomic_switch_msr(vmx, msrs[i].msr);
8126                else
8127                        add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest,
8128                                        msrs[i].host);
8129}
8130
8131static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
8132{
8133        struct vcpu_vmx *vmx = to_vmx(vcpu);
8134        unsigned long debugctlmsr, cr4;
8135
8136        /* Record the guest's net vcpu time for enforced NMI injections. */
8137        if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
8138                vmx->entry_time = ktime_get();
8139
8140        /* Don't enter VMX if guest state is invalid; let the exit handler
8141           start emulation until we arrive back at a valid state. */
8142        if (vmx->emulation_required)
8143                return;
8144
8145        if (vmx->ple_window_dirty) {
8146                vmx->ple_window_dirty = false;
8147                vmcs_write32(PLE_WINDOW, vmx->ple_window);
8148        }
8149
8150        if (vmx->nested.sync_shadow_vmcs) {
8151                copy_vmcs12_to_shadow(vmx);
8152                vmx->nested.sync_shadow_vmcs = false;
8153        }
8154
8155        if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
8156                vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
8157        if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
8158                vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
8159
8160        cr4 = cr4_read_shadow();
8161        if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) {
8162                vmcs_writel(HOST_CR4, cr4);
8163                vmx->host_state.vmcs_host_cr4 = cr4;
8164        }
8165
8166        /* When single-stepping over STI and MOV SS, we must clear the
8167         * corresponding interruptibility bits in the guest state. Otherwise
8168         * vmentry fails as it then expects bit 14 (BS) in pending debug
8169         * exceptions to be set, but that's not correct for the guest debugging
8170         * case. */
8171        if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
8172                vmx_set_interrupt_shadow(vcpu, 0);
8173
8174        atomic_switch_perf_msrs(vmx);
8175        debugctlmsr = get_debugctlmsr();
8176
8177        vmx->__launched = vmx->loaded_vmcs->launched;
8178        asm(
8179                /* Store host registers */
8180                "push %%" _ASM_DX "; push %%" _ASM_BP ";"
8181                "push %%" _ASM_CX " \n\t" /* placeholder for guest rcx */
8182                "push %%" _ASM_CX " \n\t"
8183                "cmp %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
8184                "je 1f \n\t"
8185                "mov %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
8186                __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
8187                "1: \n\t"
8188                /* Reload cr2 if changed */
8189                "mov %c[cr2](%0), %%" _ASM_AX " \n\t"
8190                "mov %%cr2, %%" _ASM_DX " \n\t"
8191                "cmp %%" _ASM_AX ", %%" _ASM_DX " \n\t"
8192                "je 2f \n\t"
8193                "mov %%" _ASM_AX", %%cr2 \n\t"
8194                "2: \n\t"
8195                /* Check if vmlaunch or vmresume is needed */
8196                "cmpl $0, %c[launched](%0) \n\t"
8197                /* Load guest registers.  Don't clobber flags. */
8198                "mov %c[rax](%0), %%" _ASM_AX " \n\t"
8199                "mov %c[rbx](%0), %%" _ASM_BX " \n\t"
8200                "mov %c[rdx](%0), %%" _ASM_DX " \n\t"
8201                "mov %c[rsi](%0), %%" _ASM_SI " \n\t"
8202                "mov %c[rdi](%0), %%" _ASM_DI " \n\t"
8203                "mov %c[rbp](%0), %%" _ASM_BP " \n\t"
8204#ifdef CONFIG_X86_64
8205                "mov %c[r8](%0),  %%r8  \n\t"
8206                "mov %c[r9](%0),  %%r9  \n\t"
8207                "mov %c[r10](%0), %%r10 \n\t"
8208                "mov %c[r11](%0), %%r11 \n\t"
8209                "mov %c[r12](%0), %%r12 \n\t"
8210                "mov %c[r13](%0), %%r13 \n\t"
8211                "mov %c[r14](%0), %%r14 \n\t"
8212                "mov %c[r15](%0), %%r15 \n\t"
8213#endif
8214                "mov %c[rcx](%0), %%" _ASM_CX " \n\t" /* kills %0 (ecx) */
8215
8216                /* Enter guest mode */
8217                "jne 1f \n\t"
8218                __ex(ASM_VMX_VMLAUNCH) "\n\t"
8219                "jmp 2f \n\t"
8220                "1: " __ex(ASM_VMX_VMRESUME) "\n\t"
8221                "2: "
8222                /* Save guest registers, load host registers, keep flags */
8223                "mov %0, %c[wordsize](%%" _ASM_SP ") \n\t"
8224                "pop %0 \n\t"
8225                "mov %%" _ASM_AX ", %c[rax](%0) \n\t"
8226                "mov %%" _ASM_BX ", %c[rbx](%0) \n\t"
8227                __ASM_SIZE(pop) " %c[rcx](%0) \n\t"
8228                "mov %%" _ASM_DX ", %c[rdx](%0) \n\t"
8229                "mov %%" _ASM_SI ", %c[rsi](%0) \n\t"
8230                "mov %%" _ASM_DI ", %c[rdi](%0) \n\t"
8231                "mov %%" _ASM_BP ", %c[rbp](%0) \n\t"
8232#ifdef CONFIG_X86_64
8233                "mov %%r8,  %c[r8](%0) \n\t"
8234                "mov %%r9,  %c[r9](%0) \n\t"
8235                "mov %%r10, %c[r10](%0) \n\t"
8236                "mov %%r11, %c[r11](%0) \n\t"
8237                "mov %%r12, %c[r12](%0) \n\t"
8238                "mov %%r13, %c[r13](%0) \n\t"
8239                "mov %%r14, %c[r14](%0) \n\t"
8240                "mov %%r15, %c[r15](%0) \n\t"
8241#endif
8242                "mov %%cr2, %%" _ASM_AX "   \n\t"
8243                "mov %%" _ASM_AX ", %c[cr2](%0) \n\t"
8244
8245                "pop  %%" _ASM_BP "; pop  %%" _ASM_DX " \n\t"
8246                "setbe %c[fail](%0) \n\t"
8247                ".pushsection .rodata \n\t"
8248                ".global vmx_return \n\t"
8249                "vmx_return: " _ASM_PTR " 2b \n\t"
8250                ".popsection"
8251              : : "c"(vmx), "d"((unsigned long)HOST_RSP),
8252                [launched]"i"(offsetof(struct vcpu_vmx, __launched)),
8253                [fail]"i"(offsetof(struct vcpu_vmx, fail)),
8254                [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
8255                [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
8256                [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
8257                [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
8258                [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])),
8259                [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])),
8260                [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])),
8261                [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])),
8262#ifdef CONFIG_X86_64
8263                [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])),
8264                [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])),
8265                [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])),
8266                [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])),
8267                [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])),
8268                [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])),
8269                [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
8270                [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
8271#endif
8272                [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)),
8273                [wordsize]"i"(sizeof(ulong))
8274              : "cc", "memory"
8275#ifdef CONFIG_X86_64
8276                , "rax", "rbx", "rdi", "rsi"
8277                , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
8278#else
8279                , "eax", "ebx", "edi", "esi"
8280#endif
8281              );
8282
8283        /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
8284        if (debugctlmsr)
8285                update_debugctlmsr(debugctlmsr);
8286
8287#ifndef CONFIG_X86_64
8288        /*
8289         * The sysexit path does not restore ds/es, so we must set them to
8290         * a reasonable value ourselves.
8291         *
8292         * We can't defer this to vmx_load_host_state() since that function
8293         * may be executed in interrupt context, which saves and restores segments
8294         * around it, nullifying its effect.
8295         */
8296        loadsegment(ds, __USER_DS);
8297        loadsegment(es, __USER_DS);
8298#endif
8299
8300        vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
8301                                  | (1 << VCPU_EXREG_RFLAGS)
8302                                  | (1 << VCPU_EXREG_PDPTR)
8303                                  | (1 << VCPU_EXREG_SEGMENTS)
8304                                  | (1 << VCPU_EXREG_CR3));
8305        vcpu->arch.regs_dirty = 0;
8306
8307        vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
8308
8309        vmx->loaded_vmcs->launched = 1;
8310
8311        vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
8312        trace_kvm_exit(vmx->exit_reason, vcpu, KVM_ISA_VMX);
8313
8314        /*
8315         * the KVM_REQ_EVENT optimization bit is only on for one entry, and if
8316         * we did not inject a still-pending event to L1 now because of
8317         * nested_run_pending, we need to re-enable this bit.
8318         */
8319        if (vmx->nested.nested_run_pending)
8320                kvm_make_request(KVM_REQ_EVENT, vcpu);
8321
8322        vmx->nested.nested_run_pending = 0;
8323
8324        vmx_complete_atomic_exit(vmx);
8325        vmx_recover_nmi_blocking(vmx);
8326        vmx_complete_interrupts(vmx);
8327}
8328
8329static void vmx_load_vmcs01(struct kvm_vcpu *vcpu)
8330{
8331        struct vcpu_vmx *vmx = to_vmx(vcpu);
8332        int cpu;
8333
8334        if (vmx->loaded_vmcs == &vmx->vmcs01)
8335                return;
8336
8337        cpu = get_cpu();
8338        vmx->loaded_vmcs = &vmx->vmcs01;
8339        vmx_vcpu_put(vcpu);
8340        vmx_vcpu_load(vcpu, cpu);
8341        vcpu->cpu = cpu;
8342        put_cpu();
8343}
8344
8345static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
8346{
8347        struct vcpu_vmx *vmx = to_vmx(vcpu);
8348
8349        if (enable_pml)
8350                vmx_disable_pml(vmx);
8351        free_vpid(vmx);
8352        leave_guest_mode(vcpu);
8353        vmx_load_vmcs01(vcpu);
8354        free_nested(vmx);
8355        free_loaded_vmcs(vmx->loaded_vmcs);
8356        kfree(vmx->guest_msrs);
8357        kvm_vcpu_uninit(vcpu);
8358        kmem_cache_free(kvm_vcpu_cache, vmx);
8359}
8360
8361static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
8362{
8363        int err;
8364        struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
8365        int cpu;
8366
8367        if (!vmx)
8368                return ERR_PTR(-ENOMEM);
8369
8370        allocate_vpid(vmx);
8371
8372        err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
8373        if (err)
8374                goto free_vcpu;
8375
8376        vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
8377        BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) * sizeof(vmx->guest_msrs[0])
8378                     > PAGE_SIZE);
8379
8380        err = -ENOMEM;
8381        if (!vmx->guest_msrs) {
8382                goto uninit_vcpu;
8383        }
8384
8385        vmx->loaded_vmcs = &vmx->vmcs01;
8386        vmx->loaded_vmcs->vmcs = alloc_vmcs();
8387        if (!vmx->loaded_vmcs->vmcs)
8388                goto free_msrs;
8389        if (!vmm_exclusive)
8390                kvm_cpu_vmxon(__pa(per_cpu(vmxarea, raw_smp_processor_id())));
8391        loaded_vmcs_init(vmx->loaded_vmcs);
8392        if (!vmm_exclusive)
8393                kvm_cpu_vmxoff();
8394
8395        cpu = get_cpu();
8396        vmx_vcpu_load(&vmx->vcpu, cpu);
8397        vmx->vcpu.cpu = cpu;
8398        err = vmx_vcpu_setup(vmx);
8399        vmx_vcpu_put(&vmx->vcpu);
8400        put_cpu();
8401        if (err)
8402                goto free_vmcs;
8403        if (vm_need_virtualize_apic_accesses(kvm)) {
8404                err = alloc_apic_access_page(kvm);
8405                if (err)
8406                        goto free_vmcs;
8407        }
8408
8409        if (enable_ept) {
8410                if (!kvm->arch.ept_identity_map_addr)
8411                        kvm->arch.ept_identity_map_addr =
8412                                VMX_EPT_IDENTITY_PAGETABLE_ADDR;
8413                err = init_rmode_identity_map(kvm);
8414                if (err)
8415                        goto free_vmcs;
8416        }
8417
8418        if (nested)
8419                nested_vmx_setup_ctls_msrs(vmx);
8420
8421        vmx->nested.posted_intr_nv = -1;
8422        vmx->nested.current_vmptr = -1ull;
8423        vmx->nested.current_vmcs12 = NULL;
8424
8425        /*
8426         * If PML is turned on, failure to enable PML just results in failure
8427         * to create the vcpu, so we can simplify the PML logic (by avoiding
8428         * cases such as PML being enabled on only some of the guest's vcpus,
8429         * etc.).
8430         */
8431        if (enable_pml) {
8432                err = vmx_enable_pml(vmx);
8433                if (err)
8434                        goto free_vmcs;
8435        }
8436
8437        return &vmx->vcpu;
8438
8439free_vmcs:
8440        free_loaded_vmcs(vmx->loaded_vmcs);
8441free_msrs:
8442        kfree(vmx->guest_msrs);
8443uninit_vcpu:
8444        kvm_vcpu_uninit(&vmx->vcpu);
8445free_vcpu:
8446        free_vpid(vmx);
8447        kmem_cache_free(kvm_vcpu_cache, vmx);
8448        return ERR_PTR(err);
8449}
8450
8451static void __init vmx_check_processor_compat(void *rtn)
8452{
8453        struct vmcs_config vmcs_conf;
8454
8455        *(int *)rtn = 0;
8456        if (setup_vmcs_config(&vmcs_conf) < 0)
8457                *(int *)rtn = -EIO;
8458        if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
8459                printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
8460                                smp_processor_id());
8461                *(int *)rtn = -EIO;
8462        }
8463}
8464
8465static int get_ept_level(void)
8466{
8467        return VMX_EPT_DEFAULT_GAW + 1;
8468}
8469
8470static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
8471{
8472        u64 ret;
8473
8474        /* For VT-d and EPT combination
8475         * 1. MMIO: always map as UC
8476         * 2. EPT with VT-d:
8477         *   a. VT-d without snooping control feature: can't guarantee the
8478         *      result; fall back to trusting the guest's memory type.
8479         *   b. VT-d with snooping control feature: the snooping control of the
8480         *      VT-d engine guarantees cache correctness. Just set it
8481         *      to WB to stay consistent with the host, same as item 3.
8482         * 3. EPT without VT-d: always map as WB and set IPAT=1 to stay
8483         *    consistent with host MTRRs.
8484         */
8485        if (is_mmio)
8486                ret = MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
8487        else if (kvm_arch_has_noncoherent_dma(vcpu->kvm))
8488                ret = kvm_get_guest_memory_type(vcpu, gfn) <<
8489                      VMX_EPT_MT_EPTE_SHIFT;
8490        else
8491                ret = (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT)
8492                        | VMX_EPT_IPAT_BIT;
8493
8494        return ret;
8495}
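
/*
 * Resulting EPTE bits, assuming the usual encodings (memory type in bits
 * 5:3, IPAT at bit 6, WB == 6, UC == 0): MMIO yields 0x0 (UC), while the
 * plain-EPT case yields (6 << 3) | (1 << 6) == 0x70, i.e. WB with the
 * guest's PAT ignored.
 */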
8496
8497static int vmx_get_lpage_level(void)
8498{
8499        if (enable_ept && !cpu_has_vmx_ept_1g_page())
8500                return PT_DIRECTORY_LEVEL;
8501        else
8502                /* Both shadow paging and EPT support 1GB pages */
8503                return PT_PDPE_LEVEL;
8504}
8505
8506static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
8507{
8508        struct kvm_cpuid_entry2 *best;
8509        struct vcpu_vmx *vmx = to_vmx(vcpu);
8510        u32 exec_control;
8511
8512        vmx->rdtscp_enabled = false;
8513        if (vmx_rdtscp_supported()) {
8514                exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
8515                if (exec_control & SECONDARY_EXEC_RDTSCP) {
8516                        best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
8517                        if (best && (best->edx & bit(X86_FEATURE_RDTSCP)))
8518                                vmx->rdtscp_enabled = true;
8519                        else {
8520                                exec_control &= ~SECONDARY_EXEC_RDTSCP;
8521                                vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
8522                                                exec_control);
8523                        }
8524                }
8525        }
8526
8527        /* Expose INVPCID only when PCID is exposed */
8528        best = kvm_find_cpuid_entry(vcpu, 0x7, 0);
8529        if (vmx_invpcid_supported() &&
8530            best && (best->ebx & bit(X86_FEATURE_INVPCID)) &&
8531            guest_cpuid_has_pcid(vcpu)) {
8532                exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
8533                exec_control |= SECONDARY_EXEC_ENABLE_INVPCID;
8534                vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
8535                             exec_control);
8536        } else {
8537                if (cpu_has_secondary_exec_ctrls()) {
8538                        exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
8539                        exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
8540                        vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
8541                                     exec_control);
8542                }
8543                if (best)
8544                        best->ebx &= ~bit(X86_FEATURE_INVPCID);
8545        }
8546}
8547
8548static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
8549{
8550        if (func == 1 && nested)
8551                entry->ecx |= bit(X86_FEATURE_VMX);
8552}
8553
8554static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
8555                struct x86_exception *fault)
8556{
8557        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
8558        u32 exit_reason;
8559
8560        if (fault->error_code & PFERR_RSVD_MASK)
8561                exit_reason = EXIT_REASON_EPT_MISCONFIG;
8562        else
8563                exit_reason = EXIT_REASON_EPT_VIOLATION;
8564        nested_vmx_vmexit(vcpu, exit_reason, 0, vcpu->arch.exit_qualification);
8565        vmcs12->guest_physical_address = fault->address;
8566}
8567
8568/* Callbacks for nested_ept_init_mmu_context: */
8569
8570static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
8571{
8572        /* return the page table to be shadowed - in our case, EPT12 */
8573        return get_vmcs12(vcpu)->ept_pointer;
8574}
8575
8576static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
8577{
8578        WARN_ON(mmu_is_nested(vcpu));
8579        kvm_init_shadow_ept_mmu(vcpu,
8580                        to_vmx(vcpu)->nested.nested_vmx_ept_caps &
8581                        VMX_EPT_EXECUTE_ONLY_BIT);
8582        vcpu->arch.mmu.set_cr3           = vmx_set_cr3;
8583        vcpu->arch.mmu.get_cr3           = nested_ept_get_cr3;
8584        vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault;
8585
8586        vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
8587}
8588
8589static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
8590{
8591        vcpu->arch.walk_mmu = &vcpu->arch.mmu;
8592}
8593
8594static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
8595                                            u16 error_code)
8596{
8597        bool inequality, bit;
8598
8599        bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
8600        inequality =
8601                (error_code & vmcs12->page_fault_error_code_mask) !=
8602                 vmcs12->page_fault_error_code_match;
8603        return inequality ^ bit;
8604}
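
/*
 * nested_vmx_is_page_fault_vmexit() implements the SDM's #PF filtering
 * rule: when PF_VECTOR is set in L1's exception bitmap, the fault is
 * reflected only if (error_code & mask) == match; when it is clear, the
 * fault is reflected only if (error_code & mask) != match.  The XOR of
 * "inequality" and "bit" expresses exactly this.
 */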
8605
8606static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
8607                struct x86_exception *fault)
8608{
8609        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
8610
8611        WARN_ON(!is_guest_mode(vcpu));
8612
8613        if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code))
8614                nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason,
8615                                  vmcs_read32(VM_EXIT_INTR_INFO),
8616                                  vmcs_readl(EXIT_QUALIFICATION));
8617        else
8618                kvm_inject_page_fault(vcpu, fault);
8619}
8620
8621static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
8622                                        struct vmcs12 *vmcs12)
8623{
8624        struct vcpu_vmx *vmx = to_vmx(vcpu);
8625
8626        if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
8627                /* TODO: Also verify bits beyond physical address width are 0 */
8628                if (!PAGE_ALIGNED(vmcs12->apic_access_addr))
8629                        return false;
8630
8631                /*
8632                 * Translate L1 physical address to host physical
8633                 * address for vmcs02. Keep the page pinned, so this
8634                 * physical address remains valid. We keep a reference
8635                 * to it so we can release it later.
8636                 */
8637                if (vmx->nested.apic_access_page) /* shouldn't happen */
8638                        nested_release_page(vmx->nested.apic_access_page);
8639                vmx->nested.apic_access_page =
8640                        nested_get_page(vcpu, vmcs12->apic_access_addr);
8641        }
8642
8643        if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
8644                /* TODO: Also verify bits beyond physical address width are 0 */
8645                if (!PAGE_ALIGNED(vmcs12->virtual_apic_page_addr))
8646                        return false;
8647
8648                if (vmx->nested.virtual_apic_page) /* shouldn't happen */
8649                        nested_release_page(vmx->nested.virtual_apic_page);
8650                vmx->nested.virtual_apic_page =
8651                        nested_get_page(vcpu, vmcs12->virtual_apic_page_addr);
8652
8653                /*
8654                 * Failing the vm entry is _not_ what the processor does
8655                 * but it's basically the only possibility we have.
8656                 * We could still enter the guest if CR8 load exits are
8657                 * enabled, CR8 store exits are enabled, and virtualize APIC
8658                 * access is disabled; in this case the processor would never
8659                 * use the TPR shadow and we could simply clear the bit from
8660                 * the execution control.  But such a configuration is useless,
8661                 * so let's keep the code simple.
8662                 */
8663                if (!vmx->nested.virtual_apic_page)
8664                        return false;
8665        }
8666
8667        if (nested_cpu_has_posted_intr(vmcs12)) {
8668                if (!IS_ALIGNED(vmcs12->posted_intr_desc_addr, 64))
8669                        return false;
8670
8671                if (vmx->nested.pi_desc_page) { /* shouldn't happen */
8672                        kunmap(vmx->nested.pi_desc_page);
8673                        nested_release_page(vmx->nested.pi_desc_page);
8674                }
8675                vmx->nested.pi_desc_page =
8676                        nested_get_page(vcpu, vmcs12->posted_intr_desc_addr);
8677                if (!vmx->nested.pi_desc_page)
8678                        return false;
8679
8680                vmx->nested.pi_desc =
8681                        (struct pi_desc *)kmap(vmx->nested.pi_desc_page);
8682                if (!vmx->nested.pi_desc) {
8683                        nested_release_page_clean(vmx->nested.pi_desc_page);
8684                        return false;
8685                }
8686                vmx->nested.pi_desc =
8687                        (struct pi_desc *)((void *)vmx->nested.pi_desc +
8688                        (unsigned long)(vmcs12->posted_intr_desc_addr &
8689                        (PAGE_SIZE - 1)));
8690        }
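        /*
         * Note on the pi_desc mapping above: the posted-interrupt descriptor
         * is only required to be 64-byte aligned, so it can sit anywhere
         * inside its page.  kmap() returns the start of the page, and the
         * low bits of posted_intr_desc_addr (addr & (PAGE_SIZE - 1)) give
         * the descriptor's offset within that page, which is why they are
         * added to the mapped address.
         */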
8691
8692        return true;
8693}
8694
8695static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu)
8696{
8697        u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value;
8698        struct vcpu_vmx *vmx = to_vmx(vcpu);
8699
8700        if (vcpu->arch.virtual_tsc_khz == 0)
8701                return;
8702
8703        /* Make sure short timeouts reliably trigger an immediate vmexit.
8704         * hrtimer_start does not guarantee this. */
8705        if (preemption_timeout <= 1) {
8706                vmx_preemption_timer_fn(&vmx->nested.preemption_timer);
8707                return;
8708        }
8709
8710        preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
8711        preemption_timeout *= 1000000;
8712        do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz);
8713        hrtimer_start(&vmx->nested.preemption_timer,
8714                      ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL);
8715}
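
/*
 * Worked example for the conversion above (illustrative numbers, not taken
 * from this file): the emulated VMX preemption timer ticks once every
 * 2^VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE guest TSC cycles, so the host
 * hrtimer period is
 *
 *   ns = (value << RATE) * 1000000 / virtual_tsc_khz
 *
 * Assuming RATE = 5 and a 2 GHz guest TSC (virtual_tsc_khz = 2000000), a
 * vmcs12 timer value of 1000 becomes (1000 << 5) * 1000000 / 2000000 =
 * 16000 ns, i.e. a 16 us hrtimer.
 */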
8716
8717static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
8718                                                struct vmcs12 *vmcs12)
8719{
8720        int maxphyaddr;
8721        u64 addr;
8722
8723        if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
8724                return 0;
8725
8726        if (vmcs12_read_any(vcpu, MSR_BITMAP, &addr)) {
8727                WARN_ON(1);
8728                return -EINVAL;
8729        }
8730        maxphyaddr = cpuid_maxphyaddr(vcpu);
8731
8732        if (!PAGE_ALIGNED(addr) ||
8733            ((addr + PAGE_SIZE - 1) >> maxphyaddr))
8734                return -EINVAL;
8735
8736        return 0;
8737}
8738
8739/*
8740 * Merge L0's and L1's MSR bitmaps; return false to indicate that
8741 * we do not use the hardware MSR bitmap.
8742 */
8743static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
8744                                               struct vmcs12 *vmcs12)
8745{
8746        int msr;
8747        struct page *page;
8748        unsigned long *msr_bitmap;
8749
8750        if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
8751                return false;
8752
8753        page = nested_get_page(vcpu, vmcs12->msr_bitmap);
8754        if (!page) {
8755                WARN_ON(1);
8756                return false;
8757        }
8758        msr_bitmap = (unsigned long *)kmap(page);
8759        if (!msr_bitmap) {
8760                nested_release_page_clean(page);
8761                WARN_ON(1);
8762                return false;
8763        }
8764
8765        if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
8766                if (nested_cpu_has_apic_reg_virt(vmcs12))
8767                        for (msr = 0x800; msr <= 0x8ff; msr++)
8768                                nested_vmx_disable_intercept_for_msr(
8769                                        msr_bitmap,
8770                                        vmx_msr_bitmap_nested,
8771                                        msr, MSR_TYPE_R);
8772                /* TPR is allowed */
8773                nested_vmx_disable_intercept_for_msr(msr_bitmap,
8774                                vmx_msr_bitmap_nested,
8775                                APIC_BASE_MSR + (APIC_TASKPRI >> 4),
8776                                MSR_TYPE_R | MSR_TYPE_W);
8777                if (nested_cpu_has_vid(vmcs12)) {
8778                        /* EOI and self-IPI are allowed */
8779                        nested_vmx_disable_intercept_for_msr(
8780                                msr_bitmap,
8781                                vmx_msr_bitmap_nested,
8782                                APIC_BASE_MSR + (APIC_EOI >> 4),
8783                                MSR_TYPE_W);
8784                        nested_vmx_disable_intercept_for_msr(
8785                                msr_bitmap,
8786                                vmx_msr_bitmap_nested,
8787                                APIC_BASE_MSR + (APIC_SELF_IPI >> 4),
8788                                MSR_TYPE_W);
8789                }
8790        } else {
8791                /*
8792                 * Enable read intercepts for all the x2apic
8793                 * MSRs. We must not rely on vmcs12 for any
8794                 * optimizations here; it may have been modified
8795                 * by L1.
8796                 */
8797                for (msr = 0x800; msr <= 0x8ff; msr++)
8798                        __vmx_enable_intercept_for_msr(
8799                                vmx_msr_bitmap_nested,
8800                                msr,
8801                                MSR_TYPE_R);
8802
8803                __vmx_enable_intercept_for_msr(
8804                                vmx_msr_bitmap_nested,
8805                                APIC_BASE_MSR + (APIC_TASKPRI >> 4),
8806                                MSR_TYPE_W);
8807                __vmx_enable_intercept_for_msr(
8808                                vmx_msr_bitmap_nested,
8809                                APIC_BASE_MSR + (APIC_EOI >> 4),
8810                                MSR_TYPE_W);
8811                __vmx_enable_intercept_for_msr(
8812                                vmx_msr_bitmap_nested,
8813                                APIC_BASE_MSR + (APIC_SELF_IPI >> 4),
8814                                MSR_TYPE_W);
8815        }
8816        kunmap(page);
8817        nested_release_page_clean(page);
8818
8819        return true;
8820}
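
/*
 * Note on the APIC_BASE_MSR + (reg >> 4) expressions above: x2APIC exposes
 * each 16-byte-spaced xAPIC MMIO register as an MSR starting at 0x800, so
 * for example APIC_TASKPRI (offset 0x80) maps to MSR 0x800 + (0x80 >> 4) =
 * 0x808 and APIC_EOI (offset 0xB0) maps to MSR 0x80b.
 */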
8821
8822static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
8823                                           struct vmcs12 *vmcs12)
8824{
8825        if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
8826            !nested_cpu_has_apic_reg_virt(vmcs12) &&
8827            !nested_cpu_has_vid(vmcs12) &&
8828            !nested_cpu_has_posted_intr(vmcs12))
8829                return 0;
8830
8831        /*
8832         * If virtualize x2apic mode is enabled,
8833         * virtualize apic access must be disabled.
8834         */
8835        if (nested_cpu_has_virt_x2apic_mode(vmcs12) &&
8836            nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
8837                return -EINVAL;
8838
8839        /*
8840         * If virtual interrupt delivery is enabled,
8841         * we must exit on external interrupts.
8842         */
8843        if (nested_cpu_has_vid(vmcs12) &&
8844           !nested_exit_on_intr(vcpu))
8845                return -EINVAL;
8846
8847        /*
8848         * Bits 15:8 must be zero in posted_intr_nv; the descriptor
8849         * address has already been checked in
8850         * nested_get_vmcs12_pages().
8851         */
8852        if (nested_cpu_has_posted_intr(vmcs12) &&
8853           (!nested_cpu_has_vid(vmcs12) ||
8854            !nested_exit_intr_ack_set(vcpu) ||
8855            vmcs12->posted_intr_nv & 0xff00))
8856                return -EINVAL;
8857
8858        /* tpr shadow is needed by all apicv features. */
8859        if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
8860                return -EINVAL;
8861
8862        return 0;
8863}
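
/*
 * To summarize the checks above: virtualize-x2apic-mode excludes
 * virtualize-APIC-accesses, virtual-interrupt delivery requires exiting on
 * external interrupts, posted interrupts additionally require virtual
 * interrupt delivery, the acknowledge-interrupt-on-exit control and a
 * notification vector with bits 15:8 clear, and every APICv feature
 * requires a TPR shadow.
 */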
8864
8865static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
8866                                       unsigned long count_field,
8867                                       unsigned long addr_field,
8868                                       int maxphyaddr)
8869{
8870        u64 count, addr;
8871
8872        if (vmcs12_read_any(vcpu, count_field, &count) ||
8873            vmcs12_read_any(vcpu, addr_field, &addr)) {
8874                WARN_ON(1);
8875                return -EINVAL;
8876        }
8877        if (count == 0)
8878                return 0;
8879        if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr ||
8880            (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) {
8881                pr_warn_ratelimited(
8882                        "nVMX: invalid MSR switch (0x%lx, %d, %llu, 0x%08llx)",
8883                        addr_field, maxphyaddr, count, addr);
8884                return -EINVAL;
8885        }
8886        return 0;
8887}
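
/*
 * Illustrative only: a minimal helper expressing the same bounds test as
 * above for a single MSR area, compiled out so it cannot affect the build.
 * The helper name is made up for illustration; each vmx_msr_entry is 16
 * bytes, so `count` entries span [addr, addr + count * 16 - 1] and both
 * ends must lie below 1 << maxphyaddr with addr 16-byte aligned.
 */
#if 0
static bool nested_msr_area_valid(u64 addr, u64 count, int maxphyaddr)
{
        u64 last = addr + count * sizeof(struct vmx_msr_entry) - 1;

        return IS_ALIGNED(addr, 16) &&
               !(addr >> maxphyaddr) && !(last >> maxphyaddr);
}
#endif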
8888
8889static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu,
8890                                                struct vmcs12 *vmcs12)
8891{
8892        int maxphyaddr;
8893
8894        if (vmcs12->vm_exit_msr_load_count == 0 &&
8895            vmcs12->vm_exit_msr_store_count == 0 &&
8896            vmcs12->vm_entry_msr_load_count == 0)
8897                return 0; /* Fast path */
8898        maxphyaddr = cpuid_maxphyaddr(vcpu);
8899        if (nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_LOAD_COUNT,
8900                                        VM_EXIT_MSR_LOAD_ADDR, maxphyaddr) ||
8901            nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_STORE_COUNT,
8902                                        VM_EXIT_MSR_STORE_ADDR, maxphyaddr) ||
8903            nested_vmx_check_msr_switch(vcpu, VM_ENTRY_MSR_LOAD_COUNT,
8904                                        VM_ENTRY_MSR_LOAD_ADDR, maxphyaddr))
8905                return -EINVAL;
8906        return 0;
8907}
8908
8909static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
8910                                       struct vmx_msr_entry *e)
8911{
8912        /* x2APIC MSR accesses are not allowed */
8913        if (apic_x2apic_mode(vcpu->arch.apic) && e->index >> 8 == 0x8)
8914                return -EINVAL;
8915        if (e->index == MSR_IA32_UCODE_WRITE || /* SDM Table 35-2 */
8916            e->index == MSR_IA32_UCODE_REV)
8917                return -EINVAL;
8918        if (e->reserved != 0)
8919                return -EINVAL;
8920        return 0;
8921}
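
/*
 * Note: the (e->index >> 8 == 0x8) test above covers the whole x2APIC MSR
 * range 0x800-0x8ff (rejected only while the local APIC is in x2APIC mode).
 * The microcode MSRs cited above and entries with non-zero reserved bits
 * are likewise disallowed in the VM-entry/VM-exit MSR-load/store lists.
 */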
8922
8923static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
8924                                     struct vmx_msr_entry *e)
8925{
8926        if (e->index == MSR_FS_BASE ||
8927            e->index == MSR_GS_BASE ||
8928            e->index == MSR_IA32_SMM_MONITOR_CTL || /* SMM is not supported */
8929            nested_vmx_msr_check_common(vcpu, e))
8930                return -EINVAL;
8931        return 0;
8932}
8933
8934static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
8935                                      struct vmx_msr_entry *e)
8936{
8937        if (e->index == MSR_IA32_SMBASE || /* SMM is not supported */
8938            nested_vmx_msr_check_common(vcpu, e))
8939                return -EINVAL;
8940        return 0;
8941}
8942
8943/*
8944 * Load the guest's/host's MSRs at nested entry/exit.
8945 * Return 0 on success, or the (1-based) index of the failing entry on failure.
8946 */
8947static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
8948{
8949        u32 i;
8950        struct vmx_msr_entry e;
8951        struct msr_data msr;
8952
8953        msr.host_initiated = false;
8954        for (i = 0; i < count; i++) {
8955                if (kvm_read_guest(vcpu->kvm, gpa + i * sizeof(e),
8956                                   &e, sizeof(e))) {
8957                        pr_warn_ratelimited(
8958                                "%s cannot read MSR entry (%u, 0x%08llx)\n",
8959                                __func__, i, gpa + i * sizeof(e));
8960                        goto fail;
8961                }
8962                if (nested_vmx_load_msr_check(vcpu, &e)) {
8963                        pr_warn_ratelimited(
8964                                "%s check failed (%u, 0x%x, 0x%x)\n",
8965                                __func__, i, e.index, e.reserved);
8966                        goto fail;
8967                }
8968                msr.index = e.index;
8969                msr.data = e.value;
8970                if (kvm_set_msr(vcpu, &msr)) {
8971                        pr_warn_ratelimited(
8972                                "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
8973                                __func__, i, e.index, e.value);
8974                        goto fail;
8975                }
8976        }
8977        return 0;
8978fail:
8979        return i + 1;
8980}
8981
8982static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
8983{
8984        u32 i;
8985        struct vmx_msr_entry e;
8986
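        /*
         * Each iteration reads only the first 8 bytes of the entry (index
         * and reserved) from guest memory; the value field is obtained via
         * kvm_get_msr() and written back to the guest separately below.
         */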
8987        for (i = 0; i < count; i++) {
8988                if (kvm_read_guest(vcpu->kvm,
8989                                   gpa + i * sizeof(e),
8990                                   &e, 2 * sizeof(u32))) {
8991                        pr_warn_ratelimited(
8992                                "%s cannot read MSR entry (%u, 0x%08llx)\n",
8993                                __func__, i, gpa + i * sizeof(e));
8994                        return -EINVAL;
8995                }
8996                if (nested_vmx_store_msr_check(vcpu, &e)) {
8997                        pr_warn_ratelimited(
8998                                "%s check failed (%u, 0x%x, 0x%x)\n",
8999                                __func__, i, e.index, e.reserved);
9000                        return -EINVAL;
9001                }
9002                if (kvm_get_msr(vcpu, e.index, &e.value)) {
9003                        pr_warn_ratelimited(
9004                                "%s cannot read MSR (%u, 0x%x)\n",
9005                                __func__, i, e.index);
9006                        return -EINVAL;
9007                }
9008                if (kvm_write_guest(vcpu->kvm,
9009                                    gpa + i * sizeof(e) +
9010                                        offsetof(struct vmx_msr_entry, value),
9011                                    &e.value, sizeof(e.value))) {
9012                        pr_warn_ratelimited(
9013                                "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
9014                                __func__, i, e.index, e.value);
9015                        return -EINVAL;
9016                }
9017        }
9018        return 0;
9019}
9020
9021/*
9022 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
9023 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
9024 * with L0's requirements for its own guest (a.k.a. vmcs01), so we can run
9025 * the L2 guest in a way that satisfies both L1's requests and our needs.
9026 * In addition to modifying the active vmcs (which is vmcs02), this function
9027 * also has necessary side effects, like setting various vcpu->arch fields.
9028 * vcpu->arch fields.
9029 */
9030static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
9031{
9032        struct vcpu_vmx *vmx = to_vmx(vcpu);
9033        u32 exec_control;
9034
9035        vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
9036        vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
9037        vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
9038        vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
9039        vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
9040        vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector);
9041        vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
9042        vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
9043        vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
9044        vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
9045        vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
9046        vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
9047        vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
9048        vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit);
9049        vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit);
9050        vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
9051        vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
9052        vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
9053        vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
9054        vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
9055        vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
9056        vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
9057        vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
9058        vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
9059        vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
9060        vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
9061        vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
9062        vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
9063        vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
9064        vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
9065        vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
9066        vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base);
9067        vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base);
9068        vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
9069        vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
9070        vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
9071
9072        if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) {
9073                kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
9074                vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
9075        } else {
9076                kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
9077                vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl);
9078        }
9079        vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
9080                vmcs12->vm_entry_intr_info_field);
9081        vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
9082                vmcs12->vm_entry_exception_error_code);
9083        vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
9084                vmcs12->vm_entry_instruction_len);
9085        vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
9086                vmcs12->guest_interruptibility_info);
9087        vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
9088        vmx_set_rflags(vcpu, vmcs12->guest_rflags);
9089        vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
9090                vmcs12->guest_pending_dbg_exceptions);
9091        vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
9092        vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
9093
9094        if (nested_cpu_has_xsaves(vmcs12))
9095                vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);
9096        vmcs_write64(VMCS_LINK_POINTER, -1ull);
9097
9098        exec_control = vmcs12->pin_based_vm_exec_control;
9099        exec_control |= vmcs_config.pin_based_exec_ctrl;
9100        exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
9101
9102        if (nested_cpu_has_posted_intr(vmcs12)) {
9103                /*
9104                 * Note that we use L0's vector here and in
9105                 * vmx_deliver_nested_posted_interrupt.
9106                 */
9107                vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
9108                vmx->nested.pi_pending = false;
9109                vmcs_write64(POSTED_INTR_NV, POSTED_INTR_VECTOR);
9110                vmcs_write64(POSTED_INTR_DESC_ADDR,
9111                        page_to_phys(vmx->nested.pi_desc_page) +
9112                        (unsigned long)(vmcs12->posted_intr_desc_addr &
9113                        (PAGE_SIZE - 1)));
9114        } else
9115                exec_control &= ~PIN_BASED_POSTED_INTR;
9116
9117        vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control);
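        /*
         * Summary of the pin-based merge above: the vmcs02 value is the OR
         * of L0's and L1's pin-based controls, except that the hardware
         * preemption timer bit is always cleared (that timer is emulated
         * with the hrtimer started below) and posted interrupts keep L0's
         * notification vector rather than L1's.
         */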
9118
9119        vmx->nested.preemption_timer_expired = false;
9120        if (nested_cpu_has_preemption_timer(vmcs12))
9121                vmx_start_preemption_timer(vcpu);
9122
9123        /*
9124         * Whether page-faults are trapped is determined by a combination of
9125         * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.
9126         * If enable_ept, L0 doesn't care about page faults and we should
9127         * set all of these to L1's desires. However, if !enable_ept, L0 does
9128         * care about (at least some) page faults, and because it is not easy
9129         * (if at all possible?) to merge L0 and L1's desires, we simply ask
9130         * to exit on each and every L2 page fault. This is done by setting
9131         * MASK=MATCH=0 and (see below) EB.PF=1.
9132         * Note that below we don't need special code to set EB.PF beyond the
9133         * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
9134         * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
9135         * !enable_ept, EB.PF is 1, so the "or" will always be 1.
9136         *
9137         * A problem with this approach (when !enable_ept) is that L1 may be
9138         * injected with more page faults than it asked for. This could have
9139         * caused problems, but in practice existing hypervisors don't care.
9140         * To fix this, we will need to emulate the PFEC checking (on the L1
9141         * page tables), using walk_addr(), when injecting PFs to L1.
9142         */
9143        vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
9144                enable_ept ? vmcs12->page_fault_error_code_mask : 0);
9145        vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
9146                enable_ept ? vmcs12->page_fault_error_code_match : 0);
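        /*
         * Restating the hardware rule the comment above relies on: a page
         * fault with error code PFEC causes a VM exit iff
         *
         *   EB.PF == ((PFEC & PFEC_MASK) == PFEC_MATCH)
         *
         * With MASK == MATCH == 0 the right-hand side is always true, so
         * every page fault exits whenever EB.PF is 1.
         */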
9147
9148        if (cpu_has_secondary_exec_ctrls()) {
9149                exec_control = vmx_secondary_exec_control(vmx);
9150                if (!vmx->rdtscp_enabled)
9151                        exec_control &= ~SECONDARY_EXEC_RDTSCP;
9152                /* Take the following fields only from vmcs12 */
9153                exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
9154                                  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
9155                                  SECONDARY_EXEC_APIC_REGISTER_VIRT);
9156                if (nested_cpu_has(vmcs12,
9157                                CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
9158                        exec_control |= vmcs12->secondary_vm_exec_control;
9159
9160                if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) {
9161                        /*
9162                         * If translation failed, no matter: This feature asks
9163                         * to exit when accessing the given address, and if it
9164                         * can never be accessed, this feature won't do
9165                         * anything anyway.
9166                         */
9167                        if (!vmx->nested.apic_access_page)
9168                                exec_control &=
9169                                  ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
9170                        else
9171                                vmcs_write64(APIC_ACCESS_ADDR,
9172                                  page_to_phys(vmx->nested.apic_access_page));
9173                } else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) &&
9174                            (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))) {
9175                        exec_control |=
9176                                SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
9177                        kvm_vcpu_reload_apic_access_page(vcpu);
9178                }
9179
9180                if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) {
9181                        vmcs_write64(EOI_EXIT_BITMAP0,
9182                                vmcs12->eoi_exit_bitmap0);
9183                        vmcs_write64(EOI_EXIT_BITMAP1,
9184                                vmcs12->eoi_exit_bitmap1);
9185                        vmcs_write64(EOI_EXIT_BITMAP2,
9186                                vmcs12->eoi_exit_bitmap2);
9187                        vmcs_write64(EOI_EXIT_BITMAP3,
9188                                vmcs12->eoi_exit_bitmap3);
9189                        vmcs_write16(GUEST_INTR_STATUS,
9190                                vmcs12->guest_intr_status);
9191                }
9192
9193                vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
9194        }
9195
9196
9197        /*
9198         * Set host-state according to L0's settings (vmcs12 is irrelevant here).
9199         * Some constant fields are set here by vmx_set_constant_host_state().
9200         * Other fields are different per CPU, and will be set later when
9201         * vmx_vcpu_load() is called, and when vmx_save_host_state() is called.
9202         */
9203        vmx_set_constant_host_state(vmx);
9204
9205        /*
9206         * HOST_RSP is normally set correctly in vmx_vcpu_run() just before
9207         * entry, but only if the current (host) sp changed from the value
9208         * we wrote last (vmx->host_rsp). This cache is no longer relevant
9209         * if we switch vmcs, and rather than hold a separate cache per vmcs,
9210         * here we just force the write to happen on entry.
9211         */
9212        vmx->host_rsp = 0;
9213
9214        exec_control = vmx_exec_control(vmx); /* L0's desires */
9215        exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
9216        exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
9217        exec_control &= ~CPU_BASED_TPR_SHADOW;
9218        exec_control |= vmcs12->cpu_based_vm_exec_control;
9219
9220        if (exec_control & CPU_BASED_TPR_SHADOW) {
9221                vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
9222                                page_to_phys(vmx->nested.virtual_apic_page));
9223                vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
9224        }
9225
9226        if (cpu_has_vmx_msr_bitmap() &&
9227            exec_control & CPU_BASED_USE_MSR_BITMAPS) {
9228                nested_vmx_merge_msr_bitmap(vcpu, vmcs12);
9229                /* MSR_BITMAP will be set by the vmx_set_efer() call below. */
9230        } else
9231                exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
9232
9233        /*
9234         * Merging of IO bitmap not currently supported.
9235         * Rather, exit every time.
9236         */
9237        exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
9238        exec_control |= CPU_BASED_UNCOND_IO_EXITING;
9239
9240        vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
9241
9242        /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
9243         * bitwise-or of what L1 wants to trap for L2, and what we want to
9244         * trap. Note that CR0.TS also needs updating - we do this later.
9245         */
9246        update_exception_bitmap(vcpu);
9247        vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
9248        vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
9249
9250        /* L2->L1 exit controls are emulated - the hardware exit is to L0 so
9251         * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
9252         * bits are further modified by vmx_set_efer() below.
9253         */
9254        vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
9255
9256        /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
9257         * emulated by vmx_set_efer(), below.
9258         */
9259        vm_entry_controls_init(vmx,
9260                (vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER &
9261                        ~VM_ENTRY_IA32E_MODE) |
9262                (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));
9263
9264        if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) {
9265                vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
9266                vcpu->arch.pat = vmcs12->guest_ia32_pat;
9267        } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
9268                vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
9269
9270
9271        set_cr4_guest_host_mask(vmx);
9272
9273        if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)
9274                vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
9275
9276        if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
9277                vmcs_write64(TSC_OFFSET,
9278                        vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset);
9279        else
9280                vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset);
9281
9282        if (enable_vpid) {
9283                /*
9284                 * Trivially support vpid by letting L2s share their parent
9285                 * L1's vpid. TODO: move to a more elaborate solution, giving
9286                 * each L2 its own vpid and exposing the vpid feature to L1.
9287                 */
9288                vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
9289                vmx_flush_tlb(vcpu);
9290        }
9291
9292        if (nested_cpu_has_ept(vmcs12)) {
9293                kvm_mmu_unload(vcpu);
9294                nested_ept_init_mmu_context(vcpu);
9295        }
9296
9297        if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)
9298                vcpu->arch.efer = vmcs12->guest_ia32_efer;
9299        else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
9300                vcpu->arch.efer |= (EFER_LMA | EFER_LME);
9301        else
9302                vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
9303        /* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
9304        vmx_set_efer(vcpu, vcpu->arch.efer);
9305
9306        /*
9307         * This sets GUEST_CR0 to vmcs12->guest_cr0, with possibly a modified
9308         * TS bit (for lazy fpu) and bits which we consider mandatory enabled.
9309         * The CR0_READ_SHADOW is what L2 should have expected to read given
9310         * the specifications by L1; it's not enough to take
9311         * vmcs12->cr0_read_shadow because our cr0_guest_host_mask may
9312         * have more bits set than L1 expected.
9313         */
9314        vmx_set_cr0(vcpu, vmcs12->guest_cr0);
9315        vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
9316
9317        vmx_set_cr4(vcpu, vmcs12->guest_cr4);
9318        vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
9319
9320        /* shadow page tables on either EPT or shadow page tables */
9321        kvm_set_cr3(vcpu, vmcs12->guest_cr3);
9322        kvm_mmu_reset_context(vcpu);
9323
9324        if (!enable_ept)
9325                vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
9326
9327        /*
9328         * L1 may access L2's PDPTRs, so save them to construct vmcs12.
9329         */
9330        if (enable_ept) {
9331                vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
9332                vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
9333                vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
9334                vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
9335        }
9336
9337        kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp);
9338        kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip);
9339}
9340
9341/*
9342 * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
9343 * for running an L2 nested guest.
9344 */
9345static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
9346{
9347        struct vmcs12 *vmcs12;
9348        struct vcpu_vmx *vmx = to_vmx(vcpu);
9349        int cpu;
9350        struct loaded_vmcs *vmcs02;
9351        bool ia32e;
9352        u32 msr_entry_idx;
9353
9354        if (!nested_vmx_check_permission(vcpu) ||
9355            !nested_vmx_check_vmcs12(vcpu))
9356                return 1;
9357
9358        skip_emulated_instruction(vcpu);
9359        vmcs12 = get_vmcs12(vcpu);
9360
9361        if (enable_shadow_vmcs)
9362                copy_shadow_to_vmcs12(vmx);
9363
9364        /*
9365         * The nested entry process starts with enforcing various prerequisites
9366         * on vmcs12 as required by the Intel SDM, and acting appropriately when
9367         * they fail: as the SDM explains, some conditions should cause the
9368         * instruction to fail, while others cause the instruction to seem to
9369         * succeed but return EXIT_REASON_INVALID_STATE instead.
9370         * To speed up the normal (success) code path, we avoid checking for
9371         * misconfigurations that will be caught anyway by the processor when it
9372         * uses the merged vmcs02.
9373         */
9374        if (vmcs12->launch_state == launch) {
9375                nested_vmx_failValid(vcpu,
9376                        launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
9377                               : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
9378                return 1;
9379        }
9380
9381        if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
9382            vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) {
9383                nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
9384                return 1;
9385        }
9386
9387        if (!nested_get_vmcs12_pages(vcpu, vmcs12)) {
9388                /*TODO: Also verify bits beyond physical address width are 0*/
9389                nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
9390                return 1;
9391        }
9392
9393        if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12)) {
9394                nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
9395                return 1;
9396        }
9397
9398        if (nested_vmx_check_apicv_controls(vcpu, vmcs12)) {
9399                nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
9400                return 1;
9401        }
9402
9403        if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12)) {
9404                nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
9405                return 1;
9406        }
9407
9408        if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
9409                                vmx->nested.nested_vmx_true_procbased_ctls_low,
9410                                vmx->nested.nested_vmx_procbased_ctls_high) ||
9411            !vmx_control_verify(vmcs12->secondary_vm_exec_control,
9412                                vmx->nested.nested_vmx_secondary_ctls_low,
9413                                vmx->nested.nested_vmx_secondary_ctls_high) ||
9414            !vmx_control_verify(vmcs12->pin_based_vm_exec_control,
9415                                vmx->nested.nested_vmx_pinbased_ctls_low,
9416                                vmx->nested.nested_vmx_pinbased_ctls_high) ||
9417            !vmx_control_verify(vmcs12->vm_exit_controls,
9418                                vmx->nested.nested_vmx_true_exit_ctls_low,
9419                                vmx->nested.nested_vmx_exit_ctls_high) ||
9420            !vmx_control_verify(vmcs12->vm_entry_controls,
9421                                vmx->nested.nested_vmx_true_entry_ctls_low,
9422                                vmx->nested.nested_vmx_entry_ctls_high)) {
9424                nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
9425                return 1;
9426        }
9427
9428        if (((vmcs12->host_cr0 & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) ||
9429            ((vmcs12->host_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) {
9430                nested_vmx_failValid(vcpu,
9431                        VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
9432                return 1;
9433        }
9434
9435        if (!nested_cr0_valid(vcpu, vmcs12->guest_cr0) ||
9436            ((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) {
9437                nested_vmx_entry_failure(vcpu, vmcs12,
9438                        EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
9439                return 1;
9440        }
9441        if (vmcs12->vmcs_link_pointer != -1ull) {
9442                nested_vmx_entry_failure(vcpu, vmcs12,
9443                        EXIT_REASON_INVALID_STATE, ENTRY_FAIL_VMCS_LINK_PTR);
9444                return 1;
9445        }
9446
9447        /*
9448         * If the load IA32_EFER VM-entry control is 1, the following checks
9449         * are performed on the field for the IA32_EFER MSR:
9450         * - Bits reserved in the IA32_EFER MSR must be 0.
9451         * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of
9452         *   the IA-32e mode guest VM-exit control. It must also be identical
9453         *   to bit 8 (LME) if bit 31 in the CR0 field (corresponding to
9454         *   CR0.PG) is 1.
9455         */
9456        if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER) {
9457                ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0;
9458                if (!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer) ||
9459                    ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) ||
9460                    ((vmcs12->guest_cr0 & X86_CR0_PG) &&
9461                     ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))) {
9462                        nested_vmx_entry_failure(vcpu, vmcs12,
9463                                EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
9464                        return 1;
9465                }
9466        }
9467
9468        /*
9469         * If the load IA32_EFER VM-exit control is 1, bits reserved in the
9470         * IA32_EFER MSR must be 0 in the field for that register. In addition,
9471         * the values of the LMA and LME bits in the field must each be that of
9472         * the host address-space size VM-exit control.
9473         */
9474        if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) {
9475                ia32e = (vmcs12->vm_exit_controls &
9476                         VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0;
9477                if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) ||
9478                    ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) ||
9479                    ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)) {
9480                        nested_vmx_entry_failure(vcpu, vmcs12,
9481                                EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
9482                        return 1;
9483                }
9484        }
9485
9486        /*
9487         * We're finally done with prerequisite checking, and can start with
9488         * the nested entry.
9489         */
9490
9491        vmcs02 = nested_get_current_vmcs02(vmx);
9492        if (!vmcs02)
9493                return -ENOMEM;
9494
9495        enter_guest_mode(vcpu);
9496
9497        vmx->nested.vmcs01_tsc_offset = vmcs_read64(TSC_OFFSET);
9498
9499        if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
9500                vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
9501
9502        cpu = get_cpu();
9503        vmx->loaded_vmcs = vmcs02;
9504        vmx_vcpu_put(vcpu);
9505        vmx_vcpu_load(vcpu, cpu);
9506        vcpu->cpu = cpu;
9507        put_cpu();
9508
9509        vmx_segment_cache_clear(vmx);
9510
9511        prepare_vmcs02(vcpu, vmcs12);
9512
9513        msr_entry_idx = nested_vmx_load_msr(vcpu,
9514                                            vmcs12->vm_entry_msr_load_addr,
9515                                            vmcs12->vm_entry_msr_load_count);
9516        if (msr_entry_idx) {
9517                leave_guest_mode(vcpu);
9518                vmx_load_vmcs01(vcpu);
9519                nested_vmx_entry_failure(vcpu, vmcs12,
9520                                EXIT_REASON_MSR_LOAD_FAIL, msr_entry_idx);
9521                return 1;
9522        }
9523
9524        vmcs12->launch_state = 1;
9525
9526        if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT)
9527                return kvm_emulate_halt(vcpu);
9528
9529        vmx->nested.nested_run_pending = 1;
9530
9531        /*
9532         * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
9533         * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
9534         * returned as far as L1 is concerned. It will only return (and set
9535         * the success flag) when L2 exits (see nested_vmx_vmexit()).
9536         */
9537        return 1;
9538}
9539
9540/*
9541 * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date
9542 * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK).
9543 * This function returns the new value we should put in vmcs12.guest_cr0.
9544 * It's not enough to just return the vmcs02 GUEST_CR0. Rather,
9545 *  1. Bits that neither L0 nor L1 trapped were set directly by L2 and are now
9546 *     available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0
9547 *     didn't trap the bit, because if L1 did, so would L0).
9548 *  2. Bits that L1 asked to trap (and therefore L0 also did) could not have
9549 *     been modified by L2, and L1 knows it. So just leave the old value of
9550 *     the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0
9551 *     isn't relevant, because if L0 traps this bit it can set it to anything.
9552 *  3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have
9553 *     changed these bits, and therefore they need to be updated, but L0
9554 *     didn't necessarily allow them to be changed in GUEST_CR0 - and rather
9555 *     put them in vmcs02 CR0_READ_SHADOW. So take these bits from there.
9556 */
9557static inline unsigned long
9558vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
9559{
9560        return
9561        /*1*/   (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) |
9562        /*2*/   (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) |
9563        /*3*/   (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask |
9564                        vcpu->arch.cr0_guest_owned_bits));
9565}
9566
9567static inline unsigned long
9568vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
9569{
9570        return
9571        /*1*/   (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) |
9572        /*2*/   (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) |
9573        /*3*/   (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask |
9574                        vcpu->arch.cr4_guest_owned_bits));
9575}
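
/*
 * Concrete illustration of the three-way merge described above, using a
 * single bit such as CR0.TS: if neither L0 nor L1 traps it (case 1), L2's
 * live value is read from vmcs02 GUEST_CR0; if L1 traps it (case 2), L2
 * cannot have changed it and the old vmcs12->guest_cr0 value is kept; if
 * only L0 traps it (case 3), the value L2 last tried to write is held in
 * vmcs02's CR0_READ_SHADOW and is taken from there.
 */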
9576
9577static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
9578                                       struct vmcs12 *vmcs12)
9579{
9580        u32 idt_vectoring;
9581        unsigned int nr;
9582
9583        if (vcpu->arch.exception.pending && vcpu->arch.exception.reinject) {
9584                nr = vcpu->arch.exception.nr;
9585                idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
9586
9587                if (kvm_exception_is_soft(nr)) {
9588                        vmcs12->vm_exit_instruction_len =
9589                                vcpu->arch.event_exit_inst_len;
9590                        idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION;
9591                } else
9592                        idt_vectoring |= INTR_TYPE_HARD_EXCEPTION;
9593
9594                if (vcpu->arch.exception.has_error_code) {
9595                        idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK;
9596                        vmcs12->idt_vectoring_error_code =
9597                                vcpu->arch.exception.error_code;
9598                }
9599
9600                vmcs12->idt_vectoring_info_field = idt_vectoring;
9601        } else if (vcpu->arch.nmi_injected) {
9602                vmcs12->idt_vectoring_info_field =
9603                        INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR;
9604        } else if (vcpu->arch.interrupt.pending) {
9605                nr = vcpu->arch.interrupt.nr;
9606                idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
9607
9608                if (vcpu->arch.interrupt.soft) {
9609                        idt_vectoring |= INTR_TYPE_SOFT_INTR;
9610                        vmcs12->vm_entry_instruction_len =
9611                                vcpu->arch.event_exit_inst_len;
9612                } else
9613                        idt_vectoring |= INTR_TYPE_EXT_INTR;
9614
9615                vmcs12->idt_vectoring_info_field = idt_vectoring;
9616        }
9617}
9618
9619static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
9620{
9621        struct vcpu_vmx *vmx = to_vmx(vcpu);
9622
9623        if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) &&
9624            vmx->nested.preemption_timer_expired) {
9625                if (vmx->nested.nested_run_pending)
9626                        return -EBUSY;
9627                nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0);
9628                return 0;
9629        }
9630
9631        if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) {
9632                if (vmx->nested.nested_run_pending ||
9633                    vcpu->arch.interrupt.pending)
9634                        return -EBUSY;
9635                nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
9636                                  NMI_VECTOR | INTR_TYPE_NMI_INTR |
9637                                  INTR_INFO_VALID_MASK, 0);
9638                /*
9639                 * The NMI-triggered VM exit counts as injection:
9640                 * clear this one and block further NMIs.
9641                 */
9642                vcpu->arch.nmi_pending = 0;
9643                vmx_set_nmi_mask(vcpu, true);
9644                return 0;
9645        }
9646
9647        if ((kvm_cpu_has_interrupt(vcpu) || external_intr) &&
9648            nested_exit_on_intr(vcpu)) {
9649                if (vmx->nested.nested_run_pending)
9650                        return -EBUSY;
9651                nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
9652                return 0;
9653        }
9654
9655        return vmx_complete_nested_posted_interrupt(vcpu);
9656}
9657
9658static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
9659{
9660        ktime_t remaining =
9661                hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer);
9662        u64 value;
9663
9664        if (ktime_to_ns(remaining) <= 0)
9665                return 0;
9666
9667        value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz;
9668        do_div(value, 1000000);
9669        return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
9670}
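
/*
 * This is the inverse of the scaling done in vmx_start_preemption_timer():
 * the remaining host nanoseconds are converted back to guest TSC cycles
 * (ns * virtual_tsc_khz / 1000000) and then to preemption-timer ticks by
 * shifting right by VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE.
 */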
9671
9672/*
9673 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
9674 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
9675 * and this function updates it to reflect the changes to the guest state while
9676 * L2 was running (and perhaps caused some exits that were handled directly by L0
9677 * without going back to L1), and to reflect the exit reason.
9678 * Note that we do not have to copy all VMCS fields here, just those that
9679 * could have been changed by the L2 guest or the exit - i.e., the guest-state and
9680 * exit-information fields only. Other fields are modified by L1 with VMWRITE,
9681 * which already writes to vmcs12 directly.
9682 */
9683static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
9684                           u32 exit_reason, u32 exit_intr_info,
9685                           unsigned long exit_qualification)
9686{
9687        /* update guest state fields: */
9688        vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
9689        vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
9690
9691        vmcs12->guest_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
9692        vmcs12->guest_rip = kvm_register_read(vcpu, VCPU_REGS_RIP);
9693        vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
9694
9695        vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
9696        vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
9697        vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
9698        vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
9699        vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
9700        vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
9701        vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
9702        vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
9703        vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
9704        vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
9705        vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
9706        vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
9707        vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
9708        vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
9709        vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
9710        vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
9711        vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
9712        vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
9713        vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
9714        vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
9715        vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
9716        vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
9717        vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
9718        vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
9719        vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
9720        vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
9721        vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE);
9722        vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
9723        vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
9724        vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
9725        vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
9726        vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
9727        vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
9728        vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
9729        vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
9730        vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
9731
9732        vmcs12->guest_interruptibility_info =
9733                vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
9734        vmcs12->guest_pending_dbg_exceptions =
9735                vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
9736        if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
9737                vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT;
9738        else
9739                vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE;
9740
9741        if (nested_cpu_has_preemption_timer(vmcs12)) {
9742                if (vmcs12->vm_exit_controls &
9743                    VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
9744                        vmcs12->vmx_preemption_timer_value =
9745                                vmx_get_preemption_timer_value(vcpu);
9746                hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);
9747        }
9748
9749        /*
9750         * In some cases (usually, nested EPT), L2 is allowed to change its
9751         * own CR3 without exiting. If it has changed it, we must keep it.
9752         * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined
9753         * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12.
9754         *
9755         * Additionally, restore L2's PDPTRs to vmcs12.
9756         */
9757        if (enable_ept) {
9758                vmcs12->guest_cr3 = vmcs_read64(GUEST_CR3);
9759                vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
9760                vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
9761                vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
9762                vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
9763        }
9764
9765        if (nested_cpu_has_vid(vmcs12))
9766                vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS);
9767
9768        vmcs12->vm_entry_controls =
9769                (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
9770                (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
9771
9772        if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) {
9773                kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);
9774                vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
9775        }
9776
9777        /* TODO: These cannot have changed unless we have MSR bitmaps and
9778         * the relevant bit asks not to trap the change */
9779        if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
9780                vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
9781        if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER)
9782                vmcs12->guest_ia32_efer = vcpu->arch.efer;
9783        vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
9784        vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
9785        vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
9786        if (vmx_mpx_supported())
9787                vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
9788        if (nested_cpu_has_xsaves(vmcs12))
9789                vmcs12->xss_exit_bitmap = vmcs_read64(XSS_EXIT_BITMAP);
9790
9791        /* update exit information fields: */
9792
9793        vmcs12->vm_exit_reason = exit_reason;
9794        vmcs12->exit_qualification = exit_qualification;
9795
9796        vmcs12->vm_exit_intr_info = exit_intr_info;
9797        if ((vmcs12->vm_exit_intr_info &
9798             (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) ==
9799            (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK))
9800                vmcs12->vm_exit_intr_error_code =
9801                        vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
9802        vmcs12->idt_vectoring_info_field = 0;
9803        vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
9804        vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
9805
9806        if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
9807                /* vm_entry_intr_info_field is cleared on exit. Emulate this
9808                 * instead of reading the real value. */
9809                vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;
9810
9811                /*
9812                 * Transfer the event that L0 or L1 may have wanted to inject into
9813                 * L2 to IDT_VECTORING_INFO_FIELD.
9814                 */
9815                vmcs12_save_pending_event(vcpu, vmcs12);
9816        }
9817
9818        /*
9819         * Drop what we picked up for L2 via vmx_complete_interrupts. It is
9820         * preserved above and would only end up incorrectly in L1.
9821         */
9822        vcpu->arch.nmi_injected = false;
9823        kvm_clear_exception_queue(vcpu);
9824        kvm_clear_interrupt_queue(vcpu);
9825}
9826
9827/*
9828 * A part of what we need to do when the nested L2 guest exits and we want to
9829 * run its L1 parent is to reset L1's guest state to the host state specified
9830 * in vmcs12.
9831 * This function is to be called not only on normal nested exit, but also on
9832 * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry
9833 * Failures During or After Loading Guest State").
9834 * This function should be called when the active VMCS is L1's (vmcs01).
9835 */
9836static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
9837                                   struct vmcs12 *vmcs12)
9838{
9839        struct kvm_segment seg;
9840
9841        if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
9842                vcpu->arch.efer = vmcs12->host_ia32_efer;
9843        else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
9844                vcpu->arch.efer |= (EFER_LMA | EFER_LME);
9845        else
9846                vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
9847        vmx_set_efer(vcpu, vcpu->arch.efer);
9848
9849        kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp);
9850        kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip);
9851        vmx_set_rflags(vcpu, X86_EFLAGS_FIXED);
9852        /*
9853         * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
9854         * actually changed, because it depends on the current state of
9855         * fpu_active (which may have changed).
9856         * Note that vmx_set_cr0 refers to the efer value set above.
9857         */
9858        vmx_set_cr0(vcpu, vmcs12->host_cr0);
9859        /*
9860         * If we did fpu_activate()/fpu_deactivate() during L2's run, we need
9861         * to apply the same changes to L1's vmcs. We just set cr0 correctly,
9862         * but we also need to update cr0_guest_host_mask and exception_bitmap.
9863         */
9864        update_exception_bitmap(vcpu);
9865        vcpu->arch.cr0_guest_owned_bits = (vcpu->fpu_active ? X86_CR0_TS : 0);
9866        vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
9867
9868        /*
9869         * Note that CR4_GUEST_HOST_MASK is already set in the original vmcs01
9870         * (KVM doesn't change it); no reason to call set_cr4_guest_host_mask().
9871         */
9872        vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
9873        kvm_set_cr4(vcpu, vmcs12->host_cr4);
9874
9875        nested_ept_uninit_mmu_context(vcpu);
9876
9877        kvm_set_cr3(vcpu, vmcs12->host_cr3);
9878        kvm_mmu_reset_context(vcpu);
9879
9880        if (!enable_ept)
9881                vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
9882
9883        if (enable_vpid) {
9884                /*
9885                 * Trivially support vpid by letting L2s share their parent
9886                 * L1's vpid. TODO: move to a more elaborate solution, giving
9887                 * each L2 its own vpid and exposing the vpid feature to L1.
9888                 */
9889                vmx_flush_tlb(vcpu);
9890        }
9891
9892
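        /*
         * The host SYSENTER values and the descriptor-table bases from
         * vmcs12 become L1's guest state in the current (vmcs01) VMCS.
         */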
9893        vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
9894        vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
9895        vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
9896        vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
9897        vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
9898
9899        /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1.  */
9900        if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS)
9901                vmcs_write64(GUEST_BNDCFGS, 0);
9902
9903        if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) {
9904                vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
9905                vcpu->arch.pat = vmcs12->host_ia32_pat;
9906        }
9907        if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
9908                vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL,
9909                        vmcs12->host_ia32_perf_global_ctrl);
9910
9911        /* Set L1 segment info according to Intel SDM 27.5.2,
9912         * "Loading Host Segment and Descriptor-Table Registers". */
9913        seg = (struct kvm_segment) {
9914                .base = 0,
9915                .limit = 0xFFFFFFFF,
9916                .selector = vmcs12->host_cs_selector,
9917                .type = 11,
9918                .present = 1,
9919                .s = 1,
9920                .g = 1
9921        };
9922        if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
9923                seg.l = 1;
9924        else
9925                seg.db = 1;
9926        vmx_set_segment(vcpu, &seg, VCPU_SREG_CS);
9927        seg = (struct kvm_segment) {
9928                .base = 0,
9929                .limit = 0xFFFFFFFF,
9930                .type = 3,
9931                .present = 1,
9932                .s = 1,
9933                .db = 1,
9934                .g = 1
9935        };
9936        seg.selector = vmcs12->host_ds_selector;
9937        vmx_set_segment(vcpu, &seg, VCPU_SREG_DS);
9938        seg.selector = vmcs12->host_es_selector;
9939        vmx_set_segment(vcpu, &seg, VCPU_SREG_ES);
9940        seg.selector = vmcs12->host_ss_selector;
9941        vmx_set_segment(vcpu, &seg, VCPU_SREG_SS);
9942        seg.selector = vmcs12->host_fs_selector;
9943        seg.base = vmcs12->host_fs_base;
9944        vmx_set_segment(vcpu, &seg, VCPU_SREG_FS);
9945        seg.selector = vmcs12->host_gs_selector;
9946        seg.base = vmcs12->host_gs_base;
9947        vmx_set_segment(vcpu, &seg, VCPU_SREG_GS);
9948        seg = (struct kvm_segment) {
9949                .base = vmcs12->host_tr_base,
9950                .limit = 0x67,
9951                .selector = vmcs12->host_tr_selector,
9952                .type = 11,
9953                .present = 1
9954        };
9955        vmx_set_segment(vcpu, &seg, VCPU_SREG_TR);
9956
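        /*
         * A VM exit loads DR7 with 0x400 and clears IA32_DEBUGCTL
         * (SDM 27.5.1), so emulate that for L1 here.
         */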
9957        kvm_set_dr(vcpu, 7, 0x400);
9958        vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
9959
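        /* Switch back to the MSR bitmap appropriate for L1. */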
9960        if (cpu_has_vmx_msr_bitmap())
9961                vmx_set_msr_bitmap(vcpu);
9962
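        /*
         * Process the VM-exit MSR-load area; a failure here is reported to
         * L1 as a VMX abort.
         */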
9963        if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
9964                                vmcs12->vm_exit_msr_load_count))
9965                nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
9966}
9967
9968/*
9969 * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1
9970 * and modify vmcs12 so that L1 sees what it would expect to see there if
9971 * L2 were its real guest. Must only be called when in L2 (is_guest_mode()).
9972 */
9973static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
9974                              u32 exit_intr_info,
9975                              unsigned long exit_qualification)
9976{
9977        struct vcpu_vmx *vmx = to_vmx(vcpu);
9978        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
9979
9980        /* trying to cancel vmlaunch/vmresume is a bug */
9981        WARN_ON_ONCE(vmx->nested.nested_run_pending);
9982
9983        leave_guest_mode(vcpu);
9984        prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
9985                       exit_qualification);
9986
9987        if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr,
9988                                 vmcs12->vm_exit_msr_store_count))
9989                nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL);
9990
9991        vmx_load_vmcs01(vcpu);
9992
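        /*
         * If L1 asked to "acknowledge interrupt on exit", acknowledge the
         * pending external interrupt now and store its vector in the
         * VM-exit interruption-information field of vmcs12.
         */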
9993        if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
9994            && nested_exit_intr_ack_set(vcpu)) {
9995                int irq = kvm_cpu_get_interrupt(vcpu);
9996                WARN_ON(irq < 0);
9997                vmcs12->vm_exit_intr_info = irq |
9998                        INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
9999        }
10000
10001        trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
10002                                       vmcs12->exit_qualification,
10003                                       vmcs12->idt_vectoring_info_field,
10004                                       vmcs12->vm_exit_intr_info,
10005                                       vmcs12->vm_exit_intr_error_code,
10006                                       KVM_ISA_VMX);
10007
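        /*
         * vmcs01 is active again: resync the cached entry/exit controls and
         * invalidate the segment cache that still describes L2.
         */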
10008        vm_entry_controls_init(vmx, vmcs_read32(VM_ENTRY_CONTROLS));
10009        vm_exit_controls_init(vmx, vmcs_read32(VM_EXIT_CONTROLS));
10010        vmx_segment_cache_clear(vmx);
10011
10012        /* if no vmcs02 cache requested, remove the one we used */
10013        if (VMCS02_POOL_SIZE == 0)
10014                nested_free_vmcs02(vmx, vmx->nested.current_vmptr);
10015
10016        load_vmcs12_host_state(vcpu, vmcs12);
10017
10018        /* Update TSC_OFFSET if TSC was changed while L2 ran */
10019        vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset);
10020
10021        /* This is needed for the same reason it was needed in prepare_vmcs02 */
10022        vmx->host_rsp = 0;
10023
10024        /* Unpin physical memory we referred to in vmcs02 */
10025        if (vmx->nested.apic_access_page) {
10026                nested_release_page(vmx->nested.apic_access_page);
10027                vmx->nested.apic_access_page = NULL;
10028        }
10029        if (vmx->nested.virtual_apic_page) {
10030                nested_release_page(vmx->nested.virtual_apic_page);
10031                vmx->nested.virtual_apic_page = NULL;
10032        }
10033        if (vmx->nested.pi_desc_page) {
10034                kunmap(vmx->nested.pi_desc_page);
10035                nested_release_page(vmx->nested.pi_desc_page);
10036                vmx->nested.pi_desc_page = NULL;
10037                vmx->nested.pi_desc = NULL;
10038        }
10039
10040        /*
10041         * While L2 ran, an mmu_notifier may have forced the APIC access page's
10042         * hpa to be reloaded for vmcs02. Reload it for L1 before entering L1.
10043         */
10044        kvm_vcpu_reload_apic_access_page(vcpu);
10045
10046        /*
10047         * Exiting from L2 to L1, we're now back to L1 which thinks it just
10048         * finished a VMLAUNCH or VMRESUME instruction, so we need to set the
10049         * success or failure flag accordingly.
10050         */
10051        if (unlikely(vmx->fail)) {
10052                vmx->fail = 0;
10053                nested_vmx_failValid(vcpu, vmcs_read32(VM_INSTRUCTION_ERROR));
10054        } else
10055                nested_vmx_succeed(vcpu);
10056        if (enable_shadow_vmcs)
10057                vmx->nested.sync_shadow_vmcs = true;
10058
10059        /* in case we halted in L2 */
10060        vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
10061}
10062
10063/*
10064 * Forcibly leave nested mode in order to be able to reset the VCPU later on.
10065 */
10066static void vmx_leave_nested(struct kvm_vcpu *vcpu)
10067{
10068        if (is_guest_mode(vcpu))
10069                nested_vmx_vmexit(vcpu, -1, 0, 0);
10070        free_nested(to_vmx(vcpu));
10071}
10072
10073/*
10074 * L1's failure to enter L2 is a subset of a normal exit, as explained in
10075 * 23.7 "VM-entry failures during or after loading guest state" (this also
10076 * lists the acceptable exit-reason and exit-qualification parameters).
10077 * It should only be called before L2 has actually started to run, and when
10078 * vmcs01 is current (it doesn't leave_guest_mode() or switch VMCSs).
10079 */
10080static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
10081                        struct vmcs12 *vmcs12,
10082                        u32 reason, unsigned long qualification)
10083{
10084        load_vmcs12_host_state(vcpu, vmcs12);
10085        vmcs12->vm_exit_reason = reason | VMX_EXIT_REASONS_FAILED_VMENTRY;
10086        vmcs12->exit_qualification = qualification;
10087        nested_vmx_succeed(vcpu);
10088        if (enable_shadow_vmcs)
10089                to_vmx(vcpu)->nested.sync_shadow_vmcs = true;
10090}
10091
10092static int vmx_check_intercept(struct kvm_vcpu *vcpu,
10093                               struct x86_instruction_info *info,
10094                               enum x86_intercept_stage stage)
10095{
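        /* No instruction intercept checks are done; let emulation continue. */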
10096        return X86EMUL_CONTINUE;
10097}
10098
10099static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
10100{
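        /* Shrink the pause-loop-exiting window when the vCPU is scheduled in. */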
10101        if (ple_gap)
10102                shrink_ple_window(vcpu);
10103}
10104
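/*
 * When PML is used, dirty logging is started by clearing the dirty bit on
 * the slot's leaf sptes and write-protecting only its large pages, instead
 * of write-protecting every page.
 */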
10105static void vmx_slot_enable_log_dirty(struct kvm *kvm,
10106                                     struct kvm_memory_slot *slot)
10107{
10108        kvm_mmu_slot_leaf_clear_dirty(kvm, slot);
10109        kvm_mmu_slot_largepage_remove_write_access(kvm, slot);
10110}
10111
10112static void vmx_slot_disable_log_dirty(struct kvm *kvm,
10113                                       struct kvm_memory_slot *slot)
10114{
10115        kvm_mmu_slot_set_dirty(kvm, slot);
10116}
10117
10118static void vmx_flush_log_dirty(struct kvm *kvm)
10119{
10120        kvm_flush_pml_buffers(kvm);
10121}
10122
10123static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
10124                                           struct kvm_memory_slot *memslot,
10125                                           gfn_t offset, unsigned long mask)
10126{
10127        kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask);
10128}
10129
10130static struct kvm_x86_ops vmx_x86_ops = {
10131        .cpu_has_kvm_support = cpu_has_kvm_support,
10132        .disabled_by_bios = vmx_disabled_by_bios,
10133        .hardware_setup = hardware_setup,
10134        .hardware_unsetup = hardware_unsetup,
10135        .check_processor_compatibility = vmx_check_processor_compat,
10136        .hardware_enable = hardware_enable,
10137        .hardware_disable = hardware_disable,
10138        .cpu_has_accelerated_tpr = report_flexpriority,
10139
10140        .vcpu_create = vmx_create_vcpu,
10141        .vcpu_free = vmx_free_vcpu,
10142        .vcpu_reset = vmx_vcpu_reset,
10143
10144        .prepare_guest_switch = vmx_save_host_state,
10145        .vcpu_load = vmx_vcpu_load,
10146        .vcpu_put = vmx_vcpu_put,
10147
10148        .update_db_bp_intercept = update_exception_bitmap,
10149        .get_msr = vmx_get_msr,
10150        .set_msr = vmx_set_msr,
10151        .get_segment_base = vmx_get_segment_base,
10152        .get_segment = vmx_get_segment,
10153        .set_segment = vmx_set_segment,
10154        .get_cpl = vmx_get_cpl,
10155        .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
10156        .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits,
10157        .decache_cr3 = vmx_decache_cr3,
10158        .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
10159        .set_cr0 = vmx_set_cr0,
10160        .set_cr3 = vmx_set_cr3,
10161        .set_cr4 = vmx_set_cr4,
10162        .set_efer = vmx_set_efer,
10163        .get_idt = vmx_get_idt,
10164        .set_idt = vmx_set_idt,
10165        .get_gdt = vmx_get_gdt,
10166        .set_gdt = vmx_set_gdt,
10167        .get_dr6 = vmx_get_dr6,
10168        .set_dr6 = vmx_set_dr6,
10169        .set_dr7 = vmx_set_dr7,
10170        .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
10171        .cache_reg = vmx_cache_reg,
10172        .get_rflags = vmx_get_rflags,
10173        .set_rflags = vmx_set_rflags,
10174        .fpu_deactivate = vmx_fpu_deactivate,
10175
10176        .tlb_flush = vmx_flush_tlb,
10177
10178        .run = vmx_vcpu_run,
10179        .handle_exit = vmx_handle_exit,
10180        .skip_emulated_instruction = skip_emulated_instruction,
10181        .set_interrupt_shadow = vmx_set_interrupt_shadow,
10182        .get_interrupt_shadow = vmx_get_interrupt_shadow,
10183        .patch_hypercall = vmx_patch_hypercall,
10184        .set_irq = vmx_inject_irq,
10185        .set_nmi = vmx_inject_nmi,
10186        .queue_exception = vmx_queue_exception,
10187        .cancel_injection = vmx_cancel_injection,
10188        .interrupt_allowed = vmx_interrupt_allowed,
10189        .nmi_allowed = vmx_nmi_allowed,
10190        .get_nmi_mask = vmx_get_nmi_mask,
10191        .set_nmi_mask = vmx_set_nmi_mask,
10192        .enable_nmi_window = enable_nmi_window,
10193        .enable_irq_window = enable_irq_window,
10194        .update_cr8_intercept = update_cr8_intercept,
10195        .set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode,
10196        .set_apic_access_page_addr = vmx_set_apic_access_page_addr,
10197        .vm_has_apicv = vmx_vm_has_apicv,
10198        .load_eoi_exitmap = vmx_load_eoi_exitmap,
10199        .hwapic_irr_update = vmx_hwapic_irr_update,
10200        .hwapic_isr_update = vmx_hwapic_isr_update,
10201        .sync_pir_to_irr = vmx_sync_pir_to_irr,
10202        .deliver_posted_interrupt = vmx_deliver_posted_interrupt,
10203
10204        .set_tss_addr = vmx_set_tss_addr,
10205        .get_tdp_level = get_ept_level,
10206        .get_mt_mask = vmx_get_mt_mask,
10207
10208        .get_exit_info = vmx_get_exit_info,
10209
10210        .get_lpage_level = vmx_get_lpage_level,
10211
10212        .cpuid_update = vmx_cpuid_update,
10213
10214        .rdtscp_supported = vmx_rdtscp_supported,
10215        .invpcid_supported = vmx_invpcid_supported,
10216
10217        .set_supported_cpuid = vmx_set_supported_cpuid,
10218
10219        .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
10220
10221        .set_tsc_khz = vmx_set_tsc_khz,
10222        .read_tsc_offset = vmx_read_tsc_offset,
10223        .write_tsc_offset = vmx_write_tsc_offset,
10224        .adjust_tsc_offset = vmx_adjust_tsc_offset,
10225        .compute_tsc_offset = vmx_compute_tsc_offset,
10226        .read_l1_tsc = vmx_read_l1_tsc,
10227
10228        .set_tdp_cr3 = vmx_set_cr3,
10229
10230        .check_intercept = vmx_check_intercept,
10231        .handle_external_intr = vmx_handle_external_intr,
10232        .mpx_supported = vmx_mpx_supported,
10233        .xsaves_supported = vmx_xsaves_supported,
10234
10235        .check_nested_events = vmx_check_nested_events,
10236
10237        .sched_in = vmx_sched_in,
10238
10239        .slot_enable_log_dirty = vmx_slot_enable_log_dirty,
10240        .slot_disable_log_dirty = vmx_slot_disable_log_dirty,
10241        .flush_log_dirty = vmx_flush_log_dirty,
10242        .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,
10243};
10244
10245static int __init vmx_init(void)
10246{
10247        int r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
10248                     __alignof__(struct vcpu_vmx), THIS_MODULE);
10249        if (r)
10250                return r;
10251
10252#ifdef CONFIG_KEXEC
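        /*
         * Register the crash callback that VMCLEARs the VMCSs loaded on a
         * CPU, so their state is flushed to memory for the kdump kernel.
         */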
10253        rcu_assign_pointer(crash_vmclear_loaded_vmcss,
10254                           crash_vmclear_local_loaded_vmcss);
10255#endif
10256
10257        return 0;
10258}
10259
10260static void __exit vmx_exit(void)
10261{
10262#ifdef CONFIG_KEXEC
10263        RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
10264        synchronize_rcu();
10265#endif
10266
10267        kvm_exit();
10268}
10269
10270module_init(vmx_init)
10271module_exit(vmx_exit)
10272