linux/arch/x86/kvm/vmx/nested.c
   1// SPDX-License-Identifier: GPL-2.0
   2
   3#include <linux/frame.h>
   4#include <linux/percpu.h>
   5
   6#include <asm/debugreg.h>
   7#include <asm/mmu_context.h>
   8
   9#include "cpuid.h"
  10#include "hyperv.h"
  11#include "mmu.h"
  12#include "nested.h"
  13#include "trace.h"
  14#include "x86.h"
  15
  16static bool __read_mostly enable_shadow_vmcs = 1;
  17module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
  18
  19static bool __read_mostly nested_early_check = 0;
  20module_param(nested_early_check, bool, S_IRUGO);
  21
  22/*
  23 * Hyper-V requires all of these, so mark them as supported even though
  24 * they are just treated the same as all-context.
  25 */
  26#define VMX_VPID_EXTENT_SUPPORTED_MASK          \
  27        (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT |  \
  28        VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT |    \
  29        VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT |    \
  30        VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT)
  31
  32#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5
  33
  34enum {
  35        VMX_VMREAD_BITMAP,
  36        VMX_VMWRITE_BITMAP,
  37        VMX_BITMAP_NR
  38};
  39static unsigned long *vmx_bitmap[VMX_BITMAP_NR];
  40
  41#define vmx_vmread_bitmap                    (vmx_bitmap[VMX_VMREAD_BITMAP])
  42#define vmx_vmwrite_bitmap                   (vmx_bitmap[VMX_VMWRITE_BITMAP])
  43
  44struct shadow_vmcs_field {
  45        u16     encoding;
  46        u16     offset;
  47};
  48static struct shadow_vmcs_field shadow_read_only_fields[] = {
  49#define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) },
  50#include "vmcs_shadow_fields.h"
  51};
  52static int max_shadow_read_only_fields =
  53        ARRAY_SIZE(shadow_read_only_fields);
  54
  55static struct shadow_vmcs_field shadow_read_write_fields[] = {
  56#define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) },
  57#include "vmcs_shadow_fields.h"
  58};
  59static int max_shadow_read_write_fields =
  60        ARRAY_SIZE(shadow_read_write_fields);
  61
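    /*
     * Set up VMREAD/VMWRITE shadowing: intercept everything by default, then
     * clear the intercept bits for the shadowed fields and compact the shadow
     * field tables.  Fields the CPU can't handle (e.g. GUEST_PML_INDEX when
     * PML is unsupported) are dropped, and the high halves of 64-bit fields
     * are skipped on 64-bit hosts or redirected to the upper half of the
     * vmcs12 field on 32-bit hosts.
     */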
  62static void init_vmcs_shadow_fields(void)
  63{
  64        int i, j;
  65
  66        memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
  67        memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
  68
  69        for (i = j = 0; i < max_shadow_read_only_fields; i++) {
  70                struct shadow_vmcs_field entry = shadow_read_only_fields[i];
  71                u16 field = entry.encoding;
  72
  73                if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
  74                    (i + 1 == max_shadow_read_only_fields ||
  75                     shadow_read_only_fields[i + 1].encoding != field + 1))
  76                        pr_err("Missing field from shadow_read_only_field %x\n",
  77                               field + 1);
  78
  79                clear_bit(field, vmx_vmread_bitmap);
  80                if (field & 1)
  81#ifdef CONFIG_X86_64
  82                        continue;
  83#else
  84                        entry.offset += sizeof(u32);
  85#endif
  86                shadow_read_only_fields[j++] = entry;
  87        }
  88        max_shadow_read_only_fields = j;
  89
  90        for (i = j = 0; i < max_shadow_read_write_fields; i++) {
  91                struct shadow_vmcs_field entry = shadow_read_write_fields[i];
  92                u16 field = entry.encoding;
  93
  94                if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
  95                    (i + 1 == max_shadow_read_write_fields ||
  96                     shadow_read_write_fields[i + 1].encoding != field + 1))
  97                        pr_err("Missing field from shadow_read_write_field %x\n",
  98                               field + 1);
  99
 100                WARN_ONCE(field >= GUEST_ES_AR_BYTES &&
 101                          field <= GUEST_TR_AR_BYTES,
 102                          "Update vmcs12_write_any() to drop reserved bits from AR_BYTES");
 103
 104                /*
 105                 * PML and the preemption timer can be emulated, but the
 106                 * processor cannot vmwrite to fields that don't exist
 107                 * on bare metal.
 108                 */
 109                switch (field) {
 110                case GUEST_PML_INDEX:
 111                        if (!cpu_has_vmx_pml())
 112                                continue;
 113                        break;
 114                case VMX_PREEMPTION_TIMER_VALUE:
 115                        if (!cpu_has_vmx_preemption_timer())
 116                                continue;
 117                        break;
 118                case GUEST_INTR_STATUS:
 119                        if (!cpu_has_vmx_apicv())
 120                                continue;
 121                        break;
 122                default:
 123                        break;
 124                }
 125
 126                clear_bit(field, vmx_vmwrite_bitmap);
 127                clear_bit(field, vmx_vmread_bitmap);
 128                if (field & 1)
 129#ifdef CONFIG_X86_64
 130                        continue;
 131#else
 132                        entry.offset += sizeof(u32);
 133#endif
 134                shadow_read_write_fields[j++] = entry;
 135        }
 136        max_shadow_read_write_fields = j;
 137}
 138
 139/*
 140 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
 141 * set the success or error code of an emulated VMX instruction (as specified
 142 * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated
 143 * instruction.
 144 */
 145static int nested_vmx_succeed(struct kvm_vcpu *vcpu)
 146{
 147        vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
 148                        & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
 149                            X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
 150        return kvm_skip_emulated_instruction(vcpu);
 151}
 152
 153static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
 154{
 155        vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
 156                        & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
 157                            X86_EFLAGS_SF | X86_EFLAGS_OF))
 158                        | X86_EFLAGS_CF);
 159        return kvm_skip_emulated_instruction(vcpu);
 160}
 161
 162static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
 163                                u32 vm_instruction_error)
 164{
 165        struct vcpu_vmx *vmx = to_vmx(vcpu);
 166
 167        /*
 168         * failValid writes the error number to the current VMCS, which
 169         * can't be done if there isn't a current VMCS.
 170         */
 171        if (vmx->nested.current_vmptr == -1ull && !vmx->nested.hv_evmcs)
 172                return nested_vmx_failInvalid(vcpu);
 173
 174        vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
 175                        & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
 176                            X86_EFLAGS_SF | X86_EFLAGS_OF))
 177                        | X86_EFLAGS_ZF);
 178        get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
 179        /*
 180         * We don't need to force a shadow sync because
 181         * VM_INSTRUCTION_ERROR is not shadowed
 182         */
 183        return kvm_skip_emulated_instruction(vcpu);
 184}
 185
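    /*
     * Emulate a VMX abort: an error occurred during an emulated VM exit that
     * cannot be reported to L1, so (for now) simply kill the guest with a
     * triple fault.
     */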
 186static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
 187{
 188        /* TODO: don't simply reset the guest here. */
 189        kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
 190        pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator);
 191}
 192
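    /*
     * Stop shadowing vmcs12: clear the shadow-VMCS execution control, reset
     * the VMCS link pointer and drop any pending vmcs12 sync.
     */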
 193static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
 194{
 195        secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
 196        vmcs_write64(VMCS_LINK_POINTER, -1ull);
 197        vmx->nested.need_vmcs12_to_shadow_sync = false;
 198}
 199
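    /*
     * Unmap the enlightened VMCS, if one is in use, and forget its GPA.
     */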
 200static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
 201{
 202        struct vcpu_vmx *vmx = to_vmx(vcpu);
 203
 204        if (!vmx->nested.hv_evmcs)
 205                return;
 206
 207        kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true);
 208        vmx->nested.hv_evmcs_vmptr = -1ull;
 209        vmx->nested.hv_evmcs = NULL;
 210}
 211
 212/*
 213 * Free whatever needs to be freed from vmx->nested when L1 goes down, or
 214 * just stops using VMX.
 215 */
 216static void free_nested(struct kvm_vcpu *vcpu)
 217{
 218        struct vcpu_vmx *vmx = to_vmx(vcpu);
 219
 220        if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
 221                return;
 222
 223        kvm_clear_request(KVM_REQ_GET_VMCS12_PAGES, vcpu);
 224
 225        vmx->nested.vmxon = false;
 226        vmx->nested.smm.vmxon = false;
 227        free_vpid(vmx->nested.vpid02);
 228        vmx->nested.posted_intr_nv = -1;
 229        vmx->nested.current_vmptr = -1ull;
 230        if (enable_shadow_vmcs) {
 231                vmx_disable_shadow_vmcs(vmx);
 232                vmcs_clear(vmx->vmcs01.shadow_vmcs);
 233                free_vmcs(vmx->vmcs01.shadow_vmcs);
 234                vmx->vmcs01.shadow_vmcs = NULL;
 235        }
 236        kfree(vmx->nested.cached_vmcs12);
 237        vmx->nested.cached_vmcs12 = NULL;
 238        kfree(vmx->nested.cached_shadow_vmcs12);
 239        vmx->nested.cached_shadow_vmcs12 = NULL;
 240        /* Unpin physical memory we referred to in the vmcs02 */
 241        if (vmx->nested.apic_access_page) {
 242                kvm_release_page_dirty(vmx->nested.apic_access_page);
 243                vmx->nested.apic_access_page = NULL;
 244        }
 245        kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
 246        kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
 247        vmx->nested.pi_desc = NULL;
 248
 249        kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
 250
 251        nested_release_evmcs(vcpu);
 252
 253        free_loaded_vmcs(&vmx->nested.vmcs02);
 254}
 255
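    /*
     * When switching the active VMCS with guest state loaded, copy the host
     * segment state cached in the previous loaded_vmcs into the new one so
     * that it matches what is actually loaded on the CPU.
     */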
 256static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
 257                                     struct loaded_vmcs *prev)
 258{
 259        struct vmcs_host_state *dest, *src;
 260
 261        if (unlikely(!vmx->guest_state_loaded))
 262                return;
 263
 264        src = &prev->host_state;
 265        dest = &vmx->loaded_vmcs->host_state;
 266
 267        vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base);
 268        dest->ldt_sel = src->ldt_sel;
 269#ifdef CONFIG_X86_64
 270        dest->ds_sel = src->ds_sel;
 271        dest->es_sel = src->es_sel;
 272#endif
 273}
 274
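    /*
     * Make @vmcs the vCPU's current loaded VMCS (e.g. to switch between
     * vmcs01 and vmcs02), loading it on this CPU and syncing host state.
     */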
 275static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
 276{
 277        struct vcpu_vmx *vmx = to_vmx(vcpu);
 278        struct loaded_vmcs *prev;
 279        int cpu;
 280
 281        if (vmx->loaded_vmcs == vmcs)
 282                return;
 283
 284        cpu = get_cpu();
 285        prev = vmx->loaded_vmcs;
 286        vmx->loaded_vmcs = vmcs;
 287        vmx_vcpu_load_vmcs(vcpu, cpu);
 288        vmx_sync_vmcs_host_state(vmx, prev);
 289        put_cpu();
 290
 291        vmx_segment_cache_clear(vmx);
 292}
 293
 294/*
 295 * Ensure that the current vmcs of the logical processor is the
 296 * vmcs01 of the vcpu before calling free_nested().
 297 */
 298void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu)
 299{
 300        vcpu_load(vcpu);
 301        vmx_leave_nested(vcpu);
 302        vmx_switch_vmcs(vcpu, &to_vmx(vcpu)->vmcs01);
 303        free_nested(vcpu);
 304        vcpu_put(vcpu);
 305}
 306
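    /*
     * Reflect an EPT-induced fault that occurred while running L2 back to L1
     * as an EPT violation, EPT misconfig or PML-full VM exit.
     */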
 307static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
 308                struct x86_exception *fault)
 309{
 310        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 311        struct vcpu_vmx *vmx = to_vmx(vcpu);
 312        u32 exit_reason;
 313        unsigned long exit_qualification = vcpu->arch.exit_qualification;
 314
 315        if (vmx->nested.pml_full) {
 316                exit_reason = EXIT_REASON_PML_FULL;
 317                vmx->nested.pml_full = false;
 318                exit_qualification &= INTR_INFO_UNBLOCK_NMI;
 319        } else if (fault->error_code & PFERR_RSVD_MASK)
 320                exit_reason = EXIT_REASON_EPT_MISCONFIG;
 321        else
 322                exit_reason = EXIT_REASON_EPT_VIOLATION;
 323
 324        nested_vmx_vmexit(vcpu, exit_reason, 0, exit_qualification);
 325        vmcs12->guest_physical_address = fault->address;
 326}
 327
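    /*
     * Switch the MMU to shadow-EPT mode for L2: guest_mmu translates L2 GPAs
     * through L1's EPT tables, while walk_mmu (nested_mmu) is used for L2
     * virtual-address walks.
     */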
 328static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
 329{
 330        WARN_ON(mmu_is_nested(vcpu));
 331
 332        vcpu->arch.mmu = &vcpu->arch.guest_mmu;
 333        kvm_init_shadow_ept_mmu(vcpu,
 334                        to_vmx(vcpu)->nested.msrs.ept_caps &
 335                        VMX_EPT_EXECUTE_ONLY_BIT,
 336                        nested_ept_ad_enabled(vcpu),
 337                        nested_ept_get_cr3(vcpu));
 338        vcpu->arch.mmu->set_cr3           = vmx_set_cr3;
 339        vcpu->arch.mmu->get_cr3           = nested_ept_get_cr3;
 340        vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault;
 341        vcpu->arch.mmu->get_pdptr         = kvm_pdptr_read;
 342
 343        vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
 344}
 345
 346static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
 347{
 348        vcpu->arch.mmu = &vcpu->arch.root_mmu;
 349        vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
 350}
 351
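    /*
     * Returns true if a #PF with the given error code causes a VM exit to
     * L1, based on the exception bitmap and the PFEC mask/match controls.
     */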
 352static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
 353                                            u16 error_code)
 354{
 355        bool inequality, bit;
 356
 357        bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
 358        inequality =
 359                (error_code & vmcs12->page_fault_error_code_mask) !=
 360                 vmcs12->page_fault_error_code_match;
 361        return inequality ^ bit;
 362}
 363
 364
 365/*
 366 * KVM wants to inject the page faults it received into the guest. This function
 367 * checks whether, in a nested guest, they need to be injected into L1 or L2.
 368 */
 369static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual)
 370{
 371        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 372        unsigned int nr = vcpu->arch.exception.nr;
 373        bool has_payload = vcpu->arch.exception.has_payload;
 374        unsigned long payload = vcpu->arch.exception.payload;
 375
 376        if (nr == PF_VECTOR) {
 377                if (vcpu->arch.exception.nested_apf) {
 378                        *exit_qual = vcpu->arch.apf.nested_apf_token;
 379                        return 1;
 380                }
 381                if (nested_vmx_is_page_fault_vmexit(vmcs12,
 382                                                    vcpu->arch.exception.error_code)) {
 383                        *exit_qual = has_payload ? payload : vcpu->arch.cr2;
 384                        return 1;
 385                }
 386        } else if (vmcs12->exception_bitmap & (1u << nr)) {
 387                if (nr == DB_VECTOR) {
 388                        if (!has_payload) {
 389                                payload = vcpu->arch.dr6;
 390                                payload &= ~(DR6_FIXED_1 | DR6_BT);
 391                                payload ^= DR6_RTM;
 392                        }
 393                        *exit_qual = payload;
 394                } else
 395                        *exit_qual = 0;
 396                return 1;
 397        }
 398
 399        return 0;
 400}
 401
 402
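    /*
     * Deliver a page fault hit while running L2: reflect it to L1 as an
     * exception VM exit if L1 wants it, otherwise inject it into L2.
     */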
 403static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
 404                struct x86_exception *fault)
 405{
 406        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 407
 408        WARN_ON(!is_guest_mode(vcpu));
 409
 410        if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) &&
 411                !to_vmx(vcpu)->nested.nested_run_pending) {
 412                vmcs12->vm_exit_intr_error_code = fault->error_code;
 413                nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
 414                                  PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
 415                                  INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK,
 416                                  fault->address);
 417        } else {
 418                kvm_inject_page_fault(vcpu, fault);
 419        }
 420}
 421
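    /*
     * A guest physical address referenced by vmcs12 must be page aligned and
     * must not exceed the vCPU's physical address width.
     */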
 422static bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa)
 423{
 424        return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu));
 425}
 426
 427static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
 428                                               struct vmcs12 *vmcs12)
 429{
 430        if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
 431                return 0;
 432
 433        if (!page_address_valid(vcpu, vmcs12->io_bitmap_a) ||
 434            !page_address_valid(vcpu, vmcs12->io_bitmap_b))
 435                return -EINVAL;
 436
 437        return 0;
 438}
 439
 440static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
 441                                                struct vmcs12 *vmcs12)
 442{
 443        if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
 444                return 0;
 445
 446        if (!page_address_valid(vcpu, vmcs12->msr_bitmap))
 447                return -EINVAL;
 448
 449        return 0;
 450}
 451
 452static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
 453                                                struct vmcs12 *vmcs12)
 454{
 455        if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
 456                return 0;
 457
 458        if (!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr))
 459                return -EINVAL;
 460
 461        return 0;
 462}
 463
 464/*
 465 * Check if writes to the MSR are intercepted in the L01 MSR bitmap.
 466 */
 467static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr)
 468{
 469        unsigned long *msr_bitmap;
 470        int f = sizeof(unsigned long);
 471
 472        if (!cpu_has_vmx_msr_bitmap())
 473                return true;
 474
 475        msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;
 476
 477        if (msr <= 0x1fff) {
 478                return !!test_bit(msr, msr_bitmap + 0x800 / f);
 479        } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
 480                msr &= 0x1fff;
 481                return !!test_bit(msr, msr_bitmap + 0xc00 / f);
 482        }
 483
 484        return true;
 485}
 486
 487/*
 488 * If an MSR is allowed by L0, check whether it is also allowed by L1.
 489 * The corresponding bit is cleared only if both L0 and L1 allow it.
 490 */
 491static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
 492                                               unsigned long *msr_bitmap_nested,
 493                                               u32 msr, int type)
 494{
 495        int f = sizeof(unsigned long);
 496
 497        /*
 498         * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
 499         * have the write-low and read-high bitmap offsets the wrong way round.
 500         * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
 501         */
 502        if (msr <= 0x1fff) {
 503                if (type & MSR_TYPE_R &&
 504                   !test_bit(msr, msr_bitmap_l1 + 0x000 / f))
 505                        /* read-low */
 506                        __clear_bit(msr, msr_bitmap_nested + 0x000 / f);
 507
 508                if (type & MSR_TYPE_W &&
 509                   !test_bit(msr, msr_bitmap_l1 + 0x800 / f))
 510                        /* write-low */
 511                        __clear_bit(msr, msr_bitmap_nested + 0x800 / f);
 512
 513        } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
 514                msr &= 0x1fff;
 515                if (type & MSR_TYPE_R &&
 516                   !test_bit(msr, msr_bitmap_l1 + 0x400 / f))
 517                        /* read-high */
 518                        __clear_bit(msr, msr_bitmap_nested + 0x400 / f);
 519
 520                if (type & MSR_TYPE_W &&
 521                   !test_bit(msr, msr_bitmap_l1 + 0xc00 / f))
 522                        /* write-high */
 523                        __clear_bit(msr, msr_bitmap_nested + 0xc00 / f);
 524
 525        }
 526}
 527
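    /*
     * Set the read and write intercept bits for the entire x2APIC MSR range
     * (0x800 - 0x8ff) in the given MSR bitmap.
     */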
 528static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap) {
 529        int msr;
 530
 531        for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
 532                unsigned word = msr / BITS_PER_LONG;
 533
 534                msr_bitmap[word] = ~0;
 535                msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
 536        }
 537}
 538
 539/*
 540 * Merge L0's and L1's MSR bitmaps; return false to indicate that the
 541 * hardware MSR bitmap is not used (i.e. all MSR accesses are intercepted).
 542 */
 543static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
 544                                                 struct vmcs12 *vmcs12)
 545{
 546        int msr;
 547        unsigned long *msr_bitmap_l1;
 548        unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
 549        struct kvm_host_map *map = &to_vmx(vcpu)->nested.msr_bitmap_map;
 550
 551        /* Nothing to do if the MSR bitmap is not in use.  */
 552        if (!cpu_has_vmx_msr_bitmap() ||
 553            !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
 554                return false;
 555
 556        if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map))
 557                return false;
 558
 559        msr_bitmap_l1 = (unsigned long *)map->hva;
 560
 561        /*
 562         * To keep the control flow simple, pay eight 8-byte writes (sixteen
 563         * 4-byte writes on 32-bit systems) up front to enable intercepts for
 564         * the x2APIC MSR range and selectively disable them below.
 565         */
 566        enable_x2apic_msr_intercepts(msr_bitmap_l0);
 567
 568        if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
 569                if (nested_cpu_has_apic_reg_virt(vmcs12)) {
 570                        /*
 571                         * L0 need not intercept reads for MSRs between 0x800
 572                         * and 0x8ff, it just lets the processor take the value
 573                         * from the virtual-APIC page; take those 256 bits
 574                         * directly from the L1 bitmap.
 575                         */
 576                        for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
 577                                unsigned word = msr / BITS_PER_LONG;
 578
 579                                msr_bitmap_l0[word] = msr_bitmap_l1[word];
 580                        }
 581                }
 582
 583                nested_vmx_disable_intercept_for_msr(
 584                        msr_bitmap_l1, msr_bitmap_l0,
 585                        X2APIC_MSR(APIC_TASKPRI),
 586                        MSR_TYPE_R | MSR_TYPE_W);
 587
 588                if (nested_cpu_has_vid(vmcs12)) {
 589                        nested_vmx_disable_intercept_for_msr(
 590                                msr_bitmap_l1, msr_bitmap_l0,
 591                                X2APIC_MSR(APIC_EOI),
 592                                MSR_TYPE_W);
 593                        nested_vmx_disable_intercept_for_msr(
 594                                msr_bitmap_l1, msr_bitmap_l0,
 595                                X2APIC_MSR(APIC_SELF_IPI),
 596                                MSR_TYPE_W);
 597                }
 598        }
 599
 600        /* KVM unconditionally exposes the FS/GS base MSRs to L1. */
 601        nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
 602                                             MSR_FS_BASE, MSR_TYPE_RW);
 603
 604        nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
 605                                             MSR_GS_BASE, MSR_TYPE_RW);
 606
 607        nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
 608                                             MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
 609
 610        /*
 611         * Checking the L0->L1 bitmap is trying to verify two things:
 612         *
 613         * 1. L0 gave a permission to L1 to actually passthrough the MSR. This
 614         *    ensures that we do not accidentally generate an L02 MSR bitmap
 615         *    from the L12 MSR bitmap that is too permissive.
 616         * 2. That L1 or L2s have actually used the MSR. This avoids
 617         *    unnecessarily merging of the bitmap if the MSR is unused. This
 618         *    works properly because we only update the L01 MSR bitmap lazily.
 619         *    So even if L0 should pass L1 these MSRs, the L01 bitmap is only
 620         *    updated to reflect this when L1 (or its L2s) actually write to
 621         *    the MSR.
 622         */
 623        if (!msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL))
 624                nested_vmx_disable_intercept_for_msr(
 625                                        msr_bitmap_l1, msr_bitmap_l0,
 626                                        MSR_IA32_SPEC_CTRL,
 627                                        MSR_TYPE_R | MSR_TYPE_W);
 628
 629        if (!msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD))
 630                nested_vmx_disable_intercept_for_msr(
 631                                        msr_bitmap_l1, msr_bitmap_l0,
 632                                        MSR_IA32_PRED_CMD,
 633                                        MSR_TYPE_W);
 634
 635        kvm_vcpu_unmap(vcpu, &to_vmx(vcpu)->nested.msr_bitmap_map, false);
 636
 637        return true;
 638}
 639
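    /*
     * Copy the shadow vmcs12 referenced by vmcs12->vmcs_link_pointer from
     * guest memory into KVM's per-vCPU cache.
     */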
 640static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
 641                                       struct vmcs12 *vmcs12)
 642{
 643        struct kvm_host_map map;
 644        struct vmcs12 *shadow;
 645
 646        if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
 647            vmcs12->vmcs_link_pointer == -1ull)
 648                return;
 649
 650        shadow = get_shadow_vmcs12(vcpu);
 651
 652        if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map))
 653                return;
 654
 655        memcpy(shadow, map.hva, VMCS12_SIZE);
 656        kvm_vcpu_unmap(vcpu, &map, false);
 657}
 658
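    /*
     * Write the cached shadow vmcs12 back to guest memory at
     * vmcs12->vmcs_link_pointer.
     */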
 659static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
 660                                              struct vmcs12 *vmcs12)
 661{
 662        struct vcpu_vmx *vmx = to_vmx(vcpu);
 663
 664        if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
 665            vmcs12->vmcs_link_pointer == -1ull)
 666                return;
 667
 668        kvm_write_guest(vmx->vcpu.kvm, vmcs12->vmcs_link_pointer,
 669                        get_shadow_vmcs12(vcpu), VMCS12_SIZE);
 670}
 671
 672/*
 673 * In nested virtualization, check if L1 has set
 674 * VM_EXIT_ACK_INTR_ON_EXIT
 675 */
 676static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
 677{
 678        return get_vmcs12(vcpu)->vm_exit_controls &
 679                VM_EXIT_ACK_INTR_ON_EXIT;
 680}
 681
 682static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu)
 683{
 684        return nested_cpu_has_nmi_exiting(get_vmcs12(vcpu));
 685}
 686
 687static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
 688                                          struct vmcs12 *vmcs12)
 689{
 690        if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
 691            !page_address_valid(vcpu, vmcs12->apic_access_addr))
 692                return -EINVAL;
 693        else
 694                return 0;
 695}
 696
 697static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
 698                                           struct vmcs12 *vmcs12)
 699{
 700        if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
 701            !nested_cpu_has_apic_reg_virt(vmcs12) &&
 702            !nested_cpu_has_vid(vmcs12) &&
 703            !nested_cpu_has_posted_intr(vmcs12))
 704                return 0;
 705
 706        /*
 707         * If virtualize x2apic mode is enabled,
 708         * virtualize apic access must be disabled.
 709         */
 710        if (nested_cpu_has_virt_x2apic_mode(vmcs12) &&
 711            nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
 712                return -EINVAL;
 713
 714        /*
 715         * If virtual interrupt delivery is enabled,
 716         * we must exit on external interrupts.
 717         */
 718        if (nested_cpu_has_vid(vmcs12) &&
 719           !nested_exit_on_intr(vcpu))
 720                return -EINVAL;
 721
 722        /*
 723         * bits 15:8 should be zero in posted_intr_nv,
 724 * the descriptor address has already been checked
 725         * in nested_get_vmcs12_pages.
 726         *
 727         * bits 5:0 of posted_intr_desc_addr should be zero.
 728         */
 729        if (nested_cpu_has_posted_intr(vmcs12) &&
 730           (!nested_cpu_has_vid(vmcs12) ||
 731            !nested_exit_intr_ack_set(vcpu) ||
 732            (vmcs12->posted_intr_nv & 0xff00) ||
 733            (vmcs12->posted_intr_desc_addr & 0x3f) ||
 734            (vmcs12->posted_intr_desc_addr >> cpuid_maxphyaddr(vcpu))))
 735                return -EINVAL;
 736
 737        /* tpr shadow is needed by all apicv features. */
 738        if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
 739                return -EINVAL;
 740
 741        return 0;
 742}
 743
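    /*
     * A VM-entry/VM-exit MSR switch area must be 16-byte aligned and must lie
     * entirely within the guest's physical address space.
     */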
 744static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
 745                                       u32 count, u64 addr)
 746{
 747        int maxphyaddr;
 748
 749        if (count == 0)
 750                return 0;
 751        maxphyaddr = cpuid_maxphyaddr(vcpu);
 752        if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr ||
 753            (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr)
 754                return -EINVAL;
 755
 756        return 0;
 757}
 758
 759static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu,
 760                                                     struct vmcs12 *vmcs12)
 761{
 762        if (nested_vmx_check_msr_switch(vcpu, vmcs12->vm_exit_msr_load_count,
 763                                        vmcs12->vm_exit_msr_load_addr) ||
 764            nested_vmx_check_msr_switch(vcpu, vmcs12->vm_exit_msr_store_count,
 765                                        vmcs12->vm_exit_msr_store_addr))
 766                return -EINVAL;
 767
 768        return 0;
 769}
 770
 771static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu,
 772                                                      struct vmcs12 *vmcs12)
 773{
 774        if (nested_vmx_check_msr_switch(vcpu, vmcs12->vm_entry_msr_load_count,
 775                                        vmcs12->vm_entry_msr_load_addr))
 776                return -EINVAL;
 777
 778        return 0;
 779}
 780
 781static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
 782                                         struct vmcs12 *vmcs12)
 783{
 784        if (!nested_cpu_has_pml(vmcs12))
 785                return 0;
 786
 787        if (!nested_cpu_has_ept(vmcs12) ||
 788            !page_address_valid(vcpu, vmcs12->pml_address))
 789                return -EINVAL;
 790
 791        return 0;
 792}
 793
 794static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu,
 795                                                        struct vmcs12 *vmcs12)
 796{
 797        if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) &&
 798            !nested_cpu_has_ept(vmcs12))
 799                return -EINVAL;
 800        return 0;
 801}
 802
 803static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu,
 804                                                         struct vmcs12 *vmcs12)
 805{
 806        if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) &&
 807            !nested_cpu_has_ept(vmcs12))
 808                return -EINVAL;
 809        return 0;
 810}
 811
 812static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu,
 813                                                 struct vmcs12 *vmcs12)
 814{
 815        if (!nested_cpu_has_shadow_vmcs(vmcs12))
 816                return 0;
 817
 818        if (!page_address_valid(vcpu, vmcs12->vmread_bitmap) ||
 819            !page_address_valid(vcpu, vmcs12->vmwrite_bitmap))
 820                return -EINVAL;
 821
 822        return 0;
 823}
 824
 825static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
 826                                       struct vmx_msr_entry *e)
 827{
 828        /* x2APIC MSR accesses are not allowed */
 829        if (vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8)
 830                return -EINVAL;
 831        if (e->index == MSR_IA32_UCODE_WRITE || /* SDM Table 35-2 */
 832            e->index == MSR_IA32_UCODE_REV)
 833                return -EINVAL;
 834        if (e->reserved != 0)
 835                return -EINVAL;
 836        return 0;
 837}
 838
 839static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
 840                                     struct vmx_msr_entry *e)
 841{
 842        if (e->index == MSR_FS_BASE ||
 843            e->index == MSR_GS_BASE ||
 844            e->index == MSR_IA32_SMM_MONITOR_CTL || /* SMM is not supported */
 845            nested_vmx_msr_check_common(vcpu, e))
 846                return -EINVAL;
 847        return 0;
 848}
 849
 850static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
 851                                      struct vmx_msr_entry *e)
 852{
 853        if (e->index == MSR_IA32_SMBASE || /* SMM is not supported */
 854            nested_vmx_msr_check_common(vcpu, e))
 855                return -EINVAL;
 856        return 0;
 857}
 858
 859/*
 860 * Load the guest's/host's MSRs at nested entry/exit.
 861 * Returns 0 on success, or the 1-based index of the failing entry on failure.
 862 */
 863static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
 864{
 865        u32 i;
 866        struct vmx_msr_entry e;
 867        struct msr_data msr;
 868
 869        msr.host_initiated = false;
 870        for (i = 0; i < count; i++) {
 871                if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
 872                                        &e, sizeof(e))) {
 873                        pr_debug_ratelimited(
 874                                "%s cannot read MSR entry (%u, 0x%08llx)\n",
 875                                __func__, i, gpa + i * sizeof(e));
 876                        goto fail;
 877                }
 878                if (nested_vmx_load_msr_check(vcpu, &e)) {
 879                        pr_debug_ratelimited(
 880                                "%s check failed (%u, 0x%x, 0x%x)\n",
 881                                __func__, i, e.index, e.reserved);
 882                        goto fail;
 883                }
 884                msr.index = e.index;
 885                msr.data = e.value;
 886                if (kvm_set_msr(vcpu, &msr)) {
 887                        pr_debug_ratelimited(
 888                                "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
 889                                __func__, i, e.index, e.value);
 890                        goto fail;
 891                }
 892        }
 893        return 0;
 894fail:
 895        return i + 1;
 896}
 897
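    /*
     * Store the vCPU's current MSR values into the VM-exit MSR-store area in
     * guest memory.  Returns 0 on success, -EINVAL on any failure.
     */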
 898static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
 899{
 900        u32 i;
 901        struct vmx_msr_entry e;
 902
 903        for (i = 0; i < count; i++) {
 904                struct msr_data msr_info;
 905                if (kvm_vcpu_read_guest(vcpu,
 906                                        gpa + i * sizeof(e),
 907                                        &e, 2 * sizeof(u32))) {
 908                        pr_debug_ratelimited(
 909                                "%s cannot read MSR entry (%u, 0x%08llx)\n",
 910                                __func__, i, gpa + i * sizeof(e));
 911                        return -EINVAL;
 912                }
 913                if (nested_vmx_store_msr_check(vcpu, &e)) {
 914                        pr_debug_ratelimited(
 915                                "%s check failed (%u, 0x%x, 0x%x)\n",
 916                                __func__, i, e.index, e.reserved);
 917                        return -EINVAL;
 918                }
 919                msr_info.host_initiated = false;
 920                msr_info.index = e.index;
 921                if (kvm_get_msr(vcpu, &msr_info)) {
 922                        pr_debug_ratelimited(
 923                                "%s cannot read MSR (%u, 0x%x)\n",
 924                                __func__, i, e.index);
 925                        return -EINVAL;
 926                }
 927                if (kvm_vcpu_write_guest(vcpu,
 928                                         gpa + i * sizeof(e) +
 929                                             offsetof(struct vmx_msr_entry, value),
 930                                         &msr_info.data, sizeof(msr_info.data))) {
 931                        pr_debug_ratelimited(
 932                                "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
 933                                __func__, i, e.index, msr_info.data);
 934                        return -EINVAL;
 935                }
 936        }
 937        return 0;
 938}
 939
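    /*
     * A CR3 value is valid only if it has no bits set above the vCPU's
     * physical address width.
     */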
 940static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val)
 941{
 942        unsigned long invalid_mask;
 943
 944        invalid_mask = (~0ULL) << cpuid_maxphyaddr(vcpu);
 945        return (val & invalid_mask) == 0;
 946}
 947
 948/*
 949 * Load guest's/host's cr3 at nested entry/exit. nested_ept is true if we are
 950 * emulating VM entry into a guest with EPT enabled.
 951 * Returns 0 on success, -EINVAL on failure. The invalid-state exit qualification
 952 * code is assigned to *entry_failure_code on failure.
 953 */
 954static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept,
 955                               u32 *entry_failure_code)
 956{
 957        if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) {
 958                if (!nested_cr3_valid(vcpu, cr3)) {
 959                        *entry_failure_code = ENTRY_FAIL_DEFAULT;
 960                        return -EINVAL;
 961                }
 962
 963                /*
 964                 * If PAE paging and EPT are both on, CR3 is not used by the CPU and
 965                 * must not be dereferenced.
 966                 */
 967                if (is_pae_paging(vcpu) && !nested_ept) {
 968                        if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) {
 969                                *entry_failure_code = ENTRY_FAIL_PDPTE;
 970                                return -EINVAL;
 971                        }
 972                }
 973        }
 974
 975        if (!nested_ept)
 976                kvm_mmu_new_cr3(vcpu, cr3, false);
 977
 978        vcpu->arch.cr3 = cr3;
 979        __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
 980
 981        kvm_init_mmu(vcpu, false);
 982
 983        return 0;
 984}
 985
 986/*
 987 * Returns true if KVM is able to configure the CPU to tag TLB entries
 988 * populated by L2 differently from TLB entries populated
 989 * by L1.
 990 *
 991 * If L1 uses EPT, then TLB entries are tagged with different EPTP.
 992 *
 993 * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged
 994 * with different VPID (L1 entries are tagged with vmx->vpid
 995 * while L2 entries are tagged with vmx->nested.vpid02).
 996 */
 997static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu)
 998{
 999        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1000
1001        return nested_cpu_has_ept(vmcs12) ||
1002               (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02);
1003}
1004
1005static u16 nested_get_vpid02(struct kvm_vcpu *vcpu)
1006{
1007        struct vcpu_vmx *vmx = to_vmx(vcpu);
1008
1009        return vmx->nested.vpid02 ? vmx->nested.vpid02 : vmx->vpid;
1010}
1011
1012
1013static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
1014{
1015        return fixed_bits_valid(control, low, high);
1016}
1017
1018static inline u64 vmx_control_msr(u32 low, u32 high)
1019{
1020        return low | ((u64)high << 32);
1021}
1022
1023static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
1024{
1025        superset &= mask;
1026        subset &= mask;
1027
1028        return (superset | subset) == superset;
1029}
1030
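    /*
     * Userspace restore of MSR_IA32_VMX_BASIC: feature bits may only be
     * cleared, the VMCS revision id must not change, and the advertised VMCS
     * size may not shrink.
     */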
1031static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
1032{
1033        const u64 feature_and_reserved =
1034                /* feature (except bit 48; see below) */
1035                BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
1036                /* reserved */
1037                BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
1038        u64 vmx_basic = vmx->nested.msrs.basic;
1039
1040        if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
1041                return -EINVAL;
1042
1043        /*
1044         * KVM does not emulate a version of VMX that constrains physical
1045         * addresses of VMX structures (e.g. VMCS) to 32-bits.
1046         */
1047        if (data & BIT_ULL(48))
1048                return -EINVAL;
1049
1050        if (vmx_basic_vmcs_revision_id(vmx_basic) !=
1051            vmx_basic_vmcs_revision_id(data))
1052                return -EINVAL;
1053
1054        if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
1055                return -EINVAL;
1056
1057        vmx->nested.msrs.basic = data;
1058        return 0;
1059}
1060
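    /*
     * Userspace restore of a VMX control capability MSR: must-be-1 bits must
     * remain set in the low word and no new allowed-1 bits may appear in the
     * high word relative to what KVM advertised.
     */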
1061static int
1062vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
1063{
1064        u64 supported;
1065        u32 *lowp, *highp;
1066
1067        switch (msr_index) {
1068        case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
1069                lowp = &vmx->nested.msrs.pinbased_ctls_low;
1070                highp = &vmx->nested.msrs.pinbased_ctls_high;
1071                break;
1072        case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
1073                lowp = &vmx->nested.msrs.procbased_ctls_low;
1074                highp = &vmx->nested.msrs.procbased_ctls_high;
1075                break;
1076        case MSR_IA32_VMX_TRUE_EXIT_CTLS:
1077                lowp = &vmx->nested.msrs.exit_ctls_low;
1078                highp = &vmx->nested.msrs.exit_ctls_high;
1079                break;
1080        case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
1081                lowp = &vmx->nested.msrs.entry_ctls_low;
1082                highp = &vmx->nested.msrs.entry_ctls_high;
1083                break;
1084        case MSR_IA32_VMX_PROCBASED_CTLS2:
1085                lowp = &vmx->nested.msrs.secondary_ctls_low;
1086                highp = &vmx->nested.msrs.secondary_ctls_high;
1087                break;
1088        default:
1089                BUG();
1090        }
1091
1092        supported = vmx_control_msr(*lowp, *highp);
1093
1094        /* Check must-be-1 bits are still 1. */
1095        if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0)))
1096                return -EINVAL;
1097
1098        /* Check must-be-0 bits are still 0. */
1099        if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
1100                return -EINVAL;
1101
1102        *lowp = data;
1103        *highp = data >> 32;
1104        return 0;
1105}
1106
1107static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
1108{
1109        const u64 feature_and_reserved_bits =
1110                /* feature */
1111                BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) |
1112                BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) |
1113                /* reserved */
1114                GENMASK_ULL(13, 9) | BIT_ULL(31);
1115        u64 vmx_misc;
1116
1117        vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
1118                                   vmx->nested.msrs.misc_high);
1119
1120        if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
1121                return -EINVAL;
1122
1123        if ((vmx->nested.msrs.pinbased_ctls_high &
1124             PIN_BASED_VMX_PREEMPTION_TIMER) &&
1125            vmx_misc_preemption_timer_rate(data) !=
1126            vmx_misc_preemption_timer_rate(vmx_misc))
1127                return -EINVAL;
1128
1129        if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc))
1130                return -EINVAL;
1131
1132        if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc))
1133                return -EINVAL;
1134
1135        if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
1136                return -EINVAL;
1137
1138        vmx->nested.msrs.misc_low = data;
1139        vmx->nested.msrs.misc_high = data >> 32;
1140
1141        return 0;
1142}
1143
1144static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
1145{
1146        u64 vmx_ept_vpid_cap;
1147
1148        vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.msrs.ept_caps,
1149                                           vmx->nested.msrs.vpid_caps);
1150
1151        /* Every bit is either reserved or a feature bit. */
1152        if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
1153                return -EINVAL;
1154
1155        vmx->nested.msrs.ept_caps = data;
1156        vmx->nested.msrs.vpid_caps = data >> 32;
1157        return 0;
1158}
1159
1160static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
1161{
1162        u64 *msr;
1163
1164        switch (msr_index) {
1165        case MSR_IA32_VMX_CR0_FIXED0:
1166                msr = &vmx->nested.msrs.cr0_fixed0;
1167                break;
1168        case MSR_IA32_VMX_CR4_FIXED0:
1169                msr = &vmx->nested.msrs.cr4_fixed0;
1170                break;
1171        default:
1172                BUG();
1173        }
1174
1175        /*
1176         * 1 bits (which indicate bits that "must-be-1" during VMX operation)
1177         * must be 1 in the restored value.
1178         */
1179        if (!is_bitwise_subset(data, *msr, -1ULL))
1180                return -EINVAL;
1181
1182        *msr = data;
1183        return 0;
1184}
1185
1186/*
1187 * Called when userspace is restoring VMX MSRs.
1188 *
1189 * Returns 0 on success, non-0 otherwise.
1190 */
1191int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1192{
1193        struct vcpu_vmx *vmx = to_vmx(vcpu);
1194
1195        /*
1196         * Don't allow changes to the VMX capability MSRs while the vCPU
1197         * is in VMX operation.
1198         */
1199        if (vmx->nested.vmxon)
1200                return -EBUSY;
1201
1202        switch (msr_index) {
1203        case MSR_IA32_VMX_BASIC:
1204                return vmx_restore_vmx_basic(vmx, data);
1205        case MSR_IA32_VMX_PINBASED_CTLS:
1206        case MSR_IA32_VMX_PROCBASED_CTLS:
1207        case MSR_IA32_VMX_EXIT_CTLS:
1208        case MSR_IA32_VMX_ENTRY_CTLS:
1209                /*
1210                 * The "non-true" VMX capability MSRs are generated from the
1211                 * "true" MSRs, so we do not support restoring them directly.
1212                 *
1213                 * If userspace wants to emulate VMX_BASIC[55]=0, userspace
1214                 * should restore the "true" MSRs with the must-be-1 bits
1215                 * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND
1216                 * DEFAULT SETTINGS".
1217                 */
1218                return -EINVAL;
1219        case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
1220        case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
1221        case MSR_IA32_VMX_TRUE_EXIT_CTLS:
1222        case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
1223        case MSR_IA32_VMX_PROCBASED_CTLS2:
1224                return vmx_restore_control_msr(vmx, msr_index, data);
1225        case MSR_IA32_VMX_MISC:
1226                return vmx_restore_vmx_misc(vmx, data);
1227        case MSR_IA32_VMX_CR0_FIXED0:
1228        case MSR_IA32_VMX_CR4_FIXED0:
1229                return vmx_restore_fixed0_msr(vmx, msr_index, data);
1230        case MSR_IA32_VMX_CR0_FIXED1:
1231        case MSR_IA32_VMX_CR4_FIXED1:
1232                /*
1233                 * These MSRs are generated based on the vCPU's CPUID, so we
1234                 * do not support restoring them directly.
1235                 */
1236                return -EINVAL;
1237        case MSR_IA32_VMX_EPT_VPID_CAP:
1238                return vmx_restore_vmx_ept_vpid_cap(vmx, data);
1239        case MSR_IA32_VMX_VMCS_ENUM:
1240                vmx->nested.msrs.vmcs_enum = data;
1241                return 0;
1242        case MSR_IA32_VMX_VMFUNC:
1243                if (data & ~vmx->nested.msrs.vmfunc_controls)
1244                        return -EINVAL;
1245                vmx->nested.msrs.vmfunc_controls = data;
1246                return 0;
1247        default:
1248                /*
1249                 * The rest of the VMX capability MSRs do not support restore.
1250                 */
1251                return -EINVAL;
1252        }
1253}
1254
1255/* Returns 0 on success, non-0 otherwise. */
1256int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata)
1257{
1258        switch (msr_index) {
1259        case MSR_IA32_VMX_BASIC:
1260                *pdata = msrs->basic;
1261                break;
1262        case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
1263        case MSR_IA32_VMX_PINBASED_CTLS:
1264                *pdata = vmx_control_msr(
1265                        msrs->pinbased_ctls_low,
1266                        msrs->pinbased_ctls_high);
1267                if (msr_index == MSR_IA32_VMX_PINBASED_CTLS)
1268                        *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
1269                break;
1270        case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
1271        case MSR_IA32_VMX_PROCBASED_CTLS:
1272                *pdata = vmx_control_msr(
1273                        msrs->procbased_ctls_low,
1274                        msrs->procbased_ctls_high);
1275                if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS)
1276                        *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
1277                break;
1278        case MSR_IA32_VMX_TRUE_EXIT_CTLS:
1279        case MSR_IA32_VMX_EXIT_CTLS:
1280                *pdata = vmx_control_msr(
1281                        msrs->exit_ctls_low,
1282                        msrs->exit_ctls_high);
1283                if (msr_index == MSR_IA32_VMX_EXIT_CTLS)
1284                        *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
1285                break;
1286        case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
1287        case MSR_IA32_VMX_ENTRY_CTLS:
1288                *pdata = vmx_control_msr(
1289                        msrs->entry_ctls_low,
1290                        msrs->entry_ctls_high);
1291                if (msr_index == MSR_IA32_VMX_ENTRY_CTLS)
1292                        *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
1293                break;
1294        case MSR_IA32_VMX_MISC:
1295                *pdata = vmx_control_msr(
1296                        msrs->misc_low,
1297                        msrs->misc_high);
1298                break;
1299        case MSR_IA32_VMX_CR0_FIXED0:
1300                *pdata = msrs->cr0_fixed0;
1301                break;
1302        case MSR_IA32_VMX_CR0_FIXED1:
1303                *pdata = msrs->cr0_fixed1;
1304                break;
1305        case MSR_IA32_VMX_CR4_FIXED0:
1306                *pdata = msrs->cr4_fixed0;
1307                break;
1308        case MSR_IA32_VMX_CR4_FIXED1:
1309                *pdata = msrs->cr4_fixed1;
1310                break;
1311        case MSR_IA32_VMX_VMCS_ENUM:
1312                *pdata = msrs->vmcs_enum;
1313                break;
1314        case MSR_IA32_VMX_PROCBASED_CTLS2:
1315                *pdata = vmx_control_msr(
1316                        msrs->secondary_ctls_low,
1317                        msrs->secondary_ctls_high);
1318                break;
1319        case MSR_IA32_VMX_EPT_VPID_CAP:
1320                *pdata = msrs->ept_caps |
1321                        ((u64)msrs->vpid_caps << 32);
1322                break;
1323        case MSR_IA32_VMX_VMFUNC:
1324                *pdata = msrs->vmfunc_controls;
1325                break;
1326        default:
1327                return 1;
1328        }
1329
1330        return 0;
1331}
1332
1333/*
1334 * Copy the writable VMCS shadow fields back to the VMCS12, in case they have
1335 * been modified by the L1 guest.  Note, "writable" in this context means
1336 * "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of
1337 * fields tagged SHADOW_FIELD_RO may or may not align with the "read-only"
1338 * VM-exit information fields (which are actually writable if the vCPU is
1339 * configured to support "VMWRITE to any supported field in the VMCS").
1340 */
1341static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
1342{
1343        struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
1344        struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
1345        struct shadow_vmcs_field field;
1346        unsigned long val;
1347        int i;
1348
1349        if (WARN_ON(!shadow_vmcs))
1350                return;
1351
1352        preempt_disable();
1353
1354        vmcs_load(shadow_vmcs);
1355
1356        for (i = 0; i < max_shadow_read_write_fields; i++) {
1357                field = shadow_read_write_fields[i];
1358                val = __vmcs_readl(field.encoding);
1359                vmcs12_write_any(vmcs12, field.encoding, field.offset, val);
1360        }
1361
1362        vmcs_clear(shadow_vmcs);
1363        vmcs_load(vmx->loaded_vmcs->vmcs);
1364
1365        preempt_enable();
1366}
1367
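    /*
     * Copy all shadowed fields from the cached vmcs12 into the shadow VMCS so
     * that L1's VMREADs (and VMWRITEs to RW fields) are satisfied without a
     * VM exit.
     */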
1368static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
1369{
1370        const struct shadow_vmcs_field *fields[] = {
1371                shadow_read_write_fields,
1372                shadow_read_only_fields
1373        };
1374        const int max_fields[] = {
1375                max_shadow_read_write_fields,
1376                max_shadow_read_only_fields
1377        };
1378        struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
1379        struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
1380        struct shadow_vmcs_field field;
1381        unsigned long val;
1382        int i, q;
1383
1384        if (WARN_ON(!shadow_vmcs))
1385                return;
1386
1387        vmcs_load(shadow_vmcs);
1388
1389        for (q = 0; q < ARRAY_SIZE(fields); q++) {
1390                for (i = 0; i < max_fields[q]; i++) {
1391                        field = fields[q][i];
1392                        val = vmcs12_read_any(vmcs12, field.encoding,
1393                                              field.offset);
1394                        __vmcs_writel(field.encoding, val);
1395                }
1396        }
1397
1398        vmcs_clear(shadow_vmcs);
1399        vmcs_load(vmx->loaded_vmcs->vmcs);
1400}
1401
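    /*
     * Copy state from the enlightened VMCS into the cached vmcs12, skipping
     * field groups that the Hyper-V clean-fields mask marks as unchanged.
     */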
1402static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx)
1403{
1404        struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
1405        struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
1406
1407        /* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */
1408        vmcs12->tpr_threshold = evmcs->tpr_threshold;
1409        vmcs12->guest_rip = evmcs->guest_rip;
1410
1411        if (unlikely(!(evmcs->hv_clean_fields &
1412                       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) {
1413                vmcs12->guest_rsp = evmcs->guest_rsp;
1414                vmcs12->guest_rflags = evmcs->guest_rflags;
1415                vmcs12->guest_interruptibility_info =
1416                        evmcs->guest_interruptibility_info;
1417        }
1418
1419        if (unlikely(!(evmcs->hv_clean_fields &
1420                       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) {
1421                vmcs12->cpu_based_vm_exec_control =
1422                        evmcs->cpu_based_vm_exec_control;
1423        }
1424
1425        if (unlikely(!(evmcs->hv_clean_fields &
1426                       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) {
1427                vmcs12->exception_bitmap = evmcs->exception_bitmap;
1428        }
1429
1430        if (unlikely(!(evmcs->hv_clean_fields &
1431                       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) {
1432                vmcs12->vm_entry_controls = evmcs->vm_entry_controls;
1433        }
1434
1435        if (unlikely(!(evmcs->hv_clean_fields &
1436                       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) {
1437                vmcs12->vm_entry_intr_info_field =
1438                        evmcs->vm_entry_intr_info_field;
1439                vmcs12->vm_entry_exception_error_code =
1440                        evmcs->vm_entry_exception_error_code;
1441                vmcs12->vm_entry_instruction_len =
1442                        evmcs->vm_entry_instruction_len;
1443        }
1444
1445        if (unlikely(!(evmcs->hv_clean_fields &
1446                       HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) {
1447                vmcs12->host_ia32_pat = evmcs->host_ia32_pat;
1448                vmcs12->host_ia32_efer = evmcs->host_ia32_efer;
1449                vmcs12->host_cr0 = evmcs->host_cr0;
1450                vmcs12->host_cr3 = evmcs->host_cr3;
1451                vmcs12->host_cr4 = evmcs->host_cr4;
1452                vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp;
1453                vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip;
1454                vmcs12->host_rip = evmcs->host_rip;
1455                vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs;
1456                vmcs12->host_es_selector = evmcs->host_es_selector;
1457                vmcs12->host_cs_selector = evmcs->host_cs_selector;
1458                vmcs12->host_ss_selector = evmcs->host_ss_selector;
1459                vmcs12->host_ds_selector = evmcs->host_ds_selector;
1460                vmcs12->host_fs_selector = evmcs->host_fs_selector;
1461                vmcs12->host_gs_selector = evmcs->host_gs_selector;
1462                vmcs12->host_tr_selector = evmcs->host_tr_selector;
1463        }
1464
1465        if (unlikely(!(evmcs->hv_clean_fields &
1466                       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) {
1467                vmcs12->pin_based_vm_exec_control =
1468                        evmcs->pin_based_vm_exec_control;
1469                vmcs12->vm_exit_controls = evmcs->vm_exit_controls;
1470                vmcs12->secondary_vm_exec_control =
1471                        evmcs->secondary_vm_exec_control;
1472        }
1473
1474        if (unlikely(!(evmcs->hv_clean_fields &
1475                       HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) {
1476                vmcs12->io_bitmap_a = evmcs->io_bitmap_a;
1477                vmcs12->io_bitmap_b = evmcs->io_bitmap_b;
1478        }
1479
1480        if (unlikely(!(evmcs->hv_clean_fields &
1481                       HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) {
1482                vmcs12->msr_bitmap = evmcs->msr_bitmap;
1483        }
1484
1485        if (unlikely(!(evmcs->hv_clean_fields &
1486                       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) {
1487                vmcs12->guest_es_base = evmcs->guest_es_base;
1488                vmcs12->guest_cs_base = evmcs->guest_cs_base;
1489                vmcs12->guest_ss_base = evmcs->guest_ss_base;
1490                vmcs12->guest_ds_base = evmcs->guest_ds_base;
1491                vmcs12->guest_fs_base = evmcs->guest_fs_base;
1492                vmcs12->guest_gs_base = evmcs->guest_gs_base;
1493                vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base;
1494                vmcs12->guest_tr_base = evmcs->guest_tr_base;
1495                vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base;
1496                vmcs12->guest_idtr_base = evmcs->guest_idtr_base;
1497                vmcs12->guest_es_limit = evmcs->guest_es_limit;
1498                vmcs12->guest_cs_limit = evmcs->guest_cs_limit;
1499                vmcs12->guest_ss_limit = evmcs->guest_ss_limit;
1500                vmcs12->guest_ds_limit = evmcs->guest_ds_limit;
1501                vmcs12->guest_fs_limit = evmcs->guest_fs_limit;
1502                vmcs12->guest_gs_limit = evmcs->guest_gs_limit;
1503                vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit;
1504                vmcs12->guest_tr_limit = evmcs->guest_tr_limit;
1505                vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit;
1506                vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit;
1507                vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes;
1508                vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes;
1509                vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes;
1510                vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes;
1511                vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes;
1512                vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes;
1513                vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes;
1514                vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes;
1515                vmcs12->guest_es_selector = evmcs->guest_es_selector;
1516                vmcs12->guest_cs_selector = evmcs->guest_cs_selector;
1517                vmcs12->guest_ss_selector = evmcs->guest_ss_selector;
1518                vmcs12->guest_ds_selector = evmcs->guest_ds_selector;
1519                vmcs12->guest_fs_selector = evmcs->guest_fs_selector;
1520                vmcs12->guest_gs_selector = evmcs->guest_gs_selector;
1521                vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector;
1522                vmcs12->guest_tr_selector = evmcs->guest_tr_selector;
1523        }
1524
1525        if (unlikely(!(evmcs->hv_clean_fields &
1526                       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) {
1527                vmcs12->tsc_offset = evmcs->tsc_offset;
1528                vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr;
1529                vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap;
1530        }
1531
1532        if (unlikely(!(evmcs->hv_clean_fields &
1533                       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) {
1534                vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask;
1535                vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask;
1536                vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow;
1537                vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow;
1538                vmcs12->guest_cr0 = evmcs->guest_cr0;
1539                vmcs12->guest_cr3 = evmcs->guest_cr3;
1540                vmcs12->guest_cr4 = evmcs->guest_cr4;
1541                vmcs12->guest_dr7 = evmcs->guest_dr7;
1542        }
1543
1544        if (unlikely(!(evmcs->hv_clean_fields &
1545                       HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) {
1546                vmcs12->host_fs_base = evmcs->host_fs_base;
1547                vmcs12->host_gs_base = evmcs->host_gs_base;
1548                vmcs12->host_tr_base = evmcs->host_tr_base;
1549                vmcs12->host_gdtr_base = evmcs->host_gdtr_base;
1550                vmcs12->host_idtr_base = evmcs->host_idtr_base;
1551                vmcs12->host_rsp = evmcs->host_rsp;
1552        }
1553
1554        if (unlikely(!(evmcs->hv_clean_fields &
1555                       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) {
1556                vmcs12->ept_pointer = evmcs->ept_pointer;
1557                vmcs12->virtual_processor_id = evmcs->virtual_processor_id;
1558        }
1559
1560        if (unlikely(!(evmcs->hv_clean_fields &
1561                       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) {
1562                vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer;
1563                vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl;
1564                vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat;
1565                vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer;
1566                vmcs12->guest_pdptr0 = evmcs->guest_pdptr0;
1567                vmcs12->guest_pdptr1 = evmcs->guest_pdptr1;
1568                vmcs12->guest_pdptr2 = evmcs->guest_pdptr2;
1569                vmcs12->guest_pdptr3 = evmcs->guest_pdptr3;
1570                vmcs12->guest_pending_dbg_exceptions =
1571                        evmcs->guest_pending_dbg_exceptions;
1572                vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp;
1573                vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip;
1574                vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs;
1575                vmcs12->guest_activity_state = evmcs->guest_activity_state;
1576                vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs;
1577        }
1578
1579        /*
1580         * Not used?
1581         * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr;
1582         * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr;
1583         * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr;
1584         * vmcs12->cr3_target_value0 = evmcs->cr3_target_value0;
1585         * vmcs12->cr3_target_value1 = evmcs->cr3_target_value1;
1586         * vmcs12->cr3_target_value2 = evmcs->cr3_target_value2;
1587         * vmcs12->cr3_target_value3 = evmcs->cr3_target_value3;
1588         * vmcs12->page_fault_error_code_mask =
1589         *              evmcs->page_fault_error_code_mask;
1590         * vmcs12->page_fault_error_code_match =
1591         *              evmcs->page_fault_error_code_match;
1592         * vmcs12->cr3_target_count = evmcs->cr3_target_count;
1593         * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count;
1594         * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count;
1595         * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count;
1596         */
1597
1598        /*
1599         * Read only fields:
1600         * vmcs12->guest_physical_address = evmcs->guest_physical_address;
1601         * vmcs12->vm_instruction_error = evmcs->vm_instruction_error;
1602         * vmcs12->vm_exit_reason = evmcs->vm_exit_reason;
1603         * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info;
1604         * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code;
1605         * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field;
1606         * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code;
1607         * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len;
1608         * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info;
1609         * vmcs12->exit_qualification = evmcs->exit_qualification;
1610         * vmcs12->guest_linear_address = evmcs->guest_linear_address;
1611         *
1612         * Not present in struct vmcs12:
1613         * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx;
1614         * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi;
1615         * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi;
1616         * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip;
1617         */
1618
1619        return 0;
1620}
1621
1622static int copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
1623{
1624        struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
1625        struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
1626
1627        /*
1628         * Should not be changed by KVM:
1629         *
1630         * evmcs->host_es_selector = vmcs12->host_es_selector;
1631         * evmcs->host_cs_selector = vmcs12->host_cs_selector;
1632         * evmcs->host_ss_selector = vmcs12->host_ss_selector;
1633         * evmcs->host_ds_selector = vmcs12->host_ds_selector;
1634         * evmcs->host_fs_selector = vmcs12->host_fs_selector;
1635         * evmcs->host_gs_selector = vmcs12->host_gs_selector;
1636         * evmcs->host_tr_selector = vmcs12->host_tr_selector;
1637         * evmcs->host_ia32_pat = vmcs12->host_ia32_pat;
1638         * evmcs->host_ia32_efer = vmcs12->host_ia32_efer;
1639         * evmcs->host_cr0 = vmcs12->host_cr0;
1640         * evmcs->host_cr3 = vmcs12->host_cr3;
1641         * evmcs->host_cr4 = vmcs12->host_cr4;
1642         * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp;
1643         * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip;
1644         * evmcs->host_rip = vmcs12->host_rip;
1645         * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs;
1646         * evmcs->host_fs_base = vmcs12->host_fs_base;
1647         * evmcs->host_gs_base = vmcs12->host_gs_base;
1648         * evmcs->host_tr_base = vmcs12->host_tr_base;
1649         * evmcs->host_gdtr_base = vmcs12->host_gdtr_base;
1650         * evmcs->host_idtr_base = vmcs12->host_idtr_base;
1651         * evmcs->host_rsp = vmcs12->host_rsp;
1652         * sync_vmcs02_to_vmcs12() doesn't read these:
1653         * evmcs->io_bitmap_a = vmcs12->io_bitmap_a;
1654         * evmcs->io_bitmap_b = vmcs12->io_bitmap_b;
1655         * evmcs->msr_bitmap = vmcs12->msr_bitmap;
1656         * evmcs->ept_pointer = vmcs12->ept_pointer;
1657         * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap;
1658         * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr;
1659         * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr;
1660         * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr;
1661         * evmcs->cr3_target_value0 = vmcs12->cr3_target_value0;
1662         * evmcs->cr3_target_value1 = vmcs12->cr3_target_value1;
1663         * evmcs->cr3_target_value2 = vmcs12->cr3_target_value2;
1664         * evmcs->cr3_target_value3 = vmcs12->cr3_target_value3;
1665         * evmcs->tpr_threshold = vmcs12->tpr_threshold;
1666         * evmcs->virtual_processor_id = vmcs12->virtual_processor_id;
1667         * evmcs->exception_bitmap = vmcs12->exception_bitmap;
1668         * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer;
1669         * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control;
1670         * evmcs->vm_exit_controls = vmcs12->vm_exit_controls;
1671         * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control;
1672         * evmcs->page_fault_error_code_mask =
1673         *              vmcs12->page_fault_error_code_mask;
1674         * evmcs->page_fault_error_code_match =
1675         *              vmcs12->page_fault_error_code_match;
1676         * evmcs->cr3_target_count = vmcs12->cr3_target_count;
1677         * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr;
1678         * evmcs->tsc_offset = vmcs12->tsc_offset;
1679         * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl;
1680         * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask;
1681         * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask;
1682         * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow;
1683         * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow;
1684         * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count;
1685         * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count;
1686         * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count;
1687         *
1688         * Not present in struct vmcs12:
1689         * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx;
1690         * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi;
1691         * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi;
1692         * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip;
1693         */
1694
1695        evmcs->guest_es_selector = vmcs12->guest_es_selector;
1696        evmcs->guest_cs_selector = vmcs12->guest_cs_selector;
1697        evmcs->guest_ss_selector = vmcs12->guest_ss_selector;
1698        evmcs->guest_ds_selector = vmcs12->guest_ds_selector;
1699        evmcs->guest_fs_selector = vmcs12->guest_fs_selector;
1700        evmcs->guest_gs_selector = vmcs12->guest_gs_selector;
1701        evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector;
1702        evmcs->guest_tr_selector = vmcs12->guest_tr_selector;
1703
1704        evmcs->guest_es_limit = vmcs12->guest_es_limit;
1705        evmcs->guest_cs_limit = vmcs12->guest_cs_limit;
1706        evmcs->guest_ss_limit = vmcs12->guest_ss_limit;
1707        evmcs->guest_ds_limit = vmcs12->guest_ds_limit;
1708        evmcs->guest_fs_limit = vmcs12->guest_fs_limit;
1709        evmcs->guest_gs_limit = vmcs12->guest_gs_limit;
1710        evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit;
1711        evmcs->guest_tr_limit = vmcs12->guest_tr_limit;
1712        evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit;
1713        evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit;
1714
1715        evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes;
1716        evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes;
1717        evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes;
1718        evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes;
1719        evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes;
1720        evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes;
1721        evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes;
1722        evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes;
1723
1724        evmcs->guest_es_base = vmcs12->guest_es_base;
1725        evmcs->guest_cs_base = vmcs12->guest_cs_base;
1726        evmcs->guest_ss_base = vmcs12->guest_ss_base;
1727        evmcs->guest_ds_base = vmcs12->guest_ds_base;
1728        evmcs->guest_fs_base = vmcs12->guest_fs_base;
1729        evmcs->guest_gs_base = vmcs12->guest_gs_base;
1730        evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base;
1731        evmcs->guest_tr_base = vmcs12->guest_tr_base;
1732        evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base;
1733        evmcs->guest_idtr_base = vmcs12->guest_idtr_base;
1734
1735        evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat;
1736        evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer;
1737
1738        evmcs->guest_pdptr0 = vmcs12->guest_pdptr0;
1739        evmcs->guest_pdptr1 = vmcs12->guest_pdptr1;
1740        evmcs->guest_pdptr2 = vmcs12->guest_pdptr2;
1741        evmcs->guest_pdptr3 = vmcs12->guest_pdptr3;
1742
1743        evmcs->guest_pending_dbg_exceptions =
1744                vmcs12->guest_pending_dbg_exceptions;
1745        evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp;
1746        evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip;
1747
1748        evmcs->guest_activity_state = vmcs12->guest_activity_state;
1749        evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs;
1750
1751        evmcs->guest_cr0 = vmcs12->guest_cr0;
1752        evmcs->guest_cr3 = vmcs12->guest_cr3;
1753        evmcs->guest_cr4 = vmcs12->guest_cr4;
1754        evmcs->guest_dr7 = vmcs12->guest_dr7;
1755
1756        evmcs->guest_physical_address = vmcs12->guest_physical_address;
1757
1758        evmcs->vm_instruction_error = vmcs12->vm_instruction_error;
1759        evmcs->vm_exit_reason = vmcs12->vm_exit_reason;
1760        evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info;
1761        evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code;
1762        evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field;
1763        evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code;
1764        evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len;
1765        evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info;
1766
1767        evmcs->exit_qualification = vmcs12->exit_qualification;
1768
1769        evmcs->guest_linear_address = vmcs12->guest_linear_address;
1770        evmcs->guest_rsp = vmcs12->guest_rsp;
1771        evmcs->guest_rflags = vmcs12->guest_rflags;
1772
1773        evmcs->guest_interruptibility_info =
1774                vmcs12->guest_interruptibility_info;
1775        evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control;
1776        evmcs->vm_entry_controls = vmcs12->vm_entry_controls;
1777        evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field;
1778        evmcs->vm_entry_exception_error_code =
1779                vmcs12->vm_entry_exception_error_code;
1780        evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len;
1781
1782        evmcs->guest_rip = vmcs12->guest_rip;
1783
1784        evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs;
1785
1786        return 0;
1787}
1788
1789/*
1790 * This is the equivalent of the nested hypervisor executing the vmptrld
1791 * instruction.
1792 */
1793static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu,
1794                                                 bool from_launch)
1795{
1796        struct vcpu_vmx *vmx = to_vmx(vcpu);
1797        bool evmcs_gpa_changed = false;
1798        u64 evmcs_gpa;
1799
1800        if (likely(!vmx->nested.enlightened_vmcs_enabled))
1801                return 1;
1802
1803        if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa))
1804                return 1;
1805
1806        if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
1807                if (!vmx->nested.hv_evmcs)
1808                        vmx->nested.current_vmptr = -1ull;
1809
1810                nested_release_evmcs(vcpu);
1811
1812                if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa),
1813                                 &vmx->nested.hv_evmcs_map))
1814                        return 0;
1815
1816                vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva;
1817
1818                /*
1819                 * Currently, KVM only supports eVMCS version 1
1820                 * (== KVM_EVMCS_VERSION) and thus we expect the guest to set
1821                 * this value in the first u32 field of the eVMCS, which should
1822                 * specify the eVMCS VersionNumber.
1823                 *
1824                 * The guest should learn the host's supported eVMCS versions by
1825                 * examining CPUID.0x4000000A.EAX[0:15]. The host userspace VMM is
1826                 * expected to set this CPUID leaf according to the value
1827                 * returned in vmcs_version from nested_enable_evmcs().
1828                 *
1829                 * However, it turns out that Microsoft Hyper-V fails to comply
1830                 * with its own invented interface: when Hyper-V uses eVMCS, it
1831                 * just sets the first u32 field of the eVMCS to the revision_id
1832                 * specified in MSR_IA32_VMX_BASIC, instead of the eVMCS version
1833                 * number, which should be one of the supported versions from
1834                 * CPUID.0x4000000A.EAX[0:15].
1835                 *
1836                 * To work around this Hyper-V bug, accept here either a supported
1837                 * eVMCS version or the VMCS12 revision_id as valid values for the
1838                 * first u32 field of the eVMCS.
1839                 */
1840                if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) &&
1841                    (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) {
1842                        nested_release_evmcs(vcpu);
1843                        return 0;
1844                }
1845
1846                vmx->nested.dirty_vmcs12 = true;
1847                vmx->nested.hv_evmcs_vmptr = evmcs_gpa;
1848
1849                evmcs_gpa_changed = true;
1850                /*
1851                 * Unlike normal vmcs12, enlightened vmcs12 is not fully
1852                 * reloaded from the guest's memory (read-only fields, fields not
1853                 * present in struct hv_enlightened_vmcs, ...). Make sure there
1854                 * are no leftovers.
1855                 */
1856                if (from_launch) {
1857                        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1858                        memset(vmcs12, 0, sizeof(*vmcs12));
1859                        vmcs12->hdr.revision_id = VMCS12_REVISION;
1860                }
1861
1862        }
1863
1864        /*
1865         * Clean fields data can't be used on VMLAUNCH and when we switch
1866         * between different L2 guests as KVM keeps a single VMCS12 per L1.
1867         */
1868        if (from_launch || evmcs_gpa_changed)
1869                vmx->nested.hv_evmcs->hv_clean_fields &=
1870                        ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
1871
1872        return 1;
1873}
1874
1875void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu)
1876{
1877        struct vcpu_vmx *vmx = to_vmx(vcpu);
1878
1879        /*
1880         * hv_evmcs may end up not being mapped after migration (when
1881         * L2 was running), map it here to make sure vmcs12 changes are
1882         * properly reflected.
1883         */
1884        if (vmx->nested.enlightened_vmcs_enabled && !vmx->nested.hv_evmcs)
1885                nested_vmx_handle_enlightened_vmptrld(vcpu, false);
1886
1887        if (vmx->nested.hv_evmcs) {
1888                copy_vmcs12_to_enlightened(vmx);
1889                /* All fields are clean */
1890                vmx->nested.hv_evmcs->hv_clean_fields |=
1891                        HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
1892        } else {
1893                copy_vmcs12_to_shadow(vmx);
1894        }
1895
1896        vmx->nested.need_vmcs12_to_shadow_sync = false;
1897}
1898
1899static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
1900{
1901        struct vcpu_vmx *vmx =
1902                container_of(timer, struct vcpu_vmx, nested.preemption_timer);
1903
1904        vmx->nested.preemption_timer_expired = true;
1905        kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
1906        kvm_vcpu_kick(&vmx->vcpu);
1907
1908        return HRTIMER_NORESTART;
1909}
1910
1911static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu)
1912{
1913        u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value;
1914        struct vcpu_vmx *vmx = to_vmx(vcpu);
1915
1916        /*
1917         * A timer value of zero is architecturally guaranteed to cause
1918         * a VMExit prior to executing any instructions in the guest.
1919         */
1920        if (preemption_timeout == 0) {
1921                vmx_preemption_timer_fn(&vmx->nested.preemption_timer);
1922                return;
1923        }
1924
1925        if (vcpu->arch.virtual_tsc_khz == 0)
1926                return;
1927
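        /*
         * Convert the vmcs12 value, which is in units of
         * 2^VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE TSC cycles, into
         * nanoseconds for the hrtimer.  For example, a vmcs12 value of 1000
         * with a 2 GHz virtual TSC gives 1000 << 5 = 32000 cycles, i.e.
         * 32000 * 1000000 / 2000000 (kHz) = 16000 ns.
         */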
1928        preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
1929        preemption_timeout *= 1000000;
1930        do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz);
1931        hrtimer_start(&vmx->nested.preemption_timer,
1932                      ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL);
1933}
1934
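/*
 * Compute the EFER value L2 will run with: either the value L1 asked to load
 * on VM-entry, or L1's current EFER with LMA/LME forced to match the
 * requested IA-32e mode guest setting.
 */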
1935static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
1936{
1937        if (vmx->nested.nested_run_pending &&
1938            (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER))
1939                return vmcs12->guest_ia32_efer;
1940        else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
1941                return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME);
1942        else
1943                return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME);
1944}
1945
1946static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
1947{
1948        /*
1949         * If vmcs02 hasn't been initialized, set the constant vmcs02 state
1950         * according to L0's settings (vmcs12 is irrelevant here).  Host
1951         * fields that come from L0 and are not constant, e.g. HOST_CR3,
1952         * will be set as needed prior to VMLAUNCH/VMRESUME.
1953         */
1954        if (vmx->nested.vmcs02_initialized)
1955                return;
1956        vmx->nested.vmcs02_initialized = true;
1957
1958        /*
1959         * We don't care what the EPTP value is; we just need to guarantee
1960         * it's valid so we don't get a false positive when doing early
1961         * consistency checks.
1962         */
1963        if (enable_ept && nested_early_check)
1964                vmcs_write64(EPT_POINTER, construct_eptp(&vmx->vcpu, 0));
1965
1966        /* All VMFUNCs are currently emulated through L0 vmexits.  */
1967        if (cpu_has_vmx_vmfunc())
1968                vmcs_write64(VM_FUNCTION_CONTROL, 0);
1969
1970        if (cpu_has_vmx_posted_intr())
1971                vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);
1972
1973        if (cpu_has_vmx_msr_bitmap())
1974                vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
1975
1976        /*
1977         * The PML address never changes, so it is constant in vmcs02.
1978         * Conceptually we want to copy the PML index from vmcs01 here,
1979         * and then back to vmcs01 on nested vmexit.  But since we flush
1980         * the log and reset GUEST_PML_INDEX on each vmexit, the PML
1981         * index is also effectively constant in vmcs02.
1982         */
1983        if (enable_pml) {
1984                vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
1985                vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
1986        }
1987
1988        if (cpu_has_vmx_encls_vmexit())
1989                vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
1990
1991        /*
1992         * Set the MSR load/store lists to match L0's settings.  Only the
1993         * addresses are constant (for vmcs02), the counts can change based
1994         * on L2's behavior, e.g. switching to/from long mode.
1995         */
1996        vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
1997        vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
1998        vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
1999
2000        vmx_set_constant_host_state(vmx);
2001}
2002
2003static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx,
2004                                      struct vmcs12 *vmcs12)
2005{
2006        prepare_vmcs02_constant_state(vmx);
2007
2008        vmcs_write64(VMCS_LINK_POINTER, -1ull);
2009
2010        if (enable_vpid) {
2011                if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
2012                        vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
2013                else
2014                        vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
2015        }
2016}
2017
2018static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
2019{
2020        u32 exec_control, vmcs12_exec_ctrl;
2021        u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12);
2022
2023        if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs)
2024                prepare_vmcs02_early_rare(vmx, vmcs12);
2025
2026        /*
2027         * PIN CONTROLS
2028         */
2029        exec_control = vmx_pin_based_exec_ctrl(vmx);
2030        exec_control |= (vmcs12->pin_based_vm_exec_control &
2031                         ~PIN_BASED_VMX_PREEMPTION_TIMER);
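        /*
         * PIN_BASED_VMX_PREEMPTION_TIMER is masked out above because vmcs12's
         * preemption timer is emulated with the hrtimer armed by
         * vmx_start_preemption_timer() instead of being passed through to
         * vmcs02.
         */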
2032
2033        /* Posted interrupts setting is only taken from vmcs12.  */
2034        if (nested_cpu_has_posted_intr(vmcs12)) {
2035                vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
2036                vmx->nested.pi_pending = false;
2037        } else {
2038                exec_control &= ~PIN_BASED_POSTED_INTR;
2039        }
2040        pin_controls_set(vmx, exec_control);
2041
2042        /*
2043         * EXEC CONTROLS
2044         */
2045        exec_control = vmx_exec_control(vmx); /* L0's desires */
2046        exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
2047        exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
2048        exec_control &= ~CPU_BASED_TPR_SHADOW;
2049        exec_control |= vmcs12->cpu_based_vm_exec_control;
2050
2051        if (exec_control & CPU_BASED_TPR_SHADOW)
2052                vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
2053#ifdef CONFIG_X86_64
2054        else
2055                exec_control |= CPU_BASED_CR8_LOAD_EXITING |
2056                                CPU_BASED_CR8_STORE_EXITING;
2057#endif
2058
2059        /*
2060         * A vmexit (to either L1 hypervisor or L0 userspace) is always needed
2061         * for I/O port accesses.
2062         */
2063        exec_control |= CPU_BASED_UNCOND_IO_EXITING;
2064        exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
2065
2066        /*
2067         * This bit will be computed in nested_get_vmcs12_pages, because
2068         * we do not have access to L1's MSR bitmap yet.  For now, keep
2069         * the same bit as before, hoping to avoid multiple VMWRITEs that
2070         * only set/clear this bit.
2071         */
2072        exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
2073        exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS;
2074
2075        exec_controls_set(vmx, exec_control);
2076
2077        /*
2078         * SECONDARY EXEC CONTROLS
2079         */
2080        if (cpu_has_secondary_exec_ctrls()) {
2081                exec_control = vmx->secondary_exec_control;
2082
2083                /* Take the following fields only from vmcs12 */
2084                exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
2085                                  SECONDARY_EXEC_ENABLE_INVPCID |
2086                                  SECONDARY_EXEC_RDTSCP |
2087                                  SECONDARY_EXEC_XSAVES |
2088                                  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
2089                                  SECONDARY_EXEC_APIC_REGISTER_VIRT |
2090                                  SECONDARY_EXEC_ENABLE_VMFUNC);
2091                if (nested_cpu_has(vmcs12,
2092                                   CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) {
2093                        vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control &
2094                                ~SECONDARY_EXEC_ENABLE_PML;
2095                        exec_control |= vmcs12_exec_ctrl;
2096                }
2097
2098                /* VMCS shadowing for L2 is emulated for now */
2099                exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
2100
2101                /*
2102                 * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4()
2103                 * will not have to rewrite the controls just for this bit.
2104                 */
2105                if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated() &&
2106                    (vmcs12->guest_cr4 & X86_CR4_UMIP))
2107                        exec_control |= SECONDARY_EXEC_DESC;
2108
2109                if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
2110                        vmcs_write16(GUEST_INTR_STATUS,
2111                                vmcs12->guest_intr_status);
2112
2113                secondary_exec_controls_set(vmx, exec_control);
2114        }
2115
2116        /*
2117         * ENTRY CONTROLS
2118         *
2119         * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE
2120         * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate
2121         * on the related bits (if supported by the CPU) in the hope that
2122         * we can avoid VMWrites during vmx_set_efer().
2123         */
2124        exec_control = (vmcs12->vm_entry_controls | vmx_vmentry_ctrl()) &
2125                        ~VM_ENTRY_IA32E_MODE & ~VM_ENTRY_LOAD_IA32_EFER;
2126        if (cpu_has_load_ia32_efer()) {
2127                if (guest_efer & EFER_LMA)
2128                        exec_control |= VM_ENTRY_IA32E_MODE;
2129                if (guest_efer != host_efer)
2130                        exec_control |= VM_ENTRY_LOAD_IA32_EFER;
2131        }
2132        vm_entry_controls_set(vmx, exec_control);
2133
2134        /*
2135         * EXIT CONTROLS
2136         *
2137         * L2->L1 exit controls are emulated - the hardware exit is to L0 so
2138         * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
2139         * bits may be modified by vmx_set_efer() in prepare_vmcs02().
2140         */
2141        exec_control = vmx_vmexit_ctrl();
2142        if (cpu_has_load_ia32_efer() && guest_efer != host_efer)
2143                exec_control |= VM_EXIT_LOAD_IA32_EFER;
2144        vm_exit_controls_set(vmx, exec_control);
2145
2146        /*
2147         * Interrupt/Exception Fields
2148         */
2149        if (vmx->nested.nested_run_pending) {
2150                vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2151                             vmcs12->vm_entry_intr_info_field);
2152                vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
2153                             vmcs12->vm_entry_exception_error_code);
2154                vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
2155                             vmcs12->vm_entry_instruction_len);
2156                vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
2157                             vmcs12->guest_interruptibility_info);
2158                vmx->loaded_vmcs->nmi_known_unmasked =
2159                        !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI);
2160        } else {
2161                vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
2162        }
2163}
2164
2165static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
2166{
2167        struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;
2168
2169        if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
2170                           HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) {
2171                vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
2172                vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
2173                vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
2174                vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
2175                vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
2176                vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector);
2177                vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
2178                vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
2179                vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
2180                vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
2181                vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
2182                vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
2183                vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
2184                vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit);
2185                vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit);
2186                vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
2187                vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
2188                vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
2189                vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
2190                vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
2191                vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
2192                vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
2193                vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
2194                vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
2195                vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
2196                vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
2197                vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
2198                vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
2199                vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
2200                vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
2201                vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
2202                vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base);
2203                vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base);
2204                vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
2205                vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
2206                vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
2207        }
2208
2209        if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
2210                           HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) {
2211                vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
2212                vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
2213                            vmcs12->guest_pending_dbg_exceptions);
2214                vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
2215                vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
2216
2217                /*
2218                 * L1 may access L2's PDPTRs, so save them to construct
2219                 * vmcs12.
2220                 */
2221                if (enable_ept) {
2222                        vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
2223                        vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
2224                        vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
2225                        vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
2226                }
2227
2228                if (kvm_mpx_supported() && vmx->nested.nested_run_pending &&
2229                    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
2230                        vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
2231        }
2232
2233        if (nested_cpu_has_xsaves(vmcs12))
2234                vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);
2235
2236        /*
2237         * Whether page-faults are trapped is determined by a combination of
2238         * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.
2239         * If enable_ept, L0 doesn't care about page faults and we should
2240         * set all of these to L1's desires. However, if !enable_ept, L0 does
2241         * care about (at least some) page faults, and because it is not easy
2242         * (if at all possible?) to merge L0 and L1's desires, we simply ask
2243         * to exit on each and every L2 page fault. This is done by setting
2244         * MASK=MATCH=0 and (see below) EB.PF=1.
2245         * Note that below we don't need special code to set EB.PF beyond the
2246         * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
2247         * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
2248         * !enable_ept, EB.PF is 1, so the "or" will always be 1.
2249         */
2250        vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
2251                enable_ept ? vmcs12->page_fault_error_code_mask : 0);
2252        vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
2253                enable_ept ? vmcs12->page_fault_error_code_match : 0);
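        /*
         * Per the SDM, a page fault causes a vmexit iff EB.PF equals the
         * result of ((PFEC & MASK) == MATCH); with MASK == MATCH == 0 every
         * error code matches, so EB.PF alone decides whether L2 page faults
         * exit.
         */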
2254
2255        if (cpu_has_vmx_apicv()) {
2256                vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0);
2257                vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1);
2258                vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2);
2259                vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3);
2260        }
2261
2262        vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
2263        vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
2264
2265        set_cr4_guest_host_mask(vmx);
2266}
2267
2268/*
2269 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
2270 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
2271 * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
2272 * guest in a way that will both be appropriate to L1's requests, and our
2273 * needs. In addition to modifying the active vmcs (which is vmcs02), this
2274 * function also has additional necessary side-effects, like setting various
2275 * vcpu->arch fields.
2276 * Returns 0 on success, -EINVAL on failure. An invalid-state exit
2277 * qualification code is assigned to entry_failure_code on failure.
2278 */
2279static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
2280                          u32 *entry_failure_code)
2281{
2282        struct vcpu_vmx *vmx = to_vmx(vcpu);
2283        struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;
2284        bool load_guest_pdptrs_vmcs12 = false;
2285
2286        if (vmx->nested.dirty_vmcs12 || hv_evmcs) {
2287                prepare_vmcs02_rare(vmx, vmcs12);
2288                vmx->nested.dirty_vmcs12 = false;
2289
2290                load_guest_pdptrs_vmcs12 = !hv_evmcs ||
2291                        !(hv_evmcs->hv_clean_fields &
2292                          HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
2293        }
2294
2295        if (vmx->nested.nested_run_pending &&
2296            (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
2297                kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
2298                vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
2299        } else {
2300                kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
2301                vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl);
2302        }
2303        if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
2304            !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
2305                vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
2306        vmx_set_rflags(vcpu, vmcs12->guest_rflags);
2307
2308        /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
2309         * bitwise-or of what L1 wants to trap for L2, and what we want to
2310         * trap. Note that CR0.TS also needs updating - we do this later.
2311         */
2312        update_exception_bitmap(vcpu);
2313        vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
2314        vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
2315
2316        if (vmx->nested.nested_run_pending &&
2317            (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) {
2318                vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
2319                vcpu->arch.pat = vmcs12->guest_ia32_pat;
2320        } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
2321                vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
2322        }
2323
2324        vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
2325
2326        if (kvm_has_tsc_control)
2327                decache_tsc_multiplier(vmx);
2328
2329        if (enable_vpid) {
2330                /*
2331                 * There is no direct mapping between vpid02 and vpid12: vpid02
2332                 * is per-vCPU for L0 and reused, while a change in the value of
2333                 * vpid12 is handled with one INVVPID during nested vmentry.
2334                 * vpid12 is allocated by L1 for L2, so it does not influence
2335                 * the global bitmap (used for vpid01 and vpid02 allocation)
2336                 * even if L1 spawns a lot of nested vCPUs.
2337                 */
2338                if (nested_cpu_has_vpid(vmcs12) && nested_has_guest_tlb_tag(vcpu)) {
2339                        if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
2340                                vmx->nested.last_vpid = vmcs12->virtual_processor_id;
2341                                __vmx_flush_tlb(vcpu, nested_get_vpid02(vcpu), false);
2342                        }
2343                } else {
2344                        /*
2345                         * If L1 uses EPT, then L0 needs to execute INVEPT on
2346                         * EPTP02 instead of EPTP01. Therefore, delay TLB
2347                         * flush until vmcs02->eptp is fully updated by
2348                         * KVM_REQ_LOAD_CR3. Note that this assumes
2349                         * KVM_REQ_TLB_FLUSH is evaluated after
2350                         * KVM_REQ_LOAD_CR3 in vcpu_enter_guest().
2351                         */
2352                        kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
2353                }
2354        }
2355
2356        if (nested_cpu_has_ept(vmcs12))
2357                nested_ept_init_mmu_context(vcpu);
2358        else if (nested_cpu_has2(vmcs12,
2359                                 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
2360                vmx_flush_tlb(vcpu, true);
2361
2362        /*
2363         * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying the
2364         * bits that we consider mandatory to keep enabled.
2365         * The CR0_READ_SHADOW is what L2 should have expected to read given
2366         * the specifications by L1; it's not enough to take
2367         * vmcs12->cr0_read_shadow because our cr0_guest_host_mask may
2368         * have more bits than L1 expected.
2369         */
2370        vmx_set_cr0(vcpu, vmcs12->guest_cr0);
2371        vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
2372
2373        vmx_set_cr4(vcpu, vmcs12->guest_cr4);
2374        vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
2375
2376        vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12);
2377        /* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
2378        vmx_set_efer(vcpu, vcpu->arch.efer);
2379
2380        /*
2381         * Guest state is invalid and unrestricted guest is disabled,
2382         * which means L1 attempted VMEntry to L2 with invalid state.
2383         * Fail the VMEntry.
2384         */
2385        if (vmx->emulation_required) {
2386                *entry_failure_code = ENTRY_FAIL_DEFAULT;
2387                return -EINVAL;
2388        }
2389
2390        /* Load vmcs12->guest_cr3, shadowed via either EPT or shadow page tables. */
2391        if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
2392                                entry_failure_code))
2393                return -EINVAL;
2394
2395        /* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */
2396        if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) &&
2397            is_pae_paging(vcpu)) {
2398                vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
2399                vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
2400                vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
2401                vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
2402        }
2403
2404        if (!enable_ept)
2405                vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
2406
2407        kvm_rsp_write(vcpu, vmcs12->guest_rsp);
2408        kvm_rip_write(vcpu, vmcs12->guest_rip);
2409        return 0;
2410}
2411
2412static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12)
2413{
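        /*
         * Per the SDM, "virtual NMIs" requires "NMI exiting", and "NMI-window
         * exiting" in turn requires "virtual NMIs"; reject vmcs12 settings
         * that violate either dependency.
         */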
2414        if (!nested_cpu_has_nmi_exiting(vmcs12) &&
2415            nested_cpu_has_virtual_nmis(vmcs12))
2416                return -EINVAL;
2417
2418        if (!nested_cpu_has_virtual_nmis(vmcs12) &&
2419            nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING))
2420                return -EINVAL;
2421
2422        return 0;
2423}
2424
2425static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address)
2426{
2427        struct vcpu_vmx *vmx = to_vmx(vcpu);
2428        int maxphyaddr = cpuid_maxphyaddr(vcpu);
2429
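        /*
         * EPTP layout checked below (per the SDM): bits 2:0 memory type,
         * bits 5:3 page-walk length minus 1, bit 6 enables accessed/dirty
         * flags, bits 11:7 are reserved, and the upper bits hold the
         * 4KB-aligned physical address of the PML4 table.
         */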
2430        /* Check for memory type validity */
2431        switch (address & VMX_EPTP_MT_MASK) {
2432        case VMX_EPTP_MT_UC:
2433                if (!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT))
2434                        return false;
2435                break;
2436        case VMX_EPTP_MT_WB:
2437                if (!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT))
2438                        return false;
2439                break;
2440        default:
2441                return false;
2442        }
2443
2444        /* Only a 4-level EPT page-walk length is valid. */
2445        if ((address & VMX_EPTP_PWL_MASK) != VMX_EPTP_PWL_4)
2446                return false;
2447
2448        /* Reserved bits should not be set */
2449        if (address >> maxphyaddr || ((address >> 7) & 0x1f))
2450                return false;
2451
2452        /* AD, if set, should be supported */
2453        if (address & VMX_EPTP_AD_ENABLE_BIT) {
2454                if (!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT))
2455                        return false;
2456        }
2457
2458        return true;
2459}
2460
2461/*
2462 * Checks related to VM-Execution Control Fields
2463 */
2464static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu,
2465                                              struct vmcs12 *vmcs12)
2466{
2467        struct vcpu_vmx *vmx = to_vmx(vcpu);
2468
2469        if (!vmx_control_verify(vmcs12->pin_based_vm_exec_control,
2470                                vmx->nested.msrs.pinbased_ctls_low,
2471                                vmx->nested.msrs.pinbased_ctls_high) ||
2472            !vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
2473                                vmx->nested.msrs.procbased_ctls_low,
2474                                vmx->nested.msrs.procbased_ctls_high))
2475                return -EINVAL;
2476
2477        if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
2478            !vmx_control_verify(vmcs12->secondary_vm_exec_control,
2479                                 vmx->nested.msrs.secondary_ctls_low,
2480                                 vmx->nested.msrs.secondary_ctls_high))
2481                return -EINVAL;
2482
2483        if (vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu) ||
2484            nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) ||
2485            nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) ||
2486            nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) ||
2487            nested_vmx_check_apic_access_controls(vcpu, vmcs12) ||
2488            nested_vmx_check_apicv_controls(vcpu, vmcs12) ||
2489            nested_vmx_check_nmi_controls(vmcs12) ||
2490            nested_vmx_check_pml_controls(vcpu, vmcs12) ||
2491            nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) ||
2492            nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) ||
2493            nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) ||
2494            (nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id))
2495                return -EINVAL;
2496
2497        if (!nested_cpu_has_preemption_timer(vmcs12) &&
2498            nested_cpu_has_save_preemption_timer(vmcs12))
2499                return -EINVAL;
2500
2501        if (nested_cpu_has_ept(vmcs12) &&
2502            !valid_ept_address(vcpu, vmcs12->ept_pointer))
2503                return -EINVAL;
2504
2505        if (nested_cpu_has_vmfunc(vmcs12)) {
2506                if (vmcs12->vm_function_control &
2507                    ~vmx->nested.msrs.vmfunc_controls)
2508                        return -EINVAL;
2509
2510                if (nested_cpu_has_eptp_switching(vmcs12)) {
2511                        if (!nested_cpu_has_ept(vmcs12) ||
2512                            !page_address_valid(vcpu, vmcs12->eptp_list_address))
2513                                return -EINVAL;
2514                }
2515        }
2516
2517        return 0;
2518}
2519
2520/*
2521 * Checks related to VM-Exit Control Fields
2522 */
2523static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu,
2524                                         struct vmcs12 *vmcs12)
2525{
2526        struct vcpu_vmx *vmx = to_vmx(vcpu);
2527
2528        if (!vmx_control_verify(vmcs12->vm_exit_controls,
2529                                vmx->nested.msrs.exit_ctls_low,
2530                                vmx->nested.msrs.exit_ctls_high) ||
2531            nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12))
2532                return -EINVAL;
2533
2534        return 0;
2535}
2536
2537/*
2538 * Checks related to VM-Entry Control Fields
2539 */
2540static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu,
2541                                          struct vmcs12 *vmcs12)
2542{
2543        struct vcpu_vmx *vmx = to_vmx(vcpu);
2544
2545        if (!vmx_control_verify(vmcs12->vm_entry_controls,
2546                                vmx->nested.msrs.entry_ctls_low,
2547                                vmx->nested.msrs.entry_ctls_high))
2548                return -EINVAL;
2549
2550        /*
2551         * From the Intel SDM, volume 3:
2552         * Fields relevant to VM-entry event injection must be set properly.
2553         * These fields are the VM-entry interruption-information field, the
2554         * VM-entry exception error code, and the VM-entry instruction length.
2555         */
2556        if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) {
2557                u32 intr_info = vmcs12->vm_entry_intr_info_field;
2558                u8 vector = intr_info & INTR_INFO_VECTOR_MASK;
2559                u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK;
2560                bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK;
2561                bool should_have_error_code;
2562                bool urg = nested_cpu_has2(vmcs12,
2563                                           SECONDARY_EXEC_UNRESTRICTED_GUEST);
2564                bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE;
2565
2566                /* VM-entry interruption-info field: interruption type */
2567                if (intr_type == INTR_TYPE_RESERVED ||
2568                    (intr_type == INTR_TYPE_OTHER_EVENT &&
2569                     !nested_cpu_supports_monitor_trap_flag(vcpu)))
2570                        return -EINVAL;
2571
2572                /* VM-entry interruption-info field: vector */
2573                if ((intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) ||
2574                    (intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) ||
2575                    (intr_type == INTR_TYPE_OTHER_EVENT && vector != 0))
2576                        return -EINVAL;
2577
2578                /* VM-entry interruption-info field: deliver error code */
2579                should_have_error_code =
2580                        intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode &&
2581                        x86_exception_has_error_code(vector);
2582                if (has_error_code != should_have_error_code)
2583                        return -EINVAL;
2584
2585                /* VM-entry exception error code */
2586                if (has_error_code &&
2587                    vmcs12->vm_entry_exception_error_code & GENMASK(31, 15))
2588                        return -EINVAL;
2589
2590                /* VM-entry interruption-info field: reserved bits */
2591                if (intr_info & INTR_INFO_RESVD_BITS_MASK)
2592                        return -EINVAL;
2593
2594                /* VM-entry instruction length */
2595                switch (intr_type) {
2596                case INTR_TYPE_SOFT_EXCEPTION:
2597                case INTR_TYPE_SOFT_INTR:
2598                case INTR_TYPE_PRIV_SW_EXCEPTION:
2599                        if ((vmcs12->vm_entry_instruction_len > 15) ||
2600                            (vmcs12->vm_entry_instruction_len == 0 &&
2601                             !nested_cpu_has_zero_length_injection(vcpu)))
2602                                return -EINVAL;
2603                }
2604        }
2605
2606        if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12))
2607                return -EINVAL;
2608
2609        return 0;
2610}
2611
2612static int nested_vmx_check_controls(struct kvm_vcpu *vcpu,
2613                                     struct vmcs12 *vmcs12)
2614{
2615        if (nested_check_vm_execution_controls(vcpu, vmcs12) ||
2616            nested_check_vm_exit_controls(vcpu, vmcs12) ||
2617            nested_check_vm_entry_controls(vcpu, vmcs12))
2618                return -EINVAL;
2619
2620        return 0;
2621}
2622
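/*
 * Checks related to the vmcs12 Host State Area, i.e. the L1 state that will
 * be loaded on the next VM-exit from L2.
 */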
2623static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
2624                                       struct vmcs12 *vmcs12)
2625{
2626        bool ia32e;
2627
2628        if (!nested_host_cr0_valid(vcpu, vmcs12->host_cr0) ||
2629            !nested_host_cr4_valid(vcpu, vmcs12->host_cr4) ||
2630            !nested_cr3_valid(vcpu, vmcs12->host_cr3))
2631                return -EINVAL;
2632
2633        if (is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu) ||
2634            is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu))
2635                return -EINVAL;
2636
2637        if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) &&
2638            !kvm_pat_valid(vmcs12->host_ia32_pat))
2639                return -EINVAL;
2640
2641        ia32e = (vmcs12->vm_exit_controls &
2642                 VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0;
2643
2644        if (vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) ||
2645            vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) ||
2646            vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) ||
2647            vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) ||
2648            vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) ||
2649            vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) ||
2650            vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) ||
2651            vmcs12->host_cs_selector == 0 ||
2652            vmcs12->host_tr_selector == 0 ||
2653            (vmcs12->host_ss_selector == 0 && !ia32e))
2654                return -EINVAL;
2655
2656#ifdef CONFIG_X86_64
2657        if (is_noncanonical_address(vmcs12->host_fs_base, vcpu) ||
2658            is_noncanonical_address(vmcs12->host_gs_base, vcpu) ||
2659            is_noncanonical_address(vmcs12->host_gdtr_base, vcpu) ||
2660            is_noncanonical_address(vmcs12->host_idtr_base, vcpu) ||
2661            is_noncanonical_address(vmcs12->host_tr_base, vcpu))
2662                return -EINVAL;
2663#endif
2664
2665        /*
2666         * If the load IA32_EFER VM-exit control is 1, bits reserved in the
2667         * IA32_EFER MSR must be 0 in the field for that register. In addition,
2668         * the values of the LMA and LME bits in the field must each be that of
2669         * the host address-space size VM-exit control.
2670         */
2671        if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) {
2672                if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) ||
2673                    ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) ||
2674                    ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))
2675                        return -EINVAL;
2676        }
2677
2678        return 0;
2679}
2680
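/*
 * Validate the VMCS link pointer: unless it is -1ull, it must be a legal,
 * page-aligned address whose VMCS revision ID matches and whose shadow-VMCS
 * indicator agrees with the "VMCS shadowing" execution control.
 */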
2681static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu,
2682                                          struct vmcs12 *vmcs12)
2683{
2684        int r = 0;
2685        struct vmcs12 *shadow;
2686        struct kvm_host_map map;
2687
2688        if (vmcs12->vmcs_link_pointer == -1ull)
2689                return 0;
2690
2691        if (!page_address_valid(vcpu, vmcs12->vmcs_link_pointer))
2692                return -EINVAL;
2693
2694        if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map))
2695                return -EINVAL;
2696
2697        shadow = map.hva;
2698
2699        if (shadow->hdr.revision_id != VMCS12_REVISION ||
2700            shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))
2701                r = -EINVAL;
2702
2703        kvm_vcpu_unmap(vcpu, &map, false);
2704        return r;
2705}
2706
2707/*
2708 * Checks related to Guest Non-register State
2709 */
2710static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12)
2711{
2712        if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
2713            vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT)
2714                return -EINVAL;
2715
2716        return 0;
2717}
2718
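/*
 * Checks on the vmcs12 Guest State Area that KVM performs in software; on
 * failure, *exit_qual holds the exit qualification to report to L1.
 */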
2719static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
2720                                        struct vmcs12 *vmcs12,
2721                                        u32 *exit_qual)
2722{
2723        bool ia32e;
2724
2725        *exit_qual = ENTRY_FAIL_DEFAULT;
2726
2727        if (!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0) ||
2728            !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))
2729                return -EINVAL;
2730
2731        if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) &&
2732            !kvm_pat_valid(vmcs12->guest_ia32_pat))
2733                return -EINVAL;
2734
2735        if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) {
2736                *exit_qual = ENTRY_FAIL_VMCS_LINK_PTR;
2737                return -EINVAL;
2738        }
2739
2740        /*
2741         * If the load IA32_EFER VM-entry control is 1, the following checks
2742         * are performed on the field for the IA32_EFER MSR:
2743         * - Bits reserved in the IA32_EFER MSR must be 0.
2744         * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of
2745         *   the IA-32e mode guest VM-exit control. It must also be identical
2746         *   to bit 8 (LME) if bit 31 in the CR0 field (corresponding to
2747         *   CR0.PG) is 1.
2748         */
2749        if (to_vmx(vcpu)->nested.nested_run_pending &&
2750            (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) {
2751                ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0;
2752                if (!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer) ||
2753                    ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) ||
2754                    ((vmcs12->guest_cr0 & X86_CR0_PG) &&
2755                     ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME)))
2756                        return -EINVAL;
2757        }
2758
2759        if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) &&
2760            (is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu) ||
2761             (vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))
2762                return -EINVAL;
2763
2764        if (nested_check_guest_non_reg_state(vmcs12))
2765                return -EINVAL;
2766
2767        return 0;
2768}
2769
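/*
 * If nested_early_check is enabled, let hardware validate the vmcs02
 * control and host-state fields before the real VM-Enter by attempting an
 * entry that is guaranteed to fail its guest-state checks (RFLAGS bit 1
 * cleared below).
 */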
2770static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
2771{
2772        struct vcpu_vmx *vmx = to_vmx(vcpu);
2773        unsigned long cr3, cr4;
2774        bool vm_fail;
2775
2776        if (!nested_early_check)
2777                return 0;
2778
2779        if (vmx->msr_autoload.host.nr)
2780                vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
2781        if (vmx->msr_autoload.guest.nr)
2782                vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
2783
2784        preempt_disable();
2785
2786        vmx_prepare_switch_to_guest(vcpu);
2787
2788        /*
2789         * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS,
2790         * which is reserved to '1' by hardware.  GUEST_RFLAGS is guaranteed to
2791         * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e.
2792         * there is no need to preserve other bits or save/restore the field.
2793         */
2794        vmcs_writel(GUEST_RFLAGS, 0);
2795
2796        cr3 = __get_current_cr3_fast();
2797        if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
2798                vmcs_writel(HOST_CR3, cr3);
2799                vmx->loaded_vmcs->host_state.cr3 = cr3;
2800        }
2801
2802        cr4 = cr4_read_shadow();
2803        if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
2804                vmcs_writel(HOST_CR4, cr4);
2805                vmx->loaded_vmcs->host_state.cr4 = cr4;
2806        }
2807
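        /*
         * The asm below: update HOST_RSP in the VMCS if the current RSP
         * has changed, then VMLAUNCH or VMRESUME via vmx_vmenter().
         * RFLAGS.CF or RFLAGS.ZF set after the call ("below or equal")
         * indicates a VM-Fail.
         */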
2808        asm(
2809                "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */
2810                "cmp %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t"
2811                "je 1f \n\t"
2812                __ex("vmwrite %%" _ASM_SP ", %[HOST_RSP]") "\n\t"
2813                "mov %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t"
2814                "1: \n\t"
2815                "add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */
2816
2817                /* Check if vmlaunch or vmresume is needed */
2818                "cmpb $0, %c[launched](%[loaded_vmcs])\n\t"
2819
2820                /*
2821                 * VMLAUNCH and VMRESUME clear RFLAGS.{CF,ZF} on VM-Exit, set
2822                 * RFLAGS.CF on VM-Fail Invalid and set RFLAGS.ZF on VM-Fail
2823                 * Valid.  vmx_vmenter() directly "returns" RFLAGS, and so the
2824                 * result of VM-Enter is captured via CC_{SET,OUT} to vm_fail.
2825                 */
2826                "call vmx_vmenter\n\t"
2827
2828                CC_SET(be)
2829              : ASM_CALL_CONSTRAINT, CC_OUT(be) (vm_fail)
2830              : [HOST_RSP]"r"((unsigned long)HOST_RSP),
2831                [loaded_vmcs]"r"(vmx->loaded_vmcs),
2832                [launched]"i"(offsetof(struct loaded_vmcs, launched)),
2833                [host_state_rsp]"i"(offsetof(struct loaded_vmcs, host_state.rsp)),
2834                [wordsize]"i"(sizeof(ulong))
2835              : "memory"
2836        );
2837
2838        if (vmx->msr_autoload.host.nr)
2839                vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
2840        if (vmx->msr_autoload.guest.nr)
2841                vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
2842
2843        if (vm_fail) {
2844                preempt_enable();
2845                WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
2846                             VMXERR_ENTRY_INVALID_CONTROL_FIELD);
2847                return 1;
2848        }
2849
2850        /*
2851         * VMExit clears RFLAGS.IF and DR7, even on a consistency check.
2852         */
2853        local_irq_enable();
2854        if (hw_breakpoint_active())
2855                set_debugreg(__this_cpu_read(cpu_dr7), 7);
2856        preempt_enable();
2857
2858        /*
2859         * A non-failing VMEntry means we somehow entered guest mode with
2860         * an illegal RIP, and that's just the tip of the iceberg.  There
2861         * is no telling what memory has been modified or what state has
2862         * been exposed to unknown code.  Hitting this all but guarantees
2863         * a (very critical) hardware issue.
2864         */
2865        WARN_ON(!(vmcs_read32(VM_EXIT_REASON) &
2866                VMX_EXIT_REASONS_FAILED_VMENTRY));
2867
2868        return 0;
2869}
2870
2871static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
2872                                                 struct vmcs12 *vmcs12);
2873
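/*
 * Map the guest pages referenced by vmcs12 (APIC-access page, virtual-APIC
 * page, posted-interrupt descriptor, MSR bitmaps) and point the matching
 * vmcs02 physical-address fields at the host-side mappings.
 */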
2874static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
2875{
2876        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
2877        struct vcpu_vmx *vmx = to_vmx(vcpu);
2878        struct kvm_host_map *map;
2879        struct page *page;
2880        u64 hpa;
2881
2882        if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
2883                /*
2884                 * Translate L1 physical address to host physical
2885                 * address for vmcs02. Keep the page pinned, so this
2886                 * physical address remains valid. We keep a reference
2887                 * to it so we can release it later.
2888                 */
2889                if (vmx->nested.apic_access_page) { /* shouldn't happen */
2890                        kvm_release_page_dirty(vmx->nested.apic_access_page);
2891                        vmx->nested.apic_access_page = NULL;
2892                }
2893                page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr);
2894                /*
2895                 * If translation failed, no matter: This feature asks
2896                 * to exit when accessing the given address, and if it
2897                 * can never be accessed, this feature won't do
2898                 * anything anyway.
2899                 */
2900                if (!is_error_page(page)) {
2901                        vmx->nested.apic_access_page = page;
2902                        hpa = page_to_phys(vmx->nested.apic_access_page);
2903                        vmcs_write64(APIC_ACCESS_ADDR, hpa);
2904                } else {
2905                        secondary_exec_controls_clearbit(vmx,
2906                                SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
2907                }
2908        }
2909
2910        if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
2911                map = &vmx->nested.virtual_apic_map;
2912
2913                if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) {
2914                        vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn));
2915                } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) &&
2916                           nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) &&
2917                           !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
2918                        /*
2919                         * The processor will never use the TPR shadow, simply
2920                         * clear the bit from the execution control.  Such a
2921                         * configuration is useless, but it happens in tests.
2922                         * For any other configuration, failing the vm entry is
2923                         * _not_ what the processor does but it's basically the
2924                         * only possibility we have.
2925                         */
2926                        exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW);
2927                } else {
2928                        /*
2929                         * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to
2930                         * force VM-Entry to fail.
2931                         */
2932                        vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull);
2933                }
2934        }
2935
2936        if (nested_cpu_has_posted_intr(vmcs12)) {
2937                map = &vmx->nested.pi_desc_map;
2938
2939                if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) {
2940                        vmx->nested.pi_desc =
2941                                (struct pi_desc *)(((void *)map->hva) +
2942                                offset_in_page(vmcs12->posted_intr_desc_addr));
2943                        vmcs_write64(POSTED_INTR_DESC_ADDR,
2944                                     pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr));
2945                }
2946        }
2947        if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12))
2948                exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
2949        else
2950                exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
2951}
2952
2953/*
2954 * Intel's VMX Instruction Reference specifies a common set of prerequisites
2955 * for running VMX instructions (except VMXON, whose prerequisites are
2956 * slightly different). It also specifies the exception to inject on failure.
2957 * Note that many of these exceptions have priority over VM exits, so they
2958 * don't have to be checked again here.
2959 */
2960static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
2961{
2962        if (!to_vmx(vcpu)->nested.vmxon) {
2963                kvm_queue_exception(vcpu, UD_VECTOR);
2964                return 0;
2965        }
2966
2967        if (vmx_get_cpl(vcpu)) {
2968                kvm_inject_gp(vcpu, 0);
2969                return 0;
2970        }
2971
2972        return 1;
2973}
2974
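/*
 * Returns true if the highest-priority pending virtual interrupt (RVI)
 * outranks the virtual PPR, i.e. APICv would deliver an interrupt to the
 * vCPU if it were running.
 */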
2975static bool vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu)
2976{
2977        u8 rvi = vmx_get_rvi();
2978        u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI);
2979
2980        return ((rvi & 0xf0) > (vppr & 0xf0));
2981}
2982
2983static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
2984                                   struct vmcs12 *vmcs12);
2985
2986/*
2987 * If from_vmentry is false, this is being called from state restore (either RSM
2988 * or KVM_SET_NESTED_STATE).  Otherwise it's called from vmlaunch/vmresume.
2989 *
2990 * Returns:
2991 *   0 - success, i.e. proceed with actual VMEnter
2992 *   1 - consistency check VMExit
2993 *  -1 - consistency check VMFail
2994 */
2995int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
2996{
2997        struct vcpu_vmx *vmx = to_vmx(vcpu);
2998        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
2999        bool evaluate_pending_interrupts;
3000        u32 exit_reason = EXIT_REASON_INVALID_STATE;
3001        u32 exit_qual;
3002
3003        evaluate_pending_interrupts = exec_controls_get(vmx) &
3004                (CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING);
3005        if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
3006                evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu);
3007
3008        if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
3009                vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
3010        if (kvm_mpx_supported() &&
3011                !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
3012                vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
3013
3014        /*
3015         * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and*
3016         * nested early checks are disabled.  In the event of a "late" VM-Fail,
3017         * i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its
3018         * software model to the pre-VMEntry host state.  When EPT is disabled,
3019         * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes
3020         * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3.  Stuffing
3021         * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to
3022         * the correct value.  Smashing vmcs01.GUEST_CR3 is safe because nested
3023         * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is
3024         * guaranteed to be overwritten with a shadow CR3 prior to re-entering
3025         * L1.  Don't stuff vmcs01.GUEST_CR3 when using nested early checks as
3026         * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks
3027         * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail
3028         * path would need to manually save/restore vmcs01.GUEST_CR3.
3029         */
3030        if (!enable_ept && !nested_early_check)
3031                vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
3032
3033        vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
3034
3035        prepare_vmcs02_early(vmx, vmcs12);
3036
3037        if (from_vmentry) {
3038                nested_get_vmcs12_pages(vcpu);
3039
3040                if (nested_vmx_check_vmentry_hw(vcpu)) {
3041                        vmx_switch_vmcs(vcpu, &vmx->vmcs01);
3042                        return -1;
3043                }
3044
3045                if (nested_vmx_check_guest_state(vcpu, vmcs12, &exit_qual))
3046                        goto vmentry_fail_vmexit;
3047        }
3048
3049        enter_guest_mode(vcpu);
3050        if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
3051                vcpu->arch.tsc_offset += vmcs12->tsc_offset;
3052
3053        if (prepare_vmcs02(vcpu, vmcs12, &exit_qual))
3054                goto vmentry_fail_vmexit_guest_mode;
3055
3056        if (from_vmentry) {
3057                exit_reason = EXIT_REASON_MSR_LOAD_FAIL;
3058                exit_qual = nested_vmx_load_msr(vcpu,
3059                                                vmcs12->vm_entry_msr_load_addr,
3060                                                vmcs12->vm_entry_msr_load_count);
3061                if (exit_qual)
3062                        goto vmentry_fail_vmexit_guest_mode;
3063        } else {
3064                /*
3065                 * The MMU is not initialized to point at the right entities yet and
3066                 * "get pages" would need to read data from the guest (i.e. we will
3067                 * need to perform gpa to hpa translation). Request a call
3068                 * to nested_get_vmcs12_pages before the next VM-entry.  The MSRs
3069                 * have already been set at vmentry time and should not be reset.
3070                 */
3071                kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu);
3072        }
3073
3074        /*
3075         * If L1 had a pending IRQ/NMI when it executed
3076         * VMLAUNCH/VMRESUME that wasn't delivered because it was
3077         * disallowed (e.g. interrupts disabled), L0 needs to
3078         * evaluate whether this pending event should cause an exit
3079         * from L2 to L1 or be delivered directly to L2 (e.g. in case
3080         * L1 doesn't intercept EXTERNAL_INTERRUPT).
3081         *
3082         * Usually this would be handled by the processor noticing an
3083         * IRQ/NMI window request, or checking RVI during evaluation of
3084         * pending virtual interrupts.  However, this setting was done
3085         * on VMCS01 and now VMCS02 is active instead. Thus, we force L0
3086         * to perform pending event evaluation by requesting a KVM_REQ_EVENT.
3087         */
3088        if (unlikely(evaluate_pending_interrupts))
3089                kvm_make_request(KVM_REQ_EVENT, vcpu);
3090
3091        /*
3092         * Do not start the preemption timer hrtimer until after we know
3093         * we are successful, so that only nested_vmx_vmexit needs to cancel
3094         * the timer.
3095         */
3096        vmx->nested.preemption_timer_expired = false;
3097        if (nested_cpu_has_preemption_timer(vmcs12))
3098                vmx_start_preemption_timer(vcpu);
3099
3100        /*
3101         * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
3102         * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
3103         * returned as far as L1 is concerned. It will only return (and set
3104         * the success flag) when L2 exits (see nested_vmx_vmexit()).
3105         */
3106        return 0;
3107
3108        /*
3109         * A failed consistency check that leads to a VMExit during L1's
3110         * VMEnter to L2 is a variation of a normal VMexit, as explained in
3111         * 26.7 "VM-entry failures during or after loading guest state".
3112         */
3113vmentry_fail_vmexit_guest_mode:
3114        if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
3115                vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
3116        leave_guest_mode(vcpu);
3117
3118vmentry_fail_vmexit:
3119        vmx_switch_vmcs(vcpu, &vmx->vmcs01);
3120
3121        if (!from_vmentry)
3122                return 1;
3123
3124        load_vmcs12_host_state(vcpu, vmcs12);
3125        vmcs12->vm_exit_reason = exit_reason | VMX_EXIT_REASONS_FAILED_VMENTRY;
3126        vmcs12->exit_qualification = exit_qual;
3127        if (enable_shadow_vmcs || vmx->nested.hv_evmcs)
3128                vmx->nested.need_vmcs12_to_shadow_sync = true;
3129        return 1;
3130}
3131
3132/*
3133 * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
3134 * for running an L2 nested guest.
3135 */
3136static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
3137{
3138        struct vmcs12 *vmcs12;
3139        struct vcpu_vmx *vmx = to_vmx(vcpu);
3140        u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu);
3141        int ret;
3142
3143        if (!nested_vmx_check_permission(vcpu))
3144                return 1;
3145
3146        if (!nested_vmx_handle_enlightened_vmptrld(vcpu, launch))
3147                return 1;
3148
3149        if (!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull)
3150                return nested_vmx_failInvalid(vcpu);
3151
3152        vmcs12 = get_vmcs12(vcpu);
3153
3154        /*
3155         * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact
3156         * that there *is* a valid VMCS pointer, RFLAGS.CF is set
3157         * rather than RFLAGS.ZF, and no error number is stored to the
3158         * VM-instruction error field.
3159         */
3160        if (vmcs12->hdr.shadow_vmcs)
3161                return nested_vmx_failInvalid(vcpu);
3162
3163        if (vmx->nested.hv_evmcs) {
3164                copy_enlightened_to_vmcs12(vmx);
3165                /* Enlightened VMCS doesn't have launch state */
3166                vmcs12->launch_state = !launch;
3167        } else if (enable_shadow_vmcs) {
3168                copy_shadow_to_vmcs12(vmx);
3169        }
3170
3171        /*
3172         * The nested entry process starts with enforcing various prerequisites
3173         * on vmcs12 as required by the Intel SDM, acting appropriately when
3174         * they fail: As the SDM explains, some conditions should cause the
3175         * instruction to fail, while others will cause the instruction to seem
3176         * to succeed, but return an EXIT_REASON_INVALID_STATE.
3177         * To speed up the normal (success) code path, we should avoid checking
3178         * for misconfigurations that will be caught by the processor anyway
3179         * when using the merged vmcs02.
3180         */
3181        if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS)
3182                return nested_vmx_failValid(vcpu,
3183                        VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);
3184
3185        if (vmcs12->launch_state == launch)
3186                return nested_vmx_failValid(vcpu,
3187                        launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
3188                               : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
3189
3190        if (nested_vmx_check_controls(vcpu, vmcs12))
3191                return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
3192
3193        if (nested_vmx_check_host_state(vcpu, vmcs12))
3194                return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
3195
3196        /*
3197         * We're finally done with prerequisite checking, and can start with
3198         * the nested entry.
3199         */
3200        vmx->nested.nested_run_pending = 1;
3201        ret = nested_vmx_enter_non_root_mode(vcpu, true);
3202        vmx->nested.nested_run_pending = !ret;
3203        if (ret > 0)
3204                return 1;
3205        else if (ret)
3206                return nested_vmx_failValid(vcpu,
3207                        VMXERR_ENTRY_INVALID_CONTROL_FIELD);
3208
3209        /* Hide L1D cache contents from the nested guest.  */
3210        vmx->vcpu.arch.l1tf_flush_l1d = true;
3211
3212        /*
3213         * Must happen outside of nested_vmx_enter_non_root_mode() as it will
3214         * also be used as part of restoring nVMX state for
3215         * snapshot restore (migration).
3216         *
3217         * In this flow, it is assumed that the vmcs12 cache was
3218         * transferred as part of the captured nVMX state and should
3219         * therefore not be read from guest memory (which may not
3220         * exist on the destination host yet).
3221         */
3222        nested_cache_shadow_vmcs12(vcpu, vmcs12);
3223
3224        /*
3225         * If we're entering a halted L2 vcpu and the L2 vcpu won't be
3226         * awakened by event injection or by an NMI-window VM-exit or
3227         * by an interrupt-window VM-exit, halt the vcpu.
3228         */
3229        if ((vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) &&
3230            !(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) &&
3231            !(vmcs12->cpu_based_vm_exec_control & CPU_BASED_VIRTUAL_NMI_PENDING) &&
3232            !((vmcs12->cpu_based_vm_exec_control & CPU_BASED_VIRTUAL_INTR_PENDING) &&
3233              (vmcs12->guest_rflags & X86_EFLAGS_IF))) {
3234                vmx->nested.nested_run_pending = 0;
3235                return kvm_vcpu_halt(vcpu);
3236        }
3237        return 1;
3238}
3239
3240/*
3241 * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date
3242 * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK).
3243 * This function returns the new value we should put in vmcs12.guest_cr0.
3244 * It's not enough to just return the vmcs02 GUEST_CR0. Rather,
3245 *  1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now
3246 *     available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0
3247 *     didn't trap the bit, because if L1 did, so would L0).
3248 *  2. Bits that L1 asked to trap (and therefore L0 also did) could not have
3249 *     been modified by L2, and L1 knows it. So just leave the old value of
3250 *     the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0
3251 *     isn't relevant, because if L0 traps this bit it can set it to anything.
3252 *  3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have
3253 *     changed these bits, and therefore they need to be updated, but L0
3254 *     didn't necessarily allow them to be changed in GUEST_CR0 - and rather
3255 *     put them in vmcs02 CR0_READ_SHADOW. So take these bits from there.
3256 */
3257static inline unsigned long
3258vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
3259{
3260        return
3261        /*1*/   (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) |
3262        /*2*/   (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) |
3263        /*3*/   (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask |
3264                        vcpu->arch.cr0_guest_owned_bits));
3265}
3266
3267static inline unsigned long
3268vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
3269{
3270        return
3271        /*1*/   (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) |
3272        /*2*/   (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) |
3273        /*3*/   (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask |
3274                        vcpu->arch.cr4_guest_owned_bits));
3275}
3276
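/*
 * Record an event that was being injected into L2 (exception, NMI, or
 * external/soft interrupt) in vmcs12's IDT-vectoring info fields so that
 * L1 can re-inject it after handling the VM-exit.
 */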
3277static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
3278                                      struct vmcs12 *vmcs12)
3279{
3280        u32 idt_vectoring;
3281        unsigned int nr;
3282
3283        if (vcpu->arch.exception.injected) {
3284                nr = vcpu->arch.exception.nr;
3285                idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
3286
3287                if (kvm_exception_is_soft(nr)) {
3288                        vmcs12->vm_exit_instruction_len =
3289                                vcpu->arch.event_exit_inst_len;
3290                        idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION;
3291                } else
3292                        idt_vectoring |= INTR_TYPE_HARD_EXCEPTION;
3293
3294                if (vcpu->arch.exception.has_error_code) {
3295                        idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK;
3296                        vmcs12->idt_vectoring_error_code =
3297                                vcpu->arch.exception.error_code;
3298                }
3299
3300                vmcs12->idt_vectoring_info_field = idt_vectoring;
3301        } else if (vcpu->arch.nmi_injected) {
3302                vmcs12->idt_vectoring_info_field =
3303                        INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR;
3304        } else if (vcpu->arch.interrupt.injected) {
3305                nr = vcpu->arch.interrupt.nr;
3306                idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
3307
3308                if (vcpu->arch.interrupt.soft) {
3309                        idt_vectoring |= INTR_TYPE_SOFT_INTR;
3310                        vmcs12->vm_entry_instruction_len =
3311                                vcpu->arch.event_exit_inst_len;
3312                } else
3313                        idt_vectoring |= INTR_TYPE_EXT_INTR;
3314
3315                vmcs12->idt_vectoring_info_field = idt_vectoring;
3316        }
3317}
3318
3319
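/*
 * The CPU may write the virtual-APIC page and the posted-interrupt
 * descriptor while L2 runs; mark them dirty so that dirty tracking
 * (e.g. for migration) notices the updates.
 */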
3320static void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
3321{
3322        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3323        gfn_t gfn;
3324
3325        /*
3326         * Don't need to mark the APIC access page dirty; it is never
3327         * written to by the CPU during APIC virtualization.
3328         */
3329
3330        if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
3331                gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT;
3332                kvm_vcpu_mark_page_dirty(vcpu, gfn);
3333        }
3334
3335        if (nested_cpu_has_posted_intr(vmcs12)) {
3336                gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT;
3337                kvm_vcpu_mark_page_dirty(vcpu, gfn);
3338        }
3339}
3340
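/*
 * Emulate delivery of a pending nested posted interrupt: if the PI
 * descriptor's ON bit is set, merge its PIR into L2's virtual-APIC page
 * and update RVI in GUEST_INTR_STATUS so the interrupt becomes visible
 * to L2.
 */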
3341static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
3342{
3343        struct vcpu_vmx *vmx = to_vmx(vcpu);
3344        int max_irr;
3345        void *vapic_page;
3346        u16 status;
3347
3348        if (!vmx->nested.pi_desc || !vmx->nested.pi_pending)
3349                return;
3350
3351        vmx->nested.pi_pending = false;
3352        if (!pi_test_and_clear_on(vmx->nested.pi_desc))
3353                return;
3354
3355        max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
3356        if (max_irr != 256) {
3357                vapic_page = vmx->nested.virtual_apic_map.hva;
3358                if (!vapic_page)
3359                        return;
3360
3361                __kvm_apic_update_irr(vmx->nested.pi_desc->pir,
3362                        vapic_page, &max_irr);
3363                status = vmcs_read16(GUEST_INTR_STATUS);
3364                if ((u8)max_irr > ((u8)status & 0xff)) {
3365                        status &= ~0xff;
3366                        status |= (u8)max_irr;
3367                        vmcs_write16(GUEST_INTR_STATUS, status);
3368                }
3369        }
3370
3371        nested_mark_vmcs12_pages_dirty(vcpu);
3372}
3373
3374static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu,
3375                                               unsigned long exit_qual)
3376{
3377        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3378        unsigned int nr = vcpu->arch.exception.nr;
3379        u32 intr_info = nr | INTR_INFO_VALID_MASK;
3380
3381        if (vcpu->arch.exception.has_error_code) {
3382                vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code;
3383                intr_info |= INTR_INFO_DELIVER_CODE_MASK;
3384        }
3385
3386        if (kvm_exception_is_soft(nr))
3387                intr_info |= INTR_TYPE_SOFT_EXCEPTION;
3388        else
3389                intr_info |= INTR_TYPE_HARD_EXCEPTION;
3390
3391        if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) &&
3392            vmx_get_nmi_mask(vcpu))
3393                intr_info |= INTR_INFO_UNBLOCK_NMI;
3394
3395        nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual);
3396}
3397
3398static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
3399{
3400        struct vcpu_vmx *vmx = to_vmx(vcpu);
3401        unsigned long exit_qual;
3402        bool block_nested_events =
3403            vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu);
3404
3405        if (vcpu->arch.exception.pending &&
3406                nested_vmx_check_exception(vcpu, &exit_qual)) {
3407                if (block_nested_events)
3408                        return -EBUSY;
3409                nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
3410                return 0;
3411        }
3412
3413        if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) &&
3414            vmx->nested.preemption_timer_expired) {
3415                if (block_nested_events)
3416                        return -EBUSY;
3417                nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0);
3418                return 0;
3419        }
3420
3421        if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) {
3422                if (block_nested_events)
3423                        return -EBUSY;
3424                nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
3425                                  NMI_VECTOR | INTR_TYPE_NMI_INTR |
3426                                  INTR_INFO_VALID_MASK, 0);
3427                /*
3428                 * The NMI-triggered VM exit counts as injection:
3429                 * clear this one and block further NMIs.
3430                 */
3431                vcpu->arch.nmi_pending = 0;
3432                vmx_set_nmi_mask(vcpu, true);
3433                return 0;
3434        }
3435
3436        if ((kvm_cpu_has_interrupt(vcpu) || external_intr) &&
3437            nested_exit_on_intr(vcpu)) {
3438                if (block_nested_events)
3439                        return -EBUSY;
3440                nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
3441                return 0;
3442        }
3443
3444        vmx_complete_nested_posted_interrupt(vcpu);
3445        return 0;
3446}
3447
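/*
 * Convert the time left on the emulated preemption timer back into guest
 * preemption-timer ticks: ns * virtual_tsc_khz / 10^6 yields TSC cycles,
 * which are then shifted right by VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE.
 */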
3448static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
3449{
3450        ktime_t remaining =
3451                hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer);
3452        u64 value;
3453
3454        if (ktime_to_ns(remaining) <= 0)
3455                return 0;
3456
3457        value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz;
3458        do_div(value, 1000000);
3459        return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
3460}
3461
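/*
 * Fields listed here are the ones copied lazily from vmcs02 by
 * sync_vmcs02_to_vmcs12_rare() rather than on every nested VM-exit.
 */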
3462static bool is_vmcs12_ext_field(unsigned long field)
3463{
3464        switch (field) {
3465        case GUEST_ES_SELECTOR:
3466        case GUEST_CS_SELECTOR:
3467        case GUEST_SS_SELECTOR:
3468        case GUEST_DS_SELECTOR:
3469        case GUEST_FS_SELECTOR:
3470        case GUEST_GS_SELECTOR:
3471        case GUEST_LDTR_SELECTOR:
3472        case GUEST_TR_SELECTOR:
3473        case GUEST_ES_LIMIT:
3474        case GUEST_CS_LIMIT:
3475        case GUEST_SS_LIMIT:
3476        case GUEST_DS_LIMIT:
3477        case GUEST_FS_LIMIT:
3478        case GUEST_GS_LIMIT:
3479        case GUEST_LDTR_LIMIT:
3480        case GUEST_TR_LIMIT:
3481        case GUEST_GDTR_LIMIT:
3482        case GUEST_IDTR_LIMIT:
3483        case GUEST_ES_AR_BYTES:
3484        case GUEST_DS_AR_BYTES:
3485        case GUEST_FS_AR_BYTES:
3486        case GUEST_GS_AR_BYTES:
3487        case GUEST_LDTR_AR_BYTES:
3488        case GUEST_TR_AR_BYTES:
3489        case GUEST_ES_BASE:
3490        case GUEST_CS_BASE:
3491        case GUEST_SS_BASE:
3492        case GUEST_DS_BASE:
3493        case GUEST_FS_BASE:
3494        case GUEST_GS_BASE:
3495        case GUEST_LDTR_BASE:
3496        case GUEST_TR_BASE:
3497        case GUEST_GDTR_BASE:
3498        case GUEST_IDTR_BASE:
3499        case GUEST_PENDING_DBG_EXCEPTIONS:
3500        case GUEST_BNDCFGS:
3501                return true;
3502        default:
3503                break;
3504        }
3505
3506        return false;
3507}
3508
3509static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
3510                                       struct vmcs12 *vmcs12)
3511{
3512        struct vcpu_vmx *vmx = to_vmx(vcpu);
3513
3514        vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
3515        vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
3516        vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
3517        vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
3518        vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
3519        vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
3520        vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
3521        vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
3522        vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
3523        vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
3524        vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
3525        vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
3526        vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
3527        vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
3528        vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
3529        vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
3530        vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
3531        vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
3532        vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
3533        vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
3534        vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
3535        vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
3536        vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
3537        vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
3538        vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE);
3539        vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
3540        vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
3541        vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
3542        vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
3543        vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
3544        vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
3545        vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
3546        vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
3547        vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
3548        vmcs12->guest_pending_dbg_exceptions =
3549                vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
3550        if (kvm_mpx_supported())
3551                vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
3552
3553        vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false;
3554}
3555
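/*
 * vmcs01 is the currently loaded VMCS here, so temporarily switch to
 * vmcs02, read the deferred "rare" guest fields into vmcs12, then switch
 * back to vmcs01.
 */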
3556static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
3557                                       struct vmcs12 *vmcs12)
3558{
3559        struct vcpu_vmx *vmx = to_vmx(vcpu);
3560        int cpu;
3561
3562        if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare)
3563                return;
3564
3565
3566        WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01);
3567
3568        cpu = get_cpu();
3569        vmx->loaded_vmcs = &vmx->nested.vmcs02;
3570        vmx_vcpu_load(&vmx->vcpu, cpu);
3571
3572        sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
3573
3574        vmx->loaded_vmcs = &vmx->vmcs01;
3575        vmx_vcpu_load(&vmx->vcpu, cpu);
3576        put_cpu();
3577}
3578
3579/*
3580 * Update the guest state fields of vmcs12 to reflect changes that
3581 * occurred while L2 was running. (The "IA-32e mode guest" bit of the
3582 * VM-entry controls is also updated, since this is really a guest
3583 * state bit.)
3584 */
3585static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
3586{
3587        struct vcpu_vmx *vmx = to_vmx(vcpu);
3588
3589        if (vmx->nested.hv_evmcs)
3590                sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
3591
3592        vmx->nested.need_sync_vmcs02_to_vmcs12_rare = !vmx->nested.hv_evmcs;
3593
3594        vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
3595        vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
3596
3597        vmcs12->guest_rsp = kvm_rsp_read(vcpu);
3598        vmcs12->guest_rip = kvm_rip_read(vcpu);
3599        vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
3600
3601        vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
3602        vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
3603
3604        vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
3605        vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
3606        vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
3607
3608        vmcs12->guest_interruptibility_info =
3609                vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
3610
3611        if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
3612                vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT;
3613        else
3614                vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE;
3615
3616        if (nested_cpu_has_preemption_timer(vmcs12) &&
3617            vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
3618                vmcs12->vmx_preemption_timer_value =
3619                        vmx_get_preemption_timer_value(vcpu);
3620
3621        /*
3622         * In some cases (usually, nested EPT), L2 is allowed to change its
3623         * own CR3 without exiting. If it has changed it, we must keep it.
3624         * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined
3625         * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12.
3626         *
3627         * Additionally, restore L2's PDPTR to vmcs12.
3628         */
3629        if (enable_ept) {
3630                vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3);
3631                if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) {
3632                        vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
3633                        vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
3634                        vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
3635                        vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
3636                }
3637        }
3638
3639        vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
3640
3641        if (nested_cpu_has_vid(vmcs12))
3642                vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS);
3643
3644        vmcs12->vm_entry_controls =
3645                (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
3646                (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
3647
3648        if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS)
3649                kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);
3650
3651        if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER)
3652                vmcs12->guest_ia32_efer = vcpu->arch.efer;
3653}
3654
3655/*
3656 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
3657 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
3658 * and this function updates it to reflect the changes to the guest state while
3659 * L2 was running (and perhaps made some exits which were handled directly by L0
3660 * without going back to L1), and to reflect the exit reason.
3661 * Note that we do not have to copy here all VMCS fields, just those that
3662 * could have changed by the L2 guest or the exit - i.e., the guest-state and
3663 * exit-information fields only. Other fields are modified by L1 with VMWRITE,
3664 * which already writes to vmcs12 directly.
3665 */
3666static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
3667                           u32 exit_reason, u32 exit_intr_info,
3668                           unsigned long exit_qualification)
3669{
3670        /* update exit information fields: */
3671        vmcs12->vm_exit_reason = exit_reason;
3672        vmcs12->exit_qualification = exit_qualification;
3673        vmcs12->vm_exit_intr_info = exit_intr_info;
3674
3675        vmcs12->idt_vectoring_info_field = 0;
3676        vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
3677        vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
3678
3679        if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
3680                vmcs12->launch_state = 1;
3681
3682                /* vm_entry_intr_info_field is cleared on exit. Emulate this
3683                 * instead of reading the real value. */
3684                vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;
3685
3686                /*
3687                 * Transfer the event that L0 or L1 may have wanted to inject into
3688                 * L2 to IDT_VECTORING_INFO_FIELD.
3689                 */
3690                vmcs12_save_pending_event(vcpu, vmcs12);
3691
3692                /*
3693                 * According to spec, there's no need to store the guest's
3694                 * MSRs if the exit is due to a VM-entry failure that occurs
3695                 * during or after loading the guest state. Since this exit
3696                 * does not fall in that category, we need to save the MSRs.
3697                 */
3698                if (nested_vmx_store_msr(vcpu,
3699                                         vmcs12->vm_exit_msr_store_addr,
3700                                         vmcs12->vm_exit_msr_store_count))
3701                        nested_vmx_abort(vcpu,
3702                                         VMX_ABORT_SAVE_GUEST_MSR_FAIL);
3703        }
3704
3705        /*
3706         * Drop what we picked up for L2 via vmx_complete_interrupts. It is
3707         * preserved above and would only end up incorrectly in L1.
3708         */
3709        vcpu->arch.nmi_injected = false;
3710        kvm_clear_exception_queue(vcpu);
3711        kvm_clear_interrupt_queue(vcpu);
3712}
3713
3714/*
3715 * A part of what we need to do when the nested L2 guest exits and we want to
3716 * run its L1 parent, is to reset L1's guest state to the host state specified
3717 * in vmcs12.
3718 * This function is to be called not only on normal nested exit, but also on
3719 * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry
3720 * Failures During or After Loading Guest State").
3721 * This function should be called when the active VMCS is L1's (vmcs01).
3722 */
3723static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
3724                                   struct vmcs12 *vmcs12)
3725{
3726        struct kvm_segment seg;
3727        u32 entry_failure_code;
3728
3729        if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
3730                vcpu->arch.efer = vmcs12->host_ia32_efer;
3731        else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
3732                vcpu->arch.efer |= (EFER_LMA | EFER_LME);
3733        else
3734                vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
3735        vmx_set_efer(vcpu, vcpu->arch.efer);
3736
3737        kvm_rsp_write(vcpu, vmcs12->host_rsp);
3738        kvm_rip_write(vcpu, vmcs12->host_rip);
3739        vmx_set_rflags(vcpu, X86_EFLAGS_FIXED);
3740        vmx_set_interrupt_shadow(vcpu, 0);
3741
3742        /*
3743         * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
3744         * actually changed, because vmx_set_cr0 refers to efer set above.
3745         *
3746         * CR0_GUEST_HOST_MASK is already set in the original vmcs01
3747         * (KVM doesn't change it);
3748         */
3749        vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
3750        vmx_set_cr0(vcpu, vmcs12->host_cr0);
3751
3752        /* Same as above - no reason to call set_cr4_guest_host_mask().  */
3753        vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
3754        vmx_set_cr4(vcpu, vmcs12->host_cr4);
3755
3756        nested_ept_uninit_mmu_context(vcpu);
3757
3758        /*
3759         * Only PDPTE load can fail as the value of cr3 was checked on entry and
3760         * couldn't have changed.
3761         */
3762        if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code))
3763                nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
3764
3765        if (!enable_ept)
3766                vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
3767
3768        /*
3769         * If vmcs01 doesn't use VPID, CPU flushes TLB on every
3770         * VMEntry/VMExit. Thus, no need to flush TLB.
3771         *
3772         * If vmcs12 doesn't use VPID, L1 expects TLB to be
3773         * flushed on every VMEntry/VMExit.
3774         *
3775         * Otherwise, we can preserve TLB entries as long as we are
3776         * able to tag L1 TLB entries differently than L2 TLB entries.
3777         *
3778         * If vmcs12 uses EPT, we need to execute this flush on EPTP01
3779         * and therefore we request the TLB flush to happen only after VMCS EPTP
3780         * has been set by KVM_REQ_LOAD_CR3.
3781         */
3782        if (enable_vpid &&
3783            (!nested_cpu_has_vpid(vmcs12) || !nested_has_guest_tlb_tag(vcpu))) {
3784                kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
3785        }
3786
3787        vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
3788        vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
3789        vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
3790        vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
3791        vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
3792        vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF);
3793        vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF);
3794
3795        /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1.  */
3796        if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS)
3797                vmcs_write64(GUEST_BNDCFGS, 0);
3798
3799        if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) {
3800                vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
3801                vcpu->arch.pat = vmcs12->host_ia32_pat;
3802        }
3803        if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
3804                vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL,
3805                        vmcs12->host_ia32_perf_global_ctrl);
3806
3807        /* Set L1 segment info according to Intel SDM 27.5.2 "Loading
3808         * Host Segment and Descriptor-Table Registers". */
3809        seg = (struct kvm_segment) {
3810                .base = 0,
3811                .limit = 0xFFFFFFFF,
3812                .selector = vmcs12->host_cs_selector,
3813                .type = 11,
3814                .present = 1,
3815                .s = 1,
3816                .g = 1
3817        };
3818        if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
3819                seg.l = 1;
3820        else
3821                seg.db = 1;
3822        vmx_set_segment(vcpu, &seg, VCPU_SREG_CS);
3823        seg = (struct kvm_segment) {
3824                .base = 0,
3825                .limit = 0xFFFFFFFF,
3826                .type = 3,
3827                .present = 1,
3828                .s = 1,
3829                .db = 1,
3830                .g = 1
3831        };
3832        seg.selector = vmcs12->host_ds_selector;
3833        vmx_set_segment(vcpu, &seg, VCPU_SREG_DS);
3834        seg.selector = vmcs12->host_es_selector;
3835        vmx_set_segment(vcpu, &seg, VCPU_SREG_ES);
3836        seg.selector = vmcs12->host_ss_selector;
3837        vmx_set_segment(vcpu, &seg, VCPU_SREG_SS);
3838        seg.selector = vmcs12->host_fs_selector;
3839        seg.base = vmcs12->host_fs_base;
3840        vmx_set_segment(vcpu, &seg, VCPU_SREG_FS);
3841        seg.selector = vmcs12->host_gs_selector;
3842        seg.base = vmcs12->host_gs_base;
3843        vmx_set_segment(vcpu, &seg, VCPU_SREG_GS);
3844        seg = (struct kvm_segment) {
3845                .base = vmcs12->host_tr_base,
3846                .limit = 0x67,
3847                .selector = vmcs12->host_tr_selector,
3848                .type = 11,
3849                .present = 1
3850        };
3851        vmx_set_segment(vcpu, &seg, VCPU_SREG_TR);
3852
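            /*
             * Per the SDM, a VM-exit loads DR7 with 0x400 and clears
             * IA32_DEBUGCTL; emulate that for L1's (host) state.
             */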
3853        kvm_set_dr(vcpu, 7, 0x400);
3854        vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
3855
3856        if (cpu_has_vmx_msr_bitmap())
3857                vmx_update_msr_bitmap(vcpu);
3858
3859        if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
3860                                vmcs12->vm_exit_msr_load_count))
3861                nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
3862}
3863
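    /*
     * Determine the EFER value L1 was running with under vmcs01: use
     * vmcs01's GUEST_IA32_EFER if the dedicated entry control is set, else
     * assume it matches host_efer when the CPU supports that control;
     * otherwise search the MSR autoload list and the shared MSR array,
     * defaulting to host_efer.
     */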
3864static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
3865{
3866        struct shared_msr_entry *efer_msr;
3867        unsigned int i;
3868
3869        if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER)
3870                return vmcs_read64(GUEST_IA32_EFER);
3871
3872        if (cpu_has_load_ia32_efer())
3873                return host_efer;
3874
3875        for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) {
3876                if (vmx->msr_autoload.guest.val[i].index == MSR_EFER)
3877                        return vmx->msr_autoload.guest.val[i].value;
3878        }
3879
3880        efer_msr = find_msr_entry(vmx, MSR_EFER);
3881        if (efer_msr)
3882                return efer_msr->data;
3883
3884        return host_efer;
3885}
3886
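    /*
     * Manually restore L1's (vmcs01) state into KVM's software model after a
     * hardware-detected VM-Fail on nested VM-Enter, i.e. without relying on
     * the architectural VM-exit load lists.
     */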
3887static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
3888{
3889        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3890        struct vcpu_vmx *vmx = to_vmx(vcpu);
3891        struct vmx_msr_entry g, h;
3892        struct msr_data msr;
3893        gpa_t gpa;
3894        u32 i, j;
3895
3896        vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT);
3897
3898        if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) {
3899                /*
3900                 * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set
3901                 * as vmcs01.GUEST_DR7 contains a userspace defined value
3902                 * and vcpu->arch.dr7 is not squirreled away before the
3903                 * nested VMENTER (not worth adding a variable in nested_vmx).
3904                 */
3905                if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
3906                        kvm_set_dr(vcpu, 7, DR7_FIXED_1);
3907                else
3908                        WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7)));
3909        }
3910
3911        /*
3912         * Note that calling vmx_set_{efer,cr0,cr4} is important as they
3913         * handle a variety of side effects to KVM's software model.
3914         */
3915        vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx));
3916
3917        vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
3918        vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW));
3919
3920        vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
3921        vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW));
3922
3923        nested_ept_uninit_mmu_context(vcpu);
3924        vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
3925        __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
3926
3927        /*
3928         * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs
3929         * from vmcs01 (if necessary).  The PDPTRs are not loaded on
3930         * VMFail; like everything else, we just need to ensure our
3931         * software model is up-to-date.
3932         */
3933        if (enable_ept)
3934                ept_save_pdptrs(vcpu);
3935
3936        kvm_mmu_reset_context(vcpu);
3937
3938        if (cpu_has_vmx_msr_bitmap())
3939                vmx_update_msr_bitmap(vcpu);
3940
3941        /*
3942         * This nasty bit of open coding is a compromise between blindly
3943         * loading L1's MSRs using the exit load lists (incorrect emulation
3944         * of VMFail), leaving the nested VM's MSRs in the software model
3945         * (incorrect behavior) and snapshotting the modified MSRs (too
3946         * expensive since the lists are not bounded by hardware).  For each
3947         * MSR that was (prematurely) loaded from the nested VMEntry load
3948         * list, reload it from the exit load list if it exists and differs
3949         * from the guest value.  The intent is to stuff host state as
3950         * silently as possible, not to fully process the exit load list.
3951         */
3952        msr.host_initiated = false;
3953        for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) {
3954                gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g));
3955                if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) {
3956                        pr_debug_ratelimited(
3957                                "%s read MSR index failed (%u, 0x%08llx)\n",
3958                                __func__, i, gpa);
3959                        goto vmabort;
3960                }
3961
3962                for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) {
3963                        gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h));
3964                        if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) {
3965                                pr_debug_ratelimited(
3966                                        "%s read MSR failed (%u, 0x%08llx)\n",
3967                                        __func__, j, gpa);
3968                                goto vmabort;
3969                        }
3970                        if (h.index != g.index)
3971                                continue;
3972                        if (h.value == g.value)
3973                                break;
3974
3975                        if (nested_vmx_load_msr_check(vcpu, &h)) {
3976                                pr_debug_ratelimited(
3977                                        "%s check failed (%u, 0x%x, 0x%x)\n",
3978                                        __func__, j, h.index, h.reserved);
3979                                goto vmabort;
3980                        }
3981
3982                        msr.index = h.index;
3983                        msr.data = h.value;
3984                        if (kvm_set_msr(vcpu, &msr)) {
3985                                pr_debug_ratelimited(
3986                                        "%s WRMSR failed (%u, 0x%x, 0x%llx)\n",
3987                                        __func__, j, h.index, h.value);
3988                                goto vmabort;
3989                        }
3990                }
3991        }
3992
3993        return;
3994
3995vmabort:
3996        nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
3997}
3998
3999/*
4000 * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1
4001 * and modify vmcs12 to make it see what it would expect to see there if
4002 * L2 was its real guest. Must only be called when in L2 (is_guest_mode())
4003 */
4004void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
4005                       u32 exit_intr_info, unsigned long exit_qualification)
4006{
4007        struct vcpu_vmx *vmx = to_vmx(vcpu);
4008        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
4009
4010        /* trying to cancel vmlaunch/vmresume is a bug */
4011        WARN_ON_ONCE(vmx->nested.nested_run_pending);
4012
4013        leave_guest_mode(vcpu);
4014
4015        if (nested_cpu_has_preemption_timer(vmcs12))
4016                hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);
4017
4018        if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
4019                vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
4020
4021        if (likely(!vmx->fail)) {
4022                sync_vmcs02_to_vmcs12(vcpu, vmcs12);
4023
4024                if (exit_reason != -1)
4025                        prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
4026                                       exit_qualification);
4027
4028                /*
4029                 * Must happen outside of sync_vmcs02_to_vmcs12() as it will
4030                 * also be used to capture vmcs12 cache as part of
4031                 * capturing nVMX state for snapshot (migration).
4032                 *
4033                 * Otherwise, this flush will dirty guest memory at a
4034                 * point it is already assumed by user-space to be
4035                 * immutable.
4036                 */
4037                nested_flush_cached_shadow_vmcs12(vcpu, vmcs12);
4038        } else {
4039                /*
4040                 * The only expected VM-instruction error is "VM entry with
4041                 * invalid control field(s)." Anything else indicates a
4042                 * problem with L0.  And we should never get here with a
4043                 * VMFail of any type if early consistency checks are enabled.
4044                 */
4045                WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
4046                             VMXERR_ENTRY_INVALID_CONTROL_FIELD);
4047                WARN_ON_ONCE(nested_early_check);
4048        }
4049
4050        vmx_switch_vmcs(vcpu, &vmx->vmcs01);
4051
4052        /* Update any VMCS fields that might have changed while L2 ran */
4053        vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
4054        vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
4055        vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
4056
4057        if (kvm_has_tsc_control)
4058                decache_tsc_multiplier(vmx);
4059
4060        if (vmx->nested.change_vmcs01_virtual_apic_mode) {
4061                vmx->nested.change_vmcs01_virtual_apic_mode = false;
4062                vmx_set_virtual_apic_mode(vcpu);
4063        } else if (!nested_cpu_has_ept(vmcs12) &&
4064                   nested_cpu_has2(vmcs12,
4065                                   SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
4066                vmx_flush_tlb(vcpu, true);
4067        }
4068
4069        /* Unpin physical memory we referred to in vmcs02 */
4070        if (vmx->nested.apic_access_page) {
4071                kvm_release_page_dirty(vmx->nested.apic_access_page);
4072                vmx->nested.apic_access_page = NULL;
4073        }
4074        kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
4075        kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
4076        vmx->nested.pi_desc = NULL;
4077
4078        /*
4079         * We are now running in L2; the mmu_notifier will force a reload of
4080         * the page's hpa for the L2 vmcs, so reload it for L1 before entering L1.
4081         */
4082        kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
4083
4084        if ((exit_reason != -1) && (enable_shadow_vmcs || vmx->nested.hv_evmcs))
4085                vmx->nested.need_vmcs12_to_shadow_sync = true;
4086
4087        /* in case we halted in L2 */
4088        vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
4089
4090        if (likely(!vmx->fail)) {
4091                /*
4092                 * TODO: SDM says that with acknowledge interrupt on
4093                 * exit, bit 31 of the VM-exit interrupt information
4094                 * (valid interrupt) is always set to 1 on
4095                 * EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't
4096                 * need kvm_cpu_has_interrupt().  See the commit
4097                 * message for details.
4098                 */
4099                if (nested_exit_intr_ack_set(vcpu) &&
4100                    exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
4101                    kvm_cpu_has_interrupt(vcpu)) {
4102                        int irq = kvm_cpu_get_interrupt(vcpu);
4103                        WARN_ON(irq < 0);
4104                        vmcs12->vm_exit_intr_info = irq |
4105                                INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
4106                }
4107
4108                if (exit_reason != -1)
4109                        trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
4110                                                       vmcs12->exit_qualification,
4111                                                       vmcs12->idt_vectoring_info_field,
4112                                                       vmcs12->vm_exit_intr_info,
4113                                                       vmcs12->vm_exit_intr_error_code,
4114                                                       KVM_ISA_VMX);
4115
4116                load_vmcs12_host_state(vcpu, vmcs12);
4117
4118                return;
4119        }
4120
4121        /*
4122         * After an early L2 VM-entry failure, we're now back
4123         * in L1 which thinks it just finished a VMLAUNCH or
4124         * VMRESUME instruction, so we need to set the failure
4125         * flag and the VM-instruction error field of the VMCS
4126         * accordingly, and skip the emulated instruction.
4127         */
4128        (void)nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
4129
4130        /*
4131         * Restore L1's host state to KVM's software model.  We're here
4132         * because a consistency check was caught by hardware, which
4133         * means some amount of guest state has been propagated to KVM's
4134         * model and needs to be unwound to the host's state.
4135         */
4136        nested_vmx_restore_host_state(vcpu);
4137
4138        vmx->fail = 0;
4139}
4140
4141/*
4142 * Decode the memory-address operand of a vmx instruction, as recorded on an
4143 * exit caused by such an instruction (run by a guest hypervisor).
4144 * On success, returns 0. When the operand is invalid, returns 1 and throws
4145 * #UD or #GP.
4146 */
4147int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
4148                        u32 vmx_instruction_info, bool wr, int len, gva_t *ret)
4149{
4150        gva_t off;
4151        bool exn;
4152        struct kvm_segment s;
4153
4154        /*
4155         * According to Vol. 3B, "Information for VM Exits Due to Instruction
4156         * Execution", on an exit, vmx_instruction_info holds most of the
4157         * addressing components of the operand. Only the displacement part
4158         * is put in exit_qualification (see 3B, "Basic VM-Exit Information").
4159         * For how an actual address is calculated from all these components,
4160         * refer to Vol. 1, "Operand Addressing".
4161         */
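            /*
             * Bit layout consumed below (VM-exit instruction-information field):
             * bits 1:0 scaling, bits 9:7 address size, bit 10 register operand,
             * bits 17:15 segment register, bits 21:18 index register, bit 22
             * index invalid, bits 26:23 base register, bit 27 base invalid.
             */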
4162        int  scaling = vmx_instruction_info & 3;
4163        int  addr_size = (vmx_instruction_info >> 7) & 7;
4164        bool is_reg = vmx_instruction_info & (1u << 10);
4165        int  seg_reg = (vmx_instruction_info >> 15) & 7;
4166        int  index_reg = (vmx_instruction_info >> 18) & 0xf;
4167        bool index_is_valid = !(vmx_instruction_info & (1u << 22));
4168        int  base_reg       = (vmx_instruction_info >> 23) & 0xf;
4169        bool base_is_valid  = !(vmx_instruction_info & (1u << 27));
4170
4171        if (is_reg) {
4172                kvm_queue_exception(vcpu, UD_VECTOR);
4173                return 1;
4174        }
4175
4176        /* Addr = segment_base + offset */
4177        /* offset = base + [index * scale] + displacement */
4178        off = exit_qualification; /* holds the displacement */
4179        if (addr_size == 1)
4180                off = (gva_t)sign_extend64(off, 31);
4181        else if (addr_size == 0)
4182                off = (gva_t)sign_extend64(off, 15);
4183        if (base_is_valid)
4184                off += kvm_register_read(vcpu, base_reg);
4185        if (index_is_valid)
4186                off += kvm_register_read(vcpu, index_reg)<<scaling;
4187        vmx_get_segment(vcpu, &s, seg_reg);
4188
4189        /*
4190         * The effective address, i.e. @off, of a memory operand is truncated
4191         * based on the address size of the instruction.  Note that this is
4192         * the *effective address*, i.e. the address prior to accounting for
4193         * the segment's base.
4194         */
4195        if (addr_size == 1) /* 32 bit */
4196                off &= 0xffffffff;
4197        else if (addr_size == 0) /* 16 bit */
4198                off &= 0xffff;
4199
4200        /* Checks for #GP/#SS exceptions. */
4201        exn = false;
4202        if (is_long_mode(vcpu)) {
4203                /*
4204                 * The virtual/linear address is never truncated in 64-bit
4205                 * mode, e.g. a 32-bit address size can yield a 64-bit virtual
4206                 * address when using FS/GS with a non-zero base.
4207                 */
4208                if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS)
4209                        *ret = s.base + off;
4210                else
4211                        *ret = off;
4212
4213                /* Long mode: #GP(0)/#SS(0) if the memory address is in a
4214                 * non-canonical form. This is the only check on the memory
4215                 * destination for long mode!
4216                 */
4217                exn = is_noncanonical_address(*ret, vcpu);
4218        } else {
4219                /*
4220                 * When not in long mode, the virtual/linear address is
4221                 * unconditionally truncated to 32 bits regardless of the
4222                 * address size.
4223                 */
4224                *ret = (s.base + off) & 0xffffffff;
4225
4226                /* Protected mode: apply checks for segment validity in the
4227                 * following order:
4228                 * - segment type check (#GP(0) may be thrown)
4229                 * - usability check (#GP(0)/#SS(0))
4230                 * - limit check (#GP(0)/#SS(0))
4231                 */
4232                if (wr)
4233                        /* #GP(0) if the destination operand is located in a
4234                         * read-only data segment or any code segment.
4235                         */
4236                        exn = ((s.type & 0xa) == 0 || (s.type & 8));
4237                else
4238                        /* #GP(0) if the source operand is located in an
4239                         * execute-only code segment
4240                         */
4241                        exn = ((s.type & 0xa) == 8);
4242                if (exn) {
4243                        kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
4244                        return 1;
4245                }
4246                /* Protected mode: #GP(0)/#SS(0) if the segment is unusable.
4247                 */
4248                exn = (s.unusable != 0);
4249
4250                /*
4251                 * Protected mode: #GP(0)/#SS(0) if the memory operand is
4252                 * outside the segment limit.  All CPUs that support VMX ignore
4253                 * limit checks for flat segments, i.e. segments with base==0,
4254                 * limit==0xffffffff and of type expand-up data or code.
4255                 */
4256                if (!(s.base == 0 && s.limit == 0xffffffff &&
4257                     ((s.type & 8) || !(s.type & 4))))
4258                        exn = exn || ((u64)off + len - 1 > s.limit);
4259        }
4260        if (exn) {
4261                kvm_queue_exception_e(vcpu,
4262                                      seg_reg == VCPU_SREG_SS ?
4263                                                SS_VECTOR : GP_VECTOR,
4264                                      0);
4265                return 1;
4266        }
4267
4268        return 0;
4269}
4270
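    /*
     * Decode the memory operand of the current VMX instruction and read the
     * 64-bit VMCS pointer it references from guest memory, injecting a page
     * fault on failure.
     */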
4271static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer)
4272{
4273        gva_t gva;
4274        struct x86_exception e;
4275
4276        if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
4277                                vmcs_read32(VMX_INSTRUCTION_INFO), false,
4278                                sizeof(*vmpointer), &gva))
4279                return 1;
4280
4281        if (kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e)) {
4282                kvm_inject_page_fault(vcpu, &e);
4283                return 1;
4284        }
4285
4286        return 0;
4287}
4288
4289/*
4290 * Allocate a shadow VMCS and associate it with the currently loaded
4291 * VMCS, unless such a shadow VMCS already exists. The newly allocated
4292 * VMCS is also VMCLEARed, so that it is ready for use.
4293 */
4294static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu)
4295{
4296        struct vcpu_vmx *vmx = to_vmx(vcpu);
4297        struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs;
4298
4299        /*
4300         * We should allocate a shadow vmcs for vmcs01 only when L1
4301         * executes VMXON and free it when L1 executes VMXOFF.
4302         * As it is invalid to execute VMXON twice, we shouldn't reach
4303         * here when vmcs01 already have an allocated shadow vmcs.
4304         */
4305        WARN_ON(loaded_vmcs == &vmx->vmcs01 && loaded_vmcs->shadow_vmcs);
4306
4307        if (!loaded_vmcs->shadow_vmcs) {
4308                loaded_vmcs->shadow_vmcs = alloc_vmcs(true);
4309                if (loaded_vmcs->shadow_vmcs)
4310                        vmcs_clear(loaded_vmcs->shadow_vmcs);
4311        }
4312        return loaded_vmcs->shadow_vmcs;
4313}
4314
4315static int enter_vmx_operation(struct kvm_vcpu *vcpu)
4316{
4317        struct vcpu_vmx *vmx = to_vmx(vcpu);
4318        int r;
4319
4320        r = alloc_loaded_vmcs(&vmx->nested.vmcs02);
4321        if (r < 0)
4322                goto out_vmcs02;
4323
4324        vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
4325        if (!vmx->nested.cached_vmcs12)
4326                goto out_cached_vmcs12;
4327
4328        vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
4329        if (!vmx->nested.cached_shadow_vmcs12)
4330                goto out_cached_shadow_vmcs12;
4331
4332        if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu))
4333                goto out_shadow_vmcs;
4334
4335        hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
4336                     HRTIMER_MODE_REL_PINNED);
4337        vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
4338
4339        vmx->nested.vpid02 = allocate_vpid();
4340
4341        vmx->nested.vmcs02_initialized = false;
4342        vmx->nested.vmxon = true;
4343
4344        if (pt_mode == PT_MODE_HOST_GUEST) {
4345                vmx->pt_desc.guest.ctl = 0;
4346                pt_update_intercept_for_msr(vmx);
4347        }
4348
4349        return 0;
4350
4351out_shadow_vmcs:
4352        kfree(vmx->nested.cached_shadow_vmcs12);
4353
4354out_cached_shadow_vmcs12:
4355        kfree(vmx->nested.cached_vmcs12);
4356
4357out_cached_vmcs12:
4358        free_loaded_vmcs(&vmx->nested.vmcs02);
4359
4360out_vmcs02:
4361        return -ENOMEM;
4362}
4363
4364/*
4365 * Emulate the VMXON instruction.
4366 * Currently, we just remember that VMX is active, and do not save or even
4367 * inspect the argument to VMXON (the so-called "VMXON pointer") because we
4368 * do not currently need to store anything in that guest-allocated memory
4369 * region. Consequently, VMCLEAR and VMPTRLD also do not verify that their
4370 * argument is different from the VMXON pointer (which the spec says they do).
4371 */
4372static int handle_vmon(struct kvm_vcpu *vcpu)
4373{
4374        int ret;
4375        gpa_t vmptr;
4376        uint32_t revision;
4377        struct vcpu_vmx *vmx = to_vmx(vcpu);
4378        const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED
4379                | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
4380
4381        /*
4382         * The Intel VMX Instruction Reference lists a bunch of bits that are
4383         * prerequisite to running VMXON, most notably cr4.VMXE must be set to
4384         * 1 (see vmx_set_cr4() for when we allow the guest to set this).
4385         * Otherwise, we should fail with #UD.  But most faulting conditions
4386         * have already been checked by hardware, prior to the VM-exit for
4387         * VMXON.  We do test guest cr4.VMXE because processor CR4 always has
4388         * that bit set to 1 in non-root mode.
4389         */
4390        if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) {
4391                kvm_queue_exception(vcpu, UD_VECTOR);
4392                return 1;
4393        }
4394
4395        /* CPL=0 must be checked manually. */
4396        if (vmx_get_cpl(vcpu)) {
4397                kvm_inject_gp(vcpu, 0);
4398                return 1;
4399        }
4400
4401        if (vmx->nested.vmxon)
4402                return nested_vmx_failValid(vcpu,
4403                        VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
4404
4405        if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
4406                        != VMXON_NEEDED_FEATURES) {
4407                kvm_inject_gp(vcpu, 0);
4408                return 1;
4409        }
4410
4411        if (nested_vmx_get_vmptr(vcpu, &vmptr))
4412                return 1;
4413
4414        /*
4415         * SDM 3: 24.11.5
4416         * The first 4 bytes of VMXON region contain the supported
4417         * VMCS revision identifier
4418         *
4419         * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case,
4420         * which would limit the physical address width to 32 bits.
4421         */
4422        if (!page_address_valid(vcpu, vmptr))
4423                return nested_vmx_failInvalid(vcpu);
4424
4425        if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) ||
4426            revision != VMCS12_REVISION)
4427                return nested_vmx_failInvalid(vcpu);
4428
4429        vmx->nested.vmxon_ptr = vmptr;
4430        ret = enter_vmx_operation(vcpu);
4431        if (ret)
4432                return ret;
4433
4434        return nested_vmx_succeed(vcpu);
4435}
4436
4437static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu)
4438{
4439        struct vcpu_vmx *vmx = to_vmx(vcpu);
4440
4441        if (vmx->nested.current_vmptr == -1ull)
4442                return;
4443
4444        copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
4445
4446        if (enable_shadow_vmcs) {
4447                /* copy to memory all shadowed fields in case
4448                 * they were modified */
4449                copy_shadow_to_vmcs12(vmx);
4450                vmx_disable_shadow_vmcs(vmx);
4451        }
4452        vmx->nested.posted_intr_nv = -1;
4453
4454        /* Flush VMCS12 to guest memory */
4455        kvm_vcpu_write_guest_page(vcpu,
4456                                  vmx->nested.current_vmptr >> PAGE_SHIFT,
4457                                  vmx->nested.cached_vmcs12, 0, VMCS12_SIZE);
4458
4459        kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
4460
4461        vmx->nested.current_vmptr = -1ull;
4462}
4463
4464/* Emulate the VMXOFF instruction */
4465static int handle_vmoff(struct kvm_vcpu *vcpu)
4466{
4467        if (!nested_vmx_check_permission(vcpu))
4468                return 1;
4469        free_nested(vcpu);
4470        return nested_vmx_succeed(vcpu);
4471}
4472
4473/* Emulate the VMCLEAR instruction */
4474static int handle_vmclear(struct kvm_vcpu *vcpu)
4475{
4476        struct vcpu_vmx *vmx = to_vmx(vcpu);
4477        u32 zero = 0;
4478        gpa_t vmptr;
4479        u64 evmcs_gpa;
4480
4481        if (!nested_vmx_check_permission(vcpu))
4482                return 1;
4483
4484        if (nested_vmx_get_vmptr(vcpu, &vmptr))
4485                return 1;
4486
4487        if (!page_address_valid(vcpu, vmptr))
4488                return nested_vmx_failValid(vcpu,
4489                        VMXERR_VMCLEAR_INVALID_ADDRESS);
4490
4491        if (vmptr == vmx->nested.vmxon_ptr)
4492                return nested_vmx_failValid(vcpu,
4493                        VMXERR_VMCLEAR_VMXON_POINTER);
4494
4495        /*
4496         * When Enlightened VMEntry is enabled on the calling CPU we treat
4497         * memory area pointed to by vmptr as Enlightened VMCS (as there's no good
4498         * way to distinguish it from VMCS12) and we must not corrupt it by
4499         * writing to the non-existent 'launch_state' field. The area doesn't
4500         * have to be the currently active EVMCS on the calling CPU and there's
4501         * nothing KVM has to do to transition it from 'active' to 'non-active'
4502         * state. It is possible that the area will stay mapped as
4503         * vmx->nested.hv_evmcs but this shouldn't be a problem.
4504         */
4505        if (likely(!vmx->nested.enlightened_vmcs_enabled ||
4506                   !nested_enlightened_vmentry(vcpu, &evmcs_gpa))) {
4507                if (vmptr == vmx->nested.current_vmptr)
4508                        nested_release_vmcs12(vcpu);
4509
4510                kvm_vcpu_write_guest(vcpu,
4511                                     vmptr + offsetof(struct vmcs12,
4512                                                      launch_state),
4513                                     &zero, sizeof(zero));
4514        }
4515
4516        return nested_vmx_succeed(vcpu);
4517}
4518
4519static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch);
4520
4521/* Emulate the VMLAUNCH instruction */
4522static int handle_vmlaunch(struct kvm_vcpu *vcpu)
4523{
4524        return nested_vmx_run(vcpu, true);
4525}
4526
4527/* Emulate the VMRESUME instruction */
4528static int handle_vmresume(struct kvm_vcpu *vcpu)
4529{
4530
4531        return nested_vmx_run(vcpu, false);
4532}
4533
4534static int handle_vmread(struct kvm_vcpu *vcpu)
4535{
4536        unsigned long field;
4537        u64 field_value;
4538        unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
4539        u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
4540        int len;
4541        gva_t gva = 0;
4542        struct vmcs12 *vmcs12;
4543        struct x86_exception e;
4544        short offset;
4545
4546        if (!nested_vmx_check_permission(vcpu))
4547                return 1;
4548
4549        if (to_vmx(vcpu)->nested.current_vmptr == -1ull)
4550                return nested_vmx_failInvalid(vcpu);
4551
4552        if (!is_guest_mode(vcpu))
4553                vmcs12 = get_vmcs12(vcpu);
4554        else {
4555                /*
4556                 * When vmcs->vmcs_link_pointer is -1ull, any VMREAD
4557                 * to shadowed-field sets the ALU flags for VMfailInvalid.
4558                 */
4559                if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)
4560                        return nested_vmx_failInvalid(vcpu);
4561                vmcs12 = get_shadow_vmcs12(vcpu);
4562        }
4563
4564        /* Decode instruction info and find the field to read */
4565        field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
4566
4567        offset = vmcs_field_to_offset(field);
4568        if (offset < 0)
4569                return nested_vmx_failValid(vcpu,
4570                        VMXERR_UNSUPPORTED_VMCS_COMPONENT);
4571
4572        if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field))
4573                copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
4574
4575        /* Read the field, zero-extended to a u64 field_value */
4576        field_value = vmcs12_read_any(vmcs12, field, offset);
4577
4578        /*
4579         * Now copy part of this value to register or memory, as requested.
4580         * Note that the number of bits actually copied is 32 or 64 depending
4581         * on the guest's mode (32 or 64 bit), not on the given field's length.
4582         */
4583        if (vmx_instruction_info & (1u << 10)) {
4584                kvm_register_writel(vcpu, (((vmx_instruction_info) >> 3) & 0xf),
4585                        field_value);
4586        } else {
4587                len = is_64_bit_mode(vcpu) ? 8 : 4;
4588                if (get_vmx_mem_address(vcpu, exit_qualification,
4589                                vmx_instruction_info, true, len, &gva))
4590                        return 1;
4591                /* _system ok, nested_vmx_check_permission has verified cpl=0 */
4592                if (kvm_write_guest_virt_system(vcpu, gva, &field_value, len, &e))
4593                        kvm_inject_page_fault(vcpu, &e);
4594        }
4595
4596        return nested_vmx_succeed(vcpu);
4597}
4598
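    /*
     * Membership tests for the shadow VMCS field lists; the case labels are
     * generated from vmcs_shadow_fields.h via the SHADOW_FIELD_RW/RO macros.
     */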
4599static bool is_shadow_field_rw(unsigned long field)
4600{
4601        switch (field) {
4602#define SHADOW_FIELD_RW(x, y) case x:
4603#include "vmcs_shadow_fields.h"
4604                return true;
4605        default:
4606                break;
4607        }
4608        return false;
4609}
4610
4611static bool is_shadow_field_ro(unsigned long field)
4612{
4613        switch (field) {
4614#define SHADOW_FIELD_RO(x, y) case x:
4615#include "vmcs_shadow_fields.h"
4616                return true;
4617        default:
4618                break;
4619        }
4620        return false;
4621}
4622
4623static int handle_vmwrite(struct kvm_vcpu *vcpu)
4624{
4625        unsigned long field;
4626        int len;
4627        gva_t gva;
4628        struct vcpu_vmx *vmx = to_vmx(vcpu);
4629        unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
4630        u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
4631
4632        /* The value to write might be 32 or 64 bits, depending on L1's long
4633         * mode, and eventually we need to write that into a field of several
4634         * possible lengths. The code below first zero-extends the value to 64
4635         * bit (field_value), and then copies only the appropriate number of
4636         * bits into the vmcs12 field.
4637         */
4638        u64 field_value = 0;
4639        struct x86_exception e;
4640        struct vmcs12 *vmcs12;
4641        short offset;
4642
4643        if (!nested_vmx_check_permission(vcpu))
4644                return 1;
4645
4646        if (vmx->nested.current_vmptr == -1ull)
4647                return nested_vmx_failInvalid(vcpu);
4648
4649        if (vmx_instruction_info & (1u << 10))
4650                field_value = kvm_register_readl(vcpu,
4651                        (((vmx_instruction_info) >> 3) & 0xf));
4652        else {
4653                len = is_64_bit_mode(vcpu) ? 8 : 4;
4654                if (get_vmx_mem_address(vcpu, exit_qualification,
4655                                vmx_instruction_info, false, len, &gva))
4656                        return 1;
4657                if (kvm_read_guest_virt(vcpu, gva, &field_value, len, &e)) {
4658                        kvm_inject_page_fault(vcpu, &e);
4659                        return 1;
4660                }
4661        }
4662
4663
4664        field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
4665        /*
4666         * If the vCPU supports "VMWRITE to any supported field in the
4667         * VMCS," then the "read-only" fields are actually read/write.
4668         */
4669        if (vmcs_field_readonly(field) &&
4670            !nested_cpu_has_vmwrite_any_field(vcpu))
4671                return nested_vmx_failValid(vcpu,
4672                        VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
4673
4674        if (!is_guest_mode(vcpu)) {
4675                vmcs12 = get_vmcs12(vcpu);
4676
4677                /*
4678                 * Ensure vmcs12 is up-to-date before any VMWRITE that dirties
4679                 * vmcs12, else we may clobber a field or consume a stale value.
4680                 */
4681                if (!is_shadow_field_rw(field))
4682                        copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
4683        } else {
4684                /*
4685                 * When vmcs->vmcs_link_pointer is -1ull, any VMWRITE
4686                 * to shadowed-field sets the ALU flags for VMfailInvalid.
4687                 */
4688                if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)
4689                        return nested_vmx_failInvalid(vcpu);
4690                vmcs12 = get_shadow_vmcs12(vcpu);
4691        }
4692
4693        offset = vmcs_field_to_offset(field);
4694        if (offset < 0)
4695                return nested_vmx_failValid(vcpu,
4696                        VMXERR_UNSUPPORTED_VMCS_COMPONENT);
4697
4698        /*
4699         * Some Intel CPUs intentionally drop the reserved bits of the AR byte
4700         * fields on VMWRITE.  Emulate this behavior to ensure consistent KVM
4701         * behavior regardless of the underlying hardware, e.g. if an AR_BYTE
4702         * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD
4703         * from L1 will return a different value than VMREAD from L2 (L1 sees
4704         * the stripped down value, L2 sees the full value as stored by KVM).
4705         */
4706        if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES)
4707                field_value &= 0x1f0ff;
4708
4709        vmcs12_write_any(vmcs12, field, offset, field_value);
4710
4711        /*
4712         * Do not track vmcs12 dirty-state if in guest-mode as we actually
4713         * dirty shadow vmcs12 instead of vmcs12.  Fields that can be updated
4714         * by L1 without a vmexit are always updated in the vmcs02, i.e. don't
4715         * "dirty" vmcs12, all others go down the prepare_vmcs02() slow path.
4716         */
4717        if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) {
4718                /*
4719                 * L1 can read these fields without exiting, ensure the
4720                 * shadow VMCS is up-to-date.
4721                 */
4722                if (enable_shadow_vmcs && is_shadow_field_ro(field)) {
4723                        preempt_disable();
4724                        vmcs_load(vmx->vmcs01.shadow_vmcs);
4725
4726                        __vmcs_writel(field, field_value);
4727
4728                        vmcs_clear(vmx->vmcs01.shadow_vmcs);
4729                        vmcs_load(vmx->loaded_vmcs->vmcs);
4730                        preempt_enable();
4731                }
4732                vmx->nested.dirty_vmcs12 = true;
4733        }
4734
4735        return nested_vmx_succeed(vcpu);
4736}
4737
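    /*
     * Make @vmptr the current VMCS12 and, if shadow VMCS is in use, point
     * vmcs01's VMCS link pointer at the shadow VMCS so that L1's
     * VMREAD/VMWRITE can be handled by hardware; the shadow copy is synced
     * lazily via need_vmcs12_to_shadow_sync.
     */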
4738static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
4739{
4740        vmx->nested.current_vmptr = vmptr;
4741        if (enable_shadow_vmcs) {
4742                secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
4743                vmcs_write64(VMCS_LINK_POINTER,
4744                             __pa(vmx->vmcs01.shadow_vmcs));
4745                vmx->nested.need_vmcs12_to_shadow_sync = true;
4746        }
4747        vmx->nested.dirty_vmcs12 = true;
4748}
4749
4750/* Emulate the VMPTRLD instruction */
4751static int handle_vmptrld(struct kvm_vcpu *vcpu)
4752{
4753        struct vcpu_vmx *vmx = to_vmx(vcpu);
4754        gpa_t vmptr;
4755
4756        if (!nested_vmx_check_permission(vcpu))
4757                return 1;
4758
4759        if (nested_vmx_get_vmptr(vcpu, &vmptr))
4760                return 1;
4761
4762        if (!page_address_valid(vcpu, vmptr))
4763                return nested_vmx_failValid(vcpu,
4764                        VMXERR_VMPTRLD_INVALID_ADDRESS);
4765
4766        if (vmptr == vmx->nested.vmxon_ptr)
4767                return nested_vmx_failValid(vcpu,
4768                        VMXERR_VMPTRLD_VMXON_POINTER);
4769
4770        /* Forbid normal VMPTRLD if Enlightened version was used */
4771        if (vmx->nested.hv_evmcs)
4772                return 1;
4773
4774        if (vmx->nested.current_vmptr != vmptr) {
4775                struct kvm_host_map map;
4776                struct vmcs12 *new_vmcs12;
4777
4778                if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmptr), &map)) {
4779                        /*
4780                         * Reads from an unbacked page return all 1s,
4781                         * which means that the 32 bits located at the
4782                         * given physical address won't match the required
4783                         * VMCS12_REVISION identifier.
4784                         */
4785                        return nested_vmx_failValid(vcpu,
4786                                VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
4787                }
4788
4789                new_vmcs12 = map.hva;
4790
4791                if (new_vmcs12->hdr.revision_id != VMCS12_REVISION ||
4792                    (new_vmcs12->hdr.shadow_vmcs &&
4793                     !nested_cpu_has_vmx_shadow_vmcs(vcpu))) {
4794                        kvm_vcpu_unmap(vcpu, &map, false);
4795                        return nested_vmx_failValid(vcpu,
4796                                VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
4797                }
4798
4799                nested_release_vmcs12(vcpu);
4800
4801                /*
4802                 * Load VMCS12 from guest memory since it is not already
4803                 * cached.
4804                 */
4805                memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE);
4806                kvm_vcpu_unmap(vcpu, &map, false);
4807
4808                set_current_vmptr(vmx, vmptr);
4809        }
4810
4811        return nested_vmx_succeed(vcpu);
4812}
4813
4814/* Emulate the VMPTRST instruction */
4815static int handle_vmptrst(struct kvm_vcpu *vcpu)
4816{
4817        unsigned long exit_qual = vmcs_readl(EXIT_QUALIFICATION);
4818        u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
4819        gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr;
4820        struct x86_exception e;
4821        gva_t gva;
4822
4823        if (!nested_vmx_check_permission(vcpu))
4824                return 1;
4825
4826        if (unlikely(to_vmx(vcpu)->nested.hv_evmcs))
4827                return 1;
4828
4829        if (get_vmx_mem_address(vcpu, exit_qual, instr_info,
4830                                true, sizeof(gpa_t), &gva))
4831                return 1;
4832        /* *_system ok, nested_vmx_check_permission has verified cpl=0 */
4833        if (kvm_write_guest_virt_system(vcpu, gva, (void *)&current_vmptr,
4834                                        sizeof(gpa_t), &e)) {
4835                kvm_inject_page_fault(vcpu, &e);
4836                return 1;
4837        }
4838        return nested_vmx_succeed(vcpu);
4839}
4840
4841/* Emulate the INVEPT instruction */
4842static int handle_invept(struct kvm_vcpu *vcpu)
4843{
4844        struct vcpu_vmx *vmx = to_vmx(vcpu);
4845        u32 vmx_instruction_info, types;
4846        unsigned long type;
4847        gva_t gva;
4848        struct x86_exception e;
4849        struct {
4850                u64 eptp, gpa;
4851        } operand;
4852
4853        if (!(vmx->nested.msrs.secondary_ctls_high &
4854              SECONDARY_EXEC_ENABLE_EPT) ||
4855            !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) {
4856                kvm_queue_exception(vcpu, UD_VECTOR);
4857                return 1;
4858        }
4859
4860        if (!nested_vmx_check_permission(vcpu))
4861                return 1;
4862
4863        vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
4864        type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
4865
4866        types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
4867
4868        if (type >= 32 || !(types & (1 << type)))
4869                return nested_vmx_failValid(vcpu,
4870                                VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
4871
4872        /* According to the Intel VMX instruction reference, the memory
4873         * operand is read even if it isn't needed (e.g., for type==global)
4874         */
4875        if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
4876                        vmx_instruction_info, false, sizeof(operand), &gva))
4877                return 1;
4878        if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
4879                kvm_inject_page_fault(vcpu, &e);
4880                return 1;
4881        }
4882
4883        switch (type) {
4884        case VMX_EPT_EXTENT_GLOBAL:
4885        case VMX_EPT_EXTENT_CONTEXT:
4886                /*
4887                 * TODO: Sync the necessary shadow EPT roots here, rather than
4888                 * at the next emulated VM-entry.
4889                 */
4890                break;
4891        default:
4892                BUG_ON(1);
4893                break;
4894        }
4895
4896        return nested_vmx_succeed(vcpu);
4897}
4898
4899static int handle_invvpid(struct kvm_vcpu *vcpu)
4900{
4901        struct vcpu_vmx *vmx = to_vmx(vcpu);
4902        u32 vmx_instruction_info;
4903        unsigned long type, types;
4904        gva_t gva;
4905        struct x86_exception e;
4906        struct {
4907                u64 vpid;
4908                u64 gla;
4909        } operand;
4910        u16 vpid02;
4911
4912        if (!(vmx->nested.msrs.secondary_ctls_high &
4913              SECONDARY_EXEC_ENABLE_VPID) ||
4914                        !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) {
4915                kvm_queue_exception(vcpu, UD_VECTOR);
4916                return 1;
4917        }
4918
4919        if (!nested_vmx_check_permission(vcpu))
4920                return 1;
4921
4922        vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
4923        type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
4924
4925        types = (vmx->nested.msrs.vpid_caps &
4926                        VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8;
4927
4928        if (type >= 32 || !(types & (1 << type)))
4929                return nested_vmx_failValid(vcpu,
4930                        VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
4931
4932        /* According to the Intel VMX instruction reference, the memory
4933         * operand is read even if it isn't needed (e.g., for type==global)
4934         */
4935        if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
4936                        vmx_instruction_info, false, sizeof(operand), &gva))
4937                return 1;
4938        if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
4939                kvm_inject_page_fault(vcpu, &e);
4940                return 1;
4941        }
4942        if (operand.vpid >> 16)
4943                return nested_vmx_failValid(vcpu,
4944                        VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
4945
4946        vpid02 = nested_get_vpid02(vcpu);
4947        switch (type) {
4948        case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
4949                if (!operand.vpid ||
4950                    is_noncanonical_address(operand.gla, vcpu))
4951                        return nested_vmx_failValid(vcpu,
4952                                VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
4953                if (cpu_has_vmx_invvpid_individual_addr()) {
4954                        __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR,
4955                                vpid02, operand.gla);
4956                } else
4957                        __vmx_flush_tlb(vcpu, vpid02, false);
4958                break;
4959        case VMX_VPID_EXTENT_SINGLE_CONTEXT:
4960        case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
4961                if (!operand.vpid)
4962                        return nested_vmx_failValid(vcpu,
4963                                VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
4964                __vmx_flush_tlb(vcpu, vpid02, false);
4965                break;
4966        case VMX_VPID_EXTENT_ALL_CONTEXT:
4967                __vmx_flush_tlb(vcpu, vpid02, false);
4968                break;
4969        default:
4970                WARN_ON_ONCE(1);
4971                return kvm_skip_emulated_instruction(vcpu);
4972        }
4973
4974        return nested_vmx_succeed(vcpu);
4975}
4976
4977static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
4978                                     struct vmcs12 *vmcs12)
4979{
4980        u32 index = kvm_rcx_read(vcpu);
4981        u64 address;
4982        bool accessed_dirty;
4983        struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
4984
4985        if (!nested_cpu_has_eptp_switching(vmcs12) ||
4986            !nested_cpu_has_ept(vmcs12))
4987                return 1;
4988
4989        if (index >= VMFUNC_EPTP_ENTRIES)
4990                return 1;
4991
4992
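            /* Read the index'th 8-byte EPTP entry from L1's EPTP list page. */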
4993        if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT,
4994                                     &address, index * 8, 8))
4995                return 1;
4996
4997        accessed_dirty = !!(address & VMX_EPTP_AD_ENABLE_BIT);
4998
4999        /*
5000         * If the (L2) guest does a vmfunc to the currently
5001         * active ept pointer, we don't have to do anything else
5002         */
5003        if (vmcs12->ept_pointer != address) {
5004                if (!valid_ept_address(vcpu, address))
5005                        return 1;
5006
5007                kvm_mmu_unload(vcpu);
5008                mmu->ept_ad = accessed_dirty;
5009                mmu->mmu_role.base.ad_disabled = !accessed_dirty;
5010                vmcs12->ept_pointer = address;
5011                /*
5012                 * TODO: Check what's the correct approach in case
5013                 * mmu reload fails. Currently, we just let the next
5014                 * reload potentially fail
5015                 */
5016                kvm_mmu_reload(vcpu);
5017        }
5018
5019        return 0;
5020}
5021
5022static int handle_vmfunc(struct kvm_vcpu *vcpu)
5023{
5024        struct vcpu_vmx *vmx = to_vmx(vcpu);
5025        struct vmcs12 *vmcs12;
5026        u32 function = kvm_rax_read(vcpu);
5027
5028        /*
5029         * VMFUNC is only supported for nested guests, but we always enable the
5030         * secondary control for simplicity; for non-nested mode, fake that we
5031         * didn't by injecting #UD.
5032         */
5033        if (!is_guest_mode(vcpu)) {
5034                kvm_queue_exception(vcpu, UD_VECTOR);
5035                return 1;
5036        }
5037
5038        vmcs12 = get_vmcs12(vcpu);
5039        if ((vmcs12->vm_function_control & (1 << function)) == 0)
5040                goto fail;
5041
5042        switch (function) {
5043        case 0:
5044                if (nested_vmx_eptp_switching(vcpu, vmcs12))
5045                        goto fail;
5046                break;
5047        default:
5048                goto fail;
5049        }
5050        return kvm_skip_emulated_instruction(vcpu);
5051
5052fail:
5053        nested_vmx_vmexit(vcpu, vmx->exit_reason,
5054                          vmcs_read32(VM_EXIT_INTR_INFO),
5055                          vmcs_readl(EXIT_QUALIFICATION));
5056        return 1;
5057}
5058
5059
5060static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
5061                                       struct vmcs12 *vmcs12)
5062{
5063        unsigned long exit_qualification;
5064        gpa_t bitmap, last_bitmap;
5065        unsigned int port;
5066        int size;
5067        u8 b;
5068
5069        if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
5070                return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
5071
5072        exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5073
5074        port = exit_qualification >> 16;
5075        size = (exit_qualification & 7) + 1;
5076
5077        last_bitmap = (gpa_t)-1;
5078        b = -1;
5079
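            /*
             * Check every port touched by the access: I/O bitmap A covers
             * ports 0x0000-0x7fff, bitmap B covers ports 0x8000-0xffff;
             * reflect the exit to L1 if any of those ports has its bit set
             * in L1's bitmaps.
             */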
5080        while (size > 0) {
5081                if (port < 0x8000)
5082                        bitmap = vmcs12->io_bitmap_a;
5083                else if (port < 0x10000)
5084                        bitmap = vmcs12->io_bitmap_b;
5085                else
5086                        return true;
5087                bitmap += (port & 0x7fff) / 8;
5088
5089                if (last_bitmap != bitmap)
5090                        if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1))
5091                                return true;
5092                if (b & (1 << (port & 7)))
5093                        return true;
5094
5095                port++;
5096                size--;
5097                last_bitmap = bitmap;
5098        }
5099
5100        return false;
5101}
5102
5103/*
5104 * Return 1 if we should exit from L2 to L1 to handle an MSR access,
5105 * rather than handle it ourselves in L0. I.e., check whether L1 expressed
5106 * disinterest in the current event (read or write a specific MSR) by using an
5107 * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps.
5108 */
5109static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
5110        struct vmcs12 *vmcs12, u32 exit_reason)
5111{
5112        u32 msr_index = kvm_rcx_read(vcpu);
5113        gpa_t bitmap;
5114
5115        if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
5116                return true;
5117
5118        /*
5119         * The MSR_BITMAP page is divided into four 1024-byte bitmaps,
5120         * for the four combinations of read/write and low/high MSR numbers.
5121         * First we need to figure out which of the four to use:
5122         */
5123        bitmap = vmcs12->msr_bitmap;
5124        if (exit_reason == EXIT_REASON_MSR_WRITE)
5125                bitmap += 2048;
5126        if (msr_index >= 0xc0000000) {
5127                msr_index -= 0xc0000000;
5128                bitmap += 1024;
5129        }
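            /*
             * Example: a WRMSR to MSR_LSTAR (0xc0000082) lands in the
             * "write, high" bitmap, i.e. byte 2048 + 1024 + 0x82 / 8 of
             * vmcs12->msr_bitmap, bit 0x82 % 8.
             */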
5130
5131        /* Then read the msr_index'th bit from this bitmap: */
5132        if (msr_index < 1024*8) {
5133                unsigned char b;
5134                if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1))
5135                        return true;
5136                return 1 & (b >> (msr_index & 7));
5137        } else
5138                return true; /* let L1 handle the wrong parameter */
5139}
5140
5141/*
5142 * Return 1 if we should exit from L2 to L1 to handle a CR access exit,
5143 * rather than handle it ourselves in L0. I.e., check if L1 wanted to
5144 * intercept (via guest_host_mask etc.) the current event.
5145 */
5146static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
5147        struct vmcs12 *vmcs12)
5148{
5149        unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5150        int cr = exit_qualification & 15;
5151        int reg;
5152        unsigned long val;
5153
5154        switch ((exit_qualification >> 4) & 3) {
5155        case 0: /* mov to cr */
5156                reg = (exit_qualification >> 8) & 15;
5157                val = kvm_register_readl(vcpu, reg);
5158                switch (cr) {
5159                case 0:
5160                        if (vmcs12->cr0_guest_host_mask &
5161                            (val ^ vmcs12->cr0_read_shadow))
5162                                return true;
5163                        break;
5164                case 3:
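                            /*
                             * MOV to CR3 does not exit if the new value matches
                             * one of L1's CR3-target values.
                             */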
5165                        if ((vmcs12->cr3_target_count >= 1 &&
5166                                        vmcs12->cr3_target_value0 == val) ||
5167                                (vmcs12->cr3_target_count >= 2 &&
5168                                        vmcs12->cr3_target_value1 == val) ||
5169                                (vmcs12->cr3_target_count >= 3 &&
5170                                        vmcs12->cr3_target_value2 == val) ||
5171                                (vmcs12->cr3_target_count >= 4 &&
5172                                        vmcs12->cr3_target_value3 == val))
5173                                return false;
5174                        if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
5175                                return true;
5176                        break;
5177                case 4:
5178                        if (vmcs12->cr4_guest_host_mask &
5179                            (vmcs12->cr4_read_shadow ^ val))
5180                                return true;
5181                        break;
5182                case 8:
5183                        if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING))
5184                                return true;
5185                        break;
5186                }
5187                break;
5188        case 2: /* clts */
5189                if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) &&
5190                    (vmcs12->cr0_read_shadow & X86_CR0_TS))
5191                        return true;
5192                break;
5193        case 1: /* mov from cr */
5194                switch (cr) {
5195                case 3:
5196                        if (vmcs12->cpu_based_vm_exec_control &
5197                            CPU_BASED_CR3_STORE_EXITING)
5198                                return true;
5199                        break;
5200                case 8:
5201                        if (vmcs12->cpu_based_vm_exec_control &
5202                            CPU_BASED_CR8_STORE_EXITING)
5203                                return true;
5204                        break;
5205                }
5206                break;
5207        case 3: /* lmsw */
5208                /*
5209                 * lmsw can change bits 1..3 of cr0, and only set bit 0 of
5210                 * cr0. Other attempted changes are ignored, with no exit.
5211                 */
5212                val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
5213                if (vmcs12->cr0_guest_host_mask & 0xe &
5214                    (val ^ vmcs12->cr0_read_shadow))
5215                        return true;
5216                if ((vmcs12->cr0_guest_host_mask & 0x1) &&
5217                    !(vmcs12->cr0_read_shadow & 0x1) &&
5218                    (val & 0x1))
5219                        return true;
5220                break;
5221        }
5222        return false;
5223}
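    /*
     * Worked example: suppose L1 sets X86_CR0_TS (bit 3) in
     * cr0_guest_host_mask and leaves TS clear in cr0_read_shadow.  A
     * "mov to cr0" by L2 whose value has TS set differs from the read shadow
     * in a bit L1 claimed for itself, so the access is reflected to L1; if
     * every bit L1 owns matches the shadow, L0 handles it.  Likewise, a mov
     * to CR3 whose value equals one of the (up to four) cr3_target_value
     * fields does not exit even when CR3-load exiting is enabled.  For lmsw
     * only CR0 bits 0..3 (PE, MP, EM, TS) are reachable, hence the 0xe and
     * 0x1 masks: for bits 1..3 the access exits whenever the new value
     * differs from the read shadow in a bit L1 owns, while PE can only be
     * set by lmsw, so it exits only when L1 owns PE, shadows it as clear,
     * and the new value sets it.
     */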
5224
5225static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu,
5226        struct vmcs12 *vmcs12, gpa_t bitmap)
5227{
5228        u32 vmx_instruction_info;
5229        unsigned long field;
5230        u8 b;
5231
5232        if (!nested_cpu_has_shadow_vmcs(vmcs12))
5233                return true;
5234
5235        /* Decode instruction info and find the field to access */
5236        vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5237        field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
5238
5239        /* Out-of-range fields always cause a VM exit from L2 to L1 */
5240        if (field >> 15)
5241                return true;
5242
5243        if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1))
5244                return true;
5245
5246        return 1 & (b >> (field & 7));
5247}
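    /*
     * Example of the decode above: bits 31:28 of VMX_INSTRUCTION_INFO select
     * the register holding the VMCS field encoding.  The VMREAD/VMWRITE
     * bitmaps are one page, i.e. 32768 bits, so only encodings below 0x8000
     * have a corresponding bit and anything larger is reflected to L1
     * unconditionally.  For GUEST_RIP (architectural encoding 0x681e) the
     * byte at offset 0x681e / 8 = 0xd03 of L1's bitmap is read and bit
     * 0x681e & 7 = 6 decides whether L1 intercepts the access.
     */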
5248
5249/*
5250 * Return true if we should exit from L2 to L1 to handle an exit, or false
5251 * if we should handle it ourselves in L0 (and then continue L2). Only call
5252 * this when the vCPU is in guest mode (L2), i.e. is_guest_mode(vcpu).
5253 */
5254bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
5255{
5256        u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
5257        struct vcpu_vmx *vmx = to_vmx(vcpu);
5258        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5259
5260        if (vmx->nested.nested_run_pending)
5261                return false;
5262
5263        if (unlikely(vmx->fail)) {
5264                pr_info_ratelimited("%s failed vm entry %x\n", __func__,
5265                                    vmcs_read32(VM_INSTRUCTION_ERROR));
5266                return true;
5267        }
5268
5269        /*
5270         * The host physical addresses of some pages of guest memory
5271         * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
5272         * Page). The CPU may write to these pages via their host
5273         * physical address while L2 is running, bypassing any
5274         * address-translation-based dirty tracking (e.g. EPT write
5275         * protection).
5276         *
5277         * Mark them dirty on every exit from L2 to prevent them from
5278         * getting out of sync with dirty tracking.
5279         */
5280        nested_mark_vmcs12_pages_dirty(vcpu);
5281
5282        trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason,
5283                                vmcs_readl(EXIT_QUALIFICATION),
5284                                vmx->idt_vectoring_info,
5285                                intr_info,
5286                                vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
5287                                KVM_ISA_VMX);
5288
5289        switch (exit_reason) {
5290        case EXIT_REASON_EXCEPTION_NMI:
5291                if (is_nmi(intr_info))
5292                        return false;
5293                else if (is_page_fault(intr_info))
5294                        return !vmx->vcpu.arch.apf.host_apf_reason && enable_ept;
5295                else if (is_debug(intr_info) &&
5296                         vcpu->guest_debug &
5297                         (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
5298                        return false;
5299                else if (is_breakpoint(intr_info) &&
5300                         vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
5301                        return false;
5302                return vmcs12->exception_bitmap &
5303                                (1u << (intr_info & INTR_INFO_VECTOR_MASK));
5304        case EXIT_REASON_EXTERNAL_INTERRUPT:
5305                return false;
5306        case EXIT_REASON_TRIPLE_FAULT:
5307                return true;
5308        case EXIT_REASON_PENDING_INTERRUPT:
5309                return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_INTR_PENDING);
5310        case EXIT_REASON_NMI_WINDOW:
5311                return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING);
5312        case EXIT_REASON_TASK_SWITCH:
5313                return true;
5314        case EXIT_REASON_CPUID:
5315                return true;
5316        case EXIT_REASON_HLT:
5317                return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
5318        case EXIT_REASON_INVD:
5319                return true;
5320        case EXIT_REASON_INVLPG:
5321                return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
5322        case EXIT_REASON_RDPMC:
5323                return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
5324        case EXIT_REASON_RDRAND:
5325                return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING);
5326        case EXIT_REASON_RDSEED:
5327                return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING);
5328        case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP:
5329                return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
5330        case EXIT_REASON_VMREAD:
5331                return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
5332                        vmcs12->vmread_bitmap);
5333        case EXIT_REASON_VMWRITE:
5334                return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
5335                        vmcs12->vmwrite_bitmap);
5336        case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
5337        case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
5338        case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME:
5339        case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
5340        case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID:
5341                /*
5342                 * VMX instructions trap unconditionally. This allows L1 to
5343                 * emulate them for its L2 guest, i.e., allows 3-level nesting!
5344                 */
5345                return true;
5346        case EXIT_REASON_CR_ACCESS:
5347                return nested_vmx_exit_handled_cr(vcpu, vmcs12);
5348        case EXIT_REASON_DR_ACCESS:
5349                return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
5350        case EXIT_REASON_IO_INSTRUCTION:
5351                return nested_vmx_exit_handled_io(vcpu, vmcs12);
5352        case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR:
5353                return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC);
5354        case EXIT_REASON_MSR_READ:
5355        case EXIT_REASON_MSR_WRITE:
5356                return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
5357        case EXIT_REASON_INVALID_STATE:
5358                return true;
5359        case EXIT_REASON_MWAIT_INSTRUCTION:
5360                return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
5361        case EXIT_REASON_MONITOR_TRAP_FLAG:
5362                return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_TRAP_FLAG);
5363        case EXIT_REASON_MONITOR_INSTRUCTION:
5364                return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING);
5365        case EXIT_REASON_PAUSE_INSTRUCTION:
5366                return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) ||
5367                        nested_cpu_has2(vmcs12,
5368                                SECONDARY_EXEC_PAUSE_LOOP_EXITING);
5369        case EXIT_REASON_MCE_DURING_VMENTRY:
5370                return false;
5371        case EXIT_REASON_TPR_BELOW_THRESHOLD:
5372                return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW);
5373        case EXIT_REASON_APIC_ACCESS:
5374        case EXIT_REASON_APIC_WRITE:
5375        case EXIT_REASON_EOI_INDUCED:
5376                /*
5377                 * The controls for "virtualize APIC accesses," "APIC-
5378                 * register virtualization," and "virtual-interrupt
5379                 * delivery" only come from vmcs12.
5380                 */
5381                return true;
5382        case EXIT_REASON_EPT_VIOLATION:
5383                /*
5384                 * L0 always deals with the EPT violation. If nested EPT is
5385                 * used, and the nested mmu code discovers that the address is
5386                 * missing in the guest EPT table (EPT12), the EPT violation
5387                 * will be injected with nested_ept_inject_page_fault()
5388                 */
5389                return false;
5390        case EXIT_REASON_EPT_MISCONFIG:
5391                /*
5392                 * L2 never uses L1's EPT directly, but rather L0's own EPT
5393                 * table (shadow on EPT) or a merged EPT table that L0 built
5394                 * (EPT on EPT). So any problem with the structure of the
5395                 * table is L0's fault.
5396                 */
5397                return false;
5398        case EXIT_REASON_INVPCID:
5399                return
5400                        nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) &&
5401                        nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
5402        case EXIT_REASON_WBINVD:
5403                return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
5404        case EXIT_REASON_XSETBV:
5405                return true;
5406        case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS:
5407                /*
5408                 * This should never happen, since it is not possible to
5409                 * set XSS to a non-zero value in either L1 or L2.
5410                 * If it were, XSS would have to be checked against
5411                 * the XSS exit bitmap in vmcs12.
5412                 */
5413                return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
5414        case EXIT_REASON_PREEMPTION_TIMER:
5415                return false;
5416        case EXIT_REASON_PML_FULL:
5417                /* We emulate PML support for L1. */
5418                return false;
5419        case EXIT_REASON_VMFUNC:
5420                /* VM functions are emulated through L2->L0 vmexits. */
5421                return false;
5422        case EXIT_REASON_ENCLS:
5423                /* SGX is never exposed to L1 */
5424                return false;
5425        default:
5426                return true;
5427        }
5428}
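    /*
     * Note the catch-all above: an exit reason this function does not
     * explicitly recognize is reflected to L1.  Exits that L0 always handles
     * itself (EPT violations and misconfigurations, the emulated preemption
     * timer, external interrupts, PML-full) return false unconditionally,
     * while most other reasons consult the relevant vmcs12 controls to see
     * whether L1 asked to intercept them.
     */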
5429
5430
5431static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
5432                                struct kvm_nested_state __user *user_kvm_nested_state,
5433                                u32 user_data_size)
5434{
5435        struct vcpu_vmx *vmx;
5436        struct vmcs12 *vmcs12;
5437        struct kvm_nested_state kvm_state = {
5438                .flags = 0,
5439                .format = KVM_STATE_NESTED_FORMAT_VMX,
5440                .size = sizeof(kvm_state),
5441                .hdr.vmx.vmxon_pa = -1ull,
5442                .hdr.vmx.vmcs12_pa = -1ull,
5443        };
5444        struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
5445                &user_kvm_nested_state->data.vmx[0];
5446
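            /*
             * A NULL vcpu is a request for the worst-case buffer size:
             * report the fixed header plus the full VMX data area so the
             * caller can size its allocation.
             */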
5447        if (!vcpu)
5448                return kvm_state.size + sizeof(*user_vmx_nested_state);
5449
5450        vmx = to_vmx(vcpu);
5451        vmcs12 = get_vmcs12(vcpu);
5452
5453        if (nested_vmx_allowed(vcpu) &&
5454            (vmx->nested.vmxon || vmx->nested.smm.vmxon)) {
5455                kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr;
5456                kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr;
5457
5458                if (vmx_has_valid_vmcs12(vcpu)) {
5459                        kvm_state.size += sizeof(user_vmx_nested_state->vmcs12);
5460
5461                        if (vmx->nested.hv_evmcs)
5462                                kvm_state.flags |= KVM_STATE_NESTED_EVMCS;
5463
5464                        if (is_guest_mode(vcpu) &&
5465                            nested_cpu_has_shadow_vmcs(vmcs12) &&
5466                            vmcs12->vmcs_link_pointer != -1ull)
5467                                kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12);
5468                }
5469
5470                if (vmx->nested.smm.vmxon)
5471                        kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON;
5472
5473                if (vmx->nested.smm.guest_mode)
5474                        kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE;
5475
5476                if (is_guest_mode(vcpu)) {
5477                        kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;
5478
5479                        if (vmx->nested.nested_run_pending)
5480                                kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;
5481                }
5482        }
5483
5484        if (user_data_size < kvm_state.size)
5485                goto out;
5486
5487        if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state)))
5488                return -EFAULT;
5489
5490        if (!vmx_has_valid_vmcs12(vcpu))
5491                goto out;
5492
5493        /*
5494         * When running L2, the authoritative vmcs12 state is in the
5495         * vmcs02. When running L1, the authoritative vmcs12 state is
5496         * in the shadow or enlightened vmcs linked to vmcs01, unless
5497         * need_vmcs12_to_shadow_sync is set, in which case, the authoritative
5498         * vmcs12 state is in the vmcs12 already.
5499         */
5500        if (is_guest_mode(vcpu)) {
5501                sync_vmcs02_to_vmcs12(vcpu, vmcs12);
5502                sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
5503        } else if (!vmx->nested.need_vmcs12_to_shadow_sync) {
5504                if (vmx->nested.hv_evmcs)
5505                        copy_enlightened_to_vmcs12(vmx);
5506                else if (enable_shadow_vmcs)
5507                        copy_shadow_to_vmcs12(vmx);
5508        }
5509
5510        BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE);
5511        BUILD_BUG_ON(sizeof(user_vmx_nested_state->shadow_vmcs12) < VMCS12_SIZE);
5512
5513        /*
5514         * Copy over the full allocated size of vmcs12 rather than just the size
5515         * of the struct.
5516         */
5517        if (copy_to_user(user_vmx_nested_state->vmcs12, vmcs12, VMCS12_SIZE))
5518                return -EFAULT;
5519
5520        if (nested_cpu_has_shadow_vmcs(vmcs12) &&
5521            vmcs12->vmcs_link_pointer != -1ull) {
5522                if (copy_to_user(user_vmx_nested_state->shadow_vmcs12,
5523                                 get_shadow_vmcs12(vcpu), VMCS12_SIZE))
5524                        return -EFAULT;
5525        }
5526
5527out:
5528        return kvm_state.size;
5529}
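    /*
     * Resulting layout as seen by userspace: the fixed struct
     * kvm_nested_state header comes first, followed by data.vmx[0].vmcs12,
     * into which VMCS12_SIZE bytes are copied, and, when the vmcs12 has an
     * active shadow VMCS link, a second VMCS12_SIZE copy for the shadow
     * vmcs12.  kvm_state.size tells userspace how much of that is valid.
     */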
5530
5531/*
5532 * Forcibly leave nested mode in order to be able to reset the VCPU later on.
5533 */
5534void vmx_leave_nested(struct kvm_vcpu *vcpu)
5535{
5536        if (is_guest_mode(vcpu)) {
5537                to_vmx(vcpu)->nested.nested_run_pending = 0;
5538                nested_vmx_vmexit(vcpu, -1, 0, 0);
5539        }
5540        free_nested(vcpu);
5541}
5542
5543static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
5544                                struct kvm_nested_state __user *user_kvm_nested_state,
5545                                struct kvm_nested_state *kvm_state)
5546{
5547        struct vcpu_vmx *vmx = to_vmx(vcpu);
5548        struct vmcs12 *vmcs12;
5549        u32 exit_qual;
5550        struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
5551                &user_kvm_nested_state->data.vmx[0];
5552        int ret;
5553
5554        if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX)
5555                return -EINVAL;
5556
5557        if (kvm_state->hdr.vmx.vmxon_pa == -1ull) {
5558                if (kvm_state->hdr.vmx.smm.flags)
5559                        return -EINVAL;
5560
5561                if (kvm_state->hdr.vmx.vmcs12_pa != -1ull)
5562                        return -EINVAL;
5563
5564                /*
5565                 * KVM_STATE_NESTED_EVMCS used to signal that KVM should
5566                 * enable eVMCS capability on vCPU. However, since then
5567                 * code was changed such that flag signals vmcs12 should
5568                 * be copied into eVMCS in guest memory.
5569                 *
5570                 * To preserve backwards compatibility, allow userspace
5571                 * to set this flag even when there is no VMXON region.
5572                 */
5573                if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS)
5574                        return -EINVAL;
5575        } else {
5576                if (!nested_vmx_allowed(vcpu))
5577                        return -EINVAL;
5578
5579                if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa))
5580                        return -EINVAL;
5581        }
5582
5583        if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
5584            (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
5585                return -EINVAL;
5586
5587        if (kvm_state->hdr.vmx.smm.flags &
5588            ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON))
5589                return -EINVAL;
5590
5591        /*
5592         * SMM temporarily disables VMX, so we cannot be in guest mode,
5593         * nor can VMLAUNCH/VMRESUME be pending.  Outside SMM, SMM flags
5594         * must be zero.
5595         */
5596        if (is_smm(vcpu) ?
5597                (kvm_state->flags &
5598                 (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING))
5599                : kvm_state->hdr.vmx.smm.flags)
5600                return -EINVAL;
5601
5602        if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
5603            !(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON))
5604                return -EINVAL;
5605
5606        if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) &&
5607                (!nested_vmx_allowed(vcpu) || !vmx->nested.enlightened_vmcs_enabled))
5608                        return -EINVAL;
5609
5610        vmx_leave_nested(vcpu);
5611
5612        if (kvm_state->hdr.vmx.vmxon_pa == -1ull)
5613                return 0;
5614
5615        vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa;
5616        ret = enter_vmx_operation(vcpu);
5617        if (ret)
5618                return ret;
5619
5620        /* Empty 'VMXON' state is permitted */
5621        if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12))
5622                return 0;
5623
5624        if (kvm_state->hdr.vmx.vmcs12_pa != -1ull) {
5625                if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa ||
5626                    !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa))
5627                        return -EINVAL;
5628
5629                set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa);
5630        } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) {
5631                /*
5632                 * Sync eVMCS upon entry as we may not have
5633                 * HV_X64_MSR_VP_ASSIST_PAGE set up yet.
5634                 */
5635                vmx->nested.need_vmcs12_to_shadow_sync = true;
5636        } else {
5637                return -EINVAL;
5638        }
5639
5640        if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) {
5641                vmx->nested.smm.vmxon = true;
5642                vmx->nested.vmxon = false;
5643
5644                if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE)
5645                        vmx->nested.smm.guest_mode = true;
5646        }
5647
5648        vmcs12 = get_vmcs12(vcpu);
5649        if (copy_from_user(vmcs12, user_vmx_nested_state->vmcs12, sizeof(*vmcs12)))
5650                return -EFAULT;
5651
5652        if (vmcs12->hdr.revision_id != VMCS12_REVISION)
5653                return -EINVAL;
5654
5655        if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
5656                return 0;
5657
5658        vmx->nested.nested_run_pending =
5659                !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
5660
5661        ret = -EINVAL;
5662        if (nested_cpu_has_shadow_vmcs(vmcs12) &&
5663            vmcs12->vmcs_link_pointer != -1ull) {
5664                struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu);
5665
5666                if (kvm_state->size <
5667                    sizeof(*kvm_state) +
5668                    sizeof(user_vmx_nested_state->vmcs12) + sizeof(*shadow_vmcs12))
5669                        goto error_guest_mode;
5670
5671                if (copy_from_user(shadow_vmcs12,
5672                                   user_vmx_nested_state->shadow_vmcs12,
5673                                   sizeof(*shadow_vmcs12))) {
5674                        ret = -EFAULT;
5675                        goto error_guest_mode;
5676                }
5677
5678                if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION ||
5679                    !shadow_vmcs12->hdr.shadow_vmcs)
5680                        goto error_guest_mode;
5681        }
5682
5683        if (nested_vmx_check_controls(vcpu, vmcs12) ||
5684            nested_vmx_check_host_state(vcpu, vmcs12) ||
5685            nested_vmx_check_guest_state(vcpu, vmcs12, &exit_qual))
5686                goto error_guest_mode;
5687
5688        vmx->nested.dirty_vmcs12 = true;
5689        ret = nested_vmx_enter_non_root_mode(vcpu, false);
5690        if (ret)
5691                goto error_guest_mode;
5692
5693        return 0;
5694
5695error_guest_mode:
5696        vmx->nested.nested_run_pending = 0;
5697        return ret;
5698}
5699
5700void nested_vmx_vcpu_setup(void)
5701{
5702        if (enable_shadow_vmcs) {
5703                vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
5704                vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
5705        }
5706}
5707
5708/*
5709 * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
5710 * returned for the various VMX controls MSRs when nested VMX is enabled.
5711 * The same values should also be used to verify that vmcs12 control fields are
5712 * valid during nested entry from L1 to L2.
5713 * Each of these control msrs has a low and high 32-bit half: A low bit is on
5714 * if the corresponding bit in the (32-bit) control field *must* be on, and a
5715 * bit in the high half is on if the corresponding bit in the control field
5716 * may be on. See also vmx_control_verify().
5717 */
5718void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps,
5719                                bool apicv)
5720{
5721        /*
5722         * Note that as a general rule, the high half of the MSRs (bits in
5723         * the control fields which may be 1) should be initialized by the
5724         * intersection of the underlying hardware's MSR (i.e., features which
5725         * can be supported) and the list of features we want to expose -
5726         * because they are known to be properly supported in our code.
5727         * Also, usually, the low half of the MSRs (bits which must be 1) can
5728         * be set to 0, meaning that L1 may turn off any of these bits. The
5729         * reason is that if one of these bits is necessary, it will appear
5730         * in vmcs01, and prepare_vmcs02, when it bitwise-or's the control
5731         * fields of vmcs01 and vmcs12, will turn these bits on - and
5732         * nested_vmx_exit_reflected() will not pass related exits to L1.
5733         * These rules have exceptions below.
5734         */
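            /*
             * Concretely: a bit set in the low half means the corresponding
             * vmcs12 control bit must be 1 for a nested VM-entry to succeed,
             * and a bit clear in the high half means it must be 0, so a valid
             * vmcs12 value ctl satisfies ((ctl & high) | low) == ctl.  For
             * example, CPU_BASED_USE_MSR_BITMAPS is advertised only in the
             * high half below, so L1 is free to run L2 with or without an
             * MSR bitmap.
             */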
5735
5736        /* pin-based controls */
5737        rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
5738                msrs->pinbased_ctls_low,
5739                msrs->pinbased_ctls_high);
5740        msrs->pinbased_ctls_low |=
5741                PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
5742        msrs->pinbased_ctls_high &=
5743                PIN_BASED_EXT_INTR_MASK |
5744                PIN_BASED_NMI_EXITING |
5745                PIN_BASED_VIRTUAL_NMIS |
5746                (apicv ? PIN_BASED_POSTED_INTR : 0);
5747        msrs->pinbased_ctls_high |=
5748                PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
5749                PIN_BASED_VMX_PREEMPTION_TIMER;
5750
5751        /* exit controls */
5752        rdmsr(MSR_IA32_VMX_EXIT_CTLS,
5753                msrs->exit_ctls_low,
5754                msrs->exit_ctls_high);
5755        msrs->exit_ctls_low =
5756                VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
5757
5758        msrs->exit_ctls_high &=
5759#ifdef CONFIG_X86_64
5760                VM_EXIT_HOST_ADDR_SPACE_SIZE |
5761#endif
5762                VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
5763        msrs->exit_ctls_high |=
5764                VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
5765                VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
5766                VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
5767
5768        /* We support free control of debug control saving. */
5769        msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;
5770
5771        /* entry controls */
5772        rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
5773                msrs->entry_ctls_low,
5774                msrs->entry_ctls_high);
5775        msrs->entry_ctls_low =
5776                VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
5777        msrs->entry_ctls_high &=
5778#ifdef CONFIG_X86_64
5779                VM_ENTRY_IA32E_MODE |
5780#endif
5781                VM_ENTRY_LOAD_IA32_PAT;
5782        msrs->entry_ctls_high |=
5783                (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
5784
5785        /* We support free control of debug control loading. */
5786        msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
5787
5788        /* cpu-based controls */
5789        rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
5790                msrs->procbased_ctls_low,
5791                msrs->procbased_ctls_high);
5792        msrs->procbased_ctls_low =
5793                CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
5794        msrs->procbased_ctls_high &=
5795                CPU_BASED_VIRTUAL_INTR_PENDING |
5796                CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING |
5797                CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
5798                CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
5799                CPU_BASED_CR3_STORE_EXITING |
5800#ifdef CONFIG_X86_64
5801                CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
5802#endif
5803                CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
5804                CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG |
5805                CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING |
5806                CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING |
5807                CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
5808        /*
5809         * We can allow some features even when not supported by the
5810         * hardware. For example, L1 can specify an MSR bitmap - and we
5811         * can use it to avoid exits to L1 - even when L0 runs L2
5812         * without MSR bitmaps.
5813         */
5814        msrs->procbased_ctls_high |=
5815                CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
5816                CPU_BASED_USE_MSR_BITMAPS;
5817
5818        /* We support free control of CR3 access interception. */
5819        msrs->procbased_ctls_low &=
5820                ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
5821
5822        /*
5823         * secondary cpu-based controls.  Do not include those that
5824         * depend on CPUID bits; they are added later by vmx_cpuid_update.
5825         */
5826        if (msrs->procbased_ctls_high & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)
5827                rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
5828                      msrs->secondary_ctls_low,
5829                      msrs->secondary_ctls_high);
5830
5831        msrs->secondary_ctls_low = 0;
5832        msrs->secondary_ctls_high &=
5833                SECONDARY_EXEC_DESC |
5834                SECONDARY_EXEC_RDTSCP |
5835                SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
5836                SECONDARY_EXEC_WBINVD_EXITING |
5837                SECONDARY_EXEC_APIC_REGISTER_VIRT |
5838                SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
5839                SECONDARY_EXEC_RDRAND_EXITING |
5840                SECONDARY_EXEC_ENABLE_INVPCID |
5841                SECONDARY_EXEC_RDSEED_EXITING |
5842                SECONDARY_EXEC_XSAVES;
5843
5844        /*
5845         * We can emulate "VMCS shadowing," even if the hardware
5846         * doesn't support it.
5847         */
5848        msrs->secondary_ctls_high |=
5849                SECONDARY_EXEC_SHADOW_VMCS;
5850
5851        if (enable_ept) {
5852                /* nested EPT: emulate EPT for L1 as well */
5853                msrs->secondary_ctls_high |=
5854                        SECONDARY_EXEC_ENABLE_EPT;
5855                msrs->ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
5856                         VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT;
5857                if (cpu_has_vmx_ept_execute_only())
5858                        msrs->ept_caps |=
5859                                VMX_EPT_EXECUTE_ONLY_BIT;
5860                msrs->ept_caps &= ept_caps;
5861                msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
5862                        VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
5863                        VMX_EPT_1GB_PAGE_BIT;
5864                if (enable_ept_ad_bits) {
5865                        msrs->secondary_ctls_high |=
5866                                SECONDARY_EXEC_ENABLE_PML;
5867                        msrs->ept_caps |= VMX_EPT_AD_BIT;
5868                }
5869        }
5870
5871        if (cpu_has_vmx_vmfunc()) {
5872                msrs->secondary_ctls_high |=
5873                        SECONDARY_EXEC_ENABLE_VMFUNC;
5874                /*
5875                 * Advertise EPTP switching unconditionally
5876                 * since we emulate it
5877                 */
5878                if (enable_ept)
5879                        msrs->vmfunc_controls =
5880                                VMX_VMFUNC_EPTP_SWITCHING;
5881        }
5882
5883        /*
5884         * Old versions of KVM use the single-context version without
5885         * checking for support, so declare that it is supported even
5886         * though it is treated as global context.  The alternative,
5887         * accepting single-context invvpid without declaring it, is worse.
5888         */
5889        if (enable_vpid) {
5890                msrs->secondary_ctls_high |=
5891                        SECONDARY_EXEC_ENABLE_VPID;
5892                msrs->vpid_caps = VMX_VPID_INVVPID_BIT |
5893                        VMX_VPID_EXTENT_SUPPORTED_MASK;
5894        }
5895
5896        if (enable_unrestricted_guest)
5897                msrs->secondary_ctls_high |=
5898                        SECONDARY_EXEC_UNRESTRICTED_GUEST;
5899
5900        if (flexpriority_enabled)
5901                msrs->secondary_ctls_high |=
5902                        SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
5903
5904        /* miscellaneous data */
5905        rdmsr(MSR_IA32_VMX_MISC,
5906                msrs->misc_low,
5907                msrs->misc_high);
5908        msrs->misc_low &= VMX_MISC_SAVE_EFER_LMA;
5909        msrs->misc_low |=
5910                MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
5911                VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
5912                VMX_MISC_ACTIVITY_HLT;
5913        msrs->misc_high = 0;
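            /*
             * The low five bits of IA32_VMX_MISC advertise the
             * preemption-timer rate: a value of X means the timer counts down
             * each time bit X of the TSC changes, i.e. once every 2^X TSC
             * cycles, so the emulated rate of 5 above corresponds to one tick
             * per 32 TSC cycles.
             */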
5914
5915        /*
5916         * This MSR reports some information about VMX support. We
5917         * should return information about the VMX we emulate for the
5918         * guest, and the VMCS structure we give it - not about the
5919         * VMX support of the underlying hardware.
5920         */
5921        msrs->basic =
5922                VMCS12_REVISION |
5923                VMX_BASIC_TRUE_CTLS |
5924                ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
5925                (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
5926
5927        if (cpu_has_vmx_basic_inout())
5928                msrs->basic |= VMX_BASIC_INOUT;
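            /*
             * Layout of the value built above, per the architectural
             * IA32_VMX_BASIC definition: bits 30:0 hold the VMCS revision
             * identifier, bits 44:32 the VMCS region size, bits 53:50 the
             * memory type (write-back here), bit 54 the INS/OUTS exit
             * information bit, and bit 55 the "true" controls bit.
             */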
5929
5930        /*
5931         * These MSRs specify bits which the guest must keep fixed on
5932         * while L1 is in VMXON mode (in L1's root mode, or running an L2).
5933         * We picked the standard core2 setting.
5934         */
5935#define VMXON_CR0_ALWAYSON     (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
5936#define VMXON_CR4_ALWAYSON     X86_CR4_VMXE
5937        msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON;
5938        msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON;
5939
5940        /* These MSRs specify bits which the guest must keep fixed off. */
5941        rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1);
5942        rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1);
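            /*
             * How these four values are consumed: a CR0/CR4 value is legal
             * under VMX operation only if every bit set in the fixed0 MSR is
             * also set in it and every bit clear in the fixed1 MSR is clear
             * in it, i.e. ((val & fixed1) | fixed0) == val.  With the
             * always-on masks above, L1 must keep CR0.PE, CR0.PG, CR0.NE and
             * CR4.VMXE set for as long as it remains in VMX operation.
             */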
5943
5944        /* highest index: VMX_PREEMPTION_TIMER_VALUE */
5945        msrs->vmcs_enum = VMCS12_MAX_FIELD_INDEX << 1;
5946}
5947
5948void nested_vmx_hardware_unsetup(void)
5949{
5950        int i;
5951
5952        if (enable_shadow_vmcs) {
5953                for (i = 0; i < VMX_BITMAP_NR; i++)
5954                        free_page((unsigned long)vmx_bitmap[i]);
5955        }
5956}
5957
5958__init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
5959{
5960        int i;
5961
5962        if (!cpu_has_vmx_shadow_vmcs())
5963                enable_shadow_vmcs = 0;
5964        if (enable_shadow_vmcs) {
5965                for (i = 0; i < VMX_BITMAP_NR; i++) {
5966                        /*
5967                         * The vmx_bitmap is not tied to a VM and so should
5968                         * not be charged to a memcg.
5969                         */
5970                        vmx_bitmap[i] = (unsigned long *)
5971                                __get_free_page(GFP_KERNEL);
5972                        if (!vmx_bitmap[i]) {
5973                                nested_vmx_hardware_unsetup();
5974                                return -ENOMEM;
5975                        }
5976                }
5977
5978                init_vmcs_shadow_fields();
5979        }
5980
5981        exit_handlers[EXIT_REASON_VMCLEAR]      = handle_vmclear;
5982        exit_handlers[EXIT_REASON_VMLAUNCH]     = handle_vmlaunch;
5983        exit_handlers[EXIT_REASON_VMPTRLD]      = handle_vmptrld;
5984        exit_handlers[EXIT_REASON_VMPTRST]      = handle_vmptrst;
5985        exit_handlers[EXIT_REASON_VMREAD]       = handle_vmread;
5986        exit_handlers[EXIT_REASON_VMRESUME]     = handle_vmresume;
5987        exit_handlers[EXIT_REASON_VMWRITE]      = handle_vmwrite;
5988        exit_handlers[EXIT_REASON_VMOFF]        = handle_vmoff;
5989        exit_handlers[EXIT_REASON_VMON]         = handle_vmon;
5990        exit_handlers[EXIT_REASON_INVEPT]       = handle_invept;
5991        exit_handlers[EXIT_REASON_INVVPID]      = handle_invvpid;
5992        exit_handlers[EXIT_REASON_VMFUNC]       = handle_vmfunc;
5993
5994        kvm_x86_ops->check_nested_events = vmx_check_nested_events;
5995        kvm_x86_ops->get_nested_state = vmx_get_nested_state;
5996        kvm_x86_ops->set_nested_state = vmx_set_nested_state;
5997        kvm_x86_ops->get_vmcs12_pages = nested_get_vmcs12_pages;
5998        kvm_x86_ops->nested_enable_evmcs = nested_enable_evmcs;
5999        kvm_x86_ops->nested_get_evmcs_version = nested_get_evmcs_version;
6000
6001        return 0;
6002}
6003