linux/arch/x86/kvm/vmx/nested.c
   1// SPDX-License-Identifier: GPL-2.0
   2
   3#include <linux/objtool.h>
   4#include <linux/percpu.h>
   5
   6#include <asm/debugreg.h>
   7#include <asm/mmu_context.h>
   8
   9#include "cpuid.h"
  10#include "hyperv.h"
  11#include "mmu.h"
  12#include "nested.h"
  13#include "pmu.h"
  14#include "sgx.h"
  15#include "trace.h"
  16#include "vmx.h"
  17#include "x86.h"
  18
  19static bool __read_mostly enable_shadow_vmcs = 1;
  20module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
  21
  22static bool __read_mostly nested_early_check = 0;
  23module_param(nested_early_check, bool, S_IRUGO);
  24
  25#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK
  26
  27/*
  28 * Hyper-V requires all of these, so mark them as supported even though
  29 * they are just treated the same as all-context.
  30 */
  31#define VMX_VPID_EXTENT_SUPPORTED_MASK          \
  32        (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT |  \
  33        VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT |    \
  34        VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT |    \
  35        VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT)
  36
  37#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5
  38
  39enum {
  40        VMX_VMREAD_BITMAP,
  41        VMX_VMWRITE_BITMAP,
  42        VMX_BITMAP_NR
  43};
  44static unsigned long *vmx_bitmap[VMX_BITMAP_NR];
  45
  46#define vmx_vmread_bitmap                    (vmx_bitmap[VMX_VMREAD_BITMAP])
  47#define vmx_vmwrite_bitmap                   (vmx_bitmap[VMX_VMWRITE_BITMAP])
  48
  49struct shadow_vmcs_field {
  50        u16     encoding;
  51        u16     offset;
  52};
  53static struct shadow_vmcs_field shadow_read_only_fields[] = {
  54#define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) },
  55#include "vmcs_shadow_fields.h"
  56};
  57static int max_shadow_read_only_fields =
  58        ARRAY_SIZE(shadow_read_only_fields);
  59
  60static struct shadow_vmcs_field shadow_read_write_fields[] = {
  61#define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) },
  62#include "vmcs_shadow_fields.h"
  63};
  64static int max_shadow_read_write_fields =
  65        ARRAY_SIZE(shadow_read_write_fields);
  66
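     /*
      * Initialize the VMREAD/VMWRITE bitmaps and compact the shadow field
      * tables.  Fields the CPU cannot shadow (e.g. GUEST_PML_INDEX without
      * PML support) are dropped from the read/write table, and on 64-bit
      * hosts the high-half encodings of 64-bit fields are skipped since the
      * full field is accessed via the low-half encoding.
      */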
  67static void init_vmcs_shadow_fields(void)
  68{
  69        int i, j;
  70
  71        memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
  72        memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
  73
  74        for (i = j = 0; i < max_shadow_read_only_fields; i++) {
  75                struct shadow_vmcs_field entry = shadow_read_only_fields[i];
  76                u16 field = entry.encoding;
  77
  78                if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
  79                    (i + 1 == max_shadow_read_only_fields ||
  80                     shadow_read_only_fields[i + 1].encoding != field + 1))
  81                        pr_err("Missing field from shadow_read_only_field %x\n",
  82                               field + 1);
  83
  84                clear_bit(field, vmx_vmread_bitmap);
  85                if (field & 1)
  86#ifdef CONFIG_X86_64
  87                        continue;
  88#else
  89                        entry.offset += sizeof(u32);
  90#endif
  91                shadow_read_only_fields[j++] = entry;
  92        }
  93        max_shadow_read_only_fields = j;
  94
  95        for (i = j = 0; i < max_shadow_read_write_fields; i++) {
  96                struct shadow_vmcs_field entry = shadow_read_write_fields[i];
  97                u16 field = entry.encoding;
  98
  99                if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
 100                    (i + 1 == max_shadow_read_write_fields ||
 101                     shadow_read_write_fields[i + 1].encoding != field + 1))
 102                        pr_err("Missing field from shadow_read_write_field %x\n",
 103                               field + 1);
 104
 105                WARN_ONCE(field >= GUEST_ES_AR_BYTES &&
 106                          field <= GUEST_TR_AR_BYTES,
 107                          "Update vmcs12_write_any() to drop reserved bits from AR_BYTES");
 108
 109                /*
 110                 * PML and the preemption timer can be emulated, but the
 111                 * processor cannot vmwrite to fields that don't exist
 112                 * on bare metal.
 113                 */
 114                switch (field) {
 115                case GUEST_PML_INDEX:
 116                        if (!cpu_has_vmx_pml())
 117                                continue;
 118                        break;
 119                case VMX_PREEMPTION_TIMER_VALUE:
 120                        if (!cpu_has_vmx_preemption_timer())
 121                                continue;
 122                        break;
 123                case GUEST_INTR_STATUS:
 124                        if (!cpu_has_vmx_apicv())
 125                                continue;
 126                        break;
 127                default:
 128                        break;
 129                }
 130
 131                clear_bit(field, vmx_vmwrite_bitmap);
 132                clear_bit(field, vmx_vmread_bitmap);
 133                if (field & 1)
 134#ifdef CONFIG_X86_64
 135                        continue;
 136#else
 137                        entry.offset += sizeof(u32);
 138#endif
 139                shadow_read_write_fields[j++] = entry;
 140        }
 141        max_shadow_read_write_fields = j;
 142}
 143
 144/*
 145 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
 146 * set the success or error code of an emulated VMX instruction (as specified
 147 * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated
 148 * instruction.
 149 */
 150static int nested_vmx_succeed(struct kvm_vcpu *vcpu)
 151{
 152        vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
 153                        & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
 154                            X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
 155        return kvm_skip_emulated_instruction(vcpu);
 156}
 157
 158static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
 159{
 160        vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
 161                        & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
 162                            X86_EFLAGS_SF | X86_EFLAGS_OF))
 163                        | X86_EFLAGS_CF);
 164        return kvm_skip_emulated_instruction(vcpu);
 165}
 166
 167static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
 168                                u32 vm_instruction_error)
 169{
 170        vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
 171                        & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
 172                            X86_EFLAGS_SF | X86_EFLAGS_OF))
 173                        | X86_EFLAGS_ZF);
 174        get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
 175        /*
  176         * There is no need to force a sync to the shadow VMCS because
  177         * VM_INSTRUCTION_ERROR is not shadowed.  The enlightened VMCS,
  178         * however, 'shadows' all fields and thus must be synced.
 179         */
 180        if (to_vmx(vcpu)->nested.hv_evmcs_vmptr != EVMPTR_INVALID)
 181                to_vmx(vcpu)->nested.need_vmcs12_to_shadow_sync = true;
 182
 183        return kvm_skip_emulated_instruction(vcpu);
 184}
 185
 186static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error)
 187{
 188        struct vcpu_vmx *vmx = to_vmx(vcpu);
 189
 190        /*
 191         * failValid writes the error number to the current VMCS, which
 192         * can't be done if there isn't a current VMCS.
 193         */
 194        if (vmx->nested.current_vmptr == -1ull &&
 195            !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
 196                return nested_vmx_failInvalid(vcpu);
 197
 198        return nested_vmx_failValid(vcpu, vm_instruction_error);
 199}
 200
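     /*
      * Emulate a VMX abort.  Rather than shutting down the logical processor
      * as real hardware would, KVM currently injects a triple fault into the
      * guest; @indicator identifies the abort reason.
      */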
 201static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
 202{
  203        /* TODO: don't simply reset the guest here. */
 204        kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
 205        pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator);
 206}
 207
 208static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
 209{
 210        return fixed_bits_valid(control, low, high);
 211}
 212
 213static inline u64 vmx_control_msr(u32 low, u32 high)
 214{
 215        return low | ((u64)high << 32);
 216}
 217
 218static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
 219{
 220        secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
 221        vmcs_write64(VMCS_LINK_POINTER, -1ull);
 222        vmx->nested.need_vmcs12_to_shadow_sync = false;
 223}
 224
 225static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
 226{
 227        struct vcpu_vmx *vmx = to_vmx(vcpu);
 228
 229        if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
 230                kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true);
 231                vmx->nested.hv_evmcs = NULL;
 232        }
 233
 234        vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;
 235}
 236
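     /*
      * Propagate the cached host segment state from the outgoing loaded_vmcs
      * to the newly loaded one, so the new VMCS's host-state cache matches
      * what is currently loaded on the CPU.
      */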
 237static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
 238                                     struct loaded_vmcs *prev)
 239{
 240        struct vmcs_host_state *dest, *src;
 241
 242        if (unlikely(!vmx->guest_state_loaded))
 243                return;
 244
 245        src = &prev->host_state;
 246        dest = &vmx->loaded_vmcs->host_state;
 247
 248        vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base);
 249        dest->ldt_sel = src->ldt_sel;
 250#ifdef CONFIG_X86_64
 251        dest->ds_sel = src->ds_sel;
 252        dest->es_sel = src->es_sel;
 253#endif
 254}
 255
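     /*
      * Switch the vCPU between loaded VMCSs (vmcs01 <-> vmcs02), loading the
      * new VMCS on the current CPU and syncing the cached host state.
      */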
 256static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
 257{
 258        struct vcpu_vmx *vmx = to_vmx(vcpu);
 259        struct loaded_vmcs *prev;
 260        int cpu;
 261
 262        if (WARN_ON_ONCE(vmx->loaded_vmcs == vmcs))
 263                return;
 264
 265        cpu = get_cpu();
 266        prev = vmx->loaded_vmcs;
 267        vmx->loaded_vmcs = vmcs;
 268        vmx_vcpu_load_vmcs(vcpu, cpu, prev);
 269        vmx_sync_vmcs_host_state(vmx, prev);
 270        put_cpu();
 271
 272        vmx_register_cache_reset(vcpu);
 273}
 274
 275/*
 276 * Free whatever needs to be freed from vmx->nested when L1 goes down, or
 277 * just stops using VMX.
 278 */
 279static void free_nested(struct kvm_vcpu *vcpu)
 280{
 281        struct vcpu_vmx *vmx = to_vmx(vcpu);
 282
 283        if (WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01))
 284                vmx_switch_vmcs(vcpu, &vmx->vmcs01);
 285
 286        if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
 287                return;
 288
 289        kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
 290
 291        vmx->nested.vmxon = false;
 292        vmx->nested.smm.vmxon = false;
 293        free_vpid(vmx->nested.vpid02);
 294        vmx->nested.posted_intr_nv = -1;
 295        vmx->nested.current_vmptr = -1ull;
 296        if (enable_shadow_vmcs) {
 297                vmx_disable_shadow_vmcs(vmx);
 298                vmcs_clear(vmx->vmcs01.shadow_vmcs);
 299                free_vmcs(vmx->vmcs01.shadow_vmcs);
 300                vmx->vmcs01.shadow_vmcs = NULL;
 301        }
 302        kfree(vmx->nested.cached_vmcs12);
 303        vmx->nested.cached_vmcs12 = NULL;
 304        kfree(vmx->nested.cached_shadow_vmcs12);
 305        vmx->nested.cached_shadow_vmcs12 = NULL;
 306        /* Unpin physical memory we referred to in the vmcs02 */
 307        if (vmx->nested.apic_access_page) {
 308                kvm_release_page_clean(vmx->nested.apic_access_page);
 309                vmx->nested.apic_access_page = NULL;
 310        }
 311        kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
 312        kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
 313        vmx->nested.pi_desc = NULL;
 314
 315        kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
 316
 317        nested_release_evmcs(vcpu);
 318
 319        free_loaded_vmcs(&vmx->nested.vmcs02);
 320}
 321
 322/*
 323 * Ensure that the current vmcs of the logical processor is the
 324 * vmcs01 of the vcpu before calling free_nested().
 325 */
 326void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu)
 327{
 328        vcpu_load(vcpu);
 329        vmx_leave_nested(vcpu);
 330        vcpu_put(vcpu);
 331}
 332
 333#define EPTP_PA_MASK   GENMASK_ULL(51, 12)
 334
 335static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp)
 336{
 337        return VALID_PAGE(root_hpa) &&
 338               ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK));
 339}
 340
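     /*
      * Invalidate @addr in every previously cached shadow EPT root whose EPTP
      * shares the same EP4TA (EPT page-table address) as @eptp.
      */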
 341static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp,
 342                                       gpa_t addr)
 343{
 344        uint i;
 345        struct kvm_mmu_root_info *cached_root;
 346
 347        WARN_ON_ONCE(!mmu_is_nested(vcpu));
 348
 349        for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
 350                cached_root = &vcpu->arch.mmu->prev_roots[i];
 351
 352                if (nested_ept_root_matches(cached_root->hpa, cached_root->pgd,
 353                                            eptp))
 354                        vcpu->arch.mmu->invlpg(vcpu, addr, cached_root->hpa);
 355        }
 356}
 357
 358static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
 359                struct x86_exception *fault)
 360{
 361        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 362        struct vcpu_vmx *vmx = to_vmx(vcpu);
 363        u32 vm_exit_reason;
 364        unsigned long exit_qualification = vcpu->arch.exit_qualification;
 365
 366        if (vmx->nested.pml_full) {
 367                vm_exit_reason = EXIT_REASON_PML_FULL;
 368                vmx->nested.pml_full = false;
 369                exit_qualification &= INTR_INFO_UNBLOCK_NMI;
 370        } else {
 371                if (fault->error_code & PFERR_RSVD_MASK)
 372                        vm_exit_reason = EXIT_REASON_EPT_MISCONFIG;
 373                else
 374                        vm_exit_reason = EXIT_REASON_EPT_VIOLATION;
 375
 376                /*
 377                 * Although the caller (kvm_inject_emulated_page_fault) would
 378                 * have already synced the faulting address in the shadow EPT
 379                 * tables for the current EPTP12, we also need to sync it for
 380                 * any other cached EPTP02s based on the same EP4TA, since the
 381                 * TLB associates mappings to the EP4TA rather than the full EPTP.
 382                 */
 383                nested_ept_invalidate_addr(vcpu, vmcs12->ept_pointer,
 384                                           fault->address);
 385        }
 386
 387        nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification);
 388        vmcs12->guest_physical_address = fault->address;
 389}
 390
 391static void nested_ept_new_eptp(struct kvm_vcpu *vcpu)
 392{
 393        kvm_init_shadow_ept_mmu(vcpu,
 394                                to_vmx(vcpu)->nested.msrs.ept_caps &
 395                                VMX_EPT_EXECUTE_ONLY_BIT,
 396                                nested_ept_ad_enabled(vcpu),
 397                                nested_ept_get_eptp(vcpu));
 398}
 399
 400static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
 401{
 402        WARN_ON(mmu_is_nested(vcpu));
 403
 404        vcpu->arch.mmu = &vcpu->arch.guest_mmu;
 405        nested_ept_new_eptp(vcpu);
 406        vcpu->arch.mmu->get_guest_pgd     = nested_ept_get_eptp;
 407        vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault;
 408        vcpu->arch.mmu->get_pdptr         = kvm_pdptr_read;
 409
 410        vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
 411}
 412
 413static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
 414{
 415        vcpu->arch.mmu = &vcpu->arch.root_mmu;
 416        vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
 417}
 418
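     /*
      * Returns true if, per vmcs12's exception bitmap and page-fault
      * error-code mask/match, a #PF with @error_code should cause a VM-Exit
      * to L1.
      */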
 419static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
 420                                            u16 error_code)
 421{
 422        bool inequality, bit;
 423
 424        bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
 425        inequality =
 426                (error_code & vmcs12->page_fault_error_code_mask) !=
 427                 vmcs12->page_fault_error_code_match;
 428        return inequality ^ bit;
 429}
 430
 431
 432/*
  433 * KVM wants to inject page faults that it received into the guest. This function
  434 * checks whether, in a nested guest, they need to be injected to L1 or L2.
 435 */
 436static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual)
 437{
 438        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 439        unsigned int nr = vcpu->arch.exception.nr;
 440        bool has_payload = vcpu->arch.exception.has_payload;
 441        unsigned long payload = vcpu->arch.exception.payload;
 442
 443        if (nr == PF_VECTOR) {
 444                if (vcpu->arch.exception.nested_apf) {
 445                        *exit_qual = vcpu->arch.apf.nested_apf_token;
 446                        return 1;
 447                }
 448                if (nested_vmx_is_page_fault_vmexit(vmcs12,
 449                                                    vcpu->arch.exception.error_code)) {
 450                        *exit_qual = has_payload ? payload : vcpu->arch.cr2;
 451                        return 1;
 452                }
 453        } else if (vmcs12->exception_bitmap & (1u << nr)) {
 454                if (nr == DB_VECTOR) {
 455                        if (!has_payload) {
 456                                payload = vcpu->arch.dr6;
 457                                payload &= ~DR6_BT;
 458                                payload ^= DR6_ACTIVE_LOW;
 459                        }
 460                        *exit_qual = payload;
 461                } else
 462                        *exit_qual = 0;
 463                return 1;
 464        }
 465
 466        return 0;
 467}
 468
 469
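     /*
      * Inject a page fault that occurred while L2 was active: reflect it to
      * L1 as an exception VM-Exit if L1 wants to intercept it, otherwise
      * deliver the fault directly to L2.
      */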
 470static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
 471                struct x86_exception *fault)
 472{
 473        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 474
 475        WARN_ON(!is_guest_mode(vcpu));
 476
 477        if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) &&
 478                !to_vmx(vcpu)->nested.nested_run_pending) {
 479                vmcs12->vm_exit_intr_error_code = fault->error_code;
 480                nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
 481                                  PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
 482                                  INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK,
 483                                  fault->address);
 484        } else {
 485                kvm_inject_page_fault(vcpu, fault);
 486        }
 487}
 488
 489static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
 490                                               struct vmcs12 *vmcs12)
 491{
 492        if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
 493                return 0;
 494
 495        if (CC(!page_address_valid(vcpu, vmcs12->io_bitmap_a)) ||
 496            CC(!page_address_valid(vcpu, vmcs12->io_bitmap_b)))
 497                return -EINVAL;
 498
 499        return 0;
 500}
 501
 502static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
 503                                                struct vmcs12 *vmcs12)
 504{
 505        if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
 506                return 0;
 507
 508        if (CC(!page_address_valid(vcpu, vmcs12->msr_bitmap)))
 509                return -EINVAL;
 510
 511        return 0;
 512}
 513
 514static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
 515                                                struct vmcs12 *vmcs12)
 516{
 517        if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
 518                return 0;
 519
 520        if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr)))
 521                return -EINVAL;
 522
 523        return 0;
 524}
 525
 526/*
  527 * Check if a write to the given MSR is intercepted in the vmcs01 (L01) MSR bitmap.
 528 */
 529static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr)
 530{
 531        unsigned long *msr_bitmap;
 532        int f = sizeof(unsigned long);
 533
 534        if (!cpu_has_vmx_msr_bitmap())
 535                return true;
 536
 537        msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;
 538
 539        if (msr <= 0x1fff) {
 540                return !!test_bit(msr, msr_bitmap + 0x800 / f);
 541        } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
 542                msr &= 0x1fff;
 543                return !!test_bit(msr, msr_bitmap + 0xc00 / f);
 544        }
 545
 546        return true;
 547}
 548
 549/*
  550 * If an MSR is allowed by L0, check whether it is also allowed by L1.
  551 * The corresponding intercept bit is cleared only if both L0 and L1 allow it.
 552 */
 553static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
 554                                               unsigned long *msr_bitmap_nested,
 555                                               u32 msr, int type)
 556{
 557        int f = sizeof(unsigned long);
 558
 559        /*
 560         * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
 561         * have the write-low and read-high bitmap offsets the wrong way round.
 562         * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
 563         */
 564        if (msr <= 0x1fff) {
 565                if (type & MSR_TYPE_R &&
 566                   !test_bit(msr, msr_bitmap_l1 + 0x000 / f))
 567                        /* read-low */
 568                        __clear_bit(msr, msr_bitmap_nested + 0x000 / f);
 569
 570                if (type & MSR_TYPE_W &&
 571                   !test_bit(msr, msr_bitmap_l1 + 0x800 / f))
 572                        /* write-low */
 573                        __clear_bit(msr, msr_bitmap_nested + 0x800 / f);
 574
 575        } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
 576                msr &= 0x1fff;
 577                if (type & MSR_TYPE_R &&
 578                   !test_bit(msr, msr_bitmap_l1 + 0x400 / f))
 579                        /* read-high */
 580                        __clear_bit(msr, msr_bitmap_nested + 0x400 / f);
 581
 582                if (type & MSR_TYPE_W &&
 583                   !test_bit(msr, msr_bitmap_l1 + 0xc00 / f))
 584                        /* write-high */
 585                        __clear_bit(msr, msr_bitmap_nested + 0xc00 / f);
 586
 587        }
 588}
 589
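     /*
      * Set both the read and write intercept bits for the entire x2APIC MSR
      * range (0x800 - 0x8ff) in @msr_bitmap.
      */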
 590static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap)
 591{
 592        int msr;
 593
 594        for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
 595                unsigned word = msr / BITS_PER_LONG;
 596
 597                msr_bitmap[word] = ~0;
 598                msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
 599        }
 600}
 601
 602/*
  603 * Merge L0's and L1's MSR bitmaps; return false to indicate that
  604 * we do not use the hardware MSR bitmap.
 605 */
 606static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
 607                                                 struct vmcs12 *vmcs12)
 608{
 609        int msr;
 610        unsigned long *msr_bitmap_l1;
 611        unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
 612        struct kvm_host_map *map = &to_vmx(vcpu)->nested.msr_bitmap_map;
 613
 614        /* Nothing to do if the MSR bitmap is not in use.  */
 615        if (!cpu_has_vmx_msr_bitmap() ||
 616            !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
 617                return false;
 618
 619        if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map))
 620                return false;
 621
 622        msr_bitmap_l1 = (unsigned long *)map->hva;
 623
 624        /*
 625         * To keep the control flow simple, pay eight 8-byte writes (sixteen
 626         * 4-byte writes on 32-bit systems) up front to enable intercepts for
 627         * the x2APIC MSR range and selectively disable them below.
 628         */
 629        enable_x2apic_msr_intercepts(msr_bitmap_l0);
 630
 631        if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
 632                if (nested_cpu_has_apic_reg_virt(vmcs12)) {
 633                        /*
 634                         * L0 need not intercept reads for MSRs between 0x800
 635                         * and 0x8ff, it just lets the processor take the value
 636                         * from the virtual-APIC page; take those 256 bits
 637                         * directly from the L1 bitmap.
 638                         */
 639                        for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
 640                                unsigned word = msr / BITS_PER_LONG;
 641
 642                                msr_bitmap_l0[word] = msr_bitmap_l1[word];
 643                        }
 644                }
 645
 646                nested_vmx_disable_intercept_for_msr(
 647                        msr_bitmap_l1, msr_bitmap_l0,
 648                        X2APIC_MSR(APIC_TASKPRI),
 649                        MSR_TYPE_R | MSR_TYPE_W);
 650
 651                if (nested_cpu_has_vid(vmcs12)) {
 652                        nested_vmx_disable_intercept_for_msr(
 653                                msr_bitmap_l1, msr_bitmap_l0,
 654                                X2APIC_MSR(APIC_EOI),
 655                                MSR_TYPE_W);
 656                        nested_vmx_disable_intercept_for_msr(
 657                                msr_bitmap_l1, msr_bitmap_l0,
 658                                X2APIC_MSR(APIC_SELF_IPI),
 659                                MSR_TYPE_W);
 660                }
 661        }
 662
 663        /* KVM unconditionally exposes the FS/GS base MSRs to L1. */
 664#ifdef CONFIG_X86_64
 665        nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
 666                                             MSR_FS_BASE, MSR_TYPE_RW);
 667
 668        nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
 669                                             MSR_GS_BASE, MSR_TYPE_RW);
 670
 671        nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
 672                                             MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
 673#endif
 674
 675        /*
  676         * Checking the L0->L1 bitmap verifies two things:
  677         *
  678         * 1. L0 gave L1 permission to actually pass through the MSR. This
  679         *    ensures that we do not accidentally generate an L02 MSR bitmap
  680         *    from the L12 MSR bitmap that is too permissive.
  681         * 2. L1 (or one of its L2s) has actually used the MSR. This avoids
  682         *    unnecessarily merging the bitmap if the MSR is unused. This
  683         *    works properly because the L01 MSR bitmap is only updated lazily;
  684         *    even if L0 should pass these MSRs through to L1, the L01 bitmap is
  685         *    only updated to reflect this when L1 (or one of its L2s) actually
  686         *    writes to the MSR.
 687         */
 688        if (!msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL))
 689                nested_vmx_disable_intercept_for_msr(
 690                                        msr_bitmap_l1, msr_bitmap_l0,
 691                                        MSR_IA32_SPEC_CTRL,
 692                                        MSR_TYPE_R | MSR_TYPE_W);
 693
 694        if (!msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD))
 695                nested_vmx_disable_intercept_for_msr(
 696                                        msr_bitmap_l1, msr_bitmap_l0,
 697                                        MSR_IA32_PRED_CMD,
 698                                        MSR_TYPE_W);
 699
 700        kvm_vcpu_unmap(vcpu, &to_vmx(vcpu)->nested.msr_bitmap_map, false);
 701
 702        return true;
 703}
 704
 705static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
 706                                       struct vmcs12 *vmcs12)
 707{
 708        struct kvm_host_map map;
 709        struct vmcs12 *shadow;
 710
 711        if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
 712            vmcs12->vmcs_link_pointer == -1ull)
 713                return;
 714
 715        shadow = get_shadow_vmcs12(vcpu);
 716
 717        if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map))
 718                return;
 719
 720        memcpy(shadow, map.hva, VMCS12_SIZE);
 721        kvm_vcpu_unmap(vcpu, &map, false);
 722}
 723
 724static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
 725                                              struct vmcs12 *vmcs12)
 726{
 727        struct vcpu_vmx *vmx = to_vmx(vcpu);
 728
 729        if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
 730            vmcs12->vmcs_link_pointer == -1ull)
 731                return;
 732
 733        kvm_write_guest(vmx->vcpu.kvm, vmcs12->vmcs_link_pointer,
 734                        get_shadow_vmcs12(vcpu), VMCS12_SIZE);
 735}
 736
 737/*
 738 * In nested virtualization, check if L1 has set
 739 * VM_EXIT_ACK_INTR_ON_EXIT
 740 */
 741static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
 742{
 743        return get_vmcs12(vcpu)->vm_exit_controls &
 744                VM_EXIT_ACK_INTR_ON_EXIT;
 745}
 746
 747static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
 748                                          struct vmcs12 *vmcs12)
 749{
 750        if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
 751            CC(!page_address_valid(vcpu, vmcs12->apic_access_addr)))
 752                return -EINVAL;
 753        else
 754                return 0;
 755}
 756
 757static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
 758                                           struct vmcs12 *vmcs12)
 759{
 760        if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
 761            !nested_cpu_has_apic_reg_virt(vmcs12) &&
 762            !nested_cpu_has_vid(vmcs12) &&
 763            !nested_cpu_has_posted_intr(vmcs12))
 764                return 0;
 765
 766        /*
 767         * If virtualize x2apic mode is enabled,
 768         * virtualize apic access must be disabled.
 769         */
 770        if (CC(nested_cpu_has_virt_x2apic_mode(vmcs12) &&
 771               nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)))
 772                return -EINVAL;
 773
 774        /*
 775         * If virtual interrupt delivery is enabled,
 776         * we must exit on external interrupts.
 777         */
 778        if (CC(nested_cpu_has_vid(vmcs12) && !nested_exit_on_intr(vcpu)))
 779                return -EINVAL;
 780
 781        /*
  782         * Bits 15:8 of posted_intr_nv must be zero; the descriptor
  783         * address has already been checked in
  784         * nested_get_vmcs12_pages.
  785         *
  786         * Bits 5:0 of posted_intr_desc_addr must be zero.
 787         */
 788        if (nested_cpu_has_posted_intr(vmcs12) &&
 789           (CC(!nested_cpu_has_vid(vmcs12)) ||
 790            CC(!nested_exit_intr_ack_set(vcpu)) ||
 791            CC((vmcs12->posted_intr_nv & 0xff00)) ||
 792            CC(!kvm_vcpu_is_legal_aligned_gpa(vcpu, vmcs12->posted_intr_desc_addr, 64))))
 793                return -EINVAL;
 794
 795        /* tpr shadow is needed by all apicv features. */
 796        if (CC(!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)))
 797                return -EINVAL;
 798
 799        return 0;
 800}
 801
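     /*
      * Validate a VM-Entry/VM-Exit MSR load/store list: the address must be
      * 16-byte aligned and the entire list must fall within the guest's
      * legal physical address space.
      */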
 802static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
 803                                       u32 count, u64 addr)
 804{
 805        if (count == 0)
 806                return 0;
 807
 808        if (!kvm_vcpu_is_legal_aligned_gpa(vcpu, addr, 16) ||
 809            !kvm_vcpu_is_legal_gpa(vcpu, (addr + count * sizeof(struct vmx_msr_entry) - 1)))
 810                return -EINVAL;
 811
 812        return 0;
 813}
 814
 815static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu,
 816                                                     struct vmcs12 *vmcs12)
 817{
 818        if (CC(nested_vmx_check_msr_switch(vcpu,
 819                                           vmcs12->vm_exit_msr_load_count,
 820                                           vmcs12->vm_exit_msr_load_addr)) ||
 821            CC(nested_vmx_check_msr_switch(vcpu,
 822                                           vmcs12->vm_exit_msr_store_count,
 823                                           vmcs12->vm_exit_msr_store_addr)))
 824                return -EINVAL;
 825
 826        return 0;
 827}
 828
 829static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu,
 830                                                      struct vmcs12 *vmcs12)
 831{
 832        if (CC(nested_vmx_check_msr_switch(vcpu,
 833                                           vmcs12->vm_entry_msr_load_count,
 834                                           vmcs12->vm_entry_msr_load_addr)))
 835                return -EINVAL;
 836
 837        return 0;
 838}
 839
 840static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
 841                                         struct vmcs12 *vmcs12)
 842{
 843        if (!nested_cpu_has_pml(vmcs12))
 844                return 0;
 845
 846        if (CC(!nested_cpu_has_ept(vmcs12)) ||
 847            CC(!page_address_valid(vcpu, vmcs12->pml_address)))
 848                return -EINVAL;
 849
 850        return 0;
 851}
 852
 853static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu,
 854                                                        struct vmcs12 *vmcs12)
 855{
 856        if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) &&
 857               !nested_cpu_has_ept(vmcs12)))
 858                return -EINVAL;
 859        return 0;
 860}
 861
 862static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu,
 863                                                         struct vmcs12 *vmcs12)
 864{
 865        if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) &&
 866               !nested_cpu_has_ept(vmcs12)))
 867                return -EINVAL;
 868        return 0;
 869}
 870
 871static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu,
 872                                                 struct vmcs12 *vmcs12)
 873{
 874        if (!nested_cpu_has_shadow_vmcs(vmcs12))
 875                return 0;
 876
 877        if (CC(!page_address_valid(vcpu, vmcs12->vmread_bitmap)) ||
 878            CC(!page_address_valid(vcpu, vmcs12->vmwrite_bitmap)))
 879                return -EINVAL;
 880
 881        return 0;
 882}
 883
 884static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
 885                                       struct vmx_msr_entry *e)
 886{
 887        /* x2APIC MSR accesses are not allowed */
 888        if (CC(vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8))
 889                return -EINVAL;
 890        if (CC(e->index == MSR_IA32_UCODE_WRITE) || /* SDM Table 35-2 */
 891            CC(e->index == MSR_IA32_UCODE_REV))
 892                return -EINVAL;
 893        if (CC(e->reserved != 0))
 894                return -EINVAL;
 895        return 0;
 896}
 897
 898static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
 899                                     struct vmx_msr_entry *e)
 900{
 901        if (CC(e->index == MSR_FS_BASE) ||
 902            CC(e->index == MSR_GS_BASE) ||
 903            CC(e->index == MSR_IA32_SMM_MONITOR_CTL) || /* SMM is not supported */
 904            nested_vmx_msr_check_common(vcpu, e))
 905                return -EINVAL;
 906        return 0;
 907}
 908
 909static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
 910                                      struct vmx_msr_entry *e)
 911{
 912        if (CC(e->index == MSR_IA32_SMBASE) || /* SMM is not supported */
 913            nested_vmx_msr_check_common(vcpu, e))
 914                return -EINVAL;
 915        return 0;
 916}
 917
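     /*
      * Maximum number of entries allowed in a VM-Entry/VM-Exit MSR load/store
      * list, derived from the MSR-list size advertised to L1 in the emulated
      * IA32_VMX_MISC MSR ((N + 1) * VMX_MISC_MSR_LIST_MULTIPLIER entries).
      */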
 918static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu)
 919{
 920        struct vcpu_vmx *vmx = to_vmx(vcpu);
 921        u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
 922                                       vmx->nested.msrs.misc_high);
 923
 924        return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER;
 925}
 926
 927/*
  928 * Load the guest's/host's MSRs at nested entry/exit.
  929 * Returns 0 on success, or the 1-based index of the failing entry on failure.
  930 *
  931 * One of the failure modes for MSR load/store is a list that exceeds the
  932 * virtual hardware's capacity. To stay as close to hardware behavior as
  933 * possible, process all valid entries before failing rather than prechecking
  934 * for a capacity violation.
 935 */
 936static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
 937{
 938        u32 i;
 939        struct vmx_msr_entry e;
 940        u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);
 941
 942        for (i = 0; i < count; i++) {
 943                if (unlikely(i >= max_msr_list_size))
 944                        goto fail;
 945
 946                if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
 947                                        &e, sizeof(e))) {
 948                        pr_debug_ratelimited(
 949                                "%s cannot read MSR entry (%u, 0x%08llx)\n",
 950                                __func__, i, gpa + i * sizeof(e));
 951                        goto fail;
 952                }
 953                if (nested_vmx_load_msr_check(vcpu, &e)) {
 954                        pr_debug_ratelimited(
 955                                "%s check failed (%u, 0x%x, 0x%x)\n",
 956                                __func__, i, e.index, e.reserved);
 957                        goto fail;
 958                }
 959                if (kvm_set_msr(vcpu, e.index, e.value)) {
 960                        pr_debug_ratelimited(
 961                                "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
 962                                __func__, i, e.index, e.value);
 963                        goto fail;
 964                }
 965        }
 966        return 0;
 967fail:
 968        /* Note, max_msr_list_size is at most 4096, i.e. this can't wrap. */
 969        return i + 1;
 970}
 971
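     /*
      * Get the value to be stored in vmcs12's VM-Exit MSR-store area for
      * @msr_index, preferring the value hardware saved in the vmcs02
      * MSR-store area (currently only done for IA32_TSC) over kvm_get_msr().
      */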
 972static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu,
 973                                            u32 msr_index,
 974                                            u64 *data)
 975{
 976        struct vcpu_vmx *vmx = to_vmx(vcpu);
 977
 978        /*
 979         * If the L0 hypervisor stored a more accurate value for the TSC that
 980         * does not include the time taken for emulation of the L2->L1
 981         * VM-exit in L0, use the more accurate value.
 982         */
 983        if (msr_index == MSR_IA32_TSC) {
 984                int i = vmx_find_loadstore_msr_slot(&vmx->msr_autostore.guest,
 985                                                    MSR_IA32_TSC);
 986
 987                if (i >= 0) {
 988                        u64 val = vmx->msr_autostore.guest.val[i].value;
 989
 990                        *data = kvm_read_l1_tsc(vcpu, val);
 991                        return true;
 992                }
 993        }
 994
 995        if (kvm_get_msr(vcpu, msr_index, data)) {
 996                pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__,
 997                        msr_index);
 998                return false;
 999        }
1000        return true;
1001}
1002
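     /*
      * Read the @i-th entry's index and reserved field (but not its value)
      * from the guest MSR list at @gpa and validate it for the VM-Exit
      * MSR-store path.
      */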
1003static bool read_and_check_msr_entry(struct kvm_vcpu *vcpu, u64 gpa, int i,
1004                                     struct vmx_msr_entry *e)
1005{
1006        if (kvm_vcpu_read_guest(vcpu,
1007                                gpa + i * sizeof(*e),
1008                                e, 2 * sizeof(u32))) {
1009                pr_debug_ratelimited(
1010                        "%s cannot read MSR entry (%u, 0x%08llx)\n",
1011                        __func__, i, gpa + i * sizeof(*e));
1012                return false;
1013        }
1014        if (nested_vmx_store_msr_check(vcpu, e)) {
1015                pr_debug_ratelimited(
1016                        "%s check failed (%u, 0x%x, 0x%x)\n",
1017                        __func__, i, e->index, e->reserved);
1018                return false;
1019        }
1020        return true;
1021}
1022
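     /*
      * Store the current values of the MSRs listed in vmcs12's VM-Exit
      * MSR-store area back into guest memory.  Returns 0 on success, -EINVAL
      * if the list is too long or an entry is malformed or unreadable.
      */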
1023static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
1024{
1025        u64 data;
1026        u32 i;
1027        struct vmx_msr_entry e;
1028        u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);
1029
1030        for (i = 0; i < count; i++) {
1031                if (unlikely(i >= max_msr_list_size))
1032                        return -EINVAL;
1033
1034                if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
1035                        return -EINVAL;
1036
1037                if (!nested_vmx_get_vmexit_msr_value(vcpu, e.index, &data))
1038                        return -EINVAL;
1039
1040                if (kvm_vcpu_write_guest(vcpu,
1041                                         gpa + i * sizeof(e) +
1042                                             offsetof(struct vmx_msr_entry, value),
1043                                         &data, sizeof(data))) {
1044                        pr_debug_ratelimited(
1045                                "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
1046                                __func__, i, e.index, data);
1047                        return -EINVAL;
1048                }
1049        }
1050        return 0;
1051}
1052
1053static bool nested_msr_store_list_has_msr(struct kvm_vcpu *vcpu, u32 msr_index)
1054{
1055        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1056        u32 count = vmcs12->vm_exit_msr_store_count;
1057        u64 gpa = vmcs12->vm_exit_msr_store_addr;
1058        struct vmx_msr_entry e;
1059        u32 i;
1060
1061        for (i = 0; i < count; i++) {
1062                if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
1063                        return false;
1064
1065                if (e.index == msr_index)
1066                        return true;
1067        }
1068        return false;
1069}
1070
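     /*
      * Keep vmx->msr_autostore.guest in sync with vmcs12's VM-Exit MSR-store
      * list for @msr_index: add the MSR if L1 wants it stored but it isn't
      * being tracked yet, and drop it once L1 no longer wants it.
      */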
1071static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu,
1072                                           u32 msr_index)
1073{
1074        struct vcpu_vmx *vmx = to_vmx(vcpu);
1075        struct vmx_msrs *autostore = &vmx->msr_autostore.guest;
1076        bool in_vmcs12_store_list;
1077        int msr_autostore_slot;
1078        bool in_autostore_list;
1079        int last;
1080
1081        msr_autostore_slot = vmx_find_loadstore_msr_slot(autostore, msr_index);
1082        in_autostore_list = msr_autostore_slot >= 0;
1083        in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index);
1084
1085        if (in_vmcs12_store_list && !in_autostore_list) {
1086                if (autostore->nr == MAX_NR_LOADSTORE_MSRS) {
1087                        /*
1088                         * Emulated VMEntry does not fail here.  Instead a less
1089                         * accurate value will be returned by
1090                         * nested_vmx_get_vmexit_msr_value() using kvm_get_msr()
1091                         * instead of reading the value from the vmcs02 VMExit
1092                         * MSR-store area.
1093                         */
1094                        pr_warn_ratelimited(
1095                                "Not enough msr entries in msr_autostore.  Can't add msr %x\n",
1096                                msr_index);
1097                        return;
1098                }
1099                last = autostore->nr++;
1100                autostore->val[last].index = msr_index;
1101        } else if (!in_vmcs12_store_list && in_autostore_list) {
1102                last = --autostore->nr;
1103                autostore->val[msr_autostore_slot] = autostore->val[last];
1104        }
1105}
1106
1107/*
1108 * Load guest's/host's cr3 at nested entry/exit.  @nested_ept is true if we are
1109 * emulating VM-Entry into a guest with EPT enabled.  On failure, the expected
1110 * Exit Qualification (for a VM-Entry consistency check VM-Exit) is assigned to
1111 * @entry_failure_code.
1112 */
1113static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
1114                               bool nested_ept, bool reload_pdptrs,
1115                               enum vm_entry_failure_code *entry_failure_code)
1116{
1117        if (CC(kvm_vcpu_is_illegal_gpa(vcpu, cr3))) {
1118                *entry_failure_code = ENTRY_FAIL_DEFAULT;
1119                return -EINVAL;
1120        }
1121
1122        /*
1123         * If PAE paging and EPT are both on, CR3 is not used by the CPU and
1124         * must not be dereferenced.
1125         */
1126        if (reload_pdptrs && !nested_ept && is_pae_paging(vcpu) &&
1127            CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))) {
1128                *entry_failure_code = ENTRY_FAIL_PDPTE;
1129                return -EINVAL;
1130        }
1131
1132        if (!nested_ept)
1133                kvm_mmu_new_pgd(vcpu, cr3);
1134
1135        vcpu->arch.cr3 = cr3;
1136        kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
1137
1138        /* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */
1139        kvm_init_mmu(vcpu);
1140
1141        return 0;
1142}
1143
1144/*
 1145 * Returns true if KVM is able to configure the CPU to tag TLB entries
 1146 * populated by L2 differently than TLB entries populated
 1147 * by L1.
 1148 *
 1149 * If L0 uses EPT, L1 and L2 run with different EPTPs because
 1150 * guest_mode is part of kvm_mmu_page_role. Thus, TLB entries
 1151 * are tagged with different EPTPs.
1152 *
1153 * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged
1154 * with different VPID (L1 entries are tagged with vmx->vpid
1155 * while L2 entries are tagged with vmx->nested.vpid02).
1156 */
1157static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu)
1158{
1159        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1160
1161        return enable_ept ||
1162               (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02);
1163}
1164
1165static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu,
1166                                            struct vmcs12 *vmcs12,
1167                                            bool is_vmenter)
1168{
1169        struct vcpu_vmx *vmx = to_vmx(vcpu);
1170
1171        /*
1172         * If vmcs12 doesn't use VPID, L1 expects linear and combined mappings
1173         * for *all* contexts to be flushed on VM-Enter/VM-Exit, i.e. it's a
1174         * full TLB flush from the guest's perspective.  This is required even
1175         * if VPID is disabled in the host as KVM may need to synchronize the
1176         * MMU in response to the guest TLB flush.
1177         *
1178         * Note, using TLB_FLUSH_GUEST is correct even if nested EPT is in use.
1179         * EPT is a special snowflake, as guest-physical mappings aren't
1180         * flushed on VPID invalidations, including VM-Enter or VM-Exit with
1181         * VPID disabled.  As a result, KVM _never_ needs to sync nEPT
1182         * entries on VM-Enter because L1 can't rely on VM-Enter to flush
1183         * those mappings.
1184         */
1185        if (!nested_cpu_has_vpid(vmcs12)) {
1186                kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
1187                return;
1188        }
1189
1190        /* L2 should never have a VPID if VPID is disabled. */
1191        WARN_ON(!enable_vpid);
1192
1193        /*
 1194         * If VPID is enabled and used by vmcs12, but L2 does not have a unique
1195         * TLB tag (ASID), i.e. EPT is disabled and KVM was unable to allocate
1196         * a VPID for L2, flush the current context as the effective ASID is
1197         * common to both L1 and L2.
1198         *
1199         * Defer the flush so that it runs after vmcs02.EPTP has been set by
1200         * KVM_REQ_LOAD_MMU_PGD (if nested EPT is enabled) and to avoid
1201         * redundant flushes further down the nested pipeline.
1202         *
1203         * If a TLB flush isn't required due to any of the above, and vpid12 is
 1204         * changing, then the new "virtual" VPID (vpid12) will reuse the same
1205         * "real" VPID (vpid02), and so needs to be flushed.  There's no direct
1206         * mapping between vpid02 and vpid12, vpid02 is per-vCPU and reused for
1207         * all nested vCPUs.  Remember, a flush on VM-Enter does not invalidate
1208         * guest-physical mappings, so there is no need to sync the nEPT MMU.
1209         */
1210        if (!nested_has_guest_tlb_tag(vcpu)) {
1211                kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
1212        } else if (is_vmenter &&
1213                   vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
1214                vmx->nested.last_vpid = vmcs12->virtual_processor_id;
1215                vpid_sync_context(nested_get_vpid02(vcpu));
1216        }
1217}
1218
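     /*
      * Returns true if, considering only the bits in @mask, every bit set in
      * @subset is also set in @superset.
      */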
1219static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
1220{
1221        superset &= mask;
1222        subset &= mask;
1223
1224        return (superset | subset) == superset;
1225}
1226
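     /*
      * Restore the IA32_VMX_BASIC MSR from userspace.  Userspace may only
      * clear feature bits relative to KVM's defaults and must preserve the
      * VMCS revision ID.
      */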
1227static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
1228{
1229        const u64 feature_and_reserved =
1230                /* feature (except bit 48; see below) */
1231                BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
1232                /* reserved */
1233                BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
1234        u64 vmx_basic = vmx->nested.msrs.basic;
1235
1236        if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
1237                return -EINVAL;
1238
1239        /*
1240         * KVM does not emulate a version of VMX that constrains physical
1241         * addresses of VMX structures (e.g. VMCS) to 32-bits.
1242         */
1243        if (data & BIT_ULL(48))
1244                return -EINVAL;
1245
1246        if (vmx_basic_vmcs_revision_id(vmx_basic) !=
1247            vmx_basic_vmcs_revision_id(data))
1248                return -EINVAL;
1249
1250        if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
1251                return -EINVAL;
1252
1253        vmx->nested.msrs.basic = data;
1254        return 0;
1255}
1256
1257static int
1258vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
1259{
1260        u64 supported;
1261        u32 *lowp, *highp;
1262
1263        switch (msr_index) {
1264        case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
1265                lowp = &vmx->nested.msrs.pinbased_ctls_low;
1266                highp = &vmx->nested.msrs.pinbased_ctls_high;
1267                break;
1268        case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
1269                lowp = &vmx->nested.msrs.procbased_ctls_low;
1270                highp = &vmx->nested.msrs.procbased_ctls_high;
1271                break;
1272        case MSR_IA32_VMX_TRUE_EXIT_CTLS:
1273                lowp = &vmx->nested.msrs.exit_ctls_low;
1274                highp = &vmx->nested.msrs.exit_ctls_high;
1275                break;
1276        case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
1277                lowp = &vmx->nested.msrs.entry_ctls_low;
1278                highp = &vmx->nested.msrs.entry_ctls_high;
1279                break;
1280        case MSR_IA32_VMX_PROCBASED_CTLS2:
1281                lowp = &vmx->nested.msrs.secondary_ctls_low;
1282                highp = &vmx->nested.msrs.secondary_ctls_high;
1283                break;
1284        default:
1285                BUG();
1286        }
1287
1288        supported = vmx_control_msr(*lowp, *highp);
1289
1290        /* Check must-be-1 bits are still 1. */
1291        if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0)))
1292                return -EINVAL;
1293
1294        /* Check must-be-0 bits are still 0. */
1295        if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
1296                return -EINVAL;
1297
1298        *lowp = data;
1299        *highp = data >> 32;
1300        return 0;
1301}
1302
1303static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
1304{
1305        const u64 feature_and_reserved_bits =
1306                /* feature */
1307                BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) |
1308                BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) |
1309                /* reserved */
1310                GENMASK_ULL(13, 9) | BIT_ULL(31);
1311        u64 vmx_misc;
1312
1313        vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
1314                                   vmx->nested.msrs.misc_high);
1315
1316        if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
1317                return -EINVAL;
1318
1319        if ((vmx->nested.msrs.pinbased_ctls_high &
1320             PIN_BASED_VMX_PREEMPTION_TIMER) &&
1321            vmx_misc_preemption_timer_rate(data) !=
1322            vmx_misc_preemption_timer_rate(vmx_misc))
1323                return -EINVAL;
1324
1325        if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc))
1326                return -EINVAL;
1327
1328        if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc))
1329                return -EINVAL;
1330
1331        if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
1332                return -EINVAL;
1333
1334        vmx->nested.msrs.misc_low = data;
1335        vmx->nested.msrs.misc_high = data >> 32;
1336
1337        return 0;
1338}
1339
1340static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
1341{
1342        u64 vmx_ept_vpid_cap;
1343
1344        vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.msrs.ept_caps,
1345                                           vmx->nested.msrs.vpid_caps);
1346
1347        /* Every bit is either reserved or a feature bit. */
1348        if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
1349                return -EINVAL;
1350
1351        vmx->nested.msrs.ept_caps = data;
1352        vmx->nested.msrs.vpid_caps = data >> 32;
1353        return 0;
1354}
1355
1356static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
1357{
1358        u64 *msr;
1359
1360        switch (msr_index) {
1361        case MSR_IA32_VMX_CR0_FIXED0:
1362                msr = &vmx->nested.msrs.cr0_fixed0;
1363                break;
1364        case MSR_IA32_VMX_CR4_FIXED0:
1365                msr = &vmx->nested.msrs.cr4_fixed0;
1366                break;
1367        default:
1368                BUG();
1369        }
1370
1371        /*
 1372         * Bits that are 1 (i.e. bits that "must be 1" during VMX operation)
 1373         * must also be 1 in the restored value.
1374         */
1375        if (!is_bitwise_subset(data, *msr, -1ULL))
1376                return -EINVAL;
1377
1378        *msr = data;
1379        return 0;
1380}
1381
1382/*
1383 * Called when userspace is restoring VMX MSRs.
1384 *
1385 * Returns 0 on success, non-0 otherwise.
1386 */
1387int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1388{
1389        struct vcpu_vmx *vmx = to_vmx(vcpu);
1390
1391        /*
1392         * Don't allow changes to the VMX capability MSRs while the vCPU
1393         * is in VMX operation.
1394         */
1395        if (vmx->nested.vmxon)
1396                return -EBUSY;
1397
1398        switch (msr_index) {
1399        case MSR_IA32_VMX_BASIC:
1400                return vmx_restore_vmx_basic(vmx, data);
1401        case MSR_IA32_VMX_PINBASED_CTLS:
1402        case MSR_IA32_VMX_PROCBASED_CTLS:
1403        case MSR_IA32_VMX_EXIT_CTLS:
1404        case MSR_IA32_VMX_ENTRY_CTLS:
1405                /*
1406                 * The "non-true" VMX capability MSRs are generated from the
1407                 * "true" MSRs, so we do not support restoring them directly.
1408                 *
1409                 * If userspace wants to emulate VMX_BASIC[55]=0, userspace
1410                 * should restore the "true" MSRs with the must-be-1 bits
1411                 * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND
1412                 * DEFAULT SETTINGS".
1413                 */
1414                return -EINVAL;
1415        case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
1416        case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
1417        case MSR_IA32_VMX_TRUE_EXIT_CTLS:
1418        case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
1419        case MSR_IA32_VMX_PROCBASED_CTLS2:
1420                return vmx_restore_control_msr(vmx, msr_index, data);
1421        case MSR_IA32_VMX_MISC:
1422                return vmx_restore_vmx_misc(vmx, data);
1423        case MSR_IA32_VMX_CR0_FIXED0:
1424        case MSR_IA32_VMX_CR4_FIXED0:
1425                return vmx_restore_fixed0_msr(vmx, msr_index, data);
1426        case MSR_IA32_VMX_CR0_FIXED1:
1427        case MSR_IA32_VMX_CR4_FIXED1:
1428                /*
1429                 * These MSRs are generated based on the vCPU's CPUID, so we
1430                 * do not support restoring them directly.
1431                 */
1432                return -EINVAL;
1433        case MSR_IA32_VMX_EPT_VPID_CAP:
1434                return vmx_restore_vmx_ept_vpid_cap(vmx, data);
1435        case MSR_IA32_VMX_VMCS_ENUM:
1436                vmx->nested.msrs.vmcs_enum = data;
1437                return 0;
1438        case MSR_IA32_VMX_VMFUNC:
1439                if (data & ~vmx->nested.msrs.vmfunc_controls)
1440                        return -EINVAL;
1441                vmx->nested.msrs.vmfunc_controls = data;
1442                return 0;
1443        default:
1444                /*
1445                 * The rest of the VMX capability MSRs do not support restore.
1446                 */
1447                return -EINVAL;
1448        }
1449}
1450
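/*
 * Illustrative userspace sketch, not KVM code (error handling and the full
 * MSR list omitted; 'vcpu_fd' and 'saved_ctls' are placeholders):
 * vmx_set_vmx_msr() is reached via the KVM_SET_MSRS vCPU ioctl, which must be
 * issued before the vCPU executes VMXON (see the -EBUSY check above):
 *
 *	struct { struct kvm_msrs hdr; struct kvm_msr_entry e[1]; } m = {
 *		.hdr.nmsrs  = 1,
 *		.e[0].index = MSR_IA32_VMX_TRUE_PINBASED_CTLS,
 *		.e[0].data  = saved_ctls,
 *	};
 *	ioctl(vcpu_fd, KVM_SET_MSRS, &m);
 */
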
1451/* Returns 0 on success, non-0 otherwise. */
1452int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata)
1453{
1454        switch (msr_index) {
1455        case MSR_IA32_VMX_BASIC:
1456                *pdata = msrs->basic;
1457                break;
1458        case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
1459        case MSR_IA32_VMX_PINBASED_CTLS:
1460                *pdata = vmx_control_msr(
1461                        msrs->pinbased_ctls_low,
1462                        msrs->pinbased_ctls_high);
1463                if (msr_index == MSR_IA32_VMX_PINBASED_CTLS)
1464                        *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
1465                break;
1466        case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
1467        case MSR_IA32_VMX_PROCBASED_CTLS:
1468                *pdata = vmx_control_msr(
1469                        msrs->procbased_ctls_low,
1470                        msrs->procbased_ctls_high);
1471                if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS)
1472                        *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
1473                break;
1474        case MSR_IA32_VMX_TRUE_EXIT_CTLS:
1475        case MSR_IA32_VMX_EXIT_CTLS:
1476                *pdata = vmx_control_msr(
1477                        msrs->exit_ctls_low,
1478                        msrs->exit_ctls_high);
1479                if (msr_index == MSR_IA32_VMX_EXIT_CTLS)
1480                        *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
1481                break;
1482        case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
1483        case MSR_IA32_VMX_ENTRY_CTLS:
1484                *pdata = vmx_control_msr(
1485                        msrs->entry_ctls_low,
1486                        msrs->entry_ctls_high);
1487                if (msr_index == MSR_IA32_VMX_ENTRY_CTLS)
1488                        *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
1489                break;
1490        case MSR_IA32_VMX_MISC:
1491                *pdata = vmx_control_msr(
1492                        msrs->misc_low,
1493                        msrs->misc_high);
1494                break;
1495        case MSR_IA32_VMX_CR0_FIXED0:
1496                *pdata = msrs->cr0_fixed0;
1497                break;
1498        case MSR_IA32_VMX_CR0_FIXED1:
1499                *pdata = msrs->cr0_fixed1;
1500                break;
1501        case MSR_IA32_VMX_CR4_FIXED0:
1502                *pdata = msrs->cr4_fixed0;
1503                break;
1504        case MSR_IA32_VMX_CR4_FIXED1:
1505                *pdata = msrs->cr4_fixed1;
1506                break;
1507        case MSR_IA32_VMX_VMCS_ENUM:
1508                *pdata = msrs->vmcs_enum;
1509                break;
1510        case MSR_IA32_VMX_PROCBASED_CTLS2:
1511                *pdata = vmx_control_msr(
1512                        msrs->secondary_ctls_low,
1513                        msrs->secondary_ctls_high);
1514                break;
1515        case MSR_IA32_VMX_EPT_VPID_CAP:
1516                *pdata = msrs->ept_caps |
1517                        ((u64)msrs->vpid_caps << 32);
1518                break;
1519        case MSR_IA32_VMX_VMFUNC:
1520                *pdata = msrs->vmfunc_controls;
1521                break;
1522        default:
1523                return 1;
1524        }
1525
1526        return 0;
1527}
1528
1529/*
1530 * Copy the writable VMCS shadow fields back to the VMCS12, in case they have
1531 * been modified by the L1 guest.  Note, "writable" in this context means
1532 * "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of
1533 * fields tagged SHADOW_FIELD_RO may or may not align with the "read-only"
1534 * VM-exit information fields (which are actually writable if the vCPU is
1535 * configured to support "VMWRITE to any supported field in the VMCS").
1536 */
1537static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
1538{
1539        struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
1540        struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
1541        struct shadow_vmcs_field field;
1542        unsigned long val;
1543        int i;
1544
1545        if (WARN_ON(!shadow_vmcs))
1546                return;
1547
1548        preempt_disable();
1549
1550        vmcs_load(shadow_vmcs);
1551
1552        for (i = 0; i < max_shadow_read_write_fields; i++) {
1553                field = shadow_read_write_fields[i];
1554                val = __vmcs_readl(field.encoding);
1555                vmcs12_write_any(vmcs12, field.encoding, field.offset, val);
1556        }
1557
1558        vmcs_clear(shadow_vmcs);
1559        vmcs_load(vmx->loaded_vmcs->vmcs);
1560
1561        preempt_enable();
1562}
1563
1564static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
1565{
1566        const struct shadow_vmcs_field *fields[] = {
1567                shadow_read_write_fields,
1568                shadow_read_only_fields
1569        };
1570        const int max_fields[] = {
1571                max_shadow_read_write_fields,
1572                max_shadow_read_only_fields
1573        };
1574        struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
1575        struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
1576        struct shadow_vmcs_field field;
1577        unsigned long val;
1578        int i, q;
1579
1580        if (WARN_ON(!shadow_vmcs))
1581                return;
1582
1583        vmcs_load(shadow_vmcs);
1584
1585        for (q = 0; q < ARRAY_SIZE(fields); q++) {
1586                for (i = 0; i < max_fields[q]; i++) {
1587                        field = fields[q][i];
1588                        val = vmcs12_read_any(vmcs12, field.encoding,
1589                                              field.offset);
1590                        __vmcs_writel(field.encoding, val);
1591                }
1592        }
1593
1594        vmcs_clear(shadow_vmcs);
1595        vmcs_load(vmx->loaded_vmcs->vmcs);
1596}
1597
1598static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields)
1599{
1600        struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
1601        struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
1602
1603        /* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */
1604        vmcs12->tpr_threshold = evmcs->tpr_threshold;
1605        vmcs12->guest_rip = evmcs->guest_rip;
1606
1607        if (unlikely(!(hv_clean_fields &
1608                       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) {
1609                vmcs12->guest_rsp = evmcs->guest_rsp;
1610                vmcs12->guest_rflags = evmcs->guest_rflags;
1611                vmcs12->guest_interruptibility_info =
1612                        evmcs->guest_interruptibility_info;
1613        }
1614
1615        if (unlikely(!(hv_clean_fields &
1616                       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) {
1617                vmcs12->cpu_based_vm_exec_control =
1618                        evmcs->cpu_based_vm_exec_control;
1619        }
1620
1621        if (unlikely(!(hv_clean_fields &
1622                       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) {
1623                vmcs12->exception_bitmap = evmcs->exception_bitmap;
1624        }
1625
1626        if (unlikely(!(hv_clean_fields &
1627                       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) {
1628                vmcs12->vm_entry_controls = evmcs->vm_entry_controls;
1629        }
1630
1631        if (unlikely(!(hv_clean_fields &
1632                       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) {
1633                vmcs12->vm_entry_intr_info_field =
1634                        evmcs->vm_entry_intr_info_field;
1635                vmcs12->vm_entry_exception_error_code =
1636                        evmcs->vm_entry_exception_error_code;
1637                vmcs12->vm_entry_instruction_len =
1638                        evmcs->vm_entry_instruction_len;
1639        }
1640
1641        if (unlikely(!(hv_clean_fields &
1642                       HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) {
1643                vmcs12->host_ia32_pat = evmcs->host_ia32_pat;
1644                vmcs12->host_ia32_efer = evmcs->host_ia32_efer;
1645                vmcs12->host_cr0 = evmcs->host_cr0;
1646                vmcs12->host_cr3 = evmcs->host_cr3;
1647                vmcs12->host_cr4 = evmcs->host_cr4;
1648                vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp;
1649                vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip;
1650                vmcs12->host_rip = evmcs->host_rip;
1651                vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs;
1652                vmcs12->host_es_selector = evmcs->host_es_selector;
1653                vmcs12->host_cs_selector = evmcs->host_cs_selector;
1654                vmcs12->host_ss_selector = evmcs->host_ss_selector;
1655                vmcs12->host_ds_selector = evmcs->host_ds_selector;
1656                vmcs12->host_fs_selector = evmcs->host_fs_selector;
1657                vmcs12->host_gs_selector = evmcs->host_gs_selector;
1658                vmcs12->host_tr_selector = evmcs->host_tr_selector;
1659        }
1660
1661        if (unlikely(!(hv_clean_fields &
1662                       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) {
1663                vmcs12->pin_based_vm_exec_control =
1664                        evmcs->pin_based_vm_exec_control;
1665                vmcs12->vm_exit_controls = evmcs->vm_exit_controls;
1666                vmcs12->secondary_vm_exec_control =
1667                        evmcs->secondary_vm_exec_control;
1668        }
1669
1670        if (unlikely(!(hv_clean_fields &
1671                       HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) {
1672                vmcs12->io_bitmap_a = evmcs->io_bitmap_a;
1673                vmcs12->io_bitmap_b = evmcs->io_bitmap_b;
1674        }
1675
1676        if (unlikely(!(hv_clean_fields &
1677                       HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) {
1678                vmcs12->msr_bitmap = evmcs->msr_bitmap;
1679        }
1680
1681        if (unlikely(!(hv_clean_fields &
1682                       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) {
1683                vmcs12->guest_es_base = evmcs->guest_es_base;
1684                vmcs12->guest_cs_base = evmcs->guest_cs_base;
1685                vmcs12->guest_ss_base = evmcs->guest_ss_base;
1686                vmcs12->guest_ds_base = evmcs->guest_ds_base;
1687                vmcs12->guest_fs_base = evmcs->guest_fs_base;
1688                vmcs12->guest_gs_base = evmcs->guest_gs_base;
1689                vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base;
1690                vmcs12->guest_tr_base = evmcs->guest_tr_base;
1691                vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base;
1692                vmcs12->guest_idtr_base = evmcs->guest_idtr_base;
1693                vmcs12->guest_es_limit = evmcs->guest_es_limit;
1694                vmcs12->guest_cs_limit = evmcs->guest_cs_limit;
1695                vmcs12->guest_ss_limit = evmcs->guest_ss_limit;
1696                vmcs12->guest_ds_limit = evmcs->guest_ds_limit;
1697                vmcs12->guest_fs_limit = evmcs->guest_fs_limit;
1698                vmcs12->guest_gs_limit = evmcs->guest_gs_limit;
1699                vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit;
1700                vmcs12->guest_tr_limit = evmcs->guest_tr_limit;
1701                vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit;
1702                vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit;
1703                vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes;
1704                vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes;
1705                vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes;
1706                vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes;
1707                vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes;
1708                vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes;
1709                vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes;
1710                vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes;
1711                vmcs12->guest_es_selector = evmcs->guest_es_selector;
1712                vmcs12->guest_cs_selector = evmcs->guest_cs_selector;
1713                vmcs12->guest_ss_selector = evmcs->guest_ss_selector;
1714                vmcs12->guest_ds_selector = evmcs->guest_ds_selector;
1715                vmcs12->guest_fs_selector = evmcs->guest_fs_selector;
1716                vmcs12->guest_gs_selector = evmcs->guest_gs_selector;
1717                vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector;
1718                vmcs12->guest_tr_selector = evmcs->guest_tr_selector;
1719        }
1720
1721        if (unlikely(!(hv_clean_fields &
1722                       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) {
1723                vmcs12->tsc_offset = evmcs->tsc_offset;
1724                vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr;
1725                vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap;
1726        }
1727
1728        if (unlikely(!(hv_clean_fields &
1729                       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) {
1730                vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask;
1731                vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask;
1732                vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow;
1733                vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow;
1734                vmcs12->guest_cr0 = evmcs->guest_cr0;
1735                vmcs12->guest_cr3 = evmcs->guest_cr3;
1736                vmcs12->guest_cr4 = evmcs->guest_cr4;
1737                vmcs12->guest_dr7 = evmcs->guest_dr7;
1738        }
1739
1740        if (unlikely(!(hv_clean_fields &
1741                       HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) {
1742                vmcs12->host_fs_base = evmcs->host_fs_base;
1743                vmcs12->host_gs_base = evmcs->host_gs_base;
1744                vmcs12->host_tr_base = evmcs->host_tr_base;
1745                vmcs12->host_gdtr_base = evmcs->host_gdtr_base;
1746                vmcs12->host_idtr_base = evmcs->host_idtr_base;
1747                vmcs12->host_rsp = evmcs->host_rsp;
1748        }
1749
1750        if (unlikely(!(hv_clean_fields &
1751                       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) {
1752                vmcs12->ept_pointer = evmcs->ept_pointer;
1753                vmcs12->virtual_processor_id = evmcs->virtual_processor_id;
1754        }
1755
1756        if (unlikely(!(hv_clean_fields &
1757                       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) {
1758                vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer;
1759                vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl;
1760                vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat;
1761                vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer;
1762                vmcs12->guest_pdptr0 = evmcs->guest_pdptr0;
1763                vmcs12->guest_pdptr1 = evmcs->guest_pdptr1;
1764                vmcs12->guest_pdptr2 = evmcs->guest_pdptr2;
1765                vmcs12->guest_pdptr3 = evmcs->guest_pdptr3;
1766                vmcs12->guest_pending_dbg_exceptions =
1767                        evmcs->guest_pending_dbg_exceptions;
1768                vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp;
1769                vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip;
1770                vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs;
1771                vmcs12->guest_activity_state = evmcs->guest_activity_state;
1772                vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs;
1773        }
1774
1775        /*
1776         * Not used?
1777         * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr;
1778         * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr;
1779         * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr;
1780         * vmcs12->page_fault_error_code_mask =
1781         *              evmcs->page_fault_error_code_mask;
1782         * vmcs12->page_fault_error_code_match =
1783         *              evmcs->page_fault_error_code_match;
1784         * vmcs12->cr3_target_count = evmcs->cr3_target_count;
1785         * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count;
1786         * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count;
1787         * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count;
1788         */
1789
1790        /*
1791         * Read only fields:
1792         * vmcs12->guest_physical_address = evmcs->guest_physical_address;
1793         * vmcs12->vm_instruction_error = evmcs->vm_instruction_error;
1794         * vmcs12->vm_exit_reason = evmcs->vm_exit_reason;
1795         * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info;
1796         * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code;
1797         * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field;
1798         * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code;
1799         * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len;
1800         * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info;
1801         * vmcs12->exit_qualification = evmcs->exit_qualification;
1802         * vmcs12->guest_linear_address = evmcs->guest_linear_address;
1803         *
1804         * Not present in struct vmcs12:
1805         * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx;
1806         * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi;
1807         * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi;
1808         * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip;
1809         */
1810
1811        return;
1812}
1813
1814static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
1815{
1816        struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
1817        struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
1818
1819        /*
1820         * Should not be changed by KVM:
1821         *
1822         * evmcs->host_es_selector = vmcs12->host_es_selector;
1823         * evmcs->host_cs_selector = vmcs12->host_cs_selector;
1824         * evmcs->host_ss_selector = vmcs12->host_ss_selector;
1825         * evmcs->host_ds_selector = vmcs12->host_ds_selector;
1826         * evmcs->host_fs_selector = vmcs12->host_fs_selector;
1827         * evmcs->host_gs_selector = vmcs12->host_gs_selector;
1828         * evmcs->host_tr_selector = vmcs12->host_tr_selector;
1829         * evmcs->host_ia32_pat = vmcs12->host_ia32_pat;
1830         * evmcs->host_ia32_efer = vmcs12->host_ia32_efer;
1831         * evmcs->host_cr0 = vmcs12->host_cr0;
1832         * evmcs->host_cr3 = vmcs12->host_cr3;
1833         * evmcs->host_cr4 = vmcs12->host_cr4;
1834         * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp;
1835         * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip;
1836         * evmcs->host_rip = vmcs12->host_rip;
1837         * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs;
1838         * evmcs->host_fs_base = vmcs12->host_fs_base;
1839         * evmcs->host_gs_base = vmcs12->host_gs_base;
1840         * evmcs->host_tr_base = vmcs12->host_tr_base;
1841         * evmcs->host_gdtr_base = vmcs12->host_gdtr_base;
1842         * evmcs->host_idtr_base = vmcs12->host_idtr_base;
1843         * evmcs->host_rsp = vmcs12->host_rsp;
1844         * sync_vmcs02_to_vmcs12() doesn't read these:
1845         * evmcs->io_bitmap_a = vmcs12->io_bitmap_a;
1846         * evmcs->io_bitmap_b = vmcs12->io_bitmap_b;
1847         * evmcs->msr_bitmap = vmcs12->msr_bitmap;
1848         * evmcs->ept_pointer = vmcs12->ept_pointer;
1849         * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap;
1850         * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr;
1851         * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr;
1852         * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr;
1853         * evmcs->tpr_threshold = vmcs12->tpr_threshold;
1854         * evmcs->virtual_processor_id = vmcs12->virtual_processor_id;
1855         * evmcs->exception_bitmap = vmcs12->exception_bitmap;
1856         * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer;
1857         * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control;
1858         * evmcs->vm_exit_controls = vmcs12->vm_exit_controls;
1859         * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control;
1860         * evmcs->page_fault_error_code_mask =
1861         *              vmcs12->page_fault_error_code_mask;
1862         * evmcs->page_fault_error_code_match =
1863         *              vmcs12->page_fault_error_code_match;
1864         * evmcs->cr3_target_count = vmcs12->cr3_target_count;
1865         * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr;
1866         * evmcs->tsc_offset = vmcs12->tsc_offset;
1867         * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl;
1868         * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask;
1869         * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask;
1870         * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow;
1871         * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow;
1872         * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count;
1873         * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count;
1874         * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count;
1875         *
1876         * Not present in struct vmcs12:
1877         * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx;
1878         * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi;
1879         * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi;
1880         * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip;
1881         */
1882
1883        evmcs->guest_es_selector = vmcs12->guest_es_selector;
1884        evmcs->guest_cs_selector = vmcs12->guest_cs_selector;
1885        evmcs->guest_ss_selector = vmcs12->guest_ss_selector;
1886        evmcs->guest_ds_selector = vmcs12->guest_ds_selector;
1887        evmcs->guest_fs_selector = vmcs12->guest_fs_selector;
1888        evmcs->guest_gs_selector = vmcs12->guest_gs_selector;
1889        evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector;
1890        evmcs->guest_tr_selector = vmcs12->guest_tr_selector;
1891
1892        evmcs->guest_es_limit = vmcs12->guest_es_limit;
1893        evmcs->guest_cs_limit = vmcs12->guest_cs_limit;
1894        evmcs->guest_ss_limit = vmcs12->guest_ss_limit;
1895        evmcs->guest_ds_limit = vmcs12->guest_ds_limit;
1896        evmcs->guest_fs_limit = vmcs12->guest_fs_limit;
1897        evmcs->guest_gs_limit = vmcs12->guest_gs_limit;
1898        evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit;
1899        evmcs->guest_tr_limit = vmcs12->guest_tr_limit;
1900        evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit;
1901        evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit;
1902
1903        evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes;
1904        evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes;
1905        evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes;
1906        evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes;
1907        evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes;
1908        evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes;
1909        evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes;
1910        evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes;
1911
1912        evmcs->guest_es_base = vmcs12->guest_es_base;
1913        evmcs->guest_cs_base = vmcs12->guest_cs_base;
1914        evmcs->guest_ss_base = vmcs12->guest_ss_base;
1915        evmcs->guest_ds_base = vmcs12->guest_ds_base;
1916        evmcs->guest_fs_base = vmcs12->guest_fs_base;
1917        evmcs->guest_gs_base = vmcs12->guest_gs_base;
1918        evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base;
1919        evmcs->guest_tr_base = vmcs12->guest_tr_base;
1920        evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base;
1921        evmcs->guest_idtr_base = vmcs12->guest_idtr_base;
1922
1923        evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat;
1924        evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer;
1925
1926        evmcs->guest_pdptr0 = vmcs12->guest_pdptr0;
1927        evmcs->guest_pdptr1 = vmcs12->guest_pdptr1;
1928        evmcs->guest_pdptr2 = vmcs12->guest_pdptr2;
1929        evmcs->guest_pdptr3 = vmcs12->guest_pdptr3;
1930
1931        evmcs->guest_pending_dbg_exceptions =
1932                vmcs12->guest_pending_dbg_exceptions;
1933        evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp;
1934        evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip;
1935
1936        evmcs->guest_activity_state = vmcs12->guest_activity_state;
1937        evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs;
1938
1939        evmcs->guest_cr0 = vmcs12->guest_cr0;
1940        evmcs->guest_cr3 = vmcs12->guest_cr3;
1941        evmcs->guest_cr4 = vmcs12->guest_cr4;
1942        evmcs->guest_dr7 = vmcs12->guest_dr7;
1943
1944        evmcs->guest_physical_address = vmcs12->guest_physical_address;
1945
1946        evmcs->vm_instruction_error = vmcs12->vm_instruction_error;
1947        evmcs->vm_exit_reason = vmcs12->vm_exit_reason;
1948        evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info;
1949        evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code;
1950        evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field;
1951        evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code;
1952        evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len;
1953        evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info;
1954
1955        evmcs->exit_qualification = vmcs12->exit_qualification;
1956
1957        evmcs->guest_linear_address = vmcs12->guest_linear_address;
1958        evmcs->guest_rsp = vmcs12->guest_rsp;
1959        evmcs->guest_rflags = vmcs12->guest_rflags;
1960
1961        evmcs->guest_interruptibility_info =
1962                vmcs12->guest_interruptibility_info;
1963        evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control;
1964        evmcs->vm_entry_controls = vmcs12->vm_entry_controls;
1965        evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field;
1966        evmcs->vm_entry_exception_error_code =
1967                vmcs12->vm_entry_exception_error_code;
1968        evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len;
1969
1970        evmcs->guest_rip = vmcs12->guest_rip;
1971
1972        evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs;
1973
1974        return;
1975}
1976
1977/*
1978 * This is an equivalent of the nested hypervisor executing the vmptrld
1979 * instruction.
1980 */
1981static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
1982        struct kvm_vcpu *vcpu, bool from_launch)
1983{
1984        struct vcpu_vmx *vmx = to_vmx(vcpu);
1985        bool evmcs_gpa_changed = false;
1986        u64 evmcs_gpa;
1987
1988        if (likely(!vmx->nested.enlightened_vmcs_enabled))
1989                return EVMPTRLD_DISABLED;
1990
1991        if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa)) {
1992                nested_release_evmcs(vcpu);
1993                return EVMPTRLD_DISABLED;
1994        }
1995
1996        if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
1997                vmx->nested.current_vmptr = -1ull;
1998
1999                nested_release_evmcs(vcpu);
2000
2001                if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa),
2002                                 &vmx->nested.hv_evmcs_map))
2003                        return EVMPTRLD_ERROR;
2004
2005                vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva;
2006
2007                /*
2008                 * Currently, KVM only supports eVMCS version 1
2009                 * (== KVM_EVMCS_VERSION), so the guest is expected to set the
2010                 * first u32 field of the eVMCS, which specifies the eVMCS
2011                 * VersionNumber, to that value.
2012                 *
2013                 * The guest should learn the eVMCS versions supported by the
2014                 * host by examining CPUID.0x4000000A.EAX[0:15]. The host
2015                 * userspace VMM is expected to set this CPUID leaf according
2016                 * to the value returned in vmcs_version from nested_enable_evmcs().
2017                 *
2018                 * However, it turns out that Microsoft Hyper-V fails to comply
2019                 * with its own invented interface: when Hyper-V uses eVMCS, it
2020                 * sets the first u32 field of the eVMCS to the revision_id
2021                 * specified in MSR_IA32_VMX_BASIC instead of to the eVMCS
2022                 * version number, which should be one of the supported versions
2023                 * specified in CPUID.0x4000000A.EAX[0:15].
2024                 *
2025                 * To work around this Hyper-V bug, accept either a supported
2026                 * eVMCS version or the VMCS12 revision_id as valid values for
2027                 * the first u32 field of the eVMCS.
2028                 */
2029                if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) &&
2030                    (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) {
2031                        nested_release_evmcs(vcpu);
2032                        return EVMPTRLD_VMFAIL;
2033                }
2034
2035                vmx->nested.hv_evmcs_vmptr = evmcs_gpa;
2036
2037                evmcs_gpa_changed = true;
2038                /*
2039                 * Unlike a normal vmcs12, an enlightened vmcs12 is not fully
2040                 * reloaded from the guest's memory (read-only fields, fields not
2041                 * present in struct hv_enlightened_vmcs, ...). Make sure there
2042                 * are no leftovers.
2043                 */
2044                if (from_launch) {
2045                        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
2046                        memset(vmcs12, 0, sizeof(*vmcs12));
2047                        vmcs12->hdr.revision_id = VMCS12_REVISION;
2048                }
2049
2050        }
2051
2052        /*
2053         * Clean-fields data can't be used on VMLAUNCH, or when we switch
2054         * between different L2 guests, as KVM keeps a single VMCS12 per L1.
2055         */
2056        if (from_launch || evmcs_gpa_changed)
2057                vmx->nested.hv_evmcs->hv_clean_fields &=
2058                        ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
2059
2060        return EVMPTRLD_SUCCEEDED;
2061}
2062
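/*
 * Illustrative sketch of the clean-fields protocol relied on above, as seen
 * from the L1 side (not KVM code; which group a field belongs to follows
 * copy_enlightened_to_vmcs12()): L1 clears a group's clean bit whenever it
 * modifies a field in that group, e.g.:
 *
 *	evmcs->exception_bitmap |= (1u << DB_VECTOR);
 *	evmcs->hv_clean_fields  &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN;
 *
 * and KVM only re-reads the groups whose clean bit is not set.
 */
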
2063void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu)
2064{
2065        struct vcpu_vmx *vmx = to_vmx(vcpu);
2066
2067        if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
2068                copy_vmcs12_to_enlightened(vmx);
2069        else
2070                copy_vmcs12_to_shadow(vmx);
2071
2072        vmx->nested.need_vmcs12_to_shadow_sync = false;
2073}
2074
2075static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
2076{
2077        struct vcpu_vmx *vmx =
2078                container_of(timer, struct vcpu_vmx, nested.preemption_timer);
2079
2080        vmx->nested.preemption_timer_expired = true;
2081        kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
2082        kvm_vcpu_kick(&vmx->vcpu);
2083
2084        return HRTIMER_NORESTART;
2085}
2086
2087static u64 vmx_calc_preemption_timer_value(struct kvm_vcpu *vcpu)
2088{
2089        struct vcpu_vmx *vmx = to_vmx(vcpu);
2090        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
2091
2092        u64 l1_scaled_tsc = kvm_read_l1_tsc(vcpu, rdtsc()) >>
2093                            VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
2094
2095        if (!vmx->nested.has_preemption_timer_deadline) {
2096                vmx->nested.preemption_timer_deadline =
2097                        vmcs12->vmx_preemption_timer_value + l1_scaled_tsc;
2098                vmx->nested.has_preemption_timer_deadline = true;
2099        }
2100        return vmx->nested.preemption_timer_deadline - l1_scaled_tsc;
2101}
2102
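/*
 * Worked example, illustrative only: on the first call, with
 * l1_scaled_tsc = 100 and vmx_preemption_timer_value = 50, the deadline is
 * latched at 150 and the full 50 is returned.  If the value is recomputed
 * later (e.g. after saving/restoring nested state) when l1_scaled_tsc has
 * advanced to 120, only the remaining 30 is returned.
 */
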
2103static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu,
2104                                        u64 preemption_timeout)
2105{
2106        struct vcpu_vmx *vmx = to_vmx(vcpu);
2107
2108        /*
2109         * A timer value of zero is architecturally guaranteed to cause
2110         * a VMExit prior to executing any instructions in the guest.
2111         */
2112        if (preemption_timeout == 0) {
2113                vmx_preemption_timer_fn(&vmx->nested.preemption_timer);
2114                return;
2115        }
2116
2117        if (vcpu->arch.virtual_tsc_khz == 0)
2118                return;
2119
2120        preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
2121        preemption_timeout *= 1000000;
2122        do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz);
2123        hrtimer_start(&vmx->nested.preemption_timer,
2124                      ktime_add_ns(ktime_get(), preemption_timeout),
2125                      HRTIMER_MODE_ABS_PINNED);
2126}
2127
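/*
 * Worked example, illustrative only: the timer is programmed in units of
 * 2^VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE = 32 TSC cycles, so with
 * virtual_tsc_khz = 2000000 (a 2 GHz guest TSC) and a timer value of 1000:
 *
 *	cycles = 1000 << 5                  = 32000
 *	ns     = 32000 * 1000000 / 2000000  = 16000
 *
 * i.e. the hrtimer above is armed to fire ~16us from now.
 */
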
2128static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
2129{
2130        if (vmx->nested.nested_run_pending &&
2131            (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER))
2132                return vmcs12->guest_ia32_efer;
2133        else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
2134                return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME);
2135        else
2136                return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME);
2137}
2138
2139static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
2140{
2141        /*
2142         * If vmcs02 hasn't been initialized, set the constant vmcs02 state
2143         * according to L0's settings (vmcs12 is irrelevant here).  Host
2144         * fields that come from L0 and are not constant, e.g. HOST_CR3,
2145         * will be set as needed prior to VMLAUNCH/VMRESUME.
2146         */
2147        if (vmx->nested.vmcs02_initialized)
2148                return;
2149        vmx->nested.vmcs02_initialized = true;
2150
2151        /*
2152         * We don't care what the EPTP value is; we just need to guarantee
2153         * it's valid so we don't get a false positive when doing early
2154         * consistency checks.
2155         */
2156        if (enable_ept && nested_early_check)
2157                vmcs_write64(EPT_POINTER,
2158                             construct_eptp(&vmx->vcpu, 0, PT64_ROOT_4LEVEL));
2159
2160        /* All VMFUNCs are currently emulated through L0 vmexits.  */
2161        if (cpu_has_vmx_vmfunc())
2162                vmcs_write64(VM_FUNCTION_CONTROL, 0);
2163
2164        if (cpu_has_vmx_posted_intr())
2165                vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);
2166
2167        if (cpu_has_vmx_msr_bitmap())
2168                vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
2169
2170        /*
2171         * PML is emulated for L2, but never enabled in hardware as the MMU
2172         * handles A/D emulation.  Disabling PML for L2 also avoids having to
2173         * deal with filtering out L2 GPAs from the buffer.
2174         */
2175        if (enable_pml) {
2176                vmcs_write64(PML_ADDRESS, 0);
2177                vmcs_write16(GUEST_PML_INDEX, -1);
2178        }
2179
2180        if (cpu_has_vmx_encls_vmexit())
2181                vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
2182
2183        /*
2184         * Set the MSR load/store lists to match L0's settings.  Only the
2185         * addresses are constant (for vmcs02), the counts can change based
2186         * on L2's behavior, e.g. switching to/from long mode.
2187         */
2188        vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest.val));
2189        vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
2190        vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
2191
2192        vmx_set_constant_host_state(vmx);
2193}
2194
2195static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx,
2196                                      struct vmcs12 *vmcs12)
2197{
2198        prepare_vmcs02_constant_state(vmx);
2199
2200        vmcs_write64(VMCS_LINK_POINTER, -1ull);
2201
2202        if (enable_vpid) {
2203                if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
2204                        vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
2205                else
2206                        vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
2207        }
2208}
2209
2210static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs01,
2211                                 struct vmcs12 *vmcs12)
2212{
2213        u32 exec_control;
2214        u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12);
2215
2216        if (vmx->nested.dirty_vmcs12 || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
2217                prepare_vmcs02_early_rare(vmx, vmcs12);
2218
2219        /*
2220         * PIN CONTROLS
2221         */
2222        exec_control = __pin_controls_get(vmcs01);
2223        exec_control |= (vmcs12->pin_based_vm_exec_control &
2224                         ~PIN_BASED_VMX_PREEMPTION_TIMER);
2225
2226        /* Posted interrupts setting is only taken from vmcs12.  */
2227        vmx->nested.pi_pending = false;
2228        if (nested_cpu_has_posted_intr(vmcs12))
2229                vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
2230        else
2231                exec_control &= ~PIN_BASED_POSTED_INTR;
2232        pin_controls_set(vmx, exec_control);
2233
2234        /*
2235         * EXEC CONTROLS
2236         */
2237        exec_control = __exec_controls_get(vmcs01); /* L0's desires */
2238        exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING;
2239        exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING;
2240        exec_control &= ~CPU_BASED_TPR_SHADOW;
2241        exec_control |= vmcs12->cpu_based_vm_exec_control;
2242
2243        vmx->nested.l1_tpr_threshold = -1;
2244        if (exec_control & CPU_BASED_TPR_SHADOW)
2245                vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
2246#ifdef CONFIG_X86_64
2247        else
2248                exec_control |= CPU_BASED_CR8_LOAD_EXITING |
2249                                CPU_BASED_CR8_STORE_EXITING;
2250#endif
2251
2252        /*
2253         * A vmexit (to either L1 hypervisor or L0 userspace) is always needed
2254         * for I/O port accesses.
2255         */
2256        exec_control |= CPU_BASED_UNCOND_IO_EXITING;
2257        exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
2258
2259        /*
2260         * This bit will be computed in nested_get_vmcs12_pages, because
2261         * we do not have access to L1's MSR bitmap yet.  For now, keep
2262         * the same bit as before, hoping to avoid multiple VMWRITEs that
2263         * only set/clear this bit.
2264         */
2265        exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
2266        exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS;
2267
2268        exec_controls_set(vmx, exec_control);
2269
2270        /*
2271         * SECONDARY EXEC CONTROLS
2272         */
2273        if (cpu_has_secondary_exec_ctrls()) {
2274                exec_control = __secondary_exec_controls_get(vmcs01);
2275
2276                /* Take the following fields only from vmcs12 */
2277                exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
2278                                  SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
2279                                  SECONDARY_EXEC_ENABLE_INVPCID |
2280                                  SECONDARY_EXEC_ENABLE_RDTSCP |
2281                                  SECONDARY_EXEC_XSAVES |
2282                                  SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
2283                                  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
2284                                  SECONDARY_EXEC_APIC_REGISTER_VIRT |
2285                                  SECONDARY_EXEC_ENABLE_VMFUNC |
2286                                  SECONDARY_EXEC_TSC_SCALING |
2287                                  SECONDARY_EXEC_DESC);
2288
2289                if (nested_cpu_has(vmcs12,
2290                                   CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
2291                        exec_control |= vmcs12->secondary_vm_exec_control;
2292
2293                /* PML is emulated and never enabled in hardware for L2. */
2294                exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
2295
2296                /* VMCS shadowing for L2 is emulated for now */
2297                exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
2298
2299                /*
2300                 * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4()
2301                 * will not have to rewrite the controls just for this bit.
2302                 */
2303                if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated() &&
2304                    (vmcs12->guest_cr4 & X86_CR4_UMIP))
2305                        exec_control |= SECONDARY_EXEC_DESC;
2306
2307                if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
2308                        vmcs_write16(GUEST_INTR_STATUS,
2309                                vmcs12->guest_intr_status);
2310
2311                if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
2312                        exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
2313
2314                if (exec_control & SECONDARY_EXEC_ENCLS_EXITING)
2315                        vmx_write_encls_bitmap(&vmx->vcpu, vmcs12);
2316
2317                secondary_exec_controls_set(vmx, exec_control);
2318        }
2319
2320        /*
2321         * ENTRY CONTROLS
2322         *
2323         * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE
2324         * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate
2325         * on the related bits (if supported by the CPU) in the hope that
2326         * we can avoid VMWrites during vmx_set_efer().
2327         */
2328        exec_control = __vm_entry_controls_get(vmcs01);
2329        exec_control |= vmcs12->vm_entry_controls;
2330        exec_control &= ~(VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER);
2331        if (cpu_has_load_ia32_efer()) {
2332                if (guest_efer & EFER_LMA)
2333                        exec_control |= VM_ENTRY_IA32E_MODE;
2334                if (guest_efer != host_efer)
2335                        exec_control |= VM_ENTRY_LOAD_IA32_EFER;
2336        }
2337        vm_entry_controls_set(vmx, exec_control);
2338
2339        /*
2340         * EXIT CONTROLS
2341         *
2342         * L2->L1 exit controls are emulated - the hardware exit is to L0 so
2343         * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
2344         * bits may be modified by vmx_set_efer() in prepare_vmcs02().
2345         */
2346        exec_control = __vm_exit_controls_get(vmcs01);
2347        if (cpu_has_load_ia32_efer() && guest_efer != host_efer)
2348                exec_control |= VM_EXIT_LOAD_IA32_EFER;
2349        else
2350                exec_control &= ~VM_EXIT_LOAD_IA32_EFER;
2351        vm_exit_controls_set(vmx, exec_control);
2352
2353        /*
2354         * Interrupt/Exception Fields
2355         */
2356        if (vmx->nested.nested_run_pending) {
2357                vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2358                             vmcs12->vm_entry_intr_info_field);
2359                vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
2360                             vmcs12->vm_entry_exception_error_code);
2361                vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
2362                             vmcs12->vm_entry_instruction_len);
2363                vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
2364                             vmcs12->guest_interruptibility_info);
2365                vmx->loaded_vmcs->nmi_known_unmasked =
2366                        !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI);
2367        } else {
2368                vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
2369        }
2370}
2371
2372static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
2373{
2374        struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;
2375
2376        if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
2377                           HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) {
2378                vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
2379                vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
2380                vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
2381                vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
2382                vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
2383                vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector);
2384                vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
2385                vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
2386                vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
2387                vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
2388                vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
2389                vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
2390                vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
2391                vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit);
2392                vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit);
2393                vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
2394                vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
2395                vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
2396                vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
2397                vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
2398                vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
2399                vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
2400                vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
2401                vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
2402                vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
2403                vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
2404                vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
2405                vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
2406                vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
2407                vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
2408                vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
2409                vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base);
2410                vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base);
2411                vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
2412                vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
2413                vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
2414
2415                vmx->segment_cache.bitmask = 0;
2416        }
2417
2418        if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
2419                           HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) {
2420                vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
2421                vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
2422                            vmcs12->guest_pending_dbg_exceptions);
2423                vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
2424                vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
2425
2426                /*
2427                 * L1 may access L2's PDPTRs, so save them in order to
2428                 * construct vmcs12.
2429                 */
2430                if (enable_ept) {
2431                        vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
2432                        vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
2433                        vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
2434                        vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
2435                }
2436
2437                if (kvm_mpx_supported() && vmx->nested.nested_run_pending &&
2438                    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
2439                        vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
2440        }
2441
2442        if (nested_cpu_has_xsaves(vmcs12))
2443                vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);
2444
2445        /*
2446         * Whether page faults are trapped is determined by a combination of
2447         * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.  If L0
2448         * doesn't care about page faults, we should set all of these to L1's
2449         * desires. However, if L0 does care about (some) page faults, it is
2450         * not easy (if at all possible?) to merge L0's and L1's desires, so
2451         * we simply ask to exit on each and every L2 page fault. This is done
2452         * by setting MASK=MATCH=0 and (see below) EB.PF=1.
2453         * Note that below we don't need special code to set EB.PF beyond the
2454         * "or"ing of the EBs of vmcs01 and vmcs12, because when enable_ept,
2455         * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
2456         * !enable_ept, EB.PF is 1, so the "or" will always be 1.
2457         */
2458        if (vmx_need_pf_intercept(&vmx->vcpu)) {
2459                /*
2460                 * TODO: if both L0 and L1 need the same MASK and MATCH,
2461                 * go ahead and use it?
2462                 */
2463                vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
2464                vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
2465        } else {
2466                vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, vmcs12->page_fault_error_code_mask);
2467                vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, vmcs12->page_fault_error_code_match);
2468        }
2469
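        /*
         * For reference (paraphrasing the SDM, not KVM code): a page fault
         * causes a VM-exit iff
         *
         *	EB.PF == ((PFEC & PFEC_MASK) == PFEC_MATCH)
         *
         * so MASK = MATCH = 0 makes the comparison always true and, combined
         * with EB.PF = 1, forces every L2 #PF to exit as the comment above
         * intends.
         */
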
2470        if (cpu_has_vmx_apicv()) {
2471                vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0);
2472                vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1);
2473                vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2);
2474                vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3);
2475        }
2476
2477        /*
2478         * Make sure the msr_autostore list is up to date before we set the
2479         * count in the vmcs02.
2480         */
2481        prepare_vmx_msr_autostore_list(&vmx->vcpu, MSR_IA32_TSC);
2482
2483        vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.guest.nr);
2484        vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
2485        vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
2486
2487        set_cr4_guest_host_mask(vmx);
2488}
2489
2490/*
2491 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
2492 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
2493 * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
2494 * guest in a way that will be appropriate to both L1's requests and our
2495 * needs. In addition to modifying the active vmcs (which is vmcs02), this
2496 * function also has necessary side effects, such as setting various
2497 * vcpu->arch fields.
2498 * Returns 0 on success and a negative error code on failure; the invalid-state
2499 * exit qualification code is assigned to *entry_failure_code on failure.
2500 */
2501static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
2502                          bool from_vmentry,
2503                          enum vm_entry_failure_code *entry_failure_code)
2504{
2505        struct vcpu_vmx *vmx = to_vmx(vcpu);
2506        bool load_guest_pdptrs_vmcs12 = false;
2507
2508        if (vmx->nested.dirty_vmcs12 || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
2509                prepare_vmcs02_rare(vmx, vmcs12);
2510                vmx->nested.dirty_vmcs12 = false;
2511
2512                load_guest_pdptrs_vmcs12 = !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) ||
2513                        !(vmx->nested.hv_evmcs->hv_clean_fields &
2514                          HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
2515        }
2516
2517        if (vmx->nested.nested_run_pending &&
2518            (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
2519                kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
2520                vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
2521        } else {
2522                kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
2523                vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl);
2524        }
2525        if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
2526            !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
2527                vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
2528        vmx_set_rflags(vcpu, vmcs12->guest_rflags);
2529
2530        /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
2531         * bitwise-or of what L1 wants to trap for L2, and what we want to
2532         * trap. Note that CR0.TS also needs updating - we do this later.
2533         */
2534        vmx_update_exception_bitmap(vcpu);
2535        vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
2536        vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
2537
2538        if (vmx->nested.nested_run_pending &&
2539            (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) {
2540                vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
2541                vcpu->arch.pat = vmcs12->guest_ia32_pat;
2542        } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
2543                vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
2544        }
2545
2546        vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
2547                        vcpu->arch.l1_tsc_offset,
2548                        vmx_get_l2_tsc_offset(vcpu),
2549                        vmx_get_l2_tsc_multiplier(vcpu));
2550
2551        vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier(
2552                        vcpu->arch.l1_tsc_scaling_ratio,
2553                        vmx_get_l2_tsc_multiplier(vcpu));
2554
2555        vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
2556        if (kvm_has_tsc_control)
2557                vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
2558
2559        nested_vmx_transition_tlb_flush(vcpu, vmcs12, true);
2560
2561        if (nested_cpu_has_ept(vmcs12))
2562                nested_ept_init_mmu_context(vcpu);
2563
2564        /*
2565         * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those
2566         * bits which we consider mandatory to be enabled.
2567         * The CR0_READ_SHADOW is what L2 should have expected to read given
2568         * the specifications by L1; it's not enough to take
2569         * vmcs12->cr0_read_shadow because our cr0_guest_host_mask may have
2570         * more bits set than L1 expected.
2571         */
2572        vmx_set_cr0(vcpu, vmcs12->guest_cr0);
2573        vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
2574
2575        vmx_set_cr4(vcpu, vmcs12->guest_cr4);
2576        vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
2577
2578        vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12);
2579        /* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
2580        vmx_set_efer(vcpu, vcpu->arch.efer);
2581
2582        /*
2583         * If guest state is invalid and unrestricted guest is disabled,
2584         * L1 attempted VMEntry to L2 with invalid state.
2585         * Fail the VMEntry.
2586         *
2587         * However, when force loading the guest state (SMM exit or
2588         * loading nested state after migration), it is possible to
2589         * have invalid guest state now, which will later be fixed by
2590         * restoring L2 register state.
2591         */
2592        if (CC(from_vmentry && !vmx_guest_state_valid(vcpu))) {
2593                *entry_failure_code = ENTRY_FAIL_DEFAULT;
2594                return -EINVAL;
2595        }
2596
2597        /* Load vmcs12->guest_cr3, using either EPT or shadow page tables. */
2598        if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
2599                                from_vmentry, entry_failure_code))
2600                return -EINVAL;
2601
2602        /*
2603         * Immediately write vmcs02.GUEST_CR3.  It will be propagated to vmcs12
2604         * on nested VM-Exit, which can occur without actually running L2 and
2605         * thus without hitting vmx_load_mmu_pgd(), e.g. if L1 is entering L2 with
2606         * vmcs12.GUEST_ACTIVITYSTATE=HLT, in which case KVM will intercept the
2607         * transition to HLT instead of running L2.
2608         */
2609        if (enable_ept)
2610                vmcs_writel(GUEST_CR3, vmcs12->guest_cr3);
2611
2612        /* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */
2613        if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) &&
2614            is_pae_paging(vcpu)) {
2615                vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
2616                vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
2617                vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
2618                vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
2619        }
2620
2621        if (!enable_ept)
2622                vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
2623
2624        if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
2625            WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
2626                                     vmcs12->guest_ia32_perf_global_ctrl)))
2627                return -EINVAL;
2628
2629        kvm_rsp_write(vcpu, vmcs12->guest_rsp);
2630        kvm_rip_write(vcpu, vmcs12->guest_rip);
2631
2632        /*
2633         * It was observed that genuine Hyper-V running in L1 doesn't reset
2634         * 'hv_clean_fields' by itself; it only sets the corresponding dirty
2635         * bits when it changes a field in eVMCS. Mark all fields as clean
2636         * here.
2637         */
2638        if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
2639                vmx->nested.hv_evmcs->hv_clean_fields |=
2640                        HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
2641
2642        return 0;
2643}
2644
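/*
 * Consistency checks on the NMI controls: "virtual NMIs" requires "NMI
 * exiting", and "NMI-window exiting" requires "virtual NMIs" (per the SDM's
 * checks on VM-execution control fields).
 */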
2645static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12)
2646{
2647        if (CC(!nested_cpu_has_nmi_exiting(vmcs12) &&
2648               nested_cpu_has_virtual_nmis(vmcs12)))
2649                return -EINVAL;
2650
2651        if (CC(!nested_cpu_has_virtual_nmis(vmcs12) &&
2652               nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING)))
2653                return -EINVAL;
2654
2655        return 0;
2656}
2657
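/*
 * Validate the EPTP that L1 wants to use for L2: memory type, page-walk
 * length, reserved bits, and the accessed/dirty enable bit must all be
 * supported by the EPT capabilities exposed to L1.
 */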
2658static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp)
2659{
2660        struct vcpu_vmx *vmx = to_vmx(vcpu);
2661
2662        /* Check for memory type validity */
2663        switch (new_eptp & VMX_EPTP_MT_MASK) {
2664        case VMX_EPTP_MT_UC:
2665                if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT)))
2666                        return false;
2667                break;
2668        case VMX_EPTP_MT_WB:
2669                if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT)))
2670                        return false;
2671                break;
2672        default:
2673                return false;
2674        }
2675
2676        /* Page-walk levels validity. */
2677        switch (new_eptp & VMX_EPTP_PWL_MASK) {
2678        case VMX_EPTP_PWL_5:
2679                if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_5_BIT)))
2680                        return false;
2681                break;
2682        case VMX_EPTP_PWL_4:
2683                if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_4_BIT)))
2684                        return false;
2685                break;
2686        default:
2687                return false;
2688        }
2689
2690        /* Reserved bits should not be set */
2691        if (CC(kvm_vcpu_is_illegal_gpa(vcpu, new_eptp) || ((new_eptp >> 7) & 0x1f)))
2692                return false;
2693
2694        /* AD, if set, should be supported */
2695        if (new_eptp & VMX_EPTP_AD_ENABLE_BIT) {
2696                if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT)))
2697                        return false;
2698        }
2699
2700        return true;
2701}
2702
2703/*
2704 * Checks related to VM-Execution Control Fields
2705 */
2706static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu,
2707                                              struct vmcs12 *vmcs12)
2708{
2709        struct vcpu_vmx *vmx = to_vmx(vcpu);
2710
2711        if (CC(!vmx_control_verify(vmcs12->pin_based_vm_exec_control,
2712                                   vmx->nested.msrs.pinbased_ctls_low,
2713                                   vmx->nested.msrs.pinbased_ctls_high)) ||
2714            CC(!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
2715                                   vmx->nested.msrs.procbased_ctls_low,
2716                                   vmx->nested.msrs.procbased_ctls_high)))
2717                return -EINVAL;
2718
2719        if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
2720            CC(!vmx_control_verify(vmcs12->secondary_vm_exec_control,
2721                                   vmx->nested.msrs.secondary_ctls_low,
2722                                   vmx->nested.msrs.secondary_ctls_high)))
2723                return -EINVAL;
2724
2725        if (CC(vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) ||
2726            nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) ||
2727            nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) ||
2728            nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) ||
2729            nested_vmx_check_apic_access_controls(vcpu, vmcs12) ||
2730            nested_vmx_check_apicv_controls(vcpu, vmcs12) ||
2731            nested_vmx_check_nmi_controls(vmcs12) ||
2732            nested_vmx_check_pml_controls(vcpu, vmcs12) ||
2733            nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) ||
2734            nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) ||
2735            nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) ||
2736            CC(nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id))
2737                return -EINVAL;
2738
2739        if (!nested_cpu_has_preemption_timer(vmcs12) &&
2740            nested_cpu_has_save_preemption_timer(vmcs12))
2741                return -EINVAL;
2742
2743        if (nested_cpu_has_ept(vmcs12) &&
2744            CC(!nested_vmx_check_eptp(vcpu, vmcs12->ept_pointer)))
2745                return -EINVAL;
2746
2747        if (nested_cpu_has_vmfunc(vmcs12)) {
2748                if (CC(vmcs12->vm_function_control &
2749                       ~vmx->nested.msrs.vmfunc_controls))
2750                        return -EINVAL;
2751
2752                if (nested_cpu_has_eptp_switching(vmcs12)) {
2753                        if (CC(!nested_cpu_has_ept(vmcs12)) ||
2754                            CC(!page_address_valid(vcpu, vmcs12->eptp_list_address)))
2755                                return -EINVAL;
2756                }
2757        }
2758
2759        return 0;
2760}
2761
2762/*
2763 * Checks related to VM-Exit Control Fields
2764 */
2765static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu,
2766                                         struct vmcs12 *vmcs12)
2767{
2768        struct vcpu_vmx *vmx = to_vmx(vcpu);
2769
2770        if (CC(!vmx_control_verify(vmcs12->vm_exit_controls,
2771                                    vmx->nested.msrs.exit_ctls_low,
2772                                    vmx->nested.msrs.exit_ctls_high)) ||
2773            CC(nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12)))
2774                return -EINVAL;
2775
2776        return 0;
2777}
2778
2779/*
2780 * Checks related to VM-Entry Control Fields
2781 */
2782static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu,
2783                                          struct vmcs12 *vmcs12)
2784{
2785        struct vcpu_vmx *vmx = to_vmx(vcpu);
2786
2787        if (CC(!vmx_control_verify(vmcs12->vm_entry_controls,
2788                                    vmx->nested.msrs.entry_ctls_low,
2789                                    vmx->nested.msrs.entry_ctls_high)))
2790                return -EINVAL;
2791
2792        /*
2793         * From the Intel SDM, volume 3:
2794         * Fields relevant to VM-entry event injection must be set properly.
2795         * These fields are the VM-entry interruption-information field, the
2796         * VM-entry exception error code, and the VM-entry instruction length.
2797         */
2798        if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) {
2799                u32 intr_info = vmcs12->vm_entry_intr_info_field;
2800                u8 vector = intr_info & INTR_INFO_VECTOR_MASK;
2801                u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK;
2802                bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK;
2803                bool should_have_error_code;
2804                bool urg = nested_cpu_has2(vmcs12,
2805                                           SECONDARY_EXEC_UNRESTRICTED_GUEST);
2806                bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE;
2807
2808                /* VM-entry interruption-info field: interruption type */
2809                if (CC(intr_type == INTR_TYPE_RESERVED) ||
2810                    CC(intr_type == INTR_TYPE_OTHER_EVENT &&
2811                       !nested_cpu_supports_monitor_trap_flag(vcpu)))
2812                        return -EINVAL;
2813
2814                /* VM-entry interruption-info field: vector */
2815                if (CC(intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) ||
2816                    CC(intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) ||
2817                    CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0))
2818                        return -EINVAL;
2819
2820                /* VM-entry interruption-info field: deliver error code */
2821                should_have_error_code =
2822                        intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode &&
2823                        x86_exception_has_error_code(vector);
2824                if (CC(has_error_code != should_have_error_code))
2825                        return -EINVAL;
2826
2827                /* VM-entry exception error code */
2828                if (CC(has_error_code &&
2829                       vmcs12->vm_entry_exception_error_code & GENMASK(31, 16)))
2830                        return -EINVAL;
2831
2832                /* VM-entry interruption-info field: reserved bits */
2833                if (CC(intr_info & INTR_INFO_RESVD_BITS_MASK))
2834                        return -EINVAL;
2835
2836                /* VM-entry instruction length */
2837                switch (intr_type) {
2838                case INTR_TYPE_SOFT_EXCEPTION:
2839                case INTR_TYPE_SOFT_INTR:
2840                case INTR_TYPE_PRIV_SW_EXCEPTION:
2841                        if (CC(vmcs12->vm_entry_instruction_len > 15) ||
2842                            CC(vmcs12->vm_entry_instruction_len == 0 &&
2843                               !nested_cpu_has_zero_length_injection(vcpu)))
2844                                return -EINVAL;
2845                }
2846        }
2847
2848        if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12))
2849                return -EINVAL;
2850
2851        return 0;
2852}
2853
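/*
 * Top-level consistency checks on vmcs12's control fields, plus the extra
 * restrictions that apply when an enlightened VMCS is enabled.
 */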
2854static int nested_vmx_check_controls(struct kvm_vcpu *vcpu,
2855                                     struct vmcs12 *vmcs12)
2856{
2857        if (nested_check_vm_execution_controls(vcpu, vmcs12) ||
2858            nested_check_vm_exit_controls(vcpu, vmcs12) ||
2859            nested_check_vm_entry_controls(vcpu, vmcs12))
2860                return -EINVAL;
2861
2862        if (to_vmx(vcpu)->nested.enlightened_vmcs_enabled)
2863                return nested_evmcs_check_controls(vmcs12);
2864
2865        return 0;
2866}
2867
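/*
 * Checks on the host-state area of vmcs12, i.e. the state that would be
 * loaded on a VM-exit from L2 to L1: control registers, MSRs, segment
 * selectors and base addresses, and consistency with the "host address-space
 * size" VM-exit control.
 */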
2868static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
2869                                       struct vmcs12 *vmcs12)
2870{
2871        bool ia32e;
2872
2873        if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) ||
2874            CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) ||
2875            CC(kvm_vcpu_is_illegal_gpa(vcpu, vmcs12->host_cr3)))
2876                return -EINVAL;
2877
2878        if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) ||
2879            CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu)))
2880                return -EINVAL;
2881
2882        if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) &&
2883            CC(!kvm_pat_valid(vmcs12->host_ia32_pat)))
2884                return -EINVAL;
2885
2886        if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) &&
2887            CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu),
2888                                           vmcs12->host_ia32_perf_global_ctrl)))
2889                return -EINVAL;
2890
2891#ifdef CONFIG_X86_64
2892        ia32e = !!(vcpu->arch.efer & EFER_LMA);
2893#else
2894        ia32e = false;
2895#endif
2896
2897        if (ia32e) {
2898                if (CC(!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)) ||
2899                    CC(!(vmcs12->host_cr4 & X86_CR4_PAE)))
2900                        return -EINVAL;
2901        } else {
2902                if (CC(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) ||
2903                    CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) ||
2904                    CC(vmcs12->host_cr4 & X86_CR4_PCIDE) ||
2905                    CC((vmcs12->host_rip) >> 32))
2906                        return -EINVAL;
2907        }
2908
2909        if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2910            CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2911            CC(vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2912            CC(vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2913            CC(vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2914            CC(vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2915            CC(vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2916            CC(vmcs12->host_cs_selector == 0) ||
2917            CC(vmcs12->host_tr_selector == 0) ||
2918            CC(vmcs12->host_ss_selector == 0 && !ia32e))
2919                return -EINVAL;
2920
2921        if (CC(is_noncanonical_address(vmcs12->host_fs_base, vcpu)) ||
2922            CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) ||
2923            CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) ||
2924            CC(is_noncanonical_address(vmcs12->host_idtr_base, vcpu)) ||
2925            CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu)) ||
2926            CC(is_noncanonical_address(vmcs12->host_rip, vcpu)))
2927                return -EINVAL;
2928
2929        /*
2930         * If the load IA32_EFER VM-exit control is 1, bits reserved in the
2931         * IA32_EFER MSR must be 0 in the field for that register. In addition,
2932         * the values of the LMA and LME bits in the field must each be that of
2933         * the host address-space size VM-exit control.
2934         */
2935        if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) {
2936                if (CC(!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer)) ||
2937                    CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA)) ||
2938                    CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)))
2939                        return -EINVAL;
2940        }
2941
2942        return 0;
2943}
2944
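/*
 * If vmcs12's VMCS link pointer is not -1ull, it must reference a legal,
 * page-aligned GPA whose revision ID and shadow-VMCS indicator match what
 * VMCS shadowing requires.
 */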
2945static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu,
2946                                          struct vmcs12 *vmcs12)
2947{
2948        int r = 0;
2949        struct vmcs12 *shadow;
2950        struct kvm_host_map map;
2951
2952        if (vmcs12->vmcs_link_pointer == -1ull)
2953                return 0;
2954
2955        if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer)))
2956                return -EINVAL;
2957
2958        if (CC(kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map)))
2959                return -EINVAL;
2960
2961        shadow = map.hva;
2962
2963        if (CC(shadow->hdr.revision_id != VMCS12_REVISION) ||
2964            CC(shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12)))
2965                r = -EINVAL;
2966
2967        kvm_vcpu_unmap(vcpu, &map, false);
2968        return r;
2969}
2970
2971/*
2972 * Checks related to Guest Non-register State
2973 */
2974static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12)
2975{
2976        if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
2977               vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT &&
2978               vmcs12->guest_activity_state != GUEST_ACTIVITY_WAIT_SIPI))
2979                return -EINVAL;
2980
2981        return 0;
2982}
2983
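/*
 * Checks on the guest-state area of vmcs12 that KVM performs in software,
 * e.g. CR0/CR4, DR7, PAT, EFER, BNDCFGS, the VMCS link pointer and the
 * activity state.  On failure, *entry_failure_code tells the caller which
 * VM-entry failure to synthesize.
 */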
2984static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
2985                                        struct vmcs12 *vmcs12,
2986                                        enum vm_entry_failure_code *entry_failure_code)
2987{
2988        bool ia32e;
2989
2990        *entry_failure_code = ENTRY_FAIL_DEFAULT;
2991
2992        if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) ||
2993            CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)))
2994                return -EINVAL;
2995
2996        if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) &&
2997            CC(!kvm_dr7_valid(vmcs12->guest_dr7)))
2998                return -EINVAL;
2999
3000        if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) &&
3001            CC(!kvm_pat_valid(vmcs12->guest_ia32_pat)))
3002                return -EINVAL;
3003
3004        if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) {
3005                *entry_failure_code = ENTRY_FAIL_VMCS_LINK_PTR;
3006                return -EINVAL;
3007        }
3008
3009        if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
3010            CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu),
3011                                           vmcs12->guest_ia32_perf_global_ctrl)))
3012                return -EINVAL;
3013
3014        /*
3015         * If the load IA32_EFER VM-entry control is 1, the following checks
3016         * are performed on the field for the IA32_EFER MSR:
3017         * - Bits reserved in the IA32_EFER MSR must be 0.
3018         * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of
3019         *   the IA-32e mode guest VM-exit control. It must also be identical
3020         *   to bit 8 (LME) if bit 31 in the CR0 field (corresponding to
3021         *   CR0.PG) is 1.
3022         */
3023        if (to_vmx(vcpu)->nested.nested_run_pending &&
3024            (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) {
3025                ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0;
3026                if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) ||
3027                    CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) ||
3028                    CC(((vmcs12->guest_cr0 & X86_CR0_PG) &&
3029                     ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))))
3030                        return -EINVAL;
3031        }
3032
3033        if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) &&
3034            (CC(is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) ||
3035             CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD))))
3036                return -EINVAL;
3037
3038        if (nested_check_guest_non_reg_state(vmcs12))
3039                return -EINVAL;
3040
3041        return 0;
3042}
3043
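/*
 * Optional early hardware check of vmcs02 (nested_early_check): perform a
 * VM-Enter that is guaranteed to fail the guest-state consistency checks
 * (GUEST_RFLAGS is zeroed) so that hardware flags any VMFail conditions in
 * the control and host-state fields without ever running L2.
 */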
3044static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
3045{
3046        struct vcpu_vmx *vmx = to_vmx(vcpu);
3047        unsigned long cr3, cr4;
3048        bool vm_fail;
3049
3050        if (!nested_early_check)
3051                return 0;
3052
3053        if (vmx->msr_autoload.host.nr)
3054                vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
3055        if (vmx->msr_autoload.guest.nr)
3056                vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
3057
3058        preempt_disable();
3059
3060        vmx_prepare_switch_to_guest(vcpu);
3061
3062        /*
3063         * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS,
3064         * which is reserved to '1' by hardware.  GUEST_RFLAGS is guaranteed to
3065         * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e.
3066         * there is no need to preserve other bits or save/restore the field.
3067         */
3068        vmcs_writel(GUEST_RFLAGS, 0);
3069
3070        cr3 = __get_current_cr3_fast();
3071        if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
3072                vmcs_writel(HOST_CR3, cr3);
3073                vmx->loaded_vmcs->host_state.cr3 = cr3;
3074        }
3075
3076        cr4 = cr4_read_shadow();
3077        if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
3078                vmcs_writel(HOST_CR4, cr4);
3079                vmx->loaded_vmcs->host_state.cr4 = cr4;
3080        }
3081
3082        vm_fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
3083                                 vmx->loaded_vmcs->launched);
3084
3085        if (vmx->msr_autoload.host.nr)
3086                vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
3087        if (vmx->msr_autoload.guest.nr)
3088                vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
3089
3090        if (vm_fail) {
3091                u32 error = vmcs_read32(VM_INSTRUCTION_ERROR);
3092
3093                preempt_enable();
3094
3095                trace_kvm_nested_vmenter_failed(
3096                        "early hardware check VM-instruction error: ", error);
3097                WARN_ON_ONCE(error != VMXERR_ENTRY_INVALID_CONTROL_FIELD);
3098                return 1;
3099        }
3100
3101        /*
3102         * VMExit clears RFLAGS.IF and DR7, even on a consistency check.
3103         */
3104        if (hw_breakpoint_active())
3105                set_debugreg(__this_cpu_read(cpu_dr7), 7);
3106        local_irq_enable();
3107        preempt_enable();
3108
3109        /*
3110         * A non-failing VMEntry means we somehow entered guest mode with
3111         * an illegal RIP, and that's just the tip of the iceberg.  There
3112         * is no telling what memory has been modified or what state has
3113         * been exposed to unknown code.  Hitting this all but guarantees
3114         * a (very critical) hardware issue.
3115         */
3116        WARN_ON(!(vmcs_read32(VM_EXIT_REASON) &
3117                VMX_EXIT_REASONS_FAILED_VMENTRY));
3118
3119        return 0;
3120}
3121
3122static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu)
3123{
3124        struct vcpu_vmx *vmx = to_vmx(vcpu);
3125
3126        /*
3127         * hv_evmcs may end up not being mapped after migration (when
3128         * L2 was running); map it here to make sure vmcs12 changes are
3129         * properly reflected.
3130         */
3131        if (vmx->nested.enlightened_vmcs_enabled &&
3132            vmx->nested.hv_evmcs_vmptr == EVMPTR_MAP_PENDING) {
3133                enum nested_evmptrld_status evmptrld_status =
3134                        nested_vmx_handle_enlightened_vmptrld(vcpu, false);
3135
3136                if (evmptrld_status == EVMPTRLD_VMFAIL ||
3137                    evmptrld_status == EVMPTRLD_ERROR)
3138                        return false;
3139
3140                /*
3141                 * Post migration, VMCS12 always provides the most
3142                 * up-to-date information; copy it to the eVMCS upon entry.
3143                 */
3144                vmx->nested.need_vmcs12_to_shadow_sync = true;
3145        }
3146
3147        return true;
3148}
3149
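/*
 * Map/pin the guest pages referenced by vmcs12 that KVM needs host addresses
 * for: the APIC-access page, the virtual-APIC page, the posted-interrupt
 * descriptor and the MSR bitmap.  Returns false if a required page cannot be
 * mapped.
 */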
3150static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
3151{
3152        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3153        struct vcpu_vmx *vmx = to_vmx(vcpu);
3154        struct kvm_host_map *map;
3155        struct page *page;
3156        u64 hpa;
3157
3158        if (!vcpu->arch.pdptrs_from_userspace &&
3159            !nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) {
3160                /*
3161                 * Reload the guest's PDPTRs since after a migration
3162                 * the guest CR3 might be restored prior to setting the nested
3163                 * state, which can lead to loading the wrong PDPTRs.
3164                 */
3165                if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3)))
3166                        return false;
3167        }
3168
3169
3170        if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
3171                /*
3172                 * Translate L1 physical address to host physical
3173                 * address for vmcs02. Keep the page pinned, so this
3174                 * physical address remains valid. We keep a reference
3175                 * to it so we can release it later.
3176                 */
3177                if (vmx->nested.apic_access_page) { /* shouldn't happen */
3178                        kvm_release_page_clean(vmx->nested.apic_access_page);
3179                        vmx->nested.apic_access_page = NULL;
3180                }
3181                page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr);
3182                if (!is_error_page(page)) {
3183                        vmx->nested.apic_access_page = page;
3184                        hpa = page_to_phys(vmx->nested.apic_access_page);
3185                        vmcs_write64(APIC_ACCESS_ADDR, hpa);
3186                } else {
3187                        pr_debug_ratelimited("%s: no backing 'struct page' for APIC-access address in vmcs12\n",
3188                                             __func__);
3189                        vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3190                        vcpu->run->internal.suberror =
3191                                KVM_INTERNAL_ERROR_EMULATION;
3192                        vcpu->run->internal.ndata = 0;
3193                        return false;
3194                }
3195        }
3196
3197        if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
3198                map = &vmx->nested.virtual_apic_map;
3199
3200                if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) {
3201                        vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn));
3202                } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) &&
3203                           nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) &&
3204                           !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
3205                        /*
3206                         * The processor will never use the TPR shadow, simply
3207                         * clear the bit from the execution control.  Such a
3208                         * configuration is useless, but it happens in tests.
3209                         * For any other configuration, failing the vm entry is
3210                         * _not_ what the processor does but it's basically the
3211                         * only possibility we have.
3212                         */
3213                        exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW);
3214                } else {
3215                        /*
3216                         * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to
3217                         * force VM-Entry to fail.
3218                         */
3219                        vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull);
3220                }
3221        }
3222
3223        if (nested_cpu_has_posted_intr(vmcs12)) {
3224                map = &vmx->nested.pi_desc_map;
3225
3226                if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) {
3227                        vmx->nested.pi_desc =
3228                                (struct pi_desc *)(((void *)map->hva) +
3229                                offset_in_page(vmcs12->posted_intr_desc_addr));
3230                        vmcs_write64(POSTED_INTR_DESC_ADDR,
3231                                     pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr));
3232                } else {
3233                        /*
3234                         * Defer the KVM_INTERNAL_EXIT until KVM tries to
3235                         * access the contents of the VMCS12 posted interrupt
3236                         * descriptor. (Note that KVM may do this when it
3237                         * should not, per the architectural specification.)
3238                         */
3239                        vmx->nested.pi_desc = NULL;
3240                        pin_controls_clearbit(vmx, PIN_BASED_POSTED_INTR);
3241                }
3242        }
3243        if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12))
3244                exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
3245        else
3246                exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
3247
3248        return true;
3249}
3250
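/*
 * KVM_REQ_GET_NESTED_STATE_PAGES handler: (re)map the enlightened VMCS and,
 * if the vCPU is in guest mode, the pages referenced by vmcs12.
 */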
3251static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu)
3252{
3253        if (!nested_get_evmcs_page(vcpu)) {
3254                pr_debug_ratelimited("%s: enlightened vmptrld failed\n",
3255                                     __func__);
3256                vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3257                vcpu->run->internal.suberror =
3258                        KVM_INTERNAL_ERROR_EMULATION;
3259                vcpu->run->internal.ndata = 0;
3260
3261                return false;
3262        }
3263
3264        if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu))
3265                return false;
3266
3267        return true;
3268}
3269
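/*
 * Emulate a PML entry write on behalf of L2: if vmcs12 enables PML and the
 * guest PML index is valid, log the dirty GPA in L1's PML buffer and
 * decrement the index; otherwise flag the buffer as full so that a PML-full
 * VM-exit can be reflected to L1.
 */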
3270static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa)
3271{
3272        struct vmcs12 *vmcs12;
3273        struct vcpu_vmx *vmx = to_vmx(vcpu);
3274        gpa_t dst;
3275
3276        if (WARN_ON_ONCE(!is_guest_mode(vcpu)))
3277                return 0;
3278
3279        if (WARN_ON_ONCE(vmx->nested.pml_full))
3280                return 1;
3281
3282        /*
3283         * Check if PML is enabled for the nested guest. Whether eptp bit 6 is
3284         * set is already checked as part of A/D emulation.
3285         */
3286        vmcs12 = get_vmcs12(vcpu);
3287        if (!nested_cpu_has_pml(vmcs12))
3288                return 0;
3289
3290        if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) {
3291                vmx->nested.pml_full = true;
3292                return 1;
3293        }
3294
3295        gpa &= ~0xFFFull;
3296        dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index;
3297
3298        if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa,
3299                                 offset_in_page(dst), sizeof(gpa)))
3300                return 0;
3301
3302        vmcs12->guest_pml_index--;
3303
3304        return 0;
3305}
3306
3307/*
3308 * Intel's VMX Instruction Reference specifies a common set of prerequisites
3309 * for running VMX instructions (except VMXON, whose prerequisites are
3310 * slightly different), and which exception to inject when they are not met.
3311 * Note that many of these exceptions have priority over VM exits, so they
3312 * don't have to be checked again here.
3313 */
3314static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
3315{
3316        if (!to_vmx(vcpu)->nested.vmxon) {
3317                kvm_queue_exception(vcpu, UD_VECTOR);
3318                return 0;
3319        }
3320
3321        if (vmx_get_cpl(vcpu)) {
3322                kvm_inject_gp(vcpu, 0);
3323                return 0;
3324        }
3325
3326        return 1;
3327}
3328
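/*
 * Returns true if the highest priority pending virtual interrupt (RVI)
 * outranks the virtual PPR, i.e. APICv has an interrupt that could be
 * delivered to the vCPU.
 */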
3329static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu)
3330{
3331        u8 rvi = vmx_get_rvi();
3332        u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI);
3333
3334        return ((rvi & 0xf0) > (vppr & 0xf0));
3335}
3336
3337static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
3338                                   struct vmcs12 *vmcs12);
3339
3340/*
3341 * If from_vmentry is false, this is being called from state restore (either RSM
3342 * or KVM_SET_NESTED_STATE).  Otherwise it's called from vmlaunch/vmresume.
3343 *
3344 * Returns:
3345 *      NVMX_VMENTRY_SUCCESS: Entered VMX non-root mode
3346 *      NVMX_VMENTRY_VMFAIL:  Consistency check VMFail
3347 *      NVMX_VMENTRY_VMEXIT:  Consistency check VMExit
3348 *      NVMX_VMENTRY_KVM_INTERNAL_ERROR: KVM internal error
3349 */
3350enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
3351                                                        bool from_vmentry)
3352{
3353        struct vcpu_vmx *vmx = to_vmx(vcpu);
3354        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3355        enum vm_entry_failure_code entry_failure_code;
3356        bool evaluate_pending_interrupts;
3357        union vmx_exit_reason exit_reason = {
3358                .basic = EXIT_REASON_INVALID_STATE,
3359                .failed_vmentry = 1,
3360        };
3361        u32 failed_index;
3362
3363        if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
3364                kvm_vcpu_flush_tlb_current(vcpu);
3365
3366        evaluate_pending_interrupts = exec_controls_get(vmx) &
3367                (CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING);
3368        if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
3369                evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu);
3370
3371        if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
3372                vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
3373        if (kvm_mpx_supported() &&
3374                !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
3375                vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
3376
3377        /*
3378         * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and*
3379         * nested early checks are disabled.  In the event of a "late" VM-Fail,
3380         * i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its
3381         * software model to the pre-VMEntry host state.  When EPT is disabled,
3382         * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes
3383         * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3.  Stuffing
3384         * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to
3385         * the correct value.  Smashing vmcs01.GUEST_CR3 is safe because nested
3386         * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is
3387         * guaranteed to be overwritten with a shadow CR3 prior to re-entering
3388         * L1.  Don't stuff vmcs01.GUEST_CR3 when using nested early checks as
3389         * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks
3390         * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail
3391         * path would need to manually save/restore vmcs01.GUEST_CR3.
3392         */
3393        if (!enable_ept && !nested_early_check)
3394                vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
3395
3396        vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
3397
3398        prepare_vmcs02_early(vmx, &vmx->vmcs01, vmcs12);
3399
3400        if (from_vmentry) {
3401                if (unlikely(!nested_get_vmcs12_pages(vcpu))) {
3402                        vmx_switch_vmcs(vcpu, &vmx->vmcs01);
3403                        return NVMX_VMENTRY_KVM_INTERNAL_ERROR;
3404                }
3405
3406                if (nested_vmx_check_vmentry_hw(vcpu)) {
3407                        vmx_switch_vmcs(vcpu, &vmx->vmcs01);
3408                        return NVMX_VMENTRY_VMFAIL;
3409                }
3410
3411                if (nested_vmx_check_guest_state(vcpu, vmcs12,
3412                                                 &entry_failure_code)) {
3413                        exit_reason.basic = EXIT_REASON_INVALID_STATE;
3414                        vmcs12->exit_qualification = entry_failure_code;
3415                        goto vmentry_fail_vmexit;
3416                }
3417        }
3418
3419        enter_guest_mode(vcpu);
3420
3421        if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &entry_failure_code)) {
3422                exit_reason.basic = EXIT_REASON_INVALID_STATE;
3423                vmcs12->exit_qualification = entry_failure_code;
3424                goto vmentry_fail_vmexit_guest_mode;
3425        }
3426
3427        if (from_vmentry) {
3428                failed_index = nested_vmx_load_msr(vcpu,
3429                                                   vmcs12->vm_entry_msr_load_addr,
3430                                                   vmcs12->vm_entry_msr_load_count);
3431                if (failed_index) {
3432                        exit_reason.basic = EXIT_REASON_MSR_LOAD_FAIL;
3433                        vmcs12->exit_qualification = failed_index;
3434                        goto vmentry_fail_vmexit_guest_mode;
3435                }
3436        } else {
3437                /*
3438                 * The MMU is not initialized to point at the right entities yet and
3439                 * "get pages" would need to read data from the guest (i.e. we will
3440                 * need to perform gpa to hpa translation). Request a call
3441                 * to nested_get_vmcs12_pages before the next VM-entry.  The MSRs
3442                 * have already been set at vmentry time and should not be reset.
3443                 */
3444                kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
3445        }
3446
3447        /*
3448         * If L1 had a pending IRQ/NMI until it executed
3449         * VMLAUNCH/VMRESUME which wasn't delivered because it was
3450         * disallowed (e.g. interrupts disabled), L0 needs to
3451         * evaluate if this pending event should cause an exit from L2
3452         * to L1 or be delivered directly to L2 (e.g. in case L1 doesn't
3453         * intercept EXTERNAL_INTERRUPT).
3454         *
3455         * Usually this would be handled by the processor noticing an
3456         * IRQ/NMI window request, or checking RVI during evaluation of
3457         * pending virtual interrupts.  However, this setting was done
3458         * on VMCS01 and now VMCS02 is active instead. Thus, we force L0
3459         * to perform pending event evaluation by requesting a KVM_REQ_EVENT.
3460         */
3461        if (unlikely(evaluate_pending_interrupts))
3462                kvm_make_request(KVM_REQ_EVENT, vcpu);
3463
3464        /*
3465         * Do not start the preemption timer hrtimer until after we know
3466         * we are successful, so that only nested_vmx_vmexit needs to cancel
3467         * the timer.
3468         */
3469        vmx->nested.preemption_timer_expired = false;
3470        if (nested_cpu_has_preemption_timer(vmcs12)) {
3471                u64 timer_value = vmx_calc_preemption_timer_value(vcpu);
3472                vmx_start_preemption_timer(vcpu, timer_value);
3473        }
3474
3475        /*
3476         * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
3477         * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
3478         * returned as far as L1 is concerned. It will only return (and set
3479         * the success flag) when L2 exits (see nested_vmx_vmexit()).
3480         */
3481        return NVMX_VMENTRY_SUCCESS;
3482
3483        /*
3484         * A failed consistency check that leads to a VMExit during L1's
3485         * VMEnter to L2 is a variation of a normal VMexit, as explained in
3486         * 26.7 "VM-entry failures during or after loading guest state".
3487         */
3488vmentry_fail_vmexit_guest_mode:
3489        if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING)
3490                vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
3491        leave_guest_mode(vcpu);
3492
3493vmentry_fail_vmexit:
3494        vmx_switch_vmcs(vcpu, &vmx->vmcs01);
3495
3496        if (!from_vmentry)
3497                return NVMX_VMENTRY_VMEXIT;
3498
3499        load_vmcs12_host_state(vcpu, vmcs12);
3500        vmcs12->vm_exit_reason = exit_reason.full;
3501        if (enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
3502                vmx->nested.need_vmcs12_to_shadow_sync = true;
3503        return NVMX_VMENTRY_VMEXIT;
3504}
3505
3506/*
3507 * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
3508 * for running an L2 nested guest.
3509 */
3510static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
3511{
3512        struct vmcs12 *vmcs12;
3513        enum nvmx_vmentry_status status;
3514        struct vcpu_vmx *vmx = to_vmx(vcpu);
3515        u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu);
3516        enum nested_evmptrld_status evmptrld_status;
3517
3518        if (!nested_vmx_check_permission(vcpu))
3519                return 1;
3520
3521        evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, launch);
3522        if (evmptrld_status == EVMPTRLD_ERROR) {
3523                kvm_queue_exception(vcpu, UD_VECTOR);
3524                return 1;
3525        } else if (CC(evmptrld_status == EVMPTRLD_VMFAIL)) {
3526                return nested_vmx_failInvalid(vcpu);
3527        }
3528
3529        if (CC(!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) &&
3530               vmx->nested.current_vmptr == -1ull))
3531                return nested_vmx_failInvalid(vcpu);
3532
3533        vmcs12 = get_vmcs12(vcpu);
3534
3535        /*
3536         * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact
3537         * that there *is* a valid VMCS pointer, RFLAGS.CF is set
3538         * rather than RFLAGS.ZF, and no error number is stored to the
3539         * VM-instruction error field.
3540         */
3541        if (CC(vmcs12->hdr.shadow_vmcs))
3542                return nested_vmx_failInvalid(vcpu);
3543
3544        if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
3545                copy_enlightened_to_vmcs12(vmx, vmx->nested.hv_evmcs->hv_clean_fields);
3546                /* Enlightened VMCS doesn't have launch state */
3547                vmcs12->launch_state = !launch;
3548        } else if (enable_shadow_vmcs) {
3549                copy_shadow_to_vmcs12(vmx);
3550        }
3551
3552        /*
3553         * The nested entry process starts with enforcing various prerequisites
3554         * on vmcs12 as required by the Intel SDM, and acting appropriately when
3555         * they fail: as the SDM explains, some conditions should cause the
3556         * instruction to fail, while others will cause the instruction to seem
3557         * to succeed, but return an EXIT_REASON_INVALID_STATE.
3558         * To speed up the normal (success) code path, we should avoid checking
3559         * for misconfigurations that will be caught by the processor anyway
3560         * when using the merged vmcs02.
3561         */
3562        if (CC(interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS))
3563                return nested_vmx_fail(vcpu, VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);
3564
3565        if (CC(vmcs12->launch_state == launch))
3566                return nested_vmx_fail(vcpu,
3567                        launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
3568                               : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
3569
3570        if (nested_vmx_check_controls(vcpu, vmcs12))
3571                return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
3572
3573        if (nested_vmx_check_host_state(vcpu, vmcs12))
3574                return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
3575
3576        /*
3577         * We're finally done with prerequisite checking, and can start with
3578         * the nested entry.
3579         */
3580        vmx->nested.nested_run_pending = 1;
3581        vmx->nested.has_preemption_timer_deadline = false;
3582        status = nested_vmx_enter_non_root_mode(vcpu, true);
3583        if (unlikely(status != NVMX_VMENTRY_SUCCESS))
3584                goto vmentry_failed;
3585
3586        /* Emulate processing of posted interrupts on VM-Enter. */
3587        if (nested_cpu_has_posted_intr(vmcs12) &&
3588            kvm_apic_has_interrupt(vcpu) == vmx->nested.posted_intr_nv) {
3589                vmx->nested.pi_pending = true;
3590                kvm_make_request(KVM_REQ_EVENT, vcpu);
3591                kvm_apic_clear_irr(vcpu, vmx->nested.posted_intr_nv);
3592        }
3593
3594        /* Hide L1D cache contents from the nested guest.  */
3595        vmx->vcpu.arch.l1tf_flush_l1d = true;
3596
3597        /*
3598         * Must happen outside of nested_vmx_enter_non_root_mode() as it will
3599         * also be used as part of restoring nVMX state for
3600         * snapshot restore (migration).
3601         *
3602         * In this flow, it is assumed that vmcs12 cache was
3603         * transferred as part of captured nVMX state and should
3604         * therefore not be read from guest memory (which may not
3605         * exist on destination host yet).
3606         */
3607        nested_cache_shadow_vmcs12(vcpu, vmcs12);
3608
3609        switch (vmcs12->guest_activity_state) {
3610        case GUEST_ACTIVITY_HLT:
3611                /*
3612                 * If we're entering a halted L2 vcpu and the L2 vcpu won't be
3613                 * awakened by event injection or by an NMI-window VM-exit or
3614                 * by an interrupt-window VM-exit, halt the vcpu.
3615                 */
3616                if (!(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) &&
3617                    !nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING) &&
3618                    !(nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING) &&
3619                      (vmcs12->guest_rflags & X86_EFLAGS_IF))) {
3620                        vmx->nested.nested_run_pending = 0;
3621                        return kvm_vcpu_halt(vcpu);
3622                }
3623                break;
3624        case GUEST_ACTIVITY_WAIT_SIPI:
3625                vmx->nested.nested_run_pending = 0;
3626                vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
3627                break;
3628        default:
3629                break;
3630        }
3631
3632        return 1;
3633
3634vmentry_failed:
3635        vmx->nested.nested_run_pending = 0;
3636        if (status == NVMX_VMENTRY_KVM_INTERNAL_ERROR)
3637                return 0;
3638        if (status == NVMX_VMENTRY_VMEXIT)
3639                return 1;
3640        WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL);
3641        return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
3642}
3643
3644/*
3645 * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date
3646 * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK).
3647 * This function returns the new value we should put in vmcs12.guest_cr0.
3648 * It's not enough to just return the vmcs02 GUEST_CR0. Rather,
3649 *  1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now
3650 *     available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0
3651 *     didn't trap the bit, because if L1 did, so would L0).
3652 *  2. Bits that L1 asked to trap (and therefore L0 also did) could not have
3653 *     been modified by L2, and L1 knows it. So just leave the old value of
3654 *     the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0
3655 *     isn't relevant, because if L0 traps this bit it can set it to anything.
3656 *  3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have
3657 *     changed these bits, and therefore they need to be updated, but L0
3658 *     didn't necessarily allow them to be changed in GUEST_CR0 - and rather
3659 *     put them in vmcs02 CR0_READ_SHADOW. So take these bits from there.
3660 */
3661static inline unsigned long
3662vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
3663{
3664        return
3665        /*1*/   (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) |
3666        /*2*/   (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) |
3667        /*3*/   (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask |
3668                        vcpu->arch.cr0_guest_owned_bits));
3669}
3670
3671static inline unsigned long
3672vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
3673{
3674        return
3675        /*1*/   (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) |
3676        /*2*/   (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) |
3677        /*3*/   (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask |
3678                        vcpu->arch.cr4_guest_owned_bits));
3679}
3680
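/*
 * Record an event that was injected into L2 but not yet delivered (exception,
 * NMI or interrupt) in vmcs12's IDT-vectoring information fields so that L1
 * can re-inject it after the nested VM-exit.
 */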
3681static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
3682                                      struct vmcs12 *vmcs12)
3683{
3684        u32 idt_vectoring;
3685        unsigned int nr;
3686
3687        if (vcpu->arch.exception.injected) {
3688                nr = vcpu->arch.exception.nr;
3689                idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
3690
3691                if (kvm_exception_is_soft(nr)) {
3692                        vmcs12->vm_exit_instruction_len =
3693                                vcpu->arch.event_exit_inst_len;
3694                        idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION;
3695                } else
3696                        idt_vectoring |= INTR_TYPE_HARD_EXCEPTION;
3697
3698                if (vcpu->arch.exception.has_error_code) {
3699                        idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK;
3700                        vmcs12->idt_vectoring_error_code =
3701                                vcpu->arch.exception.error_code;
3702                }
3703
3704                vmcs12->idt_vectoring_info_field = idt_vectoring;
3705        } else if (vcpu->arch.nmi_injected) {
3706                vmcs12->idt_vectoring_info_field =
3707                        INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR;
3708        } else if (vcpu->arch.interrupt.injected) {
3709                nr = vcpu->arch.interrupt.nr;
3710                idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
3711
3712                if (vcpu->arch.interrupt.soft) {
3713                        idt_vectoring |= INTR_TYPE_SOFT_INTR;
3714                        vmcs12->vm_entry_instruction_len =
3715                                vcpu->arch.event_exit_inst_len;
3716                } else
3717                        idt_vectoring |= INTR_TYPE_EXT_INTR;
3718
3719                vmcs12->idt_vectoring_info_field = idt_vectoring;
3720        }
3721}
3722
3723
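/*
 * Mark the guest pages the CPU may have written to during virtual interrupt
 * delivery (the virtual-APIC page and the posted-interrupt descriptor) as
 * dirty so that dirty logging and migration see the updates.
 */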
3724void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
3725{
3726        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3727        gfn_t gfn;
3728
3729        /*
3730         * Don't need to mark the APIC access page dirty; it is never
3731         * written to by the CPU during APIC virtualization.
3732         */
3733
3734        if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
3735                gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT;
3736                kvm_vcpu_mark_page_dirty(vcpu, gfn);
3737        }
3738
3739        if (nested_cpu_has_posted_intr(vmcs12)) {
3740                gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT;
3741                kvm_vcpu_mark_page_dirty(vcpu, gfn);
3742        }
3743}
3744
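/*
 * Process vmcs12's posted-interrupt descriptor on behalf of L2: if the
 * outstanding-notification bit is set, merge the PIR into the virtual-APIC
 * page and raise RVI in GUEST_INTR_STATUS accordingly.
 */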
3745static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
3746{
3747        struct vcpu_vmx *vmx = to_vmx(vcpu);
3748        int max_irr;
3749        void *vapic_page;
3750        u16 status;
3751
3752        if (!vmx->nested.pi_pending)
3753                return 0;
3754
3755        if (!vmx->nested.pi_desc)
3756                goto mmio_needed;
3757
3758        vmx->nested.pi_pending = false;
3759
3760        if (!pi_test_and_clear_on(vmx->nested.pi_desc))
3761                return 0;
3762
3763        max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
3764        if (max_irr != 256) {
3765                vapic_page = vmx->nested.virtual_apic_map.hva;
3766                if (!vapic_page)
3767                        goto mmio_needed;
3768
3769                __kvm_apic_update_irr(vmx->nested.pi_desc->pir,
3770                        vapic_page, &max_irr);
3771                status = vmcs_read16(GUEST_INTR_STATUS);
3772                if ((u8)max_irr > ((u8)status & 0xff)) {
3773                        status &= ~0xff;
3774                        status |= (u8)max_irr;
3775                        vmcs_write16(GUEST_INTR_STATUS, status);
3776                }
3777        }
3778
3779        nested_mark_vmcs12_pages_dirty(vcpu);
3780        return 0;
3781
3782mmio_needed:
3783        kvm_handle_memory_failure(vcpu, X86EMUL_IO_NEEDED, NULL);
3784        return -ENXIO;
3785}
3786
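/*
 * Reflect a pending exception into L1 as an EXCEPTION_NMI VM-exit, building
 * the VM-exit interruption information (vector, type, error code, NMI
 * unblocking) from the exception queued on the vCPU.
 */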
3787static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu,
3788                                               unsigned long exit_qual)
3789{
3790        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3791        unsigned int nr = vcpu->arch.exception.nr;
3792        u32 intr_info = nr | INTR_INFO_VALID_MASK;
3793
3794        if (vcpu->arch.exception.has_error_code) {
3795                vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code;
3796                intr_info |= INTR_INFO_DELIVER_CODE_MASK;
3797        }
3798
3799        if (kvm_exception_is_soft(nr))
3800                intr_info |= INTR_TYPE_SOFT_EXCEPTION;
3801        else
3802                intr_info |= INTR_TYPE_HARD_EXCEPTION;
3803
3804        if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) &&
3805            vmx_get_nmi_mask(vcpu))
3806                intr_info |= INTR_INFO_UNBLOCK_NMI;
3807
3808        nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual);
3809}
3810
3811/*
3812 * Returns true if a debug trap is pending delivery.
3813 *
3814 * In KVM, debug traps bear an exception payload. As such, the class of a #DB
3815 * exception may be inferred from the presence of an exception payload.
3816 */
3817static inline bool vmx_pending_dbg_trap(struct kvm_vcpu *vcpu)
3818{
3819        return vcpu->arch.exception.pending &&
3820                        vcpu->arch.exception.nr == DB_VECTOR &&
3821                        vcpu->arch.exception.payload;
3822}
3823
3824/*
3825 * Certain VM-exits set the 'pending debug exceptions' field to indicate a
3826 * recognized #DB (data or single-step) that has yet to be delivered. Since KVM
3827 * represents these debug traps with a payload that is said to be compatible
3828 * with the 'pending debug exceptions' field, write the payload to the VMCS
3829 * field if a VM-exit is delivered before the debug trap.
3830 */
3831static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu)
3832{
3833        if (vmx_pending_dbg_trap(vcpu))
3834                vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
3835                            vcpu->arch.exception.payload);
3836}
3837
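/*
 * True if vmcs12 arms the VMX-preemption timer and the emulated timer has
 * expired, i.e. a preemption-timer VM-exit to L1 is due.
 */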
3838static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu)
3839{
3840        return nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) &&
3841               to_vmx(vcpu)->nested.preemption_timer_expired;
3842}
3843
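/*
 * Evaluate pending events in the order the code below checks them: INIT and
 * SIPI, exceptions that are not debug traps, Monitor Trap Flag, remaining
 * exceptions, the VMX-preemption timer, SMI, NMI, and external interrupts.
 * Each event is either reflected to L1 as a nested VM-exit, deferred with
 * -EBUSY while nested VM-exits are blocked, or left for KVM to deliver
 * directly to L2.
 */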
3844static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
3845{
3846        struct vcpu_vmx *vmx = to_vmx(vcpu);
3847        unsigned long exit_qual;
3848        bool block_nested_events =
3849            vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu);
3850        bool mtf_pending = vmx->nested.mtf_pending;
3851        struct kvm_lapic *apic = vcpu->arch.apic;
3852
3853        /*
3854         * Clear the MTF state. If a higher priority VM-exit is delivered first,
3855         * this state is discarded.
3856         */
3857        if (!block_nested_events)
3858                vmx->nested.mtf_pending = false;
3859
3860        if (lapic_in_kernel(vcpu) &&
3861                test_bit(KVM_APIC_INIT, &apic->pending_events)) {
3862                if (block_nested_events)
3863                        return -EBUSY;
3864                nested_vmx_update_pending_dbg(vcpu);
3865                clear_bit(KVM_APIC_INIT, &apic->pending_events);
3866                if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED)
3867                        nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0);
3868                return 0;
3869        }
3870
3871        if (lapic_in_kernel(vcpu) &&
3872            test_bit(KVM_APIC_SIPI, &apic->pending_events)) {
3873                if (block_nested_events)
3874                        return -EBUSY;
3875
3876                clear_bit(KVM_APIC_SIPI, &apic->pending_events);
3877                if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
3878                        nested_vmx_vmexit(vcpu, EXIT_REASON_SIPI_SIGNAL, 0,
3879                                                apic->sipi_vector & 0xFFUL);
3880                return 0;
3881        }
3882
3883        /*
3884         * Process any exceptions that are not debug traps before MTF.
3885         *
3886         * Note that only a pending nested run can block a pending exception.
3887         * Otherwise an injected NMI/interrupt would either be lost or be
3888         * delivered to the nested hypervisor via the IDT_VECTORING_INFO field
3889         * while delivering the pending exception.
3890         */
3891
3892        if (vcpu->arch.exception.pending && !vmx_pending_dbg_trap(vcpu)) {
3893                if (vmx->nested.nested_run_pending)
3894                        return -EBUSY;
3895                if (!nested_vmx_check_exception(vcpu, &exit_qual))
3896                        goto no_vmexit;
3897                nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
3898                return 0;
3899        }
3900
3901        if (mtf_pending) {
3902                if (block_nested_events)
3903                        return -EBUSY;
3904                nested_vmx_update_pending_dbg(vcpu);
3905                nested_vmx_vmexit(vcpu, EXIT_REASON_MONITOR_TRAP_FLAG, 0, 0);
3906                return 0;
3907        }
3908
3909        if (vcpu->arch.exception.pending) {
3910                if (vmx->nested.nested_run_pending)
3911                        return -EBUSY;
3912                if (!nested_vmx_check_exception(vcpu, &exit_qual))
3913                        goto no_vmexit;
3914                nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
3915                return 0;
3916        }
3917
3918        if (nested_vmx_preemption_timer_pending(vcpu)) {
3919                if (block_nested_events)
3920                        return -EBUSY;
3921                nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0);
3922                return 0;
3923        }
3924
3925        if (vcpu->arch.smi_pending && !is_smm(vcpu)) {
3926                if (block_nested_events)
3927                        return -EBUSY;
3928                goto no_vmexit;
3929        }
3930
3931        if (vcpu->arch.nmi_pending && !vmx_nmi_blocked(vcpu)) {
3932                if (block_nested_events)
3933                        return -EBUSY;
3934                if (!nested_exit_on_nmi(vcpu))
3935                        goto no_vmexit;
3936
3937                nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
3938                                  NMI_VECTOR | INTR_TYPE_NMI_INTR |
3939                                  INTR_INFO_VALID_MASK, 0);
3940                /*
3941                 * The NMI-triggered VM exit counts as injection:
3942                 * clear this one and block further NMIs.
3943                 */
3944                vcpu->arch.nmi_pending = 0;
3945                vmx_set_nmi_mask(vcpu, true);
3946                return 0;
3947        }
3948
3949        if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) {
3950                if (block_nested_events)
3951                        return -EBUSY;
3952                if (!nested_exit_on_intr(vcpu))
3953                        goto no_vmexit;
3954                nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
3955                return 0;
3956        }
3957
3958no_vmexit:
3959        return vmx_complete_nested_posted_interrupt(vcpu);
3960}
3961
3962static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
3963{
3964        ktime_t remaining =
3965                hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer);
3966        u64 value;
3967
3968        if (ktime_to_ns(remaining) <= 0)
3969                return 0;
3970
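        /*
         * Convert the remaining wall-clock time to guest TSC ticks
         * (ns * tsc_khz / 10^6) and scale it down by the emulated preemption
         * timer rate, i.e. the timer ticks once per
         * 2^VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE TSC cycles as advertised
         * to L1 via IA32_VMX_MISC.
         */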
3971        value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz;
3972        do_div(value, 1000000);
3973        return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
3974}
3975
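/*
 * Fields whose vmcs02 value is only copied back to vmcs12 lazily, either by
 * sync_vmcs02_to_vmcs12_rare() or on demand when L1 VMREADs them, because
 * they are expensive to read and rarely consumed.
 */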
3976static bool is_vmcs12_ext_field(unsigned long field)
3977{
3978        switch (field) {
3979        case GUEST_ES_SELECTOR:
3980        case GUEST_CS_SELECTOR:
3981        case GUEST_SS_SELECTOR:
3982        case GUEST_DS_SELECTOR:
3983        case GUEST_FS_SELECTOR:
3984        case GUEST_GS_SELECTOR:
3985        case GUEST_LDTR_SELECTOR:
3986        case GUEST_TR_SELECTOR:
3987        case GUEST_ES_LIMIT:
3988        case GUEST_CS_LIMIT:
3989        case GUEST_SS_LIMIT:
3990        case GUEST_DS_LIMIT:
3991        case GUEST_FS_LIMIT:
3992        case GUEST_GS_LIMIT:
3993        case GUEST_LDTR_LIMIT:
3994        case GUEST_TR_LIMIT:
3995        case GUEST_GDTR_LIMIT:
3996        case GUEST_IDTR_LIMIT:
3997        case GUEST_ES_AR_BYTES:
3998        case GUEST_DS_AR_BYTES:
3999        case GUEST_FS_AR_BYTES:
4000        case GUEST_GS_AR_BYTES:
4001        case GUEST_LDTR_AR_BYTES:
4002        case GUEST_TR_AR_BYTES:
4003        case GUEST_ES_BASE:
4004        case GUEST_CS_BASE:
4005        case GUEST_SS_BASE:
4006        case GUEST_DS_BASE:
4007        case GUEST_FS_BASE:
4008        case GUEST_GS_BASE:
4009        case GUEST_LDTR_BASE:
4010        case GUEST_TR_BASE:
4011        case GUEST_GDTR_BASE:
4012        case GUEST_IDTR_BASE:
4013        case GUEST_PENDING_DBG_EXCEPTIONS:
4014        case GUEST_BNDCFGS:
4015                return true;
4016        default:
4017                break;
4018        }
4019
4020        return false;
4021}
4022
4023static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
4024                                       struct vmcs12 *vmcs12)
4025{
4026        struct vcpu_vmx *vmx = to_vmx(vcpu);
4027
4028        vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
4029        vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
4030        vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
4031        vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
4032        vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
4033        vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
4034        vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
4035        vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
4036        vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
4037        vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
4038        vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
4039        vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
4040        vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
4041        vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
4042        vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
4043        vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
4044        vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
4045        vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
4046        vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
4047        vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
4048        vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
4049        vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
4050        vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
4051        vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
4052        vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE);
4053        vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
4054        vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
4055        vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
4056        vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
4057        vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
4058        vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
4059        vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
4060        vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
4061        vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
4062        vmcs12->guest_pending_dbg_exceptions =
4063                vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
4064        if (kvm_mpx_supported())
4065                vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
4066
4067        vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false;
4068}
4069
4070static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
4071                                       struct vmcs12 *vmcs12)
4072{
4073        struct vcpu_vmx *vmx = to_vmx(vcpu);
4074        int cpu;
4075
4076        if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare)
4077                return;
4078
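        /*
         * The rarely-synced guest fields live only in the hardware VMCS
         * (vmcs02) at this point, so temporarily switch to vmcs02 to VMREAD
         * them, then restore vmcs01 as the loaded VMCS.
         */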
4080        WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01);
4081
4082        cpu = get_cpu();
4083        vmx->loaded_vmcs = &vmx->nested.vmcs02;
4084        vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->vmcs01);
4085
4086        sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
4087
4088        vmx->loaded_vmcs = &vmx->vmcs01;
4089        vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->nested.vmcs02);
4090        put_cpu();
4091}
4092
4093/*
4094 * Update the guest state fields of vmcs12 to reflect changes that
4095 * occurred while L2 was running. (The "IA-32e mode guest" bit of the
4096 * VM-entry controls is also updated, since this is really a guest
4097 * state bit.)
4098 */
4099static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
4100{
4101        struct vcpu_vmx *vmx = to_vmx(vcpu);
4102
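        /*
         * When L1 uses an enlightened VMCS, sync the rarely-read fields now
         * so they can be copied back to the eVMCS; otherwise defer the
         * expensive VMREADs until the fields are actually needed.
         */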
4103        if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
4104                sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
4105
4106        vmx->nested.need_sync_vmcs02_to_vmcs12_rare =
4107                !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr);
4108
4109        vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
4110        vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
4111
4112        vmcs12->guest_rsp = kvm_rsp_read(vcpu);
4113        vmcs12->guest_rip = kvm_rip_read(vcpu);
4114        vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
4115
4116        vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
4117        vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
4118
4119        vmcs12->guest_interruptibility_info =
4120                vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
4121
4122        if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
4123                vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT;
4124        else if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
4125                vmcs12->guest_activity_state = GUEST_ACTIVITY_WAIT_SIPI;
4126        else
4127                vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE;
4128
4129        if (nested_cpu_has_preemption_timer(vmcs12) &&
4130            vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER &&
4131            !vmx->nested.nested_run_pending)
4132                vmcs12->vmx_preemption_timer_value =
4133                        vmx_get_preemption_timer_value(vcpu);
4134
4135        /*
4136         * In some cases (usually, nested EPT), L2 is allowed to change its
4137         * own CR3 without exiting. If it has changed it, we must keep it.
4138         * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined
4139         * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12.
4140         *
4141         * Additionally, restore L2's PDPTR to vmcs12.
4142         */
4143        if (enable_ept) {
4144                vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3);
4145                if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) {
4146                        vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
4147                        vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
4148                        vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
4149                        vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
4150                }
4151        }
4152
4153        vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
4154
4155        if (nested_cpu_has_vid(vmcs12))
4156                vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS);
4157
4158        vmcs12->vm_entry_controls =
4159                (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
4160                (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
4161
4162        if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS)
4163                kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);
4164
4165        if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER)
4166                vmcs12->guest_ia32_efer = vcpu->arch.efer;
4167}
4168
4169/*
4170 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
4171 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
4172 * and this function updates it to reflect the changes to the guest state while
4173 * L2 was running (and perhaps made some exits which were handled directly by L0
4174 * without going back to L1), and to reflect the exit reason.
4175 * Note that we do not have to copy here all VMCS fields, just those that
4176 * could have been changed by the L2 guest or the exit - i.e., the guest-state and
4177 * exit-information fields only. Other fields are modified by L1 with VMWRITE,
4178 * which already writes to vmcs12 directly.
4179 */
4180static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
4181                           u32 vm_exit_reason, u32 exit_intr_info,
4182                           unsigned long exit_qualification)
4183{
4184        /* update exit information fields: */
4185        vmcs12->vm_exit_reason = vm_exit_reason;
4186        if (to_vmx(vcpu)->exit_reason.enclave_mode)
4187                vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE;
4188        vmcs12->exit_qualification = exit_qualification;
4189        vmcs12->vm_exit_intr_info = exit_intr_info;
4190
4191        vmcs12->idt_vectoring_info_field = 0;
4192        vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
4193        vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
4194
4195        if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
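                /*
                 * The vmcs12 is now "launched": a subsequent VMLAUNCH of it
                 * by L1 will fail, and VMRESUME must be used instead.
                 */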
4196                vmcs12->launch_state = 1;
4197
4198                /* vm_entry_intr_info_field is cleared on exit. Emulate this
4199                 * instead of reading the real value. */
4200                vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;
4201
4202                /*
4203                 * Transfer the event that L0 or L1 may have wanted to inject
4204                 * into L2 to IDT_VECTORING_INFO_FIELD.
4205                 */
4206                vmcs12_save_pending_event(vcpu, vmcs12);
4207
4208                /*
4209                 * According to spec, there's no need to store the guest's
4210                 * MSRs if the exit is due to a VM-entry failure that occurs
4211                 * during or after loading the guest state. Since this exit
4212                 * does not fall in that category, we need to save the MSRs.
4213                 */
4214                if (nested_vmx_store_msr(vcpu,
4215                                         vmcs12->vm_exit_msr_store_addr,
4216                                         vmcs12->vm_exit_msr_store_count))
4217                        nested_vmx_abort(vcpu,
4218                                         VMX_ABORT_SAVE_GUEST_MSR_FAIL);
4219        }
4220
4221        /*
4222         * Drop what we picked up for L2 via vmx_complete_interrupts. It is
4223         * preserved above and would only end up incorrectly in L1.
4224         */
4225        vcpu->arch.nmi_injected = false;
4226        kvm_clear_exception_queue(vcpu);
4227        kvm_clear_interrupt_queue(vcpu);
4228}
4229
4230/*
4231 * A part of what we need to do when the nested L2 guest exits and we want to
4232 * run its L1 parent, is to reset L1's guest state to the host state specified
4233 * in vmcs12.
4234 * This function is to be called not only on normal nested exit, but also on
4235 * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry
4236 * Failures During or After Loading Guest State").
4237 * This function should be called when the active VMCS is L1's (vmcs01).
4238 */
4239static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
4240                                   struct vmcs12 *vmcs12)
4241{
4242        enum vm_entry_failure_code ignored;
4243        struct kvm_segment seg;
4244
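        /*
         * If the VM-exit does not load IA32_EFER, derive LMA and LME from
         * the "host address-space size" VM-exit control, per the SDM's rules
         * for loading host state.
         */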
4245        if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
4246                vcpu->arch.efer = vmcs12->host_ia32_efer;
4247        else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
4248                vcpu->arch.efer |= (EFER_LMA | EFER_LME);
4249        else
4250                vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
4251        vmx_set_efer(vcpu, vcpu->arch.efer);
4252
4253        kvm_rsp_write(vcpu, vmcs12->host_rsp);
4254        kvm_rip_write(vcpu, vmcs12->host_rip);
4255        vmx_set_rflags(vcpu, X86_EFLAGS_FIXED);
4256        vmx_set_interrupt_shadow(vcpu, 0);
4257
4258        /*
4259         * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
4260         * actually changed, because vmx_set_cr0 refers to efer set above.
4261         *
4262         * CR0_GUEST_HOST_MASK is already set in the original vmcs01
4263         * (KVM doesn't change it);
4264         */
4265        vcpu->arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS;
4266        vmx_set_cr0(vcpu, vmcs12->host_cr0);
4267
4268        /* Same as above - no reason to call set_cr4_guest_host_mask().  */
4269        vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
4270        vmx_set_cr4(vcpu, vmcs12->host_cr4);
4271
4272        nested_ept_uninit_mmu_context(vcpu);
4273
4274        /*
4275         * Only PDPTE load can fail as the value of cr3 was checked on entry and
4276         * couldn't have changed.
4277         */
4278        if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, true, &ignored))
4279                nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
4280
4281        nested_vmx_transition_tlb_flush(vcpu, vmcs12, false);
4282
4283        vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
4284        vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
4285        vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
4286        vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
4287        vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
4288        vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF);
4289        vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF);
4290
4291        /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1.  */
4292        if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS)
4293                vmcs_write64(GUEST_BNDCFGS, 0);
4294
4295        if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) {
4296                vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
4297                vcpu->arch.pat = vmcs12->host_ia32_pat;
4298        }
4299        if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
4300                WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
4301                                         vmcs12->host_ia32_perf_global_ctrl));
4302
4303        /* Set L1 segment info according to Intel SDM
4304            27.5.2 Loading Host Segment and Descriptor-Table Registers */
4305        seg = (struct kvm_segment) {
4306                .base = 0,
4307                .limit = 0xFFFFFFFF,
4308                .selector = vmcs12->host_cs_selector,
4309                .type = 11,
4310                .present = 1,
4311                .s = 1,
4312                .g = 1
4313        };
4314        if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
4315                seg.l = 1;
4316        else
4317                seg.db = 1;
4318        __vmx_set_segment(vcpu, &seg, VCPU_SREG_CS);
4319        seg = (struct kvm_segment) {
4320                .base = 0,
4321                .limit = 0xFFFFFFFF,
4322                .type = 3,
4323                .present = 1,
4324                .s = 1,
4325                .db = 1,
4326                .g = 1
4327        };
4328        seg.selector = vmcs12->host_ds_selector;
4329        __vmx_set_segment(vcpu, &seg, VCPU_SREG_DS);
4330        seg.selector = vmcs12->host_es_selector;
4331        __vmx_set_segment(vcpu, &seg, VCPU_SREG_ES);
4332        seg.selector = vmcs12->host_ss_selector;
4333        __vmx_set_segment(vcpu, &seg, VCPU_SREG_SS);
4334        seg.selector = vmcs12->host_fs_selector;
4335        seg.base = vmcs12->host_fs_base;
4336        __vmx_set_segment(vcpu, &seg, VCPU_SREG_FS);
4337        seg.selector = vmcs12->host_gs_selector;
4338        seg.base = vmcs12->host_gs_base;
4339        __vmx_set_segment(vcpu, &seg, VCPU_SREG_GS);
4340        seg = (struct kvm_segment) {
4341                .base = vmcs12->host_tr_base,
4342                .limit = 0x67,
4343                .selector = vmcs12->host_tr_selector,
4344                .type = 11,
4345                .present = 1
4346        };
4347        __vmx_set_segment(vcpu, &seg, VCPU_SREG_TR);
4348
4349        memset(&seg, 0, sizeof(seg));
4350        seg.unusable = 1;
4351        __vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR);
4352
4353        kvm_set_dr(vcpu, 7, 0x400);
4354        vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
4355
4356        if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
4357                                vmcs12->vm_exit_msr_load_count))
4358                nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
4359
4360        to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu);
4361}
4362
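/*
 * Recover the EFER value the guest had under vmcs01 by checking each of the
 * mechanisms KVM may have used to switch EFER: the VM-entry load control,
 * the MSR autoload list, and the user-return MSR slots.  If none of them
 * tracks EFER, the guest was running with the host's EFER value.
 */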
4363static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
4364{
4365        struct vmx_uret_msr *efer_msr;
4366        unsigned int i;
4367
4368        if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER)
4369                return vmcs_read64(GUEST_IA32_EFER);
4370
4371        if (cpu_has_load_ia32_efer())
4372                return host_efer;
4373
4374        for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) {
4375                if (vmx->msr_autoload.guest.val[i].index == MSR_EFER)
4376                        return vmx->msr_autoload.guest.val[i].value;
4377        }
4378
4379        efer_msr = vmx_find_uret_msr(vmx, MSR_EFER);
4380        if (efer_msr)
4381                return efer_msr->data;
4382
4383        return host_efer;
4384}
4385
4386static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
4387{
4388        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
4389        struct vcpu_vmx *vmx = to_vmx(vcpu);
4390        struct vmx_msr_entry g, h;
4391        gpa_t gpa;
4392        u32 i, j;
4393
4394        vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT);
4395
4396        if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) {
4397                /*
4398                 * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set
4399                 * as vmcs01.GUEST_DR7 contains a userspace defined value
4400                 * and vcpu->arch.dr7 is not squirreled away before the
4401                 * nested VMENTER (not worth adding a variable in nested_vmx).
4402                 */
4403                if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
4404                        kvm_set_dr(vcpu, 7, DR7_FIXED_1);
4405                else
4406                        WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7)));
4407        }
4408
4409        /*
4410         * Note that calling vmx_set_{efer,cr0,cr4} is important as they
4411         * handle a variety of side effects to KVM's software model.
4412         */
4413        vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx));
4414
4415        vcpu->arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS;
4416        vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW));
4417
4418        vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
4419        vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW));
4420
4421        nested_ept_uninit_mmu_context(vcpu);
4422        vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
4423        kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
4424
4425        /*
4426         * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs
4427         * from vmcs01 (if necessary).  The PDPTRs are not loaded on
4428         * VMFail; like everything else, we just need to ensure our
4429         * software model is up-to-date.
4430         */
4431        if (enable_ept && is_pae_paging(vcpu))
4432                ept_save_pdptrs(vcpu);
4433
4434        kvm_mmu_reset_context(vcpu);
4435
4436        /*
4437         * This nasty bit of open coding is a compromise between blindly
4438         * loading L1's MSRs using the exit load lists (incorrect emulation
4439         * of VMFail), leaving the nested VM's MSRs in the software model
4440         * (incorrect behavior) and snapshotting the modified MSRs (too
4441         * expensive since the list sizes are not bounded by hardware).  For each
4442         * MSR that was (prematurely) loaded from the nested VMEntry load
4443         * list, reload it from the exit load list if it exists and differs
4444         * from the guest value.  The intent is to stuff host state as
4445         * silently as possible, not to fully process the exit load list.
4446         */
4447        for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) {
4448                gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g));
4449                if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) {
4450                        pr_debug_ratelimited(
4451                                "%s read MSR index failed (%u, 0x%08llx)\n",
4452                                __func__, i, gpa);
4453                        goto vmabort;
4454                }
4455
4456                for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) {
4457                        gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h));
4458                        if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) {
4459                                pr_debug_ratelimited(
4460                                        "%s read MSR failed (%u, 0x%08llx)\n",
4461                                        __func__, j, gpa);
4462                                goto vmabort;
4463                        }
4464                        if (h.index != g.index)
4465                                continue;
4466                        if (h.value == g.value)
4467                                break;
4468
4469                        if (nested_vmx_load_msr_check(vcpu, &h)) {
4470                                pr_debug_ratelimited(
4471                                        "%s check failed (%u, 0x%x, 0x%x)\n",
4472                                        __func__, j, h.index, h.reserved);
4473                                goto vmabort;
4474                        }
4475
4476                        if (kvm_set_msr(vcpu, h.index, h.value)) {
4477                                pr_debug_ratelimited(
4478                                        "%s WRMSR failed (%u, 0x%x, 0x%llx)\n",
4479                                        __func__, j, h.index, h.value);
4480                                goto vmabort;
4481                        }
4482                }
4483        }
4484
4485        return;
4486
4487vmabort:
4488        nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
4489}
4490
4491/*
4492 * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1
4493 * and modify vmcs12 to make it see what it would expect to see there if
4494 * L2 was its real guest. Must only be called when in L2 (is_guest_mode())
4495 */
4496void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
4497                       u32 exit_intr_info, unsigned long exit_qualification)
4498{
4499        struct vcpu_vmx *vmx = to_vmx(vcpu);
4500        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
4501
4502        /* trying to cancel vmlaunch/vmresume is a bug */
4503        WARN_ON_ONCE(vmx->nested.nested_run_pending);
4504
4505        /* Similarly, triple faults in L2 should never escape. */
4506        WARN_ON_ONCE(kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu));
4507
4508        if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
4509                /*
4510                 * KVM_REQ_GET_NESTED_STATE_PAGES is also used to map
4511                 * Enlightened VMCS after migration and we still need to
4512                 * do that when something is forcing L2->L1 exit prior to
4513                 * the first L2 run.
4514                 */
4515                (void)nested_get_evmcs_page(vcpu);
4516        }
4517
4518        /* Service the TLB flush request for L2 before switching to L1. */
4519        if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
4520                kvm_vcpu_flush_tlb_current(vcpu);
4521
4522        /*
4523         * VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between
4524         * now and the new vmentry.  Ensure that the VMCS02 PDPTR fields are
4525         * up-to-date before switching to L1.
4526         */
4527        if (enable_ept && is_pae_paging(vcpu))
4528                vmx_ept_load_pdptrs(vcpu);
4529
4530        leave_guest_mode(vcpu);
4531
4532        if (nested_cpu_has_preemption_timer(vmcs12))
4533                hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);
4534
4535        if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) {
4536                vcpu->arch.tsc_offset = vcpu->arch.l1_tsc_offset;
4537                if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING))
4538                        vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio;
4539        }
4540
4541        if (likely(!vmx->fail)) {
4542                sync_vmcs02_to_vmcs12(vcpu, vmcs12);
4543
4544                if (vm_exit_reason != -1)
4545                        prepare_vmcs12(vcpu, vmcs12, vm_exit_reason,
4546                                       exit_intr_info, exit_qualification);
4547
4548                /*
4549                 * Must happen outside of sync_vmcs02_to_vmcs12() as it will
4550                 * also be used to capture vmcs12 cache as part of
4551                 * capturing nVMX state for snapshot (migration).
4552                 *
4553                 * Otherwise, this flush will dirty guest memory at a
4554                 * point it is already assumed by user-space to be
4555                 * immutable.
4556                 */
4557                nested_flush_cached_shadow_vmcs12(vcpu, vmcs12);
4558        } else {
4559                /*
4560                 * The only expected VM-instruction error is "VM entry with
4561                 * invalid control field(s)." Anything else indicates a
4562                 * problem with L0.  And we should never get here with a
4563                 * VMFail of any type if early consistency checks are enabled.
4564                 */
4565                WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
4566                             VMXERR_ENTRY_INVALID_CONTROL_FIELD);
4567                WARN_ON_ONCE(nested_early_check);
4568        }
4569
4570        vmx_switch_vmcs(vcpu, &vmx->vmcs01);
4571
4572        /* Update any VMCS fields that might have changed while L2 ran */
4573        vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
4574        vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
4575        vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
4576        if (kvm_has_tsc_control)
4577                vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
4578
4579        if (vmx->nested.l1_tpr_threshold != -1)
4580                vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold);
4581
4582        if (vmx->nested.change_vmcs01_virtual_apic_mode) {
4583                vmx->nested.change_vmcs01_virtual_apic_mode = false;
4584                vmx_set_virtual_apic_mode(vcpu);
4585        }
4586
4587        if (vmx->nested.update_vmcs01_cpu_dirty_logging) {
4588                vmx->nested.update_vmcs01_cpu_dirty_logging = false;
4589                vmx_update_cpu_dirty_logging(vcpu);
4590        }
4591
4592        /* Unpin physical memory we referred to in vmcs02 */
4593        if (vmx->nested.apic_access_page) {
4594                kvm_release_page_clean(vmx->nested.apic_access_page);
4595                vmx->nested.apic_access_page = NULL;
4596        }
4597        kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
4598        kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
4599        vmx->nested.pi_desc = NULL;
4600
4601        if (vmx->nested.reload_vmcs01_apic_access_page) {
4602                vmx->nested.reload_vmcs01_apic_access_page = false;
4603                kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
4604        }
4605
4606        if ((vm_exit_reason != -1) &&
4607            (enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)))
4608                vmx->nested.need_vmcs12_to_shadow_sync = true;
4609
4610        /* in case we halted in L2 */
4611        vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
4612
4613        if (likely(!vmx->fail)) {
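                /*
                 * If L1 enabled the "acknowledge interrupt on exit" VM-exit
                 * control, consume the pending interrupt from the local APIC
                 * and report its vector in the VM-exit interruption info.
                 */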
4614                if ((u16)vm_exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
4615                    nested_exit_intr_ack_set(vcpu)) {
4616                        int irq = kvm_cpu_get_interrupt(vcpu);
4617                        WARN_ON(irq < 0);
4618                        vmcs12->vm_exit_intr_info = irq |
4619                                INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
4620                }
4621
4622                if (vm_exit_reason != -1)
4623                        trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
4624                                                       vmcs12->exit_qualification,
4625                                                       vmcs12->idt_vectoring_info_field,
4626                                                       vmcs12->vm_exit_intr_info,
4627                                                       vmcs12->vm_exit_intr_error_code,
4628                                                       KVM_ISA_VMX);
4629
4630                load_vmcs12_host_state(vcpu, vmcs12);
4631
4632                return;
4633        }
4634
4635        /*
4636         * After an early L2 VM-entry failure, we're now back
4637         * in L1 which thinks it just finished a VMLAUNCH or
4638         * VMRESUME instruction, so we need to set the failure
4639         * flag and the VM-instruction error field of the VMCS
4640         * accordingly, and skip the emulated instruction.
4641         */
4642        (void)nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
4643
4644        /*
4645         * Restore L1's host state to KVM's software model.  We're here
4646         * because a consistency check was caught by hardware, which
4647         * means some amount of guest state has been propagated to KVM's
4648         * model and needs to be unwound to the host's state.
4649         */
4650        nested_vmx_restore_host_state(vcpu);
4651
4652        vmx->fail = 0;
4653}
4654
4655static void nested_vmx_triple_fault(struct kvm_vcpu *vcpu)
4656{
4657        nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0);
4658}
4659
4660/*
4661 * Decode the memory-address operand of a vmx instruction, as recorded on an
4662 * exit caused by such an instruction (run by a guest hypervisor).
4663 * On success, returns 0. When the operand is invalid, returns 1 and throws
4664 * #UD, #GP, or #SS.
4665 */
4666int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
4667                        u32 vmx_instruction_info, bool wr, int len, gva_t *ret)
4668{
4669        gva_t off;
4670        bool exn;
4671        struct kvm_segment s;
4672
4673        /*
4674         * According to Vol. 3B, "Information for VM Exits Due to Instruction
4675         * Execution", on an exit, vmx_instruction_info holds most of the
4676         * addressing components of the operand. Only the displacement part
4677         * is put in exit_qualification (see 3B, "Basic VM-Exit Information").
4678         * For how an actual address is calculated from all these components,
4679         * refer to Vol. 1, "Operand Addressing".
4680         */
4681        int  scaling = vmx_instruction_info & 3;
4682        int  addr_size = (vmx_instruction_info >> 7) & 7;
4683        bool is_reg = vmx_instruction_info & (1u << 10);
4684        int  seg_reg = (vmx_instruction_info >> 15) & 7;
4685        int  index_reg = (vmx_instruction_info >> 18) & 0xf;
4686        bool index_is_valid = !(vmx_instruction_info & (1u << 22));
4687        int  base_reg       = (vmx_instruction_info >> 23) & 0xf;
4688        bool base_is_valid  = !(vmx_instruction_info & (1u << 27));
4689
4690        if (is_reg) {
4691                kvm_queue_exception(vcpu, UD_VECTOR);
4692                return 1;
4693        }
4694
4695        /* Addr = segment_base + offset */
4696        /* offset = base + [index * scale] + displacement */
4697        off = exit_qualification; /* holds the displacement */
4698        if (addr_size == 1)
4699                off = (gva_t)sign_extend64(off, 31);
4700        else if (addr_size == 0)
4701                off = (gva_t)sign_extend64(off, 15);
4702        if (base_is_valid)
4703                off += kvm_register_read(vcpu, base_reg);
4704        if (index_is_valid)
4705                off += kvm_register_read(vcpu, index_reg) << scaling;
4706        vmx_get_segment(vcpu, &s, seg_reg);
4707
4708        /*
4709         * The effective address, i.e. @off, of a memory operand is truncated
4710         * based on the address size of the instruction.  Note that this is
4711         * the *effective address*, i.e. the address prior to accounting for
4712         * the segment's base.
4713         */
4714        if (addr_size == 1) /* 32 bit */
4715                off &= 0xffffffff;
4716        else if (addr_size == 0) /* 16 bit */
4717                off &= 0xffff;
4718
4719        /* Checks for #GP/#SS exceptions. */
4720        exn = false;
4721        if (is_long_mode(vcpu)) {
4722                /*
4723                 * The virtual/linear address is never truncated in 64-bit
4724                 * mode, e.g. a 32-bit address size can yield a 64-bit virtual
4725                 * address when using FS/GS with a non-zero base.
4726                 */
4727                if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS)
4728                        *ret = s.base + off;
4729                else
4730                        *ret = off;
4731
4732                /* Long mode: #GP(0)/#SS(0) if the memory address is in a
4733                 * non-canonical form. This is the only check on the memory
4734                 * destination for long mode!
4735                 */
4736                exn = is_noncanonical_address(*ret, vcpu);
4737        } else {
4738                /*
4739                 * When not in long mode, the virtual/linear address is
4740                 * unconditionally truncated to 32 bits regardless of the
4741                 * address size.
4742                 */
4743                *ret = (s.base + off) & 0xffffffff;
4744
4745                /* Protected mode: apply checks for segment validity in the
4746                 * following order:
4747                 * - segment type check (#GP(0) may be thrown)
4748                 * - usability check (#GP(0)/#SS(0))
4749                 * - limit check (#GP(0)/#SS(0))
4750                 */
4751                if (wr)
4752                        /* #GP(0) if the destination operand is located in a
4753                         * read-only data segment or any code segment.
4754                         */
4755                        exn = ((s.type & 0xa) == 0 || (s.type & 8));
4756                else
4757                        /* #GP(0) if the source operand is located in an
4758                         * execute-only code segment
4759                         */
4760                        exn = ((s.type & 0xa) == 8);
4761                if (exn) {
4762                        kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
4763                        return 1;
4764                }
4765                /* Protected mode: #GP(0)/#SS(0) if the segment is unusable.
4766                 */
4767                exn = (s.unusable != 0);
4768
4769                /*
4770                 * Protected mode: #GP(0)/#SS(0) if the memory operand is
4771                 * outside the segment limit.  All CPUs that support VMX ignore
4772                 * limit checks for flat segments, i.e. segments with base==0,
4773                 * limit==0xffffffff and of type expand-up data or code.
4774                 */
4775                if (!(s.base == 0 && s.limit == 0xffffffff &&
4776                     ((s.type & 8) || !(s.type & 4))))
4777                        exn = exn || ((u64)off + len - 1 > s.limit);
4778        }
4779        if (exn) {
4780                kvm_queue_exception_e(vcpu,
4781                                      seg_reg == VCPU_SREG_SS ?
4782                                                SS_VECTOR : GP_VECTOR,
4783                                      0);
4784                return 1;
4785        }
4786
4787        return 0;
4788}
4789
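/*
 * Expose the VM-entry/VM-exit "load IA32_PERF_GLOBAL_CTRL" controls to L1
 * only when the vCPU's virtual PMU actually implements that MSR, and hide
 * them otherwise.
 */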
4790void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
4791{
4792        struct vcpu_vmx *vmx;
4793
4794        if (!nested_vmx_allowed(vcpu))
4795                return;
4796
4797        vmx = to_vmx(vcpu);
4798        if (kvm_x86_ops.pmu_ops->is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL)) {
4799                vmx->nested.msrs.entry_ctls_high |=
4800                                VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
4801                vmx->nested.msrs.exit_ctls_high |=
4802                                VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
4803        } else {
4804                vmx->nested.msrs.entry_ctls_high &=
4805                                ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
4806                vmx->nested.msrs.exit_ctls_high &=
4807                                ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
4808        }
4809}
4810
4811static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer,
4812                                int *ret)
4813{
4814        gva_t gva;
4815        struct x86_exception e;
4816        int r;
4817
4818        if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
4819                                vmcs_read32(VMX_INSTRUCTION_INFO), false,
4820                                sizeof(*vmpointer), &gva)) {
4821                *ret = 1;
4822                return -EINVAL;
4823        }
4824
4825        r = kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e);
4826        if (r != X86EMUL_CONTINUE) {
4827                *ret = kvm_handle_memory_failure(vcpu, r, &e);
4828                return -EINVAL;
4829        }
4830
4831        return 0;
4832}
4833
4834/*
4835 * Allocate a shadow VMCS and associate it with the currently loaded
4836 * VMCS, unless such a shadow VMCS already exists. The newly allocated
4837 * VMCS is also VMCLEARed, so that it is ready for use.
4838 */
4839static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu)
4840{
4841        struct vcpu_vmx *vmx = to_vmx(vcpu);
4842        struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs;
4843
4844        /*
4845         * We should allocate a shadow vmcs for vmcs01 only when L1
4846         * executes VMXON and free it when L1 executes VMXOFF.
4847         * As it is invalid to execute VMXON twice, we shouldn't reach
4848         * here when vmcs01 already has an allocated shadow vmcs.
4849         */
4850        WARN_ON(loaded_vmcs == &vmx->vmcs01 && loaded_vmcs->shadow_vmcs);
4851
4852        if (!loaded_vmcs->shadow_vmcs) {
4853                loaded_vmcs->shadow_vmcs = alloc_vmcs(true);
4854                if (loaded_vmcs->shadow_vmcs)
4855                        vmcs_clear(loaded_vmcs->shadow_vmcs);
4856        }
4857        return loaded_vmcs->shadow_vmcs;
4858}
4859
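/*
 * Emulate entry into VMX operation: allocate vmcs02 and the vmcs12 caches,
 * optionally a shadow VMCS, set up the emulated preemption timer, and
 * allocate a VPID for L2.
 */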
4860static int enter_vmx_operation(struct kvm_vcpu *vcpu)
4861{
4862        struct vcpu_vmx *vmx = to_vmx(vcpu);
4863        int r;
4864
4865        r = alloc_loaded_vmcs(&vmx->nested.vmcs02);
4866        if (r < 0)
4867                goto out_vmcs02;
4868
4869        vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
4870        if (!vmx->nested.cached_vmcs12)
4871                goto out_cached_vmcs12;
4872
4873        vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
4874        if (!vmx->nested.cached_shadow_vmcs12)
4875                goto out_cached_shadow_vmcs12;
4876
4877        if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu))
4878                goto out_shadow_vmcs;
4879
4880        hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
4881                     HRTIMER_MODE_ABS_PINNED);
4882        vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
4883
4884        vmx->nested.vpid02 = allocate_vpid();
4885
4886        vmx->nested.vmcs02_initialized = false;
4887        vmx->nested.vmxon = true;
4888
4889        if (vmx_pt_mode_is_host_guest()) {
4890                vmx->pt_desc.guest.ctl = 0;
4891                pt_update_intercept_for_msr(vcpu);
4892        }
4893
4894        return 0;
4895
4896out_shadow_vmcs:
4897        kfree(vmx->nested.cached_shadow_vmcs12);
4898
4899out_cached_shadow_vmcs12:
4900        kfree(vmx->nested.cached_vmcs12);
4901
4902out_cached_vmcs12:
4903        free_loaded_vmcs(&vmx->nested.vmcs02);
4904
4905out_vmcs02:
4906        return -ENOMEM;
4907}
4908
4909/* Emulate the VMXON instruction. */
4910static int handle_vmon(struct kvm_vcpu *vcpu)
4911{
4912        int ret;
4913        gpa_t vmptr;
4914        uint32_t revision;
4915        struct vcpu_vmx *vmx = to_vmx(vcpu);
4916        const u64 VMXON_NEEDED_FEATURES = FEAT_CTL_LOCKED
4917                | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
4918
4919        /*
4920         * The Intel VMX Instruction Reference lists a bunch of bits that are
4921         * prerequisite to running VMXON, most notably cr4.VMXE must be set to
4922         * 1 (see vmx_is_valid_cr4() for when we allow the guest to set this).
4923         * Otherwise, we should fail with #UD.  But most faulting conditions
4924         * have already been checked by hardware, prior to the VM-exit for
4925         * VMXON.  We do test guest cr4.VMXE because processor CR4 always has
4926         * that bit set to 1 in non-root mode.
4927         */
4928        if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) {
4929                kvm_queue_exception(vcpu, UD_VECTOR);
4930                return 1;
4931        }
4932
4933        /* CPL=0 must be checked manually. */
4934        if (vmx_get_cpl(vcpu)) {
4935                kvm_inject_gp(vcpu, 0);
4936                return 1;
4937        }
4938
4939        if (vmx->nested.vmxon)
4940                return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
4941
4942        if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
4943                        != VMXON_NEEDED_FEATURES) {
4944                kvm_inject_gp(vcpu, 0);
4945                return 1;
4946        }
4947
4948        if (nested_vmx_get_vmptr(vcpu, &vmptr, &ret))
4949                return ret;
4950
4951        /*
4952         * SDM 3: 24.11.5
4953         * The first 4 bytes of the VMXON region contain the supported
4954         * VMCS revision identifier.
4955         *
4956         * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case;
4957         * if it were, it would limit the physical address width to 32 bits.
4958         */
4959        if (!page_address_valid(vcpu, vmptr))
4960                return nested_vmx_failInvalid(vcpu);
4961
4962        if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) ||
4963            revision != VMCS12_REVISION)
4964                return nested_vmx_failInvalid(vcpu);
4965
4966        vmx->nested.vmxon_ptr = vmptr;
4967        ret = enter_vmx_operation(vcpu);
4968        if (ret)
4969                return ret;
4970
4971        return nested_vmx_succeed(vcpu);
4972}
4973
4974static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu)
4975{
4976        struct vcpu_vmx *vmx = to_vmx(vcpu);
4977
4978        if (vmx->nested.current_vmptr == -1ull)
4979                return;
4980
4981        copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
4982
4983        if (enable_shadow_vmcs) {
4984                /* copy to memory all shadowed fields in case
4985                   they were modified */
4986                copy_shadow_to_vmcs12(vmx);
4987                vmx_disable_shadow_vmcs(vmx);
4988        }
4989        vmx->nested.posted_intr_nv = -1;
4990
4991        /* Flush VMCS12 to guest memory */
4992        kvm_vcpu_write_guest_page(vcpu,
4993                                  vmx->nested.current_vmptr >> PAGE_SHIFT,
4994                                  vmx->nested.cached_vmcs12, 0, VMCS12_SIZE);
4995
4996        kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
4997
4998        vmx->nested.current_vmptr = -1ull;
4999}
5000
5001/* Emulate the VMXOFF instruction */
5002static int handle_vmoff(struct kvm_vcpu *vcpu)
5003{
5004        if (!nested_vmx_check_permission(vcpu))
5005                return 1;
5006
5007        free_nested(vcpu);
5008
5009        /* Process a latched INIT during time CPU was in VMX operation */
5010        kvm_make_request(KVM_REQ_EVENT, vcpu);
5011
5012        return nested_vmx_succeed(vcpu);
5013}
5014
5015/* Emulate the VMCLEAR instruction */
5016static int handle_vmclear(struct kvm_vcpu *vcpu)
5017{
5018        struct vcpu_vmx *vmx = to_vmx(vcpu);
5019        u32 zero = 0;
5020        gpa_t vmptr;
5021        u64 evmcs_gpa;
5022        int r;
5023
5024        if (!nested_vmx_check_permission(vcpu))
5025                return 1;
5026
5027        if (nested_vmx_get_vmptr(vcpu, &vmptr, &r))
5028                return r;
5029
5030        if (!page_address_valid(vcpu, vmptr))
5031                return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS);
5032
5033        if (vmptr == vmx->nested.vmxon_ptr)
5034                return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_VMXON_POINTER);
5035
5036        /*
5037         * When Enlightened VMEntry is enabled on the calling CPU we treat
5038         * memory area pointed to by vmptr as an Enlightened VMCS (there's no good
5039         * way to distinguish it from VMCS12) and we must not corrupt it by
5040         * writing to the non-existent 'launch_state' field. The area doesn't
5041         * have to be the currently active EVMCS on the calling CPU and there's
5042         * nothing KVM has to do to transition it from 'active' to 'non-active'
5043         * state. It is possible that the area will stay mapped as
5044         * vmx->nested.hv_evmcs but this shouldn't be a problem.
5045         */
5046        if (likely(!vmx->nested.enlightened_vmcs_enabled ||
5047                   !nested_enlightened_vmentry(vcpu, &evmcs_gpa))) {
5048                if (vmptr == vmx->nested.current_vmptr)
5049                        nested_release_vmcs12(vcpu);
5050
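                /*
                 * Emulate VMCLEAR by clearing the launch state of the target
                 * vmcs12 in guest memory; KVM does not track any other
                 * launched/clear state for it.
                 */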
5051                kvm_vcpu_write_guest(vcpu,
5052                                     vmptr + offsetof(struct vmcs12,
5053                                                      launch_state),
5054                                     &zero, sizeof(zero));
5055        } else if (vmx->nested.hv_evmcs && vmptr == vmx->nested.hv_evmcs_vmptr) {
5056                nested_release_evmcs(vcpu);
5057        }
5058
5059        return nested_vmx_succeed(vcpu);
5060}
5061
5062/* Emulate the VMLAUNCH instruction */
5063static int handle_vmlaunch(struct kvm_vcpu *vcpu)
5064{
5065        return nested_vmx_run(vcpu, true);
5066}
5067
5068/* Emulate the VMRESUME instruction */
5069static int handle_vmresume(struct kvm_vcpu *vcpu)
5070{
5072        return nested_vmx_run(vcpu, false);
5073}
5074
5075static int handle_vmread(struct kvm_vcpu *vcpu)
5076{
5077        struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu)
5078                                                    : get_vmcs12(vcpu);
5079        unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
5080        u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5081        struct vcpu_vmx *vmx = to_vmx(vcpu);
5082        struct x86_exception e;
5083        unsigned long field;
5084        u64 value;
5085        gva_t gva = 0;
5086        short offset;
5087        int len, r;
5088
5089        if (!nested_vmx_check_permission(vcpu))
5090                return 1;
5091
5092        /*
5093         * In VMX non-root operation, when the VMCS-link pointer is -1ull,
5094         * any VMREAD sets the ALU flags for VMfailInvalid.
5095         */
5096        if (vmx->nested.current_vmptr == -1ull ||
5097            (is_guest_mode(vcpu) &&
5098             get_vmcs12(vcpu)->vmcs_link_pointer == -1ull))
5099                return nested_vmx_failInvalid(vcpu);
5100
5101        /* Decode instruction info and find the field to read */
5102        field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf));
5103
5104        offset = vmcs_field_to_offset(field);
5105        if (offset < 0)
5106                return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
5107
5108        if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field))
5109                copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
5110
5111        /* Read the field, zero-extended to a u64 value */
5112        value = vmcs12_read_any(vmcs12, field, offset);
5113
5114        /*
5115         * Now copy part of this value to register or memory, as requested.
5116         * Note that the number of bits actually copied is 32 or 64 depending
5117         * on the guest's mode (32 or 64 bit), not on the given field's length.
5118         */
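        /* Bit 10 of the instruction info is set for a register destination. */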
5119        if (instr_info & BIT(10)) {
5120                kvm_register_write(vcpu, (((instr_info) >> 3) & 0xf), value);
5121        } else {
5122                len = is_64_bit_mode(vcpu) ? 8 : 4;
5123                if (get_vmx_mem_address(vcpu, exit_qualification,
5124                                        instr_info, true, len, &gva))
5125                        return 1;
5126                /* _system ok, nested_vmx_check_permission has verified cpl=0 */
5127                r = kvm_write_guest_virt_system(vcpu, gva, &value, len, &e);
5128                if (r != X86EMUL_CONTINUE)
5129                        return kvm_handle_memory_failure(vcpu, r, &e);
5130        }
5131
5132        return nested_vmx_succeed(vcpu);
5133}
5134
5135static bool is_shadow_field_rw(unsigned long field)
5136{
5137        switch (field) {
5138#define SHADOW_FIELD_RW(x, y) case x:
5139#include "vmcs_shadow_fields.h"
5140                return true;
5141        default:
5142                break;
5143        }
5144        return false;
5145}
5146
5147static bool is_shadow_field_ro(unsigned long field)
5148{
5149        switch (field) {
5150#define SHADOW_FIELD_RO(x, y) case x:
5151#include "vmcs_shadow_fields.h"
5152                return true;
5153        default:
5154                break;
5155        }
5156        return false;
5157}
5158
5159static int handle_vmwrite(struct kvm_vcpu *vcpu)
5160{
5161        struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu)
5162                                                    : get_vmcs12(vcpu);
5163        unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
5164        u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5165        struct vcpu_vmx *vmx = to_vmx(vcpu);
5166        struct x86_exception e;
5167        unsigned long field;
5168        short offset;
5169        gva_t gva;
5170        int len, r;
5171
5172        /*
5173         * The value to write might be 32 or 64 bits, depending on L1's long
5174         * mode, and eventually we need to write that into a field of several
5175         * possible lengths. The code below first zero-extends the value to 64
5176         * bit (value), and then copies only the appropriate number of
5177         * bits into the vmcs12 field.
5178         */
5179        u64 value = 0;
5180
5181        if (!nested_vmx_check_permission(vcpu))
5182                return 1;
5183
5184        /*
5185         * In VMX non-root operation, when the VMCS-link pointer is -1ull,
5186         * any VMWRITE sets the ALU flags for VMfailInvalid.
5187         */
5188        if (vmx->nested.current_vmptr == -1ull ||
5189            (is_guest_mode(vcpu) &&
5190             get_vmcs12(vcpu)->vmcs_link_pointer == -1ull))
5191                return nested_vmx_failInvalid(vcpu);
5192
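            /*
             * Bit 10 of the instruction info selects a register (1) vs.
             * memory (0) source operand; bits 6:3 encode the source GPR.
             */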
5193        if (instr_info & BIT(10))
5194                value = kvm_register_read(vcpu, (((instr_info) >> 3) & 0xf));
5195        else {
5196                len = is_64_bit_mode(vcpu) ? 8 : 4;
5197                if (get_vmx_mem_address(vcpu, exit_qualification,
5198                                        instr_info, false, len, &gva))
5199                        return 1;
5200                r = kvm_read_guest_virt(vcpu, gva, &value, len, &e);
5201                if (r != X86EMUL_CONTINUE)
5202                        return kvm_handle_memory_failure(vcpu, r, &e);
5203        }
5204
5205        field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf));
5206
5207        offset = vmcs_field_to_offset(field);
5208        if (offset < 0)
5209                return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
5210
5211        /*
5212         * If the vCPU supports "VMWRITE to any supported field in the
5213         * VMCS," then the "read-only" fields are actually read/write.
5214         */
5215        if (vmcs_field_readonly(field) &&
5216            !nested_cpu_has_vmwrite_any_field(vcpu))
5217                return nested_vmx_fail(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
5218
5219        /*
5220         * Ensure vmcs12 is up-to-date before any VMWRITE that dirties
5221         * vmcs12, else we may clobber a field or consume a stale value.
5222         */
5223        if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field))
5224                copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
5225
5226        /*
5227         * Some Intel CPUs intentionally drop the reserved bits of the AR byte
5228         * fields on VMWRITE.  Emulate this behavior to ensure consistent KVM
5229         * behavior regardless of the underlying hardware, e.g. if an AR_BYTE
5230         * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD
5231         * from L1 will return a different value than VMREAD from L2 (L1 sees
5232         * the stripped down value, L2 sees the full value as stored by KVM).
5233         */
5234        if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES)
5235                value &= 0x1f0ff;
5236
5237        vmcs12_write_any(vmcs12, field, offset, value);
5238
5239        /*
5240         * Do not track vmcs12 dirty-state if in guest-mode as we actually
5241         * dirty shadow vmcs12 instead of vmcs12.  Fields that can be updated
5242         * by L1 without a vmexit are always updated in the vmcs02, i.e. don't
5243         * "dirty" vmcs12, all others go down the prepare_vmcs02() slow path.
5244         */
5245        if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) {
5246                /*
5247                 * L1 can read these fields without exiting, ensure the
5248                 * shadow VMCS is up-to-date.
5249                 */
5250                if (enable_shadow_vmcs && is_shadow_field_ro(field)) {
5251                        preempt_disable();
5252                        vmcs_load(vmx->vmcs01.shadow_vmcs);
5253
5254                        __vmcs_writel(field, value);
5255
5256                        vmcs_clear(vmx->vmcs01.shadow_vmcs);
5257                        vmcs_load(vmx->loaded_vmcs->vmcs);
5258                        preempt_enable();
5259                }
5260                vmx->nested.dirty_vmcs12 = true;
5261        }
5262
5263        return nested_vmx_succeed(vcpu);
5264}
5265
5266static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
5267{
5268        vmx->nested.current_vmptr = vmptr;
5269        if (enable_shadow_vmcs) {
5270                secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
5271                vmcs_write64(VMCS_LINK_POINTER,
5272                             __pa(vmx->vmcs01.shadow_vmcs));
5273                vmx->nested.need_vmcs12_to_shadow_sync = true;
5274        }
5275        vmx->nested.dirty_vmcs12 = true;
5276}
5277
5278/* Emulate the VMPTRLD instruction */
5279static int handle_vmptrld(struct kvm_vcpu *vcpu)
5280{
5281        struct vcpu_vmx *vmx = to_vmx(vcpu);
5282        gpa_t vmptr;
5283        int r;
5284
5285        if (!nested_vmx_check_permission(vcpu))
5286                return 1;
5287
5288        if (nested_vmx_get_vmptr(vcpu, &vmptr, &r))
5289                return r;
5290
5291        if (!page_address_valid(vcpu, vmptr))
5292                return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS);
5293
5294        if (vmptr == vmx->nested.vmxon_ptr)
5295                return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_VMXON_POINTER);
5296
5297        /* Forbid normal VMPTRLD if Enlightened version was used */
5298        if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
5299                return 1;
5300
5301        if (vmx->nested.current_vmptr != vmptr) {
5302                struct kvm_host_map map;
5303                struct vmcs12 *new_vmcs12;
5304
5305                if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmptr), &map)) {
5306                        /*
5307                         * Reads from an unbacked page return all 1s,
5308                         * which means that the 32 bits located at the
5309                         * given physical address won't match the required
5310                         * VMCS12_REVISION identifier.
5311                         */
5312                        return nested_vmx_fail(vcpu,
5313                                VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
5314                }
5315
5316                new_vmcs12 = map.hva;
5317
5318                if (new_vmcs12->hdr.revision_id != VMCS12_REVISION ||
5319                    (new_vmcs12->hdr.shadow_vmcs &&
5320                     !nested_cpu_has_vmx_shadow_vmcs(vcpu))) {
5321                        kvm_vcpu_unmap(vcpu, &map, false);
5322                        return nested_vmx_fail(vcpu,
5323                                VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
5324                }
5325
5326                nested_release_vmcs12(vcpu);
5327
5328                /*
5329                 * Load VMCS12 from guest memory since it is not already
5330                 * cached.
5331                 */
5332                memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE);
5333                kvm_vcpu_unmap(vcpu, &map, false);
5334
5335                set_current_vmptr(vmx, vmptr);
5336        }
5337
5338        return nested_vmx_succeed(vcpu);
5339}
5340
5341/* Emulate the VMPTRST instruction */
5342static int handle_vmptrst(struct kvm_vcpu *vcpu)
5343{
5344        unsigned long exit_qual = vmx_get_exit_qual(vcpu);
5345        u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5346        gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr;
5347        struct x86_exception e;
5348        gva_t gva;
5349        int r;
5350
5351        if (!nested_vmx_check_permission(vcpu))
5352                return 1;
5353
5354        if (unlikely(evmptr_is_valid(to_vmx(vcpu)->nested.hv_evmcs_vmptr)))
5355                return 1;
5356
5357        if (get_vmx_mem_address(vcpu, exit_qual, instr_info,
5358                                true, sizeof(gpa_t), &gva))
5359                return 1;
5360        /* *_system ok, nested_vmx_check_permission has verified cpl=0 */
5361        r = kvm_write_guest_virt_system(vcpu, gva, (void *)&current_vmptr,
5362                                        sizeof(gpa_t), &e);
5363        if (r != X86EMUL_CONTINUE)
5364                return kvm_handle_memory_failure(vcpu, r, &e);
5365
5366        return nested_vmx_succeed(vcpu);
5367}
5368
5369/* Emulate the INVEPT instruction */
5370static int handle_invept(struct kvm_vcpu *vcpu)
5371{
5372        struct vcpu_vmx *vmx = to_vmx(vcpu);
5373        u32 vmx_instruction_info, types;
5374        unsigned long type, roots_to_free;
5375        struct kvm_mmu *mmu;
5376        gva_t gva;
5377        struct x86_exception e;
5378        struct {
5379                u64 eptp, gpa;
5380        } operand;
5381        int i, r;
5382
5383        if (!(vmx->nested.msrs.secondary_ctls_high &
5384              SECONDARY_EXEC_ENABLE_EPT) ||
5385            !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) {
5386                kvm_queue_exception(vcpu, UD_VECTOR);
5387                return 1;
5388        }
5389
5390        if (!nested_vmx_check_permission(vcpu))
5391                return 1;
5392
5393        vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5394        type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf);
5395
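            /*
             * After the shift, bit 1 indicates support for the single-context
             * (type 1) extent and bit 2 for the global (type 2) extent; the
             * "& 6" masks off all other capability bits.
             */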
5396        types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
5397
5398        if (type >= 32 || !(types & (1 << type)))
5399                return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
5400
5401        /* According to the Intel VMX instruction reference, the memory
5402         * operand is read even if it isn't needed (e.g., for type==global)
5403         */
5404        if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
5405                        vmx_instruction_info, false, sizeof(operand), &gva))
5406                return 1;
5407        r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
5408        if (r != X86EMUL_CONTINUE)
5409                return kvm_handle_memory_failure(vcpu, r, &e);
5410
5411        /*
5412         * Nested EPT roots are always held through guest_mmu,
5413         * not root_mmu.
5414         */
5415        mmu = &vcpu->arch.guest_mmu;
5416
5417        switch (type) {
5418        case VMX_EPT_EXTENT_CONTEXT:
5419                if (!nested_vmx_check_eptp(vcpu, operand.eptp))
5420                        return nested_vmx_fail(vcpu,
5421                                VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
5422
5423                roots_to_free = 0;
5424                if (nested_ept_root_matches(mmu->root_hpa, mmu->root_pgd,
5425                                            operand.eptp))
5426                        roots_to_free |= KVM_MMU_ROOT_CURRENT;
5427
5428                for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
5429                        if (nested_ept_root_matches(mmu->prev_roots[i].hpa,
5430                                                    mmu->prev_roots[i].pgd,
5431                                                    operand.eptp))
5432                                roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
5433                }
5434                break;
5435        case VMX_EPT_EXTENT_GLOBAL:
5436                roots_to_free = KVM_MMU_ROOTS_ALL;
5437                break;
5438        default:
5439                BUG();
5440                break;
5441        }
5442
5443        if (roots_to_free)
5444                kvm_mmu_free_roots(vcpu, mmu, roots_to_free);
5445
5446        return nested_vmx_succeed(vcpu);
5447}
5448
5449static int handle_invvpid(struct kvm_vcpu *vcpu)
5450{
5451        struct vcpu_vmx *vmx = to_vmx(vcpu);
5452        u32 vmx_instruction_info;
5453        unsigned long type, types;
5454        gva_t gva;
5455        struct x86_exception e;
5456        struct {
5457                u64 vpid;
5458                u64 gla;
5459        } operand;
5460        u16 vpid02;
5461        int r;
5462
5463        if (!(vmx->nested.msrs.secondary_ctls_high &
5464              SECONDARY_EXEC_ENABLE_VPID) ||
5465                        !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) {
5466                kvm_queue_exception(vcpu, UD_VECTOR);
5467                return 1;
5468        }
5469
5470        if (!nested_vmx_check_permission(vcpu))
5471                return 1;
5472
5473        vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5474        type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf);
5475
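            /*
             * The INVVPID extent capability bits start at bit 8 of the VPID
             * caps, so the shift by 8 lines bit N up with INVVPID type N.
             */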
5476        types = (vmx->nested.msrs.vpid_caps &
5477                        VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8;
5478
5479        if (type >= 32 || !(types & (1 << type)))
5480                return nested_vmx_fail(vcpu,
5481                        VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
5482
5483        /* According to the Intel VMX instruction reference, the memory
5484         * operand is read even if it isn't needed (e.g., for type==global)
5485         */
5486        if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
5487                        vmx_instruction_info, false, sizeof(operand), &gva))
5488                return 1;
5489        r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
5490        if (r != X86EMUL_CONTINUE)
5491                return kvm_handle_memory_failure(vcpu, r, &e);
5492
5493        if (operand.vpid >> 16)
5494                return nested_vmx_fail(vcpu,
5495                        VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
5496
5497        vpid02 = nested_get_vpid02(vcpu);
5498        switch (type) {
5499        case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
5500                if (!operand.vpid ||
5501                    is_noncanonical_address(operand.gla, vcpu))
5502                        return nested_vmx_fail(vcpu,
5503                                VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
5504                vpid_sync_vcpu_addr(vpid02, operand.gla);
5505                break;
5506        case VMX_VPID_EXTENT_SINGLE_CONTEXT:
5507        case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
5508                if (!operand.vpid)
5509                        return nested_vmx_fail(vcpu,
5510                                VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
5511                vpid_sync_context(vpid02);
5512                break;
5513        case VMX_VPID_EXTENT_ALL_CONTEXT:
5514                vpid_sync_context(vpid02);
5515                break;
5516        default:
5517                WARN_ON_ONCE(1);
5518                return kvm_skip_emulated_instruction(vcpu);
5519        }
5520
5521        /*
5522         * Sync the shadow page tables if EPT is disabled, L1 is invalidating
5523         * linear mappings for L2 (tagged with L2's VPID).  Free all guest
5524         * roots as VPIDs are not tracked in the MMU role.
5525         *
5526         * Note, this operates on root_mmu, not guest_mmu, as L1 and L2 share
5527         * an MMU when EPT is disabled.
5528         *
5529         * TODO: sync only the affected SPTEs for INDIVIDUAL_ADDR.
5530         */
5531        if (!enable_ept)
5532                kvm_mmu_free_guest_mode_roots(vcpu, &vcpu->arch.root_mmu);
5533
5534        return nested_vmx_succeed(vcpu);
5535}
5536
5537static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
5538                                     struct vmcs12 *vmcs12)
5539{
5540        u32 index = kvm_rcx_read(vcpu);
5541        u64 new_eptp;
5542
5543        if (WARN_ON_ONCE(!nested_cpu_has_ept(vmcs12)))
5544                return 1;
5545        if (index >= VMFUNC_EPTP_ENTRIES)
5546                return 1;
5547
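            /* The EPTP list is an array of 64-bit EPTPs, hence index * 8. */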
5548        if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT,
5549                                     &new_eptp, index * 8, 8))
5550                return 1;
5551
5552        /*
5553         * If the (L2) guest does a vmfunc to the currently
5554         * active ept pointer, we don't have to do anything else
5555         */
5556        if (vmcs12->ept_pointer != new_eptp) {
5557                if (!nested_vmx_check_eptp(vcpu, new_eptp))
5558                        return 1;
5559
5560                vmcs12->ept_pointer = new_eptp;
5561                nested_ept_new_eptp(vcpu);
5562
5563                if (!nested_cpu_has_vpid(vmcs12))
5564                        kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
5565        }
5566
5567        return 0;
5568}
5569
5570static int handle_vmfunc(struct kvm_vcpu *vcpu)
5571{
5572        struct vcpu_vmx *vmx = to_vmx(vcpu);
5573        struct vmcs12 *vmcs12;
5574        u32 function = kvm_rax_read(vcpu);
5575
5576        /*
5577         * VMFUNC is only supported for nested guests, but we always enable the
5578         * secondary control for simplicity; for non-nested mode, fake that we
5579         * didn't enable it by injecting a #UD.
5580         */
5581        if (!is_guest_mode(vcpu)) {
5582                kvm_queue_exception(vcpu, UD_VECTOR);
5583                return 1;
5584        }
5585
5586        vmcs12 = get_vmcs12(vcpu);
5587
5588        /*
5589         * #UD on out-of-bounds function has priority over VM-Exit, and VMFUNC
5590         * is enabled in vmcs02 if and only if it's enabled in vmcs12.
5591         */
5592        if (WARN_ON_ONCE((function > 63) || !nested_cpu_has_vmfunc(vmcs12))) {
5593                kvm_queue_exception(vcpu, UD_VECTOR);
5594                return 1;
5595        }
5596
5597        if (!(vmcs12->vm_function_control & BIT_ULL(function)))
5598                goto fail;
5599
5600        switch (function) {
5601        case 0:
5602                if (nested_vmx_eptp_switching(vcpu, vmcs12))
5603                        goto fail;
5604                break;
5605        default:
5606                goto fail;
5607        }
5608        return kvm_skip_emulated_instruction(vcpu);
5609
5610fail:
5611        /*
5612         * This is effectively a reflected VM-Exit, as opposed to a synthesized
5613         * nested VM-Exit.  Pass the original exit reason, i.e. don't hardcode
5614         * EXIT_REASON_VMFUNC as the exit reason.
5615         */
5616        nested_vmx_vmexit(vcpu, vmx->exit_reason.full,
5617                          vmx_get_intr_info(vcpu),
5618                          vmx_get_exit_qual(vcpu));
5619        return 1;
5620}
5621
5622/*
5623 * Return true if an IO instruction with the specified port and size should cause
5624 * a VM-exit into L1.
5625 */
5626bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port,
5627                                 int size)
5628{
5629        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5630        gpa_t bitmap, last_bitmap;
5631        u8 b;
5632
5633        last_bitmap = (gpa_t)-1;
5634        b = -1;
5635
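            /* Bitmap A covers ports 0x0000-0x7fff, bitmap B covers 0x8000-0xffff. */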
5636        while (size > 0) {
5637                if (port < 0x8000)
5638                        bitmap = vmcs12->io_bitmap_a;
5639                else if (port < 0x10000)
5640                        bitmap = vmcs12->io_bitmap_b;
5641                else
5642                        return true;
5643                bitmap += (port & 0x7fff) / 8;
5644
5645                if (last_bitmap != bitmap)
5646                        if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1))
5647                                return true;
5648                if (b & (1 << (port & 7)))
5649                        return true;
5650
5651                port++;
5652                size--;
5653                last_bitmap = bitmap;
5654        }
5655
5656        return false;
5657}
5658
5659static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
5660                                       struct vmcs12 *vmcs12)
5661{
5662        unsigned long exit_qualification;
5663        unsigned short port;
5664        int size;
5665
5666        if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
5667                return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
5668
5669        exit_qualification = vmx_get_exit_qual(vcpu);
5670
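            /*
             * Per the I/O-instruction exit qualification, bits 31:16 hold the
             * port number and bits 2:0 hold the access size minus one.
             */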
5671        port = exit_qualification >> 16;
5672        size = (exit_qualification & 7) + 1;
5673
5674        return nested_vmx_check_io_bitmaps(vcpu, port, size);
5675}
5676
5677/*
5678 * Return true if we should exit from L2 to L1 to handle an MSR access,
5679 * rather than handle it ourselves in L0. I.e., check whether L1 wants to
5680 * intercept the current event (a read or write of a specific MSR) via its
5681 * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps.
5682 */
5683static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
5684                                        struct vmcs12 *vmcs12,
5685                                        union vmx_exit_reason exit_reason)
5686{
5687        u32 msr_index = kvm_rcx_read(vcpu);
5688        gpa_t bitmap;
5689
5690        if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
5691                return true;
5692
5693        /*
5694         * The MSR_BITMAP page is divided into four 1024-byte bitmaps,
5695         * for the four combinations of read/write and low/high MSR numbers.
5696         * First we need to figure out which of the four to use:
5697         */
5698        bitmap = vmcs12->msr_bitmap;
5699        if (exit_reason.basic == EXIT_REASON_MSR_WRITE)
5700                bitmap += 2048;
5701        if (msr_index >= 0xc0000000) {
5702                msr_index -= 0xc0000000;
5703                bitmap += 1024;
5704        }
5705
5706        /* Then read the msr_index'th bit from this bitmap: */
5707        if (msr_index < 1024*8) {
5708                unsigned char b;
5709                if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1))
5710                        return true;
5711                return 1 & (b >> (msr_index & 7));
5712        } else
5713                return true; /* let L1 handle the wrong parameter */
5714}
5715
5716/*
5717 * Return true if we should exit from L2 to L1 to handle a CR access exit,
5718 * rather than handle it ourselves in L0. I.e., check if L1 wanted to
5719 * intercept (via guest_host_mask etc.) the current event.
5720 */
5721static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
5722        struct vmcs12 *vmcs12)
5723{
5724        unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
5725        int cr = exit_qualification & 15;
5726        int reg;
5727        unsigned long val;
5728
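            /*
             * CR-access exit qualification: bits 3:0 give the CR number,
             * bits 5:4 the access type (0 = MOV to CR, 1 = MOV from CR,
             * 2 = CLTS, 3 = LMSW) and bits 11:8 the GPR for MOV accesses.
             */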
5729        switch ((exit_qualification >> 4) & 3) {
5730        case 0: /* mov to cr */
5731                reg = (exit_qualification >> 8) & 15;
5732                val = kvm_register_read(vcpu, reg);
5733                switch (cr) {
5734                case 0:
5735                        if (vmcs12->cr0_guest_host_mask &
5736                            (val ^ vmcs12->cr0_read_shadow))
5737                                return true;
5738                        break;
5739                case 3:
5740                        if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
5741                                return true;
5742                        break;
5743                case 4:
5744                        if (vmcs12->cr4_guest_host_mask &
5745                            (vmcs12->cr4_read_shadow ^ val))
5746                                return true;
5747                        break;
5748                case 8:
5749                        if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING))
5750                                return true;
5751                        break;
5752                }
5753                break;
5754        case 2: /* clts */
5755                if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) &&
5756                    (vmcs12->cr0_read_shadow & X86_CR0_TS))
5757                        return true;
5758                break;
5759        case 1: /* mov from cr */
5760                switch (cr) {
5761                case 3:
5762                        if (vmcs12->cpu_based_vm_exec_control &
5763                            CPU_BASED_CR3_STORE_EXITING)
5764                                return true;
5765                        break;
5766                case 8:
5767                        if (vmcs12->cpu_based_vm_exec_control &
5768                            CPU_BASED_CR8_STORE_EXITING)
5769                                return true;
5770                        break;
5771                }
5772                break;
5773        case 3: /* lmsw */
5774                /*
5775                 * lmsw can change bits 1..3 of cr0, and only set bit 0 of
5776                 * cr0. Other attempted changes are ignored, with no exit.
5777                 */
5778                val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
5779                if (vmcs12->cr0_guest_host_mask & 0xe &
5780                    (val ^ vmcs12->cr0_read_shadow))
5781                        return true;
5782                if ((vmcs12->cr0_guest_host_mask & 0x1) &&
5783                    !(vmcs12->cr0_read_shadow & 0x1) &&
5784                    (val & 0x1))
5785                        return true;
5786                break;
5787        }
5788        return false;
5789}
5790
5791static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu,
5792                                          struct vmcs12 *vmcs12)
5793{
5794        u32 encls_leaf;
5795
5796        if (!guest_cpuid_has(vcpu, X86_FEATURE_SGX) ||
5797            !nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING))
5798                return false;
5799
5800        encls_leaf = kvm_rax_read(vcpu);
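            /* Bit 63 of the ENCLS-exiting bitmap covers all leaves >= 63. */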
5801        if (encls_leaf > 62)
5802                encls_leaf = 63;
5803        return vmcs12->encls_exiting_bitmap & BIT_ULL(encls_leaf);
5804}
5805
5806static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu,
5807        struct vmcs12 *vmcs12, gpa_t bitmap)
5808{
5809        u32 vmx_instruction_info;
5810        unsigned long field;
5811        u8 b;
5812
5813        if (!nested_cpu_has_shadow_vmcs(vmcs12))
5814                return true;
5815
5816        /* Decode instruction info and find the field to access */
5817        vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5818        field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
5819
5820        /* Out-of-range fields always cause a VM exit from L2 to L1 */
5821        if (field >> 15)
5822                return true;
5823
5824        if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1))
5825                return true;
5826
5827        return 1 & (b >> (field & 7));
5828}
5829
5830static bool nested_vmx_exit_handled_mtf(struct vmcs12 *vmcs12)
5831{
5832        u32 entry_intr_info = vmcs12->vm_entry_intr_info_field;
5833
5834        if (nested_cpu_has_mtf(vmcs12))
5835                return true;
5836
5837        /*
5838         * An MTF VM-exit may be injected into the guest by setting the
5839         * interruption-type to 7 (other event) and the vector field to 0. Such
5840         * is the case regardless of the 'monitor trap flag' VM-execution
5841         * control.
5842         */
5843        return entry_intr_info == (INTR_INFO_VALID_MASK
5844                                   | INTR_TYPE_OTHER_EVENT);
5845}
5846
5847/*
5848 * Return true if L0 wants to handle an exit from L2 regardless of whether or not
5849 * L1 wants the exit.  Only call this when in is_guest_mode (L2).
5850 */
5851static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu,
5852                                     union vmx_exit_reason exit_reason)
5853{
5854        u32 intr_info;
5855
5856        switch ((u16)exit_reason.basic) {
5857        case EXIT_REASON_EXCEPTION_NMI:
5858                intr_info = vmx_get_intr_info(vcpu);
5859                if (is_nmi(intr_info))
5860                        return true;
5861                else if (is_page_fault(intr_info))
5862                        return vcpu->arch.apf.host_apf_flags ||
5863                               vmx_need_pf_intercept(vcpu);
5864                else if (is_debug(intr_info) &&
5865                         vcpu->guest_debug &
5866                         (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
5867                        return true;
5868                else if (is_breakpoint(intr_info) &&
5869                         vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
5870                        return true;
5871                else if (is_alignment_check(intr_info) &&
5872                         !vmx_guest_inject_ac(vcpu))
5873                        return true;
5874                return false;
5875        case EXIT_REASON_EXTERNAL_INTERRUPT:
5876                return true;
5877        case EXIT_REASON_MCE_DURING_VMENTRY:
5878                return true;
5879        case EXIT_REASON_EPT_VIOLATION:
5880                /*
5881                 * L0 always deals with the EPT violation. If nested EPT is
5882                 * used, and the nested mmu code discovers that the address is
5883                 * missing in the guest EPT table (EPT12), the EPT violation
5884                 * will be injected with nested_ept_inject_page_fault()
5885                 */
5886                return true;
5887        case EXIT_REASON_EPT_MISCONFIG:
5888                /*
5889                 * L2 never uses directly L1's EPT, but rather L0's own EPT
5890                 * table (shadow on EPT) or a merged EPT table that L0 built
5891                 * (EPT on EPT). So any problems with the structure of the
5892                 * table is L0's fault.
5893                 */
5894                return true;
5895        case EXIT_REASON_PREEMPTION_TIMER:
5896                return true;
5897        case EXIT_REASON_PML_FULL:
5898                /*
5899                 * PML is emulated for an L1 VMM and should never be enabled in
5900                 * vmcs02, always "handle" PML_FULL by exiting to userspace.
5901                 */
5902                return true;
5903        case EXIT_REASON_VMFUNC:
5904                /* VM functions are emulated through L2->L0 vmexits. */
5905                return true;
5906        case EXIT_REASON_BUS_LOCK:
5907                /*
5908                 * At present, bus lock VM exit is never exposed to L1.
5909                 * Handle L2's bus locks in L0 directly.
5910                 */
5911                return true;
5912        default:
5913                break;
5914        }
5915        return false;
5916}
5917
5918/*
5919 * Return true if L1 wants to intercept an exit from L2.  Only call this when in
5920 * is_guest_mode (L2).
5921 */
5922static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu,
5923                                     union vmx_exit_reason exit_reason)
5924{
5925        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5926        u32 intr_info;
5927
5928        switch ((u16)exit_reason.basic) {
5929        case EXIT_REASON_EXCEPTION_NMI:
5930                intr_info = vmx_get_intr_info(vcpu);
5931                if (is_nmi(intr_info))
5932                        return true;
5933                else if (is_page_fault(intr_info))
5934                        return true;
5935                return vmcs12->exception_bitmap &
5936                                (1u << (intr_info & INTR_INFO_VECTOR_MASK));
5937        case EXIT_REASON_EXTERNAL_INTERRUPT:
5938                return nested_exit_on_intr(vcpu);
5939        case EXIT_REASON_TRIPLE_FAULT:
5940                return true;
5941        case EXIT_REASON_INTERRUPT_WINDOW:
5942                return nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING);
5943        case EXIT_REASON_NMI_WINDOW:
5944                return nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING);
5945        case EXIT_REASON_TASK_SWITCH:
5946                return true;
5947        case EXIT_REASON_CPUID:
5948                return true;
5949        case EXIT_REASON_HLT:
5950                return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
5951        case EXIT_REASON_INVD:
5952                return true;
5953        case EXIT_REASON_INVLPG:
5954                return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
5955        case EXIT_REASON_RDPMC:
5956                return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
5957        case EXIT_REASON_RDRAND:
5958                return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING);
5959        case EXIT_REASON_RDSEED:
5960                return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING);
5961        case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP:
5962                return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
5963        case EXIT_REASON_VMREAD:
5964                return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
5965                        vmcs12->vmread_bitmap);
5966        case EXIT_REASON_VMWRITE:
5967                return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
5968                        vmcs12->vmwrite_bitmap);
5969        case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
5970        case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
5971        case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME:
5972        case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
5973        case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID:
5974                /*
5975                 * VMX instructions trap unconditionally. This allows L1 to
5976                 * emulate them for its L2 guest, i.e., allows 3-level nesting!
5977                 */
5978                return true;
5979        case EXIT_REASON_CR_ACCESS:
5980                return nested_vmx_exit_handled_cr(vcpu, vmcs12);
5981        case EXIT_REASON_DR_ACCESS:
5982                return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
5983        case EXIT_REASON_IO_INSTRUCTION:
5984                return nested_vmx_exit_handled_io(vcpu, vmcs12);
5985        case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR:
5986                return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC);
5987        case EXIT_REASON_MSR_READ:
5988        case EXIT_REASON_MSR_WRITE:
5989                return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
5990        case EXIT_REASON_INVALID_STATE:
5991                return true;
5992        case EXIT_REASON_MWAIT_INSTRUCTION:
5993                return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
5994        case EXIT_REASON_MONITOR_TRAP_FLAG:
5995                return nested_vmx_exit_handled_mtf(vmcs12);
5996        case EXIT_REASON_MONITOR_INSTRUCTION:
5997                return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING);
5998        case EXIT_REASON_PAUSE_INSTRUCTION:
5999                return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) ||
6000                        nested_cpu_has2(vmcs12,
6001                                SECONDARY_EXEC_PAUSE_LOOP_EXITING);
6002        case EXIT_REASON_MCE_DURING_VMENTRY:
6003                return true;
6004        case EXIT_REASON_TPR_BELOW_THRESHOLD:
6005                return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW);
6006        case EXIT_REASON_APIC_ACCESS:
6007        case EXIT_REASON_APIC_WRITE:
6008        case EXIT_REASON_EOI_INDUCED:
6009                /*
6010                 * The controls for "virtualize APIC accesses," "APIC-
6011                 * register virtualization," and "virtual-interrupt
6012                 * delivery" only come from vmcs12.
6013                 */
6014                return true;
6015        case EXIT_REASON_INVPCID:
6016                return
6017                        nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) &&
6018                        nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
6019        case EXIT_REASON_WBINVD:
6020                return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
6021        case EXIT_REASON_XSETBV:
6022                return true;
6023        case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS:
6024                /*
6025                 * This should never happen, since it is not possible to
6026         * set XSS to a non-zero value, neither in L1 nor in L2.
6027         * If it were, XSS would have to be checked against
6028                 * the XSS exit bitmap in vmcs12.
6029                 */
6030                return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
6031        case EXIT_REASON_UMWAIT:
6032        case EXIT_REASON_TPAUSE:
6033                return nested_cpu_has2(vmcs12,
6034                        SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE);
6035        case EXIT_REASON_ENCLS:
6036                return nested_vmx_exit_handled_encls(vcpu, vmcs12);
6037        default:
6038                return true;
6039        }
6040}
6041
6042/*
6043 * Conditionally reflect a VM-Exit into L1.  Returns %true if the VM-Exit was
6044 * reflected into L1.
6045 */
6046bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu)
6047{
6048        struct vcpu_vmx *vmx = to_vmx(vcpu);
6049        union vmx_exit_reason exit_reason = vmx->exit_reason;
6050        unsigned long exit_qual;
6051        u32 exit_intr_info;
6052
6053        WARN_ON_ONCE(vmx->nested.nested_run_pending);
6054
6055        /*
6056         * Late nested VM-Fail shares the same flow as nested VM-Exit since KVM
6057         * has already loaded L2's state.
6058         */
6059        if (unlikely(vmx->fail)) {
6060                trace_kvm_nested_vmenter_failed(
6061                        "hardware VM-instruction error: ",
6062                        vmcs_read32(VM_INSTRUCTION_ERROR));
6063                exit_intr_info = 0;
6064                exit_qual = 0;
6065                goto reflect_vmexit;
6066        }
6067
6068        trace_kvm_nested_vmexit(exit_reason.full, vcpu, KVM_ISA_VMX);
6069
6070        /* If L0 (KVM) wants the exit, it trumps L1's desires. */
6071        if (nested_vmx_l0_wants_exit(vcpu, exit_reason))
6072                return false;
6073
6074        /* If L1 doesn't want the exit, handle it in L0. */
6075        if (!nested_vmx_l1_wants_exit(vcpu, exit_reason))
6076                return false;
6077
6078        /*
6079         * vmcs.VM_EXIT_INTR_INFO is only valid for EXCEPTION_NMI exits.  For
6080         * EXTERNAL_INTERRUPT, the value for vmcs12->vm_exit_intr_info would
6081         * need to be synthesized by querying the in-kernel LAPIC, but external
6082         * interrupts are never reflected to L1 so it's a non-issue.
6083         */
6084        exit_intr_info = vmx_get_intr_info(vcpu);
6085        if (is_exception_with_error_code(exit_intr_info)) {
6086                struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
6087
6088                vmcs12->vm_exit_intr_error_code =
6089                        vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
6090        }
6091        exit_qual = vmx_get_exit_qual(vcpu);
6092
6093reflect_vmexit:
6094        nested_vmx_vmexit(vcpu, exit_reason.full, exit_intr_info, exit_qual);
6095        return true;
6096}
6097
6098static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
6099                                struct kvm_nested_state __user *user_kvm_nested_state,
6100                                u32 user_data_size)
6101{
6102        struct vcpu_vmx *vmx;
6103        struct vmcs12 *vmcs12;
6104        struct kvm_nested_state kvm_state = {
6105                .flags = 0,
6106                .format = KVM_STATE_NESTED_FORMAT_VMX,
6107                .size = sizeof(kvm_state),
6108                .hdr.vmx.flags = 0,
6109                .hdr.vmx.vmxon_pa = -1ull,
6110                .hdr.vmx.vmcs12_pa = -1ull,
6111                .hdr.vmx.preemption_timer_deadline = 0,
6112        };
6113        struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
6114                &user_kvm_nested_state->data.vmx[0];
6115
6116        if (!vcpu)
6117                return kvm_state.size + sizeof(*user_vmx_nested_state);
6118
6119        vmx = to_vmx(vcpu);
6120        vmcs12 = get_vmcs12(vcpu);
6121
6122        if (nested_vmx_allowed(vcpu) &&
6123            (vmx->nested.vmxon || vmx->nested.smm.vmxon)) {
6124                kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr;
6125                kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr;
6126
6127                if (vmx_has_valid_vmcs12(vcpu)) {
6128                        kvm_state.size += sizeof(user_vmx_nested_state->vmcs12);
6129
6130                        /* 'hv_evmcs_vmptr' can also be EVMPTR_MAP_PENDING here */
6131                        if (vmx->nested.hv_evmcs_vmptr != EVMPTR_INVALID)
6132                                kvm_state.flags |= KVM_STATE_NESTED_EVMCS;
6133
6134                        if (is_guest_mode(vcpu) &&
6135                            nested_cpu_has_shadow_vmcs(vmcs12) &&
6136                            vmcs12->vmcs_link_pointer != -1ull)
6137                                kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12);
6138                }
6139
6140                if (vmx->nested.smm.vmxon)
6141                        kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON;
6142
6143                if (vmx->nested.smm.guest_mode)
6144                        kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE;
6145
6146                if (is_guest_mode(vcpu)) {
6147                        kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;
6148
6149                        if (vmx->nested.nested_run_pending)
6150                                kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;
6151
6152                        if (vmx->nested.mtf_pending)
6153                                kvm_state.flags |= KVM_STATE_NESTED_MTF_PENDING;
6154
6155                        if (nested_cpu_has_preemption_timer(vmcs12) &&
6156                            vmx->nested.has_preemption_timer_deadline) {
6157                                kvm_state.hdr.vmx.flags |=
6158                                        KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE;
6159                                kvm_state.hdr.vmx.preemption_timer_deadline =
6160                                        vmx->nested.preemption_timer_deadline;
6161                        }
6162                }
6163        }
6164
6165        if (user_data_size < kvm_state.size)
6166                goto out;
6167
6168        if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state)))
6169                return -EFAULT;
6170
6171        if (!vmx_has_valid_vmcs12(vcpu))
6172                goto out;
6173
6174        /*
6175         * When running L2, the authoritative vmcs12 state is in the
6176         * vmcs02. When running L1, the authoritative vmcs12 state is
6177         * in the shadow or enlightened vmcs linked to vmcs01, unless
6178         * need_vmcs12_to_shadow_sync is set, in which case, the authoritative
6179         * vmcs12 state is in the vmcs12 already.
6180         */
6181        if (is_guest_mode(vcpu)) {
6182                sync_vmcs02_to_vmcs12(vcpu, vmcs12);
6183                sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
6184        } else {
6185                copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
6186                if (!vmx->nested.need_vmcs12_to_shadow_sync) {
6187                        if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
6188                                /*
6189                                 * The L1 hypervisor is not obliged to keep the
6190                                 * eVMCS clean-fields data up-to-date while not in
6191                                 * guest mode; 'hv_clean_fields' is only guaranteed
6192                                 * to be valid at VM-entry, so ignore it here and
6193                                 * do a full copy.
6194                                 */
6195                                copy_enlightened_to_vmcs12(vmx, 0);
6196                        else if (enable_shadow_vmcs)
6197                                copy_shadow_to_vmcs12(vmx);
6198                }
6199        }
6200
6201        BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE);
6202        BUILD_BUG_ON(sizeof(user_vmx_nested_state->shadow_vmcs12) < VMCS12_SIZE);
6203
6204        /*
6205         * Copy over the full allocated size of vmcs12 rather than just the size
6206         * of the struct.
6207         */
6208        if (copy_to_user(user_vmx_nested_state->vmcs12, vmcs12, VMCS12_SIZE))
6209                return -EFAULT;
6210
6211        if (nested_cpu_has_shadow_vmcs(vmcs12) &&
6212            vmcs12->vmcs_link_pointer != -1ull) {
6213                if (copy_to_user(user_vmx_nested_state->shadow_vmcs12,
6214                                 get_shadow_vmcs12(vcpu), VMCS12_SIZE))
6215                        return -EFAULT;
6216        }
6217out:
6218        return kvm_state.size;
6219}
6220
6221/*
6222 * Forcibly leave nested mode in order to be able to reset the VCPU later on.
6223 */
6224void vmx_leave_nested(struct kvm_vcpu *vcpu)
6225{
6226        if (is_guest_mode(vcpu)) {
6227                to_vmx(vcpu)->nested.nested_run_pending = 0;
6228                nested_vmx_vmexit(vcpu, -1, 0, 0);
6229        }
6230        free_nested(vcpu);
6231}
6232
6233static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
6234                                struct kvm_nested_state __user *user_kvm_nested_state,
6235                                struct kvm_nested_state *kvm_state)
6236{
6237        struct vcpu_vmx *vmx = to_vmx(vcpu);
6238        struct vmcs12 *vmcs12;
6239        enum vm_entry_failure_code ignored;
6240        struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
6241                &user_kvm_nested_state->data.vmx[0];
6242        int ret;
6243
6244        if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX)
6245                return -EINVAL;
6246
6247        if (kvm_state->hdr.vmx.vmxon_pa == -1ull) {
6248                if (kvm_state->hdr.vmx.smm.flags)
6249                        return -EINVAL;
6250
6251                if (kvm_state->hdr.vmx.vmcs12_pa != -1ull)
6252                        return -EINVAL;
6253
6254                /*
6255                 * KVM_STATE_NESTED_EVMCS used to signal that KVM should
6256                 * enable eVMCS capability on vCPU. However, since then
6257                 * code was changed such that the flag signals vmcs12 should
6258                 * be copied into eVMCS in guest memory.
6259                 *
6260                 * To preserve backwards compatibility, allow userspace
6261                 * to set this flag even when there is no VMXON region.
6262                 */
6263                if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS)
6264                        return -EINVAL;
6265        } else {
6266                if (!nested_vmx_allowed(vcpu))
6267                        return -EINVAL;
6268
6269                if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa))
6270                        return -EINVAL;
6271        }
6272
6273        if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
6274            (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
6275                return -EINVAL;
6276
6277        if (kvm_state->hdr.vmx.smm.flags &
6278            ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON))
6279                return -EINVAL;
6280
6281        if (kvm_state->hdr.vmx.flags & ~KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE)
6282                return -EINVAL;
6283
6284        /*
6285         * SMM temporarily disables VMX, so we cannot be in guest mode,
6286         * nor can VMLAUNCH/VMRESUME be pending.  Outside SMM, SMM flags
6287         * must be zero.
6288         */
6289        if (is_smm(vcpu) ?
6290                (kvm_state->flags &
6291                 (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING))
6292                : kvm_state->hdr.vmx.smm.flags)
6293                return -EINVAL;
6294
6295        if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
6296            !(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON))
6297                return -EINVAL;
6298
6299        if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) &&
6300                (!nested_vmx_allowed(vcpu) || !vmx->nested.enlightened_vmcs_enabled))
6301                        return -EINVAL;
6302
6303        vmx_leave_nested(vcpu);
6304
6305        if (kvm_state->hdr.vmx.vmxon_pa == -1ull)
6306                return 0;
6307
6308        vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa;
6309        ret = enter_vmx_operation(vcpu);
6310        if (ret)
6311                return ret;
6312
6313        /* Empty 'VMXON' state is permitted if no VMCS loaded */
6314        if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) {
6315                /* See vmx_has_valid_vmcs12.  */
6316                if ((kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE) ||
6317                    (kvm_state->flags & KVM_STATE_NESTED_EVMCS) ||
6318                    (kvm_state->hdr.vmx.vmcs12_pa != -1ull))
6319                        return -EINVAL;
6320                else
6321                        return 0;
6322        }
6323
6324        if (kvm_state->hdr.vmx.vmcs12_pa != -1ull) {
6325                if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa ||
6326                    !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa))
6327                        return -EINVAL;
6328
6329                set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa);
6330        } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) {
6331                /*
6332                 * nested_vmx_handle_enlightened_vmptrld() cannot be called
6333                 * directly from here as HV_X64_MSR_VP_ASSIST_PAGE may not be
6334                 * restored yet. EVMCS will be mapped from
6335                 * nested_get_vmcs12_pages().
6336                 */
6337                vmx->nested.hv_evmcs_vmptr = EVMPTR_MAP_PENDING;
6338                kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
6339        } else {
6340                return -EINVAL;
6341        }
6342
6343        if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) {
6344                vmx->nested.smm.vmxon = true;
6345                vmx->nested.vmxon = false;
6346
6347                if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE)
6348                        vmx->nested.smm.guest_mode = true;
6349        }
6350
6351        vmcs12 = get_vmcs12(vcpu);
6352        if (copy_from_user(vmcs12, user_vmx_nested_state->vmcs12, sizeof(*vmcs12)))
6353                return -EFAULT;
6354
6355        if (vmcs12->hdr.revision_id != VMCS12_REVISION)
6356                return -EINVAL;
6357
6358        if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
6359                return 0;
6360
6361        vmx->nested.nested_run_pending =
6362                !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
6363
6364        vmx->nested.mtf_pending =
6365                !!(kvm_state->flags & KVM_STATE_NESTED_MTF_PENDING);
6366
6367        ret = -EINVAL;
6368        if (nested_cpu_has_shadow_vmcs(vmcs12) &&
6369            vmcs12->vmcs_link_pointer != -1ull) {
6370                struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu);
6371
6372                if (kvm_state->size <
6373                    sizeof(*kvm_state) +
6374                    sizeof(user_vmx_nested_state->vmcs12) + sizeof(*shadow_vmcs12))
6375                        goto error_guest_mode;
6376
6377                if (copy_from_user(shadow_vmcs12,
6378                                   user_vmx_nested_state->shadow_vmcs12,
6379                                   sizeof(*shadow_vmcs12))) {
6380                        ret = -EFAULT;
6381                        goto error_guest_mode;
6382                }
6383
6384                if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION ||
6385                    !shadow_vmcs12->hdr.shadow_vmcs)
6386                        goto error_guest_mode;
6387        }
6388
6389        vmx->nested.has_preemption_timer_deadline = false;
6390        if (kvm_state->hdr.vmx.flags & KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) {
6391                vmx->nested.has_preemption_timer_deadline = true;
6392                vmx->nested.preemption_timer_deadline =
6393                        kvm_state->hdr.vmx.preemption_timer_deadline;
6394        }
6395
6396        if (nested_vmx_check_controls(vcpu, vmcs12) ||
6397            nested_vmx_check_host_state(vcpu, vmcs12) ||
6398            nested_vmx_check_guest_state(vcpu, vmcs12, &ignored))
6399                goto error_guest_mode;
6400
6401        vmx->nested.dirty_vmcs12 = true;
6402        ret = nested_vmx_enter_non_root_mode(vcpu, false);
6403        if (ret)
6404                goto error_guest_mode;
6405
6406        return 0;
6407
6408error_guest_mode:
6409        vmx->nested.nested_run_pending = 0;
6410        return ret;
6411}
6412
6413void nested_vmx_set_vmcs_shadowing_bitmap(void)
6414{
6415        if (enable_shadow_vmcs) {
6416                vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
6417                vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
6418        }
6419}
6420
6421/*
6422 * Indexing into the vmcs12 uses the VMCS encoding rotated left by 6.  Undo
6423 * that madness to get the encoding for comparison.
6424 */
6425#define VMCS12_IDX_TO_ENC(idx) ((u16)(((u16)(idx) >> 6) | ((u16)(idx) << 10)))
6426
6427static u64 nested_vmx_calc_vmcs_enum_msr(void)
6428{
6429        /*
6430         * Note these are the so called "index" of the VMCS field encoding, not
6431         * the index into vmcs12.
6432         */
6433        unsigned int max_idx, idx;
6434        int i;
6435
6436        /*
6437         * For better or worse, KVM allows VMREAD/VMWRITE to all fields in
6438         * vmcs12, regardless of whether or not the associated feature is
6439         * exposed to L1.  Simply find the field with the highest index.
6440         */
6441        max_idx = 0;
6442        for (i = 0; i < nr_vmcs12_fields; i++) {
6443                /* The vmcs12 table is very, very sparsely populated. */
6444                if (!vmcs_field_to_offset_table[i])
6445                        continue;
6446
6447                idx = vmcs_field_index(VMCS12_IDX_TO_ENC(i));
6448                if (idx > max_idx)
6449                        max_idx = idx;
6450        }
6451
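            /* IA32_VMX_VMCS_ENUM reports the highest field index in bits 9:1. */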
6452        return (u64)max_idx << VMCS_FIELD_INDEX_SHIFT;
6453}
6454
6455/*
6456 * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
6457 * returned for the various VMX controls MSRs when nested VMX is enabled.
6458 * The same values should also be used to verify that vmcs12 control fields are
6459 * valid during nested entry from L1 to L2.
6460 * Each of these control msrs has a low and high 32-bit half: A low bit is on
6461 * if the corresponding bit in the (32-bit) control field *must* be on, and a
6462 * bit in the high half is on if the corresponding bit in the control field
6463 * may be on. See also vmx_control_verify().
6464 */
6465void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps)
6466{
6467        /*
6468         * Note that as a general rule, the high half of the MSRs (bits in
6469         * the control fields which may be 1) should be initialized by the
6470         * intersection of the underlying hardware's MSR (i.e., features which
6471         * can be supported) and the list of features we want to expose -
6472         * because they are known to be properly supported in our code.
6473         * Also, usually, the low half of the MSRs (bits which must be 1) can
6474         * be set to 0, meaning that L1 may turn off any of these bits. The
6475         * reason is that if one of these bits is necessary, it will appear
6476         * in vmcs01, and prepare_vmcs02(), which bitwise-ORs the control
6477         * fields of vmcs01 and vmcs12, will keep it set in vmcs02 - and
6478         * nested_vmx_l1_wants_exit() will not pass the related exits to L1.
6479         * These rules have exceptions below.
6480         */
6481
6482        /* pin-based controls */
6483        rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
6484                msrs->pinbased_ctls_low,
6485                msrs->pinbased_ctls_high);
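            /*
             * The *_ALWAYSON_WITHOUT_TRUE_MSR constants are the "default1"
             * control bits, i.e. the bits that read as must-be-1 when the
             * TRUE capability MSRs are not consulted; they form the baseline
             * of the low halves reported to L1 (a few are relaxed below).
             */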
6486        msrs->pinbased_ctls_low |=
6487                PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
6488        msrs->pinbased_ctls_high &=
6489                PIN_BASED_EXT_INTR_MASK |
6490                PIN_BASED_NMI_EXITING |
6491                PIN_BASED_VIRTUAL_NMIS |
6492                (enable_apicv ? PIN_BASED_POSTED_INTR : 0);
6493        msrs->pinbased_ctls_high |=
6494                PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
6495                PIN_BASED_VMX_PREEMPTION_TIMER;
6496
6497        /* exit controls */
6498        rdmsr(MSR_IA32_VMX_EXIT_CTLS,
6499                msrs->exit_ctls_low,
6500                msrs->exit_ctls_high);
6501        msrs->exit_ctls_low =
6502                VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
6503
6504        msrs->exit_ctls_high &=
6505#ifdef CONFIG_X86_64
6506                VM_EXIT_HOST_ADDR_SPACE_SIZE |
6507#endif
6508                VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
6509                VM_EXIT_CLEAR_BNDCFGS | VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
6510        msrs->exit_ctls_high |=
6511                VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
6512                VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
6513                VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
6514
6515        /* We support free control of debug control saving. */
6516        msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;
6517
6518        /* entry controls */
6519        rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
6520                msrs->entry_ctls_low,
6521                msrs->entry_ctls_high);
6522        msrs->entry_ctls_low =
6523                VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
6524        msrs->entry_ctls_high &=
6525#ifdef CONFIG_X86_64
6526                VM_ENTRY_IA32E_MODE |
6527#endif
6528                VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS |
6529                VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
6530        msrs->entry_ctls_high |=
6531                (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
6532
6533        /* We support free control of debug control loading. */
6534        msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
6535
6536        /* cpu-based controls */
6537        rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
6538                msrs->procbased_ctls_low,
6539                msrs->procbased_ctls_high);
6540        msrs->procbased_ctls_low =
6541                CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
6542        msrs->procbased_ctls_high &=
6543                CPU_BASED_INTR_WINDOW_EXITING |
6544                CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETTING |
6545                CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
6546                CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
6547                CPU_BASED_CR3_STORE_EXITING |
6548#ifdef CONFIG_X86_64
6549                CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
6550#endif
6551                CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
6552                CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG |
6553                CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING |
6554                CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING |
6555                CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
6556        /*
6557         * We can allow some features even when not supported by the
6558         * hardware. For example, L1 can specify an MSR bitmap - and we
6559         * can use it to avoid exits to L1 - even when L0 runs L2
6560         * without MSR bitmaps.
6561         */
6562        msrs->procbased_ctls_high |=
6563                CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
6564                CPU_BASED_USE_MSR_BITMAPS;
6565
6566        /* We support free control of CR3 access interception. */
6567        msrs->procbased_ctls_low &=
6568                ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
6569
6570        /*
6571         * secondary cpu-based controls.  Do not include those that
6572         * depend on CPUID bits, they are added later by
6573         * vmx_vcpu_after_set_cpuid.
6574         */
6575        if (msrs->procbased_ctls_high & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)
6576                rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
6577                      msrs->secondary_ctls_low,
6578                      msrs->secondary_ctls_high);
6579
6580        msrs->secondary_ctls_low = 0;
6581        msrs->secondary_ctls_high &=
6582                SECONDARY_EXEC_DESC |
6583                SECONDARY_EXEC_ENABLE_RDTSCP |
6584                SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
6585                SECONDARY_EXEC_WBINVD_EXITING |
6586                SECONDARY_EXEC_APIC_REGISTER_VIRT |
6587                SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
6588                SECONDARY_EXEC_RDRAND_EXITING |
6589                SECONDARY_EXEC_ENABLE_INVPCID |
6590                SECONDARY_EXEC_RDSEED_EXITING |
6591                SECONDARY_EXEC_XSAVES |
6592                SECONDARY_EXEC_TSC_SCALING;
6593
6594        /*
6595         * We can emulate "VMCS shadowing," even if the hardware
6596         * doesn't support it.
6597         */
6598        msrs->secondary_ctls_high |=
6599                SECONDARY_EXEC_SHADOW_VMCS;
6600
6601        if (enable_ept) {
6602                /* nested EPT: emulate EPT also to L1 */
6603                msrs->secondary_ctls_high |=
6604                        SECONDARY_EXEC_ENABLE_EPT;
6605                msrs->ept_caps =
6606                        VMX_EPT_PAGE_WALK_4_BIT |
6607                        VMX_EPT_PAGE_WALK_5_BIT |
6608                        VMX_EPTP_WB_BIT |
6609                        VMX_EPT_INVEPT_BIT |
6610                        VMX_EPT_EXECUTE_ONLY_BIT;
6611
6612                msrs->ept_caps &= ept_caps;
6613                msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
6614                        VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
6615                        VMX_EPT_1GB_PAGE_BIT;
6616                if (enable_ept_ad_bits) {
6617                        msrs->secondary_ctls_high |=
6618                                SECONDARY_EXEC_ENABLE_PML;
6619                        msrs->ept_caps |= VMX_EPT_AD_BIT;
6620                }
6621        }
6622
6623        if (cpu_has_vmx_vmfunc()) {
6624                msrs->secondary_ctls_high |=
6625                        SECONDARY_EXEC_ENABLE_VMFUNC;
6626                /*
6627                 * Advertise EPTP switching unconditionally
6628                 * since we emulate it
6629                 */
6630                if (enable_ept)
6631                        msrs->vmfunc_controls =
6632                                VMX_VMFUNC_EPTP_SWITCHING;
6633        }
6634
6635        /*
6636         * Old versions of KVM use the single-context version without
6637         * checking for support, so declare that it is supported even
6638         * though it is treated as global context.  The alternative is
6639         * not failing the single-context invvpid, and it is worse.
6640         */
6641        if (enable_vpid) {
6642                msrs->secondary_ctls_high |=
6643                        SECONDARY_EXEC_ENABLE_VPID;
6644                msrs->vpid_caps = VMX_VPID_INVVPID_BIT |
6645                        VMX_VPID_EXTENT_SUPPORTED_MASK;
6646        }
6647
6648        if (enable_unrestricted_guest)
6649                msrs->secondary_ctls_high |=
6650                        SECONDARY_EXEC_UNRESTRICTED_GUEST;
6651
6652        if (flexpriority_enabled)
6653                msrs->secondary_ctls_high |=
6654                        SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
6655
6656        if (enable_sgx)
6657                msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING;
6658
6659        /* miscellaneous data */
6660        rdmsr(MSR_IA32_VMX_MISC,
6661                msrs->misc_low,
6662                msrs->misc_high);
6663        msrs->misc_low &= VMX_MISC_SAVE_EFER_LMA;
6664        msrs->misc_low |=
6665                MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
6666                VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
6667                VMX_MISC_ACTIVITY_HLT |
6668                VMX_MISC_ACTIVITY_WAIT_SIPI;
6669        msrs->misc_high = 0;
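            /*
             * Roughly, what is advertised to L1 here: an emulated preemption
             * timer that ticks once per 2^5 = 32 TSC cycles (bits 4:0),
             * EFER.LMA reporting on VM-exit if the host has it, the HLT and
             * wait-for-SIPI activity states, and VMWRITE access to the
             * read-only VM-exit information fields.
             */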
6670
6671        /*
6672         * This MSR reports some information about VMX support. We
6673         * should return information about the VMX we emulate for the
6674         * guest, and the VMCS structure we give it - not about the
6675         * VMX support of the underlying hardware.
6676         */
6677        msrs->basic =
6678                VMCS12_REVISION |
6679                VMX_BASIC_TRUE_CTLS |
6680                ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
6681                (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
6682
6683        if (cpu_has_vmx_basic_inout())
6684                msrs->basic |= VMX_BASIC_INOUT;
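            /*
             * Per the SDM, bits 30:0 of IA32_VMX_BASIC hold the VMCS revision
             * identifier (VMCS12_REVISION here), bits 44:32 the VMCS region
             * size (VMCS12_SIZE), bits 53:50 the memory type (6 = write-back),
             * bit 54 the INS/OUTS exit-information capability and bit 55 the
             * availability of the TRUE control MSRs.
             */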
6685
6686        /*
6687         * These MSRs specify bits which the guest must keep fixed on
6688         * while L1 is in VMXON mode (in L1's root mode, or running an L2).
6689         * We picked the standard core2 setting.
6690         */
6691#define VMXON_CR0_ALWAYSON     (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
6692#define VMXON_CR4_ALWAYSON     X86_CR4_VMXE
6693        msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON;
6694        msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON;
6695
6696        /* These MSRs specify bits which the guest must keep fixed off. */
6697        rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1);
6698        rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1);
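            /*
             * As with the control MSRs, a CR0/CR4 value is legal under VMX
             * operation iff every bit set in FIXED0 is set and every bit
             * clear in FIXED1 is clear, i.e.
             * (cr & fixed0) == fixed0 && (cr & ~fixed1) == 0.
             */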
6699
6700        msrs->vmcs_enum = nested_vmx_calc_vmcs_enum_msr();
6701}
6702
6703void nested_vmx_hardware_unsetup(void)
6704{
6705        int i;
6706
6707        if (enable_shadow_vmcs) {
6708                for (i = 0; i < VMX_BITMAP_NR; i++)
6709                        free_page((unsigned long)vmx_bitmap[i]);
6710        }
6711}
6712
6713__init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
6714{
6715        int i;
6716
6717        if (!cpu_has_vmx_shadow_vmcs())
6718                enable_shadow_vmcs = 0;
6719        if (enable_shadow_vmcs) {
6720                for (i = 0; i < VMX_BITMAP_NR; i++) {
6721                        /*
6722                         * The vmx_bitmap is not tied to a VM and so should
6723                         * not be charged to a memcg.
6724                         */
6725                        vmx_bitmap[i] = (unsigned long *)
6726                                __get_free_page(GFP_KERNEL);
6727                        if (!vmx_bitmap[i]) {
6728                                nested_vmx_hardware_unsetup();
6729                                return -ENOMEM;
6730                        }
6731                }
6732
6733                init_vmcs_shadow_fields();
6734        }
6735
6736        exit_handlers[EXIT_REASON_VMCLEAR]      = handle_vmclear;
6737        exit_handlers[EXIT_REASON_VMLAUNCH]     = handle_vmlaunch;
6738        exit_handlers[EXIT_REASON_VMPTRLD]      = handle_vmptrld;
6739        exit_handlers[EXIT_REASON_VMPTRST]      = handle_vmptrst;
6740        exit_handlers[EXIT_REASON_VMREAD]       = handle_vmread;
6741        exit_handlers[EXIT_REASON_VMRESUME]     = handle_vmresume;
6742        exit_handlers[EXIT_REASON_VMWRITE]      = handle_vmwrite;
6743        exit_handlers[EXIT_REASON_VMOFF]        = handle_vmoff;
6744        exit_handlers[EXIT_REASON_VMON]         = handle_vmon;
6745        exit_handlers[EXIT_REASON_INVEPT]       = handle_invept;
6746        exit_handlers[EXIT_REASON_INVVPID]      = handle_invvpid;
6747        exit_handlers[EXIT_REASON_VMFUNC]       = handle_vmfunc;
6748
6749        return 0;
6750}
6751
6752struct kvm_x86_nested_ops vmx_nested_ops = {
6753        .check_events = vmx_check_nested_events,
6754        .hv_timer_pending = nested_vmx_preemption_timer_pending,
6755        .triple_fault = nested_vmx_triple_fault,
6756        .get_state = vmx_get_nested_state,
6757        .set_state = vmx_set_nested_state,
6758        .get_nested_state_pages = vmx_get_nested_state_pages,
6759        .write_log_dirty = nested_vmx_write_pml_buffer,
6760        .enable_evmcs = nested_enable_evmcs,
6761        .get_evmcs_version = nested_get_evmcs_version,
6762};
6763