linux/arch/x86/kvm/vmx/nested.c
   1// SPDX-License-Identifier: GPL-2.0
   2
   3#include <linux/frame.h>
   4#include <linux/percpu.h>
   5
   6#include <asm/debugreg.h>
   7#include <asm/mmu_context.h>
   8
   9#include "cpuid.h"
  10#include "hyperv.h"
  11#include "mmu.h"
  12#include "nested.h"
  13#include "pmu.h"
  14#include "trace.h"
  15#include "x86.h"
  16
  17static bool __read_mostly enable_shadow_vmcs = 1;
  18module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
  19
  20static bool __read_mostly nested_early_check = 0;
  21module_param(nested_early_check, bool, S_IRUGO);
  22
  23#define CC(consistency_check)                                           \
  24({                                                                      \
  25        bool failed = (consistency_check);                              \
  26        if (failed)                                                     \
  27                trace_kvm_nested_vmenter_failed(#consistency_check, 0); \
  28        failed;                                                         \
  29})
  30
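/*
 * Usage sketch (illustrative; mirrors the consistency-check helpers further
 * down in this file): each VM-Entry check wraps its predicate in CC() so a
 * failure is traced with the stringified expression, e.g.
 *
 *	if (CC(!page_address_valid(vcpu, vmcs12->io_bitmap_a)))
 *		return -EINVAL;
 *
 * CC() evaluates to the result of the check, so multiple checks can be
 * combined with || inside a single if ().
 */
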
  31/*
  32 * Hyper-V requires all of these, so mark them as supported even though
  33 * they are just treated the same as all-context.
  34 */
  35#define VMX_VPID_EXTENT_SUPPORTED_MASK          \
  36        (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT |  \
  37        VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT |    \
  38        VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT |    \
  39        VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT)
  40
  41#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5
  42
  43enum {
  44        VMX_VMREAD_BITMAP,
  45        VMX_VMWRITE_BITMAP,
  46        VMX_BITMAP_NR
  47};
  48static unsigned long *vmx_bitmap[VMX_BITMAP_NR];
  49
  50#define vmx_vmread_bitmap                    (vmx_bitmap[VMX_VMREAD_BITMAP])
  51#define vmx_vmwrite_bitmap                   (vmx_bitmap[VMX_VMWRITE_BITMAP])
  52
  53struct shadow_vmcs_field {
  54        u16     encoding;
  55        u16     offset;
  56};
  57static struct shadow_vmcs_field shadow_read_only_fields[] = {
  58#define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) },
  59#include "vmcs_shadow_fields.h"
  60};
  61static int max_shadow_read_only_fields =
  62        ARRAY_SIZE(shadow_read_only_fields);
  63
  64static struct shadow_vmcs_field shadow_read_write_fields[] = {
  65#define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) },
  66#include "vmcs_shadow_fields.h"
  67};
  68static int max_shadow_read_write_fields =
  69        ARRAY_SIZE(shadow_read_write_fields);
  70
  71static void init_vmcs_shadow_fields(void)
  72{
  73        int i, j;
  74
  75        memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
  76        memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
  77
  78        for (i = j = 0; i < max_shadow_read_only_fields; i++) {
  79                struct shadow_vmcs_field entry = shadow_read_only_fields[i];
  80                u16 field = entry.encoding;
  81
  82                if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
  83                    (i + 1 == max_shadow_read_only_fields ||
  84                     shadow_read_only_fields[i + 1].encoding != field + 1))
  85                        pr_err("Missing field from shadow_read_only_field %x\n",
  86                               field + 1);
  87
  88                clear_bit(field, vmx_vmread_bitmap);
  89                if (field & 1)
  90#ifdef CONFIG_X86_64
  91                        continue;
  92#else
  93                        entry.offset += sizeof(u32);
  94#endif
  95                shadow_read_only_fields[j++] = entry;
  96        }
  97        max_shadow_read_only_fields = j;
  98
  99        for (i = j = 0; i < max_shadow_read_write_fields; i++) {
 100                struct shadow_vmcs_field entry = shadow_read_write_fields[i];
 101                u16 field = entry.encoding;
 102
 103                if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
 104                    (i + 1 == max_shadow_read_write_fields ||
 105                     shadow_read_write_fields[i + 1].encoding != field + 1))
 106                        pr_err("Missing field from shadow_read_write_field %x\n",
 107                               field + 1);
 108
 109                WARN_ONCE(field >= GUEST_ES_AR_BYTES &&
 110                          field <= GUEST_TR_AR_BYTES,
 111                          "Update vmcs12_write_any() to drop reserved bits from AR_BYTES");
 112
 113                /*
 114                 * PML and the preemption timer can be emulated, but the
 115                 * processor cannot vmwrite to fields that don't exist
 116                 * on bare metal.
 117                 */
 118                switch (field) {
 119                case GUEST_PML_INDEX:
 120                        if (!cpu_has_vmx_pml())
 121                                continue;
 122                        break;
 123                case VMX_PREEMPTION_TIMER_VALUE:
 124                        if (!cpu_has_vmx_preemption_timer())
 125                                continue;
 126                        break;
 127                case GUEST_INTR_STATUS:
 128                        if (!cpu_has_vmx_apicv())
 129                                continue;
 130                        break;
 131                default:
 132                        break;
 133                }
 134
 135                clear_bit(field, vmx_vmwrite_bitmap);
 136                clear_bit(field, vmx_vmread_bitmap);
 137                if (field & 1)
 138#ifdef CONFIG_X86_64
 139                        continue;
 140#else
 141                        entry.offset += sizeof(u32);
 142#endif
 143                shadow_read_write_fields[j++] = entry;
 144        }
 145        max_shadow_read_write_fields = j;
 146}
 147
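/*
 * Illustrative note on the (field & 1) handling above, assuming the standard
 * VMCS encoding scheme: for 64-bit fields the odd encoding names the high 32
 * bits, e.g. GUEST_PHYSICAL_ADDRESS is 0x2400 and GUEST_PHYSICAL_ADDRESS_HIGH
 * is 0x2401.  On 64-bit hosts a full-width access covers both halves, so the
 * high-half entry is dropped; on 32-bit hosts it is kept and its vmcs12
 * offset is advanced by sizeof(u32) to reach the upper dword.
 */
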
 148/*
 149 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
 150 * set the success or error code of an emulated VMX instruction (as specified
 151 * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated
 152 * instruction.
 153 */
 154static int nested_vmx_succeed(struct kvm_vcpu *vcpu)
 155{
 156        vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
 157                        & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
 158                            X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
 159        return kvm_skip_emulated_instruction(vcpu);
 160}
 161
 162static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
 163{
 164        vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
 165                        & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
 166                            X86_EFLAGS_SF | X86_EFLAGS_OF))
 167                        | X86_EFLAGS_CF);
 168        return kvm_skip_emulated_instruction(vcpu);
 169}
 170
 171static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
 172                                u32 vm_instruction_error)
 173{
 174        struct vcpu_vmx *vmx = to_vmx(vcpu);
 175
 176        /*
 177         * failValid writes the error number to the current VMCS, which
 178         * can't be done if there isn't a current VMCS.
 179         */
 180        if (vmx->nested.current_vmptr == -1ull && !vmx->nested.hv_evmcs)
 181                return nested_vmx_failInvalid(vcpu);
 182
 183        vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
 184                        & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
 185                            X86_EFLAGS_SF | X86_EFLAGS_OF))
 186                        | X86_EFLAGS_ZF);
 187        get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
 188        /*
 189         * We don't need to force a shadow sync because
 190         * VM_INSTRUCTION_ERROR is not shadowed
 191         */
 192        return kvm_skip_emulated_instruction(vcpu);
 193}
 194
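/*
 * Illustrative summary of the conventions implemented above:
 *
 *	VMsucceed:     CF = PF = AF = ZF = SF = OF = 0
 *	VMfailInvalid: CF = 1, the other five flags cleared
 *	VMfailValid:   ZF = 1, the other five flags cleared, error number
 *	               written to the current VMCS's VM_INSTRUCTION_ERROR
 *
 * A typical caller (sketch; mirrors the real instruction handlers later in
 * this file):
 *
 *	if (!page_address_valid(vcpu, vmptr))
 *		return nested_vmx_failValid(vcpu,
 *					    VMXERR_VMPTRLD_INVALID_ADDRESS);
 *	return nested_vmx_succeed(vcpu);
 */
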
 195static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
 196{
  197        /* TODO: don't simply reset the guest here. */
 198        kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
 199        pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator);
 200}
 201
 202static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
 203{
 204        return fixed_bits_valid(control, low, high);
 205}
 206
 207static inline u64 vmx_control_msr(u32 low, u32 high)
 208{
 209        return low | ((u64)high << 32);
 210}
 211
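/*
 * Illustrative note on the layout the two helpers above assume: a VMX
 * capability MSR packs the "allowed 0-settings" in bits 31:0 (a 1 there
 * means the control bit must be 1) and the "allowed 1-settings" in bits
 * 63:32 (a 0 there means the control bit must be 0).  For example, with
 * low = 0x16 and high = 0xff, a control value of 0x17 verifies, whereas
 * 0x116 is rejected because bit 8 is not an allowed 1-setting.
 */
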
 212static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
 213{
 214        secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
 215        vmcs_write64(VMCS_LINK_POINTER, -1ull);
 216        vmx->nested.need_vmcs12_to_shadow_sync = false;
 217}
 218
 219static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
 220{
 221        struct vcpu_vmx *vmx = to_vmx(vcpu);
 222
 223        if (!vmx->nested.hv_evmcs)
 224                return;
 225
 226        kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true);
 227        vmx->nested.hv_evmcs_vmptr = 0;
 228        vmx->nested.hv_evmcs = NULL;
 229}
 230
 231/*
 232 * Free whatever needs to be freed from vmx->nested when L1 goes down, or
 233 * just stops using VMX.
 234 */
 235static void free_nested(struct kvm_vcpu *vcpu)
 236{
 237        struct vcpu_vmx *vmx = to_vmx(vcpu);
 238
 239        if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
 240                return;
 241
 242        kvm_clear_request(KVM_REQ_GET_VMCS12_PAGES, vcpu);
 243
 244        vmx->nested.vmxon = false;
 245        vmx->nested.smm.vmxon = false;
 246        free_vpid(vmx->nested.vpid02);
 247        vmx->nested.posted_intr_nv = -1;
 248        vmx->nested.current_vmptr = -1ull;
 249        if (enable_shadow_vmcs) {
 250                vmx_disable_shadow_vmcs(vmx);
 251                vmcs_clear(vmx->vmcs01.shadow_vmcs);
 252                free_vmcs(vmx->vmcs01.shadow_vmcs);
 253                vmx->vmcs01.shadow_vmcs = NULL;
 254        }
 255        kfree(vmx->nested.cached_vmcs12);
 256        vmx->nested.cached_vmcs12 = NULL;
 257        kfree(vmx->nested.cached_shadow_vmcs12);
 258        vmx->nested.cached_shadow_vmcs12 = NULL;
 259        /* Unpin physical memory we referred to in the vmcs02 */
 260        if (vmx->nested.apic_access_page) {
 261                kvm_release_page_clean(vmx->nested.apic_access_page);
 262                vmx->nested.apic_access_page = NULL;
 263        }
 264        kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
 265        kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
 266        vmx->nested.pi_desc = NULL;
 267
 268        kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
 269
 270        nested_release_evmcs(vcpu);
 271
 272        free_loaded_vmcs(&vmx->nested.vmcs02);
 273}
 274
 275static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
 276                                     struct loaded_vmcs *prev)
 277{
 278        struct vmcs_host_state *dest, *src;
 279
 280        if (unlikely(!vmx->guest_state_loaded))
 281                return;
 282
 283        src = &prev->host_state;
 284        dest = &vmx->loaded_vmcs->host_state;
 285
 286        vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base);
 287        dest->ldt_sel = src->ldt_sel;
 288#ifdef CONFIG_X86_64
 289        dest->ds_sel = src->ds_sel;
 290        dest->es_sel = src->es_sel;
 291#endif
 292}
 293
 294static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
 295{
 296        struct vcpu_vmx *vmx = to_vmx(vcpu);
 297        struct loaded_vmcs *prev;
 298        int cpu;
 299
 300        if (vmx->loaded_vmcs == vmcs)
 301                return;
 302
 303        cpu = get_cpu();
 304        prev = vmx->loaded_vmcs;
 305        vmx->loaded_vmcs = vmcs;
 306        vmx_vcpu_load_vmcs(vcpu, cpu);
 307        vmx_sync_vmcs_host_state(vmx, prev);
 308        put_cpu();
 309
 310        vmx_segment_cache_clear(vmx);
 311}
 312
 313/*
 314 * Ensure that the current vmcs of the logical processor is the
 315 * vmcs01 of the vcpu before calling free_nested().
 316 */
 317void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu)
 318{
 319        vcpu_load(vcpu);
 320        vmx_leave_nested(vcpu);
 321        vmx_switch_vmcs(vcpu, &to_vmx(vcpu)->vmcs01);
 322        free_nested(vcpu);
 323        vcpu_put(vcpu);
 324}
 325
 326static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
 327                struct x86_exception *fault)
 328{
 329        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 330        struct vcpu_vmx *vmx = to_vmx(vcpu);
 331        u32 exit_reason;
 332        unsigned long exit_qualification = vcpu->arch.exit_qualification;
 333
 334        if (vmx->nested.pml_full) {
 335                exit_reason = EXIT_REASON_PML_FULL;
 336                vmx->nested.pml_full = false;
 337                exit_qualification &= INTR_INFO_UNBLOCK_NMI;
 338        } else if (fault->error_code & PFERR_RSVD_MASK)
 339                exit_reason = EXIT_REASON_EPT_MISCONFIG;
 340        else
 341                exit_reason = EXIT_REASON_EPT_VIOLATION;
 342
 343        nested_vmx_vmexit(vcpu, exit_reason, 0, exit_qualification);
 344        vmcs12->guest_physical_address = fault->address;
 345}
 346
 347static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
 348{
 349        WARN_ON(mmu_is_nested(vcpu));
 350
 351        vcpu->arch.mmu = &vcpu->arch.guest_mmu;
 352        kvm_init_shadow_ept_mmu(vcpu,
 353                        to_vmx(vcpu)->nested.msrs.ept_caps &
 354                        VMX_EPT_EXECUTE_ONLY_BIT,
 355                        nested_ept_ad_enabled(vcpu),
 356                        nested_ept_get_cr3(vcpu));
 357        vcpu->arch.mmu->set_cr3           = vmx_set_cr3;
 358        vcpu->arch.mmu->get_cr3           = nested_ept_get_cr3;
 359        vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault;
 360        vcpu->arch.mmu->get_pdptr         = kvm_pdptr_read;
 361
 362        vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
 363}
 364
 365static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
 366{
 367        vcpu->arch.mmu = &vcpu->arch.root_mmu;
 368        vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
 369}
 370
 371static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
 372                                            u16 error_code)
 373{
 374        bool inequality, bit;
 375
 376        bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
 377        inequality =
 378                (error_code & vmcs12->page_fault_error_code_mask) !=
 379                 vmcs12->page_fault_error_code_match;
 380        return inequality ^ bit;
 381}
 382
 383
 384/*
  385 * KVM wants to inject page-faults that it received into the guest. This function
  386 * checks whether, for a nested guest, they need to be injected into L1 or L2.
 387 */
 388static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual)
 389{
 390        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 391        unsigned int nr = vcpu->arch.exception.nr;
 392        bool has_payload = vcpu->arch.exception.has_payload;
 393        unsigned long payload = vcpu->arch.exception.payload;
 394
 395        if (nr == PF_VECTOR) {
 396                if (vcpu->arch.exception.nested_apf) {
 397                        *exit_qual = vcpu->arch.apf.nested_apf_token;
 398                        return 1;
 399                }
 400                if (nested_vmx_is_page_fault_vmexit(vmcs12,
 401                                                    vcpu->arch.exception.error_code)) {
 402                        *exit_qual = has_payload ? payload : vcpu->arch.cr2;
 403                        return 1;
 404                }
 405        } else if (vmcs12->exception_bitmap & (1u << nr)) {
 406                if (nr == DB_VECTOR) {
 407                        if (!has_payload) {
 408                                payload = vcpu->arch.dr6;
 409                                payload &= ~(DR6_FIXED_1 | DR6_BT);
 410                                payload ^= DR6_RTM;
 411                        }
 412                        *exit_qual = payload;
 413                } else
 414                        *exit_qual = 0;
 415                return 1;
 416        }
 417
 418        return 0;
 419}
 420
 421
 422static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
 423                struct x86_exception *fault)
 424{
 425        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 426
 427        WARN_ON(!is_guest_mode(vcpu));
 428
 429        if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) &&
 430                !to_vmx(vcpu)->nested.nested_run_pending) {
 431                vmcs12->vm_exit_intr_error_code = fault->error_code;
 432                nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
 433                                  PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
 434                                  INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK,
 435                                  fault->address);
 436        } else {
 437                kvm_inject_page_fault(vcpu, fault);
 438        }
 439}
 440
 441static bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa)
 442{
 443        return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu));
 444}
 445
 446static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
 447                                               struct vmcs12 *vmcs12)
 448{
 449        if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
 450                return 0;
 451
 452        if (CC(!page_address_valid(vcpu, vmcs12->io_bitmap_a)) ||
 453            CC(!page_address_valid(vcpu, vmcs12->io_bitmap_b)))
 454                return -EINVAL;
 455
 456        return 0;
 457}
 458
 459static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
 460                                                struct vmcs12 *vmcs12)
 461{
 462        if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
 463                return 0;
 464
 465        if (CC(!page_address_valid(vcpu, vmcs12->msr_bitmap)))
 466                return -EINVAL;
 467
 468        return 0;
 469}
 470
 471static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
 472                                                struct vmcs12 *vmcs12)
 473{
 474        if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
 475                return 0;
 476
 477        if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr)))
 478                return -EINVAL;
 479
 480        return 0;
 481}
 482
 483/*
 484 * Check if MSR is intercepted for L01 MSR bitmap.
 485 */
 486static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr)
 487{
 488        unsigned long *msr_bitmap;
 489        int f = sizeof(unsigned long);
 490
 491        if (!cpu_has_vmx_msr_bitmap())
 492                return true;
 493
 494        msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;
 495
 496        if (msr <= 0x1fff) {
 497                return !!test_bit(msr, msr_bitmap + 0x800 / f);
 498        } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
 499                msr &= 0x1fff;
 500                return !!test_bit(msr, msr_bitmap + 0xc00 / f);
 501        }
 502
 503        return true;
 504}
 505
 506/*
  507 * If an MSR is allowed by L0, check whether it is also allowed by L1.
  508 * The corresponding bit is cleared only if both L0 and L1 allow the MSR.
 509 */
 510static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
 511                                               unsigned long *msr_bitmap_nested,
 512                                               u32 msr, int type)
 513{
 514        int f = sizeof(unsigned long);
 515
 516        /*
 517         * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
 518         * have the write-low and read-high bitmap offsets the wrong way round.
 519         * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
 520         */
 521        if (msr <= 0x1fff) {
 522                if (type & MSR_TYPE_R &&
 523                   !test_bit(msr, msr_bitmap_l1 + 0x000 / f))
 524                        /* read-low */
 525                        __clear_bit(msr, msr_bitmap_nested + 0x000 / f);
 526
 527                if (type & MSR_TYPE_W &&
 528                   !test_bit(msr, msr_bitmap_l1 + 0x800 / f))
 529                        /* write-low */
 530                        __clear_bit(msr, msr_bitmap_nested + 0x800 / f);
 531
 532        } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
 533                msr &= 0x1fff;
 534                if (type & MSR_TYPE_R &&
 535                   !test_bit(msr, msr_bitmap_l1 + 0x400 / f))
 536                        /* read-high */
 537                        __clear_bit(msr, msr_bitmap_nested + 0x400 / f);
 538
 539                if (type & MSR_TYPE_W &&
 540                   !test_bit(msr, msr_bitmap_l1 + 0xc00 / f))
 541                        /* write-high */
 542                        __clear_bit(msr, msr_bitmap_nested + 0xc00 / f);
 543
 544        }
 545}
 546
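/*
 * MSR-bitmap layout relied upon above (illustrative summary):
 *
 *	offset 0x000: read  bitmap for MSRs 0x00000000 - 0x00001fff
 *	offset 0x400: read  bitmap for MSRs 0xc0000000 - 0xc0001fff
 *	offset 0x800: write bitmap for MSRs 0x00000000 - 0x00001fff
 *	offset 0xc00: write bitmap for MSRs 0xc0000000 - 0xc0001fff
 *
 * A set bit intercepts the access; a cleared bit passes the MSR through.
 */
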
 547static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap)
 548{
 549        int msr;
 550
 551        for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
 552                unsigned word = msr / BITS_PER_LONG;
 553
 554                msr_bitmap[word] = ~0;
 555                msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
 556        }
 557}
 558
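/*
 * Worked example for the helper above (illustrative, 64-bit host assumed):
 * with BITS_PER_LONG == 64, the 256 x2APIC MSRs 0x800 - 0x8ff occupy words
 * 0x800/64 = 32 through 35 of the read-low bitmap, and the matching
 * write-low words sit 0x800 bytes (0x800/sizeof(long) = 256 longs) further
 * on.  The loop thus touches eight longs, matching the "eight 8-byte
 * writes" mentioned in nested_vmx_prepare_msr_bitmap() below.
 */
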
 559/*
 560 * Merge L0's and L1's MSR bitmap, return false to indicate that
 561 * we do not use the hardware.
 562 */
 563static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
 564                                                 struct vmcs12 *vmcs12)
 565{
 566        int msr;
 567        unsigned long *msr_bitmap_l1;
 568        unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
 569        struct kvm_host_map *map = &to_vmx(vcpu)->nested.msr_bitmap_map;
 570
 571        /* Nothing to do if the MSR bitmap is not in use.  */
 572        if (!cpu_has_vmx_msr_bitmap() ||
 573            !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
 574                return false;
 575
 576        if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map))
 577                return false;
 578
 579        msr_bitmap_l1 = (unsigned long *)map->hva;
 580
 581        /*
 582         * To keep the control flow simple, pay eight 8-byte writes (sixteen
 583         * 4-byte writes on 32-bit systems) up front to enable intercepts for
 584         * the x2APIC MSR range and selectively disable them below.
 585         */
 586        enable_x2apic_msr_intercepts(msr_bitmap_l0);
 587
 588        if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
 589                if (nested_cpu_has_apic_reg_virt(vmcs12)) {
 590                        /*
 591                         * L0 need not intercept reads for MSRs between 0x800
 592                         * and 0x8ff, it just lets the processor take the value
 593                         * from the virtual-APIC page; take those 256 bits
 594                         * directly from the L1 bitmap.
 595                         */
 596                        for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
 597                                unsigned word = msr / BITS_PER_LONG;
 598
 599                                msr_bitmap_l0[word] = msr_bitmap_l1[word];
 600                        }
 601                }
 602
 603                nested_vmx_disable_intercept_for_msr(
 604                        msr_bitmap_l1, msr_bitmap_l0,
 605                        X2APIC_MSR(APIC_TASKPRI),
 606                        MSR_TYPE_R | MSR_TYPE_W);
 607
 608                if (nested_cpu_has_vid(vmcs12)) {
 609                        nested_vmx_disable_intercept_for_msr(
 610                                msr_bitmap_l1, msr_bitmap_l0,
 611                                X2APIC_MSR(APIC_EOI),
 612                                MSR_TYPE_W);
 613                        nested_vmx_disable_intercept_for_msr(
 614                                msr_bitmap_l1, msr_bitmap_l0,
 615                                X2APIC_MSR(APIC_SELF_IPI),
 616                                MSR_TYPE_W);
 617                }
 618        }
 619
 620        /* KVM unconditionally exposes the FS/GS base MSRs to L1. */
 621        nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
 622                                             MSR_FS_BASE, MSR_TYPE_RW);
 623
 624        nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
 625                                             MSR_GS_BASE, MSR_TYPE_RW);
 626
 627        nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
 628                                             MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
 629
 630        /*
 631         * Checking the L0->L1 bitmap is trying to verify two things:
 632         *
 633         * 1. L0 gave a permission to L1 to actually passthrough the MSR. This
 634         *    ensures that we do not accidentally generate an L02 MSR bitmap
 635         *    from the L12 MSR bitmap that is too permissive.
 636         * 2. That L1 or L2s have actually used the MSR. This avoids
  637 *    unnecessary merging of the bitmap if the MSR is unused. This
 638         *    works properly because we only update the L01 MSR bitmap lazily.
 639         *    So even if L0 should pass L1 these MSRs, the L01 bitmap is only
 640         *    updated to reflect this when L1 (or its L2s) actually write to
 641         *    the MSR.
 642         */
 643        if (!msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL))
 644                nested_vmx_disable_intercept_for_msr(
 645                                        msr_bitmap_l1, msr_bitmap_l0,
 646                                        MSR_IA32_SPEC_CTRL,
 647                                        MSR_TYPE_R | MSR_TYPE_W);
 648
 649        if (!msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD))
 650                nested_vmx_disable_intercept_for_msr(
 651                                        msr_bitmap_l1, msr_bitmap_l0,
 652                                        MSR_IA32_PRED_CMD,
 653                                        MSR_TYPE_W);
 654
 655        kvm_vcpu_unmap(vcpu, &to_vmx(vcpu)->nested.msr_bitmap_map, false);
 656
 657        return true;
 658}
 659
 660static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
 661                                       struct vmcs12 *vmcs12)
 662{
 663        struct kvm_host_map map;
 664        struct vmcs12 *shadow;
 665
 666        if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
 667            vmcs12->vmcs_link_pointer == -1ull)
 668                return;
 669
 670        shadow = get_shadow_vmcs12(vcpu);
 671
 672        if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map))
 673                return;
 674
 675        memcpy(shadow, map.hva, VMCS12_SIZE);
 676        kvm_vcpu_unmap(vcpu, &map, false);
 677}
 678
 679static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
 680                                              struct vmcs12 *vmcs12)
 681{
 682        struct vcpu_vmx *vmx = to_vmx(vcpu);
 683
 684        if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
 685            vmcs12->vmcs_link_pointer == -1ull)
 686                return;
 687
 688        kvm_write_guest(vmx->vcpu.kvm, vmcs12->vmcs_link_pointer,
 689                        get_shadow_vmcs12(vcpu), VMCS12_SIZE);
 690}
 691
 692/*
 693 * In nested virtualization, check if L1 has set
 694 * VM_EXIT_ACK_INTR_ON_EXIT
 695 */
 696static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
 697{
 698        return get_vmcs12(vcpu)->vm_exit_controls &
 699                VM_EXIT_ACK_INTR_ON_EXIT;
 700}
 701
 702static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu)
 703{
 704        return nested_cpu_has_nmi_exiting(get_vmcs12(vcpu));
 705}
 706
 707static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
 708                                          struct vmcs12 *vmcs12)
 709{
 710        if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
 711            CC(!page_address_valid(vcpu, vmcs12->apic_access_addr)))
 712                return -EINVAL;
 713        else
 714                return 0;
 715}
 716
 717static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
 718                                           struct vmcs12 *vmcs12)
 719{
 720        if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
 721            !nested_cpu_has_apic_reg_virt(vmcs12) &&
 722            !nested_cpu_has_vid(vmcs12) &&
 723            !nested_cpu_has_posted_intr(vmcs12))
 724                return 0;
 725
 726        /*
 727         * If virtualize x2apic mode is enabled,
 728         * virtualize apic access must be disabled.
 729         */
 730        if (CC(nested_cpu_has_virt_x2apic_mode(vmcs12) &&
 731               nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)))
 732                return -EINVAL;
 733
 734        /*
 735         * If virtual interrupt delivery is enabled,
 736         * we must exit on external interrupts.
 737         */
 738        if (CC(nested_cpu_has_vid(vmcs12) && !nested_exit_on_intr(vcpu)))
 739                return -EINVAL;
 740
 741        /*
 742         * bits 15:8 should be zero in posted_intr_nv,
  743 * the descriptor address has already been checked
 744         * in nested_get_vmcs12_pages.
 745         *
 746         * bits 5:0 of posted_intr_desc_addr should be zero.
 747         */
 748        if (nested_cpu_has_posted_intr(vmcs12) &&
 749           (CC(!nested_cpu_has_vid(vmcs12)) ||
 750            CC(!nested_exit_intr_ack_set(vcpu)) ||
 751            CC((vmcs12->posted_intr_nv & 0xff00)) ||
 752            CC((vmcs12->posted_intr_desc_addr & 0x3f)) ||
 753            CC((vmcs12->posted_intr_desc_addr >> cpuid_maxphyaddr(vcpu)))))
 754                return -EINVAL;
 755
 756        /* tpr shadow is needed by all apicv features. */
 757        if (CC(!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)))
 758                return -EINVAL;
 759
 760        return 0;
 761}
 762
 763static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
 764                                       u32 count, u64 addr)
 765{
 766        int maxphyaddr;
 767
 768        if (count == 0)
 769                return 0;
 770        maxphyaddr = cpuid_maxphyaddr(vcpu);
 771        if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr ||
 772            (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr)
 773                return -EINVAL;
 774
 775        return 0;
 776}
 777
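/*
 * Example of what the check above enforces (illustrative, maxphyaddr = 36
 * assumed): each vmx_msr_entry is 16 bytes, so with addr = 0xffffff000 a
 * count of 256 ends at 0xfffffffff and passes, while a count of 257 runs
 * past the 2^36 physical-address limit and fails, as does any address that
 * is not 16-byte aligned.
 */
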
 778static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu,
 779                                                     struct vmcs12 *vmcs12)
 780{
 781        if (CC(nested_vmx_check_msr_switch(vcpu,
 782                                           vmcs12->vm_exit_msr_load_count,
 783                                           vmcs12->vm_exit_msr_load_addr)) ||
 784            CC(nested_vmx_check_msr_switch(vcpu,
 785                                           vmcs12->vm_exit_msr_store_count,
 786                                           vmcs12->vm_exit_msr_store_addr)))
 787                return -EINVAL;
 788
 789        return 0;
 790}
 791
 792static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu,
 793                                                      struct vmcs12 *vmcs12)
 794{
 795        if (CC(nested_vmx_check_msr_switch(vcpu,
 796                                           vmcs12->vm_entry_msr_load_count,
 797                                           vmcs12->vm_entry_msr_load_addr)))
 798                return -EINVAL;
 799
 800        return 0;
 801}
 802
 803static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
 804                                         struct vmcs12 *vmcs12)
 805{
 806        if (!nested_cpu_has_pml(vmcs12))
 807                return 0;
 808
 809        if (CC(!nested_cpu_has_ept(vmcs12)) ||
 810            CC(!page_address_valid(vcpu, vmcs12->pml_address)))
 811                return -EINVAL;
 812
 813        return 0;
 814}
 815
 816static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu,
 817                                                        struct vmcs12 *vmcs12)
 818{
 819        if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) &&
 820               !nested_cpu_has_ept(vmcs12)))
 821                return -EINVAL;
 822        return 0;
 823}
 824
 825static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu,
 826                                                         struct vmcs12 *vmcs12)
 827{
 828        if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) &&
 829               !nested_cpu_has_ept(vmcs12)))
 830                return -EINVAL;
 831        return 0;
 832}
 833
 834static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu,
 835                                                 struct vmcs12 *vmcs12)
 836{
 837        if (!nested_cpu_has_shadow_vmcs(vmcs12))
 838                return 0;
 839
 840        if (CC(!page_address_valid(vcpu, vmcs12->vmread_bitmap)) ||
 841            CC(!page_address_valid(vcpu, vmcs12->vmwrite_bitmap)))
 842                return -EINVAL;
 843
 844        return 0;
 845}
 846
 847static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
 848                                       struct vmx_msr_entry *e)
 849{
 850        /* x2APIC MSR accesses are not allowed */
 851        if (CC(vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8))
 852                return -EINVAL;
 853        if (CC(e->index == MSR_IA32_UCODE_WRITE) || /* SDM Table 35-2 */
 854            CC(e->index == MSR_IA32_UCODE_REV))
 855                return -EINVAL;
 856        if (CC(e->reserved != 0))
 857                return -EINVAL;
 858        return 0;
 859}
 860
 861static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
 862                                     struct vmx_msr_entry *e)
 863{
 864        if (CC(e->index == MSR_FS_BASE) ||
 865            CC(e->index == MSR_GS_BASE) ||
 866            CC(e->index == MSR_IA32_SMM_MONITOR_CTL) || /* SMM is not supported */
 867            nested_vmx_msr_check_common(vcpu, e))
 868                return -EINVAL;
 869        return 0;
 870}
 871
 872static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
 873                                      struct vmx_msr_entry *e)
 874{
 875        if (CC(e->index == MSR_IA32_SMBASE) || /* SMM is not supported */
 876            nested_vmx_msr_check_common(vcpu, e))
 877                return -EINVAL;
 878        return 0;
 879}
 880
 881static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu)
 882{
 883        struct vcpu_vmx *vmx = to_vmx(vcpu);
 884        u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
 885                                       vmx->nested.msrs.misc_high);
 886
 887        return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER;
 888}
 889
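/*
 * Illustrative arithmetic: vmx_misc_max_msr() extracts the "recommended
 * maximum number of MSRs" field N from IA32_VMX_MISC and the multiplier is
 * 512, so the value computed above is 512 * (N + 1); e.g. N = 0 yields 512
 * entries per MSR load/store list.
 */
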
 890/*
 891 * Load guest's/host's msr at nested entry/exit.
  892 * Return 0 for success, or the (1-based) index of the failing entry.
 893 *
 894 * One of the failure modes for MSR load/store is when a list exceeds the
  895 * virtual hardware's capacity. To maintain compatibility with hardware as much
  896 * as possible, process all valid entries before failing rather than prechecking
 897 * for a capacity violation.
 898 */
 899static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
 900{
 901        u32 i;
 902        struct vmx_msr_entry e;
 903        u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);
 904
 905        for (i = 0; i < count; i++) {
 906                if (unlikely(i >= max_msr_list_size))
 907                        goto fail;
 908
 909                if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
 910                                        &e, sizeof(e))) {
 911                        pr_debug_ratelimited(
 912                                "%s cannot read MSR entry (%u, 0x%08llx)\n",
 913                                __func__, i, gpa + i * sizeof(e));
 914                        goto fail;
 915                }
 916                if (nested_vmx_load_msr_check(vcpu, &e)) {
 917                        pr_debug_ratelimited(
 918                                "%s check failed (%u, 0x%x, 0x%x)\n",
 919                                __func__, i, e.index, e.reserved);
 920                        goto fail;
 921                }
 922                if (kvm_set_msr(vcpu, e.index, e.value)) {
 923                        pr_debug_ratelimited(
 924                                "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
 925                                __func__, i, e.index, e.value);
 926                        goto fail;
 927                }
 928        }
 929        return 0;
 930fail:
 931        return i + 1;
 932}
 933
 934static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu,
 935                                            u32 msr_index,
 936                                            u64 *data)
 937{
 938        struct vcpu_vmx *vmx = to_vmx(vcpu);
 939
 940        /*
 941         * If the L0 hypervisor stored a more accurate value for the TSC that
 942         * does not include the time taken for emulation of the L2->L1
 943         * VM-exit in L0, use the more accurate value.
 944         */
 945        if (msr_index == MSR_IA32_TSC) {
 946                int index = vmx_find_msr_index(&vmx->msr_autostore.guest,
 947                                               MSR_IA32_TSC);
 948
 949                if (index >= 0) {
 950                        u64 val = vmx->msr_autostore.guest.val[index].value;
 951
 952                        *data = kvm_read_l1_tsc(vcpu, val);
 953                        return true;
 954                }
 955        }
 956
 957        if (kvm_get_msr(vcpu, msr_index, data)) {
 958                pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__,
 959                        msr_index);
 960                return false;
 961        }
 962        return true;
 963}
 964
 965static bool read_and_check_msr_entry(struct kvm_vcpu *vcpu, u64 gpa, int i,
 966                                     struct vmx_msr_entry *e)
 967{
 968        if (kvm_vcpu_read_guest(vcpu,
 969                                gpa + i * sizeof(*e),
 970                                e, 2 * sizeof(u32))) {
 971                pr_debug_ratelimited(
 972                        "%s cannot read MSR entry (%u, 0x%08llx)\n",
 973                        __func__, i, gpa + i * sizeof(*e));
 974                return false;
 975        }
 976        if (nested_vmx_store_msr_check(vcpu, e)) {
 977                pr_debug_ratelimited(
 978                        "%s check failed (%u, 0x%x, 0x%x)\n",
 979                        __func__, i, e->index, e->reserved);
 980                return false;
 981        }
 982        return true;
 983}
 984
 985static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
 986{
 987        u64 data;
 988        u32 i;
 989        struct vmx_msr_entry e;
 990        u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);
 991
 992        for (i = 0; i < count; i++) {
 993                if (unlikely(i >= max_msr_list_size))
 994                        return -EINVAL;
 995
 996                if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
 997                        return -EINVAL;
 998
 999                if (!nested_vmx_get_vmexit_msr_value(vcpu, e.index, &data))
1000                        return -EINVAL;
1001
1002                if (kvm_vcpu_write_guest(vcpu,
1003                                         gpa + i * sizeof(e) +
1004                                             offsetof(struct vmx_msr_entry, value),
1005                                         &data, sizeof(data))) {
1006                        pr_debug_ratelimited(
1007                                "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
1008                                __func__, i, e.index, data);
1009                        return -EINVAL;
1010                }
1011        }
1012        return 0;
1013}
1014
1015static bool nested_msr_store_list_has_msr(struct kvm_vcpu *vcpu, u32 msr_index)
1016{
1017        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1018        u32 count = vmcs12->vm_exit_msr_store_count;
1019        u64 gpa = vmcs12->vm_exit_msr_store_addr;
1020        struct vmx_msr_entry e;
1021        u32 i;
1022
1023        for (i = 0; i < count; i++) {
1024                if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
1025                        return false;
1026
1027                if (e.index == msr_index)
1028                        return true;
1029        }
1030        return false;
1031}
1032
1033static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu,
1034                                           u32 msr_index)
1035{
1036        struct vcpu_vmx *vmx = to_vmx(vcpu);
1037        struct vmx_msrs *autostore = &vmx->msr_autostore.guest;
1038        bool in_vmcs12_store_list;
1039        int msr_autostore_index;
1040        bool in_autostore_list;
1041        int last;
1042
1043        msr_autostore_index = vmx_find_msr_index(autostore, msr_index);
1044        in_autostore_list = msr_autostore_index >= 0;
1045        in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index);
1046
1047        if (in_vmcs12_store_list && !in_autostore_list) {
1048                if (autostore->nr == NR_LOADSTORE_MSRS) {
1049                        /*
1050                         * Emulated VMEntry does not fail here.  Instead a less
1051                         * accurate value will be returned by
1052                         * nested_vmx_get_vmexit_msr_value() using kvm_get_msr()
1053                         * instead of reading the value from the vmcs02 VMExit
1054                         * MSR-store area.
1055                         */
1056                        pr_warn_ratelimited(
1057                                "Not enough msr entries in msr_autostore.  Can't add msr %x\n",
1058                                msr_index);
1059                        return;
1060                }
1061                last = autostore->nr++;
1062                autostore->val[last].index = msr_index;
1063        } else if (!in_vmcs12_store_list && in_autostore_list) {
1064                last = --autostore->nr;
1065                autostore->val[msr_autostore_index] = autostore->val[last];
1066        }
1067}
1068
1069static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val)
1070{
1071        unsigned long invalid_mask;
1072
1073        invalid_mask = (~0ULL) << cpuid_maxphyaddr(vcpu);
1074        return (val & invalid_mask) == 0;
1075}
1076
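/*
 * Worked example (illustrative): with cpuid_maxphyaddr() == 36 the mask
 * above is ~0ULL << 36, so a CR3 value of 0xfffff000 passes while anything
 * with bit 36 or above set, e.g. 0x1000000000, is rejected.
 */
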
1077/*
1078 * Load guest's/host's cr3 at nested entry/exit.  @nested_ept is true if we are
1079 * emulating VM-Entry into a guest with EPT enabled.  On failure, the expected
1080 * Exit Qualification (for a VM-Entry consistency check VM-Exit) is assigned to
1081 * @entry_failure_code.
1082 */
1083static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept,
1084                               u32 *entry_failure_code)
1085{
1086        if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) {
1087                if (CC(!nested_cr3_valid(vcpu, cr3))) {
1088                        *entry_failure_code = ENTRY_FAIL_DEFAULT;
1089                        return -EINVAL;
1090                }
1091
1092                /*
1093                 * If PAE paging and EPT are both on, CR3 is not used by the CPU and
1094                 * must not be dereferenced.
1095                 */
1096                if (is_pae_paging(vcpu) && !nested_ept) {
1097                        if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))) {
1098                                *entry_failure_code = ENTRY_FAIL_PDPTE;
1099                                return -EINVAL;
1100                        }
1101                }
1102        }
1103
1104        if (!nested_ept)
1105                kvm_mmu_new_cr3(vcpu, cr3, false);
1106
1107        vcpu->arch.cr3 = cr3;
1108        kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
1109
1110        kvm_init_mmu(vcpu, false);
1111
1112        return 0;
1113}
1114
1115/*
 1116 * Returns true if KVM is able to configure the CPU to tag TLB entries
1117 * populated by L2 differently than TLB entries populated
1118 * by L1.
1119 *
1120 * If L0 uses EPT, L1 and L2 run with different EPTP because
1121 * guest_mode is part of kvm_mmu_page_role. Thus, TLB entries
1122 * are tagged with different EPTP.
1123 *
1124 * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged
1125 * with different VPID (L1 entries are tagged with vmx->vpid
1126 * while L2 entries are tagged with vmx->nested.vpid02).
1127 */
1128static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu)
1129{
1130        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1131
1132        return enable_ept ||
1133               (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02);
1134}
1135
1136static u16 nested_get_vpid02(struct kvm_vcpu *vcpu)
1137{
1138        struct vcpu_vmx *vmx = to_vmx(vcpu);
1139
1140        return vmx->nested.vpid02 ? vmx->nested.vpid02 : vmx->vpid;
1141}
1142
1143static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
1144{
1145        superset &= mask;
1146        subset &= mask;
1147
1148        return (superset | subset) == superset;
1149}
1150
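/*
 * Example (illustrative): is_bitwise_subset(0xf0, 0x30, ~0ULL) is true
 * because every bit of 0x30 is also set in 0xf0, while
 * is_bitwise_subset(0xf0, 0x0f, ~0ULL) is false.  The vmx_restore_*()
 * helpers below use the mask argument to apply the test to one 32-bit half
 * of a capability MSR at a time.
 */
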
1151static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
1152{
1153        const u64 feature_and_reserved =
1154                /* feature (except bit 48; see below) */
1155                BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
1156                /* reserved */
1157                BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
1158        u64 vmx_basic = vmx->nested.msrs.basic;
1159
1160        if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
1161                return -EINVAL;
1162
1163        /*
1164         * KVM does not emulate a version of VMX that constrains physical
1165         * addresses of VMX structures (e.g. VMCS) to 32-bits.
1166         */
1167        if (data & BIT_ULL(48))
1168                return -EINVAL;
1169
1170        if (vmx_basic_vmcs_revision_id(vmx_basic) !=
1171            vmx_basic_vmcs_revision_id(data))
1172                return -EINVAL;
1173
1174        if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
1175                return -EINVAL;
1176
1177        vmx->nested.msrs.basic = data;
1178        return 0;
1179}
1180
1181static int
1182vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
1183{
1184        u64 supported;
1185        u32 *lowp, *highp;
1186
1187        switch (msr_index) {
1188        case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
1189                lowp = &vmx->nested.msrs.pinbased_ctls_low;
1190                highp = &vmx->nested.msrs.pinbased_ctls_high;
1191                break;
1192        case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
1193                lowp = &vmx->nested.msrs.procbased_ctls_low;
1194                highp = &vmx->nested.msrs.procbased_ctls_high;
1195                break;
1196        case MSR_IA32_VMX_TRUE_EXIT_CTLS:
1197                lowp = &vmx->nested.msrs.exit_ctls_low;
1198                highp = &vmx->nested.msrs.exit_ctls_high;
1199                break;
1200        case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
1201                lowp = &vmx->nested.msrs.entry_ctls_low;
1202                highp = &vmx->nested.msrs.entry_ctls_high;
1203                break;
1204        case MSR_IA32_VMX_PROCBASED_CTLS2:
1205                lowp = &vmx->nested.msrs.secondary_ctls_low;
1206                highp = &vmx->nested.msrs.secondary_ctls_high;
1207                break;
1208        default:
1209                BUG();
1210        }
1211
1212        supported = vmx_control_msr(*lowp, *highp);
1213
1214        /* Check must-be-1 bits are still 1. */
1215        if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0)))
1216                return -EINVAL;
1217
1218        /* Check must-be-0 bits are still 0. */
1219        if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
1220                return -EINVAL;
1221
1222        *lowp = data;
1223        *highp = data >> 32;
1224        return 0;
1225}
1226
1227static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
1228{
1229        const u64 feature_and_reserved_bits =
1230                /* feature */
1231                BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) |
1232                BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) |
1233                /* reserved */
1234                GENMASK_ULL(13, 9) | BIT_ULL(31);
1235        u64 vmx_misc;
1236
1237        vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
1238                                   vmx->nested.msrs.misc_high);
1239
1240        if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
1241                return -EINVAL;
1242
1243        if ((vmx->nested.msrs.pinbased_ctls_high &
1244             PIN_BASED_VMX_PREEMPTION_TIMER) &&
1245            vmx_misc_preemption_timer_rate(data) !=
1246            vmx_misc_preemption_timer_rate(vmx_misc))
1247                return -EINVAL;
1248
1249        if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc))
1250                return -EINVAL;
1251
1252        if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc))
1253                return -EINVAL;
1254
1255        if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
1256                return -EINVAL;
1257
1258        vmx->nested.msrs.misc_low = data;
1259        vmx->nested.msrs.misc_high = data >> 32;
1260
1261        return 0;
1262}
1263
1264static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
1265{
1266        u64 vmx_ept_vpid_cap;
1267
1268        vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.msrs.ept_caps,
1269                                           vmx->nested.msrs.vpid_caps);
1270
1271        /* Every bit is either reserved or a feature bit. */
1272        if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
1273                return -EINVAL;
1274
1275        vmx->nested.msrs.ept_caps = data;
1276        vmx->nested.msrs.vpid_caps = data >> 32;
1277        return 0;
1278}
1279
1280static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
1281{
1282        u64 *msr;
1283
1284        switch (msr_index) {
1285        case MSR_IA32_VMX_CR0_FIXED0:
1286                msr = &vmx->nested.msrs.cr0_fixed0;
1287                break;
1288        case MSR_IA32_VMX_CR4_FIXED0:
1289                msr = &vmx->nested.msrs.cr4_fixed0;
1290                break;
1291        default:
1292                BUG();
1293        }
1294
1295        /*
 1296         * Bits that are 1 (indicating "must-be-1" during VMX operation)
 1297         * must also be 1 in the restored value.
1298         */
1299        if (!is_bitwise_subset(data, *msr, -1ULL))
1300                return -EINVAL;
1301
1302        *msr = data;
1303        return 0;
1304}
1305
1306/*
1307 * Called when userspace is restoring VMX MSRs.
1308 *
1309 * Returns 0 on success, non-0 otherwise.
1310 */
1311int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1312{
1313        struct vcpu_vmx *vmx = to_vmx(vcpu);
1314
1315        /*
1316         * Don't allow changes to the VMX capability MSRs while the vCPU
1317         * is in VMX operation.
1318         */
1319        if (vmx->nested.vmxon)
1320                return -EBUSY;
1321
1322        switch (msr_index) {
1323        case MSR_IA32_VMX_BASIC:
1324                return vmx_restore_vmx_basic(vmx, data);
1325        case MSR_IA32_VMX_PINBASED_CTLS:
1326        case MSR_IA32_VMX_PROCBASED_CTLS:
1327        case MSR_IA32_VMX_EXIT_CTLS:
1328        case MSR_IA32_VMX_ENTRY_CTLS:
1329                /*
1330                 * The "non-true" VMX capability MSRs are generated from the
1331                 * "true" MSRs, so we do not support restoring them directly.
1332                 *
1333                 * If userspace wants to emulate VMX_BASIC[55]=0, userspace
1334                 * should restore the "true" MSRs with the must-be-1 bits
1335                 * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND
1336                 * DEFAULT SETTINGS".
1337                 */
1338                return -EINVAL;
1339        case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
1340        case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
1341        case MSR_IA32_VMX_TRUE_EXIT_CTLS:
1342        case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
1343        case MSR_IA32_VMX_PROCBASED_CTLS2:
1344                return vmx_restore_control_msr(vmx, msr_index, data);
1345        case MSR_IA32_VMX_MISC:
1346                return vmx_restore_vmx_misc(vmx, data);
1347        case MSR_IA32_VMX_CR0_FIXED0:
1348        case MSR_IA32_VMX_CR4_FIXED0:
1349                return vmx_restore_fixed0_msr(vmx, msr_index, data);
1350        case MSR_IA32_VMX_CR0_FIXED1:
1351        case MSR_IA32_VMX_CR4_FIXED1:
1352                /*
1353                 * These MSRs are generated based on the vCPU's CPUID, so we
1354                 * do not support restoring them directly.
1355                 */
1356                return -EINVAL;
1357        case MSR_IA32_VMX_EPT_VPID_CAP:
1358                return vmx_restore_vmx_ept_vpid_cap(vmx, data);
1359        case MSR_IA32_VMX_VMCS_ENUM:
1360                vmx->nested.msrs.vmcs_enum = data;
1361                return 0;
1362        case MSR_IA32_VMX_VMFUNC:
1363                if (data & ~vmx->nested.msrs.vmfunc_controls)
1364                        return -EINVAL;
1365                vmx->nested.msrs.vmfunc_controls = data;
1366                return 0;
1367        default:
1368                /*
1369                 * The rest of the VMX capability MSRs do not support restore.
1370                 */
1371                return -EINVAL;
1372        }
1373}
1374
1375/* Returns 0 on success, non-0 otherwise. */
1376int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata)
1377{
1378        switch (msr_index) {
1379        case MSR_IA32_VMX_BASIC:
1380                *pdata = msrs->basic;
1381                break;
1382        case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
1383        case MSR_IA32_VMX_PINBASED_CTLS:
1384                *pdata = vmx_control_msr(
1385                        msrs->pinbased_ctls_low,
1386                        msrs->pinbased_ctls_high);
1387                if (msr_index == MSR_IA32_VMX_PINBASED_CTLS)
1388                        *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
1389                break;
1390        case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
1391        case MSR_IA32_VMX_PROCBASED_CTLS:
1392                *pdata = vmx_control_msr(
1393                        msrs->procbased_ctls_low,
1394                        msrs->procbased_ctls_high);
1395                if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS)
1396                        *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
1397                break;
1398        case MSR_IA32_VMX_TRUE_EXIT_CTLS:
1399        case MSR_IA32_VMX_EXIT_CTLS:
1400                *pdata = vmx_control_msr(
1401                        msrs->exit_ctls_low,
1402                        msrs->exit_ctls_high);
1403                if (msr_index == MSR_IA32_VMX_EXIT_CTLS)
1404                        *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
1405                break;
1406        case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
1407        case MSR_IA32_VMX_ENTRY_CTLS:
1408                *pdata = vmx_control_msr(
1409                        msrs->entry_ctls_low,
1410                        msrs->entry_ctls_high);
1411                if (msr_index == MSR_IA32_VMX_ENTRY_CTLS)
1412                        *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
1413                break;
1414        case MSR_IA32_VMX_MISC:
1415                *pdata = vmx_control_msr(
1416                        msrs->misc_low,
1417                        msrs->misc_high);
1418                break;
1419        case MSR_IA32_VMX_CR0_FIXED0:
1420                *pdata = msrs->cr0_fixed0;
1421                break;
1422        case MSR_IA32_VMX_CR0_FIXED1:
1423                *pdata = msrs->cr0_fixed1;
1424                break;
1425        case MSR_IA32_VMX_CR4_FIXED0:
1426                *pdata = msrs->cr4_fixed0;
1427                break;
1428        case MSR_IA32_VMX_CR4_FIXED1:
1429                *pdata = msrs->cr4_fixed1;
1430                break;
1431        case MSR_IA32_VMX_VMCS_ENUM:
1432                *pdata = msrs->vmcs_enum;
1433                break;
1434        case MSR_IA32_VMX_PROCBASED_CTLS2:
1435                *pdata = vmx_control_msr(
1436                        msrs->secondary_ctls_low,
1437                        msrs->secondary_ctls_high);
1438                break;
1439        case MSR_IA32_VMX_EPT_VPID_CAP:
1440                *pdata = msrs->ept_caps |
1441                        ((u64)msrs->vpid_caps << 32);
1442                break;
1443        case MSR_IA32_VMX_VMFUNC:
1444                *pdata = msrs->vmfunc_controls;
1445                break;
1446        default:
1447                return 1;
1448        }
1449
1450        return 0;
1451}
1452
1453/*
1454 * Copy the writable VMCS shadow fields back to the VMCS12, in case they have
1455 * been modified by the L1 guest.  Note, "writable" in this context means
1456 * "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of
1457 * fields tagged SHADOW_FIELD_RO may or may not align with the "read-only"
1458 * VM-exit information fields (which are actually writable if the vCPU is
1459 * configured to support "VMWRITE to any supported field in the VMCS").
1460 */
1461static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
1462{
1463        struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
1464        struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
1465        struct shadow_vmcs_field field;
1466        unsigned long val;
1467        int i;
1468
1469        if (WARN_ON(!shadow_vmcs))
1470                return;
1471
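        /*
         * Disable preemption so the CPU can't switch away while the shadow
         * VMCS is the current VMCS; the previously loaded VMCS is restored
         * below before preemption is re-enabled.
         */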
1472        preempt_disable();
1473
1474        vmcs_load(shadow_vmcs);
1475
1476        for (i = 0; i < max_shadow_read_write_fields; i++) {
1477                field = shadow_read_write_fields[i];
1478                val = __vmcs_readl(field.encoding);
1479                vmcs12_write_any(vmcs12, field.encoding, field.offset, val);
1480        }
1481
1482        vmcs_clear(shadow_vmcs);
1483        vmcs_load(vmx->loaded_vmcs->vmcs);
1484
1485        preempt_enable();
1486}
1487
1488static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
1489{
1490        const struct shadow_vmcs_field *fields[] = {
1491                shadow_read_write_fields,
1492                shadow_read_only_fields
1493        };
1494        const int max_fields[] = {
1495                max_shadow_read_write_fields,
1496                max_shadow_read_only_fields
1497        };
1498        struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
1499        struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
1500        struct shadow_vmcs_field field;
1501        unsigned long val;
1502        int i, q;
1503
1504        if (WARN_ON(!shadow_vmcs))
1505                return;
1506
1507        vmcs_load(shadow_vmcs);
1508
1509        for (q = 0; q < ARRAY_SIZE(fields); q++) {
1510                for (i = 0; i < max_fields[q]; i++) {
1511                        field = fields[q][i];
1512                        val = vmcs12_read_any(vmcs12, field.encoding,
1513                                              field.offset);
1514                        __vmcs_writel(field.encoding, val);
1515                }
1516        }
1517
1518        vmcs_clear(shadow_vmcs);
1519        vmcs_load(vmx->loaded_vmcs->vmcs);
1520}
1521
1522static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx)
1523{
1524        struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
1525        struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
1526
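        /*
         * tpr_threshold and guest_rip are in the "NONE" clean-fields group
         * and are always copied back; every other group is copied only when
         * its hv_clean_fields bit is clear, i.e. when L1 has modified that
         * group since the last sync.
         */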
1527        /* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */
1528        vmcs12->tpr_threshold = evmcs->tpr_threshold;
1529        vmcs12->guest_rip = evmcs->guest_rip;
1530
1531        if (unlikely(!(evmcs->hv_clean_fields &
1532                       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) {
1533                vmcs12->guest_rsp = evmcs->guest_rsp;
1534                vmcs12->guest_rflags = evmcs->guest_rflags;
1535                vmcs12->guest_interruptibility_info =
1536                        evmcs->guest_interruptibility_info;
1537        }
1538
1539        if (unlikely(!(evmcs->hv_clean_fields &
1540                       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) {
1541                vmcs12->cpu_based_vm_exec_control =
1542                        evmcs->cpu_based_vm_exec_control;
1543        }
1544
1545        if (unlikely(!(evmcs->hv_clean_fields &
1546                       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) {
1547                vmcs12->exception_bitmap = evmcs->exception_bitmap;
1548        }
1549
1550        if (unlikely(!(evmcs->hv_clean_fields &
1551                       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) {
1552                vmcs12->vm_entry_controls = evmcs->vm_entry_controls;
1553        }
1554
1555        if (unlikely(!(evmcs->hv_clean_fields &
1556                       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) {
1557                vmcs12->vm_entry_intr_info_field =
1558                        evmcs->vm_entry_intr_info_field;
1559                vmcs12->vm_entry_exception_error_code =
1560                        evmcs->vm_entry_exception_error_code;
1561                vmcs12->vm_entry_instruction_len =
1562                        evmcs->vm_entry_instruction_len;
1563        }
1564
1565        if (unlikely(!(evmcs->hv_clean_fields &
1566                       HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) {
1567                vmcs12->host_ia32_pat = evmcs->host_ia32_pat;
1568                vmcs12->host_ia32_efer = evmcs->host_ia32_efer;
1569                vmcs12->host_cr0 = evmcs->host_cr0;
1570                vmcs12->host_cr3 = evmcs->host_cr3;
1571                vmcs12->host_cr4 = evmcs->host_cr4;
1572                vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp;
1573                vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip;
1574                vmcs12->host_rip = evmcs->host_rip;
1575                vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs;
1576                vmcs12->host_es_selector = evmcs->host_es_selector;
1577                vmcs12->host_cs_selector = evmcs->host_cs_selector;
1578                vmcs12->host_ss_selector = evmcs->host_ss_selector;
1579                vmcs12->host_ds_selector = evmcs->host_ds_selector;
1580                vmcs12->host_fs_selector = evmcs->host_fs_selector;
1581                vmcs12->host_gs_selector = evmcs->host_gs_selector;
1582                vmcs12->host_tr_selector = evmcs->host_tr_selector;
1583        }
1584
1585        if (unlikely(!(evmcs->hv_clean_fields &
1586                       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) {
1587                vmcs12->pin_based_vm_exec_control =
1588                        evmcs->pin_based_vm_exec_control;
1589                vmcs12->vm_exit_controls = evmcs->vm_exit_controls;
1590                vmcs12->secondary_vm_exec_control =
1591                        evmcs->secondary_vm_exec_control;
1592        }
1593
1594        if (unlikely(!(evmcs->hv_clean_fields &
1595                       HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) {
1596                vmcs12->io_bitmap_a = evmcs->io_bitmap_a;
1597                vmcs12->io_bitmap_b = evmcs->io_bitmap_b;
1598        }
1599
1600        if (unlikely(!(evmcs->hv_clean_fields &
1601                       HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) {
1602                vmcs12->msr_bitmap = evmcs->msr_bitmap;
1603        }
1604
1605        if (unlikely(!(evmcs->hv_clean_fields &
1606                       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) {
1607                vmcs12->guest_es_base = evmcs->guest_es_base;
1608                vmcs12->guest_cs_base = evmcs->guest_cs_base;
1609                vmcs12->guest_ss_base = evmcs->guest_ss_base;
1610                vmcs12->guest_ds_base = evmcs->guest_ds_base;
1611                vmcs12->guest_fs_base = evmcs->guest_fs_base;
1612                vmcs12->guest_gs_base = evmcs->guest_gs_base;
1613                vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base;
1614                vmcs12->guest_tr_base = evmcs->guest_tr_base;
1615                vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base;
1616                vmcs12->guest_idtr_base = evmcs->guest_idtr_base;
1617                vmcs12->guest_es_limit = evmcs->guest_es_limit;
1618                vmcs12->guest_cs_limit = evmcs->guest_cs_limit;
1619                vmcs12->guest_ss_limit = evmcs->guest_ss_limit;
1620                vmcs12->guest_ds_limit = evmcs->guest_ds_limit;
1621                vmcs12->guest_fs_limit = evmcs->guest_fs_limit;
1622                vmcs12->guest_gs_limit = evmcs->guest_gs_limit;
1623                vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit;
1624                vmcs12->guest_tr_limit = evmcs->guest_tr_limit;
1625                vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit;
1626                vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit;
1627                vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes;
1628                vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes;
1629                vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes;
1630                vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes;
1631                vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes;
1632                vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes;
1633                vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes;
1634                vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes;
1635                vmcs12->guest_es_selector = evmcs->guest_es_selector;
1636                vmcs12->guest_cs_selector = evmcs->guest_cs_selector;
1637                vmcs12->guest_ss_selector = evmcs->guest_ss_selector;
1638                vmcs12->guest_ds_selector = evmcs->guest_ds_selector;
1639                vmcs12->guest_fs_selector = evmcs->guest_fs_selector;
1640                vmcs12->guest_gs_selector = evmcs->guest_gs_selector;
1641                vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector;
1642                vmcs12->guest_tr_selector = evmcs->guest_tr_selector;
1643        }
1644
1645        if (unlikely(!(evmcs->hv_clean_fields &
1646                       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) {
1647                vmcs12->tsc_offset = evmcs->tsc_offset;
1648                vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr;
1649                vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap;
1650        }
1651
1652        if (unlikely(!(evmcs->hv_clean_fields &
1653                       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) {
1654                vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask;
1655                vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask;
1656                vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow;
1657                vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow;
1658                vmcs12->guest_cr0 = evmcs->guest_cr0;
1659                vmcs12->guest_cr3 = evmcs->guest_cr3;
1660                vmcs12->guest_cr4 = evmcs->guest_cr4;
1661                vmcs12->guest_dr7 = evmcs->guest_dr7;
1662        }
1663
1664        if (unlikely(!(evmcs->hv_clean_fields &
1665                       HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) {
1666                vmcs12->host_fs_base = evmcs->host_fs_base;
1667                vmcs12->host_gs_base = evmcs->host_gs_base;
1668                vmcs12->host_tr_base = evmcs->host_tr_base;
1669                vmcs12->host_gdtr_base = evmcs->host_gdtr_base;
1670                vmcs12->host_idtr_base = evmcs->host_idtr_base;
1671                vmcs12->host_rsp = evmcs->host_rsp;
1672        }
1673
1674        if (unlikely(!(evmcs->hv_clean_fields &
1675                       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) {
1676                vmcs12->ept_pointer = evmcs->ept_pointer;
1677                vmcs12->virtual_processor_id = evmcs->virtual_processor_id;
1678        }
1679
1680        if (unlikely(!(evmcs->hv_clean_fields &
1681                       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) {
1682                vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer;
1683                vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl;
1684                vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat;
1685                vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer;
1686                vmcs12->guest_pdptr0 = evmcs->guest_pdptr0;
1687                vmcs12->guest_pdptr1 = evmcs->guest_pdptr1;
1688                vmcs12->guest_pdptr2 = evmcs->guest_pdptr2;
1689                vmcs12->guest_pdptr3 = evmcs->guest_pdptr3;
1690                vmcs12->guest_pending_dbg_exceptions =
1691                        evmcs->guest_pending_dbg_exceptions;
1692                vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp;
1693                vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip;
1694                vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs;
1695                vmcs12->guest_activity_state = evmcs->guest_activity_state;
1696                vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs;
1697        }
1698
1699        /*
1700         * Not used?
1701         * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr;
1702         * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr;
1703         * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr;
1704         * vmcs12->cr3_target_value0 = evmcs->cr3_target_value0;
1705         * vmcs12->cr3_target_value1 = evmcs->cr3_target_value1;
1706         * vmcs12->cr3_target_value2 = evmcs->cr3_target_value2;
1707         * vmcs12->cr3_target_value3 = evmcs->cr3_target_value3;
1708         * vmcs12->page_fault_error_code_mask =
1709         *              evmcs->page_fault_error_code_mask;
1710         * vmcs12->page_fault_error_code_match =
1711         *              evmcs->page_fault_error_code_match;
1712         * vmcs12->cr3_target_count = evmcs->cr3_target_count;
1713         * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count;
1714         * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count;
1715         * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count;
1716         */
1717
1718        /*
1719         * Read only fields:
1720         * vmcs12->guest_physical_address = evmcs->guest_physical_address;
1721         * vmcs12->vm_instruction_error = evmcs->vm_instruction_error;
1722         * vmcs12->vm_exit_reason = evmcs->vm_exit_reason;
1723         * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info;
1724         * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code;
1725         * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field;
1726         * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code;
1727         * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len;
1728         * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info;
1729         * vmcs12->exit_qualification = evmcs->exit_qualification;
1730         * vmcs12->guest_linear_address = evmcs->guest_linear_address;
1731         *
1732         * Not present in struct vmcs12:
1733         * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx;
1734         * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi;
1735         * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi;
1736         * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip;
1737         */
1738
1739        return 0;
1740}
1741
1742static int copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
1743{
1744        struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
1745        struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
1746
1747        /*
1748         * Should not be changed by KVM:
1749         *
1750         * evmcs->host_es_selector = vmcs12->host_es_selector;
1751         * evmcs->host_cs_selector = vmcs12->host_cs_selector;
1752         * evmcs->host_ss_selector = vmcs12->host_ss_selector;
1753         * evmcs->host_ds_selector = vmcs12->host_ds_selector;
1754         * evmcs->host_fs_selector = vmcs12->host_fs_selector;
1755         * evmcs->host_gs_selector = vmcs12->host_gs_selector;
1756         * evmcs->host_tr_selector = vmcs12->host_tr_selector;
1757         * evmcs->host_ia32_pat = vmcs12->host_ia32_pat;
1758         * evmcs->host_ia32_efer = vmcs12->host_ia32_efer;
1759         * evmcs->host_cr0 = vmcs12->host_cr0;
1760         * evmcs->host_cr3 = vmcs12->host_cr3;
1761         * evmcs->host_cr4 = vmcs12->host_cr4;
1762         * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp;
1763         * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip;
1764         * evmcs->host_rip = vmcs12->host_rip;
1765         * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs;
1766         * evmcs->host_fs_base = vmcs12->host_fs_base;
1767         * evmcs->host_gs_base = vmcs12->host_gs_base;
1768         * evmcs->host_tr_base = vmcs12->host_tr_base;
1769         * evmcs->host_gdtr_base = vmcs12->host_gdtr_base;
1770         * evmcs->host_idtr_base = vmcs12->host_idtr_base;
1771         * evmcs->host_rsp = vmcs12->host_rsp;
1772         * sync_vmcs02_to_vmcs12() doesn't read these:
1773         * evmcs->io_bitmap_a = vmcs12->io_bitmap_a;
1774         * evmcs->io_bitmap_b = vmcs12->io_bitmap_b;
1775         * evmcs->msr_bitmap = vmcs12->msr_bitmap;
1776         * evmcs->ept_pointer = vmcs12->ept_pointer;
1777         * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap;
1778         * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr;
1779         * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr;
1780         * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr;
1781         * evmcs->cr3_target_value0 = vmcs12->cr3_target_value0;
1782         * evmcs->cr3_target_value1 = vmcs12->cr3_target_value1;
1783         * evmcs->cr3_target_value2 = vmcs12->cr3_target_value2;
1784         * evmcs->cr3_target_value3 = vmcs12->cr3_target_value3;
1785         * evmcs->tpr_threshold = vmcs12->tpr_threshold;
1786         * evmcs->virtual_processor_id = vmcs12->virtual_processor_id;
1787         * evmcs->exception_bitmap = vmcs12->exception_bitmap;
1788         * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer;
1789         * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control;
1790         * evmcs->vm_exit_controls = vmcs12->vm_exit_controls;
1791         * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control;
1792         * evmcs->page_fault_error_code_mask =
1793         *              vmcs12->page_fault_error_code_mask;
1794         * evmcs->page_fault_error_code_match =
1795         *              vmcs12->page_fault_error_code_match;
1796         * evmcs->cr3_target_count = vmcs12->cr3_target_count;
1797         * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr;
1798         * evmcs->tsc_offset = vmcs12->tsc_offset;
1799         * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl;
1800         * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask;
1801         * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask;
1802         * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow;
1803         * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow;
1804         * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count;
1805         * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count;
1806         * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count;
1807         *
1808         * Not present in struct vmcs12:
1809         * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx;
1810         * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi;
1811         * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi;
1812         * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip;
1813         */
1814
1815        evmcs->guest_es_selector = vmcs12->guest_es_selector;
1816        evmcs->guest_cs_selector = vmcs12->guest_cs_selector;
1817        evmcs->guest_ss_selector = vmcs12->guest_ss_selector;
1818        evmcs->guest_ds_selector = vmcs12->guest_ds_selector;
1819        evmcs->guest_fs_selector = vmcs12->guest_fs_selector;
1820        evmcs->guest_gs_selector = vmcs12->guest_gs_selector;
1821        evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector;
1822        evmcs->guest_tr_selector = vmcs12->guest_tr_selector;
1823
1824        evmcs->guest_es_limit = vmcs12->guest_es_limit;
1825        evmcs->guest_cs_limit = vmcs12->guest_cs_limit;
1826        evmcs->guest_ss_limit = vmcs12->guest_ss_limit;
1827        evmcs->guest_ds_limit = vmcs12->guest_ds_limit;
1828        evmcs->guest_fs_limit = vmcs12->guest_fs_limit;
1829        evmcs->guest_gs_limit = vmcs12->guest_gs_limit;
1830        evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit;
1831        evmcs->guest_tr_limit = vmcs12->guest_tr_limit;
1832        evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit;
1833        evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit;
1834
1835        evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes;
1836        evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes;
1837        evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes;
1838        evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes;
1839        evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes;
1840        evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes;
1841        evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes;
1842        evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes;
1843
1844        evmcs->guest_es_base = vmcs12->guest_es_base;
1845        evmcs->guest_cs_base = vmcs12->guest_cs_base;
1846        evmcs->guest_ss_base = vmcs12->guest_ss_base;
1847        evmcs->guest_ds_base = vmcs12->guest_ds_base;
1848        evmcs->guest_fs_base = vmcs12->guest_fs_base;
1849        evmcs->guest_gs_base = vmcs12->guest_gs_base;
1850        evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base;
1851        evmcs->guest_tr_base = vmcs12->guest_tr_base;
1852        evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base;
1853        evmcs->guest_idtr_base = vmcs12->guest_idtr_base;
1854
1855        evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat;
1856        evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer;
1857
1858        evmcs->guest_pdptr0 = vmcs12->guest_pdptr0;
1859        evmcs->guest_pdptr1 = vmcs12->guest_pdptr1;
1860        evmcs->guest_pdptr2 = vmcs12->guest_pdptr2;
1861        evmcs->guest_pdptr3 = vmcs12->guest_pdptr3;
1862
1863        evmcs->guest_pending_dbg_exceptions =
1864                vmcs12->guest_pending_dbg_exceptions;
1865        evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp;
1866        evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip;
1867
1868        evmcs->guest_activity_state = vmcs12->guest_activity_state;
1869        evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs;
1870
1871        evmcs->guest_cr0 = vmcs12->guest_cr0;
1872        evmcs->guest_cr3 = vmcs12->guest_cr3;
1873        evmcs->guest_cr4 = vmcs12->guest_cr4;
1874        evmcs->guest_dr7 = vmcs12->guest_dr7;
1875
1876        evmcs->guest_physical_address = vmcs12->guest_physical_address;
1877
1878        evmcs->vm_instruction_error = vmcs12->vm_instruction_error;
1879        evmcs->vm_exit_reason = vmcs12->vm_exit_reason;
1880        evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info;
1881        evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code;
1882        evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field;
1883        evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code;
1884        evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len;
1885        evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info;
1886
1887        evmcs->exit_qualification = vmcs12->exit_qualification;
1888
1889        evmcs->guest_linear_address = vmcs12->guest_linear_address;
1890        evmcs->guest_rsp = vmcs12->guest_rsp;
1891        evmcs->guest_rflags = vmcs12->guest_rflags;
1892
1893        evmcs->guest_interruptibility_info =
1894                vmcs12->guest_interruptibility_info;
1895        evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control;
1896        evmcs->vm_entry_controls = vmcs12->vm_entry_controls;
1897        evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field;
1898        evmcs->vm_entry_exception_error_code =
1899                vmcs12->vm_entry_exception_error_code;
1900        evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len;
1901
1902        evmcs->guest_rip = vmcs12->guest_rip;
1903
1904        evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs;
1905
1906        return 0;
1907}
1908
1909/*
1910 * This is the equivalent of the nested hypervisor executing the vmptrld
1911 * instruction.
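 *
 * Returns 1 when there is nothing to do (the eVMCS is not enabled or not in
 * use) or when the eVMCS was mapped successfully; returns 0 when the eVMCS
 * GPA cannot be mapped or it advertises an unsupported version.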
1912 */
1913static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu,
1914                                                 bool from_launch)
1915{
1916        struct vcpu_vmx *vmx = to_vmx(vcpu);
1917        bool evmcs_gpa_changed = false;
1918        u64 evmcs_gpa;
1919
1920        if (likely(!vmx->nested.enlightened_vmcs_enabled))
1921                return 1;
1922
1923        if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa))
1924                return 1;
1925
1926        if (unlikely(!vmx->nested.hv_evmcs ||
1927                     evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
1928                if (!vmx->nested.hv_evmcs)
1929                        vmx->nested.current_vmptr = -1ull;
1930
1931                nested_release_evmcs(vcpu);
1932
1933                if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa),
1934                                 &vmx->nested.hv_evmcs_map))
1935                        return 0;
1936
1937                vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva;
1938
1939                /*
1940                 * Currently, KVM only supports eVMCS version 1
1941                 * (== KVM_EVMCS_VERSION) and thus expects the guest to set the
1942                 * first u32 field of the eVMCS, which specifies the eVMCS
1943                 * VersionNumber, to that value.
1944                 *
1945                 * The guest should learn the eVMCS versions supported by the
1946                 * host by examining CPUID.0x4000000A.EAX[0:15]. The host
1947                 * userspace VMM is expected to set this CPUID leaf according to
1948                 * the value returned in vmcs_version from nested_enable_evmcs().
1949                 *
1950                 * However, it turns out that Microsoft Hyper-V fails to comply
1951                 * with its own invented interface: when Hyper-V uses eVMCS, it
1952                 * simply sets the first u32 field of the eVMCS to the
1953                 * revision_id specified in MSR_IA32_VMX_BASIC, instead of an
1954                 * eVMCS version number, i.e. one of the supported versions
1955                 * specified in CPUID.0x4000000A.EAX[0:15].
1956                 *
1957                 * To work around this Hyper-V bug, accept either a supported
1958                 * eVMCS version or the VMCS12 revision_id as valid values for
1959                 * the first u32 field of the eVMCS.
1960                 */
1961                if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) &&
1962                    (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) {
1963                        nested_release_evmcs(vcpu);
1964                        return 0;
1965                }
1966
1967                vmx->nested.dirty_vmcs12 = true;
1968                vmx->nested.hv_evmcs_vmptr = evmcs_gpa;
1969
1970                evmcs_gpa_changed = true;
1971                /*
1972                 * Unlike a normal vmcs12, an enlightened vmcs12 is not fully
1973                 * reloaded from the guest's memory (read-only fields, fields not
1974                 * present in struct hv_enlightened_vmcs, ...). Make sure there
1975                 * are no leftovers.
1976                 */
1977                if (from_launch) {
1978                        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1979                        memset(vmcs12, 0, sizeof(*vmcs12));
1980                        vmcs12->hdr.revision_id = VMCS12_REVISION;
1981                }
1982
1983        }
1984
1985        /*
1986         * Clean-fields data can't be used on VMLAUNCH, or when we switch
1987         * between different L2 guests, as KVM keeps a single vmcs12 per L1.
1988         */
1989        if (from_launch || evmcs_gpa_changed)
1990                vmx->nested.hv_evmcs->hv_clean_fields &=
1991                        ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
1992
1993        return 1;
1994}
1995
1996void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu)
1997{
1998        struct vcpu_vmx *vmx = to_vmx(vcpu);
1999
2000        /*
2001         * hv_evmcs may end up not being mapped after migration (when
2002         * L2 was running); map it here to make sure vmcs12 changes are
2003         * properly reflected.
2004         */
2005        if (vmx->nested.enlightened_vmcs_enabled && !vmx->nested.hv_evmcs)
2006                nested_vmx_handle_enlightened_vmptrld(vcpu, false);
2007
2008        if (vmx->nested.hv_evmcs) {
2009                copy_vmcs12_to_enlightened(vmx);
2010                /* All fields are clean */
2011                vmx->nested.hv_evmcs->hv_clean_fields |=
2012                        HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
2013        } else {
2014                copy_vmcs12_to_shadow(vmx);
2015        }
2016
2017        vmx->nested.need_vmcs12_to_shadow_sync = false;
2018}
2019
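/*
 * hrtimer callback for the emulated VMX-preemption timer: flag the timer as
 * expired and kick the vCPU so the pending expiry gets processed.
 */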
2020static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
2021{
2022        struct vcpu_vmx *vmx =
2023                container_of(timer, struct vcpu_vmx, nested.preemption_timer);
2024
2025        vmx->nested.preemption_timer_expired = true;
2026        kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
2027        kvm_vcpu_kick(&vmx->vcpu);
2028
2029        return HRTIMER_NORESTART;
2030}
2031
2032static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu)
2033{
2034        u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value;
2035        struct vcpu_vmx *vmx = to_vmx(vcpu);
2036
2037        /*
2038         * A timer value of zero is architecturally guaranteed to cause
2039         * a VMExit prior to executing any instructions in the guest.
2040         */
2041        if (preemption_timeout == 0) {
2042                vmx_preemption_timer_fn(&vmx->nested.preemption_timer);
2043                return;
2044        }
2045
2046        if (vcpu->arch.virtual_tsc_khz == 0)
2047                return;
2048
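        /*
         * The vmcs12 timer value is in units of
         * 2^VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE TSC cycles; convert it
         * to nanoseconds for the hrtimer:
         *   ns = (value << RATE) * 1000000 / virtual_tsc_khz
         */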
2049        preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
2050        preemption_timeout *= 1000000;
2051        do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz);
2052        hrtimer_start(&vmx->nested.preemption_timer,
2053                      ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL);
2054}
2055
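/*
 * Compute the EFER value L2 will run with: on an actual VM-entry where L1
 * requested VM_ENTRY_LOAD_IA32_EFER, use vmcs12's GUEST_IA32_EFER; otherwise
 * derive LMA/LME from the VM_ENTRY_IA32E_MODE control on top of the vCPU's
 * current EFER.
 */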
2056static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
2057{
2058        if (vmx->nested.nested_run_pending &&
2059            (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER))
2060                return vmcs12->guest_ia32_efer;
2061        else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
2062                return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME);
2063        else
2064                return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME);
2065}
2066
2067static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
2068{
2069        /*
2070         * If vmcs02 hasn't been initialized, set the constant vmcs02 state
2071         * according to L0's settings (vmcs12 is irrelevant here).  Host
2072         * fields that come from L0 and are not constant, e.g. HOST_CR3,
2073         * will be set as needed prior to VMLAUNCH/VMRESUME.
2074         */
2075        if (vmx->nested.vmcs02_initialized)
2076                return;
2077        vmx->nested.vmcs02_initialized = true;
2078
2079        /*
2080         * We don't care what the EPTP value is; we just need to guarantee
2081         * it's valid so that we don't get a false positive when doing early
2082         * consistency checks.
2083         */
2084        if (enable_ept && nested_early_check)
2085                vmcs_write64(EPT_POINTER, construct_eptp(&vmx->vcpu, 0));
2086
2087        /* All VMFUNCs are currently emulated through L0 vmexits.  */
2088        if (cpu_has_vmx_vmfunc())
2089                vmcs_write64(VM_FUNCTION_CONTROL, 0);
2090
2091        if (cpu_has_vmx_posted_intr())
2092                vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);
2093
2094        if (cpu_has_vmx_msr_bitmap())
2095                vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
2096
2097        /*
2098         * The PML address never changes, so it is constant in vmcs02.
2099         * Conceptually we want to copy the PML index from vmcs01 here,
2100         * and then back to vmcs01 on nested vmexit.  But since we flush
2101         * the log and reset GUEST_PML_INDEX on each vmexit, the PML
2102         * index is also effectively constant in vmcs02.
2103         */
2104        if (enable_pml) {
2105                vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
2106                vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
2107        }
2108
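        /* Make all ENCLS leaf functions executed by L2 exit to L0. */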
2109        if (cpu_has_vmx_encls_vmexit())
2110                vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
2111
2112        /*
2113         * Set the MSR load/store lists to match L0's settings.  Only the
2114         * addresses are constant (for vmcs02); the counts can change based
2115         * on L2's behavior, e.g. switching to/from long mode.
2116         */
2117        vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest.val));
2118        vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
2119        vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
2120
2121        vmx_set_constant_host_state(vmx);
2122}
2123
2124static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx,
2125                                      struct vmcs12 *vmcs12)
2126{
2127        prepare_vmcs02_constant_state(vmx);
2128
2129        vmcs_write64(VMCS_LINK_POINTER, -1ull);
2130
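        /*
         * If L1 enabled VPID for L2, use the dedicated vpid02 so that L2's
         * TLB entries are tagged separately from L1's; otherwise L2 runs
         * with L1's vpid.
         */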
2131        if (enable_vpid) {
2132                if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
2133                        vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
2134                else
2135                        vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
2136        }
2137}
2138
2139static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
2140{
2141        u32 exec_control, vmcs12_exec_ctrl;
2142        u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12);
2143
2144        if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs)
2145                prepare_vmcs02_early_rare(vmx, vmcs12);
2146
2147        /*
2148         * PIN CONTROLS
2149         */
2150        exec_control = vmx_pin_based_exec_ctrl(vmx);
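        /*
         * Strip PIN_BASED_VMX_PREEMPTION_TIMER from L1's settings; the
         * nested preemption timer is emulated via the hrtimer started in
         * vmx_start_preemption_timer() rather than the hardware timer.
         */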
2151        exec_control |= (vmcs12->pin_based_vm_exec_control &
2152                         ~PIN_BASED_VMX_PREEMPTION_TIMER);
2153
2154        /* Posted interrupts setting is only taken from vmcs12.  */
2155        if (nested_cpu_has_posted_intr(vmcs12)) {
2156                vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
2157                vmx->nested.pi_pending = false;
2158        } else {
2159                exec_control &= ~PIN_BASED_POSTED_INTR;
2160        }
2161        pin_controls_set(vmx, exec_control);
2162
2163        /*
2164         * EXEC CONTROLS
2165         */
2166        exec_control = vmx_exec_control(vmx); /* L0's desires */
2167        exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING;
2168        exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING;
2169        exec_control &= ~CPU_BASED_TPR_SHADOW;
2170        exec_control |= vmcs12->cpu_based_vm_exec_control;
2171
2172        vmx->nested.l1_tpr_threshold = -1;
2173        if (exec_control & CPU_BASED_TPR_SHADOW)
2174                vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
2175#ifdef CONFIG_X86_64
2176        else
2177                exec_control |= CPU_BASED_CR8_LOAD_EXITING |
2178                                CPU_BASED_CR8_STORE_EXITING;
2179#endif
2180
2181        /*
2182         * A vmexit (to either L1 hypervisor or L0 userspace) is always needed
2183         * for I/O port accesses.
2184         */
2185        exec_control |= CPU_BASED_UNCOND_IO_EXITING;
2186        exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
2187
2188        /*
2189         * This bit will be computed in nested_get_vmcs12_pages, because
2190         * we do not have access to L1's MSR bitmap yet.  For now, keep
2191         * the same bit as before, hoping to avoid multiple VMWRITEs that
2192         * only set/clear this bit.
2193         */
2194        exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
2195        exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS;
2196
2197        exec_controls_set(vmx, exec_control);
2198
2199        /*
2200         * SECONDARY EXEC CONTROLS
2201         */
2202        if (cpu_has_secondary_exec_ctrls()) {
2203                exec_control = vmx->secondary_exec_control;
2204
2205                /* Take the following fields only from vmcs12 */
2206                exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
2207                                  SECONDARY_EXEC_ENABLE_INVPCID |
2208                                  SECONDARY_EXEC_RDTSCP |
2209                                  SECONDARY_EXEC_XSAVES |
2210                                  SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
2211                                  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
2212                                  SECONDARY_EXEC_APIC_REGISTER_VIRT |
2213                                  SECONDARY_EXEC_ENABLE_VMFUNC);
2214                if (nested_cpu_has(vmcs12,
2215                                   CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) {
2216                        vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control &
2217                                ~SECONDARY_EXEC_ENABLE_PML;
2218                        exec_control |= vmcs12_exec_ctrl;
2219                }
2220
2221                /* VMCS shadowing for L2 is emulated for now */
2222                exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
2223
2224                /*
2225                 * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4()
2226                 * will not have to rewrite the controls just for this bit.
2227                 */
2228                if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated() &&
2229                    (vmcs12->guest_cr4 & X86_CR4_UMIP))
2230                        exec_control |= SECONDARY_EXEC_DESC;
2231
2232                if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
2233                        vmcs_write16(GUEST_INTR_STATUS,
2234                                vmcs12->guest_intr_status);
2235
2236                secondary_exec_controls_set(vmx, exec_control);
2237        }
2238
2239        /*
2240         * ENTRY CONTROLS
2241         *
2242         * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE
2243         * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate
2244         * on the related bits (if supported by the CPU) in the hope that
2245         * we can avoid VMWrites during vmx_set_efer().
2246         */
2247        exec_control = (vmcs12->vm_entry_controls | vmx_vmentry_ctrl()) &
2248                        ~VM_ENTRY_IA32E_MODE & ~VM_ENTRY_LOAD_IA32_EFER;
2249        if (cpu_has_load_ia32_efer()) {
2250                if (guest_efer & EFER_LMA)
2251                        exec_control |= VM_ENTRY_IA32E_MODE;
2252                if (guest_efer != host_efer)
2253                        exec_control |= VM_ENTRY_LOAD_IA32_EFER;
2254        }
2255        vm_entry_controls_set(vmx, exec_control);
2256
2257        /*
2258         * EXIT CONTROLS
2259         *
2260         * L2->L1 exit controls are emulated - the hardware exit is to L0 so
2261         * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
2262         * bits may be modified by vmx_set_efer() in prepare_vmcs02().
2263         */
2264        exec_control = vmx_vmexit_ctrl();
2265        if (cpu_has_load_ia32_efer() && guest_efer != host_efer)
2266                exec_control |= VM_EXIT_LOAD_IA32_EFER;
2267        vm_exit_controls_set(vmx, exec_control);
2268
2269        /*
2270         * Interrupt/Exception Fields
2271         */
2272        if (vmx->nested.nested_run_pending) {
2273                vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2274                             vmcs12->vm_entry_intr_info_field);
2275                vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
2276                             vmcs12->vm_entry_exception_error_code);
2277                vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
2278                             vmcs12->vm_entry_instruction_len);
2279                vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
2280                             vmcs12->guest_interruptibility_info);
2281                vmx->loaded_vmcs->nmi_known_unmasked =
2282                        !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI);
2283        } else {
2284                vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
2285        }
2286}
2287
2288static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
2289{
2290        struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;
2291
2292        if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
2293                           HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) {
2294                vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
2295                vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
2296                vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
2297                vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
2298                vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
2299                vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector);
2300                vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
2301                vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
2302                vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
2303                vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
2304                vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
2305                vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
2306                vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
2307                vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit);
2308                vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit);
2309                vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
2310                vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
2311                vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
2312                vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
2313                vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
2314                vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
2315                vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
2316                vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
2317                vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
2318                vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
2319                vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
2320                vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
2321                vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
2322                vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
2323                vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
2324                vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
2325                vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base);
2326                vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base);
2327                vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
2328                vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
2329                vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
2330        }
2331
2332        if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
2333                           HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) {
2334                vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
2335                vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
2336                            vmcs12->guest_pending_dbg_exceptions);
2337                vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
2338                vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
2339
2340                /*
2341                 * L1 may access L2's PDPTRs, so save them to construct
2342                 * vmcs12.
2343                 */
2344                if (enable_ept) {
2345                        vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
2346                        vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
2347                        vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
2348                        vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
2349                }
2350
2351                if (kvm_mpx_supported() && vmx->nested.nested_run_pending &&
2352                    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
2353                        vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
2354        }
2355
2356        if (nested_cpu_has_xsaves(vmcs12))
2357                vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);
2358
2359        /*
2360         * Whether page-faults are trapped is determined by a combination of
2361         * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.
2362         * If enable_ept, L0 doesn't care about page faults and we should
2363         * set all of these to L1's desires. However, if !enable_ept, L0 does
2364         * care about (at least some) page faults, and because it is not easy
2365         * (if at all possible?) to merge L0 and L1's desires, we simply ask
2366         * to exit on each and every L2 page fault. This is done by setting
2367         * MASK=MATCH=0 and (see below) EB.PF=1.
2368         * Note that below we don't need special code to set EB.PF beyond the
2369         * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
2370         * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
2371         * !enable_ept, EB.PF is 1, so the "or" will always be 1.
2372         */
2373        vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
2374                enable_ept ? vmcs12->page_fault_error_code_mask : 0);
2375        vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
2376                enable_ept ? vmcs12->page_fault_error_code_match : 0);
2377
2378        if (cpu_has_vmx_apicv()) {
2379                vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0);
2380                vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1);
2381                vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2);
2382                vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3);
2383        }
2384
2385        /*
2386         * Make sure the msr_autostore list is up to date before we set the
2387         * count in the vmcs02.
2388         */
2389        prepare_vmx_msr_autostore_list(&vmx->vcpu, MSR_IA32_TSC);
2390
2391        vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.guest.nr);
2392        vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
2393        vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
2394
2395        set_cr4_guest_host_mask(vmx);
2396}
2397
2398/*
2399 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
2400 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
2401 * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
2402 * guest in a way that is appropriate both to L1's requests and to our own
2403 * needs. In addition to modifying the active vmcs (which is vmcs02), this
2404 * function also has necessary side effects, such as setting various
2405 * vcpu->arch fields.
2406 * Returns 0 on success and -EINVAL on failure; on failure the invalid-state
2407 * exit qualification code is assigned to *entry_failure_code.
2408 */
2409static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
2410                          u32 *entry_failure_code)
2411{
2412        struct vcpu_vmx *vmx = to_vmx(vcpu);
2413        struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;
2414        bool load_guest_pdptrs_vmcs12 = false;
2415
2416        if (vmx->nested.dirty_vmcs12 || hv_evmcs) {
2417                prepare_vmcs02_rare(vmx, vmcs12);
2418                vmx->nested.dirty_vmcs12 = false;
2419
2420                load_guest_pdptrs_vmcs12 = !hv_evmcs ||
2421                        !(hv_evmcs->hv_clean_fields &
2422                          HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
2423        }
2424
2425        if (vmx->nested.nested_run_pending &&
2426            (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
2427                kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
2428                vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
2429        } else {
2430                kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
2431                vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl);
2432        }
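        /*
         * If L1 isn't loading BNDCFGS for L2 on this entry, keep L1's value
         * (saved from vmcs01) in vmcs02.
         */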
2433        if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
2434            !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
2435                vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
2436        vmx_set_rflags(vcpu, vmcs12->guest_rflags);
2437
2438        /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
2439         * bitwise-or of what L1 wants to trap for L2, and what we want to
2440         * trap. Note that CR0.TS also needs updating - we do this later.
2441         */
2442        update_exception_bitmap(vcpu);
2443        vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
2444        vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
2445
2446        if (vmx->nested.nested_run_pending &&
2447            (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) {
2448                vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
2449                vcpu->arch.pat = vmcs12->guest_ia32_pat;
2450        } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
2451                vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
2452        }
2453
2454        vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
2455
2456        if (kvm_has_tsc_control)
2457                decache_tsc_multiplier(vmx);
2458
2459        if (enable_vpid) {
2460                /*
2461                 * There is no direct mapping between vpid02 and vpid12: vpid02
2462                 * is per-vCPU in L0 and is reused, while a change of vpid12 is
2463                 * handled with a single INVVPID during nested vmentry. vpid12
2464                 * is allocated by L1 for L2, so it does not influence the
2465                 * global bitmap (used for vpid01 and vpid02 allocation) even if
2466                 * L1 spawns a lot of nested vCPUs.
2467                 */
2468                if (nested_cpu_has_vpid(vmcs12) && nested_has_guest_tlb_tag(vcpu)) {
2469                        if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
2470                                vmx->nested.last_vpid = vmcs12->virtual_processor_id;
2471                                __vmx_flush_tlb(vcpu, nested_get_vpid02(vcpu), false);
2472                        }
2473                } else {
2474                        /*
2475                         * If L1 uses EPT, then L0 needs to execute INVEPT on
2476                         * EPTP02 instead of EPTP01. Therefore, delay TLB
2477                         * flush until vmcs02->eptp is fully updated by
2478                         * KVM_REQ_LOAD_CR3. Note that this assumes
2479                         * KVM_REQ_TLB_FLUSH is evaluated after
2480                         * KVM_REQ_LOAD_CR3 in vcpu_enter_guest().
2481                         */
2482                        kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
2483                }
2484        }
2485
2486        if (nested_cpu_has_ept(vmcs12))
2487                nested_ept_init_mmu_context(vcpu);
2488
2489        /*
2490         * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying the
2491         * bits that we consider mandatorily enabled.
2492         * CR0_READ_SHADOW is what L2 should expect to read given the
2493         * specification by L1; it's not enough to take
2494         * vmcs12->cr0_read_shadow because our cr0_guest_host_mask may
2495         * have more bits set than L1 expected.
2496         */
2497        vmx_set_cr0(vcpu, vmcs12->guest_cr0);
2498        vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
2499
2500        vmx_set_cr4(vcpu, vmcs12->guest_cr4);
2501        vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
2502
2503        vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12);
2504        /* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
2505        vmx_set_efer(vcpu, vcpu->arch.efer);
2506
2507        /*
2508         * Guest state is invalid and unrestricted guest is disabled,
2509         * which means L1 attempted VMEntry to L2 with invalid state.
2510         * Fail the VMEntry.
2511         */
2512        if (vmx->emulation_required) {
2513                *entry_failure_code = ENTRY_FAIL_DEFAULT;
2514                return -EINVAL;
2515        }
2516
2517        /* Load vmcs12->guest_cr3, using either EPT or shadow page tables. */
2518        if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
2519                                entry_failure_code))
2520                return -EINVAL;
2521
2522        /*
2523         * Immediately write vmcs02.GUEST_CR3.  It will be propagated to vmcs12
2524         * on nested VM-Exit, which can occur without actually running L2 and
2525         * thus without hitting vmx_set_cr3(), e.g. if L1 is entering L2 with
2526         * vmcs12.GUEST_ACTIVITYSTATE=HLT, in which case KVM will intercept the
2527         * transition to HLT instead of running L2.
2528         */
2529        if (enable_ept)
2530                vmcs_writel(GUEST_CR3, vmcs12->guest_cr3);
2531
2532        /* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */
2533        if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) &&
2534            is_pae_paging(vcpu)) {
2535                vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
2536                vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
2537                vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
2538                vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
2539        }
2540
2541        if (!enable_ept)
2542                vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
2543
2544        if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
2545            WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
2546                                     vmcs12->guest_ia32_perf_global_ctrl)))
2547                return -EINVAL;
2548
2549        kvm_rsp_write(vcpu, vmcs12->guest_rsp);
2550        kvm_rip_write(vcpu, vmcs12->guest_rip);
2551        return 0;
2552}
2553
2554static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12)
2555{
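        /*
         * Per the SDM, "virtual NMIs" can be set only if "NMI exiting" is
         * set, and "NMI-window exiting" can be set only if "virtual NMIs"
         * is set.
         */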
2556        if (CC(!nested_cpu_has_nmi_exiting(vmcs12) &&
2557               nested_cpu_has_virtual_nmis(vmcs12)))
2558                return -EINVAL;
2559
2560        if (CC(!nested_cpu_has_virtual_nmis(vmcs12) &&
2561               nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING)))
2562                return -EINVAL;
2563
2564        return 0;
2565}
2566
2567static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address)
2568{
2569        struct vcpu_vmx *vmx = to_vmx(vcpu);
2570        int maxphyaddr = cpuid_maxphyaddr(vcpu);
2571
2572        /* Check for memory type validity */
2573        switch (address & VMX_EPTP_MT_MASK) {
2574        case VMX_EPTP_MT_UC:
2575                if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT)))
2576                        return false;
2577                break;
2578        case VMX_EPTP_MT_WB:
2579                if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT)))
2580                        return false;
2581                break;
2582        default:
2583                return false;
2584        }
2585
2586        /* Only a page-walk length of 4 is supported. */
2587        if (CC((address & VMX_EPTP_PWL_MASK) != VMX_EPTP_PWL_4))
2588                return false;
2589
2590        /* Reserved bits should not be set */
2591        if (CC(address >> maxphyaddr || ((address >> 7) & 0x1f)))
2592                return false;
2593
2594        /* AD, if set, should be supported */
2595        if (address & VMX_EPTP_AD_ENABLE_BIT) {
2596                if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT)))
2597                        return false;
2598        }
2599
2600        return true;
2601}
2602
2603/*
2604 * Checks related to VM-Execution Control Fields
2605 */
2606static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu,
2607                                              struct vmcs12 *vmcs12)
2608{
2609        struct vcpu_vmx *vmx = to_vmx(vcpu);
2610
2611        if (CC(!vmx_control_verify(vmcs12->pin_based_vm_exec_control,
2612                                   vmx->nested.msrs.pinbased_ctls_low,
2613                                   vmx->nested.msrs.pinbased_ctls_high)) ||
2614            CC(!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
2615                                   vmx->nested.msrs.procbased_ctls_low,
2616                                   vmx->nested.msrs.procbased_ctls_high)))
2617                return -EINVAL;
2618
2619        if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
2620            CC(!vmx_control_verify(vmcs12->secondary_vm_exec_control,
2621                                   vmx->nested.msrs.secondary_ctls_low,
2622                                   vmx->nested.msrs.secondary_ctls_high)))
2623                return -EINVAL;
2624
2625        if (CC(vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) ||
2626            nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) ||
2627            nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) ||
2628            nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) ||
2629            nested_vmx_check_apic_access_controls(vcpu, vmcs12) ||
2630            nested_vmx_check_apicv_controls(vcpu, vmcs12) ||
2631            nested_vmx_check_nmi_controls(vmcs12) ||
2632            nested_vmx_check_pml_controls(vcpu, vmcs12) ||
2633            nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) ||
2634            nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) ||
2635            nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) ||
2636            CC(nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id))
2637                return -EINVAL;
2638
2639        if (!nested_cpu_has_preemption_timer(vmcs12) &&
2640            nested_cpu_has_save_preemption_timer(vmcs12))
2641                return -EINVAL;
2642
2643        if (nested_cpu_has_ept(vmcs12) &&
2644            CC(!valid_ept_address(vcpu, vmcs12->ept_pointer)))
2645                return -EINVAL;
2646
2647        if (nested_cpu_has_vmfunc(vmcs12)) {
2648                if (CC(vmcs12->vm_function_control &
2649                       ~vmx->nested.msrs.vmfunc_controls))
2650                        return -EINVAL;
2651
2652                if (nested_cpu_has_eptp_switching(vmcs12)) {
2653                        if (CC(!nested_cpu_has_ept(vmcs12)) ||
2654                            CC(!page_address_valid(vcpu, vmcs12->eptp_list_address)))
2655                                return -EINVAL;
2656                }
2657        }
2658
2659        return 0;
2660}
2661
2662/*
2663 * Checks related to VM-Exit Control Fields
2664 */
2665static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu,
2666                                         struct vmcs12 *vmcs12)
2667{
2668        struct vcpu_vmx *vmx = to_vmx(vcpu);
2669
2670        if (CC(!vmx_control_verify(vmcs12->vm_exit_controls,
2671                                    vmx->nested.msrs.exit_ctls_low,
2672                                    vmx->nested.msrs.exit_ctls_high)) ||
2673            CC(nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12)))
2674                return -EINVAL;
2675
2676        return 0;
2677}
2678
2679/*
2680 * Checks related to VM-Entry Control Fields
2681 */
2682static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu,
2683                                          struct vmcs12 *vmcs12)
2684{
2685        struct vcpu_vmx *vmx = to_vmx(vcpu);
2686
2687        if (CC(!vmx_control_verify(vmcs12->vm_entry_controls,
2688                                    vmx->nested.msrs.entry_ctls_low,
2689                                    vmx->nested.msrs.entry_ctls_high)))
2690                return -EINVAL;
2691
2692        /*
2693         * From the Intel SDM, volume 3:
2694         * Fields relevant to VM-entry event injection must be set properly.
2695         * These fields are the VM-entry interruption-information field, the
2696         * VM-entry exception error code, and the VM-entry instruction length.
2697         */
2698        if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) {
2699                u32 intr_info = vmcs12->vm_entry_intr_info_field;
2700                u8 vector = intr_info & INTR_INFO_VECTOR_MASK;
2701                u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK;
2702                bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK;
2703                bool should_have_error_code;
2704                bool urg = nested_cpu_has2(vmcs12,
2705                                           SECONDARY_EXEC_UNRESTRICTED_GUEST);
2706                bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE;
2707
2708                /* VM-entry interruption-info field: interruption type */
2709                if (CC(intr_type == INTR_TYPE_RESERVED) ||
2710                    CC(intr_type == INTR_TYPE_OTHER_EVENT &&
2711                       !nested_cpu_supports_monitor_trap_flag(vcpu)))
2712                        return -EINVAL;
2713
2714                /* VM-entry interruption-info field: vector */
2715                if (CC(intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) ||
2716                    CC(intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) ||
2717                    CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0))
2718                        return -EINVAL;
2719
2720                /* VM-entry interruption-info field: deliver error code */
2721                should_have_error_code =
2722                        intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode &&
2723                        x86_exception_has_error_code(vector);
2724                if (CC(has_error_code != should_have_error_code))
2725                        return -EINVAL;
2726
2727                /* VM-entry exception error code */
2728                if (CC(has_error_code &&
2729                       vmcs12->vm_entry_exception_error_code & GENMASK(31, 16)))
2730                        return -EINVAL;
2731
2732                /* VM-entry interruption-info field: reserved bits */
2733                if (CC(intr_info & INTR_INFO_RESVD_BITS_MASK))
2734                        return -EINVAL;
2735
2736                /* VM-entry instruction length */
2737                switch (intr_type) {
2738                case INTR_TYPE_SOFT_EXCEPTION:
2739                case INTR_TYPE_SOFT_INTR:
2740                case INTR_TYPE_PRIV_SW_EXCEPTION:
2741                        if (CC(vmcs12->vm_entry_instruction_len > 15) ||
2742                            CC(vmcs12->vm_entry_instruction_len == 0 &&
2743                            CC(!nested_cpu_has_zero_length_injection(vcpu))))
2744                                return -EINVAL;
2745                }
2746        }
2747
2748        if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12))
2749                return -EINVAL;
2750
2751        return 0;
2752}
2753
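    /*
     * Top-level consistency check of vmcs12's VM-execution, VM-exit and
     * VM-entry control fields.  A failure here causes VMLAUNCH/VMRESUME to
     * fail with VMXERR_ENTRY_INVALID_CONTROL_FIELD (see nested_vmx_run()).
     */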
2754static int nested_vmx_check_controls(struct kvm_vcpu *vcpu,
2755                                     struct vmcs12 *vmcs12)
2756{
2757        if (nested_check_vm_execution_controls(vcpu, vmcs12) ||
2758            nested_check_vm_exit_controls(vcpu, vmcs12) ||
2759            nested_check_vm_entry_controls(vcpu, vmcs12))
2760                return -EINVAL;
2761
2762        if (to_vmx(vcpu)->nested.enlightened_vmcs_enabled)
2763                return nested_evmcs_check_controls(vmcs12);
2764
2765        return 0;
2766}
2767
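    /*
     * Validate the vmcs12 host-state fields that would be loaded on a
     * VM-exit from L2 to L1: control registers, SYSENTER MSRs, PAT and
     * PERF_GLOBAL_CTRL (if loaded), segment selectors and bases, and, if
     * requested, IA32_EFER consistency with the host address-space size.
     */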
2768static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
2769                                       struct vmcs12 *vmcs12)
2770{
2771        bool ia32e;
2772
2773        if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) ||
2774            CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) ||
2775            CC(!nested_cr3_valid(vcpu, vmcs12->host_cr3)))
2776                return -EINVAL;
2777
2778        if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) ||
2779            CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu)))
2780                return -EINVAL;
2781
2782        if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) &&
2783            CC(!kvm_pat_valid(vmcs12->host_ia32_pat)))
2784                return -EINVAL;
2785
2786        if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) &&
2787            CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu),
2788                                           vmcs12->host_ia32_perf_global_ctrl)))
2789                return -EINVAL;
2790
2791#ifdef CONFIG_X86_64
2792        ia32e = !!(vcpu->arch.efer & EFER_LMA);
2793#else
2794        ia32e = false;
2795#endif
2796
2797        if (ia32e) {
2798                if (CC(!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)) ||
2799                    CC(!(vmcs12->host_cr4 & X86_CR4_PAE)))
2800                        return -EINVAL;
2801        } else {
2802                if (CC(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) ||
2803                    CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) ||
2804                    CC(vmcs12->host_cr4 & X86_CR4_PCIDE) ||
2805                    CC((vmcs12->host_rip) >> 32))
2806                        return -EINVAL;
2807        }
2808
2809        if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2810            CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2811            CC(vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2812            CC(vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2813            CC(vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2814            CC(vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2815            CC(vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2816            CC(vmcs12->host_cs_selector == 0) ||
2817            CC(vmcs12->host_tr_selector == 0) ||
2818            CC(vmcs12->host_ss_selector == 0 && !ia32e))
2819                return -EINVAL;
2820
2821        if (CC(is_noncanonical_address(vmcs12->host_fs_base, vcpu)) ||
2822            CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) ||
2823            CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) ||
2824            CC(is_noncanonical_address(vmcs12->host_idtr_base, vcpu)) ||
2825            CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu)) ||
2826            CC(is_noncanonical_address(vmcs12->host_rip, vcpu)))
2827                return -EINVAL;
2828
2829        /*
2830         * If the load IA32_EFER VM-exit control is 1, bits reserved in the
2831         * IA32_EFER MSR must be 0 in the field for that register. In addition,
2832         * the values of the LMA and LME bits in the field must each be that of
2833         * the host address-space size VM-exit control.
2834         */
2835        if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) {
2836                if (CC(!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer)) ||
2837                    CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA)) ||
2838                    CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)))
2839                        return -EINVAL;
2840        }
2841
2842        return 0;
2843}
2844
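    /*
     * Validate vmcs12's VMCS link pointer: unless it is all ones, it must
     * point at a valid page whose revision ID matches VMCS12_REVISION and
     * whose shadow-VMCS indicator matches the "VMCS shadowing" control.
     */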
2845static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu,
2846                                          struct vmcs12 *vmcs12)
2847{
2848        int r = 0;
2849        struct vmcs12 *shadow;
2850        struct kvm_host_map map;
2851
2852        if (vmcs12->vmcs_link_pointer == -1ull)
2853                return 0;
2854
2855        if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer)))
2856                return -EINVAL;
2857
2858        if (CC(kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map)))
2859                return -EINVAL;
2860
2861        shadow = map.hva;
2862
2863        if (CC(shadow->hdr.revision_id != VMCS12_REVISION) ||
2864            CC(shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12)))
2865                r = -EINVAL;
2866
2867        kvm_vcpu_unmap(vcpu, &map, false);
2868        return r;
2869}
2870
2871/*
2872 * Checks related to Guest Non-register State
2873 */
2874static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12)
2875{
2876        if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
2877               vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT))
2878                return -EINVAL;
2879
2880        return 0;
2881}
2882
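    /*
     * Software checks on the vmcs12 guest-state area performed before the
     * nested VM-entry; a failure here triggers a "VM-entry failure" VM-exit
     * to L1 with the exit qualification returned via @exit_qual.
     */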
2883static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
2884                                        struct vmcs12 *vmcs12,
2885                                        u32 *exit_qual)
2886{
2887        bool ia32e;
2888
2889        *exit_qual = ENTRY_FAIL_DEFAULT;
2890
2891        if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) ||
2892            CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)))
2893                return -EINVAL;
2894
2895        if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) &&
2896            CC(!kvm_dr7_valid(vmcs12->guest_dr7)))
2897                return -EINVAL;
2898
2899        if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) &&
2900            CC(!kvm_pat_valid(vmcs12->guest_ia32_pat)))
2901                return -EINVAL;
2902
2903        if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) {
2904                *exit_qual = ENTRY_FAIL_VMCS_LINK_PTR;
2905                return -EINVAL;
2906        }
2907
2908        if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
2909            CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu),
2910                                           vmcs12->guest_ia32_perf_global_ctrl)))
2911                return -EINVAL;
2912
2913        /*
2914         * If the load IA32_EFER VM-entry control is 1, the following checks
2915         * are performed on the field for the IA32_EFER MSR:
2916         * - Bits reserved in the IA32_EFER MSR must be 0.
2917         * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of
2918         *   the IA-32e mode guest VM-exit control. It must also be identical
2919         *   to bit 8 (LME) if bit 31 in the CR0 field (corresponding to
2920         *   CR0.PG) is 1.
2921         */
2922        if (to_vmx(vcpu)->nested.nested_run_pending &&
2923            (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) {
2924                ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0;
2925                if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) ||
2926                    CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) ||
2927                    CC(((vmcs12->guest_cr0 & X86_CR0_PG) &&
2928                     ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))))
2929                        return -EINVAL;
2930        }
2931
2932        if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) &&
2933            (CC(is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) ||
2934             CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD))))
2935                return -EINVAL;
2936
2937        if (nested_check_guest_non_reg_state(vmcs12))
2938                return -EINVAL;
2939
2940        return 0;
2941}
2942
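    /*
     * Optional "early" consistency check: do a throwaway VMLAUNCH/VMRESUME of
     * vmcs02 with an architecturally invalid GUEST_RFLAGS so that hardware
     * vets the control and host-state fields before KVM commits to the
     * nested VM-entry.  Gated by the nested_early_check module parameter.
     */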
2943static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
2944{
2945        struct vcpu_vmx *vmx = to_vmx(vcpu);
2946        unsigned long cr3, cr4;
2947        bool vm_fail;
2948
2949        if (!nested_early_check)
2950                return 0;
2951
2952        if (vmx->msr_autoload.host.nr)
2953                vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
2954        if (vmx->msr_autoload.guest.nr)
2955                vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
2956
2957        preempt_disable();
2958
2959        vmx_prepare_switch_to_guest(vcpu);
2960
2961        /*
2962         * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS,
2963         * which is reserved to '1' by hardware.  GUEST_RFLAGS is guaranteed to
2964         * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e.
2965         * there is no need to preserve other bits or save/restore the field.
2966         */
2967        vmcs_writel(GUEST_RFLAGS, 0);
2968
2969        cr3 = __get_current_cr3_fast();
2970        if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
2971                vmcs_writel(HOST_CR3, cr3);
2972                vmx->loaded_vmcs->host_state.cr3 = cr3;
2973        }
2974
2975        cr4 = cr4_read_shadow();
2976        if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
2977                vmcs_writel(HOST_CR4, cr4);
2978                vmx->loaded_vmcs->host_state.cr4 = cr4;
2979        }
2980
2981        asm(
2982                "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */
2983                "cmp %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t"
2984                "je 1f \n\t"
2985                __ex("vmwrite %%" _ASM_SP ", %[HOST_RSP]") "\n\t"
2986                "mov %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t"
2987                "1: \n\t"
2988                "add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */
2989
2990                /* Check if vmlaunch or vmresume is needed */
2991                "cmpb $0, %c[launched](%[loaded_vmcs])\n\t"
2992
2993                /*
2994                 * VMLAUNCH and VMRESUME clear RFLAGS.{CF,ZF} on VM-Exit, set
2995                 * RFLAGS.CF on VM-Fail Invalid and set RFLAGS.ZF on VM-Fail
2996                 * Valid.  vmx_vmenter() directly "returns" RFLAGS, and so the
2997                 * result of VM-Enter is captured via CC_{SET,OUT} to vm_fail.
2998                 */
2999                "call vmx_vmenter\n\t"
3000
3001                CC_SET(be)
3002              : ASM_CALL_CONSTRAINT, CC_OUT(be) (vm_fail)
3003              : [HOST_RSP]"r"((unsigned long)HOST_RSP),
3004                [loaded_vmcs]"r"(vmx->loaded_vmcs),
3005                [launched]"i"(offsetof(struct loaded_vmcs, launched)),
3006                [host_state_rsp]"i"(offsetof(struct loaded_vmcs, host_state.rsp)),
3007                [wordsize]"i"(sizeof(ulong))
3008              : "memory"
3009        );
3010
3011        if (vmx->msr_autoload.host.nr)
3012                vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
3013        if (vmx->msr_autoload.guest.nr)
3014                vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
3015
3016        if (vm_fail) {
3017                u32 error = vmcs_read32(VM_INSTRUCTION_ERROR);
3018
3019                preempt_enable();
3020
3021                trace_kvm_nested_vmenter_failed(
3022                        "early hardware check VM-instruction error: ", error);
3023                WARN_ON_ONCE(error != VMXERR_ENTRY_INVALID_CONTROL_FIELD);
3024                return 1;
3025        }
3026
3027        /*
3028         * VMExit clears RFLAGS.IF and DR7, even on a consistency check.
3029         */
3030        local_irq_enable();
3031        if (hw_breakpoint_active())
3032                set_debugreg(__this_cpu_read(cpu_dr7), 7);
3033        preempt_enable();
3034
3035        /*
3036         * A non-failing VMEntry means we somehow entered guest mode with
3037         * an illegal RIP, and that's just the tip of the iceberg.  There
3038         * is no telling what memory has been modified or what state has
3039         * been exposed to unknown code.  Hitting this all but guarantees
3040         * a (very critical) hardware issue.
3041         */
3042        WARN_ON(!(vmcs_read32(VM_EXIT_REASON) &
3043                VMX_EXIT_REASONS_FAILED_VMENTRY));
3044
3045        return 0;
3046}
3047
3048static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
3049{
3050        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3051        struct vcpu_vmx *vmx = to_vmx(vcpu);
3052        struct kvm_host_map *map;
3053        struct page *page;
3054        u64 hpa;
3055
3056        if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
3057                /*
3058                 * Translate L1 physical address to host physical
3059                 * address for vmcs02. Keep the page pinned, so this
3060                 * physical address remains valid. We keep a reference
3061                 * to it so we can release it later.
3062                 */
3063                if (vmx->nested.apic_access_page) { /* shouldn't happen */
3064                        kvm_release_page_clean(vmx->nested.apic_access_page);
3065                        vmx->nested.apic_access_page = NULL;
3066                }
3067                page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr);
3068                if (!is_error_page(page)) {
3069                        vmx->nested.apic_access_page = page;
3070                        hpa = page_to_phys(vmx->nested.apic_access_page);
3071                        vmcs_write64(APIC_ACCESS_ADDR, hpa);
3072                } else {
3073                        pr_debug_ratelimited("%s: no backing 'struct page' for APIC-access address in vmcs12\n",
3074                                             __func__);
3075                        vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3076                        vcpu->run->internal.suberror =
3077                                KVM_INTERNAL_ERROR_EMULATION;
3078                        vcpu->run->internal.ndata = 0;
3079                        return false;
3080                }
3081        }
3082
3083        if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
3084                map = &vmx->nested.virtual_apic_map;
3085
3086                if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) {
3087                        vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn));
3088                } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) &&
3089                           nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) &&
3090                           !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
3091                        /*
3092                         * The processor will never use the TPR shadow, simply
3093                         * clear the bit from the execution control.  Such a
3094                         * configuration is useless, but it happens in tests.
3095                         * For any other configuration, failing the vm entry is
3096                         * _not_ what the processor does but it's basically the
3097                         * only possibility we have.
3098                         */
3099                        exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW);
3100                } else {
3101                        /*
3102                         * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to
3103                         * force VM-Entry to fail.
3104                         */
3105                        vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull);
3106                }
3107        }
3108
3109        if (nested_cpu_has_posted_intr(vmcs12)) {
3110                map = &vmx->nested.pi_desc_map;
3111
3112                if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) {
3113                        vmx->nested.pi_desc =
3114                                (struct pi_desc *)(((void *)map->hva) +
3115                                offset_in_page(vmcs12->posted_intr_desc_addr));
3116                        vmcs_write64(POSTED_INTR_DESC_ADDR,
3117                                     pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr));
3118                }
3119        }
3120        if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12))
3121                exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
3122        else
3123                exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
3124        return true;
3125}
3126
3127/*
3128 * Intel's VMX Instruction Reference specifies a common set of prerequisites
3129 * for running VMX instructions (except VMXON, whose prerequisites are
3130 * slightly different). It also specifies what exception to inject otherwise.
3131 * Note that many of these exceptions have priority over VM exits, so they
3132 * don't have to be checked again here.
3133 */
3134static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
3135{
3136        if (!to_vmx(vcpu)->nested.vmxon) {
3137                kvm_queue_exception(vcpu, UD_VECTOR);
3138                return 0;
3139        }
3140
3141        if (vmx_get_cpl(vcpu)) {
3142                kvm_inject_gp(vcpu, 0);
3143                return 0;
3144        }
3145
3146        return 1;
3147}
3148
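    /*
     * Returns true if APICv has a deliverable virtual interrupt pending,
     * i.e. the priority class of RVI exceeds that of the virtual PPR.
     */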
3149static bool vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu)
3150{
3151        u8 rvi = vmx_get_rvi();
3152        u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI);
3153
3154        return ((rvi & 0xf0) > (vppr & 0xf0));
3155}
3156
3157static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
3158                                   struct vmcs12 *vmcs12);
3159
3160/*
3161 * If from_vmentry is false, this is being called from state restore (either RSM
3162 * or KVM_SET_NESTED_STATE).  Otherwise it's called from vmlaunch/vmresume.
3163 *
3164 * Returns:
3165 *      NVMX_VMENTRY_SUCCESS: Entered VMX non-root mode
3166 *      NVMX_VMENTRY_VMFAIL:  Consistency check VMFail
3167 *      NVMX_VMENTRY_VMEXIT:  Consistency check VMExit
3168 *      NVMX_VMENTRY_KVM_INTERNAL_ERROR: KVM internal error
3169 */
3170enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
3171                                                        bool from_vmentry)
3172{
3173        struct vcpu_vmx *vmx = to_vmx(vcpu);
3174        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3175        bool evaluate_pending_interrupts;
3176        u32 exit_reason = EXIT_REASON_INVALID_STATE;
3177        u32 exit_qual;
3178
3179        evaluate_pending_interrupts = exec_controls_get(vmx) &
3180                (CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING);
3181        if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
3182                evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu);
3183
3184        if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
3185                vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
3186        if (kvm_mpx_supported() &&
3187                !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
3188                vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
3189
3190        /*
3191         * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and*
3192         * nested early checks are disabled.  In the event of a "late" VM-Fail,
3193         * i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its
3194         * software model to the pre-VMEntry host state.  When EPT is disabled,
3195         * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes
3196         * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3.  Stuffing
3197         * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to
3198         * the correct value.  Smashing vmcs01.GUEST_CR3 is safe because nested
3199         * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is
3200         * guaranteed to be overwritten with a shadow CR3 prior to re-entering
3201         * L1.  Don't stuff vmcs01.GUEST_CR3 when using nested early checks as
3202         * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks
3203         * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail
3204         * path would need to manually save/restore vmcs01.GUEST_CR3.
3205         */
3206        if (!enable_ept && !nested_early_check)
3207                vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
3208
3209        vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
3210
3211        prepare_vmcs02_early(vmx, vmcs12);
3212
3213        if (from_vmentry) {
3214                if (unlikely(!nested_get_vmcs12_pages(vcpu)))
3215                        return NVMX_VMENTRY_KVM_INTERNAL_ERROR;
3216
3217                if (nested_vmx_check_vmentry_hw(vcpu)) {
3218                        vmx_switch_vmcs(vcpu, &vmx->vmcs01);
3219                        return NVMX_VMENTRY_VMFAIL;
3220                }
3221
3222                if (nested_vmx_check_guest_state(vcpu, vmcs12, &exit_qual))
3223                        goto vmentry_fail_vmexit;
3224        }
3225
3226        enter_guest_mode(vcpu);
3227        if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING)
3228                vcpu->arch.tsc_offset += vmcs12->tsc_offset;
3229
3230        if (prepare_vmcs02(vcpu, vmcs12, &exit_qual))
3231                goto vmentry_fail_vmexit_guest_mode;
3232
3233        if (from_vmentry) {
3234                exit_reason = EXIT_REASON_MSR_LOAD_FAIL;
3235                exit_qual = nested_vmx_load_msr(vcpu,
3236                                                vmcs12->vm_entry_msr_load_addr,
3237                                                vmcs12->vm_entry_msr_load_count);
3238                if (exit_qual)
3239                        goto vmentry_fail_vmexit_guest_mode;
3240        } else {
3241                /*
3242                 * The MMU is not initialized to point at the right entities yet and
3243                 * "get pages" would need to read data from the guest (i.e. we will
3244                 * need to perform gpa to hpa translation). Request a call
3245                 * to nested_get_vmcs12_pages before the next VM-entry.  The MSRs
3246                 * have already been set at vmentry time and should not be reset.
3247                 */
3248                kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu);
3249        }
3250
3251        /*
3252         * If L1 had a pending IRQ/NMI until it executed
3253         * VMLAUNCH/VMRESUME which wasn't delivered because it was
3254         * disallowed (e.g. interrupts disabled), L0 needs to
3255         * evaluate whether this pending event should cause an exit from
3256         * L2 to L1 or be delivered directly to L2 (e.g. in case L1 doesn't
3257         * intercept EXTERNAL_INTERRUPT).
3258         *
3259         * Usually this would be handled by the processor noticing an
3260         * IRQ/NMI window request, or checking RVI during evaluation of
3261         * pending virtual interrupts.  However, this setting was done
3262         * on VMCS01 and now VMCS02 is active instead. Thus, we force L0
3263         * to perform pending event evaluation by requesting a KVM_REQ_EVENT.
3264         */
3265        if (unlikely(evaluate_pending_interrupts))
3266                kvm_make_request(KVM_REQ_EVENT, vcpu);
3267
3268        /*
3269         * Do not start the preemption timer hrtimer until after we know
3270         * we are successful, so that only nested_vmx_vmexit needs to cancel
3271         * the timer.
3272         */
3273        vmx->nested.preemption_timer_expired = false;
3274        if (nested_cpu_has_preemption_timer(vmcs12))
3275                vmx_start_preemption_timer(vcpu);
3276
3277        /*
3278         * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
3279         * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
3280         * returned as far as L1 is concerned. It will only return (and set
3281         * the success flag) when L2 exits (see nested_vmx_vmexit()).
3282         */
3283        return NVMX_VMENTRY_SUCCESS;
3284
3285        /*
3286         * A failed consistency check that leads to a VMExit during L1's
3287         * VMEnter to L2 is a variation of a normal VMexit, as explained in
3288         * 26.7 "VM-entry failures during or after loading guest state".
3289         */
3290vmentry_fail_vmexit_guest_mode:
3291        if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING)
3292                vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
3293        leave_guest_mode(vcpu);
3294
3295vmentry_fail_vmexit:
3296        vmx_switch_vmcs(vcpu, &vmx->vmcs01);
3297
3298        if (!from_vmentry)
3299                return NVMX_VMENTRY_VMEXIT;
3300
3301        load_vmcs12_host_state(vcpu, vmcs12);
3302        vmcs12->vm_exit_reason = exit_reason | VMX_EXIT_REASONS_FAILED_VMENTRY;
3303        vmcs12->exit_qualification = exit_qual;
3304        if (enable_shadow_vmcs || vmx->nested.hv_evmcs)
3305                vmx->nested.need_vmcs12_to_shadow_sync = true;
3306        return NVMX_VMENTRY_VMEXIT;
3307}
3308
3309/*
3310 * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
3311 * for running an L2 nested guest.
3312 */
3313static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
3314{
3315        struct vmcs12 *vmcs12;
3316        enum nvmx_vmentry_status status;
3317        struct vcpu_vmx *vmx = to_vmx(vcpu);
3318        u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu);
3319
3320        if (!nested_vmx_check_permission(vcpu))
3321                return 1;
3322
3323        if (!nested_vmx_handle_enlightened_vmptrld(vcpu, launch))
3324                return 1;
3325
3326        if (!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull)
3327                return nested_vmx_failInvalid(vcpu);
3328
3329        vmcs12 = get_vmcs12(vcpu);
3330
3331        /*
3332         * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact
3333         * that there *is* a valid VMCS pointer, RFLAGS.CF is set
3334         * rather than RFLAGS.ZF, and no error number is stored to the
3335         * VM-instruction error field.
3336         */
3337        if (vmcs12->hdr.shadow_vmcs)
3338                return nested_vmx_failInvalid(vcpu);
3339
3340        if (vmx->nested.hv_evmcs) {
3341                copy_enlightened_to_vmcs12(vmx);
3342                /* Enlightened VMCS doesn't have launch state */
3343                vmcs12->launch_state = !launch;
3344        } else if (enable_shadow_vmcs) {
3345                copy_shadow_to_vmcs12(vmx);
3346        }
3347
3348        /*
3349         * The nested entry process starts with enforcing various prerequisites
3350         * on vmcs12 as required by the Intel SDM, and acting appropriately
3351         * when they fail: as the SDM explains, some conditions should cause the
3352         * instruction to fail, while others will cause the instruction to seem
3353         * to succeed, but return an EXIT_REASON_INVALID_STATE.
3354         * To speed up the normal (success) code path, we should avoid checking
3355         * for misconfigurations which will anyway be caught by the processor
3356         * when using the merged vmcs02.
3357         */
3358        if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS)
3359                return nested_vmx_failValid(vcpu,
3360                        VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);
3361
3362        if (vmcs12->launch_state == launch)
3363                return nested_vmx_failValid(vcpu,
3364                        launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
3365                               : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
3366
3367        if (nested_vmx_check_controls(vcpu, vmcs12))
3368                return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
3369
3370        if (nested_vmx_check_host_state(vcpu, vmcs12))
3371                return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
3372
3373        /*
3374         * We're finally done with prerequisite checking, and can start with
3375         * the nested entry.
3376         */
3377        vmx->nested.nested_run_pending = 1;
3378        status = nested_vmx_enter_non_root_mode(vcpu, true);
3379        if (unlikely(status != NVMX_VMENTRY_SUCCESS))
3380                goto vmentry_failed;
3381
3382        /* Hide L1D cache contents from the nested guest.  */
3383        vmx->vcpu.arch.l1tf_flush_l1d = true;
3384
3385        /*
3386         * Must happen outside of nested_vmx_enter_non_root_mode() as it will
3387         * also be used as part of restoring nVMX state for
3388         * snapshot restore (migration).
3389         *
3390         * In this flow, it is assumed that the vmcs12 cache was
3391         * transferred as part of the captured nVMX state and should
3392         * therefore not be read from guest memory (which may not
3393         * exist on the destination host yet).
3394         */
3395        nested_cache_shadow_vmcs12(vcpu, vmcs12);
3396
3397        /*
3398         * If we're entering a halted L2 vcpu and the L2 vcpu won't be
3399         * awakened by event injection or by an NMI-window VM-exit or
3400         * by an interrupt-window VM-exit, halt the vcpu.
3401         */
3402        if ((vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) &&
3403            !(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) &&
3404            !(vmcs12->cpu_based_vm_exec_control & CPU_BASED_NMI_WINDOW_EXITING) &&
3405            !((vmcs12->cpu_based_vm_exec_control & CPU_BASED_INTR_WINDOW_EXITING) &&
3406              (vmcs12->guest_rflags & X86_EFLAGS_IF))) {
3407                vmx->nested.nested_run_pending = 0;
3408                return kvm_vcpu_halt(vcpu);
3409        }
3410        return 1;
3411
3412vmentry_failed:
3413        vmx->nested.nested_run_pending = 0;
3414        if (status == NVMX_VMENTRY_KVM_INTERNAL_ERROR)
3415                return 0;
3416        if (status == NVMX_VMENTRY_VMEXIT)
3417                return 1;
3418        WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL);
3419        return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
3420}
3421
3422/*
3423 * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date
3424 * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK).
3425 * This function returns the new value we should put in vmcs12.guest_cr0.
3426 * It's not enough to just return the vmcs02 GUEST_CR0. Rather,
3427 *  1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now
3428 *     available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0
3429 *     didn't trap the bit, because if L1 did, so would L0).
3430 *  2. Bits that L1 asked to trap (and therefore L0 also did) could not have
3431 *     been modified by L2, and L1 knows it. So just leave the old value of
3432 *     the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0
3433 *     isn't relevant, because if L0 traps this bit it can set it to anything.
3434 *  3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have
3435 *     changed these bits, and therefore they need to be updated, but L0
3436 *     didn't necessarily allow them to be changed in GUEST_CR0 - and rather
3437 *     put them in vmcs02 CR0_READ_SHADOW. So take these bits from there.
3438 */
3439static inline unsigned long
3440vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
3441{
3442        return
3443        /*1*/   (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) |
3444        /*2*/   (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) |
3445        /*3*/   (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask |
3446                        vcpu->arch.cr0_guest_owned_bits));
3447}
3448
3449static inline unsigned long
3450vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
3451{
3452        return
3453        /*1*/   (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) |
3454        /*2*/   (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) |
3455        /*3*/   (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask |
3456                        vcpu->arch.cr4_guest_owned_bits));
3457}
3458
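    /*
     * Record an event (exception, NMI or interrupt) that was injected into L2
     * but not delivered into vmcs12's IDT-vectoring information fields, so
     * that L1 can re-inject it after the nested VM-exit.
     */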
3459static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
3460                                      struct vmcs12 *vmcs12)
3461{
3462        u32 idt_vectoring;
3463        unsigned int nr;
3464
3465        if (vcpu->arch.exception.injected) {
3466                nr = vcpu->arch.exception.nr;
3467                idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
3468
3469                if (kvm_exception_is_soft(nr)) {
3470                        vmcs12->vm_exit_instruction_len =
3471                                vcpu->arch.event_exit_inst_len;
3472                        idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION;
3473                } else
3474                        idt_vectoring |= INTR_TYPE_HARD_EXCEPTION;
3475
3476                if (vcpu->arch.exception.has_error_code) {
3477                        idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK;
3478                        vmcs12->idt_vectoring_error_code =
3479                                vcpu->arch.exception.error_code;
3480                }
3481
3482                vmcs12->idt_vectoring_info_field = idt_vectoring;
3483        } else if (vcpu->arch.nmi_injected) {
3484                vmcs12->idt_vectoring_info_field =
3485                        INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR;
3486        } else if (vcpu->arch.interrupt.injected) {
3487                nr = vcpu->arch.interrupt.nr;
3488                idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
3489
3490                if (vcpu->arch.interrupt.soft) {
3491                        idt_vectoring |= INTR_TYPE_SOFT_INTR;
3492                        vmcs12->vm_entry_instruction_len =
3493                                vcpu->arch.event_exit_inst_len;
3494                } else
3495                        idt_vectoring |= INTR_TYPE_EXT_INTR;
3496
3497                vmcs12->idt_vectoring_info_field = idt_vectoring;
3498        }
3499}
3500
3501
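    /*
     * Mark the guest pages that the CPU may have written while running L2
     * (the virtual-APIC page and the posted-interrupt descriptor) dirty so
     * that dirty logging, e.g. for live migration, sees the updates.
     */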
3502static void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
3503{
3504        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3505        gfn_t gfn;
3506
3507        /*
3508         * Don't need to mark the APIC access page dirty; it is never
3509         * written to by the CPU during APIC virtualization.
3510         */
3511
3512        if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
3513                gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT;
3514                kvm_vcpu_mark_page_dirty(vcpu, gfn);
3515        }
3516
3517        if (nested_cpu_has_posted_intr(vmcs12)) {
3518                gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT;
3519                kvm_vcpu_mark_page_dirty(vcpu, gfn);
3520        }
3521}
3522
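    /*
     * Emulate posted-interrupt processing for L2: if the posted-interrupt
     * descriptor's outstanding-notification bit is set, merge its PIR into
     * the L2 virtual-APIC page and raise RVI in GUEST_INTR_STATUS as needed.
     */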
3523static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
3524{
3525        struct vcpu_vmx *vmx = to_vmx(vcpu);
3526        int max_irr;
3527        void *vapic_page;
3528        u16 status;
3529
3530        if (!vmx->nested.pi_desc || !vmx->nested.pi_pending)
3531                return;
3532
3533        vmx->nested.pi_pending = false;
3534        if (!pi_test_and_clear_on(vmx->nested.pi_desc))
3535                return;
3536
3537        max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
3538        if (max_irr != 256) {
3539                vapic_page = vmx->nested.virtual_apic_map.hva;
3540                if (!vapic_page)
3541                        return;
3542
3543                __kvm_apic_update_irr(vmx->nested.pi_desc->pir,
3544                        vapic_page, &max_irr);
3545                status = vmcs_read16(GUEST_INTR_STATUS);
3546                if ((u8)max_irr > ((u8)status & 0xff)) {
3547                        status &= ~0xff;
3548                        status |= (u8)max_irr;
3549                        vmcs_write16(GUEST_INTR_STATUS, status);
3550                }
3551        }
3552
3553        nested_mark_vmcs12_pages_dirty(vcpu);
3554}
3555
3556static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu,
3557                                               unsigned long exit_qual)
3558{
3559        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3560        unsigned int nr = vcpu->arch.exception.nr;
3561        u32 intr_info = nr | INTR_INFO_VALID_MASK;
3562
3563        if (vcpu->arch.exception.has_error_code) {
3564                vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code;
3565                intr_info |= INTR_INFO_DELIVER_CODE_MASK;
3566        }
3567
3568        if (kvm_exception_is_soft(nr))
3569                intr_info |= INTR_TYPE_SOFT_EXCEPTION;
3570        else
3571                intr_info |= INTR_TYPE_HARD_EXCEPTION;
3572
3573        if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) &&
3574            vmx_get_nmi_mask(vcpu))
3575                intr_info |= INTR_INFO_UNBLOCK_NMI;
3576
3577        nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual);
3578}
3579
3580/*
3581 * Returns true if a debug trap is pending delivery.
3582 *
3583 * In KVM, debug traps bear an exception payload. As such, the class of a #DB
3584 * exception may be inferred from the presence of an exception payload.
3585 */
3586static inline bool vmx_pending_dbg_trap(struct kvm_vcpu *vcpu)
3587{
3588        return vcpu->arch.exception.pending &&
3589                        vcpu->arch.exception.nr == DB_VECTOR &&
3590                        vcpu->arch.exception.payload;
3591}
3592
3593/*
3594 * Certain VM-exits set the 'pending debug exceptions' field to indicate a
3595 * recognized #DB (data or single-step) that has yet to be delivered. Since KVM
3596 * represents these debug traps with a payload that is said to be compatible
3597 * with the 'pending debug exceptions' field, write the payload to the VMCS
3598 * field if a VM-exit is delivered before the debug trap.
3599 */
3600static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu)
3601{
3602        if (vmx_pending_dbg_trap(vcpu))
3603                vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
3604                            vcpu->arch.exception.payload);
3605}
3606
3607static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
3608{
3609        struct vcpu_vmx *vmx = to_vmx(vcpu);
3610        unsigned long exit_qual;
3611        bool block_nested_events =
3612            vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu);
3613        bool mtf_pending = vmx->nested.mtf_pending;
3614        struct kvm_lapic *apic = vcpu->arch.apic;
3615
3616        /*
3617         * Clear the MTF state. If a higher priority VM-exit is delivered first,
3618         * this state is discarded.
3619         */
3620        vmx->nested.mtf_pending = false;
3621
3622        if (lapic_in_kernel(vcpu) &&
3623                test_bit(KVM_APIC_INIT, &apic->pending_events)) {
3624                if (block_nested_events)
3625                        return -EBUSY;
3626                nested_vmx_update_pending_dbg(vcpu);
3627                clear_bit(KVM_APIC_INIT, &apic->pending_events);
3628                nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0);
3629                return 0;
3630        }
3631
3632        /*
3633         * Process any exceptions that are not debug traps before MTF.
3634         */
3635        if (vcpu->arch.exception.pending &&
3636            !vmx_pending_dbg_trap(vcpu) &&
3637            nested_vmx_check_exception(vcpu, &exit_qual)) {
3638                if (block_nested_events)
3639                        return -EBUSY;
3640                nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
3641                return 0;
3642        }
3643
3644        if (mtf_pending) {
3645                if (block_nested_events)
3646                        return -EBUSY;
3647                nested_vmx_update_pending_dbg(vcpu);
3648                nested_vmx_vmexit(vcpu, EXIT_REASON_MONITOR_TRAP_FLAG, 0, 0);
3649                return 0;
3650        }
3651
3652        if (vcpu->arch.exception.pending &&
3653            nested_vmx_check_exception(vcpu, &exit_qual)) {
3654                if (block_nested_events)
3655                        return -EBUSY;
3656                nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
3657                return 0;
3658        }
3659
3660        if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) &&
3661            vmx->nested.preemption_timer_expired) {
3662                if (block_nested_events)
3663                        return -EBUSY;
3664                nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0);
3665                return 0;
3666        }
3667
3668        if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) {
3669                if (block_nested_events)
3670                        return -EBUSY;
3671                nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
3672                                  NMI_VECTOR | INTR_TYPE_NMI_INTR |
3673                                  INTR_INFO_VALID_MASK, 0);
3674                /*
3675                 * The NMI-triggered VM exit counts as injection:
3676                 * clear this one and block further NMIs.
3677                 */
3678                vcpu->arch.nmi_pending = 0;
3679                vmx_set_nmi_mask(vcpu, true);
3680                return 0;
3681        }
3682
3683        if ((kvm_cpu_has_interrupt(vcpu) || external_intr) &&
3684            nested_exit_on_intr(vcpu)) {
3685                if (block_nested_events)
3686                        return -EBUSY;
3687                nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
3688                return 0;
3689        }
3690
3691        vmx_complete_nested_posted_interrupt(vcpu);
3692        return 0;
3693}
3694
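    /*
     * Convert the time remaining on the emulated preemption-timer hrtimer
     * back into VMX-preemption timer units, i.e. TSC ticks shifted right by
     * VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE.
     */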
3695static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
3696{
3697        ktime_t remaining =
3698                hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer);
3699        u64 value;
3700
3701        if (ktime_to_ns(remaining) <= 0)
3702                return 0;
3703
3704        value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz;
3705        do_div(value, 1000000);
3706        return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
3707}
3708
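    /*
     * Guest-state fields that L1 rarely reads and that are therefore only
     * synced from vmcs02 to vmcs12 lazily, via sync_vmcs02_to_vmcs12_rare().
     */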
3709static bool is_vmcs12_ext_field(unsigned long field)
3710{
3711        switch (field) {
3712        case GUEST_ES_SELECTOR:
3713        case GUEST_CS_SELECTOR:
3714        case GUEST_SS_SELECTOR:
3715        case GUEST_DS_SELECTOR:
3716        case GUEST_FS_SELECTOR:
3717        case GUEST_GS_SELECTOR:
3718        case GUEST_LDTR_SELECTOR:
3719        case GUEST_TR_SELECTOR:
3720        case GUEST_ES_LIMIT:
3721        case GUEST_CS_LIMIT:
3722        case GUEST_SS_LIMIT:
3723        case GUEST_DS_LIMIT:
3724        case GUEST_FS_LIMIT:
3725        case GUEST_GS_LIMIT:
3726        case GUEST_LDTR_LIMIT:
3727        case GUEST_TR_LIMIT:
3728        case GUEST_GDTR_LIMIT:
3729        case GUEST_IDTR_LIMIT:
3730        case GUEST_ES_AR_BYTES:
3731        case GUEST_DS_AR_BYTES:
3732        case GUEST_FS_AR_BYTES:
3733        case GUEST_GS_AR_BYTES:
3734        case GUEST_LDTR_AR_BYTES:
3735        case GUEST_TR_AR_BYTES:
3736        case GUEST_ES_BASE:
3737        case GUEST_CS_BASE:
3738        case GUEST_SS_BASE:
3739        case GUEST_DS_BASE:
3740        case GUEST_FS_BASE:
3741        case GUEST_GS_BASE:
3742        case GUEST_LDTR_BASE:
3743        case GUEST_TR_BASE:
3744        case GUEST_GDTR_BASE:
3745        case GUEST_IDTR_BASE:
3746        case GUEST_PENDING_DBG_EXCEPTIONS:
3747        case GUEST_BNDCFGS:
3748                return true;
3749        default:
3750                break;
3751        }
3752
3753        return false;
3754}
3755
3756static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
3757                                       struct vmcs12 *vmcs12)
3758{
3759        struct vcpu_vmx *vmx = to_vmx(vcpu);
3760
3761        vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
3762        vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
3763        vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
3764        vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
3765        vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
3766        vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
3767        vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
3768        vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
3769        vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
3770        vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
3771        vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
3772        vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
3773        vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
3774        vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
3775        vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
3776        vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
3777        vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
3778        vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
3779        vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
3780        vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
3781        vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
3782        vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
3783        vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
3784        vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
3785        vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE);
3786        vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
3787        vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
3788        vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
3789        vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
3790        vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
3791        vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
3792        vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
3793        vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
3794        vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
3795        vmcs12->guest_pending_dbg_exceptions =
3796                vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
3797        if (kvm_mpx_supported())
3798                vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
3799
3800        vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false;
3801}
3802
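    /*
     * Reading the rare fields requires vmcs02 to be the current VMCS, so
     * temporarily switch loaded_vmcs to vmcs02, sync the fields, and then
     * switch back to vmcs01.
     */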
3803static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
3804                                       struct vmcs12 *vmcs12)
3805{
3806        struct vcpu_vmx *vmx = to_vmx(vcpu);
3807        int cpu;
3808
3809        if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare)
3810                return;
3811
3812
3813        WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01);
3814
3815        cpu = get_cpu();
3816        vmx->loaded_vmcs = &vmx->nested.vmcs02;
3817        vmx_vcpu_load(&vmx->vcpu, cpu);
3818
3819        sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
3820
3821        vmx->loaded_vmcs = &vmx->vmcs01;
3822        vmx_vcpu_load(&vmx->vcpu, cpu);
3823        put_cpu();
3824}
3825
3826/*
3827 * Update the guest state fields of vmcs12 to reflect changes that
3828 * occurred while L2 was running. (The "IA-32e mode guest" bit of the
3829 * VM-entry controls is also updated, since this is really a guest
3830 * state bit.)
3831 */
3832static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
3833{
3834        struct vcpu_vmx *vmx = to_vmx(vcpu);
3835
3836        if (vmx->nested.hv_evmcs)
3837                sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
3838
3839        vmx->nested.need_sync_vmcs02_to_vmcs12_rare = !vmx->nested.hv_evmcs;
3840
3841        vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
3842        vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
3843
3844        vmcs12->guest_rsp = kvm_rsp_read(vcpu);
3845        vmcs12->guest_rip = kvm_rip_read(vcpu);
3846        vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
3847
3848        vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
3849        vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
3850
3851        vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
3852        vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
3853        vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
3854
3855        vmcs12->guest_interruptibility_info =
3856                vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
3857
3858        if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
3859                vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT;
3860        else
3861                vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE;
3862
3863        if (nested_cpu_has_preemption_timer(vmcs12) &&
3864            vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
3865                vmcs12->vmx_preemption_timer_value =
3866                        vmx_get_preemption_timer_value(vcpu);
3867
3868        /*
3869         * In some cases (usually, nested EPT), L2 is allowed to change its
3870         * own CR3 without exiting. If it has changed it, we must keep it.
3871         * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined
3872         * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12.
3873         *
3874         * Additionally, restore L2's PDPTR to vmcs12.
3875         */
3876        if (enable_ept) {
3877                vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3);
3878                if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) {
3879                        vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
3880                        vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
3881                        vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
3882                        vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
3883                }
3884        }
3885
3886        vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
3887
3888        if (nested_cpu_has_vid(vmcs12))
3889                vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS);
3890
3891        vmcs12->vm_entry_controls =
3892                (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
3893                (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
3894
3895        if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS)
3896                kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);
3897
3898        if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER)
3899                vmcs12->guest_ia32_efer = vcpu->arch.efer;
3900}
3901
3902/*
3903 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
3904 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
3905 * and this function updates it to reflect the changes to the guest state while
3906 * L2 was running (and perhaps made some exits which were handled directly by L0
3907 * without going back to L1), and to reflect the exit reason.
3908 * Note that we do not have to copy here all VMCS fields, just those that
3909 * could have changed by the L2 guest or the exit - i.e., the guest-state and
3910 * exit-information fields only. Other fields are modified by L1 with VMWRITE,
3911 * which already writes to vmcs12 directly.
3912 */
3913static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
3914                           u32 exit_reason, u32 exit_intr_info,
3915                           unsigned long exit_qualification)
3916{
3917        /* update exit information fields: */
3918        vmcs12->vm_exit_reason = exit_reason;
3919        vmcs12->exit_qualification = exit_qualification;
3920        vmcs12->vm_exit_intr_info = exit_intr_info;
3921
3922        vmcs12->idt_vectoring_info_field = 0;
3923        vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
3924        vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
3925
3926        if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
3927                vmcs12->launch_state = 1;
3928
3929                /* vm_entry_intr_info_field is cleared on exit. Emulate this
3930                 * instead of reading the real value. */
3931                vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;
3932
3933                /*
3934                 * Transfer the event that L0 or L1 may have wanted to inject into
3935                 * L2 to IDT_VECTORING_INFO_FIELD.
3936                 */
3937                vmcs12_save_pending_event(vcpu, vmcs12);
3938
3939                /*
3940                 * According to spec, there's no need to store the guest's
3941                 * MSRs if the exit is due to a VM-entry failure that occurs
3942                 * during or after loading the guest state. Since this exit
3943                 * does not fall in that category, we need to save the MSRs.
3944                 */
3945                if (nested_vmx_store_msr(vcpu,
3946                                         vmcs12->vm_exit_msr_store_addr,
3947                                         vmcs12->vm_exit_msr_store_count))
3948                        nested_vmx_abort(vcpu,
3949                                         VMX_ABORT_SAVE_GUEST_MSR_FAIL);
3950        }
3951
3952        /*
3953         * Drop what we picked up for L2 via vmx_complete_interrupts. It is
3954         * preserved above and would only end up incorrectly in L1.
3955         */
3956        vcpu->arch.nmi_injected = false;
3957        kvm_clear_exception_queue(vcpu);
3958        kvm_clear_interrupt_queue(vcpu);
3959}
3960
3961/*
3962 * A part of what we need to do when the nested L2 guest exits and we want to
3963 * run its L1 parent, is to reset L1's guest state to the host state specified
3964 * in vmcs12.
3965 * This function is to be called not only on normal nested exit, but also on
3966 * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry
3967 * Failures During or After Loading Guest State").
3968 * This function should be called when the active VMCS is L1's (vmcs01).
3969 */
3970static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
3971                                   struct vmcs12 *vmcs12)
3972{
3973        struct kvm_segment seg;
3974        u32 entry_failure_code;
3975
3976        if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
3977                vcpu->arch.efer = vmcs12->host_ia32_efer;
3978        else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
3979                vcpu->arch.efer |= (EFER_LMA | EFER_LME);
3980        else
3981                vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
3982        vmx_set_efer(vcpu, vcpu->arch.efer);
3983
3984        kvm_rsp_write(vcpu, vmcs12->host_rsp);
3985        kvm_rip_write(vcpu, vmcs12->host_rip);
3986        vmx_set_rflags(vcpu, X86_EFLAGS_FIXED);
3987        vmx_set_interrupt_shadow(vcpu, 0);
3988
3989        /*
3990         * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
3991         * actually changed, because vmx_set_cr0 refers to efer set above.
3992         *
3993         * CR0_GUEST_HOST_MASK is already set in the original vmcs01
3994         * (KVM doesn't change it).
3995         */
3996        vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
3997        vmx_set_cr0(vcpu, vmcs12->host_cr0);
3998
3999        /* Same as above - no reason to call set_cr4_guest_host_mask().  */
4000        vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
4001        vmx_set_cr4(vcpu, vmcs12->host_cr4);
4002
4003        nested_ept_uninit_mmu_context(vcpu);
4004
4005        /*
4006         * Only PDPTE load can fail as the value of cr3 was checked on entry and
4007         * couldn't have changed.
4008         */
4009        if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code))
4010                nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
4011
4012        if (!enable_ept)
4013                vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
4014
4015        /*
4016         * If vmcs01 doesn't use VPID, CPU flushes TLB on every
4017         * VMEntry/VMExit. Thus, no need to flush TLB.
4018         *
4019         * If vmcs12 doesn't use VPID, L1 expects TLB to be
4020         * flushed on every VMEntry/VMExit.
4021         *
4022         * Otherwise, we can preserve TLB entries as long as we are
4023         * able to tag L1 TLB entries differently than L2 TLB entries.
4024         *
4025         * If vmcs12 uses EPT, we need to execute this flush on EPTP01
4026         * and therefore we request the TLB flush to happen only after VMCS EPTP
4027         * has been set by KVM_REQ_LOAD_CR3.
4028         */
4029        if (enable_vpid &&
4030            (!nested_cpu_has_vpid(vmcs12) || !nested_has_guest_tlb_tag(vcpu))) {
4031                kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
4032        }
4033
4034        vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
4035        vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
4036        vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
4037        vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
4038        vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
4039        vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF);
4040        vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF);
4041
4042        /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1.  */
4043        if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS)
4044                vmcs_write64(GUEST_BNDCFGS, 0);
4045
4046        if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) {
4047                vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
4048                vcpu->arch.pat = vmcs12->host_ia32_pat;
4049        }
4050        if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
4051                WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
4052                                         vmcs12->host_ia32_perf_global_ctrl));
4053
4054        /* Set L1 segment info according to Intel SDM section 27.5.2,
4055         * "Loading Host Segment and Descriptor-Table Registers". */
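            /*
             * For reference, the hard-coded attributes below use the standard
             * descriptor type encodings: type 11 is an accessed execute/read
             * code segment (and, with S clear as for TR, a busy TSS), while
             * type 3 is an accessed read/write data segment.
             */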
4056        seg = (struct kvm_segment) {
4057                .base = 0,
4058                .limit = 0xFFFFFFFF,
4059                .selector = vmcs12->host_cs_selector,
4060                .type = 11,
4061                .present = 1,
4062                .s = 1,
4063                .g = 1
4064        };
4065        if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
4066                seg.l = 1;
4067        else
4068                seg.db = 1;
4069        vmx_set_segment(vcpu, &seg, VCPU_SREG_CS);
4070        seg = (struct kvm_segment) {
4071                .base = 0,
4072                .limit = 0xFFFFFFFF,
4073                .type = 3,
4074                .present = 1,
4075                .s = 1,
4076                .db = 1,
4077                .g = 1
4078        };
4079        seg.selector = vmcs12->host_ds_selector;
4080        vmx_set_segment(vcpu, &seg, VCPU_SREG_DS);
4081        seg.selector = vmcs12->host_es_selector;
4082        vmx_set_segment(vcpu, &seg, VCPU_SREG_ES);
4083        seg.selector = vmcs12->host_ss_selector;
4084        vmx_set_segment(vcpu, &seg, VCPU_SREG_SS);
4085        seg.selector = vmcs12->host_fs_selector;
4086        seg.base = vmcs12->host_fs_base;
4087        vmx_set_segment(vcpu, &seg, VCPU_SREG_FS);
4088        seg.selector = vmcs12->host_gs_selector;
4089        seg.base = vmcs12->host_gs_base;
4090        vmx_set_segment(vcpu, &seg, VCPU_SREG_GS);
4091        seg = (struct kvm_segment) {
4092                .base = vmcs12->host_tr_base,
4093                .limit = 0x67,
4094                .selector = vmcs12->host_tr_selector,
4095                .type = 11,
4096                .present = 1
4097        };
4098        vmx_set_segment(vcpu, &seg, VCPU_SREG_TR);
4099
4100        kvm_set_dr(vcpu, 7, 0x400);
4101        vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
4102
4103        if (cpu_has_vmx_msr_bitmap())
4104                vmx_update_msr_bitmap(vcpu);
4105
4106        if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
4107                                vmcs12->vm_exit_msr_load_count))
4108                nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
4109}
4110
4111static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
4112{
4113        struct shared_msr_entry *efer_msr;
4114        unsigned int i;
4115
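            /*
             * Locate vmcs01's guest EFER: use the dedicated VMCS field if the
             * "load IA32_EFER" entry control is in use, then the host value if
             * the CPU supports the dedicated EFER load controls, then the MSR
             * autoload list, then the shared MSR array, and finally fall back
             * to the host's EFER.
             */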
4116        if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER)
4117                return vmcs_read64(GUEST_IA32_EFER);
4118
4119        if (cpu_has_load_ia32_efer())
4120                return host_efer;
4121
4122        for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) {
4123                if (vmx->msr_autoload.guest.val[i].index == MSR_EFER)
4124                        return vmx->msr_autoload.guest.val[i].value;
4125        }
4126
4127        efer_msr = find_msr_entry(vmx, MSR_EFER);
4128        if (efer_msr)
4129                return efer_msr->data;
4130
4131        return host_efer;
4132}
4133
4134static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
4135{
4136        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
4137        struct vcpu_vmx *vmx = to_vmx(vcpu);
4138        struct vmx_msr_entry g, h;
4139        gpa_t gpa;
4140        u32 i, j;
4141
4142        vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT);
4143
4144        if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) {
4145                /*
4146                 * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set,
4147                 * as vmcs01.GUEST_DR7 contains a userspace-defined value
4148                 * and vcpu->arch.dr7 is not squirreled away before the
4149                 * nested VMENTER (not worth adding a variable in nested_vmx).
4150                 */
4151                if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
4152                        kvm_set_dr(vcpu, 7, DR7_FIXED_1);
4153                else
4154                        WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7)));
4155        }
4156
4157        /*
4158         * Note that calling vmx_set_{efer,cr0,cr4} is important as they
4159         * handle a variety of side effects to KVM's software model.
4160         */
4161        vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx));
4162
4163        vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
4164        vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW));
4165
4166        vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
4167        vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW));
4168
4169        nested_ept_uninit_mmu_context(vcpu);
4170        vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
4171        kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
4172
4173        /*
4174         * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs
4175         * from vmcs01 (if necessary).  The PDPTRs are not loaded on
4176         * VMFail; like everything else, we just need to ensure our
4177         * software model is up-to-date.
4178         */
4179        if (enable_ept)
4180                ept_save_pdptrs(vcpu);
4181
4182        kvm_mmu_reset_context(vcpu);
4183
4184        if (cpu_has_vmx_msr_bitmap())
4185                vmx_update_msr_bitmap(vcpu);
4186
4187        /*
4188         * This nasty bit of open coding is a compromise between blindly
4189         * loading L1's MSRs using the exit load lists (incorrect emulation
4190         * of VMFail), leaving the nested VM's MSRs in the software model
4191         * (incorrect behavior) and snapshotting the modified MSRs (too
4192         * expensive since the lists are not bounded by hardware).  For each
4193         * MSR that was (prematurely) loaded from the nested VMEntry load
4194         * list, reload it from the exit load list if it exists and differs
4195         * from the guest value.  The intent is to stuff host state as
4196         * silently as possible, not to fully process the exit load list.
4197         */
4198        for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) {
4199                gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g));
4200                if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) {
4201                        pr_debug_ratelimited(
4202                                "%s read MSR index failed (%u, 0x%08llx)\n",
4203                                __func__, i, gpa);
4204                        goto vmabort;
4205                }
4206
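                    /*
                     * Look for a matching entry in the VM-exit MSR load list;
                     * if one exists and its value differs from what the nested
                     * VM-entry loaded, restore the host's value.
                     */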
4207                for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) {
4208                        gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h));
4209                        if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) {
4210                                pr_debug_ratelimited(
4211                                        "%s read MSR failed (%u, 0x%08llx)\n",
4212                                        __func__, j, gpa);
4213                                goto vmabort;
4214                        }
4215                        if (h.index != g.index)
4216                                continue;
4217                        if (h.value == g.value)
4218                                break;
4219
4220                        if (nested_vmx_load_msr_check(vcpu, &h)) {
4221                                pr_debug_ratelimited(
4222                                        "%s check failed (%u, 0x%x, 0x%x)\n",
4223                                        __func__, j, h.index, h.reserved);
4224                                goto vmabort;
4225                        }
4226
4227                        if (kvm_set_msr(vcpu, h.index, h.value)) {
4228                                pr_debug_ratelimited(
4229                                        "%s WRMSR failed (%u, 0x%x, 0x%llx)\n",
4230                                        __func__, j, h.index, h.value);
4231                                goto vmabort;
4232                        }
4233                }
4234        }
4235
4236        return;
4237
4238vmabort:
4239        nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
4240}
4241
4242/*
4243 * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1
4244 * and modify vmcs12 to make it see what it would expect to see there if
4245 * L2 was its real guest. Must only be called when in L2 (is_guest_mode())
4246 */
4247void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
4248                       u32 exit_intr_info, unsigned long exit_qualification)
4249{
4250        struct vcpu_vmx *vmx = to_vmx(vcpu);
4251        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
4252
4253        /* trying to cancel vmlaunch/vmresume is a bug */
4254        WARN_ON_ONCE(vmx->nested.nested_run_pending);
4255
4256        leave_guest_mode(vcpu);
4257
4258        if (nested_cpu_has_preemption_timer(vmcs12))
4259                hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);
4260
4261        if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING)
4262                vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
4263
4264        if (likely(!vmx->fail)) {
4265                sync_vmcs02_to_vmcs12(vcpu, vmcs12);
4266
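                    /*
                     * exit_reason == -1 means KVM is leaving guest mode without
                     * emulating an architectural VM-exit to L1 (e.g. when
                     * forcibly leaving nested operation), so there is no exit
                     * information to record in vmcs12.
                     */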
4267                if (exit_reason != -1)
4268                        prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
4269                                       exit_qualification);
4270
4271                /*
4272                 * Must happen outside of sync_vmcs02_to_vmcs12() as it will
4273                 * also be used to capture vmcs12 cache as part of
4274                 * capturing nVMX state for snapshot (migration).
4275                 *
4276                 * Otherwise, this flush will dirty guest memory at a
4277                 * point it is already assumed by user-space to be
4278                 * immutable.
4279                 */
4280                nested_flush_cached_shadow_vmcs12(vcpu, vmcs12);
4281        } else {
4282                /*
4283                 * The only expected VM-instruction error is "VM entry with
4284                 * invalid control field(s)." Anything else indicates a
4285                 * problem with L0.  And we should never get here with a
4286                 * VMFail of any type if early consistency checks are enabled.
4287                 */
4288                WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
4289                             VMXERR_ENTRY_INVALID_CONTROL_FIELD);
4290                WARN_ON_ONCE(nested_early_check);
4291        }
4292
4293        vmx_switch_vmcs(vcpu, &vmx->vmcs01);
4294
4295        /* Update any VMCS fields that might have changed while L2 ran */
4296        vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
4297        vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
4298        vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
4299        if (vmx->nested.l1_tpr_threshold != -1)
4300                vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold);
4301
4302        if (kvm_has_tsc_control)
4303                decache_tsc_multiplier(vmx);
4304
4305        if (vmx->nested.change_vmcs01_virtual_apic_mode) {
4306                vmx->nested.change_vmcs01_virtual_apic_mode = false;
4307                vmx_set_virtual_apic_mode(vcpu);
4308        }
4309
4310        /* Unpin physical memory we referred to in vmcs02 */
4311        if (vmx->nested.apic_access_page) {
4312                kvm_release_page_clean(vmx->nested.apic_access_page);
4313                vmx->nested.apic_access_page = NULL;
4314        }
4315        kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
4316        kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
4317        vmx->nested.pi_desc = NULL;
4318
4319        /*
4320         * While L2 was running, the mmu_notifier may have forced a reload of the
4321         * page's hpa for the L2 vmcs.  Need to reload it for L1 before entering L1.
4322         */
4323        kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
4324
4325        if ((exit_reason != -1) && (enable_shadow_vmcs || vmx->nested.hv_evmcs))
4326                vmx->nested.need_vmcs12_to_shadow_sync = true;
4327
4328        /* in case we halted in L2 */
4329        vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
4330
4331        if (likely(!vmx->fail)) {
4332                /*
4333                 * TODO: SDM says that with acknowledge interrupt on
4334                 * exit, bit 31 of the VM-exit interrupt information
4335                 * (valid interrupt) is always set to 1 on
4336                 * EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't
4337                 * need kvm_cpu_has_interrupt().  See the commit
4338                 * message for details.
4339                 */
4340                if (nested_exit_intr_ack_set(vcpu) &&
4341                    exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
4342                    kvm_cpu_has_interrupt(vcpu)) {
4343                        int irq = kvm_cpu_get_interrupt(vcpu);
4344                        WARN_ON(irq < 0);
4345                        vmcs12->vm_exit_intr_info = irq |
4346                                INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
4347                }
4348
4349                if (exit_reason != -1)
4350                        trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
4351                                                       vmcs12->exit_qualification,
4352                                                       vmcs12->idt_vectoring_info_field,
4353                                                       vmcs12->vm_exit_intr_info,
4354                                                       vmcs12->vm_exit_intr_error_code,
4355                                                       KVM_ISA_VMX);
4356
4357                load_vmcs12_host_state(vcpu, vmcs12);
4358
4359                return;
4360        }
4361
4362        /*
4363         * After an early L2 VM-entry failure, we're now back
4364         * in L1 which thinks it just finished a VMLAUNCH or
4365         * VMRESUME instruction, so we need to set the failure
4366         * flag and the VM-instruction error field of the VMCS
4367         * accordingly, and skip the emulated instruction.
4368         */
4369        (void)nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
4370
4371        /*
4372         * Restore L1's host state to KVM's software model.  We're here
4373         * because a consistency check was caught by hardware, which
4374         * means some amount of guest state has been propagated to KVM's
4375         * model and needs to be unwound to the host's state.
4376         */
4377        nested_vmx_restore_host_state(vcpu);
4378
4379        vmx->fail = 0;
4380}
4381
4382/*
4383 * Decode the memory-address operand of a vmx instruction, as recorded on an
4384 * exit caused by such an instruction (run by a guest hypervisor).
4385 * On success, returns 0. When the operand is invalid, returns 1 and throws
4386 * #UD or #GP.
4387 */
4388int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
4389                        u32 vmx_instruction_info, bool wr, int len, gva_t *ret)
4390{
4391        gva_t off;
4392        bool exn;
4393        struct kvm_segment s;
4394
4395        /*
4396         * According to Vol. 3B, "Information for VM Exits Due to Instruction
4397         * Execution", on an exit, vmx_instruction_info holds most of the
4398         * addressing components of the operand. Only the displacement part
4399         * is put in exit_qualification (see 3B, "Basic VM-Exit Information").
4400         * For how an actual address is calculated from all these components,
4401         * refer to Vol. 1, "Operand Addressing".
4402         */
4403        int  scaling = vmx_instruction_info & 3;
4404        int  addr_size = (vmx_instruction_info >> 7) & 7;
4405        bool is_reg = vmx_instruction_info & (1u << 10);
4406        int  seg_reg = (vmx_instruction_info >> 15) & 7;
4407        int  index_reg = (vmx_instruction_info >> 18) & 0xf;
4408        bool index_is_valid = !(vmx_instruction_info & (1u << 22));
4409        int  base_reg       = (vmx_instruction_info >> 23) & 0xf;
4410        bool base_is_valid  = !(vmx_instruction_info & (1u << 27));
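            /*
             * The fields above follow the VM-exit instruction-information
             * layout: bits 1:0 scaling, bits 9:7 address size, bit 10 register
             * vs. memory operand, bits 17:15 segment register, bits 21:18
             * index register (bit 22 = index invalid), bits 26:23 base
             * register (bit 27 = base invalid).
             */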
4411
4412        if (is_reg) {
4413                kvm_queue_exception(vcpu, UD_VECTOR);
4414                return 1;
4415        }
4416
4417        /* Addr = segment_base + offset */
4418        /* offset = base + [index * scale] + displacement */
4419        off = exit_qualification; /* holds the displacement */
4420        if (addr_size == 1)
4421                off = (gva_t)sign_extend64(off, 31);
4422        else if (addr_size == 0)
4423                off = (gva_t)sign_extend64(off, 15);
4424        if (base_is_valid)
4425                off += kvm_register_read(vcpu, base_reg);
4426        if (index_is_valid)
4427                off += kvm_register_read(vcpu, index_reg) << scaling;
4428        vmx_get_segment(vcpu, &s, seg_reg);
4429
4430        /*
4431         * The effective address, i.e. @off, of a memory operand is truncated
4432         * based on the address size of the instruction.  Note that this is
4433         * the *effective address*, i.e. the address prior to accounting for
4434         * the segment's base.
4435         */
4436        if (addr_size == 1) /* 32 bit */
4437                off &= 0xffffffff;
4438        else if (addr_size == 0) /* 16 bit */
4439                off &= 0xffff;
4440
4441        /* Checks for #GP/#SS exceptions. */
4442        exn = false;
4443        if (is_long_mode(vcpu)) {
4444                /*
4445                 * The virtual/linear address is never truncated in 64-bit
4446                 * mode, e.g. a 32-bit address size can yield a 64-bit virtual
4447                 * address when using FS/GS with a non-zero base.
4448                 */
4449                if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS)
4450                        *ret = s.base + off;
4451                else
4452                        *ret = off;
4453
4454                /* Long mode: #GP(0)/#SS(0) if the memory address is in a
4455                 * non-canonical form. This is the only check on the memory
4456                 * destination for long mode!
4457                 */
4458                exn = is_noncanonical_address(*ret, vcpu);
4459        } else {
4460                /*
4461                 * When not in long mode, the virtual/linear address is
4462                 * unconditionally truncated to 32 bits regardless of the
4463                 * address size.
4464                 */
4465                *ret = (s.base + off) & 0xffffffff;
4466
4467                /* Protected mode: apply checks for segment validity in the
4468                 * following order:
4469                 * - segment type check (#GP(0) may be thrown)
4470                 * - usability check (#GP(0)/#SS(0))
4471                 * - limit check (#GP(0)/#SS(0))
4472                 */
4473                if (wr)
4474                        /* #GP(0) if the destination operand is located in a
4475                         * read-only data segment or any code segment.
4476                         */
4477                        exn = ((s.type & 0xa) == 0 || (s.type & 8));
4478                else
4479                        /* #GP(0) if the source operand is located in an
4480                         * execute-only code segment
4481                         */
4482                        exn = ((s.type & 0xa) == 8);
4483                if (exn) {
4484                        kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
4485                        return 1;
4486                }
4487                /* Protected mode: #GP(0)/#SS(0) if the segment is unusable.
4488                 */
4489                exn = (s.unusable != 0);
4490
4491                /*
4492                 * Protected mode: #GP(0)/#SS(0) if the memory operand is
4493                 * outside the segment limit.  All CPUs that support VMX ignore
4494                 * limit checks for flat segments, i.e. segments with base==0,
4495                 * limit==0xffffffff and of type expand-up data or code.
4496                 */
4497                if (!(s.base == 0 && s.limit == 0xffffffff &&
4498                     ((s.type & 8) || !(s.type & 4))))
4499                        exn = exn || ((u64)off + len - 1 > s.limit);
4500        }
4501        if (exn) {
4502                kvm_queue_exception_e(vcpu,
4503                                      seg_reg == VCPU_SREG_SS ?
4504                                                SS_VECTOR : GP_VECTOR,
4505                                      0);
4506                return 1;
4507        }
4508
4509        return 0;
4510}
4511
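    /*
     * Expose or hide the "load IA32_PERF_GLOBAL_CTRL" VM-entry/VM-exit
     * controls to L1 based on whether the vPMU exposes
     * MSR_CORE_PERF_GLOBAL_CTRL.
     */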
4512void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
4513{
4514        struct vcpu_vmx *vmx;
4515
4516        if (!nested_vmx_allowed(vcpu))
4517                return;
4518
4519        vmx = to_vmx(vcpu);
4520        if (kvm_x86_ops->pmu_ops->is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL)) {
4521                vmx->nested.msrs.entry_ctls_high |=
4522                                VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
4523                vmx->nested.msrs.exit_ctls_high |=
4524                                VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
4525        } else {
4526                vmx->nested.msrs.entry_ctls_high &=
4527                                ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
4528                vmx->nested.msrs.exit_ctls_high &=
4529                                ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
4530        }
4531}
4532
4533static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer)
4534{
4535        gva_t gva;
4536        struct x86_exception e;
4537
4538        if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
4539                                vmcs_read32(VMX_INSTRUCTION_INFO), false,
4540                                sizeof(*vmpointer), &gva))
4541                return 1;
4542
4543        if (kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e)) {
4544                kvm_inject_page_fault(vcpu, &e);
4545                return 1;
4546        }
4547
4548        return 0;
4549}
4550
4551/*
4552 * Allocate a shadow VMCS and associate it with the currently loaded
4553 * VMCS, unless such a shadow VMCS already exists. The newly allocated
4554 * VMCS is also VMCLEARed, so that it is ready for use.
4555 */
4556static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu)
4557{
4558        struct vcpu_vmx *vmx = to_vmx(vcpu);
4559        struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs;
4560
4561        /*
4562         * We should allocate a shadow vmcs for vmcs01 only when L1
4563         * executes VMXON and free it when L1 executes VMXOFF.
4564         * As it is invalid to execute VMXON twice, we shouldn't reach
4565         * here when vmcs01 already has an allocated shadow vmcs.
4566         */
4567        WARN_ON(loaded_vmcs == &vmx->vmcs01 && loaded_vmcs->shadow_vmcs);
4568
4569        if (!loaded_vmcs->shadow_vmcs) {
4570                loaded_vmcs->shadow_vmcs = alloc_vmcs(true);
4571                if (loaded_vmcs->shadow_vmcs)
4572                        vmcs_clear(loaded_vmcs->shadow_vmcs);
4573        }
4574        return loaded_vmcs->shadow_vmcs;
4575}
4576
4577static int enter_vmx_operation(struct kvm_vcpu *vcpu)
4578{
4579        struct vcpu_vmx *vmx = to_vmx(vcpu);
4580        int r;
4581
4582        r = alloc_loaded_vmcs(&vmx->nested.vmcs02);
4583        if (r < 0)
4584                goto out_vmcs02;
4585
4586        vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
4587        if (!vmx->nested.cached_vmcs12)
4588                goto out_cached_vmcs12;
4589
4590        vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
4591        if (!vmx->nested.cached_shadow_vmcs12)
4592                goto out_cached_shadow_vmcs12;
4593
4594        if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu))
4595                goto out_shadow_vmcs;
4596
4597        hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
4598                     HRTIMER_MODE_REL_PINNED);
4599        vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
4600
4601        vmx->nested.vpid02 = allocate_vpid();
4602
4603        vmx->nested.vmcs02_initialized = false;
4604        vmx->nested.vmxon = true;
4605
4606        if (pt_mode == PT_MODE_HOST_GUEST) {
4607                vmx->pt_desc.guest.ctl = 0;
4608                pt_update_intercept_for_msr(vmx);
4609        }
4610
4611        return 0;
4612
4613out_shadow_vmcs:
4614        kfree(vmx->nested.cached_shadow_vmcs12);
4615
4616out_cached_shadow_vmcs12:
4617        kfree(vmx->nested.cached_vmcs12);
4618
4619out_cached_vmcs12:
4620        free_loaded_vmcs(&vmx->nested.vmcs02);
4621
4622out_vmcs02:
4623        return -ENOMEM;
4624}
4625
4626/*
4627 * Emulate the VMXON instruction.
4628 * Currently, we just remember that VMX is active, and do not save or even
4629 * inspect the argument to VMXON (the so-called "VMXON pointer") because we
4630 * do not currently need to store anything in that guest-allocated memory
4631 * region. Consequently, VMCLEAR and VMPTRLD also do not verify that their
4632 * argument is different from the VMXON pointer (which the spec says they do).
4633 */
4634static int handle_vmon(struct kvm_vcpu *vcpu)
4635{
4636        int ret;
4637        gpa_t vmptr;
4638        uint32_t revision;
4639        struct vcpu_vmx *vmx = to_vmx(vcpu);
4640        const u64 VMXON_NEEDED_FEATURES = FEAT_CTL_LOCKED
4641                | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
4642
4643        /*
4644         * The Intel VMX Instruction Reference lists a bunch of bits that are
4645         * prerequisite to running VMXON, most notably cr4.VMXE must be set to
4646         * 1 (see vmx_set_cr4() for when we allow the guest to set this).
4647         * Otherwise, we should fail with #UD.  But most faulting conditions
4648         * have already been checked by hardware, prior to the VM-exit for
4649         * VMXON.  We do test guest cr4.VMXE because processor CR4 always has
4650         * that bit set to 1 in non-root mode.
4651         */
4652        if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) {
4653                kvm_queue_exception(vcpu, UD_VECTOR);
4654                return 1;
4655        }
4656
4657        /* CPL=0 must be checked manually. */
4658        if (vmx_get_cpl(vcpu)) {
4659                kvm_inject_gp(vcpu, 0);
4660                return 1;
4661        }
4662
4663        if (vmx->nested.vmxon)
4664                return nested_vmx_failValid(vcpu,
4665                        VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
4666
4667        if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
4668                        != VMXON_NEEDED_FEATURES) {
4669                kvm_inject_gp(vcpu, 0);
4670                return 1;
4671        }
4672
4673        if (nested_vmx_get_vmptr(vcpu, &vmptr))
4674                return 1;
4675
4676        /*
4677         * SDM 3: 24.11.5
4678         * The first 4 bytes of the VMXON region contain the supported
4679         * VMCS revision identifier.
4680         *
4681         * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case,
4682         * which would restrict the physical address width to 32 bits.
4683         */
4684        if (!page_address_valid(vcpu, vmptr))
4685                return nested_vmx_failInvalid(vcpu);
4686
4687        if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) ||
4688            revision != VMCS12_REVISION)
4689                return nested_vmx_failInvalid(vcpu);
4690
4691        vmx->nested.vmxon_ptr = vmptr;
4692        ret = enter_vmx_operation(vcpu);
4693        if (ret)
4694                return ret;
4695
4696        return nested_vmx_succeed(vcpu);
4697}
4698
4699static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu)
4700{
4701        struct vcpu_vmx *vmx = to_vmx(vcpu);
4702
4703        if (vmx->nested.current_vmptr == -1ull)
4704                return;
4705
4706        copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
4707
4708        if (enable_shadow_vmcs) {
4709                /* copy to memory all shadowed fields in case
4710                 * they were modified */
4711                copy_shadow_to_vmcs12(vmx);
4712                vmx_disable_shadow_vmcs(vmx);
4713        }
4714        vmx->nested.posted_intr_nv = -1;
4715
4716        /* Flush VMCS12 to guest memory */
4717        kvm_vcpu_write_guest_page(vcpu,
4718                                  vmx->nested.current_vmptr >> PAGE_SHIFT,
4719                                  vmx->nested.cached_vmcs12, 0, VMCS12_SIZE);
4720
4721        kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
4722
4723        vmx->nested.current_vmptr = -1ull;
4724}
4725
4726/* Emulate the VMXOFF instruction */
4727static int handle_vmoff(struct kvm_vcpu *vcpu)
4728{
4729        if (!nested_vmx_check_permission(vcpu))
4730                return 1;
4731
4732        free_nested(vcpu);
4733
4734        /* Process a latched INIT during the time the CPU was in VMX operation */
4735        kvm_make_request(KVM_REQ_EVENT, vcpu);
4736
4737        return nested_vmx_succeed(vcpu);
4738}
4739
4740/* Emulate the VMCLEAR instruction */
4741static int handle_vmclear(struct kvm_vcpu *vcpu)
4742{
4743        struct vcpu_vmx *vmx = to_vmx(vcpu);
4744        u32 zero = 0;
4745        gpa_t vmptr;
4746        u64 evmcs_gpa;
4747
4748        if (!nested_vmx_check_permission(vcpu))
4749                return 1;
4750
4751        if (nested_vmx_get_vmptr(vcpu, &vmptr))
4752                return 1;
4753
4754        if (!page_address_valid(vcpu, vmptr))
4755                return nested_vmx_failValid(vcpu,
4756                        VMXERR_VMCLEAR_INVALID_ADDRESS);
4757
4758        if (vmptr == vmx->nested.vmxon_ptr)
4759                return nested_vmx_failValid(vcpu,
4760                        VMXERR_VMCLEAR_VMXON_POINTER);
4761
4762        /*
4763         * When Enlightened VMEntry is enabled on the calling CPU we treat the
4764         * memory area pointed to by vmptr as an Enlightened VMCS (as there's no good
4765         * way to distinguish it from VMCS12) and we must not corrupt it by
4766         * writing to the non-existent 'launch_state' field. The area doesn't
4767         * have to be the currently active EVMCS on the calling CPU and there's
4768         * nothing KVM has to do to transition it from 'active' to 'non-active'
4769         * state. It is possible that the area will stay mapped as
4770         * vmx->nested.hv_evmcs but this shouldn't be a problem.
4771         */
4772        if (likely(!vmx->nested.enlightened_vmcs_enabled ||
4773                   !nested_enlightened_vmentry(vcpu, &evmcs_gpa))) {
4774                if (vmptr == vmx->nested.current_vmptr)
4775                        nested_release_vmcs12(vcpu);
4776
4777                kvm_vcpu_write_guest(vcpu,
4778                                     vmptr + offsetof(struct vmcs12,
4779                                                      launch_state),
4780                                     &zero, sizeof(zero));
4781        }
4782
4783        return nested_vmx_succeed(vcpu);
4784}
4785
4786/* Emulate the VMLAUNCH instruction */
4787static int handle_vmlaunch(struct kvm_vcpu *vcpu)
4788{
4789        return nested_vmx_run(vcpu, true);
4790}
4791
4792/* Emulate the VMRESUME instruction */
4793static int handle_vmresume(struct kvm_vcpu *vcpu)
4794{
4796        return nested_vmx_run(vcpu, false);
4797}
4798
4799static int handle_vmread(struct kvm_vcpu *vcpu)
4800{
4801        struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu)
4802                                                    : get_vmcs12(vcpu);
4803        unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
4804        u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
4805        struct vcpu_vmx *vmx = to_vmx(vcpu);
4806        struct x86_exception e;
4807        unsigned long field;
4808        u64 value;
4809        gva_t gva = 0;
4810        short offset;
4811        int len;
4812
4813        if (!nested_vmx_check_permission(vcpu))
4814                return 1;
4815
4816        /*
4817         * In VMX non-root operation, when the VMCS-link pointer is -1ull,
4818         * any VMREAD sets the ALU flags for VMfailInvalid.
4819         */
4820        if (vmx->nested.current_vmptr == -1ull ||
4821            (is_guest_mode(vcpu) &&
4822             get_vmcs12(vcpu)->vmcs_link_pointer == -1ull))
4823                return nested_vmx_failInvalid(vcpu);
4824
4825        /* Decode instruction info and find the field to read */
4826        field = kvm_register_readl(vcpu, (((instr_info) >> 28) & 0xf));
4827
4828        offset = vmcs_field_to_offset(field);
4829        if (offset < 0)
4830                return nested_vmx_failValid(vcpu,
4831                        VMXERR_UNSUPPORTED_VMCS_COMPONENT);
4832
4833        if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field))
4834                copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
4835
4836        /* Read the field, zero-extended to a u64 value */
4837        value = vmcs12_read_any(vmcs12, field, offset);
4838
4839        /*
4840         * Now copy part of this value to register or memory, as requested.
4841         * Note that the number of bits actually copied is 32 or 64 depending
4842         * on the guest's mode (32 or 64 bit), not on the given field's length.
4843         */
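            /*
             * Bit 10 of the instruction info selects a register destination
             * (encoded in bits 6:3); otherwise the destination is memory.
             */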
4844        if (instr_info & BIT(10)) {
4845                kvm_register_writel(vcpu, (((instr_info) >> 3) & 0xf), value);
4846        } else {
4847                len = is_64_bit_mode(vcpu) ? 8 : 4;
4848                if (get_vmx_mem_address(vcpu, exit_qualification,
4849                                        instr_info, true, len, &gva))
4850                        return 1;
4851                /* _system ok, nested_vmx_check_permission has verified cpl=0 */
4852                if (kvm_write_guest_virt_system(vcpu, gva, &value, len, &e)) {
4853                        kvm_inject_page_fault(vcpu, &e);
4854                        return 1;
4855                }
4856        }
4857
4858        return nested_vmx_succeed(vcpu);
4859}
4860
4861static bool is_shadow_field_rw(unsigned long field)
4862{
4863        switch (field) {
4864#define SHADOW_FIELD_RW(x, y) case x:
4865#include "vmcs_shadow_fields.h"
4866                return true;
4867        default:
4868                break;
4869        }
4870        return false;
4871}
4872
4873static bool is_shadow_field_ro(unsigned long field)
4874{
4875        switch (field) {
4876#define SHADOW_FIELD_RO(x, y) case x:
4877#include "vmcs_shadow_fields.h"
4878                return true;
4879        default:
4880                break;
4881        }
4882        return false;
4883}
4884
4885static int handle_vmwrite(struct kvm_vcpu *vcpu)
4886{
4887        struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu)
4888                                                    : get_vmcs12(vcpu);
4889        unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
4890        u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
4891        struct vcpu_vmx *vmx = to_vmx(vcpu);
4892        struct x86_exception e;
4893        unsigned long field;
4894        short offset;
4895        gva_t gva;
4896        int len;
4897
4898        /*
4899         * The value to write might be 32 or 64 bits, depending on L1's long
4900         * mode, and eventually we need to write that into a field of several
4901         * possible lengths. The code below first zero-extends the value to 64
4902         * bit (value), and then copies only the appropriate number of
4903         * bits into the vmcs12 field.
4904         */
4905        u64 value = 0;
4906
4907        if (!nested_vmx_check_permission(vcpu))
4908                return 1;
4909
4910        /*
4911         * In VMX non-root operation, when the VMCS-link pointer is -1ull,
4912         * any VMWRITE sets the ALU flags for VMfailInvalid.
4913         */
4914        if (vmx->nested.current_vmptr == -1ull ||
4915            (is_guest_mode(vcpu) &&
4916             get_vmcs12(vcpu)->vmcs_link_pointer == -1ull))
4917                return nested_vmx_failInvalid(vcpu);
4918
4919        if (instr_info & BIT(10)) {
4920                value = kvm_register_readl(vcpu, (((instr_info) >> 3) & 0xf));
4921        } else {
4922                len = is_64_bit_mode(vcpu) ? 8 : 4;
4923                if (get_vmx_mem_address(vcpu, exit_qualification,
4924                                        instr_info, false, len, &gva))
4925                        return 1;
4926                if (kvm_read_guest_virt(vcpu, gva, &value, len, &e)) {
4927                        kvm_inject_page_fault(vcpu, &e);
4928                        return 1;
4929                }
4930        }
4931
4932        field = kvm_register_readl(vcpu, (((instr_info) >> 28) & 0xf));
4933
4934        offset = vmcs_field_to_offset(field);
4935        if (offset < 0)
4936                return nested_vmx_failValid(vcpu,
4937                        VMXERR_UNSUPPORTED_VMCS_COMPONENT);
4938
4939        /*
4940         * If the vCPU supports "VMWRITE to any supported field in the
4941         * VMCS," then the "read-only" fields are actually read/write.
4942         */
4943        if (vmcs_field_readonly(field) &&
4944            !nested_cpu_has_vmwrite_any_field(vcpu))
4945                return nested_vmx_failValid(vcpu,
4946                        VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
4947
4948        /*
4949         * Ensure vmcs12 is up-to-date before any VMWRITE that dirties
4950         * vmcs12, else we may clobber a field or consume a stale value.
4951         */
4952        if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field))
4953                copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
4954
4955        /*
4956         * Some Intel CPUs intentionally drop the reserved bits of the AR byte
4957         * fields on VMWRITE.  Emulate this behavior to ensure consistent KVM
4958         * behavior regardless of the underlying hardware, e.g. if an AR_BYTE
4959         * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD
4960         * from L1 will return a different value than VMREAD from L2 (L1 sees
4961         * the stripped down value, L2 sees the full value as stored by KVM).
4962         */
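            /*
             * The 0x1f0ff mask keeps bits 7:0 and 16:12 of the access-rights
             * field and clears the reserved bits 11:8 (as well as everything
             * above the "unusable" bit).
             */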
4963        if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES)
4964                value &= 0x1f0ff;
4965
4966        vmcs12_write_any(vmcs12, field, offset, value);
4967
4968        /*
4969         * Do not track vmcs12 dirty-state if in guest-mode as we actually
4970         * dirty shadow vmcs12 instead of vmcs12.  Fields that can be updated
4971         * by L1 without a vmexit are always updated in the vmcs02, i.e. don't
4972         * "dirty" vmcs12, all others go down the prepare_vmcs02() slow path.
4973         */
4974        if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) {
4975                /*
4976                 * L1 can read these fields without exiting, ensure the
4977                 * shadow VMCS is up-to-date.
4978                 */
4979                if (enable_shadow_vmcs && is_shadow_field_ro(field)) {
4980                        preempt_disable();
4981                        vmcs_load(vmx->vmcs01.shadow_vmcs);
4982
4983                        __vmcs_writel(field, value);
4984
4985                        vmcs_clear(vmx->vmcs01.shadow_vmcs);
4986                        vmcs_load(vmx->loaded_vmcs->vmcs);
4987                        preempt_enable();
4988                }
4989                vmx->nested.dirty_vmcs12 = true;
4990        }
4991
4992        return nested_vmx_succeed(vcpu);
4993}
4994
4995static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
4996{
4997        vmx->nested.current_vmptr = vmptr;
4998        if (enable_shadow_vmcs) {
4999                secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
5000                vmcs_write64(VMCS_LINK_POINTER,
5001                             __pa(vmx->vmcs01.shadow_vmcs));
5002                vmx->nested.need_vmcs12_to_shadow_sync = true;
5003        }
5004        vmx->nested.dirty_vmcs12 = true;
5005}
5006
5007/* Emulate the VMPTRLD instruction */
5008static int handle_vmptrld(struct kvm_vcpu *vcpu)
5009{
5010        struct vcpu_vmx *vmx = to_vmx(vcpu);
5011        gpa_t vmptr;
5012
5013        if (!nested_vmx_check_permission(vcpu))
5014                return 1;
5015
5016        if (nested_vmx_get_vmptr(vcpu, &vmptr))
5017                return 1;
5018
5019        if (!page_address_valid(vcpu, vmptr))
5020                return nested_vmx_failValid(vcpu,
5021                        VMXERR_VMPTRLD_INVALID_ADDRESS);
5022
5023        if (vmptr == vmx->nested.vmxon_ptr)
5024                return nested_vmx_failValid(vcpu,
5025                        VMXERR_VMPTRLD_VMXON_POINTER);
5026
5027        /* Forbid normal VMPTRLD if Enlightened version was used */
5028        if (vmx->nested.hv_evmcs)
5029                return 1;
5030
5031        if (vmx->nested.current_vmptr != vmptr) {
5032                struct kvm_host_map map;
5033                struct vmcs12 *new_vmcs12;
5034
5035                if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmptr), &map)) {
5036                        /*
5037                         * Reads from an unbacked page return all 1s,
5038                         * which means that the 32 bits located at the
5039                         * given physical address won't match the required
5040                         * VMCS12_REVISION identifier.
5041                         */
5042                        return nested_vmx_failValid(vcpu,
5043                                VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
5044                }
5045
5046                new_vmcs12 = map.hva;
5047
5048                if (new_vmcs12->hdr.revision_id != VMCS12_REVISION ||
5049                    (new_vmcs12->hdr.shadow_vmcs &&
5050                     !nested_cpu_has_vmx_shadow_vmcs(vcpu))) {
5051                        kvm_vcpu_unmap(vcpu, &map, false);
5052                        return nested_vmx_failValid(vcpu,
5053                                VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
5054                }
5055
5056                nested_release_vmcs12(vcpu);
5057
5058                /*
5059                 * Load VMCS12 from guest memory since it is not already
5060                 * cached.
5061                 */
5062                memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE);
5063                kvm_vcpu_unmap(vcpu, &map, false);
5064
5065                set_current_vmptr(vmx, vmptr);
5066        }
5067
5068        return nested_vmx_succeed(vcpu);
5069}
5070
5071/* Emulate the VMPTRST instruction */
5072static int handle_vmptrst(struct kvm_vcpu *vcpu)
5073{
5074        unsigned long exit_qual = vmcs_readl(EXIT_QUALIFICATION);
5075        u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5076        gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr;
5077        struct x86_exception e;
5078        gva_t gva;
5079
5080        if (!nested_vmx_check_permission(vcpu))
5081                return 1;
5082
5083        if (unlikely(to_vmx(vcpu)->nested.hv_evmcs))
5084                return 1;
5085
5086        if (get_vmx_mem_address(vcpu, exit_qual, instr_info,
5087                                true, sizeof(gpa_t), &gva))
5088                return 1;
5089        /* *_system ok, nested_vmx_check_permission has verified cpl=0 */
5090        if (kvm_write_guest_virt_system(vcpu, gva, (void *)&current_vmptr,
5091                                        sizeof(gpa_t), &e)) {
5092                kvm_inject_page_fault(vcpu, &e);
5093                return 1;
5094        }
5095        return nested_vmx_succeed(vcpu);
5096}
5097
5098/* Emulate the INVEPT instruction */
5099static int handle_invept(struct kvm_vcpu *vcpu)
5100{
5101        struct vcpu_vmx *vmx = to_vmx(vcpu);
5102        u32 vmx_instruction_info, types;
5103        unsigned long type;
5104        gva_t gva;
5105        struct x86_exception e;
5106        struct {
5107                u64 eptp, gpa;
5108        } operand;
5109
5110        if (!(vmx->nested.msrs.secondary_ctls_high &
5111              SECONDARY_EXEC_ENABLE_EPT) ||
5112            !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) {
5113                kvm_queue_exception(vcpu, UD_VECTOR);
5114                return 1;
5115        }
5116
5117        if (!nested_vmx_check_permission(vcpu))
5118                return 1;
5119
5120        vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5121        type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
5122
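            /*
             * Only the single-context (1) and all-context (2) INVEPT types
             * exist; the "& 6" keeps exactly those two bits of the shifted
             * capability field.
             */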
5123        types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
5124
5125        if (type >= 32 || !(types & (1 << type)))
5126                return nested_vmx_failValid(vcpu,
5127                                VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
5128
5129        /* According to the Intel VMX instruction reference, the memory
5130         * operand is read even if it isn't needed (e.g., for type==global)
5131         */
5132        if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
5133                        vmx_instruction_info, false, sizeof(operand), &gva))
5134                return 1;
5135        if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
5136                kvm_inject_page_fault(vcpu, &e);
5137                return 1;
5138        }
5139
5140        switch (type) {
5141        case VMX_EPT_EXTENT_GLOBAL:
5142        case VMX_EPT_EXTENT_CONTEXT:
5143        /*
5144         * TODO: Sync the necessary shadow EPT roots here, rather than
5145         * at the next emulated VM-entry.
5146         */
5147                break;
5148        default:
5149                BUG_ON(1);
5150                break;
5151        }
5152
5153        return nested_vmx_succeed(vcpu);
5154}
5155
5156static int handle_invvpid(struct kvm_vcpu *vcpu)
5157{
5158        struct vcpu_vmx *vmx = to_vmx(vcpu);
5159        u32 vmx_instruction_info;
5160        unsigned long type, types;
5161        gva_t gva;
5162        struct x86_exception e;
5163        struct {
5164                u64 vpid;
5165                u64 gla;
5166        } operand;
5167        u16 vpid02;
5168
5169        if (!(vmx->nested.msrs.secondary_ctls_high &
5170              SECONDARY_EXEC_ENABLE_VPID) ||
5171            !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) {
5172                kvm_queue_exception(vcpu, UD_VECTOR);
5173                return 1;
5174        }
5175
5176        if (!nested_vmx_check_permission(vcpu))
5177                return 1;
5178
5179        vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5180        type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
5181
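        /*
         * The VPID extent-support flags sit at bits 8..11 of vpid_caps
         * (see VMX_VPID_EXTENT_SUPPORTED_MASK), so after the ">> 8" below
         * bit n of 'types' corresponds to INVVPID type n: individual
         * address, single-context, all-context, single-context retaining
         * globals.
         */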
5182        types = (vmx->nested.msrs.vpid_caps &
5183                        VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8;
5184
5185        if (type >= 32 || !(types & (1 << type)))
5186                return nested_vmx_failValid(vcpu,
5187                        VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
5188
5189        /* According to the Intel VMX instruction reference, the memory
5190         * operand is read even if it isn't needed (e.g., for type==global)
5191         */
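        /*
         * INVVPID descriptor layout: the VPID in bits 15:0 of the first
         * u64 (bits 63:16 are reserved and must be zero, hence the
         * ">> 16" check further down) and the linear address in the
         * second u64.
         */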
5192        if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
5193                        vmx_instruction_info, false, sizeof(operand), &gva))
5194                return 1;
5195        if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
5196                kvm_inject_page_fault(vcpu, &e);
5197                return 1;
5198        }
5199        if (operand.vpid >> 16)
5200                return nested_vmx_failValid(vcpu,
5201                        VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
5202
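        /*
         * All of the flushes below operate on vpid02, the VPID used on
         * the hardware while L2 runs.  When a fine-grained invalidation
         * isn't supported, flushing that entire VPID context is a safe
         * over-approximation of what L1 asked for.
         */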
5203        vpid02 = nested_get_vpid02(vcpu);
5204        switch (type) {
5205        case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
5206                if (!operand.vpid ||
5207                    is_noncanonical_address(operand.gla, vcpu))
5208                        return nested_vmx_failValid(vcpu,
5209                                VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
5210                if (cpu_has_vmx_invvpid_individual_addr()) {
5211                        __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR,
5212                                vpid02, operand.gla);
5213                } else
5214                        __vmx_flush_tlb(vcpu, vpid02, false);
5215                break;
5216        case VMX_VPID_EXTENT_SINGLE_CONTEXT:
5217        case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
5218                if (!operand.vpid)
5219                        return nested_vmx_failValid(vcpu,
5220                                VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
5221                __vmx_flush_tlb(vcpu, vpid02, false);
5222                break;
5223        case VMX_VPID_EXTENT_ALL_CONTEXT:
5224                __vmx_flush_tlb(vcpu, vpid02, false);
5225                break;
5226        default:
5227                WARN_ON_ONCE(1);
5228                return kvm_skip_emulated_instruction(vcpu);
5229        }
5230
5231        return nested_vmx_succeed(vcpu);
5232}
5233
5234static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
5235                                     struct vmcs12 *vmcs12)
5236{
5237        u32 index = kvm_rcx_read(vcpu);
5238        u64 address;
5239        bool accessed_dirty;
5240        struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
5241
5242        if (!nested_cpu_has_eptp_switching(vmcs12) ||
5243            !nested_cpu_has_ept(vmcs12))
5244                return 1;
5245
5246        if (index >= VMFUNC_EPTP_ENTRIES)
5247                return 1;
5248
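        /*
         * The EPTP list is a page-sized array of 8-byte EPTP values in L1
         * memory; entry 'index' starts at byte offset index * 8, which is
         * what the read below fetches.
         */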
5250        if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT,
5251                                     &address, index * 8, 8))
5252                return 1;
5253
5254        accessed_dirty = !!(address & VMX_EPTP_AD_ENABLE_BIT);
5255
5256        /*
5257         * If the (L2) guest does a vmfunc to the currently
5258         * active ept pointer, we don't have to do anything else
5259         */
5260        if (vmcs12->ept_pointer != address) {
5261                if (!valid_ept_address(vcpu, address))
5262                        return 1;
5263
5264                kvm_mmu_unload(vcpu);
5265                mmu->ept_ad = accessed_dirty;
5266                mmu->mmu_role.base.ad_disabled = !accessed_dirty;
5267                vmcs12->ept_pointer = address;
5268                /*
5269                 * TODO: Check what's the correct approach in case
5270                 * mmu reload fails. Currently, we just let the next
5271                 * reload potentially fail
5272                 */
5273                kvm_mmu_reload(vcpu);
5274        }
5275
5276        return 0;
5277}
5278
5279static int handle_vmfunc(struct kvm_vcpu *vcpu)
5280{
5281        struct vcpu_vmx *vmx = to_vmx(vcpu);
5282        struct vmcs12 *vmcs12;
5283        u32 function = kvm_rax_read(vcpu);
5284
5285        /*
5286         * VMFUNC is only supported for nested guests, but we always enable the
5287         * secondary control for simplicity; for non-nested mode, fake that we
5288         * didn't by injecting #UD.
5289         */
5290        if (!is_guest_mode(vcpu)) {
5291                kvm_queue_exception(vcpu, UD_VECTOR);
5292                return 1;
5293        }
5294
5295        vmcs12 = get_vmcs12(vcpu);
5296        if (function >= 64 || !(vmcs12->vm_function_control & BIT_ULL(function)))
5297                goto fail;
5298
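        /*
         * Bit 0 of the VM-function controls enables EPTP switching, i.e.
         * function 0; it is the only VM function handled here.
         */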
5299        switch (function) {
5300        case 0:
5301                if (nested_vmx_eptp_switching(vcpu, vmcs12))
5302                        goto fail;
5303                break;
5304        default:
5305                goto fail;
5306        }
5307        return kvm_skip_emulated_instruction(vcpu);
5308
5309fail:
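        /*
         * A VM function that fails its checks in VMX non-root operation
         * causes a VMFUNC VM-exit rather than a fault, so reflect the
         * current exit state to L1 and let it sort things out.
         */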
5310        nested_vmx_vmexit(vcpu, vmx->exit_reason,
5311                          vmcs_read32(VM_EXIT_INTR_INFO),
5312                          vmcs_readl(EXIT_QUALIFICATION));
5313        return 1;
5314}
5315
5316/*
5317 * Return true if an IO instruction with the specified port and size should cause
5318 * a VM-exit into L1.
5319 */
5320bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port,
5321                                 int size)
5322{
5323        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5324        gpa_t bitmap, last_bitmap;
5325        u8 b;
5326
5327        last_bitmap = (gpa_t)-1;
5328        b = -1;
5329
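        /*
         * vmcs12 provides two 4K I/O bitmaps with one bit per port:
         * io_bitmap_a covers ports 0x0000-0x7fff and io_bitmap_b covers
         * 0x8000-0xffff.  A multi-byte access exits if any of the ports
         * it touches has its bit set, hence the loop over 'size'.
         */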
5330        while (size > 0) {
5331                if (port < 0x8000)
5332                        bitmap = vmcs12->io_bitmap_a;
5333                else if (port < 0x10000)
5334                        bitmap = vmcs12->io_bitmap_b;
5335                else
5336                        return true;
5337                bitmap += (port & 0x7fff) / 8;
5338
5339                if (last_bitmap != bitmap)
5340                        if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1))
5341                                return true;
5342                if (b & (1 << (port & 7)))
5343                        return true;
5344
5345                port++;
5346                size--;
5347                last_bitmap = bitmap;
5348        }
5349
5350        return false;
5351}
5352
5353static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
5354                                       struct vmcs12 *vmcs12)
5355{
5356        unsigned long exit_qualification;
5357        unsigned short port;
5358        int size;
5359
5360        if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
5361                return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
5362
5363        exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5364
5365        port = exit_qualification >> 16;
5366        size = (exit_qualification & 7) + 1;
5367
5368        return nested_vmx_check_io_bitmaps(vcpu, port, size);
5369}
5370
5371/*
5372 * Return true if we should exit from L2 to L1 to handle an MSR access,
5373 * rather than handle it ourselves in L0. I.e., check whether L1 expressed
5374 * disinterest in the current event (read or write a specific MSR) by using an
5375 * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps.
5376 */
5377static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
5378        struct vmcs12 *vmcs12, u32 exit_reason)
5379{
5380        u32 msr_index = kvm_rcx_read(vcpu);
5381        gpa_t bitmap;
5382
5383        if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
5384                return true;
5385
5386        /*
5387         * The MSR_BITMAP page is divided into four 1024-byte bitmaps,
5388         * for the four combinations of read/write and low/high MSR numbers.
5389         * First we need to figure out which of the four to use:
5390         */
5391        bitmap = vmcs12->msr_bitmap;
5392        if (exit_reason == EXIT_REASON_MSR_WRITE)
5393                bitmap += 2048;
5394        if (msr_index >= 0xc0000000) {
5395                msr_index -= 0xc0000000;
5396                bitmap += 1024;
5397        }
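        /*
         * Example (assuming MSR_LSTAR == 0xc0000082): a WRMSR to LSTAR by
         * L2 selects the write/high quadrant, so the byte fetched below is
         * at bitmap + 2048 + 1024 + (0x82 / 8) and the bit tested is
         * 0x82 & 7, i.e. bit 2.
         */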
5398
5399        /* Then read the msr_index'th bit from this bitmap: */
5400        if (msr_index < 1024*8) {
5401                unsigned char b;
5402                if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1))
5403                        return true;
5404                return 1 & (b >> (msr_index & 7));
5405        } else
5406                return true; /* let L1 handle the wrong parameter */
5407}
5408
5409/*
5410 * Return true if we should exit from L2 to L1 to handle a CR access exit,
5411 * rather than handle it ourselves in L0. I.e., check if L1 wanted to
5412 * intercept (via guest_host_mask etc.) the current event.
5413 */
5414static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
5415        struct vmcs12 *vmcs12)
5416{
5417        unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5418        int cr = exit_qualification & 15;
5419        int reg;
5420        unsigned long val;
5421
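        /*
         * Exit qualification layout for CR accesses: bits 3:0 hold the CR
         * number, bits 5:4 the access type (0 = MOV to CR, 1 = MOV from
         * CR, 2 = CLTS, 3 = LMSW), bits 11:8 the GPR involved and bits
         * 31:16 the LMSW source data.
         */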
5422        switch ((exit_qualification >> 4) & 3) {
5423        case 0: /* mov to cr */
5424                reg = (exit_qualification >> 8) & 15;
5425                val = kvm_register_readl(vcpu, reg);
5426                switch (cr) {
5427                case 0:
5428                        if (vmcs12->cr0_guest_host_mask &
5429                            (val ^ vmcs12->cr0_read_shadow))
5430                                return true;
5431                        break;
5432                case 3:
5433                        if ((vmcs12->cr3_target_count >= 1 &&
5434                                        vmcs12->cr3_target_value0 == val) ||
5435                                (vmcs12->cr3_target_count >= 2 &&
5436                                        vmcs12->cr3_target_value1 == val) ||
5437                                (vmcs12->cr3_target_count >= 3 &&
5438                                        vmcs12->cr3_target_value2 == val) ||
5439                                (vmcs12->cr3_target_count >= 4 &&
5440                                        vmcs12->cr3_target_value3 == val))
5441                                return false;
5442                        if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
5443                                return true;
5444                        break;
5445                case 4:
5446                        if (vmcs12->cr4_guest_host_mask &
5447                            (vmcs12->cr4_read_shadow ^ val))
5448                                return true;
5449                        break;
5450                case 8:
5451                        if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING))
5452                                return true;
5453                        break;
5454                }
5455                break;
5456        case 2: /* clts */
5457                if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) &&
5458                    (vmcs12->cr0_read_shadow & X86_CR0_TS))
5459                        return true;
5460                break;
5461        case 1: /* mov from cr */
5462                switch (cr) {
5463                case 3:
5464                        if (vmcs12->cpu_based_vm_exec_control &
5465                            CPU_BASED_CR3_STORE_EXITING)
5466                                return true;
5467                        break;
5468                case 8:
5469                        if (vmcs12->cpu_based_vm_exec_control &
5470                            CPU_BASED_CR8_STORE_EXITING)
5471                                return true;
5472                        break;
5473                }
5474                break;
5475        case 3: /* lmsw */
5476                /*
5477                 * lmsw can change bits 1..3 of cr0, and only set bit 0 of
5478                 * cr0. Other attempted changes are ignored, with no exit.
5479                 */
5480                val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
5481                if (vmcs12->cr0_guest_host_mask & 0xe &
5482                    (val ^ vmcs12->cr0_read_shadow))
5483                        return true;
5484                if ((vmcs12->cr0_guest_host_mask & 0x1) &&
5485                    !(vmcs12->cr0_read_shadow & 0x1) &&
5486                    (val & 0x1))
5487                        return true;
5488                break;
5489        }
5490        return false;
5491}
5492
5493static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu,
5494        struct vmcs12 *vmcs12, gpa_t bitmap)
5495{
5496        u32 vmx_instruction_info;
5497        unsigned long field;
5498        u8 b;
5499
5500        if (!nested_cpu_has_shadow_vmcs(vmcs12))
5501                return true;
5502
5503        /* Decode instruction info and find the field to access */
5504        vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5505        field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
5506
5507        /* Out-of-range fields always cause a VM exit from L2 to L1 */
5508        if (field >> 15)
5509                return true;
5510
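        /*
         * The VMREAD/VMWRITE bitmap holds one bit per field encoding
         * (indexed by its low 15 bits); a set bit means the access is not
         * satisfied by the shadow VMCS and must exit to L1.
         */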
5511        if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1))
5512                return true;
5513
5514        return 1 & (b >> (field & 7));
5515}
5516
5517/*
5518 * Return true if we should exit from L2 to L1 to handle an exit, or false if we
5519 * should handle it ourselves in L0 (and then continue L2). Only call this
5520 * when in is_guest_mode (L2).
5521 */
5522bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
5523{
5524        u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
5525        struct vcpu_vmx *vmx = to_vmx(vcpu);
5526        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5527
5528        if (vmx->nested.nested_run_pending)
5529                return false;
5530
5531        if (unlikely(vmx->fail)) {
5532                trace_kvm_nested_vmenter_failed(
5533                        "hardware VM-instruction error: ",
5534                        vmcs_read32(VM_INSTRUCTION_ERROR));
5535                return true;
5536        }
5537
5538        /*
5539         * The host physical addresses of some pages of guest memory
5540         * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
5541         * Page). The CPU may write to these pages via their host
5542         * physical address while L2 is running, bypassing any
5543         * address-translation-based dirty tracking (e.g. EPT write
5544         * protection).
5545         *
5546         * Mark them dirty on every exit from L2 to prevent them from
5547         * getting out of sync with dirty tracking.
5548         */
5549        nested_mark_vmcs12_pages_dirty(vcpu);
5550
5551        trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason,
5552                                vmcs_readl(EXIT_QUALIFICATION),
5553                                vmx->idt_vectoring_info,
5554                                intr_info,
5555                                vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
5556                                KVM_ISA_VMX);
5557
5558        switch (exit_reason) {
5559        case EXIT_REASON_EXCEPTION_NMI:
5560                if (is_nmi(intr_info))
5561                        return false;
5562                else if (is_page_fault(intr_info))
5563                        return !vmx->vcpu.arch.apf.host_apf_reason && enable_ept;
5564                else if (is_debug(intr_info) &&
5565                         vcpu->guest_debug &
5566                         (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
5567                        return false;
5568                else if (is_breakpoint(intr_info) &&
5569                         vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
5570                        return false;
5571                return vmcs12->exception_bitmap &
5572                                (1u << (intr_info & INTR_INFO_VECTOR_MASK));
5573        case EXIT_REASON_EXTERNAL_INTERRUPT:
5574                return false;
5575        case EXIT_REASON_TRIPLE_FAULT:
5576                return true;
5577        case EXIT_REASON_INTERRUPT_WINDOW:
5578                return nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING);
5579        case EXIT_REASON_NMI_WINDOW:
5580                return nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING);
5581        case EXIT_REASON_TASK_SWITCH:
5582                return true;
5583        case EXIT_REASON_CPUID:
5584                return true;
5585        case EXIT_REASON_HLT:
5586                return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
5587        case EXIT_REASON_INVD:
5588                return true;
5589        case EXIT_REASON_INVLPG:
5590                return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
5591        case EXIT_REASON_RDPMC:
5592                return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
5593        case EXIT_REASON_RDRAND:
5594                return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING);
5595        case EXIT_REASON_RDSEED:
5596                return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING);
5597        case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP:
5598                return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
5599        case EXIT_REASON_VMREAD:
5600                return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
5601                        vmcs12->vmread_bitmap);
5602        case EXIT_REASON_VMWRITE:
5603                return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
5604                        vmcs12->vmwrite_bitmap);
5605        case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
5606        case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
5607        case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME:
5608        case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
5609        case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID:
5610                /*
5611                 * VMX instructions trap unconditionally. This allows L1 to
5612                 * emulate them for its L2 guest, i.e., allows 3-level nesting!
5613                 */
5614                return true;
5615        case EXIT_REASON_CR_ACCESS:
5616                return nested_vmx_exit_handled_cr(vcpu, vmcs12);
5617        case EXIT_REASON_DR_ACCESS:
5618                return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
5619        case EXIT_REASON_IO_INSTRUCTION:
5620                return nested_vmx_exit_handled_io(vcpu, vmcs12);
5621        case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR:
5622                return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC);
5623        case EXIT_REASON_MSR_READ:
5624        case EXIT_REASON_MSR_WRITE:
5625                return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
5626        case EXIT_REASON_INVALID_STATE:
5627                return true;
5628        case EXIT_REASON_MWAIT_INSTRUCTION:
5629                return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
5630        case EXIT_REASON_MONITOR_TRAP_FLAG:
5631                return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_TRAP_FLAG);
5632        case EXIT_REASON_MONITOR_INSTRUCTION:
5633                return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING);
5634        case EXIT_REASON_PAUSE_INSTRUCTION:
5635                return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) ||
5636                        nested_cpu_has2(vmcs12,
5637                                SECONDARY_EXEC_PAUSE_LOOP_EXITING);
5638        case EXIT_REASON_MCE_DURING_VMENTRY:
5639                return false;
5640        case EXIT_REASON_TPR_BELOW_THRESHOLD:
5641                return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW);
5642        case EXIT_REASON_APIC_ACCESS:
5643        case EXIT_REASON_APIC_WRITE:
5644        case EXIT_REASON_EOI_INDUCED:
5645                /*
5646                 * The controls for "virtualize APIC accesses," "APIC-
5647                 * register virtualization," and "virtual-interrupt
5648                 * delivery" only come from vmcs12.
5649                 */
5650                return true;
5651        case EXIT_REASON_EPT_VIOLATION:
5652                /*
5653                 * L0 always deals with the EPT violation. If nested EPT is
5654                 * used, and the nested mmu code discovers that the address is
5655                 * missing in the guest EPT table (EPT12), the EPT violation
5656                 * will be injected with nested_ept_inject_page_fault()
5657                 */
5658                return false;
5659        case EXIT_REASON_EPT_MISCONFIG:
5660                /*
5661                 * L2 never uses directly L1's EPT, but rather L0's own EPT
5662                 * table (shadow on EPT) or a merged EPT table that L0 built
5663                 * (EPT on EPT). So any problems with the structure of the
5664                 * table is L0's fault.
5665                 */
5666                return false;
5667        case EXIT_REASON_INVPCID:
5668                return
5669                        nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) &&
5670                        nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
5671        case EXIT_REASON_WBINVD:
5672                return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
5673        case EXIT_REASON_XSETBV:
5674                return true;
5675        case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS:
5676                /*
5677                 * This should never happen, since it is not possible to
5678                 * set XSS to a non-zero value in either L1 or L2.  If it
5679                 * were possible, XSS would have to be checked against the
5680                 * XSS exit bitmap in vmcs12.
5681                 */
5682                return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
5683        case EXIT_REASON_PREEMPTION_TIMER:
5684                return false;
5685        case EXIT_REASON_PML_FULL:
5686                /* We emulate PML support to L1. */
5687                return false;
5688        case EXIT_REASON_VMFUNC:
5689                /* VM functions are emulated through L2->L0 vmexits. */
5690                return false;
5691        case EXIT_REASON_ENCLS:
5692                /* SGX is never exposed to L1 */
5693                return false;
5694        case EXIT_REASON_UMWAIT:
5695        case EXIT_REASON_TPAUSE:
5696                return nested_cpu_has2(vmcs12,
5697                        SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE);
5698        default:
5699                return true;
5700        }
5701}
5702
5703
5704static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
5705                                struct kvm_nested_state __user *user_kvm_nested_state,
5706                                u32 user_data_size)
5707{
5708        struct vcpu_vmx *vmx;
5709        struct vmcs12 *vmcs12;
5710        struct kvm_nested_state kvm_state = {
5711                .flags = 0,
5712                .format = KVM_STATE_NESTED_FORMAT_VMX,
5713                .size = sizeof(kvm_state),
5714                .hdr.vmx.vmxon_pa = -1ull,
5715                .hdr.vmx.vmcs12_pa = -1ull,
5716        };
5717        struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
5718                &user_kvm_nested_state->data.vmx[0];
5719
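        /*
         * A NULL vcpu is how the KVM_CAP_NESTED_STATE query asks for the
         * maximum amount of state that can be returned, so userspace can
         * size its buffer accordingly.
         */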
5720        if (!vcpu)
5721                return kvm_state.size + sizeof(*user_vmx_nested_state);
5722
5723        vmx = to_vmx(vcpu);
5724        vmcs12 = get_vmcs12(vcpu);
5725
5726        if (nested_vmx_allowed(vcpu) &&
5727            (vmx->nested.vmxon || vmx->nested.smm.vmxon)) {
5728                kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr;
5729                kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr;
5730
5731                if (vmx_has_valid_vmcs12(vcpu)) {
5732                        kvm_state.size += sizeof(user_vmx_nested_state->vmcs12);
5733
5734                        if (vmx->nested.hv_evmcs)
5735                                kvm_state.flags |= KVM_STATE_NESTED_EVMCS;
5736
5737                        if (is_guest_mode(vcpu) &&
5738                            nested_cpu_has_shadow_vmcs(vmcs12) &&
5739                            vmcs12->vmcs_link_pointer != -1ull)
5740                                kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12);
5741                }
5742
5743                if (vmx->nested.smm.vmxon)
5744                        kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON;
5745
5746                if (vmx->nested.smm.guest_mode)
5747                        kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE;
5748
5749                if (is_guest_mode(vcpu)) {
5750                        kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;
5751
5752                        if (vmx->nested.nested_run_pending)
5753                                kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;
5754
5755                        if (vmx->nested.mtf_pending)
5756                                kvm_state.flags |= KVM_STATE_NESTED_MTF_PENDING;
5757                }
5758        }
5759
5760        if (user_data_size < kvm_state.size)
5761                goto out;
5762
5763        if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state)))
5764                return -EFAULT;
5765
5766        if (!vmx_has_valid_vmcs12(vcpu))
5767                goto out;
5768
5769        /*
5770         * When running L2, the authoritative vmcs12 state is in the
5771         * vmcs02. When running L1, the authoritative vmcs12 state is
5772         * in the shadow or enlightened vmcs linked to vmcs01, unless
5773         * need_vmcs12_to_shadow_sync is set, in which case, the authoritative
5774         * vmcs12 state is in the vmcs12 already.
5775         */
5776        if (is_guest_mode(vcpu)) {
5777                sync_vmcs02_to_vmcs12(vcpu, vmcs12);
5778                sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
5779        } else if (!vmx->nested.need_vmcs12_to_shadow_sync) {
5780                if (vmx->nested.hv_evmcs)
5781                        copy_enlightened_to_vmcs12(vmx);
5782                else if (enable_shadow_vmcs)
5783                        copy_shadow_to_vmcs12(vmx);
5784        }
5785
5786        BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE);
5787        BUILD_BUG_ON(sizeof(user_vmx_nested_state->shadow_vmcs12) < VMCS12_SIZE);
5788
5789        /*
5790         * Copy over the full allocated size of vmcs12 rather than just the size
5791         * of the struct.
5792         */
5793        if (copy_to_user(user_vmx_nested_state->vmcs12, vmcs12, VMCS12_SIZE))
5794                return -EFAULT;
5795
5796        if (nested_cpu_has_shadow_vmcs(vmcs12) &&
5797            vmcs12->vmcs_link_pointer != -1ull) {
5798                if (copy_to_user(user_vmx_nested_state->shadow_vmcs12,
5799                                 get_shadow_vmcs12(vcpu), VMCS12_SIZE))
5800                        return -EFAULT;
5801        }
5802
5803out:
5804        return kvm_state.size;
5805}
5806
5807/*
5808 * Forcibly leave nested mode in order to be able to reset the VCPU later on.
5809 */
5810void vmx_leave_nested(struct kvm_vcpu *vcpu)
5811{
5812        if (is_guest_mode(vcpu)) {
5813                to_vmx(vcpu)->nested.nested_run_pending = 0;
5814                nested_vmx_vmexit(vcpu, -1, 0, 0);
5815        }
5816        free_nested(vcpu);
5817}
5818
5819static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
5820                                struct kvm_nested_state __user *user_kvm_nested_state,
5821                                struct kvm_nested_state *kvm_state)
5822{
5823        struct vcpu_vmx *vmx = to_vmx(vcpu);
5824        struct vmcs12 *vmcs12;
5825        u32 exit_qual;
5826        struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
5827                &user_kvm_nested_state->data.vmx[0];
5828        int ret;
5829
5830        if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX)
5831                return -EINVAL;
5832
5833        if (kvm_state->hdr.vmx.vmxon_pa == -1ull) {
5834                if (kvm_state->hdr.vmx.smm.flags)
5835                        return -EINVAL;
5836
5837                if (kvm_state->hdr.vmx.vmcs12_pa != -1ull)
5838                        return -EINVAL;
5839
5840                /*
5841                 * KVM_STATE_NESTED_EVMCS used to signal that KVM should
5842                 * enable the eVMCS capability on the vCPU. However, the code
5843                 * has since been changed such that the flag signals that
5844                 * vmcs12 should be copied into the eVMCS in guest memory.
5845                 *
5846                 * To preserve backwards compatibility, allow userspace
5847                 * to set this flag even when there is no VMXON region.
5848                 */
5849                if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS)
5850                        return -EINVAL;
5851        } else {
5852                if (!nested_vmx_allowed(vcpu))
5853                        return -EINVAL;
5854
5855                if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa))
5856                        return -EINVAL;
5857        }
5858
5859        if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
5860            (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
5861                return -EINVAL;
5862
5863        if (kvm_state->hdr.vmx.smm.flags &
5864            ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON))
5865                return -EINVAL;
5866
5867        /*
5868         * SMM temporarily disables VMX, so we cannot be in guest mode,
5869         * nor can VMLAUNCH/VMRESUME be pending.  Outside SMM, SMM flags
5870         * must be zero.
5871         */
5872        if (is_smm(vcpu) ?
5873                (kvm_state->flags &
5874                 (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING))
5875                : kvm_state->hdr.vmx.smm.flags)
5876                return -EINVAL;
5877
5878        if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
5879            !(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON))
5880                return -EINVAL;
5881
5882        if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) &&
5883            (!nested_vmx_allowed(vcpu) || !vmx->nested.enlightened_vmcs_enabled))
5884                return -EINVAL;
5885
5886        vmx_leave_nested(vcpu);
5887
5888        if (kvm_state->hdr.vmx.vmxon_pa == -1ull)
5889                return 0;
5890
5891        vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa;
5892        ret = enter_vmx_operation(vcpu);
5893        if (ret)
5894                return ret;
5895
5896        /* Empty 'VMXON' state is permitted */
5897        if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12))
5898                return 0;
5899
5900        if (kvm_state->hdr.vmx.vmcs12_pa != -1ull) {
5901                if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa ||
5902                    !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa))
5903                        return -EINVAL;
5904
5905                set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa);
5906        } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) {
5907                /*
5908                 * Sync eVMCS upon entry as we may not have
5909                 * HV_X64_MSR_VP_ASSIST_PAGE set up yet.
5910                 */
5911                vmx->nested.need_vmcs12_to_shadow_sync = true;
5912        } else {
5913                return -EINVAL;
5914        }
5915
5916        if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) {
5917                vmx->nested.smm.vmxon = true;
5918                vmx->nested.vmxon = false;
5919
5920                if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE)
5921                        vmx->nested.smm.guest_mode = true;
5922        }
5923
5924        vmcs12 = get_vmcs12(vcpu);
5925        if (copy_from_user(vmcs12, user_vmx_nested_state->vmcs12, sizeof(*vmcs12)))
5926                return -EFAULT;
5927
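        /*
         * The restored vmcs12 must carry KVM's synthetic revision id
         * (VMCS12_REVISION) rather than the hardware's VMCS revision.
         */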
5928        if (vmcs12->hdr.revision_id != VMCS12_REVISION)
5929                return -EINVAL;
5930
5931        if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
5932                return 0;
5933
5934        vmx->nested.nested_run_pending =
5935                !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
5936
5937        vmx->nested.mtf_pending =
5938                !!(kvm_state->flags & KVM_STATE_NESTED_MTF_PENDING);
5939
5940        ret = -EINVAL;
5941        if (nested_cpu_has_shadow_vmcs(vmcs12) &&
5942            vmcs12->vmcs_link_pointer != -1ull) {
5943                struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu);
5944
5945                if (kvm_state->size <
5946                    sizeof(*kvm_state) +
5947                    sizeof(user_vmx_nested_state->vmcs12) + sizeof(*shadow_vmcs12))
5948                        goto error_guest_mode;
5949
5950                if (copy_from_user(shadow_vmcs12,
5951                                   user_vmx_nested_state->shadow_vmcs12,
5952                                   sizeof(*shadow_vmcs12))) {
5953                        ret = -EFAULT;
5954                        goto error_guest_mode;
5955                }
5956
5957                if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION ||
5958                    !shadow_vmcs12->hdr.shadow_vmcs)
5959                        goto error_guest_mode;
5960        }
5961
5962        if (nested_vmx_check_controls(vcpu, vmcs12) ||
5963            nested_vmx_check_host_state(vcpu, vmcs12) ||
5964            nested_vmx_check_guest_state(vcpu, vmcs12, &exit_qual))
5965                goto error_guest_mode;
5966
5967        vmx->nested.dirty_vmcs12 = true;
5968        ret = nested_vmx_enter_non_root_mode(vcpu, false);
5969        if (ret)
5970                goto error_guest_mode;
5971
5972        return 0;
5973
5974error_guest_mode:
5975        vmx->nested.nested_run_pending = 0;
5976        return ret;
5977}
5978
5979void nested_vmx_set_vmcs_shadowing_bitmap(void)
5980{
5981        if (enable_shadow_vmcs) {
5982                vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
5983                vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
5984        }
5985}
5986
5987/*
5988 * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
5989 * returned for the various VMX controls MSRs when nested VMX is enabled.
5990 * The same values should also be used to verify that vmcs12 control fields are
5991 * valid during nested entry from L1 to L2.
5992 * Each of these control msrs has a low and high 32-bit half: A low bit is on
5993 * if the corresponding bit in the (32-bit) control field *must* be on, and a
5994 * bit in the high half is on if the corresponding bit in the control field
5995 * may be on. See also vmx_control_verify().
5996 */
5997void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps)
5998{
5999        /*
6000         * Note that as a general rule, the high half of the MSRs (bits in
6001         * the control fields which may be 1) should be initialized by the
6002         * intersection of the underlying hardware's MSR (i.e., features which
6003         * can be supported) and the list of features we want to expose -
6004         * because they are known to be properly supported in our code.
6005         * Also, usually, the low half of the MSRs (bits which must be 1) can
6006         * be set to 0, meaning that L1 may turn off any of these bits. The
6007         * reason is that if one of these bits is necessary, it will appear
6008         * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control
6009         * fields of vmcs01 and vmcs02, will turn these bits off - and
6010         * nested_vmx_exit_reflected() will not pass related exits to L1.
6011         * These rules have exceptions below.
6012         */
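        /*
         * For example, a control bit that is set in the high half but
         * clear in the low half (such as PIN_BASED_VMX_PREEMPTION_TIMER
         * below) is optional for L1, a bit set in both halves is
         * mandatory, and a bit clear in the high half must stay off.
         */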
6013
6014        /* pin-based controls */
6015        rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
6016                msrs->pinbased_ctls_low,
6017                msrs->pinbased_ctls_high);
6018        msrs->pinbased_ctls_low |=
6019                PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
6020        msrs->pinbased_ctls_high &=
6021                PIN_BASED_EXT_INTR_MASK |
6022                PIN_BASED_NMI_EXITING |
6023                PIN_BASED_VIRTUAL_NMIS |
6024                (enable_apicv ? PIN_BASED_POSTED_INTR : 0);
6025        msrs->pinbased_ctls_high |=
6026                PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
6027                PIN_BASED_VMX_PREEMPTION_TIMER;
6028
6029        /* exit controls */
6030        rdmsr(MSR_IA32_VMX_EXIT_CTLS,
6031                msrs->exit_ctls_low,
6032                msrs->exit_ctls_high);
6033        msrs->exit_ctls_low =
6034                VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
6035
6036        msrs->exit_ctls_high &=
6037#ifdef CONFIG_X86_64
6038                VM_EXIT_HOST_ADDR_SPACE_SIZE |
6039#endif
6040                VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
6041        msrs->exit_ctls_high |=
6042                VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
6043                VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
6044                VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
6045
6046        /* We support free control of debug control saving. */
6047        msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;
6048
6049        /* entry controls */
6050        rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
6051                msrs->entry_ctls_low,
6052                msrs->entry_ctls_high);
6053        msrs->entry_ctls_low =
6054                VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
6055        msrs->entry_ctls_high &=
6056#ifdef CONFIG_X86_64
6057                VM_ENTRY_IA32E_MODE |
6058#endif
6059                VM_ENTRY_LOAD_IA32_PAT;
6060        msrs->entry_ctls_high |=
6061                (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
6062
6063        /* We support free control of debug control loading. */
6064        msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
6065
6066        /* cpu-based controls */
6067        rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
6068                msrs->procbased_ctls_low,
6069                msrs->procbased_ctls_high);
6070        msrs->procbased_ctls_low =
6071                CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
6072        msrs->procbased_ctls_high &=
6073                CPU_BASED_INTR_WINDOW_EXITING |
6074                CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETTING |
6075                CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
6076                CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
6077                CPU_BASED_CR3_STORE_EXITING |
6078#ifdef CONFIG_X86_64
6079                CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
6080#endif
6081                CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
6082                CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG |
6083                CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING |
6084                CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING |
6085                CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
6086        /*
6087         * We can allow some features even when not supported by the
6088         * hardware. For example, L1 can specify an MSR bitmap - and we
6089         * can use it to avoid exits to L1 - even when L0 runs L2
6090         * without MSR bitmaps.
6091         */
6092        msrs->procbased_ctls_high |=
6093                CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
6094                CPU_BASED_USE_MSR_BITMAPS;
6095
6096        /* We support free control of CR3 access interception. */
6097        msrs->procbased_ctls_low &=
6098                ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
6099
6100        /*
6101         * secondary cpu-based controls.  Do not include those that
6102         * depend on CPUID bits, they are added later by vmx_cpuid_update.
6103         */
6104        if (msrs->procbased_ctls_high & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)
6105                rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
6106                      msrs->secondary_ctls_low,
6107                      msrs->secondary_ctls_high);
6108
6109        msrs->secondary_ctls_low = 0;
6110        msrs->secondary_ctls_high &=
6111                SECONDARY_EXEC_DESC |
6112                SECONDARY_EXEC_RDTSCP |
6113                SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
6114                SECONDARY_EXEC_WBINVD_EXITING |
6115                SECONDARY_EXEC_APIC_REGISTER_VIRT |
6116                SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
6117                SECONDARY_EXEC_RDRAND_EXITING |
6118                SECONDARY_EXEC_ENABLE_INVPCID |
6119                SECONDARY_EXEC_RDSEED_EXITING |
6120                SECONDARY_EXEC_XSAVES;
6121
6122        /*
6123         * We can emulate "VMCS shadowing," even if the hardware
6124         * doesn't support it.
6125         */
6126        msrs->secondary_ctls_high |=
6127                SECONDARY_EXEC_SHADOW_VMCS;
6128
6129        if (enable_ept) {
6130                /* nested EPT: emulate EPT also to L1 */
6131                msrs->secondary_ctls_high |=
6132                        SECONDARY_EXEC_ENABLE_EPT;
6133                msrs->ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
6134                         VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT;
6135                if (cpu_has_vmx_ept_execute_only())
6136                        msrs->ept_caps |=
6137                                VMX_EPT_EXECUTE_ONLY_BIT;
6138                msrs->ept_caps &= ept_caps;
6139                msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
6140                        VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
6141                        VMX_EPT_1GB_PAGE_BIT;
6142                if (enable_ept_ad_bits) {
6143                        msrs->secondary_ctls_high |=
6144                                SECONDARY_EXEC_ENABLE_PML;
6145                        msrs->ept_caps |= VMX_EPT_AD_BIT;
6146                }
6147        }
6148
6149        if (cpu_has_vmx_vmfunc()) {
6150                msrs->secondary_ctls_high |=
6151                        SECONDARY_EXEC_ENABLE_VMFUNC;
6152                /*
6153                 * Advertise EPTP switching unconditionally
6154                 * since we emulate it
6155                 */
6156                if (enable_ept)
6157                        msrs->vmfunc_controls =
6158                                VMX_VMFUNC_EPTP_SWITCHING;
6159        }
6160
6161        /*
6162         * Old versions of KVM use the single-context version without
6163         * checking for support, so declare that it is supported even
6164         * though it is treated as global context.  The alternative is
6165         * not failing an unadvertised single-context invvpid, which is worse.
6166         */
6167        if (enable_vpid) {
6168                msrs->secondary_ctls_high |=
6169                        SECONDARY_EXEC_ENABLE_VPID;
6170                msrs->vpid_caps = VMX_VPID_INVVPID_BIT |
6171                        VMX_VPID_EXTENT_SUPPORTED_MASK;
6172        }
6173
6174        if (enable_unrestricted_guest)
6175                msrs->secondary_ctls_high |=
6176                        SECONDARY_EXEC_UNRESTRICTED_GUEST;
6177
6178        if (flexpriority_enabled)
6179                msrs->secondary_ctls_high |=
6180                        SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
6181
6182        /* miscellaneous data */
6183        rdmsr(MSR_IA32_VMX_MISC,
6184                msrs->misc_low,
6185                msrs->misc_high);
6186        msrs->misc_low &= VMX_MISC_SAVE_EFER_LMA;
6187        msrs->misc_low |=
6188                MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
6189                VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
6190                VMX_MISC_ACTIVITY_HLT;
6191        msrs->misc_high = 0;
6192
6193        /*
6194         * This MSR reports some information about VMX support. We
6195         * should return information about the VMX we emulate for the
6196         * guest, and the VMCS structure we give it - not about the
6197         * VMX support of the underlying hardware.
6198         */
6199        msrs->basic =
6200                VMCS12_REVISION |
6201                VMX_BASIC_TRUE_CTLS |
6202                ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
6203                (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
6204
6205        if (cpu_has_vmx_basic_inout())
6206                msrs->basic |= VMX_BASIC_INOUT;
6207
6208        /*
6209         * These MSRs specify bits which the guest must keep fixed on
6210         * while L1 is in VMXON mode (in L1's root mode, or running an L2).
6211         * We picked the standard core2 setting.
6212         */
6213#define VMXON_CR0_ALWAYSON     (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
6214#define VMXON_CR4_ALWAYSON     X86_CR4_VMXE
6215        msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON;
6216        msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON;
6217
6218        /* These MSRs specify bits which the guest must keep fixed off. */
6219        rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1);
6220        rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1);
6221
6222        /* highest index: VMX_PREEMPTION_TIMER_VALUE */
6223        msrs->vmcs_enum = VMCS12_MAX_FIELD_INDEX << 1;
6224}
6225
6226void nested_vmx_hardware_unsetup(void)
6227{
6228        int i;
6229
6230        if (enable_shadow_vmcs) {
6231                for (i = 0; i < VMX_BITMAP_NR; i++)
6232                        free_page((unsigned long)vmx_bitmap[i]);
6233        }
6234}
6235
6236__init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
6237{
6238        int i;
6239
6240        if (!cpu_has_vmx_shadow_vmcs())
6241                enable_shadow_vmcs = 0;
6242        if (enable_shadow_vmcs) {
6243                for (i = 0; i < VMX_BITMAP_NR; i++) {
6244                        /*
6245                         * The vmx_bitmap is not tied to a VM and so should
6246                         * not be charged to a memcg.
6247                         */
6248                        vmx_bitmap[i] = (unsigned long *)
6249                                __get_free_page(GFP_KERNEL);
6250                        if (!vmx_bitmap[i]) {
6251                                nested_vmx_hardware_unsetup();
6252                                return -ENOMEM;
6253                        }
6254                }
6255
6256                init_vmcs_shadow_fields();
6257        }
6258
6259        exit_handlers[EXIT_REASON_VMCLEAR]      = handle_vmclear;
6260        exit_handlers[EXIT_REASON_VMLAUNCH]     = handle_vmlaunch;
6261        exit_handlers[EXIT_REASON_VMPTRLD]      = handle_vmptrld;
6262        exit_handlers[EXIT_REASON_VMPTRST]      = handle_vmptrst;
6263        exit_handlers[EXIT_REASON_VMREAD]       = handle_vmread;
6264        exit_handlers[EXIT_REASON_VMRESUME]     = handle_vmresume;
6265        exit_handlers[EXIT_REASON_VMWRITE]      = handle_vmwrite;
6266        exit_handlers[EXIT_REASON_VMOFF]        = handle_vmoff;
6267        exit_handlers[EXIT_REASON_VMON]         = handle_vmon;
6268        exit_handlers[EXIT_REASON_INVEPT]       = handle_invept;
6269        exit_handlers[EXIT_REASON_INVVPID]      = handle_invvpid;
6270        exit_handlers[EXIT_REASON_VMFUNC]       = handle_vmfunc;
6271
6272        kvm_x86_ops->check_nested_events = vmx_check_nested_events;
6273        kvm_x86_ops->get_nested_state = vmx_get_nested_state;
6274        kvm_x86_ops->set_nested_state = vmx_set_nested_state;
6275        kvm_x86_ops->get_vmcs12_pages = nested_get_vmcs12_pages;
6276        kvm_x86_ops->nested_enable_evmcs = nested_enable_evmcs;
6277        kvm_x86_ops->nested_get_evmcs_version = nested_get_evmcs_version;
6278
6279        return 0;
6280}
6281