linux/arch/x86/include/asm/kvm_host.h
   1/* SPDX-License-Identifier: GPL-2.0-only */
   2/*
   3 * Kernel-based Virtual Machine driver for Linux
   4 *
   5 * This header defines architecture specific interfaces, x86 version
   6 */
   7
   8#ifndef _ASM_X86_KVM_HOST_H
   9#define _ASM_X86_KVM_HOST_H
  10
  11#include <linux/types.h>
  12#include <linux/mm.h>
  13#include <linux/mmu_notifier.h>
  14#include <linux/tracepoint.h>
  15#include <linux/cpumask.h>
  16#include <linux/irq_work.h>
  17#include <linux/irq.h>
  18
  19#include <linux/kvm.h>
  20#include <linux/kvm_para.h>
  21#include <linux/kvm_types.h>
  22#include <linux/perf_event.h>
  23#include <linux/pvclock_gtod.h>
  24#include <linux/clocksource.h>
  25#include <linux/irqbypass.h>
  26#include <linux/hyperv.h>
  27
  28#include <asm/apic.h>
  29#include <asm/pvclock-abi.h>
  30#include <asm/desc.h>
  31#include <asm/mtrr.h>
  32#include <asm/msr-index.h>
  33#include <asm/asm.h>
  34#include <asm/kvm_page_track.h>
  35#include <asm/kvm_vcpu_regs.h>
  36#include <asm/hyperv-tlfs.h>
  37
  38#define __KVM_HAVE_ARCH_VCPU_DEBUGFS
  39
  40#define KVM_MAX_VCPUS 1024
  41
  42/*
  43 * In x86, the VCPU ID corresponds to the APIC ID, and APIC IDs
  44 * might be larger than the actual number of VCPUs because the
  45 * APIC ID encodes CPU topology information.
  46 *
  47 * In the worst case, we'll need less than one extra bit for the
  48 * Core ID, and less than one extra bit for the Package (Die) ID,
   49 * so a ratio of 4 should be enough.
  50 */
  51#define KVM_VCPU_ID_RATIO 4
  52#define KVM_MAX_VCPU_IDS (KVM_MAX_VCPUS * KVM_VCPU_ID_RATIO)
  53
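/*
 * Worked example (editorial illustration, not part of the original
 * header): with KVM_MAX_VCPUS == 1024 and KVM_VCPU_ID_RATIO == 4,
 *
 *   KVM_MAX_VCPU_IDS == 1024 * 4 == 4096
 *
 * so sparse APIC IDs produced by the topology encoding are accepted up
 * to 4095 even though at most 1024 vCPUs can be created.
 */
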
  54/* memory slots that are not exposed to userspace */
  55#define KVM_PRIVATE_MEM_SLOTS 3
  56
  57#define KVM_HALT_POLL_NS_DEFAULT 200000
  58
  59#define KVM_IRQCHIP_NUM_PINS  KVM_IOAPIC_NUM_PINS
  60
  61#define KVM_DIRTY_LOG_MANUAL_CAPS   (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \
  62                                        KVM_DIRTY_LOG_INITIALLY_SET)
  63
  64#define KVM_BUS_LOCK_DETECTION_VALID_MODE       (KVM_BUS_LOCK_DETECTION_OFF | \
  65                                                 KVM_BUS_LOCK_DETECTION_EXIT)
  66
  67/* x86-specific vcpu->requests bit members */
  68#define KVM_REQ_MIGRATE_TIMER           KVM_ARCH_REQ(0)
  69#define KVM_REQ_REPORT_TPR_ACCESS       KVM_ARCH_REQ(1)
  70#define KVM_REQ_TRIPLE_FAULT            KVM_ARCH_REQ(2)
  71#define KVM_REQ_MMU_SYNC                KVM_ARCH_REQ(3)
  72#define KVM_REQ_CLOCK_UPDATE            KVM_ARCH_REQ(4)
  73#define KVM_REQ_LOAD_MMU_PGD            KVM_ARCH_REQ(5)
  74#define KVM_REQ_EVENT                   KVM_ARCH_REQ(6)
  75#define KVM_REQ_APF_HALT                KVM_ARCH_REQ(7)
  76#define KVM_REQ_STEAL_UPDATE            KVM_ARCH_REQ(8)
  77#define KVM_REQ_NMI                     KVM_ARCH_REQ(9)
  78#define KVM_REQ_PMU                     KVM_ARCH_REQ(10)
  79#define KVM_REQ_PMI                     KVM_ARCH_REQ(11)
  80#define KVM_REQ_SMI                     KVM_ARCH_REQ(12)
  81#define KVM_REQ_MASTERCLOCK_UPDATE      KVM_ARCH_REQ(13)
  82#define KVM_REQ_MCLOCK_INPROGRESS \
  83        KVM_ARCH_REQ_FLAGS(14, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
  84#define KVM_REQ_SCAN_IOAPIC \
  85        KVM_ARCH_REQ_FLAGS(15, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
  86#define KVM_REQ_GLOBAL_CLOCK_UPDATE     KVM_ARCH_REQ(16)
  87#define KVM_REQ_APIC_PAGE_RELOAD \
  88        KVM_ARCH_REQ_FLAGS(17, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
  89#define KVM_REQ_HV_CRASH                KVM_ARCH_REQ(18)
  90#define KVM_REQ_IOAPIC_EOI_EXIT         KVM_ARCH_REQ(19)
  91#define KVM_REQ_HV_RESET                KVM_ARCH_REQ(20)
  92#define KVM_REQ_HV_EXIT                 KVM_ARCH_REQ(21)
  93#define KVM_REQ_HV_STIMER               KVM_ARCH_REQ(22)
  94#define KVM_REQ_LOAD_EOI_EXITMAP        KVM_ARCH_REQ(23)
  95#define KVM_REQ_GET_NESTED_STATE_PAGES  KVM_ARCH_REQ(24)
  96#define KVM_REQ_APICV_UPDATE \
  97        KVM_ARCH_REQ_FLAGS(25, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
  98#define KVM_REQ_TLB_FLUSH_CURRENT       KVM_ARCH_REQ(26)
  99#define KVM_REQ_TLB_FLUSH_GUEST \
 100        KVM_ARCH_REQ_FLAGS(27, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
 101#define KVM_REQ_APF_READY               KVM_ARCH_REQ(28)
 102#define KVM_REQ_MSR_FILTER_CHANGED      KVM_ARCH_REQ(29)
 103#define KVM_REQ_UPDATE_CPU_DIRTY_LOGGING \
 104        KVM_ARCH_REQ_FLAGS(30, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
 105
 106#define CR0_RESERVED_BITS                                               \
 107        (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
 108                          | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
 109                          | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
 110
 111#define CR4_RESERVED_BITS                                               \
 112        (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
 113                          | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE     \
 114                          | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_PCIDE \
 115                          | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_FSGSBASE \
 116                          | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_VMXE \
 117                          | X86_CR4_SMAP | X86_CR4_PKE | X86_CR4_UMIP))
 118
 119#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
 120
 121
 122
 123#define INVALID_PAGE (~(hpa_t)0)
 124#define VALID_PAGE(x) ((x) != INVALID_PAGE)
 125
 126#define UNMAPPED_GVA (~(gpa_t)0)
 127#define INVALID_GPA (~(gpa_t)0)
 128
 129/* KVM Hugepage definitions for x86 */
 130#define KVM_MAX_HUGEPAGE_LEVEL  PG_LEVEL_1G
 131#define KVM_NR_PAGE_SIZES       (KVM_MAX_HUGEPAGE_LEVEL - PG_LEVEL_4K + 1)
 132#define KVM_HPAGE_GFN_SHIFT(x)  (((x) - 1) * 9)
 133#define KVM_HPAGE_SHIFT(x)      (PAGE_SHIFT + KVM_HPAGE_GFN_SHIFT(x))
 134#define KVM_HPAGE_SIZE(x)       (1UL << KVM_HPAGE_SHIFT(x))
 135#define KVM_HPAGE_MASK(x)       (~(KVM_HPAGE_SIZE(x) - 1))
 136#define KVM_PAGES_PER_HPAGE(x)  (KVM_HPAGE_SIZE(x) / PAGE_SIZE)
 137
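/*
 * Worked expansion (editorial illustration): assuming PAGE_SHIFT == 12,
 * PG_LEVEL_4K == 1, PG_LEVEL_2M == 2 and PG_LEVEL_1G == 3, the macros
 * above evaluate to:
 *
 *   KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M) ==  9       KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G) == 18
 *   KVM_HPAGE_SHIFT(PG_LEVEL_2M)     == 21       KVM_HPAGE_SHIFT(PG_LEVEL_1G)     == 30
 *   KVM_HPAGE_SIZE(PG_LEVEL_2M)      == 2 MiB    KVM_HPAGE_SIZE(PG_LEVEL_1G)      == 1 GiB
 *   KVM_PAGES_PER_HPAGE(PG_LEVEL_2M) == 512      KVM_PAGES_PER_HPAGE(PG_LEVEL_1G) == 262144
 */
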
 138#define KVM_MEMSLOT_PAGES_TO_MMU_PAGES_RATIO 50
 139#define KVM_MIN_ALLOC_MMU_PAGES 64UL
 140#define KVM_MMU_HASH_SHIFT 12
 141#define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT)
 142#define KVM_MIN_FREE_MMU_PAGES 5
 143#define KVM_REFILL_PAGES 25
 144#define KVM_MAX_CPUID_ENTRIES 256
 145#define KVM_NR_FIXED_MTRR_REGION 88
 146#define KVM_NR_VAR_MTRR 8
 147
 148#define ASYNC_PF_PER_VCPU 64
 149
 150enum kvm_reg {
 151        VCPU_REGS_RAX = __VCPU_REGS_RAX,
 152        VCPU_REGS_RCX = __VCPU_REGS_RCX,
 153        VCPU_REGS_RDX = __VCPU_REGS_RDX,
 154        VCPU_REGS_RBX = __VCPU_REGS_RBX,
 155        VCPU_REGS_RSP = __VCPU_REGS_RSP,
 156        VCPU_REGS_RBP = __VCPU_REGS_RBP,
 157        VCPU_REGS_RSI = __VCPU_REGS_RSI,
 158        VCPU_REGS_RDI = __VCPU_REGS_RDI,
 159#ifdef CONFIG_X86_64
 160        VCPU_REGS_R8  = __VCPU_REGS_R8,
 161        VCPU_REGS_R9  = __VCPU_REGS_R9,
 162        VCPU_REGS_R10 = __VCPU_REGS_R10,
 163        VCPU_REGS_R11 = __VCPU_REGS_R11,
 164        VCPU_REGS_R12 = __VCPU_REGS_R12,
 165        VCPU_REGS_R13 = __VCPU_REGS_R13,
 166        VCPU_REGS_R14 = __VCPU_REGS_R14,
 167        VCPU_REGS_R15 = __VCPU_REGS_R15,
 168#endif
 169        VCPU_REGS_RIP,
 170        NR_VCPU_REGS,
 171
 172        VCPU_EXREG_PDPTR = NR_VCPU_REGS,
 173        VCPU_EXREG_CR0,
 174        VCPU_EXREG_CR3,
 175        VCPU_EXREG_CR4,
 176        VCPU_EXREG_RFLAGS,
 177        VCPU_EXREG_SEGMENTS,
 178        VCPU_EXREG_EXIT_INFO_1,
 179        VCPU_EXREG_EXIT_INFO_2,
 180};
 181
 182enum {
 183        VCPU_SREG_ES,
 184        VCPU_SREG_CS,
 185        VCPU_SREG_SS,
 186        VCPU_SREG_DS,
 187        VCPU_SREG_FS,
 188        VCPU_SREG_GS,
 189        VCPU_SREG_TR,
 190        VCPU_SREG_LDTR,
 191};
 192
 193enum exit_fastpath_completion {
 194        EXIT_FASTPATH_NONE,
 195        EXIT_FASTPATH_REENTER_GUEST,
 196        EXIT_FASTPATH_EXIT_HANDLED,
 197};
 198typedef enum exit_fastpath_completion fastpath_t;
 199
 200struct x86_emulate_ctxt;
 201struct x86_exception;
 202enum x86_intercept;
 203enum x86_intercept_stage;
 204
 205#define KVM_NR_DB_REGS  4
 206
 207#define DR6_BUS_LOCK   (1 << 11)
 208#define DR6_BD          (1 << 13)
 209#define DR6_BS          (1 << 14)
 210#define DR6_BT          (1 << 15)
 211#define DR6_RTM         (1 << 16)
 212/*
 213 * DR6_ACTIVE_LOW combines fixed-1 and active-low bits.
  214 * We can regard all the bits in DR6_FIXED_1 as active-low bits;
  215 * they will never be 0 for now, but if more active-low bits are
  216 * defined in the future, no code change will be required.
 217 *
 218 * DR6_ACTIVE_LOW is also used as the init/reset value for DR6.
 219 */
 220#define DR6_ACTIVE_LOW  0xffff0ff0
 221#define DR6_VOLATILE    0x0001e80f
 222#define DR6_FIXED_1     (DR6_ACTIVE_LOW & ~DR6_VOLATILE)
 223
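/*
 * Worked expansion (editorial illustration) of DR6_FIXED_1 from the
 * definitions above:
 *
 *   DR6_ACTIVE_LOW & ~DR6_VOLATILE
 *     == 0xffff0ff0 & ~0x0001e80f
 *     == 0xffff0ff0 &  0xfffe17f0
 *     == 0xfffe07f0
 *
 * i.e. the volatile (guest-controlled) bits are cleared from the
 * active-low mask, leaving only the bits that always read as 1.
 */
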
 224#define DR7_BP_EN_MASK  0x000000ff
 225#define DR7_GE          (1 << 9)
 226#define DR7_GD          (1 << 13)
 227#define DR7_FIXED_1     0x00000400
 228#define DR7_VOLATILE    0xffff2bff
 229
 230#define KVM_GUESTDBG_VALID_MASK \
 231        (KVM_GUESTDBG_ENABLE | \
 232        KVM_GUESTDBG_SINGLESTEP | \
 233        KVM_GUESTDBG_USE_HW_BP | \
 234        KVM_GUESTDBG_USE_SW_BP | \
 235        KVM_GUESTDBG_INJECT_BP | \
 236        KVM_GUESTDBG_INJECT_DB | \
 237        KVM_GUESTDBG_BLOCKIRQ)
 238
 239
 240#define PFERR_PRESENT_BIT 0
 241#define PFERR_WRITE_BIT 1
 242#define PFERR_USER_BIT 2
 243#define PFERR_RSVD_BIT 3
 244#define PFERR_FETCH_BIT 4
 245#define PFERR_PK_BIT 5
 246#define PFERR_SGX_BIT 15
 247#define PFERR_GUEST_FINAL_BIT 32
 248#define PFERR_GUEST_PAGE_BIT 33
 249
 250#define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT)
 251#define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT)
 252#define PFERR_USER_MASK (1U << PFERR_USER_BIT)
 253#define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT)
 254#define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT)
 255#define PFERR_PK_MASK (1U << PFERR_PK_BIT)
 256#define PFERR_SGX_MASK (1U << PFERR_SGX_BIT)
 257#define PFERR_GUEST_FINAL_MASK (1ULL << PFERR_GUEST_FINAL_BIT)
 258#define PFERR_GUEST_PAGE_MASK (1ULL << PFERR_GUEST_PAGE_BIT)
 259
 260#define PFERR_NESTED_GUEST_PAGE (PFERR_GUEST_PAGE_MASK |        \
 261                                 PFERR_WRITE_MASK |             \
 262                                 PFERR_PRESENT_MASK)
 263
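/*
 * Illustrative decoding (editorial addition): a hardware page fault
 * error code of 0x7 breaks down, per the PFERR_* bits above, as
 *
 *   0x7 == PFERR_PRESENT_MASK | PFERR_WRITE_MASK | PFERR_USER_MASK
 *
 * i.e. a user-mode write to a present page was denied.  Likewise,
 * PFERR_NESTED_GUEST_PAGE expands to
 *
 *   (1ULL << 33) | (1U << 1) | (1U << 0)
 *
 * marking a write to a present page that was encountered while the
 * hardware walked the guest's own page tables.
 */
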
 264/* apic attention bits */
 265#define KVM_APIC_CHECK_VAPIC    0
 266/*
 267 * The following bit is set with PV-EOI, unset on EOI.
 268 * We detect PV-EOI changes by guest by comparing
 269 * this bit with PV-EOI in guest memory.
 270 * See the implementation in apic_update_pv_eoi.
 271 */
 272#define KVM_APIC_PV_EOI_PENDING 1
 273
 274struct kvm_kernel_irq_routing_entry;
 275
 276/*
 277 * kvm_mmu_page_role tracks the properties of a shadow page (where shadow page
 278 * also includes TDP pages) to determine whether or not a page can be used in
 279 * the given MMU context.  This is a subset of the overall kvm_mmu_role to
 280 * minimize the size of kvm_memory_slot.arch.gfn_track, i.e. allows allocating
 281 * 2 bytes per gfn instead of 4 bytes per gfn.
 282 *
 283 * Indirect upper-level shadow pages are tracked for write-protection via
  284 * gfn_track.  As above, gfn_track is a 16-bit counter, so KVM must not create
  285 * more than 2^16-1 upper-level shadow pages at a single gfn, otherwise
  286 * gfn_track will overflow and explosions will ensue.
 287 *
 288 * A unique shadow page (SP) for a gfn is created if and only if an existing SP
 289 * cannot be reused.  The ability to reuse a SP is tracked by its role, which
 290 * incorporates various mode bits and properties of the SP.  Roughly speaking,
 291 * the number of unique SPs that can theoretically be created is 2^n, where n
 292 * is the number of bits that are used to compute the role.
 293 *
 294 * But, even though there are 19 bits in the mask below, not all combinations
 295 * of modes and flags are possible:
 296 *
 297 *   - invalid shadow pages are not accounted, so the bits are effectively 18
 298 *
 299 *   - quadrant will only be used if has_4_byte_gpte=1 (non-PAE paging);
 300 *     execonly and ad_disabled are only used for nested EPT which has
 301 *     has_4_byte_gpte=0.  Therefore, 2 bits are always unused.
 302 *
 303 *   - the 4 bits of level are effectively limited to the values 2/3/4/5,
 304 *     as 4k SPs are not tracked (allowed to go unsync).  In addition non-PAE
 305 *     paging has exactly one upper level, making level completely redundant
 306 *     when has_4_byte_gpte=1.
 307 *
 308 *   - on top of this, smep_andnot_wp and smap_andnot_wp are only set if
 309 *     cr0_wp=0, therefore these three bits only give rise to 5 possibilities.
 310 *
 311 * Therefore, the maximum number of possible upper-level shadow pages for a
 312 * single gfn is a bit less than 2^13.
 313 */
 314union kvm_mmu_page_role {
 315        u32 word;
 316        struct {
 317                unsigned level:4;
 318                unsigned has_4_byte_gpte:1;
 319                unsigned quadrant:2;
 320                unsigned direct:1;
 321                unsigned access:3;
 322                unsigned invalid:1;
 323                unsigned efer_nx:1;
 324                unsigned cr0_wp:1;
 325                unsigned smep_andnot_wp:1;
 326                unsigned smap_andnot_wp:1;
 327                unsigned ad_disabled:1;
 328                unsigned guest_mode:1;
 329                unsigned :6;
 330
 331                /*
 332                 * This is left at the top of the word so that
 333                 * kvm_memslots_for_spte_role can extract it with a
 334                 * simple shift.  While there is room, give it a whole
 335                 * byte so it is also faster to load it from memory.
 336                 */
 337                unsigned smm:8;
 338        };
 339};
 340
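/*
 * Minimal sketch (editorial addition; hypothetical helper, not the real
 * kvm_memslots_for_spte_role() implementation): with the bit layout
 * above, the role bits plus the 6 unused bits fill bits 0-23, so the
 * SMM byte can indeed be extracted with a single shift.
 */
static inline unsigned int example_page_role_smm(union kvm_mmu_page_role role)
{
        return role.word >> 24; /* equivalent to reading role.smm */
}
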
 341/*
 342 * kvm_mmu_extended_role complements kvm_mmu_page_role, tracking properties
  343 * relevant to the current MMU configuration.  When loading CR0, CR4, or EFER,
  344 * including on nested transitions, if nothing in the full role changes then
  345 * MMU re-configuration can be skipped.  The @valid bit is set on first usage
  346 * so we don't treat an all-zero structure as valid data.
 347 *
 348 * The properties that are tracked in the extended role but not the page role
 349 * are for things that either (a) do not affect the validity of the shadow page
 350 * or (b) are indirectly reflected in the shadow page's role.  For example,
 351 * CR4.PKE only affects permission checks for software walks of the guest page
 352 * tables (because KVM doesn't support Protection Keys with shadow paging), and
 353 * CR0.PG, CR4.PAE, and CR4.PSE are indirectly reflected in role.level.
 354 *
 355 * Note, SMEP and SMAP are not redundant with sm*p_andnot_wp in the page role.
 356 * If CR0.WP=1, KVM can reuse shadow pages for the guest regardless of SMEP and
 357 * SMAP, but the MMU's permission checks for software walks need to be SMEP and
 358 * SMAP aware regardless of CR0.WP.
 359 */
 360union kvm_mmu_extended_role {
 361        u32 word;
 362        struct {
 363                unsigned int valid:1;
 364                unsigned int execonly:1;
 365                unsigned int cr0_pg:1;
 366                unsigned int cr4_pae:1;
 367                unsigned int cr4_pse:1;
 368                unsigned int cr4_pke:1;
 369                unsigned int cr4_smap:1;
 370                unsigned int cr4_smep:1;
 371                unsigned int cr4_la57:1;
 372                unsigned int efer_lma:1;
 373        };
 374};
 375
 376union kvm_mmu_role {
 377        u64 as_u64;
 378        struct {
 379                union kvm_mmu_page_role base;
 380                union kvm_mmu_extended_role ext;
 381        };
 382};
 383
 384struct kvm_rmap_head {
 385        unsigned long val;
 386};
 387
 388struct kvm_pio_request {
 389        unsigned long linear_rip;
 390        unsigned long count;
 391        int in;
 392        int port;
 393        int size;
 394};
 395
 396#define PT64_ROOT_MAX_LEVEL 5
 397
 398struct rsvd_bits_validate {
 399        u64 rsvd_bits_mask[2][PT64_ROOT_MAX_LEVEL];
 400        u64 bad_mt_xwr;
 401};
 402
 403struct kvm_mmu_root_info {
 404        gpa_t pgd;
 405        hpa_t hpa;
 406};
 407
 408#define KVM_MMU_ROOT_INFO_INVALID \
 409        ((struct kvm_mmu_root_info) { .pgd = INVALID_PAGE, .hpa = INVALID_PAGE })
 410
 411#define KVM_MMU_NUM_PREV_ROOTS 3
 412
 413#define KVM_HAVE_MMU_RWLOCK
 414
 415struct kvm_mmu_page;
 416struct kvm_page_fault;
 417
 418/*
 419 * x86 supports 4 paging modes (5-level 64-bit, 4-level 64-bit, 3-level 32-bit,
 420 * and 2-level 32-bit).  The kvm_mmu structure abstracts the details of the
 421 * current mmu mode.
 422 */
 423struct kvm_mmu {
 424        unsigned long (*get_guest_pgd)(struct kvm_vcpu *vcpu);
 425        u64 (*get_pdptr)(struct kvm_vcpu *vcpu, int index);
 426        int (*page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
 427        void (*inject_page_fault)(struct kvm_vcpu *vcpu,
 428                                  struct x86_exception *fault);
 429        gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 430                            gpa_t gva_or_gpa, u32 access,
 431                            struct x86_exception *exception);
 432        int (*sync_page)(struct kvm_vcpu *vcpu,
 433                         struct kvm_mmu_page *sp);
 434        void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa);
 435        hpa_t root_hpa;
 436        gpa_t root_pgd;
 437        union kvm_mmu_role mmu_role;
 438        u8 root_level;
 439        u8 shadow_root_level;
 440        u8 ept_ad;
 441        bool direct_map;
 442        struct kvm_mmu_root_info prev_roots[KVM_MMU_NUM_PREV_ROOTS];
 443
 444        /*
 445         * Bitmap; bit set = permission fault
 446         * Byte index: page fault error code [4:1]
 447         * Bit index: pte permissions in ACC_* format
 448         */
 449        u8 permissions[16];
 450
  451        /*
  452         * The pkru_mask indicates if protection key checks are needed.  It
  453         * consists of 16 domains indexed by page fault error code bits [4:1],
  454         * with PFEC.RSVD replaced by ACC_USER_MASK from the page tables.
  455         * Each domain has 2 bits which are ANDed with AD and WD from PKRU.
  456         */
 457        u32 pkru_mask;
 458
 459        u64 *pae_root;
 460        u64 *pml4_root;
 461        u64 *pml5_root;
 462
 463        /*
  464         * Check zero bits on shadow page table entries.  These
  465         * bits include not only hardware reserved bits but also
  466         * bits that the SPTE never uses.
 467         */
 468        struct rsvd_bits_validate shadow_zero_check;
 469
 470        struct rsvd_bits_validate guest_rsvd_check;
 471
 472        u64 pdptrs[4]; /* pae */
 473};
 474
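/*
 * Minimal sketch (editorial addition; hypothetical helper, not the
 * kernel's permission_fault()): how the permissions[] bitmap documented
 * above is indexed.  The byte index comes from page fault error code
 * bits [4:1] and the bit index is the PTE's ACC_* style access mask.
 */
static inline bool example_permission_denied(struct kvm_mmu *mmu,
                                             u32 pfec, unsigned int pte_access)
{
        return (mmu->permissions[(pfec >> 1) & 0xf] >> pte_access) & 1;
}
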
 475struct kvm_tlb_range {
 476        u64 start_gfn;
 477        u64 pages;
 478};
 479
 480enum pmc_type {
 481        KVM_PMC_GP = 0,
 482        KVM_PMC_FIXED,
 483};
 484
 485struct kvm_pmc {
 486        enum pmc_type type;
 487        u8 idx;
 488        u64 counter;
 489        u64 eventsel;
 490        struct perf_event *perf_event;
 491        struct kvm_vcpu *vcpu;
 492        /*
 493         * eventsel value for general purpose counters,
 494         * ctrl value for fixed counters.
 495         */
 496        u64 current_config;
 497        bool is_paused;
 498        bool intr;
 499};
 500
 501struct kvm_pmu {
 502        unsigned nr_arch_gp_counters;
 503        unsigned nr_arch_fixed_counters;
 504        unsigned available_event_types;
 505        u64 fixed_ctr_ctrl;
 506        u64 global_ctrl;
 507        u64 global_status;
 508        u64 counter_bitmask[2];
 509        u64 global_ctrl_mask;
 510        u64 global_ovf_ctrl_mask;
 511        u64 reserved_bits;
 512        u8 version;
 513        struct kvm_pmc gp_counters[INTEL_PMC_MAX_GENERIC];
 514        struct kvm_pmc fixed_counters[INTEL_PMC_MAX_FIXED];
 515        struct irq_work irq_work;
 516        DECLARE_BITMAP(reprogram_pmi, X86_PMC_IDX_MAX);
 517        DECLARE_BITMAP(all_valid_pmc_idx, X86_PMC_IDX_MAX);
 518        DECLARE_BITMAP(pmc_in_use, X86_PMC_IDX_MAX);
 519
 520        /*
  521         * Gate for releasing perf_events that are not marked in
  522         * pmc_in_use, at most once per vCPU time slice.
 523         */
 524        bool need_cleanup;
 525
 526        /*
  527         * The total number of programmed perf_events; it helps to avoid a
  528         * redundant check before cleanup if the guest doesn't use the vPMU at all.
 529         */
 530        u8 event_count;
 531};
 532
 533struct kvm_pmu_ops;
 534
 535enum {
 536        KVM_DEBUGREG_BP_ENABLED = 1,
 537        KVM_DEBUGREG_WONT_EXIT = 2,
 538};
 539
 540struct kvm_mtrr_range {
 541        u64 base;
 542        u64 mask;
 543        struct list_head node;
 544};
 545
 546struct kvm_mtrr {
 547        struct kvm_mtrr_range var_ranges[KVM_NR_VAR_MTRR];
 548        mtrr_type fixed_ranges[KVM_NR_FIXED_MTRR_REGION];
 549        u64 deftype;
 550
 551        struct list_head head;
 552};
 553
 554/* Hyper-V SynIC timer */
 555struct kvm_vcpu_hv_stimer {
 556        struct hrtimer timer;
 557        int index;
 558        union hv_stimer_config config;
 559        u64 count;
 560        u64 exp_time;
 561        struct hv_message msg;
 562        bool msg_pending;
 563};
 564
  565/* Hyper-V synthetic interrupt controller (SynIC) */
 566struct kvm_vcpu_hv_synic {
 567        u64 version;
 568        u64 control;
 569        u64 msg_page;
 570        u64 evt_page;
 571        atomic64_t sint[HV_SYNIC_SINT_COUNT];
 572        atomic_t sint_to_gsi[HV_SYNIC_SINT_COUNT];
 573        DECLARE_BITMAP(auto_eoi_bitmap, 256);
 574        DECLARE_BITMAP(vec_bitmap, 256);
 575        bool active;
 576        bool dont_zero_synic_pages;
 577};
 578
 579/* Hyper-V per vcpu emulation context */
 580struct kvm_vcpu_hv {
 581        struct kvm_vcpu *vcpu;
 582        u32 vp_index;
 583        u64 hv_vapic;
 584        s64 runtime_offset;
 585        struct kvm_vcpu_hv_synic synic;
 586        struct kvm_hyperv_exit exit;
 587        struct kvm_vcpu_hv_stimer stimer[HV_SYNIC_STIMER_COUNT];
 588        DECLARE_BITMAP(stimer_pending_bitmap, HV_SYNIC_STIMER_COUNT);
 589        bool enforce_cpuid;
 590        struct {
 591                u32 features_eax; /* HYPERV_CPUID_FEATURES.EAX */
 592                u32 features_ebx; /* HYPERV_CPUID_FEATURES.EBX */
 593                u32 features_edx; /* HYPERV_CPUID_FEATURES.EDX */
 594                u32 enlightenments_eax; /* HYPERV_CPUID_ENLIGHTMENT_INFO.EAX */
 595                u32 enlightenments_ebx; /* HYPERV_CPUID_ENLIGHTMENT_INFO.EBX */
 596                u32 syndbg_cap_eax; /* HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES.EAX */
 597        } cpuid_cache;
 598};
 599
 600/* Xen HVM per vcpu emulation context */
 601struct kvm_vcpu_xen {
 602        u64 hypercall_rip;
 603        u32 current_runstate;
 604        bool vcpu_info_set;
 605        bool vcpu_time_info_set;
 606        bool runstate_set;
 607        struct gfn_to_hva_cache vcpu_info_cache;
 608        struct gfn_to_hva_cache vcpu_time_info_cache;
 609        struct gfn_to_hva_cache runstate_cache;
 610        u64 last_steal;
 611        u64 runstate_entry_time;
 612        u64 runstate_times[4];
 613        unsigned long evtchn_pending_sel;
 614};
 615
 616struct kvm_vcpu_arch {
 617        /*
 618         * rip and regs accesses must go through
 619         * kvm_{register,rip}_{read,write} functions.
 620         */
 621        unsigned long regs[NR_VCPU_REGS];
 622        u32 regs_avail;
 623        u32 regs_dirty;
 624
 625        unsigned long cr0;
 626        unsigned long cr0_guest_owned_bits;
 627        unsigned long cr2;
 628        unsigned long cr3;
 629        unsigned long cr4;
 630        unsigned long cr4_guest_owned_bits;
 631        unsigned long cr4_guest_rsvd_bits;
 632        unsigned long cr8;
 633        u32 host_pkru;
 634        u32 pkru;
 635        u32 hflags;
 636        u64 efer;
 637        u64 apic_base;
 638        struct kvm_lapic *apic;    /* kernel irqchip context */
 639        bool apicv_active;
 640        bool load_eoi_exitmap_pending;
 641        DECLARE_BITMAP(ioapic_handled_vectors, 256);
 642        unsigned long apic_attention;
 643        int32_t apic_arb_prio;
 644        int mp_state;
 645        u64 ia32_misc_enable_msr;
 646        u64 smbase;
 647        u64 smi_count;
 648        bool tpr_access_reporting;
 649        bool xsaves_enabled;
 650        bool xfd_no_write_intercept;
 651        u64 ia32_xss;
 652        u64 microcode_version;
 653        u64 arch_capabilities;
 654        u64 perf_capabilities;
 655
 656        /*
 657         * Paging state of the vcpu
 658         *
  659         * If the vcpu runs in guest mode with two-level paging, this still saves
  660         * the paging mode of the L1 guest. This context is always used to
 661         * handle faults.
 662         */
 663        struct kvm_mmu *mmu;
 664
 665        /* Non-nested MMU for L1 */
 666        struct kvm_mmu root_mmu;
 667
 668        /* L1 MMU when running nested */
 669        struct kvm_mmu guest_mmu;
 670
 671        /*
 672         * Paging state of an L2 guest (used for nested npt)
 673         *
 674         * This context will save all necessary information to walk page tables
 675         * of an L2 guest. This context is only initialized for page table
  676         * walking and not for faulting since we never handle L2 page faults on
 677         * the host.
 678         */
 679        struct kvm_mmu nested_mmu;
 680
 681        /*
 682         * Pointer to the mmu context currently used for
 683         * gva_to_gpa translations.
 684         */
 685        struct kvm_mmu *walk_mmu;
 686
 687        struct kvm_mmu_memory_cache mmu_pte_list_desc_cache;
 688        struct kvm_mmu_memory_cache mmu_shadow_page_cache;
 689        struct kvm_mmu_memory_cache mmu_gfn_array_cache;
 690        struct kvm_mmu_memory_cache mmu_page_header_cache;
 691
 692        /*
 693         * QEMU userspace and the guest each have their own FPU state.
 694         * In vcpu_run, we switch between the user and guest FPU contexts.
 695         * While running a VCPU, the VCPU thread will have the guest FPU
 696         * context.
 697         *
 698         * Note that while the PKRU state lives inside the fpu registers,
 699         * it is switched out separately at VMENTER and VMEXIT time. The
 700         * "guest_fpstate" state here contains the guest FPU context, with the
 701         * host PRKU bits.
 702         */
 703        struct fpu_guest guest_fpu;
 704
 705        u64 xcr0;
 706
 707        struct kvm_pio_request pio;
 708        void *pio_data;
 709        void *sev_pio_data;
 710        unsigned sev_pio_count;
 711
 712        u8 event_exit_inst_len;
 713
 714        struct kvm_queued_exception {
 715                bool pending;
 716                bool injected;
 717                bool has_error_code;
 718                u8 nr;
 719                u32 error_code;
 720                unsigned long payload;
 721                bool has_payload;
 722                u8 nested_apf;
 723        } exception;
 724
 725        struct kvm_queued_interrupt {
 726                bool injected;
 727                bool soft;
 728                u8 nr;
 729        } interrupt;
 730
 731        int halt_request; /* real mode on Intel only */
 732
 733        int cpuid_nent;
 734        struct kvm_cpuid_entry2 *cpuid_entries;
 735        u32 kvm_cpuid_base;
 736
 737        u64 reserved_gpa_bits;
 738        int maxphyaddr;
 739
 740        /* emulate context */
 741
 742        struct x86_emulate_ctxt *emulate_ctxt;
 743        bool emulate_regs_need_sync_to_vcpu;
 744        bool emulate_regs_need_sync_from_vcpu;
 745        int (*complete_userspace_io)(struct kvm_vcpu *vcpu);
 746
 747        gpa_t time;
 748        struct pvclock_vcpu_time_info hv_clock;
 749        unsigned int hw_tsc_khz;
 750        struct gfn_to_hva_cache pv_time;
 751        bool pv_time_enabled;
 752        /* set guest stopped flag in pvclock flags field */
 753        bool pvclock_set_guest_stopped_request;
 754
 755        struct {
 756                u8 preempted;
 757                u64 msr_val;
 758                u64 last_steal;
 759                struct gfn_to_hva_cache cache;
 760        } st;
 761
 762        u64 l1_tsc_offset;
 763        u64 tsc_offset; /* current tsc offset */
 764        u64 last_guest_tsc;
 765        u64 last_host_tsc;
 766        u64 tsc_offset_adjustment;
 767        u64 this_tsc_nsec;
 768        u64 this_tsc_write;
 769        u64 this_tsc_generation;
 770        bool tsc_catchup;
 771        bool tsc_always_catchup;
 772        s8 virtual_tsc_shift;
 773        u32 virtual_tsc_mult;
 774        u32 virtual_tsc_khz;
 775        s64 ia32_tsc_adjust_msr;
 776        u64 msr_ia32_power_ctl;
 777        u64 l1_tsc_scaling_ratio;
 778        u64 tsc_scaling_ratio; /* current scaling ratio */
 779
 780        atomic_t nmi_queued;  /* unprocessed asynchronous NMIs */
 781        unsigned nmi_pending; /* NMI queued after currently running handler */
 782        bool nmi_injected;    /* Trying to inject an NMI this entry */
 783        bool smi_pending;    /* SMI queued after currently running handler */
 784        u8 handling_intr_from_guest;
 785
 786        struct kvm_mtrr mtrr_state;
 787        u64 pat;
 788
 789        unsigned switch_db_regs;
 790        unsigned long db[KVM_NR_DB_REGS];
 791        unsigned long dr6;
 792        unsigned long dr7;
 793        unsigned long eff_db[KVM_NR_DB_REGS];
 794        unsigned long guest_debug_dr7;
 795        u64 msr_platform_info;
 796        u64 msr_misc_features_enables;
 797
 798        u64 mcg_cap;
 799        u64 mcg_status;
 800        u64 mcg_ctl;
 801        u64 mcg_ext_ctl;
 802        u64 *mce_banks;
 803
 804        /* Cache MMIO info */
 805        u64 mmio_gva;
 806        unsigned mmio_access;
 807        gfn_t mmio_gfn;
 808        u64 mmio_gen;
 809
 810        struct kvm_pmu pmu;
 811
 812        /* used for guest single stepping over the given code position */
 813        unsigned long singlestep_rip;
 814
 815        bool hyperv_enabled;
 816        struct kvm_vcpu_hv *hyperv;
 817        struct kvm_vcpu_xen xen;
 818
 819        cpumask_var_t wbinvd_dirty_mask;
 820
 821        unsigned long last_retry_eip;
 822        unsigned long last_retry_addr;
 823
 824        struct {
 825                bool halted;
 826                gfn_t gfns[ASYNC_PF_PER_VCPU];
 827                struct gfn_to_hva_cache data;
 828                u64 msr_en_val; /* MSR_KVM_ASYNC_PF_EN */
 829                u64 msr_int_val; /* MSR_KVM_ASYNC_PF_INT */
 830                u16 vec;
 831                u32 id;
 832                bool send_user_only;
 833                u32 host_apf_flags;
 834                unsigned long nested_apf_token;
 835                bool delivery_as_pf_vmexit;
 836                bool pageready_pending;
 837        } apf;
 838
 839        /* OSVW MSRs (AMD only) */
 840        struct {
 841                u64 length;
 842                u64 status;
 843        } osvw;
 844
 845        struct {
 846                u64 msr_val;
 847                struct gfn_to_hva_cache data;
 848        } pv_eoi;
 849
 850        u64 msr_kvm_poll_control;
 851
 852        /*
 853         * Indicates the guest is trying to write a gfn that contains one or
 854         * more of the PTEs used to translate the write itself, i.e. the access
 855         * is changing its own translation in the guest page tables.  KVM exits
 856         * to userspace if emulation of the faulting instruction fails and this
 857         * flag is set, as KVM cannot make forward progress.
 858         *
 859         * If emulation fails for a write to guest page tables, KVM unprotects
 860         * (zaps) the shadow page for the target gfn and resumes the guest to
 861         * retry the non-emulatable instruction (on hardware).  Unprotecting the
 862         * gfn doesn't allow forward progress for a self-changing access because
 863         * doing so also zaps the translation for the gfn, i.e. retrying the
 864         * instruction will hit a !PRESENT fault, which results in a new shadow
 865         * page and sends KVM back to square one.
 866         */
 867        bool write_fault_to_shadow_pgtable;
 868
  869        /* exit qualification saved at the time of an EPT violation */
 870        unsigned long exit_qualification;
 871
 872        /* pv related host specific info */
 873        struct {
 874                bool pv_unhalted;
 875        } pv;
 876
 877        int pending_ioapic_eoi;
 878        int pending_external_vector;
 879
  880        /* set if the vCPU was preempted while running in kernel mode (CPL = 0) */
 881        bool preempted_in_kernel;
 882
 883        /* Flush the L1 Data cache for L1TF mitigation on VMENTER */
 884        bool l1tf_flush_l1d;
 885
 886        /* Host CPU on which VM-entry was most recently attempted */
 887        int last_vmentry_cpu;
 888
 889        /* AMD MSRC001_0015 Hardware Configuration */
 890        u64 msr_hwcr;
 891
 892        /* pv related cpuid info */
 893        struct {
 894                /*
 895                 * value of the eax register in the KVM_CPUID_FEATURES CPUID
 896                 * leaf.
 897                 */
 898                u32 features;
 899
 900                /*
 901                 * indicates whether pv emulation should be disabled if features
 902                 * are not present in the guest's cpuid
 903                 */
 904                bool enforce;
 905        } pv_cpuid;
 906
 907        /* Protected Guests */
 908        bool guest_state_protected;
 909
 910        /*
  911         * Set when the PDPTEs were loaded directly by userspace without
  912         * reading guest memory
 913         */
 914        bool pdptrs_from_userspace;
 915
 916#if IS_ENABLED(CONFIG_HYPERV)
 917        hpa_t hv_root_tdp;
 918#endif
 919};
 920
 921struct kvm_lpage_info {
 922        int disallow_lpage;
 923};
 924
 925struct kvm_arch_memory_slot {
 926        struct kvm_rmap_head *rmap[KVM_NR_PAGE_SIZES];
 927        struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1];
 928        unsigned short *gfn_track[KVM_PAGE_TRACK_MAX];
 929};
 930
 931/*
  932 * The mode is the number of bits allocated in the LDR for the
  933 * logical processor ID.  It happens that these are all powers of two.
  934 * This makes it very easy to detect cases where the APICs are
 935 * configured for multiple modes; in that case, we cannot use the map and
 936 * hence cannot use kvm_irq_delivery_to_apic_fast either.
 937 */
 938#define KVM_APIC_MODE_XAPIC_CLUSTER          4
 939#define KVM_APIC_MODE_XAPIC_FLAT             8
 940#define KVM_APIC_MODE_X2APIC                16
 941
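/*
 * Worked example (editorial illustration): because the three values
 * above are distinct powers of two, OR-ing together every mode seen
 * while building the map exposes mixed configurations, e.g.
 *
 *   KVM_APIC_MODE_XAPIC_FLAT | KVM_APIC_MODE_X2APIC == 8 | 16 == 24
 *
 * which is not a single power of two, so the map (and therefore
 * kvm_irq_delivery_to_apic_fast) cannot be used.
 */
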
 942struct kvm_apic_map {
 943        struct rcu_head rcu;
 944        u8 mode;
 945        u32 max_apic_id;
 946        union {
 947                struct kvm_lapic *xapic_flat_map[8];
 948                struct kvm_lapic *xapic_cluster_map[16][4];
 949        };
 950        struct kvm_lapic *phys_map[];
 951};
 952
  953/* Hyper-V synthetic debugger (SynDbg) */
 954struct kvm_hv_syndbg {
 955        struct {
 956                u64 control;
 957                u64 status;
 958                u64 send_page;
 959                u64 recv_page;
 960                u64 pending_page;
 961        } control;
 962        u64 options;
 963};
 964
 965/* Current state of Hyper-V TSC page clocksource */
 966enum hv_tsc_page_status {
 967        /* TSC page was not set up or disabled */
 968        HV_TSC_PAGE_UNSET = 0,
 969        /* TSC page MSR was written by the guest, update pending */
 970        HV_TSC_PAGE_GUEST_CHANGED,
 971        /* TSC page MSR was written by KVM userspace, update pending */
 972        HV_TSC_PAGE_HOST_CHANGED,
 973        /* TSC page was properly set up and is currently active  */
 974        HV_TSC_PAGE_SET,
 975        /* TSC page is currently being updated and therefore is inactive */
 976        HV_TSC_PAGE_UPDATING,
 977        /* TSC page was set up with an inaccessible GPA */
 978        HV_TSC_PAGE_BROKEN,
 979};
 980
 981/* Hyper-V emulation context */
 982struct kvm_hv {
 983        struct mutex hv_lock;
 984        u64 hv_guest_os_id;
 985        u64 hv_hypercall;
 986        u64 hv_tsc_page;
 987        enum hv_tsc_page_status hv_tsc_page_status;
 988
  989        /* Hyper-V based guest crash (NT kernel bugcheck) parameters */
 990        u64 hv_crash_param[HV_X64_MSR_CRASH_PARAMS];
 991        u64 hv_crash_ctl;
 992
 993        struct ms_hyperv_tsc_page tsc_ref;
 994
 995        struct idr conn_to_evt;
 996
 997        u64 hv_reenlightenment_control;
 998        u64 hv_tsc_emulation_control;
 999        u64 hv_tsc_emulation_status;
1000
1001        /* How many vCPUs have VP index != vCPU index */
1002        atomic_t num_mismatched_vp_indexes;
1003
1004        /*
 1005         * How many SynICs use the 'AutoEOI' feature
1006         * (protected by arch.apicv_update_lock)
1007         */
1008        unsigned int synic_auto_eoi_used;
1009
1010        struct hv_partition_assist_pg *hv_pa_pg;
1011        struct kvm_hv_syndbg hv_syndbg;
1012};
1013
1014struct msr_bitmap_range {
1015        u32 flags;
1016        u32 nmsrs;
1017        u32 base;
1018        unsigned long *bitmap;
1019};
1020
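/*
 * Minimal sketch (editorial addition; hypothetical helper, not kernel
 * API): an MSR belongs to a filter range if it falls in
 * [base, base + nmsrs); the per-MSR permission bit is then looked up in
 * the range's bitmap, which holds one bit per MSR in the range.
 */
static inline bool example_msr_in_filter_range(const struct msr_bitmap_range *r,
                                               u32 msr)
{
        return msr >= r->base && msr - r->base < r->nmsrs;
}
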
1021/* Xen emulation context */
1022struct kvm_xen {
1023        bool long_mode;
1024        u8 upcall_vector;
1025        struct gfn_to_pfn_cache shinfo_cache;
1026};
1027
1028enum kvm_irqchip_mode {
1029        KVM_IRQCHIP_NONE,
1030        KVM_IRQCHIP_KERNEL,       /* created with KVM_CREATE_IRQCHIP */
1031        KVM_IRQCHIP_SPLIT,        /* created with KVM_CAP_SPLIT_IRQCHIP */
1032};
1033
1034struct kvm_x86_msr_filter {
1035        u8 count;
1036        bool default_allow:1;
1037        struct msr_bitmap_range ranges[16];
1038};
1039
1040#define APICV_INHIBIT_REASON_DISABLE    0
1041#define APICV_INHIBIT_REASON_HYPERV     1
1042#define APICV_INHIBIT_REASON_NESTED     2
1043#define APICV_INHIBIT_REASON_IRQWIN     3
1044#define APICV_INHIBIT_REASON_PIT_REINJ  4
1045#define APICV_INHIBIT_REASON_X2APIC     5
1046#define APICV_INHIBIT_REASON_BLOCKIRQ   6
1047#define APICV_INHIBIT_REASON_ABSENT     7
1048
1049struct kvm_arch {
1050        unsigned long n_used_mmu_pages;
1051        unsigned long n_requested_mmu_pages;
1052        unsigned long n_max_mmu_pages;
1053        unsigned int indirect_shadow_pages;
1054        u8 mmu_valid_gen;
1055        struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
1056        struct list_head active_mmu_pages;
1057        struct list_head zapped_obsolete_pages;
1058        struct list_head lpage_disallowed_mmu_pages;
1059        struct kvm_page_track_notifier_node mmu_sp_tracker;
1060        struct kvm_page_track_notifier_head track_notifier_head;
1061        /*
1062         * Protects marking pages unsync during page faults, as TDP MMU page
1063         * faults only take mmu_lock for read.  For simplicity, the unsync
1064         * pages lock is always taken when marking pages unsync regardless of
1065         * whether mmu_lock is held for read or write.
1066         */
1067        spinlock_t mmu_unsync_pages_lock;
1068
1069        struct list_head assigned_dev_head;
1070        struct iommu_domain *iommu_domain;
1071        bool iommu_noncoherent;
1072#define __KVM_HAVE_ARCH_NONCOHERENT_DMA
1073        atomic_t noncoherent_dma_count;
1074#define __KVM_HAVE_ARCH_ASSIGNED_DEVICE
1075        atomic_t assigned_device_count;
1076        struct kvm_pic *vpic;
1077        struct kvm_ioapic *vioapic;
1078        struct kvm_pit *vpit;
1079        atomic_t vapics_in_nmi_mode;
1080        struct mutex apic_map_lock;
1081        struct kvm_apic_map __rcu *apic_map;
1082        atomic_t apic_map_dirty;
1083
1084        /* Protects apic_access_memslot_enabled and apicv_inhibit_reasons */
1085        struct rw_semaphore apicv_update_lock;
1086
1087        bool apic_access_memslot_enabled;
1088        unsigned long apicv_inhibit_reasons;
1089
1090        gpa_t wall_clock;
1091
1092        bool mwait_in_guest;
1093        bool hlt_in_guest;
1094        bool pause_in_guest;
1095        bool cstate_in_guest;
1096
1097        unsigned long irq_sources_bitmap;
1098        s64 kvmclock_offset;
1099
1100        /*
1101         * This also protects nr_vcpus_matched_tsc which is read from a
1102         * preemption-disabled region, so it must be a raw spinlock.
1103         */
1104        raw_spinlock_t tsc_write_lock;
1105        u64 last_tsc_nsec;
1106        u64 last_tsc_write;
1107        u32 last_tsc_khz;
1108        u64 last_tsc_offset;
1109        u64 cur_tsc_nsec;
1110        u64 cur_tsc_write;
1111        u64 cur_tsc_offset;
1112        u64 cur_tsc_generation;
1113        int nr_vcpus_matched_tsc;
1114
1115        seqcount_raw_spinlock_t pvclock_sc;
1116        bool use_master_clock;
1117        u64 master_kernel_ns;
1118        u64 master_cycle_now;
1119        struct delayed_work kvmclock_update_work;
1120        struct delayed_work kvmclock_sync_work;
1121
1122        struct kvm_xen_hvm_config xen_hvm_config;
1123
1124        /* reads protected by irq_srcu, writes by irq_lock */
1125        struct hlist_head mask_notifier_list;
1126
1127        struct kvm_hv hyperv;
1128        struct kvm_xen xen;
1129
1130        #ifdef CONFIG_KVM_MMU_AUDIT
1131        int audit_point;
1132        #endif
1133
1134        bool backwards_tsc_observed;
1135        bool boot_vcpu_runs_old_kvmclock;
1136        u32 bsp_vcpu_id;
1137
1138        u64 disabled_quirks;
1139        int cpu_dirty_logging_count;
1140
1141        enum kvm_irqchip_mode irqchip_mode;
1142        u8 nr_reserved_ioapic_pins;
1143
1144        bool disabled_lapic_found;
1145
1146        bool x2apic_format;
1147        bool x2apic_broadcast_quirk_disabled;
1148
1149        bool guest_can_read_msr_platform_info;
1150        bool exception_payload_enabled;
1151
1152        bool bus_lock_detection_enabled;
1153        /*
1154         * If exit_on_emulation_error is set, and the in-kernel instruction
1155         * emulator fails to emulate an instruction, allow userspace
1156         * the opportunity to look at it.
1157         */
1158        bool exit_on_emulation_error;
1159
1160        /* Deflect RDMSR and WRMSR to user space when they trigger a #GP */
1161        u32 user_space_msr_mask;
1162        struct kvm_x86_msr_filter __rcu *msr_filter;
1163
1164        u32 hypercall_exit_enabled;
1165
1166        /* Guest can access the SGX PROVISIONKEY. */
1167        bool sgx_provisioning_allowed;
1168
1169        struct kvm_pmu_event_filter __rcu *pmu_event_filter;
1170        struct task_struct *nx_lpage_recovery_thread;
1171
1172#ifdef CONFIG_X86_64
1173        /*
1174         * Whether the TDP MMU is enabled for this VM. This contains a
1175         * snapshot of the TDP MMU module parameter from when the VM was
1176         * created and remains unchanged for the life of the VM. If this is
1177         * true, TDP MMU handler functions will run for various MMU
1178         * operations.
1179         */
1180        bool tdp_mmu_enabled;
1181
1182        /*
1183         * List of struct kvm_mmu_pages being used as roots.
1184         * All struct kvm_mmu_pages in the list should have
1185         * tdp_mmu_page set.
1186         *
1187         * For reads, this list is protected by:
1188         *      the MMU lock in read mode + RCU or
1189         *      the MMU lock in write mode
1190         *
1191         * For writes, this list is protected by:
1192         *      the MMU lock in read mode + the tdp_mmu_pages_lock or
1193         *      the MMU lock in write mode
1194         *
1195         * Roots will remain in the list until their tdp_mmu_root_count
1196         * drops to zero, at which point the thread that decremented the
 1197         * count to zero should remove the root from the list and clean
1198         * it up, freeing the root after an RCU grace period.
1199         */
1200        struct list_head tdp_mmu_roots;
1201
1202        /*
 1203         * List of struct kvm_mmu_pages not being used as roots.
1204         * All struct kvm_mmu_pages in the list should have
1205         * tdp_mmu_page set and a tdp_mmu_root_count of 0.
1206         */
1207        struct list_head tdp_mmu_pages;
1208
1209        /*
1210         * Protects accesses to the following fields when the MMU lock
1211         * is held in read mode:
1212         *  - tdp_mmu_roots (above)
1213         *  - tdp_mmu_pages (above)
1214         *  - the link field of struct kvm_mmu_pages used by the TDP MMU
1215         *  - lpage_disallowed_mmu_pages
1216         *  - the lpage_disallowed_link field of struct kvm_mmu_pages used
1217         *    by the TDP MMU
1218         * It is acceptable, but not necessary, to acquire this lock when
1219         * the thread holds the MMU lock in write mode.
1220         */
1221        spinlock_t tdp_mmu_pages_lock;
1222#endif /* CONFIG_X86_64 */
1223
1224        /*
1225         * If set, at least one shadow root has been allocated. This flag
1226         * is used as one input when determining whether certain memslot
1227         * related allocations are necessary.
1228         */
1229        bool shadow_root_allocated;
1230
1231#if IS_ENABLED(CONFIG_HYPERV)
1232        hpa_t   hv_root_tdp;
1233        spinlock_t hv_root_tdp_lock;
1234#endif
1235};
1236
1237struct kvm_vm_stat {
1238        struct kvm_vm_stat_generic generic;
1239        u64 mmu_shadow_zapped;
1240        u64 mmu_pte_write;
1241        u64 mmu_pde_zapped;
1242        u64 mmu_flooded;
1243        u64 mmu_recycled;
1244        u64 mmu_cache_miss;
1245        u64 mmu_unsync;
1246        union {
1247                struct {
1248                        atomic64_t pages_4k;
1249                        atomic64_t pages_2m;
1250                        atomic64_t pages_1g;
1251                };
1252                atomic64_t pages[KVM_NR_PAGE_SIZES];
1253        };
1254        u64 nx_lpage_splits;
1255        u64 max_mmu_page_hash_collisions;
1256        u64 max_mmu_rmap_size;
1257};
1258
1259struct kvm_vcpu_stat {
1260        struct kvm_vcpu_stat_generic generic;
1261        u64 pf_fixed;
1262        u64 pf_guest;
1263        u64 tlb_flush;
1264        u64 invlpg;
1265
1266        u64 exits;
1267        u64 io_exits;
1268        u64 mmio_exits;
1269        u64 signal_exits;
1270        u64 irq_window_exits;
1271        u64 nmi_window_exits;
1272        u64 l1d_flush;
1273        u64 halt_exits;
1274        u64 request_irq_exits;
1275        u64 irq_exits;
1276        u64 host_state_reload;
1277        u64 fpu_reload;
1278        u64 insn_emulation;
1279        u64 insn_emulation_fail;
1280        u64 hypercalls;
1281        u64 irq_injections;
1282        u64 nmi_injections;
1283        u64 req_event;
1284        u64 nested_run;
1285        u64 directed_yield_attempted;
1286        u64 directed_yield_successful;
1287        u64 guest_mode;
1288};
1289
1290struct x86_instruction_info;
1291
1292struct msr_data {
1293        bool host_initiated;
1294        u32 index;
1295        u64 data;
1296};
1297
1298struct kvm_lapic_irq {
1299        u32 vector;
1300        u16 delivery_mode;
1301        u16 dest_mode;
1302        bool level;
1303        u16 trig_mode;
1304        u32 shorthand;
1305        u32 dest_id;
1306        bool msi_redir_hint;
1307};
1308
1309static inline u16 kvm_lapic_irq_dest_mode(bool dest_mode_logical)
1310{
1311        return dest_mode_logical ? APIC_DEST_LOGICAL : APIC_DEST_PHYSICAL;
1312}
1313
1314struct kvm_x86_ops {
1315        const char *name;
1316
1317        int (*hardware_enable)(void);
1318        void (*hardware_disable)(void);
1319        void (*hardware_unsetup)(void);
1320        bool (*cpu_has_accelerated_tpr)(void);
1321        bool (*has_emulated_msr)(struct kvm *kvm, u32 index);
1322        void (*vcpu_after_set_cpuid)(struct kvm_vcpu *vcpu);
1323
1324        unsigned int vm_size;
1325        int (*vm_init)(struct kvm *kvm);
1326        void (*vm_destroy)(struct kvm *kvm);
1327
1328        /* Create, but do not attach this VCPU */
1329        int (*vcpu_create)(struct kvm_vcpu *vcpu);
1330        void (*vcpu_free)(struct kvm_vcpu *vcpu);
1331        void (*vcpu_reset)(struct kvm_vcpu *vcpu, bool init_event);
1332
1333        void (*prepare_guest_switch)(struct kvm_vcpu *vcpu);
1334        void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
1335        void (*vcpu_put)(struct kvm_vcpu *vcpu);
1336
1337        void (*update_exception_bitmap)(struct kvm_vcpu *vcpu);
1338        int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
1339        int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
1340        u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
1341        void (*get_segment)(struct kvm_vcpu *vcpu,
1342                            struct kvm_segment *var, int seg);
1343        int (*get_cpl)(struct kvm_vcpu *vcpu);
1344        void (*set_segment)(struct kvm_vcpu *vcpu,
1345                            struct kvm_segment *var, int seg);
1346        void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l);
1347        void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
1348        void (*post_set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
 1349        bool (*is_valid_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
1350        void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
1351        int (*set_efer)(struct kvm_vcpu *vcpu, u64 efer);
1352        void (*get_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
1353        void (*set_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
1354        void (*get_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
1355        void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
1356        void (*sync_dirty_debug_regs)(struct kvm_vcpu *vcpu);
1357        void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value);
1358        void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
1359        unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
1360        void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
1361        bool (*get_if_flag)(struct kvm_vcpu *vcpu);
1362
1363        void (*tlb_flush_all)(struct kvm_vcpu *vcpu);
1364        void (*tlb_flush_current)(struct kvm_vcpu *vcpu);
1365        int  (*tlb_remote_flush)(struct kvm *kvm);
1366        int  (*tlb_remote_flush_with_range)(struct kvm *kvm,
1367                        struct kvm_tlb_range *range);
1368
1369        /*
1370         * Flush any TLB entries associated with the given GVA.
1371         * Does not need to flush GPA->HPA mappings.
1372         * Can potentially get non-canonical addresses through INVLPGs, which
1373         * the implementation may choose to ignore if appropriate.
1374         */
1375        void (*tlb_flush_gva)(struct kvm_vcpu *vcpu, gva_t addr);
1376
1377        /*
1378         * Flush any TLB entries created by the guest.  Like tlb_flush_gva(),
1379         * does not need to flush GPA->HPA mappings.
1380         */
1381        void (*tlb_flush_guest)(struct kvm_vcpu *vcpu);
1382
1383        int (*vcpu_pre_run)(struct kvm_vcpu *vcpu);
1384        enum exit_fastpath_completion (*run)(struct kvm_vcpu *vcpu);
1385        int (*handle_exit)(struct kvm_vcpu *vcpu,
1386                enum exit_fastpath_completion exit_fastpath);
1387        int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
1388        void (*update_emulated_instruction)(struct kvm_vcpu *vcpu);
1389        void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
1390        u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu);
1391        void (*patch_hypercall)(struct kvm_vcpu *vcpu,
1392                                unsigned char *hypercall_addr);
1393        void (*set_irq)(struct kvm_vcpu *vcpu);
1394        void (*set_nmi)(struct kvm_vcpu *vcpu);
1395        void (*queue_exception)(struct kvm_vcpu *vcpu);
1396        void (*cancel_injection)(struct kvm_vcpu *vcpu);
1397        int (*interrupt_allowed)(struct kvm_vcpu *vcpu, bool for_injection);
1398        int (*nmi_allowed)(struct kvm_vcpu *vcpu, bool for_injection);
1399        bool (*get_nmi_mask)(struct kvm_vcpu *vcpu);
1400        void (*set_nmi_mask)(struct kvm_vcpu *vcpu, bool masked);
1401        void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
1402        void (*enable_irq_window)(struct kvm_vcpu *vcpu);
1403        void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
1404        bool (*check_apicv_inhibit_reasons)(ulong bit);
1405        void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu);
1406        void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr);
1407        void (*hwapic_isr_update)(struct kvm_vcpu *vcpu, int isr);
1408        bool (*guest_apic_has_interrupt)(struct kvm_vcpu *vcpu);
1409        void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
1410        void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu);
1411        void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu);
1412        void (*deliver_interrupt)(struct kvm_lapic *apic, int delivery_mode,
1413                                  int trig_mode, int vector);
1414        int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
1415        int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
1416        int (*set_identity_map_addr)(struct kvm *kvm, u64 ident_addr);
1417        u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
1418
1419        void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, hpa_t root_hpa,
1420                             int root_level);
1421
1422        bool (*has_wbinvd_exit)(void);
1423
1424        u64 (*get_l2_tsc_offset)(struct kvm_vcpu *vcpu);
1425        u64 (*get_l2_tsc_multiplier)(struct kvm_vcpu *vcpu);
1426        void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
1427        void (*write_tsc_multiplier)(struct kvm_vcpu *vcpu, u64 multiplier);
1428
1429        /*
1430         * Retrieve somewhat arbitrary exit information.  Intended to
1431         * be used only from within tracepoints or error paths.
1432         */
1433        void (*get_exit_info)(struct kvm_vcpu *vcpu, u32 *reason,
1434                              u64 *info1, u64 *info2,
1435                              u32 *exit_int_info, u32 *exit_int_info_err_code);
1436
1437        int (*check_intercept)(struct kvm_vcpu *vcpu,
1438                               struct x86_instruction_info *info,
1439                               enum x86_intercept_stage stage,
1440                               struct x86_exception *exception);
1441        void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu);
1442
1443        void (*request_immediate_exit)(struct kvm_vcpu *vcpu);
1444
 1445        void (*sched_in)(struct kvm_vcpu *vcpu, int cpu);
1446
1447        /*
1448         * Size of the CPU's dirty log buffer, i.e. VMX's PML buffer.  A zero
1449         * value indicates CPU dirty logging is unsupported or disabled.
1450         */
1451        int cpu_dirty_log_size;
1452        void (*update_cpu_dirty_logging)(struct kvm_vcpu *vcpu);
1453
1454        /* pmu operations of sub-arch */
1455        const struct kvm_pmu_ops *pmu_ops;
1456        const struct kvm_x86_nested_ops *nested_ops;
1457
1458        void (*vcpu_blocking)(struct kvm_vcpu *vcpu);
1459        void (*vcpu_unblocking)(struct kvm_vcpu *vcpu);
1460
1461        int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq,
1462                              uint32_t guest_irq, bool set);
1463        void (*start_assignment)(struct kvm *kvm);
1464        void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu);
1465        bool (*dy_apicv_has_pending_interrupt)(struct kvm_vcpu *vcpu);
1466
1467        int (*set_hv_timer)(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
1468                            bool *expired);
1469        void (*cancel_hv_timer)(struct kvm_vcpu *vcpu);
1470
1471        void (*setup_mce)(struct kvm_vcpu *vcpu);
1472
1473        int (*smi_allowed)(struct kvm_vcpu *vcpu, bool for_injection);
1474        int (*enter_smm)(struct kvm_vcpu *vcpu, char *smstate);
1475        int (*leave_smm)(struct kvm_vcpu *vcpu, const char *smstate);
1476        void (*enable_smi_window)(struct kvm_vcpu *vcpu);
1477
1478        int (*mem_enc_op)(struct kvm *kvm, void __user *argp);
1479        int (*mem_enc_reg_region)(struct kvm *kvm, struct kvm_enc_region *argp);
1480        int (*mem_enc_unreg_region)(struct kvm *kvm, struct kvm_enc_region *argp);
1481        int (*vm_copy_enc_context_from)(struct kvm *kvm, unsigned int source_fd);
1482        int (*vm_move_enc_context_from)(struct kvm *kvm, unsigned int source_fd);
1483
1484        int (*get_msr_feature)(struct kvm_msr_entry *entry);
1485
1486        bool (*can_emulate_instruction)(struct kvm_vcpu *vcpu, int emul_type,
1487                                        void *insn, int insn_len);
1488
1489        bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu);
1490        int (*enable_direct_tlbflush)(struct kvm_vcpu *vcpu);
1491
1492        void (*migrate_timers)(struct kvm_vcpu *vcpu);
1493        void (*msr_filter_changed)(struct kvm_vcpu *vcpu);
1494        int (*complete_emulated_msr)(struct kvm_vcpu *vcpu, int err);
1495
1496        void (*vcpu_deliver_sipi_vector)(struct kvm_vcpu *vcpu, u8 vector);
1497};
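
/*
 * Illustrative sketch, not part of the upstream header: each vendor module
 * (VMX or SVM) supplies a single instance of the ops table above, normally
 * with designated initializers, leaving any optional hook it does not
 * implement as NULL.  Roughly (all names below are invented for this
 * example):
 *
 *	static struct kvm_x86_ops example_x86_ops = {
 *		.has_wbinvd_exit    = example_has_wbinvd_exit,
 *		.cpu_dirty_log_size = 0,	// no HW-assisted dirty logging
 *		.pmu_ops            = &example_pmu_ops,
 *	};
 */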
1498
1499struct kvm_x86_nested_ops {
1500        void (*leave_nested)(struct kvm_vcpu *vcpu);
1501        int (*check_events)(struct kvm_vcpu *vcpu);
1502        bool (*hv_timer_pending)(struct kvm_vcpu *vcpu);
1503        void (*triple_fault)(struct kvm_vcpu *vcpu);
1504        int (*get_state)(struct kvm_vcpu *vcpu,
1505                         struct kvm_nested_state __user *user_kvm_nested_state,
1506                         unsigned user_data_size);
1507        int (*set_state)(struct kvm_vcpu *vcpu,
1508                         struct kvm_nested_state __user *user_kvm_nested_state,
1509                         struct kvm_nested_state *kvm_state);
1510        bool (*get_nested_state_pages)(struct kvm_vcpu *vcpu);
1511        int (*write_log_dirty)(struct kvm_vcpu *vcpu, gpa_t l2_gpa);
1512
1513        int (*enable_evmcs)(struct kvm_vcpu *vcpu,
1514                            uint16_t *vmcs_version);
1515        uint16_t (*get_evmcs_version)(struct kvm_vcpu *vcpu);
1516};
1517
1518struct kvm_x86_init_ops {
1519        int (*cpu_has_kvm_support)(void);
1520        int (*disabled_by_bios)(void);
1521        int (*check_processor_compatibility)(void);
1522        int (*hardware_setup)(void);
1523        unsigned int (*handle_intel_pt_intr)(void);
1524
1525        struct kvm_x86_ops *runtime_ops;
1526};
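
/*
 * Hedged note, not part of the upstream header: the vendor module hands its
 * kvm_x86_init_ops to kvm_init() at module load; hardware setup is then
 * expected to copy runtime_ops into the global kvm_x86_ops declared below
 * and patch the static calls via kvm_ops_static_call_update().
 */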
1527
1528struct kvm_arch_async_pf {
1529        u32 token;
1530        gfn_t gfn;
1531        unsigned long cr3;
1532        bool direct_map;
1533};
1534
1535extern u32 __read_mostly kvm_nr_uret_msrs;
1536extern u64 __read_mostly host_efer;
1537extern bool __read_mostly allow_smaller_maxphyaddr;
1538extern bool __read_mostly enable_apicv;
1539extern struct kvm_x86_ops kvm_x86_ops;
1540
1541#define KVM_X86_OP(func) \
1542        DECLARE_STATIC_CALL(kvm_x86_##func, *(((struct kvm_x86_ops *)0)->func));
1543#define KVM_X86_OP_NULL KVM_X86_OP
1544#include <asm/kvm-x86-ops.h>
1545
1546static inline void kvm_ops_static_call_update(void)
1547{
1548#define KVM_X86_OP(func) \
1549        static_call_update(kvm_x86_##func, kvm_x86_ops.func);
1550#define KVM_X86_OP_NULL KVM_X86_OP
1551#include <asm/kvm-x86-ops.h>
1552}
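
/*
 * Illustrative sketch, not part of the upstream header, of how the
 * static-call keys declared above are consumed: mandatory hooks go through
 * static_call(), optional (KVM_X86_OP_NULL) hooks through
 * static_call_cond(); see kvm_arch_vcpu_blocking() further down for an
 * in-header user.  The helper name below is invented for this example.
 */
static inline bool kvm_example_cpu_has_wbinvd_exit(void)
{
	/* has_wbinvd_exit is a mandatory hook, so its key is always patched */
	return static_call(kvm_x86_has_wbinvd_exit)();
}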
1553
1554#define __KVM_HAVE_ARCH_VM_ALLOC
1555static inline struct kvm *kvm_arch_alloc_vm(void)
1556{
1557        return __vmalloc(kvm_x86_ops.vm_size, GFP_KERNEL_ACCOUNT | __GFP_ZERO);
1558}
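
/*
 * Hedged note: kvm_x86_ops.vm_size is expected to be set by the vendor
 * module to the size of its containing VM structure (e.g. struct kvm_vmx
 * or struct kvm_svm), so the zeroed, memcg-accounted vmalloc() above covers
 * both the common "struct kvm" and the vendor-private tail.
 */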
1559
1560#define __KVM_HAVE_ARCH_VM_FREE
1561void kvm_arch_free_vm(struct kvm *kvm);
1562
1563#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLB
1564static inline int kvm_arch_flush_remote_tlb(struct kvm *kvm)
1565{
1566        if (kvm_x86_ops.tlb_remote_flush &&
1567            !static_call(kvm_x86_tlb_remote_flush)(kvm))
1568                return 0;
1569        else
1570                return -ENOTSUPP;
1571}
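
/*
 * Note: a zero return above means the vendor hook flushed the remote TLBs
 * itself (e.g. via a Hyper-V enlightened flush), while -ENOTSUPP tells the
 * generic code to fall back to kicking every vCPU with a TLB flush request.
 */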
1572
1573#define kvm_arch_pmi_in_guest(vcpu) \
1574        ((vcpu) && (vcpu)->arch.handling_intr_from_guest)
1575
1576int kvm_mmu_module_init(void);
1577void kvm_mmu_module_exit(void);
1578
1579void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
1580int kvm_mmu_create(struct kvm_vcpu *vcpu);
1581void kvm_mmu_init_vm(struct kvm *kvm);
1582void kvm_mmu_uninit_vm(struct kvm *kvm);
1583
1584void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu);
1585void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
1586void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
1587                                      const struct kvm_memory_slot *memslot,
1588                                      int start_level);
1589void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
1590                                   const struct kvm_memory_slot *memslot);
1591void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
1592                                   const struct kvm_memory_slot *memslot);
1593void kvm_mmu_zap_all(struct kvm *kvm);
1594void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen);
1595void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long kvm_nr_mmu_pages);
1596
1597int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
1598
1599int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
1600                          const void *val, int bytes);
1601
1602struct kvm_irq_mask_notifier {
1603        void (*func)(struct kvm_irq_mask_notifier *kimn, bool masked);
1604        int irq;
1605        struct hlist_node link;
1606};
1607
1608void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
1609                                    struct kvm_irq_mask_notifier *kimn);
1610void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
1611                                      struct kvm_irq_mask_notifier *kimn);
1612void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
1613                             bool mask);
1614
1615extern bool tdp_enabled;
1616
1617u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu);
1618
1619/* control of guest tsc rate supported? */
1620extern bool kvm_has_tsc_control;
1621/* maximum supported tsc_khz for guests */
1622extern u32  kvm_max_guest_tsc_khz;
1623/* number of bits of the fractional part of the TSC scaling ratio */
1624extern u8   kvm_tsc_scaling_ratio_frac_bits;
1625/* maximum allowed value of TSC scaling ratio */
1626extern u64  kvm_max_tsc_scaling_ratio;
1627/* 1ull << kvm_tsc_scaling_ratio_frac_bits */
1628extern u64  kvm_default_tsc_scaling_ratio;
1629/* bus lock detection supported? */
1630extern bool kvm_has_bus_lock_exit;
1631
1632extern u64 kvm_mce_cap_supported;
1633
1634/*
1635 * EMULTYPE_NO_DECODE - Set when re-emulating an instruction (after completing
1636 *                      userspace I/O) to indicate that the emulation context
1637 *                      should be reused as is, i.e. skip initialization of
1638 *                      emulation context, instruction fetch and decode.
1639 *
1640 * EMULTYPE_TRAP_UD - Set when emulating an intercepted #UD from hardware.
1641 *                    Indicates that only select instructions (tagged with
1642 *                    EmulateOnUD) should be emulated (to minimize the emulator
1643 *                    attack surface).  See also EMULTYPE_TRAP_UD_FORCED.
1644 *
1645 * EMULTYPE_SKIP - Set when emulating solely to skip an instruction, i.e. to
1646 *                 decode the instruction length.  For use *only* by
1647 *                 kvm_x86_ops.skip_emulated_instruction() implementations if
1648 *                 EMULTYPE_COMPLETE_USER_EXIT is not set.
1649 *
1650 * EMULTYPE_ALLOW_RETRY_PF - Set when the emulator should resume the guest to
1651 *                           retry native execution under certain conditions.
1652 *                           Can only be set in conjunction with EMULTYPE_PF.
1653 *
1654 * EMULTYPE_TRAP_UD_FORCED - Set when emulating an intercepted #UD that was
1655 *                           triggered by KVM's magic "force emulation" prefix,
1656 *                           which is opt in via module param (off by default).
1657 *                           Bypasses EmulateOnUD restriction despite emulating
1658 *                           due to an intercepted #UD (see EMULTYPE_TRAP_UD).
1659 *                           Used to test the full emulator from userspace.
1660 *
1661 * EMULTYPE_VMWARE_GP - Set when emulating an intercepted #GP for VMware
1662 *                      backdoor emulation, which is opt in via module param.
1663 *                      VMware backdoor emulation handles select instructions
1664 *                      and reinjects the #GP for all other cases.
1665 *
1666 * EMULTYPE_PF - Set when emulating MMIO by way of an intercepted #PF, in which
1667 *               case the CR2/GPA value passed on the stack is valid.
1668 *
1669 * EMULTYPE_COMPLETE_USER_EXIT - Set when the emulator should update interruptibility
1670 *                               state and inject single-step #DBs after skipping
1671 *                               an instruction (after completing userspace I/O).
1672 */
1673#define EMULTYPE_NO_DECODE          (1 << 0)
1674#define EMULTYPE_TRAP_UD            (1 << 1)
1675#define EMULTYPE_SKIP               (1 << 2)
1676#define EMULTYPE_ALLOW_RETRY_PF     (1 << 3)
1677#define EMULTYPE_TRAP_UD_FORCED     (1 << 4)
1678#define EMULTYPE_VMWARE_GP          (1 << 5)
1679#define EMULTYPE_PF                 (1 << 6)
1680#define EMULTYPE_COMPLETE_USER_EXIT (1 << 7)
1681
1682int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type);
1683int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu,
1684                                        void *insn, int insn_len);
1685void __kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu,
1686                                          u64 *data, u8 ndata);
1687void kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu);
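
/*
 * Illustrative sketch, not part of the upstream header: the MMIO #PF path
 * typically enters the emulator with EMULTYPE_PF, adding
 * EMULTYPE_ALLOW_RETRY_PF when it is safe to return to the guest and retry
 * the access natively once the mapping has been fixed up.  The helper name
 * below is invented for this example.
 */
static inline int kvm_example_emulate_mmio_pf(struct kvm_vcpu *vcpu)
{
	return kvm_emulate_instruction(vcpu,
				       EMULTYPE_PF | EMULTYPE_ALLOW_RETRY_PF);
}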
1688
1689void kvm_enable_efer_bits(u64);
1690bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer);
1691int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, bool host_initiated);
1692int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data);
1693int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data);
1694int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu);
1695int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu);
1696int kvm_emulate_as_nop(struct kvm_vcpu *vcpu);
1697int kvm_emulate_invd(struct kvm_vcpu *vcpu);
1698int kvm_emulate_mwait(struct kvm_vcpu *vcpu);
1699int kvm_handle_invalid_op(struct kvm_vcpu *vcpu);
1700int kvm_emulate_monitor(struct kvm_vcpu *vcpu);
1701
1702int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in);
1703int kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
1704int kvm_emulate_halt(struct kvm_vcpu *vcpu);
1705int kvm_emulate_halt_noskip(struct kvm_vcpu *vcpu);
1706int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu);
1707int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu);
1708
1709void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
1710int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
1711void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector);
1712
1713int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
1714                    int reason, bool has_error_code, u32 error_code);
1715
1716void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0);
1717void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4);
1718int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
1719int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
1720int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
1721int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
1722int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val);
1723void kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val);
1724unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
1725void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
1726void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
1727int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu);
1728
1729int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr);
1730int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr);
1731
1732unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu);
1733void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
1734int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu);
1735
1736void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
1737void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
1738void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr, unsigned long payload);
1739void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr);
1740void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
1741void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
1742bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
1743                                    struct x86_exception *fault);
1744bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
1745bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr);
1746
1747static inline int __kvm_irq_line_state(unsigned long *irq_state,
1748                                       int irq_source_id, int level)
1749{
1750        /* Logical OR for level-triggered interrupts */
1751        if (level)
1752                __set_bit(irq_source_id, irq_state);
1753        else
1754                __clear_bit(irq_source_id, irq_state);
1755
1756        return !!(*irq_state);
1757}
1758
1759#define KVM_MMU_ROOT_CURRENT            BIT(0)
1760#define KVM_MMU_ROOT_PREVIOUS(i)        BIT(1+i)
1761#define KVM_MMU_ROOTS_ALL               (~0UL)
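
/*
 * Note: these bits form the roots_to_free mask taken by kvm_mmu_free_roots()
 * below, e.g. KVM_MMU_ROOT_CURRENT | KVM_MMU_ROOT_PREVIOUS(0) frees the
 * active root plus the first entry of the previous-root cache, while
 * KVM_MMU_ROOTS_ALL frees everything.
 */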
1762
1763int kvm_pic_set_irq(struct kvm_pic *pic, int irq, int irq_source_id, int level);
1764void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id);
1765
1766void kvm_inject_nmi(struct kvm_vcpu *vcpu);
1767
1768void kvm_update_dr7(struct kvm_vcpu *vcpu);
1769
1770int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn);
1771void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
1772                        ulong roots_to_free);
1773void kvm_mmu_free_guest_mode_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu);
1774gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
1775                              struct x86_exception *exception);
1776gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
1777                               struct x86_exception *exception);
1778gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
1779                               struct x86_exception *exception);
1780gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
1781                                struct x86_exception *exception);
1782
1783bool kvm_apicv_activated(struct kvm *kvm);
1784void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu);
1785void kvm_request_apicv_update(struct kvm *kvm, bool activate,
1786                              unsigned long bit);
1787
1788void __kvm_request_apicv_update(struct kvm *kvm, bool activate,
1789                                unsigned long bit);
1790
1791int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
1792
1793int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
1794                       void *insn, int insn_len);
1795void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
1796void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
1797                            gva_t gva, hpa_t root_hpa);
1798void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid);
1799void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd);
1800
1801void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
1802                       int tdp_max_root_level, int tdp_huge_page_level);
1803
1804static inline u16 kvm_read_ldt(void)
1805{
1806        u16 ldt;
1807        asm("sldt %0" : "=g"(ldt));
1808        return ldt;
1809}
1810
1811static inline void kvm_load_ldt(u16 sel)
1812{
1813        asm("lldt %0" : : "rm"(sel));
1814}
1815
1816#ifdef CONFIG_X86_64
1817static inline unsigned long read_msr(unsigned long msr)
1818{
1819        u64 value;
1820
1821        rdmsrl(msr, value);
1822        return value;
1823}
1824#endif
1825
1826static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code)
1827{
1828        kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
1829}
1830
1831#define TSS_IOPB_BASE_OFFSET 0x66
1832#define TSS_BASE_SIZE 0x68
1833#define TSS_IOPB_SIZE (65536 / 8)
1834#define TSS_REDIRECTION_SIZE (256 / 8)
1835#define RMODE_TSS_SIZE                                                  \
1836        (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1)
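
/*
 * Arithmetic note: 0x68 + 256/8 + 65536/8 + 1 = 104 + 32 + 8192 + 1 = 8329
 * bytes (TSS base, interrupt redirection bitmap, full I/O permission bitmap
 * and the trailing 0xff terminator byte), so the real-mode TSS spans three
 * 4 KiB pages.
 */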
1837
1838enum {
1839        TASK_SWITCH_CALL = 0,
1840        TASK_SWITCH_IRET = 1,
1841        TASK_SWITCH_JMP = 2,
1842        TASK_SWITCH_GATE = 3,
1843};
1844
1845#define HF_GIF_MASK             (1 << 0)
1846#define HF_NMI_MASK             (1 << 3)
1847#define HF_IRET_MASK            (1 << 4)
1848#define HF_GUEST_MASK           (1 << 5) /* VCPU is in guest-mode */
1849#define HF_SMM_MASK             (1 << 6)
1850#define HF_SMM_INSIDE_NMI_MASK  (1 << 7)
1851
1852#define __KVM_VCPU_MULTIPLE_ADDRESS_SPACE
1853#define KVM_ADDRESS_SPACE_NUM 2
1854
1855#define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 1 : 0)
1856#define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm)
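
/*
 * Note: x86 defines two memslot address spaces so that SMM gets its own
 * view of guest memory (SMRAM).  A vCPU with HF_SMM_MASK set resolves GPAs
 * against address space 1, and shadow pages carry the smm role bit so the
 * two views never share MMU pages.
 */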
1857
1858#define KVM_ARCH_WANT_MMU_NOTIFIER
1859
1860int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
1861int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
1862int kvm_cpu_has_extint(struct kvm_vcpu *v);
1863int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
1864int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
1865void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
1866
1867int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low,
1868                    unsigned long ipi_bitmap_high, u32 min,
1869                    unsigned long icr, int op_64_bit);
1870
1871int kvm_add_user_return_msr(u32 msr);
1872int kvm_find_user_return_msr(u32 msr);
1873int kvm_set_user_return_msr(unsigned index, u64 val, u64 mask);
1874
1875static inline bool kvm_is_supported_user_return_msr(u32 msr)
1876{
1877        return kvm_find_user_return_msr(msr) >= 0;
1878}
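
/*
 * Illustrative sketch, not part of the upstream header: a vendor module
 * registers an MSR slot once at setup time with kvm_add_user_return_msr()
 * and later, on the VM-entry path, loads the guest value through that slot;
 * the all-ones mask below replaces every bit of the host value.  The helper
 * name is invented for this example.
 */
static inline int kvm_example_set_guest_user_return_msr(int slot, u64 guest_val)
{
	return kvm_set_user_return_msr(slot, guest_val, -1ull);
}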
1879
1880u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc, u64 ratio);
1881u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc);
1882u64 kvm_calc_nested_tsc_offset(u64 l1_offset, u64 l2_offset, u64 l2_multiplier);
1883u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier);
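
/*
 * Illustrative sketch, not part of the upstream header: the scaling ratio
 * is a binary fixed-point value with kvm_tsc_scaling_ratio_frac_bits
 * fractional bits (so kvm_default_tsc_scaling_ratio encodes 1.0), and
 * scaling boils down to a widening multiply plus a shift, roughly as below
 * (mul_u64_u64_shr() comes from <linux/math64.h>).
 */
#include <linux/math64.h>

static inline u64 kvm_example_scale_tsc_value(u64 tsc, u64 ratio)
{
	return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits);
}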
1884
1885unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu);
1886bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip);
1887
1888void kvm_make_scan_ioapic_request(struct kvm *kvm);
1889void kvm_make_scan_ioapic_request_mask(struct kvm *kvm,
1890                                       unsigned long *vcpu_bitmap);
1891
1892bool kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
1893                                     struct kvm_async_pf *work);
1894void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
1895                                 struct kvm_async_pf *work);
1896void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu,
1897                               struct kvm_async_pf *work);
1898void kvm_arch_async_page_present_queued(struct kvm_vcpu *vcpu);
1899bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu);
1900extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn);
1901
1902int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu);
1903int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err);
1904void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu);
1905
1906void __user *__x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa,
1907                                     u32 size);
1908bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu);
1909bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu);
1910
1911bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
1912                             struct kvm_vcpu **dest_vcpu);
1913
1914void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
1915                     struct kvm_lapic_irq *irq);
1916
1917static inline bool kvm_irq_is_postable(struct kvm_lapic_irq *irq)
1918{
1919        /* We can only post Fixed and LowPrio IRQs */
1920        return (irq->delivery_mode == APIC_DM_FIXED ||
1921                irq->delivery_mode == APIC_DM_LOWEST);
1922}
1923
1924static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
1925{
1926        static_call_cond(kvm_x86_vcpu_blocking)(vcpu);
1927}
1928
1929static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu)
1930{
1931        static_call_cond(kvm_x86_vcpu_unblocking)(vcpu);
1932}
1933
1934static inline int kvm_cpu_get_apicid(int mps_cpu)
1935{
1936#ifdef CONFIG_X86_LOCAL_APIC
1937        return default_cpu_present_to_apicid(mps_cpu);
1938#else
1939        WARN_ON_ONCE(1);
1940        return BAD_APICID;
1941#endif
1942}
1943
1944#define put_smstate(type, buf, offset, val)                      \
1945        *(type *)((buf) + (offset) - 0x7e00) = val
1946
1947#define GET_SMSTATE(type, buf, offset)          \
1948        (*(type *)((buf) + (offset) - 0x7e00))
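
/*
 * Note: architectural SMM state-save fields are specified at offsets
 * 0x7exx relative to SMBASE + 0x8000, while KVM's save/restore buffer only
 * holds those last 512 bytes; subtracting 0x7e00 rebases the architectural
 * offset onto that buffer.
 */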
1949
1950int kvm_cpu_dirty_log_size(void);
1951
1952int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages);
1953
1954#define KVM_CLOCK_VALID_FLAGS                                           \
1955        (KVM_CLOCK_TSC_STABLE | KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC)
1956
1957#endif /* _ASM_X86_KVM_HOST_H */
1958