linux/arch/x86/kvm/mmu.c
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Kernel-based Virtual Machine driver for Linux
   4 *
   5 * This module enables machines with Intel VT-x extensions to run virtual
   6 * machines without emulation or binary translation.
   7 *
   8 * MMU support
   9 *
  10 * Copyright (C) 2006 Qumranet, Inc.
  11 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  12 *
  13 * Authors:
  14 *   Yaniv Kamay  <yaniv@qumranet.com>
  15 *   Avi Kivity   <avi@qumranet.com>
  16 */
  17
  18#include "irq.h"
  19#include "mmu.h"
  20#include "x86.h"
  21#include "kvm_cache_regs.h"
  22#include "cpuid.h"
  23
  24#include <linux/kvm_host.h>
  25#include <linux/types.h>
  26#include <linux/string.h>
  27#include <linux/mm.h>
  28#include <linux/highmem.h>
  29#include <linux/moduleparam.h>
  30#include <linux/export.h>
  31#include <linux/swap.h>
  32#include <linux/hugetlb.h>
  33#include <linux/compiler.h>
  34#include <linux/srcu.h>
  35#include <linux/slab.h>
  36#include <linux/sched/signal.h>
  37#include <linux/uaccess.h>
  38#include <linux/hash.h>
  39#include <linux/kern_levels.h>
  40
  41#include <asm/page.h>
  42#include <asm/pat.h>
  43#include <asm/cmpxchg.h>
  44#include <asm/e820/api.h>
  45#include <asm/io.h>
  46#include <asm/vmx.h>
  47#include <asm/kvm_page_track.h>
  48#include "trace.h"
  49
  50/*
  51 * Setting this variable to true enables Two-Dimensional Paging (TDP),
  52 * where the hardware walks two page tables:
  53 * 1. the guest-virtual to guest-physical translation
  54 * 2. while doing 1, the guest-physical to host-physical translation
  55 * If the hardware supports TDP, KVM does not need to do shadow paging.
  56 */
  57bool tdp_enabled = false;
  58
  59enum {
  60        AUDIT_PRE_PAGE_FAULT,
  61        AUDIT_POST_PAGE_FAULT,
  62        AUDIT_PRE_PTE_WRITE,
  63        AUDIT_POST_PTE_WRITE,
  64        AUDIT_PRE_SYNC,
  65        AUDIT_POST_SYNC
  66};
  67
  68#undef MMU_DEBUG
  69
  70#ifdef MMU_DEBUG
  71static bool dbg = 0;
  72module_param(dbg, bool, 0644);
  73
  74#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
  75#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
  76#define MMU_WARN_ON(x) WARN_ON(x)
  77#else
  78#define pgprintk(x...) do { } while (0)
  79#define rmap_printk(x...) do { } while (0)
  80#define MMU_WARN_ON(x) do { } while (0)
  81#endif
  82
  83#define PTE_PREFETCH_NUM                8
  84
  85#define PT_FIRST_AVAIL_BITS_SHIFT 10
  86#define PT64_SECOND_AVAIL_BITS_SHIFT 52
  87
  88#define PT64_LEVEL_BITS 9
  89
  90#define PT64_LEVEL_SHIFT(level) \
  91                (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
  92
  93#define PT64_INDEX(address, level)\
  94        (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
  95
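/*
 * For illustration (a minimal sketch, assuming 4-KiB base pages, i.e.
 * PAGE_SHIFT == 12): PT64_LEVEL_SHIFT(1..4) evaluates to 12, 21, 30 and 39,
 * so for the address (1ULL << 30) the per-level indices come out as
 *
 *	PT64_INDEX(1ULL << 30, 4) == 0
 *	PT64_INDEX(1ULL << 30, 3) == 1
 *	PT64_INDEX(1ULL << 30, 2) == 0
 *	PT64_INDEX(1ULL << 30, 1) == 0
 *
 * i.e. the address selects entry 1 of the level-3 table and entry 0 at
 * every other level.
 */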
  96
  97#define PT32_LEVEL_BITS 10
  98
  99#define PT32_LEVEL_SHIFT(level) \
 100                (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
 101
 102#define PT32_LVL_OFFSET_MASK(level) \
 103        (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
 104                                                * PT32_LEVEL_BITS))) - 1))
 105
 106#define PT32_INDEX(address, level)\
 107        (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
 108
 109
 110#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
 111#define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1))
 112#else
 113#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
 114#endif
 115#define PT64_LVL_ADDR_MASK(level) \
 116        (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
 117                                                * PT64_LEVEL_BITS))) - 1))
 118#define PT64_LVL_OFFSET_MASK(level) \
 119        (PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
 120                                                * PT64_LEVEL_BITS))) - 1))
 121
 122#define PT32_BASE_ADDR_MASK PAGE_MASK
 123#define PT32_DIR_BASE_ADDR_MASK \
 124        (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
 125#define PT32_LVL_ADDR_MASK(level) \
 126        (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
 127                                            * PT32_LEVEL_BITS))) - 1))
 128
 129#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \
 130                        | shadow_x_mask | shadow_nx_mask | shadow_me_mask)
 131
 132#define ACC_EXEC_MASK    1
 133#define ACC_WRITE_MASK   PT_WRITABLE_MASK
 134#define ACC_USER_MASK    PT_USER_MASK
 135#define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
 136
 137/* The mask for the R/X bits in EPT PTEs */
 138#define PT64_EPT_READABLE_MASK                  0x1ull
 139#define PT64_EPT_EXECUTABLE_MASK                0x4ull
 140
 141#include <trace/events/kvm.h>
 142
 143#define CREATE_TRACE_POINTS
 144#include "mmutrace.h"
 145
 146#define SPTE_HOST_WRITEABLE     (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
 147#define SPTE_MMU_WRITEABLE      (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
 148
 149#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
 150
 151/* make pte_list_desc fit well in cache line */
 152#define PTE_LIST_EXT 3
 153
 154/*
 155 * Return values of handle_mmio_page_fault and mmu.page_fault:
 156 * RET_PF_RETRY: let CPU fault again on the address.
 157 * RET_PF_EMULATE: mmio page fault, emulate the instruction directly.
 158 *
 159 * For handle_mmio_page_fault only:
 160 * RET_PF_INVALID: the spte is invalid, let the real page fault path update it.
 161 */
 162enum {
 163        RET_PF_RETRY = 0,
 164        RET_PF_EMULATE = 1,
 165        RET_PF_INVALID = 2,
 166};
 167
 168struct pte_list_desc {
 169        u64 *sptes[PTE_LIST_EXT];
 170        struct pte_list_desc *more;
 171};
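
/*
 * A rough size check (a sketch, assuming an x86-64 build where u64 * and
 * struct pointers are both 8 bytes): sizeof(struct pte_list_desc) is
 * 3 * 8 + 8 == 32 bytes, so two descriptors fit in a typical 64-byte
 * cache line, which is what the PTE_LIST_EXT comment above is getting at.
 */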
 172
 173struct kvm_shadow_walk_iterator {
 174        u64 addr;
 175        hpa_t shadow_addr;
 176        u64 *sptep;
 177        int level;
 178        unsigned index;
 179};
 180
 181static const union kvm_mmu_page_role mmu_base_role_mask = {
 182        .cr0_wp = 1,
 183        .gpte_is_8_bytes = 1,
 184        .nxe = 1,
 185        .smep_andnot_wp = 1,
 186        .smap_andnot_wp = 1,
 187        .smm = 1,
 188        .guest_mode = 1,
 189        .ad_disabled = 1,
 190};
 191
 192#define for_each_shadow_entry_using_root(_vcpu, _root, _addr, _walker)     \
 193        for (shadow_walk_init_using_root(&(_walker), (_vcpu),              \
 194                                         (_root), (_addr));                \
 195             shadow_walk_okay(&(_walker));                                 \
 196             shadow_walk_next(&(_walker)))
 197
 198#define for_each_shadow_entry(_vcpu, _addr, _walker)            \
 199        for (shadow_walk_init(&(_walker), _vcpu, _addr);        \
 200             shadow_walk_okay(&(_walker));                      \
 201             shadow_walk_next(&(_walker)))
 202
 203#define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte)     \
 204        for (shadow_walk_init(&(_walker), _vcpu, _addr);                \
 205             shadow_walk_okay(&(_walker)) &&                            \
 206                ({ spte = mmu_spte_get_lockless(_walker.sptep); 1; });  \
 207             __shadow_walk_next(&(_walker), spte))
 208
 209static struct kmem_cache *pte_list_desc_cache;
 210static struct kmem_cache *mmu_page_header_cache;
 211static struct percpu_counter kvm_total_used_mmu_pages;
 212
 213static u64 __read_mostly shadow_nx_mask;
 214static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
 215static u64 __read_mostly shadow_user_mask;
 216static u64 __read_mostly shadow_accessed_mask;
 217static u64 __read_mostly shadow_dirty_mask;
 218static u64 __read_mostly shadow_mmio_mask;
 219static u64 __read_mostly shadow_mmio_value;
 220static u64 __read_mostly shadow_present_mask;
 221static u64 __read_mostly shadow_me_mask;
 222
 223/*
 224 * SPTEs used by MMUs without A/D bits are marked with shadow_acc_track_value.
 225 * Non-present SPTEs with shadow_acc_track_value set are in place for access
 226 * tracking.
 227 */
 228static u64 __read_mostly shadow_acc_track_mask;
 229static const u64 shadow_acc_track_value = SPTE_SPECIAL_MASK;
 230
 231/*
 232 * The mask/shift to use for saving the original R/X bits when marking the PTE
 233 * as not-present for access tracking purposes. We do not save the W bit as the
 234 * PTEs being access tracked also need to be dirty tracked, so the W bit will be
 235 * restored only when a write is attempted to the page.
 236 */
 237static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK |
 238                                                    PT64_EPT_EXECUTABLE_MASK;
 239static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT;
 240
 241/*
 242 * This mask must be set on all non-zero Non-Present or Reserved SPTEs in order
 243 * to guard against L1TF attacks.
 244 */
 245static u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
 246
 247/*
 248 * The number of high-order 1 bits to use in the mask above.
 249 */
 250static const u64 shadow_nonpresent_or_rsvd_mask_len = 5;
 251
 252/*
 253 * In some cases, we need to preserve the GFN of a non-present or reserved
 254 * SPTE when we usurp the upper five bits of the physical address space to
 255 * defend against L1TF, e.g. for MMIO SPTEs.  To preserve the GFN, we'll
 256 * shift bits of the GFN that overlap with shadow_nonpresent_or_rsvd_mask
 257 * left into the reserved bits, i.e. the GFN in the SPTE will be split into
 258 * high and low parts.  This mask covers the lower bits of the GFN.
 259 */
 260static u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;
 261
 262
 263static void mmu_spte_set(u64 *sptep, u64 spte);
 264static union kvm_mmu_page_role
 265kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu);
 266
 267
 268static inline bool kvm_available_flush_tlb_with_range(void)
 269{
 270        return kvm_x86_ops->tlb_remote_flush_with_range;
 271}
 272
 273static void kvm_flush_remote_tlbs_with_range(struct kvm *kvm,
 274                struct kvm_tlb_range *range)
 275{
 276        int ret = -ENOTSUPP;
 277
 278        if (range && kvm_x86_ops->tlb_remote_flush_with_range)
 279                ret = kvm_x86_ops->tlb_remote_flush_with_range(kvm, range);
 280
 281        if (ret)
 282                kvm_flush_remote_tlbs(kvm);
 283}
 284
 285static void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
 286                u64 start_gfn, u64 pages)
 287{
 288        struct kvm_tlb_range range;
 289
 290        range.start_gfn = start_gfn;
 291        range.pages = pages;
 292
 293        kvm_flush_remote_tlbs_with_range(kvm, &range);
 294}
 295
 296void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value)
 297{
 298        BUG_ON((mmio_mask & mmio_value) != mmio_value);
 299        shadow_mmio_value = mmio_value | SPTE_SPECIAL_MASK;
 300        shadow_mmio_mask = mmio_mask | SPTE_SPECIAL_MASK;
 301}
 302EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
 303
 304static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
 305{
 306        return sp->role.ad_disabled;
 307}
 308
 309static inline bool spte_ad_enabled(u64 spte)
 310{
 311        MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value);
 312        return !(spte & shadow_acc_track_value);
 313}
 314
 315static inline u64 spte_shadow_accessed_mask(u64 spte)
 316{
 317        MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value);
 318        return spte_ad_enabled(spte) ? shadow_accessed_mask : 0;
 319}
 320
 321static inline u64 spte_shadow_dirty_mask(u64 spte)
 322{
 323        MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value);
 324        return spte_ad_enabled(spte) ? shadow_dirty_mask : 0;
 325}
 326
 327static inline bool is_access_track_spte(u64 spte)
 328{
 329        return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0;
 330}
 331
 332/*
 333 * Due to limited space in PTEs, the MMIO generation is a 19 bit subset of
 334 * the memslots generation and is derived as follows:
 335 *
 336 * Bits 0-8 of the MMIO generation are propagated to spte bits 3-11
 337 * Bits 9-18 of the MMIO generation are propagated to spte bits 52-61
 338 *
 339 * The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in
 340 * the MMIO generation number, as doing so would require stealing a bit from
 341 * the "real" generation number and thus effectively halve the maximum number
 342 * of MMIO generations that can be handled before encountering a wrap (which
 343 * requires a full MMU zap).  The flag is instead explicitly queried when
 344 * checking for MMIO spte cache hits.
 345 */
 346#define MMIO_SPTE_GEN_MASK              GENMASK_ULL(18, 0)
 347
 348#define MMIO_SPTE_GEN_LOW_START         3
 349#define MMIO_SPTE_GEN_LOW_END           11
 350#define MMIO_SPTE_GEN_LOW_MASK          GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \
 351                                                    MMIO_SPTE_GEN_LOW_START)
 352
 353#define MMIO_SPTE_GEN_HIGH_START        52
 354#define MMIO_SPTE_GEN_HIGH_END          61
 355#define MMIO_SPTE_GEN_HIGH_MASK         GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \
 356                                                    MMIO_SPTE_GEN_HIGH_START)
 357static u64 generation_mmio_spte_mask(u64 gen)
 358{
 359        u64 mask;
 360
 361        WARN_ON(gen & ~MMIO_SPTE_GEN_MASK);
 362
 363        mask = (gen << MMIO_SPTE_GEN_LOW_START) & MMIO_SPTE_GEN_LOW_MASK;
 364        mask |= (gen << MMIO_SPTE_GEN_HIGH_START) & MMIO_SPTE_GEN_HIGH_MASK;
 365        return mask;
 366}
 367
 368static u64 get_mmio_spte_generation(u64 spte)
 369{
 370        u64 gen;
 371
 372        spte &= ~shadow_mmio_mask;
 373
 374        gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_START;
 375        gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_START;
 376        return gen;
 377}
 378
 379static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
 380                           unsigned access)
 381{
 382        u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK;
 383        u64 mask = generation_mmio_spte_mask(gen);
 384        u64 gpa = gfn << PAGE_SHIFT;
 385
 386        access &= ACC_WRITE_MASK | ACC_USER_MASK;
 387        mask |= shadow_mmio_value | access;
 388        mask |= gpa | shadow_nonpresent_or_rsvd_mask;
 389        mask |= (gpa & shadow_nonpresent_or_rsvd_mask)
 390                << shadow_nonpresent_or_rsvd_mask_len;
 391
 392        page_header(__pa(sptep))->mmio_cached = true;
 393
 394        trace_mark_mmio_spte(sptep, gfn, access, gen);
 395        mmu_spte_set(sptep, mask);
 396}
 397
 398static bool is_mmio_spte(u64 spte)
 399{
 400        return (spte & shadow_mmio_mask) == shadow_mmio_value;
 401}
 402
 403static gfn_t get_mmio_spte_gfn(u64 spte)
 404{
 405        u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask;
 406
 407        gpa |= (spte >> shadow_nonpresent_or_rsvd_mask_len)
 408               & shadow_nonpresent_or_rsvd_mask;
 409
 410        return gpa >> PAGE_SHIFT;
 411}
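
/*
 * Example of the GFN split described for shadow_nonpresent_or_rsvd_mask
 * (a sketch, assuming a CPU whose L1 cache uses 46 physical address bits,
 * so the mask covers bits 41-45): mark_mmio_spte() keeps GPA bits 12-40 in
 * place, forces bits 41-45 to 1 and stores a copy of the original GPA bits
 * 41-45 at spte bits 46-50; get_mmio_spte_gfn() then recombines the halves:
 *
 *	gpa  = spte & GENMASK_ULL(40, 12);
 *	gpa |= (spte >> 5) & GENMASK_ULL(45, 41);
 */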
 412
 413static unsigned get_mmio_spte_access(u64 spte)
 414{
 415        u64 mask = generation_mmio_spte_mask(MMIO_SPTE_GEN_MASK) | shadow_mmio_mask;
 416        return (spte & ~mask) & ~PAGE_MASK;
 417}
 418
 419static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
 420                          kvm_pfn_t pfn, unsigned access)
 421{
 422        if (unlikely(is_noslot_pfn(pfn))) {
 423                mark_mmio_spte(vcpu, sptep, gfn, access);
 424                return true;
 425        }
 426
 427        return false;
 428}
 429
 430static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
 431{
 432        u64 kvm_gen, spte_gen, gen;
 433
 434        gen = kvm_vcpu_memslots(vcpu)->generation;
 435        if (unlikely(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS))
 436                return false;
 437
 438        kvm_gen = gen & MMIO_SPTE_GEN_MASK;
 439        spte_gen = get_mmio_spte_generation(spte);
 440
 441        trace_check_mmio_spte(spte, kvm_gen, spte_gen);
 442        return likely(kvm_gen == spte_gen);
 443}
 444
 445/*
 446 * Sets the shadow PTE masks used by the MMU.
 447 *
 448 * Assumptions:
 449 *  - Setting either @accessed_mask or @dirty_mask requires setting both
 450 *  - At least one of @accessed_mask or @acc_track_mask must be set
 451 */
 452void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 453                u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
 454                u64 acc_track_mask, u64 me_mask)
 455{
 456        BUG_ON(!dirty_mask != !accessed_mask);
 457        BUG_ON(!accessed_mask && !acc_track_mask);
 458        BUG_ON(acc_track_mask & shadow_acc_track_value);
 459
 460        shadow_user_mask = user_mask;
 461        shadow_accessed_mask = accessed_mask;
 462        shadow_dirty_mask = dirty_mask;
 463        shadow_nx_mask = nx_mask;
 464        shadow_x_mask = x_mask;
 465        shadow_present_mask = p_mask;
 466        shadow_acc_track_mask = acc_track_mask;
 467        shadow_me_mask = me_mask;
 468}
 469EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
 470
 471static void kvm_mmu_reset_all_pte_masks(void)
 472{
 473        u8 low_phys_bits;
 474
 475        shadow_user_mask = 0;
 476        shadow_accessed_mask = 0;
 477        shadow_dirty_mask = 0;
 478        shadow_nx_mask = 0;
 479        shadow_x_mask = 0;
 480        shadow_mmio_mask = 0;
 481        shadow_present_mask = 0;
 482        shadow_acc_track_mask = 0;
 483
 484        /*
 485         * If the CPU has 46 or fewer physical address bits, then set an
 486         * appropriate mask to guard against L1TF attacks. Otherwise, it is
 487         * assumed that the CPU is not vulnerable to L1TF.
 488         *
 489         * Some Intel CPUs address the L1 cache using more PA bits than are
 490         * reported by CPUID. Use the PA width of the L1 cache when possible
 491         * to achieve more effective mitigation, e.g. if system RAM overlaps
 492         * the most significant bits of legal physical address space.
 493         */
 494        shadow_nonpresent_or_rsvd_mask = 0;
 495        low_phys_bits = boot_cpu_data.x86_cache_bits;
 496        if (boot_cpu_data.x86_cache_bits <
 497            52 - shadow_nonpresent_or_rsvd_mask_len) {
 498                shadow_nonpresent_or_rsvd_mask =
 499                        rsvd_bits(boot_cpu_data.x86_cache_bits -
 500                                  shadow_nonpresent_or_rsvd_mask_len,
 501                                  boot_cpu_data.x86_cache_bits - 1);
 502                low_phys_bits -= shadow_nonpresent_or_rsvd_mask_len;
 503        } else
 504                WARN_ON_ONCE(boot_cpu_has_bug(X86_BUG_L1TF));
 505
 506        shadow_nonpresent_or_rsvd_lower_gfn_mask =
 507                GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT);
 508}
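
/*
 * Concrete values for the L1TF setup above (a sketch, assuming
 * boot_cpu_data.x86_cache_bits == 46): 46 is below 52 - 5, so
 * shadow_nonpresent_or_rsvd_mask becomes rsvd_bits(41, 45), low_phys_bits
 * drops to 41, and shadow_nonpresent_or_rsvd_lower_gfn_mask ends up as
 * GENMASK_ULL(40, 12), i.e. the part of the GFN that the mitigation never
 * disturbs.
 */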
 509
 510static int is_cpuid_PSE36(void)
 511{
 512        return 1;
 513}
 514
 515static int is_nx(struct kvm_vcpu *vcpu)
 516{
 517        return vcpu->arch.efer & EFER_NX;
 518}
 519
 520static int is_shadow_present_pte(u64 pte)
 521{
 522        return (pte != 0) && !is_mmio_spte(pte);
 523}
 524
 525static int is_large_pte(u64 pte)
 526{
 527        return pte & PT_PAGE_SIZE_MASK;
 528}
 529
 530static int is_last_spte(u64 pte, int level)
 531{
 532        if (level == PT_PAGE_TABLE_LEVEL)
 533                return 1;
 534        if (is_large_pte(pte))
 535                return 1;
 536        return 0;
 537}
 538
 539static bool is_executable_pte(u64 spte)
 540{
 541        return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask;
 542}
 543
 544static kvm_pfn_t spte_to_pfn(u64 pte)
 545{
 546        return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
 547}
 548
 549static gfn_t pse36_gfn_delta(u32 gpte)
 550{
 551        int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
 552
 553        return (gpte & PT32_DIR_PSE36_MASK) << shift;
 554}
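
/*
 * The shift above is 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT, so the lowest
 * PSE-36 bit of the guest PDE (gpte bit PT32_DIR_PSE36_SHIFT) lands on GFN
 * bit 32 - PAGE_SHIFT, i.e. physical address bit 32, which is where the
 * PSE-36 scheme places the extra address bits.
 */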
 555
 556#ifdef CONFIG_X86_64
 557static void __set_spte(u64 *sptep, u64 spte)
 558{
 559        WRITE_ONCE(*sptep, spte);
 560}
 561
 562static void __update_clear_spte_fast(u64 *sptep, u64 spte)
 563{
 564        WRITE_ONCE(*sptep, spte);
 565}
 566
 567static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
 568{
 569        return xchg(sptep, spte);
 570}
 571
 572static u64 __get_spte_lockless(u64 *sptep)
 573{
 574        return READ_ONCE(*sptep);
 575}
 576#else
 577union split_spte {
 578        struct {
 579                u32 spte_low;
 580                u32 spte_high;
 581        };
 582        u64 spte;
 583};
 584
 585static void count_spte_clear(u64 *sptep, u64 spte)
 586{
 587        struct kvm_mmu_page *sp =  page_header(__pa(sptep));
 588
 589        if (is_shadow_present_pte(spte))
 590                return;
 591
 592        /* Ensure the spte is completely set before we increase the count */
 593        smp_wmb();
 594        sp->clear_spte_count++;
 595}
 596
 597static void __set_spte(u64 *sptep, u64 spte)
 598{
 599        union split_spte *ssptep, sspte;
 600
 601        ssptep = (union split_spte *)sptep;
 602        sspte = (union split_spte)spte;
 603
 604        ssptep->spte_high = sspte.spte_high;
 605
 606        /*
 607         * If we are mapping the spte from nonpresent to present, we must
 608         * store the high bits first and only then set the present bit, so
 609         * the CPU cannot fetch the spte while we are still setting it.
 610         */
 611        smp_wmb();
 612
 613        WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
 614}
 615
 616static void __update_clear_spte_fast(u64 *sptep, u64 spte)
 617{
 618        union split_spte *ssptep, sspte;
 619
 620        ssptep = (union split_spte *)sptep;
 621        sspte = (union split_spte)spte;
 622
 623        WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
 624
 625        /*
 626         * If we are mapping the spte from present to nonpresent, we must
 627         * clear the present bit first so a vCPU cannot fetch stale high bits.
 628         */
 629        smp_wmb();
 630
 631        ssptep->spte_high = sspte.spte_high;
 632        count_spte_clear(sptep, spte);
 633}
 634
 635static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
 636{
 637        union split_spte *ssptep, sspte, orig;
 638
 639        ssptep = (union split_spte *)sptep;
 640        sspte = (union split_spte)spte;
 641
 642        /* xchg acts as a barrier before the setting of the high bits */
 643        orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
 644        orig.spte_high = ssptep->spte_high;
 645        ssptep->spte_high = sspte.spte_high;
 646        count_spte_clear(sptep, spte);
 647
 648        return orig.spte;
 649}
 650
 651/*
 652 * The idea of using this lightweight way to read the spte on 32-bit hosts
 653 * comes from gup_get_pte (arch/x86/mm/gup.c).
 654 *
 655 * An spte TLB flush may be pending, because kvm_set_pte_rmapp coalesces
 656 * the flushes and we run outside of the MMU lock.  Therefore we need to
 657 * protect against in-progress updates of the spte.
 658 *
 659 * Reading the spte while an update is in progress may get the old value
 660 * for the high part of the spte.  The race is fine for a present->non-present
 661 * change (because the high part of the spte is ignored for non-present spte),
 662 * but for a present->present change we must reread the spte.
 663 *
 664 * All such changes are done in two steps (present->non-present and
 665 * non-present->present), hence it is enough to count the number of
 666 * present->non-present updates: if it changed while reading the spte,
 667 * we might have hit the race.  This is done using clear_spte_count.
 668 */
 669static u64 __get_spte_lockless(u64 *sptep)
 670{
 671        struct kvm_mmu_page *sp =  page_header(__pa(sptep));
 672        union split_spte spte, *orig = (union split_spte *)sptep;
 673        int count;
 674
 675retry:
 676        count = sp->clear_spte_count;
 677        smp_rmb();
 678
 679        spte.spte_low = orig->spte_low;
 680        smp_rmb();
 681
 682        spte.spte_high = orig->spte_high;
 683        smp_rmb();
 684
 685        if (unlikely(spte.spte_low != orig->spte_low ||
 686              count != sp->clear_spte_count))
 687                goto retry;
 688
 689        return spte.spte;
 690}
 691#endif
 692
 693static bool spte_can_locklessly_be_made_writable(u64 spte)
 694{
 695        return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) ==
 696                (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE);
 697}
 698
 699static bool spte_has_volatile_bits(u64 spte)
 700{
 701        if (!is_shadow_present_pte(spte))
 702                return false;
 703
 704        /*
 705         * Always update the spte atomically if it can be updated outside
 706         * of the mmu-lock: this ensures the dirty bit is not lost and also
 707         * gives us a stable is_writable_pte() so that a required TLB flush
 708         * is not missed.
 709         */
 710        if (spte_can_locklessly_be_made_writable(spte) ||
 711            is_access_track_spte(spte))
 712                return true;
 713
 714        if (spte_ad_enabled(spte)) {
 715                if ((spte & shadow_accessed_mask) == 0 ||
 716                    (is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0))
 717                        return true;
 718        }
 719
 720        return false;
 721}
 722
 723static bool is_accessed_spte(u64 spte)
 724{
 725        u64 accessed_mask = spte_shadow_accessed_mask(spte);
 726
 727        return accessed_mask ? spte & accessed_mask
 728                             : !is_access_track_spte(spte);
 729}
 730
 731static bool is_dirty_spte(u64 spte)
 732{
 733        u64 dirty_mask = spte_shadow_dirty_mask(spte);
 734
 735        return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK;
 736}
 737
 738/* Rules for using mmu_spte_set:
 739 * Set the sptep from nonpresent to present.
 740 * Note: the sptep being assigned *must* be either not present
 741 * or in a state where the hardware will not attempt to update
 742 * the spte.
 743 */
 744static void mmu_spte_set(u64 *sptep, u64 new_spte)
 745{
 746        WARN_ON(is_shadow_present_pte(*sptep));
 747        __set_spte(sptep, new_spte);
 748}
 749
 750/*
 751 * Update the SPTE (excluding the PFN), but do not track changes in its
 752 * accessed/dirty status.
 753 */
 754static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
 755{
 756        u64 old_spte = *sptep;
 757
 758        WARN_ON(!is_shadow_present_pte(new_spte));
 759
 760        if (!is_shadow_present_pte(old_spte)) {
 761                mmu_spte_set(sptep, new_spte);
 762                return old_spte;
 763        }
 764
 765        if (!spte_has_volatile_bits(old_spte))
 766                __update_clear_spte_fast(sptep, new_spte);
 767        else
 768                old_spte = __update_clear_spte_slow(sptep, new_spte);
 769
 770        WARN_ON(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));
 771
 772        return old_spte;
 773}
 774
 775/* Rules for using mmu_spte_update:
 776 * Update the state bits; the mapped pfn must not change.
 777 *
 778 * Whenever we overwrite a writable spte with a read-only one we
 779 * should flush remote TLBs.  Otherwise rmap_write_protect would
 780 * find a read-only spte even though the writable spte might still
 781 * be cached in a CPU's TLB; the return value indicates whether this
 782 * is the case.
 783 *
 784 * Returns true if the TLB needs to be flushed
 785 */
 786static bool mmu_spte_update(u64 *sptep, u64 new_spte)
 787{
 788        bool flush = false;
 789        u64 old_spte = mmu_spte_update_no_track(sptep, new_spte);
 790
 791        if (!is_shadow_present_pte(old_spte))
 792                return false;
 793
 794        /*
 795         * Updating the spte outside of the mmu-lock is safe because
 796         * we always update it atomically; see the comments in
 797         * spte_has_volatile_bits().
 798         */
 799        if (spte_can_locklessly_be_made_writable(old_spte) &&
 800              !is_writable_pte(new_spte))
 801                flush = true;
 802
 803        /*
 804         * Flush TLB when accessed/dirty states are changed in the page tables,
 805         * to guarantee consistency between TLB and page tables.
 806         */
 807
 808        if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) {
 809                flush = true;
 810                kvm_set_pfn_accessed(spte_to_pfn(old_spte));
 811        }
 812
 813        if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) {
 814                flush = true;
 815                kvm_set_pfn_dirty(spte_to_pfn(old_spte));
 816        }
 817
 818        return flush;
 819}
 820
 821/*
 822 * Rules for using mmu_spte_clear_track_bits:
 823 * It sets the sptep from present to nonpresent and tracks the
 824 * state bits; it is used to clear a last-level sptep.
 825 * Returns non-zero if the PTE was previously valid.
 826 */
 827static int mmu_spte_clear_track_bits(u64 *sptep)
 828{
 829        kvm_pfn_t pfn;
 830        u64 old_spte = *sptep;
 831
 832        if (!spte_has_volatile_bits(old_spte))
 833                __update_clear_spte_fast(sptep, 0ull);
 834        else
 835                old_spte = __update_clear_spte_slow(sptep, 0ull);
 836
 837        if (!is_shadow_present_pte(old_spte))
 838                return 0;
 839
 840        pfn = spte_to_pfn(old_spte);
 841
 842        /*
 843         * KVM does not hold a refcount on the pages used by the
 844         * KVM MMU, so before such a page is reclaimed it must first
 845         * be unmapped from the MMU.
 846         */
 847        WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn)));
 848
 849        if (is_accessed_spte(old_spte))
 850                kvm_set_pfn_accessed(pfn);
 851
 852        if (is_dirty_spte(old_spte))
 853                kvm_set_pfn_dirty(pfn);
 854
 855        return 1;
 856}
 857
 858/*
 859 * Rules for using mmu_spte_clear_no_track:
 860 * Directly clear the spte without tracking its state bits;
 861 * it is used for upper-level sptes.
 862 */
 863static void mmu_spte_clear_no_track(u64 *sptep)
 864{
 865        __update_clear_spte_fast(sptep, 0ull);
 866}
 867
 868static u64 mmu_spte_get_lockless(u64 *sptep)
 869{
 870        return __get_spte_lockless(sptep);
 871}
 872
 873static u64 mark_spte_for_access_track(u64 spte)
 874{
 875        if (spte_ad_enabled(spte))
 876                return spte & ~shadow_accessed_mask;
 877
 878        if (is_access_track_spte(spte))
 879                return spte;
 880
 881        /*
 882         * Making an Access Tracking PTE will result in removal of write access
 883         * from the PTE. So, verify that we will be able to restore the write
 884         * access in the fast page fault path later on.
 885         */
 886        WARN_ONCE((spte & PT_WRITABLE_MASK) &&
 887                  !spte_can_locklessly_be_made_writable(spte),
 888                  "kvm: Writable SPTE is not locklessly dirty-trackable\n");
 889
 890        WARN_ONCE(spte & (shadow_acc_track_saved_bits_mask <<
 891                          shadow_acc_track_saved_bits_shift),
 892                  "kvm: Access Tracking saved bit locations are not zero\n");
 893
 894        spte |= (spte & shadow_acc_track_saved_bits_mask) <<
 895                shadow_acc_track_saved_bits_shift;
 896        spte &= ~shadow_acc_track_mask;
 897
 898        return spte;
 899}
 900
 901/* Restore an acc-track PTE back to a regular PTE */
 902static u64 restore_acc_track_spte(u64 spte)
 903{
 904        u64 new_spte = spte;
 905        u64 saved_bits = (spte >> shadow_acc_track_saved_bits_shift)
 906                         & shadow_acc_track_saved_bits_mask;
 907
 908        WARN_ON_ONCE(spte_ad_enabled(spte));
 909        WARN_ON_ONCE(!is_access_track_spte(spte));
 910
 911        new_spte &= ~shadow_acc_track_mask;
 912        new_spte &= ~(shadow_acc_track_saved_bits_mask <<
 913                      shadow_acc_track_saved_bits_shift);
 914        new_spte |= saved_bits;
 915
 916        return new_spte;
 917}
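
/*
 * A worked example of the save/restore pair above (a sketch, assuming an
 * EPT configuration without A/D bits where shadow_acc_track_mask covers the
 * RWX bits): for an spte with the R and X bits set,
 * mark_spte_for_access_track() copies bit 0 to bit 52 and bit 2 to bit 54,
 * then clears the low RWX bits, making the spte non-present;
 * restore_acc_track_spte() shifts bits 52 and 54 back down and clears the
 * saved copies, recovering the original R and X permissions.  The W bit is
 * deliberately not saved, as explained next to
 * shadow_acc_track_saved_bits_mask.
 */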
 918
 919/* Returns the Accessed status of the PTE and resets it at the same time. */
 920static bool mmu_spte_age(u64 *sptep)
 921{
 922        u64 spte = mmu_spte_get_lockless(sptep);
 923
 924        if (!is_accessed_spte(spte))
 925                return false;
 926
 927        if (spte_ad_enabled(spte)) {
 928                clear_bit((ffs(shadow_accessed_mask) - 1),
 929                          (unsigned long *)sptep);
 930        } else {
 931                /*
 932                 * Capture the dirty status of the page, so that it doesn't get
 933                 * lost when the SPTE is marked for access tracking.
 934                 */
 935                if (is_writable_pte(spte))
 936                        kvm_set_pfn_dirty(spte_to_pfn(spte));
 937
 938                spte = mark_spte_for_access_track(spte);
 939                mmu_spte_update_no_track(sptep, spte);
 940        }
 941
 942        return true;
 943}
 944
 945static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
 946{
 947        /*
 948         * Prevent page table teardown by making anyone who frees shadow
 949         * pages wait for the kvm_flush_remote_tlbs() IPI to all active vCPUs.
 950         */
 951        local_irq_disable();
 952
 953        /*
 954         * Make sure a following spte read is not reordered ahead of the write
 955         * to vcpu->mode.
 956         */
 957        smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
 958}
 959
 960static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
 961{
 962        /*
 963         * Make sure the write to vcpu->mode is not reordered in front of
 964         * reads to sptes.  If it does, kvm_mmu_commit_zap_page() can see us
 965         * OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
 966         */
 967        smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
 968        local_irq_enable();
 969}
 970
 971static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
 972                                  struct kmem_cache *base_cache, int min)
 973{
 974        void *obj;
 975
 976        if (cache->nobjs >= min)
 977                return 0;
 978        while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
 979                obj = kmem_cache_zalloc(base_cache, GFP_KERNEL_ACCOUNT);
 980                if (!obj)
 981                        return cache->nobjs >= min ? 0 : -ENOMEM;
 982                cache->objects[cache->nobjs++] = obj;
 983        }
 984        return 0;
 985}
 986
 987static int mmu_memory_cache_free_objects(struct kvm_mmu_memory_cache *cache)
 988{
 989        return cache->nobjs;
 990}
 991
 992static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
 993                                  struct kmem_cache *cache)
 994{
 995        while (mc->nobjs)
 996                kmem_cache_free(cache, mc->objects[--mc->nobjs]);
 997}
 998
 999static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
1000                                       int min)
1001{
1002        void *page;
1003
1004        if (cache->nobjs >= min)
1005                return 0;
1006        while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
1007                page = (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
1008                if (!page)
1009                        return cache->nobjs >= min ? 0 : -ENOMEM;
1010                cache->objects[cache->nobjs++] = page;
1011        }
1012        return 0;
1013}
1014
1015static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
1016{
1017        while (mc->nobjs)
1018                free_page((unsigned long)mc->objects[--mc->nobjs]);
1019}
1020
1021static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
1022{
1023        int r;
1024
1025        r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
1026                                   pte_list_desc_cache, 8 + PTE_PREFETCH_NUM);
1027        if (r)
1028                goto out;
1029        r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
1030        if (r)
1031                goto out;
1032        r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
1033                                   mmu_page_header_cache, 4);
1034out:
1035        return r;
1036}
1037
1038static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
1039{
1040        mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
1041                                pte_list_desc_cache);
1042        mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
1043        mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
1044                                mmu_page_header_cache);
1045}
1046
1047static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
1048{
1049        void *p;
1050
1051        BUG_ON(!mc->nobjs);
1052        p = mc->objects[--mc->nobjs];
1053        return p;
1054}
1055
1056static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)
1057{
1058        return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache);
1059}
1060
1061static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
1062{
1063        kmem_cache_free(pte_list_desc_cache, pte_list_desc);
1064}
1065
1066static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
1067{
1068        if (!sp->role.direct)
1069                return sp->gfns[index];
1070
1071        return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
1072}
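
/*
 * For a direct shadow page the gfn is computed rather than stored; e.g.
 * (a sketch) a direct sp with role.level == PT_DIRECTORY_LEVEL has entries
 * that each cover 2 MiB, so entry i corresponds to gfn sp->gfn + (i << 9),
 * i.e. the gfn advances by 512 pages per index.
 */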
1073
1074static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
1075{
1076        if (sp->role.direct)
1077                BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index));
1078        else
1079                sp->gfns[index] = gfn;
1080}
1081
1082/*
1083 * Return the pointer to the large page information for a given gfn,
1084 * handling slots that are not large page aligned.
1085 */
1086static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
1087                                              struct kvm_memory_slot *slot,
1088                                              int level)
1089{
1090        unsigned long idx;
1091
1092        idx = gfn_to_index(gfn, slot->base_gfn, level);
1093        return &slot->arch.lpage_info[level - 2][idx];
1094}
1095
1096static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot,
1097                                            gfn_t gfn, int count)
1098{
1099        struct kvm_lpage_info *linfo;
1100        int i;
1101
1102        for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
1103                linfo = lpage_info_slot(gfn, slot, i);
1104                linfo->disallow_lpage += count;
1105                WARN_ON(linfo->disallow_lpage < 0);
1106        }
1107}
1108
1109void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
1110{
1111        update_gfn_disallow_lpage_count(slot, gfn, 1);
1112}
1113
1114void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
1115{
1116        update_gfn_disallow_lpage_count(slot, gfn, -1);
1117}
1118
1119static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
1120{
1121        struct kvm_memslots *slots;
1122        struct kvm_memory_slot *slot;
1123        gfn_t gfn;
1124
1125        kvm->arch.indirect_shadow_pages++;
1126        gfn = sp->gfn;
1127        slots = kvm_memslots_for_spte_role(kvm, sp->role);
1128        slot = __gfn_to_memslot(slots, gfn);
1129
1130        /* Non-leaf shadow pages are kept read-only. */
1131        if (sp->role.level > PT_PAGE_TABLE_LEVEL)
1132                return kvm_slot_page_track_add_page(kvm, slot, gfn,
1133                                                    KVM_PAGE_TRACK_WRITE);
1134
1135        kvm_mmu_gfn_disallow_lpage(slot, gfn);
1136}
1137
1138static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
1139{
1140        struct kvm_memslots *slots;
1141        struct kvm_memory_slot *slot;
1142        gfn_t gfn;
1143
1144        kvm->arch.indirect_shadow_pages--;
1145        gfn = sp->gfn;
1146        slots = kvm_memslots_for_spte_role(kvm, sp->role);
1147        slot = __gfn_to_memslot(slots, gfn);
1148        if (sp->role.level > PT_PAGE_TABLE_LEVEL)
1149                return kvm_slot_page_track_remove_page(kvm, slot, gfn,
1150                                                       KVM_PAGE_TRACK_WRITE);
1151
1152        kvm_mmu_gfn_allow_lpage(slot, gfn);
1153}
1154
1155static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level,
1156                                          struct kvm_memory_slot *slot)
1157{
1158        struct kvm_lpage_info *linfo;
1159
1160        if (slot) {
1161                linfo = lpage_info_slot(gfn, slot, level);
1162                return !!linfo->disallow_lpage;
1163        }
1164
1165        return true;
1166}
1167
1168static bool mmu_gfn_lpage_is_disallowed(struct kvm_vcpu *vcpu, gfn_t gfn,
1169                                        int level)
1170{
1171        struct kvm_memory_slot *slot;
1172
1173        slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
1174        return __mmu_gfn_lpage_is_disallowed(gfn, level, slot);
1175}
1176
1177static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
1178{
1179        unsigned long page_size;
1180        int i, ret = 0;
1181
1182        page_size = kvm_host_page_size(kvm, gfn);
1183
1184        for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
1185                if (page_size >= KVM_HPAGE_SIZE(i))
1186                        ret = i;
1187                else
1188                        break;
1189        }
1190
1191        return ret;
1192}
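
/*
 * For example (a sketch): if kvm_host_page_size() reports 2 MiB for the
 * gfn, the loop above stops after PT_DIRECTORY_LEVEL and returns that
 * level; a 4-KiB backing yields PT_PAGE_TABLE_LEVEL, and a 1-GiB backing
 * yields PT_PDPE_LEVEL (when that is within PT_MAX_HUGEPAGE_LEVEL).
 */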
1193
1194static inline bool memslot_valid_for_gpte(struct kvm_memory_slot *slot,
1195                                          bool no_dirty_log)
1196{
1197        if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
1198                return false;
1199        if (no_dirty_log && slot->dirty_bitmap)
1200                return false;
1201
1202        return true;
1203}
1204
1205static struct kvm_memory_slot *
1206gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
1207                            bool no_dirty_log)
1208{
1209        struct kvm_memory_slot *slot;
1210
1211        slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
1212        if (!memslot_valid_for_gpte(slot, no_dirty_log))
1213                slot = NULL;
1214
1215        return slot;
1216}
1217
1218static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn,
1219                         bool *force_pt_level)
1220{
1221        int host_level, level, max_level;
1222        struct kvm_memory_slot *slot;
1223
1224        if (unlikely(*force_pt_level))
1225                return PT_PAGE_TABLE_LEVEL;
1226
1227        slot = kvm_vcpu_gfn_to_memslot(vcpu, large_gfn);
1228        *force_pt_level = !memslot_valid_for_gpte(slot, true);
1229        if (unlikely(*force_pt_level))
1230                return PT_PAGE_TABLE_LEVEL;
1231
1232        host_level = host_mapping_level(vcpu->kvm, large_gfn);
1233
1234        if (host_level == PT_PAGE_TABLE_LEVEL)
1235                return host_level;
1236
1237        max_level = min(kvm_x86_ops->get_lpage_level(), host_level);
1238
1239        for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
1240                if (__mmu_gfn_lpage_is_disallowed(large_gfn, level, slot))
1241                        break;
1242
1243        return level - 1;
1244}
1245
1246/*
1247 * About rmap_head encoding:
1248 *
1249 * If the bit zero of rmap_head->val is clear, then it points to the only spte
1250 * in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct
1251 * pte_list_desc containing more mappings.
1252 */
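
/*
 * Decoding rmap_head->val therefore looks like this (a minimal sketch of
 * what rmap_get_first() below does):
 *
 *	if (!(rmap_head->val & 1))
 *		sptep = (u64 *)rmap_head->val;         (the lone spte)
 *	else
 *		desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
 */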
1253
1254/*
1255 * Returns the number of pointers in the rmap chain, not counting the new one.
1256 */
1257static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
1258                        struct kvm_rmap_head *rmap_head)
1259{
1260        struct pte_list_desc *desc;
1261        int i, count = 0;
1262
1263        if (!rmap_head->val) {
1264                rmap_printk("pte_list_add: %p %llx 0->1\n", spte, *spte);
1265                rmap_head->val = (unsigned long)spte;
1266        } else if (!(rmap_head->val & 1)) {
1267                rmap_printk("pte_list_add: %p %llx 1->many\n", spte, *spte);
1268                desc = mmu_alloc_pte_list_desc(vcpu);
1269                desc->sptes[0] = (u64 *)rmap_head->val;
1270                desc->sptes[1] = spte;
1271                rmap_head->val = (unsigned long)desc | 1;
1272                ++count;
1273        } else {
1274                rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte);
1275                desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
1276                while (desc->sptes[PTE_LIST_EXT-1] && desc->more) {
1277                        desc = desc->more;
1278                        count += PTE_LIST_EXT;
1279                }
1280                if (desc->sptes[PTE_LIST_EXT-1]) {
1281                        desc->more = mmu_alloc_pte_list_desc(vcpu);
1282                        desc = desc->more;
1283                }
1284                for (i = 0; desc->sptes[i]; ++i)
1285                        ++count;
1286                desc->sptes[i] = spte;
1287        }
1288        return count;
1289}
1290
1291static void
1292pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
1293                           struct pte_list_desc *desc, int i,
1294                           struct pte_list_desc *prev_desc)
1295{
1296        int j;
1297
1298        for (j = PTE_LIST_EXT - 1; !desc->sptes[j] && j > i; --j)
1299                ;
1300        desc->sptes[i] = desc->sptes[j];
1301        desc->sptes[j] = NULL;
1302        if (j != 0)
1303                return;
1304        if (!prev_desc && !desc->more)
1305                rmap_head->val = (unsigned long)desc->sptes[0];
1306        else
1307                if (prev_desc)
1308                        prev_desc->more = desc->more;
1309                else
1310                        rmap_head->val = (unsigned long)desc->more | 1;
1311        mmu_free_pte_list_desc(desc);
1312}
1313
1314static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
1315{
1316        struct pte_list_desc *desc;
1317        struct pte_list_desc *prev_desc;
1318        int i;
1319
1320        if (!rmap_head->val) {
1321                pr_err("%s: %p 0->BUG\n", __func__, spte);
1322                BUG();
1323        } else if (!(rmap_head->val & 1)) {
1324                rmap_printk("%s:  %p 1->0\n", __func__, spte);
1325                if ((u64 *)rmap_head->val != spte) {
1326                        pr_err("%s:  %p 1->BUG\n", __func__, spte);
1327                        BUG();
1328                }
1329                rmap_head->val = 0;
1330        } else {
1331                rmap_printk("%s:  %p many->many\n", __func__, spte);
1332                desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
1333                prev_desc = NULL;
1334                while (desc) {
1335                        for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) {
1336                                if (desc->sptes[i] == spte) {
1337                                        pte_list_desc_remove_entry(rmap_head,
1338                                                        desc, i, prev_desc);
1339                                        return;
1340                                }
1341                        }
1342                        prev_desc = desc;
1343                        desc = desc->more;
1344                }
1345                pr_err("%s: %p many->many\n", __func__, spte);
1346                BUG();
1347        }
1348}
1349
1350static void pte_list_remove(struct kvm_rmap_head *rmap_head, u64 *sptep)
1351{
1352        mmu_spte_clear_track_bits(sptep);
1353        __pte_list_remove(sptep, rmap_head);
1354}
1355
1356static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level,
1357                                           struct kvm_memory_slot *slot)
1358{
1359        unsigned long idx;
1360
1361        idx = gfn_to_index(gfn, slot->base_gfn, level);
1362        return &slot->arch.rmap[level - PT_PAGE_TABLE_LEVEL][idx];
1363}
1364
1365static struct kvm_rmap_head *gfn_to_rmap(struct kvm *kvm, gfn_t gfn,
1366                                         struct kvm_mmu_page *sp)
1367{
1368        struct kvm_memslots *slots;
1369        struct kvm_memory_slot *slot;
1370
1371        slots = kvm_memslots_for_spte_role(kvm, sp->role);
1372        slot = __gfn_to_memslot(slots, gfn);
1373        return __gfn_to_rmap(gfn, sp->role.level, slot);
1374}
1375
1376static bool rmap_can_add(struct kvm_vcpu *vcpu)
1377{
1378        struct kvm_mmu_memory_cache *cache;
1379
1380        cache = &vcpu->arch.mmu_pte_list_desc_cache;
1381        return mmu_memory_cache_free_objects(cache);
1382}
1383
1384static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
1385{
1386        struct kvm_mmu_page *sp;
1387        struct kvm_rmap_head *rmap_head;
1388
1389        sp = page_header(__pa(spte));
1390        kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
1391        rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
1392        return pte_list_add(vcpu, spte, rmap_head);
1393}
1394
1395static void rmap_remove(struct kvm *kvm, u64 *spte)
1396{
1397        struct kvm_mmu_page *sp;
1398        gfn_t gfn;
1399        struct kvm_rmap_head *rmap_head;
1400
1401        sp = page_header(__pa(spte));
1402        gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
1403        rmap_head = gfn_to_rmap(kvm, gfn, sp);
1404        __pte_list_remove(spte, rmap_head);
1405}
1406
1407/*
1408 * Used by the following functions to iterate through the sptes linked by a
1409 * rmap.  All fields are private and not assumed to be used outside.
1410 */
1411struct rmap_iterator {
1412        /* private fields */
1413        struct pte_list_desc *desc;     /* holds the sptep if not NULL */
1414        int pos;                        /* index of the sptep */
1415};
1416
1417/*
1418 * Iteration must be started by this function.  This should also be used after
1419 * removing/dropping sptes from the rmap link because in such cases the
1420 * information in the iterator may not be valid.
1421 *
1422 * Returns sptep if found, NULL otherwise.
1423 */
1424static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
1425                           struct rmap_iterator *iter)
1426{
1427        u64 *sptep;
1428
1429        if (!rmap_head->val)
1430                return NULL;
1431
1432        if (!(rmap_head->val & 1)) {
1433                iter->desc = NULL;
1434                sptep = (u64 *)rmap_head->val;
1435                goto out;
1436        }
1437
1438        iter->desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
1439        iter->pos = 0;
1440        sptep = iter->desc->sptes[iter->pos];
1441out:
1442        BUG_ON(!is_shadow_present_pte(*sptep));
1443        return sptep;
1444}
1445
1446/*
1447 * Must be used with a valid iterator: e.g. after rmap_get_first().
1448 *
1449 * Returns sptep if found, NULL otherwise.
1450 */
1451static u64 *rmap_get_next(struct rmap_iterator *iter)
1452{
1453        u64 *sptep;
1454
1455        if (iter->desc) {
1456                if (iter->pos < PTE_LIST_EXT - 1) {
1457                        ++iter->pos;
1458                        sptep = iter->desc->sptes[iter->pos];
1459                        if (sptep)
1460                                goto out;
1461                }
1462
1463                iter->desc = iter->desc->more;
1464
1465                if (iter->desc) {
1466                        iter->pos = 0;
1467                        /* desc->sptes[0] cannot be NULL */
1468                        sptep = iter->desc->sptes[iter->pos];
1469                        goto out;
1470                }
1471        }
1472
1473        return NULL;
1474out:
1475        BUG_ON(!is_shadow_present_pte(*sptep));
1476        return sptep;
1477}
1478
1479#define for_each_rmap_spte(_rmap_head_, _iter_, _spte_)                 \
1480        for (_spte_ = rmap_get_first(_rmap_head_, _iter_);              \
1481             _spte_; _spte_ = rmap_get_next(_iter_))
1482
1483static void drop_spte(struct kvm *kvm, u64 *sptep)
1484{
1485        if (mmu_spte_clear_track_bits(sptep))
1486                rmap_remove(kvm, sptep);
1487}
1488
1489
1490static bool __drop_large_spte(struct kvm *kvm, u64 *sptep)
1491{
1492        if (is_large_pte(*sptep)) {
1493                WARN_ON(page_header(__pa(sptep))->role.level ==
1494                        PT_PAGE_TABLE_LEVEL);
1495                drop_spte(kvm, sptep);
1496                --kvm->stat.lpages;
1497                return true;
1498        }
1499
1500        return false;
1501}
1502
1503static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
1504{
1505        if (__drop_large_spte(vcpu->kvm, sptep)) {
1506                struct kvm_mmu_page *sp = page_header(__pa(sptep));
1507
1508                kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
1509                        KVM_PAGES_PER_HPAGE(sp->role.level));
1510        }
1511}
1512
1513/*
1514 * Write-protect the specified @sptep; @pt_protect indicates whether the
1515 * write protection is done in order to protect a shadow page table.
1516 *
1517 * Note: write protection differs between dirty logging and spte
1518 * protection:
1519 * - for dirty logging, the spte can be made writable at any time as long
1520 *   as its dirty bitmap is properly set.
1521 * - for spte protection, the spte can be made writable only after the
1522 *   shadow page is unsynced.
1523 *
1524 * Return true if the TLB needs to be flushed.
1525 */
1526static bool spte_write_protect(u64 *sptep, bool pt_protect)
1527{
1528        u64 spte = *sptep;
1529
1530        if (!is_writable_pte(spte) &&
1531              !(pt_protect && spte_can_locklessly_be_made_writable(spte)))
1532                return false;
1533
1534        rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
1535
1536        if (pt_protect)
1537                spte &= ~SPTE_MMU_WRITEABLE;
1538        spte = spte & ~PT_WRITABLE_MASK;
1539
1540        return mmu_spte_update(sptep, spte);
1541}
1542
1543static bool __rmap_write_protect(struct kvm *kvm,
1544                                 struct kvm_rmap_head *rmap_head,
1545                                 bool pt_protect)
1546{
1547        u64 *sptep;
1548        struct rmap_iterator iter;
1549        bool flush = false;
1550
1551        for_each_rmap_spte(rmap_head, &iter, sptep)
1552                flush |= spte_write_protect(sptep, pt_protect);
1553
1554        return flush;
1555}
1556
1557static bool spte_clear_dirty(u64 *sptep)
1558{
1559        u64 spte = *sptep;
1560
1561        rmap_printk("rmap_clear_dirty: spte %p %llx\n", sptep, *sptep);
1562
1563        spte &= ~shadow_dirty_mask;
1564
1565        return mmu_spte_update(sptep, spte);
1566}
1567
1568static bool wrprot_ad_disabled_spte(u64 *sptep)
1569{
1570        bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT,
1571                                               (unsigned long *)sptep);
1572        if (was_writable)
1573                kvm_set_pfn_dirty(spte_to_pfn(*sptep));
1574
1575        return was_writable;
1576}
1577
1578/*
1579 * Gets the GFN ready for another round of dirty logging by clearing the
1580 *      - D bit on ad-enabled SPTEs, and
1581 *      - W bit on ad-disabled SPTEs.
1582 * Returns true iff any D or W bits were cleared.
1583 */
1584static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
1585{
1586        u64 *sptep;
1587        struct rmap_iterator iter;
1588        bool flush = false;
1589
1590        for_each_rmap_spte(rmap_head, &iter, sptep)
1591                if (spte_ad_enabled(*sptep))
1592                        flush |= spte_clear_dirty(sptep);
1593                else
1594                        flush |= wrprot_ad_disabled_spte(sptep);
1595
1596        return flush;
1597}
1598
1599static bool spte_set_dirty(u64 *sptep)
1600{
1601        u64 spte = *sptep;
1602
1603        rmap_printk("rmap_set_dirty: spte %p %llx\n", sptep, *sptep);
1604
1605        spte |= shadow_dirty_mask;
1606
1607        return mmu_spte_update(sptep, spte);
1608}
1609
1610static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
1611{
1612        u64 *sptep;
1613        struct rmap_iterator iter;
1614        bool flush = false;
1615
1616        for_each_rmap_spte(rmap_head, &iter, sptep)
1617                if (spte_ad_enabled(*sptep))
1618                        flush |= spte_set_dirty(sptep);
1619
1620        return flush;
1621}
1622
1623/**
1624 * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
1625 * @kvm: kvm instance
1626 * @slot: slot to protect
1627 * @gfn_offset: start of the BITS_PER_LONG pages we care about
1628 * @mask: indicates which pages we should protect
1629 *
1630 * Used when we do not need to care about huge page mappings: e.g. during dirty
1631 * logging we do not have any such mappings.
1632 */
1633static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
1634                                     struct kvm_memory_slot *slot,
1635                                     gfn_t gfn_offset, unsigned long mask)
1636{
1637        struct kvm_rmap_head *rmap_head;
1638
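            /*
             * Walk the set bits of @mask from least to most significant:
             * __ffs(mask) gives the index of the lowest set bit and
             * "mask &= mask - 1" clears it.  For example, mask == 0b1010
             * write-protects the rmaps of gfn_offset + 1 and
             * gfn_offset + 3.
             */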
1639        while (mask) {
1640                rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
1641                                          PT_PAGE_TABLE_LEVEL, slot);
1642                __rmap_write_protect(kvm, rmap_head, false);
1643
1644                /* clear the first set bit */
1645                mask &= mask - 1;
1646        }
1647}
1648
1649/**
1650 * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages, or write
1651 * protect the page if the D-bit isn't supported.
1652 * @kvm: kvm instance
1653 * @slot: slot to clear D-bit
1654 * @gfn_offset: start of the BITS_PER_LONG pages we care about
1655 * @mask: indicates which pages we should clear the D-bit for
1656 *
1657 * Used for PML to re-log the dirty GPAs after userspace queries the dirty_bitmap.
1658 */
1659void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1660                                     struct kvm_memory_slot *slot,
1661                                     gfn_t gfn_offset, unsigned long mask)
1662{
1663        struct kvm_rmap_head *rmap_head;
1664
1665        while (mask) {
1666                rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
1667                                          PT_PAGE_TABLE_LEVEL, slot);
1668                __rmap_clear_dirty(kvm, rmap_head);
1669
1670                /* clear the first set bit */
1671                mask &= mask - 1;
1672        }
1673}
1674EXPORT_SYMBOL_GPL(kvm_mmu_clear_dirty_pt_masked);
1675
1676/**
1677 * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
1678 * PT level pages.
1679 *
1680 * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
1681 * enable dirty logging for them.
1682 *
1683 * Used when we do not need to care about huge page mappings: e.g. during dirty
1684 * logging we do not have any such mappings.
1685 */
1686void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
1687                                struct kvm_memory_slot *slot,
1688                                gfn_t gfn_offset, unsigned long mask)
1689{
1690        if (kvm_x86_ops->enable_log_dirty_pt_masked)
1691                kvm_x86_ops->enable_log_dirty_pt_masked(kvm, slot, gfn_offset,
1692                                mask);
1693        else
1694                kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
1695}
1696
1697/**
1698 * kvm_arch_write_log_dirty - emulate dirty page logging
1699 * @vcpu: Guest mode vcpu
1700 *
1701 * Emulate arch specific page modification logging for the
1702 * nested hypervisor
1703 */
1704int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu)
1705{
1706        if (kvm_x86_ops->write_log_dirty)
1707                return kvm_x86_ops->write_log_dirty(vcpu);
1708
1709        return 0;
1710}
1711
1712bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
1713                                    struct kvm_memory_slot *slot, u64 gfn)
1714{
1715        struct kvm_rmap_head *rmap_head;
1716        int i;
1717        bool write_protected = false;
1718
1719        for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
1720                rmap_head = __gfn_to_rmap(gfn, i, slot);
1721                write_protected |= __rmap_write_protect(kvm, rmap_head, true);
1722        }
1723
1724        return write_protected;
1725}
1726
1727static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
1728{
1729        struct kvm_memory_slot *slot;
1730
1731        slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
1732        return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn);
1733}
1734
1735static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
1736{
1737        u64 *sptep;
1738        struct rmap_iterator iter;
1739        bool flush = false;
1740
1741        while ((sptep = rmap_get_first(rmap_head, &iter))) {
1742                rmap_printk("%s: spte %p %llx.\n", __func__, sptep, *sptep);
1743
1744                pte_list_remove(rmap_head, sptep);
1745                flush = true;
1746        }
1747
1748        return flush;
1749}
1750
1751static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1752                           struct kvm_memory_slot *slot, gfn_t gfn, int level,
1753                           unsigned long data)
1754{
1755        return kvm_zap_rmapp(kvm, rmap_head);
1756}
1757
1758static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1759                             struct kvm_memory_slot *slot, gfn_t gfn, int level,
1760                             unsigned long data)
1761{
1762        u64 *sptep;
1763        struct rmap_iterator iter;
1764        int need_flush = 0;
1765        u64 new_spte;
1766        pte_t *ptep = (pte_t *)data;
1767        kvm_pfn_t new_pfn;
1768
1769        WARN_ON(pte_huge(*ptep));
1770        new_pfn = pte_pfn(*ptep);
1771
1772restart:
1773        for_each_rmap_spte(rmap_head, &iter, sptep) {
1774                rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n",
1775                            sptep, *sptep, gfn, level);
1776
1777                need_flush = 1;
1778
1779                if (pte_write(*ptep)) {
1780                        pte_list_remove(rmap_head, sptep);
1781                        goto restart;
1782                } else {
1783                        new_spte = *sptep & ~PT64_BASE_ADDR_MASK;
1784                        new_spte |= (u64)new_pfn << PAGE_SHIFT;
1785
1786                        new_spte &= ~PT_WRITABLE_MASK;
1787                        new_spte &= ~SPTE_HOST_WRITEABLE;
1788
1789                        new_spte = mark_spte_for_access_track(new_spte);
1790
1791                        mmu_spte_clear_track_bits(sptep);
1792                        mmu_spte_set(sptep, new_spte);
1793                }
1794        }
1795
1796        if (need_flush && kvm_available_flush_tlb_with_range()) {
1797                kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
1798                return 0;
1799        }
1800
1801        return need_flush;
1802}
1803
1804struct slot_rmap_walk_iterator {
1805        /* input fields. */
1806        struct kvm_memory_slot *slot;
1807        gfn_t start_gfn;
1808        gfn_t end_gfn;
1809        int start_level;
1810        int end_level;
1811
1812        /* output fields. */
1813        gfn_t gfn;
1814        struct kvm_rmap_head *rmap;
1815        int level;
1816
1817        /* private field. */
1818        struct kvm_rmap_head *end_rmap;
1819};
1820
1821static void
1822rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level)
1823{
1824        iterator->level = level;
1825        iterator->gfn = iterator->start_gfn;
1826        iterator->rmap = __gfn_to_rmap(iterator->gfn, level, iterator->slot);
1827        iterator->end_rmap = __gfn_to_rmap(iterator->end_gfn, level,
1828                                           iterator->slot);
1829}
1830
1831static void
1832slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator,
1833                    struct kvm_memory_slot *slot, int start_level,
1834                    int end_level, gfn_t start_gfn, gfn_t end_gfn)
1835{
1836        iterator->slot = slot;
1837        iterator->start_level = start_level;
1838        iterator->end_level = end_level;
1839        iterator->start_gfn = start_gfn;
1840        iterator->end_gfn = end_gfn;
1841
1842        rmap_walk_init_level(iterator, iterator->start_level);
1843}
1844
1845static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator)
1846{
1847        return !!iterator->rmap;
1848}
1849
1850static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
1851{
1852        if (++iterator->rmap <= iterator->end_rmap) {
1853                iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level));
1854                return;
1855        }
1856
1857        if (++iterator->level > iterator->end_level) {
1858                iterator->rmap = NULL;
1859                return;
1860        }
1861
1862        rmap_walk_init_level(iterator, iterator->level);
1863}
1864
1865#define for_each_slot_rmap_range(_slot_, _start_level_, _end_level_,    \
1866           _start_gfn, _end_gfn, _iter_)                                \
1867        for (slot_rmap_walk_init(_iter_, _slot_, _start_level_,         \
1868                                 _end_level_, _start_gfn, _end_gfn);    \
1869             slot_rmap_walk_okay(_iter_);                               \
1870             slot_rmap_walk_next(_iter_))
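    /*
     * Typical use, as in kvm_handle_hva_range() below: visit every rmap
     * bucket of @_slot_ between @_start_gfn and @_end_gfn for each level
     * in [_start_level_, _end_level_]:
     *
     *    for_each_slot_rmap_range(memslot, PT_PAGE_TABLE_LEVEL,
     *                             PT_MAX_HUGEPAGE_LEVEL,
     *                             gfn_start, gfn_end - 1, &iterator)
     *            handler(kvm, iterator.rmap, memslot,
     *                    iterator.gfn, iterator.level, data);
     */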
1871
1872static int kvm_handle_hva_range(struct kvm *kvm,
1873                                unsigned long start,
1874                                unsigned long end,
1875                                unsigned long data,
1876                                int (*handler)(struct kvm *kvm,
1877                                               struct kvm_rmap_head *rmap_head,
1878                                               struct kvm_memory_slot *slot,
1879                                               gfn_t gfn,
1880                                               int level,
1881                                               unsigned long data))
1882{
1883        struct kvm_memslots *slots;
1884        struct kvm_memory_slot *memslot;
1885        struct slot_rmap_walk_iterator iterator;
1886        int ret = 0;
1887        int i;
1888
1889        for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
1890                slots = __kvm_memslots(kvm, i);
1891                kvm_for_each_memslot(memslot, slots) {
1892                        unsigned long hva_start, hva_end;
1893                        gfn_t gfn_start, gfn_end;
1894
1895                        hva_start = max(start, memslot->userspace_addr);
1896                        hva_end = min(end, memslot->userspace_addr +
1897                                      (memslot->npages << PAGE_SHIFT));
1898                        if (hva_start >= hva_end)
1899                                continue;
1900                        /*
1901                         * {gfn(page) | page intersects with [hva_start, hva_end)} =
1902                         * {gfn_start, gfn_start+1, ..., gfn_end-1}.
1903                         */
1904                        gfn_start = hva_to_gfn_memslot(hva_start, memslot);
1905                        gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
1906
1907                        for_each_slot_rmap_range(memslot, PT_PAGE_TABLE_LEVEL,
1908                                                 PT_MAX_HUGEPAGE_LEVEL,
1909                                                 gfn_start, gfn_end - 1,
1910                                                 &iterator)
1911                                ret |= handler(kvm, iterator.rmap, memslot,
1912                                               iterator.gfn, iterator.level, data);
1913                }
1914        }
1915
1916        return ret;
1917}
1918
1919static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
1920                          unsigned long data,
1921                          int (*handler)(struct kvm *kvm,
1922                                         struct kvm_rmap_head *rmap_head,
1923                                         struct kvm_memory_slot *slot,
1924                                         gfn_t gfn, int level,
1925                                         unsigned long data))
1926{
1927        return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler);
1928}
1929
1930int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
1931{
1932        return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp);
1933}
1934
1935int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
1936{
1937        return kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
1938}
1939
1940static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1941                         struct kvm_memory_slot *slot, gfn_t gfn, int level,
1942                         unsigned long data)
1943{
1944        u64 *sptep;
1945        struct rmap_iterator uninitialized_var(iter);
1946        int young = 0;
1947
1948        for_each_rmap_spte(rmap_head, &iter, sptep)
1949                young |= mmu_spte_age(sptep);
1950
1951        trace_kvm_age_page(gfn, level, slot, young);
1952        return young;
1953}
1954
1955static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1956                              struct kvm_memory_slot *slot, gfn_t gfn,
1957                              int level, unsigned long data)
1958{
1959        u64 *sptep;
1960        struct rmap_iterator iter;
1961
1962        for_each_rmap_spte(rmap_head, &iter, sptep)
1963                if (is_accessed_spte(*sptep))
1964                        return 1;
1965        return 0;
1966}
1967
1968#define RMAP_RECYCLE_THRESHOLD 1000
1969
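    /*
     * If a single gfn accumulates more than RMAP_RECYCLE_THRESHOLD sptes,
     * rmap_recycle() zaps its whole rmap chain (via kvm_unmap_rmapp) and
     * flushes the TLBs for the covered range, which bounds the length of
     * the reverse-map list (see mmu_set_spte()).
     */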
1970static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
1971{
1972        struct kvm_rmap_head *rmap_head;
1973        struct kvm_mmu_page *sp;
1974
1975        sp = page_header(__pa(spte));
1976
1977        rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
1978
1979        kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, 0);
1980        kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
1981                        KVM_PAGES_PER_HPAGE(sp->role.level));
1982}
1983
1984int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
1985{
1986        return kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp);
1987}
1988
1989int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
1990{
1991        return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
1992}
1993
1994#ifdef MMU_DEBUG
1995static int is_empty_shadow_page(u64 *spt)
1996{
1997        u64 *pos;
1998        u64 *end;
1999
2000        for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
2001                if (is_shadow_present_pte(*pos)) {
2002                        printk(KERN_ERR "%s: %p %llx\n", __func__,
2003                               pos, *pos);
2004                        return 0;
2005                }
2006        return 1;
2007}
2008#endif
2009
2010/*
2011 * This value is the sum of all of the kvm instances'
2012 * kvm->arch.n_used_mmu_pages values.  We need a global,
2013 * aggregate version in order to make the slab shrinker
2014 * faster.
2015 */
2016static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, unsigned long nr)
2017{
2018        kvm->arch.n_used_mmu_pages += nr;
2019        percpu_counter_add(&kvm_total_used_mmu_pages, nr);
2020}
2021
2022static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
2023{
2024        MMU_WARN_ON(!is_empty_shadow_page(sp->spt));
2025        hlist_del(&sp->hash_link);
2026        list_del(&sp->link);
2027        free_page((unsigned long)sp->spt);
2028        if (!sp->role.direct)
2029                free_page((unsigned long)sp->gfns);
2030        kmem_cache_free(mmu_page_header_cache, sp);
2031}
2032
2033static unsigned kvm_page_table_hashfn(gfn_t gfn)
2034{
2035        return hash_64(gfn, KVM_MMU_HASH_SHIFT);
2036}
2037
2038static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
2039                                    struct kvm_mmu_page *sp, u64 *parent_pte)
2040{
2041        if (!parent_pte)
2042                return;
2043
2044        pte_list_add(vcpu, parent_pte, &sp->parent_ptes);
2045}
2046
2047static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
2048                                       u64 *parent_pte)
2049{
2050        __pte_list_remove(parent_pte, &sp->parent_ptes);
2051}
2052
2053static void drop_parent_pte(struct kvm_mmu_page *sp,
2054                            u64 *parent_pte)
2055{
2056        mmu_page_remove_parent_pte(sp, parent_pte);
2057        mmu_spte_clear_no_track(parent_pte);
2058}
2059
2060static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct)
2061{
2062        struct kvm_mmu_page *sp;
2063
2064        sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
2065        sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
2066        if (!direct)
2067                sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
2068        set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
2069        list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
2070        kvm_mod_used_mmu_pages(vcpu->kvm, +1);
2071        return sp;
2072}
2073
2074static void mark_unsync(u64 *spte);
2075static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
2076{
2077        u64 *sptep;
2078        struct rmap_iterator iter;
2079
2080        for_each_rmap_spte(&sp->parent_ptes, &iter, sptep) {
2081                mark_unsync(sptep);
2082        }
2083}
2084
2085static void mark_unsync(u64 *spte)
2086{
2087        struct kvm_mmu_page *sp;
2088        unsigned int index;
2089
2090        sp = page_header(__pa(spte));
2091        index = spte - sp->spt;
2092        if (__test_and_set_bit(index, sp->unsync_child_bitmap))
2093                return;
2094        if (sp->unsync_children++)
2095                return;
2096        kvm_mmu_mark_parents_unsync(sp);
2097}
2098
2099static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
2100                               struct kvm_mmu_page *sp)
2101{
2102        return 0;
2103}
2104
2105static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root)
2106{
2107}
2108
2109static void nonpaging_update_pte(struct kvm_vcpu *vcpu,
2110                                 struct kvm_mmu_page *sp, u64 *spte,
2111                                 const void *pte)
2112{
2113        WARN_ON(1);
2114}
2115
2116#define KVM_PAGE_ARRAY_NR 16
2117
2118struct kvm_mmu_pages {
2119        struct mmu_page_and_offset {
2120                struct kvm_mmu_page *sp;
2121                unsigned int idx;
2122        } page[KVM_PAGE_ARRAY_NR];
2123        unsigned int nr;
2124};
2125
2126static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
2127                         int idx)
2128{
2129        int i;
2130
2131        if (sp->unsync)
2132                for (i = 0; i < pvec->nr; i++)
2133                        if (pvec->page[i].sp == sp)
2134                                return 0;
2135
2136        pvec->page[pvec->nr].sp = sp;
2137        pvec->page[pvec->nr].idx = idx;
2138        pvec->nr++;
2139        return (pvec->nr == KVM_PAGE_ARRAY_NR);
2140}
2141
2142static inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx)
2143{
2144        --sp->unsync_children;
2145        WARN_ON((int)sp->unsync_children < 0);
2146        __clear_bit(idx, sp->unsync_child_bitmap);
2147}
2148
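    /*
     * Recursively collect unsync shadow pages reachable from @sp into
     * @pvec: pages that have unsync children are added so their subtrees
     * can be walked, and unsync leaves are added and counted.  Returns
     * the number of unsync leaves found, or -ENOSPC if @pvec overflows.
     * A child whose subtree contains no unsync leaves has its
     * unsync_child bit cleared.
     */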
2149static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
2150                           struct kvm_mmu_pages *pvec)
2151{
2152        int i, ret, nr_unsync_leaf = 0;
2153
2154        for_each_set_bit(i, sp->unsync_child_bitmap, 512) {
2155                struct kvm_mmu_page *child;
2156                u64 ent = sp->spt[i];
2157
2158                if (!is_shadow_present_pte(ent) || is_large_pte(ent)) {
2159                        clear_unsync_child_bit(sp, i);
2160                        continue;
2161                }
2162
2163                child = page_header(ent & PT64_BASE_ADDR_MASK);
2164
2165                if (child->unsync_children) {
2166                        if (mmu_pages_add(pvec, child, i))
2167                                return -ENOSPC;
2168
2169                        ret = __mmu_unsync_walk(child, pvec);
2170                        if (!ret) {
2171                                clear_unsync_child_bit(sp, i);
2172                                continue;
2173                        } else if (ret > 0) {
2174                                nr_unsync_leaf += ret;
2175                        } else
2176                                return ret;
2177                } else if (child->unsync) {
2178                        nr_unsync_leaf++;
2179                        if (mmu_pages_add(pvec, child, i))
2180                                return -ENOSPC;
2181                } else
2182                        clear_unsync_child_bit(sp, i);
2183        }
2184
2185        return nr_unsync_leaf;
2186}
2187
2188#define INVALID_INDEX (-1)
2189
2190static int mmu_unsync_walk(struct kvm_mmu_page *sp,
2191                           struct kvm_mmu_pages *pvec)
2192{
2193        pvec->nr = 0;
2194        if (!sp->unsync_children)
2195                return 0;
2196
2197        mmu_pages_add(pvec, sp, INVALID_INDEX);
2198        return __mmu_unsync_walk(sp, pvec);
2199}
2200
2201static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
2202{
2203        WARN_ON(!sp->unsync);
2204        trace_kvm_mmu_sync_page(sp);
2205        sp->unsync = 0;
2206        --kvm->stat.mmu_unsync;
2207}
2208
2209static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
2210                                     struct list_head *invalid_list);
2211static void kvm_mmu_commit_zap_page(struct kvm *kvm,
2212                                    struct list_head *invalid_list);
2213
2214
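    /*
     * Iterate over the shadow pages in the hash bucket for @_gfn,
     * skipping pages already marked invalid.
     * for_each_gfn_indirect_valid_sp() additionally skips direct pages
     * and pages that shadow a different gfn.
     */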
2215#define for_each_valid_sp(_kvm, _sp, _gfn)                              \
2216        hlist_for_each_entry(_sp,                                       \
2217          &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
2218                if ((_sp)->role.invalid) {    \
2219                } else
2220
2221#define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn)                 \
2222        for_each_valid_sp(_kvm, _sp, _gfn)                              \
2223                if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else
2224
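    /*
     * cr0_wp=1 together with smap_andnot_wp=1 is an impossible
     * combination for an ordinary shadow-MMU role (smap_andnot_wp means
     * SMAP is enabled while CR0.WP is clear), so the shadow EPT role
     * deliberately sets both bits and this check identifies EPT shadow
     * pages.
     */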
2225static inline bool is_ept_sp(struct kvm_mmu_page *sp)
2226{
2227        return sp->role.cr0_wp && sp->role.smap_andnot_wp;
2228}
2229
2230/* @sp->gfn should be write-protected at the call site */
2231static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
2232                            struct list_head *invalid_list)
2233{
2234        if ((!is_ept_sp(sp) && sp->role.gpte_is_8_bytes != !!is_pae(vcpu)) ||
2235            vcpu->arch.mmu->sync_page(vcpu, sp) == 0) {
2236                kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
2237                return false;
2238        }
2239
2240        return true;
2241}
2242
2243static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm,
2244                                        struct list_head *invalid_list,
2245                                        bool remote_flush)
2246{
2247        if (!remote_flush && list_empty(invalid_list))
2248                return false;
2249
2250        if (!list_empty(invalid_list))
2251                kvm_mmu_commit_zap_page(kvm, invalid_list);
2252        else
2253                kvm_flush_remote_tlbs(kvm);
2254        return true;
2255}
2256
2257static void kvm_mmu_flush_or_zap(struct kvm_vcpu *vcpu,
2258                                 struct list_head *invalid_list,
2259                                 bool remote_flush, bool local_flush)
2260{
2261        if (kvm_mmu_remote_flush_or_zap(vcpu->kvm, invalid_list, remote_flush))
2262                return;
2263
2264        if (local_flush)
2265                kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
2266}
2267
2268#ifdef CONFIG_KVM_MMU_AUDIT
2269#include "mmu_audit.c"
2270#else
2271static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { }
2272static void mmu_audit_disable(void) { }
2273#endif
2274
2275static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
2276                         struct list_head *invalid_list)
2277{
2278        kvm_unlink_unsync_page(vcpu->kvm, sp);
2279        return __kvm_sync_page(vcpu, sp, invalid_list);
2280}
2281
2282/* @gfn should be write-protected at the call site */
2283static bool kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn,
2284                           struct list_head *invalid_list)
2285{
2286        struct kvm_mmu_page *s;
2287        bool ret = false;
2288
2289        for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) {
2290                if (!s->unsync)
2291                        continue;
2292
2293                WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
2294                ret |= kvm_sync_page(vcpu, s, invalid_list);
2295        }
2296
2297        return ret;
2298}
2299
2300struct mmu_page_path {
2301        struct kvm_mmu_page *parent[PT64_ROOT_MAX_LEVEL];
2302        unsigned int idx[PT64_ROOT_MAX_LEVEL];
2303};
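    /*
     * mmu_page_path records, for each level, the parent shadow page
     * being walked and the index of the child entry within it.  It is
     * filled by mmu_pages_first()/mmu_pages_next() and consumed by
     * mmu_pages_clear_parents() to clear unsync_child bits up the chain.
     */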
2304
2305#define for_each_sp(pvec, sp, parents, i)                       \
2306                for (i = mmu_pages_first(&pvec, &parents);      \
2307                        i < pvec.nr && ({ sp = pvec.page[i].sp; 1;});   \
2308                        i = mmu_pages_next(&pvec, &parents, i))
2309
2310static int mmu_pages_next(struct kvm_mmu_pages *pvec,
2311                          struct mmu_page_path *parents,
2312                          int i)
2313{
2314        int n;
2315
2316        for (n = i+1; n < pvec->nr; n++) {
2317                struct kvm_mmu_page *sp = pvec->page[n].sp;
2318                unsigned idx = pvec->page[n].idx;
2319                int level = sp->role.level;
2320
2321                parents->idx[level-1] = idx;
2322                if (level == PT_PAGE_TABLE_LEVEL)
2323                        break;
2324
2325                parents->parent[level-2] = sp;
2326        }
2327
2328        return n;
2329}
2330
2331static int mmu_pages_first(struct kvm_mmu_pages *pvec,
2332                           struct mmu_page_path *parents)
2333{
2334        struct kvm_mmu_page *sp;
2335        int level;
2336
2337        if (pvec->nr == 0)
2338                return 0;
2339
2340        WARN_ON(pvec->page[0].idx != INVALID_INDEX);
2341
2342        sp = pvec->page[0].sp;
2343        level = sp->role.level;
2344        WARN_ON(level == PT_PAGE_TABLE_LEVEL);
2345
2346        parents->parent[level-2] = sp;
2347
2348        /* Also set up a sentinel.  Further entries in pvec are all
2349         * children of sp, so this element is never overwritten.
2350         */
2351        parents->parent[level-1] = NULL;
2352        return mmu_pages_next(pvec, parents, 0);
2353}
2354
2355static void mmu_pages_clear_parents(struct mmu_page_path *parents)
2356{
2357        struct kvm_mmu_page *sp;
2358        unsigned int level = 0;
2359
2360        do {
2361                unsigned int idx = parents->idx[level];
2362                sp = parents->parent[level];
2363                if (!sp)
2364                        return;
2365
2366                WARN_ON(idx == INVALID_INDEX);
2367                clear_unsync_child_bit(sp, idx);
2368                level++;
2369        } while (!sp->unsync_children);
2370}
2371
2372static void mmu_sync_children(struct kvm_vcpu *vcpu,
2373                              struct kvm_mmu_page *parent)
2374{
2375        int i;
2376        struct kvm_mmu_page *sp;
2377        struct mmu_page_path parents;
2378        struct kvm_mmu_pages pages;
2379        LIST_HEAD(invalid_list);
2380        bool flush = false;
2381
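            /*
             * Repeatedly gather the unsync descendants of @parent, write
             * protect their gfns and sync each page, zapping pages that
             * fail to sync.  mmu_lock may be dropped via
             * cond_resched_lock(), so the walk restarts from scratch on
             * every pass.
             */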
2382        while (mmu_unsync_walk(parent, &pages)) {
2383                bool protected = false;
2384
2385                for_each_sp(pages, sp, parents, i)
2386                        protected |= rmap_write_protect(vcpu, sp->gfn);
2387
2388                if (protected) {
2389                        kvm_flush_remote_tlbs(vcpu->kvm);
2390                        flush = false;
2391                }
2392
2393                for_each_sp(pages, sp, parents, i) {
2394                        flush |= kvm_sync_page(vcpu, sp, &invalid_list);
2395                        mmu_pages_clear_parents(&parents);
2396                }
2397                if (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock)) {
2398                        kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
2399                        cond_resched_lock(&vcpu->kvm->mmu_lock);
2400                        flush = false;
2401                }
2402        }
2403
2404        kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
2405}
2406
2407static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
2408{
2409        atomic_set(&sp->write_flooding_count, 0);
2410}
2411
2412static void clear_sp_write_flooding_count(u64 *spte)
2413{
2414        struct kvm_mmu_page *sp = page_header(__pa(spte));
2415
2416        __clear_sp_write_flooding_count(sp);
2417}
2418
2419static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
2420                                             gfn_t gfn,
2421                                             gva_t gaddr,
2422                                             unsigned level,
2423                                             int direct,
2424                                             unsigned access)
2425{
2426        union kvm_mmu_page_role role;
2427        unsigned quadrant;
2428        struct kvm_mmu_page *sp;
2429        bool need_sync = false;
2430        bool flush = false;
2431        int collisions = 0;
2432        LIST_HEAD(invalid_list);
2433
2434        role = vcpu->arch.mmu->mmu_role.base;
2435        role.level = level;
2436        role.direct = direct;
2437        if (role.direct)
2438                role.gpte_is_8_bytes = true;
2439        role.access = access;
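            /*
             * A 32-bit guest page table holds 1024 4-byte entries while a
             * shadow page holds only 512 8-byte entries, so each guest
             * table is shadowed by several pages.  The quadrant records
             * which portion of the guest table this shadow page covers:
             * 2 halves at level 1, 4 quarters at the level-2 root.
             */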
2440        if (!vcpu->arch.mmu->direct_map
2441            && vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL) {
2442                quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
2443                quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
2444                role.quadrant = quadrant;
2445        }
2446        for_each_valid_sp(vcpu->kvm, sp, gfn) {
2447                if (sp->gfn != gfn) {
2448                        collisions++;
2449                        continue;
2450                }
2451
2452                if (!need_sync && sp->unsync)
2453                        need_sync = true;
2454
2455                if (sp->role.word != role.word)
2456                        continue;
2457
2458                if (sp->unsync) {
2459                        /* The page is good, but __kvm_sync_page might still end
2460                         * up zapping it.  If so, break in order to rebuild it.
2461                         */
2462                        if (!__kvm_sync_page(vcpu, sp, &invalid_list))
2463                                break;
2464
2465                        WARN_ON(!list_empty(&invalid_list));
2466                        kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
2467                }
2468
2469                if (sp->unsync_children)
2470                        kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
2471
2472                __clear_sp_write_flooding_count(sp);
2473                trace_kvm_mmu_get_page(sp, false);
2474                goto out;
2475        }
2476
2477        ++vcpu->kvm->stat.mmu_cache_miss;
2478
2479        sp = kvm_mmu_alloc_page(vcpu, direct);
2480
2481        sp->gfn = gfn;
2482        sp->role = role;
2483        hlist_add_head(&sp->hash_link,
2484                &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
2485        if (!direct) {
2486                /*
2487                 * We should do write protection before syncing pages,
2488                 * otherwise the content of the synced shadow page may
2489                 * be inconsistent with the guest page table.
2490                 */
2491                account_shadowed(vcpu->kvm, sp);
2492                if (level == PT_PAGE_TABLE_LEVEL &&
2493                      rmap_write_protect(vcpu, gfn))
2494                        kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1);
2495
2496                if (level > PT_PAGE_TABLE_LEVEL && need_sync)
2497                        flush |= kvm_sync_pages(vcpu, gfn, &invalid_list);
2498        }
2499        clear_page(sp->spt);
2500        trace_kvm_mmu_get_page(sp, true);
2501
2502        kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
2503out:
2504        if (collisions > vcpu->kvm->stat.max_mmu_page_hash_collisions)
2505                vcpu->kvm->stat.max_mmu_page_hash_collisions = collisions;
2506        return sp;
2507}
2508
2509static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator,
2510                                        struct kvm_vcpu *vcpu, hpa_t root,
2511                                        u64 addr)
2512{
2513        iterator->addr = addr;
2514        iterator->shadow_addr = root;
2515        iterator->level = vcpu->arch.mmu->shadow_root_level;
2516
2517        if (iterator->level == PT64_ROOT_4LEVEL &&
2518            vcpu->arch.mmu->root_level < PT64_ROOT_4LEVEL &&
2519            !vcpu->arch.mmu->direct_map)
2520                --iterator->level;
2521
2522        if (iterator->level == PT32E_ROOT_LEVEL) {
2523                /*
2524                 * prev_root is currently only used for 64-bit hosts. So only
2525                 * the active root_hpa is valid here.
2526                 */
2527                BUG_ON(root != vcpu->arch.mmu->root_hpa);
2528
2529                iterator->shadow_addr
2530                        = vcpu->arch.mmu->pae_root[(addr >> 30) & 3];
2531                iterator->shadow_addr &= PT64_BASE_ADDR_MASK;
2532                --iterator->level;
2533                if (!iterator->shadow_addr)
2534                        iterator->level = 0;
2535        }
2536}
2537
2538static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
2539                             struct kvm_vcpu *vcpu, u64 addr)
2540{
2541        shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu->root_hpa,
2542                                    addr);
2543}
2544
2545static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
2546{
2547        if (iterator->level < PT_PAGE_TABLE_LEVEL)
2548                return false;
2549
2550        iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
2551        iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
2552        return true;
2553}
2554
2555static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
2556                               u64 spte)
2557{
2558        if (is_last_spte(spte, iterator->level)) {
2559                iterator->level = 0;
2560                return;
2561        }
2562
2563        iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK;
2564        --iterator->level;
2565}
2566
2567static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
2568{
2569        __shadow_walk_next(iterator, *iterator->sptep);
2570}
2571
2572static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
2573                             struct kvm_mmu_page *sp)
2574{
2575        u64 spte;
2576
2577        BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
2578
2579        spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK |
2580               shadow_user_mask | shadow_x_mask | shadow_me_mask;
2581
2582        if (sp_ad_disabled(sp))
2583                spte |= shadow_acc_track_value;
2584        else
2585                spte |= shadow_accessed_mask;
2586
2587        mmu_spte_set(sptep, spte);
2588
2589        mmu_page_add_parent_pte(vcpu, sp, sptep);
2590
2591        if (sp->unsync_children || sp->unsync)
2592                mark_unsync(sptep);
2593}
2594
2595static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2596                                   unsigned direct_access)
2597{
2598        if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
2599                struct kvm_mmu_page *child;
2600
2601                /*
2602                 * For the direct sp, if the guest pte's dirty bit
2603                 * changed from clean to dirty, it will corrupt the
2604                 * sp's access: writes would be allowed in the read-only
2605                 * sp, so we should update the spte at this point to get
2606                 * a new sp with the correct access.
2607                 */
2608                child = page_header(*sptep & PT64_BASE_ADDR_MASK);
2609                if (child->role.access == direct_access)
2610                        return;
2611
2612                drop_parent_pte(child, sptep);
2613                kvm_flush_remote_tlbs_with_address(vcpu->kvm, child->gfn, 1);
2614        }
2615}
2616
2617static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
2618                             u64 *spte)
2619{
2620        u64 pte;
2621        struct kvm_mmu_page *child;
2622
2623        pte = *spte;
2624        if (is_shadow_present_pte(pte)) {
2625                if (is_last_spte(pte, sp->role.level)) {
2626                        drop_spte(kvm, spte);
2627                        if (is_large_pte(pte))
2628                                --kvm->stat.lpages;
2629                } else {
2630                        child = page_header(pte & PT64_BASE_ADDR_MASK);
2631                        drop_parent_pte(child, spte);
2632                }
2633                return true;
2634        }
2635
2636        if (is_mmio_spte(pte))
2637                mmu_spte_clear_no_track(spte);
2638
2639        return false;
2640}
2641
2642static void kvm_mmu_page_unlink_children(struct kvm *kvm,
2643                                         struct kvm_mmu_page *sp)
2644{
2645        unsigned i;
2646
2647        for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
2648                mmu_page_zap_pte(kvm, sp, sp->spt + i);
2649}
2650
2651static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
2652{
2653        u64 *sptep;
2654        struct rmap_iterator iter;
2655
2656        while ((sptep = rmap_get_first(&sp->parent_ptes, &iter)))
2657                drop_parent_pte(sp, sptep);
2658}
2659
2660static int mmu_zap_unsync_children(struct kvm *kvm,
2661                                   struct kvm_mmu_page *parent,
2662                                   struct list_head *invalid_list)
2663{
2664        int i, zapped = 0;
2665        struct mmu_page_path parents;
2666        struct kvm_mmu_pages pages;
2667
2668        if (parent->role.level == PT_PAGE_TABLE_LEVEL)
2669                return 0;
2670
2671        while (mmu_unsync_walk(parent, &pages)) {
2672                struct kvm_mmu_page *sp;
2673
2674                for_each_sp(pages, sp, parents, i) {
2675                        kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
2676                        mmu_pages_clear_parents(&parents);
2677                        zapped++;
2678                }
2679        }
2680
2681        return zapped;
2682}
2683
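    /*
     * Zapping is done in two phases: kvm_mmu_prepare_zap_page() (called
     * under mmu_lock) zaps the page's children, unlinks it and queues it
     * on @invalid_list; kvm_mmu_commit_zap_page() then performs a single
     * remote TLB flush and frees every page on the list.
     */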
2684static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
2685                                       struct kvm_mmu_page *sp,
2686                                       struct list_head *invalid_list,
2687                                       int *nr_zapped)
2688{
2689        bool list_unstable;
2690
2691        trace_kvm_mmu_prepare_zap_page(sp);
2692        ++kvm->stat.mmu_shadow_zapped;
2693        *nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list);
2694        kvm_mmu_page_unlink_children(kvm, sp);
2695        kvm_mmu_unlink_parents(kvm, sp);
2696
2697        /* Zapping children means active_mmu_pages has become unstable. */
2698        list_unstable = *nr_zapped;
2699
2700        if (!sp->role.invalid && !sp->role.direct)
2701                unaccount_shadowed(kvm, sp);
2702
2703        if (sp->unsync)
2704                kvm_unlink_unsync_page(kvm, sp);
2705        if (!sp->root_count) {
2706                /* Count self */
2707                (*nr_zapped)++;
2708                list_move(&sp->link, invalid_list);
2709                kvm_mod_used_mmu_pages(kvm, -1);
2710        } else {
2711                list_move(&sp->link, &kvm->arch.active_mmu_pages);
2712
2713                if (!sp->role.invalid)
2714                        kvm_reload_remote_mmus(kvm);
2715        }
2716
2717        sp->role.invalid = 1;
2718        return list_unstable;
2719}
2720
2721static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
2722                                     struct list_head *invalid_list)
2723{
2724        int nr_zapped;
2725
2726        __kvm_mmu_prepare_zap_page(kvm, sp, invalid_list, &nr_zapped);
2727        return nr_zapped;
2728}
2729
2730static void kvm_mmu_commit_zap_page(struct kvm *kvm,
2731                                    struct list_head *invalid_list)
2732{
2733        struct kvm_mmu_page *sp, *nsp;
2734
2735        if (list_empty(invalid_list))
2736                return;
2737
2738        /*
2739         * We need to make sure everyone sees our modifications to
2740         * the page tables and sees changes to vcpu->mode here. The barrier
2741         * in the kvm_flush_remote_tlbs() achieves this. This pairs
2742         * with vcpu_enter_guest and walk_shadow_page_lockless_begin/end.
2743         *
2744         * In addition, kvm_flush_remote_tlbs waits for all vcpus to exit
2745         * guest mode and/or lockless shadow page table walks.
2746         */
2747        kvm_flush_remote_tlbs(kvm);
2748
2749        list_for_each_entry_safe(sp, nsp, invalid_list, link) {
2750                WARN_ON(!sp->role.invalid || sp->root_count);
2751                kvm_mmu_free_page(sp);
2752        }
2753}
2754
2755static bool prepare_zap_oldest_mmu_page(struct kvm *kvm,
2756                                        struct list_head *invalid_list)
2757{
2758        struct kvm_mmu_page *sp;
2759
2760        if (list_empty(&kvm->arch.active_mmu_pages))
2761                return false;
2762
2763        sp = list_last_entry(&kvm->arch.active_mmu_pages,
2764                             struct kvm_mmu_page, link);
2765        return kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
2766}
2767
2768/*
2769 * Change the number of mmu pages allocated to the vm.
2770 * Note: if goal_nr_mmu_pages is too small, you will get a deadlock.
2771 */
2772void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages)
2773{
2774        LIST_HEAD(invalid_list);
2775
2776        spin_lock(&kvm->mmu_lock);
2777
2778        if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
2779                /* Need to free some mmu pages to achieve the goal. */
2780                while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages)
2781                        if (!prepare_zap_oldest_mmu_page(kvm, &invalid_list))
2782                                break;
2783
2784                kvm_mmu_commit_zap_page(kvm, &invalid_list);
2785                goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
2786        }
2787
2788        kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
2789
2790        spin_unlock(&kvm->mmu_lock);
2791}
2792
2793int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
2794{
2795        struct kvm_mmu_page *sp;
2796        LIST_HEAD(invalid_list);
2797        int r;
2798
2799        pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
2800        r = 0;
2801        spin_lock(&kvm->mmu_lock);
2802        for_each_gfn_indirect_valid_sp(kvm, sp, gfn) {
2803                pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
2804                         sp->role.word);
2805                r = 1;
2806                kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
2807        }
2808        kvm_mmu_commit_zap_page(kvm, &invalid_list);
2809        spin_unlock(&kvm->mmu_lock);
2810
2811        return r;
2812}
2813EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
2814
2815static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
2816{
2817        trace_kvm_mmu_unsync_page(sp);
2818        ++vcpu->kvm->stat.mmu_unsync;
2819        sp->unsync = 1;
2820
2821        kvm_mmu_mark_parents_unsync(sp);
2822}
2823
2824static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
2825                                   bool can_unsync)
2826{
2827        struct kvm_mmu_page *sp;
2828
2829        if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
2830                return true;
2831
2832        for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
2833                if (!can_unsync)
2834                        return true;
2835
2836                if (sp->unsync)
2837                        continue;
2838
2839                WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL);
2840                kvm_unsync_page(vcpu, sp);
2841        }
2842
2843        /*
2844         * We need to ensure that the marking of unsync pages is visible
2845         * before the SPTE is updated to allow writes because
2846         * kvm_mmu_sync_roots() checks the unsync flags without holding
2847         * the MMU lock and so can race with this. If the SPTE was updated
2848         * before the page had been marked as unsync-ed, something like the
2849         * following could happen:
2850         *
2851         * CPU 1                    CPU 2
2852         * ---------------------------------------------------------------------
2853         * 1.2 Host updates SPTE
2854         *     to be writable
2855         *                      2.1 Guest writes a GPTE for GVA X.
2856         *                          (GPTE being in the guest page table shadowed
2857         *                           by the SP from CPU 1.)
2858         *                          This reads SPTE during the page table walk.
2859         *                          Since SPTE.W is read as 1, there is no
2860         *                          fault.
2861         *
2862         *                      2.2 Guest issues TLB flush.
2863         *                          That causes a VM Exit.
2864         *
2865         *                      2.3 kvm_mmu_sync_pages() reads sp->unsync.
2866         *                          Since it is false, it just returns.
2867         *
2868         *                      2.4 Guest accesses GVA X.
2869         *                          Since the mapping in the SP was not updated,
2870         *                          the old mapping for GVA X is incorrectly
2871         *                          used.
2872         * 1.1 Host marks SP
2873         *     as unsync
2874         *     (sp->unsync = true)
2875         *
2876         * The write barrier below ensures that 1.1 happens before 1.2 and thus
2877         * the situation in 2.4 does not arise. The implicit barrier in 2.2
2878         * pairs with this write barrier.
2879         */
2880        smp_wmb();
2881
2882        return false;
2883}
2884
2885static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
2886{
2887        if (pfn_valid(pfn))
2888                return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)) &&
2889                        /*
2890                         * Some reserved pages, such as those from NVDIMM
2891                         * DAX devices, are not for MMIO, and can be mapped
2892                         * with cached memory type for better performance.
2893                         * However, the above check misconceives those pages
2894                         * as MMIO, and results in KVM mapping them with UC
2895                         * memory type, which would hurt the performance.
2896                         * Therefore, we check the host memory type in addition
2897                         * and only treat UC/UC-/WC pages as MMIO.
2898                         */
2899                        (!pat_enabled() || pat_pfn_immune_to_uc_mtrr(pfn));
2900
2901        return !e820__mapped_raw_any(pfn_to_hpa(pfn),
2902                                     pfn_to_hpa(pfn + 1) - 1,
2903                                     E820_TYPE_RAM);
2904}
2905
2906/* Bits which may be returned by set_spte() */
2907#define SET_SPTE_WRITE_PROTECTED_PT     BIT(0)
2908#define SET_SPTE_NEED_REMOTE_TLB_FLUSH  BIT(1)
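    /*
     * SET_SPTE_WRITE_PROTECTED_PT: the gfn must stay write-protected
     * (e.g. it is write-tracked), so a faulting write has to be emulated.
     * SET_SPTE_NEED_REMOTE_TLB_FLUSH: the spte update requires a remote
     * TLB flush (handled in mmu_set_spte()).
     */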
2909
2910static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2911                    unsigned pte_access, int level,
2912                    gfn_t gfn, kvm_pfn_t pfn, bool speculative,
2913                    bool can_unsync, bool host_writable)
2914{
2915        u64 spte = 0;
2916        int ret = 0;
2917        struct kvm_mmu_page *sp;
2918
2919        if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access))
2920                return 0;
2921
2922        sp = page_header(__pa(sptep));
2923        if (sp_ad_disabled(sp))
2924                spte |= shadow_acc_track_value;
2925
2926        /*
2927         * For the EPT case, shadow_present_mask is 0 if hardware
2928         * supports exec-only page table entries.  In that case,
2929         * ACC_USER_MASK and shadow_user_mask are used to represent
2930         * read access.  See FNAME(gpte_access) in paging_tmpl.h.
2931         */
2932        spte |= shadow_present_mask;
2933        if (!speculative)
2934                spte |= spte_shadow_accessed_mask(spte);
2935
2936        if (pte_access & ACC_EXEC_MASK)
2937                spte |= shadow_x_mask;
2938        else
2939                spte |= shadow_nx_mask;
2940
2941        if (pte_access & ACC_USER_MASK)
2942                spte |= shadow_user_mask;
2943
2944        if (level > PT_PAGE_TABLE_LEVEL)
2945                spte |= PT_PAGE_SIZE_MASK;
2946        if (tdp_enabled)
2947                spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
2948                        kvm_is_mmio_pfn(pfn));
2949
2950        if (host_writable)
2951                spte |= SPTE_HOST_WRITEABLE;
2952        else
2953                pte_access &= ~ACC_WRITE_MASK;
2954
2955        if (!kvm_is_mmio_pfn(pfn))
2956                spte |= shadow_me_mask;
2957
2958        spte |= (u64)pfn << PAGE_SHIFT;
2959
2960        if (pte_access & ACC_WRITE_MASK) {
2961
2962                /*
2963                 * Another vcpu may create a new sp in the window between
2964                 * mapping_level() and acquiring mmu-lock. We can
2965                 * allow the guest to retry the access; the mapping can
2966                 * be fixed if the guest refaults.
2967                 */
2968                if (level > PT_PAGE_TABLE_LEVEL &&
2969                    mmu_gfn_lpage_is_disallowed(vcpu, gfn, level))
2970                        goto done;
2971
2972                spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
2973
2974                /*
2975                 * Optimization: for pte sync, if spte was writable the hash
2976                 * lookup is unnecessary (and expensive). Write protection
2977                 * is the responsibility of mmu_get_page / kvm_sync_page.
2978                 * Same reasoning can be applied to dirty page accounting.
2979                 */
2980                if (!can_unsync && is_writable_pte(*sptep))
2981                        goto set_pte;
2982
2983                if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
2984                        pgprintk("%s: found shadow page for %llx, marking ro\n",
2985                                 __func__, gfn);
2986                        ret |= SET_SPTE_WRITE_PROTECTED_PT;
2987                        pte_access &= ~ACC_WRITE_MASK;
2988                        spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
2989                }
2990        }
2991
2992        if (pte_access & ACC_WRITE_MASK) {
2993                kvm_vcpu_mark_page_dirty(vcpu, gfn);
2994                spte |= spte_shadow_dirty_mask(spte);
2995        }
2996
2997        if (speculative)
2998                spte = mark_spte_for_access_track(spte);
2999
3000set_pte:
3001        if (mmu_spte_update(sptep, spte))
3002                ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH;
3003done:
3004        return ret;
3005}
3006
3007static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
3008                        int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn,
3009                        bool speculative, bool host_writable)
3010{
3011        int was_rmapped = 0;
3012        int rmap_count;
3013        int set_spte_ret;
3014        int ret = RET_PF_RETRY;
3015        bool flush = false;
3016
3017        pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
3018                 *sptep, write_fault, gfn);
3019
3020        if (is_shadow_present_pte(*sptep)) {
3021                /*
3022                 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
3023                 * the parent of the now unreachable PTE.
3024                 */
3025                if (level > PT_PAGE_TABLE_LEVEL &&
3026                    !is_large_pte(*sptep)) {
3027                        struct kvm_mmu_page *child;
3028                        u64 pte = *sptep;
3029
3030                        child = page_header(pte & PT64_BASE_ADDR_MASK);
3031                        drop_parent_pte(child, sptep);
3032                        flush = true;
3033                } else if (pfn != spte_to_pfn(*sptep)) {
3034                        pgprintk("hfn old %llx new %llx\n",
3035                                 spte_to_pfn(*sptep), pfn);
3036                        drop_spte(vcpu->kvm, sptep);
3037                        flush = true;
3038                } else
3039                        was_rmapped = 1;
3040        }
3041
3042        set_spte_ret = set_spte(vcpu, sptep, pte_access, level, gfn, pfn,
3043                                speculative, true, host_writable);
3044        if (set_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
3045                if (write_fault)
3046                        ret = RET_PF_EMULATE;
3047                kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
3048        }
3049
3050        if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH || flush)
3051                kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn,
3052                                KVM_PAGES_PER_HPAGE(level));
3053
3054        if (unlikely(is_mmio_spte(*sptep)))
3055                ret = RET_PF_EMULATE;
3056
3057        pgprintk("%s: setting spte %llx\n", __func__, *sptep);
3058        pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
3059                 is_large_pte(*sptep) ? "2MB" : "4kB",
3060                 *sptep & PT_WRITABLE_MASK ? "RW" : "R", gfn,
3061                 *sptep, sptep);
3062        if (!was_rmapped && is_large_pte(*sptep))
3063                ++vcpu->kvm->stat.lpages;
3064
3065        if (is_shadow_present_pte(*sptep)) {
3066                if (!was_rmapped) {
3067                        rmap_count = rmap_add(vcpu, sptep, gfn);
3068                        if (rmap_count > RMAP_RECYCLE_THRESHOLD)
3069                                rmap_recycle(vcpu, sptep, gfn);
3070                }
3071        }
3072
3073        kvm_release_pfn_clean(pfn);
3074
3075        return ret;
3076}
3077
3078static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
3079                                     bool no_dirty_log)
3080{
3081        struct kvm_memory_slot *slot;
3082
3083        slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
3084        if (!slot)
3085                return KVM_PFN_ERR_FAULT;
3086
3087        return gfn_to_pfn_memslot_atomic(slot, gfn);
3088}
3089
3090static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
3091                                    struct kvm_mmu_page *sp,
3092                                    u64 *start, u64 *end)
3093{
3094        struct page *pages[PTE_PREFETCH_NUM];
3095        struct kvm_memory_slot *slot;
3096        unsigned access = sp->role.access;
3097        int i, ret;
3098        gfn_t gfn;
3099
3100        gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
3101        slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK);
3102        if (!slot)
3103                return -1;
3104
3105        ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start);
3106        if (ret <= 0)
3107                return -1;
3108
3109        for (i = 0; i < ret; i++, gfn++, start++)
3110                mmu_set_spte(vcpu, start, access, 0, sp->role.level, gfn,
3111                             page_to_pfn(pages[i]), true, true);
3112
3113        return 0;
3114}
3115
3116static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
3117                                  struct kvm_mmu_page *sp, u64 *sptep)
3118{
3119        u64 *spte, *start = NULL;
3120        int i;
3121
3122        WARN_ON(!sp->role.direct);
3123
3124        i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
3125        spte = sp->spt + i;
3126
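            /*
             * Scan the PTE_PREFETCH_NUM-aligned window containing @sptep:
             * runs of not-present sptes are batched into
             * direct_pte_prefetch_many(), and a present spte (or @sptep
             * itself) terminates the current run.
             */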
3127        for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
3128                if (is_shadow_present_pte(*spte) || spte == sptep) {
3129                        if (!start)
3130                                continue;
3131                        if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
3132                                break;
3133                        start = NULL;
3134                } else if (!start)
3135                        start = spte;
3136        }
3137}
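
    /*
     * Worked example of the window math above (illustrative only, with
     * PTE_PREFETCH_NUM == 8): if sptep is entry 203 of sp->spt, then
     *
     *   i = 203 & ~7 = 200
     *
     * so entries 200..207 are scanned, and each run of not-present sptes
     * that ends at a present entry or at sptep itself is handed to
     * direct_pte_prefetch_many() as a [start, end) range.
     */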
3138
3139static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
3140{
3141        struct kvm_mmu_page *sp;
3142
3143        sp = page_header(__pa(sptep));
3144
3145        /*
3146         * Without accessed bits, there's no way to distinguish between
3147         * actually accessed translations and prefetched ones, so disable
3148         * pte prefetch if accessed bits aren't available.
3149         */
3150        if (sp_ad_disabled(sp))
3151                return;
3152
3153        if (sp->role.level > PT_PAGE_TABLE_LEVEL)
3154                return;
3155
3156        __direct_pte_prefetch(vcpu, sp, sptep);
3157}
3158
3159static int __direct_map(struct kvm_vcpu *vcpu, int write, int map_writable,
3160                        int level, gfn_t gfn, kvm_pfn_t pfn, bool prefault)
3161{
3162        struct kvm_shadow_walk_iterator iterator;
3163        struct kvm_mmu_page *sp;
3164        int emulate = 0;
3165        gfn_t pseudo_gfn;
3166
3167        if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
3168                return 0;
3169
3170        for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
3171                if (iterator.level == level) {
3172                        emulate = mmu_set_spte(vcpu, iterator.sptep, ACC_ALL,
3173                                               write, level, gfn, pfn, prefault,
3174                                               map_writable);
3175                        direct_pte_prefetch(vcpu, iterator.sptep);
3176                        ++vcpu->stat.pf_fixed;
3177                        break;
3178                }
3179
3180                drop_large_spte(vcpu, iterator.sptep);
3181                if (!is_shadow_present_pte(*iterator.sptep)) {
3182                        u64 base_addr = iterator.addr;
3183
3184                        base_addr &= PT64_LVL_ADDR_MASK(iterator.level);
3185                        pseudo_gfn = base_addr >> PAGE_SHIFT;
3186                        sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,
3187                                              iterator.level - 1, 1, ACC_ALL);
3188
3189                        link_shadow_page(vcpu, iterator.sptep, sp);
3190                }
3191        }
3192        return emulate;
3193}
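
    /*
     * Worked example (illustrative numbers only): mapping gfn 0x12345 at
     * the 4K level with a 4-level shadow walk visits levels 4, 3 and 2,
     * allocating a direct shadow page for any non-present entry.  At
     * level 2, for instance:
     *
     *   iterator.addr = 0x12345000
     *   base_addr     = addr & PT64_LVL_ADDR_MASK(2) = 0x12200000
     *   pseudo_gfn    = base_addr >> PAGE_SHIFT      = 0x12200
     *
     * i.e. the pseudo gfn is the first gfn covered by that intermediate
     * table.  The leaf spte itself is installed by mmu_set_spte() once
     * iterator.level == level.
     */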
3194
3195static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
3196{
3197        send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, PAGE_SHIFT, tsk);
3198}
3199
3200static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
3201{
3202        /*
3203         * Do not cache the mmio info caused by writing the readonly gfn
3204         * into the spte, otherwise a read access to the readonly gfn will
3205         * also cause an mmio page fault and be treated as an mmio access.
3206         */
3207        if (pfn == KVM_PFN_ERR_RO_FAULT)
3208                return RET_PF_EMULATE;
3209
3210        if (pfn == KVM_PFN_ERR_HWPOISON) {
3211                kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current);
3212                return RET_PF_RETRY;
3213        }
3214
3215        return -EFAULT;
3216}
3217
3218static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
3219                                        gfn_t *gfnp, kvm_pfn_t *pfnp,
3220                                        int *levelp)
3221{
3222        kvm_pfn_t pfn = *pfnp;
3223        gfn_t gfn = *gfnp;
3224        int level = *levelp;
3225
3226        /*
3227         * Check if it's a transparent hugepage. If this were a
3228         * hugetlbfs page, level would not have been set to
3229         * PT_PAGE_TABLE_LEVEL and no adjustment would be done
3230         * here.
3231         */
3232        if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
3233            level == PT_PAGE_TABLE_LEVEL &&
3234            PageTransCompoundMap(pfn_to_page(pfn)) &&
3235            !mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) {
3236                unsigned long mask;
3237                /*
3238                 * mmu_notifier_retry() was successful and we hold the
3239                 * mmu_lock here, so the pmd can't be split under us,
3240                 * and in turn __split_huge_page_refcount() can't run
3241                 * under us either; we can therefore safely transfer the
3242                 * refcount from the tail page to the head page as we
3243                 * switch the pfn from tail to head.
3245                 */
3246                *levelp = level = PT_DIRECTORY_LEVEL;
3247                mask = KVM_PAGES_PER_HPAGE(level) - 1;
3248                VM_BUG_ON((gfn & mask) != (pfn & mask));
3249                if (pfn & mask) {
3250                        gfn &= ~mask;
3251                        *gfnp = gfn;
3252                        kvm_release_pfn_clean(pfn);
3253                        pfn &= ~mask;
3254                        kvm_get_pfn(pfn);
3255                        *pfnp = pfn;
3256                }
3257        }
3258}
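
    /*
     * Worked example (illustrative numbers only): for 2MB pages,
     * KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL) is 512, so mask == 0x1ff.
     * With gfn 0x12345 backed by pfn 0xabd45:
     *
     *   gfn & mask == pfn & mask == 0x145   (the VM_BUG_ON invariant)
     *   gfn &= ~mask  ->  0x12200
     *   pfn &= ~mask  ->  0xabc00
     *
     * so the caller can install a single large spte covering the whole
     * 2MB-aligned range, starting at the head page of the compound page.
     */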
3259
3260static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
3261                                kvm_pfn_t pfn, unsigned access, int *ret_val)
3262{
3263        /* The pfn is invalid, report the error! */
3264        if (unlikely(is_error_pfn(pfn))) {
3265                *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn);
3266                return true;
3267        }
3268
3269        if (unlikely(is_noslot_pfn(pfn)))
3270                vcpu_cache_mmio_info(vcpu, gva, gfn, access);
3271
3272        return false;
3273}
3274
3275static bool page_fault_can_be_fast(u32 error_code)
3276{
3277        /*
3278         * Do not fix an mmio spte with an invalid generation number;
3279         * it needs to be updated by the slow page fault path.
3280         */
3281        if (unlikely(error_code & PFERR_RSVD_MASK))
3282                return false;
3283
3284        /* See if the page fault is due to an NX violation */
3285        if (unlikely(((error_code & (PFERR_FETCH_MASK | PFERR_PRESENT_MASK))
3286                      == (PFERR_FETCH_MASK | PFERR_PRESENT_MASK))))
3287                return false;
3288
3289        /*
3290         * #PF can be fast if:
3291         * 1. The shadow page table entry is not present, which could mean that
3292         *    the fault is potentially caused by access tracking (if enabled).
3293         * 2. The shadow page table entry is present and the fault is
3294         *    caused by write-protection; in that case we only need to set
3295         *    the W bit of the spte, which can be done outside of mmu_lock.
3296         *
3297         * However, if access tracking is disabled we know that a fault on a
3298         * non-present page must be a genuine page fault where we have to
3299         * create a new SPTE.
3299         * So, if access tracking is disabled, we return true only for write
3300         * accesses to a present page.
3301         */
3302
3303        return shadow_acc_track_mask != 0 ||
3304               ((error_code & (PFERR_WRITE_MASK | PFERR_PRESENT_MASK))
3305                == (PFERR_WRITE_MASK | PFERR_PRESENT_MASK));
3306}
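
    /*
     * For illustration, with access tracking disabled
     * (shadow_acc_track_mask == 0):
     *
     *   PFERR_PRESENT_MASK | PFERR_WRITE_MASK
     *       -> true,  a possible write-protect fault, try the fast path
     *   PFERR_WRITE_MASK alone (page not present)
     *       -> false, a new spte must be created under mmu_lock
     *   PFERR_PRESENT_MASK | PFERR_FETCH_MASK
     *       -> false, an NX violation is never fixed locklessly
     */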
3307
3308/*
3309 * Returns true if the SPTE was fixed successfully. Otherwise,
3310 * someone else modified the SPTE from its original value.
3311 */
3312static bool
3313fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
3314                        u64 *sptep, u64 old_spte, u64 new_spte)
3315{
3316        gfn_t gfn;
3317
3318        WARN_ON(!sp->role.direct);
3319
3320        /*
3321         * Theoretically we could also set dirty bit (and flush TLB) here in
3322         * order to eliminate unnecessary PML logging. See comments in
3323         * set_spte. But fast_page_fault is very unlikely to happen with PML
3324         * enabled, so we do not do this. This might result in the same GPA
3325         * being logged in the PML buffer again when the write really happens,
3326         * and eventually being passed to mark_page_dirty twice; but that is
3327         * harmless. Skipping it also avoids the TLB flush needed after
3328         * setting the dirty bit, so non-PML cases won't be impacted.
3329         *
3330         * Compare with set_spte where instead shadow_dirty_mask is set.
3331         */
3332        if (cmpxchg64(sptep, old_spte, new_spte) != old_spte)
3333                return false;
3334
3335        if (is_writable_pte(new_spte) && !is_writable_pte(old_spte)) {
3336                /*
3337                 * The gfn of direct spte is stable since it is
3338                 * calculated by sp->gfn.
3339                 */
3340                gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
3341                kvm_vcpu_mark_page_dirty(vcpu, gfn);
3342        }
3343
3344        return true;
3345}
3346
3347static bool is_access_allowed(u32 fault_err_code, u64 spte)
3348{
3349        if (fault_err_code & PFERR_FETCH_MASK)
3350                return is_executable_pte(spte);
3351
3352        if (fault_err_code & PFERR_WRITE_MASK)
3353                return is_writable_pte(spte);
3354
3355        /* Fault was on Read access */
3356        return spte & PT_PRESENT_MASK;
3357}
3358
3359/*
3360 * Return value:
3361 * - true: let the vcpu access the same address again.
3362 * - false: let the real page fault path fix it.
3363 */
3364static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
3365                            u32 error_code)
3366{
3367        struct kvm_shadow_walk_iterator iterator;
3368        struct kvm_mmu_page *sp;
3369        bool fault_handled = false;
3370        u64 spte = 0ull;
3371        uint retry_count = 0;
3372
3373        if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
3374                return false;
3375
3376        if (!page_fault_can_be_fast(error_code))
3377                return false;
3378
3379        walk_shadow_page_lockless_begin(vcpu);
3380
3381        do {
3382                u64 new_spte;
3383
3384                for_each_shadow_entry_lockless(vcpu, gva, iterator, spte)
3385                        if (!is_shadow_present_pte(spte) ||
3386                            iterator.level < level)
3387                                break;
3388
3389                sp = page_header(__pa(iterator.sptep));
3390                if (!is_last_spte(spte, sp->role.level))
3391                        break;
3392
3393                /*
3394                 * Check whether the memory access that caused the fault would
3395                 * still cause it if it were to be performed right now. If not,
3396                 * then this is a spurious fault caused by a lazily flushed TLB
3397                 * entry, or some other CPU has already fixed the PTE after the
3398                 * current CPU took the fault.
3399                 *
3400                 * There is no need to check the access of upper level table
3401                 * entries since they are always ACC_ALL.
3402                 */
3403                if (is_access_allowed(error_code, spte)) {
3404                        fault_handled = true;
3405                        break;
3406                }
3407
3408                new_spte = spte;
3409
3410                if (is_access_track_spte(spte))
3411                        new_spte = restore_acc_track_spte(new_spte);
3412
3413                /*
3414                 * Currently, to simplify the code, write-protection can
3415                 * be removed in the fast path only if the SPTE was
3416                 * write-protected for dirty-logging or access tracking.
3417                 */
3418                if ((error_code & PFERR_WRITE_MASK) &&
3419                    spte_can_locklessly_be_made_writable(spte))
3420                {
3421                        new_spte |= PT_WRITABLE_MASK;
3422
3423                        /*
3424                         * Do not fix write permission on a large spte.  Since
3425                         * we only mark the first page dirty in the dirty bitmap
3426                         * in fast_pf_fix_direct_spte(), the other pages would be
3427                         * missed if the slot has dirty logging enabled.
3428                         *
3429                         * Instead, we let the slow page fault path create a
3430                         * normal spte to fix the access.
3431                         *
3432                         * See the comments in kvm_arch_commit_memory_region().
3433                         */
3434                        if (sp->role.level > PT_PAGE_TABLE_LEVEL)
3435                                break;
3436                }
3437
3438                /* Verify that the fault can be handled in the fast path */
3439                if (new_spte == spte ||
3440                    !is_access_allowed(error_code, new_spte))
3441                        break;
3442
3443                /*
3444                 * Currently, fast page fault only works for direct mappings,
3445                 * since the gfn is not stable for indirect shadow pages. See
3446                 * Documentation/virtual/kvm/locking.txt for more details.
3447                 */
3448                fault_handled = fast_pf_fix_direct_spte(vcpu, sp,
3449                                                        iterator.sptep, spte,
3450                                                        new_spte);
3451                if (fault_handled)
3452                        break;
3453
3454                if (++retry_count > 4) {
3455                        printk_once(KERN_WARNING
3456                                "kvm: Fast #PF retrying more than 4 times.\n");
3457                        break;
3458                }
3459
3460        } while (true);
3461
3462        trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep,
3463                              spte, fault_handled);
3464        walk_shadow_page_lockless_end(vcpu);
3465
3466        return fault_handled;
3467}
3468
3469static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
3470                         gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable);
3471static int make_mmu_pages_available(struct kvm_vcpu *vcpu);
3472
3473static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
3474                         gfn_t gfn, bool prefault)
3475{
3476        int r;
3477        int level;
3478        bool force_pt_level = false;
3479        kvm_pfn_t pfn;
3480        unsigned long mmu_seq;
3481        bool map_writable, write = error_code & PFERR_WRITE_MASK;
3482
3483        level = mapping_level(vcpu, gfn, &force_pt_level);
3484        if (likely(!force_pt_level)) {
3485                /*
3486                 * This path builds a PAE pagetable, so we can map
3487                 * 2MB pages at most. Therefore cap the mapping level
3488                 * accordingly.
3489                 */
3490                if (level > PT_DIRECTORY_LEVEL)
3491                        level = PT_DIRECTORY_LEVEL;
3492
3493                gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
3494        }
3495
3496        if (fast_page_fault(vcpu, v, level, error_code))
3497                return RET_PF_RETRY;
3498
3499        mmu_seq = vcpu->kvm->mmu_notifier_seq;
3500        smp_rmb();
3501
3502        if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
3503                return RET_PF_RETRY;
3504
3505        if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r))
3506                return r;
3507
3508        spin_lock(&vcpu->kvm->mmu_lock);
3509        if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
3510                goto out_unlock;
3511        if (make_mmu_pages_available(vcpu) < 0)
3512                goto out_unlock;
3513        if (likely(!force_pt_level))
3514                transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
3515        r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault);
3516        spin_unlock(&vcpu->kvm->mmu_lock);
3517
3518        return r;
3519
3520out_unlock:
3521        spin_unlock(&vcpu->kvm->mmu_lock);
3522        kvm_release_pfn_clean(pfn);
3523        return RET_PF_RETRY;
3524}
3525
3526static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
3527                               struct list_head *invalid_list)
3528{
3529        struct kvm_mmu_page *sp;
3530
3531        if (!VALID_PAGE(*root_hpa))
3532                return;
3533
3534        sp = page_header(*root_hpa & PT64_BASE_ADDR_MASK);
3535        --sp->root_count;
3536        if (!sp->root_count && sp->role.invalid)
3537                kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
3538
3539        *root_hpa = INVALID_PAGE;
3540}
3541
3542/* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */
3543void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
3544                        ulong roots_to_free)
3545{
3546        int i;
3547        LIST_HEAD(invalid_list);
3548        bool free_active_root = roots_to_free & KVM_MMU_ROOT_CURRENT;
3549
3550        BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG);
3551
3552        /* Before acquiring the MMU lock, see if we need to do any real work. */
3553        if (!(free_active_root && VALID_PAGE(mmu->root_hpa))) {
3554                for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
3555                        if ((roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) &&
3556                            VALID_PAGE(mmu->prev_roots[i].hpa))
3557                                break;
3558
3559                if (i == KVM_MMU_NUM_PREV_ROOTS)
3560                        return;
3561        }
3562
3563        spin_lock(&vcpu->kvm->mmu_lock);
3564
3565        for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
3566                if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i))
3567                        mmu_free_root_page(vcpu->kvm, &mmu->prev_roots[i].hpa,
3568                                           &invalid_list);
3569
3570        if (free_active_root) {
3571                if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
3572                    (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) {
3573                        mmu_free_root_page(vcpu->kvm, &mmu->root_hpa,
3574                                           &invalid_list);
3575                } else {
3576                        for (i = 0; i < 4; ++i)
3577                                if (mmu->pae_root[i] != 0)
3578                                        mmu_free_root_page(vcpu->kvm,
3579                                                           &mmu->pae_root[i],
3580                                                           &invalid_list);
3581                        mmu->root_hpa = INVALID_PAGE;
3582                }
3583                mmu->root_cr3 = 0;
3584        }
3585
3586        kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
3587        spin_unlock(&vcpu->kvm->mmu_lock);
3588}
3589EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);
3590
3591static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
3592{
3593        int ret = 0;
3594
3595        if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
3596                kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
3597                ret = 1;
3598        }
3599
3600        return ret;
3601}
3602
3603static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
3604{
3605        struct kvm_mmu_page *sp;
3606        unsigned i;
3607
3608        if (vcpu->arch.mmu->shadow_root_level >= PT64_ROOT_4LEVEL) {
3609                spin_lock(&vcpu->kvm->mmu_lock);
3610                if (make_mmu_pages_available(vcpu) < 0) {
3611                        spin_unlock(&vcpu->kvm->mmu_lock);
3612                        return -ENOSPC;
3613                }
3614                sp = kvm_mmu_get_page(vcpu, 0, 0,
3615                                vcpu->arch.mmu->shadow_root_level, 1, ACC_ALL);
3616                ++sp->root_count;
3617                spin_unlock(&vcpu->kvm->mmu_lock);
3618                vcpu->arch.mmu->root_hpa = __pa(sp->spt);
3619        } else if (vcpu->arch.mmu->shadow_root_level == PT32E_ROOT_LEVEL) {
3620                for (i = 0; i < 4; ++i) {
3621                        hpa_t root = vcpu->arch.mmu->pae_root[i];
3622
3623                        MMU_WARN_ON(VALID_PAGE(root));
3624                        spin_lock(&vcpu->kvm->mmu_lock);
3625                        if (make_mmu_pages_available(vcpu) < 0) {
3626                                spin_unlock(&vcpu->kvm->mmu_lock);
3627                                return -ENOSPC;
3628                        }
3629                        sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
3630                                        i << 30, PT32_ROOT_LEVEL, 1, ACC_ALL);
3631                        root = __pa(sp->spt);
3632                        ++sp->root_count;
3633                        spin_unlock(&vcpu->kvm->mmu_lock);
3634                        vcpu->arch.mmu->pae_root[i] = root | PT_PRESENT_MASK;
3635                }
3636                vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
3637        } else
3638                BUG();
3639        vcpu->arch.mmu->root_cr3 = vcpu->arch.mmu->get_cr3(vcpu);
3640
3641        return 0;
3642}
3643
3644static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
3645{
3646        struct kvm_mmu_page *sp;
3647        u64 pdptr, pm_mask;
3648        gfn_t root_gfn, root_cr3;
3649        int i;
3650
3651        root_cr3 = vcpu->arch.mmu->get_cr3(vcpu);
3652        root_gfn = root_cr3 >> PAGE_SHIFT;
3653
3654        if (mmu_check_root(vcpu, root_gfn))
3655                return 1;
3656
3657        /*
3658         * Do we shadow a long mode page table? If so we need to
3659         * write-protect the guest's page table root.
3660         */
3661        if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
3662                hpa_t root = vcpu->arch.mmu->root_hpa;
3663
3664                MMU_WARN_ON(VALID_PAGE(root));
3665
3666                spin_lock(&vcpu->kvm->mmu_lock);
3667                if (make_mmu_pages_available(vcpu) < 0) {
3668                        spin_unlock(&vcpu->kvm->mmu_lock);
3669                        return -ENOSPC;
3670                }
3671                sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
3672                                vcpu->arch.mmu->shadow_root_level, 0, ACC_ALL);
3673                root = __pa(sp->spt);
3674                ++sp->root_count;
3675                spin_unlock(&vcpu->kvm->mmu_lock);
3676                vcpu->arch.mmu->root_hpa = root;
3677                goto set_root_cr3;
3678        }
3679
3680        /*
3681         * We shadow a 32 bit page table. This may be a legacy 2-level
3682         * or a PAE 3-level page table. In either case we need to be aware that
3683         * the shadow page table may be a PAE or a long mode page table.
3684         */
3685        pm_mask = PT_PRESENT_MASK;
3686        if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL)
3687                pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
3688
3689        for (i = 0; i < 4; ++i) {
3690                hpa_t root = vcpu->arch.mmu->pae_root[i];
3691
3692                MMU_WARN_ON(VALID_PAGE(root));
3693                if (vcpu->arch.mmu->root_level == PT32E_ROOT_LEVEL) {
3694                        pdptr = vcpu->arch.mmu->get_pdptr(vcpu, i);
3695                        if (!(pdptr & PT_PRESENT_MASK)) {
3696                                vcpu->arch.mmu->pae_root[i] = 0;
3697                                continue;
3698                        }
3699                        root_gfn = pdptr >> PAGE_SHIFT;
3700                        if (mmu_check_root(vcpu, root_gfn))
3701                                return 1;
3702                }
3703                spin_lock(&vcpu->kvm->mmu_lock);
3704                if (make_mmu_pages_available(vcpu) < 0) {
3705                        spin_unlock(&vcpu->kvm->mmu_lock);
3706                        return -ENOSPC;
3707                }
3708                sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL,
3709                                      0, ACC_ALL);
3710                root = __pa(sp->spt);
3711                ++sp->root_count;
3712                spin_unlock(&vcpu->kvm->mmu_lock);
3713
3714                vcpu->arch.mmu->pae_root[i] = root | pm_mask;
3715        }
3716        vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
3717
3718        /*
3719         * If we shadow a 32 bit page table with a long mode page
3720         * table we enter this path.
3721         */
3722        if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL) {
3723                if (vcpu->arch.mmu->lm_root == NULL) {
3724                        /*
3725                         * The additional page necessary for this is only
3726                         * allocated on demand.
3727                         */
3728
3729                        u64 *lm_root;
3730
3731                        lm_root = (void*)get_zeroed_page(GFP_KERNEL_ACCOUNT);
3732                        if (lm_root == NULL)
3733                                return 1;
3734
3735                        lm_root[0] = __pa(vcpu->arch.mmu->pae_root) | pm_mask;
3736
3737                        vcpu->arch.mmu->lm_root = lm_root;
3738                }
3739
3740                vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->lm_root);
3741        }
3742
3743set_root_cr3:
3744        vcpu->arch.mmu->root_cr3 = root_cr3;
3745
3746        return 0;
3747}
3748
3749static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
3750{
3751        if (vcpu->arch.mmu->direct_map)
3752                return mmu_alloc_direct_roots(vcpu);
3753        else
3754                return mmu_alloc_shadow_roots(vcpu);
3755}
3756
3757void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
3758{
3759        int i;
3760        struct kvm_mmu_page *sp;
3761
3762        if (vcpu->arch.mmu->direct_map)
3763                return;
3764
3765        if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
3766                return;
3767
3768        vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
3769
3770        if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
3771                hpa_t root = vcpu->arch.mmu->root_hpa;
3772                sp = page_header(root);
3773
3774                /*
3775                 * Even if another CPU was marking the SP as unsync-ed
3776                 * simultaneously, any guest page table changes are not
3777                 * guaranteed to be visible anyway until this VCPU issues a TLB
3778                 * flush strictly after those changes are made. We only need to
3779                 * ensure that the other CPU sets these flags before any actual
3780                 * changes to the page tables are made. The comments in
3781                 * mmu_need_write_protect() describe what could go wrong if this
3782                 * requirement isn't satisfied.
3783                 */
3784                if (!smp_load_acquire(&sp->unsync) &&
3785                    !smp_load_acquire(&sp->unsync_children))
3786                        return;
3787
3788                spin_lock(&vcpu->kvm->mmu_lock);
3789                kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
3790
3791                mmu_sync_children(vcpu, sp);
3792
3793                kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
3794                spin_unlock(&vcpu->kvm->mmu_lock);
3795                return;
3796        }
3797
3798        spin_lock(&vcpu->kvm->mmu_lock);
3799        kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
3800
3801        for (i = 0; i < 4; ++i) {
3802                hpa_t root = vcpu->arch.mmu->pae_root[i];
3803
3804                if (root && VALID_PAGE(root)) {
3805                        root &= PT64_BASE_ADDR_MASK;
3806                        sp = page_header(root);
3807                        mmu_sync_children(vcpu, sp);
3808                }
3809        }
3810
3811        kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
3812        spin_unlock(&vcpu->kvm->mmu_lock);
3813}
3814EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots);
3815
3816static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
3817                                  u32 access, struct x86_exception *exception)
3818{
3819        if (exception)
3820                exception->error_code = 0;
3821        return vaddr;
3822}
3823
3824static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
3825                                         u32 access,
3826                                         struct x86_exception *exception)
3827{
3828        if (exception)
3829                exception->error_code = 0;
3830        return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access, exception);
3831}
3832
3833static bool
3834__is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check, u64 pte, int level)
3835{
3836        int bit7 = (pte >> 7) & 1, low6 = pte & 0x3f;
3837
3838        return (pte & rsvd_check->rsvd_bits_mask[bit7][level-1]) |
3839                ((rsvd_check->bad_mt_xwr & (1ull << low6)) != 0);
3840}
3841
3842static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
3843{
3844        return __is_rsvd_bits_set(&mmu->guest_rsvd_check, gpte, level);
3845}
3846
3847static bool is_shadow_zero_bits_set(struct kvm_mmu *mmu, u64 spte, int level)
3848{
3849        return __is_rsvd_bits_set(&mmu->shadow_zero_check, spte, level);
3850}
3851
3852static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
3853{
3854        /*
3855         * A nested guest cannot use the MMIO cache if it is using nested
3856         * page tables, because cr2 is a nGPA while the cache stores GPAs.
3857         */
3858        if (mmu_is_nested(vcpu))
3859                return false;
3860
3861        if (direct)
3862                return vcpu_match_mmio_gpa(vcpu, addr);
3863
3864        return vcpu_match_mmio_gva(vcpu, addr);
3865}
3866
3867/* return true if reserved bits are detected on the spte. */
3868static bool
3869walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
3870{
3871        struct kvm_shadow_walk_iterator iterator;
3872        u64 sptes[PT64_ROOT_MAX_LEVEL], spte = 0ull;
3873        int root, leaf;
3874        bool reserved = false;
3875
3876        if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
3877                goto exit;
3878
3879        walk_shadow_page_lockless_begin(vcpu);
3880
3881        for (shadow_walk_init(&iterator, vcpu, addr),
3882                 leaf = root = iterator.level;
3883             shadow_walk_okay(&iterator);
3884             __shadow_walk_next(&iterator, spte)) {
3885                spte = mmu_spte_get_lockless(iterator.sptep);
3886
3887                sptes[leaf - 1] = spte;
3888                leaf--;
3889
3890                if (!is_shadow_present_pte(spte))
3891                        break;
3892
3893                reserved |= is_shadow_zero_bits_set(vcpu->arch.mmu, spte,
3894                                                    iterator.level);
3895        }
3896
3897        walk_shadow_page_lockless_end(vcpu);
3898
3899        if (reserved) {
3900                pr_err("%s: detect reserved bits on spte, addr 0x%llx, dump hierarchy:\n",
3901                       __func__, addr);
3902                while (root > leaf) {
3903                        pr_err("------ spte 0x%llx level %d.\n",
3904                               sptes[root - 1], root);
3905                        root--;
3906                }
3907        }
3908exit:
3909        *sptep = spte;
3910        return reserved;
3911}
3912
3913static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
3914{
3915        u64 spte;
3916        bool reserved;
3917
3918        if (mmio_info_in_cache(vcpu, addr, direct))
3919                return RET_PF_EMULATE;
3920
3921        reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte);
3922        if (WARN_ON(reserved))
3923                return -EINVAL;
3924
3925        if (is_mmio_spte(spte)) {
3926                gfn_t gfn = get_mmio_spte_gfn(spte);
3927                unsigned access = get_mmio_spte_access(spte);
3928
3929                if (!check_mmio_spte(vcpu, spte))
3930                        return RET_PF_INVALID;
3931
3932                if (direct)
3933                        addr = 0;
3934
3935                trace_handle_mmio_page_fault(addr, gfn, access);
3936                vcpu_cache_mmio_info(vcpu, addr, gfn, access);
3937                return RET_PF_EMULATE;
3938        }
3939
3940        /*
3941         * If the page table has been zapped by another CPU, let the CPU
3942         * fault again on the address.
3943         */
3944        return RET_PF_RETRY;
3945}
3946
3947static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu,
3948                                         u32 error_code, gfn_t gfn)
3949{
3950        if (unlikely(error_code & PFERR_RSVD_MASK))
3951                return false;
3952
3953        if (!(error_code & PFERR_PRESENT_MASK) ||
3954              !(error_code & PFERR_WRITE_MASK))
3955                return false;
3956
3957        /*
3958         * The guest is writing a page that is write-tracked, which
3959         * cannot be fixed by the page fault handler.
3960         */
3961        if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
3962                return true;
3963
3964        return false;
3965}
3966
3967static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
3968{
3969        struct kvm_shadow_walk_iterator iterator;
3970        u64 spte;
3971
3972        if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
3973                return;
3974
3975        walk_shadow_page_lockless_begin(vcpu);
3976        for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
3977                clear_sp_write_flooding_count(iterator.sptep);
3978                if (!is_shadow_present_pte(spte))
3979                        break;
3980        }
3981        walk_shadow_page_lockless_end(vcpu);
3982}
3983
3984static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
3985                                u32 error_code, bool prefault)
3986{
3987        gfn_t gfn = gva >> PAGE_SHIFT;
3988        int r;
3989
3990        pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
3991
3992        if (page_fault_handle_page_track(vcpu, error_code, gfn))
3993                return RET_PF_EMULATE;
3994
3995        r = mmu_topup_memory_caches(vcpu);
3996        if (r)
3997                return r;
3998
3999        MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa));
4000
4001
4002        return nonpaging_map(vcpu, gva & PAGE_MASK,
4003                             error_code, gfn, prefault);
4004}
4005
4006static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
4007{
4008        struct kvm_arch_async_pf arch;
4009
4010        arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
4011        arch.gfn = gfn;
4012        arch.direct_map = vcpu->arch.mmu->direct_map;
4013        arch.cr3 = vcpu->arch.mmu->get_cr3(vcpu);
4014
4015        return kvm_setup_async_pf(vcpu, gva, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
4016}
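
    /*
     * Illustrative note on the token layout above: the low 12 bits hold
     * the vcpu id and the upper bits a per-vcpu sequence number, e.g.
     * the third async #PF queued on vcpu 5 gets
     *
     *   token = (2 << 12) | 5 = 0x2005
     *
     * which the async-#PF protocol later uses to pair the "page ready"
     * notification with this fault.
     */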
4017
4018bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
4019{
4020        if (unlikely(!lapic_in_kernel(vcpu) ||
4021                     kvm_event_needs_reinjection(vcpu) ||
4022                     vcpu->arch.exception.pending))
4023                return false;
4024
4025        if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu))
4026                return false;
4027
4028        return kvm_x86_ops->interrupt_allowed(vcpu);
4029}
4030
4031static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
4032                         gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable)
4033{
4034        struct kvm_memory_slot *slot;
4035        bool async;
4036
4037        /*
4038         * Don't expose private memslots to L2.
4039         */
4040        if (is_guest_mode(vcpu) && !kvm_is_visible_gfn(vcpu->kvm, gfn)) {
4041                *pfn = KVM_PFN_NOSLOT;
4042                return false;
4043        }
4044
4045        slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
4046        async = false;
4047        *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable);
4048        if (!async)
4049                return false; /* *pfn has correct page already */
4050
4051        if (!prefault && kvm_can_do_async_pf(vcpu)) {
4052                trace_kvm_try_async_get_page(gva, gfn);
4053                if (kvm_find_async_pf_gfn(vcpu, gfn)) {
4054                        trace_kvm_async_pf_doublefault(gva, gfn);
4055                        kvm_make_request(KVM_REQ_APF_HALT, vcpu);
4056                        return true;
4057                } else if (kvm_arch_setup_async_pf(vcpu, gva, gfn))
4058                        return true;
4059        }
4060
4061        *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable);
4062        return false;
4063}
4064
4065int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
4066                                u64 fault_address, char *insn, int insn_len)
4067{
4068        int r = 1;
4069
4070        vcpu->arch.l1tf_flush_l1d = true;
4071        switch (vcpu->arch.apf.host_apf_reason) {
4072        default:
4073                trace_kvm_page_fault(fault_address, error_code);
4074
4075                if (kvm_event_needs_reinjection(vcpu))
4076                        kvm_mmu_unprotect_page_virt(vcpu, fault_address);
4077                r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn,
4078                                insn_len);
4079                break;
4080        case KVM_PV_REASON_PAGE_NOT_PRESENT:
4081                vcpu->arch.apf.host_apf_reason = 0;
4082                local_irq_disable();
4083                kvm_async_pf_task_wait(fault_address, 0);
4084                local_irq_enable();
4085                break;
4086        case KVM_PV_REASON_PAGE_READY:
4087                vcpu->arch.apf.host_apf_reason = 0;
4088                local_irq_disable();
4089                kvm_async_pf_task_wake(fault_address);
4090                local_irq_enable();
4091                break;
4092        }
4093        return r;
4094}
4095EXPORT_SYMBOL_GPL(kvm_handle_page_fault);
4096
4097static bool
4098check_hugepage_cache_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int level)
4099{
4100        int page_num = KVM_PAGES_PER_HPAGE(level);
4101
4102        gfn &= ~(page_num - 1);
4103
4104        return kvm_mtrr_check_gfn_range_consistency(vcpu, gfn, page_num);
4105}
4106
4107static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
4108                          bool prefault)
4109{
4110        kvm_pfn_t pfn;
4111        int r;
4112        int level;
4113        bool force_pt_level;
4114        gfn_t gfn = gpa >> PAGE_SHIFT;
4115        unsigned long mmu_seq;
4116        int write = error_code & PFERR_WRITE_MASK;
4117        bool map_writable;
4118
4119        MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa));
4120
4121        if (page_fault_handle_page_track(vcpu, error_code, gfn))
4122                return RET_PF_EMULATE;
4123
4124        r = mmu_topup_memory_caches(vcpu);
4125        if (r)
4126                return r;
4127
4128        force_pt_level = !check_hugepage_cache_consistency(vcpu, gfn,
4129                                                           PT_DIRECTORY_LEVEL);
4130        level = mapping_level(vcpu, gfn, &force_pt_level);
4131        if (likely(!force_pt_level)) {
4132                if (level > PT_DIRECTORY_LEVEL &&
4133                    !check_hugepage_cache_consistency(vcpu, gfn, level))
4134                        level = PT_DIRECTORY_LEVEL;
4135                gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
4136        }
4137
4138        if (fast_page_fault(vcpu, gpa, level, error_code))
4139                return RET_PF_RETRY;
4140
4141        mmu_seq = vcpu->kvm->mmu_notifier_seq;
4142        smp_rmb();
4143
4144        if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
4145                return RET_PF_RETRY;
4146
4147        if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r))
4148                return r;
4149
4150        spin_lock(&vcpu->kvm->mmu_lock);
4151        if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
4152                goto out_unlock;
4153        if (make_mmu_pages_available(vcpu) < 0)
4154                goto out_unlock;
4155        if (likely(!force_pt_level))
4156                transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
4157        r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault);
4158        spin_unlock(&vcpu->kvm->mmu_lock);
4159
4160        return r;
4161
4162out_unlock:
4163        spin_unlock(&vcpu->kvm->mmu_lock);
4164        kvm_release_pfn_clean(pfn);
4165        return RET_PF_RETRY;
4166}
4167
4168static void nonpaging_init_context(struct kvm_vcpu *vcpu,
4169                                   struct kvm_mmu *context)
4170{
4171        context->page_fault = nonpaging_page_fault;
4172        context->gva_to_gpa = nonpaging_gva_to_gpa;
4173        context->sync_page = nonpaging_sync_page;
4174        context->invlpg = nonpaging_invlpg;
4175        context->update_pte = nonpaging_update_pte;
4176        context->root_level = 0;
4177        context->shadow_root_level = PT32E_ROOT_LEVEL;
4178        context->direct_map = true;
4179        context->nx = false;
4180}
4181
4182/*
4183 * Find out if a previously cached root matching the new CR3/role is available.
4184 * The current root is also inserted into the cache.
4185 * If a matching root was found, it is assigned to kvm_mmu->root_hpa and true is
4186 * returned.
4187 * Otherwise, the LRU root from the cache is assigned to kvm_mmu->root_hpa and
4188 * false is returned. This root should now be freed by the caller.
4189 */
4190static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_cr3,
4191                                  union kvm_mmu_page_role new_role)
4192{
4193        uint i;
4194        struct kvm_mmu_root_info root;
4195        struct kvm_mmu *mmu = vcpu->arch.mmu;
4196
4197        root.cr3 = mmu->root_cr3;
4198        root.hpa = mmu->root_hpa;
4199
4200        for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
4201                swap(root, mmu->prev_roots[i]);
4202
4203                if (new_cr3 == root.cr3 && VALID_PAGE(root.hpa) &&
4204                    page_header(root.hpa) != NULL &&
4205                    new_role.word == page_header(root.hpa)->role.word)
4206                        break;
4207        }
4208
4209        mmu->root_hpa = root.hpa;
4210        mmu->root_cr3 = root.cr3;
4211
4212        return i < KVM_MMU_NUM_PREV_ROOTS;
4213}
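
    /*
     * A rough sketch of the rotation above, assuming
     * KVM_MMU_NUM_PREV_ROOTS == 3.  Starting from
     *
     *   active = A,  prev_roots = [B, C, D]
     *
     * a lookup that hits C ends with
     *
     *   active = C,  prev_roots = [A, B, D]
     *
     * while a miss leaves the LRU entry in the active slot for the
     * caller to free:  active = D,  prev_roots = [A, B, C].
     */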
4214
4215static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3,
4216                            union kvm_mmu_page_role new_role,
4217                            bool skip_tlb_flush)
4218{
4219        struct kvm_mmu *mmu = vcpu->arch.mmu;
4220
4221        /*
4222         * For now, limit the fast switch to 64-bit hosts+VMs in order to avoid
4223         * having to deal with PDPTEs. We may add support for 32-bit hosts/VMs
4224         * later if necessary.
4225         */
4226        if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
4227            mmu->root_level >= PT64_ROOT_4LEVEL) {
4228                if (mmu_check_root(vcpu, new_cr3 >> PAGE_SHIFT))
4229                        return false;
4230
4231                if (cached_root_available(vcpu, new_cr3, new_role)) {
4232                        kvm_make_request(KVM_REQ_LOAD_CR3, vcpu);
4233                        if (!skip_tlb_flush) {
4234                                kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
4235                                kvm_x86_ops->tlb_flush(vcpu, true);
4236                        }
4237
4238                        /*
4239                         * The last MMIO access's GVA and GPA are cached in the
4240                         * VCPU. When switching to a new CR3, that GVA->GPA
4241                         * mapping may no longer be valid. So clear any cached
4242                         * MMIO info even when we don't need to sync the shadow
4243                         * page tables.
4244                         */
4245                        vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
4246
4247                        __clear_sp_write_flooding_count(
4248                                page_header(mmu->root_hpa));
4249
4250                        return true;
4251                }
4252        }
4253
4254        return false;
4255}
4256
4257static void __kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3,
4258                              union kvm_mmu_page_role new_role,
4259                              bool skip_tlb_flush)
4260{
4261        if (!fast_cr3_switch(vcpu, new_cr3, new_role, skip_tlb_flush))
4262                kvm_mmu_free_roots(vcpu, vcpu->arch.mmu,
4263                                   KVM_MMU_ROOT_CURRENT);
4264}
4265
4266void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush)
4267{
4268        __kvm_mmu_new_cr3(vcpu, new_cr3, kvm_mmu_calc_root_page_role(vcpu),
4269                          skip_tlb_flush);
4270}
4271EXPORT_SYMBOL_GPL(kvm_mmu_new_cr3);
4272
4273static unsigned long get_cr3(struct kvm_vcpu *vcpu)
4274{
4275        return kvm_read_cr3(vcpu);
4276}
4277
4278static void inject_page_fault(struct kvm_vcpu *vcpu,
4279                              struct x86_exception *fault)
4280{
4281        vcpu->arch.mmu->inject_page_fault(vcpu, fault);
4282}
4283
4284static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
4285                           unsigned access, int *nr_present)
4286{
4287        if (unlikely(is_mmio_spte(*sptep))) {
4288                if (gfn != get_mmio_spte_gfn(*sptep)) {
4289                        mmu_spte_clear_no_track(sptep);
4290                        return true;
4291                }
4292
4293                (*nr_present)++;
4294                mark_mmio_spte(vcpu, sptep, gfn, access);
4295                return true;
4296        }
4297
4298        return false;
4299}
4300
4301static inline bool is_last_gpte(struct kvm_mmu *mmu,
4302                                unsigned level, unsigned gpte)
4303{
4304        /*
4305         * The RHS has bit 7 set iff level < mmu->last_nonleaf_level.
4306         * If it is clear, there are no large pages at this level, so clear
4307         * PT_PAGE_SIZE_MASK in gpte if that is the case.
4308         */
4309        gpte &= level - mmu->last_nonleaf_level;
4310
4311        /*
4312         * PT_PAGE_TABLE_LEVEL always terminates.  The RHS has bit 7 set
4313         * iff level <= PT_PAGE_TABLE_LEVEL, which for our purpose means
4314         * level == PT_PAGE_TABLE_LEVEL; set PT_PAGE_SIZE_MASK in gpte then.
4315         */
4316        gpte |= level - PT_PAGE_TABLE_LEVEL - 1;
4317
4318        return gpte & PT_PAGE_SIZE_MASK;
4319}
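
    /*
     * Worked example of the unsigned arithmetic above (illustrative
     * only), for a 4-level guest where mmu->last_nonleaf_level == 4:
     *
     *   level 4 (PML4E): 4 - 4 = 0, the AND clears bit 7, so a PML4E is
     *                    never treated as a leaf.
     *   level 3 (PDPTE): 3 - 4 wraps to an all-ones value, bit 7 survives
     *                    the AND, so the gpte's own PT_PAGE_SIZE_MASK
     *                    (1GB page) decides.
     *   level 1 (PTE):   1 - PT_PAGE_TABLE_LEVEL - 1 wraps as well, the
     *                    OR forces bit 7, so a 4K PTE is always last.
     */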
4320
4321#define PTTYPE_EPT 18 /* arbitrary */
4322#define PTTYPE PTTYPE_EPT
4323#include "paging_tmpl.h"
4324#undef PTTYPE
4325
4326#define PTTYPE 64
4327#include "paging_tmpl.h"
4328#undef PTTYPE
4329
4330#define PTTYPE 32
4331#include "paging_tmpl.h"
4332#undef PTTYPE
4333
4334static void
4335__reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
4336                        struct rsvd_bits_validate *rsvd_check,
4337                        int maxphyaddr, int level, bool nx, bool gbpages,
4338                        bool pse, bool amd)
4339{
4340        u64 exb_bit_rsvd = 0;
4341        u64 gbpages_bit_rsvd = 0;
4342        u64 nonleaf_bit8_rsvd = 0;
4343
4344        rsvd_check->bad_mt_xwr = 0;
4345
4346        if (!nx)
4347                exb_bit_rsvd = rsvd_bits(63, 63);
4348        if (!gbpages)
4349                gbpages_bit_rsvd = rsvd_bits(7, 7);
4350
4351        /*
4352         * Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit for
4353         * leaf entries) on AMD CPUs only.
4354         */
4355        if (amd)
4356                nonleaf_bit8_rsvd = rsvd_bits(8, 8);
4357
4358        switch (level) {
4359        case PT32_ROOT_LEVEL:
4360                /* no rsvd bits for 2 level 4K page table entries */
4361                rsvd_check->rsvd_bits_mask[0][1] = 0;
4362                rsvd_check->rsvd_bits_mask[0][0] = 0;
4363                rsvd_check->rsvd_bits_mask[1][0] =
4364                        rsvd_check->rsvd_bits_mask[0][0];
4365
4366                if (!pse) {
4367                        rsvd_check->rsvd_bits_mask[1][1] = 0;
4368                        break;
4369                }
4370
4371                if (is_cpuid_PSE36())
4372                        /* 36bits PSE 4MB page */
4373                        rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
4374                else
4375                        /* 32 bits PSE 4MB page */
4376                        rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
4377                break;
4378        case PT32E_ROOT_LEVEL:
4379                rsvd_check->rsvd_bits_mask[0][2] =
4380                        rsvd_bits(maxphyaddr, 63) |
4381                        rsvd_bits(5, 8) | rsvd_bits(1, 2);      /* PDPTE */
4382                rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd |
4383                        rsvd_bits(maxphyaddr, 62);      /* PDE */
4384                rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd |
4385                        rsvd_bits(maxphyaddr, 62);      /* PTE */
4386                rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd |
4387                        rsvd_bits(maxphyaddr, 62) |
4388                        rsvd_bits(13, 20);              /* large page */
4389                rsvd_check->rsvd_bits_mask[1][0] =
4390                        rsvd_check->rsvd_bits_mask[0][0];
4391                break;
4392        case PT64_ROOT_5LEVEL:
4393                rsvd_check->rsvd_bits_mask[0][4] = exb_bit_rsvd |
4394                        nonleaf_bit8_rsvd | rsvd_bits(7, 7) |
4395                        rsvd_bits(maxphyaddr, 51);
4396                rsvd_check->rsvd_bits_mask[1][4] =
4397                        rsvd_check->rsvd_bits_mask[0][4];
4398                /* fall through */
4399        case PT64_ROOT_4LEVEL:
4400                rsvd_check->rsvd_bits_mask[0][3] = exb_bit_rsvd |
4401                        nonleaf_bit8_rsvd | rsvd_bits(7, 7) |
4402                        rsvd_bits(maxphyaddr, 51);
4403                rsvd_check->rsvd_bits_mask[0][2] = exb_bit_rsvd |
4404                        nonleaf_bit8_rsvd | gbpages_bit_rsvd |
4405                        rsvd_bits(maxphyaddr, 51);
4406                rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd |
4407                        rsvd_bits(maxphyaddr, 51);
4408                rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd |
4409                        rsvd_bits(maxphyaddr, 51);
4410                rsvd_check->rsvd_bits_mask[1][3] =
4411                        rsvd_check->rsvd_bits_mask[0][3];
4412                rsvd_check->rsvd_bits_mask[1][2] = exb_bit_rsvd |
4413                        gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51) |
4414                        rsvd_bits(13, 29);
4415                rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd |
4416                        rsvd_bits(maxphyaddr, 51) |
4417                        rsvd_bits(13, 20);              /* large page */
4418                rsvd_check->rsvd_bits_mask[1][0] =
4419                        rsvd_check->rsvd_bits_mask[0][0];
4420                break;
4421        }
4422}
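
    /*
     * Worked example (illustrative only): an Intel guest with 4-level
     * paging, MAXPHYADDR == 46, NX enabled and 1GB pages supported gets,
     * for the PDPTE masks (rsvd_bits_mask[*][2]):
     *
     *   bit 7 clear (table pointer): rsvd_bits(46, 51)
     *       only address bits above MAXPHYADDR are reserved.
     *   bit 7 set (1GB leaf):        rsvd_bits(46, 51) | rsvd_bits(13, 29)
     *       a 1GB frame must be 1GB aligned, so bits 13-29 are also
     *       reserved (bit 12 is PAT).
     *
     * When used for the guest tables, a gpte with any of these bits set
     * is caught by is_rsvd_bits_set() and the walker reports a
     * reserved-bit #PF.
     */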
4423
4424static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
4425                                  struct kvm_mmu *context)
4426{
4427        __reset_rsvds_bits_mask(vcpu, &context->guest_rsvd_check,
4428                                cpuid_maxphyaddr(vcpu), context->root_level,
4429                                context->nx,
4430                                guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
4431                                is_pse(vcpu), guest_cpuid_is_amd(vcpu));
4432}
4433
4434static void
4435__reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
4436                            int maxphyaddr, bool execonly)
4437{
4438        u64 bad_mt_xwr;
4439
4440        rsvd_check->rsvd_bits_mask[0][4] =
4441                rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
4442        rsvd_check->rsvd_bits_mask[0][3] =
4443                rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
4444        rsvd_check->rsvd_bits_mask[0][2] =
4445                rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
4446        rsvd_check->rsvd_bits_mask[0][1] =
4447                rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
4448        rsvd_check->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51);
4449
4450        /* large page */
4451        rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4];
4452        rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3];
4453        rsvd_check->rsvd_bits_mask[1][2] =
4454                rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29);
4455        rsvd_check->rsvd_bits_mask[1][1] =
4456                rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20);
4457        rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0];
4458
4459        bad_mt_xwr = 0xFFull << (2 * 8);        /* bits 3..5 must not be 2 */
4460        bad_mt_xwr |= 0xFFull << (3 * 8);       /* bits 3..5 must not be 3 */
4461        bad_mt_xwr |= 0xFFull << (7 * 8);       /* bits 3..5 must not be 7 */
4462        bad_mt_xwr |= REPEAT_BYTE(1ull << 2);   /* bits 0..2 must not be 010 */
4463        bad_mt_xwr |= REPEAT_BYTE(1ull << 6);   /* bits 0..2 must not be 110 */
4464        if (!execonly) {
4465                /* bits 0..2 must not be 100 unless VMX capabilities allow it */
4466                bad_mt_xwr |= REPEAT_BYTE(1ull << 4);
4467        }
4468        rsvd_check->bad_mt_xwr = bad_mt_xwr;
4469}
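
    /*
     * Sketch of how bad_mt_xwr is consumed (illustrative only): the low
     * six bits of an EPT entry are RWX (bits 0-2) plus the memory type
     * (bits 3-5), and __is_rsvd_bits_set() tests bit "pte & 0x3f" of
     * bad_mt_xwr.  For an execute-only leaf with WB memory type:
     *
     *   low6 = (6 << 3) | 4 = 0x34
     *
     * Without execonly support, REPEAT_BYTE(1ull << 4) has bit 0x34 set,
     * so such an entry is reported as reserved; with execonly it is
     * accepted.
     */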
4470
4471static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
4472                struct kvm_mmu *context, bool execonly)
4473{
4474        __reset_rsvds_bits_mask_ept(&context->guest_rsvd_check,
4475                                    cpuid_maxphyaddr(vcpu), execonly);
4476}
4477
4478/*
4479 * The page table on the host is the shadow page table for the page
4480 * table in the guest (or in an AMD nested guest); its MMU features
4481 * completely follow the features in the guest.
4482 */
4483void
4484reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
4485{
4486        bool uses_nx = context->nx ||
4487                context->mmu_role.base.smep_andnot_wp;
4488        struct rsvd_bits_validate *shadow_zero_check;
4489        int i;
4490
4491        /*
4492         * Passing "true" to the last argument is okay; it adds a check
4493         * on bit 8 of the SPTEs which KVM doesn't use anyway.
4494         */
4495        shadow_zero_check = &context->shadow_zero_check;
4496        __reset_rsvds_bits_mask(vcpu, shadow_zero_check,
4497                                boot_cpu_data.x86_phys_bits,
4498                                context->shadow_root_level, uses_nx,
4499                                guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
4500                                is_pse(vcpu), true);
4501
4502        if (!shadow_me_mask)
4503                return;
4504
4505        for (i = context->shadow_root_level; --i >= 0;) {
4506                shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
4507                shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
4508        }
4509
4510}
4511EXPORT_SYMBOL_GPL(reset_shadow_zero_bits_mask);
4512
4513static inline bool boot_cpu_is_amd(void)
4514{
4515        WARN_ON_ONCE(!tdp_enabled);
4516        return shadow_x_mask == 0;
4517}
4518
4519/*
4520 * The direct page table on the host uses as many MMU features as
4521 * possible; however, KVM currently does not do execution-protection.
4522 */
4523static void
4524reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
4525                                struct kvm_mmu *context)
4526{
4527        struct rsvd_bits_validate *shadow_zero_check;
4528        int i;
4529
4530        shadow_zero_check = &context->shadow_zero_check;
4531
4532        if (boot_cpu_is_amd())
4533                __reset_rsvds_bits_mask(vcpu, shadow_zero_check,
4534                                        boot_cpu_data.x86_phys_bits,
4535                                        context->shadow_root_level, false,
4536                                        boot_cpu_has(X86_FEATURE_GBPAGES),
4537                                        true, true);
4538        else
4539                __reset_rsvds_bits_mask_ept(shadow_zero_check,
4540                                            boot_cpu_data.x86_phys_bits,
4541                                            false);
4542
4543        if (!shadow_me_mask)
4544                return;
4545
4546        for (i = context->shadow_root_level; --i >= 0;) {
4547                shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
4548                shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
4549        }
4550}
4551
4552/*
4553 * Same as the comment for reset_shadow_zero_bits_mask(), except this
4554 * is the shadow page table for an Intel nested guest.
4555 */
4556static void
4557reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
4558                                struct kvm_mmu *context, bool execonly)
4559{
4560        __reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
4561                                    boot_cpu_data.x86_phys_bits, execonly);
4562}
4563
4564#define BYTE_MASK(access) \
4565        ((1 & (access) ? 2 : 0) | \
4566         (2 & (access) ? 4 : 0) | \
4567         (3 & (access) ? 8 : 0) | \
4568         (4 & (access) ? 16 : 0) | \
4569         (5 & (access) ? 32 : 0) | \
4570         (6 & (access) ? 64 : 0) | \
4571         (7 & (access) ? 128 : 0))
4572
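/*
 * Editor's note (illustrative sketch, not part of the original source):
 * BYTE_MASK(access) builds an 8-bit mask with bit i set for every UWX
 * combination i (0..7) that includes the given access bit.  Assuming the
 * usual KVM definitions ACC_EXEC_MASK == 1, ACC_WRITE_MASK == 2 and
 * ACC_USER_MASK == 4, the constants used below evaluate to:
 *
 *   BYTE_MASK(ACC_EXEC_MASK)  == 0xaa   (bits 1,3,5,7: combinations with X)
 *   BYTE_MASK(ACC_WRITE_MASK) == 0xcc   (bits 2,3,6,7: combinations with W)
 *   BYTE_MASK(ACC_USER_MASK)  == 0xf0   (bits 4,5,6,7: combinations with U)
 */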
4573
4574static void update_permission_bitmask(struct kvm_vcpu *vcpu,
4575                                      struct kvm_mmu *mmu, bool ept)
4576{
4577        unsigned byte;
4578
4579        const u8 x = BYTE_MASK(ACC_EXEC_MASK);
4580        const u8 w = BYTE_MASK(ACC_WRITE_MASK);
4581        const u8 u = BYTE_MASK(ACC_USER_MASK);
4582
4583        bool cr4_smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP) != 0;
4584        bool cr4_smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP) != 0;
4585        bool cr0_wp = is_write_protection(vcpu);
4586
4587        for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) {
4588                unsigned pfec = byte << 1;
4589
4590                /*
4591                 * Each "*f" variable has a 1 bit for each UWX value
4592                 * that causes a fault with the given PFEC.
4593                 */
4594
4595                /* Faults from writes to non-writable pages */
4596                u8 wf = (pfec & PFERR_WRITE_MASK) ? ~w : 0;
4597                /* Faults from user mode accesses to supervisor pages */
4598                u8 uf = (pfec & PFERR_USER_MASK) ? ~u : 0;
4599                /* Faults from fetches of non-executable pages */
4600                u8 ff = (pfec & PFERR_FETCH_MASK) ? ~x : 0;
4601                /* Faults from kernel mode fetches of user pages */
4602                u8 smepf = 0;
4603                /* Faults from kernel mode accesses of user pages */
4604                u8 smapf = 0;
4605
4606                if (!ept) {
4607                        /* Faults from kernel mode accesses to user pages */
4608                        u8 kf = (pfec & PFERR_USER_MASK) ? 0 : u;
4609
4610                        /* Not really needed: !nx will cause pte.nx to fault */
4611                        if (!mmu->nx)
4612                                ff = 0;
4613
4614                        /* Allow supervisor writes if !cr0.wp */
4615                        if (!cr0_wp)
4616                                wf = (pfec & PFERR_USER_MASK) ? wf : 0;
4617
4618                        /* Disallow supervisor fetches of user code if cr4.smep */
4619                        if (cr4_smep)
4620                                smepf = (pfec & PFERR_FETCH_MASK) ? kf : 0;
4621
4622                        /*
4623                         * SMAP:kernel-mode data accesses from user-mode
4624                         * mappings should fault. A fault is considered
4625                         * as a SMAP violation if all of the following
4626                         * conditions are true:
4627                         *   - X86_CR4_SMAP is set in CR4
4628                         *   - A user page is accessed
4629                         *   - The access is not a fetch
4630                         *   - Page fault in kernel mode
4631                         *   - CPL = 3 or X86_EFLAGS_AC is clear
4632                         *
4633                         * Here, we cover the first four conditions.
4634                         * The fifth is computed dynamically in permission_fault();
4635                         * PFERR_RSVD_MASK bit will be set in PFEC if the access is
4636                         * *not* subject to SMAP restrictions.
4637                         */
4638                        if (cr4_smap)
4639                                smapf = (pfec & (PFERR_RSVD_MASK|PFERR_FETCH_MASK)) ? 0 : kf;
4640                }
4641
4642                mmu->permissions[byte] = ff | uf | wf | smepf | smapf;
4643        }
4644}
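
/*
 * Editor's note (illustrative sketch, not part of the original source):
 * each byte of mmu->permissions[] records which UWX combinations fault for
 * a given error code.  A consumer such as permission_fault() essentially
 * does the equivalent of:
 *
 *	u8 fault_mask = mmu->permissions[pfec >> 1];
 *	bool fault = (fault_mask >> pte_access) & 1;
 *
 * plus the dynamic SMAP/PKU handling; the two lines above are only a rough
 * sketch, with pfec and pte_access standing for the error code and the
 * pte's UWX access bits.
 */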
4645
4646/*
4647 * PKU is an additional mechanism by which paging controls access to
4648 * user-mode addresses based on the value in the PKRU register.  Protection
4649 * key violations are reported through a bit in the page fault error code.
4650 * Unlike other bits of the error code, the PK bit is not known at the
4651 * call site of e.g. gva_to_gpa; it must be computed directly in
4652 * permission_fault based on two bits of PKRU, on some machine state (CR4,
4653 * CR0, EFER, CPL), and on other bits of the error code and the page tables.
4654 *
4655 * In particular, the following conditions come from the error code, the
4656 * page tables and the machine state:
4657 * - PK is always zero unless CR4.PKE=1 and EFER.LMA=1
4658 * - PK is always zero if RSVD=1 (reserved bit set) or F=1 (instruction fetch)
4659 * - PK is always zero if U=0 in the page tables
4660 * - PKRU.WD is ignored if CR0.WP=0 and the access is a supervisor access.
4661 *
4662 * The PKRU bitmask caches the result of these four conditions.  The error
4663 * code (minus the P bit) and the page table's U bit form an index into the
4664 * PKRU bitmask.  Two bits of the PKRU bitmask are then extracted and ANDed
4665 * with the two bits of the PKRU register corresponding to the protection key.
4666 * For the first three conditions above, the bits will be 00, thus masking
4667 * away both AD and WD.  For all reads, or if the last condition holds, only
4668 * WD will be masked away.
4669 */
4670static void update_pkru_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
4671                                bool ept)
4672{
4673        unsigned bit;
4674        bool wp;
4675
4676        if (ept) {
4677                mmu->pkru_mask = 0;
4678                return;
4679        }
4680
4681        /* PKEY is enabled only if CR4.PKE and EFER.LMA are both set. */
4682        if (!kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || !is_long_mode(vcpu)) {
4683                mmu->pkru_mask = 0;
4684                return;
4685        }
4686
4687        wp = is_write_protection(vcpu);
4688
4689        for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) {
4690                unsigned pfec, pkey_bits;
4691                bool check_pkey, check_write, ff, uf, wf, pte_user;
4692
4693                pfec = bit << 1;
4694                ff = pfec & PFERR_FETCH_MASK;
4695                uf = pfec & PFERR_USER_MASK;
4696                wf = pfec & PFERR_WRITE_MASK;
4697
4698                /* PFEC.RSVD is replaced by ACC_USER_MASK. */
4699                pte_user = pfec & PFERR_RSVD_MASK;
4700
4701                /*
4702                 * Only need to check accesses which are not
4703                 * instruction fetches and are to user pages.
4704                 */
4705                check_pkey = (!ff && pte_user);
4706                /*
4707                 * write access is controlled by PKRU if it is a
4708                 * user access or CR0.WP = 1.
4709                 */
4710                check_write = check_pkey && wf && (uf || wp);
4711
4712                /* PKRU.AD stops both read and write access. */
4713                pkey_bits = !!check_pkey;
4714                /* PKRU.WD stops write access. */
4715                pkey_bits |= (!!check_write) << 1;
4716
4717                mmu->pkru_mask |= (pkey_bits & 3) << pfec;
4718        }
4719}
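
/*
 * Editor's note (illustrative sketch, not part of the original source):
 * per the comment above, the consumer (permission_fault()) extracts two
 * bits from pkru_mask at the index formed by the error code and the pte's
 * U bit, and ANDs them with the AD/WD bits of the protection key, roughly:
 *
 *	pkru_bits  = (guest_pkru >> (pte_pkey * 2)) & 3;  /* key's AD/WD  */
 *	pkru_bits &= mmu->pkru_mask >> index;             /* cached bits  */
 *	fault |= (pkru_bits != 0);
 *
 * where guest_pkru, pte_pkey and index are placeholder names for the
 * guest's PKRU value, the page's protection key and the bitmask index.
 */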
4720
4721static void update_last_nonleaf_level(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
4722{
4723        unsigned root_level = mmu->root_level;
4724
4725        mmu->last_nonleaf_level = root_level;
4726        if (root_level == PT32_ROOT_LEVEL && is_pse(vcpu))
4727                mmu->last_nonleaf_level++;
4728}
4729
4730static void paging64_init_context_common(struct kvm_vcpu *vcpu,
4731                                         struct kvm_mmu *context,
4732                                         int level)
4733{
4734        context->nx = is_nx(vcpu);
4735        context->root_level = level;
4736
4737        reset_rsvds_bits_mask(vcpu, context);
4738        update_permission_bitmask(vcpu, context, false);
4739        update_pkru_bitmask(vcpu, context, false);
4740        update_last_nonleaf_level(vcpu, context);
4741
4742        MMU_WARN_ON(!is_pae(vcpu));
4743        context->page_fault = paging64_page_fault;
4744        context->gva_to_gpa = paging64_gva_to_gpa;
4745        context->sync_page = paging64_sync_page;
4746        context->invlpg = paging64_invlpg;
4747        context->update_pte = paging64_update_pte;
4748        context->shadow_root_level = level;
4749        context->direct_map = false;
4750}
4751
4752static void paging64_init_context(struct kvm_vcpu *vcpu,
4753                                  struct kvm_mmu *context)
4754{
4755        int root_level = is_la57_mode(vcpu) ?
4756                         PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
4757
4758        paging64_init_context_common(vcpu, context, root_level);
4759}
4760
4761static void paging32_init_context(struct kvm_vcpu *vcpu,
4762                                  struct kvm_mmu *context)
4763{
4764        context->nx = false;
4765        context->root_level = PT32_ROOT_LEVEL;
4766
4767        reset_rsvds_bits_mask(vcpu, context);
4768        update_permission_bitmask(vcpu, context, false);
4769        update_pkru_bitmask(vcpu, context, false);
4770        update_last_nonleaf_level(vcpu, context);
4771
4772        context->page_fault = paging32_page_fault;
4773        context->gva_to_gpa = paging32_gva_to_gpa;
4774        context->sync_page = paging32_sync_page;
4775        context->invlpg = paging32_invlpg;
4776        context->update_pte = paging32_update_pte;
4777        context->shadow_root_level = PT32E_ROOT_LEVEL;
4778        context->direct_map = false;
4779}
4780
4781static void paging32E_init_context(struct kvm_vcpu *vcpu,
4782                                   struct kvm_mmu *context)
4783{
4784        paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL);
4785}
4786
4787static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu)
4788{
4789        union kvm_mmu_extended_role ext = {0};
4790
4791        ext.cr0_pg = !!is_paging(vcpu);
4792        ext.cr4_pae = !!is_pae(vcpu);
4793        ext.cr4_smep = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
4794        ext.cr4_smap = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMAP);
4795        ext.cr4_pse = !!is_pse(vcpu);
4796        ext.cr4_pke = !!kvm_read_cr4_bits(vcpu, X86_CR4_PKE);
4797        ext.cr4_la57 = !!kvm_read_cr4_bits(vcpu, X86_CR4_LA57);
4798        ext.maxphyaddr = cpuid_maxphyaddr(vcpu);
4799
4800        ext.valid = 1;
4801
4802        return ext;
4803}
4804
4805static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu,
4806                                                   bool base_only)
4807{
4808        union kvm_mmu_role role = {0};
4809
4810        role.base.access = ACC_ALL;
4811        role.base.nxe = !!is_nx(vcpu);
4812        role.base.cr0_wp = is_write_protection(vcpu);
4813        role.base.smm = is_smm(vcpu);
4814        role.base.guest_mode = is_guest_mode(vcpu);
4815
4816        if (base_only)
4817                return role;
4818
4819        role.ext = kvm_calc_mmu_role_ext(vcpu);
4820
4821        return role;
4822}
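
/*
 * Editor's note (illustrative sketch, not part of the original source):
 * the callers below use the role computed here as a cheap "did anything
 * relevant change?" key before rebuilding an MMU context, e.g.:
 *
 *	union kvm_mmu_role new_role = kvm_calc_mmu_role_common(vcpu, false);
 *
 *	new_role.base.word &= mmu_base_role_mask.word;
 *	if (new_role.as_u64 == context->mmu_role.as_u64)
 *		return;		/* context is already up to date */
 *
 * so a full reconfiguration is skipped when none of the captured
 * CR0/CR4/EFER/SMM state has changed.
 */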
4823
4824static union kvm_mmu_role
4825kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
4826{
4827        union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only);
4828
4829        role.base.ad_disabled = (shadow_accessed_mask == 0);
4830        role.base.level = kvm_x86_ops->get_tdp_level(vcpu);
4831        role.base.direct = true;
4832        role.base.gpte_is_8_bytes = true;
4833
4834        return role;
4835}
4836
4837static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
4838{
4839        struct kvm_mmu *context = vcpu->arch.mmu;
4840        union kvm_mmu_role new_role =
4841                kvm_calc_tdp_mmu_root_page_role(vcpu, false);
4842
4843        new_role.base.word &= mmu_base_role_mask.word;
4844        if (new_role.as_u64 == context->mmu_role.as_u64)
4845                return;
4846
4847        context->mmu_role.as_u64 = new_role.as_u64;
4848        context->page_fault = tdp_page_fault;
4849        context->sync_page = nonpaging_sync_page;
4850        context->invlpg = nonpaging_invlpg;
4851        context->update_pte = nonpaging_update_pte;
4852        context->shadow_root_level = kvm_x86_ops->get_tdp_level(vcpu);
4853        context->direct_map = true;
4854        context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
4855        context->get_cr3 = get_cr3;
4856        context->get_pdptr = kvm_pdptr_read;
4857        context->inject_page_fault = kvm_inject_page_fault;
4858
4859        if (!is_paging(vcpu)) {
4860                context->nx = false;
4861                context->gva_to_gpa = nonpaging_gva_to_gpa;
4862                context->root_level = 0;
4863        } else if (is_long_mode(vcpu)) {
4864                context->nx = is_nx(vcpu);
4865                context->root_level = is_la57_mode(vcpu) ?
4866                                PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
4867                reset_rsvds_bits_mask(vcpu, context);
4868                context->gva_to_gpa = paging64_gva_to_gpa;
4869        } else if (is_pae(vcpu)) {
4870                context->nx = is_nx(vcpu);
4871                context->root_level = PT32E_ROOT_LEVEL;
4872                reset_rsvds_bits_mask(vcpu, context);
4873                context->gva_to_gpa = paging64_gva_to_gpa;
4874        } else {
4875                context->nx = false;
4876                context->root_level = PT32_ROOT_LEVEL;
4877                reset_rsvds_bits_mask(vcpu, context);
4878                context->gva_to_gpa = paging32_gva_to_gpa;
4879        }
4880
4881        update_permission_bitmask(vcpu, context, false);
4882        update_pkru_bitmask(vcpu, context, false);
4883        update_last_nonleaf_level(vcpu, context);
4884        reset_tdp_shadow_zero_bits_mask(vcpu, context);
4885}
4886
4887static union kvm_mmu_role
4888kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
4889{
4890        union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only);
4891
4892        role.base.smep_andnot_wp = role.ext.cr4_smep &&
4893                !is_write_protection(vcpu);
4894        role.base.smap_andnot_wp = role.ext.cr4_smap &&
4895                !is_write_protection(vcpu);
4896        role.base.direct = !is_paging(vcpu);
4897        role.base.gpte_is_8_bytes = !!is_pae(vcpu);
4898
4899        if (!is_long_mode(vcpu))
4900                role.base.level = PT32E_ROOT_LEVEL;
4901        else if (is_la57_mode(vcpu))
4902                role.base.level = PT64_ROOT_5LEVEL;
4903        else
4904                role.base.level = PT64_ROOT_4LEVEL;
4905
4906        return role;
4907}
4908
4909void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
4910{
4911        struct kvm_mmu *context = vcpu->arch.mmu;
4912        union kvm_mmu_role new_role =
4913                kvm_calc_shadow_mmu_root_page_role(vcpu, false);
4914
4915        new_role.base.word &= mmu_base_role_mask.word;
4916        if (new_role.as_u64 == context->mmu_role.as_u64)
4917                return;
4918
4919        if (!is_paging(vcpu))
4920                nonpaging_init_context(vcpu, context);
4921        else if (is_long_mode(vcpu))
4922                paging64_init_context(vcpu, context);
4923        else if (is_pae(vcpu))
4924                paging32E_init_context(vcpu, context);
4925        else
4926                paging32_init_context(vcpu, context);
4927
4928        context->mmu_role.as_u64 = new_role.as_u64;
4929        reset_shadow_zero_bits_mask(vcpu, context);
4930}
4931EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
4932
4933static union kvm_mmu_role
4934kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
4935                                   bool execonly)
4936{
4937        union kvm_mmu_role role = {0};
4938
4939        /* SMM flag is inherited from root_mmu */
4940        role.base.smm = vcpu->arch.root_mmu.mmu_role.base.smm;
4941
4942        role.base.level = PT64_ROOT_4LEVEL;
4943        role.base.gpte_is_8_bytes = true;
4944        role.base.direct = false;
4945        role.base.ad_disabled = !accessed_dirty;
4946        role.base.guest_mode = true;
4947        role.base.access = ACC_ALL;
4948
4949        /*
4950         * WP=1 and NOT_WP=1 is an impossible combination, use WP and the
4951         * SMAP variation to denote shadow EPT entries.
4952         */
4953        role.base.cr0_wp = true;
4954        role.base.smap_andnot_wp = true;
4955
4956        role.ext = kvm_calc_mmu_role_ext(vcpu);
4957        role.ext.execonly = execonly;
4958
4959        return role;
4960}
4961
4962void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
4963                             bool accessed_dirty, gpa_t new_eptp)
4964{
4965        struct kvm_mmu *context = vcpu->arch.mmu;
4966        union kvm_mmu_role new_role =
4967                kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty,
4968                                                   execonly);
4969
4970        __kvm_mmu_new_cr3(vcpu, new_eptp, new_role.base, false);
4971
4972        new_role.base.word &= mmu_base_role_mask.word;
4973        if (new_role.as_u64 == context->mmu_role.as_u64)
4974                return;
4975
4976        context->shadow_root_level = PT64_ROOT_4LEVEL;
4977
4978        context->nx = true;
4979        context->ept_ad = accessed_dirty;
4980        context->page_fault = ept_page_fault;
4981        context->gva_to_gpa = ept_gva_to_gpa;
4982        context->sync_page = ept_sync_page;
4983        context->invlpg = ept_invlpg;
4984        context->update_pte = ept_update_pte;
4985        context->root_level = PT64_ROOT_4LEVEL;
4986        context->direct_map = false;
4987        context->mmu_role.as_u64 = new_role.as_u64;
4988
4989        update_permission_bitmask(vcpu, context, true);
4990        update_pkru_bitmask(vcpu, context, true);
4991        update_last_nonleaf_level(vcpu, context);
4992        reset_rsvds_bits_mask_ept(vcpu, context, execonly);
4993        reset_ept_shadow_zero_bits_mask(vcpu, context, execonly);
4994}
4995EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
4996
4997static void init_kvm_softmmu(struct kvm_vcpu *vcpu)
4998{
4999        struct kvm_mmu *context = vcpu->arch.mmu;
5000
5001        kvm_init_shadow_mmu(vcpu);
5002        context->set_cr3           = kvm_x86_ops->set_cr3;
5003        context->get_cr3           = get_cr3;
5004        context->get_pdptr         = kvm_pdptr_read;
5005        context->inject_page_fault = kvm_inject_page_fault;
5006}
5007
5008static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
5009{
5010        union kvm_mmu_role new_role = kvm_calc_mmu_role_common(vcpu, false);
5011        struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
5012
5013        new_role.base.word &= mmu_base_role_mask.word;
5014        if (new_role.as_u64 == g_context->mmu_role.as_u64)
5015                return;
5016
5017        g_context->mmu_role.as_u64 = new_role.as_u64;
5018        g_context->get_cr3           = get_cr3;
5019        g_context->get_pdptr         = kvm_pdptr_read;
5020        g_context->inject_page_fault = kvm_inject_page_fault;
5021
5022        /*
5023         * Note that arch.mmu->gva_to_gpa translates l2_gpa to l1_gpa using
5024         * L1's nested page tables (e.g. EPT12). The nested translation
5025         * of l2_gva to l1_gpa is done by arch.nested_mmu.gva_to_gpa using
5026         * L2's page tables as the first level of translation and L1's
5027         * nested page tables as the second level of translation. Basically
5028         * the gva_to_gpa functions between mmu and nested_mmu are swapped.
5029         */
5030        if (!is_paging(vcpu)) {
5031                g_context->nx = false;
5032                g_context->root_level = 0;
5033                g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
5034        } else if (is_long_mode(vcpu)) {
5035                g_context->nx = is_nx(vcpu);
5036                g_context->root_level = is_la57_mode(vcpu) ?
5037                                        PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
5038                reset_rsvds_bits_mask(vcpu, g_context);
5039                g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
5040        } else if (is_pae(vcpu)) {
5041                g_context->nx = is_nx(vcpu);
5042                g_context->root_level = PT32E_ROOT_LEVEL;
5043                reset_rsvds_bits_mask(vcpu, g_context);
5044                g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
5045        } else {
5046                g_context->nx = false;
5047                g_context->root_level = PT32_ROOT_LEVEL;
5048                reset_rsvds_bits_mask(vcpu, g_context);
5049                g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
5050        }
5051
5052        update_permission_bitmask(vcpu, g_context, false);
5053        update_pkru_bitmask(vcpu, g_context, false);
5054        update_last_nonleaf_level(vcpu, g_context);
5055}
5056
5057void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots)
5058{
5059        if (reset_roots) {
5060                uint i;
5061
5062                vcpu->arch.mmu->root_hpa = INVALID_PAGE;
5063
5064                for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
5065                        vcpu->arch.mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
5066        }
5067
5068        if (mmu_is_nested(vcpu))
5069                init_kvm_nested_mmu(vcpu);
5070        else if (tdp_enabled)
5071                init_kvm_tdp_mmu(vcpu);
5072        else
5073                init_kvm_softmmu(vcpu);
5074}
5075EXPORT_SYMBOL_GPL(kvm_init_mmu);
5076
5077static union kvm_mmu_page_role
5078kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu)
5079{
5080        union kvm_mmu_role role;
5081
5082        if (tdp_enabled)
5083                role = kvm_calc_tdp_mmu_root_page_role(vcpu, true);
5084        else
5085                role = kvm_calc_shadow_mmu_root_page_role(vcpu, true);
5086
5087        return role.base;
5088}
5089
5090void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
5091{
5092        kvm_mmu_unload(vcpu);
5093        kvm_init_mmu(vcpu, true);
5094}
5095EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
5096
5097int kvm_mmu_load(struct kvm_vcpu *vcpu)
5098{
5099        int r;
5100
5101        r = mmu_topup_memory_caches(vcpu);
5102        if (r)
5103                goto out;
5104        r = mmu_alloc_roots(vcpu);
5105        kvm_mmu_sync_roots(vcpu);
5106        if (r)
5107                goto out;
5108        kvm_mmu_load_cr3(vcpu);
5109        kvm_x86_ops->tlb_flush(vcpu, true);
5110out:
5111        return r;
5112}
5113EXPORT_SYMBOL_GPL(kvm_mmu_load);
5114
5115void kvm_mmu_unload(struct kvm_vcpu *vcpu)
5116{
5117        kvm_mmu_free_roots(vcpu, &vcpu->arch.root_mmu, KVM_MMU_ROOTS_ALL);
5118        WARN_ON(VALID_PAGE(vcpu->arch.root_mmu.root_hpa));
5119        kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
5120        WARN_ON(VALID_PAGE(vcpu->arch.guest_mmu.root_hpa));
5121}
5122EXPORT_SYMBOL_GPL(kvm_mmu_unload);
5123
5124static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
5125                                  struct kvm_mmu_page *sp, u64 *spte,
5126                                  const void *new)
5127{
5128        if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
5129                ++vcpu->kvm->stat.mmu_pde_zapped;
5130                return;
5131        }
5132
5133        ++vcpu->kvm->stat.mmu_pte_updated;
5134        vcpu->arch.mmu->update_pte(vcpu, sp, spte, new);
5135}
5136
5137static bool need_remote_flush(u64 old, u64 new)
5138{
5139        if (!is_shadow_present_pte(old))
5140                return false;
5141        if (!is_shadow_present_pte(new))
5142                return true;
5143        if ((old ^ new) & PT64_BASE_ADDR_MASK)
5144                return true;
5145        old ^= shadow_nx_mask;
5146        new ^= shadow_nx_mask;
5147        return (old & ~new & PT64_PERM_MASK) != 0;
5148}
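
/*
 * Editor's note (worked examples, not part of the original source):
 *
 *   old = P|W, new = P     -> W removed, old & ~new & PT64_PERM_MASK != 0
 *                             -> remote flush needed
 *   old = P,   new = P|W   -> nothing removed -> no remote flush; stale
 *                             TLB entries are merely more restrictive
 *
 * The xor with shadow_nx_mask makes setting NX count as removing a
 * permission (and clearing NX as granting one).
 */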
5149
5150static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
5151                                    int *bytes)
5152{
5153        u64 gentry = 0;
5154        int r;
5155
5156        /*
5157         * Assume that the pte write is on a page table of the same type
5158         * as the current vcpu paging mode, since we update the sptes only
5159         * when they have the same mode.
5160         */
5161        if (is_pae(vcpu) && *bytes == 4) {
5162                /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
5163                *gpa &= ~(gpa_t)7;
5164                *bytes = 8;
5165        }
5166
5167        if (*bytes == 4 || *bytes == 8) {
5168                r = kvm_vcpu_read_guest_atomic(vcpu, *gpa, &gentry, *bytes);
5169                if (r)
5170                        gentry = 0;
5171        }
5172
5173        return gentry;
5174}
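
/*
 * Editor's note (worked example, not part of the original source): a PAE
 * guest updating a 64-bit gpte with two 32-bit stores, e.g. a 4-byte write
 * at gpa 0x1004, is handled above by aligning the gpa down to 0x1000 and
 * widening *bytes to 8, so the whole (possibly half-updated) gpte is
 * re-read atomically from guest memory.
 */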
5175
5176/*
5177 * If we're seeing too many writes to a page, it may no longer be a page table,
5178 * or we may be forking, in which case it is better to unmap the page.
5179 */
5180static bool detect_write_flooding(struct kvm_mmu_page *sp)
5181{
5182        /*
5183         * Skip write-flooding detection for sps whose level is 1: such an sp
5184         * can become unsync, and then the guest page is not write-protected.
5185         */
5186        if (sp->role.level == PT_PAGE_TABLE_LEVEL)
5187                return false;
5188
5189        atomic_inc(&sp->write_flooding_count);
5190        return atomic_read(&sp->write_flooding_count) >= 3;
5191}
5192
5193/*
5194 * Misaligned accesses are too much trouble to fix up; also, they usually
5195 * indicate a page is not used as a page table.
5196 */
5197static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
5198                                    int bytes)
5199{
5200        unsigned offset, pte_size, misaligned;
5201
5202        pgprintk("misaligned: gpa %llx bytes %d role %x\n",
5203                 gpa, bytes, sp->role.word);
5204
5205        offset = offset_in_page(gpa);
5206        pte_size = sp->role.gpte_is_8_bytes ? 8 : 4;
5207
5208        /*
5209         * Sometimes the OS writes only the low byte of a pte to update status
5210         * bits; for example, Linux uses an andb instruction in clear_bit().
5211         */
5212        if (!(offset & (pte_size - 1)) && bytes == 1)
5213                return false;
5214
5215        misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
5216        misaligned |= bytes < 4;
5217
5218        return misaligned;
5219}
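
/*
 * Editor's note (worked examples, not part of the original source), for an
 * sp with 8-byte gptes (pte_size == 8):
 *
 *   offset 0x08, bytes 8 -> (0x08 ^ 0x0f) & ~7 == 0 -> aligned
 *   offset 0x08, bytes 1 -> status-bit update, returns false early
 *   offset 0x04, bytes 4 -> (0x04 ^ 0x07) & ~7 == 0 -> aligned (one half
 *                           of a gpte, see mmu_pte_write_fetch_gpte())
 *   offset 0x06, bytes 4 -> (0x06 ^ 0x09) & ~7 == 8 -> misaligned, the
 *                           write straddles two gptes
 *   offset 0x08, bytes 2 -> bytes < 4 -> misaligned
 */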
5220
5221static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
5222{
5223        unsigned page_offset, quadrant;
5224        u64 *spte;
5225        int level;
5226
5227        page_offset = offset_in_page(gpa);
5228        level = sp->role.level;
5229        *nspte = 1;
5230        if (!sp->role.gpte_is_8_bytes) {
5231                page_offset <<= 1;      /* 32->64 */
5232                /*
5233                 * A 32-bit pde maps 4MB while the shadow pdes map
5234                 * only 2MB.  So we need to double the offset again
5235                 * and zap two pdes instead of one.
5236                 */
5237                if (level == PT32_ROOT_LEVEL) {
5238                        page_offset &= ~7; /* kill rounding error */
5239                        page_offset <<= 1;
5240                        *nspte = 2;
5241                }
5242                quadrant = page_offset >> PAGE_SHIFT;
5243                page_offset &= ~PAGE_MASK;
5244                if (quadrant != sp->role.quadrant)
5245                        return NULL;
5246        }
5247
5248        spte = &sp->spt[page_offset / sizeof(*spte)];
5249        return spte;
5250}
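
/*
 * Editor's note (worked example, not part of the original source): for a
 * shadow page with 4-byte gptes (!gpte_is_8_bytes) at the 32-bit root
 * level, a guest write at page offset 0x400 (gpte index 0x100) becomes:
 * page_offset is doubled to 0x800, doubled again to 0x1000 with nspte = 2,
 * quadrant becomes 1 and page_offset wraps to 0.  So only the shadow page
 * whose role.quadrant is 1 matches, and its first two sptes (the pair of
 * 2MB shadow pdes covering the guest's single 4MB pde) are returned.
 */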
5251
5252static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
5253                              const u8 *new, int bytes,
5254                              struct kvm_page_track_notifier_node *node)
5255{
5256        gfn_t gfn = gpa >> PAGE_SHIFT;
5257        struct kvm_mmu_page *sp;
5258        LIST_HEAD(invalid_list);
5259        u64 entry, gentry, *spte;
5260        int npte;
5261        bool remote_flush, local_flush;
5262
5263        /*
5264         * If we don't have indirect shadow pages, it means no page is
5265         * write-protected, so we can simply exit.
5266         */
5267        if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
5268                return;
5269
5270        remote_flush = local_flush = false;
5271
5272        pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
5273
5274        /*
5275         * No need to care whether the memory allocation is successful
5276         * or not, since pte prefetch is skipped if the cache does not
5277         * have enough objects.
5278         */
5279        mmu_topup_memory_caches(vcpu);
5280
5281        spin_lock(&vcpu->kvm->mmu_lock);
5282
5283        gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes);
5284
5285        ++vcpu->kvm->stat.mmu_pte_write;
5286        kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
5287
5288        for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
5289                if (detect_write_misaligned(sp, gpa, bytes) ||
5290                      detect_write_flooding(sp)) {
5291                        kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
5292                        ++vcpu->kvm->stat.mmu_flooded;
5293                        continue;
5294                }
5295
5296                spte = get_written_sptes(sp, gpa, &npte);
5297                if (!spte)
5298                        continue;
5299
5300                local_flush = true;
5301                while (npte--) {
5302                        u32 base_role = vcpu->arch.mmu->mmu_role.base.word;
5303
5304                        entry = *spte;
5305                        mmu_page_zap_pte(vcpu->kvm, sp, spte);
5306                        if (gentry &&
5307                              !((sp->role.word ^ base_role)
5308                              & mmu_base_role_mask.word) && rmap_can_add(vcpu))
5309                                mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
5310                        if (need_remote_flush(entry, *spte))
5311                                remote_flush = true;
5312                        ++spte;
5313                }
5314        }
5315        kvm_mmu_flush_or_zap(vcpu, &invalid_list, remote_flush, local_flush);
5316        kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
5317        spin_unlock(&vcpu->kvm->mmu_lock);
5318}
5319
5320int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
5321{
5322        gpa_t gpa;
5323        int r;
5324
5325        if (vcpu->arch.mmu->direct_map)
5326                return 0;
5327
5328        gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
5329
5330        r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
5331
5332        return r;
5333}
5334EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
5335
5336static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
5337{
5338        LIST_HEAD(invalid_list);
5339
5340        if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES))
5341                return 0;
5342
5343        while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES) {
5344                if (!prepare_zap_oldest_mmu_page(vcpu->kvm, &invalid_list))
5345                        break;
5346
5347                ++vcpu->kvm->stat.mmu_recycled;
5348        }
5349        kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
5350
5351        if (!kvm_mmu_available_pages(vcpu->kvm))
5352                return -ENOSPC;
5353        return 0;
5354}
5355
5356int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
5357                       void *insn, int insn_len)
5358{
5359        int r, emulation_type = 0;
5360        enum emulation_result er;
5361        bool direct = vcpu->arch.mmu->direct_map;
5362
5363        /* With shadow page tables, fault_address contains a GVA or nGPA.  */
5364        if (vcpu->arch.mmu->direct_map) {
5365                vcpu->arch.gpa_available = true;
5366                vcpu->arch.gpa_val = cr2;
5367        }
5368
5369        r = RET_PF_INVALID;
5370        if (unlikely(error_code & PFERR_RSVD_MASK)) {
5371                r = handle_mmio_page_fault(vcpu, cr2, direct);
5372                if (r == RET_PF_EMULATE)
5373                        goto emulate;
5374        }
5375
5376        if (r == RET_PF_INVALID) {
5377                r = vcpu->arch.mmu->page_fault(vcpu, cr2,
5378                                               lower_32_bits(error_code),
5379                                               false);
5380                WARN_ON(r == RET_PF_INVALID);
5381        }
5382
5383        if (r == RET_PF_RETRY)
5384                return 1;
5385        if (r < 0)
5386                return r;
5387
5388        /*
5389         * Before emulating the instruction, check if the error code
5390         * was due to a RO violation while translating the guest page.
5391         * This can occur when using nested virtualization with nested
5392         * paging in both guests. If true, we simply unprotect the page
5393         * and resume the guest.
5394         */
5395        if (vcpu->arch.mmu->direct_map &&
5396            (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
5397                kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2));
5398                return 1;
5399        }
5400
5401        /*
5402         * vcpu->arch.mmu.page_fault returned RET_PF_EMULATE, but we can still
5403         * optimistically try to just unprotect the page and let the processor
5404         * re-execute the instruction that caused the page fault.  Do not allow
5405         * retrying MMIO emulation, as it's not only pointless but could also
5406         * cause us to enter an infinite loop because the processor will keep
5407         * faulting on the non-existent MMIO address.  Retrying an instruction
5408         * from a nested guest is also pointless and dangerous as we are only
5409         * explicitly shadowing L1's page tables, i.e. unprotecting something
5410         * for L1 isn't going to magically fix whatever issue caused L2 to fail.
5411         */
5412        if (!mmio_info_in_cache(vcpu, cr2, direct) && !is_guest_mode(vcpu))
5413                emulation_type = EMULTYPE_ALLOW_RETRY;
5414emulate:
5415        /*
5416         * On AMD platforms, under certain conditions insn_len may be zero on #NPF.
5417         * This can happen if a guest gets a page-fault on data access but the HW
5418         * table walker is not able to read the instruction page (e.g. instruction
5419         * page is not present in memory). In those cases we simply restart the
5420         * guest, with the exception of AMD Erratum 1096 which is unrecoverable.
5421         */
5422        if (unlikely(insn && !insn_len)) {
5423                if (!kvm_x86_ops->need_emulation_on_page_fault(vcpu))
5424                        return 1;
5425        }
5426
5427        er = x86_emulate_instruction(vcpu, cr2, emulation_type, insn, insn_len);
5428
5429        switch (er) {
5430        case EMULATE_DONE:
5431                return 1;
5432        case EMULATE_USER_EXIT:
5433                ++vcpu->stat.mmio_exits;
5434                /* fall through */
5435        case EMULATE_FAIL:
5436                return 0;
5437        default:
5438                BUG();
5439        }
5440}
5441EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
5442
5443void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
5444{
5445        struct kvm_mmu *mmu = vcpu->arch.mmu;
5446        int i;
5447
5448        /* INVLPG on a non-canonical address is a NOP according to the SDM.  */
5449        if (is_noncanonical_address(gva, vcpu))
5450                return;
5451
5452        mmu->invlpg(vcpu, gva, mmu->root_hpa);
5453
5454        /*
5455         * INVLPG is required to invalidate any global mappings for the VA,
5456         * irrespective of PCID. Since it would take roughly the same amount
5457         * of work to determine whether any of the prev_root mappings of the
5458         * VA is marked global as it would to just sync it blindly, we might
5459         * as well just always sync it.
5460         *
5461         * Mappings not reachable via the current cr3 or the prev_roots will be
5462         * synced when switching to that cr3, so nothing needs to be done here
5463         * for them.
5464         */
5465        for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
5466                if (VALID_PAGE(mmu->prev_roots[i].hpa))
5467                        mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
5468
5469        kvm_x86_ops->tlb_flush_gva(vcpu, gva);
5470        ++vcpu->stat.invlpg;
5471}
5472EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
5473
5474void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
5475{
5476        struct kvm_mmu *mmu = vcpu->arch.mmu;
5477        bool tlb_flush = false;
5478        uint i;
5479
5480        if (pcid == kvm_get_active_pcid(vcpu)) {
5481                mmu->invlpg(vcpu, gva, mmu->root_hpa);
5482                tlb_flush = true;
5483        }
5484
5485        for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
5486                if (VALID_PAGE(mmu->prev_roots[i].hpa) &&
5487                    pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].cr3)) {
5488                        mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
5489                        tlb_flush = true;
5490                }
5491        }
5492
5493        if (tlb_flush)
5494                kvm_x86_ops->tlb_flush_gva(vcpu, gva);
5495
5496        ++vcpu->stat.invlpg;
5497
5498        /*
5499         * Mappings not reachable via the current cr3 or the prev_roots will be
5500         * synced when switching to that cr3, so nothing needs to be done here
5501         * for them.
5502         */
5503}
5504EXPORT_SYMBOL_GPL(kvm_mmu_invpcid_gva);
5505
5506void kvm_enable_tdp(void)
5507{
5508        tdp_enabled = true;
5509}
5510EXPORT_SYMBOL_GPL(kvm_enable_tdp);
5511
5512void kvm_disable_tdp(void)
5513{
5514        tdp_enabled = false;
5515}
5516EXPORT_SYMBOL_GPL(kvm_disable_tdp);
5517
5518
5519/* The return value indicates whether a TLB flush on all vcpus is needed. */
5520typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head);
5521
5522/* The caller should hold mmu-lock before calling this function. */
5523static __always_inline bool
5524slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
5525                        slot_level_handler fn, int start_level, int end_level,
5526                        gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb)
5527{
5528        struct slot_rmap_walk_iterator iterator;
5529        bool flush = false;
5530
5531        for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
5532                        end_gfn, &iterator) {
5533                if (iterator.rmap)
5534                        flush |= fn(kvm, iterator.rmap);
5535
5536                if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
5537                        if (flush && lock_flush_tlb) {
5538                                kvm_flush_remote_tlbs_with_address(kvm,
5539                                                start_gfn,
5540                                                iterator.gfn - start_gfn + 1);
5541                                flush = false;
5542                        }
5543                        cond_resched_lock(&kvm->mmu_lock);
5544                }
5545        }
5546
5547        if (flush && lock_flush_tlb) {
5548                kvm_flush_remote_tlbs_with_address(kvm, start_gfn,
5549                                                   end_gfn - start_gfn + 1);
5550                flush = false;
5551        }
5552
5553        return flush;
5554}
5555
5556static __always_inline bool
5557slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
5558                  slot_level_handler fn, int start_level, int end_level,
5559                  bool lock_flush_tlb)
5560{
5561        return slot_handle_level_range(kvm, memslot, fn, start_level,
5562                        end_level, memslot->base_gfn,
5563                        memslot->base_gfn + memslot->npages - 1,
5564                        lock_flush_tlb);
5565}
5566
5567static __always_inline bool
5568slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
5569                      slot_level_handler fn, bool lock_flush_tlb)
5570{
5571        return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
5572                                 PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
5573}
5574
5575static __always_inline bool
5576slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
5577                        slot_level_handler fn, bool lock_flush_tlb)
5578{
5579        return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL + 1,
5580                                 PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
5581}
5582
5583static __always_inline bool
5584slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot,
5585                 slot_level_handler fn, bool lock_flush_tlb)
5586{
5587        return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
5588                                 PT_PAGE_TABLE_LEVEL, lock_flush_tlb);
5589}
5590
5591static void free_mmu_pages(struct kvm_vcpu *vcpu)
5592{
5593        free_page((unsigned long)vcpu->arch.mmu->pae_root);
5594        free_page((unsigned long)vcpu->arch.mmu->lm_root);
5595}
5596
5597static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
5598{
5599        struct page *page;
5600        int i;
5601
5602        /*
5603         * When using PAE paging, the four PDPTEs are treated as 'root' pages,
5604         * while the PDP table is a per-vCPU construct that's allocated at MMU
5605         * creation.  When emulating 32-bit mode, cr3 is only 32 bits even on
5606         * x86_64.  Therefore we need to allocate the PDP table in the first
5607         * 4GB of memory, which happens to fit the DMA32 zone.  Except for
5608         * SVM's 32-bit NPT support, TDP paging doesn't use PAE paging and can
5609         * skip allocating the PDP table.
5610         */
5611        if (tdp_enabled && kvm_x86_ops->get_tdp_level(vcpu) > PT32E_ROOT_LEVEL)
5612                return 0;
5613
5614        page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32);
5615        if (!page)
5616                return -ENOMEM;
5617
5618        vcpu->arch.mmu->pae_root = page_address(page);
5619        for (i = 0; i < 4; ++i)
5620                vcpu->arch.mmu->pae_root[i] = INVALID_PAGE;
5621
5622        return 0;
5623}
5624
5625int kvm_mmu_create(struct kvm_vcpu *vcpu)
5626{
5627        uint i;
5628
5629        vcpu->arch.mmu = &vcpu->arch.root_mmu;
5630        vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
5631
5632        vcpu->arch.root_mmu.root_hpa = INVALID_PAGE;
5633        vcpu->arch.root_mmu.root_cr3 = 0;
5634        vcpu->arch.root_mmu.translate_gpa = translate_gpa;
5635        for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
5636                vcpu->arch.root_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
5637
5638        vcpu->arch.guest_mmu.root_hpa = INVALID_PAGE;
5639        vcpu->arch.guest_mmu.root_cr3 = 0;
5640        vcpu->arch.guest_mmu.translate_gpa = translate_gpa;
5641        for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
5642                vcpu->arch.guest_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
5643
5644        vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
5645        return alloc_mmu_pages(vcpu);
5646}
5647
5648static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
5649                        struct kvm_memory_slot *slot,
5650                        struct kvm_page_track_notifier_node *node)
5651{
5652        struct kvm_mmu_page *sp;
5653        LIST_HEAD(invalid_list);
5654        unsigned long i;
5655        bool flush;
5656        gfn_t gfn;
5657
5658        spin_lock(&kvm->mmu_lock);
5659
5660        if (list_empty(&kvm->arch.active_mmu_pages))
5661                goto out_unlock;
5662
5663        flush = slot_handle_all_level(kvm, slot, kvm_zap_rmapp, false);
5664
5665        for (i = 0; i < slot->npages; i++) {
5666                gfn = slot->base_gfn + i;
5667
5668                for_each_valid_sp(kvm, sp, gfn) {
5669                        if (sp->gfn != gfn)
5670                                continue;
5671
5672                        kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
5673                }
5674                if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
5675                        kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
5676                        flush = false;
5677                        cond_resched_lock(&kvm->mmu_lock);
5678                }
5679        }
5680        kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
5681
5682out_unlock:
5683        spin_unlock(&kvm->mmu_lock);
5684}
5685
5686void kvm_mmu_init_vm(struct kvm *kvm)
5687{
5688        struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
5689
5690        node->track_write = kvm_mmu_pte_write;
5691        node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
5692        kvm_page_track_register_notifier(kvm, node);
5693}
5694
5695void kvm_mmu_uninit_vm(struct kvm *kvm)
5696{
5697        struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
5698
5699        kvm_page_track_unregister_notifier(kvm, node);
5700}
5701
5702void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
5703{
5704        struct kvm_memslots *slots;
5705        struct kvm_memory_slot *memslot;
5706        int i;
5707
5708        spin_lock(&kvm->mmu_lock);
5709        for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
5710                slots = __kvm_memslots(kvm, i);
5711                kvm_for_each_memslot(memslot, slots) {
5712                        gfn_t start, end;
5713
5714                        start = max(gfn_start, memslot->base_gfn);
5715                        end = min(gfn_end, memslot->base_gfn + memslot->npages);
5716                        if (start >= end)
5717                                continue;
5718
5719                        slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
5720                                                PT_PAGE_TABLE_LEVEL, PT_MAX_HUGEPAGE_LEVEL,
5721                                                start, end - 1, true);
5722                }
5723        }
5724
5725        spin_unlock(&kvm->mmu_lock);
5726}
5727
5728static bool slot_rmap_write_protect(struct kvm *kvm,
5729                                    struct kvm_rmap_head *rmap_head)
5730{
5731        return __rmap_write_protect(kvm, rmap_head, false);
5732}
5733
5734void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
5735                                      struct kvm_memory_slot *memslot)
5736{
5737        bool flush;
5738
5739        spin_lock(&kvm->mmu_lock);
5740        flush = slot_handle_all_level(kvm, memslot, slot_rmap_write_protect,
5741                                      false);
5742        spin_unlock(&kvm->mmu_lock);
5743
5744        /*
5745         * kvm_mmu_slot_remove_write_access() and kvm_vm_ioctl_get_dirty_log(),
5746         * which do TLB flushes outside of mmu_lock, should be serialized by
5747         * kvm->slots_lock; otherwise a TLB flush could be missed.
5748         */
5749        lockdep_assert_held(&kvm->slots_lock);
5750
5751        /*
5752         * We can flush all the TLBs outside of the mmu lock without TLB
5753         * corruption, since we only change sptes from writable to
5754         * read-only; thus we only need to care about the case of changing
5755         * an spte from present to present (changing an spte from present
5756         * to non-present flushes all the TLBs immediately).  In other
5757         * words, the only case we care about is mmu_spte_update(), which
5758         * checks SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE
5759         * instead of PT_WRITABLE_MASK and therefore no longer depends
5760         * on PT_WRITABLE_MASK.
5761         */
5762        if (flush)
5763                kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
5764                        memslot->npages);
5765}
5766
5767static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
5768                                         struct kvm_rmap_head *rmap_head)
5769{
5770        u64 *sptep;
5771        struct rmap_iterator iter;
5772        int need_tlb_flush = 0;
5773        kvm_pfn_t pfn;
5774        struct kvm_mmu_page *sp;
5775
5776restart:
5777        for_each_rmap_spte(rmap_head, &iter, sptep) {
5778                sp = page_header(__pa(sptep));
5779                pfn = spte_to_pfn(*sptep);
5780
5781                /*
5782                 * We cannot do huge page mapping for indirect shadow pages,
5783                 * which are found on the last rmap (level = 1) when not using
5784                 * the guest, and the guest page table uses 4K page size
5785                 * mappings if the indirect sp has level = 1.
5786                 * mapping if the indirect sp has level = 1.
5787                 */
5788                if (sp->role.direct &&
5789                        !kvm_is_reserved_pfn(pfn) &&
5790                        PageTransCompoundMap(pfn_to_page(pfn))) {
5791                        pte_list_remove(rmap_head, sptep);
5792
5793                        if (kvm_available_flush_tlb_with_range())
5794                                kvm_flush_remote_tlbs_with_address(kvm, sp->gfn,
5795                                        KVM_PAGES_PER_HPAGE(sp->role.level));
5796                        else
5797                                need_tlb_flush = 1;
5798
5799                        goto restart;
5800                }
5801        }
5802
5803        return need_tlb_flush;
5804}
5805
5806void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
5807                                   const struct kvm_memory_slot *memslot)
5808{
5809        /* FIXME: const-ify all uses of struct kvm_memory_slot.  */
5810        spin_lock(&kvm->mmu_lock);
5811        slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot,
5812                         kvm_mmu_zap_collapsible_spte, true);
5813        spin_unlock(&kvm->mmu_lock);
5814}
5815
5816void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
5817                                   struct kvm_memory_slot *memslot)
5818{
5819        bool flush;
5820
5821        spin_lock(&kvm->mmu_lock);
5822        flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false);
5823        spin_unlock(&kvm->mmu_lock);
5824
5825        lockdep_assert_held(&kvm->slots_lock);
5826
5827        /*
5828         * It's also safe to flush TLBs out of mmu lock here as currently this
5829         * function is only used for dirty logging, in which case flushing TLB
5830         * out of mmu lock also guarantees no dirty pages will be lost in
5831         * dirty_bitmap.
5832         */
5833        if (flush)
5834                kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
5835                                memslot->npages);
5836}
5837EXPORT_SYMBOL_GPL(kvm_mmu_slot_leaf_clear_dirty);
5838
5839void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
5840                                        struct kvm_memory_slot *memslot)
5841{
5842        bool flush;
5843
5844        spin_lock(&kvm->mmu_lock);
5845        flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect,
5846                                        false);
5847        spin_unlock(&kvm->mmu_lock);
5848
5849        /* see kvm_mmu_slot_remove_write_access */
5850        lockdep_assert_held(&kvm->slots_lock);
5851
5852        if (flush)
5853                kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
5854                                memslot->npages);
5855}
5856EXPORT_SYMBOL_GPL(kvm_mmu_slot_largepage_remove_write_access);
5857
5858void kvm_mmu_slot_set_dirty(struct kvm *kvm,
5859                            struct kvm_memory_slot *memslot)
5860{
5861        bool flush;
5862
5863        spin_lock(&kvm->mmu_lock);
5864        flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false);
5865        spin_unlock(&kvm->mmu_lock);
5866
5867        lockdep_assert_held(&kvm->slots_lock);
5868
5869        /* see kvm_mmu_slot_leaf_clear_dirty */
5870        if (flush)
5871                kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
5872                                memslot->npages);
5873}
5874EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
5875
5876static void __kvm_mmu_zap_all(struct kvm *kvm, bool mmio_only)
5877{
5878        struct kvm_mmu_page *sp, *node;
5879        LIST_HEAD(invalid_list);
5880        int ign;
5881
5882        spin_lock(&kvm->mmu_lock);
5883restart:
5884        list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
5885                if (mmio_only && !sp->mmio_cached)
5886                        continue;
5887                if (sp->role.invalid && sp->root_count)
5888                        continue;
5889                if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign)) {
5890                        WARN_ON_ONCE(mmio_only);
5891                        goto restart;
5892                }
5893                if (cond_resched_lock(&kvm->mmu_lock))
5894                        goto restart;
5895        }
5896
5897        kvm_mmu_commit_zap_page(kvm, &invalid_list);
5898        spin_unlock(&kvm->mmu_lock);
5899}
5900
5901void kvm_mmu_zap_all(struct kvm *kvm)
5902{
5903        return __kvm_mmu_zap_all(kvm, false);
5904}
5905
5906void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
5907{
5908        WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
5909
5910        gen &= MMIO_SPTE_GEN_MASK;
5911
5912        /*
5913         * Generation numbers are incremented in multiples of the number of
5914         * address spaces in order to provide unique generations across all
5915         * address spaces.  Strip what is effectively the address space
5916         * modifier prior to checking for a wrap of the MMIO generation so
5917         * that a wrap in any address space is detected.
5918         */
5919        gen &= ~((u64)KVM_ADDRESS_SPACE_NUM - 1);
5920
5921        /*
5922         * The very rare case: if the MMIO generation number has wrapped,
5923         * zap all shadow pages.
5924         */
5925        if (unlikely(gen == 0)) {
5926                kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n");
5927                __kvm_mmu_zap_all(kvm, true);
5928        }
5929}
5930
5931static unsigned long
5932mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
5933{
5934        struct kvm *kvm;
5935        int nr_to_scan = sc->nr_to_scan;
5936        unsigned long freed = 0;
5937
5938        spin_lock(&kvm_lock);
5939
5940        list_for_each_entry(kvm, &vm_list, vm_list) {
5941                int idx;
5942                LIST_HEAD(invalid_list);
5943
5944                /*
5945                 * Never scan more than sc->nr_to_scan VM instances.
5946                 * In practice this condition is never hit, since we do not try
5947                 * to shrink more than one VM and it is very unlikely to see
5948                 * !n_used_mmu_pages so many times.
5949                 */
5950                if (!nr_to_scan--)
5951                        break;
5952                /*
5953                 * n_used_mmu_pages is accessed without holding kvm->mmu_lock
5954                 * here. We may skip a VM instance erroneously, but we do not
5955                 * want to shrink a VM that only started to populate its MMU
5956                 * anyway.
5957                 */
5958                if (!kvm->arch.n_used_mmu_pages)
5959                        continue;
5960
5961                idx = srcu_read_lock(&kvm->srcu);
5962                spin_lock(&kvm->mmu_lock);
5963
5964                if (prepare_zap_oldest_mmu_page(kvm, &invalid_list))
5965                        freed++;
5966                kvm_mmu_commit_zap_page(kvm, &invalid_list);
5967
5968                spin_unlock(&kvm->mmu_lock);
5969                srcu_read_unlock(&kvm->srcu, idx);
5970
5971                /*
5972                 * unfair on small ones
5973                 * per-vm shrinkers cry out
5974                 * sadness comes quickly
5975                 */
5976                list_move_tail(&kvm->vm_list, &vm_list);
5977                break;
5978        }
5979
5980        spin_unlock(&kvm_lock);
5981        return freed;
5982}
5983
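    /*
     * Report how many shadow pages are currently in use across all VMs;
     * the core shrinker uses this to decide how much to ask us to scan.
     */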
5984static unsigned long
5985mmu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
5986{
5987        return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
5988}
5989
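    /*
     * Zapping a shadow page is cheap for us but costly for the guest, which
     * has to fault the mapping back in.  The inflated ->seeks value makes the
     * core shrinker ask for proportionally fewer objects here, so shadow
     * pages are reclaimed only under real memory pressure.
     */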
5990static struct shrinker mmu_shrinker = {
5991        .count_objects = mmu_shrink_count,
5992        .scan_objects = mmu_shrink_scan,
5993        .seeks = DEFAULT_SEEKS * 10,
5994};
5995
5996static void mmu_destroy_caches(void)
5997{
5998        kmem_cache_destroy(pte_list_desc_cache);
5999        kmem_cache_destroy(mmu_page_header_cache);
6000}
6001
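    /*
     * One-time module setup: compute the spte masks, create the slab caches
     * for rmap descriptors and shadow page headers, and register the
     * shrinker that trims shadow pages under memory pressure.
     */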
6002int kvm_mmu_module_init(void)
6003{
6004        int ret = -ENOMEM;
6005
6006        /*
6007         * MMU roles use union aliasing, which is, strictly speaking,
6008         * undefined behavior.  In practice we know how the compilers we
6009         * care about lay out these unions, and that is unlikely to change.
6010         * The BUILD_BUG_ON()s below will catch the assumption going stale.
6011         */
6012        BUILD_BUG_ON(sizeof(union kvm_mmu_page_role) != sizeof(u32));
6013        BUILD_BUG_ON(sizeof(union kvm_mmu_extended_role) != sizeof(u32));
6014        BUILD_BUG_ON(sizeof(union kvm_mmu_role) != sizeof(u64));
6015
6016        kvm_mmu_reset_all_pte_masks();
6017
6018        pte_list_desc_cache = kmem_cache_create("pte_list_desc",
6019                                            sizeof(struct pte_list_desc),
6020                                            0, SLAB_ACCOUNT, NULL);
6021        if (!pte_list_desc_cache)
6022                goto out;
6023
6024        mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
6025                                                  sizeof(struct kvm_mmu_page),
6026                                                  0, SLAB_ACCOUNT, NULL);
6027        if (!mmu_page_header_cache)
6028                goto out;
6029
6030        if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL))
6031                goto out;
6032
6033        ret = register_shrinker(&mmu_shrinker);
6034        if (ret)
6035                goto out_shrinker;
6036
6037        return 0;
6038
    out_shrinker:
            /* Don't leak the percpu counter if shrinker registration fails. */
            percpu_counter_destroy(&kvm_total_used_mmu_pages);
6039out:
6040        mmu_destroy_caches();
6041        return ret;
6042}
6043
6044/*
6045 * Calculate the default shadow MMU page limit for a VM: a small fraction
     * (KVM_PERMILLE_MMU_PAGES per mille) of the pages backing its memslots,
     * but never fewer than KVM_MIN_ALLOC_MMU_PAGES.
6046 */
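    /*
     * A rough worked example, assuming the current x86 values of
     * KVM_PERMILLE_MMU_PAGES (20) and KVM_MIN_ALLOC_MMU_PAGES (64): a guest
     * with 4 GiB of memslots has 1048576 pages and so gets
     * 1048576 * 20 / 1000 = 20971 shadow pages by default, while a tiny
     * guest is still granted at least 64.
     */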
6047unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm)
6048{
6049        unsigned long nr_mmu_pages;
6050        unsigned long nr_pages = 0;
6051        struct kvm_memslots *slots;
6052        struct kvm_memory_slot *memslot;
6053        int i;
6054
6055        for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
6056                slots = __kvm_memslots(kvm, i);
6057
6058                kvm_for_each_memslot(memslot, slots)
6059                        nr_pages += memslot->npages;
6060        }
6061
6062        nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
6063        nr_mmu_pages = max(nr_mmu_pages, KVM_MIN_ALLOC_MMU_PAGES);
6064
6065        return nr_mmu_pages;
6066}
6067
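    /*
     * Tear down all per-vCPU MMU state: unload the current roots, free the
     * special PAE root pages, and drain the per-vCPU memory caches.
     */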
6068void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
6069{
6070        kvm_mmu_unload(vcpu);
6071        free_mmu_pages(vcpu);
6072        mmu_free_memory_caches(vcpu);
6073}
6074
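    /*
     * Module teardown: undo everything kvm_mmu_module_init() set up.
     */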
6075void kvm_mmu_module_exit(void)
6076{
6077        mmu_destroy_caches();
6078        percpu_counter_destroy(&kvm_total_used_mmu_pages);
6079        unregister_shrinker(&mmu_shrinker);
6080        mmu_audit_disable();
6081}
6082