#include "irq.h"
#include "mmu.h"
#include "x86.h"
#include "kvm_cache_regs.h"
#include "cpuid.h"

#include <linux/kvm_host.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/moduleparam.h>
#include <linux/export.h>
#include <linux/swap.h>
#include <linux/hugetlb.h>
#include <linux/compiler.h>
#include <linux/srcu.h>
#include <linux/slab.h>
#include <linux/sched/signal.h>
#include <linux/uaccess.h>
#include <linux/hash.h>
#include <linux/kern_levels.h>
#include <linux/kthread.h>

#include <asm/page.h>
#include <asm/pat.h>
#include <asm/cmpxchg.h>
#include <asm/e820/api.h>
#include <asm/io.h>
#include <asm/vmx.h>
#include <asm/kvm_page_track.h>
#include "trace.h"

extern bool itlb_multihit_kvm_mitigation;

static int __read_mostly nx_huge_pages = -1;
#ifdef CONFIG_PREEMPT_RT
/* Recovery can cause latency spikes, disable it for PREEMPT_RT */
static uint __read_mostly nx_huge_pages_recovery_ratio = 0;
#else
static uint __read_mostly nx_huge_pages_recovery_ratio = 60;
#endif

static int set_nx_huge_pages(const char *val, const struct kernel_param *kp);
static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp);

static struct kernel_param_ops nx_huge_pages_ops = {
	.set = set_nx_huge_pages,
	.get = param_get_bool,
};

static struct kernel_param_ops nx_huge_pages_recovery_ratio_ops = {
	.set = set_nx_huge_pages_recovery_ratio,
	.get = param_get_uint,
};

module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644);
__MODULE_PARM_TYPE(nx_huge_pages, "bool");
module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_ratio_ops,
		&nx_huge_pages_recovery_ratio, 0644);
__MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint");

/*
 * When true, the hardware performs the full guest-virtual -> guest-physical
 * -> host-physical translation itself (two-dimensional paging, i.e. EPT or
 * NPT), so KVM does not need to shadow the guest's page tables.
 */
bool tdp_enabled = false;

enum {
	AUDIT_PRE_PAGE_FAULT,
	AUDIT_POST_PAGE_FAULT,
	AUDIT_PRE_PTE_WRITE,
	AUDIT_POST_PTE_WRITE,
	AUDIT_PRE_SYNC,
	AUDIT_POST_SYNC
};

#undef MMU_DEBUG

#ifdef MMU_DEBUG
static bool dbg = 0;
module_param(dbg, bool, 0644);

#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
#define MMU_WARN_ON(x) WARN_ON(x)
#else
#define pgprintk(x...) do { } while (0)
#define rmap_printk(x...) do { } while (0)
#define MMU_WARN_ON(x) do { } while (0)
#endif

#define PTE_PREFETCH_NUM 8

#define PT_FIRST_AVAIL_BITS_SHIFT 10
#define PT64_SECOND_AVAIL_BITS_SHIFT 54

/*
 * Software-available SPTE bits 52 and 53 encode the "special" SPTE types:
 * whether hardware A/D bits are enabled for the SPTE, whether dirty tracking
 * must fall back to write protection, or whether the SPTE caches an MMIO
 * access.
 */
#define SPTE_SPECIAL_MASK (3ULL << 52)
#define SPTE_AD_ENABLED_MASK (0ULL << 52)
#define SPTE_AD_DISABLED_MASK (1ULL << 52)
#define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52)
#define SPTE_MMIO_MASK (3ULL << 52)

#define PT64_LEVEL_BITS 9

#define PT64_LEVEL_SHIFT(level) \
		(PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)

#define PT64_INDEX(address, level)\
	(((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))


#define PT32_LEVEL_BITS 10

#define PT32_LEVEL_SHIFT(level) \
		(PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)

#define PT32_LVL_OFFSET_MASK(level) \
	(PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
						* PT32_LEVEL_BITS))) - 1))

#define PT32_INDEX(address, level)\
	(((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))


#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
#define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1))
#else
#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
#endif
#define PT64_LVL_ADDR_MASK(level) \
	(PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
						* PT64_LEVEL_BITS))) - 1))
#define PT64_LVL_OFFSET_MASK(level) \
	(PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
						* PT64_LEVEL_BITS))) - 1))

#define PT32_BASE_ADDR_MASK PAGE_MASK
#define PT32_DIR_BASE_ADDR_MASK \
	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
#define PT32_LVL_ADDR_MASK(level) \
	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
						* PT32_LEVEL_BITS))) - 1))

#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \
			| shadow_x_mask | shadow_nx_mask | shadow_me_mask)

#define ACC_EXEC_MASK 1
#define ACC_WRITE_MASK PT_WRITABLE_MASK
#define ACC_USER_MASK PT_USER_MASK
#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)

/* The mask for the R/X bits in EPT PTEs */
#define PT64_EPT_READABLE_MASK 0x1ull
#define PT64_EPT_EXECUTABLE_MASK 0x4ull

#include <trace/events/kvm.h>

#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
#define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))

#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)

/* make pte_list_desc fit well in a cache line */
#define PTE_LIST_EXT 3

/*
 * Return values of handle_mmio_page_fault and mmu.page_fault:
 * RET_PF_RETRY: let the CPU fault again on the address.
 * RET_PF_EMULATE: mmio page fault, emulate the instruction directly.
 * RET_PF_INVALID: the spte is invalid, let the real page fault path update it.
 */
enum {
	RET_PF_RETRY = 0,
	RET_PF_EMULATE = 1,
	RET_PF_INVALID = 2,
};

struct pte_list_desc {
	u64 *sptes[PTE_LIST_EXT];
	struct pte_list_desc *more;
};

struct kvm_shadow_walk_iterator {
	u64 addr;
	hpa_t shadow_addr;
	u64 *sptep;
	int level;
	unsigned index;
};

static const union kvm_mmu_page_role mmu_base_role_mask = {
	.cr0_wp = 1,
	.gpte_is_8_bytes = 1,
	.nxe = 1,
	.smep_andnot_wp = 1,
	.smap_andnot_wp = 1,
	.smm = 1,
	.guest_mode = 1,
	.ad_disabled = 1,
};

#define for_each_shadow_entry_using_root(_vcpu, _root, _addr, _walker)	\
	for (shadow_walk_init_using_root(&(_walker), (_vcpu),		\
					 (_root), (_addr));		\
	     shadow_walk_okay(&(_walker));				\
	     shadow_walk_next(&(_walker)))

#define for_each_shadow_entry(_vcpu, _addr, _walker)		\
	for (shadow_walk_init(&(_walker), _vcpu, _addr);	\
	     shadow_walk_okay(&(_walker));			\
	     shadow_walk_next(&(_walker)))

#define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte)	\
	for (shadow_walk_init(&(_walker), _vcpu, _addr);		\
	     shadow_walk_okay(&(_walker)) &&				\
		({ spte = mmu_spte_get_lockless(_walker.sptep); 1; });	\
	     __shadow_walk_next(&(_walker), spte))

static struct kmem_cache *pte_list_desc_cache;
static struct kmem_cache *mmu_page_header_cache;
static struct percpu_counter kvm_total_used_mmu_pages;

static u64 __read_mostly shadow_nx_mask;
static u64 __read_mostly shadow_x_mask; /* mutually exclusive with shadow_nx_mask */
static u64 __read_mostly shadow_user_mask;
static u64 __read_mostly shadow_accessed_mask;
static u64 __read_mostly shadow_dirty_mask;
static u64 __read_mostly shadow_mmio_mask;
static u64 __read_mostly shadow_mmio_value;
static u64 __read_mostly shadow_mmio_access_mask;
static u64 __read_mostly shadow_present_mask;
static u64 __read_mostly shadow_me_mask;

/*
 * The bits cleared from an SPTE to mark it not-present for access tracking
 * (on EPT these are the RWX permission bits).  An SPTE is considered
 * access-tracked when A/D bits are disabled for it and all of these bits are
 * clear; a zero mask means access tracking is not in use.
 */
static u64 __read_mostly shadow_acc_track_mask;

/*
 * The mask/shift to use for saving the original R/X bits when marking the
 * PTE as not-present for access tracking purposes.  The W bit is not saved,
 * as PTEs being access-tracked also need to be dirty-tracked, so the W bit
 * is restored only when a write to the page is attempted.
 */
static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK |
						    PT64_EPT_EXECUTABLE_MASK;
static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT;

/*
 * This mask must be set on all non-zero non-present or reserved SPTEs (e.g.
 * MMIO SPTEs) in order to guard against L1TF;
 * see kvm_mmu_reset_all_pte_masks().
 */
static u64 __read_mostly shadow_nonpresent_or_rsvd_mask;

/* The number of high physical address bits used by the mask above. */
static const u64 shadow_nonpresent_or_rsvd_mask_len = 5;

/*
 * The GFN bits of an MMIO SPTE that do not overlap
 * shadow_nonpresent_or_rsvd_mask and therefore stay in their normal
 * position; the overlapping bits are stored shifted up by
 * shadow_nonpresent_or_rsvd_mask_len (see mark_mmio_spte()).
 */
static u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;

/*
 * The number of non-reserved physical address bits, irrespective of features
 * (e.g. MKTME) that repurpose otherwise-legal bits.
 */
static u8 __read_mostly shadow_phys_bits;

static void mmu_spte_set(u64 *sptep, u64 spte);
static bool is_executable_pte(u64 spte);
static union kvm_mmu_page_role
kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu);

#define CREATE_TRACE_POINTS
#include "mmutrace.h"

314static inline bool kvm_available_flush_tlb_with_range(void)
315{
316 return kvm_x86_ops->tlb_remote_flush_with_range;
317}
318
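/*
 * Flush a range of guest pages on all vCPUs.  Use the backend's ranged
 * remote flush when it is available and succeeds, otherwise fall back to a
 * full remote TLB flush.
 */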
319static void kvm_flush_remote_tlbs_with_range(struct kvm *kvm,
320 struct kvm_tlb_range *range)
321{
322 int ret = -ENOTSUPP;
323
324 if (range && kvm_x86_ops->tlb_remote_flush_with_range)
325 ret = kvm_x86_ops->tlb_remote_flush_with_range(kvm, range);
326
327 if (ret)
328 kvm_flush_remote_tlbs(kvm);
329}
330
331static void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
332 u64 start_gfn, u64 pages)
333{
334 struct kvm_tlb_range range;
335
336 range.start_gfn = start_gfn;
337 range.pages = pages;
338
339 kvm_flush_remote_tlbs_with_range(kvm, &range);
340}
341
342void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value, u64 access_mask)
343{
344 BUG_ON((u64)(unsigned)access_mask != access_mask);
345 BUG_ON((mmio_mask & mmio_value) != mmio_value);
346 shadow_mmio_value = mmio_value | SPTE_MMIO_MASK;
347 shadow_mmio_mask = mmio_mask | SPTE_SPECIAL_MASK;
348 shadow_mmio_access_mask = access_mask;
349}
350EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
351
352static bool is_mmio_spte(u64 spte)
353{
354 return (spte & shadow_mmio_mask) == shadow_mmio_value;
355}
356
357static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
358{
359 return sp->role.ad_disabled;
360}
361
362static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu)
363{
	/*
	 * When using the EPT page-modification log, the GPAs in the log
	 * would come from L2 rather than L1.  Therefore, we need to rely
	 * on write protection to record dirty pages, which also bypasses
	 * PML since writes now result in a vmexit.  This is only required
	 * for the nested (guest) MMU.
	 */
370 return vcpu->arch.mmu == &vcpu->arch.guest_mmu;
371}
372
373static inline bool spte_ad_enabled(u64 spte)
374{
375 MMU_WARN_ON(is_mmio_spte(spte));
376 return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_DISABLED_MASK;
377}
378
379static inline bool spte_ad_need_write_protect(u64 spte)
380{
381 MMU_WARN_ON(is_mmio_spte(spte));
382 return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_ENABLED_MASK;
383}
384
385static bool is_nx_huge_page_enabled(void)
386{
387 return READ_ONCE(nx_huge_pages);
388}
389
390static inline u64 spte_shadow_accessed_mask(u64 spte)
391{
392 MMU_WARN_ON(is_mmio_spte(spte));
393 return spte_ad_enabled(spte) ? shadow_accessed_mask : 0;
394}
395
396static inline u64 spte_shadow_dirty_mask(u64 spte)
397{
398 MMU_WARN_ON(is_mmio_spte(spte));
399 return spte_ad_enabled(spte) ? shadow_dirty_mask : 0;
400}
401
402static inline bool is_access_track_spte(u64 spte)
403{
404 return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0;
405}
406
/*
 * Due to limited space in PTEs, the MMIO generation is an 18 bit subset of
 * the memslots generation and is derived as follows:
 *
 * Bits 0-8 of the MMIO generation are propagated to spte bits 3-11
 * Bits 9-17 of the MMIO generation are propagated to spte bits 54-62
 *
 * The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included
 * in the MMIO generation number, as doing so would require stealing a bit
 * from the "real" generation number and thus effectively halve the maximum
 * number of MMIO generations that can be handled before encountering a wrap
 * (which requires a full MMU zap).  The flag is instead explicitly queried
 * when checking for MMIO spte cache hits.
 */
#define MMIO_SPTE_GEN_LOW_START 3
#define MMIO_SPTE_GEN_LOW_END 11
#define MMIO_SPTE_GEN_LOW_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \
					   MMIO_SPTE_GEN_LOW_START)

/*
 * The high half starts above SPTE_SPECIAL_MASK (bits 52-53) so that the
 * generation bits never overlap the MMIO/AD type bits.
 */
#define MMIO_SPTE_GEN_HIGH_START PT64_SECOND_AVAIL_BITS_SHIFT
#define MMIO_SPTE_GEN_HIGH_END 62
#define MMIO_SPTE_GEN_HIGH_MASK GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \
					    MMIO_SPTE_GEN_HIGH_START)

#define MMIO_SPTE_GEN_LOW_BITS (MMIO_SPTE_GEN_LOW_END - MMIO_SPTE_GEN_LOW_START + 1)
#define MMIO_SPTE_GEN_HIGH_BITS (MMIO_SPTE_GEN_HIGH_END - MMIO_SPTE_GEN_HIGH_START + 1)

#define MMIO_SPTE_GEN_LOW_SHIFT (MMIO_SPTE_GEN_LOW_START - 0)
#define MMIO_SPTE_GEN_HIGH_SHIFT (MMIO_SPTE_GEN_HIGH_START - MMIO_SPTE_GEN_LOW_BITS)

#define MMIO_SPTE_GEN_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_BITS + \
				       MMIO_SPTE_GEN_HIGH_BITS - 1, 0)

static u64 generation_mmio_spte_mask(u64 gen)
{
	u64 mask;

	WARN_ON(gen & ~MMIO_SPTE_GEN_MASK);

	/* Scatter the generation across the low and high SPTE bit ranges. */
	mask = (gen << MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_SPTE_GEN_LOW_MASK;
	mask |= (gen << MMIO_SPTE_GEN_HIGH_SHIFT) & MMIO_SPTE_GEN_HIGH_MASK;
	return mask;
}

static u64 get_mmio_spte_generation(u64 spte)
{
	u64 gen;

	spte &= ~shadow_mmio_mask;

	gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_SHIFT;
	gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_SHIFT;
	return gen;
}
453
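/*
 * Install a "not-present" MMIO SPTE that caches the gfn, the allowed access
 * bits and the current MMIO generation, so that later faults on this gfn can
 * be handled as MMIO without re-walking the memslots.
 */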
454static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
455 unsigned access)
456{
457 u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK;
458 u64 mask = generation_mmio_spte_mask(gen);
459 u64 gpa = gfn << PAGE_SHIFT;
460
461 access &= shadow_mmio_access_mask;
462 mask |= shadow_mmio_value | access;
463 mask |= gpa | shadow_nonpresent_or_rsvd_mask;
464 mask |= (gpa & shadow_nonpresent_or_rsvd_mask)
465 << shadow_nonpresent_or_rsvd_mask_len;
466
467 trace_mark_mmio_spte(sptep, gfn, access, gen);
468 mmu_spte_set(sptep, mask);
469}
470
471static gfn_t get_mmio_spte_gfn(u64 spte)
472{
473 u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask;
474
475 gpa |= (spte >> shadow_nonpresent_or_rsvd_mask_len)
476 & shadow_nonpresent_or_rsvd_mask;
477
478 return gpa >> PAGE_SHIFT;
479}
480
481static unsigned get_mmio_spte_access(u64 spte)
482{
483 return spte & shadow_mmio_access_mask;
484}
485
486static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
487 kvm_pfn_t pfn, unsigned access)
488{
489 if (unlikely(is_noslot_pfn(pfn))) {
490 mark_mmio_spte(vcpu, sptep, gfn, access);
491 return true;
492 }
493
494 return false;
495}
496
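/*
 * An MMIO SPTE is only usable if the generation it embeds matches the
 * current memslots generation; a mismatch, or an in-progress memslot update,
 * means the cached information may be stale.
 */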
497static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
498{
499 u64 kvm_gen, spte_gen, gen;
500
501 gen = kvm_vcpu_memslots(vcpu)->generation;
502 if (unlikely(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS))
503 return false;
504
505 kvm_gen = gen & MMIO_SPTE_GEN_MASK;
506 spte_gen = get_mmio_spte_generation(spte);
507
508 trace_check_mmio_spte(spte, kvm_gen, spte_gen);
509 return likely(kvm_gen == spte_gen);
510}
511
/*
 * Sets the shadow PTE masks used by the MMU.
 *
 * Assumptions:
 *  - Setting either @accessed_mask or @dirty_mask requires setting both
 *  - At least one of @accessed_mask or @acc_track_mask must be set
 */
519void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
520 u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
521 u64 acc_track_mask, u64 me_mask)
522{
523 BUG_ON(!dirty_mask != !accessed_mask);
524 BUG_ON(!accessed_mask && !acc_track_mask);
525 BUG_ON(acc_track_mask & SPTE_SPECIAL_MASK);
526
527 shadow_user_mask = user_mask;
528 shadow_accessed_mask = accessed_mask;
529 shadow_dirty_mask = dirty_mask;
530 shadow_nx_mask = nx_mask;
531 shadow_x_mask = x_mask;
532 shadow_present_mask = p_mask;
533 shadow_acc_track_mask = acc_track_mask;
534 shadow_me_mask = me_mask;
535}
536EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
537
538static u8 kvm_get_shadow_phys_bits(void)
539{
	/*
	 * When TME/MKTME is enabled, boot_cpu_data.x86_phys_bits is reduced
	 * to exclude the keyID bits, but the hardware does not treat those
	 * bits as reserved.  Read the true physical address width from
	 * CPUID.0x80000008 instead so the reserved-bit masks stay correct.
	 */
546 if (!boot_cpu_has(X86_FEATURE_TME) ||
547 WARN_ON_ONCE(boot_cpu_data.extended_cpuid_level < 0x80000008))
548 return boot_cpu_data.x86_phys_bits;
549
550 return cpuid_eax(0x80000008) & 0xff;
551}
552
553static void kvm_mmu_reset_all_pte_masks(void)
554{
555 u8 low_phys_bits;
556
557 shadow_user_mask = 0;
558 shadow_accessed_mask = 0;
559 shadow_dirty_mask = 0;
560 shadow_nx_mask = 0;
561 shadow_x_mask = 0;
562 shadow_mmio_mask = 0;
563 shadow_present_mask = 0;
564 shadow_acc_track_mask = 0;
565
566 shadow_phys_bits = kvm_get_shadow_phys_bits();
	/*
	 * If the CPU has 46 or fewer physical address bits, then set an
	 * appropriate mask to guard against L1TF attacks; otherwise the CPU
	 * is assumed not to be vulnerable to L1TF.
	 *
	 * Some Intel CPUs address the L1 cache using more physical address
	 * bits than are reported by CPUID, so use the address width of the
	 * L1 cache (x86_cache_bits) when it is known, for a more effective
	 * mitigation.
	 */
578 shadow_nonpresent_or_rsvd_mask = 0;
579 low_phys_bits = boot_cpu_data.x86_cache_bits;
580 if (boot_cpu_data.x86_cache_bits <
581 52 - shadow_nonpresent_or_rsvd_mask_len) {
582 shadow_nonpresent_or_rsvd_mask =
583 rsvd_bits(boot_cpu_data.x86_cache_bits -
584 shadow_nonpresent_or_rsvd_mask_len,
585 boot_cpu_data.x86_cache_bits - 1);
586 low_phys_bits -= shadow_nonpresent_or_rsvd_mask_len;
587 } else
588 WARN_ON_ONCE(boot_cpu_has_bug(X86_BUG_L1TF));
589
590 shadow_nonpresent_or_rsvd_lower_gfn_mask =
591 GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT);
592}
593
594static int is_cpuid_PSE36(void)
595{
596 return 1;
597}
598
599static int is_nx(struct kvm_vcpu *vcpu)
600{
601 return vcpu->arch.efer & EFER_NX;
602}
603
604static int is_shadow_present_pte(u64 pte)
605{
606 return (pte != 0) && !is_mmio_spte(pte);
607}
608
609static int is_large_pte(u64 pte)
610{
611 return pte & PT_PAGE_SIZE_MASK;
612}
613
614static int is_last_spte(u64 pte, int level)
615{
616 if (level == PT_PAGE_TABLE_LEVEL)
617 return 1;
618 if (is_large_pte(pte))
619 return 1;
620 return 0;
621}
622
623static bool is_executable_pte(u64 spte)
624{
625 return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask;
626}
627
628static kvm_pfn_t spte_to_pfn(u64 pte)
629{
630 return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
631}
632
633static gfn_t pse36_gfn_delta(u32 gpte)
634{
635 int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
636
637 return (gpte & PT32_DIR_PSE36_MASK) << shift;
638}
639
640#ifdef CONFIG_X86_64
641static void __set_spte(u64 *sptep, u64 spte)
642{
643 WRITE_ONCE(*sptep, spte);
644}
645
646static void __update_clear_spte_fast(u64 *sptep, u64 spte)
647{
648 WRITE_ONCE(*sptep, spte);
649}
650
651static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
652{
653 return xchg(sptep, spte);
654}
655
656static u64 __get_spte_lockless(u64 *sptep)
657{
658 return READ_ONCE(*sptep);
659}
660#else
661union split_spte {
662 struct {
663 u32 spte_low;
664 u32 spte_high;
665 };
666 u64 spte;
667};
668
669static void count_spte_clear(u64 *sptep, u64 spte)
670{
671 struct kvm_mmu_page *sp = page_header(__pa(sptep));
672
673 if (is_shadow_present_pte(spte))
674 return;
675
676
677 smp_wmb();
678 sp->clear_spte_count++;
679}
680
681static void __set_spte(u64 *sptep, u64 spte)
682{
683 union split_spte *ssptep, sspte;
684
685 ssptep = (union split_spte *)sptep;
686 sspte = (union split_spte)spte;
687
688 ssptep->spte_high = sspte.spte_high;
689
690
691
692
693
694
695 smp_wmb();
696
697 WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
698}
699
700static void __update_clear_spte_fast(u64 *sptep, u64 spte)
701{
702 union split_spte *ssptep, sspte;
703
704 ssptep = (union split_spte *)sptep;
705 sspte = (union split_spte)spte;
706
707 WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
708
709
710
711
712
713 smp_wmb();
714
715 ssptep->spte_high = sspte.spte_high;
716 count_spte_clear(sptep, spte);
717}
718
719static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
720{
721 union split_spte *ssptep, sspte, orig;
722
723 ssptep = (union split_spte *)sptep;
724 sspte = (union split_spte)spte;
725
726
727 orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
728 orig.spte_high = ssptep->spte_high;
729 ssptep->spte_high = sspte.spte_high;
730 count_spte_clear(sptep, spte);
731
732 return orig.spte;
733}
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753static u64 __get_spte_lockless(u64 *sptep)
754{
755 struct kvm_mmu_page *sp = page_header(__pa(sptep));
756 union split_spte spte, *orig = (union split_spte *)sptep;
757 int count;
758
759retry:
760 count = sp->clear_spte_count;
761 smp_rmb();
762
763 spte.spte_low = orig->spte_low;
764 smp_rmb();
765
766 spte.spte_high = orig->spte_high;
767 smp_rmb();
768
769 if (unlikely(spte.spte_low != orig->spte_low ||
770 count != sp->clear_spte_count))
771 goto retry;
772
773 return spte.spte;
774}
775#endif
776
777static bool spte_can_locklessly_be_made_writable(u64 spte)
778{
779 return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) ==
780 (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE);
781}
782
783static bool spte_has_volatile_bits(u64 spte)
784{
785 if (!is_shadow_present_pte(spte))
786 return false;
787
788
789
790
791
792
793
794 if (spte_can_locklessly_be_made_writable(spte) ||
795 is_access_track_spte(spte))
796 return true;
797
798 if (spte_ad_enabled(spte)) {
799 if ((spte & shadow_accessed_mask) == 0 ||
800 (is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0))
801 return true;
802 }
803
804 return false;
805}
806
807static bool is_accessed_spte(u64 spte)
808{
809 u64 accessed_mask = spte_shadow_accessed_mask(spte);
810
811 return accessed_mask ? spte & accessed_mask
812 : !is_access_track_spte(spte);
813}
814
815static bool is_dirty_spte(u64 spte)
816{
817 u64 dirty_mask = spte_shadow_dirty_mask(spte);
818
819 return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK;
820}
821
822
823
824
825
826
827
828static void mmu_spte_set(u64 *sptep, u64 new_spte)
829{
830 WARN_ON(is_shadow_present_pte(*sptep));
831 __set_spte(sptep, new_spte);
832}
833
834
835
836
837
838static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
839{
840 u64 old_spte = *sptep;
841
842 WARN_ON(!is_shadow_present_pte(new_spte));
843
844 if (!is_shadow_present_pte(old_spte)) {
845 mmu_spte_set(sptep, new_spte);
846 return old_spte;
847 }
848
849 if (!spte_has_volatile_bits(old_spte))
850 __update_clear_spte_fast(sptep, new_spte);
851 else
852 old_spte = __update_clear_spte_slow(sptep, new_spte);
853
854 WARN_ON(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));
855
856 return old_spte;
857}
858
/*
 * Rules for using mmu_spte_update:
 * Update the state bits; the mapped pfn must not change.
 *
 * Whenever an SPTE that can be locklessly made writable is overwritten with
 * a read-only one, remote TLBs must be flushed: otherwise rmap_write_protect
 * would find a read-only spte even though a writable one may still be cached
 * in a CPU's TLB.
 *
 * Returns true if the TLB needs to be flushed
 */
870static bool mmu_spte_update(u64 *sptep, u64 new_spte)
871{
872 bool flush = false;
873 u64 old_spte = mmu_spte_update_no_track(sptep, new_spte);
874
875 if (!is_shadow_present_pte(old_spte))
876 return false;
877
878
879
880
881
882
883 if (spte_can_locklessly_be_made_writable(old_spte) &&
884 !is_writable_pte(new_spte))
885 flush = true;
886
887
888
889
890
891
892 if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) {
893 flush = true;
894 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
895 }
896
897 if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) {
898 flush = true;
899 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
900 }
901
902 return flush;
903}
904
/*
 * Rules for using mmu_spte_clear_track_bits:
 * It sets the sptep from present to nonpresent, and tracks the
 * accessed/dirty state of the old spte via kvm_set_pfn_accessed() and
 * kvm_set_pfn_dirty().  Returns 1 if the old spte was present, 0 otherwise.
 */
911static int mmu_spte_clear_track_bits(u64 *sptep)
912{
913 kvm_pfn_t pfn;
914 u64 old_spte = *sptep;
915
916 if (!spte_has_volatile_bits(old_spte))
917 __update_clear_spte_fast(sptep, 0ull);
918 else
919 old_spte = __update_clear_spte_slow(sptep, 0ull);
920
921 if (!is_shadow_present_pte(old_spte))
922 return 0;
923
924 pfn = spte_to_pfn(old_spte);
925
926
927
928
929
930
931 WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn)));
932
933 if (is_accessed_spte(old_spte))
934 kvm_set_pfn_accessed(pfn);
935
936 if (is_dirty_spte(old_spte))
937 kvm_set_pfn_dirty(pfn);
938
939 return 1;
940}
941
942
943
944
945
946
947static void mmu_spte_clear_no_track(u64 *sptep)
948{
949 __update_clear_spte_fast(sptep, 0ull);
950}
951
952static u64 mmu_spte_get_lockless(u64 *sptep)
953{
954 return __get_spte_lockless(sptep);
955}
956
957static u64 mark_spte_for_access_track(u64 spte)
958{
959 if (spte_ad_enabled(spte))
960 return spte & ~shadow_accessed_mask;
961
962 if (is_access_track_spte(spte))
963 return spte;
964
965
966
967
968
969
970 WARN_ONCE((spte & PT_WRITABLE_MASK) &&
971 !spte_can_locklessly_be_made_writable(spte),
972 "kvm: Writable SPTE is not locklessly dirty-trackable\n");
973
974 WARN_ONCE(spte & (shadow_acc_track_saved_bits_mask <<
975 shadow_acc_track_saved_bits_shift),
976 "kvm: Access Tracking saved bit locations are not zero\n");
977
978 spte |= (spte & shadow_acc_track_saved_bits_mask) <<
979 shadow_acc_track_saved_bits_shift;
980 spte &= ~shadow_acc_track_mask;
981
982 return spte;
983}
984
985
986static u64 restore_acc_track_spte(u64 spte)
987{
988 u64 new_spte = spte;
989 u64 saved_bits = (spte >> shadow_acc_track_saved_bits_shift)
990 & shadow_acc_track_saved_bits_mask;
991
992 WARN_ON_ONCE(spte_ad_enabled(spte));
993 WARN_ON_ONCE(!is_access_track_spte(spte));
994
995 new_spte &= ~shadow_acc_track_mask;
996 new_spte &= ~(shadow_acc_track_saved_bits_mask <<
997 shadow_acc_track_saved_bits_shift);
998 new_spte |= saved_bits;
999
1000 return new_spte;
1001}
1002
1003
1004static bool mmu_spte_age(u64 *sptep)
1005{
1006 u64 spte = mmu_spte_get_lockless(sptep);
1007
1008 if (!is_accessed_spte(spte))
1009 return false;
1010
1011 if (spte_ad_enabled(spte)) {
1012 clear_bit((ffs(shadow_accessed_mask) - 1),
1013 (unsigned long *)sptep);
1014 } else {
1015
1016
1017
1018
1019 if (is_writable_pte(spte))
1020 kvm_set_pfn_dirty(spte_to_pfn(spte));
1021
1022 spte = mark_spte_for_access_track(spte);
1023 mmu_spte_update_no_track(sptep, spte);
1024 }
1025
1026 return true;
1027}
1028
1029static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
1030{
1031
1032
1033
1034
1035 local_irq_disable();
1036
1037
1038
1039
1040
1041 smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
1042}
1043
1044static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
1045{
1046
1047
1048
1049
1050
1051 smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
1052 local_irq_enable();
1053}
1054
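/*
 * Pre-fill the per-vCPU object caches.  The page fault paths consume these
 * objects while holding mmu_lock, where sleeping allocations are not
 * allowed, so the caches are topped up before the lock is taken.
 */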
1055static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
1056 struct kmem_cache *base_cache, int min)
1057{
1058 void *obj;
1059
1060 if (cache->nobjs >= min)
1061 return 0;
1062 while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
1063 obj = kmem_cache_zalloc(base_cache, GFP_KERNEL_ACCOUNT);
1064 if (!obj)
1065 return cache->nobjs >= min ? 0 : -ENOMEM;
1066 cache->objects[cache->nobjs++] = obj;
1067 }
1068 return 0;
1069}
1070
1071static int mmu_memory_cache_free_objects(struct kvm_mmu_memory_cache *cache)
1072{
1073 return cache->nobjs;
1074}
1075
1076static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
1077 struct kmem_cache *cache)
1078{
1079 while (mc->nobjs)
1080 kmem_cache_free(cache, mc->objects[--mc->nobjs]);
1081}
1082
1083static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
1084 int min)
1085{
1086 void *page;
1087
1088 if (cache->nobjs >= min)
1089 return 0;
1090 while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
1091 page = (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
1092 if (!page)
1093 return cache->nobjs >= min ? 0 : -ENOMEM;
1094 cache->objects[cache->nobjs++] = page;
1095 }
1096 return 0;
1097}
1098
1099static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
1100{
1101 while (mc->nobjs)
1102 free_page((unsigned long)mc->objects[--mc->nobjs]);
1103}
1104
1105static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
1106{
1107 int r;
1108
1109 r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
1110 pte_list_desc_cache, 8 + PTE_PREFETCH_NUM);
1111 if (r)
1112 goto out;
1113 r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
1114 if (r)
1115 goto out;
1116 r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
1117 mmu_page_header_cache, 4);
1118out:
1119 return r;
1120}
1121
1122static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
1123{
1124 mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
1125 pte_list_desc_cache);
1126 mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
1127 mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
1128 mmu_page_header_cache);
1129}
1130
1131static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
1132{
1133 void *p;
1134
1135 BUG_ON(!mc->nobjs);
1136 p = mc->objects[--mc->nobjs];
1137 return p;
1138}
1139
1140static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)
1141{
1142 return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache);
1143}
1144
1145static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
1146{
1147 kmem_cache_free(pte_list_desc_cache, pte_list_desc);
1148}
1149
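/*
 * For direct shadow pages the gfn is computed from sp->gfn and the index;
 * indirect pages record the gfn of every mapping in sp->gfns[].
 */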
1150static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
1151{
1152 if (!sp->role.direct)
1153 return sp->gfns[index];
1154
1155 return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
1156}
1157
1158static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
1159{
1160 if (!sp->role.direct) {
1161 sp->gfns[index] = gfn;
1162 return;
1163 }
1164
1165 if (WARN_ON(gfn != kvm_mmu_page_get_gfn(sp, index)))
1166 pr_err_ratelimited("gfn mismatch under direct page %llx "
1167 "(expected %llx, got %llx)\n",
1168 sp->gfn,
1169 kvm_mmu_page_get_gfn(sp, index), gfn);
1170}
1171
/*
 * Return the pointer to the large page information for a given gfn,
 * handling slots that are not large page aligned.
 */
1176static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
1177 struct kvm_memory_slot *slot,
1178 int level)
1179{
1180 unsigned long idx;
1181
1182 idx = gfn_to_index(gfn, slot->base_gfn, level);
1183 return &slot->arch.lpage_info[level - 2][idx];
1184}
1185
1186static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot,
1187 gfn_t gfn, int count)
1188{
1189 struct kvm_lpage_info *linfo;
1190 int i;
1191
1192 for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
1193 linfo = lpage_info_slot(gfn, slot, i);
1194 linfo->disallow_lpage += count;
1195 WARN_ON(linfo->disallow_lpage < 0);
1196 }
1197}
1198
1199void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
1200{
1201 update_gfn_disallow_lpage_count(slot, gfn, 1);
1202}
1203
1204void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
1205{
1206 update_gfn_disallow_lpage_count(slot, gfn, -1);
1207}
1208
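/*
 * Account a new indirect shadow page.  Non-leaf guest page tables are kept
 * read-only via write tracking; in either case the tracked gfn can no longer
 * be mapped by a huge page.
 */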
1209static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
1210{
1211 struct kvm_memslots *slots;
1212 struct kvm_memory_slot *slot;
1213 gfn_t gfn;
1214
1215 kvm->arch.indirect_shadow_pages++;
1216 gfn = sp->gfn;
1217 slots = kvm_memslots_for_spte_role(kvm, sp->role);
1218 slot = __gfn_to_memslot(slots, gfn);
1219
1220
1221 if (sp->role.level > PT_PAGE_TABLE_LEVEL)
1222 return kvm_slot_page_track_add_page(kvm, slot, gfn,
1223 KVM_PAGE_TRACK_WRITE);
1224
1225 kvm_mmu_gfn_disallow_lpage(slot, gfn);
1226}
1227
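/*
 * Track shadow pages created because the NX huge page mitigation forced a
 * huge mapping to be split, so the recovery worker can zap them later and
 * allow the huge mapping to be reinstated.
 */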
1228static void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1229{
1230 if (sp->lpage_disallowed)
1231 return;
1232
1233 ++kvm->stat.nx_lpage_splits;
1234 list_add_tail(&sp->lpage_disallowed_link,
1235 &kvm->arch.lpage_disallowed_mmu_pages);
1236 sp->lpage_disallowed = true;
1237}
1238
1239static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
1240{
1241 struct kvm_memslots *slots;
1242 struct kvm_memory_slot *slot;
1243 gfn_t gfn;
1244
1245 kvm->arch.indirect_shadow_pages--;
1246 gfn = sp->gfn;
1247 slots = kvm_memslots_for_spte_role(kvm, sp->role);
1248 slot = __gfn_to_memslot(slots, gfn);
1249 if (sp->role.level > PT_PAGE_TABLE_LEVEL)
1250 return kvm_slot_page_track_remove_page(kvm, slot, gfn,
1251 KVM_PAGE_TRACK_WRITE);
1252
1253 kvm_mmu_gfn_allow_lpage(slot, gfn);
1254}
1255
1256static void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1257{
1258 --kvm->stat.nx_lpage_splits;
1259 sp->lpage_disallowed = false;
1260 list_del(&sp->lpage_disallowed_link);
1261}
1262
1263static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level,
1264 struct kvm_memory_slot *slot)
1265{
1266 struct kvm_lpage_info *linfo;
1267
1268 if (slot) {
1269 linfo = lpage_info_slot(gfn, slot, level);
1270 return !!linfo->disallow_lpage;
1271 }
1272
1273 return true;
1274}
1275
1276static bool mmu_gfn_lpage_is_disallowed(struct kvm_vcpu *vcpu, gfn_t gfn,
1277 int level)
1278{
1279 struct kvm_memory_slot *slot;
1280
1281 slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
1282 return __mmu_gfn_lpage_is_disallowed(gfn, level, slot);
1283}
1284
1285static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
1286{
1287 unsigned long page_size;
1288 int i, ret = 0;
1289
1290 page_size = kvm_host_page_size(kvm, gfn);
1291
1292 for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
1293 if (page_size >= KVM_HPAGE_SIZE(i))
1294 ret = i;
1295 else
1296 break;
1297 }
1298
1299 return ret;
1300}
1301
1302static inline bool memslot_valid_for_gpte(struct kvm_memory_slot *slot,
1303 bool no_dirty_log)
1304{
1305 if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
1306 return false;
1307 if (no_dirty_log && slot->dirty_bitmap)
1308 return false;
1309
1310 return true;
1311}
1312
1313static struct kvm_memory_slot *
1314gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
1315 bool no_dirty_log)
1316{
1317 struct kvm_memory_slot *slot;
1318
1319 slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
1320 if (!memslot_valid_for_gpte(slot, no_dirty_log))
1321 slot = NULL;
1322
1323 return slot;
1324}
1325
1326static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn,
1327 bool *force_pt_level)
1328{
1329 int host_level, level, max_level;
1330 struct kvm_memory_slot *slot;
1331
1332 if (unlikely(*force_pt_level))
1333 return PT_PAGE_TABLE_LEVEL;
1334
1335 slot = kvm_vcpu_gfn_to_memslot(vcpu, large_gfn);
1336 *force_pt_level = !memslot_valid_for_gpte(slot, true);
1337 if (unlikely(*force_pt_level))
1338 return PT_PAGE_TABLE_LEVEL;
1339
1340 host_level = host_mapping_level(vcpu->kvm, large_gfn);
1341
1342 if (host_level == PT_PAGE_TABLE_LEVEL)
1343 return host_level;
1344
1345 max_level = min(kvm_x86_ops->get_lpage_level(), host_level);
1346
1347 for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
1348 if (__mmu_gfn_lpage_is_disallowed(large_gfn, level, slot))
1349 break;
1350
1351 return level - 1;
1352}
1353
/*
 * About rmap_head encoding:
 *
 * If bit zero of rmap_head->val is clear, then it points to the only spte
 * in this rmap chain.  Otherwise, (rmap_head->val & ~1) points to a struct
 * pte_list_desc containing more mappings.
 */

/*
 * Returns the number of pointers in the rmap chain, not counting the new one.
 */
1365static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
1366 struct kvm_rmap_head *rmap_head)
1367{
1368 struct pte_list_desc *desc;
1369 int i, count = 0;
1370
1371 if (!rmap_head->val) {
1372 rmap_printk("pte_list_add: %p %llx 0->1\n", spte, *spte);
1373 rmap_head->val = (unsigned long)spte;
1374 } else if (!(rmap_head->val & 1)) {
1375 rmap_printk("pte_list_add: %p %llx 1->many\n", spte, *spte);
1376 desc = mmu_alloc_pte_list_desc(vcpu);
1377 desc->sptes[0] = (u64 *)rmap_head->val;
1378 desc->sptes[1] = spte;
1379 rmap_head->val = (unsigned long)desc | 1;
1380 ++count;
1381 } else {
1382 rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte);
1383 desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
1384 while (desc->sptes[PTE_LIST_EXT-1] && desc->more) {
1385 desc = desc->more;
1386 count += PTE_LIST_EXT;
1387 }
1388 if (desc->sptes[PTE_LIST_EXT-1]) {
1389 desc->more = mmu_alloc_pte_list_desc(vcpu);
1390 desc = desc->more;
1391 }
1392 for (i = 0; desc->sptes[i]; ++i)
1393 ++count;
1394 desc->sptes[i] = spte;
1395 }
1396 return count;
1397}
1398
1399static void
1400pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
1401 struct pte_list_desc *desc, int i,
1402 struct pte_list_desc *prev_desc)
1403{
1404 int j;
1405
1406 for (j = PTE_LIST_EXT - 1; !desc->sptes[j] && j > i; --j)
1407 ;
1408 desc->sptes[i] = desc->sptes[j];
1409 desc->sptes[j] = NULL;
1410 if (j != 0)
1411 return;
1412 if (!prev_desc && !desc->more)
1413 rmap_head->val = (unsigned long)desc->sptes[0];
1414 else
1415 if (prev_desc)
1416 prev_desc->more = desc->more;
1417 else
1418 rmap_head->val = (unsigned long)desc->more | 1;
1419 mmu_free_pte_list_desc(desc);
1420}
1421
1422static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
1423{
1424 struct pte_list_desc *desc;
1425 struct pte_list_desc *prev_desc;
1426 int i;
1427
1428 if (!rmap_head->val) {
1429 pr_err("%s: %p 0->BUG\n", __func__, spte);
1430 BUG();
1431 } else if (!(rmap_head->val & 1)) {
1432 rmap_printk("%s: %p 1->0\n", __func__, spte);
1433 if ((u64 *)rmap_head->val != spte) {
1434 pr_err("%s: %p 1->BUG\n", __func__, spte);
1435 BUG();
1436 }
1437 rmap_head->val = 0;
1438 } else {
1439 rmap_printk("%s: %p many->many\n", __func__, spte);
1440 desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
1441 prev_desc = NULL;
1442 while (desc) {
1443 for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) {
1444 if (desc->sptes[i] == spte) {
1445 pte_list_desc_remove_entry(rmap_head,
1446 desc, i, prev_desc);
1447 return;
1448 }
1449 }
1450 prev_desc = desc;
1451 desc = desc->more;
1452 }
1453 pr_err("%s: %p many->many\n", __func__, spte);
1454 BUG();
1455 }
1456}
1457
1458static void pte_list_remove(struct kvm_rmap_head *rmap_head, u64 *sptep)
1459{
1460 mmu_spte_clear_track_bits(sptep);
1461 __pte_list_remove(sptep, rmap_head);
1462}
1463
1464static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level,
1465 struct kvm_memory_slot *slot)
1466{
1467 unsigned long idx;
1468
1469 idx = gfn_to_index(gfn, slot->base_gfn, level);
1470 return &slot->arch.rmap[level - PT_PAGE_TABLE_LEVEL][idx];
1471}
1472
1473static struct kvm_rmap_head *gfn_to_rmap(struct kvm *kvm, gfn_t gfn,
1474 struct kvm_mmu_page *sp)
1475{
1476 struct kvm_memslots *slots;
1477 struct kvm_memory_slot *slot;
1478
1479 slots = kvm_memslots_for_spte_role(kvm, sp->role);
1480 slot = __gfn_to_memslot(slots, gfn);
1481 return __gfn_to_rmap(gfn, sp->role.level, slot);
1482}
1483
1484static bool rmap_can_add(struct kvm_vcpu *vcpu)
1485{
1486 struct kvm_mmu_memory_cache *cache;
1487
1488 cache = &vcpu->arch.mmu_pte_list_desc_cache;
1489 return mmu_memory_cache_free_objects(cache);
1490}
1491
1492static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
1493{
1494 struct kvm_mmu_page *sp;
1495 struct kvm_rmap_head *rmap_head;
1496
1497 sp = page_header(__pa(spte));
1498 kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
1499 rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
1500 return pte_list_add(vcpu, spte, rmap_head);
1501}
1502
1503static void rmap_remove(struct kvm *kvm, u64 *spte)
1504{
1505 struct kvm_mmu_page *sp;
1506 gfn_t gfn;
1507 struct kvm_rmap_head *rmap_head;
1508
1509 sp = page_header(__pa(spte));
1510 gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
1511 rmap_head = gfn_to_rmap(kvm, gfn, sp);
1512 __pte_list_remove(spte, rmap_head);
1513}
1514
/*
 * Used by the following functions to iterate through the sptes linked by a
 * rmap.  All fields are private and not assumed to be used outside.
 */
1519struct rmap_iterator {
1520
1521 struct pte_list_desc *desc;
1522 int pos;
1523};
1524
/*
 * Iteration must be started by this function.  This should also be used
 * after removing/dropping sptes from the rmap link because in such cases the
 * information in the iterator may not be valid.
 *
 * Returns sptep if found, NULL otherwise.
 */
1532static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
1533 struct rmap_iterator *iter)
1534{
1535 u64 *sptep;
1536
1537 if (!rmap_head->val)
1538 return NULL;
1539
1540 if (!(rmap_head->val & 1)) {
1541 iter->desc = NULL;
1542 sptep = (u64 *)rmap_head->val;
1543 goto out;
1544 }
1545
1546 iter->desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
1547 iter->pos = 0;
1548 sptep = iter->desc->sptes[iter->pos];
1549out:
1550 BUG_ON(!is_shadow_present_pte(*sptep));
1551 return sptep;
1552}
1553
1554
1555
1556
1557
1558
1559static u64 *rmap_get_next(struct rmap_iterator *iter)
1560{
1561 u64 *sptep;
1562
1563 if (iter->desc) {
1564 if (iter->pos < PTE_LIST_EXT - 1) {
1565 ++iter->pos;
1566 sptep = iter->desc->sptes[iter->pos];
1567 if (sptep)
1568 goto out;
1569 }
1570
1571 iter->desc = iter->desc->more;
1572
1573 if (iter->desc) {
1574 iter->pos = 0;
1575
1576 sptep = iter->desc->sptes[iter->pos];
1577 goto out;
1578 }
1579 }
1580
1581 return NULL;
1582out:
1583 BUG_ON(!is_shadow_present_pte(*sptep));
1584 return sptep;
1585}
1586
1587#define for_each_rmap_spte(_rmap_head_, _iter_, _spte_) \
1588 for (_spte_ = rmap_get_first(_rmap_head_, _iter_); \
1589 _spte_; _spte_ = rmap_get_next(_iter_))
1590
1591static void drop_spte(struct kvm *kvm, u64 *sptep)
1592{
1593 if (mmu_spte_clear_track_bits(sptep))
1594 rmap_remove(kvm, sptep);
1595}
1596
1597
1598static bool __drop_large_spte(struct kvm *kvm, u64 *sptep)
1599{
1600 if (is_large_pte(*sptep)) {
1601 WARN_ON(page_header(__pa(sptep))->role.level ==
1602 PT_PAGE_TABLE_LEVEL);
1603 drop_spte(kvm, sptep);
1604 --kvm->stat.lpages;
1605 return true;
1606 }
1607
1608 return false;
1609}
1610
1611static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
1612{
1613 if (__drop_large_spte(vcpu->kvm, sptep)) {
1614 struct kvm_mmu_page *sp = page_header(__pa(sptep));
1615
1616 kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
1617 KVM_PAGES_PER_HPAGE(sp->role.level));
1618 }
1619}
1620
/*
 * Write-protect the specified @sptep; @pt_protect indicates whether the
 * write protection is caused by protecting a shadow page table.
 *
 * Note: write protection differs between dirty logging and spte protection:
 * - for dirty logging, the spte can be made writable again at any time if
 *   its dirty bitmap is properly set.
 * - for spte protection, the spte can become writable only after the
 *   corresponding shadow page has been unsynced.
 *
 * Return true if the TLB needs to be flushed.
 */
1634static bool spte_write_protect(u64 *sptep, bool pt_protect)
1635{
1636 u64 spte = *sptep;
1637
1638 if (!is_writable_pte(spte) &&
1639 !(pt_protect && spte_can_locklessly_be_made_writable(spte)))
1640 return false;
1641
1642 rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
1643
1644 if (pt_protect)
1645 spte &= ~SPTE_MMU_WRITEABLE;
1646 spte = spte & ~PT_WRITABLE_MASK;
1647
1648 return mmu_spte_update(sptep, spte);
1649}
1650
1651static bool __rmap_write_protect(struct kvm *kvm,
1652 struct kvm_rmap_head *rmap_head,
1653 bool pt_protect)
1654{
1655 u64 *sptep;
1656 struct rmap_iterator iter;
1657 bool flush = false;
1658
1659 for_each_rmap_spte(rmap_head, &iter, sptep)
1660 flush |= spte_write_protect(sptep, pt_protect);
1661
1662 return flush;
1663}
1664
1665static bool spte_clear_dirty(u64 *sptep)
1666{
1667 u64 spte = *sptep;
1668
1669 rmap_printk("rmap_clear_dirty: spte %p %llx\n", sptep, *sptep);
1670
1671 MMU_WARN_ON(!spte_ad_enabled(spte));
1672 spte &= ~shadow_dirty_mask;
1673 return mmu_spte_update(sptep, spte);
1674}
1675
1676static bool spte_wrprot_for_clear_dirty(u64 *sptep)
1677{
1678 bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT,
1679 (unsigned long *)sptep);
1680 if (was_writable && !spte_ad_enabled(*sptep))
1681 kvm_set_pfn_dirty(spte_to_pfn(*sptep));
1682
1683 return was_writable;
1684}
1685
1686
1687
1688
1689
1690
1691
1692static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
1693{
1694 u64 *sptep;
1695 struct rmap_iterator iter;
1696 bool flush = false;
1697
1698 for_each_rmap_spte(rmap_head, &iter, sptep)
1699 if (spte_ad_need_write_protect(*sptep))
1700 flush |= spte_wrprot_for_clear_dirty(sptep);
1701 else
1702 flush |= spte_clear_dirty(sptep);
1703
1704 return flush;
1705}
1706
1707static bool spte_set_dirty(u64 *sptep)
1708{
1709 u64 spte = *sptep;
1710
1711 rmap_printk("rmap_set_dirty: spte %p %llx\n", sptep, *sptep);
1712
1713
1714
1715
1716
1717
1718 spte |= shadow_dirty_mask;
1719
1720 return mmu_spte_update(sptep, spte);
1721}
1722
1723static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
1724{
1725 u64 *sptep;
1726 struct rmap_iterator iter;
1727 bool flush = false;
1728
1729 for_each_rmap_spte(rmap_head, &iter, sptep)
1730 if (spte_ad_enabled(*sptep))
1731 flush |= spte_set_dirty(sptep);
1732
1733 return flush;
1734}
1735
/**
 * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
 * @kvm: kvm instance
 * @slot: slot to protect
 * @gfn_offset: start of the BITS_PER_LONG pages we care about
 * @mask: indicates which pages we should protect
 *
 * Used when we do not need to care about huge page mappings: e.g. during
 * dirty logging we do not have any such mappings.
 */
1746static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
1747 struct kvm_memory_slot *slot,
1748 gfn_t gfn_offset, unsigned long mask)
1749{
1750 struct kvm_rmap_head *rmap_head;
1751
1752 while (mask) {
1753 rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
1754 PT_PAGE_TABLE_LEVEL, slot);
1755 __rmap_write_protect(kvm, rmap_head, false);
1756
1757
1758 mask &= mask - 1;
1759 }
1760}
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1773 struct kvm_memory_slot *slot,
1774 gfn_t gfn_offset, unsigned long mask)
1775{
1776 struct kvm_rmap_head *rmap_head;
1777
1778 while (mask) {
1779 rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
1780 PT_PAGE_TABLE_LEVEL, slot);
1781 __rmap_clear_dirty(kvm, rmap_head);
1782
1783
1784 mask &= mask - 1;
1785 }
1786}
1787EXPORT_SYMBOL_GPL(kvm_mmu_clear_dirty_pt_masked);
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
1800 struct kvm_memory_slot *slot,
1801 gfn_t gfn_offset, unsigned long mask)
1802{
1803 if (kvm_x86_ops->enable_log_dirty_pt_masked)
1804 kvm_x86_ops->enable_log_dirty_pt_masked(kvm, slot, gfn_offset,
1805 mask);
1806 else
1807 kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
1808}
1809
1810
1811
1812
1813
1814
1815
1816
1817int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu)
1818{
1819 if (kvm_x86_ops->write_log_dirty)
1820 return kvm_x86_ops->write_log_dirty(vcpu);
1821
1822 return 0;
1823}
1824
1825bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
1826 struct kvm_memory_slot *slot, u64 gfn)
1827{
1828 struct kvm_rmap_head *rmap_head;
1829 int i;
1830 bool write_protected = false;
1831
1832 for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
1833 rmap_head = __gfn_to_rmap(gfn, i, slot);
1834 write_protected |= __rmap_write_protect(kvm, rmap_head, true);
1835 }
1836
1837 return write_protected;
1838}
1839
1840static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
1841{
1842 struct kvm_memory_slot *slot;
1843
1844 slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
1845 return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn);
1846}
1847
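/* Zap every SPTE in the rmap chain; returns true if any were removed. */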
1848static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
1849{
1850 u64 *sptep;
1851 struct rmap_iterator iter;
1852 bool flush = false;
1853
1854 while ((sptep = rmap_get_first(rmap_head, &iter))) {
1855 rmap_printk("%s: spte %p %llx.\n", __func__, sptep, *sptep);
1856
1857 pte_list_remove(rmap_head, sptep);
1858 flush = true;
1859 }
1860
1861 return flush;
1862}
1863
1864static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1865 struct kvm_memory_slot *slot, gfn_t gfn, int level,
1866 unsigned long data)
1867{
1868 return kvm_zap_rmapp(kvm, rmap_head);
1869}
1870
1871static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1872 struct kvm_memory_slot *slot, gfn_t gfn, int level,
1873 unsigned long data)
1874{
1875 u64 *sptep;
1876 struct rmap_iterator iter;
1877 int need_flush = 0;
1878 u64 new_spte;
1879 pte_t *ptep = (pte_t *)data;
1880 kvm_pfn_t new_pfn;
1881
1882 WARN_ON(pte_huge(*ptep));
1883 new_pfn = pte_pfn(*ptep);
1884
1885restart:
1886 for_each_rmap_spte(rmap_head, &iter, sptep) {
1887 rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n",
1888 sptep, *sptep, gfn, level);
1889
1890 need_flush = 1;
1891
1892 if (pte_write(*ptep)) {
1893 pte_list_remove(rmap_head, sptep);
1894 goto restart;
1895 } else {
1896 new_spte = *sptep & ~PT64_BASE_ADDR_MASK;
1897 new_spte |= (u64)new_pfn << PAGE_SHIFT;
1898
1899 new_spte &= ~PT_WRITABLE_MASK;
1900 new_spte &= ~SPTE_HOST_WRITEABLE;
1901
1902 new_spte = mark_spte_for_access_track(new_spte);
1903
1904 mmu_spte_clear_track_bits(sptep);
1905 mmu_spte_set(sptep, new_spte);
1906 }
1907 }
1908
1909 if (need_flush && kvm_available_flush_tlb_with_range()) {
1910 kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
1911 return 0;
1912 }
1913
1914 return need_flush;
1915}
1916
1917struct slot_rmap_walk_iterator {
1918
1919 struct kvm_memory_slot *slot;
1920 gfn_t start_gfn;
1921 gfn_t end_gfn;
1922 int start_level;
1923 int end_level;
1924
1925
1926 gfn_t gfn;
1927 struct kvm_rmap_head *rmap;
1928 int level;
1929
1930
1931 struct kvm_rmap_head *end_rmap;
1932};
1933
1934static void
1935rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level)
1936{
1937 iterator->level = level;
1938 iterator->gfn = iterator->start_gfn;
1939 iterator->rmap = __gfn_to_rmap(iterator->gfn, level, iterator->slot);
1940 iterator->end_rmap = __gfn_to_rmap(iterator->end_gfn, level,
1941 iterator->slot);
1942}
1943
1944static void
1945slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator,
1946 struct kvm_memory_slot *slot, int start_level,
1947 int end_level, gfn_t start_gfn, gfn_t end_gfn)
1948{
1949 iterator->slot = slot;
1950 iterator->start_level = start_level;
1951 iterator->end_level = end_level;
1952 iterator->start_gfn = start_gfn;
1953 iterator->end_gfn = end_gfn;
1954
1955 rmap_walk_init_level(iterator, iterator->start_level);
1956}
1957
1958static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator)
1959{
1960 return !!iterator->rmap;
1961}
1962
1963static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
1964{
1965 if (++iterator->rmap <= iterator->end_rmap) {
1966 iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level));
1967 return;
1968 }
1969
1970 if (++iterator->level > iterator->end_level) {
1971 iterator->rmap = NULL;
1972 return;
1973 }
1974
1975 rmap_walk_init_level(iterator, iterator->level);
1976}
1977
1978#define for_each_slot_rmap_range(_slot_, _start_level_, _end_level_, \
1979 _start_gfn, _end_gfn, _iter_) \
1980 for (slot_rmap_walk_init(_iter_, _slot_, _start_level_, \
1981 _end_level_, _start_gfn, _end_gfn); \
1982 slot_rmap_walk_okay(_iter_); \
1983 slot_rmap_walk_next(_iter_))
1984
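/*
 * Invoke @handler on every rmap chain, at every mapping level, of each
 * memslot that overlaps the host virtual address range [start, end).
 */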
1985static int kvm_handle_hva_range(struct kvm *kvm,
1986 unsigned long start,
1987 unsigned long end,
1988 unsigned long data,
1989 int (*handler)(struct kvm *kvm,
1990 struct kvm_rmap_head *rmap_head,
1991 struct kvm_memory_slot *slot,
1992 gfn_t gfn,
1993 int level,
1994 unsigned long data))
1995{
1996 struct kvm_memslots *slots;
1997 struct kvm_memory_slot *memslot;
1998 struct slot_rmap_walk_iterator iterator;
1999 int ret = 0;
2000 int i;
2001
2002 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
2003 slots = __kvm_memslots(kvm, i);
2004 kvm_for_each_memslot(memslot, slots) {
2005 unsigned long hva_start, hva_end;
2006 gfn_t gfn_start, gfn_end;
2007
2008 hva_start = max(start, memslot->userspace_addr);
2009 hva_end = min(end, memslot->userspace_addr +
2010 (memslot->npages << PAGE_SHIFT));
2011 if (hva_start >= hva_end)
2012 continue;
2013
2014
2015
2016
2017 gfn_start = hva_to_gfn_memslot(hva_start, memslot);
2018 gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
2019
2020 for_each_slot_rmap_range(memslot, PT_PAGE_TABLE_LEVEL,
2021 PT_MAX_HUGEPAGE_LEVEL,
2022 gfn_start, gfn_end - 1,
2023 &iterator)
2024 ret |= handler(kvm, iterator.rmap, memslot,
2025 iterator.gfn, iterator.level, data);
2026 }
2027 }
2028
2029 return ret;
2030}
2031
2032static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
2033 unsigned long data,
2034 int (*handler)(struct kvm *kvm,
2035 struct kvm_rmap_head *rmap_head,
2036 struct kvm_memory_slot *slot,
2037 gfn_t gfn, int level,
2038 unsigned long data))
2039{
2040 return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler);
2041}
2042
2043int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
2044{
2045 return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp);
2046}
2047
2048int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
2049{
2050 return kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
2051}
2052
2053static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
2054 struct kvm_memory_slot *slot, gfn_t gfn, int level,
2055 unsigned long data)
2056{
2057 u64 *sptep;
2058 struct rmap_iterator uninitialized_var(iter);
2059 int young = 0;
2060
2061 for_each_rmap_spte(rmap_head, &iter, sptep)
2062 young |= mmu_spte_age(sptep);
2063
2064 trace_kvm_age_page(gfn, level, slot, young);
2065 return young;
2066}
2067
2068static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
2069 struct kvm_memory_slot *slot, gfn_t gfn,
2070 int level, unsigned long data)
2071{
2072 u64 *sptep;
2073 struct rmap_iterator iter;
2074
2075 for_each_rmap_spte(rmap_head, &iter, sptep)
2076 if (is_accessed_spte(*sptep))
2077 return 1;
2078 return 0;
2079}
2080
2081#define RMAP_RECYCLE_THRESHOLD 1000
2082
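/*
 * Drop every SPTE mapping @gfn and flush; used when a single gfn accumulates
 * an excessive number of rmap entries (see RMAP_RECYCLE_THRESHOLD), letting
 * the mappings be rebuilt on demand.
 */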
2083static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
2084{
2085 struct kvm_rmap_head *rmap_head;
2086 struct kvm_mmu_page *sp;
2087
2088 sp = page_header(__pa(spte));
2089
2090 rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
2091
2092 kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, 0);
2093 kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
2094 KVM_PAGES_PER_HPAGE(sp->role.level));
2095}
2096
2097int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
2098{
2099 return kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp);
2100}
2101
2102int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
2103{
2104 return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
2105}
2106
2107#ifdef MMU_DEBUG
2108static int is_empty_shadow_page(u64 *spt)
2109{
2110 u64 *pos;
2111 u64 *end;
2112
2113 for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
2114 if (is_shadow_present_pte(*pos)) {
2115 printk(KERN_ERR "%s: %p %llx\n", __func__,
2116 pos, *pos);
2117 return 0;
2118 }
2119 return 1;
2120}
2121#endif
2122
2123
2124
2125
2126
2127
2128
2129static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, unsigned long nr)
2130{
2131 kvm->arch.n_used_mmu_pages += nr;
2132 percpu_counter_add(&kvm_total_used_mmu_pages, nr);
2133}
2134
2135static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
2136{
2137 MMU_WARN_ON(!is_empty_shadow_page(sp->spt));
2138 hlist_del(&sp->hash_link);
2139 list_del(&sp->link);
2140 free_page((unsigned long)sp->spt);
2141 if (!sp->role.direct)
2142 free_page((unsigned long)sp->gfns);
2143 kmem_cache_free(mmu_page_header_cache, sp);
2144}
2145
2146static unsigned kvm_page_table_hashfn(gfn_t gfn)
2147{
2148 return hash_64(gfn, KVM_MMU_HASH_SHIFT);
2149}
2150
2151static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
2152 struct kvm_mmu_page *sp, u64 *parent_pte)
2153{
2154 if (!parent_pte)
2155 return;
2156
2157 pte_list_add(vcpu, parent_pte, &sp->parent_ptes);
2158}
2159
2160static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
2161 u64 *parent_pte)
2162{
2163 __pte_list_remove(parent_pte, &sp->parent_ptes);
2164}
2165
2166static void drop_parent_pte(struct kvm_mmu_page *sp,
2167 u64 *parent_pte)
2168{
2169 mmu_page_remove_parent_pte(sp, parent_pte);
2170 mmu_spte_clear_no_track(parent_pte);
2171}
2172
2173static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct)
2174{
2175 struct kvm_mmu_page *sp;
2176
2177 sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
2178 sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
2179 if (!direct)
2180 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
2181 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
2182
2183
2184
2185
2186
2187
2188 sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
2189 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
2190 kvm_mod_used_mmu_pages(vcpu->kvm, +1);
2191 return sp;
2192}
2193
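/*
 * Propagate the "unsync children" state up the shadow page hierarchy: each
 * parent records which of its entries lead to unsync pages, so that a later
 * sync only needs to walk the affected subtrees.
 */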
2194static void mark_unsync(u64 *spte);
2195static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
2196{
2197 u64 *sptep;
2198 struct rmap_iterator iter;
2199
2200 for_each_rmap_spte(&sp->parent_ptes, &iter, sptep) {
2201 mark_unsync(sptep);
2202 }
2203}
2204
2205static void mark_unsync(u64 *spte)
2206{
2207 struct kvm_mmu_page *sp;
2208 unsigned int index;
2209
2210 sp = page_header(__pa(spte));
2211 index = spte - sp->spt;
2212 if (__test_and_set_bit(index, sp->unsync_child_bitmap))
2213 return;
2214 if (sp->unsync_children++)
2215 return;
2216 kvm_mmu_mark_parents_unsync(sp);
2217}
2218
2219static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
2220 struct kvm_mmu_page *sp)
2221{
2222 return 0;
2223}
2224
2225static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root)
2226{
2227}
2228
2229static void nonpaging_update_pte(struct kvm_vcpu *vcpu,
2230 struct kvm_mmu_page *sp, u64 *spte,
2231 const void *pte)
2232{
2233 WARN_ON(1);
2234}
2235
2236#define KVM_PAGE_ARRAY_NR 16
2237
2238struct kvm_mmu_pages {
2239 struct mmu_page_and_offset {
2240 struct kvm_mmu_page *sp;
2241 unsigned int idx;
2242 } page[KVM_PAGE_ARRAY_NR];
2243 unsigned int nr;
2244};
2245
2246static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
2247 int idx)
2248{
2249 int i;
2250
2251 if (sp->unsync)
2252 for (i=0; i < pvec->nr; i++)
2253 if (pvec->page[i].sp == sp)
2254 return 0;
2255
2256 pvec->page[pvec->nr].sp = sp;
2257 pvec->page[pvec->nr].idx = idx;
2258 pvec->nr++;
2259 return (pvec->nr == KVM_PAGE_ARRAY_NR);
2260}
2261
2262static inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx)
2263{
2264 --sp->unsync_children;
2265 WARN_ON((int)sp->unsync_children < 0);
2266 __clear_bit(idx, sp->unsync_child_bitmap);
2267}
2268
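/*
 * Collect the unsync leaf pages reachable from @sp into @pvec by following
 * the unsync_child_bitmap at each level.  Returns the number of unsync
 * leaves found, or -ENOSPC if @pvec is full.
 */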
2269static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
2270 struct kvm_mmu_pages *pvec)
2271{
2272 int i, ret, nr_unsync_leaf = 0;
2273
2274 for_each_set_bit(i, sp->unsync_child_bitmap, 512) {
2275 struct kvm_mmu_page *child;
2276 u64 ent = sp->spt[i];
2277
2278 if (!is_shadow_present_pte(ent) || is_large_pte(ent)) {
2279 clear_unsync_child_bit(sp, i);
2280 continue;
2281 }
2282
2283 child = page_header(ent & PT64_BASE_ADDR_MASK);
2284
2285 if (child->unsync_children) {
2286 if (mmu_pages_add(pvec, child, i))
2287 return -ENOSPC;
2288
2289 ret = __mmu_unsync_walk(child, pvec);
2290 if (!ret) {
2291 clear_unsync_child_bit(sp, i);
2292 continue;
2293 } else if (ret > 0) {
2294 nr_unsync_leaf += ret;
2295 } else
2296 return ret;
2297 } else if (child->unsync) {
2298 nr_unsync_leaf++;
2299 if (mmu_pages_add(pvec, child, i))
2300 return -ENOSPC;
2301 } else
2302 clear_unsync_child_bit(sp, i);
2303 }
2304
2305 return nr_unsync_leaf;
2306}
2307
2308#define INVALID_INDEX (-1)
2309
2310static int mmu_unsync_walk(struct kvm_mmu_page *sp,
2311 struct kvm_mmu_pages *pvec)
2312{
2313 pvec->nr = 0;
2314 if (!sp->unsync_children)
2315 return 0;
2316
2317 mmu_pages_add(pvec, sp, INVALID_INDEX);
2318 return __mmu_unsync_walk(sp, pvec);
2319}
2320
2321static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
2322{
2323 WARN_ON(!sp->unsync);
2324 trace_kvm_mmu_sync_page(sp);
2325 sp->unsync = 0;
2326 --kvm->stat.mmu_unsync;
2327}
2328
2329static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
2330 struct list_head *invalid_list);
2331static void kvm_mmu_commit_zap_page(struct kvm *kvm,
2332 struct list_head *invalid_list);
2333
2334
2335#define for_each_valid_sp(_kvm, _sp, _gfn) \
2336 hlist_for_each_entry(_sp, \
2337 &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
2338 if (is_obsolete_sp((_kvm), (_sp))) { \
2339 } else
2340
2341#define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \
2342 for_each_valid_sp(_kvm, _sp, _gfn) \
2343 if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else
2344
2345static inline bool is_ept_sp(struct kvm_mmu_page *sp)
2346{
2347 return sp->role.cr0_wp && sp->role.smap_andnot_wp;
2348}
2349
2350
2351static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
2352 struct list_head *invalid_list)
2353{
2354 if ((!is_ept_sp(sp) && sp->role.gpte_is_8_bytes != !!is_pae(vcpu)) ||
2355 vcpu->arch.mmu->sync_page(vcpu, sp) == 0) {
2356 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
2357 return false;
2358 }
2359
2360 return true;
2361}
2362
2363static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm,
2364 struct list_head *invalid_list,
2365 bool remote_flush)
2366{
2367 if (!remote_flush && list_empty(invalid_list))
2368 return false;
2369
2370 if (!list_empty(invalid_list))
2371 kvm_mmu_commit_zap_page(kvm, invalid_list);
2372 else
2373 kvm_flush_remote_tlbs(kvm);
2374 return true;
2375}
2376
2377static void kvm_mmu_flush_or_zap(struct kvm_vcpu *vcpu,
2378 struct list_head *invalid_list,
2379 bool remote_flush, bool local_flush)
2380{
2381 if (kvm_mmu_remote_flush_or_zap(vcpu->kvm, invalid_list, remote_flush))
2382 return;
2383
2384 if (local_flush)
2385 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
2386}
2387
2388#ifdef CONFIG_KVM_MMU_AUDIT
2389#include "mmu_audit.c"
2390#else
2391static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { }
2392static void mmu_audit_disable(void) { }
2393#endif
2394
2395static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
2396{
2397 return sp->role.invalid ||
2398 unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
2399}
2400
2401static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
2402 struct list_head *invalid_list)
2403{
2404 kvm_unlink_unsync_page(vcpu->kvm, sp);
2405 return __kvm_sync_page(vcpu, sp, invalid_list);
2406}
2407
/* @gfn should be write-protected at the call site */
static bool kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn,
2410 struct list_head *invalid_list)
2411{
2412 struct kvm_mmu_page *s;
2413 bool ret = false;
2414
2415 for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) {
2416 if (!s->unsync)
2417 continue;
2418
2419 WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
2420 ret |= kvm_sync_page(vcpu, s, invalid_list);
2421 }
2422
2423 return ret;
2424}
2425
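/*
 * While iterating over a pvec of unsync pages, mmu_pages_next() records, for
 * each level above the leaf, the parent shadow page and the index of the
 * child entry within it, so that mmu_pages_clear_parents() can later walk
 * back up and clear the matching unsync_child_bitmap bits.
 */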
2426struct mmu_page_path {
2427 struct kvm_mmu_page *parent[PT64_ROOT_MAX_LEVEL];
2428 unsigned int idx[PT64_ROOT_MAX_LEVEL];
2429};
2430
2431#define for_each_sp(pvec, sp, parents, i) \
2432 for (i = mmu_pages_first(&pvec, &parents); \
2433 i < pvec.nr && ({ sp = pvec.page[i].sp; 1;}); \
2434 i = mmu_pages_next(&pvec, &parents, i))
2435
2436static int mmu_pages_next(struct kvm_mmu_pages *pvec,
2437 struct mmu_page_path *parents,
2438 int i)
2439{
2440 int n;
2441
2442 for (n = i+1; n < pvec->nr; n++) {
2443 struct kvm_mmu_page *sp = pvec->page[n].sp;
2444 unsigned idx = pvec->page[n].idx;
2445 int level = sp->role.level;
2446
2447 parents->idx[level-1] = idx;
2448 if (level == PT_PAGE_TABLE_LEVEL)
2449 break;
2450
2451 parents->parent[level-2] = sp;
2452 }
2453
2454 return n;
2455}
2456
2457static int mmu_pages_first(struct kvm_mmu_pages *pvec,
2458 struct mmu_page_path *parents)
2459{
2460 struct kvm_mmu_page *sp;
2461 int level;
2462
2463 if (pvec->nr == 0)
2464 return 0;
2465
2466 WARN_ON(pvec->page[0].idx != INVALID_INDEX);
2467
2468 sp = pvec->page[0].sp;
2469 level = sp->role.level;
2470 WARN_ON(level == PT_PAGE_TABLE_LEVEL);
2471
	parents->parent[level-2] = sp;

	/*
	 * Also set up a sentinel: further entries in pvec are all children of
	 * sp, so the upward walk in mmu_pages_clear_parents() stops here.
	 */
	parents->parent[level-1] = NULL;
2478 return mmu_pages_next(pvec, parents, 0);
2479}
2480
2481static void mmu_pages_clear_parents(struct mmu_page_path *parents)
2482{
2483 struct kvm_mmu_page *sp;
2484 unsigned int level = 0;
2485
2486 do {
2487 unsigned int idx = parents->idx[level];
2488 sp = parents->parent[level];
2489 if (!sp)
2490 return;
2491
2492 WARN_ON(idx == INVALID_INDEX);
2493 clear_unsync_child_bit(sp, idx);
2494 level++;
2495 } while (!sp->unsync_children);
2496}
2497
2498static void mmu_sync_children(struct kvm_vcpu *vcpu,
2499 struct kvm_mmu_page *parent)
2500{
2501 int i;
2502 struct kvm_mmu_page *sp;
2503 struct mmu_page_path parents;
2504 struct kvm_mmu_pages pages;
2505 LIST_HEAD(invalid_list);
2506 bool flush = false;
2507
2508 while (mmu_unsync_walk(parent, &pages)) {
2509 bool protected = false;
2510
2511 for_each_sp(pages, sp, parents, i)
2512 protected |= rmap_write_protect(vcpu, sp->gfn);
2513
2514 if (protected) {
2515 kvm_flush_remote_tlbs(vcpu->kvm);
2516 flush = false;
2517 }
2518
2519 for_each_sp(pages, sp, parents, i) {
2520 flush |= kvm_sync_page(vcpu, sp, &invalid_list);
2521 mmu_pages_clear_parents(&parents);
2522 }
2523 if (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock)) {
2524 kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
2525 cond_resched_lock(&vcpu->kvm->mmu_lock);
2526 flush = false;
2527 }
2528 }
2529
2530 kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
2531}
2532
2533static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
2534{
2535 atomic_set(&sp->write_flooding_count, 0);
2536}
2537
2538static void clear_sp_write_flooding_count(u64 *spte)
2539{
2540 struct kvm_mmu_page *sp = page_header(__pa(spte));
2541
2542 __clear_sp_write_flooding_count(sp);
2543}
2544
2545static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
2546 gfn_t gfn,
2547 gva_t gaddr,
2548 unsigned level,
2549 int direct,
2550 unsigned access)
2551{
2552 union kvm_mmu_page_role role;
2553 unsigned quadrant;
2554 struct kvm_mmu_page *sp;
2555 bool need_sync = false;
2556 bool flush = false;
2557 int collisions = 0;
2558 LIST_HEAD(invalid_list);
2559
2560 role = vcpu->arch.mmu->mmu_role.base;
2561 role.level = level;
2562 role.direct = direct;
2563 if (role.direct)
2564 role.gpte_is_8_bytes = true;
2565 role.access = access;
2566 if (!vcpu->arch.mmu->direct_map
2567 && vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL) {
2568 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
2569 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
2570 role.quadrant = quadrant;
2571 }
2572 for_each_valid_sp(vcpu->kvm, sp, gfn) {
2573 if (sp->gfn != gfn) {
2574 collisions++;
2575 continue;
2576 }
2577
2578 if (!need_sync && sp->unsync)
2579 need_sync = true;
2580
2581 if (sp->role.word != role.word)
2582 continue;
2583
		if (sp->unsync) {
			/*
			 * The page is good, but __kvm_sync_page() might still
			 * end up zapping it.  If so, break in order to rebuild
			 * it with a fresh shadow page below.
			 */
			if (!__kvm_sync_page(vcpu, sp, &invalid_list))
				break;
2590
2591 WARN_ON(!list_empty(&invalid_list));
2592 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
2593 }
2594
2595 if (sp->unsync_children)
2596 kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
2597
2598 __clear_sp_write_flooding_count(sp);
2599 trace_kvm_mmu_get_page(sp, false);
2600 goto out;
2601 }
2602
2603 ++vcpu->kvm->stat.mmu_cache_miss;
2604
2605 sp = kvm_mmu_alloc_page(vcpu, direct);
2606
2607 sp->gfn = gfn;
2608 sp->role = role;
2609 hlist_add_head(&sp->hash_link,
2610 &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
	if (!direct) {
		/*
		 * Write-protect the gfn before syncing pages; otherwise the
		 * content of the freshly synced shadow page could become
		 * inconsistent with the guest page table.
		 */
		account_shadowed(vcpu->kvm, sp);
2618 if (level == PT_PAGE_TABLE_LEVEL &&
2619 rmap_write_protect(vcpu, gfn))
2620 kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1);
2621
2622 if (level > PT_PAGE_TABLE_LEVEL && need_sync)
2623 flush |= kvm_sync_pages(vcpu, gfn, &invalid_list);
2624 }
2625 clear_page(sp->spt);
2626 trace_kvm_mmu_get_page(sp, true);
2627
2628 kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
2629out:
2630 if (collisions > vcpu->kvm->stat.max_mmu_page_hash_collisions)
2631 vcpu->kvm->stat.max_mmu_page_hash_collisions = collisions;
2632 return sp;
2633}
2634
2635static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator,
2636 struct kvm_vcpu *vcpu, hpa_t root,
2637 u64 addr)
2638{
2639 iterator->addr = addr;
2640 iterator->shadow_addr = root;
2641 iterator->level = vcpu->arch.mmu->shadow_root_level;
2642
2643 if (iterator->level == PT64_ROOT_4LEVEL &&
2644 vcpu->arch.mmu->root_level < PT64_ROOT_4LEVEL &&
2645 !vcpu->arch.mmu->direct_map)
2646 --iterator->level;
2647
	if (iterator->level == PT32E_ROOT_LEVEL) {
		/*
		 * prev_roots are only used for 64-bit hosts/VMs, so a PAE
		 * walk can only ever start from the active root.
		 */
		BUG_ON(root != vcpu->arch.mmu->root_hpa);
2654
2655 iterator->shadow_addr
2656 = vcpu->arch.mmu->pae_root[(addr >> 30) & 3];
2657 iterator->shadow_addr &= PT64_BASE_ADDR_MASK;
2658 --iterator->level;
2659 if (!iterator->shadow_addr)
2660 iterator->level = 0;
2661 }
2662}
2663
2664static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
2665 struct kvm_vcpu *vcpu, u64 addr)
2666{
2667 shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu->root_hpa,
2668 addr);
2669}
2670
2671static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
2672{
2673 if (iterator->level < PT_PAGE_TABLE_LEVEL)
2674 return false;
2675
2676 iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
2677 iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
2678 return true;
2679}
2680
2681static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
2682 u64 spte)
2683{
2684 if (is_last_spte(spte, iterator->level)) {
2685 iterator->level = 0;
2686 return;
2687 }
2688
2689 iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK;
2690 --iterator->level;
2691}
2692
2693static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
2694{
2695 __shadow_walk_next(iterator, *iterator->sptep);
2696}
2697
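/*
 * Install a non-leaf SPTE pointing at the child shadow page @sp.  The entry
 * grants full access; permission restrictions are enforced only at the leaf
 * level.
 */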
2698static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
2699 struct kvm_mmu_page *sp)
2700{
2701 u64 spte;
2702
2703 BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
2704
2705 spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK |
2706 shadow_user_mask | shadow_x_mask | shadow_me_mask;
2707
2708 if (sp_ad_disabled(sp))
2709 spte |= SPTE_AD_DISABLED_MASK;
2710 else
2711 spte |= shadow_accessed_mask;
2712
2713 mmu_spte_set(sptep, spte);
2714
2715 mmu_page_add_parent_pte(vcpu, sp, sptep);
2716
2717 if (sp->unsync_children || sp->unsync)
2718 mark_unsync(sptep);
2719}
2720
2721static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2722 unsigned direct_access)
2723{
	if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
		struct kvm_mmu_page *child;

		/*
		 * For a direct sp, if the guest pte's dirty bit changed from
		 * clean to dirty, the access bits of the existing child sp no
		 * longer match.  Drop the link so that a new sp with the
		 * correct access is created when the spte is rebuilt.
		 */
		child = page_header(*sptep & PT64_BASE_ADDR_MASK);
2735 if (child->role.access == direct_access)
2736 return;
2737
2738 drop_parent_pte(child, sptep);
2739 kvm_flush_remote_tlbs_with_address(vcpu->kvm, child->gfn, 1);
2740 }
2741}
2742
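/*
 * Clear one SPTE of @sp.  Returns true if a present entry (a leaf mapping or
 * a link to a child page table) was removed, so the caller may need to flush
 * TLBs; MMIO SPTEs are simply cleared without being tracked.
 */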
2743static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
2744 u64 *spte)
2745{
2746 u64 pte;
2747 struct kvm_mmu_page *child;
2748
2749 pte = *spte;
2750 if (is_shadow_present_pte(pte)) {
2751 if (is_last_spte(pte, sp->role.level)) {
2752 drop_spte(kvm, spte);
2753 if (is_large_pte(pte))
2754 --kvm->stat.lpages;
2755 } else {
2756 child = page_header(pte & PT64_BASE_ADDR_MASK);
2757 drop_parent_pte(child, spte);
2758 }
2759 return true;
2760 }
2761
2762 if (is_mmio_spte(pte))
2763 mmu_spte_clear_no_track(spte);
2764
2765 return false;
2766}
2767
2768static void kvm_mmu_page_unlink_children(struct kvm *kvm,
2769 struct kvm_mmu_page *sp)
2770{
2771 unsigned i;
2772
2773 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
2774 mmu_page_zap_pte(kvm, sp, sp->spt + i);
2775}
2776
2777static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
2778{
2779 u64 *sptep;
2780 struct rmap_iterator iter;
2781
2782 while ((sptep = rmap_get_first(&sp->parent_ptes, &iter)))
2783 drop_parent_pte(sp, sptep);
2784}
2785
2786static int mmu_zap_unsync_children(struct kvm *kvm,
2787 struct kvm_mmu_page *parent,
2788 struct list_head *invalid_list)
2789{
2790 int i, zapped = 0;
2791 struct mmu_page_path parents;
2792 struct kvm_mmu_pages pages;
2793
2794 if (parent->role.level == PT_PAGE_TABLE_LEVEL)
2795 return 0;
2796
2797 while (mmu_unsync_walk(parent, &pages)) {
2798 struct kvm_mmu_page *sp;
2799
2800 for_each_sp(pages, sp, parents, i) {
2801 kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
2802 mmu_pages_clear_parents(&parents);
2803 zapped++;
2804 }
2805 }
2806
2807 return zapped;
2808}
2809
2810static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
2811 struct kvm_mmu_page *sp,
2812 struct list_head *invalid_list,
2813 int *nr_zapped)
2814{
2815 bool list_unstable;
2816
2817 trace_kvm_mmu_prepare_zap_page(sp);
2818 ++kvm->stat.mmu_shadow_zapped;
2819 *nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list);
2820 kvm_mmu_page_unlink_children(kvm, sp);
	kvm_mmu_unlink_parents(kvm, sp);

	/* Zapping children means active_mmu_pages has become unstable. */
	list_unstable = *nr_zapped;
2825
2826 if (!sp->role.invalid && !sp->role.direct)
2827 unaccount_shadowed(kvm, sp);
2828
2829 if (sp->unsync)
2830 kvm_unlink_unsync_page(kvm, sp);
	if (!sp->root_count) {
		/* Count self */
		(*nr_zapped)++;
		list_move(&sp->link, invalid_list);
2835 kvm_mod_used_mmu_pages(kvm, -1);
	} else {
		list_move(&sp->link, &kvm->arch.active_mmu_pages);

		/*
		 * The page is still in use as a root, so it cannot be freed
		 * yet.  Obsolete pages will never be used again (note that
		 * is_obsolete_sp() also treats invalid pages as obsolete), so
		 * only a non-obsolete page requires remote MMUs to be
		 * reloaded in order to drop it as a root.
		 */
		if (!is_obsolete_sp(kvm, sp))
			kvm_reload_remote_mmus(kvm);
	}
2847
2848 if (sp->lpage_disallowed)
2849 unaccount_huge_nx_page(kvm, sp);
2850
2851 sp->role.invalid = 1;
2852 return list_unstable;
2853}
2854
2855static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
2856 struct list_head *invalid_list)
2857{
2858 int nr_zapped;
2859
2860 __kvm_mmu_prepare_zap_page(kvm, sp, invalid_list, &nr_zapped);
2861 return nr_zapped;
2862}
2863
2864static void kvm_mmu_commit_zap_page(struct kvm *kvm,
2865 struct list_head *invalid_list)
2866{
2867 struct kvm_mmu_page *sp, *nsp;
2868
	if (list_empty(invalid_list))
		return;

	/*
	 * All vCPUs must see our modifications to the page tables before the
	 * pages are freed, and lockless shadow walks must have finished.  The
	 * barrier in kvm_flush_remote_tlbs() provides the ordering and, in
	 * addition, waits for all vCPUs to exit guest mode and/or lockless
	 * shadow page table walks (walk_shadow_page_lockless_begin/end).
	 */
	kvm_flush_remote_tlbs(kvm);
2882
2883 list_for_each_entry_safe(sp, nsp, invalid_list, link) {
2884 WARN_ON(!sp->role.invalid || sp->root_count);
2885 kvm_mmu_free_page(sp);
2886 }
2887}
2888
2889static bool prepare_zap_oldest_mmu_page(struct kvm *kvm,
2890 struct list_head *invalid_list)
2891{
2892 struct kvm_mmu_page *sp;
2893
2894 if (list_empty(&kvm->arch.active_mmu_pages))
2895 return false;
2896
2897 sp = list_last_entry(&kvm->arch.active_mmu_pages,
2898 struct kvm_mmu_page, link);
2899 return kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
2900}
2901
/*
 * Change the number of shadow MMU pages the VM is allowed to use, zapping
 * the oldest active pages until the count fits under the new limit.
 */
void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages)
{
	LIST_HEAD(invalid_list);

	spin_lock(&kvm->mmu_lock);

	if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
		/* Need to free some mmu pages to achieve the goal. */
		while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages)
2915 if (!prepare_zap_oldest_mmu_page(kvm, &invalid_list))
2916 break;
2917
2918 kvm_mmu_commit_zap_page(kvm, &invalid_list);
2919 goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
2920 }
2921
2922 kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
2923
2924 spin_unlock(&kvm->mmu_lock);
2925}
2926
2927int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
2928{
2929 struct kvm_mmu_page *sp;
2930 LIST_HEAD(invalid_list);
2931 int r;
2932
2933 pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
2934 r = 0;
2935 spin_lock(&kvm->mmu_lock);
2936 for_each_gfn_indirect_valid_sp(kvm, sp, gfn) {
2937 pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
2938 sp->role.word);
2939 r = 1;
2940 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
2941 }
2942 kvm_mmu_commit_zap_page(kvm, &invalid_list);
2943 spin_unlock(&kvm->mmu_lock);
2944
2945 return r;
2946}
2947EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
2948
2949static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
2950{
2951 trace_kvm_mmu_unsync_page(sp);
2952 ++vcpu->kvm->stat.mmu_unsync;
2953 sp->unsync = 1;
2954
2955 kvm_mmu_mark_parents_unsync(sp);
2956}
2957
2958static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
2959 bool can_unsync)
2960{
2961 struct kvm_mmu_page *sp;
2962
2963 if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
2964 return true;
2965
2966 for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
2967 if (!can_unsync)
2968 return true;
2969
2970 if (sp->unsync)
2971 continue;
2972
2973 WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL);
2974 kvm_unsync_page(vcpu, sp);
2975 }

	/*
	 * The marking of unsync pages above must be visible before the SPTE
	 * is made writable by the caller (set_spte()).  kvm_mmu_sync_roots()
	 * checks sp->unsync and sp->unsync_children locklessly, so a vCPU
	 * that can already observe the writable SPTE must also observe the
	 * unsync flag; otherwise it would skip the sync and keep using stale
	 * translations from the shadowed guest page table.  The write barrier
	 * below pairs with the smp_load_acquire() in kvm_mmu_sync_roots().
	 */
	smp_wmb();
3015
3016 return false;
3017}
3018
3019static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
3020{
	if (pfn_valid(pfn))
		return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)) &&
			/*
			 * Some reserved pages, such as those from NVDIMM
			 * DAX devices, are not for MMIO, and can be mapped
			 * with cached memory type for better performance.
			 * However, the above check misconceives those pages
			 * as MMIO, and results in KVM mapping them with UC
			 * memory type, which would hurt the performance.
			 * Therefore, we check the host memory type in addition
			 * and only treat UC/UC-/WC pages as MMIO.
			 */
			(!pat_enabled() || pat_pfn_immune_to_uc_mtrr(pfn));
3034
3035 return !e820__mapped_raw_any(pfn_to_hpa(pfn),
3036 pfn_to_hpa(pfn + 1) - 1,
3037 E820_TYPE_RAM);
3038}
3039
/* Bits returned by set_spte() */
#define SET_SPTE_WRITE_PROTECTED_PT	BIT(0)
#define SET_SPTE_NEED_REMOTE_TLB_FLUSH	BIT(1)
3043
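/*
 * Construct a leaf SPTE mapping @gfn at @pfn with @pte_access and install it
 * with mmu_spte_update().  The return value is a mask of the SET_SPTE_* bits
 * above, telling the caller whether the gfn had to be write-protected and
 * whether a remote TLB flush is required.
 */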
3044static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
3045 unsigned pte_access, int level,
3046 gfn_t gfn, kvm_pfn_t pfn, bool speculative,
3047 bool can_unsync, bool host_writable)
3048{
3049 u64 spte = 0;
3050 int ret = 0;
3051 struct kvm_mmu_page *sp;
3052
3053 if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access))
3054 return 0;
3055
3056 sp = page_header(__pa(sptep));
3057 if (sp_ad_disabled(sp))
3058 spte |= SPTE_AD_DISABLED_MASK;
3059 else if (kvm_vcpu_ad_need_write_protect(vcpu))
3060 spte |= SPTE_AD_WRPROT_ONLY_MASK;

	/*
	 * For the EPT case, shadow_present_mask is zero if hardware supports
	 * exec-only page table entries.  In that case, ACC_USER_MASK and
	 * shadow_user_mask are used to represent read access.  See
	 * FNAME(gpte_access) in paging_tmpl.h.
	 */
	spte |= shadow_present_mask;
3069 if (!speculative)
3070 spte |= spte_shadow_accessed_mask(spte);
3071
3072 if (level > PT_PAGE_TABLE_LEVEL && (pte_access & ACC_EXEC_MASK) &&
3073 is_nx_huge_page_enabled()) {
3074 pte_access &= ~ACC_EXEC_MASK;
3075 }
3076
3077 if (pte_access & ACC_EXEC_MASK)
3078 spte |= shadow_x_mask;
3079 else
3080 spte |= shadow_nx_mask;
3081
3082 if (pte_access & ACC_USER_MASK)
3083 spte |= shadow_user_mask;
3084
3085 if (level > PT_PAGE_TABLE_LEVEL)
3086 spte |= PT_PAGE_SIZE_MASK;
3087 if (tdp_enabled)
3088 spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
3089 kvm_is_mmio_pfn(pfn));
3090
3091 if (host_writable)
3092 spte |= SPTE_HOST_WRITEABLE;
3093 else
3094 pte_access &= ~ACC_WRITE_MASK;
3095
3096 if (!kvm_is_mmio_pfn(pfn))
3097 spte |= shadow_me_mask;
3098
3099 spte |= (u64)pfn << PAGE_SHIFT;
3100
	if (pte_access & ACC_WRITE_MASK) {

		/*
		 * Another vcpu may have created a new shadow page for this
		 * gfn in the window between mapping_level() and acquiring
		 * mmu_lock, making the large mapping disallowed.  Simply bail
		 * out and let the guest retry; the refault will fix it up.
		 */
		if (level > PT_PAGE_TABLE_LEVEL &&
		    mmu_gfn_lpage_is_disallowed(vcpu, gfn, level))
			goto done;
3112
		spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;

		/*
		 * Optimization: for pte sync, if the spte was already writable
		 * the hash lookup in mmu_need_write_protect() is unnecessary
		 * (and expensive); write protection is the responsibility of
		 * kvm_mmu_get_page() / kvm_sync_page().  The same reasoning
		 * applies to dirty page accounting.
		 */
		if (!can_unsync && is_writable_pte(*sptep))
			goto set_pte;
3123
3124 if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
3125 pgprintk("%s: found shadow page for %llx, marking ro\n",
3126 __func__, gfn);
3127 ret |= SET_SPTE_WRITE_PROTECTED_PT;
3128 pte_access &= ~ACC_WRITE_MASK;
3129 spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
3130 }
3131 }
3132
3133 if (pte_access & ACC_WRITE_MASK) {
3134 kvm_vcpu_mark_page_dirty(vcpu, gfn);
3135 spte |= spte_shadow_dirty_mask(spte);
3136 }
3137
3138 if (speculative)
3139 spte = mark_spte_for_access_track(spte);
3140
3141set_pte:
3142 if (mmu_spte_update(sptep, spte))
3143 ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH;
3144done:
3145 return ret;
3146}
3147
3148static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
3149 int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn,
3150 bool speculative, bool host_writable)
3151{
3152 int was_rmapped = 0;
3153 int rmap_count;
3154 int set_spte_ret;
3155 int ret = RET_PF_RETRY;
3156 bool flush = false;
3157
3158 pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
3159 *sptep, write_fault, gfn);
3160
	if (is_shadow_present_pte(*sptep)) {
		/*
		 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
		 * the parent of the now unreachable PTE.
		 */
		if (level > PT_PAGE_TABLE_LEVEL &&
		    !is_large_pte(*sptep)) {
3168 struct kvm_mmu_page *child;
3169 u64 pte = *sptep;
3170
3171 child = page_header(pte & PT64_BASE_ADDR_MASK);
3172 drop_parent_pte(child, sptep);
3173 flush = true;
3174 } else if (pfn != spte_to_pfn(*sptep)) {
3175 pgprintk("hfn old %llx new %llx\n",
3176 spte_to_pfn(*sptep), pfn);
3177 drop_spte(vcpu->kvm, sptep);
3178 flush = true;
3179 } else
3180 was_rmapped = 1;
3181 }
3182
3183 set_spte_ret = set_spte(vcpu, sptep, pte_access, level, gfn, pfn,
3184 speculative, true, host_writable);
3185 if (set_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
3186 if (write_fault)
3187 ret = RET_PF_EMULATE;
3188 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
3189 }
3190
3191 if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH || flush)
3192 kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn,
3193 KVM_PAGES_PER_HPAGE(level));
3194
3195 if (unlikely(is_mmio_spte(*sptep)))
3196 ret = RET_PF_EMULATE;
3197
3198 pgprintk("%s: setting spte %llx\n", __func__, *sptep);
3199 trace_kvm_mmu_set_spte(level, gfn, sptep);
3200 if (!was_rmapped && is_large_pte(*sptep))
3201 ++vcpu->kvm->stat.lpages;
3202
3203 if (is_shadow_present_pte(*sptep)) {
3204 if (!was_rmapped) {
3205 rmap_count = rmap_add(vcpu, sptep, gfn);
3206 if (rmap_count > RMAP_RECYCLE_THRESHOLD)
3207 rmap_recycle(vcpu, sptep, gfn);
3208 }
3209 }
3210
3211 return ret;
3212}
3213
3214static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
3215 bool no_dirty_log)
3216{
3217 struct kvm_memory_slot *slot;
3218
3219 slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
3220 if (!slot)
3221 return KVM_PFN_ERR_FAULT;
3222
3223 return gfn_to_pfn_memslot_atomic(slot, gfn);
3224}
3225
3226static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
3227 struct kvm_mmu_page *sp,
3228 u64 *start, u64 *end)
3229{
3230 struct page *pages[PTE_PREFETCH_NUM];
3231 struct kvm_memory_slot *slot;
3232 unsigned access = sp->role.access;
3233 int i, ret;
3234 gfn_t gfn;
3235
3236 gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
3237 slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK);
3238 if (!slot)
3239 return -1;
3240
3241 ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start);
3242 if (ret <= 0)
3243 return -1;
3244
3245 for (i = 0; i < ret; i++, gfn++, start++) {
3246 mmu_set_spte(vcpu, start, access, 0, sp->role.level, gfn,
3247 page_to_pfn(pages[i]), true, true);
3248 put_page(pages[i]);
3249 }
3250
3251 return 0;
3252}
3253
3254static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
3255 struct kvm_mmu_page *sp, u64 *sptep)
3256{
3257 u64 *spte, *start = NULL;
3258 int i;
3259
3260 WARN_ON(!sp->role.direct);
3261
3262 i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
3263 spte = sp->spt + i;
3264
3265 for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
3266 if (is_shadow_present_pte(*spte) || spte == sptep) {
3267 if (!start)
3268 continue;
3269 if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
3270 break;
3271 start = NULL;
3272 } else if (!start)
3273 start = spte;
3274 }
3275}
3276
3277static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
3278{
3279 struct kvm_mmu_page *sp;
3280
	sp = page_header(__pa(sptep));

	/*
	 * Without accessed bits, there's no way to distinguish between
	 * actually accessed translations and prefetched ones, so disable
	 * pte prefetch if accessed bits aren't available.
	 */
	if (sp_ad_disabled(sp))
		return;
3290
3291 if (sp->role.level > PT_PAGE_TABLE_LEVEL)
3292 return;
3293
3294 __direct_pte_prefetch(vcpu, sp, sptep);
3295}
3296
3297static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it,
3298 gfn_t gfn, kvm_pfn_t *pfnp, int *levelp)
3299{
3300 int level = *levelp;
3301 u64 spte = *it.sptep;
3302
3303 if (it.level == level && level > PT_PAGE_TABLE_LEVEL &&
3304 is_nx_huge_page_enabled() &&
3305 is_shadow_present_pte(spte) &&
	    !is_large_pte(spte)) {
		/*
		 * A small SPTE exists for this pfn, but FNAME(fetch)
		 * and __direct_map would like to create a large PTE
		 * instead: just force them to go down another level,
		 * patching back for them into pfn the next 9 bits of
		 * the address.
		 */
		u64 page_mask = KVM_PAGES_PER_HPAGE(level) - KVM_PAGES_PER_HPAGE(level - 1);
3315 *pfnp |= gfn & page_mask;
3316 (*levelp)--;
3317 }
3318}
3319
3320static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
3321 int map_writable, int level, kvm_pfn_t pfn,
3322 bool prefault, bool lpage_disallowed)
3323{
3324 struct kvm_shadow_walk_iterator it;
3325 struct kvm_mmu_page *sp;
3326 int ret;
3327 gfn_t gfn = gpa >> PAGE_SHIFT;
3328 gfn_t base_gfn = gfn;
3329
3330 if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
3331 return RET_PF_RETRY;
3332
3333 trace_kvm_mmu_spte_requested(gpa, level, pfn);
	for_each_shadow_entry(vcpu, gpa, it) {
		/*
		 * We cannot overwrite existing page tables with an NX
		 * large page, as the leaf could be executable.
		 */
		disallowed_hugepage_adjust(it, gfn, &pfn, &level);
3340
3341 base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
3342 if (it.level == level)
3343 break;
3344
3345 drop_large_spte(vcpu, it.sptep);
3346 if (!is_shadow_present_pte(*it.sptep)) {
3347 sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr,
3348 it.level - 1, true, ACC_ALL);
3349
3350 link_shadow_page(vcpu, it.sptep, sp);
3351 if (lpage_disallowed)
3352 account_huge_nx_page(vcpu->kvm, sp);
3353 }
3354 }
3355
3356 ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL,
3357 write, level, base_gfn, pfn, prefault,
3358 map_writable);
3359 direct_pte_prefetch(vcpu, it.sptep);
3360 ++vcpu->stat.pf_fixed;
3361 return ret;
3362}
3363
3364static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
3365{
3366 send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, PAGE_SHIFT, tsk);
3367}
3368
3369static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
{
	/*
	 * Do not cache the mmio info caused by writing the readonly gfn
	 * into the spte; otherwise a read access to the readonly gfn would
	 * also cause an mmio page fault and be treated as mmio access.
	 */
	if (pfn == KVM_PFN_ERR_RO_FAULT)
		return RET_PF_EMULATE;
3378
3379 if (pfn == KVM_PFN_ERR_HWPOISON) {
3380 kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current);
3381 return RET_PF_RETRY;
3382 }
3383
3384 return -EFAULT;
3385}
3386
3387static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
3388 gfn_t gfn, kvm_pfn_t *pfnp,
3389 int *levelp)
3390{
3391 kvm_pfn_t pfn = *pfnp;
3392 int level = *levelp;

	/*
	 * Check if it's a transparent hugepage.  If this would be a hugetlbfs
	 * page, level would not have been set to PT_PAGE_TABLE_LEVEL and no
	 * adjustment would be done here.
	 */
	if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
3401 !kvm_is_zone_device_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL &&
3402 PageTransCompoundMap(pfn_to_page(pfn)) &&
3403 !mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) {
		unsigned long mask;

		/*
		 * mmu_notifier_retry() was successful and mmu_lock is held, so
		 * the compound page cannot be split from under us; it is safe
		 * to retarget the mapping at the 2MB-aligned head pfn and move
		 * our reference from the tail page to the head page.
		 */
		*levelp = level = PT_DIRECTORY_LEVEL;
3415 mask = KVM_PAGES_PER_HPAGE(level) - 1;
3416 VM_BUG_ON((gfn & mask) != (pfn & mask));
3417 if (pfn & mask) {
3418 kvm_release_pfn_clean(pfn);
3419 pfn &= ~mask;
3420 kvm_get_pfn(pfn);
3421 *pfnp = pfn;
3422 }
3423 }
3424}
3425
3426static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
3427 kvm_pfn_t pfn, unsigned access, int *ret_val)
3428{
	/* The pfn is invalid, report the error! */
	if (unlikely(is_error_pfn(pfn))) {
3431 *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn);
3432 return true;
3433 }
3434
3435 if (unlikely(is_noslot_pfn(pfn)))
3436 vcpu_cache_mmio_info(vcpu, gva, gfn,
3437 access & shadow_mmio_access_mask);
3438
3439 return false;
3440}
3441
3442static bool page_fault_can_be_fast(u32 error_code)
3443{
	/*
	 * Do not fix an mmio spte with an invalid generation number; it
	 * needs to be updated by the slow page fault path.
	 */
	if (unlikely(error_code & PFERR_RSVD_MASK))
		return false;

	/* See if the page fault is due to an NX violation */
	if (unlikely(((error_code & (PFERR_FETCH_MASK | PFERR_PRESENT_MASK))
		      == (PFERR_FETCH_MASK | PFERR_PRESENT_MASK))))
		return false;

	/*
	 * #PF can be fast if:
	 * 1. The shadow page table entry is not present, which could mean that
	 *    the fault is potentially caused by access tracking (if enabled).
	 * 2. The shadow page table entry is present and the fault
	 *    is caused by write-protect; then we just need to change the W
	 *    bit of the spte, which can be done out of mmu-lock.
	 *
	 * However, if access tracking is disabled we know that a non-present
	 * page must be a genuine page fault where we have to create a new SPTE.
	 * So, if access tracking is disabled, we return true only for write
	 * accesses to a present page.
	 */
	return shadow_acc_track_mask != 0 ||
	       ((error_code & (PFERR_WRITE_MASK | PFERR_PRESENT_MASK))
		== (PFERR_WRITE_MASK | PFERR_PRESENT_MASK));
3473}
3474
/*
 * Returns true if the SPTE was fixed successfully.  Otherwise, someone else
 * modified the SPTE from its original value.
 */
static bool
fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
			u64 *sptep, u64 old_spte, u64 new_spte)
3482{
3483 gfn_t gfn;
3484
	WARN_ON(!sp->role.direct);

	/*
	 * Update the SPTE with a cmpxchg so that a concurrent modification
	 * (another vCPU taking the same fault, or one of the zapping paths)
	 * is detected: if the value changed under us, give up and let the
	 * fast-path loop retry or fall back to the slow path.
	 */
	if (cmpxchg64(sptep, old_spte, new_spte) != old_spte)
		return false;
3501
	if (is_writable_pte(new_spte) && !is_writable_pte(old_spte)) {
		/*
		 * The gfn of a direct spte is stable since it is
		 * calculated from sp->gfn.
		 */
		gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
3508 kvm_vcpu_mark_page_dirty(vcpu, gfn);
3509 }
3510
3511 return true;
3512}
3513
3514static bool is_access_allowed(u32 fault_err_code, u64 spte)
3515{
3516 if (fault_err_code & PFERR_FETCH_MASK)
3517 return is_executable_pte(spte);
3518
3519 if (fault_err_code & PFERR_WRITE_MASK)
3520 return is_writable_pte(spte);

	/* Fault was on a read access */
	return spte & PT_PRESENT_MASK;
3524}
3525

/*
 * Return value:
 * - true: let the vcpu access the same address again;
 * - false: let the real page fault path fix it.
 */
static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
			    u32 error_code)
3533{
3534 struct kvm_shadow_walk_iterator iterator;
3535 struct kvm_mmu_page *sp;
3536 bool fault_handled = false;
3537 u64 spte = 0ull;
3538 uint retry_count = 0;
3539
3540 if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
3541 return false;
3542
3543 if (!page_fault_can_be_fast(error_code))
3544 return false;
3545
3546 walk_shadow_page_lockless_begin(vcpu);
3547
3548 do {
3549 u64 new_spte;
3550
3551 for_each_shadow_entry_lockless(vcpu, gva, iterator, spte)
3552 if (!is_shadow_present_pte(spte) ||
3553 iterator.level < level)
3554 break;
3555
3556 sp = page_header(__pa(iterator.sptep));
3557 if (!is_last_spte(spte, sp->role.level))
3558 break;

		/*
		 * Check whether the memory access that caused the fault would
		 * still cause it if it were to be performed right now.  If
		 * not, this is a spurious fault caused by a lazily flushed TLB
		 * or by another CPU having already fixed the PTE after the
		 * current CPU took the fault.
		 *
		 * There is no need to check the access of upper level table
		 * entries since they are always ACC_ALL.
		 */
		if (is_access_allowed(error_code, spte)) {
3571 fault_handled = true;
3572 break;
3573 }
3574
3575 new_spte = spte;
3576
3577 if (is_access_track_spte(spte))
3578 new_spte = restore_acc_track_spte(new_spte);

		/*
		 * To keep the code simple, write-protection is removed in the
		 * fast path only if the SPTE was write-protected for dirty
		 * logging or access tracking.
		 */
		if ((error_code & PFERR_WRITE_MASK) &&
		    spte_can_locklessly_be_made_writable(spte))
		{
			new_spte |= PT_WRITABLE_MASK;

			/*
			 * Do not fix write-permission on the large spte.  Since
			 * we only dirty the first page into the dirty-bitmap in
			 * fast_pf_fix_direct_spte(), other pages are missed
			 * if its slot has dirty logging enabled.
			 *
			 * Instead, we let the slow page fault path create a
			 * normal spte to fix the access.
			 *
			 * See the comments in kvm_arch_commit_memory_region().
			 */
			if (sp->role.level > PT_PAGE_TABLE_LEVEL)
				break;
3603 }

		/* Verify that the fault can be handled in the fast path */
		if (new_spte == spte ||
		    !is_access_allowed(error_code, new_spte))
3608 break;
3609
		/*
		 * Currently, fast page fault only works for direct mapping
		 * since the gfn is not stable for indirect shadow pages.  See
		 * Documentation/virt/kvm/locking.txt for more details.
		 */
		fault_handled = fast_pf_fix_direct_spte(vcpu, sp,
							iterator.sptep, spte,
							new_spte);
3618 if (fault_handled)
3619 break;
3620
3621 if (++retry_count > 4) {
3622 printk_once(KERN_WARNING
3623 "kvm: Fast #PF retrying more than 4 times.\n");
3624 break;
3625 }
3626
3627 } while (true);
3628
3629 trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep,
3630 spte, fault_handled);
3631 walk_shadow_page_lockless_end(vcpu);
3632
3633 return fault_handled;
3634}
3635
3636static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
3637 gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable);
3638static int make_mmu_pages_available(struct kvm_vcpu *vcpu);
3639
3640static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
3641 gfn_t gfn, bool prefault)
3642{
3643 int r;
3644 int level;
3645 bool force_pt_level;
3646 kvm_pfn_t pfn;
3647 unsigned long mmu_seq;
3648 bool map_writable, write = error_code & PFERR_WRITE_MASK;
3649 bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
3650 is_nx_huge_page_enabled();
3651
3652 force_pt_level = lpage_disallowed;
3653 level = mapping_level(vcpu, gfn, &force_pt_level);
	if (likely(!force_pt_level)) {
		/*
		 * This path builds a PAE pagetable, so we can map
		 * 2mb pages at maximum.  Therefore check if the level
		 * is larger than that.
		 */
		if (level > PT_DIRECTORY_LEVEL)
			level = PT_DIRECTORY_LEVEL;
3662
3663 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
3664 }
3665
3666 if (fast_page_fault(vcpu, v, level, error_code))
3667 return RET_PF_RETRY;
3668
3669 mmu_seq = vcpu->kvm->mmu_notifier_seq;
3670 smp_rmb();
3671
3672 if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
3673 return RET_PF_RETRY;
3674
3675 if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r))
3676 return r;
3677
3678 r = RET_PF_RETRY;
3679 spin_lock(&vcpu->kvm->mmu_lock);
3680 if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
3681 goto out_unlock;
3682 if (make_mmu_pages_available(vcpu) < 0)
3683 goto out_unlock;
3684 if (likely(!force_pt_level))
3685 transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
3686 r = __direct_map(vcpu, v, write, map_writable, level, pfn,
3687 prefault, false);
3688out_unlock:
3689 spin_unlock(&vcpu->kvm->mmu_lock);
3690 kvm_release_pfn_clean(pfn);
3691 return r;
3692}
3693
3694static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
3695 struct list_head *invalid_list)
3696{
3697 struct kvm_mmu_page *sp;
3698
3699 if (!VALID_PAGE(*root_hpa))
3700 return;
3701
3702 sp = page_header(*root_hpa & PT64_BASE_ADDR_MASK);
3703 --sp->root_count;
3704 if (!sp->root_count && sp->role.invalid)
3705 kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
3706
3707 *root_hpa = INVALID_PAGE;
3708}
3709
/* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */
void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
			ulong roots_to_free)
3713{
3714 int i;
3715 LIST_HEAD(invalid_list);
3716 bool free_active_root = roots_to_free & KVM_MMU_ROOT_CURRENT;
3717
3718 BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG);

	/* Before acquiring the MMU lock, see if there is any work to do. */
	if (!(free_active_root && VALID_PAGE(mmu->root_hpa))) {
3722 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
3723 if ((roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) &&
3724 VALID_PAGE(mmu->prev_roots[i].hpa))
3725 break;
3726
3727 if (i == KVM_MMU_NUM_PREV_ROOTS)
3728 return;
3729 }
3730
3731 spin_lock(&vcpu->kvm->mmu_lock);
3732
3733 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
3734 if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i))
3735 mmu_free_root_page(vcpu->kvm, &mmu->prev_roots[i].hpa,
3736 &invalid_list);
3737
3738 if (free_active_root) {
3739 if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
3740 (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) {
3741 mmu_free_root_page(vcpu->kvm, &mmu->root_hpa,
3742 &invalid_list);
3743 } else {
3744 for (i = 0; i < 4; ++i)
3745 if (mmu->pae_root[i] != 0)
3746 mmu_free_root_page(vcpu->kvm,
3747 &mmu->pae_root[i],
3748 &invalid_list);
3749 mmu->root_hpa = INVALID_PAGE;
3750 }
3751 mmu->root_cr3 = 0;
3752 }
3753
3754 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
3755 spin_unlock(&vcpu->kvm->mmu_lock);
3756}
3757EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);
3758
3759static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
3760{
3761 int ret = 0;
3762
3763 if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
3764 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
3765 ret = 1;
3766 }
3767
3768 return ret;
3769}
3770
3771static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
3772{
3773 struct kvm_mmu_page *sp;
3774 unsigned i;
3775
3776 if (vcpu->arch.mmu->shadow_root_level >= PT64_ROOT_4LEVEL) {
3777 spin_lock(&vcpu->kvm->mmu_lock);
		if (make_mmu_pages_available(vcpu) < 0) {
3779 spin_unlock(&vcpu->kvm->mmu_lock);
3780 return -ENOSPC;
3781 }
3782 sp = kvm_mmu_get_page(vcpu, 0, 0,
3783 vcpu->arch.mmu->shadow_root_level, 1, ACC_ALL);
3784 ++sp->root_count;
3785 spin_unlock(&vcpu->kvm->mmu_lock);
3786 vcpu->arch.mmu->root_hpa = __pa(sp->spt);
3787 } else if (vcpu->arch.mmu->shadow_root_level == PT32E_ROOT_LEVEL) {
3788 for (i = 0; i < 4; ++i) {
3789 hpa_t root = vcpu->arch.mmu->pae_root[i];
3790
3791 MMU_WARN_ON(VALID_PAGE(root));
3792 spin_lock(&vcpu->kvm->mmu_lock);
3793 if (make_mmu_pages_available(vcpu) < 0) {
3794 spin_unlock(&vcpu->kvm->mmu_lock);
3795 return -ENOSPC;
3796 }
3797 sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
3798 i << 30, PT32_ROOT_LEVEL, 1, ACC_ALL);
3799 root = __pa(sp->spt);
3800 ++sp->root_count;
3801 spin_unlock(&vcpu->kvm->mmu_lock);
3802 vcpu->arch.mmu->pae_root[i] = root | PT_PRESENT_MASK;
3803 }
3804 vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
3805 } else
3806 BUG();
3807 vcpu->arch.mmu->root_cr3 = vcpu->arch.mmu->get_cr3(vcpu);
3808
3809 return 0;
3810}
3811
3812static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
3813{
3814 struct kvm_mmu_page *sp;
3815 u64 pdptr, pm_mask;
3816 gfn_t root_gfn, root_cr3;
3817 int i;
3818
3819 root_cr3 = vcpu->arch.mmu->get_cr3(vcpu);
3820 root_gfn = root_cr3 >> PAGE_SHIFT;
3821
3822 if (mmu_check_root(vcpu, root_gfn))
3823 return 1;
3824
	/*
	 * Do we shadow a long mode page table?  If so we need to
	 * write-protect the guest's page table root.
	 */
	if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
3830 hpa_t root = vcpu->arch.mmu->root_hpa;
3831
3832 MMU_WARN_ON(VALID_PAGE(root));
3833
3834 spin_lock(&vcpu->kvm->mmu_lock);
3835 if (make_mmu_pages_available(vcpu) < 0) {
3836 spin_unlock(&vcpu->kvm->mmu_lock);
3837 return -ENOSPC;
3838 }
3839 sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
3840 vcpu->arch.mmu->shadow_root_level, 0, ACC_ALL);
3841 root = __pa(sp->spt);
3842 ++sp->root_count;
3843 spin_unlock(&vcpu->kvm->mmu_lock);
3844 vcpu->arch.mmu->root_hpa = root;
3845 goto set_root_cr3;
3846 }

	/*
	 * We shadow a 32 bit page table.  This may be a legacy 2-level or a
	 * PAE 3-level page table.  In either case we need to be aware that
	 * the shadow page table may be a PAE or a long mode page table.
	 */
	pm_mask = PT_PRESENT_MASK;
3854 if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL)
3855 pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
3856
3857 for (i = 0; i < 4; ++i) {
3858 hpa_t root = vcpu->arch.mmu->pae_root[i];
3859
3860 MMU_WARN_ON(VALID_PAGE(root));
3861 if (vcpu->arch.mmu->root_level == PT32E_ROOT_LEVEL) {
3862 pdptr = vcpu->arch.mmu->get_pdptr(vcpu, i);
3863 if (!(pdptr & PT_PRESENT_MASK)) {
3864 vcpu->arch.mmu->pae_root[i] = 0;
3865 continue;
3866 }
3867 root_gfn = pdptr >> PAGE_SHIFT;
3868 if (mmu_check_root(vcpu, root_gfn))
3869 return 1;
3870 }
3871 spin_lock(&vcpu->kvm->mmu_lock);
3872 if (make_mmu_pages_available(vcpu) < 0) {
3873 spin_unlock(&vcpu->kvm->mmu_lock);
3874 return -ENOSPC;
3875 }
3876 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL,
3877 0, ACC_ALL);
3878 root = __pa(sp->spt);
3879 ++sp->root_count;
3880 spin_unlock(&vcpu->kvm->mmu_lock);
3881
3882 vcpu->arch.mmu->pae_root[i] = root | pm_mask;
3883 }
3884 vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
3885
	/*
	 * If we shadow a 32 bit page table with a long mode page
	 * table we enter this path.
	 */
	if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL) {
		if (vcpu->arch.mmu->lm_root == NULL) {
			/*
			 * The additional page necessary for this is only
			 * allocated on demand.
			 */
			u64 *lm_root;
3898
3899 lm_root = (void*)get_zeroed_page(GFP_KERNEL_ACCOUNT);
3900 if (lm_root == NULL)
3901 return 1;
3902
3903 lm_root[0] = __pa(vcpu->arch.mmu->pae_root) | pm_mask;
3904
3905 vcpu->arch.mmu->lm_root = lm_root;
3906 }
3907
3908 vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->lm_root);
3909 }
3910
3911set_root_cr3:
3912 vcpu->arch.mmu->root_cr3 = root_cr3;
3913
3914 return 0;
3915}
3916
3917static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
3918{
3919 if (vcpu->arch.mmu->direct_map)
3920 return mmu_alloc_direct_roots(vcpu);
3921 else
3922 return mmu_alloc_shadow_roots(vcpu);
3923}
3924
3925void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
3926{
3927 int i;
3928 struct kvm_mmu_page *sp;
3929
3930 if (vcpu->arch.mmu->direct_map)
3931 return;
3932
3933 if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
3934 return;
3935
3936 vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
3937
3938 if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
3939 hpa_t root = vcpu->arch.mmu->root_hpa;
3940 sp = page_header(root);

		/*
		 * Even if another CPU was marking the SP as unsync-ed
		 * simultaneously, any guest page table changes are not
		 * guaranteed to be visible anyway until this VCPU issues a TLB
		 * flush strictly after those changes are made.  We only need
		 * to ensure that the other CPU sets these flags before any
		 * actual changes to the page tables are made.  The comments
		 * in mmu_need_write_protect() describe what could go wrong if
		 * this requirement isn't satisfied.
		 */
		if (!smp_load_acquire(&sp->unsync) &&
		    !smp_load_acquire(&sp->unsync_children))
			return;
3955
3956 spin_lock(&vcpu->kvm->mmu_lock);
3957 kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
3958
3959 mmu_sync_children(vcpu, sp);
3960
3961 kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
3962 spin_unlock(&vcpu->kvm->mmu_lock);
3963 return;
3964 }
3965
3966 spin_lock(&vcpu->kvm->mmu_lock);
3967 kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
3968
3969 for (i = 0; i < 4; ++i) {
3970 hpa_t root = vcpu->arch.mmu->pae_root[i];
3971
3972 if (root && VALID_PAGE(root)) {
3973 root &= PT64_BASE_ADDR_MASK;
3974 sp = page_header(root);
3975 mmu_sync_children(vcpu, sp);
3976 }
3977 }
3978
3979 kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
3980 spin_unlock(&vcpu->kvm->mmu_lock);
3981}
3982EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots);
3983
3984static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
3985 u32 access, struct x86_exception *exception)
3986{
3987 if (exception)
3988 exception->error_code = 0;
3989 return vaddr;
3990}
3991
3992static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
3993 u32 access,
3994 struct x86_exception *exception)
3995{
3996 if (exception)
3997 exception->error_code = 0;
3998 return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access, exception);
3999}
4000
4001static bool
4002__is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check, u64 pte, int level)
4003{
4004 int bit7 = (pte >> 7) & 1, low6 = pte & 0x3f;
4005
4006 return (pte & rsvd_check->rsvd_bits_mask[bit7][level-1]) |
4007 ((rsvd_check->bad_mt_xwr & (1ull << low6)) != 0);
4008}
4009
4010static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
4011{
4012 return __is_rsvd_bits_set(&mmu->guest_rsvd_check, gpte, level);
4013}
4014
4015static bool is_shadow_zero_bits_set(struct kvm_mmu *mmu, u64 spte, int level)
4016{
4017 return __is_rsvd_bits_set(&mmu->shadow_zero_check, spte, level);
4018}
4019
4020static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
4021{
	/*
	 * A nested guest cannot use the MMIO cache if it is using nested
	 * page tables, because cr2 is a nGPA while the cache stores GPAs.
	 */
	if (mmu_is_nested(vcpu))
		return false;
4028
4029 if (direct)
4030 return vcpu_match_mmio_gpa(vcpu, addr);
4031
4032 return vcpu_match_mmio_gva(vcpu, addr);
4033}
4034
/* return true if a reserved bit is detected on the spte. */
static bool
walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
4038{
4039 struct kvm_shadow_walk_iterator iterator;
4040 u64 sptes[PT64_ROOT_MAX_LEVEL], spte = 0ull;
4041 int root, leaf;
4042 bool reserved = false;
4043
4044 if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
4045 goto exit;
4046
4047 walk_shadow_page_lockless_begin(vcpu);
4048
4049 for (shadow_walk_init(&iterator, vcpu, addr),
4050 leaf = root = iterator.level;
4051 shadow_walk_okay(&iterator);
4052 __shadow_walk_next(&iterator, spte)) {
4053 spte = mmu_spte_get_lockless(iterator.sptep);
4054
4055 sptes[leaf - 1] = spte;
4056 leaf--;
4057
4058 if (!is_shadow_present_pte(spte))
4059 break;
4060
4061 reserved |= is_shadow_zero_bits_set(vcpu->arch.mmu, spte,
4062 iterator.level);
4063 }
4064
4065 walk_shadow_page_lockless_end(vcpu);
4066
4067 if (reserved) {
4068 pr_err("%s: detect reserved bits on spte, addr 0x%llx, dump hierarchy:\n",
4069 __func__, addr);
4070 while (root > leaf) {
4071 pr_err("------ spte 0x%llx level %d.\n",
4072 sptes[root - 1], root);
4073 root--;
4074 }
4075 }
4076exit:
4077 *sptep = spte;
4078 return reserved;
4079}
4080
4081static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
4082{
4083 u64 spte;
4084 bool reserved;
4085
4086 if (mmio_info_in_cache(vcpu, addr, direct))
4087 return RET_PF_EMULATE;
4088
4089 reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte);
4090 if (WARN_ON(reserved))
4091 return -EINVAL;
4092
4093 if (is_mmio_spte(spte)) {
4094 gfn_t gfn = get_mmio_spte_gfn(spte);
4095 unsigned access = get_mmio_spte_access(spte);
4096
4097 if (!check_mmio_spte(vcpu, spte))
4098 return RET_PF_INVALID;
4099
4100 if (direct)
4101 addr = 0;
4102
4103 trace_handle_mmio_page_fault(addr, gfn, access);
4104 vcpu_cache_mmio_info(vcpu, addr, gfn, access);
4105 return RET_PF_EMULATE;
4106 }

	/*
	 * If the page table is zapped by other cpus, let the CPU fault again
	 * on the address.
	 */
	return RET_PF_RETRY;
4113}
4114
4115static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu,
4116 u32 error_code, gfn_t gfn)
4117{
4118 if (unlikely(error_code & PFERR_RSVD_MASK))
4119 return false;
4120
4121 if (!(error_code & PFERR_PRESENT_MASK) ||
4122 !(error_code & PFERR_WRITE_MASK))
4123 return false;

	/*
	 * The guest is writing a page that is write-tracked, which cannot
	 * be fixed by the page fault handler.
	 */
	if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
		return true;
4131
4132 return false;
4133}
4134
4135static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
4136{
4137 struct kvm_shadow_walk_iterator iterator;
4138 u64 spte;
4139
4140 if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
4141 return;
4142
4143 walk_shadow_page_lockless_begin(vcpu);
4144 for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
4145 clear_sp_write_flooding_count(iterator.sptep);
4146 if (!is_shadow_present_pte(spte))
4147 break;
4148 }
4149 walk_shadow_page_lockless_end(vcpu);
4150}
4151
4152static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
4153 u32 error_code, bool prefault)
4154{
4155 gfn_t gfn = gva >> PAGE_SHIFT;
4156 int r;
4157
4158 pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
4159
4160 if (page_fault_handle_page_track(vcpu, error_code, gfn))
4161 return RET_PF_EMULATE;
4162
4163 r = mmu_topup_memory_caches(vcpu);
4164 if (r)
4165 return r;
4166
4167 MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa));
4168
4169
4170 return nonpaging_map(vcpu, gva & PAGE_MASK,
4171 error_code, gfn, prefault);
4172}
4173
4174static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
4175{
4176 struct kvm_arch_async_pf arch;
4177
4178 arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
4179 arch.gfn = gfn;
4180 arch.direct_map = vcpu->arch.mmu->direct_map;
4181 arch.cr3 = vcpu->arch.mmu->get_cr3(vcpu);
4182
4183 return kvm_setup_async_pf(vcpu, gva, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
4184}
4185
4186static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
4187 gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable)
4188{
4189 struct kvm_memory_slot *slot;
4190 bool async;

	/* Don't expose private memslots to L2. */
	if (is_guest_mode(vcpu) && !kvm_is_visible_gfn(vcpu->kvm, gfn)) {
4196 *pfn = KVM_PFN_NOSLOT;
4197 return false;
4198 }
4199
4200 slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
4201 async = false;
4202 *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable);
4203 if (!async)
4204 return false;
4205
4206 if (!prefault && kvm_can_do_async_pf(vcpu)) {
4207 trace_kvm_try_async_get_page(gva, gfn);
4208 if (kvm_find_async_pf_gfn(vcpu, gfn)) {
4209 trace_kvm_async_pf_doublefault(gva, gfn);
4210 kvm_make_request(KVM_REQ_APF_HALT, vcpu);
4211 return true;
4212 } else if (kvm_arch_setup_async_pf(vcpu, gva, gfn))
4213 return true;
4214 }
4215
4216 *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable);
4217 return false;
4218}
4219
4220int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
4221 u64 fault_address, char *insn, int insn_len)
4222{
4223 int r = 1;
4224
4225 vcpu->arch.l1tf_flush_l1d = true;
4226 switch (vcpu->arch.apf.host_apf_reason) {
4227 default:
4228 trace_kvm_page_fault(fault_address, error_code);
4229
4230 if (kvm_event_needs_reinjection(vcpu))
4231 kvm_mmu_unprotect_page_virt(vcpu, fault_address);
4232 r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn,
4233 insn_len);
4234 break;
4235 case KVM_PV_REASON_PAGE_NOT_PRESENT:
4236 vcpu->arch.apf.host_apf_reason = 0;
4237 local_irq_disable();
4238 kvm_async_pf_task_wait(fault_address, 0);
4239 local_irq_enable();
4240 break;
4241 case KVM_PV_REASON_PAGE_READY:
4242 vcpu->arch.apf.host_apf_reason = 0;
4243 local_irq_disable();
4244 kvm_async_pf_task_wake(fault_address);
4245 local_irq_enable();
4246 break;
4247 }
4248 return r;
4249}
4250EXPORT_SYMBOL_GPL(kvm_handle_page_fault);
4251
4252static bool
4253check_hugepage_cache_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int level)
4254{
4255 int page_num = KVM_PAGES_PER_HPAGE(level);
4256
4257 gfn &= ~(page_num - 1);
4258
4259 return kvm_mtrr_check_gfn_range_consistency(vcpu, gfn, page_num);
4260}
4261
4262static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
4263 bool prefault)
4264{
4265 kvm_pfn_t pfn;
4266 int r;
4267 int level;
4268 bool force_pt_level;
4269 gfn_t gfn = gpa >> PAGE_SHIFT;
4270 unsigned long mmu_seq;
4271 int write = error_code & PFERR_WRITE_MASK;
4272 bool map_writable;
4273 bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
4274 is_nx_huge_page_enabled();
4275
4276 MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa));
4277
4278 if (page_fault_handle_page_track(vcpu, error_code, gfn))
4279 return RET_PF_EMULATE;
4280
4281 r = mmu_topup_memory_caches(vcpu);
4282 if (r)
4283 return r;
4284
4285 force_pt_level =
4286 lpage_disallowed ||
4287 !check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL);
4288 level = mapping_level(vcpu, gfn, &force_pt_level);
4289 if (likely(!force_pt_level)) {
4290 if (level > PT_DIRECTORY_LEVEL &&
4291 !check_hugepage_cache_consistency(vcpu, gfn, level))
4292 level = PT_DIRECTORY_LEVEL;
4293 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
4294 }
4295
4296 if (fast_page_fault(vcpu, gpa, level, error_code))
4297 return RET_PF_RETRY;
4298
4299 mmu_seq = vcpu->kvm->mmu_notifier_seq;
4300 smp_rmb();
4301
4302 if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
4303 return RET_PF_RETRY;
4304
4305 if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r))
4306 return r;
4307
4308 r = RET_PF_RETRY;
4309 spin_lock(&vcpu->kvm->mmu_lock);
4310 if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
4311 goto out_unlock;
4312 if (make_mmu_pages_available(vcpu) < 0)
4313 goto out_unlock;
4314 if (likely(!force_pt_level))
4315 transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
4316 r = __direct_map(vcpu, gpa, write, map_writable, level, pfn,
4317 prefault, lpage_disallowed);
4318out_unlock:
4319 spin_unlock(&vcpu->kvm->mmu_lock);
4320 kvm_release_pfn_clean(pfn);
4321 return r;
4322}
4323
4324static void nonpaging_init_context(struct kvm_vcpu *vcpu,
4325 struct kvm_mmu *context)
4326{
4327 context->page_fault = nonpaging_page_fault;
4328 context->gva_to_gpa = nonpaging_gva_to_gpa;
4329 context->sync_page = nonpaging_sync_page;
4330 context->invlpg = nonpaging_invlpg;
4331 context->update_pte = nonpaging_update_pte;
4332 context->root_level = 0;
4333 context->shadow_root_level = PT32E_ROOT_LEVEL;
4334 context->direct_map = true;
4335 context->nx = false;
4336}
4337
/*
 * Find out if a previously cached root matching the new CR3/role is available.
 * The current root is also inserted into the cache.
 * If a matching root was found, it is assigned to kvm_mmu->root_hpa and true
 * is returned.
 * Otherwise, the LRU root from the cache is assigned to kvm_mmu->root_hpa and
 * false is returned.  This root should now be freed by the caller.
 */
static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_cr3,
				  union kvm_mmu_page_role new_role)
4348{
4349 uint i;
4350 struct kvm_mmu_root_info root;
4351 struct kvm_mmu *mmu = vcpu->arch.mmu;
4352
4353 root.cr3 = mmu->root_cr3;
4354 root.hpa = mmu->root_hpa;
4355
4356 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
4357 swap(root, mmu->prev_roots[i]);
4358
4359 if (new_cr3 == root.cr3 && VALID_PAGE(root.hpa) &&
4360 page_header(root.hpa) != NULL &&
4361 new_role.word == page_header(root.hpa)->role.word)
4362 break;
4363 }
4364
4365 mmu->root_hpa = root.hpa;
4366 mmu->root_cr3 = root.cr3;
4367
4368 return i < KVM_MMU_NUM_PREV_ROOTS;
4369}
4370
4371static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3,
4372 union kvm_mmu_page_role new_role,
4373 bool skip_tlb_flush)
4374{
4375 struct kvm_mmu *mmu = vcpu->arch.mmu;

	/*
	 * For now, limit the fast switch to 64-bit hosts+VMs in order to avoid
	 * having to deal with PDPTEs.  We may add support for 32-bit hosts/VMs
	 * later if necessary.
	 */
	if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
	    mmu->root_level >= PT64_ROOT_4LEVEL) {
4384 if (mmu_check_root(vcpu, new_cr3 >> PAGE_SHIFT))
4385 return false;
4386
		if (cached_root_available(vcpu, new_cr3, new_role)) {
			/*
			 * It is possible that the cached previous root page is
			 * obsolete because of a change in the MMU generation
			 * number.  However, changing the generation number is
			 * accompanied by KVM_REQ_MMU_RELOAD, which will free
			 * the root set here and allocate a new one.
			 */
			kvm_make_request(KVM_REQ_LOAD_CR3, vcpu);
4396 if (!skip_tlb_flush) {
4397 kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
4398 kvm_x86_ops->tlb_flush(vcpu, true);
4399 }

			/*
			 * The last MMIO access's GVA and GPA are cached in the
			 * VCPU.  When switching to a new CR3, that GVA->GPA
			 * mapping may no longer be valid.  So clear any cached
			 * MMIO info even when we don't need to sync the shadow
			 * page tables.
			 */
			vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
4409
4410 __clear_sp_write_flooding_count(
4411 page_header(mmu->root_hpa));
4412
4413 return true;
4414 }
4415 }
4416
4417 return false;
4418}
4419
4420static void __kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3,
4421 union kvm_mmu_page_role new_role,
4422 bool skip_tlb_flush)
4423{
4424 if (!fast_cr3_switch(vcpu, new_cr3, new_role, skip_tlb_flush))
4425 kvm_mmu_free_roots(vcpu, vcpu->arch.mmu,
4426 KVM_MMU_ROOT_CURRENT);
4427}
4428
4429void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush)
4430{
4431 __kvm_mmu_new_cr3(vcpu, new_cr3, kvm_mmu_calc_root_page_role(vcpu),
4432 skip_tlb_flush);
4433}
4434EXPORT_SYMBOL_GPL(kvm_mmu_new_cr3);
4435
4436static unsigned long get_cr3(struct kvm_vcpu *vcpu)
4437{
4438 return kvm_read_cr3(vcpu);
4439}
4440
4441static void inject_page_fault(struct kvm_vcpu *vcpu,
4442 struct x86_exception *fault)
4443{
4444 vcpu->arch.mmu->inject_page_fault(vcpu, fault);
4445}
4446
4447static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
4448 unsigned access, int *nr_present)
4449{
4450 if (unlikely(is_mmio_spte(*sptep))) {
4451 if (gfn != get_mmio_spte_gfn(*sptep)) {
4452 mmu_spte_clear_no_track(sptep);
4453 return true;
4454 }
4455
4456 (*nr_present)++;
4457 mark_mmio_spte(vcpu, sptep, gfn, access);
4458 return true;
4459 }
4460
4461 return false;
4462}
4463
4464static inline bool is_last_gpte(struct kvm_mmu *mmu,
4465 unsigned level, unsigned gpte)
4466{
	/*
	 * The RHS has bit 7 set iff level < mmu->last_nonleaf_level.
	 * If it is clear, there are no large pages at this level, so clear
	 * PT_PAGE_SIZE_MASK in gpte if that is the case.
	 */
	gpte &= level - mmu->last_nonleaf_level;

	/*
	 * PT_PAGE_TABLE_LEVEL always terminates.  The RHS has bit 7 set
	 * iff level <= PT_PAGE_TABLE_LEVEL, which for our purpose means
	 * level == PT_PAGE_TABLE_LEVEL; set PT_PAGE_SIZE_MASK in gpte then.
	 */
	gpte |= level - PT_PAGE_TABLE_LEVEL - 1;
4480
4481 return gpte & PT_PAGE_SIZE_MASK;
4482}
4483
4484#define PTTYPE_EPT 18
4485#define PTTYPE PTTYPE_EPT
4486#include "paging_tmpl.h"
4487#undef PTTYPE
4488
4489#define PTTYPE 64
4490#include "paging_tmpl.h"
4491#undef PTTYPE
4492
4493#define PTTYPE 32
4494#include "paging_tmpl.h"
4495#undef PTTYPE
4496
4497static void
4498__reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
4499 struct rsvd_bits_validate *rsvd_check,
4500 int maxphyaddr, int level, bool nx, bool gbpages,
4501 bool pse, bool amd)
4502{
4503 u64 exb_bit_rsvd = 0;
4504 u64 gbpages_bit_rsvd = 0;
4505 u64 nonleaf_bit8_rsvd = 0;
4506
4507 rsvd_check->bad_mt_xwr = 0;
4508
4509 if (!nx)
4510 exb_bit_rsvd = rsvd_bits(63, 63);
4511 if (!gbpages)
4512 gbpages_bit_rsvd = rsvd_bits(7, 7);
4513
	/*
	 * Non-leaf PML4Es and PDPEs reserve bit 8 (which would otherwise be
	 * the G bit for leaf entries) on AMD CPUs only.
	 */
	if (amd)
		nonleaf_bit8_rsvd = rsvd_bits(8, 8);
4520
4521 switch (level) {
4522 case PT32_ROOT_LEVEL:
4523
4524 rsvd_check->rsvd_bits_mask[0][1] = 0;
4525 rsvd_check->rsvd_bits_mask[0][0] = 0;
4526 rsvd_check->rsvd_bits_mask[1][0] =
4527 rsvd_check->rsvd_bits_mask[0][0];
4528
4529 if (!pse) {
4530 rsvd_check->rsvd_bits_mask[1][1] = 0;
4531 break;
4532 }
4533
4534 if (is_cpuid_PSE36())
4535
4536 rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
4537 else
4538
4539 rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
4540 break;
4541 case PT32E_ROOT_LEVEL:
4542 rsvd_check->rsvd_bits_mask[0][2] =
4543 rsvd_bits(maxphyaddr, 63) |
4544 rsvd_bits(5, 8) | rsvd_bits(1, 2);
4545 rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd |
4546 rsvd_bits(maxphyaddr, 62);
4547 rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd |
4548 rsvd_bits(maxphyaddr, 62);
4549 rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd |
4550 rsvd_bits(maxphyaddr, 62) |
4551 rsvd_bits(13, 20);
4552 rsvd_check->rsvd_bits_mask[1][0] =
4553 rsvd_check->rsvd_bits_mask[0][0];
4554 break;
4555 case PT64_ROOT_5LEVEL:
4556 rsvd_check->rsvd_bits_mask[0][4] = exb_bit_rsvd |
4557 nonleaf_bit8_rsvd | rsvd_bits(7, 7) |
4558 rsvd_bits(maxphyaddr, 51);
4559 rsvd_check->rsvd_bits_mask[1][4] =
			rsvd_check->rsvd_bits_mask[0][4];
		/* fall through */
	case PT64_ROOT_4LEVEL:
4563 rsvd_check->rsvd_bits_mask[0][3] = exb_bit_rsvd |
4564 nonleaf_bit8_rsvd | rsvd_bits(7, 7) |
4565 rsvd_bits(maxphyaddr, 51);
4566 rsvd_check->rsvd_bits_mask[0][2] = exb_bit_rsvd |
4567 nonleaf_bit8_rsvd | gbpages_bit_rsvd |
4568 rsvd_bits(maxphyaddr, 51);
4569 rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd |
4570 rsvd_bits(maxphyaddr, 51);
4571 rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd |
4572 rsvd_bits(maxphyaddr, 51);
4573 rsvd_check->rsvd_bits_mask[1][3] =
4574 rsvd_check->rsvd_bits_mask[0][3];
4575 rsvd_check->rsvd_bits_mask[1][2] = exb_bit_rsvd |
4576 gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51) |
4577 rsvd_bits(13, 29);
4578 rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd |
4579 rsvd_bits(maxphyaddr, 51) |
4580 rsvd_bits(13, 20);
4581 rsvd_check->rsvd_bits_mask[1][0] =
4582 rsvd_check->rsvd_bits_mask[0][0];
4583 break;
4584 }
4585}
4586
4587static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
4588 struct kvm_mmu *context)
4589{
4590 __reset_rsvds_bits_mask(vcpu, &context->guest_rsvd_check,
4591 cpuid_maxphyaddr(vcpu), context->root_level,
4592 context->nx,
4593 guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
4594 is_pse(vcpu), guest_cpuid_is_amd(vcpu));
4595}
4596
4597static void
4598__reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
4599 int maxphyaddr, bool execonly)
4600{
4601 u64 bad_mt_xwr;
4602
4603 rsvd_check->rsvd_bits_mask[0][4] =
4604 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
4605 rsvd_check->rsvd_bits_mask[0][3] =
4606 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
4607 rsvd_check->rsvd_bits_mask[0][2] =
4608 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
4609 rsvd_check->rsvd_bits_mask[0][1] =
4610 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
4611 rsvd_check->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51);
4612
4613
4614 rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4];
4615 rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3];
4616 rsvd_check->rsvd_bits_mask[1][2] =
4617 rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29);
4618 rsvd_check->rsvd_bits_mask[1][1] =
4619 rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20);
4620 rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0];
4621
4622 bad_mt_xwr = 0xFFull << (2 * 8);
4623 bad_mt_xwr |= 0xFFull << (3 * 8);
4624 bad_mt_xwr |= 0xFFull << (7 * 8);
4625 bad_mt_xwr |= REPEAT_BYTE(1ull << 2);
4626 bad_mt_xwr |= REPEAT_BYTE(1ull << 6);
4627 if (!execonly) {
4628
4629 bad_mt_xwr |= REPEAT_BYTE(1ull << 4);
4630 }
4631 rsvd_check->bad_mt_xwr = bad_mt_xwr;
4632}
4633
4634static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
4635 struct kvm_mmu *context, bool execonly)
4636{
4637 __reset_rsvds_bits_mask_ept(&context->guest_rsvd_check,
4638 cpuid_maxphyaddr(vcpu), execonly);
4639}
4640
/*
 * The page table on the host is the shadow page table for the page table in
 * the guest (or an AMD nested guest); its reserved-bit checks therefore
 * follow the guest's MMU features.
 */
void
reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
4648{
4649 bool uses_nx = context->nx ||
4650 context->mmu_role.base.smep_andnot_wp;
4651 struct rsvd_bits_validate *shadow_zero_check;
4652 int i;

	/*
	 * Passing "true" to the last argument is okay; it adds a check
	 * on bit 8 of the SPTEs which KVM doesn't use anyway.
	 */
	shadow_zero_check = &context->shadow_zero_check;
4659 __reset_rsvds_bits_mask(vcpu, shadow_zero_check,
4660 shadow_phys_bits,
4661 context->shadow_root_level, uses_nx,
4662 guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
4663 is_pse(vcpu), true);
4664
4665 if (!shadow_me_mask)
4666 return;
4667
4668 for (i = context->shadow_root_level; --i >= 0;) {
4669 shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
4670 shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
4671 }
4672
4673}
4674EXPORT_SYMBOL_GPL(reset_shadow_zero_bits_mask);
4675
4676static inline bool boot_cpu_is_amd(void)
4677{
4678 WARN_ON_ONCE(!tdp_enabled);
4679 return shadow_x_mask == 0;
4680}
4681
/*
 * The direct page table on the host: use as many hardware MMU features
 * as possible; however, KVM currently does not do execution-protection here.
 */
4686static void
4687reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
4688 struct kvm_mmu *context)
4689{
4690 struct rsvd_bits_validate *shadow_zero_check;
4691 int i;
4692
4693 shadow_zero_check = &context->shadow_zero_check;
4694
4695 if (boot_cpu_is_amd())
4696 __reset_rsvds_bits_mask(vcpu, shadow_zero_check,
4697 shadow_phys_bits,
4698 context->shadow_root_level, false,
4699 boot_cpu_has(X86_FEATURE_GBPAGES),
4700 true, true);
4701 else
4702 __reset_rsvds_bits_mask_ept(shadow_zero_check,
4703 shadow_phys_bits,
4704 false);
4705
4706 if (!shadow_me_mask)
4707 return;
4708
4709 for (i = context->shadow_root_level; --i >= 0;) {
4710 shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
4711 shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
4712 }
4713}
4714
/*
 * As reset_shadow_zero_bits_mask() above, except this is the shadow
 * page table (EPT) used for an Intel nested guest.
 */
4719static void
4720reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
4721 struct kvm_mmu *context, bool execonly)
4722{
4723 __reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
4724 shadow_phys_bits, execonly);
4725}
4726
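/*
 * BYTE_MASK(access) builds a byte in which bit N (for each of the seven
 * non-zero 3-bit exec/write/user combinations N) is set iff that combination
 * includes the given ACC_* permission.  The x, w and u bytes derived below
 * are then combined, per page-fault error code, into a bitmap of which
 * permission combinations fault.
 */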
4727#define BYTE_MASK(access) \
4728 ((1 & (access) ? 2 : 0) | \
4729 (2 & (access) ? 4 : 0) | \
4730 (3 & (access) ? 8 : 0) | \
4731 (4 & (access) ? 16 : 0) | \
4732 (5 & (access) ? 32 : 0) | \
4733 (6 & (access) ? 64 : 0) | \
4734 (7 & (access) ? 128 : 0))
4735
4736
4737static void update_permission_bitmask(struct kvm_vcpu *vcpu,
4738 struct kvm_mmu *mmu, bool ept)
4739{
4740 unsigned byte;
4741
4742 const u8 x = BYTE_MASK(ACC_EXEC_MASK);
4743 const u8 w = BYTE_MASK(ACC_WRITE_MASK);
4744 const u8 u = BYTE_MASK(ACC_USER_MASK);
4745
4746 bool cr4_smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP) != 0;
4747 bool cr4_smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP) != 0;
4748 bool cr0_wp = is_write_protection(vcpu);
4749
4750 for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) {
4751 unsigned pfec = byte << 1;
4752
		/*
		 * Each "*f" variable has a 1 bit for each UWX value
		 * that causes a fault with the given PFEC.
		 */

		/* Faults from writes to non-writable pages */
		u8 wf = (pfec & PFERR_WRITE_MASK) ? (u8)~w : 0;
		/* Faults from user mode accesses to non-user pages */
		u8 uf = (pfec & PFERR_USER_MASK) ? (u8)~u : 0;
		/* Faults from fetches of non-executable pages */
		u8 ff = (pfec & PFERR_FETCH_MASK) ? (u8)~x : 0;
		/* Faults from kernel mode fetches of user pages */
		u8 smepf = 0;
		/* Faults from kernel mode accesses of user pages */
		u8 smapf = 0;

		if (!ept) {
			/* Faults from kernel mode accesses to user pages */
			u8 kf = (pfec & PFERR_USER_MASK) ? 0 : u;

			/* Not really needed: !nx will cause pte.nx to fault */
			if (!mmu->nx)
				ff = 0;

			/* Allow supervisor writes if !cr0.wp */
			if (!cr0_wp)
				wf = (pfec & PFERR_USER_MASK) ? wf : 0;

			/* Disallow supervisor fetches of user code if cr4.smep */
			if (cr4_smep)
				smepf = (pfec & PFERR_FETCH_MASK) ? kf : 0;

			/*
			 * SMAP: kernel-mode data accesses from user-mode
			 * mappings should fault.  A fault is considered a
			 * SMAP violation if all of the following conditions
			 * are true:
			 *   - X86_CR4_SMAP is set in CR4
			 *   - A user page is accessed
			 *   - The access is not a fetch
			 *   - The page fault is in kernel mode
			 *   - CPL = 3 or X86_EFLAGS_AC is clear
			 *
			 * Here we cover the first three conditions; the last
			 * two are handled in permission_fault(), which sets
			 * PFERR_RSVD_MASK in the synthesized PFEC when the
			 * access is *not* subject to SMAP restrictions.
			 */
4801 if (cr4_smap)
4802 smapf = (pfec & (PFERR_RSVD_MASK|PFERR_FETCH_MASK)) ? 0 : kf;
4803 }
4804
4805 mmu->permissions[byte] = ff | uf | wf | smepf | smapf;
4806 }
4807}
4808
/*
 * PKU is an additional mechanism by which the paging controls access to
 * user-mode addresses based on the value in the PKRU register.  Protection
 * key violations are reported through a bit in the page fault error code.
 * Unlike other bits of the error code, the PK bit is not known at the
 * call site of e.g. gva_to_gpa; it must be computed directly in
 * permission_fault based on two bits of PKRU, on CR4.PKE, and on CR0.WP.
 *
 * In particular the following conditions come from the error code, the
 * page tables and the machine state:
 * - PK is always zero unless CR4.PKE=1 and EFER.LMA=1
 * - PK is always zero if RSVD=1 (reserved bit set) or F=1 (instruction fetch)
 * - PK is always zero if U=0 in the page tables
 * - PKRU.WD is ignored if CR0.WP=0 and the access is a supervisor access.
 *
 * The PKRU bitmask caches the result of these four conditions.  The error
 * code (minus the P bit) and the page table's U bit form an index into the
 * PKRU bitmask.  Two bits of the PKRU bitmask are then extracted and ANDed
 * with the two bits of the PKRU register corresponding to the protection key.
 * For the first three conditions above the bits will be 00, thus masking
 * away both AD and WD.  For all reads or if the last condition holds, WD
 * only will be masked away.
 */
4833static void update_pkru_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
4834 bool ept)
4835{
4836 unsigned bit;
4837 bool wp;
4838
4839 if (ept) {
4840 mmu->pkru_mask = 0;
4841 return;
4842 }
4843
	/* PKEY is enabled only if CR4.PKE and EFER.LMA are both set. */
4845 if (!kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || !is_long_mode(vcpu)) {
4846 mmu->pkru_mask = 0;
4847 return;
4848 }
4849
4850 wp = is_write_protection(vcpu);
4851
4852 for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) {
4853 unsigned pfec, pkey_bits;
4854 bool check_pkey, check_write, ff, uf, wf, pte_user;
4855
4856 pfec = bit << 1;
4857 ff = pfec & PFERR_FETCH_MASK;
4858 uf = pfec & PFERR_USER_MASK;
4859 wf = pfec & PFERR_WRITE_MASK;

		/* PFEC.RSVD is replaced by ACC_USER_MASK. */
		pte_user = pfec & PFERR_RSVD_MASK;

		/*
		 * Only need to check the access which is not an
		 * instruction fetch and is to a user page.
		 */
		check_pkey = (!ff && pte_user);
		/*
		 * Write access is controlled by PKRU if it is a
		 * user access or CR0.WP = 1.
		 */
		check_write = check_pkey && wf && (uf || wp);

		/* PKRU.AD stops both read and write access. */
		pkey_bits = !!check_pkey;
		/* PKRU.WD stops write access. */
		pkey_bits |= (!!check_write) << 1;
4879
4880 mmu->pkru_mask |= (pkey_bits & 3) << pfec;
4881 }
4882}
4883
4884static void update_last_nonleaf_level(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
4885{
4886 unsigned root_level = mmu->root_level;
4887
4888 mmu->last_nonleaf_level = root_level;
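	/*
	 * With 32-bit non-PAE paging and CR4.PSE set, a PDE can itself map a
	 * 4MB page, so the PDE level is no longer guaranteed to be non-leaf.
	 */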
4889 if (root_level == PT32_ROOT_LEVEL && is_pse(vcpu))
4890 mmu->last_nonleaf_level++;
4891}
4892
4893static void paging64_init_context_common(struct kvm_vcpu *vcpu,
4894 struct kvm_mmu *context,
4895 int level)
4896{
4897 context->nx = is_nx(vcpu);
4898 context->root_level = level;
4899
4900 reset_rsvds_bits_mask(vcpu, context);
4901 update_permission_bitmask(vcpu, context, false);
4902 update_pkru_bitmask(vcpu, context, false);
4903 update_last_nonleaf_level(vcpu, context);
4904
4905 MMU_WARN_ON(!is_pae(vcpu));
4906 context->page_fault = paging64_page_fault;
4907 context->gva_to_gpa = paging64_gva_to_gpa;
4908 context->sync_page = paging64_sync_page;
4909 context->invlpg = paging64_invlpg;
4910 context->update_pte = paging64_update_pte;
4911 context->shadow_root_level = level;
4912 context->direct_map = false;
4913}
4914
4915static void paging64_init_context(struct kvm_vcpu *vcpu,
4916 struct kvm_mmu *context)
4917{
4918 int root_level = is_la57_mode(vcpu) ?
4919 PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
4920
4921 paging64_init_context_common(vcpu, context, root_level);
4922}
4923
4924static void paging32_init_context(struct kvm_vcpu *vcpu,
4925 struct kvm_mmu *context)
4926{
4927 context->nx = false;
4928 context->root_level = PT32_ROOT_LEVEL;
4929
4930 reset_rsvds_bits_mask(vcpu, context);
4931 update_permission_bitmask(vcpu, context, false);
4932 update_pkru_bitmask(vcpu, context, false);
4933 update_last_nonleaf_level(vcpu, context);
4934
4935 context->page_fault = paging32_page_fault;
4936 context->gva_to_gpa = paging32_gva_to_gpa;
4937 context->sync_page = paging32_sync_page;
4938 context->invlpg = paging32_invlpg;
4939 context->update_pte = paging32_update_pte;
4940 context->shadow_root_level = PT32E_ROOT_LEVEL;
4941 context->direct_map = false;
4942}
4943
4944static void paging32E_init_context(struct kvm_vcpu *vcpu,
4945 struct kvm_mmu *context)
4946{
4947 paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL);
4948}
4949
4950static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu)
4951{
4952 union kvm_mmu_extended_role ext = {0};
4953
4954 ext.cr0_pg = !!is_paging(vcpu);
4955 ext.cr4_pae = !!is_pae(vcpu);
4956 ext.cr4_smep = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
4957 ext.cr4_smap = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMAP);
4958 ext.cr4_pse = !!is_pse(vcpu);
4959 ext.cr4_pke = !!kvm_read_cr4_bits(vcpu, X86_CR4_PKE);
4960 ext.cr4_la57 = !!kvm_read_cr4_bits(vcpu, X86_CR4_LA57);
4961 ext.maxphyaddr = cpuid_maxphyaddr(vcpu);
4962
4963 ext.valid = 1;
4964
4965 return ext;
4966}
4967
4968static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu,
4969 bool base_only)
4970{
4971 union kvm_mmu_role role = {0};
4972
4973 role.base.access = ACC_ALL;
4974 role.base.nxe = !!is_nx(vcpu);
4975 role.base.cr0_wp = is_write_protection(vcpu);
4976 role.base.smm = is_smm(vcpu);
4977 role.base.guest_mode = is_guest_mode(vcpu);
4978
4979 if (base_only)
4980 return role;
4981
4982 role.ext = kvm_calc_mmu_role_ext(vcpu);
4983
4984 return role;
4985}
4986
4987static union kvm_mmu_role
4988kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
4989{
4990 union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only);
4991
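	/*
	 * shadow_accessed_mask is zero only when hardware A/D bits are
	 * unavailable (EPT without A/D support), so disable A/D tracking
	 * for TDP roles in that case.
	 */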
4992 role.base.ad_disabled = (shadow_accessed_mask == 0);
4993 role.base.level = kvm_x86_ops->get_tdp_level(vcpu);
4994 role.base.direct = true;
4995 role.base.gpte_is_8_bytes = true;
4996
4997 return role;
4998}
4999
5000static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
5001{
5002 struct kvm_mmu *context = vcpu->arch.mmu;
5003 union kvm_mmu_role new_role =
5004 kvm_calc_tdp_mmu_root_page_role(vcpu, false);
5005
5006 new_role.base.word &= mmu_base_role_mask.word;
5007 if (new_role.as_u64 == context->mmu_role.as_u64)
5008 return;
5009
5010 context->mmu_role.as_u64 = new_role.as_u64;
5011 context->page_fault = tdp_page_fault;
5012 context->sync_page = nonpaging_sync_page;
5013 context->invlpg = nonpaging_invlpg;
5014 context->update_pte = nonpaging_update_pte;
5015 context->shadow_root_level = kvm_x86_ops->get_tdp_level(vcpu);
5016 context->direct_map = true;
5017 context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
5018 context->get_cr3 = get_cr3;
5019 context->get_pdptr = kvm_pdptr_read;
5020 context->inject_page_fault = kvm_inject_page_fault;
5021
5022 if (!is_paging(vcpu)) {
5023 context->nx = false;
5024 context->gva_to_gpa = nonpaging_gva_to_gpa;
5025 context->root_level = 0;
5026 } else if (is_long_mode(vcpu)) {
5027 context->nx = is_nx(vcpu);
5028 context->root_level = is_la57_mode(vcpu) ?
5029 PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
5030 reset_rsvds_bits_mask(vcpu, context);
5031 context->gva_to_gpa = paging64_gva_to_gpa;
5032 } else if (is_pae(vcpu)) {
5033 context->nx = is_nx(vcpu);
5034 context->root_level = PT32E_ROOT_LEVEL;
5035 reset_rsvds_bits_mask(vcpu, context);
5036 context->gva_to_gpa = paging64_gva_to_gpa;
5037 } else {
5038 context->nx = false;
5039 context->root_level = PT32_ROOT_LEVEL;
5040 reset_rsvds_bits_mask(vcpu, context);
5041 context->gva_to_gpa = paging32_gva_to_gpa;
5042 }
5043
5044 update_permission_bitmask(vcpu, context, false);
5045 update_pkru_bitmask(vcpu, context, false);
5046 update_last_nonleaf_level(vcpu, context);
5047 reset_tdp_shadow_zero_bits_mask(vcpu, context);
5048}
5049
5050static union kvm_mmu_role
5051kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
5052{
5053 union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only);
5054
5055 role.base.smep_andnot_wp = role.ext.cr4_smep &&
5056 !is_write_protection(vcpu);
5057 role.base.smap_andnot_wp = role.ext.cr4_smap &&
5058 !is_write_protection(vcpu);
5059 role.base.direct = !is_paging(vcpu);
5060 role.base.gpte_is_8_bytes = !!is_pae(vcpu);
5061
5062 if (!is_long_mode(vcpu))
5063 role.base.level = PT32E_ROOT_LEVEL;
5064 else if (is_la57_mode(vcpu))
5065 role.base.level = PT64_ROOT_5LEVEL;
5066 else
5067 role.base.level = PT64_ROOT_4LEVEL;
5068
5069 return role;
5070}
5071
5072void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
5073{
5074 struct kvm_mmu *context = vcpu->arch.mmu;
5075 union kvm_mmu_role new_role =
5076 kvm_calc_shadow_mmu_root_page_role(vcpu, false);
5077
5078 new_role.base.word &= mmu_base_role_mask.word;
5079 if (new_role.as_u64 == context->mmu_role.as_u64)
5080 return;
5081
5082 if (!is_paging(vcpu))
5083 nonpaging_init_context(vcpu, context);
5084 else if (is_long_mode(vcpu))
5085 paging64_init_context(vcpu, context);
5086 else if (is_pae(vcpu))
5087 paging32E_init_context(vcpu, context);
5088 else
5089 paging32_init_context(vcpu, context);
5090
5091 context->mmu_role.as_u64 = new_role.as_u64;
5092 reset_shadow_zero_bits_mask(vcpu, context);
5093}
5094EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
5095
5096static union kvm_mmu_role
5097kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
5098 bool execonly)
5099{
5100 union kvm_mmu_role role = {0};
5101
	/* SMM flag is inherited from root_mmu */
5103 role.base.smm = vcpu->arch.root_mmu.mmu_role.base.smm;
5104
5105 role.base.level = PT64_ROOT_4LEVEL;
5106 role.base.gpte_is_8_bytes = true;
5107 role.base.direct = false;
5108 role.base.ad_disabled = !accessed_dirty;
5109 role.base.guest_mode = true;
5110 role.base.access = ACC_ALL;
5111
	/*
	 * WP=1 and NOT_WP=1 is an impossible combination, use WP and the
	 * SMAP variation to denote shadow EPT entries.
	 */
5116 role.base.cr0_wp = true;
5117 role.base.smap_andnot_wp = true;
5118
5119 role.ext = kvm_calc_mmu_role_ext(vcpu);
5120 role.ext.execonly = execonly;
5121
5122 return role;
5123}
5124
5125void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
5126 bool accessed_dirty, gpa_t new_eptp)
5127{
5128 struct kvm_mmu *context = vcpu->arch.mmu;
5129 union kvm_mmu_role new_role =
5130 kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty,
5131 execonly);
5132
5133 __kvm_mmu_new_cr3(vcpu, new_eptp, new_role.base, false);
5134
5135 new_role.base.word &= mmu_base_role_mask.word;
5136 if (new_role.as_u64 == context->mmu_role.as_u64)
5137 return;
5138
5139 context->shadow_root_level = PT64_ROOT_4LEVEL;
5140
5141 context->nx = true;
5142 context->ept_ad = accessed_dirty;
5143 context->page_fault = ept_page_fault;
5144 context->gva_to_gpa = ept_gva_to_gpa;
5145 context->sync_page = ept_sync_page;
5146 context->invlpg = ept_invlpg;
5147 context->update_pte = ept_update_pte;
5148 context->root_level = PT64_ROOT_4LEVEL;
5149 context->direct_map = false;
5150 context->mmu_role.as_u64 = new_role.as_u64;
5151
5152 update_permission_bitmask(vcpu, context, true);
5153 update_pkru_bitmask(vcpu, context, true);
5154 update_last_nonleaf_level(vcpu, context);
5155 reset_rsvds_bits_mask_ept(vcpu, context, execonly);
5156 reset_ept_shadow_zero_bits_mask(vcpu, context, execonly);
5157}
5158EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
5159
5160static void init_kvm_softmmu(struct kvm_vcpu *vcpu)
5161{
5162 struct kvm_mmu *context = vcpu->arch.mmu;
5163
5164 kvm_init_shadow_mmu(vcpu);
5165 context->set_cr3 = kvm_x86_ops->set_cr3;
5166 context->get_cr3 = get_cr3;
5167 context->get_pdptr = kvm_pdptr_read;
5168 context->inject_page_fault = kvm_inject_page_fault;
5169}
5170
5171static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
5172{
5173 union kvm_mmu_role new_role = kvm_calc_mmu_role_common(vcpu, false);
5174 struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
5175
5176 new_role.base.word &= mmu_base_role_mask.word;
5177 if (new_role.as_u64 == g_context->mmu_role.as_u64)
5178 return;
5179
5180 g_context->mmu_role.as_u64 = new_role.as_u64;
5181 g_context->get_cr3 = get_cr3;
5182 g_context->get_pdptr = kvm_pdptr_read;
5183 g_context->inject_page_fault = kvm_inject_page_fault;
5184
	/*
	 * Note that arch.mmu->gva_to_gpa translates l2_gpa to l1_gpa using
	 * L1's nested page tables (e.g. EPT12). The nested translation
	 * of l2_gva to l1_gpa is done by arch.nested_mmu.gva_to_gpa using
	 * L2's page tables as the first level of translation and L1's
	 * nested page tables as the second level of translation. Basically
	 * the gva_to_gpa functions between mmu and nested_mmu are swapped.
	 */
5193 if (!is_paging(vcpu)) {
5194 g_context->nx = false;
5195 g_context->root_level = 0;
5196 g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
5197 } else if (is_long_mode(vcpu)) {
5198 g_context->nx = is_nx(vcpu);
5199 g_context->root_level = is_la57_mode(vcpu) ?
5200 PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
5201 reset_rsvds_bits_mask(vcpu, g_context);
5202 g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
5203 } else if (is_pae(vcpu)) {
5204 g_context->nx = is_nx(vcpu);
5205 g_context->root_level = PT32E_ROOT_LEVEL;
5206 reset_rsvds_bits_mask(vcpu, g_context);
5207 g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
5208 } else {
5209 g_context->nx = false;
5210 g_context->root_level = PT32_ROOT_LEVEL;
5211 reset_rsvds_bits_mask(vcpu, g_context);
5212 g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
5213 }
5214
5215 update_permission_bitmask(vcpu, g_context, false);
5216 update_pkru_bitmask(vcpu, g_context, false);
5217 update_last_nonleaf_level(vcpu, g_context);
5218}
5219
5220void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots)
5221{
5222 if (reset_roots) {
5223 uint i;
5224
5225 vcpu->arch.mmu->root_hpa = INVALID_PAGE;
5226
5227 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
5228 vcpu->arch.mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
5229 }
5230
5231 if (mmu_is_nested(vcpu))
5232 init_kvm_nested_mmu(vcpu);
5233 else if (tdp_enabled)
5234 init_kvm_tdp_mmu(vcpu);
5235 else
5236 init_kvm_softmmu(vcpu);
5237}
5238EXPORT_SYMBOL_GPL(kvm_init_mmu);
5239
5240static union kvm_mmu_page_role
5241kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu)
5242{
5243 union kvm_mmu_role role;
5244
5245 if (tdp_enabled)
5246 role = kvm_calc_tdp_mmu_root_page_role(vcpu, true);
5247 else
5248 role = kvm_calc_shadow_mmu_root_page_role(vcpu, true);
5249
5250 return role.base;
5251}
5252
5253void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
5254{
5255 kvm_mmu_unload(vcpu);
5256 kvm_init_mmu(vcpu, true);
5257}
5258EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
5259
5260int kvm_mmu_load(struct kvm_vcpu *vcpu)
5261{
5262 int r;
5263
5264 r = mmu_topup_memory_caches(vcpu);
5265 if (r)
5266 goto out;
5267 r = mmu_alloc_roots(vcpu);
5268 kvm_mmu_sync_roots(vcpu);
5269 if (r)
5270 goto out;
5271 kvm_mmu_load_cr3(vcpu);
5272 kvm_x86_ops->tlb_flush(vcpu, true);
5273out:
5274 return r;
5275}
5276EXPORT_SYMBOL_GPL(kvm_mmu_load);
5277
5278void kvm_mmu_unload(struct kvm_vcpu *vcpu)
5279{
5280 kvm_mmu_free_roots(vcpu, &vcpu->arch.root_mmu, KVM_MMU_ROOTS_ALL);
5281 WARN_ON(VALID_PAGE(vcpu->arch.root_mmu.root_hpa));
5282 kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
5283 WARN_ON(VALID_PAGE(vcpu->arch.guest_mmu.root_hpa));
5284}
5285EXPORT_SYMBOL_GPL(kvm_mmu_unload);
5286
5287static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
5288 struct kvm_mmu_page *sp, u64 *spte,
5289 const void *new)
5290{
5291 if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
5292 ++vcpu->kvm->stat.mmu_pde_zapped;
5293 return;
5294 }
5295
5296 ++vcpu->kvm->stat.mmu_pte_updated;
5297 vcpu->arch.mmu->update_pte(vcpu, sp, spte, new);
5298}
5299
5300static bool need_remote_flush(u64 old, u64 new)
5301{
5302 if (!is_shadow_present_pte(old))
5303 return false;
5304 if (!is_shadow_present_pte(new))
5305 return true;
5306 if ((old ^ new) & PT64_BASE_ADDR_MASK)
5307 return true;
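	/*
	 * The NX bit denies permission when set; flip it in both values so
	 * that, like the other bits in PT64_PERM_MASK, a set bit below means
	 * the access is allowed, and the check detects lost permissions.
	 */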
5308 old ^= shadow_nx_mask;
5309 new ^= shadow_nx_mask;
5310 return (old & ~new & PT64_PERM_MASK) != 0;
5311}
5312
5313static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
5314 int *bytes)
5315{
5316 u64 gentry = 0;
5317 int r;
5318
	/*
	 * Assume that the pte write on a page table of the same type
	 * as the current vcpu paging mode since we update the sptes only
	 * when they have the same mode.
	 */
	if (is_pae(vcpu) && *bytes == 4) {
		/* Handle a 32-bit guest writing two halves of a 64-bit gpte */
5326 *gpa &= ~(gpa_t)7;
5327 *bytes = 8;
5328 }
5329
5330 if (*bytes == 4 || *bytes == 8) {
5331 r = kvm_vcpu_read_guest_atomic(vcpu, *gpa, &gentry, *bytes);
5332 if (r)
5333 gentry = 0;
5334 }
5335
5336 return gentry;
5337}
5338
/*
 * If we're seeing too many writes to a page, it may no longer be a page
 * table, or we may be forking, in which case it is better to unmap the
 * page.
 */
5343static bool detect_write_flooding(struct kvm_mmu_page *sp)
5344{
	/*
	 * Skip write-flooding detection for an sp whose level is 1, because
	 * it can become unsync, in which case the guest page is not
	 * write-protected.
	 */
5349 if (sp->role.level == PT_PAGE_TABLE_LEVEL)
5350 return false;
5351
5352 atomic_inc(&sp->write_flooding_count);
5353 return atomic_read(&sp->write_flooding_count) >= 3;
5354}
5355
/*
 * Misaligned accesses are too much trouble to fix up; also, they usually
 * indicate a page is not used as a page table.
 */
5360static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
5361 int bytes)
5362{
5363 unsigned offset, pte_size, misaligned;
5364
5365 pgprintk("misaligned: gpa %llx bytes %d role %x\n",
5366 gpa, bytes, sp->role.word);
5367
5368 offset = offset_in_page(gpa);
5369 pte_size = sp->role.gpte_is_8_bytes ? 8 : 4;
5370
	/*
	 * Sometimes the OS only writes the last byte to update status bits,
	 * for example, in Linux, the andb instruction is used in clear_bit().
	 */
5375 if (!(offset & (pte_size - 1)) && bytes == 1)
5376 return false;
5377
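	/*
	 * The write is misaligned if it touches bytes in more than one guest
	 * PTE, or if it is smaller than a full PTE.
	 */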
5378 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
5379 misaligned |= bytes < 4;
5380
5381 return misaligned;
5382}
5383
5384static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
5385{
5386 unsigned page_offset, quadrant;
5387 u64 *spte;
5388 int level;
5389
5390 page_offset = offset_in_page(gpa);
5391 level = sp->role.level;
5392 *nspte = 1;
5393 if (!sp->role.gpte_is_8_bytes) {
		page_offset <<= 1;	/* 32->64 */
		/*
		 * A 32-bit pde maps 4MB while the shadow pdes map
		 * only 2MB.  So we need to double the offset again
		 * and zap two pdes instead of one.
		 */
5400 if (level == PT32_ROOT_LEVEL) {
5401 page_offset &= ~7;
5402 page_offset <<= 1;
5403 *nspte = 2;
5404 }
5405 quadrant = page_offset >> PAGE_SHIFT;
5406 page_offset &= ~PAGE_MASK;
5407 if (quadrant != sp->role.quadrant)
5408 return NULL;
5409 }
5410
5411 spte = &sp->spt[page_offset / sizeof(*spte)];
5412 return spte;
5413}
5414
5415static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
5416 const u8 *new, int bytes,
5417 struct kvm_page_track_notifier_node *node)
5418{
5419 gfn_t gfn = gpa >> PAGE_SHIFT;
5420 struct kvm_mmu_page *sp;
5421 LIST_HEAD(invalid_list);
5422 u64 entry, gentry, *spte;
5423 int npte;
5424 bool remote_flush, local_flush;
5425
	/*
	 * If we don't have indirect shadow pages, it means no page is
	 * write-protected, so we can exit simply.
	 */
5430 if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
5431 return;
5432
5433 remote_flush = local_flush = false;
5434
5435 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
5436
	/*
	 * No need to care whether the memory allocation is successful
	 * or not, since pte prefetch is skipped if it does not have
	 * enough objects in the cache.
	 */
5442 mmu_topup_memory_caches(vcpu);
5443
5444 spin_lock(&vcpu->kvm->mmu_lock);
5445
5446 gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes);
5447
5448 ++vcpu->kvm->stat.mmu_pte_write;
5449 kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
5450
5451 for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
5452 if (detect_write_misaligned(sp, gpa, bytes) ||
5453 detect_write_flooding(sp)) {
5454 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
5455 ++vcpu->kvm->stat.mmu_flooded;
5456 continue;
5457 }
5458
5459 spte = get_written_sptes(sp, gpa, &npte);
5460 if (!spte)
5461 continue;
5462
5463 local_flush = true;
5464 while (npte--) {
5465 u32 base_role = vcpu->arch.mmu->mmu_role.base.word;
5466
5467 entry = *spte;
5468 mmu_page_zap_pte(vcpu->kvm, sp, spte);
5469 if (gentry &&
5470 !((sp->role.word ^ base_role)
5471 & mmu_base_role_mask.word) && rmap_can_add(vcpu))
5472 mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
5473 if (need_remote_flush(entry, *spte))
5474 remote_flush = true;
5475 ++spte;
5476 }
5477 }
5478 kvm_mmu_flush_or_zap(vcpu, &invalid_list, remote_flush, local_flush);
5479 kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
5480 spin_unlock(&vcpu->kvm->mmu_lock);
5481}
5482
5483int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
5484{
5485 gpa_t gpa;
5486 int r;
5487
5488 if (vcpu->arch.mmu->direct_map)
5489 return 0;
5490
5491 gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
5492
5493 r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
5494
5495 return r;
5496}
5497EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
5498
5499static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
5500{
5501 LIST_HEAD(invalid_list);
5502
5503 if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES))
5504 return 0;
5505
5506 while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES) {
5507 if (!prepare_zap_oldest_mmu_page(vcpu->kvm, &invalid_list))
5508 break;
5509
5510 ++vcpu->kvm->stat.mmu_recycled;
5511 }
5512 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
5513
5514 if (!kvm_mmu_available_pages(vcpu->kvm))
5515 return -ENOSPC;
5516 return 0;
5517}
5518
5519int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
5520 void *insn, int insn_len)
5521{
5522 int r, emulation_type = 0;
5523 bool direct = vcpu->arch.mmu->direct_map;
5524
	/* With TDP, the faulting address is a GPA; stash it for the emulator. */
5526 if (vcpu->arch.mmu->direct_map) {
5527 vcpu->arch.gpa_available = true;
5528 vcpu->arch.gpa_val = cr2;
5529 }
5530
5531 r = RET_PF_INVALID;
5532 if (unlikely(error_code & PFERR_RSVD_MASK)) {
5533 r = handle_mmio_page_fault(vcpu, cr2, direct);
5534 if (r == RET_PF_EMULATE)
5535 goto emulate;
5536 }
5537
5538 if (r == RET_PF_INVALID) {
5539 r = vcpu->arch.mmu->page_fault(vcpu, cr2,
5540 lower_32_bits(error_code),
5541 false);
5542 WARN_ON(r == RET_PF_INVALID);
5543 }
5544
5545 if (r == RET_PF_RETRY)
5546 return 1;
5547 if (r < 0)
5548 return r;
5549
	/*
	 * Before emulating the instruction, check if the error code
	 * was due to a RO violation while translating the guest page.
	 * This can occur when using nested virtualization with nested
	 * paging in both guests. If true, we simply unprotect the page
	 * and resume the guest.
	 */
5557 if (vcpu->arch.mmu->direct_map &&
5558 (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
5559 kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2));
5560 return 1;
5561 }
5562
	/*
	 * vcpu->arch.mmu.page_fault returned RET_PF_EMULATE, but we can still
	 * optimistically try to just unprotect the page and let the processor
	 * re-execute the instruction that caused the page fault.  Do not allow
	 * retrying MMIO emulation, as it's not only pointless but could also
	 * cause us to enter an infinite loop because the processor will keep
	 * faulting on the non-existent MMIO address.  Retrying an instruction
	 * from a nested guest is also pointless and dangerous as we are only
	 * explicitly shadowing L1's page tables, i.e. unprotecting something
	 * for L1 isn't going to magically fix whatever issue caused L2 to fail.
	 */
5574 if (!mmio_info_in_cache(vcpu, cr2, direct) && !is_guest_mode(vcpu))
5575 emulation_type = EMULTYPE_ALLOW_RETRY;
5576emulate:
	/*
	 * On AMD platforms, under certain conditions insn_len may be zero on #NPF.
	 * This can happen if a guest gets a page-fault on data access but the HW
	 * table walker is not able to read the instruction page (e.g. the
	 * instruction page is not present in memory). In those cases we simply
	 * restart the guest, with the exception of AMD Erratum 1096 which is
	 * unrecoverable.
	 */
5584 if (unlikely(insn && !insn_len)) {
5585 if (!kvm_x86_ops->need_emulation_on_page_fault(vcpu))
5586 return 1;
5587 }
5588
5589 return x86_emulate_instruction(vcpu, cr2, emulation_type, insn,
5590 insn_len);
5591}
5592EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
5593
5594void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
5595{
5596 struct kvm_mmu *mmu = vcpu->arch.mmu;
5597 int i;
5598
	/* INVLPG on a non-canonical address is a NOP according to the SDM. */
5600 if (is_noncanonical_address(gva, vcpu))
5601 return;
5602
5603 mmu->invlpg(vcpu, gva, mmu->root_hpa);
5604
	/*
	 * INVLPG is required to invalidate any global mappings for the VA,
	 * irrespective of PCID.  Determining whether any of the prev_root
	 * mappings of the VA is marked global would take roughly as much
	 * work as just syncing it blindly, so always sync it.
	 *
	 * Mappings not reachable via the current cr3 or the prev_roots will be
	 * synced when switching to that cr3, so nothing needs to be done here
	 * for them.
	 */
5616 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
5617 if (VALID_PAGE(mmu->prev_roots[i].hpa))
5618 mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
5619
5620 kvm_x86_ops->tlb_flush_gva(vcpu, gva);
5621 ++vcpu->stat.invlpg;
5622}
5623EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
5624
5625void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
5626{
5627 struct kvm_mmu *mmu = vcpu->arch.mmu;
5628 bool tlb_flush = false;
5629 uint i;
5630
5631 if (pcid == kvm_get_active_pcid(vcpu)) {
5632 mmu->invlpg(vcpu, gva, mmu->root_hpa);
5633 tlb_flush = true;
5634 }
5635
5636 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
5637 if (VALID_PAGE(mmu->prev_roots[i].hpa) &&
5638 pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].cr3)) {
5639 mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
5640 tlb_flush = true;
5641 }
5642 }
5643
5644 if (tlb_flush)
5645 kvm_x86_ops->tlb_flush_gva(vcpu, gva);
5646
5647 ++vcpu->stat.invlpg;
5648
	/*
	 * Mappings not reachable via the current cr3 or the prev_roots will be
	 * synced when switching to that cr3, so nothing needs to be done here
	 * for them.
	 */
5654}
5655EXPORT_SYMBOL_GPL(kvm_mmu_invpcid_gva);
5656
5657void kvm_enable_tdp(void)
5658{
5659 tdp_enabled = true;
5660}
5661EXPORT_SYMBOL_GPL(kvm_enable_tdp);
5662
5663void kvm_disable_tdp(void)
5664{
5665 tdp_enabled = false;
5666}
5667EXPORT_SYMBOL_GPL(kvm_disable_tdp);
5668
5669
/* The return value indicates if a TLB flush on all vcpus is needed. */
5671typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head);
5672
/* The caller should hold mmu-lock before calling this function. */
5674static __always_inline bool
5675slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
5676 slot_level_handler fn, int start_level, int end_level,
5677 gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb)
5678{
5679 struct slot_rmap_walk_iterator iterator;
5680 bool flush = false;
5681
5682 for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
5683 end_gfn, &iterator) {
5684 if (iterator.rmap)
5685 flush |= fn(kvm, iterator.rmap);
5686
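		/*
		 * Periodically drop mmu_lock to avoid hogging it.  Flush any
		 * pending TLB invalidations for the range walked so far before
		 * releasing the lock, so stale translations cannot be used.
		 */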
5687 if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
5688 if (flush && lock_flush_tlb) {
5689 kvm_flush_remote_tlbs_with_address(kvm,
5690 start_gfn,
5691 iterator.gfn - start_gfn + 1);
5692 flush = false;
5693 }
5694 cond_resched_lock(&kvm->mmu_lock);
5695 }
5696 }
5697
5698 if (flush && lock_flush_tlb) {
5699 kvm_flush_remote_tlbs_with_address(kvm, start_gfn,
5700 end_gfn - start_gfn + 1);
5701 flush = false;
5702 }
5703
5704 return flush;
5705}
5706
5707static __always_inline bool
5708slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
5709 slot_level_handler fn, int start_level, int end_level,
5710 bool lock_flush_tlb)
5711{
5712 return slot_handle_level_range(kvm, memslot, fn, start_level,
5713 end_level, memslot->base_gfn,
5714 memslot->base_gfn + memslot->npages - 1,
5715 lock_flush_tlb);
5716}
5717
5718static __always_inline bool
5719slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
5720 slot_level_handler fn, bool lock_flush_tlb)
5721{
5722 return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
5723 PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
5724}
5725
5726static __always_inline bool
5727slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
5728 slot_level_handler fn, bool lock_flush_tlb)
5729{
5730 return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL + 1,
5731 PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
5732}
5733
5734static __always_inline bool
5735slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot,
5736 slot_level_handler fn, bool lock_flush_tlb)
5737{
5738 return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
5739 PT_PAGE_TABLE_LEVEL, lock_flush_tlb);
5740}
5741
5742static void free_mmu_pages(struct kvm_mmu *mmu)
5743{
5744 free_page((unsigned long)mmu->pae_root);
5745 free_page((unsigned long)mmu->lm_root);
5746}
5747
5748static int alloc_mmu_pages(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
5749{
5750 struct page *page;
5751 int i;
5752
	/*
	 * When using PAE paging, the four PDPTEs are treated as 'root' pages,
	 * while the PDP table is a per-vCPU construct that's allocated at MMU
	 * creation.  When emulating 32-bit mode, cr3 is only 32 bits even on
	 * x86_64.  Therefore we need to allocate the PDP table in the first
	 * 4GB of memory, which happens to fit the DMA32 zone.  Except for
	 * SVM's 32-bit NPT support, TDP paging doesn't use PAE paging and can
	 * skip allocating the PDP table.
	 */
5762 if (tdp_enabled && kvm_x86_ops->get_tdp_level(vcpu) > PT32E_ROOT_LEVEL)
5763 return 0;
5764
5765 page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32);
5766 if (!page)
5767 return -ENOMEM;
5768
5769 mmu->pae_root = page_address(page);
5770 for (i = 0; i < 4; ++i)
5771 mmu->pae_root[i] = INVALID_PAGE;
5772
5773 return 0;
5774}
5775
5776int kvm_mmu_create(struct kvm_vcpu *vcpu)
5777{
5778 uint i;
5779 int ret;
5780
5781 vcpu->arch.mmu = &vcpu->arch.root_mmu;
5782 vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
5783
5784 vcpu->arch.root_mmu.root_hpa = INVALID_PAGE;
5785 vcpu->arch.root_mmu.root_cr3 = 0;
5786 vcpu->arch.root_mmu.translate_gpa = translate_gpa;
5787 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
5788 vcpu->arch.root_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
5789
5790 vcpu->arch.guest_mmu.root_hpa = INVALID_PAGE;
5791 vcpu->arch.guest_mmu.root_cr3 = 0;
5792 vcpu->arch.guest_mmu.translate_gpa = translate_gpa;
5793 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
5794 vcpu->arch.guest_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
5795
5796 vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
5797
5798 ret = alloc_mmu_pages(vcpu, &vcpu->arch.guest_mmu);
5799 if (ret)
5800 return ret;
5801
5802 ret = alloc_mmu_pages(vcpu, &vcpu->arch.root_mmu);
5803 if (ret)
5804 goto fail_allocate_root;
5805
5806 return ret;
5807 fail_allocate_root:
5808 free_mmu_pages(&vcpu->arch.guest_mmu);
5809 return ret;
5810}
5811
5812#define BATCH_ZAP_PAGES 10
5813static void kvm_zap_obsolete_pages(struct kvm *kvm)
5814{
5815 struct kvm_mmu_page *sp, *node;
5816 int nr_zapped, batch = 0;
5817
5818restart:
5819 list_for_each_entry_safe_reverse(sp, node,
5820 &kvm->arch.active_mmu_pages, link) {
		/*
		 * No obsolete valid page exists before a newly created page
		 * since active_mmu_pages is a FIFO list.
		 */
5825 if (!is_obsolete_sp(kvm, sp))
5826 break;
5827
		/*
		 * Skip invalid pages with a non-zero root count; zapping pages
		 * with a non-zero root count will never succeed, i.e. the page
		 * will get thrown back on active_mmu_pages and we'll get stuck
		 * in an infinite loop.
		 */
5834 if (sp->role.invalid && sp->root_count)
5835 continue;
5836
		/*
		 * No need to flush the TLB since we're only zapping shadow
		 * pages with an obsolete generation number and all vCPUS have
		 * loaded a new root, i.e. the shadow pages being zapped cannot
		 * be in active use by the guest.
		 */
5843 if (batch >= BATCH_ZAP_PAGES &&
5844 cond_resched_lock(&kvm->mmu_lock)) {
5845 batch = 0;
5846 goto restart;
5847 }
5848
5849 if (__kvm_mmu_prepare_zap_page(kvm, sp,
5850 &kvm->arch.zapped_obsolete_pages, &nr_zapped)) {
5851 batch += nr_zapped;
5852 goto restart;
5853 }
5854 }
5855
	/*
	 * Trigger a remote TLB flush before freeing the page tables to ensure
	 * KVM is not in the middle of a lockless shadow page table walk, which
	 * may reference the pages.
	 */
5861 kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages);
5862}
5863
/*
 * Fast-invalidate all shadow pages by bumping the MMU generation number
 * and zapping the now-obsolete pages with a lock-break technique.
 *
 * This is used when a memslot is flushed or the VM is torn down: every
 * existing shadow page must stop being used, but zapping them one by one
 * while holding mmu_lock for the whole operation would take far too long.
 */
5873static void kvm_mmu_zap_all_fast(struct kvm *kvm)
5874{
5875 lockdep_assert_held(&kvm->slots_lock);
5876
5877 spin_lock(&kvm->mmu_lock);
5878 trace_kvm_mmu_zap_all_fast(kvm);
5879
	/*
	 * Toggle mmu_valid_gen between '0' and '1'.  Because slots_lock is
	 * held for the entire duration of zapping obsolete pages, it's
	 * impossible for there to be multiple invalid generations associated
	 * with *valid* shadow pages at any given time, i.e. there is exactly
	 * one valid generation and (at most) one invalid generation.
	 */
5887 kvm->arch.mmu_valid_gen = kvm->arch.mmu_valid_gen ? 0 : 1;
5888
	/*
	 * Notify all vcpus to reload their shadow page table and flush the
	 * TLB.  All vcpus will then switch to a new shadow page table with
	 * the new mmu_valid_gen.
	 *
	 * Note: this must be done under the protection of mmu_lock,
	 * otherwise a vcpu could purge a shadow page but miss the tlb flush.
	 */
5897 kvm_reload_remote_mmus(kvm);
5898
5899 kvm_zap_obsolete_pages(kvm);
5900 spin_unlock(&kvm->mmu_lock);
5901}
5902
5903static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
5904{
5905 return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
5906}
5907
5908static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
5909 struct kvm_memory_slot *slot,
5910 struct kvm_page_track_notifier_node *node)
5911{
5912 kvm_mmu_zap_all_fast(kvm);
5913}
5914
5915void kvm_mmu_init_vm(struct kvm *kvm)
5916{
5917 struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
5918
5919 node->track_write = kvm_mmu_pte_write;
5920 node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
5921 kvm_page_track_register_notifier(kvm, node);
5922}
5923
5924void kvm_mmu_uninit_vm(struct kvm *kvm)
5925{
5926 struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
5927
5928 kvm_page_track_unregister_notifier(kvm, node);
5929}
5930
5931void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
5932{
5933 struct kvm_memslots *slots;
5934 struct kvm_memory_slot *memslot;
5935 int i;
5936
5937 spin_lock(&kvm->mmu_lock);
5938 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
5939 slots = __kvm_memslots(kvm, i);
5940 kvm_for_each_memslot(memslot, slots) {
5941 gfn_t start, end;
5942
5943 start = max(gfn_start, memslot->base_gfn);
5944 end = min(gfn_end, memslot->base_gfn + memslot->npages);
5945 if (start >= end)
5946 continue;
5947
5948 slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
5949 PT_PAGE_TABLE_LEVEL, PT_MAX_HUGEPAGE_LEVEL,
5950 start, end - 1, true);
5951 }
5952 }
5953
5954 spin_unlock(&kvm->mmu_lock);
5955}
5956
5957static bool slot_rmap_write_protect(struct kvm *kvm,
5958 struct kvm_rmap_head *rmap_head)
5959{
5960 return __rmap_write_protect(kvm, rmap_head, false);
5961}
5962
5963void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
5964 struct kvm_memory_slot *memslot)
5965{
5966 bool flush;
5967
5968 spin_lock(&kvm->mmu_lock);
5969 flush = slot_handle_all_level(kvm, memslot, slot_rmap_write_protect,
5970 false);
5971 spin_unlock(&kvm->mmu_lock);
5972
	/*
	 * kvm_mmu_slot_remove_write_access() and kvm_vm_ioctl_get_dirty_log(),
	 * which do a tlb flush out of mmu-lock, should be serialized by
	 * kvm->slots_lock, otherwise the tlb flush would be missed.
	 */
5978 lockdep_assert_held(&kvm->slots_lock);
5979
	/*
	 * We can flush all the TLBs out of the mmu lock without TLB
	 * corruption since we just change the spte from writable to
	 * readonly, so we only need to care about the case of changing
	 * the spte from present to present (changing the spte from present
	 * to nonpresent will flush all the TLBs immediately).  In other
	 * words, the only case we care about is mmu_spte_update(), where we
	 * have checked SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE
	 * instead of PT_WRITABLE_MASK, which means it does not depend
	 * on PT_WRITABLE_MASK anymore.
	 */
5991 if (flush)
5992 kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
5993 memslot->npages);
5994}
5995
5996static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
5997 struct kvm_rmap_head *rmap_head)
5998{
5999 u64 *sptep;
6000 struct rmap_iterator iter;
6001 int need_tlb_flush = 0;
6002 kvm_pfn_t pfn;
6003 struct kvm_mmu_page *sp;
6004
6005restart:
6006 for_each_rmap_spte(rmap_head, &iter, sptep) {
6007 sp = page_header(__pa(sptep));
6008 pfn = spte_to_pfn(*sptep);
6009
		/*
		 * We cannot do huge page mapping for indirect shadow pages,
		 * which are found on the last rmap (level = 1) when not using
		 * tdp; such shadow pages are synced with the page table in
		 * the guest, and the guest page table is using 4K page size
		 * mapping if the indirect sp has level = 1.
		 */
6017 if (sp->role.direct && !kvm_is_reserved_pfn(pfn) &&
6018 !kvm_is_zone_device_pfn(pfn) &&
6019 PageTransCompoundMap(pfn_to_page(pfn))) {
6020 pte_list_remove(rmap_head, sptep);
6021
6022 if (kvm_available_flush_tlb_with_range())
6023 kvm_flush_remote_tlbs_with_address(kvm, sp->gfn,
6024 KVM_PAGES_PER_HPAGE(sp->role.level));
6025 else
6026 need_tlb_flush = 1;
6027
6028 goto restart;
6029 }
6030 }
6031
6032 return need_tlb_flush;
6033}
6034
6035void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
6036 const struct kvm_memory_slot *memslot)
6037{
	/* FIXME: const-ify all uses of struct kvm_memory_slot. */
6039 spin_lock(&kvm->mmu_lock);
6040 slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot,
6041 kvm_mmu_zap_collapsible_spte, true);
6042 spin_unlock(&kvm->mmu_lock);
6043}
6044
6045void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
6046 struct kvm_memory_slot *memslot)
6047{
6048 bool flush;
6049
6050 spin_lock(&kvm->mmu_lock);
6051 flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false);
6052 spin_unlock(&kvm->mmu_lock);
6053
6054 lockdep_assert_held(&kvm->slots_lock);
6055
	/*
	 * It's also safe to flush TLBs out of mmu lock here as currently this
	 * function is only used for dirty logging, in which case flushing TLB
	 * out of mmu lock also guarantees no dirty pages will be lost in
	 * dirty_bitmap.
	 */
6062 if (flush)
6063 kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
6064 memslot->npages);
6065}
6066EXPORT_SYMBOL_GPL(kvm_mmu_slot_leaf_clear_dirty);
6067
6068void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
6069 struct kvm_memory_slot *memslot)
6070{
6071 bool flush;
6072
6073 spin_lock(&kvm->mmu_lock);
6074 flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect,
6075 false);
6076 spin_unlock(&kvm->mmu_lock);
6077
	/* see kvm_mmu_slot_remove_write_access */
6079 lockdep_assert_held(&kvm->slots_lock);
6080
6081 if (flush)
6082 kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
6083 memslot->npages);
6084}
6085EXPORT_SYMBOL_GPL(kvm_mmu_slot_largepage_remove_write_access);
6086
6087void kvm_mmu_slot_set_dirty(struct kvm *kvm,
6088 struct kvm_memory_slot *memslot)
6089{
6090 bool flush;
6091
6092 spin_lock(&kvm->mmu_lock);
6093 flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false);
6094 spin_unlock(&kvm->mmu_lock);
6095
6096 lockdep_assert_held(&kvm->slots_lock);
6097
	/* see kvm_mmu_slot_leaf_clear_dirty */
6099 if (flush)
6100 kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
6101 memslot->npages);
6102}
6103EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
6104
6105void kvm_mmu_zap_all(struct kvm *kvm)
6106{
6107 struct kvm_mmu_page *sp, *node;
6108 LIST_HEAD(invalid_list);
6109 int ign;
6110
6111 spin_lock(&kvm->mmu_lock);
6112restart:
6113 list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
6114 if (sp->role.invalid && sp->root_count)
6115 continue;
6116 if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign))
6117 goto restart;
6118 if (cond_resched_lock(&kvm->mmu_lock))
6119 goto restart;
6120 }
6121
6122 kvm_mmu_commit_zap_page(kvm, &invalid_list);
6123 spin_unlock(&kvm->mmu_lock);
6124}
6125
6126void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
6127{
6128 WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
6129
6130 gen &= MMIO_SPTE_GEN_MASK;
6131
	/*
	 * Generation numbers are incremented in multiples of the number of
	 * address spaces in order to provide unique generations across all
	 * address spaces.  Strip what is effectively the address space
	 * modifier prior to checking for a wrap of the MMIO generation so
	 * that a wrap in any address space is detected.
	 */
6139 gen &= ~((u64)KVM_ADDRESS_SPACE_NUM - 1);
6140
	/*
	 * The very rare case: if the MMIO generation number has wrapped,
	 * zap all shadow pages.
	 */
6145 if (unlikely(gen == 0)) {
6146 kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n");
6147 kvm_mmu_zap_all_fast(kvm);
6148 }
6149}
6150
6151static unsigned long
6152mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
6153{
6154 struct kvm *kvm;
6155 int nr_to_scan = sc->nr_to_scan;
6156 unsigned long freed = 0;
6157
6158 mutex_lock(&kvm_lock);
6159
6160 list_for_each_entry(kvm, &vm_list, vm_list) {
6161 int idx;
6162 LIST_HEAD(invalid_list);
6163
		/*
		 * Never scan more than sc->nr_to_scan VM instances.
		 * Will not hit this condition practically since we do not try
		 * to shrink more than one VM and it is very unlikely to see
		 * !n_used_mmu_pages so many times.
		 */
6170 if (!nr_to_scan--)
6171 break;
6172
		/*
		 * n_used_mmu_pages is accessed without holding kvm->mmu_lock
		 * here.  We may skip a VM instance erroneously, but we do not
		 * want to shrink a VM that only started to populate its MMU
		 * anyway.
		 */
6178 if (!kvm->arch.n_used_mmu_pages &&
6179 !kvm_has_zapped_obsolete_pages(kvm))
6180 continue;
6181
6182 idx = srcu_read_lock(&kvm->srcu);
6183 spin_lock(&kvm->mmu_lock);
6184
6185 if (kvm_has_zapped_obsolete_pages(kvm)) {
6186 kvm_mmu_commit_zap_page(kvm,
6187 &kvm->arch.zapped_obsolete_pages);
6188 goto unlock;
6189 }
6190
6191 if (prepare_zap_oldest_mmu_page(kvm, &invalid_list))
6192 freed++;
6193 kvm_mmu_commit_zap_page(kvm, &invalid_list);
6194
6195unlock:
6196 spin_unlock(&kvm->mmu_lock);
6197 srcu_read_unlock(&kvm->srcu, idx);
6198
		/*
		 * unfair on small ones
		 * per-vm shrinkers cry out
		 * sasha's requiem
		 */
6204 list_move_tail(&kvm->vm_list, &vm_list);
6205 break;
6206 }
6207
6208 mutex_unlock(&kvm_lock);
6209 return freed;
6210}
6211
6212static unsigned long
6213mmu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
6214{
6215 return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
6216}
6217
6218static struct shrinker mmu_shrinker = {
6219 .count_objects = mmu_shrink_count,
6220 .scan_objects = mmu_shrink_scan,
6221 .seeks = DEFAULT_SEEKS * 10,
6222};
6223
6224static void mmu_destroy_caches(void)
6225{
6226 kmem_cache_destroy(pte_list_desc_cache);
6227 kmem_cache_destroy(mmu_page_header_cache);
6228}
6229
6230static void kvm_set_mmio_spte_mask(void)
6231{
6232 u64 mask;
6233
	/*
	 * Set the reserved bits and the present bit of a paging-structure
	 * entry to generate a page fault with PFEC.RSVD = 1.
	 */

	/*
	 * Mask the uppermost physical address bit, which would be reserved as
	 * long as the supported physical address width is less than 52.
	 */
6243 mask = 1ull << 51;
6244
	/* Set the present bit. */
6246 mask |= 1ull;
6247
	/*
	 * If the reserved bit is not supported, clear the present bit to
	 * disable mmio page fault.
	 */
6252 if (IS_ENABLED(CONFIG_X86_64) && shadow_phys_bits == 52)
6253 mask &= ~1ull;
6254
6255 kvm_mmu_set_mmio_spte_mask(mask, mask, ACC_WRITE_MASK | ACC_USER_MASK);
6256}
6257
6258static bool get_nx_auto_mode(void)
6259{
	/* Return true when the CPU has the bug and mitigations are ON. */
6261 return boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT) && !cpu_mitigations_off();
6262}
6263
6264static void __set_nx_huge_pages(bool val)
6265{
6266 nx_huge_pages = itlb_multihit_kvm_mitigation = val;
6267}
6268
6269static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
6270{
6271 bool old_val = nx_huge_pages;
6272 bool new_val;
6273
	/* In "auto" mode deploy the workaround only if the CPU has the bug. */
6275 if (sysfs_streq(val, "off"))
6276 new_val = 0;
6277 else if (sysfs_streq(val, "force"))
6278 new_val = 1;
6279 else if (sysfs_streq(val, "auto"))
6280 new_val = get_nx_auto_mode();
6281 else if (strtobool(val, &new_val) < 0)
6282 return -EINVAL;
6283
6284 __set_nx_huge_pages(new_val);
6285
6286 if (new_val != old_val) {
6287 struct kvm *kvm;
6288
6289 mutex_lock(&kvm_lock);
6290
6291 list_for_each_entry(kvm, &vm_list, vm_list) {
6292 mutex_lock(&kvm->slots_lock);
6293 kvm_mmu_zap_all_fast(kvm);
6294 mutex_unlock(&kvm->slots_lock);
6295
6296 wake_up_process(kvm->arch.nx_lpage_recovery_thread);
6297 }
6298 mutex_unlock(&kvm_lock);
6299 }
6300
6301 return 0;
6302}
6303
6304int kvm_mmu_module_init(void)
6305{
6306 int ret = -ENOMEM;
6307
6308 if (nx_huge_pages == -1)
6309 __set_nx_huge_pages(get_nx_auto_mode());
6310
	/*
	 * MMU roles use union aliasing, which is, generally speaking,
	 * undefined behavior.  However, we supposedly know how compilers
	 * behave and the current status quo is unlikely to change.  The
	 * asserts below guard against a silent change in the size of any
	 * of the role unions.
	 */
6317 BUILD_BUG_ON(sizeof(union kvm_mmu_page_role) != sizeof(u32));
6318 BUILD_BUG_ON(sizeof(union kvm_mmu_extended_role) != sizeof(u32));
6319 BUILD_BUG_ON(sizeof(union kvm_mmu_role) != sizeof(u64));
6320
6321 kvm_mmu_reset_all_pte_masks();
6322
6323 kvm_set_mmio_spte_mask();
6324
6325 pte_list_desc_cache = kmem_cache_create("pte_list_desc",
6326 sizeof(struct pte_list_desc),
6327 0, SLAB_ACCOUNT, NULL);
6328 if (!pte_list_desc_cache)
6329 goto out;
6330
6331 mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
6332 sizeof(struct kvm_mmu_page),
6333 0, SLAB_ACCOUNT, NULL);
6334 if (!mmu_page_header_cache)
6335 goto out;
6336
6337 if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL))
6338 goto out;
6339
6340 ret = register_shrinker(&mmu_shrinker);
6341 if (ret)
6342 goto out;
6343
6344 return 0;
6345
6346out:
6347 mmu_destroy_caches();
6348 return ret;
6349}
6350
/*
 * Calculate mmu pages needed for kvm.
 */
6354unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm)
6355{
6356 unsigned long nr_mmu_pages;
6357 unsigned long nr_pages = 0;
6358 struct kvm_memslots *slots;
6359 struct kvm_memory_slot *memslot;
6360 int i;
6361
6362 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
6363 slots = __kvm_memslots(kvm, i);
6364
6365 kvm_for_each_memslot(memslot, slots)
6366 nr_pages += memslot->npages;
6367 }
6368
6369 nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
6370 nr_mmu_pages = max(nr_mmu_pages, KVM_MIN_ALLOC_MMU_PAGES);
6371
6372 return nr_mmu_pages;
6373}
6374
6375void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
6376{
6377 kvm_mmu_unload(vcpu);
6378 free_mmu_pages(&vcpu->arch.root_mmu);
6379 free_mmu_pages(&vcpu->arch.guest_mmu);
6380 mmu_free_memory_caches(vcpu);
6381}
6382
6383void kvm_mmu_module_exit(void)
6384{
6385 mmu_destroy_caches();
6386 percpu_counter_destroy(&kvm_total_used_mmu_pages);
6387 unregister_shrinker(&mmu_shrinker);
6388 mmu_audit_disable();
6389}
6390
6391static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp)
6392{
6393 unsigned int old_val;
6394 int err;
6395
6396 old_val = nx_huge_pages_recovery_ratio;
6397 err = param_set_uint(val, kp);
6398 if (err)
6399 return err;
6400
6401 if (READ_ONCE(nx_huge_pages) &&
6402 !old_val && nx_huge_pages_recovery_ratio) {
6403 struct kvm *kvm;
6404
6405 mutex_lock(&kvm_lock);
6406
6407 list_for_each_entry(kvm, &vm_list, vm_list)
6408 wake_up_process(kvm->arch.nx_lpage_recovery_thread);
6409
6410 mutex_unlock(&kvm_lock);
6411 }
6412
6413 return err;
6414}
6415
6416static void kvm_recover_nx_lpages(struct kvm *kvm)
6417{
6418 int rcu_idx;
6419 struct kvm_mmu_page *sp;
6420 unsigned int ratio;
6421 LIST_HEAD(invalid_list);
6422 ulong to_zap;
6423
6424 rcu_idx = srcu_read_lock(&kvm->srcu);
6425 spin_lock(&kvm->mmu_lock);
6426
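	/*
	 * Each pass zaps roughly 1/nx_huge_pages_recovery_ratio of the shadow
	 * pages that were forced to 4K because of the NX huge page
	 * workaround, oldest first.
	 */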
6427 ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
6428 to_zap = ratio ? DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0;
6429 while (to_zap && !list_empty(&kvm->arch.lpage_disallowed_mmu_pages)) {
		/*
		 * We use a separate list instead of just using active_mmu_pages
		 * because the number of lpage_disallowed pages is expected to
		 * be relatively small compared to the total.
		 */
6435 sp = list_first_entry(&kvm->arch.lpage_disallowed_mmu_pages,
6436 struct kvm_mmu_page,
6437 lpage_disallowed_link);
6438 WARN_ON_ONCE(!sp->lpage_disallowed);
6439 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
6440 WARN_ON_ONCE(sp->lpage_disallowed);
6441
6442 if (!--to_zap || need_resched() || spin_needbreak(&kvm->mmu_lock)) {
6443 kvm_mmu_commit_zap_page(kvm, &invalid_list);
6444 if (to_zap)
6445 cond_resched_lock(&kvm->mmu_lock);
6446 }
6447 }
6448
6449 spin_unlock(&kvm->mmu_lock);
6450 srcu_read_unlock(&kvm->srcu, rcu_idx);
6451}
6452
6453static long get_nx_lpage_recovery_timeout(u64 start_time)
6454{
6455 return READ_ONCE(nx_huge_pages) && READ_ONCE(nx_huge_pages_recovery_ratio)
6456 ? start_time + 60 * HZ - get_jiffies_64()
6457 : MAX_SCHEDULE_TIMEOUT;
6458}
6459
6460static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data)
6461{
6462 u64 start_time;
6463 long remaining_time;
6464
6465 while (true) {
6466 start_time = get_jiffies_64();
6467 remaining_time = get_nx_lpage_recovery_timeout(start_time);
6468
6469 set_current_state(TASK_INTERRUPTIBLE);
6470 while (!kthread_should_stop() && remaining_time > 0) {
6471 schedule_timeout(remaining_time);
6472 remaining_time = get_nx_lpage_recovery_timeout(start_time);
6473 set_current_state(TASK_INTERRUPTIBLE);
6474 }
6475
6476 set_current_state(TASK_RUNNING);
6477
6478 if (kthread_should_stop())
6479 return 0;
6480
6481 kvm_recover_nx_lpages(kvm);
6482 }
6483}
6484
6485int kvm_mmu_post_init_vm(struct kvm *kvm)
6486{
6487 int err;
6488
6489 err = kvm_vm_create_worker_thread(kvm, kvm_nx_lpage_recovery_worker, 0,
6490 "kvm-nx-lpage-recovery",
6491 &kvm->arch.nx_lpage_recovery_thread);
6492 if (!err)
6493 kthread_unpark(kvm->arch.nx_lpage_recovery_thread);
6494
6495 return err;
6496}
6497
6498void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
6499{
6500 if (kvm->arch.nx_lpage_recovery_thread)
6501 kthread_stop(kvm->arch.nx_lpage_recovery_thread);
6502}
6503