/*
 * Kernel-based Virtual Machine (KVM) - x86 MMU.
 *
 * Shadow and two-dimensional (TDP) paging support: shadow PTE masks,
 * MMIO SPTEs, reverse maps (rmaps), access/dirty tracking and shadow
 * page management.
 */

21#include "irq.h"
22#include "mmu.h"
23#include "x86.h"
24#include "kvm_cache_regs.h"
25#include "cpuid.h"
26
27#include <linux/kvm_host.h>
28#include <linux/types.h>
29#include <linux/string.h>
30#include <linux/mm.h>
31#include <linux/highmem.h>
32#include <linux/moduleparam.h>
33#include <linux/export.h>
34#include <linux/swap.h>
35#include <linux/hugetlb.h>
36#include <linux/compiler.h>
37#include <linux/srcu.h>
38#include <linux/slab.h>
39#include <linux/sched/signal.h>
40#include <linux/uaccess.h>
41#include <linux/hash.h>
42#include <linux/kern_levels.h>
43
44#include <asm/page.h>
45#include <asm/pat.h>
46#include <asm/cmpxchg.h>
47#include <asm/io.h>
48#include <asm/vmx.h>
49#include <asm/kvm_page_track.h>
50#include "trace.h"
51
/*
 * When setting this variable to true it enables Two-Dimensional-Paging
 * where the hardware walks 2 page tables:
 * 1. the guest-virtual to guest-physical
 * 2. while doing 1. it walks guest-physical to host-physical
 * If the hardware supports that we don't need to do shadow paging.
 */
59bool tdp_enabled = false;
60
61enum {
62 AUDIT_PRE_PAGE_FAULT,
63 AUDIT_POST_PAGE_FAULT,
64 AUDIT_PRE_PTE_WRITE,
65 AUDIT_POST_PTE_WRITE,
66 AUDIT_PRE_SYNC,
67 AUDIT_POST_SYNC
68};
69
70#undef MMU_DEBUG
71
72#ifdef MMU_DEBUG
73static bool dbg = 0;
74module_param(dbg, bool, 0644);
75
76#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
77#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
78#define MMU_WARN_ON(x) WARN_ON(x)
79#else
80#define pgprintk(x...) do { } while (0)
81#define rmap_printk(x...) do { } while (0)
82#define MMU_WARN_ON(x) do { } while (0)
83#endif
84
85#define PTE_PREFETCH_NUM 8
86
87#define PT_FIRST_AVAIL_BITS_SHIFT 10
88#define PT64_SECOND_AVAIL_BITS_SHIFT 52
89
90#define PT64_LEVEL_BITS 9
91
92#define PT64_LEVEL_SHIFT(level) \
93 (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
94
95#define PT64_INDEX(address, level)\
96 (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
97
98
99#define PT32_LEVEL_BITS 10
100
101#define PT32_LEVEL_SHIFT(level) \
102 (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
103
104#define PT32_LVL_OFFSET_MASK(level) \
105 (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
106 * PT32_LEVEL_BITS))) - 1))
107
108#define PT32_INDEX(address, level)\
109 (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
110
111
112#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
113#define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1))
114#else
115#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
116#endif
117#define PT64_LVL_ADDR_MASK(level) \
118 (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
119 * PT64_LEVEL_BITS))) - 1))
120#define PT64_LVL_OFFSET_MASK(level) \
121 (PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
122 * PT64_LEVEL_BITS))) - 1))
123
124#define PT32_BASE_ADDR_MASK PAGE_MASK
125#define PT32_DIR_BASE_ADDR_MASK \
126 (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
127#define PT32_LVL_ADDR_MASK(level) \
128 (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
129 * PT32_LEVEL_BITS))) - 1))
130
131#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \
132 | shadow_x_mask | shadow_nx_mask | shadow_me_mask)
133
134#define ACC_EXEC_MASK 1
135#define ACC_WRITE_MASK PT_WRITABLE_MASK
136#define ACC_USER_MASK PT_USER_MASK
137#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
138
139
140#define PT64_EPT_READABLE_MASK 0x1ull
141#define PT64_EPT_EXECUTABLE_MASK 0x4ull
142
143#include <trace/events/kvm.h>
144
145#define CREATE_TRACE_POINTS
146#include "mmutrace.h"
147
148#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
149#define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
150
151#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
152
/* make pte_list_desc fit well in a cache line */
154#define PTE_LIST_EXT 3

/*
 * Return values of handle_mmio_page_fault and mmu.page_fault:
 * RET_PF_RETRY: let the CPU fault again on the address.
 * RET_PF_EMULATE: mmio page fault, emulate the instruction directly.
 * RET_PF_INVALID: the spte is invalid, let the real page fault path
 *                 update it.
 */
164enum {
165 RET_PF_RETRY = 0,
166 RET_PF_EMULATE = 1,
167 RET_PF_INVALID = 2,
168};
169
170struct pte_list_desc {
171 u64 *sptes[PTE_LIST_EXT];
172 struct pte_list_desc *more;
173};
174
175struct kvm_shadow_walk_iterator {
176 u64 addr;
177 hpa_t shadow_addr;
178 u64 *sptep;
179 int level;
180 unsigned index;
181};
182
183static const union kvm_mmu_page_role mmu_base_role_mask = {
184 .cr0_wp = 1,
185 .gpte_is_8_bytes = 1,
186 .nxe = 1,
187 .smep_andnot_wp = 1,
188 .smap_andnot_wp = 1,
189 .smm = 1,
190 .guest_mode = 1,
191 .ad_disabled = 1,
192};
193
194#define for_each_shadow_entry_using_root(_vcpu, _root, _addr, _walker) \
195 for (shadow_walk_init_using_root(&(_walker), (_vcpu), \
196 (_root), (_addr)); \
197 shadow_walk_okay(&(_walker)); \
198 shadow_walk_next(&(_walker)))
199
200#define for_each_shadow_entry(_vcpu, _addr, _walker) \
201 for (shadow_walk_init(&(_walker), _vcpu, _addr); \
202 shadow_walk_okay(&(_walker)); \
203 shadow_walk_next(&(_walker)))
204
205#define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte) \
206 for (shadow_walk_init(&(_walker), _vcpu, _addr); \
207 shadow_walk_okay(&(_walker)) && \
208 ({ spte = mmu_spte_get_lockless(_walker.sptep); 1; }); \
209 __shadow_walk_next(&(_walker), spte))
210
211static struct kmem_cache *pte_list_desc_cache;
212static struct kmem_cache *mmu_page_header_cache;
213static struct percpu_counter kvm_total_used_mmu_pages;
214
215static u64 __read_mostly shadow_nx_mask;
216static u64 __read_mostly shadow_x_mask;
217static u64 __read_mostly shadow_user_mask;
218static u64 __read_mostly shadow_accessed_mask;
219static u64 __read_mostly shadow_dirty_mask;
220static u64 __read_mostly shadow_mmio_mask;
221static u64 __read_mostly shadow_mmio_value;
222static u64 __read_mostly shadow_present_mask;
223static u64 __read_mostly shadow_me_mask;
224
/*
 * SPTEs used by MMUs without A/D bits are marked with shadow_acc_track_value.
 * Non-present SPTEs with shadow_acc_track_value set are in place for access
 * tracking.
 */
230static u64 __read_mostly shadow_acc_track_mask;
231static const u64 shadow_acc_track_value = SPTE_SPECIAL_MASK;
232
/*
 * The mask/shift to use for saving the original R/X bits when marking the
 * PTE as not-present for access tracking purposes. We do not save the W bit
 * as the PTEs being access tracked also need to be dirty tracked, so the W
 * bit will be restored only when a write is attempted to the page.
 */
239static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK |
240 PT64_EPT_EXECUTABLE_MASK;
241static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT;
242
/*
 * This mask must be set on all non-zero Non-Present or Reserved SPTEs in
 * order to guard against L1TF attacks.
 */
247static u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
248
/*
 * The number of high-order 1 bits to use in the mask above.
 */
252static const u64 shadow_nonpresent_or_rsvd_mask_len = 5;

/*
 * In some cases we need to preserve the GFN of a non-present or reserved
 * SPTE even though its high physical-address bits are reserved to defend
 * against L1TF (see shadow_nonpresent_or_rsvd_mask), e.g. for MMIO SPTEs.
 * To preserve the GFN, the bits of the GFN that overlap with
 * shadow_nonpresent_or_rsvd_mask are shifted left into the reserved area,
 * i.e. the GFN in the SPTE is split into high and low parts.  This mask
 * covers the lower bits of the GFN.
 */
262static u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;
263
264
265static void mmu_spte_set(u64 *sptep, u64 spte);
266static union kvm_mmu_page_role
267kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu);
268
269
270static inline bool kvm_available_flush_tlb_with_range(void)
271{
272 return kvm_x86_ops->tlb_remote_flush_with_range;
273}
274
275static void kvm_flush_remote_tlbs_with_range(struct kvm *kvm,
276 struct kvm_tlb_range *range)
277{
278 int ret = -ENOTSUPP;
279
280 if (range && kvm_x86_ops->tlb_remote_flush_with_range)
281 ret = kvm_x86_ops->tlb_remote_flush_with_range(kvm, range);
282
283 if (ret)
284 kvm_flush_remote_tlbs(kvm);
285}
286
287static void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
288 u64 start_gfn, u64 pages)
289{
290 struct kvm_tlb_range range;
291
292 range.start_gfn = start_gfn;
293 range.pages = pages;
294
295 kvm_flush_remote_tlbs_with_range(kvm, &range);
296}
297
298void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value)
299{
300 BUG_ON((mmio_mask & mmio_value) != mmio_value);
301 shadow_mmio_value = mmio_value | SPTE_SPECIAL_MASK;
302 shadow_mmio_mask = mmio_mask | SPTE_SPECIAL_MASK;
303}
304EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
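
/*
 * Note: SPTE_SPECIAL_MASK is ORed into both the MMIO mask and value above.
 * The same bit doubles as shadow_acc_track_value for access-tracked SPTEs,
 * which is why spte_ad_enabled() and friends below warn if they are handed
 * an MMIO SPTE: the two kinds of "special" SPTEs must be told apart by
 * checking is_mmio_spte() first.
 */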
305
306static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
307{
308 return sp->role.ad_disabled;
309}
310
311static inline bool spte_ad_enabled(u64 spte)
312{
313 MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value);
314 return !(spte & shadow_acc_track_value);
315}
316
317static inline u64 spte_shadow_accessed_mask(u64 spte)
318{
319 MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value);
320 return spte_ad_enabled(spte) ? shadow_accessed_mask : 0;
321}
322
323static inline u64 spte_shadow_dirty_mask(u64 spte)
324{
325 MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value);
326 return spte_ad_enabled(spte) ? shadow_dirty_mask : 0;
327}
328
329static inline bool is_access_track_spte(u64 spte)
330{
331 return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0;
332}
333
/*
 * Due to limited space in PTEs, the MMIO generation is a 19 bit subset of
 * the memslots generation and is derived as follows:
 *
 * The low bits of the generation are stored in SPTE bits 3-11 and the
 * remaining bits in SPTE bits 52-61, see the masks below.
 *
 * The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included
 * in the MMIO generation number, as doing so would require stealing a bit
 * from the "real" generation number and thus effectively halve the maximum
 * number of MMIO generations that can be handled before encountering a wrap
 * (which requires a full MMU zap).  The flag is instead explicitly queried
 * when checking for MMIO spte cache hits.
 */
348#define MMIO_SPTE_GEN_MASK GENMASK_ULL(18, 0)
349
350#define MMIO_SPTE_GEN_LOW_START 3
351#define MMIO_SPTE_GEN_LOW_END 11
352#define MMIO_SPTE_GEN_LOW_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \
353 MMIO_SPTE_GEN_LOW_START)
354
355#define MMIO_SPTE_GEN_HIGH_START 52
356#define MMIO_SPTE_GEN_HIGH_END 61
357#define MMIO_SPTE_GEN_HIGH_MASK GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \
358 MMIO_SPTE_GEN_HIGH_START)
359static u64 generation_mmio_spte_mask(u64 gen)
360{
361 u64 mask;
362
363 WARN_ON(gen & ~MMIO_SPTE_GEN_MASK);
364
365 mask = (gen << MMIO_SPTE_GEN_LOW_START) & MMIO_SPTE_GEN_LOW_MASK;
366 mask |= (gen << MMIO_SPTE_GEN_HIGH_START) & MMIO_SPTE_GEN_HIGH_MASK;
367 return mask;
368}
369
370static u64 get_mmio_spte_generation(u64 spte)
371{
372 u64 gen;
373
374 spte &= ~shadow_mmio_mask;
375
376 gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_START;
377 gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_START;
378 return gen;
379}
380
381static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
382 unsigned access)
383{
384 u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK;
385 u64 mask = generation_mmio_spte_mask(gen);
386 u64 gpa = gfn << PAGE_SHIFT;
387
388 access &= ACC_WRITE_MASK | ACC_USER_MASK;
389 mask |= shadow_mmio_value | access;
390 mask |= gpa | shadow_nonpresent_or_rsvd_mask;
391 mask |= (gpa & shadow_nonpresent_or_rsvd_mask)
392 << shadow_nonpresent_or_rsvd_mask_len;
393
394 page_header(__pa(sptep))->mmio_cached = true;
395
396 trace_mark_mmio_spte(sptep, gfn, access, gen);
397 mmu_spte_set(sptep, mask);
398}
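
/*
 * Rough layout of an MMIO SPTE as built above: the generation bits from
 * generation_mmio_spte_mask(), shadow_mmio_value (which includes
 * SPTE_SPECIAL_MASK), the write/user access bits, the gpa and the
 * always-set shadow_nonpresent_or_rsvd_mask bits.  The gpa bits that
 * collide with shadow_nonpresent_or_rsvd_mask are additionally stored
 * shadow_nonpresent_or_rsvd_mask_len bits to the left so that
 * get_mmio_spte_gfn() below can reassemble the gfn.
 */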
399
400static bool is_mmio_spte(u64 spte)
401{
402 return (spte & shadow_mmio_mask) == shadow_mmio_value;
403}
404
405static gfn_t get_mmio_spte_gfn(u64 spte)
406{
407 u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask;
408
409 gpa |= (spte >> shadow_nonpresent_or_rsvd_mask_len)
410 & shadow_nonpresent_or_rsvd_mask;
411
412 return gpa >> PAGE_SHIFT;
413}
414
415static unsigned get_mmio_spte_access(u64 spte)
416{
417 u64 mask = generation_mmio_spte_mask(MMIO_SPTE_GEN_MASK) | shadow_mmio_mask;
418 return (spte & ~mask) & ~PAGE_MASK;
419}
420
421static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
422 kvm_pfn_t pfn, unsigned access)
423{
424 if (unlikely(is_noslot_pfn(pfn))) {
425 mark_mmio_spte(vcpu, sptep, gfn, access);
426 return true;
427 }
428
429 return false;
430}
431
432static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
433{
434 u64 kvm_gen, spte_gen, gen;
435
436 gen = kvm_vcpu_memslots(vcpu)->generation;
437 if (unlikely(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS))
438 return false;
439
440 kvm_gen = gen & MMIO_SPTE_GEN_MASK;
441 spte_gen = get_mmio_spte_generation(spte);
442
443 trace_check_mmio_spte(spte, kvm_gen, spte_gen);
444 return likely(kvm_gen == spte_gen);
445}
446
/*
 * Sets the shadow PTE masks used by the MMU.
 *
 * Assumptions:
 *  - Setting either @accessed_mask or @dirty_mask requires setting both
 *  - At least one of @accessed_mask or @acc_track_mask must be set
 */
454void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
455 u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
456 u64 acc_track_mask, u64 me_mask)
457{
458 BUG_ON(!dirty_mask != !accessed_mask);
459 BUG_ON(!accessed_mask && !acc_track_mask);
460 BUG_ON(acc_track_mask & shadow_acc_track_value);
461
462 shadow_user_mask = user_mask;
463 shadow_accessed_mask = accessed_mask;
464 shadow_dirty_mask = dirty_mask;
465 shadow_nx_mask = nx_mask;
466 shadow_x_mask = x_mask;
467 shadow_present_mask = p_mask;
468 shadow_acc_track_mask = acc_track_mask;
469 shadow_me_mask = me_mask;
470}
471EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
472
473static void kvm_mmu_reset_all_pte_masks(void)
474{
475 u8 low_phys_bits;
476
477 shadow_user_mask = 0;
478 shadow_accessed_mask = 0;
479 shadow_dirty_mask = 0;
480 shadow_nx_mask = 0;
481 shadow_x_mask = 0;
482 shadow_mmio_mask = 0;
483 shadow_present_mask = 0;
484 shadow_acc_track_mask = 0;
485
 /*
  * If the CPU has 46 or less physical address bits, then set an
  * appropriate mask to guard against L1TF attacks. Otherwise, it is
  * assumed that the CPU is not vulnerable to L1TF.
  */
491 low_phys_bits = boot_cpu_data.x86_phys_bits;
492 if (boot_cpu_data.x86_phys_bits <
493 52 - shadow_nonpresent_or_rsvd_mask_len) {
494 shadow_nonpresent_or_rsvd_mask =
495 rsvd_bits(boot_cpu_data.x86_phys_bits -
496 shadow_nonpresent_or_rsvd_mask_len,
497 boot_cpu_data.x86_phys_bits - 1);
498 low_phys_bits -= shadow_nonpresent_or_rsvd_mask_len;
499 }
500 shadow_nonpresent_or_rsvd_lower_gfn_mask =
501 GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT);
502}
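
/*
 * Worked example for the L1TF logic above (illustrative only): with 46
 * physical address bits and shadow_nonpresent_or_rsvd_mask_len == 5,
 * shadow_nonpresent_or_rsvd_mask covers SPTE bits 45:41, low_phys_bits
 * becomes 41, and shadow_nonpresent_or_rsvd_lower_gfn_mask covers bits
 * 40:12.  On CPUs reporting 47 or more physical bits no reserved-bit
 * mask is installed.
 */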
503
504static int is_cpuid_PSE36(void)
505{
506 return 1;
507}
508
509static int is_nx(struct kvm_vcpu *vcpu)
510{
511 return vcpu->arch.efer & EFER_NX;
512}
513
514static int is_shadow_present_pte(u64 pte)
515{
516 return (pte != 0) && !is_mmio_spte(pte);
517}
518
519static int is_large_pte(u64 pte)
520{
521 return pte & PT_PAGE_SIZE_MASK;
522}
523
524static int is_last_spte(u64 pte, int level)
525{
526 if (level == PT_PAGE_TABLE_LEVEL)
527 return 1;
528 if (is_large_pte(pte))
529 return 1;
530 return 0;
531}
532
533static bool is_executable_pte(u64 spte)
534{
535 return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask;
536}
537
538static kvm_pfn_t spte_to_pfn(u64 pte)
539{
540 return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
541}
542
543static gfn_t pse36_gfn_delta(u32 gpte)
544{
545 int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
546
547 return (gpte & PT32_DIR_PSE36_MASK) << shift;
548}
549
550#ifdef CONFIG_X86_64
551static void __set_spte(u64 *sptep, u64 spte)
552{
553 WRITE_ONCE(*sptep, spte);
554}
555
556static void __update_clear_spte_fast(u64 *sptep, u64 spte)
557{
558 WRITE_ONCE(*sptep, spte);
559}
560
561static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
562{
563 return xchg(sptep, spte);
564}
565
566static u64 __get_spte_lockless(u64 *sptep)
567{
568 return READ_ONCE(*sptep);
569}
570#else
571union split_spte {
572 struct {
573 u32 spte_low;
574 u32 spte_high;
575 };
576 u64 spte;
577};
578
579static void count_spte_clear(u64 *sptep, u64 spte)
580{
581 struct kvm_mmu_page *sp = page_header(__pa(sptep));
582
583 if (is_shadow_present_pte(spte))
584 return;
585
 /* Ensure the spte is completely set before we increase the count. */
587 smp_wmb();
588 sp->clear_spte_count++;
589}
590
591static void __set_spte(u64 *sptep, u64 spte)
592{
593 union split_spte *ssptep, sspte;
594
595 ssptep = (union split_spte *)sptep;
596 sspte = (union split_spte)spte;
597
598 ssptep->spte_high = sspte.spte_high;

 /*
  * When mapping the spte from nonpresent to present, store the high
  * bits first and only then the low half that carries the present bit,
  * so the CPU cannot fetch a half-written spte.
  */
605 smp_wmb();
606
607 WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
608}
609
610static void __update_clear_spte_fast(u64 *sptep, u64 spte)
611{
612 union split_spte *ssptep, sspte;
613
614 ssptep = (union split_spte *)sptep;
615 sspte = (union split_spte)spte;
616
617 WRITE_ONCE(ssptep->spte_low, sspte.spte_low);

 /*
  * When mapping the spte from present to nonpresent, clear the present
  * bit (low half) first so the vcpu cannot fetch stale high bits.
  */
623 smp_wmb();
624
625 ssptep->spte_high = sspte.spte_high;
626 count_spte_clear(sptep, spte);
627}
628
629static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
630{
631 union split_spte *ssptep, sspte, orig;
632
633 ssptep = (union split_spte *)sptep;
634 sspte = (union split_spte)spte;
635
 /* xchg acts as a full barrier before the setting of the high bits. */
637 orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
638 orig.spte_high = ssptep->spte_high;
639 ssptep->spte_high = sspte.spte_high;
640 count_spte_clear(sptep, spte);
641
642 return orig.spte;
643}
644
/*
 * A 64-bit spte cannot be read atomically on a 32-bit host, so reads use a
 * seqcount-like scheme (the idea comes from gup_get_pte() in mm/gup.c):
 * writers that clear a present spte bump sp->clear_spte_count after
 * updating the high half, and __set_spte()/__update_clear_spte_fast()
 * order the low/high halves so that the present bit is never visible
 * while the other half is stale.  The reader below rereads the spte until
 * both the low half and clear_spte_count are unchanged, which guarantees
 * a consistent (low, high) pair.
 */

663static u64 __get_spte_lockless(u64 *sptep)
664{
665 struct kvm_mmu_page *sp = page_header(__pa(sptep));
666 union split_spte spte, *orig = (union split_spte *)sptep;
667 int count;
668
669retry:
670 count = sp->clear_spte_count;
671 smp_rmb();
672
673 spte.spte_low = orig->spte_low;
674 smp_rmb();
675
676 spte.spte_high = orig->spte_high;
677 smp_rmb();
678
679 if (unlikely(spte.spte_low != orig->spte_low ||
680 count != sp->clear_spte_count))
681 goto retry;
682
683 return spte.spte;
684}
685#endif
686
687static bool spte_can_locklessly_be_made_writable(u64 spte)
688{
689 return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) ==
690 (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE);
691}
692
693static bool spte_has_volatile_bits(u64 spte)
694{
695 if (!is_shadow_present_pte(spte))
696 return false;
697
 /*
  * Always atomically update the spte if it can be updated out of
  * mmu-lock: this ensures that the dirty bit is not lost and that
  * is_writable_pte() stays stable enough that a needed TLB flush is
  * not missed.
  */
704 if (spte_can_locklessly_be_made_writable(spte) ||
705 is_access_track_spte(spte))
706 return true;
707
708 if (spte_ad_enabled(spte)) {
709 if ((spte & shadow_accessed_mask) == 0 ||
710 (is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0))
711 return true;
712 }
713
714 return false;
715}
716
717static bool is_accessed_spte(u64 spte)
718{
719 u64 accessed_mask = spte_shadow_accessed_mask(spte);
720
721 return accessed_mask ? spte & accessed_mask
722 : !is_access_track_spte(spte);
723}
724
725static bool is_dirty_spte(u64 spte)
726{
727 u64 dirty_mask = spte_shadow_dirty_mask(spte);
728
729 return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK;
730}
731
/*
 * Rules for using mmu_spte_set:
 * Set the sptep from nonpresent to present.
 * Note: the sptep being assigned *must* be either not present
 * or in a state where the hardware will not attempt to update
 * the spte.
 */
738static void mmu_spte_set(u64 *sptep, u64 new_spte)
739{
740 WARN_ON(is_shadow_present_pte(*sptep));
741 __set_spte(sptep, new_spte);
742}
743
/*
 * Update the SPTE (excluding the pfn), but do not track changes in its
 * accessed/dirty status.
 */
748static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
749{
750 u64 old_spte = *sptep;
751
752 WARN_ON(!is_shadow_present_pte(new_spte));
753
754 if (!is_shadow_present_pte(old_spte)) {
755 mmu_spte_set(sptep, new_spte);
756 return old_spte;
757 }
758
759 if (!spte_has_volatile_bits(old_spte))
760 __update_clear_spte_fast(sptep, new_spte);
761 else
762 old_spte = __update_clear_spte_slow(sptep, new_spte);
763
764 WARN_ON(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));
765
766 return old_spte;
767}
768
/*
 * Rules for using mmu_spte_update:
 * Update the state bits; the mapped pfn is not allowed to change.
 *
 * Whenever an spte that can be locklessly made writable is overwritten
 * with a read-only one, remote TLBs must be flushed: otherwise
 * rmap_write_protect() could find a read-only spte while a writable
 * translation is still cached in some CPU's TLB.
 *
 * Returns true if the TLB needs to be flushed.
 */
780static bool mmu_spte_update(u64 *sptep, u64 new_spte)
781{
782 bool flush = false;
783 u64 old_spte = mmu_spte_update_no_track(sptep, new_spte);
784
785 if (!is_shadow_present_pte(old_spte))
786 return false;
787
 /*
  * Updating an spte out of mmu-lock is safe because such sptes are
  * always updated atomically, see the comment in
  * spte_has_volatile_bits().
  */
793 if (spte_can_locklessly_be_made_writable(old_spte) &&
794 !is_writable_pte(new_spte))
795 flush = true;
796
 /*
  * Flush the TLB when accessed/dirty states are changed in the page
  * tables, to guarantee consistency between the TLB and the page
  * tables.
  */
802 if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) {
803 flush = true;
804 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
805 }
806
807 if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) {
808 flush = true;
809 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
810 }
811
812 return flush;
813}
814
/*
 * Rules for using mmu_spte_clear_track_bits:
 * It sets the sptep from present to nonpresent and tracks the
 * state bits; it is used to clear a last-level sptep.
 * Returns non-zero if the spte was previously present.
 */
821static int mmu_spte_clear_track_bits(u64 *sptep)
822{
823 kvm_pfn_t pfn;
824 u64 old_spte = *sptep;
825
826 if (!spte_has_volatile_bits(old_spte))
827 __update_clear_spte_fast(sptep, 0ull);
828 else
829 old_spte = __update_clear_spte_slow(sptep, 0ull);
830
831 if (!is_shadow_present_pte(old_spte))
832 return 0;
833
834 pfn = spte_to_pfn(old_spte);
835
 /*
  * KVM does not hold a reference to the page mapped by the spte, so a
  * page must be unmapped from the MMU before it is reclaimed; warn if
  * a still-mapped, non-reserved page already has a zero refcount.
  */
841 WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn)));
842
843 if (is_accessed_spte(old_spte))
844 kvm_set_pfn_accessed(pfn);
845
846 if (is_dirty_spte(old_spte))
847 kvm_set_pfn_dirty(pfn);
848
849 return 1;
850}
851
/*
 * Rules for using mmu_spte_clear_no_track:
 * Directly clear the spte without caring about the state bits of the
 * sptep; it is used to zap upper-level (non-last) sptes.
 */
857static void mmu_spte_clear_no_track(u64 *sptep)
858{
859 __update_clear_spte_fast(sptep, 0ull);
860}
861
862static u64 mmu_spte_get_lockless(u64 *sptep)
863{
864 return __get_spte_lockless(sptep);
865}
866
867static u64 mark_spte_for_access_track(u64 spte)
868{
869 if (spte_ad_enabled(spte))
870 return spte & ~shadow_accessed_mask;
871
872 if (is_access_track_spte(spte))
873 return spte;
874
 /*
  * Making an access-tracking spte removes write access from the spte,
  * so verify that write access can later be restored by the fast page
  * fault path, i.e. that the spte is locklessly dirty-trackable.
  */
880 WARN_ONCE((spte & PT_WRITABLE_MASK) &&
881 !spte_can_locklessly_be_made_writable(spte),
882 "kvm: Writable SPTE is not locklessly dirty-trackable\n");
883
884 WARN_ONCE(spte & (shadow_acc_track_saved_bits_mask <<
885 shadow_acc_track_saved_bits_shift),
886 "kvm: Access Tracking saved bit locations are not zero\n");
887
888 spte |= (spte & shadow_acc_track_saved_bits_mask) <<
889 shadow_acc_track_saved_bits_shift;
890 spte &= ~shadow_acc_track_mask;
891
892 return spte;
893}
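
/*
 * Sketch of the access-tracking transform above, assuming the EPT-style
 * configuration in which shadow_acc_track_mask covers the RWX permission
 * bits: the R and X bits (shadow_acc_track_saved_bits_mask) are copied up
 * by shadow_acc_track_saved_bits_shift (bit 0 -> bit 52, bit 2 -> bit 54)
 * and the permission bits are then cleared, so the SPTE is non-present to
 * hardware while its original permissions can be recovered by
 * restore_acc_track_spte() on the next fault.
 */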
894
/* Restore an acc-track PTE back to a regular PTE. */
896static u64 restore_acc_track_spte(u64 spte)
897{
898 u64 new_spte = spte;
899 u64 saved_bits = (spte >> shadow_acc_track_saved_bits_shift)
900 & shadow_acc_track_saved_bits_mask;
901
902 WARN_ON_ONCE(spte_ad_enabled(spte));
903 WARN_ON_ONCE(!is_access_track_spte(spte));
904
905 new_spte &= ~shadow_acc_track_mask;
906 new_spte &= ~(shadow_acc_track_saved_bits_mask <<
907 shadow_acc_track_saved_bits_shift);
908 new_spte |= saved_bits;
909
910 return new_spte;
911}
912
/* Returns the Accessed status of the PTE and resets it at the same time. */
914static bool mmu_spte_age(u64 *sptep)
915{
916 u64 spte = mmu_spte_get_lockless(sptep);
917
918 if (!is_accessed_spte(spte))
919 return false;
920
921 if (spte_ad_enabled(spte)) {
922 clear_bit((ffs(shadow_accessed_mask) - 1),
923 (unsigned long *)sptep);
924 } else {
 /*
  * Capture the dirty status of the page, so that it doesn't get
  * lost when the SPTE is marked for access tracking.
  */
929 if (is_writable_pte(spte))
930 kvm_set_pfn_dirty(spte_to_pfn(spte));
931
932 spte = mark_spte_for_access_track(spte);
933 mmu_spte_update_no_track(sptep, spte);
934 }
935
936 return true;
937}
938
939static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
940{
 /*
  * Prevent page table teardown by making any free-er wait during
  * kvm_flush_remote_tlbs() IPI to all active vcpus.
  */
945 local_irq_disable();
946
 /*
  * Make sure a following spte read is not reordered ahead of the write
  * to vcpu->mode.
  */
951 smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
952}
953
954static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
955{
 /*
  * Make sure the write to vcpu->mode is not reordered in front of
  * reads to sptes.  If it were, kvm_mmu_commit_zap_page() could see us
  * OUTSIDE_GUEST_MODE and proceed to free the page table.
  */
961 smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
962 local_irq_enable();
963}
964
965static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
966 struct kmem_cache *base_cache, int min)
967{
968 void *obj;
969
970 if (cache->nobjs >= min)
971 return 0;
972 while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
973 obj = kmem_cache_zalloc(base_cache, GFP_KERNEL_ACCOUNT);
974 if (!obj)
975 return cache->nobjs >= min ? 0 : -ENOMEM;
976 cache->objects[cache->nobjs++] = obj;
977 }
978 return 0;
979}
980
981static int mmu_memory_cache_free_objects(struct kvm_mmu_memory_cache *cache)
982{
983 return cache->nobjs;
984}
985
986static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
987 struct kmem_cache *cache)
988{
989 while (mc->nobjs)
990 kmem_cache_free(cache, mc->objects[--mc->nobjs]);
991}
992
993static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
994 int min)
995{
996 void *page;
997
998 if (cache->nobjs >= min)
999 return 0;
1000 while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
1001 page = (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
1002 if (!page)
1003 return cache->nobjs >= min ? 0 : -ENOMEM;
1004 cache->objects[cache->nobjs++] = page;
1005 }
1006 return 0;
1007}
1008
1009static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
1010{
1011 while (mc->nobjs)
1012 free_page((unsigned long)mc->objects[--mc->nobjs]);
1013}
1014
1015static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
1016{
1017 int r;
1018
1019 r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
1020 pte_list_desc_cache, 8 + PTE_PREFETCH_NUM);
1021 if (r)
1022 goto out;
1023 r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
1024 if (r)
1025 goto out;
1026 r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
1027 mmu_page_header_cache, 4);
1028out:
1029 return r;
1030}
1031
1032static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
1033{
1034 mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
1035 pte_list_desc_cache);
1036 mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
1037 mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
1038 mmu_page_header_cache);
1039}
1040
1041static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
1042{
1043 void *p;
1044
1045 BUG_ON(!mc->nobjs);
1046 p = mc->objects[--mc->nobjs];
1047 return p;
1048}
1049
1050static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)
1051{
1052 return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache);
1053}
1054
1055static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
1056{
1057 kmem_cache_free(pte_list_desc_cache, pte_list_desc);
1058}
1059
1060static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
1061{
1062 if (!sp->role.direct)
1063 return sp->gfns[index];
1064
1065 return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
1066}
1067
1068static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
1069{
1070 if (sp->role.direct)
1071 BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index));
1072 else
1073 sp->gfns[index] = gfn;
1074}
1075
/*
 * Return the pointer to the large page information for a given gfn,
 * handling slots that are not large page aligned.
 */
1080static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
1081 struct kvm_memory_slot *slot,
1082 int level)
1083{
1084 unsigned long idx;
1085
1086 idx = gfn_to_index(gfn, slot->base_gfn, level);
1087 return &slot->arch.lpage_info[level - 2][idx];
1088}
1089
1090static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot,
1091 gfn_t gfn, int count)
1092{
1093 struct kvm_lpage_info *linfo;
1094 int i;
1095
1096 for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
1097 linfo = lpage_info_slot(gfn, slot, i);
1098 linfo->disallow_lpage += count;
1099 WARN_ON(linfo->disallow_lpage < 0);
1100 }
1101}
1102
1103void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
1104{
1105 update_gfn_disallow_lpage_count(slot, gfn, 1);
1106}
1107
1108void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
1109{
1110 update_gfn_disallow_lpage_count(slot, gfn, -1);
1111}
1112
1113static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
1114{
1115 struct kvm_memslots *slots;
1116 struct kvm_memory_slot *slot;
1117 gfn_t gfn;
1118
1119 kvm->arch.indirect_shadow_pages++;
1120 gfn = sp->gfn;
1121 slots = kvm_memslots_for_spte_role(kvm, sp->role);
1122 slot = __gfn_to_memslot(slots, gfn);
1123
 /* Non-leaf shadow pages are write-tracked so they stay read-only. */
1125 if (sp->role.level > PT_PAGE_TABLE_LEVEL)
1126 return kvm_slot_page_track_add_page(kvm, slot, gfn,
1127 KVM_PAGE_TRACK_WRITE);
1128
1129 kvm_mmu_gfn_disallow_lpage(slot, gfn);
1130}
1131
1132static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
1133{
1134 struct kvm_memslots *slots;
1135 struct kvm_memory_slot *slot;
1136 gfn_t gfn;
1137
1138 kvm->arch.indirect_shadow_pages--;
1139 gfn = sp->gfn;
1140 slots = kvm_memslots_for_spte_role(kvm, sp->role);
1141 slot = __gfn_to_memslot(slots, gfn);
1142 if (sp->role.level > PT_PAGE_TABLE_LEVEL)
1143 return kvm_slot_page_track_remove_page(kvm, slot, gfn,
1144 KVM_PAGE_TRACK_WRITE);
1145
1146 kvm_mmu_gfn_allow_lpage(slot, gfn);
1147}
1148
1149static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level,
1150 struct kvm_memory_slot *slot)
1151{
1152 struct kvm_lpage_info *linfo;
1153
1154 if (slot) {
1155 linfo = lpage_info_slot(gfn, slot, level);
1156 return !!linfo->disallow_lpage;
1157 }
1158
1159 return true;
1160}
1161
1162static bool mmu_gfn_lpage_is_disallowed(struct kvm_vcpu *vcpu, gfn_t gfn,
1163 int level)
1164{
1165 struct kvm_memory_slot *slot;
1166
1167 slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
1168 return __mmu_gfn_lpage_is_disallowed(gfn, level, slot);
1169}
1170
1171static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
1172{
1173 unsigned long page_size;
1174 int i, ret = 0;
1175
1176 page_size = kvm_host_page_size(kvm, gfn);
1177
1178 for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
1179 if (page_size >= KVM_HPAGE_SIZE(i))
1180 ret = i;
1181 else
1182 break;
1183 }
1184
1185 return ret;
1186}
1187
1188static inline bool memslot_valid_for_gpte(struct kvm_memory_slot *slot,
1189 bool no_dirty_log)
1190{
1191 if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
1192 return false;
1193 if (no_dirty_log && slot->dirty_bitmap)
1194 return false;
1195
1196 return true;
1197}
1198
1199static struct kvm_memory_slot *
1200gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
1201 bool no_dirty_log)
1202{
1203 struct kvm_memory_slot *slot;
1204
1205 slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
1206 if (!memslot_valid_for_gpte(slot, no_dirty_log))
1207 slot = NULL;
1208
1209 return slot;
1210}
1211
1212static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn,
1213 bool *force_pt_level)
1214{
1215 int host_level, level, max_level;
1216 struct kvm_memory_slot *slot;
1217
1218 if (unlikely(*force_pt_level))
1219 return PT_PAGE_TABLE_LEVEL;
1220
1221 slot = kvm_vcpu_gfn_to_memslot(vcpu, large_gfn);
1222 *force_pt_level = !memslot_valid_for_gpte(slot, true);
1223 if (unlikely(*force_pt_level))
1224 return PT_PAGE_TABLE_LEVEL;
1225
1226 host_level = host_mapping_level(vcpu->kvm, large_gfn);
1227
1228 if (host_level == PT_PAGE_TABLE_LEVEL)
1229 return host_level;
1230
1231 max_level = min(kvm_x86_ops->get_lpage_level(), host_level);
1232
1233 for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
1234 if (__mmu_gfn_lpage_is_disallowed(large_gfn, level, slot))
1235 break;
1236
1237 return level - 1;
1238}
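
/*
 * mapping_level() thus returns the largest page-table level usable for
 * @large_gfn: PT_PAGE_TABLE_LEVEL when huge pages are forced off (invalid
 * slot, dirty logging, or caller request), otherwise the host mapping
 * level capped by the vendor's maximum large-page level and reduced to the
 * last level at which lpage_info does not disallow a large mapping.
 */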
1239
/*
 * About rmap_head encoding:
 *
 * If bit zero of rmap_head->val is clear, then it points to the only spte
 * in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct
 * pte_list_desc containing more mappings.
 */

/*
 * Returns the number of pointers in the rmap chain, not counting the new one.
 */
1251static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
1252 struct kvm_rmap_head *rmap_head)
1253{
1254 struct pte_list_desc *desc;
1255 int i, count = 0;
1256
1257 if (!rmap_head->val) {
1258 rmap_printk("pte_list_add: %p %llx 0->1\n", spte, *spte);
1259 rmap_head->val = (unsigned long)spte;
1260 } else if (!(rmap_head->val & 1)) {
1261 rmap_printk("pte_list_add: %p %llx 1->many\n", spte, *spte);
1262 desc = mmu_alloc_pte_list_desc(vcpu);
1263 desc->sptes[0] = (u64 *)rmap_head->val;
1264 desc->sptes[1] = spte;
1265 rmap_head->val = (unsigned long)desc | 1;
1266 ++count;
1267 } else {
1268 rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte);
1269 desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
1270 while (desc->sptes[PTE_LIST_EXT-1] && desc->more) {
1271 desc = desc->more;
1272 count += PTE_LIST_EXT;
1273 }
1274 if (desc->sptes[PTE_LIST_EXT-1]) {
1275 desc->more = mmu_alloc_pte_list_desc(vcpu);
1276 desc = desc->more;
1277 }
1278 for (i = 0; desc->sptes[i]; ++i)
1279 ++count;
1280 desc->sptes[i] = spte;
1281 }
1282 return count;
1283}
1284
1285static void
1286pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
1287 struct pte_list_desc *desc, int i,
1288 struct pte_list_desc *prev_desc)
1289{
1290 int j;
1291
1292 for (j = PTE_LIST_EXT - 1; !desc->sptes[j] && j > i; --j)
1293 ;
1294 desc->sptes[i] = desc->sptes[j];
1295 desc->sptes[j] = NULL;
1296 if (j != 0)
1297 return;
1298 if (!prev_desc && !desc->more)
1299 rmap_head->val = (unsigned long)desc->sptes[0];
1300 else
1301 if (prev_desc)
1302 prev_desc->more = desc->more;
1303 else
1304 rmap_head->val = (unsigned long)desc->more | 1;
1305 mmu_free_pte_list_desc(desc);
1306}
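
/*
 * Removal above compacts the descriptor in place: the last used slot @j is
 * moved into the freed slot @i.  Once a descriptor becomes empty (j == 0)
 * it is freed; if it was the only descriptor the rmap head is emptied,
 * otherwise the descriptor is spliced out of the chain.
 */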
1307
1308static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
1309{
1310 struct pte_list_desc *desc;
1311 struct pte_list_desc *prev_desc;
1312 int i;
1313
1314 if (!rmap_head->val) {
1315 pr_err("%s: %p 0->BUG\n", __func__, spte);
1316 BUG();
1317 } else if (!(rmap_head->val & 1)) {
1318 rmap_printk("%s: %p 1->0\n", __func__, spte);
1319 if ((u64 *)rmap_head->val != spte) {
1320 pr_err("%s: %p 1->BUG\n", __func__, spte);
1321 BUG();
1322 }
1323 rmap_head->val = 0;
1324 } else {
1325 rmap_printk("%s: %p many->many\n", __func__, spte);
1326 desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
1327 prev_desc = NULL;
1328 while (desc) {
1329 for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) {
1330 if (desc->sptes[i] == spte) {
1331 pte_list_desc_remove_entry(rmap_head,
1332 desc, i, prev_desc);
1333 return;
1334 }
1335 }
1336 prev_desc = desc;
1337 desc = desc->more;
1338 }
1339 pr_err("%s: %p many->many\n", __func__, spte);
1340 BUG();
1341 }
1342}
1343
1344static void pte_list_remove(struct kvm_rmap_head *rmap_head, u64 *sptep)
1345{
1346 mmu_spte_clear_track_bits(sptep);
1347 __pte_list_remove(sptep, rmap_head);
1348}
1349
1350static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level,
1351 struct kvm_memory_slot *slot)
1352{
1353 unsigned long idx;
1354
1355 idx = gfn_to_index(gfn, slot->base_gfn, level);
1356 return &slot->arch.rmap[level - PT_PAGE_TABLE_LEVEL][idx];
1357}
1358
1359static struct kvm_rmap_head *gfn_to_rmap(struct kvm *kvm, gfn_t gfn,
1360 struct kvm_mmu_page *sp)
1361{
1362 struct kvm_memslots *slots;
1363 struct kvm_memory_slot *slot;
1364
1365 slots = kvm_memslots_for_spte_role(kvm, sp->role);
1366 slot = __gfn_to_memslot(slots, gfn);
1367 return __gfn_to_rmap(gfn, sp->role.level, slot);
1368}
1369
1370static bool rmap_can_add(struct kvm_vcpu *vcpu)
1371{
1372 struct kvm_mmu_memory_cache *cache;
1373
1374 cache = &vcpu->arch.mmu_pte_list_desc_cache;
1375 return mmu_memory_cache_free_objects(cache);
1376}
1377
1378static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
1379{
1380 struct kvm_mmu_page *sp;
1381 struct kvm_rmap_head *rmap_head;
1382
1383 sp = page_header(__pa(spte));
1384 kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
1385 rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
1386 return pte_list_add(vcpu, spte, rmap_head);
1387}
1388
1389static void rmap_remove(struct kvm *kvm, u64 *spte)
1390{
1391 struct kvm_mmu_page *sp;
1392 gfn_t gfn;
1393 struct kvm_rmap_head *rmap_head;
1394
1395 sp = page_header(__pa(spte));
1396 gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
1397 rmap_head = gfn_to_rmap(kvm, gfn, sp);
1398 __pte_list_remove(spte, rmap_head);
1399}
1400
/*
 * Used by the following functions to iterate through the sptes linked by a
 * rmap.  All fields are private and not assumed to be used outside.
 */
1405struct rmap_iterator {
 /* private fields */
1407 struct pte_list_desc *desc;
1408 int pos;
1409};
1410
/*
 * Iteration must be started by this function.  This should also be used
 * after removing/dropping sptes from the rmap link because in such cases
 * the information in the iterator may not be valid.
 *
 * Returns sptep if found, NULL otherwise.
 */
1418static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
1419 struct rmap_iterator *iter)
1420{
1421 u64 *sptep;
1422
1423 if (!rmap_head->val)
1424 return NULL;
1425
1426 if (!(rmap_head->val & 1)) {
1427 iter->desc = NULL;
1428 sptep = (u64 *)rmap_head->val;
1429 goto out;
1430 }
1431
1432 iter->desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
1433 iter->pos = 0;
1434 sptep = iter->desc->sptes[iter->pos];
1435out:
1436 BUG_ON(!is_shadow_present_pte(*sptep));
1437 return sptep;
1438}
1439
/*
 * Must be used with a valid iterator, e.g. after rmap_get_first().
 *
 * Returns sptep if found, NULL otherwise.
 */
1445static u64 *rmap_get_next(struct rmap_iterator *iter)
1446{
1447 u64 *sptep;
1448
1449 if (iter->desc) {
1450 if (iter->pos < PTE_LIST_EXT - 1) {
1451 ++iter->pos;
1452 sptep = iter->desc->sptes[iter->pos];
1453 if (sptep)
1454 goto out;
1455 }
1456
1457 iter->desc = iter->desc->more;
1458
1459 if (iter->desc) {
1460 iter->pos = 0;
1461
1462 sptep = iter->desc->sptes[iter->pos];
1463 goto out;
1464 }
1465 }
1466
1467 return NULL;
1468out:
1469 BUG_ON(!is_shadow_present_pte(*sptep));
1470 return sptep;
1471}
1472
1473#define for_each_rmap_spte(_rmap_head_, _iter_, _spte_) \
1474 for (_spte_ = rmap_get_first(_rmap_head_, _iter_); \
1475 _spte_; _spte_ = rmap_get_next(_iter_))
1476
1477static void drop_spte(struct kvm *kvm, u64 *sptep)
1478{
1479 if (mmu_spte_clear_track_bits(sptep))
1480 rmap_remove(kvm, sptep);
1481}
1482
1483
1484static bool __drop_large_spte(struct kvm *kvm, u64 *sptep)
1485{
1486 if (is_large_pte(*sptep)) {
1487 WARN_ON(page_header(__pa(sptep))->role.level ==
1488 PT_PAGE_TABLE_LEVEL);
1489 drop_spte(kvm, sptep);
1490 --kvm->stat.lpages;
1491 return true;
1492 }
1493
1494 return false;
1495}
1496
1497static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
1498{
1499 if (__drop_large_spte(vcpu->kvm, sptep)) {
1500 struct kvm_mmu_page *sp = page_header(__pa(sptep));
1501
1502 kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
1503 KVM_PAGES_PER_HPAGE(sp->role.level));
1504 }
1505}
1506
/*
 * Write-protect the specified @sptep; @pt_protect indicates whether the
 * spte write-protection is caused by protecting a shadow page table.
 *
 * Note: write protection differs between dirty logging and spte
 * protection:
 * - for dirty logging, the spte can be made writable again at any time
 *   as long as its dirty bitmap is properly set.
 * - for spte protection, the spte can be made writable only after the
 *   shadow page is unsync'ed.
 *
 * Returns true if the TLB needs to be flushed.
 */
1520static bool spte_write_protect(u64 *sptep, bool pt_protect)
1521{
1522 u64 spte = *sptep;
1523
1524 if (!is_writable_pte(spte) &&
1525 !(pt_protect && spte_can_locklessly_be_made_writable(spte)))
1526 return false;
1527
1528 rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
1529
1530 if (pt_protect)
1531 spte &= ~SPTE_MMU_WRITEABLE;
1532 spte = spte & ~PT_WRITABLE_MASK;
1533
1534 return mmu_spte_update(sptep, spte);
1535}
1536
1537static bool __rmap_write_protect(struct kvm *kvm,
1538 struct kvm_rmap_head *rmap_head,
1539 bool pt_protect)
1540{
1541 u64 *sptep;
1542 struct rmap_iterator iter;
1543 bool flush = false;
1544
1545 for_each_rmap_spte(rmap_head, &iter, sptep)
1546 flush |= spte_write_protect(sptep, pt_protect);
1547
1548 return flush;
1549}
1550
1551static bool spte_clear_dirty(u64 *sptep)
1552{
1553 u64 spte = *sptep;
1554
1555 rmap_printk("rmap_clear_dirty: spte %p %llx\n", sptep, *sptep);
1556
1557 spte &= ~shadow_dirty_mask;
1558
1559 return mmu_spte_update(sptep, spte);
1560}
1561
1562static bool wrprot_ad_disabled_spte(u64 *sptep)
1563{
1564 bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT,
1565 (unsigned long *)sptep);
1566 if (was_writable)
1567 kvm_set_pfn_dirty(spte_to_pfn(*sptep));
1568
1569 return was_writable;
1570}
1571
/*
 * Gets the GFN ready for another round of dirty logging by clearing the
 *	- D bit on ad-enabled SPTEs, and
 *	- W bit on ad-disabled SPTEs (if protected for dirty logging)
 * Returns true iff any D or W bits were cleared.
 */
1578static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
1579{
1580 u64 *sptep;
1581 struct rmap_iterator iter;
1582 bool flush = false;
1583
1584 for_each_rmap_spte(rmap_head, &iter, sptep)
1585 if (spte_ad_enabled(*sptep))
1586 flush |= spte_clear_dirty(sptep);
1587 else
1588 flush |= wrprot_ad_disabled_spte(sptep);
1589
1590 return flush;
1591}
1592
1593static bool spte_set_dirty(u64 *sptep)
1594{
1595 u64 spte = *sptep;
1596
1597 rmap_printk("rmap_set_dirty: spte %p %llx\n", sptep, *sptep);
1598
1599 spte |= shadow_dirty_mask;
1600
1601 return mmu_spte_update(sptep, spte);
1602}
1603
1604static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
1605{
1606 u64 *sptep;
1607 struct rmap_iterator iter;
1608 bool flush = false;
1609
1610 for_each_rmap_spte(rmap_head, &iter, sptep)
1611 if (spte_ad_enabled(*sptep))
1612 flush |= spte_set_dirty(sptep);
1613
1614 return flush;
1615}
1616
/**
 * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
 * @kvm: kvm instance
 * @slot: slot to protect
 * @gfn_offset: start of the BITS_PER_LONG pages we care about
 * @mask: indicates which pages we should protect
 *
 * Used when we do not need to care about huge page mappings: e.g. during
 * dirty logging we do not have any such mappings.
 */
1627static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
1628 struct kvm_memory_slot *slot,
1629 gfn_t gfn_offset, unsigned long mask)
1630{
1631 struct kvm_rmap_head *rmap_head;
1632
1633 while (mask) {
1634 rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
1635 PT_PAGE_TABLE_LEVEL, slot);
1636 __rmap_write_protect(kvm, rmap_head, false);
1637
 /* clear the first set bit */
1639 mask &= mask - 1;
1640 }
1641}
1642
/**
 * kvm_mmu_clear_dirty_pt_masked - clear the MMU D-bit for PT level pages,
 * or write protect the page if the D-bit isn't supported.
 * @kvm: kvm instance
 * @slot: slot to clear D-bit
 * @gfn_offset: start of the BITS_PER_LONG pages we care about
 * @mask: indicates which pages we should clear the D-bit for
 *
 * Used for PML to re-log the dirty GPAs after userspace queries dirty_bitmap.
 */
1653void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1654 struct kvm_memory_slot *slot,
1655 gfn_t gfn_offset, unsigned long mask)
1656{
1657 struct kvm_rmap_head *rmap_head;
1658
1659 while (mask) {
1660 rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
1661 PT_PAGE_TABLE_LEVEL, slot);
1662 __rmap_clear_dirty(kvm, rmap_head);
1663
 /* clear the first set bit */
1665 mask &= mask - 1;
1666 }
1667}
1668EXPORT_SYMBOL_GPL(kvm_mmu_clear_dirty_pt_masked);
1669
/**
 * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
 * PT level pages.
 *
 * If the vendor module provides an enable_log_dirty_pt_masked hook it is
 * used, otherwise the pages are write protected via
 * kvm_mmu_write_protect_pt_masked() to enable dirty logging for them.
 *
 * Used when we do not need to care about huge page mappings: e.g. during
 * dirty logging we do not have any such mappings.
 */
1680void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
1681 struct kvm_memory_slot *slot,
1682 gfn_t gfn_offset, unsigned long mask)
1683{
1684 if (kvm_x86_ops->enable_log_dirty_pt_masked)
1685 kvm_x86_ops->enable_log_dirty_pt_masked(kvm, slot, gfn_offset,
1686 mask);
1687 else
1688 kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
1689}
1690
/*
 * kvm_arch_write_log_dirty - emulate dirty page logging
 * @vcpu: Guest mode vcpu
 *
 * Dispatches arch-specific dirty page logging emulation to the vendor
 * module, if it provides a write_log_dirty hook.
 */
1698int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu)
1699{
1700 if (kvm_x86_ops->write_log_dirty)
1701 return kvm_x86_ops->write_log_dirty(vcpu);
1702
1703 return 0;
1704}
1705
1706bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
1707 struct kvm_memory_slot *slot, u64 gfn)
1708{
1709 struct kvm_rmap_head *rmap_head;
1710 int i;
1711 bool write_protected = false;
1712
1713 for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
1714 rmap_head = __gfn_to_rmap(gfn, i, slot);
1715 write_protected |= __rmap_write_protect(kvm, rmap_head, true);
1716 }
1717
1718 return write_protected;
1719}
1720
1721static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
1722{
1723 struct kvm_memory_slot *slot;
1724
1725 slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
1726 return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn);
1727}
1728
1729static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
1730{
1731 u64 *sptep;
1732 struct rmap_iterator iter;
1733 bool flush = false;
1734
1735 while ((sptep = rmap_get_first(rmap_head, &iter))) {
1736 rmap_printk("%s: spte %p %llx.\n", __func__, sptep, *sptep);
1737
1738 pte_list_remove(rmap_head, sptep);
1739 flush = true;
1740 }
1741
1742 return flush;
1743}
1744
1745static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1746 struct kvm_memory_slot *slot, gfn_t gfn, int level,
1747 unsigned long data)
1748{
1749 return kvm_zap_rmapp(kvm, rmap_head);
1750}
1751
1752static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1753 struct kvm_memory_slot *slot, gfn_t gfn, int level,
1754 unsigned long data)
1755{
1756 u64 *sptep;
1757 struct rmap_iterator iter;
1758 int need_flush = 0;
1759 u64 new_spte;
1760 pte_t *ptep = (pte_t *)data;
1761 kvm_pfn_t new_pfn;
1762
1763 WARN_ON(pte_huge(*ptep));
1764 new_pfn = pte_pfn(*ptep);
1765
1766restart:
1767 for_each_rmap_spte(rmap_head, &iter, sptep) {
1768 rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n",
1769 sptep, *sptep, gfn, level);
1770
1771 need_flush = 1;
1772
1773 if (pte_write(*ptep)) {
1774 pte_list_remove(rmap_head, sptep);
1775 goto restart;
1776 } else {
1777 new_spte = *sptep & ~PT64_BASE_ADDR_MASK;
1778 new_spte |= (u64)new_pfn << PAGE_SHIFT;
1779
1780 new_spte &= ~PT_WRITABLE_MASK;
1781 new_spte &= ~SPTE_HOST_WRITEABLE;
1782
1783 new_spte = mark_spte_for_access_track(new_spte);
1784
1785 mmu_spte_clear_track_bits(sptep);
1786 mmu_spte_set(sptep, new_spte);
1787 }
1788 }
1789
1790 if (need_flush && kvm_available_flush_tlb_with_range()) {
1791 kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
1792 return 0;
1793 }
1794
1795 return need_flush;
1796}
1797
1798struct slot_rmap_walk_iterator {
 /* input fields. */
1800 struct kvm_memory_slot *slot;
1801 gfn_t start_gfn;
1802 gfn_t end_gfn;
1803 int start_level;
1804 int end_level;
1805
 /* output fields. */
1807 gfn_t gfn;
1808 struct kvm_rmap_head *rmap;
1809 int level;
1810
 /* private field. */
1812 struct kvm_rmap_head *end_rmap;
1813};
1814
1815static void
1816rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level)
1817{
1818 iterator->level = level;
1819 iterator->gfn = iterator->start_gfn;
1820 iterator->rmap = __gfn_to_rmap(iterator->gfn, level, iterator->slot);
1821 iterator->end_rmap = __gfn_to_rmap(iterator->end_gfn, level,
1822 iterator->slot);
1823}
1824
1825static void
1826slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator,
1827 struct kvm_memory_slot *slot, int start_level,
1828 int end_level, gfn_t start_gfn, gfn_t end_gfn)
1829{
1830 iterator->slot = slot;
1831 iterator->start_level = start_level;
1832 iterator->end_level = end_level;
1833 iterator->start_gfn = start_gfn;
1834 iterator->end_gfn = end_gfn;
1835
1836 rmap_walk_init_level(iterator, iterator->start_level);
1837}
1838
1839static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator)
1840{
1841 return !!iterator->rmap;
1842}
1843
1844static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
1845{
1846 if (++iterator->rmap <= iterator->end_rmap) {
1847 iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level));
1848 return;
1849 }
1850
1851 if (++iterator->level > iterator->end_level) {
1852 iterator->rmap = NULL;
1853 return;
1854 }
1855
1856 rmap_walk_init_level(iterator, iterator->level);
1857}
1858
1859#define for_each_slot_rmap_range(_slot_, _start_level_, _end_level_, \
1860 _start_gfn, _end_gfn, _iter_) \
1861 for (slot_rmap_walk_init(_iter_, _slot_, _start_level_, \
1862 _end_level_, _start_gfn, _end_gfn); \
1863 slot_rmap_walk_okay(_iter_); \
1864 slot_rmap_walk_next(_iter_))
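
/*
 * The walk above visits, for each level from _start_level_ to _end_level_,
 * every rmap head covering a gfn in [_start_gfn, _end_gfn]:
 * slot_rmap_walk_next() steps through the rmap array of the current level
 * and re-initializes the iterator at the next level once end_rmap is
 * passed, terminating after end_level has been finished.
 */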
1865
1866static int kvm_handle_hva_range(struct kvm *kvm,
1867 unsigned long start,
1868 unsigned long end,
1869 unsigned long data,
1870 int (*handler)(struct kvm *kvm,
1871 struct kvm_rmap_head *rmap_head,
1872 struct kvm_memory_slot *slot,
1873 gfn_t gfn,
1874 int level,
1875 unsigned long data))
1876{
1877 struct kvm_memslots *slots;
1878 struct kvm_memory_slot *memslot;
1879 struct slot_rmap_walk_iterator iterator;
1880 int ret = 0;
1881 int i;
1882
1883 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
1884 slots = __kvm_memslots(kvm, i);
1885 kvm_for_each_memslot(memslot, slots) {
1886 unsigned long hva_start, hva_end;
1887 gfn_t gfn_start, gfn_end;
1888
1889 hva_start = max(start, memslot->userspace_addr);
1890 hva_end = min(end, memslot->userspace_addr +
1891 (memslot->npages << PAGE_SHIFT));
1892 if (hva_start >= hva_end)
1893 continue;
1894
 /*
  * {gfn(page) | page intersects with [hva_start, hva_end)} =
  * {gfn_start, gfn_start+1, ..., gfn_end-1}.
  */
1898 gfn_start = hva_to_gfn_memslot(hva_start, memslot);
1899 gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
1900
1901 for_each_slot_rmap_range(memslot, PT_PAGE_TABLE_LEVEL,
1902 PT_MAX_HUGEPAGE_LEVEL,
1903 gfn_start, gfn_end - 1,
1904 &iterator)
1905 ret |= handler(kvm, iterator.rmap, memslot,
1906 iterator.gfn, iterator.level, data);
1907 }
1908 }
1909
1910 return ret;
1911}
1912
1913static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
1914 unsigned long data,
1915 int (*handler)(struct kvm *kvm,
1916 struct kvm_rmap_head *rmap_head,
1917 struct kvm_memory_slot *slot,
1918 gfn_t gfn, int level,
1919 unsigned long data))
1920{
1921 return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler);
1922}
1923
1924int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
1925{
1926 return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp);
1927}
1928
1929int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
1930{
1931 return kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
1932}
1933
1934static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1935 struct kvm_memory_slot *slot, gfn_t gfn, int level,
1936 unsigned long data)
1937{
1938 u64 *sptep;
1939 struct rmap_iterator uninitialized_var(iter);
1940 int young = 0;
1941
1942 for_each_rmap_spte(rmap_head, &iter, sptep)
1943 young |= mmu_spte_age(sptep);
1944
1945 trace_kvm_age_page(gfn, level, slot, young);
1946 return young;
1947}
1948
1949static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1950 struct kvm_memory_slot *slot, gfn_t gfn,
1951 int level, unsigned long data)
1952{
1953 u64 *sptep;
1954 struct rmap_iterator iter;
1955
1956 for_each_rmap_spte(rmap_head, &iter, sptep)
1957 if (is_accessed_spte(*sptep))
1958 return 1;
1959 return 0;
1960}
1961
1962#define RMAP_RECYCLE_THRESHOLD 1000
1963
1964static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
1965{
1966 struct kvm_rmap_head *rmap_head;
1967 struct kvm_mmu_page *sp;
1968
1969 sp = page_header(__pa(spte));
1970
1971 rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
1972
1973 kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, 0);
1974 kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
1975 KVM_PAGES_PER_HPAGE(sp->role.level));
1976}
1977
1978int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
1979{
1980 return kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp);
1981}
1982
1983int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
1984{
1985 return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
1986}
1987
1988#ifdef MMU_DEBUG
1989static int is_empty_shadow_page(u64 *spt)
1990{
1991 u64 *pos;
1992 u64 *end;
1993
1994 for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
1995 if (is_shadow_present_pte(*pos)) {
1996 printk(KERN_ERR "%s: %p %llx\n", __func__,
1997 pos, *pos);
1998 return 0;
1999 }
2000 return 1;
2001}
2002#endif
2003
/*
 * This value is the sum of all of the kvm instances'
 * kvm->arch.n_used_mmu_pages values.  We need a global, aggregate
 * version in order to make the slab shrinker work with multiple kvm
 * instances.
 */
2010static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, unsigned long nr)
2011{
2012 kvm->arch.n_used_mmu_pages += nr;
2013 percpu_counter_add(&kvm_total_used_mmu_pages, nr);
2014}
2015
2016static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
2017{
2018 MMU_WARN_ON(!is_empty_shadow_page(sp->spt));
2019 hlist_del(&sp->hash_link);
2020 list_del(&sp->link);
2021 free_page((unsigned long)sp->spt);
2022 if (!sp->role.direct)
2023 free_page((unsigned long)sp->gfns);
2024 kmem_cache_free(mmu_page_header_cache, sp);
2025}
2026
2027static unsigned kvm_page_table_hashfn(gfn_t gfn)
2028{
2029 return hash_64(gfn, KVM_MMU_HASH_SHIFT);
2030}
2031
2032static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
2033 struct kvm_mmu_page *sp, u64 *parent_pte)
2034{
2035 if (!parent_pte)
2036 return;
2037
2038 pte_list_add(vcpu, parent_pte, &sp->parent_ptes);
2039}
2040
2041static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
2042 u64 *parent_pte)
2043{
2044 __pte_list_remove(parent_pte, &sp->parent_ptes);
2045}
2046
2047static void drop_parent_pte(struct kvm_mmu_page *sp,
2048 u64 *parent_pte)
2049{
2050 mmu_page_remove_parent_pte(sp, parent_pte);
2051 mmu_spte_clear_no_track(parent_pte);
2052}
2053
2054static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct)
2055{
2056 struct kvm_mmu_page *sp;
2057
2058 sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
2059 sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
2060 if (!direct)
2061 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
2062 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
2063 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
2064 kvm_mod_used_mmu_pages(vcpu->kvm, +1);
2065 return sp;
2066}
2067
2068static void mark_unsync(u64 *spte);
2069static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
2070{
2071 u64 *sptep;
2072 struct rmap_iterator iter;
2073
2074 for_each_rmap_spte(&sp->parent_ptes, &iter, sptep) {
2075 mark_unsync(sptep);
2076 }
2077}
2078
2079static void mark_unsync(u64 *spte)
2080{
2081 struct kvm_mmu_page *sp;
2082 unsigned int index;
2083
2084 sp = page_header(__pa(spte));
2085 index = spte - sp->spt;
2086 if (__test_and_set_bit(index, sp->unsync_child_bitmap))
2087 return;
2088 if (sp->unsync_children++)
2089 return;
2090 kvm_mmu_mark_parents_unsync(sp);
2091}
2092
2093static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
2094 struct kvm_mmu_page *sp)
2095{
2096 return 0;
2097}
2098
2099static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root)
2100{
2101}
2102
2103static void nonpaging_update_pte(struct kvm_vcpu *vcpu,
2104 struct kvm_mmu_page *sp, u64 *spte,
2105 const void *pte)
2106{
2107 WARN_ON(1);
2108}
2109
2110#define KVM_PAGE_ARRAY_NR 16
2111
2112struct kvm_mmu_pages {
2113 struct mmu_page_and_offset {
2114 struct kvm_mmu_page *sp;
2115 unsigned int idx;
2116 } page[KVM_PAGE_ARRAY_NR];
2117 unsigned int nr;
2118};
2119
2120static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
2121 int idx)
2122{
2123 int i;
2124
2125 if (sp->unsync)
2126 for (i=0; i < pvec->nr; i++)
2127 if (pvec->page[i].sp == sp)
2128 return 0;
2129
2130 pvec->page[pvec->nr].sp = sp;
2131 pvec->page[pvec->nr].idx = idx;
2132 pvec->nr++;
2133 return (pvec->nr == KVM_PAGE_ARRAY_NR);
2134}
2135
2136static inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx)
2137{
2138 --sp->unsync_children;
2139 WARN_ON((int)sp->unsync_children < 0);
2140 __clear_bit(idx, sp->unsync_child_bitmap);
2141}
2142
2143static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
2144 struct kvm_mmu_pages *pvec)
2145{
2146 int i, ret, nr_unsync_leaf = 0;
2147
2148 for_each_set_bit(i, sp->unsync_child_bitmap, 512) {
2149 struct kvm_mmu_page *child;
2150 u64 ent = sp->spt[i];
2151
2152 if (!is_shadow_present_pte(ent) || is_large_pte(ent)) {
2153 clear_unsync_child_bit(sp, i);
2154 continue;
2155 }
2156
2157 child = page_header(ent & PT64_BASE_ADDR_MASK);
2158
2159 if (child->unsync_children) {
2160 if (mmu_pages_add(pvec, child, i))
2161 return -ENOSPC;
2162
2163 ret = __mmu_unsync_walk(child, pvec);
2164 if (!ret) {
2165 clear_unsync_child_bit(sp, i);
2166 continue;
2167 } else if (ret > 0) {
2168 nr_unsync_leaf += ret;
2169 } else
2170 return ret;
2171 } else if (child->unsync) {
2172 nr_unsync_leaf++;
2173 if (mmu_pages_add(pvec, child, i))
2174 return -ENOSPC;
2175 } else
2176 clear_unsync_child_bit(sp, i);
2177 }
2178
2179 return nr_unsync_leaf;
2180}
2181
2182#define INVALID_INDEX (-1)
2183
2184static int mmu_unsync_walk(struct kvm_mmu_page *sp,
2185 struct kvm_mmu_pages *pvec)
2186{
2187 pvec->nr = 0;
2188 if (!sp->unsync_children)
2189 return 0;
2190
2191 mmu_pages_add(pvec, sp, INVALID_INDEX);
2192 return __mmu_unsync_walk(sp, pvec);
2193}
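
/*
 * mmu_unsync_walk() and __mmu_unsync_walk() fill @pvec with (shadow page,
 * parent index) pairs for every page reachable from @sp that is unsync or
 * has unsync children; the root itself is added with the INVALID_INDEX
 * sentinel.  The return value is the number of unsync leaf pages found,
 * 0 if there is nothing to sync, or -ENOSPC once the fixed-size vector
 * (KVM_PAGE_ARRAY_NR entries) overflows, in which case the caller
 * processes what was collected and walks again.
 */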
2194
2195static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
2196{
2197 WARN_ON(!sp->unsync);
2198 trace_kvm_mmu_sync_page(sp);
2199 sp->unsync = 0;
2200 --kvm->stat.mmu_unsync;
2201}
2202
2203static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
2204 struct list_head *invalid_list);
2205static void kvm_mmu_commit_zap_page(struct kvm *kvm,
2206 struct list_head *invalid_list);
2207
2208
2209#define for_each_valid_sp(_kvm, _sp, _gfn) \
2210 hlist_for_each_entry(_sp, \
2211 &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
2212 if ((_sp)->role.invalid) { \
2213 } else
2214
2215#define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \
2216 for_each_valid_sp(_kvm, _sp, _gfn) \
2217 if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else
2218
2219static inline bool is_ept_sp(struct kvm_mmu_page *sp)
2220{
2221 return sp->role.cr0_wp && sp->role.smap_andnot_wp;
2222}
2223
2224
2225static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
2226 struct list_head *invalid_list)
2227{
2228 if ((!is_ept_sp(sp) && sp->role.gpte_is_8_bytes != !!is_pae(vcpu)) ||
2229 vcpu->arch.mmu->sync_page(vcpu, sp) == 0) {
2230 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
2231 return false;
2232 }
2233
2234 return true;
2235}
2236
2237static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm,
2238 struct list_head *invalid_list,
2239 bool remote_flush)
2240{
2241 if (!remote_flush && list_empty(invalid_list))
2242 return false;
2243
2244 if (!list_empty(invalid_list))
2245 kvm_mmu_commit_zap_page(kvm, invalid_list);
2246 else
2247 kvm_flush_remote_tlbs(kvm);
2248 return true;
2249}
2250
2251static void kvm_mmu_flush_or_zap(struct kvm_vcpu *vcpu,
2252 struct list_head *invalid_list,
2253 bool remote_flush, bool local_flush)
2254{
2255 if (kvm_mmu_remote_flush_or_zap(vcpu->kvm, invalid_list, remote_flush))
2256 return;
2257
2258 if (local_flush)
2259 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
2260}
2261
2262#ifdef CONFIG_KVM_MMU_AUDIT
2263#include "mmu_audit.c"
2264#else
2265static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { }
2266static void mmu_audit_disable(void) { }
2267#endif
2268
2269static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
2270 struct list_head *invalid_list)
2271{
2272 kvm_unlink_unsync_page(vcpu->kvm, sp);
2273 return __kvm_sync_page(vcpu, sp, invalid_list);
2274}
2275
/* @gfn should be write-protected at the call site. */
2277static bool kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn,
2278 struct list_head *invalid_list)
2279{
2280 struct kvm_mmu_page *s;
2281 bool ret = false;
2282
2283 for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) {
2284 if (!s->unsync)
2285 continue;
2286
2287 WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
2288 ret |= kvm_sync_page(vcpu, s, invalid_list);
2289 }
2290
2291 return ret;
2292}
2293
2294struct mmu_page_path {
2295 struct kvm_mmu_page *parent[PT64_ROOT_MAX_LEVEL];
2296 unsigned int idx[PT64_ROOT_MAX_LEVEL];
2297};
2298
2299#define for_each_sp(pvec, sp, parents, i) \
2300 for (i = mmu_pages_first(&pvec, &parents); \
2301 i < pvec.nr && ({ sp = pvec.page[i].sp; 1;}); \
2302 i = mmu_pages_next(&pvec, &parents, i))
2303
2304static int mmu_pages_next(struct kvm_mmu_pages *pvec,
2305 struct mmu_page_path *parents,
2306 int i)
2307{
2308 int n;
2309
2310 for (n = i+1; n < pvec->nr; n++) {
2311 struct kvm_mmu_page *sp = pvec->page[n].sp;
2312 unsigned idx = pvec->page[n].idx;
2313 int level = sp->role.level;
2314
2315 parents->idx[level-1] = idx;
2316 if (level == PT_PAGE_TABLE_LEVEL)
2317 break;
2318
2319 parents->parent[level-2] = sp;
2320 }
2321
2322 return n;
2323}
2324
2325static int mmu_pages_first(struct kvm_mmu_pages *pvec,
2326 struct mmu_page_path *parents)
2327{
2328 struct kvm_mmu_page *sp;
2329 int level;
2330
2331 if (pvec->nr == 0)
2332 return 0;
2333
2334 WARN_ON(pvec->page[0].idx != INVALID_INDEX);
2335
2336 sp = pvec->page[0].sp;
2337 level = sp->role.level;
2338 WARN_ON(level == PT_PAGE_TABLE_LEVEL);
2339
2340 parents->parent[level-2] = sp;
2341
 /*
  * Also set up a sentinel: further entries in pvec are all children of
  * sp, so this element is never overwritten.
  */
2345 parents->parent[level-1] = NULL;
2346 return mmu_pages_next(pvec, parents, 0);
2347}
2348
2349static void mmu_pages_clear_parents(struct mmu_page_path *parents)
2350{
2351 struct kvm_mmu_page *sp;
2352 unsigned int level = 0;
2353
2354 do {
2355 unsigned int idx = parents->idx[level];
2356 sp = parents->parent[level];
2357 if (!sp)
2358 return;
2359
2360 WARN_ON(idx == INVALID_INDEX);
2361 clear_unsync_child_bit(sp, idx);
2362 level++;
2363 } while (!sp->unsync_children);
2364}
2365
2366static void mmu_sync_children(struct kvm_vcpu *vcpu,
2367 struct kvm_mmu_page *parent)
2368{
2369 int i;
2370 struct kvm_mmu_page *sp;
2371 struct mmu_page_path parents;
2372 struct kvm_mmu_pages pages;
2373 LIST_HEAD(invalid_list);
2374 bool flush = false;
2375
2376 while (mmu_unsync_walk(parent, &pages)) {
2377 bool protected = false;
2378
2379 for_each_sp(pages, sp, parents, i)
2380 protected |= rmap_write_protect(vcpu, sp->gfn);
2381
2382 if (protected) {
2383 kvm_flush_remote_tlbs(vcpu->kvm);
2384 flush = false;
2385 }
2386
2387 for_each_sp(pages, sp, parents, i) {
2388 flush |= kvm_sync_page(vcpu, sp, &invalid_list);
2389 mmu_pages_clear_parents(&parents);
2390 }
2391 if (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock)) {
2392 kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
2393 cond_resched_lock(&vcpu->kvm->mmu_lock);
2394 flush = false;
2395 }
2396 }
2397
2398 kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
2399}
2400
2401static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
2402{
2403 atomic_set(&sp->write_flooding_count, 0);
2404}
2405
2406static void clear_sp_write_flooding_count(u64 *spte)
2407{
2408 struct kvm_mmu_page *sp = page_header(__pa(spte));
2409
2410 __clear_sp_write_flooding_count(sp);
2411}
2412
2413static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
2414 gfn_t gfn,
2415 gva_t gaddr,
2416 unsigned level,
2417 int direct,
2418 unsigned access)
2419{
2420 union kvm_mmu_page_role role;
2421 unsigned quadrant;
2422 struct kvm_mmu_page *sp;
2423 bool need_sync = false;
2424 bool flush = false;
2425 int collisions = 0;
2426 LIST_HEAD(invalid_list);
2427
2428 role = vcpu->arch.mmu->mmu_role.base;
2429 role.level = level;
2430 role.direct = direct;
2431 if (role.direct)
2432 role.gpte_is_8_bytes = true;
2433 role.access = access;
2434 if (!vcpu->arch.mmu->direct_map
2435 && vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL) {
2436 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
2437 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
2438 role.quadrant = quadrant;
2439 }
2440 for_each_valid_sp(vcpu->kvm, sp, gfn) {
2441 if (sp->gfn != gfn) {
2442 collisions++;
2443 continue;
2444 }
2445
2446 if (!need_sync && sp->unsync)
2447 need_sync = true;
2448
2449 if (sp->role.word != role.word)
2450 continue;
2451
2452 if (sp->unsync) {
2453
2454
2455
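			/*
			 * The page is good, but __kvm_sync_page might still
			 * end up zapping it.  If so, break in order to
			 * rebuild it.
			 */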
2456 if (!__kvm_sync_page(vcpu, sp, &invalid_list))
2457 break;
2458
2459 WARN_ON(!list_empty(&invalid_list));
2460 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
2461 }
2462
2463 if (sp->unsync_children)
2464 kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
2465
2466 __clear_sp_write_flooding_count(sp);
2467 trace_kvm_mmu_get_page(sp, false);
2468 goto out;
2469 }
2470
2471 ++vcpu->kvm->stat.mmu_cache_miss;
2472
2473 sp = kvm_mmu_alloc_page(vcpu, direct);
2474
2475 sp->gfn = gfn;
2476 sp->role = role;
2477 hlist_add_head(&sp->hash_link,
2478 &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
2479 if (!direct) {
2480
2481
2482
2483
2484
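		/*
		 * Write protection must be done before syncing pages,
		 * otherwise the content of a synced shadow page could be
		 * inconsistent with the guest page table.
		 */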
2485 account_shadowed(vcpu->kvm, sp);
2486 if (level == PT_PAGE_TABLE_LEVEL &&
2487 rmap_write_protect(vcpu, gfn))
2488 kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1);
2489
2490 if (level > PT_PAGE_TABLE_LEVEL && need_sync)
2491 flush |= kvm_sync_pages(vcpu, gfn, &invalid_list);
2492 }
2493 clear_page(sp->spt);
2494 trace_kvm_mmu_get_page(sp, true);
2495
2496 kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
2497out:
2498 if (collisions > vcpu->kvm->stat.max_mmu_page_hash_collisions)
2499 vcpu->kvm->stat.max_mmu_page_hash_collisions = collisions;
2500 return sp;
2501}
2502
2503static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator,
2504 struct kvm_vcpu *vcpu, hpa_t root,
2505 u64 addr)
2506{
2507 iterator->addr = addr;
2508 iterator->shadow_addr = root;
2509 iterator->level = vcpu->arch.mmu->shadow_root_level;
2510
2511 if (iterator->level == PT64_ROOT_4LEVEL &&
2512 vcpu->arch.mmu->root_level < PT64_ROOT_4LEVEL &&
2513 !vcpu->arch.mmu->direct_map)
2514 --iterator->level;
2515
2516 if (iterator->level == PT32E_ROOT_LEVEL) {
2517
2518
2519
2520
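		/*
		 * prev_roots is only used for 64-bit roots, so a PAE walk can
		 * only start from the vCPU's currently active root.
		 */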
2521 BUG_ON(root != vcpu->arch.mmu->root_hpa);
2522
2523 iterator->shadow_addr
2524 = vcpu->arch.mmu->pae_root[(addr >> 30) & 3];
2525 iterator->shadow_addr &= PT64_BASE_ADDR_MASK;
2526 --iterator->level;
2527 if (!iterator->shadow_addr)
2528 iterator->level = 0;
2529 }
2530}
2531
2532static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
2533 struct kvm_vcpu *vcpu, u64 addr)
2534{
2535 shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu->root_hpa,
2536 addr);
2537}
2538
2539static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
2540{
2541 if (iterator->level < PT_PAGE_TABLE_LEVEL)
2542 return false;
2543
2544 iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
2545 iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
2546 return true;
2547}
2548
2549static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
2550 u64 spte)
2551{
2552 if (is_last_spte(spte, iterator->level)) {
2553 iterator->level = 0;
2554 return;
2555 }
2556
2557 iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK;
2558 --iterator->level;
2559}
2560
2561static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
2562{
2563 __shadow_walk_next(iterator, *iterator->sptep);
2564}
2565
2566static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
2567 struct kvm_mmu_page *sp)
2568{
2569 u64 spte;
2570
2571 BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
2572
2573 spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK |
2574 shadow_user_mask | shadow_x_mask | shadow_me_mask;
2575
2576 if (sp_ad_disabled(sp))
2577 spte |= shadow_acc_track_value;
2578 else
2579 spte |= shadow_accessed_mask;
2580
2581 mmu_spte_set(sptep, spte);
2582
2583 mmu_page_add_parent_pte(vcpu, sp, sptep);
2584
2585 if (sp->unsync_children || sp->unsync)
2586 mark_unsync(sptep);
2587}
2588
2589static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2590 unsigned direct_access)
2591{
2592 if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
2593 struct kvm_mmu_page *child;
2594
2595
2596
2597
2598
2599
2600
2601
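		/*
		 * For a direct sp, if the guest pte's access changed (e.g. a
		 * dirty bit flip now allows write access), the child no
		 * longer matches; drop it so a new sp with the correct
		 * access can be linked in.
		 */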
2602 child = page_header(*sptep & PT64_BASE_ADDR_MASK);
2603 if (child->role.access == direct_access)
2604 return;
2605
2606 drop_parent_pte(child, sptep);
2607 kvm_flush_remote_tlbs_with_address(vcpu->kvm, child->gfn, 1);
2608 }
2609}
2610
2611static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
2612 u64 *spte)
2613{
2614 u64 pte;
2615 struct kvm_mmu_page *child;
2616
2617 pte = *spte;
2618 if (is_shadow_present_pte(pte)) {
2619 if (is_last_spte(pte, sp->role.level)) {
2620 drop_spte(kvm, spte);
2621 if (is_large_pte(pte))
2622 --kvm->stat.lpages;
2623 } else {
2624 child = page_header(pte & PT64_BASE_ADDR_MASK);
2625 drop_parent_pte(child, spte);
2626 }
2627 return true;
2628 }
2629
2630 if (is_mmio_spte(pte))
2631 mmu_spte_clear_no_track(spte);
2632
2633 return false;
2634}
2635
2636static void kvm_mmu_page_unlink_children(struct kvm *kvm,
2637 struct kvm_mmu_page *sp)
2638{
2639 unsigned i;
2640
2641 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
2642 mmu_page_zap_pte(kvm, sp, sp->spt + i);
2643}
2644
2645static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
2646{
2647 u64 *sptep;
2648 struct rmap_iterator iter;
2649
2650 while ((sptep = rmap_get_first(&sp->parent_ptes, &iter)))
2651 drop_parent_pte(sp, sptep);
2652}
2653
2654static int mmu_zap_unsync_children(struct kvm *kvm,
2655 struct kvm_mmu_page *parent,
2656 struct list_head *invalid_list)
2657{
2658 int i, zapped = 0;
2659 struct mmu_page_path parents;
2660 struct kvm_mmu_pages pages;
2661
2662 if (parent->role.level == PT_PAGE_TABLE_LEVEL)
2663 return 0;
2664
2665 while (mmu_unsync_walk(parent, &pages)) {
2666 struct kvm_mmu_page *sp;
2667
2668 for_each_sp(pages, sp, parents, i) {
2669 kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
2670 mmu_pages_clear_parents(&parents);
2671 zapped++;
2672 }
2673 }
2674
2675 return zapped;
2676}
2677
2678static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
2679 struct kvm_mmu_page *sp,
2680 struct list_head *invalid_list,
2681 int *nr_zapped)
2682{
2683 bool list_unstable;
2684
2685 trace_kvm_mmu_prepare_zap_page(sp);
2686 ++kvm->stat.mmu_shadow_zapped;
2687 *nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list);
2688 kvm_mmu_page_unlink_children(kvm, sp);
2689 kvm_mmu_unlink_parents(kvm, sp);
2690
2691
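	/* Zapping children means active_mmu_pages has become unstable. */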
2692 list_unstable = *nr_zapped;
2693
2694 if (!sp->role.invalid && !sp->role.direct)
2695 unaccount_shadowed(kvm, sp);
2696
2697 if (sp->unsync)
2698 kvm_unlink_unsync_page(kvm, sp);
2699 if (!sp->root_count) {
2700
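		/* Count self */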
2701 (*nr_zapped)++;
2702 list_move(&sp->link, invalid_list);
2703 kvm_mod_used_mmu_pages(kvm, -1);
2704 } else {
2705 list_move(&sp->link, &kvm->arch.active_mmu_pages);
2706
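		/*
		 * The page is still in use as a root and cannot be freed yet;
		 * force remote vCPUs to reload their MMU so they stop using
		 * the invalidated root.
		 */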
2707 if (!sp->role.invalid)
2708 kvm_reload_remote_mmus(kvm);
2709 }
2710
2711 sp->role.invalid = 1;
2712 return list_unstable;
2713}
2714
2715static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
2716 struct list_head *invalid_list)
2717{
2718 int nr_zapped;
2719
2720 __kvm_mmu_prepare_zap_page(kvm, sp, invalid_list, &nr_zapped);
2721 return nr_zapped;
2722}
2723
2724static void kvm_mmu_commit_zap_page(struct kvm *kvm,
2725 struct list_head *invalid_list)
2726{
2727 struct kvm_mmu_page *sp, *nsp;
2728
2729 if (list_empty(invalid_list))
2730 return;
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
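	/*
	 * Make sure every vCPU observes our page-table modifications and has
	 * left guest mode and/or any lockless shadow page table walk before
	 * the pages below are freed.  The barrier and remote-wait in
	 * kvm_flush_remote_tlbs() provide that guarantee.
	 */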
2741 kvm_flush_remote_tlbs(kvm);
2742
2743 list_for_each_entry_safe(sp, nsp, invalid_list, link) {
2744 WARN_ON(!sp->role.invalid || sp->root_count);
2745 kvm_mmu_free_page(sp);
2746 }
2747}
2748
2749static bool prepare_zap_oldest_mmu_page(struct kvm *kvm,
2750 struct list_head *invalid_list)
2751{
2752 struct kvm_mmu_page *sp;
2753
2754 if (list_empty(&kvm->arch.active_mmu_pages))
2755 return false;
2756
2757 sp = list_last_entry(&kvm->arch.active_mmu_pages,
2758 struct kvm_mmu_page, link);
2759 return kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
2760}
2761
2762
2763
2764
2765
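/*
 * Change the number of MMU pages the VM is allowed to use.  If the VM is
 * already above the new limit, the oldest shadow pages are zapped first.
 */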
2766void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages)
2767{
2768 LIST_HEAD(invalid_list);
2769
2770 spin_lock(&kvm->mmu_lock);
2771
2772 if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
2773
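		/* Need to free some mmu pages to achieve the goal. */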
2774 while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages)
2775 if (!prepare_zap_oldest_mmu_page(kvm, &invalid_list))
2776 break;
2777
2778 kvm_mmu_commit_zap_page(kvm, &invalid_list);
2779 goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
2780 }
2781
2782 kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
2783
2784 spin_unlock(&kvm->mmu_lock);
2785}
2786
2787int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
2788{
2789 struct kvm_mmu_page *sp;
2790 LIST_HEAD(invalid_list);
2791 int r;
2792
2793 pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
2794 r = 0;
2795 spin_lock(&kvm->mmu_lock);
2796 for_each_gfn_indirect_valid_sp(kvm, sp, gfn) {
2797 pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
2798 sp->role.word);
2799 r = 1;
2800 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
2801 }
2802 kvm_mmu_commit_zap_page(kvm, &invalid_list);
2803 spin_unlock(&kvm->mmu_lock);
2804
2805 return r;
2806}
2807EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
2808
2809static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
2810{
2811 trace_kvm_mmu_unsync_page(sp);
2812 ++vcpu->kvm->stat.mmu_unsync;
2813 sp->unsync = 1;
2814
2815 kvm_mmu_mark_parents_unsync(sp);
2816}
2817
2818static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
2819 bool can_unsync)
2820{
2821 struct kvm_mmu_page *sp;
2822
2823 if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
2824 return true;
2825
2826 for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
2827 if (!can_unsync)
2828 return true;
2829
2830 if (sp->unsync)
2831 continue;
2832
2833 WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL);
2834 kvm_unsync_page(vcpu, sp);
2835 }
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
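	/*
	 * Marking the page unsync must be visible before any of its SPTEs is
	 * made writable: kvm_mmu_sync_roots() reads sp->unsync without
	 * holding mmu_lock, so if a writable SPTE were observed before the
	 * unsync flag, a guest TLB flush could skip the sync and leave a
	 * stale mapping in place.  The write barrier below enforces that
	 * ordering.
	 */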
2874 smp_wmb();
2875
2876 return false;
2877}
2878
2879static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
2880{
2881 if (pfn_valid(pfn))
2882 return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)) &&
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
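			/*
			 * Some reserved pages, e.g. those backing NVDIMM/DAX
			 * devices, are not MMIO and can be mapped with a
			 * cached memory type.  Check the host memory type as
			 * well and only treat UC/UC-/WC pages as MMIO.
			 */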
2893 (!pat_enabled() || pat_pfn_immune_to_uc_mtrr(pfn));
2894
2895 return true;
2896}
2897
2898
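/* Bits which may be returned by set_spte() */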
2899#define SET_SPTE_WRITE_PROTECTED_PT BIT(0)
2900#define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1)
2901
2902static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2903 unsigned pte_access, int level,
2904 gfn_t gfn, kvm_pfn_t pfn, bool speculative,
2905 bool can_unsync, bool host_writable)
2906{
2907 u64 spte = 0;
2908 int ret = 0;
2909 struct kvm_mmu_page *sp;
2910
2911 if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access))
2912 return 0;
2913
2914 sp = page_header(__pa(sptep));
2915 if (sp_ad_disabled(sp))
2916 spte |= shadow_acc_track_value;
2917
2918
2919
2920
2921
2922
2923
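	/*
	 * For the EPT case, shadow_present_mask is 0 if hardware
	 * supports exec-only page table entries.  In that case,
	 * ACC_USER_MASK and shadow_user_mask are used to represent
	 * read access.  See FNAME(gpte_access) in paging_tmpl.h.
	 */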
2924 spte |= shadow_present_mask;
2925 if (!speculative)
2926 spte |= spte_shadow_accessed_mask(spte);
2927
2928 if (pte_access & ACC_EXEC_MASK)
2929 spte |= shadow_x_mask;
2930 else
2931 spte |= shadow_nx_mask;
2932
2933 if (pte_access & ACC_USER_MASK)
2934 spte |= shadow_user_mask;
2935
2936 if (level > PT_PAGE_TABLE_LEVEL)
2937 spte |= PT_PAGE_SIZE_MASK;
2938 if (tdp_enabled)
2939 spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
2940 kvm_is_mmio_pfn(pfn));
2941
2942 if (host_writable)
2943 spte |= SPTE_HOST_WRITEABLE;
2944 else
2945 pte_access &= ~ACC_WRITE_MASK;
2946
2947 if (!kvm_is_mmio_pfn(pfn))
2948 spte |= shadow_me_mask;
2949
2950 spte |= (u64)pfn << PAGE_SHIFT;
2951
2952 if (pte_access & ACC_WRITE_MASK) {
2953
2954
2955
2956
2957
2958
2959
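		/*
		 * A large page may have become disallowed for this gfn in
		 * the window between mapping_level() and acquiring mmu_lock.
		 * Skip installing the spte; the guest will refault and the
		 * mapping can be fixed then.
		 */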
2960 if (level > PT_PAGE_TABLE_LEVEL &&
2961 mmu_gfn_lpage_is_disallowed(vcpu, gfn, level))
2962 goto done;
2963
2964 spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
2965
2966
2967
2968
2969
2970
2971
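		/*
		 * Optimization: for pte sync, if the spte was already
		 * writable the hash lookup in mmu_need_write_protect() is
		 * unnecessary (and expensive).  Write protection is the
		 * responsibility of kvm_mmu_get_page() / kvm_sync_page().
		 */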
2972 if (!can_unsync && is_writable_pte(*sptep))
2973 goto set_pte;
2974
2975 if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
2976 pgprintk("%s: found shadow page for %llx, marking ro\n",
2977 __func__, gfn);
2978 ret |= SET_SPTE_WRITE_PROTECTED_PT;
2979 pte_access &= ~ACC_WRITE_MASK;
2980 spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
2981 }
2982 }
2983
2984 if (pte_access & ACC_WRITE_MASK) {
2985 kvm_vcpu_mark_page_dirty(vcpu, gfn);
2986 spte |= spte_shadow_dirty_mask(spte);
2987 }
2988
2989 if (speculative)
2990 spte = mark_spte_for_access_track(spte);
2991
2992set_pte:
2993 if (mmu_spte_update(sptep, spte))
2994 ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH;
2995done:
2996 return ret;
2997}
2998
2999static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
3000 int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn,
3001 bool speculative, bool host_writable)
3002{
3003 int was_rmapped = 0;
3004 int rmap_count;
3005 int set_spte_ret;
3006 int ret = RET_PF_RETRY;
3007 bool flush = false;
3008
3009 pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
3010 *sptep, write_fault, gfn);
3011
3012 if (is_shadow_present_pte(*sptep)) {
3013
3014
3015
3016
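		/*
		 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
		 * the parent of the now unreachable child page table.
		 */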
3017 if (level > PT_PAGE_TABLE_LEVEL &&
3018 !is_large_pte(*sptep)) {
3019 struct kvm_mmu_page *child;
3020 u64 pte = *sptep;
3021
3022 child = page_header(pte & PT64_BASE_ADDR_MASK);
3023 drop_parent_pte(child, sptep);
3024 flush = true;
3025 } else if (pfn != spte_to_pfn(*sptep)) {
3026 pgprintk("hfn old %llx new %llx\n",
3027 spte_to_pfn(*sptep), pfn);
3028 drop_spte(vcpu->kvm, sptep);
3029 flush = true;
3030 } else
3031 was_rmapped = 1;
3032 }
3033
3034 set_spte_ret = set_spte(vcpu, sptep, pte_access, level, gfn, pfn,
3035 speculative, true, host_writable);
3036 if (set_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
3037 if (write_fault)
3038 ret = RET_PF_EMULATE;
3039 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
3040 }
3041
3042 if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH || flush)
3043 kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn,
3044 KVM_PAGES_PER_HPAGE(level));
3045
3046 if (unlikely(is_mmio_spte(*sptep)))
3047 ret = RET_PF_EMULATE;
3048
3049 pgprintk("%s: setting spte %llx\n", __func__, *sptep);
3050 pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
3051 is_large_pte(*sptep)? "2MB" : "4kB",
3052 *sptep & PT_WRITABLE_MASK ? "RW" : "R", gfn,
3053 *sptep, sptep);
3054 if (!was_rmapped && is_large_pte(*sptep))
3055 ++vcpu->kvm->stat.lpages;
3056
3057 if (is_shadow_present_pte(*sptep)) {
3058 if (!was_rmapped) {
3059 rmap_count = rmap_add(vcpu, sptep, gfn);
3060 if (rmap_count > RMAP_RECYCLE_THRESHOLD)
3061 rmap_recycle(vcpu, sptep, gfn);
3062 }
3063 }
3064
3065 kvm_release_pfn_clean(pfn);
3066
3067 return ret;
3068}
3069
3070static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
3071 bool no_dirty_log)
3072{
3073 struct kvm_memory_slot *slot;
3074
3075 slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
3076 if (!slot)
3077 return KVM_PFN_ERR_FAULT;
3078
3079 return gfn_to_pfn_memslot_atomic(slot, gfn);
3080}
3081
3082static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
3083 struct kvm_mmu_page *sp,
3084 u64 *start, u64 *end)
3085{
3086 struct page *pages[PTE_PREFETCH_NUM];
3087 struct kvm_memory_slot *slot;
3088 unsigned access = sp->role.access;
3089 int i, ret;
3090 gfn_t gfn;
3091
3092 gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
3093 slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK);
3094 if (!slot)
3095 return -1;
3096
3097 ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start);
3098 if (ret <= 0)
3099 return -1;
3100
3101 for (i = 0; i < ret; i++, gfn++, start++)
3102 mmu_set_spte(vcpu, start, access, 0, sp->role.level, gfn,
3103 page_to_pfn(pages[i]), true, true);
3104
3105 return 0;
3106}
3107
3108static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
3109 struct kvm_mmu_page *sp, u64 *sptep)
3110{
3111 u64 *spte, *start = NULL;
3112 int i;
3113
3114 WARN_ON(!sp->role.direct);
3115
3116 i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
3117 spte = sp->spt + i;
3118
3119 for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
3120 if (is_shadow_present_pte(*spte) || spte == sptep) {
3121 if (!start)
3122 continue;
3123 if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
3124 break;
3125 start = NULL;
3126 } else if (!start)
3127 start = spte;
3128 }
3129}
3130
3131static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
3132{
3133 struct kvm_mmu_page *sp;
3134
3135 sp = page_header(__pa(sptep));
3136
3137
3138
3139
3140
3141
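	/*
	 * Without accessed bits, there's no way to distinguish between
	 * actually accessed translations and prefetched ones, so disable
	 * pte prefetch if accessed bits aren't available.
	 */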
3142 if (sp_ad_disabled(sp))
3143 return;
3144
3145 if (sp->role.level > PT_PAGE_TABLE_LEVEL)
3146 return;
3147
3148 __direct_pte_prefetch(vcpu, sp, sptep);
3149}
3150
3151static int __direct_map(struct kvm_vcpu *vcpu, int write, int map_writable,
3152 int level, gfn_t gfn, kvm_pfn_t pfn, bool prefault)
3153{
3154 struct kvm_shadow_walk_iterator iterator;
3155 struct kvm_mmu_page *sp;
3156 int emulate = 0;
3157 gfn_t pseudo_gfn;
3158
3159 if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
3160 return 0;
3161
3162 for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
3163 if (iterator.level == level) {
3164 emulate = mmu_set_spte(vcpu, iterator.sptep, ACC_ALL,
3165 write, level, gfn, pfn, prefault,
3166 map_writable);
3167 direct_pte_prefetch(vcpu, iterator.sptep);
3168 ++vcpu->stat.pf_fixed;
3169 break;
3170 }
3171
3172 drop_large_spte(vcpu, iterator.sptep);
3173 if (!is_shadow_present_pte(*iterator.sptep)) {
3174 u64 base_addr = iterator.addr;
3175
3176 base_addr &= PT64_LVL_ADDR_MASK(iterator.level);
3177 pseudo_gfn = base_addr >> PAGE_SHIFT;
3178 sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,
3179 iterator.level - 1, 1, ACC_ALL);
3180
3181 link_shadow_page(vcpu, iterator.sptep, sp);
3182 }
3183 }
3184 return emulate;
3185}
3186
3187static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
3188{
3189 send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, PAGE_SHIFT, tsk);
3190}
3191
3192static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
3193{
3194
3195
3196
3197
3198
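	/*
	 * Do not cache the mmio info caused by writing a readonly gfn into
	 * the spte, otherwise read access on a readonly gfn would also be
	 * treated as an mmio access.
	 */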
3199 if (pfn == KVM_PFN_ERR_RO_FAULT)
3200 return RET_PF_EMULATE;
3201
3202 if (pfn == KVM_PFN_ERR_HWPOISON) {
3203 kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current);
3204 return RET_PF_RETRY;
3205 }
3206
3207 return -EFAULT;
3208}
3209
3210static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
3211 gfn_t *gfnp, kvm_pfn_t *pfnp,
3212 int *levelp)
3213{
3214 kvm_pfn_t pfn = *pfnp;
3215 gfn_t gfn = *gfnp;
3216 int level = *levelp;
3217
3218
3219
3220
3221
3222
3223
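	/*
	 * Check if it's a transparent hugepage.  If this were a hugetlbfs
	 * page, level would not have been forced to PT_PAGE_TABLE_LEVEL and
	 * no adjustment would be needed here.
	 */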
3224 if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
3225 level == PT_PAGE_TABLE_LEVEL &&
3226 PageTransCompoundMap(pfn_to_page(pfn)) &&
3227 !mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) {
3228 unsigned long mask;
3229
3230
3231
3232
3233
3234
3235
3236
3237
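		/*
		 * mmu_lock is held and mmu_notifier_retry() succeeded, so the
		 * compound page cannot be split under us.  Map the whole 2MB
		 * region: align gfn and pfn down to the huge-page boundary
		 * and move the pfn reference to the head page.
		 */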
3238 *levelp = level = PT_DIRECTORY_LEVEL;
3239 mask = KVM_PAGES_PER_HPAGE(level) - 1;
3240 VM_BUG_ON((gfn & mask) != (pfn & mask));
3241 if (pfn & mask) {
3242 gfn &= ~mask;
3243 *gfnp = gfn;
3244 kvm_release_pfn_clean(pfn);
3245 pfn &= ~mask;
3246 kvm_get_pfn(pfn);
3247 *pfnp = pfn;
3248 }
3249 }
3250}
3251
3252static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
3253 kvm_pfn_t pfn, unsigned access, int *ret_val)
3254{
3255
3256 if (unlikely(is_error_pfn(pfn))) {
3257 *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn);
3258 return true;
3259 }
3260
3261 if (unlikely(is_noslot_pfn(pfn)))
3262 vcpu_cache_mmio_info(vcpu, gva, gfn, access);
3263
3264 return false;
3265}
3266
3267static bool page_fault_can_be_fast(u32 error_code)
3268{
3269
3270
3271
3272
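	/*
	 * Do not fix an mmio spte with an invalid generation number; it
	 * needs to be updated by the slow page fault path.
	 */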
3273 if (unlikely(error_code & PFERR_RSVD_MASK))
3274 return false;
3275
3276
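	/* See if the page fault is due to an NX violation */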
3277 if (unlikely(((error_code & (PFERR_FETCH_MASK | PFERR_PRESENT_MASK))
3278 == (PFERR_FETCH_MASK | PFERR_PRESENT_MASK))))
3279 return false;
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
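	/*
	 * #PF can be fast if:
	 * 1. The shadow page table entry is not present, which could mean
	 *    the fault is caused by access tracking (if enabled); or
	 * 2. The entry is present and the fault is a write-protection fault,
	 *    in which case only the W bit needs to change, which can be done
	 *    out of mmu_lock.
	 *
	 * If access tracking is disabled, a non-present entry is a genuine
	 * fault that needs a new SPTE, so only write faults on present
	 * entries qualify in that case.
	 */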
3295 return shadow_acc_track_mask != 0 ||
3296 ((error_code & (PFERR_WRITE_MASK | PFERR_PRESENT_MASK))
3297 == (PFERR_WRITE_MASK | PFERR_PRESENT_MASK));
3298}
3299
3300
3301
3302
3303
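/*
 * Returns true if the SPTE was fixed successfully.  Otherwise, someone else
 * modified the SPTE from its original value.
 */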
3304static bool
3305fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
3306 u64 *sptep, u64 old_spte, u64 new_spte)
3307{
3308 gfn_t gfn;
3309
3310 WARN_ON(!sp->role.direct);
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
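	/*
	 * Update the SPTE atomically so the fix only takes effect if the
	 * SPTE still holds the value observed during the lockless walk.
	 */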
3324 if (cmpxchg64(sptep, old_spte, new_spte) != old_spte)
3325 return false;
3326
3327 if (is_writable_pte(new_spte) && !is_writable_pte(old_spte)) {
3328
3329
3330
3331
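		/*
		 * The gfn of a direct spte is stable since it is computed
		 * from sp->gfn, so the page can be marked dirty here.
		 */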
3332 gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
3333 kvm_vcpu_mark_page_dirty(vcpu, gfn);
3334 }
3335
3336 return true;
3337}
3338
3339static bool is_access_allowed(u32 fault_err_code, u64 spte)
3340{
3341 if (fault_err_code & PFERR_FETCH_MASK)
3342 return is_executable_pte(spte);
3343
3344 if (fault_err_code & PFERR_WRITE_MASK)
3345 return is_writable_pte(spte);
3346
3347
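	/* Fault was on Read access */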
3348 return spte & PT_PRESENT_MASK;
3349}
3350
3351
3352
3353
3354
3355
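/*
 * Return value:
 * - true: let the vcpu access the same address again.
 * - false: let the real page fault path fix it.
 */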
3356static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
3357 u32 error_code)
3358{
3359 struct kvm_shadow_walk_iterator iterator;
3360 struct kvm_mmu_page *sp;
3361 bool fault_handled = false;
3362 u64 spte = 0ull;
3363 uint retry_count = 0;
3364
3365 if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
3366 return false;
3367
3368 if (!page_fault_can_be_fast(error_code))
3369 return false;
3370
3371 walk_shadow_page_lockless_begin(vcpu);
3372
3373 do {
3374 u64 new_spte;
3375
3376 for_each_shadow_entry_lockless(vcpu, gva, iterator, spte)
3377 if (!is_shadow_present_pte(spte) ||
3378 iterator.level < level)
3379 break;
3380
3381 sp = page_header(__pa(iterator.sptep));
3382 if (!is_last_spte(spte, sp->role.level))
3383 break;
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
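		/*
		 * Check whether the access that caused the fault would still
		 * fault if performed right now.  If not, this is a spurious
		 * fault: the TLB was flushed lazily or another CPU already
		 * fixed the PTE.  Upper-level entries need not be checked
		 * since they are always ACC_ALL.
		 */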
3395 if (is_access_allowed(error_code, spte)) {
3396 fault_handled = true;
3397 break;
3398 }
3399
3400 new_spte = spte;
3401
3402 if (is_access_track_spte(spte))
3403 new_spte = restore_acc_track_spte(new_spte);
3404
3405
3406
3407
3408
3409
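		/*
		 * Currently, to simplify the code, write-protection can be
		 * removed in the fast path only if the SPTE was
		 * write-protected for dirty-logging or access tracking.
		 */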
3410 if ((error_code & PFERR_WRITE_MASK) &&
3411 spte_can_locklessly_be_made_writable(spte))
3412 {
3413 new_spte |= PT_WRITABLE_MASK;
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
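			/*
			 * Do not fix write permission on a large spte: only
			 * the first page would be marked dirty in
			 * fast_pf_fix_direct_spte(), so pages would be missed
			 * if the slot has dirty logging enabled.  Let the
			 * slow path create a normal spte instead.
			 */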
3426 if (sp->role.level > PT_PAGE_TABLE_LEVEL)
3427 break;
3428 }
3429
3430
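		/* Verify that the fault can be handled in the fast path */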
3431 if (new_spte == spte ||
3432 !is_access_allowed(error_code, new_spte))
3433 break;
3434
3435
3436
3437
3438
3439
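		/*
		 * Currently, fast page fault only works for direct mappings,
		 * since the gfn is not stable for indirect shadow pages.  See
		 * Documentation/virtual/kvm/locking.txt for details.
		 */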
3440 fault_handled = fast_pf_fix_direct_spte(vcpu, sp,
3441 iterator.sptep, spte,
3442 new_spte);
3443 if (fault_handled)
3444 break;
3445
3446 if (++retry_count > 4) {
3447 printk_once(KERN_WARNING
3448 "kvm: Fast #PF retrying more than 4 times.\n");
3449 break;
3450 }
3451
3452 } while (true);
3453
3454 trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep,
3455 spte, fault_handled);
3456 walk_shadow_page_lockless_end(vcpu);
3457
3458 return fault_handled;
3459}
3460
3461static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
3462 gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable);
3463static int make_mmu_pages_available(struct kvm_vcpu *vcpu);
3464
3465static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
3466 gfn_t gfn, bool prefault)
3467{
3468 int r;
3469 int level;
3470 bool force_pt_level = false;
3471 kvm_pfn_t pfn;
3472 unsigned long mmu_seq;
3473 bool map_writable, write = error_code & PFERR_WRITE_MASK;
3474
3475 level = mapping_level(vcpu, gfn, &force_pt_level);
3476 if (likely(!force_pt_level)) {
3477
3478
3479
3480
3481
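		/*
		 * This path builds a PAE pagetable, so we can map 2MB pages
		 * at maximum.  Therefore cap the level accordingly.
		 */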
3482 if (level > PT_DIRECTORY_LEVEL)
3483 level = PT_DIRECTORY_LEVEL;
3484
3485 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
3486 }
3487
3488 if (fast_page_fault(vcpu, v, level, error_code))
3489 return RET_PF_RETRY;
3490
3491 mmu_seq = vcpu->kvm->mmu_notifier_seq;
3492 smp_rmb();
3493
3494 if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
3495 return RET_PF_RETRY;
3496
3497 if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r))
3498 return r;
3499
3500 spin_lock(&vcpu->kvm->mmu_lock);
3501 if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
3502 goto out_unlock;
3503 if (make_mmu_pages_available(vcpu) < 0)
3504 goto out_unlock;
3505 if (likely(!force_pt_level))
3506 transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
3507 r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault);
3508 spin_unlock(&vcpu->kvm->mmu_lock);
3509
3510 return r;
3511
3512out_unlock:
3513 spin_unlock(&vcpu->kvm->mmu_lock);
3514 kvm_release_pfn_clean(pfn);
3515 return RET_PF_RETRY;
3516}
3517
3518static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
3519 struct list_head *invalid_list)
3520{
3521 struct kvm_mmu_page *sp;
3522
3523 if (!VALID_PAGE(*root_hpa))
3524 return;
3525
3526 sp = page_header(*root_hpa & PT64_BASE_ADDR_MASK);
3527 --sp->root_count;
3528 if (!sp->root_count && sp->role.invalid)
3529 kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
3530
3531 *root_hpa = INVALID_PAGE;
3532}
3533
3534
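/* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */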
3535void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
3536 ulong roots_to_free)
3537{
3538 int i;
3539 LIST_HEAD(invalid_list);
3540 bool free_active_root = roots_to_free & KVM_MMU_ROOT_CURRENT;
3541
3542 BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG);
3543
3544
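	/* Before acquiring the MMU lock, see if we need to do any real work. */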
3545 if (!(free_active_root && VALID_PAGE(mmu->root_hpa))) {
3546 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
3547 if ((roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) &&
3548 VALID_PAGE(mmu->prev_roots[i].hpa))
3549 break;
3550
3551 if (i == KVM_MMU_NUM_PREV_ROOTS)
3552 return;
3553 }
3554
3555 spin_lock(&vcpu->kvm->mmu_lock);
3556
3557 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
3558 if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i))
3559 mmu_free_root_page(vcpu->kvm, &mmu->prev_roots[i].hpa,
3560 &invalid_list);
3561
3562 if (free_active_root) {
3563 if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
3564 (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) {
3565 mmu_free_root_page(vcpu->kvm, &mmu->root_hpa,
3566 &invalid_list);
3567 } else {
3568 for (i = 0; i < 4; ++i)
3569 if (mmu->pae_root[i] != 0)
3570 mmu_free_root_page(vcpu->kvm,
3571 &mmu->pae_root[i],
3572 &invalid_list);
3573 mmu->root_hpa = INVALID_PAGE;
3574 }
3575 mmu->root_cr3 = 0;
3576 }
3577
3578 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
3579 spin_unlock(&vcpu->kvm->mmu_lock);
3580}
3581EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);
3582
3583static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
3584{
3585 int ret = 0;
3586
3587 if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
3588 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
3589 ret = 1;
3590 }
3591
3592 return ret;
3593}
3594
3595static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
3596{
3597 struct kvm_mmu_page *sp;
3598 unsigned i;
3599
3600 if (vcpu->arch.mmu->shadow_root_level >= PT64_ROOT_4LEVEL) {
3601 spin_lock(&vcpu->kvm->mmu_lock);
3602 if(make_mmu_pages_available(vcpu) < 0) {
3602 if (make_mmu_pages_available(vcpu) < 0) {
3603 spin_unlock(&vcpu->kvm->mmu_lock);
3604 return -ENOSPC;
3605 }
3606 sp = kvm_mmu_get_page(vcpu, 0, 0,
3607 vcpu->arch.mmu->shadow_root_level, 1, ACC_ALL);
3608 ++sp->root_count;
3609 spin_unlock(&vcpu->kvm->mmu_lock);
3610 vcpu->arch.mmu->root_hpa = __pa(sp->spt);
3611 } else if (vcpu->arch.mmu->shadow_root_level == PT32E_ROOT_LEVEL) {
3612 for (i = 0; i < 4; ++i) {
3613 hpa_t root = vcpu->arch.mmu->pae_root[i];
3614
3615 MMU_WARN_ON(VALID_PAGE(root));
3616 spin_lock(&vcpu->kvm->mmu_lock);
3617 if (make_mmu_pages_available(vcpu) < 0) {
3618 spin_unlock(&vcpu->kvm->mmu_lock);
3619 return -ENOSPC;
3620 }
3621 sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
3622 i << 30, PT32_ROOT_LEVEL, 1, ACC_ALL);
3623 root = __pa(sp->spt);
3624 ++sp->root_count;
3625 spin_unlock(&vcpu->kvm->mmu_lock);
3626 vcpu->arch.mmu->pae_root[i] = root | PT_PRESENT_MASK;
3627 }
3628 vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
3629 } else
3630 BUG();
3631 vcpu->arch.mmu->root_cr3 = vcpu->arch.mmu->get_cr3(vcpu);
3632
3633 return 0;
3634}
3635
3636static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
3637{
3638 struct kvm_mmu_page *sp;
3639 u64 pdptr, pm_mask;
3640 gfn_t root_gfn, root_cr3;
3641 int i;
3642
3643 root_cr3 = vcpu->arch.mmu->get_cr3(vcpu);
3644 root_gfn = root_cr3 >> PAGE_SHIFT;
3645
3646 if (mmu_check_root(vcpu, root_gfn))
3647 return 1;
3648
3649
3650
3651
3652
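	/*
	 * Do we shadow a long mode page table?  If so we need to
	 * write-protect the guest's page table root.
	 */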
3653 if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
3654 hpa_t root = vcpu->arch.mmu->root_hpa;
3655
3656 MMU_WARN_ON(VALID_PAGE(root));
3657
3658 spin_lock(&vcpu->kvm->mmu_lock);
3659 if (make_mmu_pages_available(vcpu) < 0) {
3660 spin_unlock(&vcpu->kvm->mmu_lock);
3661 return -ENOSPC;
3662 }
3663 sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
3664 vcpu->arch.mmu->shadow_root_level, 0, ACC_ALL);
3665 root = __pa(sp->spt);
3666 ++sp->root_count;
3667 spin_unlock(&vcpu->kvm->mmu_lock);
3668 vcpu->arch.mmu->root_hpa = root;
3669 goto set_root_cr3;
3670 }
3671
3672
3673
3674
3675
3676
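	/*
	 * We shadow a 32 bit page table.  This may be a legacy 2-level or a
	 * PAE 3-level page table.  In either case we need to be aware that
	 * the shadow page table may be a PAE or a long mode page table.
	 */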
3677 pm_mask = PT_PRESENT_MASK;
3678 if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL)
3679 pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
3680
3681 for (i = 0; i < 4; ++i) {
3682 hpa_t root = vcpu->arch.mmu->pae_root[i];
3683
3684 MMU_WARN_ON(VALID_PAGE(root));
3685 if (vcpu->arch.mmu->root_level == PT32E_ROOT_LEVEL) {
3686 pdptr = vcpu->arch.mmu->get_pdptr(vcpu, i);
3687 if (!(pdptr & PT_PRESENT_MASK)) {
3688 vcpu->arch.mmu->pae_root[i] = 0;
3689 continue;
3690 }
3691 root_gfn = pdptr >> PAGE_SHIFT;
3692 if (mmu_check_root(vcpu, root_gfn))
3693 return 1;
3694 }
3695 spin_lock(&vcpu->kvm->mmu_lock);
3696 if (make_mmu_pages_available(vcpu) < 0) {
3697 spin_unlock(&vcpu->kvm->mmu_lock);
3698 return -ENOSPC;
3699 }
3700 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL,
3701 0, ACC_ALL);
3702 root = __pa(sp->spt);
3703 ++sp->root_count;
3704 spin_unlock(&vcpu->kvm->mmu_lock);
3705
3706 vcpu->arch.mmu->pae_root[i] = root | pm_mask;
3707 }
3708 vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
3709
3710
3711
3712
3713
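	/*
	 * If we shadow a 32 bit page table with a long mode page table we
	 * enter this path.
	 */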
3714 if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL) {
3715 if (vcpu->arch.mmu->lm_root == NULL) {
3716
3717
3718
3719
3720
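			/*
			 * The additional page needed for the lm_root is only
			 * allocated on demand.
			 */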
3721 u64 *lm_root;
3722
3723 lm_root = (void*)get_zeroed_page(GFP_KERNEL_ACCOUNT);
3724 if (lm_root == NULL)
3725 return 1;
3726
3727 lm_root[0] = __pa(vcpu->arch.mmu->pae_root) | pm_mask;
3728
3729 vcpu->arch.mmu->lm_root = lm_root;
3730 }
3731
3732 vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->lm_root);
3733 }
3734
3735set_root_cr3:
3736 vcpu->arch.mmu->root_cr3 = root_cr3;
3737
3738 return 0;
3739}
3740
3741static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
3742{
3743 if (vcpu->arch.mmu->direct_map)
3744 return mmu_alloc_direct_roots(vcpu);
3745 else
3746 return mmu_alloc_shadow_roots(vcpu);
3747}
3748
3749void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
3750{
3751 int i;
3752 struct kvm_mmu_page *sp;
3753
3754 if (vcpu->arch.mmu->direct_map)
3755 return;
3756
3757 if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
3758 return;
3759
3760 vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
3761
3762 if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
3763 hpa_t root = vcpu->arch.mmu->root_hpa;
3764 sp = page_header(root);
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
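		/*
		 * sp->unsync and sp->unsync_children are read locklessly
		 * here.  Even if another CPU is marking the SP unsync
		 * concurrently, its guest page table changes only need to be
		 * visible after the guest's subsequent TLB flush, and the
		 * flags are set before any actual page table change (see the
		 * comment in mmu_need_write_protect()).
		 */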
3776 if (!smp_load_acquire(&sp->unsync) &&
3777 !smp_load_acquire(&sp->unsync_children))
3778 return;
3779
3780 spin_lock(&vcpu->kvm->mmu_lock);
3781 kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
3782
3783 mmu_sync_children(vcpu, sp);
3784
3785 kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
3786 spin_unlock(&vcpu->kvm->mmu_lock);
3787 return;
3788 }
3789
3790 spin_lock(&vcpu->kvm->mmu_lock);
3791 kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
3792
3793 for (i = 0; i < 4; ++i) {
3794 hpa_t root = vcpu->arch.mmu->pae_root[i];
3795
3796 if (root && VALID_PAGE(root)) {
3797 root &= PT64_BASE_ADDR_MASK;
3798 sp = page_header(root);
3799 mmu_sync_children(vcpu, sp);
3800 }
3801 }
3802
3803 kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
3804 spin_unlock(&vcpu->kvm->mmu_lock);
3805}
3806EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots);
3807
3808static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
3809 u32 access, struct x86_exception *exception)
3810{
3811 if (exception)
3812 exception->error_code = 0;
3813 return vaddr;
3814}
3815
3816static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
3817 u32 access,
3818 struct x86_exception *exception)
3819{
3820 if (exception)
3821 exception->error_code = 0;
3822 return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access, exception);
3823}
3824
3825static bool
3826__is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check, u64 pte, int level)
3827{
3828 int bit7 = (pte >> 7) & 1, low6 = pte & 0x3f;
3829
3830 return (pte & rsvd_check->rsvd_bits_mask[bit7][level-1]) |
3831 ((rsvd_check->bad_mt_xwr & (1ull << low6)) != 0);
3832}
3833
3834static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
3835{
3836 return __is_rsvd_bits_set(&mmu->guest_rsvd_check, gpte, level);
3837}
3838
3839static bool is_shadow_zero_bits_set(struct kvm_mmu *mmu, u64 spte, int level)
3840{
3841 return __is_rsvd_bits_set(&mmu->shadow_zero_check, spte, level);
3842}
3843
3844static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
3845{
3846
3847
3848
3849
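	/*
	 * A nested guest cannot use the MMIO cache if it is using nested
	 * page tables, because cr2 is a nGPA while the cache stores GPAs.
	 */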
3850 if (mmu_is_nested(vcpu))
3851 return false;
3852
3853 if (direct)
3854 return vcpu_match_mmio_gpa(vcpu, addr);
3855
3856 return vcpu_match_mmio_gva(vcpu, addr);
3857}
3858
3859
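/* return true if reserved bit is detected on spte. */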
3860static bool
3861walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
3862{
3863 struct kvm_shadow_walk_iterator iterator;
3864 u64 sptes[PT64_ROOT_MAX_LEVEL], spte = 0ull;
3865 int root, leaf;
3866 bool reserved = false;
3867
3868 if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
3869 goto exit;
3870
3871 walk_shadow_page_lockless_begin(vcpu);
3872
3873 for (shadow_walk_init(&iterator, vcpu, addr),
3874 leaf = root = iterator.level;
3875 shadow_walk_okay(&iterator);
3876 __shadow_walk_next(&iterator, spte)) {
3877 spte = mmu_spte_get_lockless(iterator.sptep);
3878
3879 sptes[leaf - 1] = spte;
3880 leaf--;
3881
3882 if (!is_shadow_present_pte(spte))
3883 break;
3884
3885 reserved |= is_shadow_zero_bits_set(vcpu->arch.mmu, spte,
3886 iterator.level);
3887 }
3888
3889 walk_shadow_page_lockless_end(vcpu);
3890
3891 if (reserved) {
3892 pr_err("%s: detect reserved bits on spte, addr 0x%llx, dump hierarchy:\n",
3893 __func__, addr);
3894 while (root > leaf) {
3895 pr_err("------ spte 0x%llx level %d.\n",
3896 sptes[root - 1], root);
3897 root--;
3898 }
3899 }
3900exit:
3901 *sptep = spte;
3902 return reserved;
3903}
3904
3905static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
3906{
3907 u64 spte;
3908 bool reserved;
3909
3910 if (mmio_info_in_cache(vcpu, addr, direct))
3911 return RET_PF_EMULATE;
3912
3913 reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte);
3914 if (WARN_ON(reserved))
3915 return -EINVAL;
3916
3917 if (is_mmio_spte(spte)) {
3918 gfn_t gfn = get_mmio_spte_gfn(spte);
3919 unsigned access = get_mmio_spte_access(spte);
3920
3921 if (!check_mmio_spte(vcpu, spte))
3922 return RET_PF_INVALID;
3923
3924 if (direct)
3925 addr = 0;
3926
3927 trace_handle_mmio_page_fault(addr, gfn, access);
3928 vcpu_cache_mmio_info(vcpu, addr, gfn, access);
3929 return RET_PF_EMULATE;
3930 }
3931
3932
3933
3934
3935
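	/*
	 * If the page table has been zapped by another cpu, let the CPU
	 * fault again on the address instead of emulating.
	 */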
3936 return RET_PF_RETRY;
3937}
3938
3939static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu,
3940 u32 error_code, gfn_t gfn)
3941{
3942 if (unlikely(error_code & PFERR_RSVD_MASK))
3943 return false;
3944
3945 if (!(error_code & PFERR_PRESENT_MASK) ||
3946 !(error_code & PFERR_WRITE_MASK))
3947 return false;
3948
3949
3950
3951
3952
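	/*
	 * The gfn is write-tracked, so the guest's write cannot be fixed by
	 * the page fault handler; the access has to be emulated.
	 */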
3953 if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
3954 return true;
3955
3956 return false;
3957}
3958
3959static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
3960{
3961 struct kvm_shadow_walk_iterator iterator;
3962 u64 spte;
3963
3964 if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
3965 return;
3966
3967 walk_shadow_page_lockless_begin(vcpu);
3968 for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
3969 clear_sp_write_flooding_count(iterator.sptep);
3970 if (!is_shadow_present_pte(spte))
3971 break;
3972 }
3973 walk_shadow_page_lockless_end(vcpu);
3974}
3975
3976static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
3977 u32 error_code, bool prefault)
3978{
3979 gfn_t gfn = gva >> PAGE_SHIFT;
3980 int r;
3981
3982 pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
3983
3984 if (page_fault_handle_page_track(vcpu, error_code, gfn))
3985 return RET_PF_EMULATE;
3986
3987 r = mmu_topup_memory_caches(vcpu);
3988 if (r)
3989 return r;
3990
3991 MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa));
3992
3993
3994 return nonpaging_map(vcpu, gva & PAGE_MASK,
3995 error_code, gfn, prefault);
3996}
3997
3998static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
3999{
4000 struct kvm_arch_async_pf arch;
4001
4002 arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
4003 arch.gfn = gfn;
4004 arch.direct_map = vcpu->arch.mmu->direct_map;
4005 arch.cr3 = vcpu->arch.mmu->get_cr3(vcpu);
4006
4007 return kvm_setup_async_pf(vcpu, gva, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
4008}
4009
4010bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
4011{
4012 if (unlikely(!lapic_in_kernel(vcpu) ||
4013 kvm_event_needs_reinjection(vcpu) ||
4014 vcpu->arch.exception.pending))
4015 return false;
4016
4017 if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu))
4018 return false;
4019
4020 return kvm_x86_ops->interrupt_allowed(vcpu);
4021}
4022
4023static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
4024 gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable)
4025{
4026 struct kvm_memory_slot *slot;
4027 bool async;
4028
4029
4030
4031
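	/* Don't expose private memslots to L2. */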
4032 if (is_guest_mode(vcpu) && !kvm_is_visible_gfn(vcpu->kvm, gfn)) {
4033 *pfn = KVM_PFN_NOSLOT;
4034 return false;
4035 }
4036
4037 slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
4038 async = false;
4039 *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable);
4040 if (!async)
4041 return false;
4042
4043 if (!prefault && kvm_can_do_async_pf(vcpu)) {
4044 trace_kvm_try_async_get_page(gva, gfn);
4045 if (kvm_find_async_pf_gfn(vcpu, gfn)) {
4046 trace_kvm_async_pf_doublefault(gva, gfn);
4047 kvm_make_request(KVM_REQ_APF_HALT, vcpu);
4048 return true;
4049 } else if (kvm_arch_setup_async_pf(vcpu, gva, gfn))
4050 return true;
4051 }
4052
4053 *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable);
4054 return false;
4055}
4056
4057int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
4058 u64 fault_address, char *insn, int insn_len)
4059{
4060 int r = 1;
4061
4062 vcpu->arch.l1tf_flush_l1d = true;
4063 switch (vcpu->arch.apf.host_apf_reason) {
4064 default:
4065 trace_kvm_page_fault(fault_address, error_code);
4066
4067 if (kvm_event_needs_reinjection(vcpu))
4068 kvm_mmu_unprotect_page_virt(vcpu, fault_address);
4069 r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn,
4070 insn_len);
4071 break;
4072 case KVM_PV_REASON_PAGE_NOT_PRESENT:
4073 vcpu->arch.apf.host_apf_reason = 0;
4074 local_irq_disable();
4075 kvm_async_pf_task_wait(fault_address, 0);
4076 local_irq_enable();
4077 break;
4078 case KVM_PV_REASON_PAGE_READY:
4079 vcpu->arch.apf.host_apf_reason = 0;
4080 local_irq_disable();
4081 kvm_async_pf_task_wake(fault_address);
4082 local_irq_enable();
4083 break;
4084 }
4085 return r;
4086}
4087EXPORT_SYMBOL_GPL(kvm_handle_page_fault);
4088
4089static bool
4090check_hugepage_cache_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int level)
4091{
4092 int page_num = KVM_PAGES_PER_HPAGE(level);
4093
4094 gfn &= ~(page_num - 1);
4095
4096 return kvm_mtrr_check_gfn_range_consistency(vcpu, gfn, page_num);
4097}
4098
4099static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
4100 bool prefault)
4101{
4102 kvm_pfn_t pfn;
4103 int r;
4104 int level;
4105 bool force_pt_level;
4106 gfn_t gfn = gpa >> PAGE_SHIFT;
4107 unsigned long mmu_seq;
4108 int write = error_code & PFERR_WRITE_MASK;
4109 bool map_writable;
4110
4111 MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa));
4112
4113 if (page_fault_handle_page_track(vcpu, error_code, gfn))
4114 return RET_PF_EMULATE;
4115
4116 r = mmu_topup_memory_caches(vcpu);
4117 if (r)
4118 return r;
4119
4120 force_pt_level = !check_hugepage_cache_consistency(vcpu, gfn,
4121 PT_DIRECTORY_LEVEL);
4122 level = mapping_level(vcpu, gfn, &force_pt_level);
4123 if (likely(!force_pt_level)) {
4124 if (level > PT_DIRECTORY_LEVEL &&
4125 !check_hugepage_cache_consistency(vcpu, gfn, level))
4126 level = PT_DIRECTORY_LEVEL;
4127 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
4128 }
4129
4130 if (fast_page_fault(vcpu, gpa, level, error_code))
4131 return RET_PF_RETRY;
4132
4133 mmu_seq = vcpu->kvm->mmu_notifier_seq;
4134 smp_rmb();
4135
4136 if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
4137 return RET_PF_RETRY;
4138
4139 if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r))
4140 return r;
4141
4142 spin_lock(&vcpu->kvm->mmu_lock);
4143 if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
4144 goto out_unlock;
4145 if (make_mmu_pages_available(vcpu) < 0)
4146 goto out_unlock;
4147 if (likely(!force_pt_level))
4148 transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
4149 r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault);
4150 spin_unlock(&vcpu->kvm->mmu_lock);
4151
4152 return r;
4153
4154out_unlock:
4155 spin_unlock(&vcpu->kvm->mmu_lock);
4156 kvm_release_pfn_clean(pfn);
4157 return RET_PF_RETRY;
4158}
4159
4160static void nonpaging_init_context(struct kvm_vcpu *vcpu,
4161 struct kvm_mmu *context)
4162{
4163 context->page_fault = nonpaging_page_fault;
4164 context->gva_to_gpa = nonpaging_gva_to_gpa;
4165 context->sync_page = nonpaging_sync_page;
4166 context->invlpg = nonpaging_invlpg;
4167 context->update_pte = nonpaging_update_pte;
4168 context->root_level = 0;
4169 context->shadow_root_level = PT32E_ROOT_LEVEL;
4170 context->direct_map = true;
4171 context->nx = false;
4172}
4173
4174
4175
4176
4177
4178
4179
4180
4181
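/*
 * Find out if a previously cached root matching the new CR3/role is available.
 * The current root is also inserted into the cache.
 * If a matching root was found, it is assigned to kvm_mmu->root_hpa and true is
 * returned.
 * Otherwise, the LRU root from the cache (if any) is assigned to
 * kvm_mmu->root_hpa and false is returned.
 */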
4182static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_cr3,
4183 union kvm_mmu_page_role new_role)
4184{
4185 uint i;
4186 struct kvm_mmu_root_info root;
4187 struct kvm_mmu *mmu = vcpu->arch.mmu;
4188
4189 root.cr3 = mmu->root_cr3;
4190 root.hpa = mmu->root_hpa;
4191
4192 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
4193 swap(root, mmu->prev_roots[i]);
4194
4195 if (new_cr3 == root.cr3 && VALID_PAGE(root.hpa) &&
4196 page_header(root.hpa) != NULL &&
4197 new_role.word == page_header(root.hpa)->role.word)
4198 break;
4199 }
4200
4201 mmu->root_hpa = root.hpa;
4202 mmu->root_cr3 = root.cr3;
4203
4204 return i < KVM_MMU_NUM_PREV_ROOTS;
4205}
4206
4207static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3,
4208 union kvm_mmu_page_role new_role,
4209 bool skip_tlb_flush)
4210{
4211 struct kvm_mmu *mmu = vcpu->arch.mmu;
4212
4213
4214
4215
4216
4217
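	/*
	 * For now, limit the fast switch to 64-bit hosts+VMs in order to
	 * avoid having to deal with PDPTEs.  We may add support for 32-bit
	 * hosts/VMs later.
	 */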
4218 if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
4219 mmu->root_level >= PT64_ROOT_4LEVEL) {
4220 if (mmu_check_root(vcpu, new_cr3 >> PAGE_SHIFT))
4221 return false;
4222
4223 if (cached_root_available(vcpu, new_cr3, new_role)) {
4224 kvm_make_request(KVM_REQ_LOAD_CR3, vcpu);
4225 if (!skip_tlb_flush) {
4226 kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
4227 kvm_x86_ops->tlb_flush(vcpu, true);
4228 }
4229
4230
4231
4232
4233
4234
4235
4236
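			/*
			 * The last MMIO access's GVA and GPA are cached in
			 * the VCPU.  When switching to a new CR3, that
			 * GVA->GPA mapping may no longer be valid, so clear
			 * any cached MMIO info even when we don't need to
			 * sync the shadow page tables.
			 */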
4237 vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
4238
4239 __clear_sp_write_flooding_count(
4240 page_header(mmu->root_hpa));
4241
4242 return true;
4243 }
4244 }
4245
4246 return false;
4247}
4248
4249static void __kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3,
4250 union kvm_mmu_page_role new_role,
4251 bool skip_tlb_flush)
4252{
4253 if (!fast_cr3_switch(vcpu, new_cr3, new_role, skip_tlb_flush))
4254 kvm_mmu_free_roots(vcpu, vcpu->arch.mmu,
4255 KVM_MMU_ROOT_CURRENT);
4256}
4257
4258void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush)
4259{
4260 __kvm_mmu_new_cr3(vcpu, new_cr3, kvm_mmu_calc_root_page_role(vcpu),
4261 skip_tlb_flush);
4262}
4263EXPORT_SYMBOL_GPL(kvm_mmu_new_cr3);
4264
4265static unsigned long get_cr3(struct kvm_vcpu *vcpu)
4266{
4267 return kvm_read_cr3(vcpu);
4268}
4269
4270static void inject_page_fault(struct kvm_vcpu *vcpu,
4271 struct x86_exception *fault)
4272{
4273 vcpu->arch.mmu->inject_page_fault(vcpu, fault);
4274}
4275
4276static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
4277 unsigned access, int *nr_present)
4278{
4279 if (unlikely(is_mmio_spte(*sptep))) {
4280 if (gfn != get_mmio_spte_gfn(*sptep)) {
4281 mmu_spte_clear_no_track(sptep);
4282 return true;
4283 }
4284
4285 (*nr_present)++;
4286 mark_mmio_spte(vcpu, sptep, gfn, access);
4287 return true;
4288 }
4289
4290 return false;
4291}
4292
4293static inline bool is_last_gpte(struct kvm_mmu *mmu,
4294 unsigned level, unsigned gpte)
4295{
4296
4297
4298
4299
4300
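	/*
	 * The RHS has bit 7 set iff level < mmu->last_nonleaf_level.  If it
	 * is clear, there are no large pages at this level, so clear
	 * PT_PAGE_SIZE_MASK in gpte if that is the case.
	 */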
4301 gpte &= level - mmu->last_nonleaf_level;
4302
4303
4304
4305
4306
4307
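	/*
	 * PT_PAGE_TABLE_LEVEL always terminates.  The RHS has bit 7 set iff
	 * level <= PT_PAGE_TABLE_LEVEL, which for our purpose means
	 * level == PT_PAGE_TABLE_LEVEL; set PT_PAGE_SIZE_MASK in gpte then.
	 */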
4308 gpte |= level - PT_PAGE_TABLE_LEVEL - 1;
4309
4310 return gpte & PT_PAGE_SIZE_MASK;
4311}
4312
4313#define PTTYPE_EPT 18
4314#define PTTYPE PTTYPE_EPT
4315#include "paging_tmpl.h"
4316#undef PTTYPE
4317
4318#define PTTYPE 64
4319#include "paging_tmpl.h"
4320#undef PTTYPE
4321
4322#define PTTYPE 32
4323#include "paging_tmpl.h"
4324#undef PTTYPE
4325
4326static void
4327__reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
4328 struct rsvd_bits_validate *rsvd_check,
4329 int maxphyaddr, int level, bool nx, bool gbpages,
4330 bool pse, bool amd)
4331{
4332 u64 exb_bit_rsvd = 0;
4333 u64 gbpages_bit_rsvd = 0;
4334 u64 nonleaf_bit8_rsvd = 0;
4335
4336 rsvd_check->bad_mt_xwr = 0;
4337
4338 if (!nx)
4339 exb_bit_rsvd = rsvd_bits(63, 63);
4340 if (!gbpages)
4341 gbpages_bit_rsvd = rsvd_bits(7, 7);
4342
4343
4344
4345
4346
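	/*
	 * Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit
	 * for leaf entries) on AMD CPUs only.
	 */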
4347 if (amd)
4348 nonleaf_bit8_rsvd = rsvd_bits(8, 8);
4349
4350 switch (level) {
4351 case PT32_ROOT_LEVEL:
4352
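		/* no rsvd bits for 2 level 4K page table entries */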
4353 rsvd_check->rsvd_bits_mask[0][1] = 0;
4354 rsvd_check->rsvd_bits_mask[0][0] = 0;
4355 rsvd_check->rsvd_bits_mask[1][0] =
4356 rsvd_check->rsvd_bits_mask[0][0];
4357
4358 if (!pse) {
4359 rsvd_check->rsvd_bits_mask[1][1] = 0;
4360 break;
4361 }
4362
4363 if (is_cpuid_PSE36())
4364
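			/* 36bits PSE 4MB page */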
4365 rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
4366 else
4367
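			/* 32 bits PSE 4MB page */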
4368 rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
4369 break;
4370 case PT32E_ROOT_LEVEL:
4371 rsvd_check->rsvd_bits_mask[0][2] =
4372 rsvd_bits(maxphyaddr, 63) |
4373 rsvd_bits(5, 8) | rsvd_bits(1, 2);
4374 rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd |
4375 rsvd_bits(maxphyaddr, 62);
4376 rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd |
4377 rsvd_bits(maxphyaddr, 62);
4378 rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd |
4379 rsvd_bits(maxphyaddr, 62) |
4380 rsvd_bits(13, 20);
4381 rsvd_check->rsvd_bits_mask[1][0] =
4382 rsvd_check->rsvd_bits_mask[0][0];
4383 break;
4384 case PT64_ROOT_5LEVEL:
4385 rsvd_check->rsvd_bits_mask[0][4] = exb_bit_rsvd |
4386 nonleaf_bit8_rsvd | rsvd_bits(7, 7) |
4387 rsvd_bits(maxphyaddr, 51);
4388 rsvd_check->rsvd_bits_mask[1][4] =
4389 rsvd_check->rsvd_bits_mask[0][4];
4390
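		/* fall through */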
4391 case PT64_ROOT_4LEVEL:
4392 rsvd_check->rsvd_bits_mask[0][3] = exb_bit_rsvd |
4393 nonleaf_bit8_rsvd | rsvd_bits(7, 7) |
4394 rsvd_bits(maxphyaddr, 51);
4395 rsvd_check->rsvd_bits_mask[0][2] = exb_bit_rsvd |
4396 nonleaf_bit8_rsvd | gbpages_bit_rsvd |
4397 rsvd_bits(maxphyaddr, 51);
4398 rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd |
4399 rsvd_bits(maxphyaddr, 51);
4400 rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd |
4401 rsvd_bits(maxphyaddr, 51);
4402 rsvd_check->rsvd_bits_mask[1][3] =
4403 rsvd_check->rsvd_bits_mask[0][3];
4404 rsvd_check->rsvd_bits_mask[1][2] = exb_bit_rsvd |
4405 gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51) |
4406 rsvd_bits(13, 29);
4407 rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd |
4408 rsvd_bits(maxphyaddr, 51) |
4409 rsvd_bits(13, 20);
4410 rsvd_check->rsvd_bits_mask[1][0] =
4411 rsvd_check->rsvd_bits_mask[0][0];
4412 break;
4413 }
4414}
4415
4416static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
4417 struct kvm_mmu *context)
4418{
4419 __reset_rsvds_bits_mask(vcpu, &context->guest_rsvd_check,
4420 cpuid_maxphyaddr(vcpu), context->root_level,
4421 context->nx,
4422 guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
4423 is_pse(vcpu), guest_cpuid_is_amd(vcpu));
4424}
4425
4426static void
4427__reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
4428 int maxphyaddr, bool execonly)
4429{
4430 u64 bad_mt_xwr;
4431
4432 rsvd_check->rsvd_bits_mask[0][4] =
4433 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
4434 rsvd_check->rsvd_bits_mask[0][3] =
4435 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
4436 rsvd_check->rsvd_bits_mask[0][2] =
4437 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
4438 rsvd_check->rsvd_bits_mask[0][1] =
4439 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
4440 rsvd_check->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51);
4441
4442
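	/* large page */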
4443 rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4];
4444 rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3];
4445 rsvd_check->rsvd_bits_mask[1][2] =
4446 rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29);
4447 rsvd_check->rsvd_bits_mask[1][1] =
4448 rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20);
4449 rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0];
4450
4451 bad_mt_xwr = 0xFFull << (2 * 8);
4452 bad_mt_xwr |= 0xFFull << (3 * 8);
4453 bad_mt_xwr |= 0xFFull << (7 * 8);
4454 bad_mt_xwr |= REPEAT_BYTE(1ull << 2);
4455 bad_mt_xwr |= REPEAT_BYTE(1ull << 6);
4456 if (!execonly) {
4457
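		/* bits 0..2 must not be 100 */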
4458 bad_mt_xwr |= REPEAT_BYTE(1ull << 4);
4459 }
4460 rsvd_check->bad_mt_xwr = bad_mt_xwr;
4461}
4462
4463static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
4464 struct kvm_mmu *context, bool execonly)
4465{
4466 __reset_rsvds_bits_mask_ept(&context->guest_rsvd_check,
4467 cpuid_maxphyaddr(vcpu), execonly);
4468}
4469
4470
4471
4472
4473
4474
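/*
 * The page table on the host is the shadow page table for the page table in
 * the guest (or AMD nested guest), so its MMU features completely follow the
 * features in the guest.
 */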
4475void
4476reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
4477{
4478 bool uses_nx = context->nx ||
4479 context->mmu_role.base.smep_andnot_wp;
4480 struct rsvd_bits_validate *shadow_zero_check;
4481 int i;
4482
4483
4484
4485
4486
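	/*
	 * Passing "true" to the last argument is okay; it adds a check
	 * on bit 8 of the SPTEs which KVM doesn't use anyway.
	 */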
4487 shadow_zero_check = &context->shadow_zero_check;
4488 __reset_rsvds_bits_mask(vcpu, shadow_zero_check,
4489 boot_cpu_data.x86_phys_bits,
4490 context->shadow_root_level, uses_nx,
4491 guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
4492 is_pse(vcpu), true);
4493
4494 if (!shadow_me_mask)
4495 return;
4496
4497 for (i = context->shadow_root_level; --i >= 0;) {
4498 shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
4499 shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
4500 }
4501
4502}
4503EXPORT_SYMBOL_GPL(reset_shadow_zero_bits_mask);
4504
4505static inline bool boot_cpu_is_amd(void)
4506{
4507 WARN_ON_ONCE(!tdp_enabled);
4508 return shadow_x_mask == 0;
4509}
4510
4511
4512
4513
4514
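/*
 * The direct (TDP) page table on the host: use as many MMU features as
 * possible, though KVM currently does not do execution-protection here.
 */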
4515static void
4516reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
4517 struct kvm_mmu *context)
4518{
4519 struct rsvd_bits_validate *shadow_zero_check;
4520 int i;
4521
4522 shadow_zero_check = &context->shadow_zero_check;
4523
4524 if (boot_cpu_is_amd())
4525 __reset_rsvds_bits_mask(vcpu, shadow_zero_check,
4526 boot_cpu_data.x86_phys_bits,
4527 context->shadow_root_level, false,
4528 boot_cpu_has(X86_FEATURE_GBPAGES),
4529 true, true);
4530 else
4531 __reset_rsvds_bits_mask_ept(shadow_zero_check,
4532 boot_cpu_data.x86_phys_bits,
4533 false);
4534
4535 if (!shadow_me_mask)
4536 return;
4537
4538 for (i = context->shadow_root_level; --i >= 0;) {
4539 shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
4540 shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
4541 }
4542}
4543
4544
4545
4546
4547
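/*
 * As in reset_shadow_zero_bits_mask() above, except this is the shadow EPT
 * page table for an Intel nested guest.
 */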
4548static void
4549reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
4550 struct kvm_mmu *context, bool execonly)
4551{
4552 __reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
4553 boot_cpu_data.x86_phys_bits, execonly);
4554}
4555
4556#define BYTE_MASK(access) \
4557 ((1 & (access) ? 2 : 0) | \
4558 (2 & (access) ? 4 : 0) | \
4559 (3 & (access) ? 8 : 0) | \
4560 (4 & (access) ? 16 : 0) | \
4561 (5 & (access) ? 32 : 0) | \
4562 (6 & (access) ? 64 : 0) | \
4563 (7 & (access) ? 128 : 0))
4564
4565
4566static void update_permission_bitmask(struct kvm_vcpu *vcpu,
4567 struct kvm_mmu *mmu, bool ept)
4568{
4569 unsigned byte;
4570
4571 const u8 x = BYTE_MASK(ACC_EXEC_MASK);
4572 const u8 w = BYTE_MASK(ACC_WRITE_MASK);
4573 const u8 u = BYTE_MASK(ACC_USER_MASK);
4574
4575 bool cr4_smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP) != 0;
4576 bool cr4_smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP) != 0;
4577 bool cr0_wp = is_write_protection(vcpu);
4578
4579 for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) {
4580 unsigned pfec = byte << 1;
4581
4582
4583
4584
4585
4586
4587
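		/*
		 * Each "*f" variable has a 1 bit for each UWX value
		 * that causes a fault with the given PFEC.
		 */

		/* Faults from writes to non-writable pages */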
4588 u8 wf = (pfec & PFERR_WRITE_MASK) ? ~w : 0;
4589
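		/* Faults from user mode accesses to supervisor pages */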
4590 u8 uf = (pfec & PFERR_USER_MASK) ? ~u : 0;
4591
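		/* Faults from fetches of non-executable code */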
4592 u8 ff = (pfec & PFERR_FETCH_MASK) ? ~x : 0;
4593
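		/* Faults from kernel mode fetches of user pages */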
4594 u8 smepf = 0;
4595
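		/* Faults from kernel mode accesses of user pages */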
4596 u8 smapf = 0;
4597
4598 if (!ept) {
4599
4600 u8 kf = (pfec & PFERR_USER_MASK) ? 0 : u;
4601
4602
4603 if (!mmu->nx)
4604 ff = 0;
4605
4606
4607 if (!cr0_wp)
4608 wf = (pfec & PFERR_USER_MASK) ? wf : 0;
4609
4610
4611 if (cr4_smep)
4612 smepf = (pfec & PFERR_FETCH_MASK) ? kf : 0;
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630 if (cr4_smap)
4631 smapf = (pfec & (PFERR_RSVD_MASK|PFERR_FETCH_MASK)) ? 0 : kf;
4632 }
4633
4634 mmu->permissions[byte] = ff | uf | wf | smepf | smapf;
4635 }
4636}
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662static void update_pkru_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
4663 bool ept)
4664{
4665 unsigned bit;
4666 bool wp;
4667
4668 if (ept) {
4669 mmu->pkru_mask = 0;
4670 return;
4671 }
4672
4673
4674 if (!kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || !is_long_mode(vcpu)) {
4675 mmu->pkru_mask = 0;
4676 return;
4677 }
4678
4679 wp = is_write_protection(vcpu);
4680
4681 for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) {
4682 unsigned pfec, pkey_bits;
4683 bool check_pkey, check_write, ff, uf, wf, pte_user;
4684
4685 pfec = bit << 1;
4686 ff = pfec & PFERR_FETCH_MASK;
4687 uf = pfec & PFERR_USER_MASK;
4688 wf = pfec & PFERR_WRITE_MASK;
4689
4690
4691 pte_user = pfec & PFERR_RSVD_MASK;
4692
4693
4694
4695
4696
4697 check_pkey = (!ff && pte_user);
4698
4699
4700
4701
4702 check_write = check_pkey && wf && (uf || wp);
4703
4704
4705 pkey_bits = !!check_pkey;
4706
4707 pkey_bits |= (!!check_write) << 1;
4708
4709 mmu->pkru_mask |= (pkey_bits & 3) << pfec;
4710 }
4711}
4712
static void update_last_nonleaf_level(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
{
	unsigned root_level = mmu->root_level;

	mmu->last_nonleaf_level = root_level;
	if (root_level == PT32_ROOT_LEVEL && is_pse(vcpu))
		mmu->last_nonleaf_level++;
}

static void paging64_init_context_common(struct kvm_vcpu *vcpu,
					 struct kvm_mmu *context,
					 int level)
{
	context->nx = is_nx(vcpu);
	context->root_level = level;

	reset_rsvds_bits_mask(vcpu, context);
	update_permission_bitmask(vcpu, context, false);
	update_pkru_bitmask(vcpu, context, false);
	update_last_nonleaf_level(vcpu, context);

	MMU_WARN_ON(!is_pae(vcpu));
	context->page_fault = paging64_page_fault;
	context->gva_to_gpa = paging64_gva_to_gpa;
	context->sync_page = paging64_sync_page;
	context->invlpg = paging64_invlpg;
	context->update_pte = paging64_update_pte;
	context->shadow_root_level = level;
	context->direct_map = false;
}

static void paging64_init_context(struct kvm_vcpu *vcpu,
				  struct kvm_mmu *context)
{
	int root_level = is_la57_mode(vcpu) ?
			 PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;

	paging64_init_context_common(vcpu, context, root_level);
}

static void paging32_init_context(struct kvm_vcpu *vcpu,
				  struct kvm_mmu *context)
{
	context->nx = false;
	context->root_level = PT32_ROOT_LEVEL;

	reset_rsvds_bits_mask(vcpu, context);
	update_permission_bitmask(vcpu, context, false);
	update_pkru_bitmask(vcpu, context, false);
	update_last_nonleaf_level(vcpu, context);

	context->page_fault = paging32_page_fault;
	context->gva_to_gpa = paging32_gva_to_gpa;
	context->sync_page = paging32_sync_page;
	context->invlpg = paging32_invlpg;
	context->update_pte = paging32_update_pte;
	context->shadow_root_level = PT32E_ROOT_LEVEL;
	context->direct_map = false;
}

static void paging32E_init_context(struct kvm_vcpu *vcpu,
				   struct kvm_mmu *context)
{
	paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL);
}

static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu)
{
	union kvm_mmu_extended_role ext = {0};

	ext.cr0_pg = !!is_paging(vcpu);
	ext.cr4_pae = !!is_pae(vcpu);
	ext.cr4_smep = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
	ext.cr4_smap = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMAP);
	ext.cr4_pse = !!is_pse(vcpu);
	ext.cr4_pke = !!kvm_read_cr4_bits(vcpu, X86_CR4_PKE);
	ext.cr4_la57 = !!kvm_read_cr4_bits(vcpu, X86_CR4_LA57);
	ext.maxphyaddr = cpuid_maxphyaddr(vcpu);

	ext.valid = 1;

	return ext;
}

static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu,
						   bool base_only)
{
	union kvm_mmu_role role = {0};

	role.base.access = ACC_ALL;
	role.base.nxe = !!is_nx(vcpu);
	role.base.cr0_wp = is_write_protection(vcpu);
	role.base.smm = is_smm(vcpu);
	role.base.guest_mode = is_guest_mode(vcpu);

	if (base_only)
		return role;

	role.ext = kvm_calc_mmu_role_ext(vcpu);

	return role;
}

static union kvm_mmu_role
kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
{
	union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only);

	role.base.ad_disabled = (shadow_accessed_mask == 0);
	role.base.level = kvm_x86_ops->get_tdp_level(vcpu);
	role.base.direct = true;
	role.base.gpte_is_8_bytes = true;

	return role;
}

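/*
 * Set up the MMU when hardware two-dimensional paging (EPT/NPT) is in use:
 * the shadow root is a direct map of guest physical memory, while
 * gva_to_gpa still follows the guest's current paging mode for emulation.
 */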
static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu *context = vcpu->arch.mmu;
	union kvm_mmu_role new_role =
		kvm_calc_tdp_mmu_root_page_role(vcpu, false);

	new_role.base.word &= mmu_base_role_mask.word;
	if (new_role.as_u64 == context->mmu_role.as_u64)
		return;

	context->mmu_role.as_u64 = new_role.as_u64;
	context->page_fault = tdp_page_fault;
	context->sync_page = nonpaging_sync_page;
	context->invlpg = nonpaging_invlpg;
	context->update_pte = nonpaging_update_pte;
	context->shadow_root_level = kvm_x86_ops->get_tdp_level(vcpu);
	context->direct_map = true;
	context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
	context->get_cr3 = get_cr3;
	context->get_pdptr = kvm_pdptr_read;
	context->inject_page_fault = kvm_inject_page_fault;

	if (!is_paging(vcpu)) {
		context->nx = false;
		context->gva_to_gpa = nonpaging_gva_to_gpa;
		context->root_level = 0;
	} else if (is_long_mode(vcpu)) {
		context->nx = is_nx(vcpu);
		context->root_level = is_la57_mode(vcpu) ?
				PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
		reset_rsvds_bits_mask(vcpu, context);
		context->gva_to_gpa = paging64_gva_to_gpa;
	} else if (is_pae(vcpu)) {
		context->nx = is_nx(vcpu);
		context->root_level = PT32E_ROOT_LEVEL;
		reset_rsvds_bits_mask(vcpu, context);
		context->gva_to_gpa = paging64_gva_to_gpa;
	} else {
		context->nx = false;
		context->root_level = PT32_ROOT_LEVEL;
		reset_rsvds_bits_mask(vcpu, context);
		context->gva_to_gpa = paging32_gva_to_gpa;
	}

	update_permission_bitmask(vcpu, context, false);
	update_pkru_bitmask(vcpu, context, false);
	update_last_nonleaf_level(vcpu, context);
	reset_tdp_shadow_zero_bits_mask(vcpu, context);
}

static union kvm_mmu_role
kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
{
	union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only);

	role.base.smep_andnot_wp = role.ext.cr4_smep &&
		!is_write_protection(vcpu);
	role.base.smap_andnot_wp = role.ext.cr4_smap &&
		!is_write_protection(vcpu);
	role.base.direct = !is_paging(vcpu);
	role.base.gpte_is_8_bytes = !!is_pae(vcpu);

	if (!is_long_mode(vcpu))
		role.base.level = PT32E_ROOT_LEVEL;
	else if (is_la57_mode(vcpu))
		role.base.level = PT64_ROOT_5LEVEL;
	else
		role.base.level = PT64_ROOT_4LEVEL;

	return role;
}

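/*
 * Set up the MMU for classic shadow paging: KVM walks the guest's page
 * tables in software and builds shadow page tables that are handed to the
 * hardware.  The context is only reinitialized when the computed role
 * changes.
 */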
void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu *context = vcpu->arch.mmu;
	union kvm_mmu_role new_role =
		kvm_calc_shadow_mmu_root_page_role(vcpu, false);

	new_role.base.word &= mmu_base_role_mask.word;
	if (new_role.as_u64 == context->mmu_role.as_u64)
		return;

	if (!is_paging(vcpu))
		nonpaging_init_context(vcpu, context);
	else if (is_long_mode(vcpu))
		paging64_init_context(vcpu, context);
	else if (is_pae(vcpu))
		paging32E_init_context(vcpu, context);
	else
		paging32_init_context(vcpu, context);

	context->mmu_role.as_u64 = new_role.as_u64;
	reset_shadow_zero_bits_mask(vcpu, context);
}
EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);

static union kvm_mmu_role
kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
				   bool execonly)
{
	union kvm_mmu_role role = {0};

	/* The SMM flag is inherited from root_mmu. */
	role.base.smm = vcpu->arch.root_mmu.mmu_role.base.smm;

	role.base.level = PT64_ROOT_4LEVEL;
	role.base.gpte_is_8_bytes = true;
	role.base.direct = false;
	role.base.ad_disabled = !accessed_dirty;
	role.base.guest_mode = true;
	role.base.access = ACC_ALL;

	/*
	 * EPT permissions are not affected by CR0.WP or SMAP, so keep the
	 * role bits that depend on them at fixed values.
	 */
	role.base.cr0_wp = true;
	role.base.smap_andnot_wp = true;

	role.ext = kvm_calc_mmu_role_ext(vcpu);
	role.ext.execonly = execonly;

	return role;
}

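/*
 * Set up the MMU used to shadow a guest's EPT page tables, e.g. when L1
 * runs L2 with EPT: KVM walks L1's EPT tables and builds shadow EPT tables
 * for the hardware.
 */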
void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
			     bool accessed_dirty, gpa_t new_eptp)
{
	struct kvm_mmu *context = vcpu->arch.mmu;
	union kvm_mmu_role new_role =
		kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty,
						   execonly);

	__kvm_mmu_new_cr3(vcpu, new_eptp, new_role.base, false);

	new_role.base.word &= mmu_base_role_mask.word;
	if (new_role.as_u64 == context->mmu_role.as_u64)
		return;

	context->shadow_root_level = PT64_ROOT_4LEVEL;

	context->nx = true;
	context->ept_ad = accessed_dirty;
	context->page_fault = ept_page_fault;
	context->gva_to_gpa = ept_gva_to_gpa;
	context->sync_page = ept_sync_page;
	context->invlpg = ept_invlpg;
	context->update_pte = ept_update_pte;
	context->root_level = PT64_ROOT_4LEVEL;
	context->direct_map = false;
	context->mmu_role.as_u64 = new_role.as_u64;

	update_permission_bitmask(vcpu, context, true);
	update_pkru_bitmask(vcpu, context, true);
	update_last_nonleaf_level(vcpu, context);
	reset_rsvds_bits_mask_ept(vcpu, context, execonly);
	reset_ept_shadow_zero_bits_mask(vcpu, context, execonly);
}
EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);

static void init_kvm_softmmu(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu *context = vcpu->arch.mmu;

	kvm_init_shadow_mmu(vcpu);
	context->set_cr3 = kvm_x86_ops->set_cr3;
	context->get_cr3 = get_cr3;
	context->get_pdptr = kvm_pdptr_read;
	context->inject_page_fault = kvm_inject_page_fault;
}

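/*
 * The nested MMU is used while a nested guest is running with
 * two-dimensional paging.  Its gva_to_gpa helpers walk L2's page tables and
 * translate the resulting addresses through L1's tables via
 * translate_nested_gpa().
 */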
static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
{
	union kvm_mmu_role new_role = kvm_calc_mmu_role_common(vcpu, false);
	struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;

	new_role.base.word &= mmu_base_role_mask.word;
	if (new_role.as_u64 == g_context->mmu_role.as_u64)
		return;

	g_context->mmu_role.as_u64 = new_role.as_u64;
	g_context->get_cr3 = get_cr3;
	g_context->get_pdptr = kvm_pdptr_read;
	g_context->inject_page_fault = kvm_inject_page_fault;

	if (!is_paging(vcpu)) {
		g_context->nx = false;
		g_context->root_level = 0;
		g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
	} else if (is_long_mode(vcpu)) {
		g_context->nx = is_nx(vcpu);
		g_context->root_level = is_la57_mode(vcpu) ?
					PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
		reset_rsvds_bits_mask(vcpu, g_context);
		g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
	} else if (is_pae(vcpu)) {
		g_context->nx = is_nx(vcpu);
		g_context->root_level = PT32E_ROOT_LEVEL;
		reset_rsvds_bits_mask(vcpu, g_context);
		g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
	} else {
		g_context->nx = false;
		g_context->root_level = PT32_ROOT_LEVEL;
		reset_rsvds_bits_mask(vcpu, g_context);
		g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
	}

	update_permission_bitmask(vcpu, g_context, false);
	update_pkru_bitmask(vcpu, g_context, false);
	update_last_nonleaf_level(vcpu, g_context);
}

void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots)
{
	if (reset_roots) {
		uint i;

		vcpu->arch.mmu->root_hpa = INVALID_PAGE;

		for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
			vcpu->arch.mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
	}

	if (mmu_is_nested(vcpu))
		init_kvm_nested_mmu(vcpu);
	else if (tdp_enabled)
		init_kvm_tdp_mmu(vcpu);
	else
		init_kvm_softmmu(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_init_mmu);

static union kvm_mmu_page_role
kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu)
{
	union kvm_mmu_role role;

	if (tdp_enabled)
		role = kvm_calc_tdp_mmu_root_page_role(vcpu, true);
	else
		role = kvm_calc_shadow_mmu_root_page_role(vcpu, true);

	return role.base;
}

void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
{
	kvm_mmu_unload(vcpu);
	kvm_init_mmu(vcpu, true);
}
EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);

int kvm_mmu_load(struct kvm_vcpu *vcpu)
{
	int r;

	r = mmu_topup_memory_caches(vcpu);
	if (r)
		goto out;
	r = mmu_alloc_roots(vcpu);
	kvm_mmu_sync_roots(vcpu);
	if (r)
		goto out;
	kvm_mmu_load_cr3(vcpu);
	kvm_x86_ops->tlb_flush(vcpu, true);
out:
	return r;
}
EXPORT_SYMBOL_GPL(kvm_mmu_load);

void kvm_mmu_unload(struct kvm_vcpu *vcpu)
{
	kvm_mmu_free_roots(vcpu, &vcpu->arch.root_mmu, KVM_MMU_ROOTS_ALL);
	WARN_ON(VALID_PAGE(vcpu->arch.root_mmu.root_hpa));
	kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
	WARN_ON(VALID_PAGE(vcpu->arch.guest_mmu.root_hpa));
}
EXPORT_SYMBOL_GPL(kvm_mmu_unload);

static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
				  struct kvm_mmu_page *sp, u64 *spte,
				  const void *new)
{
	if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
		++vcpu->kvm->stat.mmu_pde_zapped;
		return;
	}

	++vcpu->kvm->stat.mmu_pte_updated;
	vcpu->arch.mmu->update_pte(vcpu, sp, spte, new);
}

static bool need_remote_flush(u64 old, u64 new)
{
	if (!is_shadow_present_pte(old))
		return false;
	if (!is_shadow_present_pte(new))
		return true;
	if ((old ^ new) & PT64_BASE_ADDR_MASK)
		return true;
	old ^= shadow_nx_mask;
	new ^= shadow_nx_mask;
	return (old & ~new & PT64_PERM_MASK) != 0;
}

static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
				    int *bytes)
{
	u64 gentry = 0;
	int r;

	/*
	 * Assume the write targets a guest page table of the same type as
	 * the vcpu's current paging mode, since sptes are only updated when
	 * the modes match.
	 */
	if (is_pae(vcpu) && *bytes == 4) {
		/* A 32-bit write updates half of an 8-byte PAE gpte. */
		*gpa &= ~(gpa_t)7;
		*bytes = 8;
	}

	if (*bytes == 4 || *bytes == 8) {
		r = kvm_vcpu_read_guest_atomic(vcpu, *gpa, &gentry, *bytes);
		if (r)
			gentry = 0;
	}

	return gentry;
}

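/*
 * If the guest keeps writing to a shadowed page table, the page is most
 * likely not being used as a page table anymore (or is being rewritten
 * wholesale, e.g. on fork), in which case it is cheaper to unshadow it than
 * to keep emulating every write.
 */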
static bool detect_write_flooding(struct kvm_mmu_page *sp)
{
	/*
	 * Skip write-flooding detection for last-level shadow pages; they
	 * can become unsynced instead of keeping the gfn write-protected.
	 */
	if (sp->role.level == PT_PAGE_TABLE_LEVEL)
		return false;

	atomic_inc(&sp->write_flooding_count);
	return atomic_read(&sp->write_flooding_count) >= 3;
}

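/*
 * Writes that are misaligned with respect to the gpte size usually mean the
 * page is not being used as a page table; they are too much trouble to
 * handle, so the caller zaps the shadow page instead.
 */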
static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
				    int bytes)
{
	unsigned offset, pte_size, misaligned;

	pgprintk("misaligned: gpa %llx bytes %d role %x\n",
		 gpa, bytes, sp->role.word);

	offset = offset_in_page(gpa);
	pte_size = sp->role.gpte_is_8_bytes ? 8 : 4;

	/*
	 * Guests sometimes update just one byte of a gpte to flip status
	 * bits (e.g. Linux's clear_bit() uses andb); a single-byte write at
	 * an aligned offset is not considered misaligned.
	 */
	if (!(offset & (pte_size - 1)) && bytes == 1)
		return false;

	misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
	misaligned |= bytes < 4;

	return misaligned;
}

static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
{
	unsigned page_offset, quadrant;
	u64 *spte;
	int level;

	page_offset = offset_in_page(gpa);
	level = sp->role.level;
	*nspte = 1;
	if (!sp->role.gpte_is_8_bytes) {
		page_offset <<= 1;	/* 4-byte gptes, 8-byte sptes */
		/*
		 * A 32-bit pde maps 4MB while the shadow pdes map only 2MB,
		 * so double the offset again and zap two pdes instead of one.
		 */
		if (level == PT32_ROOT_LEVEL) {
			page_offset &= ~7;
			page_offset <<= 1;
			*nspte = 2;
		}
		quadrant = page_offset >> PAGE_SHIFT;
		page_offset &= ~PAGE_MASK;
		if (quadrant != sp->role.quadrant)
			return NULL;
	}

	spte = &sp->spt[page_offset / sizeof(*spte)];
	return spte;
}

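/*
 * Page-track write callback: called when the guest writes to a gfn that is
 * shadowed as a page table.  Updates or zaps the sptes that mirror the
 * written gptes, and unshadows pages that no longer look like page tables.
 */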
static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
			      const u8 *new, int bytes,
			      struct kvm_page_track_notifier_node *node)
{
	gfn_t gfn = gpa >> PAGE_SHIFT;
	struct kvm_mmu_page *sp;
	LIST_HEAD(invalid_list);
	u64 entry, gentry, *spte;
	int npte;
	bool remote_flush, local_flush;

	/*
	 * If there are no indirect shadow pages, nothing is write-protected,
	 * so there is nothing to do.
	 */
	if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
		return;

	remote_flush = local_flush = false;

	pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);

	/*
	 * It does not matter if this allocation fails; pte prefetch is
	 * skipped when the caches do not hold enough objects.
	 */
	mmu_topup_memory_caches(vcpu);

	spin_lock(&vcpu->kvm->mmu_lock);

	gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes);

	++vcpu->kvm->stat.mmu_pte_write;
	kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);

	for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
		if (detect_write_misaligned(sp, gpa, bytes) ||
		    detect_write_flooding(sp)) {
			kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
			++vcpu->kvm->stat.mmu_flooded;
			continue;
		}

		spte = get_written_sptes(sp, gpa, &npte);
		if (!spte)
			continue;

		local_flush = true;
		while (npte--) {
			u32 base_role = vcpu->arch.mmu->mmu_role.base.word;

			entry = *spte;
			mmu_page_zap_pte(vcpu->kvm, sp, spte);
			if (gentry &&
			    !((sp->role.word ^ base_role)
			      & mmu_base_role_mask.word) && rmap_can_add(vcpu))
				mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
			if (need_remote_flush(entry, *spte))
				remote_flush = true;
			++spte;
		}
	}
	kvm_mmu_flush_or_zap(vcpu, &invalid_list, remote_flush, local_flush);
	kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
	spin_unlock(&vcpu->kvm->mmu_lock);
}

int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
{
	gpa_t gpa;
	int r;

	if (vcpu->arch.mmu->direct_map)
		return 0;

	gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);

	r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);

	return r;
}
EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);

static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
{
	LIST_HEAD(invalid_list);

	if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES))
		return 0;

	while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES) {
		if (!prepare_zap_oldest_mmu_page(vcpu->kvm, &invalid_list))
			break;

		++vcpu->kvm->stat.mmu_recycled;
	}
	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);

	if (!kvm_mmu_available_pages(vcpu->kvm))
		return -ENOSPC;
	return 0;
}

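/*
 * Common page fault entry point.  Dispatches to the current MMU's page
 * fault handler and, when the fault cannot be fixed by the MMU (e.g. MMIO
 * or emulated page-table writes), falls back to instruction emulation.
 */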
int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
		       void *insn, int insn_len)
{
	int r, emulation_type = 0;
	enum emulation_result er;
	bool direct = vcpu->arch.mmu->direct_map;

	/* With a direct MMU the faulting address is a GPA; remember it. */
	if (vcpu->arch.mmu->direct_map) {
		vcpu->arch.gpa_available = true;
		vcpu->arch.gpa_val = cr2;
	}

	r = RET_PF_INVALID;
	if (unlikely(error_code & PFERR_RSVD_MASK)) {
		r = handle_mmio_page_fault(vcpu, cr2, direct);
		if (r == RET_PF_EMULATE)
			goto emulate;
	}

	if (r == RET_PF_INVALID) {
		r = vcpu->arch.mmu->page_fault(vcpu, cr2,
					       lower_32_bits(error_code),
					       false);
		WARN_ON(r == RET_PF_INVALID);
	}

	if (r == RET_PF_RETRY)
		return 1;
	if (r < 0)
		return r;

	/*
	 * Before emulating the instruction, check whether the error code was
	 * due to a write-protection violation while translating the guest
	 * page.  This can happen with nested paging in both guests; if so,
	 * simply unprotect the page and resume the guest.
	 */
	if (vcpu->arch.mmu->direct_map &&
	    (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
		kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2));
		return 1;
	}

	/*
	 * The MMU asked for emulation, but it may still be possible to just
	 * unprotect the page and re-execute the faulting instruction.  Do
	 * not allow retrying MMIO emulation (the processor would keep
	 * faulting on the non-existent MMIO address) or faults from a nested
	 * guest (only L1's page tables are shadowed here, so unprotecting on
	 * behalf of L2 cannot help).
	 */
	if (!mmio_info_in_cache(vcpu, cr2, direct) && !is_guest_mode(vcpu))
		emulation_type = EMULTYPE_ALLOW_RETRY;
emulate:
	/*
	 * insn_len can be zero on #NPF when the hardware table walker cannot
	 * read the instruction page (e.g. the page is not present).  In that
	 * case simply restart the guest, unless the vendor code indicates
	 * that emulation is still required.
	 */
	if (unlikely(insn && !insn_len)) {
		if (!kvm_x86_ops->need_emulation_on_page_fault(vcpu))
			return 1;
	}

	er = x86_emulate_instruction(vcpu, cr2, emulation_type, insn, insn_len);

	switch (er) {
	case EMULATE_DONE:
		return 1;
	case EMULATE_USER_EXIT:
		++vcpu->stat.mmio_exits;
		/* fall through */
	case EMULATE_FAIL:
		return 0;
	default:
		BUG();
	}
}
EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);

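/*
 * Emulate INVLPG: invalidate the gva in the current root and in all cached
 * previous roots, then flush the hardware TLB entry for it.
 */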
void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
{
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	int i;

	/* INVLPG on a non-canonical address is a NOP according to the SDM. */
	if (is_noncanonical_address(gva, vcpu))
		return;

	mmu->invlpg(vcpu, gva, mmu->root_hpa);

	/*
	 * INVLPG must invalidate any global mappings for the VA regardless
	 * of PCID, so the still-valid prev_roots are invalidated as well;
	 * determining whether a prev_root actually maps the VA as global
	 * would cost about as much as just invalidating it.  Roots that are
	 * not cached will be synced when they are switched to.
	 */
	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
		if (VALID_PAGE(mmu->prev_roots[i].hpa))
			mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);

	kvm_x86_ops->tlb_flush_gva(vcpu, gva);
	++vcpu->stat.invlpg;
}
EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);

void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
{
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	bool tlb_flush = false;
	uint i;

	if (pcid == kvm_get_active_pcid(vcpu)) {
		mmu->invlpg(vcpu, gva, mmu->root_hpa);
		tlb_flush = true;
	}

	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
		if (VALID_PAGE(mmu->prev_roots[i].hpa) &&
		    pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].cr3)) {
			mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
			tlb_flush = true;
		}
	}

	if (tlb_flush)
		kvm_x86_ops->tlb_flush_gva(vcpu, gva);

	++vcpu->stat.invlpg;

	/*
	 * Roots that are neither current nor cached in prev_roots will be
	 * synced when switching to them, so nothing needs to be done here.
	 */
}
EXPORT_SYMBOL_GPL(kvm_mmu_invpcid_gva);

void kvm_enable_tdp(void)
{
	tdp_enabled = true;
}
EXPORT_SYMBOL_GPL(kvm_enable_tdp);

void kvm_disable_tdp(void)
{
	tdp_enabled = false;
}
EXPORT_SYMBOL_GPL(kvm_disable_tdp);


/* The return value indicates whether a TLB flush on all vCPUs is needed. */
typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head);

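/*
 * Walk the rmaps of a memslot over a range of gfns and page table levels,
 * applying @fn to each rmap head.  The caller must hold mmu_lock; the lock
 * is dropped and reacquired as needed to allow rescheduling.
 */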
static __always_inline bool
slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
			slot_level_handler fn, int start_level, int end_level,
			gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb)
{
	struct slot_rmap_walk_iterator iterator;
	bool flush = false;

	for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
			end_gfn, &iterator) {
		if (iterator.rmap)
			flush |= fn(kvm, iterator.rmap);

		if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
			if (flush && lock_flush_tlb) {
				kvm_flush_remote_tlbs_with_address(kvm,
						start_gfn,
						iterator.gfn - start_gfn + 1);
				flush = false;
			}
			cond_resched_lock(&kvm->mmu_lock);
		}
	}

	if (flush && lock_flush_tlb) {
		kvm_flush_remote_tlbs_with_address(kvm, start_gfn,
						   end_gfn - start_gfn + 1);
		flush = false;
	}

	return flush;
}

static __always_inline bool
slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
		  slot_level_handler fn, int start_level, int end_level,
		  bool lock_flush_tlb)
{
	return slot_handle_level_range(kvm, memslot, fn, start_level,
			end_level, memslot->base_gfn,
			memslot->base_gfn + memslot->npages - 1,
			lock_flush_tlb);
}

static __always_inline bool
slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
		      slot_level_handler fn, bool lock_flush_tlb)
{
	return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
				 PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
}

static __always_inline bool
slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
			slot_level_handler fn, bool lock_flush_tlb)
{
	return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL + 1,
				 PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
}

static __always_inline bool
slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot,
		 slot_level_handler fn, bool lock_flush_tlb)
{
	return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
				 PT_PAGE_TABLE_LEVEL, lock_flush_tlb);
}

static void free_mmu_pages(struct kvm_vcpu *vcpu)
{
	free_page((unsigned long)vcpu->arch.mmu->pae_root);
	free_page((unsigned long)vcpu->arch.mmu->lm_root);
}

static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
{
	struct page *page;
	int i;

	/*
	 * With PAE paging the four PDPTEs act as root pages and, because a
	 * 32-bit CR3 must be able to address them, the table has to live
	 * below 4GB (hence __GFP_DMA32).  When TDP is used with a root level
	 * above PT32E_ROOT_LEVEL the PAE root is never needed, so the
	 * allocation is skipped.
	 */
	if (tdp_enabled && kvm_x86_ops->get_tdp_level(vcpu) > PT32E_ROOT_LEVEL)
		return 0;

	page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32);
	if (!page)
		return -ENOMEM;

	vcpu->arch.mmu->pae_root = page_address(page);
	for (i = 0; i < 4; ++i)
		vcpu->arch.mmu->pae_root[i] = INVALID_PAGE;

	return 0;
}

int kvm_mmu_create(struct kvm_vcpu *vcpu)
{
	uint i;

	vcpu->arch.mmu = &vcpu->arch.root_mmu;
	vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;

	vcpu->arch.root_mmu.root_hpa = INVALID_PAGE;
	vcpu->arch.root_mmu.root_cr3 = 0;
	vcpu->arch.root_mmu.translate_gpa = translate_gpa;
	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
		vcpu->arch.root_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;

	vcpu->arch.guest_mmu.root_hpa = INVALID_PAGE;
	vcpu->arch.guest_mmu.root_cr3 = 0;
	vcpu->arch.guest_mmu.translate_gpa = translate_gpa;
	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
		vcpu->arch.guest_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;

	vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
	return alloc_mmu_pages(vcpu);
}

static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
			struct kvm_memory_slot *slot,
			struct kvm_page_track_notifier_node *node)
{
	struct kvm_mmu_page *sp;
	LIST_HEAD(invalid_list);
	unsigned long i;
	bool flush;
	gfn_t gfn;

	spin_lock(&kvm->mmu_lock);

	if (list_empty(&kvm->arch.active_mmu_pages))
		goto out_unlock;

	flush = slot_handle_all_level(kvm, slot, kvm_zap_rmapp, false);

	for (i = 0; i < slot->npages; i++) {
		gfn = slot->base_gfn + i;

		for_each_valid_sp(kvm, sp, gfn) {
			if (sp->gfn != gfn)
				continue;

			kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
		}
		if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
			kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
			flush = false;
			cond_resched_lock(&kvm->mmu_lock);
		}
	}
	kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);

out_unlock:
	spin_unlock(&kvm->mmu_lock);
}

void kvm_mmu_init_vm(struct kvm *kvm)
{
	struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;

	node->track_write = kvm_mmu_pte_write;
	node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
	kvm_page_track_register_notifier(kvm, node);
}

void kvm_mmu_uninit_vm(struct kvm *kvm)
{
	struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;

	kvm_page_track_unregister_notifier(kvm, node);
}

void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	int i;

	spin_lock(&kvm->mmu_lock);
	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
		slots = __kvm_memslots(kvm, i);
		kvm_for_each_memslot(memslot, slots) {
			gfn_t start, end;

			start = max(gfn_start, memslot->base_gfn);
			end = min(gfn_end, memslot->base_gfn + memslot->npages);
			if (start >= end)
				continue;

			slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
						PT_PAGE_TABLE_LEVEL, PT_MAX_HUGEPAGE_LEVEL,
						start, end - 1, true);
		}
	}

	spin_unlock(&kvm->mmu_lock);
}

static bool slot_rmap_write_protect(struct kvm *kvm,
				    struct kvm_rmap_head *rmap_head)
{
	return __rmap_write_protect(kvm, rmap_head, false);
}

void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
				      struct kvm_memory_slot *memslot)
{
	bool flush;

	spin_lock(&kvm->mmu_lock);
	flush = slot_handle_all_level(kvm, memslot, slot_rmap_write_protect,
				      false);
	spin_unlock(&kvm->mmu_lock);

	/*
	 * The TLB flush below is done outside of mmu_lock; callers that also
	 * flush outside the lock (e.g. dirty log retrieval) must be
	 * serialized by slots_lock, otherwise a flush could be missed.
	 */
	lockdep_assert_held(&kvm->slots_lock);

	/*
	 * Flushing outside mmu_lock is safe here because sptes only went
	 * from writable to read-only: a present spte stayed present, and any
	 * spte that was zapped already triggered an immediate flush.
	 */
	if (flush)
		kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
			memslot->npages);
}

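/*
 * After dirty logging is disabled for a slot, mappings that were forced
 * down to 4K can be collapsed back into huge pages: drop the small sptes
 * that cover a transparent huge page so the next fault maps it large again.
 */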
static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
					 struct kvm_rmap_head *rmap_head)
{
	u64 *sptep;
	struct rmap_iterator iter;
	int need_tlb_flush = 0;
	kvm_pfn_t pfn;
	struct kvm_mmu_page *sp;

restart:
	for_each_rmap_spte(rmap_head, &iter, sptep) {
		sp = page_header(__pa(sptep));
		pfn = spte_to_pfn(*sptep);

		/*
		 * Only direct (TDP) last-level mappings are dropped here;
		 * indirect shadow pages are kept in sync with the guest page
		 * tables and cannot simply be re-created as huge mappings.
		 * The spte is zapped when the backing page is part of a
		 * mapped transparent huge page.
		 */
		if (sp->role.direct &&
		    !kvm_is_reserved_pfn(pfn) &&
		    PageTransCompoundMap(pfn_to_page(pfn))) {
			pte_list_remove(rmap_head, sptep);

			if (kvm_available_flush_tlb_with_range())
				kvm_flush_remote_tlbs_with_address(kvm, sp->gfn,
					KVM_PAGES_PER_HPAGE(sp->role.level));
			else
				need_tlb_flush = 1;

			goto restart;
		}
	}

	return need_tlb_flush;
}

void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
				   const struct kvm_memory_slot *memslot)
{
	/* FIXME: const-ify all uses of struct kvm_memory_slot. */
	spin_lock(&kvm->mmu_lock);
	slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot,
			 kvm_mmu_zap_collapsible_spte, true);
	spin_unlock(&kvm->mmu_lock);
}

void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
				   struct kvm_memory_slot *memslot)
{
	bool flush;

	spin_lock(&kvm->mmu_lock);
	flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false);
	spin_unlock(&kvm->mmu_lock);

	/* Flushing outside mmu_lock is serialized by slots_lock. */
	lockdep_assert_held(&kvm->slots_lock);

	/* Flush if any spte was modified while the lock was held. */
	if (flush)
		kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
				memslot->npages);
}
EXPORT_SYMBOL_GPL(kvm_mmu_slot_leaf_clear_dirty);

void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
					struct kvm_memory_slot *memslot)
{
	bool flush;

	spin_lock(&kvm->mmu_lock);
	flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect,
					false);
	spin_unlock(&kvm->mmu_lock);

	/* See the comments in kvm_mmu_slot_remove_write_access(). */
	lockdep_assert_held(&kvm->slots_lock);

	if (flush)
		kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
				memslot->npages);
}
EXPORT_SYMBOL_GPL(kvm_mmu_slot_largepage_remove_write_access);

void kvm_mmu_slot_set_dirty(struct kvm *kvm,
			    struct kvm_memory_slot *memslot)
{
	bool flush;

	spin_lock(&kvm->mmu_lock);
	flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false);
	spin_unlock(&kvm->mmu_lock);

	lockdep_assert_held(&kvm->slots_lock);

	/* See the comments in kvm_mmu_slot_leaf_clear_dirty(). */
	if (flush)
		kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
				memslot->npages);
}
EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);

static void __kvm_mmu_zap_all(struct kvm *kvm, bool mmio_only)
{
	struct kvm_mmu_page *sp, *node;
	LIST_HEAD(invalid_list);
	int ign;

	spin_lock(&kvm->mmu_lock);
restart:
	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
		if (mmio_only && !sp->mmio_cached)
			continue;
		if (sp->role.invalid && sp->root_count)
			continue;
		if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign)) {
			WARN_ON_ONCE(mmio_only);
			goto restart;
		}
		if (cond_resched_lock(&kvm->mmu_lock))
			goto restart;
	}

	kvm_mmu_commit_zap_page(kvm, &invalid_list);
	spin_unlock(&kvm->mmu_lock);
}

void kvm_mmu_zap_all(struct kvm *kvm)
{
	return __kvm_mmu_zap_all(kvm, false);
}

void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
{
	WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);

	gen &= MMIO_SPTE_GEN_MASK;

	/*
	 * The memslot generation is incremented in multiples of the number
	 * of address spaces; strip that modifier so that a wrap in any
	 * address space is detected.
	 */
	gen &= ~((u64)KVM_ADDRESS_SPACE_NUM - 1);

	/*
	 * If the generation has wrapped (a very rare event), zap the shadow
	 * pages that may contain MMIO sptes so that stale entries cannot be
	 * reused.
	 */
	if (unlikely(gen == 0)) {
		kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n");
		__kvm_mmu_zap_all(kvm, true);
	}
}

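/*
 * Memory shrinker callback: under host memory pressure, pick one VM that
 * actually has shadow pages and zap its oldest shadow page to release
 * memory back to the kernel.
 */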
static unsigned long
mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
{
	struct kvm *kvm;
	int nr_to_scan = sc->nr_to_scan;
	unsigned long freed = 0;

	spin_lock(&kvm_lock);

	list_for_each_entry(kvm, &vm_list, vm_list) {
		int idx;
		LIST_HEAD(invalid_list);

		/* Never scan more than sc->nr_to_scan VM instances. */
		if (!nr_to_scan--)
			break;

		/*
		 * n_used_mmu_pages is read without holding kvm->mmu_lock;
		 * a VM may be skipped erroneously, but shrinking a VM that
		 * is only starting to populate its MMU is not worthwhile
		 * anyway.
		 */
		if (!kvm->arch.n_used_mmu_pages)
			continue;

		idx = srcu_read_lock(&kvm->srcu);
		spin_lock(&kvm->mmu_lock);

		if (prepare_zap_oldest_mmu_page(kvm, &invalid_list))
			freed++;
		kvm_mmu_commit_zap_page(kvm, &invalid_list);

		spin_unlock(&kvm->mmu_lock);
		srcu_read_unlock(&kvm->srcu, idx);

		/*
		 * Rotate the VM to the tail of vm_list so that subsequent
		 * shrink passes do not always penalize the same VM.
		 */
		list_move_tail(&kvm->vm_list, &vm_list);
		break;
	}

	spin_unlock(&kvm_lock);
	return freed;
}

static unsigned long
mmu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
{
	return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
}

static struct shrinker mmu_shrinker = {
	.count_objects = mmu_shrink_count,
	.scan_objects = mmu_shrink_scan,
	.seeks = DEFAULT_SEEKS * 10,
};

static void mmu_destroy_caches(void)
{
	kmem_cache_destroy(pte_list_desc_cache);
	kmem_cache_destroy(mmu_page_header_cache);
}

int kvm_mmu_module_init(void)
{
	int ret = -ENOMEM;

	/*
	 * The kvm_mmu_role unions rely on the base and extended roles
	 * packing into exactly 32 bits each; catch any layout change at
	 * build time.
	 */
	BUILD_BUG_ON(sizeof(union kvm_mmu_page_role) != sizeof(u32));
	BUILD_BUG_ON(sizeof(union kvm_mmu_extended_role) != sizeof(u32));
	BUILD_BUG_ON(sizeof(union kvm_mmu_role) != sizeof(u64));

	kvm_mmu_reset_all_pte_masks();

	pte_list_desc_cache = kmem_cache_create("pte_list_desc",
						sizeof(struct pte_list_desc),
						0, SLAB_ACCOUNT, NULL);
	if (!pte_list_desc_cache)
		goto out;

	mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
						  sizeof(struct kvm_mmu_page),
						  0, SLAB_ACCOUNT, NULL);
	if (!mmu_page_header_cache)
		goto out;

	if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL))
		goto out;

	ret = register_shrinker(&mmu_shrinker);
	if (ret)
		goto out;

	return 0;

out:
	mmu_destroy_caches();
	return ret;
}

/*
 * Calculate the default number of MMU pages to allocate for this VM, based
 * on the total number of guest pages across all memslots.
 */
unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm)
{
	unsigned long nr_mmu_pages;
	unsigned long nr_pages = 0;
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	int i;

	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
		slots = __kvm_memslots(kvm, i);

		kvm_for_each_memslot(memslot, slots)
			nr_pages += memslot->npages;
	}

	nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
	nr_mmu_pages = max(nr_mmu_pages, KVM_MIN_ALLOC_MMU_PAGES);

	return nr_mmu_pages;
}

void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
{
	kvm_mmu_unload(vcpu);
	free_mmu_pages(vcpu);
	mmu_free_memory_caches(vcpu);
}

void kvm_mmu_module_exit(void)
{
	mmu_destroy_caches();
	percpu_counter_destroy(&kvm_total_used_mmu_pages);
	unregister_shrinker(&mmu_shrinker);
	mmu_audit_disable();
}