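/*
 * Kernel-based Virtual Machine (KVM) - MMU support.
 *
 * This file implements the x86 MMU used by KVM: SPTE management, reverse
 * maps (rmaps), MMIO SPTEs, access/dirty tracking and shadow page handling.
 */
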
#include "irq.h"
#include "mmu.h"
#include "x86.h"
#include "kvm_cache_regs.h"
#include "cpuid.h"

#include <linux/kvm_host.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/moduleparam.h>
#include <linux/export.h>
#include <linux/swap.h>
#include <linux/hugetlb.h>
#include <linux/compiler.h>
#include <linux/srcu.h>
#include <linux/slab.h>
#include <linux/sched/signal.h>
#include <linux/uaccess.h>
#include <linux/hash.h>
#include <linux/kern_levels.h>

#include <asm/page.h>
#include <asm/pat.h>
#include <asm/cmpxchg.h>
#include <asm/e820/api.h>
#include <asm/io.h>
#include <asm/vmx.h>
#include <asm/kvm_page_track.h>
#include "trace.h"
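
/*
 * When tdp_enabled is set, the hardware walks both the guest page tables and
 * the nested (two-dimensional) paging structures itself, so KVM does not
 * need to shadow the guest page tables.
 */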
bool tdp_enabled = false;

enum {
	AUDIT_PRE_PAGE_FAULT,
	AUDIT_POST_PAGE_FAULT,
	AUDIT_PRE_PTE_WRITE,
	AUDIT_POST_PTE_WRITE,
	AUDIT_PRE_SYNC,
	AUDIT_POST_SYNC
};

#undef MMU_DEBUG

#ifdef MMU_DEBUG
static bool dbg = 0;
module_param(dbg, bool, 0644);

#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
#define MMU_WARN_ON(x) WARN_ON(x)
#else
#define pgprintk(x...) do { } while (0)
#define rmap_printk(x...) do { } while (0)
#define MMU_WARN_ON(x) do { } while (0)
#endif

#define PTE_PREFETCH_NUM 8

#define PT_FIRST_AVAIL_BITS_SHIFT 10
#define PT64_SECOND_AVAIL_BITS_SHIFT 52

#define PT64_LEVEL_BITS 9

#define PT64_LEVEL_SHIFT(level) \
		(PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)

#define PT64_INDEX(address, level)\
	(((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))

#define PT32_LEVEL_BITS 10

#define PT32_LEVEL_SHIFT(level) \
		(PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)

#define PT32_LVL_OFFSET_MASK(level) \
	(PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
						* PT32_LEVEL_BITS))) - 1))

#define PT32_INDEX(address, level)\
	(((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))

#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
#define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1))
#else
#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
#endif
#define PT64_LVL_ADDR_MASK(level) \
	(PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
						* PT64_LEVEL_BITS))) - 1))
#define PT64_LVL_OFFSET_MASK(level) \
	(PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
						* PT64_LEVEL_BITS))) - 1))

#define PT32_BASE_ADDR_MASK PAGE_MASK
#define PT32_DIR_BASE_ADDR_MASK \
	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
#define PT32_LVL_ADDR_MASK(level) \
	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
					    * PT32_LEVEL_BITS))) - 1))

#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \
			| shadow_x_mask | shadow_nx_mask | shadow_me_mask)

#define ACC_EXEC_MASK 1
#define ACC_WRITE_MASK PT_WRITABLE_MASK
#define ACC_USER_MASK PT_USER_MASK
#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)

#define PT64_EPT_READABLE_MASK 0x1ull
#define PT64_EPT_EXECUTABLE_MASK 0x4ull

#include <trace/events/kvm.h>

#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
#define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))

#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)

#define PTE_LIST_EXT 3
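
/*
 * Page fault handling results:
 * RET_PF_RETRY:   let the vCPU fault again on the address.
 * RET_PF_EMULATE: emulate the faulting instruction instead of mapping it.
 * RET_PF_INVALID: the SPTE is invalid, the slow page fault path must
 *                 recompute it.
 */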
enum {
	RET_PF_RETRY = 0,
	RET_PF_EMULATE = 1,
	RET_PF_INVALID = 2,
};

struct pte_list_desc {
	u64 *sptes[PTE_LIST_EXT];
	struct pte_list_desc *more;
};

struct kvm_shadow_walk_iterator {
	u64 addr;
	hpa_t shadow_addr;
	u64 *sptep;
	int level;
	unsigned index;
};

static const union kvm_mmu_page_role mmu_base_role_mask = {
	.cr0_wp = 1,
	.gpte_is_8_bytes = 1,
	.nxe = 1,
	.smep_andnot_wp = 1,
	.smap_andnot_wp = 1,
	.smm = 1,
	.guest_mode = 1,
	.ad_disabled = 1,
};

#define for_each_shadow_entry_using_root(_vcpu, _root, _addr, _walker)	\
	for (shadow_walk_init_using_root(&(_walker), (_vcpu),		\
					 (_root), (_addr));		\
	     shadow_walk_okay(&(_walker));				\
	     shadow_walk_next(&(_walker)))

#define for_each_shadow_entry(_vcpu, _addr, _walker)		\
	for (shadow_walk_init(&(_walker), _vcpu, _addr);	\
	     shadow_walk_okay(&(_walker));			\
	     shadow_walk_next(&(_walker)))

#define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte)	\
	for (shadow_walk_init(&(_walker), _vcpu, _addr);		\
	     shadow_walk_okay(&(_walker)) &&				\
		({ spte = mmu_spte_get_lockless(_walker.sptep); 1; });	\
	     __shadow_walk_next(&(_walker), spte))

static struct kmem_cache *pte_list_desc_cache;
static struct kmem_cache *mmu_page_header_cache;
static struct percpu_counter kvm_total_used_mmu_pages;

static u64 __read_mostly shadow_nx_mask;
static u64 __read_mostly shadow_x_mask;
static u64 __read_mostly shadow_user_mask;
static u64 __read_mostly shadow_accessed_mask;
static u64 __read_mostly shadow_dirty_mask;
static u64 __read_mostly shadow_mmio_mask;
static u64 __read_mostly shadow_mmio_value;
static u64 __read_mostly shadow_present_mask;
static u64 __read_mostly shadow_me_mask;
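
/*
 * SPTEs for MMUs without A/D bits are marked with SPTE_SPECIAL_MASK
 * (shadow_acc_track_value).  Non-present SPTEs carrying that value are
 * "access tracked": their saved R/X bits are stashed in high SPTE bits
 * (shadow_acc_track_saved_bits_shift) and restored when the page is
 * accessed again.
 */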
static u64 __read_mostly shadow_acc_track_mask;
static const u64 shadow_acc_track_value = SPTE_SPECIAL_MASK;

static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK |
						    PT64_EPT_EXECUTABLE_MASK;
static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT;
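
/*
 * Mask of high physical-address bits that is set in non-present and MMIO
 * SPTEs to mitigate L1TF; shadow_nonpresent_or_rsvd_mask_len is the number
 * of GFN bits repurposed for it.  See the X86_BUG_L1TF handling in
 * kvm_mmu_reset_all_pte_masks().
 */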
static u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
static const u64 shadow_nonpresent_or_rsvd_mask_len = 5;

static u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;

static u8 __read_mostly shadow_phys_bits;

static void mmu_spte_set(u64 *sptep, u64 spte);
static bool is_executable_pte(u64 spte);
static union kvm_mmu_page_role
kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu);

#define CREATE_TRACE_POINTS
#include "mmutrace.h"

static inline bool kvm_available_flush_tlb_with_range(void)
{
	return kvm_x86_ops->tlb_remote_flush_with_range;
}

static void kvm_flush_remote_tlbs_with_range(struct kvm *kvm,
		struct kvm_tlb_range *range)
{
	int ret = -ENOTSUPP;

	if (range && kvm_x86_ops->tlb_remote_flush_with_range)
		ret = kvm_x86_ops->tlb_remote_flush_with_range(kvm, range);

	if (ret)
		kvm_flush_remote_tlbs(kvm);
}

static void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
		u64 start_gfn, u64 pages)
{
	struct kvm_tlb_range range;

	range.start_gfn = start_gfn;
	range.pages = pages;

	kvm_flush_remote_tlbs_with_range(kvm, &range);
}

void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value)
{
	BUG_ON((mmio_mask & mmio_value) != mmio_value);
	shadow_mmio_value = mmio_value | SPTE_SPECIAL_MASK;
	shadow_mmio_mask = mmio_mask | SPTE_SPECIAL_MASK;
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);

static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
{
	return sp->role.ad_disabled;
}

static inline bool spte_ad_enabled(u64 spte)
{
	MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value);
	return !(spte & shadow_acc_track_value);
}

static inline u64 spte_shadow_accessed_mask(u64 spte)
{
	MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value);
	return spte_ad_enabled(spte) ? shadow_accessed_mask : 0;
}

static inline u64 spte_shadow_dirty_mask(u64 spte)
{
	MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value);
	return spte_ad_enabled(spte) ? shadow_dirty_mask : 0;
}

static inline bool is_access_track_spte(u64 spte)
{
	return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0;
}
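
/*
 * The MMIO generation is a 19-bit subset of the memslots generation, stored
 * in otherwise unused SPTE bits (3-11 and 52-61) so that stale MMIO SPTEs
 * can be detected after memslot updates without walking every SPTE.
 */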
#define MMIO_SPTE_GEN_MASK		GENMASK_ULL(18, 0)

#define MMIO_SPTE_GEN_LOW_START		3
#define MMIO_SPTE_GEN_LOW_END		11
#define MMIO_SPTE_GEN_LOW_MASK		GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \
						    MMIO_SPTE_GEN_LOW_START)

#define MMIO_SPTE_GEN_HIGH_START	52
#define MMIO_SPTE_GEN_HIGH_END		61
#define MMIO_SPTE_GEN_HIGH_MASK		GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \
						    MMIO_SPTE_GEN_HIGH_START)
static u64 generation_mmio_spte_mask(u64 gen)
{
	u64 mask;

	WARN_ON(gen & ~MMIO_SPTE_GEN_MASK);

	mask = (gen << MMIO_SPTE_GEN_LOW_START) & MMIO_SPTE_GEN_LOW_MASK;
	mask |= (gen << MMIO_SPTE_GEN_HIGH_START) & MMIO_SPTE_GEN_HIGH_MASK;
	return mask;
}

static u64 get_mmio_spte_generation(u64 spte)
{
	u64 gen;

	spte &= ~shadow_mmio_mask;

	gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_START;
	gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_START;
	return gen;
}

static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
			   unsigned access)
{
	u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK;
	u64 mask = generation_mmio_spte_mask(gen);
	u64 gpa = gfn << PAGE_SHIFT;

	access &= ACC_WRITE_MASK | ACC_USER_MASK;
	mask |= shadow_mmio_value | access;
	mask |= gpa | shadow_nonpresent_or_rsvd_mask;
	mask |= (gpa & shadow_nonpresent_or_rsvd_mask)
		<< shadow_nonpresent_or_rsvd_mask_len;

	page_header(__pa(sptep))->mmio_cached = true;

	trace_mark_mmio_spte(sptep, gfn, access, gen);
	mmu_spte_set(sptep, mask);
}

static bool is_mmio_spte(u64 spte)
{
	return (spte & shadow_mmio_mask) == shadow_mmio_value;
}

static gfn_t get_mmio_spte_gfn(u64 spte)
{
	u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask;

	gpa |= (spte >> shadow_nonpresent_or_rsvd_mask_len)
	       & shadow_nonpresent_or_rsvd_mask;

	return gpa >> PAGE_SHIFT;
}

static unsigned get_mmio_spte_access(u64 spte)
{
	u64 mask = generation_mmio_spte_mask(MMIO_SPTE_GEN_MASK) | shadow_mmio_mask;
	return (spte & ~mask) & ~PAGE_MASK;
}

static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
			  kvm_pfn_t pfn, unsigned access)
{
	if (unlikely(is_noslot_pfn(pfn))) {
		mark_mmio_spte(vcpu, sptep, gfn, access);
		return true;
	}

	return false;
}

static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
{
	u64 kvm_gen, spte_gen, gen;

	gen = kvm_vcpu_memslots(vcpu)->generation;
	if (unlikely(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS))
		return false;

	kvm_gen = gen & MMIO_SPTE_GEN_MASK;
	spte_gen = get_mmio_spte_generation(spte);

	trace_check_mmio_spte(spte, kvm_gen, spte_gen);
	return likely(kvm_gen == spte_gen);
}

void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
		u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
		u64 acc_track_mask, u64 me_mask)
{
	BUG_ON(!dirty_mask != !accessed_mask);
	BUG_ON(!accessed_mask && !acc_track_mask);
	BUG_ON(acc_track_mask & shadow_acc_track_value);

	shadow_user_mask = user_mask;
	shadow_accessed_mask = accessed_mask;
	shadow_dirty_mask = dirty_mask;
	shadow_nx_mask = nx_mask;
	shadow_x_mask = x_mask;
	shadow_present_mask = p_mask;
	shadow_acc_track_mask = acc_track_mask;
	shadow_me_mask = me_mask;
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);

static u8 kvm_get_shadow_phys_bits(void)
{
	if (!boot_cpu_has(X86_FEATURE_TME) ||
	    WARN_ON_ONCE(boot_cpu_data.extended_cpuid_level < 0x80000008))
		return boot_cpu_data.x86_phys_bits;

	return cpuid_eax(0x80000008) & 0xff;
}

static void kvm_mmu_reset_all_pte_masks(void)
{
	u8 low_phys_bits;

	shadow_user_mask = 0;
	shadow_accessed_mask = 0;
	shadow_dirty_mask = 0;
	shadow_nx_mask = 0;
	shadow_x_mask = 0;
	shadow_mmio_mask = 0;
	shadow_present_mask = 0;
	shadow_acc_track_mask = 0;

	shadow_phys_bits = kvm_get_shadow_phys_bits();

	shadow_nonpresent_or_rsvd_mask = 0;
	low_phys_bits = boot_cpu_data.x86_cache_bits;
	if (boot_cpu_data.x86_cache_bits <
	    52 - shadow_nonpresent_or_rsvd_mask_len) {
		shadow_nonpresent_or_rsvd_mask =
			rsvd_bits(boot_cpu_data.x86_cache_bits -
				  shadow_nonpresent_or_rsvd_mask_len,
				  boot_cpu_data.x86_cache_bits - 1);
		low_phys_bits -= shadow_nonpresent_or_rsvd_mask_len;
	} else
		WARN_ON_ONCE(boot_cpu_has_bug(X86_BUG_L1TF));

	shadow_nonpresent_or_rsvd_lower_gfn_mask =
		GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT);
}

static int is_cpuid_PSE36(void)
{
	return 1;
}

static int is_nx(struct kvm_vcpu *vcpu)
{
	return vcpu->arch.efer & EFER_NX;
}

static int is_shadow_present_pte(u64 pte)
{
	return (pte != 0) && !is_mmio_spte(pte);
}

static int is_large_pte(u64 pte)
{
	return pte & PT_PAGE_SIZE_MASK;
}

static int is_last_spte(u64 pte, int level)
{
	if (level == PT_PAGE_TABLE_LEVEL)
		return 1;
	if (is_large_pte(pte))
		return 1;
	return 0;
}

static bool is_executable_pte(u64 spte)
{
	return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask;
}

static kvm_pfn_t spte_to_pfn(u64 pte)
{
	return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
}

static gfn_t pse36_gfn_delta(u32 gpte)
{
	int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;

	return (gpte & PT32_DIR_PSE36_MASK) << shift;
}

#ifdef CONFIG_X86_64
static void __set_spte(u64 *sptep, u64 spte)
{
	WRITE_ONCE(*sptep, spte);
}

static void __update_clear_spte_fast(u64 *sptep, u64 spte)
{
	WRITE_ONCE(*sptep, spte);
}

static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
{
	return xchg(sptep, spte);
}

static u64 __get_spte_lockless(u64 *sptep)
{
	return READ_ONCE(*sptep);
}
#else
union split_spte {
	struct {
		u32 spte_low;
		u32 spte_high;
	};
	u64 spte;
};

static void count_spte_clear(u64 *sptep, u64 spte)
{
	struct kvm_mmu_page *sp = page_header(__pa(sptep));

	if (is_shadow_present_pte(spte))
		return;

	smp_wmb();
	sp->clear_spte_count++;
}

static void __set_spte(u64 *sptep, u64 spte)
{
	union split_spte *ssptep, sspte;

	ssptep = (union split_spte *)sptep;
	sspte = (union split_spte)spte;

	ssptep->spte_high = sspte.spte_high;

	smp_wmb();

	WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
}

static void __update_clear_spte_fast(u64 *sptep, u64 spte)
{
	union split_spte *ssptep, sspte;

	ssptep = (union split_spte *)sptep;
	sspte = (union split_spte)spte;

	WRITE_ONCE(ssptep->spte_low, sspte.spte_low);

	smp_wmb();

	ssptep->spte_high = sspte.spte_high;
	count_spte_clear(sptep, spte);
}

static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
{
	union split_spte *ssptep, sspte, orig;

	ssptep = (union split_spte *)sptep;
	sspte = (union split_spte)spte;

	orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
	orig.spte_high = ssptep->spte_high;
	ssptep->spte_high = sspte.spte_high;
	count_spte_clear(sptep, spte);

	return orig.spte;
}

static u64 __get_spte_lockless(u64 *sptep)
{
	struct kvm_mmu_page *sp = page_header(__pa(sptep));
	union split_spte spte, *orig = (union split_spte *)sptep;
	int count;

retry:
	count = sp->clear_spte_count;
	smp_rmb();

	spte.spte_low = orig->spte_low;
	smp_rmb();

	spte.spte_high = orig->spte_high;
	smp_rmb();

	if (unlikely(spte.spte_low != orig->spte_low ||
		     count != sp->clear_spte_count))
		goto retry;

	return spte.spte;
}
#endif

static bool spte_can_locklessly_be_made_writable(u64 spte)
{
	return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) ==
		(SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE);
}

static bool spte_has_volatile_bits(u64 spte)
{
	if (!is_shadow_present_pte(spte))
		return false;

	if (spte_can_locklessly_be_made_writable(spte) ||
	    is_access_track_spte(spte))
		return true;

	if (spte_ad_enabled(spte)) {
		if ((spte & shadow_accessed_mask) == 0 ||
		    (is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0))
			return true;
	}

	return false;
}

static bool is_accessed_spte(u64 spte)
{
	u64 accessed_mask = spte_shadow_accessed_mask(spte);

	return accessed_mask ? spte & accessed_mask
			     : !is_access_track_spte(spte);
}

static bool is_dirty_spte(u64 spte)
{
	u64 dirty_mask = spte_shadow_dirty_mask(spte);

	return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK;
}
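
/*
 * Rules for using mmu_spte_set:
 * Set the sptep from non-present to present; the WARN_ON enforces that the
 * old SPTE is not present.
 */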
static void mmu_spte_set(u64 *sptep, u64 new_spte)
{
	WARN_ON(is_shadow_present_pte(*sptep));
	__set_spte(sptep, new_spte);
}

static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
{
	u64 old_spte = *sptep;

	WARN_ON(!is_shadow_present_pte(new_spte));

	if (!is_shadow_present_pte(old_spte)) {
		mmu_spte_set(sptep, new_spte);
		return old_spte;
	}

	if (!spte_has_volatile_bits(old_spte))
		__update_clear_spte_fast(sptep, new_spte);
	else
		old_spte = __update_clear_spte_slow(sptep, new_spte);

	WARN_ON(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));

	return old_spte;
}
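
/*
 * Rules for using mmu_spte_update:
 * Update state bits of an existing SPTE without changing the pfn it maps.
 *
 * Returns true if the TLB needs to be flushed, i.e. if a writable SPTE was
 * made read-only or if Accessed/Dirty information was removed.
 */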
static bool mmu_spte_update(u64 *sptep, u64 new_spte)
{
	bool flush = false;
	u64 old_spte = mmu_spte_update_no_track(sptep, new_spte);

	if (!is_shadow_present_pte(old_spte))
		return false;

	if (spte_can_locklessly_be_made_writable(old_spte) &&
	    !is_writable_pte(new_spte))
		flush = true;

	if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) {
		flush = true;
		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
	}

	if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) {
		flush = true;
		kvm_set_pfn_dirty(spte_to_pfn(old_spte));
	}

	return flush;
}
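
/*
 * Rules for using mmu_spte_clear_track_bits:
 * Clear the SPTE and propagate its Accessed/Dirty state to the backing page.
 * Returns 1 if the SPTE was present, 0 otherwise.
 */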
static int mmu_spte_clear_track_bits(u64 *sptep)
{
	kvm_pfn_t pfn;
	u64 old_spte = *sptep;

	if (!spte_has_volatile_bits(old_spte))
		__update_clear_spte_fast(sptep, 0ull);
	else
		old_spte = __update_clear_spte_slow(sptep, 0ull);

	if (!is_shadow_present_pte(old_spte))
		return 0;

	pfn = spte_to_pfn(old_spte);

	WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn)));

	if (is_accessed_spte(old_spte))
		kvm_set_pfn_accessed(pfn);

	if (is_dirty_spte(old_spte))
		kvm_set_pfn_dirty(pfn);

	return 1;
}

static void mmu_spte_clear_no_track(u64 *sptep)
{
	__update_clear_spte_fast(sptep, 0ull);
}

static u64 mmu_spte_get_lockless(u64 *sptep)
{
	return __get_spte_lockless(sptep);
}

static u64 mark_spte_for_access_track(u64 spte)
{
	if (spte_ad_enabled(spte))
		return spte & ~shadow_accessed_mask;

	if (is_access_track_spte(spte))
		return spte;

	WARN_ONCE((spte & PT_WRITABLE_MASK) &&
		  !spte_can_locklessly_be_made_writable(spte),
		  "kvm: Writable SPTE is not locklessly dirty-trackable\n");

	WARN_ONCE(spte & (shadow_acc_track_saved_bits_mask <<
			  shadow_acc_track_saved_bits_shift),
		  "kvm: Access Tracking saved bit locations are not zero\n");

	spte |= (spte & shadow_acc_track_saved_bits_mask) <<
		shadow_acc_track_saved_bits_shift;
	spte &= ~shadow_acc_track_mask;

	return spte;
}

static u64 restore_acc_track_spte(u64 spte)
{
	u64 new_spte = spte;
	u64 saved_bits = (spte >> shadow_acc_track_saved_bits_shift)
			 & shadow_acc_track_saved_bits_mask;

	WARN_ON_ONCE(spte_ad_enabled(spte));
	WARN_ON_ONCE(!is_access_track_spte(spte));

	new_spte &= ~shadow_acc_track_mask;
	new_spte &= ~(shadow_acc_track_saved_bits_mask <<
		      shadow_acc_track_saved_bits_shift);
	new_spte |= saved_bits;

	return new_spte;
}
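
/* Returns the Accessed status of the SPTE and resets it at the same time. */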
static bool mmu_spte_age(u64 *sptep)
{
	u64 spte = mmu_spte_get_lockless(sptep);

	if (!is_accessed_spte(spte))
		return false;

	if (spte_ad_enabled(spte)) {
		clear_bit((ffs(shadow_accessed_mask) - 1),
			  (unsigned long *)sptep);
	} else {
		if (is_writable_pte(spte))
			kvm_set_pfn_dirty(spte_to_pfn(spte));

		spte = mark_spte_for_access_track(spte);
		mmu_spte_update_no_track(sptep, spte);
	}

	return true;
}

static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
{
	local_irq_disable();

	smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
}

static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
{
	smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
	local_irq_enable();
}

static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
				  struct kmem_cache *base_cache, int min)
{
	void *obj;

	if (cache->nobjs >= min)
		return 0;
	while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
		obj = kmem_cache_zalloc(base_cache, GFP_KERNEL_ACCOUNT);
		if (!obj)
			return cache->nobjs >= min ? 0 : -ENOMEM;
		cache->objects[cache->nobjs++] = obj;
	}
	return 0;
}

static int mmu_memory_cache_free_objects(struct kvm_mmu_memory_cache *cache)
{
	return cache->nobjs;
}

static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
				  struct kmem_cache *cache)
{
	while (mc->nobjs)
		kmem_cache_free(cache, mc->objects[--mc->nobjs]);
}

static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
				       int min)
{
	void *page;

	if (cache->nobjs >= min)
		return 0;
	while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
		page = (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
		if (!page)
			return cache->nobjs >= min ? 0 : -ENOMEM;
		cache->objects[cache->nobjs++] = page;
	}
	return 0;
}

static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
{
	while (mc->nobjs)
		free_page((unsigned long)mc->objects[--mc->nobjs]);
}

static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
{
	int r;

	r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
				   pte_list_desc_cache, 8 + PTE_PREFETCH_NUM);
	if (r)
		goto out;
	r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
	if (r)
		goto out;
	r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
				   mmu_page_header_cache, 4);
out:
	return r;
}

static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
{
	mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
				pte_list_desc_cache);
	mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
	mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
				mmu_page_header_cache);
}

static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
{
	void *p;

	BUG_ON(!mc->nobjs);
	p = mc->objects[--mc->nobjs];
	return p;
}

static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)
{
	return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache);
}

static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
{
	kmem_cache_free(pte_list_desc_cache, pte_list_desc);
}

static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
{
	if (!sp->role.direct)
		return sp->gfns[index];

	return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
}

static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
{
	if (!sp->role.direct) {
		sp->gfns[index] = gfn;
		return;
	}

	if (WARN_ON(gfn != kvm_mmu_page_get_gfn(sp, index)))
		pr_err_ratelimited("gfn mismatch under direct page %llx "
				   "(expected %llx, got %llx)\n",
				   sp->gfn,
				   kvm_mmu_page_get_gfn(sp, index), gfn);
}
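
/*
 * Return the pointer to the large page information for a given gfn,
 * handling slots that are not large page aligned.
 */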
static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
					      struct kvm_memory_slot *slot,
					      int level)
{
	unsigned long idx;

	idx = gfn_to_index(gfn, slot->base_gfn, level);
	return &slot->arch.lpage_info[level - 2][idx];
}

static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot,
					    gfn_t gfn, int count)
{
	struct kvm_lpage_info *linfo;
	int i;

	for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
		linfo = lpage_info_slot(gfn, slot, i);
		linfo->disallow_lpage += count;
		WARN_ON(linfo->disallow_lpage < 0);
	}
}

void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
{
	update_gfn_disallow_lpage_count(slot, gfn, 1);
}

void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
{
	update_gfn_disallow_lpage_count(slot, gfn, -1);
}

static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *slot;
	gfn_t gfn;

	kvm->arch.indirect_shadow_pages++;
	gfn = sp->gfn;
	slots = kvm_memslots_for_spte_role(kvm, sp->role);
	slot = __gfn_to_memslot(slots, gfn);

	if (sp->role.level > PT_PAGE_TABLE_LEVEL)
		return kvm_slot_page_track_add_page(kvm, slot, gfn,
						    KVM_PAGE_TRACK_WRITE);

	kvm_mmu_gfn_disallow_lpage(slot, gfn);
}

static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *slot;
	gfn_t gfn;

	kvm->arch.indirect_shadow_pages--;
	gfn = sp->gfn;
	slots = kvm_memslots_for_spte_role(kvm, sp->role);
	slot = __gfn_to_memslot(slots, gfn);
	if (sp->role.level > PT_PAGE_TABLE_LEVEL)
		return kvm_slot_page_track_remove_page(kvm, slot, gfn,
						       KVM_PAGE_TRACK_WRITE);

	kvm_mmu_gfn_allow_lpage(slot, gfn);
}

static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level,
					  struct kvm_memory_slot *slot)
{
	struct kvm_lpage_info *linfo;

	if (slot) {
		linfo = lpage_info_slot(gfn, slot, level);
		return !!linfo->disallow_lpage;
	}

	return true;
}

static bool mmu_gfn_lpage_is_disallowed(struct kvm_vcpu *vcpu, gfn_t gfn,
					int level)
{
	struct kvm_memory_slot *slot;

	slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
	return __mmu_gfn_lpage_is_disallowed(gfn, level, slot);
}

static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
{
	unsigned long page_size;
	int i, ret = 0;

	page_size = kvm_host_page_size(kvm, gfn);

	for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
		if (page_size >= KVM_HPAGE_SIZE(i))
			ret = i;
		else
			break;
	}

	return ret;
}

static inline bool memslot_valid_for_gpte(struct kvm_memory_slot *slot,
					  bool no_dirty_log)
{
	if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
		return false;
	if (no_dirty_log && slot->dirty_bitmap)
		return false;

	return true;
}

static struct kvm_memory_slot *
gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
			    bool no_dirty_log)
{
	struct kvm_memory_slot *slot;

	slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
	if (!memslot_valid_for_gpte(slot, no_dirty_log))
		slot = NULL;

	return slot;
}

static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn,
			 bool *force_pt_level)
{
	int host_level, level, max_level;
	struct kvm_memory_slot *slot;

	if (unlikely(*force_pt_level))
		return PT_PAGE_TABLE_LEVEL;

	slot = kvm_vcpu_gfn_to_memslot(vcpu, large_gfn);
	*force_pt_level = !memslot_valid_for_gpte(slot, true);
	if (unlikely(*force_pt_level))
		return PT_PAGE_TABLE_LEVEL;

	host_level = host_mapping_level(vcpu->kvm, large_gfn);

	if (host_level == PT_PAGE_TABLE_LEVEL)
		return host_level;

	max_level = min(kvm_x86_ops->get_lpage_level(), host_level);

	for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
		if (__mmu_gfn_lpage_is_disallowed(large_gfn, level, slot))
			break;

	return level - 1;
}
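
/*
 * About rmap_head encoding:
 *
 * If bit zero of rmap_head->val is clear, then it points to the only spte
 * in this rmap chain.  Otherwise, (rmap_head->val & ~1) points to a struct
 * pte_list_desc containing more mappings.
 *
 * pte_list_add() returns the number of pointers that were already in the
 * rmap chain, not counting the newly added one.
 */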
static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
			struct kvm_rmap_head *rmap_head)
{
	struct pte_list_desc *desc;
	int i, count = 0;

	if (!rmap_head->val) {
		rmap_printk("pte_list_add: %p %llx 0->1\n", spte, *spte);
		rmap_head->val = (unsigned long)spte;
	} else if (!(rmap_head->val & 1)) {
		rmap_printk("pte_list_add: %p %llx 1->many\n", spte, *spte);
		desc = mmu_alloc_pte_list_desc(vcpu);
		desc->sptes[0] = (u64 *)rmap_head->val;
		desc->sptes[1] = spte;
		rmap_head->val = (unsigned long)desc | 1;
		++count;
	} else {
		rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte);
		desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
		while (desc->sptes[PTE_LIST_EXT-1] && desc->more) {
			desc = desc->more;
			count += PTE_LIST_EXT;
		}
		if (desc->sptes[PTE_LIST_EXT-1]) {
			desc->more = mmu_alloc_pte_list_desc(vcpu);
			desc = desc->more;
		}
		for (i = 0; desc->sptes[i]; ++i)
			++count;
		desc->sptes[i] = spte;
	}
	return count;
}

static void
pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
			   struct pte_list_desc *desc, int i,
			   struct pte_list_desc *prev_desc)
{
	int j;

	for (j = PTE_LIST_EXT - 1; !desc->sptes[j] && j > i; --j)
		;
	desc->sptes[i] = desc->sptes[j];
	desc->sptes[j] = NULL;
	if (j != 0)
		return;
	if (!prev_desc && !desc->more)
		rmap_head->val = (unsigned long)desc->sptes[0];
	else
		if (prev_desc)
			prev_desc->more = desc->more;
		else
			rmap_head->val = (unsigned long)desc->more | 1;
	mmu_free_pte_list_desc(desc);
}

static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
{
	struct pte_list_desc *desc;
	struct pte_list_desc *prev_desc;
	int i;

	if (!rmap_head->val) {
		pr_err("%s: %p 0->BUG\n", __func__, spte);
		BUG();
	} else if (!(rmap_head->val & 1)) {
		rmap_printk("%s: %p 1->0\n", __func__, spte);
		if ((u64 *)rmap_head->val != spte) {
			pr_err("%s: %p 1->BUG\n", __func__, spte);
			BUG();
		}
		rmap_head->val = 0;
	} else {
		rmap_printk("%s: %p many->many\n", __func__, spte);
		desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
		prev_desc = NULL;
		while (desc) {
			for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) {
				if (desc->sptes[i] == spte) {
					pte_list_desc_remove_entry(rmap_head,
							desc, i, prev_desc);
					return;
				}
			}
			prev_desc = desc;
			desc = desc->more;
		}
		pr_err("%s: %p many->many\n", __func__, spte);
		BUG();
	}
}

static void pte_list_remove(struct kvm_rmap_head *rmap_head, u64 *sptep)
{
	mmu_spte_clear_track_bits(sptep);
	__pte_list_remove(sptep, rmap_head);
}

static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level,
					   struct kvm_memory_slot *slot)
{
	unsigned long idx;

	idx = gfn_to_index(gfn, slot->base_gfn, level);
	return &slot->arch.rmap[level - PT_PAGE_TABLE_LEVEL][idx];
}

static struct kvm_rmap_head *gfn_to_rmap(struct kvm *kvm, gfn_t gfn,
					 struct kvm_mmu_page *sp)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *slot;

	slots = kvm_memslots_for_spte_role(kvm, sp->role);
	slot = __gfn_to_memslot(slots, gfn);
	return __gfn_to_rmap(gfn, sp->role.level, slot);
}

static bool rmap_can_add(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu_memory_cache *cache;

	cache = &vcpu->arch.mmu_pte_list_desc_cache;
	return mmu_memory_cache_free_objects(cache);
}

static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
{
	struct kvm_mmu_page *sp;
	struct kvm_rmap_head *rmap_head;

	sp = page_header(__pa(spte));
	kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
	rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
	return pte_list_add(vcpu, spte, rmap_head);
}

static void rmap_remove(struct kvm *kvm, u64 *spte)
{
	struct kvm_mmu_page *sp;
	gfn_t gfn;
	struct kvm_rmap_head *rmap_head;

	sp = page_header(__pa(spte));
	gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
	rmap_head = gfn_to_rmap(kvm, gfn, sp);
	__pte_list_remove(spte, rmap_head);
}
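
/*
 * Used by the following functions to iterate through the sptes linked by a
 * rmap.  All fields are private and not assumed to be used outside.
 */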
struct rmap_iterator {
	struct pte_list_desc *desc;
	int pos;
};

static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
			   struct rmap_iterator *iter)
{
	u64 *sptep;

	if (!rmap_head->val)
		return NULL;

	if (!(rmap_head->val & 1)) {
		iter->desc = NULL;
		sptep = (u64 *)rmap_head->val;
		goto out;
	}

	iter->desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
	iter->pos = 0;
	sptep = iter->desc->sptes[iter->pos];
out:
	BUG_ON(!is_shadow_present_pte(*sptep));
	return sptep;
}

static u64 *rmap_get_next(struct rmap_iterator *iter)
{
	u64 *sptep;

	if (iter->desc) {
		if (iter->pos < PTE_LIST_EXT - 1) {
			++iter->pos;
			sptep = iter->desc->sptes[iter->pos];
			if (sptep)
				goto out;
		}

		iter->desc = iter->desc->more;

		if (iter->desc) {
			iter->pos = 0;
			sptep = iter->desc->sptes[iter->pos];
			goto out;
		}
	}

	return NULL;
out:
	BUG_ON(!is_shadow_present_pte(*sptep));
	return sptep;
}

#define for_each_rmap_spte(_rmap_head_, _iter_, _spte_)			\
	for (_spte_ = rmap_get_first(_rmap_head_, _iter_);		\
	     _spte_; _spte_ = rmap_get_next(_iter_))

static void drop_spte(struct kvm *kvm, u64 *sptep)
{
	if (mmu_spte_clear_track_bits(sptep))
		rmap_remove(kvm, sptep);
}

static bool __drop_large_spte(struct kvm *kvm, u64 *sptep)
{
	if (is_large_pte(*sptep)) {
		WARN_ON(page_header(__pa(sptep))->role.level ==
			PT_PAGE_TABLE_LEVEL);
		drop_spte(kvm, sptep);
		--kvm->stat.lpages;
		return true;
	}

	return false;
}

static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
{
	if (__drop_large_spte(vcpu->kvm, sptep)) {
		struct kvm_mmu_page *sp = page_header(__pa(sptep));

		kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
			KVM_PAGES_PER_HPAGE(sp->role.level));
	}
}
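
/*
 * Write-protect the SPTE pointed to by @sptep.  @pt_protect indicates the
 * protection is for shadow page table protection rather than dirty logging;
 * in that case SPTE_MMU_WRITEABLE is also cleared so the SPTE cannot be made
 * writable again locklessly.
 *
 * Returns true if the TLB needs to be flushed.
 */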
static bool spte_write_protect(u64 *sptep, bool pt_protect)
{
	u64 spte = *sptep;

	if (!is_writable_pte(spte) &&
	    !(pt_protect && spte_can_locklessly_be_made_writable(spte)))
		return false;

	rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);

	if (pt_protect)
		spte &= ~SPTE_MMU_WRITEABLE;
	spte = spte & ~PT_WRITABLE_MASK;

	return mmu_spte_update(sptep, spte);
}

static bool __rmap_write_protect(struct kvm *kvm,
				 struct kvm_rmap_head *rmap_head,
				 bool pt_protect)
{
	u64 *sptep;
	struct rmap_iterator iter;
	bool flush = false;

	for_each_rmap_spte(rmap_head, &iter, sptep)
		flush |= spte_write_protect(sptep, pt_protect);

	return flush;
}

static bool spte_clear_dirty(u64 *sptep)
{
	u64 spte = *sptep;

	rmap_printk("rmap_clear_dirty: spte %p %llx\n", sptep, *sptep);

	spte &= ~shadow_dirty_mask;

	return mmu_spte_update(sptep, spte);
}

static bool wrprot_ad_disabled_spte(u64 *sptep)
{
	bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT,
					       (unsigned long *)sptep);
	if (was_writable)
		kvm_set_pfn_dirty(spte_to_pfn(*sptep));

	return was_writable;
}

static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
{
	u64 *sptep;
	struct rmap_iterator iter;
	bool flush = false;

	for_each_rmap_spte(rmap_head, &iter, sptep)
		if (spte_ad_enabled(*sptep))
			flush |= spte_clear_dirty(sptep);
		else
			flush |= wrprot_ad_disabled_spte(sptep);

	return flush;
}

static bool spte_set_dirty(u64 *sptep)
{
	u64 spte = *sptep;

	rmap_printk("rmap_set_dirty: spte %p %llx\n", sptep, *sptep);

	spte |= shadow_dirty_mask;

	return mmu_spte_update(sptep, spte);
}

static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
{
	u64 *sptep;
	struct rmap_iterator iter;
	bool flush = false;

	for_each_rmap_spte(rmap_head, &iter, sptep)
		if (spte_ad_enabled(*sptep))
			flush |= spte_set_dirty(sptep);

	return flush;
}
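
/*
 * Write-protect the 4K SPTEs for the pages selected by @mask, starting at
 * gfn (@slot->base_gfn + @gfn_offset).  Used when huge page mappings do not
 * need to be considered, e.g. during dirty logging.
 */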
static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
				     struct kvm_memory_slot *slot,
				     gfn_t gfn_offset, unsigned long mask)
{
	struct kvm_rmap_head *rmap_head;

	while (mask) {
		rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
					  PT_PAGE_TABLE_LEVEL, slot);
		__rmap_write_protect(kvm, rmap_head, false);

		/* clear the first set bit */
		mask &= mask - 1;
	}
}
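
/*
 * Clear the dirty status (the D bit, or the W bit for SPTEs without A/D
 * bits) of the 4K pages selected by @mask, starting at gfn
 * (@slot->base_gfn + @gfn_offset) in @slot.
 */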
void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
				   struct kvm_memory_slot *slot,
				   gfn_t gfn_offset, unsigned long mask)
{
	struct kvm_rmap_head *rmap_head;

	while (mask) {
		rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
					  PT_PAGE_TABLE_LEVEL, slot);
		__rmap_clear_dirty(kvm, rmap_head);

		/* clear the first set bit */
		mask &= mask - 1;
	}
}
EXPORT_SYMBOL_GPL(kvm_mmu_clear_dirty_pt_masked);

void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
				struct kvm_memory_slot *slot,
				gfn_t gfn_offset, unsigned long mask)
{
	if (kvm_x86_ops->enable_log_dirty_pt_masked)
		kvm_x86_ops->enable_log_dirty_pt_masked(kvm, slot, gfn_offset,
				mask);
	else
		kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
}

int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu)
{
	if (kvm_x86_ops->write_log_dirty)
		return kvm_x86_ops->write_log_dirty(vcpu);

	return 0;
}

bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
				    struct kvm_memory_slot *slot, u64 gfn)
{
	struct kvm_rmap_head *rmap_head;
	int i;
	bool write_protected = false;

	for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
		rmap_head = __gfn_to_rmap(gfn, i, slot);
		write_protected |= __rmap_write_protect(kvm, rmap_head, true);
	}

	return write_protected;
}

static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
{
	struct kvm_memory_slot *slot;

	slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
	return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn);
}

static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
{
	u64 *sptep;
	struct rmap_iterator iter;
	bool flush = false;

	while ((sptep = rmap_get_first(rmap_head, &iter))) {
		rmap_printk("%s: spte %p %llx.\n", __func__, sptep, *sptep);

		pte_list_remove(rmap_head, sptep);
		flush = true;
	}

	return flush;
}

static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
			   struct kvm_memory_slot *slot, gfn_t gfn, int level,
			   unsigned long data)
{
	return kvm_zap_rmapp(kvm, rmap_head);
}

static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
			     struct kvm_memory_slot *slot, gfn_t gfn, int level,
			     unsigned long data)
{
	u64 *sptep;
	struct rmap_iterator iter;
	int need_flush = 0;
	u64 new_spte;
	pte_t *ptep = (pte_t *)data;
	kvm_pfn_t new_pfn;

	WARN_ON(pte_huge(*ptep));
	new_pfn = pte_pfn(*ptep);

restart:
	for_each_rmap_spte(rmap_head, &iter, sptep) {
		rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n",
			    sptep, *sptep, gfn, level);

		need_flush = 1;

		if (pte_write(*ptep)) {
			pte_list_remove(rmap_head, sptep);
			goto restart;
		} else {
			new_spte = *sptep & ~PT64_BASE_ADDR_MASK;
			new_spte |= (u64)new_pfn << PAGE_SHIFT;

			new_spte &= ~PT_WRITABLE_MASK;
			new_spte &= ~SPTE_HOST_WRITEABLE;

			new_spte = mark_spte_for_access_track(new_spte);

			mmu_spte_clear_track_bits(sptep);
			mmu_spte_set(sptep, new_spte);
		}
	}

	if (need_flush && kvm_available_flush_tlb_with_range()) {
		kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
		return 0;
	}

	return need_flush;
}

struct slot_rmap_walk_iterator {
	struct kvm_memory_slot *slot;
	gfn_t start_gfn;
	gfn_t end_gfn;
	int start_level;
	int end_level;

	gfn_t gfn;
	struct kvm_rmap_head *rmap;
	int level;

	struct kvm_rmap_head *end_rmap;
};

static void
rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level)
{
	iterator->level = level;
	iterator->gfn = iterator->start_gfn;
	iterator->rmap = __gfn_to_rmap(iterator->gfn, level, iterator->slot);
	iterator->end_rmap = __gfn_to_rmap(iterator->end_gfn, level,
					   iterator->slot);
}

static void
slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator,
		    struct kvm_memory_slot *slot, int start_level,
		    int end_level, gfn_t start_gfn, gfn_t end_gfn)
{
	iterator->slot = slot;
	iterator->start_level = start_level;
	iterator->end_level = end_level;
	iterator->start_gfn = start_gfn;
	iterator->end_gfn = end_gfn;

	rmap_walk_init_level(iterator, iterator->start_level);
}

static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator)
{
	return !!iterator->rmap;
}

static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
{
	if (++iterator->rmap <= iterator->end_rmap) {
		iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level));
		return;
	}

	if (++iterator->level > iterator->end_level) {
		iterator->rmap = NULL;
		return;
	}

	rmap_walk_init_level(iterator, iterator->level);
}

#define for_each_slot_rmap_range(_slot_, _start_level_, _end_level_,	\
	   _start_gfn, _end_gfn, _iter_)				\
	for (slot_rmap_walk_init(_iter_, _slot_, _start_level_,	\
				 _end_level_, _start_gfn, _end_gfn);	\
	     slot_rmap_walk_okay(_iter_);				\
	     slot_rmap_walk_next(_iter_))

static int kvm_handle_hva_range(struct kvm *kvm,
				unsigned long start,
				unsigned long end,
				unsigned long data,
				int (*handler)(struct kvm *kvm,
					       struct kvm_rmap_head *rmap_head,
					       struct kvm_memory_slot *slot,
					       gfn_t gfn,
					       int level,
					       unsigned long data))
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	struct slot_rmap_walk_iterator iterator;
	int ret = 0;
	int i;

	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
		slots = __kvm_memslots(kvm, i);
		kvm_for_each_memslot(memslot, slots) {
			unsigned long hva_start, hva_end;
			gfn_t gfn_start, gfn_end;

			hva_start = max(start, memslot->userspace_addr);
			hva_end = min(end, memslot->userspace_addr +
				      (memslot->npages << PAGE_SHIFT));
			if (hva_start >= hva_end)
				continue;

			gfn_start = hva_to_gfn_memslot(hva_start, memslot);
			gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);

			for_each_slot_rmap_range(memslot, PT_PAGE_TABLE_LEVEL,
						 PT_MAX_HUGEPAGE_LEVEL,
						 gfn_start, gfn_end - 1,
						 &iterator)
				ret |= handler(kvm, iterator.rmap, memslot,
					       iterator.gfn, iterator.level, data);
		}
	}

	return ret;
}

static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
			  unsigned long data,
			  int (*handler)(struct kvm *kvm,
					 struct kvm_rmap_head *rmap_head,
					 struct kvm_memory_slot *slot,
					 gfn_t gfn, int level,
					 unsigned long data))
{
	return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler);
}

int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
{
	return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp);
}

int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
{
	return kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
}

static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
			 struct kvm_memory_slot *slot, gfn_t gfn, int level,
			 unsigned long data)
{
	u64 *sptep;
	struct rmap_iterator uninitialized_var(iter);
	int young = 0;

	for_each_rmap_spte(rmap_head, &iter, sptep)
		young |= mmu_spte_age(sptep);

	trace_kvm_age_page(gfn, level, slot, young);
	return young;
}

static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
			      struct kvm_memory_slot *slot, gfn_t gfn,
			      int level, unsigned long data)
{
	u64 *sptep;
	struct rmap_iterator iter;

	for_each_rmap_spte(rmap_head, &iter, sptep)
		if (is_accessed_spte(*sptep))
			return 1;
	return 0;
}

#define RMAP_RECYCLE_THRESHOLD 1000

static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
{
	struct kvm_rmap_head *rmap_head;
	struct kvm_mmu_page *sp;

	sp = page_header(__pa(spte));

	rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);

	kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, 0);
	kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
			KVM_PAGES_PER_HPAGE(sp->role.level));
}

int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
{
	return kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp);
}

int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
{
	return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
}

#ifdef MMU_DEBUG
static int is_empty_shadow_page(u64 *spt)
{
	u64 *pos;
	u64 *end;

	for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
		if (is_shadow_present_pte(*pos)) {
			printk(KERN_ERR "%s: %p %llx\n", __func__,
			       pos, *pos);
			return 0;
		}
	return 1;
}
#endif

static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, unsigned long nr)
{
	kvm->arch.n_used_mmu_pages += nr;
	percpu_counter_add(&kvm_total_used_mmu_pages, nr);
}

static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
{
	MMU_WARN_ON(!is_empty_shadow_page(sp->spt));
	hlist_del(&sp->hash_link);
	list_del(&sp->link);
	free_page((unsigned long)sp->spt);
	if (!sp->role.direct)
		free_page((unsigned long)sp->gfns);
	kmem_cache_free(mmu_page_header_cache, sp);
}

static unsigned kvm_page_table_hashfn(gfn_t gfn)
{
	return hash_64(gfn, KVM_MMU_HASH_SHIFT);
}

static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
				    struct kvm_mmu_page *sp, u64 *parent_pte)
{
	if (!parent_pte)
		return;

	pte_list_add(vcpu, parent_pte, &sp->parent_ptes);
}

static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
				       u64 *parent_pte)
{
	__pte_list_remove(parent_pte, &sp->parent_ptes);
}

static void drop_parent_pte(struct kvm_mmu_page *sp,
			    u64 *parent_pte)
{
	mmu_page_remove_parent_pte(sp, parent_pte);
	mmu_spte_clear_no_track(parent_pte);
}

static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct)
{
	struct kvm_mmu_page *sp;

	sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
	sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
	if (!direct)
		sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);

	list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
	kvm_mod_used_mmu_pages(vcpu->kvm, +1);
	return sp;
}

static void mark_unsync(u64 *spte);
static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
{
	u64 *sptep;
	struct rmap_iterator iter;

	for_each_rmap_spte(&sp->parent_ptes, &iter, sptep) {
		mark_unsync(sptep);
	}
}

static void mark_unsync(u64 *spte)
{
	struct kvm_mmu_page *sp;
	unsigned int index;

	sp = page_header(__pa(spte));
	index = spte - sp->spt;
	if (__test_and_set_bit(index, sp->unsync_child_bitmap))
		return;
	if (sp->unsync_children++)
		return;
	kvm_mmu_mark_parents_unsync(sp);
}

static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
			       struct kvm_mmu_page *sp)
{
	return 0;
}

static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root)
{
}

static void nonpaging_update_pte(struct kvm_vcpu *vcpu,
				 struct kvm_mmu_page *sp, u64 *spte,
				 const void *pte)
{
	WARN_ON(1);
}

#define KVM_PAGE_ARRAY_NR 16

struct kvm_mmu_pages {
	struct mmu_page_and_offset {
		struct kvm_mmu_page *sp;
		unsigned int idx;
	} page[KVM_PAGE_ARRAY_NR];
	unsigned int nr;
};

static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
			 int idx)
{
	int i;

	if (sp->unsync)
		for (i = 0; i < pvec->nr; i++)
			if (pvec->page[i].sp == sp)
				return 0;

	pvec->page[pvec->nr].sp = sp;
	pvec->page[pvec->nr].idx = idx;
	pvec->nr++;
	return (pvec->nr == KVM_PAGE_ARRAY_NR);
}

static inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx)
{
	--sp->unsync_children;
	WARN_ON((int)sp->unsync_children < 0);
	__clear_bit(idx, sp->unsync_child_bitmap);
}

static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
			     struct kvm_mmu_pages *pvec)
{
	int i, ret, nr_unsync_leaf = 0;

	for_each_set_bit(i, sp->unsync_child_bitmap, 512) {
		struct kvm_mmu_page *child;
		u64 ent = sp->spt[i];

		if (!is_shadow_present_pte(ent) || is_large_pte(ent)) {
			clear_unsync_child_bit(sp, i);
			continue;
		}

		child = page_header(ent & PT64_BASE_ADDR_MASK);

		if (child->unsync_children) {
			if (mmu_pages_add(pvec, child, i))
				return -ENOSPC;

			ret = __mmu_unsync_walk(child, pvec);
			if (!ret) {
				clear_unsync_child_bit(sp, i);
				continue;
			} else if (ret > 0) {
				nr_unsync_leaf += ret;
			} else
				return ret;
		} else if (child->unsync) {
			nr_unsync_leaf++;
			if (mmu_pages_add(pvec, child, i))
				return -ENOSPC;
		} else
			clear_unsync_child_bit(sp, i);
	}

	return nr_unsync_leaf;
}

#define INVALID_INDEX (-1)

static int mmu_unsync_walk(struct kvm_mmu_page *sp,
			   struct kvm_mmu_pages *pvec)
{
	pvec->nr = 0;
	if (!sp->unsync_children)
		return 0;

	mmu_pages_add(pvec, sp, INVALID_INDEX);
	return __mmu_unsync_walk(sp, pvec);
}

static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	WARN_ON(!sp->unsync);
	trace_kvm_mmu_sync_page(sp);
	sp->unsync = 0;
	--kvm->stat.mmu_unsync;
}

static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
				     struct list_head *invalid_list);
static void kvm_mmu_commit_zap_page(struct kvm *kvm,
				    struct list_head *invalid_list);

#define for_each_valid_sp(_kvm, _sp, _gfn)				\
	hlist_for_each_entry(_sp,					\
	  &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
		if (is_obsolete_sp((_kvm), (_sp)) || (_sp)->role.invalid) {    \
		} else

#define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn)			\
	for_each_valid_sp(_kvm, _sp, _gfn)				\
		if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else

static inline bool is_ept_sp(struct kvm_mmu_page *sp)
{
	return sp->role.cr0_wp && sp->role.smap_andnot_wp;
}

static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
			    struct list_head *invalid_list)
{
	if ((!is_ept_sp(sp) && sp->role.gpte_is_8_bytes != !!is_pae(vcpu)) ||
	    vcpu->arch.mmu->sync_page(vcpu, sp) == 0) {
		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
		return false;
	}

	return true;
}

static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm,
					struct list_head *invalid_list,
					bool remote_flush)
{
	if (!remote_flush && list_empty(invalid_list))
		return false;

	if (!list_empty(invalid_list))
		kvm_mmu_commit_zap_page(kvm, invalid_list);
	else
		kvm_flush_remote_tlbs(kvm);
	return true;
}

static void kvm_mmu_flush_or_zap(struct kvm_vcpu *vcpu,
				 struct list_head *invalid_list,
				 bool remote_flush, bool local_flush)
{
	if (kvm_mmu_remote_flush_or_zap(vcpu->kvm, invalid_list, remote_flush))
		return;

	if (local_flush)
		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
}

#ifdef CONFIG_KVM_MMU_AUDIT
#include "mmu_audit.c"
#else
static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { }
static void mmu_audit_disable(void) { }
#endif
2309
2310static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
2311{
2312 return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
2313}
2314
2315static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
2316 struct list_head *invalid_list)
2317{
2318 kvm_unlink_unsync_page(vcpu->kvm, sp);
2319 return __kvm_sync_page(vcpu, sp, invalid_list);
2320}
2321
2322
2323static bool kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn,
2324 struct list_head *invalid_list)
2325{
2326 struct kvm_mmu_page *s;
2327 bool ret = false;
2328
2329 for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) {
2330 if (!s->unsync)
2331 continue;
2332
2333 WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
2334 ret |= kvm_sync_page(vcpu, s, invalid_list);
2335 }
2336
2337 return ret;
2338}
2339
2340struct mmu_page_path {
2341 struct kvm_mmu_page *parent[PT64_ROOT_MAX_LEVEL];
2342 unsigned int idx[PT64_ROOT_MAX_LEVEL];
2343};
2344
2345#define for_each_sp(pvec, sp, parents, i) \
2346 for (i = mmu_pages_first(&pvec, &parents); \
2347 i < pvec.nr && ({ sp = pvec.page[i].sp; 1;}); \
2348 i = mmu_pages_next(&pvec, &parents, i))
2349
2350static int mmu_pages_next(struct kvm_mmu_pages *pvec,
2351 struct mmu_page_path *parents,
2352 int i)
2353{
2354 int n;
2355
2356 for (n = i+1; n < pvec->nr; n++) {
2357 struct kvm_mmu_page *sp = pvec->page[n].sp;
2358 unsigned idx = pvec->page[n].idx;
2359 int level = sp->role.level;
2360
2361 parents->idx[level-1] = idx;
2362 if (level == PT_PAGE_TABLE_LEVEL)
2363 break;
2364
2365 parents->parent[level-2] = sp;
2366 }
2367
2368 return n;
2369}
2370
2371static int mmu_pages_first(struct kvm_mmu_pages *pvec,
2372 struct mmu_page_path *parents)
2373{
2374 struct kvm_mmu_page *sp;
2375 int level;
2376
2377 if (pvec->nr == 0)
2378 return 0;
2379
2380 WARN_ON(pvec->page[0].idx != INVALID_INDEX);
2381
2382 sp = pvec->page[0].sp;
2383 level = sp->role.level;
2384 WARN_ON(level == PT_PAGE_TABLE_LEVEL);
2385
2386 parents->parent[level-2] = sp;
2387
 /*
  * Also set up a sentinel: all further entries in pvec are children of sp,
  * so sp itself has no parent recorded above this level.
  */
2391 parents->parent[level-1] = NULL;
2392 return mmu_pages_next(pvec, parents, 0);
2393}
2394
2395static void mmu_pages_clear_parents(struct mmu_page_path *parents)
2396{
2397 struct kvm_mmu_page *sp;
2398 unsigned int level = 0;
2399
2400 do {
2401 unsigned int idx = parents->idx[level];
2402 sp = parents->parent[level];
2403 if (!sp)
2404 return;
2405
2406 WARN_ON(idx == INVALID_INDEX);
2407 clear_unsync_child_bit(sp, idx);
2408 level++;
2409 } while (!sp->unsync_children);
2410}
2411
2412static void mmu_sync_children(struct kvm_vcpu *vcpu,
2413 struct kvm_mmu_page *parent)
2414{
2415 int i;
2416 struct kvm_mmu_page *sp;
2417 struct mmu_page_path parents;
2418 struct kvm_mmu_pages pages;
2419 LIST_HEAD(invalid_list);
2420 bool flush = false;
2421
2422 while (mmu_unsync_walk(parent, &pages)) {
2423 bool protected = false;
2424
2425 for_each_sp(pages, sp, parents, i)
2426 protected |= rmap_write_protect(vcpu, sp->gfn);
2427
2428 if (protected) {
2429 kvm_flush_remote_tlbs(vcpu->kvm);
2430 flush = false;
2431 }
2432
2433 for_each_sp(pages, sp, parents, i) {
2434 flush |= kvm_sync_page(vcpu, sp, &invalid_list);
2435 mmu_pages_clear_parents(&parents);
2436 }
2437 if (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock)) {
2438 kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
2439 cond_resched_lock(&vcpu->kvm->mmu_lock);
2440 flush = false;
2441 }
2442 }
2443
2444 kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
2445}
2446
2447static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
2448{
2449 atomic_set(&sp->write_flooding_count, 0);
2450}
2451
2452static void clear_sp_write_flooding_count(u64 *spte)
2453{
2454 struct kvm_mmu_page *sp = page_header(__pa(spte));
2455
2456 __clear_sp_write_flooding_count(sp);
2457}
2458
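/*
 * Look up a shadow page with the given gfn and role in the hash table,
 * syncing it if it is unsync; if no match exists, allocate a new page,
 * hash it and (for indirect pages) account and write-protect the gfn.
 */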
2459static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
2460 gfn_t gfn,
2461 gva_t gaddr,
2462 unsigned level,
2463 int direct,
2464 unsigned access)
2465{
2466 union kvm_mmu_page_role role;
2467 unsigned quadrant;
2468 struct kvm_mmu_page *sp;
2469 bool need_sync = false;
2470 bool flush = false;
2471 int collisions = 0;
2472 LIST_HEAD(invalid_list);
2473
2474 role = vcpu->arch.mmu->mmu_role.base;
2475 role.level = level;
2476 role.direct = direct;
2477 if (role.direct)
2478 role.gpte_is_8_bytes = true;
2479 role.access = access;
2480 if (!vcpu->arch.mmu->direct_map
2481 && vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL) {
2482 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
2483 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
2484 role.quadrant = quadrant;
2485 }
2486 for_each_valid_sp(vcpu->kvm, sp, gfn) {
2487 if (sp->gfn != gfn) {
2488 collisions++;
2489 continue;
2490 }
2491
2492 if (!need_sync && sp->unsync)
2493 need_sync = true;
2494
2495 if (sp->role.word != role.word)
2496 continue;
2497
2498 if (sp->unsync) {
 /*
  * The page is good, but __kvm_sync_page might still end
  * up zapping it.  If so, break in order to rebuild it.
  */
2502 if (!__kvm_sync_page(vcpu, sp, &invalid_list))
2503 break;
2504
2505 WARN_ON(!list_empty(&invalid_list));
2506 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
2507 }
2508
2509 if (sp->unsync_children)
2510 kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
2511
2512 __clear_sp_write_flooding_count(sp);
2513 trace_kvm_mmu_get_page(sp, false);
2514 goto out;
2515 }
2516
2517 ++vcpu->kvm->stat.mmu_cache_miss;
2518
2519 sp = kvm_mmu_alloc_page(vcpu, direct);
2520
2521 sp->gfn = gfn;
2522 sp->role = role;
2523 hlist_add_head(&sp->hash_link,
2524 &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
2525 if (!direct) {
 /*
  * Write-protect the gfn before syncing pages, otherwise the content of
  * the synced shadow page may be inconsistent with the guest page table.
  */
2531 account_shadowed(vcpu->kvm, sp);
2532 if (level == PT_PAGE_TABLE_LEVEL &&
2533 rmap_write_protect(vcpu, gfn))
2534 kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1);
2535
2536 if (level > PT_PAGE_TABLE_LEVEL && need_sync)
2537 flush |= kvm_sync_pages(vcpu, gfn, &invalid_list);
2538 }
2539 sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
2540 clear_page(sp->spt);
2541 trace_kvm_mmu_get_page(sp, true);
2542
2543 kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
2544out:
2545 if (collisions > vcpu->kvm->stat.max_mmu_page_hash_collisions)
2546 vcpu->kvm->stat.max_mmu_page_hash_collisions = collisions;
2547 return sp;
2548}
2549
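/*
 * Start a shadow page table walk at @root for address @addr.  For PAE roots
 * the first level is the four-entry pae_root table, so descend one level
 * immediately using the PDPTE selected by bits 31:30 of the address.
 */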
2550static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator,
2551 struct kvm_vcpu *vcpu, hpa_t root,
2552 u64 addr)
2553{
2554 iterator->addr = addr;
2555 iterator->shadow_addr = root;
2556 iterator->level = vcpu->arch.mmu->shadow_root_level;
2557
2558 if (iterator->level == PT64_ROOT_4LEVEL &&
2559 vcpu->arch.mmu->root_level < PT64_ROOT_4LEVEL &&
2560 !vcpu->arch.mmu->direct_map)
2561 --iterator->level;
2562
2563 if (iterator->level == PT32E_ROOT_LEVEL) {
2564
2565
2566
2567
2568 BUG_ON(root != vcpu->arch.mmu->root_hpa);
2569
2570 iterator->shadow_addr
2571 = vcpu->arch.mmu->pae_root[(addr >> 30) & 3];
2572 iterator->shadow_addr &= PT64_BASE_ADDR_MASK;
2573 --iterator->level;
2574 if (!iterator->shadow_addr)
2575 iterator->level = 0;
2576 }
2577}
2578
2579static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
2580 struct kvm_vcpu *vcpu, u64 addr)
2581{
2582 shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu->root_hpa,
2583 addr);
2584}
2585
2586static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
2587{
2588 if (iterator->level < PT_PAGE_TABLE_LEVEL)
2589 return false;
2590
2591 iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
2592 iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
2593 return true;
2594}
2595
2596static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
2597 u64 spte)
2598{
2599 if (is_last_spte(spte, iterator->level)) {
2600 iterator->level = 0;
2601 return;
2602 }
2603
2604 iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK;
2605 --iterator->level;
2606}
2607
2608static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
2609{
2610 __shadow_walk_next(iterator, *iterator->sptep);
2611}
2612
2613static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
2614 struct kvm_mmu_page *sp)
2615{
2616 u64 spte;
2617
2618 BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
2619
2620 spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK |
2621 shadow_user_mask | shadow_x_mask | shadow_me_mask;
2622
2623 if (sp_ad_disabled(sp))
2624 spte |= shadow_acc_track_value;
2625 else
2626 spte |= shadow_accessed_mask;
2627
2628 mmu_spte_set(sptep, spte);
2629
2630 mmu_page_add_parent_pte(vcpu, sp, sptep);
2631
2632 if (sp->unsync_children || sp->unsync)
2633 mark_unsync(sptep);
2634}
2635
2636static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2637 unsigned direct_access)
2638{
2639 if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
2640 struct kvm_mmu_page *child;
2641
 /*
  * For a direct sp, if the guest pte's protections change (e.g. the
  * dirty bit is set), the access bits of the existing child sp may no
  * longer match.  Drop the link here so that the fault path installs a
  * new sp with the correct access.
  */
2649 child = page_header(*sptep & PT64_BASE_ADDR_MASK);
2650 if (child->role.access == direct_access)
2651 return;
2652
2653 drop_parent_pte(child, sptep);
2654 kvm_flush_remote_tlbs_with_address(vcpu->kvm, child->gfn, 1);
2655 }
2656}
2657
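/*
 * Zap one SPTE of @sp: drop a present leaf (adjusting the large-page count)
 * or unlink a child page table.  Returns true if a present SPTE was removed.
 */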
2658static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
2659 u64 *spte)
2660{
2661 u64 pte;
2662 struct kvm_mmu_page *child;
2663
2664 pte = *spte;
2665 if (is_shadow_present_pte(pte)) {
2666 if (is_last_spte(pte, sp->role.level)) {
2667 drop_spte(kvm, spte);
2668 if (is_large_pte(pte))
2669 --kvm->stat.lpages;
2670 } else {
2671 child = page_header(pte & PT64_BASE_ADDR_MASK);
2672 drop_parent_pte(child, spte);
2673 }
2674 return true;
2675 }
2676
2677 if (is_mmio_spte(pte))
2678 mmu_spte_clear_no_track(spte);
2679
2680 return false;
2681}
2682
2683static void kvm_mmu_page_unlink_children(struct kvm *kvm,
2684 struct kvm_mmu_page *sp)
2685{
2686 unsigned i;
2687
2688 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
2689 mmu_page_zap_pte(kvm, sp, sp->spt + i);
2690}
2691
2692static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
2693{
2694 u64 *sptep;
2695 struct rmap_iterator iter;
2696
2697 while ((sptep = rmap_get_first(&sp->parent_ptes, &iter)))
2698 drop_parent_pte(sp, sptep);
2699}
2700
2701static int mmu_zap_unsync_children(struct kvm *kvm,
2702 struct kvm_mmu_page *parent,
2703 struct list_head *invalid_list)
2704{
2705 int i, zapped = 0;
2706 struct mmu_page_path parents;
2707 struct kvm_mmu_pages pages;
2708
2709 if (parent->role.level == PT_PAGE_TABLE_LEVEL)
2710 return 0;
2711
2712 while (mmu_unsync_walk(parent, &pages)) {
2713 struct kvm_mmu_page *sp;
2714
2715 for_each_sp(pages, sp, parents, i) {
2716 kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
2717 mmu_pages_clear_parents(&parents);
2718 zapped++;
2719 }
2720 }
2721
2722 return zapped;
2723}
2724
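/*
 * Unlink @sp and its unsync children from the page tables.  Pages with no
 * root references are moved to @invalid_list for freeing in
 * kvm_mmu_commit_zap_page(); pages still used as roots are only marked
 * invalid.  Stores the number of zapped pages in @nr_zapped and returns true
 * if zapping unsync children made active_mmu_pages unstable for iteration.
 */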
2725static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
2726 struct kvm_mmu_page *sp,
2727 struct list_head *invalid_list,
2728 int *nr_zapped)
2729{
2730 bool list_unstable;
2731
2732 trace_kvm_mmu_prepare_zap_page(sp);
2733 ++kvm->stat.mmu_shadow_zapped;
2734 *nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list);
2735 kvm_mmu_page_unlink_children(kvm, sp);
2736 kvm_mmu_unlink_parents(kvm, sp);
2737
 /* Zapping children means active_mmu_pages has become unstable. */
2739 list_unstable = *nr_zapped;
2740
2741 if (!sp->role.invalid && !sp->role.direct)
2742 unaccount_shadowed(kvm, sp);
2743
2744 if (sp->unsync)
2745 kvm_unlink_unsync_page(kvm, sp);
2746 if (!sp->root_count) {
 /* Count self */
2748 (*nr_zapped)++;
2749 list_move(&sp->link, invalid_list);
2750 kvm_mod_used_mmu_pages(kvm, -1);
2751 } else {
2752 list_move(&sp->link, &kvm->arch.active_mmu_pages);
2753
2754 if (!sp->role.invalid)
2755 kvm_reload_remote_mmus(kvm);
2756 }
2757
2758 sp->role.invalid = 1;
2759 return list_unstable;
2760}
2761
2762static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
2763 struct list_head *invalid_list)
2764{
2765 int nr_zapped;
2766
2767 __kvm_mmu_prepare_zap_page(kvm, sp, invalid_list, &nr_zapped);
2768 return nr_zapped;
2769}
2770
2771static void kvm_mmu_commit_zap_page(struct kvm *kvm,
2772 struct list_head *invalid_list)
2773{
2774 struct kvm_mmu_page *sp, *nsp;
2775
2776 if (list_empty(invalid_list))
2777 return;
2778
 /*
  * Flush TLBs before freeing the pages: kvm_flush_remote_tlbs() waits for
  * all vCPUs to leave guest mode and for lockless shadow page table walks
  * to finish, so no CPU can still be using the zapped pages when they are
  * freed below.
  */
2788 kvm_flush_remote_tlbs(kvm);
2789
2790 list_for_each_entry_safe(sp, nsp, invalid_list, link) {
2791 WARN_ON(!sp->role.invalid || sp->root_count);
2792 kvm_mmu_free_page(sp);
2793 }
2794}
2795
2796static bool prepare_zap_oldest_mmu_page(struct kvm *kvm,
2797 struct list_head *invalid_list)
2798{
2799 struct kvm_mmu_page *sp;
2800
2801 if (list_empty(&kvm->arch.active_mmu_pages))
2802 return false;
2803
2804 sp = list_last_entry(&kvm->arch.active_mmu_pages,
2805 struct kvm_mmu_page, link);
2806 return kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
2807}
2808
/*
 * Change the number of MMU pages allocated to the VM, zapping the oldest
 * shadow pages if the current count exceeds the new goal.
 */
2813void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages)
2814{
2815 LIST_HEAD(invalid_list);
2816
2817 spin_lock(&kvm->mmu_lock);
2818
2819 if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
 /* Need to free some mmu pages to achieve the goal. */
2821 while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages)
2822 if (!prepare_zap_oldest_mmu_page(kvm, &invalid_list))
2823 break;
2824
2825 kvm_mmu_commit_zap_page(kvm, &invalid_list);
2826 goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
2827 }
2828
2829 kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
2830
2831 spin_unlock(&kvm->mmu_lock);
2832}
2833
2834int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
2835{
2836 struct kvm_mmu_page *sp;
2837 LIST_HEAD(invalid_list);
2838 int r;
2839
2840 pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
2841 r = 0;
2842 spin_lock(&kvm->mmu_lock);
2843 for_each_gfn_indirect_valid_sp(kvm, sp, gfn) {
2844 pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
2845 sp->role.word);
2846 r = 1;
2847 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
2848 }
2849 kvm_mmu_commit_zap_page(kvm, &invalid_list);
2850 spin_unlock(&kvm->mmu_lock);
2851
2852 return r;
2853}
2854EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
2855
2856static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
2857{
2858 trace_kvm_mmu_unsync_page(sp);
2859 ++vcpu->kvm->stat.mmu_unsync;
2860 sp->unsync = 1;
2861
2862 kvm_mmu_mark_parents_unsync(sp);
2863}
2864
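/*
 * Decide whether a gfn that is about to get a writable SPTE must remain
 * write-protected.  Returns true if it is write-tracked or if existing
 * shadow pages cannot be left unsync; otherwise marks the shadow pages
 * unsync and returns false so a writable mapping may be installed.
 */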
2865static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
2866 bool can_unsync)
2867{
2868 struct kvm_mmu_page *sp;
2869
2870 if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
2871 return true;
2872
2873 for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
2874 if (!can_unsync)
2875 return true;
2876
2877 if (sp->unsync)
2878 continue;
2879
2880 WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL);
2881 kvm_unsync_page(vcpu, sp);
2882 }
2883
 /*
  * Ensure the stores above that mark the page unsync are visible before
  * a writable SPTE is installed for this gfn.  kvm_mmu_sync_roots() reads
  * sp->unsync and sp->unsync_children without taking mmu_lock, so without
  * this barrier another vCPU could re-enter the guest through a writable
  * mapping without noticing that the page still needs to be synced.
  */
2921 smp_wmb();
2922
2923 return false;
2924}
2925
2926static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
2927{
2928 if (pfn_valid(pfn))
2929 return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)) &&
 /*
  * Some reserved pages, such as those from NVDIMM DAX devices, are
  * not for MMIO and can be mapped with cached memory type for better
  * performance.  The PageReserved() check above would misidentify
  * them as MMIO and map them UC, so additionally check the host
  * memory type and only treat UC/UC-/WC pages as MMIO.
  */
2940 (!pat_enabled() || pat_pfn_immune_to_uc_mtrr(pfn));
2941
2942 return !e820__mapped_raw_any(pfn_to_hpa(pfn),
2943 pfn_to_hpa(pfn + 1) - 1,
2944 E820_TYPE_RAM);
2945}
2946
/* Bits which may be returned by set_spte() */
2948#define SET_SPTE_WRITE_PROTECTED_PT BIT(0)
2949#define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1)
2950
2951static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2952 unsigned pte_access, int level,
2953 gfn_t gfn, kvm_pfn_t pfn, bool speculative,
2954 bool can_unsync, bool host_writable)
2955{
2956 u64 spte = 0;
2957 int ret = 0;
2958 struct kvm_mmu_page *sp;
2959
2960 if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access))
2961 return 0;
2962
2963 sp = page_header(__pa(sptep));
2964 if (sp_ad_disabled(sp))
2965 spte |= shadow_acc_track_value;
2966
 /*
  * For the EPT case, shadow_present_mask is 0 if hardware
  * supports exec-only page table entries.  In that case,
  * ACC_USER_MASK and shadow_user_mask are used to represent
  * read access.  See FNAME(gpte_access) in paging_tmpl.h.
  */
2973 spte |= shadow_present_mask;
2974 if (!speculative)
2975 spte |= spte_shadow_accessed_mask(spte);
2976
2977 if (pte_access & ACC_EXEC_MASK)
2978 spte |= shadow_x_mask;
2979 else
2980 spte |= shadow_nx_mask;
2981
2982 if (pte_access & ACC_USER_MASK)
2983 spte |= shadow_user_mask;
2984
2985 if (level > PT_PAGE_TABLE_LEVEL)
2986 spte |= PT_PAGE_SIZE_MASK;
2987 if (tdp_enabled)
2988 spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
2989 kvm_is_mmio_pfn(pfn));
2990
2991 if (host_writable)
2992 spte |= SPTE_HOST_WRITEABLE;
2993 else
2994 pte_access &= ~ACC_WRITE_MASK;
2995
2996 if (!kvm_is_mmio_pfn(pfn))
2997 spte |= shadow_me_mask;
2998
2999 spte |= (u64)pfn << PAGE_SHIFT;
3000
3001 if (pte_access & ACC_WRITE_MASK) {
3002
 /*
  * Another vCPU may have created a new sp for this gfn in the window
  * between mapping_level() and acquiring mmu_lock.  Refuse the large
  * mapping and let the guest retry; the access is fixed on refault.
  */
3009 if (level > PT_PAGE_TABLE_LEVEL &&
3010 mmu_gfn_lpage_is_disallowed(vcpu, gfn, level))
3011 goto done;
3012
3013 spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
3014
 /*
  * Optimization: for pte sync, if spte was writable the hash
  * lookup is unnecessary (and expensive). Write protection
  * is responsibility of mmu_get_page / kvm_sync_page.
  * Same reasoning can be applied to dirty page accounting.
  */
3021 if (!can_unsync && is_writable_pte(*sptep))
3022 goto set_pte;
3023
3024 if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
3025 pgprintk("%s: found shadow page for %llx, marking ro\n",
3026 __func__, gfn);
3027 ret |= SET_SPTE_WRITE_PROTECTED_PT;
3028 pte_access &= ~ACC_WRITE_MASK;
3029 spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
3030 }
3031 }
3032
3033 if (pte_access & ACC_WRITE_MASK) {
3034 kvm_vcpu_mark_page_dirty(vcpu, gfn);
3035 spte |= spte_shadow_dirty_mask(spte);
3036 }
3037
3038 if (speculative)
3039 spte = mark_spte_for_access_track(spte);
3040
3041set_pte:
3042 if (mmu_spte_update(sptep, spte))
3043 ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH;
3044done:
3045 return ret;
3046}
3047
3048static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
3049 int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn,
3050 bool speculative, bool host_writable)
3051{
3052 int was_rmapped = 0;
3053 int rmap_count;
3054 int set_spte_ret;
3055 int ret = RET_PF_RETRY;
3056 bool flush = false;
3057
3058 pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
3059 *sptep, write_fault, gfn);
3060
3061 if (is_shadow_present_pte(*sptep)) {
 /*
  * If we overwrite a PTE page pointer with a 2MB PMD, unlink
  * the parent of the now unreachable PTE.
  */
3066 if (level > PT_PAGE_TABLE_LEVEL &&
3067 !is_large_pte(*sptep)) {
3068 struct kvm_mmu_page *child;
3069 u64 pte = *sptep;
3070
3071 child = page_header(pte & PT64_BASE_ADDR_MASK);
3072 drop_parent_pte(child, sptep);
3073 flush = true;
3074 } else if (pfn != spte_to_pfn(*sptep)) {
3075 pgprintk("hfn old %llx new %llx\n",
3076 spte_to_pfn(*sptep), pfn);
3077 drop_spte(vcpu->kvm, sptep);
3078 flush = true;
3079 } else
3080 was_rmapped = 1;
3081 }
3082
3083 set_spte_ret = set_spte(vcpu, sptep, pte_access, level, gfn, pfn,
3084 speculative, true, host_writable);
3085 if (set_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
3086 if (write_fault)
3087 ret = RET_PF_EMULATE;
3088 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
3089 }
3090
3091 if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH || flush)
3092 kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn,
3093 KVM_PAGES_PER_HPAGE(level));
3094
3095 if (unlikely(is_mmio_spte(*sptep)))
3096 ret = RET_PF_EMULATE;
3097
3098 pgprintk("%s: setting spte %llx\n", __func__, *sptep);
3099 trace_kvm_mmu_set_spte(level, gfn, sptep);
3100 if (!was_rmapped && is_large_pte(*sptep))
3101 ++vcpu->kvm->stat.lpages;
3102
3103 if (is_shadow_present_pte(*sptep)) {
3104 if (!was_rmapped) {
3105 rmap_count = rmap_add(vcpu, sptep, gfn);
3106 if (rmap_count > RMAP_RECYCLE_THRESHOLD)
3107 rmap_recycle(vcpu, sptep, gfn);
3108 }
3109 }
3110
3111 return ret;
3112}
3113
3114static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
3115 bool no_dirty_log)
3116{
3117 struct kvm_memory_slot *slot;
3118
3119 slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
3120 if (!slot)
3121 return KVM_PFN_ERR_FAULT;
3122
3123 return gfn_to_pfn_memslot_atomic(slot, gfn);
3124}
3125
3126static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
3127 struct kvm_mmu_page *sp,
3128 u64 *start, u64 *end)
3129{
3130 struct page *pages[PTE_PREFETCH_NUM];
3131 struct kvm_memory_slot *slot;
3132 unsigned access = sp->role.access;
3133 int i, ret;
3134 gfn_t gfn;
3135
3136 gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
3137 slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK);
3138 if (!slot)
3139 return -1;
3140
3141 ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start);
3142 if (ret <= 0)
3143 return -1;
3144
3145 for (i = 0; i < ret; i++, gfn++, start++) {
3146 mmu_set_spte(vcpu, start, access, 0, sp->role.level, gfn,
3147 page_to_pfn(pages[i]), true, true);
3148 put_page(pages[i]);
3149 }
3150
3151 return 0;
3152}
3153
3154static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
3155 struct kvm_mmu_page *sp, u64 *sptep)
3156{
3157 u64 *spte, *start = NULL;
3158 int i;
3159
3160 WARN_ON(!sp->role.direct);
3161
3162 i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
3163 spte = sp->spt + i;
3164
3165 for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
3166 if (is_shadow_present_pte(*spte) || spte == sptep) {
3167 if (!start)
3168 continue;
3169 if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
3170 break;
3171 start = NULL;
3172 } else if (!start)
3173 start = spte;
3174 }
3175}
3176
3177static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
3178{
3179 struct kvm_mmu_page *sp;
3180
3181 sp = page_header(__pa(sptep));
3182
 /*
  * Without accessed bits, there's no way to distinguish between
  * actually accessed translations and prefetched, so disable pte
  * prefetch if accessed bits aren't available.
  */
3188 if (sp_ad_disabled(sp))
3189 return;
3190
3191 if (sp->role.level > PT_PAGE_TABLE_LEVEL)
3192 return;
3193
3194 __direct_pte_prefetch(vcpu, sp, sptep);
3195}
3196
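/*
 * Walk the shadow page table for @gpa, allocating and linking intermediate
 * shadow pages until @level is reached, then install the final SPTE for @pfn.
 */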
3197static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
3198 int map_writable, int level, kvm_pfn_t pfn,
3199 bool prefault)
3200{
3201 struct kvm_shadow_walk_iterator it;
3202 struct kvm_mmu_page *sp;
3203 int ret;
3204 gfn_t gfn = gpa >> PAGE_SHIFT;
3205 gfn_t base_gfn = gfn;
3206
3207 if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
3208 return RET_PF_RETRY;
3209
3210 trace_kvm_mmu_spte_requested(gpa, level, pfn);
3211 for_each_shadow_entry(vcpu, gpa, it) {
3212 base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
3213 if (it.level == level)
3214 break;
3215
3216 drop_large_spte(vcpu, it.sptep);
3217 if (!is_shadow_present_pte(*it.sptep)) {
3218 sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr,
3219 it.level - 1, true, ACC_ALL);
3220
3221 link_shadow_page(vcpu, it.sptep, sp);
3222 }
3223 }
3224
3225 ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL,
3226 write, level, base_gfn, pfn, prefault,
3227 map_writable);
3228 direct_pte_prefetch(vcpu, it.sptep);
3229 ++vcpu->stat.pf_fixed;
3230 return ret;
3231}
3232
3233static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
3234{
3235 send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, PAGE_SHIFT, tsk);
3236}
3237
3238static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
3239{
 /*
  * Do not cache MMIO info for a write to a read-only gfn, otherwise a
  * later read access to the same gfn would also be treated as MMIO.
  */
3245 if (pfn == KVM_PFN_ERR_RO_FAULT)
3246 return RET_PF_EMULATE;
3247
3248 if (pfn == KVM_PFN_ERR_HWPOISON) {
3249 kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current);
3250 return RET_PF_RETRY;
3251 }
3252
3253 return -EFAULT;
3254}
3255
3256static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
3257 gfn_t gfn, kvm_pfn_t *pfnp,
3258 int *levelp)
3259{
3260 kvm_pfn_t pfn = *pfnp;
3261 int level = *levelp;
3262
 /*
  * Check if it's a transparent hugepage.  If this were a hugetlbfs
  * page, level would not be PT_PAGE_TABLE_LEVEL and no adjustment
  * would be done here.
  */
3269 if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
3270 level == PT_PAGE_TABLE_LEVEL &&
3271 PageTransCompoundMap(pfn_to_page(pfn)) &&
3272 !mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) {
3273 unsigned long mask;
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283 *levelp = level = PT_DIRECTORY_LEVEL;
3284 mask = KVM_PAGES_PER_HPAGE(level) - 1;
3285 VM_BUG_ON((gfn & mask) != (pfn & mask));
3286 if (pfn & mask) {
3287 kvm_release_pfn_clean(pfn);
3288 pfn &= ~mask;
3289 kvm_get_pfn(pfn);
3290 *pfnp = pfn;
3291 }
3292 }
3293}
3294
3295static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
3296 kvm_pfn_t pfn, unsigned access, int *ret_val)
3297{
 /* The pfn is invalid, report the error! */
3299 if (unlikely(is_error_pfn(pfn))) {
3300 *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn);
3301 return true;
3302 }
3303
3304 if (unlikely(is_noslot_pfn(pfn)))
3305 vcpu_cache_mmio_info(vcpu, gva, gfn, access);
3306
3307 return false;
3308}
3309
3310static bool page_fault_can_be_fast(u32 error_code)
3311{
 /*
  * Do not fix an MMIO spte with an invalid generation number; it must
  * be updated by the slow page fault path.
  */
3316 if (unlikely(error_code & PFERR_RSVD_MASK))
3317 return false;
3318
 /* See if the page fault is due to an NX violation */
3320 if (unlikely(((error_code & (PFERR_FETCH_MASK | PFERR_PRESENT_MASK))
3321 == (PFERR_FETCH_MASK | PFERR_PRESENT_MASK))))
3322 return false;
3323
 /*
  * #PF can be fast if:
  * 1. The shadow page table entry is not present, which could mean that
  *    the fault is potentially caused by access tracking (if enabled).
  * 2. The shadow page table entry is present and the fault
  *    is caused by write-protect, that means we just need change the W
  *    bit of the spte which can be done out of mmu-lock.
  *
  * However, if access tracking is disabled we know that a non-present
  * page must be a genuine page fault where we have to create a new SPTE.
  * So, if access tracking is disabled, we return true only for write
  * accesses to a present page.
  */
3338 return shadow_acc_track_mask != 0 ||
3339 ((error_code & (PFERR_WRITE_MASK | PFERR_PRESENT_MASK))
3340 == (PFERR_WRITE_MASK | PFERR_PRESENT_MASK));
3341}
3342
/*
 * Returns true if the SPTE was fixed successfully. Otherwise,
 * someone else modified the SPTE from its original value.
 */
3347static bool
3348fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
3349 u64 *sptep, u64 old_spte, u64 new_spte)
3350{
3351 gfn_t gfn;
3352
3353 WARN_ON(!sp->role.direct);
3354
 /*
  * Use cmpxchg so the SPTE is updated only if it still holds the old
  * value; if another CPU zapped or modified it concurrently, give up
  * and let the fault be retried.
  */
3367 if (cmpxchg64(sptep, old_spte, new_spte) != old_spte)
3368 return false;
3369
3370 if (is_writable_pte(new_spte) && !is_writable_pte(old_spte)) {
 /*
  * The gfn of direct spte is stable since it is
  * calculated by sp->gfn.
  */
3375 gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
3376 kvm_vcpu_mark_page_dirty(vcpu, gfn);
3377 }
3378
3379 return true;
3380}
3381
3382static bool is_access_allowed(u32 fault_err_code, u64 spte)
3383{
3384 if (fault_err_code & PFERR_FETCH_MASK)
3385 return is_executable_pte(spte);
3386
3387 if (fault_err_code & PFERR_WRITE_MASK)
3388 return is_writable_pte(spte);
3389
 /* Fault was on Read access */
3391 return spte & PT_PRESENT_MASK;
3392}
3393
3394
/*
 * Return value:
 * - true: the fault was handled; let the vCPU retry the access.
 * - false: fall back to the full page fault path.
 */
3399static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
3400 u32 error_code)
3401{
3402 struct kvm_shadow_walk_iterator iterator;
3403 struct kvm_mmu_page *sp;
3404 bool fault_handled = false;
3405 u64 spte = 0ull;
3406 uint retry_count = 0;
3407
3408 if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
3409 return false;
3410
3411 if (!page_fault_can_be_fast(error_code))
3412 return false;
3413
3414 walk_shadow_page_lockless_begin(vcpu);
3415
3416 do {
3417 u64 new_spte;
3418
3419 for_each_shadow_entry_lockless(vcpu, gva, iterator, spte)
3420 if (!is_shadow_present_pte(spte) ||
3421 iterator.level < level)
3422 break;
3423
3424 sp = page_header(__pa(iterator.sptep));
3425 if (!is_last_spte(spte, sp->role.level))
3426 break;
3427
 /*
  * Check whether the memory access that caused the fault would
  * still cause it if it were to be performed right now. If not,
  * then this is a spurious fault caused by TLB lazily flushed,
  * or some other CPU has already fixed the PTE after the
  * current CPU took the fault.
  *
  * Need not check the access of upper level table entries since
  * they are always ACC_ALL.
  */
3438 if (is_access_allowed(error_code, spte)) {
3439 fault_handled = true;
3440 break;
3441 }
3442
3443 new_spte = spte;
3444
3445 if (is_access_track_spte(spte))
3446 new_spte = restore_acc_track_spte(new_spte);
3447
 /*
  * Currently, to simplify the code, write-protection can
  * be removed in the fast path only if the SPTE was
  * write-protected for dirty-logging or access tracking.
  */
3453 if ((error_code & PFERR_WRITE_MASK) &&
3454 spte_can_locklessly_be_made_writable(spte))
3455 {
3456 new_spte |= PT_WRITABLE_MASK;
3457
 /*
  * Do not fix write-permission on the large spte.  Since
  * we only dirty the first page into the dirty-bitmap in
  * fast_pf_fix_direct_spte(), other pages are missed
  * if its slot has dirty logging enabled.
  *
  * Instead, we let the slow page fault path create a
  * normal spte to fix the access.
  */
3469 if (sp->role.level > PT_PAGE_TABLE_LEVEL)
3470 break;
3471 }
3472
 /* Verify that the fault can be handled in the fast path */
3474 if (new_spte == spte ||
3475 !is_access_allowed(error_code, new_spte))
3476 break;
3477
 /*
  * Currently, fast page fault only works for direct mappings, since
  * the gfn is not stable for indirect shadow pages; see the KVM MMU
  * locking documentation for details.
  */
3483 fault_handled = fast_pf_fix_direct_spte(vcpu, sp,
3484 iterator.sptep, spte,
3485 new_spte);
3486 if (fault_handled)
3487 break;
3488
3489 if (++retry_count > 4) {
3490 printk_once(KERN_WARNING
3491 "kvm: Fast #PF retrying more than 4 times.\n");
3492 break;
3493 }
3494
3495 } while (true);
3496
3497 trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep,
3498 spte, fault_handled);
3499 walk_shadow_page_lockless_end(vcpu);
3500
3501 return fault_handled;
3502}
3503
3504static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
3505 gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable);
3506static int make_mmu_pages_available(struct kvm_vcpu *vcpu);
3507
3508static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
3509 gfn_t gfn, bool prefault)
3510{
3511 int r;
3512 int level;
3513 bool force_pt_level = false;
3514 kvm_pfn_t pfn;
3515 unsigned long mmu_seq;
3516 bool map_writable, write = error_code & PFERR_WRITE_MASK;
3517
3518 level = mapping_level(vcpu, gfn, &force_pt_level);
3519 if (likely(!force_pt_level)) {
 /*
  * This path builds a PAE pagetable, so we can map
  * 2MB pages at maximum. Therefore check if the level
  * is larger than that.
  */
3525 if (level > PT_DIRECTORY_LEVEL)
3526 level = PT_DIRECTORY_LEVEL;
3527
3528 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
3529 }
3530
3531 if (fast_page_fault(vcpu, v, level, error_code))
3532 return RET_PF_RETRY;
3533
3534 mmu_seq = vcpu->kvm->mmu_notifier_seq;
3535 smp_rmb();
3536
3537 if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
3538 return RET_PF_RETRY;
3539
3540 if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r))
3541 return r;
3542
3543 r = RET_PF_RETRY;
3544 spin_lock(&vcpu->kvm->mmu_lock);
3545 if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
3546 goto out_unlock;
3547 if (make_mmu_pages_available(vcpu) < 0)
3548 goto out_unlock;
3549 if (likely(!force_pt_level))
3550 transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
3551 r = __direct_map(vcpu, v, write, map_writable, level, pfn, prefault);
3552out_unlock:
3553 spin_unlock(&vcpu->kvm->mmu_lock);
3554 kvm_release_pfn_clean(pfn);
3555 return r;
3556}
3557
3558static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
3559 struct list_head *invalid_list)
3560{
3561 struct kvm_mmu_page *sp;
3562
3563 if (!VALID_PAGE(*root_hpa))
3564 return;
3565
3566 sp = page_header(*root_hpa & PT64_BASE_ADDR_MASK);
3567 --sp->root_count;
3568 if (!sp->root_count && sp->role.invalid)
3569 kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
3570
3571 *root_hpa = INVALID_PAGE;
3572}
3573
/* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */
3575void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
3576 ulong roots_to_free)
3577{
3578 int i;
3579 LIST_HEAD(invalid_list);
3580 bool free_active_root = roots_to_free & KVM_MMU_ROOT_CURRENT;
3581
3582 BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG);
3583
 /* Before acquiring the MMU lock, see if we need to do any work. */
3585 if (!(free_active_root && VALID_PAGE(mmu->root_hpa))) {
3586 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
3587 if ((roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) &&
3588 VALID_PAGE(mmu->prev_roots[i].hpa))
3589 break;
3590
3591 if (i == KVM_MMU_NUM_PREV_ROOTS)
3592 return;
3593 }
3594
3595 spin_lock(&vcpu->kvm->mmu_lock);
3596
3597 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
3598 if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i))
3599 mmu_free_root_page(vcpu->kvm, &mmu->prev_roots[i].hpa,
3600 &invalid_list);
3601
3602 if (free_active_root) {
3603 if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
3604 (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) {
3605 mmu_free_root_page(vcpu->kvm, &mmu->root_hpa,
3606 &invalid_list);
3607 } else {
3608 for (i = 0; i < 4; ++i)
3609 if (mmu->pae_root[i] != 0)
3610 mmu_free_root_page(vcpu->kvm,
3611 &mmu->pae_root[i],
3612 &invalid_list);
3613 mmu->root_hpa = INVALID_PAGE;
3614 }
3615 mmu->root_cr3 = 0;
3616 }
3617
3618 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
3619 spin_unlock(&vcpu->kvm->mmu_lock);
3620}
3621EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);
3622
3623static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
3624{
3625 int ret = 0;
3626
3627 if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
3628 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
3629 ret = 1;
3630 }
3631
3632 return ret;
3633}
3634
3635static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
3636{
3637 struct kvm_mmu_page *sp;
3638 unsigned i;
3639
3640 if (vcpu->arch.mmu->shadow_root_level >= PT64_ROOT_4LEVEL) {
3641 spin_lock(&vcpu->kvm->mmu_lock);
3642 if (make_mmu_pages_available(vcpu) < 0) {
3643 spin_unlock(&vcpu->kvm->mmu_lock);
3644 return -ENOSPC;
3645 }
3646 sp = kvm_mmu_get_page(vcpu, 0, 0,
3647 vcpu->arch.mmu->shadow_root_level, 1, ACC_ALL);
3648 ++sp->root_count;
3649 spin_unlock(&vcpu->kvm->mmu_lock);
3650 vcpu->arch.mmu->root_hpa = __pa(sp->spt);
3651 } else if (vcpu->arch.mmu->shadow_root_level == PT32E_ROOT_LEVEL) {
3652 for (i = 0; i < 4; ++i) {
3653 hpa_t root = vcpu->arch.mmu->pae_root[i];
3654
3655 MMU_WARN_ON(VALID_PAGE(root));
3656 spin_lock(&vcpu->kvm->mmu_lock);
3657 if (make_mmu_pages_available(vcpu) < 0) {
3658 spin_unlock(&vcpu->kvm->mmu_lock);
3659 return -ENOSPC;
3660 }
3661 sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
3662 i << 30, PT32_ROOT_LEVEL, 1, ACC_ALL);
3663 root = __pa(sp->spt);
3664 ++sp->root_count;
3665 spin_unlock(&vcpu->kvm->mmu_lock);
3666 vcpu->arch.mmu->pae_root[i] = root | PT_PRESENT_MASK;
3667 }
3668 vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
3669 } else
3670 BUG();
3671 vcpu->arch.mmu->root_cr3 = vcpu->arch.mmu->get_cr3(vcpu);
3672
3673 return 0;
3674}
3675
3676static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
3677{
3678 struct kvm_mmu_page *sp;
3679 u64 pdptr, pm_mask;
3680 gfn_t root_gfn, root_cr3;
3681 int i;
3682
3683 root_cr3 = vcpu->arch.mmu->get_cr3(vcpu);
3684 root_gfn = root_cr3 >> PAGE_SHIFT;
3685
3686 if (mmu_check_root(vcpu, root_gfn))
3687 return 1;
3688
 /*
  * Do we shadow a 4- or 5-level guest page table?  If so, a single
  * shadow root page at shadow_root_level is used.
  */
3693 if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
3694 hpa_t root = vcpu->arch.mmu->root_hpa;
3695
3696 MMU_WARN_ON(VALID_PAGE(root));
3697
3698 spin_lock(&vcpu->kvm->mmu_lock);
3699 if (make_mmu_pages_available(vcpu) < 0) {
3700 spin_unlock(&vcpu->kvm->mmu_lock);
3701 return -ENOSPC;
3702 }
3703 sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
3704 vcpu->arch.mmu->shadow_root_level, 0, ACC_ALL);
3705 root = __pa(sp->spt);
3706 ++sp->root_count;
3707 spin_unlock(&vcpu->kvm->mmu_lock);
3708 vcpu->arch.mmu->root_hpa = root;
3709 goto set_root_cr3;
3710 }
3711
 /*
  * We shadow a 32 bit page table. This may be a legacy 2-level
  * or a PAE 3-level page table. In either case we need to be aware that
  * the shadow page table may be a PAE or a long mode page table.
  */
3717 pm_mask = PT_PRESENT_MASK;
3718 if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL)
3719 pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
3720
3721 for (i = 0; i < 4; ++i) {
3722 hpa_t root = vcpu->arch.mmu->pae_root[i];
3723
3724 MMU_WARN_ON(VALID_PAGE(root));
3725 if (vcpu->arch.mmu->root_level == PT32E_ROOT_LEVEL) {
3726 pdptr = vcpu->arch.mmu->get_pdptr(vcpu, i);
3727 if (!(pdptr & PT_PRESENT_MASK)) {
3728 vcpu->arch.mmu->pae_root[i] = 0;
3729 continue;
3730 }
3731 root_gfn = pdptr >> PAGE_SHIFT;
3732 if (mmu_check_root(vcpu, root_gfn))
3733 return 1;
3734 }
3735 spin_lock(&vcpu->kvm->mmu_lock);
3736 if (make_mmu_pages_available(vcpu) < 0) {
3737 spin_unlock(&vcpu->kvm->mmu_lock);
3738 return -ENOSPC;
3739 }
3740 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL,
3741 0, ACC_ALL);
3742 root = __pa(sp->spt);
3743 ++sp->root_count;
3744 spin_unlock(&vcpu->kvm->mmu_lock);
3745
3746 vcpu->arch.mmu->pae_root[i] = root | pm_mask;
3747 }
3748 vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
3749
 /*
  * If we shadow a 32 bit page table with a long mode page
  * table we enter this path.
  */
3754 if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL) {
3755 if (vcpu->arch.mmu->lm_root == NULL) {
 /*
  * The additional page necessary for this is only
  * allocated on demand.
  */
3761 u64 *lm_root;
3762
3763 lm_root = (void*)get_zeroed_page(GFP_KERNEL_ACCOUNT);
3764 if (lm_root == NULL)
3765 return 1;
3766
3767 lm_root[0] = __pa(vcpu->arch.mmu->pae_root) | pm_mask;
3768
3769 vcpu->arch.mmu->lm_root = lm_root;
3770 }
3771
3772 vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->lm_root);
3773 }
3774
3775set_root_cr3:
3776 vcpu->arch.mmu->root_cr3 = root_cr3;
3777
3778 return 0;
3779}
3780
3781static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
3782{
3783 if (vcpu->arch.mmu->direct_map)
3784 return mmu_alloc_direct_roots(vcpu);
3785 else
3786 return mmu_alloc_shadow_roots(vcpu);
3787}
3788
3789void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
3790{
3791 int i;
3792 struct kvm_mmu_page *sp;
3793
3794 if (vcpu->arch.mmu->direct_map)
3795 return;
3796
3797 if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
3798 return;
3799
3800 vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
3801
3802 if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
3803 hpa_t root = vcpu->arch.mmu->root_hpa;
3804 sp = page_header(root);
3805
 /*
  * sp->unsync and sp->unsync_children are read without holding
  * mmu_lock; the acquire pairs with the smp_wmb() in
  * mmu_need_write_protect(), so if another CPU is in the middle of
  * marking the page unsync, the flag is observed before any writable
  * SPTE it installs can be used by the guest.
  */
3816 if (!smp_load_acquire(&sp->unsync) &&
3817 !smp_load_acquire(&sp->unsync_children))
3818 return;
3819
3820 spin_lock(&vcpu->kvm->mmu_lock);
3821 kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
3822
3823 mmu_sync_children(vcpu, sp);
3824
3825 kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
3826 spin_unlock(&vcpu->kvm->mmu_lock);
3827 return;
3828 }
3829
3830 spin_lock(&vcpu->kvm->mmu_lock);
3831 kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
3832
3833 for (i = 0; i < 4; ++i) {
3834 hpa_t root = vcpu->arch.mmu->pae_root[i];
3835
3836 if (root && VALID_PAGE(root)) {
3837 root &= PT64_BASE_ADDR_MASK;
3838 sp = page_header(root);
3839 mmu_sync_children(vcpu, sp);
3840 }
3841 }
3842
3843 kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
3844 spin_unlock(&vcpu->kvm->mmu_lock);
3845}
3846EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots);
3847
3848static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
3849 u32 access, struct x86_exception *exception)
3850{
3851 if (exception)
3852 exception->error_code = 0;
3853 return vaddr;
3854}
3855
3856static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
3857 u32 access,
3858 struct x86_exception *exception)
3859{
3860 if (exception)
3861 exception->error_code = 0;
3862 return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access, exception);
3863}
3864
3865static bool
3866__is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check, u64 pte, int level)
3867{
3868 int bit7 = (pte >> 7) & 1, low6 = pte & 0x3f;
3869
3870 return (pte & rsvd_check->rsvd_bits_mask[bit7][level-1]) |
3871 ((rsvd_check->bad_mt_xwr & (1ull << low6)) != 0);
3872}
3873
3874static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
3875{
3876 return __is_rsvd_bits_set(&mmu->guest_rsvd_check, gpte, level);
3877}
3878
3879static bool is_shadow_zero_bits_set(struct kvm_mmu *mmu, u64 spte, int level)
3880{
3881 return __is_rsvd_bits_set(&mmu->shadow_zero_check, spte, level);
3882}
3883
3884static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
3885{
 /*
  * A nested guest cannot use the MMIO cache if it is using nested
  * page tables, because cr2 is a nGPA while the cache stores GPAs.
  */
3890 if (mmu_is_nested(vcpu))
3891 return false;
3892
3893 if (direct)
3894 return vcpu_match_mmio_gpa(vcpu, addr);
3895
3896 return vcpu_match_mmio_gva(vcpu, addr);
3897}
3898
/* Return true if a reserved bit is detected on the spte. */
3900static bool
3901walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
3902{
3903 struct kvm_shadow_walk_iterator iterator;
3904 u64 sptes[PT64_ROOT_MAX_LEVEL], spte = 0ull;
3905 int root, leaf;
3906 bool reserved = false;
3907
3908 if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
3909 goto exit;
3910
3911 walk_shadow_page_lockless_begin(vcpu);
3912
3913 for (shadow_walk_init(&iterator, vcpu, addr),
3914 leaf = root = iterator.level;
3915 shadow_walk_okay(&iterator);
3916 __shadow_walk_next(&iterator, spte)) {
3917 spte = mmu_spte_get_lockless(iterator.sptep);
3918
3919 sptes[leaf - 1] = spte;
3920 leaf--;
3921
3922 if (!is_shadow_present_pte(spte))
3923 break;
3924
3925 reserved |= is_shadow_zero_bits_set(vcpu->arch.mmu, spte,
3926 iterator.level);
3927 }
3928
3929 walk_shadow_page_lockless_end(vcpu);
3930
3931 if (reserved) {
3932 pr_err("%s: detect reserved bits on spte, addr 0x%llx, dump hierarchy:\n",
3933 __func__, addr);
3934 while (root > leaf) {
3935 pr_err("------ spte 0x%llx level %d.\n",
3936 sptes[root - 1], root);
3937 root--;
3938 }
3939 }
3940exit:
3941 *sptep = spte;
3942 return reserved;
3943}
3944
3945static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
3946{
3947 u64 spte;
3948 bool reserved;
3949
3950 if (mmio_info_in_cache(vcpu, addr, direct))
3951 return RET_PF_EMULATE;
3952
3953 reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte);
3954 if (WARN_ON(reserved))
3955 return -EINVAL;
3956
3957 if (is_mmio_spte(spte)) {
3958 gfn_t gfn = get_mmio_spte_gfn(spte);
3959 unsigned access = get_mmio_spte_access(spte);
3960
3961 if (!check_mmio_spte(vcpu, spte))
3962 return RET_PF_INVALID;
3963
3964 if (direct)
3965 addr = 0;
3966
3967 trace_handle_mmio_page_fault(addr, gfn, access);
3968 vcpu_cache_mmio_info(vcpu, addr, gfn, access);
3969 return RET_PF_EMULATE;
3970 }
3971
 /*
  * If the page table has been zapped by another CPU, let the vCPU
  * fault again on the address instead of emulating.
  */
3976 return RET_PF_RETRY;
3977}
3978
3979static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu,
3980 u32 error_code, gfn_t gfn)
3981{
3982 if (unlikely(error_code & PFERR_RSVD_MASK))
3983 return false;
3984
3985 if (!(error_code & PFERR_PRESENT_MASK) ||
3986 !(error_code & PFERR_WRITE_MASK))
3987 return false;
3988
 /*
  * The gfn is write-tracked, so writes to it cannot be fixed up by
  * the page fault handler; they must be emulated.
  */
3993 if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
3994 return true;
3995
3996 return false;
3997}
3998
3999static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
4000{
4001 struct kvm_shadow_walk_iterator iterator;
4002 u64 spte;
4003
4004 if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
4005 return;
4006
4007 walk_shadow_page_lockless_begin(vcpu);
4008 for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
4009 clear_sp_write_flooding_count(iterator.sptep);
4010 if (!is_shadow_present_pte(spte))
4011 break;
4012 }
4013 walk_shadow_page_lockless_end(vcpu);
4014}
4015
4016static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
4017 u32 error_code, bool prefault)
4018{
4019 gfn_t gfn = gva >> PAGE_SHIFT;
4020 int r;
4021
4022 pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
4023
4024 if (page_fault_handle_page_track(vcpu, error_code, gfn))
4025 return RET_PF_EMULATE;
4026
4027 r = mmu_topup_memory_caches(vcpu);
4028 if (r)
4029 return r;
4030
4031 MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa));
4032
4033
4034 return nonpaging_map(vcpu, gva & PAGE_MASK,
4035 error_code, gfn, prefault);
4036}
4037
4038static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
4039{
4040 struct kvm_arch_async_pf arch;
4041
4042 arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
4043 arch.gfn = gfn;
4044 arch.direct_map = vcpu->arch.mmu->direct_map;
4045 arch.cr3 = vcpu->arch.mmu->get_cr3(vcpu);
4046
4047 return kvm_setup_async_pf(vcpu, gva, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
4048}
4049
4050static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
4051 gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable)
4052{
4053 struct kvm_memory_slot *slot;
4054 bool async;
4055
 /* Don't expose private memslots to L2. */
4059 if (is_guest_mode(vcpu) && !kvm_is_visible_gfn(vcpu->kvm, gfn)) {
4060 *pfn = KVM_PFN_NOSLOT;
4061 return false;
4062 }
4063
4064 slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
4065 async = false;
4066 *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable);
4067 if (!async)
4068 return false;
4069
4070 if (!prefault && kvm_can_do_async_pf(vcpu)) {
4071 trace_kvm_try_async_get_page(gva, gfn);
4072 if (kvm_find_async_pf_gfn(vcpu, gfn)) {
4073 trace_kvm_async_pf_doublefault(gva, gfn);
4074 kvm_make_request(KVM_REQ_APF_HALT, vcpu);
4075 return true;
4076 } else if (kvm_arch_setup_async_pf(vcpu, gva, gfn))
4077 return true;
4078 }
4079
4080 *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable);
4081 return false;
4082}
4083
4084int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
4085 u64 fault_address, char *insn, int insn_len)
4086{
4087 int r = 1;
4088
4089 vcpu->arch.l1tf_flush_l1d = true;
4090 switch (vcpu->arch.apf.host_apf_reason) {
4091 default:
4092 trace_kvm_page_fault(fault_address, error_code);
4093
4094 if (kvm_event_needs_reinjection(vcpu))
4095 kvm_mmu_unprotect_page_virt(vcpu, fault_address);
4096 r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn,
4097 insn_len);
4098 break;
4099 case KVM_PV_REASON_PAGE_NOT_PRESENT:
4100 vcpu->arch.apf.host_apf_reason = 0;
4101 local_irq_disable();
4102 kvm_async_pf_task_wait(fault_address, 0);
4103 local_irq_enable();
4104 break;
4105 case KVM_PV_REASON_PAGE_READY:
4106 vcpu->arch.apf.host_apf_reason = 0;
4107 local_irq_disable();
4108 kvm_async_pf_task_wake(fault_address);
4109 local_irq_enable();
4110 break;
4111 }
4112 return r;
4113}
4114EXPORT_SYMBOL_GPL(kvm_handle_page_fault);
4115
4116static bool
4117check_hugepage_cache_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int level)
4118{
4119 int page_num = KVM_PAGES_PER_HPAGE(level);
4120
4121 gfn &= ~(page_num - 1);
4122
4123 return kvm_mtrr_check_gfn_range_consistency(vcpu, gfn, page_num);
4124}
4125
4126static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
4127 bool prefault)
4128{
4129 kvm_pfn_t pfn;
4130 int r;
4131 int level;
4132 bool force_pt_level;
4133 gfn_t gfn = gpa >> PAGE_SHIFT;
4134 unsigned long mmu_seq;
4135 int write = error_code & PFERR_WRITE_MASK;
4136 bool map_writable;
4137
4138 MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa));
4139
4140 if (page_fault_handle_page_track(vcpu, error_code, gfn))
4141 return RET_PF_EMULATE;
4142
4143 r = mmu_topup_memory_caches(vcpu);
4144 if (r)
4145 return r;
4146
4147 force_pt_level = !check_hugepage_cache_consistency(vcpu, gfn,
4148 PT_DIRECTORY_LEVEL);
4149 level = mapping_level(vcpu, gfn, &force_pt_level);
4150 if (likely(!force_pt_level)) {
4151 if (level > PT_DIRECTORY_LEVEL &&
4152 !check_hugepage_cache_consistency(vcpu, gfn, level))
4153 level = PT_DIRECTORY_LEVEL;
4154 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
4155 }
4156
4157 if (fast_page_fault(vcpu, gpa, level, error_code))
4158 return RET_PF_RETRY;
4159
4160 mmu_seq = vcpu->kvm->mmu_notifier_seq;
4161 smp_rmb();
4162
4163 if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
4164 return RET_PF_RETRY;
4165
4166 if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r))
4167 return r;
4168
4169 r = RET_PF_RETRY;
4170 spin_lock(&vcpu->kvm->mmu_lock);
4171 if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
4172 goto out_unlock;
4173 if (make_mmu_pages_available(vcpu) < 0)
4174 goto out_unlock;
4175 if (likely(!force_pt_level))
4176 transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
4177 r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, prefault);
4178out_unlock:
4179 spin_unlock(&vcpu->kvm->mmu_lock);
4180 kvm_release_pfn_clean(pfn);
4181 return r;
4182}
4183
4184static void nonpaging_init_context(struct kvm_vcpu *vcpu,
4185 struct kvm_mmu *context)
4186{
4187 context->page_fault = nonpaging_page_fault;
4188 context->gva_to_gpa = nonpaging_gva_to_gpa;
4189 context->sync_page = nonpaging_sync_page;
4190 context->invlpg = nonpaging_invlpg;
4191 context->update_pte = nonpaging_update_pte;
4192 context->root_level = 0;
4193 context->shadow_root_level = PT32E_ROOT_LEVEL;
4194 context->direct_map = true;
4195 context->nx = false;
4196}
4197
/*
 * Find out if a previously cached root matching the new CR3/role is available.
 * The current root is also inserted into the cache.
 * If a matching root was found, it is assigned to kvm_mmu->root_hpa and true is
 * returned.
 * Otherwise, the LRU root from the cache is assigned to kvm_mmu->root_hpa and
 * false is returned. This root should now be freed by the caller.
 */
4206static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_cr3,
4207 union kvm_mmu_page_role new_role)
4208{
4209 uint i;
4210 struct kvm_mmu_root_info root;
4211 struct kvm_mmu *mmu = vcpu->arch.mmu;
4212
4213 root.cr3 = mmu->root_cr3;
4214 root.hpa = mmu->root_hpa;
4215
4216 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
4217 swap(root, mmu->prev_roots[i]);
4218
4219 if (new_cr3 == root.cr3 && VALID_PAGE(root.hpa) &&
4220 page_header(root.hpa) != NULL &&
4221 new_role.word == page_header(root.hpa)->role.word)
4222 break;
4223 }
4224
4225 mmu->root_hpa = root.hpa;
4226 mmu->root_cr3 = root.cr3;
4227
4228 return i < KVM_MMU_NUM_PREV_ROOTS;
4229}
4230
4231static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3,
4232 union kvm_mmu_page_role new_role,
4233 bool skip_tlb_flush)
4234{
4235 struct kvm_mmu *mmu = vcpu->arch.mmu;
4236
 /*
  * For now, limit the fast switch to 64-bit hosts+VMs in order to avoid
  * having to deal with PDPTEs. We may add support for 32-bit hosts/VMs
  * later if necessary.
  */
4242 if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
4243 mmu->root_level >= PT64_ROOT_4LEVEL) {
4244 if (mmu_check_root(vcpu, new_cr3 >> PAGE_SHIFT))
4245 return false;
4246
4247 if (cached_root_available(vcpu, new_cr3, new_role)) {
4248
4249
4250
4251
4252
4253
4254
4255 kvm_make_request(KVM_REQ_LOAD_CR3, vcpu);
4256 if (!skip_tlb_flush) {
4257 kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
4258 kvm_x86_ops->tlb_flush(vcpu, true);
4259 }
4260
 /*
  * The last MMIO access's GVA and GPA are cached in the
  * VCPU. When switching to a new CR3, that GVA->GPA
  * mapping may no longer be valid. So clear any cached
  * MMIO info even when we don't need to sync the shadow
  * page tables.
  */
4268 vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
4269
4270 __clear_sp_write_flooding_count(
4271 page_header(mmu->root_hpa));
4272
4273 return true;
4274 }
4275 }
4276
4277 return false;
4278}
4279
4280static void __kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3,
4281 union kvm_mmu_page_role new_role,
4282 bool skip_tlb_flush)
4283{
4284 if (!fast_cr3_switch(vcpu, new_cr3, new_role, skip_tlb_flush))
4285 kvm_mmu_free_roots(vcpu, vcpu->arch.mmu,
4286 KVM_MMU_ROOT_CURRENT);
4287}
4288
4289void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush)
4290{
4291 __kvm_mmu_new_cr3(vcpu, new_cr3, kvm_mmu_calc_root_page_role(vcpu),
4292 skip_tlb_flush);
4293}
4294EXPORT_SYMBOL_GPL(kvm_mmu_new_cr3);
4295
4296static unsigned long get_cr3(struct kvm_vcpu *vcpu)
4297{
4298 return kvm_read_cr3(vcpu);
4299}
4300
4301static void inject_page_fault(struct kvm_vcpu *vcpu,
4302 struct x86_exception *fault)
4303{
4304 vcpu->arch.mmu->inject_page_fault(vcpu, fault);
4305}
4306
4307static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
4308 unsigned access, int *nr_present)
4309{
4310 if (unlikely(is_mmio_spte(*sptep))) {
4311 if (gfn != get_mmio_spte_gfn(*sptep)) {
4312 mmu_spte_clear_no_track(sptep);
4313 return true;
4314 }
4315
4316 (*nr_present)++;
4317 mark_mmio_spte(vcpu, sptep, gfn, access);
4318 return true;
4319 }
4320
4321 return false;
4322}
4323
4324static inline bool is_last_gpte(struct kvm_mmu *mmu,
4325 unsigned level, unsigned gpte)
4326{
 /*
  * The RHS has bit 7 set iff level < mmu->last_nonleaf_level.
  * If it is clear, there are no large pages at this level, so clear
  * PT_PAGE_SIZE_MASK in gpte if that is the case.
  */
4332 gpte &= level - mmu->last_nonleaf_level;
4333
 /*
  * PT_PAGE_TABLE_LEVEL always terminates.  The RHS has bit 7 set
  * iff level <= PT_PAGE_TABLE_LEVEL, which for our purpose means
  * level == PT_PAGE_TABLE_LEVEL; set PT_PAGE_SIZE_MASK in gpte then.
  */
4339 gpte |= level - PT_PAGE_TABLE_LEVEL - 1;
4340
4341 return gpte & PT_PAGE_SIZE_MASK;
4342}
4343
4344#define PTTYPE_EPT 18
4345#define PTTYPE PTTYPE_EPT
4346#include "paging_tmpl.h"
4347#undef PTTYPE
4348
4349#define PTTYPE 64
4350#include "paging_tmpl.h"
4351#undef PTTYPE
4352
4353#define PTTYPE 32
4354#include "paging_tmpl.h"
4355#undef PTTYPE
4356
4357static void
4358__reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
4359 struct rsvd_bits_validate *rsvd_check,
4360 int maxphyaddr, int level, bool nx, bool gbpages,
4361 bool pse, bool amd)
4362{
4363 u64 exb_bit_rsvd = 0;
4364 u64 gbpages_bit_rsvd = 0;
4365 u64 nonleaf_bit8_rsvd = 0;
4366
4367 rsvd_check->bad_mt_xwr = 0;
4368
4369 if (!nx)
4370 exb_bit_rsvd = rsvd_bits(63, 63);
4371 if (!gbpages)
4372 gbpages_bit_rsvd = rsvd_bits(7, 7);
4373
 /*
  * Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit for
  * leaf entries) on AMD CPUs only.
  */
4378 if (amd)
4379 nonleaf_bit8_rsvd = rsvd_bits(8, 8);
4380
4381 switch (level) {
4382 case PT32_ROOT_LEVEL:
 /* no rsvd bits for 2 level 4K page table entries */
4384 rsvd_check->rsvd_bits_mask[0][1] = 0;
4385 rsvd_check->rsvd_bits_mask[0][0] = 0;
4386 rsvd_check->rsvd_bits_mask[1][0] =
4387 rsvd_check->rsvd_bits_mask[0][0];
4388
4389 if (!pse) {
4390 rsvd_check->rsvd_bits_mask[1][1] = 0;
4391 break;
4392 }
4393
4394 if (is_cpuid_PSE36())
4395
4396 rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
4397 else
4398
4399 rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
4400 break;
4401 case PT32E_ROOT_LEVEL:
4402 rsvd_check->rsvd_bits_mask[0][2] =
4403 rsvd_bits(maxphyaddr, 63) |
4404 rsvd_bits(5, 8) | rsvd_bits(1, 2);
4405 rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd |
4406 rsvd_bits(maxphyaddr, 62);
4407 rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd |
4408 rsvd_bits(maxphyaddr, 62);
4409 rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd |
4410 rsvd_bits(maxphyaddr, 62) |
4411 rsvd_bits(13, 20);
4412 rsvd_check->rsvd_bits_mask[1][0] =
4413 rsvd_check->rsvd_bits_mask[0][0];
4414 break;
4415 case PT64_ROOT_5LEVEL:
4416 rsvd_check->rsvd_bits_mask[0][4] = exb_bit_rsvd |
4417 nonleaf_bit8_rsvd | rsvd_bits(7, 7) |
4418 rsvd_bits(maxphyaddr, 51);
4419 rsvd_check->rsvd_bits_mask[1][4] =
4420 rsvd_check->rsvd_bits_mask[0][4];
 /* fall through */
4422 case PT64_ROOT_4LEVEL:
4423 rsvd_check->rsvd_bits_mask[0][3] = exb_bit_rsvd |
4424 nonleaf_bit8_rsvd | rsvd_bits(7, 7) |
4425 rsvd_bits(maxphyaddr, 51);
4426 rsvd_check->rsvd_bits_mask[0][2] = exb_bit_rsvd |
4427 nonleaf_bit8_rsvd | gbpages_bit_rsvd |
4428 rsvd_bits(maxphyaddr, 51);
4429 rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd |
4430 rsvd_bits(maxphyaddr, 51);
4431 rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd |
4432 rsvd_bits(maxphyaddr, 51);
4433 rsvd_check->rsvd_bits_mask[1][3] =
4434 rsvd_check->rsvd_bits_mask[0][3];
4435 rsvd_check->rsvd_bits_mask[1][2] = exb_bit_rsvd |
4436 gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51) |
4437 rsvd_bits(13, 29);
4438 rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd |
4439 rsvd_bits(maxphyaddr, 51) |
4440 rsvd_bits(13, 20);
4441 rsvd_check->rsvd_bits_mask[1][0] =
4442 rsvd_check->rsvd_bits_mask[0][0];
4443 break;
4444 }
4445}
4446
4447static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
4448 struct kvm_mmu *context)
4449{
4450 __reset_rsvds_bits_mask(vcpu, &context->guest_rsvd_check,
4451 cpuid_maxphyaddr(vcpu), context->root_level,
4452 context->nx,
4453 guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
4454 is_pse(vcpu), guest_cpuid_is_amd(vcpu));
4455}
4456
4457static void
4458__reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
4459 int maxphyaddr, bool execonly)
4460{
4461 u64 bad_mt_xwr;
4462
4463 rsvd_check->rsvd_bits_mask[0][4] =
4464 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
4465 rsvd_check->rsvd_bits_mask[0][3] =
4466 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
4467 rsvd_check->rsvd_bits_mask[0][2] =
4468 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
4469 rsvd_check->rsvd_bits_mask[0][1] =
4470 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
4471 rsvd_check->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51);
4472
 /* large page */
4474 rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4];
4475 rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3];
4476 rsvd_check->rsvd_bits_mask[1][2] =
4477 rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29);
4478 rsvd_check->rsvd_bits_mask[1][1] =
4479 rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20);
4480 rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0];
4481
4482 bad_mt_xwr = 0xFFull << (2 * 8);
4483 bad_mt_xwr |= 0xFFull << (3 * 8);
4484 bad_mt_xwr |= 0xFFull << (7 * 8);
4485 bad_mt_xwr |= REPEAT_BYTE(1ull << 2);
4486 bad_mt_xwr |= REPEAT_BYTE(1ull << 6);
4487 if (!execonly) {
 /* bits 0..2 must not be 100 (execute-only) unless exec-only is allowed */
4489 bad_mt_xwr |= REPEAT_BYTE(1ull << 4);
4490 }
4491 rsvd_check->bad_mt_xwr = bad_mt_xwr;
4492}
4493
4494static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
4495 struct kvm_mmu *context, bool execonly)
4496{
4497 __reset_rsvds_bits_mask_ept(&context->guest_rsvd_check,
4498 cpuid_maxphyaddr(vcpu), execonly);
4499}
4500
/*
 * The host page table here shadows a guest (or AMD nested guest) page table,
 * so its reserved-bit checks follow the MMU features exposed to the guest.
 */
4506void
4507reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
4508{
4509 bool uses_nx = context->nx ||
4510 context->mmu_role.base.smep_andnot_wp;
4511 struct rsvd_bits_validate *shadow_zero_check;
4512 int i;
4513
 /*
  * Passing "true" to the last argument is okay; it adds a check
  * on bit 8 of the SPTEs which KVM doesn't use anyway.
  */
4518 shadow_zero_check = &context->shadow_zero_check;
4519 __reset_rsvds_bits_mask(vcpu, shadow_zero_check,
4520 shadow_phys_bits,
4521 context->shadow_root_level, uses_nx,
4522 guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
4523 is_pse(vcpu), true);
4524
4525 if (!shadow_me_mask)
4526 return;
4527
4528 for (i = context->shadow_root_level; --i >= 0;) {
4529 shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
4530 shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
4531 }
4532
4533}
4534EXPORT_SYMBOL_GPL(reset_shadow_zero_bits_mask);
4535
4536static inline bool boot_cpu_is_amd(void)
4537{
4538 WARN_ON_ONCE(!tdp_enabled);
4539 return shadow_x_mask == 0;
4540}
4541
/*
 * The direct (TDP) page table on the host can use as many MMU features as
 * possible; KVM currently does not apply execute protection here.
 */
4546static void
4547reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
4548 struct kvm_mmu *context)
4549{
4550 struct rsvd_bits_validate *shadow_zero_check;
4551 int i;
4552
4553 shadow_zero_check = &context->shadow_zero_check;
4554
4555 if (boot_cpu_is_amd())
4556 __reset_rsvds_bits_mask(vcpu, shadow_zero_check,
4557 shadow_phys_bits,
4558 context->shadow_root_level, false,
4559 boot_cpu_has(X86_FEATURE_GBPAGES),
4560 true, true);
4561 else
4562 __reset_rsvds_bits_mask_ept(shadow_zero_check,
4563 shadow_phys_bits,
4564 false);
4565
4566 if (!shadow_me_mask)
4567 return;
4568
4569 for (i = context->shadow_root_level; --i >= 0;) {
4570 shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
4571 shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
4572 }
4573}
4574
/*
 * As reset_shadow_zero_bits_mask(), but for the shadow EPT page tables used
 * by an Intel nested guest.
 */
4579static void
4580reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
4581 struct kvm_mmu *context, bool execonly)
4582{
4583 __reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
4584 shadow_phys_bits, execonly);
4585}
4586
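/*
 * Build a byte in which bit i (for each 3-bit U/W/X permission combination i)
 * is set iff combination i includes the given ACC_* permission bit.
 */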
4587#define BYTE_MASK(access) \
4588 ((1 & (access) ? 2 : 0) | \
4589 (2 & (access) ? 4 : 0) | \
4590 (3 & (access) ? 8 : 0) | \
4591 (4 & (access) ? 16 : 0) | \
4592 (5 & (access) ? 32 : 0) | \
4593 (6 & (access) ? 64 : 0) | \
4594 (7 & (access) ? 128 : 0))
4595
4596
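/*
 * Precompute, for every page fault error code, a bitmap over the eight
 * U/W/X permission combinations indicating which combinations fault, so
 * that permission checks later reduce to a single table lookup.
 */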
4597static void update_permission_bitmask(struct kvm_vcpu *vcpu,
4598 struct kvm_mmu *mmu, bool ept)
4599{
4600 unsigned byte;
4601
4602 const u8 x = BYTE_MASK(ACC_EXEC_MASK);
4603 const u8 w = BYTE_MASK(ACC_WRITE_MASK);
4604 const u8 u = BYTE_MASK(ACC_USER_MASK);
4605
4606 bool cr4_smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP) != 0;
4607 bool cr4_smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP) != 0;
4608 bool cr0_wp = is_write_protection(vcpu);
4609
4610 for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) {
4611 unsigned pfec = byte << 1;
4612
 /*
  * Each "*f" variable below has one bit set for each U/W/X combination
  * that faults with the given PFEC: writes to non-writable pages (wf),
  * user accesses to supervisor pages (uf), fetches of non-executable
  * pages (ff), and the SMEP/SMAP cases computed under !ept.
  */
4619 u8 wf = (pfec & PFERR_WRITE_MASK) ? (u8)~w : 0;
4620
4621 u8 uf = (pfec & PFERR_USER_MASK) ? (u8)~u : 0;
4622
4623 u8 ff = (pfec & PFERR_FETCH_MASK) ? (u8)~x : 0;
4624
4625 u8 smepf = 0;
4626
4627 u8 smapf = 0;
4628
4629 if (!ept) {
4630
4631 u8 kf = (pfec & PFERR_USER_MASK) ? 0 : u;
4632
4633
4634 if (!mmu->nx)
4635 ff = 0;
4636
4637
4638 if (!cr0_wp)
4639 wf = (pfec & PFERR_USER_MASK) ? wf : 0;
4640
4641
4642 if (cr4_smep)
4643 smepf = (pfec & PFERR_FETCH_MASK) ? kf : 0;
4644
 /*
  * SMAP: kernel-mode data accesses from user-mode mappings
  * should fault.  An access is a SMAP violation if all of the
  * following hold:
  *   - X86_CR4_SMAP is set in CR4
  *   - a user page is accessed
  *   - the access is not a fetch
  *   - the access is in kernel mode
  *   - CPL = 3 or X86_EFLAGS_AC is clear
  *
  * The first four conditions are covered here; the last is
  * computed dynamically in permission_fault(), which sets
  * PFERR_RSVD_MASK in the PFEC when the access is *not*
  * subject to SMAP restrictions.
  */
4661 if (cr4_smap)
4662 smapf = (pfec & (PFERR_RSVD_MASK|PFERR_FETCH_MASK)) ? 0 : kf;
4663 }
4664
4665 mmu->permissions[byte] = ff | uf | wf | smepf | smapf;
4666 }
4667}
4668
/*
 * PKU lets the paging hardware restrict user-mode accesses based on the PKRU
 * register; violations are reported via the PK bit of the page fault error
 * code.  Unlike the other error code bits, PK is not known at the call site
 * of gva_to_gpa and must be computed in permission_fault() from PKRU, the
 * PTE's U bit and CR4.PKE.
 *
 * The pkru_mask cache encodes, for each error-code/U-bit combination, which
 * of the two PKRU bits (AD and WD) can actually cause a fault: protection
 * keys never apply to fetches or supervisor pages, and WD does not apply to
 * supervisor writes with CR0.WP=0.  permission_fault() indexes pkru_mask
 * with the error code and ANDs the result with the relevant PKRU bits.
 */
static void update_pkru_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 bool ept)
{
 unsigned bit;
 bool wp;

 if (ept) {
 mmu->pkru_mask = 0;
 return;
 }

 if (!kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || !is_long_mode(vcpu)) {
 mmu->pkru_mask = 0;
 return;
 }

 wp = is_write_protection(vcpu);

 for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) {
 unsigned pfec, pkey_bits;
 bool check_pkey, check_write, ff, uf, wf, pte_user;

 pfec = bit << 1;
 ff = pfec & PFERR_FETCH_MASK;
 uf = pfec & PFERR_USER_MASK;
 wf = pfec & PFERR_WRITE_MASK;

 pte_user = pfec & PFERR_RSVD_MASK;

 check_pkey = (!ff && pte_user);

 check_write = check_pkey && wf && (uf || wp);

 pkey_bits = !!check_pkey;
 pkey_bits |= (!!check_write) << 1;

 mmu->pkru_mask |= (pkey_bits & 3) << pfec;
 }
}

static void update_last_nonleaf_level(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
{
 unsigned root_level = mmu->root_level;

 mmu->last_nonleaf_level = root_level;
 if (root_level == PT32_ROOT_LEVEL && is_pse(vcpu))
 mmu->last_nonleaf_level++;
}

static void paging64_init_context_common(struct kvm_vcpu *vcpu,
 struct kvm_mmu *context,
 int level)
{
 context->nx = is_nx(vcpu);
 context->root_level = level;

 reset_rsvds_bits_mask(vcpu, context);
 update_permission_bitmask(vcpu, context, false);
 update_pkru_bitmask(vcpu, context, false);
 update_last_nonleaf_level(vcpu, context);

 MMU_WARN_ON(!is_pae(vcpu));
 context->page_fault = paging64_page_fault;
 context->gva_to_gpa = paging64_gva_to_gpa;
 context->sync_page = paging64_sync_page;
 context->invlpg = paging64_invlpg;
 context->update_pte = paging64_update_pte;
 context->shadow_root_level = level;
 context->direct_map = false;
}

static void paging64_init_context(struct kvm_vcpu *vcpu,
 struct kvm_mmu *context)
{
 int root_level = is_la57_mode(vcpu) ?
 PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;

 paging64_init_context_common(vcpu, context, root_level);
}

static void paging32_init_context(struct kvm_vcpu *vcpu,
 struct kvm_mmu *context)
{
 context->nx = false;
 context->root_level = PT32_ROOT_LEVEL;

 reset_rsvds_bits_mask(vcpu, context);
 update_permission_bitmask(vcpu, context, false);
 update_pkru_bitmask(vcpu, context, false);
 update_last_nonleaf_level(vcpu, context);

 context->page_fault = paging32_page_fault;
 context->gva_to_gpa = paging32_gva_to_gpa;
 context->sync_page = paging32_sync_page;
 context->invlpg = paging32_invlpg;
 context->update_pte = paging32_update_pte;
 context->shadow_root_level = PT32E_ROOT_LEVEL;
 context->direct_map = false;
}

static void paging32E_init_context(struct kvm_vcpu *vcpu,
 struct kvm_mmu *context)
{
 paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL);
}

static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu)
{
 union kvm_mmu_extended_role ext = {0};

 ext.cr0_pg = !!is_paging(vcpu);
 ext.cr4_pae = !!is_pae(vcpu);
 ext.cr4_smep = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
 ext.cr4_smap = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMAP);
 ext.cr4_pse = !!is_pse(vcpu);
 ext.cr4_pke = !!kvm_read_cr4_bits(vcpu, X86_CR4_PKE);
 ext.cr4_la57 = !!kvm_read_cr4_bits(vcpu, X86_CR4_LA57);
 ext.maxphyaddr = cpuid_maxphyaddr(vcpu);

 ext.valid = 1;

 return ext;
}

static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu,
 bool base_only)
{
 union kvm_mmu_role role = {0};

 role.base.access = ACC_ALL;
 role.base.nxe = !!is_nx(vcpu);
 role.base.cr0_wp = is_write_protection(vcpu);
 role.base.smm = is_smm(vcpu);
 role.base.guest_mode = is_guest_mode(vcpu);

 if (base_only)
 return role;

 role.ext = kvm_calc_mmu_role_ext(vcpu);

 return role;
}

static union kvm_mmu_role
kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
{
 union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only);

 role.base.ad_disabled = (shadow_accessed_mask == 0);
 role.base.level = kvm_x86_ops->get_tdp_level(vcpu);
 role.base.direct = true;
 role.base.gpte_is_8_bytes = true;

 return role;
}

static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
{
 struct kvm_mmu *context = vcpu->arch.mmu;
 union kvm_mmu_role new_role =
 kvm_calc_tdp_mmu_root_page_role(vcpu, false);

 new_role.base.word &= mmu_base_role_mask.word;
 if (new_role.as_u64 == context->mmu_role.as_u64)
 return;

 context->mmu_role.as_u64 = new_role.as_u64;
 context->page_fault = tdp_page_fault;
 context->sync_page = nonpaging_sync_page;
 context->invlpg = nonpaging_invlpg;
 context->update_pte = nonpaging_update_pte;
 context->shadow_root_level = kvm_x86_ops->get_tdp_level(vcpu);
 context->direct_map = true;
 context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
 context->get_cr3 = get_cr3;
 context->get_pdptr = kvm_pdptr_read;
 context->inject_page_fault = kvm_inject_page_fault;

 if (!is_paging(vcpu)) {
 context->nx = false;
 context->gva_to_gpa = nonpaging_gva_to_gpa;
 context->root_level = 0;
 } else if (is_long_mode(vcpu)) {
 context->nx = is_nx(vcpu);
 context->root_level = is_la57_mode(vcpu) ?
 PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
 reset_rsvds_bits_mask(vcpu, context);
 context->gva_to_gpa = paging64_gva_to_gpa;
 } else if (is_pae(vcpu)) {
 context->nx = is_nx(vcpu);
 context->root_level = PT32E_ROOT_LEVEL;
 reset_rsvds_bits_mask(vcpu, context);
 context->gva_to_gpa = paging64_gva_to_gpa;
 } else {
 context->nx = false;
 context->root_level = PT32_ROOT_LEVEL;
 reset_rsvds_bits_mask(vcpu, context);
 context->gva_to_gpa = paging32_gva_to_gpa;
 }

 update_permission_bitmask(vcpu, context, false);
 update_pkru_bitmask(vcpu, context, false);
 update_last_nonleaf_level(vcpu, context);
 reset_tdp_shadow_zero_bits_mask(vcpu, context);
}

static union kvm_mmu_role
kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
{
 union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only);

 role.base.smep_andnot_wp = role.ext.cr4_smep &&
 !is_write_protection(vcpu);
 role.base.smap_andnot_wp = role.ext.cr4_smap &&
 !is_write_protection(vcpu);
 role.base.direct = !is_paging(vcpu);
 role.base.gpte_is_8_bytes = !!is_pae(vcpu);

 if (!is_long_mode(vcpu))
 role.base.level = PT32E_ROOT_LEVEL;
 else if (is_la57_mode(vcpu))
 role.base.level = PT64_ROOT_5LEVEL;
 else
 role.base.level = PT64_ROOT_4LEVEL;

 return role;
}

void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
{
 struct kvm_mmu *context = vcpu->arch.mmu;
 union kvm_mmu_role new_role =
 kvm_calc_shadow_mmu_root_page_role(vcpu, false);

 new_role.base.word &= mmu_base_role_mask.word;
 if (new_role.as_u64 == context->mmu_role.as_u64)
 return;

 if (!is_paging(vcpu))
 nonpaging_init_context(vcpu, context);
 else if (is_long_mode(vcpu))
 paging64_init_context(vcpu, context);
 else if (is_pae(vcpu))
 paging32E_init_context(vcpu, context);
 else
 paging32_init_context(vcpu, context);

 context->mmu_role.as_u64 = new_role.as_u64;
 reset_shadow_zero_bits_mask(vcpu, context);
}
EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);

static union kvm_mmu_role
kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
 bool execonly)
{
 union kvm_mmu_role role = {0};

 role.base.smm = vcpu->arch.root_mmu.mmu_role.base.smm;

 role.base.level = PT64_ROOT_4LEVEL;
 role.base.gpte_is_8_bytes = true;
 role.base.direct = false;
 role.base.ad_disabled = !accessed_dirty;
 role.base.guest_mode = true;
 role.base.access = ACC_ALL;

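 /*
  * cr0_wp=1 together with smap_andnot_wp=1 cannot occur for an ordinary
  * shadow role (smap_andnot_wp implies !WP), so the combination marks
  * this as a shadow-EPT role and keeps it from matching a legacy one.
  */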
 role.base.cr0_wp = true;
 role.base.smap_andnot_wp = true;

 role.ext = kvm_calc_mmu_role_ext(vcpu);
 role.ext.execonly = execonly;

 return role;
}

void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
 bool accessed_dirty, gpa_t new_eptp)
{
 struct kvm_mmu *context = vcpu->arch.mmu;
 union kvm_mmu_role new_role =
 kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty,
 execonly);

 __kvm_mmu_new_cr3(vcpu, new_eptp, new_role.base, false);

 new_role.base.word &= mmu_base_role_mask.word;
 if (new_role.as_u64 == context->mmu_role.as_u64)
 return;

 context->shadow_root_level = PT64_ROOT_4LEVEL;

 context->nx = true;
 context->ept_ad = accessed_dirty;
 context->page_fault = ept_page_fault;
 context->gva_to_gpa = ept_gva_to_gpa;
 context->sync_page = ept_sync_page;
 context->invlpg = ept_invlpg;
 context->update_pte = ept_update_pte;
 context->root_level = PT64_ROOT_4LEVEL;
 context->direct_map = false;
 context->mmu_role.as_u64 = new_role.as_u64;

 update_permission_bitmask(vcpu, context, true);
 update_pkru_bitmask(vcpu, context, true);
 update_last_nonleaf_level(vcpu, context);
 reset_rsvds_bits_mask_ept(vcpu, context, execonly);
 reset_ept_shadow_zero_bits_mask(vcpu, context, execonly);
}
EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);

static void init_kvm_softmmu(struct kvm_vcpu *vcpu)
{
 struct kvm_mmu *context = vcpu->arch.mmu;

 kvm_init_shadow_mmu(vcpu);
 context->set_cr3 = kvm_x86_ops->set_cr3;
 context->get_cr3 = get_cr3;
 context->get_pdptr = kvm_pdptr_read;
 context->inject_page_fault = kvm_inject_page_fault;
}

static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
{
 union kvm_mmu_role new_role = kvm_calc_mmu_role_common(vcpu, false);
 struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;

 new_role.base.word &= mmu_base_role_mask.word;
 if (new_role.as_u64 == g_context->mmu_role.as_u64)
 return;

 g_context->mmu_role.as_u64 = new_role.as_u64;
 g_context->get_cr3 = get_cr3;
 g_context->get_pdptr = kvm_pdptr_read;
 g_context->inject_page_fault = kvm_inject_page_fault;

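 /*
  * The nested MMU only performs gva->gpa translation for L2 addresses:
  * the *_gva_to_gpa_nested helpers walk the L2 page tables, translating
  * each table's gpa through vcpu->arch.mmu, while L2 page faults
  * themselves are handled by vcpu->arch.mmu.
  */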
 if (!is_paging(vcpu)) {
 g_context->nx = false;
 g_context->root_level = 0;
 g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
 } else if (is_long_mode(vcpu)) {
 g_context->nx = is_nx(vcpu);
 g_context->root_level = is_la57_mode(vcpu) ?
 PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
 reset_rsvds_bits_mask(vcpu, g_context);
 g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
 } else if (is_pae(vcpu)) {
 g_context->nx = is_nx(vcpu);
 g_context->root_level = PT32E_ROOT_LEVEL;
 reset_rsvds_bits_mask(vcpu, g_context);
 g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
 } else {
 g_context->nx = false;
 g_context->root_level = PT32_ROOT_LEVEL;
 reset_rsvds_bits_mask(vcpu, g_context);
 g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
 }

 update_permission_bitmask(vcpu, g_context, false);
 update_pkru_bitmask(vcpu, g_context, false);
 update_last_nonleaf_level(vcpu, g_context);
}

void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots)
{
 if (reset_roots) {
 uint i;

 vcpu->arch.mmu->root_hpa = INVALID_PAGE;

 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
 vcpu->arch.mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
 }

 if (mmu_is_nested(vcpu))
 init_kvm_nested_mmu(vcpu);
 else if (tdp_enabled)
 init_kvm_tdp_mmu(vcpu);
 else
 init_kvm_softmmu(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_init_mmu);

static union kvm_mmu_page_role
kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu)
{
 union kvm_mmu_role role;

 if (tdp_enabled)
 role = kvm_calc_tdp_mmu_root_page_role(vcpu, true);
 else
 role = kvm_calc_shadow_mmu_root_page_role(vcpu, true);

 return role.base;
}

void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
{
 kvm_mmu_unload(vcpu);
 kvm_init_mmu(vcpu, true);
}
EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);

int kvm_mmu_load(struct kvm_vcpu *vcpu)
{
 int r;

 r = mmu_topup_memory_caches(vcpu);
 if (r)
 goto out;
 r = mmu_alloc_roots(vcpu);
 kvm_mmu_sync_roots(vcpu);
 if (r)
 goto out;
 kvm_mmu_load_cr3(vcpu);
 kvm_x86_ops->tlb_flush(vcpu, true);
out:
 return r;
}
EXPORT_SYMBOL_GPL(kvm_mmu_load);

void kvm_mmu_unload(struct kvm_vcpu *vcpu)
{
 kvm_mmu_free_roots(vcpu, &vcpu->arch.root_mmu, KVM_MMU_ROOTS_ALL);
 WARN_ON(VALID_PAGE(vcpu->arch.root_mmu.root_hpa));
 kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
 WARN_ON(VALID_PAGE(vcpu->arch.guest_mmu.root_hpa));
}
EXPORT_SYMBOL_GPL(kvm_mmu_unload);

static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
 struct kvm_mmu_page *sp, u64 *spte,
 const void *new)
{
 if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
 ++vcpu->kvm->stat.mmu_pde_zapped;
 return;
 }

 ++vcpu->kvm->stat.mmu_pte_updated;
 vcpu->arch.mmu->update_pte(vcpu, sp, spte, new);
}

static bool need_remote_flush(u64 old, u64 new)
{
 if (!is_shadow_present_pte(old))
 return false;
 if (!is_shadow_present_pte(new))
 return true;
 if ((old ^ new) & PT64_BASE_ADDR_MASK)
 return true;
 old ^= shadow_nx_mask;
 new ^= shadow_nx_mask;
 return (old & ~new & PT64_PERM_MASK) != 0;
}

static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
 int *bytes)
{
 u64 gentry = 0;
 int r;

 if (is_pae(vcpu) && *bytes == 4) {
 *gpa &= ~(gpa_t)7;
 *bytes = 8;
 }

 if (*bytes == 4 || *bytes == 8) {
 r = kvm_vcpu_read_guest_atomic(vcpu, *gpa, &gentry, *bytes);
 if (r)
 gentry = 0;
 }

 return gentry;
}

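/*
 * A page that takes a flood of writes is probably no longer being used as a
 * page table; zapping it and letting it be rebuilt is cheaper than emulating
 * every write.
 */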
static bool detect_write_flooding(struct kvm_mmu_page *sp)
{
 if (sp->role.level == PT_PAGE_TABLE_LEVEL)
 return false;

 atomic_inc(&sp->write_flooding_count);
 return atomic_read(&sp->write_flooding_count) >= 3;
}

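/*
 * Writes that straddle or are smaller than a guest pte cannot be applied to
 * the shadow page directly and usually indicate the page is no longer being
 * used as a page table.
 */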
static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
 int bytes)
{
 unsigned offset, pte_size, misaligned;

 pgprintk("misaligned: gpa %llx bytes %d role %x\n",
 gpa, bytes, sp->role.word);

 offset = offset_in_page(gpa);
 pte_size = sp->role.gpte_is_8_bytes ? 8 : 4;

 if (!(offset & (pte_size - 1)) && bytes == 1)
 return false;

 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
 misaligned |= bytes < 4;

 return misaligned;
}

static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
{
 unsigned page_offset, quadrant;
 u64 *spte;
 int level;

 page_offset = offset_in_page(gpa);
 level = sp->role.level;
 *nspte = 1;
 if (!sp->role.gpte_is_8_bytes) {
 page_offset <<= 1;

 if (level == PT32_ROOT_LEVEL) {
 page_offset &= ~7;
 page_offset <<= 1;
 *nspte = 2;
 }
 quadrant = page_offset >> PAGE_SHIFT;
 page_offset &= ~PAGE_MASK;
 if (quadrant != sp->role.quadrant)
 return NULL;
 }

 spte = &sp->spt[page_offset / sizeof(*spte)];
 return spte;
}

static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 const u8 *new, int bytes,
 struct kvm_page_track_notifier_node *node)
{
 gfn_t gfn = gpa >> PAGE_SHIFT;
 struct kvm_mmu_page *sp;
 LIST_HEAD(invalid_list);
 u64 entry, gentry, *spte;
 int npte;
 bool remote_flush, local_flush;

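 /*
  * If there are no indirect shadow pages, no gfn is write-tracked by
  * the MMU and there is nothing to update.
  */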
 if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
 return;

 remote_flush = local_flush = false;

 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);

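 /*
  * A failed allocation here is not fatal: pte prefetch simply skips
  * entries when the caches do not hold enough objects.
  */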
 mmu_topup_memory_caches(vcpu);

 spin_lock(&vcpu->kvm->mmu_lock);

 gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes);

 ++vcpu->kvm->stat.mmu_pte_write;
 kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);

 for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
 if (detect_write_misaligned(sp, gpa, bytes) ||
 detect_write_flooding(sp)) {
 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
 ++vcpu->kvm->stat.mmu_flooded;
 continue;
 }

 spte = get_written_sptes(sp, gpa, &npte);
 if (!spte)
 continue;

 local_flush = true;
 while (npte--) {
 u32 base_role = vcpu->arch.mmu->mmu_role.base.word;

 entry = *spte;
 mmu_page_zap_pte(vcpu->kvm, sp, spte);
 if (gentry &&
 !((sp->role.word ^ base_role)
 & mmu_base_role_mask.word) && rmap_can_add(vcpu))
 mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
 if (need_remote_flush(entry, *spte))
 remote_flush = true;
 ++spte;
 }
 }
 kvm_mmu_flush_or_zap(vcpu, &invalid_list, remote_flush, local_flush);
 kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
 spin_unlock(&vcpu->kvm->mmu_lock);
}

int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
{
 gpa_t gpa;
 int r;

 if (vcpu->arch.mmu->direct_map)
 return 0;

 gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);

 r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);

 return r;
}
EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);

static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
{
 LIST_HEAD(invalid_list);

 if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES))
 return 0;

 while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES) {
 if (!prepare_zap_oldest_mmu_page(vcpu->kvm, &invalid_list))
 break;

 ++vcpu->kvm->stat.mmu_recycled;
 }
 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);

 if (!kvm_mmu_available_pages(vcpu->kvm))
 return -ENOSPC;
 return 0;
}

int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
 void *insn, int insn_len)
{
 int r, emulation_type = 0;
 enum emulation_result er;
 bool direct = vcpu->arch.mmu->direct_map;

 if (vcpu->arch.mmu->direct_map) {
 vcpu->arch.gpa_available = true;
 vcpu->arch.gpa_val = cr2;
 }

 r = RET_PF_INVALID;
 if (unlikely(error_code & PFERR_RSVD_MASK)) {
 r = handle_mmio_page_fault(vcpu, cr2, direct);
 if (r == RET_PF_EMULATE)
 goto emulate;
 }

 if (r == RET_PF_INVALID) {
 r = vcpu->arch.mmu->page_fault(vcpu, cr2,
 lower_32_bits(error_code),
 false);
 WARN_ON(r == RET_PF_INVALID);
 }

 if (r == RET_PF_RETRY)
 return 1;
 if (r < 0)
 return r;

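 /*
  * If the fault was a write-protection fault on the guest's own nested
  * page tables (nested paging in both L0 and L1), unprotecting the gfn
  * and resuming the guest is sufficient; no emulation is needed.
  */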
 if (vcpu->arch.mmu->direct_map &&
 (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
 kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2));
 return 1;
 }

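 /*
  * page_fault() asked for emulation, but it may still be possible to
  * simply unprotect the gfn and let the guest retry the instruction.
  * Retrying is pointless (and potentially an infinite loop) for MMIO
  * faults and for faults taken while running a nested guest, so only
  * allow it outside those cases.
  */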
 if (!mmio_info_in_cache(vcpu, cr2, direct) && !is_guest_mode(vcpu))
 emulation_type = EMULTYPE_ALLOW_RETRY;
emulate:

 if (unlikely(insn && !insn_len)) {
 if (!kvm_x86_ops->need_emulation_on_page_fault(vcpu))
 return 1;
 }

 er = x86_emulate_instruction(vcpu, cr2, emulation_type, insn, insn_len);

 switch (er) {
 case EMULATE_DONE:
 return 1;
 case EMULATE_USER_EXIT:
 ++vcpu->stat.mmio_exits;
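 /* fall through */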
 case EMULATE_FAIL:
 return 0;
 default:
 BUG();
 }
}
EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);

void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
{
 struct kvm_mmu *mmu = vcpu->arch.mmu;
 int i;

 if (is_noncanonical_address(gva, vcpu))
 return;

 mmu->invlpg(vcpu, gva, mmu->root_hpa);

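 /*
  * INVLPG must also invalidate any global translations for the address,
  * regardless of PCID, so sync the entry in every cached previous root
  * as well.
  */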
 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
 if (VALID_PAGE(mmu->prev_roots[i].hpa))
 mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);

 kvm_x86_ops->tlb_flush_gva(vcpu, gva);
 ++vcpu->stat.invlpg;
}
EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);

void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
{
 struct kvm_mmu *mmu = vcpu->arch.mmu;
 bool tlb_flush = false;
 uint i;

 if (pcid == kvm_get_active_pcid(vcpu)) {
 mmu->invlpg(vcpu, gva, mmu->root_hpa);
 tlb_flush = true;
 }

 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
 if (VALID_PAGE(mmu->prev_roots[i].hpa) &&
 pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].cr3)) {
 mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
 tlb_flush = true;
 }
 }

 if (tlb_flush)
 kvm_x86_ops->tlb_flush_gva(vcpu, gva);

 ++vcpu->stat.invlpg;

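 /*
  * Roots that are neither the current root nor cached in prev_roots
  * will be synced when they are switched to, so nothing more needs to
  * be done here.
  */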
}
EXPORT_SYMBOL_GPL(kvm_mmu_invpcid_gva);

void kvm_enable_tdp(void)
{
 tdp_enabled = true;
}
EXPORT_SYMBOL_GPL(kvm_enable_tdp);

void kvm_disable_tdp(void)
{
 tdp_enabled = false;
}
EXPORT_SYMBOL_GPL(kvm_disable_tdp);

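/* The return value indicates whether a TLB flush on all vCPUs is needed. */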
typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head);

static __always_inline bool
slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
 slot_level_handler fn, int start_level, int end_level,
 gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb)
{
 struct slot_rmap_walk_iterator iterator;
 bool flush = false;

 for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
 end_gfn, &iterator) {
 if (iterator.rmap)
 flush |= fn(kvm, iterator.rmap);

 if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
 if (flush && lock_flush_tlb) {
 kvm_flush_remote_tlbs_with_address(kvm,
 start_gfn,
 iterator.gfn - start_gfn + 1);
 flush = false;
 }
 cond_resched_lock(&kvm->mmu_lock);
 }
 }

 if (flush && lock_flush_tlb) {
 kvm_flush_remote_tlbs_with_address(kvm, start_gfn,
 end_gfn - start_gfn + 1);
 flush = false;
 }

 return flush;
}

static __always_inline bool
slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
 slot_level_handler fn, int start_level, int end_level,
 bool lock_flush_tlb)
{
 return slot_handle_level_range(kvm, memslot, fn, start_level,
 end_level, memslot->base_gfn,
 memslot->base_gfn + memslot->npages - 1,
 lock_flush_tlb);
}

static __always_inline bool
slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
 slot_level_handler fn, bool lock_flush_tlb)
{
 return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
 PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
}

static __always_inline bool
slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
 slot_level_handler fn, bool lock_flush_tlb)
{
 return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL + 1,
 PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
}

static __always_inline bool
slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot,
 slot_level_handler fn, bool lock_flush_tlb)
{
 return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
 PT_PAGE_TABLE_LEVEL, lock_flush_tlb);
}

static void free_mmu_pages(struct kvm_vcpu *vcpu)
{
 free_page((unsigned long)vcpu->arch.mmu->pae_root);
 free_page((unsigned long)vcpu->arch.mmu->lm_root);
}

static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
{
 struct page *page;
 int i;

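 /*
  * The table of four PAE root entries (pae_root) is only used when the
  * shadow root level is PT32E; it must be physically below 4GB because
  * a 32-bit CR3 points at it, hence the __GFP_DMA32 allocation.  With
  * TDP and a larger root level it is never used, so skip it.
  */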
 if (tdp_enabled && kvm_x86_ops->get_tdp_level(vcpu) > PT32E_ROOT_LEVEL)
 return 0;

 page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32);
 if (!page)
 return -ENOMEM;

 vcpu->arch.mmu->pae_root = page_address(page);
 for (i = 0; i < 4; ++i)
 vcpu->arch.mmu->pae_root[i] = INVALID_PAGE;

 return 0;
}

int kvm_mmu_create(struct kvm_vcpu *vcpu)
{
 uint i;

 vcpu->arch.mmu = &vcpu->arch.root_mmu;
 vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;

 vcpu->arch.root_mmu.root_hpa = INVALID_PAGE;
 vcpu->arch.root_mmu.root_cr3 = 0;
 vcpu->arch.root_mmu.translate_gpa = translate_gpa;
 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
 vcpu->arch.root_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;

 vcpu->arch.guest_mmu.root_hpa = INVALID_PAGE;
 vcpu->arch.guest_mmu.root_cr3 = 0;
 vcpu->arch.guest_mmu.translate_gpa = translate_gpa;
 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
 vcpu->arch.guest_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;

 vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
 return alloc_mmu_pages(vcpu);
}

static void kvm_zap_obsolete_pages(struct kvm *kvm)
{
 struct kvm_mmu_page *sp, *node;
 LIST_HEAD(invalid_list);
 int ign;

restart:
 list_for_each_entry_safe_reverse(sp, node,
 &kvm->arch.active_mmu_pages, link) {
 if (!is_obsolete_sp(kvm, sp))
 break;

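 /*
  * Pages already marked invalid have been prepared for zapping
  * elsewhere; skip them rather than zapping them again.
  */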
 if (sp->role.invalid)
 continue;

 if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
 kvm_mmu_commit_zap_page(kvm, &invalid_list);
 cond_resched_lock(&kvm->mmu_lock);
 goto restart;
 }

 if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign))
 goto restart;
 }

 kvm_mmu_commit_zap_page(kvm, &invalid_list);
}

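/*
 * Fast-invalidate all shadow pages: bumping mmu_valid_gen marks every
 * existing shadow page as obsolete, and kvm_zap_obsolete_pages() then zaps
 * them while periodically yielding the mmu_lock.
 */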
static void kvm_mmu_zap_all_fast(struct kvm *kvm)
{
 spin_lock(&kvm->mmu_lock);
 kvm->arch.mmu_valid_gen++;

 kvm_zap_obsolete_pages(kvm);
 spin_unlock(&kvm->mmu_lock);
}

static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
 struct kvm_memory_slot *slot,
 struct kvm_page_track_notifier_node *node)
{
 kvm_mmu_zap_all_fast(kvm);
}

void kvm_mmu_init_vm(struct kvm *kvm)
{
 struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;

 node->track_write = kvm_mmu_pte_write;
 node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
 kvm_page_track_register_notifier(kvm, node);
}

void kvm_mmu_uninit_vm(struct kvm *kvm)
{
 struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;

 kvm_page_track_unregister_notifier(kvm, node);
}

void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
{
 struct kvm_memslots *slots;
 struct kvm_memory_slot *memslot;
 int i;

 spin_lock(&kvm->mmu_lock);
 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
 slots = __kvm_memslots(kvm, i);
 kvm_for_each_memslot(memslot, slots) {
 gfn_t start, end;

 start = max(gfn_start, memslot->base_gfn);
 end = min(gfn_end, memslot->base_gfn + memslot->npages);
 if (start >= end)
 continue;

 slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
 PT_PAGE_TABLE_LEVEL, PT_MAX_HUGEPAGE_LEVEL,
 start, end - 1, true);
 }
 }

 spin_unlock(&kvm->mmu_lock);
}

static bool slot_rmap_write_protect(struct kvm *kvm,
 struct kvm_rmap_head *rmap_head)
{
 return __rmap_write_protect(kvm, rmap_head, false);
}

void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
 struct kvm_memory_slot *memslot)
{
 bool flush;

 spin_lock(&kvm->mmu_lock);
 flush = slot_handle_all_level(kvm, memslot, slot_rmap_write_protect,
 false);
 spin_unlock(&kvm->mmu_lock);

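 /*
  * Flushing the TLB outside of mmu_lock (here and in the dirty-log
  * ioctls) relies on serialization by kvm->slots_lock; the sptes were
  * only changed from writable to read-only, so the flush merely has to
  * complete before the caller depends on the write protection.
  */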
 lockdep_assert_held(&kvm->slots_lock);

 if (flush)
 kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
 memslot->npages);
}

static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
 struct kvm_rmap_head *rmap_head)
{
 u64 *sptep;
 struct rmap_iterator iter;
 int need_tlb_flush = 0;
 kvm_pfn_t pfn;
 struct kvm_mmu_page *sp;

restart:
 for_each_rmap_spte(rmap_head, &iter, sptep) {
 sp = page_header(__pa(sptep));
 pfn = spte_to_pfn(*sptep);

 if (sp->role.direct &&
 !kvm_is_reserved_pfn(pfn) &&
 PageTransCompoundMap(pfn_to_page(pfn))) {
 pte_list_remove(rmap_head, sptep);

 if (kvm_available_flush_tlb_with_range())
 kvm_flush_remote_tlbs_with_address(kvm, sp->gfn,
 KVM_PAGES_PER_HPAGE(sp->role.level));
 else
 need_tlb_flush = 1;

 goto restart;
 }
 }

 return need_tlb_flush;
}

void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
 const struct kvm_memory_slot *memslot)
{
 spin_lock(&kvm->mmu_lock);
 slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot,
 kvm_mmu_zap_collapsible_spte, true);
 spin_unlock(&kvm->mmu_lock);
}

void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
 struct kvm_memory_slot *memslot)
{
 bool flush;

 spin_lock(&kvm->mmu_lock);
 flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false);
 spin_unlock(&kvm->mmu_lock);

 lockdep_assert_held(&kvm->slots_lock);

 if (flush)
 kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
 memslot->npages);
}
EXPORT_SYMBOL_GPL(kvm_mmu_slot_leaf_clear_dirty);

void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
 struct kvm_memory_slot *memslot)
{
 bool flush;

 spin_lock(&kvm->mmu_lock);
 flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect,
 false);
 spin_unlock(&kvm->mmu_lock);

 lockdep_assert_held(&kvm->slots_lock);

 if (flush)
 kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
 memslot->npages);
}
EXPORT_SYMBOL_GPL(kvm_mmu_slot_largepage_remove_write_access);

void kvm_mmu_slot_set_dirty(struct kvm *kvm,
 struct kvm_memory_slot *memslot)
{
 bool flush;

 spin_lock(&kvm->mmu_lock);
 flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false);
 spin_unlock(&kvm->mmu_lock);

 lockdep_assert_held(&kvm->slots_lock);

 if (flush)
 kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
 memslot->npages);
}
EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);

static void __kvm_mmu_zap_all(struct kvm *kvm, bool mmio_only)
{
 struct kvm_mmu_page *sp, *node;
 LIST_HEAD(invalid_list);
 int ign;

 spin_lock(&kvm->mmu_lock);
restart:
 list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
 if (mmio_only && !sp->mmio_cached)
 continue;
 if (sp->role.invalid && sp->root_count)
 continue;
 if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign)) {
 WARN_ON_ONCE(mmio_only);
 goto restart;
 }
 if (cond_resched_lock(&kvm->mmu_lock))
 goto restart;
 }

 kvm_mmu_commit_zap_page(kvm, &invalid_list);
 spin_unlock(&kvm->mmu_lock);
}

void kvm_mmu_zap_all(struct kvm *kvm)
{
 return __kvm_mmu_zap_all(kvm, false);
}

void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
{
 WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);

 gen &= MMIO_SPTE_GEN_MASK;

 gen &= ~((u64)KVM_ADDRESS_SPACE_NUM - 1);

 if (unlikely(gen == 0)) {
 kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n");
 __kvm_mmu_zap_all(kvm, true);
 }
}

static unsigned long
mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
{
 struct kvm *kvm;
 int nr_to_scan = sc->nr_to_scan;
 unsigned long freed = 0;

 mutex_lock(&kvm_lock);

 list_for_each_entry(kvm, &vm_list, vm_list) {
 int idx;
 LIST_HEAD(invalid_list);

 if (!nr_to_scan--)
 break;

 if (!kvm->arch.n_used_mmu_pages)
 continue;

 idx = srcu_read_lock(&kvm->srcu);
 spin_lock(&kvm->mmu_lock);

 if (prepare_zap_oldest_mmu_page(kvm, &invalid_list))
 freed++;
 kvm_mmu_commit_zap_page(kvm, &invalid_list);

 spin_unlock(&kvm->mmu_lock);
 srcu_read_unlock(&kvm->srcu, idx);

 list_move_tail(&kvm->vm_list, &vm_list);
 break;
 }

 mutex_unlock(&kvm_lock);
 return freed;
}

static unsigned long
mmu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
{
 return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
}

static struct shrinker mmu_shrinker = {
 .count_objects = mmu_shrink_count,
 .scan_objects = mmu_shrink_scan,
 .seeks = DEFAULT_SEEKS * 10,
};

static void mmu_destroy_caches(void)
{
 kmem_cache_destroy(pte_list_desc_cache);
 kmem_cache_destroy(mmu_page_header_cache);
}

static void kvm_set_mmio_spte_mask(void)
{
 u64 mask;

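 /*
  * Tag MMIO sptes with physical-address bit 51, which is reserved as
  * long as the host supports fewer than 52 physical address bits, and
  * keep the present bit set so the access faults with PFERR.RSVD.  On
  * hosts with a full 52 bits there is no reserved bit to abuse, so the
  * present bit is cleared again below.
  */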
 mask = 1ull << 51;

 mask |= 1ull;

 if (IS_ENABLED(CONFIG_X86_64) && shadow_phys_bits == 52)
 mask &= ~1ull;

 kvm_mmu_set_mmio_spte_mask(mask, mask);
}

int kvm_mmu_module_init(void)
{
 int ret = -ENOMEM;

 BUILD_BUG_ON(sizeof(union kvm_mmu_page_role) != sizeof(u32));
 BUILD_BUG_ON(sizeof(union kvm_mmu_extended_role) != sizeof(u32));
 BUILD_BUG_ON(sizeof(union kvm_mmu_role) != sizeof(u64));

 kvm_mmu_reset_all_pte_masks();

 kvm_set_mmio_spte_mask();

 pte_list_desc_cache = kmem_cache_create("pte_list_desc",
 sizeof(struct pte_list_desc),
 0, SLAB_ACCOUNT, NULL);
 if (!pte_list_desc_cache)
 goto out;

 mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
 sizeof(struct kvm_mmu_page),
 0, SLAB_ACCOUNT, NULL);
 if (!mmu_page_header_cache)
 goto out;

 if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL))
 goto out;

 ret = register_shrinker(&mmu_shrinker);
 if (ret)
 goto out;

 return 0;

out:
 mmu_destroy_caches();
 return ret;
}

unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm)
{
 unsigned long nr_mmu_pages;
 unsigned long nr_pages = 0;
 struct kvm_memslots *slots;
 struct kvm_memory_slot *memslot;
 int i;

 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
 slots = __kvm_memslots(kvm, i);

 kvm_for_each_memslot(memslot, slots)
 nr_pages += memslot->npages;
 }

 nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
 nr_mmu_pages = max(nr_mmu_pages, KVM_MIN_ALLOC_MMU_PAGES);

 return nr_mmu_pages;
}

void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
{
 kvm_mmu_unload(vcpu);
 free_mmu_pages(vcpu);
 mmu_free_memory_caches(vcpu);
}

void kvm_mmu_module_exit(void)
{
 mmu_destroy_caches();
 percpu_counter_destroy(&kvm_total_used_mmu_pages);
 unregister_shrinker(&mmu_shrinker);
 mmu_audit_disable();
}