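/*
 * Kernel-based Virtual Machine (KVM) x86 MMU: shadow page table and
 * two-dimensional paging (TDP/EPT/NPT) management.
 */
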
#include "irq.h"
#include "mmu.h"
#include "x86.h"
#include "kvm_cache_regs.h"
#include "cpuid.h"

#include <linux/kvm_host.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/moduleparam.h>
#include <linux/export.h>
#include <linux/swap.h>
#include <linux/hugetlb.h>
#include <linux/compiler.h>
#include <linux/srcu.h>
#include <linux/slab.h>
#include <linux/sched/signal.h>
#include <linux/uaccess.h>
#include <linux/hash.h>
#include <linux/kern_levels.h>

#include <asm/page.h>
#include <asm/pat.h>
#include <asm/cmpxchg.h>
#include <asm/e820/api.h>
#include <asm/io.h>
#include <asm/vmx.h>
#include <asm/kvm_page_track.h>
#include "trace.h"
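
/*
 * When true, the MMU maps guest memory with hardware-assisted
 * two-dimensional paging (EPT/NPT) instead of shadowing the guest's
 * own page tables.
 */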
bool tdp_enabled = false;

enum {
	AUDIT_PRE_PAGE_FAULT,
	AUDIT_POST_PAGE_FAULT,
	AUDIT_PRE_PTE_WRITE,
	AUDIT_POST_PTE_WRITE,
	AUDIT_PRE_SYNC,
	AUDIT_POST_SYNC
};

#undef MMU_DEBUG

#ifdef MMU_DEBUG
static bool dbg = 0;
module_param(dbg, bool, 0644);

#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
#define MMU_WARN_ON(x) WARN_ON(x)
#else
#define pgprintk(x...) do { } while (0)
#define rmap_printk(x...) do { } while (0)
#define MMU_WARN_ON(x) do { } while (0)
#endif

#define PTE_PREFETCH_NUM 8

#define PT_FIRST_AVAIL_BITS_SHIFT 10
#define PT64_SECOND_AVAIL_BITS_SHIFT 52

#define PT64_LEVEL_BITS 9

#define PT64_LEVEL_SHIFT(level) \
		(PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)

#define PT64_INDEX(address, level)\
	(((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))


#define PT32_LEVEL_BITS 10

#define PT32_LEVEL_SHIFT(level) \
		(PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)

#define PT32_LVL_OFFSET_MASK(level) \
	(PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
						* PT32_LEVEL_BITS))) - 1))

#define PT32_INDEX(address, level)\
	(((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))


#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
#define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1))
#else
#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
#endif
#define PT64_LVL_ADDR_MASK(level) \
	(PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
						* PT64_LEVEL_BITS))) - 1))
#define PT64_LVL_OFFSET_MASK(level) \
	(PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
						* PT64_LEVEL_BITS))) - 1))

#define PT32_BASE_ADDR_MASK PAGE_MASK
#define PT32_DIR_BASE_ADDR_MASK \
	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
#define PT32_LVL_ADDR_MASK(level) \
	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
					* PT32_LEVEL_BITS))) - 1))

#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \
			| shadow_x_mask | shadow_nx_mask | shadow_me_mask)

#define ACC_EXEC_MASK 1
#define ACC_WRITE_MASK PT_WRITABLE_MASK
#define ACC_USER_MASK PT_USER_MASK
#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)


#define PT64_EPT_READABLE_MASK 0x1ull
#define PT64_EPT_EXECUTABLE_MASK 0x4ull

#include <trace/events/kvm.h>

#define CREATE_TRACE_POINTS
#include "mmutrace.h"

#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
#define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))

#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)


#define PTE_LIST_EXT 3
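
/*
 * Page-fault handling return values: RET_PF_RETRY lets the vCPU retry the
 * faulting access, RET_PF_EMULATE asks the caller to emulate the faulting
 * instruction, and RET_PF_INVALID means the SPTE is stale and the fault
 * must still be processed.
 */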
enum {
	RET_PF_RETRY = 0,
	RET_PF_EMULATE = 1,
	RET_PF_INVALID = 2,
};

struct pte_list_desc {
	u64 *sptes[PTE_LIST_EXT];
	struct pte_list_desc *more;
};

struct kvm_shadow_walk_iterator {
	u64 addr;
	hpa_t shadow_addr;
	u64 *sptep;
	int level;
	unsigned index;
};

static const union kvm_mmu_page_role mmu_base_role_mask = {
	.cr0_wp = 1,
	.gpte_is_8_bytes = 1,
	.nxe = 1,
	.smep_andnot_wp = 1,
	.smap_andnot_wp = 1,
	.smm = 1,
	.guest_mode = 1,
	.ad_disabled = 1,
};

#define for_each_shadow_entry_using_root(_vcpu, _root, _addr, _walker)     \
	for (shadow_walk_init_using_root(&(_walker), (_vcpu),              \
					 (_root), (_addr));                \
	     shadow_walk_okay(&(_walker));			           \
	     shadow_walk_next(&(_walker)))

#define for_each_shadow_entry(_vcpu, _addr, _walker)            \
	for (shadow_walk_init(&(_walker), _vcpu, _addr);	\
	     shadow_walk_okay(&(_walker));			\
	     shadow_walk_next(&(_walker)))

#define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte)	\
	for (shadow_walk_init(&(_walker), _vcpu, _addr);		\
	     shadow_walk_okay(&(_walker)) &&				\
		({ spte = mmu_spte_get_lockless(_walker.sptep); 1; });	\
	     __shadow_walk_next(&(_walker), spte))

static struct kmem_cache *pte_list_desc_cache;
static struct kmem_cache *mmu_page_header_cache;
static struct percpu_counter kvm_total_used_mmu_pages;

static u64 __read_mostly shadow_nx_mask;
static u64 __read_mostly shadow_x_mask;
static u64 __read_mostly shadow_user_mask;
static u64 __read_mostly shadow_accessed_mask;
static u64 __read_mostly shadow_dirty_mask;
static u64 __read_mostly shadow_mmio_mask;
static u64 __read_mostly shadow_mmio_value;
static u64 __read_mostly shadow_present_mask;
static u64 __read_mostly shadow_me_mask;

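/*
 * The mask/value pair used to recognize SPTEs whose hardware accessed/dirty
 * bits are disabled and which therefore rely on KVM's software
 * access-tracking scheme (see mark_spte_for_access_track()).
 */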
static u64 __read_mostly shadow_acc_track_mask;
static const u64 shadow_acc_track_value = SPTE_SPECIAL_MASK;

static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK |
						    PT64_EPT_EXECUTABLE_MASK;
static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT;

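/*
 * Mask of high physical-address bits set in not-present and MMIO SPTEs so
 * that, as an L1TF mitigation, a speculatively loaded SPTE never points at
 * cacheable host memory.
 */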
static u64 __read_mostly shadow_nonpresent_or_rsvd_mask;

static const u64 shadow_nonpresent_or_rsvd_mask_len = 5;

static u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;

static void mmu_spte_set(u64 *sptep, u64 spte);
static union kvm_mmu_page_role
kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu);

static inline bool kvm_available_flush_tlb_with_range(void)
{
	return kvm_x86_ops->tlb_remote_flush_with_range;
}

static void kvm_flush_remote_tlbs_with_range(struct kvm *kvm,
		struct kvm_tlb_range *range)
{
	int ret = -ENOTSUPP;

	if (range && kvm_x86_ops->tlb_remote_flush_with_range)
		ret = kvm_x86_ops->tlb_remote_flush_with_range(kvm, range);

	if (ret)
		kvm_flush_remote_tlbs(kvm);
}

static void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
		u64 start_gfn, u64 pages)
{
	struct kvm_tlb_range range;

	range.start_gfn = start_gfn;
	range.pages = pages;

	kvm_flush_remote_tlbs_with_range(kvm, &range);
}

void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value)
{
	BUG_ON((mmio_mask & mmio_value) != mmio_value);
	shadow_mmio_value = mmio_value | SPTE_SPECIAL_MASK;
	shadow_mmio_mask = mmio_mask | SPTE_SPECIAL_MASK;
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);

static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
{
	return sp->role.ad_disabled;
}

static inline bool spte_ad_enabled(u64 spte)
{
	MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value);
	return !(spte & shadow_acc_track_value);
}

static inline u64 spte_shadow_accessed_mask(u64 spte)
{
	MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value);
	return spte_ad_enabled(spte) ? shadow_accessed_mask : 0;
}

static inline u64 spte_shadow_dirty_mask(u64 spte)
{
	MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value);
	return spte_ad_enabled(spte) ? shadow_dirty_mask : 0;
}

static inline bool is_access_track_spte(u64 spte)
{
	return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0;
}

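/*
 * MMIO SPTEs embed the memslot generation number, split into a low field
 * (SPTE bits 3-11) and a high field (SPTE bits 52-61).
 */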
#define MMIO_SPTE_GEN_MASK		GENMASK_ULL(18, 0)

#define MMIO_SPTE_GEN_LOW_START		3
#define MMIO_SPTE_GEN_LOW_END		11
#define MMIO_SPTE_GEN_LOW_MASK		GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \
						    MMIO_SPTE_GEN_LOW_START)

#define MMIO_SPTE_GEN_HIGH_START	52
#define MMIO_SPTE_GEN_HIGH_END		61
#define MMIO_SPTE_GEN_HIGH_MASK		GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \
						    MMIO_SPTE_GEN_HIGH_START)
static u64 generation_mmio_spte_mask(u64 gen)
{
	u64 mask;

	WARN_ON(gen & ~MMIO_SPTE_GEN_MASK);

	mask = (gen << MMIO_SPTE_GEN_LOW_START) & MMIO_SPTE_GEN_LOW_MASK;
	mask |= (gen << MMIO_SPTE_GEN_HIGH_START) & MMIO_SPTE_GEN_HIGH_MASK;
	return mask;
}

static u64 get_mmio_spte_generation(u64 spte)
{
	u64 gen;

	spte &= ~shadow_mmio_mask;

	gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_START;
	gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_START;
	return gen;
}

static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
			   unsigned access)
{
	u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK;
	u64 mask = generation_mmio_spte_mask(gen);
	u64 gpa = gfn << PAGE_SHIFT;

	access &= ACC_WRITE_MASK | ACC_USER_MASK;
	mask |= shadow_mmio_value | access;
	mask |= gpa | shadow_nonpresent_or_rsvd_mask;
	mask |= (gpa & shadow_nonpresent_or_rsvd_mask)
		<< shadow_nonpresent_or_rsvd_mask_len;

	page_header(__pa(sptep))->mmio_cached = true;

	trace_mark_mmio_spte(sptep, gfn, access, gen);
	mmu_spte_set(sptep, mask);
}

static bool is_mmio_spte(u64 spte)
{
	return (spte & shadow_mmio_mask) == shadow_mmio_value;
}

static gfn_t get_mmio_spte_gfn(u64 spte)
{
	u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask;

	gpa |= (spte >> shadow_nonpresent_or_rsvd_mask_len)
	       & shadow_nonpresent_or_rsvd_mask;

	return gpa >> PAGE_SHIFT;
}

static unsigned get_mmio_spte_access(u64 spte)
{
	u64 mask = generation_mmio_spte_mask(MMIO_SPTE_GEN_MASK) | shadow_mmio_mask;
	return (spte & ~mask) & ~PAGE_MASK;
}

static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
			  kvm_pfn_t pfn, unsigned access)
{
	if (unlikely(is_noslot_pfn(pfn))) {
		mark_mmio_spte(vcpu, sptep, gfn, access);
		return true;
	}

	return false;
}

static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
{
	u64 kvm_gen, spte_gen, gen;

	gen = kvm_vcpu_memslots(vcpu)->generation;
	if (unlikely(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS))
		return false;

	kvm_gen = gen & MMIO_SPTE_GEN_MASK;
	spte_gen = get_mmio_spte_generation(spte);

	trace_check_mmio_spte(spte, kvm_gen, spte_gen);
	return likely(kvm_gen == spte_gen);
}

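/*
 * Install the architecture-specific SPTE attribute masks (user, accessed,
 * dirty, NX, executable, present, access-tracking and memory-encryption
 * bits); called by vendor code once hardware capabilities are known.
 */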
void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
		u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
		u64 acc_track_mask, u64 me_mask)
{
	BUG_ON(!dirty_mask != !accessed_mask);
	BUG_ON(!accessed_mask && !acc_track_mask);
	BUG_ON(acc_track_mask & shadow_acc_track_value);

	shadow_user_mask = user_mask;
	shadow_accessed_mask = accessed_mask;
	shadow_dirty_mask = dirty_mask;
	shadow_nx_mask = nx_mask;
	shadow_x_mask = x_mask;
	shadow_present_mask = p_mask;
	shadow_acc_track_mask = acc_track_mask;
	shadow_me_mask = me_mask;
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);

static void kvm_mmu_reset_all_pte_masks(void)
{
	u8 low_phys_bits;

	shadow_user_mask = 0;
	shadow_accessed_mask = 0;
	shadow_dirty_mask = 0;
	shadow_nx_mask = 0;
	shadow_x_mask = 0;
	shadow_mmio_mask = 0;
	shadow_present_mask = 0;
	shadow_acc_track_mask = 0;

	shadow_nonpresent_or_rsvd_mask = 0;
	low_phys_bits = boot_cpu_data.x86_cache_bits;
	if (boot_cpu_data.x86_cache_bits <
	    52 - shadow_nonpresent_or_rsvd_mask_len) {
		shadow_nonpresent_or_rsvd_mask =
			rsvd_bits(boot_cpu_data.x86_cache_bits -
				  shadow_nonpresent_or_rsvd_mask_len,
				  boot_cpu_data.x86_cache_bits - 1);
		low_phys_bits -= shadow_nonpresent_or_rsvd_mask_len;
	} else
		WARN_ON_ONCE(boot_cpu_has_bug(X86_BUG_L1TF));

	shadow_nonpresent_or_rsvd_lower_gfn_mask =
		GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT);
}

static int is_cpuid_PSE36(void)
{
	return 1;
}

static int is_nx(struct kvm_vcpu *vcpu)
{
	return vcpu->arch.efer & EFER_NX;
}

static int is_shadow_present_pte(u64 pte)
{
	return (pte != 0) && !is_mmio_spte(pte);
}

static int is_large_pte(u64 pte)
{
	return pte & PT_PAGE_SIZE_MASK;
}

static int is_last_spte(u64 pte, int level)
{
	if (level == PT_PAGE_TABLE_LEVEL)
		return 1;
	if (is_large_pte(pte))
		return 1;
	return 0;
}

static bool is_executable_pte(u64 spte)
{
	return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask;
}

static kvm_pfn_t spte_to_pfn(u64 pte)
{
	return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
}

static gfn_t pse36_gfn_delta(u32 gpte)
{
	int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;

	return (gpte & PT32_DIR_PSE36_MASK) << shift;
}

#ifdef CONFIG_X86_64
static void __set_spte(u64 *sptep, u64 spte)
{
	WRITE_ONCE(*sptep, spte);
}

static void __update_clear_spte_fast(u64 *sptep, u64 spte)
{
	WRITE_ONCE(*sptep, spte);
}

static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
{
	return xchg(sptep, spte);
}

static u64 __get_spte_lockless(u64 *sptep)
{
	return READ_ONCE(*sptep);
}
#else
union split_spte {
	struct {
		u32 spte_low;
		u32 spte_high;
	};
	u64 spte;
};

static void count_spte_clear(u64 *sptep, u64 spte)
{
	struct kvm_mmu_page *sp =  page_header(__pa(sptep));

	if (is_shadow_present_pte(spte))
		return;

	smp_wmb();
	sp->clear_spte_count++;
}

static void __set_spte(u64 *sptep, u64 spte)
{
	union split_spte *ssptep, sspte;

	ssptep = (union split_spte *)sptep;
	sspte = (union split_spte)spte;

	ssptep->spte_high = sspte.spte_high;

	smp_wmb();

	WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
}

static void __update_clear_spte_fast(u64 *sptep, u64 spte)
{
	union split_spte *ssptep, sspte;

	ssptep = (union split_spte *)sptep;
	sspte = (union split_spte)spte;

	WRITE_ONCE(ssptep->spte_low, sspte.spte_low);

	smp_wmb();

	ssptep->spte_high = sspte.spte_high;
	count_spte_clear(sptep, spte);
}

static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
{
	union split_spte *ssptep, sspte, orig;

	ssptep = (union split_spte *)sptep;
	sspte = (union split_spte)spte;

	orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
	orig.spte_high = ssptep->spte_high;
	ssptep->spte_high = sspte.spte_high;
	count_spte_clear(sptep, spte);

	return orig.spte;
}

static u64 __get_spte_lockless(u64 *sptep)
{
	struct kvm_mmu_page *sp =  page_header(__pa(sptep));
	union split_spte spte, *orig = (union split_spte *)sptep;
	int count;

retry:
	count = sp->clear_spte_count;
	smp_rmb();

	spte.spte_low = orig->spte_low;
	smp_rmb();

	spte.spte_high = orig->spte_high;
	smp_rmb();

	if (unlikely(spte.spte_low != orig->spte_low ||
	      count != sp->clear_spte_count))
		goto retry;

	return spte.spte;
}
#endif

static bool spte_can_locklessly_be_made_writable(u64 spte)
{
	return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) ==
		(SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE);
}

static bool spte_has_volatile_bits(u64 spte)
{
	if (!is_shadow_present_pte(spte))
		return false;

	if (spte_can_locklessly_be_made_writable(spte) ||
	    is_access_track_spte(spte))
		return true;

	if (spte_ad_enabled(spte)) {
		if ((spte & shadow_accessed_mask) == 0 ||
	    	    (is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0))
			return true;
	}

	return false;
}

static bool is_accessed_spte(u64 spte)
{
	u64 accessed_mask = spte_shadow_accessed_mask(spte);

	return accessed_mask ? spte & accessed_mask
			     : !is_access_track_spte(spte);
}

static bool is_dirty_spte(u64 spte)
{
	u64 dirty_mask = spte_shadow_dirty_mask(spte);

	return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK;
}

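/*
 * Low-level SPTE mutators.  mmu_spte_set() installs an SPTE that was
 * previously non-present; the *_update helpers below transition between
 * present SPTEs and take care of accessed/dirty side effects.
 */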
static void mmu_spte_set(u64 *sptep, u64 new_spte)
{
	WARN_ON(is_shadow_present_pte(*sptep));
	__set_spte(sptep, new_spte);
}

static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
{
	u64 old_spte = *sptep;

	WARN_ON(!is_shadow_present_pte(new_spte));

	if (!is_shadow_present_pte(old_spte)) {
		mmu_spte_set(sptep, new_spte);
		return old_spte;
	}

	if (!spte_has_volatile_bits(old_spte))
		__update_clear_spte_fast(sptep, new_spte);
	else
		old_spte = __update_clear_spte_slow(sptep, new_spte);

	WARN_ON(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));

	return old_spte;
}

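/*
 * mmu_spte_update() writes @new_spte and returns true if a remote TLB flush
 * is needed, i.e. if the update revoked write access or cleared accessed or
 * dirty state that the hardware may still hold in a TLB entry.
 */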
static bool mmu_spte_update(u64 *sptep, u64 new_spte)
{
	bool flush = false;
	u64 old_spte = mmu_spte_update_no_track(sptep, new_spte);

	if (!is_shadow_present_pte(old_spte))
		return false;

	if (spte_can_locklessly_be_made_writable(old_spte) &&
	      !is_writable_pte(new_spte))
		flush = true;

	if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) {
		flush = true;
		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
	}

	if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) {
		flush = true;
		kvm_set_pfn_dirty(spte_to_pfn(old_spte));
	}

	return flush;
}

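/*
 * mmu_spte_clear_track_bits() zaps an SPTE and propagates its accessed and
 * dirty state to the backing page; it returns nonzero if the old SPTE was
 * present.
 */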
static int mmu_spte_clear_track_bits(u64 *sptep)
{
	kvm_pfn_t pfn;
	u64 old_spte = *sptep;

	if (!spte_has_volatile_bits(old_spte))
		__update_clear_spte_fast(sptep, 0ull);
	else
		old_spte = __update_clear_spte_slow(sptep, 0ull);

	if (!is_shadow_present_pte(old_spte))
		return 0;

	pfn = spte_to_pfn(old_spte);

	WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn)));

	if (is_accessed_spte(old_spte))
		kvm_set_pfn_accessed(pfn);

	if (is_dirty_spte(old_spte))
		kvm_set_pfn_dirty(pfn);

	return 1;
}

static void mmu_spte_clear_no_track(u64 *sptep)
{
	__update_clear_spte_fast(sptep, 0ull);
}

static u64 mmu_spte_get_lockless(u64 *sptep)
{
	return __get_spte_lockless(sptep);
}

static u64 mark_spte_for_access_track(u64 spte)
{
	if (spte_ad_enabled(spte))
		return spte & ~shadow_accessed_mask;

	if (is_access_track_spte(spte))
		return spte;

	WARN_ONCE((spte & PT_WRITABLE_MASK) &&
		  !spte_can_locklessly_be_made_writable(spte),
		  "kvm: Writable SPTE is not locklessly dirty-trackable\n");

	WARN_ONCE(spte & (shadow_acc_track_saved_bits_mask <<
			  shadow_acc_track_saved_bits_shift),
		  "kvm: Access Tracking saved bit locations are not zero\n");

	spte |= (spte & shadow_acc_track_saved_bits_mask) <<
		shadow_acc_track_saved_bits_shift;
	spte &= ~shadow_acc_track_mask;

	return spte;
}

static u64 restore_acc_track_spte(u64 spte)
{
	u64 new_spte = spte;
	u64 saved_bits = (spte >> shadow_acc_track_saved_bits_shift)
			 & shadow_acc_track_saved_bits_mask;

	WARN_ON_ONCE(spte_ad_enabled(spte));
	WARN_ON_ONCE(!is_access_track_spte(spte));

	new_spte &= ~shadow_acc_track_mask;
	new_spte &= ~(shadow_acc_track_saved_bits_mask <<
		      shadow_acc_track_saved_bits_shift);
	new_spte |= saved_bits;

	return new_spte;
}

static bool mmu_spte_age(u64 *sptep)
{
	u64 spte = mmu_spte_get_lockless(sptep);

	if (!is_accessed_spte(spte))
		return false;

	if (spte_ad_enabled(spte)) {
		clear_bit((ffs(shadow_accessed_mask) - 1),
			  (unsigned long *)sptep);
	} else {
		if (is_writable_pte(spte))
			kvm_set_pfn_dirty(spte_to_pfn(spte));

		spte = mark_spte_for_access_track(spte);
		mmu_spte_update_no_track(sptep, spte);
	}

	return true;
}

static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
{
	local_irq_disable();

	smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
}

static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
{
	smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
	local_irq_enable();
}

static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
				  struct kmem_cache *base_cache, int min)
{
	void *obj;

	if (cache->nobjs >= min)
		return 0;
	while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
		obj = kmem_cache_zalloc(base_cache, GFP_KERNEL_ACCOUNT);
		if (!obj)
			return cache->nobjs >= min ? 0 : -ENOMEM;
		cache->objects[cache->nobjs++] = obj;
	}
	return 0;
}

static int mmu_memory_cache_free_objects(struct kvm_mmu_memory_cache *cache)
{
	return cache->nobjs;
}

static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
				  struct kmem_cache *cache)
{
	while (mc->nobjs)
		kmem_cache_free(cache, mc->objects[--mc->nobjs]);
}

static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
				       int min)
{
	void *page;

	if (cache->nobjs >= min)
		return 0;
	while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
		page = (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
		if (!page)
			return cache->nobjs >= min ? 0 : -ENOMEM;
		cache->objects[cache->nobjs++] = page;
	}
	return 0;
}

static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
{
	while (mc->nobjs)
		free_page((unsigned long)mc->objects[--mc->nobjs]);
}

static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
{
	int r;

	r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
				   pte_list_desc_cache, 8 + PTE_PREFETCH_NUM);
	if (r)
		goto out;
	r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
	if (r)
		goto out;
	r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
				   mmu_page_header_cache, 4);
out:
	return r;
}

static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
{
	mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
				pte_list_desc_cache);
	mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
	mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
				mmu_page_header_cache);
}

static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
{
	void *p;

	BUG_ON(!mc->nobjs);
	p = mc->objects[--mc->nobjs];
	return p;
}

static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)
{
	return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache);
}

static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
{
	kmem_cache_free(pte_list_desc_cache, pte_list_desc);
}

static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
{
	if (!sp->role.direct)
		return sp->gfns[index];

	return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
}

static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
{
	if (sp->role.direct)
		BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index));
	else
		sp->gfns[index] = gfn;
}

static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
					      struct kvm_memory_slot *slot,
					      int level)
{
	unsigned long idx;

	idx = gfn_to_index(gfn, slot->base_gfn, level);
	return &slot->arch.lpage_info[level - 2][idx];
}

static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot,
					    gfn_t gfn, int count)
{
	struct kvm_lpage_info *linfo;
	int i;

	for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
		linfo = lpage_info_slot(gfn, slot, i);
		linfo->disallow_lpage += count;
		WARN_ON(linfo->disallow_lpage < 0);
	}
}

void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
{
	update_gfn_disallow_lpage_count(slot, gfn, 1);
}

void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
{
	update_gfn_disallow_lpage_count(slot, gfn, -1);
}

static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *slot;
	gfn_t gfn;

	kvm->arch.indirect_shadow_pages++;
	gfn = sp->gfn;
	slots = kvm_memslots_for_spte_role(kvm, sp->role);
	slot = __gfn_to_memslot(slots, gfn);

	if (sp->role.level > PT_PAGE_TABLE_LEVEL)
		return kvm_slot_page_track_add_page(kvm, slot, gfn,
						    KVM_PAGE_TRACK_WRITE);

	kvm_mmu_gfn_disallow_lpage(slot, gfn);
}

static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *slot;
	gfn_t gfn;

	kvm->arch.indirect_shadow_pages--;
	gfn = sp->gfn;
	slots = kvm_memslots_for_spte_role(kvm, sp->role);
	slot = __gfn_to_memslot(slots, gfn);
	if (sp->role.level > PT_PAGE_TABLE_LEVEL)
		return kvm_slot_page_track_remove_page(kvm, slot, gfn,
						       KVM_PAGE_TRACK_WRITE);

	kvm_mmu_gfn_allow_lpage(slot, gfn);
}

static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level,
					  struct kvm_memory_slot *slot)
{
	struct kvm_lpage_info *linfo;

	if (slot) {
		linfo = lpage_info_slot(gfn, slot, level);
		return !!linfo->disallow_lpage;
	}

	return true;
}

static bool mmu_gfn_lpage_is_disallowed(struct kvm_vcpu *vcpu, gfn_t gfn,
					int level)
{
	struct kvm_memory_slot *slot;

	slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
	return __mmu_gfn_lpage_is_disallowed(gfn, level, slot);
}

static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
{
	unsigned long page_size;
	int i, ret = 0;

	page_size = kvm_host_page_size(kvm, gfn);

	for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
		if (page_size >= KVM_HPAGE_SIZE(i))
			ret = i;
		else
			break;
	}

	return ret;
}

static inline bool memslot_valid_for_gpte(struct kvm_memory_slot *slot,
					  bool no_dirty_log)
{
	if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
		return false;
	if (no_dirty_log && slot->dirty_bitmap)
		return false;

	return true;
}

static struct kvm_memory_slot *
gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
			    bool no_dirty_log)
{
	struct kvm_memory_slot *slot;

	slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
	if (!memslot_valid_for_gpte(slot, no_dirty_log))
		slot = NULL;

	return slot;
}

static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn,
			 bool *force_pt_level)
{
	int host_level, level, max_level;
	struct kvm_memory_slot *slot;

	if (unlikely(*force_pt_level))
		return PT_PAGE_TABLE_LEVEL;

	slot = kvm_vcpu_gfn_to_memslot(vcpu, large_gfn);
	*force_pt_level = !memslot_valid_for_gpte(slot, true);
	if (unlikely(*force_pt_level))
		return PT_PAGE_TABLE_LEVEL;

	host_level = host_mapping_level(vcpu->kvm, large_gfn);

	if (host_level == PT_PAGE_TABLE_LEVEL)
		return host_level;

	max_level = min(kvm_x86_ops->get_lpage_level(), host_level);

	for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
		if (__mmu_gfn_lpage_is_disallowed(large_gfn, level, slot))
			break;

	return level - 1;
}

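/*
 * pte_list_add() links @spte into the rmap/parent-PTE list at @rmap_head.
 * A single SPTE is stored inline in rmap_head->val; additional SPTEs are
 * chained through pte_list_desc structures (low bit of ->val set).
 * Returns the count of SPTEs that were already on the list.
 */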
static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
			struct kvm_rmap_head *rmap_head)
{
	struct pte_list_desc *desc;
	int i, count = 0;

	if (!rmap_head->val) {
		rmap_printk("pte_list_add: %p %llx 0->1\n", spte, *spte);
		rmap_head->val = (unsigned long)spte;
	} else if (!(rmap_head->val & 1)) {
		rmap_printk("pte_list_add: %p %llx 1->many\n", spte, *spte);
		desc = mmu_alloc_pte_list_desc(vcpu);
		desc->sptes[0] = (u64 *)rmap_head->val;
		desc->sptes[1] = spte;
		rmap_head->val = (unsigned long)desc | 1;
		++count;
	} else {
		rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte);
		desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
		while (desc->sptes[PTE_LIST_EXT-1] && desc->more) {
			desc = desc->more;
			count += PTE_LIST_EXT;
		}
		if (desc->sptes[PTE_LIST_EXT-1]) {
			desc->more = mmu_alloc_pte_list_desc(vcpu);
			desc = desc->more;
		}
		for (i = 0; desc->sptes[i]; ++i)
			++count;
		desc->sptes[i] = spte;
	}
	return count;
}

static void
pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
			   struct pte_list_desc *desc, int i,
			   struct pte_list_desc *prev_desc)
{
	int j;

	for (j = PTE_LIST_EXT - 1; !desc->sptes[j] && j > i; --j)
		;
	desc->sptes[i] = desc->sptes[j];
	desc->sptes[j] = NULL;
	if (j != 0)
		return;
	if (!prev_desc && !desc->more)
		rmap_head->val = (unsigned long)desc->sptes[0];
	else
		if (prev_desc)
			prev_desc->more = desc->more;
		else
			rmap_head->val = (unsigned long)desc->more | 1;
	mmu_free_pte_list_desc(desc);
}

static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
{
	struct pte_list_desc *desc;
	struct pte_list_desc *prev_desc;
	int i;

	if (!rmap_head->val) {
		pr_err("%s: %p 0->BUG\n", __func__, spte);
		BUG();
	} else if (!(rmap_head->val & 1)) {
		rmap_printk("%s: %p 1->0\n", __func__, spte);
		if ((u64 *)rmap_head->val != spte) {
			pr_err("%s: %p 1->BUG\n", __func__, spte);
			BUG();
		}
		rmap_head->val = 0;
	} else {
		rmap_printk("%s: %p many->many\n", __func__, spte);
		desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
		prev_desc = NULL;
		while (desc) {
			for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) {
				if (desc->sptes[i] == spte) {
					pte_list_desc_remove_entry(rmap_head,
							desc, i, prev_desc);
					return;
				}
			}
			prev_desc = desc;
			desc = desc->more;
		}
		pr_err("%s: %p many->many\n", __func__, spte);
		BUG();
	}
}

static void pte_list_remove(struct kvm_rmap_head *rmap_head, u64 *sptep)
{
	mmu_spte_clear_track_bits(sptep);
	__pte_list_remove(sptep, rmap_head);
}

static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level,
					   struct kvm_memory_slot *slot)
{
	unsigned long idx;

	idx = gfn_to_index(gfn, slot->base_gfn, level);
	return &slot->arch.rmap[level - PT_PAGE_TABLE_LEVEL][idx];
}

static struct kvm_rmap_head *gfn_to_rmap(struct kvm *kvm, gfn_t gfn,
					 struct kvm_mmu_page *sp)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *slot;

	slots = kvm_memslots_for_spte_role(kvm, sp->role);
	slot = __gfn_to_memslot(slots, gfn);
	return __gfn_to_rmap(gfn, sp->role.level, slot);
}

static bool rmap_can_add(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu_memory_cache *cache;

	cache = &vcpu->arch.mmu_pte_list_desc_cache;
	return mmu_memory_cache_free_objects(cache);
}

static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
{
	struct kvm_mmu_page *sp;
	struct kvm_rmap_head *rmap_head;

	sp = page_header(__pa(spte));
	kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
	rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
	return pte_list_add(vcpu, spte, rmap_head);
}

static void rmap_remove(struct kvm *kvm, u64 *spte)
{
	struct kvm_mmu_page *sp;
	gfn_t gfn;
	struct kvm_rmap_head *rmap_head;

	sp = page_header(__pa(spte));
	gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
	rmap_head = gfn_to_rmap(kvm, gfn, sp);
	__pte_list_remove(spte, rmap_head);
}

struct rmap_iterator {
	struct pte_list_desc *desc;
	int pos;
};

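/*
 * rmap_get_first()/rmap_get_next() walk all SPTEs linked from an rmap head
 * and return NULL when the walk is complete; use the for_each_rmap_spte()
 * wrapper rather than calling them directly.
 */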
static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
			   struct rmap_iterator *iter)
{
	u64 *sptep;

	if (!rmap_head->val)
		return NULL;

	if (!(rmap_head->val & 1)) {
		iter->desc = NULL;
		sptep = (u64 *)rmap_head->val;
		goto out;
	}

	iter->desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
	iter->pos = 0;
	sptep = iter->desc->sptes[iter->pos];
out:
	BUG_ON(!is_shadow_present_pte(*sptep));
	return sptep;
}

static u64 *rmap_get_next(struct rmap_iterator *iter)
{
	u64 *sptep;

	if (iter->desc) {
		if (iter->pos < PTE_LIST_EXT - 1) {
			++iter->pos;
			sptep = iter->desc->sptes[iter->pos];
			if (sptep)
				goto out;
		}

		iter->desc = iter->desc->more;

		if (iter->desc) {
			iter->pos = 0;
			sptep = iter->desc->sptes[iter->pos];
			goto out;
		}
	}

	return NULL;
out:
	BUG_ON(!is_shadow_present_pte(*sptep));
	return sptep;
}

#define for_each_rmap_spte(_rmap_head_, _iter_, _spte_)			\
	for (_spte_ = rmap_get_first(_rmap_head_, _iter_);		\
	     _spte_; _spte_ = rmap_get_next(_iter_))

static void drop_spte(struct kvm *kvm, u64 *sptep)
{
	if (mmu_spte_clear_track_bits(sptep))
		rmap_remove(kvm, sptep);
}

static bool __drop_large_spte(struct kvm *kvm, u64 *sptep)
{
	if (is_large_pte(*sptep)) {
		WARN_ON(page_header(__pa(sptep))->role.level ==
			PT_PAGE_TABLE_LEVEL);
		drop_spte(kvm, sptep);
		--kvm->stat.lpages;
		return true;
	}

	return false;
}

static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
{
	if (__drop_large_spte(vcpu->kvm, sptep)) {
		struct kvm_mmu_page *sp = page_header(__pa(sptep));

		kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
			KVM_PAGES_PER_HPAGE(sp->role.level));
	}
}

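/*
 * Write-protect an SPTE for dirty logging or shadow-page protection.  If
 * @pt_protect is set, SPTE_MMU_WRITEABLE is also cleared so the SPTE cannot
 * be made writable again locklessly.  Returns true if the TLB needs to be
 * flushed.
 */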
static bool spte_write_protect(u64 *sptep, bool pt_protect)
{
	u64 spte = *sptep;

	if (!is_writable_pte(spte) &&
	      !(pt_protect && spte_can_locklessly_be_made_writable(spte)))
		return false;

	rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);

	if (pt_protect)
		spte &= ~SPTE_MMU_WRITEABLE;
	spte = spte & ~PT_WRITABLE_MASK;

	return mmu_spte_update(sptep, spte);
}

static bool __rmap_write_protect(struct kvm *kvm,
				 struct kvm_rmap_head *rmap_head,
				 bool pt_protect)
{
	u64 *sptep;
	struct rmap_iterator iter;
	bool flush = false;

	for_each_rmap_spte(rmap_head, &iter, sptep)
		flush |= spte_write_protect(sptep, pt_protect);

	return flush;
}

static bool spte_clear_dirty(u64 *sptep)
{
	u64 spte = *sptep;

	rmap_printk("rmap_clear_dirty: spte %p %llx\n", sptep, *sptep);

	spte &= ~shadow_dirty_mask;

	return mmu_spte_update(sptep, spte);
}

static bool wrprot_ad_disabled_spte(u64 *sptep)
{
	bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT,
					       (unsigned long *)sptep);
	if (was_writable)
		kvm_set_pfn_dirty(spte_to_pfn(*sptep));

	return was_writable;
}

static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
{
	u64 *sptep;
	struct rmap_iterator iter;
	bool flush = false;

	for_each_rmap_spte(rmap_head, &iter, sptep)
		if (spte_ad_enabled(*sptep))
			flush |= spte_clear_dirty(sptep);
		else
			flush |= wrprot_ad_disabled_spte(sptep);

	return flush;
}

static bool spte_set_dirty(u64 *sptep)
{
	u64 spte = *sptep;

	rmap_printk("rmap_set_dirty: spte %p %llx\n", sptep, *sptep);

	spte |= shadow_dirty_mask;

	return mmu_spte_update(sptep, spte);
}

static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
{
	u64 *sptep;
	struct rmap_iterator iter;
	bool flush = false;

	for_each_rmap_spte(rmap_head, &iter, sptep)
		if (spte_ad_enabled(*sptep))
			flush |= spte_set_dirty(sptep);

	return flush;
}

static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
				     struct kvm_memory_slot *slot,
				     gfn_t gfn_offset, unsigned long mask)
{
	struct kvm_rmap_head *rmap_head;

	while (mask) {
		rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
					  PT_PAGE_TABLE_LEVEL, slot);
		__rmap_write_protect(kvm, rmap_head, false);

		mask &= mask - 1;
	}
}

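/*
 * kvm_mmu_clear_dirty_pt_masked - clear the dirty bit (or write-protect, for
 * SPTEs without A/D bits) of the pages selected by @mask, starting at
 * @gfn_offset within @slot, in order to collect dirty-logging information.
 */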
void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
				     struct kvm_memory_slot *slot,
				     gfn_t gfn_offset, unsigned long mask)
{
	struct kvm_rmap_head *rmap_head;

	while (mask) {
		rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
					  PT_PAGE_TABLE_LEVEL, slot);
		__rmap_clear_dirty(kvm, rmap_head);

		mask &= mask - 1;
	}
}
EXPORT_SYMBOL_GPL(kvm_mmu_clear_dirty_pt_masked);

void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
				struct kvm_memory_slot *slot,
				gfn_t gfn_offset, unsigned long mask)
{
	if (kvm_x86_ops->enable_log_dirty_pt_masked)
		kvm_x86_ops->enable_log_dirty_pt_masked(kvm, slot, gfn_offset,
				mask);
	else
		kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
}

int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu)
{
	if (kvm_x86_ops->write_log_dirty)
		return kvm_x86_ops->write_log_dirty(vcpu);

	return 0;
}

bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
				    struct kvm_memory_slot *slot, u64 gfn)
{
	struct kvm_rmap_head *rmap_head;
	int i;
	bool write_protected = false;

	for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
		rmap_head = __gfn_to_rmap(gfn, i, slot);
		write_protected |= __rmap_write_protect(kvm, rmap_head, true);
	}

	return write_protected;
}

static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
{
	struct kvm_memory_slot *slot;

	slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
	return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn);
}

static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
{
	u64 *sptep;
	struct rmap_iterator iter;
	bool flush = false;

	while ((sptep = rmap_get_first(rmap_head, &iter))) {
		rmap_printk("%s: spte %p %llx.\n", __func__, sptep, *sptep);

		pte_list_remove(rmap_head, sptep);
		flush = true;
	}

	return flush;
}

static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
			   struct kvm_memory_slot *slot, gfn_t gfn, int level,
			   unsigned long data)
{
	return kvm_zap_rmapp(kvm, rmap_head);
}

static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
			     struct kvm_memory_slot *slot, gfn_t gfn, int level,
			     unsigned long data)
{
	u64 *sptep;
	struct rmap_iterator iter;
	int need_flush = 0;
	u64 new_spte;
	pte_t *ptep = (pte_t *)data;
	kvm_pfn_t new_pfn;

	WARN_ON(pte_huge(*ptep));
	new_pfn = pte_pfn(*ptep);

restart:
	for_each_rmap_spte(rmap_head, &iter, sptep) {
		rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n",
			    sptep, *sptep, gfn, level);

		need_flush = 1;

		if (pte_write(*ptep)) {
			pte_list_remove(rmap_head, sptep);
			goto restart;
		} else {
			new_spte = *sptep & ~PT64_BASE_ADDR_MASK;
			new_spte |= (u64)new_pfn << PAGE_SHIFT;

			new_spte &= ~PT_WRITABLE_MASK;
			new_spte &= ~SPTE_HOST_WRITEABLE;

			new_spte = mark_spte_for_access_track(new_spte);

			mmu_spte_clear_track_bits(sptep);
			mmu_spte_set(sptep, new_spte);
		}
	}

	if (need_flush && kvm_available_flush_tlb_with_range()) {
		kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
		return 0;
	}

	return need_flush;
}

struct slot_rmap_walk_iterator {
	/* input fields. */
	struct kvm_memory_slot *slot;
	gfn_t start_gfn;
	gfn_t end_gfn;
	int start_level;
	int end_level;

	/* output fields. */
	gfn_t gfn;
	struct kvm_rmap_head *rmap;
	int level;

	/* private field. */
	struct kvm_rmap_head *end_rmap;
};

static void
rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level)
{
	iterator->level = level;
	iterator->gfn = iterator->start_gfn;
	iterator->rmap = __gfn_to_rmap(iterator->gfn, level, iterator->slot);
	iterator->end_rmap = __gfn_to_rmap(iterator->end_gfn, level,
					   iterator->slot);
}

static void
slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator,
		    struct kvm_memory_slot *slot, int start_level,
		    int end_level, gfn_t start_gfn, gfn_t end_gfn)
{
	iterator->slot = slot;
	iterator->start_level = start_level;
	iterator->end_level = end_level;
	iterator->start_gfn = start_gfn;
	iterator->end_gfn = end_gfn;

	rmap_walk_init_level(iterator, iterator->start_level);
}

static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator)
{
	return !!iterator->rmap;
}

static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
{
	if (++iterator->rmap <= iterator->end_rmap) {
		iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level));
		return;
	}

	if (++iterator->level > iterator->end_level) {
		iterator->rmap = NULL;
		return;
	}

	rmap_walk_init_level(iterator, iterator->level);
}

#define for_each_slot_rmap_range(_slot_, _start_level_, _end_level_,	\
	   _start_gfn, _end_gfn, _iter_)				\
	for (slot_rmap_walk_init(_iter_, _slot_, _start_level_,		\
				 _end_level_, _start_gfn, _end_gfn);	\
	     slot_rmap_walk_okay(_iter_);				\
	     slot_rmap_walk_next(_iter_))

static int kvm_handle_hva_range(struct kvm *kvm,
				unsigned long start,
				unsigned long end,
				unsigned long data,
				int (*handler)(struct kvm *kvm,
					       struct kvm_rmap_head *rmap_head,
					       struct kvm_memory_slot *slot,
					       gfn_t gfn,
					       int level,
					       unsigned long data))
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	struct slot_rmap_walk_iterator iterator;
	int ret = 0;
	int i;

	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
		slots = __kvm_memslots(kvm, i);
		kvm_for_each_memslot(memslot, slots) {
			unsigned long hva_start, hva_end;
			gfn_t gfn_start, gfn_end;

			hva_start = max(start, memslot->userspace_addr);
			hva_end = min(end, memslot->userspace_addr +
				      (memslot->npages << PAGE_SHIFT));
			if (hva_start >= hva_end)
				continue;

			gfn_start = hva_to_gfn_memslot(hva_start, memslot);
			gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);

			for_each_slot_rmap_range(memslot, PT_PAGE_TABLE_LEVEL,
						 PT_MAX_HUGEPAGE_LEVEL,
						 gfn_start, gfn_end - 1,
						 &iterator)
				ret |= handler(kvm, iterator.rmap, memslot,
					       iterator.gfn, iterator.level, data);
		}
	}

	return ret;
}

static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
			  unsigned long data,
			  int (*handler)(struct kvm *kvm,
					 struct kvm_rmap_head *rmap_head,
					 struct kvm_memory_slot *slot,
					 gfn_t gfn, int level,
					 unsigned long data))
{
	return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler);
}

int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
{
	return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp);
}

int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
{
	return kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
}

static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
			 struct kvm_memory_slot *slot, gfn_t gfn, int level,
			 unsigned long data)
{
	u64 *sptep;
	struct rmap_iterator uninitialized_var(iter);
	int young = 0;

	for_each_rmap_spte(rmap_head, &iter, sptep)
		young |= mmu_spte_age(sptep);

	trace_kvm_age_page(gfn, level, slot, young);
	return young;
}

static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
			      struct kvm_memory_slot *slot, gfn_t gfn,
			      int level, unsigned long data)
{
	u64 *sptep;
	struct rmap_iterator iter;

	for_each_rmap_spte(rmap_head, &iter, sptep)
		if (is_accessed_spte(*sptep))
			return 1;
	return 0;
}

#define RMAP_RECYCLE_THRESHOLD 1000

static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
{
	struct kvm_rmap_head *rmap_head;
	struct kvm_mmu_page *sp;

	sp = page_header(__pa(spte));

	rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);

	kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, 0);
	kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
			KVM_PAGES_PER_HPAGE(sp->role.level));
}

int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
{
	return kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp);
}

int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
{
	return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
}

#ifdef MMU_DEBUG
static int is_empty_shadow_page(u64 *spt)
{
	u64 *pos;
	u64 *end;

	for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
		if (is_shadow_present_pte(*pos)) {
			printk(KERN_ERR "%s: %p %llx\n", __func__,
			       pos, *pos);
			return 0;
		}
	return 1;
}
#endif

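/*
 * kvm_mod_used_mmu_pages() adjusts both the per-VM count of shadow pages in
 * use and the global kvm_total_used_mmu_pages counter consulted by the MMU
 * shrinker.
 */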
static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, unsigned long nr)
{
	kvm->arch.n_used_mmu_pages += nr;
	percpu_counter_add(&kvm_total_used_mmu_pages, nr);
}

static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
{
	MMU_WARN_ON(!is_empty_shadow_page(sp->spt));
	hlist_del(&sp->hash_link);
	list_del(&sp->link);
	free_page((unsigned long)sp->spt);
	if (!sp->role.direct)
		free_page((unsigned long)sp->gfns);
	kmem_cache_free(mmu_page_header_cache, sp);
}

static unsigned kvm_page_table_hashfn(gfn_t gfn)
{
	return hash_64(gfn, KVM_MMU_HASH_SHIFT);
}

static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
				    struct kvm_mmu_page *sp, u64 *parent_pte)
{
	if (!parent_pte)
		return;

	pte_list_add(vcpu, parent_pte, &sp->parent_ptes);
}

static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
				       u64 *parent_pte)
{
	__pte_list_remove(parent_pte, &sp->parent_ptes);
}

static void drop_parent_pte(struct kvm_mmu_page *sp,
			    u64 *parent_pte)
{
	mmu_page_remove_parent_pte(sp, parent_pte);
	mmu_spte_clear_no_track(parent_pte);
}

static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct)
{
	struct kvm_mmu_page *sp;

	sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
	sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
	if (!direct)
		sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
	list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
	kvm_mod_used_mmu_pages(vcpu->kvm, +1);
	return sp;
}

static void mark_unsync(u64 *spte);
static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
{
	u64 *sptep;
	struct rmap_iterator iter;

	for_each_rmap_spte(&sp->parent_ptes, &iter, sptep) {
		mark_unsync(sptep);
	}
}

static void mark_unsync(u64 *spte)
{
	struct kvm_mmu_page *sp;
	unsigned int index;

	sp = page_header(__pa(spte));
	index = spte - sp->spt;
	if (__test_and_set_bit(index, sp->unsync_child_bitmap))
		return;
	if (sp->unsync_children++)
		return;
	kvm_mmu_mark_parents_unsync(sp);
}

static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
			       struct kvm_mmu_page *sp)
{
	return 0;
}

static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root)
{
}

static void nonpaging_update_pte(struct kvm_vcpu *vcpu,
				 struct kvm_mmu_page *sp, u64 *spte,
				 const void *pte)
{
	WARN_ON(1);
}

#define KVM_PAGE_ARRAY_NR 16

struct kvm_mmu_pages {
	struct mmu_page_and_offset {
		struct kvm_mmu_page *sp;
		unsigned int idx;
	} page[KVM_PAGE_ARRAY_NR];
	unsigned int nr;
};

static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
			 int idx)
{
	int i;

	if (sp->unsync)
		for (i = 0; i < pvec->nr; i++)
			if (pvec->page[i].sp == sp)
				return 0;

	pvec->page[pvec->nr].sp = sp;
	pvec->page[pvec->nr].idx = idx;
	pvec->nr++;
	return (pvec->nr == KVM_PAGE_ARRAY_NR);
}

static inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx)
{
	--sp->unsync_children;
	WARN_ON((int)sp->unsync_children < 0);
	__clear_bit(idx, sp->unsync_child_bitmap);
}

static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
			   struct kvm_mmu_pages *pvec)
{
	int i, ret, nr_unsync_leaf = 0;

	for_each_set_bit(i, sp->unsync_child_bitmap, 512) {
		struct kvm_mmu_page *child;
		u64 ent = sp->spt[i];

		if (!is_shadow_present_pte(ent) || is_large_pte(ent)) {
			clear_unsync_child_bit(sp, i);
			continue;
		}

		child = page_header(ent & PT64_BASE_ADDR_MASK);

		if (child->unsync_children) {
			if (mmu_pages_add(pvec, child, i))
				return -ENOSPC;

			ret = __mmu_unsync_walk(child, pvec);
			if (!ret) {
				clear_unsync_child_bit(sp, i);
				continue;
			} else if (ret > 0) {
				nr_unsync_leaf += ret;
			} else
				return ret;
		} else if (child->unsync) {
			nr_unsync_leaf++;
			if (mmu_pages_add(pvec, child, i))
				return -ENOSPC;
		} else
			clear_unsync_child_bit(sp, i);
	}

	return nr_unsync_leaf;
}

#define INVALID_INDEX (-1)

static int mmu_unsync_walk(struct kvm_mmu_page *sp,
			   struct kvm_mmu_pages *pvec)
{
	pvec->nr = 0;
	if (!sp->unsync_children)
		return 0;

	mmu_pages_add(pvec, sp, INVALID_INDEX);
	return __mmu_unsync_walk(sp, pvec);
}

static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	WARN_ON(!sp->unsync);
	trace_kvm_mmu_sync_page(sp);
	sp->unsync = 0;
	--kvm->stat.mmu_unsync;
}

static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
				     struct list_head *invalid_list);
static void kvm_mmu_commit_zap_page(struct kvm *kvm,
				    struct list_head *invalid_list);

#define for_each_valid_sp(_kvm, _sp, _gfn)				\
	hlist_for_each_entry(_sp,					\
	  &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
		if ((_sp)->role.invalid) {    \
		} else

#define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn)			\
	for_each_valid_sp(_kvm, _sp, _gfn)				\
		if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else

static inline bool is_ept_sp(struct kvm_mmu_page *sp)
{
	return sp->role.cr0_wp && sp->role.smap_andnot_wp;
}

static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
			    struct list_head *invalid_list)
{
	if ((!is_ept_sp(sp) && sp->role.gpte_is_8_bytes != !!is_pae(vcpu)) ||
	    vcpu->arch.mmu->sync_page(vcpu, sp) == 0) {
		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
		return false;
	}

	return true;
}

static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm,
					struct list_head *invalid_list,
					bool remote_flush)
{
	if (!remote_flush && list_empty(invalid_list))
		return false;

	if (!list_empty(invalid_list))
		kvm_mmu_commit_zap_page(kvm, invalid_list);
	else
		kvm_flush_remote_tlbs(kvm);
	return true;
}

static void kvm_mmu_flush_or_zap(struct kvm_vcpu *vcpu,
				 struct list_head *invalid_list,
				 bool remote_flush, bool local_flush)
{
	if (kvm_mmu_remote_flush_or_zap(vcpu->kvm, invalid_list, remote_flush))
		return;

	if (local_flush)
		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
}

#ifdef CONFIG_KVM_MMU_AUDIT
#include "mmu_audit.c"
#else
static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { }
static void mmu_audit_disable(void) { }
#endif

static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
			 struct list_head *invalid_list)
{
	kvm_unlink_unsync_page(vcpu->kvm, sp);
	return __kvm_sync_page(vcpu, sp, invalid_list);
}

static bool kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn,
			   struct list_head *invalid_list)
{
	struct kvm_mmu_page *s;
	bool ret = false;

	for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) {
		if (!s->unsync)
			continue;

		WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
		ret |= kvm_sync_page(vcpu, s, invalid_list);
	}

	return ret;
}

struct mmu_page_path {
	struct kvm_mmu_page *parent[PT64_ROOT_MAX_LEVEL];
	unsigned int idx[PT64_ROOT_MAX_LEVEL];
};

#define for_each_sp(pvec, sp, parents, i)			\
		for (i = mmu_pages_first(&pvec, &parents);	\
			i < pvec.nr && ({ sp = pvec.page[i].sp; 1;});	\
			i = mmu_pages_next(&pvec, &parents, i))

static int mmu_pages_next(struct kvm_mmu_pages *pvec,
			  struct mmu_page_path *parents,
			  int i)
{
	int n;

	for (n = i+1; n < pvec->nr; n++) {
		struct kvm_mmu_page *sp = pvec->page[n].sp;
		unsigned idx = pvec->page[n].idx;
		int level = sp->role.level;

		parents->idx[level-1] = idx;
		if (level == PT_PAGE_TABLE_LEVEL)
			break;

		parents->parent[level-2] = sp;
	}

	return n;
}

static int mmu_pages_first(struct kvm_mmu_pages *pvec,
			   struct mmu_page_path *parents)
{
	struct kvm_mmu_page *sp;
	int level;

	if (pvec->nr == 0)
		return 0;

	WARN_ON(pvec->page[0].idx != INVALID_INDEX);

	sp = pvec->page[0].sp;
	level = sp->role.level;
	WARN_ON(level == PT_PAGE_TABLE_LEVEL);

	parents->parent[level-2] = sp;

	parents->parent[level-1] = NULL;
	return mmu_pages_next(pvec, parents, 0);
}

static void mmu_pages_clear_parents(struct mmu_page_path *parents)
{
	struct kvm_mmu_page *sp;
	unsigned int level = 0;

	do {
		unsigned int idx = parents->idx[level];
		sp = parents->parent[level];
		if (!sp)
			return;

		WARN_ON(idx == INVALID_INDEX);
		clear_unsync_child_bit(sp, idx);
		level++;
	} while (!sp->unsync_children);
}

static void mmu_sync_children(struct kvm_vcpu *vcpu,
			      struct kvm_mmu_page *parent)
{
	int i;
	struct kvm_mmu_page *sp;
	struct mmu_page_path parents;
	struct kvm_mmu_pages pages;
	LIST_HEAD(invalid_list);
	bool flush = false;

	while (mmu_unsync_walk(parent, &pages)) {
		bool protected = false;

		for_each_sp(pages, sp, parents, i)
			protected |= rmap_write_protect(vcpu, sp->gfn);

		if (protected) {
			kvm_flush_remote_tlbs(vcpu->kvm);
			flush = false;
		}

		for_each_sp(pages, sp, parents, i) {
			flush |= kvm_sync_page(vcpu, sp, &invalid_list);
			mmu_pages_clear_parents(&parents);
		}
		if (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock)) {
			kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
			cond_resched_lock(&vcpu->kvm->mmu_lock);
			flush = false;
		}
	}

	kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
}
2406
2407static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
2408{
2409 atomic_set(&sp->write_flooding_count, 0);
2410}
2411
2412static void clear_sp_write_flooding_count(u64 *spte)
2413{
2414 struct kvm_mmu_page *sp = page_header(__pa(spte));
2415
2416 __clear_sp_write_flooding_count(sp);
2417}
2418
2419static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
2420 gfn_t gfn,
2421 gva_t gaddr,
2422 unsigned level,
2423 int direct,
2424 unsigned access)
2425{
2426 union kvm_mmu_page_role role;
2427 unsigned quadrant;
2428 struct kvm_mmu_page *sp;
2429 bool need_sync = false;
2430 bool flush = false;
2431 int collisions = 0;
2432 LIST_HEAD(invalid_list);
2433
2434 role = vcpu->arch.mmu->mmu_role.base;
2435 role.level = level;
2436 role.direct = direct;
2437 if (role.direct)
2438 role.gpte_is_8_bytes = true;
2439 role.access = access;
2440 if (!vcpu->arch.mmu->direct_map
2441 && vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL) {
2442 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
2443 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
2444 role.quadrant = quadrant;
2445 }
2446 for_each_valid_sp(vcpu->kvm, sp, gfn) {
2447 if (sp->gfn != gfn) {
2448 collisions++;
2449 continue;
2450 }
2451
2452 if (!need_sync && sp->unsync)
2453 need_sync = true;
2454
2455 if (sp->role.word != role.word)
2456 continue;
2457
2458 if (sp->unsync) {
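 /*
  * The page is good, but __kvm_sync_page might still end
  * up zapping it.  If so, break in order to rebuild it.
  */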
2462 if (!__kvm_sync_page(vcpu, sp, &invalid_list))
2463 break;
2464
2465 WARN_ON(!list_empty(&invalid_list));
2466 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
2467 }
2468
2469 if (sp->unsync_children)
2470 kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
2471
2472 __clear_sp_write_flooding_count(sp);
2473 trace_kvm_mmu_get_page(sp, false);
2474 goto out;
2475 }
2476
2477 ++vcpu->kvm->stat.mmu_cache_miss;
2478
2479 sp = kvm_mmu_alloc_page(vcpu, direct);
2480
2481 sp->gfn = gfn;
2482 sp->role = role;
2483 hlist_add_head(&sp->hash_link,
2484 &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
2485 if (!direct) {
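 /*
  * Account the shadowed gfn and write-protect it before syncing
  * any unsync pages for it; otherwise the content of a synced
  * shadow page could become inconsistent with the guest page
  * table it shadows.
  */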
2491 account_shadowed(vcpu->kvm, sp);
2492 if (level == PT_PAGE_TABLE_LEVEL &&
2493 rmap_write_protect(vcpu, gfn))
2494 kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1);
2495
2496 if (level > PT_PAGE_TABLE_LEVEL && need_sync)
2497 flush |= kvm_sync_pages(vcpu, gfn, &invalid_list);
2498 }
2499 clear_page(sp->spt);
2500 trace_kvm_mmu_get_page(sp, true);
2501
2502 kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
2503out:
2504 if (collisions > vcpu->kvm->stat.max_mmu_page_hash_collisions)
2505 vcpu->kvm->stat.max_mmu_page_hash_collisions = collisions;
2506 return sp;
2507}
2508
2509static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator,
2510 struct kvm_vcpu *vcpu, hpa_t root,
2511 u64 addr)
2512{
2513 iterator->addr = addr;
2514 iterator->shadow_addr = root;
2515 iterator->level = vcpu->arch.mmu->shadow_root_level;
2516
2517 if (iterator->level == PT64_ROOT_4LEVEL &&
2518 vcpu->arch.mmu->root_level < PT64_ROOT_4LEVEL &&
2519 !vcpu->arch.mmu->direct_map)
2520 --iterator->level;
2521
2522 if (iterator->level == PT32E_ROOT_LEVEL) {
2523
2524
2525
2526
2527 BUG_ON(root != vcpu->arch.mmu->root_hpa);
2528
2529 iterator->shadow_addr
2530 = vcpu->arch.mmu->pae_root[(addr >> 30) & 3];
2531 iterator->shadow_addr &= PT64_BASE_ADDR_MASK;
2532 --iterator->level;
2533 if (!iterator->shadow_addr)
2534 iterator->level = 0;
2535 }
2536}
2537
2538static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
2539 struct kvm_vcpu *vcpu, u64 addr)
2540{
2541 shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu->root_hpa,
2542 addr);
2543}
2544
2545static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
2546{
2547 if (iterator->level < PT_PAGE_TABLE_LEVEL)
2548 return false;
2549
2550 iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
2551 iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
2552 return true;
2553}
2554
2555static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
2556 u64 spte)
2557{
2558 if (is_last_spte(spte, iterator->level)) {
2559 iterator->level = 0;
2560 return;
2561 }
2562
2563 iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK;
2564 --iterator->level;
2565}
2566
2567static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
2568{
2569 __shadow_walk_next(iterator, *iterator->sptep);
2570}
2571
2572static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
2573 struct kvm_mmu_page *sp)
2574{
2575 u64 spte;
2576
2577 BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
2578
2579 spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK |
2580 shadow_user_mask | shadow_x_mask | shadow_me_mask;
2581
2582 if (sp_ad_disabled(sp))
2583 spte |= shadow_acc_track_value;
2584 else
2585 spte |= shadow_accessed_mask;
2586
2587 mmu_spte_set(sptep, spte);
2588
2589 mmu_page_add_parent_pte(vcpu, sp, sptep);
2590
2591 if (sp->unsync_children || sp->unsync)
2592 mark_unsync(sptep);
2593}
2594
2595static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2596 unsigned direct_access)
2597{
2598 if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
2599 struct kvm_mmu_page *child;
2600
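 /*
  * For a direct sp, the guest PTE's dirty bit going from clean
  * to dirty changes the access the sp must provide (a read-only
  * sp must not suddenly allow writes).  If the cached access no
  * longer matches, unlink the child so a new sp with the correct
  * access gets created.
  */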
2608 child = page_header(*sptep & PT64_BASE_ADDR_MASK);
2609 if (child->role.access == direct_access)
2610 return;
2611
2612 drop_parent_pte(child, sptep);
2613 kvm_flush_remote_tlbs_with_address(vcpu->kvm, child->gfn, 1);
2614 }
2615}
2616
2617static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
2618 u64 *spte)
2619{
2620 u64 pte;
2621 struct kvm_mmu_page *child;
2622
2623 pte = *spte;
2624 if (is_shadow_present_pte(pte)) {
2625 if (is_last_spte(pte, sp->role.level)) {
2626 drop_spte(kvm, spte);
2627 if (is_large_pte(pte))
2628 --kvm->stat.lpages;
2629 } else {
2630 child = page_header(pte & PT64_BASE_ADDR_MASK);
2631 drop_parent_pte(child, spte);
2632 }
2633 return true;
2634 }
2635
2636 if (is_mmio_spte(pte))
2637 mmu_spte_clear_no_track(spte);
2638
2639 return false;
2640}
2641
2642static void kvm_mmu_page_unlink_children(struct kvm *kvm,
2643 struct kvm_mmu_page *sp)
2644{
2645 unsigned i;
2646
2647 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
2648 mmu_page_zap_pte(kvm, sp, sp->spt + i);
2649}
2650
2651static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
2652{
2653 u64 *sptep;
2654 struct rmap_iterator iter;
2655
2656 while ((sptep = rmap_get_first(&sp->parent_ptes, &iter)))
2657 drop_parent_pte(sp, sptep);
2658}
2659
2660static int mmu_zap_unsync_children(struct kvm *kvm,
2661 struct kvm_mmu_page *parent,
2662 struct list_head *invalid_list)
2663{
2664 int i, zapped = 0;
2665 struct mmu_page_path parents;
2666 struct kvm_mmu_pages pages;
2667
2668 if (parent->role.level == PT_PAGE_TABLE_LEVEL)
2669 return 0;
2670
2671 while (mmu_unsync_walk(parent, &pages)) {
2672 struct kvm_mmu_page *sp;
2673
2674 for_each_sp(pages, sp, parents, i) {
2675 kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
2676 mmu_pages_clear_parents(&parents);
2677 zapped++;
2678 }
2679 }
2680
2681 return zapped;
2682}
2683
2684static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
2685 struct kvm_mmu_page *sp,
2686 struct list_head *invalid_list,
2687 int *nr_zapped)
2688{
2689 bool list_unstable;
2690
2691 trace_kvm_mmu_prepare_zap_page(sp);
2692 ++kvm->stat.mmu_shadow_zapped;
2693 *nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list);
2694 kvm_mmu_page_unlink_children(kvm, sp);
2695 kvm_mmu_unlink_parents(kvm, sp);
2696
2697
2698 list_unstable = *nr_zapped;
2699
2700 if (!sp->role.invalid && !sp->role.direct)
2701 unaccount_shadowed(kvm, sp);
2702
2703 if (sp->unsync)
2704 kvm_unlink_unsync_page(kvm, sp);
2705 if (!sp->root_count) {
2706
2707 (*nr_zapped)++;
2708 list_move(&sp->link, invalid_list);
2709 kvm_mod_used_mmu_pages(kvm, -1);
2710 } else {
2711 list_move(&sp->link, &kvm->arch.active_mmu_pages);
2712
2713 if (!sp->role.invalid)
2714 kvm_reload_remote_mmus(kvm);
2715 }
2716
2717 sp->role.invalid = 1;
2718 return list_unstable;
2719}
2720
2721static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
2722 struct list_head *invalid_list)
2723{
2724 int nr_zapped;
2725
2726 __kvm_mmu_prepare_zap_page(kvm, sp, invalid_list, &nr_zapped);
2727 return nr_zapped;
2728}
2729
2730static void kvm_mmu_commit_zap_page(struct kvm *kvm,
2731 struct list_head *invalid_list)
2732{
2733 struct kvm_mmu_page *sp, *nsp;
2734
2735 if (list_empty(invalid_list))
2736 return;
2737
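 /*
  * We need to make sure everyone sees our modifications to the
  * page tables and sees changes to vcpu->mode here.  The barrier
  * in kvm_flush_remote_tlbs() provides this and also waits for
  * all vCPUs to exit guest mode and/or lockless shadow page
  * table walks, so the pages below can be freed safely.
  */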
2747 kvm_flush_remote_tlbs(kvm);
2748
2749 list_for_each_entry_safe(sp, nsp, invalid_list, link) {
2750 WARN_ON(!sp->role.invalid || sp->root_count);
2751 kvm_mmu_free_page(sp);
2752 }
2753}
2754
2755static bool prepare_zap_oldest_mmu_page(struct kvm *kvm,
2756 struct list_head *invalid_list)
2757{
2758 struct kvm_mmu_page *sp;
2759
2760 if (list_empty(&kvm->arch.active_mmu_pages))
2761 return false;
2762
2763 sp = list_last_entry(&kvm->arch.active_mmu_pages,
2764 struct kvm_mmu_page, link);
2765 return kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
2766}
2767
2768
2769
2770
2771
2772void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages)
2773{
2774 LIST_HEAD(invalid_list);
2775
2776 spin_lock(&kvm->mmu_lock);
2777
2778 if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
2779
2780 while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages)
2781 if (!prepare_zap_oldest_mmu_page(kvm, &invalid_list))
2782 break;
2783
2784 kvm_mmu_commit_zap_page(kvm, &invalid_list);
2785 goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
2786 }
2787
2788 kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
2789
2790 spin_unlock(&kvm->mmu_lock);
2791}
2792
2793int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
2794{
2795 struct kvm_mmu_page *sp;
2796 LIST_HEAD(invalid_list);
2797 int r;
2798
2799 pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
2800 r = 0;
2801 spin_lock(&kvm->mmu_lock);
2802 for_each_gfn_indirect_valid_sp(kvm, sp, gfn) {
2803 pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
2804 sp->role.word);
2805 r = 1;
2806 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
2807 }
2808 kvm_mmu_commit_zap_page(kvm, &invalid_list);
2809 spin_unlock(&kvm->mmu_lock);
2810
2811 return r;
2812}
2813EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
2814
2815static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
2816{
2817 trace_kvm_mmu_unsync_page(sp);
2818 ++vcpu->kvm->stat.mmu_unsync;
2819 sp->unsync = 1;
2820
2821 kvm_mmu_mark_parents_unsync(sp);
2822}
2823
2824static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
2825 bool can_unsync)
2826{
2827 struct kvm_mmu_page *sp;
2828
2829 if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
2830 return true;
2831
2832 for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
2833 if (!can_unsync)
2834 return true;
2835
2836 if (sp->unsync)
2837 continue;
2838
2839 WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL);
2840 kvm_unsync_page(vcpu, sp);
2841 }
2842
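 /*
  * Make sure the marking of unsync pages above is visible before
  * any SPTE for this gfn is made writable.  kvm_mmu_sync_roots()
  * reads sp->unsync and sp->unsync_children without holding
  * mmu_lock (using smp_load_acquire()), so without this barrier
  * it could miss the unsync flag and skip a needed sync after
  * the guest flushes its TLB.  This smp_wmb() pairs with those
  * acquire loads.
  */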
2880 smp_wmb();
2881
2882 return false;
2883}
2884
2885static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
2886{
2887 if (pfn_valid(pfn))
2888 return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)) &&
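 /*
  * Some reserved pages, such as those from NVDIMM DAX devices,
  * are not for MMIO and can be mapped with cached memory type
  * for better performance.  Relying on PageReserved() alone
  * would misconceive them as MMIO and map them UC, hurting
  * performance, so also check the host memory type and only
  * treat UC/UC-/WC pages as MMIO.
  */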
2899 (!pat_enabled() || pat_pfn_immune_to_uc_mtrr(pfn));
2900
2901 return !e820__mapped_raw_any(pfn_to_hpa(pfn),
2902 pfn_to_hpa(pfn + 1) - 1,
2903 E820_TYPE_RAM);
2904}
2905
2906
2907#define SET_SPTE_WRITE_PROTECTED_PT BIT(0)
2908#define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1)
2909
2910static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2911 unsigned pte_access, int level,
2912 gfn_t gfn, kvm_pfn_t pfn, bool speculative,
2913 bool can_unsync, bool host_writable)
2914{
2915 u64 spte = 0;
2916 int ret = 0;
2917 struct kvm_mmu_page *sp;
2918
2919 if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access))
2920 return 0;
2921
2922 sp = page_header(__pa(sptep));
2923 if (sp_ad_disabled(sp))
2924 spte |= shadow_acc_track_value;
2925
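 /*
  * For the EPT case, shadow_present_mask is 0 if hardware
  * supports exec-only page table entries.  In that case,
  * ACC_USER_MASK and shadow_user_mask are used to represent
  * read access.  See FNAME(gpte_access) in paging_tmpl.h.
  */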
2932 spte |= shadow_present_mask;
2933 if (!speculative)
2934 spte |= spte_shadow_accessed_mask(spte);
2935
2936 if (pte_access & ACC_EXEC_MASK)
2937 spte |= shadow_x_mask;
2938 else
2939 spte |= shadow_nx_mask;
2940
2941 if (pte_access & ACC_USER_MASK)
2942 spte |= shadow_user_mask;
2943
2944 if (level > PT_PAGE_TABLE_LEVEL)
2945 spte |= PT_PAGE_SIZE_MASK;
2946 if (tdp_enabled)
2947 spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
2948 kvm_is_mmio_pfn(pfn));
2949
2950 if (host_writable)
2951 spte |= SPTE_HOST_WRITEABLE;
2952 else
2953 pte_access &= ~ACC_WRITE_MASK;
2954
2955 if (!kvm_is_mmio_pfn(pfn))
2956 spte |= shadow_me_mask;
2957
2958 spte |= (u64)pfn << PAGE_SHIFT;
2959
2960 if (pte_access & ACC_WRITE_MASK) {
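 /*
  * Another vCPU may have created a new shadow page for this gfn
  * in the window between mapping_level() and acquiring mmu_lock,
  * making a large mapping disallowed now.  Simply bail out; the
  * mapping can be fixed when the guest refaults.
  */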
2968 if (level > PT_PAGE_TABLE_LEVEL &&
2969 mmu_gfn_lpage_is_disallowed(vcpu, gfn, level))
2970 goto done;
2971
2972 spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
2973
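 /*
  * Optimization: for pte sync, if the spte was already writable
  * the hash lookup in mmu_need_write_protect() is unnecessary
  * (and expensive); write protection is the responsibility of
  * kvm_mmu_get_page() / kvm_sync_page().  The same reasoning
  * applies to dirty page accounting.
  */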
2980 if (!can_unsync && is_writable_pte(*sptep))
2981 goto set_pte;
2982
2983 if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
2984 pgprintk("%s: found shadow page for %llx, marking ro\n",
2985 __func__, gfn);
2986 ret |= SET_SPTE_WRITE_PROTECTED_PT;
2987 pte_access &= ~ACC_WRITE_MASK;
2988 spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
2989 }
2990 }
2991
2992 if (pte_access & ACC_WRITE_MASK) {
2993 kvm_vcpu_mark_page_dirty(vcpu, gfn);
2994 spte |= spte_shadow_dirty_mask(spte);
2995 }
2996
2997 if (speculative)
2998 spte = mark_spte_for_access_track(spte);
2999
3000set_pte:
3001 if (mmu_spte_update(sptep, spte))
3002 ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH;
3003done:
3004 return ret;
3005}
3006
3007static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
3008 int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn,
3009 bool speculative, bool host_writable)
3010{
3011 int was_rmapped = 0;
3012 int rmap_count;
3013 int set_spte_ret;
3014 int ret = RET_PF_RETRY;
3015 bool flush = false;
3016
3017 pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
3018 *sptep, write_fault, gfn);
3019
3020 if (is_shadow_present_pte(*sptep)) {
3021
3022
3023
3024
3025 if (level > PT_PAGE_TABLE_LEVEL &&
3026 !is_large_pte(*sptep)) {
3027 struct kvm_mmu_page *child;
3028 u64 pte = *sptep;
3029
3030 child = page_header(pte & PT64_BASE_ADDR_MASK);
3031 drop_parent_pte(child, sptep);
3032 flush = true;
3033 } else if (pfn != spte_to_pfn(*sptep)) {
3034 pgprintk("hfn old %llx new %llx\n",
3035 spte_to_pfn(*sptep), pfn);
3036 drop_spte(vcpu->kvm, sptep);
3037 flush = true;
3038 } else
3039 was_rmapped = 1;
3040 }
3041
3042 set_spte_ret = set_spte(vcpu, sptep, pte_access, level, gfn, pfn,
3043 speculative, true, host_writable);
3044 if (set_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
3045 if (write_fault)
3046 ret = RET_PF_EMULATE;
3047 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
3048 }
3049
3050 if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH || flush)
3051 kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn,
3052 KVM_PAGES_PER_HPAGE(level));
3053
3054 if (unlikely(is_mmio_spte(*sptep)))
3055 ret = RET_PF_EMULATE;
3056
3057 pgprintk("%s: setting spte %llx\n", __func__, *sptep);
3058 pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
3059 is_large_pte(*sptep)? "2MB" : "4kB",
3060 *sptep & PT_WRITABLE_MASK ? "RW" : "R", gfn,
3061 *sptep, sptep);
3062 if (!was_rmapped && is_large_pte(*sptep))
3063 ++vcpu->kvm->stat.lpages;
3064
3065 if (is_shadow_present_pte(*sptep)) {
3066 if (!was_rmapped) {
3067 rmap_count = rmap_add(vcpu, sptep, gfn);
3068 if (rmap_count > RMAP_RECYCLE_THRESHOLD)
3069 rmap_recycle(vcpu, sptep, gfn);
3070 }
3071 }
3072
3073 kvm_release_pfn_clean(pfn);
3074
3075 return ret;
3076}
3077
3078static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
3079 bool no_dirty_log)
3080{
3081 struct kvm_memory_slot *slot;
3082
3083 slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
3084 if (!slot)
3085 return KVM_PFN_ERR_FAULT;
3086
3087 return gfn_to_pfn_memslot_atomic(slot, gfn);
3088}
3089
3090static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
3091 struct kvm_mmu_page *sp,
3092 u64 *start, u64 *end)
3093{
3094 struct page *pages[PTE_PREFETCH_NUM];
3095 struct kvm_memory_slot *slot;
3096 unsigned access = sp->role.access;
3097 int i, ret;
3098 gfn_t gfn;
3099
3100 gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
3101 slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK);
3102 if (!slot)
3103 return -1;
3104
3105 ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start);
3106 if (ret <= 0)
3107 return -1;
3108
3109 for (i = 0; i < ret; i++, gfn++, start++)
3110 mmu_set_spte(vcpu, start, access, 0, sp->role.level, gfn,
3111 page_to_pfn(pages[i]), true, true);
3112
3113 return 0;
3114}
3115
3116static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
3117 struct kvm_mmu_page *sp, u64 *sptep)
3118{
3119 u64 *spte, *start = NULL;
3120 int i;
3121
3122 WARN_ON(!sp->role.direct);
3123
3124 i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
3125 spte = sp->spt + i;
3126
3127 for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
3128 if (is_shadow_present_pte(*spte) || spte == sptep) {
3129 if (!start)
3130 continue;
3131 if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
3132 break;
3133 start = NULL;
3134 } else if (!start)
3135 start = spte;
3136 }
3137}
3138
3139static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
3140{
3141 struct kvm_mmu_page *sp;
3142
3143 sp = page_header(__pa(sptep));
3144
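 /*
  * Without accessed bits there is no way to distinguish between
  * actually accessed translations and prefetched ones, so
  * disable pte prefetch if accessed bits are not available.
  */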
3150 if (sp_ad_disabled(sp))
3151 return;
3152
3153 if (sp->role.level > PT_PAGE_TABLE_LEVEL)
3154 return;
3155
3156 __direct_pte_prefetch(vcpu, sp, sptep);
3157}
3158
3159static int __direct_map(struct kvm_vcpu *vcpu, int write, int map_writable,
3160 int level, gfn_t gfn, kvm_pfn_t pfn, bool prefault)
3161{
3162 struct kvm_shadow_walk_iterator iterator;
3163 struct kvm_mmu_page *sp;
3164 int emulate = 0;
3165 gfn_t pseudo_gfn;
3166
3167 if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
3168 return 0;
3169
3170 for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
3171 if (iterator.level == level) {
3172 emulate = mmu_set_spte(vcpu, iterator.sptep, ACC_ALL,
3173 write, level, gfn, pfn, prefault,
3174 map_writable);
3175 direct_pte_prefetch(vcpu, iterator.sptep);
3176 ++vcpu->stat.pf_fixed;
3177 break;
3178 }
3179
3180 drop_large_spte(vcpu, iterator.sptep);
3181 if (!is_shadow_present_pte(*iterator.sptep)) {
3182 u64 base_addr = iterator.addr;
3183
3184 base_addr &= PT64_LVL_ADDR_MASK(iterator.level);
3185 pseudo_gfn = base_addr >> PAGE_SHIFT;
3186 sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,
3187 iterator.level - 1, 1, ACC_ALL);
3188
3189 link_shadow_page(vcpu, iterator.sptep, sp);
3190 }
3191 }
3192 return emulate;
3193}
3194
3195static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
3196{
3197 send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, PAGE_SHIFT, tsk);
3198}
3199
3200static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
3201{
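 /*
  * Do not cache the mmio info caused by writing the readonly gfn
  * into the spte, otherwise a read access on a readonly gfn would
  * also trigger an mmio page fault and be treated as mmio access.
  */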
3207 if (pfn == KVM_PFN_ERR_RO_FAULT)
3208 return RET_PF_EMULATE;
3209
3210 if (pfn == KVM_PFN_ERR_HWPOISON) {
3211 kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current);
3212 return RET_PF_RETRY;
3213 }
3214
3215 return -EFAULT;
3216}
3217
3218static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
3219 gfn_t *gfnp, kvm_pfn_t *pfnp,
3220 int *levelp)
3221{
3222 kvm_pfn_t pfn = *pfnp;
3223 gfn_t gfn = *gfnp;
3224 int level = *levelp;
3225
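 /*
  * Check if the pfn is backed by a transparent hugepage.  A
  * hugetlbfs page would already have level > PT_PAGE_TABLE_LEVEL,
  * so no adjustment would be needed here.
  */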
3232 if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
3233 level == PT_PAGE_TABLE_LEVEL &&
3234 PageTransCompoundMap(pfn_to_page(pfn)) &&
3235 !mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) {
3236 unsigned long mask;
3237
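 /*
  * mmu_notifier_retry() succeeded and mmu_lock is held, so the
  * compound page cannot be split underneath us.  It is safe to
  * align the pfn down to the 2MB boundary and move the reference
  * from the tail page to the head page.
  */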
3246 *levelp = level = PT_DIRECTORY_LEVEL;
3247 mask = KVM_PAGES_PER_HPAGE(level) - 1;
3248 VM_BUG_ON((gfn & mask) != (pfn & mask));
3249 if (pfn & mask) {
3250 gfn &= ~mask;
3251 *gfnp = gfn;
3252 kvm_release_pfn_clean(pfn);
3253 pfn &= ~mask;
3254 kvm_get_pfn(pfn);
3255 *pfnp = pfn;
3256 }
3257 }
3258}
3259
3260static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
3261 kvm_pfn_t pfn, unsigned access, int *ret_val)
3262{
3263
3264 if (unlikely(is_error_pfn(pfn))) {
3265 *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn);
3266 return true;
3267 }
3268
3269 if (unlikely(is_noslot_pfn(pfn)))
3270 vcpu_cache_mmio_info(vcpu, gva, gfn, access);
3271
3272 return false;
3273}
3274
3275static bool page_fault_can_be_fast(u32 error_code)
3276{
3277
3278
3279
3280
3281 if (unlikely(error_code & PFERR_RSVD_MASK))
3282 return false;
3283
3284
3285 if (unlikely(((error_code & (PFERR_FETCH_MASK | PFERR_PRESENT_MASK))
3286 == (PFERR_FETCH_MASK | PFERR_PRESENT_MASK))))
3287 return false;
3288
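 /*
  * #PF can be fast if:
  * 1. The shadow page table entry is not present, which could
  *    mean that the fault is potentially caused by access
  *    tracking (if enabled).
  * 2. The shadow page table entry is present and the fault is
  *    caused by write-protect; then we only need to set the W
  *    bit of the spte, which can be done out of mmu_lock.
  */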
3303 return shadow_acc_track_mask != 0 ||
3304 ((error_code & (PFERR_WRITE_MASK | PFERR_PRESENT_MASK))
3305 == (PFERR_WRITE_MASK | PFERR_PRESENT_MASK));
3306}
3307
3308
3309
3310
3311
3312static bool
3313fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
3314 u64 *sptep, u64 old_spte, u64 new_spte)
3315{
3316 gfn_t gfn;
3317
3318 WARN_ON(!sp->role.direct);
3319
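 /*
  * The SPTE is updated with cmpxchg64 so the fix is only applied
  * if nobody else modified the SPTE during this lockless walk.
  * The dirty bit is intentionally not set here; if the write
  * really happens the GPA is simply logged again (e.g. via PML),
  * which is harmless and avoids an extra TLB flush.
  */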
3332 if (cmpxchg64(sptep, old_spte, new_spte) != old_spte)
3333 return false;
3334
3335 if (is_writable_pte(new_spte) && !is_writable_pte(old_spte)) {
3336
3337
3338
3339
3340 gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
3341 kvm_vcpu_mark_page_dirty(vcpu, gfn);
3342 }
3343
3344 return true;
3345}
3346
3347static bool is_access_allowed(u32 fault_err_code, u64 spte)
3348{
3349 if (fault_err_code & PFERR_FETCH_MASK)
3350 return is_executable_pte(spte);
3351
3352 if (fault_err_code & PFERR_WRITE_MASK)
3353 return is_writable_pte(spte);
3354
3355
3356 return spte & PT_PRESENT_MASK;
3357}
3358
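/*
 * Return value:
 * - true: let the vcpu access the same address again.
 * - false: let the real page fault path fix it.
 */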
3364static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
3365 u32 error_code)
3366{
3367 struct kvm_shadow_walk_iterator iterator;
3368 struct kvm_mmu_page *sp;
3369 bool fault_handled = false;
3370 u64 spte = 0ull;
3371 uint retry_count = 0;
3372
3373 if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
3374 return false;
3375
3376 if (!page_fault_can_be_fast(error_code))
3377 return false;
3378
3379 walk_shadow_page_lockless_begin(vcpu);
3380
3381 do {
3382 u64 new_spte;
3383
3384 for_each_shadow_entry_lockless(vcpu, gva, iterator, spte)
3385 if (!is_shadow_present_pte(spte) ||
3386 iterator.level < level)
3387 break;
3388
3389 sp = page_header(__pa(iterator.sptep));
3390 if (!is_last_spte(spte, sp->role.level))
3391 break;
3392
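 /*
  * Check whether the memory access that caused the fault would
  * still cause it if it were to be performed right now.  If not,
  * this is a spurious fault caused by a lazily flushed TLB, or
  * another CPU already fixed the PTE after the current CPU took
  * the fault.  Upper level entries need not be checked since
  * they are always ACC_ALL.
  */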
3403 if (is_access_allowed(error_code, spte)) {
3404 fault_handled = true;
3405 break;
3406 }
3407
3408 new_spte = spte;
3409
3410 if (is_access_track_spte(spte))
3411 new_spte = restore_acc_track_spte(new_spte);
3412
3413
3414
3415
3416
3417
3418 if ((error_code & PFERR_WRITE_MASK) &&
3419 spte_can_locklessly_be_made_writable(spte))
3420 {
3421 new_spte |= PT_WRITABLE_MASK;
3422
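 /*
  * Do not fix write-permission on a large spte.  Since only the
  * first page is marked dirty in fast_pf_fix_direct_spte(), the
  * other pages would be missed if the slot has dirty logging
  * enabled.  Instead, let the slow page fault path create a
  * normal spte to fix the access.
  */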
3434 if (sp->role.level > PT_PAGE_TABLE_LEVEL)
3435 break;
3436 }
3437
3438
3439 if (new_spte == spte ||
3440 !is_access_allowed(error_code, new_spte))
3441 break;
3442
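 /*
  * Currently, fast page fault only works for direct mapping
  * since the gfn is not stable for indirect shadow pages.  See
  * Documentation/virtual/kvm/locking.txt for details.
  */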
3448 fault_handled = fast_pf_fix_direct_spte(vcpu, sp,
3449 iterator.sptep, spte,
3450 new_spte);
3451 if (fault_handled)
3452 break;
3453
3454 if (++retry_count > 4) {
3455 printk_once(KERN_WARNING
3456 "kvm: Fast #PF retrying more than 4 times.\n");
3457 break;
3458 }
3459
3460 } while (true);
3461
3462 trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep,
3463 spte, fault_handled);
3464 walk_shadow_page_lockless_end(vcpu);
3465
3466 return fault_handled;
3467}
3468
3469static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
3470 gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable);
3471static int make_mmu_pages_available(struct kvm_vcpu *vcpu);
3472
3473static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
3474 gfn_t gfn, bool prefault)
3475{
3476 int r;
3477 int level;
3478 bool force_pt_level = false;
3479 kvm_pfn_t pfn;
3480 unsigned long mmu_seq;
3481 bool map_writable, write = error_code & PFERR_WRITE_MASK;
3482
3483 level = mapping_level(vcpu, gfn, &force_pt_level);
3484 if (likely(!force_pt_level)) {
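 /*
  * This path builds a PAE pagetable, so we can map 2MB pages at
  * maximum.  Therefore check if the level is larger than that.
  */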
3490 if (level > PT_DIRECTORY_LEVEL)
3491 level = PT_DIRECTORY_LEVEL;
3492
3493 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
3494 }
3495
3496 if (fast_page_fault(vcpu, v, level, error_code))
3497 return RET_PF_RETRY;
3498
3499 mmu_seq = vcpu->kvm->mmu_notifier_seq;
3500 smp_rmb();
3501
3502 if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
3503 return RET_PF_RETRY;
3504
3505 if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r))
3506 return r;
3507
3508 spin_lock(&vcpu->kvm->mmu_lock);
3509 if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
3510 goto out_unlock;
3511 if (make_mmu_pages_available(vcpu) < 0)
3512 goto out_unlock;
3513 if (likely(!force_pt_level))
3514 transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
3515 r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault);
3516 spin_unlock(&vcpu->kvm->mmu_lock);
3517
3518 return r;
3519
3520out_unlock:
3521 spin_unlock(&vcpu->kvm->mmu_lock);
3522 kvm_release_pfn_clean(pfn);
3523 return RET_PF_RETRY;
3524}
3525
3526static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
3527 struct list_head *invalid_list)
3528{
3529 struct kvm_mmu_page *sp;
3530
3531 if (!VALID_PAGE(*root_hpa))
3532 return;
3533
3534 sp = page_header(*root_hpa & PT64_BASE_ADDR_MASK);
3535 --sp->root_count;
3536 if (!sp->root_count && sp->role.invalid)
3537 kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
3538
3539 *root_hpa = INVALID_PAGE;
3540}
3541
3542
3543void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
3544 ulong roots_to_free)
3545{
3546 int i;
3547 LIST_HEAD(invalid_list);
3548 bool free_active_root = roots_to_free & KVM_MMU_ROOT_CURRENT;
3549
3550 BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG);
3551
3552
3553 if (!(free_active_root && VALID_PAGE(mmu->root_hpa))) {
3554 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
3555 if ((roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) &&
3556 VALID_PAGE(mmu->prev_roots[i].hpa))
3557 break;
3558
3559 if (i == KVM_MMU_NUM_PREV_ROOTS)
3560 return;
3561 }
3562
3563 spin_lock(&vcpu->kvm->mmu_lock);
3564
3565 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
3566 if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i))
3567 mmu_free_root_page(vcpu->kvm, &mmu->prev_roots[i].hpa,
3568 &invalid_list);
3569
3570 if (free_active_root) {
3571 if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
3572 (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) {
3573 mmu_free_root_page(vcpu->kvm, &mmu->root_hpa,
3574 &invalid_list);
3575 } else {
3576 for (i = 0; i < 4; ++i)
3577 if (mmu->pae_root[i] != 0)
3578 mmu_free_root_page(vcpu->kvm,
3579 &mmu->pae_root[i],
3580 &invalid_list);
3581 mmu->root_hpa = INVALID_PAGE;
3582 }
3583 mmu->root_cr3 = 0;
3584 }
3585
3586 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
3587 spin_unlock(&vcpu->kvm->mmu_lock);
3588}
3589EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);
3590
3591static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
3592{
3593 int ret = 0;
3594
3595 if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
3596 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
3597 ret = 1;
3598 }
3599
3600 return ret;
3601}
3602
3603static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
3604{
3605 struct kvm_mmu_page *sp;
3606 unsigned i;
3607
3608 if (vcpu->arch.mmu->shadow_root_level >= PT64_ROOT_4LEVEL) {
3609 spin_lock(&vcpu->kvm->mmu_lock);
3610 if (make_mmu_pages_available(vcpu) < 0) {
3611 spin_unlock(&vcpu->kvm->mmu_lock);
3612 return -ENOSPC;
3613 }
3614 sp = kvm_mmu_get_page(vcpu, 0, 0,
3615 vcpu->arch.mmu->shadow_root_level, 1, ACC_ALL);
3616 ++sp->root_count;
3617 spin_unlock(&vcpu->kvm->mmu_lock);
3618 vcpu->arch.mmu->root_hpa = __pa(sp->spt);
3619 } else if (vcpu->arch.mmu->shadow_root_level == PT32E_ROOT_LEVEL) {
3620 for (i = 0; i < 4; ++i) {
3621 hpa_t root = vcpu->arch.mmu->pae_root[i];
3622
3623 MMU_WARN_ON(VALID_PAGE(root));
3624 spin_lock(&vcpu->kvm->mmu_lock);
3625 if (make_mmu_pages_available(vcpu) < 0) {
3626 spin_unlock(&vcpu->kvm->mmu_lock);
3627 return -ENOSPC;
3628 }
3629 sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
3630 i << 30, PT32_ROOT_LEVEL, 1, ACC_ALL);
3631 root = __pa(sp->spt);
3632 ++sp->root_count;
3633 spin_unlock(&vcpu->kvm->mmu_lock);
3634 vcpu->arch.mmu->pae_root[i] = root | PT_PRESENT_MASK;
3635 }
3636 vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
3637 } else
3638 BUG();
3639 vcpu->arch.mmu->root_cr3 = vcpu->arch.mmu->get_cr3(vcpu);
3640
3641 return 0;
3642}
3643
3644static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
3645{
3646 struct kvm_mmu_page *sp;
3647 u64 pdptr, pm_mask;
3648 gfn_t root_gfn, root_cr3;
3649 int i;
3650
3651 root_cr3 = vcpu->arch.mmu->get_cr3(vcpu);
3652 root_gfn = root_cr3 >> PAGE_SHIFT;
3653
3654 if (mmu_check_root(vcpu, root_gfn))
3655 return 1;
3656
3657
3658
3659
3660
3661 if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
3662 hpa_t root = vcpu->arch.mmu->root_hpa;
3663
3664 MMU_WARN_ON(VALID_PAGE(root));
3665
3666 spin_lock(&vcpu->kvm->mmu_lock);
3667 if (make_mmu_pages_available(vcpu) < 0) {
3668 spin_unlock(&vcpu->kvm->mmu_lock);
3669 return -ENOSPC;
3670 }
3671 sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
3672 vcpu->arch.mmu->shadow_root_level, 0, ACC_ALL);
3673 root = __pa(sp->spt);
3674 ++sp->root_count;
3675 spin_unlock(&vcpu->kvm->mmu_lock);
3676 vcpu->arch.mmu->root_hpa = root;
3677 goto set_root_cr3;
3678 }
3679
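 /*
  * We shadow a 32 bit page table.  This may be a legacy 2-level
  * or a PAE 3-level page table.  In either case we need to be
  * aware that the shadow page table may be a PAE or a long mode
  * page table, hence the extra bits added to pm_mask below for
  * 4-level shadow roots.
  */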
3685 pm_mask = PT_PRESENT_MASK;
3686 if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL)
3687 pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
3688
3689 for (i = 0; i < 4; ++i) {
3690 hpa_t root = vcpu->arch.mmu->pae_root[i];
3691
3692 MMU_WARN_ON(VALID_PAGE(root));
3693 if (vcpu->arch.mmu->root_level == PT32E_ROOT_LEVEL) {
3694 pdptr = vcpu->arch.mmu->get_pdptr(vcpu, i);
3695 if (!(pdptr & PT_PRESENT_MASK)) {
3696 vcpu->arch.mmu->pae_root[i] = 0;
3697 continue;
3698 }
3699 root_gfn = pdptr >> PAGE_SHIFT;
3700 if (mmu_check_root(vcpu, root_gfn))
3701 return 1;
3702 }
3703 spin_lock(&vcpu->kvm->mmu_lock);
3704 if (make_mmu_pages_available(vcpu) < 0) {
3705 spin_unlock(&vcpu->kvm->mmu_lock);
3706 return -ENOSPC;
3707 }
3708 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL,
3709 0, ACC_ALL);
3710 root = __pa(sp->spt);
3711 ++sp->root_count;
3712 spin_unlock(&vcpu->kvm->mmu_lock);
3713
3714 vcpu->arch.mmu->pae_root[i] = root | pm_mask;
3715 }
3716 vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
3717
3718
3719
3720
3721
3722 if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL) {
3723 if (vcpu->arch.mmu->lm_root == NULL) {
3724
3725
3726
3727
3728
3729 u64 *lm_root;
3730
3731 lm_root = (void*)get_zeroed_page(GFP_KERNEL_ACCOUNT);
3732 if (lm_root == NULL)
3733 return 1;
3734
3735 lm_root[0] = __pa(vcpu->arch.mmu->pae_root) | pm_mask;
3736
3737 vcpu->arch.mmu->lm_root = lm_root;
3738 }
3739
3740 vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->lm_root);
3741 }
3742
3743set_root_cr3:
3744 vcpu->arch.mmu->root_cr3 = root_cr3;
3745
3746 return 0;
3747}
3748
3749static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
3750{
3751 if (vcpu->arch.mmu->direct_map)
3752 return mmu_alloc_direct_roots(vcpu);
3753 else
3754 return mmu_alloc_shadow_roots(vcpu);
3755}
3756
3757void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
3758{
3759 int i;
3760 struct kvm_mmu_page *sp;
3761
3762 if (vcpu->arch.mmu->direct_map)
3763 return;
3764
3765 if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
3766 return;
3767
3768 vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
3769
3770 if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
3771 hpa_t root = vcpu->arch.mmu->root_hpa;
3772 sp = page_header(root);
3773
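 /*
  * Even if another CPU is marking the SP as unsync-ed
  * simultaneously, any guest page table changes are not
  * guaranteed to be visible anyway until this VCPU issues a TLB
  * flush strictly after those changes are made.  We only need to
  * ensure that the other CPU sets these flags before any actual
  * changes to the page tables are made.  The comment above the
  * smp_wmb() in mmu_need_write_protect() describes what could go
  * wrong otherwise; the acquire loads below pair with it.
  */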
3784 if (!smp_load_acquire(&sp->unsync) &&
3785 !smp_load_acquire(&sp->unsync_children))
3786 return;
3787
3788 spin_lock(&vcpu->kvm->mmu_lock);
3789 kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
3790
3791 mmu_sync_children(vcpu, sp);
3792
3793 kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
3794 spin_unlock(&vcpu->kvm->mmu_lock);
3795 return;
3796 }
3797
3798 spin_lock(&vcpu->kvm->mmu_lock);
3799 kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
3800
3801 for (i = 0; i < 4; ++i) {
3802 hpa_t root = vcpu->arch.mmu->pae_root[i];
3803
3804 if (root && VALID_PAGE(root)) {
3805 root &= PT64_BASE_ADDR_MASK;
3806 sp = page_header(root);
3807 mmu_sync_children(vcpu, sp);
3808 }
3809 }
3810
3811 kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
3812 spin_unlock(&vcpu->kvm->mmu_lock);
3813}
3814EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots);
3815
3816static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
3817 u32 access, struct x86_exception *exception)
3818{
3819 if (exception)
3820 exception->error_code = 0;
3821 return vaddr;
3822}
3823
3824static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
3825 u32 access,
3826 struct x86_exception *exception)
3827{
3828 if (exception)
3829 exception->error_code = 0;
3830 return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access, exception);
3831}
3832
3833static bool
3834__is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check, u64 pte, int level)
3835{
3836 int bit7 = (pte >> 7) & 1, low6 = pte & 0x3f;
3837
3838 return (pte & rsvd_check->rsvd_bits_mask[bit7][level-1]) |
3839 ((rsvd_check->bad_mt_xwr & (1ull << low6)) != 0);
3840}
3841
3842static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
3843{
3844 return __is_rsvd_bits_set(&mmu->guest_rsvd_check, gpte, level);
3845}
3846
3847static bool is_shadow_zero_bits_set(struct kvm_mmu *mmu, u64 spte, int level)
3848{
3849 return __is_rsvd_bits_set(&mmu->shadow_zero_check, spte, level);
3850}
3851
3852static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
3853{
3854
3855
3856
3857
3858 if (mmu_is_nested(vcpu))
3859 return false;
3860
3861 if (direct)
3862 return vcpu_match_mmio_gpa(vcpu, addr);
3863
3864 return vcpu_match_mmio_gva(vcpu, addr);
3865}
3866
3867
3868static bool
3869walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
3870{
3871 struct kvm_shadow_walk_iterator iterator;
3872 u64 sptes[PT64_ROOT_MAX_LEVEL], spte = 0ull;
3873 int root, leaf;
3874 bool reserved = false;
3875
3876 if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
3877 goto exit;
3878
3879 walk_shadow_page_lockless_begin(vcpu);
3880
3881 for (shadow_walk_init(&iterator, vcpu, addr),
3882 leaf = root = iterator.level;
3883 shadow_walk_okay(&iterator);
3884 __shadow_walk_next(&iterator, spte)) {
3885 spte = mmu_spte_get_lockless(iterator.sptep);
3886
3887 sptes[leaf - 1] = spte;
3888 leaf--;
3889
3890 if (!is_shadow_present_pte(spte))
3891 break;
3892
3893 reserved |= is_shadow_zero_bits_set(vcpu->arch.mmu, spte,
3894 iterator.level);
3895 }
3896
3897 walk_shadow_page_lockless_end(vcpu);
3898
3899 if (reserved) {
3900 pr_err("%s: detect reserved bits on spte, addr 0x%llx, dump hierarchy:\n",
3901 __func__, addr);
3902 while (root > leaf) {
3903 pr_err("------ spte 0x%llx level %d.\n",
3904 sptes[root - 1], root);
3905 root--;
3906 }
3907 }
3908exit:
3909 *sptep = spte;
3910 return reserved;
3911}
3912
3913static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
3914{
3915 u64 spte;
3916 bool reserved;
3917
3918 if (mmio_info_in_cache(vcpu, addr, direct))
3919 return RET_PF_EMULATE;
3920
3921 reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte);
3922 if (WARN_ON(reserved))
3923 return -EINVAL;
3924
3925 if (is_mmio_spte(spte)) {
3926 gfn_t gfn = get_mmio_spte_gfn(spte);
3927 unsigned access = get_mmio_spte_access(spte);
3928
3929 if (!check_mmio_spte(vcpu, spte))
3930 return RET_PF_INVALID;
3931
3932 if (direct)
3933 addr = 0;
3934
3935 trace_handle_mmio_page_fault(addr, gfn, access);
3936 vcpu_cache_mmio_info(vcpu, addr, gfn, access);
3937 return RET_PF_EMULATE;
3938 }
3939
3940
3941
3942
3943
3944 return RET_PF_RETRY;
3945}
3946
3947static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu,
3948 u32 error_code, gfn_t gfn)
3949{
3950 if (unlikely(error_code & PFERR_RSVD_MASK))
3951 return false;
3952
3953 if (!(error_code & PFERR_PRESENT_MASK) ||
3954 !(error_code & PFERR_WRITE_MASK))
3955 return false;
3956
3957
3958
3959
3960
3961 if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
3962 return true;
3963
3964 return false;
3965}
3966
3967static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
3968{
3969 struct kvm_shadow_walk_iterator iterator;
3970 u64 spte;
3971
3972 if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
3973 return;
3974
3975 walk_shadow_page_lockless_begin(vcpu);
3976 for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
3977 clear_sp_write_flooding_count(iterator.sptep);
3978 if (!is_shadow_present_pte(spte))
3979 break;
3980 }
3981 walk_shadow_page_lockless_end(vcpu);
3982}
3983
3984static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
3985 u32 error_code, bool prefault)
3986{
3987 gfn_t gfn = gva >> PAGE_SHIFT;
3988 int r;
3989
3990 pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
3991
3992 if (page_fault_handle_page_track(vcpu, error_code, gfn))
3993 return RET_PF_EMULATE;
3994
3995 r = mmu_topup_memory_caches(vcpu);
3996 if (r)
3997 return r;
3998
3999 MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa));
4000
4001
4002 return nonpaging_map(vcpu, gva & PAGE_MASK,
4003 error_code, gfn, prefault);
4004}
4005
4006static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
4007{
4008 struct kvm_arch_async_pf arch;
4009
4010 arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
4011 arch.gfn = gfn;
4012 arch.direct_map = vcpu->arch.mmu->direct_map;
4013 arch.cr3 = vcpu->arch.mmu->get_cr3(vcpu);
4014
4015 return kvm_setup_async_pf(vcpu, gva, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
4016}
4017
4018bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
4019{
4020 if (unlikely(!lapic_in_kernel(vcpu) ||
4021 kvm_event_needs_reinjection(vcpu) ||
4022 vcpu->arch.exception.pending))
4023 return false;
4024
4025 if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu))
4026 return false;
4027
4028 return kvm_x86_ops->interrupt_allowed(vcpu);
4029}
4030
4031static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
4032 gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable)
4033{
4034 struct kvm_memory_slot *slot;
4035 bool async;
4036
4037
4038
4039
4040 if (is_guest_mode(vcpu) && !kvm_is_visible_gfn(vcpu->kvm, gfn)) {
4041 *pfn = KVM_PFN_NOSLOT;
4042 return false;
4043 }
4044
4045 slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
4046 async = false;
4047 *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable);
4048 if (!async)
4049 return false;
4050
4051 if (!prefault && kvm_can_do_async_pf(vcpu)) {
4052 trace_kvm_try_async_get_page(gva, gfn);
4053 if (kvm_find_async_pf_gfn(vcpu, gfn)) {
4054 trace_kvm_async_pf_doublefault(gva, gfn);
4055 kvm_make_request(KVM_REQ_APF_HALT, vcpu);
4056 return true;
4057 } else if (kvm_arch_setup_async_pf(vcpu, gva, gfn))
4058 return true;
4059 }
4060
4061 *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable);
4062 return false;
4063}
4064
4065int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
4066 u64 fault_address, char *insn, int insn_len)
4067{
4068 int r = 1;
4069
4070 vcpu->arch.l1tf_flush_l1d = true;
4071 switch (vcpu->arch.apf.host_apf_reason) {
4072 default:
4073 trace_kvm_page_fault(fault_address, error_code);
4074
4075 if (kvm_event_needs_reinjection(vcpu))
4076 kvm_mmu_unprotect_page_virt(vcpu, fault_address);
4077 r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn,
4078 insn_len);
4079 break;
4080 case KVM_PV_REASON_PAGE_NOT_PRESENT:
4081 vcpu->arch.apf.host_apf_reason = 0;
4082 local_irq_disable();
4083 kvm_async_pf_task_wait(fault_address, 0);
4084 local_irq_enable();
4085 break;
4086 case KVM_PV_REASON_PAGE_READY:
4087 vcpu->arch.apf.host_apf_reason = 0;
4088 local_irq_disable();
4089 kvm_async_pf_task_wake(fault_address);
4090 local_irq_enable();
4091 break;
4092 }
4093 return r;
4094}
4095EXPORT_SYMBOL_GPL(kvm_handle_page_fault);
4096
4097static bool
4098check_hugepage_cache_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int level)
4099{
4100 int page_num = KVM_PAGES_PER_HPAGE(level);
4101
4102 gfn &= ~(page_num - 1);
4103
4104 return kvm_mtrr_check_gfn_range_consistency(vcpu, gfn, page_num);
4105}
4106
4107static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
4108 bool prefault)
4109{
4110 kvm_pfn_t pfn;
4111 int r;
4112 int level;
4113 bool force_pt_level;
4114 gfn_t gfn = gpa >> PAGE_SHIFT;
4115 unsigned long mmu_seq;
4116 int write = error_code & PFERR_WRITE_MASK;
4117 bool map_writable;
4118
4119 MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa));
4120
4121 if (page_fault_handle_page_track(vcpu, error_code, gfn))
4122 return RET_PF_EMULATE;
4123
4124 r = mmu_topup_memory_caches(vcpu);
4125 if (r)
4126 return r;
4127
4128 force_pt_level = !check_hugepage_cache_consistency(vcpu, gfn,
4129 PT_DIRECTORY_LEVEL);
4130 level = mapping_level(vcpu, gfn, &force_pt_level);
4131 if (likely(!force_pt_level)) {
4132 if (level > PT_DIRECTORY_LEVEL &&
4133 !check_hugepage_cache_consistency(vcpu, gfn, level))
4134 level = PT_DIRECTORY_LEVEL;
4135 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
4136 }
4137
4138 if (fast_page_fault(vcpu, gpa, level, error_code))
4139 return RET_PF_RETRY;
4140
4141 mmu_seq = vcpu->kvm->mmu_notifier_seq;
4142 smp_rmb();
4143
4144 if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
4145 return RET_PF_RETRY;
4146
4147 if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r))
4148 return r;
4149
4150 spin_lock(&vcpu->kvm->mmu_lock);
4151 if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
4152 goto out_unlock;
4153 if (make_mmu_pages_available(vcpu) < 0)
4154 goto out_unlock;
4155 if (likely(!force_pt_level))
4156 transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
4157 r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault);
4158 spin_unlock(&vcpu->kvm->mmu_lock);
4159
4160 return r;
4161
4162out_unlock:
4163 spin_unlock(&vcpu->kvm->mmu_lock);
4164 kvm_release_pfn_clean(pfn);
4165 return RET_PF_RETRY;
4166}
4167
4168static void nonpaging_init_context(struct kvm_vcpu *vcpu,
4169 struct kvm_mmu *context)
4170{
4171 context->page_fault = nonpaging_page_fault;
4172 context->gva_to_gpa = nonpaging_gva_to_gpa;
4173 context->sync_page = nonpaging_sync_page;
4174 context->invlpg = nonpaging_invlpg;
4175 context->update_pte = nonpaging_update_pte;
4176 context->root_level = 0;
4177 context->shadow_root_level = PT32E_ROOT_LEVEL;
4178 context->direct_map = true;
4179 context->nx = false;
4180}
4181
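/*
 * Find out if a previously cached root matching the new CR3/role is available.
 * The current root is also inserted into the cache.
 * If a matching root was found, it is assigned to kvm_mmu->root_hpa and true is
 * returned.
 * Otherwise, the LRU root from the cache is assigned to kvm_mmu->root_hpa and
 * false is returned.  This root should now be freed by the caller.
 */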
4190static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_cr3,
4191 union kvm_mmu_page_role new_role)
4192{
4193 uint i;
4194 struct kvm_mmu_root_info root;
4195 struct kvm_mmu *mmu = vcpu->arch.mmu;
4196
4197 root.cr3 = mmu->root_cr3;
4198 root.hpa = mmu->root_hpa;
4199
4200 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
4201 swap(root, mmu->prev_roots[i]);
4202
4203 if (new_cr3 == root.cr3 && VALID_PAGE(root.hpa) &&
4204 page_header(root.hpa) != NULL &&
4205 new_role.word == page_header(root.hpa)->role.word)
4206 break;
4207 }
4208
4209 mmu->root_hpa = root.hpa;
4210 mmu->root_cr3 = root.cr3;
4211
4212 return i < KVM_MMU_NUM_PREV_ROOTS;
4213}
4214
4215static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3,
4216 union kvm_mmu_page_role new_role,
4217 bool skip_tlb_flush)
4218{
4219 struct kvm_mmu *mmu = vcpu->arch.mmu;
4220
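 /*
  * For now, limit the fast switch to 64-bit hosts+VMs in order to
  * avoid having to deal with PDPTEs.  We may add support for
  * 32-bit hosts/VMs later.
  */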
4226 if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
4227 mmu->root_level >= PT64_ROOT_4LEVEL) {
4228 if (mmu_check_root(vcpu, new_cr3 >> PAGE_SHIFT))
4229 return false;
4230
4231 if (cached_root_available(vcpu, new_cr3, new_role)) {
4232 kvm_make_request(KVM_REQ_LOAD_CR3, vcpu);
4233 if (!skip_tlb_flush) {
4234 kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
4235 kvm_x86_ops->tlb_flush(vcpu, true);
4236 }
4237
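 /*
  * The last MMIO access's GVA and GPA are cached in the VCPU.
  * When switching to a new CR3, that GVA->GPA mapping may no
  * longer be valid, so clear any cached MMIO info even when the
  * shadow page tables do not need to be synced.
  */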
4245 vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
4246
4247 __clear_sp_write_flooding_count(
4248 page_header(mmu->root_hpa));
4249
4250 return true;
4251 }
4252 }
4253
4254 return false;
4255}
4256
4257static void __kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3,
4258 union kvm_mmu_page_role new_role,
4259 bool skip_tlb_flush)
4260{
4261 if (!fast_cr3_switch(vcpu, new_cr3, new_role, skip_tlb_flush))
4262 kvm_mmu_free_roots(vcpu, vcpu->arch.mmu,
4263 KVM_MMU_ROOT_CURRENT);
4264}
4265
4266void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush)
4267{
4268 __kvm_mmu_new_cr3(vcpu, new_cr3, kvm_mmu_calc_root_page_role(vcpu),
4269 skip_tlb_flush);
4270}
4271EXPORT_SYMBOL_GPL(kvm_mmu_new_cr3);
4272
4273static unsigned long get_cr3(struct kvm_vcpu *vcpu)
4274{
4275 return kvm_read_cr3(vcpu);
4276}
4277
4278static void inject_page_fault(struct kvm_vcpu *vcpu,
4279 struct x86_exception *fault)
4280{
4281 vcpu->arch.mmu->inject_page_fault(vcpu, fault);
4282}
4283
4284static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
4285 unsigned access, int *nr_present)
4286{
4287 if (unlikely(is_mmio_spte(*sptep))) {
4288 if (gfn != get_mmio_spte_gfn(*sptep)) {
4289 mmu_spte_clear_no_track(sptep);
4290 return true;
4291 }
4292
4293 (*nr_present)++;
4294 mark_mmio_spte(vcpu, sptep, gfn, access);
4295 return true;
4296 }
4297
4298 return false;
4299}
4300
4301static inline bool is_last_gpte(struct kvm_mmu *mmu,
4302 unsigned level, unsigned gpte)
4303{
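 /*
  * The RHS has bit 7 set iff level < mmu->last_nonleaf_level.
  * If it is clear, there are no large pages at this level, so
  * clear PT_PAGE_SIZE_MASK in gpte if that is the case.
  */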
4309 gpte &= level - mmu->last_nonleaf_level;
4310
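 /*
  * PT_PAGE_TABLE_LEVEL always terminates.  The RHS has bit 7 set
  * iff level <= PT_PAGE_TABLE_LEVEL, which for our purpose means
  * level == PT_PAGE_TABLE_LEVEL; set PT_PAGE_SIZE_MASK in gpte
  * in that case.
  */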
4316 gpte |= level - PT_PAGE_TABLE_LEVEL - 1;
4317
4318 return gpte & PT_PAGE_SIZE_MASK;
4319}
4320
4321#define PTTYPE_EPT 18
4322#define PTTYPE PTTYPE_EPT
4323#include "paging_tmpl.h"
4324#undef PTTYPE
4325
4326#define PTTYPE 64
4327#include "paging_tmpl.h"
4328#undef PTTYPE
4329
4330#define PTTYPE 32
4331#include "paging_tmpl.h"
4332#undef PTTYPE
4333
4334static void
4335__reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
4336 struct rsvd_bits_validate *rsvd_check,
4337 int maxphyaddr, int level, bool nx, bool gbpages,
4338 bool pse, bool amd)
4339{
4340 u64 exb_bit_rsvd = 0;
4341 u64 gbpages_bit_rsvd = 0;
4342 u64 nonleaf_bit8_rsvd = 0;
4343
4344 rsvd_check->bad_mt_xwr = 0;
4345
4346 if (!nx)
4347 exb_bit_rsvd = rsvd_bits(63, 63);
4348 if (!gbpages)
4349 gbpages_bit_rsvd = rsvd_bits(7, 7);
4350
4351
4352
4353
4354
4355 if (amd)
4356 nonleaf_bit8_rsvd = rsvd_bits(8, 8);
4357
4358 switch (level) {
4359 case PT32_ROOT_LEVEL:
4360
4361 rsvd_check->rsvd_bits_mask[0][1] = 0;
4362 rsvd_check->rsvd_bits_mask[0][0] = 0;
4363 rsvd_check->rsvd_bits_mask[1][0] =
4364 rsvd_check->rsvd_bits_mask[0][0];
4365
4366 if (!pse) {
4367 rsvd_check->rsvd_bits_mask[1][1] = 0;
4368 break;
4369 }
4370
4371 if (is_cpuid_PSE36())
4372
4373 rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
4374 else
4375
4376 rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
4377 break;
4378 case PT32E_ROOT_LEVEL:
4379 rsvd_check->rsvd_bits_mask[0][2] =
4380 rsvd_bits(maxphyaddr, 63) |
4381 rsvd_bits(5, 8) | rsvd_bits(1, 2);
4382 rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd |
4383 rsvd_bits(maxphyaddr, 62);
4384 rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd |
4385 rsvd_bits(maxphyaddr, 62);
4386 rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd |
4387 rsvd_bits(maxphyaddr, 62) |
4388 rsvd_bits(13, 20);
4389 rsvd_check->rsvd_bits_mask[1][0] =
4390 rsvd_check->rsvd_bits_mask[0][0];
4391 break;
4392 case PT64_ROOT_5LEVEL:
4393 rsvd_check->rsvd_bits_mask[0][4] = exb_bit_rsvd |
4394 nonleaf_bit8_rsvd | rsvd_bits(7, 7) |
4395 rsvd_bits(maxphyaddr, 51);
4396 rsvd_check->rsvd_bits_mask[1][4] =
4397 rsvd_check->rsvd_bits_mask[0][4];
4398
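 /* fall through */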
4399 case PT64_ROOT_4LEVEL:
4400 rsvd_check->rsvd_bits_mask[0][3] = exb_bit_rsvd |
4401 nonleaf_bit8_rsvd | rsvd_bits(7, 7) |
4402 rsvd_bits(maxphyaddr, 51);
4403 rsvd_check->rsvd_bits_mask[0][2] = exb_bit_rsvd |
4404 nonleaf_bit8_rsvd | gbpages_bit_rsvd |
4405 rsvd_bits(maxphyaddr, 51);
4406 rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd |
4407 rsvd_bits(maxphyaddr, 51);
4408 rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd |
4409 rsvd_bits(maxphyaddr, 51);
4410 rsvd_check->rsvd_bits_mask[1][3] =
4411 rsvd_check->rsvd_bits_mask[0][3];
4412 rsvd_check->rsvd_bits_mask[1][2] = exb_bit_rsvd |
4413 gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51) |
4414 rsvd_bits(13, 29);
4415 rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd |
4416 rsvd_bits(maxphyaddr, 51) |
4417 rsvd_bits(13, 20);
4418 rsvd_check->rsvd_bits_mask[1][0] =
4419 rsvd_check->rsvd_bits_mask[0][0];
4420 break;
4421 }
4422}
4423
4424static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
4425 struct kvm_mmu *context)
4426{
4427 __reset_rsvds_bits_mask(vcpu, &context->guest_rsvd_check,
4428 cpuid_maxphyaddr(vcpu), context->root_level,
4429 context->nx,
4430 guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
4431 is_pse(vcpu), guest_cpuid_is_amd(vcpu));
4432}
4433
4434static void
4435__reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
4436 int maxphyaddr, bool execonly)
4437{
4438 u64 bad_mt_xwr;
4439
4440 rsvd_check->rsvd_bits_mask[0][4] =
4441 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
4442 rsvd_check->rsvd_bits_mask[0][3] =
4443 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
4444 rsvd_check->rsvd_bits_mask[0][2] =
4445 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
4446 rsvd_check->rsvd_bits_mask[0][1] =
4447 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
4448 rsvd_check->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51);
4449
4450
4451 rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4];
4452 rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3];
4453 rsvd_check->rsvd_bits_mask[1][2] =
4454 rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29);
4455 rsvd_check->rsvd_bits_mask[1][1] =
4456 rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20);
4457 rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0];
4458
4459 bad_mt_xwr = 0xFFull << (2 * 8);
4460 bad_mt_xwr |= 0xFFull << (3 * 8);
4461 bad_mt_xwr |= 0xFFull << (7 * 8);
4462 bad_mt_xwr |= REPEAT_BYTE(1ull << 2);
4463 bad_mt_xwr |= REPEAT_BYTE(1ull << 6);
4464 if (!execonly) {
4465
4466 bad_mt_xwr |= REPEAT_BYTE(1ull << 4);
4467 }
4468 rsvd_check->bad_mt_xwr = bad_mt_xwr;
4469}
4470
4471static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
4472 struct kvm_mmu *context, bool execonly)
4473{
4474 __reset_rsvds_bits_mask_ept(&context->guest_rsvd_check,
4475 cpuid_maxphyaddr(vcpu), execonly);
4476}
4477
4478
4479
4480
4481
4482
4483void
4484reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
4485{
4486 bool uses_nx = context->nx ||
4487 context->mmu_role.base.smep_andnot_wp;
4488 struct rsvd_bits_validate *shadow_zero_check;
4489 int i;
4490
4491
4492
4493
4494
4495 shadow_zero_check = &context->shadow_zero_check;
4496 __reset_rsvds_bits_mask(vcpu, shadow_zero_check,
4497 boot_cpu_data.x86_phys_bits,
4498 context->shadow_root_level, uses_nx,
4499 guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
4500 is_pse(vcpu), true);
4501
4502 if (!shadow_me_mask)
4503 return;
4504
4505 for (i = context->shadow_root_level; --i >= 0;) {
4506 shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
4507 shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
4508 }
4509
4510}
4511EXPORT_SYMBOL_GPL(reset_shadow_zero_bits_mask);
4512
4513static inline bool boot_cpu_is_amd(void)
4514{
4515 WARN_ON_ONCE(!tdp_enabled);
4516 return shadow_x_mask == 0;
4517}
4518
4519
4520
4521
4522
4523static void
4524reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
4525 struct kvm_mmu *context)
4526{
4527 struct rsvd_bits_validate *shadow_zero_check;
4528 int i;
4529
4530 shadow_zero_check = &context->shadow_zero_check;
4531
4532 if (boot_cpu_is_amd())
4533 __reset_rsvds_bits_mask(vcpu, shadow_zero_check,
4534 boot_cpu_data.x86_phys_bits,
4535 context->shadow_root_level, false,
4536 boot_cpu_has(X86_FEATURE_GBPAGES),
4537 true, true);
4538 else
4539 __reset_rsvds_bits_mask_ept(shadow_zero_check,
4540 boot_cpu_data.x86_phys_bits,
4541 false);
4542
4543 if (!shadow_me_mask)
4544 return;
4545
4546 for (i = context->shadow_root_level; --i >= 0;) {
4547 shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
4548 shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
4549 }
4550}
4551
4552
4553
4554
4555
4556static void
4557reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
4558 struct kvm_mmu *context, bool execonly)
4559{
4560 __reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
4561 boot_cpu_data.x86_phys_bits, execonly);
4562}
4563
4564#define BYTE_MASK(access) \
4565 ((1 & (access) ? 2 : 0) | \
4566 (2 & (access) ? 4 : 0) | \
4567 (3 & (access) ? 8 : 0) | \
4568 (4 & (access) ? 16 : 0) | \
4569 (5 & (access) ? 32 : 0) | \
4570 (6 & (access) ? 64 : 0) | \
4571 (7 & (access) ? 128 : 0))
4572
4573
4574static void update_permission_bitmask(struct kvm_vcpu *vcpu,
4575 struct kvm_mmu *mmu, bool ept)
4576{
4577 unsigned byte;
4578
4579 const u8 x = BYTE_MASK(ACC_EXEC_MASK);
4580 const u8 w = BYTE_MASK(ACC_WRITE_MASK);
4581 const u8 u = BYTE_MASK(ACC_USER_MASK);
4582
4583 bool cr4_smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP) != 0;
4584 bool cr4_smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP) != 0;
4585 bool cr0_wp = is_write_protection(vcpu);
4586
4587 for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) {
4588 unsigned pfec = byte << 1;
4589
4590
4591
4592
4593
4594
4595
4596 u8 wf = (pfec & PFERR_WRITE_MASK) ? ~w : 0;
4597
4598 u8 uf = (pfec & PFERR_USER_MASK) ? ~u : 0;
4599
4600 u8 ff = (pfec & PFERR_FETCH_MASK) ? ~x : 0;
4601
4602 u8 smepf = 0;
4603
4604 u8 smapf = 0;
4605
4606 if (!ept) {
4607
4608 u8 kf = (pfec & PFERR_USER_MASK) ? 0 : u;
4609
4610
4611 if (!mmu->nx)
4612 ff = 0;
4613
4614
4615 if (!cr0_wp)
4616 wf = (pfec & PFERR_USER_MASK) ? wf : 0;
4617
4618
4619 if (cr4_smep)
4620 smepf = (pfec & PFERR_FETCH_MASK) ? kf : 0;
4621
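 /*
  * SMAP: kernel-mode data accesses from user-mode mappings
  * should fault.  A fault is a SMAP violation when CR4.SMAP is
  * set, a user page is accessed, the access is not a fetch and
  * the access is made in supervisor mode with EFLAGS.AC clear.
  * The CR4/user-page/fetch/supervisor conditions are encoded
  * here; permission_fault() folds the CPL/EFLAGS.AC state into
  * the PFERR_RSVD_MASK bit of the index, which is why that bit
  * clears smapf below.
  */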
4638 if (cr4_smap)
4639 smapf = (pfec & (PFERR_RSVD_MASK|PFERR_FETCH_MASK)) ? 0 : kf;
4640 }
4641
4642 mmu->permissions[byte] = ff | uf | wf | smepf | smapf;
4643 }
4644}
4645
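/*
 * PKU is an additional mechanism by which the paging controls access to
 * user-mode addresses based on the value in the PKRU register.  Protection
 * key violations are reported through a bit in the page fault error code.
 * Unlike other bits of the error code, the PK bit is not known at the call
 * site of e.g. gva_to_gpa; it must be computed in permission_fault() from
 * two bits of PKRU, from CR4.PKE/CR0.WP and from the other bits of the
 * error code and the page tables.
 *
 * The pkru_mask below caches, for each error-code/U-bit combination, two
 * bits that say whether PKRU.AD and PKRU.WD of the affected protection key
 * may cause a fault.  PK is always zero unless CR4.PKE=1 and long mode is
 * active, or if the reserved bit is set, the access is a fetch, or U=0 in
 * the page tables; PKRU.WD is ignored for reads and, when CR0.WP=0, for
 * supervisor writes.
 */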
4670static void update_pkru_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
4671 bool ept)
4672{
4673 unsigned bit;
4674 bool wp;
4675
4676 if (ept) {
4677 mmu->pkru_mask = 0;
4678 return;
4679 }
4680
4681
4682 if (!kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || !is_long_mode(vcpu)) {
4683 mmu->pkru_mask = 0;
4684 return;
4685 }
4686
4687 wp = is_write_protection(vcpu);
4688
4689 for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) {
4690 unsigned pfec, pkey_bits;
4691 bool check_pkey, check_write, ff, uf, wf, pte_user;
4692
4693 pfec = bit << 1;
4694 ff = pfec & PFERR_FETCH_MASK;
4695 uf = pfec & PFERR_USER_MASK;
4696 wf = pfec & PFERR_WRITE_MASK;
4697
4698
4699 pte_user = pfec & PFERR_RSVD_MASK;
4700
4701
4702
4703
4704
4705 check_pkey = (!ff && pte_user);
4706
4707
4708
4709
4710 check_write = check_pkey && wf && (uf || wp);
4711
4712
4713 pkey_bits = !!check_pkey;
4714
4715 pkey_bits |= (!!check_write) << 1;
4716
4717 mmu->pkru_mask |= (pkey_bits & 3) << pfec;
4718 }
4719}
4720
4721static void update_last_nonleaf_level(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
4722{
4723 unsigned root_level = mmu->root_level;
4724
4725 mmu->last_nonleaf_level = root_level;
4726 if (root_level == PT32_ROOT_LEVEL && is_pse(vcpu))
4727 mmu->last_nonleaf_level++;
4728}
4729
4730static void paging64_init_context_common(struct kvm_vcpu *vcpu,
4731 struct kvm_mmu *context,
4732 int level)
4733{
4734 context->nx = is_nx(vcpu);
4735 context->root_level = level;
4736
4737 reset_rsvds_bits_mask(vcpu, context);
4738 update_permission_bitmask(vcpu, context, false);
4739 update_pkru_bitmask(vcpu, context, false);
4740 update_last_nonleaf_level(vcpu, context);
4741
4742 MMU_WARN_ON(!is_pae(vcpu));
4743 context->page_fault = paging64_page_fault;
4744 context->gva_to_gpa = paging64_gva_to_gpa;
4745 context->sync_page = paging64_sync_page;
4746 context->invlpg = paging64_invlpg;
4747 context->update_pte = paging64_update_pte;
4748 context->shadow_root_level = level;
4749 context->direct_map = false;
4750}
4751
4752static void paging64_init_context(struct kvm_vcpu *vcpu,
4753 struct kvm_mmu *context)
4754{
4755 int root_level = is_la57_mode(vcpu) ?
4756 PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
4757
4758 paging64_init_context_common(vcpu, context, root_level);
4759}
4760
4761static void paging32_init_context(struct kvm_vcpu *vcpu,
4762 struct kvm_mmu *context)
4763{
4764 context->nx = false;
4765 context->root_level = PT32_ROOT_LEVEL;
4766
4767 reset_rsvds_bits_mask(vcpu, context);
4768 update_permission_bitmask(vcpu, context, false);
4769 update_pkru_bitmask(vcpu, context, false);
4770 update_last_nonleaf_level(vcpu, context);
4771
4772 context->page_fault = paging32_page_fault;
4773 context->gva_to_gpa = paging32_gva_to_gpa;
4774 context->sync_page = paging32_sync_page;
4775 context->invlpg = paging32_invlpg;
4776 context->update_pte = paging32_update_pte;
4777 context->shadow_root_level = PT32E_ROOT_LEVEL;
4778 context->direct_map = false;
4779}
4780
4781static void paging32E_init_context(struct kvm_vcpu *vcpu,
4782 struct kvm_mmu *context)
4783{
4784 paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL);
4785}
4786
4787static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu)
4788{
4789 union kvm_mmu_extended_role ext = {0};
4790
4791 ext.cr0_pg = !!is_paging(vcpu);
4792 ext.cr4_pae = !!is_pae(vcpu);
4793 ext.cr4_smep = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
4794 ext.cr4_smap = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMAP);
4795 ext.cr4_pse = !!is_pse(vcpu);
4796 ext.cr4_pke = !!kvm_read_cr4_bits(vcpu, X86_CR4_PKE);
4797 ext.cr4_la57 = !!kvm_read_cr4_bits(vcpu, X86_CR4_LA57);
4798 ext.maxphyaddr = cpuid_maxphyaddr(vcpu);
4799
4800 ext.valid = 1;
4801
4802 return ext;
4803}
4804
4805static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu,
4806 bool base_only)
4807{
4808 union kvm_mmu_role role = {0};
4809
4810 role.base.access = ACC_ALL;
4811 role.base.nxe = !!is_nx(vcpu);
4812 role.base.cr0_wp = is_write_protection(vcpu);
4813 role.base.smm = is_smm(vcpu);
4814 role.base.guest_mode = is_guest_mode(vcpu);
4815
4816 if (base_only)
4817 return role;
4818
4819 role.ext = kvm_calc_mmu_role_ext(vcpu);
4820
4821 return role;
4822}
4823
4824static union kvm_mmu_role
4825kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
4826{
4827 union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only);
4828
4829 role.base.ad_disabled = (shadow_accessed_mask == 0);
4830 role.base.level = kvm_x86_ops->get_tdp_level(vcpu);
4831 role.base.direct = true;
4832 role.base.gpte_is_8_bytes = true;
4833
4834 return role;
4835}
4836
4837static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
4838{
4839 struct kvm_mmu *context = vcpu->arch.mmu;
4840 union kvm_mmu_role new_role =
4841 kvm_calc_tdp_mmu_root_page_role(vcpu, false);
4842
4843 new_role.base.word &= mmu_base_role_mask.word;
4844 if (new_role.as_u64 == context->mmu_role.as_u64)
4845 return;
4846
4847 context->mmu_role.as_u64 = new_role.as_u64;
4848 context->page_fault = tdp_page_fault;
4849 context->sync_page = nonpaging_sync_page;
4850 context->invlpg = nonpaging_invlpg;
4851 context->update_pte = nonpaging_update_pte;
4852 context->shadow_root_level = kvm_x86_ops->get_tdp_level(vcpu);
4853 context->direct_map = true;
4854 context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
4855 context->get_cr3 = get_cr3;
4856 context->get_pdptr = kvm_pdptr_read;
4857 context->inject_page_fault = kvm_inject_page_fault;
4858
4859 if (!is_paging(vcpu)) {
4860 context->nx = false;
4861 context->gva_to_gpa = nonpaging_gva_to_gpa;
4862 context->root_level = 0;
4863 } else if (is_long_mode(vcpu)) {
4864 context->nx = is_nx(vcpu);
4865 context->root_level = is_la57_mode(vcpu) ?
4866 PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
4867 reset_rsvds_bits_mask(vcpu, context);
4868 context->gva_to_gpa = paging64_gva_to_gpa;
4869 } else if (is_pae(vcpu)) {
4870 context->nx = is_nx(vcpu);
4871 context->root_level = PT32E_ROOT_LEVEL;
4872 reset_rsvds_bits_mask(vcpu, context);
4873 context->gva_to_gpa = paging64_gva_to_gpa;
4874 } else {
4875 context->nx = false;
4876 context->root_level = PT32_ROOT_LEVEL;
4877 reset_rsvds_bits_mask(vcpu, context);
4878 context->gva_to_gpa = paging32_gva_to_gpa;
4879 }
4880
4881 update_permission_bitmask(vcpu, context, false);
4882 update_pkru_bitmask(vcpu, context, false);
4883 update_last_nonleaf_level(vcpu, context);
4884 reset_tdp_shadow_zero_bits_mask(vcpu, context);
4885}
4886
4887static union kvm_mmu_role
4888kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
4889{
4890 union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only);
4891
4892 role.base.smep_andnot_wp = role.ext.cr4_smep &&
4893 !is_write_protection(vcpu);
4894 role.base.smap_andnot_wp = role.ext.cr4_smap &&
4895 !is_write_protection(vcpu);
4896 role.base.direct = !is_paging(vcpu);
4897 role.base.gpte_is_8_bytes = !!is_pae(vcpu);
4898
4899 if (!is_long_mode(vcpu))
4900 role.base.level = PT32E_ROOT_LEVEL;
4901 else if (is_la57_mode(vcpu))
4902 role.base.level = PT64_ROOT_5LEVEL;
4903 else
4904 role.base.level = PT64_ROOT_4LEVEL;
4905
4906 return role;
4907}
4908
4909void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
4910{
4911 struct kvm_mmu *context = vcpu->arch.mmu;
4912 union kvm_mmu_role new_role =
4913 kvm_calc_shadow_mmu_root_page_role(vcpu, false);
4914
4915 new_role.base.word &= mmu_base_role_mask.word;
4916 if (new_role.as_u64 == context->mmu_role.as_u64)
4917 return;
4918
4919 if (!is_paging(vcpu))
4920 nonpaging_init_context(vcpu, context);
4921 else if (is_long_mode(vcpu))
4922 paging64_init_context(vcpu, context);
4923 else if (is_pae(vcpu))
4924 paging32E_init_context(vcpu, context);
4925 else
4926 paging32_init_context(vcpu, context);
4927
4928 context->mmu_role.as_u64 = new_role.as_u64;
4929 reset_shadow_zero_bits_mask(vcpu, context);
4930}
4931EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
4932
4933static union kvm_mmu_role
4934kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
4935 bool execonly)
4936{
4937 union kvm_mmu_role role = {0};
4938
4939
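	/* The SMM flag is inherited from the root MMU's role. */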
4940 role.base.smm = vcpu->arch.root_mmu.mmu_role.base.smm;
4941
4942 role.base.level = PT64_ROOT_4LEVEL;
4943 role.base.gpte_is_8_bytes = true;
4944 role.base.direct = false;
4945 role.base.ad_disabled = !accessed_dirty;
4946 role.base.guest_mode = true;
4947 role.base.access = ACC_ALL;
4948
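	/*
	 * CR0_WP=1 together with SMAP_ANDNOT_WP is an impossible combination
	 * for an ordinary shadow MMU role, so it is used to mark shadow EPT
	 * roles.
	 */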
4953 role.base.cr0_wp = true;
4954 role.base.smap_andnot_wp = true;
4955
4956 role.ext = kvm_calc_mmu_role_ext(vcpu);
4957 role.ext.execonly = execonly;
4958
4959 return role;
4960}
4961
4962void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
4963 bool accessed_dirty, gpa_t new_eptp)
4964{
4965 struct kvm_mmu *context = vcpu->arch.mmu;
4966 union kvm_mmu_role new_role =
4967 kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty,
4968 execonly);
4969
4970 __kvm_mmu_new_cr3(vcpu, new_eptp, new_role.base, false);
4971
4972 new_role.base.word &= mmu_base_role_mask.word;
4973 if (new_role.as_u64 == context->mmu_role.as_u64)
4974 return;
4975
4976 context->shadow_root_level = PT64_ROOT_4LEVEL;
4977
4978 context->nx = true;
4979 context->ept_ad = accessed_dirty;
4980 context->page_fault = ept_page_fault;
4981 context->gva_to_gpa = ept_gva_to_gpa;
4982 context->sync_page = ept_sync_page;
4983 context->invlpg = ept_invlpg;
4984 context->update_pte = ept_update_pte;
4985 context->root_level = PT64_ROOT_4LEVEL;
4986 context->direct_map = false;
4987 context->mmu_role.as_u64 = new_role.as_u64;
4988
4989 update_permission_bitmask(vcpu, context, true);
4990 update_pkru_bitmask(vcpu, context, true);
4991 update_last_nonleaf_level(vcpu, context);
4992 reset_rsvds_bits_mask_ept(vcpu, context, execonly);
4993 reset_ept_shadow_zero_bits_mask(vcpu, context, execonly);
4994}
4995EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
4996
4997static void init_kvm_softmmu(struct kvm_vcpu *vcpu)
4998{
4999 struct kvm_mmu *context = vcpu->arch.mmu;
5000
5001 kvm_init_shadow_mmu(vcpu);
5002 context->set_cr3 = kvm_x86_ops->set_cr3;
5003 context->get_cr3 = get_cr3;
5004 context->get_pdptr = kvm_pdptr_read;
5005 context->inject_page_fault = kvm_inject_page_fault;
5006}
5007
5008static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
5009{
5010 union kvm_mmu_role new_role = kvm_calc_mmu_role_common(vcpu, false);
5011 struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
5012
5013 new_role.base.word &= mmu_base_role_mask.word;
5014 if (new_role.as_u64 == g_context->mmu_role.as_u64)
5015 return;
5016
5017 g_context->mmu_role.as_u64 = new_role.as_u64;
5018 g_context->get_cr3 = get_cr3;
5019 g_context->get_pdptr = kvm_pdptr_read;
5020 g_context->inject_page_fault = kvm_inject_page_fault;
5021
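	/*
	 * arch.mmu->gva_to_gpa translates an L2 gpa to an L1 gpa using L1's
	 * nested page tables, while nested_mmu->gva_to_gpa (the *_nested
	 * variants below) translates an L2 gva to an L1 gpa by walking L2's
	 * page tables first and L1's nested page tables second; the two
	 * gva_to_gpa roles are effectively swapped.
	 */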
5030 if (!is_paging(vcpu)) {
5031 g_context->nx = false;
5032 g_context->root_level = 0;
5033 g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
5034 } else if (is_long_mode(vcpu)) {
5035 g_context->nx = is_nx(vcpu);
5036 g_context->root_level = is_la57_mode(vcpu) ?
5037 PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
5038 reset_rsvds_bits_mask(vcpu, g_context);
5039 g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
5040 } else if (is_pae(vcpu)) {
5041 g_context->nx = is_nx(vcpu);
5042 g_context->root_level = PT32E_ROOT_LEVEL;
5043 reset_rsvds_bits_mask(vcpu, g_context);
5044 g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
5045 } else {
5046 g_context->nx = false;
5047 g_context->root_level = PT32_ROOT_LEVEL;
5048 reset_rsvds_bits_mask(vcpu, g_context);
5049 g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
5050 }
5051
5052 update_permission_bitmask(vcpu, g_context, false);
5053 update_pkru_bitmask(vcpu, g_context, false);
5054 update_last_nonleaf_level(vcpu, g_context);
5055}
5056
5057void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots)
5058{
5059 if (reset_roots) {
5060 uint i;
5061
5062 vcpu->arch.mmu->root_hpa = INVALID_PAGE;
5063
5064 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
5065 vcpu->arch.mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
5066 }
5067
5068 if (mmu_is_nested(vcpu))
5069 init_kvm_nested_mmu(vcpu);
5070 else if (tdp_enabled)
5071 init_kvm_tdp_mmu(vcpu);
5072 else
5073 init_kvm_softmmu(vcpu);
5074}
5075EXPORT_SYMBOL_GPL(kvm_init_mmu);
5076
5077static union kvm_mmu_page_role
5078kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu)
5079{
5080 union kvm_mmu_role role;
5081
5082 if (tdp_enabled)
5083 role = kvm_calc_tdp_mmu_root_page_role(vcpu, true);
5084 else
5085 role = kvm_calc_shadow_mmu_root_page_role(vcpu, true);
5086
5087 return role.base;
5088}
5089
5090void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
5091{
5092 kvm_mmu_unload(vcpu);
5093 kvm_init_mmu(vcpu, true);
5094}
5095EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
5096
5097int kvm_mmu_load(struct kvm_vcpu *vcpu)
5098{
5099 int r;
5100
5101 r = mmu_topup_memory_caches(vcpu);
5102 if (r)
5103 goto out;
5104 r = mmu_alloc_roots(vcpu);
5105 kvm_mmu_sync_roots(vcpu);
5106 if (r)
5107 goto out;
5108 kvm_mmu_load_cr3(vcpu);
5109 kvm_x86_ops->tlb_flush(vcpu, true);
5110out:
5111 return r;
5112}
5113EXPORT_SYMBOL_GPL(kvm_mmu_load);
5114
5115void kvm_mmu_unload(struct kvm_vcpu *vcpu)
5116{
5117 kvm_mmu_free_roots(vcpu, &vcpu->arch.root_mmu, KVM_MMU_ROOTS_ALL);
5118 WARN_ON(VALID_PAGE(vcpu->arch.root_mmu.root_hpa));
5119 kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
5120 WARN_ON(VALID_PAGE(vcpu->arch.guest_mmu.root_hpa));
5121}
5122EXPORT_SYMBOL_GPL(kvm_mmu_unload);
5123
5124static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
5125 struct kvm_mmu_page *sp, u64 *spte,
5126 const void *new)
5127{
5128 if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
5129 ++vcpu->kvm->stat.mmu_pde_zapped;
5130 return;
5131 }
5132
5133 ++vcpu->kvm->stat.mmu_pte_updated;
5134 vcpu->arch.mmu->update_pte(vcpu, sp, spte, new);
5135}
5136
5137static bool need_remote_flush(u64 old, u64 new)
5138{
5139 if (!is_shadow_present_pte(old))
5140 return false;
5141 if (!is_shadow_present_pte(new))
5142 return true;
5143 if ((old ^ new) & PT64_BASE_ADDR_MASK)
5144 return true;
5145 old ^= shadow_nx_mask;
5146 new ^= shadow_nx_mask;
5147 return (old & ~new & PT64_PERM_MASK) != 0;
5148}
5149
5150static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
5151 int *bytes)
5152{
5153 u64 gentry = 0;
5154 int r;
5155
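	/*
	 * Assume the write hits a gpte of the same size as the current
	 * paging mode, since sptes are only updated when the modes match.
	 * A 32-bit guest running with PAE updates a 64-bit gpte as two
	 * 4-byte halves, so widen such writes to cover the whole gpte.
	 */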
5161 if (is_pae(vcpu) && *bytes == 4) {
5162
5163 *gpa &= ~(gpa_t)7;
5164 *bytes = 8;
5165 }
5166
5167 if (*bytes == 4 || *bytes == 8) {
5168 r = kvm_vcpu_read_guest_atomic(vcpu, *gpa, &gentry, *bytes);
5169 if (r)
5170 gentry = 0;
5171 }
5172
5173 return gentry;
5174}
5175
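/*
 * If a page is written too often it is probably no longer being used as a
 * page table (or the guest is forking), in which case unmapping it is the
 * better option.
 */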
5180static bool detect_write_flooding(struct kvm_mmu_page *sp)
5181{
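	/*
	 * Last-level shadow pages can be left unsync, so writes to them are
	 * expected and are not treated as flooding.
	 */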
5186 if (sp->role.level == PT_PAGE_TABLE_LEVEL)
5187 return false;
5188
5189 atomic_inc(&sp->write_flooding_count);
5190 return atomic_read(&sp->write_flooding_count) >= 3;
5191}
5192
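/*
 * Misaligned accesses are too much trouble to fix up and usually indicate
 * that the page is not being used as a page table.
 */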
5197static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
5198 int bytes)
5199{
5200 unsigned offset, pte_size, misaligned;
5201
5202 pgprintk("misaligned: gpa %llx bytes %d role %x\n",
5203 gpa, bytes, sp->role.word);
5204
5205 offset = offset_in_page(gpa);
5206 pte_size = sp->role.gpte_is_8_bytes ? 8 : 4;
5207
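	/*
	 * Guests sometimes update only the last byte of a pte, e.g. flipping
	 * status bits with a byte-sized instruction, so an aligned
	 * single-byte write is not treated as misaligned.
	 */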
5212 if (!(offset & (pte_size - 1)) && bytes == 1)
5213 return false;
5214
5215 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
5216 misaligned |= bytes < 4;
5217
5218 return misaligned;
5219}
5220
5221static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
5222{
5223 unsigned page_offset, quadrant;
5224 u64 *spte;
5225 int level;
5226
5227 page_offset = offset_in_page(gpa);
5228 level = sp->role.level;
5229 *nspte = 1;
5230 if (!sp->role.gpte_is_8_bytes) {
5231 page_offset <<= 1;
5232
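		/*
		 * A 32-bit pde maps 4MB while the shadow pdes map only 2MB,
		 * so the offset is doubled again and two sptes are zapped
		 * instead of one.
		 */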
5237 if (level == PT32_ROOT_LEVEL) {
5238 page_offset &= ~7;
5239 page_offset <<= 1;
5240 *nspte = 2;
5241 }
5242 quadrant = page_offset >> PAGE_SHIFT;
5243 page_offset &= ~PAGE_MASK;
5244 if (quadrant != sp->role.quadrant)
5245 return NULL;
5246 }
5247
5248 spte = &sp->spt[page_offset / sizeof(*spte)];
5249 return spte;
5250}
5251
5252static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
5253 const u8 *new, int bytes,
5254 struct kvm_page_track_notifier_node *node)
5255{
5256 gfn_t gfn = gpa >> PAGE_SHIFT;
5257 struct kvm_mmu_page *sp;
5258 LIST_HEAD(invalid_list);
5259 u64 entry, gentry, *spte;
5260 int npte;
5261 bool remote_flush, local_flush;
5262
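	/*
	 * If there are no indirect shadow pages then no gfn is
	 * write-protected and there is nothing to do.
	 */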
5267 if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
5268 return;
5269
5270 remote_flush = local_flush = false;
5271
5272 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
5273
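	/*
	 * The return value is deliberately ignored: if the caches could not
	 * be topped up, pte prefetch is simply skipped later on.
	 */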
5279 mmu_topup_memory_caches(vcpu);
5280
5281 spin_lock(&vcpu->kvm->mmu_lock);
5282
5283 gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes);
5284
5285 ++vcpu->kvm->stat.mmu_pte_write;
5286 kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
5287
5288 for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
5289 if (detect_write_misaligned(sp, gpa, bytes) ||
5290 detect_write_flooding(sp)) {
5291 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
5292 ++vcpu->kvm->stat.mmu_flooded;
5293 continue;
5294 }
5295
5296 spte = get_written_sptes(sp, gpa, &npte);
5297 if (!spte)
5298 continue;
5299
5300 local_flush = true;
5301 while (npte--) {
5302 u32 base_role = vcpu->arch.mmu->mmu_role.base.word;
5303
5304 entry = *spte;
5305 mmu_page_zap_pte(vcpu->kvm, sp, spte);
5306 if (gentry &&
5307 !((sp->role.word ^ base_role)
5308 & mmu_base_role_mask.word) && rmap_can_add(vcpu))
5309 mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
5310 if (need_remote_flush(entry, *spte))
5311 remote_flush = true;
5312 ++spte;
5313 }
5314 }
5315 kvm_mmu_flush_or_zap(vcpu, &invalid_list, remote_flush, local_flush);
5316 kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
5317 spin_unlock(&vcpu->kvm->mmu_lock);
5318}
5319
5320int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
5321{
5322 gpa_t gpa;
5323 int r;
5324
5325 if (vcpu->arch.mmu->direct_map)
5326 return 0;
5327
5328 gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
5329
5330 r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
5331
5332 return r;
5333}
5334EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
5335
5336static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
5337{
5338 LIST_HEAD(invalid_list);
5339
5340 if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES))
5341 return 0;
5342
5343 while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES) {
5344 if (!prepare_zap_oldest_mmu_page(vcpu->kvm, &invalid_list))
5345 break;
5346
5347 ++vcpu->kvm->stat.mmu_recycled;
5348 }
5349 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
5350
5351 if (!kvm_mmu_available_pages(vcpu->kvm))
5352 return -ENOSPC;
5353 return 0;
5354}
5355
5356int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
5357 void *insn, int insn_len)
5358{
5359 int r, emulation_type = 0;
5360 enum emulation_result er;
5361 bool direct = vcpu->arch.mmu->direct_map;
5362
5363
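	/*
	 * With TDP, cr2 is the faulting GPA, so cache it for the emulator;
	 * with shadow paging it is a GVA or nested GPA and cannot be used
	 * that way.
	 */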
5364 if (vcpu->arch.mmu->direct_map) {
5365 vcpu->arch.gpa_available = true;
5366 vcpu->arch.gpa_val = cr2;
5367 }
5368
5369 r = RET_PF_INVALID;
5370 if (unlikely(error_code & PFERR_RSVD_MASK)) {
5371 r = handle_mmio_page_fault(vcpu, cr2, direct);
5372 if (r == RET_PF_EMULATE)
5373 goto emulate;
5374 }
5375
5376 if (r == RET_PF_INVALID) {
5377 r = vcpu->arch.mmu->page_fault(vcpu, cr2,
5378 lower_32_bits(error_code),
5379 false);
5380 WARN_ON(r == RET_PF_INVALID);
5381 }
5382
5383 if (r == RET_PF_RETRY)
5384 return 1;
5385 if (r < 0)
5386 return r;
5387
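	/*
	 * A write-protection fault taken while the hardware was writing to
	 * the guest's own page tables (nested paging in both L0 and L1) can
	 * be resolved by simply unprotecting the gfn and resuming the guest,
	 * without emulating the faulting instruction.
	 */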
5395 if (vcpu->arch.mmu->direct_map &&
5396 (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
5397 kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2));
5398 return 1;
5399 }
5400
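	/*
	 * For RET_PF_EMULATE, let the emulator retry the instruction after
	 * unprotecting the page, but never for cached MMIO accesses (a retry
	 * would keep faulting on an address that does not exist) and never
	 * while running a nested guest, since only L1's page tables are
	 * shadowed here.
	 */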
5412 if (!mmio_info_in_cache(vcpu, cr2, direct) && !is_guest_mode(vcpu))
5413 emulation_type = EMULTYPE_ALLOW_RETRY;
5414emulate:
5415
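	/*
	 * A zero instruction length means the instruction bytes could not be
	 * fetched; ask the vendor code whether emulation is still required
	 * or the guest can simply be re-entered.
	 */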
5422 if (unlikely(insn && !insn_len)) {
5423 if (!kvm_x86_ops->need_emulation_on_page_fault(vcpu))
5424 return 1;
5425 }
5426
5427 er = x86_emulate_instruction(vcpu, cr2, emulation_type, insn, insn_len);
5428
5429 switch (er) {
5430 case EMULATE_DONE:
5431 return 1;
5432 case EMULATE_USER_EXIT:
5433 ++vcpu->stat.mmio_exits;
		/* fall through */
5435 case EMULATE_FAIL:
5436 return 0;
5437 default:
5438 BUG();
5439 }
5440}
5441EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
5442
5443void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
5444{
5445 struct kvm_mmu *mmu = vcpu->arch.mmu;
5446 int i;
5447
5448
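	/* INVLPG of a non-canonical address is a no-op according to the SDM. */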
5449 if (is_noncanonical_address(gva, vcpu))
5450 return;
5451
5452 mmu->invlpg(vcpu, gva, mmu->root_hpa);
5453
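	/*
	 * INVLPG must also drop any global mapping of the address, so rather
	 * than work out whether a cached previous root maps it globally,
	 * simply invalidate the address in every valid prev_root as well.
	 * Roots that are not cached here are synced when they are switched
	 * to, so nothing more is needed for them.
	 */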
5465 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
5466 if (VALID_PAGE(mmu->prev_roots[i].hpa))
5467 mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
5468
5469 kvm_x86_ops->tlb_flush_gva(vcpu, gva);
5470 ++vcpu->stat.invlpg;
5471}
5472EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
5473
5474void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
5475{
5476 struct kvm_mmu *mmu = vcpu->arch.mmu;
5477 bool tlb_flush = false;
5478 uint i;
5479
5480 if (pcid == kvm_get_active_pcid(vcpu)) {
5481 mmu->invlpg(vcpu, gva, mmu->root_hpa);
5482 tlb_flush = true;
5483 }
5484
5485 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
5486 if (VALID_PAGE(mmu->prev_roots[i].hpa) &&
5487 pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].cr3)) {
5488 mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
5489 tlb_flush = true;
5490 }
5491 }
5492
5493 if (tlb_flush)
5494 kvm_x86_ops->tlb_flush_gva(vcpu, gva);
5495
5496 ++vcpu->stat.invlpg;
5497
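	/*
	 * Roots that are neither current nor cached in prev_roots are synced
	 * when they are switched to, so nothing more needs to be done for
	 * them here.
	 */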
5503}
5504EXPORT_SYMBOL_GPL(kvm_mmu_invpcid_gva);
5505
5506void kvm_enable_tdp(void)
5507{
5508 tdp_enabled = true;
5509}
5510EXPORT_SYMBOL_GPL(kvm_enable_tdp);
5511
5512void kvm_disable_tdp(void)
5513{
5514 tdp_enabled = false;
5515}
5516EXPORT_SYMBOL_GPL(kvm_disable_tdp);
5517
5518
5519
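/* The handler returns true if a TLB flush on all vcpus is needed. */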
5520typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head);
5521
5522
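/* The caller must hold kvm->mmu_lock. */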
5523static __always_inline bool
5524slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
5525 slot_level_handler fn, int start_level, int end_level,
5526 gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb)
5527{
5528 struct slot_rmap_walk_iterator iterator;
5529 bool flush = false;
5530
5531 for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
5532 end_gfn, &iterator) {
5533 if (iterator.rmap)
5534 flush |= fn(kvm, iterator.rmap);
5535
5536 if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
5537 if (flush && lock_flush_tlb) {
5538 kvm_flush_remote_tlbs_with_address(kvm,
5539 start_gfn,
5540 iterator.gfn - start_gfn + 1);
5541 flush = false;
5542 }
5543 cond_resched_lock(&kvm->mmu_lock);
5544 }
5545 }
5546
5547 if (flush && lock_flush_tlb) {
5548 kvm_flush_remote_tlbs_with_address(kvm, start_gfn,
5549 end_gfn - start_gfn + 1);
5550 flush = false;
5551 }
5552
5553 return flush;
5554}
5555
5556static __always_inline bool
5557slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
5558 slot_level_handler fn, int start_level, int end_level,
5559 bool lock_flush_tlb)
5560{
5561 return slot_handle_level_range(kvm, memslot, fn, start_level,
5562 end_level, memslot->base_gfn,
5563 memslot->base_gfn + memslot->npages - 1,
5564 lock_flush_tlb);
5565}
5566
5567static __always_inline bool
5568slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
5569 slot_level_handler fn, bool lock_flush_tlb)
5570{
5571 return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
5572 PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
5573}
5574
5575static __always_inline bool
5576slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
5577 slot_level_handler fn, bool lock_flush_tlb)
5578{
5579 return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL + 1,
5580 PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
5581}
5582
5583static __always_inline bool
5584slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot,
5585 slot_level_handler fn, bool lock_flush_tlb)
5586{
5587 return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
5588 PT_PAGE_TABLE_LEVEL, lock_flush_tlb);
5589}
5590
5591static void free_mmu_pages(struct kvm_vcpu *vcpu)
5592{
5593 free_page((unsigned long)vcpu->arch.mmu->pae_root);
5594 free_page((unsigned long)vcpu->arch.mmu->lm_root);
5595}
5596
5597static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
5598{
5599 struct page *page;
5600 int i;
5601
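	/*
	 * With PAE paging the four PDPTEs act as root pages and, because CR3
	 * is only 32 bits wide in that mode, they must live below 4GB, hence
	 * the __GFP_DMA32 allocation.  When TDP is used with a root level
	 * deeper than PT32E the table is never needed, so skip it.
	 */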
5611 if (tdp_enabled && kvm_x86_ops->get_tdp_level(vcpu) > PT32E_ROOT_LEVEL)
5612 return 0;
5613
5614 page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32);
5615 if (!page)
5616 return -ENOMEM;
5617
5618 vcpu->arch.mmu->pae_root = page_address(page);
5619 for (i = 0; i < 4; ++i)
5620 vcpu->arch.mmu->pae_root[i] = INVALID_PAGE;
5621
5622 return 0;
5623}
5624
5625int kvm_mmu_create(struct kvm_vcpu *vcpu)
5626{
5627 uint i;
5628
5629 vcpu->arch.mmu = &vcpu->arch.root_mmu;
5630 vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
5631
5632 vcpu->arch.root_mmu.root_hpa = INVALID_PAGE;
5633 vcpu->arch.root_mmu.root_cr3 = 0;
5634 vcpu->arch.root_mmu.translate_gpa = translate_gpa;
5635 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
5636 vcpu->arch.root_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
5637
5638 vcpu->arch.guest_mmu.root_hpa = INVALID_PAGE;
5639 vcpu->arch.guest_mmu.root_cr3 = 0;
5640 vcpu->arch.guest_mmu.translate_gpa = translate_gpa;
5641 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
5642 vcpu->arch.guest_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
5643
5644 vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
5645 return alloc_mmu_pages(vcpu);
5646}
5647
5648static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
5649 struct kvm_memory_slot *slot,
5650 struct kvm_page_track_notifier_node *node)
5651{
5652 struct kvm_mmu_page *sp;
5653 LIST_HEAD(invalid_list);
5654 unsigned long i;
5655 bool flush;
5656 gfn_t gfn;
5657
5658 spin_lock(&kvm->mmu_lock);
5659
5660 if (list_empty(&kvm->arch.active_mmu_pages))
5661 goto out_unlock;
5662
5663 flush = slot_handle_all_level(kvm, slot, kvm_zap_rmapp, false);
5664
5665 for (i = 0; i < slot->npages; i++) {
5666 gfn = slot->base_gfn + i;
5667
5668 for_each_valid_sp(kvm, sp, gfn) {
5669 if (sp->gfn != gfn)
5670 continue;
5671
5672 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
5673 }
5674 if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
5675 kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
5676 flush = false;
5677 cond_resched_lock(&kvm->mmu_lock);
5678 }
5679 }
5680 kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
5681
5682out_unlock:
5683 spin_unlock(&kvm->mmu_lock);
5684}
5685
5686void kvm_mmu_init_vm(struct kvm *kvm)
5687{
5688 struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
5689
5690 node->track_write = kvm_mmu_pte_write;
5691 node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
5692 kvm_page_track_register_notifier(kvm, node);
5693}
5694
5695void kvm_mmu_uninit_vm(struct kvm *kvm)
5696{
5697 struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
5698
5699 kvm_page_track_unregister_notifier(kvm, node);
5700}
5701
5702void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
5703{
5704 struct kvm_memslots *slots;
5705 struct kvm_memory_slot *memslot;
5706 int i;
5707
5708 spin_lock(&kvm->mmu_lock);
5709 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
5710 slots = __kvm_memslots(kvm, i);
5711 kvm_for_each_memslot(memslot, slots) {
5712 gfn_t start, end;
5713
5714 start = max(gfn_start, memslot->base_gfn);
5715 end = min(gfn_end, memslot->base_gfn + memslot->npages);
5716 if (start >= end)
5717 continue;
5718
5719 slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
5720 PT_PAGE_TABLE_LEVEL, PT_MAX_HUGEPAGE_LEVEL,
5721 start, end - 1, true);
5722 }
5723 }
5724
5725 spin_unlock(&kvm->mmu_lock);
5726}
5727
5728static bool slot_rmap_write_protect(struct kvm *kvm,
5729 struct kvm_rmap_head *rmap_head)
5730{
5731 return __rmap_write_protect(kvm, rmap_head, false);
5732}
5733
5734void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
5735 struct kvm_memory_slot *memslot)
5736{
5737 bool flush;
5738
5739 spin_lock(&kvm->mmu_lock);
5740 flush = slot_handle_all_level(kvm, memslot, slot_rmap_write_protect,
5741 false);
5742 spin_unlock(&kvm->mmu_lock);
5743
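	/*
	 * The TLB flush below happens outside of mmu_lock, so it must be
	 * serialized against the dirty-log paths via kvm->slots_lock or a
	 * needed flush could be missed.
	 */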
5749 lockdep_assert_held(&kvm->slots_lock);
5750
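	/*
	 * Flushing outside of mmu_lock is safe here: sptes only went from
	 * writable to read-only, and concurrent spte updates key their flush
	 * decisions off SPTE_MMU_WRITEABLE/SPTE_HOST_WRITEABLE rather than
	 * PT_WRITABLE_MASK, so no transition can be missed.
	 */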
5762 if (flush)
5763 kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
5764 memslot->npages);
5765}
5766
5767static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
5768 struct kvm_rmap_head *rmap_head)
5769{
5770 u64 *sptep;
5771 struct rmap_iterator iter;
5772 int need_tlb_flush = 0;
5773 kvm_pfn_t pfn;
5774 struct kvm_mmu_page *sp;
5775
5776restart:
5777 for_each_rmap_spte(rmap_head, &iter, sptep) {
5778 sp = page_header(__pa(sptep));
5779 pfn = spte_to_pfn(*sptep);
5780
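		/*
		 * Only direct (TDP) shadow pages can be collapsed back into a
		 * huge mapping; indirect pages mirror guest page tables that
		 * may still use 4K mappings.  Zap the spte when the backing
		 * page is part of a mapped transparent huge page so the next
		 * fault can install a large mapping.
		 */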
5788 if (sp->role.direct &&
5789 !kvm_is_reserved_pfn(pfn) &&
5790 PageTransCompoundMap(pfn_to_page(pfn))) {
5791 pte_list_remove(rmap_head, sptep);
5792
5793 if (kvm_available_flush_tlb_with_range())
5794 kvm_flush_remote_tlbs_with_address(kvm, sp->gfn,
5795 KVM_PAGES_PER_HPAGE(sp->role.level));
5796 else
5797 need_tlb_flush = 1;
5798
5799 goto restart;
5800 }
5801 }
5802
5803 return need_tlb_flush;
5804}
5805
5806void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
5807 const struct kvm_memory_slot *memslot)
5808{
5809
5810 spin_lock(&kvm->mmu_lock);
5811 slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot,
5812 kvm_mmu_zap_collapsible_spte, true);
5813 spin_unlock(&kvm->mmu_lock);
5814}
5815
5816void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
5817 struct kvm_memory_slot *memslot)
5818{
5819 bool flush;
5820
5821 spin_lock(&kvm->mmu_lock);
5822 flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false);
5823 spin_unlock(&kvm->mmu_lock);
5824
5825 lockdep_assert_held(&kvm->slots_lock);
5826
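	/*
	 * This function is only used for dirty logging, so flushing the TLB
	 * outside of mmu_lock is fine: any page dirtied before the flush is
	 * still recorded in the dirty bitmap.
	 */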
5833 if (flush)
5834 kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
5835 memslot->npages);
5836}
5837EXPORT_SYMBOL_GPL(kvm_mmu_slot_leaf_clear_dirty);
5838
5839void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
5840 struct kvm_memory_slot *memslot)
5841{
5842 bool flush;
5843
5844 spin_lock(&kvm->mmu_lock);
5845 flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect,
5846 false);
5847 spin_unlock(&kvm->mmu_lock);
5848
5849
5850 lockdep_assert_held(&kvm->slots_lock);
5851
5852 if (flush)
5853 kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
5854 memslot->npages);
5855}
5856EXPORT_SYMBOL_GPL(kvm_mmu_slot_largepage_remove_write_access);
5857
5858void kvm_mmu_slot_set_dirty(struct kvm *kvm,
5859 struct kvm_memory_slot *memslot)
5860{
5861 bool flush;
5862
5863 spin_lock(&kvm->mmu_lock);
5864 flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false);
5865 spin_unlock(&kvm->mmu_lock);
5866
5867 lockdep_assert_held(&kvm->slots_lock);
5868
5869
5870 if (flush)
5871 kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
5872 memslot->npages);
5873}
5874EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
5875
5876static void __kvm_mmu_zap_all(struct kvm *kvm, bool mmio_only)
5877{
5878 struct kvm_mmu_page *sp, *node;
5879 LIST_HEAD(invalid_list);
5880 int ign;
5881
5882 spin_lock(&kvm->mmu_lock);
5883restart:
5884 list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
5885 if (mmio_only && !sp->mmio_cached)
5886 continue;
5887 if (sp->role.invalid && sp->root_count)
5888 continue;
5889 if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign)) {
5890 WARN_ON_ONCE(mmio_only);
5891 goto restart;
5892 }
5893 if (cond_resched_lock(&kvm->mmu_lock))
5894 goto restart;
5895 }
5896
5897 kvm_mmu_commit_zap_page(kvm, &invalid_list);
5898 spin_unlock(&kvm->mmu_lock);
5899}
5900
5901void kvm_mmu_zap_all(struct kvm *kvm)
5902{
5903 return __kvm_mmu_zap_all(kvm, false);
5904}
5905
5906void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
5907{
5908 WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
5909
5910 gen &= MMIO_SPTE_GEN_MASK;
5911
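	/*
	 * Generations are advanced in multiples of the number of address
	 * spaces so that every address space sees a unique value; strip that
	 * modifier before checking whether the MMIO generation has wrapped.
	 */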
5919 gen &= ~((u64)KVM_ADDRESS_SPACE_NUM - 1);
5920
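	/*
	 * The MMIO generation wrapped (very rare); zap all pages with cached
	 * MMIO sptes.
	 */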
5925 if (unlikely(gen == 0)) {
5926 kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n");
5927 __kvm_mmu_zap_all(kvm, true);
5928 }
5929}
5930
5931static unsigned long
5932mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
5933{
5934 struct kvm *kvm;
5935 int nr_to_scan = sc->nr_to_scan;
5936 unsigned long freed = 0;
5937
5938 spin_lock(&kvm_lock);
5939
5940 list_for_each_entry(kvm, &vm_list, vm_list) {
5941 int idx;
5942 LIST_HEAD(invalid_list);
5943
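		/*
		 * Never scan more than sc->nr_to_scan VM instances; in
		 * practice at most one VM is shrunk per call, so this limit
		 * is rarely hit.
		 */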
5950 if (!nr_to_scan--)
5951 break;
5952
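		/*
		 * n_used_mmu_pages is read without holding mmu_lock, so a VM
		 * may occasionally be skipped; shrinking a VM that has barely
		 * started to populate its MMU is not worthwhile anyway.
		 */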
5958 if (!kvm->arch.n_used_mmu_pages)
5959 continue;
5960
5961 idx = srcu_read_lock(&kvm->srcu);
5962 spin_lock(&kvm->mmu_lock);
5963
5964 if (prepare_zap_oldest_mmu_page(kvm, &invalid_list))
5965 freed++;
5966 kvm_mmu_commit_zap_page(kvm, &invalid_list);
5967
5968 spin_unlock(&kvm->mmu_lock);
5969 srcu_read_unlock(&kvm->srcu, idx);
5970
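		/*
		 * Rotate this VM to the tail of the list so the next scan
		 * picks on a different one.
		 */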
5976 list_move_tail(&kvm->vm_list, &vm_list);
5977 break;
5978 }
5979
5980 spin_unlock(&kvm_lock);
5981 return freed;
5982}
5983
5984static unsigned long
5985mmu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
5986{
5987 return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
5988}
5989
5990static struct shrinker mmu_shrinker = {
5991 .count_objects = mmu_shrink_count,
5992 .scan_objects = mmu_shrink_scan,
5993 .seeks = DEFAULT_SEEKS * 10,
5994};
5995
5996static void mmu_destroy_caches(void)
5997{
5998 kmem_cache_destroy(pte_list_desc_cache);
5999 kmem_cache_destroy(mmu_page_header_cache);
6000}
6001
6002int kvm_mmu_module_init(void)
6003{
6004 int ret = -ENOMEM;
6005
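	/*
	 * The MMU role unions rely on the exact sizes checked below; a size
	 * change would silently break the role aliasing, so turn it into a
	 * build failure instead.
	 */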
6012 BUILD_BUG_ON(sizeof(union kvm_mmu_page_role) != sizeof(u32));
6013 BUILD_BUG_ON(sizeof(union kvm_mmu_extended_role) != sizeof(u32));
6014 BUILD_BUG_ON(sizeof(union kvm_mmu_role) != sizeof(u64));
6015
6016 kvm_mmu_reset_all_pte_masks();
6017
6018 pte_list_desc_cache = kmem_cache_create("pte_list_desc",
6019 sizeof(struct pte_list_desc),
6020 0, SLAB_ACCOUNT, NULL);
6021 if (!pte_list_desc_cache)
6022 goto out;
6023
6024 mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
6025 sizeof(struct kvm_mmu_page),
6026 0, SLAB_ACCOUNT, NULL);
6027 if (!mmu_page_header_cache)
6028 goto out;
6029
6030 if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL))
6031 goto out;
6032
6033 ret = register_shrinker(&mmu_shrinker);
6034 if (ret)
6035 goto out;
6036
6037 return 0;
6038
6039out:
6040 mmu_destroy_caches();
6041 return ret;
6042}
6043
6044
6045
6046
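/*
 * Compute the default MMU page budget for a VM from the total size of its
 * memslots.
 */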
6047unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm)
6048{
6049 unsigned long nr_mmu_pages;
6050 unsigned long nr_pages = 0;
6051 struct kvm_memslots *slots;
6052 struct kvm_memory_slot *memslot;
6053 int i;
6054
6055 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
6056 slots = __kvm_memslots(kvm, i);
6057
6058 kvm_for_each_memslot(memslot, slots)
6059 nr_pages += memslot->npages;
6060 }
6061
6062 nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
6063 nr_mmu_pages = max(nr_mmu_pages, KVM_MIN_ALLOC_MMU_PAGES);
6064
6065 return nr_mmu_pages;
6066}
6067
6068void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
6069{
6070 kvm_mmu_unload(vcpu);
6071 free_mmu_pages(vcpu);
6072 mmu_free_memory_caches(vcpu);
6073}
6074
6075void kvm_mmu_module_exit(void)
6076{
6077 mmu_destroy_caches();
6078 percpu_counter_destroy(&kvm_total_used_mmu_pages);
6079 unregister_shrinker(&mmu_shrinker);
6080 mmu_audit_disable();
6081}
6082