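/*
 * Xen MMU operations.
 *
 * Paravirtualized pagetable handling for Xen PV guests: the
 * phys-to-machine (p2m) translation table, batched pagetable updates
 * via multicalls, pagetable pinning/unpinning, TLB flush hypercalls
 * and construction of the initial kernel pagetable.
 */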
#include <linux/sched.h>
#include <linux/highmem.h>
#include <linux/debugfs.h>
#include <linux/bug.h>
#include <linux/module.h>

#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/fixmap.h>
#include <asm/mmu_context.h>
#include <asm/setup.h>
#include <asm/paravirt.h>
#include <asm/linkage.h>

#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>

#include <xen/page.h>
#include <xen/interface/xen.h>
#include <xen/interface/version.h>
#include <xen/hvc-console.h>

#include "multicalls.h"
#include "mmu.h"
#include "debugfs.h"

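/*
 * Statistics on MMU operations, exported via debugfs when
 * CONFIG_XEN_DEBUG_FS is enabled.  MMU_UPDATE_HISTO bounds the
 * histogram of how many updates get folded into a single mmu_update
 * multicall.
 */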
#define MMU_UPDATE_HISTO	30

#ifdef CONFIG_XEN_DEBUG_FS

static struct {
	u32 pgd_update;
	u32 pgd_update_pinned;
	u32 pgd_update_batched;

	u32 pud_update;
	u32 pud_update_pinned;
	u32 pud_update_batched;

	u32 pmd_update;
	u32 pmd_update_pinned;
	u32 pmd_update_batched;

	u32 pte_update;
	u32 pte_update_pinned;
	u32 pte_update_batched;

	u32 mmu_update;
	u32 mmu_update_extended;
	u32 mmu_update_histo[MMU_UPDATE_HISTO];

	u32 prot_commit;
	u32 prot_commit_batched;

	u32 set_pte_at;
	u32 set_pte_at_batched;
	u32 set_pte_at_pinned;
	u32 set_pte_at_current;
	u32 set_pte_at_kernel;
} mmu_stats;

static u8 zero_stats;

static inline void check_zero(void)
{
	if (unlikely(zero_stats)) {
		memset(&mmu_stats, 0, sizeof(mmu_stats));
		zero_stats = 0;
	}
}

#define ADD_STATS(elem, val)			\
	do { check_zero(); mmu_stats.elem += (val); } while (0)

#else

#define ADD_STATS(elem, val) do { (void)(val); } while (0)

#endif /* CONFIG_XEN_DEBUG_FS */

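/*
 * Identity map, in addition to the plain kernel map.  These pages
 * provide spare pte pages for xen_map_identity_early() when the
 * domain builder's pagetable does not already cover a pmd slot.
 */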
static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;

#ifdef CONFIG_X86_64
/* l3 pud for the userspace vsyscall mapping */
static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
#endif

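/*
 * Note on cr3 (pagetable base) tracking:
 *
 * xen_cr3 holds the logical cr3 value as last written by this cpu;
 * the hypercall that actually installs it may still be sitting in a
 * lazy multicall batch.  xen_current_cr3 is only updated from a
 * multicall callback once the NEW_BASEPTR hypercall has really been
 * issued, so it reflects what the vcpu is genuinely running on (it
 * may lag, but it is never set early).  Cross-cpu consumers such as
 * xen_drop_mm_ref() must look at xen_current_cr3.
 */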
DEFINE_PER_CPU(unsigned long, xen_cr3);
DEFINE_PER_CPU(unsigned long, xen_current_cr3);

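/*
 * Just beyond the highest usermode address, rounded up to a full
 * PGDIR entry.  pgd entries between USER_LIMIT and PAGE_OFFSET cover
 * the hypervisor hole and are skipped when walking/pinning
 * pagetables.
 */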
#define USER_LIMIT	((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)

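/*
 * Xen guests run on machine frames (MFNs) while the kernel thinks in
 * pseudo-physical frames (PFNs), so we keep a two-level
 * phys-to-machine (p2m) table:
 *
 *   p2m_top[pfn / P2M_ENTRIES_PER_PAGE][pfn % P2M_ENTRIES_PER_PAGE] == mfn
 *
 * Leaves that have never been populated all point at the shared
 * p2m_missing page, whose entries are INVALID_P2M_ENTRY (~0UL).
 * p2m_top_mfn and p2m_top_mfn_list mirror the structure in terms of
 * MFNs so the hypervisor and toolstack can follow it.
 */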
#define P2M_ENTRIES_PER_PAGE	(PAGE_SIZE / sizeof(unsigned long))
#define TOP_ENTRIES		(MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE)

static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] __page_aligned_data =
		{ [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL };

static unsigned long *p2m_top[TOP_ENTRIES] __page_aligned_data =
		{ [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] };

static unsigned long p2m_top_mfn[TOP_ENTRIES] __page_aligned_bss;

static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE]
	__page_aligned_bss;

static inline unsigned p2m_top_index(unsigned long pfn)
{
	BUG_ON(pfn >= MAX_DOMAIN_PAGES);
	return pfn / P2M_ENTRIES_PER_PAGE;
}

static inline unsigned p2m_index(unsigned long pfn)
{
	return pfn % P2M_ENTRIES_PER_PAGE;
}

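/*
 * Build and publish the MFN view of the p2m table.  The hypervisor
 * and the toolstack walk pfn_to_mfn_frame_list_list (e.g. during
 * save/restore), so it has to be expressed entirely in machine
 * frames rather than kernel virtual addresses.
 */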
static void __init xen_build_mfn_list_list(void)
{
	unsigned pfn, idx;

	for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) {
		unsigned topidx = p2m_top_index(pfn);

		p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]);
	}

	for (idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) {
		unsigned topidx = idx * P2M_ENTRIES_PER_PAGE;
		p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]);
	}
}

void xen_setup_mfn_list_list(void)
{
	BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);

	HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
		virt_to_mfn(p2m_top_mfn_list);
	HYPERVISOR_shared_info->arch.max_pfn = xen_start_info->nr_pages;
}

void __init xen_build_dynamic_phys_to_machine(void)
{
	unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
	unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
	unsigned pfn;

	for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) {
		unsigned topidx = p2m_top_index(pfn);

		p2m_top[topidx] = &mfn_list[pfn];
	}

	xen_build_mfn_list_list();
}

unsigned long get_phys_to_machine(unsigned long pfn)
{
	unsigned topidx, idx;

	if (unlikely(pfn >= MAX_DOMAIN_PAGES))
		return INVALID_P2M_ENTRY;

	topidx = p2m_top_index(pfn);
	idx = p2m_index(pfn);
	return p2m_top[topidx][idx];
}
EXPORT_SYMBOL_GPL(get_phys_to_machine);

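/*
 * Install a fresh leaf page for the given pfn's slot in p2m_top.  The
 * page is first filled with INVALID_P2M_ENTRY and then swapped in
 * with cmpxchg() so that concurrent callers cannot both replace the
 * shared p2m_missing page; the loser's page is simply freed by the
 * caller.  Returns true if this caller's page was installed.
 */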
bool install_p2mtop_page(unsigned long pfn, unsigned long *p)
{
	unsigned topidx = p2m_top_index(pfn);
	unsigned long **pfnp, *mfnp;
	unsigned i;

	pfnp = &p2m_top[topidx];
	mfnp = &p2m_top_mfn[topidx];

	for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++)
		p[i] = INVALID_P2M_ENTRY;

	if (cmpxchg(pfnp, p2m_missing, p) == p2m_missing) {
		*mfnp = virt_to_mfn(p);
		return true;
	}

	return false;
}

static void alloc_p2m(unsigned long pfn)
{
	unsigned long *p;

	p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL);
	BUG_ON(p == NULL);

	if (!install_p2mtop_page(pfn, p))
		free_page((unsigned long)p);
}

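/*
 * Try to install the pfn->mfn mapping.  Fails (returns false) only
 * when a real translation is requested for a pfn whose leaf is still
 * the shared p2m_missing page; set_phys_to_machine() then allocates
 * a leaf via alloc_p2m() and retries.
 */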
bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
{
	unsigned topidx, idx;

	if (unlikely(pfn >= MAX_DOMAIN_PAGES)) {
		BUG_ON(mfn != INVALID_P2M_ENTRY);
		return true;
	}

	topidx = p2m_top_index(pfn);
	if (p2m_top[topidx] == p2m_missing) {
		if (mfn == INVALID_P2M_ENTRY)
			return true;
		return false;
	}

	idx = p2m_index(pfn);
	p2m_top[topidx][idx] = mfn;

	return true;
}

void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
{
	if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
		BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
		return;
	}

	if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
		alloc_p2m(pfn);

		if (!__set_phys_to_machine(pfn, mfn))
			BUG();
	}
}

unsigned long arbitrary_virt_to_mfn(void *vaddr)
{
	xmaddr_t maddr = arbitrary_virt_to_machine(vaddr);

	return PFN_DOWN(maddr.maddr);
}

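/*
 * Translate an arbitrary kernel virtual address into a machine
 * address.  Addresses in the direct map can be converted with simple
 * arithmetic; anything else (vmalloc, fixmap, ...) needs an explicit
 * pagetable walk via lookup_address().
 */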
xmaddr_t arbitrary_virt_to_machine(void *vaddr)
{
	unsigned long address = (unsigned long)vaddr;
	unsigned int level;
	pte_t *pte;
	unsigned offset;

	/* The lowmem/direct-mapped case can be handled arithmetically. */
	if (virt_addr_valid(vaddr))
		return virt_to_machine(vaddr);

	/* Otherwise do a (slower) full pagetable walk. */
	pte = lookup_address(address, &level);
	BUG_ON(pte == NULL);
	offset = address & ~PAGE_MASK;
	return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
}

void make_lowmem_page_readonly(void *vaddr)
{
	pte_t *pte, ptev;
	unsigned long address = (unsigned long)vaddr;
	unsigned int level;

	pte = lookup_address(address, &level);
	BUG_ON(pte == NULL);

	ptev = pte_wrprotect(*pte);

	if (HYPERVISOR_update_va_mapping(address, ptev, 0))
		BUG();
}

void make_lowmem_page_readwrite(void *vaddr)
{
	pte_t *pte, ptev;
	unsigned long address = (unsigned long)vaddr;
	unsigned int level;

	pte = lookup_address(address, &level);
	BUG_ON(pte == NULL);

	ptev = pte_mkwrite(*pte);

	if (HYPERVISOR_update_va_mapping(address, ptev, 0))
		BUG();
}

static bool xen_page_pinned(void *ptr)
{
	struct page *page = virt_to_page(ptr);

	return PagePinned(page);
}

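/*
 * Queue an mmu_update for the current multicall batch.  If the last
 * pending multicall is already an mmu_update hypercall we just append
 * another struct mmu_update to its argument array and bump its count;
 * otherwise a new MULTI_mmu_update entry is started.  This is how
 * back-to-back pte/pmd/pud updates get folded into a single hypercall
 * under PARAVIRT_LAZY_MMU.
 */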
static void xen_extend_mmu_update(const struct mmu_update *update)
{
	struct multicall_space mcs;
	struct mmu_update *u;

	mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));

	if (mcs.mc != NULL) {
		ADD_STATS(mmu_update_extended, 1);
		ADD_STATS(mmu_update_histo[mcs.mc->args[1]], -1);

		mcs.mc->args[1]++;

		if (mcs.mc->args[1] < MMU_UPDATE_HISTO)
			ADD_STATS(mmu_update_histo[mcs.mc->args[1]], 1);
		else
			ADD_STATS(mmu_update_histo[0], 1);
	} else {
		ADD_STATS(mmu_update, 1);
		mcs = __xen_mc_entry(sizeof(*u));
		MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
		ADD_STATS(mmu_update_histo[1], 1);
	}

	u = mcs.args;
	*u = *update;
}

void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
{
	struct mmu_update u;

	preempt_disable();

	xen_mc_batch();

	/* ptr may not be in the linear map during early pagetable
	   setup, hence arbitrary_virt_to_machine() */
	u.ptr = arbitrary_virt_to_machine(ptr).maddr;
	u.val = pmd_val_ma(val);
	xen_extend_mmu_update(&u);

	ADD_STATS(pmd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);

	xen_mc_issue(PARAVIRT_LAZY_MMU);

	preempt_enable();
}

void xen_set_pmd(pmd_t *ptr, pmd_t val)
{
	ADD_STATS(pmd_update, 1);

	/* If the page is not pinned it is not a live pagetable as far
	   as Xen is concerned, so a plain store is sufficient. */
	if (!xen_page_pinned(ptr)) {
		*ptr = val;
		return;
	}

	ADD_STATS(pmd_update_pinned, 1);

	xen_set_pmd_hyper(ptr, val);
}

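/*
 * Associate a virtual address with a given machine frame and
 * protection flags for that frame.
 */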
void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
{
	set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
}

void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
		    pte_t *ptep, pte_t pteval)
{
	ADD_STATS(set_pte_at, 1);
	ADD_STATS(set_pte_at_current, mm == current->mm);
	ADD_STATS(set_pte_at_kernel, mm == &init_mm);

	if (mm == current->mm || mm == &init_mm) {
		if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
			struct multicall_space mcs;
			mcs = xen_mc_entry(0);

			MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
			ADD_STATS(set_pte_at_batched, 1);
			xen_mc_issue(PARAVIRT_LAZY_MMU);
			goto out;
		} else
			if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0)
				goto out;
	}
	xen_set_pte(ptep, pteval);

out:	return;
}

pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
				 unsigned long addr, pte_t *ptep)
{
	/* Just return the pte as-is; all the work happens on commit. */
	return *ptep;
}

void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
				 pte_t *ptep, pte_t pte)
{
	struct mmu_update u;

	xen_mc_batch();

	u.ptr = arbitrary_virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
	u.val = pte_val_ma(pte);
	xen_extend_mmu_update(&u);

	ADD_STATS(prot_commit, 1);
	ADD_STATS(prot_commit_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);

	xen_mc_issue(PARAVIRT_LAZY_MMU);
}

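/*
 * Pagetable entries are stored by the hypervisor in machine-frame
 * terms, but the rest of the kernel expects pseudo-physical frame
 * numbers.  The helpers below translate the frame-number field of a
 * pte/pmd/pud/pgd value in each direction, leaving the flag bits
 * untouched; they back the pv_mmu_ops pte_val/make_pte (etc.) hooks.
 */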
static pteval_t pte_mfn_to_pfn(pteval_t val)
{
	if (val & _PAGE_PRESENT) {
		unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
		pteval_t flags = val & PTE_FLAGS_MASK;
		val = ((pteval_t)mfn_to_pfn(mfn) << PAGE_SHIFT) | flags;
	}

	return val;
}

static pteval_t pte_pfn_to_mfn(pteval_t val)
{
	if (val & _PAGE_PRESENT) {
		unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
		pteval_t flags = val & PTE_FLAGS_MASK;
		val = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags;
	}

	return val;
}

pteval_t xen_pte_val(pte_t pte)
{
	return pte_mfn_to_pfn(pte.pte);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);

pgdval_t xen_pgd_val(pgd_t pgd)
{
	return pte_mfn_to_pfn(pgd.pgd);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);

pte_t xen_make_pte(pteval_t pte)
{
	pte = pte_pfn_to_mfn(pte);
	return native_make_pte(pte);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);

pgd_t xen_make_pgd(pgdval_t pgd)
{
	pgd = pte_pfn_to_mfn(pgd);
	return native_make_pgd(pgd);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd);

pmdval_t xen_pmd_val(pmd_t pmd)
{
	return pte_mfn_to_pfn(pmd.pmd);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val);

void xen_set_pud_hyper(pud_t *ptr, pud_t val)
{
	struct mmu_update u;

	preempt_disable();

	xen_mc_batch();

	u.ptr = arbitrary_virt_to_machine(ptr).maddr;
	u.val = pud_val_ma(val);
	xen_extend_mmu_update(&u);

	ADD_STATS(pud_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);

	xen_mc_issue(PARAVIRT_LAZY_MMU);

	preempt_enable();
}

void xen_set_pud(pud_t *ptr, pud_t val)
{
	ADD_STATS(pud_update, 1);

	if (!xen_page_pinned(ptr)) {
		*ptr = val;
		return;
	}

	ADD_STATS(pud_update_pinned, 1);

	xen_set_pud_hyper(ptr, val);
}

void xen_set_pte(pte_t *ptep, pte_t pte)
{
	ADD_STATS(pte_update, 1);
	ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);

#ifdef CONFIG_X86_PAE
	ptep->pte_high = pte.pte_high;
	smp_wmb();
	ptep->pte_low = pte.pte_low;
#else
	*ptep = pte;
#endif
}

#ifdef CONFIG_X86_PAE
void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
{
	set_64bit((u64 *)ptep, native_pte_val(pte));
}

void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
	ptep->pte_low = 0;
	smp_wmb();
	ptep->pte_high = 0;
}

void xen_pmd_clear(pmd_t *pmdp)
{
	set_pmd(pmdp, __pmd(0));
}
#endif

pmd_t xen_make_pmd(pmdval_t pmd)
{
	pmd = pte_pfn_to_mfn(pmd);
	return native_make_pmd(pmd);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);

#if PAGETABLE_LEVELS == 4
pudval_t xen_pud_val(pud_t pud)
{
	return pte_mfn_to_pfn(pud.pud);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val);

pud_t xen_make_pud(pudval_t pud)
{
	pud = pte_pfn_to_mfn(pud);

	return native_make_pud(pud);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud);

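/*
 * On 64-bit each process has two pagetable roots: the kernel pgd and
 * a separate user pgd which Xen switches to on return to userspace.
 * The user pgd page is hung off page->private of the kernel pgd page
 * (see xen_pgd_alloc()).  Given a pointer to a kernel pgd entry in
 * the user range, return the corresponding entry in the user pgd, or
 * NULL if there is none.
 */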
pgd_t *xen_get_user_pgd(pgd_t *pgd)
{
	pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
	unsigned offset = pgd - pgd_page;
	pgd_t *user_ptr = NULL;

	if (offset < pgd_index(USER_LIMIT)) {
		struct page *page = virt_to_page(pgd_page);
		user_ptr = (pgd_t *)page->private;
		if (user_ptr)
			user_ptr += offset;
	}

	return user_ptr;
}

static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
{
	struct mmu_update u;

	u.ptr = virt_to_machine(ptr).maddr;
	u.val = pgd_val_ma(val);
	xen_extend_mmu_update(&u);
}

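/*
 * Raw hypercall-based set_pgd for early boot, before the normal
 * pinning bookkeeping (struct page flags) is available; the update
 * always goes through the hypervisor.
 */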
void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
{
	preempt_disable();

	xen_mc_batch();

	__xen_set_pgd_hyper(ptr, val);

	xen_mc_issue(PARAVIRT_LAZY_MMU);

	preempt_enable();
}

void xen_set_pgd(pgd_t *ptr, pgd_t val)
{
	pgd_t *user_ptr = xen_get_user_pgd(ptr);

	ADD_STATS(pgd_update, 1);

	/* If the page is not pinned it is not a live pagetable yet,
	   so a direct store is sufficient. */
	if (!xen_page_pinned(ptr)) {
		*ptr = val;
		if (user_ptr) {
			WARN_ON(xen_page_pinned(user_ptr));
			*user_ptr = val;
		}
		return;
	}

	ADD_STATS(pgd_update_pinned, 1);
	ADD_STATS(pgd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);

	/* If it's pinned, we can at least batch the kernel and user
	   updates together. */
	xen_mc_batch();

	__xen_set_pgd_hyper(ptr, val);
	if (user_ptr)
		__xen_set_pgd_hyper(user_ptr, val);

	xen_mc_issue(PARAVIRT_LAZY_MMU);
}
#endif	/* PAGETABLE_LEVELS == 4 */

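/*
 * (Un)pinning operates on a whole pagetable tree at once.
 * __xen_pgd_walk() visits every pagetable page reachable from @pgd up
 * to @limit, skipping the hypervisor hole between USER_LIMIT and
 * PAGE_OFFSET, and applies @func (with PT_PGD/PT_PUD/PT_PMD/PT_PTE)
 * to each page.  The callback return values are OR'ed together;
 * callers use a non-zero result to mean "a flush is needed before the
 * operation can complete".  The pgd page itself is processed last.
 */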
static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
			  int (*func)(struct mm_struct *mm, struct page *,
				      enum pt_level),
			  unsigned long limit)
{
	int flush = 0;
	unsigned hole_low, hole_high;
	unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
	unsigned pgdidx, pudidx, pmdidx;

	/* The limit is the last byte to be touched */
	limit--;
	BUG_ON(limit >= FIXADDR_TOP);

	if (xen_feature(XENFEAT_auto_translated_physmap))
		return 0;

	/*
	 * 64-bit has a great big hole in the middle of the address
	 * space, which contains the Xen mappings.  On 32-bit these
	 * indices end up making a zero-sized hole, so this is a no-op.
	 */
	hole_low = pgd_index(USER_LIMIT);
	hole_high = pgd_index(PAGE_OFFSET);

	pgdidx_limit = pgd_index(limit);
#if PTRS_PER_PUD > 1
	pudidx_limit = pud_index(limit);
#else
	pudidx_limit = 0;
#endif
#if PTRS_PER_PMD > 1
	pmdidx_limit = pmd_index(limit);
#else
	pmdidx_limit = 0;
#endif

	for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
		pud_t *pud;

		if (pgdidx >= hole_low && pgdidx < hole_high)
			continue;

		if (!pgd_val(pgd[pgdidx]))
			continue;

		pud = pud_offset(&pgd[pgdidx], 0);

		if (PTRS_PER_PUD > 1)		/* not folded */
			flush |= (*func)(mm, virt_to_page(pud), PT_PUD);

		for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
			pmd_t *pmd;

			if (pgdidx == pgdidx_limit &&
			    pudidx > pudidx_limit)
				goto out;

			if (pud_none(pud[pudidx]))
				continue;

			pmd = pmd_offset(&pud[pudidx], 0);

			if (PTRS_PER_PMD > 1)	/* not folded */
				flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);

			for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
				struct page *pte;

				if (pgdidx == pgdidx_limit &&
				    pudidx == pudidx_limit &&
				    pmdidx > pmdidx_limit)
					goto out;

				if (pmd_none(pmd[pmdidx]))
					continue;

				pte = pmd_page(pmd[pmdidx]);
				flush |= (*func)(mm, pte, PT_PTE);
			}
		}
	}

out:
	/* Do the top level last, so that the callbacks can use it as
	   a cue to do final things like tlb flushes. */
	flush |= (*func)(mm, virt_to_page(pgd), PT_PGD);

	return flush;
}

static int xen_pgd_walk(struct mm_struct *mm,
			int (*func)(struct mm_struct *mm, struct page *,
				    enum pt_level),
			unsigned long limit)
{
	return __xen_pgd_walk(mm, mm->pgd, func, limit);
}

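/*
 * If we're using split pte locks, take the page's lock and return a
 * pointer to it so it can be dropped once the corresponding multicall
 * has been issued.  Otherwise return NULL.
 */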
static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
{
	spinlock_t *ptl = NULL;

#if USE_SPLIT_PTLOCKS
	ptl = __pte_lockptr(page);
	spin_lock_nest_lock(ptl, &mm->page_table_lock);
#endif

	return ptl;
}

static void xen_pte_unlock(void *v)
{
	spinlock_t *ptl = v;
	spin_unlock(ptl);
}

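/*
 * Queue a pin or unpin mmuext operation (MMUEXT_PIN_Ln_TABLE or
 * MMUEXT_UNPIN_TABLE) for the frame backing @pfn on the current
 * multicall batch.
 */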
static void xen_do_pin(unsigned level, unsigned long pfn)
{
	struct mmuext_op *op;
	struct multicall_space mcs;

	mcs = __xen_mc_entry(sizeof(*op));
	op = mcs.args;
	op->cmd = level;
	op->arg1.mfn = pfn_to_mfn(pfn);
	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
}

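/*
 * Pin one pagetable page: remap it read-only and, for a pte page with
 * split pte locks, pin it with an explicit L1 pin while holding the
 * page's pte lock.  Returns non-zero if the caller still needs to
 * flush (highmem pages cannot be remapped here and are handled after
 * a kmap_flush_unused()).
 */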
static int xen_pin_page(struct mm_struct *mm, struct page *page,
			enum pt_level level)
{
	unsigned pgfl = TestSetPagePinned(page);
	int flush;

	if (pgfl)
		flush = 0;		/* already pinned */
	else if (PageHighMem(page))
		/* Kmaps need flushing if we found an unpinned highpage */
		flush = 1;
	else {
		void *pt = lowmem_page_address(page);
		unsigned long pfn = page_to_pfn(page);
		struct multicall_space mcs = __xen_mc_entry(0);
		spinlock_t *ptl;

		flush = 0;

		/*
		 * A pte page must not be updated by anyone else between
		 * the moment it is remapped read-only and the moment the
		 * hypervisor actually pins it, otherwise the plain write
		 * would fault (read-only but not yet a recognised
		 * pagetable).  With split pte locks we therefore hold the
		 * page's pte lock across the remap+pin pair and release
		 * it from a multicall callback once the batch has been
		 * issued.
		 */
		ptl = NULL;
		if (level == PT_PTE)
			ptl = xen_pte_lock(page, mm);

		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
					pfn_pte(pfn, PAGE_KERNEL_RO),
					level == PT_PGD ? UVMF_TLB_FLUSH : 0);

		if (ptl) {
			xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);

			/* Defer the unlock until the multicall has
			   actually been issued. */
			xen_mc_callback(xen_pte_unlock, ptl);
		}
	}

	return flush;
}

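/*
 * Pin an entire pagetable tree.  The pagetable must be quiescent (not
 * yet in use, or being torn down) while it is being (un)pinned.
 * Pinning remaps every pagetable page read-only and then asks the
 * hypervisor to validate and pin the root, after which all further
 * updates have to go through hypercalls.
 */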
static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
{
	vm_unmap_aliases();

	xen_mc_batch();

	if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) {
		/* Flush the partial batch and any lingering kmaps of
		   highmem pte pages before pinning the root. */
		xen_mc_issue(0);

		kmap_flush_unused();

		xen_mc_batch();
	}

#ifdef CONFIG_X86_64
	{
		pgd_t *user_pgd = xen_get_user_pgd(pgd);

		xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));

		if (user_pgd) {
			xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD);
			xen_do_pin(MMUEXT_PIN_L4_TABLE,
				   PFN_DOWN(__pa(user_pgd)));
		}
	}
#else
#ifdef CONFIG_X86_PAE
	/* Need to make sure unshared kernel PMD is pinnable */
	xen_pin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
		     PT_PMD);
#endif
	xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
#endif
	xen_mc_issue(0);
}

static void xen_pgd_pin(struct mm_struct *mm)
{
	__xen_pgd_pin(mm, mm->pgd);
}

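/*
 * On save, every pagetable needs to be pinned so the save/restore
 * machinery can recognise and canonicalise it.  Pin any pgd that is
 * not already pinned and mark it SavePinned so that
 * xen_mm_unpin_all() can restore the previous state on resume.
 */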
void xen_mm_pin_all(void)
{
	unsigned long flags;
	struct page *page;

	spin_lock_irqsave(&pgd_lock, flags);

	list_for_each_entry(page, &pgd_list, lru) {
		if (!PagePinned(page)) {
			__xen_pgd_pin(&init_mm, (pgd_t *)page_address(page));
			SetPageSavePinned(page);
		}
	}

	spin_unlock_irqrestore(&pgd_lock, flags);
}

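/*
 * The init_mm pagetable is effectively pinned as soon as it is
 * created, but that happens before there are struct pages to record
 * it in, so replay the bookkeeping here once the memory map is up.
 */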
static __init int xen_mark_pinned(struct mm_struct *mm, struct page *page,
				  enum pt_level level)
{
	SetPagePinned(page);
	return 0;
}

static void __init xen_mark_init_mm_pinned(void)
{
	xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
}

static int xen_unpin_page(struct mm_struct *mm, struct page *page,
			  enum pt_level level)
{
	unsigned pgfl = TestClearPagePinned(page);

	if (pgfl && !PageHighMem(page)) {
		void *pt = lowmem_page_address(page);
		unsigned long pfn = page_to_pfn(page);
		spinlock_t *ptl = NULL;
		struct multicall_space mcs;

		/*
		 * The converse of xen_pin_page(): with split pte locks,
		 * hold the page's pte lock while it is unpinned but
		 * still read-only, so nobody updates it in that
		 * in-between state.
		 */
		if (level == PT_PTE) {
			ptl = xen_pte_lock(page, mm);

			if (ptl)
				xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
		}

		mcs = __xen_mc_entry(0);

		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
					pfn_pte(pfn, PAGE_KERNEL),
					level == PT_PGD ? UVMF_TLB_FLUSH : 0);

		if (ptl) {
			/* unlock when the batch is actually issued */
			xen_mc_callback(xen_pte_unlock, ptl);
		}
	}

	return 0;
}

static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
{
	xen_mc_batch();

	xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));

#ifdef CONFIG_X86_64
	{
		pgd_t *user_pgd = xen_get_user_pgd(pgd);

		if (user_pgd) {
			xen_do_pin(MMUEXT_UNPIN_TABLE,
				   PFN_DOWN(__pa(user_pgd)));
			xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD);
		}
	}
#endif

#ifdef CONFIG_X86_PAE
	/* Need to make sure unshared kernel PMD is unpinned */
	xen_unpin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
		       PT_PMD);
#endif

	__xen_pgd_walk(mm, pgd, xen_unpin_page, USER_LIMIT);

	xen_mc_issue(0);
}

static void xen_pgd_unpin(struct mm_struct *mm)
{
	__xen_pgd_unpin(mm, mm->pgd);
}

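/*
 * On resume, undo the extra pinning done by xen_mm_pin_all(): only
 * pagetables we marked SavePinned go back to the unpinned state.
 */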
void xen_mm_unpin_all(void)
{
	unsigned long flags;
	struct page *page;

	spin_lock_irqsave(&pgd_lock, flags);

	list_for_each_entry(page, &pgd_list, lru) {
		if (PageSavePinned(page)) {
			BUG_ON(!PagePinned(page));
			__xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page));
			ClearPageSavePinned(page);
		}
	}

	spin_unlock_irqrestore(&pgd_lock, flags);
}

void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
{
	spin_lock(&next->page_table_lock);
	xen_pgd_pin(next);
	spin_unlock(&next->page_table_lock);
}

void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
{
	spin_lock(&mm->page_table_lock);
	xen_pgd_pin(mm);
	spin_unlock(&mm->page_table_lock);
}

#ifdef CONFIG_SMP

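/*
 * Another cpu may still have its %cr3 pointing at the pagetable, so
 * we need to repoint it somewhere else before we can unpin it.
 */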
static void drop_other_mm_ref(void *info)
{
	struct mm_struct *mm = info;
	struct mm_struct *active_mm;

	active_mm = percpu_read(cpu_tlbstate.active_mm);

	if (active_mm == mm)
		leave_mm(smp_processor_id());

	/* If this cpu still has a stale cr3 reference, then make sure
	   it has been flushed. */
	if (percpu_read(xen_current_cr3) == __pa(mm->pgd))
		load_cr3(swapper_pg_dir);
}

static void xen_drop_mm_ref(struct mm_struct *mm)
{
	cpumask_var_t mask;
	unsigned cpu;

	if (current->active_mm == mm) {
		if (current->mm == mm)
			load_cr3(swapper_pg_dir);
		else
			leave_mm(smp_processor_id());
	}

	/* Get the "official" set of cpus referring to our pagetable. */
	if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
		for_each_online_cpu(cpu) {
			if (!cpumask_test_cpu(cpu, mm_cpumask(mm))
			    && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
				continue;
			smp_call_function_single(cpu, drop_other_mm_ref, mm, 1);
		}
		return;
	}
	cpumask_copy(mask, mm_cpumask(mm));

	/*
	 * It's possible that a vcpu may have a stale reference to our
	 * cr3, because its update may be delayed by a lazy batch, so
	 * also check xen_current_cr3 for each cpu.
	 */
	for_each_online_cpu(cpu) {
		if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
			cpumask_set_cpu(cpu, mask);
	}

	if (!cpumask_empty(mask))
		smp_call_function_many(mask, drop_other_mm_ref, mm, 1);
	free_cpumask_var(mask);
}
#else
static void xen_drop_mm_ref(struct mm_struct *mm)
{
	if (current->active_mm == mm)
		load_cr3(swapper_pg_dir);
}
#endif

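/*
 * While a process runs, Xen pins its pagetables, which means the
 * hypervisor forces them to be read-only and controls all updates to
 * them, so every pagetable update has to go via the hypervisor, which
 * is moderately expensive.
 *
 * Since we're pulling the pagetable down, switch to init_mm, unpin
 * the old pagetable and mark it all read-write again so further
 * operations on it are simple memory accesses.
 *
 * The only subtle point is that another cpu may still be using the
 * pagetable because of lazy tlb flushing, so all cpus have to be
 * switched off it before it can be unpinned.
 */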
void xen_exit_mmap(struct mm_struct *mm)
{
	get_cpu();		/* make sure we don't move around */
	xen_drop_mm_ref(mm);
	put_cpu();

	spin_lock(&mm->page_table_lock);

	/* pgd may not be pinned in the error exit path of execve */
	if (xen_page_pinned(mm->pgd))
		xen_pgd_unpin(mm);

	spin_unlock(&mm->page_table_lock);
}

static __init void xen_pagetable_setup_start(pgd_t *base)
{
}

static void xen_post_allocator_init(void);

static __init void xen_pagetable_setup_done(pgd_t *base)
{
	xen_setup_shared_info();
	xen_post_allocator_init();
}

static void xen_write_cr2(unsigned long cr2)
{
	percpu_read(xen_vcpu)->arch.cr2 = cr2;
}

static unsigned long xen_read_cr2(void)
{
	return percpu_read(xen_vcpu)->arch.cr2;
}

unsigned long xen_read_cr2_direct(void)
{
	return percpu_read(xen_vcpu_info.arch.cr2);
}

static void xen_flush_tlb(void)
{
	struct mmuext_op *op;
	struct multicall_space mcs;

	preempt_disable();

	mcs = xen_mc_entry(sizeof(*op));

	op = mcs.args;
	op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);

	xen_mc_issue(PARAVIRT_LAZY_MMU);

	preempt_enable();
}

static void xen_flush_tlb_single(unsigned long addr)
{
	struct mmuext_op *op;
	struct multicall_space mcs;

	preempt_disable();

	mcs = xen_mc_entry(sizeof(*op));
	op = mcs.args;
	op->cmd = MMUEXT_INVLPG_LOCAL;
	op->arg1.linear_addr = addr & PAGE_MASK;
	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);

	xen_mc_issue(PARAVIRT_LAZY_MMU);

	preempt_enable();
}

static void xen_flush_tlb_others(const struct cpumask *cpus,
				 struct mm_struct *mm, unsigned long va)
{
	struct {
		struct mmuext_op op;
		DECLARE_BITMAP(mask, NR_CPUS);
	} *args;
	struct multicall_space mcs;

	if (cpumask_empty(cpus))
		return;		/* nothing to do */

	mcs = xen_mc_entry(sizeof(*args));
	args = mcs.args;
	args->op.arg2.vcpumask = to_cpumask(args->mask);

	/* Remove us, and any offline cpus. */
	cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
	cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));

	if (va == TLB_FLUSH_ALL) {
		args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
	} else {
		args->op.cmd = MMUEXT_INVLPG_MULTI;
		args->op.arg1.linear_addr = va;
	}

	MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);

	xen_mc_issue(PARAVIRT_LAZY_MMU);
}

static unsigned long xen_read_cr3(void)
{
	return percpu_read(xen_cr3);
}

static void set_current_cr3(void *v)
{
	percpu_write(xen_current_cr3, (unsigned long)v);
}

static void __xen_write_cr3(bool kernel, unsigned long cr3)
{
	struct mmuext_op *op;
	struct multicall_space mcs;
	unsigned long mfn;

	if (cr3)
		mfn = pfn_to_mfn(PFN_DOWN(cr3));
	else
		mfn = 0;

	WARN_ON(mfn == 0 && kernel);

	mcs = __xen_mc_entry(sizeof(*op));

	op = mcs.args;
	op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
	op->arg1.mfn = mfn;

	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);

	if (kernel) {
		percpu_write(xen_cr3, cr3);

		/* Update xen_current_cr3 once the batch has actually
		   been submitted. */
		xen_mc_callback(set_current_cr3, (void *)cr3);
	}
}

static void xen_write_cr3(unsigned long cr3)
{
	BUG_ON(preemptible());

	xen_mc_batch();		/* disables interrupts */

	/* Update while interrupts are disabled, so it is atomic with
	   respect to ipis */
	percpu_write(xen_cr3, cr3);

	__xen_write_cr3(true, cr3);

#ifdef CONFIG_X86_64
	{
		pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
		if (user_pgd)
			__xen_write_cr3(false, __pa(user_pgd));
		else
			__xen_write_cr3(false, 0);
	}
#endif

	xen_mc_issue(PARAVIRT_LAZY_CPU);	/* interrupts restored */
}

static int xen_pgd_alloc(struct mm_struct *mm)
{
	pgd_t *pgd = mm->pgd;
	int ret = 0;

	BUG_ON(PagePinned(virt_to_page(pgd)));

#ifdef CONFIG_X86_64
	{
		struct page *page = virt_to_page(pgd);
		pgd_t *user_pgd;

		BUG_ON(page->private != 0);

		ret = -ENOMEM;

		user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
		page->private = (unsigned long)user_pgd;

		if (user_pgd != NULL) {
			user_pgd[pgd_index(VSYSCALL_START)] =
				__pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
			ret = 0;
		}

		BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
	}
#endif

	return ret;
}

static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
#ifdef CONFIG_X86_64
	pgd_t *user_pgd = xen_get_user_pgd(pgd);

	if (user_pgd)
		free_page((unsigned long)user_pgd);
#endif
}

#ifdef CONFIG_HIGHPTE
static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
{
	pgprot_t prot = PAGE_KERNEL;

	if (PagePinned(page))
		prot = PAGE_KERNEL_RO;

	if (0 && PageHighMem(page))
		printk("mapping highpte %lx type %d prot %s\n",
		       page_to_pfn(page), type,
		       (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ");

	return kmap_atomic_prot(page, type, prot);
}
#endif

#ifdef CONFIG_X86_32
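/*
 * Init-time set_pte while constructing the initial pagetables, which
 * must not allow RO pagetable pages to be remapped RW: mask_rw_pte()
 * keeps a pte that is currently read-only (i.e. a live pagetable
 * page) read-only even if the new value asks for RW.
 */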
static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
{
	if (pte_val_ma(*ptep) & _PAGE_PRESENT)
		pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
			       pte_val_ma(pte));

	return pte;
}

static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
{
	pte = mask_rw_pte(ptep, pte);

	xen_set_pte(ptep, pte);
}
#endif

static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
{
	struct mmuext_op op;
	op.cmd = cmd;
	op.arg1.mfn = pfn_to_mfn(pfn);
	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
		BUG();
}

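/*
 * Early in boot, while setting up the initial pagetable, assume
 * everything is pinned: freshly allocated pte/pmd pages are made
 * read-only (and pte pages get an explicit L1 pin) straight away,
 * and the matching early release hooks undo this.  Once the page
 * allocator is running, xen_post_allocator_init() switches these
 * hooks over to the pinned-aware xen_alloc_pte()/xen_release_pte()
 * variants below.
 */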
static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
{
#ifdef CONFIG_FLATMEM
	BUG_ON(mem_map);
#endif
	make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
	pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
}

static __init void xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn)
{
#ifdef CONFIG_FLATMEM
	BUG_ON(mem_map);
#endif
	make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
}

static __init void xen_release_pte_init(unsigned long pfn)
{
	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
	make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
}

static __init void xen_release_pmd_init(unsigned long pfn)
{
	make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
}

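/*
 * This needs to make sure the new pte page is pinned iff it is being
 * attached to a pinned pagetable.
 */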
static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level)
{
	struct page *page = pfn_to_page(pfn);

	if (PagePinned(virt_to_page(mm->pgd))) {
		SetPagePinned(page);

		vm_unmap_aliases();
		if (!PageHighMem(page)) {
			make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
			if (level == PT_PTE && USE_SPLIT_PTLOCKS)
				pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
		} else {
			/* make sure there are no stray mappings of
			   this page */
			kmap_flush_unused();
		}
	}
}

static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
{
	xen_alloc_ptpage(mm, pfn, PT_PTE);
}

static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
{
	xen_alloc_ptpage(mm, pfn, PT_PMD);
}

static void xen_release_ptpage(unsigned long pfn, unsigned level)
{
	struct page *page = pfn_to_page(pfn);

	if (PagePinned(page)) {
		if (!PageHighMem(page)) {
			if (level == PT_PTE && USE_SPLIT_PTLOCKS)
				pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
			make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
		}
		ClearPagePinned(page);
	}
}

static void xen_release_pte(unsigned long pfn)
{
	xen_release_ptpage(pfn, PT_PTE);
}

static void xen_release_pmd(unsigned long pfn)
{
	xen_release_ptpage(pfn, PT_PMD);
}

#if PAGETABLE_LEVELS == 4
static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
{
	xen_alloc_ptpage(mm, pfn, PT_PUD);
}

static void xen_release_pud(unsigned long pfn)
{
	xen_release_ptpage(pfn, PT_PUD);
}
#endif

void __init xen_reserve_top(void)
{
#ifdef CONFIG_X86_32
	unsigned long top = HYPERVISOR_VIRT_START;
	struct xen_platform_parameters pp;

	if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
		top = pp.virt_start;

	reserve_top_address(-top);
#endif
}

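/*
 * Helpers for constructing the initial kernel pagetable, before the
 * normal mm infrastructure (or even the direct map) is available.
 * __ka() maps a physical address into the kernel text mapping, and
 * m2p()/m2v() convert the machine addresses found in the
 * Xen-provided pagetables back into pseudo-physical/virtual ones.
 */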
static void *__ka(phys_addr_t paddr)
{
#ifdef CONFIG_X86_64
	return (void *)(paddr + __START_KERNEL_map);
#else
	return __va(paddr);
#endif
}

static unsigned long m2p(phys_addr_t maddr)
{
	phys_addr_t paddr;

	maddr &= PTE_PFN_MASK;
	paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;

	return paddr;
}

static void *m2v(phys_addr_t maddr)
{
	return __ka(m2p(maddr));
}

static void set_page_prot(void *addr, pgprot_t prot)
{
	unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
	pte_t pte = pfn_pte(pfn, prot);

	if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
		BUG();
}

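/*
 * Set up pte pages under @pmd so the first @max_pfn physical pages
 * are mapped 1:1 at the start of the pmd's range, reusing whatever
 * pte pages the domain builder already installed and taking extra
 * pte pages from level1_ident_pgt when a pmd entry is empty.
 */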
static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
{
	unsigned pmdidx, pteidx;
	unsigned ident_pte;
	unsigned long pfn;

	ident_pte = 0;
	pfn = 0;
	for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
		pte_t *pte_page;

		/* Reuse or allocate a page of ptes */
		if (pmd_present(pmd[pmdidx]))
			pte_page = m2v(pmd[pmdidx].pmd);
		else {
			/* Check for free pte pages */
			if (ident_pte == ARRAY_SIZE(level1_ident_pgt))
				break;

			pte_page = &level1_ident_pgt[ident_pte];
			ident_pte += PTRS_PER_PTE;

			pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
		}

		/* Install mappings */
		for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
			pte_t pte;

			if (pfn > max_pfn_mapped)
				max_pfn_mapped = pfn;

			if (!pte_none(pte_page[pteidx]))
				continue;

			pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
			pte_page[pteidx] = pte;
		}
	}

	for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
		set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);

	set_page_prot(pmd, PAGE_KERNEL_RO);
}

#ifdef CONFIG_X86_64
static void convert_pfn_mfn(void *v)
{
	pte_t *pte = v;
	int i;

	/* All levels are converted the same way, so just treat them
	   as ptes. */
	for (i = 0; i < PTRS_PER_PTE; i++)
		pte[i] = xen_make_pte(pte[i].pte);
}

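/*
 * Set up the initial kernel pagetable.
 *
 * This is done by grafting the pagetable the Xen domain builder gave
 * us into head_64.S's preconstructed pagetables: the relevant Xen L2
 * entries are copied into level2_ident_pgt, level2_kernel_pgt and
 * level2_fixmap_pgt.  Only the kernel mapping exists to begin with;
 * the rest of the physical mapping is filled in later.  The new root
 * is pinned and switched to, the old Xen-provided root is unpinned,
 * and its frames are reserved so they are not handed out as ordinary
 * memory.
 */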
__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
					 unsigned long max_pfn)
{
	pud_t *l3;
	pmd_t *l2;

	/* Zap identity mapping */
	init_level4_pgt[0] = __pgd(0);

	/* Pre-constructed entries are in pfn, so convert to mfn */
	convert_pfn_mfn(init_level4_pgt);
	convert_pfn_mfn(level3_ident_pgt);
	convert_pfn_mfn(level3_kernel_pgt);

	l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
	l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);

	memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
	memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);

	l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
	l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
	memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);

	/* Set up identity map */
	xen_map_identity_early(level2_ident_pgt, max_pfn);

	/* Make pagetable pieces RO */
	set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
	set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
	set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
	set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
	set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
	set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);

	/* Pin down new L4 */
	pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
			  PFN_DOWN(__pa_symbol(init_level4_pgt)));

	/* Unpin Xen-provided one */
	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));

	/* Switch over */
	pgd = init_level4_pgt;

	/*
	 * At this stage there can be no user pgd, and no page
	 * structure to attach it to, so make sure we just set the
	 * kernel pgd.
	 */
	xen_mc_batch();
	__xen_write_cr3(true, __pa(pgd));
	xen_mc_issue(PARAVIRT_LAZY_CPU);

	reserve_early(__pa(xen_start_info->pt_base),
		      __pa(xen_start_info->pt_base +
			   xen_start_info->nr_pt_frames * PAGE_SIZE),
		      "XEN PAGETABLES");

	return pgd;
}
#else
static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;

__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
					 unsigned long max_pfn)
{
	pmd_t *kernel_pmd;

	max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) +
				  xen_start_info->nr_pt_frames * PAGE_SIZE +
				  512*1024);

	kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
	memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);

	xen_map_identity_early(level2_kernel_pgt, max_pfn);

	memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
	set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
			__pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));

	set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
	set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
	set_page_prot(empty_zero_page, PAGE_KERNEL_RO);

	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));

	xen_write_cr3(__pa(swapper_pg_dir));

	pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));

	reserve_early(__pa(xen_start_info->pt_base),
		      __pa(xen_start_info->pt_base +
			   xen_start_info->nr_pt_frames * PAGE_SIZE),
		      "XEN PAGETABLES");

	return swapper_pg_dir;
}
#endif	/* CONFIG_X86_64 */

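/*
 * Install a fixmap entry.  Slots that are backed by ordinary guest
 * RAM (boot-time btmaps, kmap slots, the vsyscall/VDSO pages, text
 * poking, ...) get a pfn-based pte; anything else is treated as a
 * machine frame number and installed with mfn_pte().
 */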
static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
{
	pte_t pte;

	phys >>= PAGE_SHIFT;

	switch (idx) {
	case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
#ifdef CONFIG_X86_F00F_BUG
	case FIX_F00F_IDT:
#endif
#ifdef CONFIG_X86_32
	case FIX_WP_TEST:
	case FIX_VDSO:
# ifdef CONFIG_HIGHMEM
	case FIX_KMAP_BEGIN ... FIX_KMAP_END:
# endif
#else
	case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
#endif
#ifdef CONFIG_X86_LOCAL_APIC
	case FIX_APIC_BASE:
#endif
	case FIX_TEXT_POKE0:
	case FIX_TEXT_POKE1:
		pte = pfn_pte(phys, prot);
		break;

	default:
		pte = mfn_pte(phys, prot);
		break;
	}

	__native_set_fixmap(idx, pte);

#ifdef CONFIG_X86_64
	/* Replicate changes to map the vsyscall page into the user
	   pagetable vsyscall mapping. */
	if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
		unsigned long vaddr = __fix_to_virt(idx);
		set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
	}
#endif
}

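/*
 * Once the kernel's normal memory allocator is up, switch from the
 * boot-time pagetable hooks to the final ones and mark init_mm's
 * pagetable as pinned so later allocations are handled correctly.
 */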
static __init void xen_post_allocator_init(void)
{
	pv_mmu_ops.set_pte = xen_set_pte;
	pv_mmu_ops.set_pmd = xen_set_pmd;
	pv_mmu_ops.set_pud = xen_set_pud;
#if PAGETABLE_LEVELS == 4
	pv_mmu_ops.set_pgd = xen_set_pgd;
#endif

	pv_mmu_ops.alloc_pte = xen_alloc_pte;
	pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
	pv_mmu_ops.release_pte = xen_release_pte;
	pv_mmu_ops.release_pmd = xen_release_pmd;
#if PAGETABLE_LEVELS == 4
	pv_mmu_ops.alloc_pud = xen_alloc_pud;
	pv_mmu_ops.release_pud = xen_release_pud;
#endif

#ifdef CONFIG_X86_64
	SetPagePinned(virt_to_page(level3_user_vsyscall));
#endif
	xen_mark_init_mm_pinned();
}

static void xen_leave_lazy_mmu(void)
{
	preempt_disable();
	xen_mc_flush();
	paravirt_leave_lazy_mmu();
	preempt_enable();
}

static const struct pv_mmu_ops xen_mmu_ops __initdata = {
	.read_cr2 = xen_read_cr2,
	.write_cr2 = xen_write_cr2,

	.read_cr3 = xen_read_cr3,
	.write_cr3 = xen_write_cr3,

	.flush_tlb_user = xen_flush_tlb,
	.flush_tlb_kernel = xen_flush_tlb,
	.flush_tlb_single = xen_flush_tlb_single,
	.flush_tlb_others = xen_flush_tlb_others,

	.pte_update = paravirt_nop,
	.pte_update_defer = paravirt_nop,

	.pgd_alloc = xen_pgd_alloc,
	.pgd_free = xen_pgd_free,

	.alloc_pte = xen_alloc_pte_init,
	.release_pte = xen_release_pte_init,
	.alloc_pmd = xen_alloc_pmd_init,
	.alloc_pmd_clone = paravirt_nop,
	.release_pmd = xen_release_pmd_init,

#ifdef CONFIG_HIGHPTE
	.kmap_atomic_pte = xen_kmap_atomic_pte,
#endif

#ifdef CONFIG_X86_64
	.set_pte = xen_set_pte,
#else
	.set_pte = xen_set_pte_init,
#endif
	.set_pte_at = xen_set_pte_at,
	.set_pmd = xen_set_pmd_hyper,

	.ptep_modify_prot_start = __ptep_modify_prot_start,
	.ptep_modify_prot_commit = __ptep_modify_prot_commit,

	.pte_val = PV_CALLEE_SAVE(xen_pte_val),
	.pgd_val = PV_CALLEE_SAVE(xen_pgd_val),

	.make_pte = PV_CALLEE_SAVE(xen_make_pte),
	.make_pgd = PV_CALLEE_SAVE(xen_make_pgd),

#ifdef CONFIG_X86_PAE
	.set_pte_atomic = xen_set_pte_atomic,
	.pte_clear = xen_pte_clear,
	.pmd_clear = xen_pmd_clear,
#endif
	.set_pud = xen_set_pud_hyper,

	.make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
	.pmd_val = PV_CALLEE_SAVE(xen_pmd_val),

#if PAGETABLE_LEVELS == 4
	.pud_val = PV_CALLEE_SAVE(xen_pud_val),
	.make_pud = PV_CALLEE_SAVE(xen_make_pud),
	.set_pgd = xen_set_pgd_hyper,

	.alloc_pud = xen_alloc_pmd_init,
	.release_pud = xen_release_pmd_init,
#endif

	.activate_mm = xen_activate_mm,
	.dup_mmap = xen_dup_mmap,
	.exit_mmap = xen_exit_mmap,

	.lazy_mode = {
		.enter = paravirt_enter_lazy_mmu,
		.leave = xen_leave_lazy_mmu,
	},

	.set_fixmap = xen_set_fixmap,
};

void __init xen_init_mmu_ops(void)
{
	x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start;
	x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done;
	pv_mmu_ops = xen_mmu_ops;
}

#ifdef CONFIG_XEN_DEBUG_FS

static struct dentry *d_mmu_debug;

static int __init xen_mmu_debugfs(void)
{
	struct dentry *d_xen = xen_init_debugfs();

	if (d_xen == NULL)
		return -ENOMEM;

	d_mmu_debug = debugfs_create_dir("mmu", d_xen);

	debugfs_create_u8("zero_stats", 0644, d_mmu_debug, &zero_stats);

1979 debugfs_create_u32("pgd_update", 0444, d_mmu_debug, &mmu_stats.pgd_update);
1980 debugfs_create_u32("pgd_update_pinned", 0444, d_mmu_debug,
1981 &mmu_stats.pgd_update_pinned);
1982 debugfs_create_u32("pgd_update_batched", 0444, d_mmu_debug,
1983 &mmu_stats.pgd_update_pinned);
1984
1985 debugfs_create_u32("pud_update", 0444, d_mmu_debug, &mmu_stats.pud_update);
1986 debugfs_create_u32("pud_update_pinned", 0444, d_mmu_debug,
1987 &mmu_stats.pud_update_pinned);
1988 debugfs_create_u32("pud_update_batched", 0444, d_mmu_debug,
1989 &mmu_stats.pud_update_pinned);
1990
1991 debugfs_create_u32("pmd_update", 0444, d_mmu_debug, &mmu_stats.pmd_update);
1992 debugfs_create_u32("pmd_update_pinned", 0444, d_mmu_debug,
1993 &mmu_stats.pmd_update_pinned);
1994 debugfs_create_u32("pmd_update_batched", 0444, d_mmu_debug,
1995 &mmu_stats.pmd_update_pinned);
1996
1997 debugfs_create_u32("pte_update", 0444, d_mmu_debug, &mmu_stats.pte_update);
1998
1999
2000 debugfs_create_u32("pte_update_batched", 0444, d_mmu_debug,
2001 &mmu_stats.pte_update_pinned);

	debugfs_create_u32("mmu_update", 0444, d_mmu_debug, &mmu_stats.mmu_update);
	debugfs_create_u32("mmu_update_extended", 0444, d_mmu_debug,
			   &mmu_stats.mmu_update_extended);
	xen_debugfs_create_u32_array("mmu_update_histo", 0444, d_mmu_debug,
				     mmu_stats.mmu_update_histo, 20);

	debugfs_create_u32("set_pte_at", 0444, d_mmu_debug, &mmu_stats.set_pte_at);
	debugfs_create_u32("set_pte_at_batched", 0444, d_mmu_debug,
			   &mmu_stats.set_pte_at_batched);
	debugfs_create_u32("set_pte_at_current", 0444, d_mmu_debug,
			   &mmu_stats.set_pte_at_current);
	debugfs_create_u32("set_pte_at_kernel", 0444, d_mmu_debug,
			   &mmu_stats.set_pte_at_kernel);

	debugfs_create_u32("prot_commit", 0444, d_mmu_debug, &mmu_stats.prot_commit);
	debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug,
			   &mmu_stats.prot_commit_batched);

	return 0;
}
fs_initcall(xen_mmu_debugfs);

#endif	/* CONFIG_XEN_DEBUG_FS */