linux/arch/x86/xen/mmu.c
   1/*
   2 * Xen mmu operations
   3 *
   4 * This file contains the various mmu fetch and update operations.
   5 * The most important job they must perform is the mapping between the
   6 * domain's pfn and the overall machine mfns.
   7 *
   8 * Xen allows guests to directly update the pagetable, in a controlled
   9 * fashion.  In other words, the guest modifies the same pagetable
  10 * that the CPU actually uses, which eliminates the overhead of having
  11 * a separate shadow pagetable.
  12 *
  13 * In order to allow this, it falls on the guest domain to map its
  14 * notion of a "physical" pfn - which is just a domain-local linear
  15 * address - into a real "machine address" which the CPU's MMU can
  16 * use.
  17 *
  18 * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
  19 * inserted directly into the pagetable.  When creating a new
  20 * pte/pmd/pgd, it converts the passed pfn into an mfn.  Conversely,
  21 * when reading the content back with __(pgd|pmd|pte)_val, it converts
  22 * the mfn back into a pfn.
  23 *
  24 * The other constraint is that all pages which make up a pagetable
  25 * must be mapped read-only in the guest.  This prevents uncontrolled
  26 * guest updates to the pagetable.  Xen strictly enforces this, and
  27 * will disallow any pagetable update which will end up mapping a
  28 * pagetable page RW, and will disallow using any writable page as a
  29 * pagetable.
  30 *
  31 * Naively, when loading %cr3 with the base of a new pagetable, Xen
  32 * would need to validate the whole pagetable before going on.
  33 * Naturally, this is quite slow.  The solution is to "pin" a
  34 * pagetable, which enforces all the constraints on the pagetable even
   35 * when it is not actively in use.  This means that Xen can be assured
  36 * that it is still valid when you do load it into %cr3, and doesn't
  37 * need to revalidate it.
  38 *
  39 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
  40 */
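/*
 * A rough sketch of the conversion described above (kept under "#if 0";
 * "pfn" and "flags" are assumed inputs, and the helpers are the ones
 * this file already uses).  pte_pfn_to_mfn()/pte_mfn_to_pfn() further
 * down are the real implementations.
 */
#if 0
	/* constructing a pte: look the pfn up in the p2m to get an mfn */
	pte_t pte = __pte_ma(((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags);

	/* reading it back: translate the mfn back to a pfn via the m2p */
	unsigned long read_back_pfn = mfn_to_pfn(pte_mfn(pte));
#endif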
  41#include <linux/sched.h>
  42#include <linux/highmem.h>
  43#include <linux/debugfs.h>
  44#include <linux/bug.h>
  45#include <linux/vmalloc.h>
  46#include <linux/module.h>
  47#include <linux/gfp.h>
  48#include <linux/memblock.h>
  49#include <linux/seq_file.h>
  50#include <linux/crash_dump.h>
  51
  52#include <trace/events/xen.h>
  53
  54#include <asm/pgtable.h>
  55#include <asm/tlbflush.h>
  56#include <asm/fixmap.h>
  57#include <asm/mmu_context.h>
  58#include <asm/setup.h>
  59#include <asm/paravirt.h>
  60#include <asm/e820.h>
  61#include <asm/linkage.h>
  62#include <asm/page.h>
  63#include <asm/init.h>
  64#include <asm/pat.h>
  65#include <asm/smp.h>
  66
  67#include <asm/xen/hypercall.h>
  68#include <asm/xen/hypervisor.h>
  69
  70#include <xen/xen.h>
  71#include <xen/page.h>
  72#include <xen/interface/xen.h>
  73#include <xen/interface/hvm/hvm_op.h>
  74#include <xen/interface/version.h>
  75#include <xen/interface/memory.h>
  76#include <xen/hvc-console.h>
  77
  78#include "multicalls.h"
  79#include "mmu.h"
  80#include "debugfs.h"
  81
  82/*
  83 * Protects atomic reservation decrease/increase against concurrent increases.
  84 * Also protects non-atomic updates of current_pages and balloon lists.
  85 */
  86DEFINE_SPINLOCK(xen_reservation_lock);
  87
  88#ifdef CONFIG_X86_32
  89/*
  90 * Identity map, in addition to plain kernel map.  This needs to be
  91 * large enough to allocate page table pages to allocate the rest.
  92 * Each page can map 2MB.
  93 */
  94#define LEVEL1_IDENT_ENTRIES    (PTRS_PER_PTE * 4)
  95static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES);
  96#endif
  97#ifdef CONFIG_X86_64
  98/* l3 pud for userspace vsyscall mapping */
  99static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
 100#endif /* CONFIG_X86_64 */
 101
 102/*
 103 * Note about cr3 (pagetable base) values:
 104 *
 105 * xen_cr3 contains the current logical cr3 value; it contains the
 106 * last set cr3.  This may not be the current effective cr3, because
  107 * its update may still be lazily deferred.  However, a vcpu looking
  108 * at its own cr3 can use this value knowing that everything will
 109 * be self-consistent.
 110 *
 111 * xen_current_cr3 contains the actual vcpu cr3; it is set once the
 112 * hypercall to set the vcpu cr3 is complete (so it may be a little
 113 * out of date, but it will never be set early).  If one vcpu is
 114 * looking at another vcpu's cr3 value, it should use this variable.
 115 */
 116DEFINE_PER_CPU(unsigned long, xen_cr3);  /* cr3 stored as physaddr */
 117DEFINE_PER_CPU(unsigned long, xen_current_cr3);  /* actual vcpu cr3 */
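/*
 * For example (sketch only): code that inspects another vcpu's
 * pagetable base reads xen_current_cr3 rather than xen_cr3, as
 * xen_drop_mm_ref() does further down:
 */
#if 0
	if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
		/* that vcpu may still be using this pagetable */;
#endif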
 118
 119
 120/*
 121 * Just beyond the highest usermode address.  STACK_TOP_MAX has a
 122 * redzone above it, so round it up to a PGD boundary.
 123 */
 124#define USER_LIMIT      ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
 125
 126unsigned long arbitrary_virt_to_mfn(void *vaddr)
 127{
 128        xmaddr_t maddr = arbitrary_virt_to_machine(vaddr);
 129
 130        return PFN_DOWN(maddr.maddr);
 131}
 132
 133xmaddr_t arbitrary_virt_to_machine(void *vaddr)
 134{
 135        unsigned long address = (unsigned long)vaddr;
 136        unsigned int level;
 137        pte_t *pte;
 138        unsigned offset;
 139
 140        /*
 141         * if the PFN is in the linear mapped vaddr range, we can just use
 142         * the (quick) virt_to_machine() p2m lookup
 143         */
 144        if (virt_addr_valid(vaddr))
 145                return virt_to_machine(vaddr);
 146
 147        /* otherwise we have to do a (slower) full page-table walk */
 148
 149        pte = lookup_address(address, &level);
 150        BUG_ON(pte == NULL);
 151        offset = address & ~PAGE_MASK;
 152        return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
 153}
 154EXPORT_SYMBOL_GPL(arbitrary_virt_to_machine);
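/*
 * Typical use (sketch; "buf" is a hypothetical caller's pointer): code
 * that must hand a machine address to the hypervisor can use this for
 * any kernel virtual address, including vmalloc/fixmap addresses that
 * the quick linear-map virt_to_machine() lookup cannot handle:
 */
#if 0
	xmaddr_t maddr = arbitrary_virt_to_machine(buf);
	/* maddr.maddr is the machine address backing "buf" */
#endif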
 155
 156void make_lowmem_page_readonly(void *vaddr)
 157{
 158        pte_t *pte, ptev;
 159        unsigned long address = (unsigned long)vaddr;
 160        unsigned int level;
 161
 162        pte = lookup_address(address, &level);
 163        if (pte == NULL)
 164                return;         /* vaddr missing */
 165
 166        ptev = pte_wrprotect(*pte);
 167
 168        if (HYPERVISOR_update_va_mapping(address, ptev, 0))
 169                BUG();
 170}
 171
 172void make_lowmem_page_readwrite(void *vaddr)
 173{
 174        pte_t *pte, ptev;
 175        unsigned long address = (unsigned long)vaddr;
 176        unsigned int level;
 177
 178        pte = lookup_address(address, &level);
 179        if (pte == NULL)
 180                return;         /* vaddr missing */
 181
 182        ptev = pte_mkwrite(*pte);
 183
 184        if (HYPERVISOR_update_va_mapping(address, ptev, 0))
 185                BUG();
 186}
 187
 188
 189static bool xen_page_pinned(void *ptr)
 190{
 191        struct page *page = virt_to_page(ptr);
 192
 193        return PagePinned(page);
 194}
 195
 196void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid)
 197{
 198        struct multicall_space mcs;
 199        struct mmu_update *u;
 200
 201        trace_xen_mmu_set_domain_pte(ptep, pteval, domid);
 202
 203        mcs = xen_mc_entry(sizeof(*u));
 204        u = mcs.args;
 205
 206        /* ptep might be kmapped when using 32-bit HIGHPTE */
 207        u->ptr = virt_to_machine(ptep).maddr;
 208        u->val = pte_val_ma(pteval);
 209
 210        MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid);
 211
 212        xen_mc_issue(PARAVIRT_LAZY_MMU);
 213}
 214EXPORT_SYMBOL_GPL(xen_set_domain_pte);
 215
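/*
 * Append an mmu_update to the multicall batch being built: extend the
 * in-progress MULTI_mmu_update call if there is room, otherwise start
 * a new one.  Callers bracket this with xen_mc_batch()/xen_mc_issue().
 */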
 216static void xen_extend_mmu_update(const struct mmu_update *update)
 217{
 218        struct multicall_space mcs;
 219        struct mmu_update *u;
 220
 221        mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));
 222
 223        if (mcs.mc != NULL) {
 224                mcs.mc->args[1]++;
 225        } else {
 226                mcs = __xen_mc_entry(sizeof(*u));
 227                MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
 228        }
 229
 230        u = mcs.args;
 231        *u = *update;
 232}
 233
 234static void xen_extend_mmuext_op(const struct mmuext_op *op)
 235{
 236        struct multicall_space mcs;
 237        struct mmuext_op *u;
 238
 239        mcs = xen_mc_extend_args(__HYPERVISOR_mmuext_op, sizeof(*u));
 240
 241        if (mcs.mc != NULL) {
 242                mcs.mc->args[1]++;
 243        } else {
 244                mcs = __xen_mc_entry(sizeof(*u));
 245                MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
 246        }
 247
 248        u = mcs.args;
 249        *u = *op;
 250}
 251
 252static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
 253{
 254        struct mmu_update u;
 255
 256        preempt_disable();
 257
 258        xen_mc_batch();
 259
 260        /* ptr may be ioremapped for 64-bit pagetable setup */
 261        u.ptr = arbitrary_virt_to_machine(ptr).maddr;
 262        u.val = pmd_val_ma(val);
 263        xen_extend_mmu_update(&u);
 264
 265        xen_mc_issue(PARAVIRT_LAZY_MMU);
 266
 267        preempt_enable();
 268}
 269
 270static void xen_set_pmd(pmd_t *ptr, pmd_t val)
 271{
 272        trace_xen_mmu_set_pmd(ptr, val);
 273
 274        /* If page is not pinned, we can just update the entry
 275           directly */
 276        if (!xen_page_pinned(ptr)) {
 277                *ptr = val;
 278                return;
 279        }
 280
 281        xen_set_pmd_hyper(ptr, val);
 282}
 283
 284/*
 285 * Associate a virtual page frame with a given physical page frame
 286 * and protection flags for that frame.
 287 */
 288void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
 289{
 290        set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
 291}
 292
 293static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval)
 294{
 295        struct mmu_update u;
 296
 297        if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU)
 298                return false;
 299
 300        xen_mc_batch();
 301
 302        u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE;
 303        u.val = pte_val_ma(pteval);
 304        xen_extend_mmu_update(&u);
 305
 306        xen_mc_issue(PARAVIRT_LAZY_MMU);
 307
 308        return true;
 309}
 310
 311static inline void __xen_set_pte(pte_t *ptep, pte_t pteval)
 312{
 313        if (!xen_batched_set_pte(ptep, pteval)) {
 314                /*
 315                 * Could call native_set_pte() here and trap and
 316                 * emulate the PTE write but with 32-bit guests this
 317                 * needs two traps (one for each of the two 32-bit
 318                 * words in the PTE) so do one hypercall directly
 319                 * instead.
 320                 */
 321                struct mmu_update u;
 322
 323                u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE;
 324                u.val = pte_val_ma(pteval);
 325                HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF);
 326        }
 327}
 328
 329static void xen_set_pte(pte_t *ptep, pte_t pteval)
 330{
 331        trace_xen_mmu_set_pte(ptep, pteval);
 332        __xen_set_pte(ptep, pteval);
 333}
 334
 335static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
 336                    pte_t *ptep, pte_t pteval)
 337{
 338        trace_xen_mmu_set_pte_at(mm, addr, ptep, pteval);
 339        __xen_set_pte(ptep, pteval);
 340}
 341
 342pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
 343                                 unsigned long addr, pte_t *ptep)
 344{
 345        /* Just return the pte as-is.  We preserve the bits on commit */
 346        trace_xen_mmu_ptep_modify_prot_start(mm, addr, ptep, *ptep);
 347        return *ptep;
 348}
 349
 350void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
 351                                 pte_t *ptep, pte_t pte)
 352{
 353        struct mmu_update u;
 354
 355        trace_xen_mmu_ptep_modify_prot_commit(mm, addr, ptep, pte);
 356        xen_mc_batch();
 357
 358        u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
 359        u.val = pte_val_ma(pte);
 360        xen_extend_mmu_update(&u);
 361
 362        xen_mc_issue(PARAVIRT_LAZY_MMU);
 363}
 364
 365/* Assume pteval_t is equivalent to all the other *val_t types. */
 366static pteval_t pte_mfn_to_pfn(pteval_t val)
 367{
 368        if (val & _PAGE_PRESENT) {
 369                unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
 370                unsigned long pfn = mfn_to_pfn(mfn);
 371
 372                pteval_t flags = val & PTE_FLAGS_MASK;
 373                if (unlikely(pfn == ~0))
 374                        val = flags & ~_PAGE_PRESENT;
 375                else
 376                        val = ((pteval_t)pfn << PAGE_SHIFT) | flags;
 377        }
 378
 379        return val;
 380}
 381
 382static pteval_t pte_pfn_to_mfn(pteval_t val)
 383{
 384        if (val & _PAGE_PRESENT) {
 385                unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
 386                pteval_t flags = val & PTE_FLAGS_MASK;
 387                unsigned long mfn;
 388
 389                if (!xen_feature(XENFEAT_auto_translated_physmap))
 390                        mfn = get_phys_to_machine(pfn);
 391                else
 392                        mfn = pfn;
 393                /*
 394                 * If there's no mfn for the pfn, then just create an
 395                 * empty non-present pte.  Unfortunately this loses
 396                 * information about the original pfn, so
 397                 * pte_mfn_to_pfn is asymmetric.
 398                 */
 399                if (unlikely(mfn == INVALID_P2M_ENTRY)) {
 400                        mfn = 0;
 401                        flags = 0;
 402                } else
 403                        mfn &= ~(FOREIGN_FRAME_BIT | IDENTITY_FRAME_BIT);
 404                val = ((pteval_t)mfn << PAGE_SHIFT) | flags;
 405        }
 406
 407        return val;
 408}
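/*
 * Worked example of the asymmetry noted above: if a present pte is
 * built for a pfn that has no mfn, pte_pfn_to_mfn() returns an
 * all-zero (non-present) value, and pte_mfn_to_pfn() on that value
 * simply returns 0 again, so the original pfn cannot be recovered.
 */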
 409
 410__visible pteval_t xen_pte_val(pte_t pte)
 411{
 412        pteval_t pteval = pte.pte;
 413#if 0
 414        /* If this is a WC pte, convert back from Xen WC to Linux WC */
 415        if ((pteval & (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)) == _PAGE_PAT) {
 416                WARN_ON(!pat_enabled);
 417                pteval = (pteval & ~_PAGE_PAT) | _PAGE_PWT;
 418        }
 419#endif
 420        return pte_mfn_to_pfn(pteval);
 421}
 422PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
 423
 424__visible pgdval_t xen_pgd_val(pgd_t pgd)
 425{
 426        return pte_mfn_to_pfn(pgd.pgd);
 427}
 428PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);
 429
 430/*
 431 * Xen's PAT setup is part of its ABI, though I assume entries 6 & 7
 432 * are reserved for now, to correspond to the Intel-reserved PAT
 433 * types.
 434 *
 435 * We expect Linux's PAT set as follows:
 436 *
 437 * Idx  PTE flags        Linux    Xen    Default
 438 * 0                     WB       WB     WB
 439 * 1            PWT      WC       WT     WT
 440 * 2        PCD          UC-      UC-    UC-
 441 * 3        PCD PWT      UC       UC     UC
 442 * 4    PAT              WB       WC     WB
 443 * 5    PAT     PWT      WC       WP     WT
 444 * 6    PAT PCD          UC-      UC     UC-
 445 * 7    PAT PCD PWT      UC       UC     UC
 446 */
 447
 448void xen_set_pat(u64 pat)
 449{
 450        /* We expect Linux to use a PAT setting of
 451         * UC UC- WC WB (ignoring the PAT flag) */
 452        WARN_ON(pat != 0x0007010600070106ull);
 453}
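/*
 * Decoding the expected value: each byte of the PAT MSR is one entry
 * (low byte = entry 0), with the architectural encodings UC=0x00,
 * WC=0x01, WB=0x06 and UC-=0x07.  0x0007010600070106 therefore
 * programs entries 0-3 and 4-7 both as WB, WC, UC-, UC, matching the
 * "Linux" column of the table above.
 */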
 454
 455__visible pte_t xen_make_pte(pteval_t pte)
 456{
 457#if 0
 458        /* If Linux is trying to set a WC pte, then map to the Xen WC.
 459         * If _PAGE_PAT is set, then it probably means it is really
 460         * _PAGE_PSE, so avoid fiddling with the PAT mapping and hope
 461         * things work out OK...
 462         *
 463         * (We should never see kernel mappings with _PAGE_PSE set,
  464 * but we could see hugetlbfs mappings, I think.)
 465         */
 466        if (pat_enabled && !WARN_ON(pte & _PAGE_PAT)) {
 467                if ((pte & (_PAGE_PCD | _PAGE_PWT)) == _PAGE_PWT)
 468                        pte = (pte & ~(_PAGE_PCD | _PAGE_PWT)) | _PAGE_PAT;
 469        }
 470#endif
 471        pte = pte_pfn_to_mfn(pte);
 472
 473        return native_make_pte(pte);
 474}
 475PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
 476
 477__visible pgd_t xen_make_pgd(pgdval_t pgd)
 478{
 479        pgd = pte_pfn_to_mfn(pgd);
 480        return native_make_pgd(pgd);
 481}
 482PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd);
 483
 484__visible pmdval_t xen_pmd_val(pmd_t pmd)
 485{
 486        return pte_mfn_to_pfn(pmd.pmd);
 487}
 488PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val);
 489
 490static void xen_set_pud_hyper(pud_t *ptr, pud_t val)
 491{
 492        struct mmu_update u;
 493
 494        preempt_disable();
 495
 496        xen_mc_batch();
 497
 498        /* ptr may be ioremapped for 64-bit pagetable setup */
 499        u.ptr = arbitrary_virt_to_machine(ptr).maddr;
 500        u.val = pud_val_ma(val);
 501        xen_extend_mmu_update(&u);
 502
 503        xen_mc_issue(PARAVIRT_LAZY_MMU);
 504
 505        preempt_enable();
 506}
 507
 508static void xen_set_pud(pud_t *ptr, pud_t val)
 509{
 510        trace_xen_mmu_set_pud(ptr, val);
 511
 512        /* If page is not pinned, we can just update the entry
 513           directly */
 514        if (!xen_page_pinned(ptr)) {
 515                *ptr = val;
 516                return;
 517        }
 518
 519        xen_set_pud_hyper(ptr, val);
 520}
 521
 522#ifdef CONFIG_X86_PAE
 523static void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
 524{
 525        trace_xen_mmu_set_pte_atomic(ptep, pte);
 526        set_64bit((u64 *)ptep, native_pte_val(pte));
 527}
 528
 529static void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
 530{
 531        trace_xen_mmu_pte_clear(mm, addr, ptep);
 532        if (!xen_batched_set_pte(ptep, native_make_pte(0)))
 533                native_pte_clear(mm, addr, ptep);
 534}
 535
 536static void xen_pmd_clear(pmd_t *pmdp)
 537{
 538        trace_xen_mmu_pmd_clear(pmdp);
 539        set_pmd(pmdp, __pmd(0));
 540}
 541#endif  /* CONFIG_X86_PAE */
 542
 543__visible pmd_t xen_make_pmd(pmdval_t pmd)
 544{
 545        pmd = pte_pfn_to_mfn(pmd);
 546        return native_make_pmd(pmd);
 547}
 548PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
 549
 550#if PAGETABLE_LEVELS == 4
 551__visible pudval_t xen_pud_val(pud_t pud)
 552{
 553        return pte_mfn_to_pfn(pud.pud);
 554}
 555PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val);
 556
 557__visible pud_t xen_make_pud(pudval_t pud)
 558{
 559        pud = pte_pfn_to_mfn(pud);
 560
 561        return native_make_pud(pud);
 562}
 563PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud);
 564
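/*
 * For a 64-bit PV guest the user-mode pgd lives in the page->private
 * field of the kernel pgd's page; it is allocated and linked there by
 * xen_pgd_alloc() and freed by xen_pgd_free() below.
 */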
 565static pgd_t *xen_get_user_pgd(pgd_t *pgd)
 566{
 567        pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
 568        unsigned offset = pgd - pgd_page;
 569        pgd_t *user_ptr = NULL;
 570
 571        if (offset < pgd_index(USER_LIMIT)) {
 572                struct page *page = virt_to_page(pgd_page);
 573                user_ptr = (pgd_t *)page->private;
 574                if (user_ptr)
 575                        user_ptr += offset;
 576        }
 577
 578        return user_ptr;
 579}
 580
 581static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
 582{
 583        struct mmu_update u;
 584
 585        u.ptr = virt_to_machine(ptr).maddr;
 586        u.val = pgd_val_ma(val);
 587        xen_extend_mmu_update(&u);
 588}
 589
 590/*
  591 * Raw hypercall-based set_pgd, intended for use in early boot before
 592 * there's a page structure.  This implies:
 593 *  1. The only existing pagetable is the kernel's
 594 *  2. It is always pinned
 595 *  3. It has no user pagetable attached to it
 596 */
 597static void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
 598{
 599        preempt_disable();
 600
 601        xen_mc_batch();
 602
 603        __xen_set_pgd_hyper(ptr, val);
 604
 605        xen_mc_issue(PARAVIRT_LAZY_MMU);
 606
 607        preempt_enable();
 608}
 609
 610static void xen_set_pgd(pgd_t *ptr, pgd_t val)
 611{
 612        pgd_t *user_ptr = xen_get_user_pgd(ptr);
 613
 614        trace_xen_mmu_set_pgd(ptr, user_ptr, val);
 615
 616        /* If page is not pinned, we can just update the entry
 617           directly */
 618        if (!xen_page_pinned(ptr)) {
 619                *ptr = val;
 620                if (user_ptr) {
 621                        WARN_ON(xen_page_pinned(user_ptr));
 622                        *user_ptr = val;
 623                }
 624                return;
 625        }
 626
 627        /* If it's pinned, then we can at least batch the kernel and
 628           user updates together. */
 629        xen_mc_batch();
 630
 631        __xen_set_pgd_hyper(ptr, val);
 632        if (user_ptr)
 633                __xen_set_pgd_hyper(user_ptr, val);
 634
 635        xen_mc_issue(PARAVIRT_LAZY_MMU);
 636}
 637#endif  /* PAGETABLE_LEVELS == 4 */
 638
 639/*
 640 * (Yet another) pagetable walker.  This one is intended for pinning a
 641 * pagetable.  This means that it walks a pagetable and calls the
 642 * callback function on each page it finds making up the page table,
 643 * at every level.  It walks the entire pagetable, but it only bothers
 644 * pinning pte pages which are below limit.  In the normal case this
 645 * will be STACK_TOP_MAX, but at boot we need to pin up to
 646 * FIXADDR_TOP.
 647 *
 648 * For 32-bit the important bit is that we don't pin beyond there,
 649 * because then we start getting into Xen's ptes.
 650 *
 651 * For 64-bit, we must skip the Xen hole in the middle of the address
 652 * space, just after the big x86-64 virtual hole.
 653 */
 654static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
 655                          int (*func)(struct mm_struct *mm, struct page *,
 656                                      enum pt_level),
 657                          unsigned long limit)
 658{
 659        int flush = 0;
 660        unsigned hole_low, hole_high;
 661        unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
 662        unsigned pgdidx, pudidx, pmdidx;
 663
 664        /* The limit is the last byte to be touched */
 665        limit--;
 666        BUG_ON(limit >= FIXADDR_TOP);
 667
 668        if (xen_feature(XENFEAT_auto_translated_physmap))
 669                return 0;
 670
 671        /*
 672         * 64-bit has a great big hole in the middle of the address
  673         * space, which contains the Xen mappings.  On 32-bit this
  674         * will end up making a zero-sized hole and so is a no-op.
 675         */
 676        hole_low = pgd_index(USER_LIMIT);
 677        hole_high = pgd_index(PAGE_OFFSET);
 678
 679        pgdidx_limit = pgd_index(limit);
 680#if PTRS_PER_PUD > 1
 681        pudidx_limit = pud_index(limit);
 682#else
 683        pudidx_limit = 0;
 684#endif
 685#if PTRS_PER_PMD > 1
 686        pmdidx_limit = pmd_index(limit);
 687#else
 688        pmdidx_limit = 0;
 689#endif
 690
 691        for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
 692                pud_t *pud;
 693
 694                if (pgdidx >= hole_low && pgdidx < hole_high)
 695                        continue;
 696
 697                if (!pgd_val(pgd[pgdidx]))
 698                        continue;
 699
 700                pud = pud_offset(&pgd[pgdidx], 0);
 701
 702                if (PTRS_PER_PUD > 1) /* not folded */
 703                        flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
 704
 705                for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
 706                        pmd_t *pmd;
 707
 708                        if (pgdidx == pgdidx_limit &&
 709                            pudidx > pudidx_limit)
 710                                goto out;
 711
 712                        if (pud_none(pud[pudidx]))
 713                                continue;
 714
 715                        pmd = pmd_offset(&pud[pudidx], 0);
 716
 717                        if (PTRS_PER_PMD > 1) /* not folded */
 718                                flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);
 719
 720                        for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
 721                                struct page *pte;
 722
 723                                if (pgdidx == pgdidx_limit &&
 724                                    pudidx == pudidx_limit &&
 725                                    pmdidx > pmdidx_limit)
 726                                        goto out;
 727
 728                                if (pmd_none(pmd[pmdidx]))
 729                                        continue;
 730
 731                                pte = pmd_page(pmd[pmdidx]);
 732                                flush |= (*func)(mm, pte, PT_PTE);
 733                        }
 734                }
 735        }
 736
 737out:
 738        /* Do the top level last, so that the callbacks can use it as
 739           a cue to do final things like tlb flushes. */
 740        flush |= (*func)(mm, virt_to_page(pgd), PT_PGD);
 741
 742        return flush;
 743}
 744
 745static int xen_pgd_walk(struct mm_struct *mm,
 746                        int (*func)(struct mm_struct *mm, struct page *,
 747                                    enum pt_level),
 748                        unsigned long limit)
 749{
 750        return __xen_pgd_walk(mm, mm->pgd, func, limit);
 751}
 752
 753/* If we're using split pte locks, then take the page's lock and
 754   return a pointer to it.  Otherwise return NULL. */
 755static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
 756{
 757        spinlock_t *ptl = NULL;
 758
 759#if USE_SPLIT_PTE_PTLOCKS
 760        ptl = ptlock_ptr(page);
 761        spin_lock_nest_lock(ptl, &mm->page_table_lock);
 762#endif
 763
 764        return ptl;
 765}
 766
 767static void xen_pte_unlock(void *v)
 768{
 769        spinlock_t *ptl = v;
 770        spin_unlock(ptl);
 771}
 772
 773static void xen_do_pin(unsigned level, unsigned long pfn)
 774{
 775        struct mmuext_op op;
 776
 777        op.cmd = level;
 778        op.arg1.mfn = pfn_to_mfn(pfn);
 779
 780        xen_extend_mmuext_op(&op);
 781}
 782
 783static int xen_pin_page(struct mm_struct *mm, struct page *page,
 784                        enum pt_level level)
 785{
 786        unsigned pgfl = TestSetPagePinned(page);
 787        int flush;
 788
 789        if (pgfl)
 790                flush = 0;              /* already pinned */
 791        else if (PageHighMem(page))
 792                /* kmaps need flushing if we found an unpinned
 793                   highpage */
 794                flush = 1;
 795        else {
 796                void *pt = lowmem_page_address(page);
 797                unsigned long pfn = page_to_pfn(page);
 798                struct multicall_space mcs = __xen_mc_entry(0);
 799                spinlock_t *ptl;
 800
 801                flush = 0;
 802
 803                /*
 804                 * We need to hold the pagetable lock between the time
 805                 * we make the pagetable RO and when we actually pin
 806                 * it.  If we don't, then other users may come in and
 807                 * attempt to update the pagetable by writing it,
 808                 * which will fail because the memory is RO but not
 809                 * pinned, so Xen won't do the trap'n'emulate.
 810                 *
 811                 * If we're using split pte locks, we can't hold the
 812                 * entire pagetable's worth of locks during the
 813                 * traverse, because we may wrap the preempt count (8
 814                 * bits).  The solution is to mark RO and pin each PTE
 815                 * page while holding the lock.  This means the number
 816                 * of locks we end up holding is never more than a
 817                 * batch size (~32 entries, at present).
 818                 *
 819                 * If we're not using split pte locks, we needn't pin
 820                 * the PTE pages independently, because we're
 821                 * protected by the overall pagetable lock.
 822                 */
 823                ptl = NULL;
 824                if (level == PT_PTE)
 825                        ptl = xen_pte_lock(page, mm);
 826
 827                MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
 828                                        pfn_pte(pfn, PAGE_KERNEL_RO),
 829                                        level == PT_PGD ? UVMF_TLB_FLUSH : 0);
 830
 831                if (ptl) {
 832                        xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
 833
 834                        /* Queue a deferred unlock for when this batch
 835                           is completed. */
 836                        xen_mc_callback(xen_pte_unlock, ptl);
 837                }
 838        }
 839
 840        return flush;
 841}
 842
 843/* This is called just after a mm has been created, but it has not
 844   been used yet.  We need to make sure that its pagetable is all
 845   read-only, and can be pinned. */
 846static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
 847{
 848        trace_xen_mmu_pgd_pin(mm, pgd);
 849
 850        xen_mc_batch();
 851
 852        if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) {
 853                /* re-enable interrupts for flushing */
 854                xen_mc_issue(0);
 855
 856                kmap_flush_unused();
 857
 858                xen_mc_batch();
 859        }
 860
 861#ifdef CONFIG_X86_64
 862        {
 863                pgd_t *user_pgd = xen_get_user_pgd(pgd);
 864
 865                xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
 866
 867                if (user_pgd) {
 868                        xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD);
 869                        xen_do_pin(MMUEXT_PIN_L4_TABLE,
 870                                   PFN_DOWN(__pa(user_pgd)));
 871                }
 872        }
 873#else /* CONFIG_X86_32 */
 874#ifdef CONFIG_X86_PAE
 875        /* Need to make sure unshared kernel PMD is pinnable */
 876        xen_pin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
 877                     PT_PMD);
 878#endif
 879        xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
 880#endif /* CONFIG_X86_64 */
 881        xen_mc_issue(0);
 882}
 883
 884static void xen_pgd_pin(struct mm_struct *mm)
 885{
 886        __xen_pgd_pin(mm, mm->pgd);
 887}
 888
 889/*
 890 * On save, we need to pin all pagetables to make sure they get their
 891 * mfns turned into pfns.  Search the list for any unpinned pgds and pin
 892 * them (unpinned pgds are not currently in use, probably because the
 893 * process is under construction or destruction).
 894 *
 895 * Expected to be called in stop_machine() ("equivalent to taking
 896 * every spinlock in the system"), so the locking doesn't really
 897 * matter all that much.
 898 */
 899void xen_mm_pin_all(void)
 900{
 901        struct page *page;
 902
 903        spin_lock(&pgd_lock);
 904
 905        list_for_each_entry(page, &pgd_list, lru) {
 906                if (!PagePinned(page)) {
 907                        __xen_pgd_pin(&init_mm, (pgd_t *)page_address(page));
 908                        SetPageSavePinned(page);
 909                }
 910        }
 911
 912        spin_unlock(&pgd_lock);
 913}
 914
 915/*
  916 * The init_mm pagetable is really pinned as soon as it's created, but
 917 * that's before we have page structures to store the bits.  So do all
 918 * the book-keeping now.
 919 */
 920static int __init xen_mark_pinned(struct mm_struct *mm, struct page *page,
 921                                  enum pt_level level)
 922{
 923        SetPagePinned(page);
 924        return 0;
 925}
 926
 927static void __init xen_mark_init_mm_pinned(void)
 928{
 929        xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
 930}
 931
 932static int xen_unpin_page(struct mm_struct *mm, struct page *page,
 933                          enum pt_level level)
 934{
 935        unsigned pgfl = TestClearPagePinned(page);
 936
 937        if (pgfl && !PageHighMem(page)) {
 938                void *pt = lowmem_page_address(page);
 939                unsigned long pfn = page_to_pfn(page);
 940                spinlock_t *ptl = NULL;
 941                struct multicall_space mcs;
 942
 943                /*
 944                 * Do the converse to pin_page.  If we're using split
  945                 * pte locks, we must be holding the lock while
 946                 * the pte page is unpinned but still RO to prevent
 947                 * concurrent updates from seeing it in this
 948                 * partially-pinned state.
 949                 */
 950                if (level == PT_PTE) {
 951                        ptl = xen_pte_lock(page, mm);
 952
 953                        if (ptl)
 954                                xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
 955                }
 956
 957                mcs = __xen_mc_entry(0);
 958
 959                MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
 960                                        pfn_pte(pfn, PAGE_KERNEL),
 961                                        level == PT_PGD ? UVMF_TLB_FLUSH : 0);
 962
 963                if (ptl) {
 964                        /* unlock when batch completed */
 965                        xen_mc_callback(xen_pte_unlock, ptl);
 966                }
 967        }
 968
 969        return 0;               /* never need to flush on unpin */
 970}
 971
  972/* Release a pagetable's pages back as normal RW */
 973static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
 974{
 975        trace_xen_mmu_pgd_unpin(mm, pgd);
 976
 977        xen_mc_batch();
 978
 979        xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
 980
 981#ifdef CONFIG_X86_64
 982        {
 983                pgd_t *user_pgd = xen_get_user_pgd(pgd);
 984
 985                if (user_pgd) {
 986                        xen_do_pin(MMUEXT_UNPIN_TABLE,
 987                                   PFN_DOWN(__pa(user_pgd)));
 988                        xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD);
 989                }
 990        }
 991#endif
 992
 993#ifdef CONFIG_X86_PAE
 994        /* Need to make sure unshared kernel PMD is unpinned */
 995        xen_unpin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
 996                       PT_PMD);
 997#endif
 998
 999        __xen_pgd_walk(mm, pgd, xen_unpin_page, USER_LIMIT);
1000
1001        xen_mc_issue(0);
1002}
1003
1004static void xen_pgd_unpin(struct mm_struct *mm)
1005{
1006        __xen_pgd_unpin(mm, mm->pgd);
1007}
1008
1009/*
1010 * On resume, undo any pinning done at save, so that the rest of the
1011 * kernel doesn't see any unexpected pinned pagetables.
1012 */
1013void xen_mm_unpin_all(void)
1014{
1015        struct page *page;
1016
1017        spin_lock(&pgd_lock);
1018
1019        list_for_each_entry(page, &pgd_list, lru) {
1020                if (PageSavePinned(page)) {
1021                        BUG_ON(!PagePinned(page));
1022                        __xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page));
1023                        ClearPageSavePinned(page);
1024                }
1025        }
1026
1027        spin_unlock(&pgd_lock);
1028}
1029
1030static void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
1031{
1032        spin_lock(&next->page_table_lock);
1033        xen_pgd_pin(next);
1034        spin_unlock(&next->page_table_lock);
1035}
1036
1037static void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
1038{
1039        spin_lock(&mm->page_table_lock);
1040        xen_pgd_pin(mm);
1041        spin_unlock(&mm->page_table_lock);
1042}
1043
1044
1045#ifdef CONFIG_SMP
 1046/* Another cpu may still have its %cr3 pointing at the pagetable, so
1047   we need to repoint it somewhere else before we can unpin it. */
1048static void drop_other_mm_ref(void *info)
1049{
1050        struct mm_struct *mm = info;
1051        struct mm_struct *active_mm;
1052
1053        active_mm = this_cpu_read(cpu_tlbstate.active_mm);
1054
1055        if (active_mm == mm && this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK)
1056                leave_mm(smp_processor_id());
1057
1058        /* If this cpu still has a stale cr3 reference, then make sure
1059           it has been flushed. */
1060        if (this_cpu_read(xen_current_cr3) == __pa(mm->pgd))
1061                load_cr3(swapper_pg_dir);
1062}
1063
1064static void xen_drop_mm_ref(struct mm_struct *mm)
1065{
1066        cpumask_var_t mask;
1067        unsigned cpu;
1068
1069        if (current->active_mm == mm) {
1070                if (current->mm == mm)
1071                        load_cr3(swapper_pg_dir);
1072                else
1073                        leave_mm(smp_processor_id());
1074        }
1075
1076        /* Get the "official" set of cpus referring to our pagetable. */
1077        if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
1078                for_each_online_cpu(cpu) {
1079                        if (!cpumask_test_cpu(cpu, mm_cpumask(mm))
1080                            && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
1081                                continue;
1082                        smp_call_function_single(cpu, drop_other_mm_ref, mm, 1);
1083                }
1084                return;
1085        }
1086        cpumask_copy(mask, mm_cpumask(mm));
1087
1088        /* It's possible that a vcpu may have a stale reference to our
 1089           cr3, because it's in lazy mode and hasn't yet flushed
 1090           its set of pending hypercalls.  In this case, we can
1091           look at its actual current cr3 value, and force it to flush
1092           if needed. */
1093        for_each_online_cpu(cpu) {
1094                if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
1095                        cpumask_set_cpu(cpu, mask);
1096        }
1097
1098        if (!cpumask_empty(mask))
1099                smp_call_function_many(mask, drop_other_mm_ref, mm, 1);
1100        free_cpumask_var(mask);
1101}
1102#else
1103static void xen_drop_mm_ref(struct mm_struct *mm)
1104{
1105        if (current->active_mm == mm)
1106                load_cr3(swapper_pg_dir);
1107}
1108#endif
1109
1110/*
1111 * While a process runs, Xen pins its pagetables, which means that the
1112 * hypervisor forces it to be read-only, and it controls all updates
1113 * to it.  This means that all pagetable updates have to go via the
1114 * hypervisor, which is moderately expensive.
1115 *
 1116 * Since we're pulling the pagetable down, we switch to init_mm,
 1117 * unpin the old process's pagetable and mark it all read-write, which
1118 * allows further operations on it to be simple memory accesses.
1119 *
 1120 * The only subtle point is that another CPU may still be using the
 1121 * pagetable because of lazy tlb flushing.  This means we need to
1122 * switch all CPUs off this pagetable before we can unpin it.
1123 */
1124static void xen_exit_mmap(struct mm_struct *mm)
1125{
1126        get_cpu();              /* make sure we don't move around */
1127        xen_drop_mm_ref(mm);
1128        put_cpu();
1129
1130        spin_lock(&mm->page_table_lock);
1131
1132        /* pgd may not be pinned in the error exit path of execve */
1133        if (xen_page_pinned(mm->pgd))
1134                xen_pgd_unpin(mm);
1135
1136        spin_unlock(&mm->page_table_lock);
1137}
1138
1139static void xen_post_allocator_init(void);
1140
1141#ifdef CONFIG_X86_64
1142static void __init xen_cleanhighmap(unsigned long vaddr,
1143                                    unsigned long vaddr_end)
1144{
1145        unsigned long kernel_end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1;
1146        pmd_t *pmd = level2_kernel_pgt + pmd_index(vaddr);
1147
1148        /* NOTE: The loop is more greedy than the cleanup_highmap variant.
1149         * We include the PMD passed in on _both_ boundaries. */
1150        for (; vaddr <= vaddr_end && (pmd < (level2_kernel_pgt + PAGE_SIZE));
1151                        pmd++, vaddr += PMD_SIZE) {
1152                if (pmd_none(*pmd))
1153                        continue;
1154                if (vaddr < (unsigned long) _text || vaddr > kernel_end)
1155                        set_pmd(pmd, __pmd(0));
1156        }
1157        /* In case we did something silly, we should crash in this function
1158         * instead of somewhere later and be confusing. */
1159        xen_mc_flush();
1160}
1161#endif
1162static void __init xen_pagetable_init(void)
1163{
1164#ifdef CONFIG_X86_64
1165        unsigned long size;
1166        unsigned long addr;
1167#endif
1168        paging_init();
1169        xen_setup_shared_info();
1170#ifdef CONFIG_X86_64
1171        if (!xen_feature(XENFEAT_auto_translated_physmap)) {
1172                unsigned long new_mfn_list;
1173
1174                size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
1175
1176                /* On 32-bit, we get zero so this never gets executed. */
1177                new_mfn_list = xen_revector_p2m_tree();
1178                if (new_mfn_list && new_mfn_list != xen_start_info->mfn_list) {
 1179                        /* Fill the old array (via its __ka address) with INVALID_P2M_ENTRY. */
1180                        memset((void *)xen_start_info->mfn_list, 0xff, size);
1181
1182                        /* We should be in __ka space. */
1183                        BUG_ON(xen_start_info->mfn_list < __START_KERNEL_map);
1184                        addr = xen_start_info->mfn_list;
 1185                        /* We round up to the PMD, which means that if anybody at this stage is
1186                         * using the __ka address of xen_start_info or xen_start_info->shared_info
 1187                         * they are going to crash.  Fortunately we have already revectored
1188                         * in xen_setup_kernel_pagetable and in xen_setup_shared_info. */
1189                        size = roundup(size, PMD_SIZE);
1190                        xen_cleanhighmap(addr, addr + size);
1191
1192                        size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
1193                        memblock_free(__pa(xen_start_info->mfn_list), size);
1194                        /* And revector! Bye bye old array */
1195                        xen_start_info->mfn_list = new_mfn_list;
1196                } else
1197                        goto skip;
1198        }
1199        /* At this stage, cleanup_highmap has already cleaned __ka space
1200         * from _brk_limit way up to the max_pfn_mapped (which is the end of
1201         * the ramdisk). We continue on, erasing PMD entries that point to page
1202         * tables - do note that they are accessible at this stage via __va.
1203         * For good measure we also round up to the PMD - which means that if
 1204         * anybody still has a __ka address for the initial boot stack and tries
 1205         * to use it, they are going to crash.  The xen_start_info has been
1206         * taken care of already in xen_setup_kernel_pagetable. */
1207        addr = xen_start_info->pt_base;
1208        size = roundup(xen_start_info->nr_pt_frames * PAGE_SIZE, PMD_SIZE);
1209
1210        xen_cleanhighmap(addr, addr + size);
1211        xen_start_info->pt_base = (unsigned long)__va(__pa(xen_start_info->pt_base));
1212#ifdef DEBUG
 1213        /* This is superfluous and not strictly necessary, but let's do it
 1214         * anyway.  The MODULES_VADDR -> MODULES_END range should be clear of
1215         * anything at this stage. */
1216        xen_cleanhighmap(MODULES_VADDR, roundup(MODULES_VADDR, PUD_SIZE) - 1);
1217#endif
1218skip:
1219#endif
1220        xen_post_allocator_init();
1221}
1222static void xen_write_cr2(unsigned long cr2)
1223{
1224        this_cpu_read(xen_vcpu)->arch.cr2 = cr2;
1225}
1226
1227static unsigned long xen_read_cr2(void)
1228{
1229        return this_cpu_read(xen_vcpu)->arch.cr2;
1230}
1231
1232unsigned long xen_read_cr2_direct(void)
1233{
1234        return this_cpu_read(xen_vcpu_info.arch.cr2);
1235}
1236
1237void xen_flush_tlb_all(void)
1238{
1239        struct mmuext_op *op;
1240        struct multicall_space mcs;
1241
1242        trace_xen_mmu_flush_tlb_all(0);
1243
1244        preempt_disable();
1245
1246        mcs = xen_mc_entry(sizeof(*op));
1247
1248        op = mcs.args;
1249        op->cmd = MMUEXT_TLB_FLUSH_ALL;
1250        MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1251
1252        xen_mc_issue(PARAVIRT_LAZY_MMU);
1253
1254        preempt_enable();
1255}
1256static void xen_flush_tlb(void)
1257{
1258        struct mmuext_op *op;
1259        struct multicall_space mcs;
1260
1261        trace_xen_mmu_flush_tlb(0);
1262
1263        preempt_disable();
1264
1265        mcs = xen_mc_entry(sizeof(*op));
1266
1267        op = mcs.args;
1268        op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
1269        MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1270
1271        xen_mc_issue(PARAVIRT_LAZY_MMU);
1272
1273        preempt_enable();
1274}
1275
1276static void xen_flush_tlb_single(unsigned long addr)
1277{
1278        struct mmuext_op *op;
1279        struct multicall_space mcs;
1280
1281        trace_xen_mmu_flush_tlb_single(addr);
1282
1283        preempt_disable();
1284
1285        mcs = xen_mc_entry(sizeof(*op));
1286        op = mcs.args;
1287        op->cmd = MMUEXT_INVLPG_LOCAL;
1288        op->arg1.linear_addr = addr & PAGE_MASK;
1289        MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1290
1291        xen_mc_issue(PARAVIRT_LAZY_MMU);
1292
1293        preempt_enable();
1294}
1295
1296static void xen_flush_tlb_others(const struct cpumask *cpus,
1297                                 struct mm_struct *mm, unsigned long start,
1298                                 unsigned long end)
1299{
1300        struct {
1301                struct mmuext_op op;
1302#ifdef CONFIG_SMP
1303                DECLARE_BITMAP(mask, num_processors);
1304#else
1305                DECLARE_BITMAP(mask, NR_CPUS);
1306#endif
1307        } *args;
1308        struct multicall_space mcs;
1309
1310        trace_xen_mmu_flush_tlb_others(cpus, mm, start, end);
1311
1312        if (cpumask_empty(cpus))
1313                return;         /* nothing to do */
1314
1315        mcs = xen_mc_entry(sizeof(*args));
1316        args = mcs.args;
1317        args->op.arg2.vcpumask = to_cpumask(args->mask);
1318
1319        /* Remove us, and any offline CPUS. */
1320        cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
1321        cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
1322
1323        args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
1324        if (end != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) {
1325                args->op.cmd = MMUEXT_INVLPG_MULTI;
1326                args->op.arg1.linear_addr = start;
1327        }
1328
1329        MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
1330
1331        xen_mc_issue(PARAVIRT_LAZY_MMU);
1332}
1333
1334static unsigned long xen_read_cr3(void)
1335{
1336        return this_cpu_read(xen_cr3);
1337}
1338
1339static void set_current_cr3(void *v)
1340{
1341        this_cpu_write(xen_current_cr3, (unsigned long)v);
1342}
1343
1344static void __xen_write_cr3(bool kernel, unsigned long cr3)
1345{
1346        struct mmuext_op op;
1347        unsigned long mfn;
1348
1349        trace_xen_mmu_write_cr3(kernel, cr3);
1350
1351        if (cr3)
1352                mfn = pfn_to_mfn(PFN_DOWN(cr3));
1353        else
1354                mfn = 0;
1355
1356        WARN_ON(mfn == 0 && kernel);
1357
1358        op.cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
1359        op.arg1.mfn = mfn;
1360
1361        xen_extend_mmuext_op(&op);
1362
1363        if (kernel) {
1364                this_cpu_write(xen_cr3, cr3);
1365
1366                /* Update xen_current_cr3 once the batch has actually
1367                   been submitted. */
1368                xen_mc_callback(set_current_cr3, (void *)cr3);
1369        }
1370}
1371static void xen_write_cr3(unsigned long cr3)
1372{
1373        BUG_ON(preemptible());
1374
1375        xen_mc_batch();  /* disables interrupts */
1376
 1377        /* Update while interrupts are disabled, so it's atomic with
 1378           respect to IPIs */
1379        this_cpu_write(xen_cr3, cr3);
1380
1381        __xen_write_cr3(true, cr3);
1382
1383#ifdef CONFIG_X86_64
1384        {
1385                pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
1386                if (user_pgd)
1387                        __xen_write_cr3(false, __pa(user_pgd));
1388                else
1389                        __xen_write_cr3(false, 0);
1390        }
1391#endif
1392
1393        xen_mc_issue(PARAVIRT_LAZY_CPU);  /* interrupts restored */
1394}
1395
1396#ifdef CONFIG_X86_64
1397/*
1398 * At the start of the day - when Xen launches a guest, it has already
1399 * built pagetables for the guest. We diligently look over them
 1400 * in xen_setup_kernel_pagetable and graft them as appropriate into the
1401 * init_level4_pgt and its friends. Then when we are happy we load
1402 * the new init_level4_pgt - and continue on.
1403 *
1404 * The generic code starts (start_kernel) and 'init_mem_mapping' sets
1405 * up the rest of the pagetables. When it has completed it loads the cr3.
1406 * N.B. that baremetal would start at 'start_kernel' (and the early
1407 * #PF handler would create bootstrap pagetables) - so we are running
 1408 * with the same assumptions about what to do when write_cr3 is executed
1409 * at this point.
1410 *
1411 * Since there are no user-page tables at all, we have two variants
1412 * of xen_write_cr3 - the early bootup (this one), and the late one
1413 * (xen_write_cr3). The reason we have to do that is that in 64-bit
1414 * the Linux kernel and user-space are both in ring 3 while the
1415 * hypervisor is in ring 0.
1416 */
1417static void __init xen_write_cr3_init(unsigned long cr3)
1418{
1419        BUG_ON(preemptible());
1420
1421        xen_mc_batch();  /* disables interrupts */
1422
 1423        /* Update while interrupts are disabled, so it's atomic with
 1424           respect to IPIs */
1425        this_cpu_write(xen_cr3, cr3);
1426
1427        __xen_write_cr3(true, cr3);
1428
1429        xen_mc_issue(PARAVIRT_LAZY_CPU);  /* interrupts restored */
1430}
1431#endif
1432
1433static int xen_pgd_alloc(struct mm_struct *mm)
1434{
1435        pgd_t *pgd = mm->pgd;
1436        int ret = 0;
1437
1438        BUG_ON(PagePinned(virt_to_page(pgd)));
1439
1440#ifdef CONFIG_X86_64
1441        {
1442                struct page *page = virt_to_page(pgd);
1443                pgd_t *user_pgd;
1444
1445                BUG_ON(page->private != 0);
1446
1447                ret = -ENOMEM;
1448
1449                user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
1450                page->private = (unsigned long)user_pgd;
1451
1452                if (user_pgd != NULL) {
1453                        user_pgd[pgd_index(VSYSCALL_START)] =
1454                                __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
1455                        ret = 0;
1456                }
1457
1458                BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
1459        }
1460#endif
1461
1462        return ret;
1463}
1464
1465static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
1466{
1467#ifdef CONFIG_X86_64
1468        pgd_t *user_pgd = xen_get_user_pgd(pgd);
1469
1470        if (user_pgd)
1471                free_page((unsigned long)user_pgd);
1472#endif
1473}
1474
1475#ifdef CONFIG_X86_32
1476static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
1477{
1478        /* If there's an existing pte, then don't allow _PAGE_RW to be set */
1479        if (pte_val_ma(*ptep) & _PAGE_PRESENT)
1480                pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
1481                               pte_val_ma(pte));
1482
1483        return pte;
1484}
1485#else /* CONFIG_X86_64 */
1486static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
1487{
1488        return pte;
1489}
1490#endif /* CONFIG_X86_64 */
1491
1492/*
1493 * Init-time set_pte while constructing initial pagetables, which
1494 * doesn't allow RO page table pages to be remapped RW.
1495 *
1496 * If there is no MFN for this PFN then this page is initially
1497 * ballooned out so clear the PTE (as in decrease_reservation() in
1498 * drivers/xen/balloon.c).
1499 *
1500 * Many of these PTE updates are done on unpinned and writable pages
1501 * and doing a hypercall for these is unnecessary and expensive.  At
1502 * this point it is not possible to tell if a page is pinned or not,
1503 * so always write the PTE directly and rely on Xen trapping and
1504 * emulating any updates as necessary.
1505 */
1506static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
1507{
1508        if (pte_mfn(pte) != INVALID_P2M_ENTRY)
1509                pte = mask_rw_pte(ptep, pte);
1510        else
1511                pte = __pte_ma(0);
1512
1513        native_set_pte(ptep, pte);
1514}
1515
1516static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
1517{
1518        struct mmuext_op op;
1519        op.cmd = cmd;
1520        op.arg1.mfn = pfn_to_mfn(pfn);
1521        if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
1522                BUG();
1523}
1524
1525/* Early in boot, while setting up the initial pagetable, assume
1526   everything is pinned. */
1527static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
1528{
1529#ifdef CONFIG_FLATMEM
1530        BUG_ON(mem_map);        /* should only be used early */
1531#endif
1532        make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
1533        pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1534}
1535
1536/* Used for pmd and pud */
1537static void __init xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn)
1538{
1539#ifdef CONFIG_FLATMEM
1540        BUG_ON(mem_map);        /* should only be used early */
1541#endif
1542        make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
1543}
1544
1545/* Early release_pte assumes that all pts are pinned, since there's
1546   only init_mm and anything attached to that is pinned. */
1547static void __init xen_release_pte_init(unsigned long pfn)
1548{
1549        pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
1550        make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1551}
1552
1553static void __init xen_release_pmd_init(unsigned long pfn)
1554{
1555        make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1556}
1557
1558static inline void __pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
1559{
1560        struct multicall_space mcs;
1561        struct mmuext_op *op;
1562
1563        mcs = __xen_mc_entry(sizeof(*op));
1564        op = mcs.args;
1565        op->cmd = cmd;
1566        op->arg1.mfn = pfn_to_mfn(pfn);
1567
1568        MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
1569}
1570
1571static inline void __set_pfn_prot(unsigned long pfn, pgprot_t prot)
1572{
1573        struct multicall_space mcs;
1574        unsigned long addr = (unsigned long)__va(pfn << PAGE_SHIFT);
1575
1576        mcs = __xen_mc_entry(0);
1577        MULTI_update_va_mapping(mcs.mc, (unsigned long)addr,
1578                                pfn_pte(pfn, prot), 0);
1579}
1580
 1581/* This needs to make sure the new pte page is pinned iff it's being
1582   attached to a pinned pagetable. */
1583static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn,
1584                                    unsigned level)
1585{
1586        bool pinned = PagePinned(virt_to_page(mm->pgd));
1587
1588        trace_xen_mmu_alloc_ptpage(mm, pfn, level, pinned);
1589
1590        if (pinned) {
1591                struct page *page = pfn_to_page(pfn);
1592
1593                SetPagePinned(page);
1594
1595                if (!PageHighMem(page)) {
1596                        xen_mc_batch();
1597
1598                        __set_pfn_prot(pfn, PAGE_KERNEL_RO);
1599
1600                        if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS)
1601                                __pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1602
1603                        xen_mc_issue(PARAVIRT_LAZY_MMU);
1604                } else {
1605                        /* make sure there are no stray mappings of
1606                           this page */
1607                        kmap_flush_unused();
1608                }
1609        }
1610}
1611
1612static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
1613{
1614        xen_alloc_ptpage(mm, pfn, PT_PTE);
1615}
1616
1617static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
1618{
1619        xen_alloc_ptpage(mm, pfn, PT_PMD);
1620}
1621
1622/* This should never happen until we're OK to use struct page */
1623static inline void xen_release_ptpage(unsigned long pfn, unsigned level)
1624{
1625        struct page *page = pfn_to_page(pfn);
1626        bool pinned = PagePinned(page);
1627
1628        trace_xen_mmu_release_ptpage(pfn, level, pinned);
1629
1630        if (pinned) {
1631                if (!PageHighMem(page)) {
1632                        xen_mc_batch();
1633
1634                        if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS)
1635                                __pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
1636
1637                        __set_pfn_prot(pfn, PAGE_KERNEL);
1638
1639                        xen_mc_issue(PARAVIRT_LAZY_MMU);
1640                }
1641                ClearPagePinned(page);
1642        }
1643}
1644
1645static void xen_release_pte(unsigned long pfn)
1646{
1647        xen_release_ptpage(pfn, PT_PTE);
1648}
1649
1650static void xen_release_pmd(unsigned long pfn)
1651{
1652        xen_release_ptpage(pfn, PT_PMD);
1653}
1654
1655#if PAGETABLE_LEVELS == 4
1656static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
1657{
1658        xen_alloc_ptpage(mm, pfn, PT_PUD);
1659}
1660
1661static void xen_release_pud(unsigned long pfn)
1662{
1663        xen_release_ptpage(pfn, PT_PUD);
1664}
1665#endif
1666
1667void __init xen_reserve_top(void)
1668{
1669#ifdef CONFIG_X86_32
1670        unsigned long top = HYPERVISOR_VIRT_START;
1671        struct xen_platform_parameters pp;
1672
1673        if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
1674                top = pp.virt_start;
1675
1676        reserve_top_address(-top);
1677#endif  /* CONFIG_X86_32 */
1678}
1679
1680/*
1681 * Like __va(), but returns the address in the kernel mapping (which is
1682 * all we have until the physical memory mapping has been set up).
1683 */
1684static void *__ka(phys_addr_t paddr)
1685{
1686#ifdef CONFIG_X86_64
1687        return (void *)(paddr + __START_KERNEL_map);
1688#else
1689        return __va(paddr);
1690#endif
1691}
1692
1693/* Convert a machine address to physical address */
1694static unsigned long m2p(phys_addr_t maddr)
1695{
1696        phys_addr_t paddr;
1697
1698        maddr &= PTE_PFN_MASK;
1699        paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
1700
1701        return paddr;
1702}
1703
1704/* Convert a machine address to kernel virtual */
1705static void *m2v(phys_addr_t maddr)
1706{
1707        return __ka(m2p(maddr));
1708}
1709
1710/* Set the page permissions on identity-mapped pages */
1711static void set_page_prot_flags(void *addr, pgprot_t prot, unsigned long flags)
1712{
1713        unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
1714        pte_t pte = pfn_pte(pfn, prot);
1715
1716        /* For PVH there is no need to set pages R/O or R/W to pin or unpin them. */
1717        if (xen_feature(XENFEAT_auto_translated_physmap))
1718                return;
1719
1720        if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, flags))
1721                BUG();
1722}
1723static void set_page_prot(void *addr, pgprot_t prot)
1724{
1725        return set_page_prot_flags(addr, prot, UVMF_NONE);
1726}
1727#ifdef CONFIG_X86_32
1728static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1729{
1730        unsigned pmdidx, pteidx;
1731        unsigned ident_pte;
1732        unsigned long pfn;
1733
1734        level1_ident_pgt = extend_brk(sizeof(pte_t) * LEVEL1_IDENT_ENTRIES,
1735                                      PAGE_SIZE);
1736
1737        ident_pte = 0;
1738        pfn = 0;
1739        for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
1740                pte_t *pte_page;
1741
1742                /* Reuse or allocate a page of ptes */
1743                if (pmd_present(pmd[pmdidx]))
1744                        pte_page = m2v(pmd[pmdidx].pmd);
1745                else {
1746                        /* Check for free pte pages */
1747                        if (ident_pte == LEVEL1_IDENT_ENTRIES)
1748                                break;
1749
1750                        pte_page = &level1_ident_pgt[ident_pte];
1751                        ident_pte += PTRS_PER_PTE;
1752
1753                        pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
1754                }
1755
1756                /* Install mappings */
1757                for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
1758                        pte_t pte;
1759
1760#ifdef CONFIG_X86_32
1761                        if (pfn > max_pfn_mapped)
1762                                max_pfn_mapped = pfn;
1763#endif
1764
1765                        if (!pte_none(pte_page[pteidx]))
1766                                continue;
1767
1768                        pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
1769                        pte_page[pteidx] = pte;
1770                }
1771        }
1772
1773        for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
1774                set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
1775
1776        set_page_prot(pmd, PAGE_KERNEL_RO);
1777}
1778#endif
1779void __init xen_setup_machphys_mapping(void)
1780{
1781        struct xen_machphys_mapping mapping;
1782
1783        if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
1784                machine_to_phys_mapping = (unsigned long *)mapping.v_start;
1785                machine_to_phys_nr = mapping.max_mfn + 1;
1786        } else {
1787                machine_to_phys_nr = MACH2PHYS_NR_ENTRIES;
1788        }
1789#ifdef CONFIG_X86_32
1790        WARN_ON((machine_to_phys_mapping + (machine_to_phys_nr - 1))
1791                < machine_to_phys_mapping);
1792#endif
1793}
1794
1795#ifdef CONFIG_X86_64
1796static void convert_pfn_mfn(void *v)
1797{
1798        pte_t *pte = v;
1799        int i;
1800
1801        /* All levels are converted the same way, so just treat them
1802           as ptes. */
1803        for (i = 0; i < PTRS_PER_PTE; i++)
1804                pte[i] = xen_make_pte(pte[i].pte);
1805}
1806static void __init check_pt_base(unsigned long *pt_base, unsigned long *pt_end,
1807                                 unsigned long addr)
1808{
1809        if (*pt_base == PFN_DOWN(__pa(addr))) {
1810                set_page_prot_flags((void *)addr, PAGE_KERNEL, UVMF_INVLPG);
1811                clear_page((void *)addr);
1812                (*pt_base)++;
1813        }
1814        if (*pt_end == PFN_DOWN(__pa(addr))) {
1815                set_page_prot_flags((void *)addr, PAGE_KERNEL, UVMF_INVLPG);
1816                clear_page((void *)addr);
1817                (*pt_end)--;
1818        }
1819}
1820/*
1821 * Set up the initial kernel pagetable.
1822 *
1823 * We can construct this by grafting the Xen-provided pagetable into
1824 * head_64.S's preconstructed pagetables.  We copy the Xen L2s into
1825 * level2_ident_pgt and level2_kernel_pgt.  This means that only the
1826 * kernel has a physical mapping to start with - but that's enough to
1827 * get __va working.  We need to fill in the rest of the physical
1828 * mapping once some sort of allocator has been set up.  NOTE: for
1829 * PVH, the page tables are native.
1830 */
1831void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
1832{
1833        pud_t *l3;
1834        pmd_t *l2;
1835        unsigned long addr[3];
1836        unsigned long pt_base, pt_end;
1837        unsigned i;
1838
1839        /* max_pfn_mapped is the last pfn mapped in the initial memory
1840         * mappings. On Xen, the mappings after the kernel image cover some
1841         * pages that don't exist in pfn space, so set max_pfn_mapped to the
1842         * last real pfn mapped. */
1843        max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));
1844
1845        pt_base = PFN_DOWN(__pa(xen_start_info->pt_base));
1846        pt_end = pt_base + xen_start_info->nr_pt_frames;
1847
1848        /* Zap identity mapping */
1849        init_level4_pgt[0] = __pgd(0);
1850
1851        if (!xen_feature(XENFEAT_auto_translated_physmap)) {
1852                /* Pre-constructed entries are in pfn, so convert to mfn */
1853                /* L4[272] -> level3_ident_pgt
1854                 * L4[511] -> level3_kernel_pgt */
1855                convert_pfn_mfn(init_level4_pgt);
1856
1857                /* L3_i[0] -> level2_ident_pgt */
1858                convert_pfn_mfn(level3_ident_pgt);
1859                /* L3_k[510] -> level2_kernel_pgt
1860                 * L3_k[511] -> level2_fixmap_pgt */
1861                convert_pfn_mfn(level3_kernel_pgt);
1862
1863                /* L3_k[511][506] -> level1_fixmap_pgt */
1864                convert_pfn_mfn(level2_fixmap_pgt);
1865        }
1866        /* We get [511][510] and have Xen's version of level2_kernel_pgt */
1867        l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
1868        l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
1869
1870        addr[0] = (unsigned long)pgd;
1871        addr[1] = (unsigned long)l3;
1872        addr[2] = (unsigned long)l2;
1873        /* Graft it onto L4[272][0]. Note that we are creating an aliasing problem:
1874         * both L4[272][0] and L4[511][510] have entries that point to the same
1875         * L2 (PMD) tables, meaning that a modification made through the __va
1876         * space will also be visible through the __ka space! (But if you just
1877         * modify the PMD table to point to other PTEs or none, then you
1878         * are OK - which is what cleanup_highmap does.) */
1879        copy_page(level2_ident_pgt, l2);
1880        /* Graft it onto L4[511][510] */
1881        copy_page(level2_kernel_pgt, l2);
1882
1883        if (!xen_feature(XENFEAT_auto_translated_physmap)) {
1884                /* Make pagetable pieces RO */
1885                set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
1886                set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
1887                set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
1888                set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
1889                set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO);
1890                set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1891                set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
1892                set_page_prot(level1_fixmap_pgt, PAGE_KERNEL_RO);
1893
1894                /* Pin down new L4 */
1895                pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
1896                                  PFN_DOWN(__pa_symbol(init_level4_pgt)));
1897
1898                /* Unpin Xen-provided one */
1899                pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1900
1901                /*
1902                 * At this stage there can be no user pgd, and no page
1903                 * structure to attach it to, so make sure we just set kernel
1904                 * pgd.
1905                 */
1906                xen_mc_batch();
1907                __xen_write_cr3(true, __pa(init_level4_pgt));
1908                xen_mc_issue(PARAVIRT_LAZY_CPU);
1909        } else
1910                native_write_cr3(__pa(init_level4_pgt));
1911
1912        /* We can't easily rip out the L3 and L2 pages: for the initial domain
1913         * the Xen pagetables are laid out as [L4], [L1], [L2], [L3], [L1],
1914         * [L1], ...  For guests built by the toolstack they come in
1915         * [L4], [L3], [L2], [L1], [L1], ... order.  So for dom0 we can only
1916         * rip out the [L4] (pgd), but for other guests we shave off three pages.
1917         */
1918        for (i = 0; i < ARRAY_SIZE(addr); i++)
1919                check_pt_base(&pt_base, &pt_end, addr[i]);
1920
1921        /* Reserve the (now up to three pages smaller) Xen pagetable we still use */
1922        memblock_reserve(PFN_PHYS(pt_base), (pt_end - pt_base) * PAGE_SIZE);
1923        /* Revector the xen_start_info */
1924        xen_start_info = (struct start_info *)__va(__pa(xen_start_info));
1925}
1926#else   /* !CONFIG_X86_64 */
1927static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD);
1928static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD);
1929
1930static void __init xen_write_cr3_init(unsigned long cr3)
1931{
1932        unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir));
1933
1934        BUG_ON(read_cr3_pa() != __pa(initial_page_table));
1935        BUG_ON(cr3 != __pa(swapper_pg_dir));
1936
1937        /*
1938         * We are switching to swapper_pg_dir for the first time (from
1939         * initial_page_table) and therefore need to mark that page
1940         * read-only and then pin it.
1941         *
1942         * Xen disallows sharing of kernel PMDs for PAE
1943         * guests. Therefore we must copy the kernel PMD from
1944         * initial_page_table into a new kernel PMD to be used in
1945         * swapper_pg_dir.
1946         */
1947        swapper_kernel_pmd =
1948                extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
1949        copy_page(swapper_kernel_pmd, initial_kernel_pmd);
1950        swapper_pg_dir[KERNEL_PGD_BOUNDARY] =
1951                __pgd(__pa(swapper_kernel_pmd) | _PAGE_PRESENT);
1952        set_page_prot(swapper_kernel_pmd, PAGE_KERNEL_RO);
1953
1954        set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
1955        xen_write_cr3(cr3);
1956        pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, pfn);
1957
1958        pin_pagetable_pfn(MMUEXT_UNPIN_TABLE,
1959                          PFN_DOWN(__pa(initial_page_table)));
1960        set_page_prot(initial_page_table, PAGE_KERNEL);
1961        set_page_prot(initial_kernel_pmd, PAGE_KERNEL);
1962
1963        pv_mmu_ops.write_cr3 = &xen_write_cr3;
1964}
1965
1966void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
1967{
1968        pmd_t *kernel_pmd;
1969
1970        initial_kernel_pmd =
1971                extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
1972
1973        max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) +
1974                                  xen_start_info->nr_pt_frames * PAGE_SIZE +
1975                                  512*1024);
1976
1977        kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
1978        copy_page(initial_kernel_pmd, kernel_pmd);
1979
1980        xen_map_identity_early(initial_kernel_pmd, max_pfn);
1981
1982        copy_page(initial_page_table, pgd);
1983        initial_page_table[KERNEL_PGD_BOUNDARY] =
1984                __pgd(__pa(initial_kernel_pmd) | _PAGE_PRESENT);
1985
1986        set_page_prot(initial_kernel_pmd, PAGE_KERNEL_RO);
1987        set_page_prot(initial_page_table, PAGE_KERNEL_RO);
1988        set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
1989
1990        pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1991
1992        pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE,
1993                          PFN_DOWN(__pa(initial_page_table)));
1994        xen_write_cr3(__pa(initial_page_table));
1995
1996        memblock_reserve(__pa(xen_start_info->pt_base),
1997                         xen_start_info->nr_pt_frames * PAGE_SIZE);
1998}
1999#endif  /* CONFIG_X86_64 */
2000
2001static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss;
2002
2003static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
2004{
2005        pte_t pte;
2006
2007        phys >>= PAGE_SHIFT;
2008
2009        switch (idx) {
2010        case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
2011        case FIX_RO_IDT:
2012#ifdef CONFIG_X86_32
2013        case FIX_WP_TEST:
2014        case FIX_VDSO:
2015# ifdef CONFIG_HIGHMEM
2016        case FIX_KMAP_BEGIN ... FIX_KMAP_END:
2017# endif
2018#else
2019        case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
2020        case VVAR_PAGE:
2021#endif
2022        case FIX_TEXT_POKE0:
2023        case FIX_TEXT_POKE1:
2024                /* All local page mappings */
2025                pte = pfn_pte(phys, prot);
2026                break;
2027
2028#ifdef CONFIG_X86_LOCAL_APIC
2029        case FIX_APIC_BASE:     /* maps dummy local APIC */
2030                pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
2031                break;
2032#endif
2033
2034#ifdef CONFIG_X86_IO_APIC
2035        case FIX_IO_APIC_BASE_0 ... FIX_IO_APIC_BASE_END:
2036                /*
2037                 * We just don't map the IO APIC - all access is via
2038                 * hypercalls; map the local dummy page instead.
2039                 */
2040                pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
2041                break;
2042#endif
2043
2044        case FIX_PARAVIRT_BOOTMAP:
2045                /* This is an MFN, but it isn't an IO mapping from the
2046                   IO domain */
2047                pte = mfn_pte(phys, prot);
2048                break;
2049
2050        default:
2051                /* By default, set_fixmap is used for hardware mappings */
2052                pte = mfn_pte(phys, prot);
2053                break;
2054        }
2055
2056        __native_set_fixmap(idx, pte);
2057
2058#ifdef CONFIG_X86_64
2059        /* Replicate changes to map the vsyscall page into the user
2060           pagetable vsyscall mapping. */
2061        if ((idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) ||
2062            idx == VVAR_PAGE) {
2063                unsigned long vaddr = __fix_to_virt(idx);
2064                set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
2065        }
2066#endif
2067}
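
/*
 * Illustrative sketch (not part of the original file): a fixmap update made
 * by generic code, e.g. from text_poke(), reaches xen_set_fixmap() through
 * pv_mmu_ops.set_fixmap and is translated according to the switch above.
 * The helper name is an assumption used only for illustration.
 */
static void __maybe_unused example_fixmap_update(phys_addr_t phys)
{
        /* Ends up in xen_set_fixmap() above, which builds a pfn-based pte
         * for FIX_TEXT_POKE0 since it is a local page mapping. */
        set_fixmap(FIX_TEXT_POKE0, phys);
}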
2068
2069static void __init xen_post_allocator_init(void)
2070{
2071        if (xen_feature(XENFEAT_auto_translated_physmap))
2072                return;
2073
2074        pv_mmu_ops.set_pte = xen_set_pte;
2075        pv_mmu_ops.set_pmd = xen_set_pmd;
2076        pv_mmu_ops.set_pud = xen_set_pud;
2077#if PAGETABLE_LEVELS == 4
2078        pv_mmu_ops.set_pgd = xen_set_pgd;
2079#endif
2080
2081        /* This will work as long as patching hasn't happened yet
2082           (which it hasn't) */
2083        pv_mmu_ops.alloc_pte = xen_alloc_pte;
2084        pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
2085        pv_mmu_ops.release_pte = xen_release_pte;
2086        pv_mmu_ops.release_pmd = xen_release_pmd;
2087#if PAGETABLE_LEVELS == 4
2088        pv_mmu_ops.alloc_pud = xen_alloc_pud;
2089        pv_mmu_ops.release_pud = xen_release_pud;
2090#endif
2091
2092#ifdef CONFIG_X86_64
2093        pv_mmu_ops.write_cr3 = &xen_write_cr3;
2094        SetPagePinned(virt_to_page(level3_user_vsyscall));
2095#endif
2096        xen_mark_init_mm_pinned();
2097}
2098
2099static void xen_leave_lazy_mmu(void)
2100{
2101        preempt_disable();
2102        xen_mc_flush();
2103        paravirt_leave_lazy_mmu();
2104        preempt_enable();
2105}
2106
2107static const struct pv_mmu_ops xen_mmu_ops __initconst = {
2108        .read_cr2 = xen_read_cr2,
2109        .write_cr2 = xen_write_cr2,
2110
2111        .read_cr3 = xen_read_cr3,
2112        .write_cr3 = xen_write_cr3_init,
2113
2114        .flush_tlb_user = xen_flush_tlb,
2115        .flush_tlb_kernel = xen_flush_tlb,
2116        .flush_tlb_single = xen_flush_tlb_single,
2117        .flush_tlb_others = xen_flush_tlb_others,
2118
2119        .pte_update = paravirt_nop,
2120
2121        .pgd_alloc = xen_pgd_alloc,
2122        .pgd_free = xen_pgd_free,
2123
2124        .alloc_pte = xen_alloc_pte_init,
2125        .release_pte = xen_release_pte_init,
2126        .alloc_pmd = xen_alloc_pmd_init,
2127        .release_pmd = xen_release_pmd_init,
2128
2129        .set_pte = xen_set_pte_init,
2130        .set_pte_at = xen_set_pte_at,
2131        .set_pmd = xen_set_pmd_hyper,
2132
2133        .ptep_modify_prot_start = __ptep_modify_prot_start,
2134        .ptep_modify_prot_commit = __ptep_modify_prot_commit,
2135
2136        .pte_val = PV_CALLEE_SAVE(xen_pte_val),
2137        .pgd_val = PV_CALLEE_SAVE(xen_pgd_val),
2138
2139        .make_pte = PV_CALLEE_SAVE(xen_make_pte),
2140        .make_pgd = PV_CALLEE_SAVE(xen_make_pgd),
2141
2142#ifdef CONFIG_X86_PAE
2143        .set_pte_atomic = xen_set_pte_atomic,
2144        .pte_clear = xen_pte_clear,
2145        .pmd_clear = xen_pmd_clear,
2146#endif  /* CONFIG_X86_PAE */
2147        .set_pud = xen_set_pud_hyper,
2148
2149        .make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
2150        .pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
2151
2152#if PAGETABLE_LEVELS == 4
2153        .pud_val = PV_CALLEE_SAVE(xen_pud_val),
2154        .make_pud = PV_CALLEE_SAVE(xen_make_pud),
2155        .set_pgd = xen_set_pgd_hyper,
2156
2157        .alloc_pud = xen_alloc_pmd_init,
2158        .release_pud = xen_release_pmd_init,
2159#endif  /* PAGETABLE_LEVELS == 4 */
2160
2161        .activate_mm = xen_activate_mm,
2162        .dup_mmap = xen_dup_mmap,
2163        .exit_mmap = xen_exit_mmap,
2164
2165        .lazy_mode = {
2166                .enter = paravirt_enter_lazy_mmu,
2167                .leave = xen_leave_lazy_mmu,
2168                .flush = paravirt_flush_lazy_mmu,
2169        },
2170
2171        .set_fixmap = xen_set_fixmap,
2172};
2173
2174void __init xen_init_mmu_ops(void)
2175{
2176        x86_init.paging.pagetable_init = xen_pagetable_init;
2177        pv_mmu_ops = xen_mmu_ops;
2178
2179        memset(dummy_mapping, 0xff, PAGE_SIZE);
2180}
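
/*
 * Illustrative sketch (not part of the original file): once xen_init_mmu_ops()
 * has installed xen_mmu_ops, ordinary pagetable accessors in generic code are
 * routed through the pv_mmu_ops indirection instead of writing memory
 * directly.  The helper below exists purely to show the dispatch; its name is
 * an assumption.
 */
static void __maybe_unused example_pv_pte_write(pte_t *ptep, pte_t pteval)
{
        /* Dispatches via pv_mmu_ops.set_pte: xen_set_pte_init() early in
         * boot, xen_set_pte() after xen_post_allocator_init(). */
        set_pte(ptep, pteval);
}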
2181
2182/* Protected by xen_reservation_lock. */
2183#define MAX_CONTIG_ORDER 9 /* 2MB */
2184static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];
2185
2186#define VOID_PTE (mfn_pte(0, __pgprot(0)))
2187static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order,
2188                                unsigned long *in_frames,
2189                                unsigned long *out_frames)
2190{
2191        int i;
2192        struct multicall_space mcs;
2193
2194        xen_mc_batch();
2195        for (i = 0; i < (1UL<<order); i++, vaddr += PAGE_SIZE) {
2196                mcs = __xen_mc_entry(0);
2197
2198                if (in_frames)
2199                        in_frames[i] = virt_to_mfn(vaddr);
2200
2201                MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0);
2202                __set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY);
2203
2204                if (out_frames)
2205                        out_frames[i] = virt_to_pfn(vaddr);
2206        }
2207        xen_mc_issue(0);
2208}
2209
2210/*
2211 * Update the pfn-to-mfn mappings for a virtual address range, either to
2212 * point to an array of mfns, or contiguously from a single starting
2213 * mfn.
2214 */
2215static void xen_remap_exchanged_ptes(unsigned long vaddr, int order,
2216                                     unsigned long *mfns,
2217                                     unsigned long first_mfn)
2218{
2219        unsigned i, limit;
2220        unsigned long mfn;
2221
2222        xen_mc_batch();
2223
2224        limit = 1u << order;
2225        for (i = 0; i < limit; i++, vaddr += PAGE_SIZE) {
2226                struct multicall_space mcs;
2227                unsigned flags;
2228
2229                mcs = __xen_mc_entry(0);
2230                if (mfns)
2231                        mfn = mfns[i];
2232                else
2233                        mfn = first_mfn + i;
2234
2235                if (i < (limit - 1))
2236                        flags = 0;
2237                else {
2238                        if (order == 0)
2239                                flags = UVMF_INVLPG | UVMF_ALL;
2240                        else
2241                                flags = UVMF_TLB_FLUSH | UVMF_ALL;
2242                }
2243
2244                MULTI_update_va_mapping(mcs.mc, vaddr,
2245                                mfn_pte(mfn, PAGE_KERNEL), flags);
2246
2247                set_phys_to_machine(virt_to_pfn(vaddr), mfn);
2248        }
2249
2250        xen_mc_issue(0);
2251}
2252
2253/*
2254 * Perform the hypercall to exchange a region of our pfns to point to
2255 * memory with the required contiguous alignment.  Takes the pfns as
2256 * input, and populates mfns as output.
2257 *
2258 * Returns non-zero if the hypervisor was able to satisfy the whole
2259 * request, zero otherwise.
2260 */
2261static int xen_exchange_memory(unsigned long extents_in, unsigned int order_in,
2262                               unsigned long *pfns_in,
2263                               unsigned long extents_out,
2264                               unsigned int order_out,
2265                               unsigned long *mfns_out,
2266                               unsigned int address_bits)
2267{
2268        long rc;
2269        int success;
2270
2271        struct xen_memory_exchange exchange = {
2272                .in = {
2273                        .nr_extents   = extents_in,
2274                        .extent_order = order_in,
2275                        .extent_start = pfns_in,
2276                        .domid        = DOMID_SELF
2277                },
2278                .out = {
2279                        .nr_extents   = extents_out,
2280                        .extent_order = order_out,
2281                        .extent_start = mfns_out,
2282                        .address_bits = address_bits,
2283                        .domid        = DOMID_SELF
2284                }
2285        };
2286
2287        BUG_ON(extents_in << order_in != extents_out << order_out);
2288
2289        rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
2290        success = (exchange.nr_exchanged == extents_in);
2291
2292        BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
2293        BUG_ON(success && (rc != 0));
2294
2295        return success;
2296}
2297
2298int xen_create_contiguous_region(unsigned long vstart, unsigned int order,
2299                                 unsigned int address_bits)
2300{
2301        unsigned long *in_frames = discontig_frames, out_frame;
2302        unsigned long  flags;
2303        int            success;
2304
2305        /*
2306         * Currently an auto-translated guest will not perform I/O, nor will
2307         * it require PAE page directories below 4GB. Therefore any calls to
2308         * this function are redundant and can be ignored.
2309         */
2310
2311        if (xen_feature(XENFEAT_auto_translated_physmap))
2312                return 0;
2313
2314        if (unlikely(order > MAX_CONTIG_ORDER))
2315                return -ENOMEM;
2316
2317        memset((void *) vstart, 0, PAGE_SIZE << order);
2318
2319        spin_lock_irqsave(&xen_reservation_lock, flags);
2320
2321        /* 1. Zap current PTEs, remembering MFNs. */
2322        xen_zap_pfn_range(vstart, order, in_frames, NULL);
2323
2324        /* 2. Get a new contiguous memory extent. */
2325        out_frame = virt_to_pfn(vstart);
2326        success = xen_exchange_memory(1UL << order, 0, in_frames,
2327                                      1, order, &out_frame,
2328                                      address_bits);
2329
2330        /* 3. Map the new extent in place of old pages. */
2331        if (success)
2332                xen_remap_exchanged_ptes(vstart, order, NULL, out_frame);
2333        else
2334                xen_remap_exchanged_ptes(vstart, order, in_frames, 0);
2335
2336        spin_unlock_irqrestore(&xen_reservation_lock, flags);
2337
2338        return success ? 0 : -ENOMEM;
2339}
2340EXPORT_SYMBOL_GPL(xen_create_contiguous_region);
2341
2342void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order)
2343{
2344        unsigned long *out_frames = discontig_frames, in_frame;
2345        unsigned long  flags;
2346        int success;
2347
2348        if (xen_feature(XENFEAT_auto_translated_physmap))
2349                return;
2350
2351        if (unlikely(order > MAX_CONTIG_ORDER))
2352                return;
2353
2354        memset((void *) vstart, 0, PAGE_SIZE << order);
2355
2356        spin_lock_irqsave(&xen_reservation_lock, flags);
2357
2358        /* 1. Find start MFN of contiguous extent. */
2359        in_frame = virt_to_mfn(vstart);
2360
2361        /* 2. Zap current PTEs. */
2362        xen_zap_pfn_range(vstart, order, NULL, out_frames);
2363
2364        /* 3. Do the exchange for non-contiguous MFNs. */
2365        success = xen_exchange_memory(1, order, &in_frame, 1UL << order,
2366                                        0, out_frames, 0);
2367
2368        /* 4. Map new pages in place of old pages. */
2369        if (success)
2370                xen_remap_exchanged_ptes(vstart, order, out_frames, 0);
2371        else
2372                xen_remap_exchanged_ptes(vstart, order, NULL, in_frame);
2373
2374        spin_unlock_irqrestore(&xen_reservation_lock, flags);
2375}
2376EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region);
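
/*
 * Hypothetical usage sketch (not part of the original file): how a driver
 * might obtain a machine-contiguous, 32-bit-addressable buffer by exchanging
 * the frames behind an ordinary page allocation, and release it again.  The
 * function name, the order and the address width are illustrative
 * assumptions only.
 */
static int __maybe_unused example_alloc_machine_contiguous(void)
{
        unsigned int order = 2;         /* four pages */
        unsigned long vstart = __get_free_pages(GFP_KERNEL, order);
        int rc;

        if (!vstart)
                return -ENOMEM;

        /* Swap the backing frames for a contiguous machine extent that is
         * addressable with 32 bits. */
        rc = xen_create_contiguous_region(vstart, order, 32);
        if (rc) {
                free_pages(vstart, order);
                return rc;
        }

        /* ... program the device with the machine address of vstart ... */

        /* Give the contiguous extent back and restore ordinary frames. */
        xen_destroy_contiguous_region(vstart, order);
        free_pages(vstart, order);
        return 0;
}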
2377
2378#ifdef CONFIG_XEN_PVHVM
2379#ifdef CONFIG_PROC_VMCORE
2380/*
2381 * This function is used in two contexts:
2382 * - the kdump kernel has to check whether a pfn of the crashed kernel
2383 *   was a ballooned page. vmcore uses this function to decide
2384 *   whether to access a pfn of the crashed kernel.
2385 * - the kexec kernel has to check whether a pfn was ballooned by the
2386 *   previous kernel. If the pfn is ballooned, handle it properly.
2387 * Returns 0 if the pfn is not backed by a RAM page; the caller may
2388 *   then handle the pfn specially.
2389 */
2390static int xen_oldmem_pfn_is_ram(unsigned long pfn)
2391{
2392        struct xen_hvm_get_mem_type a = {
2393                .domid = DOMID_SELF,
2394                .pfn = pfn,
2395        };
2396        int ram;
2397
2398        if (HYPERVISOR_hvm_op(HVMOP_get_mem_type, &a))
2399                return -ENXIO;
2400
2401        switch (a.mem_type) {
2402                case HVMMEM_mmio_dm:
2403                        ram = 0;
2404                        break;
2405                case HVMMEM_ram_rw:
2406                case HVMMEM_ram_ro:
2407                default:
2408                        ram = 1;
2409                        break;
2410        }
2411
2412        return ram;
2413}
2414#endif
2415
2416static void xen_hvm_exit_mmap(struct mm_struct *mm)
2417{
2418        struct xen_hvm_pagetable_dying a;
2419        int rc;
2420
2421        a.domid = DOMID_SELF;
2422        a.gpa = __pa(mm->pgd);
2423        rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
2424        WARN_ON_ONCE(rc < 0);
2425}
2426
2427static int is_pagetable_dying_supported(void)
2428{
2429        struct xen_hvm_pagetable_dying a;
2430        int rc = 0;
2431
2432        a.domid = DOMID_SELF;
2433        a.gpa = 0x00;
2434        rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
2435        if (rc < 0) {
2436                printk(KERN_DEBUG "HVMOP_pagetable_dying not supported\n");
2437                return 0;
2438        }
2439        return 1;
2440}
2441
2442void __init xen_hvm_init_mmu_ops(void)
2443{
2444        if (is_pagetable_dying_supported())
2445                pv_mmu_ops.exit_mmap = xen_hvm_exit_mmap;
2446#ifdef CONFIG_PROC_VMCORE
2447        WARN_ON(register_oldmem_pfn_is_ram(&xen_oldmem_pfn_is_ram));
2448#endif
2449}
2450#endif
2451
2452#define REMAP_BATCH_SIZE 16
2453
2454struct remap_data {
2455        unsigned long mfn;
2456        pgprot_t prot;
2457        struct mmu_update *mmu_update;
2458};
2459
2460static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
2461                                 unsigned long addr, void *data)
2462{
2463        struct remap_data *rmd = data;
2464        pte_t pte = pte_mkspecial(mfn_pte(rmd->mfn++, rmd->prot));
2465
2466        rmd->mmu_update->ptr = virt_to_machine(ptep).maddr;
2467        rmd->mmu_update->val = pte_val_ma(pte);
2468        rmd->mmu_update++;
2469
2470        return 0;
2471}
2472
2473int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
2474                               unsigned long addr,
2475                               xen_pfn_t mfn, int nr,
2476                               pgprot_t prot, unsigned domid,
2477                               struct page **pages)
2478
2479{
2480        struct remap_data rmd;
2481        struct mmu_update mmu_update[REMAP_BATCH_SIZE];
2482        int batch;
2483        unsigned long range;
2484        int err = 0;
2485
2486        if (xen_feature(XENFEAT_auto_translated_physmap))
2487                return -EINVAL;
2488
2489        BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO)));
2490
2491        rmd.mfn = mfn;
2492        rmd.prot = prot;
2493
2494        while (nr) {
2495                batch = min(REMAP_BATCH_SIZE, nr);
2496                range = (unsigned long)batch << PAGE_SHIFT;
2497
2498                rmd.mmu_update = mmu_update;
2499                err = apply_to_page_range(vma->vm_mm, addr, range,
2500                                          remap_area_mfn_pte_fn, &rmd);
2501                if (err)
2502                        goto out;
2503
2504                err = HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid);
2505                if (err < 0)
2506                        goto out;
2507
2508                nr -= batch;
2509                addr += range;
2510        }
2511
2512        err = 0;
2513out:
2514
2515        xen_flush_tlb_all();
2516
2517        return err;
2518}
2519EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);
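
/*
 * Hypothetical usage sketch (not part of the original file): how a
 * privcmd-style driver's ->mmap handler might map 'nr' foreign frames,
 * starting at machine frame 'mfn' of domain 'domid', into a userspace vma.
 * The function name and the flag choice beyond VM_IO | VM_PFNMAP are
 * illustrative assumptions.
 */
static int __maybe_unused example_mmap_foreign_frames(struct vm_area_struct *vma,
                                                      xen_pfn_t mfn, int nr,
                                                      unsigned domid)
{
        /* xen_remap_domain_mfn_range() insists on VM_IO | VM_PFNMAP. */
        vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTCOPY | VM_DONTEXPAND;

        return xen_remap_domain_mfn_range(vma, vma->vm_start, mfn, nr,
                                          vma->vm_page_prot, domid, NULL);
}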
2520
2521/* Returns 0 on success. */
2522int xen_unmap_domain_mfn_range(struct vm_area_struct *vma,
2523                               int numpgs, struct page **pages)
2524{
2525        if (!pages || !xen_feature(XENFEAT_auto_translated_physmap))
2526                return 0;
2527
2528        return -EINVAL;
2529}
2530EXPORT_SYMBOL_GPL(xen_unmap_domain_mfn_range);
2531