linux/arch/x86/mm/pgtable.c
// SPDX-License-Identifier: GPL-2.0
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/hugetlb.h>
#include <asm/pgalloc.h>
#include <asm/pgtable.h>
#include <asm/tlb.h>
#include <asm/fixmap.h>
#include <asm/mtrr.h>

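/*
 * With CONFIG_DYNAMIC_PHYSICAL_MASK the effective physical address mask
 * can be narrowed at boot, e.g. when memory encryption claims the top
 * physical address bits.
 */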
#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
phys_addr_t physical_mask __ro_after_init = (1ULL << __PHYSICAL_MASK_SHIFT) - 1;
EXPORT_SYMBOL(physical_mask);
#endif

#define PGALLOC_GFP (GFP_KERNEL_ACCOUNT | __GFP_ZERO)

#ifdef CONFIG_HIGHPTE
#define PGALLOC_USER_GFP __GFP_HIGHMEM
#else
#define PGALLOC_USER_GFP 0
#endif

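/*
 * GFP flags for user page-table pages: with CONFIG_HIGHPTE they may come
 * from highmem, unless "userpte=nohigh" clears __GFP_HIGHMEM again (see
 * setup_userpte() below).
 */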
gfp_t __userpte_alloc_gfp = PGALLOC_GFP | PGALLOC_USER_GFP;

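/*
 * Kernel page-table pages are never accounted to a memcg, so drop
 * __GFP_ACCOUNT here.  User PTE pages below use __userpte_alloc_gfp and
 * additionally need the page-table constructor for the split ptlock.
 */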
pte_t *pte_alloc_one_kernel(struct mm_struct *mm)
{
        return (pte_t *)__get_free_page(PGALLOC_GFP & ~__GFP_ACCOUNT);
}

pgtable_t pte_alloc_one(struct mm_struct *mm)
{
        struct page *pte;

        pte = alloc_pages(__userpte_alloc_gfp, 0);
        if (!pte)
                return NULL;
        if (!pgtable_page_ctor(pte)) {
                __free_page(pte);
                return NULL;
        }
        return pte;
}

static int __init setup_userpte(char *arg)
{
        if (!arg)
                return -EINVAL;

        /*
         * "userpte=nohigh" disables allocation of user pagetables in
         * high memory.
         */
        if (strcmp(arg, "nohigh") == 0)
                __userpte_alloc_gfp &= ~__GFP_HIGHMEM;
        else
                return -EINVAL;
        return 0;
}
early_param("userpte", setup_userpte);

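/*
 * Tear down a user PTE page: run the page-table destructor, tell the
 * paravirt backend that the pfn no longer holds a page table, and queue
 * the page for freeing once the TLB has been flushed.
 */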
void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
{
        pgtable_page_dtor(pte);
        paravirt_release_pte(page_to_pfn(pte));
        paravirt_tlb_remove_table(tlb, pte);
}

#if CONFIG_PGTABLE_LEVELS > 2
void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
{
        struct page *page = virt_to_page(pmd);
        paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
        /*
         * NOTE! For PAE, any changes to the top page-directory-pointer-table
         * entries need a full cr3 reload to flush.
         */
#ifdef CONFIG_X86_PAE
        tlb->need_flush_all = 1;
#endif
        pgtable_pmd_page_dtor(page);
        paravirt_tlb_remove_table(tlb, page);
}

#if CONFIG_PGTABLE_LEVELS > 3
void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
{
        paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
        paravirt_tlb_remove_table(tlb, virt_to_page(pud));
}

#if CONFIG_PGTABLE_LEVELS > 4
void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
{
        paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT);
        paravirt_tlb_remove_table(tlb, virt_to_page(p4d));
}
#endif  /* CONFIG_PGTABLE_LEVELS > 4 */
#endif  /* CONFIG_PGTABLE_LEVELS > 3 */
#endif  /* CONFIG_PGTABLE_LEVELS > 2 */

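/*
 * pgd_list links every pgd page that does not share the kernel pmd, so
 * that updates to the kernel part of the page tables can be propagated
 * to all of them.  Callers must hold pgd_lock.
 */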
static inline void pgd_list_add(pgd_t *pgd)
{
        struct page *page = virt_to_page(pgd);

        list_add(&page->lru, &pgd_list);
}

static inline void pgd_list_del(pgd_t *pgd)
{
        struct page *page = virt_to_page(pgd);

        list_del(&page->lru);
}

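/*
 * Number of pgd entries that cannot be shared with the reference kernel
 * page tables: just the user range when the kernel pmd is shared, all of
 * them otherwise.
 */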
#define UNSHARED_PTRS_PER_PGD                           \
        (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)


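/*
 * Stash the owning mm in the pgd's struct page so that walkers of
 * pgd_list can get from a pgd page back to its mm.
 */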
static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
{
        virt_to_page(pgd)->pt_mm = mm;
}

struct mm_struct *pgd_page_get_mm(struct page *page)
{
        return page->pt_mm;
}

static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
{
        /* If the pgd points to a shared pagetable level (either the
           ptes in non-PAE, or shared PMD in PAE), then just copy the
           references from swapper_pg_dir. */
        if (CONFIG_PGTABLE_LEVELS == 2 ||
            (CONFIG_PGTABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
            CONFIG_PGTABLE_LEVELS >= 4) {
                clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
                                swapper_pg_dir + KERNEL_PGD_BOUNDARY,
                                KERNEL_PGD_PTRS);
        }

        /* list required to sync kernel mapping updates */
        if (!SHARED_KERNEL_PMD) {
                pgd_set_mm(pgd, mm);
                pgd_list_add(pgd);
        }
}

static void pgd_dtor(pgd_t *pgd)
{
        if (SHARED_KERNEL_PMD)
                return;

        spin_lock(&pgd_lock);
        pgd_list_del(pgd);
        spin_unlock(&pgd_lock);
}

/*
 * List of all pgd's needed for non-PAE so it can invalidate entries
 * in both cached and uncached pgd's; not needed for PAE since the
 * kernel pmd is shared. If PAE were not to share the pmd a similar
 * tactic would be needed. This is essentially codepath-based locking
 * against pageattr.c; it is the unique case in which a valid change
 * of kernel pagetables can't be lazily synchronized by vmalloc faults.
 * vmalloc faults work because attached pagetables are never freed.
 * -- nyc
 */

#ifdef CONFIG_X86_PAE
/*
 * In PAE mode, we need to do a cr3 reload (=tlb flush) when
 * updating the top-level pagetable entries to guarantee the
 * processor notices the update.  Since this is expensive, and
 * all 4 top-level entries are used almost immediately in a
 * new process's life, we just pre-populate them here.
 *
 * Also, if we're in a paravirt environment where the kernel pmd is
 * not shared between pagetables (!SHARED_KERNEL_PMD), we allocate
 * and initialize the kernel pmds here.
 */
#define PREALLOCATED_PMDS       UNSHARED_PTRS_PER_PGD

void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
{
        paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);

        /* Note: almost everything apart from _PAGE_PRESENT is
           reserved at the pmd (PDPT) level. */
        set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));

        /*
         * According to Intel App note "TLBs, Paging-Structure Caches,
         * and Their Invalidation", April 2007, document 317080-001,
         * section 8.1: in PAE mode we explicitly have to flush the
         * TLB via cr3 if the top-level pgd is changed...
         */
        flush_tlb_mm(mm);
}
#else  /* !CONFIG_X86_PAE */

/* No need to prepopulate any pagetable entries in non-PAE modes. */
#define PREALLOCATED_PMDS       0

#endif  /* CONFIG_X86_PAE */

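/*
 * Undo preallocate_pmds(): free every pmd page that was successfully
 * allocated and drop its accounting again.
 */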
static void free_pmds(struct mm_struct *mm, pmd_t *pmds[])
{
        int i;

        for (i = 0; i < PREALLOCATED_PMDS; i++)
                if (pmds[i]) {
                        pgtable_pmd_page_dtor(virt_to_page(pmds[i]));
                        free_page((unsigned long)pmds[i]);
                        mm_dec_nr_pmds(mm);
                }
}

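/*
 * Allocate the PREALLOCATED_PMDS pmd pages needed to pre-populate a new
 * pgd (PAE only).  On failure, everything allocated so far is freed and
 * -ENOMEM is returned.
 */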
static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[])
{
        int i;
        bool failed = false;
        gfp_t gfp = PGALLOC_GFP;

        if (mm == &init_mm)
                gfp &= ~__GFP_ACCOUNT;

        for (i = 0; i < PREALLOCATED_PMDS; i++) {
                pmd_t *pmd = (pmd_t *)__get_free_page(gfp);
                if (!pmd)
                        failed = true;
                if (pmd && !pgtable_pmd_page_ctor(virt_to_page(pmd))) {
                        free_page((unsigned long)pmd);
                        pmd = NULL;
                        failed = true;
                }
                if (pmd)
                        mm_inc_nr_pmds(mm);
                pmds[i] = pmd;
        }

        if (failed) {
                free_pmds(mm, pmds);
                return -ENOMEM;
        }

        return 0;
}

/*
 * Mop up any pmd pages which may still be attached to the pgd.
 * Normally they will be freed by munmap/exit_mmap, but any pmd we
 * preallocate which never got a corresponding vma will need to be
 * freed manually.
 */
static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
{
        int i;

        for (i = 0; i < PREALLOCATED_PMDS; i++) {
                pgd_t pgd = pgdp[i];

                if (pgd_val(pgd) != 0) {
                        pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);

                        pgdp[i] = native_make_pgd(0);

                        paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
                        pmd_free(mm, pmd);
                        mm_dec_nr_pmds(mm);
                }
        }
}

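/*
 * Hook the preallocated pmds into a new pgd, copying the kernel part of
 * the pmd entries from swapper_pg_dir.
 */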
static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
{
        p4d_t *p4d;
        pud_t *pud;
        int i;

        if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */
                return;

        p4d = p4d_offset(pgd, 0);
        pud = pud_offset(p4d, 0);

        for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) {
                pmd_t *pmd = pmds[i];

                if (i >= KERNEL_PGD_BOUNDARY)
                        memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
                               sizeof(pmd_t) * PTRS_PER_PMD);

                pud_populate(mm, pud, pmd);
        }
}

/*
 * Xen paravirt expects the pgd table to occupy a whole page, and the
 * 64-bit kernel makes the same assumption.
 *
 * A PAE kernel that is not running as a Xen domain, however, only needs
 * 32 bytes for the pgd instead of a whole page.
 */
#ifdef CONFIG_X86_PAE

#include <linux/slab.h>

#define PGD_SIZE        (PTRS_PER_PGD * sizeof(pgd_t))
#define PGD_ALIGN       32

static struct kmem_cache *pgd_cache;

void __init pgd_cache_init(void)
{
        /*
         * When a PAE kernel runs as a Xen domain it does not share the
         * kernel pmd, and that requires a whole page for the pgd.
         */
        if (!SHARED_KERNEL_PMD)
                return;

        /*
         * When a PAE kernel is not running as a Xen domain it shares the
         * kernel pmd, so the pgd does not need a whole page: 32 bytes are
         * enough.  Create a 32-byte slab cache at boot for pgd allocations.
         */
        pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_ALIGN,
                                      SLAB_PANIC, NULL);
}

static inline pgd_t *_pgd_alloc(void)
{
        /*
         * Without SHARED_KERNEL_PMD the PAE kernel is running as a Xen
         * domain, so allocate a whole page for the pgd.
         */
        if (!SHARED_KERNEL_PMD)
                return (pgd_t *)__get_free_page(PGALLOC_GFP);

        /*
         * Otherwise the PAE kernel is not running as a Xen domain, so a
         * 32-byte slab allocation is enough for the pgd.
         */
        return kmem_cache_alloc(pgd_cache, PGALLOC_GFP);
}

static inline void _pgd_free(pgd_t *pgd)
{
        if (!SHARED_KERNEL_PMD)
                free_page((unsigned long)pgd);
        else
                kmem_cache_free(pgd_cache, pgd);
}
#else

void __init pgd_cache_init(void)
{
}

static inline pgd_t *_pgd_alloc(void)
{
        return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER);
}

static inline void _pgd_free(pgd_t *pgd)
{
        free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
}
#endif /* CONFIG_X86_PAE */

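/*
 * Allocate and set up a new pgd: preallocate the PAE pmds, let the
 * paravirt backend track the pgd, then copy the kernel mappings and
 * pre-populate the pmds under pgd_lock so that walkers of pgd_list never
 * see a partially constructed pgd.
 */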
pgd_t *pgd_alloc(struct mm_struct *mm)
{
        pgd_t *pgd;
        pmd_t *pmds[PREALLOCATED_PMDS];

        pgd = _pgd_alloc();

        if (pgd == NULL)
                goto out;

        mm->pgd = pgd;

        if (preallocate_pmds(mm, pmds) != 0)
                goto out_free_pgd;

        if (paravirt_pgd_alloc(mm) != 0)
                goto out_free_pmds;

        /*
         * Make sure that pre-populating the pmds is atomic with
         * respect to anything walking the pgd_list, so that they
         * never see a partially populated pgd.
         */
        spin_lock(&pgd_lock);

        pgd_ctor(mm, pgd);
        pgd_prepopulate_pmd(mm, pgd, pmds);

        spin_unlock(&pgd_lock);

        return pgd;

out_free_pmds:
        free_pmds(mm, pmds);
out_free_pgd:
        _pgd_free(pgd);
out:
        return NULL;
}

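/* Tear a pgd down in the reverse order of pgd_alloc(). */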
void pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
        pgd_mop_up_pmds(mm, pgd);
        pgd_dtor(pgd);
        paravirt_pgd_free(mm, pgd);
        _pgd_free(pgd);
}

/*
 * Used to set accessed or dirty bits in the page table entries
 * on other architectures. On x86, the accessed and dirty bits
 * are tracked by hardware. However, do_wp_page calls this function
 * to also make the pte writeable at the same time the dirty bit is
 * set. In that case we do actually need to write the PTE.
 */
int ptep_set_access_flags(struct vm_area_struct *vma,
                          unsigned long address, pte_t *ptep,
                          pte_t entry, int dirty)
{
        int changed = !pte_same(*ptep, entry);

        if (changed && dirty)
                *ptep = entry;

        return changed;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pmdp_set_access_flags(struct vm_area_struct *vma,
                          unsigned long address, pmd_t *pmdp,
                          pmd_t entry, int dirty)
{
        int changed = !pmd_same(*pmdp, entry);

        VM_BUG_ON(address & ~HPAGE_PMD_MASK);

        if (changed && dirty) {
                *pmdp = entry;
                /*
                 * We had a write-protection fault here and changed the pmd
                 * to be more permissive. No need to flush the TLB for that,
                 * #PF is architecturally guaranteed to do that and in the
                 * worst-case we'll generate a spurious fault.
                 */
        }

        return changed;
}

int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
                          pud_t *pudp, pud_t entry, int dirty)
{
        int changed = !pud_same(*pudp, entry);

        VM_BUG_ON(address & ~HPAGE_PUD_MASK);

        if (changed && dirty) {
                *pudp = entry;
                /*
                 * We had a write-protection fault here and changed the pud
                 * to be more permissive. No need to flush the TLB for that,
                 * #PF is architecturally guaranteed to do that and in the
                 * worst-case we'll generate a spurious fault.
                 */
        }

        return changed;
}
#endif

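/*
 * Atomically clear the accessed bit and return whether it was set.
 * No TLB flush is done here; see ptep_clear_flush_young() below.
 */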
int ptep_test_and_clear_young(struct vm_area_struct *vma,
                              unsigned long addr, pte_t *ptep)
{
        int ret = 0;

        if (pte_young(*ptep))
                ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
                                         (unsigned long *) &ptep->pte);

        return ret;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pmdp_test_and_clear_young(struct vm_area_struct *vma,
                              unsigned long addr, pmd_t *pmdp)
{
        int ret = 0;

        if (pmd_young(*pmdp))
                ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
                                         (unsigned long *)pmdp);

        return ret;
}
int pudp_test_and_clear_young(struct vm_area_struct *vma,
                              unsigned long addr, pud_t *pudp)
{
        int ret = 0;

        if (pud_young(*pudp))
                ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
                                         (unsigned long *)pudp);

        return ret;
}
#endif

int ptep_clear_flush_young(struct vm_area_struct *vma,
                           unsigned long address, pte_t *ptep)
{
        /*
         * On x86 CPUs, clearing the accessed bit without a TLB flush
         * doesn't cause data corruption. [ It could cause incorrect
         * page aging and the (mistaken) reclaim of hot pages, but the
         * chance of that should be relatively low. ]
         *
         * So as a performance optimization don't flush the TLB when
         * clearing the accessed bit, it will eventually be flushed by
         * a context switch or a VM operation anyway. [ In the rare
         * event of it not getting flushed for a long time the delay
         * shouldn't really matter because there's no real memory
         * pressure for swapout to react to. ]
         */
        return ptep_test_and_clear_young(vma, address, ptep);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pmdp_clear_flush_young(struct vm_area_struct *vma,
                           unsigned long address, pmd_t *pmdp)
{
        int young;

        VM_BUG_ON(address & ~HPAGE_PMD_MASK);

        young = pmdp_test_and_clear_young(vma, address, pmdp);
        if (young)
                flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);

        return young;
}
#endif

/**
 * reserve_top_address - reserves a hole in the top of kernel address space
 * @reserve: size of hole to reserve
 *
 * Can be used to relocate the fixmap area and poke a hole in the top
 * of kernel address space to make room for a hypervisor.
 */
void __init reserve_top_address(unsigned long reserve)
{
#ifdef CONFIG_X86_32
        BUG_ON(fixmaps_set > 0);
        __FIXADDR_TOP = round_down(-reserve, 1 << PMD_SHIFT) - PAGE_SIZE;
        printk(KERN_INFO "Reserving virtual address space above 0x%08lx (rounded to 0x%08lx)\n",
               -reserve, __FIXADDR_TOP + PAGE_SIZE);
#endif
}

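/*
 * Number of fixmap entries that have been installed; reserve_top_address()
 * checks it to ensure the fixmap is only relocated before any entry is set.
 */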
int fixmaps_set;

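/* Install @pte for fixmap slot @idx in the kernel page tables. */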
void __native_set_fixmap(enum fixed_addresses idx, pte_t pte)
{
        unsigned long address = __fix_to_virt(idx);

#ifdef CONFIG_X86_64
       /*
        * Ensure that the static initial page tables are covering the
        * fixmap completely.
        */
        BUILD_BUG_ON(__end_of_permanent_fixed_addresses >
                     (FIXMAP_PMD_NUM * PTRS_PER_PTE));
#endif

        if (idx >= __end_of_fixed_addresses) {
                BUG();
                return;
        }
        set_pte_vaddr(address, pte);
        fixmaps_set++;
}

void native_set_fixmap(enum fixed_addresses idx, phys_addr_t phys,
                       pgprot_t flags)
{
        /* Sanitize 'prot' against any unsupported bits: */
        pgprot_val(flags) &= __default_kernel_pte_mask;

        __native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags));
}

#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
#ifdef CONFIG_X86_5LEVEL
/**
 * p4d_set_huge - setup kernel P4D mapping
 *
 * No 512GB pages yet -- always return 0
 */
int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
{
        return 0;
}

/**
 * p4d_clear_huge - clear kernel P4D mapping when it is set
 *
 * No 512GB pages yet -- always return 0
 */
int p4d_clear_huge(p4d_t *p4d)
{
        return 0;
}
#endif

/**
 * pud_set_huge - setup kernel PUD mapping
 *
 * MTRRs can override PAT memory types with 4KiB granularity. Therefore, this
 * function sets up a huge page only if any of the following conditions are met:
 *
 * - MTRRs are disabled, or
 *
 * - MTRRs are enabled and the range is completely covered by a single MTRR, or
 *
 * - MTRRs are enabled and the corresponding MTRR memory type is WB, which
 *   has no effect on the requested PAT memory type.
 *
 * Callers should try to decrease page size (1GB -> 2MB -> 4K) if the bigger
 * page mapping attempt fails.
 *
 * Returns 1 on success and 0 on failure.
 */
int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
{
        u8 mtrr, uniform;

        mtrr = mtrr_type_lookup(addr, addr + PUD_SIZE, &uniform);
        if ((mtrr != MTRR_TYPE_INVALID) && (!uniform) &&
            (mtrr != MTRR_TYPE_WRBACK))
                return 0;

        /* Bail out if we are on a populated non-leaf entry: */
        if (pud_present(*pud) && !pud_huge(*pud))
                return 0;

        prot = pgprot_4k_2_large(prot);

        set_pte((pte_t *)pud, pfn_pte(
                (u64)addr >> PAGE_SHIFT,
                __pgprot(pgprot_val(prot) | _PAGE_PSE)));

        return 1;
}

/**
 * pmd_set_huge - setup kernel PMD mapping
 *
 * See text over pud_set_huge() above.
 *
 * Returns 1 on success and 0 on failure.
 */
int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
{
        u8 mtrr, uniform;

        mtrr = mtrr_type_lookup(addr, addr + PMD_SIZE, &uniform);
        if ((mtrr != MTRR_TYPE_INVALID) && (!uniform) &&
            (mtrr != MTRR_TYPE_WRBACK)) {
                pr_warn_once("%s: Cannot satisfy [mem %#010llx-%#010llx] with a huge-page mapping due to MTRR override.\n",
                             __func__, addr, addr + PMD_SIZE);
                return 0;
        }

        /* Bail out if we are on a populated non-leaf entry: */
        if (pmd_present(*pmd) && !pmd_huge(*pmd))
                return 0;

        prot = pgprot_4k_2_large(prot);

        set_pte((pte_t *)pmd, pfn_pte(
                (u64)addr >> PAGE_SHIFT,
                __pgprot(pgprot_val(prot) | _PAGE_PSE)));

        return 1;
}

/**
 * pud_clear_huge - clear kernel PUD mapping when it is set
 *
 * Returns 1 on success and 0 on failure (no PUD map is found).
 */
int pud_clear_huge(pud_t *pud)
{
        if (pud_large(*pud)) {
                pud_clear(pud);
                return 1;
        }

        return 0;
}

/**
 * pmd_clear_huge - clear kernel PMD mapping when it is set
 *
 * Returns 1 on success and 0 on failure (no PMD map is found).
 */
int pmd_clear_huge(pmd_t *pmd)
{
        if (pmd_large(*pmd)) {
                pmd_clear(pmd);
                return 1;
        }

        return 0;
}

#ifdef CONFIG_X86_64
/**
 * pud_free_pmd_page - Clear pud entry and free pmd page.
 * @pud: Pointer to a PUD.
 * @addr: Virtual address associated with pud.
 *
 * Context: The pud range has been unmapped and TLB purged.
 * Return: 1 if clearing the entry succeeded. 0 otherwise.
 *
 * NOTE: Callers must allow a single page allocation.
 */
int pud_free_pmd_page(pud_t *pud, unsigned long addr)
{
        pmd_t *pmd, *pmd_sv;
        pte_t *pte;
        int i;

        if (pud_none(*pud))
                return 1;

        pmd = (pmd_t *)pud_page_vaddr(*pud);
        pmd_sv = (pmd_t *)__get_free_page(GFP_KERNEL);
        if (!pmd_sv)
                return 0;

        for (i = 0; i < PTRS_PER_PMD; i++) {
                pmd_sv[i] = pmd[i];
                if (!pmd_none(pmd[i]))
                        pmd_clear(&pmd[i]);
        }

        pud_clear(pud);

        /* INVLPG to clear all paging-structure caches */
        flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);

        for (i = 0; i < PTRS_PER_PMD; i++) {
                if (!pmd_none(pmd_sv[i])) {
                        pte = (pte_t *)pmd_page_vaddr(pmd_sv[i]);
                        free_page((unsigned long)pte);
                }
        }

        free_page((unsigned long)pmd_sv);

        pgtable_pmd_page_dtor(virt_to_page(pmd));
        free_page((unsigned long)pmd);

        return 1;
}

/**
 * pmd_free_pte_page - Clear pmd entry and free pte page.
 * @pmd: Pointer to a PMD.
 * @addr: Virtual address associated with pmd.
 *
 * Context: The pmd range has been unmapped and TLB purged.
 * Return: 1 if clearing the entry succeeded. 0 otherwise.
 */
int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
{
        pte_t *pte;

        if (pmd_none(*pmd))
                return 1;

        pte = (pte_t *)pmd_page_vaddr(*pmd);
        pmd_clear(pmd);

        /* INVLPG to clear all paging-structure caches */
        flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);

        free_page((unsigned long)pte);

        return 1;
}

#else /* !CONFIG_X86_64 */

int pud_free_pmd_page(pud_t *pud, unsigned long addr)
{
        return pud_none(*pud);
}

/*
 * Disable free page handling on x86-PAE. This assures that ioremap()
 * does not update sync'd pmd entries. See vmalloc_sync_one().
 */
int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
{
        return pmd_none(*pmd);
}

#endif /* CONFIG_X86_64 */
#endif  /* CONFIG_HAVE_ARCH_HUGE_VMAP */