linux/arch/x86/mm/pgtable.c
   1// SPDX-License-Identifier: GPL-2.0
   2#include <linux/mm.h>
   3#include <linux/gfp.h>
   4#include <linux/hugetlb.h>
   5#include <asm/pgalloc.h>
   6#include <asm/pgtable.h>
   7#include <asm/tlb.h>
   8#include <asm/fixmap.h>
   9#include <asm/mtrr.h>
  10
  11#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
  12phys_addr_t physical_mask __ro_after_init = (1ULL << __PHYSICAL_MASK_SHIFT) - 1;
  13EXPORT_SYMBOL(physical_mask);
  14#endif
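
/*
 * Example (sketch): physical_mask is dynamic so that boot code can shrink
 * the usable physical-address range, e.g. when a feature such as memory
 * encryption repurposes high address bits for key IDs.  The helper name
 * and the stolen_bits parameter are illustrative; the sketch assumes
 * nothing beyond the mask and __PHYSICAL_MASK_SHIFT used above.
 */
#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
static void __init __maybe_unused example_shrink_physical_mask(int stolen_bits)
{
        /* Keep only the low (__PHYSICAL_MASK_SHIFT - stolen_bits) bits. */
        physical_mask &= (1ULL << (__PHYSICAL_MASK_SHIFT - stolen_bits)) - 1;
}
#endif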
  15
  16#ifdef CONFIG_HIGHPTE
  17#define PGTABLE_HIGHMEM __GFP_HIGHMEM
  18#else
  19#define PGTABLE_HIGHMEM 0
  20#endif
  21
  22gfp_t __userpte_alloc_gfp = GFP_PGTABLE_USER | PGTABLE_HIGHMEM;
  23
  24pgtable_t pte_alloc_one(struct mm_struct *mm)
  25{
  26        return __pte_alloc_one(mm, __userpte_alloc_gfp);
  27}
  28
  29static int __init setup_userpte(char *arg)
  30{
  31        if (!arg)
  32                return -EINVAL;
  33
  34        /*
  35         * "userpte=nohigh" disables allocation of user pagetables in
  36         * high memory.
  37         */
  38        if (strcmp(arg, "nohigh") == 0)
  39                __userpte_alloc_gfp &= ~__GFP_HIGHMEM;
  40        else
  41                return -EINVAL;
  42        return 0;
  43}
  44early_param("userpte", setup_userpte);
  45
  46void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
  47{
  48        pgtable_pte_page_dtor(pte);
  49        paravirt_release_pte(page_to_pfn(pte));
  50        paravirt_tlb_remove_table(tlb, pte);
  51}
  52
  53#if CONFIG_PGTABLE_LEVELS > 2
  54void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
  55{
  56        struct page *page = virt_to_page(pmd);
  57        paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
  58        /*
  59         * NOTE! For PAE, any changes to the top page-directory-pointer-table
  60         * entries need a full cr3 reload to flush.
  61         */
  62#ifdef CONFIG_X86_PAE
  63        tlb->need_flush_all = 1;
  64#endif
  65        pgtable_pmd_page_dtor(page);
  66        paravirt_tlb_remove_table(tlb, page);
  67}
  68
  69#if CONFIG_PGTABLE_LEVELS > 3
  70void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
  71{
  72        paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
  73        paravirt_tlb_remove_table(tlb, virt_to_page(pud));
  74}
  75
  76#if CONFIG_PGTABLE_LEVELS > 4
  77void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
  78{
  79        paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT);
  80        paravirt_tlb_remove_table(tlb, virt_to_page(p4d));
  81}
  82#endif  /* CONFIG_PGTABLE_LEVELS > 4 */
  83#endif  /* CONFIG_PGTABLE_LEVELS > 3 */
  84#endif  /* CONFIG_PGTABLE_LEVELS > 2 */
  85
  86static inline void pgd_list_add(pgd_t *pgd)
  87{
  88        struct page *page = virt_to_page(pgd);
  89
  90        list_add(&page->lru, &pgd_list);
  91}
  92
  93static inline void pgd_list_del(pgd_t *pgd)
  94{
  95        struct page *page = virt_to_page(pgd);
  96
  97        list_del(&page->lru);
  98}
  99
 100#define UNSHARED_PTRS_PER_PGD                           \
 101        (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
 102#define MAX_UNSHARED_PTRS_PER_PGD                       \
 103        max_t(size_t, KERNEL_PGD_BOUNDARY, PTRS_PER_PGD)
 104
 105
 106static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
 107{
 108        virt_to_page(pgd)->pt_mm = mm;
 109}
 110
 111struct mm_struct *pgd_page_get_mm(struct page *page)
 112{
 113        return page->pt_mm;
 114}
 115
 116static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
 117{
 118        /* If the pgd points to a shared pagetable level (either the
 119           ptes in non-PAE, or shared PMD in PAE), then just copy the
 120           references from swapper_pg_dir. */
 121        if (CONFIG_PGTABLE_LEVELS == 2 ||
 122            (CONFIG_PGTABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
 123            CONFIG_PGTABLE_LEVELS >= 4) {
 124                clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
 125                                swapper_pg_dir + KERNEL_PGD_BOUNDARY,
 126                                KERNEL_PGD_PTRS);
 127        }
 128
 129        /* list required to sync kernel mapping updates */
 130        if (!SHARED_KERNEL_PMD) {
 131                pgd_set_mm(pgd, mm);
 132                pgd_list_add(pgd);
 133        }
 134}
 135
 136static void pgd_dtor(pgd_t *pgd)
 137{
 138        if (SHARED_KERNEL_PMD)
 139                return;
 140
 141        spin_lock(&pgd_lock);
 142        pgd_list_del(pgd);
 143        spin_unlock(&pgd_lock);
 144}
 145
 146/*
 147 * List of all pgd's needed for non-PAE so it can invalidate entries
 148 * in both cached and uncached pgd's; not needed for PAE since the
 149 * kernel pmd is shared. If PAE were not to share the pmd, a similar
 150 * tactic would be needed. This is essentially codepath-based locking
 151 * against pageattr.c; it is the unique case in which a valid change
 152 * of kernel pagetables can't be lazily synchronized by vmalloc faults.
 153 * vmalloc faults work because attached pagetables are never freed.
 154 * -- nyc
 155 */
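
/*
 * Example (sketch): the kind of walk the list above enables.  Code that
 * changes a kernel page-table entry can take pgd_lock, visit every pgd on
 * the list, and propagate the change -- shown here as a brute-force
 * re-copy of the kernel entries from the reference tables.  The helper
 * name is illustrative; the real walkers live in the pageattr and
 * page-table sync code.
 */
static void __maybe_unused example_resync_kernel_mappings(void)
{
        struct page *page;

        spin_lock(&pgd_lock);
        list_for_each_entry(page, &pgd_list, lru) {
                pgd_t *pgd = (pgd_t *)page_address(page);

                clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
                                swapper_pg_dir + KERNEL_PGD_BOUNDARY,
                                KERNEL_PGD_PTRS);
        }
        spin_unlock(&pgd_lock);
}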
 156
 157#ifdef CONFIG_X86_PAE
 158/*
 159 * In PAE mode, we need to do a cr3 reload (=tlb flush) when
 160 * updating the top-level pagetable entries to guarantee the
 161 * processor notices the update.  Since this is expensive, and
 162 * all 4 top-level entries are used almost immediately in a
 163 * new process's life, we just pre-populate them here.
 164 *
 165 * Also, if we're in a paravirt environment where the kernel pmd is
 166 * not shared between pagetables (!SHARED_KERNEL_PMD), we allocate
 167 * and initialize the kernel pmds here.
 168 */
 169#define PREALLOCATED_PMDS       UNSHARED_PTRS_PER_PGD
 170#define MAX_PREALLOCATED_PMDS   MAX_UNSHARED_PTRS_PER_PGD
 171
 172/*
 173 * We allocate separate PMDs for the kernel part of the user page-table
 174 * when PTI is enabled. We need them to map the per-process LDT into the
 175 * user-space page-table.
 176 */
 177#define PREALLOCATED_USER_PMDS   (boot_cpu_has(X86_FEATURE_PTI) ? \
 178                                        KERNEL_PGD_PTRS : 0)
 179#define MAX_PREALLOCATED_USER_PMDS KERNEL_PGD_PTRS
 180
 181void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
 182{
 183        paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
 184
 185        /* Note: almost everything apart from _PAGE_PRESENT is
 186           reserved at the pmd (PDPT) level. */
 187        set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
 188
 189        /*
 190         * According to Intel App note "TLBs, Paging-Structure Caches,
 191         * and Their Invalidation", April 2007, document 317080-001,
 192         * section 8.1: in PAE mode we explicitly have to flush the
 193         * TLB via cr3 if the top-level pgd is changed...
 194         */
 195        flush_tlb_mm(mm);
 196}
 197#else  /* !CONFIG_X86_PAE */
 198
 199/* No need to prepopulate any pagetable entries in non-PAE modes. */
 200#define PREALLOCATED_PMDS       0
 201#define MAX_PREALLOCATED_PMDS   0
 202#define PREALLOCATED_USER_PMDS   0
 203#define MAX_PREALLOCATED_USER_PMDS 0
 204#endif  /* CONFIG_X86_PAE */
 205
 206static void free_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
 207{
 208        int i;
 209
 210        for (i = 0; i < count; i++)
 211                if (pmds[i]) {
 212                        pgtable_pmd_page_dtor(virt_to_page(pmds[i]));
 213                        free_page((unsigned long)pmds[i]);
 214                        mm_dec_nr_pmds(mm);
 215                }
 216}
 217
 218static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
 219{
 220        int i;
 221        bool failed = false;
 222        gfp_t gfp = GFP_PGTABLE_USER;
 223
 224        if (mm == &init_mm)
 225                gfp &= ~__GFP_ACCOUNT;
 226
 227        for (i = 0; i < count; i++) {
 228                pmd_t *pmd = (pmd_t *)__get_free_page(gfp);
 229                if (!pmd)
 230                        failed = true;
 231                if (pmd && !pgtable_pmd_page_ctor(virt_to_page(pmd))) {
 232                        free_page((unsigned long)pmd);
 233                        pmd = NULL;
 234                        failed = true;
 235                }
 236                if (pmd)
 237                        mm_inc_nr_pmds(mm);
 238                pmds[i] = pmd;
 239        }
 240
 241        if (failed) {
 242                free_pmds(mm, pmds, count);
 243                return -ENOMEM;
 244        }
 245
 246        return 0;
 247}
 248
 249/*
 250 * Mop up any pmd pages which may still be attached to the pgd.
 251 * Normally they will be freed by munmap/exit_mmap, but any pmd we
 252 * preallocate which never got a corresponding vma will need to be
 253 * freed manually.
 254 */
 255static void mop_up_one_pmd(struct mm_struct *mm, pgd_t *pgdp)
 256{
 257        pgd_t pgd = *pgdp;
 258
 259        if (pgd_val(pgd) != 0) {
 260                pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
 261
 262                pgd_clear(pgdp);
 263
 264                paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
 265                pmd_free(mm, pmd);
 266                mm_dec_nr_pmds(mm);
 267        }
 268}
 269
 270static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
 271{
 272        int i;
 273
 274        for (i = 0; i < PREALLOCATED_PMDS; i++)
 275                mop_up_one_pmd(mm, &pgdp[i]);
 276
 277#ifdef CONFIG_PAGE_TABLE_ISOLATION
 278
 279        if (!boot_cpu_has(X86_FEATURE_PTI))
 280                return;
 281
 282        pgdp = kernel_to_user_pgdp(pgdp);
 283
 284        for (i = 0; i < PREALLOCATED_USER_PMDS; i++)
 285                mop_up_one_pmd(mm, &pgdp[i + KERNEL_PGD_BOUNDARY]);
 286#endif
 287}
 288
 289static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
 290{
 291        p4d_t *p4d;
 292        pud_t *pud;
 293        int i;
 294
 295        if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */
 296                return;
 297
 298        p4d = p4d_offset(pgd, 0);
 299        pud = pud_offset(p4d, 0);
 300
 301        for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) {
 302                pmd_t *pmd = pmds[i];
 303
 304                if (i >= KERNEL_PGD_BOUNDARY)
 305                        memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
 306                               sizeof(pmd_t) * PTRS_PER_PMD);
 307
 308                pud_populate(mm, pud, pmd);
 309        }
 310}
 311
 312#ifdef CONFIG_PAGE_TABLE_ISOLATION
 313static void pgd_prepopulate_user_pmd(struct mm_struct *mm,
 314                                     pgd_t *k_pgd, pmd_t *pmds[])
 315{
 316        pgd_t *s_pgd = kernel_to_user_pgdp(swapper_pg_dir);
 317        pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd);
 318        p4d_t *u_p4d;
 319        pud_t *u_pud;
 320        int i;
 321
 322        u_p4d = p4d_offset(u_pgd, 0);
 323        u_pud = pud_offset(u_p4d, 0);
 324
 325        s_pgd += KERNEL_PGD_BOUNDARY;
 326        u_pud += KERNEL_PGD_BOUNDARY;
 327
 328        for (i = 0; i < PREALLOCATED_USER_PMDS; i++, u_pud++, s_pgd++) {
 329                pmd_t *pmd = pmds[i];
 330
 331                memcpy(pmd, (pmd_t *)pgd_page_vaddr(*s_pgd),
 332                       sizeof(pmd_t) * PTRS_PER_PMD);
 333
 334                pud_populate(mm, u_pud, pmd);
 335        }
 336
 337}
 338#else
 339static void pgd_prepopulate_user_pmd(struct mm_struct *mm,
 340                                     pgd_t *k_pgd, pmd_t *pmds[])
 341{
 342}
 343#endif
 344/*
 345 * Xen paravirt assumes that the pgd table occupies a whole page, and the
 346 * 64-bit kernel makes the same assumption.
 347 *
 348 * A PAE kernel that is not running as a Xen domain, however, only needs
 349 * to allocate 32 bytes for the pgd instead of a whole page.
 350 */
 351#ifdef CONFIG_X86_PAE
 352
 353#include <linux/slab.h>
 354
 355#define PGD_SIZE        (PTRS_PER_PGD * sizeof(pgd_t))
 356#define PGD_ALIGN       32
 357
 358static struct kmem_cache *pgd_cache;
 359
 360void __init pgtable_cache_init(void)
 361{
 362        /*
 363         * When a PAE kernel runs as a Xen domain, it does not use a
 364         * shared kernel pmd, and that requires a whole page for the pgd.
 365         */
 366        if (!SHARED_KERNEL_PMD)
 367                return;
 368
 369        /*
 370         * When a PAE kernel is not running as a Xen domain, it uses a
 371         * shared kernel pmd, which does not require a whole page for the
 372         * pgd: 32 bytes are enough.  Create a 32-byte slab at boot time
 373         * for pgd table allocations.
 374         */
 375        pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_ALIGN,
 376                                      SLAB_PANIC, NULL);
 377}
 378
 379static inline pgd_t *_pgd_alloc(void)
 380{
 381        /*
 382         * Without SHARED_KERNEL_PMD the PAE kernel is running as a Xen
 383         * domain and we allocate a whole page for the pgd.
 384         */
 385        if (!SHARED_KERNEL_PMD)
 386                return (pgd_t *)__get_free_pages(GFP_PGTABLE_USER,
 387                                                 PGD_ALLOCATION_ORDER);
 388
 389        /*
 390         * Otherwise the PAE kernel is not running as a Xen domain and we
 391         * can allocate the pgd from the 32-byte slab to save memory.
 392         */
 393        return kmem_cache_alloc(pgd_cache, GFP_PGTABLE_USER);
 394}
 395
 396static inline void _pgd_free(pgd_t *pgd)
 397{
 398        if (!SHARED_KERNEL_PMD)
 399                free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
 400        else
 401                kmem_cache_free(pgd_cache, pgd);
 402}
 403#else
 404
 405static inline pgd_t *_pgd_alloc(void)
 406{
 407        return (pgd_t *)__get_free_pages(GFP_PGTABLE_USER,
 408                                         PGD_ALLOCATION_ORDER);
 409}
 410
 411static inline void _pgd_free(pgd_t *pgd)
 412{
 413        free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
 414}
 415#endif /* CONFIG_X86_PAE */
 416
 417pgd_t *pgd_alloc(struct mm_struct *mm)
 418{
 419        pgd_t *pgd;
 420        pmd_t *u_pmds[MAX_PREALLOCATED_USER_PMDS];
 421        pmd_t *pmds[MAX_PREALLOCATED_PMDS];
 422
 423        pgd = _pgd_alloc();
 424
 425        if (pgd == NULL)
 426                goto out;
 427
 428        mm->pgd = pgd;
 429
 430        if (preallocate_pmds(mm, pmds, PREALLOCATED_PMDS) != 0)
 431                goto out_free_pgd;
 432
 433        if (preallocate_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS) != 0)
 434                goto out_free_pmds;
 435
 436        if (paravirt_pgd_alloc(mm) != 0)
 437                goto out_free_user_pmds;
 438
 439        /*
 440         * Make sure that pre-populating the pmds is atomic with
 441         * respect to anything walking the pgd_list, so that they
 442         * never see a partially populated pgd.
 443         */
 444        spin_lock(&pgd_lock);
 445
 446        pgd_ctor(mm, pgd);
 447        pgd_prepopulate_pmd(mm, pgd, pmds);
 448        pgd_prepopulate_user_pmd(mm, pgd, u_pmds);
 449
 450        spin_unlock(&pgd_lock);
 451
 452        return pgd;
 453
 454out_free_user_pmds:
 455        free_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS);
 456out_free_pmds:
 457        free_pmds(mm, pmds, PREALLOCATED_PMDS);
 458out_free_pgd:
 459        _pgd_free(pgd);
 460out:
 461        return NULL;
 462}
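
/*
 * Example (sketch): how a new mm's page-table root is typically set up and
 * torn down, modeled loosely on the mm_alloc_pgd()/mm_free_pgd() helpers in
 * kernel/fork.c.  The example_ names are illustrative.
 */
static int __maybe_unused example_mm_alloc_pgd(struct mm_struct *mm)
{
        mm->pgd = pgd_alloc(mm);
        if (unlikely(!mm->pgd))
                return -ENOMEM;
        return 0;
}

static void __maybe_unused example_mm_free_pgd(struct mm_struct *mm)
{
        pgd_free(mm, mm->pgd);
}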
 463
 464void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 465{
 466        pgd_mop_up_pmds(mm, pgd);
 467        pgd_dtor(pgd);
 468        paravirt_pgd_free(mm, pgd);
 469        _pgd_free(pgd);
 470}
 471
 472/*
 473 * Used to set accessed or dirty bits in the page table entries
 474 * on other architectures. On x86, the accessed and dirty bits
 475 * are tracked by hardware. However, do_wp_page calls this function
 476 * to also make the pte writeable at the same time the dirty bit is
 477 * set. In that case we do actually need to write the PTE.
 478 */
 479int ptep_set_access_flags(struct vm_area_struct *vma,
 480                          unsigned long address, pte_t *ptep,
 481                          pte_t entry, int dirty)
 482{
 483        int changed = !pte_same(*ptep, entry);
 484
 485        if (changed && dirty)
 486                set_pte(ptep, entry);
 487
 488        return changed;
 489}
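
/*
 * Example (sketch): the do_wp_page pattern described above, simplified --
 * build a young, dirty (and, where the VMA allows it, writable) entry and
 * only write the PTE / update the MMU cache when it actually changed.
 * Illustrative only; the real caller lives in mm/memory.c.
 */
static void __maybe_unused example_reuse_pte(struct vm_area_struct *vma,
                                             unsigned long address, pte_t *ptep)
{
        pte_t entry = pte_mkyoung(*ptep);

        entry = maybe_mkwrite(pte_mkdirty(entry), vma);
        if (ptep_set_access_flags(vma, address, ptep, entry, 1))
                update_mmu_cache(vma, address, ptep);
}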
 490
 491#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 492int pmdp_set_access_flags(struct vm_area_struct *vma,
 493                          unsigned long address, pmd_t *pmdp,
 494                          pmd_t entry, int dirty)
 495{
 496        int changed = !pmd_same(*pmdp, entry);
 497
 498        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 499
 500        if (changed && dirty) {
 501                set_pmd(pmdp, entry);
 502                /*
 503                 * We had a write-protection fault here and changed the pmd
 504                 * to be more permissive. No need to flush the TLB for that,
 505                 * #PF is architecturally guaranteed to do that and in the
 506                 * worst-case we'll generate a spurious fault.
 507                 */
 508        }
 509
 510        return changed;
 511}
 512
 513int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
 514                          pud_t *pudp, pud_t entry, int dirty)
 515{
 516        int changed = !pud_same(*pudp, entry);
 517
 518        VM_BUG_ON(address & ~HPAGE_PUD_MASK);
 519
 520        if (changed && dirty) {
 521                set_pud(pudp, entry);
 522                /*
 523                 * We had a write-protection fault here and changed the pud
 524                 * to be more permissive. No need to flush the TLB for that,
 525                 * #PF is architecturally guaranteed to do that and in the
 526                 * worst-case we'll generate a spurious fault.
 527                 */
 528        }
 529
 530        return changed;
 531}
 532#endif
 533
 534int ptep_test_and_clear_young(struct vm_area_struct *vma,
 535                              unsigned long addr, pte_t *ptep)
 536{
 537        int ret = 0;
 538
 539        if (pte_young(*ptep))
 540                ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
 541                                         (unsigned long *) &ptep->pte);
 542
 543        return ret;
 544}
 545
 546#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 547int pmdp_test_and_clear_young(struct vm_area_struct *vma,
 548                              unsigned long addr, pmd_t *pmdp)
 549{
 550        int ret = 0;
 551
 552        if (pmd_young(*pmdp))
 553                ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
 554                                         (unsigned long *)pmdp);
 555
 556        return ret;
 557}
 558int pudp_test_and_clear_young(struct vm_area_struct *vma,
 559                              unsigned long addr, pud_t *pudp)
 560{
 561        int ret = 0;
 562
 563        if (pud_young(*pudp))
 564                ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
 565                                         (unsigned long *)pudp);
 566
 567        return ret;
 568}
 569#endif
 570
 571int ptep_clear_flush_young(struct vm_area_struct *vma,
 572                           unsigned long address, pte_t *ptep)
 573{
 574        /*
 575         * On x86 CPUs, clearing the accessed bit without a TLB flush
 576         * doesn't cause data corruption. [ It could cause incorrect
 577         * page aging and the (mistaken) reclaim of hot pages, but the
 578         * chance of that should be relatively low. ]
 579         *
 580         * So as a performance optimization don't flush the TLB when
 581         * clearing the accessed bit, it will eventually be flushed by
 582         * a context switch or a VM operation anyway. [ In the rare
 583         * event of it not getting flushed for a long time the delay
 584         * shouldn't really matter because there's no real memory
 585         * pressure for swapout to react to. ]
 586         */
 587        return ptep_test_and_clear_young(vma, address, ptep);
 588}
 589
 590#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 591int pmdp_clear_flush_young(struct vm_area_struct *vma,
 592                           unsigned long address, pmd_t *pmdp)
 593{
 594        int young;
 595
 596        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 597
 598        young = pmdp_test_and_clear_young(vma, address, pmdp);
 599        if (young)
 600                flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
 601
 602        return young;
 603}
 604#endif
 605
 606/**
 607 * reserve_top_address - reserves a hole in the top of kernel address space
 608 * @reserve: size of hole to reserve
 609 *
 610 * Can be used to relocate the fixmap area and poke a hole in the top
 611 * of kernel address space to make room for a hypervisor.
 612 */
 613void __init reserve_top_address(unsigned long reserve)
 614{
 615#ifdef CONFIG_X86_32
 616        BUG_ON(fixmaps_set > 0);
 617        __FIXADDR_TOP = round_down(-reserve, 1 << PMD_SHIFT) - PAGE_SIZE;
 618        printk(KERN_INFO "Reserving virtual address space above 0x%08lx (rounded to 0x%08lx)\n",
 619               -reserve, __FIXADDR_TOP + PAGE_SIZE);
 620#endif
 621}
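
/*
 * Example (sketch, hypothetical caller): a 32-bit paravirt platform that
 * wants a hole just below the fixmap for its hypervisor could reserve it
 * from early platform setup, before the first fixmap is installed.  The
 * 16MB size is purely illustrative.
 */
static void __init __maybe_unused example_reserve_hypervisor_hole(void)
{
        reserve_top_address(16UL << 20);
}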
 622
 623int fixmaps_set;
 624
 625void __native_set_fixmap(enum fixed_addresses idx, pte_t pte)
 626{
 627        unsigned long address = __fix_to_virt(idx);
 628
 629#ifdef CONFIG_X86_64
 630        /*
 631         * Ensure that the static initial page tables are covering the
 632         * fixmap completely.
 633         */
 634        BUILD_BUG_ON(__end_of_permanent_fixed_addresses >
 635                     (FIXMAP_PMD_NUM * PTRS_PER_PTE));
 636#endif
 637
 638        if (idx >= __end_of_fixed_addresses) {
 639                BUG();
 640                return;
 641        }
 642        set_pte_vaddr(address, pte);
 643        fixmaps_set++;
 644}
 645
 646void native_set_fixmap(enum fixed_addresses idx, phys_addr_t phys,
 647                       pgprot_t flags)
 648{
 649        /* Sanitize 'prot' against any unsupported bits: */
 650        pgprot_val(flags) &= __default_kernel_pte_mask;
 651
 652        __native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags));
 653}
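
/*
 * Example (sketch): typical use of the fixmap API, which on bare metal
 * ends up in native_set_fixmap() above; clear_fixmap(idx) undoes the
 * mapping.  The slot and physical address are whatever the caller owns;
 * the helper name is illustrative.
 */
static __maybe_unused void __iomem *example_map_fixmap_slot(enum fixed_addresses idx,
                                                            phys_addr_t phys)
{
        set_fixmap(idx, phys);

        /* The slot sits at a fixed, architecture-defined virtual address. */
        return (void __iomem *)__fix_to_virt(idx);
}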
 654
 655#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
 656#ifdef CONFIG_X86_5LEVEL
 657/**
 658 * p4d_set_huge - setup kernel P4D mapping
 659 *
 660 * No 512GB pages yet -- always return 0
 661 */
 662int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
 663{
 664        return 0;
 665}
 666
 667/**
 668 * p4d_clear_huge - clear kernel P4D mapping when it is set
 669 *
 670 * No 512GB pages yet -- always return 0
 671 */
 672int p4d_clear_huge(p4d_t *p4d)
 673{
 674        return 0;
 675}
 676#endif
 677
 678/**
 679 * pud_set_huge - setup kernel PUD mapping
 680 *
 681 * MTRRs can override PAT memory types with 4KiB granularity. Therefore, this
 682 * function sets up a huge page only if any of the following conditions are met:
 683 *
 684 * - MTRRs are disabled, or
 685 *
 686 * - MTRRs are enabled and the range is completely covered by a single MTRR, or
 687 *
 688 * - MTRRs are enabled and the corresponding MTRR memory type is WB, which
 689 *   has no effect on the requested PAT memory type.
 690 *
 691 * Callers should try to decrease page size (1GB -> 2MB -> 4K) if the bigger
 692 * page mapping attempt fails.
 693 *
 694 * Returns 1 on success and 0 on failure.
 695 */
 696int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
 697{
 698        u8 mtrr, uniform;
 699
 700        mtrr = mtrr_type_lookup(addr, addr + PUD_SIZE, &uniform);
 701        if ((mtrr != MTRR_TYPE_INVALID) && (!uniform) &&
 702            (mtrr != MTRR_TYPE_WRBACK))
 703                return 0;
 704
 705        /* Bail out if we are on a populated non-leaf entry: */
 706        if (pud_present(*pud) && !pud_huge(*pud))
 707                return 0;
 708
 709        prot = pgprot_4k_2_large(prot);
 710
 711        set_pte((pte_t *)pud, pfn_pte(
 712                (u64)addr >> PAGE_SHIFT,
 713                __pgprot(pgprot_val(prot) | _PAGE_PSE)));
 714
 715        return 1;
 716}
 717
 718/**
 719 * pmd_set_huge - setup kernel PMD mapping
 720 *
 721 * See text over pud_set_huge() above.
 722 *
 723 * Returns 1 on success and 0 on failure.
 724 */
 725int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
 726{
 727        u8 mtrr, uniform;
 728
 729        mtrr = mtrr_type_lookup(addr, addr + PMD_SIZE, &uniform);
 730        if ((mtrr != MTRR_TYPE_INVALID) && (!uniform) &&
 731            (mtrr != MTRR_TYPE_WRBACK)) {
 732                pr_warn_once("%s: Cannot satisfy [mem %#010llx-%#010llx] with a huge-page mapping due to MTRR override.\n",
 733                             __func__, addr, addr + PMD_SIZE);
 734                return 0;
 735        }
 736
 737        /* Bail out if we are on a populated non-leaf entry: */
 738        if (pmd_present(*pmd) && !pmd_huge(*pmd))
 739                return 0;
 740
 741        prot = pgprot_4k_2_large(prot);
 742
 743        set_pte((pte_t *)pmd, pfn_pte(
 744                (u64)addr >> PAGE_SHIFT,
 745                __pgprot(pgprot_val(prot) | _PAGE_PSE)));
 746
 747        return 1;
 748}
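
/*
 * Example (sketch): the fallback pattern the comments above describe --
 * try the largest page size first and step down on failure.  A simplified
 * illustration of what the ioremap/vmap callers do, not the real code;
 * the helper name and parameters are illustrative.
 */
static bool __maybe_unused example_map_with_largest_page(pud_t *pud, pmd_t *pmd,
                                                         pte_t *pte,
                                                         phys_addr_t phys,
                                                         pgprot_t prot)
{
        if (IS_ALIGNED(phys, PUD_SIZE) && pud_set_huge(pud, phys, prot))
                return true;                    /* mapped with a 1GB page */
        if (IS_ALIGNED(phys, PMD_SIZE) && pmd_set_huge(pmd, phys, prot))
                return true;                    /* mapped with a 2MB page */
        set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, prot));        /* 4K */
        return true;
}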
 749
 750/**
 751 * pud_clear_huge - clear kernel PUD mapping when it is set
 752 *
 753 * Returns 1 on success and 0 on failure (no PUD map is found).
 754 */
 755int pud_clear_huge(pud_t *pud)
 756{
 757        if (pud_large(*pud)) {
 758                pud_clear(pud);
 759                return 1;
 760        }
 761
 762        return 0;
 763}
 764
 765/**
 766 * pmd_clear_huge - clear kernel PMD mapping when it is set
 767 *
 768 * Returns 1 on success and 0 on failure (no PMD map is found).
 769 */
 770int pmd_clear_huge(pmd_t *pmd)
 771{
 772        if (pmd_large(*pmd)) {
 773                pmd_clear(pmd);
 774                return 1;
 775        }
 776
 777        return 0;
 778}
 779
 780/*
 781 * Until we support 512GB pages, skip them in the vmap area.
 782 */
 783int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
 784{
 785        return 0;
 786}
 787
 788#ifdef CONFIG_X86_64
 789/**
 790 * pud_free_pmd_page - Clear pud entry and free pmd page.
 791 * @pud: Pointer to a PUD.
 792 * @addr: Virtual address associated with pud.
 793 *
 794 * Context: The pud range has been unmapped and TLB purged.
 795 * Return: 1 if clearing the entry succeeded. 0 otherwise.
 796 *
 797 * NOTE: Callers must allow for a single (GFP_KERNEL) page allocation.
 798 */
 799int pud_free_pmd_page(pud_t *pud, unsigned long addr)
 800{
 801        pmd_t *pmd, *pmd_sv;
 802        pte_t *pte;
 803        int i;
 804
 805        pmd = (pmd_t *)pud_page_vaddr(*pud);
 806        pmd_sv = (pmd_t *)__get_free_page(GFP_KERNEL);
 807        if (!pmd_sv)
 808                return 0;
 809
 810        for (i = 0; i < PTRS_PER_PMD; i++) {
 811                pmd_sv[i] = pmd[i];
 812                if (!pmd_none(pmd[i]))
 813                        pmd_clear(&pmd[i]);
 814        }
 815
 816        pud_clear(pud);
 817
 818        /* INVLPG to clear all paging-structure caches */
 819        flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);
 820
 821        for (i = 0; i < PTRS_PER_PMD; i++) {
 822                if (!pmd_none(pmd_sv[i])) {
 823                        pte = (pte_t *)pmd_page_vaddr(pmd_sv[i]);
 824                        free_page((unsigned long)pte);
 825                }
 826        }
 827
 828        free_page((unsigned long)pmd_sv);
 829        free_page((unsigned long)pmd);
 830
 831        return 1;
 832}
 833
 834/**
 835 * pmd_free_pte_page - Clear pmd entry and free pte page.
 836 * @pmd: Pointer to a PMD.
 837 * @addr: Virtual address associated with pmd.
 838 *
 839 * Context: The pmd range has been unmapped and TLB purged.
 840 * Return: 1 if clearing the entry succeeded. 0 otherwise.
 841 */
 842int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
 843{
 844        pte_t *pte;
 845
 846        pte = (pte_t *)pmd_page_vaddr(*pmd);
 847        pmd_clear(pmd);
 848
 849        /* INVLPG to clear all paging-structure caches */
 850        flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);
 851
 852        free_page((unsigned long)pte);
 853
 854        return 1;
 855}
 856
 857#else /* !CONFIG_X86_64 */
 858
 859int pud_free_pmd_page(pud_t *pud, unsigned long addr)
 860{
 861        return pud_none(*pud);
 862}
 863
 864/*
 865 * Disable free page handling on x86-PAE. This ensures that ioremap()
 866 * does not update sync'd pmd entries. See vmalloc_sync_one().
 867 */
 868int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
 869{
 870        return pmd_none(*pmd);
 871}
 872
 873#endif /* CONFIG_X86_64 */
 874#endif  /* CONFIG_HAVE_ARCH_HUGE_VMAP */
 875