linux/arch/x86/mm/pgtable.c
// SPDX-License-Identifier: GPL-2.0
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/hugetlb.h>
#include <asm/pgalloc.h>
#include <asm/pgtable.h>
#include <asm/tlb.h>
#include <asm/fixmap.h>
#include <asm/mtrr.h>

#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
phys_addr_t physical_mask __ro_after_init = (1ULL << __PHYSICAL_MASK_SHIFT) - 1;
EXPORT_SYMBOL(physical_mask);
#endif

#ifdef CONFIG_HIGHPTE
#define PGTABLE_HIGHMEM __GFP_HIGHMEM
#else
#define PGTABLE_HIGHMEM 0
#endif

gfp_t __userpte_alloc_gfp = GFP_PGTABLE_USER | PGTABLE_HIGHMEM;

pgtable_t pte_alloc_one(struct mm_struct *mm)
{
        return __pte_alloc_one(mm, __userpte_alloc_gfp);
}

static int __init setup_userpte(char *arg)
{
        if (!arg)
                return -EINVAL;

        /*
         * "userpte=nohigh" disables allocation of user pagetables in
         * high memory.
         */
        if (strcmp(arg, "nohigh") == 0)
                __userpte_alloc_gfp &= ~__GFP_HIGHMEM;
        else
                return -EINVAL;
        return 0;
}
early_param("userpte", setup_userpte);
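/*
 * Illustrative note (not part of the upstream file): on a 32-bit
 * CONFIG_HIGHPTE kernel, user page tables come from highmem by default;
 * booting with a command line along the lines of
 *
 *     vmlinuz ... userpte=nohigh
 *
 * keeps them in lowmem instead. Any other value for "userpte" is
 * rejected with -EINVAL by setup_userpte() above.
 */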

void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
{
        pgtable_page_dtor(pte);
        paravirt_release_pte(page_to_pfn(pte));
        paravirt_tlb_remove_table(tlb, pte);
}

#if CONFIG_PGTABLE_LEVELS > 2
void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
{
        struct page *page = virt_to_page(pmd);
        paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
        /*
         * NOTE! For PAE, any changes to the top page-directory-pointer-table
         * entries need a full cr3 reload to flush.
         */
#ifdef CONFIG_X86_PAE
        tlb->need_flush_all = 1;
#endif
        pgtable_pmd_page_dtor(page);
        paravirt_tlb_remove_table(tlb, page);
}

#if CONFIG_PGTABLE_LEVELS > 3
void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
{
        paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
        paravirt_tlb_remove_table(tlb, virt_to_page(pud));
}

#if CONFIG_PGTABLE_LEVELS > 4
void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
{
        paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT);
        paravirt_tlb_remove_table(tlb, virt_to_page(p4d));
}
#endif  /* CONFIG_PGTABLE_LEVELS > 4 */
#endif  /* CONFIG_PGTABLE_LEVELS > 3 */
#endif  /* CONFIG_PGTABLE_LEVELS > 2 */

static inline void pgd_list_add(pgd_t *pgd)
{
        struct page *page = virt_to_page(pgd);

        list_add(&page->lru, &pgd_list);
}

static inline void pgd_list_del(pgd_t *pgd)
{
        struct page *page = virt_to_page(pgd);

        list_del(&page->lru);
}

#define UNSHARED_PTRS_PER_PGD                           \
        (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
#define MAX_UNSHARED_PTRS_PER_PGD                       \
        max_t(size_t, KERNEL_PGD_BOUNDARY, PTRS_PER_PGD)


static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
{
        virt_to_page(pgd)->pt_mm = mm;
}

struct mm_struct *pgd_page_get_mm(struct page *page)
{
        return page->pt_mm;
}

static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
{
        /* If the pgd points to a shared pagetable level (either the
           ptes in non-PAE, or shared PMD in PAE), then just copy the
           references from swapper_pg_dir. */
        if (CONFIG_PGTABLE_LEVELS == 2 ||
            (CONFIG_PGTABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
            CONFIG_PGTABLE_LEVELS >= 4) {
                clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
                                swapper_pg_dir + KERNEL_PGD_BOUNDARY,
                                KERNEL_PGD_PTRS);
        }

        /* list required to sync kernel mapping updates */
        if (!SHARED_KERNEL_PMD) {
                pgd_set_mm(pgd, mm);
                pgd_list_add(pgd);
        }
}

static void pgd_dtor(pgd_t *pgd)
{
        if (SHARED_KERNEL_PMD)
                return;

        spin_lock(&pgd_lock);
        pgd_list_del(pgd);
        spin_unlock(&pgd_lock);
}

/*
 * List of all pgd's needed for non-PAE so it can invalidate entries
 * in both cached and uncached pgd's; not needed for PAE since the
 * kernel pmd is shared. If PAE were not to share the pmd a similar
 * tactic would be needed. This is essentially codepath-based locking
 * against pageattr.c; it is the unique case in which a valid change
 * of kernel pagetables can't be lazily synchronized by vmalloc faults.
 * vmalloc faults work because attached pagetables are never freed.
 * -- nyc
 */
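/*
 * Illustrative sketch (not part of the upstream file): code that has to
 * propagate a kernel mapping change into every non-shared pgd, such as
 * pageattr.c mentioned above, walks this list under pgd_lock roughly
 * like so:
 *
 *     struct page *page;
 *
 *     spin_lock(&pgd_lock);
 *     list_for_each_entry(page, &pgd_list, lru) {
 *             pgd_t *pgd = (pgd_t *)page_address(page);
 *             ... fix up the relevant kernel entry in this pgd ...
 *     }
 *     spin_unlock(&pgd_lock);
 *
 * The exact fix-up step varies per caller; the list/lock pattern is the
 * point being shown here.
 */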

#ifdef CONFIG_X86_PAE
/*
 * In PAE mode, we need to do a cr3 reload (=tlb flush) when
 * updating the top-level pagetable entries to guarantee the
 * processor notices the update.  Since this is expensive, and
 * all 4 top-level entries are used almost immediately in a
 * new process's life, we just pre-populate them here.
 *
 * Also, if we're in a paravirt environment where the kernel pmd is
 * not shared between pagetables (!SHARED_KERNEL_PMD), we allocate
 * and initialize the kernel pmds here.
 */
#define PREALLOCATED_PMDS       UNSHARED_PTRS_PER_PGD
#define MAX_PREALLOCATED_PMDS   MAX_UNSHARED_PTRS_PER_PGD

/*
 * We allocate separate PMDs for the kernel part of the user page-table
 * when PTI is enabled. We need them to map the per-process LDT into the
 * user-space page-table.
 */
#define PREALLOCATED_USER_PMDS   (boot_cpu_has(X86_FEATURE_PTI) ? \
                                        KERNEL_PGD_PTRS : 0)
#define MAX_PREALLOCATED_USER_PMDS KERNEL_PGD_PTRS

void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
{
        paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);

        /* Note: almost everything apart from _PAGE_PRESENT is
           reserved at the pmd (PDPT) level. */
        set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));

        /*
         * According to Intel App note "TLBs, Paging-Structure Caches,
         * and Their Invalidation", April 2007, document 317080-001,
         * section 8.1: in PAE mode we explicitly have to flush the
         * TLB via cr3 if the top-level pgd is changed...
         */
        flush_tlb_mm(mm);
}
#else  /* !CONFIG_X86_PAE */

/* No need to prepopulate any pagetable entries in non-PAE modes. */
#define PREALLOCATED_PMDS       0
#define MAX_PREALLOCATED_PMDS   0
#define PREALLOCATED_USER_PMDS   0
#define MAX_PREALLOCATED_USER_PMDS 0
#endif  /* CONFIG_X86_PAE */

static void free_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
{
        int i;

        for (i = 0; i < count; i++)
                if (pmds[i]) {
                        pgtable_pmd_page_dtor(virt_to_page(pmds[i]));
                        free_page((unsigned long)pmds[i]);
                        mm_dec_nr_pmds(mm);
                }
}

static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
{
        int i;
        bool failed = false;
        gfp_t gfp = GFP_PGTABLE_USER;

        if (mm == &init_mm)
                gfp &= ~__GFP_ACCOUNT;

        for (i = 0; i < count; i++) {
                pmd_t *pmd = (pmd_t *)__get_free_page(gfp);
                if (!pmd)
                        failed = true;
                if (pmd && !pgtable_pmd_page_ctor(virt_to_page(pmd))) {
                        free_page((unsigned long)pmd);
                        pmd = NULL;
                        failed = true;
                }
                if (pmd)
                        mm_inc_nr_pmds(mm);
                pmds[i] = pmd;
        }

        if (failed) {
                free_pmds(mm, pmds, count);
                return -ENOMEM;
        }

        return 0;
}

/*
 * Mop up any pmd pages which may still be attached to the pgd.
 * Normally they will be freed by munmap/exit_mmap, but any pmd we
 * preallocate which never got a corresponding vma will need to be
 * freed manually.
 */
static void mop_up_one_pmd(struct mm_struct *mm, pgd_t *pgdp)
{
        pgd_t pgd = *pgdp;

        if (pgd_val(pgd) != 0) {
                pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);

                pgd_clear(pgdp);

                paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
                pmd_free(mm, pmd);
                mm_dec_nr_pmds(mm);
        }
}

static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
{
        int i;

        for (i = 0; i < PREALLOCATED_PMDS; i++)
                mop_up_one_pmd(mm, &pgdp[i]);

#ifdef CONFIG_PAGE_TABLE_ISOLATION

        if (!boot_cpu_has(X86_FEATURE_PTI))
                return;

        pgdp = kernel_to_user_pgdp(pgdp);

        for (i = 0; i < PREALLOCATED_USER_PMDS; i++)
                mop_up_one_pmd(mm, &pgdp[i + KERNEL_PGD_BOUNDARY]);
#endif
}

static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
{
        p4d_t *p4d;
        pud_t *pud;
        int i;

        if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */
                return;

        p4d = p4d_offset(pgd, 0);
        pud = pud_offset(p4d, 0);

        for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) {
                pmd_t *pmd = pmds[i];

                if (i >= KERNEL_PGD_BOUNDARY)
                        memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
                               sizeof(pmd_t) * PTRS_PER_PMD);

                pud_populate(mm, pud, pmd);
        }
}

#ifdef CONFIG_PAGE_TABLE_ISOLATION
static void pgd_prepopulate_user_pmd(struct mm_struct *mm,
                                     pgd_t *k_pgd, pmd_t *pmds[])
{
        pgd_t *s_pgd = kernel_to_user_pgdp(swapper_pg_dir);
        pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd);
        p4d_t *u_p4d;
        pud_t *u_pud;
        int i;

        u_p4d = p4d_offset(u_pgd, 0);
        u_pud = pud_offset(u_p4d, 0);

        s_pgd += KERNEL_PGD_BOUNDARY;
        u_pud += KERNEL_PGD_BOUNDARY;

        for (i = 0; i < PREALLOCATED_USER_PMDS; i++, u_pud++, s_pgd++) {
                pmd_t *pmd = pmds[i];

                memcpy(pmd, (pmd_t *)pgd_page_vaddr(*s_pgd),
                       sizeof(pmd_t) * PTRS_PER_PMD);

                pud_populate(mm, u_pud, pmd);
        }

}
#else
static void pgd_prepopulate_user_pmd(struct mm_struct *mm,
                                     pgd_t *k_pgd, pmd_t *pmds[])
{
}
#endif
/*
 * Xen paravirt assumes that the pgd table is contained in one page, and
 * the 64-bit kernel also assumes that the pgd occupies one page.
 *
 * But a kernel with PAE paging that is not running as a Xen domain
 * only needs 32 bytes for the pgd instead of one page.
 */
#ifdef CONFIG_X86_PAE

#include <linux/slab.h>

#define PGD_SIZE        (PTRS_PER_PGD * sizeof(pgd_t))
#define PGD_ALIGN       32

static struct kmem_cache *pgd_cache;

void __init pgd_cache_init(void)
{
        /*
         * When a PAE kernel is running as a Xen domain, it does not use
         * a shared kernel pmd, and that requires a whole page for the pgd.
         */
        if (!SHARED_KERNEL_PMD)
                return;

        /*
         * When a PAE kernel is not running as a Xen domain, it uses a
         * shared kernel pmd, which does not require a whole page for the
         * pgd: 32 bytes are enough. Create a 32-byte slab at boot time
         * for pgd table allocations.
         */
        pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_ALIGN,
                                      SLAB_PANIC, NULL);
}

static inline pgd_t *_pgd_alloc(void)
{
        /*
         * If SHARED_KERNEL_PMD is not set, the PAE kernel is running as a
         * Xen domain, so allocate a whole page for the pgd.
         */
        if (!SHARED_KERNEL_PMD)
                return (pgd_t *)__get_free_pages(GFP_PGTABLE_USER,
                                                 PGD_ALLOCATION_ORDER);

        /*
         * Otherwise the PAE kernel is not running as a Xen domain, and a
         * 32-byte slab object for the pgd saves memory.
         */
        return kmem_cache_alloc(pgd_cache, GFP_PGTABLE_USER);
}

static inline void _pgd_free(pgd_t *pgd)
{
        if (!SHARED_KERNEL_PMD)
                free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
        else
                kmem_cache_free(pgd_cache, pgd);
}
#else

void __init pgd_cache_init(void)
{
}

static inline pgd_t *_pgd_alloc(void)
{
        return (pgd_t *)__get_free_pages(GFP_PGTABLE_USER,
                                         PGD_ALLOCATION_ORDER);
}

static inline void _pgd_free(pgd_t *pgd)
{
        free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
}
#endif /* CONFIG_X86_PAE */

pgd_t *pgd_alloc(struct mm_struct *mm)
{
        pgd_t *pgd;
        pmd_t *u_pmds[MAX_PREALLOCATED_USER_PMDS];
        pmd_t *pmds[MAX_PREALLOCATED_PMDS];

        pgd = _pgd_alloc();

        if (pgd == NULL)
                goto out;

        mm->pgd = pgd;

        if (preallocate_pmds(mm, pmds, PREALLOCATED_PMDS) != 0)
                goto out_free_pgd;

        if (preallocate_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS) != 0)
                goto out_free_pmds;

        if (paravirt_pgd_alloc(mm) != 0)
                goto out_free_user_pmds;

        /*
         * Make sure that pre-populating the pmds is atomic with
         * respect to anything walking the pgd_list, so that they
         * never see a partially populated pgd.
         */
        spin_lock(&pgd_lock);

        pgd_ctor(mm, pgd);
        pgd_prepopulate_pmd(mm, pgd, pmds);
        pgd_prepopulate_user_pmd(mm, pgd, u_pmds);

        spin_unlock(&pgd_lock);

        return pgd;

out_free_user_pmds:
        free_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS);
out_free_pmds:
        free_pmds(mm, pmds, PREALLOCATED_PMDS);
out_free_pgd:
        _pgd_free(pgd);
out:
        return NULL;
}

void pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
        pgd_mop_up_pmds(mm, pgd);
        pgd_dtor(pgd);
        paravirt_pgd_free(mm, pgd);
        _pgd_free(pgd);
}
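/*
 * Illustrative sketch (not part of the upstream file): pgd_alloc() and
 * pgd_free() are rarely called directly; the generic fork path pairs
 * them per-mm, roughly like the helpers in kernel/fork.c:
 *
 *     static inline int mm_alloc_pgd(struct mm_struct *mm)
 *     {
 *             mm->pgd = pgd_alloc(mm);
 *             if (unlikely(!mm->pgd))
 *                     return -ENOMEM;
 *             return 0;
 *     }
 *
 *     static inline void mm_free_pgd(struct mm_struct *mm)
 *     {
 *             pgd_free(mm, mm->pgd);
 *     }
 *
 * (Sketch only; see kernel/fork.c for the authoritative helpers.)
 */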

/*
 * Used to set accessed or dirty bits in the page table entries
 * on other architectures. On x86, the accessed and dirty bits
 * are tracked by hardware. However, do_wp_page calls this function
 * to also make the pte writeable at the same time the dirty bit is
 * set. In that case we do actually need to write the PTE.
 */
int ptep_set_access_flags(struct vm_area_struct *vma,
                          unsigned long address, pte_t *ptep,
                          pte_t entry, int dirty)
{
        int changed = !pte_same(*ptep, entry);

        if (changed && dirty)
                set_pte(ptep, entry);

        return changed;
}
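/*
 * Illustrative sketch (not part of the upstream file): a typical caller
 * in the generic fault path builds the new entry and lets this helper
 * decide whether the PTE actually needs to be rewritten, roughly:
 *
 *     pte_t entry = pte_mkyoung(*ptep);
 *
 *     if (write_fault)
 *             entry = pte_mkdirty(pte_mkwrite(entry));
 *     if (ptep_set_access_flags(vma, address, ptep, entry, write_fault))
 *             update_mmu_cache(vma, address, ptep);
 *
 * The name write_fault is a placeholder; see mm/memory.c for how the
 * generic code actually drives this.
 */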

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pmdp_set_access_flags(struct vm_area_struct *vma,
                          unsigned long address, pmd_t *pmdp,
                          pmd_t entry, int dirty)
{
        int changed = !pmd_same(*pmdp, entry);

        VM_BUG_ON(address & ~HPAGE_PMD_MASK);

        if (changed && dirty) {
                set_pmd(pmdp, entry);
                /*
                 * We had a write-protection fault here and changed the pmd
                 * to be more permissive. No need to flush the TLB for that,
                 * #PF is architecturally guaranteed to do that and in the
                 * worst-case we'll generate a spurious fault.
                 */
        }

        return changed;
}

int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
                          pud_t *pudp, pud_t entry, int dirty)
{
        int changed = !pud_same(*pudp, entry);

        VM_BUG_ON(address & ~HPAGE_PUD_MASK);

        if (changed && dirty) {
                set_pud(pudp, entry);
                /*
                 * We had a write-protection fault here and changed the pud
                 * to be more permissive. No need to flush the TLB for that,
                 * #PF is architecturally guaranteed to do that and in the
                 * worst-case we'll generate a spurious fault.
                 */
        }

        return changed;
}
#endif

int ptep_test_and_clear_young(struct vm_area_struct *vma,
                              unsigned long addr, pte_t *ptep)
{
        int ret = 0;

        if (pte_young(*ptep))
                ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
                                         (unsigned long *) &ptep->pte);

        return ret;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pmdp_test_and_clear_young(struct vm_area_struct *vma,
                              unsigned long addr, pmd_t *pmdp)
{
        int ret = 0;

        if (pmd_young(*pmdp))
                ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
                                         (unsigned long *)pmdp);

        return ret;
}
int pudp_test_and_clear_young(struct vm_area_struct *vma,
                              unsigned long addr, pud_t *pudp)
{
        int ret = 0;

        if (pud_young(*pudp))
                ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
                                         (unsigned long *)pudp);

        return ret;
}
#endif

int ptep_clear_flush_young(struct vm_area_struct *vma,
                           unsigned long address, pte_t *ptep)
{
        /*
         * On x86 CPUs, clearing the accessed bit without a TLB flush
         * doesn't cause data corruption. [ It could cause incorrect
         * page aging and the (mistaken) reclaim of hot pages, but the
         * chance of that should be relatively low. ]
         *
         * So as a performance optimization don't flush the TLB when
         * clearing the accessed bit, it will eventually be flushed by
         * a context switch or a VM operation anyway. [ In the rare
         * event of it not getting flushed for a long time the delay
         * shouldn't really matter because there's no real memory
         * pressure for swapout to react to. ]
         */
        return ptep_test_and_clear_young(vma, address, ptep);
}
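/*
 * Illustrative contrast (not part of the upstream file): the generic
 * fallback that x86 overrides here does flush, roughly:
 *
 *     int young = ptep_test_and_clear_young(vma, address, ptep);
 *
 *     if (young)
 *             flush_tlb_page(vma, address);
 *     return young;
 *
 * Skipping that flush_tlb_page() call is exactly the optimization the
 * comment above describes.
 */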

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pmdp_clear_flush_young(struct vm_area_struct *vma,
                           unsigned long address, pmd_t *pmdp)
{
        int young;

        VM_BUG_ON(address & ~HPAGE_PMD_MASK);

        young = pmdp_test_and_clear_young(vma, address, pmdp);
        if (young)
                flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);

        return young;
}
#endif

/**
 * reserve_top_address - reserves a hole in the top of kernel address space
 * @reserve: size of hole to reserve
 *
 * Can be used to relocate the fixmap area and poke a hole in the top
 * of kernel address space to make room for a hypervisor.
 */
void __init reserve_top_address(unsigned long reserve)
{
#ifdef CONFIG_X86_32
        BUG_ON(fixmaps_set > 0);
        __FIXADDR_TOP = round_down(-reserve, 1 << PMD_SHIFT) - PAGE_SIZE;
        printk(KERN_INFO "Reserving virtual address space above 0x%08lx (rounded to 0x%08lx)\n",
               -reserve, __FIXADDR_TOP + PAGE_SIZE);
#endif
}

int fixmaps_set;

void __native_set_fixmap(enum fixed_addresses idx, pte_t pte)
{
        unsigned long address = __fix_to_virt(idx);

#ifdef CONFIG_X86_64
        /*
         * Ensure that the static initial page tables are covering the
         * fixmap completely.
         */
        BUILD_BUG_ON(__end_of_permanent_fixed_addresses >
                     (FIXMAP_PMD_NUM * PTRS_PER_PTE));
#endif

        if (idx >= __end_of_fixed_addresses) {
                BUG();
                return;
        }
        set_pte_vaddr(address, pte);
        fixmaps_set++;
}

void native_set_fixmap(enum fixed_addresses idx, phys_addr_t phys,
                       pgprot_t flags)
{
        /* Sanitize 'prot' against any unsupported bits: */
        pgprot_val(flags) &= __default_kernel_pte_mask;

        __native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags));
}
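/*
 * Illustrative sketch (not part of the upstream file): callers normally
 * go through the set_fixmap()/fix_to_virt() wrappers rather than calling
 * native_set_fixmap() directly. Mapping one page of MMIO registers at a
 * known fixmap slot looks roughly like:
 *
 *     set_fixmap_nocache(FIX_EXAMPLE_SLOT, phys_addr);
 *     void __iomem *regs = (void __iomem *)fix_to_virt(FIX_EXAMPLE_SLOT);
 *
 * FIX_EXAMPLE_SLOT is a placeholder; real slots are the enum
 * fixed_addresses entries defined in asm/fixmap.h.
 */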

#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
#ifdef CONFIG_X86_5LEVEL
/**
 * p4d_set_huge - setup kernel P4D mapping
 *
 * No 512GB pages yet -- always return 0
 */
int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
{
        return 0;
}

/**
 * p4d_clear_huge - clear kernel P4D mapping when it is set
 *
 * No 512GB pages yet -- always return 0
 */
int p4d_clear_huge(p4d_t *p4d)
{
        return 0;
}
#endif

/**
 * pud_set_huge - setup kernel PUD mapping
 *
 * MTRRs can override PAT memory types with 4KiB granularity. Therefore, this
 * function sets up a huge page only if any of the following conditions are met:
 *
 * - MTRRs are disabled, or
 *
 * - MTRRs are enabled and the range is completely covered by a single MTRR, or
 *
 * - MTRRs are enabled and the corresponding MTRR memory type is WB, which
 *   has no effect on the requested PAT memory type.
 *
 * Callers should try to decrease page size (1GB -> 2MB -> 4K) if the bigger
 * page mapping attempt fails.
 *
 * Returns 1 on success and 0 on failure.
 */
int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
{
        u8 mtrr, uniform;

        mtrr = mtrr_type_lookup(addr, addr + PUD_SIZE, &uniform);
        if ((mtrr != MTRR_TYPE_INVALID) && (!uniform) &&
            (mtrr != MTRR_TYPE_WRBACK))
                return 0;

        /* Bail out if we are on a populated non-leaf entry: */
        if (pud_present(*pud) && !pud_huge(*pud))
                return 0;

        prot = pgprot_4k_2_large(prot);

        set_pte((pte_t *)pud, pfn_pte(
                (u64)addr >> PAGE_SHIFT,
                __pgprot(pgprot_val(prot) | _PAGE_PSE)));

        return 1;
}

/**
 * pmd_set_huge - setup kernel PMD mapping
 *
 * See text over pud_set_huge() above.
 *
 * Returns 1 on success and 0 on failure.
 */
int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
{
        u8 mtrr, uniform;

        mtrr = mtrr_type_lookup(addr, addr + PMD_SIZE, &uniform);
        if ((mtrr != MTRR_TYPE_INVALID) && (!uniform) &&
            (mtrr != MTRR_TYPE_WRBACK)) {
                pr_warn_once("%s: Cannot satisfy [mem %#010llx-%#010llx] with a huge-page mapping due to MTRR override.\n",
                             __func__, addr, addr + PMD_SIZE);
                return 0;
        }

        /* Bail out if we are on a populated non-leaf entry: */
        if (pmd_present(*pmd) && !pmd_huge(*pmd))
                return 0;

        prot = pgprot_4k_2_large(prot);

        set_pte((pte_t *)pmd, pfn_pte(
                (u64)addr >> PAGE_SHIFT,
                __pgprot(pgprot_val(prot) | _PAGE_PSE)));

        return 1;
}
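/*
 * Illustrative sketch (not part of the upstream file): the generic
 * ioremap/huge-vmap path probes each level and falls back to a smaller
 * size when the arch helper above refuses, roughly:
 *
 *     if (size_and_alignment_allow_1G && pud_set_huge(pud, phys, prot))
 *             return;          (mapped with a 1GB leaf)
 *     if (size_and_alignment_allow_2M && pmd_set_huge(pmd, phys, prot))
 *             return;          (mapped with a 2MB leaf)
 *     (otherwise fall through to ordinary 4K ptes)
 *
 * The condition names are placeholders for the real alignment/size and
 * ioremap_*_enabled() checks performed by the generic code.
 */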

/**
 * pud_clear_huge - clear kernel PUD mapping when it is set
 *
 * Returns 1 on success and 0 on failure (no PUD map is found).
 */
int pud_clear_huge(pud_t *pud)
{
        if (pud_large(*pud)) {
                pud_clear(pud);
                return 1;
        }

        return 0;
}

/**
 * pmd_clear_huge - clear kernel PMD mapping when it is set
 *
 * Returns 1 on success and 0 on failure (no PMD map is found).
 */
int pmd_clear_huge(pmd_t *pmd)
{
        if (pmd_large(*pmd)) {
                pmd_clear(pmd);
                return 1;
        }

        return 0;
}

/*
 * Until we support 512GB pages, skip them in the vmap area.
 */
int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
{
        return 0;
}

#ifdef CONFIG_X86_64
/**
 * pud_free_pmd_page - Clear pud entry and free pmd page.
 * @pud: Pointer to a PUD.
 * @addr: Virtual address associated with pud.
 *
 * Context: The pud range has been unmapped and TLB purged.
 * Return: 1 if clearing the entry succeeded. 0 otherwise.
 *
 * NOTE: Callers must allow a single page allocation.
 */
int pud_free_pmd_page(pud_t *pud, unsigned long addr)
{
        pmd_t *pmd, *pmd_sv;
        pte_t *pte;
        int i;

        pmd = (pmd_t *)pud_page_vaddr(*pud);
        pmd_sv = (pmd_t *)__get_free_page(GFP_KERNEL);
        if (!pmd_sv)
                return 0;

        for (i = 0; i < PTRS_PER_PMD; i++) {
                pmd_sv[i] = pmd[i];
                if (!pmd_none(pmd[i]))
                        pmd_clear(&pmd[i]);
        }

        pud_clear(pud);

        /* INVLPG to clear all paging-structure caches */
        flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);

        for (i = 0; i < PTRS_PER_PMD; i++) {
                if (!pmd_none(pmd_sv[i])) {
                        pte = (pte_t *)pmd_page_vaddr(pmd_sv[i]);
                        free_page((unsigned long)pte);
                }
        }

        free_page((unsigned long)pmd_sv);
        free_page((unsigned long)pmd);

        return 1;
}

/**
 * pmd_free_pte_page - Clear pmd entry and free pte page.
 * @pmd: Pointer to a PMD.
 * @addr: Virtual address associated with pmd.
 *
 * Context: The pmd range has been unmapped and TLB purged.
 * Return: 1 if clearing the entry succeeded. 0 otherwise.
 */
int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
{
        pte_t *pte;

        pte = (pte_t *)pmd_page_vaddr(*pmd);
        pmd_clear(pmd);

        /* INVLPG to clear all paging-structure caches */
        flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);

        free_page((unsigned long)pte);

        return 1;
}

#else /* !CONFIG_X86_64 */

int pud_free_pmd_page(pud_t *pud, unsigned long addr)
{
        return pud_none(*pud);
}

/*
 * Disable free page handling on x86-PAE. This ensures that ioremap()
 * does not update sync'd pmd entries. See vmalloc_sync_one().
 */
int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
{
        return pmd_none(*pmd);
}

#endif /* CONFIG_X86_64 */
#endif  /* CONFIG_HAVE_ARCH_HUGE_VMAP */