linux/arch/powerpc/mm/hugetlbpage.c
   1/*
   2 * PPC Huge TLB Page Support for Kernel.
   3 *
   4 * Copyright (C) 2003 David Gibson, IBM Corporation.
   5 * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor
   6 *
   7 * Based on the IA-32 version:
   8 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
   9 */
  10
  11#include <linux/mm.h>
  12#include <linux/io.h>
  13#include <linux/slab.h>
  14#include <linux/hugetlb.h>
  15#include <linux/export.h>
  16#include <linux/of_fdt.h>
  17#include <linux/memblock.h>
  18#include <linux/bootmem.h>
  19#include <linux/moduleparam.h>
  20#include <linux/swap.h>
  21#include <linux/swapops.h>
  22#include <asm/pgtable.h>
  23#include <asm/pgalloc.h>
  24#include <asm/tlb.h>
  25#include <asm/setup.h>
  26#include <asm/hugetlb.h>
  27#include <asm/pte-walk.h>
  28
  29
  30#ifdef CONFIG_HUGETLB_PAGE
  31
  32#define PAGE_SHIFT_64K  16
  33#define PAGE_SHIFT_512K 19
  34#define PAGE_SHIFT_8M   23
  35#define PAGE_SHIFT_16M  24
  36#define PAGE_SHIFT_16G  34
  37
  38unsigned int HPAGE_SHIFT;
  39EXPORT_SYMBOL(HPAGE_SHIFT);
  40
  41#define hugepd_none(hpd)        (hpd_val(hpd) == 0)
  42
  43pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz)
  44{
  45        /*
  46         * Only called for hugetlbfs pages, hence we can ignore THP and
  47         * the irq-disabled walk.
  48         */
  49        return __find_linux_pte(mm->pgd, addr, NULL, NULL);
  50}
  51
  52static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
  53                           unsigned long address, unsigned pdshift, unsigned pshift)
  54{
  55        struct kmem_cache *cachep;
  56        pte_t *new;
  57        int i;
  58        int num_hugepd;
  59
  60        if (pshift >= pdshift) {
  61                cachep = hugepte_cache;
  62                num_hugepd = 1 << (pshift - pdshift);
  63        } else {
  64                cachep = PGT_CACHE(pdshift - pshift);
  65                num_hugepd = 1;
  66        }
  67
  68        new = kmem_cache_zalloc(cachep, pgtable_gfp_flags(mm, GFP_KERNEL));
  69
  70        BUG_ON(pshift > HUGEPD_SHIFT_MASK);
  71        BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);
  72
  73        if (!new)
  74                return -ENOMEM;
  75
  76        /*
  77         * Make sure other CPUs find the hugepd set only after a
  78         * properly initialized page table is visible to them.
  79         * For more details see the comment in __pte_alloc().
  80         */
  81        smp_wmb();
  82
  83        spin_lock(&mm->page_table_lock);
  84
  85        /*
  86         * We have multiple higher-level entries that point to the same
  87         * actual pte location.  Fill in each as we go and backtrack on error.
  88         * We need all of these so the DTLB pgtable walk code can find the
  89         * right higher-level entry without knowing if it's a hugepage or not.
  90         */
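            /*
             * For example (illustrative, assuming 4K base pages on 8xx where
             * each PGD/PMD entry maps 4M): an 8M page has pshift 23 and
             * pdshift 22, so num_hugepd is 2 and the loop below fills two
             * consecutive directory entries with the same hugepd value.
             */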
  91        for (i = 0; i < num_hugepd; i++, hpdp++) {
  92                if (unlikely(!hugepd_none(*hpdp)))
  93                        break;
  94                else {
  95#ifdef CONFIG_PPC_BOOK3S_64
  96                        *hpdp = __hugepd(__pa(new) |
  97                                         (shift_to_mmu_psize(pshift) << 2));
  98#elif defined(CONFIG_PPC_8xx)
  99                        *hpdp = __hugepd(__pa(new) | _PMD_USER |
 100                                         (pshift == PAGE_SHIFT_8M ? _PMD_PAGE_8M :
 101                                          _PMD_PAGE_512K) | _PMD_PRESENT);
 102#else
 103                        /* We use the old format for PPC_FSL_BOOK3E */
 104                        *hpdp = __hugepd(((unsigned long)new & ~PD_HUGE) | pshift);
 105#endif
 106                }
 107        }
 108        /* If we bailed out of the for loop early, an error occurred; clean up */
 109        if (i < num_hugepd) {
 110                for (i = i - 1 ; i >= 0; i--, hpdp--)
 111                        *hpdp = __hugepd(0);
 112                kmem_cache_free(cachep, new);
 113        }
 114        spin_unlock(&mm->page_table_lock);
 115        return 0;
 116}
 117
 118/*
 119 * These macros define how to determine which level of the page table holds
 120 * the hpdp.
 121 */
 122#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
 123#define HUGEPD_PGD_SHIFT PGDIR_SHIFT
 124#define HUGEPD_PUD_SHIFT PUD_SHIFT
 125#endif
 126
 127/*
 128 * At this point we do the placement change only for BOOK3S 64. This would
 129 * possibly work on other subarchs.
 130 */
 131pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
 132{
 133        pgd_t *pg;
 134        pud_t *pu;
 135        pmd_t *pm;
 136        hugepd_t *hpdp = NULL;
 137        unsigned pshift = __ffs(sz);
 138        unsigned pdshift = PGDIR_SHIFT;
 139
 140        addr &= ~(sz-1);
 141        pg = pgd_offset(mm, addr);
 142
 143#ifdef CONFIG_PPC_BOOK3S_64
 144        if (pshift == PGDIR_SHIFT)
 145                /* 16GB huge page */
 146                return (pte_t *) pg;
 147        else if (pshift > PUD_SHIFT)
 148                /*
 149                 * We need to use hugepd table
 150                 */
 151                hpdp = (hugepd_t *)pg;
 152        else {
 153                pdshift = PUD_SHIFT;
 154                pu = pud_alloc(mm, pg, addr);
 155                if (pshift == PUD_SHIFT)
 156                        return (pte_t *)pu;
 157                else if (pshift > PMD_SHIFT)
 158                        hpdp = (hugepd_t *)pu;
 159                else {
 160                        pdshift = PMD_SHIFT;
 161                        pm = pmd_alloc(mm, pu, addr);
 162                        if (pshift == PMD_SHIFT)
 163                                /* 16MB hugepage */
 164                                return (pte_t *)pm;
 165                        else
 166                                hpdp = (hugepd_t *)pm;
 167                }
 168        }
 169#else
 170        if (pshift >= HUGEPD_PGD_SHIFT) {
 171                hpdp = (hugepd_t *)pg;
 172        } else {
 173                pdshift = PUD_SHIFT;
 174                pu = pud_alloc(mm, pg, addr);
 175                if (pshift >= HUGEPD_PUD_SHIFT) {
 176                        hpdp = (hugepd_t *)pu;
 177                } else {
 178                        pdshift = PMD_SHIFT;
 179                        pm = pmd_alloc(mm, pu, addr);
 180                        hpdp = (hugepd_t *)pm;
 181                }
 182        }
 183#endif
 184        if (!hpdp)
 185                return NULL;
 186
 187        BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));
 188
 189        if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift))
 190                return NULL;
 191
 192        return hugepte_offset(*hpdp, addr, pdshift);
 193}
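    /*
     * Illustrative caller pattern (not part of this file): the generic
     * hugetlb fault path in mm/hugetlb.c does roughly
     *
     *        ptep = huge_pte_alloc(mm, address, huge_page_size(h));
     *
     * and treats a NULL return as an out-of-memory condition.
     */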
 194
 195#ifdef CONFIG_PPC_BOOK3S_64
 196/*
 197 * Tracks gpages after the device tree is scanned and before the
 198 * huge_boot_pages list is ready on pseries.
 199 */
 200#define MAX_NUMBER_GPAGES       1024
 201__initdata static u64 gpage_freearray[MAX_NUMBER_GPAGES];
 202__initdata static unsigned nr_gpages;
 203
 204/*
 205 * Build list of addresses of gigantic pages.  This function is used in early
 206 * boot, before the buddy allocator is set up.
 207 */
 208void __init pseries_add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
 209{
 210        if (!addr)
 211                return;
 212        while (number_of_pages > 0) {
 213                gpage_freearray[nr_gpages] = addr;
 214                nr_gpages++;
 215                number_of_pages--;
 216                addr += page_size;
 217        }
 218}
 219
 220int __init pseries_alloc_bootmem_huge_page(struct hstate *hstate)
 221{
 222        struct huge_bootmem_page *m;
 223        if (nr_gpages == 0)
 224                return 0;
 225        m = phys_to_virt(gpage_freearray[--nr_gpages]);
 226        gpage_freearray[nr_gpages] = 0;
 227        list_add(&m->list, &huge_boot_pages);
 228        m->hstate = hstate;
 229        return 1;
 230}
 231#endif
 232
 233
 234int __init alloc_bootmem_huge_page(struct hstate *h)
 235{
 236
 237#ifdef CONFIG_PPC_BOOK3S_64
 238        if (firmware_has_feature(FW_FEATURE_LPAR) && !radix_enabled())
 239                return pseries_alloc_bootmem_huge_page(h);
 240#endif
 241        return __alloc_bootmem_huge_page(h);
 242}
 243
 244#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
 245#define HUGEPD_FREELIST_SIZE \
 246        ((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t))
 247
 248struct hugepd_freelist {
 249        struct rcu_head rcu;
 250        unsigned int index;
 251        void *ptes[0];
 252};
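    /*
     * Each batch occupies a single page: the header above is followed by as
     * many hugepte-table pointers as fit in the rest of the page (ptes[] is
     * a flexible array).  As a rough illustration, with 4K pages and 8-byte
     * pte_t on a 64-bit configuration that is about 509 slots per batch.
     */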
 253
 254static DEFINE_PER_CPU(struct hugepd_freelist *, hugepd_freelist_cur);
 255
 256static void hugepd_free_rcu_callback(struct rcu_head *head)
 257{
 258        struct hugepd_freelist *batch =
 259                container_of(head, struct hugepd_freelist, rcu);
 260        unsigned int i;
 261
 262        for (i = 0; i < batch->index; i++)
 263                kmem_cache_free(hugepte_cache, batch->ptes[i]);
 264
 265        free_page((unsigned long)batch);
 266}
 267
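    /*
     * Free a hugepte table removed by a hugepage unmap.  If the mm is only
     * visible to the current thread/CPU, free it immediately; otherwise
     * batch it and defer the free to an RCU-sched grace period so that
     * concurrent lockless walkers (such as gup_hugepte() below, which runs
     * with interrupts disabled) cannot see the table disappear under them.
     */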
 268static void hugepd_free(struct mmu_gather *tlb, void *hugepte)
 269{
 270        struct hugepd_freelist **batchp;
 271
 272        batchp = &get_cpu_var(hugepd_freelist_cur);
 273
 274        if (atomic_read(&tlb->mm->mm_users) < 2 ||
 275            mm_is_thread_local(tlb->mm)) {
 276                kmem_cache_free(hugepte_cache, hugepte);
 277                put_cpu_var(hugepd_freelist_cur);
 278                return;
 279        }
 280
 281        if (*batchp == NULL) {
 282                *batchp = (struct hugepd_freelist *)__get_free_page(GFP_ATOMIC);
 283                (*batchp)->index = 0;
 284        }
 285
 286        (*batchp)->ptes[(*batchp)->index++] = hugepte;
 287        if ((*batchp)->index == HUGEPD_FREELIST_SIZE) {
 288                call_rcu_sched(&(*batchp)->rcu, hugepd_free_rcu_callback);
 289                *batchp = NULL;
 290        }
 291        put_cpu_var(hugepd_freelist_cur);
 292}
 293#else
 294static inline void hugepd_free(struct mmu_gather *tlb, void *hugepte) {}
 295#endif
 296
 297static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift,
 298                              unsigned long start, unsigned long end,
 299                              unsigned long floor, unsigned long ceiling)
 300{
 301        pte_t *hugepte = hugepd_page(*hpdp);
 302        int i;
 303
 304        unsigned long pdmask = ~((1UL << pdshift) - 1);
 305        unsigned int num_hugepd = 1;
 306        unsigned int shift = hugepd_shift(*hpdp);
 307
 308        /* Note: On fsl the hpdp may be the first of several */
 309        if (shift > pdshift)
 310                num_hugepd = 1 << (shift - pdshift);
 311
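            /*
             * The floor/ceiling checks below mirror free_pmd_range() and
             * friends in mm/memory.c: only clear the hugepd entries and free
             * the hugepte table when the whole pdshift-sized region that
             * contains this range lies within [floor, ceiling).
             */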
 312        start &= pdmask;
 313        if (start < floor)
 314                return;
 315        if (ceiling) {
 316                ceiling &= pdmask;
 317                if (!ceiling)
 318                        return;
 319        }
 320        if (end - 1 > ceiling - 1)
 321                return;
 322
 323        for (i = 0; i < num_hugepd; i++, hpdp++)
 324                *hpdp = __hugepd(0);
 325
 326        if (shift >= pdshift)
 327                hugepd_free(tlb, hugepte);
 328        else
 329                pgtable_free_tlb(tlb, hugepte, pdshift - shift);
 330}
 331
 332static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
 333                                   unsigned long addr, unsigned long end,
 334                                   unsigned long floor, unsigned long ceiling)
 335{
 336        pmd_t *pmd;
 337        unsigned long next;
 338        unsigned long start;
 339
 340        start = addr;
 341        do {
 342                unsigned long more;
 343
 344                pmd = pmd_offset(pud, addr);
 345                next = pmd_addr_end(addr, end);
 346                if (!is_hugepd(__hugepd(pmd_val(*pmd)))) {
 347                        /*
 348                         * If it is not a hugepd pointer, we should already have
 349                         * found it cleared.
 350                         */
 351                        WARN_ON(!pmd_none_or_clear_bad(pmd));
 352                        continue;
 353                }
 354                /*
 355                 * Increment next by the size of the huge mapping since
 356                 * there may be more than one entry at this level for a
 357                 * single hugepage, but all of them point to the
 358                 * same hugepte table (one kmem cache allocation).
 359                 */
 360                more = addr + (1 << hugepd_shift(*(hugepd_t *)pmd));
 361                if (more > next)
 362                        next = more;
 363
 364                free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT,
 365                                  addr, next, floor, ceiling);
 366        } while (addr = next, addr != end);
 367
 368        start &= PUD_MASK;
 369        if (start < floor)
 370                return;
 371        if (ceiling) {
 372                ceiling &= PUD_MASK;
 373                if (!ceiling)
 374                        return;
 375        }
 376        if (end - 1 > ceiling - 1)
 377                return;
 378
 379        pmd = pmd_offset(pud, start);
 380        pud_clear(pud);
 381        pmd_free_tlb(tlb, pmd, start);
 382        mm_dec_nr_pmds(tlb->mm);
 383}
 384
 385static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
 386                                   unsigned long addr, unsigned long end,
 387                                   unsigned long floor, unsigned long ceiling)
 388{
 389        pud_t *pud;
 390        unsigned long next;
 391        unsigned long start;
 392
 393        start = addr;
 394        do {
 395                pud = pud_offset(pgd, addr);
 396                next = pud_addr_end(addr, end);
 397                if (!is_hugepd(__hugepd(pud_val(*pud)))) {
 398                        if (pud_none_or_clear_bad(pud))
 399                                continue;
 400                        hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
 401                                               ceiling);
 402                } else {
 403                        unsigned long more;
 404                        /*
 405                         * Increment next by the size of the huge mapping since
 406                         * there may be more than one entry at this level for a
 407                         * single hugepage, but all of them point to the
 408                         * same hugepte table (one kmem cache allocation).
 409                         */
 410                        more = addr + (1 << hugepd_shift(*(hugepd_t *)pud));
 411                        if (more > next)
 412                                next = more;
 413
 414                        free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT,
 415                                          addr, next, floor, ceiling);
 416                }
 417        } while (addr = next, addr != end);
 418
 419        start &= PGDIR_MASK;
 420        if (start < floor)
 421                return;
 422        if (ceiling) {
 423                ceiling &= PGDIR_MASK;
 424                if (!ceiling)
 425                        return;
 426        }
 427        if (end - 1 > ceiling - 1)
 428                return;
 429
 430        pud = pud_offset(pgd, start);
 431        pgd_clear(pgd);
 432        pud_free_tlb(tlb, pud, start);
 433        mm_dec_nr_puds(tlb->mm);
 434}
 435
 436/*
 437 * This function frees user-level page tables of a process.
 438 */
 439void hugetlb_free_pgd_range(struct mmu_gather *tlb,
 440                            unsigned long addr, unsigned long end,
 441                            unsigned long floor, unsigned long ceiling)
 442{
 443        pgd_t *pgd;
 444        unsigned long next;
 445
 446        /*
 447         * Because there are a number of different possible pagetable
 448         * layouts for hugepage ranges, we limit knowledge of how
 449         * things should be laid out to the allocation path
 450         * (huge_pte_alloc(), above).  Everything else works out the
 451         * structure as it goes from information in the hugepd
 452         * pointers.  That means that here we can't use the
 453         * optimization used in the normal page free_pgd_range(), of
 454         * checking whether we're actually covering a large enough
 455         * range to have to do anything at the top level of the walk
 456         * instead of at the bottom.
 457         *
 458         * To make sense of this, you should probably go read the big
 459         * block comment at the top of the normal free_pgd_range(),
 460         * too.
 461         */
 462
 463        do {
 464                next = pgd_addr_end(addr, end);
 465                pgd = pgd_offset(tlb->mm, addr);
 466                if (!is_hugepd(__hugepd(pgd_val(*pgd)))) {
 467                        if (pgd_none_or_clear_bad(pgd))
 468                                continue;
 469                        hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
 470                } else {
 471                        unsigned long more;
 472                        /*
 473                         * Increment next by the size of the huge mapping since
 474                         * there may be more than one entry at the pgd level
 475                         * for a single hugepage, but all of them point to the
 476                         * same hugepte table (one kmem cache allocation).
 477                         */
 478                        more = addr + (1 << hugepd_shift(*(hugepd_t *)pgd));
 479                        if (more > next)
 480                                next = more;
 481
 482                        free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
 483                                          addr, next, floor, ceiling);
 484                }
 485        } while (addr = next, addr != end);
 486}
 487
 488struct page *follow_huge_pd(struct vm_area_struct *vma,
 489                            unsigned long address, hugepd_t hpd,
 490                            int flags, int pdshift)
 491{
 492        pte_t *ptep;
 493        spinlock_t *ptl;
 494        struct page *page = NULL;
 495        unsigned long mask;
 496        int shift = hugepd_shift(hpd);
 497        struct mm_struct *mm = vma->vm_mm;
 498
 499retry:
 500        ptl = &mm->page_table_lock;
 501        spin_lock(ptl);
 502
 503        ptep = hugepte_offset(hpd, address, pdshift);
 504        if (pte_present(*ptep)) {
 505                mask = (1UL << shift) - 1;
 506                page = pte_page(*ptep);
 507                page += ((address & mask) >> PAGE_SHIFT);
 508                if (flags & FOLL_GET)
 509                        get_page(page);
 510        } else {
 511                if (is_hugetlb_entry_migration(*ptep)) {
 512                        spin_unlock(ptl);
 513                        __migration_entry_wait(mm, ptep, ptl);
 514                        goto retry;
 515                }
 516        }
 517        spin_unlock(ptl);
 518        return page;
 519}
 520
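    /*
     * Worked example (illustrative): with sz = 16M (0x1000000) and
     * addr = 0x12345678, __boundary below is 0x13000000.  The "- 1"
     * comparisons keep the result correct even if addr + sz wraps to 0.
     */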
 521static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
 522                                      unsigned long sz)
 523{
 524        unsigned long __boundary = (addr + sz) & ~(sz-1);
 525        return (__boundary - 1 < end - 1) ? __boundary : end;
 526}
 527
 528int gup_huge_pd(hugepd_t hugepd, unsigned long addr, unsigned pdshift,
 529                unsigned long end, int write, struct page **pages, int *nr)
 530{
 531        pte_t *ptep;
 532        unsigned long sz = 1UL << hugepd_shift(hugepd);
 533        unsigned long next;
 534
 535        ptep = hugepte_offset(hugepd, addr, pdshift);
 536        do {
 537                next = hugepte_addr_end(addr, end, sz);
 538                if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr))
 539                        return 0;
 540        } while (ptep++, addr = next, addr != end);
 541
 542        return 1;
 543}
 544
 545#ifdef CONFIG_PPC_MM_SLICES
 546unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 547                                        unsigned long len, unsigned long pgoff,
 548                                        unsigned long flags)
 549{
 550        struct hstate *hstate = hstate_file(file);
 551        int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));
 552
 553#ifdef CONFIG_PPC_RADIX_MMU
 554        if (radix_enabled())
 555                return radix__hugetlb_get_unmapped_area(file, addr, len,
 556                                                       pgoff, flags);
 557#endif
 558        return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1);
 559}
 560#endif
 561
 562unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
 563{
 564#ifdef CONFIG_PPC_MM_SLICES
 565        /* With radix we don't use slices, so derive it from the vma */
 566        if (!radix_enabled()) {
 567                unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start);
 568
 569                return 1UL << mmu_psize_to_shift(psize);
 570        }
 571#endif
 572        return vma_kernel_pagesize(vma);
 573}
 574
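    /*
     * E.g. 16M = 2^24 (even exponent) is a power of 4, while 8M = 2^23
     * is not.
     */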
 575static inline bool is_power_of_4(unsigned long x)
 576{
 577        if (is_power_of_2(x))
 578                return (__ilog2(x) % 2) ? false : true;
 579        return false;
 580}
 581
 582static int __init add_huge_page_size(unsigned long long size)
 583{
 584        int shift = __ffs(size);
 585        int mmu_psize;
 586
 587        /* Check that it is a page size supported by the hardware and
 588         * that it fits within pagetable and slice limits. */
 589        if (size <= PAGE_SIZE)
 590                return -EINVAL;
 591#if defined(CONFIG_PPC_FSL_BOOK3E)
 592        if (!is_power_of_4(size))
 593                return -EINVAL;
 594#elif !defined(CONFIG_PPC_8xx)
 595        if (!is_power_of_2(size) || (shift > SLICE_HIGH_SHIFT))
 596                return -EINVAL;
 597#endif
 598
 599        if ((mmu_psize = shift_to_mmu_psize(shift)) < 0)
 600                return -EINVAL;
 601
 602#ifdef CONFIG_PPC_BOOK3S_64
 603        /*
 604         * Of the page sizes reported by firmware, only add hugetlb
 605         * support for those that can also be supported by the Linux
 606         * page table layout.
 607         * For now we have
 608         * Radix: 2M (and 1G, except on POWER9 DD1)
 609         * Hash: 16M and 16G
 610         */
 611        if (radix_enabled()) {
 612                if (mmu_psize != MMU_PAGE_2M) {
 613                        if (cpu_has_feature(CPU_FTR_POWER9_DD1) ||
 614                            (mmu_psize != MMU_PAGE_1G))
 615                                return -EINVAL;
 616                }
 617        } else {
 618                if (mmu_psize != MMU_PAGE_16M && mmu_psize != MMU_PAGE_16G)
 619                        return -EINVAL;
 620        }
 621#endif
 622
 623        BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);
 624
 625        /* Return if huge page size has already been setup */
 626        if (size_to_hstate(size))
 627                return 0;
 628
 629        hugetlb_add_hstate(shift - PAGE_SHIFT);
 630
 631        return 0;
 632}
 633
 634static int __init hugepage_setup_sz(char *str)
 635{
 636        unsigned long long size;
 637
 638        size = memparse(str, &str);
 639
 640        if (add_huge_page_size(size) != 0) {
 641                hugetlb_bad_size();
 642                pr_err("Invalid huge page size specified (%llu)\n", size);
 643        }
 644
 645        return 1;
 646}
 647__setup("hugepagesz=", hugepage_setup_sz);
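    /*
     * For example, booting with "hugepagesz=16M hugepages=128" on a hash MMU
     * Book3S 64 system registers the 16M hstate here; the companion
     * "hugepages=" parameter is handled by the generic code in mm/hugetlb.c.
     */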
 648
 649struct kmem_cache *hugepte_cache;
 650static int __init hugetlbpage_init(void)
 651{
 652        int psize;
 653
 654#if !defined(CONFIG_PPC_FSL_BOOK3E) && !defined(CONFIG_PPC_8xx)
 655        if (!radix_enabled() && !mmu_has_feature(MMU_FTR_16M_PAGE))
 656                return -ENODEV;
 657#endif
 658        for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
 659                unsigned shift;
 660                unsigned pdshift;
 661
 662                if (!mmu_psize_defs[psize].shift)
 663                        continue;
 664
 665                shift = mmu_psize_to_shift(psize);
 666
 667#ifdef CONFIG_PPC_BOOK3S_64
 668                if (shift > PGDIR_SHIFT)
 669                        continue;
 670                else if (shift > PUD_SHIFT)
 671                        pdshift = PGDIR_SHIFT;
 672                else if (shift > PMD_SHIFT)
 673                        pdshift = PUD_SHIFT;
 674                else
 675                        pdshift = PMD_SHIFT;
 676#else
 677                if (shift < HUGEPD_PUD_SHIFT)
 678                        pdshift = PMD_SHIFT;
 679                else if (shift < HUGEPD_PGD_SHIFT)
 680                        pdshift = PUD_SHIFT;
 681                else
 682                        pdshift = PGDIR_SHIFT;
 683#endif
 684
 685                if (add_huge_page_size(1ULL << shift) < 0)
 686                        continue;
 687                /*
 688                 * If pdshift and shift are the same, we don't use the
 689                 * pgtable cache for the hugepd.
 690                 */
 691                if (pdshift > shift)
 692                        pgtable_cache_add(pdshift - shift, NULL);
 693#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
 694                else if (!hugepte_cache) {
 695                        /*
 696                         * Create a kmem cache for hugeptes.  The bottom bits of
 697                         * the hugepd pointer have the size encoded in them, so
 698                         * the allocations must be aligned to keep those bits free.
 699                         */
 700                        hugepte_cache = kmem_cache_create("hugepte-cache",
 701                                                          sizeof(pte_t),
 702                                                          HUGEPD_SHIFT_MASK + 1,
 703                                                          0, NULL);
 704                        if (hugepte_cache == NULL)
 705                                panic("%s: Unable to create kmem cache "
 706                                      "for hugeptes\n", __func__);
 707
 708                }
 709#endif
 710        }
 711
 712#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
 713        /* Default hpage size = 4M on FSL_BOOK3E and 512k on 8xx */
 714        if (mmu_psize_defs[MMU_PAGE_4M].shift)
 715                HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_4M].shift;
 716        else if (mmu_psize_defs[MMU_PAGE_512K].shift)
 717                HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_512K].shift;
 718#else
 719        /* Set default large page size. Currently, we pick 16M, 1M or 2M
 720         * depending on what is available
 721         */
 722        if (mmu_psize_defs[MMU_PAGE_16M].shift)
 723                HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift;
 724        else if (mmu_psize_defs[MMU_PAGE_1M].shift)
 725                HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift;
 726        else if (mmu_psize_defs[MMU_PAGE_2M].shift)
 727                HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_2M].shift;
 728#endif
 729        return 0;
 730}
 731
 732arch_initcall(hugetlbpage_init);
 733
 734void flush_dcache_icache_hugepage(struct page *page)
 735{
 736        int i;
 737        void *start;
 738
 739        BUG_ON(!PageCompound(page));
 740
 741        for (i = 0; i < (1UL << compound_order(page)); i++) {
 742                if (!PageHighMem(page)) {
 743                        __flush_dcache_icache(page_address(page+i));
 744                } else {
 745                        start = kmap_atomic(page+i);
 746                        __flush_dcache_icache(start);
 747                        kunmap_atomic(start);
 748                }
 749        }
 750}
 751
 752#endif /* CONFIG_HUGETLB_PAGE */
 753
 754/*
 755 * We have 4 cases for pgds and pmds:
 756 * (1) invalid (all zeroes)
 757 * (2) pointer to next table, as normal; bottom 6 bits == 0
 758 * (3) leaf pte for huge page _PAGE_PTE set
 759 * (4) hugepd pointer, _PAGE_PTE = 0 and bits [2..6] indicate size of table
 760 *
 761 * So long as we atomically load page table pointers we are safe against teardown,
 762 * and we can follow the address down to the page and take a ref on it.
 763 * This function needs to be called with interrupts disabled. We use this variant
 764 * when we have MSR[EE] = 0 but the paca->irq_soft_mask = IRQS_ENABLED
 765 */
 766pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
 767                        bool *is_thp, unsigned *hpage_shift)
 768{
 769        pgd_t pgd, *pgdp;
 770        pud_t pud, *pudp;
 771        pmd_t pmd, *pmdp;
 772        pte_t *ret_pte;
 773        hugepd_t *hpdp = NULL;
 774        unsigned pdshift = PGDIR_SHIFT;
 775
 776        if (hpage_shift)
 777                *hpage_shift = 0;
 778
 779        if (is_thp)
 780                *is_thp = false;
 781
 782        pgdp = pgdir + pgd_index(ea);
 783        pgd  = READ_ONCE(*pgdp);
 784        /*
 785         * Always operate on the local stack value. This makes sure the
 786         * value doesn't get updated by a parallel THP split/collapse,
 787         * page fault or a page unmap. The returned pte_t * is still not
 788         * stable, so the caller must recheck it for the above conditions.
 789         */
 790        if (pgd_none(pgd))
 791                return NULL;
 792        else if (pgd_huge(pgd)) {
 793                ret_pte = (pte_t *) pgdp;
 794                goto out;
 795        } else if (is_hugepd(__hugepd(pgd_val(pgd))))
 796                hpdp = (hugepd_t *)&pgd;
 797        else {
 798                /*
 799                 * Even if we end up with an unmap, the pgtable will not
 800                 * be freed, because we do an RCU free and interrupts are
 801                 * disabled here.
 802                 */
 803                pdshift = PUD_SHIFT;
 804                pudp = pud_offset(&pgd, ea);
 805                pud  = READ_ONCE(*pudp);
 806
 807                if (pud_none(pud))
 808                        return NULL;
 809                else if (pud_huge(pud)) {
 810                        ret_pte = (pte_t *) pudp;
 811                        goto out;
 812                } else if (is_hugepd(__hugepd(pud_val(pud))))
 813                        hpdp = (hugepd_t *)&pud;
 814                else {
 815                        pdshift = PMD_SHIFT;
 816                        pmdp = pmd_offset(&pud, ea);
 817                        pmd  = READ_ONCE(*pmdp);
 818                        /*
 819                         * A hugepage collapse is captured by pmd_none, because
 820                         * it marks the pmd none and does an hpte invalidate.
 821                         */
 822                        if (pmd_none(pmd))
 823                                return NULL;
 824
 825                        if (pmd_trans_huge(pmd) || pmd_devmap(pmd)) {
 826                                if (is_thp)
 827                                        *is_thp = true;
 828                                ret_pte = (pte_t *) pmdp;
 829                                goto out;
 830                        }
 831
 832                        if (pmd_huge(pmd)) {
 833                                ret_pte = (pte_t *) pmdp;
 834                                goto out;
 835                        } else if (is_hugepd(__hugepd(pmd_val(pmd))))
 836                                hpdp = (hugepd_t *)&pmd;
 837                        else
 838                                return pte_offset_kernel(&pmd, ea);
 839                }
 840        }
 841        if (!hpdp)
 842                return NULL;
 843
 844        ret_pte = hugepte_offset(*hpdp, ea, pdshift);
 845        pdshift = hugepd_shift(*hpdp);
 846out:
 847        if (hpage_shift)
 848                *hpage_shift = pdshift;
 849        return ret_pte;
 850}
 851EXPORT_SYMBOL_GPL(__find_linux_pte);
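    /*
     * Illustrative call pattern (not from this file): callers either use the
     * find_linux_pte() wrapper from asm/pte-walk.h or open-code something
     * like
     *
     *        local_irq_save(flags);
     *        ptep = __find_linux_pte(mm->pgd, ea, &is_thp, &hshift);
     *        if (ptep)
     *                pte = READ_ONCE(*ptep);
     *        local_irq_restore(flags);
     *
     * treating hshift == 0 as an ordinary PAGE_SIZE mapping.
     */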
 852
 853int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
 854                unsigned long end, int write, struct page **pages, int *nr)
 855{
 856        unsigned long pte_end;
 857        struct page *head, *page;
 858        pte_t pte;
 859        int refs;
 860
 861        pte_end = (addr + sz) & ~(sz-1);
 862        if (pte_end < end)
 863                end = pte_end;
 864
 865        pte = READ_ONCE(*ptep);
 866
 867        if (!pte_access_permitted(pte, write))
 868                return 0;
 869
 870        /* hugepages are never "special" */
 871        VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
 872
 873        refs = 0;
 874        head = pte_page(pte);
 875
 876        page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
 877        do {
 878                VM_BUG_ON(compound_head(page) != head);
 879                pages[*nr] = page;
 880                (*nr)++;
 881                page++;
 882                refs++;
 883        } while (addr += PAGE_SIZE, addr != end);
 884
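            /*
             * Lockless GUP: no locks are held, so take the references on the
             * compound head speculatively and then recheck the pte.  If either
             * step fails, undo everything and return 0 so the caller can fall
             * back to the slow path.
             */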
 885        if (!page_cache_add_speculative(head, refs)) {
 886                *nr -= refs;
 887                return 0;
 888        }
 889
 890        if (unlikely(pte_val(pte) != pte_val(*ptep))) {
 891                /* Could be optimized better */
 892                *nr -= refs;
 893                while (refs--)
 894                        put_page(head);
 895                return 0;
 896        }
 897
 898        return 1;
 899}
 900