linux/arch/powerpc/mm/hugetlbpage.c
   1/*
   2 * PPC Huge TLB Page Support for Kernel.
   3 *
   4 * Copyright (C) 2003 David Gibson, IBM Corporation.
   5 * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor
   6 *
   7 * Based on the IA-32 version:
   8 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
   9 */
  10
  11#include <linux/mm.h>
  12#include <linux/io.h>
  13#include <linux/slab.h>
  14#include <linux/hugetlb.h>
  15#include <linux/export.h>
  16#include <linux/of_fdt.h>
  17#include <linux/memblock.h>
  18#include <linux/bootmem.h>
  19#include <linux/moduleparam.h>
  20#include <asm/pgtable.h>
  21#include <asm/pgalloc.h>
  22#include <asm/tlb.h>
  23#include <asm/setup.h>
  24#include <asm/hugetlb.h>
  25
  26#ifdef CONFIG_HUGETLB_PAGE
  27
  28#define PAGE_SHIFT_64K  16
  29#define PAGE_SHIFT_16M  24
  30#define PAGE_SHIFT_16G  34
  31
  32unsigned int HPAGE_SHIFT;
  33
  34/*
  35 * Tracks gpages after the device tree is scanned and before the
  36 * huge_boot_pages list is ready.  On non-Freescale implementations, this is
  37 * just used to track 16G pages and so is a single array.  FSL-based
  38 * implementations may have more than one gpage size, so we need multiple
   39 * arrays.
  40 */
  41#ifdef CONFIG_PPC_FSL_BOOK3E
  42#define MAX_NUMBER_GPAGES       128
  43struct psize_gpages {
  44        u64 gpage_list[MAX_NUMBER_GPAGES];
  45        unsigned int nr_gpages;
  46};
  47static struct psize_gpages gpage_freearray[MMU_PAGE_COUNT];
  48#else
  49#define MAX_NUMBER_GPAGES       1024
  50static u64 gpage_freearray[MAX_NUMBER_GPAGES];
  51static unsigned nr_gpages;
  52#endif
  53
  54#define hugepd_none(hpd)        ((hpd).pd == 0)
  55
  56pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
  57{
  58        /* Only called for hugetlbfs pages, hence can ignore THP */
  59        return __find_linux_pte_or_hugepte(mm->pgd, addr, NULL, NULL);
  60}
  61
  62static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
  63                           unsigned long address, unsigned pdshift, unsigned pshift)
  64{
  65        struct kmem_cache *cachep;
  66        pte_t *new;
  67
  68#ifdef CONFIG_PPC_FSL_BOOK3E
  69        int i;
  70        int num_hugepd = 1 << (pshift - pdshift);
  71        cachep = hugepte_cache;
  72#else
  73        cachep = PGT_CACHE(pdshift - pshift);
  74#endif
  75
  76        new = kmem_cache_zalloc(cachep, GFP_KERNEL|__GFP_REPEAT);
  77
  78        BUG_ON(pshift > HUGEPD_SHIFT_MASK);
  79        BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);
  80
   81        if (!new)
  82                return -ENOMEM;
  83
  84        spin_lock(&mm->page_table_lock);
  85#ifdef CONFIG_PPC_FSL_BOOK3E
  86        /*
  87         * We have multiple higher-level entries that point to the same
  88         * actual pte location.  Fill in each as we go and backtrack on error.
  89         * We need all of these so the DTLB pgtable walk code can find the
  90         * right higher-level entry without knowing if it's a hugepage or not.
  91         */
  92        for (i = 0; i < num_hugepd; i++, hpdp++) {
  93                if (unlikely(!hugepd_none(*hpdp)))
  94                        break;
  95                else
  96                        /* We use the old format for PPC_FSL_BOOK3E */
  97                        hpdp->pd = ((unsigned long)new & ~PD_HUGE) | pshift;
  98        }
  99        /* If we bailed from the for loop early, an error occurred, clean up */
 100        if (i < num_hugepd) {
 101                for (i = i - 1 ; i >= 0; i--, hpdp--)
 102                        hpdp->pd = 0;
 103                kmem_cache_free(cachep, new);
 104        }
 105#else
 106        if (!hugepd_none(*hpdp))
 107                kmem_cache_free(cachep, new);
 108        else {
 109#ifdef CONFIG_PPC_BOOK3S_64
 110                hpdp->pd = (unsigned long)new |
 111                            (shift_to_mmu_psize(pshift) << 2);
 112#else
 113                hpdp->pd = ((unsigned long)new & ~PD_HUGE) | pshift;
 114#endif
 115        }
 116#endif
 117        spin_unlock(&mm->page_table_lock);
 118        return 0;
 119}
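     
     /*
      * Illustrative sketch of what __hugepte_alloc() leaves behind, using a
      * 16M page (pshift == 24) as an example; the matching decode macros
      * live in the pgtable headers, not in this file:
      *
      *   Book3S 64:  hpdp->pd = (unsigned long)new | (MMU_PAGE_16M << 2)
      *   others:     hpdp->pd = ((unsigned long)new & ~PD_HUGE) | 24
      *
      * The hugepte table pointer and the page-size information share one
      * word, so the allocation must be HUGEPD_SHIFT_MASK aligned, which is
      * exactly what the BUG_ON() above checks.
      */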
 120
 121/*
 122 * These macros define how to determine which level of the page table holds
 123 * the hpdp.
 124 */
 125#ifdef CONFIG_PPC_FSL_BOOK3E
 126#define HUGEPD_PGD_SHIFT PGDIR_SHIFT
 127#define HUGEPD_PUD_SHIFT PUD_SHIFT
 128#else
 129#define HUGEPD_PGD_SHIFT PUD_SHIFT
 130#define HUGEPD_PUD_SHIFT PMD_SHIFT
 131#endif
 132
 133#ifdef CONFIG_PPC_BOOK3S_64
 134/*
 135 * At this point we do the placement change only for BOOK3S 64. This would
 136 * possibly work on other subarchs.
 137 */
 138pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
 139{
 140        pgd_t *pg;
 141        pud_t *pu;
 142        pmd_t *pm;
 143        hugepd_t *hpdp = NULL;
 144        unsigned pshift = __ffs(sz);
 145        unsigned pdshift = PGDIR_SHIFT;
 146
 147        addr &= ~(sz-1);
 148        pg = pgd_offset(mm, addr);
 149
 150        if (pshift == PGDIR_SHIFT)
 151                /* 16GB huge page */
 152                return (pte_t *) pg;
 153        else if (pshift > PUD_SHIFT)
 154                /*
  155                 * We need to use a hugepd table
 156                 */
 157                hpdp = (hugepd_t *)pg;
 158        else {
 159                pdshift = PUD_SHIFT;
 160                pu = pud_alloc(mm, pg, addr);
 161                if (pshift == PUD_SHIFT)
 162                        return (pte_t *)pu;
 163                else if (pshift > PMD_SHIFT)
 164                        hpdp = (hugepd_t *)pu;
 165                else {
 166                        pdshift = PMD_SHIFT;
 167                        pm = pmd_alloc(mm, pu, addr);
 168                        if (pshift == PMD_SHIFT)
 169                                /* 16MB hugepage */
 170                                return (pte_t *)pm;
 171                        else
 172                                hpdp = (hugepd_t *)pm;
 173                }
 174        }
 175        if (!hpdp)
 176                return NULL;
 177
 178        BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));
 179
 180        if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift))
 181                return NULL;
 182
 183        return hugepte_offset(*hpdp, addr, pdshift);
 184}
 185
 186#else
 187
 188pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
 189{
 190        pgd_t *pg;
 191        pud_t *pu;
 192        pmd_t *pm;
 193        hugepd_t *hpdp = NULL;
 194        unsigned pshift = __ffs(sz);
 195        unsigned pdshift = PGDIR_SHIFT;
 196
 197        addr &= ~(sz-1);
 198
 199        pg = pgd_offset(mm, addr);
 200
 201        if (pshift >= HUGEPD_PGD_SHIFT) {
 202                hpdp = (hugepd_t *)pg;
 203        } else {
 204                pdshift = PUD_SHIFT;
 205                pu = pud_alloc(mm, pg, addr);
 206                if (pshift >= HUGEPD_PUD_SHIFT) {
 207                        hpdp = (hugepd_t *)pu;
 208                } else {
 209                        pdshift = PMD_SHIFT;
 210                        pm = pmd_alloc(mm, pu, addr);
 211                        hpdp = (hugepd_t *)pm;
 212                }
 213        }
 214
 215        if (!hpdp)
 216                return NULL;
 217
 218        BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));
 219
 220        if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift))
 221                return NULL;
 222
 223        return hugepte_offset(*hpdp, addr, pdshift);
 224}
 225#endif
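     
     /*
      * Sketch (not built) of how a fault-path caller is expected to use the
      * two entry points above.  The real caller is the generic hugetlb code
      * in mm/hugetlb.c, which takes the size from the VMA's hstate; the
      * function below is only an illustration.
      */
     #if 0
     static int example_map_one_hugepage(struct mm_struct *mm,
                                         unsigned long addr, unsigned long sz)
     {
             pte_t *ptep;
     
             addr &= ~(sz - 1);
             ptep = huge_pte_offset(mm, addr);       /* already populated? */
             if (!ptep)
                     ptep = huge_pte_alloc(mm, addr, sz);
             if (!ptep)
                     return -ENOMEM;
     
             /* A real caller would now build a pte and set_huge_pte_at() it. */
             return 0;
     }
     #endif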
 226
 227#ifdef CONFIG_PPC_FSL_BOOK3E
 228/* Build list of addresses of gigantic pages.  This function is used in early
  229 * boot before the buddy allocator is set up.
 230 */
 231void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
 232{
 233        unsigned int idx = shift_to_mmu_psize(__ffs(page_size));
 234        int i;
 235
 236        if (addr == 0)
 237                return;
 238
 239        gpage_freearray[idx].nr_gpages = number_of_pages;
 240
 241        for (i = 0; i < number_of_pages; i++) {
 242                gpage_freearray[idx].gpage_list[i] = addr;
 243                addr += page_size;
 244        }
 245}
 246
 247/*
 248 * Moves the gigantic page addresses from the temporary list to the
 249 * huge_boot_pages list.
 250 */
 251int alloc_bootmem_huge_page(struct hstate *hstate)
 252{
 253        struct huge_bootmem_page *m;
 254        int idx = shift_to_mmu_psize(huge_page_shift(hstate));
 255        int nr_gpages = gpage_freearray[idx].nr_gpages;
 256
 257        if (nr_gpages == 0)
 258                return 0;
 259
 260#ifdef CONFIG_HIGHMEM
 261        /*
 262         * If gpages can be in highmem we can't use the trick of storing the
 263         * data structure in the page; allocate space for this
 264         */
 265        m = memblock_virt_alloc(sizeof(struct huge_bootmem_page), 0);
 266        m->phys = gpage_freearray[idx].gpage_list[--nr_gpages];
 267#else
 268        m = phys_to_virt(gpage_freearray[idx].gpage_list[--nr_gpages]);
 269#endif
 270
 271        list_add(&m->list, &huge_boot_pages);
 272        gpage_freearray[idx].nr_gpages = nr_gpages;
 273        gpage_freearray[idx].gpage_list[nr_gpages] = 0;
 274        m->hstate = hstate;
 275
 276        return 1;
 277}
 278/*
 279 * Scan the command line hugepagesz= options for gigantic pages; store those in
 280 * a list that we use to allocate the memory once all options are parsed.
 281 */
 282
 283unsigned long gpage_npages[MMU_PAGE_COUNT];
 284
 285static int __init do_gpage_early_setup(char *param, char *val,
 286                                       const char *unused, void *arg)
 287{
 288        static phys_addr_t size;
 289        unsigned long npages;
 290
 291        /*
 292         * The hugepagesz and hugepages cmdline options are interleaved.  We
 293         * use the size variable to keep track of whether or not this was done
 294         * properly and skip over instances where it is incorrect.  Other
 295         * command-line parsing code will issue warnings, so we don't need to.
 296         *
 297         */
 298        if ((strcmp(param, "default_hugepagesz") == 0) ||
 299            (strcmp(param, "hugepagesz") == 0)) {
 300                size = memparse(val, NULL);
 301        } else if (strcmp(param, "hugepages") == 0) {
 302                if (size != 0) {
 303                        if (sscanf(val, "%lu", &npages) <= 0)
 304                                npages = 0;
 305                        if (npages > MAX_NUMBER_GPAGES) {
 306                                pr_warn("MMU: %lu pages requested for page "
 307                                        "size %llu KB, limiting to "
 308                                        __stringify(MAX_NUMBER_GPAGES) "\n",
 309                                        npages, size / 1024);
 310                                npages = MAX_NUMBER_GPAGES;
 311                        }
 312                        gpage_npages[shift_to_mmu_psize(__ffs(size))] = npages;
 313                        size = 0;
 314                }
 315        }
 316        return 0;
 317}
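     
     /*
      * Worked example (made-up command line): for "hugepagesz=1G hugepages=2
      * hugepages=4", parse_args() ends up calling
      *
      *   do_gpage_early_setup("hugepagesz", "1G", ...)  -> size = 1G
      *   do_gpage_early_setup("hugepages",  "2",  ...)  -> gpage_npages[1G psize] = 2, size = 0
      *   do_gpage_early_setup("hugepages",  "4",  ...)  -> skipped, since size == 0
      *
      * so a "hugepages=" with no preceding size is quietly ignored here and
      * left for the generic command-line code to warn about.
      */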
 318
 319
 320/*
 321 * This function allocates physical space for pages that are larger than the
 322 * buddy allocator can handle.  We want to allocate these in highmem because
 323 * the amount of lowmem is limited.  This means that this function MUST be
 324 * called before lowmem_end_addr is set up in MMU_init() in order for the lmb
  325 * allocator to grab highmem.
 326 */
 327void __init reserve_hugetlb_gpages(void)
 328{
 329        static __initdata char cmdline[COMMAND_LINE_SIZE];
 330        phys_addr_t size, base;
 331        int i;
 332
 333        strlcpy(cmdline, boot_command_line, COMMAND_LINE_SIZE);
 334        parse_args("hugetlb gpages", cmdline, NULL, 0, 0, 0,
 335                        NULL, &do_gpage_early_setup);
 336
 337        /*
 338         * Walk gpage list in reverse, allocating larger page sizes first.
 339         * Skip over unsupported sizes, or sizes that have 0 gpages allocated.
 340         * When we reach the point in the list where pages are no longer
 341         * considered gpages, we're done.
 342         */
 343        for (i = MMU_PAGE_COUNT-1; i >= 0; i--) {
 344                if (mmu_psize_defs[i].shift == 0 || gpage_npages[i] == 0)
 345                        continue;
 346                else if (mmu_psize_to_shift(i) < (MAX_ORDER + PAGE_SHIFT))
 347                        break;
 348
 349                size = (phys_addr_t)(1ULL << mmu_psize_to_shift(i));
 350                base = memblock_alloc_base(size * gpage_npages[i], size,
 351                                           MEMBLOCK_ALLOC_ANYWHERE);
 352                add_gpage(base, size, gpage_npages[i]);
 353        }
 354}
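     
     /*
      * Continuing the example above: "hugepagesz=1G hugepages=2" ends up
      * reserved here as
      *
      *   memblock_alloc_base(2 * SZ_1G, SZ_1G, MEMBLOCK_ALLOC_ANYWHERE)
      *
      * while any size below 1 << (MAX_ORDER + PAGE_SHIFT), i.e. below 8M
      * assuming 4K pages and the default MAX_ORDER of 11, is skipped and
      * left to the buddy allocator.
      */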
 355
 356#else /* !PPC_FSL_BOOK3E */
 357
 358/* Build list of addresses of gigantic pages.  This function is used in early
  359 * boot before the buddy allocator is set up.
 360 */
 361void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
 362{
 363        if (!addr)
 364                return;
 365        while (number_of_pages > 0) {
 366                gpage_freearray[nr_gpages] = addr;
 367                nr_gpages++;
 368                number_of_pages--;
 369                addr += page_size;
 370        }
 371}
 372
 373/* Moves the gigantic page addresses from the temporary list to the
 374 * huge_boot_pages list.
 375 */
 376int alloc_bootmem_huge_page(struct hstate *hstate)
 377{
 378        struct huge_bootmem_page *m;
 379        if (nr_gpages == 0)
 380                return 0;
 381        m = phys_to_virt(gpage_freearray[--nr_gpages]);
 382        gpage_freearray[nr_gpages] = 0;
 383        list_add(&m->list, &huge_boot_pages);
 384        m->hstate = hstate;
 385        return 1;
 386}
 387#endif
 388
 389#ifdef CONFIG_PPC_FSL_BOOK3E
 390#define HUGEPD_FREELIST_SIZE \
 391        ((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t))
 392
 393struct hugepd_freelist {
 394        struct rcu_head rcu;
 395        unsigned int index;
 396        void *ptes[0];
 397};
 398
 399static DEFINE_PER_CPU(struct hugepd_freelist *, hugepd_freelist_cur);
 400
 401static void hugepd_free_rcu_callback(struct rcu_head *head)
 402{
 403        struct hugepd_freelist *batch =
 404                container_of(head, struct hugepd_freelist, rcu);
 405        unsigned int i;
 406
 407        for (i = 0; i < batch->index; i++)
 408                kmem_cache_free(hugepte_cache, batch->ptes[i]);
 409
 410        free_page((unsigned long)batch);
 411}
 412
 413static void hugepd_free(struct mmu_gather *tlb, void *hugepte)
 414{
 415        struct hugepd_freelist **batchp;
 416
 417        batchp = this_cpu_ptr(&hugepd_freelist_cur);
 418
 419        if (atomic_read(&tlb->mm->mm_users) < 2 ||
 420            cpumask_equal(mm_cpumask(tlb->mm),
 421                          cpumask_of(smp_processor_id()))) {
 422                kmem_cache_free(hugepte_cache, hugepte);
  423                put_cpu_var(hugepd_freelist_cur);
 424                return;
 425        }
 426
 427        if (*batchp == NULL) {
 428                *batchp = (struct hugepd_freelist *)__get_free_page(GFP_ATOMIC);
 429                (*batchp)->index = 0;
 430        }
 431
 432        (*batchp)->ptes[(*batchp)->index++] = hugepte;
 433        if ((*batchp)->index == HUGEPD_FREELIST_SIZE) {
 434                call_rcu_sched(&(*batchp)->rcu, hugepd_free_rcu_callback);
 435                *batchp = NULL;
 436        }
 437        put_cpu_var(hugepd_freelist_cur);
 438}
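     
     /*
      * The batching above exists because other CPUs may be walking these
      * tables locklessly (see __find_linux_pte_or_hugepte() later in this
      * file, which relies on interrupts being disabled): unless the mm is
      * known to be in use only by the current CPU, a hugepte table is
      * returned to the kmem cache only after an RCU-sched grace period.
      * Roughly five hundred table pointers fit in one batch page, assuming
      * 4K pages and 8-byte PTEs.
      */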
 439#endif
 440
 441static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift,
 442                              unsigned long start, unsigned long end,
 443                              unsigned long floor, unsigned long ceiling)
 444{
 445        pte_t *hugepte = hugepd_page(*hpdp);
 446        int i;
 447
 448        unsigned long pdmask = ~((1UL << pdshift) - 1);
 449        unsigned int num_hugepd = 1;
 450
 451#ifdef CONFIG_PPC_FSL_BOOK3E
 452        /* Note: On fsl the hpdp may be the first of several */
 453        num_hugepd = (1 << (hugepd_shift(*hpdp) - pdshift));
 454#else
 455        unsigned int shift = hugepd_shift(*hpdp);
 456#endif
 457
 458        start &= pdmask;
 459        if (start < floor)
 460                return;
 461        if (ceiling) {
 462                ceiling &= pdmask;
  463                if (!ceiling)
 464                        return;
 465        }
 466        if (end - 1 > ceiling - 1)
 467                return;
 468
 469        for (i = 0; i < num_hugepd; i++, hpdp++)
 470                hpdp->pd = 0;
 471
 472#ifdef CONFIG_PPC_FSL_BOOK3E
 473        hugepd_free(tlb, hugepte);
 474#else
 475        pgtable_free_tlb(tlb, hugepte, pdshift - shift);
 476#endif
 477}
 478
 479static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
 480                                   unsigned long addr, unsigned long end,
 481                                   unsigned long floor, unsigned long ceiling)
 482{
 483        pmd_t *pmd;
 484        unsigned long next;
 485        unsigned long start;
 486
 487        start = addr;
 488        do {
 489                pmd = pmd_offset(pud, addr);
 490                next = pmd_addr_end(addr, end);
 491                if (!is_hugepd(__hugepd(pmd_val(*pmd)))) {
 492                        /*
  493                         * If it is not a hugepd pointer, we should have
  494                         * already found it cleared.
 495                         */
 496                        WARN_ON(!pmd_none_or_clear_bad(pmd));
 497                        continue;
 498                }
 499#ifdef CONFIG_PPC_FSL_BOOK3E
 500                /*
 501                 * Increment next by the size of the huge mapping since
 502                 * there may be more than one entry at this level for a
 503                 * single hugepage, but all of them point to
 504                 * the same kmem cache that holds the hugepte.
 505                 */
 506                next = addr + (1 << hugepd_shift(*(hugepd_t *)pmd));
 507#endif
 508                free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT,
 509                                  addr, next, floor, ceiling);
 510        } while (addr = next, addr != end);
 511
 512        start &= PUD_MASK;
 513        if (start < floor)
 514                return;
 515        if (ceiling) {
 516                ceiling &= PUD_MASK;
 517                if (!ceiling)
 518                        return;
 519        }
 520        if (end - 1 > ceiling - 1)
 521                return;
 522
 523        pmd = pmd_offset(pud, start);
 524        pud_clear(pud);
 525        pmd_free_tlb(tlb, pmd, start);
 526        mm_dec_nr_pmds(tlb->mm);
 527}
 528
 529static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
 530                                   unsigned long addr, unsigned long end,
 531                                   unsigned long floor, unsigned long ceiling)
 532{
 533        pud_t *pud;
 534        unsigned long next;
 535        unsigned long start;
 536
 537        start = addr;
 538        do {
 539                pud = pud_offset(pgd, addr);
 540                next = pud_addr_end(addr, end);
 541                if (!is_hugepd(__hugepd(pud_val(*pud)))) {
 542                        if (pud_none_or_clear_bad(pud))
 543                                continue;
 544                        hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
 545                                               ceiling);
 546                } else {
 547#ifdef CONFIG_PPC_FSL_BOOK3E
 548                        /*
 549                         * Increment next by the size of the huge mapping since
 550                         * there may be more than one entry at this level for a
 551                         * single hugepage, but all of them point to
 552                         * the same kmem cache that holds the hugepte.
 553                         */
 554                        next = addr + (1 << hugepd_shift(*(hugepd_t *)pud));
 555#endif
 556                        free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT,
 557                                          addr, next, floor, ceiling);
 558                }
 559        } while (addr = next, addr != end);
 560
 561        start &= PGDIR_MASK;
 562        if (start < floor)
 563                return;
 564        if (ceiling) {
 565                ceiling &= PGDIR_MASK;
 566                if (!ceiling)
 567                        return;
 568        }
 569        if (end - 1 > ceiling - 1)
 570                return;
 571
 572        pud = pud_offset(pgd, start);
 573        pgd_clear(pgd);
 574        pud_free_tlb(tlb, pud, start);
 575}
 576
 577/*
 578 * This function frees user-level page tables of a process.
 579 */
 580void hugetlb_free_pgd_range(struct mmu_gather *tlb,
 581                            unsigned long addr, unsigned long end,
 582                            unsigned long floor, unsigned long ceiling)
 583{
 584        pgd_t *pgd;
 585        unsigned long next;
 586
 587        /*
 588         * Because there are a number of different possible pagetable
 589         * layouts for hugepage ranges, we limit knowledge of how
 590         * things should be laid out to the allocation path
 591         * (huge_pte_alloc(), above).  Everything else works out the
 592         * structure as it goes from information in the hugepd
  593         * pointers.  That means that we can't use the optimization
  594         * from the normal page free_pgd_range() of checking whether
  595         * we're actually covering a large enough range to have to do
  596         * anything at the top level of the walk instead of at the
  597         * bottom.
 598         *
 599         * To make sense of this, you should probably go read the big
 600         * block comment at the top of the normal free_pgd_range(),
 601         * too.
 602         */
 603
 604        do {
 605                next = pgd_addr_end(addr, end);
 606                pgd = pgd_offset(tlb->mm, addr);
 607                if (!is_hugepd(__hugepd(pgd_val(*pgd)))) {
 608                        if (pgd_none_or_clear_bad(pgd))
 609                                continue;
 610                        hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
 611                } else {
 612#ifdef CONFIG_PPC_FSL_BOOK3E
 613                        /*
 614                         * Increment next by the size of the huge mapping since
 615                         * there may be more than one entry at the pgd level
 616                         * for a single hugepage, but all of them point to the
 617                         * same kmem cache that holds the hugepte.
 618                         */
 619                        next = addr + (1 << hugepd_shift(*(hugepd_t *)pgd));
 620#endif
 621                        free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
 622                                          addr, next, floor, ceiling);
 623                }
 624        } while (addr = next, addr != end);
 625}
 626
 627/*
 628 * We are holding mmap_sem, so a parallel huge page collapse cannot run.
  629 * To prevent a hugepage split, the walk is done with interrupts disabled.
 630 */
 631struct page *
 632follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
 633{
 634        bool is_thp;
 635        pte_t *ptep, pte;
 636        unsigned shift;
 637        unsigned long mask, flags;
 638        struct page *page = ERR_PTR(-EINVAL);
 639
 640        local_irq_save(flags);
 641        ptep = find_linux_pte_or_hugepte(mm->pgd, address, &is_thp, &shift);
 642        if (!ptep)
 643                goto no_page;
 644        pte = READ_ONCE(*ptep);
 645        /*
  646         * Verify it is a huge page, else bail.
 647         * Transparent hugepages are handled by generic code. We can skip them
 648         * here.
 649         */
 650        if (!shift || is_thp)
 651                goto no_page;
 652
 653        if (!pte_present(pte)) {
 654                page = NULL;
 655                goto no_page;
 656        }
 657        mask = (1UL << shift) - 1;
 658        page = pte_page(pte);
 659        if (page)
 660                page += (address & mask) / PAGE_SIZE;
 661
 662no_page:
 663        local_irq_restore(flags);
 664        return page;
 665}
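     
     /*
      * Example of the offset arithmetic above, with made-up numbers: for a
      * 16M page (shift == 24) and an access 0x345678 bytes into the huge
      * mapping,
      *
      *   mask = (1UL << 24) - 1 = 0xffffff
      *   page = pte_page(pte) + (0x345678 / PAGE_SIZE)
      *
      * which is the head page plus 0x345 with 4K base pages.
      */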
 666
 667struct page *
 668follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 669                pmd_t *pmd, int write)
 670{
 671        BUG();
 672        return NULL;
 673}
 674
 675struct page *
 676follow_huge_pud(struct mm_struct *mm, unsigned long address,
 677                pud_t *pud, int write)
 678{
 679        BUG();
 680        return NULL;
 681}
 682
 683static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
 684                                      unsigned long sz)
 685{
 686        unsigned long __boundary = (addr + sz) & ~(sz-1);
 687        return (__boundary - 1 < end - 1) ? __boundary : end;
 688}
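     
     /*
      * For example (made-up values), with sz = 16M, addr = 0x11000000 and
      * end = 0x13000000 this returns the next 16M boundary, 0x12000000;
      * with end = 0x11800000 it returns end itself.  gup_huge_pd() below
      * uses this to step one hugepte at a time without overshooting end.
      */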
 689
 690int gup_huge_pd(hugepd_t hugepd, unsigned long addr, unsigned pdshift,
 691                unsigned long end, int write, struct page **pages, int *nr)
 692{
 693        pte_t *ptep;
 694        unsigned long sz = 1UL << hugepd_shift(hugepd);
 695        unsigned long next;
 696
 697        ptep = hugepte_offset(hugepd, addr, pdshift);
 698        do {
 699                next = hugepte_addr_end(addr, end, sz);
 700                if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr))
 701                        return 0;
 702        } while (ptep++, addr = next, addr != end);
 703
 704        return 1;
 705}
 706
 707#ifdef CONFIG_PPC_MM_SLICES
 708unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 709                                        unsigned long len, unsigned long pgoff,
 710                                        unsigned long flags)
 711{
 712        struct hstate *hstate = hstate_file(file);
 713        int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));
 714
 715        return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1);
 716}
 717#endif
 718
 719unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
 720{
 721#ifdef CONFIG_PPC_MM_SLICES
 722        unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start);
 723
 724        return 1UL << mmu_psize_to_shift(psize);
 725#else
 726        if (!is_vm_hugetlb_page(vma))
 727                return PAGE_SIZE;
 728
 729        return huge_page_size(hstate_vma(vma));
 730#endif
 731}
 732
 733static inline bool is_power_of_4(unsigned long x)
 734{
 735        if (is_power_of_2(x))
 736                return (__ilog2(x) % 2) ? false : true;
 737        return false;
 738}
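     
     /*
      * For example, 4M (1 << 22, even exponent) is a power of 4, while
      * 8M (1 << 23, odd exponent) is not and is rejected by
      * add_huge_page_size() below on FSL.
      */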
 739
 740static int __init add_huge_page_size(unsigned long long size)
 741{
 742        int shift = __ffs(size);
 743        int mmu_psize;
 744
 745        /* Check that it is a page size supported by the hardware and
 746         * that it fits within pagetable and slice limits. */
 747#ifdef CONFIG_PPC_FSL_BOOK3E
 748        if ((size < PAGE_SIZE) || !is_power_of_4(size))
 749                return -EINVAL;
 750#else
 751        if (!is_power_of_2(size)
 752            || (shift > SLICE_HIGH_SHIFT) || (shift <= PAGE_SHIFT))
 753                return -EINVAL;
 754#endif
 755
 756        if ((mmu_psize = shift_to_mmu_psize(shift)) < 0)
 757                return -EINVAL;
 758
 759        BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);
 760
 761        /* Return if huge page size has already been setup */
 762        if (size_to_hstate(size))
 763                return 0;
 764
 765        hugetlb_add_hstate(shift - PAGE_SHIFT);
 766
 767        return 0;
 768}
 769
 770static int __init hugepage_setup_sz(char *str)
 771{
 772        unsigned long long size;
 773
 774        size = memparse(str, &str);
 775
 776        if (add_huge_page_size(size) != 0)
  777                printk(KERN_WARNING "Invalid huge page size specified (%llu)\n", size);
 778
 779        return 1;
 780}
 781__setup("hugepagesz=", hugepage_setup_sz);
 782
 783#ifdef CONFIG_PPC_FSL_BOOK3E
 784struct kmem_cache *hugepte_cache;
 785static int __init hugetlbpage_init(void)
 786{
 787        int psize;
 788
 789        for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
 790                unsigned shift;
 791
 792                if (!mmu_psize_defs[psize].shift)
 793                        continue;
 794
 795                shift = mmu_psize_to_shift(psize);
 796
 797                /* Don't treat normal page sizes as huge... */
 798                if (shift != PAGE_SHIFT)
 799                        if (add_huge_page_size(1ULL << shift) < 0)
 800                                continue;
 801        }
 802
 803        /*
 804         * Create a kmem cache for hugeptes.  The bottom bits in the pte have
  805         * size information encoded in them, so align them to allow this.
 806         */
  807        hugepte_cache = kmem_cache_create("hugepte-cache", sizeof(pte_t),
 808                                           HUGEPD_SHIFT_MASK + 1, 0, NULL);
 809        if (hugepte_cache == NULL)
 810                panic("%s: Unable to create kmem cache for hugeptes\n",
 811                      __func__);
 812
 813        /* Default hpage size = 4M */
 814        if (mmu_psize_defs[MMU_PAGE_4M].shift)
 815                HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_4M].shift;
 816        else
 817                panic("%s: Unable to set default huge page size\n", __func__);
 818
 819
 820        return 0;
 821}
 822#else
 823static int __init hugetlbpage_init(void)
 824{
 825        int psize;
 826
 827        if (!mmu_has_feature(MMU_FTR_16M_PAGE))
 828                return -ENODEV;
 829
 830        for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
 831                unsigned shift;
 832                unsigned pdshift;
 833
 834                if (!mmu_psize_defs[psize].shift)
 835                        continue;
 836
 837                shift = mmu_psize_to_shift(psize);
 838
 839                if (add_huge_page_size(1ULL << shift) < 0)
 840                        continue;
 841
 842                if (shift < PMD_SHIFT)
 843                        pdshift = PMD_SHIFT;
 844                else if (shift < PUD_SHIFT)
 845                        pdshift = PUD_SHIFT;
 846                else
 847                        pdshift = PGDIR_SHIFT;
 848                /*
  849                 * If pdshift and shift are the same, we don't use
  850                 * the pgt cache for the hugepd.
 851                 */
 852                if (pdshift != shift) {
 853                        pgtable_cache_add(pdshift - shift, NULL);
 854                        if (!PGT_CACHE(pdshift - shift))
 855                                panic("hugetlbpage_init(): could not create "
 856                                      "pgtable cache for %d bit pagesize\n", shift);
 857                }
 858        }
 859
 860        /* Set default large page size. Currently, we pick 16M or 1M
  861         * depending on what is available.
 862         */
 863        if (mmu_psize_defs[MMU_PAGE_16M].shift)
 864                HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift;
 865        else if (mmu_psize_defs[MMU_PAGE_1M].shift)
 866                HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift;
 867
 868        return 0;
 869}
 870#endif
 871arch_initcall(hugetlbpage_init);
 872
 873void flush_dcache_icache_hugepage(struct page *page)
 874{
 875        int i;
 876        void *start;
 877
 878        BUG_ON(!PageCompound(page));
 879
 880        for (i = 0; i < (1UL << compound_order(page)); i++) {
 881                if (!PageHighMem(page)) {
 882                        __flush_dcache_icache(page_address(page+i));
 883                } else {
 884                        start = kmap_atomic(page+i);
 885                        __flush_dcache_icache(start);
 886                        kunmap_atomic(start);
 887                }
 888        }
 889}
 890
 891#endif /* CONFIG_HUGETLB_PAGE */
 892
 893/*
 894 * We have 4 cases for pgds and pmds:
 895 * (1) invalid (all zeroes)
 896 * (2) pointer to next table, as normal; bottom 6 bits == 0
  897 * (3) leaf pte for huge page, _PAGE_PTE set
 898 * (4) hugepd pointer, _PAGE_PTE = 0 and bits [2..6] indicate size of table
 899 *
  900 * So long as we atomically load page table pointers we are safe against teardown,
  901 * and we can follow the address down to the page and take a ref on it.
  902 * This function needs to be called with interrupts disabled. We use this variant
  903 * when we have MSR[EE] = 0 but paca->soft_enabled = 1.
 904 */
 905
 906pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
 907                                   bool *is_thp, unsigned *shift)
 908{
 909        pgd_t pgd, *pgdp;
 910        pud_t pud, *pudp;
 911        pmd_t pmd, *pmdp;
 912        pte_t *ret_pte;
 913        hugepd_t *hpdp = NULL;
 914        unsigned pdshift = PGDIR_SHIFT;
 915
 916        if (shift)
 917                *shift = 0;
 918
 919        if (is_thp)
 920                *is_thp = false;
 921
 922        pgdp = pgdir + pgd_index(ea);
 923        pgd  = READ_ONCE(*pgdp);
 924        /*
  925         * Always operate on the local stack value. This makes sure the
  926         * value doesn't get updated by a parallel THP split/collapse,
  927         * page fault or page unmap. The returned pte_t * is still not
  928         * stable, so the caller must recheck it for the above conditions.
 929         */
 930        if (pgd_none(pgd))
 931                return NULL;
 932        else if (pgd_huge(pgd)) {
 933                ret_pte = (pte_t *) pgdp;
 934                goto out;
 935        } else if (is_hugepd(__hugepd(pgd_val(pgd))))
 936                hpdp = (hugepd_t *)&pgd;
 937        else {
 938                /*
  939                 * Even if we end up with an unmap, the pgtable will not
  940                 * be freed, because we do an RCU free and here we have
  941                 * interrupts disabled.
 942                 */
 943                pdshift = PUD_SHIFT;
 944                pudp = pud_offset(&pgd, ea);
 945                pud  = READ_ONCE(*pudp);
 946
 947                if (pud_none(pud))
 948                        return NULL;
 949                else if (pud_huge(pud)) {
 950                        ret_pte = (pte_t *) pudp;
 951                        goto out;
 952                } else if (is_hugepd(__hugepd(pud_val(pud))))
 953                        hpdp = (hugepd_t *)&pud;
 954                else {
 955                        pdshift = PMD_SHIFT;
 956                        pmdp = pmd_offset(&pud, ea);
 957                        pmd  = READ_ONCE(*pmdp);
 958                        /*
 959                         * A hugepage collapse is captured by pmd_none, because
  960                         * it marks the pmd none and does a hpte invalidate.
 961                         */
 962                        if (pmd_none(pmd))
 963                                return NULL;
 964
 965                        if (pmd_trans_huge(pmd)) {
 966                                if (is_thp)
 967                                        *is_thp = true;
 968                                ret_pte = (pte_t *) pmdp;
 969                                goto out;
 970                        }
 971
 972                        if (pmd_huge(pmd)) {
 973                                ret_pte = (pte_t *) pmdp;
 974                                goto out;
 975                        } else if (is_hugepd(__hugepd(pmd_val(pmd))))
 976                                hpdp = (hugepd_t *)&pmd;
 977                        else
 978                                return pte_offset_kernel(&pmd, ea);
 979                }
 980        }
 981        if (!hpdp)
 982                return NULL;
 983
 984        ret_pte = hugepte_offset(*hpdp, ea, pdshift);
 985        pdshift = hugepd_shift(*hpdp);
 986out:
 987        if (shift)
 988                *shift = pdshift;
 989        return ret_pte;
 990}
 991EXPORT_SYMBOL_GPL(__find_linux_pte_or_hugepte);
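     
     /*
      * Sketch (not built) of the calling convention for the walk above; it
      * mirrors follow_huge_addr() earlier in this file.  Interrupts must
      * stay disabled for as long as the returned pointer is in use.
      */
     #if 0
     static void example_lockless_walk(struct mm_struct *mm, unsigned long ea)
     {
             unsigned long flags;
             unsigned shift;
             bool is_thp;
             pte_t *ptep;
     
             local_irq_save(flags);
             ptep = __find_linux_pte_or_hugepte(mm->pgd, ea, &is_thp, &shift);
             if (ptep && !is_thp && shift) {
                     /* huge mapping covering 1UL << shift bytes at *ptep */
             }
             local_irq_restore(flags);
     }
     #endif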
 992
 993int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
 994                unsigned long end, int write, struct page **pages, int *nr)
 995{
 996        unsigned long mask;
 997        unsigned long pte_end;
 998        struct page *head, *page;
 999        pte_t pte;
1000        int refs;
1001
1002        pte_end = (addr + sz) & ~(sz-1);
1003        if (pte_end < end)
1004                end = pte_end;
1005
1006        pte = READ_ONCE(*ptep);
1007        mask = _PAGE_PRESENT | _PAGE_USER;
1008        if (write)
1009                mask |= _PAGE_RW;
1010
1011        if ((pte_val(pte) & mask) != mask)
1012                return 0;
1013
1014        /* hugepages are never "special" */
1015        VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
1016
1017        refs = 0;
1018        head = pte_page(pte);
1019
1020        page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
1021        do {
1022                VM_BUG_ON(compound_head(page) != head);
1023                pages[*nr] = page;
1024                (*nr)++;
1025                page++;
1026                refs++;
1027        } while (addr += PAGE_SIZE, addr != end);
1028
1029        if (!page_cache_add_speculative(head, refs)) {
1030                *nr -= refs;
1031                return 0;
1032        }
1033
1034        if (unlikely(pte_val(pte) != pte_val(*ptep))) {
1035                /* Could be optimized better */
1036                *nr -= refs;
1037                while (refs--)
1038                        put_page(head);
1039                return 0;
1040        }
1041
1042        return 1;
1043}
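     
     /*
      * Note on the pattern above: the references are taken speculatively
      * (page_cache_add_speculative() fails only if the refcount has already
      * dropped to zero) and the pte is then re-read.  If it changed while
      * the references were being taken, the page may already have been
      * freed and reused, so the references are dropped again and the caller
      * falls back to the slow gup path.
      */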
1044