linux/arch/powerpc/mm/hugetlbpage.c
/*
 * PPC64 (POWER4) Huge TLB Page Support for Kernel.
 *
 * Copyright (C) 2003 David Gibson, IBM Corporation.
 *
 * Based on the IA-32 version:
 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
 */

#include <linux/mm.h>
#include <linux/io.h>
#include <linux/slab.h>
#include <linux/hugetlb.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>

#define PAGE_SHIFT_64K  16
#define PAGE_SHIFT_16M  24
#define PAGE_SHIFT_16G  34

#define MAX_NUMBER_GPAGES       1024

/* Tracks the 16G pages after the device tree is scanned and before the
 * huge_boot_pages list is ready.  */
static unsigned long gpage_freearray[MAX_NUMBER_GPAGES];
static unsigned nr_gpages;

/* Huge PD pointers are marked by clearing the topmost address bit when
 * the entry is stored (see __hugepte_alloc() below).  This means
 * pmd_bad() and pud_bad() will choke on pointers to hugepte tables,
 * which is handy for catching screwups early. */

static inline int shift_to_mmu_psize(unsigned int shift)
{
        int psize;

        for (psize = 0; psize < MMU_PAGE_COUNT; ++psize)
                if (mmu_psize_defs[psize].shift == shift)
                        return psize;
        return -1;
}

static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize)
{
        if (mmu_psize_defs[mmu_psize].shift)
                return mmu_psize_defs[mmu_psize].shift;
        BUG();
}

#define hugepd_none(hpd)        ((hpd).pd == 0)

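/* A hugepd entry packs the huge page shift into its low bits
 * (HUGEPD_SHIFT_MASK) alongside the hugepte table's address with the
 * kernel segment bits stripped; hugepd_page() puts them back before
 * the pointer is dereferenced, and hugepd_shift() recovers the shift. */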
static inline pte_t *hugepd_page(hugepd_t hpd)
{
        BUG_ON(!hugepd_ok(hpd));
        return (pte_t *)((hpd.pd & ~HUGEPD_SHIFT_MASK) | 0xc000000000000000);
}

static inline unsigned int hugepd_shift(hugepd_t hpd)
{
        return hpd.pd & HUGEPD_SHIFT_MASK;
}

static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr, unsigned pdshift)
{
        unsigned long idx = (addr & ((1UL << pdshift) - 1)) >> hugepd_shift(*hpdp);
        pte_t *dir = hugepd_page(*hpdp);

        return dir + idx;
}

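/* Walk the page tables for @ea starting at @pgdir.  A hugepd pointer
 * may be installed at the pgd, pud or pmd level; when one is found,
 * report the huge page shift through @shift and return the huge PTE.
 * For an ordinary mapping the walk descends to the PTE level and
 * *shift is left at zero. */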
pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift)
{
        pgd_t *pg;
        pud_t *pu;
        pmd_t *pm;
        hugepd_t *hpdp = NULL;
        unsigned pdshift = PGDIR_SHIFT;

        if (shift)
                *shift = 0;

        pg = pgdir + pgd_index(ea);
        if (is_hugepd(pg)) {
                hpdp = (hugepd_t *)pg;
        } else if (!pgd_none(*pg)) {
                pdshift = PUD_SHIFT;
                pu = pud_offset(pg, ea);
                if (is_hugepd(pu))
                        hpdp = (hugepd_t *)pu;
                else if (!pud_none(*pu)) {
                        pdshift = PMD_SHIFT;
                        pm = pmd_offset(pu, ea);
                        if (is_hugepd(pm))
                                hpdp = (hugepd_t *)pm;
                        else if (!pmd_none(*pm)) {
                                return pte_offset_map(pm, ea);
                        }
                }
        }

        if (!hpdp)
                return NULL;

        if (shift)
                *shift = hugepd_shift(*hpdp);
        return hugepte_offset(hpdp, ea, pdshift);
}

pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
{
        return find_linux_pte_or_hugepte(mm->pgd, addr, NULL);
}

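/* Allocate a hugepte table covering one page-directory slot, i.e.
 * 2^(pdshift - pshift) huge PTEs, and install it at @hpdp.  If another
 * thread installed a table first, free ours and use theirs;
 * mm->page_table_lock serializes the check-and-install. */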
static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
                           unsigned long address, unsigned pdshift, unsigned pshift)
{
        pte_t *new = kmem_cache_zalloc(PGT_CACHE(pdshift - pshift),
                                       GFP_KERNEL|__GFP_REPEAT);

        BUG_ON(pshift > HUGEPD_SHIFT_MASK);
        BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);

        if (!new)
                return -ENOMEM;

        spin_lock(&mm->page_table_lock);
        if (!hugepd_none(*hpdp))
                kmem_cache_free(PGT_CACHE(pdshift - pshift), new);
        else
                hpdp->pd = ((unsigned long)new & ~0x8000000000000000) | pshift;
        spin_unlock(&mm->page_table_lock);
        return 0;
}

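/* Find, or create, the hugepd entry covering @addr for a huge page of
 * size @sz.  The installation level depends on the page size: pages of
 * at least PUD size hang directly off the pgd, pages of at least PMD
 * size hang off a pud, and anything smaller hangs off a pmd. */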
pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
{
        pgd_t *pg;
        pud_t *pu;
        pmd_t *pm;
        hugepd_t *hpdp = NULL;
        unsigned pshift = __ffs(sz);
        unsigned pdshift = PGDIR_SHIFT;

        addr &= ~(sz-1);

        pg = pgd_offset(mm, addr);
        if (pshift >= PUD_SHIFT) {
                hpdp = (hugepd_t *)pg;
        } else {
                pdshift = PUD_SHIFT;
                pu = pud_alloc(mm, pg, addr);
                if (pshift >= PMD_SHIFT) {
                        hpdp = (hugepd_t *)pu;
                } else {
                        pdshift = PMD_SHIFT;
                        pm = pmd_alloc(mm, pu, addr);
                        hpdp = (hugepd_t *)pm;
                }
        }

        if (!hpdp)
                return NULL;

        BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));

        if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift))
                return NULL;

        return hugepte_offset(hpdp, addr, pdshift);
}

/* Build list of addresses of gigantic pages.  This function is used in early
 * boot before the buddy or bootmem allocator is set up.
 */
void add_gpage(unsigned long addr, unsigned long page_size,
        unsigned long number_of_pages)
{
        if (!addr)
                return;
        while (number_of_pages > 0) {
                gpage_freearray[nr_gpages] = addr;
                nr_gpages++;
                number_of_pages--;
                addr += page_size;
        }
}

/* Moves the gigantic page addresses from the temporary list to the
 * huge_boot_pages list.
 */
int alloc_bootmem_huge_page(struct hstate *hstate)
{
        struct huge_bootmem_page *m;
        if (nr_gpages == 0)
                return 0;
        m = phys_to_virt(gpage_freearray[--nr_gpages]);
        gpage_freearray[nr_gpages] = 0;
        list_add(&m->list, &huge_boot_pages);
        m->hstate = hstate;
        return 1;
}

int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
{
        return 0;
}

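/* Free the hugepte table at @hpdp, but only if the range being torn
 * down spans the whole page-directory slot within the floor/ceiling
 * limits; otherwise the table is left in place, since addresses
 * outside [start, end) may still be mapped through it. */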
static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift,
                              unsigned long start, unsigned long end,
                              unsigned long floor, unsigned long ceiling)
{
        pte_t *hugepte = hugepd_page(*hpdp);
        unsigned shift = hugepd_shift(*hpdp);
        unsigned long pdmask = ~((1UL << pdshift) - 1);

        start &= pdmask;
        if (start < floor)
                return;
        if (ceiling) {
                ceiling &= pdmask;
                if (!ceiling)
                        return;
        }
        if (end - 1 > ceiling - 1)
                return;

        hpdp->pd = 0;
        tlb->need_flush = 1;
        pgtable_free_tlb(tlb, hugepte, pdshift - shift);
}

static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
                                   unsigned long addr, unsigned long end,
                                   unsigned long floor, unsigned long ceiling)
{
        pmd_t *pmd;
        unsigned long next;
        unsigned long start;

        start = addr;
        pmd = pmd_offset(pud, addr);
        do {
                next = pmd_addr_end(addr, end);
                if (pmd_none(*pmd))
                        continue;
                free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT,
                                  addr, next, floor, ceiling);
        } while (pmd++, addr = next, addr != end);

        start &= PUD_MASK;
        if (start < floor)
                return;
        if (ceiling) {
                ceiling &= PUD_MASK;
                if (!ceiling)
                        return;
        }
        if (end - 1 > ceiling - 1)
                return;

        pmd = pmd_offset(pud, start);
        pud_clear(pud);
        pmd_free_tlb(tlb, pmd, start);
}

static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
                                   unsigned long addr, unsigned long end,
                                   unsigned long floor, unsigned long ceiling)
{
        pud_t *pud;
        unsigned long next;
        unsigned long start;

        start = addr;
        pud = pud_offset(pgd, addr);
        do {
                next = pud_addr_end(addr, end);
                if (!is_hugepd(pud)) {
                        if (pud_none_or_clear_bad(pud))
                                continue;
                        hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
                                               ceiling);
                } else {
                        free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT,
                                          addr, next, floor, ceiling);
                }
        } while (pud++, addr = next, addr != end);

        start &= PGDIR_MASK;
        if (start < floor)
                return;
        if (ceiling) {
                ceiling &= PGDIR_MASK;
                if (!ceiling)
                        return;
        }
        if (end - 1 > ceiling - 1)
                return;

        pud = pud_offset(pgd, start);
        pgd_clear(pgd);
        pud_free_tlb(tlb, pud, start);
}

/*
 * This function frees user-level page tables of a process.
 *
 * Must be called with pagetable lock held.
 */
void hugetlb_free_pgd_range(struct mmu_gather *tlb,
                            unsigned long addr, unsigned long end,
                            unsigned long floor, unsigned long ceiling)
{
        pgd_t *pgd;
        unsigned long next;

        /*
         * Because there are a number of different possible pagetable
         * layouts for hugepage ranges, we limit knowledge of how
         * things should be laid out to the allocation path
         * (huge_pte_alloc(), above).  Everything else works out the
         * structure as it goes from information in the hugepd
         * pointers.  That means we can't use the optimization from
         * the normal free_pgd_range() of checking at the top level
         * of the walk whether we're covering a large enough range to
         * have to do anything, rather than finding out at the bottom.
         *
         * To make sense of this, you should probably go read the big
         * block comment at the top of the normal free_pgd_range(),
         * too.
         */

        pgd = pgd_offset(tlb->mm, addr);
        do {
                next = pgd_addr_end(addr, end);
                if (!is_hugepd(pgd)) {
                        if (pgd_none_or_clear_bad(pgd))
                                continue;
                        hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
                } else {
                        free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
                                          addr, next, floor, ceiling);
                }
        } while (pgd++, addr = next, addr != end);
}

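/* Translate a user address within a hugepage mapping to its struct
 * page.  The result is adjusted to the base-page-sized subpage
 * containing @address, since callers expect PAGE_SIZE granularity. */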
struct page *
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
{
        pte_t *ptep;
        struct page *page;
        unsigned shift;
        unsigned long mask;

        ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift);

        /* Verify it is a huge page else bail. */
        if (!ptep || !shift)
                return ERR_PTR(-EINVAL);

        mask = (1UL << shift) - 1;
        page = pte_page(*ptep);
        if (page)
                page += (address & mask) / PAGE_SIZE;

        return page;
}

int pmd_huge(pmd_t pmd)
{
        return 0;
}

int pud_huge(pud_t pud)
{
        return 0;
}

struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
                pmd_t *pmd, int write)
{
        BUG();
        return NULL;
}

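/* Lockless get_user_pages() fast path for one huge PTE: snapshot the
 * PTE, check _PAGE_PRESENT/_PAGE_USER (and _PAGE_RW for writes), take
 * speculative references on the compound head, then re-read the PTE;
 * if it changed under us, drop the references and bail out so the
 * caller can fall back to the slow path. */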
static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
                       unsigned long end, int write, struct page **pages, int *nr)
{
        unsigned long mask;
        unsigned long pte_end;
        struct page *head, *page;
        pte_t pte;
        int refs;

        pte_end = (addr + sz) & ~(sz-1);
        if (pte_end < end)
                end = pte_end;

        pte = *ptep;
        mask = _PAGE_PRESENT | _PAGE_USER;
        if (write)
                mask |= _PAGE_RW;

        if ((pte_val(pte) & mask) != mask)
                return 0;

        /* hugepages are never "special" */
        VM_BUG_ON(!pfn_valid(pte_pfn(pte)));

        refs = 0;
        head = pte_page(pte);

        page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
        do {
                VM_BUG_ON(compound_head(page) != head);
                pages[*nr] = page;
                (*nr)++;
                page++;
                refs++;
        } while (addr += PAGE_SIZE, addr != end);

        if (!page_cache_add_speculative(head, refs)) {
                *nr -= refs;
                return 0;
        }

        if (unlikely(pte_val(pte) != pte_val(*ptep))) {
                /* The PTE changed under us: drop the speculative
                 * references taken on the head page and let the
                 * caller fall back to the slow path. */
                *nr -= refs;
                while (refs--)
                        put_page(head);
                return 0;
        }

        return 1;
}

static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
                                      unsigned long sz)
{
        unsigned long __boundary = (addr + sz) & ~(sz-1);
        return (__boundary - 1 < end - 1) ? __boundary : end;
}

int gup_hugepd(hugepd_t *hugepd, unsigned pdshift,
               unsigned long addr, unsigned long end,
               int write, struct page **pages, int *nr)
{
        pte_t *ptep;
        unsigned long sz = 1UL << hugepd_shift(*hugepd);
        unsigned long next;

        ptep = hugepte_offset(hugepd, addr, pdshift);
        do {
                next = hugepte_addr_end(addr, end, sz);
                if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr))
                        return 0;
        } while (ptep++, addr = next, addr != end);

        return 1;
}

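/* Hugepage mappings are placed by the slice code, which tracks the MMU
 * page size in use by each address-space slice; all we do here is
 * translate the hstate's page shift into the matching MMU psize. */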
unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
                                        unsigned long len, unsigned long pgoff,
                                        unsigned long flags)
{
        struct hstate *hstate = hstate_file(file);
        int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));

        return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1, 0);
}

unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
{
        unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start);

        return 1UL << mmu_psize_to_shift(psize);
}

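/* Validate and register a huge page size: it must be a power of two,
 * lie between the base page size and the slice limit, and have a
 * matching hardware entry in mmu_psize_defs[]. */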
static int __init add_huge_page_size(unsigned long long size)
{
        int shift = __ffs(size);
        int mmu_psize;

        /* Check that it is a page size supported by the hardware and
         * that it fits within pagetable and slice limits. */
        if (!is_power_of_2(size)
            || (shift > SLICE_HIGH_SHIFT) || (shift <= PAGE_SHIFT))
                return -EINVAL;

        if ((mmu_psize = shift_to_mmu_psize(shift)) < 0)
                return -EINVAL;

#ifdef CONFIG_SPU_FS_64K_LS
        /* Disable support for 64K huge pages when 64K SPU local store
         * support is enabled as the current implementation conflicts.
         */
        if (shift == PAGE_SHIFT_64K)
                return -EINVAL;
#endif /* CONFIG_SPU_FS_64K_LS */

        BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);

        /* Return if huge page size has already been set up */
        if (size_to_hstate(size))
                return 0;

        hugetlb_add_hstate(shift - PAGE_SHIFT);

        return 0;
}

static int __init hugepage_setup_sz(char *str)
{
        unsigned long long size;

        size = memparse(str, &str);

        if (add_huge_page_size(size) != 0)
                printk(KERN_WARNING "Invalid huge page size specified (%llu)\n", size);

        return 1;
}
__setup("hugepagesz=", hugepage_setup_sz);

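/* Boot-time setup: register an hstate and a pgtable cache (to hold the
 * hugepte tables) for each hardware-supported page size, then pick the
 * default huge page size (HPAGE_SHIFT). */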
static int __init hugetlbpage_init(void)
{
        int psize;

        if (!cpu_has_feature(CPU_FTR_16M_PAGE))
                return -ENODEV;

        for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
                unsigned shift;
                unsigned pdshift;

                if (!mmu_psize_defs[psize].shift)
                        continue;

                shift = mmu_psize_to_shift(psize);

                if (add_huge_page_size(1ULL << shift) < 0)
                        continue;

                if (shift < PMD_SHIFT)
                        pdshift = PMD_SHIFT;
                else if (shift < PUD_SHIFT)
                        pdshift = PUD_SHIFT;
                else
                        pdshift = PGDIR_SHIFT;

                pgtable_cache_add(pdshift - shift, NULL);
                if (!PGT_CACHE(pdshift - shift))
                        panic("hugetlbpage_init(): could not create "
                              "pgtable cache for %d bit pagesize\n", shift);
        }

        /* Set default large page size.  Currently, we pick 16M or 1M
         * depending on what is available.
         */
        if (mmu_psize_defs[MMU_PAGE_16M].shift)
                HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift;
        else if (mmu_psize_defs[MMU_PAGE_1M].shift)
                HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift;

        return 0;
}

module_init(hugetlbpage_init);

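/* __flush_dcache_icache() operates on a single base page, so flush a
 * hugepage one subpage at a time. */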
void flush_dcache_icache_hugepage(struct page *page)
{
        int i;

        BUG_ON(!PageCompound(page));

        for (i = 0; i < (1UL << compound_order(page)); i++)
                __flush_dcache_icache(page_address(page+i));
}