linux/arch/powerpc/mm/hugetlbpage.c
/*
 * PPC64 (POWER4) Huge TLB Page Support for Kernel.
 *
 * Copyright (C) 2003 David Gibson, IBM Corporation.
 *
 * Based on the IA-32 version:
 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
 */

#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/err.h>
#include <linux/sysctl.h>
#include <asm/mman.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/machdep.h>
#include <asm/cputable.h>
#include <asm/spu.h>

#define NUM_LOW_AREAS   (0x100000000UL >> SID_SHIFT)
#define NUM_HIGH_AREAS  (PGTABLE_RANGE >> HTLB_AREA_SHIFT)

#ifdef CONFIG_PPC_64K_PAGES
#define HUGEPTE_INDEX_SIZE      (PMD_SHIFT-HPAGE_SHIFT)
#else
#define HUGEPTE_INDEX_SIZE      (PUD_SHIFT-HPAGE_SHIFT)
#endif
#define PTRS_PER_HUGEPTE        (1 << HUGEPTE_INDEX_SIZE)
#define HUGEPTE_TABLE_SIZE      (sizeof(pte_t) << HUGEPTE_INDEX_SIZE)

#define HUGEPD_SHIFT            (HPAGE_SHIFT + HUGEPTE_INDEX_SIZE)
#define HUGEPD_SIZE             (1UL << HUGEPD_SHIFT)
#define HUGEPD_MASK             (~(HUGEPD_SIZE-1))

#define huge_pgtable_cache      (pgtable_cache[HUGEPTE_CACHE_NUM])

/* Flag to mark huge PD pointers.  This means pmd_bad() and pud_bad()
 * will choke on pointers to hugepte tables, which is handy for
 * catching screwups early. */
#define HUGEPD_OK       0x1

typedef struct { unsigned long pd; } hugepd_t;

#define hugepd_none(hpd)        ((hpd).pd == 0)

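/* A hugepd entry is simply the kernel address of a hugepte table with
 * the low HUGEPD_OK bit set; e.g. (illustrative value only) a pd of
 * 0xc000000012345001 refers to the hugepte table at 0xc000000012345000.
 * hugepd_page() below recovers the table pointer by masking the flag
 * back off. */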
static inline pte_t *hugepd_page(hugepd_t hpd)
{
        BUG_ON(!(hpd.pd & HUGEPD_OK));
        return (pte_t *)(hpd.pd & ~HUGEPD_OK);
}

static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr)
{
        unsigned long idx = ((addr >> HPAGE_SHIFT) & (PTRS_PER_HUGEPTE-1));
        pte_t *dir = hugepd_page(*hpdp);

        return dir + idx;
}

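/* Allocate and install a fresh hugepte table.  The allocation is done
 * without the page table lock; we then take mm->page_table_lock and
 * either install the new table or, if another thread beat us to it,
 * free our copy and use theirs. */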
static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
                           unsigned long address)
{
        pte_t *new = kmem_cache_alloc(huge_pgtable_cache,
                                      GFP_KERNEL|__GFP_REPEAT);

        if (!new)
                return -ENOMEM;

        spin_lock(&mm->page_table_lock);
        if (!hugepd_none(*hpdp))
                kmem_cache_free(huge_pgtable_cache, new);
        else
                hpdp->pd = (unsigned long)new | HUGEPD_OK;
        spin_unlock(&mm->page_table_lock);
        return 0;
}

/* Modelled after find_linux_pte() */
pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
{
        pgd_t *pg;
        pud_t *pu;

        BUG_ON(get_slice_psize(mm, addr) != mmu_huge_psize);

        addr &= HPAGE_MASK;

        pg = pgd_offset(mm, addr);
        if (!pgd_none(*pg)) {
                pu = pud_offset(pg, addr);
                if (!pud_none(*pu)) {
#ifdef CONFIG_PPC_64K_PAGES
                        pmd_t *pm;
                        pm = pmd_offset(pu, addr);
                        if (!pmd_none(*pm))
                                return hugepte_offset((hugepd_t *)pm, addr);
#else
                        return hugepte_offset((hugepd_t *)pu, addr);
#endif
                }
        }

        return NULL;
}

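/* Walk (and, where needed, allocate) the page tables down to the huge
 * PTE for addr.  A sketch of the expected calling pattern, based on the
 * generic hugetlb fault path (names below are illustrative only):
 *
 *      ptep = huge_pte_alloc(mm, address);
 *      if (!ptep)
 *              goto oom;
 *      ...
 *      set_huge_pte_at(mm, address, ptep, new_pte);
 *
 * i.e. this function may sleep to allocate page tables, while the PTE
 * itself is filled in later under the appropriate lock. */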
pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
{
        pgd_t *pg;
        pud_t *pu;
        hugepd_t *hpdp = NULL;

        BUG_ON(get_slice_psize(mm, addr) != mmu_huge_psize);

        addr &= HPAGE_MASK;

        pg = pgd_offset(mm, addr);
        pu = pud_alloc(mm, pg, addr);

        if (pu) {
#ifdef CONFIG_PPC_64K_PAGES
                pmd_t *pm;
                pm = pmd_alloc(mm, pu, addr);
                if (pm)
                        hpdp = (hugepd_t *)pm;
#else
                hpdp = (hugepd_t *)pu;
#endif
        }

        if (!hpdp)
                return NULL;

        if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr))
                return NULL;

        return hugepte_offset(hpdp, addr);
}

int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
{
        return 0;
}

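/* Tear down one hugepte table: clear the hugepd entry and hand the
 * table to the mmu_gather batch, so it is only actually freed once the
 * TLB has been flushed and no other CPU can still be walking it. */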
static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp)
{
        pte_t *hugepte = hugepd_page(*hpdp);

        hpdp->pd = 0;
        tlb->need_flush = 1;
        pgtable_free_tlb(tlb, pgtable_free_cache(hugepte, HUGEPTE_CACHE_NUM,
                                                 PGF_CACHENUM_MASK));
}

#ifdef CONFIG_PPC_64K_PAGES
static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
                                   unsigned long addr, unsigned long end,
                                   unsigned long floor, unsigned long ceiling)
{
        pmd_t *pmd;
        unsigned long next;
        unsigned long start;

        start = addr;
        pmd = pmd_offset(pud, addr);
        do {
                next = pmd_addr_end(addr, end);
                if (pmd_none(*pmd))
                        continue;
                free_hugepte_range(tlb, (hugepd_t *)pmd);
        } while (pmd++, addr = next, addr != end);

        start &= PUD_MASK;
        if (start < floor)
                return;
        if (ceiling) {
                ceiling &= PUD_MASK;
                if (!ceiling)
                        return;
        }
        if (end - 1 > ceiling - 1)
                return;

        pmd = pmd_offset(pud, start);
        pud_clear(pud);
        pmd_free_tlb(tlb, pmd);
}
#endif

static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
                                   unsigned long addr, unsigned long end,
                                   unsigned long floor, unsigned long ceiling)
{
        pud_t *pud;
        unsigned long next;
        unsigned long start;

        start = addr;
        pud = pud_offset(pgd, addr);
        do {
                next = pud_addr_end(addr, end);
#ifdef CONFIG_PPC_64K_PAGES
                if (pud_none_or_clear_bad(pud))
                        continue;
                hugetlb_free_pmd_range(tlb, pud, addr, next, floor, ceiling);
#else
                if (pud_none(*pud))
                        continue;
                free_hugepte_range(tlb, (hugepd_t *)pud);
#endif
        } while (pud++, addr = next, addr != end);

        start &= PGDIR_MASK;
        if (start < floor)
                return;
        if (ceiling) {
                ceiling &= PGDIR_MASK;
                if (!ceiling)
                        return;
        }
        if (end - 1 > ceiling - 1)
                return;

        pud = pud_offset(pgd, start);
        pgd_clear(pgd);
        pud_free_tlb(tlb, pud);
}

/*
 * This function frees user-level page tables of a process.
 *
 * Must be called with pagetable lock held.
 */
void hugetlb_free_pgd_range(struct mmu_gather **tlb,
                            unsigned long addr, unsigned long end,
                            unsigned long floor, unsigned long ceiling)
{
        pgd_t *pgd;
        unsigned long next;
        unsigned long start;

        /*
         * Comments below taken from the normal free_pgd_range().  They
         * apply here too.  The tests against HUGEPD_MASK below are
         * essential, because we *don't* test for this at the bottom
         * level.  Without them we'll attempt to free a hugepte table
         * when we unmap just part of it, even if there are other
         * active mappings using it.
         *
         * The next few lines have given us lots of grief...
         *
         * Why are we testing HUGEPD* at this top level?  Because
         * often there will be no work to do at all, and we'd prefer
         * not to go all the way down to the bottom just to discover
         * that.
         *
         * Why all these "- 1"s?  Because 0 represents both the bottom
         * of the address space and the top of it (using -1 for the
         * top wouldn't help much: the masks would do the wrong thing).
         * The rule is that addr 0 and floor 0 refer to the bottom of
         * the address space, but end 0 and ceiling 0 refer to the top.
         * Comparisons need to use "end - 1" and "ceiling - 1" (though
         * that end 0 case should be mythical).
         *
         * Wherever addr is brought up or ceiling brought down, we
         * must be careful to reject "the opposite 0" before it
         * confuses the subsequent tests.  But what about where end is
         * brought down by HUGEPD_SIZE below?  No, end can't go down to
         * 0 there.
         *
         * Whereas we round start (addr) and ceiling down, by different
         * masks at different levels, in order to test whether a table
         * now has no other vmas using it, so can be freed, we don't
         * bother to round floor or end up - the tests don't need that.
         */

        addr &= HUGEPD_MASK;
        if (addr < floor) {
                addr += HUGEPD_SIZE;
                if (!addr)
                        return;
        }
        if (ceiling) {
                ceiling &= HUGEPD_MASK;
                if (!ceiling)
                        return;
        }
        if (end - 1 > ceiling - 1)
                end -= HUGEPD_SIZE;
        if (addr > end - 1)
                return;

        start = addr;
        pgd = pgd_offset((*tlb)->mm, addr);
        do {
                BUG_ON(get_slice_psize((*tlb)->mm, addr) != mmu_huge_psize);
                next = pgd_addr_end(addr, end);
                if (pgd_none_or_clear_bad(pgd))
                        continue;
                hugetlb_free_pud_range(*tlb, pgd, addr, next, floor, ceiling);
        } while (pgd++, addr = next, addr != end);
}

void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
                     pte_t *ptep, pte_t pte)
{
        if (pte_present(*ptep)) {
                /* We open-code pte_clear because we need to pass the right
                 * argument to hpte_need_flush (huge / !huge). Might not be
                 * necessary anymore if we make hpte_need_flush() get the
                 * page size from the slices
                 */
                pte_update(mm, addr & HPAGE_MASK, ptep, ~0UL, 1);
        }
        *ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
}

pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
                              pte_t *ptep)
{
        unsigned long old = pte_update(mm, addr, ptep, ~0UL, 1);
        return __pte(old);
}

struct page *
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
{
        pte_t *ptep;
        struct page *page;

        if (get_slice_psize(mm, address) != mmu_huge_psize)
                return ERR_PTR(-EINVAL);

        ptep = huge_pte_offset(mm, address);
        page = pte_page(*ptep);
        if (page)
                page += (address % HPAGE_SIZE) / PAGE_SIZE;

        return page;
}

int pmd_huge(pmd_t pmd)
{
        return 0;
}

struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
                pmd_t *pmd, int write)
{
        BUG();
        return NULL;
}

unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
                                        unsigned long len, unsigned long pgoff,
                                        unsigned long flags)
{
        return slice_get_unmapped_area(addr, len, flags,
                                       mmu_huge_psize, 1, 0);
}

/*
 * Called by asm hashtable.S for doing lazy icache flush
 */
static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags,
                                                  pte_t pte, int trap)
{
        struct page *page;
        int i;

        if (!pfn_valid(pte_pfn(pte)))
                return rflags;

        page = pte_page(pte);

        /* page is dirty */
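        /* ... meaning the icache may still be stale for it.  Trap 0x400
         * is the instruction access (ISI) vector: on an execute fault we
         * flush dcache to icache for every 4k sub-page of the huge page
         * and mark it clean with PG_arch_1; on a data access we just map
         * the page no-execute for now and defer the flush. */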
        if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
                if (trap == 0x400) {
                        for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++)
                                __flush_dcache_icache(page_address(page+i));
                        set_bit(PG_arch_1, &page->flags);
                } else {
                        rflags |= HPTE_R_N;
                }
        }
        return rflags;
}

int hash_huge_page(struct mm_struct *mm, unsigned long access,
                   unsigned long ea, unsigned long vsid, int local,
                   unsigned long trap)
{
        pte_t *ptep;
        unsigned long old_pte, new_pte;
        unsigned long va, rflags, pa;
        long slot;
        int err = 1;
        int ssize = user_segment_size(ea);

        ptep = huge_pte_offset(mm, ea);

        /* Search the Linux page table for a match with va */
        va = hpt_va(ea, vsid, ssize);

        /*
         * If no pte found or not present, send the problem up to
         * do_page_fault
         */
        if (unlikely(!ptep || pte_none(*ptep)))
                goto out;

        /*
         * Check the user's access rights to the page.  If access should be
         * prevented then send the problem up to do_page_fault.
         */
        if (unlikely(access & ~pte_val(*ptep)))
                goto out;
        /*
         * At this point, we have a pte (old_pte) which can be used to build
         * or update an HPTE. There are 2 cases:
         *
         * 1. There is a valid (present) pte with no associated HPTE (this is
         *      the most common case)
         * 2. There is a valid (present) pte with an associated HPTE. The
         *      current values of the pp bits in the HPTE prevent access
         *      because we are doing software DIRTY bit management and the
         *      page is currently not DIRTY.
         */

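        /* Lock the PTE by atomically setting _PAGE_BUSY (along with the
         * ACCESSED and HASHPTE bits we will need anyway).  If some other
         * CPU already holds _PAGE_BUSY, give up and let the fault be
         * retried. */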
        do {
                old_pte = pte_val(*ptep);
                if (old_pte & _PAGE_BUSY)
                        goto out;
                new_pte = old_pte | _PAGE_BUSY |
                        _PAGE_ACCESSED | _PAGE_HASHPTE;
        } while(old_pte != __cmpxchg_u64((unsigned long *)ptep,
                                         old_pte, new_pte));

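        /* The low bits of rflags become the HPTE PP (protection) bits:
         * 0x2 allows user read/write, and also setting the bottom bit
         * (when the Linux PTE lacks _PAGE_RW) downgrades that to
         * read-only. */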
        rflags = 0x2 | (!(new_pte & _PAGE_RW));
        /* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
        rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
        if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
                /* No CPU with hugepage support lacks no-execute, so we
                 * don't need to worry about that case */
                rflags = hash_huge_page_do_lazy_icache(rflags, __pte(old_pte),
                                                       trap);

        /* Check if pte already has an hpte (case 2) */
        if (unlikely(old_pte & _PAGE_HASHPTE)) {
                /* There MIGHT be an HPTE for this pte */
                unsigned long hash, slot;

                hash = hpt_hash(va, HPAGE_SHIFT, ssize);
                if (old_pte & _PAGE_F_SECOND)
                        hash = ~hash;
                slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
                slot += (old_pte & _PAGE_F_GIX) >> 12;

                if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_huge_psize,
                                         ssize, local) == -1)
                        old_pte &= ~_PAGE_HPTEFLAGS;
        }

        if (likely(!(old_pte & _PAGE_HASHPTE))) {
                unsigned long hash = hpt_hash(va, HPAGE_SHIFT, ssize);
                unsigned long hpte_group;

                pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;

repeat:
                hpte_group = ((hash & htab_hash_mask) *
                              HPTES_PER_GROUP) & ~0x7UL;

                /* clear HPTE slot information in the new PTE */
                new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;

                /* Add in WIMG bits */
                /* XXX We should store these in the pte */
                /* --BenH: I think they are ... */
                rflags |= _PAGE_COHERENT;

                /* Insert into the hash table, primary slot */
                slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0,
                                          mmu_huge_psize, ssize);

                /* Primary is full, try the secondary */
                if (unlikely(slot == -1)) {
                        hpte_group = ((~hash & htab_hash_mask) *
                                      HPTES_PER_GROUP) & ~0x7UL;
                        slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags,
                                                  HPTE_V_SECONDARY,
                                                  mmu_huge_psize, ssize);
                        if (slot == -1) {
                                if (mftb() & 0x1)
                                        hpte_group = ((hash & htab_hash_mask) *
                                                      HPTES_PER_GROUP)&~0x7UL;

                                ppc_md.hpte_remove(hpte_group);
                                goto repeat;
                        }
                }

                if (unlikely(slot == -2))
                        panic("hash_huge_page: pte_insert failed\n");

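                /* Remember which slot of the group the HPTE went into
                 * (and whether it was the secondary group) in the
                 * software bits of the Linux PTE, so a later update or
                 * invalidate can find the HPTE again without searching. */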
                new_pte |= (slot << 12) & (_PAGE_F_SECOND | _PAGE_F_GIX);
        }

        /*
         * No need to use ldarx/stdcx here
         */
        *ptep = __pte(new_pte & ~_PAGE_BUSY);

        err = 0;

 out:
        return err;
}

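/* Constructor for the hugepte table cache: new tables start out fully
 * zeroed, i.e. every huge PTE in them is initially empty. */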
static void zero_ctor(struct kmem_cache *cache, void *addr)
{
        memset(addr, 0, kmem_cache_size(cache));
}

static int __init hugetlbpage_init(void)
{
        if (!cpu_has_feature(CPU_FTR_16M_PAGE))
                return -ENODEV;

        huge_pgtable_cache = kmem_cache_create("hugepte_cache",
                                               HUGEPTE_TABLE_SIZE,
                                               HUGEPTE_TABLE_SIZE,
                                               0,
                                               zero_ctor);
        if (!huge_pgtable_cache)
                panic("hugetlbpage_init(): could not create hugepte cache\n");

        return 0;
}

module_init(hugetlbpage_init);