linux/arch/powerpc/mm/pgtable_64.c
   1/*
   2 *  This file contains ioremap and related functions for 64-bit machines.
   3 *
   4 *  Derived from arch/ppc64/mm/init.c
   5 *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
   6 *
   7 *  Modifications by Paul Mackerras (PowerMac) (paulus@samba.org)
   8 *  and Cort Dougan (PReP) (cort@cs.nmt.edu)
   9 *    Copyright (C) 1996 Paul Mackerras
  10 *
  11 *  Derived from "arch/i386/mm/init.c"
  12 *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
  13 *
  14 *  Dave Engebretsen <engebret@us.ibm.com>
  15 *      Rework for PPC64 port.
  16 *
  17 *  This program is free software; you can redistribute it and/or
  18 *  modify it under the terms of the GNU General Public License
  19 *  as published by the Free Software Foundation; either version
  20 *  2 of the License, or (at your option) any later version.
  21 *
  22 */
  23
  24#include <linux/signal.h>
  25#include <linux/sched.h>
  26#include <linux/kernel.h>
  27#include <linux/errno.h>
  28#include <linux/string.h>
  29#include <linux/export.h>
  30#include <linux/types.h>
  31#include <linux/mman.h>
  32#include <linux/mm.h>
  33#include <linux/swap.h>
  34#include <linux/stddef.h>
  35#include <linux/vmalloc.h>
  36#include <linux/memblock.h>
  37#include <linux/slab.h>
  38#include <linux/hugetlb.h>
  39
  40#include <asm/pgalloc.h>
  41#include <asm/page.h>
  42#include <asm/prom.h>
  43#include <asm/io.h>
  44#include <asm/mmu_context.h>
  45#include <asm/pgtable.h>
  46#include <asm/mmu.h>
  47#include <asm/smp.h>
  48#include <asm/machdep.h>
  49#include <asm/tlb.h>
  50#include <asm/processor.h>
  51#include <asm/cputable.h>
  52#include <asm/sections.h>
  53#include <asm/firmware.h>
  54#include <asm/dma.h>
  55
  56#include "mmu_decl.h"
  57
  58#define CREATE_TRACE_POINTS
  59#include <trace/events/thp.h>
  60
  61/* Some sanity checking */
  62#if TASK_SIZE_USER64 > PGTABLE_RANGE
  63#error TASK_SIZE_USER64 exceeds pagetable range
  64#endif
  65
  66#ifdef CONFIG_PPC_STD_MMU_64
  67#if TASK_SIZE_USER64 > (1UL << (ESID_BITS + SID_SHIFT))
  68#error TASK_SIZE_USER64 exceeds user VSID range
  69#endif
  70#endif
  71
  72unsigned long ioremap_bot = IOREMAP_BASE;
  73
  74#ifdef CONFIG_PPC_MMU_NOHASH
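     /*
      * Boot-time page table allocation: return a naturally aligned,
      * zeroed block carved out of memblock below MAX_DMA_ADDRESS,
      * for use before the slab allocator is available.
      */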
  75static __ref void *early_alloc_pgtable(unsigned long size)
  76{
  77        void *pt;
  78
  79        pt = __va(memblock_alloc_base(size, size, __pa(MAX_DMA_ADDRESS)));
  80        memset(pt, 0, size);
  81
  82        return pt;
  83}
  84#endif /* CONFIG_PPC_MMU_NOHASH */
  85
  86/*
   87 * map_kernel_page is currently only called by __ioremap.
   88 * It adds an entry to the ioremap page table
   89 * and adds an entry to the HPT, possibly bolting it.
  90 */
  91int map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flags)
  92{
  93        pgd_t *pgdp;
  94        pud_t *pudp;
  95        pmd_t *pmdp;
  96        pte_t *ptep;
  97
  98        if (slab_is_available()) {
  99                pgdp = pgd_offset_k(ea);
 100                pudp = pud_alloc(&init_mm, pgdp, ea);
 101                if (!pudp)
 102                        return -ENOMEM;
 103                pmdp = pmd_alloc(&init_mm, pudp, ea);
 104                if (!pmdp)
 105                        return -ENOMEM;
 106                ptep = pte_alloc_kernel(pmdp, ea);
 107                if (!ptep)
 108                        return -ENOMEM;
 109                set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
 110                                                          __pgprot(flags)));
 111        } else {
 112#ifdef CONFIG_PPC_MMU_NOHASH
 113                pgdp = pgd_offset_k(ea);
 114#ifdef PUD_TABLE_SIZE
 115                if (pgd_none(*pgdp)) {
 116                        pudp = early_alloc_pgtable(PUD_TABLE_SIZE);
 117                        BUG_ON(pudp == NULL);
 118                        pgd_populate(&init_mm, pgdp, pudp);
 119                }
 120#endif /* PUD_TABLE_SIZE */
 121                pudp = pud_offset(pgdp, ea);
 122                if (pud_none(*pudp)) {
 123                        pmdp = early_alloc_pgtable(PMD_TABLE_SIZE);
 124                        BUG_ON(pmdp == NULL);
 125                        pud_populate(&init_mm, pudp, pmdp);
 126                }
 127                pmdp = pmd_offset(pudp, ea);
 128                if (!pmd_present(*pmdp)) {
 129                        ptep = early_alloc_pgtable(PAGE_SIZE);
 130                        BUG_ON(ptep == NULL);
 131                        pmd_populate_kernel(&init_mm, pmdp, ptep);
 132                }
 133                ptep = pte_offset_kernel(pmdp, ea);
 134                set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
 135                                                          __pgprot(flags)));
 136#else /* CONFIG_PPC_MMU_NOHASH */
 137                /*
 138                 * If the mm subsystem is not fully up, we cannot create a
 139                 * linux page table entry for this mapping.  Simply bolt an
 140                 * entry in the hardware page table.
 141                 *
 142                 */
 143                if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, flags,
 144                                      mmu_io_psize, mmu_kernel_ssize)) {
 145                        printk(KERN_ERR "Failed to do bolted mapping IO "
 146                               "memory at %016lx !\n", pa);
 147                        return -ENOMEM;
 148                }
 149#endif /* !CONFIG_PPC_MMU_NOHASH */
 150        }
 151
 152        smp_wmb();
 153        return 0;
 154}
 155
 156
 157/**
 158 * __ioremap_at - Low level function to establish the page tables
 159 *                for an IO mapping
 160 */
 161void __iomem * __ioremap_at(phys_addr_t pa, void *ea, unsigned long size,
 162                            unsigned long flags)
 163{
 164        unsigned long i;
 165
 166        /* Make sure we have the base flags */
 167        if ((flags & _PAGE_PRESENT) == 0)
 168                flags |= pgprot_val(PAGE_KERNEL);
 169
 170        /* Non-cacheable page cannot be coherent */
 171        if (flags & _PAGE_NO_CACHE)
 172                flags &= ~_PAGE_COHERENT;
 173
 174        /* We don't support the 4K PFN hack with ioremap */
 175        if (flags & _PAGE_4K_PFN)
 176                return NULL;
 177
 178        WARN_ON(pa & ~PAGE_MASK);
 179        WARN_ON(((unsigned long)ea) & ~PAGE_MASK);
 180        WARN_ON(size & ~PAGE_MASK);
 181
 182        for (i = 0; i < size; i += PAGE_SIZE)
 183                if (map_kernel_page((unsigned long)ea+i, pa+i, flags))
 184                        return NULL;
 185
 186        return (void __iomem *)ea;
 187}
 188
 189/**
  190 * __iounmap_at - Low level function to tear down the page tables
 191 *                  for an IO mapping. This is used for mappings that
 192 *                  are manipulated manually, like partial unmapping of
 193 *                  PCI IOs or ISA space.
 194 */
 195void __iounmap_at(void *ea, unsigned long size)
 196{
 197        WARN_ON(((unsigned long)ea) & ~PAGE_MASK);
 198        WARN_ON(size & ~PAGE_MASK);
 199
 200        unmap_kernel_range((unsigned long)ea, size);
 201}
 202
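     /*
      * Common ioremap worker: page-align the request, pick a virtual
      * address (from the vmalloc area once the slab allocator is up,
      * otherwise by bumping ioremap_bot) and map it with __ioremap_at().
      */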
 203void __iomem * __ioremap_caller(phys_addr_t addr, unsigned long size,
 204                                unsigned long flags, void *caller)
 205{
 206        phys_addr_t paligned;
 207        void __iomem *ret;
 208
 209        /*
 210         * Choose an address to map it to.
  211         * Once the vmalloc system is running, we use it.
  212         * Before that, we map using addresses going
  213         * up from ioremap_bot.  The vm area allocator will use
  214         * the addresses from ioremap_bot through
  215         * IOREMAP_END.
  216         *
 217         */
 218        paligned = addr & PAGE_MASK;
 219        size = PAGE_ALIGN(addr + size) - paligned;
 220
 221        if ((size == 0) || (paligned == 0))
 222                return NULL;
 223
 224        if (slab_is_available()) {
 225                struct vm_struct *area;
 226
 227                area = __get_vm_area_caller(size, VM_IOREMAP,
 228                                            ioremap_bot, IOREMAP_END,
 229                                            caller);
 230                if (area == NULL)
 231                        return NULL;
 232
 233                area->phys_addr = paligned;
 234                ret = __ioremap_at(paligned, area->addr, size, flags);
 235                if (!ret)
 236                        vunmap(area->addr);
 237        } else {
 238                ret = __ioremap_at(paligned, (void *)ioremap_bot, size, flags);
 239                if (ret)
 240                        ioremap_bot += size;
 241        }
 242
 243        if (ret)
 244                ret += addr & ~PAGE_MASK;
 245        return ret;
 246}
 247
 248void __iomem * __ioremap(phys_addr_t addr, unsigned long size,
 249                         unsigned long flags)
 250{
 251        return __ioremap_caller(addr, size, flags, __builtin_return_address(0));
 252}
 253
 254void __iomem * ioremap(phys_addr_t addr, unsigned long size)
 255{
 256        unsigned long flags = _PAGE_NO_CACHE | _PAGE_GUARDED;
 257        void *caller = __builtin_return_address(0);
 258
 259        if (ppc_md.ioremap)
 260                return ppc_md.ioremap(addr, size, flags, caller);
 261        return __ioremap_caller(addr, size, flags, caller);
 262}
 263
 264void __iomem * ioremap_wc(phys_addr_t addr, unsigned long size)
 265{
 266        unsigned long flags = _PAGE_NO_CACHE;
 267        void *caller = __builtin_return_address(0);
 268
 269        if (ppc_md.ioremap)
 270                return ppc_md.ioremap(addr, size, flags, caller);
 271        return __ioremap_caller(addr, size, flags, caller);
 272}
 273
 274void __iomem * ioremap_prot(phys_addr_t addr, unsigned long size,
 275                             unsigned long flags)
 276{
 277        void *caller = __builtin_return_address(0);
 278
 279        /* writeable implies dirty for kernel addresses */
 280        if (flags & _PAGE_RW)
 281                flags |= _PAGE_DIRTY;
 282
 283        /* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */
 284        flags &= ~(_PAGE_USER | _PAGE_EXEC);
 285
 286#ifdef _PAGE_BAP_SR
 287        /* _PAGE_USER contains _PAGE_BAP_SR on BookE using the new PTE format
 288         * which means that we just cleared supervisor access... oops ;-) This
 289         * restores it
 290         */
 291        flags |= _PAGE_BAP_SR;
 292#endif
 293
 294        if (ppc_md.ioremap)
 295                return ppc_md.ioremap(addr, size, flags, caller);
 296        return __ioremap_caller(addr, size, flags, caller);
 297}
 298
 299
  300/*
  301 * Unmap an IO region and remove it from the vmalloc'd area list.
  302 * Access to IO memory should be serialized by the driver.
 303 */
 304void __iounmap(volatile void __iomem *token)
 305{
 306        void *addr;
 307
 308        if (!slab_is_available())
 309                return;
 310        
 311        addr = (void *) ((unsigned long __force)
 312                         PCI_FIX_ADDR(token) & PAGE_MASK);
 313        if ((unsigned long)addr < ioremap_bot) {
 314                printk(KERN_WARNING "Attempt to iounmap early bolted mapping"
 315                       " at 0x%p\n", addr);
 316                return;
 317        }
 318        vunmap(addr);
 319}
 320
 321void iounmap(volatile void __iomem *token)
 322{
 323        if (ppc_md.iounmap)
 324                ppc_md.iounmap(token);
 325        else
 326                __iounmap(token);
 327}
 328
 329EXPORT_SYMBOL(ioremap);
 330EXPORT_SYMBOL(ioremap_wc);
 331EXPORT_SYMBOL(ioremap_prot);
 332EXPORT_SYMBOL(__ioremap);
 333EXPORT_SYMBOL(__ioremap_at);
 334EXPORT_SYMBOL(iounmap);
 335EXPORT_SYMBOL(__iounmap);
 336EXPORT_SYMBOL(__iounmap_at);
 337
 338#ifndef __PAGETABLE_PUD_FOLDED
 339/* 4 level page table */
 340struct page *pgd_page(pgd_t pgd)
 341{
 342        if (pgd_huge(pgd))
 343                return pte_page(pgd_pte(pgd));
 344        return virt_to_page(pgd_page_vaddr(pgd));
 345}
 346#endif
 347
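     /* Return the struct page backing a pud entry, handling huge puds. */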
 348struct page *pud_page(pud_t pud)
 349{
 350        if (pud_huge(pud))
 351                return pte_page(pud_pte(pud));
 352        return virt_to_page(pud_page_vaddr(pud));
 353}
 354
 355/*
  356 * For a hugepage we have the pfn in the pmd and use the low PTE_RPN_SHIFT bits for flags.
  357 * For a PTE page, we have a PTE_FRAG_SIZE (4K) aligned virtual address.
 358 */
 359struct page *pmd_page(pmd_t pmd)
 360{
 361        if (pmd_trans_huge(pmd) || pmd_huge(pmd))
 362                return pte_page(pmd_pte(pmd));
 363        return virt_to_page(pmd_page_vaddr(pmd));
 364}
 365
 366#ifdef CONFIG_PPC_64K_PAGES
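     /*
      * PTE fragment cache: with 64K pages a single page holds several
      * PTE-page fragments. Return the currently cached fragment and
      * advance the cursor by PTE_FRAG_SIZE, or NULL if nothing is cached.
      */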
 367static pte_t *get_from_cache(struct mm_struct *mm)
 368{
 369        void *pte_frag, *ret;
 370
 371        spin_lock(&mm->page_table_lock);
 372        ret = mm->context.pte_frag;
 373        if (ret) {
 374                pte_frag = ret + PTE_FRAG_SIZE;
 375                /*
  376                 * If we have taken up all the fragments, set pte_frag to NULL
 377                 */
 378                if (((unsigned long)pte_frag & ~PAGE_MASK) == 0)
 379                        pte_frag = NULL;
 380                mm->context.pte_frag = pte_frag;
 381        }
 382        spin_unlock(&mm->page_table_lock);
 383        return (pte_t *)ret;
 384}
 385
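     /*
      * Slow path: allocate a fresh page, hand out its first fragment and
      * cache the remainder in mm->context.pte_frag, unless another thread
      * populated the cache while we were allocating.
      */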
 386static pte_t *__alloc_for_cache(struct mm_struct *mm, int kernel)
 387{
 388        void *ret = NULL;
 389        struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK |
 390                                       __GFP_REPEAT | __GFP_ZERO);
 391        if (!page)
 392                return NULL;
 393        if (!kernel && !pgtable_page_ctor(page)) {
 394                __free_page(page);
 395                return NULL;
 396        }
 397
 398        ret = page_address(page);
 399        spin_lock(&mm->page_table_lock);
 400        /*
  401         * If we find mm->context.pte_frag already set (another
  402         * thread raced with us), return the allocated page with
  403         * a single fragment count.
 404         */
 405        if (likely(!mm->context.pte_frag)) {
 406                set_page_count(page, PTE_FRAG_NR);
 407                mm->context.pte_frag = ret + PTE_FRAG_SIZE;
 408        }
 409        spin_unlock(&mm->page_table_lock);
 410
 411        return (pte_t *)ret;
 412}
 413
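     /*
      * Allocate a PTE page fragment: try the per-mm cache first and fall
      * back to allocating a new page.
      */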
 414pte_t *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr, int kernel)
 415{
 416        pte_t *pte;
 417
 418        pte = get_from_cache(mm);
 419        if (pte)
 420                return pte;
 421
 422        return __alloc_for_cache(mm, kernel);
 423}
 424
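     /*
      * Drop one fragment reference; the backing page is freed only once
      * its last fragment has been released.
      */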
 425void page_table_free(struct mm_struct *mm, unsigned long *table, int kernel)
 426{
 427        struct page *page = virt_to_page(table);
 428        if (put_page_testzero(page)) {
 429                if (!kernel)
 430                        pgtable_page_dtor(page);
 431                free_hot_cold_page(page, 0);
 432        }
 433}
 434
 435#ifdef CONFIG_SMP
 436static void page_table_free_rcu(void *table)
 437{
 438        struct page *page = virt_to_page(table);
 439        if (put_page_testzero(page)) {
 440                pgtable_page_dtor(page);
 441                free_hot_cold_page(page, 0);
 442        }
 443}
 444
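     /*
      * Defer freeing of a page table via the TLB gather: encode the cache
      * index shift in the low bits of the table pointer (0 denotes a PTE
      * fragment) and hand the result to tlb_remove_table().
      */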
 445void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
 446{
 447        unsigned long pgf = (unsigned long)table;
 448
 449        BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
 450        pgf |= shift;
 451        tlb_remove_table(tlb, (void *)pgf);
 452}
 453
 454void __tlb_remove_table(void *_table)
 455{
 456        void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE);
 457        unsigned shift = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE;
 458
 459        if (!shift)
 460                /* PTE page needs special handling */
 461                page_table_free_rcu(table);
 462        else {
 463                BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
 464                kmem_cache_free(PGT_CACHE(shift), table);
 465        }
 466}
 467#else
 468void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
 469{
 470        if (!shift) {
 471                /* PTE page needs special handling */
 472                struct page *page = virt_to_page(table);
 473                if (put_page_testzero(page)) {
 474                        pgtable_page_dtor(page);
 475                        free_hot_cold_page(page, 0);
 476                }
 477        } else {
 478                BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
 479                kmem_cache_free(PGT_CACHE(shift), table);
 480        }
 481}
 482#endif
 483#endif /* CONFIG_PPC_64K_PAGES */
 484
 485#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 486
 487/*
 488 * This is called when relaxing access to a hugepage. It's also called in the page
  489 * fault path when we don't hit any of the major fault cases, i.e., a minor
  490 * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc. The generic code will have
  491 * handled those two for us; we additionally deal with missing execute
  492 * permission here on some processors.
 493 */
 494int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
 495                          pmd_t *pmdp, pmd_t entry, int dirty)
 496{
 497        int changed;
 498#ifdef CONFIG_DEBUG_VM
 499        WARN_ON(!pmd_trans_huge(*pmdp));
 500        assert_spin_locked(&vma->vm_mm->page_table_lock);
 501#endif
 502        changed = !pmd_same(*(pmdp), entry);
 503        if (changed) {
 504                __ptep_set_access_flags(pmdp_ptep(pmdp), pmd_pte(entry));
 505                /*
 506                 * Since we are not supporting SW TLB systems, we don't
  507                 * have anything similar to flush_tlb_page_nohash()
 508                 */
 509        }
 510        return changed;
 511}
 512
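     /*
      * Atomically clear and/or set flag bits in a huge pmd, flushing any
      * stale hash table entries, and return the old pmd value.
      */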
 513unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
 514                                  pmd_t *pmdp, unsigned long clr,
 515                                  unsigned long set)
 516{
 517
 518        unsigned long old, tmp;
 519
 520#ifdef CONFIG_DEBUG_VM
 521        WARN_ON(!pmd_trans_huge(*pmdp));
 522        assert_spin_locked(&mm->page_table_lock);
 523#endif
 524
 525#ifdef PTE_ATOMIC_UPDATES
 526        __asm__ __volatile__(
 527        "1:     ldarx   %0,0,%3\n\
 528                andi.   %1,%0,%6\n\
 529                bne-    1b \n\
 530                andc    %1,%0,%4 \n\
 531                or      %1,%1,%7\n\
 532                stdcx.  %1,0,%3 \n\
 533                bne-    1b"
 534        : "=&r" (old), "=&r" (tmp), "=m" (*pmdp)
 535        : "r" (pmdp), "r" (clr), "m" (*pmdp), "i" (_PAGE_BUSY), "r" (set)
 536        : "cc" );
 537#else
 538        old = pmd_val(*pmdp);
 539        *pmdp = __pmd((old & ~clr) | set);
 540#endif
 541        trace_hugepage_update(addr, old, clr, set);
 542        if (old & _PAGE_HASHPTE)
 543                hpte_do_hugepage_flush(mm, addr, pmdp, old);
 544        return old;
 545}
 546
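     /*
      * Clear the pmd before collapsing a range of normal pages into a
      * hugepage, making sure no parallel hash fault or stale HPTEs can
      * survive the transition (see the comments in the body below).
      */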
 547pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
 548                          pmd_t *pmdp)
 549{
 550        pmd_t pmd;
 551
 552        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 553        VM_BUG_ON(pmd_trans_huge(*pmdp));
 554
 555        pmd = *pmdp;
 556        pmd_clear(pmdp);
 557        /*
 558         * Wait for all pending hash_page to finish. This is needed
 559         * in case of subpage collapse. When we collapse normal pages
 560         * to hugepage, we first clear the pmd, then invalidate all
 561         * the PTE entries. The assumption here is that any low level
 562         * page fault will see a none pmd and take the slow path that
 563         * will wait on mmap_sem. But we could very well be in a
 564         * hash_page with local ptep pointer value. Such a hash page
 565         * can result in adding new HPTE entries for normal subpages.
 566         * That means we could be modifying the page content as we
 567         * copy them to a huge page. So wait for parallel hash_page
 568         * to finish before invalidating HPTE entries. We can do this
 569         * by sending an IPI to all the cpus and executing a dummy
 570         * function there.
 571         */
 572        kick_all_cpus_sync();
 573        /*
 574         * Now invalidate the hpte entries in the range
  575         * covered by the pmd. This makes sure we take a
  576         * fault and will find the pmd as none, which will
  577         * result in a major fault that takes mmap_sem and
  578         * hence waits for collapse to complete. Without this
 579         * the __collapse_huge_page_copy can result in copying
 580         * the old content.
 581         */
 582        flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
 583        return pmd;
 584}
 585
 586int pmdp_test_and_clear_young(struct vm_area_struct *vma,
 587                              unsigned long address, pmd_t *pmdp)
 588{
 589        return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
 590}
 591
 592/*
 593 * We currently remove entries from the hashtable regardless of whether
 594 * the entry was young or dirty. The generic routines only flush if the
  595 * entry was young or dirty, which is not good enough.
  596 *
  597 * We should be more intelligent about this, but for the moment we override
  598 * these functions and force a TLB flush unconditionally.
 599 */
 600int pmdp_clear_flush_young(struct vm_area_struct *vma,
 601                                  unsigned long address, pmd_t *pmdp)
 602{
 603        return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
 604}
 605
 606/*
  607 * We want to put the pgtable in the pmd and use the pgtable for tracking
  608 * the base page size HPTEs.
 609 */
 610void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
 611                                pgtable_t pgtable)
 612{
 613        pgtable_t *pgtable_slot;
 614        assert_spin_locked(&mm->page_table_lock);
 615        /*
  616         * We store the pgtable in the second half of the PMD.
 617         */
 618        pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
 619        *pgtable_slot = pgtable;
 620        /*
  621         * Expose the deposited pgtable to other CPUs
  622         * before we set the hugepage PTE at the pmd level;
  623         * the hash fault code looks at the deposited pgtable
  624         * to store hash index values.
 625         */
 626        smp_wmb();
 627}
 628
 629pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
 630{
 631        pgtable_t pgtable;
 632        pgtable_t *pgtable_slot;
 633
 634        assert_spin_locked(&mm->page_table_lock);
 635        pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
 636        pgtable = *pgtable_slot;
 637        /*
 638         * Once we withdraw, mark the entry NULL.
 639         */
 640        *pgtable_slot = NULL;
 641        /*
 642         * We store HPTE information in the deposited PTE fragment.
 643         * zero out the content on withdraw.
 644         */
 645        memset(pgtable, 0, PTE_FRAG_SIZE);
 646        return pgtable;
 647}
 648
 649void pmdp_huge_split_prepare(struct vm_area_struct *vma,
 650                             unsigned long address, pmd_t *pmdp)
 651{
 652        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 653        VM_BUG_ON(REGION_ID(address) != USER_REGION_ID);
 654
 655        /*
 656         * We can't mark the pmd none here, because that will cause a race
  657         * against exit_mmap. We need to continue to mark the pmd TRANS HUGE
  658         * while we split, but at the same time we want the rest of the ppc64
  659         * code not to insert a hash pte on this, because we will be modifying
  660         * the deposited pgtable in the caller of this function. Hence
  661         * clear _PAGE_USER so that we move the fault handling to a
  662         * higher level function which will serialize against the ptl.
  663         * We need to flush existing hash pte entries here even though
 664         * the translation is still valid, because we will withdraw
 665         * pgtable_t after this.
 666         */
 667        pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_USER, 0);
 668}
 669
 670
 671/*
 672 * set a new huge pmd. We should not be called for updating
 673 * an existing pmd entry. That should go via pmd_hugepage_update.
 674 */
 675void set_pmd_at(struct mm_struct *mm, unsigned long addr,
 676                pmd_t *pmdp, pmd_t pmd)
 677{
 678#ifdef CONFIG_DEBUG_VM
 679        WARN_ON((pmd_val(*pmdp) & (_PAGE_PRESENT | _PAGE_USER)) ==
 680                (_PAGE_PRESENT | _PAGE_USER));
 681        assert_spin_locked(&mm->page_table_lock);
 682        WARN_ON(!pmd_trans_huge(pmd));
 683#endif
 684        trace_hugepage_set_pmd(addr, pmd_val(pmd));
 685        return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
 686}
 687
 688/*
 689 * We use this to invalidate a pmdp entry before switching from a
 690 * hugepte to regular pmd entry.
 691 */
 692void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
 693                     pmd_t *pmdp)
 694{
 695        pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 0);
 696
 697        /*
  698         * This ensures that generic code that relies on IRQ disabling
  699         * to prevent a parallel THP split works as expected.
 700         */
 701        kick_all_cpus_sync();
 702}
 703
 704/*
 705 * A linux hugepage PMD was changed and the corresponding hash table entries
  706 * need to be flushed.
 707 */
 708void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
 709                            pmd_t *pmdp, unsigned long old_pmd)
 710{
 711        int ssize;
 712        unsigned int psize;
 713        unsigned long vsid;
 714        unsigned long flags = 0;
 715        const struct cpumask *tmp;
 716
  717        /* Get the base page size, vsid and segment size */
 718#ifdef CONFIG_DEBUG_VM
 719        psize = get_slice_psize(mm, addr);
 720        BUG_ON(psize == MMU_PAGE_16M);
 721#endif
 722        if (old_pmd & _PAGE_COMBO)
 723                psize = MMU_PAGE_4K;
 724        else
 725                psize = MMU_PAGE_64K;
 726
 727        if (!is_kernel_addr(addr)) {
 728                ssize = user_segment_size(addr);
 729                vsid = get_vsid(mm->context.id, addr, ssize);
 730                WARN_ON(vsid == 0);
 731        } else {
 732                vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
 733                ssize = mmu_kernel_ssize;
 734        }
 735
 736        tmp = cpumask_of(smp_processor_id());
 737        if (cpumask_equal(mm_cpumask(mm), tmp))
 738                flags |= HPTE_LOCAL_UPDATE;
 739
 740        return flush_hash_hugepage(vsid, addr, pmdp, psize, ssize, flags);
 741}
 742
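     /* Helpers for constructing huge pmd values from a pfn, page or pgprot. */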
 743static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
 744{
 745        return __pmd(pmd_val(pmd) | pgprot_val(pgprot));
 746}
 747
 748pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot)
 749{
 750        unsigned long pmdv;
 751
 752        pmdv = (pfn << PTE_RPN_SHIFT) & PTE_RPN_MASK;
 753        return pmd_set_protbits(__pmd(pmdv), pgprot);
 754}
 755
 756pmd_t mk_pmd(struct page *page, pgprot_t pgprot)
 757{
 758        return pfn_pmd(page_to_pfn(page), pgprot);
 759}
 760
 761pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
 762{
 763        unsigned long pmdv;
 764
 765        pmdv = pmd_val(pmd);
 766        pmdv &= _HPAGE_CHG_MASK;
 767        return pmd_set_protbits(__pmd(pmdv), newprot);
 768}
 769
 770/*
 771 * This is called at the end of handling a user page fault, when the
 772 * fault has been handled by updating a HUGE PMD entry in the linux page tables.
 773 * We use it to preload an HPTE into the hash table corresponding to
 774 * the updated linux HUGE PMD entry.
 775 */
 776void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
 777                          pmd_t *pmd)
 778{
 779        return;
 780}
 781
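     /*
      * Clear a huge pmd and return its old value, scrubbing the deposited
      * pgtable and synchronizing with lockless page table walkers.
      */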
 782pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
 783                              unsigned long addr, pmd_t *pmdp)
 784{
 785        pmd_t old_pmd;
 786        pgtable_t pgtable;
 787        unsigned long old;
 788        pgtable_t *pgtable_slot;
 789
 790        old = pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
 791        old_pmd = __pmd(old);
 792        /*
 793         * We have pmd == none and we are holding page_table_lock.
 794         * So we can safely go and clear the pgtable hash
 795         * index info.
 796         */
 797        pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
 798        pgtable = *pgtable_slot;
 799        /*
  800         * Zero out the old valid and hash index details so that
  801         * a parallel hash fault won't look at them.
 802         */
 803        memset(pgtable, 0, PTE_FRAG_SIZE);
 804        /*
 805         * Serialize against find_linux_pte_or_hugepte which does lock-less
 806         * lookup in page tables with local interrupts disabled. For huge pages
 807         * it casts pmd_t to pte_t. Since format of pte_t is different from
 808         * pmd_t we want to prevent transit from pmd pointing to page table
 809         * to pmd pointing to huge page (and back) while interrupts are disabled.
 810         * We clear pmd to possibly replace it with page table pointer in
 811         * different code paths. So make sure we wait for the parallel
  812         * find_linux_pte_or_hugepte to finish.
 813         */
 814        kick_all_cpus_sync();
 815        return old_pmd;
 816}
 817
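     /*
      * THP is reported as available only when the MMU supports 16MB pages,
      * PMD_SIZE is 16MB, and a 16MB penc encoding exists for the segment
      * base page size in use.
      */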
 818int has_transparent_hugepage(void)
 819{
 820
 821        BUILD_BUG_ON_MSG((PMD_SHIFT - PAGE_SHIFT) >= MAX_ORDER,
 822                "hugepages can't be allocated by the buddy allocator");
 823
 824        BUILD_BUG_ON_MSG((PMD_SHIFT - PAGE_SHIFT) < 2,
 825                         "We need more than 2 pages to do deferred thp split");
 826
 827        if (!mmu_has_feature(MMU_FTR_16M_PAGE))
 828                return 0;
 829        /*
 830         * We support THP only if PMD_SIZE is 16MB.
 831         */
 832        if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT)
 833                return 0;
 834        /*
  835         * We need to make sure that we support a 16MB hugepage in a segment
 836         * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE
 837         * of 64K.
 838         */
 839        /*
 840         * If we have 64K HPTE, we will be using that by default
 841         */
 842        if (mmu_psize_defs[MMU_PAGE_64K].shift &&
 843            (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1))
 844                return 0;
 845        /*
 846         * Ok we only have 4K HPTE
 847         */
 848        if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1)
 849                return 0;
 850
 851        return 1;
 852}
 853#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 854