linux/arch/powerpc/mm/pgtable-hash64.c
<<
>>
Prefs
   1/*
   2 * Copyright 2005, Paul Mackerras, IBM Corporation.
   3 * Copyright 2009, Benjamin Herrenschmidt, IBM Corporation.
   4 * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
   5 *
   6 * This program is free software; you can redistribute it and/or
   7 * modify it under the terms of the GNU General Public License
   8 * as published by the Free Software Foundation; either version
   9 * 2 of the License, or (at your option) any later version.
  10 */
  11
  12#include <linux/sched.h>
  13#include <asm/pgalloc.h>
  14#include <asm/tlb.h>
  15
  16#include "mmu_decl.h"
  17
  18#define CREATE_TRACE_POINTS
  19#include <trace/events/thp.h>
  20
  21#ifdef CONFIG_SPARSEMEM_VMEMMAP
  22/*
  23 * On hash-based CPUs, the vmemmap is bolted in the hash table.
  24 *
  25 */
  26int __meminit hash__vmemmap_create_mapping(unsigned long start,
  27                                       unsigned long page_size,
  28                                       unsigned long phys)
  29{
  30        int rc = htab_bolt_mapping(start, start + page_size, phys,
  31                                   pgprot_val(PAGE_KERNEL),
  32                                   mmu_vmemmap_psize, mmu_kernel_ssize);
  33        if (rc < 0) {
  34                int rc2 = htab_remove_mapping(start, start + page_size,
  35                                              mmu_vmemmap_psize,
  36                                              mmu_kernel_ssize);
  37                BUG_ON(rc2 && (rc2 != -ENOENT));
  38        }
  39        return rc;
  40}
  41
  42#ifdef CONFIG_MEMORY_HOTPLUG
  43void hash__vmemmap_remove_mapping(unsigned long start,
  44                              unsigned long page_size)
  45{
  46        int rc = htab_remove_mapping(start, start + page_size,
  47                                     mmu_vmemmap_psize,
  48                                     mmu_kernel_ssize);
  49        BUG_ON((rc < 0) && (rc != -ENOENT));
  50        WARN_ON(rc == -ENOENT);
  51}
  52#endif
  53#endif /* CONFIG_SPARSEMEM_VMEMMAP */
  54
  55/*
  56 * map_kernel_page currently only called by __ioremap
  57 * map_kernel_page adds an entry to the ioremap page table
  58 * and adds an entry to the HPT, possibly bolting it
  59 */
  60int hash__map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flags)
  61{
  62        pgd_t *pgdp;
  63        pud_t *pudp;
  64        pmd_t *pmdp;
  65        pte_t *ptep;
  66
  67        BUILD_BUG_ON(TASK_SIZE_USER64 > H_PGTABLE_RANGE);
  68        if (slab_is_available()) {
  69                pgdp = pgd_offset_k(ea);
  70                pudp = pud_alloc(&init_mm, pgdp, ea);
  71                if (!pudp)
  72                        return -ENOMEM;
  73                pmdp = pmd_alloc(&init_mm, pudp, ea);
  74                if (!pmdp)
  75                        return -ENOMEM;
  76                ptep = pte_alloc_kernel(pmdp, ea);
  77                if (!ptep)
  78                        return -ENOMEM;
  79                set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
  80                                                          __pgprot(flags)));
  81        } else {
  82                /*
  83                 * If the mm subsystem is not fully up, we cannot create a
  84                 * linux page table entry for this mapping.  Simply bolt an
  85                 * entry in the hardware page table.
  86                 *
  87                 */
  88                if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, flags,
  89                                      mmu_io_psize, mmu_kernel_ssize)) {
  90                        printk(KERN_ERR "Failed to do bolted mapping IO "
  91                               "memory at %016lx !\n", pa);
  92                        return -ENOMEM;
  93                }
  94        }
  95
  96        smp_wmb();
  97        return 0;
  98}
  99
 100#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 101
 102unsigned long hash__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
 103                                    pmd_t *pmdp, unsigned long clr,
 104                                    unsigned long set)
 105{
 106        __be64 old_be, tmp;
 107        unsigned long old;
 108
 109#ifdef CONFIG_DEBUG_VM
 110        WARN_ON(!pmd_trans_huge(*pmdp));
 111        assert_spin_locked(&mm->page_table_lock);
 112#endif
 113
 114        __asm__ __volatile__(
 115        "1:     ldarx   %0,0,%3\n\
 116                and.    %1,%0,%6\n\
 117                bne-    1b \n\
 118                andc    %1,%0,%4 \n\
 119                or      %1,%1,%7\n\
 120                stdcx.  %1,0,%3 \n\
 121                bne-    1b"
 122        : "=&r" (old_be), "=&r" (tmp), "=m" (*pmdp)
 123        : "r" (pmdp), "r" (cpu_to_be64(clr)), "m" (*pmdp),
 124          "r" (cpu_to_be64(H_PAGE_BUSY)), "r" (cpu_to_be64(set))
 125        : "cc" );
 126
 127        old = be64_to_cpu(old_be);
 128
 129        trace_hugepage_update(addr, old, clr, set);
 130        if (old & H_PAGE_HASHPTE)
 131                hpte_do_hugepage_flush(mm, addr, pmdp, old);
 132        return old;
 133}
 134
 135pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
 136                            pmd_t *pmdp)
 137{
 138        pmd_t pmd;
 139
 140        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 141        VM_BUG_ON(pmd_trans_huge(*pmdp));
 142
 143        pmd = *pmdp;
 144        pmd_clear(pmdp);
 145        /*
 146         * Wait for all pending hash_page to finish. This is needed
 147         * in case of subpage collapse. When we collapse normal pages
 148         * to hugepage, we first clear the pmd, then invalidate all
 149         * the PTE entries. The assumption here is that any low level
 150         * page fault will see a none pmd and take the slow path that
 151         * will wait on mmap_sem. But we could very well be in a
 152         * hash_page with local ptep pointer value. Such a hash page
 153         * can result in adding new HPTE entries for normal subpages.
 154         * That means we could be modifying the page content as we
 155         * copy them to a huge page. So wait for parallel hash_page
 156         * to finish before invalidating HPTE entries. We can do this
 157         * by sending an IPI to all the cpus and executing a dummy
 158         * function there.
 159         */
 160        kick_all_cpus_sync();
 161        /*
 162         * Now invalidate the hpte entries in the range
 163         * covered by pmd. This make sure we take a
 164         * fault and will find the pmd as none, which will
 165         * result in a major fault which takes mmap_sem and
 166         * hence wait for collapse to complete. Without this
 167         * the __collapse_huge_page_copy can result in copying
 168         * the old content.
 169         */
 170        flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
 171        return pmd;
 172}
 173
 174/*
 175 * We want to put the pgtable in pmd and use pgtable for tracking
 176 * the base page size hptes
 177 */
 178void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
 179                                  pgtable_t pgtable)
 180{
 181        pgtable_t *pgtable_slot;
 182        assert_spin_locked(&mm->page_table_lock);
 183        /*
 184         * we store the pgtable in the second half of PMD
 185         */
 186        pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
 187        *pgtable_slot = pgtable;
 188        /*
 189         * expose the deposited pgtable to other cpus.
 190         * before we set the hugepage PTE at pmd level
 191         * hash fault code looks at the deposted pgtable
 192         * to store hash index values.
 193         */
 194        smp_wmb();
 195}
 196
 197pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
 198{
 199        pgtable_t pgtable;
 200        pgtable_t *pgtable_slot;
 201
 202        assert_spin_locked(&mm->page_table_lock);
 203        pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
 204        pgtable = *pgtable_slot;
 205        /*
 206         * Once we withdraw, mark the entry NULL.
 207         */
 208        *pgtable_slot = NULL;
 209        /*
 210         * We store HPTE information in the deposited PTE fragment.
 211         * zero out the content on withdraw.
 212         */
 213        memset(pgtable, 0, PTE_FRAG_SIZE);
 214        return pgtable;
 215}
 216
 217void hash__pmdp_huge_split_prepare(struct vm_area_struct *vma,
 218                               unsigned long address, pmd_t *pmdp)
 219{
 220        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 221        VM_BUG_ON(REGION_ID(address) != USER_REGION_ID);
 222
 223        /*
 224         * We can't mark the pmd none here, because that will cause a race
 225         * against exit_mmap. We need to continue mark pmd TRANS HUGE, while
 226         * we spilt, but at the same time we wan't rest of the ppc64 code
 227         * not to insert hash pte on this, because we will be modifying
 228         * the deposited pgtable in the caller of this function. Hence
 229         * clear the _PAGE_USER so that we move the fault handling to
 230         * higher level function and that will serialize against ptl.
 231         * We need to flush existing hash pte entries here even though,
 232         * the translation is still valid, because we will withdraw
 233         * pgtable_t after this.
 234         */
 235        pmd_hugepage_update(vma->vm_mm, address, pmdp, 0, _PAGE_PRIVILEGED);
 236}
 237
 238/*
 239 * A linux hugepage PMD was changed and the corresponding hash table entries
 240 * neesd to be flushed.
 241 */
 242void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
 243                            pmd_t *pmdp, unsigned long old_pmd)
 244{
 245        int ssize;
 246        unsigned int psize;
 247        unsigned long vsid;
 248        unsigned long flags = 0;
 249        const struct cpumask *tmp;
 250
 251        /* get the base page size,vsid and segment size */
 252#ifdef CONFIG_DEBUG_VM
 253        psize = get_slice_psize(mm, addr);
 254        BUG_ON(psize == MMU_PAGE_16M);
 255#endif
 256        if (old_pmd & H_PAGE_COMBO)
 257                psize = MMU_PAGE_4K;
 258        else
 259                psize = MMU_PAGE_64K;
 260
 261        if (!is_kernel_addr(addr)) {
 262                ssize = user_segment_size(addr);
 263                vsid = get_vsid(mm->context.id, addr, ssize);
 264                WARN_ON(vsid == 0);
 265        } else {
 266                vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
 267                ssize = mmu_kernel_ssize;
 268        }
 269
 270        tmp = cpumask_of(smp_processor_id());
 271        if (cpumask_equal(mm_cpumask(mm), tmp))
 272                flags |= HPTE_LOCAL_UPDATE;
 273
 274        return flush_hash_hugepage(vsid, addr, pmdp, psize, ssize, flags);
 275}
 276
 277pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm,
 278                                unsigned long addr, pmd_t *pmdp)
 279{
 280        pmd_t old_pmd;
 281        pgtable_t pgtable;
 282        unsigned long old;
 283        pgtable_t *pgtable_slot;
 284
 285        old = pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
 286        old_pmd = __pmd(old);
 287        /*
 288         * We have pmd == none and we are holding page_table_lock.
 289         * So we can safely go and clear the pgtable hash
 290         * index info.
 291         */
 292        pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
 293        pgtable = *pgtable_slot;
 294        /*
 295         * Let's zero out old valid and hash index details
 296         * hash fault look at them.
 297         */
 298        memset(pgtable, 0, PTE_FRAG_SIZE);
 299        /*
 300         * Serialize against find_linux_pte_or_hugepte which does lock-less
 301         * lookup in page tables with local interrupts disabled. For huge pages
 302         * it casts pmd_t to pte_t. Since format of pte_t is different from
 303         * pmd_t we want to prevent transit from pmd pointing to page table
 304         * to pmd pointing to huge page (and back) while interrupts are disabled.
 305         * We clear pmd to possibly replace it with page table pointer in
 306         * different code paths. So make sure we wait for the parallel
 307         * find_linux_pte_or_hugepage to finish.
 308         */
 309        kick_all_cpus_sync();
 310        return old_pmd;
 311}
 312
 313int hash__has_transparent_hugepage(void)
 314{
 315
 316        if (!mmu_has_feature(MMU_FTR_16M_PAGE))
 317                return 0;
 318        /*
 319         * We support THP only if PMD_SIZE is 16MB.
 320         */
 321        if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT)
 322                return 0;
 323        /*
 324         * We need to make sure that we support 16MB hugepage in a segement
 325         * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE
 326         * of 64K.
 327         */
 328        /*
 329         * If we have 64K HPTE, we will be using that by default
 330         */
 331        if (mmu_psize_defs[MMU_PAGE_64K].shift &&
 332            (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1))
 333                return 0;
 334        /*
 335         * Ok we only have 4K HPTE
 336         */
 337        if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1)
 338                return 0;
 339
 340        return 1;
 341}
 342#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 343