linux/arch/powerpc/mm/pgtable.c
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * This file contains common routines for dealing with the freeing of page
 * tables, along with common page table handling code.
 *
 *  Derived from arch/powerpc/mm/tlb_64.c:
 *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 *  Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
 *  and Cort Dougan (PReP) (cort@cs.nmt.edu)
 *    Copyright (C) 1996 Paul Mackerras
 *
 *  Derived from "arch/i386/mm/init.c"
 *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Dave Engebretsen <engebret@us.ibm.com>
 *      Rework for PPC64 port.
 */

#include <linux/kernel.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/hugetlb.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include <asm/hugetlb.h>
#include <asm/pte-walk.h>

#ifdef CONFIG_PPC64
#define PGD_ALIGN (sizeof(pgd_t) * MAX_PTRS_PER_PGD)
#else
#define PGD_ALIGN PAGE_SIZE
#endif

/* The kernel's initial top-level page table */
pgd_t swapper_pg_dir[MAX_PTRS_PER_PGD] __section(".bss..page_aligned") __aligned(PGD_ALIGN);

/* An exec fault is reported as a 0x400 (instruction storage) interrupt */
static inline int is_exec_fault(void)
{
        return current->thread.regs && TRAP(current->thread.regs) == 0x400;
}

/* We only try to do i/d cache coherency on stuff that looks like
 * reasonably "normal" PTEs. We currently require a PTE to be present,
 * we avoid _PAGE_SPECIAL and cache-inhibited PTEs, and we only do it
 * for userspace PTEs.
 */
static inline int pte_looks_normal(pte_t pte)
{
        if (pte_present(pte) && !pte_special(pte)) {
                if (pte_ci(pte))
                        return 0;
                if (pte_user(pte))
                        return 1;
        }
        return 0;
}

/*
 * Return the struct page for a PTE's pfn, or NULL for invalid pfns
 * and reserved pages.
 */
static struct page *maybe_pte_to_page(pte_t pte)
{
        unsigned long pfn = pte_pfn(pte);
        struct page *page;

        if (unlikely(!pfn_valid(pfn)))
                return NULL;
        page = pfn_to_page(pfn);
        if (PageReserved(page))
                return NULL;
        return page;
}

#ifdef CONFIG_PPC_BOOK3S

/* Server-style MMU handles coherency when hashing if HW exec permission
 * is supported per page (currently 64-bit only). If not, we always
 * flush the cache for valid PTEs in set_pte. Embedded CPUs without HW exec
 * support fall into the same category.
 */

static pte_t set_pte_filter_hash(pte_t pte)
{
        if (radix_enabled())
                return pte;

        pte = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
        if (pte_looks_normal(pte) && !(cpu_has_feature(CPU_FTR_COHERENT_ICACHE) ||
                                       cpu_has_feature(CPU_FTR_NOEXECUTE))) {
                struct page *pg = maybe_pte_to_page(pte);

                if (!pg)
                        return pte;
                if (!test_bit(PG_dcache_clean, &pg->flags)) {
                        flush_dcache_icache_page(pg);
                        set_bit(PG_dcache_clean, &pg->flags);
                }
        }
        return pte;
}

#else /* CONFIG_PPC_BOOK3S */

static pte_t set_pte_filter_hash(pte_t pte) { return pte; }

#endif /* CONFIG_PPC_BOOK3S */

/* Embedded-type MMU with HW exec support. This is a bit more complicated
 * as we don't have two bits to spare for _PAGE_EXEC and _PAGE_HWEXEC, so
 * instead we "filter out" the exec permission for non-clean pages.
 */
static inline pte_t set_pte_filter(pte_t pte)
{
        struct page *pg;

        if (mmu_has_feature(MMU_FTR_HPTE_TABLE))
                return set_pte_filter_hash(pte);

        /* No exec permission in the first place, move on */
        if (!pte_exec(pte) || !pte_looks_normal(pte))
                return pte;

        /* If you set _PAGE_EXEC on weird pages you're on your own */
        pg = maybe_pte_to_page(pte);
        if (unlikely(!pg))
                return pte;

        /* If the page is clean, we move on */
        if (test_bit(PG_dcache_clean, &pg->flags))
                return pte;

        /* If it's an exec fault, we flush the cache and make it clean */
        if (is_exec_fault()) {
                flush_dcache_icache_page(pg);
                set_bit(PG_dcache_clean, &pg->flags);
                return pte;
        }

        /* Else, we filter out _PAGE_EXEC */
        return pte_exprotect(pte);
}

static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma,
                                     int dirty)
{
        struct page *pg;

        if (mmu_has_feature(MMU_FTR_HPTE_TABLE))
                return pte;

        /* So here, we only care about exec faults, as we use them
         * to recover lost _PAGE_EXEC and perform I$/D$ coherency
         * if necessary. Also if _PAGE_EXEC is already set, same deal,
         * we just bail out.
         */
        if (dirty || pte_exec(pte) || !is_exec_fault())
                return pte;

#ifdef CONFIG_DEBUG_VM
        /* So this is an exec fault, _PAGE_EXEC is not set. If it was
         * an error we would have bailed out earlier in do_page_fault()
         * but let's make sure of it.
         */
        if (WARN_ON(!(vma->vm_flags & VM_EXEC)))
                return pte;
#endif /* CONFIG_DEBUG_VM */

        /* If you set _PAGE_EXEC on weird pages you're on your own */
        pg = maybe_pte_to_page(pte);
        if (unlikely(!pg))
                goto bail;

        /* If the page is already clean, we move on */
        if (test_bit(PG_dcache_clean, &pg->flags))
                goto bail;

        /* Clean the page and set PG_dcache_clean */
        flush_dcache_icache_page(pg);
        set_bit(PG_dcache_clean, &pg->flags);

 bail:
        return pte_mkexec(pte);
}
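
/*
 * Putting the two filters above together (embedded MMUs with HW exec
 * support, roughly): set_pte_filter() strips _PAGE_EXEC from PTEs whose
 * page is not yet PG_dcache_clean, unless the access is itself an exec
 * fault, in which case it flushes the caches and marks the page clean.
 * A later exec fault on the same page then reaches
 * set_access_flags_filter(), which cleans the page if needed and restores
 * _PAGE_EXEC via pte_mkexec().
 */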

/*
 * set_pte stores a linux PTE into the linux page table.
 */
void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
                pte_t pte)
{
        /*
         * Make sure hardware valid bit is not set. We don't do
         * tlb flush for this update.
         */
        VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep));

        /* Note: mm->context.id might not yet have been assigned as
         * this context might not have been activated yet when this
         * is called.
         */
        pte = set_pte_filter(pte);

        /* Perform the setting of the PTE */
        __set_pte_at(mm, addr, ptep, pte, 0);
}
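
/*
 * Note: as the VM_WARN_ON above suggests, set_pte_at() is only meant to
 * establish a PTE that is not currently valid in hardware; updating an
 * already-valid PTE goes through paths such as ptep_set_access_flags()
 * below, which handle the required TLB maintenance.
 */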

/*
 * This is called when relaxing access to a PTE. It's also called in the page
 * fault path when we don't hit any of the major fault cases, i.e. a minor
 * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc. The generic code will have
 * handled those two for us; we additionally deal with missing execute
 * permission here on some processors.
 */
int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address,
                          pte_t *ptep, pte_t entry, int dirty)
{
        int changed;

        entry = set_access_flags_filter(entry, vma, dirty);
        changed = !pte_same(*(ptep), entry);
        if (changed) {
                assert_pte_locked(vma->vm_mm, address);
                __ptep_set_access_flags(vma, ptep, entry,
                                        address, mmu_virtual_psize);
        }
        return changed;
}

#ifdef CONFIG_HUGETLB_PAGE
int huge_ptep_set_access_flags(struct vm_area_struct *vma,
                               unsigned long addr, pte_t *ptep,
                               pte_t pte, int dirty)
{
#ifdef HUGETLB_NEED_PRELOAD
        /*
         * The "return 1" forces a call of update_mmu_cache, which will write a
         * TLB entry.  Without this, platforms that don't do a write of the TLB
         * entry in the TLB miss handler asm will fault ad infinitum.
         */
        ptep_set_access_flags(vma, addr, ptep, pte, dirty);
        return 1;
#else
        int changed, psize;

        pte = set_access_flags_filter(pte, vma, dirty);
        changed = !pte_same(*(ptep), pte);
        if (changed) {

#ifdef CONFIG_PPC_BOOK3S_64
                struct hstate *h = hstate_vma(vma);

                psize = hstate_get_psize(h);
#ifdef CONFIG_DEBUG_VM
                assert_spin_locked(huge_pte_lockptr(h, vma->vm_mm, ptep));
#endif

#else
                /*
                 * Not used on non-book3s64 platforms.
                 * 8xx compares it with mmu_virtual_psize to
                 * know if it is a huge page or not.
                 */
                psize = MMU_PAGE_COUNT;
#endif
                __ptep_set_access_flags(vma, ptep, pte, addr, psize);
        }
        return changed;
#endif
}

#if defined(CONFIG_PPC_8xx)
void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
{
        pmd_t *pmd = pmd_off(mm, addr);
        pte_basic_t val;
        pte_basic_t *entry = &ptep->pte;
        int num, i;

        /*
         * Make sure hardware valid bit is not set. We don't do
         * tlb flush for this update.
         */
        VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep));

        pte = set_pte_filter(pte);

        val = pte_val(pte);

        num = number_of_cells_per_pte(pmd, val, 1);

        /*
         * The huge mapping is represented by replicating the PTE into each
         * 4k cell it covers, advancing the physical address by 4k for each
         * copy.
         */
        for (i = 0; i < num; i++, entry++, val += SZ_4K)
                *entry = val;
}
#endif
#endif /* CONFIG_HUGETLB_PAGE */

#ifdef CONFIG_DEBUG_VM
void assert_pte_locked(struct mm_struct *mm, unsigned long addr)
{
        pgd_t *pgd;
        p4d_t *p4d;
        pud_t *pud;
        pmd_t *pmd;

        if (mm == &init_mm)
                return;
        pgd = mm->pgd + pgd_index(addr);
        BUG_ON(pgd_none(*pgd));
        p4d = p4d_offset(pgd, addr);
        BUG_ON(p4d_none(*p4d));
        pud = pud_offset(p4d, addr);
        BUG_ON(pud_none(*pud));
        pmd = pmd_offset(pud, addr);
        /*
         * To collapse normal pages into a hugepage, khugepaged first sets
         * the pmd to none to force page fault/gup to take mmap_lock. After
         * the pmd is set to none, it does a pte_clear which ends up in this
         * assertion, so if we find the pmd none, just return.
         */
        if (pmd_none(*pmd))
                return;
        BUG_ON(!pmd_present(*pmd));
        assert_spin_locked(pte_lockptr(mm, pmd));
}
#endif /* CONFIG_DEBUG_VM */

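/*
 * Return the physical address backing an address in the vmalloc area
 * (pfn of the mapped page plus the offset within that page).
 */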
unsigned long vmalloc_to_phys(void *va)
{
        unsigned long pfn = vmalloc_to_pfn(va);

        BUG_ON(!pfn);
        return __pa(pfn_to_kaddr(pfn)) + offset_in_page(va);
}
EXPORT_SYMBOL_GPL(vmalloc_to_phys);

/*
 * We have 4 cases for pgds and pmds:
 * (1) invalid (all zeroes)
 * (2) pointer to next table, as normal; bottom 6 bits == 0
 * (3) leaf pte for huge page, _PAGE_PTE set
 * (4) hugepd pointer, _PAGE_PTE = 0 and bits [2..6] indicate size of table
 *
 * So long as we atomically load page table pointers we are safe against
 * teardown, and we can follow the address down to the page and take a ref
 * on it. This function needs to be called with interrupts disabled. We use
 * this variant when we have MSR[EE] = 0 but paca->irq_soft_mask = IRQS_ENABLED.
 */
pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
                        bool *is_thp, unsigned *hpage_shift)
{
        pgd_t *pgdp;
        p4d_t p4d, *p4dp;
        pud_t pud, *pudp;
        pmd_t pmd, *pmdp;
        pte_t *ret_pte;
        hugepd_t *hpdp = NULL;
        unsigned pdshift;

        if (hpage_shift)
                *hpage_shift = 0;

        if (is_thp)
                *is_thp = false;

        /*
         * Always operate on the local stack value. This makes sure the
         * value doesn't get updated by a parallel THP split/collapse,
         * page fault or page unmap. The returned pte_t * is still not
         * stable, so the caller must re-check it for the above conditions.
         * The top level is an exception because it is folded into the p4d.
         */
        pgdp = pgdir + pgd_index(ea);
        p4dp = p4d_offset(pgdp, ea);
        p4d  = READ_ONCE(*p4dp);
        pdshift = P4D_SHIFT;

        if (p4d_none(p4d))
                return NULL;

        if (p4d_is_leaf(p4d)) {
                ret_pte = (pte_t *)p4dp;
                goto out;
        }

        if (is_hugepd(__hugepd(p4d_val(p4d)))) {
                hpdp = (hugepd_t *)&p4d;
                goto out_huge;
        }

        /*
         * Even if we end up with an unmap, the pgtable will not
         * be freed, because we do an RCU free and here we have
         * interrupts disabled.
         */
        pdshift = PUD_SHIFT;
        pudp = pud_offset(&p4d, ea);
        pud  = READ_ONCE(*pudp);

        if (pud_none(pud))
                return NULL;

        if (pud_is_leaf(pud)) {
                ret_pte = (pte_t *)pudp;
                goto out;
        }

        if (is_hugepd(__hugepd(pud_val(pud)))) {
                hpdp = (hugepd_t *)&pud;
                goto out_huge;
        }

        pdshift = PMD_SHIFT;
        pmdp = pmd_offset(&pud, ea);
        pmd  = READ_ONCE(*pmdp);

        /*
         * A hugepage collapse is captured by this condition, see
         * pmdp_collapse_flush.
         */
        if (pmd_none(pmd))
                return NULL;

#ifdef CONFIG_PPC_BOOK3S_64
        /*
         * A hugepage split is captured by this condition, see
         * pmdp_invalidate.
         *
         * Huge page modification can be caught here too.
         */
        if (pmd_is_serializing(pmd))
                return NULL;
#endif

        if (pmd_trans_huge(pmd) || pmd_devmap(pmd)) {
                if (is_thp)
                        *is_thp = true;
                ret_pte = (pte_t *)pmdp;
                goto out;
        }

        if (pmd_is_leaf(pmd)) {
                ret_pte = (pte_t *)pmdp;
                goto out;
        }

        if (is_hugepd(__hugepd(pmd_val(pmd)))) {
                hpdp = (hugepd_t *)&pmd;
                goto out_huge;
        }

        return pte_offset_kernel(&pmd, ea);

out_huge:
        if (!hpdp)
                return NULL;

        ret_pte = hugepte_offset(*hpdp, ea, pdshift);
        pdshift = hugepd_shift(*hpdp);
out:
        if (hpage_shift)
                *hpage_shift = pdshift;
        return ret_pte;
}
EXPORT_SYMBOL_GPL(__find_linux_pte);

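/*
 * Usage sketch (see the find_linux_pte() wrapper in asm/pte-walk.h): the
 * caller must keep interrupts disabled across the walk and must not keep
 * using the returned pointer once they are re-enabled, e.g.:
 *
 *      local_irq_save(flags);
 *      ptep = __find_linux_pte(mm->pgd, addr, &is_thp, &hshift);
 *      if (ptep)
 *              pte = READ_ONCE(*ptep);
 *      local_irq_restore(flags);
 */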