linux/arch/powerpc/mm/book3s64/radix_pgtable.c
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * Page table handling routines for radix page table.
   4 *
   5 * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
   6 */
   7
   8#define pr_fmt(fmt) "radix-mmu: " fmt
   9
  10#include <linux/io.h>
  11#include <linux/kernel.h>
  12#include <linux/sched/mm.h>
  13#include <linux/memblock.h>
  14#include <linux/of_fdt.h>
  15#include <linux/mm.h>
  16#include <linux/hugetlb.h>
  17#include <linux/string_helpers.h>
  18#include <linux/stop_machine.h>
  19
  20#include <asm/pgalloc.h>
  21#include <asm/mmu_context.h>
  22#include <asm/dma.h>
  23#include <asm/machdep.h>
  24#include <asm/mmu.h>
  25#include <asm/firmware.h>
  26#include <asm/powernv.h>
  27#include <asm/sections.h>
  28#include <asm/smp.h>
  29#include <asm/trace.h>
  30#include <asm/uaccess.h>
  31#include <asm/ultravisor.h>
  32
  33#include <trace/events/thp.h>
  34
  35unsigned int mmu_pid_bits;
  36unsigned int mmu_base_pid;
  37
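     /*
      * Boot-time page table allocator: returns naturally aligned memory
      * from memblock, optionally restricted to [region_start, region_end)
      * and a preferred node, and panics on failure since there is no way
      * to recover this early in boot.
      */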
  38static __ref void *early_alloc_pgtable(unsigned long size, int nid,
  39                        unsigned long region_start, unsigned long region_end)
  40{
  41        phys_addr_t min_addr = MEMBLOCK_LOW_LIMIT;
  42        phys_addr_t max_addr = MEMBLOCK_ALLOC_ANYWHERE;
  43        void *ptr;
  44
  45        if (region_start)
  46                min_addr = region_start;
  47        if (region_end)
  48                max_addr = region_end;
  49
  50        ptr = memblock_alloc_try_nid(size, size, min_addr, max_addr, nid);
  51
  52        if (!ptr)
  53                panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa max_addr=%pa\n",
  54                      __func__, size, size, nid, &min_addr, &max_addr);
  55
  56        return ptr;
  57}
  58
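     /*
      * Map a single kernel page before slab is available, allocating any
      * missing intermediate tables with early_alloc_pgtable(). When
      * map_page_size is PUD_SIZE or PMD_SIZE the leaf entry is written at
      * that level instead of descending to a PTE.
      */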
  59static int early_map_kernel_page(unsigned long ea, unsigned long pa,
  60                          pgprot_t flags,
  61                          unsigned int map_page_size,
  62                          int nid,
  63                          unsigned long region_start, unsigned long region_end)
  64{
  65        unsigned long pfn = pa >> PAGE_SHIFT;
  66        pgd_t *pgdp;
  67        p4d_t *p4dp;
  68        pud_t *pudp;
  69        pmd_t *pmdp;
  70        pte_t *ptep;
  71
  72        pgdp = pgd_offset_k(ea);
  73        p4dp = p4d_offset(pgdp, ea);
  74        if (p4d_none(*p4dp)) {
  75                pudp = early_alloc_pgtable(PUD_TABLE_SIZE, nid,
  76                                                region_start, region_end);
  77                p4d_populate(&init_mm, p4dp, pudp);
  78        }
  79        pudp = pud_offset(p4dp, ea);
  80        if (map_page_size == PUD_SIZE) {
  81                ptep = (pte_t *)pudp;
  82                goto set_the_pte;
  83        }
  84        if (pud_none(*pudp)) {
  85                pmdp = early_alloc_pgtable(PMD_TABLE_SIZE, nid,
  86                                                region_start, region_end);
  87                pud_populate(&init_mm, pudp, pmdp);
  88        }
  89        pmdp = pmd_offset(pudp, ea);
  90        if (map_page_size == PMD_SIZE) {
  91                ptep = pmdp_ptep(pmdp);
  92                goto set_the_pte;
  93        }
  94        if (!pmd_present(*pmdp)) {
  95                ptep = early_alloc_pgtable(PAGE_SIZE, nid,
  96                                                region_start, region_end);
  97                pmd_populate_kernel(&init_mm, pmdp, ptep);
  98        }
  99        ptep = pte_offset_kernel(pmdp, ea);
 100
 101set_the_pte:
 102        set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
 103        smp_wmb();
 104        return 0;
 105}
 106
 107/*
 108 * nid, region_start, and region_end are hints to try to place the page
 109 * table memory in the same node or region.
 110 */
 111static int __map_kernel_page(unsigned long ea, unsigned long pa,
 112                          pgprot_t flags,
 113                          unsigned int map_page_size,
 114                          int nid,
 115                          unsigned long region_start, unsigned long region_end)
 116{
 117        unsigned long pfn = pa >> PAGE_SHIFT;
 118        pgd_t *pgdp;
 119        p4d_t *p4dp;
 120        pud_t *pudp;
 121        pmd_t *pmdp;
 122        pte_t *ptep;
 123        /*
 124         * Make sure task size is correct as per the max addr
 125         */
 126        BUILD_BUG_ON(TASK_SIZE_USER64 > RADIX_PGTABLE_RANGE);
 127
 128#ifdef CONFIG_PPC_64K_PAGES
 129        BUILD_BUG_ON(RADIX_KERN_MAP_SIZE != (1UL << MAX_EA_BITS_PER_CONTEXT));
 130#endif
 131
 132        if (unlikely(!slab_is_available()))
 133                return early_map_kernel_page(ea, pa, flags, map_page_size,
 134                                                nid, region_start, region_end);
 135
 136        /*
 137         * We should make the page table allocation functions able to take a
 138         * node, so we can place kernel page tables on the right nodes after
 139         * boot.
 140         */
 141        pgdp = pgd_offset_k(ea);
 142        p4dp = p4d_offset(pgdp, ea);
 143        pudp = pud_alloc(&init_mm, p4dp, ea);
 144        if (!pudp)
 145                return -ENOMEM;
 146        if (map_page_size == PUD_SIZE) {
 147                ptep = (pte_t *)pudp;
 148                goto set_the_pte;
 149        }
 150        pmdp = pmd_alloc(&init_mm, pudp, ea);
 151        if (!pmdp)
 152                return -ENOMEM;
 153        if (map_page_size == PMD_SIZE) {
 154                ptep = pmdp_ptep(pmdp);
 155                goto set_the_pte;
 156        }
 157        ptep = pte_alloc_kernel(pmdp, ea);
 158        if (!ptep)
 159                return -ENOMEM;
 160
 161set_the_pte:
 162        set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
 163        smp_wmb();
 164        return 0;
 165}
 166
 167int radix__map_kernel_page(unsigned long ea, unsigned long pa,
 168                          pgprot_t flags,
 169                          unsigned int map_page_size)
 170{
 171        return __map_kernel_page(ea, pa, flags, map_page_size, -1, 0, 0);
 172}
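
     /*
      * Illustrative use only (hypothetical addresses): map one base-size
      * kernel page read/write into the linear region.
      *
      *	rc = radix__map_kernel_page(0xc000000002000000UL,
      *				    0x0000000002000000UL,
      *				    PAGE_KERNEL, PAGE_SIZE);
      *	if (rc)
      *		pr_err("radix kernel mapping failed\n");
      */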
 173
 174#ifdef CONFIG_STRICT_KERNEL_RWX
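     /*
      * Walk the kernel mapping for [start, end) and clear the given
      * protection bits (e.g. _PAGE_WRITE or _PAGE_EXEC) from every leaf
      * entry found, then flush the kernel TLB for that range. Used to
      * enforce STRICT_KERNEL_RWX below.
      */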
 175void radix__change_memory_range(unsigned long start, unsigned long end,
 176                                unsigned long clear)
 177{
 178        unsigned long idx;
 179        pgd_t *pgdp;
 180        p4d_t *p4dp;
 181        pud_t *pudp;
 182        pmd_t *pmdp;
 183        pte_t *ptep;
 184
 185        start = ALIGN_DOWN(start, PAGE_SIZE);
 186        end = PAGE_ALIGN(end); // aligns up
 187
 188        pr_debug("Changing flags on range %lx-%lx removing 0x%lx\n",
 189                 start, end, clear);
 190
 191        for (idx = start; idx < end; idx += PAGE_SIZE) {
 192                pgdp = pgd_offset_k(idx);
 193                p4dp = p4d_offset(pgdp, idx);
 194                pudp = pud_alloc(&init_mm, p4dp, idx);
 195                if (!pudp)
 196                        continue;
 197                if (pud_is_leaf(*pudp)) {
 198                        ptep = (pte_t *)pudp;
 199                        goto update_the_pte;
 200                }
 201                pmdp = pmd_alloc(&init_mm, pudp, idx);
 202                if (!pmdp)
 203                        continue;
 204                if (pmd_is_leaf(*pmdp)) {
 205                        ptep = pmdp_ptep(pmdp);
 206                        goto update_the_pte;
 207                }
 208                ptep = pte_alloc_kernel(pmdp, idx);
 209                if (!ptep)
 210                        continue;
 211update_the_pte:
 212                radix__pte_update(&init_mm, idx, ptep, clear, 0, 0);
 213        }
 214
 215        radix__flush_tlb_kernel_range(start, end);
 216}
 217
 218void radix__mark_rodata_ro(void)
 219{
 220        unsigned long start, end;
 221
 222        start = (unsigned long)_stext;
 223        end = (unsigned long)__init_begin;
 224
 225        radix__change_memory_range(start, end, _PAGE_WRITE);
 226}
 227
 228void radix__mark_initmem_nx(void)
 229{
 230        unsigned long start = (unsigned long)__init_begin;
 231        unsigned long end = (unsigned long)__init_end;
 232
 233        radix__change_memory_range(start, end, _PAGE_EXEC);
 234}
 235#endif /* CONFIG_STRICT_KERNEL_RWX */
 236
 237static inline void __meminit
 238print_mapping(unsigned long start, unsigned long end, unsigned long size, bool exec)
 239{
 240        char buf[10];
 241
 242        if (end <= start)
 243                return;
 244
 245        string_get_size(size, 1, STRING_UNITS_2, buf, sizeof(buf));
 246
 247        pr_info("Mapped 0x%016lx-0x%016lx with %s pages%s\n", start, end, buf,
 248                exec ? " (exec)" : "");
 249}
 250
 251static unsigned long next_boundary(unsigned long addr, unsigned long end)
 252{
 253#ifdef CONFIG_STRICT_KERNEL_RWX
 254        if (addr < __pa_symbol(__init_begin))
 255                return __pa_symbol(__init_begin);
 256#endif
 257        return end;
 258}
 259
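     /*
      * Map a physical range into the kernel linear mapping, picking the
      * largest page size (1G, 2M or base page) that the current address
      * alignment, the remaining gap and the MMU's supported sizes allow.
      * Ranges overlapping kernel or interrupt vector text are mapped
      * executable; everything else gets the caller's protection.
      */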
 260static int __meminit create_physical_mapping(unsigned long start,
 261                                             unsigned long end,
 262                                             int nid, pgprot_t _prot)
 263{
 264        unsigned long vaddr, addr, mapping_size = 0;
 265        bool prev_exec, exec = false;
 266        pgprot_t prot;
 267        int psize;
 268
 269        start = ALIGN(start, PAGE_SIZE);
 270        for (addr = start; addr < end; addr += mapping_size) {
 271                unsigned long gap, previous_size;
 272                int rc;
 273
 274                gap = next_boundary(addr, end) - addr;
 275                previous_size = mapping_size;
 276                prev_exec = exec;
 277
 278                if (IS_ALIGNED(addr, PUD_SIZE) && gap >= PUD_SIZE &&
 279                    mmu_psize_defs[MMU_PAGE_1G].shift) {
 280                        mapping_size = PUD_SIZE;
 281                        psize = MMU_PAGE_1G;
 282                } else if (IS_ALIGNED(addr, PMD_SIZE) && gap >= PMD_SIZE &&
 283                           mmu_psize_defs[MMU_PAGE_2M].shift) {
 284                        mapping_size = PMD_SIZE;
 285                        psize = MMU_PAGE_2M;
 286                } else {
 287                        mapping_size = PAGE_SIZE;
 288                        psize = mmu_virtual_psize;
 289                }
 290
 291                vaddr = (unsigned long)__va(addr);
 292
 293                if (overlaps_kernel_text(vaddr, vaddr + mapping_size) ||
 294                    overlaps_interrupt_vector_text(vaddr, vaddr + mapping_size)) {
 295                        prot = PAGE_KERNEL_X;
 296                        exec = true;
 297                } else {
 298                        prot = _prot;
 299                        exec = false;
 300                }
 301
 302                if (mapping_size != previous_size || exec != prev_exec) {
 303                        print_mapping(start, addr, previous_size, prev_exec);
 304                        start = addr;
 305                }
 306
 307                rc = __map_kernel_page(vaddr, addr, prot, mapping_size, nid, start, end);
 308                if (rc)
 309                        return rc;
 310
 311                update_page_count(psize, 1);
 312        }
 313
 314        print_mapping(start, addr, mapping_size, exec);
 315        return 0;
 316}
 317
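     /*
      * Boot-time radix setup: create the linear mapping for all memblock
      * memory, size the PID space, allocate the process table and point
      * entry 0 at the kernel page tables, and reserve a guard PID for
      * init_mm.
      */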
 318static void __init radix_init_pgtable(void)
 319{
 320        unsigned long rts_field;
 321        struct memblock_region *reg;
 322
 323        /* We don't support slb for radix */
 324        mmu_slb_size = 0;
 325        /*
 326         * Create the linear mapping, using the largest page sizes available
 327         */
 328        for_each_memblock(memory, reg) {
 329                /*
 330                 * The memblock allocator is up at this point, so the
 331                 * page tables will be allocated within the range. No
 332                 * need for a node (which we don't have yet).
 333                 */
 334
 335                if ((reg->base + reg->size) >= RADIX_VMALLOC_START) {
 336                        pr_warn("Outside the supported range\n");
 337                        continue;
 338                }
 339
 340                WARN_ON(create_physical_mapping(reg->base,
 341                                                reg->base + reg->size,
 342                                                -1, PAGE_KERNEL));
 343        }
 344
 345        /* Find out how many PID bits are supported */
 346        if (!cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG)) {
 347                if (!mmu_pid_bits)
 348                        mmu_pid_bits = 20;
 349                mmu_base_pid = 1;
 350        } else if (cpu_has_feature(CPU_FTR_HVMODE)) {
 351                if (!mmu_pid_bits)
 352                        mmu_pid_bits = 20;
 353#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 354                /*
 355                 * When KVM is possible, we only use the top half of the
 356                 * PID space to avoid collisions between host and guest PIDs
 357                 * which can cause problems due to prefetch when exiting the
 358                 * guest with AIL=3
 359                 */
 360                mmu_base_pid = 1 << (mmu_pid_bits - 1);
 361#else
 362                mmu_base_pid = 1;
 363#endif
 364        } else {
 365                /* The guest uses the bottom half of the PID space */
 366                if (!mmu_pid_bits)
 367                        mmu_pid_bits = 19;
 368                mmu_base_pid = 1;
 369        }
 370
 371        /*
 372         * Allocate Partition table and process table for the
 373         * host.
 374         */
 375        BUG_ON(PRTB_SIZE_SHIFT > 36);
 376        process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT, -1, 0, 0);
 377        /*
 378         * Fill in the process table.
 379         */
 380        rts_field = radix__get_tree_size();
 381        process_tb->prtb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE);
 382
 383        /*
 384         * The init_mm context is given the first available (non-zero) PID,
 385         * which is the "guard PID" and contains no page table. PIDR should
 386         * never be set to zero because that duplicates the kernel address
 387         * space at the 0x0... offset (quadrant 0)!
 388         *
 389         * An arbitrary PID that may later be allocated by the PID allocator
 390         * for userspace processes must not be used either, because that
 391         * would cause stale user mappings for that PID on CPUs outside of
 392         * the TLB invalidation scheme (because it won't be in mm_cpumask).
 393         *
 394         * So permanently carve out one PID for the purpose of a guard PID.
 395         */
 396        init_mm.context.id = mmu_base_pid;
 397        mmu_base_pid++;
 398}
 399
 400static void __init radix_init_partition_table(void)
 401{
 402        unsigned long rts_field, dw0, dw1;
 403
 404        mmu_partition_table_init();
 405        rts_field = radix__get_tree_size();
 406        dw0 = rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE | PATB_HR;
 407        dw1 = __pa(process_tb) | (PRTB_SIZE_SHIFT - 12) | PATB_GR;
 408        mmu_partition_table_set_entry(0, dw0, dw1, false);
 409
 410        pr_info("Initializing Radix MMU\n");
 411}
 412
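     /* Translate a page size shift (12, 16, 21 or 30) into an MMU_PAGE_* index, or -1. */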
 413static int __init get_idx_from_shift(unsigned int shift)
 414{
 415        int idx = -1;
 416
 417        switch (shift) {
 418        case 0xc:
 419                idx = MMU_PAGE_4K;
 420                break;
 421        case 0x10:
 422                idx = MMU_PAGE_64K;
 423                break;
 424        case 0x15:
 425                idx = MMU_PAGE_2M;
 426                break;
 427        case 0x1e:
 428                idx = MMU_PAGE_1G;
 429                break;
 430        }
 431        return idx;
 432}
 433
 434static int __init radix_dt_scan_page_sizes(unsigned long node,
 435                                           const char *uname, int depth,
 436                                           void *data)
 437{
 438        int size = 0;
 439        int shift, idx;
 440        unsigned int ap;
 441        const __be32 *prop;
 442        const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
 443
 444        /* We are scanning "cpu" nodes only */
 445        if (type == NULL || strcmp(type, "cpu") != 0)
 446                return 0;
 447
 448        /* Find MMU PID size */
 449        prop = of_get_flat_dt_prop(node, "ibm,mmu-pid-bits", &size);
 450        if (prop && size == 4)
 451                mmu_pid_bits = be32_to_cpup(prop);
 452
 453        /* Grab page size encodings */
 454        prop = of_get_flat_dt_prop(node, "ibm,processor-radix-AP-encodings", &size);
 455        if (!prop)
 456                return 0;
 457
 458        pr_info("Page sizes from device-tree:\n");
 459        for (; size >= 4; size -= 4, ++prop) {
 460
 461                struct mmu_psize_def *def;
 462
 463                /* top 3 bits are the AP encoding */
 464                shift = be32_to_cpu(prop[0]) & ~(0xe << 28);
 465                ap = be32_to_cpu(prop[0]) >> 29;
 466                pr_info("Page size shift = %d AP=0x%x\n", shift, ap);
 467
 468                idx = get_idx_from_shift(shift);
 469                if (idx < 0)
 470                        continue;
 471
 472                def = &mmu_psize_defs[idx];
 473                def->shift = shift;
 474                def->ap  = ap;
 475        }
 476
 477        /* needed ? */
 478        cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B;
 479        return 1;
 480}
 481
 482void __init radix__early_init_devtree(void)
 483{
 484        int rc;
 485
 486        /*
 487         * Try to find the available page sizes in the device-tree
 488         */
 489        rc = of_scan_flat_dt(radix_dt_scan_page_sizes, NULL);
 490        if (rc != 0)  /* Found */
 491                goto found;
 492        /*
 493         * No encodings found in the device tree; assume 4k and 64k page support
 494         */
 495        mmu_psize_defs[MMU_PAGE_4K].shift = 12;
 496        mmu_psize_defs[MMU_PAGE_4K].ap = 0x0;
 497
 498        mmu_psize_defs[MMU_PAGE_64K].shift = 16;
 499        mmu_psize_defs[MMU_PAGE_64K].ap = 0x5;
 500found:
 501        return;
 502}
 503
 504static void radix_init_amor(void)
 505{
 506        /*
 507         * In HV mode, we init AMOR (Authority Mask Override Register) so that
 508         * the hypervisor and guest can set up the IAMR (Instruction Authority
 509         * Mask Register); key 0 is enabled by setting its mask bits to 1.
 510         *
 511         * AMOR = 0b1100 .... 0000 (Mask for key 0 is 11)
 512         */
 513        mtspr(SPRN_AMOR, (3ul << 62));
 514}
 515
 516#ifdef CONFIG_PPC_KUEP
 517void setup_kuep(bool disabled)
 518{
 519        if (disabled || !early_radix_enabled())
 520                return;
 521
 522        if (smp_processor_id() == boot_cpuid)
 523                pr_info("Activating Kernel Userspace Execution Prevention\n");
 524
 525        /*
 526         * Radix always uses key0 of the IAMR to determine if an access is
 527         * allowed. We set bit 0 (IBM bit 1) of key0, to prevent instruction
 528         * fetch.
 529         */
 530        mtspr(SPRN_IAMR, (1ul << 62));
 531}
 532#endif
 533
 534#ifdef CONFIG_PPC_KUAP
 535void setup_kuap(bool disabled)
 536{
 537        if (disabled || !early_radix_enabled())
 538                return;
 539
 540        if (smp_processor_id() == boot_cpuid) {
 541                pr_info("Activating Kernel Userspace Access Prevention\n");
 542                cur_cpu_spec->mmu_features |= MMU_FTR_RADIX_KUAP;
 543        }
 544
 545        /* Make sure userspace can't change the AMR */
 546        mtspr(SPRN_UAMOR, 0);
 547        mtspr(SPRN_AMR, AMR_KUAP_BLOCKED);
 548        isync();
 549}
 550#endif
 551
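     /*
      * Main boot-time MMU setup for radix: choose the base and vmemmap
      * page sizes, program the page table geometry and kernel virtual
      * layout, build the kernel page tables, set up the partition table
      * in HV mode (or do the pseries-specific setup under an LPAR), then
      * switch to the guard PID and flush the TLB.
      */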
 552void __init radix__early_init_mmu(void)
 553{
 554        unsigned long lpcr;
 555
 556#ifdef CONFIG_PPC_64K_PAGES
 557        /* PAGE_SIZE mappings */
 558        mmu_virtual_psize = MMU_PAGE_64K;
 559#else
 560        mmu_virtual_psize = MMU_PAGE_4K;
 561#endif
 562
 563#ifdef CONFIG_SPARSEMEM_VMEMMAP
 564        /* vmemmap mapping */
 565        if (mmu_psize_defs[MMU_PAGE_2M].shift) {
 566                /*
 567                 * map vmemmap using 2M if available
 568                 */
 569                mmu_vmemmap_psize = MMU_PAGE_2M;
 570        } else
 571                mmu_vmemmap_psize = mmu_virtual_psize;
 572#endif
 573        /*
 574         * initialize page table size
 575         */
 576        __pte_index_size = RADIX_PTE_INDEX_SIZE;
 577        __pmd_index_size = RADIX_PMD_INDEX_SIZE;
 578        __pud_index_size = RADIX_PUD_INDEX_SIZE;
 579        __pgd_index_size = RADIX_PGD_INDEX_SIZE;
 580        __pud_cache_index = RADIX_PUD_INDEX_SIZE;
 581        __pte_table_size = RADIX_PTE_TABLE_SIZE;
 582        __pmd_table_size = RADIX_PMD_TABLE_SIZE;
 583        __pud_table_size = RADIX_PUD_TABLE_SIZE;
 584        __pgd_table_size = RADIX_PGD_TABLE_SIZE;
 585
 586        __pmd_val_bits = RADIX_PMD_VAL_BITS;
 587        __pud_val_bits = RADIX_PUD_VAL_BITS;
 588        __pgd_val_bits = RADIX_PGD_VAL_BITS;
 589
 590        __kernel_virt_start = RADIX_KERN_VIRT_START;
 591        __vmalloc_start = RADIX_VMALLOC_START;
 592        __vmalloc_end = RADIX_VMALLOC_END;
 593        __kernel_io_start = RADIX_KERN_IO_START;
 594        __kernel_io_end = RADIX_KERN_IO_END;
 595        vmemmap = (struct page *)RADIX_VMEMMAP_START;
 596        ioremap_bot = IOREMAP_BASE;
 597
 598#ifdef CONFIG_PCI
 599        pci_io_base = ISA_IO_BASE;
 600#endif
 601        __pte_frag_nr = RADIX_PTE_FRAG_NR;
 602        __pte_frag_size_shift = RADIX_PTE_FRAG_SIZE_SHIFT;
 603        __pmd_frag_nr = RADIX_PMD_FRAG_NR;
 604        __pmd_frag_size_shift = RADIX_PMD_FRAG_SIZE_SHIFT;
 605
 606        radix_init_pgtable();
 607
 608        if (!firmware_has_feature(FW_FEATURE_LPAR)) {
 609                lpcr = mfspr(SPRN_LPCR);
 610                mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
 611                radix_init_partition_table();
 612                radix_init_amor();
 613        } else {
 614                radix_init_pseries();
 615        }
 616
 617        memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);
 618
 619        /* Switch to the guard PID before turning on MMU */
 620        radix__switch_mmu_context(NULL, &init_mm);
 621        tlbiel_all();
 622}
 623
 624void radix__early_init_mmu_secondary(void)
 625{
 626        unsigned long lpcr;
 627        /*
 628         * update partition table control register and UPRT
 629         */
 630        if (!firmware_has_feature(FW_FEATURE_LPAR)) {
 631                lpcr = mfspr(SPRN_LPCR);
 632                mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
 633
 634                set_ptcr_when_no_uv(__pa(partition_tb) |
 635                                    (PATB_SIZE_SHIFT - 12));
 636
 637                radix_init_amor();
 638        }
 639
 640        radix__switch_mmu_context(NULL, &init_mm);
 641        tlbiel_all();
 642}
 643
 644void radix__mmu_cleanup_all(void)
 645{
 646        unsigned long lpcr;
 647
 648        if (!firmware_has_feature(FW_FEATURE_LPAR)) {
 649                lpcr = mfspr(SPRN_LPCR);
 650                mtspr(SPRN_LPCR, lpcr & ~LPCR_UPRT);
 651                set_ptcr_when_no_uv(0);
 652                powernv_set_nmmu_ptcr(0);
 653                radix__flush_tlb_all();
 654        }
 655}
 656
 657void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base,
 658                                phys_addr_t first_memblock_size)
 659{
 660        /*
 661         * We don't currently support the first MEMBLOCK not mapping
 662         * physical address 0.
 663         */
 664        BUG_ON(first_memblock_base != 0);
 665
 666        /*
 667         * Radix mode is not limited by RMA / VRMA addressing.
 668         */
 669        ppc64_rma_size = ULONG_MAX;
 670}
 671
 672#ifdef CONFIG_MEMORY_HOTPLUG
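     /*
      * free_pte_table()/free_pmd_table(): if every entry of a lower-level
      * table is empty, free the table and clear the upper-level entry
      * that pointed to it.
      */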
 673static void free_pte_table(pte_t *pte_start, pmd_t *pmd)
 674{
 675        pte_t *pte;
 676        int i;
 677
 678        for (i = 0; i < PTRS_PER_PTE; i++) {
 679                pte = pte_start + i;
 680                if (!pte_none(*pte))
 681                        return;
 682        }
 683
 684        pte_free_kernel(&init_mm, pte_start);
 685        pmd_clear(pmd);
 686}
 687
 688static void free_pmd_table(pmd_t *pmd_start, pud_t *pud)
 689{
 690        pmd_t *pmd;
 691        int i;
 692
 693        for (i = 0; i < PTRS_PER_PMD; i++) {
 694                pmd = pmd_start + i;
 695                if (!pmd_none(*pmd))
 696                        return;
 697        }
 698
 699        pmd_free(&init_mm, pmd_start);
 700        pud_clear(pud);
 701}
 702
 703struct change_mapping_params {
 704        pte_t *pte;
 705        unsigned long start;
 706        unsigned long end;
 707        unsigned long aligned_start;
 708        unsigned long aligned_end;
 709};
 710
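     /*
      * Runs under stop_machine() when splitting a huge kernel mapping:
      * clears the old leaf entry and recreates the mappings for the parts
      * of the naturally aligned region that must stay mapped, i.e.
      * [aligned_start, start) and [end, aligned_end).
      */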
 711static int __meminit stop_machine_change_mapping(void *data)
 712{
 713        struct change_mapping_params *params =
 714                        (struct change_mapping_params *)data;
 715
 716        if (!data)
 717                return -1;
 718
 719        spin_unlock(&init_mm.page_table_lock);
 720        pte_clear(&init_mm, params->aligned_start, params->pte);
 721        create_physical_mapping(__pa(params->aligned_start),
 722                                __pa(params->start), -1, PAGE_KERNEL);
 723        create_physical_mapping(__pa(params->end), __pa(params->aligned_end),
 724                                -1, PAGE_KERNEL);
 725        spin_lock(&init_mm.page_table_lock);
 726        return 0;
 727}
 728
 729static void remove_pte_table(pte_t *pte_start, unsigned long addr,
 730                             unsigned long end)
 731{
 732        unsigned long next;
 733        pte_t *pte;
 734
 735        pte = pte_start + pte_index(addr);
 736        for (; addr < end; addr = next, pte++) {
 737                next = (addr + PAGE_SIZE) & PAGE_MASK;
 738                if (next > end)
 739                        next = end;
 740
 741                if (!pte_present(*pte))
 742                        continue;
 743
 744                if (!PAGE_ALIGNED(addr) || !PAGE_ALIGNED(next)) {
 745                        /*
 746                         * The vmemmap_free() and remove_section_mapping()
 747                         * codepaths call us with aligned addresses.
 748                         */
 749                        WARN_ONCE(1, "%s: unaligned range\n", __func__);
 750                        continue;
 751                }
 752
 753                pte_clear(&init_mm, addr, pte);
 754        }
 755}
 756
 757/*
 758 * Helper to clear the pte and potentially split the mapping.
 759 */
 760static void __meminit split_kernel_mapping(unsigned long addr, unsigned long end,
 761                                unsigned long size, pte_t *pte)
 762{
 763        unsigned long mask = ~(size - 1);
 764        unsigned long aligned_start = addr & mask;
 765        unsigned long aligned_end = addr + size;
 766        struct change_mapping_params params;
 767        bool split_region = false;
 768
 769        if ((end - addr) < size) {
 770                /*
 771                 * We're going to clear the PTE, but have not yet
 772                 * flushed the mapping, so it is time to remap and
 773                 * flush. If the effects are visible outside the
 774                 * processor, or if we are running code close to
 775                 * the mapping we cleared, we are in trouble.
 776                 */
 777                if (overlaps_kernel_text(aligned_start, addr) ||
 778                        overlaps_kernel_text(end, aligned_end)) {
 779                        /*
 780                         * Hack, just return, don't pte_clear
 781                         */
 782                        WARN_ONCE(1, "Linear mapping %lx->%lx overlaps kernel "
 783                                  "text, not splitting\n", addr, end);
 784                        return;
 785                }
 786                split_region = true;
 787        }
 788
 789        if (split_region) {
 790                params.pte = pte;
 791                params.start = addr;
 792                params.end = end;
 793                params.aligned_start = addr & ~(size - 1);
 794                params.aligned_end = min_t(unsigned long, aligned_end,
 795                                (unsigned long)__va(memblock_end_of_DRAM()));
 796                stop_machine(stop_machine_change_mapping, &params, NULL);
 797                return;
 798        }
 799
 800        pte_clear(&init_mm, addr, pte);
 801}
 802
 803static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
 804                             unsigned long end)
 805{
 806        unsigned long next;
 807        pte_t *pte_base;
 808        pmd_t *pmd;
 809
 810        pmd = pmd_start + pmd_index(addr);
 811        for (; addr < end; addr = next, pmd++) {
 812                next = pmd_addr_end(addr, end);
 813
 814                if (!pmd_present(*pmd))
 815                        continue;
 816
 817                if (pmd_is_leaf(*pmd)) {
 818                        split_kernel_mapping(addr, end, PMD_SIZE, (pte_t *)pmd);
 819                        continue;
 820                }
 821
 822                pte_base = (pte_t *)pmd_page_vaddr(*pmd);
 823                remove_pte_table(pte_base, addr, next);
 824                free_pte_table(pte_base, pmd);
 825        }
 826}
 827
 828static void remove_pud_table(pud_t *pud_start, unsigned long addr,
 829                             unsigned long end)
 830{
 831        unsigned long next;
 832        pmd_t *pmd_base;
 833        pud_t *pud;
 834
 835        pud = pud_start + pud_index(addr);
 836        for (; addr < end; addr = next, pud++) {
 837                next = pud_addr_end(addr, end);
 838
 839                if (!pud_present(*pud))
 840                        continue;
 841
 842                if (pud_is_leaf(*pud)) {
 843                        split_kernel_mapping(addr, end, PUD_SIZE, (pte_t *)pud);
 844                        continue;
 845                }
 846
 847                pmd_base = (pmd_t *)pud_page_vaddr(*pud);
 848                remove_pmd_table(pmd_base, addr, next);
 849                free_pmd_table(pmd_base, pud);
 850        }
 851}
 852
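     /*
      * Tear down the kernel mapping for [start, end): clear the PTEs,
      * split any huge mapping that the range only partially covers, free
      * page table pages that become empty, then flush the TLB range.
      */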
 853static void __meminit remove_pagetable(unsigned long start, unsigned long end)
 854{
 855        unsigned long addr, next;
 856        pud_t *pud_base;
 857        pgd_t *pgd;
 858        p4d_t *p4d;
 859
 860        spin_lock(&init_mm.page_table_lock);
 861
 862        for (addr = start; addr < end; addr = next) {
 863                next = pgd_addr_end(addr, end);
 864
 865                pgd = pgd_offset_k(addr);
 866                p4d = p4d_offset(pgd, addr);
 867                if (!p4d_present(*p4d))
 868                        continue;
 869
 870                if (p4d_is_leaf(*p4d)) {
 871                        split_kernel_mapping(addr, end, P4D_SIZE, (pte_t *)p4d);
 872                        continue;
 873                }
 874
 875                pud_base = (pud_t *)p4d_page_vaddr(*p4d);
 876                remove_pud_table(pud_base, addr, next);
 877        }
 878
 879        spin_unlock(&init_mm.page_table_lock);
 880        radix__flush_tlb_kernel_range(start, end);
 881}
 882
 883int __meminit radix__create_section_mapping(unsigned long start,
 884                                            unsigned long end, int nid,
 885                                            pgprot_t prot)
 886{
 887        if (end >= RADIX_VMALLOC_START) {
 888                pr_warn("Outside the supported range\n");
 889                return -1;
 890        }
 891
 892        return create_physical_mapping(__pa(start), __pa(end), nid, prot);
 893}
 894
 895int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end)
 896{
 897        remove_pagetable(start, end);
 898        return 0;
 899}
 900#endif /* CONFIG_MEMORY_HOTPLUG */
 901
 902#ifdef CONFIG_SPARSEMEM_VMEMMAP
 903static int __map_kernel_page_nid(unsigned long ea, unsigned long pa,
 904                                 pgprot_t flags, unsigned int map_page_size,
 905                                 int nid)
 906{
 907        return __map_kernel_page(ea, pa, flags, map_page_size, nid, 0, 0);
 908}
 909
 910int __meminit radix__vmemmap_create_mapping(unsigned long start,
 911                                      unsigned long page_size,
 912                                      unsigned long phys)
 913{
 914        /* Create a PTE encoding */
 915        unsigned long flags = _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_KERNEL_RW;
 916        int nid = early_pfn_to_nid(phys >> PAGE_SHIFT);
 917        int ret;
 918
 919        if ((start + page_size) >= RADIX_VMEMMAP_END) {
 920                pr_warn("Outside the supported range\n");
 921                return -1;
 922        }
 923
 924        ret = __map_kernel_page_nid(start, phys, __pgprot(flags), page_size, nid);
 925        BUG_ON(ret);
 926
 927        return 0;
 928}
 929
 930#ifdef CONFIG_MEMORY_HOTPLUG
 931void __meminit radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size)
 932{
 933        remove_pagetable(start, start + page_size);
 934}
 935#endif
 936#endif
 937
 938#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 939
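     /*
      * Clear 'clr' and set 'set' bits in a huge PMD, returning the old
      * value and emitting a hugepage trace event. The caller must hold
      * the PMD lock.
      */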
 940unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
 941                                  pmd_t *pmdp, unsigned long clr,
 942                                  unsigned long set)
 943{
 944        unsigned long old;
 945
 946#ifdef CONFIG_DEBUG_VM
 947        WARN_ON(!radix__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
 948        assert_spin_locked(pmd_lockptr(mm, pmdp));
 949#endif
 950
 951        old = radix__pte_update(mm, addr, (pte_t *)pmdp, clr, set, 1);
 952        trace_hugepage_update(addr, old, clr, set);
 953
 954        return old;
 955}
 956
 957pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
 958                        pmd_t *pmdp)
 959
 960{
 961        pmd_t pmd;
 962
 963        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 964        VM_BUG_ON(radix__pmd_trans_huge(*pmdp));
 965        VM_BUG_ON(pmd_devmap(*pmdp));
 966        /*
 967         * khugepaged calls this for a normal (non-huge) pmd
 968         */
 969        pmd = *pmdp;
 970        pmd_clear(pmdp);
 971
 972        /*
 973         * pmdp_collapse_flush needs to ensure that there are no parallel
 974         * gup walks after this call. This is needed so that we have a
 975         * stable page refcount when collapsing a page. We don't allow a
 976         * collapse if gup has taken a reference on the page. We can ensure
 977         * that by sending an IPI, because gup walks happen with IRQs disabled.
 978         */
 979        serialize_against_pte_lookup(vma->vm_mm);
 980
 981        radix__flush_tlb_collapsed_pmd(vma->vm_mm, address);
 982
 983        return pmd;
 984}
 985
 986/*
 987 * For us pgtable_t is pte_t *. In order to save the deposited
 988 * page table, we consider the allocated page table as a list
 989 * head. On withdraw we need to make sure we zero out the used
 990 * list_head memory area.
 991 */
 992void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
 993                                 pgtable_t pgtable)
 994{
 995        struct list_head *lh = (struct list_head *) pgtable;
 996
 997        assert_spin_locked(pmd_lockptr(mm, pmdp));
 998
 999        /* FIFO */
1000        if (!pmd_huge_pte(mm, pmdp))
1001                INIT_LIST_HEAD(lh);
1002        else
1003                list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
1004        pmd_huge_pte(mm, pmdp) = pgtable;
1005}
1006
1007pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
1008{
1009        pte_t *ptep;
1010        pgtable_t pgtable;
1011        struct list_head *lh;
1012
1013        assert_spin_locked(pmd_lockptr(mm, pmdp));
1014
1015        /* FIFO */
1016        pgtable = pmd_huge_pte(mm, pmdp);
1017        lh = (struct list_head *) pgtable;
1018        if (list_empty(lh))
1019                pmd_huge_pte(mm, pmdp) = NULL;
1020        else {
1021                pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
1022                list_del(lh);
1023        }
1024        ptep = (pte_t *) pgtable;
1025        *ptep = __pte(0);
1026        ptep++;
1027        *ptep = __pte(0);
1028        return pgtable;
1029}
1030
1031pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
1032                                     unsigned long addr, pmd_t *pmdp)
1033{
1034        pmd_t old_pmd;
1035        unsigned long old;
1036
1037        old = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
1038        old_pmd = __pmd(old);
1039        return old_pmd;
1040}
1041
1042#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
1043
1044void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep,
1045                                  pte_t entry, unsigned long address, int psize)
1046{
1047        struct mm_struct *mm = vma->vm_mm;
1048        unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_ACCESSED |
1049                                              _PAGE_RW | _PAGE_EXEC);
1050
1051        unsigned long change = pte_val(entry) ^ pte_val(*ptep);
1052        /*
 1053         * To avoid an NMMU hang while relaxing access, we need to mark
1054         * the pte invalid in between.
1055         */
1056        if ((change & _PAGE_RW) && atomic_read(&mm->context.copros) > 0) {
1057                unsigned long old_pte, new_pte;
1058
1059                old_pte = __radix_pte_update(ptep, _PAGE_PRESENT, _PAGE_INVALID);
1060                /*
1061                 * new value of pte
1062                 */
1063                new_pte = old_pte | set;
1064                radix__flush_tlb_page_psize(mm, address, psize);
1065                __radix_pte_update(ptep, _PAGE_INVALID, new_pte);
1066        } else {
1067                __radix_pte_update(ptep, 0, set);
1068                /*
1069                 * Book3S does not require a TLB flush when relaxing access
1070                 * restrictions when the address space is not attached to a
1071                 * NMMU, because the core MMU will reload the pte after taking
 1072         * an access fault, which is defined by the architecture.
1073                 */
1074        }
1075        /* See ptesync comment in radix__set_pte_at */
1076}
1077
1078void radix__ptep_modify_prot_commit(struct vm_area_struct *vma,
1079                                    unsigned long addr, pte_t *ptep,
1080                                    pte_t old_pte, pte_t pte)
1081{
1082        struct mm_struct *mm = vma->vm_mm;
1083
1084        /*
 1085         * To avoid an NMMU hang while relaxing access, we need to flush the
 1086         * TLB before we set the new value. We only need to do this for radix,
 1087         * because hash translation does a flush when updating the Linux pte.
1088         */
1089        if (is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) &&
1090            (atomic_read(&mm->context.copros) > 0))
1091                radix__flush_tlb_page(vma, addr);
1092
1093        set_pte_at(mm, addr, ptep, pte);
1094}
1095
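     /*
      * The helpers below let generic ioremap/vmap code install and tear
      * down huge PUD- and PMD-level kernel mappings when radix is enabled.
      */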
1096int __init arch_ioremap_pud_supported(void)
1097{
1098        /* HPT does not cope with large pages in the vmalloc area */
1099        return radix_enabled();
1100}
1101
1102int __init arch_ioremap_pmd_supported(void)
1103{
1104        return radix_enabled();
1105}
1106
1107int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
1108{
1109        return 0;
1110}
1111
1112int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
1113{
1114        pte_t *ptep = (pte_t *)pud;
1115        pte_t new_pud = pfn_pte(__phys_to_pfn(addr), prot);
1116
1117        if (!radix_enabled())
1118                return 0;
1119
1120        set_pte_at(&init_mm, 0 /* radix unused */, ptep, new_pud);
1121
1122        return 1;
1123}
1124
1125int pud_clear_huge(pud_t *pud)
1126{
1127        if (pud_huge(*pud)) {
1128                pud_clear(pud);
1129                return 1;
1130        }
1131
1132        return 0;
1133}
1134
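     /*
      * Free the PMD table (and any PTE tables beneath it) that backed a
      * PUD-sized kernel mapping, after clearing the PUD and flushing the
      * TLB for the range.
      */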
1135int pud_free_pmd_page(pud_t *pud, unsigned long addr)
1136{
1137        pmd_t *pmd;
1138        int i;
1139
1140        pmd = (pmd_t *)pud_page_vaddr(*pud);
1141        pud_clear(pud);
1142
1143        flush_tlb_kernel_range(addr, addr + PUD_SIZE);
1144
1145        for (i = 0; i < PTRS_PER_PMD; i++) {
1146                if (!pmd_none(pmd[i])) {
1147                        pte_t *pte;
1148                        pte = (pte_t *)pmd_page_vaddr(pmd[i]);
1149
1150                        pte_free_kernel(&init_mm, pte);
1151                }
1152        }
1153
1154        pmd_free(&init_mm, pmd);
1155
1156        return 1;
1157}
1158
1159int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
1160{
1161        pte_t *ptep = (pte_t *)pmd;
1162        pte_t new_pmd = pfn_pte(__phys_to_pfn(addr), prot);
1163
1164        if (!radix_enabled())
1165                return 0;
1166
1167        set_pte_at(&init_mm, 0 /* radix unused */, ptep, new_pmd);
1168
1169        return 1;
1170}
1171
1172int pmd_clear_huge(pmd_t *pmd)
1173{
1174        if (pmd_huge(*pmd)) {
1175                pmd_clear(pmd);
1176                return 1;
1177        }
1178
1179        return 0;
1180}
1181
1182int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
1183{
1184        pte_t *pte;
1185
1186        pte = (pte_t *)pmd_page_vaddr(*pmd);
1187        pmd_clear(pmd);
1188
1189        flush_tlb_kernel_range(addr, addr + PMD_SIZE);
1190
1191        pte_free_kernel(&init_mm, pte);
1192
1193        return 1;
1194}
1195
1196int __init arch_ioremap_p4d_supported(void)
1197{
1198        return 0;
1199}
1200