linux/arch/powerpc/mm/pgtable-radix.c
<<
>>
Prefs
   1/*
   2 * Page table handling routines for radix page table.
   3 *
   4 * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
   5 *
   6 * This program is free software; you can redistribute it and/or
   7 * modify it under the terms of the GNU General Public License
   8 * as published by the Free Software Foundation; either version
   9 * 2 of the License, or (at your option) any later version.
  10 */
  11
  12#define pr_fmt(fmt) "radix-mmu: " fmt
  13
  14#include <linux/kernel.h>
  15#include <linux/sched/mm.h>
  16#include <linux/memblock.h>
  17#include <linux/of_fdt.h>
  18#include <linux/mm.h>
  19#include <linux/string_helpers.h>
  20
  21#include <asm/pgtable.h>
  22#include <asm/pgalloc.h>
  23#include <asm/dma.h>
  24#include <asm/machdep.h>
  25#include <asm/mmu.h>
  26#include <asm/firmware.h>
  27#include <asm/powernv.h>
  28#include <asm/sections.h>
  29#include <asm/trace.h>
  30
  31#include <trace/events/thp.h>
  32
  33unsigned int mmu_pid_bits;
  34unsigned int mmu_base_pid;
  35
  36static int native_register_process_table(unsigned long base, unsigned long pg_sz,
  37                                         unsigned long table_size)
  38{
  39        unsigned long patb0, patb1;
  40
  41        patb0 = be64_to_cpu(partition_tb[0].patb0);
  42        patb1 = base | table_size | PATB_GR;
  43
  44        mmu_partition_table_set_entry(0, patb0, patb1);
  45
  46        return 0;
  47}
  48
  49static __ref void *early_alloc_pgtable(unsigned long size)
  50{
  51        void *pt;
  52
  53        pt = __va(memblock_alloc_base(size, size, MEMBLOCK_ALLOC_ANYWHERE));
  54        memset(pt, 0, size);
  55
  56        return pt;
  57}
  58
  59int radix__map_kernel_page(unsigned long ea, unsigned long pa,
  60                          pgprot_t flags,
  61                          unsigned int map_page_size)
  62{
  63        pgd_t *pgdp;
  64        pud_t *pudp;
  65        pmd_t *pmdp;
  66        pte_t *ptep;
  67        /*
  68         * Make sure task size is correct as per the max adddr
  69         */
  70        BUILD_BUG_ON(TASK_SIZE_USER64 > RADIX_PGTABLE_RANGE);
  71        if (slab_is_available()) {
  72                pgdp = pgd_offset_k(ea);
  73                pudp = pud_alloc(&init_mm, pgdp, ea);
  74                if (!pudp)
  75                        return -ENOMEM;
  76                if (map_page_size == PUD_SIZE) {
  77                        ptep = (pte_t *)pudp;
  78                        goto set_the_pte;
  79                }
  80                pmdp = pmd_alloc(&init_mm, pudp, ea);
  81                if (!pmdp)
  82                        return -ENOMEM;
  83                if (map_page_size == PMD_SIZE) {
  84                        ptep = pmdp_ptep(pmdp);
  85                        goto set_the_pte;
  86                }
  87                ptep = pte_alloc_kernel(pmdp, ea);
  88                if (!ptep)
  89                        return -ENOMEM;
  90        } else {
  91                pgdp = pgd_offset_k(ea);
  92                if (pgd_none(*pgdp)) {
  93                        pudp = early_alloc_pgtable(PUD_TABLE_SIZE);
  94                        BUG_ON(pudp == NULL);
  95                        pgd_populate(&init_mm, pgdp, pudp);
  96                }
  97                pudp = pud_offset(pgdp, ea);
  98                if (map_page_size == PUD_SIZE) {
  99                        ptep = (pte_t *)pudp;
 100                        goto set_the_pte;
 101                }
 102                if (pud_none(*pudp)) {
 103                        pmdp = early_alloc_pgtable(PMD_TABLE_SIZE);
 104                        BUG_ON(pmdp == NULL);
 105                        pud_populate(&init_mm, pudp, pmdp);
 106                }
 107                pmdp = pmd_offset(pudp, ea);
 108                if (map_page_size == PMD_SIZE) {
 109                        ptep = pmdp_ptep(pmdp);
 110                        goto set_the_pte;
 111                }
 112                if (!pmd_present(*pmdp)) {
 113                        ptep = early_alloc_pgtable(PAGE_SIZE);
 114                        BUG_ON(ptep == NULL);
 115                        pmd_populate_kernel(&init_mm, pmdp, ptep);
 116                }
 117                ptep = pte_offset_kernel(pmdp, ea);
 118        }
 119
 120set_the_pte:
 121        set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, flags));
 122        smp_wmb();
 123        return 0;
 124}
 125
 126#ifdef CONFIG_STRICT_KERNEL_RWX
 127void radix__change_memory_range(unsigned long start, unsigned long end,
 128                                unsigned long clear)
 129{
 130        unsigned long idx;
 131        pgd_t *pgdp;
 132        pud_t *pudp;
 133        pmd_t *pmdp;
 134        pte_t *ptep;
 135
 136        start = ALIGN_DOWN(start, PAGE_SIZE);
 137        end = PAGE_ALIGN(end); // aligns up
 138
 139        pr_debug("Changing flags on range %lx-%lx removing 0x%lx\n",
 140                 start, end, clear);
 141
 142        for (idx = start; idx < end; idx += PAGE_SIZE) {
 143                pgdp = pgd_offset_k(idx);
 144                pudp = pud_alloc(&init_mm, pgdp, idx);
 145                if (!pudp)
 146                        continue;
 147                if (pud_huge(*pudp)) {
 148                        ptep = (pte_t *)pudp;
 149                        goto update_the_pte;
 150                }
 151                pmdp = pmd_alloc(&init_mm, pudp, idx);
 152                if (!pmdp)
 153                        continue;
 154                if (pmd_huge(*pmdp)) {
 155                        ptep = pmdp_ptep(pmdp);
 156                        goto update_the_pte;
 157                }
 158                ptep = pte_alloc_kernel(pmdp, idx);
 159                if (!ptep)
 160                        continue;
 161update_the_pte:
 162                radix__pte_update(&init_mm, idx, ptep, clear, 0, 0);
 163        }
 164
 165        radix__flush_tlb_kernel_range(start, end);
 166}
 167
 168void radix__mark_rodata_ro(void)
 169{
 170        unsigned long start, end;
 171
 172        start = (unsigned long)_stext;
 173        end = (unsigned long)__init_begin;
 174
 175        radix__change_memory_range(start, end, _PAGE_WRITE);
 176}
 177
 178void radix__mark_initmem_nx(void)
 179{
 180        unsigned long start = (unsigned long)__init_begin;
 181        unsigned long end = (unsigned long)__init_end;
 182
 183        radix__change_memory_range(start, end, _PAGE_EXEC);
 184}
 185#endif /* CONFIG_STRICT_KERNEL_RWX */
 186
 187static inline void __meminit print_mapping(unsigned long start,
 188                                           unsigned long end,
 189                                           unsigned long size)
 190{
 191        char buf[10];
 192
 193        if (end <= start)
 194                return;
 195
 196        string_get_size(size, 1, STRING_UNITS_2, buf, sizeof(buf));
 197
 198        pr_info("Mapped 0x%016lx-0x%016lx with %s pages\n", start, end, buf);
 199}
 200
 201static int __meminit create_physical_mapping(unsigned long start,
 202                                             unsigned long end)
 203{
 204        unsigned long vaddr, addr, mapping_size = 0;
 205        pgprot_t prot;
 206        unsigned long max_mapping_size;
 207#ifdef CONFIG_STRICT_KERNEL_RWX
 208        int split_text_mapping = 1;
 209#else
 210        int split_text_mapping = 0;
 211#endif
 212
 213        start = _ALIGN_UP(start, PAGE_SIZE);
 214        for (addr = start; addr < end; addr += mapping_size) {
 215                unsigned long gap, previous_size;
 216                int rc;
 217
 218                gap = end - addr;
 219                previous_size = mapping_size;
 220                max_mapping_size = PUD_SIZE;
 221
 222retry:
 223                if (IS_ALIGNED(addr, PUD_SIZE) && gap >= PUD_SIZE &&
 224                    mmu_psize_defs[MMU_PAGE_1G].shift &&
 225                    PUD_SIZE <= max_mapping_size)
 226                        mapping_size = PUD_SIZE;
 227                else if (IS_ALIGNED(addr, PMD_SIZE) && gap >= PMD_SIZE &&
 228                         mmu_psize_defs[MMU_PAGE_2M].shift)
 229                        mapping_size = PMD_SIZE;
 230                else
 231                        mapping_size = PAGE_SIZE;
 232
 233                if (split_text_mapping && (mapping_size == PUD_SIZE) &&
 234                        (addr <= __pa_symbol(__init_begin)) &&
 235                        (addr + mapping_size) >= __pa_symbol(_stext)) {
 236                        max_mapping_size = PMD_SIZE;
 237                        goto retry;
 238                }
 239
 240                if (split_text_mapping && (mapping_size == PMD_SIZE) &&
 241                    (addr <= __pa_symbol(__init_begin)) &&
 242                    (addr + mapping_size) >= __pa_symbol(_stext))
 243                        mapping_size = PAGE_SIZE;
 244
 245                if (mapping_size != previous_size) {
 246                        print_mapping(start, addr, previous_size);
 247                        start = addr;
 248                }
 249
 250                vaddr = (unsigned long)__va(addr);
 251
 252                if (overlaps_kernel_text(vaddr, vaddr + mapping_size) ||
 253                    overlaps_interrupt_vector_text(vaddr, vaddr + mapping_size))
 254                        prot = PAGE_KERNEL_X;
 255                else
 256                        prot = PAGE_KERNEL;
 257
 258                rc = radix__map_kernel_page(vaddr, addr, prot, mapping_size);
 259                if (rc)
 260                        return rc;
 261        }
 262
 263        print_mapping(start, addr, mapping_size);
 264        return 0;
 265}
 266
 267static void __init radix_init_pgtable(void)
 268{
 269        unsigned long rts_field;
 270        struct memblock_region *reg;
 271
 272        /* We don't support slb for radix */
 273        mmu_slb_size = 0;
 274        /*
 275         * Create the linear mapping, using standard page size for now
 276         */
 277        for_each_memblock(memory, reg)
 278                WARN_ON(create_physical_mapping(reg->base,
 279                                                reg->base + reg->size));
 280
 281        /* Find out how many PID bits are supported */
 282        if (cpu_has_feature(CPU_FTR_HVMODE)) {
 283                if (!mmu_pid_bits)
 284                        mmu_pid_bits = 20;
 285#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 286                /*
 287                 * When KVM is possible, we only use the top half of the
 288                 * PID space to avoid collisions between host and guest PIDs
 289                 * which can cause problems due to prefetch when exiting the
 290                 * guest with AIL=3
 291                 */
 292                mmu_base_pid = 1 << (mmu_pid_bits - 1);
 293#else
 294                mmu_base_pid = 1;
 295#endif
 296        } else {
 297                /* The guest uses the bottom half of the PID space */
 298                if (!mmu_pid_bits)
 299                        mmu_pid_bits = 19;
 300                mmu_base_pid = 1;
 301        }
 302
 303        /*
 304         * Allocate Partition table and process table for the
 305         * host.
 306         */
 307        BUG_ON(PRTB_SIZE_SHIFT > 36);
 308        process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT);
 309        /*
 310         * Fill in the process table.
 311         */
 312        rts_field = radix__get_tree_size();
 313        process_tb->prtb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE);
 314        /*
 315         * Fill in the partition table. We are suppose to use effective address
 316         * of process table here. But our linear mapping also enable us to use
 317         * physical address here.
 318         */
 319        register_process_table(__pa(process_tb), 0, PRTB_SIZE_SHIFT - 12);
 320        pr_info("Process table %p and radix root for kernel: %p\n", process_tb, init_mm.pgd);
 321        asm volatile("ptesync" : : : "memory");
 322        asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : :
 323                     "r" (TLBIEL_INVAL_SET_LPID), "r" (0));
 324        asm volatile("eieio; tlbsync; ptesync" : : : "memory");
 325        trace_tlbie(0, 0, TLBIEL_INVAL_SET_LPID, 0, 2, 1, 1);
 326}
 327
 328static void __init radix_init_partition_table(void)
 329{
 330        unsigned long rts_field, dw0;
 331
 332        mmu_partition_table_init();
 333        rts_field = radix__get_tree_size();
 334        dw0 = rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE | PATB_HR;
 335        mmu_partition_table_set_entry(0, dw0, 0);
 336
 337        pr_info("Initializing Radix MMU\n");
 338        pr_info("Partition table %p\n", partition_tb);
 339}
 340
 341void __init radix_init_native(void)
 342{
 343        register_process_table = native_register_process_table;
 344}
 345
 346static int __init get_idx_from_shift(unsigned int shift)
 347{
 348        int idx = -1;
 349
 350        switch (shift) {
 351        case 0xc:
 352                idx = MMU_PAGE_4K;
 353                break;
 354        case 0x10:
 355                idx = MMU_PAGE_64K;
 356                break;
 357        case 0x15:
 358                idx = MMU_PAGE_2M;
 359                break;
 360        case 0x1e:
 361                idx = MMU_PAGE_1G;
 362                break;
 363        }
 364        return idx;
 365}
 366
 367static int __init radix_dt_scan_page_sizes(unsigned long node,
 368                                           const char *uname, int depth,
 369                                           void *data)
 370{
 371        int size = 0;
 372        int shift, idx;
 373        unsigned int ap;
 374        const __be32 *prop;
 375        const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
 376
 377        /* We are scanning "cpu" nodes only */
 378        if (type == NULL || strcmp(type, "cpu") != 0)
 379                return 0;
 380
 381        /* Find MMU PID size */
 382        prop = of_get_flat_dt_prop(node, "ibm,mmu-pid-bits", &size);
 383        if (prop && size == 4)
 384                mmu_pid_bits = be32_to_cpup(prop);
 385
 386        /* Grab page size encodings */
 387        prop = of_get_flat_dt_prop(node, "ibm,processor-radix-AP-encodings", &size);
 388        if (!prop)
 389                return 0;
 390
 391        pr_info("Page sizes from device-tree:\n");
 392        for (; size >= 4; size -= 4, ++prop) {
 393
 394                struct mmu_psize_def *def;
 395
 396                /* top 3 bit is AP encoding */
 397                shift = be32_to_cpu(prop[0]) & ~(0xe << 28);
 398                ap = be32_to_cpu(prop[0]) >> 29;
 399                pr_info("Page size shift = %d AP=0x%x\n", shift, ap);
 400
 401                idx = get_idx_from_shift(shift);
 402                if (idx < 0)
 403                        continue;
 404
 405                def = &mmu_psize_defs[idx];
 406                def->shift = shift;
 407                def->ap  = ap;
 408        }
 409
 410        /* needed ? */
 411        cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B;
 412        return 1;
 413}
 414
 415void __init radix__early_init_devtree(void)
 416{
 417        int rc;
 418
 419        /*
 420         * Try to find the available page sizes in the device-tree
 421         */
 422        rc = of_scan_flat_dt(radix_dt_scan_page_sizes, NULL);
 423        if (rc != 0)  /* Found */
 424                goto found;
 425        /*
 426         * let's assume we have page 4k and 64k support
 427         */
 428        mmu_psize_defs[MMU_PAGE_4K].shift = 12;
 429        mmu_psize_defs[MMU_PAGE_4K].ap = 0x0;
 430
 431        mmu_psize_defs[MMU_PAGE_64K].shift = 16;
 432        mmu_psize_defs[MMU_PAGE_64K].ap = 0x5;
 433found:
 434#ifdef CONFIG_SPARSEMEM_VMEMMAP
 435        if (mmu_psize_defs[MMU_PAGE_2M].shift) {
 436                /*
 437                 * map vmemmap using 2M if available
 438                 */
 439                mmu_vmemmap_psize = MMU_PAGE_2M;
 440        }
 441#endif /* CONFIG_SPARSEMEM_VMEMMAP */
 442        return;
 443}
 444
 445static void update_hid_for_radix(void)
 446{
 447        unsigned long hid0;
 448        unsigned long rb = 3UL << PPC_BITLSHIFT(53); /* IS = 3 */
 449
 450        asm volatile("ptesync": : :"memory");
 451        /* prs = 0, ric = 2, rs = 0, r = 1 is = 3 */
 452        asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
 453                     : : "r"(rb), "i"(1), "i"(0), "i"(2), "r"(0) : "memory");
 454        /* prs = 1, ric = 2, rs = 0, r = 1 is = 3 */
 455        asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
 456                     : : "r"(rb), "i"(1), "i"(1), "i"(2), "r"(0) : "memory");
 457        asm volatile("eieio; tlbsync; ptesync; isync; slbia": : :"memory");
 458        trace_tlbie(0, 0, rb, 0, 2, 0, 1);
 459        trace_tlbie(0, 0, rb, 0, 2, 1, 1);
 460
 461        /*
 462         * now switch the HID
 463         */
 464        hid0  = mfspr(SPRN_HID0);
 465        hid0 |= HID0_POWER9_RADIX;
 466        mtspr(SPRN_HID0, hid0);
 467        asm volatile("isync": : :"memory");
 468
 469        /* Wait for it to happen */
 470        while (!(mfspr(SPRN_HID0) & HID0_POWER9_RADIX))
 471                cpu_relax();
 472}
 473
 474static void radix_init_amor(void)
 475{
 476        /*
 477        * In HV mode, we init AMOR (Authority Mask Override Register) so that
 478        * the hypervisor and guest can setup IAMR (Instruction Authority Mask
 479        * Register), enable key 0 and set it to 1.
 480        *
 481        * AMOR = 0b1100 .... 0000 (Mask for key 0 is 11)
 482        */
 483        mtspr(SPRN_AMOR, (3ul << 62));
 484}
 485
 486static void radix_init_iamr(void)
 487{
 488        unsigned long iamr;
 489
 490        /*
 491         * The IAMR should set to 0 on DD1.
 492         */
 493        if (cpu_has_feature(CPU_FTR_POWER9_DD1))
 494                iamr = 0;
 495        else
 496                iamr = (1ul << 62);
 497
 498        /*
 499         * Radix always uses key0 of the IAMR to determine if an access is
 500         * allowed. We set bit 0 (IBM bit 1) of key0, to prevent instruction
 501         * fetch.
 502         */
 503        mtspr(SPRN_IAMR, iamr);
 504}
 505
 506void __init radix__early_init_mmu(void)
 507{
 508        unsigned long lpcr;
 509
 510#ifdef CONFIG_PPC_64K_PAGES
 511        /* PAGE_SIZE mappings */
 512        mmu_virtual_psize = MMU_PAGE_64K;
 513#else
 514        mmu_virtual_psize = MMU_PAGE_4K;
 515#endif
 516
 517#ifdef CONFIG_SPARSEMEM_VMEMMAP
 518        /* vmemmap mapping */
 519        mmu_vmemmap_psize = mmu_virtual_psize;
 520#endif
 521        /*
 522         * initialize page table size
 523         */
 524        __pte_index_size = RADIX_PTE_INDEX_SIZE;
 525        __pmd_index_size = RADIX_PMD_INDEX_SIZE;
 526        __pud_index_size = RADIX_PUD_INDEX_SIZE;
 527        __pgd_index_size = RADIX_PGD_INDEX_SIZE;
 528        __pmd_cache_index = RADIX_PMD_INDEX_SIZE;
 529        __pte_table_size = RADIX_PTE_TABLE_SIZE;
 530        __pmd_table_size = RADIX_PMD_TABLE_SIZE;
 531        __pud_table_size = RADIX_PUD_TABLE_SIZE;
 532        __pgd_table_size = RADIX_PGD_TABLE_SIZE;
 533
 534        __pmd_val_bits = RADIX_PMD_VAL_BITS;
 535        __pud_val_bits = RADIX_PUD_VAL_BITS;
 536        __pgd_val_bits = RADIX_PGD_VAL_BITS;
 537
 538        __kernel_virt_start = RADIX_KERN_VIRT_START;
 539        __kernel_virt_size = RADIX_KERN_VIRT_SIZE;
 540        __vmalloc_start = RADIX_VMALLOC_START;
 541        __vmalloc_end = RADIX_VMALLOC_END;
 542        __kernel_io_start = RADIX_KERN_IO_START;
 543        vmemmap = (struct page *)RADIX_VMEMMAP_BASE;
 544        ioremap_bot = IOREMAP_BASE;
 545
 546#ifdef CONFIG_PCI
 547        pci_io_base = ISA_IO_BASE;
 548#endif
 549
 550        /*
 551         * For now radix also use the same frag size
 552         */
 553        __pte_frag_nr = H_PTE_FRAG_NR;
 554        __pte_frag_size_shift = H_PTE_FRAG_SIZE_SHIFT;
 555
 556        if (!firmware_has_feature(FW_FEATURE_LPAR)) {
 557                radix_init_native();
 558                if (cpu_has_feature(CPU_FTR_POWER9_DD1))
 559                        update_hid_for_radix();
 560                lpcr = mfspr(SPRN_LPCR);
 561                mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
 562                radix_init_partition_table();
 563                radix_init_amor();
 564        } else {
 565                radix_init_pseries();
 566        }
 567
 568        memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);
 569
 570        radix_init_iamr();
 571        radix_init_pgtable();
 572}
 573
 574void radix__early_init_mmu_secondary(void)
 575{
 576        unsigned long lpcr;
 577        /*
 578         * update partition table control register and UPRT
 579         */
 580        if (!firmware_has_feature(FW_FEATURE_LPAR)) {
 581
 582                if (cpu_has_feature(CPU_FTR_POWER9_DD1))
 583                        update_hid_for_radix();
 584
 585                lpcr = mfspr(SPRN_LPCR);
 586                mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
 587
 588                mtspr(SPRN_PTCR,
 589                      __pa(partition_tb) | (PATB_SIZE_SHIFT - 12));
 590                radix_init_amor();
 591        }
 592        radix_init_iamr();
 593}
 594
 595void radix__mmu_cleanup_all(void)
 596{
 597        unsigned long lpcr;
 598
 599        if (!firmware_has_feature(FW_FEATURE_LPAR)) {
 600                lpcr = mfspr(SPRN_LPCR);
 601                mtspr(SPRN_LPCR, lpcr & ~LPCR_UPRT);
 602                mtspr(SPRN_PTCR, 0);
 603                powernv_set_nmmu_ptcr(0);
 604                radix__flush_tlb_all();
 605        }
 606}
 607
 608void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base,
 609                                phys_addr_t first_memblock_size)
 610{
 611        /* We don't currently support the first MEMBLOCK not mapping 0
 612         * physical on those processors
 613         */
 614        BUG_ON(first_memblock_base != 0);
 615        /*
 616         * We limit the allocation that depend on ppc64_rma_size
 617         * to first_memblock_size. We also clamp it to 1GB to
 618         * avoid some funky things such as RTAS bugs.
 619         *
 620         * On radix config we really don't have a limitation
 621         * on real mode access. But keeping it as above works
 622         * well enough.
 623         */
 624        ppc64_rma_size = min_t(u64, first_memblock_size, 0x40000000);
 625        /*
 626         * Finally limit subsequent allocations. We really don't want
 627         * to limit the memblock allocations to rma_size. FIXME!! should
 628         * we even limit at all ?
 629         */
 630        memblock_set_current_limit(first_memblock_base + first_memblock_size);
 631}
 632
 633#ifdef CONFIG_MEMORY_HOTPLUG
 634static void free_pte_table(pte_t *pte_start, pmd_t *pmd)
 635{
 636        pte_t *pte;
 637        int i;
 638
 639        for (i = 0; i < PTRS_PER_PTE; i++) {
 640                pte = pte_start + i;
 641                if (!pte_none(*pte))
 642                        return;
 643        }
 644
 645        pte_free_kernel(&init_mm, pte_start);
 646        pmd_clear(pmd);
 647}
 648
 649static void free_pmd_table(pmd_t *pmd_start, pud_t *pud)
 650{
 651        pmd_t *pmd;
 652        int i;
 653
 654        for (i = 0; i < PTRS_PER_PMD; i++) {
 655                pmd = pmd_start + i;
 656                if (!pmd_none(*pmd))
 657                        return;
 658        }
 659
 660        pmd_free(&init_mm, pmd_start);
 661        pud_clear(pud);
 662}
 663
 664static void remove_pte_table(pte_t *pte_start, unsigned long addr,
 665                             unsigned long end)
 666{
 667        unsigned long next;
 668        pte_t *pte;
 669
 670        pte = pte_start + pte_index(addr);
 671        for (; addr < end; addr = next, pte++) {
 672                next = (addr + PAGE_SIZE) & PAGE_MASK;
 673                if (next > end)
 674                        next = end;
 675
 676                if (!pte_present(*pte))
 677                        continue;
 678
 679                if (!PAGE_ALIGNED(addr) || !PAGE_ALIGNED(next)) {
 680                        /*
 681                         * The vmemmap_free() and remove_section_mapping()
 682                         * codepaths call us with aligned addresses.
 683                         */
 684                        WARN_ONCE(1, "%s: unaligned range\n", __func__);
 685                        continue;
 686                }
 687
 688                pte_clear(&init_mm, addr, pte);
 689        }
 690}
 691
 692static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
 693                             unsigned long end)
 694{
 695        unsigned long next;
 696        pte_t *pte_base;
 697        pmd_t *pmd;
 698
 699        pmd = pmd_start + pmd_index(addr);
 700        for (; addr < end; addr = next, pmd++) {
 701                next = pmd_addr_end(addr, end);
 702
 703                if (!pmd_present(*pmd))
 704                        continue;
 705
 706                if (pmd_huge(*pmd)) {
 707                        if (!IS_ALIGNED(addr, PMD_SIZE) ||
 708                            !IS_ALIGNED(next, PMD_SIZE)) {
 709                                WARN_ONCE(1, "%s: unaligned range\n", __func__);
 710                                continue;
 711                        }
 712
 713                        pte_clear(&init_mm, addr, (pte_t *)pmd);
 714                        continue;
 715                }
 716
 717                pte_base = (pte_t *)pmd_page_vaddr(*pmd);
 718                remove_pte_table(pte_base, addr, next);
 719                free_pte_table(pte_base, pmd);
 720        }
 721}
 722
 723static void remove_pud_table(pud_t *pud_start, unsigned long addr,
 724                             unsigned long end)
 725{
 726        unsigned long next;
 727        pmd_t *pmd_base;
 728        pud_t *pud;
 729
 730        pud = pud_start + pud_index(addr);
 731        for (; addr < end; addr = next, pud++) {
 732                next = pud_addr_end(addr, end);
 733
 734                if (!pud_present(*pud))
 735                        continue;
 736
 737                if (pud_huge(*pud)) {
 738                        if (!IS_ALIGNED(addr, PUD_SIZE) ||
 739                            !IS_ALIGNED(next, PUD_SIZE)) {
 740                                WARN_ONCE(1, "%s: unaligned range\n", __func__);
 741                                continue;
 742                        }
 743
 744                        pte_clear(&init_mm, addr, (pte_t *)pud);
 745                        continue;
 746                }
 747
 748                pmd_base = (pmd_t *)pud_page_vaddr(*pud);
 749                remove_pmd_table(pmd_base, addr, next);
 750                free_pmd_table(pmd_base, pud);
 751        }
 752}
 753
 754static void remove_pagetable(unsigned long start, unsigned long end)
 755{
 756        unsigned long addr, next;
 757        pud_t *pud_base;
 758        pgd_t *pgd;
 759
 760        spin_lock(&init_mm.page_table_lock);
 761
 762        for (addr = start; addr < end; addr = next) {
 763                next = pgd_addr_end(addr, end);
 764
 765                pgd = pgd_offset_k(addr);
 766                if (!pgd_present(*pgd))
 767                        continue;
 768
 769                if (pgd_huge(*pgd)) {
 770                        if (!IS_ALIGNED(addr, PGDIR_SIZE) ||
 771                            !IS_ALIGNED(next, PGDIR_SIZE)) {
 772                                WARN_ONCE(1, "%s: unaligned range\n", __func__);
 773                                continue;
 774                        }
 775
 776                        pte_clear(&init_mm, addr, (pte_t *)pgd);
 777                        continue;
 778                }
 779
 780                pud_base = (pud_t *)pgd_page_vaddr(*pgd);
 781                remove_pud_table(pud_base, addr, next);
 782        }
 783
 784        spin_unlock(&init_mm.page_table_lock);
 785        radix__flush_tlb_kernel_range(start, end);
 786}
 787
 788int __ref radix__create_section_mapping(unsigned long start, unsigned long end)
 789{
 790        return create_physical_mapping(start, end);
 791}
 792
 793int radix__remove_section_mapping(unsigned long start, unsigned long end)
 794{
 795        remove_pagetable(start, end);
 796        return 0;
 797}
 798#endif /* CONFIG_MEMORY_HOTPLUG */
 799
 800#ifdef CONFIG_SPARSEMEM_VMEMMAP
 801int __meminit radix__vmemmap_create_mapping(unsigned long start,
 802                                      unsigned long page_size,
 803                                      unsigned long phys)
 804{
 805        /* Create a PTE encoding */
 806        unsigned long flags = _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_KERNEL_RW;
 807
 808        BUG_ON(radix__map_kernel_page(start, phys, __pgprot(flags), page_size));
 809        return 0;
 810}
 811
 812#ifdef CONFIG_MEMORY_HOTPLUG
 813void radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size)
 814{
 815        remove_pagetable(start, start + page_size);
 816}
 817#endif
 818#endif
 819
 820#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 821
 822unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
 823                                  pmd_t *pmdp, unsigned long clr,
 824                                  unsigned long set)
 825{
 826        unsigned long old;
 827
 828#ifdef CONFIG_DEBUG_VM
 829        WARN_ON(!radix__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
 830        assert_spin_locked(&mm->page_table_lock);
 831#endif
 832
 833        old = radix__pte_update(mm, addr, (pte_t *)pmdp, clr, set, 1);
 834        trace_hugepage_update(addr, old, clr, set);
 835
 836        return old;
 837}
 838
 839pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
 840                        pmd_t *pmdp)
 841
 842{
 843        pmd_t pmd;
 844
 845        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 846        VM_BUG_ON(radix__pmd_trans_huge(*pmdp));
 847        VM_BUG_ON(pmd_devmap(*pmdp));
 848        /*
 849         * khugepaged calls this for normal pmd
 850         */
 851        pmd = *pmdp;
 852        pmd_clear(pmdp);
 853
 854        /*FIXME!!  Verify whether we need this kick below */
 855        serialize_against_pte_lookup(vma->vm_mm);
 856
 857        radix__flush_tlb_collapsed_pmd(vma->vm_mm, address);
 858
 859        return pmd;
 860}
 861
 862/*
 863 * For us pgtable_t is pte_t *. Inorder to save the deposisted
 864 * page table, we consider the allocated page table as a list
 865 * head. On withdraw we need to make sure we zero out the used
 866 * list_head memory area.
 867 */
 868void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
 869                                 pgtable_t pgtable)
 870{
 871        struct list_head *lh = (struct list_head *) pgtable;
 872
 873        assert_spin_locked(pmd_lockptr(mm, pmdp));
 874
 875        /* FIFO */
 876        if (!pmd_huge_pte(mm, pmdp))
 877                INIT_LIST_HEAD(lh);
 878        else
 879                list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
 880        pmd_huge_pte(mm, pmdp) = pgtable;
 881}
 882
 883pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
 884{
 885        pte_t *ptep;
 886        pgtable_t pgtable;
 887        struct list_head *lh;
 888
 889        assert_spin_locked(pmd_lockptr(mm, pmdp));
 890
 891        /* FIFO */
 892        pgtable = pmd_huge_pte(mm, pmdp);
 893        lh = (struct list_head *) pgtable;
 894        if (list_empty(lh))
 895                pmd_huge_pte(mm, pmdp) = NULL;
 896        else {
 897                pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
 898                list_del(lh);
 899        }
 900        ptep = (pte_t *) pgtable;
 901        *ptep = __pte(0);
 902        ptep++;
 903        *ptep = __pte(0);
 904        return pgtable;
 905}
 906
 907
 908pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
 909                               unsigned long addr, pmd_t *pmdp)
 910{
 911        pmd_t old_pmd;
 912        unsigned long old;
 913
 914        old = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
 915        old_pmd = __pmd(old);
 916        /*
 917         * Serialize against find_current_mm_pte which does lock-less
 918         * lookup in page tables with local interrupts disabled. For huge pages
 919         * it casts pmd_t to pte_t. Since format of pte_t is different from
 920         * pmd_t we want to prevent transit from pmd pointing to page table
 921         * to pmd pointing to huge page (and back) while interrupts are disabled.
 922         * We clear pmd to possibly replace it with page table pointer in
 923         * different code paths. So make sure we wait for the parallel
 924         * find_current_mm_pte to finish.
 925         */
 926        serialize_against_pte_lookup(mm);
 927        return old_pmd;
 928}
 929
 930int radix__has_transparent_hugepage(void)
 931{
 932        /* For radix 2M at PMD level means thp */
 933        if (mmu_psize_defs[MMU_PAGE_2M].shift == PMD_SHIFT)
 934                return 1;
 935        return 0;
 936}
 937#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 938