linux/arch/arm64/mm/mmu.c
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Based on arch/arm/mm/mmu.c
   4 *
   5 * Copyright (C) 1995-2005 Russell King
   6 * Copyright (C) 2012 ARM Ltd.
   7 */
   8
   9#include <linux/cache.h>
  10#include <linux/export.h>
  11#include <linux/kernel.h>
  12#include <linux/errno.h>
  13#include <linux/init.h>
  14#include <linux/ioport.h>
  15#include <linux/kexec.h>
  16#include <linux/libfdt.h>
  17#include <linux/mman.h>
  18#include <linux/nodemask.h>
  19#include <linux/memblock.h>
  20#include <linux/memory.h>
  21#include <linux/fs.h>
  22#include <linux/io.h>
  23#include <linux/mm.h>
  24#include <linux/vmalloc.h>
  25
  26#include <asm/barrier.h>
  27#include <asm/cputype.h>
  28#include <asm/fixmap.h>
  29#include <asm/kasan.h>
  30#include <asm/kernel-pgtable.h>
  31#include <asm/sections.h>
  32#include <asm/setup.h>
  33#include <linux/sizes.h>
  34#include <asm/tlb.h>
  35#include <asm/mmu_context.h>
  36#include <asm/ptdump.h>
  37#include <asm/tlbflush.h>
  38#include <asm/pgalloc.h>
  39
  40#define NO_BLOCK_MAPPINGS       BIT(0)
  41#define NO_CONT_MAPPINGS        BIT(1)
  42
  43u64 idmap_t0sz = TCR_T0SZ(VA_BITS);
  44u64 idmap_ptrs_per_pgd = PTRS_PER_PGD;
  45
  46u64 __section(".mmuoff.data.write") vabits_actual;
  47EXPORT_SYMBOL(vabits_actual);
  48
  49u64 kimage_voffset __ro_after_init;
  50EXPORT_SYMBOL(kimage_voffset);
  51
  52/*
   53 * empty_zero_page is a special page that is used for zero-initialized data
  54 * and COW.
  55 */
  56unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss;
  57EXPORT_SYMBOL(empty_zero_page);
  58
  59static pte_t bm_pte[PTRS_PER_PTE] __page_aligned_bss;
  60static pmd_t bm_pmd[PTRS_PER_PMD] __page_aligned_bss __maybe_unused;
  61static pud_t bm_pud[PTRS_PER_PUD] __page_aligned_bss __maybe_unused;
  62
  63static DEFINE_SPINLOCK(swapper_pgdir_lock);
  64
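     /*
      * Updates to swapper_pg_dir are made through a writable alias installed
      * in the PGD fixmap slot (the swapper pgdir itself may be mapped
      * read-only), serialised by swapper_pgdir_lock.
      */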
  65void set_swapper_pgd(pgd_t *pgdp, pgd_t pgd)
  66{
  67        pgd_t *fixmap_pgdp;
  68
  69        spin_lock(&swapper_pgdir_lock);
  70        fixmap_pgdp = pgd_set_fixmap(__pa_symbol(pgdp));
  71        WRITE_ONCE(*fixmap_pgdp, pgd);
  72        /*
  73         * We need dsb(ishst) here to ensure the page-table-walker sees
  74         * our new entry before set_p?d() returns. The fixmap's
  75         * flush_tlb_kernel_range() via clear_fixmap() does this for us.
  76         */
  77        pgd_clear_fixmap();
  78        spin_unlock(&swapper_pgdir_lock);
  79}
  80
  81pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
  82                              unsigned long size, pgprot_t vma_prot)
  83{
  84        if (!pfn_valid(pfn))
  85                return pgprot_noncached(vma_prot);
  86        else if (file->f_flags & O_SYNC)
  87                return pgprot_writecombine(vma_prot);
  88        return vma_prot;
  89}
  90EXPORT_SYMBOL(phys_mem_access_prot);
  91
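     /*
      * Early page-table allocator: grab a page from memblock and zero it via
      * the PTE fixmap slot, since the new page may not be covered by the
      * linear mapping yet.
      */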
  92static phys_addr_t __init early_pgtable_alloc(int shift)
  93{
  94        phys_addr_t phys;
  95        void *ptr;
  96
  97        phys = memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE);
  98        if (!phys)
  99                panic("Failed to allocate page table page\n");
 100
 101        /*
 102         * The FIX_{PGD,PUD,PMD} slots may be in active use, but the FIX_PTE
 103         * slot will be free, so we can (ab)use the FIX_PTE slot to initialise
 104         * any level of table.
 105         */
 106        ptr = pte_set_fixmap(phys);
 107
 108        memset(ptr, 0, PAGE_SIZE);
 109
 110        /*
 111         * Implicit barriers also ensure the zeroed page is visible to the page
 112         * table walker
 113         */
 114        pte_clear_fixmap();
 115
 116        return phys;
 117}
 118
 119static bool pgattr_change_is_safe(u64 old, u64 new)
 120{
 121        /*
 122         * The following mapping attributes may be updated in live
 123         * kernel mappings without the need for break-before-make.
 124         */
 125        static const pteval_t mask = PTE_PXN | PTE_RDONLY | PTE_WRITE | PTE_NG;
 126
 127        /* creating or taking down mappings is always safe */
 128        if (old == 0 || new == 0)
 129                return true;
 130
 131        /* live contiguous mappings may not be manipulated at all */
 132        if ((old | new) & PTE_CONT)
 133                return false;
 134
 135        /* Transitioning from Non-Global to Global is unsafe */
 136        if (old & ~new & PTE_NG)
 137                return false;
 138
 139        return ((old ^ new) & ~mask) == 0;
 140}
 141
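     /*
      * Populate the PTE level for [addr, end) with mappings of @phys using
      * @prot. The table is accessed through the PTE fixmap slot, and any
      * change to an already-live entry must pass pgattr_change_is_safe().
      */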
 142static void init_pte(pmd_t *pmdp, unsigned long addr, unsigned long end,
 143                     phys_addr_t phys, pgprot_t prot)
 144{
 145        pte_t *ptep;
 146
 147        ptep = pte_set_fixmap_offset(pmdp, addr);
 148        do {
 149                pte_t old_pte = READ_ONCE(*ptep);
 150
 151                set_pte(ptep, pfn_pte(__phys_to_pfn(phys), prot));
 152
 153                /*
 154                 * After the PTE entry has been populated once, we
 155                 * only allow updates to the permission attributes.
 156                 */
 157                BUG_ON(!pgattr_change_is_safe(pte_val(old_pte),
 158                                              READ_ONCE(pte_val(*ptep))));
 159
 160                phys += PAGE_SIZE;
 161        } while (ptep++, addr += PAGE_SIZE, addr != end);
 162
 163        pte_clear_fixmap();
 164}
 165
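     /*
      * Allocate a PTE table below @pmdp if necessary, then map the range in
      * CONT_PTE-sized chunks, setting PTE_CONT whenever the virtual and
      * physical addresses are suitably aligned and @flags permits contiguous
      * mappings.
      */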
 166static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
 167                                unsigned long end, phys_addr_t phys,
 168                                pgprot_t prot,
 169                                phys_addr_t (*pgtable_alloc)(int),
 170                                int flags)
 171{
 172        unsigned long next;
 173        pmd_t pmd = READ_ONCE(*pmdp);
 174
 175        BUG_ON(pmd_sect(pmd));
 176        if (pmd_none(pmd)) {
 177                phys_addr_t pte_phys;
 178                BUG_ON(!pgtable_alloc);
 179                pte_phys = pgtable_alloc(PAGE_SHIFT);
 180                __pmd_populate(pmdp, pte_phys, PMD_TYPE_TABLE);
 181                pmd = READ_ONCE(*pmdp);
 182        }
 183        BUG_ON(pmd_bad(pmd));
 184
 185        do {
 186                pgprot_t __prot = prot;
 187
 188                next = pte_cont_addr_end(addr, end);
 189
 190                /* use a contiguous mapping if the range is suitably aligned */
 191                if ((((addr | next | phys) & ~CONT_PTE_MASK) == 0) &&
 192                    (flags & NO_CONT_MAPPINGS) == 0)
 193                        __prot = __pgprot(pgprot_val(prot) | PTE_CONT);
 194
 195                init_pte(pmdp, addr, next, phys, __prot);
 196
 197                phys += next - addr;
 198        } while (addr = next, addr != end);
 199}
 200
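     /*
      * Populate the PMD level for [addr, end): use a section (block) mapping
      * when the range and @phys are section-aligned and @flags allows it,
      * otherwise descend to the PTE level via alloc_init_cont_pte().
      */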
 201static void init_pmd(pud_t *pudp, unsigned long addr, unsigned long end,
 202                     phys_addr_t phys, pgprot_t prot,
 203                     phys_addr_t (*pgtable_alloc)(int), int flags)
 204{
 205        unsigned long next;
 206        pmd_t *pmdp;
 207
 208        pmdp = pmd_set_fixmap_offset(pudp, addr);
 209        do {
 210                pmd_t old_pmd = READ_ONCE(*pmdp);
 211
 212                next = pmd_addr_end(addr, end);
 213
 214                /* try section mapping first */
 215                if (((addr | next | phys) & ~SECTION_MASK) == 0 &&
 216                    (flags & NO_BLOCK_MAPPINGS) == 0) {
 217                        pmd_set_huge(pmdp, phys, prot);
 218
 219                        /*
 220                         * After the PMD entry has been populated once, we
 221                         * only allow updates to the permission attributes.
 222                         */
 223                        BUG_ON(!pgattr_change_is_safe(pmd_val(old_pmd),
 224                                                      READ_ONCE(pmd_val(*pmdp))));
 225                } else {
 226                        alloc_init_cont_pte(pmdp, addr, next, phys, prot,
 227                                            pgtable_alloc, flags);
 228
 229                        BUG_ON(pmd_val(old_pmd) != 0 &&
 230                               pmd_val(old_pmd) != READ_ONCE(pmd_val(*pmdp)));
 231                }
 232                phys += next - addr;
 233        } while (pmdp++, addr = next, addr != end);
 234
 235        pmd_clear_fixmap();
 236}
 237
 238static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
 239                                unsigned long end, phys_addr_t phys,
 240                                pgprot_t prot,
 241                                phys_addr_t (*pgtable_alloc)(int), int flags)
 242{
 243        unsigned long next;
 244        pud_t pud = READ_ONCE(*pudp);
 245
 246        /*
 247         * Check for initial section mappings in the pgd/pud.
 248         */
 249        BUG_ON(pud_sect(pud));
 250        if (pud_none(pud)) {
 251                phys_addr_t pmd_phys;
 252                BUG_ON(!pgtable_alloc);
 253                pmd_phys = pgtable_alloc(PMD_SHIFT);
 254                __pud_populate(pudp, pmd_phys, PUD_TYPE_TABLE);
 255                pud = READ_ONCE(*pudp);
 256        }
 257        BUG_ON(pud_bad(pud));
 258
 259        do {
 260                pgprot_t __prot = prot;
 261
 262                next = pmd_cont_addr_end(addr, end);
 263
 264                /* use a contiguous mapping if the range is suitably aligned */
 265                if ((((addr | next | phys) & ~CONT_PMD_MASK) == 0) &&
 266                    (flags & NO_CONT_MAPPINGS) == 0)
 267                        __prot = __pgprot(pgprot_val(prot) | PTE_CONT);
 268
 269                init_pmd(pudp, addr, next, phys, __prot, pgtable_alloc, flags);
 270
 271                phys += next - addr;
 272        } while (addr = next, addr != end);
 273}
 274
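     /*
      * A 1GB (PUD-level) block can only be used with 4K pages and when the
      * virtual range and physical address are all PUD-aligned.
      */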
 275static inline bool use_1G_block(unsigned long addr, unsigned long next,
 276                        unsigned long phys)
 277{
 278        if (PAGE_SHIFT != 12)
 279                return false;
 280
 281        if (((addr | next | phys) & ~PUD_MASK) != 0)
 282                return false;
 283
 284        return true;
 285}
 286
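     /*
      * Populate the PUD level for [addr, end), allocating a PUD table below
      * the p4d entry if necessary and using 1GB block mappings where possible.
      */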
 287static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long end,
 288                           phys_addr_t phys, pgprot_t prot,
 289                           phys_addr_t (*pgtable_alloc)(int),
 290                           int flags)
 291{
 292        unsigned long next;
 293        pud_t *pudp;
 294        p4d_t *p4dp = p4d_offset(pgdp, addr);
 295        p4d_t p4d = READ_ONCE(*p4dp);
 296
 297        if (p4d_none(p4d)) {
 298                phys_addr_t pud_phys;
 299                BUG_ON(!pgtable_alloc);
 300                pud_phys = pgtable_alloc(PUD_SHIFT);
 301                __p4d_populate(p4dp, pud_phys, PUD_TYPE_TABLE);
 302                p4d = READ_ONCE(*p4dp);
 303        }
 304        BUG_ON(p4d_bad(p4d));
 305
 306        pudp = pud_set_fixmap_offset(p4dp, addr);
 307        do {
 308                pud_t old_pud = READ_ONCE(*pudp);
 309
 310                next = pud_addr_end(addr, end);
 311
 312                /*
 313                 * For 4K granule only, attempt to put down a 1GB block
 314                 */
 315                if (use_1G_block(addr, next, phys) &&
 316                    (flags & NO_BLOCK_MAPPINGS) == 0) {
 317                        pud_set_huge(pudp, phys, prot);
 318
 319                        /*
 320                         * After the PUD entry has been populated once, we
 321                         * only allow updates to the permission attributes.
 322                         */
 323                        BUG_ON(!pgattr_change_is_safe(pud_val(old_pud),
 324                                                      READ_ONCE(pud_val(*pudp))));
 325                } else {
 326                        alloc_init_cont_pmd(pudp, addr, next, phys, prot,
 327                                            pgtable_alloc, flags);
 328
 329                        BUG_ON(pud_val(old_pud) != 0 &&
 330                               pud_val(old_pud) != READ_ONCE(pud_val(*pudp)));
 331                }
 332                phys += next - addr;
 333        } while (pudp++, addr = next, addr != end);
 334
 335        pud_clear_fixmap();
 336}
 337
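     /*
      * Core mapping routine: create or update mappings of [virt, virt + size)
      * to @phys in @pgdir with @prot. @pgtable_alloc provides new table pages
      * (NULL means no allocation is allowed), and @flags may forbid block
      * and/or contiguous mappings.
      */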
 338static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
 339                                 unsigned long virt, phys_addr_t size,
 340                                 pgprot_t prot,
 341                                 phys_addr_t (*pgtable_alloc)(int),
 342                                 int flags)
 343{
 344        unsigned long addr, end, next;
 345        pgd_t *pgdp = pgd_offset_pgd(pgdir, virt);
 346
 347        /*
 348         * If the virtual and physical address don't have the same offset
 349         * within a page, we cannot map the region as the caller expects.
 350         */
 351        if (WARN_ON((phys ^ virt) & ~PAGE_MASK))
 352                return;
 353
 354        phys &= PAGE_MASK;
 355        addr = virt & PAGE_MASK;
 356        end = PAGE_ALIGN(virt + size);
 357
 358        do {
 359                next = pgd_addr_end(addr, end);
 360                alloc_init_pud(pgdp, addr, next, phys, prot, pgtable_alloc,
 361                               flags);
 362                phys += next - addr;
 363        } while (pgdp++, addr = next, addr != end);
 364}
 365
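     /*
      * Late page-table allocator, used once the buddy allocator is available.
      * GFP_PGTABLE_KERNEL includes __GFP_ZERO, so the page is already zeroed;
      * only the barrier making it visible to the table walker is needed here.
      */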
 366static phys_addr_t __pgd_pgtable_alloc(int shift)
 367{
 368        void *ptr = (void *)__get_free_page(GFP_PGTABLE_KERNEL);
 369        BUG_ON(!ptr);
 370
 371        /* Ensure the zeroed page is visible to the page table walker */
 372        dsb(ishst);
 373        return __pa(ptr);
 374}
 375
 376static phys_addr_t pgd_pgtable_alloc(int shift)
 377{
 378        phys_addr_t pa = __pgd_pgtable_alloc(shift);
 379
 380        /*
  381         * Call the proper page table ctor in case we later need to call
  382         * core mm functions like apply_to_page_range() on this
  383         * pre-allocated page table.
  384         *
  385         * We don't select ARCH_ENABLE_SPLIT_PMD_PTLOCK when the pmd is
  386         * folded; in that case pgtable_pmd_page_ctor() is a no-op.
 387         */
 388        if (shift == PAGE_SHIFT)
 389                BUG_ON(!pgtable_pte_page_ctor(phys_to_page(pa)));
 390        else if (shift == PMD_SHIFT)
 391                BUG_ON(!pgtable_pmd_page_ctor(phys_to_page(pa)));
 392
 393        return pa;
 394}
 395
 396/*
 397 * This function can only be used to modify existing table entries,
 398 * without allocating new levels of table. Note that this permits the
 399 * creation of new section or page entries.
 400 */
 401static void __init create_mapping_noalloc(phys_addr_t phys, unsigned long virt,
 402                                  phys_addr_t size, pgprot_t prot)
 403{
 404        if ((virt >= PAGE_END) && (virt < VMALLOC_START)) {
 405                pr_warn("BUG: not creating mapping for %pa at 0x%016lx - outside kernel range\n",
 406                        &phys, virt);
 407                return;
 408        }
 409        __create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL,
 410                             NO_CONT_MAPPINGS);
 411}
 412
 413void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
 414                               unsigned long virt, phys_addr_t size,
 415                               pgprot_t prot, bool page_mappings_only)
 416{
 417        int flags = 0;
 418
 419        BUG_ON(mm == &init_mm);
 420
 421        if (page_mappings_only)
 422                flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
 423
 424        __create_pgd_mapping(mm->pgd, phys, virt, size, prot,
 425                             pgd_pgtable_alloc, flags);
 426}
 427
 428static void update_mapping_prot(phys_addr_t phys, unsigned long virt,
 429                                phys_addr_t size, pgprot_t prot)
 430{
 431        if ((virt >= PAGE_END) && (virt < VMALLOC_START)) {
 432                pr_warn("BUG: not updating mapping for %pa at 0x%016lx - outside kernel range\n",
 433                        &phys, virt);
 434                return;
 435        }
 436
 437        __create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL,
 438                             NO_CONT_MAPPINGS);
 439
 440        /* flush the TLBs after updating live kernel mappings */
 441        flush_tlb_kernel_range(virt, virt + size);
 442}
 443
 444static void __init __map_memblock(pgd_t *pgdp, phys_addr_t start,
 445                                  phys_addr_t end, pgprot_t prot, int flags)
 446{
 447        __create_pgd_mapping(pgdp, start, __phys_to_virt(start), end - start,
 448                             prot, early_pgtable_alloc, flags);
 449}
 450
 451void __init mark_linear_text_alias_ro(void)
 452{
 453        /*
 454         * Remove the write permissions from the linear alias of .text/.rodata
 455         */
 456        update_mapping_prot(__pa_symbol(_text), (unsigned long)lm_alias(_text),
 457                            (unsigned long)__init_begin - (unsigned long)_text,
 458                            PAGE_KERNEL_RO);
 459}
 460
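     /*
      * Create the linear mapping for all memory known to memblock. The linear
      * alias of the kernel image and the crashkernel region are mapped
      * separately with more restrictive flags.
      */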
 461static void __init map_mem(pgd_t *pgdp)
 462{
 463        phys_addr_t kernel_start = __pa_symbol(_text);
 464        phys_addr_t kernel_end = __pa_symbol(__init_begin);
 465        struct memblock_region *reg;
 466        int flags = 0;
 467
 468        if (rodata_full || debug_pagealloc_enabled())
 469                flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
 470
 471        /*
 472         * Take care not to create a writable alias for the
 473         * read-only text and rodata sections of the kernel image.
  474         * So temporarily mark them as NOMAP to skip mappings in
  475         * the following for-loop.
 476         */
 477        memblock_mark_nomap(kernel_start, kernel_end - kernel_start);
 478#ifdef CONFIG_KEXEC_CORE
 479        if (crashk_res.end)
 480                memblock_mark_nomap(crashk_res.start,
 481                                    resource_size(&crashk_res));
 482#endif
 483
 484        /* map all the memory banks */
 485        for_each_memblock(memory, reg) {
 486                phys_addr_t start = reg->base;
 487                phys_addr_t end = start + reg->size;
 488
 489                if (start >= end)
 490                        break;
 491                if (memblock_is_nomap(reg))
 492                        continue;
 493
 494                __map_memblock(pgdp, start, end, PAGE_KERNEL, flags);
 495        }
 496
 497        /*
 498         * Map the linear alias of the [_text, __init_begin) interval
 499         * as non-executable now, and remove the write permission in
  500         * mark_linear_text_alias_ro() (which will be called later, after
 501         * alternative patching has completed). This makes the contents
 502         * of the region accessible to subsystems such as hibernate,
 503         * but protects it from inadvertent modification or execution.
 504         * Note that contiguous mappings cannot be remapped in this way,
 505         * so we should avoid them here.
 506         */
 507        __map_memblock(pgdp, kernel_start, kernel_end,
 508                       PAGE_KERNEL, NO_CONT_MAPPINGS);
 509        memblock_clear_nomap(kernel_start, kernel_end - kernel_start);
 510
 511#ifdef CONFIG_KEXEC_CORE
 512        /*
  513         * Use page-level mappings here so that the region can be shrunk
  514         * at page granularity and unused memory returned to the buddy
  515         * system through the /sys/kernel/kexec_crash_size interface.
 516         */
 517        if (crashk_res.end) {
 518                __map_memblock(pgdp, crashk_res.start, crashk_res.end + 1,
 519                               PAGE_KERNEL,
 520                               NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS);
 521                memblock_clear_nomap(crashk_res.start,
 522                                     resource_size(&crashk_res));
 523        }
 524#endif
 525}
 526
 527void mark_rodata_ro(void)
 528{
 529        unsigned long section_size;
 530
 531        /*
  532         * Mark .rodata as read-only. Use __init_begin rather than __end_rodata
 533         * to cover NOTES and EXCEPTION_TABLE.
 534         */
 535        section_size = (unsigned long)__init_begin - (unsigned long)__start_rodata;
 536        update_mapping_prot(__pa_symbol(__start_rodata), (unsigned long)__start_rodata,
 537                            section_size, PAGE_KERNEL_RO);
 538
 539        debug_checkwx();
 540}
 541
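     /*
      * Map a single kernel image segment [va_start, va_end) with @prot and
      * record it as an early vm_struct so the region is reserved in the
      * vmalloc space.
      */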
 542static void __init map_kernel_segment(pgd_t *pgdp, void *va_start, void *va_end,
 543                                      pgprot_t prot, struct vm_struct *vma,
 544                                      int flags, unsigned long vm_flags)
 545{
 546        phys_addr_t pa_start = __pa_symbol(va_start);
 547        unsigned long size = va_end - va_start;
 548
 549        BUG_ON(!PAGE_ALIGNED(pa_start));
 550        BUG_ON(!PAGE_ALIGNED(size));
 551
 552        __create_pgd_mapping(pgdp, pa_start, (unsigned long)va_start, size, prot,
 553                             early_pgtable_alloc, flags);
 554
 555        if (!(vm_flags & VM_NO_GUARD))
 556                size += PAGE_SIZE;
 557
 558        vma->addr       = va_start;
 559        vma->phys_addr  = pa_start;
 560        vma->size       = size;
 561        vma->flags      = VM_MAP | vm_flags;
 562        vma->caller     = __builtin_return_address(0);
 563
 564        vm_area_add_early(vma);
 565}
 566
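     /*
      * Parse the "rodata=" kernel command line option. The usual boolean
      * values switch the kernel text/rodata protections on or off; "full"
      * additionally sets rodata_full, which forces page-granular linear-map
      * mappings (see map_mem()) so that linear aliases can be protected too.
      */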
 567static int __init parse_rodata(char *arg)
 568{
 569        int ret = strtobool(arg, &rodata_enabled);
 570        if (!ret) {
 571                rodata_full = false;
 572                return 0;
 573        }
 574
 575        /* permit 'full' in addition to boolean options */
 576        if (strcmp(arg, "full"))
 577                return -EINVAL;
 578
 579        rodata_enabled = true;
 580        rodata_full = true;
 581        return 0;
 582}
 583early_param("rodata", parse_rodata);
 584
 585#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
 586static int __init map_entry_trampoline(void)
 587{
 588        pgprot_t prot = rodata_enabled ? PAGE_KERNEL_ROX : PAGE_KERNEL_EXEC;
 589        phys_addr_t pa_start = __pa_symbol(__entry_tramp_text_start);
 590
 591        /* The trampoline is always mapped and can therefore be global */
 592        pgprot_val(prot) &= ~PTE_NG;
 593
 594        /* Map only the text into the trampoline page table */
 595        memset(tramp_pg_dir, 0, PGD_SIZE);
 596        __create_pgd_mapping(tramp_pg_dir, pa_start, TRAMP_VALIAS, PAGE_SIZE,
 597                             prot, __pgd_pgtable_alloc, 0);
 598
 599        /* Map both the text and data into the kernel page table */
 600        __set_fixmap(FIX_ENTRY_TRAMP_TEXT, pa_start, prot);
 601        if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) {
 602                extern char __entry_tramp_data_start[];
 603
 604                __set_fixmap(FIX_ENTRY_TRAMP_DATA,
 605                             __pa_symbol(__entry_tramp_data_start),
 606                             PAGE_KERNEL_RO);
 607        }
 608
 609        return 0;
 610}
 611core_initcall(map_entry_trampoline);
 612#endif
 613
 614/*
  615 * Open-coded check for BTI, only for use to determine the configuration
  616 * for early mappings before the cpufeature code has run.
 617 */
 618static bool arm64_early_this_cpu_has_bti(void)
 619{
 620        u64 pfr1;
 621
 622        if (!IS_ENABLED(CONFIG_ARM64_BTI_KERNEL))
 623                return false;
 624
 625        pfr1 = read_sysreg_s(SYS_ID_AA64PFR1_EL1);
 626        return cpuid_feature_extract_unsigned_field(pfr1,
 627                                                    ID_AA64PFR1_BT_SHIFT);
 628}
 629
 630/*
 631 * Create fine-grained mappings for the kernel.
 632 */
 633static void __init map_kernel(pgd_t *pgdp)
 634{
 635        static struct vm_struct vmlinux_text, vmlinux_rodata, vmlinux_inittext,
 636                                vmlinux_initdata, vmlinux_data;
 637
 638        /*
 639         * External debuggers may need to write directly to the text
 640         * mapping to install SW breakpoints. Allow this (only) when
 641         * explicitly requested with rodata=off.
 642         */
 643        pgprot_t text_prot = rodata_enabled ? PAGE_KERNEL_ROX : PAGE_KERNEL_EXEC;
 644
 645        /*
 646         * If we have a CPU that supports BTI and a kernel built for
 647         * BTI then mark the kernel executable text as guarded pages
 648         * now so we don't have to rewrite the page tables later.
 649         */
 650        if (arm64_early_this_cpu_has_bti())
 651                text_prot = __pgprot_modify(text_prot, PTE_GP, PTE_GP);
 652
 653        /*
 654         * Only rodata will be remapped with different permissions later on,
 655         * all other segments are allowed to use contiguous mappings.
 656         */
 657        map_kernel_segment(pgdp, _text, _etext, text_prot, &vmlinux_text, 0,
 658                           VM_NO_GUARD);
 659        map_kernel_segment(pgdp, __start_rodata, __inittext_begin, PAGE_KERNEL,
 660                           &vmlinux_rodata, NO_CONT_MAPPINGS, VM_NO_GUARD);
 661        map_kernel_segment(pgdp, __inittext_begin, __inittext_end, text_prot,
 662                           &vmlinux_inittext, 0, VM_NO_GUARD);
 663        map_kernel_segment(pgdp, __initdata_begin, __initdata_end, PAGE_KERNEL,
 664                           &vmlinux_initdata, 0, VM_NO_GUARD);
 665        map_kernel_segment(pgdp, _data, _end, PAGE_KERNEL, &vmlinux_data, 0, 0);
 666
 667        if (!READ_ONCE(pgd_val(*pgd_offset_pgd(pgdp, FIXADDR_START)))) {
 668                /*
 669                 * The fixmap falls in a separate pgd to the kernel, and doesn't
 670                 * live in the carveout for the swapper_pg_dir. We can simply
 671                 * re-use the existing dir for the fixmap.
 672                 */
 673                set_pgd(pgd_offset_pgd(pgdp, FIXADDR_START),
 674                        READ_ONCE(*pgd_offset_k(FIXADDR_START)));
 675        } else if (CONFIG_PGTABLE_LEVELS > 3) {
 676                pgd_t *bm_pgdp;
 677                p4d_t *bm_p4dp;
 678                pud_t *bm_pudp;
 679                /*
 680                 * The fixmap shares its top level pgd entry with the kernel
 681                 * mapping. This can really only occur when we are running
 682                 * with 16k/4 levels, so we can simply reuse the pud level
 683                 * entry instead.
 684                 */
 685                BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES));
 686                bm_pgdp = pgd_offset_pgd(pgdp, FIXADDR_START);
 687                bm_p4dp = p4d_offset(bm_pgdp, FIXADDR_START);
 688                bm_pudp = pud_set_fixmap_offset(bm_p4dp, FIXADDR_START);
 689                pud_populate(&init_mm, bm_pudp, lm_alias(bm_pmd));
 690                pud_clear_fixmap();
 691        } else {
 692                BUG();
 693        }
 694
 695        kasan_copy_shadow(pgdp);
 696}
 697
 698void __init paging_init(void)
 699{
 700        pgd_t *pgdp = pgd_set_fixmap(__pa_symbol(swapper_pg_dir));
 701
 702        map_kernel(pgdp);
 703        map_mem(pgdp);
 704
 705        pgd_clear_fixmap();
 706
 707        cpu_replace_ttbr1(lm_alias(swapper_pg_dir));
 708        init_mm.pgd = swapper_pg_dir;
 709
 710        memblock_free(__pa_symbol(init_pg_dir),
 711                      __pa_symbol(init_pg_end) - __pa_symbol(init_pg_dir));
 712
 713        memblock_allow_resize();
 714}
 715
 716/*
 717 * Check whether a kernel address is valid (derived from arch/x86/).
 718 */
 719int kern_addr_valid(unsigned long addr)
 720{
 721        pgd_t *pgdp;
 722        p4d_t *p4dp;
 723        pud_t *pudp, pud;
 724        pmd_t *pmdp, pmd;
 725        pte_t *ptep, pte;
 726
 727        addr = arch_kasan_reset_tag(addr);
 728        if ((((long)addr) >> VA_BITS) != -1UL)
 729                return 0;
 730
 731        pgdp = pgd_offset_k(addr);
 732        if (pgd_none(READ_ONCE(*pgdp)))
 733                return 0;
 734
 735        p4dp = p4d_offset(pgdp, addr);
 736        if (p4d_none(READ_ONCE(*p4dp)))
 737                return 0;
 738
 739        pudp = pud_offset(p4dp, addr);
 740        pud = READ_ONCE(*pudp);
 741        if (pud_none(pud))
 742                return 0;
 743
 744        if (pud_sect(pud))
 745                return pfn_valid(pud_pfn(pud));
 746
 747        pmdp = pmd_offset(pudp, addr);
 748        pmd = READ_ONCE(*pmdp);
 749        if (pmd_none(pmd))
 750                return 0;
 751
 752        if (pmd_sect(pmd))
 753                return pfn_valid(pmd_pfn(pmd));
 754
 755        ptep = pte_offset_kernel(pmdp, addr);
 756        pte = READ_ONCE(*ptep);
 757        if (pte_none(pte))
 758                return 0;
 759
 760        return pfn_valid(pte_pfn(pte));
 761}
 762
 763#ifdef CONFIG_MEMORY_HOTPLUG
 764static void free_hotplug_page_range(struct page *page, size_t size,
 765                                    struct vmem_altmap *altmap)
 766{
 767        if (altmap) {
 768                vmem_altmap_free(altmap, size >> PAGE_SHIFT);
 769        } else {
 770                WARN_ON(PageReserved(page));
 771                free_pages((unsigned long)page_address(page), get_order(size));
 772        }
 773}
 774
 775static void free_hotplug_pgtable_page(struct page *page)
 776{
 777        free_hotplug_page_range(page, PAGE_SIZE, NULL);
 778}
 779
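     /*
      * Check whether the page-table page covering [start, end) at the level
      * given by @mask can be freed: it must not also be needed for mappings
      * below @floor or above @ceiling.
      */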
 780static bool pgtable_range_aligned(unsigned long start, unsigned long end,
 781                                  unsigned long floor, unsigned long ceiling,
 782                                  unsigned long mask)
 783{
 784        start &= mask;
 785        if (start < floor)
 786                return false;
 787
 788        if (ceiling) {
 789                ceiling &= mask;
 790                if (!ceiling)
 791                        return false;
 792        }
 793
 794        if (end - 1 > ceiling - 1)
 795                return false;
 796        return true;
 797}
 798
 799static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr,
 800                                    unsigned long end, bool free_mapped,
 801                                    struct vmem_altmap *altmap)
 802{
 803        pte_t *ptep, pte;
 804
 805        do {
 806                ptep = pte_offset_kernel(pmdp, addr);
 807                pte = READ_ONCE(*ptep);
 808                if (pte_none(pte))
 809                        continue;
 810
 811                WARN_ON(!pte_present(pte));
 812                pte_clear(&init_mm, addr, ptep);
 813                flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
 814                if (free_mapped)
 815                        free_hotplug_page_range(pte_page(pte),
 816                                                PAGE_SIZE, altmap);
 817        } while (addr += PAGE_SIZE, addr < end);
 818}
 819
 820static void unmap_hotplug_pmd_range(pud_t *pudp, unsigned long addr,
 821                                    unsigned long end, bool free_mapped,
 822                                    struct vmem_altmap *altmap)
 823{
 824        unsigned long next;
 825        pmd_t *pmdp, pmd;
 826
 827        do {
 828                next = pmd_addr_end(addr, end);
 829                pmdp = pmd_offset(pudp, addr);
 830                pmd = READ_ONCE(*pmdp);
 831                if (pmd_none(pmd))
 832                        continue;
 833
 834                WARN_ON(!pmd_present(pmd));
 835                if (pmd_sect(pmd)) {
 836                        pmd_clear(pmdp);
 837
 838                        /*
 839                         * One TLBI should be sufficient here as the PMD_SIZE
 840                         * range is mapped with a single block entry.
 841                         */
 842                        flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
 843                        if (free_mapped)
 844                                free_hotplug_page_range(pmd_page(pmd),
 845                                                        PMD_SIZE, altmap);
 846                        continue;
 847                }
 848                WARN_ON(!pmd_table(pmd));
 849                unmap_hotplug_pte_range(pmdp, addr, next, free_mapped, altmap);
 850        } while (addr = next, addr < end);
 851}
 852
 853static void unmap_hotplug_pud_range(p4d_t *p4dp, unsigned long addr,
 854                                    unsigned long end, bool free_mapped,
 855                                    struct vmem_altmap *altmap)
 856{
 857        unsigned long next;
 858        pud_t *pudp, pud;
 859
 860        do {
 861                next = pud_addr_end(addr, end);
 862                pudp = pud_offset(p4dp, addr);
 863                pud = READ_ONCE(*pudp);
 864                if (pud_none(pud))
 865                        continue;
 866
 867                WARN_ON(!pud_present(pud));
 868                if (pud_sect(pud)) {
 869                        pud_clear(pudp);
 870
 871                        /*
 872                         * One TLBI should be sufficient here as the PUD_SIZE
 873                         * range is mapped with a single block entry.
 874                         */
 875                        flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
 876                        if (free_mapped)
 877                                free_hotplug_page_range(pud_page(pud),
 878                                                        PUD_SIZE, altmap);
 879                        continue;
 880                }
 881                WARN_ON(!pud_table(pud));
 882                unmap_hotplug_pmd_range(pudp, addr, next, free_mapped, altmap);
 883        } while (addr = next, addr < end);
 884}
 885
 886static void unmap_hotplug_p4d_range(pgd_t *pgdp, unsigned long addr,
 887                                    unsigned long end, bool free_mapped,
 888                                    struct vmem_altmap *altmap)
 889{
 890        unsigned long next;
 891        p4d_t *p4dp, p4d;
 892
 893        do {
 894                next = p4d_addr_end(addr, end);
 895                p4dp = p4d_offset(pgdp, addr);
 896                p4d = READ_ONCE(*p4dp);
 897                if (p4d_none(p4d))
 898                        continue;
 899
 900                WARN_ON(!p4d_present(p4d));
 901                unmap_hotplug_pud_range(p4dp, addr, next, free_mapped, altmap);
 902        } while (addr = next, addr < end);
 903}
 904
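     /*
      * Unmap [addr, end) from the kernel page tables, optionally freeing the
      * mapped pages (or returning them to the altmap). The table pages
      * themselves are left in place; free_empty_tables() reclaims them
      * afterwards.
      */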
 905static void unmap_hotplug_range(unsigned long addr, unsigned long end,
 906                                bool free_mapped, struct vmem_altmap *altmap)
 907{
 908        unsigned long next;
 909        pgd_t *pgdp, pgd;
 910
 911        /*
  912         * An altmap can only be used as backing memory for a vmemmap
  913         * mapping. If the mapped memory itself is not being freed, the
  914         * altmap is irrelevant; warn about this inconsistency when it is
  915         * encountered.
 916         */
 917        WARN_ON(!free_mapped && altmap);
 918
 919        do {
 920                next = pgd_addr_end(addr, end);
 921                pgdp = pgd_offset_k(addr);
 922                pgd = READ_ONCE(*pgdp);
 923                if (pgd_none(pgd))
 924                        continue;
 925
 926                WARN_ON(!pgd_present(pgd));
 927                unmap_hotplug_p4d_range(pgdp, addr, next, free_mapped, altmap);
 928        } while (addr = next, addr < end);
 929}
 930
 931static void free_empty_pte_table(pmd_t *pmdp, unsigned long addr,
 932                                 unsigned long end, unsigned long floor,
 933                                 unsigned long ceiling)
 934{
 935        pte_t *ptep, pte;
 936        unsigned long i, start = addr;
 937
 938        do {
 939                ptep = pte_offset_kernel(pmdp, addr);
 940                pte = READ_ONCE(*ptep);
 941
 942                /*
  943                 * Sanity check: the earlier unmap loops should already
  944                 * have cleared these ptes.
 945                 */
 946                WARN_ON(!pte_none(pte));
 947        } while (addr += PAGE_SIZE, addr < end);
 948
 949        if (!pgtable_range_aligned(start, end, floor, ceiling, PMD_MASK))
 950                return;
 951
 952        /*
  953         * Check whether we can free the pte page, i.e. whether the rest
  954         * of its entries are empty as well. Overlap with other regions has
  955         * been handled by the floor/ceiling check.
 956         */
 957        ptep = pte_offset_kernel(pmdp, 0UL);
 958        for (i = 0; i < PTRS_PER_PTE; i++) {
 959                if (!pte_none(READ_ONCE(ptep[i])))
 960                        return;
 961        }
 962
 963        pmd_clear(pmdp);
 964        __flush_tlb_kernel_pgtable(start);
 965        free_hotplug_pgtable_page(virt_to_page(ptep));
 966}
 967
 968static void free_empty_pmd_table(pud_t *pudp, unsigned long addr,
 969                                 unsigned long end, unsigned long floor,
 970                                 unsigned long ceiling)
 971{
 972        pmd_t *pmdp, pmd;
 973        unsigned long i, next, start = addr;
 974
 975        do {
 976                next = pmd_addr_end(addr, end);
 977                pmdp = pmd_offset(pudp, addr);
 978                pmd = READ_ONCE(*pmdp);
 979                if (pmd_none(pmd))
 980                        continue;
 981
 982                WARN_ON(!pmd_present(pmd) || !pmd_table(pmd) || pmd_sect(pmd));
 983                free_empty_pte_table(pmdp, addr, next, floor, ceiling);
 984        } while (addr = next, addr < end);
 985
 986        if (CONFIG_PGTABLE_LEVELS <= 2)
 987                return;
 988
 989        if (!pgtable_range_aligned(start, end, floor, ceiling, PUD_MASK))
 990                return;
 991
 992        /*
  993         * Check whether we can free the pmd page, i.e. whether the rest
  994         * of its entries are empty as well. Overlap with other regions has
  995         * been handled by the floor/ceiling check.
 996         */
 997        pmdp = pmd_offset(pudp, 0UL);
 998        for (i = 0; i < PTRS_PER_PMD; i++) {
 999                if (!pmd_none(READ_ONCE(pmdp[i])))
1000                        return;
1001        }
1002
1003        pud_clear(pudp);
1004        __flush_tlb_kernel_pgtable(start);
1005        free_hotplug_pgtable_page(virt_to_page(pmdp));
1006}
1007
1008static void free_empty_pud_table(p4d_t *p4dp, unsigned long addr,
1009                                 unsigned long end, unsigned long floor,
1010                                 unsigned long ceiling)
1011{
1012        pud_t *pudp, pud;
1013        unsigned long i, next, start = addr;
1014
1015        do {
1016                next = pud_addr_end(addr, end);
1017                pudp = pud_offset(p4dp, addr);
1018                pud = READ_ONCE(*pudp);
1019                if (pud_none(pud))
1020                        continue;
1021
1022                WARN_ON(!pud_present(pud) || !pud_table(pud) || pud_sect(pud));
1023                free_empty_pmd_table(pudp, addr, next, floor, ceiling);
1024        } while (addr = next, addr < end);
1025
1026        if (CONFIG_PGTABLE_LEVELS <= 3)
1027                return;
1028
1029        if (!pgtable_range_aligned(start, end, floor, ceiling, PGDIR_MASK))
1030                return;
1031
1032        /*
 1033         * Check whether we can free the pud page, i.e. whether the rest
 1034         * of its entries are empty as well. Overlap with other regions has
 1035         * been handled by the floor/ceiling check.
1036         */
1037        pudp = pud_offset(p4dp, 0UL);
1038        for (i = 0; i < PTRS_PER_PUD; i++) {
1039                if (!pud_none(READ_ONCE(pudp[i])))
1040                        return;
1041        }
1042
1043        p4d_clear(p4dp);
1044        __flush_tlb_kernel_pgtable(start);
1045        free_hotplug_pgtable_page(virt_to_page(pudp));
1046}
1047
1048static void free_empty_p4d_table(pgd_t *pgdp, unsigned long addr,
1049                                 unsigned long end, unsigned long floor,
1050                                 unsigned long ceiling)
1051{
1052        unsigned long next;
1053        p4d_t *p4dp, p4d;
1054
1055        do {
1056                next = p4d_addr_end(addr, end);
1057                p4dp = p4d_offset(pgdp, addr);
1058                p4d = READ_ONCE(*p4dp);
1059                if (p4d_none(p4d))
1060                        continue;
1061
1062                WARN_ON(!p4d_present(p4d));
1063                free_empty_pud_table(p4dp, addr, next, floor, ceiling);
1064        } while (addr = next, addr < end);
1065}
1066
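     /*
      * Walk [addr, end) and free any page-table pages that have become
      * completely empty and lie entirely within [floor, ceiling).
      */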
1067static void free_empty_tables(unsigned long addr, unsigned long end,
1068                              unsigned long floor, unsigned long ceiling)
1069{
1070        unsigned long next;
1071        pgd_t *pgdp, pgd;
1072
1073        do {
1074                next = pgd_addr_end(addr, end);
1075                pgdp = pgd_offset_k(addr);
1076                pgd = READ_ONCE(*pgdp);
1077                if (pgd_none(pgd))
1078                        continue;
1079
1080                WARN_ON(!pgd_present(pgd));
1081                free_empty_p4d_table(pgdp, addr, next, floor, ceiling);
1082        } while (addr = next, addr < end);
1083}
1084#endif
1085
1086#ifdef CONFIG_SPARSEMEM_VMEMMAP
1087#if !ARM64_SWAPPER_USES_SECTION_MAPS
1088int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
1089                struct vmem_altmap *altmap)
1090{
1091        return vmemmap_populate_basepages(start, end, node, altmap);
1092}
1093#else   /* !ARM64_SWAPPER_USES_SECTION_MAPS */
1094int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
1095                struct vmem_altmap *altmap)
1096{
1097        unsigned long addr = start;
1098        unsigned long next;
1099        pgd_t *pgdp;
1100        p4d_t *p4dp;
1101        pud_t *pudp;
1102        pmd_t *pmdp;
1103
1104        do {
1105                next = pmd_addr_end(addr, end);
1106
1107                pgdp = vmemmap_pgd_populate(addr, node);
1108                if (!pgdp)
1109                        return -ENOMEM;
1110
1111                p4dp = vmemmap_p4d_populate(pgdp, addr, node);
1112                if (!p4dp)
1113                        return -ENOMEM;
1114
1115                pudp = vmemmap_pud_populate(p4dp, addr, node);
1116                if (!pudp)
1117                        return -ENOMEM;
1118
1119                pmdp = pmd_offset(pudp, addr);
1120                if (pmd_none(READ_ONCE(*pmdp))) {
1121                        void *p = NULL;
1122
1123                        p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
1124                        if (!p)
1125                                return -ENOMEM;
1126
1127                        pmd_set_huge(pmdp, __pa(p), __pgprot(PROT_SECT_NORMAL));
1128                } else
1129                        vmemmap_verify((pte_t *)pmdp, node, addr, next);
1130        } while (addr = next, addr != end);
1131
1132        return 0;
1133}
1134#endif  /* !ARM64_SWAPPER_USES_SECTION_MAPS */
1135void vmemmap_free(unsigned long start, unsigned long end,
1136                struct vmem_altmap *altmap)
1137{
1138#ifdef CONFIG_MEMORY_HOTPLUG
1139        WARN_ON((start < VMEMMAP_START) || (end > VMEMMAP_END));
1140
1141        unmap_hotplug_range(start, end, true, altmap);
1142        free_empty_tables(start, end, VMEMMAP_START, VMEMMAP_END);
1143#endif
1144}
1145#endif  /* CONFIG_SPARSEMEM_VMEMMAP */
1146
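     /*
      * The fixmap helpers below resolve table entries for fixmap addresses
      * via the kernel-image aliases (p*d_offset_kimg) and the statically
      * allocated bm_pte table, so they work before the linear map is up.
      */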
1147static inline pud_t * fixmap_pud(unsigned long addr)
1148{
1149        pgd_t *pgdp = pgd_offset_k(addr);
1150        p4d_t *p4dp = p4d_offset(pgdp, addr);
1151        p4d_t p4d = READ_ONCE(*p4dp);
1152
1153        BUG_ON(p4d_none(p4d) || p4d_bad(p4d));
1154
1155        return pud_offset_kimg(p4dp, addr);
1156}
1157
1158static inline pmd_t * fixmap_pmd(unsigned long addr)
1159{
1160        pud_t *pudp = fixmap_pud(addr);
1161        pud_t pud = READ_ONCE(*pudp);
1162
1163        BUG_ON(pud_none(pud) || pud_bad(pud));
1164
1165        return pmd_offset_kimg(pudp, addr);
1166}
1167
1168static inline pte_t * fixmap_pte(unsigned long addr)
1169{
1170        return &bm_pte[pte_index(addr)];
1171}
1172
1173/*
1174 * The p*d_populate functions call virt_to_phys implicitly so they can't be used
1175 * directly on kernel symbols (bm_p*d). This function is called too early to use
1176 * lm_alias so __p*d_populate functions must be used to populate with the
1177 * physical address from __pa_symbol.
1178 */
1179void __init early_fixmap_init(void)
1180{
1181        pgd_t *pgdp;
1182        p4d_t *p4dp, p4d;
1183        pud_t *pudp;
1184        pmd_t *pmdp;
1185        unsigned long addr = FIXADDR_START;
1186
1187        pgdp = pgd_offset_k(addr);
1188        p4dp = p4d_offset(pgdp, addr);
1189        p4d = READ_ONCE(*p4dp);
1190        if (CONFIG_PGTABLE_LEVELS > 3 &&
1191            !(p4d_none(p4d) || p4d_page_paddr(p4d) == __pa_symbol(bm_pud))) {
1192                /*
1193                 * We only end up here if the kernel mapping and the fixmap
1194                 * share the top level pgd entry, which should only happen on
1195                 * 16k/4 levels configurations.
1196                 */
1197                BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES));
1198                pudp = pud_offset_kimg(p4dp, addr);
1199        } else {
1200                if (p4d_none(p4d))
1201                        __p4d_populate(p4dp, __pa_symbol(bm_pud), PUD_TYPE_TABLE);
1202                pudp = fixmap_pud(addr);
1203        }
1204        if (pud_none(READ_ONCE(*pudp)))
1205                __pud_populate(pudp, __pa_symbol(bm_pmd), PMD_TYPE_TABLE);
1206        pmdp = fixmap_pmd(addr);
1207        __pmd_populate(pmdp, __pa_symbol(bm_pte), PMD_TYPE_TABLE);
1208
1209        /*
 1210         * We are not prepared for the boot-ioremap range to span
 1211         * multiple pmds:
1212         */
1213        BUILD_BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT)
1214                     != (__fix_to_virt(FIX_BTMAP_END) >> PMD_SHIFT));
1215
1216        if ((pmdp != fixmap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)))
1217             || pmdp != fixmap_pmd(fix_to_virt(FIX_BTMAP_END))) {
1218                WARN_ON(1);
1219                pr_warn("pmdp %p != %p, %p\n",
1220                        pmdp, fixmap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)),
1221                        fixmap_pmd(fix_to_virt(FIX_BTMAP_END)));
1222                pr_warn("fix_to_virt(FIX_BTMAP_BEGIN): %08lx\n",
1223                        fix_to_virt(FIX_BTMAP_BEGIN));
1224                pr_warn("fix_to_virt(FIX_BTMAP_END):   %08lx\n",
1225                        fix_to_virt(FIX_BTMAP_END));
1226
1227                pr_warn("FIX_BTMAP_END:       %d\n", FIX_BTMAP_END);
1228                pr_warn("FIX_BTMAP_BEGIN:     %d\n", FIX_BTMAP_BEGIN);
1229        }
1230}
1231
1232/*
1233 * Unusually, this is also called in IRQ context (ghes_iounmap_irq) so if we
1234 * ever need to use IPIs for TLB broadcasting, then we're in trouble here.
1235 */
1236void __set_fixmap(enum fixed_addresses idx,
1237                               phys_addr_t phys, pgprot_t flags)
1238{
1239        unsigned long addr = __fix_to_virt(idx);
1240        pte_t *ptep;
1241
1242        BUG_ON(idx <= FIX_HOLE || idx >= __end_of_fixed_addresses);
1243
1244        ptep = fixmap_pte(addr);
1245
1246        if (pgprot_val(flags)) {
1247                set_pte(ptep, pfn_pte(phys >> PAGE_SHIFT, flags));
1248        } else {
1249                pte_clear(&init_mm, addr, ptep);
1250                flush_tlb_kernel_range(addr, addr+PAGE_SIZE);
1251        }
1252}
1253
1254void *__init fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot)
1255{
1256        const u64 dt_virt_base = __fix_to_virt(FIX_FDT);
1257        int offset;
1258        void *dt_virt;
1259
1260        /*
1261         * Check whether the physical FDT address is set and meets the minimum
 1262         * alignment requirement. We rely on MIN_FDT_ALIGN being at least
 1263         * 8 bytes so that the magic and size fields of the FDT header are
 1264         * always accessible after mapping the first chunk; double-check
 1265         * here that this is indeed the case.
1266         */
1267        BUILD_BUG_ON(MIN_FDT_ALIGN < 8);
1268        if (!dt_phys || dt_phys % MIN_FDT_ALIGN)
1269                return NULL;
1270
1271        /*
1272         * Make sure that the FDT region can be mapped without the need to
1273         * allocate additional translation table pages, so that it is safe
1274         * to call create_mapping_noalloc() this early.
1275         *
1276         * On 64k pages, the FDT will be mapped using PTEs, so we need to
1277         * be in the same PMD as the rest of the fixmap.
1278         * On 4k pages, we'll use section mappings for the FDT so we only
1279         * have to be in the same PUD.
1280         */
1281        BUILD_BUG_ON(dt_virt_base % SZ_2M);
1282
1283        BUILD_BUG_ON(__fix_to_virt(FIX_FDT_END) >> SWAPPER_TABLE_SHIFT !=
1284                     __fix_to_virt(FIX_BTMAP_BEGIN) >> SWAPPER_TABLE_SHIFT);
1285
1286        offset = dt_phys % SWAPPER_BLOCK_SIZE;
1287        dt_virt = (void *)dt_virt_base + offset;
1288
1289        /* map the first chunk so we can read the size from the header */
1290        create_mapping_noalloc(round_down(dt_phys, SWAPPER_BLOCK_SIZE),
1291                        dt_virt_base, SWAPPER_BLOCK_SIZE, prot);
1292
1293        if (fdt_magic(dt_virt) != FDT_MAGIC)
1294                return NULL;
1295
1296        *size = fdt_totalsize(dt_virt);
1297        if (*size > MAX_FDT_SIZE)
1298                return NULL;
1299
1300        if (offset + *size > SWAPPER_BLOCK_SIZE)
1301                create_mapping_noalloc(round_down(dt_phys, SWAPPER_BLOCK_SIZE), dt_virt_base,
1302                               round_up(offset + *size, SWAPPER_BLOCK_SIZE), prot);
1303
1304        return dt_virt;
1305}
1306
1307int __init arch_ioremap_p4d_supported(void)
1308{
1309        return 0;
1310}
1311
1312int __init arch_ioremap_pud_supported(void)
1313{
1314        /*
1315         * Only 4k granule supports level 1 block mappings.
1316         * SW table walks can't handle removal of intermediate entries.
1317         */
1318        return IS_ENABLED(CONFIG_ARM64_4K_PAGES) &&
1319               !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS);
1320}
1321
1322int __init arch_ioremap_pmd_supported(void)
1323{
1324        /* See arch_ioremap_pud_supported() */
1325        return !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS);
1326}
1327
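     /*
      * pud_set_huge()/pmd_set_huge() install level 1/2 block mappings. To
      * avoid break-before-make on a live entry, the update is refused unless
      * pgattr_change_is_safe() allows it (i.e. only permission changes).
      */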
1328int pud_set_huge(pud_t *pudp, phys_addr_t phys, pgprot_t prot)
1329{
1330        pud_t new_pud = pfn_pud(__phys_to_pfn(phys), mk_pud_sect_prot(prot));
1331
1332        /* Only allow permission changes for now */
1333        if (!pgattr_change_is_safe(READ_ONCE(pud_val(*pudp)),
1334                                   pud_val(new_pud)))
1335                return 0;
1336
1337        VM_BUG_ON(phys & ~PUD_MASK);
1338        set_pud(pudp, new_pud);
1339        return 1;
1340}
1341
1342int pmd_set_huge(pmd_t *pmdp, phys_addr_t phys, pgprot_t prot)
1343{
1344        pmd_t new_pmd = pfn_pmd(__phys_to_pfn(phys), mk_pmd_sect_prot(prot));
1345
1346        /* Only allow permission changes for now */
1347        if (!pgattr_change_is_safe(READ_ONCE(pmd_val(*pmdp)),
1348                                   pmd_val(new_pmd)))
1349                return 0;
1350
1351        VM_BUG_ON(phys & ~PMD_MASK);
1352        set_pmd(pmdp, new_pmd);
1353        return 1;
1354}
1355
1356int pud_clear_huge(pud_t *pudp)
1357{
1358        if (!pud_sect(READ_ONCE(*pudp)))
1359                return 0;
1360        pud_clear(pudp);
1361        return 1;
1362}
1363
1364int pmd_clear_huge(pmd_t *pmdp)
1365{
1366        if (!pmd_sect(READ_ONCE(*pmdp)))
1367                return 0;
1368        pmd_clear(pmdp);
1369        return 1;
1370}
1371
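     /*
      * pmd_free_pte_page()/pud_free_pmd_page() unhook and free the next-level
      * table below an entry so that a block mapping can be installed in its
      * place (used by the huge-ioremap/vmap code).
      */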
1372int pmd_free_pte_page(pmd_t *pmdp, unsigned long addr)
1373{
1374        pte_t *table;
1375        pmd_t pmd;
1376
1377        pmd = READ_ONCE(*pmdp);
1378
1379        if (!pmd_table(pmd)) {
1380                VM_WARN_ON(1);
1381                return 1;
1382        }
1383
1384        table = pte_offset_kernel(pmdp, addr);
1385        pmd_clear(pmdp);
1386        __flush_tlb_kernel_pgtable(addr);
1387        pte_free_kernel(NULL, table);
1388        return 1;
1389}
1390
1391int pud_free_pmd_page(pud_t *pudp, unsigned long addr)
1392{
1393        pmd_t *table;
1394        pmd_t *pmdp;
1395        pud_t pud;
1396        unsigned long next, end;
1397
1398        pud = READ_ONCE(*pudp);
1399
1400        if (!pud_table(pud)) {
1401                VM_WARN_ON(1);
1402                return 1;
1403        }
1404
1405        table = pmd_offset(pudp, addr);
1406        pmdp = table;
1407        next = addr;
1408        end = addr + PUD_SIZE;
1409        do {
1410                pmd_free_pte_page(pmdp, next);
1411        } while (pmdp++, next += PMD_SIZE, next != end);
1412
1413        pud_clear(pudp);
1414        __flush_tlb_kernel_pgtable(addr);
1415        pmd_free(NULL, table);
1416        return 1;
1417}
1418
1419int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
1420{
1421        return 0;       /* Don't attempt a block mapping */
1422}
1423
1424#ifdef CONFIG_MEMORY_HOTPLUG
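     /*
      * Tear down the linear mapping of a hot-removed range. The memory itself
      * is handled by the memory hotplug core, so only the mappings and any
      * now-empty table pages are freed here.
      */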
1425static void __remove_pgd_mapping(pgd_t *pgdir, unsigned long start, u64 size)
1426{
1427        unsigned long end = start + size;
1428
1429        WARN_ON(pgdir != init_mm.pgd);
1430        WARN_ON((start < PAGE_OFFSET) || (end > PAGE_END));
1431
1432        unmap_hotplug_range(start, end, false, NULL);
1433        free_empty_tables(start, end, PAGE_OFFSET, PAGE_END);
1434}
1435
1436int arch_add_memory(int nid, u64 start, u64 size,
1437                    struct mhp_params *params)
1438{
1439        int ret, flags = 0;
1440
1441        if (rodata_full || debug_pagealloc_enabled())
1442                flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
1443
1444        __create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start),
1445                             size, params->pgprot, __pgd_pgtable_alloc,
1446                             flags);
1447
1448        memblock_clear_nomap(start, size);
1449
1450        ret = __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
1451                           params);
1452        if (ret)
1453                __remove_pgd_mapping(swapper_pg_dir,
1454                                     __phys_to_virt(start), size);
1455        return ret;
1456}
1457
1458void arch_remove_memory(int nid, u64 start, u64 size,
1459                        struct vmem_altmap *altmap)
1460{
1461        unsigned long start_pfn = start >> PAGE_SHIFT;
1462        unsigned long nr_pages = size >> PAGE_SHIFT;
1463
1464        __remove_pages(start_pfn, nr_pages, altmap);
1465        __remove_pgd_mapping(swapper_pg_dir, __phys_to_virt(start), size);
1466}
1467
1468/*
1469 * This memory hotplug notifier helps prevent boot memory from being
 1470 * inadvertently removed: it blocks the pfn range offlining process in
 1471 * __offline_pages() and hence prevents both offlining and removal of
 1472 * boot memory, which is always online initially. If and when boot
 1473 * memory becomes removable in the future, this notifier should be
 1474 * dropped and free_hotplug_page_range() should handle any reserved
 1475 * pages allocated during boot.
1476 */
1477static int prevent_bootmem_remove_notifier(struct notifier_block *nb,
1478                                           unsigned long action, void *data)
1479{
1480        struct mem_section *ms;
1481        struct memory_notify *arg = data;
1482        unsigned long end_pfn = arg->start_pfn + arg->nr_pages;
1483        unsigned long pfn = arg->start_pfn;
1484
1485        if (action != MEM_GOING_OFFLINE)
1486                return NOTIFY_OK;
1487
1488        for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
1489                ms = __pfn_to_section(pfn);
1490                if (early_section(ms))
1491                        return NOTIFY_BAD;
1492        }
1493        return NOTIFY_OK;
1494}
1495
1496static struct notifier_block prevent_bootmem_remove_nb = {
1497        .notifier_call = prevent_bootmem_remove_notifier,
1498};
1499
1500static int __init prevent_bootmem_remove_init(void)
1501{
1502        return register_memory_notifier(&prevent_bootmem_remove_nb);
1503}
1504device_initcall(prevent_bootmem_remove_init);
1505#endif
1506