linux/arch/arm64/mm/mmu.c
   1/*
   2 * Based on arch/arm/mm/mmu.c
   3 *
   4 * Copyright (C) 1995-2005 Russell King
   5 * Copyright (C) 2012 ARM Ltd.
   6 *
   7 * This program is free software; you can redistribute it and/or modify
   8 * it under the terms of the GNU General Public License version 2 as
   9 * published by the Free Software Foundation.
  10 *
  11 * This program is distributed in the hope that it will be useful,
  12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 * GNU General Public License for more details.
  15 *
  16 * You should have received a copy of the GNU General Public License
  17 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  18 */
  19
  20#include <linux/cache.h>
  21#include <linux/export.h>
  22#include <linux/kernel.h>
  23#include <linux/errno.h>
  24#include <linux/init.h>
  25#include <linux/ioport.h>
  26#include <linux/kexec.h>
  27#include <linux/libfdt.h>
  28#include <linux/mman.h>
  29#include <linux/nodemask.h>
  30#include <linux/memblock.h>
  31#include <linux/memory.h>
  32#include <linux/fs.h>
  33#include <linux/io.h>
  34#include <linux/mm.h>
  35#include <linux/vmalloc.h>
  36
  37#include <asm/barrier.h>
  38#include <asm/cputype.h>
  39#include <asm/fixmap.h>
  40#include <asm/kasan.h>
  41#include <asm/kernel-pgtable.h>
  42#include <asm/sections.h>
  43#include <asm/setup.h>
  44#include <asm/sizes.h>
  45#include <asm/tlb.h>
  46#include <asm/memblock.h>
  47#include <asm/mmu_context.h>
  48#include <asm/ptdump.h>
  49#include <asm/tlbflush.h>
  50
  51#define NO_BLOCK_MAPPINGS       BIT(0)
  52#define NO_CONT_MAPPINGS        BIT(1)
  53
  54u64 idmap_t0sz = TCR_T0SZ(VA_BITS_MIN);
  55u64 idmap_ptrs_per_pgd = PTRS_PER_PGD;
  56
  57u64 __section(".mmuoff.data.write") vabits_actual;
  58EXPORT_SYMBOL(vabits_actual);
  59
  60u64 kimage_voffset __ro_after_init;
  61EXPORT_SYMBOL(kimage_voffset);
  62
  63/*
  64 * Empty_zero_page is a special page that is used for zero-initialized data
  65 * and COW.
  66 */
  67unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss;
  68EXPORT_SYMBOL(empty_zero_page);
  69
  70static pte_t bm_pte[PTRS_PER_PTE] __page_aligned_bss;
  71static pmd_t bm_pmd[PTRS_PER_PMD] __page_aligned_bss __maybe_unused;
  72static pud_t bm_pud[PTRS_PER_PUD] __page_aligned_bss __maybe_unused;
  73
  74static DEFINE_SPINLOCK(swapper_pgdir_lock);
  75
  76void set_swapper_pgd(pgd_t *pgdp, pgd_t pgd)
  77{
  78        pgd_t *fixmap_pgdp;
  79
  80        spin_lock(&swapper_pgdir_lock);
  81        fixmap_pgdp = pgd_set_fixmap(__pa_symbol(pgdp));
  82        WRITE_ONCE(*fixmap_pgdp, pgd);
  83        /*
  84         * We need dsb(ishst) here to ensure the page-table-walker sees
  85         * our new entry before set_p?d() returns. The fixmap's
  86         * flush_tlb_kernel_range() via clear_fixmap() does this for us.
  87         */
  88        pgd_clear_fixmap();
  89        spin_unlock(&swapper_pgdir_lock);
  90}
  91
  92pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
  93                              unsigned long size, pgprot_t vma_prot)
  94{
  95        if (!pfn_valid(pfn))
  96                return pgprot_noncached(vma_prot);
  97        else if (file->f_flags & O_SYNC)
  98                return pgprot_writecombine(vma_prot);
  99        return vma_prot;
 100}
 101EXPORT_SYMBOL(phys_mem_access_prot);
 102
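/*
 * Allocate a zeroed page from memblock for use as an early page table.
 * The page is cleared through the FIX_PTE fixmap slot (see below) rather
 * than through its linear map address, which may not be available yet.
 */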
 103static phys_addr_t __init early_pgtable_alloc(int shift)
 104{
 105        phys_addr_t phys;
 106        void *ptr;
 107
 108        phys = memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE);
 109
 110        /*
 111         * The FIX_{PGD,PUD,PMD} slots may be in active use, but the FIX_PTE
 112         * slot will be free, so we can (ab)use the FIX_PTE slot to initialise
 113         * any level of table.
 114         */
 115        ptr = pte_set_fixmap(phys);
 116
 117        memset(ptr, 0, PAGE_SIZE);
 118
 119        /*
 120         * Implicit barriers also ensure the zeroed page is visible to the page
 121         * table walker
 122         */
 123        pte_clear_fixmap();
 124
 125        return phys;
 126}
 127
 128static bool pgattr_change_is_safe(u64 old, u64 new)
 129{
 130        /*
 131         * The following mapping attributes may be updated in live
 132         * kernel mappings without the need for break-before-make.
 133         */
 134        static const pteval_t mask = PTE_PXN | PTE_RDONLY | PTE_WRITE | PTE_NG;
 135
 136        /* creating or taking down mappings is always safe */
 137        if (old == 0 || new == 0)
 138                return true;
 139
 140        /* live contiguous mappings may not be manipulated at all */
 141        if ((old | new) & PTE_CONT)
 142                return false;
 143
 144        /* Transitioning from Non-Global to Global is unsafe */
 145        if (old & ~new & PTE_NG)
 146                return false;
 147
 148        return ((old ^ new) & ~mask) == 0;
 149}
 150
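/*
 * Fill in the PTE level for [addr, end), mapping it to @phys with @prot.
 * The PTE table is accessed via the fixmap, and existing entries may only
 * be changed in ways that pgattr_change_is_safe() permits.
 */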
 151static void init_pte(pmd_t *pmdp, unsigned long addr, unsigned long end,
 152                     phys_addr_t phys, pgprot_t prot)
 153{
 154        pte_t *ptep;
 155
 156        ptep = pte_set_fixmap_offset(pmdp, addr);
 157        do {
 158                pte_t old_pte = READ_ONCE(*ptep);
 159
 160                set_pte(ptep, pfn_pte(__phys_to_pfn(phys), prot));
 161
 162                /*
 163                 * After the PTE entry has been populated once, we
 164                 * only allow updates to the permission attributes.
 165                 */
 166                BUG_ON(!pgattr_change_is_safe(pte_val(old_pte),
 167                                              READ_ONCE(pte_val(*ptep))));
 168
 169                phys += PAGE_SIZE;
 170        } while (ptep++, addr += PAGE_SIZE, addr != end);
 171
 172        pte_clear_fixmap();
 173}
 174
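/*
 * Set up PTE mappings for [addr, end), allocating the PTE table first if
 * the PMD entry is still empty. Sub-ranges aligned to CONT_PTE_SIZE get
 * PTE_CONT unless NO_CONT_MAPPINGS is set.
 */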
 175static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
 176                                unsigned long end, phys_addr_t phys,
 177                                pgprot_t prot,
 178                                phys_addr_t (*pgtable_alloc)(int),
 179                                int flags)
 180{
 181        unsigned long next;
 182        pmd_t pmd = READ_ONCE(*pmdp);
 183
 184        BUG_ON(pmd_sect(pmd));
 185        if (pmd_none(pmd)) {
 186                phys_addr_t pte_phys;
 187                BUG_ON(!pgtable_alloc);
 188                pte_phys = pgtable_alloc(PAGE_SHIFT);
 189                __pmd_populate(pmdp, pte_phys, PMD_TYPE_TABLE);
 190                pmd = READ_ONCE(*pmdp);
 191        }
 192        BUG_ON(pmd_bad(pmd));
 193
 194        do {
 195                pgprot_t __prot = prot;
 196
 197                next = pte_cont_addr_end(addr, end);
 198
 199                /* use a contiguous mapping if the range is suitably aligned */
 200                if ((((addr | next | phys) & ~CONT_PTE_MASK) == 0) &&
 201                    (flags & NO_CONT_MAPPINGS) == 0)
 202                        __prot = __pgprot(pgprot_val(prot) | PTE_CONT);
 203
 204                init_pte(pmdp, addr, next, phys, __prot);
 205
 206                phys += next - addr;
 207        } while (addr = next, addr != end);
 208}
 209
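/*
 * Fill in the PMD level for [addr, end): use section (block) mappings
 * where the range and @phys are section aligned and NO_BLOCK_MAPPINGS is
 * not set, otherwise descend to the PTE level.
 */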
 210static void init_pmd(pud_t *pudp, unsigned long addr, unsigned long end,
 211                     phys_addr_t phys, pgprot_t prot,
 212                     phys_addr_t (*pgtable_alloc)(int), int flags)
 213{
 214        unsigned long next;
 215        pmd_t *pmdp;
 216
 217        pmdp = pmd_set_fixmap_offset(pudp, addr);
 218        do {
 219                pmd_t old_pmd = READ_ONCE(*pmdp);
 220
 221                next = pmd_addr_end(addr, end);
 222
 223                /* try section mapping first */
 224                if (((addr | next | phys) & ~SECTION_MASK) == 0 &&
 225                    (flags & NO_BLOCK_MAPPINGS) == 0) {
 226                        pmd_set_huge(pmdp, phys, prot);
 227
 228                        /*
 229                         * After the PMD entry has been populated once, we
 230                         * only allow updates to the permission attributes.
 231                         */
 232                        BUG_ON(!pgattr_change_is_safe(pmd_val(old_pmd),
 233                                                      READ_ONCE(pmd_val(*pmdp))));
 234                } else {
 235                        alloc_init_cont_pte(pmdp, addr, next, phys, prot,
 236                                            pgtable_alloc, flags);
 237
 238                        BUG_ON(pmd_val(old_pmd) != 0 &&
 239                               pmd_val(old_pmd) != READ_ONCE(pmd_val(*pmdp)));
 240                }
 241                phys += next - addr;
 242        } while (pmdp++, addr = next, addr != end);
 243
 244        pmd_clear_fixmap();
 245}
 246
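/*
 * As alloc_init_cont_pte(), but one level up: allocate the PMD table if
 * needed and apply PTE_CONT to sub-ranges aligned to CONT_PMD_SIZE.
 */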
 247static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
 248                                unsigned long end, phys_addr_t phys,
 249                                pgprot_t prot,
 250                                phys_addr_t (*pgtable_alloc)(int), int flags)
 251{
 252        unsigned long next;
 253        pud_t pud = READ_ONCE(*pudp);
 254
 255        /*
 256         * Check for initial section mappings in the pgd/pud.
 257         */
 258        BUG_ON(pud_sect(pud));
 259        if (pud_none(pud)) {
 260                phys_addr_t pmd_phys;
 261                BUG_ON(!pgtable_alloc);
 262                pmd_phys = pgtable_alloc(PMD_SHIFT);
 263                __pud_populate(pudp, pmd_phys, PUD_TYPE_TABLE);
 264                pud = READ_ONCE(*pudp);
 265        }
 266        BUG_ON(pud_bad(pud));
 267
 268        do {
 269                pgprot_t __prot = prot;
 270
 271                next = pmd_cont_addr_end(addr, end);
 272
 273                /* use a contiguous mapping if the range is suitably aligned */
 274                if ((((addr | next | phys) & ~CONT_PMD_MASK) == 0) &&
 275                    (flags & NO_CONT_MAPPINGS) == 0)
 276                        __prot = __pgprot(pgprot_val(prot) | PTE_CONT);
 277
 278                init_pmd(pudp, addr, next, phys, __prot, pgtable_alloc, flags);
 279
 280                phys += next - addr;
 281        } while (addr = next, addr != end);
 282}
 283
 284static inline bool use_1G_block(unsigned long addr, unsigned long next,
 285                        unsigned long phys)
 286{
 287        if (PAGE_SHIFT != 12)
 288                return false;
 289
 290        if (((addr | next | phys) & ~PUD_MASK) != 0)
 291                return false;
 292
 293        return true;
 294}
 295
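/*
 * Fill in the PUD level for [addr, end), using 1GB block mappings where
 * use_1G_block() and the flags allow, and descending to the PMD level
 * otherwise.
 */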
 296static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long end,
 297                           phys_addr_t phys, pgprot_t prot,
 298                           phys_addr_t (*pgtable_alloc)(int),
 299                           int flags)
 300{
 301        unsigned long next;
 302        pud_t *pudp;
 303        pgd_t pgd = READ_ONCE(*pgdp);
 304
 305        if (pgd_none(pgd)) {
 306                phys_addr_t pud_phys;
 307                BUG_ON(!pgtable_alloc);
 308                pud_phys = pgtable_alloc(PUD_SHIFT);
 309                __pgd_populate(pgdp, pud_phys, PUD_TYPE_TABLE);
 310                pgd = READ_ONCE(*pgdp);
 311        }
 312        BUG_ON(pgd_bad(pgd));
 313
 314        pudp = pud_set_fixmap_offset(pgdp, addr);
 315        do {
 316                pud_t old_pud = READ_ONCE(*pudp);
 317
 318                next = pud_addr_end(addr, end);
 319
 320                /*
 321                 * For 4K granule only, attempt to put down a 1GB block
 322                 */
 323                if (use_1G_block(addr, next, phys) &&
 324                    (flags & NO_BLOCK_MAPPINGS) == 0) {
 325                        pud_set_huge(pudp, phys, prot);
 326
 327                        /*
 328                         * After the PUD entry has been populated once, we
 329                         * only allow updates to the permission attributes.
 330                         */
 331                        BUG_ON(!pgattr_change_is_safe(pud_val(old_pud),
 332                                                      READ_ONCE(pud_val(*pudp))));
 333                } else {
 334                        alloc_init_cont_pmd(pudp, addr, next, phys, prot,
 335                                            pgtable_alloc, flags);
 336
 337                        BUG_ON(pud_val(old_pud) != 0 &&
 338                               pud_val(old_pud) != READ_ONCE(pud_val(*pudp)));
 339                }
 340                phys += next - addr;
 341        } while (pudp++, addr = next, addr != end);
 342
 343        pud_clear_fixmap();
 344}
 345
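/*
 * Map [virt, virt + size) to @phys in @pgdir with @prot, creating any
 * intermediate tables with @pgtable_alloc. @flags (NO_BLOCK_MAPPINGS,
 * NO_CONT_MAPPINGS) restricts the mapping types used at lower levels.
 */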
 346static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
 347                                 unsigned long virt, phys_addr_t size,
 348                                 pgprot_t prot,
 349                                 phys_addr_t (*pgtable_alloc)(int),
 350                                 int flags)
 351{
 352        unsigned long addr, length, end, next;
 353        pgd_t *pgdp = pgd_offset_raw(pgdir, virt);
 354
 355        /*
 356         * If the virtual and physical address don't have the same offset
 357         * within a page, we cannot map the region as the caller expects.
 358         */
 359        if (WARN_ON((phys ^ virt) & ~PAGE_MASK))
 360                return;
 361
 362        phys &= PAGE_MASK;
 363        addr = virt & PAGE_MASK;
 364        length = PAGE_ALIGN(size + (virt & ~PAGE_MASK));
 365
 366        end = addr + length;
 367        do {
 368                next = pgd_addr_end(addr, end);
 369                alloc_init_pud(pgdp, addr, next, phys, prot, pgtable_alloc,
 370                               flags);
 371                phys += next - addr;
 372        } while (pgdp++, addr = next, addr != end);
 373}
 374
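/*
 * Page table allocator for use once the buddy allocator is up. The dsb()
 * ensures the zeroed page is visible to the table walker before it is
 * linked into the page tables.
 */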
 375static phys_addr_t __pgd_pgtable_alloc(int shift)
 376{
 377        void *ptr = (void *)__get_free_page(PGALLOC_GFP);
 378        BUG_ON(!ptr);
 379
 380        /* Ensure the zeroed page is visible to the page table walker */
 381        dsb(ishst);
 382        return __pa(ptr);
 383}
 384
 385static phys_addr_t pgd_pgtable_alloc(int shift)
 386{
 387        phys_addr_t pa = __pgd_pgtable_alloc(shift);
 388
 389        /*
 390         * Call proper page table ctor in case later we need to
 391         * call core mm functions like apply_to_page_range() on
 392         * this pre-allocated page table.
 393         *
 394         * We don't select ARCH_ENABLE_SPLIT_PMD_PTLOCK if pmd is
  395         * folded, and if so pgtable_pmd_page_ctor() becomes a nop.
 396         */
 397        if (shift == PAGE_SHIFT)
 398                BUG_ON(!pgtable_page_ctor(phys_to_page(pa)));
 399        else if (shift == PMD_SHIFT)
 400                BUG_ON(!pgtable_pmd_page_ctor(phys_to_page(pa)));
 401
 402        return pa;
 403}
 404
 405/*
 406 * This function can only be used to modify existing table entries,
 407 * without allocating new levels of table. Note that this permits the
 408 * creation of new section or page entries.
 409 */
 410static void __init create_mapping_noalloc(phys_addr_t phys, unsigned long virt,
 411                                  phys_addr_t size, pgprot_t prot)
 412{
 413        if ((virt >= PAGE_END) && (virt < VMALLOC_START)) {
 414                pr_warn("BUG: not creating mapping for %pa at 0x%016lx - outside kernel range\n",
 415                        &phys, virt);
 416                return;
 417        }
 418        __create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL,
 419                             NO_CONT_MAPPINGS);
 420}
 421
 422void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
 423                               unsigned long virt, phys_addr_t size,
 424                               pgprot_t prot, bool page_mappings_only)
 425{
 426        int flags = 0;
 427
 428        BUG_ON(mm == &init_mm);
 429
 430        if (page_mappings_only)
 431                flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
 432
 433        __create_pgd_mapping(mm->pgd, phys, virt, size, prot,
 434                             pgd_pgtable_alloc, flags);
 435}
 436
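/*
 * Change the attributes of an existing kernel mapping. No new tables are
 * allocated, and the TLBs are flushed afterwards since the mapping being
 * updated may be live.
 */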
 437static void update_mapping_prot(phys_addr_t phys, unsigned long virt,
 438                                phys_addr_t size, pgprot_t prot)
 439{
 440        if ((virt >= PAGE_END) && (virt < VMALLOC_START)) {
 441                pr_warn("BUG: not updating mapping for %pa at 0x%016lx - outside kernel range\n",
 442                        &phys, virt);
 443                return;
 444        }
 445
 446        __create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL,
 447                             NO_CONT_MAPPINGS);
 448
 449        /* flush the TLBs after updating live kernel mappings */
 450        flush_tlb_kernel_range(virt, virt + size);
 451}
 452
 453static void __init __map_memblock(pgd_t *pgdp, phys_addr_t start,
 454                                  phys_addr_t end, pgprot_t prot, int flags)
 455{
 456        __create_pgd_mapping(pgdp, start, __phys_to_virt(start), end - start,
 457                             prot, early_pgtable_alloc, flags);
 458}
 459
 460void __init mark_linear_text_alias_ro(void)
 461{
 462        /*
 463         * Remove the write permissions from the linear alias of .text/.rodata
 464         */
 465        update_mapping_prot(__pa_symbol(_text), (unsigned long)lm_alias(_text),
 466                            (unsigned long)__init_begin - (unsigned long)_text,
 467                            PAGE_KERNEL_RO);
 468}
 469
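/*
 * Create the linear mapping for all memblock memory. The linear alias of
 * the kernel image and the crash kernel region are temporarily marked
 * NOMAP and mapped separately with restricted mapping types, as explained
 * in the comments below.
 */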
 470static void __init map_mem(pgd_t *pgdp)
 471{
 472        phys_addr_t kernel_start = __pa_symbol(_text);
 473        phys_addr_t kernel_end = __pa_symbol(__init_begin);
 474        struct memblock_region *reg;
 475        int flags = 0;
 476
 477        if (rodata_full || debug_pagealloc_enabled())
 478                flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
 479
 480        /*
 481         * Take care not to create a writable alias for the
 482         * read-only text and rodata sections of the kernel image.
 483         * So temporarily mark them as NOMAP to skip mappings in
 484         * the following for-loop
 485         */
 486        memblock_mark_nomap(kernel_start, kernel_end - kernel_start);
 487#ifdef CONFIG_KEXEC_CORE
 488        if (crashk_res.end)
 489                memblock_mark_nomap(crashk_res.start,
 490                                    resource_size(&crashk_res));
 491#endif
 492
 493        /* map all the memory banks */
 494        for_each_memblock(memory, reg) {
 495                phys_addr_t start = reg->base;
 496                phys_addr_t end = start + reg->size;
 497
 498                if (start >= end)
 499                        break;
 500                if (memblock_is_nomap(reg))
 501                        continue;
 502
 503                __map_memblock(pgdp, start, end, PAGE_KERNEL, flags);
 504        }
 505
 506        /*
 507         * Map the linear alias of the [_text, __init_begin) interval
 508         * as non-executable now, and remove the write permission in
  509         * mark_linear_text_alias_ro() above (which will be called after
 510         * alternative patching has completed). This makes the contents
 511         * of the region accessible to subsystems such as hibernate,
 512         * but protects it from inadvertent modification or execution.
 513         * Note that contiguous mappings cannot be remapped in this way,
 514         * so we should avoid them here.
 515         */
 516        __map_memblock(pgdp, kernel_start, kernel_end,
 517                       PAGE_KERNEL, NO_CONT_MAPPINGS);
 518        memblock_clear_nomap(kernel_start, kernel_end - kernel_start);
 519
 520#ifdef CONFIG_KEXEC_CORE
 521        /*
 522         * Use page-level mappings here so that we can shrink the region
 523         * in page granularity and put back unused memory to buddy system
 524         * through /sys/kernel/kexec_crash_size interface.
 525         */
 526        if (crashk_res.end) {
 527                __map_memblock(pgdp, crashk_res.start, crashk_res.end + 1,
 528                               PAGE_KERNEL,
 529                               NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS);
 530                memblock_clear_nomap(crashk_res.start,
 531                                     resource_size(&crashk_res));
 532        }
 533#endif
 534}
 535
 536void mark_rodata_ro(void)
 537{
 538        unsigned long section_size;
 539
 540        /*
 541         * mark .rodata as read only. Use __init_begin rather than __end_rodata
 542         * to cover NOTES and EXCEPTION_TABLE.
 543         */
 544        section_size = (unsigned long)__init_begin - (unsigned long)__start_rodata;
 545        update_mapping_prot(__pa_symbol(__start_rodata), (unsigned long)__start_rodata,
 546                            section_size, PAGE_KERNEL_RO);
 547
 548        debug_checkwx();
 549}
 550
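/*
 * Map one kernel image segment with the given protection and register a
 * struct vm_struct for it via vm_area_add_early() so the range is
 * reserved in the vmalloc region.
 */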
 551static void __init map_kernel_segment(pgd_t *pgdp, void *va_start, void *va_end,
 552                                      pgprot_t prot, struct vm_struct *vma,
 553                                      int flags, unsigned long vm_flags)
 554{
 555        phys_addr_t pa_start = __pa_symbol(va_start);
 556        unsigned long size = va_end - va_start;
 557
 558        BUG_ON(!PAGE_ALIGNED(pa_start));
 559        BUG_ON(!PAGE_ALIGNED(size));
 560
 561        __create_pgd_mapping(pgdp, pa_start, (unsigned long)va_start, size, prot,
 562                             early_pgtable_alloc, flags);
 563
 564        if (!(vm_flags & VM_NO_GUARD))
 565                size += PAGE_SIZE;
 566
 567        vma->addr       = va_start;
 568        vma->phys_addr  = pa_start;
 569        vma->size       = size;
 570        vma->flags      = VM_MAP | vm_flags;
 571        vma->caller     = __builtin_return_address(0);
 572
 573        vm_area_add_early(vma);
 574}
 575
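/*
 * Parse the "rodata=" boot parameter. The usual boolean values control
 * rodata_enabled; "full" additionally sets rodata_full, which forces
 * page-granular linear map mappings (see map_mem()).
 */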
 576static int __init parse_rodata(char *arg)
 577{
 578        int ret = strtobool(arg, &rodata_enabled);
 579        if (!ret) {
 580                rodata_full = false;
 581                return 0;
 582        }
 583
 584        /* permit 'full' in addition to boolean options */
 585        if (strcmp(arg, "full"))
 586                return -EINVAL;
 587
 588        rodata_enabled = true;
 589        rodata_full = true;
 590        return 0;
 591}
 592early_param("rodata", parse_rodata);
 593
 594#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
 595static int __init map_entry_trampoline(void)
 596{
 597        pgprot_t prot = rodata_enabled ? PAGE_KERNEL_ROX : PAGE_KERNEL_EXEC;
 598        phys_addr_t pa_start = __pa_symbol(__entry_tramp_text_start);
 599
 600        /* The trampoline is always mapped and can therefore be global */
 601        pgprot_val(prot) &= ~PTE_NG;
 602
 603        /* Map only the text into the trampoline page table */
 604        memset(tramp_pg_dir, 0, PGD_SIZE);
 605        __create_pgd_mapping(tramp_pg_dir, pa_start, TRAMP_VALIAS, PAGE_SIZE,
 606                             prot, __pgd_pgtable_alloc, 0);
 607
 608        /* Map both the text and data into the kernel page table */
 609        __set_fixmap(FIX_ENTRY_TRAMP_TEXT, pa_start, prot);
 610        if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) {
 611                extern char __entry_tramp_data_start[];
 612
 613                __set_fixmap(FIX_ENTRY_TRAMP_DATA,
 614                             __pa_symbol(__entry_tramp_data_start),
 615                             PAGE_KERNEL_RO);
 616        }
 617
 618        return 0;
 619}
 620core_initcall(map_entry_trampoline);
 621#endif
 622
 623/*
 624 * Create fine-grained mappings for the kernel.
 625 */
 626static void __init map_kernel(pgd_t *pgdp)
 627{
 628        static struct vm_struct vmlinux_text, vmlinux_rodata, vmlinux_inittext,
 629                                vmlinux_initdata, vmlinux_data;
 630
 631        /*
 632         * External debuggers may need to write directly to the text
 633         * mapping to install SW breakpoints. Allow this (only) when
 634         * explicitly requested with rodata=off.
 635         */
 636        pgprot_t text_prot = rodata_enabled ? PAGE_KERNEL_ROX : PAGE_KERNEL_EXEC;
 637
 638        /*
 639         * Only rodata will be remapped with different permissions later on,
 640         * all other segments are allowed to use contiguous mappings.
 641         */
 642        map_kernel_segment(pgdp, _text, _etext, text_prot, &vmlinux_text, 0,
 643                           VM_NO_GUARD);
 644        map_kernel_segment(pgdp, __start_rodata, __inittext_begin, PAGE_KERNEL,
 645                           &vmlinux_rodata, NO_CONT_MAPPINGS, VM_NO_GUARD);
 646        map_kernel_segment(pgdp, __inittext_begin, __inittext_end, text_prot,
 647                           &vmlinux_inittext, 0, VM_NO_GUARD);
 648        map_kernel_segment(pgdp, __initdata_begin, __initdata_end, PAGE_KERNEL,
 649                           &vmlinux_initdata, 0, VM_NO_GUARD);
 650        map_kernel_segment(pgdp, _data, _end, PAGE_KERNEL, &vmlinux_data, 0, 0);
 651
 652        if (!READ_ONCE(pgd_val(*pgd_offset_raw(pgdp, FIXADDR_START)))) {
 653                /*
 654                 * The fixmap falls in a separate pgd to the kernel, and doesn't
 655                 * live in the carveout for the swapper_pg_dir. We can simply
 656                 * re-use the existing dir for the fixmap.
 657                 */
 658                set_pgd(pgd_offset_raw(pgdp, FIXADDR_START),
 659                        READ_ONCE(*pgd_offset_k(FIXADDR_START)));
 660        } else if (CONFIG_PGTABLE_LEVELS > 3) {
 661                pgd_t *bm_pgdp;
 662                pud_t *bm_pudp;
 663                /*
 664                 * The fixmap shares its top level pgd entry with the kernel
 665                 * mapping. This can really only occur when we are running
 666                 * with 16k/4 levels, so we can simply reuse the pud level
 667                 * entry instead.
 668                 */
 669                BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES));
 670                bm_pgdp = pgd_offset_raw(pgdp, FIXADDR_START);
 671                bm_pudp = pud_set_fixmap_offset(bm_pgdp, FIXADDR_START);
 672                pud_populate(&init_mm, bm_pudp, lm_alias(bm_pmd));
 673                pud_clear_fixmap();
 674        } else {
 675                BUG();
 676        }
 677
 678        kasan_copy_shadow(pgdp);
 679}
 680
 681/*
 682 * paging_init() sets up the page tables, initialises the zone memory
 683 * maps and sets up the zero page.
 684 */
 685void __init paging_init(void)
 686{
 687        pgd_t *pgdp = pgd_set_fixmap(__pa_symbol(swapper_pg_dir));
 688
 689        map_kernel(pgdp);
 690        map_mem(pgdp);
 691
 692        pgd_clear_fixmap();
 693
 694        cpu_replace_ttbr1(lm_alias(swapper_pg_dir));
 695        init_mm.pgd = swapper_pg_dir;
 696
 697        memblock_free(__pa_symbol(init_pg_dir),
 698                      __pa_symbol(init_pg_end) - __pa_symbol(init_pg_dir));
 699}
 700
 701/*
 702 * Check whether a kernel address is valid (derived from arch/x86/).
 703 */
 704int kern_addr_valid(unsigned long addr)
 705{
 706        pgd_t *pgdp;
 707        pud_t *pudp, pud;
 708        pmd_t *pmdp, pmd;
 709        pte_t *ptep, pte;
 710
 711        if ((((long)addr) >> VA_BITS) != -1UL)
 712                return 0;
 713
 714        pgdp = pgd_offset_k(addr);
 715        if (pgd_none(READ_ONCE(*pgdp)))
 716                return 0;
 717
 718        pudp = pud_offset(pgdp, addr);
 719        pud = READ_ONCE(*pudp);
 720        if (pud_none(pud))
 721                return 0;
 722
 723        if (pud_sect(pud))
 724                return pfn_valid(pud_pfn(pud));
 725
 726        pmdp = pmd_offset(pudp, addr);
 727        pmd = READ_ONCE(*pmdp);
 728        if (pmd_none(pmd))
 729                return 0;
 730
 731        if (pmd_sect(pmd))
 732                return pfn_valid(pmd_pfn(pmd));
 733
 734        ptep = pte_offset_kernel(pmdp, addr);
 735        pte = READ_ONCE(*ptep);
 736        if (pte_none(pte))
 737                return 0;
 738
 739        return pfn_valid(pte_pfn(pte));
 740}
 741
 742#ifdef CONFIG_MEMORY_HOTPLUG
 743static void free_hotplug_page_range(struct page *page, size_t size)
 744{
 745        WARN_ON(PageReserved(page));
 746        free_pages((unsigned long)page_address(page), get_order(size));
 747}
 748
 749static void free_hotplug_pgtable_page(struct page *page)
 750{
 751        free_hotplug_page_range(page, PAGE_SIZE);
 752}
 753
 754static bool pgtable_range_aligned(unsigned long start, unsigned long end,
 755                                  unsigned long floor, unsigned long ceiling,
 756                                  unsigned long mask)
 757{
 758        start &= mask;
 759        if (start < floor)
 760                return false;
 761
 762        if (ceiling) {
 763                ceiling &= mask;
 764                if (!ceiling)
 765                        return false;
 766        }
 767
 768        if (end - 1 > ceiling - 1)
 769                return false;
 770        return true;
 771}
 772
 773static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr,
 774                                    unsigned long end, bool free_mapped)
 775{
 776        pte_t *ptep, pte;
 777
 778        do {
 779                ptep = pte_offset_kernel(pmdp, addr);
 780                pte = READ_ONCE(*ptep);
 781                if (pte_none(pte))
 782                        continue;
 783
 784                WARN_ON(!pte_present(pte));
 785                pte_clear(&init_mm, addr, ptep);
 786                flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
 787                if (free_mapped)
 788                        free_hotplug_page_range(pte_page(pte), PAGE_SIZE);
 789        } while (addr += PAGE_SIZE, addr < end);
 790}
 791
 792static void unmap_hotplug_pmd_range(pud_t *pudp, unsigned long addr,
 793                                    unsigned long end, bool free_mapped)
 794{
 795        unsigned long next;
 796        pmd_t *pmdp, pmd;
 797
 798        do {
 799                next = pmd_addr_end(addr, end);
 800                pmdp = pmd_offset(pudp, addr);
 801                pmd = READ_ONCE(*pmdp);
 802                if (pmd_none(pmd))
 803                        continue;
 804
 805                WARN_ON(!pmd_present(pmd));
 806                if (pmd_sect(pmd)) {
 807                        pmd_clear(pmdp);
 808
 809                        /*
 810                         * One TLBI should be sufficient here as the PMD_SIZE
 811                         * range is mapped with a single block entry.
 812                         */
 813                        flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
 814                        if (free_mapped)
 815                                free_hotplug_page_range(pmd_page(pmd),
 816                                                        PMD_SIZE);
 817                        continue;
 818                }
 819                WARN_ON(!pmd_table(pmd));
 820                unmap_hotplug_pte_range(pmdp, addr, next, free_mapped);
 821        } while (addr = next, addr < end);
 822}
 823
 824static void unmap_hotplug_pud_range(p4d_t *p4dp, unsigned long addr,
 825                                    unsigned long end, bool free_mapped)
 826{
 827        unsigned long next;
 828        pud_t *pudp, pud;
 829
 830        do {
 831                next = pud_addr_end(addr, end);
 832                pudp = pud_offset(p4dp, addr);
 833                pud = READ_ONCE(*pudp);
 834                if (pud_none(pud))
 835                        continue;
 836
 837                WARN_ON(!pud_present(pud));
 838                if (pud_sect(pud)) {
 839                        pud_clear(pudp);
 840
 841                        /*
 842                         * One TLBI should be sufficient here as the PUD_SIZE
 843                         * range is mapped with a single block entry.
 844                         */
 845                        flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
 846                        if (free_mapped)
 847                                free_hotplug_page_range(pud_page(pud),
 848                                                        PUD_SIZE);
 849                        continue;
 850                }
 851                WARN_ON(!pud_table(pud));
 852                unmap_hotplug_pmd_range(pudp, addr, next, free_mapped);
 853        } while (addr = next, addr < end);
 854}
 855
 856static void unmap_hotplug_p4d_range(pgd_t *pgdp, unsigned long addr,
 857                                    unsigned long end, bool free_mapped)
 858{
 859        unsigned long next;
 860        p4d_t *p4dp, p4d;
 861
 862        do {
 863                next = p4d_addr_end(addr, end);
 864                p4dp = p4d_offset(pgdp, addr);
 865                p4d = READ_ONCE(*p4dp);
 866                if (p4d_none(p4d))
 867                        continue;
 868
 869                WARN_ON(!p4d_present(p4d));
 870                unmap_hotplug_pud_range(p4dp, addr, next, free_mapped);
 871        } while (addr = next, addr < end);
 872}
 873
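/*
 * Tear down the kernel mappings for [addr, end). When @free_mapped is
 * true the backing pages are freed as well; the page table pages
 * themselves are only reclaimed later by free_empty_tables().
 */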
 874static void unmap_hotplug_range(unsigned long addr, unsigned long end,
 875                                bool free_mapped)
 876{
 877        unsigned long next;
 878        pgd_t *pgdp, pgd;
 879
 880        do {
 881                next = pgd_addr_end(addr, end);
 882                pgdp = pgd_offset_k(addr);
 883                pgd = READ_ONCE(*pgdp);
 884                if (pgd_none(pgd))
 885                        continue;
 886
 887                WARN_ON(!pgd_present(pgd));
 888                unmap_hotplug_p4d_range(pgdp, addr, next, free_mapped);
 889        } while (addr = next, addr < end);
 890}
 891
 892static void free_empty_pte_table(pmd_t *pmdp, unsigned long addr,
 893                                 unsigned long end, unsigned long floor,
 894                                 unsigned long ceiling)
 895{
 896        pte_t *ptep, pte;
 897        unsigned long i, start = addr;
 898
 899        do {
 900                ptep = pte_offset_kernel(pmdp, addr);
 901                pte = READ_ONCE(*ptep);
 902
 903                /*
 904                 * This is just a sanity check here which verifies that
 905                 * pte clearing has been done by earlier unmap loops.
 906                 */
 907                WARN_ON(!pte_none(pte));
 908        } while (addr += PAGE_SIZE, addr < end);
 909
 910        if (!pgtable_range_aligned(start, end, floor, ceiling, PMD_MASK))
 911                return;
 912
 913        /*
 914         * Check whether we can free the pte page if the rest of the
  915         * entries are empty. Overlaps with other regions have been
 916         * handled by the floor/ceiling check.
 917         */
 918        ptep = pte_offset_kernel(pmdp, 0UL);
 919        for (i = 0; i < PTRS_PER_PTE; i++) {
 920                if (!pte_none(READ_ONCE(ptep[i])))
 921                        return;
 922        }
 923
 924        pmd_clear(pmdp);
 925        __flush_tlb_kernel_pgtable(start);
 926        free_hotplug_pgtable_page(virt_to_page(ptep));
 927}
 928
 929static void free_empty_pmd_table(pud_t *pudp, unsigned long addr,
 930                                 unsigned long end, unsigned long floor,
 931                                 unsigned long ceiling)
 932{
 933        pmd_t *pmdp, pmd;
 934        unsigned long i, next, start = addr;
 935
 936        do {
 937                next = pmd_addr_end(addr, end);
 938                pmdp = pmd_offset(pudp, addr);
 939                pmd = READ_ONCE(*pmdp);
 940                if (pmd_none(pmd))
 941                        continue;
 942
 943                WARN_ON(!pmd_present(pmd) || !pmd_table(pmd) || pmd_sect(pmd));
 944                free_empty_pte_table(pmdp, addr, next, floor, ceiling);
 945        } while (addr = next, addr < end);
 946
 947        if (CONFIG_PGTABLE_LEVELS <= 2)
 948                return;
 949
 950        if (!pgtable_range_aligned(start, end, floor, ceiling, PUD_MASK))
 951                return;
 952
 953        /*
 954         * Check whether we can free the pmd page if the rest of the
  955         * entries are empty. Overlaps with other regions have been
 956         * handled by the floor/ceiling check.
 957         */
 958        pmdp = pmd_offset(pudp, 0UL);
 959        for (i = 0; i < PTRS_PER_PMD; i++) {
 960                if (!pmd_none(READ_ONCE(pmdp[i])))
 961                        return;
 962        }
 963
 964        pud_clear(pudp);
 965        __flush_tlb_kernel_pgtable(start);
 966        free_hotplug_pgtable_page(virt_to_page(pmdp));
 967}
 968
 969static void free_empty_pud_table(p4d_t *p4dp, unsigned long addr,
 970                                 unsigned long end, unsigned long floor,
 971                                 unsigned long ceiling)
 972{
 973        pud_t *pudp, pud;
 974        unsigned long i, next, start = addr;
 975
 976        do {
 977                next = pud_addr_end(addr, end);
 978                pudp = pud_offset(p4dp, addr);
 979                pud = READ_ONCE(*pudp);
 980                if (pud_none(pud))
 981                        continue;
 982
 983                WARN_ON(!pud_present(pud) || !pud_table(pud) || pud_sect(pud));
 984                free_empty_pmd_table(pudp, addr, next, floor, ceiling);
 985        } while (addr = next, addr < end);
 986
 987        if (CONFIG_PGTABLE_LEVELS <= 3)
 988                return;
 989
 990        if (!pgtable_range_aligned(start, end, floor, ceiling, PGDIR_MASK))
 991                return;
 992
 993        /*
 994         * Check whether we can free the pud page if the rest of the
  995         * entries are empty. Overlaps with other regions have been
 996         * handled by the floor/ceiling check.
 997         */
 998        pudp = pud_offset(p4dp, 0UL);
 999        for (i = 0; i < PTRS_PER_PUD; i++) {
1000                if (!pud_none(READ_ONCE(pudp[i])))
1001                        return;
1002        }
1003
1004        p4d_clear(p4dp);
1005        __flush_tlb_kernel_pgtable(start);
1006        free_hotplug_pgtable_page(virt_to_page(pudp));
1007}
1008
1009static void free_empty_p4d_table(pgd_t *pgdp, unsigned long addr,
1010                                 unsigned long end, unsigned long floor,
1011                                 unsigned long ceiling)
1012{
1013        unsigned long next;
1014        p4d_t *p4dp, p4d;
1015
1016        do {
1017                next = p4d_addr_end(addr, end);
1018                p4dp = p4d_offset(pgdp, addr);
1019                p4d = READ_ONCE(*p4dp);
1020                if (p4d_none(p4d))
1021                        continue;
1022
1023                WARN_ON(!p4d_present(p4d));
1024                free_empty_pud_table(p4dp, addr, next, floor, ceiling);
1025        } while (addr = next, addr < end);
1026}
1027
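/*
 * Walk [addr, end) and free page table pages that are now completely
 * empty, constrained by @floor/@ceiling so that tables shared with
 * neighbouring regions are left in place.
 */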
1028static void free_empty_tables(unsigned long addr, unsigned long end,
1029                              unsigned long floor, unsigned long ceiling)
1030{
1031        unsigned long next;
1032        pgd_t *pgdp, pgd;
1033
1034        do {
1035                next = pgd_addr_end(addr, end);
1036                pgdp = pgd_offset_k(addr);
1037                pgd = READ_ONCE(*pgdp);
1038                if (pgd_none(pgd))
1039                        continue;
1040
1041                WARN_ON(!pgd_present(pgd));
1042                free_empty_p4d_table(pgdp, addr, next, floor, ceiling);
1043        } while (addr = next, addr < end);
1044}
1045#endif
1046
1047#ifdef CONFIG_SPARSEMEM_VMEMMAP
1048#if !ARM64_SWAPPER_USES_SECTION_MAPS
1049int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
1050                struct vmem_altmap *altmap)
1051{
1052        return vmemmap_populate_basepages(start, end, node);
1053}
1054#else   /* !ARM64_SWAPPER_USES_SECTION_MAPS */
1055int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
1056                struct vmem_altmap *altmap)
1057{
1058        unsigned long addr = start;
1059        unsigned long next;
1060        pgd_t *pgdp;
1061        pud_t *pudp;
1062        pmd_t *pmdp;
1063
1064        do {
1065                next = pmd_addr_end(addr, end);
1066
1067                pgdp = vmemmap_pgd_populate(addr, node);
1068                if (!pgdp)
1069                        return -ENOMEM;
1070
1071                pudp = vmemmap_pud_populate(pgdp, addr, node);
1072                if (!pudp)
1073                        return -ENOMEM;
1074
1075                pmdp = pmd_offset(pudp, addr);
1076                if (pmd_none(READ_ONCE(*pmdp))) {
1077                        void *p = NULL;
1078
1079                        p = vmemmap_alloc_block_buf(PMD_SIZE, node);
1080                        if (!p)
1081                                return -ENOMEM;
1082
1083                        pmd_set_huge(pmdp, __pa(p), __pgprot(PROT_SECT_NORMAL));
1084                } else
1085                        vmemmap_verify((pte_t *)pmdp, node, addr, next);
1086        } while (addr = next, addr != end);
1087
1088        return 0;
1089}
 1090#endif  /* !ARM64_SWAPPER_USES_SECTION_MAPS */
1091void vmemmap_free(unsigned long start, unsigned long end,
1092                struct vmem_altmap *altmap)
1093{
1094#ifdef CONFIG_MEMORY_HOTPLUG
1095        WARN_ON((start < VMEMMAP_START) || (end > VMEMMAP_END));
1096
1097        unmap_hotplug_range(start, end, true);
1098        free_empty_tables(start, end, VMEMMAP_START, VMEMMAP_END);
1099#endif
1100}
1101#endif  /* CONFIG_SPARSEMEM_VMEMMAP */
1102
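/*
 * Walk down to the fixmap's page table entries. These helpers use the
 * kernel-image (kimg) accessors and the static bm_pte table directly, so
 * they work from early_fixmap_init() onwards.
 */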
1103static inline pud_t * fixmap_pud(unsigned long addr)
1104{
1105        pgd_t *pgdp = pgd_offset_k(addr);
1106        pgd_t pgd = READ_ONCE(*pgdp);
1107
1108        BUG_ON(pgd_none(pgd) || pgd_bad(pgd));
1109
1110        return pud_offset_kimg(pgdp, addr);
1111}
1112
1113static inline pmd_t * fixmap_pmd(unsigned long addr)
1114{
1115        pud_t *pudp = fixmap_pud(addr);
1116        pud_t pud = READ_ONCE(*pudp);
1117
1118        BUG_ON(pud_none(pud) || pud_bad(pud));
1119
1120        return pmd_offset_kimg(pudp, addr);
1121}
1122
1123static inline pte_t * fixmap_pte(unsigned long addr)
1124{
1125        return &bm_pte[pte_index(addr)];
1126}
1127
1128/*
1129 * The p*d_populate functions call virt_to_phys implicitly so they can't be used
1130 * directly on kernel symbols (bm_p*d). This function is called too early to use
1131 * lm_alias so __p*d_populate functions must be used to populate with the
1132 * physical address from __pa_symbol.
1133 */
1134void __init early_fixmap_init(void)
1135{
1136        pgd_t *pgdp, pgd;
1137        pud_t *pudp;
1138        pmd_t *pmdp;
1139        unsigned long addr = FIXADDR_START;
1140
1141        pgdp = pgd_offset_k(addr);
1142        pgd = READ_ONCE(*pgdp);
1143        if (CONFIG_PGTABLE_LEVELS > 3 &&
1144            !(pgd_none(pgd) || pgd_page_paddr(pgd) == __pa_symbol(bm_pud))) {
1145                /*
1146                 * We only end up here if the kernel mapping and the fixmap
1147                 * share the top level pgd entry, which should only happen on
1148                 * 16k/4 levels configurations.
1149                 */
1150                BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES));
1151                pudp = pud_offset_kimg(pgdp, addr);
1152        } else {
1153                if (pgd_none(pgd))
1154                        __pgd_populate(pgdp, __pa_symbol(bm_pud), PUD_TYPE_TABLE);
1155                pudp = fixmap_pud(addr);
1156        }
1157        if (pud_none(READ_ONCE(*pudp)))
1158                __pud_populate(pudp, __pa_symbol(bm_pmd), PMD_TYPE_TABLE);
1159        pmdp = fixmap_pmd(addr);
1160        __pmd_populate(pmdp, __pa_symbol(bm_pte), PMD_TYPE_TABLE);
1161
1162        /*
1163         * The boot-ioremap range spans multiple pmds, for which
1164         * we are not prepared:
1165         */
1166        BUILD_BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT)
1167                     != (__fix_to_virt(FIX_BTMAP_END) >> PMD_SHIFT));
1168
1169        if ((pmdp != fixmap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)))
1170             || pmdp != fixmap_pmd(fix_to_virt(FIX_BTMAP_END))) {
1171                WARN_ON(1);
1172                pr_warn("pmdp %p != %p, %p\n",
1173                        pmdp, fixmap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)),
1174                        fixmap_pmd(fix_to_virt(FIX_BTMAP_END)));
1175                pr_warn("fix_to_virt(FIX_BTMAP_BEGIN): %08lx\n",
1176                        fix_to_virt(FIX_BTMAP_BEGIN));
1177                pr_warn("fix_to_virt(FIX_BTMAP_END):   %08lx\n",
1178                        fix_to_virt(FIX_BTMAP_END));
1179
1180                pr_warn("FIX_BTMAP_END:       %d\n", FIX_BTMAP_END);
1181                pr_warn("FIX_BTMAP_BEGIN:     %d\n", FIX_BTMAP_BEGIN);
1182        }
1183}
1184
1185/*
1186 * Unusually, this is also called in IRQ context (ghes_iounmap_irq) so if we
1187 * ever need to use IPIs for TLB broadcasting, then we're in trouble here.
1188 */
1189void __set_fixmap(enum fixed_addresses idx,
1190                               phys_addr_t phys, pgprot_t flags)
1191{
1192        unsigned long addr = __fix_to_virt(idx);
1193        pte_t *ptep;
1194
1195        BUG_ON(idx <= FIX_HOLE || idx >= __end_of_fixed_addresses);
1196
1197        ptep = fixmap_pte(addr);
1198
1199        if (pgprot_val(flags)) {
1200                set_pte(ptep, pfn_pte(phys >> PAGE_SHIFT, flags));
1201        } else {
1202                pte_clear(&init_mm, addr, ptep);
1203                flush_tlb_kernel_range(addr, addr+PAGE_SIZE);
1204        }
1205}
1206
1207void *__init __fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot)
1208{
1209        const u64 dt_virt_base = __fix_to_virt(FIX_FDT);
1210        int offset;
1211        void *dt_virt;
1212
1213        /*
1214         * Check whether the physical FDT address is set and meets the minimum
 1215         * alignment requirement. We rely on MIN_FDT_ALIGN being at least 8
 1216         * bytes so that we can always access the magic and size fields of the
 1217         * FDT header after mapping the first chunk; double check here that this
 1218         * is indeed the case.
1219         */
1220        BUILD_BUG_ON(MIN_FDT_ALIGN < 8);
1221        if (!dt_phys || dt_phys % MIN_FDT_ALIGN)
1222                return NULL;
1223
1224        /*
1225         * Make sure that the FDT region can be mapped without the need to
1226         * allocate additional translation table pages, so that it is safe
1227         * to call create_mapping_noalloc() this early.
1228         *
1229         * On 64k pages, the FDT will be mapped using PTEs, so we need to
1230         * be in the same PMD as the rest of the fixmap.
1231         * On 4k pages, we'll use section mappings for the FDT so we only
1232         * have to be in the same PUD.
1233         */
1234        BUILD_BUG_ON(dt_virt_base % SZ_2M);
1235
1236        BUILD_BUG_ON(__fix_to_virt(FIX_FDT_END) >> SWAPPER_TABLE_SHIFT !=
1237                     __fix_to_virt(FIX_BTMAP_BEGIN) >> SWAPPER_TABLE_SHIFT);
1238
1239        offset = dt_phys % SWAPPER_BLOCK_SIZE;
1240        dt_virt = (void *)dt_virt_base + offset;
1241
1242        /* map the first chunk so we can read the size from the header */
1243        create_mapping_noalloc(round_down(dt_phys, SWAPPER_BLOCK_SIZE),
1244                        dt_virt_base, SWAPPER_BLOCK_SIZE, prot);
1245
1246        if (fdt_magic(dt_virt) != FDT_MAGIC)
1247                return NULL;
1248
1249        *size = fdt_totalsize(dt_virt);
1250        if (*size > MAX_FDT_SIZE)
1251                return NULL;
1252
1253        if (offset + *size > SWAPPER_BLOCK_SIZE)
1254                create_mapping_noalloc(round_down(dt_phys, SWAPPER_BLOCK_SIZE), dt_virt_base,
1255                               round_up(offset + *size, SWAPPER_BLOCK_SIZE), prot);
1256
1257        return dt_virt;
1258}
1259
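/*
 * Map the FDT read-only through the fixmap and reserve its memory in
 * memblock so it is not handed out to other users.
 */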
1260void *__init fixmap_remap_fdt(phys_addr_t dt_phys)
1261{
1262        void *dt_virt;
1263        int size;
1264
1265        dt_virt = __fixmap_remap_fdt(dt_phys, &size, PAGE_KERNEL_RO);
1266        if (!dt_virt)
1267                return NULL;
1268
1269        memblock_reserve(dt_phys, size);
1270        return dt_virt;
1271}
1272
1273int __init arch_ioremap_pud_supported(void)
1274{
1275        /*
1276         * Only 4k granule supports level 1 block mappings.
1277         * SW table walks can't handle removal of intermediate entries.
1278         */
1279        return IS_ENABLED(CONFIG_ARM64_4K_PAGES) &&
1280               !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS);
1281}
1282
1283int __init arch_ioremap_pmd_supported(void)
1284{
1285        /* See arch_ioremap_pud_supported() */
1286        return !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS);
1287}
1288
1289int pud_set_huge(pud_t *pudp, phys_addr_t phys, pgprot_t prot)
1290{
1291        pgprot_t sect_prot = __pgprot(PUD_TYPE_SECT |
1292                                        pgprot_val(mk_sect_prot(prot)));
1293        pud_t new_pud = pfn_pud(__phys_to_pfn(phys), sect_prot);
1294
1295        /* Only allow permission changes for now */
1296        if (!pgattr_change_is_safe(READ_ONCE(pud_val(*pudp)),
1297                                   pud_val(new_pud)))
1298                return 0;
1299
1300        BUG_ON(phys & ~PUD_MASK);
1301        set_pud(pudp, new_pud);
1302        return 1;
1303}
1304
1305int pmd_set_huge(pmd_t *pmdp, phys_addr_t phys, pgprot_t prot)
1306{
1307        pgprot_t sect_prot = __pgprot(PMD_TYPE_SECT |
1308                                        pgprot_val(mk_sect_prot(prot)));
1309        pmd_t new_pmd = pfn_pmd(__phys_to_pfn(phys), sect_prot);
1310
1311        /* Only allow permission changes for now */
1312        if (!pgattr_change_is_safe(READ_ONCE(pmd_val(*pmdp)),
1313                                   pmd_val(new_pmd)))
1314                return 0;
1315
1316        BUG_ON(phys & ~PMD_MASK);
1317        set_pmd(pmdp, new_pmd);
1318        return 1;
1319}
1320
1321int pud_clear_huge(pud_t *pudp)
1322{
1323        if (!pud_sect(READ_ONCE(*pudp)))
1324                return 0;
1325        pud_clear(pudp);
1326        return 1;
1327}
1328
1329int pmd_clear_huge(pmd_t *pmdp)
1330{
1331        if (!pmd_sect(READ_ONCE(*pmdp)))
1332                return 0;
1333        pmd_clear(pmdp);
1334        return 1;
1335}
1336
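/*
 * pmd_free_pte_page()/pud_free_pmd_page() clear a table entry, flush it
 * from the TLB and free the next-level table it pointed to, so that the
 * slot can later be filled with a block mapping instead.
 */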
1337int pmd_free_pte_page(pmd_t *pmdp, unsigned long addr)
1338{
1339        pte_t *table;
1340        pmd_t pmd;
1341
1342        pmd = READ_ONCE(*pmdp);
1343
1344        if (!pmd_present(pmd))
1345                return 1;
1346        if (!pmd_table(pmd)) {
1347                VM_WARN_ON(!pmd_table(pmd));
1348                return 1;
1349        }
1350
1351        table = pte_offset_kernel(pmdp, addr);
1352        pmd_clear(pmdp);
1353        __flush_tlb_kernel_pgtable(addr);
1354        pte_free_kernel(NULL, table);
1355        return 1;
1356}
1357
1358int pud_free_pmd_page(pud_t *pudp, unsigned long addr)
1359{
1360        pmd_t *table;
1361        pmd_t *pmdp;
1362        pud_t pud;
1363        unsigned long next, end;
1364
1365        pud = READ_ONCE(*pudp);
1366
1367        if (!pud_present(pud))
1368                return 1;
1369        if (!pud_table(pud)) {
1370                VM_WARN_ON(!pud_table(pud));
1371                return 1;
1372        }
1373
1374        table = pmd_offset(pudp, addr);
1375        pmdp = table;
1376        next = addr;
1377        end = addr + PUD_SIZE;
1378        do {
1379                pmd_free_pte_page(pmdp, next);
1380        } while (pmdp++, next += PMD_SIZE, next != end);
1381
1382        pud_clear(pudp);
1383        __flush_tlb_kernel_pgtable(addr);
1384        pmd_free(NULL, table);
1385        return 1;
1386}
1387
1388#ifdef CONFIG_MEMORY_HOTPLUG
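/*
 * Undo a linear mapping created for hotplugged memory: unmap the range
 * without freeing the backing pages (free_mapped == false) and release
 * any page table pages that are now empty.
 */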
1389static void __remove_pgd_mapping(pgd_t *pgdir, unsigned long start, u64 size)
1390{
1391        unsigned long end = start + size;
1392
1393        WARN_ON(pgdir != init_mm.pgd);
1394        WARN_ON((start < PAGE_OFFSET) || (end > PAGE_END));
1395
1396        unmap_hotplug_range(start, end, false);
1397        free_empty_tables(start, end, PAGE_OFFSET, PAGE_END);
1398}
1399
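/*
 * Memory hotplug entry point: map the new range into the linear map
 * (page granular if rodata_full or debug_pagealloc is in effect), then
 * hand it to the core with __add_pages(), rolling the mapping back on
 * failure.
 */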
1400int arch_add_memory(int nid, u64 start, u64 size,
1401                    struct mhp_params *params)
1402{
1403        int ret, flags = 0;
1404
1405        if (rodata_full || debug_pagealloc_enabled())
1406                flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
1407
1408        __create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start),
1409                             size, params->pgprot, __pgd_pgtable_alloc,
1410                             flags);
1411
1412        memblock_clear_nomap(start, size);
1413
1414        ret = __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
1415                           params);
1416        if (ret)
1417                __remove_pgd_mapping(swapper_pg_dir,
1418                                     __phys_to_virt(start), size);
1419        return ret;
1420}
1421
1422void arch_remove_memory(int nid, u64 start, u64 size,
1423                        struct vmem_altmap *altmap)
1424{
1425        unsigned long start_pfn = start >> PAGE_SHIFT;
1426        unsigned long nr_pages = size >> PAGE_SHIFT;
1427
1428        __remove_pages(start_pfn, nr_pages, altmap);
1429        __remove_pgd_mapping(swapper_pg_dir, __phys_to_virt(start), size);
1430}
1431
1432/*
1433 * This memory hotplug notifier helps prevent boot memory from being
1434 * inadvertently removed as it blocks pfn range offlining process in
1435 * __offline_pages(). Hence this prevents both offlining as well as
1436 * removal process for boot memory which is initially always online.
1437 * In future if and when boot memory could be removed, this notifier
1438 * should be dropped and free_hotplug_page_range() should handle any
1439 * reserved pages allocated during boot.
1440 */
1441static int prevent_bootmem_remove_notifier(struct notifier_block *nb,
1442                                           unsigned long action, void *data)
1443{
1444        struct mem_section *ms;
1445        struct memory_notify *arg = data;
1446        unsigned long end_pfn = arg->start_pfn + arg->nr_pages;
1447        unsigned long pfn = arg->start_pfn;
1448
1449        if (action != MEM_GOING_OFFLINE)
1450                return NOTIFY_OK;
1451
1452        for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
1453                ms = __pfn_to_section(pfn);
1454                if (early_section(ms))
1455                        return NOTIFY_BAD;
1456        }
1457        return NOTIFY_OK;
1458}
1459
1460static struct notifier_block prevent_bootmem_remove_nb = {
1461        .notifier_call = prevent_bootmem_remove_notifier,
1462};
1463
1464static int __init prevent_bootmem_remove_init(void)
1465{
1466        return register_memory_notifier(&prevent_bootmem_remove_nb);
1467}
1468device_initcall(prevent_bootmem_remove_init);
1469#endif
1470