linux/arch/arm64/mm/init.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Based on arch/arm/mm/init.c
 *
 * Copyright (C) 1995-2005 Russell King
 * Copyright (C) 2012 ARM Ltd.
 */

#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/errno.h>
#include <linux/swap.h>
#include <linux/init.h>
#include <linux/cache.h>
#include <linux/mman.h>
#include <linux/nodemask.h>
#include <linux/initrd.h>
#include <linux/gfp.h>
#include <linux/memblock.h>
#include <linux/sort.h>
#include <linux/of.h>
#include <linux/of_fdt.h>
#include <linux/dma-direct.h>
#include <linux/dma-map-ops.h>
#include <linux/efi.h>
#include <linux/swiotlb.h>
#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/kexec.h>
#include <linux/crash_dump.h>
#include <linux/hugetlb.h>

#include <asm/boot.h>
#include <asm/fixmap.h>
#include <asm/kasan.h>
#include <asm/kernel-pgtable.h>
#include <asm/memory.h>
#include <asm/numa.h>
#include <asm/sections.h>
#include <asm/setup.h>
#include <linux/sizes.h>
#include <asm/tlb.h>
#include <asm/alternative.h>

#define ARM64_ZONE_DMA_BITS     30

/*
 * We need to be able to catch inadvertent references to memstart_addr
 * that occur (potentially in generic code) before arm64_memblock_init()
 * executes, which assigns it its actual value. So use a default value
 * that cannot be mistaken for a real physical address.
 */
s64 memstart_addr __ro_after_init = -1;
EXPORT_SYMBOL(memstart_addr);

/*
 * We create both ZONE_DMA and ZONE_DMA32. ZONE_DMA covers the first 1G of
 * memory as some devices, namely the Raspberry Pi 4, have peripherals with
 * this limited view of the memory. ZONE_DMA32 will cover the rest of the 32
 * bit addressable memory area.
 */
phys_addr_t arm64_dma_phys_limit __ro_after_init;
static phys_addr_t arm64_dma32_phys_limit __ro_after_init;

#ifdef CONFIG_KEXEC_CORE
/*
 * reserve_crashkernel() - reserve memory for the crash kernel
 *
 * This function reserves the memory area specified by the "crashkernel="
 * kernel command line parameter. The reserved memory is used by the dump
 * capture kernel when the primary kernel crashes.
 */
static void __init reserve_crashkernel(void)
{
        unsigned long long crash_base, crash_size;
        int ret;

        ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
                                &crash_size, &crash_base);
        /* no crashkernel= or invalid value specified */
        if (ret || !crash_size)
                return;

        crash_size = PAGE_ALIGN(crash_size);

        if (crash_base == 0) {
                /* Current arm64 boot protocol requires 2MB alignment */
                crash_base = memblock_find_in_range(0, arm64_dma32_phys_limit,
                                crash_size, SZ_2M);
                if (crash_base == 0) {
                        pr_warn("cannot allocate crashkernel (size:0x%llx)\n",
                                crash_size);
                        return;
                }
        } else {
                /* User specifies base address explicitly. */
                if (!memblock_is_region_memory(crash_base, crash_size)) {
                        pr_warn("cannot reserve crashkernel: region is not memory\n");
                        return;
                }

                if (memblock_is_region_reserved(crash_base, crash_size)) {
                        pr_warn("cannot reserve crashkernel: region overlaps reserved memory\n");
                        return;
                }

                if (!IS_ALIGNED(crash_base, SZ_2M)) {
                        pr_warn("cannot reserve crashkernel: base address is not 2MB aligned\n");
                        return;
                }
        }
        memblock_reserve(crash_base, crash_size);

        pr_info("crashkernel reserved: 0x%016llx - 0x%016llx (%lld MB)\n",
                crash_base, crash_base + crash_size, crash_size >> 20);

        crashk_res.start = crash_base;
        crashk_res.end = crash_base + crash_size - 1;
}
#else
static void __init reserve_crashkernel(void)
{
}
#endif /* CONFIG_KEXEC_CORE */
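
/*
 * Example usage (illustrative values): the crash region is requested on the
 * kernel command line, either letting the kernel pick a 2MB-aligned base:
 *
 *     crashkernel=512M
 *
 * or pinning it to an explicit base address:
 *
 *     crashkernel=512M@0x60000000
 *
 * See Documentation/admin-guide/kdump/ for the full syntax accepted by
 * parse_crashkernel().
 */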

#ifdef CONFIG_CRASH_DUMP
static int __init early_init_dt_scan_elfcorehdr(unsigned long node,
                const char *uname, int depth, void *data)
{
        const __be32 *reg;
        int len;

        if (depth != 1 || strcmp(uname, "chosen") != 0)
                return 0;

        reg = of_get_flat_dt_prop(node, "linux,elfcorehdr", &len);
        if (!reg || (len < (dt_root_addr_cells + dt_root_size_cells)))
                return 1;

        elfcorehdr_addr = dt_mem_next_cell(dt_root_addr_cells, &reg);
        elfcorehdr_size = dt_mem_next_cell(dt_root_size_cells, &reg);

        return 1;
}

/*
 * reserve_elfcorehdr() - reserve memory for the ELF core header
 *
 * This function reserves the memory occupied by an ELF core header
 * described in the device tree. This region contains all the
 * information about the primary kernel's core image and is used by a
 * dump capture kernel to access the system memory of the primary kernel.
 */
static void __init reserve_elfcorehdr(void)
{
        of_scan_flat_dt(early_init_dt_scan_elfcorehdr, NULL);

        if (!elfcorehdr_size)
                return;

        if (memblock_is_region_reserved(elfcorehdr_addr, elfcorehdr_size)) {
                pr_warn("elfcorehdr is overlapped\n");
                return;
        }

        memblock_reserve(elfcorehdr_addr, elfcorehdr_size);

        pr_info("Reserving %lldKB of memory at 0x%llx for elfcorehdr\n",
                elfcorehdr_size >> 10, elfcorehdr_addr);
}
#else
static void __init reserve_elfcorehdr(void)
{
}
#endif /* CONFIG_CRASH_DUMP */
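
/*
 * Illustrative /chosen property (values made up): the tooling that loads the
 * dump capture kernel describes the saved ELF core header with e.g.
 *
 *     linux,elfcorehdr = <0x0 0x9fe00000 0x0 0x10000>;
 *
 * an (address, size) pair encoded with the root node's #address-cells and
 * #size-cells, which is why the parser above consumes dt_root_addr_cells
 * followed by dt_root_size_cells.
 */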

/*
 * Return the maximum physical address for a zone with a given address size
 * limit. It currently assumes that for memory starting above 4G, 32-bit
 * devices will use a DMA offset.
 */
static phys_addr_t __init max_zone_phys(unsigned int zone_bits)
{
        phys_addr_t offset = memblock_start_of_DRAM() & GENMASK_ULL(63, zone_bits);
        return min(offset + (1ULL << zone_bits), memblock_end_of_DRAM());
}
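
/*
 * Worked example (illustrative): on a board whose DRAM starts at 0x80000000,
 * max_zone_phys(30) computes
 *
 *     offset = 0x80000000 & GENMASK_ULL(63, 30) = 0x80000000
 *     limit  = min(0x80000000 + SZ_1G, memblock_end_of_DRAM())
 *
 * so ZONE_DMA covers the first 1GB of RAM rather than the first 1GB of the
 * physical address space. On a Raspberry Pi 4 (DRAM at 0x0) the offset is 0
 * and the limit is simply min(SZ_1G, memblock_end_of_DRAM()).
 */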

static void __init zone_sizes_init(unsigned long min, unsigned long max)
{
        unsigned long max_zone_pfns[MAX_NR_ZONES]  = {0};

#ifdef CONFIG_ZONE_DMA
        max_zone_pfns[ZONE_DMA] = PFN_DOWN(arm64_dma_phys_limit);
#endif
#ifdef CONFIG_ZONE_DMA32
        max_zone_pfns[ZONE_DMA32] = PFN_DOWN(arm64_dma32_phys_limit);
#endif
        max_zone_pfns[ZONE_NORMAL] = max;

        free_area_init(max_zone_pfns);
}

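/*
 * A pfn is only considered valid if it both has a memmap entry and lies in
 * memory covered by the linear mapping: memblock_is_map_memory() rejects
 * regions marked MEMBLOCK_NOMAP (e.g. certain firmware-reserved ranges),
 * even though those may still have struct pages.
 */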
int pfn_valid(unsigned long pfn)
{
        phys_addr_t addr = pfn << PAGE_SHIFT;

        if ((addr >> PAGE_SHIFT) != pfn)
                return 0;

#ifdef CONFIG_SPARSEMEM
        if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
                return 0;

        if (!valid_section(__pfn_to_section(pfn)))
                return 0;
#endif
        return memblock_is_map_memory(addr);
}
EXPORT_SYMBOL(pfn_valid);

static phys_addr_t memory_limit = PHYS_ADDR_MAX;

/*
 * Limit the amount of memory described by the FDT ("mem=" command line
 * parameter).
 */
static int __init early_mem(char *p)
{
        if (!p)
                return 1;

        memory_limit = memparse(p, &p) & PAGE_MASK;
        pr_notice("Memory limited to %lldMB\n", memory_limit >> 20);

        return 0;
}
early_param("mem", early_mem);
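
/*
 * Example usage (illustrative value): booting with "mem=512M" caps the
 * usable RAM at 512MB. memparse() accepts the usual K/M/G suffixes and the
 * result is rounded down to a page boundary.
 */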

static int __init early_init_dt_scan_usablemem(unsigned long node,
                const char *uname, int depth, void *data)
{
        struct memblock_region *usablemem = data;
        const __be32 *reg;
        int len;

        if (depth != 1 || strcmp(uname, "chosen") != 0)
                return 0;

        reg = of_get_flat_dt_prop(node, "linux,usable-memory-range", &len);
        if (!reg || (len < (dt_root_addr_cells + dt_root_size_cells)))
                return 1;

        usablemem->base = dt_mem_next_cell(dt_root_addr_cells, &reg);
        usablemem->size = dt_mem_next_cell(dt_root_size_cells, &reg);

        return 1;
}

static void __init fdt_enforce_memory_region(void)
{
        struct memblock_region reg = {
                .size = 0,
        };

        of_scan_flat_dt(early_init_dt_scan_usablemem, &reg);

        if (reg.size)
                memblock_cap_memory_range(reg.base, reg.size);
}
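
/*
 * Illustrative /chosen property (values made up): a dump capture kernel is
 * typically confined to its crashkernel window with something like
 *
 *     linux,usable-memory-range = <0x0 0x60000000 0x0 0x20000000>;
 *
 * i.e. 512MB of usable memory starting at 0x60000000; memory outside that
 * range is dropped from memblock by memblock_cap_memory_range() above.
 */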

void __init arm64_memblock_init(void)
{
        const s64 linear_region_size = BIT(vabits_actual - 1);

        /* Handle linux,usable-memory-range property */
        fdt_enforce_memory_region();

        /* Remove memory above our supported physical address size */
        memblock_remove(1ULL << PHYS_MASK_SHIFT, ULLONG_MAX);

        /*
         * Select a suitable value for the base of physical memory.
         */
        memstart_addr = round_down(memblock_start_of_DRAM(),
                                   ARM64_MEMSTART_ALIGN);

        /*
         * Remove the memory that we will not be able to cover with the
         * linear mapping. Take care not to clip the kernel which may be
         * high in memory.
         */
        memblock_remove(max_t(u64, memstart_addr + linear_region_size,
                        __pa_symbol(_end)), ULLONG_MAX);
        if (memstart_addr + linear_region_size < memblock_end_of_DRAM()) {
                /* ensure that memstart_addr remains sufficiently aligned */
                memstart_addr = round_up(memblock_end_of_DRAM() - linear_region_size,
                                         ARM64_MEMSTART_ALIGN);
                memblock_remove(0, memstart_addr);
        }

        /*
         * If we are running with a 52-bit kernel VA config on a system that
         * does not support it, we have to place the available physical
         * memory in the 48-bit addressable part of the linear region, i.e.,
         * we have to move it upward. Since memstart_addr represents the
         * physical address of PAGE_OFFSET, we have to *subtract* from it.
         */
        if (IS_ENABLED(CONFIG_ARM64_VA_BITS_52) && (vabits_actual != 52))
                memstart_addr -= _PAGE_OFFSET(48) - _PAGE_OFFSET(52);
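        /*
         * Worked example (illustrative): with _PAGE_OFFSET(va) defined as
         * -(1UL << va), the adjustment above subtracts
         *
         *     0xffff000000000000 - 0xfff0000000000000 = 0x000f000000000000
         *
         * from memstart_addr, so that __phys_to_virt() places RAM at the
         * 48-bit PAGE_OFFSET even though the kernel was built for 52-bit VAs.
         */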

        /*
         * Apply the memory limit if it was set. Since the kernel may be loaded
         * high up in memory, add back the kernel region that must be accessible
         * via the linear mapping.
         */
        if (memory_limit != PHYS_ADDR_MAX) {
                memblock_mem_limit_remove_map(memory_limit);
                memblock_add(__pa_symbol(_text), (u64)(_end - _text));
        }

        if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && phys_initrd_size) {
                /*
                 * Add back the memory we just removed if removing it made
                 * the initrd inaccessible via the linear mapping.
                 * Otherwise, this is a no-op.
                 */
                u64 base = phys_initrd_start & PAGE_MASK;
                u64 size = PAGE_ALIGN(phys_initrd_start + phys_initrd_size) - base;

                /*
                 * We can only add back the initrd memory if we don't end up
                 * with more memory than we can address via the linear mapping.
                 * It is up to the bootloader to position the kernel and the
                 * initrd reasonably close to each other (i.e., within 32 GB of
                 * each other) so that all granule/#levels combinations can
                 * always access both.
                 */
                if (WARN(base < memblock_start_of_DRAM() ||
                         base + size > memblock_start_of_DRAM() +
                                       linear_region_size,
                        "initrd not fully accessible via the linear mapping -- please check your bootloader ...\n")) {
                        phys_initrd_size = 0;
                } else {
                        memblock_remove(base, size); /* clear MEMBLOCK_ flags */
                        memblock_add(base, size);
                        memblock_reserve(base, size);
                }
        }

        if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) {
                extern u16 memstart_offset_seed;
                u64 range = linear_region_size -
                            (memblock_end_of_DRAM() - memblock_start_of_DRAM());

                /*
                 * If the size of the linear region exceeds, by a sufficient
                 * margin, the size of the region that the available physical
                 * memory spans, randomize the linear region as well.
                 */
                if (memstart_offset_seed > 0 && range >= ARM64_MEMSTART_ALIGN) {
                        range /= ARM64_MEMSTART_ALIGN;
                        memstart_addr -= ARM64_MEMSTART_ALIGN *
                                         ((range * memstart_offset_seed) >> 16);
                }
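                /*
                 * memstart_offset_seed is a 16-bit value, so the expression
                 * (range * seed) >> 16 picks one of "range" evenly spaced
                 * slots; the linear-map address of DRAM is shifted up by a
                 * random multiple of ARM64_MEMSTART_ALIGN while still
                 * fitting inside the linear region.
                 */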
        }

        /*
         * Register the kernel text, kernel data, initrd, and initial
         * pagetables with memblock.
         */
        memblock_reserve(__pa_symbol(_text), _end - _text);
        if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && phys_initrd_size) {
                /* the generic initrd code expects virtual addresses */
                initrd_start = __phys_to_virt(phys_initrd_start);
                initrd_end = initrd_start + phys_initrd_size;
        }

        early_init_fdt_scan_reserved_mem();

        if (IS_ENABLED(CONFIG_ZONE_DMA)) {
                zone_dma_bits = ARM64_ZONE_DMA_BITS;
                arm64_dma_phys_limit = max_zone_phys(ARM64_ZONE_DMA_BITS);
        }

        if (IS_ENABLED(CONFIG_ZONE_DMA32))
                arm64_dma32_phys_limit = max_zone_phys(32);
        else
                arm64_dma32_phys_limit = PHYS_MASK + 1;

        reserve_crashkernel();

        reserve_elfcorehdr();

        high_memory = __va(memblock_end_of_DRAM() - 1) + 1;

        dma_contiguous_reserve(arm64_dma32_phys_limit);
}

void __init bootmem_init(void)
{
        unsigned long min, max;

        min = PFN_UP(memblock_start_of_DRAM());
        max = PFN_DOWN(memblock_end_of_DRAM());

        early_memtest(min << PAGE_SHIFT, max << PAGE_SHIFT);

        max_pfn = max_low_pfn = max;
        min_low_pfn = min;

        arm64_numa_init();

        /*
         * This must be done after arm64_numa_init(), which calls numa_init()
         * to set up node_online_map: hugetlb_cma_reserve() uses it when
         * distributing the requested CMA size across the online nodes.
         */
#if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_CMA)
        arm64_hugetlb_cma_reserve();
#endif

        dma_pernuma_cma_reserve();

        /*
         * sparse_init() tries to allocate memory from memblock, so must be
         * done after the fixed reservations
         */
        sparse_init();
        zone_sizes_init(min, max);

        memblock_dump_all();
}

#ifndef CONFIG_SPARSEMEM_VMEMMAP
static inline void free_memmap(unsigned long start_pfn, unsigned long end_pfn)
{
        struct page *start_pg, *end_pg;
        unsigned long pg, pgend;

        /*
         * Convert start_pfn/end_pfn to a struct page pointer.
         */
        start_pg = pfn_to_page(start_pfn - 1) + 1;
        end_pg = pfn_to_page(end_pfn - 1) + 1;

        /*
         * Convert to physical addresses, and round start upwards and end
         * downwards.
         */
        pg = (unsigned long)PAGE_ALIGN(__pa(start_pg));
        pgend = (unsigned long)__pa(end_pg) & PAGE_MASK;

        /*
         * If there are free pages between these, free the section of the
         * memmap array.
         */
        if (pg < pgend)
                memblock_free(pg, pgend - pg);
}

/*
 * The mem_map array can get very big. Free the unused area of the memory map.
 */
static void __init free_unused_memmap(void)
{
        unsigned long start, end, prev_end = 0;
        int i;

        for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, NULL) {
#ifdef CONFIG_SPARSEMEM
                /*
                 * Take care not to free memmap entries that don't exist due
                 * to SPARSEMEM sections which aren't present.
                 */
                start = min(start, ALIGN(prev_end, PAGES_PER_SECTION));
#endif
                /*
                 * If we had a previous bank, and there is a space between the
                 * current bank and the previous, free it.
                 */
                if (prev_end && prev_end < start)
                        free_memmap(prev_end, start);

                /*
                 * Align up here since the VM subsystem requires memmap
                 * entries to be valid up to the bank end rounded up to
                 * MAX_ORDER_NR_PAGES.
                 */
                prev_end = ALIGN(end, MAX_ORDER_NR_PAGES);
        }

#ifdef CONFIG_SPARSEMEM
        if (!IS_ALIGNED(prev_end, PAGES_PER_SECTION))
                free_memmap(prev_end, ALIGN(prev_end, PAGES_PER_SECTION));
#endif
}
#endif  /* !CONFIG_SPARSEMEM_VMEMMAP */

/*
 * mem_init() marks the free areas in the mem_map and tells us how much memory
 * is free.  This is done after various parts of the system have claimed their
 * memory after the kernel image.
 */
void __init mem_init(void)
{
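        /*
         * swiotlb bounce buffering is only needed when RAM extends beyond
         * what DMA-limited (ZONE_DMA/ZONE_DMA32) devices can address, or
         * when it is forced on the command line; otherwise the bounce pool
         * is skipped here to save memory.
         */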
        if (swiotlb_force == SWIOTLB_FORCE ||
            max_pfn > PFN_DOWN(arm64_dma_phys_limit ? : arm64_dma32_phys_limit))
                swiotlb_init(1);
        else
                swiotlb_force = SWIOTLB_NO_FORCE;

        set_max_mapnr(max_pfn - PHYS_PFN_OFFSET);

#ifndef CONFIG_SPARSEMEM_VMEMMAP
        free_unused_memmap();
#endif
        /* this will put all unused low memory onto the freelists */
        memblock_free_all();

        mem_init_print_info(NULL);

        /*
         * Check boundaries twice: Some fundamental inconsistencies can be
         * detected at build time already.
         */
#ifdef CONFIG_COMPAT
        BUILD_BUG_ON(TASK_SIZE_32 > DEFAULT_MAP_WINDOW_64);
#endif

        if (PAGE_SIZE >= 16384 && get_num_physpages() <= 128) {
                extern int sysctl_overcommit_memory;
                /*
                 * On a machine this small we won't get anywhere without
                 * overcommit, so turn it on by default.
                 */
                sysctl_overcommit_memory = OVERCOMMIT_ALWAYS;
        }
}

void free_initmem(void)
{
        free_reserved_area(lm_alias(__init_begin),
                           lm_alias(__init_end),
                           POISON_FREE_INITMEM, "unused kernel");
        /*
         * Unmap the __init region but leave the VM area in place. This
         * prevents the region from being reused for kernel modules, which
         * is not supported by kallsyms.
         */
        unmap_kernel_range((u64)__init_begin, (u64)(__init_end - __init_begin));
}

void dump_mem_limit(void)
{
        if (memory_limit != PHYS_ADDR_MAX) {
                pr_emerg("Memory Limit: %llu MB\n", memory_limit >> 20);
        } else {
                pr_emerg("Memory Limit: none\n");
        }
}