linux/arch/ia64/mm/discontig.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Copyright (c) 2000, 2003 Silicon Graphics, Inc.  All rights reserved.
   4 * Copyright (c) 2001 Intel Corp.
   5 * Copyright (c) 2001 Tony Luck <tony.luck@intel.com>
   6 * Copyright (c) 2002 NEC Corp.
   7 * Copyright (c) 2002 Kimio Suganuma <k-suganuma@da.jp.nec.com>
   8 * Copyright (c) 2004 Silicon Graphics, Inc
   9 *      Russ Anderson <rja@sgi.com>
  10 *      Jesse Barnes <jbarnes@sgi.com>
  11 *      Jack Steiner <steiner@sgi.com>
  12 */
  13
  14/*
  15 * Platform initialization for Discontig Memory
  16 */
  17
  18#include <linux/kernel.h>
  19#include <linux/mm.h>
  20#include <linux/nmi.h>
  21#include <linux/swap.h>
  22#include <linux/memblock.h>
  23#include <linux/acpi.h>
  24#include <linux/efi.h>
  25#include <linux/nodemask.h>
  26#include <linux/slab.h>
  27#include <asm/efi.h>
  28#include <asm/tlb.h>
  29#include <asm/meminit.h>
  30#include <asm/numa.h>
  31#include <asm/sections.h>
  32
  33/*
  34 * Track per-node information needed to setup the boot memory allocator, the
  35 * per-node areas, and the real VM.
  36 */
  37struct early_node_data {
  38        struct ia64_node_data *node_data;
  39        unsigned long pernode_addr;
  40        unsigned long pernode_size;
  41        unsigned long min_pfn;
  42        unsigned long max_pfn;
  43};
  44
  45static struct early_node_data mem_data[MAX_NUMNODES] __initdata;
  46static nodemask_t memory_less_mask __initdata;
  47
  48pg_data_t *pgdat_list[MAX_NUMNODES];
  49
  50/*
  51 * To prevent cache aliasing effects, align per-node structures so that they
  52 * start at addresses that are strided by node number.
  53 */
  54#define MAX_NODE_ALIGN_OFFSET   (32 * 1024 * 1024)
  55#define NODEDATA_ALIGN(addr, node)                                              \
  56        ((((addr) + 1024*1024-1) & ~(1024*1024-1)) +                            \
  57             (((node)*PERCPU_PAGE_SIZE) & (MAX_NODE_ALIGN_OFFSET - 1)))
  58
  59/**
  60 * build_node_maps - callback to setup mem_data structs for each node
  61 * @start: physical start of range
  62 * @len: length of range
  63 * @node: node where this range resides
  64 *
  65 * Detect extents of each piece of memory that we wish to
  66 * treat as a virtually contiguous block (i.e. each node). Each such block
  67 * must start on an %IA64_GRANULE_SIZE boundary, so we round the address down
  68 * if necessary.  Any non-existent pages will simply be part of the virtual
  69 * memmap.
  70 */
  71static int __init build_node_maps(unsigned long start, unsigned long len,
  72                                  int node)
  73{
  74        unsigned long spfn, epfn, end = start + len;
  75
  76        epfn = GRANULEROUNDUP(end) >> PAGE_SHIFT;
  77        spfn = GRANULEROUNDDOWN(start) >> PAGE_SHIFT;
  78
  79        if (!mem_data[node].min_pfn) {
  80                mem_data[node].min_pfn = spfn;
  81                mem_data[node].max_pfn = epfn;
  82        } else {
  83                mem_data[node].min_pfn = min(spfn, mem_data[node].min_pfn);
  84                mem_data[node].max_pfn = max(epfn, mem_data[node].max_pfn);
  85        }
  86
  87        return 0;
  88}
  89
  90/**
  91 * early_nr_cpus_node - return number of cpus on a given node
  92 * @node: node to check
  93 *
  94 * Count the number of cpus on @node.  We can't use nr_cpus_node() yet because
  95 * acpi_boot_init() (which builds the node_to_cpu_mask array) hasn't been
  96 * called yet.  Note that node 0 will also count all non-existent cpus.
  97 */
  98static int early_nr_cpus_node(int node)
  99{
 100        int cpu, n = 0;
 101
 102        for_each_possible_early_cpu(cpu)
 103                if (node == node_cpuid[cpu].nid)
 104                        n++;
 105
 106        return n;
 107}
 108
 109/**
 110 * compute_pernodesize - compute size of pernode data
 111 * @node: the node id.
 112 */
 113static unsigned long compute_pernodesize(int node)
 114{
 115        unsigned long pernodesize = 0, cpus;
 116
 117        cpus = early_nr_cpus_node(node);
 118        pernodesize += PERCPU_PAGE_SIZE * cpus;
 119        pernodesize += node * L1_CACHE_BYTES;
 120        pernodesize += L1_CACHE_ALIGN(sizeof(pg_data_t));
 121        pernodesize += L1_CACHE_ALIGN(sizeof(struct ia64_node_data));
 122        pernodesize += L1_CACHE_ALIGN(sizeof(pg_data_t));
 123        pernodesize = PAGE_ALIGN(pernodesize);
 124        return pernodesize;
 125}
 126
 127/**
 128 * per_cpu_node_setup - setup per-cpu areas on each node
 129 * @cpu_data: per-cpu area on this node
 130 * @node: node to setup
 131 *
 132 * Copy the static per-cpu data into the region we just set aside and then
 133 * setup __per_cpu_offset for each CPU on this node.  Return a pointer to
 134 * the end of the area.
 135 */
 136static void *per_cpu_node_setup(void *cpu_data, int node)
 137{
 138#ifdef CONFIG_SMP
 139        int cpu;
 140
 141        for_each_possible_early_cpu(cpu) {
 142                void *src = cpu == 0 ? __cpu0_per_cpu : __phys_per_cpu_start;
 143
 144                if (node != node_cpuid[cpu].nid)
 145                        continue;
 146
 147                memcpy(__va(cpu_data), src, __per_cpu_end - __per_cpu_start);
 148                __per_cpu_offset[cpu] = (char *)__va(cpu_data) -
 149                        __per_cpu_start;
 150
 151                /*
 152                 * percpu area for cpu0 is moved from the __init area
 153                 * which is setup by head.S and used till this point.
 154                 * Update ar.k3.  This move is ensures that percpu
 155                 * area for cpu0 is on the correct node and its
 156                 * virtual address isn't insanely far from other
 157                 * percpu areas which is important for congruent
 158                 * percpu allocator.
 159                 */
 160                if (cpu == 0)
 161                        ia64_set_kr(IA64_KR_PER_CPU_DATA,
 162                                    (unsigned long)cpu_data -
 163                                    (unsigned long)__per_cpu_start);
 164
 165                cpu_data += PERCPU_PAGE_SIZE;
 166        }
 167#endif
 168        return cpu_data;
 169}
 170
 171#ifdef CONFIG_SMP
 172/**
 173 * setup_per_cpu_areas - setup percpu areas
 174 *
 175 * Arch code has already allocated and initialized percpu areas.  All
 176 * this function has to do is to teach the determined layout to the
 177 * dynamic percpu allocator, which happens to be more complex than
 178 * creating whole new ones using helpers.
 179 */
 180void __init setup_per_cpu_areas(void)
 181{
 182        struct pcpu_alloc_info *ai;
 183        struct pcpu_group_info *gi;
 184        unsigned int *cpu_map;
 185        void *base;
 186        unsigned long base_offset;
 187        unsigned int cpu;
 188        ssize_t static_size, reserved_size, dyn_size;
 189        int node, prev_node, unit, nr_units;
 190
 191        ai = pcpu_alloc_alloc_info(MAX_NUMNODES, nr_cpu_ids);
 192        if (!ai)
 193                panic("failed to allocate pcpu_alloc_info");
 194        cpu_map = ai->groups[0].cpu_map;
 195
 196        /* determine base */
 197        base = (void *)ULONG_MAX;
 198        for_each_possible_cpu(cpu)
 199                base = min(base,
 200                           (void *)(__per_cpu_offset[cpu] + __per_cpu_start));
 201        base_offset = (void *)__per_cpu_start - base;
 202
 203        /* build cpu_map, units are grouped by node */
 204        unit = 0;
 205        for_each_node(node)
 206                for_each_possible_cpu(cpu)
 207                        if (node == node_cpuid[cpu].nid)
 208                                cpu_map[unit++] = cpu;
 209        nr_units = unit;
 210
 211        /* set basic parameters */
 212        static_size = __per_cpu_end - __per_cpu_start;
 213        reserved_size = PERCPU_MODULE_RESERVE;
 214        dyn_size = PERCPU_PAGE_SIZE - static_size - reserved_size;
 215        if (dyn_size < 0)
 216                panic("percpu area overflow static=%zd reserved=%zd\n",
 217                      static_size, reserved_size);
 218
 219        ai->static_size         = static_size;
 220        ai->reserved_size       = reserved_size;
 221        ai->dyn_size            = dyn_size;
 222        ai->unit_size           = PERCPU_PAGE_SIZE;
 223        ai->atom_size           = PAGE_SIZE;
 224        ai->alloc_size          = PERCPU_PAGE_SIZE;
 225
 226        /*
 227         * CPUs are put into groups according to node.  Walk cpu_map
 228         * and create new groups at node boundaries.
 229         */
 230        prev_node = NUMA_NO_NODE;
 231        ai->nr_groups = 0;
 232        for (unit = 0; unit < nr_units; unit++) {
 233                cpu = cpu_map[unit];
 234                node = node_cpuid[cpu].nid;
 235
 236                if (node == prev_node) {
 237                        gi->nr_units++;
 238                        continue;
 239                }
 240                prev_node = node;
 241
 242                gi = &ai->groups[ai->nr_groups++];
 243                gi->nr_units            = 1;
 244                gi->base_offset         = __per_cpu_offset[cpu] + base_offset;
 245                gi->cpu_map             = &cpu_map[unit];
 246        }
 247
 248        pcpu_setup_first_chunk(ai, base);
 249        pcpu_free_alloc_info(ai);
 250}
 251#endif
 252
 253/**
 254 * fill_pernode - initialize pernode data.
 255 * @node: the node id.
 256 * @pernode: physical address of pernode data
 257 * @pernodesize: size of the pernode data
 258 */
 259static void __init fill_pernode(int node, unsigned long pernode,
 260        unsigned long pernodesize)
 261{
 262        void *cpu_data;
 263        int cpus = early_nr_cpus_node(node);
 264
 265        mem_data[node].pernode_addr = pernode;
 266        mem_data[node].pernode_size = pernodesize;
 267        memset(__va(pernode), 0, pernodesize);
 268
 269        cpu_data = (void *)pernode;
 270        pernode += PERCPU_PAGE_SIZE * cpus;
 271        pernode += node * L1_CACHE_BYTES;
 272
 273        pgdat_list[node] = __va(pernode);
 274        pernode += L1_CACHE_ALIGN(sizeof(pg_data_t));
 275
 276        mem_data[node].node_data = __va(pernode);
 277        pernode += L1_CACHE_ALIGN(sizeof(struct ia64_node_data));
 278        pernode += L1_CACHE_ALIGN(sizeof(pg_data_t));
 279
 280        cpu_data = per_cpu_node_setup(cpu_data, node);
 281
 282        return;
 283}
 284
 285/**
 286 * find_pernode_space - allocate memory for memory map and per-node structures
 287 * @start: physical start of range
 288 * @len: length of range
 289 * @node: node where this range resides
 290 *
 291 * This routine reserves space for the per-cpu data struct, the list of
 292 * pg_data_ts and the per-node data struct.  Each node will have something like
 293 * the following in the first chunk of addr. space large enough to hold it.
 294 *
 295 *    ________________________
 296 *   |                        |
 297 *   |~~~~~~~~~~~~~~~~~~~~~~~~| <-- NODEDATA_ALIGN(start, node) for the first
 298 *   |    PERCPU_PAGE_SIZE *  |     start and length big enough
 299 *   |    cpus_on_this_node   | Node 0 will also have entries for all non-existent cpus.
 300 *   |------------------------|
 301 *   |   local pg_data_t *    |
 302 *   |------------------------|
 303 *   |  local ia64_node_data  |
 304 *   |------------------------|
 305 *   |          ???           |
 306 *   |________________________|
 307 *
 308 * Once this space has been set aside, the bootmem maps are initialized.  We
 309 * could probably move the allocation of the per-cpu and ia64_node_data space
 310 * outside of this function and use alloc_bootmem_node(), but doing it here
 311 * is straightforward and we get the alignments we want so...
 312 */
 313static int __init find_pernode_space(unsigned long start, unsigned long len,
 314                                     int node)
 315{
 316        unsigned long spfn, epfn;
 317        unsigned long pernodesize = 0, pernode;
 318
 319        spfn = start >> PAGE_SHIFT;
 320        epfn = (start + len) >> PAGE_SHIFT;
 321
 322        /*
 323         * Make sure this memory falls within this node's usable memory
 324         * since we may have thrown some away in build_maps().
 325         */
 326        if (spfn < mem_data[node].min_pfn || epfn > mem_data[node].max_pfn)
 327                return 0;
 328
 329        /* Don't setup this node's local space twice... */
 330        if (mem_data[node].pernode_addr)
 331                return 0;
 332
 333        /*
 334         * Calculate total size needed, incl. what's necessary
 335         * for good alignment and alias prevention.
 336         */
 337        pernodesize = compute_pernodesize(node);
 338        pernode = NODEDATA_ALIGN(start, node);
 339
 340        /* Is this range big enough for what we want to store here? */
 341        if (start + len > (pernode + pernodesize))
 342                fill_pernode(node, pernode, pernodesize);
 343
 344        return 0;
 345}
 346
 347/**
 348 * reserve_pernode_space - reserve memory for per-node space
 349 *
 350 * Reserve the space used by the bootmem maps & per-node space in the boot
 351 * allocator so that when we actually create the real mem maps we don't
 352 * use their memory.
 353 */
 354static void __init reserve_pernode_space(void)
 355{
 356        unsigned long base, size;
 357        int node;
 358
 359        for_each_online_node(node) {
 360                if (node_isset(node, memory_less_mask))
 361                        continue;
 362
 363                /* Now the per-node space */
 364                size = mem_data[node].pernode_size;
 365                base = __pa(mem_data[node].pernode_addr);
 366                memblock_reserve(base, size);
 367        }
 368}
 369
 370static void scatter_node_data(void)
 371{
 372        pg_data_t **dst;
 373        int node;
 374
 375        /*
 376         * for_each_online_node() can't be used at here.
 377         * node_online_map is not set for hot-added nodes at this time,
 378         * because we are halfway through initialization of the new node's
 379         * structures.  If for_each_online_node() is used, a new node's
 380         * pg_data_ptrs will be not initialized. Instead of using it,
 381         * pgdat_list[] is checked.
 382         */
 383        for_each_node(node) {
 384                if (pgdat_list[node]) {
 385                        dst = LOCAL_DATA_ADDR(pgdat_list[node])->pg_data_ptrs;
 386                        memcpy(dst, pgdat_list, sizeof(pgdat_list));
 387                }
 388        }
 389}
 390
 391/**
 392 * initialize_pernode_data - fixup per-cpu & per-node pointers
 393 *
 394 * Each node's per-node area has a copy of the global pg_data_t list, so
 395 * we copy that to each node here, as well as setting the per-cpu pointer
 396 * to the local node data structure.
 397 */
 398static void __init initialize_pernode_data(void)
 399{
 400        int cpu, node;
 401
 402        scatter_node_data();
 403
 404#ifdef CONFIG_SMP
 405        /* Set the node_data pointer for each per-cpu struct */
 406        for_each_possible_early_cpu(cpu) {
 407                node = node_cpuid[cpu].nid;
 408                per_cpu(ia64_cpu_info, cpu).node_data =
 409                        mem_data[node].node_data;
 410        }
 411#else
 412        {
 413                struct cpuinfo_ia64 *cpu0_cpu_info;
 414                cpu = 0;
 415                node = node_cpuid[cpu].nid;
 416                cpu0_cpu_info = (struct cpuinfo_ia64 *)(__phys_per_cpu_start +
 417                        ((char *)&ia64_cpu_info - __per_cpu_start));
 418                cpu0_cpu_info->node_data = mem_data[node].node_data;
 419        }
 420#endif /* CONFIG_SMP */
 421}
 422
 423/**
 424 * memory_less_node_alloc - * attempt to allocate memory on the best NUMA slit
 425 *      node but fall back to any other node when __alloc_bootmem_node fails
 426 *      for best.
 427 * @nid: node id
 428 * @pernodesize: size of this node's pernode data
 429 */
 430static void __init *memory_less_node_alloc(int nid, unsigned long pernodesize)
 431{
 432        void *ptr = NULL;
 433        u8 best = 0xff;
 434        int bestnode = NUMA_NO_NODE, node, anynode = 0;
 435
 436        for_each_online_node(node) {
 437                if (node_isset(node, memory_less_mask))
 438                        continue;
 439                else if (node_distance(nid, node) < best) {
 440                        best = node_distance(nid, node);
 441                        bestnode = node;
 442                }
 443                anynode = node;
 444        }
 445
 446        if (bestnode == NUMA_NO_NODE)
 447                bestnode = anynode;
 448
 449        ptr = memblock_alloc_try_nid(pernodesize, PERCPU_PAGE_SIZE,
 450                                     __pa(MAX_DMA_ADDRESS),
 451                                     MEMBLOCK_ALLOC_ACCESSIBLE,
 452                                     bestnode);
 453        if (!ptr)
 454                panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%lx\n",
 455                      __func__, pernodesize, PERCPU_PAGE_SIZE, bestnode,
 456                      __pa(MAX_DMA_ADDRESS));
 457
 458        return ptr;
 459}
 460
 461/**
 462 * memory_less_nodes - allocate and initialize CPU only nodes pernode
 463 *      information.
 464 */
 465static void __init memory_less_nodes(void)
 466{
 467        unsigned long pernodesize;
 468        void *pernode;
 469        int node;
 470
 471        for_each_node_mask(node, memory_less_mask) {
 472                pernodesize = compute_pernodesize(node);
 473                pernode = memory_less_node_alloc(node, pernodesize);
 474                fill_pernode(node, __pa(pernode), pernodesize);
 475        }
 476
 477        return;
 478}
 479
 480/**
 481 * find_memory - walk the EFI memory map and setup the bootmem allocator
 482 *
 483 * Called early in boot to setup the bootmem allocator, and to
 484 * allocate the per-cpu and per-node structures.
 485 */
 486void __init find_memory(void)
 487{
 488        int node;
 489
 490        reserve_memory();
 491        efi_memmap_walk(filter_memory, register_active_ranges);
 492
 493        if (num_online_nodes() == 0) {
 494                printk(KERN_ERR "node info missing!\n");
 495                node_set_online(0);
 496        }
 497
 498        nodes_or(memory_less_mask, memory_less_mask, node_online_map);
 499        min_low_pfn = -1;
 500        max_low_pfn = 0;
 501
 502        /* These actually end up getting called by call_pernode_memory() */
 503        efi_memmap_walk(filter_rsvd_memory, build_node_maps);
 504        efi_memmap_walk(filter_rsvd_memory, find_pernode_space);
 505        efi_memmap_walk(find_max_min_low_pfn, NULL);
 506
 507        for_each_online_node(node)
 508                if (mem_data[node].min_pfn)
 509                        node_clear(node, memory_less_mask);
 510
 511        reserve_pernode_space();
 512        memory_less_nodes();
 513        initialize_pernode_data();
 514
 515        max_pfn = max_low_pfn;
 516
 517        find_initrd();
 518}
 519
 520#ifdef CONFIG_SMP
 521/**
 522 * per_cpu_init - setup per-cpu variables
 523 *
 524 * find_pernode_space() does most of this already, we just need to set
 525 * local_per_cpu_offset
 526 */
 527void *per_cpu_init(void)
 528{
 529        int cpu;
 530        static int first_time = 1;
 531
 532        if (first_time) {
 533                first_time = 0;
 534                for_each_possible_early_cpu(cpu)
 535                        per_cpu(local_per_cpu_offset, cpu) = __per_cpu_offset[cpu];
 536        }
 537
 538        return __per_cpu_start + __per_cpu_offset[smp_processor_id()];
 539}
 540#endif /* CONFIG_SMP */
 541
 542/**
 543 * call_pernode_memory - use SRAT to call callback functions with node info
 544 * @start: physical start of range
 545 * @len: length of range
 546 * @arg: function to call for each range
 547 *
 548 * efi_memmap_walk() knows nothing about layout of memory across nodes. Find
 549 * out to which node a block of memory belongs.  Ignore memory that we cannot
 550 * identify, and split blocks that run across multiple nodes.
 551 *
 552 * Take this opportunity to round the start address up and the end address
 553 * down to page boundaries.
 554 */
 555void call_pernode_memory(unsigned long start, unsigned long len, void *arg)
 556{
 557        unsigned long rs, re, end = start + len;
 558        void (*func)(unsigned long, unsigned long, int);
 559        int i;
 560
 561        start = PAGE_ALIGN(start);
 562        end &= PAGE_MASK;
 563        if (start >= end)
 564                return;
 565
 566        func = arg;
 567
 568        if (!num_node_memblks) {
 569                /* No SRAT table, so assume one node (node 0) */
 570                if (start < end)
 571                        (*func)(start, end - start, 0);
 572                return;
 573        }
 574
 575        for (i = 0; i < num_node_memblks; i++) {
 576                rs = max(start, node_memblk[i].start_paddr);
 577                re = min(end, node_memblk[i].start_paddr +
 578                         node_memblk[i].size);
 579
 580                if (rs < re)
 581                        (*func)(rs, re - rs, node_memblk[i].nid);
 582
 583                if (re == end)
 584                        break;
 585        }
 586}
 587
 588/**
 589 * paging_init - setup page tables
 590 *
 591 * paging_init() sets up the page tables for each node of the system and frees
 592 * the bootmem allocator memory for general use.
 593 */
 594void __init paging_init(void)
 595{
 596        unsigned long max_dma;
 597        unsigned long max_zone_pfns[MAX_NR_ZONES];
 598
 599        max_dma = virt_to_phys((void *) MAX_DMA_ADDRESS) >> PAGE_SHIFT;
 600
 601        sparse_init();
 602
 603        memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
 604        max_zone_pfns[ZONE_DMA32] = max_dma;
 605        max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
 606        free_area_init(max_zone_pfns);
 607
 608        zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page));
 609}
 610
 611#ifdef CONFIG_MEMORY_HOTPLUG
 612pg_data_t *arch_alloc_nodedata(int nid)
 613{
 614        unsigned long size = compute_pernodesize(nid);
 615
 616        return kzalloc(size, GFP_KERNEL);
 617}
 618
 619void arch_free_nodedata(pg_data_t *pgdat)
 620{
 621        kfree(pgdat);
 622}
 623
 624void arch_refresh_nodedata(int update_node, pg_data_t *update_pgdat)
 625{
 626        pgdat_list[update_node] = update_pgdat;
 627        scatter_node_data();
 628}
 629#endif
 630
 631#ifdef CONFIG_SPARSEMEM_VMEMMAP
 632int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
 633                struct vmem_altmap *altmap)
 634{
 635        return vmemmap_populate_basepages(start, end, node, NULL);
 636}
 637
 638void vmemmap_free(unsigned long start, unsigned long end,
 639                struct vmem_altmap *altmap)
 640{
 641}
 642#endif
 643