linux/mm/sparse.c
// SPDX-License-Identifier: GPL-2.0
/*
 * sparse memory mappings.
 */
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/mmzone.h>
#include <linux/memblock.h>
#include <linux/compiler.h>
#include <linux/highmem.h>
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/bootmem_info.h>

#include "internal.h"
#include <asm/dma.h>

/*
 * Permanent SPARSEMEM data:
 *
 * 1) mem_section       - memory sections, mem_map's for valid memory
 */
#ifdef CONFIG_SPARSEMEM_EXTREME
struct mem_section **mem_section;
#else
struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]
        ____cacheline_internodealigned_in_smp;
#endif
EXPORT_SYMBOL(mem_section);

#ifdef NODE_NOT_IN_PAGE_FLAGS
/*
 * If we did not store the node number in the page then we have to
 * do a lookup in the section_to_node_table in order to find which
 * node the page belongs to.
 */
#if MAX_NUMNODES <= 256
static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
#else
static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
#endif

int page_to_nid(const struct page *page)
{
        return section_to_node_table[page_to_section(page)];
}
EXPORT_SYMBOL(page_to_nid);

static void set_section_nid(unsigned long section_nr, int nid)
{
        section_to_node_table[section_nr] = nid;
}
#else /* !NODE_NOT_IN_PAGE_FLAGS */
static inline void set_section_nid(unsigned long section_nr, int nid)
{
}
#endif

#ifdef CONFIG_SPARSEMEM_EXTREME
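/*
 * Allocate one root's worth of mem_section structures (SECTIONS_PER_ROOT
 * entries) on the given node: from the slab allocator once it is available,
 * otherwise from memblock during early boot.
 */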
static noinline struct mem_section __ref *sparse_index_alloc(int nid)
{
        struct mem_section *section = NULL;
        unsigned long array_size = SECTIONS_PER_ROOT *
                                   sizeof(struct mem_section);

        if (slab_is_available()) {
                section = kzalloc_node(array_size, GFP_KERNEL, nid);
        } else {
                section = memblock_alloc_node(array_size, SMP_CACHE_BYTES,
                                              nid);
                if (!section)
                        panic("%s: Failed to allocate %lu bytes nid=%d\n",
                              __func__, array_size, nid);
        }

        return section;
}

static int __meminit sparse_index_init(unsigned long section_nr, int nid)
{
        unsigned long root = SECTION_NR_TO_ROOT(section_nr);
        struct mem_section *section;

        /*
         * An existing section is possible in the sub-section hotplug
         * case. First hot-add instantiates, follow-on hot-add reuses
         * the existing section.
         *
         * The mem_hotplug_lock resolves the apparent race below.
         */
        if (mem_section[root])
                return 0;

        section = sparse_index_alloc(nid);
        if (!section)
                return -ENOMEM;

        mem_section[root] = section;

        return 0;
}
#else /* !SPARSEMEM_EXTREME */
static inline int sparse_index_init(unsigned long section_nr, int nid)
{
        return 0;
}
#endif

/*
 * During early boot, before section_mem_map is used for an actual
 * mem_map, we use section_mem_map to store the section's NUMA
 * node.  This keeps us from having to use another data structure.  The
 * node information is cleared just before we store the real mem_map.
 */
static inline unsigned long sparse_encode_early_nid(int nid)
{
        return ((unsigned long)nid << SECTION_NID_SHIFT);
}

static inline int sparse_early_nid(struct mem_section *section)
{
        return (section->section_mem_map >> SECTION_NID_SHIFT);
}

/* Validate the physical addressing limitations of the model */
void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,
                                                unsigned long *end_pfn)
{
        unsigned long max_sparsemem_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT);

        /*
         * Sanity checks - do not allow an architecture to pass
         * in larger pfns than the maximum scope of sparsemem:
         */
        if (*start_pfn > max_sparsemem_pfn) {
                mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
                        "Start of range %lu -> %lu exceeds SPARSEMEM max %lu\n",
                        *start_pfn, *end_pfn, max_sparsemem_pfn);
                WARN_ON_ONCE(1);
                *start_pfn = max_sparsemem_pfn;
                *end_pfn = max_sparsemem_pfn;
        } else if (*end_pfn > max_sparsemem_pfn) {
                mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
                        "End of range %lu -> %lu exceeds SPARSEMEM max %lu\n",
                        *start_pfn, *end_pfn, max_sparsemem_pfn);
                WARN_ON_ONCE(1);
                *end_pfn = max_sparsemem_pfn;
        }
}

/*
 * There are a number of times that we loop over NR_MEM_SECTIONS,
 * looking for section_present() on each.  But, when we have very
 * large physical address spaces, NR_MEM_SECTIONS can also be
 * very large which makes the loops quite long.
 *
 * Keeping track of this gives us an easy way to break out of
 * those loops early.
 */
unsigned long __highest_present_section_nr;
static void __section_mark_present(struct mem_section *ms,
                unsigned long section_nr)
{
        if (section_nr > __highest_present_section_nr)
                __highest_present_section_nr = section_nr;

        ms->section_mem_map |= SECTION_MARKED_PRESENT;
}

#define for_each_present_section_nr(start, section_nr)          \
        for (section_nr = next_present_section_nr(start-1);     \
             ((section_nr != -1) &&                             \
              (section_nr <= __highest_present_section_nr));    \
             section_nr = next_present_section_nr(section_nr))

static inline unsigned long first_present_section_nr(void)
{
        return next_present_section_nr(-1);
}

#ifdef CONFIG_SPARSEMEM_VMEMMAP
static void subsection_mask_set(unsigned long *map, unsigned long pfn,
                unsigned long nr_pages)
{
        int idx = subsection_map_index(pfn);
        int end = subsection_map_index(pfn + nr_pages - 1);

        bitmap_set(map, idx, end - idx + 1);
}

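/*
 * Mark the subsections spanned by [pfn, pfn + nr_pages) as present in the
 * subsection_map of each memory section the range touches.
 */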
void __init subsection_map_init(unsigned long pfn, unsigned long nr_pages)
{
        int end_sec = pfn_to_section_nr(pfn + nr_pages - 1);
        unsigned long nr, start_sec = pfn_to_section_nr(pfn);

        if (!nr_pages)
                return;

        for (nr = start_sec; nr <= end_sec; nr++) {
                struct mem_section *ms;
                unsigned long pfns;

                pfns = min(nr_pages, PAGES_PER_SECTION
                                - (pfn & ~PAGE_SECTION_MASK));
                ms = __nr_to_section(nr);
                subsection_mask_set(ms->usage->subsection_map, pfn, pfns);

                pr_debug("%s: sec: %lu pfns: %lu set(%d, %d)\n", __func__, nr,
                                pfns, subsection_map_index(pfn),
                                subsection_map_index(pfn + pfns - 1));

                pfn += pfns;
                nr_pages -= pfns;
        }
}
#else
void __init subsection_map_init(unsigned long pfn, unsigned long nr_pages)
{
}
#endif

/* Record a memory area against a node. */
static void __init memory_present(int nid, unsigned long start, unsigned long end)
{
        unsigned long pfn;

#ifdef CONFIG_SPARSEMEM_EXTREME
        if (unlikely(!mem_section)) {
                unsigned long size, align;

                size = sizeof(struct mem_section *) * NR_SECTION_ROOTS;
                align = 1 << (INTERNODE_CACHE_SHIFT);
                mem_section = memblock_alloc(size, align);
                if (!mem_section)
                        panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
                              __func__, size, align);
        }
#endif

        start &= PAGE_SECTION_MASK;
        mminit_validate_memmodel_limits(&start, &end);
        for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
                unsigned long section = pfn_to_section_nr(pfn);
                struct mem_section *ms;

                sparse_index_init(section, nid);
                set_section_nid(section, nid);

                ms = __nr_to_section(section);
                if (!ms->section_mem_map) {
                        ms->section_mem_map = sparse_encode_early_nid(nid) |
                                                        SECTION_IS_ONLINE;
                        __section_mark_present(ms, section);
                }
        }
}

/*
 * Mark all memblocks as present using memory_present().
 * This is a convenience function that is useful to mark all of the system's
 * memory as present during initialization.
 */
static void __init memblocks_present(void)
{
        unsigned long start, end;
        int i, nid;

        for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid)
                memory_present(nid, start, end);
}

/*
 * Subtle, we encode the real pfn into the mem_map such that
 * the identity pfn - section_mem_map will return the actual
 * physical page frame number.
 */
static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long pnum)
{
        unsigned long coded_mem_map =
                (unsigned long)(mem_map - (section_nr_to_pfn(pnum)));
        BUILD_BUG_ON(SECTION_MAP_LAST_BIT > (1UL<<PFN_SECTION_SHIFT));
        BUG_ON(coded_mem_map & ~SECTION_MAP_MASK);
        return coded_mem_map;
}

#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Decode mem_map from the coded memmap
 */
struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum)
{
        /* mask off the extra low bits of information */
        coded_mem_map &= SECTION_MAP_MASK;
        return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum);
}
#endif /* CONFIG_MEMORY_HOTPLUG */

static void __meminit sparse_init_one_section(struct mem_section *ms,
                unsigned long pnum, struct page *mem_map,
                struct mem_section_usage *usage, unsigned long flags)
{
        ms->section_mem_map &= ~SECTION_MAP_MASK;
        ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum)
                | SECTION_HAS_MEM_MAP | flags;
        ms->usage = usage;
}

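/* Size, in bytes, of the pageblock flags bitmap covering one section. */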
static unsigned long usemap_size(void)
{
        return BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS) * sizeof(unsigned long);
}

size_t mem_section_usage_size(void)
{
        return sizeof(struct mem_section_usage) + usemap_size();
}

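/*
 * Physical address of a node's pglist_data; !NUMA builds only have the
 * statically allocated contig_page_data.
 */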
static inline phys_addr_t pgdat_to_phys(struct pglist_data *pgdat)
{
#ifndef CONFIG_NUMA
        VM_BUG_ON(pgdat != &contig_page_data);
        return __pa_symbol(&contig_page_data);
#else
        return __pa(pgdat);
#endif
}

#ifdef CONFIG_MEMORY_HOTREMOVE
static struct mem_section_usage * __init
sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
                                         unsigned long size)
{
        struct mem_section_usage *usage;
        unsigned long goal, limit;
        int nid;
        /*
         * A page may contain usemaps for other sections, preventing the
         * page from being freed and making a section unremovable while
         * other sections referencing the usemap remain active. Similarly,
         * a pgdat can prevent a section from being removed. If section A
         * contains a pgdat and section B contains the usemap, both
         * sections become inter-dependent. This allocates usemaps
         * from the same section as the pgdat where possible to avoid
         * this problem.
         */
        goal = pgdat_to_phys(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT);
        limit = goal + (1UL << PA_SECTION_SHIFT);
        nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
again:
        usage = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, goal, limit, nid);
        if (!usage && limit) {
                limit = 0;
                goto again;
        }
        return usage;
}

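/*
 * Report (once per pairing) when a node's usemap ends up in a different
 * section than its pgdat, since the two allocations then pin each other's
 * sections and create a circular hot-remove dependency.
 */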
static void __init check_usemap_section_nr(int nid,
                struct mem_section_usage *usage)
{
        unsigned long usemap_snr, pgdat_snr;
        static unsigned long old_usemap_snr;
        static unsigned long old_pgdat_snr;
        struct pglist_data *pgdat = NODE_DATA(nid);
        int usemap_nid;

        /* First call */
        if (!old_usemap_snr) {
                old_usemap_snr = NR_MEM_SECTIONS;
                old_pgdat_snr = NR_MEM_SECTIONS;
        }

        usemap_snr = pfn_to_section_nr(__pa(usage) >> PAGE_SHIFT);
        pgdat_snr = pfn_to_section_nr(pgdat_to_phys(pgdat) >> PAGE_SHIFT);
        if (usemap_snr == pgdat_snr)
                return;

        if (old_usemap_snr == usemap_snr && old_pgdat_snr == pgdat_snr)
                /* skip redundant message */
                return;

        old_usemap_snr = usemap_snr;
        old_pgdat_snr = pgdat_snr;

        usemap_nid = sparse_early_nid(__nr_to_section(usemap_snr));
        if (usemap_nid != nid) {
                pr_info("node %d must be removed before remove section %ld\n",
                        nid, usemap_snr);
                return;
        }
        /*
         * There is a circular dependency.
         * Some platforms allow un-removable sections because they will just
         * gather other removable sections for dynamic partitioning.
         * Just report the un-removable section's number here.
         */
        pr_info("Section %ld and %ld (node %d) have a circular dependency on usemap and pgdat allocations\n",
                usemap_snr, pgdat_snr, nid);
}
#else
static struct mem_section_usage * __init
sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
                                         unsigned long size)
{
        return memblock_alloc_node(size, SMP_CACHE_BYTES, pgdat->node_id);
}

static void __init check_usemap_section_nr(int nid,
                struct mem_section_usage *usage)
{
}
#endif /* CONFIG_MEMORY_HOTREMOVE */

#ifdef CONFIG_SPARSEMEM_VMEMMAP
static unsigned long __init section_map_size(void)
{
        return ALIGN(sizeof(struct page) * PAGES_PER_SECTION, PMD_SIZE);
}

#else
static unsigned long __init section_map_size(void)
{
        return PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION);
}

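/*
 * Non-VMEMMAP boot-time memmap allocation: carve the section's memmap out
 * of the sparse buffer when possible, otherwise fall back to memblock.
 */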
struct page __init *__populate_section_memmap(unsigned long pfn,
                unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
{
        unsigned long size = section_map_size();
        struct page *map = sparse_buffer_alloc(size);
        phys_addr_t addr = __pa(MAX_DMA_ADDRESS);

        if (map)
                return map;

        map = memmap_alloc(size, size, addr, nid, false);
        if (!map)
                panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa\n",
                      __func__, size, PAGE_SIZE, nid, &addr);

        return map;
}
#endif /* !CONFIG_SPARSEMEM_VMEMMAP */

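/*
 * Boot-time scratch buffer from which per-section memmaps are carved by
 * sparse_buffer_alloc(); any unused tail is returned to memblock.
 */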
static void *sparsemap_buf __meminitdata;
static void *sparsemap_buf_end __meminitdata;

static inline void __meminit sparse_buffer_free(unsigned long size)
{
        WARN_ON(!sparsemap_buf || size == 0);
        memblock_free_early(__pa(sparsemap_buf), size);
}

static void __init sparse_buffer_init(unsigned long size, int nid)
{
        phys_addr_t addr = __pa(MAX_DMA_ADDRESS);
        WARN_ON(sparsemap_buf); /* forgot to call sparse_buffer_fini()? */
        /*
         * Pre-allocated buffer is mainly used by __populate_section_memmap
         * and we want it to be properly aligned to the section size - this is
         * especially the case for VMEMMAP which maps memmap to PMDs
         */
        sparsemap_buf = memmap_alloc(size, section_map_size(), addr, nid, true);
        sparsemap_buf_end = sparsemap_buf + size;
}

static void __init sparse_buffer_fini(void)
{
        unsigned long size = sparsemap_buf_end - sparsemap_buf;

        if (sparsemap_buf && size > 0)
                sparse_buffer_free(size);
        sparsemap_buf = NULL;
}

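/*
 * Carve a size-aligned chunk from the pre-allocated sparse buffer. Any
 * padding skipped over for alignment is handed back to memblock. Returns
 * NULL when the buffer is exhausted (or was never set up), in which case
 * callers fall back to a regular allocation.
 */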
void * __meminit sparse_buffer_alloc(unsigned long size)
{
        void *ptr = NULL;

        if (sparsemap_buf) {
                ptr = (void *) roundup((unsigned long)sparsemap_buf, size);
                if (ptr + size > sparsemap_buf_end)
                        ptr = NULL;
                else {
                        /* Free redundant aligned space */
                        if ((unsigned long)(ptr - sparsemap_buf) > 0)
                                sparse_buffer_free((unsigned long)(ptr - sparsemap_buf));
                        sparsemap_buf = ptr + size;
                }
        }
        return ptr;
}

void __weak __meminit vmemmap_populate_print_last(void)
{
}

/*
 * Initialize sparse on a specific node. The node spans [pnum_begin, pnum_end),
 * and map_count is the number of present sections in this node.
 */
static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
                                   unsigned long pnum_end,
                                   unsigned long map_count)
{
        struct mem_section_usage *usage;
        unsigned long pnum;
        struct page *map;

        usage = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nid),
                        mem_section_usage_size() * map_count);
        if (!usage) {
                pr_err("%s: node[%d] usemap allocation failed", __func__, nid);
                goto failed;
        }
        sparse_buffer_init(map_count * section_map_size(), nid);
        for_each_present_section_nr(pnum_begin, pnum) {
                unsigned long pfn = section_nr_to_pfn(pnum);

                if (pnum >= pnum_end)
                        break;

                map = __populate_section_memmap(pfn, PAGES_PER_SECTION,
                                nid, NULL);
                if (!map) {
                        pr_err("%s: node[%d] memory map backing failed. Some memory will not be available.",
                               __func__, nid);
                        pnum_begin = pnum;
                        sparse_buffer_fini();
                        goto failed;
                }
                check_usemap_section_nr(nid, usage);
                sparse_init_one_section(__nr_to_section(pnum), pnum, map, usage,
                                SECTION_IS_EARLY);
                usage = (void *) usage + mem_section_usage_size();
        }
        sparse_buffer_fini();
        return;
failed:
        /* We failed to allocate, mark all the following pnums as not present */
        for_each_present_section_nr(pnum_begin, pnum) {
                struct mem_section *ms;

                if (pnum >= pnum_end)
                        break;
                ms = __nr_to_section(pnum);
                ms->section_mem_map = 0;
        }
}

/*
 * Allocate the accumulated non-linear sections, allocate a mem_map
 * for each and record the physical to section mapping.
 */
void __init sparse_init(void)
{
        unsigned long pnum_end, pnum_begin, map_count = 1;
        int nid_begin;

        memblocks_present();

        pnum_begin = first_present_section_nr();
        nid_begin = sparse_early_nid(__nr_to_section(pnum_begin));

        /* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */
        set_pageblock_order();

        for_each_present_section_nr(pnum_begin + 1, pnum_end) {
                int nid = sparse_early_nid(__nr_to_section(pnum_end));

                if (nid == nid_begin) {
                        map_count++;
                        continue;
                }
                /* Init node with sections in range [pnum_begin, pnum_end) */
                sparse_init_nid(nid_begin, pnum_begin, pnum_end, map_count);
                nid_begin = nid;
                pnum_begin = pnum_end;
                map_count = 1;
        }
        /* cover the last node */
        sparse_init_nid(nid_begin, pnum_begin, pnum_end, map_count);
        vmemmap_populate_print_last();
}

#ifdef CONFIG_MEMORY_HOTPLUG

/* Mark all memory sections within the pfn range as online */
void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
{
        unsigned long pfn;

        for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
                unsigned long section_nr = pfn_to_section_nr(pfn);
                struct mem_section *ms;

                /* onlining code should never touch invalid ranges */
                if (WARN_ON(!valid_section_nr(section_nr)))
                        continue;

                ms = __nr_to_section(section_nr);
                ms->section_mem_map |= SECTION_IS_ONLINE;
        }
}

/* Mark all memory sections within the pfn range as offline */
void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
{
        unsigned long pfn;

        for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
                unsigned long section_nr = pfn_to_section_nr(pfn);
                struct mem_section *ms;

                /*
                 * TODO this needs some double checking. Offlining code makes
                 * sure to check pfn_valid but those checks might be just bogus
                 */
                if (WARN_ON(!valid_section_nr(section_nr)))
                        continue;

                ms = __nr_to_section(section_nr);
                ms->section_mem_map &= ~SECTION_IS_ONLINE;
        }
}

#ifdef CONFIG_SPARSEMEM_VMEMMAP
static struct page * __meminit populate_section_memmap(unsigned long pfn,
                unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
{
        return __populate_section_memmap(pfn, nr_pages, nid, altmap);
}

static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages,
                struct vmem_altmap *altmap)
{
        unsigned long start = (unsigned long) pfn_to_page(pfn);
        unsigned long end = start + nr_pages * sizeof(struct page);

        vmemmap_free(start, end, altmap);
}
static void free_map_bootmem(struct page *memmap)
{
        unsigned long start = (unsigned long)memmap;
        unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION);

        vmemmap_free(start, end, NULL);
}

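/*
 * Clear the bits for [pfn, pfn + nr_pages) in the section's subsection_map.
 * Returns -EINVAL (with a warning) if any of those subsections were not
 * active, i.e. the range was already deactivated.
 */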
static int clear_subsection_map(unsigned long pfn, unsigned long nr_pages)
{
        DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 };
        DECLARE_BITMAP(tmp, SUBSECTIONS_PER_SECTION) = { 0 };
        struct mem_section *ms = __pfn_to_section(pfn);
        unsigned long *subsection_map = ms->usage
                ? &ms->usage->subsection_map[0] : NULL;

        subsection_mask_set(map, pfn, nr_pages);
        if (subsection_map)
                bitmap_and(tmp, map, subsection_map, SUBSECTIONS_PER_SECTION);

        if (WARN(!subsection_map || !bitmap_equal(tmp, map, SUBSECTIONS_PER_SECTION),
                                "section already deactivated (%#lx + %ld)\n",
                                pfn, nr_pages))
                return -EINVAL;

        bitmap_xor(subsection_map, map, subsection_map, SUBSECTIONS_PER_SECTION);
        return 0;
}

static bool is_subsection_map_empty(struct mem_section *ms)
{
        return bitmap_empty(&ms->usage->subsection_map[0],
                            SUBSECTIONS_PER_SECTION);
}

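/*
 * Set the bits for [pfn, pfn + nr_pages) in the section's subsection_map.
 * Fails with -EEXIST if any subsection in the range is already populated,
 * or -EINVAL if the range is empty.
 */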
static int fill_subsection_map(unsigned long pfn, unsigned long nr_pages)
{
        struct mem_section *ms = __pfn_to_section(pfn);
        DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 };
        unsigned long *subsection_map;
        int rc = 0;

        subsection_mask_set(map, pfn, nr_pages);

        subsection_map = &ms->usage->subsection_map[0];

        if (bitmap_empty(map, SUBSECTIONS_PER_SECTION))
                rc = -EINVAL;
        else if (bitmap_intersects(map, subsection_map, SUBSECTIONS_PER_SECTION))
                rc = -EEXIST;
        else
                bitmap_or(subsection_map, map, subsection_map,
                                SUBSECTIONS_PER_SECTION);

        return rc;
}
#else
struct page * __meminit populate_section_memmap(unsigned long pfn,
                unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
{
        return kvmalloc_node(array_size(sizeof(struct page),
                                        PAGES_PER_SECTION), GFP_KERNEL, nid);
}

static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages,
                struct vmem_altmap *altmap)
{
        kvfree(pfn_to_page(pfn));
}

static void free_map_bootmem(struct page *memmap)
{
        unsigned long maps_section_nr, removing_section_nr, i;
        unsigned long magic, nr_pages;
        struct page *page = virt_to_page(memmap);

        nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page))
                >> PAGE_SHIFT;

        for (i = 0; i < nr_pages; i++, page++) {
                magic = (unsigned long) page->freelist;

                BUG_ON(magic == NODE_INFO);

                maps_section_nr = pfn_to_section_nr(page_to_pfn(page));
                removing_section_nr = page_private(page);

                /*
                 * When this function is called, the section being removed is
                 * in a logically offlined state, meaning all of its pages are
                 * isolated from the page allocator. If the removed section's
                 * memmap is placed on the same section, it must not be freed;
                 * if it were freed, the page allocator could hand it out again
                 * even though it is about to be removed physically.
                 */
                if (maps_section_nr != removing_section_nr)
                        put_page_bootmem(page);
        }
}

static int clear_subsection_map(unsigned long pfn, unsigned long nr_pages)
{
        return 0;
}

static bool is_subsection_map_empty(struct mem_section *ms)
{
        return true;
}

static int fill_subsection_map(unsigned long pfn, unsigned long nr_pages)
{
        return 0;
}
#endif /* CONFIG_SPARSEMEM_VMEMMAP */

/*
 * To deactivate a memory region, there are 3 cases to handle across
 * two configurations (SPARSEMEM_VMEMMAP={y,n}):
 *
 * 1. deactivation of a partial hot-added section (only possible in
 *    the SPARSEMEM_VMEMMAP=y case).
 *      a) section was present at memory init.
 *      b) section was hot-added post memory init.
 * 2. deactivation of a complete hot-added section.
 * 3. deactivation of a complete section from memory init.
 *
 * For 1, when the subsection_map is not empty we will not be freeing the
 * usage map, but we still need to free the vmemmap range.
 *
 * For 2 and 3, the SPARSEMEM_VMEMMAP={y,n} cases are unified
 */
static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
                struct vmem_altmap *altmap)
{
        struct mem_section *ms = __pfn_to_section(pfn);
        bool section_is_early = early_section(ms);
        struct page *memmap = NULL;
        bool empty;

        if (clear_subsection_map(pfn, nr_pages))
                return;

        empty = is_subsection_map_empty(ms);
        if (empty) {
                unsigned long section_nr = pfn_to_section_nr(pfn);

                /*
                 * When removing an early section, the usage map is kept (as the
                 * usage maps of other sections fall into the same page). It
                 * will be re-used when re-adding the section - which is then no
                 * longer an early section. If the usage map is PageReserved, it
                 * was allocated during boot.
                 */
                if (!PageReserved(virt_to_page(ms->usage))) {
                        kfree(ms->usage);
                        ms->usage = NULL;
                }
                memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
                /*
                 * Mark the section invalid so that valid_section()
                 * returns false. This prevents code from dereferencing
                 * the ms->usage array.
                 */
                ms->section_mem_map &= ~SECTION_HAS_MEM_MAP;
        }

        /*
         * The memmap of early sections is always fully populated. See
         * section_activate() and pfn_valid().
         */
        if (!section_is_early)
                depopulate_section_memmap(pfn, nr_pages, altmap);
        else if (memmap)
                free_map_bootmem(memmap);

        if (empty)
                ms->section_mem_map = (unsigned long)NULL;
}

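/*
 * Allocate the usage map (if not already present), record the hot-added
 * subsections, and populate the memmap for [pfn, pfn + nr_pages). Partially
 * populated early sections reuse the memmap that already exists. Returns the
 * memmap on success or an ERR_PTR() on failure.
 */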
static struct page * __meminit section_activate(int nid, unsigned long pfn,
                unsigned long nr_pages, struct vmem_altmap *altmap)
{
        struct mem_section *ms = __pfn_to_section(pfn);
        struct mem_section_usage *usage = NULL;
        struct page *memmap;
        int rc = 0;

        if (!ms->usage) {
                usage = kzalloc(mem_section_usage_size(), GFP_KERNEL);
                if (!usage)
                        return ERR_PTR(-ENOMEM);
                ms->usage = usage;
        }

        rc = fill_subsection_map(pfn, nr_pages);
        if (rc) {
                if (usage)
                        ms->usage = NULL;
                kfree(usage);
                return ERR_PTR(rc);
        }

        /*
         * The early init code does not consider partially populated
         * initial sections; it simply assumes that memory will never be
         * referenced.  If we hot-add memory into such a section then we
         * do not need to populate the memmap and can simply reuse what
         * is already there.
         */
        if (nr_pages < PAGES_PER_SECTION && early_section(ms))
                return pfn_to_page(pfn);

        memmap = populate_section_memmap(pfn, nr_pages, nid, altmap);
        if (!memmap) {
                section_deactivate(pfn, nr_pages, altmap);
                return ERR_PTR(-ENOMEM);
        }

        return memmap;
}

/**
 * sparse_add_section - add a memory section, or populate an existing one
 * @nid: The node to add section on
 * @start_pfn: start pfn of the memory range
 * @nr_pages: number of pfns to add in the section
 * @altmap: device page map
 *
 * This is only intended for hotplug.
 *
 * Note that only VMEMMAP supports sub-section aligned hotplug;
 * the proper alignment and size are gated by check_pfn_span().
 *
 * Return:
 * * 0          - On success.
 * * -EEXIST    - Section is already present.
 * * -ENOMEM    - Out of memory.
 */
int __meminit sparse_add_section(int nid, unsigned long start_pfn,
                unsigned long nr_pages, struct vmem_altmap *altmap)
{
        unsigned long section_nr = pfn_to_section_nr(start_pfn);
        struct mem_section *ms;
        struct page *memmap;
        int ret;

        ret = sparse_index_init(section_nr, nid);
        if (ret < 0)
                return ret;

        memmap = section_activate(nid, start_pfn, nr_pages, altmap);
        if (IS_ERR(memmap))
                return PTR_ERR(memmap);

        /*
         * Poison uninitialized struct pages in order to catch invalid flags
         * combinations.
         */
        page_init_poison(memmap, sizeof(struct page) * nr_pages);

        ms = __nr_to_section(section_nr);
        set_section_nid(section_nr, nid);
        __section_mark_present(ms, section_nr);

        /* Align memmap to section boundary in the subsection case */
        if (section_nr_to_pfn(section_nr) != start_pfn)
                memmap = pfn_to_page(section_nr_to_pfn(section_nr));
        sparse_init_one_section(ms, section_nr, memmap, ms->usage, 0);

        return 0;
}

#ifdef CONFIG_MEMORY_FAILURE
static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
{
        int i;

        /*
         * A further optimization is to have per section refcounted
         * num_poisoned_pages.  But that would need more space per memmap, so
         * for now just do a quick global check to speed up this routine in the
         * absence of bad pages.
         */
        if (atomic_long_read(&num_poisoned_pages) == 0)
                return;

        for (i = 0; i < nr_pages; i++) {
                if (PageHWPoison(&memmap[i])) {
                        num_poisoned_pages_dec();
                        ClearPageHWPoison(&memmap[i]);
                }
        }
}
#else
static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
{
}
#endif

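/*
 * Tear down a range previously added with sparse_add_section(): clear any
 * HWPoison accounting for the covered struct pages, then release the memmap
 * and usage state via section_deactivate().
 */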
void sparse_remove_section(struct mem_section *ms, unsigned long pfn,
                unsigned long nr_pages, unsigned long map_offset,
                struct vmem_altmap *altmap)
{
        clear_hwpoisoned_pages(pfn_to_page(pfn) + map_offset,
                        nr_pages - map_offset);
        section_deactivate(pfn, nr_pages, altmap);
}
#endif /* CONFIG_MEMORY_HOTPLUG */