linux/mm/sparse.c
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * sparse memory mappings.
   4 */
   5#include <linux/mm.h>
   6#include <linux/slab.h>
   7#include <linux/mmzone.h>
   8#include <linux/memblock.h>
   9#include <linux/compiler.h>
  10#include <linux/highmem.h>
  11#include <linux/export.h>
  12#include <linux/spinlock.h>
  13#include <linux/vmalloc.h>
  14#include <linux/swap.h>
  15#include <linux/swapops.h>
  16
  17#include "internal.h"
  18#include <asm/dma.h>
  19#include <asm/pgalloc.h>
  20#include <asm/pgtable.h>
  21
  22/*
  23 * Permanent SPARSEMEM data:
  24 *
  25 * 1) mem_section       - memory sections, mem_map's for valid memory
  26 */
  27#ifdef CONFIG_SPARSEMEM_EXTREME
  28struct mem_section **mem_section;
  29#else
  30struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]
  31        ____cacheline_internodealigned_in_smp;
  32#endif
  33EXPORT_SYMBOL(mem_section);
  34
  35#ifdef NODE_NOT_IN_PAGE_FLAGS
  36/*
  37 * If we did not store the node number in the page then we have to
  38 * do a lookup in the section_to_node_table in order to find which
  39 * node the page belongs to.
  40 */
  41#if MAX_NUMNODES <= 256
  42static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
  43#else
  44static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
  45#endif
  46
  47int page_to_nid(const struct page *page)
  48{
  49        return section_to_node_table[page_to_section(page)];
  50}
  51EXPORT_SYMBOL(page_to_nid);
  52
  53static void set_section_nid(unsigned long section_nr, int nid)
  54{
  55        section_to_node_table[section_nr] = nid;
  56}
  57#else /* !NODE_NOT_IN_PAGE_FLAGS */
  58static inline void set_section_nid(unsigned long section_nr, int nid)
  59{
  60}
  61#endif
  62
  63#ifdef CONFIG_SPARSEMEM_EXTREME
  64static noinline struct mem_section __ref *sparse_index_alloc(int nid)
  65{
  66        struct mem_section *section = NULL;
  67        unsigned long array_size = SECTIONS_PER_ROOT *
  68                                   sizeof(struct mem_section);
  69
  70        if (slab_is_available()) {
  71                section = kzalloc_node(array_size, GFP_KERNEL, nid);
  72        } else {
  73                section = memblock_alloc_node(array_size, SMP_CACHE_BYTES,
  74                                              nid);
  75                if (!section)
  76                        panic("%s: Failed to allocate %lu bytes nid=%d\n",
  77                              __func__, array_size, nid);
  78        }
  79
  80        return section;
  81}
  82
  83static int __meminit sparse_index_init(unsigned long section_nr, int nid)
  84{
  85        unsigned long root = SECTION_NR_TO_ROOT(section_nr);
  86        struct mem_section *section;
  87
  88        /*
  89         * An existing section is possible in the sub-section hotplug
  90         * case. First hot-add instantiates, follow-on hot-add reuses
  91         * the existing section.
  92         *
  93         * The mem_hotplug_lock resolves the apparent race below.
  94         */
  95        if (mem_section[root])
  96                return 0;
  97
  98        section = sparse_index_alloc(nid);
  99        if (!section)
 100                return -ENOMEM;
 101
 102        mem_section[root] = section;
 103
 104        return 0;
 105}
 106#else /* !SPARSEMEM_EXTREME */
 107static inline int sparse_index_init(unsigned long section_nr, int nid)
 108{
 109        return 0;
 110}
 111#endif
 112
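    /*
     * Reverse of __nr_to_section(): map a mem_section pointer back to its
     * section number. With SPARSEMEM_EXTREME the root table has to be
     * scanned to find the root that contains @ms; otherwise plain pointer
     * arithmetic against the static mem_section[] array is enough.
     */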
 113#ifdef CONFIG_SPARSEMEM_EXTREME
 114unsigned long __section_nr(struct mem_section *ms)
 115{
 116        unsigned long root_nr;
 117        struct mem_section *root = NULL;
 118
 119        for (root_nr = 0; root_nr < NR_SECTION_ROOTS; root_nr++) {
 120                root = __nr_to_section(root_nr * SECTIONS_PER_ROOT);
 121                if (!root)
 122                        continue;
 123
 124                if ((ms >= root) && (ms < (root + SECTIONS_PER_ROOT)))
 125                        break;
 126        }
 127
 128        VM_BUG_ON(!root);
 129
 130        return (root_nr * SECTIONS_PER_ROOT) + (ms - root);
 131}
 132#else
 133unsigned long __section_nr(struct mem_section *ms)
 134{
 135        return (unsigned long)(ms - mem_section[0]);
 136}
 137#endif
 138
 139/*
 140 * During early boot, before section_mem_map is used for an actual
 141 * mem_map, we use section_mem_map to store the section's NUMA
 142 * node.  This keeps us from having to use another data structure.  The
 143 * node information is cleared just before we store the real mem_map.
 144 */
 145static inline unsigned long sparse_encode_early_nid(int nid)
 146{
 147        return (nid << SECTION_NID_SHIFT);
 148}
 149
 150static inline int sparse_early_nid(struct mem_section *section)
 151{
 152        return (section->section_mem_map >> SECTION_NID_SHIFT);
 153}
 154
 155/* Validate the physical addressing limitations of the model */
 156void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,
 157                                                unsigned long *end_pfn)
 158{
 159        unsigned long max_sparsemem_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT);
 160
 161        /*
 162         * Sanity checks - do not allow an architecture to pass
 163         * in larger pfns than the maximum scope of sparsemem:
 164         */
 165        if (*start_pfn > max_sparsemem_pfn) {
 166                mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
 167                        "Start of range %lu -> %lu exceeds SPARSEMEM max %lu\n",
 168                        *start_pfn, *end_pfn, max_sparsemem_pfn);
 169                WARN_ON_ONCE(1);
 170                *start_pfn = max_sparsemem_pfn;
 171                *end_pfn = max_sparsemem_pfn;
 172        } else if (*end_pfn > max_sparsemem_pfn) {
 173                mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
 174                        "End of range %lu -> %lu exceeds SPARSEMEM max %lu\n",
 175                        *start_pfn, *end_pfn, max_sparsemem_pfn);
 176                WARN_ON_ONCE(1);
 177                *end_pfn = max_sparsemem_pfn;
 178        }
 179}
 180
 181/*
 182 * There are a number of times that we loop over NR_MEM_SECTIONS,
 183 * looking for section_present() on each.  But, when we have very
 184 * large physical address spaces, NR_MEM_SECTIONS can also be
 185 * very large which makes the loops quite long.
 186 *
 187 * Keeping track of this gives us an easy way to break out of
 188 * those loops early.
 189 */
 190unsigned long __highest_present_section_nr;
 191static void section_mark_present(struct mem_section *ms)
 192{
 193        unsigned long section_nr = __section_nr(ms);
 194
 195        if (section_nr > __highest_present_section_nr)
 196                __highest_present_section_nr = section_nr;
 197
 198        ms->section_mem_map |= SECTION_MARKED_PRESENT;
 199}
 200
 201static inline unsigned long next_present_section_nr(unsigned long section_nr)
 202{
 203        do {
 204                section_nr++;
 205                if (present_section_nr(section_nr))
 206                        return section_nr;
 207        } while (section_nr <= __highest_present_section_nr);
 208
 209        return -1;
 210}
 211#define for_each_present_section_nr(start, section_nr)          \
 212        for (section_nr = next_present_section_nr((start) - 1); \
 213             ((section_nr != -1) &&                             \
 214              (section_nr <= __highest_present_section_nr));    \
 215             section_nr = next_present_section_nr(section_nr))
 216
 217static inline unsigned long first_present_section_nr(void)
 218{
 219        return next_present_section_nr(-1);
 220}
 221
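    /*
     * Set the bits in @map that cover the subsections spanned by
     * [@pfn, @pfn + @nr_pages); @map holds one bit per subsection of a
     * section (SUBSECTIONS_PER_SECTION bits in total).
     */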
 222static void subsection_mask_set(unsigned long *map, unsigned long pfn,
 223                unsigned long nr_pages)
 224{
 225        int idx = subsection_map_index(pfn);
 226        int end = subsection_map_index(pfn + nr_pages - 1);
 227
 228        bitmap_set(map, idx, end - idx + 1);
 229}
 230
 231void __init subsection_map_init(unsigned long pfn, unsigned long nr_pages)
 232{
 233        int end_sec = pfn_to_section_nr(pfn + nr_pages - 1);
 234        unsigned long nr, start_sec = pfn_to_section_nr(pfn);
 235
 236        if (!nr_pages)
 237                return;
 238
 239        for (nr = start_sec; nr <= end_sec; nr++) {
 240                struct mem_section *ms;
 241                unsigned long pfns;
 242
 243                pfns = min(nr_pages, PAGES_PER_SECTION
 244                                - (pfn & ~PAGE_SECTION_MASK));
 245                ms = __nr_to_section(nr);
 246                subsection_mask_set(ms->usage->subsection_map, pfn, pfns);
 247
 248                pr_debug("%s: sec: %lu pfns: %lu set(%d, %d)\n", __func__, nr,
 249                                pfns, subsection_map_index(pfn),
 250                                subsection_map_index(pfn + pfns - 1));
 251
 252                pfn += pfns;
 253                nr_pages -= pfns;
 254        }
 255}
 256
 257/* Record a memory area against a node. */
 258void __init memory_present(int nid, unsigned long start, unsigned long end)
 259{
 260        unsigned long pfn;
 261
 262#ifdef CONFIG_SPARSEMEM_EXTREME
 263        if (unlikely(!mem_section)) {
 264                unsigned long size, align;
 265
 266                size = sizeof(struct mem_section*) * NR_SECTION_ROOTS;
 267                align = 1 << (INTERNODE_CACHE_SHIFT);
 268                mem_section = memblock_alloc(size, align);
 269                if (!mem_section)
 270                        panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
 271                              __func__, size, align);
 272        }
 273#endif
 274
 275        start &= PAGE_SECTION_MASK;
 276        mminit_validate_memmodel_limits(&start, &end);
 277        for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
 278                unsigned long section = pfn_to_section_nr(pfn);
 279                struct mem_section *ms;
 280
 281                sparse_index_init(section, nid);
 282                set_section_nid(section, nid);
 283
 284                ms = __nr_to_section(section);
 285                if (!ms->section_mem_map) {
 286                        ms->section_mem_map = sparse_encode_early_nid(nid) |
 287                                                        SECTION_IS_ONLINE;
 288                        section_mark_present(ms);
 289                }
 290        }
 291}
 292
 293/*
 294 * Mark all memblocks as present using memory_present(). This is a
 295 * convenience function that is useful for a number of arches
 296 * to mark all of the system's memory as present during initialization.
 297 */
 298void __init memblocks_present(void)
 299{
 300        struct memblock_region *reg;
 301
 302        for_each_memblock(memory, reg) {
 303                memory_present(memblock_get_region_node(reg),
 304                               memblock_region_memory_base_pfn(reg),
 305                               memblock_region_memory_end_pfn(reg));
 306        }
 307}
 308
 309/*
 310 * Subtle: the stored value is mem_map biased by the section's start pfn,
 311 * so that adding a pfn to it yields that pfn's struct page, and
 312 * subtracting it from a struct page pointer yields the page's pfn.
 313 */
 314static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long pnum)
 315{
 316        unsigned long coded_mem_map =
 317                (unsigned long)(mem_map - (section_nr_to_pfn(pnum)));
 318        BUILD_BUG_ON(SECTION_MAP_LAST_BIT > (1UL<<PFN_SECTION_SHIFT));
 319        BUG_ON(coded_mem_map & ~SECTION_MAP_MASK);
 320        return coded_mem_map;
 321}
 322
 323/*
 324 * Decode mem_map from the coded memmap
 325 */
 326struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum)
 327{
 328        /* mask off the extra low bits of information */
 329        coded_mem_map &= SECTION_MAP_MASK;
 330        return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum);
 331}
 332
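    /*
     * Install the encoded memmap and the usage map for one section: the
     * pointer part of section_mem_map is replaced while the existing flag
     * bits are preserved, and SECTION_HAS_MEM_MAP plus the caller-supplied
     * flags are added.
     */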
 333static void __meminit sparse_init_one_section(struct mem_section *ms,
 334                unsigned long pnum, struct page *mem_map,
 335                struct mem_section_usage *usage, unsigned long flags)
 336{
 337        ms->section_mem_map &= ~SECTION_MAP_MASK;
 338        ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum)
 339                | SECTION_HAS_MEM_MAP | flags;
 340        ms->usage = usage;
 341}
 342
 343static unsigned long usemap_size(void)
 344{
 345        return BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS) * sizeof(unsigned long);
 346}
 347
 348size_t mem_section_usage_size(void)
 349{
 350        return sizeof(struct mem_section_usage) + usemap_size();
 351}
 352
 353#ifdef CONFIG_MEMORY_HOTREMOVE
 354static struct mem_section_usage * __init
 355sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
 356                                         unsigned long size)
 357{
 358        struct mem_section_usage *usage;
 359        unsigned long goal, limit;
 360        int nid;
 361        /*
 362         * A page may contain usemaps for other sections, preventing the
 363         * page from being freed and making a section unremovable while
 364         * other sections referencing the usemap remain active. Similarly,
 365         * a pgdat can prevent a section being removed. If section A
 366         * contains a pgdat and section B contains the usemap, both
 367         * sections become inter-dependent. This allocates usemaps
 368         * from the same section as the pgdat where possible to avoid
 369         * this problem.
 370         */
 371        goal = __pa(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT);
 372        limit = goal + (1UL << PA_SECTION_SHIFT);
 373        nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
 374again:
 375        usage = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, goal, limit, nid);
 376        if (!usage && limit) {
 377                limit = 0;
 378                goto again;
 379        }
 380        return usage;
 381}
 382
 383static void __init check_usemap_section_nr(int nid,
 384                struct mem_section_usage *usage)
 385{
 386        unsigned long usemap_snr, pgdat_snr;
 387        static unsigned long old_usemap_snr;
 388        static unsigned long old_pgdat_snr;
 389        struct pglist_data *pgdat = NODE_DATA(nid);
 390        int usemap_nid;
 391
 392        /* First call */
 393        if (!old_usemap_snr) {
 394                old_usemap_snr = NR_MEM_SECTIONS;
 395                old_pgdat_snr = NR_MEM_SECTIONS;
 396        }
 397
 398        usemap_snr = pfn_to_section_nr(__pa(usage) >> PAGE_SHIFT);
 399        pgdat_snr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
 400        if (usemap_snr == pgdat_snr)
 401                return;
 402
 403        if (old_usemap_snr == usemap_snr && old_pgdat_snr == pgdat_snr)
 404                /* skip redundant message */
 405                return;
 406
 407        old_usemap_snr = usemap_snr;
 408        old_pgdat_snr = pgdat_snr;
 409
 410        usemap_nid = sparse_early_nid(__nr_to_section(usemap_snr));
 411        if (usemap_nid != nid) {
 412                pr_info("node %d must be removed before removing section %ld\n",
 413                        nid, usemap_snr);
 414                return;
 415        }
 416        /*
 417         * There is a circular dependency.
 418         * Some platforms allow un-removable sections because they will just
 419         * gather other removable sections for dynamic partitioning.
 420         * Just report the un-removable section's number here.
 421         */
 422        pr_info("Section %ld and %ld (node %d) have a circular dependency on usemap and pgdat allocations\n",
 423                usemap_snr, pgdat_snr, nid);
 424}
 425#else
 426static struct mem_section_usage * __init
 427sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
 428                                         unsigned long size)
 429{
 430        return memblock_alloc_node(size, SMP_CACHE_BYTES, pgdat->node_id);
 431}
 432
 433static void __init check_usemap_section_nr(int nid,
 434                struct mem_section_usage *usage)
 435{
 436}
 437#endif /* CONFIG_MEMORY_HOTREMOVE */
 438
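    /*
     * How much memory to reserve for one section's memmap: rounded up to
     * PMD_SIZE when vmemmap is used, so the virtual memmap can be backed by
     * PMD-level mappings on architectures that support them, and to
     * PAGE_SIZE otherwise.
     */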
 439#ifdef CONFIG_SPARSEMEM_VMEMMAP
 440static unsigned long __init section_map_size(void)
 441{
 442        return ALIGN(sizeof(struct page) * PAGES_PER_SECTION, PMD_SIZE);
 443}
 444
 445#else
 446static unsigned long __init section_map_size(void)
 447{
 448        return PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION);
 449}
 450
 451struct page __init *__populate_section_memmap(unsigned long pfn,
 452                unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
 453{
 454        unsigned long size = section_map_size();
 455        struct page *map = sparse_buffer_alloc(size);
 456        phys_addr_t addr = __pa(MAX_DMA_ADDRESS);
 457
 458        if (map)
 459                return map;
 460
 461        map = memblock_alloc_try_nid(size,
 462                                          PAGE_SIZE, addr,
 463                                          MEMBLOCK_ALLOC_ACCESSIBLE, nid);
 464        if (!map)
 465                panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa\n",
 466                      __func__, size, PAGE_SIZE, nid, &addr);
 467
 468        return map;
 469}
 470#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
 471
 472static void *sparsemap_buf __meminitdata;
 473static void *sparsemap_buf_end __meminitdata;
 474
 475static inline void __meminit sparse_buffer_free(unsigned long size)
 476{
 477        WARN_ON(!sparsemap_buf || size == 0);
 478        memblock_free_early(__pa(sparsemap_buf), size);
 479}
 480
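    /*
     * Pre-allocate one large node-local chunk from which sparse_buffer_alloc()
     * carves the section memmaps for this node; sparse_buffer_fini() hands any
     * unused remainder back to memblock.
     */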
 481static void __init sparse_buffer_init(unsigned long size, int nid)
 482{
 483        phys_addr_t addr = __pa(MAX_DMA_ADDRESS);
 484        WARN_ON(sparsemap_buf); /* forgot to call sparse_buffer_fini()? */
 485        sparsemap_buf =
 486                memblock_alloc_try_nid_raw(size, PAGE_SIZE,
 487                                                addr,
 488                                                MEMBLOCK_ALLOC_ACCESSIBLE, nid);
 489        sparsemap_buf_end = sparsemap_buf + size;
 490}
 491
 492static void __init sparse_buffer_fini(void)
 493{
 494        unsigned long size = sparsemap_buf_end - sparsemap_buf;
 495
 496        if (sparsemap_buf && size > 0)
 497                sparse_buffer_free(size);
 498        sparsemap_buf = NULL;
 499}
 500
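    /*
     * Hand out a @size-aligned piece of the buffer set up by
     * sparse_buffer_init(). Any gap skipped over for alignment is freed back
     * to memblock immediately. Returns NULL once the buffer is exhausted, in
     * which case callers fall back to a fresh memblock allocation.
     */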
 501void * __meminit sparse_buffer_alloc(unsigned long size)
 502{
 503        void *ptr = NULL;
 504
 505        if (sparsemap_buf) {
 506                ptr = (void *) roundup((unsigned long)sparsemap_buf, size);
 507                if (ptr + size > sparsemap_buf_end)
 508                        ptr = NULL;
 509                else {
 510                        /* Free redundant aligned space */
 511                        if ((unsigned long)(ptr - sparsemap_buf) > 0)
 512                                sparse_buffer_free((unsigned long)(ptr - sparsemap_buf));
 513                        sparsemap_buf = ptr + size;
 514                }
 515        }
 516        return ptr;
 517}
 518
 519void __weak __meminit vmemmap_populate_print_last(void)
 520{
 521}
 522
 523/*
 524 * Initialize sparse memory on a specific node. The node spans sections
 525 * [pnum_begin, pnum_end) and has map_count present sections.
 526 */
 527static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
 528                                   unsigned long pnum_end,
 529                                   unsigned long map_count)
 530{
 531        struct mem_section_usage *usage;
 532        unsigned long pnum;
 533        struct page *map;
 534
 535        usage = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nid),
 536                        mem_section_usage_size() * map_count);
 537        if (!usage) {
 538                pr_err("%s: node[%d] usemap allocation failed\n", __func__, nid);
 539                goto failed;
 540        }
 541        sparse_buffer_init(map_count * section_map_size(), nid);
 542        for_each_present_section_nr(pnum_begin, pnum) {
 543                unsigned long pfn = section_nr_to_pfn(pnum);
 544
 545                if (pnum >= pnum_end)
 546                        break;
 547
 548                map = __populate_section_memmap(pfn, PAGES_PER_SECTION,
 549                                nid, NULL);
 550                if (!map) {
 551                        pr_err("%s: node[%d] memory map backing failed. Some memory will not be available.\n",
 552                               __func__, nid);
 553                        pnum_begin = pnum;
 554                        goto failed;
 555                }
 556                check_usemap_section_nr(nid, usage);
 557                sparse_init_one_section(__nr_to_section(pnum), pnum, map, usage,
 558                                SECTION_IS_EARLY);
 559                usage = (void *) usage + mem_section_usage_size();
 560        }
 561        sparse_buffer_fini();
 562        return;
 563failed:
 564        /* We failed to allocate; mark all the following pnums as not present */
 565        for_each_present_section_nr(pnum_begin, pnum) {
 566                struct mem_section *ms;
 567
 568                if (pnum >= pnum_end)
 569                        break;
 570                ms = __nr_to_section(pnum);
 571                ms->section_mem_map = 0;
 572        }
 573}
 574
 575/*
 576 * Allocate the accumulated non-linear sections, allocate a mem_map
 577 * for each and record the physical to section mapping.
 578 */
 579void __init sparse_init(void)
 580{
 581        unsigned long pnum_begin = first_present_section_nr();
 582        int nid_begin = sparse_early_nid(__nr_to_section(pnum_begin));
 583        unsigned long pnum_end, map_count = 1;
 584
 585        /* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */
 586        set_pageblock_order();
 587
 588        for_each_present_section_nr(pnum_begin + 1, pnum_end) {
 589                int nid = sparse_early_nid(__nr_to_section(pnum_end));
 590
 591                if (nid == nid_begin) {
 592                        map_count++;
 593                        continue;
 594                }
 595                /* Init node with sections in range [pnum_begin, pnum_end) */
 596                sparse_init_nid(nid_begin, pnum_begin, pnum_end, map_count);
 597                nid_begin = nid;
 598                pnum_begin = pnum_end;
 599                map_count = 1;
 600        }
 601        /* cover the last node */
 602        sparse_init_nid(nid_begin, pnum_begin, pnum_end, map_count);
 603        vmemmap_populate_print_last();
 604}
 605
 606#ifdef CONFIG_MEMORY_HOTPLUG
 607
 608/* Mark all memory sections within the pfn range as online */
 609void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
 610{
 611        unsigned long pfn;
 612
 613        for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
 614                unsigned long section_nr = pfn_to_section_nr(pfn);
 615                struct mem_section *ms;
 616
 617                /* onlining code should never touch invalid ranges */
 618                if (WARN_ON(!valid_section_nr(section_nr)))
 619                        continue;
 620
 621                ms = __nr_to_section(section_nr);
 622                ms->section_mem_map |= SECTION_IS_ONLINE;
 623        }
 624}
 625
 626#ifdef CONFIG_MEMORY_HOTREMOVE
 627/* Mark all memory sections within the pfn range as offline */
 628void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
 629{
 630        unsigned long pfn;
 631
 632        for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
 633                unsigned long section_nr = pfn_to_section_nr(pfn);
 634                struct mem_section *ms;
 635
 636                /*
 637                 * TODO this needs some double checking. Offlining code makes
 638                 * sure to check pfn_valid but those checks might be just bogus
 639                 */
 640                if (WARN_ON(!valid_section_nr(section_nr)))
 641                        continue;
 642
 643                ms = __nr_to_section(section_nr);
 644                ms->section_mem_map &= ~SECTION_IS_ONLINE;
 645        }
 646}
 647#endif
 648
 649#ifdef CONFIG_SPARSEMEM_VMEMMAP
 650static struct page *populate_section_memmap(unsigned long pfn,
 651                unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
 652{
 653        return __populate_section_memmap(pfn, nr_pages, nid, altmap);
 654}
 655
 656static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages,
 657                struct vmem_altmap *altmap)
 658{
 659        unsigned long start = (unsigned long) pfn_to_page(pfn);
 660        unsigned long end = start + nr_pages * sizeof(struct page);
 661
 662        vmemmap_free(start, end, altmap);
 663}
 664static void free_map_bootmem(struct page *memmap)
 665{
 666        unsigned long start = (unsigned long)memmap;
 667        unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION);
 668
 669        vmemmap_free(start, end, NULL);
 670}
 671#else
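    /*
     * Without vmemmap, sub-section hotplug is not supported: a full section's
     * worth of struct pages is allocated from the page allocator, falling
     * back to vmalloc() when the high-order allocation fails.
     */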
 672static struct page *populate_section_memmap(unsigned long pfn,
 673                unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
 674{
 675        struct page *page, *ret;
 676        unsigned long memmap_size = sizeof(struct page) * PAGES_PER_SECTION;
 677
 678        page = alloc_pages(GFP_KERNEL|__GFP_NOWARN, get_order(memmap_size));
 679        if (page)
 680                goto got_map_page;
 681
 682        ret = vmalloc(memmap_size);
 683        if (ret)
 684                goto got_map_ptr;
 685
 686        return NULL;
 687got_map_page:
 688        ret = (struct page *)pfn_to_kaddr(page_to_pfn(page));
 689got_map_ptr:
 690
 691        return ret;
 692}
 693
 694static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages,
 695                struct vmem_altmap *altmap)
 696{
 697        struct page *memmap = pfn_to_page(pfn);
 698
 699        if (is_vmalloc_addr(memmap))
 700                vfree(memmap);
 701        else
 702                free_pages((unsigned long)memmap,
 703                           get_order(sizeof(struct page) * PAGES_PER_SECTION));
 704}
 705
 706static void free_map_bootmem(struct page *memmap)
 707{
 708        unsigned long maps_section_nr, removing_section_nr, i;
 709        unsigned long magic, nr_pages;
 710        struct page *page = virt_to_page(memmap);
 711
 712        nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page))
 713                >> PAGE_SHIFT;
 714
 715        for (i = 0; i < nr_pages; i++, page++) {
 716                magic = (unsigned long) page->freelist;
 717
 718                BUG_ON(magic == NODE_INFO);
 719
 720                maps_section_nr = pfn_to_section_nr(page_to_pfn(page));
 721                removing_section_nr = page_private(page);
 722
 723                /*
 724                 * When this function is called, the section being removed is in a
 725                 * logically offlined state, meaning all of its pages are isolated
 726                 * from the page allocator. If that section's memmap is placed in
 727                 * the same section, the memmap must not be freed here: otherwise
 728                 * the page allocator could hand it out again even though it is
 729                 * about to be removed physically.
 730                 */
 731                if (maps_section_nr != removing_section_nr)
 732                        put_page_bootmem(page);
 733        }
 734}
 735#endif /* CONFIG_SPARSEMEM_VMEMMAP */
 736
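    /*
     * Tear down the subsections spanned by [@pfn, @pfn + @nr_pages): clear
     * them from the section's subsection_map and, once no active subsections
     * remain, free the usage map (hot-added sections only) and reset the
     * encoded memmap. The memmap itself is released through free_map_bootmem()
     * when an early section goes fully empty, or by depopulating the
     * corresponding range otherwise.
     */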
 737static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
 738                struct vmem_altmap *altmap)
 739{
 740        DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 };
 741        DECLARE_BITMAP(tmp, SUBSECTIONS_PER_SECTION) = { 0 };
 742        struct mem_section *ms = __pfn_to_section(pfn);
 743        bool section_is_early = early_section(ms);
 744        struct page *memmap = NULL;
 745        unsigned long *subsection_map = ms->usage
 746                ? &ms->usage->subsection_map[0] : NULL;
 747
 748        subsection_mask_set(map, pfn, nr_pages);
 749        if (subsection_map)
 750                bitmap_and(tmp, map, subsection_map, SUBSECTIONS_PER_SECTION);
 751
 752        if (WARN(!subsection_map || !bitmap_equal(tmp, map, SUBSECTIONS_PER_SECTION),
 753                                "section already deactivated (%#lx + %ld)\n",
 754                                pfn, nr_pages))
 755                return;
 756
 757        /*
 758         * There are 3 cases to handle across two configurations
 759         * (SPARSEMEM_VMEMMAP={y,n}):
 760         *
 761         * 1/ deactivation of a partial hot-added section (only possible
 762         * in the SPARSEMEM_VMEMMAP=y case).
 763         *    a/ section was present at memory init
 764         *    b/ section was hot-added post memory init
 765         * 2/ deactivation of a complete hot-added section
 766         * 3/ deactivation of a complete section from memory init
 767         *
 768         * For 1/, when the subsection_map is not yet empty we will not be
 769         * freeing the usage map, but still need to free the vmemmap
 770         * range.
 771         *
 772         * For 2/ and 3/ the SPARSEMEM_VMEMMAP={y,n} cases are unified
 773         */
 774        bitmap_xor(subsection_map, map, subsection_map, SUBSECTIONS_PER_SECTION);
 775        if (bitmap_empty(subsection_map, SUBSECTIONS_PER_SECTION)) {
 776                unsigned long section_nr = pfn_to_section_nr(pfn);
 777
 778                if (!section_is_early) {
 779                        kfree(ms->usage);
 780                        ms->usage = NULL;
 781                }
 782                memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
 783                ms->section_mem_map = sparse_encode_mem_map(NULL, section_nr);
 784        }
 785
 786        if (section_is_early && memmap)
 787                free_map_bootmem(memmap);
 788        else
 789                depopulate_section_memmap(pfn, nr_pages, altmap);
 790}
 791
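    /*
     * Counterpart of section_deactivate(): allocate the usage map on first
     * use, mark the subsections spanned by [@pfn, @pfn + @nr_pages) in
     * subsection_map (overlaps are rejected with -EEXIST), and return the
     * section's memmap, either freshly populated or, for a partially
     * populated early section, the one that already exists.
     */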
 792static struct page * __meminit section_activate(int nid, unsigned long pfn,
 793                unsigned long nr_pages, struct vmem_altmap *altmap)
 794{
 795        DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 };
 796        struct mem_section *ms = __pfn_to_section(pfn);
 797        struct mem_section_usage *usage = NULL;
 798        unsigned long *subsection_map;
 799        struct page *memmap;
 800        int rc = 0;
 801
 802        subsection_mask_set(map, pfn, nr_pages);
 803
 804        if (!ms->usage) {
 805                usage = kzalloc(mem_section_usage_size(), GFP_KERNEL);
 806                if (!usage)
 807                        return ERR_PTR(-ENOMEM);
 808                ms->usage = usage;
 809        }
 810        subsection_map = &ms->usage->subsection_map[0];
 811
 812        if (bitmap_empty(map, SUBSECTIONS_PER_SECTION))
 813                rc = -EINVAL;
 814        else if (bitmap_intersects(map, subsection_map, SUBSECTIONS_PER_SECTION))
 815                rc = -EEXIST;
 816        else
 817                bitmap_or(subsection_map, map, subsection_map,
 818                                SUBSECTIONS_PER_SECTION);
 819
 820        if (rc) {
 821                if (usage)
 822                        ms->usage = NULL;
 823                kfree(usage);
 824                return ERR_PTR(rc);
 825        }
 826
 827        /*
 828         * The early init code does not consider partially populated
 829         * initial sections; it simply assumes that memory will never be
 830         * referenced.  If we hot-add memory into such a section then we
 831         * do not need to populate the memmap and can simply reuse what
 832         * is already there.
 833         */
 834        if (nr_pages < PAGES_PER_SECTION && early_section(ms))
 835                return pfn_to_page(pfn);
 836
 837        memmap = populate_section_memmap(pfn, nr_pages, nid, altmap);
 838        if (!memmap) {
 839                section_deactivate(pfn, nr_pages, altmap);
 840                return ERR_PTR(-ENOMEM);
 841        }
 842
 843        return memmap;
 844}
 845
 846/**
 847 * sparse_add_section - add a memory section, or populate an existing one
 848 * @nid: The node to add section on
 849 * @start_pfn: start pfn of the memory range
 850 * @nr_pages: number of pfns to add in the section
 851 * @altmap: device page map
 852 *
 853 * This is only intended for hotplug.
 854 *
 855 * Return:
 856 * * 0          - On success.
 857 * * -EEXIST    - Section was already present.
 858 * * -ENOMEM    - Out of memory.
 859 */
 860int __meminit sparse_add_section(int nid, unsigned long start_pfn,
 861                unsigned long nr_pages, struct vmem_altmap *altmap)
 862{
 863        unsigned long section_nr = pfn_to_section_nr(start_pfn);
 864        struct mem_section *ms;
 865        struct page *memmap;
 866        int ret;
 867
 868        ret = sparse_index_init(section_nr, nid);
 869        if (ret < 0)
 870                return ret;
 871
 872        memmap = section_activate(nid, start_pfn, nr_pages, altmap);
 873        if (IS_ERR(memmap))
 874                return PTR_ERR(memmap);
 875
 876        /*
 877         * Poison uninitialized struct pages in order to catch invalid flags
 878         * combinations.
 879         */
 880        page_init_poison(pfn_to_page(start_pfn), sizeof(struct page) * nr_pages);
 881
 882        ms = __nr_to_section(section_nr);
 883        set_section_nid(section_nr, nid);
 884        section_mark_present(ms);
 885
 886        /* Align memmap to section boundary in the subsection case */
 887        if (section_nr_to_pfn(section_nr) != start_pfn)
 888                memmap = pfn_to_kaddr(section_nr_to_pfn(section_nr));
 889        sparse_init_one_section(ms, section_nr, memmap, ms->usage, 0);
 890
 891        return 0;
 892}
 893
 894#ifdef CONFIG_MEMORY_FAILURE
 895static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
 896{
 897        int i;
 898
 899        /*
 900         * A further optimization is to have per section refcounted
 901         * num_poisoned_pages.  But that would need more space per memmap, so
 902         * for now just do a quick global check to speed up this routine in the
 903         * absence of bad pages.
 904         */
 905        if (atomic_long_read(&num_poisoned_pages) == 0)
 906                return;
 907
 908        for (i = 0; i < nr_pages; i++) {
 909                if (PageHWPoison(&memmap[i])) {
 910                        num_poisoned_pages_dec();
 911                        ClearPageHWPoison(&memmap[i]);
 912                }
 913        }
 914}
 915#else
 916static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
 917{
 918}
 919#endif
 920
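    /*
     * Hotplug counterpart of sparse_add_section(): drop any HWPoison
     * accounting for the pages past @map_offset and deactivate the
     * subsections backing [@pfn, @pfn + @nr_pages).
     */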
 921void sparse_remove_section(struct mem_section *ms, unsigned long pfn,
 922                unsigned long nr_pages, unsigned long map_offset,
 923                struct vmem_altmap *altmap)
 924{
 925        clear_hwpoisoned_pages(pfn_to_page(pfn) + map_offset,
 926                        nr_pages - map_offset);
 927        section_deactivate(pfn, nr_pages, altmap);
 928}
 929#endif /* CONFIG_MEMORY_HOTPLUG */
 930